monolish 0.14.2
MONOlithic LInear equation Solvers for Highly-parallel architecture
crs-dense_matmul.cpp
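This file implements sparse-times-dense matrix multiplication C = AB, where A is a CRS (Compressed Row Storage) matrix and B and C are row-major Dense matrices. Four backends are selected at compile time: OpenMP target offload for NVIDIA GPUs (with a disabled cuSPARSE SpMM path kept for reference), Intel MKL (mkl_dcsrmm / mkl_scsrmm), hand-written AVX kernels, and a plain OpenMP scalar loop. A minimal usage sketch follows; how A, B, and C are filled, and the util::send() transfer, are illustrative assumptions, not part of this file:

monolish::matrix::CRS<double> A = /* M x K sparse */;
monolish::matrix::Dense<double> B = /* K x N */;
monolish::matrix::Dense<double> C = /* M x N */;
monolish::util::send(A, B, C);   // only needed before the GPU path
monolish::blas::matmul(A, B, C); // C = A * B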
#include "../../../../include/monolish_blas.hpp"
#include "../../../internal/monolish_internal.hpp"

namespace monolish {

// double ///////////////////
void blas::matmul(const matrix::CRS<double> &A, const matrix::Dense<double> &B,
                  matrix::Dense<double> &C) {
  Logger &logger = Logger::get_instance();
  logger.func_in(monolish_func);

  // err
  assert(A.get_col() == B.get_row());
  assert(A.get_row() == C.get_row());
  assert(B.get_col() == C.get_col());
  assert(util::is_same_device_mem_stat(A, B, C));

  const double *vald = A.val.data();
  const int *rowd = A.row_ptr.data();
  const int *cold = A.col_ind.data();

  const double *Bd = B.val.data();
  double *Cd = C.val.data();

  // MN = MK * KN
  const int M = A.get_row();
  const int N = B.get_col();

  if (A.get_device_mem_stat() == true) {
#if MONOLISH_USE_NVIDIA_GPU // CUDA11 will support SpMM
#if 0 // row major SpMM is not supported in cuda 10.2
    const int K = A.get_col(); // inner dimension; used by cusparseCreateCsr
    size_t nnz = A.get_nnz();

    cusparseHandle_t sp_handle;
    cusparseCreate(&sp_handle);
    cudaDeviceSynchronize();

    cusparseMatDescr_t descr = 0;
    cusparseCreateMatDescr(&descr);
    cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);
    cusparseSetMatFillMode(descr, CUSPARSE_FILL_MODE_LOWER);
    cusparseSetMatDiagType(descr, CUSPARSE_DIAG_TYPE_UNIT);
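    // NOTE: this legacy cusparseMatDescr_t (including the fill-mode and
    // diag-type settings) is never passed to the generic SpMM API calls
    // below; it is leftover setup from the older cuSPARSE interface.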

    const cusparseOperation_t trans = CUSPARSE_OPERATION_NON_TRANSPOSE;

    const double alpha = 1.0;
    const double beta = 0.0;

#pragma omp target data use_device_ptr(Bd, Cd, vald, rowd, cold)
    {
      cusparseSpMatDescr_t matA;
      cusparseDnMatDescr_t matB, matC;
      void *dBuffer = NULL;
      size_t buffersize = 0;

      cusparseCreateCsr(&matA, M, K, nnz, (void *)rowd, (void *)cold,
                        (void *)vald, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                        CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F);

      cusparseCreateDnMat(&matB, K, N, N, (void *)Bd, CUDA_R_64F,
                          CUSPARSE_ORDER_ROW);

      cusparseCreateDnMat(&matC, M, N, N, (void *)Cd, CUDA_R_64F,
                          CUSPARSE_ORDER_ROW);

      cusparseSpMM_bufferSize(sp_handle, CUSPARSE_OPERATION_TRANSPOSE,
                              CUSPARSE_OPERATION_TRANSPOSE, &alpha, matA, matB,
                              &beta, matC, CUDA_R_64F, CUSPARSE_MM_ALG_DEFAULT,
                              &buffersize);

      cudaMalloc(&dBuffer, buffersize);

      cusparseSpMM(sp_handle, CUSPARSE_OPERATION_TRANSPOSE,
                   CUSPARSE_OPERATION_TRANSPOSE, &alpha, matA, matB, &beta,
                   matC, CUDA_R_64F, CUSPARSE_MM_ALG_DEFAULT, dBuffer);

      cudaFree(dBuffer);
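      // NOTE: matA, matB, matC, and sp_handle are never destroyed in this
      // disabled path; a complete version would call cusparseDestroySpMat,
      // cusparseDestroyDnMat, and cusparseDestroy.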
    }

#else
#pragma omp target teams distribute parallel for
    for (int j = 0; j < N; j++) {
      for (int i = 0; i < M; i++) {
        double tmp = 0;
        for (int k = rowd[i]; k < rowd[i + 1]; k++) {
          tmp += vald[k] * Bd[N * cold[k] + j];
        }
        Cd[i * N + j] = tmp;
      }
    }
#endif
#else
    throw std::runtime_error("error USE_GPU is false, but gpu_status == true");
#endif
  } else {
// MKL
#if MONOLISH_USE_MKL
    const double alpha = 1.0;
    const double beta = 0.0;
    const int K = A.get_col();

    // The inspector-executor API is also possible, but we haven't seen any
    // performance improvement from using the mm hint:
    // sparse_matrix_t mklA;
    // struct matrix_descr descrA;
    // descrA.type = SPARSE_MATRIX_TYPE_GENERAL;
    // mkl_sparse_d_create_csr(&mklA, SPARSE_INDEX_BASE_ZERO, M, K, (int *)rowd,
    //                         (int *)rowd + 1, (int *)cold, (double *)vald);
    // mkl_sparse_set_mm_hint(mklA, SPARSE_OPERATION_NON_TRANSPOSE, descrA, 100);
    // mkl_sparse_d_mm(SPARSE_OPERATION_NON_TRANSPOSE, alpha, mklA, descrA,
    //                 SPARSE_LAYOUT_ROW_MAJOR, Bd, N, K, beta, Cd, N);

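    // mkl_?csrmm is the (deprecated) NIST-style BLAS interface; matdescra
    // "G__C" means a general matrix with zero-based (C-style) indexing, and
    // rowd / rowd + 1 are the row-start / row-end pointer arrays.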
    mkl_dcsrmm("N", &M, &N, &K, &alpha, "G__C", vald, cold, rowd, rowd + 1, Bd,
               &N, &beta, Cd, &N);

// OSS
#else
#if MONOLISH_USE_AVX // avx_cpu
    const int vecL = 4; // SIMD width: 4 doubles per vector register

#pragma omp parallel for
    for (int i = 0; i < (int)(M * N); i++) {
      Cd[i] = 0.0;
    }

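    // For each row i, broadcast each nonzero A(i,k) and multiply-accumulate
    // it against row col_ind[k] of B, updating vecL elements of row i of C
    // per SIMD operation; the tail loop handles the last N % vecL columns.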
#pragma omp parallel for
    for (int i = 0; i < (int)M; i++) {
      int start = (int)rowd[i];
      int end = (int)rowd[i + 1];
      const int Cr = i * N;
      for (int k = start; k < end; k++) {
        const int Br = N * cold[k];
        const Dreg Av = SIMD_FUNC(broadcast_sd)(&vald[k]);
        Dreg tv, Bv, Cv;
        int j;
        for (j = 0; j < (int)N - (vecL - 1); j += vecL) {
          const int BB = Br + j;
          const int CC = Cr + j;

          Bv = SIMD_FUNC(loadu_pd)((double *)&Bd[BB]);
          Cv = SIMD_FUNC(loadu_pd)((double *)&Cd[CC]);
          tv = SIMD_FUNC(mul_pd)(Av, Bv);
          Cv = SIMD_FUNC(add_pd)(Cv, tv);
          SIMD_FUNC(storeu_pd)((double *)&Cd[CC], Cv);
        }

        for (; j < (int)N; j++) {
          Cd[Cr + j] += vald[k] * Bd[Br + j];
        }
      }
    }
#else // Scalar_cpu
#pragma omp parallel for
    for (int j = 0; j < (int)N; j++) {
      for (int i = 0; i < (int)M; i++) {
        double tmp = 0;
        int start = (int)rowd[i];
        int end = (int)rowd[i + 1];
        for (int k = start; k < end; k++) {
          tmp += vald[k] * Bd[N * cold[k] + j];
        }
        Cd[i * N + j] = tmp;
      }
    }
#endif
#endif
  }
  logger.func_out();
}

// float ///////////////////
void blas::matmul(const matrix::CRS<float> &A, const matrix::Dense<float> &B,
                  matrix::Dense<float> &C) {
  Logger &logger = Logger::get_instance();
  logger.func_in(monolish_func);

  // err
  assert(A.get_col() == B.get_row());
  assert(A.get_row() == C.get_row());
  assert(B.get_col() == C.get_col());
  assert(util::is_same_device_mem_stat(A, B, C));

  const float *vald = A.val.data();
  const int *rowd = A.row_ptr.data();
  const int *cold = A.col_ind.data();

  const float *Bd = B.val.data();
  float *Cd = C.val.data();

  // MN = MK * KN
  const int M = A.get_row();
  const int N = B.get_col();

  if (A.get_device_mem_stat() == true) {
#if MONOLISH_USE_NVIDIA_GPU
#pragma omp target teams distribute parallel for
    for (int j = 0; j < N; j++) {
      for (int i = 0; i < M; i++) {
        float tmp = 0;
        for (int k = rowd[i]; k < rowd[i + 1]; k++) {
          tmp += vald[k] * Bd[N * cold[k] + j];
        }
        Cd[i * N + j] = tmp;
      }
    }
#else
    throw std::runtime_error("error USE_GPU is false, but gpu_status == true");
#endif
  } else {
// MKL
#if MONOLISH_USE_MKL
    const int K = A.get_col();
    const float alpha = 1.0;
    const float beta = 0.0;
    mkl_scsrmm("N", &M, &N, &K, &alpha, "G__C", vald, cold, rowd, rowd + 1, Bd,
               &N, &beta, Cd, &N);

// OSS
#else
#if MONOLISH_USE_AVX // avx_cpu
    // const int vecL = 8;

#pragma omp parallel for
    for (int i = 0; i < (int)(M * N); i++) {
      Cd[i] = 0.0;
    }

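    // Same scheme as the double kernel: broadcast A(i,k) and fuse it into row
    // col_ind[k] of B, here hand-unrolled to 32 floats (4 x 8-wide SIMD ops)
    // per iteration, with 8-wide and scalar remainder loops.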
#pragma omp parallel for
    for (int i = 0; i < (int)M; i++) {
      int start = (int)rowd[i];
      int end = (int)rowd[i + 1];
      const int Cr = i * N;
      for (int k = start; k < end; k++) {
        const int Br = N * cold[k];
        const Sreg Av = SIMD_FUNC(broadcast_ss)(&vald[k]);
        Sreg tv, Bv, Cv;
        int j;
        for (j = 0; j < (int)N - 31; j += 32) {
          const int BB = Br + j;
          const int CC = Cr + j;

          Bv = SIMD_FUNC(loadu_ps)((float *)&Bd[BB]);
          Cv = SIMD_FUNC(loadu_ps)((float *)&Cd[CC]);
          tv = SIMD_FUNC(mul_ps)(Av, Bv);
          Cv = SIMD_FUNC(add_ps)(Cv, tv);
          SIMD_FUNC(storeu_ps)((float *)&Cd[CC], Cv);

          Bv = SIMD_FUNC(loadu_ps)((float *)&Bd[BB + 8]);
          Cv = SIMD_FUNC(loadu_ps)((float *)&Cd[CC + 8]);
          tv = SIMD_FUNC(mul_ps)(Av, Bv);
          Cv = SIMD_FUNC(add_ps)(Cv, tv);
          SIMD_FUNC(storeu_ps)((float *)&Cd[CC + 8], Cv);

          Bv = SIMD_FUNC(loadu_ps)((float *)&Bd[BB + 16]);
          Cv = SIMD_FUNC(loadu_ps)((float *)&Cd[CC + 16]);
          tv = SIMD_FUNC(mul_ps)(Av, Bv);
          Cv = SIMD_FUNC(add_ps)(Cv, tv);
          SIMD_FUNC(storeu_ps)((float *)&Cd[CC + 16], Cv);

          Bv = SIMD_FUNC(loadu_ps)((float *)&Bd[BB + 24]);
          Cv = SIMD_FUNC(loadu_ps)((float *)&Cd[CC + 24]);
          tv = SIMD_FUNC(mul_ps)(Av, Bv);
          Cv = SIMD_FUNC(add_ps)(Cv, tv);
          SIMD_FUNC(storeu_ps)((float *)&Cd[CC + 24], Cv);
        }
        for (; j < (int)N - 7; j += 8) {
          const int BB = Br + j;
          const int CC = Cr + j;

          Bv = SIMD_FUNC(loadu_ps)((float *)&Bd[BB]);
          Cv = SIMD_FUNC(loadu_ps)((float *)&Cd[CC]);
          tv = SIMD_FUNC(mul_ps)(Av, Bv);
          Cv = SIMD_FUNC(add_ps)(Cv, tv);
          SIMD_FUNC(storeu_ps)((float *)&Cd[CC], Cv);
        }
        for (; j < (int)N; j++) {
          Cd[Cr + j] += vald[k] * Bd[Br + j];
        }
      }
    }
#else // Scalar_cpu
#pragma omp parallel for
    for (int j = 0; j < (int)N; j++) {
      for (int i = 0; i < (int)M; i++) {
        float tmp = 0;
        int start = (int)rowd[i];
        int end = (int)rowd[i + 1];
        for (int k = start; k < end; k++) {
          tmp += vald[k] * Bd[N * cold[k] + j];
        }
        Cd[i * N + j] = tmp;
      }
    }
#endif
#endif
  }
  logger.func_out();
}
} // namespace monolish
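
For reference, the scalar fallback path above reduces to the following self-contained sketch (plain C++ without monolish types; the helper name csr_dense_matmul is ours, for illustration only). Each C(i,j) accumulates val[k] * B(col_ind[k], j) over the nonzeros k of row i:

#include <cstdio>
#include <vector>

// Minimal CSR * row-major dense multiply, mirroring the scalar kernel above.
// A is M x K in CSR form (val, row_ptr, col_ind); B is K x N row-major.
void csr_dense_matmul(int M, int N, const std::vector<double> &val,
                      const std::vector<int> &row_ptr,
                      const std::vector<int> &col_ind,
                      const std::vector<double> &B, std::vector<double> &C) {
  for (int i = 0; i < M; i++) {
    for (int j = 0; j < N; j++) {
      double tmp = 0.0;
      for (int k = row_ptr[i]; k < row_ptr[i + 1]; k++) {
        tmp += val[k] * B[N * col_ind[k] + j]; // B(col_ind[k], j)
      }
      C[i * N + j] = tmp;
    }
  }
}

int main() {
  // A = [1 0 2; 0 3 0] (2x3, CSR), B = 3x2 row-major dense
  std::vector<double> val = {1.0, 2.0, 3.0};
  std::vector<int> row_ptr = {0, 2, 3};
  std::vector<int> col_ind = {0, 2, 1};
  std::vector<double> B = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
  std::vector<double> C(2 * 2, 0.0);
  csr_dense_matmul(2, 2, val, row_ptr, col_ind, B, C);
  // Expected: C = [11 14; 9 12]
  printf("%g %g\n%g %g\n", C[0], C[1], C[2], C[3]);
}

Swapping the loop order to i/k/j (as the AVX kernels do) turns the per-nonzero update into a streaming AXPY over contiguous rows of B and C, which is exactly what the SIMD unrolling exploits.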