monolish  0.14.0
MONOlithic LInear equation Solvers for Highly-parallel architecture
crs-dense_matmul.cpp
Go to the documentation of this file.
1 #include "../../../../include/monolish_blas.hpp"
2 #include "../../../internal/monolish_internal.hpp"
3 
4 namespace monolish {
5 
6 // double ///////////////////
9  Logger &logger = Logger::get_instance();
10  logger.func_in(monolish_func);
11 
12  // err
13  assert(A.get_col() == B.get_row());
14  assert(A.get_row() == C.get_row());
15  assert(B.get_col() == C.get_col());
16  assert(util::is_same_device_mem_stat(A, B, C));
17 
18  const double *vald = A.val.data();
19  const int *rowd = A.row_ptr.data();
20  const int *cold = A.col_ind.data();
21 
22  const double *Bd = B.val.data();
23  double *Cd = C.val.data();
24 
25  // MN = MK * KN
26  const int M = A.get_row();
27  const int N = B.get_col();
28 
29  if (A.get_device_mem_stat() == true) {
30 #if MONOLISH_USE_GPU // CUDA11 will support SpMM
31 #if 0 // row major SpMM is not supported in cuda 10.2
32  size_t nnz = A.get_nnz();
33 
34  cusparseHandle_t sp_handle;
35  cusparseCreate(&sp_handle);
36  cudaDeviceSynchronize();
37 
38  cusparseMatDescr_t descr = 0;
39  cusparseCreateMatDescr(&descr);
40  cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);
41  cusparseSetMatFillMode(descr, CUSPARSE_FILL_MODE_LOWER);
42  cusparseSetMatDiagType(descr, CUSPARSE_DIAG_TYPE_UNIT);
43 
44  const cusparseOperation_t trans = CUSPARSE_OPERATION_NON_TRANSPOSE;
45 
46  const double alpha = 1.0;
47  const double beta = 0.0;
48 
49 #pragma omp target data use_device_ptr(Bd, Cd, vald, rowd, cold)
50  {
51  cusparseSpMatDescr_t matA;
52  cusparseDnMatDescr_t matB, matC;
53  void* dBuffer = NULL;
54  size_t buffersize = 0;
55 
56  cusparseCreateCsr(&matA, M, K, nnz, (void*)rowd, (void*)cold, (void*)vald,
57  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F);
58 
59  cusparseCreateDnMat(&matB, K, N, N, (void*)Bd, CUDA_R_64F, CUSPARSE_ORDER_ROW);
60 
61  cusparseCreateDnMat(&matC, M, N, N, (void*)Cd, CUDA_R_64F, CUSPARSE_ORDER_ROW);
62 
63  cusparseSpMM_bufferSize(sp_handle,
64  CUSPARSE_OPERATION_TRANSPOSE,
65  CUSPARSE_OPERATION_TRANSPOSE,
66  &alpha, matA, matB, &beta, matC, CUDA_R_64F,
67  CUSPARSE_MM_ALG_DEFAULT, &buffersize);
68 
69  cudaMalloc(&dBuffer, buffersize);
70 
71  cusparseSpMM(sp_handle,
72  CUSPARSE_OPERATION_TRANSPOSE,
73  CUSPARSE_OPERATION_TRANSPOSE,
74  &alpha, matA, matB, &beta, matC, CUDA_R_64F,
75  CUSPARSE_MM_ALG_DEFAULT, dBuffer);
76 
77  cudaFree(dBuffer);
78  }
79 
80 #else
81 #pragma omp target teams distribute parallel for
82  for (int j = 0; j < N; j++) {
83  for (int i = 0; i < M; i++) {
84  double tmp = 0;
85  for (int k = rowd[i]; k < rowd[i + 1]; k++) {
86  tmp += vald[k] * Bd[N * cold[k] + j];
87  }
88  Cd[i * N + j] = tmp;
89  }
90  }
91 #endif
92 #else
93  throw std::runtime_error("error USE_GPU is false, but gpu_status == true");
94 #endif
95  } else {
96 // MKL
97 #if MONOLISH_USE_MKL
98  const double alpha = 1.0;
99  const double beta = 0.0;
100  const int K = A.get_col();
101  mkl_dcsrmm("N", &M, &N, &K, &alpha, "G__C", vald, cold, rowd, rowd + 1, Bd,
102  &N, &beta, Cd, &N);
103 
104 // OSS
105 #else
106 #if USE_AVX // avx_cpu
107  const int vecL = 4;
108 
109 #pragma omp parallel for
110  for (int i = 0; i < (int)(M * N); i++) {
111  Cd[i] = 0.0;
112  }
113 
114 #pragma omp parallel for
115  for (int i = 0; i < (int)M; i++) {
116  int start = (int)rowd[i];
117  int end = (int)rowd[i + 1];
118  const int Cr = i * N;
119  for (int k = start; k < end; k++) {
120  const int Br = N * cold[k];
121  const Dreg Av = SIMD_FUNC(broadcast_sd)(&vald[k]);
122  Dreg tv, Bv, Cv;
123  int j;
124  for (j = 0; j < (int)N - (vecL - 1); j += vecL) {
125  const int BB = Br + j;
126  const int CC = Cr + j;
127 
128  Bv = SIMD_FUNC(loadu_pd)((double *)&Bd[BB]);
129  Cv = SIMD_FUNC(loadu_pd)((double *)&Cd[CC]);
130  tv = SIMD_FUNC(mul_pd)(Av, Bv);
131  Cv = SIMD_FUNC(add_pd)(Cv, tv);
132  SIMD_FUNC(storeu_pd)((double *)&Cd[CC], Cv);
133  }
134 
135  for (; j < (int)N; j++) {
136  Cd[Cr + j] += vald[k] * Bd[Br + j];
137  }
138  }
139  }
140 #else // Scalar_cpu
141 #pragma omp parallel for
142  for (int j = 0; j < (int)N; j++) {
143  for (int i = 0; i < (int)M; i++) {
144  double tmp = 0;
145  int start = (int)rowd[i];
146  int end = (int)rowd[i + 1];
147  for (int k = start; k < end; k++) {
148  tmp += vald[k] * Bd[N * cold[k] + j];
149  }
150  Cd[i * N + j] = tmp;
151  }
152  }
153 #endif
154 #endif
155  }
156  logger.func_out();
157 }
158 
159 // float ///////////////////
162  Logger &logger = Logger::get_instance();
163  logger.func_in(monolish_func);
164 
165  // err
166  assert(A.get_col() == B.get_row());
167  assert(A.get_row() == C.get_row());
168  assert(B.get_col() == C.get_col());
169  assert(util::is_same_device_mem_stat(A, B, C));
170 
171  const float *vald = A.val.data();
172  const int *rowd = A.row_ptr.data();
173  const int *cold = A.col_ind.data();
174 
175  const float *Bd = B.val.data();
176  float *Cd = C.val.data();
177 
178  // MN = MK * KN
179  const int M = A.get_row();
180  const int N = B.get_col();
181 
182  if (A.get_device_mem_stat() == true) {
183 #if MONOLISH_USE_GPU
184 #pragma omp target teams distribute parallel for
185  for (size_t j = 0; j < N; j++) {
186  for (size_t i = 0; i < M; i++) {
187  float tmp = 0;
188  for (size_t k = (size_t)rowd[i]; k < (size_t)rowd[i + 1]; k++) {
189  tmp += vald[k] * Bd[N * cold[k] + j];
190  }
191  Cd[i * N + j] = tmp;
192  }
193  }
194 #else
195  throw std::runtime_error("error USE_GPU is false, but gpu_status == true");
196 #endif
197  } else {
198 // MKL
199 #if MONOLISH_USE_MKL
200  const int K = A.get_col();
201  const float alpha = 1.0;
202  const float beta = 0.0;
203  mkl_scsrmm("N", &M, &N, &K, &alpha, "G__C", vald, cold, rowd, rowd + 1, Bd,
204  &N, &beta, Cd, &N);
205 
206 // OSS
207 #else
208 #if MONOLISH_USE_AVX // avx_cpu
209  // const int vecL = 8;
210 
211 #pragma omp parallel for
212  for (int i = 0; i < (int)(M * N); i++) {
213  Cd[i] = 0.0;
214  }
215 
216 #pragma omp parallel for
217  for (int i = 0; i < (int)M; i++) {
218  int start = (int)rowd[i];
219  int end = (int)rowd[i + 1];
220  const int Cr = i * N;
221  for (int k = start; k < end; k++) {
222  const int Br = N * cold[k];
223  const Sreg Av = SIMD_FUNC(broadcast_ss)(&vald[k]);
224  Sreg tv, Bv, Cv;
225  int j;
226  for (j = 0; j < (int)N - 31; j += 32) {
227  const int BB = Br + j;
228  const int CC = Cr + j;
229 
230  Bv = SIMD_FUNC(loadu_ps)((float *)&Bd[BB]);
231  Cv = SIMD_FUNC(loadu_ps)((float *)&Cd[CC]);
232  tv = SIMD_FUNC(mul_ps)(Av, Bv);
233  Cv = SIMD_FUNC(add_ps)(Cv, tv);
234  SIMD_FUNC(storeu_ps)((float *)&Cd[CC], Cv);
235 
236  Bv = SIMD_FUNC(loadu_ps)((float *)&Bd[BB + 8]);
237  Cv = SIMD_FUNC(loadu_ps)((float *)&Cd[CC + 8]);
238  tv = SIMD_FUNC(mul_ps)(Av, Bv);
239  Cv = SIMD_FUNC(add_ps)(Cv, tv);
240  SIMD_FUNC(storeu_ps)((float *)&Cd[CC + 8], Cv);
241 
242  Bv = SIMD_FUNC(loadu_ps)((float *)&Bd[BB + 16]);
243  Cv = SIMD_FUNC(loadu_ps)((float *)&Cd[CC + 16]);
244  tv = SIMD_FUNC(mul_ps)(Av, Bv);
245  Cv = SIMD_FUNC(add_ps)(Cv, tv);
246  SIMD_FUNC(storeu_ps)((float *)&Cd[Cr + j + 16], Cv);
247 
248  Bv = SIMD_FUNC(loadu_ps)((float *)&Bd[BB + 24]);
249  Cv = SIMD_FUNC(loadu_ps)((float *)&Cd[CC + 24]);
250  tv = SIMD_FUNC(mul_ps)(Av, Bv);
251  Cv = SIMD_FUNC(add_ps)(Cv, tv);
252  SIMD_FUNC(storeu_ps)((float *)&Cd[CC + 24], Cv);
253  }
254  for (; j < (int)N - 7; j += 8) {
255  const int BB = Br + j;
256  const int CC = Cr + j;
257 
258  Bv = SIMD_FUNC(loadu_ps)((float *)&Bd[BB]);
259  Cv = SIMD_FUNC(loadu_ps)((float *)&Cd[CC]);
260  tv = SIMD_FUNC(mul_ps)(Av, Bv);
261  Cv = SIMD_FUNC(add_ps)(Cv, tv);
262  SIMD_FUNC(storeu_ps)((float *)&Cd[CC], Cv);
263  }
264  for (; j < (int)N; j++) {
265  Cd[Cr + j] += vald[k] * Bd[Br + j];
266  }
267  }
268  }
269 #else // Scalar_cpu
270 #pragma omp parallel for
271  for (int j = 0; j < (int)N; j++) {
272  for (int i = 0; i < (int)M; i++) {
273  double tmp = 0;
274  int start = (int)rowd[i];
275  int end = (int)rowd[i + 1];
276  for (int k = start; k < end; k++) {
277  tmp += vald[k] * Bd[N * cold[k] + j];
278  }
279  Cd[i * N + j] = tmp;
280  }
281  }
282 #endif
283 #endif
284  }
285  logger.func_out();
286 }
287 } // namespace monolish
monolish::matrix::CRS::val
std::vector< Float > val
CRS format value, which stores values of the non-zero elements (size nnz)
Definition: monolish_crs.hpp:61
monolish_func
#define monolish_func
Definition: monolish_logger.hpp:9
monolish::Logger
logger class (singleton, for developer use)
Definition: monolish_logger.hpp:19
monolish::Logger::func_out
void func_out()
Definition: logger_utils.cpp:80
monolish::matrix::CRS::get_nnz
size_t get_nnz() const
get # of non-zeros
Definition: monolish_crs.hpp:254
monolish::matrix::Dense::get_row
size_t get_row() const
get # of row
Definition: monolish_dense.hpp:199
monolish::matrix::Dense::val
std::vector< Float > val
Dense format value(size M x N)
Definition: monolish_dense.hpp:47
monolish::matrix::Dense
Dense format Matrix.
Definition: monolish_coo.hpp:28
monolish::matrix::CRS::col_ind
std::vector< int > col_ind
CRS format column index, which stores column numbers of the non-zero elements (size nnz)
Definition: monolish_crs.hpp:67
monolish::util::is_same_device_mem_stat
bool is_same_device_mem_stat(const T &arg1, const U &arg2)
compare same device memory status
Definition: monolish_common.hpp:431
monolish::matrix::CRS::get_row
size_t get_row() const
get # of row
Definition: monolish_crs.hpp:236
monolish
Definition: monolish_matrix_blas.hpp:9
monolish::matrix::Dense::get_col
size_t get_col() const
get # of col
Definition: monolish_dense.hpp:208
monolish::matrix::CRS::get_device_mem_stat
bool get_device_mem_stat() const
true: sended, false: not send
Definition: monolish_crs.hpp:323
monolish::matrix::CRS::get_col
size_t get_col() const
get # of col
Definition: monolish_crs.hpp:245
monolish::matrix::CRS::row_ptr
std::vector< int > row_ptr
CRS format row pointer, which stores the starting points of the rows of the arrays value and col_ind ...
Definition: monolish_crs.hpp:73
monolish::blas::matmul
void matmul(const matrix::Dense< double > &A, const matrix::Dense< double > &B, matrix::Dense< double > &C)
Dense matrix multiplication: C = AB.
Definition: dense-dense_matmul.cpp:7
monolish::Logger::get_instance
static Logger & get_instance()
Definition: monolish_logger.hpp:42
monolish::matrix::CRS
Compressed Row Storage (CRS) format Matrix.
Definition: monolish_coo.hpp:29
monolish::Logger::func_in
void func_in(const std::string func_name)
Definition: logger_utils.cpp:69