#include "../../../../include/monolish_blas.hpp"
#include "../../../internal/monolish_internal.hpp"
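
// SpMM: C = A * B, where A is a CRS (CSR) sparse matrix and B and C are
// row-major Dense matrices. The double-precision kernel comes first; the
// single-precision kernel follows with the same structure.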
  const double *vald = A.val.data();
  const int *rowd = A.row_ptr.data();
  const int *cold = A.col_ind.data();

  const double *Bd = B.val.data();
  double *Cd = C.val.data();
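  // Dimensions as used below: A is M x K with nnz nonzeros, B is K x N, and
  // C is M x N.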
#if MONOLISH_USE_NVIDIA_GPU // CUDA11 will support SpMM
#if 0 // row major SpMM is not supported in cuda 10.2
    cusparseHandle_t sp_handle;
    cusparseCreate(&sp_handle);
    cudaDeviceSynchronize();

    cusparseMatDescr_t descr = 0;
    cusparseCreateMatDescr(&descr);
    cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);
    cusparseSetMatFillMode(descr, CUSPARSE_FILL_MODE_LOWER);
    cusparseSetMatDiagType(descr, CUSPARSE_DIAG_TYPE_UNIT);

    const cusparseOperation_t trans = CUSPARSE_OPERATION_NON_TRANSPOSE;

    const double alpha = 1.0;
    const double beta = 0.0;
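    // cuSPARSE generic SpMM workflow: wrap the raw device pointers in
    // sparse/dense matrix descriptors, query the workspace size, allocate
    // the workspace, then run the multiply.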
#pragma omp target data use_device_ptr(Bd, Cd, vald, rowd, cold)
    {
      cusparseSpMatDescr_t matA;
      cusparseDnMatDescr_t matB, matC;

      size_t buffersize = 0;

      cusparseCreateCsr(&matA, M, K, nnz, (void *)rowd, (void *)cold,
                        (void *)vald, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                        CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F);

      cusparseCreateDnMat(&matB, K, N, N, (void *)Bd, CUDA_R_64F,
                          CUSPARSE_ORDER_ROW);

      cusparseCreateDnMat(&matC, M, N, N, (void *)Cd, CUDA_R_64F,
                          CUSPARSE_ORDER_ROW);
      cusparseSpMM_bufferSize(sp_handle, CUSPARSE_OPERATION_TRANSPOSE,
                              CUSPARSE_OPERATION_TRANSPOSE, &alpha, matA, matB,
                              &beta, matC, CUDA_R_64F, CUSPARSE_MM_ALG_DEFAULT,
                              &buffersize);

      void *dBuffer = nullptr;
      cudaMalloc(&dBuffer, buffersize);

      cusparseSpMM(sp_handle, CUSPARSE_OPERATION_TRANSPOSE,
                   CUSPARSE_OPERATION_TRANSPOSE, &alpha, matA, matB, &beta,
                   matC, CUDA_R_64F, CUSPARSE_MM_ALG_DEFAULT, dBuffer);
    }
#else
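    // Fallback OpenMP offload kernel: parallelized over the columns of C;
    // each (i, j) entry is a dot product of CRS row i of A with column j of B.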
#pragma omp target teams distribute parallel for
    for (int j = 0; j < N; j++) {
      for (int i = 0; i < M; i++) {
        double tmp = 0;
        for (int k = rowd[i]; k < rowd[i + 1]; k++) {
          tmp += vald[k] * Bd[N * cold[k] + j];
        }
        Cd[i * N + j] = tmp;
      }
    }
#endif
#else
    throw std::runtime_error("error USE_GPU is false, but gpu_status == true");
#endif
  } else {
#if MONOLISH_USE_MKL
    const double alpha = 1.0;
    const double beta = 0.0;
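    // matdescra "G__C": 'G' = general matrix, 'C' = zero-based indexing with
    // C-style (row-major) dense B and C; rowd / rowd + 1 pass the CSR row
    // pointer begin/end arrays.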
    mkl_dcsrmm("N", &M, &N, &K, &alpha, "G__C", vald, cold, rowd, rowd + 1, Bd,
               &N, &beta, Cd, &N);
#else
#if MONOLISH_USE_AVX // avx_cpu
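    // AVX kernel: zero C, then stream over the nonzeros of A row by row,
    // broadcasting each value vald[k] and accumulating vald[k] * B(cold[k], :)
    // into C(i, :) one vecL-wide Dreg register at a time (Dreg, SIMD_FUNC,
    // and vecL come from the internal SIMD wrappers).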
#pragma omp parallel for
    for (int i = 0; i < (int)(M * N); i++) {
      Cd[i] = 0.0;
    }
#pragma omp parallel for
    for (int i = 0; i < (int)M; i++) {
      int start = (int)rowd[i];
      int end = (int)rowd[i + 1];
      const int Cr = i * N;
      for (int k = start; k < end; k++) {
        const int Br = N * cold[k];
        const Dreg Av = SIMD_FUNC(broadcast_sd)(&vald[k]);
        Dreg tv, Bv, Cv;
        int j;
        for (j = 0; j < (int)N - (vecL - 1); j += vecL) {
          const int BB = Br + j;
          const int CC = Cr + j;

          Bv = SIMD_FUNC(loadu_pd)((double *)&Bd[BB]);
          Cv = SIMD_FUNC(loadu_pd)((double *)&Cd[CC]);
          tv = SIMD_FUNC(mul_pd)(Av, Bv);
          Cv = SIMD_FUNC(add_pd)(Cv, tv);
          SIMD_FUNC(storeu_pd)((double *)&Cd[CC], Cv);
        }

        // Scalar remainder for the last N % vecL columns.
        for (; j < (int)N; j++) {
          Cd[Cr + j] += vald[k] * Bd[Br + j];
        }
      }
    }
#else
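    // Plain OpenMP fallback: inner-product form, parallelized over the
    // columns of C.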
#pragma omp parallel for
    for (int j = 0; j < (int)N; j++) {
      for (int i = 0; i < (int)M; i++) {
        double tmp = 0;
        int start = (int)rowd[i];
        int end = (int)rowd[i + 1];
        for (int k = start; k < end; k++) {
          tmp += vald[k] * Bd[N * cold[k] + j];
        }
        Cd[i * N + j] = tmp;
      }
    }
#endif
#endif
  }
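
// Single-precision version: same structure as the double-precision kernel
// above.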
  const float *vald = A.val.data();
  const int *rowd = A.row_ptr.data();
  const int *cold = A.col_ind.data();

  const float *Bd = B.val.data();
  float *Cd = C.val.data();
#if MONOLISH_USE_NVIDIA_GPU
    // Same offload kernel as the double-precision version.
#pragma omp target teams distribute parallel for
    for (size_t j = 0; j < N; j++) {
      for (size_t i = 0; i < M; i++) {
        float tmp = 0;
        for (size_t k = (size_t)rowd[i]; k < (size_t)rowd[i + 1]; k++) {
          tmp += vald[k] * Bd[N * cold[k] + j];
        }
        Cd[i * N + j] = tmp;
      }
    }
#else
    throw std::runtime_error("error USE_GPU is false, but gpu_status == true");
#endif
  } else {
#if MONOLISH_USE_MKL
    const float alpha = 1.0;
    const float beta = 0.0;
    mkl_scsrmm("N", &M, &N, &K, &alpha, "G__C", vald, cold, rowd, rowd + 1, Bd,
               &N, &beta, Cd, &N);
#else
#if MONOLISH_USE_AVX // avx_cpu
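    // AVX kernel (float): zero C, then accumulate 32 columns per iteration
    // using four 8-float Sreg registers; an 8-wide loop and a scalar loop
    // handle the remainder.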
#pragma omp parallel for
    for (int i = 0; i < (int)(M * N); i++) {
      Cd[i] = 0.0;
    }
#pragma omp parallel for
    for (int i = 0; i < (int)M; i++) {
      int start = (int)rowd[i];
      int end = (int)rowd[i + 1];
      const int Cr = i * N;
      for (int k = start; k < end; k++) {
        const int Br = N * cold[k];
        const Sreg Av = SIMD_FUNC(broadcast_ss)(&vald[k]);
        Sreg tv, Bv, Cv;
        int j;
        for (j = 0; j < (int)N - 31; j += 32) {
          const int BB = Br + j;
          const int CC = Cr + j;

          Bv = SIMD_FUNC(loadu_ps)((float *)&Bd[BB]);
          Cv = SIMD_FUNC(loadu_ps)((float *)&Cd[CC]);
          tv = SIMD_FUNC(mul_ps)(Av, Bv);
          Cv = SIMD_FUNC(add_ps)(Cv, tv);
          SIMD_FUNC(storeu_ps)((float *)&Cd[CC], Cv);

          Bv = SIMD_FUNC(loadu_ps)((float *)&Bd[BB + 8]);
          Cv = SIMD_FUNC(loadu_ps)((float *)&Cd[CC + 8]);
          tv = SIMD_FUNC(mul_ps)(Av, Bv);
          Cv = SIMD_FUNC(add_ps)(Cv, tv);
          SIMD_FUNC(storeu_ps)((float *)&Cd[CC + 8], Cv);

          Bv = SIMD_FUNC(loadu_ps)((float *)&Bd[BB + 16]);
          Cv = SIMD_FUNC(loadu_ps)((float *)&Cd[CC + 16]);
          tv = SIMD_FUNC(mul_ps)(Av, Bv);
          Cv = SIMD_FUNC(add_ps)(Cv, tv);
          SIMD_FUNC(storeu_ps)((float *)&Cd[CC + 16], Cv);

          Bv = SIMD_FUNC(loadu_ps)((float *)&Bd[BB + 24]);
          Cv = SIMD_FUNC(loadu_ps)((float *)&Cd[CC + 24]);
          tv = SIMD_FUNC(mul_ps)(Av, Bv);
          Cv = SIMD_FUNC(add_ps)(Cv, tv);
          SIMD_FUNC(storeu_ps)((float *)&Cd[CC + 24], Cv);
        }
        // 8-wide remainder.
        for (; j < (int)N - 7; j += 8) {
          const int BB = Br + j;
          const int CC = Cr + j;

          Bv = SIMD_FUNC(loadu_ps)((float *)&Bd[BB]);
          Cv = SIMD_FUNC(loadu_ps)((float *)&Cd[CC]);
          tv = SIMD_FUNC(mul_ps)(Av, Bv);
          Cv = SIMD_FUNC(add_ps)(Cv, tv);
          SIMD_FUNC(storeu_ps)((float *)&Cd[CC], Cv);
        }
        // Scalar remainder for the last few columns.
        for (; j < (int)N; j++) {
          Cd[Cr + j] += vald[k] * Bd[Br + j];
        }
      }
    }
#else
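    // Plain OpenMP fallback, as in the double-precision kernel.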
#pragma omp parallel for
    for (int j = 0; j < (int)N; j++) {
      for (int i = 0; i < (int)M; i++) {
        float tmp = 0;
        int start = (int)rowd[i];
        int end = (int)rowd[i + 1];
        for (int k = start; k < end; k++) {
          tmp += vald[k] * Bd[N * cold[k] + j];
        }
        Cd[i * N + j] = tmp;
      }
    }
#endif
#endif
  }