#include "../../../../include/monolish_blas.hpp"
#include "../../../internal/monolish_internal.hpp"
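// double precision kernel: raw pointers to the CRS structure of A (values,
// row offsets, column indices) and to the dense storage of B and C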
const double *vald = A.val.data();
const int *rowd = A.row_ptr.data();
const int *cold = A.col_ind.data();

const double *Bd = B.val.data();
double *Cd = C.val.data();
#if MONOLISH_USE_GPU // CUDA11 will support SpMM
#if 0                // row major SpMM is not supported in cuda 10.2
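// cuSPARSE SpMM setup (kept behind the #if 0 above): handle plus a legacy
// matrix descriptor for A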
cusparseHandle_t sp_handle;
cusparseCreate(&sp_handle);
cudaDeviceSynchronize();
cusparseMatDescr_t descr = 0;
cusparseCreateMatDescr(&descr);
cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);
cusparseSetMatFillMode(descr, CUSPARSE_FILL_MODE_LOWER);
cusparseSetMatDiagType(descr, CUSPARSE_DIAG_TYPE_UNIT);

const cusparseOperation_t trans = CUSPARSE_OPERATION_NON_TRANSPOSE;

const double alpha = 1.0;
const double beta = 0.0;
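// cusparseSpMM computes C = alpha * op(A) * op(B) + beta * C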
#pragma omp target data use_device_ptr(Bd, Cd, vald, rowd, cold)
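// generic SpMM API: wrap the raw device pointers in cuSPARSE sparse/dense
// matrix descriptors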
cusparseSpMatDescr_t matA;
cusparseDnMatDescr_t matB, matC;

size_t buffersize = 0;
cusparseCreateCsr(&matA, M, K, nnz, (void *)rowd, (void *)cold, (void *)vald,
                  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                  CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F);

cusparseCreateDnMat(&matB, K, N, N, (void *)Bd, CUDA_R_64F,
                    CUSPARSE_ORDER_ROW);

cusparseCreateDnMat(&matC, M, N, N, (void *)Cd, CUDA_R_64F,
                    CUSPARSE_ORDER_ROW);
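// query the workspace size required by cusparseSpMM, then allocate it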
cusparseSpMM_bufferSize(sp_handle, CUSPARSE_OPERATION_TRANSPOSE,
                        CUSPARSE_OPERATION_TRANSPOSE, &alpha, matA, matB,
                        &beta, matC, CUDA_R_64F, CUSPARSE_MM_ALG_DEFAULT,
                        &buffersize);

cudaMalloc(&dBuffer, buffersize);
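// execute the sparse x dense product with the allocated workspace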
cusparseSpMM(sp_handle, CUSPARSE_OPERATION_TRANSPOSE,
             CUSPARSE_OPERATION_TRANSPOSE, &alpha, matA, matB, &beta, matC,
             CUDA_R_64F, CUSPARSE_MM_ALG_DEFAULT, dBuffer);
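// OpenMP target offload kernel: each C(i, j) is accumulated directly from
// the nonzeros of row i of A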
#pragma omp target teams distribute parallel for
for (int j = 0; j < N; j++) {
  for (int i = 0; i < M; i++) {
    for (int k = rowd[i]; k < rowd[i + 1]; k++) {
      tmp += vald[k] * Bd[N * cold[k] + j];
throw std::runtime_error("error USE_GPU is false, but gpu_status == true");
const double alpha = 1.0;
const double beta = 0.0;

mkl_dcsrmm("N", &M, &N, &K, &alpha, "G__C", vald, cold, rowd, rowd + 1, Bd,
#if MONOLISH_USE_AVX // avx_cpu
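// hand-vectorized AVX kernel: C is initialized over all M*N entries first,
// because the kernel below accumulates into it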
#pragma omp parallel for
for (int i = 0; i < (int)(M * N); i++) {
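// one OpenMP task per row of A: each nonzero A(i, col) is broadcast and
// multiplied against row 'col' of B, accumulating into row i of C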
#pragma omp parallel for
for (int i = 0; i < (int)M; i++) {
  int start = (int)rowd[i];
  int end = (int)rowd[i + 1];
  const int Cr = i * N;
  for (int k = start; k < end; k++) {
    const int Br = N * cold[k];
    const Dreg Av = SIMD_FUNC(broadcast_sd)(&vald[k]);
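    // main vector loop: vecL doubles of B and C per iteration, with Av
    // holding vald[k] in every lane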
    for (j = 0; j < (int)N - (vecL - 1); j += vecL) {
      const int BB = Br + j;
      const int CC = Cr + j;

      Bv = SIMD_FUNC(loadu_pd)((double *)&Bd[BB]);
      Cv = SIMD_FUNC(loadu_pd)((double *)&Cd[CC]);
      tv = SIMD_FUNC(mul_pd)(Av, Bv);
      Cv = SIMD_FUNC(add_pd)(Cv, tv);
      SIMD_FUNC(storeu_pd)((double *)&Cd[CC], Cv);

    for (; j < (int)N; j++) {
      Cd[Cr + j] += vald[k] * Bd[Br + j];
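// plain OpenMP kernel: accumulate each C(i, j) directly from the CRS rows
// of A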
#pragma omp parallel for
for (int j = 0; j < (int)N; j++) {
  for (int i = 0; i < (int)M; i++) {
    int start = (int)rowd[i];
    int end = (int)rowd[i + 1];
    for (int k = start; k < end; k++) {
      tmp += vald[k] * Bd[N * cold[k] + j];
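// single precision (float) kernel: same structure as the double version above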
const float *vald = A.val.data();
const int *rowd = A.row_ptr.data();
const int *cold = A.col_ind.data();

const float *Bd = B.val.data();
float *Cd = C.val.data();
#pragma omp target teams distribute parallel for
for (size_t j = 0; j < N; j++) {
  for (size_t i = 0; i < M; i++) {
    for (size_t k = (size_t)rowd[i]; k < (size_t)rowd[i + 1]; k++) {
      tmp += vald[k] * Bd[N * cold[k] + j];
throw std::runtime_error("error USE_GPU is false, but gpu_status == true");
const float alpha = 1.0;
const float beta = 0.0;

mkl_scsrmm("N", &M, &N, &K, &alpha, "G__C", vald, cold, rowd, rowd + 1, Bd,
#if MONOLISH_USE_AVX // avx_cpu
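// AVX kernel for float: initialize C, then accumulate row by row with
// 8-float (Sreg) registers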
#pragma omp parallel for
for (int i = 0; i < (int)(M * N); i++) {
#pragma omp parallel for
for (int i = 0; i < (int)M; i++) {
  int start = (int)rowd[i];
  int end = (int)rowd[i + 1];
  const int Cr = i * N;
  for (int k = start; k < end; k++) {
    const int Br = N * cold[k];
    const Sreg Av = SIMD_FUNC(broadcast_ss)(&vald[k]);
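    // main loop unrolled 4x over 8-float registers: 32 columns of B and C
    // per iteration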
    for (j = 0; j < (int)N - 31; j += 32) {
      const int BB = Br + j;
      const int CC = Cr + j;

      Bv = SIMD_FUNC(loadu_ps)((float *)&Bd[BB]);
      Cv = SIMD_FUNC(loadu_ps)((float *)&Cd[CC]);
      tv = SIMD_FUNC(mul_ps)(Av, Bv);
      Cv = SIMD_FUNC(add_ps)(Cv, tv);
      SIMD_FUNC(storeu_ps)((float *)&Cd[CC], Cv);

      Bv = SIMD_FUNC(loadu_ps)((float *)&Bd[BB + 8]);
      Cv = SIMD_FUNC(loadu_ps)((float *)&Cd[CC + 8]);
      tv = SIMD_FUNC(mul_ps)(Av, Bv);
      Cv = SIMD_FUNC(add_ps)(Cv, tv);
      SIMD_FUNC(storeu_ps)((float *)&Cd[CC + 8], Cv);

      Bv = SIMD_FUNC(loadu_ps)((float *)&Bd[BB + 16]);
      Cv = SIMD_FUNC(loadu_ps)((float *)&Cd[CC + 16]);
      tv = SIMD_FUNC(mul_ps)(Av, Bv);
      Cv = SIMD_FUNC(add_ps)(Cv, tv);
      SIMD_FUNC(storeu_ps)((float *)&Cd[CC + 16], Cv);

      Bv = SIMD_FUNC(loadu_ps)((float *)&Bd[BB + 24]);
      Cv = SIMD_FUNC(loadu_ps)((float *)&Cd[CC + 24]);
      tv = SIMD_FUNC(mul_ps)(Av, Bv);
      Cv = SIMD_FUNC(add_ps)(Cv, tv);
      SIMD_FUNC(storeu_ps)((float *)&Cd[CC + 24], Cv);
    for (; j < (int)N - 7; j += 8) {
      const int BB = Br + j;
      const int CC = Cr + j;

      Bv = SIMD_FUNC(loadu_ps)((float *)&Bd[BB]);
      Cv = SIMD_FUNC(loadu_ps)((float *)&Cd[CC]);
      tv = SIMD_FUNC(mul_ps)(Av, Bv);
      Cv = SIMD_FUNC(add_ps)(Cv, tv);
      SIMD_FUNC(storeu_ps)((float *)&Cd[CC], Cv);

    for (; j < (int)N; j++) {
      Cd[Cr + j] += vald[k] * Bd[Br + j];
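// scalar OpenMP fallback, same pattern as the double version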
#pragma omp parallel for
for (int j = 0; j < (int)N; j++) {
  for (int i = 0; i < (int)M; i++) {
    int start = (int)rowd[i];
    int end = (int)rowd[i + 1];
    for (int k = start; k < end; k++) {
      tmp += vald[k] * Bd[N * cold[k] + j];