template <typename VEC1, typename VEC2>
void Dmatvec_core(const matrix::CRS<double> &A, const VEC1 &x, VEC2 &y) {
  assert(A.get_row() == y.size());
  assert(A.get_col() == x.size());
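  // Raw CRS arrays and vector buffers; the offsets account for x and y
  // possibly being views into larger containers (hence get_offset()).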
  const double *vald = A.val.data();
  const double *xd = x.data();
  const int *rowd = A.row_ptr.data();
  const int *cold = A.col_ind.data();
  double *yd = y.data();
  const size_t xoffset = x.get_offset();
  const size_t yoffset = y.get_offset();
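  // Dispatch on where A lives: device memory goes through cuSPARSE,
  // host memory goes through MKL when available, else a plain OpenMP loop.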
  if (A.get_device_mem_stat() == true) {
#if MONOLISH_USE_NVIDIA_GPU // gpu
    cusparseHandle_t sp_handle;
    cusparseCreate(&sp_handle);
    cudaDeviceSynchronize();

    cusparseMatDescr_t descr = 0;
    cusparseCreateMatDescr(&descr);
    cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);
    cusparseSetMatFillMode(descr, CUSPARSE_FILL_MODE_LOWER);
    cusparseSetMatDiagType(descr, CUSPARSE_DIAG_TYPE_UNIT);

    const cusparseOperation_t trans = CUSPARSE_OPERATION_NON_TRANSPOSE;

    const int m = A.get_row();
    const int n = A.get_col();
    const double alpha = 1.0;
    const double beta = 0.0;
    const int nnz = A.get_nnz();
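    // use_device_ptr hands cuSPARSE the device addresses of the
    // OpenMP-mapped arrays, so the csrmv call operates on GPU memory.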
#pragma omp target data use_device_ptr(xd, yd, vald, rowd, cold)
    {
      internal::check_CUDA(cusparseDcsrmv(sp_handle, trans, m, n, nnz, &alpha,
                                          descr, vald, rowd, cold, xd + xoffset,
                                          &beta, yd + yoffset));
    }
#else
    throw std::runtime_error(
        "error USE_GPU is false, but gpu_status == true");
#endif
  } else {
#if MONOLISH_USE_MKL
    const int m = A.get_row();
    const int n = A.get_col();
    const double alpha = 1.0;
    const double beta = 0.0;

    sparse_matrix_t mklA;
    struct matrix_descr descrA;
    descrA.type = SPARSE_MATRIX_TYPE_GENERAL;

    mkl_sparse_d_create_csr(&mklA, SPARSE_INDEX_BASE_ZERO, m, n, (int *)rowd,
                            (int *)rowd + 1, (int *)cold, (double *)vald);

    mkl_sparse_d_mv(SPARSE_OPERATION_NON_TRANSPOSE, alpha, mklA, descrA,
                    xd + xoffset, beta, yd + yoffset);

#else
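    // Fallback: naive row-parallel CRS SpMV, one output element per row.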
#pragma omp parallel for
    for (int i = 0; i < (int)A.get_row(); i++) {
      double ytmp = 0.0;
      for (int j = rowd[i]; j < rowd[i + 1]; j++) {
        ytmp += vald[j] * (xd + xoffset)[cold[j]];
      }
      yd[i + yoffset] = ytmp;
    }
#endif
  }
}
template <typename VEC1, typename VEC2>
void Smatvec_core(const matrix::CRS<float> &A, const VEC1 &x, VEC2 &y) {
  assert(A.get_row() == y.size());
  assert(A.get_col() == x.size());
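  // Same structure as Dmatvec_core, using the single-precision entry points
  // (cusparseScsrmv / mkl_sparse_s_*).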
  const float *vald = A.val.data();
  const float *xd = x.data();
  const int *rowd = A.row_ptr.data();
  const int *cold = A.col_ind.data();
  float *yd = y.data();
  const size_t xoffset = x.get_offset();
  const size_t yoffset = y.get_offset();
  if (A.get_device_mem_stat() == true) {
#if MONOLISH_USE_NVIDIA_GPU // gpu
    cusparseHandle_t sp_handle;
    cusparseCreate(&sp_handle);
    cudaDeviceSynchronize();

    cusparseMatDescr_t descr = 0;
    cusparseCreateMatDescr(&descr);
    cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);
    cusparseSetMatFillMode(descr, CUSPARSE_FILL_MODE_LOWER);
    cusparseSetMatDiagType(descr, CUSPARSE_DIAG_TYPE_UNIT);

    const cusparseOperation_t trans = CUSPARSE_OPERATION_NON_TRANSPOSE;

    const int m = A.get_row();
    const int n = A.get_col();
    const int nnz = A.get_nnz();
    const float alpha = 1.0;
    const float beta = 0.0;
#pragma omp target data use_device_ptr(xd, yd, vald, rowd, cold)
    {
      internal::check_CUDA(cusparseScsrmv(sp_handle, trans, m, n, nnz, &alpha,
                                          descr, vald, rowd, cold, xd + xoffset,
                                          &beta, yd + yoffset));
    }
#else
    throw std::runtime_error(
        "error USE_GPU is false, but gpu_status == true");
#endif
  } else {
#if MONOLISH_USE_MKL
    const int m = A.get_row();
    const int n = A.get_col();
    const float alpha = 1.0;
    const float beta = 0.0;

    sparse_matrix_t mklA;
    struct matrix_descr descrA;
    descrA.type = SPARSE_MATRIX_TYPE_GENERAL;

    mkl_sparse_s_create_csr(&mklA, SPARSE_INDEX_BASE_ZERO, m, n, (int *)rowd,
                            (int *)rowd + 1, (int *)cold, (float *)vald);

    mkl_sparse_s_mv(SPARSE_OPERATION_NON_TRANSPOSE, alpha, mklA, descrA,
                    xd + xoffset, beta, yd + yoffset);

#else
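    // Fallback: same naive row-parallel CRS SpMV as the double-precision path.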
#pragma omp parallel for
    for (int i = 0; i < (int)A.get_row(); i++) {
      float ytmp = 0.0;
      for (int j = rowd[i]; j < rowd[i + 1]; j++) {
        ytmp += vald[j] * (xd + xoffset)[cold[j]];
      }
      yd[i + yoffset] = ytmp;
    }
#endif
  }
}
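
// A minimal sketch (illustrative, not part of the original listing) of how
// these cores are typically exposed: precision-specific overloads that
// forward to the matching *_core template. The wrapper name "matvec" and the
// vector type below are assumptions.
//
//   void matvec(const matrix::CRS<double> &A, const vector<double> &x,
//               vector<double> &y) {
//     Dmatvec_core(A, x, y);
//   }
//
//   void matvec(const matrix::CRS<float> &A, const vector<float> &x,
//               vector<float> &y) {
//     Smatvec_core(A, x, y);
//   }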