template <typename VEC1, typename VEC2>
void Dmatvec_core(const matrix::CRS<double> &A, const VEC1 &x, VEC2 &y) {
  // dimension check: y = A * x requires y.size() == m and x.size() == n
  assert(A.get_row() == y.size());
  assert(A.get_col() == x.size());
  const double *vald = A.val.data();
  const double *xd = x.data();
  const int *rowd = A.row_ptr.data();
  const int *cold = A.col_ind.data();
  double *yd = y.data();
  const size_t xoffset = x.get_offset();
  const size_t yoffset = y.get_offset();
  if (A.get_device_mem_stat() == true) {
#if MONOLISH_USE_GPU // gpu
    cusparseHandle_t sp_handle;
    cusparseCreate(&sp_handle);
    cudaDeviceSynchronize();

    cusparseMatDescr_t descr = 0;
    cusparseCreateMatDescr(&descr);
    cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);
    cusparseSetMatFillMode(descr, CUSPARSE_FILL_MODE_LOWER);
    cusparseSetMatDiagType(descr, CUSPARSE_DIAG_TYPE_UNIT);

    const cusparseOperation_t trans = CUSPARSE_OPERATION_NON_TRANSPOSE;

    // y = alpha * A * x + beta * y with alpha = 1, beta = 0, i.e. y = A * x
    const int m = A.get_row();
    const int n = A.get_col();
    const double alpha = 1.0;
    const double beta = 0.0;
    const int nnz = A.get_nnz();
#pragma omp target data use_device_ptr(xd, yd, vald, rowd, cold)
    {
      internal::check_CUDA(cusparseDcsrmv(sp_handle, trans, m, n, nnz, &alpha,
                                          descr, vald, rowd, cold,
                                          xd + xoffset, &beta, yd + yoffset));
    }
#else
    throw std::runtime_error("error USE_GPU is false, but gpu_status == true");
#endif
  } else {
#if MONOLISH_USE_MKL
    const int m = A.get_row();
    const int n = A.get_col();
    const double alpha = 1.0;
    const double beta = 0.0;
    // "G__C": general matrix, 0-based (C-style) indexing
    mkl_dcsrmv("N", &m, &n, &alpha, "G__C", vald, cold, rowd, rowd + 1,
               xd + xoffset, &beta, yd + yoffset);
#else // no MKL: plain OpenMP-parallel CSR SpMV
#pragma omp parallel for
    for (int i = 0; i < (int)A.get_row(); i++) {
      double ytmp = 0.0; // accumulate row i of y = A * x
      for (int j = rowd[i]; j < rowd[i + 1]; j++) {
        ytmp += vald[j] * (xd + xoffset)[cold[j]];
      }
      yd[i + yoffset] = ytmp;
    }
#endif
  }
}
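
// How the fallback loop reads CRS storage (illustrative values; standard
// CSR layout, not taken from this file): for
//   A = [ 2 0 ]
//       [ 1 3 ]
// the arrays indexed above are
//   val     = {2.0, 1.0, 3.0}  // nonzero values, row by row
//   col_ind = {0, 0, 1}        // column index of each nonzero
//   row_ptr = {0, 1, 3}        // row i owns entries [row_ptr[i], row_ptr[i+1])
// so row 1 of y accumulates vald[1] * xd[cold[1]] + vald[2] * xd[cold[2]].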
template <typename VEC1, typename VEC2>
void Smatvec_core(const matrix::CRS<float> &A, const VEC1 &x, VEC2 &y) {
  assert(A.get_row() == y.size());
  assert(A.get_col() == x.size());
  const float *vald = A.val.data();
  const float *xd = x.data();
  const int *rowd = A.row_ptr.data();
  const int *cold = A.col_ind.data();
  float *yd = y.data();
  const size_t xoffset = x.get_offset();
  const size_t yoffset = y.get_offset();
  if (A.get_device_mem_stat() == true) {
#if MONOLISH_USE_GPU // gpu
    cusparseHandle_t sp_handle;
    cusparseCreate(&sp_handle);
    cudaDeviceSynchronize();

    cusparseMatDescr_t descr = 0;
    cusparseCreateMatDescr(&descr);
    cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);
    cusparseSetMatFillMode(descr, CUSPARSE_FILL_MODE_LOWER);
    cusparseSetMatDiagType(descr, CUSPARSE_DIAG_TYPE_UNIT);

    const cusparseOperation_t trans = CUSPARSE_OPERATION_NON_TRANSPOSE;

    const int m = A.get_row();
    const int n = A.get_col();
    const int nnz = A.get_nnz();
    const float alpha = 1.0;
    const float beta = 0.0;
#pragma omp target data use_device_ptr(xd, yd, vald, rowd, cold)
    {
      internal::check_CUDA(cusparseScsrmv(sp_handle, trans, m, n, nnz, &alpha,
                                          descr, vald, rowd, cold,
                                          xd + xoffset, &beta, yd + yoffset));
    }
#else
    throw std::runtime_error("error USE_GPU is false, but gpu_status == true");
#endif
  } else {
#if MONOLISH_USE_MKL
    const int m = A.get_row();
    const int n = A.get_col();
    const float alpha = 1.0;
    const float beta = 0.0;
    mkl_scsrmv("N", &m, &n, &alpha, "G__C", vald, cold, rowd, rowd + 1,
               xd + xoffset, &beta, yd + yoffset);
#else // no MKL: plain OpenMP-parallel CSR SpMV
#pragma omp parallel for
    for (int i = 0; i < (int)A.get_row(); i++) {
      float ytmp = 0.0;
      for (int j = rowd[i]; j < rowd[i + 1]; j++) {
        ytmp += vald[j] * (xd + xoffset)[cold[j]];
      }
      yd[i + yoffset] = ytmp;
    }
#endif
  }
}
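
// Caller-side sketch (assumptions: monolish::blas::matvec, COO::insert, the
// COO -> CRS constructor, and the vector(size, value) constructor are taken
// from the surrounding library's public API, not from this file). The public
// matvec wrapper is expected to dispatch to Dmatvec_core / Smatvec_core:
//
//   monolish::matrix::COO<double> coo(2, 2);
//   coo.insert(0, 0, 2.0);
//   coo.insert(1, 1, 3.0);
//   monolish::matrix::CRS<double> A(coo);
//   monolish::vector<double> x(2, 1.0), y(2, 0.0);
//   monolish::blas::matvec(A, x, y); // y = A * x = {2.0, 3.0}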