monolish  0.14.2
MONOlithic LInear equation Solvers for Highly-parallel architecture
crs_matvec.hpp
Go to the documentation of this file.
1 #pragma once
2 
3 namespace monolish {
4 
5 namespace {
6 // double ///////////////////
7 template <typename VEC1, typename VEC2>
8 void Dmatvec_core(const matrix::CRS<double> &A, const VEC1 &x, VEC2 &y) {
9  Logger &logger = Logger::get_instance();
10  logger.func_in(monolish_func);
11 
12  // err, M = MN * N
13  assert(A.get_row() == y.size());
14  assert(A.get_col() == x.size());
15  assert(util::is_same_device_mem_stat(A, x, y));
16 
17  const double *vald = A.val.data();
18  const double *xd = x.data();
19  const int *rowd = A.row_ptr.data();
20  const int *cold = A.col_ind.data();
21  double *yd = y.data();
22  const size_t xoffset = x.get_offset();
23  const size_t yoffset = y.get_offset();
24 
25  if (A.get_device_mem_stat() == true) {
26 #if MONOLISH_USE_NVIDIA_GPU // gpu
27  cusparseHandle_t sp_handle;
28  cusparseCreate(&sp_handle);
29  cudaDeviceSynchronize();
30 
31  cusparseMatDescr_t descr = 0;
32  cusparseCreateMatDescr(&descr);
33  cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);
34  cusparseSetMatFillMode(descr, CUSPARSE_FILL_MODE_LOWER);
35  cusparseSetMatDiagType(descr, CUSPARSE_DIAG_TYPE_UNIT);
36 
37  const cusparseOperation_t trans = CUSPARSE_OPERATION_NON_TRANSPOSE;
38 
39  const int m = A.get_row();
40  const int n = A.get_col();
41  const double alpha = 1.0;
42  const double beta = 0.0;
43  const int nnz = A.get_nnz();
44 
45 #pragma omp target data use_device_ptr(xd, yd, vald, rowd, cold)
46  {
47  internal::check_CUDA(cusparseDcsrmv(sp_handle, trans, m, n, nnz, &alpha,
48  descr, vald, rowd, cold, xd + xoffset,
49  &beta, yd + yoffset));
50  }
51 #else
52  throw std::runtime_error("error USE_GPU is false, but gpu_status == true");
53 #endif
54  } else {
55  // MKL
56 #if MONOLISH_USE_MKL
57  int m = A.get_row();
58  int n = A.get_col();
59  const double alpha = 1.0;
60  const double beta = 0.0;
61 
62  sparse_matrix_t mklA;
63  struct matrix_descr descrA;
64  descrA.type = SPARSE_MATRIX_TYPE_GENERAL;
65 
66  mkl_sparse_d_create_csr(&mklA, SPARSE_INDEX_BASE_ZERO, m, n, (int *)rowd,
67  (int *)rowd + 1, (int *)cold, (double *)vald);
68  // mkl_sparse_set_mv_hint (mklA, SPARSE_OPERATION_NON_TRANSPOSE, descrA,
69  // 100); // We haven't seen any performance improvement by using hint.
70  mkl_sparse_d_mv(SPARSE_OPERATION_NON_TRANSPOSE, alpha, mklA, descrA,
71  xd + xoffset, beta, yd + yoffset);
72 
73  // OSS
74 #else
75 #pragma omp parallel for
76  for (int i = 0; i < (int)A.get_row(); i++) {
77  double ytmp = 0.0;
78  for (int j = rowd[i]; j < rowd[i + 1]; j++) {
79  ytmp += vald[j] * (xd + xoffset)[cold[j]];
80  }
81  yd[i + yoffset] = ytmp;
82  }
83 #endif
84  }
85 
86  logger.func_out();
87 }
88 
89 // float ///////////////////
90 template <typename VEC1, typename VEC2>
91 void Smatvec_core(const matrix::CRS<float> &A, const VEC1 &x, VEC2 &y) {
92  Logger &logger = Logger::get_instance();
93  logger.func_in(monolish_func);
94 
95  // err, M = MN * N
96  assert(A.get_row() == y.size());
97  assert(A.get_col() == x.size());
98  assert(util::is_same_device_mem_stat(A, x, y));
99 
100  const float *vald = A.val.data();
101  const float *xd = x.data();
102  const int *rowd = A.row_ptr.data();
103  const int *cold = A.col_ind.data();
104  float *yd = y.data();
105  const size_t xoffset = x.get_offset();
106  const size_t yoffset = y.get_offset();
107 
108  if (A.get_device_mem_stat() == true) {
109 #if MONOLISH_USE_NVIDIA_GPU // gpu
110  cusparseHandle_t sp_handle;
111  cusparseCreate(&sp_handle);
112  cudaDeviceSynchronize();
113 
114  cusparseMatDescr_t descr = 0;
115  cusparseCreateMatDescr(&descr);
116  cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);
117  cusparseSetMatFillMode(descr, CUSPARSE_FILL_MODE_LOWER);
118  cusparseSetMatDiagType(descr, CUSPARSE_DIAG_TYPE_UNIT);
119 
120  const cusparseOperation_t trans = CUSPARSE_OPERATION_NON_TRANSPOSE;
121 
122  const int m = A.get_row();
123  const int n = A.get_col();
124  const int nnz = A.get_nnz();
125  const float alpha = 1.0;
126  const float beta = 0.0;
127 
128 #pragma omp target data use_device_ptr(xd, yd, vald, rowd, cold)
129  {
130  internal::check_CUDA(cusparseScsrmv(sp_handle, trans, m, n, nnz, &alpha,
131  descr, vald, rowd, cold, xd + xoffset,
132  &beta, yd + yoffset));
133  }
134 #else
135  throw std::runtime_error("error USE_GPU is false, but gpu_status == true");
136 #endif
137  } else {
138  // MKL
139 #if MONOLISH_USE_MKL
140  const int m = A.get_row();
141  const int n = A.get_col();
142  const float alpha = 1.0;
143  const float beta = 0.0;
144 
145  sparse_matrix_t mklA;
146  struct matrix_descr descrA;
147  descrA.type = SPARSE_MATRIX_TYPE_GENERAL;
148 
149  mkl_sparse_s_create_csr(&mklA, SPARSE_INDEX_BASE_ZERO, m, n, (int *)rowd,
150  (int *)rowd + 1, (int *)cold, (float *)vald);
151  // mkl_sparse_set_mv_hint (mklA, SPARSE_OPERATION_NON_TRANSPOSE, descrA,
152  // 100); // We haven't seen any performance improvement by using hint.
153  mkl_sparse_s_mv(SPARSE_OPERATION_NON_TRANSPOSE, alpha, mklA, descrA,
154  xd + xoffset, beta, yd + yoffset);
155 
156  // OSS
157 #else
158 #pragma omp parallel for
159  for (int i = 0; i < (int)A.get_row(); i++) {
160  float ytmp = 0.0;
161  for (int j = rowd[i]; j < rowd[i + 1]; j++) {
162  ytmp += vald[j] * (xd + xoffset)[cold[j]];
163  }
164  yd[i + yoffset] = ytmp;
165  }
166 #endif
167  }
168 
169  logger.func_out();
170 }
171 } // namespace
172 } // namespace monolish
monolish_func
#define monolish_func
Definition: monolish_logger.hpp:9
monolish::util::is_same_device_mem_stat
bool is_same_device_mem_stat(const T &arg1, const U &arg2)
compare same device memory status
Definition: monolish_common.hpp:454
monolish
Definition: monolish_matrix_blas.hpp:10
monolish::Logger::get_instance
static Logger & get_instance()
Definition: monolish_logger.hpp:42