monolish  0.14.0
MONOlithic LInear equation Solvers for Highly-parallel architecture
crs_matvec.hpp
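For reference, the kernels in this file walk the standard CRS (compressed row storage) arrays val / col_ind / row_ptr. The following minimal sketch is not part of this file; it uses illustrative array names chosen to mirror the members used below and reproduces the same loop structure as the OSS fallback branch.

// Illustrative only: CRS arrays for a small 3x3 matrix
//     | 1 0 2 |
// A = | 0 3 0 |
//     | 4 0 5 |
// val holds the nonzeros row by row, col_ind their column indices,
// and row_ptr[i]..row_ptr[i+1] delimits row i inside val/col_ind.
#include <cstdio>
#include <vector>

int main() {
  std::vector<double> val = {1, 2, 3, 4, 5};
  std::vector<int> col_ind = {0, 2, 1, 0, 2};
  std::vector<int> row_ptr = {0, 2, 3, 5};
  std::vector<double> x = {1, 1, 1}, y(3, 0.0);

  // y = A * x, same traversal as the OpenMP fallback loop below
  for (int i = 0; i < 3; i++) {
    double tmp = 0.0;
    for (int j = row_ptr[i]; j < row_ptr[i + 1]; j++) {
      tmp += val[j] * x[col_ind[j]];
    }
    y[i] = tmp;
  }
  std::printf("y = [%g, %g, %g]\n", y[0], y[1], y[2]); // expected: [3, 3, 9]
}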
#pragma once

namespace monolish {

namespace {
// double ///////////////////
template <typename VEC1, typename VEC2>
void Dmatvec_core(const matrix::CRS<double> &A, const VEC1 &x, VEC2 &y) {
  Logger &logger = Logger::get_instance();
  logger.func_in(monolish_func);

  // dimension check: y (M) = A (M x N) * x (N)
  assert(A.get_row() == y.size());
  assert(A.get_col() == x.size());
  assert(util::is_same_device_mem_stat(A, x, y));

  const double *vald = A.val.data();
  const double *xd = x.data();
  const int *rowd = A.row_ptr.data();
  const int *cold = A.col_ind.data();
  double *yd = y.data();
  const size_t xoffset = x.get_offset();
  const size_t yoffset = y.get_offset();

  if (A.get_device_mem_stat() == true) {
#if MONOLISH_USE_GPU // gpu
    // GPU path: cuSPARSE csrmv on device pointers mapped by OpenMP offload
    cusparseHandle_t sp_handle;
    cusparseCreate(&sp_handle);
    cudaDeviceSynchronize();

    cusparseMatDescr_t descr = 0;
    cusparseCreateMatDescr(&descr);
    cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);
    cusparseSetMatFillMode(descr, CUSPARSE_FILL_MODE_LOWER);
    cusparseSetMatDiagType(descr, CUSPARSE_DIAG_TYPE_UNIT);

    const cusparseOperation_t trans = CUSPARSE_OPERATION_NON_TRANSPOSE;

    const int m = A.get_row();
    const int n = A.get_col();
    const double alpha = 1.0;
    const double beta = 0.0;
    const int nnz = A.get_nnz();

#pragma omp target data use_device_ptr(xd, yd, vald, rowd, cold)
    {
      internal::check_CUDA(cusparseDcsrmv(sp_handle, trans, m, n, nnz, &alpha,
                                          descr, vald, rowd, cold, xd + xoffset,
                                          &beta, yd + yoffset));
    }
#else
    throw std::runtime_error("error USE_GPU is false, but gpu_status == true");
#endif
  } else {
    // CPU path (MKL): 0-based general CRS ("G__C")
#if MONOLISH_USE_MKL
    const int m = A.get_row();
    const int n = A.get_col();
    const double alpha = 1.0;
    const double beta = 0.0;
    mkl_dcsrmv("N", &m, &n, &alpha, "G__C", vald, cold, rowd, rowd + 1,
               xd + xoffset, &beta, yd + yoffset);

    // CPU path (OSS fallback): OpenMP-parallel CRS SpMV
#else
#pragma omp parallel for
    for (int i = 0; i < (int)A.get_row(); i++) {
      double ytmp = 0.0;
      for (int j = rowd[i]; j < rowd[i + 1]; j++) {
        ytmp += vald[j] * (xd + xoffset)[cold[j]];
      }
      yd[i + yoffset] = ytmp;
    }
#endif
  }

  logger.func_out();
}

// float ///////////////////
template <typename VEC1, typename VEC2>
void Smatvec_core(const matrix::CRS<float> &A, const VEC1 &x, VEC2 &y) {
  Logger &logger = Logger::get_instance();
  logger.func_in(monolish_func);

  // dimension check: y (M) = A (M x N) * x (N)
  assert(A.get_row() == y.size());
  assert(A.get_col() == x.size());
  assert(util::is_same_device_mem_stat(A, x, y));

  const float *vald = A.val.data();
  const float *xd = x.data();
  const int *rowd = A.row_ptr.data();
  const int *cold = A.col_ind.data();
  float *yd = y.data();
  const size_t xoffset = x.get_offset();
  const size_t yoffset = y.get_offset();

  if (A.get_device_mem_stat() == true) {
#if MONOLISH_USE_GPU // gpu
    // GPU path: cuSPARSE csrmv on device pointers mapped by OpenMP offload
    cusparseHandle_t sp_handle;
    cusparseCreate(&sp_handle);
    cudaDeviceSynchronize();

    cusparseMatDescr_t descr = 0;
    cusparseCreateMatDescr(&descr);
    cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);
    cusparseSetMatFillMode(descr, CUSPARSE_FILL_MODE_LOWER);
    cusparseSetMatDiagType(descr, CUSPARSE_DIAG_TYPE_UNIT);

    const cusparseOperation_t trans = CUSPARSE_OPERATION_NON_TRANSPOSE;

    const int m = A.get_row();
    const int n = A.get_col();
    const int nnz = A.get_nnz();
    const float alpha = 1.0;
    const float beta = 0.0;

#pragma omp target data use_device_ptr(xd, yd, vald, rowd, cold)
    {
      internal::check_CUDA(cusparseScsrmv(sp_handle, trans, m, n, nnz, &alpha,
                                          descr, vald, rowd, cold, xd + xoffset,
                                          &beta, yd + yoffset));
    }
#else
    throw std::runtime_error("error USE_GPU is false, but gpu_status == true");
#endif
  } else {
    // CPU path (MKL): 0-based general CRS ("G__C")
#if MONOLISH_USE_MKL
    const int m = A.get_row();
    const int n = A.get_col();
    const float alpha = 1.0;
    const float beta = 0.0;
    mkl_scsrmv("N", &m, &n, &alpha, "G__C", vald, cold, rowd, rowd + 1,
               xd + xoffset, &beta, yd + yoffset);

    // CPU path (OSS fallback): OpenMP-parallel CRS SpMV
#else
#pragma omp parallel for
    for (int i = 0; i < (int)A.get_row(); i++) {
      float ytmp = 0.0;
      for (int j = rowd[i]; j < rowd[i + 1]; j++) {
        ytmp += vald[j] * (xd + xoffset)[cold[j]];
      }
      yd[i + yoffset] = ytmp;
    }
#endif
  }

  logger.func_out();
}
} // namespace
} // namespace monolish
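The two cores above live in an anonymous namespace and are normally reached through monolish's public BLAS wrapper rather than called directly. The sketch below is a hypothetical usage example, not part of this file; it assumes the public API named in the comments (monolish_blas.hpp, util::band_matrix, util::send/recv, blas::matvec), whose exact names and signatures may differ between monolish versions.

// Hypothetical usage sketch: computes y = A * x through the public BLAS
// wrapper, which dispatches to Dmatvec_core / Smatvec_core above.
#include "monolish_blas.hpp"

int main() {
  // Small banded test matrix, built as COO and converted to CRS
  monolish::matrix::COO<double> coo =
      monolish::util::band_matrix<double>(100, 100, 2, 1.0, 2.0);
  monolish::matrix::CRS<double> A(coo);

  monolish::vector<double> x(A.get_col(), 1.0); // input vector
  monolish::vector<double> y(A.get_row(), 0.0); // output vector

  monolish::util::send(A, x, y);   // GPU builds: move data to the device so
                                   // the cuSPARSE branch is taken
  monolish::blas::matvec(A, x, y); // y = A * x
  monolish::util::recv(y);         // GPU builds: copy the result back

  y.print_all();
  return 0;
}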