7 template <
typename VEC1, 
typename VEC2>
 
    8 void Dmatvec_core(
const matrix::CRS<double> &A, 
const VEC1 &x, VEC2 &y) {
 
   13   assert(A.get_row() == y.size());
 
   14   assert(A.get_col() == x.size());
 
   17   const double *vald = A.val.data();
 
   18   const double *xd = x.data();
 
   19   const int *rowd = A.row_ptr.data();
 
   20   const int *cold = A.col_ind.data();
 
   21   double *yd = y.data();
 
   22   const size_t xoffset = x.get_offset();
 
   23   const size_t yoffset = y.get_offset();
 
   25   if (A.get_device_mem_stat() == 
true) {
 
   26 #if MONOLISH_USE_GPU // gpu 
   27     cusparseHandle_t sp_handle;
 
   28     cusparseCreate(&sp_handle);
 
   29     cudaDeviceSynchronize();
 
   31     cusparseMatDescr_t descr = 0;
 
   32     cusparseCreateMatDescr(&descr);
 
   33     cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);
 
   34     cusparseSetMatFillMode(descr, CUSPARSE_FILL_MODE_LOWER);
 
   35     cusparseSetMatDiagType(descr, CUSPARSE_DIAG_TYPE_UNIT);
 
   37     const cusparseOperation_t trans = CUSPARSE_OPERATION_NON_TRANSPOSE;
 
   39     const int m = A.get_row();
 
   40     const int n = A.get_col();
 
   41     const double alpha = 1.0;
 
   42     const double beta = 0.0;
 
   43     const int nnz = A.get_nnz();
 
   45 #pragma omp target data use_device_ptr(xd, yd, vald, rowd, cold) 
   47       internal::check_CUDA(cusparseDcsrmv(sp_handle, trans, m, n, nnz, &alpha,
 
   48                                           descr, vald, rowd, cold, xd + xoffset,
 
   49                                           &beta, yd + yoffset));
 
   52     throw std::runtime_error(
"error USE_GPU is false, but gpu_status == true");
 
   57     const int m = A.get_row();
 
   58     const int n = A.get_col();
 
   59     const double alpha = 1.0;
 
   60     const double beta = 0.0;
 
   61     mkl_dcsrmv(
"N", &m, &n, &alpha, 
"G__C", vald, cold, rowd, rowd + 1,
 
   62                xd + xoffset, &beta, yd + yoffset);
 
   66 #pragma omp parallel for 
   67     for (
int i = 0; i < (int)A.get_row(); i++) {
 
   69       for (
int j = rowd[i]; j < rowd[i + 1]; j++) {
 
   70         ytmp += vald[j] * (xd + xoffset)[cold[j]];
 
   72       yd[i + yoffset] = ytmp;
 
   81 template <
typename VEC1, 
typename VEC2>
 
   82 void Smatvec_core(
const matrix::CRS<float> &A, 
const VEC1 &x, VEC2 &y) {
 
   87   assert(A.get_row() == y.size());
 
   88   assert(A.get_col() == x.size());
 
   91   const float *vald = A.val.data();
 
   92   const float *xd = x.data();
 
   93   const int *rowd = A.row_ptr.data();
 
   94   const int *cold = A.col_ind.data();
 
   96   const size_t xoffset = x.get_offset();
 
   97   const size_t yoffset = y.get_offset();
 
   99   if (A.get_device_mem_stat() == 
true) {
 
  100 #if MONOLISH_USE_GPU // gpu 
  101     cusparseHandle_t sp_handle;
 
  102     cusparseCreate(&sp_handle);
 
  103     cudaDeviceSynchronize();
 
  105     cusparseMatDescr_t descr = 0;
 
  106     cusparseCreateMatDescr(&descr);
 
  107     cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);
 
  108     cusparseSetMatFillMode(descr, CUSPARSE_FILL_MODE_LOWER);
 
  109     cusparseSetMatDiagType(descr, CUSPARSE_DIAG_TYPE_UNIT);
 
  111     const cusparseOperation_t trans = CUSPARSE_OPERATION_NON_TRANSPOSE;
 
  113     const int m = A.get_row();
 
  114     const int n = A.get_col();
 
  115     const int nnz = A.get_nnz();
 
  116     const float alpha = 1.0;
 
  117     const float beta = 0.0;
 
  119 #pragma omp target data use_device_ptr(xd, yd, vald, rowd, cold) 
  121       internal::check_CUDA(cusparseScsrmv(sp_handle, trans, m, n, nnz, &alpha,
 
  122                                           descr, vald, rowd, cold, xd + xoffset,
 
  123                                           &beta, yd + yoffset));
 
  126     throw std::runtime_error(
"error USE_GPU is false, but gpu_status == true");
 
  131     const int m = A.get_row();
 
  132     const int n = A.get_col();
 
  133     const float alpha = 1.0;
 
  134     const float beta = 0.0;
 
  135     mkl_scsrmv(
"N", &m, &n, &alpha, 
"G__C", vald, cold, rowd, rowd + 1,
 
  136                xd + xoffset, &beta, yd + yoffset);
 
  140 #pragma omp parallel for 
  141     for (
int i = 0; i < (int)A.get_row(); i++) {
 
  143       for (
int j = rowd[i]; j < rowd[i + 1]; j++) {
 
  144         ytmp += vald[j] * (xd + xoffset)[cold[j]];
 
  146       yd[i + yoffset] = ytmp;