monolish  0.14.0
MONOlithic LInear equation Solvers for Highly-parallel architecture
lobpcg_standard.cpp
#include "../../include/monolish_blas.hpp"
#include "../../include/monolish_eigen.hpp"
#include "../../include/monolish_solver.hpp"
#include "../../include/monolish_vml.hpp"
#include "../internal/lapack/monolish_lapack.hpp"
#include "../internal/monolish_internal.hpp"

namespace monolish {

template <typename MATRIX, typename T>
int standard_eigen::LOBPCG<MATRIX, T>::monolish_LOBPCG(
    MATRIX &A, vector<T> &l, matrix::Dense<T> &xinout) {
  Logger &logger = Logger::get_instance();
  logger.solver_in(monolish_func);

  // size consistency check
  // the size of xinout is (m, A.get_col())
  if (A.get_col() != xinout.get_col()) {
    throw std::runtime_error("error, A.col != x.col");
  }
  if (l.size() != xinout.get_row()) {
    throw std::runtime_error("error, l.size != x.row");
  }

  this->precond.create_precond(A);

  // Algorithm following DOI:10.1007/978-3-319-69953-0_14
  // m is the number of eigenpairs to compute
  const std::size_t m = l.size();
  // n is the dimension of the original space
  const std::size_t n = A.get_col();
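
  // Note: for each of the m eigenpairs, LOBPCG searches a small subspace
  // spanned by the current residual w, the current approximation x, and the
  // conjugate direction p; the projection onto { w, x, p } is solved below
  // as a 3m x 3m dense generalized eigenproblem (Rayleigh-Ritz).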

  // Internal memory
  // currently 6m+(6m+3m+4) vectors are used
  matrix::Dense<T> wxp(3 * m, n);
  matrix::Dense<T> WXP(3 * m, n);
  // TODO: wxp_p and WXP_p are not needed when m=1
  matrix::Dense<T> wxp_p(3 * m, n);
  matrix::Dense<T> WXP_p(3 * m, n);
  // TODO: twxp is not needed when transpose matmul is supported
  matrix::Dense<T> twxp(n, 3 * m);
  // TODO: vtmp1 and vtmp2 are not needed when in-place preconditioner is used
  // and view1D is supported by preconditioners
  monolish::vector<T> vtmp1(n);
  monolish::vector<T> vtmp2(n);
  // TODO: zero is not needed when view1D supports fill(T val)
  monolish::vector<T> zero(n, 0.0);
  // TODO: r is not needed when util::random_vector supports view1D
  monolish::vector<T> r(n);

  // view1Ds for calculation
  std::vector<view1D<matrix::Dense<T>, T>> w;
  std::vector<view1D<matrix::Dense<T>, T>> x;
  std::vector<view1D<matrix::Dense<T>, T>> p;
  std::vector<view1D<matrix::Dense<T>, T>> W;
  std::vector<view1D<matrix::Dense<T>, T>> X;
  std::vector<view1D<matrix::Dense<T>, T>> P;
  std::vector<view1D<matrix::Dense<T>, T>> wp;
  std::vector<view1D<matrix::Dense<T>, T>> xp;
  std::vector<view1D<matrix::Dense<T>, T>> pp;
  std::vector<view1D<matrix::Dense<T>, T>> Wp;
  std::vector<view1D<matrix::Dense<T>, T>> Xp;
  std::vector<view1D<matrix::Dense<T>, T>> Pp;
  for (std::size_t i = 0; i < m; ++i) {
    w.push_back(view1D<matrix::Dense<T>, T>(wxp, i * n, (i + 1) * n));
    x.push_back(view1D<matrix::Dense<T>, T>(wxp, (m + i) * n, (m + i + 1) * n));
    p.push_back(
        view1D<matrix::Dense<T>, T>(wxp, (2 * m + i) * n, (2 * m + i + 1) * n));
    W.push_back(view1D<matrix::Dense<T>, T>(WXP, i * n, (i + 1) * n));
    X.push_back(view1D<matrix::Dense<T>, T>(WXP, (m + i) * n, (m + i + 1) * n));
    P.push_back(
        view1D<matrix::Dense<T>, T>(WXP, (2 * m + i) * n, (2 * m + i + 1) * n));
    wp.push_back(view1D<matrix::Dense<T>, T>(wxp_p, i * n, (i + 1) * n));
    xp.push_back(
        view1D<matrix::Dense<T>, T>(wxp_p, (m + i) * n, (m + i + 1) * n));
    pp.push_back(view1D<matrix::Dense<T>, T>(wxp_p, (2 * m + i) * n,
                                             (2 * m + i + 1) * n));
    Wp.push_back(view1D<matrix::Dense<T>, T>(WXP_p, i * n, (i + 1) * n));
    Xp.push_back(
        view1D<matrix::Dense<T>, T>(WXP_p, (m + i) * n, (m + i + 1) * n));
    Pp.push_back(view1D<matrix::Dense<T>, T>(WXP_p, (2 * m + i) * n,
                                             (2 * m + i + 1) * n));
  }
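
  // Layout note: wxp packs the m residuals w, then the m approximations x,
  // then the m directions p as consecutive length-n rows; WXP holds the
  // A-multiplied counterparts (W = A w, etc.), and wxp_p / WXP_p keep the
  // previous iteration's values, accessed through the *p views.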

  if (this->get_initvec_scheme() == monolish::solver::initvec_scheme::RANDOM) {
    // Preparing initial input to be orthonormal to each other
    for (std::size_t i = 0; i < m; ++i) {
      T minval = 0.0;
      T maxval = 1.0;
      util::random_vector(r, minval, maxval);
      blas::copy(r, x[i]);
      T norm;
      blas::nrm2(x[i], norm);
      blas::scal(1.0 / norm, x[i]);
    }
  } else { // initvec_scheme::USER
    // copy user-supplied xinout to x
    for (std::size_t i = 0; i < m; ++i) {
      view1D<matrix::Dense<T>, T> xinout_i(xinout, i * n, (i + 1) * n);
      blas::copy(xinout_i, x[i]);
    }
  }
  if (A.get_device_mem_stat() == true) {
    monolish::util::send(wxp, WXP, wxp_p, WXP_p, twxp, vtmp1, vtmp2, zero,
                         xinout);
  }

  for (std::size_t i = 0; i < m; ++i) {
    // X = A x
    blas::matvec(A, x[i], X[i]);
    // mu = (x, X)
    T mu = blas::dot(x[i], X[i]);
    // w = X - mu x, normalize
    blas::axpyz(-mu, x[i], X[i], w[i]);
    T norm = blas::nrm2(w[i]);
    blas::scal(1.0 / norm, w[i]);
    // W = A w
    blas::matvec(A, w[i], W[i]);
  }
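
  // In exact arithmetic the loop above computes, for each i,
  //   mu = x^T A x                        (Rayleigh quotient, ||x|| = 1)
  //   w  = (A x - mu x) / ||A x - mu x||  (normalized residual)
  // so the first search space is { w, x }; p enters from the second
  // iteration on.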

  // B singular flag
  bool is_singular = false;

  matrix::Dense<T> Sam(3 * m, 3 * m);
  matrix::Dense<T> Sbm(3 * m, 3 * m);
  vector<T> lambda(3 * m);
  if (A.get_device_mem_stat() == true) {
    monolish::util::send(Sam, Sbm, lambda);
  }
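
  // Sam and Sbm receive the projected matrices {w,x,p}^T A {w,x,p} and
  // {w,x,p}^T {w,x,p}; lambda receives all 3m Ritz values in ascending
  // order, of which the smallest m are kept each iteration.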

  for (std::size_t iter = 0; iter < this->get_maxiter(); iter++) {
    if (A.get_device_mem_stat() == true) {
      monolish::util::send(Sam, lambda);
    }
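
    // On the first iteration (and after a singular restart) p has not been
    // formed yet, so its zero-initialized block would make Sb singular;
    // shrink the working sizes so that only the 2m vectors { w, x } enter
    // the projection.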
    if (iter == 0 || is_singular) {
      // It is intended not to resize the actual memory layout
      // and just use the beginning part of it
      // (i.e. not touching {Sam,Sbm,wxp,twxp,WXP}.{val,nnz})
      Sam.set_col(2 * m);
      Sam.set_row(2 * m);
      Sbm.set_col(2 * m);
      Sbm.set_row(2 * m);
      wxp.set_row(2 * m);
      wxp_p.set_row(2 * m);
      twxp.set_col(2 * m);
      WXP.set_row(2 * m);
      WXP_p.set_row(2 * m);
    }
    if (A.get_device_mem_stat() == true) {
      wxp.nonfree_recv();
      twxp.device_free();
    }
    twxp.transpose(wxp);
    if (A.get_device_mem_stat() == true) {
      twxp.send();
    }
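
    // twxp = wxp^T is formed on the host (recv, transpose, send) because a
    // transposed matmul is not available yet; see the TODO at the top.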
    // Sa = { w, x, p }^T { W, X, P }
    //    = { w, x, p }^T A { w, x, p }
    blas::matmul(WXP, twxp, Sam);
    // Sb = { w, x, p }^T { w, x, p }
    blas::matmul(wxp, twxp, Sbm);

    // workaround on GPU; synchronize Sam, Sbm
    if (A.get_device_mem_stat() == true) {
      Sam.nonfree_recv();
      Sbm.nonfree_recv();
    }
    // Generalized eigendecomposition
    //   Sa v = lambda Sb v
    const char jobz = 'V';
    const char uplo = 'L';
    int info = internal::lapack::sygvd(Sam, Sbm, lambda, 1, &jobz, &uplo);
    // Per LAPACK xSYGVD, info = N + i (here N = 3m, or 2m on a restarted
    // step) means the leading minor of order i of B is not positive
    // definite. lbound < info <= ubound therefore points at the last block
    // of the basis, i.e. the basis vectors became linearly dependent
    // (as in step 0), so we set the is_singular flag and redo this
    // iteration with the reduced subspace.
    int lbound = 5 * m;
    int ubound = 6 * m;
    if (iter == 0 || is_singular) {
      lbound = 3 * m;
      ubound = 4 * m;
    }
    if (lbound < info && info <= ubound) {
      if (this->get_print_rhistory()) {
        *this->rhistory_stream << iter + 1 << "\t"
                               << "singular; restart the step" << std::endl;
      }
      is_singular = true;
      iter--;
      continue;
    } else if (info != 0 && info < static_cast<int>(m)) {
      std::string s = "sygvd returns ";
      s += std::to_string(info);
      s += ": internal LAPACK sygvd returned error";
      throw std::runtime_error(s);
    }
    if (A.get_device_mem_stat() == true) {
      monolish::util::recv(Sam, lambda);
    }

    // prepare previous step results
    blas::copy(wxp, wxp_p);
    blas::copy(WXP, WXP_p);
    vector<T> residual(m);
    for (std::size_t i = 0; i < m; ++i) {
      // copy current eigenvalue results
      l[i] = lambda[i];

      // extract eigenvector of Sa v = lambda Sb v
      vector<T> b(3 * m);
      if (iter == 0 || is_singular) {
        b.resize(2 * m);
      }
      Sam.row(i, b);

      if (iter == 0 || is_singular) {
        b.resize(3 * m);
        // b[2m]...b[3m-1] is not calculated so explicitly set to 0
        for (std::size_t j = 2 * m; j < 3 * m; ++j) {
          b[j] = 0.0;
        }
      }
      blas::copy(zero, p[i]);
      blas::copy(zero, x[i]);
      blas::copy(zero, P[i]);
      blas::copy(zero, X[i]);
      for (std::size_t j = 0; j < m; ++j) {
        // x[i] = \sum_j b[j] w[j] + b[m+j] x[j] + b[2m+j] p[j], normalize
        // p[i] = \sum_j b[j] w[j] + b[2m+j] p[j], normalize
        blas::axpy(b[j], wp[j], p[i]);
        blas::axpy(b[2 * m + j], pp[j], p[i]);
        blas::axpy(b[j], wp[j], x[i]);
        blas::axpy(b[2 * m + j], pp[j], x[i]);
        blas::axpy(b[m + j], xp[j], x[i]);

        // X[i] = \sum_j b[j] W[j] + b[m+j] X[j] + b[2m+j] P[j], normalize with x
        // P[i] = \sum_j b[j] W[j] + b[2m+j] P[j], normalize with p
        blas::axpy(b[j], Wp[j], P[i]);
        blas::axpy(b[2 * m + j], Pp[j], P[i]);
        blas::axpy(b[j], Wp[j], X[i]);
        blas::axpy(b[2 * m + j], Pp[j], X[i]);
        blas::axpy(b[m + j], Xp[j], X[i]);
      }
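
      // The Ritz vector x[i] combines all three blocks of the small
      // eigenvector b, while the new direction p[i] deliberately omits the
      // b[m+j] x-terms (the usual LOBPCG update). X[i] and P[i] are built
      // from the same coefficients applied to the A-multiplied blocks, so
      // no extra matvec is needed here.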

      T normp;
      blas::nrm2(p[i], normp);
      blas::scal(1.0 / normp, p[i]);
      blas::scal(1.0 / normp, P[i]);
      T normx;
      blas::nrm2(x[i], normx);
      blas::scal(1.0 / normx, x[i]);
      blas::scal(1.0 / normx, X[i]);

      // w[i] = X[i] - lambda x[i]
      blas::axpyz(-l[i], x[i], X[i], w[i]);
      // apply preconditioner
      blas::copy(w[i], vtmp2);
      this->precond.apply_precond(vtmp2, vtmp1);
      blas::copy(vtmp1, w[i]);
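
      // i.e. w[i] = M^{-1} (A x[i] - l[i] x[i]), the preconditioned
      // residual; the round-trip through vtmp2/vtmp1 exists only because
      // the preconditioner interface cannot take view1D arguments yet
      // (see the TODO above).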

      // residual calculation
      blas::nrm2(w[i], residual[i]);
      if (this->get_print_rhistory()) {
        *this->rhistory_stream << iter + 1 << "\t" << i << "\t"
                               << std::scientific << residual[i] << std::endl;
      }
      if (std::isnan(residual[i])) {
        for (std::size_t j = 0; j < m; ++j) {
          view1D<matrix::Dense<T>, T> xinout_j(xinout, j * n, (j + 1) * n);
          blas::copy(x[j], xinout_j);
        }
        logger.solver_out();
        return MONOLISH_SOLVER_RESIDUAL_NAN;
      }
      // normalize w
      blas::scal(1.0 / residual[i], w[i]);
      // W = A w
      blas::matvec(A, w[i], W[i]);
    }

    // early return when the residual is small enough
    if (vml::max(residual) < this->get_tol() &&
        this->get_miniter() < iter + 1) {
      for (std::size_t i = 0; i < m; ++i) {
        view1D<matrix::Dense<T>, T> xinout_i(xinout, i * n, (i + 1) * n);
        blas::copy(x[i], xinout_i);
      }
      logger.solver_out();
      return MONOLISH_SOLVER_SUCCESS;
    }
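
    // Convergence is declared when the largest of the m residual norms
    // drops below tol, and never before miniter iterations have run.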

    // reset the is_singular flag and restore the full 3m working sizes
    if (iter == 0 || is_singular) {
      is_singular = false;
      Sam.set_row(3 * m);
      Sam.set_col(3 * m);
      Sbm.set_row(3 * m);
      Sbm.set_col(3 * m);
      wxp.set_row(3 * m);
      wxp_p.set_row(3 * m);
      twxp.set_col(3 * m);
      WXP.set_row(3 * m);
      WXP_p.set_row(3 * m);
    }
  }
  for (std::size_t i = 0; i < m; ++i) {
    view1D<matrix::Dense<T>, T> xinout_i(xinout, i * n, (i + 1) * n);
    blas::copy(x[i], xinout_i);
  }
  logger.solver_out();
  return MONOLISH_SOLVER_MAXITER;
}

template int
standard_eigen::LOBPCG<matrix::Dense<double>, double>::monolish_LOBPCG(
    matrix::Dense<double> &A, vector<double> &l, matrix::Dense<double> &x);
template int
standard_eigen::LOBPCG<matrix::Dense<float>, float>::monolish_LOBPCG(
    matrix::Dense<float> &A, vector<float> &l, matrix::Dense<float> &x);
template int
standard_eigen::LOBPCG<matrix::CRS<double>, double>::monolish_LOBPCG(
    matrix::CRS<double> &A, vector<double> &l, matrix::Dense<double> &x);
template int standard_eigen::LOBPCG<matrix::CRS<float>, float>::monolish_LOBPCG(
    matrix::CRS<float> &A, vector<float> &l, matrix::Dense<float> &x);
// template int
// standard_eigen::LOBPCG<matrix::LinearOperator<double>,
// double>::monolish_LOBPCG(
//     matrix::LinearOperator<double> &A, double &l, vector<double> &x);
// template int
// standard_eigen::LOBPCG<matrix::LinearOperator<float>,
// float>::monolish_LOBPCG(
//     matrix::LinearOperator<float> &A, float &l, vector<float> &x);

template <typename MATRIX, typename T>
int standard_eigen::LOBPCG<MATRIX, T>::solve(MATRIX &A, vector<T> &l,
                                             matrix::Dense<T> &x) {
  Logger &logger = Logger::get_instance();
  logger.solver_in(monolish_func);

  int ret = 0;
  if (this->get_lib() == 0) {
    ret = monolish_LOBPCG(A, l, x);
  }

  logger.solver_out();
  return ret; // err code
}
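
// Note: solve() dispatches on get_lib(); only lib == 0 (the implementation
// above) is handled, so any other value returns 0 without computing.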

template int standard_eigen::LOBPCG<matrix::Dense<double>, double>::solve(
    matrix::Dense<double> &A, vector<double> &l, matrix::Dense<double> &x);
template int standard_eigen::LOBPCG<matrix::Dense<float>, float>::solve(
    matrix::Dense<float> &A, vector<float> &l, matrix::Dense<float> &x);
template int standard_eigen::LOBPCG<matrix::CRS<double>, double>::solve(
    matrix::CRS<double> &A, vector<double> &l, matrix::Dense<double> &x);
template int standard_eigen::LOBPCG<matrix::CRS<float>, float>::solve(
    matrix::CRS<float> &A, vector<float> &l, matrix::Dense<float> &x);
// template int
// standard_eigen::LOBPCG<matrix::LinearOperator<double>, double>::solve(
//     matrix::LinearOperator<double> &A, double &l, vector<double> &x);
// template int
// standard_eigen::LOBPCG<matrix::LinearOperator<float>, float>::solve(
//     matrix::LinearOperator<float> &A, float &l, vector<float> &x);

} // namespace monolish
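
A minimal usage sketch (not part of this file), assuming the solver
base-class setters set_tol()/set_maxiter() and the matrix::COO builder
insert(i, j, val) behave as documented elsewhere in monolish; it computes
the two smallest eigenpairs of a 1D Laplacian:

#include "monolish_eigen.hpp"

int main() {
  const std::size_t n = 100, m = 2;

  // assemble a tridiagonal (1D Laplacian) test matrix in COO format
  // (depending on the monolish version, a sort/merge step may be needed
  // before the CRS conversion)
  monolish::matrix::COO<double> coo(n, n);
  for (std::size_t i = 0; i < n; ++i) {
    coo.insert(i, i, 2.0);
    if (i + 1 < n) {
      coo.insert(i, i + 1, -1.0);
      coo.insert(i + 1, i, -1.0);
    }
  }
  monolish::matrix::CRS<double> A(coo);

  monolish::vector<double> lambda(m);      // eigenvalues on output
  monolish::matrix::Dense<double> x(m, n); // row i = i-th eigenvector

  monolish::standard_eigen::LOBPCG<monolish::matrix::CRS<double>, double>
      solver;
  solver.set_tol(1.0e-8);
  solver.set_maxiter(10000);

  int ret = solver.solve(A, lambda, x);
  return ret == MONOLISH_SOLVER_SUCCESS ? 0 : 1;
}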
monolish_func
#define monolish_func
Definition: monolish_logger.hpp:9
monolish::blas::nrm2
void nrm2(const vector< double > &x, double &ans)
nrm2: ||x||_2
Definition: vector_blas.cpp:1080
monolish::blas::axpy
void axpy(const double alpha, const vector< double > &x, vector< double > &y)
axpy: y = ax + y
Definition: vector_blas.cpp:595
monolish::vml::max
void max(const matrix::CRS< double > &A, const matrix::CRS< double > &B, matrix::CRS< double > &C)
Create a new CRS matrix with greatest elements of two matrices (C[0:nnz] = max(A[0:nnz],...
Definition: matrix_vml.cpp:382
MONOLISH_SOLVER_SUCCESS
#define MONOLISH_SOLVER_SUCCESS
Definition: monolish_common.hpp:10
monolish::standard_eigen::LOBPCG::monolish_LOBPCG
int monolish_LOBPCG(MATRIX &A, vector< Float > &lambda, matrix::Dense< Float > &x)
MONOLISH_SOLVER_MAXITER
#define MONOLISH_SOLVER_MAXITER
Definition: monolish_common.hpp:12
monolish::blas::copy
void copy(const matrix::Dense< double > &A, matrix::Dense< double > &C)
Dense matrix copy (C=A)
Definition: dense_copy.cpp:25
monolish::solver::initvec_scheme::RANDOM
@ RANDOM
monolish
Definition: monolish_matrix_blas.hpp:9
monolish::blas::matvec
void matvec(const matrix::Dense< double > &A, const vector< double > &x, vector< double > &y)
Dense matrix and vector multiplication: y = Ax.
Definition: matvec_blas.cpp:11
monolish::blas::dot
void dot(const vector< double > &x, const vector< double > &y, double &ans)
inner product (dot)
Definition: vector_blas.cpp:974
monolish::util::recv
auto recv(T &x)
recv. and free data from GPU
Definition: monolish_common.hpp:612
monolish::standard_eigen::LOBPCG::solve
int solve(MATRIX &A, vector< Float > &lambda, matrix::Dense< Float > &x)
calculate eigenvalues and eigenvectors of A by the LOBPCG method (lib=0: monolish)
monolish::util::random_vector
void random_vector(vector< T > &vec, const T min, const T max)
create random vector
Definition: monolish_common.hpp:252
monolish::blas::scal
void scal(const double alpha, vector< double > &x)
scal: x = alpha * x
Definition: vector_blas.cpp:1093
monolish::blas::axpyz
void axpyz(const double alpha, const vector< double > &x, const vector< double > &y, vector< double > &z)
axpyz: z = ax + y
Definition: vector_blas.cpp:666
monolish::vector
vector class
Definition: monolish_coo.hpp:25
monolish::blas::matmul
void matmul(const matrix::Dense< double > &A, const matrix::Dense< double > &B, matrix::Dense< double > &C)
Dense matrix multiplication: C = AB.
Definition: dense-dense_matmul.cpp:7
MONOLISH_SOLVER_RESIDUAL_NAN
#define MONOLISH_SOLVER_RESIDUAL_NAN
Definition: monolish_common.hpp:14
monolish::util::send
auto send(T &x)
send data to GPU
Definition: monolish_common.hpp:598
monolish::Logger::get_instance
static Logger & get_instance()
Definition: monolish_logger.hpp:42