monolish  0.14.0
MONOlithic LInear equation Solvers for Highly-parallel architecture
lobpcg_generalized.cpp
#include "../../include/monolish_blas.hpp"
#include "../../include/monolish_eigen.hpp"
#include "../../include/monolish_vml.hpp"
#include "../internal/lapack/monolish_lapack.hpp"
#include "../internal/monolish_internal.hpp"

namespace monolish {

template <typename MATRIX, typename T>
int generalized_eigen::LOBPCG<MATRIX, T>::monolish_LOBPCG(
    MATRIX &A, MATRIX &B, vector<T> &l, matrix::Dense<T> &xinout, int itype) {
  // LOBPCG supports only itype == 1 (Ax = lBx)
  assert(itype == 1);
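  // (itype mirrors the LAPACK sygvd convention: 1 means Ax = lBx,
  //  2 means ABx = lx, 3 means BAx = lx; only 1 is implemented here)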

  Logger &logger = Logger::get_instance();
  logger.solver_in(monolish_func);

  // size consistency check
  // the size of xinout is (m, A.get_col())
  if (A.get_col() != xinout.get_col()) {
    throw std::runtime_error("error, A.col != x.col");
  }
  if (A.get_col() != B.get_col() || A.get_row() != B.get_row()) {
    throw std::runtime_error("error, A shape != B shape");
  }
  if (l.size() != xinout.get_row()) {
    throw std::runtime_error("error, l.size != x.row");
  }

  this->precond.create_precond(A);

  // Algorithm following DOI:10.1007/978-3-319-69953-0_14
  // m is the number of eigenpairs to compute
  const std::size_t m = l.size();
  // n is the dimension of the original space
  const std::size_t n = A.get_col();

  // Internal memory
  // currently 9m+(9m+3m+4) vectors are used
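  // (wxp, WXP, and BWXP hold {w, x, p}, A{w, x, p}, and B{w, x, p} as 3m
  //  rows of length n each; the *_p copies keep the previous step, twxp is
  //  transpose scratch, and vtmp1, vtmp2, zero, and r are length-n work
  //  vectors)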
  matrix::Dense<T> wxp(3 * m, n);
  matrix::Dense<T> WXP(3 * m, n);
  matrix::Dense<T> BWXP(3 * m, n);
  // TODO: wxp_p, WXP_p, and BWXP_p are not needed when m=1
  matrix::Dense<T> wxp_p(3 * m, n);
  matrix::Dense<T> WXP_p(3 * m, n);
  matrix::Dense<T> BWXP_p(3 * m, n);
  // TODO: twxp is not needed when transpose matmul is supported
  matrix::Dense<T> twxp(n, 3 * m);
  // TODO: vtmp1 and vtmp2 are not needed when in-place preconditioner is used
  // and view1D is supported by preconditioners
  monolish::vector<T> vtmp1(n);
  monolish::vector<T> vtmp2(n);
  // TODO: zero is not needed when view1D supports fill(T val)
  monolish::vector<T> zero(n, 0.0);
  // TODO: r is not needed when util::random_vector supports view1D
  monolish::vector<T> r(n);

  // view1Ds for calculation
  std::vector<view1D<matrix::Dense<T>, T>> w;
  std::vector<view1D<matrix::Dense<T>, T>> x;
  std::vector<view1D<matrix::Dense<T>, T>> p;
  std::vector<view1D<matrix::Dense<T>, T>> W;
  std::vector<view1D<matrix::Dense<T>, T>> X;
  std::vector<view1D<matrix::Dense<T>, T>> P;
  std::vector<view1D<matrix::Dense<T>, T>> BW;
  std::vector<view1D<matrix::Dense<T>, T>> BX;
  std::vector<view1D<matrix::Dense<T>, T>> BP;
  std::vector<view1D<matrix::Dense<T>, T>> wp;
  std::vector<view1D<matrix::Dense<T>, T>> xp;
  std::vector<view1D<matrix::Dense<T>, T>> pp;
  std::vector<view1D<matrix::Dense<T>, T>> Wp;
  std::vector<view1D<matrix::Dense<T>, T>> Xp;
  std::vector<view1D<matrix::Dense<T>, T>> Pp;
  std::vector<view1D<matrix::Dense<T>, T>> BWp;
  std::vector<view1D<matrix::Dense<T>, T>> BXp;
  std::vector<view1D<matrix::Dense<T>, T>> BPp;
  for (std::size_t i = 0; i < m; ++i) {
    w.push_back(view1D<matrix::Dense<T>, T>(wxp, i * n, (i + 1) * n));
    x.push_back(view1D<matrix::Dense<T>, T>(wxp, (m + i) * n, (m + i + 1) * n));
    p.push_back(
        view1D<matrix::Dense<T>, T>(wxp, (2 * m + i) * n, (2 * m + i + 1) * n));
    W.push_back(view1D<matrix::Dense<T>, T>(WXP, i * n, (i + 1) * n));
    X.push_back(view1D<matrix::Dense<T>, T>(WXP, (m + i) * n, (m + i + 1) * n));
    P.push_back(
        view1D<matrix::Dense<T>, T>(WXP, (2 * m + i) * n, (2 * m + i + 1) * n));
    BW.push_back(view1D<matrix::Dense<T>, T>(BWXP, i * n, (i + 1) * n));
    BX.push_back(
        view1D<matrix::Dense<T>, T>(BWXP, (m + i) * n, (m + i + 1) * n));
    BP.push_back(view1D<matrix::Dense<T>, T>(BWXP, (2 * m + i) * n,
                                             (2 * m + i + 1) * n));
    wp.push_back(view1D<matrix::Dense<T>, T>(wxp_p, i * n, (i + 1) * n));
    xp.push_back(
        view1D<matrix::Dense<T>, T>(wxp_p, (m + i) * n, (m + i + 1) * n));
    pp.push_back(view1D<matrix::Dense<T>, T>(wxp_p, (2 * m + i) * n,
                                             (2 * m + i + 1) * n));
    Wp.push_back(view1D<matrix::Dense<T>, T>(WXP_p, i * n, (i + 1) * n));
    Xp.push_back(
        view1D<matrix::Dense<T>, T>(WXP_p, (m + i) * n, (m + i + 1) * n));
    Pp.push_back(view1D<matrix::Dense<T>, T>(WXP_p, (2 * m + i) * n,
                                             (2 * m + i + 1) * n));
    BWp.push_back(view1D<matrix::Dense<T>, T>(BWXP_p, i * n, (i + 1) * n));
    BXp.push_back(
        view1D<matrix::Dense<T>, T>(BWXP_p, (m + i) * n, (m + i + 1) * n));
    BPp.push_back(view1D<matrix::Dense<T>, T>(BWXP_p, (2 * m + i) * n,
                                              (2 * m + i + 1) * n));
  }
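  // Row blocks [0, m), [m, 2m), and [2m, 3m) of each 3m x n matrix hold the
  // w, x, and p vectors respectively; the view1Ds above are zero-copy row
  // slices of those blocks.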

  if (this->get_initvec_scheme() == monolish::solver::initvec_scheme::RANDOM) {
    // Prepare random initial vectors, each normalized to unit length
    for (std::size_t i = 0; i < m; ++i) {
      T minval = 0.0;
      T maxval = 1.0;
      util::random_vector(r, minval, maxval);
      blas::copy(r, x[i]);
      T norm;
      blas::nrm2(x[i], norm);
      blas::scal(1.0 / norm, x[i]);
    }
  } else { // initvec_scheme::USER
    // Copy user-supplied xinout to x
    for (std::size_t i = 0; i < m; ++i) {
      view1D<matrix::Dense<T>, T> xinout_i(xinout, i * n, (i + 1) * n);
      blas::copy(xinout_i, x[i]);
    }
  }

  if (A.get_device_mem_stat() == true) {
    monolish::util::send(wxp, WXP, BWXP, wxp_p, WXP_p, BWXP_p, twxp, vtmp1,
                         vtmp2, zero, xinout);
  }

  for (std::size_t i = 0; i < m; ++i) {
    // X = A x
    blas::matvec(A, x[i], X[i]);
    // BX = B x
    blas::matvec(B, x[i], BX[i]);
    // mu = (x, X) / (x, BX);
    T muA = blas::dot(x[i], X[i]);
    T muB = blas::dot(x[i], BX[i]);
    T mu = muA / muB;
    // w = X - mu BX, normalize
    blas::axpyz(-mu, BX[i], X[i], w[i]);
    T norm = blas::nrm2(w[i]);
    blas::scal(1.0 / norm, w[i]);
    // W = A w
    blas::matvec(A, w[i], W[i]);
    // BW = B w
    blas::matvec(B, w[i], BW[i]);
  }
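  // x now holds the normalized initial vectors and w the corresponding
  // normalized residuals; W = Aw and BW = Bw are cached so the projected
  // matrices in the iteration below need no extra matvecs.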

  // B singular flag
  bool is_singular = false;

  matrix::Dense<T> Sam(3 * m, 3 * m);
  matrix::Dense<T> Sbm(3 * m, 3 * m);
  vector<T> lambda(3 * m);
  if (A.get_device_mem_stat() == true) {
    monolish::util::send(Sam, Sbm, lambda);
  }

  for (std::size_t iter = 0; iter < this->get_maxiter(); iter++) {
    if (A.get_device_mem_stat() == true) {
      monolish::util::send(Sam, lambda);
    }
    if (iter == 0 || is_singular) {
      // It is intended not to resize the actual memory layout but just to
      // use the beginning part of it
      // (i.e. not touching {Sam,Sbm,wxp,twxp,WXP}.{val,nnz})
      Sam.set_col(2 * m);
      Sam.set_row(2 * m);
      Sbm.set_col(2 * m);
      Sbm.set_row(2 * m);
      wxp.set_row(2 * m);
      wxp_p.set_row(2 * m);
      twxp.set_col(2 * m);
      WXP.set_row(2 * m);
      WXP_p.set_row(2 * m);
      BWXP.set_row(2 * m);
      BWXP_p.set_row(2 * m);
    }
    if (A.get_device_mem_stat() == true) {
      wxp.nonfree_recv();
      twxp.device_free();
    }
    twxp.transpose(wxp);
    if (A.get_device_mem_stat() == true) {
      twxp.send();
    }
    // Sa = { w, x, p }^T { W, X, P }
    //    = { w, x, p }^T A { w, x, p }
    blas::matmul(WXP, twxp, Sam);
    // Sb = { w, x, p }^T { BW, BX, BP }
    blas::matmul(BWXP, twxp, Sbm);
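    // (Rayleigh-Ritz step: Sa and Sb are the projections of A and B onto
    //  span{w, x, p}, so the small 3m x 3m problem below yields Ritz
    //  approximations to the wanted eigenpairs)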

    if (A.get_device_mem_stat() == true) {
      Sam.nonfree_recv();
      Sbm.nonfree_recv();
    }
    // Generalized Eigendecomposition
    //   Sa v = lambda Sb v
    const char jobz = 'V';
    const char uplo = 'L';
    int info = internal::lapack::sygvd(Sam, Sbm, lambda, 1, &jobz, &uplo);
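    // (per LAPACK xSYGVD: info == 0 means success, 0 < info <= N means the
    //  algorithm failed to converge, and info > N means the leading minor of
    //  order info - N of Sb is not positive definite, where N is the matrix
    //  order: 3m here, or 2m on the restart path)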
    // 5m < info <= 6m means a leading minor reaching into the third (P)
    // block of Sb is not positive definite, the same situation as in step 0,
    // so we set the is_singular flag and restart the iteration step.
    int lbound = 5 * m;
    int ubound = 6 * m;
    if (iter == 0 || is_singular) {
      lbound = 3 * m;
      ubound = 4 * m;
    }
    if (lbound < info && info <= ubound) {
      if (this->get_print_rhistory()) {
        *this->rhistory_stream << iter + 1 << "\t"
                               << "singular; restart the step" << std::endl;
      }
      is_singular = true;
      iter--;
      continue;
    } else if (info != 0 && info < static_cast<int>(m)) {
      std::string s = "sygvd returns ";
      s += std::to_string(info);
      s += ": internal LAPACK sygvd returned error";
      throw std::runtime_error(s);
    }
    if (A.get_device_mem_stat() == true) {
      monolish::util::recv(Sam, lambda);
    }

    // prepare previous step results
    blas::copy(wxp, wxp_p);
    blas::copy(WXP, WXP_p);
    blas::copy(BWXP, BWXP_p);
    vector<T> residual(m);
    for (std::size_t i = 0; i < m; ++i) {
      // copy current eigenvalue results
      l[i] = lambda[i];

      // extract eigenvector of Sa v = lambda Sb v
      vector<T> b(3 * m);
      if (iter == 0 || is_singular) {
        b.resize(2 * m);
      }
      Sam.row(i, b);

      if (iter == 0 || is_singular) {
        b.resize(3 * m);
        // b[2m]...b[3m-1] are not calculated, so explicitly set them to 0
        for (std::size_t j = 2 * m; j < 3 * m; ++j) {
          b[j] = 0.0;
        }
      }
      blas::copy(zero, p[i]);
      blas::copy(zero, x[i]);
      blas::copy(zero, P[i]);
      blas::copy(zero, X[i]);
      blas::copy(zero, BP[i]);
      blas::copy(zero, BX[i]);

      for (std::size_t j = 0; j < m; ++j) {
        // x[i] = \Sum_j b[j] w[j] + b[m+j] x[j] + b[2m+j] p[j], normalize
        // p[i] = \Sum_j b[j] w[j] + b[2m+j] p[j], normalize
        blas::axpy(b[j], wp[j], p[i]);
        blas::axpy(b[2 * m + j], pp[j], p[i]);
        blas::axpy(b[j], wp[j], x[i]);
        blas::axpy(b[2 * m + j], pp[j], x[i]);
        blas::axpy(b[m + j], xp[j], x[i]);

        // X[i] = \Sum_j b[j]W[j] + b[m+j]X[j] + b[2m+j]P[j], normalize with x
        // P[i] = \Sum_j b[j]W[j] + b[2m+j]P[j], normalize with p
        blas::axpy(b[j], Wp[j], P[i]);
        blas::axpy(b[2 * m + j], Pp[j], P[i]);
        blas::axpy(b[j], Wp[j], X[i]);
        blas::axpy(b[2 * m + j], Pp[j], X[i]);
        blas::axpy(b[m + j], Xp[j], X[i]);

        // BX[i] = \Sum_j b[j]BW[j] + b[m+j]BX[j] + b[2m+j]BP[j], normalize
        // with x
        // BP[i] = \Sum_j b[j]BW[j] + b[2m+j]BP[j], normalize with p
        blas::axpy(b[j], BWp[j], BP[i]);
        blas::axpy(b[2 * m + j], BPp[j], BP[i]);
        blas::axpy(b[j], BWp[j], BX[i]);
        blas::axpy(b[2 * m + j], BPp[j], BX[i]);
        blas::axpy(b[m + j], BXp[j], BX[i]);
      }
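      // (b is one eigenvector of the projected problem, laid out as
      //  [b_w | b_x | b_p]; the loops above assemble the new x[i] and p[i]
      //  from the previous step's blocks and update their A- and B-images
      //  with the same coefficients, so no extra matvec is needed here)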

      T normp;
      blas::nrm2(p[i], normp);
      blas::scal(1.0 / normp, p[i]);
      blas::scal(1.0 / normp, P[i]);
      blas::scal(1.0 / normp, BP[i]);
      T normx;
      blas::nrm2(x[i], normx);
      blas::scal(1.0 / normx, x[i]);
      blas::scal(1.0 / normx, X[i]);
      blas::scal(1.0 / normx, BX[i]);

      // w[i] = X[i] - lambda BX[i]
      blas::axpyz(-l[i], BX[i], X[i], w[i]);
      // apply preconditioner
      blas::copy(w[i], vtmp2);
      this->precond.apply_precond(vtmp2, vtmp1);
      blas::copy(vtmp1, w[i]);

      // residual calculation
      blas::nrm2(w[i], residual[i]);
      if (this->get_print_rhistory()) {
        *this->rhistory_stream << iter + 1 << "\t" << i << "\t"
                               << std::scientific << residual[i] << std::endl;
      }
      if (std::isnan(residual[i])) {
        for (std::size_t j = 0; j < m; ++j) {
          view1D<matrix::Dense<T>, T> xinout_j(xinout, j * n, (j + 1) * n);
          blas::copy(x[j], xinout_j);
        }
        logger.solver_out();
        return MONOLISH_SOLVER_RESIDUAL_NAN;
      }
      // normalize w
      blas::scal(1.0 / residual[i], w[i]);
      // W = A w
      blas::matvec(A, w[i], W[i]);
      // BW = B w
      blas::matvec(B, w[i], BW[i]);
    }

    // early return when the residual is small enough
    if (vml::max(residual) < this->get_tol() &&
        this->get_miniter() < iter + 1) {
      for (std::size_t i = 0; i < m; ++i) {
        view1D<matrix::Dense<T>, T> xinout_i(xinout, i * n, (i + 1) * n);
        blas::copy(x[i], xinout_i);
      }
      logger.solver_out();
      return MONOLISH_SOLVER_SUCCESS;
    }

    // reset the is_singular flag
    if (iter == 0 || is_singular) {
      is_singular = false;
      Sam.set_row(3 * m);
      Sam.set_col(3 * m);
      Sbm.set_row(3 * m);
      Sbm.set_col(3 * m);
      wxp.set_row(3 * m);
      wxp_p.set_row(3 * m);
      twxp.set_col(3 * m);
      WXP.set_row(3 * m);
      WXP_p.set_row(3 * m);
      BWXP.set_row(3 * m);
      BWXP_p.set_row(3 * m);
    }
  }
  for (std::size_t i = 0; i < m; ++i) {
    view1D<matrix::Dense<T>, T> xinout_i(xinout, i * n, (i + 1) * n);
    blas::copy(x[i], xinout_i);
  }
  logger.solver_out();
  return MONOLISH_SOLVER_MAXITER;
}

template int
generalized_eigen::LOBPCG<matrix::Dense<double>, double>::monolish_LOBPCG(
    matrix::Dense<double> &A, matrix::Dense<double> &B, vector<double> &l,
    matrix::Dense<double> &x, int itype);
template int
generalized_eigen::LOBPCG<matrix::Dense<float>, float>::monolish_LOBPCG(
    matrix::Dense<float> &A, matrix::Dense<float> &B, vector<float> &l,
    matrix::Dense<float> &x, int itype);
template int
generalized_eigen::LOBPCG<matrix::CRS<double>, double>::monolish_LOBPCG(
    matrix::CRS<double> &A, matrix::CRS<double> &B, vector<double> &l,
    matrix::Dense<double> &x, int itype);
template int
generalized_eigen::LOBPCG<matrix::CRS<float>, float>::monolish_LOBPCG(
    matrix::CRS<float> &A, matrix::CRS<float> &B, vector<float> &l,
    matrix::Dense<float> &x, int itype);
template int generalized_eigen::LOBPCG<matrix::LinearOperator<double>, double>::
    monolish_LOBPCG(matrix::LinearOperator<double> &A,
                    matrix::LinearOperator<double> &B, vector<double> &l,
                    matrix::Dense<double> &x, int itype);
template int generalized_eigen::LOBPCG<matrix::LinearOperator<float>, float>::
    monolish_LOBPCG(matrix::LinearOperator<float> &A,
                    matrix::LinearOperator<float> &B, vector<float> &l,
                    matrix::Dense<float> &x, int itype);

template <typename MATRIX, typename T>
int generalized_eigen::LOBPCG<MATRIX, T>::solve(MATRIX &A, MATRIX &B,
                                                vector<T> &l,
                                                matrix::Dense<T> &x,
                                                int itype) {
  Logger &logger = Logger::get_instance();
  logger.solver_in(monolish_func);

  int ret = 0;
  if (this->get_lib() == 0) {
    ret = monolish_LOBPCG(A, B, l, x, itype);
  }

  logger.solver_out();
  return ret; // err code
}

template int generalized_eigen::LOBPCG<matrix::Dense<double>, double>::solve(
    matrix::Dense<double> &A, matrix::Dense<double> &B, vector<double> &l,
    matrix::Dense<double> &x, int itype);
template int generalized_eigen::LOBPCG<matrix::Dense<float>, float>::solve(
    matrix::Dense<float> &A, matrix::Dense<float> &B, vector<float> &l,
    matrix::Dense<float> &x, int itype);
template int generalized_eigen::LOBPCG<matrix::CRS<double>, double>::solve(
    matrix::CRS<double> &A, matrix::CRS<double> &B, vector<double> &l,
    matrix::Dense<double> &x, int itype);
template int generalized_eigen::LOBPCG<matrix::CRS<float>, float>::solve(
    matrix::CRS<float> &A, matrix::CRS<float> &B, vector<float> &l,
    matrix::Dense<float> &x, int itype);
template int
generalized_eigen::LOBPCG<matrix::LinearOperator<double>, double>::solve(
    matrix::LinearOperator<double> &A, matrix::LinearOperator<double> &B,
    vector<double> &l, matrix::Dense<double> &x, int itype);
template int
generalized_eigen::LOBPCG<matrix::LinearOperator<float>, float>::solve(
    matrix::LinearOperator<float> &A, matrix::LinearOperator<float> &B,
    vector<float> &l, matrix::Dense<float> &x, int itype);

} // namespace monolish
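
For reference, a minimal sketch of how this solver might be driven from user code. The driver function and matrix arguments below are illustrative and not part of this file; the solver calls follow the interface used above, and set_tol/set_maxiter are assumed to be the setters matching the get_tol/get_maxiter accessors seen in the iteration loop.

#include "monolish_eigen.hpp"

// Illustrative driver: compute the two smallest eigenpairs of Ax = lBx,
// with B assumed symmetric positive definite.
int example_driver(monolish::matrix::CRS<double> &A,
                   monolish::matrix::CRS<double> &B) {
  const std::size_t m = 2;                           // wanted eigenpairs
  monolish::vector<double> lambda(m);                // eigenvalues (output)
  monolish::matrix::Dense<double> x(m, A.get_col()); // one eigenvector per row

  monolish::generalized_eigen::LOBPCG<monolish::matrix::CRS<double>, double>
      solver;
  solver.set_tol(1.0e-6);
  solver.set_maxiter(1000);

  // itype = 1 requests Ax = lBx, the only mode this file supports
  return solver.solve(A, B, lambda, x, 1);
}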