Line data Source code
1 : /*
2 : - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
3 : SLEPc - Scalable Library for Eigenvalue Problem Computations
4 : Copyright (c) 2002-, Universitat Politecnica de Valencia, Spain
5 :
6 : This file is part of SLEPc.
7 : SLEPc is distributed under a 2-clause BSD license (see LICENSE).
8 : - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
9 : */
10 : /*
11 : BV private kernels that use the LAPACK
12 : */
13 :
14 : #include <slepc/private/bvimpl.h>
15 : #include <slepcblaslapack.h>
16 :
17 : /*
18 : Reduction operation to compute sqrt(x**2+y**2) when normalizing vectors
19 : */
20 2241 : SLEPC_EXTERN void MPIAPI SlepcPythag(void *in,void *inout,PetscMPIInt *len,MPI_Datatype *datatype)
21 : {
22 2241 : PetscBLASInt i,n=*len;
23 2241 : PetscReal *x = (PetscReal*)in,*y = (PetscReal*)inout;
24 :
25 2241 : PetscFunctionBegin;
26 2241 : if (PetscUnlikely(*datatype!=MPIU_REAL)) {
27 0 : (void)(*PetscErrorPrintf)("Only implemented for MPIU_REAL data type");
28 0 : MPI_Abort(PETSC_COMM_WORLD,1);
29 : }
30 4760 : for (i=0;i<n;i++) y[i] = SlepcAbs(x[i],y[i]);
31 2241 : PetscFunctionReturnVoid();
32 : }
33 :
34 : /*
35 : Compute ||A|| for an mxn matrix
36 : */
37 11053 : PetscErrorCode BVNorm_LAPACK_Private(BV bv,PetscInt m_,PetscInt n_,const PetscScalar *A,PetscInt lda_,NormType type,PetscReal *nrm,PetscBool mpi)
38 : {
39 11053 : PetscBLASInt m,n,lda,i,j;
40 11053 : PetscMPIInt len;
41 11053 : PetscReal lnrm,*rwork=NULL,*rwork2=NULL;
42 :
43 11053 : PetscFunctionBegin;
44 11053 : PetscCall(PetscFPTrapPush(PETSC_FP_TRAP_OFF));
45 11053 : PetscCall(PetscBLASIntCast(m_,&m));
46 11053 : PetscCall(PetscBLASIntCast(n_,&n));
47 11053 : PetscCall(PetscBLASIntCast(lda_,&lda));
48 11053 : if (type==NORM_FROBENIUS || type==NORM_2) {
49 11045 : lnrm = LAPACKlange_("F",&m,&n,(PetscScalar*)A,&lda,rwork);
50 14239 : if (mpi) PetscCallMPI(MPIU_Allreduce(&lnrm,nrm,1,MPIU_REAL,MPIU_LAPY2,PetscObjectComm((PetscObject)bv)));
51 9448 : else *nrm = lnrm;
52 11045 : PetscCall(PetscLogFlops(2.0*m*n));
53 8 : } else if (type==NORM_1) {
54 8 : if (mpi) {
55 0 : PetscCall(BVAllocateWork_Private(bv,2*n_));
56 0 : rwork = (PetscReal*)bv->work;
57 0 : rwork2 = rwork+n_;
58 0 : PetscCall(PetscArrayzero(rwork,n_));
59 0 : PetscCall(PetscArrayzero(rwork2,n_));
60 0 : for (j=0;j<n_;j++) {
61 0 : for (i=0;i<m_;i++) {
62 0 : rwork[j] += PetscAbsScalar(A[i+j*lda_]);
63 : }
64 : }
65 0 : PetscCall(PetscMPIIntCast(n_,&len));
66 0 : PetscCallMPI(MPIU_Allreduce(rwork,rwork2,len,MPIU_REAL,MPIU_SUM,PetscObjectComm((PetscObject)bv)));
67 0 : *nrm = 0.0;
68 0 : for (j=0;j<n_;j++) if (rwork2[j] > *nrm) *nrm = rwork2[j];
69 : } else {
70 8 : *nrm = LAPACKlange_("O",&m,&n,(PetscScalar*)A,&lda,rwork);
71 : }
72 8 : PetscCall(PetscLogFlops(1.0*m*n));
73 0 : } else if (type==NORM_INFINITY) {
74 0 : PetscCall(BVAllocateWork_Private(bv,m_));
75 0 : rwork = (PetscReal*)bv->work;
76 0 : lnrm = LAPACKlange_("I",&m,&n,(PetscScalar*)A,&lda,rwork);
77 0 : if (mpi) PetscCallMPI(MPIU_Allreduce(&lnrm,nrm,1,MPIU_REAL,MPIU_MAX,PetscObjectComm((PetscObject)bv)));
78 0 : else *nrm = lnrm;
79 0 : PetscCall(PetscLogFlops(1.0*m*n));
80 : }
81 11053 : PetscCall(PetscFPTrapPop());
82 11053 : PetscFunctionReturn(PETSC_SUCCESS);
83 : }
84 :
85 : /*
86 : Normalize the columns of an mxn matrix A
87 : */
88 3894 : PetscErrorCode BVNormalize_LAPACK_Private(BV bv,PetscInt m_,PetscInt n_,const PetscScalar *A,PetscInt lda_,PetscScalar *eigi,PetscBool mpi)
89 : {
90 3894 : PetscBLASInt m,lda,j,k,info,zero=0;
91 3894 : PetscMPIInt len;
92 3894 : PetscReal *norms,*rwork=NULL,*rwork2=NULL,done=1.0;
93 :
94 3894 : PetscFunctionBegin;
95 3894 : PetscCall(PetscFPTrapPush(PETSC_FP_TRAP_OFF));
96 3894 : PetscCall(PetscBLASIntCast(m_,&m));
97 3894 : PetscCall(PetscBLASIntCast(lda_,&lda));
98 3894 : PetscCall(BVAllocateWork_Private(bv,2*n_));
99 3894 : rwork = (PetscReal*)bv->work;
100 3894 : rwork2 = rwork+n_;
101 : /* compute local norms */
102 43323 : for (j=0;j<n_;j++) {
103 39429 : k = 1;
104 : #if !defined(PETSC_USE_COMPLEX)
105 : if (eigi && eigi[j] != 0.0) k = 2;
106 : #endif
107 39429 : rwork[j] = LAPACKlange_("F",&m,&k,(PetscScalar*)(A+j*lda_),&lda,rwork2);
108 39429 : if (k==2) { rwork[j+1] = rwork[j]; j++; }
109 : }
110 : /* reduction to get global norms */
111 3894 : if (mpi) {
112 84 : PetscCall(PetscMPIIntCast(n_,&len));
113 84 : PetscCall(PetscArrayzero(rwork2,n_));
114 252 : PetscCallMPI(MPIU_Allreduce(rwork,rwork2,len,MPIU_REAL,MPIU_LAPY2,PetscObjectComm((PetscObject)bv)));
115 84 : norms = rwork2;
116 : } else norms = rwork;
117 : /* scale columns */
118 43323 : for (j=0;j<n_;j++) {
119 39429 : k = 1;
120 : #if !defined(PETSC_USE_COMPLEX)
121 : if (eigi && eigi[j] != 0.0) k = 2;
122 : #endif
123 39429 : PetscCallBLAS("LAPACKlascl",LAPACKlascl_("G",&zero,&zero,norms+j,&done,&m,&k,(PetscScalar*)(A+j*lda_),&lda,&info));
124 39429 : SlepcCheckLapackInfo("lascl",info);
125 39429 : if (k==2) j++;
126 : }
127 3894 : PetscCall(PetscLogFlops(3.0*m*n_));
128 3894 : PetscCall(PetscFPTrapPop());
129 3894 : PetscFunctionReturn(PETSC_SUCCESS);
130 : }
131 :
132 : /*
133 : Compute the upper Cholesky factor in R and its inverse in S.
134 : If S == R then the inverse overwrites the Cholesky factor.
135 : */
136 105 : PetscErrorCode BVMatCholInv_LAPACK_Private(BV bv,Mat R,Mat S)
137 : {
138 105 : PetscInt i,k,l,n,m,ld,lds;
139 105 : PetscScalar *pR,*pS;
140 105 : PetscBLASInt info,n_ = 0,m_ = 0,ld_,lds_;
141 :
142 105 : PetscFunctionBegin;
143 105 : l = bv->l;
144 105 : k = bv->k;
145 105 : PetscCall(MatGetSize(R,&m,NULL));
146 105 : n = k-l;
147 105 : PetscCall(PetscBLASIntCast(m,&m_));
148 105 : PetscCall(PetscBLASIntCast(n,&n_));
149 105 : ld = m;
150 105 : ld_ = m_;
151 105 : PetscCall(MatDenseGetArray(R,&pR));
152 :
153 105 : if (S==R) {
154 73 : PetscCall(BVAllocateWork_Private(bv,m*k));
155 73 : pS = bv->work;
156 73 : lds = ld;
157 73 : lds_ = ld_;
158 : } else {
159 32 : PetscCall(MatDenseGetArray(S,&pS));
160 32 : PetscCall(MatGetSize(S,&lds,NULL));
161 32 : PetscCall(PetscBLASIntCast(lds,&lds_));
162 : }
163 :
164 : /* save a copy of matrix in S */
165 1017 : for (i=l;i<k;i++) PetscCall(PetscArraycpy(pS+i*lds+l,pR+i*ld+l,n));
166 :
167 : /* compute upper Cholesky factor in R */
168 105 : PetscCall(PetscFPTrapPush(PETSC_FP_TRAP_OFF));
169 105 : PetscCallBLAS("LAPACKpotrf",LAPACKpotrf_("U",&n_,pR+l*ld+l,&ld_,&info));
170 105 : PetscCall(PetscLogFlops((1.0*n*n*n)/3.0));
171 :
172 105 : if (info) { /* LAPACKpotrf failed, retry on diagonally perturbed matrix */
173 0 : for (i=l;i<k;i++) {
174 0 : PetscCall(PetscArraycpy(pR+i*ld+l,pS+i*lds+l,n));
175 0 : pR[i+i*ld] += 50.0*PETSC_MACHINE_EPSILON;
176 : }
177 0 : PetscCallBLAS("LAPACKpotrf",LAPACKpotrf_("U",&n_,pR+l*ld+l,&ld_,&info));
178 0 : SlepcCheckLapackInfo("potrf",info);
179 0 : PetscCall(PetscLogFlops((1.0*n*n*n)/3.0));
180 : }
181 :
182 : /* compute S = inv(R) */
183 105 : if (S==R) {
184 73 : PetscCallBLAS("LAPACKtrtri",LAPACKtrtri_("U","N",&n_,pR+l*ld+l,&ld_,&info));
185 : } else {
186 32 : PetscCall(PetscArrayzero(pS+l*lds,(k-l)*k));
187 160 : for (i=l;i<k;i++) PetscCall(PetscArraycpy(pS+i*lds+l,pR+i*ld+l,n));
188 32 : PetscCallBLAS("LAPACKtrtri",LAPACKtrtri_("U","N",&n_,pS+l*lds+l,&lds_,&info));
189 : }
190 105 : SlepcCheckLapackInfo("trtri",info);
191 105 : PetscCall(PetscFPTrapPop());
192 105 : PetscCall(PetscLogFlops(0.33*n*n*n));
193 :
194 : /* Zero out entries below the diagonal */
195 912 : for (i=l;i<k-1;i++) {
196 807 : PetscCall(PetscArrayzero(pR+i*ld+i+1,(k-i-1)));
197 807 : if (S!=R) PetscCall(PetscArrayzero(pS+i*lds+i+1,(k-i-1)));
198 : }
199 105 : PetscCall(MatDenseRestoreArray(R,&pR));
200 105 : if (S!=R) PetscCall(MatDenseRestoreArray(S,&pS));
201 105 : PetscFunctionReturn(PETSC_SUCCESS);
202 : }
203 :
204 : /*
205 : Compute the inverse of an upper triangular matrix R, store it in S.
206 : If S == R then the inverse overwrites R.
207 : */
208 73 : PetscErrorCode BVMatTriInv_LAPACK_Private(BV bv,Mat R,Mat S)
209 : {
210 73 : PetscInt i,k,l,n,m,ld,lds;
211 73 : PetscScalar *pR,*pS;
212 73 : PetscBLASInt info,n_,m_ = 0,ld_,lds_;
213 :
214 73 : PetscFunctionBegin;
215 73 : l = bv->l;
216 73 : k = bv->k;
217 73 : PetscCall(MatGetSize(R,&m,NULL));
218 73 : n = k-l;
219 73 : PetscCall(PetscBLASIntCast(m,&m_));
220 73 : PetscCall(PetscBLASIntCast(n,&n_));
221 73 : ld = m;
222 73 : ld_ = m_;
223 73 : PetscCall(MatDenseGetArray(R,&pR));
224 :
225 73 : if (S==R) {
226 57 : PetscCall(BVAllocateWork_Private(bv,m*k));
227 57 : pS = bv->work;
228 57 : lds = ld;
229 57 : lds_ = ld_;
230 : } else {
231 16 : PetscCall(MatDenseGetArray(S,&pS));
232 16 : PetscCall(MatGetSize(S,&lds,NULL));
233 16 : PetscCall(PetscBLASIntCast(lds,&lds_));
234 : }
235 :
236 : /* compute S = inv(R) */
237 73 : PetscCall(PetscFPTrapPush(PETSC_FP_TRAP_OFF));
238 73 : if (S==R) {
239 57 : PetscCallBLAS("LAPACKtrtri",LAPACKtrtri_("U","N",&n_,pR+l*ld+l,&ld_,&info));
240 : } else {
241 16 : PetscCall(PetscArrayzero(pS+l*lds,(k-l)*k));
242 80 : for (i=l;i<k;i++) PetscCall(PetscArraycpy(pS+i*lds+l,pR+i*ld+l,n));
243 16 : PetscCallBLAS("LAPACKtrtri",LAPACKtrtri_("U","N",&n_,pS+l*lds+l,&lds_,&info));
244 : }
245 73 : SlepcCheckLapackInfo("trtri",info);
246 73 : PetscCall(PetscFPTrapPop());
247 73 : PetscCall(PetscLogFlops(0.33*n*n*n));
248 :
249 73 : PetscCall(MatDenseRestoreArray(R,&pR));
250 73 : if (S!=R) PetscCall(MatDenseRestoreArray(S,&pS));
251 73 : PetscFunctionReturn(PETSC_SUCCESS);
252 : }
253 :
254 : /*
255 : Compute the matrix to be used for post-multiplying the basis in the SVQB
256 : block orthogonalization method.
257 : On input R = V'*V, on output S = D*U*Lambda^{-1/2} where (U,Lambda) is
258 : the eigendecomposition of D*R*D with D=diag(R)^{-1/2}.
259 : If S == R then the result overwrites R.
260 : */
261 105 : PetscErrorCode BVMatSVQB_LAPACK_Private(BV bv,Mat R,Mat S)
262 : {
263 105 : PetscInt i,j,k,l,n,m,ld,lds;
264 105 : PetscScalar *pR,*pS,*D,*work,a;
265 105 : PetscReal *eig,dummy;
266 105 : PetscBLASInt info,lwork,n_,m_ = 0,ld_,lds_;
267 : #if defined(PETSC_USE_COMPLEX)
268 105 : PetscReal *rwork,rdummy;
269 : #endif
270 :
271 105 : PetscFunctionBegin;
272 105 : l = bv->l;
273 105 : k = bv->k;
274 105 : PetscCall(MatGetSize(R,&m,NULL));
275 105 : PetscCall(MatDenseGetLDA(R,&ld));
276 105 : n = k-l;
277 105 : PetscCall(PetscBLASIntCast(m,&m_));
278 105 : PetscCall(PetscBLASIntCast(n,&n_));
279 105 : ld_ = m_;
280 105 : PetscCall(MatDenseGetArray(R,&pR));
281 :
282 105 : if (S==R) {
283 73 : pS = pR;
284 73 : lds = ld;
285 73 : lds_ = ld_;
286 : } else {
287 32 : PetscCall(MatDenseGetArray(S,&pS));
288 32 : PetscCall(MatDenseGetLDA(S,&lds));
289 32 : PetscCall(PetscBLASIntCast(lds,&lds_));
290 : }
291 105 : PetscCall(PetscFPTrapPush(PETSC_FP_TRAP_OFF));
292 :
293 : /* workspace query and memory allocation */
294 105 : lwork = -1;
295 : #if defined(PETSC_USE_COMPLEX)
296 105 : PetscCallBLAS("LAPACKsyev",LAPACKsyev_("V","L",&n_,pS,&lds_,&dummy,&a,&lwork,&rdummy,&info));
297 105 : PetscCall(PetscBLASIntCast((PetscInt)PetscRealPart(a),&lwork));
298 105 : PetscCall(PetscMalloc4(n,&eig,n,&D,lwork,&work,PetscMax(1,3*n-2),&rwork));
299 : #else
300 : PetscCallBLAS("LAPACKsyev",LAPACKsyev_("V","L",&n_,pS,&lds_,&dummy,&a,&lwork,&info));
301 : PetscCall(PetscBLASIntCast((PetscInt)a,&lwork));
302 : PetscCall(PetscMalloc3(n,&eig,n,&D,lwork,&work));
303 : #endif
304 :
305 : /* copy and scale matrix */
306 1017 : for (i=l;i<k;i++) D[i-l] = 1.0/PetscSqrtReal(PetscRealPart(pR[i+i*ld]));
307 12793 : for (i=l;i<k;i++) for (j=l;j<k;j++) pS[i+j*lds] = pR[i+j*ld]*D[i-l];
308 12793 : for (j=l;j<k;j++) for (i=l;i<k;i++) pS[i+j*lds] *= D[j-l];
309 :
310 : /* compute eigendecomposition */
311 : #if defined(PETSC_USE_COMPLEX)
312 105 : PetscCallBLAS("LAPACKsyev",LAPACKsyev_("V","L",&n_,pS+l*lds+l,&lds_,eig,work,&lwork,rwork,&info));
313 : #else
314 : PetscCallBLAS("LAPACKsyev",LAPACKsyev_("V","L",&n_,pS+l*lds+l,&lds_,eig,work,&lwork,&info));
315 : #endif
316 105 : SlepcCheckLapackInfo("syev",info);
317 :
318 105 : if (S!=R) { /* R = U' */
319 800 : for (i=l;i<k;i++) for (j=l;j<k;j++) pR[i+j*ld] = pS[j+i*lds];
320 : }
321 :
322 : /* compute S = D*U*Lambda^{-1/2} */
323 12793 : for (i=l;i<k;i++) for (j=l;j<k;j++) pS[i+j*lds] *= D[i-l];
324 12793 : for (j=l;j<k;j++) for (i=l;i<k;i++) pS[i+j*lds] /= PetscSqrtReal(eig[j-l]);
325 :
326 105 : if (S!=R) { /* compute R = inv(S) = Lambda^{1/2}*U'/D */
327 800 : for (i=l;i<k;i++) for (j=l;j<k;j++) pR[i+j*ld] *= PetscSqrtReal(eig[i-l]);
328 800 : for (j=l;j<k;j++) for (i=l;i<k;i++) pR[i+j*ld] /= D[j-l];
329 : }
330 :
331 : #if defined(PETSC_USE_COMPLEX)
332 105 : PetscCall(PetscFree4(eig,D,work,rwork));
333 : #else
334 : PetscCall(PetscFree3(eig,D,work));
335 : #endif
336 105 : PetscCall(PetscLogFlops(9.0*n*n*n));
337 105 : PetscCall(PetscFPTrapPop());
338 :
339 105 : PetscCall(MatDenseRestoreArray(R,&pR));
340 105 : if (S!=R) PetscCall(MatDenseRestoreArray(S,&pS));
341 105 : PetscFunctionReturn(PETSC_SUCCESS);
342 : }
343 :
344 : /*
345 : QR factorization of an mxn matrix via parallel TSQR
346 : */
347 187 : PetscErrorCode BVOrthogonalize_LAPACK_TSQR(BV bv,PetscInt m_,PetscInt n_,PetscScalar *Q,PetscInt ldq_,PetscScalar *R,PetscInt ldr)
348 : {
349 187 : PetscInt level,plevel,nlevels,lda,worklen;
350 187 : PetscBLASInt m,n,ldq,i,j,k,l,nb,sz,lwork,info;
351 187 : PetscScalar *tau,*work,*A=NULL,*QQ=NULL,*Qhalf,*C=NULL,one=1.0,zero=0.0;
352 187 : PetscMPIInt rank,size,count,stride,powtwo,s = 0;
353 187 : MPI_Datatype tmat;
354 :
355 187 : PetscFunctionBegin;
356 187 : PetscCall(PetscFPTrapPush(PETSC_FP_TRAP_OFF));
357 187 : PetscCall(PetscBLASIntCast(m_,&m));
358 187 : PetscCall(PetscBLASIntCast(n_,&n));
359 187 : PetscCall(PetscBLASIntCast(ldq_,&ldq));
360 187 : k = PetscMin(m,n);
361 187 : nb = 16;
362 187 : lda = 2*n;
363 187 : PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)bv),&size));
364 187 : PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)bv),&rank));
365 187 : nlevels = (PetscInt)PetscCeilReal(PetscLog2Real((PetscReal)size));
366 374 : PetscCall(PetscMPIIntCast(PetscPowInt(2,(PetscInt)PetscFloorReal(PetscLog2Real((PetscReal)size))),&powtwo));
367 187 : worklen = n+n*nb;
368 187 : if (nlevels) worklen += n*lda+n*lda*nlevels+n*lda;
369 187 : PetscCall(BVAllocateWork_Private(bv,worklen));
370 187 : tau = bv->work;
371 187 : work = bv->work+n;
372 187 : PetscCall(PetscBLASIntCast(n*nb,&lwork));
373 187 : if (nlevels) {
374 116 : A = bv->work+n+n*nb;
375 116 : QQ = bv->work+n+n*nb+n*lda;
376 116 : C = bv->work+n+n*nb+n*lda+n*lda*nlevels;
377 : }
378 :
379 : /* Compute QR */
380 187 : PetscCallBLAS("LAPACKgeqrf",LAPACKgeqrf_(&m,&n,Q,&ldq,tau,work,&lwork,&info));
381 187 : SlepcCheckLapackInfo("geqrf",info);
382 :
383 : /* Extract R */
384 187 : if (R || nlevels) {
385 2121 : for (j=0;j<n;j++) {
386 18981 : for (i=0;i<=PetscMin(j,m-1);i++) {
387 17047 : if (nlevels) A[i+j*lda] = Q[i+j*ldq];
388 15391 : else R[i+j*ldr] = Q[i+j*ldq];
389 : }
390 17415 : for (i=PetscMin(j,m-1)+1;i<n;i++) {
391 15481 : if (nlevels) A[i+j*lda] = 0.0;
392 14005 : else R[i+j*ldr] = 0.0;
393 : }
394 : }
395 : }
396 :
397 : /* Compute orthogonal matrix in Q */
398 187 : PetscCallBLAS("LAPACKorgqr",LAPACKorgqr_(&m,&k,&k,Q,&ldq,tau,work,&lwork,&info));
399 187 : SlepcCheckLapackInfo("orgqr",info);
400 :
401 187 : if (nlevels) {
402 :
403 116 : PetscCall(PetscMPIIntCast(n,&count));
404 116 : PetscCall(PetscMPIIntCast(lda,&stride));
405 116 : PetscCall(PetscBLASIntCast(lda,&l));
406 116 : PetscCallMPI(MPI_Type_vector(count,count,stride,MPIU_SCALAR,&tmat));
407 116 : PetscCallMPI(MPI_Type_commit(&tmat));
408 :
409 400 : for (level=nlevels;level>=1;level--) {
410 :
411 284 : plevel = PetscPowInt(2,level);
412 568 : PetscCall(PetscMPIIntCast(plevel*PetscFloorReal(rank/(PetscReal)plevel)+(rank+PetscPowInt(2,level-1))%plevel,&s));
413 :
414 : /* Stack triangular matrices */
415 284 : if (rank<s && s<size) { /* send top part, receive bottom part */
416 124 : PetscCallMPI(MPI_Sendrecv(A,1,tmat,s,111,A+n,1,tmat,s,111,PetscObjectComm((PetscObject)bv),MPI_STATUS_IGNORE));
417 160 : } else if (s<size) { /* copy top to bottom, receive top part */
418 124 : PetscCallMPI(MPI_Sendrecv(A,1,tmat,rank,111,A+n,1,tmat,rank,111,PetscObjectComm((PetscObject)bv),MPI_STATUS_IGNORE));
419 124 : PetscCallMPI(MPI_Sendrecv(A+n,1,tmat,s,111,A,1,tmat,s,111,PetscObjectComm((PetscObject)bv),MPI_STATUS_IGNORE));
420 : }
421 284 : if (level<nlevels && size!=powtwo) { /* for cases when size is not a power of 2 */
422 168 : if (rank<size-powtwo) { /* send bottom part */
423 72 : PetscCallMPI(MPI_Send(A+n,1,tmat,rank+powtwo,111,PetscObjectComm((PetscObject)bv)));
424 96 : } else if (rank>=powtwo) { /* receive bottom part */
425 72 : PetscCallMPI(MPI_Recv(A+n,1,tmat,rank-powtwo,111,PetscObjectComm((PetscObject)bv),MPI_STATUS_IGNORE));
426 : }
427 : }
428 : /* Compute QR and build orthogonal matrix */
429 284 : if (level<nlevels || (level==nlevels && s<size)) {
430 272 : PetscCallBLAS("LAPACKgeqrf",LAPACKgeqrf_(&l,&n,A,&l,tau,work,&lwork,&info));
431 272 : SlepcCheckLapackInfo("geqrf",info);
432 272 : PetscCall(PetscArraycpy(QQ+(level-1)*n*lda,A,n*lda));
433 272 : PetscCallBLAS("LAPACKorgqr",LAPACKorgqr_(&l,&n,&n,QQ+(level-1)*n*lda,&l,tau,work,&lwork,&info));
434 272 : SlepcCheckLapackInfo("orgqr",info);
435 1600 : for (j=0;j<n;j++) {
436 4544 : for (i=j+1;i<n;i++) A[i+j*lda] = 0.0;
437 : }
438 12 : } else if (level==nlevels) { /* only one triangular matrix, set Q=I */
439 12 : PetscCall(PetscArrayzero(QQ+(level-1)*n*lda,n*lda));
440 72 : for (j=0;j<n;j++) QQ[j+j*lda+(level-1)*n*lda] = 1.0;
441 : }
442 : }
443 :
444 : /* Extract R */
445 116 : if (R) {
446 664 : for (j=0;j<n;j++) {
447 2388 : for (i=0;i<=j;i++) R[i+j*ldr] = A[i+j*lda];
448 1840 : for (i=j+1;i<n;i++) R[i+j*ldr] = 0.0;
449 : }
450 : }
451 :
452 : /* Accumulate orthogonal matrices */
453 400 : for (level=1;level<=nlevels;level++) {
454 284 : plevel = PetscPowInt(2,level);
455 568 : PetscCall(PetscMPIIntCast(plevel*PetscFloorReal(rank/(PetscReal)plevel)+(rank+PetscPowInt(2,level-1))%plevel,&s));
456 284 : Qhalf = (rank<s)? QQ+(level-1)*n*lda: QQ+(level-1)*n*lda+n;
457 284 : if (level<nlevels) {
458 168 : PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&l,&n,&n,&one,QQ+level*n*lda,&l,Qhalf,&l,&zero,C,&l));
459 168 : PetscCall(PetscArraycpy(QQ+level*n*lda,C,n*lda));
460 : } else {
461 176 : for (i=0;i<m/l;i++) {
462 60 : PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&l,&n,&n,&one,Q+i*l,&ldq,Qhalf,&l,&zero,C,&l));
463 320 : for (j=0;j<n;j++) PetscCall(PetscArraycpy(Q+i*l+j*ldq,C+j*l,l));
464 : }
465 116 : sz = m%l;
466 116 : if (sz) {
467 116 : PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&sz,&n,&n,&one,Q+(m/l)*l,&ldq,Qhalf,&l,&zero,C,&l));
468 664 : for (j=0;j<n;j++) PetscCall(PetscArraycpy(Q+(m/l)*l+j*ldq,C+j*l,sz));
469 : }
470 : }
471 : }
472 :
473 116 : PetscCallMPI(MPI_Type_free(&tmat));
474 : }
475 :
476 187 : PetscCall(PetscLogFlops(3.0*m*n*n));
477 187 : PetscCall(PetscFPTrapPop());
478 187 : PetscFunctionReturn(PETSC_SUCCESS);
479 : }
480 :
481 : /*
482 : Reduction operation to compute [~,Rout]=qr([Rin1;Rin2]) in the TSQR algorithm;
483 : all matrices are upper triangular stored in packed format
484 : */
485 32 : SLEPC_EXTERN void MPIAPI SlepcGivensPacked(void *in,void *inout,PetscMPIInt *len,MPI_Datatype *datatype)
486 : {
487 32 : PetscBLASInt n,i,j,k,one=1;
488 32 : PetscMPIInt tsize;
489 32 : PetscScalar v,s,*R2=(PetscScalar*)in,*R1=(PetscScalar*)inout;
490 32 : PetscReal c;
491 :
492 32 : PetscFunctionBegin;
493 32 : PetscCallMPIAbort(PETSC_COMM_SELF,MPI_Type_size(*datatype,&tsize)); /* we assume len=1 */
494 32 : tsize /= sizeof(PetscScalar);
495 32 : n = (-1+(PetscBLASInt)PetscSqrtReal(1+8*tsize))/2;
496 160 : for (j=0;j<n;j++) {
497 512 : for (i=0;i<=j;i++) {
498 384 : LAPACKlartg_(R1+(2*n-j-1)*j/2+j,R2+(2*n-i-1)*i/2+j,&c,&s,&v);
499 384 : R1[(2*n-j-1)*j/2+j] = v;
500 384 : k = n-j-1;
501 384 : if (k) BLASrot_(&k,R1+(2*n-j-1)*j/2+j+1,&one,R2+(2*n-i-1)*i/2+j+1,&one,&c,&s);
502 : }
503 : }
504 32 : PetscFunctionReturnVoid();
505 : }
506 :
507 : /*
508 : Computes the R factor of the QR factorization of an mxn matrix via parallel TSQR
509 : */
510 73 : PetscErrorCode BVOrthogonalize_LAPACK_TSQR_OnlyR(BV bv,PetscInt m_,PetscInt n_,PetscScalar *Q,PetscInt ldq_,PetscScalar *R,PetscInt ldr)
511 : {
512 73 : PetscInt worklen;
513 73 : PetscBLASInt m,n,ldq,i,j,s,nb,lwork,info;
514 73 : PetscScalar *tau,*work,*A=NULL,*R1=NULL,*R2=NULL;
515 73 : PetscMPIInt size,count;
516 73 : MPI_Datatype tmat;
517 :
518 73 : PetscFunctionBegin;
519 73 : PetscCall(PetscFPTrapPush(PETSC_FP_TRAP_OFF));
520 73 : PetscCall(PetscBLASIntCast(m_,&m));
521 73 : PetscCall(PetscBLASIntCast(n_,&n));
522 73 : PetscCall(PetscBLASIntCast(ldq_,&ldq));
523 73 : nb = 16;
524 73 : s = n+n*(n-1)/2; /* length of packed triangular matrix */
525 73 : PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)bv),&size));
526 73 : worklen = n+n*nb+2*s+ldq*n;
527 73 : PetscCall(BVAllocateWork_Private(bv,worklen));
528 73 : tau = bv->work;
529 73 : work = bv->work+n;
530 73 : R1 = bv->work+n+n*nb;
531 73 : R2 = bv->work+n+n*nb+s;
532 73 : A = bv->work+n+n*nb+2*s;
533 73 : PetscCall(PetscBLASIntCast(n*nb,&lwork));
534 73 : PetscCall(PetscArraycpy(A,Q,ldq*n));
535 :
536 : /* Compute QR */
537 73 : PetscCallBLAS("LAPACKgeqrf",LAPACKgeqrf_(&m,&n,A,&ldq,tau,work,&lwork,&info));
538 73 : SlepcCheckLapackInfo("geqrf",info);
539 :
540 73 : if (size==1) {
541 : /* Extract R */
542 697 : for (j=0;j<n;j++) {
543 6232 : for (i=0;i<=PetscMin(j,m-1);i++) R[i+j*ldr] = A[i+j*ldq];
544 5576 : for (i=PetscMin(j,m-1)+1;i<n;i++) R[i+j*ldr] = 0.0;
545 : }
546 : } else {
547 : /* Use MPI reduction operation to obtain global R */
548 32 : PetscCall(PetscMPIIntCast(s,&count));
549 32 : PetscCallMPI(MPI_Type_contiguous(count,MPIU_SCALAR,&tmat));
550 32 : PetscCallMPI(MPI_Type_commit(&tmat));
551 160 : for (i=0;i<n;i++) {
552 512 : for (j=i;j<n;j++) R1[(2*n-i-1)*i/2+j] = (i<m)?A[i+j*ldq]:0.0;
553 : }
554 96 : PetscCallMPI(MPIU_Allreduce(R1,R2,1,tmat,MPIU_TSQR,PetscObjectComm((PetscObject)bv)));
555 160 : for (i=0;i<n;i++) {
556 384 : for (j=0;j<i;j++) R[i+j*ldr] = 0.0;
557 512 : for (j=i;j<n;j++) R[i+j*ldr] = R2[(2*n-i-1)*i/2+j];
558 : }
559 32 : PetscCallMPI(MPI_Type_free(&tmat));
560 : }
561 :
562 73 : PetscCall(PetscLogFlops(3.0*m*n*n));
563 73 : PetscCall(PetscFPTrapPop());
564 73 : PetscFunctionReturn(PETSC_SUCCESS);
565 : }
|