Actual source code: bvlapack.c

slepc-3.21.1 2024-04-26

/*
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   SLEPc - Scalable Library for Eigenvalue Problem Computations
   Copyright (c) 2002-, Universitat Politecnica de Valencia, Spain

   This file is part of SLEPc.
   SLEPc is distributed under a 2-clause BSD license (see LICENSE).
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
*/
/*
   BV private kernels that use LAPACK
*/

#include <slepc/private/bvimpl.h>
#include <slepcblaslapack.h>

/*
    Reduction operation to compute sqrt(x**2+y**2) when normalizing vectors
*/
SLEPC_EXTERN void MPIAPI SlepcPythag(void *in,void *inout,PetscMPIInt *len,MPI_Datatype *datatype)
{
  PetscBLASInt i,n=*len;
  PetscReal    *x = (PetscReal*)in,*y = (PetscReal*)inout;

  PetscFunctionBegin;
  if (PetscUnlikely(*datatype!=MPIU_REAL)) {
    (void)(*PetscErrorPrintf)("Only implemented for MPIU_REAL data type");
    MPI_Abort(PETSC_COMM_WORLD,1);
  }
  for (i=0;i<n;i++) y[i] = SlepcAbs(x[i],y[i]);
  PetscFunctionReturnVoid();
}
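
/*
   Illustrative sketch (not part of the library): SlepcPythag has the signature
   of an MPI user-defined reduction function, and this file uses it through the
   MPIU_LAPY2 operation in the Allreduce calls below. A minimal standalone
   registration and use would look as follows; the names ExamplePythagReduce,
   comm, lnrm and gnrm are assumptions for illustration only.
*/
#if 0
static PetscErrorCode ExamplePythagReduce(MPI_Comm comm,PetscReal lnrm,PetscReal *gnrm)
{
  MPI_Op op;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Op_create((MPI_User_function*)SlepcPythag,1,&op));  /* 1 = commutative */
  PetscCallMPI(MPI_Allreduce(&lnrm,gnrm,1,MPIU_REAL,op,comm));         /* *gnrm = sqrt of the sum of squared local norms */
  PetscCallMPI(MPI_Op_free(&op));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif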

/*
    Compute ||A|| for an mxn matrix
*/
PetscErrorCode BVNorm_LAPACK_Private(BV bv,PetscInt m_,PetscInt n_,const PetscScalar *A,PetscInt lda_,NormType type,PetscReal *nrm,PetscBool mpi)
{
  PetscBLASInt   m,n,lda,i,j;
  PetscMPIInt    len;
  PetscReal      lnrm,*rwork=NULL,*rwork2=NULL;

  PetscFunctionBegin;
  PetscCall(PetscFPTrapPush(PETSC_FP_TRAP_OFF));
  PetscCall(PetscBLASIntCast(m_,&m));
  PetscCall(PetscBLASIntCast(n_,&n));
  PetscCall(PetscBLASIntCast(lda_,&lda));
  if (type==NORM_FROBENIUS || type==NORM_2) {
    lnrm = LAPACKlange_("F",&m,&n,(PetscScalar*)A,&lda,rwork);
    if (mpi) PetscCall(MPIU_Allreduce(&lnrm,nrm,1,MPIU_REAL,MPIU_LAPY2,PetscObjectComm((PetscObject)bv)));
    else *nrm = lnrm;
    PetscCall(PetscLogFlops(2.0*m*n));
  } else if (type==NORM_1) {
    if (mpi) {
      PetscCall(BVAllocateWork_Private(bv,2*n_));
      rwork = (PetscReal*)bv->work;
      rwork2 = rwork+n_;
      PetscCall(PetscArrayzero(rwork,n_));
      PetscCall(PetscArrayzero(rwork2,n_));
      for (j=0;j<n_;j++) {
        for (i=0;i<m_;i++) {
          rwork[j] += PetscAbsScalar(A[i+j*lda_]);
        }
      }
      PetscCall(PetscMPIIntCast(n_,&len));
      PetscCall(MPIU_Allreduce(rwork,rwork2,len,MPIU_REAL,MPIU_SUM,PetscObjectComm((PetscObject)bv)));
      *nrm = 0.0;
      for (j=0;j<n_;j++) if (rwork2[j] > *nrm) *nrm = rwork2[j];
    } else {
      *nrm = LAPACKlange_("O",&m,&n,(PetscScalar*)A,&lda,rwork);
    }
    PetscCall(PetscLogFlops(1.0*m*n));
  } else if (type==NORM_INFINITY) {
    PetscCall(BVAllocateWork_Private(bv,m_));
    rwork = (PetscReal*)bv->work;
    lnrm = LAPACKlange_("I",&m,&n,(PetscScalar*)A,&lda,rwork);
    if (mpi) PetscCall(MPIU_Allreduce(&lnrm,nrm,1,MPIU_REAL,MPIU_MAX,PetscObjectComm((PetscObject)bv)));
    else *nrm = lnrm;
    PetscCall(PetscLogFlops(1.0*m*n));
  }
  PetscCall(PetscFPTrapPop());
  PetscFunctionReturn(PETSC_SUCCESS);
}
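
/*
   Worked reductions behind the three parallel cases above, for the BV layout
   in which each process owns a block of rows of the global matrix:
     Frobenius:  ||A||_F = sqrt(sum_p ||A_p||_F^2), accumulated pairwise as
                 hypot(x,y) via MPIU_LAPY2 (SlepcPythag) to avoid overflow;
     1-norm:     ||A||_1 = max_j sum_i |a_ij|; each column is split across
                 processes, so local column sums are combined with MPIU_SUM
                 and the maximum is taken only after the reduction;
     inf-norm:   ||A||_inf = max_i sum_j |a_ij|; each row lives entirely on
                 one process, so local results combine directly with MPIU_MAX.
*/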

/*
    Normalize the columns of an mxn matrix A
*/
PetscErrorCode BVNormalize_LAPACK_Private(BV bv,PetscInt m_,PetscInt n_,const PetscScalar *A,PetscInt lda_,PetscScalar *eigi,PetscBool mpi)
{
  PetscBLASInt   m,lda,j,k,info,zero=0;
  PetscMPIInt    len;
  PetscReal      *norms,*rwork=NULL,*rwork2=NULL,done=1.0;

  PetscFunctionBegin;
  PetscCall(PetscFPTrapPush(PETSC_FP_TRAP_OFF));
  PetscCall(PetscBLASIntCast(m_,&m));
  PetscCall(PetscBLASIntCast(lda_,&lda));
  PetscCall(BVAllocateWork_Private(bv,2*n_));
  rwork = (PetscReal*)bv->work;
  rwork2 = rwork+n_;
  /* compute local norms */
  for (j=0;j<n_;j++) {
    k = 1;
#if !defined(PETSC_USE_COMPLEX)
    if (eigi && eigi[j] != 0.0) k = 2;
#endif
    rwork[j] = LAPACKlange_("F",&m,&k,(PetscScalar*)(A+j*lda_),&lda,rwork2);
    if (k==2) { rwork[j+1] = rwork[j]; j++; }
  }
  /* reduction to get global norms */
  if (mpi) {
    PetscCall(PetscMPIIntCast(n_,&len));
    PetscCall(PetscArrayzero(rwork2,n_));
    PetscCall(MPIU_Allreduce(rwork,rwork2,len,MPIU_REAL,MPIU_LAPY2,PetscObjectComm((PetscObject)bv)));
    norms = rwork2;
  } else norms = rwork;
  /* scale columns */
  for (j=0;j<n_;j++) {
    k = 1;
#if !defined(PETSC_USE_COMPLEX)
    if (eigi && eigi[j] != 0.0) k = 2;
#endif
    PetscCallBLAS("LAPACKlascl",LAPACKlascl_("G",&zero,&zero,norms+j,&done,&m,&k,(PetscScalar*)(A+j*lda_),&lda,&info));
    SlepcCheckLapackInfo("lascl",info);
    if (k==2) j++;
  }
  PetscCall(PetscLogFlops(3.0*m*n_));
  PetscCall(PetscFPTrapPop());
  PetscFunctionReturn(PETSC_SUCCESS);
}
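
/*
   On the k==2 branch above: in real arithmetic a complex conjugate eigenvector
   pair is stored as two consecutive columns [x_re, x_im], flagged by a nonzero
   eigi[j]. The pair is normalized jointly: the Frobenius norm of the mx2 block
   gives ||[x_re x_im]||_F = sqrt(||x_re||_2^2 + ||x_im||_2^2), and lascl then
   divides both columns by that single value, so the reconstructed complex
   vector x_re + i*x_im ends up with unit 2-norm.
*/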

/*
   Compute the upper Cholesky factor in R and its inverse in S.
   If S == R then the inverse overwrites the Cholesky factor.
 */
PetscErrorCode BVMatCholInv_LAPACK_Private(BV bv,Mat R,Mat S)
{
  PetscInt       i,k,l,n,m,ld,lds;
  PetscScalar    *pR,*pS;
  PetscBLASInt   info,n_ = 0,m_ = 0,ld_,lds_;

  PetscFunctionBegin;
  l = bv->l;
  k = bv->k;
  PetscCall(MatGetSize(R,&m,NULL));
  n = k-l;
  PetscCall(PetscBLASIntCast(m,&m_));
  PetscCall(PetscBLASIntCast(n,&n_));
  ld  = m;
  ld_ = m_;
  PetscCall(MatDenseGetArray(R,&pR));

  if (S==R) {
    PetscCall(BVAllocateWork_Private(bv,m*k));
    pS = bv->work;
    lds = ld;
    lds_ = ld_;
  } else {
    PetscCall(MatDenseGetArray(S,&pS));
    PetscCall(MatGetSize(S,&lds,NULL));
    PetscCall(PetscBLASIntCast(lds,&lds_));
  }

  /* save a copy of matrix in S */
  for (i=l;i<k;i++) PetscCall(PetscArraycpy(pS+i*lds+l,pR+i*ld+l,n));

  /* compute upper Cholesky factor in R */
  PetscCall(PetscFPTrapPush(PETSC_FP_TRAP_OFF));
  PetscCallBLAS("LAPACKpotrf",LAPACKpotrf_("U",&n_,pR+l*ld+l,&ld_,&info));
  PetscCall(PetscLogFlops((1.0*n*n*n)/3.0));

  if (info) {  /* LAPACKpotrf failed, retry on diagonally perturbed matrix */
    for (i=l;i<k;i++) {
      PetscCall(PetscArraycpy(pR+i*ld+l,pS+i*lds+l,n));
      pR[i+i*ld] += 50.0*PETSC_MACHINE_EPSILON;
    }
    PetscCallBLAS("LAPACKpotrf",LAPACKpotrf_("U",&n_,pR+l*ld+l,&ld_,&info));
    SlepcCheckLapackInfo("potrf",info);
    PetscCall(PetscLogFlops((1.0*n*n*n)/3.0));
  }

  /* compute S = inv(R) */
  if (S==R) {
    PetscCallBLAS("LAPACKtrtri",LAPACKtrtri_("U","N",&n_,pR+l*ld+l,&ld_,&info));
  } else {
    PetscCall(PetscArrayzero(pS+l*lds,(k-l)*k));
    for (i=l;i<k;i++) PetscCall(PetscArraycpy(pS+i*lds+l,pR+i*ld+l,n));
    PetscCallBLAS("LAPACKtrtri",LAPACKtrtri_("U","N",&n_,pS+l*lds+l,&lds_,&info));
  }
  SlepcCheckLapackInfo("trtri",info);
  PetscCall(PetscFPTrapPop());
  PetscCall(PetscLogFlops(0.33*n*n*n));

  /* Zero out entries below the diagonal */
  for (i=l;i<k-1;i++) {
    PetscCall(PetscArrayzero(pR+i*ld+i+1,(k-i-1)));
    if (S!=R) PetscCall(PetscArrayzero(pS+i*lds+i+1,(k-i-1)));
  }
  PetscCall(MatDenseRestoreArray(R,&pR));
  if (S!=R) PetscCall(MatDenseRestoreArray(S,&pS));
  PetscFunctionReturn(PETSC_SUCCESS);
}
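
/*
   Sketch of the Cholesky-QR identity that motivates this kernel: if the active
   block of R holds the Gram matrix G = V'*V on input, then potrf factors
   G = R'*R with R upper triangular, and Q = V*inv(R) satisfies
     Q'*Q = inv(R')*G*inv(R) = inv(R')*R'*R*inv(R) = I,
   so S = inv(R) is exactly the matrix by which the basis must be
   post-multiplied. The retry with a diagonal shift of 50*eps handles a Gram
   matrix that has become numerically indefinite due to rounding.
*/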

/*
   Compute the inverse of an upper triangular matrix R, store it in S.
   If S == R then the inverse overwrites R.
 */
PetscErrorCode BVMatTriInv_LAPACK_Private(BV bv,Mat R,Mat S)
{
  PetscInt       i,k,l,n,m,ld,lds;
  PetscScalar    *pR,*pS;
  PetscBLASInt   info,n_,m_ = 0,ld_,lds_;

  PetscFunctionBegin;
  l = bv->l;
  k = bv->k;
  PetscCall(MatGetSize(R,&m,NULL));
  n = k-l;
  PetscCall(PetscBLASIntCast(m,&m_));
  PetscCall(PetscBLASIntCast(n,&n_));
  ld  = m;
  ld_ = m_;
  PetscCall(MatDenseGetArray(R,&pR));

  if (S==R) {
    PetscCall(BVAllocateWork_Private(bv,m*k));
    pS = bv->work;
    lds = ld;
    lds_ = ld_;
  } else {
    PetscCall(MatDenseGetArray(S,&pS));
    PetscCall(MatGetSize(S,&lds,NULL));
    PetscCall(PetscBLASIntCast(lds,&lds_));
  }

  /* compute S = inv(R) */
  PetscCall(PetscFPTrapPush(PETSC_FP_TRAP_OFF));
  if (S==R) {
    PetscCallBLAS("LAPACKtrtri",LAPACKtrtri_("U","N",&n_,pR+l*ld+l,&ld_,&info));
  } else {
    PetscCall(PetscArrayzero(pS+l*lds,(k-l)*k));
    for (i=l;i<k;i++) PetscCall(PetscArraycpy(pS+i*lds+l,pR+i*ld+l,n));
    PetscCallBLAS("LAPACKtrtri",LAPACKtrtri_("U","N",&n_,pS+l*lds+l,&lds_,&info));
  }
  SlepcCheckLapackInfo("trtri",info);
  PetscCall(PetscFPTrapPop());
  PetscCall(PetscLogFlops(0.33*n*n*n));

  PetscCall(MatDenseRestoreArray(R,&pR));
  if (S!=R) PetscCall(MatDenseRestoreArray(S,&pS));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
   Compute the matrix to be used for post-multiplying the basis in the SVQB
   block orthogonalization method.
   On input R = V'*V, on output S = D*U*Lambda^{-1/2} where (U,Lambda) is
   the eigendecomposition of D*R*D with D=diag(R)^{-1/2}.
   If S == R then the result overwrites R.
 */
PetscErrorCode BVMatSVQB_LAPACK_Private(BV bv,Mat R,Mat S)
{
  PetscInt       i,j,k,l,n,m,ld,lds;
  PetscScalar    *pR,*pS,*D,*work,a;
  PetscReal      *eig,dummy;
  PetscBLASInt   info,lwork,n_,m_ = 0,ld_,lds_;
#if defined(PETSC_USE_COMPLEX)
  PetscReal      *rwork,rdummy;
#endif

  PetscFunctionBegin;
  l = bv->l;
  k = bv->k;
  PetscCall(MatGetSize(R,&m,NULL));
  PetscCall(MatDenseGetLDA(R,&ld));
  n = k-l;
  PetscCall(PetscBLASIntCast(m,&m_));
  PetscCall(PetscBLASIntCast(n,&n_));
  ld_ = m_;
  PetscCall(MatDenseGetArray(R,&pR));

  if (S==R) {
    pS = pR;
    lds = ld;
    lds_ = ld_;
  } else {
    PetscCall(MatDenseGetArray(S,&pS));
    PetscCall(MatDenseGetLDA(S,&lds));
    PetscCall(PetscBLASIntCast(lds,&lds_));
  }
  PetscCall(PetscFPTrapPush(PETSC_FP_TRAP_OFF));

  /* workspace query and memory allocation */
  lwork = -1;
#if defined(PETSC_USE_COMPLEX)
  PetscCallBLAS("LAPACKsyev",LAPACKsyev_("V","L",&n_,pS,&lds_,&dummy,&a,&lwork,&rdummy,&info));
  PetscCall(PetscBLASIntCast((PetscInt)PetscRealPart(a),&lwork));
  PetscCall(PetscMalloc4(n,&eig,n,&D,lwork,&work,PetscMax(1,3*n-2),&rwork));
#else
  PetscCallBLAS("LAPACKsyev",LAPACKsyev_("V","L",&n_,pS,&lds_,&dummy,&a,&lwork,&info));
  PetscCall(PetscBLASIntCast((PetscInt)a,&lwork));
  PetscCall(PetscMalloc3(n,&eig,n,&D,lwork,&work));
#endif

  /* copy and scale matrix */
  for (i=l;i<k;i++) D[i-l] = 1.0/PetscSqrtReal(PetscRealPart(pR[i+i*ld]));
  for (i=l;i<k;i++) for (j=l;j<k;j++) pS[i+j*lds] = pR[i+j*ld]*D[i-l];
  for (j=l;j<k;j++) for (i=l;i<k;i++) pS[i+j*lds] *= D[j-l];

  /* compute eigendecomposition */
#if defined(PETSC_USE_COMPLEX)
  PetscCallBLAS("LAPACKsyev",LAPACKsyev_("V","L",&n_,pS+l*lds+l,&lds_,eig,work,&lwork,rwork,&info));
#else
  PetscCallBLAS("LAPACKsyev",LAPACKsyev_("V","L",&n_,pS+l*lds+l,&lds_,eig,work,&lwork,&info));
#endif
  SlepcCheckLapackInfo("syev",info);

  if (S!=R) {   /* R = U' */
    for (i=l;i<k;i++) for (j=l;j<k;j++) pR[i+j*ld] = pS[j+i*lds];
  }

  /* compute S = D*U*Lambda^{-1/2} */
  for (i=l;i<k;i++) for (j=l;j<k;j++) pS[i+j*lds] *= D[i-l];
  for (j=l;j<k;j++) for (i=l;i<k;i++) pS[i+j*lds] /= PetscSqrtReal(eig[j-l]);

  if (S!=R) {   /* compute R = inv(S) = Lambda^{1/2}*U'/D */
    for (i=l;i<k;i++) for (j=l;j<k;j++) pR[i+j*ld] *= PetscSqrtReal(eig[i-l]);
    for (j=l;j<k;j++) for (i=l;i<k;i++) pR[i+j*ld] /= D[j-l];
  }

#if defined(PETSC_USE_COMPLEX)
  PetscCall(PetscFree4(eig,D,work,rwork));
#else
  PetscCall(PetscFree3(eig,D,work));
#endif
  PetscCall(PetscLogFlops(9.0*n*n*n));
  PetscCall(PetscFPTrapPop());

  PetscCall(MatDenseRestoreArray(R,&pR));
  if (S!=R) PetscCall(MatDenseRestoreArray(S,&pS));
  PetscFunctionReturn(PETSC_SUCCESS);
}
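
/*
   Verification of the SVQB formula computed above: with G = V'*V stored in R,
   D = diag(G)^{-1/2}, and the eigendecomposition D*G*D = U*Lambda*U', the
   output S = D*U*Lambda^{-1/2} satisfies
     (V*S)'*(V*S) = S'*G*S = Lambda^{-1/2}*U'*(D*G*D)*U*Lambda^{-1/2} = I,
   so V*S has orthonormal columns. When S != R, the code also returns
   inv(S) = Lambda^{1/2}*U'*inv(D) in R, which follows from inverting the
   three factors of S in reverse order.
*/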

/*
    QR factorization of an mxn matrix via parallel TSQR
*/
PetscErrorCode BVOrthogonalize_LAPACK_TSQR(BV bv,PetscInt m_,PetscInt n_,PetscScalar *Q,PetscInt ldq_,PetscScalar *R,PetscInt ldr)
{
  PetscInt       level,plevel,nlevels,powtwo,lda,worklen;
  PetscBLASInt   m,n,ldq,i,j,k,l,s = 0,nb,sz,lwork,info;
  PetscScalar    *tau,*work,*A=NULL,*QQ=NULL,*Qhalf,*C=NULL,one=1.0,zero=0.0;
  PetscMPIInt    rank,size,count,stride;
  MPI_Datatype   tmat;

  PetscFunctionBegin;
  PetscCall(PetscFPTrapPush(PETSC_FP_TRAP_OFF));
  PetscCall(PetscBLASIntCast(m_,&m));
  PetscCall(PetscBLASIntCast(n_,&n));
  PetscCall(PetscBLASIntCast(ldq_,&ldq));
  k  = PetscMin(m,n);
  nb = 16;
  lda = 2*n;
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)bv),&size));
  PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)bv),&rank));
  nlevels = (PetscInt)PetscCeilReal(PetscLog2Real((PetscReal)size));
  powtwo  = PetscPowInt(2,(PetscInt)PetscFloorReal(PetscLog2Real((PetscReal)size)));
  worklen = n+n*nb;
  if (nlevels) worklen += n*lda+n*lda*nlevels+n*lda;
  PetscCall(BVAllocateWork_Private(bv,worklen));
  tau  = bv->work;
  work = bv->work+n;
  PetscCall(PetscBLASIntCast(n*nb,&lwork));
  if (nlevels) {
    A  = bv->work+n+n*nb;
    QQ = bv->work+n+n*nb+n*lda;
    C  = bv->work+n+n*nb+n*lda+n*lda*nlevels;
  }

  /* Compute QR */
  PetscCallBLAS("LAPACKgeqrf",LAPACKgeqrf_(&m,&n,Q,&ldq,tau,work,&lwork,&info));
  SlepcCheckLapackInfo("geqrf",info);

  /* Extract R */
  if (R || nlevels) {
    for (j=0;j<n;j++) {
      for (i=0;i<=PetscMin(j,m-1);i++) {
        if (nlevels) A[i+j*lda] = Q[i+j*ldq];
        else R[i+j*ldr] = Q[i+j*ldq];
      }
      for (i=PetscMin(j,m-1)+1;i<n;i++) {
        if (nlevels) A[i+j*lda] = 0.0;
        else R[i+j*ldr] = 0.0;
      }
    }
  }

  /* Compute orthogonal matrix in Q */
  PetscCallBLAS("LAPACKorgqr",LAPACKorgqr_(&m,&k,&k,Q,&ldq,tau,work,&lwork,&info));
  SlepcCheckLapackInfo("orgqr",info);

  if (nlevels) {

    PetscCall(PetscMPIIntCast(n,&count));
    PetscCall(PetscMPIIntCast(lda,&stride));
    PetscCall(PetscBLASIntCast(lda,&l));
    PetscCallMPI(MPI_Type_vector(count,count,stride,MPIU_SCALAR,&tmat));
    PetscCallMPI(MPI_Type_commit(&tmat));

    for (level=nlevels;level>=1;level--) {

      plevel = PetscPowInt(2,level);
      PetscCall(PetscBLASIntCast(plevel*PetscFloorReal(rank/(PetscReal)plevel)+(rank+PetscPowInt(2,level-1))%plevel,&s));

      /* Stack triangular matrices */
      if (rank<s && s<size) {  /* send top part, receive bottom part */
        PetscCallMPI(MPI_Sendrecv(A,1,tmat,s,111,A+n,1,tmat,s,111,PetscObjectComm((PetscObject)bv),MPI_STATUS_IGNORE));
      } else if (s<size) {  /* copy top to bottom, receive top part */
        PetscCallMPI(MPI_Sendrecv(A,1,tmat,rank,111,A+n,1,tmat,rank,111,PetscObjectComm((PetscObject)bv),MPI_STATUS_IGNORE));
        PetscCallMPI(MPI_Sendrecv(A+n,1,tmat,s,111,A,1,tmat,s,111,PetscObjectComm((PetscObject)bv),MPI_STATUS_IGNORE));
      }
      if (level<nlevels && size!=powtwo) {  /* for cases when size is not a power of 2 */
        if (rank<size-powtwo) {  /* send bottom part */
          PetscCallMPI(MPI_Send(A+n,1,tmat,rank+powtwo,111,PetscObjectComm((PetscObject)bv)));
        } else if (rank>=powtwo) {  /* receive bottom part */
          PetscCallMPI(MPI_Recv(A+n,1,tmat,rank-powtwo,111,PetscObjectComm((PetscObject)bv),MPI_STATUS_IGNORE));
        }
      }
      /* Compute QR and build orthogonal matrix */
      if (level<nlevels || (level==nlevels && s<size)) {
        PetscCallBLAS("LAPACKgeqrf",LAPACKgeqrf_(&l,&n,A,&l,tau,work,&lwork,&info));
        SlepcCheckLapackInfo("geqrf",info);
        PetscCall(PetscArraycpy(QQ+(level-1)*n*lda,A,n*lda));
        PetscCallBLAS("LAPACKorgqr",LAPACKorgqr_(&l,&n,&n,QQ+(level-1)*n*lda,&l,tau,work,&lwork,&info));
        SlepcCheckLapackInfo("orgqr",info);
        for (j=0;j<n;j++) {
          for (i=j+1;i<n;i++) A[i+j*lda] = 0.0;
        }
      } else if (level==nlevels) {  /* only one triangular matrix, set Q=I */
        PetscCall(PetscArrayzero(QQ+(level-1)*n*lda,n*lda));
        for (j=0;j<n;j++) QQ[j+j*lda+(level-1)*n*lda] = 1.0;
      }
    }

    /* Extract R */
    if (R) {
      for (j=0;j<n;j++) {
        for (i=0;i<=j;i++) R[i+j*ldr] = A[i+j*lda];
        for (i=j+1;i<n;i++) R[i+j*ldr] = 0.0;
      }
    }

    /* Accumulate orthogonal matrices */
    for (level=1;level<=nlevels;level++) {
      plevel = PetscPowInt(2,level);
      PetscCall(PetscBLASIntCast(plevel*PetscFloorReal(rank/(PetscReal)plevel)+(rank+PetscPowInt(2,level-1))%plevel,&s));
      Qhalf = (rank<s)? QQ+(level-1)*n*lda: QQ+(level-1)*n*lda+n;
      if (level<nlevels) {
        PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&l,&n,&n,&one,QQ+level*n*lda,&l,Qhalf,&l,&zero,C,&l));
        PetscCall(PetscArraycpy(QQ+level*n*lda,C,n*lda));
      } else {
        for (i=0;i<m/l;i++) {
          PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&l,&n,&n,&one,Q+i*l,&ldq,Qhalf,&l,&zero,C,&l));
          for (j=0;j<n;j++) PetscCall(PetscArraycpy(Q+i*l+j*ldq,C+j*l,l));
        }
        sz = m%l;
        if (sz) {
          PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&sz,&n,&n,&one,Q+(m/l)*l,&ldq,Qhalf,&l,&zero,C,&l));
          for (j=0;j<n;j++) PetscCall(PetscArraycpy(Q+(m/l)*l+j*ldq,C+j*l,sz));
        }
      }
    }

    PetscCallMPI(MPI_Type_free(&tmat));
  }

  PetscCall(PetscLogFlops(3.0*m*n*n));
  PetscCall(PetscFPTrapPop());
  PetscFunctionReturn(PETSC_SUCCESS);
}
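
/*
   Hedged usage sketch (not part of this file): this private kernel is normally
   reached through the public BVOrthogonalize() interface once the TSQR block
   method has been selected, e.g. with the -bv_orthog_block tsqr option. The
   function name ExampleTSQR and its arguments are illustrative assumptions.
*/
#if 0
static PetscErrorCode ExampleTSQR(BV V,Mat R)
{
  PetscFunctionBegin;
  PetscCall(BVSetOrthogonalization(V,BV_ORTHOG_CGS,BV_ORTHOG_REFINE_IFNEEDED,PETSC_DEFAULT,BV_ORTHOG_BLOCK_TSQR));
  PetscCall(BVOrthogonalize(V,R));  /* on exit, V holds Q and R the triangular factor */
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif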

/*
    Reduction operation to compute [~,Rout]=qr([Rin1;Rin2]) in the TSQR algorithm;
    all matrices are upper triangular, stored in packed format
*/
SLEPC_EXTERN void MPIAPI SlepcGivensPacked(void *in,void *inout,PetscMPIInt *len,MPI_Datatype *datatype)
{
  PetscBLASInt   n,i,j,k,one=1;
  PetscMPIInt    tsize;
  PetscScalar    v,s,*R2=(PetscScalar*)in,*R1=(PetscScalar*)inout;
  PetscReal      c;

  PetscFunctionBegin;
  PetscCallMPIAbort(PETSC_COMM_SELF,MPI_Type_size(*datatype,&tsize));  /* we assume len=1 */
  tsize /= sizeof(PetscScalar);
  n = (-1+(PetscBLASInt)PetscSqrtReal(1+8*tsize))/2;
  for (j=0;j<n;j++) {
    for (i=0;i<=j;i++) {
      LAPACKlartg_(R1+(2*n-j-1)*j/2+j,R2+(2*n-i-1)*i/2+j,&c,&s,&v);
      R1[(2*n-j-1)*j/2+j] = v;
      k = n-j-1;
      if (k) BLASrot_(&k,R1+(2*n-j-1)*j/2+j+1,&one,R2+(2*n-i-1)*i/2+j+1,&one,&c,&s);
    }
  }
  PetscFunctionReturnVoid();
}
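
/*
   Packed storage used above, illustrated for n=3: the upper triangular matrix
   is stored by rows, with entry (i,j), i<=j, at offset (2n-i-1)*i/2+j, i.e.
     [ 0 1 2 ]
     [   3 4 ]
     [     5 ]
   for a total of n*(n+1)/2 scalars, which is how n is recovered from the MPI
   datatype size. For each column j, lartg generates a Givens rotation that
   annihilates R2(i,j) against the diagonal entry R1(j,j), and rot applies it
   to the remaining n-j-1 trailing entries of row j of R1 and row i of R2.
   SLEPc applies this function through the MPIU_TSQR reduction operation used
   in the routine below.
*/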

/*
    Computes the R factor of the QR factorization of an mxn matrix via parallel TSQR
*/
PetscErrorCode BVOrthogonalize_LAPACK_TSQR_OnlyR(BV bv,PetscInt m_,PetscInt n_,PetscScalar *Q,PetscInt ldq_,PetscScalar *R,PetscInt ldr)
{
  PetscInt       worklen;
  PetscBLASInt   m,n,ldq,i,j,s,nb,lwork,info;
  PetscScalar    *tau,*work,*A=NULL,*R1=NULL,*R2=NULL;
  PetscMPIInt    size,count;
  MPI_Datatype   tmat;

  PetscFunctionBegin;
  PetscCall(PetscFPTrapPush(PETSC_FP_TRAP_OFF));
  PetscCall(PetscBLASIntCast(m_,&m));
  PetscCall(PetscBLASIntCast(n_,&n));
  PetscCall(PetscBLASIntCast(ldq_,&ldq));
  nb = 16;
  s  = n+n*(n-1)/2;  /* length of packed triangular matrix */
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)bv),&size));
  worklen = n+n*nb+2*s+ldq*n;
  PetscCall(BVAllocateWork_Private(bv,worklen));
  tau  = bv->work;
  work = bv->work+n;
  R1   = bv->work+n+n*nb;
  R2   = bv->work+n+n*nb+s;
  A    = bv->work+n+n*nb+2*s;
  PetscCall(PetscBLASIntCast(n*nb,&lwork));
  PetscCall(PetscArraycpy(A,Q,ldq*n));

  /* Compute QR */
  PetscCallBLAS("LAPACKgeqrf",LAPACKgeqrf_(&m,&n,A,&ldq,tau,work,&lwork,&info));
  SlepcCheckLapackInfo("geqrf",info);

  if (size==1) {
    /* Extract R */
    for (j=0;j<n;j++) {
      for (i=0;i<=PetscMin(j,m-1);i++) R[i+j*ldr] = A[i+j*ldq];
      for (i=PetscMin(j,m-1)+1;i<n;i++) R[i+j*ldr] = 0.0;
    }
  } else {
    /* Use MPI reduction operation to obtain global R */
    PetscCall(PetscMPIIntCast(s,&count));
    PetscCallMPI(MPI_Type_contiguous(count,MPIU_SCALAR,&tmat));
    PetscCallMPI(MPI_Type_commit(&tmat));
    for (i=0;i<n;i++) {
      for (j=i;j<n;j++) R1[(2*n-i-1)*i/2+j] = (i<m)?A[i+j*ldq]:0.0;
    }
    PetscCall(MPIU_Allreduce(R1,R2,1,tmat,MPIU_TSQR,PetscObjectComm((PetscObject)bv)));
    for (i=0;i<n;i++) {
      for (j=0;j<i;j++) R[i+j*ldr] = 0.0;
      for (j=i;j<n;j++) R[i+j*ldr] = R2[(2*n-i-1)*i/2+j];
    }
    PetscCallMPI(MPI_Type_free(&tmat));
  }

  PetscCall(PetscLogFlops(3.0*m*n*n));
  PetscCall(PetscFPTrapPop());
  PetscFunctionReturn(PETSC_SUCCESS);
}
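
/*
   Note: this only-R variant avoids forming the orthogonal factor explicitly.
   A natural consumer is a TSQR-then-inverse block method (cf. the
   BV_ORTHOG_BLOCK_TSQRCHOL block type), where R is computed as above and the
   orthogonal basis is then recovered as Q = V*inv(R) using the
   triangular-inverse kernels earlier in this file.
*/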