Actual source code: bvlapack.c
slepc-3.21.1 2024-04-26
/*
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   SLEPc - Scalable Library for Eigenvalue Problem Computations
   Copyright (c) 2002-, Universitat Politecnica de Valencia, Spain

   This file is part of SLEPc.
   SLEPc is distributed under a 2-clause BSD license (see LICENSE).
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
*/
/*
   BV private kernels that use LAPACK
*/

#include <slepc/private/bvimpl.h>
#include <slepcblaslapack.h>

/*
   Reduction operation to compute sqrt(x**2+y**2) when normalizing vectors
*/
SLEPC_EXTERN void MPIAPI SlepcPythag(void *in,void *inout,PetscMPIInt *len,MPI_Datatype *datatype)
{
  PetscBLASInt i,n=*len;
  PetscReal    *x = (PetscReal*)in,*y = (PetscReal*)inout;

  PetscFunctionBegin;
  if (PetscUnlikely(*datatype!=MPIU_REAL)) {
    (void)(*PetscErrorPrintf)("Only implemented for MPIU_REAL data type");
    MPI_Abort(PETSC_COMM_WORLD,1);
  }
  for (i=0;i<n;i++) y[i] = SlepcAbs(x[i],y[i]);
  PetscFunctionReturnVoid();
}
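
/*
   Usage sketch (not part of this file; the operation name and communicator
   variable below are hypothetical). SlepcPythag has the MPI_User_function
   signature, so it can be registered as a commutative user-defined reduction;
   SLEPc performs an equivalent registration to obtain the MPIU_LAPY2
   operation used by the norm kernels below:

     MPI_Op    pythag_op;
     PetscReal local_nrm,global_nrm;
     PetscCallMPI(MPI_Op_create((MPI_User_function*)SlepcPythag,1,&pythag_op));
     PetscCallMPI(MPI_Allreduce(&local_nrm,&global_nrm,1,MPIU_REAL,pythag_op,comm));
     PetscCallMPI(MPI_Op_free(&pythag_op));

   Since SlepcAbs(x,y) evaluates sqrt(x^2+y^2) without overflow (like LAPACK's
   xLAPY2), reducing per-process partial norms this way yields the global
   2-norm of a distributed vector.
*/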

/*
   Compute ||A|| for an mxn matrix
*/
PetscErrorCode BVNorm_LAPACK_Private(BV bv,PetscInt m_,PetscInt n_,const PetscScalar *A,PetscInt lda_,NormType type,PetscReal *nrm,PetscBool mpi)
{
  PetscBLASInt m,n,lda,i,j;
  PetscMPIInt  len;
  PetscReal    lnrm,*rwork=NULL,*rwork2=NULL;

  PetscFunctionBegin;
  PetscCall(PetscFPTrapPush(PETSC_FP_TRAP_OFF));
  PetscCall(PetscBLASIntCast(m_,&m));
  PetscCall(PetscBLASIntCast(n_,&n));
  PetscCall(PetscBLASIntCast(lda_,&lda));
  if (type==NORM_FROBENIUS || type==NORM_2) {
    lnrm = LAPACKlange_("F",&m,&n,(PetscScalar*)A,&lda,rwork);
    if (mpi) PetscCall(MPIU_Allreduce(&lnrm,nrm,1,MPIU_REAL,MPIU_LAPY2,PetscObjectComm((PetscObject)bv)));
    else *nrm = lnrm;
    PetscCall(PetscLogFlops(2.0*m*n));
  } else if (type==NORM_1) {
    if (mpi) {
      PetscCall(BVAllocateWork_Private(bv,2*n_));
      rwork = (PetscReal*)bv->work;
      rwork2 = rwork+n_;
      PetscCall(PetscArrayzero(rwork,n_));
      PetscCall(PetscArrayzero(rwork2,n_));
      for (j=0;j<n_;j++) {
        for (i=0;i<m_;i++) {
          rwork[j] += PetscAbsScalar(A[i+j*lda_]);
        }
      }
      PetscCall(PetscMPIIntCast(n_,&len));
      PetscCall(MPIU_Allreduce(rwork,rwork2,len,MPIU_REAL,MPIU_SUM,PetscObjectComm((PetscObject)bv)));
      *nrm = 0.0;
      for (j=0;j<n_;j++) if (rwork2[j] > *nrm) *nrm = rwork2[j];
    } else {
      *nrm = LAPACKlange_("O",&m,&n,(PetscScalar*)A,&lda,rwork);
    }
    PetscCall(PetscLogFlops(1.0*m*n));
  } else if (type==NORM_INFINITY) {
    PetscCall(BVAllocateWork_Private(bv,m_));
    rwork = (PetscReal*)bv->work;
    lnrm = LAPACKlange_("I",&m,&n,(PetscScalar*)A,&lda,rwork);
    if (mpi) PetscCall(MPIU_Allreduce(&lnrm,nrm,1,MPIU_REAL,MPIU_MAX,PetscObjectComm((PetscObject)bv)));
    else *nrm = lnrm;
    PetscCall(PetscLogFlops(1.0*m*n));
  }
  PetscCall(PetscFPTrapPop());
  PetscFunctionReturn(PETSC_SUCCESS);
}
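
/*
   Minimal caller sketch (hypothetical; mloc stands for the local number of
   rows and is assumed to equal the leading dimension). With mpi=PETSC_TRUE
   the Frobenius norm is assembled from the per-process values as
   sqrt(lnrm_0^2+...+lnrm_{p-1}^2) through the MPIU_LAPY2 reduction, whereas
   the 1-norm first sums the local column sums with MPIU_SUM and then takes
   the maximum over columns:

     PetscReal         nrm;
     const PetscScalar *array;
     PetscCall(BVGetArrayRead(X,&array));
     PetscCall(BVNorm_LAPACK_Private(X,mloc,n,array,mloc,NORM_FROBENIUS,&nrm,PETSC_TRUE));
     PetscCall(BVRestoreArrayRead(X,&array));
*/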

/*
   Normalize the columns of an mxn matrix A
*/
PetscErrorCode BVNormalize_LAPACK_Private(BV bv,PetscInt m_,PetscInt n_,const PetscScalar *A,PetscInt lda_,PetscScalar *eigi,PetscBool mpi)
{
  PetscBLASInt m,lda,j,k,info,zero=0;
  PetscMPIInt  len;
  PetscReal    *norms,*rwork=NULL,*rwork2=NULL,done=1.0;

  PetscFunctionBegin;
  PetscCall(PetscFPTrapPush(PETSC_FP_TRAP_OFF));
  PetscCall(PetscBLASIntCast(m_,&m));
  PetscCall(PetscBLASIntCast(lda_,&lda));
  PetscCall(BVAllocateWork_Private(bv,2*n_));
  rwork = (PetscReal*)bv->work;
  rwork2 = rwork+n_;
  /* compute local norms */
  for (j=0;j<n_;j++) {
    k = 1;
#if !defined(PETSC_USE_COMPLEX)
    if (eigi && eigi[j] != 0.0) k = 2;
#endif
    rwork[j] = LAPACKlange_("F",&m,&k,(PetscScalar*)(A+j*lda_),&lda,rwork2);
    if (k==2) { rwork[j+1] = rwork[j]; j++; }
  }
  /* reduction to get global norms */
  if (mpi) {
    PetscCall(PetscMPIIntCast(n_,&len));
    PetscCall(PetscArrayzero(rwork2,n_));
    PetscCall(MPIU_Allreduce(rwork,rwork2,len,MPIU_REAL,MPIU_LAPY2,PetscObjectComm((PetscObject)bv)));
    norms = rwork2;
  } else norms = rwork;
  /* scale columns */
  for (j=0;j<n_;j++) {
    k = 1;
#if !defined(PETSC_USE_COMPLEX)
    if (eigi && eigi[j] != 0.0) k = 2;
#endif
    PetscCallBLAS("LAPACKlascl",LAPACKlascl_("G",&zero,&zero,norms+j,&done,&m,&k,(PetscScalar*)(A+j*lda_),&lda,&info));
    SlepcCheckLapackInfo("lascl",info);
    if (k==2) j++;
  }
  PetscCall(PetscLogFlops(3.0*m*n_));
  PetscCall(PetscFPTrapPop());
  PetscFunctionReturn(PETSC_SUCCESS);
}
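
/*
   Note on the k==2 branch (real scalars only): a complex conjugate eigenpair
   is stored as two consecutive columns [xr xi], flagged by eigi[j]!=0, and
   the pair is normalized jointly: the Frobenius norm of the m-by-2 block
   equals sqrt(||xr||^2+||xi||^2), and lascl divides both columns by that
   same value, so the pair keeps representing a single unit-norm complex
   vector.
*/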

/*
   Compute the upper Cholesky factor in R and its inverse in S.
   If S == R then the inverse overwrites the Cholesky factor.
*/
PetscErrorCode BVMatCholInv_LAPACK_Private(BV bv,Mat R,Mat S)
{
  PetscInt     i,k,l,n,m,ld,lds;
  PetscScalar  *pR,*pS;
  PetscBLASInt info,n_ = 0,m_ = 0,ld_,lds_;

  PetscFunctionBegin;
  l = bv->l;
  k = bv->k;
  PetscCall(MatGetSize(R,&m,NULL));
  n = k-l;
  PetscCall(PetscBLASIntCast(m,&m_));
  PetscCall(PetscBLASIntCast(n,&n_));
  ld  = m;
  ld_ = m_;
  PetscCall(MatDenseGetArray(R,&pR));

  if (S==R) {
    PetscCall(BVAllocateWork_Private(bv,m*k));
    pS = bv->work;
    lds  = ld;
    lds_ = ld_;
  } else {
    PetscCall(MatDenseGetArray(S,&pS));
    PetscCall(MatGetSize(S,&lds,NULL));
    PetscCall(PetscBLASIntCast(lds,&lds_));
  }

  /* save a copy of matrix in S */
  for (i=l;i<k;i++) PetscCall(PetscArraycpy(pS+i*lds+l,pR+i*ld+l,n));

  /* compute upper Cholesky factor in R */
  PetscCall(PetscFPTrapPush(PETSC_FP_TRAP_OFF));
  PetscCallBLAS("LAPACKpotrf",LAPACKpotrf_("U",&n_,pR+l*ld+l,&ld_,&info));
  PetscCall(PetscLogFlops((1.0*n*n*n)/3.0));

  if (info) {  /* LAPACKpotrf failed, retry on diagonally perturbed matrix */
    for (i=l;i<k;i++) {
      PetscCall(PetscArraycpy(pR+i*ld+l,pS+i*lds+l,n));
      pR[i+i*ld] += 50.0*PETSC_MACHINE_EPSILON;
    }
    PetscCallBLAS("LAPACKpotrf",LAPACKpotrf_("U",&n_,pR+l*ld+l,&ld_,&info));
    SlepcCheckLapackInfo("potrf",info);
    PetscCall(PetscLogFlops((1.0*n*n*n)/3.0));
  }

  /* compute S = inv(R) */
  if (S==R) {
    PetscCallBLAS("LAPACKtrtri",LAPACKtrtri_("U","N",&n_,pR+l*ld+l,&ld_,&info));
  } else {
    PetscCall(PetscArrayzero(pS+l*lds,(k-l)*k));
    for (i=l;i<k;i++) PetscCall(PetscArraycpy(pS+i*lds+l,pR+i*ld+l,n));
    PetscCallBLAS("LAPACKtrtri",LAPACKtrtri_("U","N",&n_,pS+l*lds+l,&lds_,&info));
  }
  SlepcCheckLapackInfo("trtri",info);
  PetscCall(PetscFPTrapPop());
  PetscCall(PetscLogFlops(0.33*n*n*n));

  /* Zero out entries below the diagonal */
  for (i=l;i<k-1;i++) {
    PetscCall(PetscArrayzero(pR+i*ld+i+1,(k-i-1)));
    if (S!=R) PetscCall(PetscArrayzero(pS+i*lds+i+1,(k-i-1)));
  }
  PetscCall(MatDenseRestoreArray(R,&pR));
  if (S!=R) PetscCall(MatDenseRestoreArray(S,&pS));
  PetscFunctionReturn(PETSC_SUCCESS);
}
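
/*
   Algebra behind this kernel (a sketch inferred from the computation, not a
   catalogue of its callers): this is the factorization step of Cholesky QR.
   If the active block of R holds the Gram matrix G = V'*V on entry, then
   after G = R'*R (potrf) and S = inv(R) (trtri), the basis Q = V*S satisfies

     Q'*Q = S'*(R'*R)*S = inv(R)'*R'*R*inv(R) = I,

   so post-multiplying V by S orthonormalizes its columns. The retry with a
   50*PETSC_MACHINE_EPSILON diagonal shift is a common safeguard for Gram
   matrices that are numerically semidefinite, where the first potrf reports
   a non-positive pivot.
*/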

/*
   Compute the inverse of an upper triangular matrix R, store it in S.
   If S == R then the inverse overwrites R.
*/
PetscErrorCode BVMatTriInv_LAPACK_Private(BV bv,Mat R,Mat S)
{
  PetscInt     i,k,l,n,m,ld,lds;
  PetscScalar  *pR,*pS;
  PetscBLASInt info,n_,m_ = 0,ld_,lds_;

  PetscFunctionBegin;
  l = bv->l;
  k = bv->k;
  PetscCall(MatGetSize(R,&m,NULL));
  n = k-l;
  PetscCall(PetscBLASIntCast(m,&m_));
  PetscCall(PetscBLASIntCast(n,&n_));
  ld  = m;
  ld_ = m_;
  PetscCall(MatDenseGetArray(R,&pR));

  if (S==R) {
    PetscCall(BVAllocateWork_Private(bv,m*k));
    pS = bv->work;
    lds  = ld;
    lds_ = ld_;
  } else {
    PetscCall(MatDenseGetArray(S,&pS));
    PetscCall(MatGetSize(S,&lds,NULL));
    PetscCall(PetscBLASIntCast(lds,&lds_));
  }

  /* compute S = inv(R) */
  PetscCall(PetscFPTrapPush(PETSC_FP_TRAP_OFF));
  if (S==R) {
    PetscCallBLAS("LAPACKtrtri",LAPACKtrtri_("U","N",&n_,pR+l*ld+l,&ld_,&info));
  } else {
    PetscCall(PetscArrayzero(pS+l*lds,(k-l)*k));
    for (i=l;i<k;i++) PetscCall(PetscArraycpy(pS+i*lds+l,pR+i*ld+l,n));
    PetscCallBLAS("LAPACKtrtri",LAPACKtrtri_("U","N",&n_,pS+l*lds+l,&lds_,&info));
  }
  SlepcCheckLapackInfo("trtri",info);
  PetscCall(PetscFPTrapPop());
  PetscCall(PetscLogFlops(0.33*n*n*n));

  PetscCall(MatDenseRestoreArray(R,&pR));
  if (S!=R) PetscCall(MatDenseRestoreArray(S,&pS));
  PetscFunctionReturn(PETSC_SUCCESS);
}
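
/*
   This variant performs only the trtri step of the previous kernel, which is
   presumably the right tool when R is already a triangular factor obtained
   from an actual QR factorization (such as the TSQR kernels below), so no
   Cholesky factorization is required before inverting.
*/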

/*
   Compute the matrix to be used for post-multiplying the basis in the SVQB
   block orthogonalization method.
   On input R = V'*V, on output S = D*U*Lambda^{-1/2} where (U,Lambda) is
   the eigendecomposition of D*R*D with D=diag(R)^{-1/2}.
   If S == R then the result overwrites R.
*/
PetscErrorCode BVMatSVQB_LAPACK_Private(BV bv,Mat R,Mat S)
{
  PetscInt     i,j,k,l,n,m,ld,lds;
  PetscScalar  *pR,*pS,*D,*work,a;
  PetscReal    *eig,dummy;
  PetscBLASInt info,lwork,n_,m_ = 0,ld_,lds_;
#if defined(PETSC_USE_COMPLEX)
  PetscReal    *rwork,rdummy;
#endif

  PetscFunctionBegin;
  l = bv->l;
  k = bv->k;
  PetscCall(MatGetSize(R,&m,NULL));
  PetscCall(MatDenseGetLDA(R,&ld));
  n = k-l;
  PetscCall(PetscBLASIntCast(m,&m_));
  PetscCall(PetscBLASIntCast(n,&n_));
  ld_ = m_;
  PetscCall(MatDenseGetArray(R,&pR));

  if (S==R) {
    pS = pR;
    lds  = ld;
    lds_ = ld_;
  } else {
    PetscCall(MatDenseGetArray(S,&pS));
    PetscCall(MatDenseGetLDA(S,&lds));
    PetscCall(PetscBLASIntCast(lds,&lds_));
  }
  PetscCall(PetscFPTrapPush(PETSC_FP_TRAP_OFF));

  /* workspace query and memory allocation */
  lwork = -1;
#if defined(PETSC_USE_COMPLEX)
  PetscCallBLAS("LAPACKsyev",LAPACKsyev_("V","L",&n_,pS,&lds_,&dummy,&a,&lwork,&rdummy,&info));
  PetscCall(PetscBLASIntCast((PetscInt)PetscRealPart(a),&lwork));
  PetscCall(PetscMalloc4(n,&eig,n,&D,lwork,&work,PetscMax(1,3*n-2),&rwork));
#else
  PetscCallBLAS("LAPACKsyev",LAPACKsyev_("V","L",&n_,pS,&lds_,&dummy,&a,&lwork,&info));
  PetscCall(PetscBLASIntCast((PetscInt)a,&lwork));
  PetscCall(PetscMalloc3(n,&eig,n,&D,lwork,&work));
#endif

  /* copy and scale matrix */
  for (i=l;i<k;i++) D[i-l] = 1.0/PetscSqrtReal(PetscRealPart(pR[i+i*ld]));
  for (i=l;i<k;i++) for (j=l;j<k;j++) pS[i+j*lds] = pR[i+j*ld]*D[i-l];
  for (j=l;j<k;j++) for (i=l;i<k;i++) pS[i+j*lds] *= D[j-l];

  /* compute eigendecomposition */
#if defined(PETSC_USE_COMPLEX)
  PetscCallBLAS("LAPACKsyev",LAPACKsyev_("V","L",&n_,pS+l*lds+l,&lds_,eig,work,&lwork,rwork,&info));
#else
  PetscCallBLAS("LAPACKsyev",LAPACKsyev_("V","L",&n_,pS+l*lds+l,&lds_,eig,work,&lwork,&info));
#endif
  SlepcCheckLapackInfo("syev",info);

  if (S!=R) {  /* R = U' */
    for (i=l;i<k;i++) for (j=l;j<k;j++) pR[i+j*ld] = pS[j+i*lds];
  }

  /* compute S = D*U*Lambda^{-1/2} */
  for (i=l;i<k;i++) for (j=l;j<k;j++) pS[i+j*lds] *= D[i-l];
  for (j=l;j<k;j++) for (i=l;i<k;i++) pS[i+j*lds] /= PetscSqrtReal(eig[j-l]);

  if (S!=R) {  /* compute R = inv(S) = Lambda^{1/2}*U'/D */
    for (i=l;i<k;i++) for (j=l;j<k;j++) pR[i+j*ld] *= PetscSqrtReal(eig[i-l]);
    for (j=l;j<k;j++) for (i=l;i<k;i++) pR[i+j*ld] /= D[j-l];
  }

#if defined(PETSC_USE_COMPLEX)
  PetscCall(PetscFree4(eig,D,work,rwork));
#else
  PetscCall(PetscFree3(eig,D,work));
#endif
  PetscCall(PetscLogFlops(9.0*n*n*n));
  PetscCall(PetscFPTrapPop());

  PetscCall(MatDenseRestoreArray(R,&pR));
  if (S!=R) PetscCall(MatDenseRestoreArray(S,&pS));
  PetscFunctionReturn(PETSC_SUCCESS);
}
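
/*
   Why the scaling works (worked equations following the header comment): let
   D = diag(R)^{-1/2} and let D*R*D = U*Lambda*U' be the eigendecomposition
   computed by syev. Then for S = D*U*Lambda^{-1/2},

     S'*R*S = Lambda^{-1/2}*U'*(D*R*D)*U*Lambda^{-1/2}
            = Lambda^{-1/2}*U'*(U*Lambda*U')*U*Lambda^{-1/2}
            = Lambda^{-1/2}*Lambda*Lambda^{-1/2} = I,

   so if R = V'*V then V*S has orthonormal columns. Equilibrating R to unit
   diagonal via D before the eigensolve is the distinctive ingredient of
   SVQB, improving robustness over plain Cholesky QR for ill-conditioned
   blocks.
*/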

/*
   QR factorization of an mxn matrix via parallel TSQR
*/
PetscErrorCode BVOrthogonalize_LAPACK_TSQR(BV bv,PetscInt m_,PetscInt n_,PetscScalar *Q,PetscInt ldq_,PetscScalar *R,PetscInt ldr)
{
  PetscInt     level,plevel,nlevels,powtwo,lda,worklen;
  PetscBLASInt m,n,ldq,i,j,k,l,s = 0,nb,sz,lwork,info;
  PetscScalar  *tau,*work,*A=NULL,*QQ=NULL,*Qhalf,*C=NULL,one=1.0,zero=0.0;
  PetscMPIInt  rank,size,count,stride;
  MPI_Datatype tmat;

  PetscFunctionBegin;
  PetscCall(PetscFPTrapPush(PETSC_FP_TRAP_OFF));
  PetscCall(PetscBLASIntCast(m_,&m));
  PetscCall(PetscBLASIntCast(n_,&n));
  PetscCall(PetscBLASIntCast(ldq_,&ldq));
  k = PetscMin(m,n);
  nb = 16;
  lda = 2*n;
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)bv),&size));
  PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)bv),&rank));
  nlevels = (PetscInt)PetscCeilReal(PetscLog2Real((PetscReal)size));
  powtwo  = PetscPowInt(2,(PetscInt)PetscFloorReal(PetscLog2Real((PetscReal)size)));
  worklen = n+n*nb;
  if (nlevels) worklen += n*lda+n*lda*nlevels+n*lda;
  PetscCall(BVAllocateWork_Private(bv,worklen));
  tau  = bv->work;
  work = bv->work+n;
  PetscCall(PetscBLASIntCast(n*nb,&lwork));
  if (nlevels) {
    A  = bv->work+n+n*nb;
    QQ = bv->work+n+n*nb+n*lda;
    C  = bv->work+n+n*nb+n*lda+n*lda*nlevels;
  }

  /* Compute QR */
  PetscCallBLAS("LAPACKgeqrf",LAPACKgeqrf_(&m,&n,Q,&ldq,tau,work,&lwork,&info));
  SlepcCheckLapackInfo("geqrf",info);

  /* Extract R */
  if (R || nlevels) {
    for (j=0;j<n;j++) {
      for (i=0;i<=PetscMin(j,m-1);i++) {
        if (nlevels) A[i+j*lda] = Q[i+j*ldq];
        else R[i+j*ldr] = Q[i+j*ldq];
      }
      for (i=PetscMin(j,m-1)+1;i<n;i++) {
        if (nlevels) A[i+j*lda] = 0.0;
        else R[i+j*ldr] = 0.0;
      }
    }
  }

  /* Compute orthogonal matrix in Q */
  PetscCallBLAS("LAPACKorgqr",LAPACKorgqr_(&m,&k,&k,Q,&ldq,tau,work,&lwork,&info));
  SlepcCheckLapackInfo("orgqr",info);

  if (nlevels) {

    PetscCall(PetscMPIIntCast(n,&count));
    PetscCall(PetscMPIIntCast(lda,&stride));
    PetscCall(PetscBLASIntCast(lda,&l));
    PetscCallMPI(MPI_Type_vector(count,count,stride,MPIU_SCALAR,&tmat));
    PetscCallMPI(MPI_Type_commit(&tmat));

    for (level=nlevels;level>=1;level--) {

      plevel = PetscPowInt(2,level);
      PetscCall(PetscBLASIntCast(plevel*PetscFloorReal(rank/(PetscReal)plevel)+(rank+PetscPowInt(2,level-1))%plevel,&s));

      /* Stack triangular matrices */
      if (rank<s && s<size) {  /* send top part, receive bottom part */
        PetscCallMPI(MPI_Sendrecv(A,1,tmat,s,111,A+n,1,tmat,s,111,PetscObjectComm((PetscObject)bv),MPI_STATUS_IGNORE));
      } else if (s<size) {  /* copy top to bottom, receive top part */
        PetscCallMPI(MPI_Sendrecv(A,1,tmat,rank,111,A+n,1,tmat,rank,111,PetscObjectComm((PetscObject)bv),MPI_STATUS_IGNORE));
        PetscCallMPI(MPI_Sendrecv(A+n,1,tmat,s,111,A,1,tmat,s,111,PetscObjectComm((PetscObject)bv),MPI_STATUS_IGNORE));
      }
      if (level<nlevels && size!=powtwo) {  /* for cases when size is not a power of 2 */
        if (rank<size-powtwo) {  /* send bottom part */
          PetscCallMPI(MPI_Send(A+n,1,tmat,rank+powtwo,111,PetscObjectComm((PetscObject)bv)));
        } else if (rank>=powtwo) {  /* receive bottom part */
          PetscCallMPI(MPI_Recv(A+n,1,tmat,rank-powtwo,111,PetscObjectComm((PetscObject)bv),MPI_STATUS_IGNORE));
        }
      }
      /* Compute QR and build orthogonal matrix */
      if (level<nlevels || (level==nlevels && s<size)) {
        PetscCallBLAS("LAPACKgeqrf",LAPACKgeqrf_(&l,&n,A,&l,tau,work,&lwork,&info));
        SlepcCheckLapackInfo("geqrf",info);
        PetscCall(PetscArraycpy(QQ+(level-1)*n*lda,A,n*lda));
        PetscCallBLAS("LAPACKorgqr",LAPACKorgqr_(&l,&n,&n,QQ+(level-1)*n*lda,&l,tau,work,&lwork,&info));
        SlepcCheckLapackInfo("orgqr",info);
        for (j=0;j<n;j++) {
          for (i=j+1;i<n;i++) A[i+j*lda] = 0.0;
        }
      } else if (level==nlevels) {  /* only one triangular matrix, set Q=I */
        PetscCall(PetscArrayzero(QQ+(level-1)*n*lda,n*lda));
        for (j=0;j<n;j++) QQ[j+j*lda+(level-1)*n*lda] = 1.0;
      }
    }

    /* Extract R */
    if (R) {
      for (j=0;j<n;j++) {
        for (i=0;i<=j;i++) R[i+j*ldr] = A[i+j*lda];
        for (i=j+1;i<n;i++) R[i+j*ldr] = 0.0;
      }
    }

    /* Accumulate orthogonal matrices */
    for (level=1;level<=nlevels;level++) {
      plevel = PetscPowInt(2,level);
      PetscCall(PetscBLASIntCast(plevel*PetscFloorReal(rank/(PetscReal)plevel)+(rank+PetscPowInt(2,level-1))%plevel,&s));
      Qhalf = (rank<s)? QQ+(level-1)*n*lda: QQ+(level-1)*n*lda+n;
      if (level<nlevels) {
        PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&l,&n,&n,&one,QQ+level*n*lda,&l,Qhalf,&l,&zero,C,&l));
        PetscCall(PetscArraycpy(QQ+level*n*lda,C,n*lda));
      } else {
        for (i=0;i<m/l;i++) {
          PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&l,&n,&n,&one,Q+i*l,&ldq,Qhalf,&l,&zero,C,&l));
          for (j=0;j<n;j++) PetscCall(PetscArraycpy(Q+i*l+j*ldq,C+j*l,l));
        }
        sz = m%l;
        if (sz) {
          PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&sz,&n,&n,&one,Q+(m/l)*l,&ldq,Qhalf,&l,&zero,C,&l));
          for (j=0;j<n;j++) PetscCall(PetscArraycpy(Q+(m/l)*l+j*ldq,C+j*l,sz));
        }
      }
    }

    PetscCallMPI(MPI_Type_free(&tmat));
  }

  PetscCall(PetscLogFlops(3.0*m*n*n));
  PetscCall(PetscFPTrapPop());
  PetscFunctionReturn(PETSC_SUCCESS);
}
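
/*
   Shape of the reduction tree (summary sketch of the code above): every
   process first factors its local m-by-n block, keeping the n-by-n triangle
   in A. Each of the nlevels = ceil(log2(size)) sweeps then pairs process
   rank with partner s, stacks the two triangles in the 2n-by-n array A
   (lda = 2n), and factors the stack, storing the small orthogonal factor of
   each level in QQ; the extra Send/Recv handles communicator sizes that are
   not powers of two (e.g. size=6 gives nlevels=3, powtwo=4). Finally the
   stored factors are multiplied back into the local rows of Q in blocks of
   l = 2n rows, so Q and R match a QR factorization of the stacked global
   matrix.
*/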

/*
   Reduction operation to compute [~,Rout]=qr([Rin1;Rin2]) in the TSQR algorithm;
   all matrices are upper triangular stored in packed format
*/
SLEPC_EXTERN void MPIAPI SlepcGivensPacked(void *in,void *inout,PetscMPIInt *len,MPI_Datatype *datatype)
{
  PetscBLASInt n,i,j,k,one=1;
  PetscMPIInt  tsize;
  PetscScalar  v,s,*R2=(PetscScalar*)in,*R1=(PetscScalar*)inout;
  PetscReal    c;

  PetscFunctionBegin;
  PetscCallMPIAbort(PETSC_COMM_SELF,MPI_Type_size(*datatype,&tsize));  /* we assume len=1 */
  tsize /= sizeof(PetscScalar);
  n = (-1+(PetscBLASInt)PetscSqrtReal(1+8*tsize))/2;
  for (j=0;j<n;j++) {
    for (i=0;i<=j;i++) {
      LAPACKlartg_(R1+(2*n-j-1)*j/2+j,R2+(2*n-i-1)*i/2+j,&c,&s,&v);
      R1[(2*n-j-1)*j/2+j] = v;
      k = n-j-1;
      if (k) BLASrot_(&k,R1+(2*n-j-1)*j/2+j+1,&one,R2+(2*n-i-1)*i/2+j+1,&one,&c,&s);
    }
  }
  PetscFunctionReturnVoid();
}
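
/*
   Packed-format indexing used above (sketch; the helper name is
   hypothetical): each upper triangular n-by-n matrix is stored row by row
   with the zero lower part omitted, so a triangle occupies n*(n+1)/2
   scalars. That is how n is recovered from the datatype size:
   tsize = n*(n+1)/2 implies n = (-1+sqrt(1+8*tsize))/2. Entry (i,j) with
   0 <= i <= j < n lives at

     static inline PetscBLASInt PackedUpperIdx(PetscBLASInt n,PetscBLASInt i,PetscBLASInt j)
     {
       return (2*n-i-1)*i/2 + j;
     }

   For n=3 the offsets are (0,0)->0, (0,1)->1, (0,2)->2, (1,1)->3, (1,2)->4,
   (2,2)->5. In the loop above, lartg generates the Givens rotation that
   annihilates the entry of R2 at (i,j) against the diagonal entry R1(j,j),
   and rot applies it to the trailing parts of the two affected rows.
*/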

/*
   Computes the R factor of the QR factorization of an mxn matrix via parallel TSQR
*/
PetscErrorCode BVOrthogonalize_LAPACK_TSQR_OnlyR(BV bv,PetscInt m_,PetscInt n_,PetscScalar *Q,PetscInt ldq_,PetscScalar *R,PetscInt ldr)
{
  PetscInt     worklen;
  PetscBLASInt m,n,ldq,i,j,s,nb,lwork,info;
  PetscScalar  *tau,*work,*A=NULL,*R1=NULL,*R2=NULL;
  PetscMPIInt  size,count;
  MPI_Datatype tmat;

  PetscFunctionBegin;
  PetscCall(PetscFPTrapPush(PETSC_FP_TRAP_OFF));
  PetscCall(PetscBLASIntCast(m_,&m));
  PetscCall(PetscBLASIntCast(n_,&n));
  PetscCall(PetscBLASIntCast(ldq_,&ldq));
  nb = 16;
  s = n+n*(n-1)/2;  /* length of packed triangular matrix */
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)bv),&size));
  worklen = n+n*nb+2*s+ldq*n;
  PetscCall(BVAllocateWork_Private(bv,worklen));
  tau  = bv->work;
  work = bv->work+n;
  R1   = bv->work+n+n*nb;
  R2   = bv->work+n+n*nb+s;
  A    = bv->work+n+n*nb+2*s;
  PetscCall(PetscBLASIntCast(n*nb,&lwork));
  PetscCall(PetscArraycpy(A,Q,ldq*n));

  /* Compute QR */
  PetscCallBLAS("LAPACKgeqrf",LAPACKgeqrf_(&m,&n,A,&ldq,tau,work,&lwork,&info));
  SlepcCheckLapackInfo("geqrf",info);

  if (size==1) {
    /* Extract R */
    for (j=0;j<n;j++) {
      for (i=0;i<=PetscMin(j,m-1);i++) R[i+j*ldr] = A[i+j*ldq];
      for (i=PetscMin(j,m-1)+1;i<n;i++) R[i+j*ldr] = 0.0;
    }
  } else {
    /* Use MPI reduction operation to obtain global R */
    PetscCall(PetscMPIIntCast(s,&count));
    PetscCallMPI(MPI_Type_contiguous(count,MPIU_SCALAR,&tmat));
    PetscCallMPI(MPI_Type_commit(&tmat));
    for (i=0;i<n;i++) {
      for (j=i;j<n;j++) R1[(2*n-i-1)*i/2+j] = (i<m)?A[i+j*ldq]:0.0;
    }
    PetscCall(MPIU_Allreduce(R1,R2,1,tmat,MPIU_TSQR,PetscObjectComm((PetscObject)bv)));
    for (i=0;i<n;i++) {
      for (j=0;j<i;j++) R[i+j*ldr] = 0.0;
      for (j=i;j<n;j++) R[i+j*ldr] = R2[(2*n-i-1)*i/2+j];
    }
    PetscCallMPI(MPI_Type_free(&tmat));
  }

  PetscCall(PetscLogFlops(3.0*m*n*n));
  PetscCall(PetscFPTrapPop());
  PetscFunctionReturn(PETSC_SUCCESS);
}
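
/*
   Note on the reduction (sketch): MPIU_TSQR is the MPI_Op associated with
   SlepcGivensPacked above, so the allreduce combines the packed local
   triangles pairwise as Rout = qr([Rin1;Rin2]) and every process ends up
   with the global R without the orthogonal factor ever being formed. This
   is the cheaper entry point when a caller needs only the triangular
   factor, for instance to invert it afterwards with a kernel like
   BVMatTriInv_LAPACK_Private.
*/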