LCOV - code coverage report
Current view: top level - sys/classes/bv/interface - bvlapack.c (source / functions) Hit Total Coverage
Test: SLEPc Lines: 326 353 92.4 %
Date: 2024-04-19 00:31:36 Functions: 9 9 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
       3             :    SLEPc - Scalable Library for Eigenvalue Problem Computations
       4             :    Copyright (c) 2002-, Universitat Politecnica de Valencia, Spain
       5             : 
       6             :    This file is part of SLEPc.
       7             :    SLEPc is distributed under a 2-clause BSD license (see LICENSE).
       8             :    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
       9             : */
      10             : /*
      11             :    BV private kernels that use the LAPACK
      12             : */
      13             : 
      14             : #include <slepc/private/bvimpl.h>
      15             : #include <slepcblaslapack.h>
      16             : 
      17             : /*
      18             :     Reduction operation to compute sqrt(x**2+y**2) when normalizing vectors
      19             : */
      20        2911 : SLEPC_EXTERN void MPIAPI SlepcPythag(void *in,void *inout,PetscMPIInt *len,MPI_Datatype *datatype)
      21             : {
      22        2911 :   PetscBLASInt i,n=*len;
      23        2911 :   PetscReal    *x = (PetscReal*)in,*y = (PetscReal*)inout;
      24             : 
      25        2911 :   PetscFunctionBegin;
      26        2911 :   if (PetscUnlikely(*datatype!=MPIU_REAL)) {
      27           0 :     (void)(*PetscErrorPrintf)("Only implemented for MPIU_REAL data type");
      28           0 :     MPI_Abort(PETSC_COMM_WORLD,1);
      29             :   }
      30        6080 :   for (i=0;i<n;i++) y[i] = SlepcAbs(x[i],y[i]);
      31        2911 :   PetscFunctionReturnVoid();
      32             : }
      33             : 
      34             : /*
      35             :     Compute ||A|| for an mxn matrix
      36             : */
      37       11156 : PetscErrorCode BVNorm_LAPACK_Private(BV bv,PetscInt m_,PetscInt n_,const PetscScalar *A,PetscInt lda_,NormType type,PetscReal *nrm,PetscBool mpi)
      38             : {
      39       11156 :   PetscBLASInt   m,n,lda,i,j;
      40       11156 :   PetscMPIInt    len;
      41       11156 :   PetscReal      lnrm,*rwork=NULL,*rwork2=NULL;
      42             : 
      43       11156 :   PetscFunctionBegin;
      44       11156 :   PetscCall(PetscFPTrapPush(PETSC_FP_TRAP_OFF));
      45       11156 :   PetscCall(PetscBLASIntCast(m_,&m));
      46       11156 :   PetscCall(PetscBLASIntCast(n_,&n));
      47       11156 :   PetscCall(PetscBLASIntCast(lda_,&lda));
      48       11156 :   if (type==NORM_FROBENIUS || type==NORM_2) {
      49       11148 :     lnrm = LAPACKlange_("F",&m,&n,(PetscScalar*)A,&lda,rwork);
      50       17583 :     if (mpi) PetscCall(MPIU_Allreduce(&lnrm,nrm,1,MPIU_REAL,MPIU_LAPY2,PetscObjectComm((PetscObject)bv)));
      51        9003 :     else *nrm = lnrm;
      52       11148 :     PetscCall(PetscLogFlops(2.0*m*n));
      53           8 :   } else if (type==NORM_1) {
      54           8 :     if (mpi) {
      55           0 :       PetscCall(BVAllocateWork_Private(bv,2*n_));
      56           0 :       rwork = (PetscReal*)bv->work;
      57           0 :       rwork2 = rwork+n_;
      58           0 :       PetscCall(PetscArrayzero(rwork,n_));
      59           0 :       PetscCall(PetscArrayzero(rwork2,n_));
      60           0 :       for (j=0;j<n_;j++) {
      61           0 :         for (i=0;i<m_;i++) {
      62           0 :           rwork[j] += PetscAbsScalar(A[i+j*lda_]);
      63             :         }
      64             :       }
      65           0 :       PetscCall(PetscMPIIntCast(n_,&len));
      66           0 :       PetscCall(MPIU_Allreduce(rwork,rwork2,len,MPIU_REAL,MPIU_SUM,PetscObjectComm((PetscObject)bv)));
      67           0 :       *nrm = 0.0;
      68           0 :       for (j=0;j<n_;j++) if (rwork2[j] > *nrm) *nrm = rwork2[j];
      69             :     } else {
      70           8 :       *nrm = LAPACKlange_("O",&m,&n,(PetscScalar*)A,&lda,rwork);
      71             :     }
      72           8 :     PetscCall(PetscLogFlops(1.0*m*n));
      73           0 :   } else if (type==NORM_INFINITY) {
      74           0 :     PetscCall(BVAllocateWork_Private(bv,m_));
      75           0 :     rwork = (PetscReal*)bv->work;
      76           0 :     lnrm = LAPACKlange_("I",&m,&n,(PetscScalar*)A,&lda,rwork);
      77           0 :     if (mpi) PetscCall(MPIU_Allreduce(&lnrm,nrm,1,MPIU_REAL,MPIU_MAX,PetscObjectComm((PetscObject)bv)));
      78           0 :     else *nrm = lnrm;
      79           0 :     PetscCall(PetscLogFlops(1.0*m*n));
      80             :   }
      81       11156 :   PetscCall(PetscFPTrapPop());
      82       11156 :   PetscFunctionReturn(PETSC_SUCCESS);
      83             : }
      84             : 
      85             : /*
      86             :     Normalize the columns of an mxn matrix A
      87             : */
      88        3874 : PetscErrorCode BVNormalize_LAPACK_Private(BV bv,PetscInt m_,PetscInt n_,const PetscScalar *A,PetscInt lda_,PetscScalar *eigi,PetscBool mpi)
      89             : {
      90        3874 :   PetscBLASInt   m,lda,j,k,info,zero=0;
      91        3874 :   PetscMPIInt    len;
      92        3874 :   PetscReal      *norms,*rwork=NULL,*rwork2=NULL,done=1.0;
      93             : 
      94        3874 :   PetscFunctionBegin;
      95        3874 :   PetscCall(PetscFPTrapPush(PETSC_FP_TRAP_OFF));
      96        3874 :   PetscCall(PetscBLASIntCast(m_,&m));
      97        3874 :   PetscCall(PetscBLASIntCast(lda_,&lda));
      98        3874 :   PetscCall(BVAllocateWork_Private(bv,2*n_));
      99        3874 :   rwork = (PetscReal*)bv->work;
     100        3874 :   rwork2 = rwork+n_;
     101             :   /* compute local norms */
     102       43109 :   for (j=0;j<n_;j++) {
     103       39235 :     k = 1;
     104             : #if !defined(PETSC_USE_COMPLEX)
     105             :     if (eigi && eigi[j] != 0.0) k = 2;
     106             : #endif
     107       39235 :     rwork[j] = LAPACKlange_("F",&m,&k,(PetscScalar*)(A+j*lda_),&lda,rwork2);
     108       39235 :     if (k==2) { rwork[j+1] = rwork[j]; j++; }
     109             :   }
     110             :   /* reduction to get global norms */
     111        3874 :   if (mpi) {
     112          78 :     PetscCall(PetscMPIIntCast(n_,&len));
     113          78 :     PetscCall(PetscArrayzero(rwork2,n_));
     114         312 :     PetscCall(MPIU_Allreduce(rwork,rwork2,len,MPIU_REAL,MPIU_LAPY2,PetscObjectComm((PetscObject)bv)));
     115          78 :     norms = rwork2;
     116             :   } else norms = rwork;
     117             :   /* scale columns */
     118       43109 :   for (j=0;j<n_;j++) {
     119       39235 :     k = 1;
     120             : #if !defined(PETSC_USE_COMPLEX)
     121             :     if (eigi && eigi[j] != 0.0) k = 2;
     122             : #endif
     123       39235 :     PetscCallBLAS("LAPACKlascl",LAPACKlascl_("G",&zero,&zero,norms+j,&done,&m,&k,(PetscScalar*)(A+j*lda_),&lda,&info));
     124       39235 :     SlepcCheckLapackInfo("lascl",info);
     125       39235 :     if (k==2) j++;
     126             :   }
     127        3874 :   PetscCall(PetscLogFlops(3.0*m*n_));
     128        3874 :   PetscCall(PetscFPTrapPop());
     129        3874 :   PetscFunctionReturn(PETSC_SUCCESS);
     130             : }
     131             : 
     132             : /*
     133             :    Compute the upper Cholesky factor in R and its inverse in S.
     134             :    If S == R then the inverse overwrites the Cholesky factor.
     135             :  */
     136         105 : PetscErrorCode BVMatCholInv_LAPACK_Private(BV bv,Mat R,Mat S)
     137             : {
     138         105 :   PetscInt       i,k,l,n,m,ld,lds;
     139         105 :   PetscScalar    *pR,*pS;
     140         105 :   PetscBLASInt   info,n_ = 0,m_ = 0,ld_,lds_;
     141             : 
     142         105 :   PetscFunctionBegin;
     143         105 :   l = bv->l;
     144         105 :   k = bv->k;
     145         105 :   PetscCall(MatGetSize(R,&m,NULL));
     146         105 :   n = k-l;
     147         105 :   PetscCall(PetscBLASIntCast(m,&m_));
     148         105 :   PetscCall(PetscBLASIntCast(n,&n_));
     149         105 :   ld  = m;
     150         105 :   ld_ = m_;
     151         105 :   PetscCall(MatDenseGetArray(R,&pR));
     152             : 
     153         105 :   if (S==R) {
     154          73 :     PetscCall(BVAllocateWork_Private(bv,m*k));
     155          73 :     pS = bv->work;
     156          73 :     lds = ld;
     157          73 :     lds_ = ld_;
     158             :   } else {
     159          32 :     PetscCall(MatDenseGetArray(S,&pS));
     160          32 :     PetscCall(MatGetSize(S,&lds,NULL));
     161          32 :     PetscCall(PetscBLASIntCast(lds,&lds_));
     162             :   }
     163             : 
     164             :   /* save a copy of matrix in S */
     165        1017 :   for (i=l;i<k;i++) PetscCall(PetscArraycpy(pS+i*lds+l,pR+i*ld+l,n));
     166             : 
     167             :   /* compute upper Cholesky factor in R */
     168         105 :   PetscCall(PetscFPTrapPush(PETSC_FP_TRAP_OFF));
     169         105 :   PetscCallBLAS("LAPACKpotrf",LAPACKpotrf_("U",&n_,pR+l*ld+l,&ld_,&info));
     170         105 :   PetscCall(PetscLogFlops((1.0*n*n*n)/3.0));
     171             : 
     172         105 :   if (info) {  /* LAPACKpotrf failed, retry on diagonally perturbed matrix */
     173           0 :     for (i=l;i<k;i++) {
     174           0 :       PetscCall(PetscArraycpy(pR+i*ld+l,pS+i*lds+l,n));
     175           0 :       pR[i+i*ld] += 50.0*PETSC_MACHINE_EPSILON;
     176             :     }
     177           0 :     PetscCallBLAS("LAPACKpotrf",LAPACKpotrf_("U",&n_,pR+l*ld+l,&ld_,&info));
     178           0 :     SlepcCheckLapackInfo("potrf",info);
     179           0 :     PetscCall(PetscLogFlops((1.0*n*n*n)/3.0));
     180             :   }
     181             : 
     182             :   /* compute S = inv(R) */
     183         105 :   if (S==R) {
     184          73 :     PetscCallBLAS("LAPACKtrtri",LAPACKtrtri_("U","N",&n_,pR+l*ld+l,&ld_,&info));
     185             :   } else {
     186          32 :     PetscCall(PetscArrayzero(pS+l*lds,(k-l)*k));
     187         160 :     for (i=l;i<k;i++) PetscCall(PetscArraycpy(pS+i*lds+l,pR+i*ld+l,n));
     188          32 :     PetscCallBLAS("LAPACKtrtri",LAPACKtrtri_("U","N",&n_,pS+l*lds+l,&lds_,&info));
     189             :   }
     190         105 :   SlepcCheckLapackInfo("trtri",info);
     191         105 :   PetscCall(PetscFPTrapPop());
     192         105 :   PetscCall(PetscLogFlops(0.33*n*n*n));
     193             : 
     194             :   /* Zero out entries below the diagonal */
     195         912 :   for (i=l;i<k-1;i++) {
     196         807 :     PetscCall(PetscArrayzero(pR+i*ld+i+1,(k-i-1)));
     197         807 :     if (S!=R) PetscCall(PetscArrayzero(pS+i*lds+i+1,(k-i-1)));
     198             :   }
     199         105 :   PetscCall(MatDenseRestoreArray(R,&pR));
     200         105 :   if (S!=R) PetscCall(MatDenseRestoreArray(S,&pS));
     201         105 :   PetscFunctionReturn(PETSC_SUCCESS);
     202             : }
     203             : 
     204             : /*
     205             :    Compute the inverse of an upper triangular matrix R, store it in S.
     206             :    If S == R then the inverse overwrites R.
     207             :  */
     208          73 : PetscErrorCode BVMatTriInv_LAPACK_Private(BV bv,Mat R,Mat S)
     209             : {
     210          73 :   PetscInt       i,k,l,n,m,ld,lds;
     211          73 :   PetscScalar    *pR,*pS;
     212          73 :   PetscBLASInt   info,n_,m_ = 0,ld_,lds_;
     213             : 
     214          73 :   PetscFunctionBegin;
     215          73 :   l = bv->l;
     216          73 :   k = bv->k;
     217          73 :   PetscCall(MatGetSize(R,&m,NULL));
     218          73 :   n = k-l;
     219          73 :   PetscCall(PetscBLASIntCast(m,&m_));
     220          73 :   PetscCall(PetscBLASIntCast(n,&n_));
     221          73 :   ld  = m;
     222          73 :   ld_ = m_;
     223          73 :   PetscCall(MatDenseGetArray(R,&pR));
     224             : 
     225          73 :   if (S==R) {
     226          57 :     PetscCall(BVAllocateWork_Private(bv,m*k));
     227          57 :     pS = bv->work;
     228          57 :     lds = ld;
     229          57 :     lds_ = ld_;
     230             :   } else {
     231          16 :     PetscCall(MatDenseGetArray(S,&pS));
     232          16 :     PetscCall(MatGetSize(S,&lds,NULL));
     233          16 :     PetscCall(PetscBLASIntCast(lds,&lds_));
     234             :   }
     235             : 
     236             :   /* compute S = inv(R) */
     237          73 :   PetscCall(PetscFPTrapPush(PETSC_FP_TRAP_OFF));
     238          73 :   if (S==R) {
     239          57 :     PetscCallBLAS("LAPACKtrtri",LAPACKtrtri_("U","N",&n_,pR+l*ld+l,&ld_,&info));
     240             :   } else {
     241          16 :     PetscCall(PetscArrayzero(pS+l*lds,(k-l)*k));
     242          80 :     for (i=l;i<k;i++) PetscCall(PetscArraycpy(pS+i*lds+l,pR+i*ld+l,n));
     243          16 :     PetscCallBLAS("LAPACKtrtri",LAPACKtrtri_("U","N",&n_,pS+l*lds+l,&lds_,&info));
     244             :   }
     245          73 :   SlepcCheckLapackInfo("trtri",info);
     246          73 :   PetscCall(PetscFPTrapPop());
     247          73 :   PetscCall(PetscLogFlops(0.33*n*n*n));
     248             : 
     249          73 :   PetscCall(MatDenseRestoreArray(R,&pR));
     250          73 :   if (S!=R) PetscCall(MatDenseRestoreArray(S,&pS));
     251          73 :   PetscFunctionReturn(PETSC_SUCCESS);
     252             : }
     253             : 
     254             : /*
     255             :    Compute the matrix to be used for post-multiplying the basis in the SVQB
     256             :    block orthogonalization method.
     257             :    On input R = V'*V, on output S = D*U*Lambda^{-1/2} where (U,Lambda) is
     258             :    the eigendecomposition of D*R*D with D=diag(R)^{-1/2}.
     259             :    If S == R then the result overwrites R.
     260             :  */
     261         105 : PetscErrorCode BVMatSVQB_LAPACK_Private(BV bv,Mat R,Mat S)
     262             : {
     263         105 :   PetscInt       i,j,k,l,n,m,ld,lds;
     264         105 :   PetscScalar    *pR,*pS,*D,*work,a;
     265         105 :   PetscReal      *eig,dummy;
     266         105 :   PetscBLASInt   info,lwork,n_,m_ = 0,ld_,lds_;
     267             : #if defined(PETSC_USE_COMPLEX)
     268         105 :   PetscReal      *rwork,rdummy;
     269             : #endif
     270             : 
     271         105 :   PetscFunctionBegin;
     272         105 :   l = bv->l;
     273         105 :   k = bv->k;
     274         105 :   PetscCall(MatGetSize(R,&m,NULL));
     275         105 :   PetscCall(MatDenseGetLDA(R,&ld));
     276         105 :   n = k-l;
     277         105 :   PetscCall(PetscBLASIntCast(m,&m_));
     278         105 :   PetscCall(PetscBLASIntCast(n,&n_));
     279         105 :   ld_ = m_;
     280         105 :   PetscCall(MatDenseGetArray(R,&pR));
     281             : 
     282         105 :   if (S==R) {
     283          73 :     pS = pR;
     284          73 :     lds = ld;
     285          73 :     lds_ = ld_;
     286             :   } else {
     287          32 :     PetscCall(MatDenseGetArray(S,&pS));
     288          32 :     PetscCall(MatDenseGetLDA(S,&lds));
     289          32 :     PetscCall(PetscBLASIntCast(lds,&lds_));
     290             :   }
     291         105 :   PetscCall(PetscFPTrapPush(PETSC_FP_TRAP_OFF));
     292             : 
     293             :   /* workspace query and memory allocation */
     294         105 :   lwork = -1;
     295             : #if defined(PETSC_USE_COMPLEX)
     296         105 :   PetscCallBLAS("LAPACKsyev",LAPACKsyev_("V","L",&n_,pS,&lds_,&dummy,&a,&lwork,&rdummy,&info));
     297         105 :   PetscCall(PetscBLASIntCast((PetscInt)PetscRealPart(a),&lwork));
     298         105 :   PetscCall(PetscMalloc4(n,&eig,n,&D,lwork,&work,PetscMax(1,3*n-2),&rwork));
     299             : #else
     300             :   PetscCallBLAS("LAPACKsyev",LAPACKsyev_("V","L",&n_,pS,&lds_,&dummy,&a,&lwork,&info));
     301             :   PetscCall(PetscBLASIntCast((PetscInt)a,&lwork));
     302             :   PetscCall(PetscMalloc3(n,&eig,n,&D,lwork,&work));
     303             : #endif
     304             : 
     305             :   /* copy and scale matrix */
     306        1017 :   for (i=l;i<k;i++) D[i-l] = 1.0/PetscSqrtReal(PetscRealPart(pR[i+i*ld]));
     307       12793 :   for (i=l;i<k;i++) for (j=l;j<k;j++) pS[i+j*lds] = pR[i+j*ld]*D[i-l];
     308       12793 :   for (j=l;j<k;j++) for (i=l;i<k;i++) pS[i+j*lds] *= D[j-l];
     309             : 
     310             :   /* compute eigendecomposition */
     311             : #if defined(PETSC_USE_COMPLEX)
     312         105 :   PetscCallBLAS("LAPACKsyev",LAPACKsyev_("V","L",&n_,pS+l*lds+l,&lds_,eig,work,&lwork,rwork,&info));
     313             : #else
     314             :   PetscCallBLAS("LAPACKsyev",LAPACKsyev_("V","L",&n_,pS+l*lds+l,&lds_,eig,work,&lwork,&info));
     315             : #endif
     316         105 :   SlepcCheckLapackInfo("syev",info);
     317             : 
     318         105 :   if (S!=R) {   /* R = U' */
     319         800 :     for (i=l;i<k;i++) for (j=l;j<k;j++) pR[i+j*ld] = pS[j+i*lds];
     320             :   }
     321             : 
     322             :   /* compute S = D*U*Lambda^{-1/2} */
     323       12793 :   for (i=l;i<k;i++) for (j=l;j<k;j++) pS[i+j*lds] *= D[i-l];
     324       12793 :   for (j=l;j<k;j++) for (i=l;i<k;i++) pS[i+j*lds] /= PetscSqrtReal(eig[j-l]);
     325             : 
     326         105 :   if (S!=R) {   /* compute R = inv(S) = Lambda^{1/2}*U'/D */
     327         800 :     for (i=l;i<k;i++) for (j=l;j<k;j++) pR[i+j*ld] *= PetscSqrtReal(eig[i-l]);
     328         800 :     for (j=l;j<k;j++) for (i=l;i<k;i++) pR[i+j*ld] /= D[j-l];
     329             :   }
     330             : 
     331             : #if defined(PETSC_USE_COMPLEX)
     332         105 :   PetscCall(PetscFree4(eig,D,work,rwork));
     333             : #else
     334             :   PetscCall(PetscFree3(eig,D,work));
     335             : #endif
     336         105 :   PetscCall(PetscLogFlops(9.0*n*n*n));
     337         105 :   PetscCall(PetscFPTrapPop());
     338             : 
     339         105 :   PetscCall(MatDenseRestoreArray(R,&pR));
     340         105 :   if (S!=R) PetscCall(MatDenseRestoreArray(S,&pS));
     341         105 :   PetscFunctionReturn(PETSC_SUCCESS);
     342             : }
     343             : 
     344             : /*
     345             :     QR factorization of an mxn matrix via parallel TSQR
     346             : */
     347         195 : PetscErrorCode BVOrthogonalize_LAPACK_TSQR(BV bv,PetscInt m_,PetscInt n_,PetscScalar *Q,PetscInt ldq_,PetscScalar *R,PetscInt ldr)
     348             : {
     349         195 :   PetscInt       level,plevel,nlevels,powtwo,lda,worklen;
     350         195 :   PetscBLASInt   m,n,ldq,i,j,k,l,s = 0,nb,sz,lwork,info;
     351         195 :   PetscScalar    *tau,*work,*A=NULL,*QQ=NULL,*Qhalf,*C=NULL,one=1.0,zero=0.0;
     352         195 :   PetscMPIInt    rank,size,count,stride;
     353         195 :   MPI_Datatype   tmat;
     354             : 
     355         195 :   PetscFunctionBegin;
     356         195 :   PetscCall(PetscFPTrapPush(PETSC_FP_TRAP_OFF));
     357         195 :   PetscCall(PetscBLASIntCast(m_,&m));
     358         195 :   PetscCall(PetscBLASIntCast(n_,&n));
     359         195 :   PetscCall(PetscBLASIntCast(ldq_,&ldq));
     360         195 :   k  = PetscMin(m,n);
     361         195 :   nb = 16;
     362         195 :   lda = 2*n;
     363         195 :   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)bv),&size));
     364         195 :   PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)bv),&rank));
     365         195 :   nlevels = (PetscInt)PetscCeilReal(PetscLog2Real((PetscReal)size));
     366         195 :   powtwo  = PetscPowInt(2,(PetscInt)PetscFloorReal(PetscLog2Real((PetscReal)size)));
     367         195 :   worklen = n+n*nb;
     368         195 :   if (nlevels) worklen += n*lda+n*lda*nlevels+n*lda;
     369         195 :   PetscCall(BVAllocateWork_Private(bv,worklen));
     370         195 :   tau  = bv->work;
     371         195 :   work = bv->work+n;
     372         195 :   PetscCall(PetscBLASIntCast(n*nb,&lwork));
     373         195 :   if (nlevels) {
     374         124 :     A  = bv->work+n+n*nb;
     375         124 :     QQ = bv->work+n+n*nb+n*lda;
     376         124 :     C  = bv->work+n+n*nb+n*lda+n*lda*nlevels;
     377             :   }
     378             : 
     379             :   /* Compute QR */
     380         195 :   PetscCallBLAS("LAPACKgeqrf",LAPACKgeqrf_(&m,&n,Q,&ldq,tau,work,&lwork,&info));
     381         195 :   SlepcCheckLapackInfo("geqrf",info);
     382             : 
     383             :   /* Extract R */
     384         195 :   if (R || nlevels) {
     385        2185 :     for (j=0;j<n;j++) {
     386       19261 :       for (i=0;i<=PetscMin(j,m-1);i++) {
     387       17271 :         if (nlevels) A[i+j*lda] = Q[i+j*ldq];
     388       15391 :         else R[i+j*ldr] = Q[i+j*ldq];
     389             :       }
     390       17639 :       for (i=PetscMin(j,m-1)+1;i<n;i++) {
     391       15649 :         if (nlevels) A[i+j*lda] = 0.0;
     392       14005 :         else R[i+j*ldr] = 0.0;
     393             :       }
     394             :     }
     395             :   }
     396             : 
     397             :   /* Compute orthogonal matrix in Q */
     398         195 :   PetscCallBLAS("LAPACKorgqr",LAPACKorgqr_(&m,&k,&k,Q,&ldq,tau,work,&lwork,&info));
     399         195 :   SlepcCheckLapackInfo("orgqr",info);
     400             : 
     401         195 :   if (nlevels) {
     402             : 
     403         124 :     PetscCall(PetscMPIIntCast(n,&count));
     404         124 :     PetscCall(PetscMPIIntCast(lda,&stride));
     405         124 :     PetscCall(PetscBLASIntCast(lda,&l));
     406         124 :     PetscCallMPI(MPI_Type_vector(count,count,stride,MPIU_SCALAR,&tmat));
     407         124 :     PetscCallMPI(MPI_Type_commit(&tmat));
     408             : 
     409         468 :     for (level=nlevels;level>=1;level--) {
     410             : 
     411         344 :       plevel = PetscPowInt(2,level);
     412         688 :       PetscCall(PetscBLASIntCast(plevel*PetscFloorReal(rank/(PetscReal)plevel)+(rank+PetscPowInt(2,level-1))%plevel,&s));
     413             : 
     414             :       /* Stack triangular matrices */
     415         344 :       if (rank<s && s<size) {  /* send top part, receive bottom part */
     416         140 :         PetscCallMPI(MPI_Sendrecv(A,1,tmat,s,111,A+n,1,tmat,s,111,PetscObjectComm((PetscObject)bv),MPI_STATUS_IGNORE));
     417         204 :       } else if (s<size) {  /* copy top to bottom, receive top part */
     418         140 :         PetscCallMPI(MPI_Sendrecv(A,1,tmat,rank,111,A+n,1,tmat,rank,111,PetscObjectComm((PetscObject)bv),MPI_STATUS_IGNORE));
     419         140 :         PetscCallMPI(MPI_Sendrecv(A+n,1,tmat,s,111,A,1,tmat,s,111,PetscObjectComm((PetscObject)bv),MPI_STATUS_IGNORE));
     420             :       }
     421         344 :       if (level<nlevels && size!=powtwo) {  /* for cases when size is not a power of 2 */
     422         220 :         if (rank<size-powtwo) {  /* send bottom part */
     423          60 :           PetscCallMPI(MPI_Send(A+n,1,tmat,rank+powtwo,111,PetscObjectComm((PetscObject)bv)));
     424         160 :         } else if (rank>=powtwo) {  /* receive bottom part */
     425          60 :           PetscCallMPI(MPI_Recv(A+n,1,tmat,rank-powtwo,111,PetscObjectComm((PetscObject)bv),MPI_STATUS_IGNORE));
     426             :         }
     427             :       }
     428             :       /* Compute QR and build orthogonal matrix */
     429         344 :       if (level<nlevels || (level==nlevels && s<size)) {
     430         308 :         PetscCallBLAS("LAPACKgeqrf",LAPACKgeqrf_(&l,&n,A,&l,tau,work,&lwork,&info));
     431         308 :         SlepcCheckLapackInfo("geqrf",info);
     432         308 :         PetscCall(PetscArraycpy(QQ+(level-1)*n*lda,A,n*lda));
     433         308 :         PetscCallBLAS("LAPACKorgqr",LAPACKorgqr_(&l,&n,&n,QQ+(level-1)*n*lda,&l,tau,work,&lwork,&info));
     434         308 :         SlepcCheckLapackInfo("orgqr",info);
     435        1888 :         for (j=0;j<n;j++) {
     436        5552 :           for (i=j+1;i<n;i++) A[i+j*lda] = 0.0;
     437             :         }
     438          36 :       } else if (level==nlevels) {  /* only one triangular matrix, set Q=I */
     439          36 :         PetscCall(PetscArrayzero(QQ+(level-1)*n*lda,n*lda));
     440         264 :         for (j=0;j<n;j++) QQ[j+j*lda+(level-1)*n*lda] = 1.0;
     441             :       }
     442             :     }
     443             : 
     444             :     /* Extract R */
     445         124 :     if (R) {
     446         728 :       for (j=0;j<n;j++) {
     447        2668 :         for (i=0;i<=j;i++) R[i+j*ldr] = A[i+j*lda];
     448        2064 :         for (i=j+1;i<n;i++) R[i+j*ldr] = 0.0;
     449             :       }
     450             :     }
     451             : 
     452             :     /* Accumulate orthogonal matrices */
     453         468 :     for (level=1;level<=nlevels;level++) {
     454         344 :       plevel = PetscPowInt(2,level);
     455         688 :       PetscCall(PetscBLASIntCast(plevel*PetscFloorReal(rank/(PetscReal)plevel)+(rank+PetscPowInt(2,level-1))%plevel,&s));
     456         344 :       Qhalf = (rank<s)? QQ+(level-1)*n*lda: QQ+(level-1)*n*lda+n;
     457         344 :       if (level<nlevels) {
     458         220 :         PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&l,&n,&n,&one,QQ+level*n*lda,&l,Qhalf,&l,&zero,C,&l));
     459         220 :         PetscCall(PetscArraycpy(QQ+level*n*lda,C,n*lda));
     460             :       } else {
     461         192 :         for (i=0;i<m/l;i++) {
     462          68 :           PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&l,&n,&n,&one,Q+i*l,&ldq,Qhalf,&l,&zero,C,&l));
     463         384 :           for (j=0;j<n;j++) PetscCall(PetscArraycpy(Q+i*l+j*ldq,C+j*l,l));
     464             :         }
     465         124 :         sz = m%l;
     466         124 :         if (sz) {
     467         124 :           PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&sz,&n,&n,&one,Q+(m/l)*l,&ldq,Qhalf,&l,&zero,C,&l));
     468         728 :           for (j=0;j<n;j++) PetscCall(PetscArraycpy(Q+(m/l)*l+j*ldq,C+j*l,sz));
     469             :         }
     470             :       }
     471             :     }
     472             : 
     473         124 :     PetscCallMPI(MPI_Type_free(&tmat));
     474             :   }
     475             : 
     476         195 :   PetscCall(PetscLogFlops(3.0*m*n*n));
     477         195 :   PetscCall(PetscFPTrapPop());
     478         195 :   PetscFunctionReturn(PETSC_SUCCESS);
     479             : }
     480             : 
     481             : /*
     482             :     Reduction operation to compute [~,Rout]=qr([Rin1;Rin2]) in the TSQR algorithm;
     483             :     all matrices are upper triangular stored in packed format
     484             : */
     485          32 : SLEPC_EXTERN void MPIAPI SlepcGivensPacked(void *in,void *inout,PetscMPIInt *len,MPI_Datatype *datatype)
     486             : {
     487          32 :   PetscBLASInt   n,i,j,k,one=1;
     488          32 :   PetscMPIInt    tsize;
     489          32 :   PetscScalar    v,s,*R2=(PetscScalar*)in,*R1=(PetscScalar*)inout;
     490          32 :   PetscReal      c;
     491             : 
     492          32 :   PetscFunctionBegin;
     493          32 :   PetscCallMPIAbort(PETSC_COMM_SELF,MPI_Type_size(*datatype,&tsize));  /* we assume len=1 */
     494          32 :   tsize /= sizeof(PetscScalar);
     495          32 :   n = (-1+(PetscBLASInt)PetscSqrtReal(1+8*tsize))/2;
     496         160 :   for (j=0;j<n;j++) {
     497         512 :     for (i=0;i<=j;i++) {
     498         384 :       LAPACKlartg_(R1+(2*n-j-1)*j/2+j,R2+(2*n-i-1)*i/2+j,&c,&s,&v);
     499         384 :       R1[(2*n-j-1)*j/2+j] = v;
     500         384 :       k = n-j-1;
     501         384 :       if (k) BLASrot_(&k,R1+(2*n-j-1)*j/2+j+1,&one,R2+(2*n-i-1)*i/2+j+1,&one,&c,&s);
     502             :     }
     503             :   }
     504          32 :   PetscFunctionReturnVoid();
     505             : }
     506             : 
     507             : /*
     508             :     Computes the R factor of the QR factorization of an mxn matrix via parallel TSQR
     509             : */
     510          73 : PetscErrorCode BVOrthogonalize_LAPACK_TSQR_OnlyR(BV bv,PetscInt m_,PetscInt n_,PetscScalar *Q,PetscInt ldq_,PetscScalar *R,PetscInt ldr)
     511             : {
     512          73 :   PetscInt       worklen;
     513          73 :   PetscBLASInt   m,n,ldq,i,j,s,nb,lwork,info;
     514          73 :   PetscScalar    *tau,*work,*A=NULL,*R1=NULL,*R2=NULL;
     515          73 :   PetscMPIInt    size,count;
     516          73 :   MPI_Datatype   tmat;
     517             : 
     518          73 :   PetscFunctionBegin;
     519          73 :   PetscCall(PetscFPTrapPush(PETSC_FP_TRAP_OFF));
     520          73 :   PetscCall(PetscBLASIntCast(m_,&m));
     521          73 :   PetscCall(PetscBLASIntCast(n_,&n));
     522          73 :   PetscCall(PetscBLASIntCast(ldq_,&ldq));
     523          73 :   nb = 16;
     524          73 :   s  = n+n*(n-1)/2;  /* length of packed triangular matrix */
     525          73 :   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)bv),&size));
     526          73 :   worklen = n+n*nb+2*s+ldq*n;
     527          73 :   PetscCall(BVAllocateWork_Private(bv,worklen));
     528          73 :   tau  = bv->work;
     529          73 :   work = bv->work+n;
     530          73 :   R1   = bv->work+n+n*nb;
     531          73 :   R2   = bv->work+n+n*nb+s;
     532          73 :   A    = bv->work+n+n*nb+2*s;
     533          73 :   PetscCall(PetscBLASIntCast(n*nb,&lwork));
     534          73 :   PetscCall(PetscArraycpy(A,Q,ldq*n));
     535             : 
     536             :   /* Compute QR */
     537          73 :   PetscCallBLAS("LAPACKgeqrf",LAPACKgeqrf_(&m,&n,A,&ldq,tau,work,&lwork,&info));
     538          73 :   SlepcCheckLapackInfo("geqrf",info);
     539             : 
     540          73 :   if (size==1) {
     541             :     /* Extract R */
     542         697 :     for (j=0;j<n;j++) {
     543        6232 :       for (i=0;i<=PetscMin(j,m-1);i++) R[i+j*ldr] = A[i+j*ldq];
     544        5576 :       for (i=PetscMin(j,m-1)+1;i<n;i++) R[i+j*ldr] = 0.0;
     545             :     }
     546             :   } else {
     547             :     /* Use MPI reduction operation to obtain global R */
     548          32 :     PetscCall(PetscMPIIntCast(s,&count));
     549          32 :     PetscCallMPI(MPI_Type_contiguous(count,MPIU_SCALAR,&tmat));
     550          32 :     PetscCallMPI(MPI_Type_commit(&tmat));
     551         160 :     for (i=0;i<n;i++) {
     552         512 :       for (j=i;j<n;j++) R1[(2*n-i-1)*i/2+j] = (i<m)?A[i+j*ldq]:0.0;
     553             :     }
     554         128 :     PetscCall(MPIU_Allreduce(R1,R2,1,tmat,MPIU_TSQR,PetscObjectComm((PetscObject)bv)));
     555         160 :     for (i=0;i<n;i++) {
     556         384 :       for (j=0;j<i;j++) R[i+j*ldr] = 0.0;
     557         512 :       for (j=i;j<n;j++) R[i+j*ldr] = R2[(2*n-i-1)*i/2+j];
     558             :     }
     559          32 :     PetscCallMPI(MPI_Type_free(&tmat));
     560             :   }
     561             : 
     562          73 :   PetscCall(PetscLogFlops(3.0*m*n*n));
     563          73 :   PetscCall(PetscFPTrapPop());
     564          73 :   PetscFunctionReturn(PETSC_SUCCESS);
     565             : }

Generated by: LCOV version 1.14