| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | /* | ||
| 2 | - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | ||
| 3 | SLEPc - Scalable Library for Eigenvalue Problem Computations | ||
| 4 | Copyright (c) 2002-, Universitat Politecnica de Valencia, Spain | ||
| 5 | |||
| 6 | This file is part of SLEPc. | ||
| 7 | SLEPc is distributed under a 2-clause BSD license (see LICENSE). | ||
| 8 | - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | ||
| 9 | */ | ||
| 10 | /* | ||
| 11 | Utility subroutines common to several impls | ||
| 12 | */ | ||
| 13 | |||
| 14 | #include <slepc/private/fnimpl.h> /*I "slepcfn.h" I*/ | ||
| 15 | #include <slepcblaslapack.h> | ||
| 16 | |||
| 17 | /* | ||
| 18 | Compute the square root of an upper quasi-triangular matrix T, | ||
| 19 | using Higham's algorithm (LAA 88, 1987). T is overwritten with sqrtm(T). | ||
| 20 | */ | ||
| 21 | 1166 | static PetscErrorCode SlepcMatDenseSqrt(PetscBLASInt n,PetscScalar *T,PetscBLASInt ld) | |
| 22 | { | ||
| 23 | 1166 | PetscScalar one=1.0,mone=-1.0; | |
| 24 | 1166 | PetscReal scal; | |
| 25 | 1166 | PetscBLASInt i,j,si,sj,r,ione=1,info; | |
| 26 | #if !defined(PETSC_USE_COMPLEX) | ||
| 27 | 514 | PetscReal alpha,theta,mu,mu2; | |
| 28 | #endif | ||
| 29 | |||
| 30 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
1166 | PetscFunctionBegin; |
| 31 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
|
42432 | for (j=0;j<n;j++) { |
| 32 | #if defined(PETSC_USE_COMPLEX) | ||
| 33 | 24539 | sj = 1; | |
| 34 | 24539 | T[j+j*ld] = PetscSqrtScalar(T[j+j*ld]); | |
| 35 | #else | ||
| 36 |
4/4✓ Branch 0 taken 5 times.
✓ Branch 1 taken 5 times.
✓ Branch 2 taken 5 times.
✓ Branch 3 taken 5 times.
|
16727 | sj = (j==n-1 || T[j+1+j*ld] == 0.0)? 1: 2; |
| 37 |
2/2✓ Branch 0 taken 5 times.
✓ Branch 1 taken 5 times.
|
16727 | if (sj==1) { |
| 38 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 5 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
14207 | PetscCheck(T[j+j*ld]>=0.0,PETSC_COMM_SELF,PETSC_ERR_USER_INPUT,"Matrix has a real negative eigenvalue, no real primary square root exists"); |
| 39 | 14207 | T[j+j*ld] = PetscSqrtReal(T[j+j*ld]); | |
| 40 | } else { | ||
| 41 | /* square root of 2x2 block */ | ||
| 42 | 2520 | theta = (T[j+j*ld]+T[j+1+(j+1)*ld])/2.0; | |
| 43 | 2520 | mu = (T[j+j*ld]-T[j+1+(j+1)*ld])/2.0; | |
| 44 | 2520 | mu2 = -mu*mu-T[j+1+j*ld]*T[j+(j+1)*ld]; | |
| 45 | 2520 | mu = PetscSqrtReal(mu2); | |
| 46 |
1/2✓ Branch 0 taken 5 times.
✗ Branch 1 not taken.
|
2520 | if (theta>0.0) alpha = PetscSqrtReal((theta+PetscSqrtReal(theta*theta+mu2))/2.0); |
| 47 | ✗ | else alpha = mu/PetscSqrtReal(2.0*(-theta+PetscSqrtReal(theta*theta+mu2))); | |
| 48 | 2520 | T[j+j*ld] /= 2.0*alpha; | |
| 49 | 2520 | T[j+1+(j+1)*ld] /= 2.0*alpha; | |
| 50 | 2520 | T[j+(j+1)*ld] /= 2.0*alpha; | |
| 51 | 2520 | T[j+1+j*ld] /= 2.0*alpha; | |
| 52 | 2520 | T[j+j*ld] += alpha-theta/(2.0*alpha); | |
| 53 | 2520 | T[j+1+(j+1)*ld] += alpha-theta/(2.0*alpha); | |
| 54 | } | ||
| 55 | #endif | ||
| 56 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
|
1068808 | for (i=j-1;i>=0;i--) { |
| 57 | #if defined(PETSC_USE_COMPLEX) | ||
| 58 | 638365 | si = 1; | |
| 59 | #else | ||
| 60 |
4/4✓ Branch 0 taken 5 times.
✓ Branch 1 taken 5 times.
✓ Branch 2 taken 5 times.
✓ Branch 3 taken 5 times.
|
389177 | si = (i==0 || T[i+(i-1)*ld] == 0.0)? 1: 2; |
| 61 |
2/2✓ Branch 0 taken 5 times.
✓ Branch 1 taken 5 times.
|
389177 | if (si==2) i--; |
| 62 | #endif | ||
| 63 | /* solve Sylvester equation of order si x sj */ | ||
| 64 | 1027542 | r = j-i-si; | |
| 65 |
12/22✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 8 times.
✓ Branch 4 taken 2 times.
✗ Branch 5 not taken.
✓ Branch 6 taken 2 times.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✓ Branch 9 taken 2 times.
✓ Branch 10 taken 2 times.
✗ Branch 11 not taken.
✗ Branch 12 not taken.
✓ Branch 13 taken 2 times.
✓ Branch 14 taken 2 times.
✗ Branch 15 not taken.
✓ Branch 16 taken 2 times.
✗ Branch 17 not taken.
✗ Branch 18 not taken.
✓ Branch 19 taken 2 times.
✗ Branch 20 not taken.
✗ Branch 21 not taken.
|
1027542 | if (r) PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&si,&sj,&r,&mone,T+i+(i+si)*ld,&ld,T+i+si+j*ld,&ld,&one,T+i+j*ld,&ld)); |
| 66 |
10/20✓ Branch 0 taken 2 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✓ Branch 4 taken 2 times.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 2 times.
✓ Branch 8 taken 2 times.
✗ Branch 9 not taken.
✗ Branch 10 not taken.
✓ Branch 11 taken 2 times.
✓ Branch 12 taken 2 times.
✗ Branch 13 not taken.
✓ Branch 14 taken 2 times.
✗ Branch 15 not taken.
✗ Branch 16 not taken.
✓ Branch 17 taken 2 times.
✗ Branch 18 not taken.
✗ Branch 19 not taken.
|
1027542 | PetscCallBLAS("LAPACKtrsyl",LAPACKtrsyl_("N","N",&ione,&si,&sj,T+i+i*ld,&ld,T+j+j*ld,&ld,T+i+j*ld,&ld,&scal,&info)); |
| 67 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 10 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
1027542 | SlepcCheckLapackInfo("trsyl",info); |
| 68 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 10 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
1027542 | PetscCheck(scal==1.0,PETSC_COMM_SELF,PETSC_ERR_SUP,"Current implementation cannot handle scale factor %g",(double)scal); |
| 69 | } | ||
| 70 |
2/2✓ Branch 0 taken 5 times.
✓ Branch 1 taken 10 times.
|
41266 | if (sj==2) j++; |
| 71 | } | ||
| 72 |
6/12✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
✓ Branch 4 taken 2 times.
✗ Branch 5 not taken.
✓ Branch 6 taken 2 times.
✗ Branch 7 not taken.
✓ Branch 8 taken 2 times.
✗ Branch 9 not taken.
✗ Branch 10 not taken.
✓ Branch 11 taken 2 times.
|
226 | PetscFunctionReturn(PETSC_SUCCESS); |
| 73 | } | ||
| 74 | |||
| 75 | #define BLOCKSIZE 64 | ||
| 76 | |||
| 77 | /* | ||
| 78 | Schur method for the square root of an upper quasi-triangular matrix T. | ||
| 79 | T is overwritten with sqrtm(T). | ||
| 80 | If firstonly then only the first column of T will contain relevant values. | ||
| 81 | */ | ||
| 82 | 700 | PetscErrorCode FNSqrtmSchur(FN fn,PetscBLASInt n,PetscScalar *T,PetscBLASInt ld,PetscBool firstonly) | |
| 83 | { | ||
| 84 | 700 | PetscBLASInt i,j,k,r,ione=1,sdim,lwork,*s,*p,info,bs=BLOCKSIZE; | |
| 85 | 700 | PetscScalar *wr,*W,*Q,*work,one=1.0,zero=0.0,mone=-1.0; | |
| 86 | 700 | PetscInt m,nblk; | |
| 87 | 700 | PetscReal scal; | |
| 88 | #if defined(PETSC_USE_COMPLEX) | ||
| 89 | 385 | PetscReal *rwork; | |
| 90 | #else | ||
| 91 | 315 | PetscReal *wi; | |
| 92 | #endif | ||
| 93 | |||
| 94 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
700 | PetscFunctionBegin; |
| 95 | 700 | m = n; | |
| 96 | 700 | nblk = (m+bs-1)/bs; | |
| 97 | 700 | lwork = 5*n; | |
| 98 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
|
700 | k = firstonly? 1: n; |
| 99 | |||
| 100 | /* compute Schur decomposition A*Q = Q*T */ | ||
| 101 | #if !defined(PETSC_USE_COMPLEX) | ||
| 102 |
4/6✓ Branch 0 taken 1 times.
✓ Branch 1 taken 4 times.
✓ Branch 2 taken 1 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 1 times.
|
315 | PetscCall(PetscMalloc7(m,&wr,m,&wi,m*k,&W,m*m,&Q,lwork,&work,nblk,&s,nblk,&p)); |
| 103 |
10/20✓ Branch 0 taken 1 times.
✓ Branch 1 taken 4 times.
✓ Branch 2 taken 1 times.
✗ Branch 3 not taken.
✓ Branch 4 taken 1 times.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 1 times.
✓ Branch 8 taken 1 times.
✗ Branch 9 not taken.
✗ Branch 10 not taken.
✓ Branch 11 taken 1 times.
✓ Branch 12 taken 1 times.
✗ Branch 13 not taken.
✓ Branch 14 taken 1 times.
✗ Branch 15 not taken.
✗ Branch 16 not taken.
✓ Branch 17 taken 1 times.
✗ Branch 18 not taken.
✗ Branch 19 not taken.
|
315 | PetscCallBLAS("LAPACKgees",LAPACKgees_("V","N",NULL,&n,T,&ld,&sdim,wr,wi,Q,&ld,work,&lwork,NULL,&info)); |
| 104 | #else | ||
| 105 |
4/6✓ Branch 0 taken 1 times.
✓ Branch 1 taken 4 times.
✓ Branch 2 taken 1 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 1 times.
|
385 | PetscCall(PetscMalloc7(m,&wr,m,&rwork,m*k,&W,m*m,&Q,lwork,&work,nblk,&s,nblk,&p)); |
| 106 |
10/20✓ Branch 0 taken 1 times.
✓ Branch 1 taken 4 times.
✓ Branch 2 taken 1 times.
✗ Branch 3 not taken.
✓ Branch 4 taken 1 times.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 1 times.
✓ Branch 8 taken 1 times.
✗ Branch 9 not taken.
✗ Branch 10 not taken.
✓ Branch 11 taken 1 times.
✓ Branch 12 taken 1 times.
✗ Branch 13 not taken.
✓ Branch 14 taken 1 times.
✗ Branch 15 not taken.
✗ Branch 16 not taken.
✓ Branch 17 taken 1 times.
✗ Branch 18 not taken.
✗ Branch 19 not taken.
|
385 | PetscCallBLAS("LAPACKgees",LAPACKgees_("V","N",NULL,&n,T,&ld,&sdim,wr,Q,&ld,work,&lwork,rwork,NULL,&info)); |
| 107 | #endif | ||
| 108 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 10 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
700 | SlepcCheckLapackInfo("gees",info); |
| 109 | |||
| 110 | /* determine block sizes and positions, to avoid cutting 2x2 blocks */ | ||
| 111 | 700 | j = 0; | |
| 112 | 700 | p[j] = 0; | |
| 113 | 1525 | do { | |
| 114 | 1166 | s[j] = PetscMin(bs,n-p[j]); | |
| 115 | #if !defined(PETSC_USE_COMPLEX) | ||
| 116 |
4/4✓ Branch 0 taken 5 times.
✓ Branch 1 taken 5 times.
✓ Branch 2 taken 5 times.
✓ Branch 3 taken 5 times.
|
514 | if (p[j]+s[j]!=n && T[p[j]+s[j]+(p[j]+s[j]-1)*ld]!=0.0) s[j]++; |
| 117 | #endif | ||
| 118 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
|
1166 | if (p[j]+s[j]==n) break; |
| 119 | 466 | j++; | |
| 120 | 466 | p[j] = p[j-1]+s[j-1]; | |
| 121 | 107 | } while (1); | |
| 122 | 1866 | nblk = j+1; | |
| 123 | |||
| 124 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
|
1866 | for (j=0;j<nblk;j++) { |
| 125 | /* evaluate f(T_jj) */ | ||
| 126 |
4/6✓ Branch 0 taken 2 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 2 times.
|
1166 | PetscCall(SlepcMatDenseSqrt(s[j],T+p[j]+p[j]*ld,ld)); |
| 127 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
|
1632 | for (i=j-1;i>=0;i--) { |
| 128 | /* solve Sylvester equation for block (i,j) */ | ||
| 129 | 466 | r = p[j]-p[i]-s[i]; | |
| 130 |
1/22✗ Branch 0 not taken.
✓ Branch 1 taken 10 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
✗ Branch 10 not taken.
✗ Branch 11 not taken.
✗ Branch 12 not taken.
✗ Branch 13 not taken.
✗ Branch 14 not taken.
✗ Branch 15 not taken.
✗ Branch 16 not taken.
✗ Branch 17 not taken.
✗ Branch 18 not taken.
✗ Branch 19 not taken.
✗ Branch 20 not taken.
✗ Branch 21 not taken.
|
466 | if (r) PetscCallBLAS("BLASgemm",BLASgemm_("N","N",s+i,s+j,&r,&mone,T+p[i]+(p[i]+s[i])*ld,&ld,T+p[i]+s[i]+p[j]*ld,&ld,&one,T+p[i]+p[j]*ld,&ld)); |
| 131 |
10/20✓ Branch 0 taken 2 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✓ Branch 4 taken 2 times.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 2 times.
✓ Branch 8 taken 2 times.
✗ Branch 9 not taken.
✗ Branch 10 not taken.
✓ Branch 11 taken 2 times.
✓ Branch 12 taken 2 times.
✗ Branch 13 not taken.
✓ Branch 14 taken 2 times.
✗ Branch 15 not taken.
✗ Branch 16 not taken.
✓ Branch 17 taken 2 times.
✗ Branch 18 not taken.
✗ Branch 19 not taken.
|
466 | PetscCallBLAS("LAPACKtrsyl",LAPACKtrsyl_("N","N",&ione,s+i,s+j,T+p[i]+p[i]*ld,&ld,T+p[j]+p[j]*ld,&ld,T+p[i]+p[j]*ld,&ld,&scal,&info)); |
| 132 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 10 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
466 | SlepcCheckLapackInfo("trsyl",info); |
| 133 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 10 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
466 | PetscCheck(scal==1.0,PETSC_COMM_SELF,PETSC_ERR_SUP,"Current implementation cannot handle scale factor %g",(double)scal); |
| 134 | } | ||
| 135 | } | ||
| 136 | |||
| 137 | /* backtransform B = Q*T*Q' */ | ||
| 138 |
10/20✓ Branch 0 taken 2 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✓ Branch 4 taken 2 times.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 2 times.
✓ Branch 8 taken 2 times.
✗ Branch 9 not taken.
✗ Branch 10 not taken.
✓ Branch 11 taken 2 times.
✓ Branch 12 taken 2 times.
✗ Branch 13 not taken.
✓ Branch 14 taken 2 times.
✗ Branch 15 not taken.
✗ Branch 16 not taken.
✓ Branch 17 taken 2 times.
✗ Branch 18 not taken.
✗ Branch 19 not taken.
|
700 | PetscCallBLAS("BLASgemm",BLASgemm_("N","C",&n,&k,&n,&one,T,&ld,Q,&ld,&zero,W,&ld)); |
| 139 |
10/20✓ Branch 0 taken 2 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✓ Branch 4 taken 2 times.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 2 times.
✓ Branch 8 taken 2 times.
✗ Branch 9 not taken.
✗ Branch 10 not taken.
✓ Branch 11 taken 2 times.
✓ Branch 12 taken 2 times.
✗ Branch 13 not taken.
✓ Branch 14 taken 2 times.
✗ Branch 15 not taken.
✗ Branch 16 not taken.
✓ Branch 17 taken 2 times.
✗ Branch 18 not taken.
✗ Branch 19 not taken.
|
700 | PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&k,&n,&one,Q,&ld,W,&ld,&zero,T,&ld)); |
| 140 | |||
| 141 | /* flop count: Schur decomposition, triangular square root, and backtransform */ | ||
| 142 |
3/6✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 2 times.
|
700 | PetscCall(PetscLogFlops(25.0*n*n*n+n*n*n/3.0+4.0*n*n*k)); |
| 143 | |||
| 144 | #if !defined(PETSC_USE_COMPLEX) | ||
| 145 |
4/6✓ Branch 0 taken 1 times.
✓ Branch 1 taken 4 times.
✓ Branch 2 taken 1 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 1 times.
|
315 | PetscCall(PetscFree7(wr,wi,W,Q,work,s,p)); |
| 146 | #else | ||
| 147 |
4/6✓ Branch 0 taken 1 times.
✓ Branch 1 taken 4 times.
✓ Branch 2 taken 1 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 1 times.
|
385 | PetscCall(PetscFree7(wr,rwork,W,Q,work,s,p)); |
| 148 | #endif | ||
| 149 |
6/12✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
✓ Branch 4 taken 2 times.
✗ Branch 5 not taken.
✓ Branch 6 taken 2 times.
✗ Branch 7 not taken.
✓ Branch 8 taken 2 times.
✗ Branch 9 not taken.
✗ Branch 10 not taken.
✓ Branch 11 taken 2 times.
|
134 | PetscFunctionReturn(PETSC_SUCCESS); |
| 150 | } | ||
| 151 | |||
| 152 | #define DBMAXIT 25 | ||
| 153 | |||
| 154 | /* | ||
| 155 | Computes the principal square root of the matrix T using the product form | ||
| 156 | of the Denman-Beavers iteration. | ||
| 157 | T is overwritten with sqrtm(T) or inv(sqrtm(T)) depending on flag inv. | ||
| 158 | */ | ||
| 159 | 246 | PetscErrorCode FNSqrtmDenmanBeavers(FN fn,PetscBLASInt n,PetscScalar *T,PetscBLASInt ld,PetscBool inv) | |
| 160 | { | ||
| 161 | 246 | PetscScalar *Told,*M=NULL,*invM,*work,work1,prod,alpha; | |
| 162 | 246 | PetscScalar szero=0.0,sone=1.0,smone=-1.0,spfive=0.5,sp25=0.25; | |
| 163 | 246 | PetscReal tol,Mres=0.0,detM,g,reldiff,fnormdiff,fnormT,rwork[1]; | |
| 164 | 246 | PetscBLASInt N,i,it,*piv=NULL,info,query=-1,lwork; | |
| 165 | 246 | const PetscBLASInt one=1; | |
| 166 | 246 | PetscBool converged=PETSC_FALSE,scale; | |
| 167 | 246 | unsigned int ftz; | |
| 168 | |||
| 169 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
246 | PetscFunctionBegin; |
| 170 | 246 | N = n*n; | |
| 171 | 246 | tol = PetscSqrtReal((PetscReal)n)*PETSC_MACHINE_EPSILON/2; | |
| 172 | 246 | scale = PetscDefined(USE_REAL_SINGLE)? PETSC_FALSE: PETSC_TRUE; | |
| 173 |
3/6✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 2 times.
|
246 | PetscCall(SlepcSetFlushToZero(&ftz)); |
| 174 | |||
| 175 | /* query work size */ | ||
| 176 |
10/20✓ Branch 0 taken 2 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✓ Branch 4 taken 2 times.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 2 times.
✓ Branch 8 taken 2 times.
✗ Branch 9 not taken.
✗ Branch 10 not taken.
✓ Branch 11 taken 2 times.
✓ Branch 12 taken 2 times.
✗ Branch 13 not taken.
✓ Branch 14 taken 2 times.
✗ Branch 15 not taken.
✗ Branch 16 not taken.
✓ Branch 17 taken 2 times.
✗ Branch 18 not taken.
✗ Branch 19 not taken.
|
246 | PetscCallBLAS("LAPACKgetri",LAPACKgetri_(&n,M,&ld,piv,&work1,&query,&info)); |
| 177 |
4/6✓ Branch 0 taken 2 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 2 times.
|
246 | PetscCall(PetscBLASIntCast((PetscInt)PetscRealPart(work1),&lwork)); |
| 178 |
4/6✓ Branch 0 taken 2 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 2 times.
|
246 | PetscCall(PetscMalloc5(lwork,&work,n,&piv,n*n,&Told,n*n,&M,n*n,&invM)); |
| 179 |
4/6✓ Branch 0 taken 2 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 2 times.
|
246 | PetscCall(PetscArraycpy(M,T,n*n)); |
| 180 | |||
| 181 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
|
246 | if (inv) { /* start recurrence with I instead of A */ |
| 182 |
4/6✓ Branch 0 taken 5 times.
✓ Branch 1 taken 5 times.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 2 times.
|
120 | PetscCall(PetscArrayzero(T,n*n)); |
| 183 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
|
1320 | for (i=0;i<n;i++) T[i+i*ld] += 1.0; |
| 184 | } | ||
| 185 | |||
| 186 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
|
1476 | for (it=0;it<DBMAXIT && !converged;it++) { |
| 187 | |||
| 188 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
|
1230 | if (scale) { /* g = (abs(det(M)))^(-1/(2*n)) */ |
| 189 |
4/6✓ Branch 0 taken 2 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 2 times.
|
738 | PetscCall(PetscArraycpy(invM,M,n*n)); |
| 190 |
10/20✓ Branch 0 taken 2 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✓ Branch 4 taken 2 times.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 2 times.
✓ Branch 8 taken 2 times.
✗ Branch 9 not taken.
✗ Branch 10 not taken.
✓ Branch 11 taken 2 times.
✓ Branch 12 taken 2 times.
✗ Branch 13 not taken.
✓ Branch 14 taken 2 times.
✗ Branch 15 not taken.
✗ Branch 16 not taken.
✓ Branch 17 taken 2 times.
✗ Branch 18 not taken.
✗ Branch 19 not taken.
|
738 | PetscCallBLAS("LAPACKgetrf",LAPACKgetrf_(&n,&n,invM,&ld,piv,&info)); |
| 191 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 10 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
738 | SlepcCheckLapackInfo("getrf",info); |
| 192 | 738 | prod = invM[0]; | |
| 193 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
|
39942 | for (i=1;i<n;i++) prod *= invM[i+i*ld]; |
| 194 | 738 | detM = PetscAbsScalar(prod); | |
| 195 |
1/2✓ Branch 0 taken 10 times.
✗ Branch 1 not taken.
|
738 | g = (detM>PETSC_MAX_REAL)? 0.5: PetscPowReal(detM,-1.0/(2.0*n)); |
| 196 | 738 | alpha = g; | |
| 197 |
10/20✓ Branch 0 taken 2 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✓ Branch 4 taken 2 times.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 2 times.
✓ Branch 8 taken 2 times.
✗ Branch 9 not taken.
✗ Branch 10 not taken.
✓ Branch 11 taken 2 times.
✓ Branch 12 taken 2 times.
✗ Branch 13 not taken.
✓ Branch 14 taken 2 times.
✗ Branch 15 not taken.
✗ Branch 16 not taken.
✓ Branch 17 taken 2 times.
✗ Branch 18 not taken.
✗ Branch 19 not taken.
|
738 | PetscCallBLAS("BLASscal",BLASscal_(&N,&alpha,T,&one)); |
| 198 | 738 | alpha = g*g; | |
| 199 |
10/20✓ Branch 0 taken 2 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✓ Branch 4 taken 2 times.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 2 times.
✓ Branch 8 taken 2 times.
✗ Branch 9 not taken.
✗ Branch 10 not taken.
✓ Branch 11 taken 2 times.
✓ Branch 12 taken 2 times.
✗ Branch 13 not taken.
✓ Branch 14 taken 2 times.
✗ Branch 15 not taken.
✗ Branch 16 not taken.
✓ Branch 17 taken 2 times.
✗ Branch 18 not taken.
✗ Branch 19 not taken.
|
738 | PetscCallBLAS("BLASscal",BLASscal_(&N,&alpha,M,&one)); |
| 200 |
3/6✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 2 times.
|
738 | PetscCall(PetscLogFlops(2.0*n*n*n/3.0+2.0*n*n)); |
| 201 | } | ||
| 202 | |||
| 203 |
4/6✓ Branch 0 taken 2 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 2 times.
|
1230 | PetscCall(PetscArraycpy(Told,T,n*n)); |
| 204 |
4/6✓ Branch 0 taken 2 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 2 times.
|
1230 | PetscCall(PetscArraycpy(invM,M,n*n)); |
| 205 | |||
| 206 |
10/20✓ Branch 0 taken 2 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✓ Branch 4 taken 2 times.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 2 times.
✓ Branch 8 taken 2 times.
✗ Branch 9 not taken.
✗ Branch 10 not taken.
✓ Branch 11 taken 2 times.
✓ Branch 12 taken 2 times.
✗ Branch 13 not taken.
✓ Branch 14 taken 2 times.
✗ Branch 15 not taken.
✗ Branch 16 not taken.
✓ Branch 17 taken 2 times.
✗ Branch 18 not taken.
✗ Branch 19 not taken.
|
1230 | PetscCallBLAS("LAPACKgetrf",LAPACKgetrf_(&n,&n,invM,&ld,piv,&info)); |
| 207 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 10 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
1230 | SlepcCheckLapackInfo("getrf",info); |
| 208 |
10/20✓ Branch 0 taken 2 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✓ Branch 4 taken 2 times.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 2 times.
✓ Branch 8 taken 2 times.
✗ Branch 9 not taken.
✗ Branch 10 not taken.
✓ Branch 11 taken 2 times.
✓ Branch 12 taken 2 times.
✗ Branch 13 not taken.
✓ Branch 14 taken 2 times.
✗ Branch 15 not taken.
✗ Branch 16 not taken.
✓ Branch 17 taken 2 times.
✗ Branch 18 not taken.
✗ Branch 19 not taken.
|
1230 | PetscCallBLAS("LAPACKgetri",LAPACKgetri_(&n,invM,&ld,piv,work,&lwork,&info)); |
| 209 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 10 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
1230 | SlepcCheckLapackInfo("getri",info); |
| 210 |
3/6✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 2 times.
|
1230 | PetscCall(PetscLogFlops(2.0*n*n*n/3.0+4.0*n*n*n/3.0)); |
| 211 | |||
| 212 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
|
71400 | for (i=0;i<n;i++) invM[i+i*ld] += 1.0; |
| 213 |
10/20✓ Branch 0 taken 2 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✓ Branch 4 taken 2 times.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 2 times.
✓ Branch 8 taken 2 times.
✗ Branch 9 not taken.
✗ Branch 10 not taken.
✓ Branch 11 taken 2 times.
✓ Branch 12 taken 2 times.
✗ Branch 13 not taken.
✓ Branch 14 taken 2 times.
✗ Branch 15 not taken.
✗ Branch 16 not taken.
✓ Branch 17 taken 2 times.
✗ Branch 18 not taken.
✗ Branch 19 not taken.
|
1230 | PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&spfive,Told,&ld,invM,&ld,&szero,T,&ld)); |
| 214 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
|
71400 | for (i=0;i<n;i++) invM[i+i*ld] -= 1.0; |
| 215 | |||
| 216 |
10/20✓ Branch 0 taken 2 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✓ Branch 4 taken 2 times.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 2 times.
✓ Branch 8 taken 2 times.
✗ Branch 9 not taken.
✗ Branch 10 not taken.
✓ Branch 11 taken 2 times.
✓ Branch 12 taken 2 times.
✗ Branch 13 not taken.
✓ Branch 14 taken 2 times.
✗ Branch 15 not taken.
✗ Branch 16 not taken.
✓ Branch 17 taken 2 times.
✗ Branch 18 not taken.
✗ Branch 19 not taken.
|
1230 | PetscCallBLAS("BLASaxpy",BLASaxpy_(&N,&sone,invM,&one,M,&one)); |
| 217 |
10/20✓ Branch 0 taken 2 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✓ Branch 4 taken 2 times.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 2 times.
✓ Branch 8 taken 2 times.
✗ Branch 9 not taken.
✗ Branch 10 not taken.
✓ Branch 11 taken 2 times.
✓ Branch 12 taken 2 times.
✗ Branch 13 not taken.
✓ Branch 14 taken 2 times.
✗ Branch 15 not taken.
✗ Branch 16 not taken.
✓ Branch 17 taken 2 times.
✗ Branch 18 not taken.
✗ Branch 19 not taken.
|
1230 | PetscCallBLAS("BLASscal",BLASscal_(&N,&sp25,M,&one)); |
| 218 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
|
71400 | for (i=0;i<n;i++) M[i+i*ld] -= 0.5; |
| 219 |
3/6✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 2 times.
|
1230 | PetscCall(PetscLogFlops(2.0*n*n*n+2.0*n*n)); |
| 220 | |||
| 221 | 1230 | Mres = LAPACKlange_("F",&n,&n,M,&n,rwork); | |
| 222 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
|
72630 | for (i=0;i<n;i++) M[i+i*ld] += 1.0; |
| 223 | |||
| 224 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
|
1230 | if (scale) { |
| 225 | /* reldiff = norm(T - Told,'fro')/norm(T,'fro') */ | ||
| 226 |
10/20✓ Branch 0 taken 2 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✓ Branch 4 taken 2 times.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 2 times.
✓ Branch 8 taken 2 times.
✗ Branch 9 not taken.
✗ Branch 10 not taken.
✓ Branch 11 taken 2 times.
✓ Branch 12 taken 2 times.
✗ Branch 13 not taken.
✓ Branch 14 taken 2 times.
✗ Branch 15 not taken.
✗ Branch 16 not taken.
✓ Branch 17 taken 2 times.
✗ Branch 18 not taken.
✗ Branch 19 not taken.
|
738 | PetscCallBLAS("BLASaxpy",BLASaxpy_(&N,&smone,T,&one,Told,&one)); |
| 227 | 738 | fnormdiff = LAPACKlange_("F",&n,&n,Told,&n,rwork); | |
| 228 | 738 | fnormT = LAPACKlange_("F",&n,&n,T,&n,rwork); | |
| 229 |
3/6✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 2 times.
|
738 | PetscCall(PetscLogFlops(7.0*n*n)); |
| 230 | 738 | reldiff = fnormdiff/fnormT; | |
| 231 |
4/6✓ Branch 0 taken 2 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 2 times.
|
738 | PetscCall(PetscInfo(fn,"it: %" PetscBLASInt_FMT " reldiff: %g scale: %g tol*scale: %g\n",it,(double)reldiff,(double)g,(double)(tol*g))); |
| 232 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
|
738 | if (reldiff<1e-2) scale = PETSC_FALSE; /* Switch off scaling */ |
| 233 | } | ||
| 234 | |||
| 235 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
|
1230 | if (Mres<=tol) converged = PETSC_TRUE; |
| 236 | } | ||
| 237 | |||
| 238 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 10 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
246 | PetscCheck(Mres<=tol,PETSC_COMM_SELF,PETSC_ERR_LIB,"SQRTM not converged after %d iterations",DBMAXIT); |
| 239 |
4/6✓ Branch 0 taken 2 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 2 times.
|
246 | PetscCall(PetscFree5(work,piv,Told,M,invM)); |
| 240 |
3/6✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 2 times.
|
73 | PetscCall(SlepcResetFlushToZero(&ftz)); |
| 241 |
6/12✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
✓ Branch 4 taken 2 times.
✗ Branch 5 not taken.
✓ Branch 6 taken 2 times.
✗ Branch 7 not taken.
✓ Branch 8 taken 2 times.
✗ Branch 9 not taken.
✗ Branch 10 not taken.
✓ Branch 11 taken 2 times.
|
48 | PetscFunctionReturn(PETSC_SUCCESS); |
| 242 | } | ||
| 243 | |||
| 244 | #define NSMAXIT 50 | ||
| 245 | |||
| 246 | /* | ||
| 247 | Computes the principal square root of the matrix A using the Newton-Schulz iteration. | ||
| 248 | T is overwritten with sqrtm(T) or inv(sqrtm(T)) depending on flag inv. | ||
| 249 | */ | ||
| 250 | 246 | PetscErrorCode FNSqrtmNewtonSchulz(FN fn,PetscBLASInt n,PetscScalar *A,PetscBLASInt ld,PetscBool inv) | |
| 251 | { | ||
| 252 | 246 | PetscScalar *Y=A,*Yold,*Z,*Zold,*M; | |
| 253 | 246 | PetscScalar szero=0.0,sone=1.0,smone=-1.0,spfive=0.5,sthree=3.0; | |
| 254 | 246 | PetscReal sqrtnrm,tol,Yres=0.0,nrm,rwork[1],done=1.0; | |
| 255 | 246 | PetscBLASInt info,i,it,N,one=1,zero=0; | |
| 256 | 246 | PetscBool converged=PETSC_FALSE; | |
| 257 | 246 | unsigned int ftz; | |
| 258 | |||
| 259 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
246 | PetscFunctionBegin; |
| 260 | 246 | N = n*n; | |
| 261 | 246 | tol = PetscSqrtReal((PetscReal)n)*PETSC_MACHINE_EPSILON/2; | |
| 262 |
3/6✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 2 times.
|
246 | PetscCall(SlepcSetFlushToZero(&ftz)); |
| 263 | |||
| 264 |
4/6✓ Branch 0 taken 2 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 2 times.
|
246 | PetscCall(PetscMalloc4(N,&Yold,N,&Z,N,&Zold,N,&M)); |
| 265 | |||
| 266 | /* scale */ | ||
| 267 |
4/6✓ Branch 0 taken 2 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 2 times.
|
246 | PetscCall(PetscArraycpy(Z,A,N)); |
| 268 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
|
13560 | for (i=0;i<n;i++) Z[i+i*ld] -= 1.0; |
| 269 | 246 | nrm = LAPACKlange_("fro",&n,&n,Z,&n,rwork); | |
| 270 | 246 | sqrtnrm = PetscSqrtReal(nrm); | |
| 271 |
10/20✓ Branch 0 taken 2 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✓ Branch 4 taken 2 times.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 2 times.
✓ Branch 8 taken 2 times.
✗ Branch 9 not taken.
✗ Branch 10 not taken.
✓ Branch 11 taken 2 times.
✓ Branch 12 taken 2 times.
✗ Branch 13 not taken.
✓ Branch 14 taken 2 times.
✗ Branch 15 not taken.
✗ Branch 16 not taken.
✓ Branch 17 taken 2 times.
✗ Branch 18 not taken.
✗ Branch 19 not taken.
|
246 | PetscCallBLAS("LAPACKlascl",LAPACKlascl_("G",&zero,&zero,&nrm,&done,&N,&one,A,&N,&info)); |
| 272 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 10 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
246 | SlepcCheckLapackInfo("lascl",info); |
| 273 | 246 | tol *= nrm; | |
| 274 |
4/6✓ Branch 0 taken 2 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 2 times.
|
246 | PetscCall(PetscInfo(fn,"||I-A||_F = %g, new tol: %g\n",(double)nrm,(double)tol)); |
| 275 |
3/6✓ Branch 0 taken 5 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 2 times.
|
246 | PetscCall(PetscLogFlops(2.0*n*n)); |
| 276 | |||
| 277 | /* Z = I */ | ||
| 278 |
4/6✓ Branch 0 taken 5 times.
✓ Branch 1 taken 5 times.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 2 times.
|
246 | PetscCall(PetscArrayzero(Z,N)); |
| 279 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
|
13560 | for (i=0;i<n;i++) Z[i+i*ld] = 1.0; |
| 280 | |||
| 281 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
|
2878 | for (it=0;it<NSMAXIT && !converged;it++) { |
| 282 | /* Yold = Y, Zold = Z */ | ||
| 283 |
4/6✓ Branch 0 taken 2 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 2 times.
|
2632 | PetscCall(PetscArraycpy(Yold,Y,N)); |
| 284 |
4/6✓ Branch 0 taken 2 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 2 times.
|
2632 | PetscCall(PetscArraycpy(Zold,Z,N)); |
| 285 | |||
| 286 | /* M = (3*I-Zold*Yold) */ | ||
| 287 |
4/6✓ Branch 0 taken 5 times.
✓ Branch 1 taken 5 times.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 2 times.
|
2632 | PetscCall(PetscArrayzero(M,N)); |
| 288 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
|
170000 | for (i=0;i<n;i++) M[i+i*ld] = sthree; |
| 289 |
10/20✓ Branch 0 taken 2 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✓ Branch 4 taken 2 times.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 2 times.
✓ Branch 8 taken 2 times.
✗ Branch 9 not taken.
✗ Branch 10 not taken.
✓ Branch 11 taken 2 times.
✓ Branch 12 taken 2 times.
✗ Branch 13 not taken.
✓ Branch 14 taken 2 times.
✗ Branch 15 not taken.
✗ Branch 16 not taken.
✓ Branch 17 taken 2 times.
✗ Branch 18 not taken.
✗ Branch 19 not taken.
|
2632 | PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&smone,Zold,&ld,Yold,&ld,&sone,M,&ld)); |
| 290 | |||
| 291 | /* Y = (1/2)*Yold*M, Z = (1/2)*M*Zold */ | ||
| 292 |
10/20✓ Branch 0 taken 2 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✓ Branch 4 taken 2 times.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 2 times.
✓ Branch 8 taken 2 times.
✗ Branch 9 not taken.
✗ Branch 10 not taken.
✓ Branch 11 taken 2 times.
✓ Branch 12 taken 2 times.
✗ Branch 13 not taken.
✓ Branch 14 taken 2 times.
✗ Branch 15 not taken.
✗ Branch 16 not taken.
✓ Branch 17 taken 2 times.
✗ Branch 18 not taken.
✗ Branch 19 not taken.
|
2632 | PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&spfive,Yold,&ld,M,&ld,&szero,Y,&ld)); |
| 293 |
10/20✓ Branch 0 taken 2 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✓ Branch 4 taken 2 times.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 2 times.
✓ Branch 8 taken 2 times.
✗ Branch 9 not taken.
✗ Branch 10 not taken.
✓ Branch 11 taken 2 times.
✓ Branch 12 taken 2 times.
✗ Branch 13 not taken.
✓ Branch 14 taken 2 times.
✗ Branch 15 not taken.
✗ Branch 16 not taken.
✓ Branch 17 taken 2 times.
✗ Branch 18 not taken.
✗ Branch 19 not taken.
|
2632 | PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&spfive,M,&ld,Zold,&ld,&szero,Z,&ld)); |
| 294 | |||
| 295 | /* reldiff = norm(Y-Yold,'fro')/norm(Y,'fro') */ | ||
| 296 |
10/20✓ Branch 0 taken 2 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✓ Branch 4 taken 2 times.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 2 times.
✓ Branch 8 taken 2 times.
✗ Branch 9 not taken.
✗ Branch 10 not taken.
✓ Branch 11 taken 2 times.
✓ Branch 12 taken 2 times.
✗ Branch 13 not taken.
✓ Branch 14 taken 2 times.
✗ Branch 15 not taken.
✗ Branch 16 not taken.
✓ Branch 17 taken 2 times.
✗ Branch 18 not taken.
✗ Branch 19 not taken.
|
2632 | PetscCallBLAS("BLASaxpy",BLASaxpy_(&N,&smone,Y,&one,Yold,&one)); |
| 297 | 2632 | Yres = LAPACKlange_("fro",&n,&n,Yold,&n,rwork); | |
| 298 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 10 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
2632 | PetscCheck(!PetscIsNanReal(Yres),PETSC_COMM_SELF,PETSC_ERR_FP,"The computed norm is not-a-number"); |
| 299 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
|
2632 | if (Yres<=tol) converged = PETSC_TRUE; |
| 300 |
4/6✓ Branch 0 taken 2 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 2 times.
|
2632 | PetscCall(PetscInfo(fn,"it: %" PetscBLASInt_FMT " res: %g\n",it,(double)Yres)); |
| 301 | |||
| 302 |
3/6✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 2 times.
|
2632 | PetscCall(PetscLogFlops(6.0*n*n*n+2.0*n*n)); |
| 303 | } | ||
| 304 | |||
| 305 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 10 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
246 | PetscCheck(Yres<=tol,PETSC_COMM_SELF,PETSC_ERR_LIB,"SQRTM not converged after %d iterations",NSMAXIT); |
| 306 | |||
| 307 | /* undo scaling */ | ||
| 308 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
|
246 | if (inv) { |
| 309 |
4/6✓ Branch 0 taken 2 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 2 times.
|
120 | PetscCall(PetscArraycpy(A,Z,N)); |
| 310 |
10/20✓ Branch 0 taken 2 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✓ Branch 4 taken 2 times.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 2 times.
✓ Branch 8 taken 2 times.
✗ Branch 9 not taken.
✗ Branch 10 not taken.
✓ Branch 11 taken 2 times.
✓ Branch 12 taken 2 times.
✗ Branch 13 not taken.
✓ Branch 14 taken 2 times.
✗ Branch 15 not taken.
✗ Branch 16 not taken.
✓ Branch 17 taken 2 times.
✗ Branch 18 not taken.
✗ Branch 19 not taken.
|
120 | PetscCallBLAS("LAPACKlascl",LAPACKlascl_("G",&zero,&zero,&sqrtnrm,&done,&N,&one,A,&N,&info)); |
| 311 |
10/20✓ Branch 0 taken 2 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✓ Branch 4 taken 2 times.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 2 times.
✓ Branch 8 taken 2 times.
✗ Branch 9 not taken.
✗ Branch 10 not taken.
✓ Branch 11 taken 2 times.
✓ Branch 12 taken 2 times.
✗ Branch 13 not taken.
✓ Branch 14 taken 2 times.
✗ Branch 15 not taken.
✗ Branch 16 not taken.
✓ Branch 17 taken 2 times.
✗ Branch 18 not taken.
✗ Branch 19 not taken.
|
126 | } else PetscCallBLAS("LAPACKlascl",LAPACKlascl_("G",&zero,&zero,&done,&sqrtnrm,&N,&one,A,&N,&info)); |
| 312 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 10 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
246 | SlepcCheckLapackInfo("lascl",info); |
| 313 | |||
| 314 |
4/6✓ Branch 0 taken 2 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 2 times.
|
246 | PetscCall(PetscFree4(Yold,Z,Zold,M)); |
| 315 |
3/6✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 2 times.
|
73 | PetscCall(SlepcResetFlushToZero(&ftz)); |
| 316 |
6/12✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
✓ Branch 4 taken 2 times.
✗ Branch 5 not taken.
✓ Branch 6 taken 2 times.
✗ Branch 7 not taken.
✓ Branch 8 taken 2 times.
✗ Branch 9 not taken.
✗ Branch 10 not taken.
✓ Branch 11 taken 2 times.
|
48 | PetscFunctionReturn(PETSC_SUCCESS); |
| 317 | } | ||
| 318 | |||
| 319 | #if defined(PETSC_HAVE_CUDA) | ||
| 320 | #include "../src/sys/classes/fn/impls/cuda/fnutilcuda.h" | ||
| 321 | #include <slepccupmblas.h> | ||
| 322 | |||
| 323 | /* | ||
| 324 | * Matrix square root by Newton-Schulz iteration. CUDA version. | ||
| 325 | * Computes the principal square root of the matrix A using the | ||
| 326 | * Newton-Schulz iteration. A is overwritten with sqrtm(A). | ||
| 327 | */ | ||
| 328 | 48 | PetscErrorCode FNSqrtmNewtonSchulz_CUDA(FN fn,PetscBLASInt n,PetscScalar *d_A,PetscBLASInt ld,PetscBool inv) | |
| 329 | { | ||
| 330 | 48 | PetscScalar *d_Yold,*d_Z,*d_Zold,*d_M,alpha; | |
| 331 | 48 | PetscReal nrm,sqrtnrm,tol,Yres=0.0; | |
| 332 | 48 | const PetscScalar szero=0.0,sone=1.0,smone=-1.0,spfive=0.5,sthree=3.0; | |
| 333 | 48 | PetscInt it; | |
| 334 | 48 | PetscBLASInt N; | |
| 335 | 48 | const PetscBLASInt one=1; | |
| 336 | 48 | PetscBool converged=PETSC_FALSE; | |
| 337 | 48 | cublasHandle_t cublasv2handle; | |
| 338 | |||
| 339 | 48 | PetscFunctionBegin; | |
| 340 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
48 | PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* For CUDA event timers */ |
| 341 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
48 | PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); |
| 342 | 48 | N = n*n; | |
| 343 | 48 | tol = PetscSqrtReal((PetscReal)n)*PETSC_MACHINE_EPSILON/2; | |
| 344 | |||
| 345 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
48 | PetscCallCUDA(cudaMalloc((void **)&d_Yold,sizeof(PetscScalar)*N)); |
| 346 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
48 | PetscCallCUDA(cudaMalloc((void **)&d_Z,sizeof(PetscScalar)*N)); |
| 347 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
48 | PetscCallCUDA(cudaMalloc((void **)&d_Zold,sizeof(PetscScalar)*N)); |
| 348 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
48 | PetscCallCUDA(cudaMalloc((void **)&d_M,sizeof(PetscScalar)*N)); |
| 349 | |||
| 350 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
48 | PetscCall(PetscLogGpuTimeBegin()); |
| 351 | |||
| 352 | /* Z = I; */ | ||
| 353 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
48 | PetscCallCUDA(cudaMemset(d_Z,0,sizeof(PetscScalar)*N)); |
| 354 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
48 | PetscCall(set_diagonal(n,d_Z,ld,sone)); |
| 355 | |||
| 356 | /* scale */ | ||
| 357 |
1/10✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
|
48 | PetscCallCUBLAS(cublasXaxpy(cublasv2handle,N,&smone,d_A,one,d_Z,one)); |
| 358 |
1/10✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
|
48 | PetscCallCUBLAS(cublasXnrm2(cublasv2handle,N,d_Z,one,&nrm)); |
| 359 | 48 | sqrtnrm = PetscSqrtReal(nrm); | |
| 360 | 48 | alpha = 1.0/nrm; | |
| 361 |
1/10✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
|
48 | PetscCallCUBLAS(cublasXscal(cublasv2handle,N,&alpha,d_A,one)); |
| 362 | 48 | tol *= nrm; | |
| 363 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
48 | PetscCall(PetscInfo(fn,"||I-A||_F = %g, new tol: %g\n",(double)nrm,(double)tol)); |
| 364 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
48 | PetscCall(PetscLogGpuFlops(2.0*n*n)); |
| 365 | |||
| 366 | /* Z = I; */ | ||
| 367 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
48 | PetscCallCUDA(cudaMemset(d_Z,0,sizeof(PetscScalar)*N)); |
| 368 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
48 | PetscCall(set_diagonal(n,d_Z,ld,sone)); |
| 369 | |||
| 370 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
560 | for (it=0;it<NSMAXIT && !converged;it++) { |
| 371 | /* Yold = Y, Zold = Z */ | ||
| 372 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
512 | PetscCallCUDA(cudaMemcpy(d_Yold,d_A,sizeof(PetscScalar)*N,cudaMemcpyDeviceToDevice)); |
| 373 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
512 | PetscCallCUDA(cudaMemcpy(d_Zold,d_Z,sizeof(PetscScalar)*N,cudaMemcpyDeviceToDevice)); |
| 374 | |||
| 375 | /* M = (3*I - Zold*Yold) */ | ||
| 376 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
512 | PetscCallCUDA(cudaMemset(d_M,0,sizeof(PetscScalar)*N)); |
| 377 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
512 | PetscCall(set_diagonal(n,d_M,ld,sthree)); |
| 378 |
1/10✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
|
512 | PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&smone,d_Zold,ld,d_Yold,ld,&sone,d_M,ld)); |
| 379 | |||
| 380 | /* Y = (1/2) * Yold * M, Z = (1/2) * M * Zold */ | ||
| 381 |
1/10✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
|
512 | PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&spfive,d_Yold,ld,d_M,ld,&szero,d_A,ld)); |
| 382 |
1/10✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
|
512 | PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&spfive,d_M,ld,d_Zold,ld,&szero,d_Z,ld)); |
| 383 | |||
| 384 | /* reldiff = norm(Y-Yold,'fro')/norm(Y,'fro') */ | ||
| 385 |
1/10✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
|
512 | PetscCallCUBLAS(cublasXaxpy(cublasv2handle,N,&smone,d_A,one,d_Yold,one)); |
| 386 |
1/10✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
|
512 | PetscCallCUBLAS(cublasXnrm2(cublasv2handle,N,d_Yold,one,&Yres)); |
| 387 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
512 | PetscCheck(!PetscIsNanReal(Yres),PETSC_COMM_SELF,PETSC_ERR_FP,"The computed norm is not-a-number"); |
| 388 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
512 | if (Yres<=tol) converged = PETSC_TRUE; |
| 389 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
512 | PetscCall(PetscInfo(fn,"it: %" PetscInt_FMT " res: %g\n",it,(double)Yres)); |
| 390 | |||
| 391 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
512 | PetscCall(PetscLogGpuFlops(6.0*n*n*n+2.0*n*n)); |
| 392 | } | ||
| 393 | |||
| 394 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
48 | PetscCheck(Yres<=tol,PETSC_COMM_SELF,PETSC_ERR_LIB,"SQRTM not converged after %d iterations", NSMAXIT); |
| 395 | |||
| 396 | /* undo scaling */ | ||
| 397 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
48 | if (inv) { |
| 398 | 24 | alpha = 1.0/sqrtnrm; | |
| 399 |
1/10✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
|
24 | PetscCallCUBLAS(cublasXscal(cublasv2handle,N,&alpha,d_Z,one)); |
| 400 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
24 | PetscCallCUDA(cudaMemcpy(d_A,d_Z,sizeof(PetscScalar)*N,cudaMemcpyDeviceToDevice)); |
| 401 | } else { | ||
| 402 | 24 | alpha = sqrtnrm; | |
| 403 |
1/10✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
|
24 | PetscCallCUBLAS(cublasXscal(cublasv2handle,N,&alpha,d_A,one)); |
| 404 | } | ||
| 405 | |||
| 406 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
48 | PetscCall(PetscLogGpuTimeEnd()); |
| 407 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
48 | PetscCallCUDA(cudaFree(d_Yold)); |
| 408 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
48 | PetscCallCUDA(cudaFree(d_Z)); |
| 409 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
48 | PetscCallCUDA(cudaFree(d_Zold)); |
| 410 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
48 | PetscCallCUDA(cudaFree(d_M)); |
| 411 | PetscFunctionReturn(PETSC_SUCCESS); | ||
| 412 | } | ||
| 413 | |||
| 414 | #if defined(PETSC_HAVE_MAGMA) | ||
| 415 | #include <slepcmagma.h> | ||
| 416 | |||
| 417 | /* | ||
| 418 | * Matrix square root by product form of Denman-Beavers iteration. CUDA version. | ||
| 419 | * Computes the principal square root of the matrix T using the product form | ||
| 420 | * of the Denman-Beavers iteration. T is overwritten with sqrtm(T). | ||
| 421 | */ | ||
| 422 | 48 | PetscErrorCode FNSqrtmDenmanBeavers_CUDAm(FN fn,PetscBLASInt n,PetscScalar *d_T,PetscBLASInt ld,PetscBool inv) | |
| 423 | { | ||
| 424 | 48 | PetscScalar *d_Told,*d_M,*d_invM,*d_work,prod,szero=0.0,sone=1.0,smone=-1.0,spfive=0.5,sneg_pfive=-0.5,sp25=0.25,alpha; | |
| 425 | 48 | PetscReal tol,Mres=0.0,detM,g,reldiff,fnormdiff,fnormT; | |
| 426 | 48 | PetscInt it,lwork,nb; | |
| 427 | 48 | PetscBLASInt N,one=1,*piv=NULL; | |
| 428 | 48 | PetscBool converged=PETSC_FALSE,scale; | |
| 429 | 48 | cublasHandle_t cublasv2handle; | |
| 430 | |||
| 431 | 48 | PetscFunctionBegin; | |
| 432 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
48 | PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* For CUDA event timers */ |
| 433 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
48 | PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); |
| 434 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
48 | PetscCall(SlepcMagmaInit()); |
| 435 | 48 | N = n*n; | |
| 436 | 48 | scale = PetscDefined(USE_REAL_SINGLE)? PETSC_FALSE: PETSC_TRUE; | |
| 437 | 48 | tol = PetscSqrtReal((PetscReal)n)*PETSC_MACHINE_EPSILON/2; | |
| 438 | |||
| 439 | /* query work size */ | ||
| 440 | 48 | nb = magma_get_xgetri_nb(n); | |
| 441 | 48 | lwork = nb*n; | |
| 442 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
48 | PetscCall(PetscMalloc1(n,&piv)); |
| 443 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
48 | PetscCallCUDA(cudaMalloc((void **)&d_work,sizeof(PetscScalar)*lwork)); |
| 444 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
48 | PetscCallCUDA(cudaMalloc((void **)&d_Told,sizeof(PetscScalar)*N)); |
| 445 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
48 | PetscCallCUDA(cudaMalloc((void **)&d_M,sizeof(PetscScalar)*N)); |
| 446 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
48 | PetscCallCUDA(cudaMalloc((void **)&d_invM,sizeof(PetscScalar)*N)); |
| 447 | |||
| 448 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
48 | PetscCall(PetscLogGpuTimeBegin()); |
| 449 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
48 | PetscCallCUDA(cudaMemcpy(d_M,d_T,sizeof(PetscScalar)*N,cudaMemcpyDeviceToDevice)); |
| 450 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
48 | if (inv) { /* start recurrence with I instead of A */ |
| 451 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
24 | PetscCallCUDA(cudaMemset(d_T,0,sizeof(PetscScalar)*N)); |
| 452 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
24 | PetscCall(set_diagonal(n,d_T,ld,1.0)); |
| 453 | } | ||
| 454 | |||
| 455 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
304 | for (it=0;it<DBMAXIT && !converged;it++) { |
| 456 | |||
| 457 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
256 | if (scale) { /* g = (abs(det(M)))^(-1/(2*n)); */ |
| 458 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
160 | PetscCallCUDA(cudaMemcpy(d_invM,d_M,sizeof(PetscScalar)*N,cudaMemcpyDeviceToDevice)); |
| 459 |
2/6✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
|
160 | PetscCallMAGMA(magma_xgetrf_gpu,n,n,d_invM,ld,piv); |
| 460 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
160 | PetscCall(mult_diagonal(n,d_invM,ld,&prod)); |
| 461 | 160 | detM = PetscAbsScalar(prod); | |
| 462 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
160 | g = (detM>PETSC_MAX_REAL)? 0.5: PetscPowReal(detM,-1.0/(2.0*n)); |
| 463 | 160 | alpha = g; | |
| 464 |
1/10✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
|
160 | PetscCallCUBLAS(cublasXscal(cublasv2handle,N,&alpha,d_T,one)); |
| 465 | 160 | alpha = g*g; | |
| 466 |
1/10✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
|
160 | PetscCallCUBLAS(cublasXscal(cublasv2handle,N,&alpha,d_M,one)); |
| 467 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
160 | PetscCall(PetscLogGpuFlops(2.0*n*n*n/3.0+2.0*n*n)); |
| 468 | } | ||
| 469 | |||
| 470 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
256 | PetscCallCUDA(cudaMemcpy(d_Told,d_T,sizeof(PetscScalar)*N,cudaMemcpyDeviceToDevice)); |
| 471 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
256 | PetscCallCUDA(cudaMemcpy(d_invM,d_M,sizeof(PetscScalar)*N,cudaMemcpyDeviceToDevice)); |
| 472 | |||
| 473 |
2/6✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
|
256 | PetscCallMAGMA(magma_xgetrf_gpu,n,n,d_invM,ld,piv); |
| 474 |
2/6✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
|
256 | PetscCallMAGMA(magma_xgetri_gpu,n,d_invM,ld,piv,d_work,lwork); |
| 475 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
256 | PetscCall(PetscLogGpuFlops(2.0*n*n*n/3.0+4.0*n*n*n/3.0)); |
| 476 | |||
| 477 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
256 | PetscCall(shift_diagonal(n,d_invM,ld,sone)); |
| 478 |
1/10✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
|
256 | PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&spfive,d_Told,ld,d_invM,ld,&szero,d_T,ld)); |
| 479 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
256 | PetscCall(shift_diagonal(n,d_invM,ld,smone)); |
| 480 | |||
| 481 |
1/10✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
|
256 | PetscCallCUBLAS(cublasXaxpy(cublasv2handle,N,&sone,d_invM,one,d_M,one)); |
| 482 |
1/10✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
|
256 | PetscCallCUBLAS(cublasXscal(cublasv2handle,N,&sp25,d_M,one)); |
| 483 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
256 | PetscCall(shift_diagonal(n,d_M,ld,sneg_pfive)); |
| 484 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
256 | PetscCall(PetscLogGpuFlops(2.0*n*n*n+2.0*n*n)); |
| 485 | |||
| 486 |
1/10✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
|
256 | PetscCallCUBLAS(cublasXnrm2(cublasv2handle,N,d_M,one,&Mres)); |
| 487 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
256 | PetscCall(shift_diagonal(n,d_M,ld,sone)); |
| 488 | |||
| 489 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
256 | if (scale) { |
| 490 | /* reldiff = norm(T - Told,'fro')/norm(T,'fro'); */ | ||
| 491 |
1/10✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
|
160 | PetscCallCUBLAS(cublasXaxpy(cublasv2handle,N,&smone,d_T,one,d_Told,one)); |
| 492 |
1/10✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
|
160 | PetscCallCUBLAS(cublasXnrm2(cublasv2handle,N,d_Told,one,&fnormdiff)); |
| 493 |
1/10✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
|
160 | PetscCallCUBLAS(cublasXnrm2(cublasv2handle,N,d_T,one,&fnormT)); |
| 494 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
160 | PetscCall(PetscLogGpuFlops(7.0*n*n)); |
| 495 | 160 | reldiff = fnormdiff/fnormT; | |
| 496 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
160 | PetscCall(PetscInfo(fn,"it: %" PetscInt_FMT " reldiff: %g scale: %g tol*scale: %g\n",it,(double)reldiff,(double)g,(double)tol*g)); |
| 497 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
160 | if (reldiff<1e-2) scale = PETSC_FALSE; /* Switch to no scaling. */ |
| 498 | } | ||
| 499 | |||
| 500 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
256 | PetscCall(PetscInfo(fn,"it: %" PetscInt_FMT " Mres: %g\n",it,(double)Mres)); |
| 501 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
256 | if (Mres<=tol) converged = PETSC_TRUE; |
| 502 | } | ||
| 503 | |||
| 504 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
48 | PetscCheck(Mres<=tol,PETSC_COMM_SELF,PETSC_ERR_LIB,"SQRTM not converged after %d iterations", DBMAXIT); |
| 505 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
48 | PetscCall(PetscLogGpuTimeEnd()); |
| 506 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
48 | PetscCall(PetscFree(piv)); |
| 507 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
48 | PetscCallCUDA(cudaFree(d_work)); |
| 508 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
48 | PetscCallCUDA(cudaFree(d_Told)); |
| 509 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
48 | PetscCallCUDA(cudaFree(d_M)); |
| 510 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
48 | PetscCallCUDA(cudaFree(d_invM)); |
| 511 | PetscFunctionReturn(PETSC_SUCCESS); | ||
| 512 | } | ||
| 513 | #endif /* PETSC_HAVE_MAGMA */ | ||
| 514 | |||
| 515 | #endif /* PETSC_HAVE_CUDA */ | ||
| 516 | |||
| 517 | #define ITMAX 5 | ||
| 518 | |||
| 519 | /* | ||
| 520 | Estimate norm(A^m,1) by block 1-norm power method (required workspace is 11*n) | ||
| 521 | */ | ||
| 522 | 280 | static PetscErrorCode SlepcNormEst1(PetscBLASInt n,PetscScalar *A,PetscInt m,PetscScalar *work,PetscRandom rand,PetscReal *nrm) | |
| 523 | { | ||
| 524 | 280 | PetscScalar *X,*Y,*Z,*S,*S_old,*aux,val,sone=1.0,szero=0.0; | |
| 525 | 280 | PetscReal est=0.0,est_old,vals[2]={0.0,0.0},*zvals,maxzval[2],raux; | |
| 526 | 280 | PetscBLASInt i,j,t=2,it=0,ind[2],est_j=0,m1; | |
| 527 | |||
| 528 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
280 | PetscFunctionBegin; |
| 529 | 280 | X = work; | |
| 530 | 280 | Y = work + 2*n; | |
| 531 | 280 | Z = work + 4*n; | |
| 532 | 280 | S = work + 6*n; | |
| 533 | 280 | S_old = work + 8*n; | |
| 534 | 280 | zvals = (PetscReal*)(work + 10*n); | |
| 535 | |||
| 536 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
|
33980 | for (i=0;i<n;i++) { /* X has columns of unit 1-norm */ |
| 537 | 33700 | X[i] = 1.0/n; | |
| 538 |
4/6✓ Branch 0 taken 2 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 2 times.
|
33700 | PetscCall(PetscRandomGetValue(rand,&val)); |
| 539 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
|
33700 | if (PetscRealPart(val) < 0.5) X[i+n] = -1.0/n; |
| 540 | 16595 | else X[i+n] = 1.0/n; | |
| 541 | } | ||
| 542 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
|
67680 | for (i=0;i<t*n;i++) S[i] = 0.0; |
| 543 | 280 | ind[0] = 0; ind[1] = 0; | |
| 544 | 280 | est_old = 0; | |
| 545 | 609 | while (1) { | |
| 546 | 609 | it++; | |
| 547 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
|
2667 | for (j=0;j<m;j++) { /* Y = A^m*X */ |
| 548 |
10/20✓ Branch 0 taken 2 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✓ Branch 4 taken 2 times.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 2 times.
✓ Branch 8 taken 2 times.
✗ Branch 9 not taken.
✗ Branch 10 not taken.
✓ Branch 11 taken 2 times.
✓ Branch 12 taken 2 times.
✗ Branch 13 not taken.
✓ Branch 14 taken 2 times.
✗ Branch 15 not taken.
✗ Branch 16 not taken.
✓ Branch 17 taken 2 times.
✗ Branch 18 not taken.
✗ Branch 19 not taken.
|
2058 | PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&t,&n,&sone,A,&n,X,&n,&szero,Y,&n)); |
| 549 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
|
2058 | if (j<m-1) SlepcSwap(X,Y,aux); |
| 550 | } | ||
| 551 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
|
1827 | for (j=0;j<t;j++) { /* vals[j] = norm(Y(:,j),1) */ |
| 552 | 1218 | vals[j] = 0.0; | |
| 553 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
|
147812 | for (i=0;i<n;i++) vals[j] += PetscAbsScalar(Y[i+j*n]); |
| 554 | } | ||
| 555 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
|
609 | if (vals[0]<vals[1]) { |
| 556 | 340 | SlepcSwap(vals[0],vals[1],raux); | |
| 557 | 340 | m1 = 1; | |
| 558 | } else m1 = 0; | ||
| 559 | 609 | est = vals[0]; | |
| 560 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 10 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
609 | if (est>est_old || it==2) est_j = ind[m1]; |
| 561 |
3/4✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
✓ Branch 2 taken 10 times.
✗ Branch 3 not taken.
|
609 | if (it>=2 && est<=est_old) { |
| 562 | est = est_old; | ||
| 563 | break; | ||
| 564 | } | ||
| 565 | 120516 | est_old = est; | |
| 566 |
1/2✓ Branch 0 taken 10 times.
✗ Branch 1 not taken.
|
385 | if (it>ITMAX) break; |
| 567 | 147203 | SlepcSwap(S,S_old,aux); | |
| 568 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
|
147203 | for (i=0;i<t*n;i++) { /* S = sign(Y) */ |
| 569 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
|
255459 | S[i] = (PetscRealPart(Y[i]) < 0.0)? -1.0: 1.0; |
| 570 | } | ||
| 571 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
|
2667 | for (j=0;j<m;j++) { /* Z = (A^T)^m*S */ |
| 572 |
10/20✓ Branch 0 taken 2 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✓ Branch 4 taken 2 times.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 2 times.
✓ Branch 8 taken 2 times.
✗ Branch 9 not taken.
✗ Branch 10 not taken.
✓ Branch 11 taken 2 times.
✓ Branch 12 taken 2 times.
✗ Branch 13 not taken.
✓ Branch 14 taken 2 times.
✗ Branch 15 not taken.
✗ Branch 16 not taken.
✓ Branch 17 taken 2 times.
✗ Branch 18 not taken.
✗ Branch 19 not taken.
|
2058 | PetscCallBLAS("BLASgemm",BLASgemm_("C","N",&n,&t,&n,&sone,A,&n,S,&n,&szero,Z,&n)); |
| 573 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
|
2058 | if (j<m-1) SlepcSwap(S,Z,aux); |
| 574 | } | ||
| 575 | 609 | maxzval[0] = -1; maxzval[1] = -1; | |
| 576 | 609 | ind[0] = 0; ind[1] = 0; | |
| 577 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
|
73906 | for (i=0;i<n;i++) { /* zvals[i] = norm(Z(i,:),inf) */ |
| 578 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
|
73297 | zvals[i] = PetscMax(PetscAbsScalar(Z[i+0*n]),PetscAbsScalar(Z[i+1*n])); |
| 579 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
|
73297 | if (zvals[i]>maxzval[0]) { |
| 580 | 7221 | maxzval[0] = zvals[i]; | |
| 581 | 7221 | ind[0] = i; | |
| 582 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
|
66076 | } else if (zvals[i]>maxzval[1]) { |
| 583 | 3348 | maxzval[1] = zvals[i]; | |
| 584 | 3348 | ind[1] = i; | |
| 585 | } | ||
| 586 | } | ||
| 587 |
4/4✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
✓ Branch 2 taken 5 times.
✓ Branch 3 taken 10 times.
|
609 | if (it>=2 && maxzval[0]==zvals[est_j]) break; |
| 588 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
|
79523 | for (i=0;i<t*n;i++) X[i] = 0.0; |
| 589 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
|
987 | for (j=0;j<t;j++) X[ind[j]+j*n] = 1.0; |
| 590 | } | ||
| 591 | 280 | *nrm = est; | |
| 592 | /* Flop count is roughly (it * 2*m * t*gemv) = 4*its*m*t*n*n */ | ||
| 593 |
3/6✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 2 times.
|
280 | PetscCall(PetscLogFlops(4.0*it*m*t*n*n)); |
| 594 |
6/12✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
✓ Branch 4 taken 2 times.
✗ Branch 5 not taken.
✓ Branch 6 taken 2 times.
✗ Branch 7 not taken.
✓ Branch 8 taken 2 times.
✗ Branch 9 not taken.
✗ Branch 10 not taken.
✓ Branch 11 taken 2 times.
|
280 | PetscFunctionReturn(PETSC_SUCCESS); |
| 595 | } | ||
| 596 | |||
| 597 | #define SMALLN 100 | ||
| 598 | |||
| 599 | /* | ||
| 600 | Estimate norm(A^m,1) (required workspace is 2*n*n) | ||
| 601 | */ | ||
| 602 | 5240 | PetscErrorCode SlepcNormAm(PetscBLASInt n,PetscScalar *A,PetscInt m,PetscScalar *work,PetscRandom rand,PetscReal *nrm) | |
| 603 | { | ||
| 604 | 5240 | PetscScalar *v=work,*w=work+n*n,*aux,sone=1.0,szero=0.0; | |
| 605 | 5240 | PetscReal rwork[1],tmp; | |
| 606 | 5240 | PetscBLASInt i,j,one=1; | |
| 607 | 5240 | PetscBool isrealpos=PETSC_TRUE; | |
| 608 | |||
| 609 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
5240 | PetscFunctionBegin; |
| 610 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
|
5240 | if (n<SMALLN) { /* compute matrix power explicitly */ |
| 611 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 10 times.
|
4780 | if (m==1) { |
| 612 | ✗ | *nrm = LAPACKlange_("O",&n,&n,A,&n,rwork); | |
| 613 | ✗ | PetscCall(PetscLogFlops(1.0*n*n)); | |
| 614 | } else { /* m>=2 */ | ||
| 615 |
10/20✓ Branch 0 taken 2 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✓ Branch 4 taken 2 times.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 2 times.
✓ Branch 8 taken 2 times.
✗ Branch 9 not taken.
✗ Branch 10 not taken.
✓ Branch 11 taken 2 times.
✓ Branch 12 taken 2 times.
✗ Branch 13 not taken.
✓ Branch 14 taken 2 times.
✗ Branch 15 not taken.
✗ Branch 16 not taken.
✓ Branch 17 taken 2 times.
✗ Branch 18 not taken.
✗ Branch 19 not taken.
|
4780 | PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&sone,A,&n,A,&n,&szero,v,&n)); |
| 616 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
|
75190 | for (j=0;j<m-2;j++) { |
| 617 |
10/20✓ Branch 0 taken 2 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✓ Branch 4 taken 2 times.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 2 times.
✓ Branch 8 taken 2 times.
✗ Branch 9 not taken.
✗ Branch 10 not taken.
✓ Branch 11 taken 2 times.
✓ Branch 12 taken 2 times.
✗ Branch 13 not taken.
✓ Branch 14 taken 2 times.
✗ Branch 15 not taken.
✗ Branch 16 not taken.
✓ Branch 17 taken 2 times.
✗ Branch 18 not taken.
✗ Branch 19 not taken.
|
70410 | PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&sone,A,&n,v,&n,&szero,w,&n)); |
| 618 | 70410 | SlepcSwap(v,w,aux); | |
| 619 | } | ||
| 620 | 4780 | *nrm = LAPACKlange_("O",&n,&n,v,&n,rwork); | |
| 621 |
3/6✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 2 times.
|
4780 | PetscCall(PetscLogFlops(2.0*n*n*n*(m-1)+1.0*n*n)); |
| 622 | } | ||
| 623 | } else { | ||
| 624 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
|
55010 | for (i=0;i<n;i++) |
| 625 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
|
4421260 | for (j=0;j<n;j++) |
| 626 | #if defined(PETSC_USE_COMPLEX) | ||
| 627 |
3/4✓ Branch 0 taken 5 times.
✓ Branch 1 taken 5 times.
✓ Branch 2 taken 5 times.
✗ Branch 3 not taken.
|
2200145 | if (PetscRealPart(A[i+j*n])<0.0 || PetscImaginaryPart(A[i+j*n])!=0.0) { isrealpos = PETSC_FALSE; break; } |
| 628 | #else | ||
| 629 |
2/2✓ Branch 0 taken 5 times.
✓ Branch 1 taken 5 times.
|
2200145 | if (A[i+j*n]<0.0) { isrealpos = PETSC_FALSE; break; } |
| 630 | #endif | ||
| 631 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
|
460 | if (isrealpos) { /* for positive matrices only */ |
| 632 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
|
21030 | for (i=0;i<n;i++) v[i] = 1.0; |
| 633 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
|
4380 | for (j=0;j<m;j++) { /* w = A'*v */ |
| 634 |
10/20✓ Branch 0 taken 2 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✓ Branch 4 taken 2 times.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 2 times.
✓ Branch 8 taken 2 times.
✗ Branch 9 not taken.
✗ Branch 10 not taken.
✓ Branch 11 taken 2 times.
✓ Branch 12 taken 2 times.
✗ Branch 13 not taken.
✓ Branch 14 taken 2 times.
✗ Branch 15 not taken.
✗ Branch 16 not taken.
✓ Branch 17 taken 2 times.
✗ Branch 18 not taken.
✗ Branch 19 not taken.
|
4200 | PetscCallBLAS("BLASgemv",BLASgemv_("C",&n,&n,&sone,A,&n,v,&one,&szero,w,&one)); |
| 635 | 4200 | SlepcSwap(v,w,aux); | |
| 636 | } | ||
| 637 |
3/6✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 2 times.
|
180 | PetscCall(PetscLogFlops(2.0*n*n*m)); |
| 638 | 180 | *nrm = 0.0; | |
| 639 |
4/4✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
✓ Branch 2 taken 10 times.
✓ Branch 3 taken 10 times.
|
21030 | for (i=0;i<n;i++) if ((tmp = PetscAbsScalar(v[i])) > *nrm) *nrm = tmp; /* norm(v,inf) */ |
| 640 |
4/6✓ Branch 0 taken 2 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 2 times.
|
280 | } else PetscCall(SlepcNormEst1(n,A,m,work,rand,nrm)); |
| 641 | } | ||
| 642 |
6/12✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
✓ Branch 4 taken 2 times.
✗ Branch 5 not taken.
✓ Branch 6 taken 2 times.
✗ Branch 7 not taken.
✓ Branch 8 taken 2 times.
✗ Branch 9 not taken.
✗ Branch 10 not taken.
✓ Branch 11 taken 2 times.
|
1032 | PetscFunctionReturn(PETSC_SUCCESS); |
| 643 | } | ||
| 644 |