LCOV - code coverage report
Current view: top level - sys/classes/fn/impls/exp - fnexp.c (source / functions) Hit Total Coverage
Test: SLEPc Lines: 597 646 92.4 %
Date: 2024-03-28 00:52:16 Functions: 12 12 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
       3             :    SLEPc - Scalable Library for Eigenvalue Problem Computations
       4             :    Copyright (c) 2002-, Universitat Politecnica de Valencia, Spain
       5             : 
       6             :    This file is part of SLEPc.
       7             :    SLEPc is distributed under a 2-clause BSD license (see LICENSE).
       8             :    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
       9             : */
      10             : /*
      11             :    Exponential function  exp(x)
      12             : */
      13             : 
      14             : #include <slepc/private/fnimpl.h>      /*I "slepcfn.h" I*/
      15             : #include <slepcblaslapack.h>
      16             : 
      17        2117 : static PetscErrorCode FNEvaluateFunction_Exp(FN fn,PetscScalar x,PetscScalar *y)
      18             : {
      19        2117 :   PetscFunctionBegin;
      20        2117 :   *y = PetscExpScalar(x);
      21        2117 :   PetscFunctionReturn(PETSC_SUCCESS);
      22             : }
      23             : 
      24         676 : static PetscErrorCode FNEvaluateDerivative_Exp(FN fn,PetscScalar x,PetscScalar *y)
      25             : {
      26         676 :   PetscFunctionBegin;
      27         676 :   *y = PetscExpScalar(x);
      28         676 :   PetscFunctionReturn(PETSC_SUCCESS);
      29             : }
      30             : 
      31             : #define MAX_PADE 6
      32             : #define SWAP(a,b,t) do {t=a;a=b;b=t;} while (0)
      33             : 
      34          16 : static PetscErrorCode FNEvaluateFunctionMat_Exp_Pade(FN fn,Mat A,Mat B)
      35             : {
      36          16 :   PetscBLASInt      n=0,ld,ld2,*ipiv,info,inc=1;
      37          16 :   PetscInt          m,j,k,sexp;
      38          16 :   PetscBool         odd;
      39          16 :   const PetscInt    p=MAX_PADE;
      40          16 :   PetscReal         c[MAX_PADE+1],s,*rwork;
      41          16 :   PetscScalar       scale,mone=-1.0,one=1.0,two=2.0,zero=0.0;
      42          16 :   PetscScalar       *Ba,*As,*A2,*Q,*P,*W,*aux;
      43          16 :   const PetscScalar *Aa;
      44             : 
      45          16 :   PetscFunctionBegin;
      46          16 :   PetscCall(MatDenseGetArrayRead(A,&Aa));
      47          16 :   PetscCall(MatDenseGetArray(B,&Ba));
      48          16 :   PetscCall(MatGetSize(A,&m,NULL));
      49          16 :   PetscCall(PetscBLASIntCast(m,&n));
      50          16 :   ld  = n;
      51          16 :   ld2 = ld*ld;
      52          16 :   P   = Ba;
      53          16 :   PetscCall(PetscMalloc6(m*m,&Q,m*m,&W,m*m,&As,m*m,&A2,ld,&rwork,ld,&ipiv));
      54          16 :   PetscCall(PetscArraycpy(As,Aa,ld2));
      55             : 
      56             :   /* Pade' coefficients */
      57          16 :   c[0] = 1.0;
      58         112 :   for (k=1;k<=p;k++) c[k] = c[k-1]*(p+1-k)/(k*(2*p+1-k));
      59             : 
      60             :   /* Scaling */
      61          16 :   s = LAPACKlange_("I",&n,&n,As,&ld,rwork);
      62          16 :   PetscCall(PetscLogFlops(1.0*n*n));
      63          16 :   if (s>0.5) {
      64          16 :     sexp = PetscMax(0,(int)(PetscLogReal(s)/PetscLogReal(2.0))+2);
      65          16 :     scale = PetscPowRealInt(2.0,-sexp);
      66          16 :     PetscCallBLAS("BLASscal",BLASscal_(&ld2,&scale,As,&inc));
      67          16 :     PetscCall(PetscLogFlops(1.0*n*n));
      68             :   } else sexp = 0;
      69             : 
      70             :   /* Horner evaluation */
      71          16 :   PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,As,&ld,As,&ld,&zero,A2,&ld));
      72          16 :   PetscCall(PetscLogFlops(2.0*n*n*n));
      73          16 :   PetscCall(PetscArrayzero(Q,ld2));
      74          16 :   PetscCall(PetscArrayzero(P,ld2));
      75         616 :   for (j=0;j<n;j++) {
      76         600 :     Q[j+j*ld] = c[p];
      77         600 :     P[j+j*ld] = c[p-1];
      78             :   }
      79             : 
      80             :   odd = PETSC_TRUE;
      81          96 :   for (k=p-1;k>0;k--) {
      82          80 :     if (odd) {
      83          48 :       PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,Q,&ld,A2,&ld,&zero,W,&ld));
      84          48 :       SWAP(Q,W,aux);
      85        1848 :       for (j=0;j<n;j++) Q[j+j*ld] += c[k-1];
      86             :       odd = PETSC_FALSE;
      87             :     } else {
      88          32 :       PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,A2,&ld,&zero,W,&ld));
      89          32 :       SWAP(P,W,aux);
      90        1232 :       for (j=0;j<n;j++) P[j+j*ld] += c[k-1];
      91             :       odd = PETSC_TRUE;
      92             :     }
      93          80 :     PetscCall(PetscLogFlops(2.0*n*n*n));
      94             :   }
      95             :   /*if (odd) {
      96             :     PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,Q,&ld,As,&ld,&zero,W,&ld));
      97             :     SWAP(Q,W,aux);
      98             :     PetscCallBLAS("BLASaxpy",BLASaxpy_(&ld2,&mone,P,&inc,Q,&inc));
      99             :     PetscCallBLAS("LAPACKgesv",LAPACKgesv_(&n,&n,Q,&ld,ipiv,P,&ld,&info));
     100             :     SlepcCheckLapackInfo("gesv",info);
     101             :     PetscCallBLAS("BLASscal",BLASscal_(&ld2,&two,P,&inc));
     102             :     for (j=0;j<n;j++) P[j+j*ld] += 1.0;
     103             :     PetscCallBLAS("BLASscal",BLASscal_(&ld2,&mone,P,&inc));
     104             :   } else {*/
     105          16 :     PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,As,&ld,&zero,W,&ld));
     106          16 :     SWAP(P,W,aux);
     107          16 :     PetscCallBLAS("BLASaxpy",BLASaxpy_(&ld2,&mone,P,&inc,Q,&inc));
     108          16 :     PetscCallBLAS("LAPACKgesv",LAPACKgesv_(&n,&n,Q,&ld,ipiv,P,&ld,&info));
     109          16 :     SlepcCheckLapackInfo("gesv",info);
     110          16 :     PetscCallBLAS("BLASscal",BLASscal_(&ld2,&two,P,&inc));
     111         616 :     for (j=0;j<n;j++) P[j+j*ld] += 1.0;
     112             :   /*}*/
     113          16 :   PetscCall(PetscLogFlops(2.0*n*n*n+2.0*n*n*n/3.0+4.0*n*n));
     114             : 
     115          68 :   for (k=1;k<=sexp;k++) {
     116          52 :     PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,P,&ld,&zero,W,&ld));
     117          52 :     PetscCall(PetscArraycpy(P,W,ld2));
     118             :   }
     119          16 :   if (P!=Ba) PetscCall(PetscArraycpy(Ba,P,ld2));
     120          16 :   PetscCall(PetscLogFlops(2.0*n*n*n*sexp));
     121             : 
     122          16 :   PetscCall(PetscFree6(Q,W,As,A2,rwork,ipiv));
     123          16 :   PetscCall(MatDenseRestoreArrayRead(A,&Aa));
     124          16 :   PetscCall(MatDenseRestoreArray(B,&Ba));
     125          16 :   PetscFunctionReturn(PETSC_SUCCESS);
     126             : }
     127             : 
     128             : #if defined(PETSC_HAVE_COMPLEX)
     129             : /*
     130             :  * Set scaling factor (s) and Pade degree (k,m)
     131             :  */
     132          48 : static PetscErrorCode sexpm_params(PetscReal nrm,PetscInt *s,PetscInt *k,PetscInt *m)
     133             : {
     134          48 :   PetscFunctionBegin;
     135          48 :   if (nrm>1) {
     136          32 :     if      (nrm<200)  {*s = 4; *k = 5; *m = *k-1;}
     137           4 :     else if (nrm<1e4)  {*s = 4; *k = 4; *m = *k+1;}
     138           0 :     else if (nrm<1e6)  {*s = 4; *k = 3; *m = *k+1;}
     139           0 :     else if (nrm<1e9)  {*s = 3; *k = 3; *m = *k+1;}
     140           0 :     else if (nrm<1e11) {*s = 2; *k = 3; *m = *k+1;}
     141           0 :     else if (nrm<1e12) {*s = 2; *k = 2; *m = *k+1;}
     142           0 :     else if (nrm<1e14) {*s = 2; *k = 1; *m = *k+1;}
     143           0 :     else               {*s = 1; *k = 1; *m = *k+1;}
     144             :   } else { /* nrm<1 */
     145          16 :     if       (nrm>0.5)  {*s = 4; *k = 4; *m = *k-1;}
     146          12 :     else  if (nrm>0.3)  {*s = 3; *k = 4; *m = *k-1;}
     147           8 :     else  if (nrm>0.15) {*s = 2; *k = 4; *m = *k-1;}
     148           8 :     else  if (nrm>0.07) {*s = 1; *k = 4; *m = *k-1;}
     149           8 :     else  if (nrm>0.01) {*s = 0; *k = 4; *m = *k-1;}
     150           8 :     else  if (nrm>3e-4) {*s = 0; *k = 3; *m = *k-1;}
     151           8 :     else  if (nrm>1e-5) {*s = 0; *k = 3; *m = 0;}
     152           8 :     else  if (nrm>1e-8) {*s = 0; *k = 2; *m = 0;}
     153           8 :     else                {*s = 0; *k = 1; *m = 0;}
     154             :   }
     155          48 :   PetscFunctionReturn(PETSC_SUCCESS);
     156             : }
     157             : 
     158             : /*
     159             :  * Partial fraction form coefficients.
     160             :  * If query, the function returns the size necessary to store the coefficients.
     161             :  */
     162          40 : static PetscErrorCode getcoeffs(PetscInt k,PetscInt m,PetscComplex *r,PetscComplex *q,PetscComplex *remain,PetscBool query)
     163             : {
     164          40 :   PetscInt i;
     165          40 :   const PetscComplex /* m == k+1 */
     166          40 :     p1r4[5] = {-1.582680186458572e+01 - 2.412564578224361e+01*PETSC_i,
     167          40 :                -1.582680186458572e+01 + 2.412564578224361e+01*PETSC_i,
     168          40 :                 1.499984465975511e+02 + 6.804227952202417e+01*PETSC_i,
     169          40 :                 1.499984465975511e+02 - 6.804227952202417e+01*PETSC_i,
     170             :                -2.733432894659307e+02                                },
     171          40 :     p1q4[5] = { 3.655694325463550e+00 + 6.543736899360086e+00*PETSC_i,
     172          40 :                 3.655694325463550e+00 - 6.543736899360086e+00*PETSC_i,
     173          40 :                 5.700953298671832e+00 + 3.210265600308496e+00*PETSC_i,
     174          40 :                 5.700953298671832e+00 - 3.210265600308496e+00*PETSC_i,
     175             :                 6.286704751729261e+00                               },
     176          40 :     p1r3[4] = {-1.130153999597152e+01 + 1.247167585025031e+01*PETSC_i,
     177          40 :                -1.130153999597152e+01 - 1.247167585025031e+01*PETSC_i,
     178          40 :                 1.330153999597152e+01 - 6.007173273704750e+01*PETSC_i,
     179          40 :                 1.330153999597152e+01 + 6.007173273704750e+01*PETSC_i},
     180          40 :     p1q3[4] = { 3.212806896871536e+00 + 4.773087433276636e+00*PETSC_i,
     181          40 :                 3.212806896871536e+00 - 4.773087433276636e+00*PETSC_i,
     182          40 :                 4.787193103128464e+00 + 1.567476416895212e+00*PETSC_i,
     183          40 :                 4.787193103128464e+00 - 1.567476416895212e+00*PETSC_i},
     184          40 :     p1r2[3] = { 7.648749087422928e+00 + 4.171640244747463e+00*PETSC_i,
     185          40 :                 7.648749087422928e+00 - 4.171640244747463e+00*PETSC_i,
     186             :                -1.829749817484586e+01                                },
     187          40 :     p1q2[3] = { 2.681082873627756e+00 + 3.050430199247411e+00*PETSC_i,
     188          40 :                 2.681082873627756e+00 - 3.050430199247411e+00*PETSC_i,
     189             :                 3.637834252744491e+00                                },
     190          40 :     p1r1[2] = { 1.000000000000000e+00 - 3.535533905932738e+00*PETSC_i,
     191          40 :                 1.000000000000000e+00 + 3.535533905932738e+00*PETSC_i},
     192          40 :     p1q1[2] = { 2.000000000000000e+00 + 1.414213562373095e+00*PETSC_i,
     193          40 :                 2.000000000000000e+00 - 1.414213562373095e+00*PETSC_i};
     194          40 :   const PetscComplex /* m == k-1 */
     195          40 :     m1r5[4] = {-1.423367961376821e+02 - 1.385465094833037e+01*PETSC_i,
     196          40 :                -1.423367961376821e+02 + 1.385465094833037e+01*PETSC_i,
     197          40 :                 2.647367961376822e+02 - 4.814394493714596e+02*PETSC_i,
     198          40 :                 2.647367961376822e+02 + 4.814394493714596e+02*PETSC_i},
     199          40 :     m1q5[4] = { 5.203941240131764e+00 + 5.805856841805367e+00*PETSC_i,
     200          40 :                 5.203941240131764e+00 - 5.805856841805367e+00*PETSC_i,
     201          40 :                 6.796058759868242e+00 + 1.886649260140217e+00*PETSC_i,
     202          40 :                 6.796058759868242e+00 - 1.886649260140217e+00*PETSC_i},
     203          40 :     m1r4[3] = { 2.484269593165883e+01 + 7.460342395992306e+01*PETSC_i,
     204          40 :                 2.484269593165883e+01 - 7.460342395992306e+01*PETSC_i,
     205             :                -1.734353918633177e+02                                },
     206          40 :     m1q4[3] = { 4.675757014491557e+00 + 3.913489560603711e+00*PETSC_i,
     207          40 :                 4.675757014491557e+00 - 3.913489560603711e+00*PETSC_i,
     208             :                 5.648485971016893e+00                                },
     209          40 :     m1r3[2] = { 2.533333333333333e+01 - 2.733333333333333e+01*PETSC_i,
     210          40 :                 2.533333333333333e+01 + 2.733333333333333e+01*PETSC_i},
     211          40 :     m1q3[2] = { 4.000000000000000e+00 + 2.000000000000000e+00*PETSC_i,
     212          40 :                 4.000000000000000e+00 - 2.000000000000000e+00*PETSC_i};
     213          40 :   const PetscScalar /* m == k-1 */
     214          40 :     m1remain5[2] = { 2.000000000000000e-01,  9.800000000000000e+00},
     215          40 :     m1remain4[2] = {-2.500000000000000e-01, -7.750000000000000e+00},
     216          40 :     m1remain3[2] = { 3.333333333333333e-01,  5.666666666666667e+00},
     217          40 :     m1remain2[2] = {-0.5,                   -3.5},
     218          40 :     remain3[4] = {1.0/6.0, 1.0/2.0, 1, 1},
     219          40 :     remain2[3] = {1.0/2.0, 1, 1};
     220             : 
     221          40 :   PetscFunctionBegin;
     222          40 :   if (query) { /* query about buffer's size */
     223          20 :     if (m==k+1) {
     224           2 :       *remain = 0;
     225           2 :       *r = *q = k+1;
     226           2 :       PetscFunctionReturn(PETSC_SUCCESS); /* quick return */
     227             :     }
     228          18 :     if (m==k-1) {
     229          18 :       *remain = 2;
     230          18 :       if (k==5) *r = *q = 4;
     231           4 :       else if (k==4) *r = *q = 3;
     232           0 :       else if (k==3) *r = *q = 2;
     233           0 :       else if (k==2) *r = *q = 1;
     234             :     }
     235          18 :     if (m==0) {
     236           0 :       *r = *q = 0;
     237           0 :       *remain = k+1;
     238             :     }
     239             :   } else {
     240          20 :     if (m==k+1) {
     241           2 :       if (k==4) {
     242          12 :         for (i=0;i<5;i++) { r[i] = p1r4[i]; q[i] = p1q4[i]; }
     243           0 :       } else if (k==3) {
     244           0 :         for (i=0;i<4;i++) { r[i] = p1r3[i]; q[i] = p1q3[i]; }
     245           0 :       } else if (k==2) {
     246           0 :         for (i=0;i<3;i++) { r[i] = p1r2[i]; q[i] = p1q2[i]; }
     247           0 :       } else if (k==1) {
     248           0 :         for (i=0;i<2;i++) { r[i] = p1r1[i]; q[i] = p1q1[i]; }
     249             :       }
     250           2 :       PetscFunctionReturn(PETSC_SUCCESS); /* quick return */
     251             :     }
     252          18 :     if (m==k-1) {
     253          18 :       if (k==5) {
     254          70 :         for (i=0;i<4;i++) { r[i] = m1r5[i]; q[i] = m1q5[i]; }
     255          42 :         for (i=0;i<2;i++) remain[i] = m1remain5[i];
     256           4 :       } else if (k==4) {
     257          16 :         for (i=0;i<3;i++) { r[i] = m1r4[i]; q[i] = m1q4[i]; }
     258          12 :         for (i=0;i<2;i++) remain[i] = m1remain4[i];
     259           0 :       } else if (k==3) {
     260           0 :         for (i=0;i<2;i++) { r[i] = m1r3[i]; q[i] = m1q3[i]; remain[i] = m1remain3[i]; }
     261           0 :       } else if (k==2) {
     262           0 :         r[0] = -13.5; q[0] = 3;
     263           0 :         for (i=0;i<2;i++) remain[i] = m1remain2[i];
     264             :       }
     265             :     }
     266          18 :     if (m==0) {
     267           0 :       r = q = NULL;
     268           0 :       if (k==3) {
     269           0 :         for (i=0;i<4;i++) remain[i] = remain3[i];
     270           0 :       } else if (k==2) {
     271           0 :         for (i=0;i<3;i++) remain[i] = remain2[i];
     272             :       }
     273             :     }
     274             :   }
     275          36 :   PetscFunctionReturn(PETSC_SUCCESS);
     276             : }
     277             : 
     278             : /*
     279             :  * Product form coefficients.
     280             :  * If query, the function returns the size necessary to store the coefficients.
     281             :  */
     282          40 : static PetscErrorCode getcoeffsproduct(PetscInt k,PetscInt m,PetscComplex *p,PetscComplex *q,PetscComplex *mult,PetscBool query)
     283             : {
     284          40 :   PetscInt i;
     285          40 :   const PetscComplex /* m == k+1 */
     286          40 :   p1p4[4] = {-5.203941240131764e+00 + 5.805856841805367e+00*PETSC_i,
     287          40 :              -5.203941240131764e+00 - 5.805856841805367e+00*PETSC_i,
     288          40 :              -6.796058759868242e+00 + 1.886649260140217e+00*PETSC_i,
     289          40 :              -6.796058759868242e+00 - 1.886649260140217e+00*PETSC_i},
     290          40 :   p1q4[5] = { 3.655694325463550e+00 + 6.543736899360086e+00*PETSC_i,
     291          40 :               3.655694325463550e+00 - 6.543736899360086e+00*PETSC_i,
     292             :               6.286704751729261e+00                                ,
     293          40 :               5.700953298671832e+00 + 3.210265600308496e+00*PETSC_i,
     294          40 :               5.700953298671832e+00 - 3.210265600308496e+00*PETSC_i},
     295          40 :   p1p3[3] = {-4.675757014491557e+00 + 3.913489560603711e+00*PETSC_i,
     296          40 :              -4.675757014491557e+00 - 3.913489560603711e+00*PETSC_i,
     297             :              -5.648485971016893e+00                                },
     298          40 :   p1q3[4] = { 3.212806896871536e+00 + 4.773087433276636e+00*PETSC_i,
     299          40 :               3.212806896871536e+00 - 4.773087433276636e+00*PETSC_i,
     300          40 :               4.787193103128464e+00 + 1.567476416895212e+00*PETSC_i,
     301          40 :               4.787193103128464e+00 - 1.567476416895212e+00*PETSC_i},
     302          40 :   p1p2[2] = {-4.00000000000000e+00  + 2.000000000000000e+00*PETSC_i,
     303          40 :              -4.00000000000000e+00  - 2.000000000000000e+00*PETSC_i},
     304          40 :   p1q2[3] = { 2.681082873627756e+00 + 3.050430199247411e+00*PETSC_i,
     305          40 :               2.681082873627756e+00 - 3.050430199247411e+00*PETSC_i,
     306             :               3.637834252744491e+00                               },
     307          40 :   p1q1[2] = { 2.000000000000000e+00 + 1.414213562373095e+00*PETSC_i,
     308          40 :               2.000000000000000e+00 - 1.414213562373095e+00*PETSC_i};
     309          40 :   const PetscComplex /* m == k-1 */
     310          40 :   m1p5[5] = {-3.655694325463550e+00 + 6.543736899360086e+00*PETSC_i,
     311          40 :              -3.655694325463550e+00 - 6.543736899360086e+00*PETSC_i,
     312             :              -6.286704751729261e+00                                ,
     313          40 :              -5.700953298671832e+00 + 3.210265600308496e+00*PETSC_i,
     314          40 :              -5.700953298671832e+00 - 3.210265600308496e+00*PETSC_i},
     315          40 :   m1q5[4] = { 5.203941240131764e+00 + 5.805856841805367e+00*PETSC_i,
     316          40 :               5.203941240131764e+00 - 5.805856841805367e+00*PETSC_i,
     317          40 :               6.796058759868242e+00 + 1.886649260140217e+00*PETSC_i,
     318          40 :               6.796058759868242e+00 - 1.886649260140217e+00*PETSC_i},
     319          40 :   m1p4[4] = {-3.212806896871536e+00 + 4.773087433276636e+00*PETSC_i,
     320          40 :              -3.212806896871536e+00 - 4.773087433276636e+00*PETSC_i,
     321          40 :              -4.787193103128464e+00 + 1.567476416895212e+00*PETSC_i,
     322          40 :              -4.787193103128464e+00 - 1.567476416895212e+00*PETSC_i},
     323          40 :   m1q4[3] = { 4.675757014491557e+00 + 3.913489560603711e+00*PETSC_i,
     324          40 :               4.675757014491557e+00 - 3.913489560603711e+00*PETSC_i,
     325             :               5.648485971016893e+00                                },
     326          40 :   m1p3[3] = {-2.681082873627756e+00 + 3.050430199247411e+00*PETSC_i,
     327          40 :              -2.681082873627756e+00 - 3.050430199247411e+00*PETSC_i,
     328             :              -3.637834252744491e+00                                },
     329          40 :   m1q3[2] = { 4.000000000000000e+00 + 2.000000000000000e+00*PETSC_i,
     330          40 :               4.000000000000000e+00 - 2.000000000000001e+00*PETSC_i},
     331          40 :   m1p2[2] = {-2.000000000000000e+00 + 1.414213562373095e+00*PETSC_i,
     332          40 :              -2.000000000000000e+00 - 1.414213562373095e+00*PETSC_i};
     333             : 
     334          40 :   PetscFunctionBegin;
     335          40 :   if (query) {
     336          20 :     if (m == k+1) {
     337           2 :       *mult = 1;
     338           2 :       *p = k;
     339           2 :       *q = k+1;
     340           2 :       PetscFunctionReturn(PETSC_SUCCESS);
     341             :     }
     342          18 :     if (m==k-1) {
     343          18 :       *mult = 1;
     344          18 :       *p = k;
     345          18 :       *q = k-1;
     346             :     }
     347             :   } else {
     348          20 :     if (m == k+1) {
     349           2 :       *mult = PetscPowInt(-1,m);
     350           2 :       *mult *= m;
     351           2 :       if (k==4) {
     352          10 :         for (i=0;i<4;i++) { p[i] = p1p4[i]; q[i] = p1q4[i]; }
     353           2 :         q[4] = p1q4[4];
     354           0 :       } else if (k==3) {
     355           0 :         for (i=0;i<3;i++) { p[i] = p1p3[i]; q[i] = p1q3[i]; }
     356           0 :         q[3] = p1q3[3];
     357           0 :       } else if (k==2) {
     358           0 :         for (i=0;i<2;i++) { p[i] = p1p2[i]; q[i] = p1q2[i]; }
     359           0 :         q[2] = p1q2[2];
     360           0 :       } else if (k==1) {
     361           0 :         p[0] = -3;
     362           0 :         for (i=0;i<2;i++) q[i] = p1q1[i];
     363             :       }
     364           2 :       PetscFunctionReturn(PETSC_SUCCESS);
     365             :     }
     366          18 :     if (m==k-1) {
     367          18 :       *mult = PetscPowInt(-1,m);
     368          18 :       *mult /= k;
     369          18 :       if (k==5) {
     370          70 :         for (i=0;i<4;i++) { p[i] = m1p5[i]; q[i] = m1q5[i]; }
     371          14 :         p[4] = m1p5[4];
     372           4 :       } else if (k==4) {
     373          16 :         for (i=0;i<3;i++) { p[i] = m1p4[i]; q[i] = m1q4[i]; }
     374           4 :         p[3] = m1p4[3];
     375           0 :       } else if (k==3) {
     376           0 :         for (i=0;i<2;i++) { p[i] = m1p3[i]; q[i] = m1q3[i]; }
     377           0 :         p[2] = m1p3[2];
     378           0 :       } else if (k==2) {
     379           0 :         for (i=0;i<2;i++) p[i] = m1p2[i];
     380           0 :         q[0] = 3;
     381             :       }
     382             :     }
     383             :   }
     384          36 :   PetscFunctionReturn(PETSC_SUCCESS);
     385             : }
     386             : #endif /* PETSC_HAVE_COMPLEX */
     387             : 
     388             : #if defined(PETSC_USE_COMPLEX)
     389             : static PetscErrorCode getisreal(PetscInt n,PetscComplex *a,PetscBool *result)
     390             : {
     391             :   PetscInt i;
     392             : 
     393             :   PetscFunctionBegin;
     394             :   *result=PETSC_TRUE;
     395             :   for (i=0;i<n&&*result;i++) {
     396             :     if (PetscImaginaryPartComplex(a[i])) *result=PETSC_FALSE;
     397             :   }
     398             :   PetscFunctionReturn(PETSC_SUCCESS);
     399             : }
     400             : #endif
     401             : 
     402             : /*
     403             :  * Matrix exponential implementation based on algorithm and matlab code by Stefan Guettel
     404             :  * and Yuji Nakatsukasa
     405             :  *
     406             :  *     Stefan Guettel and Yuji Nakatsukasa, "Scaled and Squared Subdiagonal Pade
     407             :  *     Approximation for the Matrix Exponential",
     408             :  *     SIAM J. Matrix Anal. Appl. 37(1):145-170, 2016.
     409             :  *     https://doi.org/10.1137/15M1027553
     410             :  */
     411          48 : static PetscErrorCode FNEvaluateFunctionMat_Exp_GuettelNakatsukasa(FN fn,Mat A,Mat B)
     412             : {
     413             : #if !defined(PETSC_HAVE_COMPLEX)
     414             :   PetscFunctionBegin;
     415             :   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This function requires C99 or C++ complex support");
     416             : #else
     417          48 :   PetscInt          i,j,n_,s,k,m,mod;
     418          48 :   PetscBLASInt      n=0,n2=0,irsize=0,rsizediv2,ipsize=0,iremainsize=0,info,*piv,minlen,lwork=0,one=1;
     419          48 :   PetscReal         nrm,shift=0.0;
     420             : #if defined(PETSC_USE_COMPLEX)
     421             :   PetscReal         *rwork=NULL;
     422             : #endif
     423          48 :   PetscComplex      *As,*RR,*RR2,*expmA,*expmA2,*Maux,*Maux2,rsize,*r,psize,*p,remainsize,*remainterm,*rootp,*rootq,mult=0.0,scale,cone=1.0,czero=0.0,*aux;
     424          48 :   PetscScalar       *Ba,*Ba2,*sMaux,*wr,*wi,expshift,sone=1.0,szero=0.0,*saux;
     425          48 :   const PetscScalar *Aa;
     426          48 :   PetscBool         isreal,flg;
     427          48 :   PetscBLASInt      query=-1;
     428          48 :   PetscScalar       work1,*work;
     429             : 
     430          48 :   PetscFunctionBegin;
     431          48 :   PetscCall(MatGetSize(A,&n_,NULL));
     432          48 :   PetscCall(PetscBLASIntCast(n_,&n));
     433          48 :   PetscCall(MatDenseGetArrayRead(A,&Aa));
     434          48 :   PetscCall(MatDenseGetArray(B,&Ba));
     435          48 :   Ba2 = Ba;
     436          48 :   PetscCall(PetscBLASIntCast(n*n,&n2));
     437             : 
     438          48 :   PetscCall(PetscMalloc2(n2,&sMaux,n2,&Maux));
     439          48 :   Maux2 = Maux;
     440          48 :   PetscCall(PetscOptionsGetReal(NULL,NULL,"-fn_expm_estimated_eig",&shift,&flg));
     441          48 :   if (!flg) {
     442          48 :     PetscCall(PetscMalloc2(n,&wr,n,&wi));
     443          48 :     PetscCall(PetscArraycpy(sMaux,Aa,n2));
     444             :     /* estimate rightmost eigenvalue and shift A with it */
     445             : #if !defined(PETSC_USE_COMPLEX)
     446          48 :     PetscCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,sMaux,&n,wr,wi,NULL,&n,NULL,&n,&work1,&query,&info));
     447          48 :     SlepcCheckLapackInfo("geev",info);
     448          48 :     PetscCall(PetscBLASIntCast((PetscInt)work1,&lwork));
     449          48 :     PetscCall(PetscMalloc1(lwork,&work));
     450          48 :     PetscCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,sMaux,&n,wr,wi,NULL,&n,NULL,&n,work,&lwork,&info));
     451          48 :     PetscCall(PetscFree(work));
     452             : #else
     453             :     PetscCall(PetscArraycpy(Maux,Aa,n2));
     454             :     PetscCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,Maux,&n,wr,NULL,&n,NULL,&n,&work1,&query,rwork,&info));
     455             :     SlepcCheckLapackInfo("geev",info);
     456             :     PetscCall(PetscBLASIntCast((PetscInt)PetscRealPart(work1),&lwork));
     457             :     PetscCall(PetscMalloc2(2*n,&rwork,lwork,&work));
     458             :     PetscCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,Maux,&n,wr,NULL,&n,NULL,&n,work,&lwork,rwork,&info));
     459             :     PetscCall(PetscFree2(rwork,work));
     460             : #endif
     461          48 :     SlepcCheckLapackInfo("geev",info);
     462          48 :     PetscCall(PetscLogFlops(25.0*n*n*n+(n*n*n)/3.0+1.0*n*n*n));
     463             : 
     464          48 :     shift = PetscRealPart(wr[0]);
     465        1360 :     for (i=1;i<n;i++) {
     466        1312 :       if (PetscRealPart(wr[i]) > shift) shift = PetscRealPart(wr[i]);
     467             :     }
     468          48 :     PetscCall(PetscFree2(wr,wi));
     469             :   }
     470             :   /* shift so that largest real part is (about) 0 */
     471          48 :   PetscCall(PetscArraycpy(sMaux,Aa,n2));
     472          48 :   if (shift) {
     473        1408 :     for (i=0;i<n;i++) sMaux[i+i*n] -= shift;
     474          48 :     PetscCall(PetscLogFlops(1.0*n));
     475             :   }
     476             : #if defined(PETSC_USE_COMPLEX)
     477             :   PetscCall(PetscArraycpy(Maux,Aa,n2));
     478             :   if (shift) {
     479             :     for (i=0;i<n;i++) Maux[i+i*n] -= shift;
     480             :     PetscCall(PetscLogFlops(1.0*n));
     481             :   }
     482             : #endif
     483             : 
     484             :   /* estimate norm(A) and select the scaling factor */
     485          48 :   nrm = LAPACKlange_("O",&n,&n,sMaux,&n,NULL);
     486          48 :   PetscCall(PetscLogFlops(1.0*n*n));
     487          48 :   PetscCall(sexpm_params(nrm,&s,&k,&m));
     488          48 :   if (s==0 && k==1 && m==0) { /* exp(A) = I+A to eps! */
     489           8 :     if (shift) expshift = PetscExpReal(shift);
     490          88 :     for (i=0;i<n;i++) sMaux[i+i*n] += 1.0;
     491           8 :     if (shift) {
     492           8 :       PetscCallBLAS("BLASscal",BLASscal_(&n2,&expshift,sMaux,&one));
     493           8 :       PetscCall(PetscLogFlops(1.0*(n+n2)));
     494           0 :     } else PetscCall(PetscLogFlops(1.0*n));
     495           8 :     PetscCall(PetscArraycpy(Ba,sMaux,n2));
     496           8 :     PetscCall(PetscFree2(sMaux,Maux));
     497           8 :     PetscCall(MatDenseRestoreArrayRead(A,&Aa));
     498           8 :     PetscCall(MatDenseRestoreArray(B,&Ba));
     499           8 :     PetscFunctionReturn(PETSC_SUCCESS); /* quick return */
     500             :   }
     501             : 
     502          40 :   PetscCall(PetscMalloc4(n2,&expmA,n2,&As,n2,&RR,n,&piv));
     503          40 :   expmA2 = expmA; RR2 = RR;
     504             :   /* scale matrix */
     505             : #if !defined(PETSC_USE_COMPLEX)
     506      118440 :   for (i=0;i<n2;i++) {
     507      118400 :     As[i] = sMaux[i];
     508             :   }
     509             : #else
     510             :   PetscCall(PetscArraycpy(As,sMaux,n2));
     511             : #endif
     512          40 :   scale = 1.0/PetscPowRealInt(2.0,s);
     513          40 :   PetscCallBLAS("BLASCOMPLEXscal",BLASCOMPLEXscal_(&n2,&scale,As,&one));
     514          40 :   PetscCall(SlepcLogFlopsComplex(1.0*n2));
     515             : 
     516             :   /* evaluate Pade approximant (partial fraction or product form) */
     517          40 :   if (fn->method==3 || !m) { /* partial fraction */
     518          20 :     PetscCall(getcoeffs(k,m,&rsize,&psize,&remainsize,PETSC_TRUE));
     519          20 :     PetscCall(PetscBLASIntCast((PetscInt)PetscRealPartComplex(rsize),&irsize));
     520          20 :     PetscCall(PetscBLASIntCast((PetscInt)PetscRealPartComplex(psize),&ipsize));
     521          20 :     PetscCall(PetscBLASIntCast((PetscInt)PetscRealPartComplex(remainsize),&iremainsize));
     522          20 :     PetscCall(PetscMalloc3(irsize,&r,ipsize,&p,iremainsize,&remainterm));
     523          20 :     PetscCall(getcoeffs(k,m,r,p,remainterm,PETSC_FALSE));
     524             : 
     525          20 :     PetscCall(PetscArrayzero(expmA,n2));
     526             : #if !defined(PETSC_USE_COMPLEX)
     527          20 :     isreal = PETSC_TRUE;
     528             : #else
     529             :     PetscCall(getisreal(n2,Maux,&isreal));
     530             : #endif
     531          20 :     if (isreal) {
     532          20 :       rsizediv2 = irsize/2;
     533          56 :       for (i=0;i<rsizediv2;i++) { /* use partial fraction to get R(As) */
     534          36 :         PetscCall(PetscArraycpy(Maux,As,n2));
     535          36 :         PetscCall(PetscArrayzero(RR,n2));
     536        1276 :         for (j=0;j<n;j++) {
     537        1240 :           Maux[j+j*n] -= p[2*i];
     538        1240 :           RR[j+j*n] = r[2*i];
     539             :         }
     540          36 :         PetscCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,Maux,&n,piv,RR,&n,&info));
     541          36 :         SlepcCheckLapackInfo("gesv",info);
     542      118036 :         for (j=0;j<n2;j++) {
     543      118000 :           expmA[j] += RR[j] + PetscConj(RR[j]);
     544             :         }
     545             :         /* loop(n) + gesv + loop(n2) */
     546          36 :         PetscCall(SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+2.0*n2));
     547             :       }
     548             : 
     549          20 :       mod = ipsize % 2;
     550          20 :       if (mod) {
     551           6 :         PetscCall(PetscArraycpy(Maux,As,n2));
     552           6 :         PetscCall(PetscArrayzero(RR,n2));
     553          66 :         for (j=0;j<n;j++) {
     554          60 :           Maux[j+j*n] -= p[ipsize-1];
     555          60 :           RR[j+j*n] = r[irsize-1];
     556             :         }
     557           6 :         PetscCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,Maux,&n,piv,RR,&n,&info));
     558           6 :         SlepcCheckLapackInfo("gesv",info);
     559         606 :         for (j=0;j<n2;j++) {
     560         600 :           expmA[j] += RR[j];
     561             :         }
     562           6 :         PetscCall(SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+1.0*n2));
     563             :       }
     564             :     } else { /* complex */
     565             :       for (i=0;i<irsize;i++) { /* use partial fraction to get R(As) */
     566             :         PetscCall(PetscArraycpy(Maux,As,n2));
     567             :         PetscCall(PetscArrayzero(RR,n2));
     568             :         for (j=0;j<n;j++) {
     569             :           Maux[j+j*n] -= p[i];
     570             :           RR[j+j*n] = r[i];
     571             :         }
     572             :         PetscCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,Maux,&n,piv,RR,&n,&info));
     573             :         SlepcCheckLapackInfo("gesv",info);
     574             :         for (j=0;j<n2;j++) {
     575             :           expmA[j] += RR[j];
     576             :         }
     577             :         PetscCall(SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+1.0*n2));
     578             :       }
     579             :     }
     580          56 :     for (i=0;i<iremainsize;i++) {
     581          36 :       if (!i) {
     582          18 :         PetscCall(PetscArrayzero(RR,n2));
     583         638 :         for (j=0;j<n;j++) {
     584         620 :           RR[j+j*n] = remainterm[iremainsize-1];
     585             :         }
     586             :       } else {
     587          18 :         PetscCall(PetscArraycpy(RR,As,n2));
     588          18 :         for (j=1;j<i;j++) {
     589           0 :           PetscCallBLAS("BLASCOMPLEXgemm",BLASCOMPLEXgemm_("N","N",&n,&n,&n,&cone,RR,&n,RR,&n,&czero,Maux,&n));
     590           0 :           SWAP(RR,Maux,aux);
     591           0 :           PetscCall(SlepcLogFlopsComplex(2.0*n*n*n));
     592             :         }
     593          18 :         PetscCallBLAS("BLASCOMPLEXscal",BLASCOMPLEXscal_(&n2,&remainterm[iremainsize-1-i],RR,&one));
     594          18 :         PetscCall(SlepcLogFlopsComplex(1.0*n2));
     595             :       }
     596      118036 :       for (j=0;j<n2;j++) {
     597      118000 :         expmA[j] += RR[j];
     598             :       }
     599          36 :       PetscCall(SlepcLogFlopsComplex(1.0*n2));
     600             :     }
     601          20 :     PetscCall(PetscFree3(r,p,remainterm));
     602             :   } else { /* product form, default */
     603          20 :     PetscCall(getcoeffsproduct(k,m,&rsize,&psize,&mult,PETSC_TRUE));
     604          20 :     PetscCall(PetscBLASIntCast((PetscInt)PetscRealPartComplex(rsize),&irsize));
     605          20 :     PetscCall(PetscBLASIntCast((PetscInt)PetscRealPartComplex(psize),&ipsize));
     606          20 :     PetscCall(PetscMalloc2(irsize,&rootp,ipsize,&rootq));
     607          20 :     PetscCall(getcoeffsproduct(k,m,rootp,rootq,&mult,PETSC_FALSE));
     608             : 
     609          20 :     PetscCall(PetscArrayzero(expmA,n2));
     610         660 :     for (i=0;i<n;i++) { /* initialize */
     611         640 :       expmA[i+i*n] = 1.0;
     612             :     }
     613          20 :     minlen = PetscMin(irsize,ipsize);
     614          96 :     for (i=0;i<minlen;i++) {
     615          76 :       PetscCall(PetscArraycpy(RR,As,n2));
     616        2596 :       for (j=0;j<n;j++) {
     617        2520 :         RR[j+j*n] -= rootp[i];
     618             :       }
     619          76 :       PetscCallBLAS("BLASCOMPLEXgemm",BLASCOMPLEXgemm_("N","N",&n,&n,&n,&cone,RR,&n,expmA,&n,&czero,Maux,&n));
     620          76 :       SWAP(expmA,Maux,aux);
     621          76 :       PetscCall(PetscArraycpy(RR,As,n2));
     622        2596 :       for (j=0;j<n;j++) {
     623        2520 :         RR[j+j*n] -= rootq[i];
     624             :       }
     625          76 :       PetscCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,RR,&n,piv,expmA,&n,&info));
     626          76 :       SlepcCheckLapackInfo("gesv",info);
     627             :       /* loop(n) + gemm + loop(n) + gesv */
     628          76 :       PetscCall(SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n)+1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)));
     629             :     }
     630             :     /* extra numerator */
     631          38 :     for (i=minlen;i<irsize;i++) {
     632          18 :       PetscCall(PetscArraycpy(RR,As,n2));
     633         638 :       for (j=0;j<n;j++) {
     634         620 :         RR[j+j*n] -= rootp[i];
     635             :       }
     636          18 :       PetscCallBLAS("BLASCOMPLEXgemm",BLASCOMPLEXgemm_("N","N",&n,&n,&n,&cone,RR,&n,expmA,&n,&czero,Maux,&n));
     637          18 :       SWAP(expmA,Maux,aux);
     638          18 :       PetscCall(SlepcLogFlopsComplex(1.0*n+2.0*n*n*n));
     639             :     }
     640             :     /* extra denominator */
     641          22 :     for (i=minlen;i<ipsize;i++) {
     642           2 :       PetscCall(PetscArraycpy(RR,As,n2));
     643          22 :       for (j=0;j<n;j++) RR[j+j*n] -= rootq[i];
     644           2 :       PetscCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,RR,&n,piv,expmA,&n,&info));
     645           2 :       SlepcCheckLapackInfo("gesv",info);
     646           2 :       PetscCall(SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)));
     647             :     }
     648          20 :     PetscCallBLAS("BLASCOMPLEXscal",BLASCOMPLEXscal_(&n2,&mult,expmA,&one));
     649          20 :     PetscCall(SlepcLogFlopsComplex(1.0*n2));
     650          20 :     PetscCall(PetscFree2(rootp,rootq));
     651             :   }
     652             : 
     653             : #if !defined(PETSC_USE_COMPLEX)
     654      118440 :   for (i=0;i<n2;i++) {
     655      118400 :     Ba2[i] = PetscRealPartComplex(expmA[i]);
     656             :   }
     657             : #else
     658             :   PetscCall(PetscArraycpy(Ba2,expmA,n2));
     659             : #endif
     660             : 
     661             :   /* perform repeated squaring */
     662         196 :   for (i=0;i<s;i++) { /* final squaring */
     663         156 :     PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&sone,Ba2,&n,Ba2,&n,&szero,sMaux,&n));
     664         156 :     SWAP(Ba2,sMaux,saux);
     665         156 :     PetscCall(PetscLogFlops(2.0*n*n*n));
     666             :   }
     667          40 :   if (Ba2!=Ba) {
     668           4 :     PetscCall(PetscArraycpy(Ba,Ba2,n2));
     669           4 :     sMaux = Ba2;
     670             :   }
     671          40 :   if (shift) {
     672          40 :     expshift = PetscExpReal(shift);
     673          40 :     PetscCallBLAS("BLASscal",BLASscal_(&n2,&expshift,Ba,&one));
     674          40 :     PetscCall(PetscLogFlops(1.0*n2));
     675             :   }
     676             : 
     677             :   /* restore pointers */
     678          40 :   Maux = Maux2; expmA = expmA2; RR = RR2;
     679          40 :   PetscCall(PetscFree2(sMaux,Maux));
     680          40 :   PetscCall(PetscFree4(expmA,As,RR,piv));
     681          40 :   PetscCall(MatDenseRestoreArrayRead(A,&Aa));
     682          40 :   PetscCall(MatDenseRestoreArray(B,&Ba));
     683          40 :   PetscFunctionReturn(PETSC_SUCCESS);
     684             : #endif
     685             : }
     686             : 
     687             : #define SMALLN 100
     688             : 
     689             : /*
     690             :  * Function needed to compute optimal parameters (required workspace is 3*n*n)
     691             :  */
     692         432 : static PetscInt ell(PetscBLASInt n,PetscScalar *A,PetscReal coeff,PetscInt m,PetscScalar *work,PetscRandom rand)
     693             : {
     694         432 :   PetscScalar    *Ascaled=work;
     695         432 :   PetscReal      nrm,alpha,beta,rwork[1];
     696         432 :   PetscInt       t;
     697         432 :   PetscBLASInt   i,j;
     698             : 
     699         432 :   PetscFunctionBegin;
     700         432 :   beta = PetscPowReal(coeff,1.0/(2*m+1));
     701       11282 :   for (i=0;i<n;i++)
     702      654502 :     for (j=0;j<n;j++)
     703      643652 :       Ascaled[i+j*n] = beta*PetscAbsScalar(A[i+j*n]);
     704         432 :   nrm = LAPACKlange_("O",&n,&n,A,&n,rwork);
     705         432 :   PetscCall(PetscLogFlops(2.0*n*n));
     706         432 :   PetscCall(SlepcNormAm(n,Ascaled,2*m+1,work+n*n,rand,&alpha));
     707         432 :   alpha /= nrm;
     708         432 :   t = PetscMax((PetscInt)PetscCeilReal(PetscLogReal(2.0*alpha/PETSC_MACHINE_EPSILON)/PetscLogReal(2.0)/(2*m)),0);
     709         432 :   PetscFunctionReturn(t);
     710             : }
     711             : 
     712             : /*
     713             :  * Compute scaling parameter (s) and order of Pade approximant (m)  (required workspace is 4*n*n)
     714             :  */
     715         432 : static PetscErrorCode expm_params(PetscInt n,PetscScalar **Apowers,PetscInt *s,PetscInt *m,PetscScalar *work)
     716             : {
     717         432 :   PetscScalar     sfactor,sone=1.0,szero=0.0,*A=Apowers[0],*Ascaled;
     718         432 :   PetscReal       d4,d6,d8,d10,eta1,eta3,eta4,eta5,rwork[1];
     719         432 :   PetscBLASInt    n_=0,n2,one=1;
     720         432 :   PetscRandom     rand;
     721         432 :   const PetscReal coeff[5] = { 9.92063492063492e-06, 9.94131285136576e-11,  /* backward error function */
     722             :                                2.22819456055356e-16, 1.69079293431187e-22, 8.82996160201868e-36 };
     723         432 :   const PetscReal theta[5] = { 1.495585217958292e-002,    /* m = 3  */
     724             :                                2.539398330063230e-001,    /* m = 5  */
     725             :                                9.504178996162932e-001,    /* m = 7  */
     726             :                                2.097847961257068e+000,    /* m = 9  */
     727             :                                5.371920351148152e+000 };  /* m = 13 */
     728             : 
     729         432 :   PetscFunctionBegin;
     730         432 :   *s = 0;
     731         432 :   *m = 13;
     732         432 :   PetscCall(PetscBLASIntCast(n,&n_));
     733         432 :   PetscCall(PetscRandomCreate(PETSC_COMM_SELF,&rand));
     734         432 :   d4 = PetscPowReal(LAPACKlange_("O",&n_,&n_,Apowers[2],&n_,rwork),1.0/4.0);
     735         432 :   if (d4==0.0) { /* safeguard for the case A = 0 */
     736           0 :     *m = 3;
     737           0 :     goto done;
     738             :   }
     739         432 :   d6 = PetscPowReal(LAPACKlange_("O",&n_,&n_,Apowers[3],&n_,rwork),1.0/6.0);
     740         432 :   PetscCall(PetscLogFlops(2.0*n*n));
     741         432 :   eta1 = PetscMax(d4,d6);
     742         432 :   if (eta1<=theta[0] && !ell(n_,A,coeff[0],3,work,rand)) {
     743          82 :     *m = 3;
     744          82 :     goto done;
     745             :   }
     746         350 :   if (eta1<=theta[1] && !ell(n_,A,coeff[1],5,work,rand)) {
     747         111 :     *m = 5;
     748         111 :     goto done;
     749             :   }
     750         239 :   if (n<SMALLN) {
     751         225 :     PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[2],&n_,Apowers[2],&n_,&szero,work,&n_));
     752         225 :     d8 = PetscPowReal(LAPACKlange_("O",&n_,&n_,work,&n_,rwork),1.0/8.0);
     753         225 :     PetscCall(PetscLogFlops(2.0*n*n*n+1.0*n*n));
     754             :   } else {
     755          14 :     PetscCall(SlepcNormAm(n_,Apowers[2],2,work,rand,&d8));
     756          14 :     d8 = PetscPowReal(d8,1.0/8.0);
     757             :   }
     758         239 :   eta3 = PetscMax(d6,d8);
     759         239 :   if (eta3<=theta[2] && !ell(n_,A,coeff[2],7,work,rand)) {
     760          13 :     *m = 7;
     761          13 :     goto done;
     762             :   }
     763         226 :   if (eta3<=theta[3] && !ell(n_,A,coeff[3],9,work,rand)) {
     764          18 :     *m = 9;
     765          18 :     goto done;
     766             :   }
     767         208 :   if (n<SMALLN) {
     768         194 :     PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[2],&n_,Apowers[3],&n_,&szero,work,&n_));
     769         194 :     d10 = PetscPowReal(LAPACKlange_("O",&n_,&n_,work,&n_,rwork),1.0/10.0);
     770         194 :     PetscCall(PetscLogFlops(2.0*n*n*n+1.0*n*n));
     771             :   } else {
     772          14 :     PetscCall(SlepcNormAm(n_,Apowers[1],5,work,rand,&d10));
     773          14 :     d10 = PetscPowReal(d10,1.0/10.0);
     774             :   }
     775         208 :   eta4 = PetscMax(d8,d10);
     776         208 :   eta5 = PetscMin(eta3,eta4);
     777         208 :   *s = PetscMax((PetscInt)PetscCeilReal(PetscLogReal(eta5/theta[4])/PetscLogReal(2.0)),0);
     778         208 :   if (*s) {
     779          74 :     Ascaled = work+3*n*n;
     780          74 :     n2 = n_*n_;
     781          74 :     PetscCallBLAS("BLAScopy",BLAScopy_(&n2,A,&one,Ascaled,&one));
     782          74 :     sfactor = PetscPowRealInt(2.0,-(*s));
     783          74 :     PetscCallBLAS("BLASscal",BLASscal_(&n2,&sfactor,Ascaled,&one));
     784          74 :     PetscCall(PetscLogFlops(1.0*n*n));
     785             :   } else Ascaled = A;
     786         208 :   *s += ell(n_,Ascaled,coeff[4],13,work,rand);
     787         432 : done:
     788         432 :   PetscCall(PetscRandomDestroy(&rand));
     789         432 :   PetscFunctionReturn(PETSC_SUCCESS);
     790             : }
     791             : 
     792             : /*
     793             :  * Matrix exponential implementation based on algorithm and matlab code by N. Higham and co-authors
     794             :  *
     795             :  *     N. J. Higham, "The scaling and squaring method for the matrix exponential
     796             :  *     revisited", SIAM J. Matrix Anal. Appl. 26(4):1179-1193, 2005.
     797             :  */
     798         432 : PetscErrorCode FNEvaluateFunctionMat_Exp_Higham(FN fn,Mat A,Mat B)
     799             : {
     800         432 :   PetscBLASInt      n_=0,n2,*ipiv,info,one=1;
     801         432 :   PetscInt          n,m,j,s;
     802         432 :   PetscScalar       scale,smone=-1.0,sone=1.0,stwo=2.0,szero=0.0;
     803         432 :   PetscScalar       *Ba,*Apowers[5],*Q,*P,*W,*work,*aux;
     804         432 :   const PetscScalar *Aa,*c;
     805         432 :   const PetscScalar c3[4]   = { 120, 60, 12, 1 };
     806         432 :   const PetscScalar c5[6]   = { 30240, 15120, 3360, 420, 30, 1 };
     807         432 :   const PetscScalar c7[8]   = { 17297280, 8648640, 1995840, 277200, 25200, 1512, 56, 1 };
     808         432 :   const PetscScalar c9[10]  = { 17643225600.0, 8821612800.0, 2075673600, 302702400, 30270240,
     809             :                                 2162160, 110880, 3960, 90, 1 };
     810         432 :   const PetscScalar c13[14] = { 64764752532480000.0, 32382376266240000.0, 7771770303897600.0,
     811             :                                 1187353796428800.0,  129060195264000.0,   10559470521600.0,
     812             :                                 670442572800.0,      33522128640.0,       1323241920.0,
     813             :                                 40840800,          960960,            16380,  182,  1 };
     814             : 
     815         432 :   PetscFunctionBegin;
     816         432 :   PetscCall(MatDenseGetArrayRead(A,&Aa));
     817         432 :   PetscCall(MatDenseGetArray(B,&Ba));
     818         432 :   PetscCall(MatGetSize(A,&n,NULL));
     819         432 :   PetscCall(PetscBLASIntCast(n,&n_));
     820         432 :   n2 = n_*n_;
     821         432 :   PetscCall(PetscMalloc2(8*n*n,&work,n,&ipiv));
     822             : 
     823             :   /* Matrix powers */
     824         432 :   Apowers[0] = work;                  /* Apowers[0] = A   */
     825         432 :   Apowers[1] = Apowers[0] + n*n;      /* Apowers[1] = A^2 */
     826         432 :   Apowers[2] = Apowers[1] + n*n;      /* Apowers[2] = A^4 */
     827         432 :   Apowers[3] = Apowers[2] + n*n;      /* Apowers[3] = A^6 */
     828         432 :   Apowers[4] = Apowers[3] + n*n;      /* Apowers[4] = A^8 */
     829             : 
     830         432 :   PetscCall(PetscArraycpy(Apowers[0],Aa,n2));
     831         432 :   PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[0],&n_,Apowers[0],&n_,&szero,Apowers[1],&n_));
     832         432 :   PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[1],&n_,Apowers[1],&n_,&szero,Apowers[2],&n_));
     833         432 :   PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[1],&n_,Apowers[2],&n_,&szero,Apowers[3],&n_));
     834         432 :   PetscCall(PetscLogFlops(6.0*n*n*n));
     835             : 
     836             :   /* Compute scaling parameter and order of Pade approximant */
     837         432 :   PetscCall(expm_params(n,Apowers,&s,&m,Apowers[4]));
     838             : 
     839         432 :   if (s) { /* rescale */
     840         390 :     for (j=0;j<4;j++) {
     841         312 :       scale = PetscPowRealInt(2.0,-PetscMax(2*j,1)*s);
     842         312 :       PetscCallBLAS("BLASscal",BLASscal_(&n2,&scale,Apowers[j],&one));
     843             :     }
     844          78 :     PetscCall(PetscLogFlops(4.0*n*n));
     845             :   }
     846             : 
     847             :   /* Evaluate the Pade approximant */
     848         432 :   switch (m) {
     849             :     case 3:  c = c3;  break;
     850         111 :     case 5:  c = c5;  break;
     851          13 :     case 7:  c = c7;  break;
     852          18 :     case 9:  c = c9;  break;
     853         208 :     case 13: c = c13; break;
     854           0 :     default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %" PetscInt_FMT,m);
     855             :   }
     856         432 :   P = Ba;
     857         432 :   Q = Apowers[4] + n*n;
     858         432 :   W = Q + n*n;
     859         432 :   switch (m) {
     860         224 :     case 3:
     861             :     case 5:
     862             :     case 7:
     863             :     case 9:
     864         224 :       if (m==9) PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[1],&n_,Apowers[3],&n_,&szero,Apowers[4],&n_));
     865         224 :       PetscCall(PetscArrayzero(P,n2));
     866         224 :       PetscCall(PetscArrayzero(Q,n2));
     867        2526 :       for (j=0;j<n;j++) {
     868        2302 :         P[j+j*n] = c[1];
     869        2302 :         Q[j+j*n] = c[0];
     870             :       }
     871         639 :       for (j=m;j>=3;j-=2) {
     872         415 :         PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[j],Apowers[(j+1)/2-1],&one,P,&one));
     873         415 :         PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[j-1],Apowers[(j+1)/2-1],&one,Q,&one));
     874         415 :         PetscCall(PetscLogFlops(4.0*n*n));
     875             :       }
     876         224 :       PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[0],&n_,P,&n_,&szero,W,&n_));
     877         224 :       PetscCall(PetscLogFlops(2.0*n*n*n));
     878         432 :       SWAP(P,W,aux);
     879             :       break;
     880         208 :     case 13:
     881             :       /*  P = A*(Apowers[3]*(c[13]*Apowers[3] + c[11]*Apowers[2] + c[9]*Apowers[1])
     882             :               + c[7]*Apowers[3] + c[5]*Apowers[2] + c[3]*Apowers[1] + c[1]*I)       */
     883         208 :       PetscCallBLAS("BLAScopy",BLAScopy_(&n2,Apowers[3],&one,P,&one));
     884         208 :       PetscCallBLAS("BLASscal",BLASscal_(&n2,&c[13],P,&one));
     885         208 :       PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[11],Apowers[2],&one,P,&one));
     886         208 :       PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[9],Apowers[1],&one,P,&one));
     887         208 :       PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[3],&n_,P,&n_,&szero,W,&n_));
     888         208 :       PetscCall(PetscLogFlops(5.0*n*n+2.0*n*n*n));
     889         208 :       PetscCall(PetscArrayzero(P,n2));
     890        8756 :       for (j=0;j<n;j++) P[j+j*n] = c[1];
     891         208 :       PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[7],Apowers[3],&one,P,&one));
     892         208 :       PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[5],Apowers[2],&one,P,&one));
     893         208 :       PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[3],Apowers[1],&one,P,&one));
     894         208 :       PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&sone,P,&one,W,&one));
     895         208 :       PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[0],&n_,W,&n_,&szero,P,&n_));
     896         208 :       PetscCall(PetscLogFlops(7.0*n*n+2.0*n*n*n));
     897             :       /*  Q = Apowers[3]*(c[12]*Apowers[3] + c[10]*Apowers[2] + c[8]*Apowers[1])
     898             :               + c[6]*Apowers[3] + c[4]*Apowers[2] + c[2]*Apowers[1] + c[0]*I        */
     899         208 :       PetscCallBLAS("BLAScopy",BLAScopy_(&n2,Apowers[3],&one,Q,&one));
     900         208 :       PetscCallBLAS("BLASscal",BLASscal_(&n2,&c[12],Q,&one));
     901         208 :       PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[10],Apowers[2],&one,Q,&one));
     902         208 :       PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[8],Apowers[1],&one,Q,&one));
     903         208 :       PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[3],&n_,Q,&n_,&szero,W,&n_));
     904         208 :       PetscCall(PetscLogFlops(5.0*n*n+2.0*n*n*n));
     905         208 :       PetscCall(PetscArrayzero(Q,n2));
     906        8756 :       for (j=0;j<n;j++) Q[j+j*n] = c[0];
     907         208 :       PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[6],Apowers[3],&one,Q,&one));
     908         208 :       PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[4],Apowers[2],&one,Q,&one));
     909         208 :       PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[2],Apowers[1],&one,Q,&one));
     910         208 :       PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&sone,W,&one,Q,&one));
     911         208 :       PetscCall(PetscLogFlops(7.0*n*n));
     912             :       break;
     913           0 :     default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %" PetscInt_FMT,m);
     914             :   }
     915         432 :   PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&smone,P,&one,Q,&one));
     916         432 :   PetscCallBLAS("LAPACKgesv",LAPACKgesv_(&n_,&n_,Q,&n_,ipiv,P,&n_,&info));
     917         432 :   SlepcCheckLapackInfo("gesv",info);
     918         432 :   PetscCallBLAS("BLASscal",BLASscal_(&n2,&stwo,P,&one));
     919       11282 :   for (j=0;j<n;j++) P[j+j*n] += 1.0;
     920         432 :   PetscCall(PetscLogFlops(2.0*n*n*n/3.0+4.0*n*n));
     921             : 
     922             :   /* Squaring */
     923        1092 :   for (j=1;j<=s;j++) {
     924         660 :     PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,P,&n_,P,&n_,&szero,W,&n_));
     925         660 :     SWAP(P,W,aux);
     926             :   }
     927         432 :   if (P!=Ba) PetscCall(PetscArraycpy(Ba,P,n2));
     928         432 :   PetscCall(PetscLogFlops(2.0*n*n*n*s));
     929             : 
     930         432 :   PetscCall(PetscFree2(work,ipiv));
     931         432 :   PetscCall(MatDenseRestoreArrayRead(A,&Aa));
     932         432 :   PetscCall(MatDenseRestoreArray(B,&Ba));
     933         432 :   PetscFunctionReturn(PETSC_SUCCESS);
     934             : }
     935             : 
     936             : #if defined(PETSC_HAVE_CUDA)
     937             : #include "../src/sys/classes/fn/impls/cuda/fnutilcuda.h"
     938             : #include <slepccublas.h>
     939             : 
     940             : PetscErrorCode FNEvaluateFunctionMat_Exp_Pade_CUDA(FN fn,Mat A,Mat B)
     941             : {
     942             :   PetscBLASInt      n=0,ld,ld2,*d_ipiv,*d_info,info,one=1;
     943             :   PetscInt          m,k,sexp;
     944             :   PetscBool         odd;
     945             :   const PetscInt    p=MAX_PADE;
     946             :   PetscReal         c[MAX_PADE+1],s;
     947             :   PetscScalar       scale,smone=-1.0,sone=1.0,stwo=2.0,szero=0.0;
     948             :   const PetscScalar *Aa;
     949             :   PetscScalar       *d_Ba,*d_As,*d_A2,*d_Q,*d_P,*d_W,*aux,**ppP,**d_ppP,**ppQ,**d_ppQ;
     950             :   cublasHandle_t    cublasv2handle;
     951             : 
     952             :   PetscFunctionBegin;
     953             :   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* For CUDA event timers */
     954             :   PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
     955             :   PetscCall(MatGetSize(A,&m,NULL));
     956             :   PetscCall(PetscBLASIntCast(m,&n));
     957             :   ld  = n;
     958             :   ld2 = ld*ld;
     959             :   if (A==B) {
     960             :     PetscCallCUDA(cudaMalloc((void **)&d_As,sizeof(PetscScalar)*m*m));
     961             :     PetscCall(MatDenseCUDAGetArrayRead(A,&Aa));
     962             :     PetscCallCUDA(cudaMemcpy(d_As,Aa,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToDevice));
     963             :     PetscCall(MatDenseCUDARestoreArrayRead(A,&Aa));
     964             :   } else PetscCall(MatDenseCUDAGetArrayRead(A,(const PetscScalar**)&d_As));
     965             :   PetscCall(MatDenseCUDAGetArrayWrite(B,&d_Ba));
     966             : 
     967             :   PetscCallCUDA(cudaMalloc((void **)&d_Q,sizeof(PetscScalar)*m*m));
     968             :   PetscCallCUDA(cudaMalloc((void **)&d_W,sizeof(PetscScalar)*m*m));
     969             :   PetscCallCUDA(cudaMalloc((void **)&d_A2,sizeof(PetscScalar)*m*m));
     970             :   PetscCallCUDA(cudaMalloc((void **)&d_ipiv,sizeof(PetscBLASInt)*ld));
     971             :   PetscCallCUDA(cudaMalloc((void **)&d_info,sizeof(PetscBLASInt)));
     972             :   PetscCallCUDA(cudaMalloc((void **)&d_ppP,sizeof(PetscScalar*)));
     973             :   PetscCallCUDA(cudaMalloc((void **)&d_ppQ,sizeof(PetscScalar*)));
     974             : 
     975             :   PetscCall(PetscMalloc1(1,&ppP));
     976             :   PetscCall(PetscMalloc1(1,&ppQ));
     977             : 
     978             :   d_P = d_Ba;
     979             :   PetscCall(PetscLogGpuTimeBegin());
     980             : 
     981             :   /* Pade' coefficients */
     982             :   c[0] = 1.0;
     983             :   for (k=1;k<=p;k++) c[k] = c[k-1]*(p+1-k)/(k*(2*p+1-k));
     984             : 
     985             :   /* Scaling */
     986             :   PetscCallCUBLAS(cublasXnrm2(cublasv2handle,ld2,d_As,one,&s));
     987             :   if (s>0.5) {
     988             :     sexp = PetscMax(0,(int)(PetscLogReal(s)/PetscLogReal(2.0))+2);
     989             :     scale = PetscPowRealInt(2.0,-sexp);
     990             :     PetscCallCUBLAS(cublasXscal(cublasv2handle,ld2,&scale,d_As,one));
     991             :     PetscCall(PetscLogGpuFlops(1.0*n*n));
     992             :   } else sexp = 0;
     993             : 
     994             :   /* Horner evaluation */
     995             :   PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_As,ld,d_As,ld,&szero,d_A2,ld));
     996             :   PetscCall(PetscLogGpuFlops(2.0*n*n*n));
     997             :   PetscCallCUDA(cudaMemset(d_Q,0,sizeof(PetscScalar)*ld2));
     998             :   PetscCallCUDA(cudaMemset(d_P,0,sizeof(PetscScalar)*ld2));
     999             :   PetscCall(set_diagonal(n,d_Q,ld,c[p]));
    1000             :   PetscCall(set_diagonal(n,d_P,ld,c[p-1]));
    1001             : 
    1002             :   odd = PETSC_TRUE;
    1003             :   for (k=p-1;k>0;k--) {
    1004             :     if (odd) {
    1005             :       PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_Q,ld,d_A2,ld,&szero,d_W,ld));
    1006             :       SWAP(d_Q,d_W,aux);
    1007             :       PetscCall(shift_diagonal(n,d_Q,ld,c[k-1]));
    1008             :       odd = PETSC_FALSE;
    1009             :     } else {
    1010             :       PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_A2,ld,&szero,d_W,ld));
    1011             :       SWAP(d_P,d_W,aux);
    1012             :       PetscCall(shift_diagonal(n,d_P,ld,c[k-1]));
    1013             :       odd = PETSC_TRUE;
    1014             :     }
    1015             :     PetscCall(PetscLogGpuFlops(2.0*n*n*n));
    1016             :   }
    1017             :   if (odd) {
    1018             :     PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_Q,ld,d_As,ld,&szero,d_W,ld));
    1019             :     SWAP(d_Q,d_W,aux);
    1020             :     PetscCallCUBLAS(cublasXaxpy(cublasv2handle,ld2,&smone,d_P,one,d_Q,one));
    1021             : 
    1022             :     ppQ[0] = d_Q;
    1023             :     ppP[0] = d_P;
    1024             :     PetscCallCUDA(cudaMemcpy(d_ppQ,ppQ,sizeof(PetscScalar*),cudaMemcpyHostToDevice));
    1025             :     PetscCallCUDA(cudaMemcpy(d_ppP,ppP,sizeof(PetscScalar*),cudaMemcpyHostToDevice));
    1026             : 
    1027             :     PetscCallCUBLAS(cublasXgetrfBatched(cublasv2handle,n,d_ppQ,ld,d_ipiv,d_info,one));
    1028             :     PetscCallCUDA(cudaMemcpy(&info,d_info,sizeof(PetscBLASInt),cudaMemcpyDeviceToHost));
    1029             :     PetscCheck(info>=0,PETSC_COMM_SELF,PETSC_ERR_LIB,"LAPACKgetrf: Illegal value on argument %" PetscBLASInt_FMT,PetscAbsInt(info));
    1030             :     PetscCheck(info<=0,PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"LAPACKgetrf: Matrix is singular. U(%" PetscBLASInt_FMT ",%" PetscBLASInt_FMT ") is zero",info,info);
    1031             :     PetscCallCUBLAS(cublasXgetrsBatched(cublasv2handle,CUBLAS_OP_N,n,n,(const PetscScalar **)d_ppQ,ld,d_ipiv,d_ppP,ld,&info,one));
    1032             :     PetscCheck(info>=0,PETSC_COMM_SELF,PETSC_ERR_LIB,"LAPACKgetri: Illegal value on argument %" PetscBLASInt_FMT,PetscAbsInt(info));
    1033             :     PetscCheck(info<=0,PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"LAPACKgetri: Matrix is singular. U(%" PetscBLASInt_FMT ",%" PetscBLASInt_FMT ") is zero",info,info);
    1034             :     PetscCallCUBLAS(cublasXscal(cublasv2handle,ld2,&stwo,d_P,one));
    1035             :     PetscCall(shift_diagonal(n,d_P,ld,sone));
    1036             :     PetscCallCUBLAS(cublasXscal(cublasv2handle,ld2,&smone,d_P,one));
    1037             :   } else {
    1038             :     PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_As,ld,&szero,d_W,ld));
    1039             :     SWAP(d_P,d_W,aux);
    1040             :     PetscCallCUBLAS(cublasXaxpy(cublasv2handle,ld2,&smone,d_P,one,d_Q,one));
    1041             : 
    1042             :     ppQ[0] = d_Q;
    1043             :     ppP[0] = d_P;
    1044             :     PetscCallCUDA(cudaMemcpy(d_ppQ,ppQ,sizeof(PetscScalar*),cudaMemcpyHostToDevice));
    1045             :     PetscCallCUDA(cudaMemcpy(d_ppP,ppP,sizeof(PetscScalar*),cudaMemcpyHostToDevice));
    1046             : 
    1047             :     PetscCallCUBLAS(cublasXgetrfBatched(cublasv2handle,n,d_ppQ,ld,d_ipiv,d_info,one));
    1048             :     PetscCallCUDA(cudaMemcpy(&info,d_info,sizeof(PetscBLASInt),cudaMemcpyDeviceToHost));
    1049             :     PetscCheck(info>=0,PETSC_COMM_SELF,PETSC_ERR_LIB,"LAPACKgetrf: Illegal value on argument %" PetscBLASInt_FMT,PetscAbsInt(info));
    1050             :     PetscCheck(info<=0,PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"LAPACKgetrf: Matrix is singular. U(%" PetscBLASInt_FMT ",%" PetscBLASInt_FMT ") is zero",info,info);
    1051             :     PetscCallCUBLAS(cublasXgetrsBatched(cublasv2handle,CUBLAS_OP_N,n,n,(const PetscScalar **)d_ppQ,ld,d_ipiv,d_ppP,ld,&info,one));
    1052             :     PetscCheck(info>=0,PETSC_COMM_SELF,PETSC_ERR_LIB,"LAPACKgetri: Illegal value on argument %" PetscBLASInt_FMT,PetscAbsInt(info));
    1053             :     PetscCheck(info<=0,PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"LAPACKgetri: Matrix is singular. U(%" PetscBLASInt_FMT ",%" PetscBLASInt_FMT ") is zero",info,info);
    1054             :     PetscCallCUBLAS(cublasXscal(cublasv2handle,ld2,&stwo,d_P,one));
    1055             :     PetscCall(shift_diagonal(n,d_P,ld,sone));
    1056             :   }
    1057             :   PetscCall(PetscLogGpuFlops(2.0*n*n*n+2.0*n*n*n/3.0+4.0*n*n));
    1058             : 
    1059             :   for (k=1;k<=sexp;k++) {
    1060             :     PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_P,ld,&szero,d_W,ld));
    1061             :     PetscCallCUDA(cudaMemcpy(d_P,d_W,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToDevice));
    1062             :   }
    1063             :   PetscCall(PetscLogGpuFlops(2.0*n*n*n*sexp));
    1064             : 
    1065             :   PetscCall(PetscLogGpuTimeEnd());
    1066             :   PetscCallCUDA(cudaFree(d_Q));
    1067             :   PetscCallCUDA(cudaFree(d_W));
    1068             :   PetscCallCUDA(cudaFree(d_A2));
    1069             :   PetscCallCUDA(cudaFree(d_ipiv));
    1070             :   PetscCallCUDA(cudaFree(d_info));
    1071             :   PetscCallCUDA(cudaFree(d_ppP));
    1072             :   PetscCallCUDA(cudaFree(d_ppQ));
    1073             : 
    1074             :   PetscCall(PetscFree(ppP));
    1075             :   PetscCall(PetscFree(ppQ));
    1076             : 
    1077             :   if (d_P!=d_Ba) PetscCallCUDA(cudaMemcpy(d_Ba,d_P,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToDevice));
    1078             :   if (A!=B) {
    1079             :     if (s>0.5) {  /* undo scaling */
    1080             :       scale = 1.0/scale;
    1081             :       PetscCallCUBLAS(cublasXscal(cublasv2handle,ld2,&scale,d_As,one));
    1082             :     }
    1083             :     PetscCall(MatDenseCUDARestoreArrayRead(A,(const PetscScalar**)&d_As));
    1084             :   } else PetscCallCUDA(cudaFree(d_As));
    1085             :   PetscCall(MatDenseCUDARestoreArrayWrite(B,&d_Ba));
    1086             :   PetscFunctionReturn(PETSC_SUCCESS);
    1087             : }
    1088             : 
    1089             : #if defined(PETSC_HAVE_MAGMA)
    1090             : #include <slepcmagma.h>
    1091             : 
    1092             : PetscErrorCode FNEvaluateFunctionMat_Exp_Pade_CUDAm(FN fn,Mat A,Mat B)
    1093             : {
    1094             :   PetscBLASInt      n=0,ld,ld2,*piv,one=1;
    1095             :   PetscInt          m,k,sexp;
    1096             :   PetscBool         odd;
    1097             :   const PetscInt    p=MAX_PADE;
    1098             :   PetscReal         c[MAX_PADE+1],s;
    1099             :   PetscScalar       scale,smone=-1.0,sone=1.0,stwo=2.0,szero=0.0;
    1100             :   const PetscScalar *Aa;
    1101             :   PetscScalar       *d_Ba,*d_As,*d_A2,*d_Q,*d_P,*d_W,*aux;
    1102             :   cublasHandle_t    cublasv2handle;
    1103             : 
    1104             :   PetscFunctionBegin;
    1105             :   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* For CUDA event timers */
    1106             :   PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    1107             :   PetscCall(SlepcMagmaInit());
    1108             :   PetscCall(MatGetSize(A,&m,NULL));
    1109             :   PetscCall(PetscBLASIntCast(m,&n));
    1110             :   ld  = n;
    1111             :   ld2 = ld*ld;
    1112             :   if (A==B) {
    1113             :     PetscCallCUDA(cudaMalloc((void **)&d_As,sizeof(PetscScalar)*m*m));
    1114             :     PetscCall(MatDenseCUDAGetArrayRead(A,&Aa));
    1115             :     PetscCallCUDA(cudaMemcpy(d_As,Aa,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToDevice));
    1116             :     PetscCall(MatDenseCUDARestoreArrayRead(A,&Aa));
    1117             :   } else PetscCall(MatDenseCUDAGetArrayRead(A,(const PetscScalar**)&d_As));
    1118             :   PetscCall(MatDenseCUDAGetArrayWrite(B,&d_Ba));
    1119             : 
    1120             :   PetscCallCUDA(cudaMalloc((void **)&d_Q,sizeof(PetscScalar)*m*m));
    1121             :   PetscCallCUDA(cudaMalloc((void **)&d_W,sizeof(PetscScalar)*m*m));
    1122             :   PetscCallCUDA(cudaMalloc((void **)&d_A2,sizeof(PetscScalar)*m*m));
    1123             : 
    1124             :   PetscCall(PetscMalloc1(n,&piv));
    1125             : 
    1126             :   d_P = d_Ba;
    1127             :   PetscCall(PetscLogGpuTimeBegin());
    1128             : 
    1129             :   /* Pade' coefficients */
    1130             :   c[0] = 1.0;
    1131             :   for (k=1;k<=p;k++) c[k] = c[k-1]*(p+1-k)/(k*(2*p+1-k));
    1132             : 
    1133             :   /* Scaling */
    1134             :   PetscCallCUBLAS(cublasXnrm2(cublasv2handle,ld2,d_As,one,&s));
    1135             :   PetscCall(PetscLogGpuFlops(1.0*n*n));
    1136             : 
    1137             :   if (s>0.5) {
    1138             :     sexp = PetscMax(0,(int)(PetscLogReal(s)/PetscLogReal(2.0))+2);
    1139             :     scale = PetscPowRealInt(2.0,-sexp);
    1140             :     PetscCallCUBLAS(cublasXscal(cublasv2handle,ld2,&scale,d_As,one));
    1141             :     PetscCall(PetscLogGpuFlops(1.0*n*n));
    1142             :   } else sexp = 0;
    1143             : 
    1144             :   /* Horner evaluation */
    1145             :   PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_As,ld,d_As,ld,&szero,d_A2,ld));
    1146             :   PetscCall(PetscLogGpuFlops(2.0*n*n*n));
    1147             :   PetscCallCUDA(cudaMemset(d_Q,0,sizeof(PetscScalar)*ld2));
    1148             :   PetscCallCUDA(cudaMemset(d_P,0,sizeof(PetscScalar)*ld2));
    1149             :   PetscCall(set_diagonal(n,d_Q,ld,c[p]));
    1150             :   PetscCall(set_diagonal(n,d_P,ld,c[p-1]));
    1151             : 
    1152             :   odd = PETSC_TRUE;
    1153             :   for (k=p-1;k>0;k--) {
    1154             :     if (odd) {
    1155             :       PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_Q,ld,d_A2,ld,&szero,d_W,ld));
    1156             :       SWAP(d_Q,d_W,aux);
    1157             :       PetscCall(shift_diagonal(n,d_Q,ld,c[k-1]));
    1158             :       odd = PETSC_FALSE;
    1159             :     } else {
    1160             :       PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_A2,ld,&szero,d_W,ld));
    1161             :       SWAP(d_P,d_W,aux);
    1162             :       PetscCall(shift_diagonal(n,d_P,ld,c[k-1]));
    1163             :       odd = PETSC_TRUE;
    1164             :     }
    1165             :     PetscCall(PetscLogGpuFlops(2.0*n*n*n));
    1166             :   }
    1167             :   if (odd) {
    1168             :     PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_Q,ld,d_As,ld,&szero,d_W,ld));
    1169             :     SWAP(d_Q,d_W,aux);
    1170             :     PetscCallCUBLAS(cublasXaxpy(cublasv2handle,ld2,&smone,d_P,one,d_Q,one));
    1171             :     PetscCallMAGMA(magma_xgesv_gpu,n,n,d_Q,ld,piv,d_P,ld);
    1172             :     PetscCallCUBLAS(cublasXscal(cublasv2handle,ld2,&stwo,d_P,one));
    1173             :     PetscCall(shift_diagonal(n,d_P,ld,sone));
    1174             :     PetscCallCUBLAS(cublasXscal(cublasv2handle,ld2,&smone,d_P,one));
    1175             :   } else {
    1176             :     PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_As,ld,&szero,d_W,ld));
    1177             :     SWAP(d_P,d_W,aux);
    1178             :     PetscCallCUBLAS(cublasXaxpy(cublasv2handle,ld2,&smone,d_P,one,d_Q,one));
    1179             :     PetscCallMAGMA(magma_xgesv_gpu,n,n,d_Q,ld,piv,d_P,ld);
    1180             :     PetscCallCUBLAS(cublasXscal(cublasv2handle,ld2,&stwo,d_P,one));
    1181             :     PetscCall(shift_diagonal(n,d_P,ld,sone));
    1182             :   }
    1183             :   PetscCall(PetscLogGpuFlops(2.0*n*n*n+2.0*n*n*n/3.0+4.0*n*n));
    1184             : 
    1185             :   for (k=1;k<=sexp;k++) {
    1186             :     PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_P,ld,&szero,d_W,ld));
    1187             :     PetscCallCUDA(cudaMemcpy(d_P,d_W,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToDevice));
    1188             :   }
    1189             :   PetscCall(PetscLogGpuFlops(2.0*n*n*n*sexp));
    1190             : 
    1191             :   PetscCall(PetscLogGpuTimeEnd());
    1192             :   PetscCallCUDA(cudaFree(d_Q));
    1193             :   PetscCallCUDA(cudaFree(d_W));
    1194             :   PetscCallCUDA(cudaFree(d_A2));
    1195             :   PetscCall(PetscFree(piv));
    1196             : 
    1197             :   if (d_P!=d_Ba) PetscCallCUDA(cudaMemcpy(d_Ba,d_P,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToDevice));
    1198             :   if (A!=B) {
    1199             :     if (s>0.5) {  /* undo scaling */
    1200             :       scale = 1.0/scale;
    1201             :       PetscCallCUBLAS(cublasXscal(cublasv2handle,ld2,&scale,d_As,one));
    1202             :     }
    1203             :     PetscCall(MatDenseCUDARestoreArrayRead(A,(const PetscScalar**)&d_As));
    1204             :   } else PetscCallCUDA(cudaFree(d_As));
    1205             :   PetscCall(MatDenseCUDARestoreArrayWrite(B,&d_Ba));
    1206             :   PetscFunctionReturn(PETSC_SUCCESS);
    1207             : }
    1208             : 
    1209             : /*
    1210             :  * Matrix exponential implementation based on algorithm and matlab code by N. Higham and co-authors
    1211             :  *
    1212             :  *     N. J. Higham, "The scaling and squaring method for the matrix exponential
    1213             :  *     revisited", SIAM J. Matrix Anal. Appl. 26(4):1179-1193, 2005.
    1214             :  */
    1215             : PetscErrorCode FNEvaluateFunctionMat_Exp_Higham_CUDAm(FN fn,Mat A,Mat B)
    1216             : {
    1217             :   PetscBLASInt      n_=0,n2,*ipiv,one=1;
    1218             :   PetscInt          n,m,j,s;
    1219             :   PetscScalar       scale,smone=-1.0,sone=1.0,stwo=2.0,szero=0.0;
    1220             :   PetscScalar       *d_Ba,*Apowers[5],*d_Apowers[5],*d_Q,*d_P,*d_W,*work,*d_work,*aux;
    1221             :   const PetscScalar *Aa,*c;
    1222             :   const PetscScalar c3[4]   = { 120, 60, 12, 1 };
    1223             :   const PetscScalar c5[6]   = { 30240, 15120, 3360, 420, 30, 1 };
    1224             :   const PetscScalar c7[8]   = { 17297280, 8648640, 1995840, 277200, 25200, 1512, 56, 1 };
    1225             :   const PetscScalar c9[10]  = { 17643225600, 8821612800, 2075673600, 302702400, 30270240,
    1226             :     2162160, 110880, 3960, 90, 1 };
    1227             :   const PetscScalar c13[14] = { 64764752532480000, 32382376266240000, 7771770303897600,
    1228             :     1187353796428800,  129060195264000,   10559470521600,
    1229             :     670442572800,      33522128640,       1323241920,
    1230             :     40840800,          960960,            16380,  182,  1 };
    1231             :   cublasHandle_t    cublasv2handle;
    1232             : 
    1233             :   PetscFunctionBegin;
    1234             :   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* For CUDA event timers */
    1235             :   PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    1236             :   PetscCall(SlepcMagmaInit());
    1237             :   PetscCall(MatGetSize(A,&n,NULL));
    1238             :   PetscCall(PetscBLASIntCast(n,&n_));
    1239             :   n2 = n_*n_;
    1240             :   PetscCall(PetscMalloc2(8*n*n,&work,n,&ipiv));
    1241             :   /* Matrix powers */
    1242             :   Apowers[0] = work;                  /* Apowers[0] = A   */
    1243             :   Apowers[1] = Apowers[0] + n*n;      /* Apowers[1] = A^2 */
    1244             :   Apowers[2] = Apowers[1] + n*n;      /* Apowers[2] = A^4 */
    1245             :   Apowers[3] = Apowers[2] + n*n;      /* Apowers[3] = A^6 */
    1246             :   Apowers[4] = Apowers[3] + n*n;      /* Apowers[4] = A^8 */
    1247             :   if (A==B) {
    1248             :     PetscCallCUDA(cudaMalloc((void**)&d_work,7*n*n*sizeof(PetscScalar)));
    1249             :     d_Apowers[0] = d_work;              /* d_Apowers[0] = A   */
    1250             :     d_Apowers[1] = d_Apowers[0] + n*n;  /* d_Apowers[1] = A^2 */
    1251             :     PetscCall(MatDenseCUDAGetArrayRead(A,&Aa));
    1252             :     PetscCallCUDA(cudaMemcpy(d_Apowers[0],Aa,n2*sizeof(PetscScalar),cudaMemcpyDeviceToDevice));
    1253             :     PetscCall(MatDenseCUDARestoreArrayRead(A,&Aa));
    1254             :   } else {
    1255             :     PetscCallCUDA(cudaMalloc((void**)&d_work,6*n*n*sizeof(PetscScalar)));
    1256             :     PetscCall(MatDenseCUDAGetArrayRead(A,(const PetscScalar**)&d_Apowers[0]));
    1257             :     d_Apowers[1] = d_work;              /* d_Apowers[1] = A^2 */
    1258             :   }
    1259             :   PetscCall(MatDenseCUDAGetArrayWrite(B,&d_Ba));
    1260             :   d_Apowers[2] = d_Apowers[1] + n*n;    /* d_Apowers[2] = A^4 */
    1261             :   d_Apowers[3] = d_Apowers[2] + n*n;    /* d_Apowers[3] = A^6 */
    1262             :   d_Apowers[4] = d_Apowers[3] + n*n;    /* d_Apowers[4] = A^8 */
    1263             :   d_Q = d_Apowers[4] + n*n;
    1264             :   d_W = d_Q + n*n;
    1265             : 
    1266             :   PetscCall(PetscLogGpuTimeBegin());
    1267             : 
    1268             :   PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[0],n_,d_Apowers[0],n_,&szero,d_Apowers[1],n_));
    1269             :   PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[1],n_,d_Apowers[1],n_,&szero,d_Apowers[2],n_));
    1270             :   PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[1],n_,d_Apowers[2],n_,&szero,d_Apowers[3],n_));
    1271             :   PetscCall(PetscLogGpuFlops(6.0*n*n*n));
    1272             : 
    1273             :   PetscCallCUDA(cudaMemcpy(Apowers[0],d_Apowers[0],n2*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
    1274             :   PetscCallCUDA(cudaMemcpy(Apowers[1],d_Apowers[1],3*n2*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
    1275             :   PetscCall(PetscLogGpuToCpu(4*n2*sizeof(PetscScalar)));
    1276             :   /* Compute scaling parameter and order of Pade approximant */
    1277             :   PetscCall(expm_params(n,Apowers,&s,&m,Apowers[4]));
    1278             : 
    1279             :   if (s) { /* rescale */
    1280             :     for (j=0;j<4;j++) {
    1281             :       scale = PetscPowRealInt(2.0,-PetscMax(2*j,1)*s);
    1282             :       PetscCallCUBLAS(cublasXscal(cublasv2handle,n2,&scale,d_Apowers[j],one));
    1283             :     }
    1284             :     PetscCall(PetscLogGpuFlops(4.0*n*n));
    1285             :   }
    1286             : 
    1287             :   /* Evaluate the Pade approximant */
    1288             :   switch (m) {
    1289             :     case 3:  c = c3;  break;
    1290             :     case 5:  c = c5;  break;
    1291             :     case 7:  c = c7;  break;
    1292             :     case 9:  c = c9;  break;
    1293             :     case 13: c = c13; break;
    1294             :     default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %" PetscInt_FMT,m);
    1295             :   }
    1296             :   d_P = d_Ba;
    1297             :   switch (m) {
    1298             :     case 3:
    1299             :     case 5:
    1300             :     case 7:
    1301             :     case 9:
    1302             :       if (m==9) PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[1],n_,d_Apowers[3],n_,&szero,d_Apowers[4],n_));
    1303             :       PetscCallCUDA(cudaMemset(d_P,0,sizeof(PetscScalar)*n2));
    1304             :       PetscCallCUDA(cudaMemset(d_Q,0,sizeof(PetscScalar)*n2));
    1305             :       PetscCall(set_diagonal(n,d_P,n,c[1]));
    1306             :       PetscCall(set_diagonal(n,d_Q,n,c[0]));
    1307             :       for (j=m;j>=3;j-=2) {
    1308             :         PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[j],d_Apowers[(j+1)/2-1],one,d_P,one));
    1309             :         PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[j-1],d_Apowers[(j+1)/2-1],one,d_Q,one));
    1310             :         PetscCall(PetscLogGpuFlops(4.0*n*n));
    1311             :       }
    1312             :       PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[0],n_,d_P,n_,&szero,d_W,n_));
    1313             :       PetscCall(PetscLogGpuFlops(2.0*n*n*n));
    1314             :       SWAP(d_P,d_W,aux);
    1315             :       break;
    1316             :     case 13:
    1317             :       /*  P = A*(Apowers[3]*(c[13]*Apowers[3] + c[11]*Apowers[2] + c[9]*Apowers[1])
    1318             :           + c[7]*Apowers[3] + c[5]*Apowers[2] + c[3]*Apowers[1] + c[1]*I)       */
    1319             :       PetscCallCUDA(cudaMemcpy(d_P,d_Apowers[3],n2*sizeof(PetscScalar),cudaMemcpyDeviceToDevice));
    1320             :       PetscCallCUBLAS(cublasXscal(cublasv2handle,n2,&c[13],d_P,one));
    1321             :       PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[11],d_Apowers[2],one,d_P,one));
    1322             :       PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[9],d_Apowers[1],one,d_P,one));
    1323             :       PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[3],n_,d_P,n_,&szero,d_W,n_));
    1324             :       PetscCall(PetscLogGpuFlops(5.0*n*n+2.0*n*n*n));
    1325             : 
    1326             :       PetscCallCUDA(cudaMemset(d_P,0,sizeof(PetscScalar)*n2));
    1327             :       PetscCall(set_diagonal(n,d_P,n,c[1]));
    1328             :       PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[7],d_Apowers[3],one,d_P,one));
    1329             :       PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[5],d_Apowers[2],one,d_P,one));
    1330             :       PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[3],d_Apowers[1],one,d_P,one));
    1331             :       PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&sone,d_P,one,d_W,one));
    1332             :       PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[0],n_,d_W,n_,&szero,d_P,n_));
    1333             :       PetscCall(PetscLogGpuFlops(7.0*n*n+2.0*n*n*n));
    1334             :       /*  Q = Apowers[3]*(c[12]*Apowers[3] + c[10]*Apowers[2] + c[8]*Apowers[1])
    1335             :           + c[6]*Apowers[3] + c[4]*Apowers[2] + c[2]*Apowers[1] + c[0]*I        */
    1336             :       PetscCallCUDA(cudaMemcpy(d_Q,d_Apowers[3],n2*sizeof(PetscScalar),cudaMemcpyDeviceToDevice));
    1337             :       PetscCallCUBLAS(cublasXscal(cublasv2handle,n2,&c[12],d_Q,one));
    1338             :       PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[10],d_Apowers[2],one,d_Q,one));
    1339             :       PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[8],d_Apowers[1],one,d_Q,one));
    1340             :       PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[3],n_,d_Q,n_,&szero,d_W,n_));
    1341             :       PetscCall(PetscLogGpuFlops(5.0*n*n+2.0*n*n*n));
    1342             :       PetscCallCUDA(cudaMemset(d_Q,0,sizeof(PetscScalar)*n2));
    1343             :       PetscCall(set_diagonal(n,d_Q,n,c[0]));
    1344             :       PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[6],d_Apowers[3],one,d_Q,one));
    1345             :       PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[4],d_Apowers[2],one,d_Q,one));
    1346             :       PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[2],d_Apowers[1],one,d_Q,one));
    1347             :       PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&sone,d_W,one,d_Q,one));
    1348             :       PetscCall(PetscLogGpuFlops(7.0*n*n));
    1349             :       break;
    1350             :     default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %" PetscInt_FMT,m);
    1351             :   }
    1352             :   PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&smone,d_P,one,d_Q,one));
    1353             : 
    1354             :   PetscCallMAGMA(magma_xgesv_gpu,n_,n_,d_Q,n_,ipiv,d_P,n_);
    1355             : 
    1356             :   PetscCallCUBLAS(cublasXscal(cublasv2handle,n2,&stwo,d_P,one));
    1357             :   PetscCall(shift_diagonal(n,d_P,n,sone));
    1358             :   PetscCall(PetscLogGpuFlops(2.0*n*n*n/3.0+4.0*n*n));
    1359             : 
    1360             :   /* Squaring */
    1361             :   for (j=1;j<=s;j++) {
    1362             :     PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_P,n_,d_P,n_,&szero,d_W,n_));
    1363             :     SWAP(d_P,d_W,aux);
    1364             :   }
    1365             :   PetscCall(PetscLogGpuFlops(2.0*n*n*n*s));
    1366             :   PetscCall(PetscLogGpuTimeEnd());
    1367             : 
    1368             :   PetscCall(PetscFree2(work,ipiv));
    1369             :   if (d_P!=d_Ba) PetscCallCUDA(cudaMemcpy(d_Ba,d_P,n2*sizeof(PetscScalar),cudaMemcpyDeviceToDevice));
    1370             :   if (A!=B) {
    1371             :     if (s>0.5) {  /* undo scaling */
    1372             :       scale = 1.0/PetscPowRealInt(2.0,-s);
    1373             :       PetscCallCUBLAS(cublasXscal(cublasv2handle,n2,&scale,d_Apowers[0],one));
    1374             :     }
    1375             :     PetscCall(MatDenseCUDARestoreArrayRead(A,(const PetscScalar**)&d_Apowers[0]));
    1376             :   }
    1377             :   PetscCall(MatDenseCUDARestoreArrayWrite(B,&d_Ba));
    1378             :   PetscCallCUDA(cudaFree(d_work));
    1379             :   PetscFunctionReturn(PETSC_SUCCESS);
    1380             : }
    1381             : 
    1382             : /*
    1383             :  * Matrix exponential implementation based on algorithm and matlab code by Stefan Guettel
    1384             :  * and Yuji Nakatsukasa
    1385             :  *
    1386             :  *     Stefan Guettel and Yuji Nakatsukasa, "Scaled and Squared Subdiagonal Pade'
    1387             :  *     Approximation for the Matrix Exponential",
    1388             :  *     SIAM J. Matrix Anal. Appl. 37(1):145-170, 2016.
    1389             :  *     https://doi.org/10.1137/15M1027553
    1390             :  */
    1391             : PetscErrorCode FNEvaluateFunctionMat_Exp_GuettelNakatsukasa_CUDAm(FN fn,Mat A,Mat B)
    1392             : {
    1393             :   PetscInt          i,j,n_,s,k,m,mod;
    1394             :   PetscBLASInt      n=0,n2=0,irsize=0,rsizediv2,ipsize=0,iremainsize=0,query=-1,*piv,minlen,lwork=0,one=1;
    1395             :   PetscReal         nrm,shift=0.0,rone=1.0,rzero=0.0;
    1396             : #if defined(PETSC_USE_COMPLEX)
    1397             :   PetscReal         *rwork=NULL;
    1398             : #endif
    1399             :   PetscComplex      *d_As,*d_RR,*d_RR2,*d_expmA,*d_expmA2,*d_Maux,*d_Maux2,rsize,*r,psize,*p,remainsize,*remainterm,*rootp,*rootq,mult=0.0,scale,cone=1.0,czero=0.0,*aux;
    1400             :   PetscScalar       *d_Aa,*d_Ba,*d_Ba2,*Maux,*d_sMaux,*wr,*wi,expshift,sone=1.0,szero=0.0,*work,work1,*saux;
    1401             :   const PetscScalar *Aa;
    1402             :   PetscBool         isreal,*d_isreal,flg;
    1403             :   cublasHandle_t    cublasv2handle;
    1404             : 
    1405             :   PetscFunctionBegin;
    1406             :   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* For CUDA event timers */
    1407             :   PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    1408             :   PetscCall(SlepcMagmaInit());
    1409             :   PetscCall(MatGetSize(A,&n_,NULL));
    1410             :   PetscCall(PetscBLASIntCast(n_,&n));
    1411             :   PetscCall(PetscBLASIntCast(n*n,&n2));
    1412             : 
    1413             :   if (A==B) {
    1414             :     PetscCallCUDA(cudaMalloc((void **)&d_Aa,sizeof(PetscScalar)*n2));
    1415             :     PetscCall(MatDenseCUDAGetArrayRead(A,&Aa));
    1416             :     PetscCallCUDA(cudaMemcpy(d_Aa,Aa,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice));
    1417             :     PetscCall(MatDenseCUDARestoreArrayRead(A,&Aa));
    1418             :   } else PetscCall(MatDenseCUDAGetArrayRead(A,(const PetscScalar**)&d_Aa));
    1419             :   PetscCall(MatDenseCUDAGetArrayWrite(B,&d_Ba));
    1420             :   d_Ba2 = d_Ba;
    1421             : 
    1422             :   PetscCallCUDA(cudaMalloc((void **)&d_isreal,sizeof(PetscBool)));
    1423             :   PetscCallCUDA(cudaMalloc((void **)&d_sMaux,sizeof(PetscScalar)*n2));
    1424             :   PetscCallCUDA(cudaMalloc((void **)&d_Maux,sizeof(PetscComplex)*n2));
    1425             : 
    1426             :   PetscCall(PetscLogGpuTimeBegin());
    1427             :   d_Maux2 = d_Maux;
    1428             :   PetscCall(PetscOptionsGetReal(NULL,NULL,"-fn_expm_estimated_eig",&shift,&flg));
    1429             :   if (!flg) {
    1430             :     PetscCall(PetscMalloc2(n,&wr,n,&wi));
    1431             :     /* estimate rightmost eigenvalue and shift A with it */
    1432             :     PetscCall(PetscMalloc1(n2,&Maux));
    1433             :     PetscCall(MatDenseGetArrayRead(A,&Aa));
    1434             :     PetscCall(PetscArraycpy(Maux,Aa,n2));
    1435             :     PetscCall(MatDenseRestoreArrayRead(A,&Aa));
    1436             : #if !defined(PETSC_USE_COMPLEX)
    1437             :     PetscCallMAGMA(magma_xgeev,MagmaNoVec,MagmaNoVec,n,Maux,n,wr,wi,NULL,n,NULL,n,&work1,query);
    1438             :     PetscCall(PetscBLASIntCast((PetscInt)PetscRealPart(work1),&lwork));
    1439             :     PetscCall(PetscMalloc1(lwork,&work));
    1440             :     PetscCallMAGMA(magma_xgeev,MagmaNoVec,MagmaNoVec,n,Maux,n,wr,wi,NULL,n,NULL,n,work,lwork);
    1441             :     PetscCall(PetscFree(work));
    1442             : #else
    1443             :     PetscCallMAGMA(magma_xgeev,MagmaNoVec,MagmaNoVec,n,Maux,n,wr,NULL,n,NULL,n,&work1,query,rwork);
    1444             :     PetscCall(PetscBLASIntCast((PetscInt)PetscRealPart(work1),&lwork));
    1445             :     PetscCall(PetscMalloc2(2*n,&rwork,lwork,&work));
    1446             :     PetscCallMAGMA(magma_xgeev,MagmaNoVec,MagmaNoVec,n,Maux,n,wr,NULL,n,NULL,n,work,lwork,rwork);
    1447             :     PetscCall(PetscFree2(rwork,work));
    1448             : #endif
    1449             :     PetscCall(PetscFree(Maux));
    1450             :     PetscCall(PetscLogGpuFlops(25.0*n*n*n+(n*n*n)/3.0+1.0*n*n*n));
    1451             : 
    1452             :     shift = PetscRealPart(wr[0]);
    1453             :     for (i=1;i<n;i++) {
    1454             :       if (PetscRealPart(wr[i]) > shift) shift = PetscRealPart(wr[i]);
    1455             :     }
    1456             :     PetscCall(PetscFree2(wr,wi));
    1457             :   }
    1458             :   /* shift so that largest real part is (about) 0 */
    1459             :   PetscCallCUDA(cudaMemcpy(d_sMaux,d_Aa,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice));
    1460             :   if (shift) {
    1461             :     PetscCall(shift_diagonal(n,d_sMaux,n,-shift));
    1462             :     PetscCall(PetscLogGpuFlops(1.0*n));
    1463             :   }
    1464             : #if defined(PETSC_USE_COMPLEX)
    1465             :   PetscCallCUDA(cudaMemcpy(d_Maux,d_Aa,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice));
    1466             :   if (shift) {
    1467             :     PetscCall(shift_diagonal(n,d_Maux,n,-shift));
    1468             :     PetscCall(PetscLogGpuFlops(1.0*n));
    1469             :   }
    1470             : #endif
    1471             :   if (A!=B) PetscCall(MatDenseCUDARestoreArrayRead(A,(const PetscScalar**)&d_Aa));
    1472             :   else PetscCallCUDA(cudaFree(d_Aa));
    1473             : 
    1474             :   /* estimate norm(A) and select the scaling factor */
    1475             :   PetscCallCUBLAS(cublasXnrm2(cublasv2handle,n2,d_sMaux,one,&nrm));
    1476             :   PetscCall(PetscLogGpuFlops(2.0*n*n));
    1477             :   PetscCall(sexpm_params(nrm,&s,&k,&m));
    1478             :   if (s==0 && k==1 && m==0) { /* exp(A) = I+A to eps! */
    1479             :     if (shift) expshift = PetscExpReal(shift);
    1480             :     PetscCall(shift_Cdiagonal(n,d_Maux,n,rone,rzero));
    1481             :     if (shift) {
    1482             :       PetscCallCUBLAS(cublasXscal(cublasv2handle,n2,&expshift,d_sMaux,one));
    1483             :       PetscCall(PetscLogGpuFlops(1.0*(n+n2)));
    1484             :     } else PetscCall(PetscLogGpuFlops(1.0*n));
    1485             :     PetscCallCUDA(cudaMemcpy(d_Ba,d_sMaux,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice));
    1486             :     PetscCallCUDA(cudaFree(d_isreal));
    1487             :     PetscCallCUDA(cudaFree(d_sMaux));
    1488             :     PetscCallCUDA(cudaFree(d_Maux));
    1489             :     PetscCall(MatDenseCUDARestoreArrayWrite(B,&d_Ba));
    1490             :     PetscFunctionReturn(PETSC_SUCCESS); /* quick return */
    1491             :   }
    1492             : 
    1493             :   PetscCallCUDA(cudaMalloc((void **)&d_expmA,sizeof(PetscComplex)*n2));
    1494             :   PetscCallCUDA(cudaMalloc((void **)&d_As,sizeof(PetscComplex)*n2));
    1495             :   PetscCallCUDA(cudaMalloc((void **)&d_RR,sizeof(PetscComplex)*n2));
    1496             :   d_expmA2 = d_expmA; d_RR2 = d_RR;
    1497             :   PetscCall(PetscMalloc1(n,&piv));
    1498             :   /* scale matrix */
    1499             : #if !defined(PETSC_USE_COMPLEX)
    1500             :   PetscCall(copy_array2D_S2C(n,n,d_As,n,d_sMaux,n));
    1501             : #else
    1502             :   PetscCallCUDA(cudaMemcpy(d_As,d_sMaux,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice));
    1503             : #endif
    1504             :   scale = 1.0/PetscPowRealInt(2.0,s);
    1505             :   PetscCallCUBLAS(cublasXCscal(cublasv2handle,n2,(const cuComplex *)&scale,(cuComplex *)d_As,one));
    1506             :   PetscCall(SlepcLogGpuFlopsComplex(1.0*n2));
    1507             : 
    1508             :   /* evaluate Pade approximant (partial fraction or product form) */
    1509             :   if (fn->method==3 || !m) { /* partial fraction */
    1510             :     PetscCall(getcoeffs(k,m,&rsize,&psize,&remainsize,PETSC_TRUE));
    1511             :     PetscCall(PetscBLASIntCast((PetscInt)PetscRealPartComplex(rsize),&irsize));
    1512             :     PetscCall(PetscBLASIntCast((PetscInt)PetscRealPartComplex(psize),&ipsize));
    1513             :     PetscCall(PetscBLASIntCast((PetscInt)PetscRealPartComplex(remainsize),&iremainsize));
    1514             :     PetscCall(PetscMalloc3(irsize,&r,ipsize,&p,iremainsize,&remainterm));
    1515             :     PetscCall(getcoeffs(k,m,r,p,remainterm,PETSC_FALSE));
    1516             : 
    1517             :     PetscCallCUDA(cudaMemset(d_expmA,0,sizeof(PetscComplex)*n2));
    1518             : #if !defined(PETSC_USE_COMPLEX)
    1519             :     isreal = PETSC_TRUE;
    1520             : #else
    1521             :     PetscCall(getisreal_array2D(n,n,d_Maux,n,d_isreal));
    1522             :     PetscCallCUDA(cudaMemcpy(&isreal,d_isreal,sizeof(PetscBool),cudaMemcpyDeviceToHost));
    1523             : #endif
    1524             :     if (isreal) {
    1525             :       rsizediv2 = irsize/2;
    1526             :       for (i=0;i<rsizediv2;i++) { /* use partial fraction to get R(As) */
    1527             :         PetscCallCUDA(cudaMemcpy(d_Maux,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice));
    1528             :         PetscCallCUDA(cudaMemset(d_RR,0,sizeof(PetscComplex)*n2));
    1529             :         PetscCall(shift_Cdiagonal(n,d_Maux,n,-PetscRealPartComplex(p[2*i]),-PetscImaginaryPartComplex(p[2*i])));
    1530             :         PetscCall(set_Cdiagonal(n,d_RR,n,PetscRealPartComplex(r[2*i]),PetscImaginaryPartComplex(r[2*i])));
    1531             :         PetscCallMAGMA(magma_Cgesv_gpu,n,n,d_Maux,n,piv,d_RR,n);
    1532             :         PetscCall(add_array2D_Conj(n,n,d_RR,n));
    1533             :         PetscCallCUBLAS(cublasXCaxpy(cublasv2handle,n2,&cone,d_RR,one,d_expmA,one));
    1534             :         /* shift(n) + gesv + axpy(n2) */
    1535             :         PetscCall(SlepcLogGpuFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+2.0*n2));
    1536             :       }
    1537             : 
    1538             :       mod = ipsize % 2;
    1539             :       if (mod) {
    1540             :         PetscCallCUDA(cudaMemcpy(d_Maux,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice));
    1541             :         PetscCallCUDA(cudaMemset(d_RR,0,sizeof(PetscComplex)*n2));
    1542             :         PetscCall(shift_Cdiagonal(n,d_Maux,n,-PetscRealPartComplex(p[ipsize-1]),-PetscImaginaryPartComplex(p[ipsize-1])));
    1543             :         PetscCall(set_Cdiagonal(n,d_RR,n,PetscRealPartComplex(r[irsize-1]),PetscImaginaryPartComplex(r[irsize-1])));
    1544             :         PetscCallMAGMA(magma_Cgesv_gpu,n,n,d_Maux,n,piv,d_RR,n);
    1545             :         PetscCallCUBLAS(cublasXCaxpy(cublasv2handle,n2,&cone,d_RR,one,d_expmA,one));
    1546             :         PetscCall(SlepcLogGpuFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+1.0*n2));
    1547             :       }
    1548             :     } else { /* complex */
    1549             :       for (i=0;i<irsize;i++) { /* use partial fraction to get R(As) */
    1550             :         PetscCallCUDA(cudaMemcpy(d_Maux,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice));
    1551             :         PetscCallCUDA(cudaMemset(d_RR,0,sizeof(PetscComplex)*n2));
    1552             :         PetscCall(shift_Cdiagonal(n,d_Maux,n,-PetscRealPartComplex(p[i]),-PetscImaginaryPartComplex(p[i])));
    1553             :         PetscCall(set_Cdiagonal(n,d_RR,n,PetscRealPartComplex(r[i]),PetscImaginaryPartComplex(r[i])));
    1554             :         PetscCallMAGMA(magma_Cgesv_gpu,n,n,d_Maux,n,piv,d_RR,n);
    1555             :         PetscCallCUBLAS(cublasXCaxpy(cublasv2handle,n2,&cone,d_RR,one,d_expmA,one));
    1556             :         PetscCall(SlepcLogGpuFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+1.0*n2));
    1557             :       }
    1558             :     }
    1559             :     for (i=0;i<iremainsize;i++) {
    1560             :       if (!i) {
    1561             :         PetscCallCUDA(cudaMemset(d_RR,0,sizeof(PetscComplex)*n2));
    1562             :         PetscCall(set_Cdiagonal(n,d_RR,n,PetscRealPartComplex(remainterm[iremainsize-1]),PetscImaginaryPartComplex(remainterm[iremainsize-1])));
    1563             :       } else {
    1564             :         PetscCallCUDA(cudaMemcpy(d_RR,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice));
    1565             :         for (j=1;j<i;j++) {
    1566             :           PetscCallCUBLAS(cublasXCgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&cone,d_RR,n,d_RR,n,&czero,d_Maux,n));
    1567             :           SWAP(d_RR,d_Maux,aux);
    1568             :           PetscCall(SlepcLogGpuFlopsComplex(2.0*n*n*n));
    1569             :         }
    1570             :         PetscCallCUBLAS(cublasXCscal(cublasv2handle,n2,&remainterm[iremainsize-1-i],d_RR,one));
    1571             :         PetscCall(SlepcLogGpuFlopsComplex(1.0*n2));
    1572             :       }
    1573             :       PetscCallCUBLAS(cublasXCaxpy(cublasv2handle,n2,&cone,d_RR,one,d_expmA,one));
    1574             :       PetscCall(SlepcLogGpuFlopsComplex(1.0*n2));
    1575             :     }
    1576             :     PetscCall(PetscFree3(r,p,remainterm));
    1577             :   } else { /* product form, default */
    1578             :     PetscCall(getcoeffsproduct(k,m,&rsize,&psize,&mult,PETSC_TRUE));
    1579             :     PetscCall(PetscBLASIntCast((PetscInt)PetscRealPartComplex(rsize),&irsize));
    1580             :     PetscCall(PetscBLASIntCast((PetscInt)PetscRealPartComplex(psize),&ipsize));
    1581             :     PetscCall(PetscMalloc2(irsize,&rootp,ipsize,&rootq));
    1582             :     PetscCall(getcoeffsproduct(k,m,rootp,rootq,&mult,PETSC_FALSE));
    1583             : 
    1584             :     PetscCallCUDA(cudaMemset(d_expmA,0,sizeof(PetscComplex)*n2));
    1585             :     PetscCall(set_Cdiagonal(n,d_expmA,n,rone,rzero)); /* initialize */
    1586             :     minlen = PetscMin(irsize,ipsize);
    1587             :     for (i=0;i<minlen;i++) {
    1588             :       PetscCallCUDA(cudaMemcpy(d_RR,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice));
    1589             :       PetscCall(shift_Cdiagonal(n,d_RR,n,-PetscRealPartComplex(rootp[i]),-PetscImaginaryPartComplex(rootp[i])));
    1590             :       PetscCallCUBLAS(cublasXCgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&cone,d_RR,n,d_expmA,n,&czero,d_Maux,n));
    1591             :       SWAP(d_expmA,d_Maux,aux);
    1592             :       PetscCallCUDA(cudaMemcpy(d_RR,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice));
    1593             :       PetscCall(shift_Cdiagonal(n,d_RR,n,-PetscRealPartComplex(rootq[i]),-PetscImaginaryPartComplex(rootq[i])));
    1594             :       PetscCallMAGMA(magma_Cgesv_gpu,n,n,d_RR,n,piv,d_expmA,n);
    1595             :       /* shift(n) + gemm + shift(n) + gesv */
    1596             :       PetscCall(SlepcLogGpuFlopsComplex(1.0*n+(2.0*n*n*n)+1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)));
    1597             :     }
    1598             :     /* extra enumerator */
    1599             :     for (i=minlen;i<irsize;i++) {
    1600             :       PetscCallCUDA(cudaMemcpy(d_RR,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice));
    1601             :       PetscCall(shift_Cdiagonal(n,d_RR,n,-PetscRealPartComplex(rootp[i]),-PetscImaginaryPartComplex(rootp[i])));
    1602             :       PetscCallCUBLAS(cublasXCgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&cone,d_RR,n,d_expmA,n,&czero,d_Maux,n));
    1603             :       SWAP(d_expmA,d_Maux,aux);
    1604             :       PetscCall(SlepcLogGpuFlopsComplex(1.0*n+2.0*n*n*n));
    1605             :     }
    1606             :     /* extra denominator */
    1607             :     for (i=minlen;i<ipsize;i++) {
    1608             :       PetscCallCUDA(cudaMemcpy(d_RR,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice));
    1609             :       PetscCall(shift_Cdiagonal(n,d_RR,n,-PetscRealPartComplex(rootq[i]),-PetscImaginaryPartComplex(rootq[i])));
    1610             :       PetscCallMAGMA(magma_Cgesv_gpu,n,n,d_RR,n,piv,d_expmA,n);
    1611             :       PetscCall(SlepcLogGpuFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)));
    1612             :     }
    1613             :     PetscCallCUBLAS(cublasXCscal(cublasv2handle,n2,&mult,d_expmA,one));
    1614             :     PetscCall(SlepcLogGpuFlopsComplex(1.0*n2));
    1615             :     PetscCall(PetscFree2(rootp,rootq));
    1616             :   }
    1617             : 
    1618             : #if !defined(PETSC_USE_COMPLEX)
    1619             :   PetscCall(copy_array2D_C2S(n,n,d_Ba2,n,d_expmA,n));
    1620             : #else
    1621             :   PetscCallCUDA(cudaMemcpy(d_Ba2,d_expmA,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice));
    1622             : #endif
    1623             : 
    1624             :   /* perform repeated squaring */
    1625             :   for (i=0;i<s;i++) { /* final squaring */
    1626             :     PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_Ba2,n,d_Ba2,n,&szero,d_sMaux,n));
    1627             :     SWAP(d_Ba2,d_sMaux,saux);
    1628             :     PetscCall(PetscLogGpuFlops(2.0*n*n*n));
    1629             :   }
    1630             :   if (d_Ba2!=d_Ba) {
    1631             :     PetscCallCUDA(cudaMemcpy(d_Ba,d_Ba2,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice));
    1632             :     d_sMaux = d_Ba2;
    1633             :   }
    1634             :   if (shift) {
    1635             :     expshift = PetscExpReal(shift);
    1636             :     PetscCallCUBLAS(cublasXscal(cublasv2handle,n2,&expshift,d_Ba,one));
    1637             :     PetscCall(PetscLogGpuFlops(1.0*n2));
    1638             :   }
    1639             : 
    1640             :   PetscCall(PetscLogGpuTimeEnd());
    1641             : 
    1642             :   /* restore pointers */
    1643             :   d_Maux = d_Maux2; d_expmA = d_expmA2; d_RR = d_RR2;
    1644             :   PetscCall(MatDenseCUDARestoreArrayWrite(B,&d_Ba));
    1645             :   PetscCallCUDA(cudaFree(d_isreal));
    1646             :   PetscCallCUDA(cudaFree(d_sMaux));
    1647             :   PetscCallCUDA(cudaFree(d_Maux));
    1648             :   PetscCallCUDA(cudaFree(d_expmA));
    1649             :   PetscCallCUDA(cudaFree(d_As));
    1650             :   PetscCallCUDA(cudaFree(d_RR));
    1651             :   PetscCall(PetscFree(piv));
    1652             :   PetscFunctionReturn(PETSC_SUCCESS);
    1653             : }
    1654             : #endif /* PETSC_HAVE_MAGMA */
    1655             : #endif /* PETSC_HAVE_CUDA */
    1656             : 
    1657          29 : static PetscErrorCode FNView_Exp(FN fn,PetscViewer viewer)
    1658             : {
    1659          29 :   PetscBool      isascii;
    1660          29 :   char           str[50];
    1661          29 :   const char     *methodname[] = {
    1662             :                   "scaling & squaring, [m/m] Pade approximant (Higham)",
    1663             :                   "scaling & squaring, [6/6] Pade approximant",
    1664             :                   "scaling & squaring, subdiagonal Pade approximant (product form)",
    1665             :                   "scaling & squaring, subdiagonal Pade approximant (partial fraction)"
    1666             :   };
    1667          29 :   const int      nmeth=PETSC_STATIC_ARRAY_LENGTH(methodname);
    1668             : 
    1669          29 :   PetscFunctionBegin;
    1670          29 :   PetscCall(PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&isascii));
    1671          29 :   if (isascii) {
    1672          29 :     if (fn->beta==(PetscScalar)1.0) {
    1673          24 :       if (fn->alpha==(PetscScalar)1.0) PetscCall(PetscViewerASCIIPrintf(viewer,"  exponential: exp(x)\n"));
    1674             :       else {
    1675          10 :         PetscCall(SlepcSNPrintfScalar(str,sizeof(str),fn->alpha,PETSC_TRUE));
    1676          10 :         PetscCall(PetscViewerASCIIPrintf(viewer,"  exponential: exp(%s*x)\n",str));
    1677             :       }
    1678             :     } else {
    1679           5 :       PetscCall(SlepcSNPrintfScalar(str,sizeof(str),fn->beta,PETSC_TRUE));
    1680           5 :       if (fn->alpha==(PetscScalar)1.0) PetscCall(PetscViewerASCIIPrintf(viewer,"  exponential: %s*exp(x)\n",str));
    1681             :       else {
    1682           5 :         PetscCall(PetscViewerASCIIPrintf(viewer,"  exponential: %s",str));
    1683           5 :         PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE));
    1684           5 :         PetscCall(SlepcSNPrintfScalar(str,sizeof(str),fn->alpha,PETSC_TRUE));
    1685           5 :         PetscCall(PetscViewerASCIIPrintf(viewer,"*exp(%s*x)\n",str));
    1686           5 :         PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE));
    1687             :       }
    1688             :     }
    1689          29 :     if (fn->method<nmeth) PetscCall(PetscViewerASCIIPrintf(viewer,"  computing matrix functions with: %s\n",methodname[fn->method]));
    1690             :   }
    1691          29 :   PetscFunctionReturn(PETSC_SUCCESS);
    1692             : }
    1693             : 
    1694          93 : SLEPC_EXTERN PetscErrorCode FNCreate_Exp(FN fn)
    1695             : {
    1696          93 :   PetscFunctionBegin;
    1697          93 :   fn->ops->evaluatefunction       = FNEvaluateFunction_Exp;
    1698          93 :   fn->ops->evaluatederivative     = FNEvaluateDerivative_Exp;
    1699          93 :   fn->ops->evaluatefunctionmat[0] = FNEvaluateFunctionMat_Exp_Higham;
    1700          93 :   fn->ops->evaluatefunctionmat[1] = FNEvaluateFunctionMat_Exp_Pade;
    1701          93 :   fn->ops->evaluatefunctionmat[2] = FNEvaluateFunctionMat_Exp_GuettelNakatsukasa; /* product form */
    1702          93 :   fn->ops->evaluatefunctionmat[3] = FNEvaluateFunctionMat_Exp_GuettelNakatsukasa; /* partial fraction */
    1703             : #if defined(PETSC_HAVE_CUDA)
    1704             :   fn->ops->evaluatefunctionmatcuda[1] = FNEvaluateFunctionMat_Exp_Pade_CUDA;
    1705             : #if defined(PETSC_HAVE_MAGMA)
    1706             :   fn->ops->evaluatefunctionmatcuda[0] = FNEvaluateFunctionMat_Exp_Higham_CUDAm;
    1707             :   fn->ops->evaluatefunctionmatcuda[1] = FNEvaluateFunctionMat_Exp_Pade_CUDAm;
    1708             :   fn->ops->evaluatefunctionmatcuda[2] = FNEvaluateFunctionMat_Exp_GuettelNakatsukasa_CUDAm; /* product form */
    1709             :   fn->ops->evaluatefunctionmatcuda[3] = FNEvaluateFunctionMat_Exp_GuettelNakatsukasa_CUDAm; /* partial fraction */
    1710             : #endif
    1711             : #endif
    1712          93 :   fn->ops->view                   = FNView_Exp;
    1713          93 :   PetscFunctionReturn(PETSC_SUCCESS);
    1714             : }

Generated by: LCOV version 1.14