LCOV - code coverage report
Current view: top level - sys/classes/fn/impls/exp - fnexp.c (source / functions) Hit Total Coverage
Test: SLEPc Lines: 608 668 91.0 %
Date: 2024-12-18 00:51:33 Functions: 13 13 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
       3             :    SLEPc - Scalable Library for Eigenvalue Problem Computations
       4             :    Copyright (c) 2002-, Universitat Politecnica de Valencia, Spain
       5             : 
       6             :    This file is part of SLEPc.
       7             :    SLEPc is distributed under a 2-clause BSD license (see LICENSE).
       8             :    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
       9             : */
      10             : /*
      11             :    Exponential function  exp(x)
      12             : */
      13             : 
      14             : #include <slepc/private/fnimpl.h>      /*I "slepcfn.h" I*/
      15             : #include <slepcblaslapack.h>
      16             : 
      17        3453 : static PetscErrorCode FNEvaluateFunction_Exp(FN fn,PetscScalar x,PetscScalar *y)
      18             : {
      19        3453 :   PetscFunctionBegin;
      20        3453 :   *y = PetscExpScalar(x);
      21        3453 :   PetscFunctionReturn(PETSC_SUCCESS);
      22             : }
      23             : 
      24        1075 : static PetscErrorCode FNEvaluateDerivative_Exp(FN fn,PetscScalar x,PetscScalar *y)
      25             : {
      26        1075 :   PetscFunctionBegin;
      27        1075 :   *y = PetscExpScalar(x);
      28        1075 :   PetscFunctionReturn(PETSC_SUCCESS);
      29             : }
      30             : 
      31             : #define MAX_PADE 6
      32             : 
      33          16 : static PetscErrorCode FNEvaluateFunctionMat_Exp_Pade(FN fn,Mat A,Mat B)
      34             : {
      35          16 :   PetscBLASInt      n=0,ld,ld2,*ipiv,info,inc=1;
      36          16 :   PetscInt          m,j,k,sexp;
      37          16 :   PetscBool         odd;
      38          16 :   const PetscInt    p=MAX_PADE;
      39          16 :   PetscReal         c[MAX_PADE+1],s,*rwork;
      40          16 :   PetscScalar       scale,mone=-1.0,one=1.0,two=2.0,zero=0.0;
      41          16 :   PetscScalar       *Ba,*As,*A2,*Q,*P,*W,*aux;
      42          16 :   const PetscScalar *Aa;
      43             : 
      44          16 :   PetscFunctionBegin;
      45          16 :   PetscCall(MatDenseGetArrayRead(A,&Aa));
      46          16 :   PetscCall(MatDenseGetArray(B,&Ba));
      47          16 :   PetscCall(MatGetSize(A,&m,NULL));
      48          16 :   PetscCall(PetscBLASIntCast(m,&n));
      49          16 :   ld  = n;
      50          16 :   ld2 = ld*ld;
      51          16 :   P   = Ba;
      52          16 :   PetscCall(PetscMalloc6(m*m,&Q,m*m,&W,m*m,&As,m*m,&A2,ld,&rwork,ld,&ipiv));
      53          16 :   PetscCall(PetscArraycpy(As,Aa,ld2));
      54             : 
      55             :   /* Pade' coefficients */
      56          16 :   c[0] = 1.0;
      57         112 :   for (k=1;k<=p;k++) c[k] = c[k-1]*(p+1-k)/(k*(2*p+1-k));
      58             : 
      59             :   /* Scaling */
      60          16 :   s = LAPACKlange_("I",&n,&n,As,&ld,rwork);
      61          16 :   PetscCall(PetscLogFlops(1.0*n*n));
      62          16 :   if (s>0.5) {
      63          16 :     sexp = PetscMax(0,(int)(PetscLogReal(s)/PetscLogReal(2.0))+2);
      64          16 :     scale = PetscPowRealInt(2.0,-sexp);
      65          16 :     PetscCallBLAS("BLASscal",BLASscal_(&ld2,&scale,As,&inc));
      66          16 :     PetscCall(PetscLogFlops(1.0*n*n));
      67             :   } else sexp = 0;
      68             : 
      69             :   /* Horner evaluation */
      70          16 :   PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,As,&ld,As,&ld,&zero,A2,&ld));
      71          16 :   PetscCall(PetscLogFlops(2.0*n*n*n));
      72          16 :   PetscCall(PetscArrayzero(Q,ld2));
      73          16 :   PetscCall(PetscArrayzero(P,ld2));
      74         616 :   for (j=0;j<n;j++) {
      75         600 :     Q[j+j*ld] = c[p];
      76         600 :     P[j+j*ld] = c[p-1];
      77             :   }
      78             : 
      79             :   odd = PETSC_TRUE;
      80          96 :   for (k=p-1;k>0;k--) {
      81          80 :     if (odd) {
      82          48 :       PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,Q,&ld,A2,&ld,&zero,W,&ld));
      83          48 :       SlepcSwap(Q,W,aux);
      84        1848 :       for (j=0;j<n;j++) Q[j+j*ld] += c[k-1];
      85             :       odd = PETSC_FALSE;
      86             :     } else {
      87          32 :       PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,A2,&ld,&zero,W,&ld));
      88          32 :       SlepcSwap(P,W,aux);
      89        1232 :       for (j=0;j<n;j++) P[j+j*ld] += c[k-1];
      90             :       odd = PETSC_TRUE;
      91             :     }
      92          80 :     PetscCall(PetscLogFlops(2.0*n*n*n));
      93             :   }
      94             :   /*if (odd) {
      95             :     PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,Q,&ld,As,&ld,&zero,W,&ld));
      96             :     SlepcSwap(Q,W,aux);
      97             :     PetscCallBLAS("BLASaxpy",BLASaxpy_(&ld2,&mone,P,&inc,Q,&inc));
      98             :     PetscCallBLAS("LAPACKgesv",LAPACKgesv_(&n,&n,Q,&ld,ipiv,P,&ld,&info));
      99             :     SlepcCheckLapackInfo("gesv",info);
     100             :     PetscCallBLAS("BLASscal",BLASscal_(&ld2,&two,P,&inc));
     101             :     for (j=0;j<n;j++) P[j+j*ld] += 1.0;
     102             :     PetscCallBLAS("BLASscal",BLASscal_(&ld2,&mone,P,&inc));
     103             :   } else {*/
     104          16 :     PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,As,&ld,&zero,W,&ld));
     105          16 :     SlepcSwap(P,W,aux);
     106          16 :     PetscCallBLAS("BLASaxpy",BLASaxpy_(&ld2,&mone,P,&inc,Q,&inc));
     107          16 :     PetscCallBLAS("LAPACKgesv",LAPACKgesv_(&n,&n,Q,&ld,ipiv,P,&ld,&info));
     108          16 :     SlepcCheckLapackInfo("gesv",info);
     109          16 :     PetscCallBLAS("BLASscal",BLASscal_(&ld2,&two,P,&inc));
     110         616 :     for (j=0;j<n;j++) P[j+j*ld] += 1.0;
     111             :   /*}*/
     112          16 :   PetscCall(PetscLogFlops(2.0*n*n*n+2.0*n*n*n/3.0+4.0*n*n));
     113             : 
     114          68 :   for (k=1;k<=sexp;k++) {
     115          52 :     PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,P,&ld,&zero,W,&ld));
     116          52 :     PetscCall(PetscArraycpy(P,W,ld2));
     117             :   }
     118          16 :   if (P!=Ba) PetscCall(PetscArraycpy(Ba,P,ld2));
     119          16 :   PetscCall(PetscLogFlops(2.0*n*n*n*sexp));
     120             : 
     121          16 :   PetscCall(PetscFree6(Q,W,As,A2,rwork,ipiv));
     122          16 :   PetscCall(MatDenseRestoreArrayRead(A,&Aa));
     123          16 :   PetscCall(MatDenseRestoreArray(B,&Ba));
     124          16 :   PetscFunctionReturn(PETSC_SUCCESS);
     125             : }
     126             : 
     127             : #if defined(PETSC_HAVE_COMPLEX)
     128             : /*
     129             :  * Set scaling factor (s) and Pade degree (k,m)
     130             :  */
     131          48 : static PetscErrorCode sexpm_params(PetscReal nrm,PetscInt *s,PetscInt *k,PetscInt *m)
     132             : {
     133          48 :   PetscFunctionBegin;
     134          48 :   if (nrm>1) {
     135          32 :     if      (nrm<200)  {*s = 4; *k = 5; *m = *k-1;}
     136           4 :     else if (nrm<1e4)  {*s = 4; *k = 4; *m = *k+1;}
     137           0 :     else if (nrm<1e6)  {*s = 4; *k = 3; *m = *k+1;}
     138           0 :     else if (nrm<1e9)  {*s = 3; *k = 3; *m = *k+1;}
     139           0 :     else if (nrm<1e11) {*s = 2; *k = 3; *m = *k+1;}
     140           0 :     else if (nrm<1e12) {*s = 2; *k = 2; *m = *k+1;}
     141           0 :     else if (nrm<1e14) {*s = 2; *k = 1; *m = *k+1;}
     142           0 :     else               {*s = 1; *k = 1; *m = *k+1;}
     143             :   } else { /* nrm<1 */
     144          16 :     if       (nrm>0.5)  {*s = 4; *k = 4; *m = *k-1;}
     145          12 :     else  if (nrm>0.3)  {*s = 3; *k = 4; *m = *k-1;}
     146           8 :     else  if (nrm>0.15) {*s = 2; *k = 4; *m = *k-1;}
     147           8 :     else  if (nrm>0.07) {*s = 1; *k = 4; *m = *k-1;}
     148           8 :     else  if (nrm>0.01) {*s = 0; *k = 4; *m = *k-1;}
     149           8 :     else  if (nrm>3e-4) {*s = 0; *k = 3; *m = *k-1;}
     150           8 :     else  if (nrm>1e-5) {*s = 0; *k = 3; *m = 0;}
     151           8 :     else  if (nrm>1e-8) {*s = 0; *k = 2; *m = 0;}
     152           8 :     else                {*s = 0; *k = 1; *m = 0;}
     153             :   }
     154          48 :   PetscFunctionReturn(PETSC_SUCCESS);
     155             : }
     156             : 
     157             : /*
     158             :  * Partial fraction form coefficients.
     159             :  * If query, the function returns the size necessary to store the coefficients.
     160             :  */
     161          40 : static PetscErrorCode getcoeffs(PetscInt k,PetscInt m,PetscComplex *r,PetscComplex *q,PetscComplex *remain,PetscBool query)
     162             : {
     163          40 :   PetscInt i;
     164          40 :   const PetscComplex /* m == k+1 */
     165          40 :     p1r4[5] = {-1.582680186458572e+01 - 2.412564578224361e+01*PETSC_i,
     166          40 :                -1.582680186458572e+01 + 2.412564578224361e+01*PETSC_i,
     167          40 :                 1.499984465975511e+02 + 6.804227952202417e+01*PETSC_i,
     168          40 :                 1.499984465975511e+02 - 6.804227952202417e+01*PETSC_i,
     169             :                -2.733432894659307e+02                                },
     170          40 :     p1q4[5] = { 3.655694325463550e+00 + 6.543736899360086e+00*PETSC_i,
     171          40 :                 3.655694325463550e+00 - 6.543736899360086e+00*PETSC_i,
     172          40 :                 5.700953298671832e+00 + 3.210265600308496e+00*PETSC_i,
     173          40 :                 5.700953298671832e+00 - 3.210265600308496e+00*PETSC_i,
     174             :                 6.286704751729261e+00                               },
     175          40 :     p1r3[4] = {-1.130153999597152e+01 + 1.247167585025031e+01*PETSC_i,
     176          40 :                -1.130153999597152e+01 - 1.247167585025031e+01*PETSC_i,
     177          40 :                 1.330153999597152e+01 - 6.007173273704750e+01*PETSC_i,
     178          40 :                 1.330153999597152e+01 + 6.007173273704750e+01*PETSC_i},
     179          40 :     p1q3[4] = { 3.212806896871536e+00 + 4.773087433276636e+00*PETSC_i,
     180          40 :                 3.212806896871536e+00 - 4.773087433276636e+00*PETSC_i,
     181          40 :                 4.787193103128464e+00 + 1.567476416895212e+00*PETSC_i,
     182          40 :                 4.787193103128464e+00 - 1.567476416895212e+00*PETSC_i},
     183          40 :     p1r2[3] = { 7.648749087422928e+00 + 4.171640244747463e+00*PETSC_i,
     184          40 :                 7.648749087422928e+00 - 4.171640244747463e+00*PETSC_i,
     185             :                -1.829749817484586e+01                                },
     186          40 :     p1q2[3] = { 2.681082873627756e+00 + 3.050430199247411e+00*PETSC_i,
     187          40 :                 2.681082873627756e+00 - 3.050430199247411e+00*PETSC_i,
     188             :                 3.637834252744491e+00                                },
     189          40 :     p1r1[2] = { 1.000000000000000e+00 - 3.535533905932738e+00*PETSC_i,
     190          40 :                 1.000000000000000e+00 + 3.535533905932738e+00*PETSC_i},
     191          40 :     p1q1[2] = { 2.000000000000000e+00 + 1.414213562373095e+00*PETSC_i,
     192          40 :                 2.000000000000000e+00 - 1.414213562373095e+00*PETSC_i};
     193          40 :   const PetscComplex /* m == k-1 */
     194          40 :     m1r5[4] = {-1.423367961376821e+02 - 1.385465094833037e+01*PETSC_i,
     195          40 :                -1.423367961376821e+02 + 1.385465094833037e+01*PETSC_i,
     196          40 :                 2.647367961376822e+02 - 4.814394493714596e+02*PETSC_i,
     197          40 :                 2.647367961376822e+02 + 4.814394493714596e+02*PETSC_i},
     198          40 :     m1q5[4] = { 5.203941240131764e+00 + 5.805856841805367e+00*PETSC_i,
     199          40 :                 5.203941240131764e+00 - 5.805856841805367e+00*PETSC_i,
     200          40 :                 6.796058759868242e+00 + 1.886649260140217e+00*PETSC_i,
     201          40 :                 6.796058759868242e+00 - 1.886649260140217e+00*PETSC_i},
     202          40 :     m1r4[3] = { 2.484269593165883e+01 + 7.460342395992306e+01*PETSC_i,
     203          40 :                 2.484269593165883e+01 - 7.460342395992306e+01*PETSC_i,
     204             :                -1.734353918633177e+02                                },
     205          40 :     m1q4[3] = { 4.675757014491557e+00 + 3.913489560603711e+00*PETSC_i,
     206          40 :                 4.675757014491557e+00 - 3.913489560603711e+00*PETSC_i,
     207             :                 5.648485971016893e+00                                },
     208          40 :     m1r3[2] = { 2.533333333333333e+01 - 2.733333333333333e+01*PETSC_i,
     209          40 :                 2.533333333333333e+01 + 2.733333333333333e+01*PETSC_i},
     210          40 :     m1q3[2] = { 4.000000000000000e+00 + 2.000000000000000e+00*PETSC_i,
     211          40 :                 4.000000000000000e+00 - 2.000000000000000e+00*PETSC_i};
     212          40 :   const PetscScalar /* m == k-1 */
     213          40 :     m1remain5[2] = { 2.000000000000000e-01,  9.800000000000000e+00},
     214          40 :     m1remain4[2] = {-2.500000000000000e-01, -7.750000000000000e+00},
     215          40 :     m1remain3[2] = { 3.333333333333333e-01,  5.666666666666667e+00},
     216          40 :     m1remain2[2] = {-0.5,                   -3.5},
     217          40 :     remain3[4] = {1.0/6.0, 1.0/2.0, 1, 1},
     218          40 :     remain2[3] = {1.0/2.0, 1, 1};
     219             : 
     220          40 :   PetscFunctionBegin;
     221          40 :   if (query) { /* query about buffer's size */
     222          20 :     if (m==k+1) {
     223           2 :       *remain = 0;
     224           2 :       *r = *q = k+1;
     225           2 :       PetscFunctionReturn(PETSC_SUCCESS); /* quick return */
     226             :     }
     227          18 :     if (m==k-1) {
     228          18 :       *remain = 2;
     229          18 :       if (k==5) *r = *q = 4;
     230           4 :       else if (k==4) *r = *q = 3;
     231           0 :       else if (k==3) *r = *q = 2;
     232           0 :       else if (k==2) *r = *q = 1;
     233             :     }
     234          18 :     if (m==0) {
     235           0 :       *r = *q = 0;
     236           0 :       *remain = k+1;
     237             :     }
     238             :   } else {
     239          20 :     if (m==k+1) {
     240           2 :       if (k==4) {
     241          12 :         for (i=0;i<5;i++) { r[i] = p1r4[i]; q[i] = p1q4[i]; }
     242           0 :       } else if (k==3) {
     243           0 :         for (i=0;i<4;i++) { r[i] = p1r3[i]; q[i] = p1q3[i]; }
     244           0 :       } else if (k==2) {
     245           0 :         for (i=0;i<3;i++) { r[i] = p1r2[i]; q[i] = p1q2[i]; }
     246           0 :       } else if (k==1) {
     247           0 :         for (i=0;i<2;i++) { r[i] = p1r1[i]; q[i] = p1q1[i]; }
     248             :       }
     249           2 :       PetscFunctionReturn(PETSC_SUCCESS); /* quick return */
     250             :     }
     251          18 :     if (m==k-1) {
     252          18 :       if (k==5) {
     253          70 :         for (i=0;i<4;i++) { r[i] = m1r5[i]; q[i] = m1q5[i]; }
     254          42 :         for (i=0;i<2;i++) remain[i] = m1remain5[i];
     255           4 :       } else if (k==4) {
     256          16 :         for (i=0;i<3;i++) { r[i] = m1r4[i]; q[i] = m1q4[i]; }
     257          12 :         for (i=0;i<2;i++) remain[i] = m1remain4[i];
     258           0 :       } else if (k==3) {
     259           0 :         for (i=0;i<2;i++) { r[i] = m1r3[i]; q[i] = m1q3[i]; remain[i] = m1remain3[i]; }
     260           0 :       } else if (k==2) {
     261           0 :         r[0] = -13.5; q[0] = 3;
     262           0 :         for (i=0;i<2;i++) remain[i] = m1remain2[i];
     263             :       }
     264             :     }
     265          18 :     if (m==0) {
     266           0 :       r = q = NULL;
     267           0 :       if (k==3) {
     268           0 :         for (i=0;i<4;i++) remain[i] = remain3[i];
     269           0 :       } else if (k==2) {
     270           0 :         for (i=0;i<3;i++) remain[i] = remain2[i];
     271             :       }
     272             :     }
     273             :   }
     274          36 :   PetscFunctionReturn(PETSC_SUCCESS);
     275             : }
     276             : 
     277             : /*
     278             :  * Product form coefficients.
     279             :  * If query, the function returns the size necessary to store the coefficients.
     280             :  */
     281          40 : static PetscErrorCode getcoeffsproduct(PetscInt k,PetscInt m,PetscComplex *p,PetscComplex *q,PetscComplex *mult,PetscBool query)
     282             : {
     283          40 :   PetscInt i;
     284          40 :   const PetscComplex /* m == k+1 */
     285          40 :   p1p4[4] = {-5.203941240131764e+00 + 5.805856841805367e+00*PETSC_i,
     286          40 :              -5.203941240131764e+00 - 5.805856841805367e+00*PETSC_i,
     287          40 :              -6.796058759868242e+00 + 1.886649260140217e+00*PETSC_i,
     288          40 :              -6.796058759868242e+00 - 1.886649260140217e+00*PETSC_i},
     289          40 :   p1q4[5] = { 3.655694325463550e+00 + 6.543736899360086e+00*PETSC_i,
     290          40 :               3.655694325463550e+00 - 6.543736899360086e+00*PETSC_i,
     291             :               6.286704751729261e+00                                ,
     292          40 :               5.700953298671832e+00 + 3.210265600308496e+00*PETSC_i,
     293          40 :               5.700953298671832e+00 - 3.210265600308496e+00*PETSC_i},
     294          40 :   p1p3[3] = {-4.675757014491557e+00 + 3.913489560603711e+00*PETSC_i,
     295          40 :              -4.675757014491557e+00 - 3.913489560603711e+00*PETSC_i,
     296             :              -5.648485971016893e+00                                },
     297          40 :   p1q3[4] = { 3.212806896871536e+00 + 4.773087433276636e+00*PETSC_i,
     298          40 :               3.212806896871536e+00 - 4.773087433276636e+00*PETSC_i,
     299          40 :               4.787193103128464e+00 + 1.567476416895212e+00*PETSC_i,
     300          40 :               4.787193103128464e+00 - 1.567476416895212e+00*PETSC_i},
     301          40 :   p1p2[2] = {-4.00000000000000e+00  + 2.000000000000000e+00*PETSC_i,
     302          40 :              -4.00000000000000e+00  - 2.000000000000000e+00*PETSC_i},
     303          40 :   p1q2[3] = { 2.681082873627756e+00 + 3.050430199247411e+00*PETSC_i,
     304          40 :               2.681082873627756e+00 - 3.050430199247411e+00*PETSC_i,
     305             :               3.637834252744491e+00                               },
     306          40 :   p1q1[2] = { 2.000000000000000e+00 + 1.414213562373095e+00*PETSC_i,
     307          40 :               2.000000000000000e+00 - 1.414213562373095e+00*PETSC_i};
     308          40 :   const PetscComplex /* m == k-1 */
     309          40 :   m1p5[5] = {-3.655694325463550e+00 + 6.543736899360086e+00*PETSC_i,
     310          40 :              -3.655694325463550e+00 - 6.543736899360086e+00*PETSC_i,
     311             :              -6.286704751729261e+00                                ,
     312          40 :              -5.700953298671832e+00 + 3.210265600308496e+00*PETSC_i,
     313          40 :              -5.700953298671832e+00 - 3.210265600308496e+00*PETSC_i},
     314          40 :   m1q5[4] = { 5.203941240131764e+00 + 5.805856841805367e+00*PETSC_i,
     315          40 :               5.203941240131764e+00 - 5.805856841805367e+00*PETSC_i,
     316          40 :               6.796058759868242e+00 + 1.886649260140217e+00*PETSC_i,
     317          40 :               6.796058759868242e+00 - 1.886649260140217e+00*PETSC_i},
     318          40 :   m1p4[4] = {-3.212806896871536e+00 + 4.773087433276636e+00*PETSC_i,
     319          40 :              -3.212806896871536e+00 - 4.773087433276636e+00*PETSC_i,
     320          40 :              -4.787193103128464e+00 + 1.567476416895212e+00*PETSC_i,
     321          40 :              -4.787193103128464e+00 - 1.567476416895212e+00*PETSC_i},
     322          40 :   m1q4[3] = { 4.675757014491557e+00 + 3.913489560603711e+00*PETSC_i,
     323          40 :               4.675757014491557e+00 - 3.913489560603711e+00*PETSC_i,
     324             :               5.648485971016893e+00                                },
     325          40 :   m1p3[3] = {-2.681082873627756e+00 + 3.050430199247411e+00*PETSC_i,
     326          40 :              -2.681082873627756e+00 - 3.050430199247411e+00*PETSC_i,
     327             :              -3.637834252744491e+00                                },
     328          40 :   m1q3[2] = { 4.000000000000000e+00 + 2.000000000000000e+00*PETSC_i,
     329          40 :               4.000000000000000e+00 - 2.000000000000001e+00*PETSC_i},
     330          40 :   m1p2[2] = {-2.000000000000000e+00 + 1.414213562373095e+00*PETSC_i,
     331          40 :              -2.000000000000000e+00 - 1.414213562373095e+00*PETSC_i};
     332             : 
     333          40 :   PetscFunctionBegin;
     334          40 :   if (query) {
     335          20 :     if (m == k+1) {
     336           2 :       *mult = 1;
     337           2 :       *p = k;
     338           2 :       *q = k+1;
     339           2 :       PetscFunctionReturn(PETSC_SUCCESS);
     340             :     }
     341          18 :     if (m==k-1) {
     342          18 :       *mult = 1;
     343          18 :       *p = k;
     344          18 :       *q = k-1;
     345             :     }
     346             :   } else {
     347          20 :     if (m == k+1) {
     348           2 :       *mult = PetscPowInt(-1,m);
     349           2 :       *mult *= m;
     350           2 :       if (k==4) {
     351          10 :         for (i=0;i<4;i++) { p[i] = p1p4[i]; q[i] = p1q4[i]; }
     352           2 :         q[4] = p1q4[4];
     353           0 :       } else if (k==3) {
     354           0 :         for (i=0;i<3;i++) { p[i] = p1p3[i]; q[i] = p1q3[i]; }
     355           0 :         q[3] = p1q3[3];
     356           0 :       } else if (k==2) {
     357           0 :         for (i=0;i<2;i++) { p[i] = p1p2[i]; q[i] = p1q2[i]; }
     358           0 :         q[2] = p1q2[2];
     359           0 :       } else if (k==1) {
     360           0 :         p[0] = -3;
     361           0 :         for (i=0;i<2;i++) q[i] = p1q1[i];
     362             :       }
     363           2 :       PetscFunctionReturn(PETSC_SUCCESS);
     364             :     }
     365          18 :     if (m==k-1) {
     366          18 :       *mult = PetscPowInt(-1,m);
     367          18 :       *mult /= k;
     368          18 :       if (k==5) {
     369          70 :         for (i=0;i<4;i++) { p[i] = m1p5[i]; q[i] = m1q5[i]; }
     370          14 :         p[4] = m1p5[4];
     371           4 :       } else if (k==4) {
     372          16 :         for (i=0;i<3;i++) { p[i] = m1p4[i]; q[i] = m1q4[i]; }
     373           4 :         p[3] = m1p4[3];
     374           0 :       } else if (k==3) {
     375           0 :         for (i=0;i<2;i++) { p[i] = m1p3[i]; q[i] = m1q3[i]; }
     376           0 :         p[2] = m1p3[2];
     377           0 :       } else if (k==2) {
     378           0 :         for (i=0;i<2;i++) p[i] = m1p2[i];
     379           0 :         q[0] = 3;
     380             :       }
     381             :     }
     382             :   }
     383          36 :   PetscFunctionReturn(PETSC_SUCCESS);
     384             : }
     385             : #endif /* PETSC_HAVE_COMPLEX */
     386             : 
     387             : #if defined(PETSC_USE_COMPLEX)
     388          20 : static PetscErrorCode getisreal(PetscInt n,PetscComplex *a,PetscBool *result)
     389             : {
     390          20 :   PetscInt i;
     391             : 
     392          20 :   PetscFunctionBegin;
     393          20 :   *result=PETSC_TRUE;
     394       59220 :   for (i=0;i<n&&*result;i++) {
     395       59200 :     if (PetscImaginaryPartComplex(a[i])) *result=PETSC_FALSE;
     396             :   }
     397          20 :   PetscFunctionReturn(PETSC_SUCCESS);
     398             : }
     399             : #endif
     400             : 
     401             : /*
     402             :  * Matrix exponential implementation based on algorithm and matlab code by Stefan Guettel
     403             :  * and Yuji Nakatsukasa
     404             :  *
     405             :  *     Stefan Guettel and Yuji Nakatsukasa, "Scaled and Squared Subdiagonal Pade
     406             :  *     Approximation for the Matrix Exponential",
     407             :  *     SIAM J. Matrix Anal. Appl. 37(1):145-170, 2016.
     408             :  *     https://doi.org/10.1137/15M1027553
     409             :  */
     410          48 : static PetscErrorCode FNEvaluateFunctionMat_Exp_GuettelNakatsukasa(FN fn,Mat A,Mat B)
     411             : {
     412             : #if !defined(PETSC_HAVE_COMPLEX)
     413             :   PetscFunctionBegin;
     414             :   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This function requires C99 or C++ complex support");
     415             : #else
     416          48 :   PetscInt          i,j,n_,s,k,m,mod;
     417          48 :   PetscBLASInt      n=0,n2=0,irsize=0,rsizediv2,ipsize=0,iremainsize=0,info,*piv,minlen,lwork=0,one=1;
     418          48 :   PetscReal         nrm,shift=0.0;
     419             : #if defined(PETSC_USE_COMPLEX)
     420          48 :   PetscReal         *rwork=NULL;
     421             : #endif
     422          48 :   PetscComplex      *As,*RR,*RR2,*expmA,*expmA2,*Maux,*Maux2,rsize,*r,psize,*p,remainsize,*remainterm,*rootp,*rootq,mult=0.0,scale,cone=1.0,czero=0.0,*aux;
     423          48 :   PetscScalar       *Ba,*Ba2,*sMaux,*wr,*wi,expshift,sone=1.0,szero=0.0,*saux;
     424          48 :   const PetscScalar *Aa;
     425          48 :   PetscBool         isreal,flg;
     426          48 :   PetscBLASInt      query=-1;
     427          48 :   PetscScalar       work1,*work;
     428             : 
     429          48 :   PetscFunctionBegin;
     430          48 :   PetscCall(MatGetSize(A,&n_,NULL));
     431          48 :   PetscCall(PetscBLASIntCast(n_,&n));
     432          48 :   PetscCall(MatDenseGetArrayRead(A,&Aa));
     433          48 :   PetscCall(MatDenseGetArray(B,&Ba));
     434          48 :   Ba2 = Ba;
     435          48 :   PetscCall(PetscBLASIntCast(n*n,&n2));
     436             : 
     437          48 :   PetscCall(PetscMalloc2(n2,&sMaux,n2,&Maux));
     438          48 :   Maux2 = Maux;
     439          48 :   PetscCall(PetscOptionsGetReal(NULL,NULL,"-fn_expm_estimated_eig",&shift,&flg));
     440          48 :   if (!flg) {
     441          48 :     PetscCall(PetscMalloc2(n,&wr,n,&wi));
     442          48 :     PetscCall(PetscArraycpy(sMaux,Aa,n2));
     443             :     /* estimate rightmost eigenvalue and shift A with it */
     444             : #if !defined(PETSC_USE_COMPLEX)
     445             :     PetscCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,sMaux,&n,wr,wi,NULL,&n,NULL,&n,&work1,&query,&info));
     446             :     SlepcCheckLapackInfo("geev",info);
     447             :     PetscCall(PetscBLASIntCast((PetscInt)work1,&lwork));
     448             :     PetscCall(PetscMalloc1(lwork,&work));
     449             :     PetscCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,sMaux,&n,wr,wi,NULL,&n,NULL,&n,work,&lwork,&info));
     450             :     PetscCall(PetscFree(work));
     451             : #else
     452          48 :     PetscCall(PetscArraycpy(Maux,Aa,n2));
     453          48 :     PetscCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,Maux,&n,wr,NULL,&n,NULL,&n,&work1,&query,rwork,&info));
     454          48 :     SlepcCheckLapackInfo("geev",info);
     455          48 :     PetscCall(PetscBLASIntCast((PetscInt)PetscRealPart(work1),&lwork));
     456          48 :     PetscCall(PetscMalloc2(2*n,&rwork,lwork,&work));
     457          48 :     PetscCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,Maux,&n,wr,NULL,&n,NULL,&n,work,&lwork,rwork,&info));
     458          48 :     PetscCall(PetscFree2(rwork,work));
     459             : #endif
     460          48 :     SlepcCheckLapackInfo("geev",info);
     461          48 :     PetscCall(PetscLogFlops(25.0*n*n*n+(n*n*n)/3.0+1.0*n*n*n));
     462             : 
     463          48 :     shift = PetscRealPart(wr[0]);
     464        1360 :     for (i=1;i<n;i++) {
     465        1312 :       if (PetscRealPart(wr[i]) > shift) shift = PetscRealPart(wr[i]);
     466             :     }
     467          48 :     PetscCall(PetscFree2(wr,wi));
     468             :   }
     469             :   /* shift so that largest real part is (about) 0 */
     470          48 :   PetscCall(PetscArraycpy(sMaux,Aa,n2));
     471          48 :   if (shift) {
     472        1408 :     for (i=0;i<n;i++) sMaux[i+i*n] -= shift;
     473          48 :     PetscCall(PetscLogFlops(1.0*n));
     474             :   }
     475             : #if defined(PETSC_USE_COMPLEX)
     476          48 :   PetscCall(PetscArraycpy(Maux,Aa,n2));
     477          48 :   if (shift) {
     478        1408 :     for (i=0;i<n;i++) Maux[i+i*n] -= shift;
     479          48 :     PetscCall(PetscLogFlops(1.0*n));
     480             :   }
     481             : #endif
     482             : 
     483             :   /* estimate norm(A) and select the scaling factor */
     484          48 :   nrm = LAPACKlange_("O",&n,&n,sMaux,&n,NULL);
     485          48 :   PetscCall(PetscLogFlops(1.0*n*n));
     486          48 :   PetscCall(sexpm_params(nrm,&s,&k,&m));
     487          48 :   if (s==0 && k==1 && m==0) { /* exp(A) = I+A to eps! */
     488           8 :     if (shift) expshift = PetscExpReal(shift);
     489          88 :     for (i=0;i<n;i++) sMaux[i+i*n] += 1.0;
     490           8 :     if (shift) {
     491           8 :       PetscCallBLAS("BLASscal",BLASscal_(&n2,&expshift,sMaux,&one));
     492           8 :       PetscCall(PetscLogFlops(1.0*(n+n2)));
     493           0 :     } else PetscCall(PetscLogFlops(1.0*n));
     494           8 :     PetscCall(PetscArraycpy(Ba,sMaux,n2));
     495           8 :     PetscCall(PetscFree2(sMaux,Maux));
     496           8 :     PetscCall(MatDenseRestoreArrayRead(A,&Aa));
     497           8 :     PetscCall(MatDenseRestoreArray(B,&Ba));
     498           8 :     PetscFunctionReturn(PETSC_SUCCESS); /* quick return */
     499             :   }
     500             : 
     501          40 :   PetscCall(PetscMalloc4(n2,&expmA,n2,&As,n2,&RR,n,&piv));
     502          40 :   expmA2 = expmA; RR2 = RR;
     503             :   /* scale matrix */
     504             : #if !defined(PETSC_USE_COMPLEX)
     505             :   for (i=0;i<n2;i++) {
     506             :     As[i] = sMaux[i];
     507             :   }
     508             : #else
     509          40 :   PetscCall(PetscArraycpy(As,sMaux,n2));
     510             : #endif
     511          40 :   scale = 1.0/PetscPowRealInt(2.0,s);
     512          40 :   PetscCallBLAS("BLASCOMPLEXscal",BLASCOMPLEXscal_(&n2,&scale,As,&one));
     513          40 :   PetscCall(SlepcLogFlopsComplex(1.0*n2));
     514             : 
     515             :   /* evaluate Pade approximant (partial fraction or product form) */
     516          40 :   if (fn->method==3 || !m) { /* partial fraction */
     517          20 :     PetscCall(getcoeffs(k,m,&rsize,&psize,&remainsize,PETSC_TRUE));
     518          20 :     PetscCall(PetscBLASIntCast((PetscInt)PetscRealPartComplex(rsize),&irsize));
     519          20 :     PetscCall(PetscBLASIntCast((PetscInt)PetscRealPartComplex(psize),&ipsize));
     520          20 :     PetscCall(PetscBLASIntCast((PetscInt)PetscRealPartComplex(remainsize),&iremainsize));
     521          20 :     PetscCall(PetscMalloc3(irsize,&r,ipsize,&p,iremainsize,&remainterm));
     522          20 :     PetscCall(getcoeffs(k,m,r,p,remainterm,PETSC_FALSE));
     523             : 
     524          20 :     PetscCall(PetscArrayzero(expmA,n2));
     525             : #if !defined(PETSC_USE_COMPLEX)
     526             :     isreal = PETSC_TRUE;
     527             : #else
     528          20 :     PetscCall(getisreal(n2,Maux,&isreal));
     529             : #endif
     530          20 :     if (isreal) {
     531          20 :       rsizediv2 = irsize/2;
     532          56 :       for (i=0;i<rsizediv2;i++) { /* use partial fraction to get R(As) */
     533          36 :         PetscCall(PetscArraycpy(Maux,As,n2));
     534          36 :         PetscCall(PetscArrayzero(RR,n2));
     535        1276 :         for (j=0;j<n;j++) {
     536        1240 :           Maux[j+j*n] -= p[2*i];
     537        1240 :           RR[j+j*n] = r[2*i];
     538             :         }
     539          36 :         PetscCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,Maux,&n,piv,RR,&n,&info));
     540          36 :         SlepcCheckLapackInfo("gesv",info);
     541      118036 :         for (j=0;j<n2;j++) {
     542      118000 :           expmA[j] += RR[j] + PetscConj(RR[j]);
     543             :         }
     544             :         /* loop(n) + gesv + loop(n2) */
     545          36 :         PetscCall(SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+2.0*n2));
     546             :       }
     547             : 
     548          20 :       mod = ipsize % 2;
     549          20 :       if (mod) {
     550           6 :         PetscCall(PetscArraycpy(Maux,As,n2));
     551           6 :         PetscCall(PetscArrayzero(RR,n2));
     552          66 :         for (j=0;j<n;j++) {
     553          60 :           Maux[j+j*n] -= p[ipsize-1];
     554          60 :           RR[j+j*n] = r[irsize-1];
     555             :         }
     556           6 :         PetscCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,Maux,&n,piv,RR,&n,&info));
     557           6 :         SlepcCheckLapackInfo("gesv",info);
     558         606 :         for (j=0;j<n2;j++) {
     559         600 :           expmA[j] += RR[j];
     560             :         }
     561           6 :         PetscCall(SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+1.0*n2));
     562             :       }
     563             :     } else { /* complex */
     564           0 :       for (i=0;i<irsize;i++) { /* use partial fraction to get R(As) */
     565           0 :         PetscCall(PetscArraycpy(Maux,As,n2));
     566           0 :         PetscCall(PetscArrayzero(RR,n2));
     567           0 :         for (j=0;j<n;j++) {
     568           0 :           Maux[j+j*n] -= p[i];
     569           0 :           RR[j+j*n] = r[i];
     570             :         }
     571           0 :         PetscCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,Maux,&n,piv,RR,&n,&info));
     572           0 :         SlepcCheckLapackInfo("gesv",info);
     573           0 :         for (j=0;j<n2;j++) {
     574           0 :           expmA[j] += RR[j];
     575             :         }
     576           0 :         PetscCall(SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+1.0*n2));
     577             :       }
     578             :     }
     579          56 :     for (i=0;i<iremainsize;i++) {
     580          36 :       if (!i) {
     581          18 :         PetscCall(PetscArrayzero(RR,n2));
     582         638 :         for (j=0;j<n;j++) {
     583         620 :           RR[j+j*n] = remainterm[iremainsize-1];
     584             :         }
     585             :       } else {
     586          18 :         PetscCall(PetscArraycpy(RR,As,n2));
     587          18 :         for (j=1;j<i;j++) {
     588           0 :           PetscCallBLAS("BLASCOMPLEXgemm",BLASCOMPLEXgemm_("N","N",&n,&n,&n,&cone,RR,&n,RR,&n,&czero,Maux,&n));
     589           0 :           SlepcSwap(RR,Maux,aux);
     590           0 :           PetscCall(SlepcLogFlopsComplex(2.0*n*n*n));
     591             :         }
     592          18 :         PetscCallBLAS("BLASCOMPLEXscal",BLASCOMPLEXscal_(&n2,&remainterm[iremainsize-1-i],RR,&one));
     593          18 :         PetscCall(SlepcLogFlopsComplex(1.0*n2));
     594             :       }
     595      118036 :       for (j=0;j<n2;j++) {
     596      118000 :         expmA[j] += RR[j];
     597             :       }
     598          36 :       PetscCall(SlepcLogFlopsComplex(1.0*n2));
     599             :     }
     600          20 :     PetscCall(PetscFree3(r,p,remainterm));
     601             :   } else { /* product form, default */
     602          20 :     PetscCall(getcoeffsproduct(k,m,&rsize,&psize,&mult,PETSC_TRUE));
     603          20 :     PetscCall(PetscBLASIntCast((PetscInt)PetscRealPartComplex(rsize),&irsize));
     604          20 :     PetscCall(PetscBLASIntCast((PetscInt)PetscRealPartComplex(psize),&ipsize));
     605          20 :     PetscCall(PetscMalloc2(irsize,&rootp,ipsize,&rootq));
     606          20 :     PetscCall(getcoeffsproduct(k,m,rootp,rootq,&mult,PETSC_FALSE));
     607             : 
     608          20 :     PetscCall(PetscArrayzero(expmA,n2));
     609         660 :     for (i=0;i<n;i++) { /* initialize */
     610         640 :       expmA[i+i*n] = 1.0;
     611             :     }
     612          20 :     minlen = PetscMin(irsize,ipsize);
     613          96 :     for (i=0;i<minlen;i++) {
     614          76 :       PetscCall(PetscArraycpy(RR,As,n2));
     615        2596 :       for (j=0;j<n;j++) {
     616        2520 :         RR[j+j*n] -= rootp[i];
     617             :       }
     618          76 :       PetscCallBLAS("BLASCOMPLEXgemm",BLASCOMPLEXgemm_("N","N",&n,&n,&n,&cone,RR,&n,expmA,&n,&czero,Maux,&n));
     619          76 :       SlepcSwap(expmA,Maux,aux);
     620          76 :       PetscCall(PetscArraycpy(RR,As,n2));
     621        2596 :       for (j=0;j<n;j++) {
     622        2520 :         RR[j+j*n] -= rootq[i];
     623             :       }
     624          76 :       PetscCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,RR,&n,piv,expmA,&n,&info));
     625          76 :       SlepcCheckLapackInfo("gesv",info);
     626             :       /* loop(n) + gemm + loop(n) + gesv */
     627          76 :       PetscCall(SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n)+1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)));
     628             :     }
     629             :     /* extra numerator */
     630          38 :     for (i=minlen;i<irsize;i++) {
     631          18 :       PetscCall(PetscArraycpy(RR,As,n2));
     632         638 :       for (j=0;j<n;j++) {
     633         620 :         RR[j+j*n] -= rootp[i];
     634             :       }
     635          18 :       PetscCallBLAS("BLASCOMPLEXgemm",BLASCOMPLEXgemm_("N","N",&n,&n,&n,&cone,RR,&n,expmA,&n,&czero,Maux,&n));
     636          18 :       SlepcSwap(expmA,Maux,aux);
     637          18 :       PetscCall(SlepcLogFlopsComplex(1.0*n+2.0*n*n*n));
     638             :     }
     639             :     /* extra denominator */
     640          22 :     for (i=minlen;i<ipsize;i++) {
     641           2 :       PetscCall(PetscArraycpy(RR,As,n2));
     642          22 :       for (j=0;j<n;j++) RR[j+j*n] -= rootq[i];
     643           2 :       PetscCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,RR,&n,piv,expmA,&n,&info));
     644           2 :       SlepcCheckLapackInfo("gesv",info);
     645           2 :       PetscCall(SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)));
     646             :     }
     647          20 :     PetscCallBLAS("BLASCOMPLEXscal",BLASCOMPLEXscal_(&n2,&mult,expmA,&one));
     648          20 :     PetscCall(SlepcLogFlopsComplex(1.0*n2));
     649          20 :     PetscCall(PetscFree2(rootp,rootq));
     650             :   }
     651             : 
     652             : #if !defined(PETSC_USE_COMPLEX)
     653             :   for (i=0;i<n2;i++) {
     654             :     Ba2[i] = PetscRealPartComplex(expmA[i]);
     655             :   }
     656             : #else
     657          40 :   PetscCall(PetscArraycpy(Ba2,expmA,n2));
     658             : #endif
     659             : 
     660             :   /* perform repeated squaring */
     661         196 :   for (i=0;i<s;i++) { /* final squaring */
     662         156 :     PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&sone,Ba2,&n,Ba2,&n,&szero,sMaux,&n));
     663         156 :     SlepcSwap(Ba2,sMaux,saux);
     664         156 :     PetscCall(PetscLogFlops(2.0*n*n*n));
     665             :   }
     666          40 :   if (Ba2!=Ba) {
     667           4 :     PetscCall(PetscArraycpy(Ba,Ba2,n2));
     668           4 :     sMaux = Ba2;
     669             :   }
     670          40 :   if (shift) {
     671          40 :     expshift = PetscExpReal(shift);
     672          40 :     PetscCallBLAS("BLASscal",BLASscal_(&n2,&expshift,Ba,&one));
     673          40 :     PetscCall(PetscLogFlops(1.0*n2));
     674             :   }
     675             : 
     676             :   /* restore pointers */
     677          40 :   Maux = Maux2; expmA = expmA2; RR = RR2;
     678          40 :   PetscCall(PetscFree2(sMaux,Maux));
     679          40 :   PetscCall(PetscFree4(expmA,As,RR,piv));
     680          40 :   PetscCall(MatDenseRestoreArrayRead(A,&Aa));
     681          40 :   PetscCall(MatDenseRestoreArray(B,&Ba));
     682          40 :   PetscFunctionReturn(PETSC_SUCCESS);
     683             : #endif
     684             : }
     685             : 
     686             : #define SMALLN 100
     687             : 
     688             : /*
     689             :  * Function needed to compute optimal parameters (required workspace is 3*n*n)
     690             :  */
     691         469 : static PetscInt ell(PetscBLASInt n,PetscScalar *A,PetscReal coeff,PetscInt m,PetscScalar *work,PetscRandom rand)
     692             : {
     693         469 :   PetscScalar    *Ascaled=work;
     694         469 :   PetscReal      nrm,alpha,beta,rwork[1];
     695         469 :   PetscInt       t;
     696         469 :   PetscBLASInt   i,j;
     697             : 
     698         469 :   PetscFunctionBegin;
     699         469 :   beta = PetscPowReal(coeff,1.0/(2*m+1));
     700       11564 :   for (i=0;i<n;i++)
     701      661700 :     for (j=0;j<n;j++)
     702      650605 :       Ascaled[i+j*n] = beta*PetscAbsScalar(A[i+j*n]);
     703         469 :   nrm = LAPACKlange_("O",&n,&n,A,&n,rwork);
     704         469 :   PetscCall(PetscLogFlops(2.0*n*n));
     705         469 :   PetscCall(SlepcNormAm(n,Ascaled,2*m+1,work+n*n,rand,&alpha));
     706         469 :   alpha /= nrm;
     707         469 :   t = PetscMax((PetscInt)PetscCeilReal(PetscLogReal(2.0*alpha/PETSC_MACHINE_EPSILON)/PetscLogReal(2.0)/(2*m)),0);
     708         469 :   PetscFunctionReturn(t);
     709             : }
     710             : 
     711             : /*
     712             :  * Compute scaling parameter (s) and order of Pade approximant (m)  (required workspace is 4*n*n)
     713             :  */
     714         469 : static PetscErrorCode expm_params(PetscInt n,PetscScalar **Apowers,PetscInt *s,PetscInt *m,PetscScalar *work)
     715             : {
     716         469 :   PetscScalar     sfactor,sone=1.0,szero=0.0,*A=Apowers[0],*Ascaled;
     717         469 :   PetscReal       d4,d6,d8,d10,eta1,eta3,eta4,eta5,rwork[1];
     718         469 :   PetscBLASInt    n_=0,n2,one=1;
     719         469 :   PetscRandom     rand;
     720         469 :   const PetscReal coeff[5] = { 9.92063492063492e-06, 9.94131285136576e-11,  /* backward error function */
     721             :                                2.22819456055356e-16, 1.69079293431187e-22, 8.82996160201868e-36 };
     722         469 :   const PetscReal theta[5] = { 1.495585217958292e-002,    /* m = 3  */
     723             :                                2.539398330063230e-001,    /* m = 5  */
     724             :                                9.504178996162932e-001,    /* m = 7  */
     725             :                                2.097847961257068e+000,    /* m = 9  */
     726             :                                5.371920351148152e+000 };  /* m = 13 */
     727             : 
     728         469 :   PetscFunctionBegin;
     729         469 :   *s = 0;
     730         469 :   *m = 13;
     731         469 :   PetscCall(PetscBLASIntCast(n,&n_));
     732         469 :   PetscCall(PetscRandomCreate(PETSC_COMM_SELF,&rand));
     733         469 :   d4 = PetscPowReal(LAPACKlange_("O",&n_,&n_,Apowers[2],&n_,rwork),1.0/4.0);
     734         469 :   if (d4==0.0) { /* safeguard for the case A = 0 */
     735           0 :     *m = 3;
     736           0 :     goto done;
     737             :   }
     738         469 :   d6 = PetscPowReal(LAPACKlange_("O",&n_,&n_,Apowers[3],&n_,rwork),1.0/6.0);
     739         469 :   PetscCall(PetscLogFlops(2.0*n*n));
     740         469 :   eta1 = PetscMax(d4,d6);
     741         469 :   if (eta1<=theta[0] && !ell(n_,A,coeff[0],3,work,rand)) {
     742         140 :     *m = 3;
     743         140 :     goto done;
     744             :   }
     745         329 :   if (eta1<=theta[1] && !ell(n_,A,coeff[1],5,work,rand)) {
     746          90 :     *m = 5;
     747          90 :     goto done;
     748             :   }
     749         239 :   if (n<SMALLN) {
     750         225 :     PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[2],&n_,Apowers[2],&n_,&szero,work,&n_));
     751         225 :     d8 = PetscPowReal(LAPACKlange_("O",&n_,&n_,work,&n_,rwork),1.0/8.0);
     752         225 :     PetscCall(PetscLogFlops(2.0*n*n*n+1.0*n*n));
     753             :   } else {
     754          14 :     PetscCall(SlepcNormAm(n_,Apowers[2],2,work,rand,&d8));
     755          14 :     d8 = PetscPowReal(d8,1.0/8.0);
     756             :   }
     757         239 :   eta3 = PetscMax(d6,d8);
     758         239 :   if (eta3<=theta[2] && !ell(n_,A,coeff[2],7,work,rand)) {
     759           8 :     *m = 7;
     760           8 :     goto done;
     761             :   }
     762         231 :   if (eta3<=theta[3] && !ell(n_,A,coeff[3],9,work,rand)) {
     763           9 :     *m = 9;
     764           9 :     goto done;
     765             :   }
     766         222 :   if (n<SMALLN) {
     767         208 :     PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[2],&n_,Apowers[3],&n_,&szero,work,&n_));
     768         208 :     d10 = PetscPowReal(LAPACKlange_("O",&n_,&n_,work,&n_,rwork),1.0/10.0);
     769         208 :     PetscCall(PetscLogFlops(2.0*n*n*n+1.0*n*n));
     770             :   } else {
     771          14 :     PetscCall(SlepcNormAm(n_,Apowers[1],5,work,rand,&d10));
     772          14 :     d10 = PetscPowReal(d10,1.0/10.0);
     773             :   }
     774         222 :   eta4 = PetscMax(d8,d10);
     775         222 :   eta5 = PetscMin(eta3,eta4);
     776         222 :   *s = PetscMax((PetscInt)PetscCeilReal(PetscLogReal(eta5/theta[4])/PetscLogReal(2.0)),0);
     777         222 :   if (*s) {
     778          75 :     Ascaled = work+3*n*n;
     779          75 :     n2 = n_*n_;
     780          75 :     PetscCallBLAS("BLAScopy",BLAScopy_(&n2,A,&one,Ascaled,&one));
     781          75 :     sfactor = PetscPowRealInt(2.0,-(*s));
     782          75 :     PetscCallBLAS("BLASscal",BLASscal_(&n2,&sfactor,Ascaled,&one));
     783          75 :     PetscCall(PetscLogFlops(1.0*n*n));
     784             :   } else Ascaled = A;
     785         222 :   *s += ell(n_,Ascaled,coeff[4],13,work,rand);
     786         469 : done:
     787         469 :   PetscCall(PetscRandomDestroy(&rand));
     788         469 :   PetscFunctionReturn(PETSC_SUCCESS);
     789             : }
     790             : 
     791             : /*
     792             :  * Matrix exponential implementation based on algorithm and matlab code by N. Higham and co-authors
     793             :  *
     794             :  *     N. J. Higham, "The scaling and squaring method for the matrix exponential
     795             :  *     revisited", SIAM J. Matrix Anal. Appl. 26(4):1179-1193, 2005.
     796             :  */
     797         469 : PetscErrorCode FNEvaluateFunctionMat_Exp_Higham(FN fn,Mat A,Mat B)
     798             : {
     799         469 :   PetscBLASInt      n_=0,n2,*ipiv,info,one=1;
     800         469 :   PetscInt          n,m,j,s;
     801         469 :   PetscScalar       scale,smone=-1.0,sone=1.0,stwo=2.0,szero=0.0;
     802         469 :   PetscScalar       *Ba,*Apowers[5],*Q,*P,*W,*work,*aux;
     803         469 :   const PetscScalar *Aa,*c;
     804         469 :   const PetscScalar c3[4]   = { 120, 60, 12, 1 };
     805         469 :   const PetscScalar c5[6]   = { 30240, 15120, 3360, 420, 30, 1 };
     806         469 :   const PetscScalar c7[8]   = { 17297280, 8648640, 1995840, 277200, 25200, 1512, 56, 1 };
     807         469 :   const PetscScalar c9[10]  = { 17643225600.0, 8821612800.0, 2075673600, 302702400, 30270240,
     808             :                                 2162160, 110880, 3960, 90, 1 };
     809         469 :   const PetscScalar c13[14] = { 64764752532480000.0, 32382376266240000.0, 7771770303897600.0,
     810             :                                 1187353796428800.0,  129060195264000.0,   10559470521600.0,
     811             :                                 670442572800.0,      33522128640.0,       1323241920.0,
     812             :                                 40840800,          960960,            16380,  182,  1 };
     813             : 
     814         469 :   PetscFunctionBegin;
     815         469 :   PetscCall(MatDenseGetArrayRead(A,&Aa));
     816         469 :   PetscCall(MatDenseGetArray(B,&Ba));
     817         469 :   PetscCall(MatGetSize(A,&n,NULL));
     818         469 :   PetscCall(PetscBLASIntCast(n,&n_));
     819         469 :   n2 = n_*n_;
     820         469 :   PetscCall(PetscMalloc2(8*n*n,&work,n,&ipiv));
     821             : 
     822             :   /* Matrix powers */
     823         469 :   Apowers[0] = work;                  /* Apowers[0] = A   */
     824         469 :   Apowers[1] = Apowers[0] + n*n;      /* Apowers[1] = A^2 */
     825         469 :   Apowers[2] = Apowers[1] + n*n;      /* Apowers[2] = A^4 */
     826         469 :   Apowers[3] = Apowers[2] + n*n;      /* Apowers[3] = A^6 */
     827         469 :   Apowers[4] = Apowers[3] + n*n;      /* Apowers[4] = A^8 */
     828             : 
     829         469 :   PetscCall(PetscArraycpy(Apowers[0],Aa,n2));
     830         469 :   PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[0],&n_,Apowers[0],&n_,&szero,Apowers[1],&n_));
     831         469 :   PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[1],&n_,Apowers[1],&n_,&szero,Apowers[2],&n_));
     832         469 :   PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[1],&n_,Apowers[2],&n_,&szero,Apowers[3],&n_));
     833         469 :   PetscCall(PetscLogFlops(6.0*n*n*n));
     834             : 
     835             :   /* Compute scaling parameter and order of Pade approximant */
     836         469 :   PetscCall(expm_params(n,Apowers,&s,&m,Apowers[4]));
     837             : 
     838         469 :   if (s) { /* rescale */
     839         395 :     for (j=0;j<4;j++) {
     840         316 :       scale = PetscPowRealInt(2.0,-PetscMax(2*j,1)*s);
     841         316 :       PetscCallBLAS("BLASscal",BLASscal_(&n2,&scale,Apowers[j],&one));
     842             :     }
     843          79 :     PetscCall(PetscLogFlops(4.0*n*n));
     844             :   }
     845             : 
     846             :   /* Evaluate the Pade approximant */
     847         469 :   switch (m) {
     848             :     case 3:  c = c3;  break;
     849          90 :     case 5:  c = c5;  break;
     850           8 :     case 7:  c = c7;  break;
     851           9 :     case 9:  c = c9;  break;
     852         222 :     case 13: c = c13; break;
     853           0 :     default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %" PetscInt_FMT,m);
     854             :   }
     855         469 :   P = Ba;
     856         469 :   Q = Apowers[4] + n*n;
     857         469 :   W = Q + n*n;
     858         469 :   switch (m) {
     859         247 :     case 3:
     860             :     case 5:
     861             :     case 7:
     862             :     case 9:
     863         247 :       if (m==9) PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[1],&n_,Apowers[3],&n_,&szero,Apowers[4],&n_));
     864         247 :       PetscCall(PetscArrayzero(P,n2));
     865         247 :       PetscCall(PetscArrayzero(Q,n2));
     866        2366 :       for (j=0;j<n;j++) {
     867        2119 :         P[j+j*n] = c[1];
     868        2119 :         Q[j+j*n] = c[0];
     869             :       }
     870         627 :       for (j=m;j>=3;j-=2) {
     871         380 :         PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[j],Apowers[(j+1)/2-1],&one,P,&one));
     872         380 :         PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[j-1],Apowers[(j+1)/2-1],&one,Q,&one));
     873         380 :         PetscCall(PetscLogFlops(4.0*n*n));
     874             :       }
     875         247 :       PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[0],&n_,P,&n_,&szero,W,&n_));
     876         247 :       PetscCall(PetscLogFlops(2.0*n*n*n));
     877         469 :       SlepcSwap(P,W,aux);
     878             :       break;
     879         222 :     case 13:
     880             :       /*  P = A*(Apowers[3]*(c[13]*Apowers[3] + c[11]*Apowers[2] + c[9]*Apowers[1])
     881             :               + c[7]*Apowers[3] + c[5]*Apowers[2] + c[3]*Apowers[1] + c[1]*I)       */
     882         222 :       PetscCallBLAS("BLAScopy",BLAScopy_(&n2,Apowers[3],&one,P,&one));
     883         222 :       PetscCallBLAS("BLASscal",BLASscal_(&n2,&c[13],P,&one));
     884         222 :       PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[11],Apowers[2],&one,P,&one));
     885         222 :       PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[9],Apowers[1],&one,P,&one));
     886         222 :       PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[3],&n_,P,&n_,&szero,W,&n_));
     887         222 :       PetscCall(PetscLogFlops(5.0*n*n+2.0*n*n*n));
     888         222 :       PetscCall(PetscArrayzero(P,n2));
     889        9198 :       for (j=0;j<n;j++) P[j+j*n] = c[1];
     890         222 :       PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[7],Apowers[3],&one,P,&one));
     891         222 :       PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[5],Apowers[2],&one,P,&one));
     892         222 :       PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[3],Apowers[1],&one,P,&one));
     893         222 :       PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&sone,P,&one,W,&one));
     894         222 :       PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[0],&n_,W,&n_,&szero,P,&n_));
     895         222 :       PetscCall(PetscLogFlops(7.0*n*n+2.0*n*n*n));
     896             :       /*  Q = Apowers[3]*(c[12]*Apowers[3] + c[10]*Apowers[2] + c[8]*Apowers[1])
     897             :               + c[6]*Apowers[3] + c[4]*Apowers[2] + c[2]*Apowers[1] + c[0]*I        */
     898         222 :       PetscCallBLAS("BLAScopy",BLAScopy_(&n2,Apowers[3],&one,Q,&one));
     899         222 :       PetscCallBLAS("BLASscal",BLASscal_(&n2,&c[12],Q,&one));
     900         222 :       PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[10],Apowers[2],&one,Q,&one));
     901         222 :       PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[8],Apowers[1],&one,Q,&one));
     902         222 :       PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[3],&n_,Q,&n_,&szero,W,&n_));
     903         222 :       PetscCall(PetscLogFlops(5.0*n*n+2.0*n*n*n));
     904         222 :       PetscCall(PetscArrayzero(Q,n2));
     905        9198 :       for (j=0;j<n;j++) Q[j+j*n] = c[0];
     906         222 :       PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[6],Apowers[3],&one,Q,&one));
     907         222 :       PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[4],Apowers[2],&one,Q,&one));
     908         222 :       PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[2],Apowers[1],&one,Q,&one));
     909         222 :       PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&sone,W,&one,Q,&one));
     910         222 :       PetscCall(PetscLogFlops(7.0*n*n));
     911             :       break;
     912           0 :     default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %" PetscInt_FMT,m);
     913             :   }
     914         469 :   PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&smone,P,&one,Q,&one));
     915         469 :   PetscCallBLAS("LAPACKgesv",LAPACKgesv_(&n_,&n_,Q,&n_,ipiv,P,&n_,&info));
     916         469 :   SlepcCheckLapackInfo("gesv",info);
     917         469 :   PetscCallBLAS("BLASscal",BLASscal_(&n2,&stwo,P,&one));
     918       11564 :   for (j=0;j<n;j++) P[j+j*n] += 1.0;
     919         469 :   PetscCall(PetscLogFlops(2.0*n*n*n/3.0+4.0*n*n));
     920             : 
     921             :   /* Squaring */
     922        1133 :   for (j=1;j<=s;j++) {
     923         664 :     PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,P,&n_,P,&n_,&szero,W,&n_));
     924         664 :     SlepcSwap(P,W,aux);
     925             :   }
     926         469 :   if (P!=Ba) PetscCall(PetscArraycpy(Ba,P,n2));
     927         469 :   PetscCall(PetscLogFlops(2.0*n*n*n*s));
     928             : 
     929         469 :   PetscCall(PetscFree2(work,ipiv));
     930         469 :   PetscCall(MatDenseRestoreArrayRead(A,&Aa));
     931         469 :   PetscCall(MatDenseRestoreArray(B,&Ba));
     932         469 :   PetscFunctionReturn(PETSC_SUCCESS);
     933             : }
     934             : 
     935             : #if defined(PETSC_HAVE_CUDA)
     936             : #include "../src/sys/classes/fn/impls/cuda/fnutilcuda.h"
     937             : #include <slepccupmblas.h>
     938             : 
     939             : PetscErrorCode FNEvaluateFunctionMat_Exp_Pade_CUDA(FN fn,Mat A,Mat B)
     940             : {
     941             :   PetscBLASInt      n=0,ld,ld2,*d_ipiv,*d_info,info,one=1;
     942             :   PetscInt          m,k,sexp;
     943             :   PetscBool         odd;
     944             :   const PetscInt    p=MAX_PADE;
     945             :   PetscReal         c[MAX_PADE+1],s;
     946             :   PetscScalar       scale,smone=-1.0,sone=1.0,stwo=2.0,szero=0.0;
     947             :   const PetscScalar *Aa;
     948             :   PetscScalar       *d_Ba,*d_As,*d_A2,*d_Q,*d_P,*d_W,*aux,**ppP,**d_ppP,**ppQ,**d_ppQ;
     949             :   cublasHandle_t    cublasv2handle;
     950             : 
     951             :   PetscFunctionBegin;
     952             :   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* For CUDA event timers */
     953             :   PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
     954             :   PetscCall(MatGetSize(A,&m,NULL));
     955             :   PetscCall(PetscBLASIntCast(m,&n));
     956             :   ld  = n;
     957             :   ld2 = ld*ld;
     958             :   if (A==B) {
     959             :     PetscCallCUDA(cudaMalloc((void **)&d_As,sizeof(PetscScalar)*m*m));
     960             :     PetscCall(MatDenseCUDAGetArrayRead(A,&Aa));
     961             :     PetscCallCUDA(cudaMemcpy(d_As,Aa,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToDevice));
     962             :     PetscCall(MatDenseCUDARestoreArrayRead(A,&Aa));
     963             :   } else PetscCall(MatDenseCUDAGetArrayRead(A,(const PetscScalar**)&d_As));
     964             :   PetscCall(MatDenseCUDAGetArrayWrite(B,&d_Ba));
     965             : 
     966             :   PetscCallCUDA(cudaMalloc((void **)&d_Q,sizeof(PetscScalar)*m*m));
     967             :   PetscCallCUDA(cudaMalloc((void **)&d_W,sizeof(PetscScalar)*m*m));
     968             :   PetscCallCUDA(cudaMalloc((void **)&d_A2,sizeof(PetscScalar)*m*m));
     969             :   PetscCallCUDA(cudaMalloc((void **)&d_ipiv,sizeof(PetscBLASInt)*ld));
     970             :   PetscCallCUDA(cudaMalloc((void **)&d_info,sizeof(PetscBLASInt)));
     971             :   PetscCallCUDA(cudaMalloc((void **)&d_ppP,sizeof(PetscScalar*)));
     972             :   PetscCallCUDA(cudaMalloc((void **)&d_ppQ,sizeof(PetscScalar*)));
     973             : 
     974             :   PetscCall(PetscMalloc1(1,&ppP));
     975             :   PetscCall(PetscMalloc1(1,&ppQ));
     976             : 
     977             :   d_P = d_Ba;
     978             :   PetscCall(PetscLogGpuTimeBegin());
     979             : 
     980             :   /* Pade' coefficients */
     981             :   c[0] = 1.0;
     982             :   for (k=1;k<=p;k++) c[k] = c[k-1]*(p+1-k)/(k*(2*p+1-k));
     983             : 
     984             :   /* Scaling */
     985             :   PetscCallCUBLAS(cublasXnrm2(cublasv2handle,ld2,d_As,one,&s));
     986             :   if (s>0.5) {
     987             :     sexp = PetscMax(0,(int)(PetscLogReal(s)/PetscLogReal(2.0))+2);
     988             :     scale = PetscPowRealInt(2.0,-sexp);
     989             :     PetscCallCUBLAS(cublasXscal(cublasv2handle,ld2,&scale,d_As,one));
     990             :     PetscCall(PetscLogGpuFlops(1.0*n*n));
     991             :   } else sexp = 0;
     992             : 
     993             :   /* Horner evaluation */
     994             :   PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_As,ld,d_As,ld,&szero,d_A2,ld));
     995             :   PetscCall(PetscLogGpuFlops(2.0*n*n*n));
     996             :   PetscCallCUDA(cudaMemset(d_Q,0,sizeof(PetscScalar)*ld2));
     997             :   PetscCallCUDA(cudaMemset(d_P,0,sizeof(PetscScalar)*ld2));
     998             :   PetscCall(set_diagonal(n,d_Q,ld,c[p]));
     999             :   PetscCall(set_diagonal(n,d_P,ld,c[p-1]));
    1000             : 
    1001             :   odd = PETSC_TRUE;
    1002             :   for (k=p-1;k>0;k--) {
    1003             :     if (odd) {
    1004             :       PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_Q,ld,d_A2,ld,&szero,d_W,ld));
    1005             :       SlepcSwap(d_Q,d_W,aux);
    1006             :       PetscCall(shift_diagonal(n,d_Q,ld,c[k-1]));
    1007             :       odd = PETSC_FALSE;
    1008             :     } else {
    1009             :       PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_A2,ld,&szero,d_W,ld));
    1010             :       SlepcSwap(d_P,d_W,aux);
    1011             :       PetscCall(shift_diagonal(n,d_P,ld,c[k-1]));
    1012             :       odd = PETSC_TRUE;
    1013             :     }
    1014             :     PetscCall(PetscLogGpuFlops(2.0*n*n*n));
    1015             :   }
    1016             :   if (odd) {
    1017             :     PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_Q,ld,d_As,ld,&szero,d_W,ld));
    1018             :     SlepcSwap(d_Q,d_W,aux);
    1019             :     PetscCallCUBLAS(cublasXaxpy(cublasv2handle,ld2,&smone,d_P,one,d_Q,one));
    1020             : 
    1021             :     ppQ[0] = d_Q;
    1022             :     ppP[0] = d_P;
    1023             :     PetscCallCUDA(cudaMemcpy(d_ppQ,ppQ,sizeof(PetscScalar*),cudaMemcpyHostToDevice));
    1024             :     PetscCallCUDA(cudaMemcpy(d_ppP,ppP,sizeof(PetscScalar*),cudaMemcpyHostToDevice));
    1025             : 
    1026             :     PetscCallCUBLAS(cublasXgetrfBatched(cublasv2handle,n,d_ppQ,ld,d_ipiv,d_info,one));
    1027             :     PetscCallCUDA(cudaMemcpy(&info,d_info,sizeof(PetscBLASInt),cudaMemcpyDeviceToHost));
    1028             :     PetscCheck(info>=0,PETSC_COMM_SELF,PETSC_ERR_LIB,"LAPACKgetrf: Illegal value on argument %" PetscBLASInt_FMT,PetscAbsInt(info));
    1029             :     PetscCheck(info<=0,PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"LAPACKgetrf: Matrix is singular. U(%" PetscBLASInt_FMT ",%" PetscBLASInt_FMT ") is zero",info,info);
    1030             :     PetscCallCUBLAS(cublasXgetrsBatched(cublasv2handle,CUBLAS_OP_N,n,n,(const PetscScalar **)d_ppQ,ld,d_ipiv,d_ppP,ld,&info,one));
    1031             :     PetscCheck(info>=0,PETSC_COMM_SELF,PETSC_ERR_LIB,"LAPACKgetri: Illegal value on argument %" PetscBLASInt_FMT,PetscAbsInt(info));
    1032             :     PetscCheck(info<=0,PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"LAPACKgetri: Matrix is singular. U(%" PetscBLASInt_FMT ",%" PetscBLASInt_FMT ") is zero",info,info);
    1033             :     PetscCallCUBLAS(cublasXscal(cublasv2handle,ld2,&stwo,d_P,one));
    1034             :     PetscCall(shift_diagonal(n,d_P,ld,sone));
    1035             :     PetscCallCUBLAS(cublasXscal(cublasv2handle,ld2,&smone,d_P,one));
    1036             :   } else {
    1037             :     PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_As,ld,&szero,d_W,ld));
    1038             :     SlepcSwap(d_P,d_W,aux);
    1039             :     PetscCallCUBLAS(cublasXaxpy(cublasv2handle,ld2,&smone,d_P,one,d_Q,one));
    1040             : 
    1041             :     ppQ[0] = d_Q;
    1042             :     ppP[0] = d_P;
    1043             :     PetscCallCUDA(cudaMemcpy(d_ppQ,ppQ,sizeof(PetscScalar*),cudaMemcpyHostToDevice));
    1044             :     PetscCallCUDA(cudaMemcpy(d_ppP,ppP,sizeof(PetscScalar*),cudaMemcpyHostToDevice));
    1045             : 
    1046             :     PetscCallCUBLAS(cublasXgetrfBatched(cublasv2handle,n,d_ppQ,ld,d_ipiv,d_info,one));
    1047             :     PetscCallCUDA(cudaMemcpy(&info,d_info,sizeof(PetscBLASInt),cudaMemcpyDeviceToHost));
    1048             :     PetscCheck(info>=0,PETSC_COMM_SELF,PETSC_ERR_LIB,"LAPACKgetrf: Illegal value on argument %" PetscBLASInt_FMT,PetscAbsInt(info));
    1049             :     PetscCheck(info<=0,PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"LAPACKgetrf: Matrix is singular. U(%" PetscBLASInt_FMT ",%" PetscBLASInt_FMT ") is zero",info,info);
    1050             :     PetscCallCUBLAS(cublasXgetrsBatched(cublasv2handle,CUBLAS_OP_N,n,n,(const PetscScalar **)d_ppQ,ld,d_ipiv,d_ppP,ld,&info,one));
    1051             :     PetscCheck(info>=0,PETSC_COMM_SELF,PETSC_ERR_LIB,"LAPACKgetri: Illegal value on argument %" PetscBLASInt_FMT,PetscAbsInt(info));
    1052             :     PetscCheck(info<=0,PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"LAPACKgetri: Matrix is singular. U(%" PetscBLASInt_FMT ",%" PetscBLASInt_FMT ") is zero",info,info);
    1053             :     PetscCallCUBLAS(cublasXscal(cublasv2handle,ld2,&stwo,d_P,one));
    1054             :     PetscCall(shift_diagonal(n,d_P,ld,sone));
    1055             :   }
    1056             :   PetscCall(PetscLogGpuFlops(2.0*n*n*n+2.0*n*n*n/3.0+4.0*n*n));
    1057             : 
    1058             :   for (k=1;k<=sexp;k++) {
    1059             :     PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_P,ld,&szero,d_W,ld));
    1060             :     PetscCallCUDA(cudaMemcpy(d_P,d_W,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToDevice));
    1061             :   }
    1062             :   PetscCall(PetscLogGpuFlops(2.0*n*n*n*sexp));
    1063             : 
    1064             :   PetscCall(PetscLogGpuTimeEnd());
    1065             :   PetscCallCUDA(cudaFree(d_Q));
    1066             :   PetscCallCUDA(cudaFree(d_W));
    1067             :   PetscCallCUDA(cudaFree(d_A2));
    1068             :   PetscCallCUDA(cudaFree(d_ipiv));
    1069             :   PetscCallCUDA(cudaFree(d_info));
    1070             :   PetscCallCUDA(cudaFree(d_ppP));
    1071             :   PetscCallCUDA(cudaFree(d_ppQ));
    1072             : 
    1073             :   PetscCall(PetscFree(ppP));
    1074             :   PetscCall(PetscFree(ppQ));
    1075             : 
    1076             :   if (d_P!=d_Ba) PetscCallCUDA(cudaMemcpy(d_Ba,d_P,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToDevice));
    1077             :   if (A!=B) {
    1078             :     if (s>0.5) {  /* undo scaling */
    1079             :       scale = 1.0/scale;
    1080             :       PetscCallCUBLAS(cublasXscal(cublasv2handle,ld2,&scale,d_As,one));
    1081             :     }
    1082             :     PetscCall(MatDenseCUDARestoreArrayRead(A,(const PetscScalar**)&d_As));
    1083             :   } else PetscCallCUDA(cudaFree(d_As));
    1084             :   PetscCall(MatDenseCUDARestoreArrayWrite(B,&d_Ba));
    1085             :   PetscFunctionReturn(PETSC_SUCCESS);
    1086             : }
    1087             : 
    1088             : #if defined(PETSC_HAVE_MAGMA)
    1089             : #include <slepcmagma.h>
    1090             : 
    1091             : PetscErrorCode FNEvaluateFunctionMat_Exp_Pade_CUDAm(FN fn,Mat A,Mat B)
    1092             : {
    1093             :   PetscBLASInt      n=0,ld,ld2,*piv,one=1;
    1094             :   PetscInt          m,k,sexp;
    1095             :   PetscBool         odd;
    1096             :   const PetscInt    p=MAX_PADE;
    1097             :   PetscReal         c[MAX_PADE+1],s;
    1098             :   PetscScalar       scale,smone=-1.0,sone=1.0,stwo=2.0,szero=0.0;
    1099             :   const PetscScalar *Aa;
    1100             :   PetscScalar       *d_Ba,*d_As,*d_A2,*d_Q,*d_P,*d_W,*aux;
    1101             :   cublasHandle_t    cublasv2handle;
    1102             : 
    1103             :   PetscFunctionBegin;
    1104             :   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* For CUDA event timers */
    1105             :   PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    1106             :   PetscCall(SlepcMagmaInit());
    1107             :   PetscCall(MatGetSize(A,&m,NULL));
    1108             :   PetscCall(PetscBLASIntCast(m,&n));
    1109             :   ld  = n;
    1110             :   ld2 = ld*ld;
    1111             :   if (A==B) {
    1112             :     PetscCallCUDA(cudaMalloc((void **)&d_As,sizeof(PetscScalar)*m*m));
    1113             :     PetscCall(MatDenseCUDAGetArrayRead(A,&Aa));
    1114             :     PetscCallCUDA(cudaMemcpy(d_As,Aa,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToDevice));
    1115             :     PetscCall(MatDenseCUDARestoreArrayRead(A,&Aa));
    1116             :   } else PetscCall(MatDenseCUDAGetArrayRead(A,(const PetscScalar**)&d_As));
    1117             :   PetscCall(MatDenseCUDAGetArrayWrite(B,&d_Ba));
    1118             : 
    1119             :   PetscCallCUDA(cudaMalloc((void **)&d_Q,sizeof(PetscScalar)*m*m));
    1120             :   PetscCallCUDA(cudaMalloc((void **)&d_W,sizeof(PetscScalar)*m*m));
    1121             :   PetscCallCUDA(cudaMalloc((void **)&d_A2,sizeof(PetscScalar)*m*m));
    1122             : 
    1123             :   PetscCall(PetscMalloc1(n,&piv));
    1124             : 
    1125             :   d_P = d_Ba;
    1126             :   PetscCall(PetscLogGpuTimeBegin());
    1127             : 
    1128             :   /* Pade' coefficients */
    1129             :   c[0] = 1.0;
    1130             :   for (k=1;k<=p;k++) c[k] = c[k-1]*(p+1-k)/(k*(2*p+1-k));
    1131             : 
    1132             :   /* Scaling */
    1133             :   PetscCallCUBLAS(cublasXnrm2(cublasv2handle,ld2,d_As,one,&s));
    1134             :   PetscCall(PetscLogGpuFlops(1.0*n*n));
    1135             : 
    1136             :   if (s>0.5) {
    1137             :     sexp = PetscMax(0,(int)(PetscLogReal(s)/PetscLogReal(2.0))+2);
    1138             :     scale = PetscPowRealInt(2.0,-sexp);
    1139             :     PetscCallCUBLAS(cublasXscal(cublasv2handle,ld2,&scale,d_As,one));
    1140             :     PetscCall(PetscLogGpuFlops(1.0*n*n));
    1141             :   } else sexp = 0;
    1142             : 
    1143             :   /* Horner evaluation */
    1144             :   PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_As,ld,d_As,ld,&szero,d_A2,ld));
    1145             :   PetscCall(PetscLogGpuFlops(2.0*n*n*n));
    1146             :   PetscCallCUDA(cudaMemset(d_Q,0,sizeof(PetscScalar)*ld2));
    1147             :   PetscCallCUDA(cudaMemset(d_P,0,sizeof(PetscScalar)*ld2));
    1148             :   PetscCall(set_diagonal(n,d_Q,ld,c[p]));
    1149             :   PetscCall(set_diagonal(n,d_P,ld,c[p-1]));
    1150             : 
    1151             :   odd = PETSC_TRUE;
    1152             :   for (k=p-1;k>0;k--) {
    1153             :     if (odd) {
    1154             :       PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_Q,ld,d_A2,ld,&szero,d_W,ld));
    1155             :       SlepcSwap(d_Q,d_W,aux);
    1156             :       PetscCall(shift_diagonal(n,d_Q,ld,c[k-1]));
    1157             :       odd = PETSC_FALSE;
    1158             :     } else {
    1159             :       PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_A2,ld,&szero,d_W,ld));
    1160             :       SlepcSwap(d_P,d_W,aux);
    1161             :       PetscCall(shift_diagonal(n,d_P,ld,c[k-1]));
    1162             :       odd = PETSC_TRUE;
    1163             :     }
    1164             :     PetscCall(PetscLogGpuFlops(2.0*n*n*n));
    1165             :   }
    1166             :   if (odd) {
    1167             :     PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_Q,ld,d_As,ld,&szero,d_W,ld));
    1168             :     SlepcSwap(d_Q,d_W,aux);
    1169             :     PetscCallCUBLAS(cublasXaxpy(cublasv2handle,ld2,&smone,d_P,one,d_Q,one));
    1170             :     PetscCallMAGMA(magma_xgesv_gpu,n,n,d_Q,ld,piv,d_P,ld);
    1171             :     PetscCallCUBLAS(cublasXscal(cublasv2handle,ld2,&stwo,d_P,one));
    1172             :     PetscCall(shift_diagonal(n,d_P,ld,sone));
    1173             :     PetscCallCUBLAS(cublasXscal(cublasv2handle,ld2,&smone,d_P,one));
    1174             :   } else {
    1175             :     PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_As,ld,&szero,d_W,ld));
    1176             :     SlepcSwap(d_P,d_W,aux);
    1177             :     PetscCallCUBLAS(cublasXaxpy(cublasv2handle,ld2,&smone,d_P,one,d_Q,one));
    1178             :     PetscCallMAGMA(magma_xgesv_gpu,n,n,d_Q,ld,piv,d_P,ld);
    1179             :     PetscCallCUBLAS(cublasXscal(cublasv2handle,ld2,&stwo,d_P,one));
    1180             :     PetscCall(shift_diagonal(n,d_P,ld,sone));
    1181             :   }
    1182             :   PetscCall(PetscLogGpuFlops(2.0*n*n*n+2.0*n*n*n/3.0+4.0*n*n));
    1183             : 
    1184             :   for (k=1;k<=sexp;k++) {
    1185             :     PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_P,ld,&szero,d_W,ld));
    1186             :     PetscCallCUDA(cudaMemcpy(d_P,d_W,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToDevice));
    1187             :   }
    1188             :   PetscCall(PetscLogGpuFlops(2.0*n*n*n*sexp));
    1189             : 
    1190             :   PetscCall(PetscLogGpuTimeEnd());
    1191             :   PetscCallCUDA(cudaFree(d_Q));
    1192             :   PetscCallCUDA(cudaFree(d_W));
    1193             :   PetscCallCUDA(cudaFree(d_A2));
    1194             :   PetscCall(PetscFree(piv));
    1195             : 
    1196             :   if (d_P!=d_Ba) PetscCallCUDA(cudaMemcpy(d_Ba,d_P,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToDevice));
    1197             :   if (A!=B) {
    1198             :     if (s>0.5) {  /* undo scaling */
    1199             :       scale = 1.0/scale;
    1200             :       PetscCallCUBLAS(cublasXscal(cublasv2handle,ld2,&scale,d_As,one));
    1201             :     }
    1202             :     PetscCall(MatDenseCUDARestoreArrayRead(A,(const PetscScalar**)&d_As));
    1203             :   } else PetscCallCUDA(cudaFree(d_As));
    1204             :   PetscCall(MatDenseCUDARestoreArrayWrite(B,&d_Ba));
    1205             :   PetscFunctionReturn(PETSC_SUCCESS);
    1206             : }
    1207             : 
    1208             : /*
    1209             :  * Matrix exponential implementation based on algorithm and matlab code by N. Higham and co-authors
    1210             :  *
    1211             :  *     N. J. Higham, "The scaling and squaring method for the matrix exponential
    1212             :  *     revisited", SIAM J. Matrix Anal. Appl. 26(4):1179-1193, 2005.
    1213             :  */
    1214             : PetscErrorCode FNEvaluateFunctionMat_Exp_Higham_CUDAm(FN fn,Mat A,Mat B)
    1215             : {
    1216             :   PetscBLASInt      n_=0,n2,*ipiv,one=1;
    1217             :   PetscInt          n,m,j,s;
    1218             :   PetscScalar       scale,smone=-1.0,sone=1.0,stwo=2.0,szero=0.0;
    1219             :   PetscScalar       *d_Ba,*Apowers[5],*d_Apowers[5],*d_Q,*d_P,*d_W,*work,*d_work,*aux;
    1220             :   const PetscScalar *Aa,*c;
    1221             :   const PetscScalar c3[4]   = { 120, 60, 12, 1 };
    1222             :   const PetscScalar c5[6]   = { 30240, 15120, 3360, 420, 30, 1 };
    1223             :   const PetscScalar c7[8]   = { 17297280, 8648640, 1995840, 277200, 25200, 1512, 56, 1 };
    1224             :   const PetscScalar c9[10]  = { 17643225600, 8821612800, 2075673600, 302702400, 30270240,
    1225             :     2162160, 110880, 3960, 90, 1 };
    1226             :   const PetscScalar c13[14] = { 64764752532480000, 32382376266240000, 7771770303897600,
    1227             :     1187353796428800,  129060195264000,   10559470521600,
    1228             :     670442572800,      33522128640,       1323241920,
    1229             :     40840800,          960960,            16380,  182,  1 };
    1230             :   cublasHandle_t    cublasv2handle;
    1231             : 
    1232             :   PetscFunctionBegin;
    1233             :   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* For CUDA event timers */
    1234             :   PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    1235             :   PetscCall(SlepcMagmaInit());
    1236             :   PetscCall(MatGetSize(A,&n,NULL));
    1237             :   PetscCall(PetscBLASIntCast(n,&n_));
    1238             :   n2 = n_*n_;
    1239             :   PetscCall(PetscMalloc2(8*n*n,&work,n,&ipiv));
    1240             :   /* Matrix powers */
    1241             :   Apowers[0] = work;                  /* Apowers[0] = A   */
    1242             :   Apowers[1] = Apowers[0] + n*n;      /* Apowers[1] = A^2 */
    1243             :   Apowers[2] = Apowers[1] + n*n;      /* Apowers[2] = A^4 */
    1244             :   Apowers[3] = Apowers[2] + n*n;      /* Apowers[3] = A^6 */
    1245             :   Apowers[4] = Apowers[3] + n*n;      /* Apowers[4] = A^8 */
    1246             :   if (A==B) {
    1247             :     PetscCallCUDA(cudaMalloc((void**)&d_work,7*n*n*sizeof(PetscScalar)));
    1248             :     d_Apowers[0] = d_work;              /* d_Apowers[0] = A   */
    1249             :     d_Apowers[1] = d_Apowers[0] + n*n;  /* d_Apowers[1] = A^2 */
    1250             :     PetscCall(MatDenseCUDAGetArrayRead(A,&Aa));
    1251             :     PetscCallCUDA(cudaMemcpy(d_Apowers[0],Aa,n2*sizeof(PetscScalar),cudaMemcpyDeviceToDevice));
    1252             :     PetscCall(MatDenseCUDARestoreArrayRead(A,&Aa));
    1253             :   } else {
    1254             :     PetscCallCUDA(cudaMalloc((void**)&d_work,6*n*n*sizeof(PetscScalar)));
    1255             :     PetscCall(MatDenseCUDAGetArrayRead(A,(const PetscScalar**)&d_Apowers[0]));
    1256             :     d_Apowers[1] = d_work;              /* d_Apowers[1] = A^2 */
    1257             :   }
    1258             :   PetscCall(MatDenseCUDAGetArrayWrite(B,&d_Ba));
    1259             :   d_Apowers[2] = d_Apowers[1] + n*n;    /* d_Apowers[2] = A^4 */
    1260             :   d_Apowers[3] = d_Apowers[2] + n*n;    /* d_Apowers[3] = A^6 */
    1261             :   d_Apowers[4] = d_Apowers[3] + n*n;    /* d_Apowers[4] = A^8 */
    1262             :   d_Q = d_Apowers[4] + n*n;
    1263             :   d_W = d_Q + n*n;
    1264             : 
    1265             :   PetscCall(PetscLogGpuTimeBegin());
    1266             : 
    1267             :   PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[0],n_,d_Apowers[0],n_,&szero,d_Apowers[1],n_));
    1268             :   PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[1],n_,d_Apowers[1],n_,&szero,d_Apowers[2],n_));
    1269             :   PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[1],n_,d_Apowers[2],n_,&szero,d_Apowers[3],n_));
    1270             :   PetscCall(PetscLogGpuFlops(6.0*n*n*n));
    1271             : 
    1272             :   PetscCallCUDA(cudaMemcpy(Apowers[0],d_Apowers[0],n2*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
    1273             :   PetscCallCUDA(cudaMemcpy(Apowers[1],d_Apowers[1],3*n2*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
    1274             :   PetscCall(PetscLogGpuToCpu(4*n2*sizeof(PetscScalar)));
    1275             :   /* Compute scaling parameter and order of Pade approximant */
    1276             :   PetscCall(expm_params(n,Apowers,&s,&m,Apowers[4]));
    1277             : 
    1278             :   if (s) { /* rescale */
    1279             :     for (j=0;j<4;j++) {
    1280             :       scale = PetscPowRealInt(2.0,-PetscMax(2*j,1)*s);
    1281             :       PetscCallCUBLAS(cublasXscal(cublasv2handle,n2,&scale,d_Apowers[j],one));
    1282             :     }
    1283             :     PetscCall(PetscLogGpuFlops(4.0*n*n));
    1284             :   }
    1285             : 
    1286             :   /* Evaluate the Pade approximant */
    1287             :   switch (m) {
    1288             :     case 3:  c = c3;  break;
    1289             :     case 5:  c = c5;  break;
    1290             :     case 7:  c = c7;  break;
    1291             :     case 9:  c = c9;  break;
    1292             :     case 13: c = c13; break;
    1293             :     default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %" PetscInt_FMT,m);
    1294             :   }
    1295             :   d_P = d_Ba;
    1296             :   switch (m) {
    1297             :     case 3:
    1298             :     case 5:
    1299             :     case 7:
    1300             :     case 9:
    1301             :       if (m==9) PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[1],n_,d_Apowers[3],n_,&szero,d_Apowers[4],n_));
    1302             :       PetscCallCUDA(cudaMemset(d_P,0,sizeof(PetscScalar)*n2));
    1303             :       PetscCallCUDA(cudaMemset(d_Q,0,sizeof(PetscScalar)*n2));
    1304             :       PetscCall(set_diagonal(n,d_P,n,c[1]));
    1305             :       PetscCall(set_diagonal(n,d_Q,n,c[0]));
    1306             :       for (j=m;j>=3;j-=2) {
    1307             :         PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[j],d_Apowers[(j+1)/2-1],one,d_P,one));
    1308             :         PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[j-1],d_Apowers[(j+1)/2-1],one,d_Q,one));
    1309             :         PetscCall(PetscLogGpuFlops(4.0*n*n));
    1310             :       }
    1311             :       PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[0],n_,d_P,n_,&szero,d_W,n_));
    1312             :       PetscCall(PetscLogGpuFlops(2.0*n*n*n));
    1313             :       SlepcSwap(d_P,d_W,aux);
    1314             :       break;
    1315             :     case 13:
    1316             :       /*  P = A*(Apowers[3]*(c[13]*Apowers[3] + c[11]*Apowers[2] + c[9]*Apowers[1])
    1317             :           + c[7]*Apowers[3] + c[5]*Apowers[2] + c[3]*Apowers[1] + c[1]*I)       */
    1318             :       PetscCallCUDA(cudaMemcpy(d_P,d_Apowers[3],n2*sizeof(PetscScalar),cudaMemcpyDeviceToDevice));
    1319             :       PetscCallCUBLAS(cublasXscal(cublasv2handle,n2,&c[13],d_P,one));
    1320             :       PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[11],d_Apowers[2],one,d_P,one));
    1321             :       PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[9],d_Apowers[1],one,d_P,one));
    1322             :       PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[3],n_,d_P,n_,&szero,d_W,n_));
    1323             :       PetscCall(PetscLogGpuFlops(5.0*n*n+2.0*n*n*n));
    1324             : 
    1325             :       PetscCallCUDA(cudaMemset(d_P,0,sizeof(PetscScalar)*n2));
    1326             :       PetscCall(set_diagonal(n,d_P,n,c[1]));
    1327             :       PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[7],d_Apowers[3],one,d_P,one));
    1328             :       PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[5],d_Apowers[2],one,d_P,one));
    1329             :       PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[3],d_Apowers[1],one,d_P,one));
    1330             :       PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&sone,d_P,one,d_W,one));
    1331             :       PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[0],n_,d_W,n_,&szero,d_P,n_));
    1332             :       PetscCall(PetscLogGpuFlops(7.0*n*n+2.0*n*n*n));
    1333             :       /*  Q = Apowers[3]*(c[12]*Apowers[3] + c[10]*Apowers[2] + c[8]*Apowers[1])
    1334             :           + c[6]*Apowers[3] + c[4]*Apowers[2] + c[2]*Apowers[1] + c[0]*I        */
    1335             :       PetscCallCUDA(cudaMemcpy(d_Q,d_Apowers[3],n2*sizeof(PetscScalar),cudaMemcpyDeviceToDevice));
    1336             :       PetscCallCUBLAS(cublasXscal(cublasv2handle,n2,&c[12],d_Q,one));
    1337             :       PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[10],d_Apowers[2],one,d_Q,one));
    1338             :       PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[8],d_Apowers[1],one,d_Q,one));
    1339             :       PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[3],n_,d_Q,n_,&szero,d_W,n_));
    1340             :       PetscCall(PetscLogGpuFlops(5.0*n*n+2.0*n*n*n));
    1341             :       PetscCallCUDA(cudaMemset(d_Q,0,sizeof(PetscScalar)*n2));
    1342             :       PetscCall(set_diagonal(n,d_Q,n,c[0]));
    1343             :       PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[6],d_Apowers[3],one,d_Q,one));
    1344             :       PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[4],d_Apowers[2],one,d_Q,one));
    1345             :       PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[2],d_Apowers[1],one,d_Q,one));
    1346             :       PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&sone,d_W,one,d_Q,one));
    1347             :       PetscCall(PetscLogGpuFlops(7.0*n*n));
    1348             :       break;
    1349             :     default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %" PetscInt_FMT,m);
    1350             :   }
    1351             :   PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&smone,d_P,one,d_Q,one));
    1352             : 
    1353             :   PetscCallMAGMA(magma_xgesv_gpu,n_,n_,d_Q,n_,ipiv,d_P,n_);
    1354             : 
    1355             :   PetscCallCUBLAS(cublasXscal(cublasv2handle,n2,&stwo,d_P,one));
    1356             :   PetscCall(shift_diagonal(n,d_P,n,sone));
    1357             :   PetscCall(PetscLogGpuFlops(2.0*n*n*n/3.0+4.0*n*n));
    1358             : 
    1359             :   /* Squaring */
    1360             :   for (j=1;j<=s;j++) {
    1361             :     PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_P,n_,d_P,n_,&szero,d_W,n_));
    1362             :     SlepcSwap(d_P,d_W,aux);
    1363             :   }
    1364             :   PetscCall(PetscLogGpuFlops(2.0*n*n*n*s));
    1365             :   PetscCall(PetscLogGpuTimeEnd());
    1366             : 
    1367             :   PetscCall(PetscFree2(work,ipiv));
    1368             :   if (d_P!=d_Ba) PetscCallCUDA(cudaMemcpy(d_Ba,d_P,n2*sizeof(PetscScalar),cudaMemcpyDeviceToDevice));
    1369             :   if (A!=B) {
    1370             :     if (s>0.5) {  /* undo scaling */
    1371             :       scale = 1.0/PetscPowRealInt(2.0,-s);
    1372             :       PetscCallCUBLAS(cublasXscal(cublasv2handle,n2,&scale,d_Apowers[0],one));
    1373             :     }
    1374             :     PetscCall(MatDenseCUDARestoreArrayRead(A,(const PetscScalar**)&d_Apowers[0]));
    1375             :   }
    1376             :   PetscCall(MatDenseCUDARestoreArrayWrite(B,&d_Ba));
    1377             :   PetscCallCUDA(cudaFree(d_work));
    1378             :   PetscFunctionReturn(PETSC_SUCCESS);
    1379             : }
    1380             : 
    1381             : /*
    1382             :  * Matrix exponential implementation based on algorithm and matlab code by Stefan Guettel
    1383             :  * and Yuji Nakatsukasa
    1384             :  *
    1385             :  *     Stefan Guettel and Yuji Nakatsukasa, "Scaled and Squared Subdiagonal Pade'
    1386             :  *     Approximation for the Matrix Exponential",
    1387             :  *     SIAM J. Matrix Anal. Appl. 37(1):145-170, 2016.
    1388             :  *     https://doi.org/10.1137/15M1027553
    1389             :  */
    1390             : PetscErrorCode FNEvaluateFunctionMat_Exp_GuettelNakatsukasa_CUDAm(FN fn,Mat A,Mat B)
    1391             : {
    1392             :   PetscInt          i,j,n_,s,k,m,mod;
    1393             :   PetscBLASInt      n=0,n2=0,irsize=0,rsizediv2,ipsize=0,iremainsize=0,query=-1,*piv,minlen,lwork=0,one=1;
    1394             :   PetscReal         nrm,shift=0.0,rone=1.0,rzero=0.0;
    1395             : #if defined(PETSC_USE_COMPLEX)
    1396             :   PetscReal         *rwork=NULL;
    1397             : #endif
    1398             :   PetscComplex      *d_As,*d_RR,*d_RR2,*d_expmA,*d_expmA2,*d_Maux,*d_Maux2,rsize,*r,psize,*p,remainsize,*remainterm,*rootp,*rootq,mult=0.0,scale,cone=1.0,czero=0.0,*aux;
    1399             :   PetscScalar       *d_Aa,*d_Ba,*d_Ba2,*Maux,*d_sMaux,*wr,*wi,expshift,sone=1.0,szero=0.0,*work,work1,*saux;
    1400             :   const PetscScalar *Aa;
    1401             :   PetscBool         isreal,*d_isreal,flg;
    1402             :   cublasHandle_t    cublasv2handle;
    1403             : 
    1404             :   PetscFunctionBegin;
    1405             :   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* For CUDA event timers */
    1406             :   PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    1407             :   PetscCall(SlepcMagmaInit());
    1408             :   PetscCall(MatGetSize(A,&n_,NULL));
    1409             :   PetscCall(PetscBLASIntCast(n_,&n));
    1410             :   PetscCall(PetscBLASIntCast(n*n,&n2));
    1411             : 
    1412             :   if (A==B) {
    1413             :     PetscCallCUDA(cudaMalloc((void **)&d_Aa,sizeof(PetscScalar)*n2));
    1414             :     PetscCall(MatDenseCUDAGetArrayRead(A,&Aa));
    1415             :     PetscCallCUDA(cudaMemcpy(d_Aa,Aa,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice));
    1416             :     PetscCall(MatDenseCUDARestoreArrayRead(A,&Aa));
    1417             :   } else PetscCall(MatDenseCUDAGetArrayRead(A,(const PetscScalar**)&d_Aa));
    1418             :   PetscCall(MatDenseCUDAGetArrayWrite(B,&d_Ba));
    1419             :   d_Ba2 = d_Ba;
    1420             : 
    1421             :   PetscCallCUDA(cudaMalloc((void **)&d_isreal,sizeof(PetscBool)));
    1422             :   PetscCallCUDA(cudaMalloc((void **)&d_sMaux,sizeof(PetscScalar)*n2));
    1423             :   PetscCallCUDA(cudaMalloc((void **)&d_Maux,sizeof(PetscComplex)*n2));
    1424             : 
    1425             :   PetscCall(PetscLogGpuTimeBegin());
    1426             :   d_Maux2 = d_Maux;
    1427             :   PetscCall(PetscOptionsGetReal(NULL,NULL,"-fn_expm_estimated_eig",&shift,&flg));
    1428             :   if (!flg) {
    1429             :     PetscCall(PetscMalloc2(n,&wr,n,&wi));
    1430             :     /* estimate rightmost eigenvalue and shift A with it */
    1431             :     PetscCall(PetscMalloc1(n2,&Maux));
    1432             :     PetscCall(MatDenseGetArrayRead(A,&Aa));
    1433             :     PetscCall(PetscArraycpy(Maux,Aa,n2));
    1434             :     PetscCall(MatDenseRestoreArrayRead(A,&Aa));
    1435             : #if !defined(PETSC_USE_COMPLEX)
    1436             :     PetscCallMAGMA(magma_xgeev,MagmaNoVec,MagmaNoVec,n,Maux,n,wr,wi,NULL,n,NULL,n,&work1,query);
    1437             :     PetscCall(PetscBLASIntCast((PetscInt)PetscRealPart(work1),&lwork));
    1438             :     PetscCall(PetscMalloc1(lwork,&work));
    1439             :     PetscCallMAGMA(magma_xgeev,MagmaNoVec,MagmaNoVec,n,Maux,n,wr,wi,NULL,n,NULL,n,work,lwork);
    1440             :     PetscCall(PetscFree(work));
    1441             : #else
    1442             :     PetscCallMAGMA(magma_xgeev,MagmaNoVec,MagmaNoVec,n,Maux,n,wr,NULL,n,NULL,n,&work1,query,rwork);
    1443             :     PetscCall(PetscBLASIntCast((PetscInt)PetscRealPart(work1),&lwork));
    1444             :     PetscCall(PetscMalloc2(2*n,&rwork,lwork,&work));
    1445             :     PetscCallMAGMA(magma_xgeev,MagmaNoVec,MagmaNoVec,n,Maux,n,wr,NULL,n,NULL,n,work,lwork,rwork);
    1446             :     PetscCall(PetscFree2(rwork,work));
    1447             : #endif
    1448             :     PetscCall(PetscFree(Maux));
    1449             :     PetscCall(PetscLogGpuFlops(25.0*n*n*n+(n*n*n)/3.0+1.0*n*n*n));
    1450             : 
    1451             :     shift = PetscRealPart(wr[0]);
    1452             :     for (i=1;i<n;i++) {
    1453             :       if (PetscRealPart(wr[i]) > shift) shift = PetscRealPart(wr[i]);
    1454             :     }
    1455             :     PetscCall(PetscFree2(wr,wi));
    1456             :   }
    1457             :   /* shift so that largest real part is (about) 0 */
    1458             :   PetscCallCUDA(cudaMemcpy(d_sMaux,d_Aa,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice));
    1459             :   if (shift) {
    1460             :     PetscCall(shift_diagonal(n,d_sMaux,n,-shift));
    1461             :     PetscCall(PetscLogGpuFlops(1.0*n));
    1462             :   }
    1463             : #if defined(PETSC_USE_COMPLEX)
    1464             :   PetscCallCUDA(cudaMemcpy(d_Maux,d_Aa,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice));
    1465             :   if (shift) {
    1466             :     PetscCall(shift_diagonal(n,d_Maux,n,-shift));
    1467             :     PetscCall(PetscLogGpuFlops(1.0*n));
    1468             :   }
    1469             : #endif
    1470             :   if (A!=B) PetscCall(MatDenseCUDARestoreArrayRead(A,(const PetscScalar**)&d_Aa));
    1471             :   else PetscCallCUDA(cudaFree(d_Aa));
    1472             : 
    1473             :   /* estimate norm(A) and select the scaling factor */
    1474             :   PetscCallCUBLAS(cublasXnrm2(cublasv2handle,n2,d_sMaux,one,&nrm));
    1475             :   PetscCall(PetscLogGpuFlops(2.0*n*n));
    1476             :   PetscCall(sexpm_params(nrm,&s,&k,&m));
    1477             :   if (s==0 && k==1 && m==0) { /* exp(A) = I+A to eps! */
    1478             :     if (shift) expshift = PetscExpReal(shift);
    1479             :     PetscCall(shift_Cdiagonal(n,d_Maux,n,rone,rzero));
    1480             :     if (shift) {
    1481             :       PetscCallCUBLAS(cublasXscal(cublasv2handle,n2,&expshift,d_sMaux,one));
    1482             :       PetscCall(PetscLogGpuFlops(1.0*(n+n2)));
    1483             :     } else PetscCall(PetscLogGpuFlops(1.0*n));
    1484             :     PetscCallCUDA(cudaMemcpy(d_Ba,d_sMaux,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice));
    1485             :     PetscCallCUDA(cudaFree(d_isreal));
    1486             :     PetscCallCUDA(cudaFree(d_sMaux));
    1487             :     PetscCallCUDA(cudaFree(d_Maux));
    1488             :     PetscCall(MatDenseCUDARestoreArrayWrite(B,&d_Ba));
    1489             :     PetscFunctionReturn(PETSC_SUCCESS); /* quick return */
    1490             :   }
    1491             : 
    1492             :   PetscCallCUDA(cudaMalloc((void **)&d_expmA,sizeof(PetscComplex)*n2));
    1493             :   PetscCallCUDA(cudaMalloc((void **)&d_As,sizeof(PetscComplex)*n2));
    1494             :   PetscCallCUDA(cudaMalloc((void **)&d_RR,sizeof(PetscComplex)*n2));
    1495             :   d_expmA2 = d_expmA; d_RR2 = d_RR;
    1496             :   PetscCall(PetscMalloc1(n,&piv));
    1497             :   /* scale matrix */
    1498             : #if !defined(PETSC_USE_COMPLEX)
    1499             :   PetscCall(copy_array2D_S2C(n,n,d_As,n,d_sMaux,n));
    1500             : #else
    1501             :   PetscCallCUDA(cudaMemcpy(d_As,d_sMaux,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice));
    1502             : #endif
    1503             :   scale = 1.0/PetscPowRealInt(2.0,s);
    1504             :   PetscCallCUBLAS(cublasXCscal(cublasv2handle,n2,(const cuComplex *)&scale,(cuComplex *)d_As,one));
    1505             :   PetscCall(SlepcLogGpuFlopsComplex(1.0*n2));
    1506             : 
    1507             :   /* evaluate Pade approximant (partial fraction or product form) */
    1508             :   if (fn->method==3 || !m) { /* partial fraction */
    1509             :     PetscCall(getcoeffs(k,m,&rsize,&psize,&remainsize,PETSC_TRUE));
    1510             :     PetscCall(PetscBLASIntCast((PetscInt)PetscRealPartComplex(rsize),&irsize));
    1511             :     PetscCall(PetscBLASIntCast((PetscInt)PetscRealPartComplex(psize),&ipsize));
    1512             :     PetscCall(PetscBLASIntCast((PetscInt)PetscRealPartComplex(remainsize),&iremainsize));
    1513             :     PetscCall(PetscMalloc3(irsize,&r,ipsize,&p,iremainsize,&remainterm));
    1514             :     PetscCall(getcoeffs(k,m,r,p,remainterm,PETSC_FALSE));
    1515             : 
    1516             :     PetscCallCUDA(cudaMemset(d_expmA,0,sizeof(PetscComplex)*n2));
    1517             : #if !defined(PETSC_USE_COMPLEX)
    1518             :     isreal = PETSC_TRUE;
    1519             : #else
    1520             :     PetscCall(getisreal_array2D(n,n,d_Maux,n,d_isreal));
    1521             :     PetscCallCUDA(cudaMemcpy(&isreal,d_isreal,sizeof(PetscBool),cudaMemcpyDeviceToHost));
    1522             : #endif
    1523             :     if (isreal) {
    1524             :       rsizediv2 = irsize/2;
    1525             :       for (i=0;i<rsizediv2;i++) { /* use partial fraction to get R(As) */
    1526             :         PetscCallCUDA(cudaMemcpy(d_Maux,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice));
    1527             :         PetscCallCUDA(cudaMemset(d_RR,0,sizeof(PetscComplex)*n2));
    1528             :         PetscCall(shift_Cdiagonal(n,d_Maux,n,-PetscRealPartComplex(p[2*i]),-PetscImaginaryPartComplex(p[2*i])));
    1529             :         PetscCall(set_Cdiagonal(n,d_RR,n,PetscRealPartComplex(r[2*i]),PetscImaginaryPartComplex(r[2*i])));
    1530             :         PetscCallMAGMA(magma_Cgesv_gpu,n,n,d_Maux,n,piv,d_RR,n);
    1531             :         PetscCall(add_array2D_Conj(n,n,d_RR,n));
    1532             :         PetscCallCUBLAS(cublasXCaxpy(cublasv2handle,n2,&cone,d_RR,one,d_expmA,one));
    1533             :         /* shift(n) + gesv + axpy(n2) */
    1534             :         PetscCall(SlepcLogGpuFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+2.0*n2));
    1535             :       }
    1536             : 
    1537             :       mod = ipsize % 2;
    1538             :       if (mod) {
    1539             :         PetscCallCUDA(cudaMemcpy(d_Maux,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice));
    1540             :         PetscCallCUDA(cudaMemset(d_RR,0,sizeof(PetscComplex)*n2));
    1541             :         PetscCall(shift_Cdiagonal(n,d_Maux,n,-PetscRealPartComplex(p[ipsize-1]),-PetscImaginaryPartComplex(p[ipsize-1])));
    1542             :         PetscCall(set_Cdiagonal(n,d_RR,n,PetscRealPartComplex(r[irsize-1]),PetscImaginaryPartComplex(r[irsize-1])));
    1543             :         PetscCallMAGMA(magma_Cgesv_gpu,n,n,d_Maux,n,piv,d_RR,n);
    1544             :         PetscCallCUBLAS(cublasXCaxpy(cublasv2handle,n2,&cone,d_RR,one,d_expmA,one));
    1545             :         PetscCall(SlepcLogGpuFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+1.0*n2));
    1546             :       }
    1547             :     } else { /* complex */
    1548             :       for (i=0;i<irsize;i++) { /* use partial fraction to get R(As) */
    1549             :         PetscCallCUDA(cudaMemcpy(d_Maux,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice));
    1550             :         PetscCallCUDA(cudaMemset(d_RR,0,sizeof(PetscComplex)*n2));
    1551             :         PetscCall(shift_Cdiagonal(n,d_Maux,n,-PetscRealPartComplex(p[i]),-PetscImaginaryPartComplex(p[i])));
    1552             :         PetscCall(set_Cdiagonal(n,d_RR,n,PetscRealPartComplex(r[i]),PetscImaginaryPartComplex(r[i])));
    1553             :         PetscCallMAGMA(magma_Cgesv_gpu,n,n,d_Maux,n,piv,d_RR,n);
    1554             :         PetscCallCUBLAS(cublasXCaxpy(cublasv2handle,n2,&cone,d_RR,one,d_expmA,one));
    1555             :         PetscCall(SlepcLogGpuFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+1.0*n2));
    1556             :       }
    1557             :     }
    1558             :     for (i=0;i<iremainsize;i++) {
    1559             :       if (!i) {
    1560             :         PetscCallCUDA(cudaMemset(d_RR,0,sizeof(PetscComplex)*n2));
    1561             :         PetscCall(set_Cdiagonal(n,d_RR,n,PetscRealPartComplex(remainterm[iremainsize-1]),PetscImaginaryPartComplex(remainterm[iremainsize-1])));
    1562             :       } else {
    1563             :         PetscCallCUDA(cudaMemcpy(d_RR,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice));
    1564             :         for (j=1;j<i;j++) {
    1565             :           PetscCallCUBLAS(cublasXCgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&cone,d_RR,n,d_RR,n,&czero,d_Maux,n));
    1566             :           SlepcSwap(d_RR,d_Maux,aux);
    1567             :           PetscCall(SlepcLogGpuFlopsComplex(2.0*n*n*n));
    1568             :         }
    1569             :         PetscCallCUBLAS(cublasXCscal(cublasv2handle,n2,&remainterm[iremainsize-1-i],d_RR,one));
    1570             :         PetscCall(SlepcLogGpuFlopsComplex(1.0*n2));
    1571             :       }
    1572             :       PetscCallCUBLAS(cublasXCaxpy(cublasv2handle,n2,&cone,d_RR,one,d_expmA,one));
    1573             :       PetscCall(SlepcLogGpuFlopsComplex(1.0*n2));
    1574             :     }
    1575             :     PetscCall(PetscFree3(r,p,remainterm));
    1576             :   } else { /* product form, default */
    1577             :     PetscCall(getcoeffsproduct(k,m,&rsize,&psize,&mult,PETSC_TRUE));
    1578             :     PetscCall(PetscBLASIntCast((PetscInt)PetscRealPartComplex(rsize),&irsize));
    1579             :     PetscCall(PetscBLASIntCast((PetscInt)PetscRealPartComplex(psize),&ipsize));
    1580             :     PetscCall(PetscMalloc2(irsize,&rootp,ipsize,&rootq));
    1581             :     PetscCall(getcoeffsproduct(k,m,rootp,rootq,&mult,PETSC_FALSE));
    1582             : 
    1583             :     PetscCallCUDA(cudaMemset(d_expmA,0,sizeof(PetscComplex)*n2));
    1584             :     PetscCall(set_Cdiagonal(n,d_expmA,n,rone,rzero)); /* initialize */
    1585             :     minlen = PetscMin(irsize,ipsize);
    1586             :     for (i=0;i<minlen;i++) {
    1587             :       PetscCallCUDA(cudaMemcpy(d_RR,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice));
    1588             :       PetscCall(shift_Cdiagonal(n,d_RR,n,-PetscRealPartComplex(rootp[i]),-PetscImaginaryPartComplex(rootp[i])));
    1589             :       PetscCallCUBLAS(cublasXCgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&cone,d_RR,n,d_expmA,n,&czero,d_Maux,n));
    1590             :       SlepcSwap(d_expmA,d_Maux,aux);
    1591             :       PetscCallCUDA(cudaMemcpy(d_RR,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice));
    1592             :       PetscCall(shift_Cdiagonal(n,d_RR,n,-PetscRealPartComplex(rootq[i]),-PetscImaginaryPartComplex(rootq[i])));
    1593             :       PetscCallMAGMA(magma_Cgesv_gpu,n,n,d_RR,n,piv,d_expmA,n);
    1594             :       /* shift(n) + gemm + shift(n) + gesv */
    1595             :       PetscCall(SlepcLogGpuFlopsComplex(1.0*n+(2.0*n*n*n)+1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)));
    1596             :     }
    1597             :     /* extra enumerator */
    1598             :     for (i=minlen;i<irsize;i++) {
    1599             :       PetscCallCUDA(cudaMemcpy(d_RR,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice));
    1600             :       PetscCall(shift_Cdiagonal(n,d_RR,n,-PetscRealPartComplex(rootp[i]),-PetscImaginaryPartComplex(rootp[i])));
    1601             :       PetscCallCUBLAS(cublasXCgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&cone,d_RR,n,d_expmA,n,&czero,d_Maux,n));
    1602             :       SlepcSwap(d_expmA,d_Maux,aux);
    1603             :       PetscCall(SlepcLogGpuFlopsComplex(1.0*n+2.0*n*n*n));
    1604             :     }
    1605             :     /* extra denominator */
    1606             :     for (i=minlen;i<ipsize;i++) {
    1607             :       PetscCallCUDA(cudaMemcpy(d_RR,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice));
    1608             :       PetscCall(shift_Cdiagonal(n,d_RR,n,-PetscRealPartComplex(rootq[i]),-PetscImaginaryPartComplex(rootq[i])));
    1609             :       PetscCallMAGMA(magma_Cgesv_gpu,n,n,d_RR,n,piv,d_expmA,n);
    1610             :       PetscCall(SlepcLogGpuFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)));
    1611             :     }
    1612             :     PetscCallCUBLAS(cublasXCscal(cublasv2handle,n2,&mult,d_expmA,one));
    1613             :     PetscCall(SlepcLogGpuFlopsComplex(1.0*n2));
    1614             :     PetscCall(PetscFree2(rootp,rootq));
    1615             :   }
    1616             : 
    1617             : #if !defined(PETSC_USE_COMPLEX)
    1618             :   PetscCall(copy_array2D_C2S(n,n,d_Ba2,n,d_expmA,n));
    1619             : #else
    1620             :   PetscCallCUDA(cudaMemcpy(d_Ba2,d_expmA,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice));
    1621             : #endif
    1622             : 
    1623             :   /* perform repeated squaring */
    1624             :   for (i=0;i<s;i++) { /* final squaring */
    1625             :     PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_Ba2,n,d_Ba2,n,&szero,d_sMaux,n));
    1626             :     SlepcSwap(d_Ba2,d_sMaux,saux);
    1627             :     PetscCall(PetscLogGpuFlops(2.0*n*n*n));
    1628             :   }
    1629             :   if (d_Ba2!=d_Ba) {
    1630             :     PetscCallCUDA(cudaMemcpy(d_Ba,d_Ba2,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice));
    1631             :     d_sMaux = d_Ba2;
    1632             :   }
    1633             :   if (shift) {
    1634             :     expshift = PetscExpReal(shift);
    1635             :     PetscCallCUBLAS(cublasXscal(cublasv2handle,n2,&expshift,d_Ba,one));
    1636             :     PetscCall(PetscLogGpuFlops(1.0*n2));
    1637             :   }
    1638             : 
    1639             :   PetscCall(PetscLogGpuTimeEnd());
    1640             : 
    1641             :   /* restore pointers */
    1642             :   d_Maux = d_Maux2; d_expmA = d_expmA2; d_RR = d_RR2;
    1643             :   PetscCall(MatDenseCUDARestoreArrayWrite(B,&d_Ba));
    1644             :   PetscCallCUDA(cudaFree(d_isreal));
    1645             :   PetscCallCUDA(cudaFree(d_sMaux));
    1646             :   PetscCallCUDA(cudaFree(d_Maux));
    1647             :   PetscCallCUDA(cudaFree(d_expmA));
    1648             :   PetscCallCUDA(cudaFree(d_As));
    1649             :   PetscCallCUDA(cudaFree(d_RR));
    1650             :   PetscCall(PetscFree(piv));
    1651             :   PetscFunctionReturn(PETSC_SUCCESS);
    1652             : }
    1653             : #endif /* PETSC_HAVE_MAGMA */
    1654             : #endif /* PETSC_HAVE_CUDA */
    1655             : 
    1656          30 : static PetscErrorCode FNView_Exp(FN fn,PetscViewer viewer)
    1657             : {
    1658          30 :   PetscBool      isascii;
    1659          30 :   char           str[50];
    1660          30 :   const char     *methodname[] = {
    1661             :                   "scaling & squaring, [m/m] Pade approximant (Higham)",
    1662             :                   "scaling & squaring, [6/6] Pade approximant",
    1663             :                   "scaling & squaring, subdiagonal Pade approximant (product form)",
    1664             :                   "scaling & squaring, subdiagonal Pade approximant (partial fraction)"
    1665             :   };
    1666          30 :   const int      nmeth=PETSC_STATIC_ARRAY_LENGTH(methodname);
    1667             : 
    1668          30 :   PetscFunctionBegin;
    1669          30 :   PetscCall(PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&isascii));
    1670          30 :   if (isascii) {
    1671          30 :     if (fn->beta==(PetscScalar)1.0) {
    1672          25 :       if (fn->alpha==(PetscScalar)1.0) PetscCall(PetscViewerASCIIPrintf(viewer,"  exponential: exp(x)\n"));
    1673             :       else {
    1674          11 :         PetscCall(SlepcSNPrintfScalar(str,sizeof(str),fn->alpha,PETSC_TRUE));
    1675          11 :         PetscCall(PetscViewerASCIIPrintf(viewer,"  exponential: exp(%s*x)\n",str));
    1676             :       }
    1677             :     } else {
    1678           5 :       PetscCall(SlepcSNPrintfScalar(str,sizeof(str),fn->beta,PETSC_TRUE));
    1679           5 :       if (fn->alpha==(PetscScalar)1.0) PetscCall(PetscViewerASCIIPrintf(viewer,"  exponential: %s*exp(x)\n",str));
    1680             :       else {
    1681           5 :         PetscCall(PetscViewerASCIIPrintf(viewer,"  exponential: %s",str));
    1682           5 :         PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE));
    1683           5 :         PetscCall(SlepcSNPrintfScalar(str,sizeof(str),fn->alpha,PETSC_TRUE));
    1684           5 :         PetscCall(PetscViewerASCIIPrintf(viewer,"*exp(%s*x)\n",str));
    1685           5 :         PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE));
    1686             :       }
    1687             :     }
    1688          30 :     if (fn->method<nmeth) PetscCall(PetscViewerASCIIPrintf(viewer,"  computing matrix functions with: %s\n",methodname[fn->method]));
    1689             :   }
    1690          30 :   PetscFunctionReturn(PETSC_SUCCESS);
    1691             : }
    1692             : 
    1693         115 : SLEPC_EXTERN PetscErrorCode FNCreate_Exp(FN fn)
    1694             : {
    1695         115 :   PetscFunctionBegin;
    1696         115 :   fn->ops->evaluatefunction       = FNEvaluateFunction_Exp;
    1697         115 :   fn->ops->evaluatederivative     = FNEvaluateDerivative_Exp;
    1698         115 :   fn->ops->evaluatefunctionmat[0] = FNEvaluateFunctionMat_Exp_Higham;
    1699         115 :   fn->ops->evaluatefunctionmat[1] = FNEvaluateFunctionMat_Exp_Pade;
    1700         115 :   fn->ops->evaluatefunctionmat[2] = FNEvaluateFunctionMat_Exp_GuettelNakatsukasa; /* product form */
    1701         115 :   fn->ops->evaluatefunctionmat[3] = FNEvaluateFunctionMat_Exp_GuettelNakatsukasa; /* partial fraction */
    1702             : #if defined(PETSC_HAVE_CUDA)
    1703             :   fn->ops->evaluatefunctionmatcuda[1] = FNEvaluateFunctionMat_Exp_Pade_CUDA;
    1704             : #if defined(PETSC_HAVE_MAGMA)
    1705             :   fn->ops->evaluatefunctionmatcuda[0] = FNEvaluateFunctionMat_Exp_Higham_CUDAm;
    1706             :   fn->ops->evaluatefunctionmatcuda[1] = FNEvaluateFunctionMat_Exp_Pade_CUDAm;
    1707             :   fn->ops->evaluatefunctionmatcuda[2] = FNEvaluateFunctionMat_Exp_GuettelNakatsukasa_CUDAm; /* product form */
    1708             :   fn->ops->evaluatefunctionmatcuda[3] = FNEvaluateFunctionMat_Exp_GuettelNakatsukasa_CUDAm; /* partial fraction */
    1709             : #endif
    1710             : #endif
    1711         115 :   fn->ops->view                   = FNView_Exp;
    1712         115 :   PetscFunctionReturn(PETSC_SUCCESS);
    1713             : }

Generated by: LCOV version 1.14