Line data Source code
1 : /*
2 : - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
3 : SLEPc - Scalable Library for Eigenvalue Problem Computations
4 : Copyright (c) 2002-, Universitat Politecnica de Valencia, Spain
5 :
6 : This file is part of SLEPc.
7 : SLEPc is distributed under a 2-clause BSD license (see LICENSE).
8 : - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
9 : */
10 : /*
11 : Exponential function exp(x)
12 : */
13 :
14 : #include <slepc/private/fnimpl.h> /*I "slepcfn.h" I*/
15 : #include <slepcblaslapack.h>
16 :
17 2117 : static PetscErrorCode FNEvaluateFunction_Exp(FN fn,PetscScalar x,PetscScalar *y)
18 : {
19 2117 : PetscFunctionBegin;
20 2117 : *y = PetscExpScalar(x);
21 2117 : PetscFunctionReturn(PETSC_SUCCESS);
22 : }
23 :
24 676 : static PetscErrorCode FNEvaluateDerivative_Exp(FN fn,PetscScalar x,PetscScalar *y)
25 : {
26 676 : PetscFunctionBegin;
27 676 : *y = PetscExpScalar(x);
28 676 : PetscFunctionReturn(PETSC_SUCCESS);
29 : }
30 :
31 : #define MAX_PADE 6
32 :
33 16 : static PetscErrorCode FNEvaluateFunctionMat_Exp_Pade(FN fn,Mat A,Mat B)
34 : {
35 16 : PetscBLASInt n=0,ld,ld2,*ipiv,info,inc=1;
36 16 : PetscInt m,j,k,sexp;
37 16 : PetscBool odd;
38 16 : const PetscInt p=MAX_PADE;
39 16 : PetscReal c[MAX_PADE+1],s,*rwork;
40 16 : PetscScalar scale,mone=-1.0,one=1.0,two=2.0,zero=0.0;
41 16 : PetscScalar *Ba,*As,*A2,*Q,*P,*W,*aux;
42 16 : const PetscScalar *Aa;
43 :
44 16 : PetscFunctionBegin;
45 16 : PetscCall(MatDenseGetArrayRead(A,&Aa));
46 16 : PetscCall(MatDenseGetArray(B,&Ba));
47 16 : PetscCall(MatGetSize(A,&m,NULL));
48 16 : PetscCall(PetscBLASIntCast(m,&n));
49 16 : ld = n;
50 16 : ld2 = ld*ld;
51 16 : P = Ba;
52 16 : PetscCall(PetscMalloc6(m*m,&Q,m*m,&W,m*m,&As,m*m,&A2,ld,&rwork,ld,&ipiv));
53 16 : PetscCall(PetscArraycpy(As,Aa,ld2));
54 :
55 : /* Pade' coefficients */
56 16 : c[0] = 1.0;
57 112 : for (k=1;k<=p;k++) c[k] = c[k-1]*(p+1-k)/(k*(2*p+1-k));
58 :
59 : /* Scaling */
60 16 : s = LAPACKlange_("I",&n,&n,As,&ld,rwork);
61 16 : PetscCall(PetscLogFlops(1.0*n*n));
62 16 : if (s>0.5) {
63 16 : sexp = PetscMax(0,(int)(PetscLogReal(s)/PetscLogReal(2.0))+2);
64 16 : scale = PetscPowRealInt(2.0,-sexp);
65 16 : PetscCallBLAS("BLASscal",BLASscal_(&ld2,&scale,As,&inc));
66 16 : PetscCall(PetscLogFlops(1.0*n*n));
67 : } else sexp = 0;
68 :
69 : /* Horner evaluation */
70 16 : PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,As,&ld,As,&ld,&zero,A2,&ld));
71 16 : PetscCall(PetscLogFlops(2.0*n*n*n));
72 16 : PetscCall(PetscArrayzero(Q,ld2));
73 16 : PetscCall(PetscArrayzero(P,ld2));
74 616 : for (j=0;j<n;j++) {
75 600 : Q[j+j*ld] = c[p];
76 600 : P[j+j*ld] = c[p-1];
77 : }
78 :
79 : odd = PETSC_TRUE;
80 96 : for (k=p-1;k>0;k--) {
81 80 : if (odd) {
82 48 : PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,Q,&ld,A2,&ld,&zero,W,&ld));
83 48 : SlepcSwap(Q,W,aux);
84 1848 : for (j=0;j<n;j++) Q[j+j*ld] += c[k-1];
85 : odd = PETSC_FALSE;
86 : } else {
87 32 : PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,A2,&ld,&zero,W,&ld));
88 32 : SlepcSwap(P,W,aux);
89 1232 : for (j=0;j<n;j++) P[j+j*ld] += c[k-1];
90 : odd = PETSC_TRUE;
91 : }
92 80 : PetscCall(PetscLogFlops(2.0*n*n*n));
93 : }
94 : /*if (odd) {
95 : PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,Q,&ld,As,&ld,&zero,W,&ld));
96 : SlepcSwap(Q,W,aux);
97 : PetscCallBLAS("BLASaxpy",BLASaxpy_(&ld2,&mone,P,&inc,Q,&inc));
98 : PetscCallBLAS("LAPACKgesv",LAPACKgesv_(&n,&n,Q,&ld,ipiv,P,&ld,&info));
99 : SlepcCheckLapackInfo("gesv",info);
100 : PetscCallBLAS("BLASscal",BLASscal_(&ld2,&two,P,&inc));
101 : for (j=0;j<n;j++) P[j+j*ld] += 1.0;
102 : PetscCallBLAS("BLASscal",BLASscal_(&ld2,&mone,P,&inc));
103 : } else {*/
104 16 : PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,As,&ld,&zero,W,&ld));
105 16 : SlepcSwap(P,W,aux);
106 16 : PetscCallBLAS("BLASaxpy",BLASaxpy_(&ld2,&mone,P,&inc,Q,&inc));
107 16 : PetscCallBLAS("LAPACKgesv",LAPACKgesv_(&n,&n,Q,&ld,ipiv,P,&ld,&info));
108 16 : SlepcCheckLapackInfo("gesv",info);
109 16 : PetscCallBLAS("BLASscal",BLASscal_(&ld2,&two,P,&inc));
110 616 : for (j=0;j<n;j++) P[j+j*ld] += 1.0;
111 : /*}*/
112 16 : PetscCall(PetscLogFlops(2.0*n*n*n+2.0*n*n*n/3.0+4.0*n*n));
113 :
114 68 : for (k=1;k<=sexp;k++) {
115 52 : PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,P,&ld,&zero,W,&ld));
116 52 : PetscCall(PetscArraycpy(P,W,ld2));
117 : }
118 16 : if (P!=Ba) PetscCall(PetscArraycpy(Ba,P,ld2));
119 16 : PetscCall(PetscLogFlops(2.0*n*n*n*sexp));
120 :
121 16 : PetscCall(PetscFree6(Q,W,As,A2,rwork,ipiv));
122 16 : PetscCall(MatDenseRestoreArrayRead(A,&Aa));
123 16 : PetscCall(MatDenseRestoreArray(B,&Ba));
124 16 : PetscFunctionReturn(PETSC_SUCCESS);
125 : }
126 :
127 : #if defined(PETSC_HAVE_COMPLEX)
128 : /*
129 : * Set scaling factor (s) and Pade degree (k,m)
130 : */
131 48 : static PetscErrorCode sexpm_params(PetscReal nrm,PetscInt *s,PetscInt *k,PetscInt *m)
132 : {
133 48 : PetscFunctionBegin;
134 48 : if (nrm>1) {
135 32 : if (nrm<200) {*s = 4; *k = 5; *m = *k-1;}
136 4 : else if (nrm<1e4) {*s = 4; *k = 4; *m = *k+1;}
137 0 : else if (nrm<1e6) {*s = 4; *k = 3; *m = *k+1;}
138 0 : else if (nrm<1e9) {*s = 3; *k = 3; *m = *k+1;}
139 0 : else if (nrm<1e11) {*s = 2; *k = 3; *m = *k+1;}
140 0 : else if (nrm<1e12) {*s = 2; *k = 2; *m = *k+1;}
141 0 : else if (nrm<1e14) {*s = 2; *k = 1; *m = *k+1;}
142 0 : else {*s = 1; *k = 1; *m = *k+1;}
143 : } else { /* nrm<1 */
144 16 : if (nrm>0.5) {*s = 4; *k = 4; *m = *k-1;}
145 12 : else if (nrm>0.3) {*s = 3; *k = 4; *m = *k-1;}
146 8 : else if (nrm>0.15) {*s = 2; *k = 4; *m = *k-1;}
147 8 : else if (nrm>0.07) {*s = 1; *k = 4; *m = *k-1;}
148 8 : else if (nrm>0.01) {*s = 0; *k = 4; *m = *k-1;}
149 8 : else if (nrm>3e-4) {*s = 0; *k = 3; *m = *k-1;}
150 8 : else if (nrm>1e-5) {*s = 0; *k = 3; *m = 0;}
151 8 : else if (nrm>1e-8) {*s = 0; *k = 2; *m = 0;}
152 8 : else {*s = 0; *k = 1; *m = 0;}
153 : }
154 48 : PetscFunctionReturn(PETSC_SUCCESS);
155 : }
156 :
157 : /*
158 : * Partial fraction form coefficients.
159 : * If query, the function returns the size necessary to store the coefficients.
160 : */
161 40 : static PetscErrorCode getcoeffs(PetscInt k,PetscInt m,PetscComplex *r,PetscComplex *q,PetscComplex *remain,PetscBool query)
162 : {
163 40 : PetscInt i;
164 40 : const PetscComplex /* m == k+1 */
165 40 : p1r4[5] = {-1.582680186458572e+01 - 2.412564578224361e+01*PETSC_i,
166 40 : -1.582680186458572e+01 + 2.412564578224361e+01*PETSC_i,
167 40 : 1.499984465975511e+02 + 6.804227952202417e+01*PETSC_i,
168 40 : 1.499984465975511e+02 - 6.804227952202417e+01*PETSC_i,
169 : -2.733432894659307e+02 },
170 40 : p1q4[5] = { 3.655694325463550e+00 + 6.543736899360086e+00*PETSC_i,
171 40 : 3.655694325463550e+00 - 6.543736899360086e+00*PETSC_i,
172 40 : 5.700953298671832e+00 + 3.210265600308496e+00*PETSC_i,
173 40 : 5.700953298671832e+00 - 3.210265600308496e+00*PETSC_i,
174 : 6.286704751729261e+00 },
175 40 : p1r3[4] = {-1.130153999597152e+01 + 1.247167585025031e+01*PETSC_i,
176 40 : -1.130153999597152e+01 - 1.247167585025031e+01*PETSC_i,
177 40 : 1.330153999597152e+01 - 6.007173273704750e+01*PETSC_i,
178 40 : 1.330153999597152e+01 + 6.007173273704750e+01*PETSC_i},
179 40 : p1q3[4] = { 3.212806896871536e+00 + 4.773087433276636e+00*PETSC_i,
180 40 : 3.212806896871536e+00 - 4.773087433276636e+00*PETSC_i,
181 40 : 4.787193103128464e+00 + 1.567476416895212e+00*PETSC_i,
182 40 : 4.787193103128464e+00 - 1.567476416895212e+00*PETSC_i},
183 40 : p1r2[3] = { 7.648749087422928e+00 + 4.171640244747463e+00*PETSC_i,
184 40 : 7.648749087422928e+00 - 4.171640244747463e+00*PETSC_i,
185 : -1.829749817484586e+01 },
186 40 : p1q2[3] = { 2.681082873627756e+00 + 3.050430199247411e+00*PETSC_i,
187 40 : 2.681082873627756e+00 - 3.050430199247411e+00*PETSC_i,
188 : 3.637834252744491e+00 },
189 40 : p1r1[2] = { 1.000000000000000e+00 - 3.535533905932738e+00*PETSC_i,
190 40 : 1.000000000000000e+00 + 3.535533905932738e+00*PETSC_i},
191 40 : p1q1[2] = { 2.000000000000000e+00 + 1.414213562373095e+00*PETSC_i,
192 40 : 2.000000000000000e+00 - 1.414213562373095e+00*PETSC_i};
193 40 : const PetscComplex /* m == k-1 */
194 40 : m1r5[4] = {-1.423367961376821e+02 - 1.385465094833037e+01*PETSC_i,
195 40 : -1.423367961376821e+02 + 1.385465094833037e+01*PETSC_i,
196 40 : 2.647367961376822e+02 - 4.814394493714596e+02*PETSC_i,
197 40 : 2.647367961376822e+02 + 4.814394493714596e+02*PETSC_i},
198 40 : m1q5[4] = { 5.203941240131764e+00 + 5.805856841805367e+00*PETSC_i,
199 40 : 5.203941240131764e+00 - 5.805856841805367e+00*PETSC_i,
200 40 : 6.796058759868242e+00 + 1.886649260140217e+00*PETSC_i,
201 40 : 6.796058759868242e+00 - 1.886649260140217e+00*PETSC_i},
202 40 : m1r4[3] = { 2.484269593165883e+01 + 7.460342395992306e+01*PETSC_i,
203 40 : 2.484269593165883e+01 - 7.460342395992306e+01*PETSC_i,
204 : -1.734353918633177e+02 },
205 40 : m1q4[3] = { 4.675757014491557e+00 + 3.913489560603711e+00*PETSC_i,
206 40 : 4.675757014491557e+00 - 3.913489560603711e+00*PETSC_i,
207 : 5.648485971016893e+00 },
208 40 : m1r3[2] = { 2.533333333333333e+01 - 2.733333333333333e+01*PETSC_i,
209 40 : 2.533333333333333e+01 + 2.733333333333333e+01*PETSC_i},
210 40 : m1q3[2] = { 4.000000000000000e+00 + 2.000000000000000e+00*PETSC_i,
211 40 : 4.000000000000000e+00 - 2.000000000000000e+00*PETSC_i};
212 40 : const PetscScalar /* m == k-1 */
213 40 : m1remain5[2] = { 2.000000000000000e-01, 9.800000000000000e+00},
214 40 : m1remain4[2] = {-2.500000000000000e-01, -7.750000000000000e+00},
215 40 : m1remain3[2] = { 3.333333333333333e-01, 5.666666666666667e+00},
216 40 : m1remain2[2] = {-0.5, -3.5},
217 40 : remain3[4] = {1.0/6.0, 1.0/2.0, 1, 1},
218 40 : remain2[3] = {1.0/2.0, 1, 1};
219 :
220 40 : PetscFunctionBegin;
221 40 : if (query) { /* query about buffer's size */
222 20 : if (m==k+1) {
223 2 : *remain = 0;
224 2 : *r = *q = k+1;
225 2 : PetscFunctionReturn(PETSC_SUCCESS); /* quick return */
226 : }
227 18 : if (m==k-1) {
228 18 : *remain = 2;
229 18 : if (k==5) *r = *q = 4;
230 4 : else if (k==4) *r = *q = 3;
231 0 : else if (k==3) *r = *q = 2;
232 0 : else if (k==2) *r = *q = 1;
233 : }
234 18 : if (m==0) {
235 0 : *r = *q = 0;
236 0 : *remain = k+1;
237 : }
238 : } else {
239 20 : if (m==k+1) {
240 2 : if (k==4) {
241 12 : for (i=0;i<5;i++) { r[i] = p1r4[i]; q[i] = p1q4[i]; }
242 0 : } else if (k==3) {
243 0 : for (i=0;i<4;i++) { r[i] = p1r3[i]; q[i] = p1q3[i]; }
244 0 : } else if (k==2) {
245 0 : for (i=0;i<3;i++) { r[i] = p1r2[i]; q[i] = p1q2[i]; }
246 0 : } else if (k==1) {
247 0 : for (i=0;i<2;i++) { r[i] = p1r1[i]; q[i] = p1q1[i]; }
248 : }
249 2 : PetscFunctionReturn(PETSC_SUCCESS); /* quick return */
250 : }
251 18 : if (m==k-1) {
252 18 : if (k==5) {
253 70 : for (i=0;i<4;i++) { r[i] = m1r5[i]; q[i] = m1q5[i]; }
254 42 : for (i=0;i<2;i++) remain[i] = m1remain5[i];
255 4 : } else if (k==4) {
256 16 : for (i=0;i<3;i++) { r[i] = m1r4[i]; q[i] = m1q4[i]; }
257 12 : for (i=0;i<2;i++) remain[i] = m1remain4[i];
258 0 : } else if (k==3) {
259 0 : for (i=0;i<2;i++) { r[i] = m1r3[i]; q[i] = m1q3[i]; remain[i] = m1remain3[i]; }
260 0 : } else if (k==2) {
261 0 : r[0] = -13.5; q[0] = 3;
262 0 : for (i=0;i<2;i++) remain[i] = m1remain2[i];
263 : }
264 : }
265 18 : if (m==0) {
266 0 : r = q = NULL;
267 0 : if (k==3) {
268 0 : for (i=0;i<4;i++) remain[i] = remain3[i];
269 0 : } else if (k==2) {
270 0 : for (i=0;i<3;i++) remain[i] = remain2[i];
271 : }
272 : }
273 : }
274 36 : PetscFunctionReturn(PETSC_SUCCESS);
275 : }
276 :
277 : /*
278 : * Product form coefficients.
279 : * If query, the function returns the size necessary to store the coefficients.
280 : */
281 40 : static PetscErrorCode getcoeffsproduct(PetscInt k,PetscInt m,PetscComplex *p,PetscComplex *q,PetscComplex *mult,PetscBool query)
282 : {
283 40 : PetscInt i;
284 40 : const PetscComplex /* m == k+1 */
285 40 : p1p4[4] = {-5.203941240131764e+00 + 5.805856841805367e+00*PETSC_i,
286 40 : -5.203941240131764e+00 - 5.805856841805367e+00*PETSC_i,
287 40 : -6.796058759868242e+00 + 1.886649260140217e+00*PETSC_i,
288 40 : -6.796058759868242e+00 - 1.886649260140217e+00*PETSC_i},
289 40 : p1q4[5] = { 3.655694325463550e+00 + 6.543736899360086e+00*PETSC_i,
290 40 : 3.655694325463550e+00 - 6.543736899360086e+00*PETSC_i,
291 : 6.286704751729261e+00 ,
292 40 : 5.700953298671832e+00 + 3.210265600308496e+00*PETSC_i,
293 40 : 5.700953298671832e+00 - 3.210265600308496e+00*PETSC_i},
294 40 : p1p3[3] = {-4.675757014491557e+00 + 3.913489560603711e+00*PETSC_i,
295 40 : -4.675757014491557e+00 - 3.913489560603711e+00*PETSC_i,
296 : -5.648485971016893e+00 },
297 40 : p1q3[4] = { 3.212806896871536e+00 + 4.773087433276636e+00*PETSC_i,
298 40 : 3.212806896871536e+00 - 4.773087433276636e+00*PETSC_i,
299 40 : 4.787193103128464e+00 + 1.567476416895212e+00*PETSC_i,
300 40 : 4.787193103128464e+00 - 1.567476416895212e+00*PETSC_i},
301 40 : p1p2[2] = {-4.00000000000000e+00 + 2.000000000000000e+00*PETSC_i,
302 40 : -4.00000000000000e+00 - 2.000000000000000e+00*PETSC_i},
303 40 : p1q2[3] = { 2.681082873627756e+00 + 3.050430199247411e+00*PETSC_i,
304 40 : 2.681082873627756e+00 - 3.050430199247411e+00*PETSC_i,
305 : 3.637834252744491e+00 },
306 40 : p1q1[2] = { 2.000000000000000e+00 + 1.414213562373095e+00*PETSC_i,
307 40 : 2.000000000000000e+00 - 1.414213562373095e+00*PETSC_i};
308 40 : const PetscComplex /* m == k-1 */
309 40 : m1p5[5] = {-3.655694325463550e+00 + 6.543736899360086e+00*PETSC_i,
310 40 : -3.655694325463550e+00 - 6.543736899360086e+00*PETSC_i,
311 : -6.286704751729261e+00 ,
312 40 : -5.700953298671832e+00 + 3.210265600308496e+00*PETSC_i,
313 40 : -5.700953298671832e+00 - 3.210265600308496e+00*PETSC_i},
314 40 : m1q5[4] = { 5.203941240131764e+00 + 5.805856841805367e+00*PETSC_i,
315 40 : 5.203941240131764e+00 - 5.805856841805367e+00*PETSC_i,
316 40 : 6.796058759868242e+00 + 1.886649260140217e+00*PETSC_i,
317 40 : 6.796058759868242e+00 - 1.886649260140217e+00*PETSC_i},
318 40 : m1p4[4] = {-3.212806896871536e+00 + 4.773087433276636e+00*PETSC_i,
319 40 : -3.212806896871536e+00 - 4.773087433276636e+00*PETSC_i,
320 40 : -4.787193103128464e+00 + 1.567476416895212e+00*PETSC_i,
321 40 : -4.787193103128464e+00 - 1.567476416895212e+00*PETSC_i},
322 40 : m1q4[3] = { 4.675757014491557e+00 + 3.913489560603711e+00*PETSC_i,
323 40 : 4.675757014491557e+00 - 3.913489560603711e+00*PETSC_i,
324 : 5.648485971016893e+00 },
325 40 : m1p3[3] = {-2.681082873627756e+00 + 3.050430199247411e+00*PETSC_i,
326 40 : -2.681082873627756e+00 - 3.050430199247411e+00*PETSC_i,
327 : -3.637834252744491e+00 },
328 40 : m1q3[2] = { 4.000000000000000e+00 + 2.000000000000000e+00*PETSC_i,
329 40 : 4.000000000000000e+00 - 2.000000000000001e+00*PETSC_i},
330 40 : m1p2[2] = {-2.000000000000000e+00 + 1.414213562373095e+00*PETSC_i,
331 40 : -2.000000000000000e+00 - 1.414213562373095e+00*PETSC_i};
332 :
333 40 : PetscFunctionBegin;
334 40 : if (query) {
335 20 : if (m == k+1) {
336 2 : *mult = 1;
337 2 : *p = k;
338 2 : *q = k+1;
339 2 : PetscFunctionReturn(PETSC_SUCCESS);
340 : }
341 18 : if (m==k-1) {
342 18 : *mult = 1;
343 18 : *p = k;
344 18 : *q = k-1;
345 : }
346 : } else {
347 20 : if (m == k+1) {
348 2 : *mult = PetscPowInt(-1,m);
349 2 : *mult *= m;
350 2 : if (k==4) {
351 10 : for (i=0;i<4;i++) { p[i] = p1p4[i]; q[i] = p1q4[i]; }
352 2 : q[4] = p1q4[4];
353 0 : } else if (k==3) {
354 0 : for (i=0;i<3;i++) { p[i] = p1p3[i]; q[i] = p1q3[i]; }
355 0 : q[3] = p1q3[3];
356 0 : } else if (k==2) {
357 0 : for (i=0;i<2;i++) { p[i] = p1p2[i]; q[i] = p1q2[i]; }
358 0 : q[2] = p1q2[2];
359 0 : } else if (k==1) {
360 0 : p[0] = -3;
361 0 : for (i=0;i<2;i++) q[i] = p1q1[i];
362 : }
363 2 : PetscFunctionReturn(PETSC_SUCCESS);
364 : }
365 18 : if (m==k-1) {
366 18 : *mult = PetscPowInt(-1,m);
367 18 : *mult /= k;
368 18 : if (k==5) {
369 70 : for (i=0;i<4;i++) { p[i] = m1p5[i]; q[i] = m1q5[i]; }
370 14 : p[4] = m1p5[4];
371 4 : } else if (k==4) {
372 16 : for (i=0;i<3;i++) { p[i] = m1p4[i]; q[i] = m1q4[i]; }
373 4 : p[3] = m1p4[3];
374 0 : } else if (k==3) {
375 0 : for (i=0;i<2;i++) { p[i] = m1p3[i]; q[i] = m1q3[i]; }
376 0 : p[2] = m1p3[2];
377 0 : } else if (k==2) {
378 0 : for (i=0;i<2;i++) p[i] = m1p2[i];
379 0 : q[0] = 3;
380 : }
381 : }
382 : }
383 36 : PetscFunctionReturn(PETSC_SUCCESS);
384 : }
385 : #endif /* PETSC_HAVE_COMPLEX */
386 :
387 : #if defined(PETSC_USE_COMPLEX)
388 : static PetscErrorCode getisreal(PetscInt n,PetscComplex *a,PetscBool *result)
389 : {
390 : PetscInt i;
391 :
392 : PetscFunctionBegin;
393 : *result=PETSC_TRUE;
394 : for (i=0;i<n&&*result;i++) {
395 : if (PetscImaginaryPartComplex(a[i])) *result=PETSC_FALSE;
396 : }
397 : PetscFunctionReturn(PETSC_SUCCESS);
398 : }
399 : #endif
400 :
401 : /*
402 : * Matrix exponential implementation based on algorithm and matlab code by Stefan Guettel
403 : * and Yuji Nakatsukasa
404 : *
405 : * Stefan Guettel and Yuji Nakatsukasa, "Scaled and Squared Subdiagonal Pade
406 : * Approximation for the Matrix Exponential",
407 : * SIAM J. Matrix Anal. Appl. 37(1):145-170, 2016.
408 : * https://doi.org/10.1137/15M1027553
409 : */
410 48 : static PetscErrorCode FNEvaluateFunctionMat_Exp_GuettelNakatsukasa(FN fn,Mat A,Mat B)
411 : {
412 : #if !defined(PETSC_HAVE_COMPLEX)
413 : PetscFunctionBegin;
414 : SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This function requires C99 or C++ complex support");
415 : #else
416 48 : PetscInt i,j,n_,s,k,m,mod;
417 48 : PetscBLASInt n=0,n2=0,irsize=0,rsizediv2,ipsize=0,iremainsize=0,info,*piv,minlen,lwork=0,one=1;
418 48 : PetscReal nrm,shift=0.0;
419 : #if defined(PETSC_USE_COMPLEX)
420 : PetscReal *rwork=NULL;
421 : #endif
422 48 : PetscComplex *As,*RR,*RR2,*expmA,*expmA2,*Maux,*Maux2,rsize,*r,psize,*p,remainsize,*remainterm,*rootp,*rootq,mult=0.0,scale,cone=1.0,czero=0.0,*aux;
423 48 : PetscScalar *Ba,*Ba2,*sMaux,*wr,*wi,expshift,sone=1.0,szero=0.0,*saux;
424 48 : const PetscScalar *Aa;
425 48 : PetscBool isreal,flg;
426 48 : PetscBLASInt query=-1;
427 48 : PetscScalar work1,*work;
428 :
429 48 : PetscFunctionBegin;
430 48 : PetscCall(MatGetSize(A,&n_,NULL));
431 48 : PetscCall(PetscBLASIntCast(n_,&n));
432 48 : PetscCall(MatDenseGetArrayRead(A,&Aa));
433 48 : PetscCall(MatDenseGetArray(B,&Ba));
434 48 : Ba2 = Ba;
435 48 : PetscCall(PetscBLASIntCast(n*n,&n2));
436 :
437 48 : PetscCall(PetscMalloc2(n2,&sMaux,n2,&Maux));
438 48 : Maux2 = Maux;
439 48 : PetscCall(PetscOptionsGetReal(NULL,NULL,"-fn_expm_estimated_eig",&shift,&flg));
440 48 : if (!flg) {
441 48 : PetscCall(PetscMalloc2(n,&wr,n,&wi));
442 48 : PetscCall(PetscArraycpy(sMaux,Aa,n2));
443 : /* estimate rightmost eigenvalue and shift A with it */
444 : #if !defined(PETSC_USE_COMPLEX)
445 48 : PetscCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,sMaux,&n,wr,wi,NULL,&n,NULL,&n,&work1,&query,&info));
446 48 : SlepcCheckLapackInfo("geev",info);
447 48 : PetscCall(PetscBLASIntCast((PetscInt)work1,&lwork));
448 48 : PetscCall(PetscMalloc1(lwork,&work));
449 48 : PetscCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,sMaux,&n,wr,wi,NULL,&n,NULL,&n,work,&lwork,&info));
450 48 : PetscCall(PetscFree(work));
451 : #else
452 : PetscCall(PetscArraycpy(Maux,Aa,n2));
453 : PetscCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,Maux,&n,wr,NULL,&n,NULL,&n,&work1,&query,rwork,&info));
454 : SlepcCheckLapackInfo("geev",info);
455 : PetscCall(PetscBLASIntCast((PetscInt)PetscRealPart(work1),&lwork));
456 : PetscCall(PetscMalloc2(2*n,&rwork,lwork,&work));
457 : PetscCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,Maux,&n,wr,NULL,&n,NULL,&n,work,&lwork,rwork,&info));
458 : PetscCall(PetscFree2(rwork,work));
459 : #endif
460 48 : SlepcCheckLapackInfo("geev",info);
461 48 : PetscCall(PetscLogFlops(25.0*n*n*n+(n*n*n)/3.0+1.0*n*n*n));
462 :
463 48 : shift = PetscRealPart(wr[0]);
464 1360 : for (i=1;i<n;i++) {
465 1312 : if (PetscRealPart(wr[i]) > shift) shift = PetscRealPart(wr[i]);
466 : }
467 48 : PetscCall(PetscFree2(wr,wi));
468 : }
469 : /* shift so that largest real part is (about) 0 */
470 48 : PetscCall(PetscArraycpy(sMaux,Aa,n2));
471 48 : if (shift) {
472 1408 : for (i=0;i<n;i++) sMaux[i+i*n] -= shift;
473 48 : PetscCall(PetscLogFlops(1.0*n));
474 : }
475 : #if defined(PETSC_USE_COMPLEX)
476 : PetscCall(PetscArraycpy(Maux,Aa,n2));
477 : if (shift) {
478 : for (i=0;i<n;i++) Maux[i+i*n] -= shift;
479 : PetscCall(PetscLogFlops(1.0*n));
480 : }
481 : #endif
482 :
483 : /* estimate norm(A) and select the scaling factor */
484 48 : nrm = LAPACKlange_("O",&n,&n,sMaux,&n,NULL);
485 48 : PetscCall(PetscLogFlops(1.0*n*n));
486 48 : PetscCall(sexpm_params(nrm,&s,&k,&m));
487 48 : if (s==0 && k==1 && m==0) { /* exp(A) = I+A to eps! */
488 8 : if (shift) expshift = PetscExpReal(shift);
489 88 : for (i=0;i<n;i++) sMaux[i+i*n] += 1.0;
490 8 : if (shift) {
491 8 : PetscCallBLAS("BLASscal",BLASscal_(&n2,&expshift,sMaux,&one));
492 8 : PetscCall(PetscLogFlops(1.0*(n+n2)));
493 0 : } else PetscCall(PetscLogFlops(1.0*n));
494 8 : PetscCall(PetscArraycpy(Ba,sMaux,n2));
495 8 : PetscCall(PetscFree2(sMaux,Maux));
496 8 : PetscCall(MatDenseRestoreArrayRead(A,&Aa));
497 8 : PetscCall(MatDenseRestoreArray(B,&Ba));
498 8 : PetscFunctionReturn(PETSC_SUCCESS); /* quick return */
499 : }
500 :
501 40 : PetscCall(PetscMalloc4(n2,&expmA,n2,&As,n2,&RR,n,&piv));
502 40 : expmA2 = expmA; RR2 = RR;
503 : /* scale matrix */
504 : #if !defined(PETSC_USE_COMPLEX)
505 118440 : for (i=0;i<n2;i++) {
506 118400 : As[i] = sMaux[i];
507 : }
508 : #else
509 : PetscCall(PetscArraycpy(As,sMaux,n2));
510 : #endif
511 40 : scale = 1.0/PetscPowRealInt(2.0,s);
512 40 : PetscCallBLAS("BLASCOMPLEXscal",BLASCOMPLEXscal_(&n2,&scale,As,&one));
513 40 : PetscCall(SlepcLogFlopsComplex(1.0*n2));
514 :
515 : /* evaluate Pade approximant (partial fraction or product form) */
516 40 : if (fn->method==3 || !m) { /* partial fraction */
517 20 : PetscCall(getcoeffs(k,m,&rsize,&psize,&remainsize,PETSC_TRUE));
518 20 : PetscCall(PetscBLASIntCast((PetscInt)PetscRealPartComplex(rsize),&irsize));
519 20 : PetscCall(PetscBLASIntCast((PetscInt)PetscRealPartComplex(psize),&ipsize));
520 20 : PetscCall(PetscBLASIntCast((PetscInt)PetscRealPartComplex(remainsize),&iremainsize));
521 20 : PetscCall(PetscMalloc3(irsize,&r,ipsize,&p,iremainsize,&remainterm));
522 20 : PetscCall(getcoeffs(k,m,r,p,remainterm,PETSC_FALSE));
523 :
524 20 : PetscCall(PetscArrayzero(expmA,n2));
525 : #if !defined(PETSC_USE_COMPLEX)
526 20 : isreal = PETSC_TRUE;
527 : #else
528 : PetscCall(getisreal(n2,Maux,&isreal));
529 : #endif
530 20 : if (isreal) {
531 20 : rsizediv2 = irsize/2;
532 56 : for (i=0;i<rsizediv2;i++) { /* use partial fraction to get R(As) */
533 36 : PetscCall(PetscArraycpy(Maux,As,n2));
534 36 : PetscCall(PetscArrayzero(RR,n2));
535 1276 : for (j=0;j<n;j++) {
536 1240 : Maux[j+j*n] -= p[2*i];
537 1240 : RR[j+j*n] = r[2*i];
538 : }
539 36 : PetscCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,Maux,&n,piv,RR,&n,&info));
540 36 : SlepcCheckLapackInfo("gesv",info);
541 118036 : for (j=0;j<n2;j++) {
542 118000 : expmA[j] += RR[j] + PetscConj(RR[j]);
543 : }
544 : /* loop(n) + gesv + loop(n2) */
545 36 : PetscCall(SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+2.0*n2));
546 : }
547 :
548 20 : mod = ipsize % 2;
549 20 : if (mod) {
550 6 : PetscCall(PetscArraycpy(Maux,As,n2));
551 6 : PetscCall(PetscArrayzero(RR,n2));
552 66 : for (j=0;j<n;j++) {
553 60 : Maux[j+j*n] -= p[ipsize-1];
554 60 : RR[j+j*n] = r[irsize-1];
555 : }
556 6 : PetscCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,Maux,&n,piv,RR,&n,&info));
557 6 : SlepcCheckLapackInfo("gesv",info);
558 606 : for (j=0;j<n2;j++) {
559 600 : expmA[j] += RR[j];
560 : }
561 6 : PetscCall(SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+1.0*n2));
562 : }
563 : } else { /* complex */
564 : for (i=0;i<irsize;i++) { /* use partial fraction to get R(As) */
565 : PetscCall(PetscArraycpy(Maux,As,n2));
566 : PetscCall(PetscArrayzero(RR,n2));
567 : for (j=0;j<n;j++) {
568 : Maux[j+j*n] -= p[i];
569 : RR[j+j*n] = r[i];
570 : }
571 : PetscCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,Maux,&n,piv,RR,&n,&info));
572 : SlepcCheckLapackInfo("gesv",info);
573 : for (j=0;j<n2;j++) {
574 : expmA[j] += RR[j];
575 : }
576 : PetscCall(SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+1.0*n2));
577 : }
578 : }
579 56 : for (i=0;i<iremainsize;i++) {
580 36 : if (!i) {
581 18 : PetscCall(PetscArrayzero(RR,n2));
582 638 : for (j=0;j<n;j++) {
583 620 : RR[j+j*n] = remainterm[iremainsize-1];
584 : }
585 : } else {
586 18 : PetscCall(PetscArraycpy(RR,As,n2));
587 18 : for (j=1;j<i;j++) {
588 0 : PetscCallBLAS("BLASCOMPLEXgemm",BLASCOMPLEXgemm_("N","N",&n,&n,&n,&cone,RR,&n,RR,&n,&czero,Maux,&n));
589 0 : SlepcSwap(RR,Maux,aux);
590 0 : PetscCall(SlepcLogFlopsComplex(2.0*n*n*n));
591 : }
592 18 : PetscCallBLAS("BLASCOMPLEXscal",BLASCOMPLEXscal_(&n2,&remainterm[iremainsize-1-i],RR,&one));
593 18 : PetscCall(SlepcLogFlopsComplex(1.0*n2));
594 : }
595 118036 : for (j=0;j<n2;j++) {
596 118000 : expmA[j] += RR[j];
597 : }
598 36 : PetscCall(SlepcLogFlopsComplex(1.0*n2));
599 : }
600 20 : PetscCall(PetscFree3(r,p,remainterm));
601 : } else { /* product form, default */
602 20 : PetscCall(getcoeffsproduct(k,m,&rsize,&psize,&mult,PETSC_TRUE));
603 20 : PetscCall(PetscBLASIntCast((PetscInt)PetscRealPartComplex(rsize),&irsize));
604 20 : PetscCall(PetscBLASIntCast((PetscInt)PetscRealPartComplex(psize),&ipsize));
605 20 : PetscCall(PetscMalloc2(irsize,&rootp,ipsize,&rootq));
606 20 : PetscCall(getcoeffsproduct(k,m,rootp,rootq,&mult,PETSC_FALSE));
607 :
608 20 : PetscCall(PetscArrayzero(expmA,n2));
609 660 : for (i=0;i<n;i++) { /* initialize */
610 640 : expmA[i+i*n] = 1.0;
611 : }
612 20 : minlen = PetscMin(irsize,ipsize);
613 96 : for (i=0;i<minlen;i++) {
614 76 : PetscCall(PetscArraycpy(RR,As,n2));
615 2596 : for (j=0;j<n;j++) {
616 2520 : RR[j+j*n] -= rootp[i];
617 : }
618 76 : PetscCallBLAS("BLASCOMPLEXgemm",BLASCOMPLEXgemm_("N","N",&n,&n,&n,&cone,RR,&n,expmA,&n,&czero,Maux,&n));
619 76 : SlepcSwap(expmA,Maux,aux);
620 76 : PetscCall(PetscArraycpy(RR,As,n2));
621 2596 : for (j=0;j<n;j++) {
622 2520 : RR[j+j*n] -= rootq[i];
623 : }
624 76 : PetscCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,RR,&n,piv,expmA,&n,&info));
625 76 : SlepcCheckLapackInfo("gesv",info);
626 : /* loop(n) + gemm + loop(n) + gesv */
627 76 : PetscCall(SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n)+1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)));
628 : }
629 : /* extra numerator */
630 38 : for (i=minlen;i<irsize;i++) {
631 18 : PetscCall(PetscArraycpy(RR,As,n2));
632 638 : for (j=0;j<n;j++) {
633 620 : RR[j+j*n] -= rootp[i];
634 : }
635 18 : PetscCallBLAS("BLASCOMPLEXgemm",BLASCOMPLEXgemm_("N","N",&n,&n,&n,&cone,RR,&n,expmA,&n,&czero,Maux,&n));
636 18 : SlepcSwap(expmA,Maux,aux);
637 18 : PetscCall(SlepcLogFlopsComplex(1.0*n+2.0*n*n*n));
638 : }
639 : /* extra denominator */
640 22 : for (i=minlen;i<ipsize;i++) {
641 2 : PetscCall(PetscArraycpy(RR,As,n2));
642 22 : for (j=0;j<n;j++) RR[j+j*n] -= rootq[i];
643 2 : PetscCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,RR,&n,piv,expmA,&n,&info));
644 2 : SlepcCheckLapackInfo("gesv",info);
645 2 : PetscCall(SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)));
646 : }
647 20 : PetscCallBLAS("BLASCOMPLEXscal",BLASCOMPLEXscal_(&n2,&mult,expmA,&one));
648 20 : PetscCall(SlepcLogFlopsComplex(1.0*n2));
649 20 : PetscCall(PetscFree2(rootp,rootq));
650 : }
651 :
652 : #if !defined(PETSC_USE_COMPLEX)
653 118440 : for (i=0;i<n2;i++) {
654 118400 : Ba2[i] = PetscRealPartComplex(expmA[i]);
655 : }
656 : #else
657 : PetscCall(PetscArraycpy(Ba2,expmA,n2));
658 : #endif
659 :
660 : /* perform repeated squaring */
661 196 : for (i=0;i<s;i++) { /* final squaring */
662 156 : PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&sone,Ba2,&n,Ba2,&n,&szero,sMaux,&n));
663 156 : SlepcSwap(Ba2,sMaux,saux);
664 156 : PetscCall(PetscLogFlops(2.0*n*n*n));
665 : }
666 40 : if (Ba2!=Ba) {
667 4 : PetscCall(PetscArraycpy(Ba,Ba2,n2));
668 4 : sMaux = Ba2;
669 : }
670 40 : if (shift) {
671 40 : expshift = PetscExpReal(shift);
672 40 : PetscCallBLAS("BLASscal",BLASscal_(&n2,&expshift,Ba,&one));
673 40 : PetscCall(PetscLogFlops(1.0*n2));
674 : }
675 :
676 : /* restore pointers */
677 40 : Maux = Maux2; expmA = expmA2; RR = RR2;
678 40 : PetscCall(PetscFree2(sMaux,Maux));
679 40 : PetscCall(PetscFree4(expmA,As,RR,piv));
680 40 : PetscCall(MatDenseRestoreArrayRead(A,&Aa));
681 40 : PetscCall(MatDenseRestoreArray(B,&Ba));
682 40 : PetscFunctionReturn(PETSC_SUCCESS);
683 : #endif
684 : }
685 :
686 : #define SMALLN 100
687 :
688 : /*
689 : * Function needed to compute optimal parameters (required workspace is 3*n*n)
690 : */
691 433 : static PetscInt ell(PetscBLASInt n,PetscScalar *A,PetscReal coeff,PetscInt m,PetscScalar *work,PetscRandom rand)
692 : {
693 433 : PetscScalar *Ascaled=work;
694 433 : PetscReal nrm,alpha,beta,rwork[1];
695 433 : PetscInt t;
696 433 : PetscBLASInt i,j;
697 :
698 433 : PetscFunctionBegin;
699 433 : beta = PetscPowReal(coeff,1.0/(2*m+1));
700 11276 : for (i=0;i<n;i++)
701 653092 : for (j=0;j<n;j++)
702 642249 : Ascaled[i+j*n] = beta*PetscAbsScalar(A[i+j*n]);
703 433 : nrm = LAPACKlange_("O",&n,&n,A,&n,rwork);
704 433 : PetscCall(PetscLogFlops(2.0*n*n));
705 433 : PetscCall(SlepcNormAm(n,Ascaled,2*m+1,work+n*n,rand,&alpha));
706 433 : alpha /= nrm;
707 433 : t = PetscMax((PetscInt)PetscCeilReal(PetscLogReal(2.0*alpha/PETSC_MACHINE_EPSILON)/PetscLogReal(2.0)/(2*m)),0);
708 433 : PetscFunctionReturn(t);
709 : }
710 :
711 : /*
712 : * Compute scaling parameter (s) and order of Pade approximant (m) (required workspace is 4*n*n)
713 : */
714 433 : static PetscErrorCode expm_params(PetscInt n,PetscScalar **Apowers,PetscInt *s,PetscInt *m,PetscScalar *work)
715 : {
716 433 : PetscScalar sfactor,sone=1.0,szero=0.0,*A=Apowers[0],*Ascaled;
717 433 : PetscReal d4,d6,d8,d10,eta1,eta3,eta4,eta5,rwork[1];
718 433 : PetscBLASInt n_=0,n2,one=1;
719 433 : PetscRandom rand;
720 433 : const PetscReal coeff[5] = { 9.92063492063492e-06, 9.94131285136576e-11, /* backward error function */
721 : 2.22819456055356e-16, 1.69079293431187e-22, 8.82996160201868e-36 };
722 433 : const PetscReal theta[5] = { 1.495585217958292e-002, /* m = 3 */
723 : 2.539398330063230e-001, /* m = 5 */
724 : 9.504178996162932e-001, /* m = 7 */
725 : 2.097847961257068e+000, /* m = 9 */
726 : 5.371920351148152e+000 }; /* m = 13 */
727 :
728 433 : PetscFunctionBegin;
729 433 : *s = 0;
730 433 : *m = 13;
731 433 : PetscCall(PetscBLASIntCast(n,&n_));
732 433 : PetscCall(PetscRandomCreate(PETSC_COMM_SELF,&rand));
733 433 : d4 = PetscPowReal(LAPACKlange_("O",&n_,&n_,Apowers[2],&n_,rwork),1.0/4.0);
734 433 : if (d4==0.0) { /* safeguard for the case A = 0 */
735 0 : *m = 3;
736 0 : goto done;
737 : }
738 433 : d6 = PetscPowReal(LAPACKlange_("O",&n_,&n_,Apowers[3],&n_,rwork),1.0/6.0);
739 433 : PetscCall(PetscLogFlops(2.0*n*n));
740 433 : eta1 = PetscMax(d4,d6);
741 433 : if (eta1<=theta[0] && !ell(n_,A,coeff[0],3,work,rand)) {
742 83 : *m = 3;
743 83 : goto done;
744 : }
745 350 : if (eta1<=theta[1] && !ell(n_,A,coeff[1],5,work,rand)) {
746 110 : *m = 5;
747 110 : goto done;
748 : }
749 240 : if (n<SMALLN) {
750 226 : PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[2],&n_,Apowers[2],&n_,&szero,work,&n_));
751 226 : d8 = PetscPowReal(LAPACKlange_("O",&n_,&n_,work,&n_,rwork),1.0/8.0);
752 226 : PetscCall(PetscLogFlops(2.0*n*n*n+1.0*n*n));
753 : } else {
754 14 : PetscCall(SlepcNormAm(n_,Apowers[2],2,work,rand,&d8));
755 14 : d8 = PetscPowReal(d8,1.0/8.0);
756 : }
757 240 : eta3 = PetscMax(d6,d8);
758 240 : if (eta3<=theta[2] && !ell(n_,A,coeff[2],7,work,rand)) {
759 14 : *m = 7;
760 14 : goto done;
761 : }
762 226 : if (eta3<=theta[3] && !ell(n_,A,coeff[3],9,work,rand)) {
763 18 : *m = 9;
764 18 : goto done;
765 : }
766 208 : if (n<SMALLN) {
767 194 : PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[2],&n_,Apowers[3],&n_,&szero,work,&n_));
768 194 : d10 = PetscPowReal(LAPACKlange_("O",&n_,&n_,work,&n_,rwork),1.0/10.0);
769 194 : PetscCall(PetscLogFlops(2.0*n*n*n+1.0*n*n));
770 : } else {
771 14 : PetscCall(SlepcNormAm(n_,Apowers[1],5,work,rand,&d10));
772 14 : d10 = PetscPowReal(d10,1.0/10.0);
773 : }
774 208 : eta4 = PetscMax(d8,d10);
775 208 : eta5 = PetscMin(eta3,eta4);
776 208 : *s = PetscMax((PetscInt)PetscCeilReal(PetscLogReal(eta5/theta[4])/PetscLogReal(2.0)),0);
777 208 : if (*s) {
778 74 : Ascaled = work+3*n*n;
779 74 : n2 = n_*n_;
780 74 : PetscCallBLAS("BLAScopy",BLAScopy_(&n2,A,&one,Ascaled,&one));
781 74 : sfactor = PetscPowRealInt(2.0,-(*s));
782 74 : PetscCallBLAS("BLASscal",BLASscal_(&n2,&sfactor,Ascaled,&one));
783 74 : PetscCall(PetscLogFlops(1.0*n*n));
784 : } else Ascaled = A;
785 208 : *s += ell(n_,Ascaled,coeff[4],13,work,rand);
786 433 : done:
787 433 : PetscCall(PetscRandomDestroy(&rand));
788 433 : PetscFunctionReturn(PETSC_SUCCESS);
789 : }
790 :
791 : /*
792 : * Matrix exponential implementation based on algorithm and matlab code by N. Higham and co-authors
793 : *
794 : * N. J. Higham, "The scaling and squaring method for the matrix exponential
795 : * revisited", SIAM J. Matrix Anal. Appl. 26(4):1179-1193, 2005.
796 : */
797 433 : PetscErrorCode FNEvaluateFunctionMat_Exp_Higham(FN fn,Mat A,Mat B)
798 : {
799 433 : PetscBLASInt n_=0,n2,*ipiv,info,one=1;
800 433 : PetscInt n,m,j,s;
801 433 : PetscScalar scale,smone=-1.0,sone=1.0,stwo=2.0,szero=0.0;
802 433 : PetscScalar *Ba,*Apowers[5],*Q,*P,*W,*work,*aux;
803 433 : const PetscScalar *Aa,*c;
804 433 : const PetscScalar c3[4] = { 120, 60, 12, 1 };
805 433 : const PetscScalar c5[6] = { 30240, 15120, 3360, 420, 30, 1 };
806 433 : const PetscScalar c7[8] = { 17297280, 8648640, 1995840, 277200, 25200, 1512, 56, 1 };
807 433 : const PetscScalar c9[10] = { 17643225600.0, 8821612800.0, 2075673600, 302702400, 30270240,
808 : 2162160, 110880, 3960, 90, 1 };
809 433 : const PetscScalar c13[14] = { 64764752532480000.0, 32382376266240000.0, 7771770303897600.0,
810 : 1187353796428800.0, 129060195264000.0, 10559470521600.0,
811 : 670442572800.0, 33522128640.0, 1323241920.0,
812 : 40840800, 960960, 16380, 182, 1 };
813 :
814 433 : PetscFunctionBegin;
815 433 : PetscCall(MatDenseGetArrayRead(A,&Aa));
816 433 : PetscCall(MatDenseGetArray(B,&Ba));
817 433 : PetscCall(MatGetSize(A,&n,NULL));
818 433 : PetscCall(PetscBLASIntCast(n,&n_));
819 433 : n2 = n_*n_;
820 433 : PetscCall(PetscMalloc2(8*n*n,&work,n,&ipiv));
821 :
822 : /* Matrix powers */
823 433 : Apowers[0] = work; /* Apowers[0] = A */
824 433 : Apowers[1] = Apowers[0] + n*n; /* Apowers[1] = A^2 */
825 433 : Apowers[2] = Apowers[1] + n*n; /* Apowers[2] = A^4 */
826 433 : Apowers[3] = Apowers[2] + n*n; /* Apowers[3] = A^6 */
827 433 : Apowers[4] = Apowers[3] + n*n; /* Apowers[4] = A^8 */
828 :
829 433 : PetscCall(PetscArraycpy(Apowers[0],Aa,n2));
830 433 : PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[0],&n_,Apowers[0],&n_,&szero,Apowers[1],&n_));
831 433 : PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[1],&n_,Apowers[1],&n_,&szero,Apowers[2],&n_));
832 433 : PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[1],&n_,Apowers[2],&n_,&szero,Apowers[3],&n_));
833 433 : PetscCall(PetscLogFlops(6.0*n*n*n));
834 :
835 : /* Compute scaling parameter and order of Pade approximant */
836 433 : PetscCall(expm_params(n,Apowers,&s,&m,Apowers[4]));
837 :
838 433 : if (s) { /* rescale */
839 390 : for (j=0;j<4;j++) {
840 312 : scale = PetscPowRealInt(2.0,-PetscMax(2*j,1)*s);
841 312 : PetscCallBLAS("BLASscal",BLASscal_(&n2,&scale,Apowers[j],&one));
842 : }
843 78 : PetscCall(PetscLogFlops(4.0*n*n));
844 : }
845 :
846 : /* Evaluate the Pade approximant */
847 433 : switch (m) {
848 : case 3: c = c3; break;
849 110 : case 5: c = c5; break;
850 14 : case 7: c = c7; break;
851 18 : case 9: c = c9; break;
852 208 : case 13: c = c13; break;
853 0 : default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %" PetscInt_FMT,m);
854 : }
855 433 : P = Ba;
856 433 : Q = Apowers[4] + n*n;
857 433 : W = Q + n*n;
858 433 : switch (m) {
859 225 : case 3:
860 : case 5:
861 : case 7:
862 : case 9:
863 225 : if (m==9) PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[1],&n_,Apowers[3],&n_,&szero,Apowers[4],&n_));
864 225 : PetscCall(PetscArrayzero(P,n2));
865 225 : PetscCall(PetscArrayzero(Q,n2));
866 2553 : for (j=0;j<n;j++) {
867 2328 : P[j+j*n] = c[1];
868 2328 : Q[j+j*n] = c[0];
869 : }
870 642 : for (j=m;j>=3;j-=2) {
871 417 : PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[j],Apowers[(j+1)/2-1],&one,P,&one));
872 417 : PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[j-1],Apowers[(j+1)/2-1],&one,Q,&one));
873 417 : PetscCall(PetscLogFlops(4.0*n*n));
874 : }
875 225 : PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[0],&n_,P,&n_,&szero,W,&n_));
876 225 : PetscCall(PetscLogFlops(2.0*n*n*n));
877 433 : SlepcSwap(P,W,aux);
878 : break;
879 208 : case 13:
880 : /* P = A*(Apowers[3]*(c[13]*Apowers[3] + c[11]*Apowers[2] + c[9]*Apowers[1])
881 : + c[7]*Apowers[3] + c[5]*Apowers[2] + c[3]*Apowers[1] + c[1]*I) */
882 208 : PetscCallBLAS("BLAScopy",BLAScopy_(&n2,Apowers[3],&one,P,&one));
883 208 : PetscCallBLAS("BLASscal",BLASscal_(&n2,&c[13],P,&one));
884 208 : PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[11],Apowers[2],&one,P,&one));
885 208 : PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[9],Apowers[1],&one,P,&one));
886 208 : PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[3],&n_,P,&n_,&szero,W,&n_));
887 208 : PetscCall(PetscLogFlops(5.0*n*n+2.0*n*n*n));
888 208 : PetscCall(PetscArrayzero(P,n2));
889 8723 : for (j=0;j<n;j++) P[j+j*n] = c[1];
890 208 : PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[7],Apowers[3],&one,P,&one));
891 208 : PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[5],Apowers[2],&one,P,&one));
892 208 : PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[3],Apowers[1],&one,P,&one));
893 208 : PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&sone,P,&one,W,&one));
894 208 : PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[0],&n_,W,&n_,&szero,P,&n_));
895 208 : PetscCall(PetscLogFlops(7.0*n*n+2.0*n*n*n));
896 : /* Q = Apowers[3]*(c[12]*Apowers[3] + c[10]*Apowers[2] + c[8]*Apowers[1])
897 : + c[6]*Apowers[3] + c[4]*Apowers[2] + c[2]*Apowers[1] + c[0]*I */
898 208 : PetscCallBLAS("BLAScopy",BLAScopy_(&n2,Apowers[3],&one,Q,&one));
899 208 : PetscCallBLAS("BLASscal",BLASscal_(&n2,&c[12],Q,&one));
900 208 : PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[10],Apowers[2],&one,Q,&one));
901 208 : PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[8],Apowers[1],&one,Q,&one));
902 208 : PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[3],&n_,Q,&n_,&szero,W,&n_));
903 208 : PetscCall(PetscLogFlops(5.0*n*n+2.0*n*n*n));
904 208 : PetscCall(PetscArrayzero(Q,n2));
905 8723 : for (j=0;j<n;j++) Q[j+j*n] = c[0];
906 208 : PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[6],Apowers[3],&one,Q,&one));
907 208 : PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[4],Apowers[2],&one,Q,&one));
908 208 : PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[2],Apowers[1],&one,Q,&one));
909 208 : PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&sone,W,&one,Q,&one));
910 208 : PetscCall(PetscLogFlops(7.0*n*n));
911 : break;
912 0 : default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %" PetscInt_FMT,m);
913 : }
914 433 : PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&smone,P,&one,Q,&one));
915 433 : PetscCallBLAS("LAPACKgesv",LAPACKgesv_(&n_,&n_,Q,&n_,ipiv,P,&n_,&info));
916 433 : SlepcCheckLapackInfo("gesv",info);
917 433 : PetscCallBLAS("BLASscal",BLASscal_(&n2,&stwo,P,&one));
918 11276 : for (j=0;j<n;j++) P[j+j*n] += 1.0;
919 433 : PetscCall(PetscLogFlops(2.0*n*n*n/3.0+4.0*n*n));
920 :
921 : /* Squaring */
922 1093 : for (j=1;j<=s;j++) {
923 660 : PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,P,&n_,P,&n_,&szero,W,&n_));
924 660 : SlepcSwap(P,W,aux);
925 : }
926 433 : if (P!=Ba) PetscCall(PetscArraycpy(Ba,P,n2));
927 433 : PetscCall(PetscLogFlops(2.0*n*n*n*s));
928 :
929 433 : PetscCall(PetscFree2(work,ipiv));
930 433 : PetscCall(MatDenseRestoreArrayRead(A,&Aa));
931 433 : PetscCall(MatDenseRestoreArray(B,&Ba));
932 433 : PetscFunctionReturn(PETSC_SUCCESS);
933 : }
934 :
935 : #if defined(PETSC_HAVE_CUDA)
936 : #include "../src/sys/classes/fn/impls/cuda/fnutilcuda.h"
937 : #include <slepccupmblas.h>
938 :
939 : PetscErrorCode FNEvaluateFunctionMat_Exp_Pade_CUDA(FN fn,Mat A,Mat B)
940 : {
941 : PetscBLASInt n=0,ld,ld2,*d_ipiv,*d_info,info,one=1;
942 : PetscInt m,k,sexp;
943 : PetscBool odd;
944 : const PetscInt p=MAX_PADE;
945 : PetscReal c[MAX_PADE+1],s;
946 : PetscScalar scale,smone=-1.0,sone=1.0,stwo=2.0,szero=0.0;
947 : const PetscScalar *Aa;
948 : PetscScalar *d_Ba,*d_As,*d_A2,*d_Q,*d_P,*d_W,*aux,**ppP,**d_ppP,**ppQ,**d_ppQ;
949 : cublasHandle_t cublasv2handle;
950 :
951 : PetscFunctionBegin;
952 : PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* For CUDA event timers */
953 : PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
954 : PetscCall(MatGetSize(A,&m,NULL));
955 : PetscCall(PetscBLASIntCast(m,&n));
956 : ld = n;
957 : ld2 = ld*ld;
958 : if (A==B) {
959 : PetscCallCUDA(cudaMalloc((void **)&d_As,sizeof(PetscScalar)*m*m));
960 : PetscCall(MatDenseCUDAGetArrayRead(A,&Aa));
961 : PetscCallCUDA(cudaMemcpy(d_As,Aa,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToDevice));
962 : PetscCall(MatDenseCUDARestoreArrayRead(A,&Aa));
963 : } else PetscCall(MatDenseCUDAGetArrayRead(A,(const PetscScalar**)&d_As));
964 : PetscCall(MatDenseCUDAGetArrayWrite(B,&d_Ba));
965 :
966 : PetscCallCUDA(cudaMalloc((void **)&d_Q,sizeof(PetscScalar)*m*m));
967 : PetscCallCUDA(cudaMalloc((void **)&d_W,sizeof(PetscScalar)*m*m));
968 : PetscCallCUDA(cudaMalloc((void **)&d_A2,sizeof(PetscScalar)*m*m));
969 : PetscCallCUDA(cudaMalloc((void **)&d_ipiv,sizeof(PetscBLASInt)*ld));
970 : PetscCallCUDA(cudaMalloc((void **)&d_info,sizeof(PetscBLASInt)));
971 : PetscCallCUDA(cudaMalloc((void **)&d_ppP,sizeof(PetscScalar*)));
972 : PetscCallCUDA(cudaMalloc((void **)&d_ppQ,sizeof(PetscScalar*)));
973 :
974 : PetscCall(PetscMalloc1(1,&ppP));
975 : PetscCall(PetscMalloc1(1,&ppQ));
976 :
977 : d_P = d_Ba;
978 : PetscCall(PetscLogGpuTimeBegin());
979 :
980 : /* Pade' coefficients */
981 : c[0] = 1.0;
982 : for (k=1;k<=p;k++) c[k] = c[k-1]*(p+1-k)/(k*(2*p+1-k));
983 :
984 : /* Scaling */
985 : PetscCallCUBLAS(cublasXnrm2(cublasv2handle,ld2,d_As,one,&s));
986 : if (s>0.5) {
987 : sexp = PetscMax(0,(int)(PetscLogReal(s)/PetscLogReal(2.0))+2);
988 : scale = PetscPowRealInt(2.0,-sexp);
989 : PetscCallCUBLAS(cublasXscal(cublasv2handle,ld2,&scale,d_As,one));
990 : PetscCall(PetscLogGpuFlops(1.0*n*n));
991 : } else sexp = 0;
992 :
993 : /* Horner evaluation */
994 : PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_As,ld,d_As,ld,&szero,d_A2,ld));
995 : PetscCall(PetscLogGpuFlops(2.0*n*n*n));
996 : PetscCallCUDA(cudaMemset(d_Q,0,sizeof(PetscScalar)*ld2));
997 : PetscCallCUDA(cudaMemset(d_P,0,sizeof(PetscScalar)*ld2));
998 : PetscCall(set_diagonal(n,d_Q,ld,c[p]));
999 : PetscCall(set_diagonal(n,d_P,ld,c[p-1]));
1000 :
1001 : odd = PETSC_TRUE;
1002 : for (k=p-1;k>0;k--) {
1003 : if (odd) {
1004 : PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_Q,ld,d_A2,ld,&szero,d_W,ld));
1005 : SlepcSwap(d_Q,d_W,aux);
1006 : PetscCall(shift_diagonal(n,d_Q,ld,c[k-1]));
1007 : odd = PETSC_FALSE;
1008 : } else {
1009 : PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_A2,ld,&szero,d_W,ld));
1010 : SlepcSwap(d_P,d_W,aux);
1011 : PetscCall(shift_diagonal(n,d_P,ld,c[k-1]));
1012 : odd = PETSC_TRUE;
1013 : }
1014 : PetscCall(PetscLogGpuFlops(2.0*n*n*n));
1015 : }
1016 : if (odd) {
1017 : PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_Q,ld,d_As,ld,&szero,d_W,ld));
1018 : SlepcSwap(d_Q,d_W,aux);
1019 : PetscCallCUBLAS(cublasXaxpy(cublasv2handle,ld2,&smone,d_P,one,d_Q,one));
1020 :
1021 : ppQ[0] = d_Q;
1022 : ppP[0] = d_P;
1023 : PetscCallCUDA(cudaMemcpy(d_ppQ,ppQ,sizeof(PetscScalar*),cudaMemcpyHostToDevice));
1024 : PetscCallCUDA(cudaMemcpy(d_ppP,ppP,sizeof(PetscScalar*),cudaMemcpyHostToDevice));
1025 :
1026 : PetscCallCUBLAS(cublasXgetrfBatched(cublasv2handle,n,d_ppQ,ld,d_ipiv,d_info,one));
1027 : PetscCallCUDA(cudaMemcpy(&info,d_info,sizeof(PetscBLASInt),cudaMemcpyDeviceToHost));
1028 : PetscCheck(info>=0,PETSC_COMM_SELF,PETSC_ERR_LIB,"LAPACKgetrf: Illegal value on argument %" PetscBLASInt_FMT,PetscAbsInt(info));
1029 : PetscCheck(info<=0,PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"LAPACKgetrf: Matrix is singular. U(%" PetscBLASInt_FMT ",%" PetscBLASInt_FMT ") is zero",info,info);
1030 : PetscCallCUBLAS(cublasXgetrsBatched(cublasv2handle,CUBLAS_OP_N,n,n,(const PetscScalar **)d_ppQ,ld,d_ipiv,d_ppP,ld,&info,one));
1031 : PetscCheck(info>=0,PETSC_COMM_SELF,PETSC_ERR_LIB,"LAPACKgetri: Illegal value on argument %" PetscBLASInt_FMT,PetscAbsInt(info));
1032 : PetscCheck(info<=0,PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"LAPACKgetri: Matrix is singular. U(%" PetscBLASInt_FMT ",%" PetscBLASInt_FMT ") is zero",info,info);
1033 : PetscCallCUBLAS(cublasXscal(cublasv2handle,ld2,&stwo,d_P,one));
1034 : PetscCall(shift_diagonal(n,d_P,ld,sone));
1035 : PetscCallCUBLAS(cublasXscal(cublasv2handle,ld2,&smone,d_P,one));
1036 : } else {
1037 : PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_As,ld,&szero,d_W,ld));
1038 : SlepcSwap(d_P,d_W,aux);
1039 : PetscCallCUBLAS(cublasXaxpy(cublasv2handle,ld2,&smone,d_P,one,d_Q,one));
1040 :
1041 : ppQ[0] = d_Q;
1042 : ppP[0] = d_P;
1043 : PetscCallCUDA(cudaMemcpy(d_ppQ,ppQ,sizeof(PetscScalar*),cudaMemcpyHostToDevice));
1044 : PetscCallCUDA(cudaMemcpy(d_ppP,ppP,sizeof(PetscScalar*),cudaMemcpyHostToDevice));
1045 :
1046 : PetscCallCUBLAS(cublasXgetrfBatched(cublasv2handle,n,d_ppQ,ld,d_ipiv,d_info,one));
1047 : PetscCallCUDA(cudaMemcpy(&info,d_info,sizeof(PetscBLASInt),cudaMemcpyDeviceToHost));
1048 : PetscCheck(info>=0,PETSC_COMM_SELF,PETSC_ERR_LIB,"LAPACKgetrf: Illegal value on argument %" PetscBLASInt_FMT,PetscAbsInt(info));
1049 : PetscCheck(info<=0,PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"LAPACKgetrf: Matrix is singular. U(%" PetscBLASInt_FMT ",%" PetscBLASInt_FMT ") is zero",info,info);
1050 : PetscCallCUBLAS(cublasXgetrsBatched(cublasv2handle,CUBLAS_OP_N,n,n,(const PetscScalar **)d_ppQ,ld,d_ipiv,d_ppP,ld,&info,one));
1051 : PetscCheck(info>=0,PETSC_COMM_SELF,PETSC_ERR_LIB,"LAPACKgetri: Illegal value on argument %" PetscBLASInt_FMT,PetscAbsInt(info));
1052 : PetscCheck(info<=0,PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"LAPACKgetri: Matrix is singular. U(%" PetscBLASInt_FMT ",%" PetscBLASInt_FMT ") is zero",info,info);
1053 : PetscCallCUBLAS(cublasXscal(cublasv2handle,ld2,&stwo,d_P,one));
1054 : PetscCall(shift_diagonal(n,d_P,ld,sone));
1055 : }
1056 : PetscCall(PetscLogGpuFlops(2.0*n*n*n+2.0*n*n*n/3.0+4.0*n*n));
1057 :
1058 : for (k=1;k<=sexp;k++) {
1059 : PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_P,ld,&szero,d_W,ld));
1060 : PetscCallCUDA(cudaMemcpy(d_P,d_W,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToDevice));
1061 : }
1062 : PetscCall(PetscLogGpuFlops(2.0*n*n*n*sexp));
1063 :
1064 : PetscCall(PetscLogGpuTimeEnd());
1065 : PetscCallCUDA(cudaFree(d_Q));
1066 : PetscCallCUDA(cudaFree(d_W));
1067 : PetscCallCUDA(cudaFree(d_A2));
1068 : PetscCallCUDA(cudaFree(d_ipiv));
1069 : PetscCallCUDA(cudaFree(d_info));
1070 : PetscCallCUDA(cudaFree(d_ppP));
1071 : PetscCallCUDA(cudaFree(d_ppQ));
1072 :
1073 : PetscCall(PetscFree(ppP));
1074 : PetscCall(PetscFree(ppQ));
1075 :
1076 : if (d_P!=d_Ba) PetscCallCUDA(cudaMemcpy(d_Ba,d_P,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToDevice));
1077 : if (A!=B) {
1078 : if (s>0.5) { /* undo scaling */
1079 : scale = 1.0/scale;
1080 : PetscCallCUBLAS(cublasXscal(cublasv2handle,ld2,&scale,d_As,one));
1081 : }
1082 : PetscCall(MatDenseCUDARestoreArrayRead(A,(const PetscScalar**)&d_As));
1083 : } else PetscCallCUDA(cudaFree(d_As));
1084 : PetscCall(MatDenseCUDARestoreArrayWrite(B,&d_Ba));
1085 : PetscFunctionReturn(PETSC_SUCCESS);
1086 : }
1087 :
1088 : #if defined(PETSC_HAVE_MAGMA)
1089 : #include <slepcmagma.h>
1090 :
1091 : PetscErrorCode FNEvaluateFunctionMat_Exp_Pade_CUDAm(FN fn,Mat A,Mat B)
1092 : {
1093 : PetscBLASInt n=0,ld,ld2,*piv,one=1;
1094 : PetscInt m,k,sexp;
1095 : PetscBool odd;
1096 : const PetscInt p=MAX_PADE;
1097 : PetscReal c[MAX_PADE+1],s;
1098 : PetscScalar scale,smone=-1.0,sone=1.0,stwo=2.0,szero=0.0;
1099 : const PetscScalar *Aa;
1100 : PetscScalar *d_Ba,*d_As,*d_A2,*d_Q,*d_P,*d_W,*aux;
1101 : cublasHandle_t cublasv2handle;
1102 :
1103 : PetscFunctionBegin;
1104 : PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* For CUDA event timers */
1105 : PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
1106 : PetscCall(SlepcMagmaInit());
1107 : PetscCall(MatGetSize(A,&m,NULL));
1108 : PetscCall(PetscBLASIntCast(m,&n));
1109 : ld = n;
1110 : ld2 = ld*ld;
1111 : if (A==B) {
1112 : PetscCallCUDA(cudaMalloc((void **)&d_As,sizeof(PetscScalar)*m*m));
1113 : PetscCall(MatDenseCUDAGetArrayRead(A,&Aa));
1114 : PetscCallCUDA(cudaMemcpy(d_As,Aa,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToDevice));
1115 : PetscCall(MatDenseCUDARestoreArrayRead(A,&Aa));
1116 : } else PetscCall(MatDenseCUDAGetArrayRead(A,(const PetscScalar**)&d_As));
1117 : PetscCall(MatDenseCUDAGetArrayWrite(B,&d_Ba));
1118 :
1119 : PetscCallCUDA(cudaMalloc((void **)&d_Q,sizeof(PetscScalar)*m*m));
1120 : PetscCallCUDA(cudaMalloc((void **)&d_W,sizeof(PetscScalar)*m*m));
1121 : PetscCallCUDA(cudaMalloc((void **)&d_A2,sizeof(PetscScalar)*m*m));
1122 :
1123 : PetscCall(PetscMalloc1(n,&piv));
1124 :
1125 : d_P = d_Ba;
1126 : PetscCall(PetscLogGpuTimeBegin());
1127 :
1128 : /* Pade' coefficients */
1129 : c[0] = 1.0;
1130 : for (k=1;k<=p;k++) c[k] = c[k-1]*(p+1-k)/(k*(2*p+1-k));
1131 :
1132 : /* Scaling */
1133 : PetscCallCUBLAS(cublasXnrm2(cublasv2handle,ld2,d_As,one,&s));
1134 : PetscCall(PetscLogGpuFlops(1.0*n*n));
1135 :
1136 : if (s>0.5) {
1137 : sexp = PetscMax(0,(int)(PetscLogReal(s)/PetscLogReal(2.0))+2);
1138 : scale = PetscPowRealInt(2.0,-sexp);
1139 : PetscCallCUBLAS(cublasXscal(cublasv2handle,ld2,&scale,d_As,one));
1140 : PetscCall(PetscLogGpuFlops(1.0*n*n));
1141 : } else sexp = 0;
1142 :
1143 : /* Horner evaluation */
1144 : PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_As,ld,d_As,ld,&szero,d_A2,ld));
1145 : PetscCall(PetscLogGpuFlops(2.0*n*n*n));
1146 : PetscCallCUDA(cudaMemset(d_Q,0,sizeof(PetscScalar)*ld2));
1147 : PetscCallCUDA(cudaMemset(d_P,0,sizeof(PetscScalar)*ld2));
1148 : PetscCall(set_diagonal(n,d_Q,ld,c[p]));
1149 : PetscCall(set_diagonal(n,d_P,ld,c[p-1]));
1150 :
1151 : odd = PETSC_TRUE;
1152 : for (k=p-1;k>0;k--) {
1153 : if (odd) {
1154 : PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_Q,ld,d_A2,ld,&szero,d_W,ld));
1155 : SlepcSwap(d_Q,d_W,aux);
1156 : PetscCall(shift_diagonal(n,d_Q,ld,c[k-1]));
1157 : odd = PETSC_FALSE;
1158 : } else {
1159 : PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_A2,ld,&szero,d_W,ld));
1160 : SlepcSwap(d_P,d_W,aux);
1161 : PetscCall(shift_diagonal(n,d_P,ld,c[k-1]));
1162 : odd = PETSC_TRUE;
1163 : }
1164 : PetscCall(PetscLogGpuFlops(2.0*n*n*n));
1165 : }
1166 : if (odd) {
1167 : PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_Q,ld,d_As,ld,&szero,d_W,ld));
1168 : SlepcSwap(d_Q,d_W,aux);
1169 : PetscCallCUBLAS(cublasXaxpy(cublasv2handle,ld2,&smone,d_P,one,d_Q,one));
1170 : PetscCallMAGMA(magma_xgesv_gpu,n,n,d_Q,ld,piv,d_P,ld);
1171 : PetscCallCUBLAS(cublasXscal(cublasv2handle,ld2,&stwo,d_P,one));
1172 : PetscCall(shift_diagonal(n,d_P,ld,sone));
1173 : PetscCallCUBLAS(cublasXscal(cublasv2handle,ld2,&smone,d_P,one));
1174 : } else {
1175 : PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_As,ld,&szero,d_W,ld));
1176 : SlepcSwap(d_P,d_W,aux);
1177 : PetscCallCUBLAS(cublasXaxpy(cublasv2handle,ld2,&smone,d_P,one,d_Q,one));
1178 : PetscCallMAGMA(magma_xgesv_gpu,n,n,d_Q,ld,piv,d_P,ld);
1179 : PetscCallCUBLAS(cublasXscal(cublasv2handle,ld2,&stwo,d_P,one));
1180 : PetscCall(shift_diagonal(n,d_P,ld,sone));
1181 : }
1182 : PetscCall(PetscLogGpuFlops(2.0*n*n*n+2.0*n*n*n/3.0+4.0*n*n));
1183 :
1184 : for (k=1;k<=sexp;k++) {
1185 : PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_P,ld,&szero,d_W,ld));
1186 : PetscCallCUDA(cudaMemcpy(d_P,d_W,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToDevice));
1187 : }
1188 : PetscCall(PetscLogGpuFlops(2.0*n*n*n*sexp));
1189 :
1190 : PetscCall(PetscLogGpuTimeEnd());
1191 : PetscCallCUDA(cudaFree(d_Q));
1192 : PetscCallCUDA(cudaFree(d_W));
1193 : PetscCallCUDA(cudaFree(d_A2));
1194 : PetscCall(PetscFree(piv));
1195 :
1196 : if (d_P!=d_Ba) PetscCallCUDA(cudaMemcpy(d_Ba,d_P,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToDevice));
1197 : if (A!=B) {
1198 : if (s>0.5) { /* undo scaling */
1199 : scale = 1.0/scale;
1200 : PetscCallCUBLAS(cublasXscal(cublasv2handle,ld2,&scale,d_As,one));
1201 : }
1202 : PetscCall(MatDenseCUDARestoreArrayRead(A,(const PetscScalar**)&d_As));
1203 : } else PetscCallCUDA(cudaFree(d_As));
1204 : PetscCall(MatDenseCUDARestoreArrayWrite(B,&d_Ba));
1205 : PetscFunctionReturn(PETSC_SUCCESS);
1206 : }
1207 :
1208 : /*
1209 : * Matrix exponential implementation based on algorithm and matlab code by N. Higham and co-authors
1210 : *
1211 : * N. J. Higham, "The scaling and squaring method for the matrix exponential
1212 : * revisited", SIAM J. Matrix Anal. Appl. 26(4):1179-1193, 2005.
1213 : */
1214 : PetscErrorCode FNEvaluateFunctionMat_Exp_Higham_CUDAm(FN fn,Mat A,Mat B)
1215 : {
1216 : PetscBLASInt n_=0,n2,*ipiv,one=1;
1217 : PetscInt n,m,j,s;
1218 : PetscScalar scale,smone=-1.0,sone=1.0,stwo=2.0,szero=0.0;
1219 : PetscScalar *d_Ba,*Apowers[5],*d_Apowers[5],*d_Q,*d_P,*d_W,*work,*d_work,*aux;
1220 : const PetscScalar *Aa,*c;
1221 : const PetscScalar c3[4] = { 120, 60, 12, 1 };
1222 : const PetscScalar c5[6] = { 30240, 15120, 3360, 420, 30, 1 };
1223 : const PetscScalar c7[8] = { 17297280, 8648640, 1995840, 277200, 25200, 1512, 56, 1 };
1224 : const PetscScalar c9[10] = { 17643225600, 8821612800, 2075673600, 302702400, 30270240,
1225 : 2162160, 110880, 3960, 90, 1 };
1226 : const PetscScalar c13[14] = { 64764752532480000, 32382376266240000, 7771770303897600,
1227 : 1187353796428800, 129060195264000, 10559470521600,
1228 : 670442572800, 33522128640, 1323241920,
1229 : 40840800, 960960, 16380, 182, 1 };
1230 : cublasHandle_t cublasv2handle;
1231 :
1232 : PetscFunctionBegin;
1233 : PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* For CUDA event timers */
1234 : PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
1235 : PetscCall(SlepcMagmaInit());
1236 : PetscCall(MatGetSize(A,&n,NULL));
1237 : PetscCall(PetscBLASIntCast(n,&n_));
1238 : n2 = n_*n_;
1239 : PetscCall(PetscMalloc2(8*n*n,&work,n,&ipiv));
1240 : /* Matrix powers */
1241 : Apowers[0] = work; /* Apowers[0] = A */
1242 : Apowers[1] = Apowers[0] + n*n; /* Apowers[1] = A^2 */
1243 : Apowers[2] = Apowers[1] + n*n; /* Apowers[2] = A^4 */
1244 : Apowers[3] = Apowers[2] + n*n; /* Apowers[3] = A^6 */
1245 : Apowers[4] = Apowers[3] + n*n; /* Apowers[4] = A^8 */
1246 : if (A==B) {
1247 : PetscCallCUDA(cudaMalloc((void**)&d_work,7*n*n*sizeof(PetscScalar)));
1248 : d_Apowers[0] = d_work; /* d_Apowers[0] = A */
1249 : d_Apowers[1] = d_Apowers[0] + n*n; /* d_Apowers[1] = A^2 */
1250 : PetscCall(MatDenseCUDAGetArrayRead(A,&Aa));
1251 : PetscCallCUDA(cudaMemcpy(d_Apowers[0],Aa,n2*sizeof(PetscScalar),cudaMemcpyDeviceToDevice));
1252 : PetscCall(MatDenseCUDARestoreArrayRead(A,&Aa));
1253 : } else {
1254 : PetscCallCUDA(cudaMalloc((void**)&d_work,6*n*n*sizeof(PetscScalar)));
1255 : PetscCall(MatDenseCUDAGetArrayRead(A,(const PetscScalar**)&d_Apowers[0]));
1256 : d_Apowers[1] = d_work; /* d_Apowers[1] = A^2 */
1257 : }
1258 : PetscCall(MatDenseCUDAGetArrayWrite(B,&d_Ba));
1259 : d_Apowers[2] = d_Apowers[1] + n*n; /* d_Apowers[2] = A^4 */
1260 : d_Apowers[3] = d_Apowers[2] + n*n; /* d_Apowers[3] = A^6 */
1261 : d_Apowers[4] = d_Apowers[3] + n*n; /* d_Apowers[4] = A^8 */
1262 : d_Q = d_Apowers[4] + n*n;
1263 : d_W = d_Q + n*n;
1264 :
1265 : PetscCall(PetscLogGpuTimeBegin());
1266 :
1267 : PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[0],n_,d_Apowers[0],n_,&szero,d_Apowers[1],n_));
1268 : PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[1],n_,d_Apowers[1],n_,&szero,d_Apowers[2],n_));
1269 : PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[1],n_,d_Apowers[2],n_,&szero,d_Apowers[3],n_));
1270 : PetscCall(PetscLogGpuFlops(6.0*n*n*n));
1271 :
1272 : PetscCallCUDA(cudaMemcpy(Apowers[0],d_Apowers[0],n2*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
1273 : PetscCallCUDA(cudaMemcpy(Apowers[1],d_Apowers[1],3*n2*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
1274 : PetscCall(PetscLogGpuToCpu(4*n2*sizeof(PetscScalar)));
1275 : /* Compute scaling parameter and order of Pade approximant */
1276 : PetscCall(expm_params(n,Apowers,&s,&m,Apowers[4]));
1277 :
1278 : if (s) { /* rescale */
1279 : for (j=0;j<4;j++) {
1280 : scale = PetscPowRealInt(2.0,-PetscMax(2*j,1)*s);
1281 : PetscCallCUBLAS(cublasXscal(cublasv2handle,n2,&scale,d_Apowers[j],one));
1282 : }
1283 : PetscCall(PetscLogGpuFlops(4.0*n*n));
1284 : }
1285 :
1286 : /* Evaluate the Pade approximant */
1287 : switch (m) {
1288 : case 3: c = c3; break;
1289 : case 5: c = c5; break;
1290 : case 7: c = c7; break;
1291 : case 9: c = c9; break;
1292 : case 13: c = c13; break;
1293 : default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %" PetscInt_FMT,m);
1294 : }
1295 : d_P = d_Ba;
1296 : switch (m) {
1297 : case 3:
1298 : case 5:
1299 : case 7:
1300 : case 9:
1301 : if (m==9) PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[1],n_,d_Apowers[3],n_,&szero,d_Apowers[4],n_));
1302 : PetscCallCUDA(cudaMemset(d_P,0,sizeof(PetscScalar)*n2));
1303 : PetscCallCUDA(cudaMemset(d_Q,0,sizeof(PetscScalar)*n2));
1304 : PetscCall(set_diagonal(n,d_P,n,c[1]));
1305 : PetscCall(set_diagonal(n,d_Q,n,c[0]));
1306 : for (j=m;j>=3;j-=2) {
1307 : PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[j],d_Apowers[(j+1)/2-1],one,d_P,one));
1308 : PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[j-1],d_Apowers[(j+1)/2-1],one,d_Q,one));
1309 : PetscCall(PetscLogGpuFlops(4.0*n*n));
1310 : }
1311 : PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[0],n_,d_P,n_,&szero,d_W,n_));
1312 : PetscCall(PetscLogGpuFlops(2.0*n*n*n));
1313 : SlepcSwap(d_P,d_W,aux);
1314 : break;
1315 : case 13:
1316 : /* P = A*(Apowers[3]*(c[13]*Apowers[3] + c[11]*Apowers[2] + c[9]*Apowers[1])
1317 : + c[7]*Apowers[3] + c[5]*Apowers[2] + c[3]*Apowers[1] + c[1]*I) */
1318 : PetscCallCUDA(cudaMemcpy(d_P,d_Apowers[3],n2*sizeof(PetscScalar),cudaMemcpyDeviceToDevice));
1319 : PetscCallCUBLAS(cublasXscal(cublasv2handle,n2,&c[13],d_P,one));
1320 : PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[11],d_Apowers[2],one,d_P,one));
1321 : PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[9],d_Apowers[1],one,d_P,one));
1322 : PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[3],n_,d_P,n_,&szero,d_W,n_));
1323 : PetscCall(PetscLogGpuFlops(5.0*n*n+2.0*n*n*n));
1324 :
1325 : PetscCallCUDA(cudaMemset(d_P,0,sizeof(PetscScalar)*n2));
1326 : PetscCall(set_diagonal(n,d_P,n,c[1]));
1327 : PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[7],d_Apowers[3],one,d_P,one));
1328 : PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[5],d_Apowers[2],one,d_P,one));
1329 : PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[3],d_Apowers[1],one,d_P,one));
1330 : PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&sone,d_P,one,d_W,one));
1331 : PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[0],n_,d_W,n_,&szero,d_P,n_));
1332 : PetscCall(PetscLogGpuFlops(7.0*n*n+2.0*n*n*n));
1333 : /* Q = Apowers[3]*(c[12]*Apowers[3] + c[10]*Apowers[2] + c[8]*Apowers[1])
1334 : + c[6]*Apowers[3] + c[4]*Apowers[2] + c[2]*Apowers[1] + c[0]*I */
1335 : PetscCallCUDA(cudaMemcpy(d_Q,d_Apowers[3],n2*sizeof(PetscScalar),cudaMemcpyDeviceToDevice));
1336 : PetscCallCUBLAS(cublasXscal(cublasv2handle,n2,&c[12],d_Q,one));
1337 : PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[10],d_Apowers[2],one,d_Q,one));
1338 : PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[8],d_Apowers[1],one,d_Q,one));
1339 : PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[3],n_,d_Q,n_,&szero,d_W,n_));
1340 : PetscCall(PetscLogGpuFlops(5.0*n*n+2.0*n*n*n));
1341 : PetscCallCUDA(cudaMemset(d_Q,0,sizeof(PetscScalar)*n2));
1342 : PetscCall(set_diagonal(n,d_Q,n,c[0]));
1343 : PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[6],d_Apowers[3],one,d_Q,one));
1344 : PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[4],d_Apowers[2],one,d_Q,one));
1345 : PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[2],d_Apowers[1],one,d_Q,one));
1346 : PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&sone,d_W,one,d_Q,one));
1347 : PetscCall(PetscLogGpuFlops(7.0*n*n));
1348 : break;
1349 : default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %" PetscInt_FMT,m);
1350 : }
1351 : PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&smone,d_P,one,d_Q,one));
1352 :
1353 : PetscCallMAGMA(magma_xgesv_gpu,n_,n_,d_Q,n_,ipiv,d_P,n_);
1354 :
1355 : PetscCallCUBLAS(cublasXscal(cublasv2handle,n2,&stwo,d_P,one));
1356 : PetscCall(shift_diagonal(n,d_P,n,sone));
1357 : PetscCall(PetscLogGpuFlops(2.0*n*n*n/3.0+4.0*n*n));
1358 :
1359 : /* Squaring */
1360 : for (j=1;j<=s;j++) {
1361 : PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_P,n_,d_P,n_,&szero,d_W,n_));
1362 : SlepcSwap(d_P,d_W,aux);
1363 : }
1364 : PetscCall(PetscLogGpuFlops(2.0*n*n*n*s));
1365 : PetscCall(PetscLogGpuTimeEnd());
1366 :
1367 : PetscCall(PetscFree2(work,ipiv));
1368 : if (d_P!=d_Ba) PetscCallCUDA(cudaMemcpy(d_Ba,d_P,n2*sizeof(PetscScalar),cudaMemcpyDeviceToDevice));
1369 : if (A!=B) {
1370 : if (s>0.5) { /* undo scaling */
1371 : scale = 1.0/PetscPowRealInt(2.0,-s);
1372 : PetscCallCUBLAS(cublasXscal(cublasv2handle,n2,&scale,d_Apowers[0],one));
1373 : }
1374 : PetscCall(MatDenseCUDARestoreArrayRead(A,(const PetscScalar**)&d_Apowers[0]));
1375 : }
1376 : PetscCall(MatDenseCUDARestoreArrayWrite(B,&d_Ba));
1377 : PetscCallCUDA(cudaFree(d_work));
1378 : PetscFunctionReturn(PETSC_SUCCESS);
1379 : }
1380 :
1381 : /*
1382 : * Matrix exponential implementation based on algorithm and matlab code by Stefan Guettel
1383 : * and Yuji Nakatsukasa
1384 : *
1385 : * Stefan Guettel and Yuji Nakatsukasa, "Scaled and Squared Subdiagonal Pade'
1386 : * Approximation for the Matrix Exponential",
1387 : * SIAM J. Matrix Anal. Appl. 37(1):145-170, 2016.
1388 : * https://doi.org/10.1137/15M1027553
1389 : */
1390 : PetscErrorCode FNEvaluateFunctionMat_Exp_GuettelNakatsukasa_CUDAm(FN fn,Mat A,Mat B)
1391 : {
1392 : PetscInt i,j,n_,s,k,m,mod;
1393 : PetscBLASInt n=0,n2=0,irsize=0,rsizediv2,ipsize=0,iremainsize=0,query=-1,*piv,minlen,lwork=0,one=1;
1394 : PetscReal nrm,shift=0.0,rone=1.0,rzero=0.0;
1395 : #if defined(PETSC_USE_COMPLEX)
1396 : PetscReal *rwork=NULL;
1397 : #endif
1398 : PetscComplex *d_As,*d_RR,*d_RR2,*d_expmA,*d_expmA2,*d_Maux,*d_Maux2,rsize,*r,psize,*p,remainsize,*remainterm,*rootp,*rootq,mult=0.0,scale,cone=1.0,czero=0.0,*aux;
1399 : PetscScalar *d_Aa,*d_Ba,*d_Ba2,*Maux,*d_sMaux,*wr,*wi,expshift,sone=1.0,szero=0.0,*work,work1,*saux;
1400 : const PetscScalar *Aa;
1401 : PetscBool isreal,*d_isreal,flg;
1402 : cublasHandle_t cublasv2handle;
1403 :
1404 : PetscFunctionBegin;
1405 : PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* For CUDA event timers */
1406 : PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
1407 : PetscCall(SlepcMagmaInit());
1408 : PetscCall(MatGetSize(A,&n_,NULL));
1409 : PetscCall(PetscBLASIntCast(n_,&n));
1410 : PetscCall(PetscBLASIntCast(n*n,&n2));
1411 :
1412 : if (A==B) {
1413 : PetscCallCUDA(cudaMalloc((void **)&d_Aa,sizeof(PetscScalar)*n2));
1414 : PetscCall(MatDenseCUDAGetArrayRead(A,&Aa));
1415 : PetscCallCUDA(cudaMemcpy(d_Aa,Aa,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice));
1416 : PetscCall(MatDenseCUDARestoreArrayRead(A,&Aa));
1417 : } else PetscCall(MatDenseCUDAGetArrayRead(A,(const PetscScalar**)&d_Aa));
1418 : PetscCall(MatDenseCUDAGetArrayWrite(B,&d_Ba));
1419 : d_Ba2 = d_Ba;
1420 :
1421 : PetscCallCUDA(cudaMalloc((void **)&d_isreal,sizeof(PetscBool)));
1422 : PetscCallCUDA(cudaMalloc((void **)&d_sMaux,sizeof(PetscScalar)*n2));
1423 : PetscCallCUDA(cudaMalloc((void **)&d_Maux,sizeof(PetscComplex)*n2));
1424 :
1425 : PetscCall(PetscLogGpuTimeBegin());
1426 : d_Maux2 = d_Maux;
1427 : PetscCall(PetscOptionsGetReal(NULL,NULL,"-fn_expm_estimated_eig",&shift,&flg));
1428 : if (!flg) {
1429 : PetscCall(PetscMalloc2(n,&wr,n,&wi));
1430 : /* estimate rightmost eigenvalue and shift A with it */
1431 : PetscCall(PetscMalloc1(n2,&Maux));
1432 : PetscCall(MatDenseGetArrayRead(A,&Aa));
1433 : PetscCall(PetscArraycpy(Maux,Aa,n2));
1434 : PetscCall(MatDenseRestoreArrayRead(A,&Aa));
1435 : #if !defined(PETSC_USE_COMPLEX)
1436 : PetscCallMAGMA(magma_xgeev,MagmaNoVec,MagmaNoVec,n,Maux,n,wr,wi,NULL,n,NULL,n,&work1,query);
1437 : PetscCall(PetscBLASIntCast((PetscInt)PetscRealPart(work1),&lwork));
1438 : PetscCall(PetscMalloc1(lwork,&work));
1439 : PetscCallMAGMA(magma_xgeev,MagmaNoVec,MagmaNoVec,n,Maux,n,wr,wi,NULL,n,NULL,n,work,lwork);
1440 : PetscCall(PetscFree(work));
1441 : #else
1442 : PetscCallMAGMA(magma_xgeev,MagmaNoVec,MagmaNoVec,n,Maux,n,wr,NULL,n,NULL,n,&work1,query,rwork);
1443 : PetscCall(PetscBLASIntCast((PetscInt)PetscRealPart(work1),&lwork));
1444 : PetscCall(PetscMalloc2(2*n,&rwork,lwork,&work));
1445 : PetscCallMAGMA(magma_xgeev,MagmaNoVec,MagmaNoVec,n,Maux,n,wr,NULL,n,NULL,n,work,lwork,rwork);
1446 : PetscCall(PetscFree2(rwork,work));
1447 : #endif
1448 : PetscCall(PetscFree(Maux));
1449 : PetscCall(PetscLogGpuFlops(25.0*n*n*n+(n*n*n)/3.0+1.0*n*n*n));
1450 :
1451 : shift = PetscRealPart(wr[0]);
1452 : for (i=1;i<n;i++) {
1453 : if (PetscRealPart(wr[i]) > shift) shift = PetscRealPart(wr[i]);
1454 : }
1455 : PetscCall(PetscFree2(wr,wi));
1456 : }
1457 : /* shift so that largest real part is (about) 0 */
1458 : PetscCallCUDA(cudaMemcpy(d_sMaux,d_Aa,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice));
1459 : if (shift) {
1460 : PetscCall(shift_diagonal(n,d_sMaux,n,-shift));
1461 : PetscCall(PetscLogGpuFlops(1.0*n));
1462 : }
1463 : #if defined(PETSC_USE_COMPLEX)
1464 : PetscCallCUDA(cudaMemcpy(d_Maux,d_Aa,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice));
1465 : if (shift) {
1466 : PetscCall(shift_diagonal(n,d_Maux,n,-shift));
1467 : PetscCall(PetscLogGpuFlops(1.0*n));
1468 : }
1469 : #endif
1470 : if (A!=B) PetscCall(MatDenseCUDARestoreArrayRead(A,(const PetscScalar**)&d_Aa));
1471 : else PetscCallCUDA(cudaFree(d_Aa));
1472 :
1473 : /* estimate norm(A) and select the scaling factor */
1474 : PetscCallCUBLAS(cublasXnrm2(cublasv2handle,n2,d_sMaux,one,&nrm));
1475 : PetscCall(PetscLogGpuFlops(2.0*n*n));
1476 : PetscCall(sexpm_params(nrm,&s,&k,&m));
1477 : if (s==0 && k==1 && m==0) { /* exp(A) = I+A to eps! */
1478 : if (shift) expshift = PetscExpReal(shift);
1479 : PetscCall(shift_Cdiagonal(n,d_Maux,n,rone,rzero));
1480 : if (shift) {
1481 : PetscCallCUBLAS(cublasXscal(cublasv2handle,n2,&expshift,d_sMaux,one));
1482 : PetscCall(PetscLogGpuFlops(1.0*(n+n2)));
1483 : } else PetscCall(PetscLogGpuFlops(1.0*n));
1484 : PetscCallCUDA(cudaMemcpy(d_Ba,d_sMaux,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice));
1485 : PetscCallCUDA(cudaFree(d_isreal));
1486 : PetscCallCUDA(cudaFree(d_sMaux));
1487 : PetscCallCUDA(cudaFree(d_Maux));
1488 : PetscCall(MatDenseCUDARestoreArrayWrite(B,&d_Ba));
1489 : PetscFunctionReturn(PETSC_SUCCESS); /* quick return */
1490 : }
1491 :
1492 : PetscCallCUDA(cudaMalloc((void **)&d_expmA,sizeof(PetscComplex)*n2));
1493 : PetscCallCUDA(cudaMalloc((void **)&d_As,sizeof(PetscComplex)*n2));
1494 : PetscCallCUDA(cudaMalloc((void **)&d_RR,sizeof(PetscComplex)*n2));
1495 : d_expmA2 = d_expmA; d_RR2 = d_RR;
1496 : PetscCall(PetscMalloc1(n,&piv));
1497 : /* scale matrix */
1498 : #if !defined(PETSC_USE_COMPLEX)
1499 : PetscCall(copy_array2D_S2C(n,n,d_As,n,d_sMaux,n));
1500 : #else
1501 : PetscCallCUDA(cudaMemcpy(d_As,d_sMaux,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice));
1502 : #endif
1503 : scale = 1.0/PetscPowRealInt(2.0,s);
1504 : PetscCallCUBLAS(cublasXCscal(cublasv2handle,n2,(const cuComplex *)&scale,(cuComplex *)d_As,one));
1505 : PetscCall(SlepcLogGpuFlopsComplex(1.0*n2));
1506 :
1507 : /* evaluate Pade approximant (partial fraction or product form) */
1508 : if (fn->method==3 || !m) { /* partial fraction */
1509 : PetscCall(getcoeffs(k,m,&rsize,&psize,&remainsize,PETSC_TRUE));
1510 : PetscCall(PetscBLASIntCast((PetscInt)PetscRealPartComplex(rsize),&irsize));
1511 : PetscCall(PetscBLASIntCast((PetscInt)PetscRealPartComplex(psize),&ipsize));
1512 : PetscCall(PetscBLASIntCast((PetscInt)PetscRealPartComplex(remainsize),&iremainsize));
1513 : PetscCall(PetscMalloc3(irsize,&r,ipsize,&p,iremainsize,&remainterm));
1514 : PetscCall(getcoeffs(k,m,r,p,remainterm,PETSC_FALSE));
1515 :
1516 : PetscCallCUDA(cudaMemset(d_expmA,0,sizeof(PetscComplex)*n2));
1517 : #if !defined(PETSC_USE_COMPLEX)
1518 : isreal = PETSC_TRUE;
1519 : #else
1520 : PetscCall(getisreal_array2D(n,n,d_Maux,n,d_isreal));
1521 : PetscCallCUDA(cudaMemcpy(&isreal,d_isreal,sizeof(PetscBool),cudaMemcpyDeviceToHost));
1522 : #endif
1523 : if (isreal) {
1524 : rsizediv2 = irsize/2;
1525 : for (i=0;i<rsizediv2;i++) { /* use partial fraction to get R(As) */
1526 : PetscCallCUDA(cudaMemcpy(d_Maux,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice));
1527 : PetscCallCUDA(cudaMemset(d_RR,0,sizeof(PetscComplex)*n2));
1528 : PetscCall(shift_Cdiagonal(n,d_Maux,n,-PetscRealPartComplex(p[2*i]),-PetscImaginaryPartComplex(p[2*i])));
1529 : PetscCall(set_Cdiagonal(n,d_RR,n,PetscRealPartComplex(r[2*i]),PetscImaginaryPartComplex(r[2*i])));
1530 : PetscCallMAGMA(magma_Cgesv_gpu,n,n,d_Maux,n,piv,d_RR,n);
1531 : PetscCall(add_array2D_Conj(n,n,d_RR,n));
1532 : PetscCallCUBLAS(cublasXCaxpy(cublasv2handle,n2,&cone,d_RR,one,d_expmA,one));
1533 : /* shift(n) + gesv + axpy(n2) */
1534 : PetscCall(SlepcLogGpuFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+2.0*n2));
1535 : }
1536 :
1537 : mod = ipsize % 2;
1538 : if (mod) {
1539 : PetscCallCUDA(cudaMemcpy(d_Maux,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice));
1540 : PetscCallCUDA(cudaMemset(d_RR,0,sizeof(PetscComplex)*n2));
1541 : PetscCall(shift_Cdiagonal(n,d_Maux,n,-PetscRealPartComplex(p[ipsize-1]),-PetscImaginaryPartComplex(p[ipsize-1])));
1542 : PetscCall(set_Cdiagonal(n,d_RR,n,PetscRealPartComplex(r[irsize-1]),PetscImaginaryPartComplex(r[irsize-1])));
1543 : PetscCallMAGMA(magma_Cgesv_gpu,n,n,d_Maux,n,piv,d_RR,n);
1544 : PetscCallCUBLAS(cublasXCaxpy(cublasv2handle,n2,&cone,d_RR,one,d_expmA,one));
1545 : PetscCall(SlepcLogGpuFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+1.0*n2));
1546 : }
1547 : } else { /* complex */
1548 : for (i=0;i<irsize;i++) { /* use partial fraction to get R(As) */
1549 : PetscCallCUDA(cudaMemcpy(d_Maux,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice));
1550 : PetscCallCUDA(cudaMemset(d_RR,0,sizeof(PetscComplex)*n2));
1551 : PetscCall(shift_Cdiagonal(n,d_Maux,n,-PetscRealPartComplex(p[i]),-PetscImaginaryPartComplex(p[i])));
1552 : PetscCall(set_Cdiagonal(n,d_RR,n,PetscRealPartComplex(r[i]),PetscImaginaryPartComplex(r[i])));
1553 : PetscCallMAGMA(magma_Cgesv_gpu,n,n,d_Maux,n,piv,d_RR,n);
1554 : PetscCallCUBLAS(cublasXCaxpy(cublasv2handle,n2,&cone,d_RR,one,d_expmA,one));
1555 : PetscCall(SlepcLogGpuFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+1.0*n2));
1556 : }
1557 : }
1558 : for (i=0;i<iremainsize;i++) {
1559 : if (!i) {
1560 : PetscCallCUDA(cudaMemset(d_RR,0,sizeof(PetscComplex)*n2));
1561 : PetscCall(set_Cdiagonal(n,d_RR,n,PetscRealPartComplex(remainterm[iremainsize-1]),PetscImaginaryPartComplex(remainterm[iremainsize-1])));
1562 : } else {
1563 : PetscCallCUDA(cudaMemcpy(d_RR,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice));
1564 : for (j=1;j<i;j++) {
1565 : PetscCallCUBLAS(cublasXCgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&cone,d_RR,n,d_RR,n,&czero,d_Maux,n));
1566 : SlepcSwap(d_RR,d_Maux,aux);
1567 : PetscCall(SlepcLogGpuFlopsComplex(2.0*n*n*n));
1568 : }
1569 : PetscCallCUBLAS(cublasXCscal(cublasv2handle,n2,&remainterm[iremainsize-1-i],d_RR,one));
1570 : PetscCall(SlepcLogGpuFlopsComplex(1.0*n2));
1571 : }
1572 : PetscCallCUBLAS(cublasXCaxpy(cublasv2handle,n2,&cone,d_RR,one,d_expmA,one));
1573 : PetscCall(SlepcLogGpuFlopsComplex(1.0*n2));
1574 : }
1575 : PetscCall(PetscFree3(r,p,remainterm));
1576 : } else { /* product form, default */
1577 : PetscCall(getcoeffsproduct(k,m,&rsize,&psize,&mult,PETSC_TRUE));
1578 : PetscCall(PetscBLASIntCast((PetscInt)PetscRealPartComplex(rsize),&irsize));
1579 : PetscCall(PetscBLASIntCast((PetscInt)PetscRealPartComplex(psize),&ipsize));
1580 : PetscCall(PetscMalloc2(irsize,&rootp,ipsize,&rootq));
1581 : PetscCall(getcoeffsproduct(k,m,rootp,rootq,&mult,PETSC_FALSE));
1582 :
1583 : PetscCallCUDA(cudaMemset(d_expmA,0,sizeof(PetscComplex)*n2));
1584 : PetscCall(set_Cdiagonal(n,d_expmA,n,rone,rzero)); /* initialize */
1585 : minlen = PetscMin(irsize,ipsize);
1586 : for (i=0;i<minlen;i++) {
1587 : PetscCallCUDA(cudaMemcpy(d_RR,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice));
1588 : PetscCall(shift_Cdiagonal(n,d_RR,n,-PetscRealPartComplex(rootp[i]),-PetscImaginaryPartComplex(rootp[i])));
1589 : PetscCallCUBLAS(cublasXCgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&cone,d_RR,n,d_expmA,n,&czero,d_Maux,n));
1590 : SlepcSwap(d_expmA,d_Maux,aux);
1591 : PetscCallCUDA(cudaMemcpy(d_RR,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice));
1592 : PetscCall(shift_Cdiagonal(n,d_RR,n,-PetscRealPartComplex(rootq[i]),-PetscImaginaryPartComplex(rootq[i])));
1593 : PetscCallMAGMA(magma_Cgesv_gpu,n,n,d_RR,n,piv,d_expmA,n);
1594 : /* shift(n) + gemm + shift(n) + gesv */
1595 : PetscCall(SlepcLogGpuFlopsComplex(1.0*n+(2.0*n*n*n)+1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)));
1596 : }
1597 : /* extra enumerator */
1598 : for (i=minlen;i<irsize;i++) {
1599 : PetscCallCUDA(cudaMemcpy(d_RR,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice));
1600 : PetscCall(shift_Cdiagonal(n,d_RR,n,-PetscRealPartComplex(rootp[i]),-PetscImaginaryPartComplex(rootp[i])));
1601 : PetscCallCUBLAS(cublasXCgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&cone,d_RR,n,d_expmA,n,&czero,d_Maux,n));
1602 : SlepcSwap(d_expmA,d_Maux,aux);
1603 : PetscCall(SlepcLogGpuFlopsComplex(1.0*n+2.0*n*n*n));
1604 : }
1605 : /* extra denominator */
1606 : for (i=minlen;i<ipsize;i++) {
1607 : PetscCallCUDA(cudaMemcpy(d_RR,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice));
1608 : PetscCall(shift_Cdiagonal(n,d_RR,n,-PetscRealPartComplex(rootq[i]),-PetscImaginaryPartComplex(rootq[i])));
1609 : PetscCallMAGMA(magma_Cgesv_gpu,n,n,d_RR,n,piv,d_expmA,n);
1610 : PetscCall(SlepcLogGpuFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)));
1611 : }
1612 : PetscCallCUBLAS(cublasXCscal(cublasv2handle,n2,&mult,d_expmA,one));
1613 : PetscCall(SlepcLogGpuFlopsComplex(1.0*n2));
1614 : PetscCall(PetscFree2(rootp,rootq));
1615 : }
1616 :
1617 : #if !defined(PETSC_USE_COMPLEX)
1618 : PetscCall(copy_array2D_C2S(n,n,d_Ba2,n,d_expmA,n));
1619 : #else
1620 : PetscCallCUDA(cudaMemcpy(d_Ba2,d_expmA,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice));
1621 : #endif
1622 :
1623 : /* perform repeated squaring */
1624 : for (i=0;i<s;i++) { /* final squaring */
1625 : PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_Ba2,n,d_Ba2,n,&szero,d_sMaux,n));
1626 : SlepcSwap(d_Ba2,d_sMaux,saux);
1627 : PetscCall(PetscLogGpuFlops(2.0*n*n*n));
1628 : }
1629 : if (d_Ba2!=d_Ba) {
1630 : PetscCallCUDA(cudaMemcpy(d_Ba,d_Ba2,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice));
1631 : d_sMaux = d_Ba2;
1632 : }
1633 : if (shift) {
1634 : expshift = PetscExpReal(shift);
1635 : PetscCallCUBLAS(cublasXscal(cublasv2handle,n2,&expshift,d_Ba,one));
1636 : PetscCall(PetscLogGpuFlops(1.0*n2));
1637 : }
1638 :
1639 : PetscCall(PetscLogGpuTimeEnd());
1640 :
1641 : /* restore pointers */
1642 : d_Maux = d_Maux2; d_expmA = d_expmA2; d_RR = d_RR2;
1643 : PetscCall(MatDenseCUDARestoreArrayWrite(B,&d_Ba));
1644 : PetscCallCUDA(cudaFree(d_isreal));
1645 : PetscCallCUDA(cudaFree(d_sMaux));
1646 : PetscCallCUDA(cudaFree(d_Maux));
1647 : PetscCallCUDA(cudaFree(d_expmA));
1648 : PetscCallCUDA(cudaFree(d_As));
1649 : PetscCallCUDA(cudaFree(d_RR));
1650 : PetscCall(PetscFree(piv));
1651 : PetscFunctionReturn(PETSC_SUCCESS);
1652 : }
1653 : #endif /* PETSC_HAVE_MAGMA */
1654 : #endif /* PETSC_HAVE_CUDA */
1655 :
1656 29 : static PetscErrorCode FNView_Exp(FN fn,PetscViewer viewer)
1657 : {
1658 29 : PetscBool isascii;
1659 29 : char str[50];
1660 29 : const char *methodname[] = {
1661 : "scaling & squaring, [m/m] Pade approximant (Higham)",
1662 : "scaling & squaring, [6/6] Pade approximant",
1663 : "scaling & squaring, subdiagonal Pade approximant (product form)",
1664 : "scaling & squaring, subdiagonal Pade approximant (partial fraction)"
1665 : };
1666 29 : const int nmeth=PETSC_STATIC_ARRAY_LENGTH(methodname);
1667 :
1668 29 : PetscFunctionBegin;
1669 29 : PetscCall(PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&isascii));
1670 29 : if (isascii) {
1671 29 : if (fn->beta==(PetscScalar)1.0) {
1672 24 : if (fn->alpha==(PetscScalar)1.0) PetscCall(PetscViewerASCIIPrintf(viewer," exponential: exp(x)\n"));
1673 : else {
1674 10 : PetscCall(SlepcSNPrintfScalar(str,sizeof(str),fn->alpha,PETSC_TRUE));
1675 10 : PetscCall(PetscViewerASCIIPrintf(viewer," exponential: exp(%s*x)\n",str));
1676 : }
1677 : } else {
1678 5 : PetscCall(SlepcSNPrintfScalar(str,sizeof(str),fn->beta,PETSC_TRUE));
1679 5 : if (fn->alpha==(PetscScalar)1.0) PetscCall(PetscViewerASCIIPrintf(viewer," exponential: %s*exp(x)\n",str));
1680 : else {
1681 5 : PetscCall(PetscViewerASCIIPrintf(viewer," exponential: %s",str));
1682 5 : PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE));
1683 5 : PetscCall(SlepcSNPrintfScalar(str,sizeof(str),fn->alpha,PETSC_TRUE));
1684 5 : PetscCall(PetscViewerASCIIPrintf(viewer,"*exp(%s*x)\n",str));
1685 5 : PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE));
1686 : }
1687 : }
1688 29 : if (fn->method<nmeth) PetscCall(PetscViewerASCIIPrintf(viewer," computing matrix functions with: %s\n",methodname[fn->method]));
1689 : }
1690 29 : PetscFunctionReturn(PETSC_SUCCESS);
1691 : }
1692 :
1693 93 : SLEPC_EXTERN PetscErrorCode FNCreate_Exp(FN fn)
1694 : {
1695 93 : PetscFunctionBegin;
1696 93 : fn->ops->evaluatefunction = FNEvaluateFunction_Exp;
1697 93 : fn->ops->evaluatederivative = FNEvaluateDerivative_Exp;
1698 93 : fn->ops->evaluatefunctionmat[0] = FNEvaluateFunctionMat_Exp_Higham;
1699 93 : fn->ops->evaluatefunctionmat[1] = FNEvaluateFunctionMat_Exp_Pade;
1700 93 : fn->ops->evaluatefunctionmat[2] = FNEvaluateFunctionMat_Exp_GuettelNakatsukasa; /* product form */
1701 93 : fn->ops->evaluatefunctionmat[3] = FNEvaluateFunctionMat_Exp_GuettelNakatsukasa; /* partial fraction */
1702 : #if defined(PETSC_HAVE_CUDA)
1703 : fn->ops->evaluatefunctionmatcuda[1] = FNEvaluateFunctionMat_Exp_Pade_CUDA;
1704 : #if defined(PETSC_HAVE_MAGMA)
1705 : fn->ops->evaluatefunctionmatcuda[0] = FNEvaluateFunctionMat_Exp_Higham_CUDAm;
1706 : fn->ops->evaluatefunctionmatcuda[1] = FNEvaluateFunctionMat_Exp_Pade_CUDAm;
1707 : fn->ops->evaluatefunctionmatcuda[2] = FNEvaluateFunctionMat_Exp_GuettelNakatsukasa_CUDAm; /* product form */
1708 : fn->ops->evaluatefunctionmatcuda[3] = FNEvaluateFunctionMat_Exp_GuettelNakatsukasa_CUDAm; /* partial fraction */
1709 : #endif
1710 : #endif
1711 93 : fn->ops->view = FNView_Exp;
1712 93 : PetscFunctionReturn(PETSC_SUCCESS);
1713 : }
|