Actual source code: fnexp.c
slepc-3.21.1 2024-04-26
1: /*
2: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
3: SLEPc - Scalable Library for Eigenvalue Problem Computations
4: Copyright (c) 2002-, Universitat Politecnica de Valencia, Spain
6: This file is part of SLEPc.
7: SLEPc is distributed under a 2-clause BSD license (see LICENSE).
8: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
9: */
10: /*
11: Exponential function exp(x)
12: */
14: #include <slepc/private/fnimpl.h>
15: #include <slepcblaslapack.h>
17: static PetscErrorCode FNEvaluateFunction_Exp(FN fn,PetscScalar x,PetscScalar *y)
18: {
19: PetscFunctionBegin;
20: *y = PetscExpScalar(x);
21: PetscFunctionReturn(PETSC_SUCCESS);
22: }
24: static PetscErrorCode FNEvaluateDerivative_Exp(FN fn,PetscScalar x,PetscScalar *y)
25: {
26: PetscFunctionBegin;
27: *y = PetscExpScalar(x);
28: PetscFunctionReturn(PETSC_SUCCESS);
29: }
31: #define MAX_PADE 6
32: #define SWAP(a,b,t) do {t=a;a=b;b=t;} while (0)
34: static PetscErrorCode FNEvaluateFunctionMat_Exp_Pade(FN fn,Mat A,Mat B)
35: {
36: PetscBLASInt n=0,ld,ld2,*ipiv,info,inc=1;
37: PetscInt m,j,k,sexp;
38: PetscBool odd;
39: const PetscInt p=MAX_PADE;
40: PetscReal c[MAX_PADE+1],s,*rwork;
41: PetscScalar scale,mone=-1.0,one=1.0,two=2.0,zero=0.0;
42: PetscScalar *Ba,*As,*A2,*Q,*P,*W,*aux;
43: const PetscScalar *Aa;
45: PetscFunctionBegin;
46: PetscCall(MatDenseGetArrayRead(A,&Aa));
47: PetscCall(MatDenseGetArray(B,&Ba));
48: PetscCall(MatGetSize(A,&m,NULL));
49: PetscCall(PetscBLASIntCast(m,&n));
50: ld = n;
51: ld2 = ld*ld;
52: P = Ba;
53: PetscCall(PetscMalloc6(m*m,&Q,m*m,&W,m*m,&As,m*m,&A2,ld,&rwork,ld,&ipiv));
54: PetscCall(PetscArraycpy(As,Aa,ld2));
56: /* Pade' coefficients */
57: c[0] = 1.0;
58: for (k=1;k<=p;k++) c[k] = c[k-1]*(p+1-k)/(k*(2*p+1-k));
60: /* Scaling */
61: s = LAPACKlange_("I",&n,&n,As,&ld,rwork);
62: PetscCall(PetscLogFlops(1.0*n*n));
63: if (s>0.5) {
64: sexp = PetscMax(0,(int)(PetscLogReal(s)/PetscLogReal(2.0))+2);
65: scale = PetscPowRealInt(2.0,-sexp);
66: PetscCallBLAS("BLASscal",BLASscal_(&ld2,&scale,As,&inc));
67: PetscCall(PetscLogFlops(1.0*n*n));
68: } else sexp = 0;
70: /* Horner evaluation */
71: PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,As,&ld,As,&ld,&zero,A2,&ld));
72: PetscCall(PetscLogFlops(2.0*n*n*n));
73: PetscCall(PetscArrayzero(Q,ld2));
74: PetscCall(PetscArrayzero(P,ld2));
75: for (j=0;j<n;j++) {
76: Q[j+j*ld] = c[p];
77: P[j+j*ld] = c[p-1];
78: }
80: odd = PETSC_TRUE;
81: for (k=p-1;k>0;k--) {
82: if (odd) {
83: PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,Q,&ld,A2,&ld,&zero,W,&ld));
84: SWAP(Q,W,aux);
85: for (j=0;j<n;j++) Q[j+j*ld] += c[k-1];
86: odd = PETSC_FALSE;
87: } else {
88: PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,A2,&ld,&zero,W,&ld));
89: SWAP(P,W,aux);
90: for (j=0;j<n;j++) P[j+j*ld] += c[k-1];
91: odd = PETSC_TRUE;
92: }
93: PetscCall(PetscLogFlops(2.0*n*n*n));
94: }
95: /*if (odd) {
96: PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,Q,&ld,As,&ld,&zero,W,&ld));
97: SWAP(Q,W,aux);
98: PetscCallBLAS("BLASaxpy",BLASaxpy_(&ld2,&mone,P,&inc,Q,&inc));
99: PetscCallBLAS("LAPACKgesv",LAPACKgesv_(&n,&n,Q,&ld,ipiv,P,&ld,&info));
100: SlepcCheckLapackInfo("gesv",info);
101: PetscCallBLAS("BLASscal",BLASscal_(&ld2,&two,P,&inc));
102: for (j=0;j<n;j++) P[j+j*ld] += 1.0;
103: PetscCallBLAS("BLASscal",BLASscal_(&ld2,&mone,P,&inc));
104: } else {*/
105: PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,As,&ld,&zero,W,&ld));
106: SWAP(P,W,aux);
107: PetscCallBLAS("BLASaxpy",BLASaxpy_(&ld2,&mone,P,&inc,Q,&inc));
108: PetscCallBLAS("LAPACKgesv",LAPACKgesv_(&n,&n,Q,&ld,ipiv,P,&ld,&info));
109: SlepcCheckLapackInfo("gesv",info);
110: PetscCallBLAS("BLASscal",BLASscal_(&ld2,&two,P,&inc));
111: for (j=0;j<n;j++) P[j+j*ld] += 1.0;
112: /*}*/
113: PetscCall(PetscLogFlops(2.0*n*n*n+2.0*n*n*n/3.0+4.0*n*n));
115: for (k=1;k<=sexp;k++) {
116: PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,P,&ld,&zero,W,&ld));
117: PetscCall(PetscArraycpy(P,W,ld2));
118: }
119: if (P!=Ba) PetscCall(PetscArraycpy(Ba,P,ld2));
120: PetscCall(PetscLogFlops(2.0*n*n*n*sexp));
122: PetscCall(PetscFree6(Q,W,As,A2,rwork,ipiv));
123: PetscCall(MatDenseRestoreArrayRead(A,&Aa));
124: PetscCall(MatDenseRestoreArray(B,&Ba));
125: PetscFunctionReturn(PETSC_SUCCESS);
126: }
128: #if defined(PETSC_HAVE_COMPLEX)
129: /*
130: * Set scaling factor (s) and Pade degree (k,m)
131: */
132: static PetscErrorCode sexpm_params(PetscReal nrm,PetscInt *s,PetscInt *k,PetscInt *m)
133: {
134: PetscFunctionBegin;
135: if (nrm>1) {
136: if (nrm<200) {*s = 4; *k = 5; *m = *k-1;}
137: else if (nrm<1e4) {*s = 4; *k = 4; *m = *k+1;}
138: else if (nrm<1e6) {*s = 4; *k = 3; *m = *k+1;}
139: else if (nrm<1e9) {*s = 3; *k = 3; *m = *k+1;}
140: else if (nrm<1e11) {*s = 2; *k = 3; *m = *k+1;}
141: else if (nrm<1e12) {*s = 2; *k = 2; *m = *k+1;}
142: else if (nrm<1e14) {*s = 2; *k = 1; *m = *k+1;}
143: else {*s = 1; *k = 1; *m = *k+1;}
144: } else { /* nrm<1 */
145: if (nrm>0.5) {*s = 4; *k = 4; *m = *k-1;}
146: else if (nrm>0.3) {*s = 3; *k = 4; *m = *k-1;}
147: else if (nrm>0.15) {*s = 2; *k = 4; *m = *k-1;}
148: else if (nrm>0.07) {*s = 1; *k = 4; *m = *k-1;}
149: else if (nrm>0.01) {*s = 0; *k = 4; *m = *k-1;}
150: else if (nrm>3e-4) {*s = 0; *k = 3; *m = *k-1;}
151: else if (nrm>1e-5) {*s = 0; *k = 3; *m = 0;}
152: else if (nrm>1e-8) {*s = 0; *k = 2; *m = 0;}
153: else {*s = 0; *k = 1; *m = 0;}
154: }
155: PetscFunctionReturn(PETSC_SUCCESS);
156: }
158: /*
159: * Partial fraction form coefficients.
160: * If query, the function returns the size necessary to store the coefficients.
161: */
162: static PetscErrorCode getcoeffs(PetscInt k,PetscInt m,PetscComplex *r,PetscComplex *q,PetscComplex *remain,PetscBool query)
163: {
164: PetscInt i;
165: const PetscComplex /* m == k+1 */
166: p1r4[5] = {-1.582680186458572e+01 - 2.412564578224361e+01*PETSC_i,
167: -1.582680186458572e+01 + 2.412564578224361e+01*PETSC_i,
168: 1.499984465975511e+02 + 6.804227952202417e+01*PETSC_i,
169: 1.499984465975511e+02 - 6.804227952202417e+01*PETSC_i,
170: -2.733432894659307e+02 },
171: p1q4[5] = { 3.655694325463550e+00 + 6.543736899360086e+00*PETSC_i,
172: 3.655694325463550e+00 - 6.543736899360086e+00*PETSC_i,
173: 5.700953298671832e+00 + 3.210265600308496e+00*PETSC_i,
174: 5.700953298671832e+00 - 3.210265600308496e+00*PETSC_i,
175: 6.286704751729261e+00 },
176: p1r3[4] = {-1.130153999597152e+01 + 1.247167585025031e+01*PETSC_i,
177: -1.130153999597152e+01 - 1.247167585025031e+01*PETSC_i,
178: 1.330153999597152e+01 - 6.007173273704750e+01*PETSC_i,
179: 1.330153999597152e+01 + 6.007173273704750e+01*PETSC_i},
180: p1q3[4] = { 3.212806896871536e+00 + 4.773087433276636e+00*PETSC_i,
181: 3.212806896871536e+00 - 4.773087433276636e+00*PETSC_i,
182: 4.787193103128464e+00 + 1.567476416895212e+00*PETSC_i,
183: 4.787193103128464e+00 - 1.567476416895212e+00*PETSC_i},
184: p1r2[3] = { 7.648749087422928e+00 + 4.171640244747463e+00*PETSC_i,
185: 7.648749087422928e+00 - 4.171640244747463e+00*PETSC_i,
186: -1.829749817484586e+01 },
187: p1q2[3] = { 2.681082873627756e+00 + 3.050430199247411e+00*PETSC_i,
188: 2.681082873627756e+00 - 3.050430199247411e+00*PETSC_i,
189: 3.637834252744491e+00 },
190: p1r1[2] = { 1.000000000000000e+00 - 3.535533905932738e+00*PETSC_i,
191: 1.000000000000000e+00 + 3.535533905932738e+00*PETSC_i},
192: p1q1[2] = { 2.000000000000000e+00 + 1.414213562373095e+00*PETSC_i,
193: 2.000000000000000e+00 - 1.414213562373095e+00*PETSC_i};
194: const PetscComplex /* m == k-1 */
195: m1r5[4] = {-1.423367961376821e+02 - 1.385465094833037e+01*PETSC_i,
196: -1.423367961376821e+02 + 1.385465094833037e+01*PETSC_i,
197: 2.647367961376822e+02 - 4.814394493714596e+02*PETSC_i,
198: 2.647367961376822e+02 + 4.814394493714596e+02*PETSC_i},
199: m1q5[4] = { 5.203941240131764e+00 + 5.805856841805367e+00*PETSC_i,
200: 5.203941240131764e+00 - 5.805856841805367e+00*PETSC_i,
201: 6.796058759868242e+00 + 1.886649260140217e+00*PETSC_i,
202: 6.796058759868242e+00 - 1.886649260140217e+00*PETSC_i},
203: m1r4[3] = { 2.484269593165883e+01 + 7.460342395992306e+01*PETSC_i,
204: 2.484269593165883e+01 - 7.460342395992306e+01*PETSC_i,
205: -1.734353918633177e+02 },
206: m1q4[3] = { 4.675757014491557e+00 + 3.913489560603711e+00*PETSC_i,
207: 4.675757014491557e+00 - 3.913489560603711e+00*PETSC_i,
208: 5.648485971016893e+00 },
209: m1r3[2] = { 2.533333333333333e+01 - 2.733333333333333e+01*PETSC_i,
210: 2.533333333333333e+01 + 2.733333333333333e+01*PETSC_i},
211: m1q3[2] = { 4.000000000000000e+00 + 2.000000000000000e+00*PETSC_i,
212: 4.000000000000000e+00 - 2.000000000000000e+00*PETSC_i};
213: const PetscScalar /* m == k-1 */
214: m1remain5[2] = { 2.000000000000000e-01, 9.800000000000000e+00},
215: m1remain4[2] = {-2.500000000000000e-01, -7.750000000000000e+00},
216: m1remain3[2] = { 3.333333333333333e-01, 5.666666666666667e+00},
217: m1remain2[2] = {-0.5, -3.5},
218: remain3[4] = {1.0/6.0, 1.0/2.0, 1, 1},
219: remain2[3] = {1.0/2.0, 1, 1};
221: PetscFunctionBegin;
222: if (query) { /* query about buffer's size */
223: if (m==k+1) {
224: *remain = 0;
225: *r = *q = k+1;
226: PetscFunctionReturn(PETSC_SUCCESS); /* quick return */
227: }
228: if (m==k-1) {
229: *remain = 2;
230: if (k==5) *r = *q = 4;
231: else if (k==4) *r = *q = 3;
232: else if (k==3) *r = *q = 2;
233: else if (k==2) *r = *q = 1;
234: }
235: if (m==0) {
236: *r = *q = 0;
237: *remain = k+1;
238: }
239: } else {
240: if (m==k+1) {
241: if (k==4) {
242: for (i=0;i<5;i++) { r[i] = p1r4[i]; q[i] = p1q4[i]; }
243: } else if (k==3) {
244: for (i=0;i<4;i++) { r[i] = p1r3[i]; q[i] = p1q3[i]; }
245: } else if (k==2) {
246: for (i=0;i<3;i++) { r[i] = p1r2[i]; q[i] = p1q2[i]; }
247: } else if (k==1) {
248: for (i=0;i<2;i++) { r[i] = p1r1[i]; q[i] = p1q1[i]; }
249: }
250: PetscFunctionReturn(PETSC_SUCCESS); /* quick return */
251: }
252: if (m==k-1) {
253: if (k==5) {
254: for (i=0;i<4;i++) { r[i] = m1r5[i]; q[i] = m1q5[i]; }
255: for (i=0;i<2;i++) remain[i] = m1remain5[i];
256: } else if (k==4) {
257: for (i=0;i<3;i++) { r[i] = m1r4[i]; q[i] = m1q4[i]; }
258: for (i=0;i<2;i++) remain[i] = m1remain4[i];
259: } else if (k==3) {
260: for (i=0;i<2;i++) { r[i] = m1r3[i]; q[i] = m1q3[i]; remain[i] = m1remain3[i]; }
261: } else if (k==2) {
262: r[0] = -13.5; q[0] = 3;
263: for (i=0;i<2;i++) remain[i] = m1remain2[i];
264: }
265: }
266: if (m==0) {
267: r = q = NULL;
268: if (k==3) {
269: for (i=0;i<4;i++) remain[i] = remain3[i];
270: } else if (k==2) {
271: for (i=0;i<3;i++) remain[i] = remain2[i];
272: }
273: }
274: }
275: PetscFunctionReturn(PETSC_SUCCESS);
276: }
278: /*
279: * Product form coefficients.
280: * If query, the function returns the size necessary to store the coefficients.
281: */
282: static PetscErrorCode getcoeffsproduct(PetscInt k,PetscInt m,PetscComplex *p,PetscComplex *q,PetscComplex *mult,PetscBool query)
283: {
284: PetscInt i;
285: const PetscComplex /* m == k+1 */
286: p1p4[4] = {-5.203941240131764e+00 + 5.805856841805367e+00*PETSC_i,
287: -5.203941240131764e+00 - 5.805856841805367e+00*PETSC_i,
288: -6.796058759868242e+00 + 1.886649260140217e+00*PETSC_i,
289: -6.796058759868242e+00 - 1.886649260140217e+00*PETSC_i},
290: p1q4[5] = { 3.655694325463550e+00 + 6.543736899360086e+00*PETSC_i,
291: 3.655694325463550e+00 - 6.543736899360086e+00*PETSC_i,
292: 6.286704751729261e+00 ,
293: 5.700953298671832e+00 + 3.210265600308496e+00*PETSC_i,
294: 5.700953298671832e+00 - 3.210265600308496e+00*PETSC_i},
295: p1p3[3] = {-4.675757014491557e+00 + 3.913489560603711e+00*PETSC_i,
296: -4.675757014491557e+00 - 3.913489560603711e+00*PETSC_i,
297: -5.648485971016893e+00 },
298: p1q3[4] = { 3.212806896871536e+00 + 4.773087433276636e+00*PETSC_i,
299: 3.212806896871536e+00 - 4.773087433276636e+00*PETSC_i,
300: 4.787193103128464e+00 + 1.567476416895212e+00*PETSC_i,
301: 4.787193103128464e+00 - 1.567476416895212e+00*PETSC_i},
302: p1p2[2] = {-4.00000000000000e+00 + 2.000000000000000e+00*PETSC_i,
303: -4.00000000000000e+00 - 2.000000000000000e+00*PETSC_i},
304: p1q2[3] = { 2.681082873627756e+00 + 3.050430199247411e+00*PETSC_i,
305: 2.681082873627756e+00 - 3.050430199247411e+00*PETSC_i,
306: 3.637834252744491e+00 },
307: p1q1[2] = { 2.000000000000000e+00 + 1.414213562373095e+00*PETSC_i,
308: 2.000000000000000e+00 - 1.414213562373095e+00*PETSC_i};
309: const PetscComplex /* m == k-1 */
310: m1p5[5] = {-3.655694325463550e+00 + 6.543736899360086e+00*PETSC_i,
311: -3.655694325463550e+00 - 6.543736899360086e+00*PETSC_i,
312: -6.286704751729261e+00 ,
313: -5.700953298671832e+00 + 3.210265600308496e+00*PETSC_i,
314: -5.700953298671832e+00 - 3.210265600308496e+00*PETSC_i},
315: m1q5[4] = { 5.203941240131764e+00 + 5.805856841805367e+00*PETSC_i,
316: 5.203941240131764e+00 - 5.805856841805367e+00*PETSC_i,
317: 6.796058759868242e+00 + 1.886649260140217e+00*PETSC_i,
318: 6.796058759868242e+00 - 1.886649260140217e+00*PETSC_i},
319: m1p4[4] = {-3.212806896871536e+00 + 4.773087433276636e+00*PETSC_i,
320: -3.212806896871536e+00 - 4.773087433276636e+00*PETSC_i,
321: -4.787193103128464e+00 + 1.567476416895212e+00*PETSC_i,
322: -4.787193103128464e+00 - 1.567476416895212e+00*PETSC_i},
323: m1q4[3] = { 4.675757014491557e+00 + 3.913489560603711e+00*PETSC_i,
324: 4.675757014491557e+00 - 3.913489560603711e+00*PETSC_i,
325: 5.648485971016893e+00 },
326: m1p3[3] = {-2.681082873627756e+00 + 3.050430199247411e+00*PETSC_i,
327: -2.681082873627756e+00 - 3.050430199247411e+00*PETSC_i,
328: -3.637834252744491e+00 },
329: m1q3[2] = { 4.000000000000000e+00 + 2.000000000000000e+00*PETSC_i,
330: 4.000000000000000e+00 - 2.000000000000001e+00*PETSC_i},
331: m1p2[2] = {-2.000000000000000e+00 + 1.414213562373095e+00*PETSC_i,
332: -2.000000000000000e+00 - 1.414213562373095e+00*PETSC_i};
334: PetscFunctionBegin;
335: if (query) {
336: if (m == k+1) {
337: *mult = 1;
338: *p = k;
339: *q = k+1;
340: PetscFunctionReturn(PETSC_SUCCESS);
341: }
342: if (m==k-1) {
343: *mult = 1;
344: *p = k;
345: *q = k-1;
346: }
347: } else {
348: if (m == k+1) {
349: *mult = PetscPowInt(-1,m);
350: *mult *= m;
351: if (k==4) {
352: for (i=0;i<4;i++) { p[i] = p1p4[i]; q[i] = p1q4[i]; }
353: q[4] = p1q4[4];
354: } else if (k==3) {
355: for (i=0;i<3;i++) { p[i] = p1p3[i]; q[i] = p1q3[i]; }
356: q[3] = p1q3[3];
357: } else if (k==2) {
358: for (i=0;i<2;i++) { p[i] = p1p2[i]; q[i] = p1q2[i]; }
359: q[2] = p1q2[2];
360: } else if (k==1) {
361: p[0] = -3;
362: for (i=0;i<2;i++) q[i] = p1q1[i];
363: }
364: PetscFunctionReturn(PETSC_SUCCESS);
365: }
366: if (m==k-1) {
367: *mult = PetscPowInt(-1,m);
368: *mult /= k;
369: if (k==5) {
370: for (i=0;i<4;i++) { p[i] = m1p5[i]; q[i] = m1q5[i]; }
371: p[4] = m1p5[4];
372: } else if (k==4) {
373: for (i=0;i<3;i++) { p[i] = m1p4[i]; q[i] = m1q4[i]; }
374: p[3] = m1p4[3];
375: } else if (k==3) {
376: for (i=0;i<2;i++) { p[i] = m1p3[i]; q[i] = m1q3[i]; }
377: p[2] = m1p3[2];
378: } else if (k==2) {
379: for (i=0;i<2;i++) p[i] = m1p2[i];
380: q[0] = 3;
381: }
382: }
383: }
384: PetscFunctionReturn(PETSC_SUCCESS);
385: }
386: #endif /* PETSC_HAVE_COMPLEX */
388: #if defined(PETSC_USE_COMPLEX)
389: static PetscErrorCode getisreal(PetscInt n,PetscComplex *a,PetscBool *result)
390: {
391: PetscInt i;
393: PetscFunctionBegin;
394: *result=PETSC_TRUE;
395: for (i=0;i<n&&*result;i++) {
396: if (PetscImaginaryPartComplex(a[i])) *result=PETSC_FALSE;
397: }
398: PetscFunctionReturn(PETSC_SUCCESS);
399: }
400: #endif
402: /*
403: * Matrix exponential implementation based on algorithm and matlab code by Stefan Guettel
404: * and Yuji Nakatsukasa
405: *
406: * Stefan Guettel and Yuji Nakatsukasa, "Scaled and Squared Subdiagonal Pade
407: * Approximation for the Matrix Exponential",
408: * SIAM J. Matrix Anal. Appl. 37(1):145-170, 2016.
409: * https://doi.org/10.1137/15M1027553
410: */
411: static PetscErrorCode FNEvaluateFunctionMat_Exp_GuettelNakatsukasa(FN fn,Mat A,Mat B)
412: {
413: #if !defined(PETSC_HAVE_COMPLEX)
414: PetscFunctionBegin;
415: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This function requires C99 or C++ complex support");
416: #else
417: PetscInt i,j,n_,s,k,m,mod;
418: PetscBLASInt n=0,n2=0,irsize=0,rsizediv2,ipsize=0,iremainsize=0,info,*piv,minlen,lwork=0,one=1;
419: PetscReal nrm,shift=0.0;
420: #if defined(PETSC_USE_COMPLEX)
421: PetscReal *rwork=NULL;
422: #endif
423: PetscComplex *As,*RR,*RR2,*expmA,*expmA2,*Maux,*Maux2,rsize,*r,psize,*p,remainsize,*remainterm,*rootp,*rootq,mult=0.0,scale,cone=1.0,czero=0.0,*aux;
424: PetscScalar *Ba,*Ba2,*sMaux,*wr,*wi,expshift,sone=1.0,szero=0.0,*saux;
425: const PetscScalar *Aa;
426: PetscBool isreal,flg;
427: PetscBLASInt query=-1;
428: PetscScalar work1,*work;
430: PetscFunctionBegin;
431: PetscCall(MatGetSize(A,&n_,NULL));
432: PetscCall(PetscBLASIntCast(n_,&n));
433: PetscCall(MatDenseGetArrayRead(A,&Aa));
434: PetscCall(MatDenseGetArray(B,&Ba));
435: Ba2 = Ba;
436: PetscCall(PetscBLASIntCast(n*n,&n2));
438: PetscCall(PetscMalloc2(n2,&sMaux,n2,&Maux));
439: Maux2 = Maux;
440: PetscCall(PetscOptionsGetReal(NULL,NULL,"-fn_expm_estimated_eig",&shift,&flg));
441: if (!flg) {
442: PetscCall(PetscMalloc2(n,&wr,n,&wi));
443: PetscCall(PetscArraycpy(sMaux,Aa,n2));
444: /* estimate rightmost eigenvalue and shift A with it */
445: #if !defined(PETSC_USE_COMPLEX)
446: PetscCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,sMaux,&n,wr,wi,NULL,&n,NULL,&n,&work1,&query,&info));
447: SlepcCheckLapackInfo("geev",info);
448: PetscCall(PetscBLASIntCast((PetscInt)work1,&lwork));
449: PetscCall(PetscMalloc1(lwork,&work));
450: PetscCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,sMaux,&n,wr,wi,NULL,&n,NULL,&n,work,&lwork,&info));
451: PetscCall(PetscFree(work));
452: #else
453: PetscCall(PetscArraycpy(Maux,Aa,n2));
454: PetscCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,Maux,&n,wr,NULL,&n,NULL,&n,&work1,&query,rwork,&info));
455: SlepcCheckLapackInfo("geev",info);
456: PetscCall(PetscBLASIntCast((PetscInt)PetscRealPart(work1),&lwork));
457: PetscCall(PetscMalloc2(2*n,&rwork,lwork,&work));
458: PetscCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,Maux,&n,wr,NULL,&n,NULL,&n,work,&lwork,rwork,&info));
459: PetscCall(PetscFree2(rwork,work));
460: #endif
461: SlepcCheckLapackInfo("geev",info);
462: PetscCall(PetscLogFlops(25.0*n*n*n+(n*n*n)/3.0+1.0*n*n*n));
464: shift = PetscRealPart(wr[0]);
465: for (i=1;i<n;i++) {
466: if (PetscRealPart(wr[i]) > shift) shift = PetscRealPart(wr[i]);
467: }
468: PetscCall(PetscFree2(wr,wi));
469: }
470: /* shift so that largest real part is (about) 0 */
471: PetscCall(PetscArraycpy(sMaux,Aa,n2));
472: if (shift) {
473: for (i=0;i<n;i++) sMaux[i+i*n] -= shift;
474: PetscCall(PetscLogFlops(1.0*n));
475: }
476: #if defined(PETSC_USE_COMPLEX)
477: PetscCall(PetscArraycpy(Maux,Aa,n2));
478: if (shift) {
479: for (i=0;i<n;i++) Maux[i+i*n] -= shift;
480: PetscCall(PetscLogFlops(1.0*n));
481: }
482: #endif
484: /* estimate norm(A) and select the scaling factor */
485: nrm = LAPACKlange_("O",&n,&n,sMaux,&n,NULL);
486: PetscCall(PetscLogFlops(1.0*n*n));
487: PetscCall(sexpm_params(nrm,&s,&k,&m));
488: if (s==0 && k==1 && m==0) { /* exp(A) = I+A to eps! */
489: if (shift) expshift = PetscExpReal(shift);
490: for (i=0;i<n;i++) sMaux[i+i*n] += 1.0;
491: if (shift) {
492: PetscCallBLAS("BLASscal",BLASscal_(&n2,&expshift,sMaux,&one));
493: PetscCall(PetscLogFlops(1.0*(n+n2)));
494: } else PetscCall(PetscLogFlops(1.0*n));
495: PetscCall(PetscArraycpy(Ba,sMaux,n2));
496: PetscCall(PetscFree2(sMaux,Maux));
497: PetscCall(MatDenseRestoreArrayRead(A,&Aa));
498: PetscCall(MatDenseRestoreArray(B,&Ba));
499: PetscFunctionReturn(PETSC_SUCCESS); /* quick return */
500: }
502: PetscCall(PetscMalloc4(n2,&expmA,n2,&As,n2,&RR,n,&piv));
503: expmA2 = expmA; RR2 = RR;
504: /* scale matrix */
505: #if !defined(PETSC_USE_COMPLEX)
506: for (i=0;i<n2;i++) {
507: As[i] = sMaux[i];
508: }
509: #else
510: PetscCall(PetscArraycpy(As,sMaux,n2));
511: #endif
512: scale = 1.0/PetscPowRealInt(2.0,s);
513: PetscCallBLAS("BLASCOMPLEXscal",BLASCOMPLEXscal_(&n2,&scale,As,&one));
514: PetscCall(SlepcLogFlopsComplex(1.0*n2));
516: /* evaluate Pade approximant (partial fraction or product form) */
517: if (fn->method==3 || !m) { /* partial fraction */
518: PetscCall(getcoeffs(k,m,&rsize,&psize,&remainsize,PETSC_TRUE));
519: PetscCall(PetscBLASIntCast((PetscInt)PetscRealPartComplex(rsize),&irsize));
520: PetscCall(PetscBLASIntCast((PetscInt)PetscRealPartComplex(psize),&ipsize));
521: PetscCall(PetscBLASIntCast((PetscInt)PetscRealPartComplex(remainsize),&iremainsize));
522: PetscCall(PetscMalloc3(irsize,&r,ipsize,&p,iremainsize,&remainterm));
523: PetscCall(getcoeffs(k,m,r,p,remainterm,PETSC_FALSE));
525: PetscCall(PetscArrayzero(expmA,n2));
526: #if !defined(PETSC_USE_COMPLEX)
527: isreal = PETSC_TRUE;
528: #else
529: PetscCall(getisreal(n2,Maux,&isreal));
530: #endif
531: if (isreal) {
532: rsizediv2 = irsize/2;
533: for (i=0;i<rsizediv2;i++) { /* use partial fraction to get R(As) */
534: PetscCall(PetscArraycpy(Maux,As,n2));
535: PetscCall(PetscArrayzero(RR,n2));
536: for (j=0;j<n;j++) {
537: Maux[j+j*n] -= p[2*i];
538: RR[j+j*n] = r[2*i];
539: }
540: PetscCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,Maux,&n,piv,RR,&n,&info));
541: SlepcCheckLapackInfo("gesv",info);
542: for (j=0;j<n2;j++) {
543: expmA[j] += RR[j] + PetscConj(RR[j]);
544: }
545: /* loop(n) + gesv + loop(n2) */
546: PetscCall(SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+2.0*n2));
547: }
549: mod = ipsize % 2;
550: if (mod) {
551: PetscCall(PetscArraycpy(Maux,As,n2));
552: PetscCall(PetscArrayzero(RR,n2));
553: for (j=0;j<n;j++) {
554: Maux[j+j*n] -= p[ipsize-1];
555: RR[j+j*n] = r[irsize-1];
556: }
557: PetscCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,Maux,&n,piv,RR,&n,&info));
558: SlepcCheckLapackInfo("gesv",info);
559: for (j=0;j<n2;j++) {
560: expmA[j] += RR[j];
561: }
562: PetscCall(SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+1.0*n2));
563: }
564: } else { /* complex */
565: for (i=0;i<irsize;i++) { /* use partial fraction to get R(As) */
566: PetscCall(PetscArraycpy(Maux,As,n2));
567: PetscCall(PetscArrayzero(RR,n2));
568: for (j=0;j<n;j++) {
569: Maux[j+j*n] -= p[i];
570: RR[j+j*n] = r[i];
571: }
572: PetscCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,Maux,&n,piv,RR,&n,&info));
573: SlepcCheckLapackInfo("gesv",info);
574: for (j=0;j<n2;j++) {
575: expmA[j] += RR[j];
576: }
577: PetscCall(SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+1.0*n2));
578: }
579: }
580: for (i=0;i<iremainsize;i++) {
581: if (!i) {
582: PetscCall(PetscArrayzero(RR,n2));
583: for (j=0;j<n;j++) {
584: RR[j+j*n] = remainterm[iremainsize-1];
585: }
586: } else {
587: PetscCall(PetscArraycpy(RR,As,n2));
588: for (j=1;j<i;j++) {
589: PetscCallBLAS("BLASCOMPLEXgemm",BLASCOMPLEXgemm_("N","N",&n,&n,&n,&cone,RR,&n,RR,&n,&czero,Maux,&n));
590: SWAP(RR,Maux,aux);
591: PetscCall(SlepcLogFlopsComplex(2.0*n*n*n));
592: }
593: PetscCallBLAS("BLASCOMPLEXscal",BLASCOMPLEXscal_(&n2,&remainterm[iremainsize-1-i],RR,&one));
594: PetscCall(SlepcLogFlopsComplex(1.0*n2));
595: }
596: for (j=0;j<n2;j++) {
597: expmA[j] += RR[j];
598: }
599: PetscCall(SlepcLogFlopsComplex(1.0*n2));
600: }
601: PetscCall(PetscFree3(r,p,remainterm));
602: } else { /* product form, default */
603: PetscCall(getcoeffsproduct(k,m,&rsize,&psize,&mult,PETSC_TRUE));
604: PetscCall(PetscBLASIntCast((PetscInt)PetscRealPartComplex(rsize),&irsize));
605: PetscCall(PetscBLASIntCast((PetscInt)PetscRealPartComplex(psize),&ipsize));
606: PetscCall(PetscMalloc2(irsize,&rootp,ipsize,&rootq));
607: PetscCall(getcoeffsproduct(k,m,rootp,rootq,&mult,PETSC_FALSE));
609: PetscCall(PetscArrayzero(expmA,n2));
610: for (i=0;i<n;i++) { /* initialize */
611: expmA[i+i*n] = 1.0;
612: }
613: minlen = PetscMin(irsize,ipsize);
614: for (i=0;i<minlen;i++) {
615: PetscCall(PetscArraycpy(RR,As,n2));
616: for (j=0;j<n;j++) {
617: RR[j+j*n] -= rootp[i];
618: }
619: PetscCallBLAS("BLASCOMPLEXgemm",BLASCOMPLEXgemm_("N","N",&n,&n,&n,&cone,RR,&n,expmA,&n,&czero,Maux,&n));
620: SWAP(expmA,Maux,aux);
621: PetscCall(PetscArraycpy(RR,As,n2));
622: for (j=0;j<n;j++) {
623: RR[j+j*n] -= rootq[i];
624: }
625: PetscCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,RR,&n,piv,expmA,&n,&info));
626: SlepcCheckLapackInfo("gesv",info);
627: /* loop(n) + gemm + loop(n) + gesv */
628: PetscCall(SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n)+1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)));
629: }
630: /* extra numerator */
631: for (i=minlen;i<irsize;i++) {
632: PetscCall(PetscArraycpy(RR,As,n2));
633: for (j=0;j<n;j++) {
634: RR[j+j*n] -= rootp[i];
635: }
636: PetscCallBLAS("BLASCOMPLEXgemm",BLASCOMPLEXgemm_("N","N",&n,&n,&n,&cone,RR,&n,expmA,&n,&czero,Maux,&n));
637: SWAP(expmA,Maux,aux);
638: PetscCall(SlepcLogFlopsComplex(1.0*n+2.0*n*n*n));
639: }
640: /* extra denominator */
641: for (i=minlen;i<ipsize;i++) {
642: PetscCall(PetscArraycpy(RR,As,n2));
643: for (j=0;j<n;j++) RR[j+j*n] -= rootq[i];
644: PetscCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,RR,&n,piv,expmA,&n,&info));
645: SlepcCheckLapackInfo("gesv",info);
646: PetscCall(SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)));
647: }
648: PetscCallBLAS("BLASCOMPLEXscal",BLASCOMPLEXscal_(&n2,&mult,expmA,&one));
649: PetscCall(SlepcLogFlopsComplex(1.0*n2));
650: PetscCall(PetscFree2(rootp,rootq));
651: }
653: #if !defined(PETSC_USE_COMPLEX)
654: for (i=0;i<n2;i++) {
655: Ba2[i] = PetscRealPartComplex(expmA[i]);
656: }
657: #else
658: PetscCall(PetscArraycpy(Ba2,expmA,n2));
659: #endif
661: /* perform repeated squaring */
662: for (i=0;i<s;i++) { /* final squaring */
663: PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&sone,Ba2,&n,Ba2,&n,&szero,sMaux,&n));
664: SWAP(Ba2,sMaux,saux);
665: PetscCall(PetscLogFlops(2.0*n*n*n));
666: }
667: if (Ba2!=Ba) {
668: PetscCall(PetscArraycpy(Ba,Ba2,n2));
669: sMaux = Ba2;
670: }
671: if (shift) {
672: expshift = PetscExpReal(shift);
673: PetscCallBLAS("BLASscal",BLASscal_(&n2,&expshift,Ba,&one));
674: PetscCall(PetscLogFlops(1.0*n2));
675: }
677: /* restore pointers */
678: Maux = Maux2; expmA = expmA2; RR = RR2;
679: PetscCall(PetscFree2(sMaux,Maux));
680: PetscCall(PetscFree4(expmA,As,RR,piv));
681: PetscCall(MatDenseRestoreArrayRead(A,&Aa));
682: PetscCall(MatDenseRestoreArray(B,&Ba));
683: PetscFunctionReturn(PETSC_SUCCESS);
684: #endif
685: }
687: #define SMALLN 100
689: /*
690: * Function needed to compute optimal parameters (required workspace is 3*n*n)
691: */
692: static PetscInt ell(PetscBLASInt n,PetscScalar *A,PetscReal coeff,PetscInt m,PetscScalar *work,PetscRandom rand)
693: {
694: PetscScalar *Ascaled=work;
695: PetscReal nrm,alpha,beta,rwork[1];
696: PetscInt t;
697: PetscBLASInt i,j;
699: PetscFunctionBegin;
700: beta = PetscPowReal(coeff,1.0/(2*m+1));
701: for (i=0;i<n;i++)
702: for (j=0;j<n;j++)
703: Ascaled[i+j*n] = beta*PetscAbsScalar(A[i+j*n]);
704: nrm = LAPACKlange_("O",&n,&n,A,&n,rwork);
705: PetscCall(PetscLogFlops(2.0*n*n));
706: PetscCall(SlepcNormAm(n,Ascaled,2*m+1,work+n*n,rand,&alpha));
707: alpha /= nrm;
708: t = PetscMax((PetscInt)PetscCeilReal(PetscLogReal(2.0*alpha/PETSC_MACHINE_EPSILON)/PetscLogReal(2.0)/(2*m)),0);
709: PetscFunctionReturn(t);
710: }
712: /*
713: * Compute scaling parameter (s) and order of Pade approximant (m) (required workspace is 4*n*n)
714: */
715: static PetscErrorCode expm_params(PetscInt n,PetscScalar **Apowers,PetscInt *s,PetscInt *m,PetscScalar *work)
716: {
717: PetscScalar sfactor,sone=1.0,szero=0.0,*A=Apowers[0],*Ascaled;
718: PetscReal d4,d6,d8,d10,eta1,eta3,eta4,eta5,rwork[1];
719: PetscBLASInt n_=0,n2,one=1;
720: PetscRandom rand;
721: const PetscReal coeff[5] = { 9.92063492063492e-06, 9.94131285136576e-11, /* backward error function */
722: 2.22819456055356e-16, 1.69079293431187e-22, 8.82996160201868e-36 };
723: const PetscReal theta[5] = { 1.495585217958292e-002, /* m = 3 */
724: 2.539398330063230e-001, /* m = 5 */
725: 9.504178996162932e-001, /* m = 7 */
726: 2.097847961257068e+000, /* m = 9 */
727: 5.371920351148152e+000 }; /* m = 13 */
729: PetscFunctionBegin;
730: *s = 0;
731: *m = 13;
732: PetscCall(PetscBLASIntCast(n,&n_));
733: PetscCall(PetscRandomCreate(PETSC_COMM_SELF,&rand));
734: d4 = PetscPowReal(LAPACKlange_("O",&n_,&n_,Apowers[2],&n_,rwork),1.0/4.0);
735: if (d4==0.0) { /* safeguard for the case A = 0 */
736: *m = 3;
737: goto done;
738: }
739: d6 = PetscPowReal(LAPACKlange_("O",&n_,&n_,Apowers[3],&n_,rwork),1.0/6.0);
740: PetscCall(PetscLogFlops(2.0*n*n));
741: eta1 = PetscMax(d4,d6);
742: if (eta1<=theta[0] && !ell(n_,A,coeff[0],3,work,rand)) {
743: *m = 3;
744: goto done;
745: }
746: if (eta1<=theta[1] && !ell(n_,A,coeff[1],5,work,rand)) {
747: *m = 5;
748: goto done;
749: }
750: if (n<SMALLN) {
751: PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[2],&n_,Apowers[2],&n_,&szero,work,&n_));
752: d8 = PetscPowReal(LAPACKlange_("O",&n_,&n_,work,&n_,rwork),1.0/8.0);
753: PetscCall(PetscLogFlops(2.0*n*n*n+1.0*n*n));
754: } else {
755: PetscCall(SlepcNormAm(n_,Apowers[2],2,work,rand,&d8));
756: d8 = PetscPowReal(d8,1.0/8.0);
757: }
758: eta3 = PetscMax(d6,d8);
759: if (eta3<=theta[2] && !ell(n_,A,coeff[2],7,work,rand)) {
760: *m = 7;
761: goto done;
762: }
763: if (eta3<=theta[3] && !ell(n_,A,coeff[3],9,work,rand)) {
764: *m = 9;
765: goto done;
766: }
767: if (n<SMALLN) {
768: PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[2],&n_,Apowers[3],&n_,&szero,work,&n_));
769: d10 = PetscPowReal(LAPACKlange_("O",&n_,&n_,work,&n_,rwork),1.0/10.0);
770: PetscCall(PetscLogFlops(2.0*n*n*n+1.0*n*n));
771: } else {
772: PetscCall(SlepcNormAm(n_,Apowers[1],5,work,rand,&d10));
773: d10 = PetscPowReal(d10,1.0/10.0);
774: }
775: eta4 = PetscMax(d8,d10);
776: eta5 = PetscMin(eta3,eta4);
777: *s = PetscMax((PetscInt)PetscCeilReal(PetscLogReal(eta5/theta[4])/PetscLogReal(2.0)),0);
778: if (*s) {
779: Ascaled = work+3*n*n;
780: n2 = n_*n_;
781: PetscCallBLAS("BLAScopy",BLAScopy_(&n2,A,&one,Ascaled,&one));
782: sfactor = PetscPowRealInt(2.0,-(*s));
783: PetscCallBLAS("BLASscal",BLASscal_(&n2,&sfactor,Ascaled,&one));
784: PetscCall(PetscLogFlops(1.0*n*n));
785: } else Ascaled = A;
786: *s += ell(n_,Ascaled,coeff[4],13,work,rand);
787: done:
788: PetscCall(PetscRandomDestroy(&rand));
789: PetscFunctionReturn(PETSC_SUCCESS);
790: }
792: /*
793: * Matrix exponential implementation based on algorithm and matlab code by N. Higham and co-authors
794: *
795: * N. J. Higham, "The scaling and squaring method for the matrix exponential
796: * revisited", SIAM J. Matrix Anal. Appl. 26(4):1179-1193, 2005.
797: */
798: PetscErrorCode FNEvaluateFunctionMat_Exp_Higham(FN fn,Mat A,Mat B)
799: {
800: PetscBLASInt n_=0,n2,*ipiv,info,one=1;
801: PetscInt n,m,j,s;
802: PetscScalar scale,smone=-1.0,sone=1.0,stwo=2.0,szero=0.0;
803: PetscScalar *Ba,*Apowers[5],*Q,*P,*W,*work,*aux;
804: const PetscScalar *Aa,*c;
805: const PetscScalar c3[4] = { 120, 60, 12, 1 };
806: const PetscScalar c5[6] = { 30240, 15120, 3360, 420, 30, 1 };
807: const PetscScalar c7[8] = { 17297280, 8648640, 1995840, 277200, 25200, 1512, 56, 1 };
808: const PetscScalar c9[10] = { 17643225600.0, 8821612800.0, 2075673600, 302702400, 30270240,
809: 2162160, 110880, 3960, 90, 1 };
810: const PetscScalar c13[14] = { 64764752532480000.0, 32382376266240000.0, 7771770303897600.0,
811: 1187353796428800.0, 129060195264000.0, 10559470521600.0,
812: 670442572800.0, 33522128640.0, 1323241920.0,
813: 40840800, 960960, 16380, 182, 1 };
815: PetscFunctionBegin;
816: PetscCall(MatDenseGetArrayRead(A,&Aa));
817: PetscCall(MatDenseGetArray(B,&Ba));
818: PetscCall(MatGetSize(A,&n,NULL));
819: PetscCall(PetscBLASIntCast(n,&n_));
820: n2 = n_*n_;
821: PetscCall(PetscMalloc2(8*n*n,&work,n,&ipiv));
823: /* Matrix powers */
824: Apowers[0] = work; /* Apowers[0] = A */
825: Apowers[1] = Apowers[0] + n*n; /* Apowers[1] = A^2 */
826: Apowers[2] = Apowers[1] + n*n; /* Apowers[2] = A^4 */
827: Apowers[3] = Apowers[2] + n*n; /* Apowers[3] = A^6 */
828: Apowers[4] = Apowers[3] + n*n; /* Apowers[4] = A^8 */
830: PetscCall(PetscArraycpy(Apowers[0],Aa,n2));
831: PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[0],&n_,Apowers[0],&n_,&szero,Apowers[1],&n_));
832: PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[1],&n_,Apowers[1],&n_,&szero,Apowers[2],&n_));
833: PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[1],&n_,Apowers[2],&n_,&szero,Apowers[3],&n_));
834: PetscCall(PetscLogFlops(6.0*n*n*n));
836: /* Compute scaling parameter and order of Pade approximant */
837: PetscCall(expm_params(n,Apowers,&s,&m,Apowers[4]));
839: if (s) { /* rescale */
840: for (j=0;j<4;j++) {
841: scale = PetscPowRealInt(2.0,-PetscMax(2*j,1)*s);
842: PetscCallBLAS("BLASscal",BLASscal_(&n2,&scale,Apowers[j],&one));
843: }
844: PetscCall(PetscLogFlops(4.0*n*n));
845: }
847: /* Evaluate the Pade approximant */
848: switch (m) {
849: case 3: c = c3; break;
850: case 5: c = c5; break;
851: case 7: c = c7; break;
852: case 9: c = c9; break;
853: case 13: c = c13; break;
854: default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %" PetscInt_FMT,m);
855: }
856: P = Ba;
857: Q = Apowers[4] + n*n;
858: W = Q + n*n;
859: switch (m) {
860: case 3:
861: case 5:
862: case 7:
863: case 9:
864: if (m==9) PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[1],&n_,Apowers[3],&n_,&szero,Apowers[4],&n_));
865: PetscCall(PetscArrayzero(P,n2));
866: PetscCall(PetscArrayzero(Q,n2));
867: for (j=0;j<n;j++) {
868: P[j+j*n] = c[1];
869: Q[j+j*n] = c[0];
870: }
871: for (j=m;j>=3;j-=2) {
872: PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[j],Apowers[(j+1)/2-1],&one,P,&one));
873: PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[j-1],Apowers[(j+1)/2-1],&one,Q,&one));
874: PetscCall(PetscLogFlops(4.0*n*n));
875: }
876: PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[0],&n_,P,&n_,&szero,W,&n_));
877: PetscCall(PetscLogFlops(2.0*n*n*n));
878: SWAP(P,W,aux);
879: break;
880: case 13:
881: /* P = A*(Apowers[3]*(c[13]*Apowers[3] + c[11]*Apowers[2] + c[9]*Apowers[1])
882: + c[7]*Apowers[3] + c[5]*Apowers[2] + c[3]*Apowers[1] + c[1]*I) */
883: PetscCallBLAS("BLAScopy",BLAScopy_(&n2,Apowers[3],&one,P,&one));
884: PetscCallBLAS("BLASscal",BLASscal_(&n2,&c[13],P,&one));
885: PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[11],Apowers[2],&one,P,&one));
886: PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[9],Apowers[1],&one,P,&one));
887: PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[3],&n_,P,&n_,&szero,W,&n_));
888: PetscCall(PetscLogFlops(5.0*n*n+2.0*n*n*n));
889: PetscCall(PetscArrayzero(P,n2));
890: for (j=0;j<n;j++) P[j+j*n] = c[1];
891: PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[7],Apowers[3],&one,P,&one));
892: PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[5],Apowers[2],&one,P,&one));
893: PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[3],Apowers[1],&one,P,&one));
894: PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&sone,P,&one,W,&one));
895: PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[0],&n_,W,&n_,&szero,P,&n_));
896: PetscCall(PetscLogFlops(7.0*n*n+2.0*n*n*n));
897: /* Q = Apowers[3]*(c[12]*Apowers[3] + c[10]*Apowers[2] + c[8]*Apowers[1])
898: + c[6]*Apowers[3] + c[4]*Apowers[2] + c[2]*Apowers[1] + c[0]*I */
899: PetscCallBLAS("BLAScopy",BLAScopy_(&n2,Apowers[3],&one,Q,&one));
900: PetscCallBLAS("BLASscal",BLASscal_(&n2,&c[12],Q,&one));
901: PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[10],Apowers[2],&one,Q,&one));
902: PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[8],Apowers[1],&one,Q,&one));
903: PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[3],&n_,Q,&n_,&szero,W,&n_));
904: PetscCall(PetscLogFlops(5.0*n*n+2.0*n*n*n));
905: PetscCall(PetscArrayzero(Q,n2));
906: for (j=0;j<n;j++) Q[j+j*n] = c[0];
907: PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[6],Apowers[3],&one,Q,&one));
908: PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[4],Apowers[2],&one,Q,&one));
909: PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[2],Apowers[1],&one,Q,&one));
910: PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&sone,W,&one,Q,&one));
911: PetscCall(PetscLogFlops(7.0*n*n));
912: break;
913: default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %" PetscInt_FMT,m);
914: }
915: PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&smone,P,&one,Q,&one));
916: PetscCallBLAS("LAPACKgesv",LAPACKgesv_(&n_,&n_,Q,&n_,ipiv,P,&n_,&info));
917: SlepcCheckLapackInfo("gesv",info);
918: PetscCallBLAS("BLASscal",BLASscal_(&n2,&stwo,P,&one));
919: for (j=0;j<n;j++) P[j+j*n] += 1.0;
920: PetscCall(PetscLogFlops(2.0*n*n*n/3.0+4.0*n*n));
922: /* Squaring */
923: for (j=1;j<=s;j++) {
924: PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,P,&n_,P,&n_,&szero,W,&n_));
925: SWAP(P,W,aux);
926: }
927: if (P!=Ba) PetscCall(PetscArraycpy(Ba,P,n2));
928: PetscCall(PetscLogFlops(2.0*n*n*n*s));
930: PetscCall(PetscFree2(work,ipiv));
931: PetscCall(MatDenseRestoreArrayRead(A,&Aa));
932: PetscCall(MatDenseRestoreArray(B,&Ba));
933: PetscFunctionReturn(PETSC_SUCCESS);
934: }
936: #if defined(PETSC_HAVE_CUDA)
937: #include "../src/sys/classes/fn/impls/cuda/fnutilcuda.h"
938: #include <slepccublas.h>
940: PetscErrorCode FNEvaluateFunctionMat_Exp_Pade_CUDA(FN fn,Mat A,Mat B)
941: {
942: PetscBLASInt n=0,ld,ld2,*d_ipiv,*d_info,info,one=1;
943: PetscInt m,k,sexp;
944: PetscBool odd;
945: const PetscInt p=MAX_PADE;
946: PetscReal c[MAX_PADE+1],s;
947: PetscScalar scale,smone=-1.0,sone=1.0,stwo=2.0,szero=0.0;
948: const PetscScalar *Aa;
949: PetscScalar *d_Ba,*d_As,*d_A2,*d_Q,*d_P,*d_W,*aux,**ppP,**d_ppP,**ppQ,**d_ppQ;
950: cublasHandle_t cublasv2handle;
952: PetscFunctionBegin;
953: PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* For CUDA event timers */
954: PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
955: PetscCall(MatGetSize(A,&m,NULL));
956: PetscCall(PetscBLASIntCast(m,&n));
957: ld = n;
958: ld2 = ld*ld;
959: if (A==B) {
960: PetscCallCUDA(cudaMalloc((void **)&d_As,sizeof(PetscScalar)*m*m));
961: PetscCall(MatDenseCUDAGetArrayRead(A,&Aa));
962: PetscCallCUDA(cudaMemcpy(d_As,Aa,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToDevice));
963: PetscCall(MatDenseCUDARestoreArrayRead(A,&Aa));
964: } else PetscCall(MatDenseCUDAGetArrayRead(A,(const PetscScalar**)&d_As));
965: PetscCall(MatDenseCUDAGetArrayWrite(B,&d_Ba));
967: PetscCallCUDA(cudaMalloc((void **)&d_Q,sizeof(PetscScalar)*m*m));
968: PetscCallCUDA(cudaMalloc((void **)&d_W,sizeof(PetscScalar)*m*m));
969: PetscCallCUDA(cudaMalloc((void **)&d_A2,sizeof(PetscScalar)*m*m));
970: PetscCallCUDA(cudaMalloc((void **)&d_ipiv,sizeof(PetscBLASInt)*ld));
971: PetscCallCUDA(cudaMalloc((void **)&d_info,sizeof(PetscBLASInt)));
972: PetscCallCUDA(cudaMalloc((void **)&d_ppP,sizeof(PetscScalar*)));
973: PetscCallCUDA(cudaMalloc((void **)&d_ppQ,sizeof(PetscScalar*)));
975: PetscCall(PetscMalloc1(1,&ppP));
976: PetscCall(PetscMalloc1(1,&ppQ));
978: d_P = d_Ba;
979: PetscCall(PetscLogGpuTimeBegin());
981: /* Pade' coefficients */
982: c[0] = 1.0;
983: for (k=1;k<=p;k++) c[k] = c[k-1]*(p+1-k)/(k*(2*p+1-k));
985: /* Scaling */
986: PetscCallCUBLAS(cublasXnrm2(cublasv2handle,ld2,d_As,one,&s));
987: if (s>0.5) {
988: sexp = PetscMax(0,(int)(PetscLogReal(s)/PetscLogReal(2.0))+2);
989: scale = PetscPowRealInt(2.0,-sexp);
990: PetscCallCUBLAS(cublasXscal(cublasv2handle,ld2,&scale,d_As,one));
991: PetscCall(PetscLogGpuFlops(1.0*n*n));
992: } else sexp = 0;
994: /* Horner evaluation */
995: PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_As,ld,d_As,ld,&szero,d_A2,ld));
996: PetscCall(PetscLogGpuFlops(2.0*n*n*n));
997: PetscCallCUDA(cudaMemset(d_Q,0,sizeof(PetscScalar)*ld2));
998: PetscCallCUDA(cudaMemset(d_P,0,sizeof(PetscScalar)*ld2));
999: PetscCall(set_diagonal(n,d_Q,ld,c[p]));
1000: PetscCall(set_diagonal(n,d_P,ld,c[p-1]));
1002: odd = PETSC_TRUE;
1003: for (k=p-1;k>0;k--) {
1004: if (odd) {
1005: PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_Q,ld,d_A2,ld,&szero,d_W,ld));
1006: SWAP(d_Q,d_W,aux);
1007: PetscCall(shift_diagonal(n,d_Q,ld,c[k-1]));
1008: odd = PETSC_FALSE;
1009: } else {
1010: PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_A2,ld,&szero,d_W,ld));
1011: SWAP(d_P,d_W,aux);
1012: PetscCall(shift_diagonal(n,d_P,ld,c[k-1]));
1013: odd = PETSC_TRUE;
1014: }
1015: PetscCall(PetscLogGpuFlops(2.0*n*n*n));
1016: }
1017: if (odd) {
1018: PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_Q,ld,d_As,ld,&szero,d_W,ld));
1019: SWAP(d_Q,d_W,aux);
1020: PetscCallCUBLAS(cublasXaxpy(cublasv2handle,ld2,&smone,d_P,one,d_Q,one));
1022: ppQ[0] = d_Q;
1023: ppP[0] = d_P;
1024: PetscCallCUDA(cudaMemcpy(d_ppQ,ppQ,sizeof(PetscScalar*),cudaMemcpyHostToDevice));
1025: PetscCallCUDA(cudaMemcpy(d_ppP,ppP,sizeof(PetscScalar*),cudaMemcpyHostToDevice));
1027: PetscCallCUBLAS(cublasXgetrfBatched(cublasv2handle,n,d_ppQ,ld,d_ipiv,d_info,one));
1028: PetscCallCUDA(cudaMemcpy(&info,d_info,sizeof(PetscBLASInt),cudaMemcpyDeviceToHost));
1029: PetscCheck(info>=0,PETSC_COMM_SELF,PETSC_ERR_LIB,"LAPACKgetrf: Illegal value on argument %" PetscBLASInt_FMT,PetscAbsInt(info));
1030: PetscCheck(info<=0,PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"LAPACKgetrf: Matrix is singular. U(%" PetscBLASInt_FMT ",%" PetscBLASInt_FMT ") is zero",info,info);
1031: PetscCallCUBLAS(cublasXgetrsBatched(cublasv2handle,CUBLAS_OP_N,n,n,(const PetscScalar **)d_ppQ,ld,d_ipiv,d_ppP,ld,&info,one));
1032: PetscCheck(info>=0,PETSC_COMM_SELF,PETSC_ERR_LIB,"LAPACKgetri: Illegal value on argument %" PetscBLASInt_FMT,PetscAbsInt(info));
1033: PetscCheck(info<=0,PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"LAPACKgetri: Matrix is singular. U(%" PetscBLASInt_FMT ",%" PetscBLASInt_FMT ") is zero",info,info);
1034: PetscCallCUBLAS(cublasXscal(cublasv2handle,ld2,&stwo,d_P,one));
1035: PetscCall(shift_diagonal(n,d_P,ld,sone));
1036: PetscCallCUBLAS(cublasXscal(cublasv2handle,ld2,&smone,d_P,one));
1037: } else {
1038: PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_As,ld,&szero,d_W,ld));
1039: SWAP(d_P,d_W,aux);
1040: PetscCallCUBLAS(cublasXaxpy(cublasv2handle,ld2,&smone,d_P,one,d_Q,one));
1042: ppQ[0] = d_Q;
1043: ppP[0] = d_P;
1044: PetscCallCUDA(cudaMemcpy(d_ppQ,ppQ,sizeof(PetscScalar*),cudaMemcpyHostToDevice));
1045: PetscCallCUDA(cudaMemcpy(d_ppP,ppP,sizeof(PetscScalar*),cudaMemcpyHostToDevice));
1047: PetscCallCUBLAS(cublasXgetrfBatched(cublasv2handle,n,d_ppQ,ld,d_ipiv,d_info,one));
1048: PetscCallCUDA(cudaMemcpy(&info,d_info,sizeof(PetscBLASInt),cudaMemcpyDeviceToHost));
1049: PetscCheck(info>=0,PETSC_COMM_SELF,PETSC_ERR_LIB,"LAPACKgetrf: Illegal value on argument %" PetscBLASInt_FMT,PetscAbsInt(info));
1050: PetscCheck(info<=0,PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"LAPACKgetrf: Matrix is singular. U(%" PetscBLASInt_FMT ",%" PetscBLASInt_FMT ") is zero",info,info);
1051: PetscCallCUBLAS(cublasXgetrsBatched(cublasv2handle,CUBLAS_OP_N,n,n,(const PetscScalar **)d_ppQ,ld,d_ipiv,d_ppP,ld,&info,one));
1052: PetscCheck(info>=0,PETSC_COMM_SELF,PETSC_ERR_LIB,"LAPACKgetri: Illegal value on argument %" PetscBLASInt_FMT,PetscAbsInt(info));
1053: PetscCheck(info<=0,PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"LAPACKgetri: Matrix is singular. U(%" PetscBLASInt_FMT ",%" PetscBLASInt_FMT ") is zero",info,info);
1054: PetscCallCUBLAS(cublasXscal(cublasv2handle,ld2,&stwo,d_P,one));
1055: PetscCall(shift_diagonal(n,d_P,ld,sone));
1056: }
1057: PetscCall(PetscLogGpuFlops(2.0*n*n*n+2.0*n*n*n/3.0+4.0*n*n));
1059: for (k=1;k<=sexp;k++) {
1060: PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_P,ld,&szero,d_W,ld));
1061: PetscCallCUDA(cudaMemcpy(d_P,d_W,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToDevice));
1062: }
1063: PetscCall(PetscLogGpuFlops(2.0*n*n*n*sexp));
1065: PetscCall(PetscLogGpuTimeEnd());
1066: PetscCallCUDA(cudaFree(d_Q));
1067: PetscCallCUDA(cudaFree(d_W));
1068: PetscCallCUDA(cudaFree(d_A2));
1069: PetscCallCUDA(cudaFree(d_ipiv));
1070: PetscCallCUDA(cudaFree(d_info));
1071: PetscCallCUDA(cudaFree(d_ppP));
1072: PetscCallCUDA(cudaFree(d_ppQ));
1074: PetscCall(PetscFree(ppP));
1075: PetscCall(PetscFree(ppQ));
1077: if (d_P!=d_Ba) PetscCallCUDA(cudaMemcpy(d_Ba,d_P,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToDevice));
1078: if (A!=B) {
1079: if (s>0.5) { /* undo scaling */
1080: scale = 1.0/scale;
1081: PetscCallCUBLAS(cublasXscal(cublasv2handle,ld2,&scale,d_As,one));
1082: }
1083: PetscCall(MatDenseCUDARestoreArrayRead(A,(const PetscScalar**)&d_As));
1084: } else PetscCallCUDA(cudaFree(d_As));
1085: PetscCall(MatDenseCUDARestoreArrayWrite(B,&d_Ba));
1086: PetscFunctionReturn(PETSC_SUCCESS);
1087: }
1089: #if defined(PETSC_HAVE_MAGMA)
1090: #include <slepcmagma.h>
1092: PetscErrorCode FNEvaluateFunctionMat_Exp_Pade_CUDAm(FN fn,Mat A,Mat B)
1093: {
1094: PetscBLASInt n=0,ld,ld2,*piv,one=1;
1095: PetscInt m,k,sexp;
1096: PetscBool odd;
1097: const PetscInt p=MAX_PADE;
1098: PetscReal c[MAX_PADE+1],s;
1099: PetscScalar scale,smone=-1.0,sone=1.0,stwo=2.0,szero=0.0;
1100: const PetscScalar *Aa;
1101: PetscScalar *d_Ba,*d_As,*d_A2,*d_Q,*d_P,*d_W,*aux;
1102: cublasHandle_t cublasv2handle;
1104: PetscFunctionBegin;
1105: PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* For CUDA event timers */
1106: PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
1107: PetscCall(SlepcMagmaInit());
1108: PetscCall(MatGetSize(A,&m,NULL));
1109: PetscCall(PetscBLASIntCast(m,&n));
1110: ld = n;
1111: ld2 = ld*ld;
1112: if (A==B) {
1113: PetscCallCUDA(cudaMalloc((void **)&d_As,sizeof(PetscScalar)*m*m));
1114: PetscCall(MatDenseCUDAGetArrayRead(A,&Aa));
1115: PetscCallCUDA(cudaMemcpy(d_As,Aa,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToDevice));
1116: PetscCall(MatDenseCUDARestoreArrayRead(A,&Aa));
1117: } else PetscCall(MatDenseCUDAGetArrayRead(A,(const PetscScalar**)&d_As));
1118: PetscCall(MatDenseCUDAGetArrayWrite(B,&d_Ba));
1120: PetscCallCUDA(cudaMalloc((void **)&d_Q,sizeof(PetscScalar)*m*m));
1121: PetscCallCUDA(cudaMalloc((void **)&d_W,sizeof(PetscScalar)*m*m));
1122: PetscCallCUDA(cudaMalloc((void **)&d_A2,sizeof(PetscScalar)*m*m));
1124: PetscCall(PetscMalloc1(n,&piv));
1126: d_P = d_Ba;
1127: PetscCall(PetscLogGpuTimeBegin());
1129: /* Pade' coefficients */
1130: c[0] = 1.0;
1131: for (k=1;k<=p;k++) c[k] = c[k-1]*(p+1-k)/(k*(2*p+1-k));
1133: /* Scaling */
1134: PetscCallCUBLAS(cublasXnrm2(cublasv2handle,ld2,d_As,one,&s));
1135: PetscCall(PetscLogGpuFlops(1.0*n*n));
1137: if (s>0.5) {
1138: sexp = PetscMax(0,(int)(PetscLogReal(s)/PetscLogReal(2.0))+2);
1139: scale = PetscPowRealInt(2.0,-sexp);
1140: PetscCallCUBLAS(cublasXscal(cublasv2handle,ld2,&scale,d_As,one));
1141: PetscCall(PetscLogGpuFlops(1.0*n*n));
1142: } else sexp = 0;
1144: /* Horner evaluation */
1145: PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_As,ld,d_As,ld,&szero,d_A2,ld));
1146: PetscCall(PetscLogGpuFlops(2.0*n*n*n));
1147: PetscCallCUDA(cudaMemset(d_Q,0,sizeof(PetscScalar)*ld2));
1148: PetscCallCUDA(cudaMemset(d_P,0,sizeof(PetscScalar)*ld2));
1149: PetscCall(set_diagonal(n,d_Q,ld,c[p]));
1150: PetscCall(set_diagonal(n,d_P,ld,c[p-1]));
1152: odd = PETSC_TRUE;
1153: for (k=p-1;k>0;k--) {
1154: if (odd) {
1155: PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_Q,ld,d_A2,ld,&szero,d_W,ld));
1156: SWAP(d_Q,d_W,aux);
1157: PetscCall(shift_diagonal(n,d_Q,ld,c[k-1]));
1158: odd = PETSC_FALSE;
1159: } else {
1160: PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_A2,ld,&szero,d_W,ld));
1161: SWAP(d_P,d_W,aux);
1162: PetscCall(shift_diagonal(n,d_P,ld,c[k-1]));
1163: odd = PETSC_TRUE;
1164: }
1165: PetscCall(PetscLogGpuFlops(2.0*n*n*n));
1166: }
1167: if (odd) {
1168: PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_Q,ld,d_As,ld,&szero,d_W,ld));
1169: SWAP(d_Q,d_W,aux);
1170: PetscCallCUBLAS(cublasXaxpy(cublasv2handle,ld2,&smone,d_P,one,d_Q,one));
1171: PetscCallMAGMA(magma_xgesv_gpu,n,n,d_Q,ld,piv,d_P,ld);
1172: PetscCallCUBLAS(cublasXscal(cublasv2handle,ld2,&stwo,d_P,one));
1173: PetscCall(shift_diagonal(n,d_P,ld,sone));
1174: PetscCallCUBLAS(cublasXscal(cublasv2handle,ld2,&smone,d_P,one));
1175: } else {
1176: PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_As,ld,&szero,d_W,ld));
1177: SWAP(d_P,d_W,aux);
1178: PetscCallCUBLAS(cublasXaxpy(cublasv2handle,ld2,&smone,d_P,one,d_Q,one));
1179: PetscCallMAGMA(magma_xgesv_gpu,n,n,d_Q,ld,piv,d_P,ld);
1180: PetscCallCUBLAS(cublasXscal(cublasv2handle,ld2,&stwo,d_P,one));
1181: PetscCall(shift_diagonal(n,d_P,ld,sone));
1182: }
1183: PetscCall(PetscLogGpuFlops(2.0*n*n*n+2.0*n*n*n/3.0+4.0*n*n));
1185: for (k=1;k<=sexp;k++) {
1186: PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_P,ld,&szero,d_W,ld));
1187: PetscCallCUDA(cudaMemcpy(d_P,d_W,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToDevice));
1188: }
1189: PetscCall(PetscLogGpuFlops(2.0*n*n*n*sexp));
1191: PetscCall(PetscLogGpuTimeEnd());
1192: PetscCallCUDA(cudaFree(d_Q));
1193: PetscCallCUDA(cudaFree(d_W));
1194: PetscCallCUDA(cudaFree(d_A2));
1195: PetscCall(PetscFree(piv));
1197: if (d_P!=d_Ba) PetscCallCUDA(cudaMemcpy(d_Ba,d_P,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToDevice));
1198: if (A!=B) {
1199: if (s>0.5) { /* undo scaling */
1200: scale = 1.0/scale;
1201: PetscCallCUBLAS(cublasXscal(cublasv2handle,ld2,&scale,d_As,one));
1202: }
1203: PetscCall(MatDenseCUDARestoreArrayRead(A,(const PetscScalar**)&d_As));
1204: } else PetscCallCUDA(cudaFree(d_As));
1205: PetscCall(MatDenseCUDARestoreArrayWrite(B,&d_Ba));
1206: PetscFunctionReturn(PETSC_SUCCESS);
1207: }
1209: /*
1210: * Matrix exponential implementation based on algorithm and matlab code by N. Higham and co-authors
1211: *
1212: * N. J. Higham, "The scaling and squaring method for the matrix exponential
1213: * revisited", SIAM J. Matrix Anal. Appl. 26(4):1179-1193, 2005.
1214: */
1215: PetscErrorCode FNEvaluateFunctionMat_Exp_Higham_CUDAm(FN fn,Mat A,Mat B)
1216: {
1217: PetscBLASInt n_=0,n2,*ipiv,one=1;
1218: PetscInt n,m,j,s;
1219: PetscScalar scale,smone=-1.0,sone=1.0,stwo=2.0,szero=0.0;
1220: PetscScalar *d_Ba,*Apowers[5],*d_Apowers[5],*d_Q,*d_P,*d_W,*work,*d_work,*aux;
1221: const PetscScalar *Aa,*c;
1222: const PetscScalar c3[4] = { 120, 60, 12, 1 };
1223: const PetscScalar c5[6] = { 30240, 15120, 3360, 420, 30, 1 };
1224: const PetscScalar c7[8] = { 17297280, 8648640, 1995840, 277200, 25200, 1512, 56, 1 };
1225: const PetscScalar c9[10] = { 17643225600, 8821612800, 2075673600, 302702400, 30270240,
1226: 2162160, 110880, 3960, 90, 1 };
1227: const PetscScalar c13[14] = { 64764752532480000, 32382376266240000, 7771770303897600,
1228: 1187353796428800, 129060195264000, 10559470521600,
1229: 670442572800, 33522128640, 1323241920,
1230: 40840800, 960960, 16380, 182, 1 };
1231: cublasHandle_t cublasv2handle;
1233: PetscFunctionBegin;
1234: PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* For CUDA event timers */
1235: PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
1236: PetscCall(SlepcMagmaInit());
1237: PetscCall(MatGetSize(A,&n,NULL));
1238: PetscCall(PetscBLASIntCast(n,&n_));
1239: n2 = n_*n_;
1240: PetscCall(PetscMalloc2(8*n*n,&work,n,&ipiv));
1241: /* Matrix powers */
1242: Apowers[0] = work; /* Apowers[0] = A */
1243: Apowers[1] = Apowers[0] + n*n; /* Apowers[1] = A^2 */
1244: Apowers[2] = Apowers[1] + n*n; /* Apowers[2] = A^4 */
1245: Apowers[3] = Apowers[2] + n*n; /* Apowers[3] = A^6 */
1246: Apowers[4] = Apowers[3] + n*n; /* Apowers[4] = A^8 */
1247: if (A==B) {
1248: PetscCallCUDA(cudaMalloc((void**)&d_work,7*n*n*sizeof(PetscScalar)));
1249: d_Apowers[0] = d_work; /* d_Apowers[0] = A */
1250: d_Apowers[1] = d_Apowers[0] + n*n; /* d_Apowers[1] = A^2 */
1251: PetscCall(MatDenseCUDAGetArrayRead(A,&Aa));
1252: PetscCallCUDA(cudaMemcpy(d_Apowers[0],Aa,n2*sizeof(PetscScalar),cudaMemcpyDeviceToDevice));
1253: PetscCall(MatDenseCUDARestoreArrayRead(A,&Aa));
1254: } else {
1255: PetscCallCUDA(cudaMalloc((void**)&d_work,6*n*n*sizeof(PetscScalar)));
1256: PetscCall(MatDenseCUDAGetArrayRead(A,(const PetscScalar**)&d_Apowers[0]));
1257: d_Apowers[1] = d_work; /* d_Apowers[1] = A^2 */
1258: }
1259: PetscCall(MatDenseCUDAGetArrayWrite(B,&d_Ba));
1260: d_Apowers[2] = d_Apowers[1] + n*n; /* d_Apowers[2] = A^4 */
1261: d_Apowers[3] = d_Apowers[2] + n*n; /* d_Apowers[3] = A^6 */
1262: d_Apowers[4] = d_Apowers[3] + n*n; /* d_Apowers[4] = A^8 */
1263: d_Q = d_Apowers[4] + n*n;
1264: d_W = d_Q + n*n;
1266: PetscCall(PetscLogGpuTimeBegin());
1268: PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[0],n_,d_Apowers[0],n_,&szero,d_Apowers[1],n_));
1269: PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[1],n_,d_Apowers[1],n_,&szero,d_Apowers[2],n_));
1270: PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[1],n_,d_Apowers[2],n_,&szero,d_Apowers[3],n_));
1271: PetscCall(PetscLogGpuFlops(6.0*n*n*n));
1273: PetscCallCUDA(cudaMemcpy(Apowers[0],d_Apowers[0],n2*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
1274: PetscCallCUDA(cudaMemcpy(Apowers[1],d_Apowers[1],3*n2*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
1275: PetscCall(PetscLogGpuToCpu(4*n2*sizeof(PetscScalar)));
1276: /* Compute scaling parameter and order of Pade approximant */
1277: PetscCall(expm_params(n,Apowers,&s,&m,Apowers[4]));
1279: if (s) { /* rescale */
1280: for (j=0;j<4;j++) {
1281: scale = PetscPowRealInt(2.0,-PetscMax(2*j,1)*s);
1282: PetscCallCUBLAS(cublasXscal(cublasv2handle,n2,&scale,d_Apowers[j],one));
1283: }
1284: PetscCall(PetscLogGpuFlops(4.0*n*n));
1285: }
1287: /* Evaluate the Pade approximant */
1288: switch (m) {
1289: case 3: c = c3; break;
1290: case 5: c = c5; break;
1291: case 7: c = c7; break;
1292: case 9: c = c9; break;
1293: case 13: c = c13; break;
1294: default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %" PetscInt_FMT,m);
1295: }
1296: d_P = d_Ba;
1297: switch (m) {
1298: case 3:
1299: case 5:
1300: case 7:
1301: case 9:
1302: if (m==9) PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[1],n_,d_Apowers[3],n_,&szero,d_Apowers[4],n_));
1303: PetscCallCUDA(cudaMemset(d_P,0,sizeof(PetscScalar)*n2));
1304: PetscCallCUDA(cudaMemset(d_Q,0,sizeof(PetscScalar)*n2));
1305: PetscCall(set_diagonal(n,d_P,n,c[1]));
1306: PetscCall(set_diagonal(n,d_Q,n,c[0]));
1307: for (j=m;j>=3;j-=2) {
1308: PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[j],d_Apowers[(j+1)/2-1],one,d_P,one));
1309: PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[j-1],d_Apowers[(j+1)/2-1],one,d_Q,one));
1310: PetscCall(PetscLogGpuFlops(4.0*n*n));
1311: }
1312: PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[0],n_,d_P,n_,&szero,d_W,n_));
1313: PetscCall(PetscLogGpuFlops(2.0*n*n*n));
1314: SWAP(d_P,d_W,aux);
1315: break;
1316: case 13:
1317: /* P = A*(Apowers[3]*(c[13]*Apowers[3] + c[11]*Apowers[2] + c[9]*Apowers[1])
1318: + c[7]*Apowers[3] + c[5]*Apowers[2] + c[3]*Apowers[1] + c[1]*I) */
1319: PetscCallCUDA(cudaMemcpy(d_P,d_Apowers[3],n2*sizeof(PetscScalar),cudaMemcpyDeviceToDevice));
1320: PetscCallCUBLAS(cublasXscal(cublasv2handle,n2,&c[13],d_P,one));
1321: PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[11],d_Apowers[2],one,d_P,one));
1322: PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[9],d_Apowers[1],one,d_P,one));
1323: PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[3],n_,d_P,n_,&szero,d_W,n_));
1324: PetscCall(PetscLogGpuFlops(5.0*n*n+2.0*n*n*n));
1326: PetscCallCUDA(cudaMemset(d_P,0,sizeof(PetscScalar)*n2));
1327: PetscCall(set_diagonal(n,d_P,n,c[1]));
1328: PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[7],d_Apowers[3],one,d_P,one));
1329: PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[5],d_Apowers[2],one,d_P,one));
1330: PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[3],d_Apowers[1],one,d_P,one));
1331: PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&sone,d_P,one,d_W,one));
1332: PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[0],n_,d_W,n_,&szero,d_P,n_));
1333: PetscCall(PetscLogGpuFlops(7.0*n*n+2.0*n*n*n));
1334: /* Q = Apowers[3]*(c[12]*Apowers[3] + c[10]*Apowers[2] + c[8]*Apowers[1])
1335: + c[6]*Apowers[3] + c[4]*Apowers[2] + c[2]*Apowers[1] + c[0]*I */
1336: PetscCallCUDA(cudaMemcpy(d_Q,d_Apowers[3],n2*sizeof(PetscScalar),cudaMemcpyDeviceToDevice));
1337: PetscCallCUBLAS(cublasXscal(cublasv2handle,n2,&c[12],d_Q,one));
1338: PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[10],d_Apowers[2],one,d_Q,one));
1339: PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[8],d_Apowers[1],one,d_Q,one));
1340: PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[3],n_,d_Q,n_,&szero,d_W,n_));
1341: PetscCall(PetscLogGpuFlops(5.0*n*n+2.0*n*n*n));
1342: PetscCallCUDA(cudaMemset(d_Q,0,sizeof(PetscScalar)*n2));
1343: PetscCall(set_diagonal(n,d_Q,n,c[0]));
1344: PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[6],d_Apowers[3],one,d_Q,one));
1345: PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[4],d_Apowers[2],one,d_Q,one));
1346: PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[2],d_Apowers[1],one,d_Q,one));
1347: PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&sone,d_W,one,d_Q,one));
1348: PetscCall(PetscLogGpuFlops(7.0*n*n));
1349: break;
1350: default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %" PetscInt_FMT,m);
1351: }
1352: PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&smone,d_P,one,d_Q,one));
1354: PetscCallMAGMA(magma_xgesv_gpu,n_,n_,d_Q,n_,ipiv,d_P,n_);
1356: PetscCallCUBLAS(cublasXscal(cublasv2handle,n2,&stwo,d_P,one));
1357: PetscCall(shift_diagonal(n,d_P,n,sone));
1358: PetscCall(PetscLogGpuFlops(2.0*n*n*n/3.0+4.0*n*n));
1360: /* Squaring */
1361: for (j=1;j<=s;j++) {
1362: PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_P,n_,d_P,n_,&szero,d_W,n_));
1363: SWAP(d_P,d_W,aux);
1364: }
1365: PetscCall(PetscLogGpuFlops(2.0*n*n*n*s));
1366: PetscCall(PetscLogGpuTimeEnd());
1368: PetscCall(PetscFree2(work,ipiv));
1369: if (d_P!=d_Ba) PetscCallCUDA(cudaMemcpy(d_Ba,d_P,n2*sizeof(PetscScalar),cudaMemcpyDeviceToDevice));
1370: if (A!=B) {
1371: if (s>0.5) { /* undo scaling */
1372: scale = 1.0/PetscPowRealInt(2.0,-s);
1373: PetscCallCUBLAS(cublasXscal(cublasv2handle,n2,&scale,d_Apowers[0],one));
1374: }
1375: PetscCall(MatDenseCUDARestoreArrayRead(A,(const PetscScalar**)&d_Apowers[0]));
1376: }
1377: PetscCall(MatDenseCUDARestoreArrayWrite(B,&d_Ba));
1378: PetscCallCUDA(cudaFree(d_work));
1379: PetscFunctionReturn(PETSC_SUCCESS);
1380: }
1382: /*
1383: * Matrix exponential implementation based on algorithm and matlab code by Stefan Guettel
1384: * and Yuji Nakatsukasa
1385: *
1386: * Stefan Guettel and Yuji Nakatsukasa, "Scaled and Squared Subdiagonal Pade'
1387: * Approximation for the Matrix Exponential",
1388: * SIAM J. Matrix Anal. Appl. 37(1):145-170, 2016.
1389: * https://doi.org/10.1137/15M1027553
1390: */
1391: PetscErrorCode FNEvaluateFunctionMat_Exp_GuettelNakatsukasa_CUDAm(FN fn,Mat A,Mat B)
1392: {
1393: PetscInt i,j,n_,s,k,m,mod;
1394: PetscBLASInt n=0,n2=0,irsize=0,rsizediv2,ipsize=0,iremainsize=0,query=-1,*piv,minlen,lwork=0,one=1;
1395: PetscReal nrm,shift=0.0,rone=1.0,rzero=0.0;
1396: #if defined(PETSC_USE_COMPLEX)
1397: PetscReal *rwork=NULL;
1398: #endif
1399: PetscComplex *d_As,*d_RR,*d_RR2,*d_expmA,*d_expmA2,*d_Maux,*d_Maux2,rsize,*r,psize,*p,remainsize,*remainterm,*rootp,*rootq,mult=0.0,scale,cone=1.0,czero=0.0,*aux;
1400: PetscScalar *d_Aa,*d_Ba,*d_Ba2,*Maux,*d_sMaux,*wr,*wi,expshift,sone=1.0,szero=0.0,*work,work1,*saux;
1401: const PetscScalar *Aa;
1402: PetscBool isreal,*d_isreal,flg;
1403: cublasHandle_t cublasv2handle;
1405: PetscFunctionBegin;
1406: PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* For CUDA event timers */
1407: PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
1408: PetscCall(SlepcMagmaInit());
1409: PetscCall(MatGetSize(A,&n_,NULL));
1410: PetscCall(PetscBLASIntCast(n_,&n));
1411: PetscCall(PetscBLASIntCast(n*n,&n2));
1413: if (A==B) {
1414: PetscCallCUDA(cudaMalloc((void **)&d_Aa,sizeof(PetscScalar)*n2));
1415: PetscCall(MatDenseCUDAGetArrayRead(A,&Aa));
1416: PetscCallCUDA(cudaMemcpy(d_Aa,Aa,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice));
1417: PetscCall(MatDenseCUDARestoreArrayRead(A,&Aa));
1418: } else PetscCall(MatDenseCUDAGetArrayRead(A,(const PetscScalar**)&d_Aa));
1419: PetscCall(MatDenseCUDAGetArrayWrite(B,&d_Ba));
1420: d_Ba2 = d_Ba;
1422: PetscCallCUDA(cudaMalloc((void **)&d_isreal,sizeof(PetscBool)));
1423: PetscCallCUDA(cudaMalloc((void **)&d_sMaux,sizeof(PetscScalar)*n2));
1424: PetscCallCUDA(cudaMalloc((void **)&d_Maux,sizeof(PetscComplex)*n2));
1426: PetscCall(PetscLogGpuTimeBegin());
1427: d_Maux2 = d_Maux;
1428: PetscCall(PetscOptionsGetReal(NULL,NULL,"-fn_expm_estimated_eig",&shift,&flg));
1429: if (!flg) {
1430: PetscCall(PetscMalloc2(n,&wr,n,&wi));
1431: /* estimate rightmost eigenvalue and shift A with it */
1432: PetscCall(PetscMalloc1(n2,&Maux));
1433: PetscCall(MatDenseGetArrayRead(A,&Aa));
1434: PetscCall(PetscArraycpy(Maux,Aa,n2));
1435: PetscCall(MatDenseRestoreArrayRead(A,&Aa));
1436: #if !defined(PETSC_USE_COMPLEX)
1437: PetscCallMAGMA(magma_xgeev,MagmaNoVec,MagmaNoVec,n,Maux,n,wr,wi,NULL,n,NULL,n,&work1,query);
1438: PetscCall(PetscBLASIntCast((PetscInt)PetscRealPart(work1),&lwork));
1439: PetscCall(PetscMalloc1(lwork,&work));
1440: PetscCallMAGMA(magma_xgeev,MagmaNoVec,MagmaNoVec,n,Maux,n,wr,wi,NULL,n,NULL,n,work,lwork);
1441: PetscCall(PetscFree(work));
1442: #else
1443: PetscCallMAGMA(magma_xgeev,MagmaNoVec,MagmaNoVec,n,Maux,n,wr,NULL,n,NULL,n,&work1,query,rwork);
1444: PetscCall(PetscBLASIntCast((PetscInt)PetscRealPart(work1),&lwork));
1445: PetscCall(PetscMalloc2(2*n,&rwork,lwork,&work));
1446: PetscCallMAGMA(magma_xgeev,MagmaNoVec,MagmaNoVec,n,Maux,n,wr,NULL,n,NULL,n,work,lwork,rwork);
1447: PetscCall(PetscFree2(rwork,work));
1448: #endif
1449: PetscCall(PetscFree(Maux));
1450: PetscCall(PetscLogGpuFlops(25.0*n*n*n+(n*n*n)/3.0+1.0*n*n*n));
1452: shift = PetscRealPart(wr[0]);
1453: for (i=1;i<n;i++) {
1454: if (PetscRealPart(wr[i]) > shift) shift = PetscRealPart(wr[i]);
1455: }
1456: PetscCall(PetscFree2(wr,wi));
1457: }
1458: /* shift so that largest real part is (about) 0 */
1459: PetscCallCUDA(cudaMemcpy(d_sMaux,d_Aa,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice));
1460: if (shift) {
1461: PetscCall(shift_diagonal(n,d_sMaux,n,-shift));
1462: PetscCall(PetscLogGpuFlops(1.0*n));
1463: }
1464: #if defined(PETSC_USE_COMPLEX)
1465: PetscCallCUDA(cudaMemcpy(d_Maux,d_Aa,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice));
1466: if (shift) {
1467: PetscCall(shift_diagonal(n,d_Maux,n,-shift));
1468: PetscCall(PetscLogGpuFlops(1.0*n));
1469: }
1470: #endif
1471: if (A!=B) PetscCall(MatDenseCUDARestoreArrayRead(A,(const PetscScalar**)&d_Aa));
1472: else PetscCallCUDA(cudaFree(d_Aa));
1474: /* estimate norm(A) and select the scaling factor */
1475: PetscCallCUBLAS(cublasXnrm2(cublasv2handle,n2,d_sMaux,one,&nrm));
1476: PetscCall(PetscLogGpuFlops(2.0*n*n));
1477: PetscCall(sexpm_params(nrm,&s,&k,&m));
1478: if (s==0 && k==1 && m==0) { /* exp(A) = I+A to eps! */
1479: if (shift) expshift = PetscExpReal(shift);
1480: PetscCall(shift_Cdiagonal(n,d_Maux,n,rone,rzero));
1481: if (shift) {
1482: PetscCallCUBLAS(cublasXscal(cublasv2handle,n2,&expshift,d_sMaux,one));
1483: PetscCall(PetscLogGpuFlops(1.0*(n+n2)));
1484: } else PetscCall(PetscLogGpuFlops(1.0*n));
1485: PetscCallCUDA(cudaMemcpy(d_Ba,d_sMaux,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice));
1486: PetscCallCUDA(cudaFree(d_isreal));
1487: PetscCallCUDA(cudaFree(d_sMaux));
1488: PetscCallCUDA(cudaFree(d_Maux));
1489: PetscCall(MatDenseCUDARestoreArrayWrite(B,&d_Ba));
1490: PetscFunctionReturn(PETSC_SUCCESS); /* quick return */
1491: }
1493: PetscCallCUDA(cudaMalloc((void **)&d_expmA,sizeof(PetscComplex)*n2));
1494: PetscCallCUDA(cudaMalloc((void **)&d_As,sizeof(PetscComplex)*n2));
1495: PetscCallCUDA(cudaMalloc((void **)&d_RR,sizeof(PetscComplex)*n2));
1496: d_expmA2 = d_expmA; d_RR2 = d_RR;
1497: PetscCall(PetscMalloc1(n,&piv));
1498: /* scale matrix */
1499: #if !defined(PETSC_USE_COMPLEX)
1500: PetscCall(copy_array2D_S2C(n,n,d_As,n,d_sMaux,n));
1501: #else
1502: PetscCallCUDA(cudaMemcpy(d_As,d_sMaux,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice));
1503: #endif
1504: scale = 1.0/PetscPowRealInt(2.0,s);
1505: PetscCallCUBLAS(cublasXCscal(cublasv2handle,n2,(const cuComplex *)&scale,(cuComplex *)d_As,one));
1506: PetscCall(SlepcLogGpuFlopsComplex(1.0*n2));
1508: /* evaluate Pade approximant (partial fraction or product form) */
1509: if (fn->method==3 || !m) { /* partial fraction */
1510: PetscCall(getcoeffs(k,m,&rsize,&psize,&remainsize,PETSC_TRUE));
1511: PetscCall(PetscBLASIntCast((PetscInt)PetscRealPartComplex(rsize),&irsize));
1512: PetscCall(PetscBLASIntCast((PetscInt)PetscRealPartComplex(psize),&ipsize));
1513: PetscCall(PetscBLASIntCast((PetscInt)PetscRealPartComplex(remainsize),&iremainsize));
1514: PetscCall(PetscMalloc3(irsize,&r,ipsize,&p,iremainsize,&remainterm));
1515: PetscCall(getcoeffs(k,m,r,p,remainterm,PETSC_FALSE));
1517: PetscCallCUDA(cudaMemset(d_expmA,0,sizeof(PetscComplex)*n2));
1518: #if !defined(PETSC_USE_COMPLEX)
1519: isreal = PETSC_TRUE;
1520: #else
1521: PetscCall(getisreal_array2D(n,n,d_Maux,n,d_isreal));
1522: PetscCallCUDA(cudaMemcpy(&isreal,d_isreal,sizeof(PetscBool),cudaMemcpyDeviceToHost));
1523: #endif
1524: if (isreal) {
1525: rsizediv2 = irsize/2;
1526: for (i=0;i<rsizediv2;i++) { /* use partial fraction to get R(As) */
1527: PetscCallCUDA(cudaMemcpy(d_Maux,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice));
1528: PetscCallCUDA(cudaMemset(d_RR,0,sizeof(PetscComplex)*n2));
1529: PetscCall(shift_Cdiagonal(n,d_Maux,n,-PetscRealPartComplex(p[2*i]),-PetscImaginaryPartComplex(p[2*i])));
1530: PetscCall(set_Cdiagonal(n,d_RR,n,PetscRealPartComplex(r[2*i]),PetscImaginaryPartComplex(r[2*i])));
1531: PetscCallMAGMA(magma_Cgesv_gpu,n,n,d_Maux,n,piv,d_RR,n);
1532: PetscCall(add_array2D_Conj(n,n,d_RR,n));
1533: PetscCallCUBLAS(cublasXCaxpy(cublasv2handle,n2,&cone,d_RR,one,d_expmA,one));
1534: /* shift(n) + gesv + axpy(n2) */
1535: PetscCall(SlepcLogGpuFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+2.0*n2));
1536: }
1538: mod = ipsize % 2;
1539: if (mod) {
1540: PetscCallCUDA(cudaMemcpy(d_Maux,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice));
1541: PetscCallCUDA(cudaMemset(d_RR,0,sizeof(PetscComplex)*n2));
1542: PetscCall(shift_Cdiagonal(n,d_Maux,n,-PetscRealPartComplex(p[ipsize-1]),-PetscImaginaryPartComplex(p[ipsize-1])));
1543: PetscCall(set_Cdiagonal(n,d_RR,n,PetscRealPartComplex(r[irsize-1]),PetscImaginaryPartComplex(r[irsize-1])));
1544: PetscCallMAGMA(magma_Cgesv_gpu,n,n,d_Maux,n,piv,d_RR,n);
1545: PetscCallCUBLAS(cublasXCaxpy(cublasv2handle,n2,&cone,d_RR,one,d_expmA,one));
1546: PetscCall(SlepcLogGpuFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+1.0*n2));
1547: }
1548: } else { /* complex */
1549: for (i=0;i<irsize;i++) { /* use partial fraction to get R(As) */
1550: PetscCallCUDA(cudaMemcpy(d_Maux,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice));
1551: PetscCallCUDA(cudaMemset(d_RR,0,sizeof(PetscComplex)*n2));
1552: PetscCall(shift_Cdiagonal(n,d_Maux,n,-PetscRealPartComplex(p[i]),-PetscImaginaryPartComplex(p[i])));
1553: PetscCall(set_Cdiagonal(n,d_RR,n,PetscRealPartComplex(r[i]),PetscImaginaryPartComplex(r[i])));
1554: PetscCallMAGMA(magma_Cgesv_gpu,n,n,d_Maux,n,piv,d_RR,n);
1555: PetscCallCUBLAS(cublasXCaxpy(cublasv2handle,n2,&cone,d_RR,one,d_expmA,one));
1556: PetscCall(SlepcLogGpuFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+1.0*n2));
1557: }
1558: }
1559: for (i=0;i<iremainsize;i++) {
1560: if (!i) {
1561: PetscCallCUDA(cudaMemset(d_RR,0,sizeof(PetscComplex)*n2));
1562: PetscCall(set_Cdiagonal(n,d_RR,n,PetscRealPartComplex(remainterm[iremainsize-1]),PetscImaginaryPartComplex(remainterm[iremainsize-1])));
1563: } else {
1564: PetscCallCUDA(cudaMemcpy(d_RR,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice));
1565: for (j=1;j<i;j++) {
1566: PetscCallCUBLAS(cublasXCgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&cone,d_RR,n,d_RR,n,&czero,d_Maux,n));
1567: SWAP(d_RR,d_Maux,aux);
1568: PetscCall(SlepcLogGpuFlopsComplex(2.0*n*n*n));
1569: }
1570: PetscCallCUBLAS(cublasXCscal(cublasv2handle,n2,&remainterm[iremainsize-1-i],d_RR,one));
1571: PetscCall(SlepcLogGpuFlopsComplex(1.0*n2));
1572: }
1573: PetscCallCUBLAS(cublasXCaxpy(cublasv2handle,n2,&cone,d_RR,one,d_expmA,one));
1574: PetscCall(SlepcLogGpuFlopsComplex(1.0*n2));
1575: }
1576: PetscCall(PetscFree3(r,p,remainterm));
1577: } else { /* product form, default */
1578: PetscCall(getcoeffsproduct(k,m,&rsize,&psize,&mult,PETSC_TRUE));
1579: PetscCall(PetscBLASIntCast((PetscInt)PetscRealPartComplex(rsize),&irsize));
1580: PetscCall(PetscBLASIntCast((PetscInt)PetscRealPartComplex(psize),&ipsize));
1581: PetscCall(PetscMalloc2(irsize,&rootp,ipsize,&rootq));
1582: PetscCall(getcoeffsproduct(k,m,rootp,rootq,&mult,PETSC_FALSE));
1584: PetscCallCUDA(cudaMemset(d_expmA,0,sizeof(PetscComplex)*n2));
1585: PetscCall(set_Cdiagonal(n,d_expmA,n,rone,rzero)); /* initialize */
1586: minlen = PetscMin(irsize,ipsize);
1587: for (i=0;i<minlen;i++) {
1588: PetscCallCUDA(cudaMemcpy(d_RR,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice));
1589: PetscCall(shift_Cdiagonal(n,d_RR,n,-PetscRealPartComplex(rootp[i]),-PetscImaginaryPartComplex(rootp[i])));
1590: PetscCallCUBLAS(cublasXCgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&cone,d_RR,n,d_expmA,n,&czero,d_Maux,n));
1591: SWAP(d_expmA,d_Maux,aux);
1592: PetscCallCUDA(cudaMemcpy(d_RR,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice));
1593: PetscCall(shift_Cdiagonal(n,d_RR,n,-PetscRealPartComplex(rootq[i]),-PetscImaginaryPartComplex(rootq[i])));
1594: PetscCallMAGMA(magma_Cgesv_gpu,n,n,d_RR,n,piv,d_expmA,n);
1595: /* shift(n) + gemm + shift(n) + gesv */
1596: PetscCall(SlepcLogGpuFlopsComplex(1.0*n+(2.0*n*n*n)+1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)));
1597: }
1598: /* extra enumerator */
1599: for (i=minlen;i<irsize;i++) {
1600: PetscCallCUDA(cudaMemcpy(d_RR,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice));
1601: PetscCall(shift_Cdiagonal(n,d_RR,n,-PetscRealPartComplex(rootp[i]),-PetscImaginaryPartComplex(rootp[i])));
1602: PetscCallCUBLAS(cublasXCgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&cone,d_RR,n,d_expmA,n,&czero,d_Maux,n));
1603: SWAP(d_expmA,d_Maux,aux);
1604: PetscCall(SlepcLogGpuFlopsComplex(1.0*n+2.0*n*n*n));
1605: }
1606: /* extra denominator */
1607: for (i=minlen;i<ipsize;i++) {
1608: PetscCallCUDA(cudaMemcpy(d_RR,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice));
1609: PetscCall(shift_Cdiagonal(n,d_RR,n,-PetscRealPartComplex(rootq[i]),-PetscImaginaryPartComplex(rootq[i])));
1610: PetscCallMAGMA(magma_Cgesv_gpu,n,n,d_RR,n,piv,d_expmA,n);
1611: PetscCall(SlepcLogGpuFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)));
1612: }
1613: PetscCallCUBLAS(cublasXCscal(cublasv2handle,n2,&mult,d_expmA,one));
1614: PetscCall(SlepcLogGpuFlopsComplex(1.0*n2));
1615: PetscCall(PetscFree2(rootp,rootq));
1616: }
1618: #if !defined(PETSC_USE_COMPLEX)
1619: PetscCall(copy_array2D_C2S(n,n,d_Ba2,n,d_expmA,n));
1620: #else
1621: PetscCallCUDA(cudaMemcpy(d_Ba2,d_expmA,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice));
1622: #endif
1624: /* perform repeated squaring */
1625: for (i=0;i<s;i++) { /* final squaring */
1626: PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_Ba2,n,d_Ba2,n,&szero,d_sMaux,n));
1627: SWAP(d_Ba2,d_sMaux,saux);
1628: PetscCall(PetscLogGpuFlops(2.0*n*n*n));
1629: }
1630: if (d_Ba2!=d_Ba) {
1631: PetscCallCUDA(cudaMemcpy(d_Ba,d_Ba2,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice));
1632: d_sMaux = d_Ba2;
1633: }
1634: if (shift) {
1635: expshift = PetscExpReal(shift);
1636: PetscCallCUBLAS(cublasXscal(cublasv2handle,n2,&expshift,d_Ba,one));
1637: PetscCall(PetscLogGpuFlops(1.0*n2));
1638: }
1640: PetscCall(PetscLogGpuTimeEnd());
1642: /* restore pointers */
1643: d_Maux = d_Maux2; d_expmA = d_expmA2; d_RR = d_RR2;
1644: PetscCall(MatDenseCUDARestoreArrayWrite(B,&d_Ba));
1645: PetscCallCUDA(cudaFree(d_isreal));
1646: PetscCallCUDA(cudaFree(d_sMaux));
1647: PetscCallCUDA(cudaFree(d_Maux));
1648: PetscCallCUDA(cudaFree(d_expmA));
1649: PetscCallCUDA(cudaFree(d_As));
1650: PetscCallCUDA(cudaFree(d_RR));
1651: PetscCall(PetscFree(piv));
1652: PetscFunctionReturn(PETSC_SUCCESS);
1653: }
1654: #endif /* PETSC_HAVE_MAGMA */
1655: #endif /* PETSC_HAVE_CUDA */
1657: static PetscErrorCode FNView_Exp(FN fn,PetscViewer viewer)
1658: {
1659: PetscBool isascii;
1660: char str[50];
1661: const char *methodname[] = {
1662: "scaling & squaring, [m/m] Pade approximant (Higham)",
1663: "scaling & squaring, [6/6] Pade approximant",
1664: "scaling & squaring, subdiagonal Pade approximant (product form)",
1665: "scaling & squaring, subdiagonal Pade approximant (partial fraction)"
1666: };
1667: const int nmeth=PETSC_STATIC_ARRAY_LENGTH(methodname);
1669: PetscFunctionBegin;
1670: PetscCall(PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&isascii));
1671: if (isascii) {
1672: if (fn->beta==(PetscScalar)1.0) {
1673: if (fn->alpha==(PetscScalar)1.0) PetscCall(PetscViewerASCIIPrintf(viewer," exponential: exp(x)\n"));
1674: else {
1675: PetscCall(SlepcSNPrintfScalar(str,sizeof(str),fn->alpha,PETSC_TRUE));
1676: PetscCall(PetscViewerASCIIPrintf(viewer," exponential: exp(%s*x)\n",str));
1677: }
1678: } else {
1679: PetscCall(SlepcSNPrintfScalar(str,sizeof(str),fn->beta,PETSC_TRUE));
1680: if (fn->alpha==(PetscScalar)1.0) PetscCall(PetscViewerASCIIPrintf(viewer," exponential: %s*exp(x)\n",str));
1681: else {
1682: PetscCall(PetscViewerASCIIPrintf(viewer," exponential: %s",str));
1683: PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE));
1684: PetscCall(SlepcSNPrintfScalar(str,sizeof(str),fn->alpha,PETSC_TRUE));
1685: PetscCall(PetscViewerASCIIPrintf(viewer,"*exp(%s*x)\n",str));
1686: PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE));
1687: }
1688: }
1689: if (fn->method<nmeth) PetscCall(PetscViewerASCIIPrintf(viewer," computing matrix functions with: %s\n",methodname[fn->method]));
1690: }
1691: PetscFunctionReturn(PETSC_SUCCESS);
1692: }
1694: SLEPC_EXTERN PetscErrorCode FNCreate_Exp(FN fn)
1695: {
1696: PetscFunctionBegin;
1697: fn->ops->evaluatefunction = FNEvaluateFunction_Exp;
1698: fn->ops->evaluatederivative = FNEvaluateDerivative_Exp;
1699: fn->ops->evaluatefunctionmat[0] = FNEvaluateFunctionMat_Exp_Higham;
1700: fn->ops->evaluatefunctionmat[1] = FNEvaluateFunctionMat_Exp_Pade;
1701: fn->ops->evaluatefunctionmat[2] = FNEvaluateFunctionMat_Exp_GuettelNakatsukasa; /* product form */
1702: fn->ops->evaluatefunctionmat[3] = FNEvaluateFunctionMat_Exp_GuettelNakatsukasa; /* partial fraction */
1703: #if defined(PETSC_HAVE_CUDA)
1704: fn->ops->evaluatefunctionmatcuda[1] = FNEvaluateFunctionMat_Exp_Pade_CUDA;
1705: #if defined(PETSC_HAVE_MAGMA)
1706: fn->ops->evaluatefunctionmatcuda[0] = FNEvaluateFunctionMat_Exp_Higham_CUDAm;
1707: fn->ops->evaluatefunctionmatcuda[1] = FNEvaluateFunctionMat_Exp_Pade_CUDAm;
1708: fn->ops->evaluatefunctionmatcuda[2] = FNEvaluateFunctionMat_Exp_GuettelNakatsukasa_CUDAm; /* product form */
1709: fn->ops->evaluatefunctionmatcuda[3] = FNEvaluateFunctionMat_Exp_GuettelNakatsukasa_CUDAm; /* partial fraction */
1710: #endif
1711: #endif
1712: fn->ops->view = FNView_Exp;
1713: PetscFunctionReturn(PETSC_SUCCESS);
1714: }