Actual source code: fnexp.c
slepc-3.22.2 2024-12-02
1: /*
2: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
3: SLEPc - Scalable Library for Eigenvalue Problem Computations
4: Copyright (c) 2002-, Universitat Politecnica de Valencia, Spain
6: This file is part of SLEPc.
7: SLEPc is distributed under a 2-clause BSD license (see LICENSE).
8: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
9: */
10: /*
11: Exponential function exp(x)
12: */
14: #include <slepc/private/fnimpl.h>
15: #include <slepcblaslapack.h>
17: static PetscErrorCode FNEvaluateFunction_Exp(FN fn,PetscScalar x,PetscScalar *y)
18: {
19: PetscFunctionBegin;
20: *y = PetscExpScalar(x);
21: PetscFunctionReturn(PETSC_SUCCESS);
22: }
24: static PetscErrorCode FNEvaluateDerivative_Exp(FN fn,PetscScalar x,PetscScalar *y)
25: {
26: PetscFunctionBegin;
27: *y = PetscExpScalar(x);
28: PetscFunctionReturn(PETSC_SUCCESS);
29: }
31: #define MAX_PADE 6
33: static PetscErrorCode FNEvaluateFunctionMat_Exp_Pade(FN fn,Mat A,Mat B)
34: {
35: PetscBLASInt n=0,ld,ld2,*ipiv,info,inc=1;
36: PetscInt m,j,k,sexp;
37: PetscBool odd;
38: const PetscInt p=MAX_PADE;
39: PetscReal c[MAX_PADE+1],s,*rwork;
40: PetscScalar scale,mone=-1.0,one=1.0,two=2.0,zero=0.0;
41: PetscScalar *Ba,*As,*A2,*Q,*P,*W,*aux;
42: const PetscScalar *Aa;
44: PetscFunctionBegin;
45: PetscCall(MatDenseGetArrayRead(A,&Aa));
46: PetscCall(MatDenseGetArray(B,&Ba));
47: PetscCall(MatGetSize(A,&m,NULL));
48: PetscCall(PetscBLASIntCast(m,&n));
49: ld = n;
50: ld2 = ld*ld;
51: P = Ba;
52: PetscCall(PetscMalloc6(m*m,&Q,m*m,&W,m*m,&As,m*m,&A2,ld,&rwork,ld,&ipiv));
53: PetscCall(PetscArraycpy(As,Aa,ld2));
55: /* Pade' coefficients */
56: c[0] = 1.0;
57: for (k=1;k<=p;k++) c[k] = c[k-1]*(p+1-k)/(k*(2*p+1-k));
59: /* Scaling */
60: s = LAPACKlange_("I",&n,&n,As,&ld,rwork);
61: PetscCall(PetscLogFlops(1.0*n*n));
62: if (s>0.5) {
63: sexp = PetscMax(0,(int)(PetscLogReal(s)/PetscLogReal(2.0))+2);
64: scale = PetscPowRealInt(2.0,-sexp);
65: PetscCallBLAS("BLASscal",BLASscal_(&ld2,&scale,As,&inc));
66: PetscCall(PetscLogFlops(1.0*n*n));
67: } else sexp = 0;
69: /* Horner evaluation */
70: PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,As,&ld,As,&ld,&zero,A2,&ld));
71: PetscCall(PetscLogFlops(2.0*n*n*n));
72: PetscCall(PetscArrayzero(Q,ld2));
73: PetscCall(PetscArrayzero(P,ld2));
74: for (j=0;j<n;j++) {
75: Q[j+j*ld] = c[p];
76: P[j+j*ld] = c[p-1];
77: }
79: odd = PETSC_TRUE;
80: for (k=p-1;k>0;k--) {
81: if (odd) {
82: PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,Q,&ld,A2,&ld,&zero,W,&ld));
83: SlepcSwap(Q,W,aux);
84: for (j=0;j<n;j++) Q[j+j*ld] += c[k-1];
85: odd = PETSC_FALSE;
86: } else {
87: PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,A2,&ld,&zero,W,&ld));
88: SlepcSwap(P,W,aux);
89: for (j=0;j<n;j++) P[j+j*ld] += c[k-1];
90: odd = PETSC_TRUE;
91: }
92: PetscCall(PetscLogFlops(2.0*n*n*n));
93: }
94: /*if (odd) {
95: PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,Q,&ld,As,&ld,&zero,W,&ld));
96: SlepcSwap(Q,W,aux);
97: PetscCallBLAS("BLASaxpy",BLASaxpy_(&ld2,&mone,P,&inc,Q,&inc));
98: PetscCallBLAS("LAPACKgesv",LAPACKgesv_(&n,&n,Q,&ld,ipiv,P,&ld,&info));
99: SlepcCheckLapackInfo("gesv",info);
100: PetscCallBLAS("BLASscal",BLASscal_(&ld2,&two,P,&inc));
101: for (j=0;j<n;j++) P[j+j*ld] += 1.0;
102: PetscCallBLAS("BLASscal",BLASscal_(&ld2,&mone,P,&inc));
103: } else {*/
104: PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,As,&ld,&zero,W,&ld));
105: SlepcSwap(P,W,aux);
106: PetscCallBLAS("BLASaxpy",BLASaxpy_(&ld2,&mone,P,&inc,Q,&inc));
107: PetscCallBLAS("LAPACKgesv",LAPACKgesv_(&n,&n,Q,&ld,ipiv,P,&ld,&info));
108: SlepcCheckLapackInfo("gesv",info);
109: PetscCallBLAS("BLASscal",BLASscal_(&ld2,&two,P,&inc));
110: for (j=0;j<n;j++) P[j+j*ld] += 1.0;
111: /*}*/
112: PetscCall(PetscLogFlops(2.0*n*n*n+2.0*n*n*n/3.0+4.0*n*n));
114: for (k=1;k<=sexp;k++) {
115: PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,P,&ld,&zero,W,&ld));
116: PetscCall(PetscArraycpy(P,W,ld2));
117: }
118: if (P!=Ba) PetscCall(PetscArraycpy(Ba,P,ld2));
119: PetscCall(PetscLogFlops(2.0*n*n*n*sexp));
121: PetscCall(PetscFree6(Q,W,As,A2,rwork,ipiv));
122: PetscCall(MatDenseRestoreArrayRead(A,&Aa));
123: PetscCall(MatDenseRestoreArray(B,&Ba));
124: PetscFunctionReturn(PETSC_SUCCESS);
125: }
127: #if defined(PETSC_HAVE_COMPLEX)
128: /*
129: * Set scaling factor (s) and Pade degree (k,m)
130: */
131: static PetscErrorCode sexpm_params(PetscReal nrm,PetscInt *s,PetscInt *k,PetscInt *m)
132: {
133: PetscFunctionBegin;
134: if (nrm>1) {
135: if (nrm<200) {*s = 4; *k = 5; *m = *k-1;}
136: else if (nrm<1e4) {*s = 4; *k = 4; *m = *k+1;}
137: else if (nrm<1e6) {*s = 4; *k = 3; *m = *k+1;}
138: else if (nrm<1e9) {*s = 3; *k = 3; *m = *k+1;}
139: else if (nrm<1e11) {*s = 2; *k = 3; *m = *k+1;}
140: else if (nrm<1e12) {*s = 2; *k = 2; *m = *k+1;}
141: else if (nrm<1e14) {*s = 2; *k = 1; *m = *k+1;}
142: else {*s = 1; *k = 1; *m = *k+1;}
143: } else { /* nrm<1 */
144: if (nrm>0.5) {*s = 4; *k = 4; *m = *k-1;}
145: else if (nrm>0.3) {*s = 3; *k = 4; *m = *k-1;}
146: else if (nrm>0.15) {*s = 2; *k = 4; *m = *k-1;}
147: else if (nrm>0.07) {*s = 1; *k = 4; *m = *k-1;}
148: else if (nrm>0.01) {*s = 0; *k = 4; *m = *k-1;}
149: else if (nrm>3e-4) {*s = 0; *k = 3; *m = *k-1;}
150: else if (nrm>1e-5) {*s = 0; *k = 3; *m = 0;}
151: else if (nrm>1e-8) {*s = 0; *k = 2; *m = 0;}
152: else {*s = 0; *k = 1; *m = 0;}
153: }
154: PetscFunctionReturn(PETSC_SUCCESS);
155: }
157: /*
158: * Partial fraction form coefficients.
159: * If query, the function returns the size necessary to store the coefficients.
160: */
161: static PetscErrorCode getcoeffs(PetscInt k,PetscInt m,PetscComplex *r,PetscComplex *q,PetscComplex *remain,PetscBool query)
162: {
163: PetscInt i;
164: const PetscComplex /* m == k+1 */
165: p1r4[5] = {-1.582680186458572e+01 - 2.412564578224361e+01*PETSC_i,
166: -1.582680186458572e+01 + 2.412564578224361e+01*PETSC_i,
167: 1.499984465975511e+02 + 6.804227952202417e+01*PETSC_i,
168: 1.499984465975511e+02 - 6.804227952202417e+01*PETSC_i,
169: -2.733432894659307e+02 },
170: p1q4[5] = { 3.655694325463550e+00 + 6.543736899360086e+00*PETSC_i,
171: 3.655694325463550e+00 - 6.543736899360086e+00*PETSC_i,
172: 5.700953298671832e+00 + 3.210265600308496e+00*PETSC_i,
173: 5.700953298671832e+00 - 3.210265600308496e+00*PETSC_i,
174: 6.286704751729261e+00 },
175: p1r3[4] = {-1.130153999597152e+01 + 1.247167585025031e+01*PETSC_i,
176: -1.130153999597152e+01 - 1.247167585025031e+01*PETSC_i,
177: 1.330153999597152e+01 - 6.007173273704750e+01*PETSC_i,
178: 1.330153999597152e+01 + 6.007173273704750e+01*PETSC_i},
179: p1q3[4] = { 3.212806896871536e+00 + 4.773087433276636e+00*PETSC_i,
180: 3.212806896871536e+00 - 4.773087433276636e+00*PETSC_i,
181: 4.787193103128464e+00 + 1.567476416895212e+00*PETSC_i,
182: 4.787193103128464e+00 - 1.567476416895212e+00*PETSC_i},
183: p1r2[3] = { 7.648749087422928e+00 + 4.171640244747463e+00*PETSC_i,
184: 7.648749087422928e+00 - 4.171640244747463e+00*PETSC_i,
185: -1.829749817484586e+01 },
186: p1q2[3] = { 2.681082873627756e+00 + 3.050430199247411e+00*PETSC_i,
187: 2.681082873627756e+00 - 3.050430199247411e+00*PETSC_i,
188: 3.637834252744491e+00 },
189: p1r1[2] = { 1.000000000000000e+00 - 3.535533905932738e+00*PETSC_i,
190: 1.000000000000000e+00 + 3.535533905932738e+00*PETSC_i},
191: p1q1[2] = { 2.000000000000000e+00 + 1.414213562373095e+00*PETSC_i,
192: 2.000000000000000e+00 - 1.414213562373095e+00*PETSC_i};
193: const PetscComplex /* m == k-1 */
194: m1r5[4] = {-1.423367961376821e+02 - 1.385465094833037e+01*PETSC_i,
195: -1.423367961376821e+02 + 1.385465094833037e+01*PETSC_i,
196: 2.647367961376822e+02 - 4.814394493714596e+02*PETSC_i,
197: 2.647367961376822e+02 + 4.814394493714596e+02*PETSC_i},
198: m1q5[4] = { 5.203941240131764e+00 + 5.805856841805367e+00*PETSC_i,
199: 5.203941240131764e+00 - 5.805856841805367e+00*PETSC_i,
200: 6.796058759868242e+00 + 1.886649260140217e+00*PETSC_i,
201: 6.796058759868242e+00 - 1.886649260140217e+00*PETSC_i},
202: m1r4[3] = { 2.484269593165883e+01 + 7.460342395992306e+01*PETSC_i,
203: 2.484269593165883e+01 - 7.460342395992306e+01*PETSC_i,
204: -1.734353918633177e+02 },
205: m1q4[3] = { 4.675757014491557e+00 + 3.913489560603711e+00*PETSC_i,
206: 4.675757014491557e+00 - 3.913489560603711e+00*PETSC_i,
207: 5.648485971016893e+00 },
208: m1r3[2] = { 2.533333333333333e+01 - 2.733333333333333e+01*PETSC_i,
209: 2.533333333333333e+01 + 2.733333333333333e+01*PETSC_i},
210: m1q3[2] = { 4.000000000000000e+00 + 2.000000000000000e+00*PETSC_i,
211: 4.000000000000000e+00 - 2.000000000000000e+00*PETSC_i};
212: const PetscScalar /* m == k-1 */
213: m1remain5[2] = { 2.000000000000000e-01, 9.800000000000000e+00},
214: m1remain4[2] = {-2.500000000000000e-01, -7.750000000000000e+00},
215: m1remain3[2] = { 3.333333333333333e-01, 5.666666666666667e+00},
216: m1remain2[2] = {-0.5, -3.5},
217: remain3[4] = {1.0/6.0, 1.0/2.0, 1, 1},
218: remain2[3] = {1.0/2.0, 1, 1};
220: PetscFunctionBegin;
221: if (query) { /* query about buffer's size */
222: if (m==k+1) {
223: *remain = 0;
224: *r = *q = k+1;
225: PetscFunctionReturn(PETSC_SUCCESS); /* quick return */
226: }
227: if (m==k-1) {
228: *remain = 2;
229: if (k==5) *r = *q = 4;
230: else if (k==4) *r = *q = 3;
231: else if (k==3) *r = *q = 2;
232: else if (k==2) *r = *q = 1;
233: }
234: if (m==0) {
235: *r = *q = 0;
236: *remain = k+1;
237: }
238: } else {
239: if (m==k+1) {
240: if (k==4) {
241: for (i=0;i<5;i++) { r[i] = p1r4[i]; q[i] = p1q4[i]; }
242: } else if (k==3) {
243: for (i=0;i<4;i++) { r[i] = p1r3[i]; q[i] = p1q3[i]; }
244: } else if (k==2) {
245: for (i=0;i<3;i++) { r[i] = p1r2[i]; q[i] = p1q2[i]; }
246: } else if (k==1) {
247: for (i=0;i<2;i++) { r[i] = p1r1[i]; q[i] = p1q1[i]; }
248: }
249: PetscFunctionReturn(PETSC_SUCCESS); /* quick return */
250: }
251: if (m==k-1) {
252: if (k==5) {
253: for (i=0;i<4;i++) { r[i] = m1r5[i]; q[i] = m1q5[i]; }
254: for (i=0;i<2;i++) remain[i] = m1remain5[i];
255: } else if (k==4) {
256: for (i=0;i<3;i++) { r[i] = m1r4[i]; q[i] = m1q4[i]; }
257: for (i=0;i<2;i++) remain[i] = m1remain4[i];
258: } else if (k==3) {
259: for (i=0;i<2;i++) { r[i] = m1r3[i]; q[i] = m1q3[i]; remain[i] = m1remain3[i]; }
260: } else if (k==2) {
261: r[0] = -13.5; q[0] = 3;
262: for (i=0;i<2;i++) remain[i] = m1remain2[i];
263: }
264: }
265: if (m==0) {
266: r = q = NULL;
267: if (k==3) {
268: for (i=0;i<4;i++) remain[i] = remain3[i];
269: } else if (k==2) {
270: for (i=0;i<3;i++) remain[i] = remain2[i];
271: }
272: }
273: }
274: PetscFunctionReturn(PETSC_SUCCESS);
275: }
277: /*
278: * Product form coefficients.
279: * If query, the function returns the size necessary to store the coefficients.
280: */
281: static PetscErrorCode getcoeffsproduct(PetscInt k,PetscInt m,PetscComplex *p,PetscComplex *q,PetscComplex *mult,PetscBool query)
282: {
283: PetscInt i;
284: const PetscComplex /* m == k+1 */
285: p1p4[4] = {-5.203941240131764e+00 + 5.805856841805367e+00*PETSC_i,
286: -5.203941240131764e+00 - 5.805856841805367e+00*PETSC_i,
287: -6.796058759868242e+00 + 1.886649260140217e+00*PETSC_i,
288: -6.796058759868242e+00 - 1.886649260140217e+00*PETSC_i},
289: p1q4[5] = { 3.655694325463550e+00 + 6.543736899360086e+00*PETSC_i,
290: 3.655694325463550e+00 - 6.543736899360086e+00*PETSC_i,
291: 6.286704751729261e+00 ,
292: 5.700953298671832e+00 + 3.210265600308496e+00*PETSC_i,
293: 5.700953298671832e+00 - 3.210265600308496e+00*PETSC_i},
294: p1p3[3] = {-4.675757014491557e+00 + 3.913489560603711e+00*PETSC_i,
295: -4.675757014491557e+00 - 3.913489560603711e+00*PETSC_i,
296: -5.648485971016893e+00 },
297: p1q3[4] = { 3.212806896871536e+00 + 4.773087433276636e+00*PETSC_i,
298: 3.212806896871536e+00 - 4.773087433276636e+00*PETSC_i,
299: 4.787193103128464e+00 + 1.567476416895212e+00*PETSC_i,
300: 4.787193103128464e+00 - 1.567476416895212e+00*PETSC_i},
301: p1p2[2] = {-4.00000000000000e+00 + 2.000000000000000e+00*PETSC_i,
302: -4.00000000000000e+00 - 2.000000000000000e+00*PETSC_i},
303: p1q2[3] = { 2.681082873627756e+00 + 3.050430199247411e+00*PETSC_i,
304: 2.681082873627756e+00 - 3.050430199247411e+00*PETSC_i,
305: 3.637834252744491e+00 },
306: p1q1[2] = { 2.000000000000000e+00 + 1.414213562373095e+00*PETSC_i,
307: 2.000000000000000e+00 - 1.414213562373095e+00*PETSC_i};
308: const PetscComplex /* m == k-1 */
309: m1p5[5] = {-3.655694325463550e+00 + 6.543736899360086e+00*PETSC_i,
310: -3.655694325463550e+00 - 6.543736899360086e+00*PETSC_i,
311: -6.286704751729261e+00 ,
312: -5.700953298671832e+00 + 3.210265600308496e+00*PETSC_i,
313: -5.700953298671832e+00 - 3.210265600308496e+00*PETSC_i},
314: m1q5[4] = { 5.203941240131764e+00 + 5.805856841805367e+00*PETSC_i,
315: 5.203941240131764e+00 - 5.805856841805367e+00*PETSC_i,
316: 6.796058759868242e+00 + 1.886649260140217e+00*PETSC_i,
317: 6.796058759868242e+00 - 1.886649260140217e+00*PETSC_i},
318: m1p4[4] = {-3.212806896871536e+00 + 4.773087433276636e+00*PETSC_i,
319: -3.212806896871536e+00 - 4.773087433276636e+00*PETSC_i,
320: -4.787193103128464e+00 + 1.567476416895212e+00*PETSC_i,
321: -4.787193103128464e+00 - 1.567476416895212e+00*PETSC_i},
322: m1q4[3] = { 4.675757014491557e+00 + 3.913489560603711e+00*PETSC_i,
323: 4.675757014491557e+00 - 3.913489560603711e+00*PETSC_i,
324: 5.648485971016893e+00 },
325: m1p3[3] = {-2.681082873627756e+00 + 3.050430199247411e+00*PETSC_i,
326: -2.681082873627756e+00 - 3.050430199247411e+00*PETSC_i,
327: -3.637834252744491e+00 },
328: m1q3[2] = { 4.000000000000000e+00 + 2.000000000000000e+00*PETSC_i,
329: 4.000000000000000e+00 - 2.000000000000001e+00*PETSC_i},
330: m1p2[2] = {-2.000000000000000e+00 + 1.414213562373095e+00*PETSC_i,
331: -2.000000000000000e+00 - 1.414213562373095e+00*PETSC_i};
333: PetscFunctionBegin;
334: if (query) {
335: if (m == k+1) {
336: *mult = 1;
337: *p = k;
338: *q = k+1;
339: PetscFunctionReturn(PETSC_SUCCESS);
340: }
341: if (m==k-1) {
342: *mult = 1;
343: *p = k;
344: *q = k-1;
345: }
346: } else {
347: if (m == k+1) {
348: *mult = PetscPowInt(-1,m);
349: *mult *= m;
350: if (k==4) {
351: for (i=0;i<4;i++) { p[i] = p1p4[i]; q[i] = p1q4[i]; }
352: q[4] = p1q4[4];
353: } else if (k==3) {
354: for (i=0;i<3;i++) { p[i] = p1p3[i]; q[i] = p1q3[i]; }
355: q[3] = p1q3[3];
356: } else if (k==2) {
357: for (i=0;i<2;i++) { p[i] = p1p2[i]; q[i] = p1q2[i]; }
358: q[2] = p1q2[2];
359: } else if (k==1) {
360: p[0] = -3;
361: for (i=0;i<2;i++) q[i] = p1q1[i];
362: }
363: PetscFunctionReturn(PETSC_SUCCESS);
364: }
365: if (m==k-1) {
366: *mult = PetscPowInt(-1,m);
367: *mult /= k;
368: if (k==5) {
369: for (i=0;i<4;i++) { p[i] = m1p5[i]; q[i] = m1q5[i]; }
370: p[4] = m1p5[4];
371: } else if (k==4) {
372: for (i=0;i<3;i++) { p[i] = m1p4[i]; q[i] = m1q4[i]; }
373: p[3] = m1p4[3];
374: } else if (k==3) {
375: for (i=0;i<2;i++) { p[i] = m1p3[i]; q[i] = m1q3[i]; }
376: p[2] = m1p3[2];
377: } else if (k==2) {
378: for (i=0;i<2;i++) p[i] = m1p2[i];
379: q[0] = 3;
380: }
381: }
382: }
383: PetscFunctionReturn(PETSC_SUCCESS);
384: }
385: #endif /* PETSC_HAVE_COMPLEX */
387: #if defined(PETSC_USE_COMPLEX)
388: static PetscErrorCode getisreal(PetscInt n,PetscComplex *a,PetscBool *result)
389: {
390: PetscInt i;
392: PetscFunctionBegin;
393: *result=PETSC_TRUE;
394: for (i=0;i<n&&*result;i++) {
395: if (PetscImaginaryPartComplex(a[i])) *result=PETSC_FALSE;
396: }
397: PetscFunctionReturn(PETSC_SUCCESS);
398: }
399: #endif
401: /*
402: * Matrix exponential implementation based on algorithm and matlab code by Stefan Guettel
403: * and Yuji Nakatsukasa
404: *
405: * Stefan Guettel and Yuji Nakatsukasa, "Scaled and Squared Subdiagonal Pade
406: * Approximation for the Matrix Exponential",
407: * SIAM J. Matrix Anal. Appl. 37(1):145-170, 2016.
408: * https://doi.org/10.1137/15M1027553
409: */
410: static PetscErrorCode FNEvaluateFunctionMat_Exp_GuettelNakatsukasa(FN fn,Mat A,Mat B)
411: {
412: #if !defined(PETSC_HAVE_COMPLEX)
413: PetscFunctionBegin;
414: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This function requires C99 or C++ complex support");
415: #else
416: PetscInt i,j,n_,s,k,m,mod;
417: PetscBLASInt n=0,n2=0,irsize=0,rsizediv2,ipsize=0,iremainsize=0,info,*piv,minlen,lwork=0,one=1;
418: PetscReal nrm,shift=0.0;
419: #if defined(PETSC_USE_COMPLEX)
420: PetscReal *rwork=NULL;
421: #endif
422: PetscComplex *As,*RR,*RR2,*expmA,*expmA2,*Maux,*Maux2,rsize,*r,psize,*p,remainsize,*remainterm,*rootp,*rootq,mult=0.0,scale,cone=1.0,czero=0.0,*aux;
423: PetscScalar *Ba,*Ba2,*sMaux,*wr,*wi,expshift,sone=1.0,szero=0.0,*saux;
424: const PetscScalar *Aa;
425: PetscBool isreal,flg;
426: PetscBLASInt query=-1;
427: PetscScalar work1,*work;
429: PetscFunctionBegin;
430: PetscCall(MatGetSize(A,&n_,NULL));
431: PetscCall(PetscBLASIntCast(n_,&n));
432: PetscCall(MatDenseGetArrayRead(A,&Aa));
433: PetscCall(MatDenseGetArray(B,&Ba));
434: Ba2 = Ba;
435: PetscCall(PetscBLASIntCast(n*n,&n2));
437: PetscCall(PetscMalloc2(n2,&sMaux,n2,&Maux));
438: Maux2 = Maux;
439: PetscCall(PetscOptionsGetReal(NULL,NULL,"-fn_expm_estimated_eig",&shift,&flg));
440: if (!flg) {
441: PetscCall(PetscMalloc2(n,&wr,n,&wi));
442: PetscCall(PetscArraycpy(sMaux,Aa,n2));
443: /* estimate rightmost eigenvalue and shift A with it */
444: #if !defined(PETSC_USE_COMPLEX)
445: PetscCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,sMaux,&n,wr,wi,NULL,&n,NULL,&n,&work1,&query,&info));
446: SlepcCheckLapackInfo("geev",info);
447: PetscCall(PetscBLASIntCast((PetscInt)work1,&lwork));
448: PetscCall(PetscMalloc1(lwork,&work));
449: PetscCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,sMaux,&n,wr,wi,NULL,&n,NULL,&n,work,&lwork,&info));
450: PetscCall(PetscFree(work));
451: #else
452: PetscCall(PetscArraycpy(Maux,Aa,n2));
453: PetscCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,Maux,&n,wr,NULL,&n,NULL,&n,&work1,&query,rwork,&info));
454: SlepcCheckLapackInfo("geev",info);
455: PetscCall(PetscBLASIntCast((PetscInt)PetscRealPart(work1),&lwork));
456: PetscCall(PetscMalloc2(2*n,&rwork,lwork,&work));
457: PetscCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,Maux,&n,wr,NULL,&n,NULL,&n,work,&lwork,rwork,&info));
458: PetscCall(PetscFree2(rwork,work));
459: #endif
460: SlepcCheckLapackInfo("geev",info);
461: PetscCall(PetscLogFlops(25.0*n*n*n+(n*n*n)/3.0+1.0*n*n*n));
463: shift = PetscRealPart(wr[0]);
464: for (i=1;i<n;i++) {
465: if (PetscRealPart(wr[i]) > shift) shift = PetscRealPart(wr[i]);
466: }
467: PetscCall(PetscFree2(wr,wi));
468: }
469: /* shift so that largest real part is (about) 0 */
470: PetscCall(PetscArraycpy(sMaux,Aa,n2));
471: if (shift) {
472: for (i=0;i<n;i++) sMaux[i+i*n] -= shift;
473: PetscCall(PetscLogFlops(1.0*n));
474: }
475: #if defined(PETSC_USE_COMPLEX)
476: PetscCall(PetscArraycpy(Maux,Aa,n2));
477: if (shift) {
478: for (i=0;i<n;i++) Maux[i+i*n] -= shift;
479: PetscCall(PetscLogFlops(1.0*n));
480: }
481: #endif
483: /* estimate norm(A) and select the scaling factor */
484: nrm = LAPACKlange_("O",&n,&n,sMaux,&n,NULL);
485: PetscCall(PetscLogFlops(1.0*n*n));
486: PetscCall(sexpm_params(nrm,&s,&k,&m));
487: if (s==0 && k==1 && m==0) { /* exp(A) = I+A to eps! */
488: if (shift) expshift = PetscExpReal(shift);
489: for (i=0;i<n;i++) sMaux[i+i*n] += 1.0;
490: if (shift) {
491: PetscCallBLAS("BLASscal",BLASscal_(&n2,&expshift,sMaux,&one));
492: PetscCall(PetscLogFlops(1.0*(n+n2)));
493: } else PetscCall(PetscLogFlops(1.0*n));
494: PetscCall(PetscArraycpy(Ba,sMaux,n2));
495: PetscCall(PetscFree2(sMaux,Maux));
496: PetscCall(MatDenseRestoreArrayRead(A,&Aa));
497: PetscCall(MatDenseRestoreArray(B,&Ba));
498: PetscFunctionReturn(PETSC_SUCCESS); /* quick return */
499: }
501: PetscCall(PetscMalloc4(n2,&expmA,n2,&As,n2,&RR,n,&piv));
502: expmA2 = expmA; RR2 = RR;
503: /* scale matrix */
504: #if !defined(PETSC_USE_COMPLEX)
505: for (i=0;i<n2;i++) {
506: As[i] = sMaux[i];
507: }
508: #else
509: PetscCall(PetscArraycpy(As,sMaux,n2));
510: #endif
511: scale = 1.0/PetscPowRealInt(2.0,s);
512: PetscCallBLAS("BLASCOMPLEXscal",BLASCOMPLEXscal_(&n2,&scale,As,&one));
513: PetscCall(SlepcLogFlopsComplex(1.0*n2));
515: /* evaluate Pade approximant (partial fraction or product form) */
516: if (fn->method==3 || !m) { /* partial fraction */
517: PetscCall(getcoeffs(k,m,&rsize,&psize,&remainsize,PETSC_TRUE));
518: PetscCall(PetscBLASIntCast((PetscInt)PetscRealPartComplex(rsize),&irsize));
519: PetscCall(PetscBLASIntCast((PetscInt)PetscRealPartComplex(psize),&ipsize));
520: PetscCall(PetscBLASIntCast((PetscInt)PetscRealPartComplex(remainsize),&iremainsize));
521: PetscCall(PetscMalloc3(irsize,&r,ipsize,&p,iremainsize,&remainterm));
522: PetscCall(getcoeffs(k,m,r,p,remainterm,PETSC_FALSE));
524: PetscCall(PetscArrayzero(expmA,n2));
525: #if !defined(PETSC_USE_COMPLEX)
526: isreal = PETSC_TRUE;
527: #else
528: PetscCall(getisreal(n2,Maux,&isreal));
529: #endif
530: if (isreal) {
531: rsizediv2 = irsize/2;
532: for (i=0;i<rsizediv2;i++) { /* use partial fraction to get R(As) */
533: PetscCall(PetscArraycpy(Maux,As,n2));
534: PetscCall(PetscArrayzero(RR,n2));
535: for (j=0;j<n;j++) {
536: Maux[j+j*n] -= p[2*i];
537: RR[j+j*n] = r[2*i];
538: }
539: PetscCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,Maux,&n,piv,RR,&n,&info));
540: SlepcCheckLapackInfo("gesv",info);
541: for (j=0;j<n2;j++) {
542: expmA[j] += RR[j] + PetscConj(RR[j]);
543: }
544: /* loop(n) + gesv + loop(n2) */
545: PetscCall(SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+2.0*n2));
546: }
548: mod = ipsize % 2;
549: if (mod) {
550: PetscCall(PetscArraycpy(Maux,As,n2));
551: PetscCall(PetscArrayzero(RR,n2));
552: for (j=0;j<n;j++) {
553: Maux[j+j*n] -= p[ipsize-1];
554: RR[j+j*n] = r[irsize-1];
555: }
556: PetscCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,Maux,&n,piv,RR,&n,&info));
557: SlepcCheckLapackInfo("gesv",info);
558: for (j=0;j<n2;j++) {
559: expmA[j] += RR[j];
560: }
561: PetscCall(SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+1.0*n2));
562: }
563: } else { /* complex */
564: for (i=0;i<irsize;i++) { /* use partial fraction to get R(As) */
565: PetscCall(PetscArraycpy(Maux,As,n2));
566: PetscCall(PetscArrayzero(RR,n2));
567: for (j=0;j<n;j++) {
568: Maux[j+j*n] -= p[i];
569: RR[j+j*n] = r[i];
570: }
571: PetscCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,Maux,&n,piv,RR,&n,&info));
572: SlepcCheckLapackInfo("gesv",info);
573: for (j=0;j<n2;j++) {
574: expmA[j] += RR[j];
575: }
576: PetscCall(SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+1.0*n2));
577: }
578: }
579: for (i=0;i<iremainsize;i++) {
580: if (!i) {
581: PetscCall(PetscArrayzero(RR,n2));
582: for (j=0;j<n;j++) {
583: RR[j+j*n] = remainterm[iremainsize-1];
584: }
585: } else {
586: PetscCall(PetscArraycpy(RR,As,n2));
587: for (j=1;j<i;j++) {
588: PetscCallBLAS("BLASCOMPLEXgemm",BLASCOMPLEXgemm_("N","N",&n,&n,&n,&cone,RR,&n,RR,&n,&czero,Maux,&n));
589: SlepcSwap(RR,Maux,aux);
590: PetscCall(SlepcLogFlopsComplex(2.0*n*n*n));
591: }
592: PetscCallBLAS("BLASCOMPLEXscal",BLASCOMPLEXscal_(&n2,&remainterm[iremainsize-1-i],RR,&one));
593: PetscCall(SlepcLogFlopsComplex(1.0*n2));
594: }
595: for (j=0;j<n2;j++) {
596: expmA[j] += RR[j];
597: }
598: PetscCall(SlepcLogFlopsComplex(1.0*n2));
599: }
600: PetscCall(PetscFree3(r,p,remainterm));
601: } else { /* product form, default */
602: PetscCall(getcoeffsproduct(k,m,&rsize,&psize,&mult,PETSC_TRUE));
603: PetscCall(PetscBLASIntCast((PetscInt)PetscRealPartComplex(rsize),&irsize));
604: PetscCall(PetscBLASIntCast((PetscInt)PetscRealPartComplex(psize),&ipsize));
605: PetscCall(PetscMalloc2(irsize,&rootp,ipsize,&rootq));
606: PetscCall(getcoeffsproduct(k,m,rootp,rootq,&mult,PETSC_FALSE));
608: PetscCall(PetscArrayzero(expmA,n2));
609: for (i=0;i<n;i++) { /* initialize */
610: expmA[i+i*n] = 1.0;
611: }
612: minlen = PetscMin(irsize,ipsize);
613: for (i=0;i<minlen;i++) {
614: PetscCall(PetscArraycpy(RR,As,n2));
615: for (j=0;j<n;j++) {
616: RR[j+j*n] -= rootp[i];
617: }
618: PetscCallBLAS("BLASCOMPLEXgemm",BLASCOMPLEXgemm_("N","N",&n,&n,&n,&cone,RR,&n,expmA,&n,&czero,Maux,&n));
619: SlepcSwap(expmA,Maux,aux);
620: PetscCall(PetscArraycpy(RR,As,n2));
621: for (j=0;j<n;j++) {
622: RR[j+j*n] -= rootq[i];
623: }
624: PetscCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,RR,&n,piv,expmA,&n,&info));
625: SlepcCheckLapackInfo("gesv",info);
626: /* loop(n) + gemm + loop(n) + gesv */
627: PetscCall(SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n)+1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)));
628: }
629: /* extra numerator */
630: for (i=minlen;i<irsize;i++) {
631: PetscCall(PetscArraycpy(RR,As,n2));
632: for (j=0;j<n;j++) {
633: RR[j+j*n] -= rootp[i];
634: }
635: PetscCallBLAS("BLASCOMPLEXgemm",BLASCOMPLEXgemm_("N","N",&n,&n,&n,&cone,RR,&n,expmA,&n,&czero,Maux,&n));
636: SlepcSwap(expmA,Maux,aux);
637: PetscCall(SlepcLogFlopsComplex(1.0*n+2.0*n*n*n));
638: }
639: /* extra denominator */
640: for (i=minlen;i<ipsize;i++) {
641: PetscCall(PetscArraycpy(RR,As,n2));
642: for (j=0;j<n;j++) RR[j+j*n] -= rootq[i];
643: PetscCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,RR,&n,piv,expmA,&n,&info));
644: SlepcCheckLapackInfo("gesv",info);
645: PetscCall(SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)));
646: }
647: PetscCallBLAS("BLASCOMPLEXscal",BLASCOMPLEXscal_(&n2,&mult,expmA,&one));
648: PetscCall(SlepcLogFlopsComplex(1.0*n2));
649: PetscCall(PetscFree2(rootp,rootq));
650: }
652: #if !defined(PETSC_USE_COMPLEX)
653: for (i=0;i<n2;i++) {
654: Ba2[i] = PetscRealPartComplex(expmA[i]);
655: }
656: #else
657: PetscCall(PetscArraycpy(Ba2,expmA,n2));
658: #endif
660: /* perform repeated squaring */
661: for (i=0;i<s;i++) { /* final squaring */
662: PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&sone,Ba2,&n,Ba2,&n,&szero,sMaux,&n));
663: SlepcSwap(Ba2,sMaux,saux);
664: PetscCall(PetscLogFlops(2.0*n*n*n));
665: }
666: if (Ba2!=Ba) {
667: PetscCall(PetscArraycpy(Ba,Ba2,n2));
668: sMaux = Ba2;
669: }
670: if (shift) {
671: expshift = PetscExpReal(shift);
672: PetscCallBLAS("BLASscal",BLASscal_(&n2,&expshift,Ba,&one));
673: PetscCall(PetscLogFlops(1.0*n2));
674: }
676: /* restore pointers */
677: Maux = Maux2; expmA = expmA2; RR = RR2;
678: PetscCall(PetscFree2(sMaux,Maux));
679: PetscCall(PetscFree4(expmA,As,RR,piv));
680: PetscCall(MatDenseRestoreArrayRead(A,&Aa));
681: PetscCall(MatDenseRestoreArray(B,&Ba));
682: PetscFunctionReturn(PETSC_SUCCESS);
683: #endif
684: }
686: #define SMALLN 100
688: /*
689: * Function needed to compute optimal parameters (required workspace is 3*n*n)
690: */
691: static PetscInt ell(PetscBLASInt n,PetscScalar *A,PetscReal coeff,PetscInt m,PetscScalar *work,PetscRandom rand)
692: {
693: PetscScalar *Ascaled=work;
694: PetscReal nrm,alpha,beta,rwork[1];
695: PetscInt t;
696: PetscBLASInt i,j;
698: PetscFunctionBegin;
699: beta = PetscPowReal(coeff,1.0/(2*m+1));
700: for (i=0;i<n;i++)
701: for (j=0;j<n;j++)
702: Ascaled[i+j*n] = beta*PetscAbsScalar(A[i+j*n]);
703: nrm = LAPACKlange_("O",&n,&n,A,&n,rwork);
704: PetscCall(PetscLogFlops(2.0*n*n));
705: PetscCall(SlepcNormAm(n,Ascaled,2*m+1,work+n*n,rand,&alpha));
706: alpha /= nrm;
707: t = PetscMax((PetscInt)PetscCeilReal(PetscLogReal(2.0*alpha/PETSC_MACHINE_EPSILON)/PetscLogReal(2.0)/(2*m)),0);
708: PetscFunctionReturn(t);
709: }
711: /*
712: * Compute scaling parameter (s) and order of Pade approximant (m) (required workspace is 4*n*n)
713: */
714: static PetscErrorCode expm_params(PetscInt n,PetscScalar **Apowers,PetscInt *s,PetscInt *m,PetscScalar *work)
715: {
716: PetscScalar sfactor,sone=1.0,szero=0.0,*A=Apowers[0],*Ascaled;
717: PetscReal d4,d6,d8,d10,eta1,eta3,eta4,eta5,rwork[1];
718: PetscBLASInt n_=0,n2,one=1;
719: PetscRandom rand;
720: const PetscReal coeff[5] = { 9.92063492063492e-06, 9.94131285136576e-11, /* backward error function */
721: 2.22819456055356e-16, 1.69079293431187e-22, 8.82996160201868e-36 };
722: const PetscReal theta[5] = { 1.495585217958292e-002, /* m = 3 */
723: 2.539398330063230e-001, /* m = 5 */
724: 9.504178996162932e-001, /* m = 7 */
725: 2.097847961257068e+000, /* m = 9 */
726: 5.371920351148152e+000 }; /* m = 13 */
728: PetscFunctionBegin;
729: *s = 0;
730: *m = 13;
731: PetscCall(PetscBLASIntCast(n,&n_));
732: PetscCall(PetscRandomCreate(PETSC_COMM_SELF,&rand));
733: d4 = PetscPowReal(LAPACKlange_("O",&n_,&n_,Apowers[2],&n_,rwork),1.0/4.0);
734: if (d4==0.0) { /* safeguard for the case A = 0 */
735: *m = 3;
736: goto done;
737: }
738: d6 = PetscPowReal(LAPACKlange_("O",&n_,&n_,Apowers[3],&n_,rwork),1.0/6.0);
739: PetscCall(PetscLogFlops(2.0*n*n));
740: eta1 = PetscMax(d4,d6);
741: if (eta1<=theta[0] && !ell(n_,A,coeff[0],3,work,rand)) {
742: *m = 3;
743: goto done;
744: }
745: if (eta1<=theta[1] && !ell(n_,A,coeff[1],5,work,rand)) {
746: *m = 5;
747: goto done;
748: }
749: if (n<SMALLN) {
750: PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[2],&n_,Apowers[2],&n_,&szero,work,&n_));
751: d8 = PetscPowReal(LAPACKlange_("O",&n_,&n_,work,&n_,rwork),1.0/8.0);
752: PetscCall(PetscLogFlops(2.0*n*n*n+1.0*n*n));
753: } else {
754: PetscCall(SlepcNormAm(n_,Apowers[2],2,work,rand,&d8));
755: d8 = PetscPowReal(d8,1.0/8.0);
756: }
757: eta3 = PetscMax(d6,d8);
758: if (eta3<=theta[2] && !ell(n_,A,coeff[2],7,work,rand)) {
759: *m = 7;
760: goto done;
761: }
762: if (eta3<=theta[3] && !ell(n_,A,coeff[3],9,work,rand)) {
763: *m = 9;
764: goto done;
765: }
766: if (n<SMALLN) {
767: PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[2],&n_,Apowers[3],&n_,&szero,work,&n_));
768: d10 = PetscPowReal(LAPACKlange_("O",&n_,&n_,work,&n_,rwork),1.0/10.0);
769: PetscCall(PetscLogFlops(2.0*n*n*n+1.0*n*n));
770: } else {
771: PetscCall(SlepcNormAm(n_,Apowers[1],5,work,rand,&d10));
772: d10 = PetscPowReal(d10,1.0/10.0);
773: }
774: eta4 = PetscMax(d8,d10);
775: eta5 = PetscMin(eta3,eta4);
776: *s = PetscMax((PetscInt)PetscCeilReal(PetscLogReal(eta5/theta[4])/PetscLogReal(2.0)),0);
777: if (*s) {
778: Ascaled = work+3*n*n;
779: n2 = n_*n_;
780: PetscCallBLAS("BLAScopy",BLAScopy_(&n2,A,&one,Ascaled,&one));
781: sfactor = PetscPowRealInt(2.0,-(*s));
782: PetscCallBLAS("BLASscal",BLASscal_(&n2,&sfactor,Ascaled,&one));
783: PetscCall(PetscLogFlops(1.0*n*n));
784: } else Ascaled = A;
785: *s += ell(n_,Ascaled,coeff[4],13,work,rand);
786: done:
787: PetscCall(PetscRandomDestroy(&rand));
788: PetscFunctionReturn(PETSC_SUCCESS);
789: }
791: /*
792: * Matrix exponential implementation based on algorithm and matlab code by N. Higham and co-authors
793: *
794: * N. J. Higham, "The scaling and squaring method for the matrix exponential
795: * revisited", SIAM J. Matrix Anal. Appl. 26(4):1179-1193, 2005.
796: */
797: PetscErrorCode FNEvaluateFunctionMat_Exp_Higham(FN fn,Mat A,Mat B)
798: {
799: PetscBLASInt n_=0,n2,*ipiv,info,one=1;
800: PetscInt n,m,j,s;
801: PetscScalar scale,smone=-1.0,sone=1.0,stwo=2.0,szero=0.0;
802: PetscScalar *Ba,*Apowers[5],*Q,*P,*W,*work,*aux;
803: const PetscScalar *Aa,*c;
804: const PetscScalar c3[4] = { 120, 60, 12, 1 };
805: const PetscScalar c5[6] = { 30240, 15120, 3360, 420, 30, 1 };
806: const PetscScalar c7[8] = { 17297280, 8648640, 1995840, 277200, 25200, 1512, 56, 1 };
807: const PetscScalar c9[10] = { 17643225600.0, 8821612800.0, 2075673600, 302702400, 30270240,
808: 2162160, 110880, 3960, 90, 1 };
809: const PetscScalar c13[14] = { 64764752532480000.0, 32382376266240000.0, 7771770303897600.0,
810: 1187353796428800.0, 129060195264000.0, 10559470521600.0,
811: 670442572800.0, 33522128640.0, 1323241920.0,
812: 40840800, 960960, 16380, 182, 1 };
814: PetscFunctionBegin;
815: PetscCall(MatDenseGetArrayRead(A,&Aa));
816: PetscCall(MatDenseGetArray(B,&Ba));
817: PetscCall(MatGetSize(A,&n,NULL));
818: PetscCall(PetscBLASIntCast(n,&n_));
819: n2 = n_*n_;
820: PetscCall(PetscMalloc2(8*n*n,&work,n,&ipiv));
822: /* Matrix powers */
823: Apowers[0] = work; /* Apowers[0] = A */
824: Apowers[1] = Apowers[0] + n*n; /* Apowers[1] = A^2 */
825: Apowers[2] = Apowers[1] + n*n; /* Apowers[2] = A^4 */
826: Apowers[3] = Apowers[2] + n*n; /* Apowers[3] = A^6 */
827: Apowers[4] = Apowers[3] + n*n; /* Apowers[4] = A^8 */
829: PetscCall(PetscArraycpy(Apowers[0],Aa,n2));
830: PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[0],&n_,Apowers[0],&n_,&szero,Apowers[1],&n_));
831: PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[1],&n_,Apowers[1],&n_,&szero,Apowers[2],&n_));
832: PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[1],&n_,Apowers[2],&n_,&szero,Apowers[3],&n_));
833: PetscCall(PetscLogFlops(6.0*n*n*n));
835: /* Compute scaling parameter and order of Pade approximant */
836: PetscCall(expm_params(n,Apowers,&s,&m,Apowers[4]));
838: if (s) { /* rescale */
839: for (j=0;j<4;j++) {
840: scale = PetscPowRealInt(2.0,-PetscMax(2*j,1)*s);
841: PetscCallBLAS("BLASscal",BLASscal_(&n2,&scale,Apowers[j],&one));
842: }
843: PetscCall(PetscLogFlops(4.0*n*n));
844: }
846: /* Evaluate the Pade approximant */
847: switch (m) {
848: case 3: c = c3; break;
849: case 5: c = c5; break;
850: case 7: c = c7; break;
851: case 9: c = c9; break;
852: case 13: c = c13; break;
853: default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %" PetscInt_FMT,m);
854: }
855: P = Ba;
856: Q = Apowers[4] + n*n;
857: W = Q + n*n;
858: switch (m) {
859: case 3:
860: case 5:
861: case 7:
862: case 9:
863: if (m==9) PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[1],&n_,Apowers[3],&n_,&szero,Apowers[4],&n_));
864: PetscCall(PetscArrayzero(P,n2));
865: PetscCall(PetscArrayzero(Q,n2));
866: for (j=0;j<n;j++) {
867: P[j+j*n] = c[1];
868: Q[j+j*n] = c[0];
869: }
870: for (j=m;j>=3;j-=2) {
871: PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[j],Apowers[(j+1)/2-1],&one,P,&one));
872: PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[j-1],Apowers[(j+1)/2-1],&one,Q,&one));
873: PetscCall(PetscLogFlops(4.0*n*n));
874: }
875: PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[0],&n_,P,&n_,&szero,W,&n_));
876: PetscCall(PetscLogFlops(2.0*n*n*n));
877: SlepcSwap(P,W,aux);
878: break;
879: case 13:
880: /* P = A*(Apowers[3]*(c[13]*Apowers[3] + c[11]*Apowers[2] + c[9]*Apowers[1])
881: + c[7]*Apowers[3] + c[5]*Apowers[2] + c[3]*Apowers[1] + c[1]*I) */
882: PetscCallBLAS("BLAScopy",BLAScopy_(&n2,Apowers[3],&one,P,&one));
883: PetscCallBLAS("BLASscal",BLASscal_(&n2,&c[13],P,&one));
884: PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[11],Apowers[2],&one,P,&one));
885: PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[9],Apowers[1],&one,P,&one));
886: PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[3],&n_,P,&n_,&szero,W,&n_));
887: PetscCall(PetscLogFlops(5.0*n*n+2.0*n*n*n));
888: PetscCall(PetscArrayzero(P,n2));
889: for (j=0;j<n;j++) P[j+j*n] = c[1];
890: PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[7],Apowers[3],&one,P,&one));
891: PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[5],Apowers[2],&one,P,&one));
892: PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[3],Apowers[1],&one,P,&one));
893: PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&sone,P,&one,W,&one));
894: PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[0],&n_,W,&n_,&szero,P,&n_));
895: PetscCall(PetscLogFlops(7.0*n*n+2.0*n*n*n));
896: /* Q = Apowers[3]*(c[12]*Apowers[3] + c[10]*Apowers[2] + c[8]*Apowers[1])
897: + c[6]*Apowers[3] + c[4]*Apowers[2] + c[2]*Apowers[1] + c[0]*I */
898: PetscCallBLAS("BLAScopy",BLAScopy_(&n2,Apowers[3],&one,Q,&one));
899: PetscCallBLAS("BLASscal",BLASscal_(&n2,&c[12],Q,&one));
900: PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[10],Apowers[2],&one,Q,&one));
901: PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[8],Apowers[1],&one,Q,&one));
902: PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[3],&n_,Q,&n_,&szero,W,&n_));
903: PetscCall(PetscLogFlops(5.0*n*n+2.0*n*n*n));
904: PetscCall(PetscArrayzero(Q,n2));
905: for (j=0;j<n;j++) Q[j+j*n] = c[0];
906: PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[6],Apowers[3],&one,Q,&one));
907: PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[4],Apowers[2],&one,Q,&one));
908: PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[2],Apowers[1],&one,Q,&one));
909: PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&sone,W,&one,Q,&one));
910: PetscCall(PetscLogFlops(7.0*n*n));
911: break;
912: default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %" PetscInt_FMT,m);
913: }
914: PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&smone,P,&one,Q,&one));
915: PetscCallBLAS("LAPACKgesv",LAPACKgesv_(&n_,&n_,Q,&n_,ipiv,P,&n_,&info));
916: SlepcCheckLapackInfo("gesv",info);
917: PetscCallBLAS("BLASscal",BLASscal_(&n2,&stwo,P,&one));
918: for (j=0;j<n;j++) P[j+j*n] += 1.0;
919: PetscCall(PetscLogFlops(2.0*n*n*n/3.0+4.0*n*n));
921: /* Squaring */
922: for (j=1;j<=s;j++) {
923: PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,P,&n_,P,&n_,&szero,W,&n_));
924: SlepcSwap(P,W,aux);
925: }
926: if (P!=Ba) PetscCall(PetscArraycpy(Ba,P,n2));
927: PetscCall(PetscLogFlops(2.0*n*n*n*s));
929: PetscCall(PetscFree2(work,ipiv));
930: PetscCall(MatDenseRestoreArrayRead(A,&Aa));
931: PetscCall(MatDenseRestoreArray(B,&Ba));
932: PetscFunctionReturn(PETSC_SUCCESS);
933: }
935: #if defined(PETSC_HAVE_CUDA)
936: #include "../src/sys/classes/fn/impls/cuda/fnutilcuda.h"
937: #include <slepccupmblas.h>
939: PetscErrorCode FNEvaluateFunctionMat_Exp_Pade_CUDA(FN fn,Mat A,Mat B)
940: {
941: PetscBLASInt n=0,ld,ld2,*d_ipiv,*d_info,info,one=1;
942: PetscInt m,k,sexp;
943: PetscBool odd;
944: const PetscInt p=MAX_PADE;
945: PetscReal c[MAX_PADE+1],s;
946: PetscScalar scale,smone=-1.0,sone=1.0,stwo=2.0,szero=0.0;
947: const PetscScalar *Aa;
948: PetscScalar *d_Ba,*d_As,*d_A2,*d_Q,*d_P,*d_W,*aux,**ppP,**d_ppP,**ppQ,**d_ppQ;
949: cublasHandle_t cublasv2handle;
951: PetscFunctionBegin;
952: PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* For CUDA event timers */
953: PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
954: PetscCall(MatGetSize(A,&m,NULL));
955: PetscCall(PetscBLASIntCast(m,&n));
956: ld = n;
957: ld2 = ld*ld;
958: if (A==B) {
959: PetscCallCUDA(cudaMalloc((void **)&d_As,sizeof(PetscScalar)*m*m));
960: PetscCall(MatDenseCUDAGetArrayRead(A,&Aa));
961: PetscCallCUDA(cudaMemcpy(d_As,Aa,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToDevice));
962: PetscCall(MatDenseCUDARestoreArrayRead(A,&Aa));
963: } else PetscCall(MatDenseCUDAGetArrayRead(A,(const PetscScalar**)&d_As));
964: PetscCall(MatDenseCUDAGetArrayWrite(B,&d_Ba));
966: PetscCallCUDA(cudaMalloc((void **)&d_Q,sizeof(PetscScalar)*m*m));
967: PetscCallCUDA(cudaMalloc((void **)&d_W,sizeof(PetscScalar)*m*m));
968: PetscCallCUDA(cudaMalloc((void **)&d_A2,sizeof(PetscScalar)*m*m));
969: PetscCallCUDA(cudaMalloc((void **)&d_ipiv,sizeof(PetscBLASInt)*ld));
970: PetscCallCUDA(cudaMalloc((void **)&d_info,sizeof(PetscBLASInt)));
971: PetscCallCUDA(cudaMalloc((void **)&d_ppP,sizeof(PetscScalar*)));
972: PetscCallCUDA(cudaMalloc((void **)&d_ppQ,sizeof(PetscScalar*)));
974: PetscCall(PetscMalloc1(1,&ppP));
975: PetscCall(PetscMalloc1(1,&ppQ));
977: d_P = d_Ba;
978: PetscCall(PetscLogGpuTimeBegin());
980: /* Pade' coefficients */
981: c[0] = 1.0;
982: for (k=1;k<=p;k++) c[k] = c[k-1]*(p+1-k)/(k*(2*p+1-k));
984: /* Scaling */
985: PetscCallCUBLAS(cublasXnrm2(cublasv2handle,ld2,d_As,one,&s));
986: if (s>0.5) {
987: sexp = PetscMax(0,(int)(PetscLogReal(s)/PetscLogReal(2.0))+2);
988: scale = PetscPowRealInt(2.0,-sexp);
989: PetscCallCUBLAS(cublasXscal(cublasv2handle,ld2,&scale,d_As,one));
990: PetscCall(PetscLogGpuFlops(1.0*n*n));
991: } else sexp = 0;
993: /* Horner evaluation */
994: PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_As,ld,d_As,ld,&szero,d_A2,ld));
995: PetscCall(PetscLogGpuFlops(2.0*n*n*n));
996: PetscCallCUDA(cudaMemset(d_Q,0,sizeof(PetscScalar)*ld2));
997: PetscCallCUDA(cudaMemset(d_P,0,sizeof(PetscScalar)*ld2));
998: PetscCall(set_diagonal(n,d_Q,ld,c[p]));
999: PetscCall(set_diagonal(n,d_P,ld,c[p-1]));
1001: odd = PETSC_TRUE;
1002: for (k=p-1;k>0;k--) {
1003: if (odd) {
1004: PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_Q,ld,d_A2,ld,&szero,d_W,ld));
1005: SlepcSwap(d_Q,d_W,aux);
1006: PetscCall(shift_diagonal(n,d_Q,ld,c[k-1]));
1007: odd = PETSC_FALSE;
1008: } else {
1009: PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_A2,ld,&szero,d_W,ld));
1010: SlepcSwap(d_P,d_W,aux);
1011: PetscCall(shift_diagonal(n,d_P,ld,c[k-1]));
1012: odd = PETSC_TRUE;
1013: }
1014: PetscCall(PetscLogGpuFlops(2.0*n*n*n));
1015: }
1016: if (odd) {
1017: PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_Q,ld,d_As,ld,&szero,d_W,ld));
1018: SlepcSwap(d_Q,d_W,aux);
1019: PetscCallCUBLAS(cublasXaxpy(cublasv2handle,ld2,&smone,d_P,one,d_Q,one));
1021: ppQ[0] = d_Q;
1022: ppP[0] = d_P;
1023: PetscCallCUDA(cudaMemcpy(d_ppQ,ppQ,sizeof(PetscScalar*),cudaMemcpyHostToDevice));
1024: PetscCallCUDA(cudaMemcpy(d_ppP,ppP,sizeof(PetscScalar*),cudaMemcpyHostToDevice));
1026: PetscCallCUBLAS(cublasXgetrfBatched(cublasv2handle,n,d_ppQ,ld,d_ipiv,d_info,one));
1027: PetscCallCUDA(cudaMemcpy(&info,d_info,sizeof(PetscBLASInt),cudaMemcpyDeviceToHost));
1028: PetscCheck(info>=0,PETSC_COMM_SELF,PETSC_ERR_LIB,"LAPACKgetrf: Illegal value on argument %" PetscBLASInt_FMT,PetscAbsInt(info));
1029: PetscCheck(info<=0,PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"LAPACKgetrf: Matrix is singular. U(%" PetscBLASInt_FMT ",%" PetscBLASInt_FMT ") is zero",info,info);
1030: PetscCallCUBLAS(cublasXgetrsBatched(cublasv2handle,CUBLAS_OP_N,n,n,(const PetscScalar **)d_ppQ,ld,d_ipiv,d_ppP,ld,&info,one));
1031: PetscCheck(info>=0,PETSC_COMM_SELF,PETSC_ERR_LIB,"LAPACKgetri: Illegal value on argument %" PetscBLASInt_FMT,PetscAbsInt(info));
1032: PetscCheck(info<=0,PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"LAPACKgetri: Matrix is singular. U(%" PetscBLASInt_FMT ",%" PetscBLASInt_FMT ") is zero",info,info);
1033: PetscCallCUBLAS(cublasXscal(cublasv2handle,ld2,&stwo,d_P,one));
1034: PetscCall(shift_diagonal(n,d_P,ld,sone));
1035: PetscCallCUBLAS(cublasXscal(cublasv2handle,ld2,&smone,d_P,one));
1036: } else {
1037: PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_As,ld,&szero,d_W,ld));
1038: SlepcSwap(d_P,d_W,aux);
1039: PetscCallCUBLAS(cublasXaxpy(cublasv2handle,ld2,&smone,d_P,one,d_Q,one));
1041: ppQ[0] = d_Q;
1042: ppP[0] = d_P;
1043: PetscCallCUDA(cudaMemcpy(d_ppQ,ppQ,sizeof(PetscScalar*),cudaMemcpyHostToDevice));
1044: PetscCallCUDA(cudaMemcpy(d_ppP,ppP,sizeof(PetscScalar*),cudaMemcpyHostToDevice));
1046: PetscCallCUBLAS(cublasXgetrfBatched(cublasv2handle,n,d_ppQ,ld,d_ipiv,d_info,one));
1047: PetscCallCUDA(cudaMemcpy(&info,d_info,sizeof(PetscBLASInt),cudaMemcpyDeviceToHost));
1048: PetscCheck(info>=0,PETSC_COMM_SELF,PETSC_ERR_LIB,"LAPACKgetrf: Illegal value on argument %" PetscBLASInt_FMT,PetscAbsInt(info));
1049: PetscCheck(info<=0,PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"LAPACKgetrf: Matrix is singular. U(%" PetscBLASInt_FMT ",%" PetscBLASInt_FMT ") is zero",info,info);
1050: PetscCallCUBLAS(cublasXgetrsBatched(cublasv2handle,CUBLAS_OP_N,n,n,(const PetscScalar **)d_ppQ,ld,d_ipiv,d_ppP,ld,&info,one));
1051: PetscCheck(info>=0,PETSC_COMM_SELF,PETSC_ERR_LIB,"LAPACKgetri: Illegal value on argument %" PetscBLASInt_FMT,PetscAbsInt(info));
1052: PetscCheck(info<=0,PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"LAPACKgetri: Matrix is singular. U(%" PetscBLASInt_FMT ",%" PetscBLASInt_FMT ") is zero",info,info);
1053: PetscCallCUBLAS(cublasXscal(cublasv2handle,ld2,&stwo,d_P,one));
1054: PetscCall(shift_diagonal(n,d_P,ld,sone));
1055: }
1056: PetscCall(PetscLogGpuFlops(2.0*n*n*n+2.0*n*n*n/3.0+4.0*n*n));
1058: for (k=1;k<=sexp;k++) {
1059: PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_P,ld,&szero,d_W,ld));
1060: PetscCallCUDA(cudaMemcpy(d_P,d_W,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToDevice));
1061: }
1062: PetscCall(PetscLogGpuFlops(2.0*n*n*n*sexp));
1064: PetscCall(PetscLogGpuTimeEnd());
1065: PetscCallCUDA(cudaFree(d_Q));
1066: PetscCallCUDA(cudaFree(d_W));
1067: PetscCallCUDA(cudaFree(d_A2));
1068: PetscCallCUDA(cudaFree(d_ipiv));
1069: PetscCallCUDA(cudaFree(d_info));
1070: PetscCallCUDA(cudaFree(d_ppP));
1071: PetscCallCUDA(cudaFree(d_ppQ));
1073: PetscCall(PetscFree(ppP));
1074: PetscCall(PetscFree(ppQ));
1076: if (d_P!=d_Ba) PetscCallCUDA(cudaMemcpy(d_Ba,d_P,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToDevice));
1077: if (A!=B) {
1078: if (s>0.5) { /* undo scaling */
1079: scale = 1.0/scale;
1080: PetscCallCUBLAS(cublasXscal(cublasv2handle,ld2,&scale,d_As,one));
1081: }
1082: PetscCall(MatDenseCUDARestoreArrayRead(A,(const PetscScalar**)&d_As));
1083: } else PetscCallCUDA(cudaFree(d_As));
1084: PetscCall(MatDenseCUDARestoreArrayWrite(B,&d_Ba));
1085: PetscFunctionReturn(PETSC_SUCCESS);
1086: }
1088: #if defined(PETSC_HAVE_MAGMA)
1089: #include <slepcmagma.h>
1091: PetscErrorCode FNEvaluateFunctionMat_Exp_Pade_CUDAm(FN fn,Mat A,Mat B)
1092: {
1093: PetscBLASInt n=0,ld,ld2,*piv,one=1;
1094: PetscInt m,k,sexp;
1095: PetscBool odd;
1096: const PetscInt p=MAX_PADE;
1097: PetscReal c[MAX_PADE+1],s;
1098: PetscScalar scale,smone=-1.0,sone=1.0,stwo=2.0,szero=0.0;
1099: const PetscScalar *Aa;
1100: PetscScalar *d_Ba,*d_As,*d_A2,*d_Q,*d_P,*d_W,*aux;
1101: cublasHandle_t cublasv2handle;
1103: PetscFunctionBegin;
1104: PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* For CUDA event timers */
1105: PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
1106: PetscCall(SlepcMagmaInit());
1107: PetscCall(MatGetSize(A,&m,NULL));
1108: PetscCall(PetscBLASIntCast(m,&n));
1109: ld = n;
1110: ld2 = ld*ld;
1111: if (A==B) {
1112: PetscCallCUDA(cudaMalloc((void **)&d_As,sizeof(PetscScalar)*m*m));
1113: PetscCall(MatDenseCUDAGetArrayRead(A,&Aa));
1114: PetscCallCUDA(cudaMemcpy(d_As,Aa,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToDevice));
1115: PetscCall(MatDenseCUDARestoreArrayRead(A,&Aa));
1116: } else PetscCall(MatDenseCUDAGetArrayRead(A,(const PetscScalar**)&d_As));
1117: PetscCall(MatDenseCUDAGetArrayWrite(B,&d_Ba));
1119: PetscCallCUDA(cudaMalloc((void **)&d_Q,sizeof(PetscScalar)*m*m));
1120: PetscCallCUDA(cudaMalloc((void **)&d_W,sizeof(PetscScalar)*m*m));
1121: PetscCallCUDA(cudaMalloc((void **)&d_A2,sizeof(PetscScalar)*m*m));
1123: PetscCall(PetscMalloc1(n,&piv));
1125: d_P = d_Ba;
1126: PetscCall(PetscLogGpuTimeBegin());
1128: /* Pade' coefficients */
1129: c[0] = 1.0;
1130: for (k=1;k<=p;k++) c[k] = c[k-1]*(p+1-k)/(k*(2*p+1-k));
1132: /* Scaling */
1133: PetscCallCUBLAS(cublasXnrm2(cublasv2handle,ld2,d_As,one,&s));
1134: PetscCall(PetscLogGpuFlops(1.0*n*n));
1136: if (s>0.5) {
1137: sexp = PetscMax(0,(int)(PetscLogReal(s)/PetscLogReal(2.0))+2);
1138: scale = PetscPowRealInt(2.0,-sexp);
1139: PetscCallCUBLAS(cublasXscal(cublasv2handle,ld2,&scale,d_As,one));
1140: PetscCall(PetscLogGpuFlops(1.0*n*n));
1141: } else sexp = 0;
1143: /* Horner evaluation */
1144: PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_As,ld,d_As,ld,&szero,d_A2,ld));
1145: PetscCall(PetscLogGpuFlops(2.0*n*n*n));
1146: PetscCallCUDA(cudaMemset(d_Q,0,sizeof(PetscScalar)*ld2));
1147: PetscCallCUDA(cudaMemset(d_P,0,sizeof(PetscScalar)*ld2));
1148: PetscCall(set_diagonal(n,d_Q,ld,c[p]));
1149: PetscCall(set_diagonal(n,d_P,ld,c[p-1]));
1151: odd = PETSC_TRUE;
1152: for (k=p-1;k>0;k--) {
1153: if (odd) {
1154: PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_Q,ld,d_A2,ld,&szero,d_W,ld));
1155: SlepcSwap(d_Q,d_W,aux);
1156: PetscCall(shift_diagonal(n,d_Q,ld,c[k-1]));
1157: odd = PETSC_FALSE;
1158: } else {
1159: PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_A2,ld,&szero,d_W,ld));
1160: SlepcSwap(d_P,d_W,aux);
1161: PetscCall(shift_diagonal(n,d_P,ld,c[k-1]));
1162: odd = PETSC_TRUE;
1163: }
1164: PetscCall(PetscLogGpuFlops(2.0*n*n*n));
1165: }
1166: if (odd) {
1167: PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_Q,ld,d_As,ld,&szero,d_W,ld));
1168: SlepcSwap(d_Q,d_W,aux);
1169: PetscCallCUBLAS(cublasXaxpy(cublasv2handle,ld2,&smone,d_P,one,d_Q,one));
1170: PetscCallMAGMA(magma_xgesv_gpu,n,n,d_Q,ld,piv,d_P,ld);
1171: PetscCallCUBLAS(cublasXscal(cublasv2handle,ld2,&stwo,d_P,one));
1172: PetscCall(shift_diagonal(n,d_P,ld,sone));
1173: PetscCallCUBLAS(cublasXscal(cublasv2handle,ld2,&smone,d_P,one));
1174: } else {
1175: PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_As,ld,&szero,d_W,ld));
1176: SlepcSwap(d_P,d_W,aux);
1177: PetscCallCUBLAS(cublasXaxpy(cublasv2handle,ld2,&smone,d_P,one,d_Q,one));
1178: PetscCallMAGMA(magma_xgesv_gpu,n,n,d_Q,ld,piv,d_P,ld);
1179: PetscCallCUBLAS(cublasXscal(cublasv2handle,ld2,&stwo,d_P,one));
1180: PetscCall(shift_diagonal(n,d_P,ld,sone));
1181: }
1182: PetscCall(PetscLogGpuFlops(2.0*n*n*n+2.0*n*n*n/3.0+4.0*n*n));
1184: for (k=1;k<=sexp;k++) {
1185: PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_P,ld,&szero,d_W,ld));
1186: PetscCallCUDA(cudaMemcpy(d_P,d_W,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToDevice));
1187: }
1188: PetscCall(PetscLogGpuFlops(2.0*n*n*n*sexp));
1190: PetscCall(PetscLogGpuTimeEnd());
1191: PetscCallCUDA(cudaFree(d_Q));
1192: PetscCallCUDA(cudaFree(d_W));
1193: PetscCallCUDA(cudaFree(d_A2));
1194: PetscCall(PetscFree(piv));
1196: if (d_P!=d_Ba) PetscCallCUDA(cudaMemcpy(d_Ba,d_P,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToDevice));
1197: if (A!=B) {
1198: if (s>0.5) { /* undo scaling */
1199: scale = 1.0/scale;
1200: PetscCallCUBLAS(cublasXscal(cublasv2handle,ld2,&scale,d_As,one));
1201: }
1202: PetscCall(MatDenseCUDARestoreArrayRead(A,(const PetscScalar**)&d_As));
1203: } else PetscCallCUDA(cudaFree(d_As));
1204: PetscCall(MatDenseCUDARestoreArrayWrite(B,&d_Ba));
1205: PetscFunctionReturn(PETSC_SUCCESS);
1206: }
1208: /*
1209: * Matrix exponential implementation based on algorithm and matlab code by N. Higham and co-authors
1210: *
1211: * N. J. Higham, "The scaling and squaring method for the matrix exponential
1212: * revisited", SIAM J. Matrix Anal. Appl. 26(4):1179-1193, 2005.
1213: */
1214: PetscErrorCode FNEvaluateFunctionMat_Exp_Higham_CUDAm(FN fn,Mat A,Mat B)
1215: {
1216: PetscBLASInt n_=0,n2,*ipiv,one=1;
1217: PetscInt n,m,j,s;
1218: PetscScalar scale,smone=-1.0,sone=1.0,stwo=2.0,szero=0.0;
1219: PetscScalar *d_Ba,*Apowers[5],*d_Apowers[5],*d_Q,*d_P,*d_W,*work,*d_work,*aux;
1220: const PetscScalar *Aa,*c;
1221: const PetscScalar c3[4] = { 120, 60, 12, 1 };
1222: const PetscScalar c5[6] = { 30240, 15120, 3360, 420, 30, 1 };
1223: const PetscScalar c7[8] = { 17297280, 8648640, 1995840, 277200, 25200, 1512, 56, 1 };
1224: const PetscScalar c9[10] = { 17643225600, 8821612800, 2075673600, 302702400, 30270240,
1225: 2162160, 110880, 3960, 90, 1 };
1226: const PetscScalar c13[14] = { 64764752532480000, 32382376266240000, 7771770303897600,
1227: 1187353796428800, 129060195264000, 10559470521600,
1228: 670442572800, 33522128640, 1323241920,
1229: 40840800, 960960, 16380, 182, 1 };
1230: cublasHandle_t cublasv2handle;
1232: PetscFunctionBegin;
1233: PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* For CUDA event timers */
1234: PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
1235: PetscCall(SlepcMagmaInit());
1236: PetscCall(MatGetSize(A,&n,NULL));
1237: PetscCall(PetscBLASIntCast(n,&n_));
1238: n2 = n_*n_;
1239: PetscCall(PetscMalloc2(8*n*n,&work,n,&ipiv));
1240: /* Matrix powers */
1241: Apowers[0] = work; /* Apowers[0] = A */
1242: Apowers[1] = Apowers[0] + n*n; /* Apowers[1] = A^2 */
1243: Apowers[2] = Apowers[1] + n*n; /* Apowers[2] = A^4 */
1244: Apowers[3] = Apowers[2] + n*n; /* Apowers[3] = A^6 */
1245: Apowers[4] = Apowers[3] + n*n; /* Apowers[4] = A^8 */
1246: if (A==B) {
1247: PetscCallCUDA(cudaMalloc((void**)&d_work,7*n*n*sizeof(PetscScalar)));
1248: d_Apowers[0] = d_work; /* d_Apowers[0] = A */
1249: d_Apowers[1] = d_Apowers[0] + n*n; /* d_Apowers[1] = A^2 */
1250: PetscCall(MatDenseCUDAGetArrayRead(A,&Aa));
1251: PetscCallCUDA(cudaMemcpy(d_Apowers[0],Aa,n2*sizeof(PetscScalar),cudaMemcpyDeviceToDevice));
1252: PetscCall(MatDenseCUDARestoreArrayRead(A,&Aa));
1253: } else {
1254: PetscCallCUDA(cudaMalloc((void**)&d_work,6*n*n*sizeof(PetscScalar)));
1255: PetscCall(MatDenseCUDAGetArrayRead(A,(const PetscScalar**)&d_Apowers[0]));
1256: d_Apowers[1] = d_work; /* d_Apowers[1] = A^2 */
1257: }
1258: PetscCall(MatDenseCUDAGetArrayWrite(B,&d_Ba));
1259: d_Apowers[2] = d_Apowers[1] + n*n; /* d_Apowers[2] = A^4 */
1260: d_Apowers[3] = d_Apowers[2] + n*n; /* d_Apowers[3] = A^6 */
1261: d_Apowers[4] = d_Apowers[3] + n*n; /* d_Apowers[4] = A^8 */
1262: d_Q = d_Apowers[4] + n*n;
1263: d_W = d_Q + n*n;
1265: PetscCall(PetscLogGpuTimeBegin());
1267: PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[0],n_,d_Apowers[0],n_,&szero,d_Apowers[1],n_));
1268: PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[1],n_,d_Apowers[1],n_,&szero,d_Apowers[2],n_));
1269: PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[1],n_,d_Apowers[2],n_,&szero,d_Apowers[3],n_));
1270: PetscCall(PetscLogGpuFlops(6.0*n*n*n));
1272: PetscCallCUDA(cudaMemcpy(Apowers[0],d_Apowers[0],n2*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
1273: PetscCallCUDA(cudaMemcpy(Apowers[1],d_Apowers[1],3*n2*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
1274: PetscCall(PetscLogGpuToCpu(4*n2*sizeof(PetscScalar)));
1275: /* Compute scaling parameter and order of Pade approximant */
1276: PetscCall(expm_params(n,Apowers,&s,&m,Apowers[4]));
1278: if (s) { /* rescale */
1279: for (j=0;j<4;j++) {
1280: scale = PetscPowRealInt(2.0,-PetscMax(2*j,1)*s);
1281: PetscCallCUBLAS(cublasXscal(cublasv2handle,n2,&scale,d_Apowers[j],one));
1282: }
1283: PetscCall(PetscLogGpuFlops(4.0*n*n));
1284: }
1286: /* Evaluate the Pade approximant */
1287: switch (m) {
1288: case 3: c = c3; break;
1289: case 5: c = c5; break;
1290: case 7: c = c7; break;
1291: case 9: c = c9; break;
1292: case 13: c = c13; break;
1293: default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %" PetscInt_FMT,m);
1294: }
1295: d_P = d_Ba;
1296: switch (m) {
1297: case 3:
1298: case 5:
1299: case 7:
1300: case 9:
1301: if (m==9) PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[1],n_,d_Apowers[3],n_,&szero,d_Apowers[4],n_));
1302: PetscCallCUDA(cudaMemset(d_P,0,sizeof(PetscScalar)*n2));
1303: PetscCallCUDA(cudaMemset(d_Q,0,sizeof(PetscScalar)*n2));
1304: PetscCall(set_diagonal(n,d_P,n,c[1]));
1305: PetscCall(set_diagonal(n,d_Q,n,c[0]));
1306: for (j=m;j>=3;j-=2) {
1307: PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[j],d_Apowers[(j+1)/2-1],one,d_P,one));
1308: PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[j-1],d_Apowers[(j+1)/2-1],one,d_Q,one));
1309: PetscCall(PetscLogGpuFlops(4.0*n*n));
1310: }
1311: PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[0],n_,d_P,n_,&szero,d_W,n_));
1312: PetscCall(PetscLogGpuFlops(2.0*n*n*n));
1313: SlepcSwap(d_P,d_W,aux);
1314: break;
1315: case 13:
1316: /* P = A*(Apowers[3]*(c[13]*Apowers[3] + c[11]*Apowers[2] + c[9]*Apowers[1])
1317: + c[7]*Apowers[3] + c[5]*Apowers[2] + c[3]*Apowers[1] + c[1]*I) */
1318: PetscCallCUDA(cudaMemcpy(d_P,d_Apowers[3],n2*sizeof(PetscScalar),cudaMemcpyDeviceToDevice));
1319: PetscCallCUBLAS(cublasXscal(cublasv2handle,n2,&c[13],d_P,one));
1320: PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[11],d_Apowers[2],one,d_P,one));
1321: PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[9],d_Apowers[1],one,d_P,one));
1322: PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[3],n_,d_P,n_,&szero,d_W,n_));
1323: PetscCall(PetscLogGpuFlops(5.0*n*n+2.0*n*n*n));
1325: PetscCallCUDA(cudaMemset(d_P,0,sizeof(PetscScalar)*n2));
1326: PetscCall(set_diagonal(n,d_P,n,c[1]));
1327: PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[7],d_Apowers[3],one,d_P,one));
1328: PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[5],d_Apowers[2],one,d_P,one));
1329: PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[3],d_Apowers[1],one,d_P,one));
1330: PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&sone,d_P,one,d_W,one));
1331: PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[0],n_,d_W,n_,&szero,d_P,n_));
1332: PetscCall(PetscLogGpuFlops(7.0*n*n+2.0*n*n*n));
1333: /* Q = Apowers[3]*(c[12]*Apowers[3] + c[10]*Apowers[2] + c[8]*Apowers[1])
1334: + c[6]*Apowers[3] + c[4]*Apowers[2] + c[2]*Apowers[1] + c[0]*I */
1335: PetscCallCUDA(cudaMemcpy(d_Q,d_Apowers[3],n2*sizeof(PetscScalar),cudaMemcpyDeviceToDevice));
1336: PetscCallCUBLAS(cublasXscal(cublasv2handle,n2,&c[12],d_Q,one));
1337: PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[10],d_Apowers[2],one,d_Q,one));
1338: PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[8],d_Apowers[1],one,d_Q,one));
1339: PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[3],n_,d_Q,n_,&szero,d_W,n_));
1340: PetscCall(PetscLogGpuFlops(5.0*n*n+2.0*n*n*n));
1341: PetscCallCUDA(cudaMemset(d_Q,0,sizeof(PetscScalar)*n2));
1342: PetscCall(set_diagonal(n,d_Q,n,c[0]));
1343: PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[6],d_Apowers[3],one,d_Q,one));
1344: PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[4],d_Apowers[2],one,d_Q,one));
1345: PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&c[2],d_Apowers[1],one,d_Q,one));
1346: PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&sone,d_W,one,d_Q,one));
1347: PetscCall(PetscLogGpuFlops(7.0*n*n));
1348: break;
1349: default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %" PetscInt_FMT,m);
1350: }
1351: PetscCallCUBLAS(cublasXaxpy(cublasv2handle,n2,&smone,d_P,one,d_Q,one));
1353: PetscCallMAGMA(magma_xgesv_gpu,n_,n_,d_Q,n_,ipiv,d_P,n_);
1355: PetscCallCUBLAS(cublasXscal(cublasv2handle,n2,&stwo,d_P,one));
1356: PetscCall(shift_diagonal(n,d_P,n,sone));
1357: PetscCall(PetscLogGpuFlops(2.0*n*n*n/3.0+4.0*n*n));
1359: /* Squaring */
1360: for (j=1;j<=s;j++) {
1361: PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_P,n_,d_P,n_,&szero,d_W,n_));
1362: SlepcSwap(d_P,d_W,aux);
1363: }
1364: PetscCall(PetscLogGpuFlops(2.0*n*n*n*s));
1365: PetscCall(PetscLogGpuTimeEnd());
1367: PetscCall(PetscFree2(work,ipiv));
1368: if (d_P!=d_Ba) PetscCallCUDA(cudaMemcpy(d_Ba,d_P,n2*sizeof(PetscScalar),cudaMemcpyDeviceToDevice));
1369: if (A!=B) {
1370: if (s>0.5) { /* undo scaling */
1371: scale = 1.0/PetscPowRealInt(2.0,-s);
1372: PetscCallCUBLAS(cublasXscal(cublasv2handle,n2,&scale,d_Apowers[0],one));
1373: }
1374: PetscCall(MatDenseCUDARestoreArrayRead(A,(const PetscScalar**)&d_Apowers[0]));
1375: }
1376: PetscCall(MatDenseCUDARestoreArrayWrite(B,&d_Ba));
1377: PetscCallCUDA(cudaFree(d_work));
1378: PetscFunctionReturn(PETSC_SUCCESS);
1379: }
1381: /*
1382: * Matrix exponential implementation based on algorithm and matlab code by Stefan Guettel
1383: * and Yuji Nakatsukasa
1384: *
1385: * Stefan Guettel and Yuji Nakatsukasa, "Scaled and Squared Subdiagonal Pade'
1386: * Approximation for the Matrix Exponential",
1387: * SIAM J. Matrix Anal. Appl. 37(1):145-170, 2016.
1388: * https://doi.org/10.1137/15M1027553
1389: */
1390: PetscErrorCode FNEvaluateFunctionMat_Exp_GuettelNakatsukasa_CUDAm(FN fn,Mat A,Mat B)
1391: {
1392: PetscInt i,j,n_,s,k,m,mod;
1393: PetscBLASInt n=0,n2=0,irsize=0,rsizediv2,ipsize=0,iremainsize=0,query=-1,*piv,minlen,lwork=0,one=1;
1394: PetscReal nrm,shift=0.0,rone=1.0,rzero=0.0;
1395: #if defined(PETSC_USE_COMPLEX)
1396: PetscReal *rwork=NULL;
1397: #endif
1398: PetscComplex *d_As,*d_RR,*d_RR2,*d_expmA,*d_expmA2,*d_Maux,*d_Maux2,rsize,*r,psize,*p,remainsize,*remainterm,*rootp,*rootq,mult=0.0,scale,cone=1.0,czero=0.0,*aux;
1399: PetscScalar *d_Aa,*d_Ba,*d_Ba2,*Maux,*d_sMaux,*wr,*wi,expshift,sone=1.0,szero=0.0,*work,work1,*saux;
1400: const PetscScalar *Aa;
1401: PetscBool isreal,*d_isreal,flg;
1402: cublasHandle_t cublasv2handle;
1404: PetscFunctionBegin;
1405: PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* For CUDA event timers */
1406: PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
1407: PetscCall(SlepcMagmaInit());
1408: PetscCall(MatGetSize(A,&n_,NULL));
1409: PetscCall(PetscBLASIntCast(n_,&n));
1410: PetscCall(PetscBLASIntCast(n*n,&n2));
1412: if (A==B) {
1413: PetscCallCUDA(cudaMalloc((void **)&d_Aa,sizeof(PetscScalar)*n2));
1414: PetscCall(MatDenseCUDAGetArrayRead(A,&Aa));
1415: PetscCallCUDA(cudaMemcpy(d_Aa,Aa,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice));
1416: PetscCall(MatDenseCUDARestoreArrayRead(A,&Aa));
1417: } else PetscCall(MatDenseCUDAGetArrayRead(A,(const PetscScalar**)&d_Aa));
1418: PetscCall(MatDenseCUDAGetArrayWrite(B,&d_Ba));
1419: d_Ba2 = d_Ba;
1421: PetscCallCUDA(cudaMalloc((void **)&d_isreal,sizeof(PetscBool)));
1422: PetscCallCUDA(cudaMalloc((void **)&d_sMaux,sizeof(PetscScalar)*n2));
1423: PetscCallCUDA(cudaMalloc((void **)&d_Maux,sizeof(PetscComplex)*n2));
1425: PetscCall(PetscLogGpuTimeBegin());
1426: d_Maux2 = d_Maux;
1427: PetscCall(PetscOptionsGetReal(NULL,NULL,"-fn_expm_estimated_eig",&shift,&flg));
1428: if (!flg) {
1429: PetscCall(PetscMalloc2(n,&wr,n,&wi));
1430: /* estimate rightmost eigenvalue and shift A with it */
1431: PetscCall(PetscMalloc1(n2,&Maux));
1432: PetscCall(MatDenseGetArrayRead(A,&Aa));
1433: PetscCall(PetscArraycpy(Maux,Aa,n2));
1434: PetscCall(MatDenseRestoreArrayRead(A,&Aa));
1435: #if !defined(PETSC_USE_COMPLEX)
1436: PetscCallMAGMA(magma_xgeev,MagmaNoVec,MagmaNoVec,n,Maux,n,wr,wi,NULL,n,NULL,n,&work1,query);
1437: PetscCall(PetscBLASIntCast((PetscInt)PetscRealPart(work1),&lwork));
1438: PetscCall(PetscMalloc1(lwork,&work));
1439: PetscCallMAGMA(magma_xgeev,MagmaNoVec,MagmaNoVec,n,Maux,n,wr,wi,NULL,n,NULL,n,work,lwork);
1440: PetscCall(PetscFree(work));
1441: #else
1442: PetscCallMAGMA(magma_xgeev,MagmaNoVec,MagmaNoVec,n,Maux,n,wr,NULL,n,NULL,n,&work1,query,rwork);
1443: PetscCall(PetscBLASIntCast((PetscInt)PetscRealPart(work1),&lwork));
1444: PetscCall(PetscMalloc2(2*n,&rwork,lwork,&work));
1445: PetscCallMAGMA(magma_xgeev,MagmaNoVec,MagmaNoVec,n,Maux,n,wr,NULL,n,NULL,n,work,lwork,rwork);
1446: PetscCall(PetscFree2(rwork,work));
1447: #endif
1448: PetscCall(PetscFree(Maux));
1449: PetscCall(PetscLogGpuFlops(25.0*n*n*n+(n*n*n)/3.0+1.0*n*n*n));
1451: shift = PetscRealPart(wr[0]);
1452: for (i=1;i<n;i++) {
1453: if (PetscRealPart(wr[i]) > shift) shift = PetscRealPart(wr[i]);
1454: }
1455: PetscCall(PetscFree2(wr,wi));
1456: }
1457: /* shift so that largest real part is (about) 0 */
1458: PetscCallCUDA(cudaMemcpy(d_sMaux,d_Aa,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice));
1459: if (shift) {
1460: PetscCall(shift_diagonal(n,d_sMaux,n,-shift));
1461: PetscCall(PetscLogGpuFlops(1.0*n));
1462: }
1463: #if defined(PETSC_USE_COMPLEX)
1464: PetscCallCUDA(cudaMemcpy(d_Maux,d_Aa,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice));
1465: if (shift) {
1466: PetscCall(shift_diagonal(n,d_Maux,n,-shift));
1467: PetscCall(PetscLogGpuFlops(1.0*n));
1468: }
1469: #endif
1470: if (A!=B) PetscCall(MatDenseCUDARestoreArrayRead(A,(const PetscScalar**)&d_Aa));
1471: else PetscCallCUDA(cudaFree(d_Aa));
1473: /* estimate norm(A) and select the scaling factor */
1474: PetscCallCUBLAS(cublasXnrm2(cublasv2handle,n2,d_sMaux,one,&nrm));
1475: PetscCall(PetscLogGpuFlops(2.0*n*n));
1476: PetscCall(sexpm_params(nrm,&s,&k,&m));
1477: if (s==0 && k==1 && m==0) { /* exp(A) = I+A to eps! */
1478: if (shift) expshift = PetscExpReal(shift);
1479: PetscCall(shift_Cdiagonal(n,d_Maux,n,rone,rzero));
1480: if (shift) {
1481: PetscCallCUBLAS(cublasXscal(cublasv2handle,n2,&expshift,d_sMaux,one));
1482: PetscCall(PetscLogGpuFlops(1.0*(n+n2)));
1483: } else PetscCall(PetscLogGpuFlops(1.0*n));
1484: PetscCallCUDA(cudaMemcpy(d_Ba,d_sMaux,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice));
1485: PetscCallCUDA(cudaFree(d_isreal));
1486: PetscCallCUDA(cudaFree(d_sMaux));
1487: PetscCallCUDA(cudaFree(d_Maux));
1488: PetscCall(MatDenseCUDARestoreArrayWrite(B,&d_Ba));
1489: PetscFunctionReturn(PETSC_SUCCESS); /* quick return */
1490: }
1492: PetscCallCUDA(cudaMalloc((void **)&d_expmA,sizeof(PetscComplex)*n2));
1493: PetscCallCUDA(cudaMalloc((void **)&d_As,sizeof(PetscComplex)*n2));
1494: PetscCallCUDA(cudaMalloc((void **)&d_RR,sizeof(PetscComplex)*n2));
1495: d_expmA2 = d_expmA; d_RR2 = d_RR;
1496: PetscCall(PetscMalloc1(n,&piv));
1497: /* scale matrix */
1498: #if !defined(PETSC_USE_COMPLEX)
1499: PetscCall(copy_array2D_S2C(n,n,d_As,n,d_sMaux,n));
1500: #else
1501: PetscCallCUDA(cudaMemcpy(d_As,d_sMaux,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice));
1502: #endif
1503: scale = 1.0/PetscPowRealInt(2.0,s);
1504: PetscCallCUBLAS(cublasXCscal(cublasv2handle,n2,(const cuComplex *)&scale,(cuComplex *)d_As,one));
1505: PetscCall(SlepcLogGpuFlopsComplex(1.0*n2));
1507: /* evaluate Pade approximant (partial fraction or product form) */
1508: if (fn->method==3 || !m) { /* partial fraction */
1509: PetscCall(getcoeffs(k,m,&rsize,&psize,&remainsize,PETSC_TRUE));
1510: PetscCall(PetscBLASIntCast((PetscInt)PetscRealPartComplex(rsize),&irsize));
1511: PetscCall(PetscBLASIntCast((PetscInt)PetscRealPartComplex(psize),&ipsize));
1512: PetscCall(PetscBLASIntCast((PetscInt)PetscRealPartComplex(remainsize),&iremainsize));
1513: PetscCall(PetscMalloc3(irsize,&r,ipsize,&p,iremainsize,&remainterm));
1514: PetscCall(getcoeffs(k,m,r,p,remainterm,PETSC_FALSE));
1516: PetscCallCUDA(cudaMemset(d_expmA,0,sizeof(PetscComplex)*n2));
1517: #if !defined(PETSC_USE_COMPLEX)
1518: isreal = PETSC_TRUE;
1519: #else
1520: PetscCall(getisreal_array2D(n,n,d_Maux,n,d_isreal));
1521: PetscCallCUDA(cudaMemcpy(&isreal,d_isreal,sizeof(PetscBool),cudaMemcpyDeviceToHost));
1522: #endif
1523: if (isreal) {
1524: rsizediv2 = irsize/2;
1525: for (i=0;i<rsizediv2;i++) { /* use partial fraction to get R(As) */
1526: PetscCallCUDA(cudaMemcpy(d_Maux,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice));
1527: PetscCallCUDA(cudaMemset(d_RR,0,sizeof(PetscComplex)*n2));
1528: PetscCall(shift_Cdiagonal(n,d_Maux,n,-PetscRealPartComplex(p[2*i]),-PetscImaginaryPartComplex(p[2*i])));
1529: PetscCall(set_Cdiagonal(n,d_RR,n,PetscRealPartComplex(r[2*i]),PetscImaginaryPartComplex(r[2*i])));
1530: PetscCallMAGMA(magma_Cgesv_gpu,n,n,d_Maux,n,piv,d_RR,n);
1531: PetscCall(add_array2D_Conj(n,n,d_RR,n));
1532: PetscCallCUBLAS(cublasXCaxpy(cublasv2handle,n2,&cone,d_RR,one,d_expmA,one));
1533: /* shift(n) + gesv + axpy(n2) */
1534: PetscCall(SlepcLogGpuFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+2.0*n2));
1535: }
1537: mod = ipsize % 2;
1538: if (mod) {
1539: PetscCallCUDA(cudaMemcpy(d_Maux,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice));
1540: PetscCallCUDA(cudaMemset(d_RR,0,sizeof(PetscComplex)*n2));
1541: PetscCall(shift_Cdiagonal(n,d_Maux,n,-PetscRealPartComplex(p[ipsize-1]),-PetscImaginaryPartComplex(p[ipsize-1])));
1542: PetscCall(set_Cdiagonal(n,d_RR,n,PetscRealPartComplex(r[irsize-1]),PetscImaginaryPartComplex(r[irsize-1])));
1543: PetscCallMAGMA(magma_Cgesv_gpu,n,n,d_Maux,n,piv,d_RR,n);
1544: PetscCallCUBLAS(cublasXCaxpy(cublasv2handle,n2,&cone,d_RR,one,d_expmA,one));
1545: PetscCall(SlepcLogGpuFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+1.0*n2));
1546: }
1547: } else { /* complex */
1548: for (i=0;i<irsize;i++) { /* use partial fraction to get R(As) */
1549: PetscCallCUDA(cudaMemcpy(d_Maux,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice));
1550: PetscCallCUDA(cudaMemset(d_RR,0,sizeof(PetscComplex)*n2));
1551: PetscCall(shift_Cdiagonal(n,d_Maux,n,-PetscRealPartComplex(p[i]),-PetscImaginaryPartComplex(p[i])));
1552: PetscCall(set_Cdiagonal(n,d_RR,n,PetscRealPartComplex(r[i]),PetscImaginaryPartComplex(r[i])));
1553: PetscCallMAGMA(magma_Cgesv_gpu,n,n,d_Maux,n,piv,d_RR,n);
1554: PetscCallCUBLAS(cublasXCaxpy(cublasv2handle,n2,&cone,d_RR,one,d_expmA,one));
1555: PetscCall(SlepcLogGpuFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+1.0*n2));
1556: }
1557: }
1558: for (i=0;i<iremainsize;i++) {
1559: if (!i) {
1560: PetscCallCUDA(cudaMemset(d_RR,0,sizeof(PetscComplex)*n2));
1561: PetscCall(set_Cdiagonal(n,d_RR,n,PetscRealPartComplex(remainterm[iremainsize-1]),PetscImaginaryPartComplex(remainterm[iremainsize-1])));
1562: } else {
1563: PetscCallCUDA(cudaMemcpy(d_RR,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice));
1564: for (j=1;j<i;j++) {
1565: PetscCallCUBLAS(cublasXCgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&cone,d_RR,n,d_RR,n,&czero,d_Maux,n));
1566: SlepcSwap(d_RR,d_Maux,aux);
1567: PetscCall(SlepcLogGpuFlopsComplex(2.0*n*n*n));
1568: }
1569: PetscCallCUBLAS(cublasXCscal(cublasv2handle,n2,&remainterm[iremainsize-1-i],d_RR,one));
1570: PetscCall(SlepcLogGpuFlopsComplex(1.0*n2));
1571: }
1572: PetscCallCUBLAS(cublasXCaxpy(cublasv2handle,n2,&cone,d_RR,one,d_expmA,one));
1573: PetscCall(SlepcLogGpuFlopsComplex(1.0*n2));
1574: }
1575: PetscCall(PetscFree3(r,p,remainterm));
1576: } else { /* product form, default */
1577: PetscCall(getcoeffsproduct(k,m,&rsize,&psize,&mult,PETSC_TRUE));
1578: PetscCall(PetscBLASIntCast((PetscInt)PetscRealPartComplex(rsize),&irsize));
1579: PetscCall(PetscBLASIntCast((PetscInt)PetscRealPartComplex(psize),&ipsize));
1580: PetscCall(PetscMalloc2(irsize,&rootp,ipsize,&rootq));
1581: PetscCall(getcoeffsproduct(k,m,rootp,rootq,&mult,PETSC_FALSE));
1583: PetscCallCUDA(cudaMemset(d_expmA,0,sizeof(PetscComplex)*n2));
1584: PetscCall(set_Cdiagonal(n,d_expmA,n,rone,rzero)); /* initialize */
1585: minlen = PetscMin(irsize,ipsize);
1586: for (i=0;i<minlen;i++) {
1587: PetscCallCUDA(cudaMemcpy(d_RR,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice));
1588: PetscCall(shift_Cdiagonal(n,d_RR,n,-PetscRealPartComplex(rootp[i]),-PetscImaginaryPartComplex(rootp[i])));
1589: PetscCallCUBLAS(cublasXCgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&cone,d_RR,n,d_expmA,n,&czero,d_Maux,n));
1590: SlepcSwap(d_expmA,d_Maux,aux);
1591: PetscCallCUDA(cudaMemcpy(d_RR,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice));
1592: PetscCall(shift_Cdiagonal(n,d_RR,n,-PetscRealPartComplex(rootq[i]),-PetscImaginaryPartComplex(rootq[i])));
1593: PetscCallMAGMA(magma_Cgesv_gpu,n,n,d_RR,n,piv,d_expmA,n);
1594: /* shift(n) + gemm + shift(n) + gesv */
1595: PetscCall(SlepcLogGpuFlopsComplex(1.0*n+(2.0*n*n*n)+1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)));
1596: }
1597: /* extra enumerator */
1598: for (i=minlen;i<irsize;i++) {
1599: PetscCallCUDA(cudaMemcpy(d_RR,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice));
1600: PetscCall(shift_Cdiagonal(n,d_RR,n,-PetscRealPartComplex(rootp[i]),-PetscImaginaryPartComplex(rootp[i])));
1601: PetscCallCUBLAS(cublasXCgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&cone,d_RR,n,d_expmA,n,&czero,d_Maux,n));
1602: SlepcSwap(d_expmA,d_Maux,aux);
1603: PetscCall(SlepcLogGpuFlopsComplex(1.0*n+2.0*n*n*n));
1604: }
1605: /* extra denominator */
1606: for (i=minlen;i<ipsize;i++) {
1607: PetscCallCUDA(cudaMemcpy(d_RR,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice));
1608: PetscCall(shift_Cdiagonal(n,d_RR,n,-PetscRealPartComplex(rootq[i]),-PetscImaginaryPartComplex(rootq[i])));
1609: PetscCallMAGMA(magma_Cgesv_gpu,n,n,d_RR,n,piv,d_expmA,n);
1610: PetscCall(SlepcLogGpuFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)));
1611: }
1612: PetscCallCUBLAS(cublasXCscal(cublasv2handle,n2,&mult,d_expmA,one));
1613: PetscCall(SlepcLogGpuFlopsComplex(1.0*n2));
1614: PetscCall(PetscFree2(rootp,rootq));
1615: }
1617: #if !defined(PETSC_USE_COMPLEX)
1618: PetscCall(copy_array2D_C2S(n,n,d_Ba2,n,d_expmA,n));
1619: #else
1620: PetscCallCUDA(cudaMemcpy(d_Ba2,d_expmA,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice));
1621: #endif
1623: /* perform repeated squaring */
1624: for (i=0;i<s;i++) { /* final squaring */
1625: PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_Ba2,n,d_Ba2,n,&szero,d_sMaux,n));
1626: SlepcSwap(d_Ba2,d_sMaux,saux);
1627: PetscCall(PetscLogGpuFlops(2.0*n*n*n));
1628: }
1629: if (d_Ba2!=d_Ba) {
1630: PetscCallCUDA(cudaMemcpy(d_Ba,d_Ba2,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice));
1631: d_sMaux = d_Ba2;
1632: }
1633: if (shift) {
1634: expshift = PetscExpReal(shift);
1635: PetscCallCUBLAS(cublasXscal(cublasv2handle,n2,&expshift,d_Ba,one));
1636: PetscCall(PetscLogGpuFlops(1.0*n2));
1637: }
1639: PetscCall(PetscLogGpuTimeEnd());
1641: /* restore pointers */
1642: d_Maux = d_Maux2; d_expmA = d_expmA2; d_RR = d_RR2;
1643: PetscCall(MatDenseCUDARestoreArrayWrite(B,&d_Ba));
1644: PetscCallCUDA(cudaFree(d_isreal));
1645: PetscCallCUDA(cudaFree(d_sMaux));
1646: PetscCallCUDA(cudaFree(d_Maux));
1647: PetscCallCUDA(cudaFree(d_expmA));
1648: PetscCallCUDA(cudaFree(d_As));
1649: PetscCallCUDA(cudaFree(d_RR));
1650: PetscCall(PetscFree(piv));
1651: PetscFunctionReturn(PETSC_SUCCESS);
1652: }
1653: #endif /* PETSC_HAVE_MAGMA */
1654: #endif /* PETSC_HAVE_CUDA */
1656: static PetscErrorCode FNView_Exp(FN fn,PetscViewer viewer)
1657: {
1658: PetscBool isascii;
1659: char str[50];
1660: const char *methodname[] = {
1661: "scaling & squaring, [m/m] Pade approximant (Higham)",
1662: "scaling & squaring, [6/6] Pade approximant",
1663: "scaling & squaring, subdiagonal Pade approximant (product form)",
1664: "scaling & squaring, subdiagonal Pade approximant (partial fraction)"
1665: };
1666: const int nmeth=PETSC_STATIC_ARRAY_LENGTH(methodname);
1668: PetscFunctionBegin;
1669: PetscCall(PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&isascii));
1670: if (isascii) {
1671: if (fn->beta==(PetscScalar)1.0) {
1672: if (fn->alpha==(PetscScalar)1.0) PetscCall(PetscViewerASCIIPrintf(viewer," exponential: exp(x)\n"));
1673: else {
1674: PetscCall(SlepcSNPrintfScalar(str,sizeof(str),fn->alpha,PETSC_TRUE));
1675: PetscCall(PetscViewerASCIIPrintf(viewer," exponential: exp(%s*x)\n",str));
1676: }
1677: } else {
1678: PetscCall(SlepcSNPrintfScalar(str,sizeof(str),fn->beta,PETSC_TRUE));
1679: if (fn->alpha==(PetscScalar)1.0) PetscCall(PetscViewerASCIIPrintf(viewer," exponential: %s*exp(x)\n",str));
1680: else {
1681: PetscCall(PetscViewerASCIIPrintf(viewer," exponential: %s",str));
1682: PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE));
1683: PetscCall(SlepcSNPrintfScalar(str,sizeof(str),fn->alpha,PETSC_TRUE));
1684: PetscCall(PetscViewerASCIIPrintf(viewer,"*exp(%s*x)\n",str));
1685: PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE));
1686: }
1687: }
1688: if (fn->method<nmeth) PetscCall(PetscViewerASCIIPrintf(viewer," computing matrix functions with: %s\n",methodname[fn->method]));
1689: }
1690: PetscFunctionReturn(PETSC_SUCCESS);
1691: }
1693: SLEPC_EXTERN PetscErrorCode FNCreate_Exp(FN fn)
1694: {
1695: PetscFunctionBegin;
1696: fn->ops->evaluatefunction = FNEvaluateFunction_Exp;
1697: fn->ops->evaluatederivative = FNEvaluateDerivative_Exp;
1698: fn->ops->evaluatefunctionmat[0] = FNEvaluateFunctionMat_Exp_Higham;
1699: fn->ops->evaluatefunctionmat[1] = FNEvaluateFunctionMat_Exp_Pade;
1700: fn->ops->evaluatefunctionmat[2] = FNEvaluateFunctionMat_Exp_GuettelNakatsukasa; /* product form */
1701: fn->ops->evaluatefunctionmat[3] = FNEvaluateFunctionMat_Exp_GuettelNakatsukasa; /* partial fraction */
1702: #if defined(PETSC_HAVE_CUDA)
1703: fn->ops->evaluatefunctionmatcuda[1] = FNEvaluateFunctionMat_Exp_Pade_CUDA;
1704: #if defined(PETSC_HAVE_MAGMA)
1705: fn->ops->evaluatefunctionmatcuda[0] = FNEvaluateFunctionMat_Exp_Higham_CUDAm;
1706: fn->ops->evaluatefunctionmatcuda[1] = FNEvaluateFunctionMat_Exp_Pade_CUDAm;
1707: fn->ops->evaluatefunctionmatcuda[2] = FNEvaluateFunctionMat_Exp_GuettelNakatsukasa_CUDAm; /* product form */
1708: fn->ops->evaluatefunctionmatcuda[3] = FNEvaluateFunctionMat_Exp_GuettelNakatsukasa_CUDAm; /* partial fraction */
1709: #endif
1710: #endif
1711: fn->ops->view = FNView_Exp;
1712: PetscFunctionReturn(PETSC_SUCCESS);
1713: }