Line data Source code
1 : /*
2 : - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
3 : SLEPc - Scalable Library for Eigenvalue Problem Computations
4 : Copyright (c) 2002-, Universitat Politecnica de Valencia, Spain
5 :
6 : This file is part of SLEPc.
7 : SLEPc is distributed under a 2-clause BSD license (see LICENSE).
8 : - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
9 : */
10 : /*
11 : BDC - Block-divide and conquer (see description in README file)
12 : */
13 :
14 : #include <slepc/private/dsimpl.h>
15 : #include <slepcblaslapack.h>
16 :
17 21 : PetscErrorCode BDC_dlaed3m_(const char *jobz,const char *defl,PetscBLASInt k,PetscBLASInt n,
18 : PetscBLASInt n1,PetscReal *d,PetscReal *q,PetscBLASInt ldq,
19 : PetscReal rho,PetscReal *dlamda,PetscReal *q2,PetscBLASInt *indx,
20 : PetscBLASInt *ctot,PetscReal *w,PetscReal *s,PetscBLASInt *info,
21 : PetscBLASInt jobz_len,PetscBLASInt defl_len)
22 : {
23 : /* -- Routine written in LAPACK version 3.0 style -- */
24 : /* *************************************************** */
25 : /* Written by */
26 : /* Michael Moldaschl and Wilfried Gansterer */
27 : /* University of Vienna */
28 : /* last modification: March 16, 2014 */
29 :
30 : /* Small adaptations of original code written by */
31 : /* Wilfried Gansterer and Bob Ward, */
32 : /* Department of Computer Science, University of Tennessee */
33 : /* see https://doi.org/10.1137/S1064827501399432 */
34 : /* *************************************************** */
35 :
36 : /* Purpose */
37 : /* ======= */
38 :
39 : /* DLAED3M finds the roots of the secular equation, as defined by the */
40 : /* values in D, W, and RHO, between 1 and K. It makes the */
41 : /* appropriate calls to DLAED4 and then updates the eigenvectors by */
42 : /* multiplying the matrix of eigenvectors of the pair of eigensystems */
43 : /* being combined by the matrix of eigenvectors of the K-by-K system */
44 : /* which is solved here. */
45 :
46 : /* This code makes very mild assumptions about floating point */
47 : /* arithmetic. It will work on machines with a guard digit in */
48 : /* add/subtract, or on those binary machines without guard digits */
49 : /* which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. */
50 : /* It could conceivably fail on hexadecimal or decimal machines */
51 : /* without guard digits, but we know of none. */
52 :
53 : /* Arguments */
54 : /* ========= */
55 :
56 : /* JOBZ (input) CHARACTER*1 */
57 : /* = 'N': Do not accumulate eigenvectors (not implemented); */
58 : /* = 'D': Do accumulate eigenvectors in the divide-and-conquer */
59 : /* process. */
60 :
61 : /* DEFL (input) CHARACTER*1 */
62 : /* = '0': No deflation happened in DSRTDF */
63 : /* = '1': Some deflation happened in DSRTDF (and therefore some */
64 : /* Givens rotations need to be applied to the computed */
65 : /* eigenvector matrix Q) */
66 :
67 : /* K (input) INTEGER */
68 : /* The number of terms in the rational function to be solved by */
69 : /* DLAED4. 0 <= K <= N. */
70 :
71 : /* N (input) INTEGER */
72 : /* The number of rows and columns in the Q matrix. */
73 : /* N >= K (deflation may result in N>K). */
74 :
75 : /* N1 (input) INTEGER */
76 : /* The location of the last eigenvalue in the leading submatrix. */
77 : /* min(1,N) <= N1 <= max(1,N-1). */
78 :
79 : /* D (output) DOUBLE PRECISION array, dimension (N) */
80 : /* D(I) contains the updated eigenvalues for */
81 : /* 1 <= I <= K. */
82 :
83 : /* Q (output) DOUBLE PRECISION array, dimension (LDQ,N) */
84 : /* Initially the first K columns are used as workspace. */
85 : /* On output the columns 1 to K contain */
86 : /* the updated eigenvectors. */
87 :
88 : /* LDQ (input) INTEGER */
89 : /* The leading dimension of the array Q. LDQ >= max(1,N). */
90 :
91 : /* RHO (input) DOUBLE PRECISION */
92 : /* The value of the parameter in the rank one update equation. */
93 : /* RHO >= 0 required. */
94 :
95 : /* DLAMDA (input/output) DOUBLE PRECISION array, dimension (K) */
96 : /* The first K elements of this array contain the old roots */
97 : /* of the deflated updating problem. These are the poles */
98 : /* of the secular equation. May be changed on output by */
99 : /* having lowest order bit set to zero on Cray X-MP, Cray Y-MP, */
100 : /* Cray-2, or Cray C-90, as described above. */
101 :
102 : /* Q2 (input) DOUBLE PRECISION array, dimension (LDQ2, N) */
103 : /* The first K columns of this matrix contain the non-deflated */
104 : /* eigenvectors for the split problem. */
105 :
106 : /* INDX (input) INTEGER array, dimension (N) */
107 : /* The permutation used to arrange the columns of the deflated */
108 : /* Q matrix into three groups (see DLAED2). */
109 : /* The rows of the eigenvectors found by DLAED4 must be likewise */
110 : /* permuted before the matrix multiply can take place. */
111 :
112 : /* CTOT (input) INTEGER array, dimension (4) */
113 : /* A count of the total number of the various types of columns */
114 : /* in Q, as described in INDX. The fourth column type is any */
115 : /* column which has been deflated. */
116 :
117 : /* W (input/output) DOUBLE PRECISION array, dimension (K) */
118 : /* The first K elements of this array contain the components */
119 : /* of the deflation-adjusted updating vector. Destroyed on */
120 : /* output. */
121 :
122 : /* S (workspace) DOUBLE PRECISION array, dimension */
123 : /* (MAX(CTOT(1)+CTOT(2),CTOT(2)+CTOT(3)) + 1)*K */
124 : /* Will contain parts of the eigenvectors of the repaired matrix */
125 : /* which will be multiplied by the previously accumulated */
126 : /* eigenvectors to update the system. This array is a major */
127 : /* source of workspace requirements ! */
128 :
129 : /* INFO (output) INTEGER */
130 : /* = 0: successful exit. */
131 : /* < 0: if INFO = -i, the i-th argument had an illegal value. */
132 : /* > 0: if INFO = i, eigenpair i was not computed successfully */
133 :
134 : /* Further Details */
135 : /* =============== */
136 :
137 : /* Based on code written by */
138 : /* Wilfried Gansterer and Bob Ward, */
139 : /* Department of Computer Science, University of Tennessee */
140 : /* Based on the design of the LAPACK code DLAED3 with small modifications */
141 : /* (Note that in contrast to the original DLAED3, this routine */
142 : /* DOES NOT require that N1 <= N/2) */
143 :
144 : /* Based on contributions by */
145 : /* Jeff Rutter, Computer Science Division, University of California */
146 : /* at Berkeley, USA */
147 : /* Modified by Francoise Tisseur, University of Tennessee. */
148 :
149 : /* ===================================================================== */
150 :
151 21 : PetscReal temp, done = 1.0, dzero = 0.0;
152 21 : PetscBLASInt i, j, n2, n12, ii, n23, iq2, i1, one=1;
153 :
154 21 : PetscFunctionBegin;
155 21 : *info = 0;
156 :
157 21 : if (k < 0) *info = -3;
158 21 : else if (n < k) *info = -4;
159 21 : else if (n1 < PetscMin(1,n) || n1 > PetscMax(1,n)) *info = -5;
160 21 : else if (ldq < PetscMax(1,n)) *info = -8;
161 21 : else if (rho < 0.) *info = -9;
162 21 : PetscCheck(!*info,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong argument %" PetscBLASInt_FMT " in DLAED3M",-(*info));
163 :
164 : /* Quick return if possible */
165 :
166 21 : if (k == 0) PetscFunctionReturn(PETSC_SUCCESS);
167 :
168 : /* Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can */
169 : /* be computed with high relative accuracy (barring over/underflow). */
170 : /* This is a problem on machines without a guard digit in */
171 : /* add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2). */
172 : /* The following code replaces DLAMDA(I) by 2*DLAMDA(I)-DLAMDA(I), */
173 : /* which on any of these machines zeros out the bottommost */
174 : /* bit of DLAMDA(I) if it is 1; this makes the subsequent */
175 : /* subtractions DLAMDA(I)-DLAMDA(J) unproblematic when cancellation */
176 : /* occurs. On binary machines with a guard digit (almost all */
177 : /* machines) it does not change DLAMDA(I) at all. On hexadecimal */
178 : /* and decimal machines with a guard digit, it slightly */
179 : /* changes the bottommost bits of DLAMDA(I). It does not account */
180 : /* for hexadecimal or decimal machines without guard digits */
181 : /* (we know of none). We use a subroutine call to compute */
182 : /* 2*DLAMBDA(I) to prevent optimizing compilers from eliminating */
183 : /* this code. */
184 :
185 181 : for (i = 0; i < k; ++i) {
186 160 : dlamda[i] = LAPACKlamc3_(&dlamda[i], &dlamda[i]) - dlamda[i];
187 : }
188 :
189 181 : for (j = 1; j <= k; ++j) {
190 :
191 : /* ....calling DLAED4 for eigenpair J.... */
192 :
193 160 : PetscCallBLAS("LAPACKlaed4",LAPACKlaed4_(&k, &j, dlamda, w, &q[(j-1)*ldq], &rho, &d[j-1], info));
194 160 : SlepcCheckLapackInfo("laed4",*info);
195 :
196 160 : if (j < k) {
197 :
198 : /* If the zero finder terminated properly, but the computed */
199 : /* eigenvalues are not ordered, issue an error statement */
200 : /* but continue computation. */
201 :
202 139 : PetscCheck(dlamda[j-1]<dlamda[j],PETSC_COMM_SELF,PETSC_ERR_FP,"DLAMDA(%" PetscBLASInt_FMT ") is greater or equal than DLAMDA(%" PetscBLASInt_FMT ")", j, j+1);
203 160 : PetscCheck(d[j-1]>=dlamda[j-1] && d[j-1]<=dlamda[j],PETSC_COMM_SELF,PETSC_ERR_FP,"DLAMDA(%" PetscBLASInt_FMT ") = %g D(%" PetscBLASInt_FMT ") = %g DLAMDA(%" PetscBLASInt_FMT ") = %g", j, (double)dlamda[j-1], j, (double)d[j-1], j+1, (double)dlamda[j]);
204 : }
205 : }
206 :
207 21 : if (k == 1) goto L110;
208 :
209 21 : if (k == 2) {
210 :
211 : /* permute the components of Q(:,J) (the information returned by DLAED4 */
212 : /* necessary to construct the eigenvectors) according to the permutation */
213 : /* stored in INDX, resulting from deflation */
214 :
215 6 : for (j = 0; j < k; ++j) {
216 4 : w[0] = q[0+j*ldq];
217 4 : w[1] = q[1+j*ldq];
218 4 : ii = indx[0];
219 4 : q[0+j*ldq] = w[ii-1];
220 4 : ii = indx[1];
221 4 : q[1+j*ldq] = w[ii-1];
222 : }
223 2 : goto L110;
224 : }
225 :
226 : /* ....K.GE.3.... */
227 : /* Compute updated W (used for computing the eigenvectors corresponding */
228 : /* to the previously computed eigenvalues). */
229 :
230 19 : PetscCallBLAS("BLAScopy",BLAScopy_(&k, w, &one, s, &one));
231 :
232 : /* Initialize W(I) = Q(I,I) */
233 :
234 19 : i1 = ldq + 1;
235 19 : PetscCallBLAS("BLAScopy",BLAScopy_(&k, q, &i1, w, &one));
236 175 : for (j = 0; j < k; ++j) {
237 852 : for (i = 0; i < j; ++i) {
238 696 : w[i] *= q[i+j*ldq] / (dlamda[i] - dlamda[j]);
239 : }
240 852 : for (i = j + 1; i < k; ++i) {
241 696 : w[i] *= q[i+j*ldq] / (dlamda[i] - dlamda[j]);
242 : }
243 : }
244 175 : for (i = 0; i < k; ++i) {
245 156 : temp = PetscSqrtReal(-w[i]);
246 156 : if (temp<0) temp = -temp;
247 156 : w[i] = (s[i] >= 0) ? temp : -temp;
248 : }
249 :
250 : /* Compute eigenvectors of the modified rank-1 modification (using the */
251 : /* vector W). */
252 :
253 175 : for (j = 0; j < k; ++j) {
254 1704 : for (i = 0; i < k; ++i) {
255 1548 : s[i] = w[i] / q[i+j*ldq];
256 : }
257 156 : temp = BLASnrm2_(&k, s, &one);
258 1860 : for (i = 0; i < k; ++i) {
259 :
260 : /* apply the permutation resulting from deflation as stored */
261 : /* in INDX */
262 :
263 1548 : ii = indx[i];
264 1548 : q[i+j*ldq] = s[ii-1] / temp;
265 : }
266 : }
267 :
268 : /* ************************************************************************** */
269 :
270 : /* ....updating the eigenvectors.... */
271 :
272 19 : L110:
273 :
274 21 : n2 = n - n1;
275 21 : n12 = ctot[0] + ctot[1];
276 21 : n23 = ctot[1] + ctot[2];
277 21 : if (*(unsigned char *)jobz == 'D') {
278 :
279 : /* Compute the updated eigenvectors. (NOTE that every call of */
280 : /* DGEMM requires three DISTINCT arrays) */
281 :
282 : /* copy Q(CTOT(1)+1:K,1:K) to S */
283 :
284 523 : for (j=0;j<k;j++) for (i=0;i<n23;i++) s[i+j*n23] = q[ctot[0]+i+j*ldq];
285 21 : iq2 = n1 * n12 + 1;
286 :
287 21 : if (n23 != 0) {
288 :
289 : /* multiply the second part of Q2 (the eigenvectors of the */
290 : /* lower block) with S and write the result into the lower part of */
291 : /* Q, i.e., Q(N1+1:N,1:K) */
292 :
293 7 : PetscCallBLAS("BLASgemm",BLASgemm_("N", "N", &n2, &k, &n23, &done,
294 : &q2[iq2-1], &n2, s, &n23, &dzero, &q[n1], &ldq));
295 : } else {
296 120 : for (j=0;j<k;j++) for (i=0;i<n2;i++) q[n1+i+j*ldq] = 0.0;
297 : }
298 :
299 : /* copy Q(1:CTOT(1)+CTOT(2),1:K) to S */
300 :
301 1557 : for (j=0;j<k;j++) for (i=0;i<n12;i++) s[i+j*n12] = q[i+j*ldq];
302 :
303 21 : if (n12 != 0) {
304 :
305 : /* multiply the first part of Q2 (the eigenvectors of the */
306 : /* upper block) with S and write the result into the upper part of */
307 : /* Q, i.e., Q(1:N1,1:K) */
308 :
309 21 : PetscCallBLAS("BLASgemm",BLASgemm_("N", "N", &n1, &k, &n12, &done,
310 : q2, &n1, s, &n12, &dzero, q, &ldq));
311 : } else {
312 0 : for (j=0;j<k;j++) for (i=0;i<n1;i++) q[i+j*ldq] = 0.0;
313 : }
314 : }
315 21 : PetscFunctionReturn(PETSC_SUCCESS);
316 : }
|