Actual source code: fnutilcuda.cu

slepc-3.21.2 2024-09-25
/*
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   SLEPc - Scalable Library for Eigenvalue Problem Computations
   Copyright (c) 2002-, Universitat Politecnica de Valencia, Spain

   This file is part of SLEPc.
   SLEPc is distributed under a 2-clause BSD license (see LICENSE).
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
*/
/*
   Utility subroutines common to several implementations
*/

#include "fnutilcuda.h"
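
/*
   All kernels below compute a flat index from the CUDA block and thread ids.
   When the matrix is too large for a single grid, the host wrappers launch
   the same kernel several times, passing the launch counter (xcount, plus
   ycount for the 2D kernels) so that each launch covers a different slice of
   the index space.
*/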

/* Write v at each diagonal entry covered by this launch slice */
static __global__ void set_diagonal_kernel(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar v,PetscInt xcount)
{
  PetscInt x;
  x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;

  if (x<n) {
    d_pa[x+x*ld] = v;
  }
}

/* Set the diagonal of the n x n device matrix d_pa, with leading dimension ld, to the value v */
__host__ PetscErrorCode set_diagonal(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar v)
{
  PetscInt    i,dimGrid_xcount;
  dim3        blocks3d,threads3d;

  PetscFunctionBegin;
  PetscCall(SlepcKernelSetGrid1D(n,&blocks3d,&threads3d,&dimGrid_xcount));
  for (i=0;i<dimGrid_xcount;i++) {
    set_diagonal_kernel<<<blocks3d,threads3d>>>(n,d_pa,ld,v,i);
    PetscCallCUDA(cudaGetLastError());
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
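
/*
   Usage sketch (illustrative, not part of the SLEPc sources): build an
   identity matrix in device memory. The helper name and the choice of
   leading dimension ld=n are assumptions for the example.
*/
#if 0
static PetscErrorCode example_unit_diagonal(PetscInt n)
{
  PetscScalar *d_A;

  PetscFunctionBegin;
  PetscCallCUDA(cudaMalloc((void**)&d_A,n*n*sizeof(PetscScalar)));
  PetscCallCUDA(cudaMemset(d_A,0,n*n*sizeof(PetscScalar)));
  PetscCall(set_diagonal(n,d_A,n,1.0));  /* ones on the diagonal of a dense n x n layout */
  PetscCallCUDA(cudaFree(d_A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif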

/* Write the complex value vr+vi*i at each diagonal entry covered by this launch slice */
__global__ void set_Cdiagonal_kernel(PetscInt n,PetscComplex *d_pa,PetscInt ld,PetscReal vr,PetscReal vi,PetscInt xcount)
{
  PetscInt x;
  x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;

  if (x<n) {
    d_pa[x+x*ld] = thrust::complex<PetscReal>(vr,vi);
  }
}

/* Set the diagonal of the complex device matrix d_pa to the value vr+vi*i */
__host__ PetscErrorCode set_Cdiagonal(PetscInt n,PetscComplex *d_pa,PetscInt ld,PetscReal vr,PetscReal vi)
{
  PetscInt    i,dimGrid_xcount;
  dim3        blocks3d,threads3d;

  PetscFunctionBegin;
  PetscCall(SlepcKernelSetGrid1D(n,&blocks3d,&threads3d,&dimGrid_xcount));
  for (i=0;i<dimGrid_xcount;i++) {
    set_Cdiagonal_kernel<<<blocks3d,threads3d>>>(n,d_pa,ld,vr,vi,i);
    PetscCallCUDA(cudaGetLastError());
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Add v to each diagonal entry covered by this launch slice */
__global__ void shift_diagonal_kernel(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar v,PetscInt xcount)
{
  PetscInt x;
  x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;

  if (x<n) {
    d_pa[x+x*ld] += v;
  }
}

/* Shift the diagonal of the device matrix d_pa by the value v */
__host__ PetscErrorCode shift_diagonal(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar v)
{
  PetscInt    i,dimGrid_xcount;
  dim3        blocks3d,threads3d;

  PetscFunctionBegin;
  PetscCall(SlepcKernelSetGrid1D(n,&blocks3d,&threads3d,&dimGrid_xcount));
  for (i=0;i<dimGrid_xcount;i++) {
    shift_diagonal_kernel<<<blocks3d,threads3d>>>(n,d_pa,ld,v,i);
    PetscCallCUDA(cudaGetLastError());
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Add the complex value vr+vi*i to each diagonal entry covered by this launch slice */
__global__ void shift_Cdiagonal_kernel(PetscInt n,PetscComplex *d_pa,PetscInt ld,PetscReal vr,PetscReal vi,PetscInt xcount)
{
  PetscInt x;
  x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;

  if (x<n) {
    d_pa[x+x*ld] += thrust::complex<PetscReal>(vr,vi);
  }
}

/* Shift the diagonal of the complex device matrix d_pa by the value vr+vi*i */
__host__ PetscErrorCode shift_Cdiagonal(PetscInt n,PetscComplex *d_pa,PetscInt ld,PetscReal vr,PetscReal vi)
{
  PetscInt    i,dimGrid_xcount;
  dim3        blocks3d,threads3d;

  PetscFunctionBegin;
  PetscCall(SlepcKernelSetGrid1D(n,&blocks3d,&threads3d,&dimGrid_xcount));
  for (i=0;i<dimGrid_xcount;i++) {
    shift_Cdiagonal_kernel<<<blocks3d,threads3d>>>(n,d_pa,ld,vr,vi,i);
    PetscCallCUDA(cudaGetLastError());
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
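
/*
   Usage sketch (illustrative, not part of the SLEPc sources): a caller
   holding a PetscComplex shift can split it into the real/imaginary pair
   this interface expects. The wrapper name is an assumption.
*/
#if 0
static PetscErrorCode example_shift_complex(PetscInt n,PetscComplex *d_A,PetscInt ld,PetscComplex alpha)
{
  PetscFunctionBegin;
  PetscCall(shift_Cdiagonal(n,d_A,ld,PetscRealPartComplex(alpha),PetscImaginaryPartComplex(alpha)));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif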

/* Copy a tile of the scalar matrix d_pb into the complex matrix d_pa */
__global__ void copy_array2D_S2C_kernel(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda,PetscScalar *d_pb,PetscInt ldb,PetscInt xcount,PetscInt ycount)
{
  PetscInt x,y,i,j;

  x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;
  y = ycount*gridDim.y*blockDim.y+blockIdx.y*blockDim.y+threadIdx.y;
  for (j=y;j<n;j+=blockDim.y) {
    for (i=x;i<m;i+=blockDim.x) {
      d_pa[i+j*lda] = d_pb[i+j*ldb];
    }
  }
}

/* Copy the m x n scalar device matrix d_pb into the complex device matrix d_pa */
__host__ PetscErrorCode copy_array2D_S2C(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda,PetscScalar *d_pb,PetscInt ldb)
{
  PetscInt    i,j,dimGrid_xcount,dimGrid_ycount;
  dim3        blocks3d,threads3d;

  PetscFunctionBegin;
  PetscCall(SlepcKernelSetGrid2DTiles(m,n,&blocks3d,&threads3d,&dimGrid_xcount,&dimGrid_ycount));
  for (i=0;i<dimGrid_xcount;i++) {
    for (j=0;j<dimGrid_ycount;j++) {
      copy_array2D_S2C_kernel<<<blocks3d,threads3d>>>(m,n,d_pa,lda,d_pb,ldb,i,j);
      PetscCallCUDA(cudaGetLastError());
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
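
/*
   Usage sketch (illustrative, not part of the SLEPc sources): widen a scalar
   device matrix into freshly allocated complex storage, using m as the
   leading dimension of the new buffer. Names are assumptions.
*/
#if 0
static PetscErrorCode example_widen_to_complex(PetscInt m,PetscInt n,PetscScalar *d_A,PetscInt lda,PetscComplex **d_B)
{
  PetscFunctionBegin;
  PetscCallCUDA(cudaMalloc((void**)d_B,m*n*sizeof(PetscComplex)));
  PetscCall(copy_array2D_S2C(m,n,*d_B,m,d_A,lda));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif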

/* Copy the real part of a tile of the complex matrix d_pb into the scalar matrix d_pa */
__global__ void copy_array2D_C2S_kernel(PetscInt m,PetscInt n,PetscScalar *d_pa,PetscInt lda,PetscComplex *d_pb,PetscInt ldb,PetscInt xcount,PetscInt ycount)
{
  PetscInt x,y,i,j;

  x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;
  y = ycount*gridDim.y*blockDim.y+blockIdx.y*blockDim.y+threadIdx.y;
  for (j=y;j<n;j+=blockDim.y) {
    for (i=x;i<m;i+=blockDim.x) {
      d_pa[i+j*lda] = PetscRealPartComplex(d_pb[i+j*ldb]);
    }
  }
}

/* Copy the real part of the m x n complex device matrix d_pb into the scalar device matrix d_pa */
__host__ PetscErrorCode copy_array2D_C2S(PetscInt m,PetscInt n,PetscScalar *d_pa,PetscInt lda,PetscComplex *d_pb,PetscInt ldb)
{
  PetscInt    i,j,dimGrid_xcount,dimGrid_ycount;
  dim3        blocks3d,threads3d;

  PetscFunctionBegin;
  PetscCall(SlepcKernelSetGrid2DTiles(m,n,&blocks3d,&threads3d,&dimGrid_xcount,&dimGrid_ycount));
  for (i=0;i<dimGrid_xcount;i++) {
    for (j=0;j<dimGrid_ycount;j++) {
      copy_array2D_C2S_kernel<<<blocks3d,threads3d>>>(m,n,d_pa,lda,d_pb,ldb,i,j);
      PetscCallCUDA(cudaGetLastError());
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Add the conjugate of each entry to itself, i.e. replace a_ij by a_ij+conj(a_ij) = 2*Re(a_ij) */
__global__ void add_array2D_Conj_kernel(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda,PetscInt xcount,PetscInt ycount)
{
  PetscInt x,y,i,j;

  x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;
  y = ycount*gridDim.y*blockDim.y+blockIdx.y*blockDim.y+threadIdx.y;
  for (j=y;j<n;j+=blockDim.y) {
    for (i=x;i<m;i+=blockDim.x) {
      d_pa[i+j*lda] += PetscConj(d_pa[i+j*lda]);
    }
  }
}

/* Add the conjugate of the m x n complex device matrix d_pa to itself, entrywise */
__host__ PetscErrorCode add_array2D_Conj(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda)
{
  PetscInt    i,j,dimGrid_xcount,dimGrid_ycount;
  dim3        blocks3d,threads3d;

  PetscFunctionBegin;
  PetscCall(SlepcKernelSetGrid2DTiles(m,n,&blocks3d,&threads3d,&dimGrid_xcount,&dimGrid_ycount));
  for (i=0;i<dimGrid_xcount;i++) {
    for (j=0;j<dimGrid_ycount;j++) {
      add_array2D_Conj_kernel<<<blocks3d,threads3d>>>(m,n,d_pa,lda,i,j);
      PetscCallCUDA(cudaGetLastError());
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Clear *d_result if any entry of this tile has a nonzero imaginary part;
   all racing writers store the same value, so the unsynchronized update is benign */
__global__ void getisreal_array2D_kernel(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda,PetscBool *d_result,PetscInt xcount,PetscInt ycount)
{
  PetscInt x,y,i,j;

  x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;
  y = ycount*gridDim.y*blockDim.y+blockIdx.y*blockDim.y+threadIdx.y;
  if (*d_result) {
    for (j=y;j<n;j+=blockDim.y) {
      for (i=x;i<m;i+=blockDim.x) {
        if (PetscImaginaryPartComplex(d_pa[i+j*lda])) *d_result=PETSC_FALSE;
      }
    }
  }
}

/* Determine whether the m x n complex device matrix d_pa has zero imaginary part;
   the PetscBool answer is left in device memory at d_result */
__host__ PetscErrorCode getisreal_array2D(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda,PetscBool *d_result)
{
  PetscInt    i,j,dimGrid_xcount,dimGrid_ycount;
  PetscBool   result=PETSC_TRUE;
  dim3        blocks3d,threads3d;

  PetscFunctionBegin;
  PetscCallCUDA(cudaMemcpy(d_result,&result,sizeof(PetscBool),cudaMemcpyHostToDevice));
  PetscCall(SlepcKernelSetGrid2DTiles(m,n,&blocks3d,&threads3d,&dimGrid_xcount,&dimGrid_ycount));
  for (i=0;i<dimGrid_xcount;i++) {
    for (j=0;j<dimGrid_ycount;j++) {
      getisreal_array2D_kernel<<<blocks3d,threads3d>>>(m,n,d_pa,lda,d_result,i,j);
      PetscCallCUDA(cudaGetLastError());
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
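
/*
   Usage sketch (illustrative, not part of the SLEPc sources): the answer is
   left in device memory, so a caller that wants it on the host allocates the
   device flag and copies it back. Names are assumptions.
*/
#if 0
static PetscErrorCode example_is_real(PetscInt m,PetscInt n,PetscComplex *d_A,PetscInt lda,PetscBool *isreal)
{
  PetscBool *d_flag;

  PetscFunctionBegin;
  PetscCallCUDA(cudaMalloc((void**)&d_flag,sizeof(PetscBool)));
  PetscCall(getisreal_array2D(m,n,d_A,lda,d_flag));
  PetscCallCUDA(cudaMemcpy(isreal,d_flag,sizeof(PetscBool),cudaMemcpyDeviceToHost));
  PetscCallCUDA(cudaFree(d_flag));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif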

/* Compute the product of the diagonal entries covered by each block with a
   shared-memory tree reduction; one partial product per block is stored in d_v */
__global__ void mult_diagonal_kernel(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar *d_v,PetscInt xcount)
{
  PetscInt               x;
  unsigned int           bs=blockDim.x;
  __shared__ PetscScalar shrdres[SLEPC_BLOCK_SIZE_X];

  x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;
  shrdres[threadIdx.x] = (x<n)? d_pa[x+ld*x]: 1.0;  /* pad with the multiplicative identity */
  __syncthreads();

  /* reduction */
  if ((bs >= 512) && (threadIdx.x < 256)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x + 256]; } __syncthreads();
  if ((bs >= 256) && (threadIdx.x < 128)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x + 128]; } __syncthreads();
  if ((bs >= 128) && (threadIdx.x <  64)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x +  64]; } __syncthreads();
  if ((bs >=  64) && (threadIdx.x <  32)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x +  32]; } __syncthreads();
  if ((bs >=  32) && (threadIdx.x <  16)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x +  16]; } __syncthreads();
  if ((bs >=  16) && (threadIdx.x <   8)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x +   8]; } __syncthreads();
  if ((bs >=   8) && (threadIdx.x <   4)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x +   4]; } __syncthreads();
  if ((bs >=   4) && (threadIdx.x <   2)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x +   2]; } __syncthreads();
  if ((bs >=   2) && (threadIdx.x <   1)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x +   1]; } __syncthreads();
  if (threadIdx.x == 0) d_v[blockIdx.x] = shrdres[threadIdx.x];
}

/* Compute the product of the diagonal entries of the device matrix d_pa,
   returning the result in host memory at v */
__host__ PetscErrorCode mult_diagonal(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar *v)
{
  PetscInt    i,j,dimGrid_xcount;
  PetscScalar *part,*d_part;
  dim3        blocks3d,threads3d;

  PetscFunctionBegin;
  PetscCall(SlepcKernelSetGrid1D(n,&blocks3d,&threads3d,&dimGrid_xcount));
  PetscCallCUDA(cudaMalloc((void **)&d_part,sizeof(PetscScalar)*blocks3d.x));
  PetscCall(PetscMalloc1(blocks3d.x,&part));
  for (i=0;i<dimGrid_xcount;i++) {
    mult_diagonal_kernel<<<blocks3d,threads3d>>>(n,d_pa,ld,d_part,i);
    PetscCallCUDA(cudaGetLastError());
    PetscCallCUDA(cudaMemcpy(part,d_part,blocks3d.x*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
    /* accumulate the per-block partial products on the host */
    if (i == 0) {
      *v = part[0];
      j=1;
    } else j=0;
    for (; j<(int)blocks3d.x; j++) *v *= part[j];
  }
  PetscCallCUDA(cudaFree(d_part));
  PetscCall(PetscFree(part));
  PetscFunctionReturn(PETSC_SUCCESS);
}
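
/*
   Usage sketch (illustrative, not part of the SLEPc sources): since the
   determinant of a triangular matrix is the product of its diagonal entries,
   mult_diagonal() returns it directly as a single host-side scalar. The
   wrapper name is an assumption.
*/
#if 0
static PetscErrorCode example_triangular_determinant(PetscInt n,PetscScalar *d_T,PetscInt ld,PetscScalar *det)
{
  PetscFunctionBegin;
  PetscCall(mult_diagonal(n,d_T,ld,det));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif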