/*
   fnutilcuda.cu  (slepc-3.21.2, 2024-09-25)

   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   SLEPc - Scalable Library for Eigenvalue Problem Computations
   Copyright (c) 2002-, Universitat Politecnica de Valencia, Spain

   This file is part of SLEPc.
   SLEPc is distributed under a 2-clause BSD license (see LICENSE).
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
*/
/*
   Utility subroutines common to several impls
*/

#include "fnutilcuda.h"
/*
   Kernel: write the scalar v on the main diagonal of the n x n device
   matrix d_pa (leading dimension ld).  xcount offsets the flat thread
   index when the work is split over several consecutive launches.
*/
static __global__ void set_diagonal_kernel(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar v,PetscInt xcount)
{
  const PetscInt idx = xcount*gridDim.x*blockDim.x + blockIdx.x*blockDim.x + threadIdx.x;

  if (idx < n) d_pa[idx+idx*ld] = v;
}
/*
   Set every diagonal entry of the n x n device matrix d_pa (leading
   dimension ld) to the scalar v.
*/
__host__ PetscErrorCode set_diagonal(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar v)
{
  PetscInt chunk,nchunks;
  dim3     grid,block;

  PetscFunctionBegin;
  PetscCall(SlepcKernelSetGrid1D(n,&grid,&block,&nchunks));
  /* a very large n may require several consecutive kernel launches */
  for (chunk=0;chunk<nchunks;chunk++) {
    set_diagonal_kernel<<<grid,block>>>(n,d_pa,ld,v,chunk);
    PetscCallCUDA(cudaGetLastError());
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
   Kernel: write the complex value vr+vi*i on the main diagonal of the
   n x n complex device matrix d_pa (leading dimension ld).  xcount
   offsets the flat thread index across split launches.
*/
__global__ void set_Cdiagonal_kernel(PetscInt n,PetscComplex *d_pa,PetscInt ld,PetscReal vr,PetscReal vi,PetscInt xcount)
{
  const PetscInt idx = xcount*gridDim.x*blockDim.x + blockIdx.x*blockDim.x + threadIdx.x;

  if (idx < n) d_pa[idx+idx*ld] = thrust::complex<PetscReal>(vr, vi);
}
/*
   Set every diagonal entry of the n x n complex device matrix d_pa
   (leading dimension ld) to vr+vi*i.
*/
__host__ PetscErrorCode set_Cdiagonal(PetscInt n,PetscComplex *d_pa,PetscInt ld,PetscReal vr,PetscReal vi)
{
  PetscInt chunk,nchunks;
  dim3     grid,block;

  PetscFunctionBegin;
  PetscCall(SlepcKernelSetGrid1D(n,&grid,&block,&nchunks));
  /* a very large n may require several consecutive kernel launches */
  for (chunk=0;chunk<nchunks;chunk++) {
    set_Cdiagonal_kernel<<<grid,block>>>(n,d_pa,ld,vr,vi,chunk);
    PetscCallCUDA(cudaGetLastError());
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
   Kernel: add the scalar v to every diagonal entry of the n x n device
   matrix d_pa (leading dimension ld).  xcount offsets the flat thread
   index across split launches.
*/
__global__ void shift_diagonal_kernel(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar v,PetscInt xcount)
{
  const PetscInt idx = xcount*gridDim.x*blockDim.x + blockIdx.x*blockDim.x + threadIdx.x;

  if (idx < n) d_pa[idx+idx*ld] += v;
}
/*
   Shift the n x n device matrix d_pa (leading dimension ld) by adding the
   scalar v to each diagonal entry, i.e. A <- A + v*I.
*/
__host__ PetscErrorCode shift_diagonal(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar v)
{
  PetscInt chunk,nchunks;
  dim3     grid,block;

  PetscFunctionBegin;
  PetscCall(SlepcKernelSetGrid1D(n,&grid,&block,&nchunks));
  /* a very large n may require several consecutive kernel launches */
  for (chunk=0;chunk<nchunks;chunk++) {
    shift_diagonal_kernel<<<grid,block>>>(n,d_pa,ld,v,chunk);
    PetscCallCUDA(cudaGetLastError());
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
   Kernel: add the complex value vr+vi*i to every diagonal entry of the
   n x n complex device matrix d_pa (leading dimension ld).  xcount
   offsets the flat thread index across split launches.
*/
__global__ void shift_Cdiagonal_kernel(PetscInt n,PetscComplex *d_pa,PetscInt ld,PetscReal vr,PetscReal vi,PetscInt xcount)
{
  const PetscInt idx = xcount*gridDim.x*blockDim.x + blockIdx.x*blockDim.x + threadIdx.x;

  if (idx < n) d_pa[idx+idx*ld] += thrust::complex<PetscReal>(vr, vi);
}
/*
   Shift the n x n complex device matrix d_pa (leading dimension ld) by
   adding vr+vi*i to each diagonal entry.
*/
__host__ PetscErrorCode shift_Cdiagonal(PetscInt n,PetscComplex *d_pa,PetscInt ld,PetscReal vr,PetscReal vi)
{
  PetscInt chunk,nchunks;
  dim3     grid,block;

  PetscFunctionBegin;
  PetscCall(SlepcKernelSetGrid1D(n,&grid,&block,&nchunks));
  /* a very large n may require several consecutive kernel launches */
  for (chunk=0;chunk<nchunks;chunk++) {
    shift_Cdiagonal_kernel<<<grid,block>>>(n,d_pa,ld,vr,vi,chunk);
    PetscCallCUDA(cudaGetLastError());
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
   Kernel: copy the m x n scalar array d_pb (leading dimension ldb) into
   the complex array d_pa (leading dimension lda), element by element.
   xcount/ycount offset the 2D thread index across split launches.

   NOTE(review): the tile loops stride by blockDim only (not
   gridDim*blockDim); this relies on the grid shape produced by
   SlepcKernelSetGrid2DTiles -- confirm against that helper before reuse.
*/
__global__ void copy_array2D_S2C_kernel(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda,PetscScalar *d_pb,PetscInt ldb,PetscInt xcount,PetscInt ycount)
{
  const PetscInt x0 = xcount*gridDim.x*blockDim.x + blockIdx.x*blockDim.x + threadIdx.x;
  const PetscInt y0 = ycount*gridDim.y*blockDim.y + blockIdx.y*blockDim.y + threadIdx.y;
  PetscInt       row,col;

  for (col=y0;col<n;col+=blockDim.y) {
    for (row=x0;row<m;row+=blockDim.x) d_pa[row+col*lda] = d_pb[row+col*ldb];
  }
}
/*
   Copy the m x n scalar device array d_pb (leading dimension ldb) into
   the complex device array d_pa (leading dimension lda).
*/
__host__ PetscErrorCode copy_array2D_S2C(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda,PetscScalar *d_pb,PetscInt ldb)
{
  PetscInt tx,ty,ntx,nty;
  dim3     grid,block;

  PetscFunctionBegin;
  PetscCall(SlepcKernelSetGrid2DTiles(m,n,&grid,&block,&ntx,&nty));
  /* large arrays are processed as a 2D grid of tiled launches */
  for (tx=0;tx<ntx;tx++) {
    for (ty=0;ty<nty;ty++) {
      copy_array2D_S2C_kernel<<<grid,block>>>(m,n,d_pa,lda,d_pb,ldb,tx,ty);
      PetscCallCUDA(cudaGetLastError());
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
   Kernel: copy the real parts of the m x n complex array d_pb (leading
   dimension ldb) into the scalar array d_pa (leading dimension lda);
   imaginary parts are discarded.  xcount/ycount offset the 2D thread
   index across split launches.

   NOTE(review): the tile loops stride by blockDim only (not
   gridDim*blockDim); this relies on the grid shape produced by
   SlepcKernelSetGrid2DTiles -- confirm against that helper before reuse.
*/
__global__ void copy_array2D_C2S_kernel(PetscInt m,PetscInt n,PetscScalar *d_pa,PetscInt lda,PetscComplex *d_pb,PetscInt ldb,PetscInt xcount,PetscInt ycount)
{
  const PetscInt x0 = xcount*gridDim.x*blockDim.x + blockIdx.x*blockDim.x + threadIdx.x;
  const PetscInt y0 = ycount*gridDim.y*blockDim.y + blockIdx.y*blockDim.y + threadIdx.y;
  PetscInt       row,col;

  for (col=y0;col<n;col+=blockDim.y) {
    for (row=x0;row<m;row+=blockDim.x) d_pa[row+col*lda] = PetscRealPartComplex(d_pb[row+col*ldb]);
  }
}
/*
   Copy the real parts of the m x n complex device array d_pb (leading
   dimension ldb) into the scalar device array d_pa (leading dimension lda).
*/
__host__ PetscErrorCode copy_array2D_C2S(PetscInt m,PetscInt n,PetscScalar *d_pa,PetscInt lda,PetscComplex *d_pb,PetscInt ldb)
{
  PetscInt tx,ty,ntx,nty;
  dim3     grid,block;

  PetscFunctionBegin;
  PetscCall(SlepcKernelSetGrid2DTiles(m,n,&grid,&block,&ntx,&nty));
  /* large arrays are processed as a 2D grid of tiled launches */
  for (tx=0;tx<ntx;tx++) {
    for (ty=0;ty<nty;ty++) {
      copy_array2D_C2S_kernel<<<grid,block>>>(m,n,d_pa,lda,d_pb,ldb,tx,ty);
      PetscCallCUDA(cudaGetLastError());
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
   Kernel: update each entry of the m x n complex array d_pa (leading
   dimension lda) in place as a <- a + conj(a), i.e. twice its real part.
   xcount/ycount offset the 2D thread index across split launches.

   NOTE(review): the tile loops stride by blockDim only (not
   gridDim*blockDim); since this update is not idempotent, correctness
   depends on the grid shape produced by SlepcKernelSetGrid2DTiles --
   confirm against that helper before reuse.
*/
__global__ void add_array2D_Conj_kernel(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda,PetscInt xcount,PetscInt ycount)
{
  const PetscInt x0 = xcount*gridDim.x*blockDim.x + blockIdx.x*blockDim.x + threadIdx.x;
  const PetscInt y0 = ycount*gridDim.y*blockDim.y + blockIdx.y*blockDim.y + threadIdx.y;
  PetscInt       row,col;

  for (col=y0;col<n;col+=blockDim.y) {
    for (row=x0;row<m;row+=blockDim.x) d_pa[row+col*lda] += PetscConj(d_pa[row+col*lda]);
  }
}
/*
   Update the m x n complex device array d_pa (leading dimension lda) in
   place as A <- A + conj(A).
*/
__host__ PetscErrorCode add_array2D_Conj(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda)
{
  PetscInt tx,ty,ntx,nty;
  dim3     grid,block;

  PetscFunctionBegin;
  PetscCall(SlepcKernelSetGrid2DTiles(m,n,&grid,&block,&ntx,&nty));
  /* large arrays are processed as a 2D grid of tiled launches */
  for (tx=0;tx<ntx;tx++) {
    for (ty=0;ty<nty;ty++) {
      add_array2D_Conj_kernel<<<grid,block>>>(m,n,d_pa,lda,tx,ty);
      PetscCallCUDA(cudaGetLastError());
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
   Kernel: scan the m x n complex array d_pa (leading dimension lda) and
   set *d_result to PETSC_FALSE if any entry has a nonzero imaginary part.
   d_result must be preset to PETSC_TRUE by the caller; threads only ever
   write PETSC_FALSE, so the unsynchronized flag write is benign.
   xcount/ycount offset the 2D thread index across split launches.
*/
__global__ void getisreal_array2D_kernel(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda,PetscBool *d_result,PetscInt xcount,PetscInt ycount)
{
  const PetscInt x0 = xcount*gridDim.x*blockDim.x + blockIdx.x*blockDim.x + threadIdx.x;
  const PetscInt y0 = ycount*gridDim.y*blockDim.y + blockIdx.y*blockDim.y + threadIdx.y;
  PetscInt       row,col;

  if (*d_result) {   /* skip the scan if another thread already found a nonreal entry */
    for (col=y0;col<n;col+=blockDim.y) {
      for (row=x0;row<m;row+=blockDim.x) {
        if (PetscImaginaryPartComplex(d_pa[row+col*lda])) *d_result = PETSC_FALSE;
      }
    }
  }
}
/*
   Determine on the GPU whether the m x n complex device array d_pa
   (leading dimension lda) is real, i.e. all imaginary parts are zero.
   The PetscBool answer is left in device memory at d_result: it is
   initialized here to PETSC_TRUE and downgraded by the kernel.
*/
__host__ PetscErrorCode getisreal_array2D(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda,PetscBool *d_result)
{
  PetscInt  tx,ty,ntx,nty;
  PetscBool init=PETSC_TRUE;
  dim3      grid,block;

  PetscFunctionBegin;
  PetscCallCUDA(cudaMemcpy(d_result,&init,sizeof(PetscBool),cudaMemcpyHostToDevice));
  PetscCall(SlepcKernelSetGrid2DTiles(m,n,&grid,&block,&ntx,&nty));
  /* large arrays are processed as a 2D grid of tiled launches */
  for (tx=0;tx<ntx;tx++) {
    for (ty=0;ty<nty;ty++) {
      getisreal_array2D_kernel<<<grid,block>>>(m,n,d_pa,lda,d_result,tx,ty);
      PetscCallCUDA(cudaGetLastError());
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
   Kernel: per-block product of diagonal entries of the n x n matrix d_pa
   (leading dimension ld).  Each thread loads one diagonal entry (threads
   past the end load the multiplicative identity 1.0), then a shared-memory
   tree reduction forms the block's partial product, stored in
   d_v[blockIdx.x].  xcount offsets the flat thread index across split
   launches.

   Assumes blockDim.x <= SLEPC_BLOCK_SIZE_X; the halving stages also assume
   blockDim.x is a power of two -- presumably guaranteed by
   SlepcKernelSetGrid1D (TODO confirm).
*/
__global__ void mult_diagonal_kernel(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar *d_v,PetscInt xcount)
{
  const unsigned int tid = threadIdx.x;
  const unsigned int bs  = blockDim.x;
  const PetscInt     idx = xcount*gridDim.x*blockDim.x + blockIdx.x*blockDim.x + tid;
  __shared__ PetscScalar work[SLEPC_BLOCK_SIZE_X];

  /* one diagonal entry per thread; out-of-range threads contribute 1.0 */
  work[tid] = (idx<n)? d_pa[idx+ld*idx]: 1.0;
  __syncthreads();

  /* tree reduction by multiplication; every stage ends with a full block barrier */
  if ((bs >= 512) && (tid < 256)) { work[tid] *= work[tid + 256]; } __syncthreads();
  if ((bs >= 256) && (tid < 128)) { work[tid] *= work[tid + 128]; } __syncthreads();
  if ((bs >= 128) && (tid <  64)) { work[tid] *= work[tid +  64]; } __syncthreads();
  if ((bs >=  64) && (tid <  32)) { work[tid] *= work[tid +  32]; } __syncthreads();
  if ((bs >=  32) && (tid <  16)) { work[tid] *= work[tid +  16]; } __syncthreads();
  if ((bs >=  16) && (tid <   8)) { work[tid] *= work[tid +   8]; } __syncthreads();
  if ((bs >=   8) && (tid <   4)) { work[tid] *= work[tid +   4]; } __syncthreads();
  if ((bs >=   4) && (tid <   2)) { work[tid] *= work[tid +   2]; } __syncthreads();
  if ((bs >=   2) && (tid <   1)) { work[tid] *= work[tid +   1]; } __syncthreads();

  /* thread 0 publishes the block's partial product */
  if (tid == 0) d_v[blockIdx.x] = work[0];
}
/*
   Compute *v = product of the n diagonal entries of the device matrix
   d_pa (leading dimension ld).  Per-block partial products are formed on
   the GPU by mult_diagonal_kernel; the final reduction over blocks (and
   over split launches) is accumulated on the host.
*/
__host__ PetscErrorCode mult_diagonal(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar *v)
{
  PetscInt    chunk,k,nchunks;
  PetscScalar *h_part,*d_part;
  dim3        grid,block;

  PetscFunctionBegin;
  PetscCall(SlepcKernelSetGrid1D(n,&grid,&block,&nchunks));
  /* one partial product per block */
  PetscCallCUDA(cudaMalloc((void **)&d_part,sizeof(PetscScalar)*grid.x));
  PetscCall(PetscMalloc1(grid.x,&h_part));
  for (chunk=0;chunk<nchunks;chunk++) {
    mult_diagonal_kernel<<<grid,block>>>(n,d_pa,ld,d_part,chunk);
    PetscCallCUDA(cudaGetLastError());
    /* the blocking copy also synchronizes with the kernel above */
    PetscCallCUDA(cudaMemcpy(h_part,d_part,grid.x*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
    /* seed the accumulator from the very first partial, then fold in the rest */
    if (chunk == 0) {
      *v = h_part[0];
      k  = 1;
    } else k = 0;
    for (; k<(int)grid.x; k++) *v *= h_part[k];
  }
  PetscCallCUDA(cudaFree(d_part));
  PetscCall(PetscFree(h_part));
  PetscFunctionReturn(PETSC_SUCCESS);
}