GCC Code Coverage Report


Directory: ./
File: src/sys/classes/fn/impls/cuda/fnutilcuda.cu
Date: 2025-10-03 04:28:47
            Exec  Total  Coverage
Lines:       114    115     99.1%
Functions:     9     23     39.1%
Branches:     62    124     50.0%

Line Branch Exec Source
1 /*
2 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
3 SLEPc - Scalable Library for Eigenvalue Problem Computations
4 Copyright (c) 2002-, Universitat Politecnica de Valencia, Spain
5
6 This file is part of SLEPc.
7 SLEPc is distributed under a 2-clause BSD license (see LICENSE).
8 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
9 */
10 /*
11 Utility subroutines common to several impls
12 */
13
14 #include "fnutilcuda.h"
15
16 808 static __global__ void set_diagonal_kernel(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar v,PetscInt xcount)
17 808 {
18 PetscInt x;
19 x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;
20
21 if (x<n) {
22 d_pa[x+x*ld] = v;
23 }
24 404 }
25
26 808 __host__ PetscErrorCode set_diagonal(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar v)
27 {
28 808 PetscInt i,dimGrid_xcount;
29 808 dim3 blocks3d,threads3d;
30
31 808 PetscFunctionBegin;
32
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
808 PetscCall(SlepcKernelSetGrid1D(n,&blocks3d,&threads3d,&dimGrid_xcount));
33
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
1616 for (i=0;i<dimGrid_xcount;i++) {
34
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
1212 set_diagonal_kernel<<<blocks3d, threads3d>>>(n,d_pa,ld,v,i);
35
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
808 PetscCallCUDA(cudaGetLastError());
36 }
37 PetscFunctionReturn(PETSC_SUCCESS);
38 }
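
Note: the host wrapper above shows the 1D launch pattern used by every diagonal routine in this file: SlepcKernelSetGrid1D caps the grid at the hardware limit and returns in dimGrid_xcount how many successive launches are needed to cover all n indices, with xcount shifting the index window of each launch. A minimal caller sketch (the helper name is hypothetical and not part of this file; it assumes only fnutilcuda.h plus standard CUDA calls):

    /* Hypothetical caller: build an n x n identity matrix on the device
       with set_diagonal(). Assumes ld == n and a working CUDA context. */
    static PetscErrorCode example_device_identity(PetscInt n,PetscScalar **d_pa)
    {
      PetscFunctionBegin;
      PetscCallCUDA(cudaMalloc((void**)d_pa,n*n*sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemset(*d_pa,0,n*n*sizeof(PetscScalar)));  /* zero fill */
      PetscCall(set_diagonal(n,*d_pa,n,1.0));                      /* unit diagonal */
      PetscFunctionReturn(PETSC_SUCCESS);
    }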
39
40 64 __global__ void set_Cdiagonal_kernel(PetscInt n,PetscComplex *d_pa,PetscInt ld,PetscReal vr,PetscReal vi,PetscInt xcount)
41 64 {
42 PetscInt x;
43 x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;
44
45 if (x<n) {
46 d_pa[x+x*ld] = thrust::complex<PetscReal>(vr, vi);
47 }
48 64 }
49
50 64 __host__ PetscErrorCode set_Cdiagonal(PetscInt n,PetscComplex *d_pa,PetscInt ld,PetscReal vr,PetscReal vi)
51 {
52 64 PetscInt i,dimGrid_xcount;
53 64 dim3 blocks3d,threads3d;
54
55 64 PetscFunctionBegin;
56
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
64 PetscCall(SlepcKernelSetGrid1D(n,&blocks3d,&threads3d,&dimGrid_xcount));
57
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
128 for (i=0;i<dimGrid_xcount;i++) {
58
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
64 set_Cdiagonal_kernel<<<blocks3d, threads3d>>>(n,d_pa,ld,vr,vi,i);
59
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
64 PetscCallCUDA(cudaGetLastError());
60 }
61 PetscFunctionReturn(PETSC_SUCCESS);
62 }
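
Note: unlike set_diagonal, the complex variant receives the value as separate real and imaginary parts and reassembles it on the device as thrust::complex<PetscReal>, presumably to avoid passing a host complex type through the kernel launch. A minimal hypothetical call:

    /* Hypothetical call: every diagonal entry becomes 2+3i */
    PetscCall(set_Cdiagonal(n,d_pa,ld,2.0,3.0));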
63
64 1816 __global__ void shift_diagonal_kernel(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar v,PetscInt xcount)
65 1816 {
66 PetscInt x;
67 x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;
68
69 if (x<n) {
70 d_pa[x+x*ld] += v;
71 }
72 900 }
73
74 1816 __host__ PetscErrorCode shift_diagonal(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar v)
75 {
76 1816 PetscInt i,dimGrid_xcount;
77 1816 dim3 blocks3d,threads3d;
78
79 1816 PetscFunctionBegin;
80
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
1816 PetscCall(SlepcKernelSetGrid1D(n,&blocks3d,&threads3d,&dimGrid_xcount));
81
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
3632 for (i=0;i<dimGrid_xcount;i++) {
82
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2732 shift_diagonal_kernel<<<blocks3d, threads3d>>>(n,d_pa,ld,v,i);
83
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
1816 PetscCallCUDA(cudaGetLastError());
84 }
85 PetscFunctionReturn(PETSC_SUCCESS);
86 }
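
Note: shift_diagonal is the in-place matrix shift A <- A + v*I; only the n diagonal entries are touched. A hypothetical use, with sigma standing for some shift value not defined in this file:

    /* Hypothetical shift: A <- A - sigma*I in place on the device */
    PetscCall(shift_diagonal(n,d_pa,ld,-sigma));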
87
88 176 __global__ void shift_Cdiagonal_kernel(PetscInt n,PetscComplex *d_pa,PetscInt ld,PetscReal vr,PetscReal vi,PetscInt xcount)
89 176 {
90 PetscInt x;
91 x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;
92
93 if (x<n) {
94 d_pa[x+x*ld] += thrust::complex<PetscReal>(vr, vi);
95 }
96 176 }
97
98 176 __host__ PetscErrorCode shift_Cdiagonal(PetscInt n,PetscComplex *d_pa,PetscInt ld,PetscReal vr,PetscReal vi)
99 {
100 176 PetscInt i,dimGrid_xcount;
101 176 dim3 blocks3d,threads3d;
102
103 176 PetscFunctionBegin;
104
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
176 PetscCall(SlepcKernelSetGrid1D(n,&blocks3d,&threads3d,&dimGrid_xcount));
105
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
352 for (i=0;i<dimGrid_xcount;i++) {
106
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
176 shift_Cdiagonal_kernel<<<blocks3d, threads3d>>>(n,d_pa,ld,vr,vi,i);
107
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
176 PetscCallCUDA(cudaGetLastError());
108 }
109 PetscFunctionReturn(PETSC_SUCCESS);
110 }
111
112 16 __global__ void copy_array2D_S2C_kernel(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda,PetscScalar *d_pb,PetscInt ldb,PetscInt xcount,PetscInt ycount)
113 16 {
114 PetscInt x,y,i,j;
115
116 x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;
117 y = ycount*gridDim.y*blockDim.y+blockIdx.y*blockDim.y+threadIdx.y;
118 for (j=y;j<n;j+=blockDim.y) {
119 for (i=x;i<m;i+=blockDim.x) {
120 d_pa[i+j*lda] = d_pb[i+j*ldb];
121 }
122 }
123 16 }
124
125 16 __host__ PetscErrorCode copy_array2D_S2C(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda,PetscScalar *d_pb,PetscInt ldb)
126 {
127 16 PetscInt i,j,dimGrid_xcount,dimGrid_ycount;
128 16 dim3 blocks3d,threads3d;
129
130 16 PetscFunctionBegin;
131
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
16 PetscCall(SlepcKernelSetGrid2DTiles(m,n,&blocks3d,&threads3d,&dimGrid_xcount,&dimGrid_ycount));
132
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
32 for (i=0;i<dimGrid_xcount;i++) {
133
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
32 for (j=0;j<dimGrid_ycount;j++) {
134
1/2
✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
16 copy_array2D_S2C_kernel<<<blocks3d,threads3d>>>(m,n,d_pa,lda,d_pb,ldb,i,j);
135
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
16 PetscCallCUDA(cudaGetLastError());
136 }
137 }
138 PetscFunctionReturn(PETSC_SUCCESS);
139 }
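
Note: copy_array2D_S2C widens a real scalar array into a complex one entry by entry; copy_array2D_C2S below inverts it by keeping only real parts, which is lossless only when the imaginary parts vanish (the condition getisreal_array2D tests further down). A hypothetical round trip, with d_c/ldc and d_s/lds standing for device arrays and their leading dimensions:

    PetscCall(copy_array2D_S2C(m,n,d_c,ldc,d_s,lds));  /* d_c <- complex(d_s,0) */
    PetscCall(copy_array2D_C2S(m,n,d_s,lds,d_c,ldc));  /* d_s <- Re(d_c)        */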
140
141 16 __global__ void copy_array2D_C2S_kernel(PetscInt m,PetscInt n,PetscScalar *d_pa,PetscInt lda,PetscComplex *d_pb,PetscInt ldb,PetscInt xcount,PetscInt ycount)
142 16 {
143 PetscInt x,y,i,j;
144
145 x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;
146 y = ycount*gridDim.y*blockDim.y+blockIdx.y*blockDim.y+threadIdx.y;
147 for (j=y;j<n;j+=blockDim.y) {
148 for (i=x;i<m;i+=blockDim.x) {
149 d_pa[i+j*lda] = PetscRealPartComplex(d_pb[i+j*ldb]);
150 }
151 }
152 16 }
153
154 16 __host__ PetscErrorCode copy_array2D_C2S(PetscInt m,PetscInt n,PetscScalar *d_pa,PetscInt lda,PetscComplex *d_pb,PetscInt ldb)
155 {
156 16 PetscInt i,j,dimGrid_xcount,dimGrid_ycount;
157 16 dim3 blocks3d,threads3d;
158
159 16 PetscFunctionBegin;
160
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
16 PetscCall(SlepcKernelSetGrid2DTiles(m,n,&blocks3d,&threads3d,&dimGrid_xcount,&dimGrid_ycount));
161
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
32 for (i=0;i<dimGrid_xcount;i++) {
162
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
32 for (j=0;j<dimGrid_ycount;j++) {
163
1/2
✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
16 copy_array2D_C2S_kernel<<<blocks3d,threads3d>>>(m,n,d_pa,lda,d_pb,ldb,i,j);
164
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
16 PetscCallCUDA(cudaGetLastError());
165 }
166 }
167 PetscFunctionReturn(PETSC_SUCCESS);
168 }
169
170 32 __global__ void add_array2D_Conj_kernel(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda,PetscInt xcount,PetscInt ycount)
171 32 {
172 PetscInt x,y,i,j;
173
174 x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;
175 y = ycount*gridDim.y*blockDim.y+blockIdx.y*blockDim.y+threadIdx.y;
176 for (j=y;j<n;j+=blockDim.y) {
177 for (i=x;i<m;i+=blockDim.x) {
178 d_pa[i+j*lda] += PetscConj(d_pa[i+j*lda]);
179 }
180 }
181 32 }
182
183 32 __host__ PetscErrorCode add_array2D_Conj(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda)
184 {
185 32 PetscInt i,j,dimGrid_xcount,dimGrid_ycount;
186 32 dim3 blocks3d,threads3d;
187
188 32 PetscFunctionBegin;
189
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
32 PetscCall(SlepcKernelSetGrid2DTiles(m,n,&blocks3d,&threads3d,&dimGrid_xcount,&dimGrid_ycount));
190
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
64 for (i=0;i<dimGrid_xcount;i++) {
191
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
64 for (j=0;j<dimGrid_ycount;j++) {
192
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
32 add_array2D_Conj_kernel<<<blocks3d,threads3d>>>(m,n,d_pa,lda,i,j);
193
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
32 PetscCallCUDA(cudaGetLastError());
194 }
195 }
196 PetscFunctionReturn(PETSC_SUCCESS);
197 }
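
Note: add_array2D_Conj is elementwise, not a conjugate transpose: no indices are swapped, each entry simply gains its own conjugate, so per entry

    a_{ij} \leftarrow a_{ij} + \overline{a_{ij}} = 2\,\mathrm{Re}(a_{ij})

i.e. the imaginary parts cancel and the real parts double.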
198
199 8 __global__ void getisreal_array2D_kernel(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda,PetscBool *d_result,PetscInt xcount,PetscInt ycount)
200 8 {
201 PetscInt x,y,i,j;
202
203 x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;
204 y = ycount*gridDim.y*blockDim.y+blockIdx.y*blockDim.y+threadIdx.y;
205 if (*d_result) {
206 for (j=y;j<n;j+=blockDim.y) {
207 for (i=x;i<m;i+=blockDim.x) {
208 if (PetscImaginaryPartComplex(d_pa[i+j*lda])) *d_result=PETSC_FALSE;
209 }
210 }
211 }
212 8 }
213
214 8 __host__ PetscErrorCode getisreal_array2D(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda,PetscBool *d_result)
215 {
216 8 PetscInt i,j,dimGrid_xcount,dimGrid_ycount;
217 8 PetscBool result=PETSC_TRUE;
218 8 dim3 blocks3d,threads3d;
219
220 8 PetscFunctionBegin;
221
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
8 PetscCallCUDA(cudaMemcpy(d_result,&result,sizeof(PetscBool),cudaMemcpyHostToDevice));
222
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
8 PetscCall(SlepcKernelSetGrid2DTiles(m,n,&blocks3d,&threads3d,&dimGrid_xcount,&dimGrid_ycount));
223
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
16 for (i=0;i<dimGrid_xcount;i++) {
224
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
16 for (j=0;j<dimGrid_ycount;j++) {
225
1/2
✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
8 getisreal_array2D_kernel<<<blocks3d,threads3d>>>(m,n,d_pa,lda,d_result,i,j);
226
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
8 PetscCallCUDA(cudaGetLastError());
227 }
228 }
229 PetscFunctionReturn(PETSC_SUCCESS);
230 }
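
Note: getisreal_array2D leaves its answer in device memory; the unsynchronized stores in the kernel are benign in practice, since every writer stores the same PETSC_FALSE. A hypothetical caller fetching the flag back to the host (d_flag/isreal are illustrative names):

    PetscBool *d_flag,isreal;
    PetscCallCUDA(cudaMalloc((void**)&d_flag,sizeof(PetscBool)));
    PetscCall(getisreal_array2D(m,n,d_pa,lda,d_flag));
    PetscCallCUDA(cudaMemcpy(&isreal,d_flag,sizeof(PetscBool),cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaFree(d_flag));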
231
232 160 __global__ void mult_diagonal_kernel(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar *d_v,PetscInt xcount)
233 {
234 PetscInt x;
235 unsigned int bs=blockDim.x;
236 __shared__ PetscScalar shrdres[SLEPC_BLOCK_SIZE_X];
237
238 x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;
239 shrdres[threadIdx.x] = (x<n)? d_pa[x+ld*x]: 1.0;
240 __syncthreads();
241
242 /* reduction */
243 if ((bs >= 512) && (threadIdx.x < 256)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x + 256]; } __syncthreads();
244 if ((bs >= 256) && (threadIdx.x < 128)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x + 128]; } __syncthreads();
245 if ((bs >= 128) && (threadIdx.x < 64)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x + 64]; } __syncthreads();
246 if ((bs >= 64) && (threadIdx.x < 32)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x + 32]; } __syncthreads();
247 if ((bs >= 32) && (threadIdx.x < 16)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x + 16]; } __syncthreads();
248 if ((bs >= 16) && (threadIdx.x < 8)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x + 8]; } __syncthreads();
249 if ((bs >= 8) && (threadIdx.x < 4)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x + 4]; } __syncthreads();
250 if ((bs >= 4) && (threadIdx.x < 2)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x + 2]; } __syncthreads();
251 if ((bs >= 2) && (threadIdx.x < 1)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x + 1]; } __syncthreads();
252 if (threadIdx.x == 0) d_v[blockIdx.x] = shrdres[threadIdx.x];
253 160 }
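
Note: per block, the kernel loads one diagonal entry per thread (padding with 1.0 beyond n) and folds the block's values into a product by a shared-memory binary tree: each step halves the active threads and multiplies entries bs/2, bs/4, ... apart, so after log2(bs) steps thread 0 holds the block product and writes it to d_v[blockIdx.x]. A host-side sketch of the same reduction (illustration only; a[] holds the bs loaded values, bs a power of two):

    /* Each pass multiplies pairs stride apart; a[0] ends up as the
       product of all bs entries. */
    for (unsigned int stride = bs/2; stride >= 1; stride /= 2)
      for (unsigned int t = 0; t < stride; t++)
        a[t] *= a[t+stride];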
254
255 160 __host__ PetscErrorCode mult_diagonal(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar *v)
256 {
257 160 PetscInt i,j,dimGrid_xcount;
258 160 PetscScalar *part,*d_part;
259 160 dim3 blocks3d,threads3d;
260
261 160 PetscFunctionBegin;
262
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
160 PetscCall(SlepcKernelSetGrid1D(n,&blocks3d,&threads3d,&dimGrid_xcount));
263
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
160 PetscCallCUDA(cudaMalloc((void **)&d_part,sizeof(PetscScalar)*blocks3d.x));
264
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
160 PetscCall(PetscMalloc1(blocks3d.x,&part));
265
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
320 for (i=0;i<dimGrid_xcount;i++) {
266
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
160 mult_diagonal_kernel<<<blocks3d,threads3d>>>(n,d_pa,ld,d_part,i);
267
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
160 PetscCallCUDA(cudaGetLastError());
268
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
160 PetscCallCUDA(cudaMemcpy(part,d_part,blocks3d.x*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
269
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
160 if (i == 0) {
270 160 *v = part[0];
271 160 j=1;
272 } else j=0;
273
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
160 for (; j<(int)blocks3d.x; j++) *v *= part[j];
274 }
275
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
160 PetscCallCUDA(cudaFree(d_part));
276
2/4
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
160 PetscCall(PetscFree(part));
277 PetscFunctionReturn(PETSC_SUCCESS);
278 }
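
Note: across launches, the host folds each launch's per-block partials into *v, seeding it from part[0] of the first launch; the net effect is simply the product of every partial, equivalent to this sketch:

    /* Equivalent combine (illustration only): *v accumulates the product
       of all per-block partials over all dimGrid_xcount launches. */
    *v = 1.0;
    for (i=0;i<dimGrid_xcount;i++) {
      /* ...kernel launch i and copy-back of part[0..blocks3d.x-1]... */
      for (j=0;j<(int)blocks3d.x;j++) *v *= part[j];
    }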
279