Line | Branch | Exec | Source |
---|---|---|---|
1 | /* | ||
2 | - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | ||
3 | SLEPc - Scalable Library for Eigenvalue Problem Computations | ||
4 | Copyright (c) 2002-, Universitat Politecnica de Valencia, Spain | ||
5 | |||
6 | This file is part of SLEPc. | ||
7 | SLEPc is distributed under a 2-clause BSD license (see LICENSE). | ||
8 | - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | ||
9 | */ | ||
10 | /* | ||
11 | Utility subroutines common to several impls | ||
12 | */ | ||
13 | |||
14 | #include "fnutilcuda.h" | ||
15 | |||
/*
   Device kernel: writes v into the diagonal entry d_pa[x+x*ld] owned by this
   thread. xcount is the grid-pass index used when one launch cannot cover
   all n diagonal entries.
*/
static __global__ void set_diagonal_kernel(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar v,PetscInt xcount)
{
  /* global diagonal index for this thread in pass xcount */
  const PetscInt idx = xcount*gridDim.x*blockDim.x + blockIdx.x*blockDim.x + threadIdx.x;

  if (idx >= n) return;
  d_pa[idx*(ld+1)] = v;
}
25 | |||
/*
   Host wrapper: sets the first n diagonal entries of the device array d_pa
   (leading dimension ld) to the scalar v, launching as many grid passes as
   SlepcKernelSetGrid1D reports are needed.
*/
__host__ PetscErrorCode set_diagonal(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar v)
{
  PetscInt pass,npasses;
  dim3     blocks3d,threads3d;

  PetscFunctionBegin;
  PetscCall(SlepcKernelSetGrid1D(n,&blocks3d,&threads3d,&npasses));
  for (pass=0;pass<npasses;pass++) {
    set_diagonal_kernel<<<blocks3d,threads3d>>>(n,d_pa,ld,v,pass);
    PetscCallCUDA(cudaGetLastError());  /* catch launch-configuration errors */
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
39 | |||
/*
   Device kernel: writes the complex value vr+vi*i into the diagonal entry of
   d_pa owned by this thread. xcount is the grid-pass index.
*/
__global__ void set_Cdiagonal_kernel(PetscInt n,PetscComplex *d_pa,PetscInt ld,PetscReal vr,PetscReal vi,PetscInt xcount)
{
  /* global diagonal index for this thread in pass xcount */
  const PetscInt idx = xcount*gridDim.x*blockDim.x + blockIdx.x*blockDim.x + threadIdx.x;

  if (idx >= n) return;
  d_pa[idx*(ld+1)] = thrust::complex<PetscReal>(vr, vi);
}
49 | |||
/*
   Host wrapper: sets the first n diagonal entries of the complex device
   array d_pa (leading dimension ld) to vr+vi*i.
*/
__host__ PetscErrorCode set_Cdiagonal(PetscInt n,PetscComplex *d_pa,PetscInt ld,PetscReal vr,PetscReal vi)
{
  PetscInt pass,npasses;
  dim3     blocks3d,threads3d;

  PetscFunctionBegin;
  PetscCall(SlepcKernelSetGrid1D(n,&blocks3d,&threads3d,&npasses));
  for (pass=0;pass<npasses;pass++) {
    set_Cdiagonal_kernel<<<blocks3d,threads3d>>>(n,d_pa,ld,vr,vi,pass);
    PetscCallCUDA(cudaGetLastError());  /* catch launch-configuration errors */
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
63 | |||
/*
   Device kernel: adds v to the diagonal entry of d_pa owned by this thread.
   xcount is the grid-pass index.
*/
__global__ void shift_diagonal_kernel(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar v,PetscInt xcount)
{
  /* global diagonal index for this thread in pass xcount */
  const PetscInt idx = xcount*gridDim.x*blockDim.x + blockIdx.x*blockDim.x + threadIdx.x;

  if (idx >= n) return;
  d_pa[idx*(ld+1)] += v;
}
73 | |||
/*
   Host wrapper: adds the scalar v to the first n diagonal entries of the
   device array d_pa (leading dimension ld), i.e. A <- A + v*I on the
   leading n x n part.
*/
__host__ PetscErrorCode shift_diagonal(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar v)
{
  PetscInt pass,npasses;
  dim3     blocks3d,threads3d;

  PetscFunctionBegin;
  PetscCall(SlepcKernelSetGrid1D(n,&blocks3d,&threads3d,&npasses));
  for (pass=0;pass<npasses;pass++) {
    shift_diagonal_kernel<<<blocks3d,threads3d>>>(n,d_pa,ld,v,pass);
    PetscCallCUDA(cudaGetLastError());  /* catch launch-configuration errors */
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
87 | |||
/*
   Device kernel: adds the complex value vr+vi*i to the diagonal entry of
   d_pa owned by this thread. xcount is the grid-pass index.
*/
__global__ void shift_Cdiagonal_kernel(PetscInt n,PetscComplex *d_pa,PetscInt ld,PetscReal vr,PetscReal vi,PetscInt xcount)
{
  /* global diagonal index for this thread in pass xcount */
  const PetscInt idx = xcount*gridDim.x*blockDim.x + blockIdx.x*blockDim.x + threadIdx.x;

  if (idx >= n) return;
  d_pa[idx*(ld+1)] += thrust::complex<PetscReal>(vr, vi);
}
97 | |||
/*
   Host wrapper: adds vr+vi*i to the first n diagonal entries of the complex
   device array d_pa (leading dimension ld).
*/
__host__ PetscErrorCode shift_Cdiagonal(PetscInt n,PetscComplex *d_pa,PetscInt ld,PetscReal vr,PetscReal vi)
{
  PetscInt pass,npasses;
  dim3     blocks3d,threads3d;

  PetscFunctionBegin;
  PetscCall(SlepcKernelSetGrid1D(n,&blocks3d,&threads3d,&npasses));
  for (pass=0;pass<npasses;pass++) {
    shift_Cdiagonal_kernel<<<blocks3d,threads3d>>>(n,d_pa,ld,vr,vi,pass);
    PetscCallCUDA(cudaGetLastError());  /* catch launch-configuration errors */
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
111 | |||
/*
   Device kernel: copies the m x n real array d_pb (leading dimension ldb)
   into the complex array d_pa (leading dimension lda), promoting each entry
   to complex. xcount/ycount are the grid-pass indices along x/y used when a
   single launch cannot cover the full index space.

   NOTE(review): the inner loops stride by blockDim.x/blockDim.y rather than
   gridDim*blockDim, so with more than one block per grid dimension different
   blocks revisit the same (i,j) entries. For this plain copy the repeated
   writes are idempotent and the result is still correct, but confirm this
   matches the grid produced by SlepcKernelSetGrid2DTiles.
*/
__global__ void copy_array2D_S2C_kernel(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda,PetscScalar *d_pb,PetscInt ldb,PetscInt xcount,PetscInt ycount)
{
  PetscInt x,y,i,j;

  /* starting (column-fastest) coordinates for this thread in this pass */
  x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;
  y = ycount*gridDim.y*blockDim.y+blockIdx.y*blockDim.y+threadIdx.y;
  for (j=y;j<n;j+=blockDim.y) {
    for (i=x;i<m;i+=blockDim.x) {
      d_pa[i+j*lda] = d_pb[i+j*ldb];
    }
  }
}
124 | |||
/*
   Host wrapper: copies the m x n real device array d_pb (leading dimension
   ldb) into the complex device array d_pa (leading dimension lda), one
   kernel launch per 2D grid tile pass.
*/
__host__ PetscErrorCode copy_array2D_S2C(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda,PetscScalar *d_pb,PetscInt ldb)
{
  PetscInt xp,yp,nxp,nyp;
  dim3     blocks3d,threads3d;

  PetscFunctionBegin;
  PetscCall(SlepcKernelSetGrid2DTiles(m,n,&blocks3d,&threads3d,&nxp,&nyp));
  for (xp=0;xp<nxp;xp++) {
    for (yp=0;yp<nyp;yp++) {
      copy_array2D_S2C_kernel<<<blocks3d,threads3d>>>(m,n,d_pa,lda,d_pb,ldb,xp,yp);
      PetscCallCUDA(cudaGetLastError());  /* catch launch-configuration errors */
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
140 | |||
/*
   Device kernel: copies the real parts of the m x n complex array d_pb
   (leading dimension ldb) into the real array d_pa (leading dimension lda);
   imaginary parts are discarded. xcount/ycount are the grid-pass indices.

   NOTE(review): as in copy_array2D_S2C_kernel, the inner loops stride by
   blockDim.x/blockDim.y, so multiple blocks per grid dimension would revisit
   the same entries; harmless here (idempotent writes), but confirm against
   the grid chosen by SlepcKernelSetGrid2DTiles.
*/
__global__ void copy_array2D_C2S_kernel(PetscInt m,PetscInt n,PetscScalar *d_pa,PetscInt lda,PetscComplex *d_pb,PetscInt ldb,PetscInt xcount,PetscInt ycount)
{
  PetscInt x,y,i,j;

  /* starting coordinates for this thread in this pass */
  x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;
  y = ycount*gridDim.y*blockDim.y+blockIdx.y*blockDim.y+threadIdx.y;
  for (j=y;j<n;j+=blockDim.y) {
    for (i=x;i<m;i+=blockDim.x) {
      d_pa[i+j*lda] = PetscRealPartComplex(d_pb[i+j*ldb]);
    }
  }
}
153 | |||
/*
   Host wrapper: extracts the real parts of the m x n complex device array
   d_pb (leading dimension ldb) into the real device array d_pa (leading
   dimension lda), one kernel launch per 2D grid tile pass.
*/
__host__ PetscErrorCode copy_array2D_C2S(PetscInt m,PetscInt n,PetscScalar *d_pa,PetscInt lda,PetscComplex *d_pb,PetscInt ldb)
{
  PetscInt xp,yp,nxp,nyp;
  dim3     blocks3d,threads3d;

  PetscFunctionBegin;
  PetscCall(SlepcKernelSetGrid2DTiles(m,n,&blocks3d,&threads3d,&nxp,&nyp));
  for (xp=0;xp<nxp;xp++) {
    for (yp=0;yp<nyp;yp++) {
      copy_array2D_C2S_kernel<<<blocks3d,threads3d>>>(m,n,d_pa,lda,d_pb,ldb,xp,yp);
      PetscCallCUDA(cudaGetLastError());  /* catch launch-configuration errors */
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
169 | |||
/*
   Device kernel: updates each entry of the m x n complex array d_pa
   (leading dimension lda) as A(i,j) += conj(A(i,j)), i.e. A(i,j) becomes
   2*Re(A(i,j)) with zero imaginary part. xcount/ycount are the grid-pass
   indices.

   NOTE(review): unlike the copy kernels in this file, this update is NOT
   idempotent. The inner loops stride by blockDim.x/blockDim.y (not
   gridDim*blockDim), so if SlepcKernelSetGrid2DTiles ever produces more
   than one block per grid dimension, overlapping blocks would apply the
   += to the same entry more than once (and race on it). Confirm the grid
   helper guarantees non-overlapping coverage here.
*/
__global__ void add_array2D_Conj_kernel(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda,PetscInt xcount,PetscInt ycount)
{
  PetscInt x,y,i,j;

  /* starting coordinates for this thread in this pass */
  x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;
  y = ycount*gridDim.y*blockDim.y+blockIdx.y*blockDim.y+threadIdx.y;
  for (j=y;j<n;j+=blockDim.y) {
    for (i=x;i<m;i+=blockDim.x) {
      d_pa[i+j*lda] += PetscConj(d_pa[i+j*lda]);
    }
  }
}
182 | |||
/*
   Host wrapper: replaces every entry of the m x n complex device array d_pa
   (leading dimension lda) with A(i,j) + conj(A(i,j)) = 2*Re(A(i,j)), one
   kernel launch per 2D grid tile pass.
*/
__host__ PetscErrorCode add_array2D_Conj(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda)
{
  PetscInt xp,yp,nxp,nyp;
  dim3     blocks3d,threads3d;

  PetscFunctionBegin;
  PetscCall(SlepcKernelSetGrid2DTiles(m,n,&blocks3d,&threads3d,&nxp,&nyp));
  for (xp=0;xp<nxp;xp++) {
    for (yp=0;yp<nyp;yp++) {
      add_array2D_Conj_kernel<<<blocks3d,threads3d>>>(m,n,d_pa,lda,xp,yp);
      PetscCallCUDA(cudaGetLastError());  /* catch launch-configuration errors */
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
198 | |||
/*
   Device kernel: scans the m x n complex array d_pa (leading dimension lda)
   and sets *d_result to PETSC_FALSE if any entry has a nonzero imaginary
   part. *d_result must have been initialized to PETSC_TRUE by the caller
   (the host wrapper below does this).

   Concurrency note: the unguarded writes to *d_result race across threads,
   but every writer stores the same value (PETSC_FALSE), so the final result
   is well defined. The initial `if (*d_result)` read is an early-exit
   optimization: once some thread/pass has flipped the flag, later work can
   be skipped — it is not required for correctness.
*/
__global__ void getisreal_array2D_kernel(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda,PetscBool *d_result,PetscInt xcount,PetscInt ycount)
{
  PetscInt x,y,i,j;

  /* starting coordinates for this thread in this pass */
  x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;
  y = ycount*gridDim.y*blockDim.y+blockIdx.y*blockDim.y+threadIdx.y;
  if (*d_result) {
    for (j=y;j<n;j+=blockDim.y) {
      for (i=x;i<m;i+=blockDim.x) {
        if (PetscImaginaryPartComplex(d_pa[i+j*lda])) *d_result=PETSC_FALSE;
      }
    }
  }
}
213 | |||
/*
   Host wrapper: determines on the device whether the m x n complex array
   d_pa (leading dimension lda) is entirely real. The PetscBool result is
   left in device memory at d_result (PETSC_TRUE if all imaginary parts are
   zero); the caller is responsible for copying it back if needed.
*/
__host__ PetscErrorCode getisreal_array2D(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda,PetscBool *d_result)
{
  PetscInt  xp,yp,nxp,nyp;
  PetscBool init=PETSC_TRUE;
  dim3      blocks3d,threads3d;

  PetscFunctionBegin;
  /* start from "all real"; the kernel may flip the device flag to FALSE */
  PetscCallCUDA(cudaMemcpy(d_result,&init,sizeof(PetscBool),cudaMemcpyHostToDevice));
  PetscCall(SlepcKernelSetGrid2DTiles(m,n,&blocks3d,&threads3d,&nxp,&nyp));
  for (xp=0;xp<nxp;xp++) {
    for (yp=0;yp<nyp;yp++) {
      getisreal_array2D_kernel<<<blocks3d,threads3d>>>(m,n,d_pa,lda,d_result,xp,yp);
      PetscCallCUDA(cudaGetLastError());  /* catch launch-configuration errors */
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
231 | |||
/*
   Device kernel: computes one partial product of the n diagonal entries of
   d_pa (leading dimension ld) per thread block, storing it in
   d_v[blockIdx.x]. Threads whose global index falls outside [0,n) load the
   multiplicative identity 1.0 so they do not affect the product. xcount is
   the grid-pass index.

   Preconditions (presumably guaranteed by SlepcKernelSetGrid1D — confirm):
   blockDim.x is a power of two and at most SLEPC_BLOCK_SIZE_X, so the tree
   reduction below covers the whole shared array.

   Note on the reduction: each `if (...) {...}` halves the active range and
   the following __syncthreads() sits OUTSIDE the conditional, so all
   threads of the block reach every barrier (no divergent-barrier hazard),
   and each shared-memory read is ordered after the preceding writes.
*/
__global__ void mult_diagonal_kernel(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar *d_v,PetscInt xcount)
{
  PetscInt x;
  unsigned int bs=blockDim.x;
  __shared__ PetscScalar shrdres[SLEPC_BLOCK_SIZE_X];

  /* load this thread's diagonal entry (or the identity) into shared memory */
  x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;
  shrdres[threadIdx.x] = (x<n)? d_pa[x+ld*x]: 1.0;
  __syncthreads();

  /* reduction */
  if ((bs >= 512) && (threadIdx.x < 256)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x + 256]; } __syncthreads();
  if ((bs >= 256) && (threadIdx.x < 128)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x + 128]; } __syncthreads();
  if ((bs >= 128) && (threadIdx.x <  64)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x +  64]; } __syncthreads();
  if ((bs >=  64) && (threadIdx.x <  32)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x +  32]; } __syncthreads();
  if ((bs >=  32) && (threadIdx.x <  16)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x +  16]; } __syncthreads();
  if ((bs >=  16) && (threadIdx.x <   8)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x +   8]; } __syncthreads();
  if ((bs >=   8) && (threadIdx.x <   4)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x +   4]; } __syncthreads();
  if ((bs >=   4) && (threadIdx.x <   2)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x +   2]; } __syncthreads();
  if ((bs >=   2) && (threadIdx.x <   1)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x +   1]; } __syncthreads();
  /* thread 0 publishes the block's partial product */
  if (threadIdx.x == 0) d_v[blockIdx.x] = shrdres[threadIdx.x];
}
254 | |||
/*
   Host wrapper: computes the product of the first n diagonal entries of the
   device array d_pa (leading dimension ld) and returns it in *v (host
   memory). Each kernel launch produces one partial product per block; the
   partials are copied back after every pass (cudaMemcpy synchronizes with
   the launch) and folded into the running product on the host.
*/
__host__ PetscErrorCode mult_diagonal(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar *v)
{
  PetscInt    pass,b,npasses;
  PetscScalar *h_partial,*d_partial;
  dim3        blocks3d,threads3d;

  PetscFunctionBegin;
  PetscCall(SlepcKernelSetGrid1D(n,&blocks3d,&threads3d,&npasses));
  /* one partial product per block, mirrored on host and device */
  PetscCallCUDA(cudaMalloc((void **)&d_partial,sizeof(PetscScalar)*blocks3d.x));
  PetscCall(PetscMalloc1(blocks3d.x,&h_partial));
  for (pass=0;pass<npasses;pass++) {
    mult_diagonal_kernel<<<blocks3d,threads3d>>>(n,d_pa,ld,d_partial,pass);
    PetscCallCUDA(cudaGetLastError());  /* catch launch-configuration errors */
    PetscCallCUDA(cudaMemcpy(h_partial,d_partial,blocks3d.x*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
    /* seed *v from the very first partial, then fold in the remaining ones */
    b = 0;
    if (pass == 0) {
      *v = h_partial[0];
      b = 1;
    }
    for (; b<(int)blocks3d.x; b++) *v *= h_partial[b];
  }
  PetscCallCUDA(cudaFree(d_partial));
  PetscCall(PetscFree(h_partial));
  PetscFunctionReturn(PETSC_SUCCESS);
}
279 |