GCC Code Coverage Report


Directory: ./
File: src/sys/classes/bv/impls/cuda/bvcuda.cu
Date: 2025-10-03 04:28:47
Exec Total Coverage
Lines: 315 345 91.3%
Functions: 16 20 80.0%
Branches: 228 668 34.1%

Line Branch Exec Source
1 /*
2 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
3 SLEPc - Scalable Library for Eigenvalue Problem Computations
4 Copyright (c) 2002-, Universitat Politecnica de Valencia, Spain
5
6 This file is part of SLEPc.
7 SLEPc is distributed under a 2-clause BSD license (see LICENSE).
8 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
9 */
10 /*
11 CUDA-related code common to several BV impls
12 */
13
14 #include <slepc/private/bvimpl.h>
15 #include <slepccupmblas.h>
16
17 #define BLOCKSIZE 64
18
19 /*
20 C := alpha*A*B + beta*C
21 */
22 444 PetscErrorCode BVMult_BLAS_CUDA(BV,PetscInt m_,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *d_A,PetscInt lda_,const PetscScalar *d_B,PetscInt ldb_,PetscScalar beta,PetscScalar *d_C,PetscInt ldc_)
23 {
24 444 PetscCuBLASInt m=0,n=0,k=0,lda=0,ldb=0,ldc=0;
25 444 cublasHandle_t cublasv2handle;
26
27 444 PetscFunctionBegin;
28
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
444 PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
29
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
444 PetscCall(PetscCuBLASIntCast(m_,&m));
30
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
444 PetscCall(PetscCuBLASIntCast(n_,&n));
31
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
444 PetscCall(PetscCuBLASIntCast(k_,&k));
32
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
444 PetscCall(PetscCuBLASIntCast(lda_,&lda));
33
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
444 PetscCall(PetscCuBLASIntCast(ldb_,&ldb));
34
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
444 PetscCall(PetscCuBLASIntCast(ldc_,&ldc));
35
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
444 PetscCall(PetscLogGpuTimeBegin());
36
1/10
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
444 PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,m,n,k,&alpha,d_A,lda,d_B,ldb,&beta,d_C,ldc));
37
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
444 PetscCall(PetscLogGpuTimeEnd());
38 444 PetscCall(PetscLogGpuFlops(2.0*m*n*k));
39 444 PetscFunctionReturn(PETSC_SUCCESS);
40 }
41
42 /*
43 y := alpha*A*x + beta*y
44 */
45 66638 PetscErrorCode BVMultVec_BLAS_CUDA(BV,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *d_A,PetscInt lda_,const PetscScalar *d_x,PetscScalar beta,PetscScalar *d_y)
46 {
47 66638 PetscCuBLASInt n=0,k=0,lda=0,one=1;
48 66638 cublasHandle_t cublasv2handle;
49
50 66638 PetscFunctionBegin;
51
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
66638 PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
52
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
66638 PetscCall(PetscCuBLASIntCast(n_,&n));
53
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
66638 PetscCall(PetscCuBLASIntCast(k_,&k));
54
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
66638 PetscCall(PetscCuBLASIntCast(lda_,&lda));
55
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
66638 PetscCall(PetscLogGpuTimeBegin());
56
1/10
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
66638 PetscCallCUBLAS(cublasXgemv(cublasv2handle,CUBLAS_OP_N,n,k,&alpha,d_A,lda,d_x,one,&beta,d_y,one));
57
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
66638 PetscCall(PetscLogGpuTimeEnd());
58 66638 PetscCall(PetscLogGpuFlops(2.0*n*k));
59 66638 PetscFunctionReturn(PETSC_SUCCESS);
60 }
61
62 /*
63 A(:,s:e-1) := A*B(:,s:e-1)
64 */
65 3357 PetscErrorCode BVMultInPlace_BLAS_CUDA(BV,PetscInt m_,PetscInt k_,PetscInt s,PetscInt e,PetscScalar *d_A,PetscInt lda_,const PetscScalar *d_B,PetscInt ldb_,PetscBool btrans)
66 {
67 3357 const PetscScalar *d_B1;
68 3357 PetscScalar *d_work,sone=1.0,szero=0.0;
69 3357 PetscCuBLASInt m=0,n=0,k=0,l=0,lda=0,ldb=0,bs=BLOCKSIZE;
70 3357 size_t freemem,totmem;
71 3357 cublasHandle_t cublasv2handle;
72 3357 cublasOperation_t bt;
73
74 3357 PetscFunctionBegin;
75
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
3357 PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
76
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
3357 PetscCall(PetscCuBLASIntCast(m_,&m));
77
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
3357 PetscCall(PetscCuBLASIntCast(e-s,&n));
78
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
3357 PetscCall(PetscCuBLASIntCast(k_,&k));
79
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
3357 PetscCall(PetscCuBLASIntCast(lda_,&lda));
80
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
3357 PetscCall(PetscCuBLASIntCast(ldb_,&ldb));
81
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
3357 PetscCall(PetscLogGpuTimeBegin());
82
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
3357 if (PetscUnlikely(btrans)) {
83 d_B1 = d_B+s;
84 bt = CUBLAS_OP_C;
85 } else {
86 3357 d_B1 = d_B+s*ldb;
87 3357 bt = CUBLAS_OP_N;
88 }
89 /* try to allocate the whole matrix */
90
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
3357 PetscCallCUDA(cudaMemGetInfo(&freemem,&totmem));
91
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
3357 if (freemem>=lda*n*sizeof(PetscScalar)) {
92
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
3357 PetscCallCUDA(cudaMalloc((void**)&d_work,lda*n*sizeof(PetscScalar)));
93
1/10
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
3357 PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,bt,m,n,k,&sone,d_A,lda,d_B1,ldb,&szero,d_work,lda));
94
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
3357 PetscCallCUDA(cudaMemcpy2D(d_A+s*lda,lda*sizeof(PetscScalar),d_work,lda*sizeof(PetscScalar),m*sizeof(PetscScalar),n,cudaMemcpyDeviceToDevice));
95 } else {
96 PetscCall(PetscCuBLASIntCast(freemem/(m*sizeof(PetscScalar)),&bs));
97 PetscCallCUDA(cudaMalloc((void**)&d_work,bs*n*sizeof(PetscScalar)));
98 PetscCall(PetscCuBLASIntCast(m % bs,&l));
99 if (l) {
100 PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,bt,l,n,k,&sone,d_A,lda,d_B1,ldb,&szero,d_work,l));
101 PetscCallCUDA(cudaMemcpy2D(d_A+s*lda,lda*sizeof(PetscScalar),d_work,l*sizeof(PetscScalar),l*sizeof(PetscScalar),n,cudaMemcpyDeviceToDevice));
102 }
103 for (;l<m;l+=bs) {
104 PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,bt,bs,n,k,&sone,d_A+l,lda,d_B1,ldb,&szero,d_work,bs));
105 PetscCallCUDA(cudaMemcpy2D(d_A+l+s*lda,lda*sizeof(PetscScalar),d_work,bs*sizeof(PetscScalar),bs*sizeof(PetscScalar),n,cudaMemcpyDeviceToDevice));
106 }
107 }
108
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
3357 PetscCall(PetscLogGpuTimeEnd());
109
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
3357 PetscCallCUDA(cudaFree(d_work));
110 3357 PetscCall(PetscLogGpuFlops(2.0*m*n*k));
111 3357 PetscFunctionReturn(PETSC_SUCCESS);
112 }
113
114 /*
115 B := alpha*A + beta*B
116 */
117 274 PetscErrorCode BVAXPY_BLAS_CUDA(BV,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *d_A,PetscInt lda_,PetscScalar beta,PetscScalar *d_B,PetscInt ldb_)
118 {
119 274 PetscCuBLASInt n=0,k=0,lda=0,ldb=0;
120 274 cublasHandle_t cublasv2handle;
121
122 274 PetscFunctionBegin;
123
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
274 PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
124
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
274 PetscCall(PetscCuBLASIntCast(n_,&n));
125
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
274 PetscCall(PetscCuBLASIntCast(k_,&k));
126
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
274 PetscCall(PetscCuBLASIntCast(lda_,&lda));
127
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
274 PetscCall(PetscCuBLASIntCast(ldb_,&ldb));
128
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
274 PetscCall(PetscLogGpuTimeBegin());
129
1/10
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
274 PetscCallCUBLAS(cublasXgeam(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,k,&alpha,d_A,lda,&beta,d_B,ldb,d_B,ldb));
130
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
274 PetscCall(PetscLogGpuTimeEnd());
131
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
415 PetscCall(PetscLogGpuFlops((beta==(PetscScalar)1.0)?2.0*n*k:3.0*n*k));
132 274 PetscFunctionReturn(PETSC_SUCCESS);
133 }
134
135 /*
136 C := A'*B
137
138 C is a CPU array
139 */
140 5285 PetscErrorCode BVDot_BLAS_CUDA(BV bv,PetscInt m_,PetscInt n_,PetscInt k_,const PetscScalar *d_A,PetscInt lda_,const PetscScalar *d_B,PetscInt ldb_,PetscScalar *C,PetscInt ldc_,PetscBool mpi)
141 {
142 5285 PetscScalar *d_work,sone=1.0,szero=0.0,*CC;
143 5285 PetscInt j;
144 5285 PetscCuBLASInt m=0,n=0,k=0,lda=0,ldb=0,ldc=0;
145 5285 PetscMPIInt len;
146 5285 cublasHandle_t cublasv2handle;
147
148 5285 PetscFunctionBegin;
149
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
5285 PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
150
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
5285 PetscCall(PetscCuBLASIntCast(m_,&m));
151
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
5285 PetscCall(PetscCuBLASIntCast(n_,&n));
152
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
5285 PetscCall(PetscCuBLASIntCast(k_,&k));
153
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
5285 PetscCall(PetscCuBLASIntCast(lda_,&lda));
154
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
5285 PetscCall(PetscCuBLASIntCast(ldb_,&ldb));
155
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
5285 PetscCall(PetscCuBLASIntCast(ldc_,&ldc));
156
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
5285 PetscCallCUDA(cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar)));
157
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
5285 if (mpi) {
158
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
608 if (ldc==m) {
159
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
128 PetscCall(BVAllocateWork_Private(bv,m*n));
160
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
128 if (k) {
161
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
128 PetscCall(PetscLogGpuTimeBegin());
162
1/10
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
128 PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,lda,d_B,ldb,&szero,d_work,ldc));
163
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
128 PetscCall(PetscLogGpuTimeEnd());
164
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
128 PetscCallCUDA(cudaMemcpy(bv->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
165 128 PetscCall(PetscLogGpuToCpu(m*n*sizeof(PetscScalar)));
166 } else PetscCall(PetscArrayzero(bv->work,m*n));
167
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
128 PetscCall(PetscMPIIntCast(m*n,&len));
168
2/6
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
128 PetscCallMPI(MPIU_Allreduce(bv->work,C,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv)));
169 } else {
170
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
480 PetscCall(BVAllocateWork_Private(bv,2*m*n));
171 480 CC = bv->work+m*n;
172
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
480 if (k) {
173
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
480 PetscCall(PetscLogGpuTimeBegin());
174
1/10
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
480 PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,lda,d_B,ldb,&szero,d_work,m));
175
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
480 PetscCall(PetscLogGpuTimeEnd());
176
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
480 PetscCallCUDA(cudaMemcpy(bv->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
177 480 PetscCall(PetscLogGpuToCpu(m*n*sizeof(PetscScalar)));
178 } else PetscCall(PetscArrayzero(bv->work,m*n));
179
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
480 PetscCall(PetscMPIIntCast(m*n,&len));
180
2/6
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
480 PetscCallMPI(MPIU_Allreduce(bv->work,CC,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv)));
181
3/4
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 2 times.
2592 for (j=0;j<n;j++) PetscCall(PetscArraycpy(C+j*ldc,CC+j*m,m));
182 }
183 } else {
184
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
4677 if (k) {
185
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
4677 PetscCall(BVAllocateWork_Private(bv,m*n));
186
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
4677 PetscCall(PetscLogGpuTimeBegin());
187
1/10
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
4677 PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,lda,d_B,ldb,&szero,d_work,m));
188
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
4677 PetscCall(PetscLogGpuTimeEnd());
189
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
4677 PetscCallCUDA(cudaMemcpy(bv->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
190 4677 PetscCall(PetscLogGpuToCpu(m*n*sizeof(PetscScalar)));
191
3/4
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 2 times.
31585 for (j=0;j<n;j++) PetscCall(PetscArraycpy(C+j*ldc,bv->work+j*m,m));
192 }
193 }
194
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
5285 PetscCallCUDA(cudaFree(d_work));
195 5285 PetscCall(PetscLogGpuFlops(2.0*m*n*k));
196 5285 PetscFunctionReturn(PETSC_SUCCESS);
197 }
198
199 /*
200 y := A'*x
201
202 y is a CPU array, if NULL bv->buffer is used as a workspace
203 */
204 39850 PetscErrorCode BVDotVec_BLAS_CUDA(BV bv,PetscInt n_,PetscInt k_,const PetscScalar *d_A,PetscInt lda_,const PetscScalar *d_x,PetscScalar *y,PetscBool mpi)
205 {
206 39850 PetscScalar *d_work,szero=0.0,sone=1.0,*yy;
207 39850 PetscCuBLASInt n=0,k=0,lda=0,one=1;
208 39850 PetscMPIInt len;
209 39850 cublasHandle_t cublasv2handle;
210
211 39850 PetscFunctionBegin;
212
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
39850 PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
213
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
39850 PetscCall(PetscCuBLASIntCast(n_,&n));
214
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
39850 PetscCall(PetscCuBLASIntCast(k_,&k));
215
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
39850 PetscCall(PetscCuBLASIntCast(lda_,&lda));
216
3/4
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
39850 if (!y) PetscCall(VecCUDAGetArrayWrite(bv->buffer,&d_work));
217
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
7788 else PetscCallCUDA(cudaMalloc((void**)&d_work,k*sizeof(PetscScalar)));
218
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
39850 if (mpi) {
219
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
976 PetscCall(BVAllocateWork_Private(bv,k));
220
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
976 if (n) {
221
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
976 PetscCall(PetscLogGpuTimeBegin());
222
1/10
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
976 PetscCallCUBLAS(cublasXgemv(cublasv2handle,CUBLAS_OP_C,n,k,&sone,d_A,lda,d_x,one,&szero,d_work,one));
223
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
976 PetscCall(PetscLogGpuTimeEnd());
224
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
976 PetscCallCUDA(cudaMemcpy(bv->work,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
225 976 PetscCall(PetscLogGpuToCpu(k*sizeof(PetscScalar)));
226 } else PetscCall(PetscArrayzero(bv->work,k));
227 /* reduction */
228
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
976 PetscCall(PetscMPIIntCast(k,&len));
229
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
976 if (!y) {
230
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
512 if (use_gpu_aware_mpi) { /* case 1: reduce on GPU using a temporary buffer */
231
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
512 PetscCallCUDA(cudaMalloc((void**)&yy,k*sizeof(PetscScalar)));
232
2/6
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
512 PetscCallMPI(MPIU_Allreduce(d_work,yy,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv)));
233
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
512 PetscCallCUDA(cudaMemcpy(d_work,yy,k*sizeof(PetscScalar),cudaMemcpyDeviceToDevice));
234
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
512 PetscCallCUDA(cudaFree(yy));
235 } else { /* case 2: reduce on CPU, copy result back to GPU */
236 PetscCall(BVAllocateWork_Private(bv,2*k));
237 yy = bv->work+k;
238 PetscCallCUDA(cudaMemcpy(bv->work,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
239 PetscCall(PetscLogGpuToCpu(k*sizeof(PetscScalar)));
240 PetscCallMPI(MPIU_Allreduce(bv->work,yy,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv)));
241 PetscCallCUDA(cudaMemcpy(d_work,yy,k*sizeof(PetscScalar),cudaMemcpyHostToDevice));
242 PetscCall(PetscLogCpuToGpu(k*sizeof(PetscScalar)));
243 }
244
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
512 PetscCall(VecCUDARestoreArrayWrite(bv->buffer,&d_work));
245 } else { /* case 3: user-provided array y, reduce on CPU */
246
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
464 PetscCallCUDA(cudaFree(d_work));
247
2/6
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
464 PetscCallMPI(MPIU_Allreduce(bv->work,y,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv)));
248 }
249 } else {
250
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
38874 if (n) {
251
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
38874 PetscCall(PetscLogGpuTimeBegin());
252
1/10
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
38874 PetscCallCUBLAS(cublasXgemv(cublasv2handle,CUBLAS_OP_C,n,k,&sone,d_A,lda,d_x,one,&szero,d_work,one));
253
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
38874 PetscCall(PetscLogGpuTimeEnd());
254 }
255
3/4
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
38874 if (!y) PetscCall(VecCUDARestoreArrayWrite(bv->buffer,&d_work));
256 else {
257
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
7324 PetscCallCUDA(cudaMemcpy(y,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
258 7324 PetscCall(PetscLogGpuToCpu(k*sizeof(PetscScalar)));
259
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
7324 PetscCallCUDA(cudaFree(d_work));
260 }
261 }
262 39850 PetscCall(PetscLogGpuFlops(2.0*n*k));
263 39850 PetscFunctionReturn(PETSC_SUCCESS);
264 }
265
266 /*
267 Scale n scalars
268 */
269 25132 PetscErrorCode BVScale_BLAS_CUDA(BV,PetscInt n_,PetscScalar *d_A,PetscScalar alpha)
270 {
271 25132 PetscCuBLASInt n=0,one=1;
272 25132 cublasHandle_t cublasv2handle;
273
274 25132 PetscFunctionBegin;
275
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
25132 PetscCall(PetscCuBLASIntCast(n_,&n));
276
5/8
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 2 times.
✗ Branch 4 not taken.
✓ Branch 5 taken 1 times.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
37496 if (PetscUnlikely(alpha == (PetscScalar)0.0)) PetscCallCUDA(cudaMemset(d_A,0,n*sizeof(PetscScalar)));
277
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
37286 else if (alpha != (PetscScalar)1.0) {
278
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
24922 PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
279
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
24922 PetscCall(PetscLogGpuTimeBegin());
280
1/10
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
24922 PetscCallCUBLAS(cublasXscal(cublasv2handle,n,&alpha,d_A,one));
281
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
24922 PetscCall(PetscLogGpuTimeEnd());
282 24922 PetscCall(PetscLogGpuFlops(1.0*n));
283 }
284 PetscFunctionReturn(PETSC_SUCCESS);
285 }
286
287 /*
288 Compute 2-norm of vector consisting of n scalars
289 */
290 2526 PetscErrorCode BVNorm_BLAS_CUDA(BV,PetscInt n_,const PetscScalar *d_A,PetscReal *nrm)
291 {
292 2526 PetscCuBLASInt n=0,one=1;
293 2526 cublasHandle_t cublasv2handle;
294
295 2526 PetscFunctionBegin;
296
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
2526 PetscCall(PetscCuBLASIntCast(n_,&n));
297
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
2526 PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
298
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
2526 PetscCall(PetscLogGpuTimeBegin());
299
1/10
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
2526 PetscCallCUBLAS(cublasXnrm2(cublasv2handle,n,d_A,one,nrm));
300
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
2526 PetscCall(PetscLogGpuTimeEnd());
301 2526 PetscCall(PetscLogGpuFlops(2.0*n));
302 2526 PetscFunctionReturn(PETSC_SUCCESS);
303 }
304
305 /*
306 Normalize the columns of A
307 */
308 496 PetscErrorCode BVNormalize_BLAS_CUDA(BV,PetscInt m_,PetscInt n_,PetscScalar *d_A,PetscInt lda_,PetscScalar *eigi)
309 {
310 496 PetscInt j,k;
311 496 PetscReal nrm,nrm1;
312 496 PetscScalar alpha;
313 496 PetscCuBLASInt m=0,one=1;
314 496 cublasHandle_t cublasv2handle;
315
316 496 PetscFunctionBegin;
317
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
496 PetscCall(PetscCuBLASIntCast(m_,&m));
318
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
496 PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
319
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
496 PetscCall(PetscLogGpuTimeBegin());
320
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
9480 for (j=0;j<n_;j++) {
321 8984 k = 1;
322 #if !defined(PETSC_USE_COMPLEX)
323
4/4
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 1 times.
4598 if (eigi && eigi[j] != 0.0) k = 2;
324 #endif
325
1/10
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
8984 PetscCallCUBLAS(cublasXnrm2(cublasv2handle,m,d_A+j*lda_,one,&nrm));
326
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
8984 if (k==2) {
327
1/10
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
4 PetscCallCUBLAS(cublasXnrm2(cublasv2handle,m,d_A+(j+1)*lda_,one,&nrm1));
328 4 nrm = SlepcAbs(nrm,nrm1);
329 }
330 8984 alpha = 1.0/nrm;
331
1/10
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
8984 PetscCallCUBLAS(cublasXscal(cublasv2handle,m,&alpha,d_A+j*lda_,one));
332
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
8984 if (k==2) {
333
1/10
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
4 PetscCallCUBLAS(cublasXscal(cublasv2handle,m,&alpha,d_A+(j+1)*lda_,one));
334 j++;
335 }
336 }
337
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
496 PetscCall(PetscLogGpuTimeEnd());
338 496 PetscCall(PetscLogGpuFlops(3.0*m*n_));
339 496 PetscFunctionReturn(PETSC_SUCCESS);
340 }
341
342 /*
343 BV_CleanCoefficients_CUDA - Sets to zero all entries of column j of the bv buffer
344 */
345 23914 PetscErrorCode BV_CleanCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h)
346 {
347 23914 PetscScalar *d_hh,*d_a;
348 23914 PetscInt i;
349
350 23914 PetscFunctionBegin;
351
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
23914 if (!h) {
352
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
22259 PetscCall(VecCUDAGetArray(bv->buffer,&d_a));
353
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
22259 PetscCall(PetscLogGpuTimeBegin());
354 22259 d_hh = d_a + j*(bv->nc+bv->m);
355
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
22259 PetscCallCUDA(cudaMemset(d_hh,0,(bv->nc+j)*sizeof(PetscScalar)));
356
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
22259 PetscCall(PetscLogGpuTimeEnd());
357
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
22259 PetscCall(VecCUDARestoreArray(bv->buffer,&d_a));
358 } else { /* cpu memory */
359
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
13553 for (i=0;i<bv->nc+j;i++) h[i] = 0.0;
360 }
361 PetscFunctionReturn(PETSC_SUCCESS);
362 }
363
364 /*
365 BV_AddCoefficients_CUDA - Add the contents of the scratch (0-th column) of the bv buffer
366 into column j of the bv buffer
367 */
368 34464 PetscErrorCode BV_AddCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscScalar *c)
369 {
370
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
34464 PetscScalar *d_h,*d_c,sone=1.0;
371 34464 PetscInt i;
372 34464 PetscCuBLASInt idx=0,one=1;
373 34464 cublasHandle_t cublasv2handle;
374
375 34464 PetscFunctionBegin;
376
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
34464 if (!h) {
377
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
32514 PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
378
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
32514 PetscCall(VecCUDAGetArray(bv->buffer,&d_c));
379 32514 d_h = d_c + j*(bv->nc+bv->m);
380
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
32514 PetscCall(PetscCuBLASIntCast(bv->nc+j,&idx));
381
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
32514 PetscCall(PetscLogGpuTimeBegin());
382
1/10
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
32514 PetscCallCUBLAS(cublasXaxpy(cublasv2handle,idx,&sone,d_c,one,d_h,one));
383
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
32514 PetscCall(PetscLogGpuTimeEnd());
384 32514 PetscCall(PetscLogGpuFlops(1.0*(bv->nc+j)));
385
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
32514 PetscCall(VecCUDARestoreArray(bv->buffer,&d_c));
386 } else { /* cpu memory */
387
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
15766 for (i=0;i<bv->nc+j;i++) h[i] += c[i];
388 1950 PetscCall(PetscLogFlops(1.0*(bv->nc+j)));
389 }
390 PetscFunctionReturn(PETSC_SUCCESS);
391 }
392
393 /*
394 BV_SetValue_CUDA - Sets value in row j (counted after the constraints) of column k
395 of the coefficients array
396 */
397 25123 PetscErrorCode BV_SetValue_CUDA(BV bv,PetscInt j,PetscInt k,PetscScalar *h,PetscScalar value)
398 {
399 25123 PetscScalar *d_h,*a;
400
401 25123 PetscFunctionBegin;
402
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
25123 if (!h) {
403
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
24723 PetscCall(VecCUDAGetArray(bv->buffer,&a));
404
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
24723 PetscCall(PetscLogGpuTimeBegin());
405 24723 d_h = a + k*(bv->nc+bv->m) + bv->nc+j;
406
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
24723 PetscCallCUDA(cudaMemcpy(d_h,&value,sizeof(PetscScalar),cudaMemcpyHostToDevice));
407 24723 PetscCall(PetscLogCpuToGpu(sizeof(PetscScalar)));
408
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
24723 PetscCall(PetscLogGpuTimeEnd());
409
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
24723 PetscCall(VecCUDARestoreArray(bv->buffer,&a));
410 } else { /* cpu memory */
411 400 h[bv->nc+j] = value;
412 }
413 PetscFunctionReturn(PETSC_SUCCESS);
414 }
415
416 /*
417 BV_SquareSum_CUDA - Returns the value h'*h, where h represents the contents of the
418 coefficients array (up to position j)
419 */
420 33564 PetscErrorCode BV_SquareSum_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscReal *sum)
421 {
422 33564 const PetscScalar *d_h;
423 33564 PetscScalar dot;
424 33564 PetscInt i;
425 33564 PetscCuBLASInt idx=0,one=1;
426 33564 cublasHandle_t cublasv2handle;
427
428 33564 PetscFunctionBegin;
429
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
33564 if (!h) {
430
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
31846 PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
431
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
31846 PetscCall(VecCUDAGetArrayRead(bv->buffer,&d_h));
432
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
31846 PetscCall(PetscCuBLASIntCast(bv->nc+j,&idx));
433
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
31846 PetscCall(PetscLogGpuTimeBegin());
434
1/10
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
31846 PetscCallCUBLAS(cublasXdot(cublasv2handle,idx,d_h,one,d_h,one,&dot));
435
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
31846 PetscCall(PetscLogGpuTimeEnd());
436 31846 PetscCall(PetscLogGpuFlops(2.0*(bv->nc+j)));
437 31846 *sum = PetscRealPart(dot);
438
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
31846 PetscCall(VecCUDARestoreArrayRead(bv->buffer,&d_h));
439 } else { /* cpu memory */
440 1718 *sum = 0.0;
441
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
14798 for (i=0;i<bv->nc+j;i++) *sum += PetscRealPart(h[i]*PetscConj(h[i]));
442 1718 PetscCall(PetscLogFlops(2.0*(bv->nc+j)));
443 }
444 PetscFunctionReturn(PETSC_SUCCESS);
445 }
446
447 /* pointwise multiplication */
448 184 static __global__ void PointwiseMult_kernel(PetscInt xcount,PetscScalar *a,const PetscScalar *b,PetscInt n)
449 {
450 PetscInt x;
451
452 x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;
453 if (x<n) a[x] *= PetscRealPart(b[x]);
454 184 }
455
456 /* pointwise division */
457 184 static __global__ void PointwiseDiv_kernel(PetscInt xcount,PetscScalar *a,const PetscScalar *b,PetscInt n)
458 {
459 PetscInt x;
460
461 x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;
462 if (x<n) a[x] /= PetscRealPart(b[x]);
463 184 }
464
465 /*
466 BV_ApplySignature_CUDA - Computes the pointwise product h*omega, where h represents
467 the contents of the coefficients array (up to position j) and omega is the signature;
468 if inverse=TRUE then the operation is h/omega
469 */
470 392 PetscErrorCode BV_ApplySignature_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscBool inverse)
471 {
472 392 PetscScalar *d_h;
473 392 const PetscScalar *d_omega,*omega;
474 392 PetscInt i,xcount;
475 392 dim3 blocks3d, threads3d;
476
477 392 PetscFunctionBegin;
478
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
392 if (!(bv->nc+j)) PetscFunctionReturn(PETSC_SUCCESS);
479
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
368 if (!h) {
480
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
368 PetscCall(VecCUDAGetArray(bv->buffer,&d_h));
481
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
368 PetscCall(VecCUDAGetArrayRead(bv->omega,&d_omega));
482
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
368 PetscCall(SlepcKernelSetGrid1D(bv->nc+j,&blocks3d,&threads3d,&xcount));
483
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
368 PetscCall(PetscLogGpuTimeBegin());
484
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
368 if (inverse) {
485
3/4
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 2 times.
368 for (i=0;i<xcount;i++) PointwiseDiv_kernel<<<blocks3d,threads3d>>>(i,d_h,d_omega,bv->nc+j);
486 } else {
487
3/4
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 2 times.
368 for (i=0;i<xcount;i++) PointwiseMult_kernel<<<blocks3d,threads3d>>>(i,d_h,d_omega,bv->nc+j);
488 }
489
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
368 PetscCallCUDA(cudaGetLastError());
490
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
368 PetscCall(PetscLogGpuTimeEnd());
491 368 PetscCall(PetscLogGpuFlops(1.0*(bv->nc+j)));
492
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
368 PetscCall(VecCUDARestoreArrayRead(bv->omega,&d_omega));
493
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
368 PetscCall(VecCUDARestoreArray(bv->buffer,&d_h));
494 } else {
495 PetscCall(VecGetArrayRead(bv->omega,&omega));
496 if (inverse) for (i=0;i<bv->nc+j;i++) h[i] /= PetscRealPart(omega[i]);
497 else for (i=0;i<bv->nc+j;i++) h[i] *= PetscRealPart(omega[i]);
498 PetscCall(VecRestoreArrayRead(bv->omega,&omega));
499 PetscCall(PetscLogFlops(1.0*(bv->nc+j)));
500 }
501 PetscFunctionReturn(PETSC_SUCCESS);
502 }
503
504 /*
505 BV_SquareRoot_CUDA - Returns the square root of position j (counted after the constraints)
506 of the coefficients array
507 */
508 31944 PetscErrorCode BV_SquareRoot_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscReal *beta)
509 {
510 31944 const PetscScalar *d_h;
511 31944 PetscScalar hh;
512
513 31944 PetscFunctionBegin;
514
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
31944 if (!h) {
515
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
31944 PetscCall(VecCUDAGetArrayRead(bv->buffer,&d_h));
516
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
31944 PetscCall(PetscLogGpuTimeBegin());
517
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
31944 PetscCallCUDA(cudaMemcpy(&hh,d_h+bv->nc+j,sizeof(PetscScalar),cudaMemcpyDeviceToHost));
518 31944 PetscCall(PetscLogGpuToCpu(sizeof(PetscScalar)));
519
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
31944 PetscCall(PetscLogGpuTimeEnd());
520
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
31944 PetscCall(BV_SafeSqrt(bv,hh,beta));
521
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
31944 PetscCall(VecCUDARestoreArrayRead(bv->buffer,&d_h));
522 } else PetscCall(BV_SafeSqrt(bv,h[bv->nc+j],beta));
523 PetscFunctionReturn(PETSC_SUCCESS);
524 }
525
526 /*
527 BV_StoreCoefficients_CUDA - Copy the contents of the coefficients array to an array dest
528 provided by the caller (only values from l to j are copied)
529 */
530 1321 PetscErrorCode BV_StoreCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscScalar *dest)
531 {
532 1321 const PetscScalar *d_h,*d_a;
533 1321 PetscInt i;
534
535 1321 PetscFunctionBegin;
536
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
1321 if (!h) {
537
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
1321 PetscCall(VecCUDAGetArrayRead(bv->buffer,&d_a));
538
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
1321 PetscCall(PetscLogGpuTimeBegin());
539 1321 d_h = d_a + j*(bv->nc+bv->m)+bv->nc;
540
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
1321 PetscCallCUDA(cudaMemcpy(dest-bv->l,d_h,(j-bv->l)*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
541 1321 PetscCall(PetscLogGpuToCpu((j-bv->l)*sizeof(PetscScalar)));
542
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
1321 PetscCall(PetscLogGpuTimeEnd());
543
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
1321 PetscCall(VecCUDARestoreArrayRead(bv->buffer,&d_a));
544 } else {
545 for (i=bv->l;i<j;i++) dest[i-bv->l] = h[bv->nc+i];
546 }
547 PetscFunctionReturn(PETSC_SUCCESS);
548 }
549