Line | Branch | Exec | Source |
---|---|---|---|
1 | /* | ||
2 | - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | ||
3 | SLEPc - Scalable Library for Eigenvalue Problem Computations | ||
4 | Copyright (c) 2002-, Universitat Politecnica de Valencia, Spain | ||
5 | |||
6 | This file is part of SLEPc. | ||
7 | SLEPc is distributed under a 2-clause BSD license (see LICENSE). | ||
8 | - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | ||
9 | */ | ||
10 | /* | ||
11 | CUDA-related code common to several BV impls | ||
12 | */ | ||
13 | |||
14 | #include <slepc/private/bvimpl.h> | ||
15 | #include <slepccupmblas.h> | ||
16 | |||
17 | #define BLOCKSIZE 64 | ||
18 | |||
19 | /* | ||
20 | C := alpha*A*B + beta*C | ||
21 | */ | ||
22 | 444 | PetscErrorCode BVMult_BLAS_CUDA(BV,PetscInt m_,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *d_A,PetscInt lda_,const PetscScalar *d_B,PetscInt ldb_,PetscScalar beta,PetscScalar *d_C,PetscInt ldc_) | |
23 | { | ||
24 | 444 | PetscCuBLASInt m=0,n=0,k=0,lda=0,ldb=0,ldc=0; | |
25 | 444 | cublasHandle_t cublasv2handle; | |
26 | |||
27 | 444 | PetscFunctionBegin; | |
28 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
444 | PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); |
29 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
444 | PetscCall(PetscCuBLASIntCast(m_,&m)); |
30 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
444 | PetscCall(PetscCuBLASIntCast(n_,&n)); |
31 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
444 | PetscCall(PetscCuBLASIntCast(k_,&k)); |
32 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
444 | PetscCall(PetscCuBLASIntCast(lda_,&lda)); |
33 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
444 | PetscCall(PetscCuBLASIntCast(ldb_,&ldb)); |
34 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
444 | PetscCall(PetscCuBLASIntCast(ldc_,&ldc)); |
35 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
444 | PetscCall(PetscLogGpuTimeBegin()); |
36 |
1/10✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
|
444 | PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,m,n,k,&alpha,d_A,lda,d_B,ldb,&beta,d_C,ldc)); |
37 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
444 | PetscCall(PetscLogGpuTimeEnd()); |
38 | 444 | PetscCall(PetscLogGpuFlops(2.0*m*n*k)); | |
39 | 444 | PetscFunctionReturn(PETSC_SUCCESS); | |
40 | } | ||
41 | |||
42 | /* | ||
43 | y := alpha*A*x + beta*y | ||
44 | */ | ||
45 | 66638 | PetscErrorCode BVMultVec_BLAS_CUDA(BV,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *d_A,PetscInt lda_,const PetscScalar *d_x,PetscScalar beta,PetscScalar *d_y) | |
46 | { | ||
47 | 66638 | PetscCuBLASInt n=0,k=0,lda=0,one=1; | |
48 | 66638 | cublasHandle_t cublasv2handle; | |
49 | |||
50 | 66638 | PetscFunctionBegin; | |
51 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
66638 | PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); |
52 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
66638 | PetscCall(PetscCuBLASIntCast(n_,&n)); |
53 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
66638 | PetscCall(PetscCuBLASIntCast(k_,&k)); |
54 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
66638 | PetscCall(PetscCuBLASIntCast(lda_,&lda)); |
55 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
66638 | PetscCall(PetscLogGpuTimeBegin()); |
56 |
1/10✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
|
66638 | PetscCallCUBLAS(cublasXgemv(cublasv2handle,CUBLAS_OP_N,n,k,&alpha,d_A,lda,d_x,one,&beta,d_y,one)); |
57 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
66638 | PetscCall(PetscLogGpuTimeEnd()); |
58 | 66638 | PetscCall(PetscLogGpuFlops(2.0*n*k)); | |
59 | 66638 | PetscFunctionReturn(PETSC_SUCCESS); | |
60 | } | ||
61 | |||
62 | /* | ||
63 | A(:,s:e-1) := A*B(:,s:e-1) | ||
64 | */ | ||
65 | 3357 | PetscErrorCode BVMultInPlace_BLAS_CUDA(BV,PetscInt m_,PetscInt k_,PetscInt s,PetscInt e,PetscScalar *d_A,PetscInt lda_,const PetscScalar *d_B,PetscInt ldb_,PetscBool btrans) | |
66 | { | ||
67 | 3357 | const PetscScalar *d_B1; | |
68 | 3357 | PetscScalar *d_work,sone=1.0,szero=0.0; | |
69 | 3357 | PetscCuBLASInt m=0,n=0,k=0,l=0,lda=0,ldb=0,bs=BLOCKSIZE; | |
70 | 3357 | size_t freemem,totmem; | |
71 | 3357 | cublasHandle_t cublasv2handle; | |
72 | 3357 | cublasOperation_t bt; | |
73 | |||
74 | 3357 | PetscFunctionBegin; | |
75 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
3357 | PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); |
76 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
3357 | PetscCall(PetscCuBLASIntCast(m_,&m)); |
77 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
3357 | PetscCall(PetscCuBLASIntCast(e-s,&n)); |
78 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
3357 | PetscCall(PetscCuBLASIntCast(k_,&k)); |
79 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
3357 | PetscCall(PetscCuBLASIntCast(lda_,&lda)); |
80 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
3357 | PetscCall(PetscCuBLASIntCast(ldb_,&ldb)); |
81 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
3357 | PetscCall(PetscLogGpuTimeBegin()); |
82 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
3357 | if (PetscUnlikely(btrans)) { |
83 | ✗ | d_B1 = d_B+s; | |
84 | ✗ | bt = CUBLAS_OP_C; | |
85 | } else { | ||
86 | 3357 | d_B1 = d_B+s*ldb; | |
87 | 3357 | bt = CUBLAS_OP_N; | |
88 | } | ||
89 | /* try to allocate the whole matrix */ | ||
90 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
3357 | PetscCallCUDA(cudaMemGetInfo(&freemem,&totmem)); |
91 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
3357 | if (freemem>=lda*n*sizeof(PetscScalar)) { |
92 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
3357 | PetscCallCUDA(cudaMalloc((void**)&d_work,lda*n*sizeof(PetscScalar))); |
93 |
1/10✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
|
3357 | PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,bt,m,n,k,&sone,d_A,lda,d_B1,ldb,&szero,d_work,lda)); |
94 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
3357 | PetscCallCUDA(cudaMemcpy2D(d_A+s*lda,lda*sizeof(PetscScalar),d_work,lda*sizeof(PetscScalar),m*sizeof(PetscScalar),n,cudaMemcpyDeviceToDevice)); |
95 | } else { | ||
96 | ✗ | PetscCall(PetscCuBLASIntCast(freemem/(m*sizeof(PetscScalar)),&bs)); | |
97 | ✗ | PetscCallCUDA(cudaMalloc((void**)&d_work,bs*n*sizeof(PetscScalar))); | |
98 | ✗ | PetscCall(PetscCuBLASIntCast(m % bs,&l)); | |
99 | ✗ | if (l) { | |
100 | ✗ | PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,bt,l,n,k,&sone,d_A,lda,d_B1,ldb,&szero,d_work,l)); | |
101 | ✗ | PetscCallCUDA(cudaMemcpy2D(d_A+s*lda,lda*sizeof(PetscScalar),d_work,l*sizeof(PetscScalar),l*sizeof(PetscScalar),n,cudaMemcpyDeviceToDevice)); | |
102 | } | ||
103 | ✗ | for (;l<m;l+=bs) { | |
104 | ✗ | PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,bt,bs,n,k,&sone,d_A+l,lda,d_B1,ldb,&szero,d_work,bs)); | |
105 | ✗ | PetscCallCUDA(cudaMemcpy2D(d_A+l+s*lda,lda*sizeof(PetscScalar),d_work,bs*sizeof(PetscScalar),bs*sizeof(PetscScalar),n,cudaMemcpyDeviceToDevice)); | |
106 | } | ||
107 | } | ||
108 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
3357 | PetscCall(PetscLogGpuTimeEnd()); |
109 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
3357 | PetscCallCUDA(cudaFree(d_work)); |
110 | 3357 | PetscCall(PetscLogGpuFlops(2.0*m*n*k)); | |
111 | 3357 | PetscFunctionReturn(PETSC_SUCCESS); | |
112 | } | ||
113 | |||
114 | /* | ||
115 | B := alpha*A + beta*B | ||
116 | */ | ||
117 | 274 | PetscErrorCode BVAXPY_BLAS_CUDA(BV,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *d_A,PetscInt lda_,PetscScalar beta,PetscScalar *d_B,PetscInt ldb_) | |
118 | { | ||
119 | 274 | PetscCuBLASInt n=0,k=0,lda=0,ldb=0; | |
120 | 274 | cublasHandle_t cublasv2handle; | |
121 | |||
122 | 274 | PetscFunctionBegin; | |
123 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
274 | PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); |
124 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
274 | PetscCall(PetscCuBLASIntCast(n_,&n)); |
125 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
274 | PetscCall(PetscCuBLASIntCast(k_,&k)); |
126 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
274 | PetscCall(PetscCuBLASIntCast(lda_,&lda)); |
127 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
274 | PetscCall(PetscCuBLASIntCast(ldb_,&ldb)); |
128 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
274 | PetscCall(PetscLogGpuTimeBegin()); |
129 |
1/10✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
|
274 | PetscCallCUBLAS(cublasXgeam(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,k,&alpha,d_A,lda,&beta,d_B,ldb,d_B,ldb)); |
130 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
274 | PetscCall(PetscLogGpuTimeEnd()); |
131 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
415 | PetscCall(PetscLogGpuFlops((beta==(PetscScalar)1.0)?2.0*n*k:3.0*n*k)); |
132 | 274 | PetscFunctionReturn(PETSC_SUCCESS); | |
133 | } | ||
134 | |||
135 | /* | ||
136 | C := A'*B | ||
137 | |||
138 | C is a CPU array | ||
139 | */ | ||
140 | 5285 | PetscErrorCode BVDot_BLAS_CUDA(BV bv,PetscInt m_,PetscInt n_,PetscInt k_,const PetscScalar *d_A,PetscInt lda_,const PetscScalar *d_B,PetscInt ldb_,PetscScalar *C,PetscInt ldc_,PetscBool mpi) | |
141 | { | ||
142 | 5285 | PetscScalar *d_work,sone=1.0,szero=0.0,*CC; | |
143 | 5285 | PetscInt j; | |
144 | 5285 | PetscCuBLASInt m=0,n=0,k=0,lda=0,ldb=0,ldc=0; | |
145 | 5285 | PetscMPIInt len; | |
146 | 5285 | cublasHandle_t cublasv2handle; | |
147 | |||
148 | 5285 | PetscFunctionBegin; | |
149 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
5285 | PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); |
150 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
5285 | PetscCall(PetscCuBLASIntCast(m_,&m)); |
151 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
5285 | PetscCall(PetscCuBLASIntCast(n_,&n)); |
152 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
5285 | PetscCall(PetscCuBLASIntCast(k_,&k)); |
153 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
5285 | PetscCall(PetscCuBLASIntCast(lda_,&lda)); |
154 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
5285 | PetscCall(PetscCuBLASIntCast(ldb_,&ldb)); |
155 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
5285 | PetscCall(PetscCuBLASIntCast(ldc_,&ldc)); |
156 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
5285 | PetscCallCUDA(cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar))); |
157 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
5285 | if (mpi) { |
158 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
608 | if (ldc==m) { |
159 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
128 | PetscCall(BVAllocateWork_Private(bv,m*n)); |
160 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
128 | if (k) { |
161 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
128 | PetscCall(PetscLogGpuTimeBegin()); |
162 |
1/10✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
|
128 | PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,lda,d_B,ldb,&szero,d_work,ldc)); |
163 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
128 | PetscCall(PetscLogGpuTimeEnd()); |
164 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
128 | PetscCallCUDA(cudaMemcpy(bv->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost)); |
165 | 128 | PetscCall(PetscLogGpuToCpu(m*n*sizeof(PetscScalar))); | |
166 | ✗ | } else PetscCall(PetscArrayzero(bv->work,m*n)); | |
167 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
128 | PetscCall(PetscMPIIntCast(m*n,&len)); |
168 |
2/6✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
|
128 | PetscCallMPI(MPIU_Allreduce(bv->work,C,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv))); |
169 | } else { | ||
170 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
480 | PetscCall(BVAllocateWork_Private(bv,2*m*n)); |
171 | 480 | CC = bv->work+m*n; | |
172 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
480 | if (k) { |
173 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
480 | PetscCall(PetscLogGpuTimeBegin()); |
174 |
1/10✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
|
480 | PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,lda,d_B,ldb,&szero,d_work,m)); |
175 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
480 | PetscCall(PetscLogGpuTimeEnd()); |
176 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
480 | PetscCallCUDA(cudaMemcpy(bv->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost)); |
177 | 480 | PetscCall(PetscLogGpuToCpu(m*n*sizeof(PetscScalar))); | |
178 | ✗ | } else PetscCall(PetscArrayzero(bv->work,m*n)); | |
179 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
480 | PetscCall(PetscMPIIntCast(m*n,&len)); |
180 |
2/6✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
|
480 | PetscCallMPI(MPIU_Allreduce(bv->work,CC,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv))); |
181 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 2 times.
|
2592 | for (j=0;j<n;j++) PetscCall(PetscArraycpy(C+j*ldc,CC+j*m,m)); |
182 | } | ||
183 | } else { | ||
184 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
4677 | if (k) { |
185 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
4677 | PetscCall(BVAllocateWork_Private(bv,m*n)); |
186 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
4677 | PetscCall(PetscLogGpuTimeBegin()); |
187 |
1/10✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
|
4677 | PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,lda,d_B,ldb,&szero,d_work,m)); |
188 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
4677 | PetscCall(PetscLogGpuTimeEnd()); |
189 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
4677 | PetscCallCUDA(cudaMemcpy(bv->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost)); |
190 | 4677 | PetscCall(PetscLogGpuToCpu(m*n*sizeof(PetscScalar))); | |
191 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 2 times.
|
31585 | for (j=0;j<n;j++) PetscCall(PetscArraycpy(C+j*ldc,bv->work+j*m,m)); |
192 | } | ||
193 | } | ||
194 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
5285 | PetscCallCUDA(cudaFree(d_work)); |
195 | 5285 | PetscCall(PetscLogGpuFlops(2.0*m*n*k)); | |
196 | 5285 | PetscFunctionReturn(PETSC_SUCCESS); | |
197 | } | ||
198 | |||
199 | /* | ||
200 | y := A'*x | ||
201 | |||
202 | y is a CPU array, if NULL bv->buffer is used as a workspace | ||
203 | */ | ||
204 | 39850 | PetscErrorCode BVDotVec_BLAS_CUDA(BV bv,PetscInt n_,PetscInt k_,const PetscScalar *d_A,PetscInt lda_,const PetscScalar *d_x,PetscScalar *y,PetscBool mpi) | |
205 | { | ||
206 | 39850 | PetscScalar *d_work,szero=0.0,sone=1.0,*yy; | |
207 | 39850 | PetscCuBLASInt n=0,k=0,lda=0,one=1; | |
208 | 39850 | PetscMPIInt len; | |
209 | 39850 | cublasHandle_t cublasv2handle; | |
210 | |||
211 | 39850 | PetscFunctionBegin; | |
212 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
39850 | PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); |
213 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
39850 | PetscCall(PetscCuBLASIntCast(n_,&n)); |
214 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
39850 | PetscCall(PetscCuBLASIntCast(k_,&k)); |
215 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
39850 | PetscCall(PetscCuBLASIntCast(lda_,&lda)); |
216 |
3/4✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
39850 | if (!y) PetscCall(VecCUDAGetArrayWrite(bv->buffer,&d_work)); |
217 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
7788 | else PetscCallCUDA(cudaMalloc((void**)&d_work,k*sizeof(PetscScalar))); |
218 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
39850 | if (mpi) { |
219 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
976 | PetscCall(BVAllocateWork_Private(bv,k)); |
220 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
976 | if (n) { |
221 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
976 | PetscCall(PetscLogGpuTimeBegin()); |
222 |
1/10✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
|
976 | PetscCallCUBLAS(cublasXgemv(cublasv2handle,CUBLAS_OP_C,n,k,&sone,d_A,lda,d_x,one,&szero,d_work,one)); |
223 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
976 | PetscCall(PetscLogGpuTimeEnd()); |
224 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
976 | PetscCallCUDA(cudaMemcpy(bv->work,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost)); |
225 | 976 | PetscCall(PetscLogGpuToCpu(k*sizeof(PetscScalar))); | |
226 | ✗ | } else PetscCall(PetscArrayzero(bv->work,k)); | |
227 | /* reduction */ | ||
228 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
976 | PetscCall(PetscMPIIntCast(k,&len)); |
229 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
976 | if (!y) { |
230 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
512 | if (use_gpu_aware_mpi) { /* case 1: reduce on GPU using a temporary buffer */ |
231 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
512 | PetscCallCUDA(cudaMalloc((void**)&yy,k*sizeof(PetscScalar))); |
232 |
2/6✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
|
512 | PetscCallMPI(MPIU_Allreduce(d_work,yy,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv))); |
233 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
512 | PetscCallCUDA(cudaMemcpy(d_work,yy,k*sizeof(PetscScalar),cudaMemcpyDeviceToDevice)); |
234 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
512 | PetscCallCUDA(cudaFree(yy)); |
235 | } else { /* case 2: reduce on CPU, copy result back to GPU */ | ||
236 | ✗ | PetscCall(BVAllocateWork_Private(bv,2*k)); | |
237 | ✗ | yy = bv->work+k; | |
238 | ✗ | PetscCallCUDA(cudaMemcpy(bv->work,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost)); | |
239 | ✗ | PetscCall(PetscLogGpuToCpu(k*sizeof(PetscScalar))); | |
240 | ✗ | PetscCallMPI(MPIU_Allreduce(bv->work,yy,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv))); | |
241 | ✗ | PetscCallCUDA(cudaMemcpy(d_work,yy,k*sizeof(PetscScalar),cudaMemcpyHostToDevice)); | |
242 | ✗ | PetscCall(PetscLogCpuToGpu(k*sizeof(PetscScalar))); | |
243 | } | ||
244 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
512 | PetscCall(VecCUDARestoreArrayWrite(bv->buffer,&d_work)); |
245 | } else { /* case 3: user-provided array y, reduce on CPU */ | ||
246 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
464 | PetscCallCUDA(cudaFree(d_work)); |
247 |
2/6✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
|
464 | PetscCallMPI(MPIU_Allreduce(bv->work,y,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv))); |
248 | } | ||
249 | } else { | ||
250 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
38874 | if (n) { |
251 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
38874 | PetscCall(PetscLogGpuTimeBegin()); |
252 |
1/10✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
|
38874 | PetscCallCUBLAS(cublasXgemv(cublasv2handle,CUBLAS_OP_C,n,k,&sone,d_A,lda,d_x,one,&szero,d_work,one)); |
253 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
38874 | PetscCall(PetscLogGpuTimeEnd()); |
254 | } | ||
255 |
3/4✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
38874 | if (!y) PetscCall(VecCUDARestoreArrayWrite(bv->buffer,&d_work)); |
256 | else { | ||
257 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
7324 | PetscCallCUDA(cudaMemcpy(y,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost)); |
258 | 7324 | PetscCall(PetscLogGpuToCpu(k*sizeof(PetscScalar))); | |
259 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
7324 | PetscCallCUDA(cudaFree(d_work)); |
260 | } | ||
261 | } | ||
262 | 39850 | PetscCall(PetscLogGpuFlops(2.0*n*k)); | |
263 | 39850 | PetscFunctionReturn(PETSC_SUCCESS); | |
264 | } | ||
265 | |||
266 | /* | ||
267 | Scale n scalars | ||
268 | */ | ||
269 | 25132 | PetscErrorCode BVScale_BLAS_CUDA(BV,PetscInt n_,PetscScalar *d_A,PetscScalar alpha) | |
270 | { | ||
271 | 25132 | PetscCuBLASInt n=0,one=1; | |
272 | 25132 | cublasHandle_t cublasv2handle; | |
273 | |||
274 | 25132 | PetscFunctionBegin; | |
275 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
25132 | PetscCall(PetscCuBLASIntCast(n_,&n)); |
276 |
5/8✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 2 times.
✗ Branch 4 not taken.
✓ Branch 5 taken 1 times.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
|
37496 | if (PetscUnlikely(alpha == (PetscScalar)0.0)) PetscCallCUDA(cudaMemset(d_A,0,n*sizeof(PetscScalar))); |
277 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
37286 | else if (alpha != (PetscScalar)1.0) { |
278 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
24922 | PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); |
279 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
24922 | PetscCall(PetscLogGpuTimeBegin()); |
280 |
1/10✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
|
24922 | PetscCallCUBLAS(cublasXscal(cublasv2handle,n,&alpha,d_A,one)); |
281 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
24922 | PetscCall(PetscLogGpuTimeEnd()); |
282 | 24922 | PetscCall(PetscLogGpuFlops(1.0*n)); | |
283 | } | ||
284 | PetscFunctionReturn(PETSC_SUCCESS); | ||
285 | } | ||
286 | |||
287 | /* | ||
288 | Compute 2-norm of vector consisting of n scalars | ||
289 | */ | ||
290 | 2526 | PetscErrorCode BVNorm_BLAS_CUDA(BV,PetscInt n_,const PetscScalar *d_A,PetscReal *nrm) | |
291 | { | ||
292 | 2526 | PetscCuBLASInt n=0,one=1; | |
293 | 2526 | cublasHandle_t cublasv2handle; | |
294 | |||
295 | 2526 | PetscFunctionBegin; | |
296 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
2526 | PetscCall(PetscCuBLASIntCast(n_,&n)); |
297 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
2526 | PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); |
298 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
2526 | PetscCall(PetscLogGpuTimeBegin()); |
299 |
1/10✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
|
2526 | PetscCallCUBLAS(cublasXnrm2(cublasv2handle,n,d_A,one,nrm)); |
300 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
2526 | PetscCall(PetscLogGpuTimeEnd()); |
301 | 2526 | PetscCall(PetscLogGpuFlops(2.0*n)); | |
302 | 2526 | PetscFunctionReturn(PETSC_SUCCESS); | |
303 | } | ||
304 | |||
305 | /* | ||
306 | Normalize the columns of A | ||
307 | */ | ||
308 | 496 | PetscErrorCode BVNormalize_BLAS_CUDA(BV,PetscInt m_,PetscInt n_,PetscScalar *d_A,PetscInt lda_,PetscScalar *eigi) | |
309 | { | ||
310 | 496 | PetscInt j,k; | |
311 | 496 | PetscReal nrm,nrm1; | |
312 | 496 | PetscScalar alpha; | |
313 | 496 | PetscCuBLASInt m=0,one=1; | |
314 | 496 | cublasHandle_t cublasv2handle; | |
315 | |||
316 | 496 | PetscFunctionBegin; | |
317 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
496 | PetscCall(PetscCuBLASIntCast(m_,&m)); |
318 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
496 | PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); |
319 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
496 | PetscCall(PetscLogGpuTimeBegin()); |
320 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
9480 | for (j=0;j<n_;j++) { |
321 | 8984 | k = 1; | |
322 | #if !defined(PETSC_USE_COMPLEX) | ||
323 |
4/4✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 1 times.
|
4598 | if (eigi && eigi[j] != 0.0) k = 2; |
324 | #endif | ||
325 |
1/10✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
|
8984 | PetscCallCUBLAS(cublasXnrm2(cublasv2handle,m,d_A+j*lda_,one,&nrm)); |
326 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
|
8984 | if (k==2) { |
327 |
1/10✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
|
4 | PetscCallCUBLAS(cublasXnrm2(cublasv2handle,m,d_A+(j+1)*lda_,one,&nrm1)); |
328 | 4 | nrm = SlepcAbs(nrm,nrm1); | |
329 | } | ||
330 | 8984 | alpha = 1.0/nrm; | |
331 |
1/10✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
|
8984 | PetscCallCUBLAS(cublasXscal(cublasv2handle,m,&alpha,d_A+j*lda_,one)); |
332 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
|
8984 | if (k==2) { |
333 |
1/10✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
|
4 | PetscCallCUBLAS(cublasXscal(cublasv2handle,m,&alpha,d_A+(j+1)*lda_,one)); |
334 | j++; | ||
335 | } | ||
336 | } | ||
337 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
496 | PetscCall(PetscLogGpuTimeEnd()); |
338 | 496 | PetscCall(PetscLogGpuFlops(3.0*m*n_)); | |
339 | 496 | PetscFunctionReturn(PETSC_SUCCESS); | |
340 | } | ||
341 | |||
342 | /* | ||
343 | BV_CleanCoefficients_CUDA - Sets to zero all entries of column j of the bv buffer | ||
344 | */ | ||
345 | 23914 | PetscErrorCode BV_CleanCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h) | |
346 | { | ||
347 | 23914 | PetscScalar *d_hh,*d_a; | |
348 | 23914 | PetscInt i; | |
349 | |||
350 | 23914 | PetscFunctionBegin; | |
351 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
23914 | if (!h) { |
352 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
22259 | PetscCall(VecCUDAGetArray(bv->buffer,&d_a)); |
353 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
22259 | PetscCall(PetscLogGpuTimeBegin()); |
354 | 22259 | d_hh = d_a + j*(bv->nc+bv->m); | |
355 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
22259 | PetscCallCUDA(cudaMemset(d_hh,0,(bv->nc+j)*sizeof(PetscScalar))); |
356 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
22259 | PetscCall(PetscLogGpuTimeEnd()); |
357 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
22259 | PetscCall(VecCUDARestoreArray(bv->buffer,&d_a)); |
358 | } else { /* cpu memory */ | ||
359 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
13553 | for (i=0;i<bv->nc+j;i++) h[i] = 0.0; |
360 | } | ||
361 | PetscFunctionReturn(PETSC_SUCCESS); | ||
362 | } | ||
363 | |||
364 | /* | ||
365 | BV_AddCoefficients_CUDA - Add the contents of the scratch (0-th column) of the bv buffer | ||
366 | into column j of the bv buffer | ||
367 | */ | ||
368 | 34464 | PetscErrorCode BV_AddCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscScalar *c) | |
369 | { | ||
370 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
|
34464 | PetscScalar *d_h,*d_c,sone=1.0; |
371 | 34464 | PetscInt i; | |
372 | 34464 | PetscCuBLASInt idx=0,one=1; | |
373 | 34464 | cublasHandle_t cublasv2handle; | |
374 | |||
375 | 34464 | PetscFunctionBegin; | |
376 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
34464 | if (!h) { |
377 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
32514 | PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); |
378 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
32514 | PetscCall(VecCUDAGetArray(bv->buffer,&d_c)); |
379 | 32514 | d_h = d_c + j*(bv->nc+bv->m); | |
380 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
32514 | PetscCall(PetscCuBLASIntCast(bv->nc+j,&idx)); |
381 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
32514 | PetscCall(PetscLogGpuTimeBegin()); |
382 |
1/10✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
|
32514 | PetscCallCUBLAS(cublasXaxpy(cublasv2handle,idx,&sone,d_c,one,d_h,one)); |
383 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
32514 | PetscCall(PetscLogGpuTimeEnd()); |
384 | 32514 | PetscCall(PetscLogGpuFlops(1.0*(bv->nc+j))); | |
385 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
32514 | PetscCall(VecCUDARestoreArray(bv->buffer,&d_c)); |
386 | } else { /* cpu memory */ | ||
387 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
15766 | for (i=0;i<bv->nc+j;i++) h[i] += c[i]; |
388 | 1950 | PetscCall(PetscLogFlops(1.0*(bv->nc+j))); | |
389 | } | ||
390 | PetscFunctionReturn(PETSC_SUCCESS); | ||
391 | } | ||
392 | |||
393 | /* | ||
394 | BV_SetValue_CUDA - Sets value in row j (counted after the constraints) of column k | ||
395 | of the coefficients array | ||
396 | */ | ||
397 | 25123 | PetscErrorCode BV_SetValue_CUDA(BV bv,PetscInt j,PetscInt k,PetscScalar *h,PetscScalar value) | |
398 | { | ||
399 | 25123 | PetscScalar *d_h,*a; | |
400 | |||
401 | 25123 | PetscFunctionBegin; | |
402 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
25123 | if (!h) { |
403 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
24723 | PetscCall(VecCUDAGetArray(bv->buffer,&a)); |
404 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
24723 | PetscCall(PetscLogGpuTimeBegin()); |
405 | 24723 | d_h = a + k*(bv->nc+bv->m) + bv->nc+j; | |
406 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
24723 | PetscCallCUDA(cudaMemcpy(d_h,&value,sizeof(PetscScalar),cudaMemcpyHostToDevice)); |
407 | 24723 | PetscCall(PetscLogCpuToGpu(sizeof(PetscScalar))); | |
408 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
24723 | PetscCall(PetscLogGpuTimeEnd()); |
409 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
24723 | PetscCall(VecCUDARestoreArray(bv->buffer,&a)); |
410 | } else { /* cpu memory */ | ||
411 | 400 | h[bv->nc+j] = value; | |
412 | } | ||
413 | PetscFunctionReturn(PETSC_SUCCESS); | ||
414 | } | ||
415 | |||
416 | /* | ||
417 | BV_SquareSum_CUDA - Returns the value h'*h, where h represents the contents of the | ||
418 | coefficients array (up to position j) | ||
419 | */ | ||
420 | 33564 | PetscErrorCode BV_SquareSum_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscReal *sum) | |
421 | { | ||
422 | 33564 | const PetscScalar *d_h; | |
423 | 33564 | PetscScalar dot; | |
424 | 33564 | PetscInt i; | |
425 | 33564 | PetscCuBLASInt idx=0,one=1; | |
426 | 33564 | cublasHandle_t cublasv2handle; | |
427 | |||
428 | 33564 | PetscFunctionBegin; | |
429 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
33564 | if (!h) { |
430 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
31846 | PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); |
431 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
31846 | PetscCall(VecCUDAGetArrayRead(bv->buffer,&d_h)); |
432 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
31846 | PetscCall(PetscCuBLASIntCast(bv->nc+j,&idx)); |
433 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
31846 | PetscCall(PetscLogGpuTimeBegin()); |
434 |
1/10✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
|
31846 | PetscCallCUBLAS(cublasXdot(cublasv2handle,idx,d_h,one,d_h,one,&dot)); |
435 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
31846 | PetscCall(PetscLogGpuTimeEnd()); |
436 | 31846 | PetscCall(PetscLogGpuFlops(2.0*(bv->nc+j))); | |
437 | 31846 | *sum = PetscRealPart(dot); | |
438 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
31846 | PetscCall(VecCUDARestoreArrayRead(bv->buffer,&d_h)); |
439 | } else { /* cpu memory */ | ||
440 | 1718 | *sum = 0.0; | |
441 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
14798 | for (i=0;i<bv->nc+j;i++) *sum += PetscRealPart(h[i]*PetscConj(h[i])); |
442 | 1718 | PetscCall(PetscLogFlops(2.0*(bv->nc+j))); | |
443 | } | ||
444 | PetscFunctionReturn(PETSC_SUCCESS); | ||
445 | } | ||
446 | |||
447 | /* pointwise multiplication */ | ||
448 | 184 | static __global__ void PointwiseMult_kernel(PetscInt xcount,PetscScalar *a,const PetscScalar *b,PetscInt n) | |
449 | ✗ | { | |
450 | PetscInt x; | ||
451 | |||
452 | x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x; | ||
453 | if (x<n) a[x] *= PetscRealPart(b[x]); | ||
454 | 184 | } | |
455 | |||
456 | /* pointwise division */ | ||
457 | 184 | static __global__ void PointwiseDiv_kernel(PetscInt xcount,PetscScalar *a,const PetscScalar *b,PetscInt n) | |
458 | ✗ | { | |
459 | PetscInt x; | ||
460 | |||
461 | x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x; | ||
462 | if (x<n) a[x] /= PetscRealPart(b[x]); | ||
463 | 184 | } | |
464 | |||
465 | /* | ||
466 | BV_ApplySignature_CUDA - Computes the pointwise product h*omega, where h represents | ||
467 | the contents of the coefficients array (up to position j) and omega is the signature; | ||
468 | if inverse=TRUE then the operation is h/omega | ||
469 | */ | ||
470 | 392 | PetscErrorCode BV_ApplySignature_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscBool inverse) | |
471 | { | ||
472 | 392 | PetscScalar *d_h; | |
473 | 392 | const PetscScalar *d_omega,*omega; | |
474 | 392 | PetscInt i,xcount; | |
475 | 392 | dim3 blocks3d, threads3d; | |
476 | |||
477 | 392 | PetscFunctionBegin; | |
478 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
392 | if (!(bv->nc+j)) PetscFunctionReturn(PETSC_SUCCESS); |
479 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
368 | if (!h) { |
480 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
368 | PetscCall(VecCUDAGetArray(bv->buffer,&d_h)); |
481 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
368 | PetscCall(VecCUDAGetArrayRead(bv->omega,&d_omega)); |
482 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
368 | PetscCall(SlepcKernelSetGrid1D(bv->nc+j,&blocks3d,&threads3d,&xcount)); |
483 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
368 | PetscCall(PetscLogGpuTimeBegin()); |
484 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
368 | if (inverse) { |
485 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 2 times.
|
368 | for (i=0;i<xcount;i++) PointwiseDiv_kernel<<<blocks3d,threads3d>>>(i,d_h,d_omega,bv->nc+j); |
486 | } else { | ||
487 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 2 times.
|
368 | for (i=0;i<xcount;i++) PointwiseMult_kernel<<<blocks3d,threads3d>>>(i,d_h,d_omega,bv->nc+j); |
488 | } | ||
489 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
368 | PetscCallCUDA(cudaGetLastError()); |
490 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
368 | PetscCall(PetscLogGpuTimeEnd()); |
491 | 368 | PetscCall(PetscLogGpuFlops(1.0*(bv->nc+j))); | |
492 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
368 | PetscCall(VecCUDARestoreArrayRead(bv->omega,&d_omega)); |
493 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
368 | PetscCall(VecCUDARestoreArray(bv->buffer,&d_h)); |
494 | } else { | ||
495 | ✗ | PetscCall(VecGetArrayRead(bv->omega,&omega)); | |
496 | ✗ | if (inverse) for (i=0;i<bv->nc+j;i++) h[i] /= PetscRealPart(omega[i]); | |
497 | ✗ | else for (i=0;i<bv->nc+j;i++) h[i] *= PetscRealPart(omega[i]); | |
498 | ✗ | PetscCall(VecRestoreArrayRead(bv->omega,&omega)); | |
499 | ✗ | PetscCall(PetscLogFlops(1.0*(bv->nc+j))); | |
500 | } | ||
501 | PetscFunctionReturn(PETSC_SUCCESS); | ||
502 | } | ||
503 | |||
504 | /* | ||
505 | BV_SquareRoot_CUDA - Returns the square root of position j (counted after the constraints) | ||
506 | of the coefficients array | ||
507 | */ | ||
508 | 31944 | PetscErrorCode BV_SquareRoot_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscReal *beta) | |
509 | { | ||
510 | 31944 | const PetscScalar *d_h; | |
511 | 31944 | PetscScalar hh; | |
512 | |||
513 | 31944 | PetscFunctionBegin; | |
514 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
31944 | if (!h) { |
515 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
31944 | PetscCall(VecCUDAGetArrayRead(bv->buffer,&d_h)); |
516 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
31944 | PetscCall(PetscLogGpuTimeBegin()); |
517 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
31944 | PetscCallCUDA(cudaMemcpy(&hh,d_h+bv->nc+j,sizeof(PetscScalar),cudaMemcpyDeviceToHost)); |
518 | 31944 | PetscCall(PetscLogGpuToCpu(sizeof(PetscScalar))); | |
519 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
31944 | PetscCall(PetscLogGpuTimeEnd()); |
520 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
31944 | PetscCall(BV_SafeSqrt(bv,hh,beta)); |
521 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
31944 | PetscCall(VecCUDARestoreArrayRead(bv->buffer,&d_h)); |
522 | ✗ | } else PetscCall(BV_SafeSqrt(bv,h[bv->nc+j],beta)); | |
523 | PetscFunctionReturn(PETSC_SUCCESS); | ||
524 | } | ||
525 | |||
526 | /* | ||
527 | BV_StoreCoefficients_CUDA - Copy the contents of the coefficients array to an array dest | ||
528 | provided by the caller (only values from l to j are copied) | ||
529 | */ | ||
530 | 1321 | PetscErrorCode BV_StoreCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscScalar *dest) | |
531 | { | ||
532 | 1321 | const PetscScalar *d_h,*d_a; | |
533 | 1321 | PetscInt i; | |
534 | |||
535 | 1321 | PetscFunctionBegin; | |
536 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
1321 | if (!h) { |
537 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
1321 | PetscCall(VecCUDAGetArrayRead(bv->buffer,&d_a)); |
538 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
1321 | PetscCall(PetscLogGpuTimeBegin()); |
539 | 1321 | d_h = d_a + j*(bv->nc+bv->m)+bv->nc; | |
540 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
1321 | PetscCallCUDA(cudaMemcpy(dest-bv->l,d_h,(j-bv->l)*sizeof(PetscScalar),cudaMemcpyDeviceToHost)); |
541 | 1321 | PetscCall(PetscLogGpuToCpu((j-bv->l)*sizeof(PetscScalar))); | |
542 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
1321 | PetscCall(PetscLogGpuTimeEnd()); |
543 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
1321 | PetscCall(VecCUDARestoreArrayRead(bv->buffer,&d_a)); |
544 | } else { | ||
545 | ✗ | for (i=bv->l;i<j;i++) dest[i-bv->l] = h[bv->nc+i]; | |
546 | } | ||
547 | PetscFunctionReturn(PETSC_SUCCESS); | ||
548 | } | ||
549 |