| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | /* | ||
| 2 | - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | ||
| 3 | SLEPc - Scalable Library for Eigenvalue Problem Computations | ||
| 4 | Copyright (c) 2002-, Universitat Politecnica de Valencia, Spain | ||
| 5 | |||
| 6 | This file is part of SLEPc. | ||
| 7 | SLEPc is distributed under a 2-clause BSD license (see LICENSE). | ||
| 8 | - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | ||
| 9 | */ | ||
| 10 | /* | ||
| 11 | CUDA-related code common to several BV impls | ||
| 12 | */ | ||
| 13 | |||
| 14 | #include <slepc/private/bvimpl.h> | ||
| 15 | #include <slepccupmblas.h> | ||
| 16 | |||
| 17 | #define BLOCKSIZE 64 | ||
| 18 | |||
| 19 | /* | ||
| 20 | C := alpha*A*B + beta*C | ||
| 21 | */ | ||
| 22 | 444 | PetscErrorCode BVMult_BLAS_CUDA(BV,PetscInt m_,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *d_A,PetscInt lda_,const PetscScalar *d_B,PetscInt ldb_,PetscScalar beta,PetscScalar *d_C,PetscInt ldc_) | |
| 23 | { | ||
| 24 | 444 | PetscCuBLASInt m=0,n=0,k=0,lda=0,ldb=0,ldc=0; | |
| 25 | 444 | cublasHandle_t cublasv2handle; | |
| 26 | |||
| 27 | 444 | PetscFunctionBegin; | |
| 28 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
444 | PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); |
| 29 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
444 | PetscCall(PetscCuBLASIntCast(m_,&m)); |
| 30 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
444 | PetscCall(PetscCuBLASIntCast(n_,&n)); |
| 31 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
444 | PetscCall(PetscCuBLASIntCast(k_,&k)); |
| 32 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
444 | PetscCall(PetscCuBLASIntCast(lda_,&lda)); |
| 33 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
444 | PetscCall(PetscCuBLASIntCast(ldb_,&ldb)); |
| 34 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
444 | PetscCall(PetscCuBLASIntCast(ldc_,&ldc)); |
| 35 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
444 | PetscCall(PetscLogGpuTimeBegin()); |
| 36 |
1/10✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
|
444 | PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,m,n,k,&alpha,d_A,lda,d_B,ldb,&beta,d_C,ldc)); |
| 37 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
444 | PetscCall(PetscLogGpuTimeEnd()); |
| 38 | 444 | PetscCall(PetscLogGpuFlops(2.0*m*n*k)); | |
| 39 | 444 | PetscFunctionReturn(PETSC_SUCCESS); | |
| 40 | } | ||
| 41 | |||
| 42 | /* | ||
| 43 | y := alpha*A*x + beta*y | ||
| 44 | */ | ||
| 45 | 66638 | PetscErrorCode BVMultVec_BLAS_CUDA(BV,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *d_A,PetscInt lda_,const PetscScalar *d_x,PetscScalar beta,PetscScalar *d_y) | |
| 46 | { | ||
| 47 | 66638 | PetscCuBLASInt n=0,k=0,lda=0,one=1; | |
| 48 | 66638 | cublasHandle_t cublasv2handle; | |
| 49 | |||
| 50 | 66638 | PetscFunctionBegin; | |
| 51 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
66638 | PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); |
| 52 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
66638 | PetscCall(PetscCuBLASIntCast(n_,&n)); |
| 53 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
66638 | PetscCall(PetscCuBLASIntCast(k_,&k)); |
| 54 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
66638 | PetscCall(PetscCuBLASIntCast(lda_,&lda)); |
| 55 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
66638 | PetscCall(PetscLogGpuTimeBegin()); |
| 56 |
1/10✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
|
66638 | PetscCallCUBLAS(cublasXgemv(cublasv2handle,CUBLAS_OP_N,n,k,&alpha,d_A,lda,d_x,one,&beta,d_y,one)); |
| 57 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
66638 | PetscCall(PetscLogGpuTimeEnd()); |
| 58 | 66638 | PetscCall(PetscLogGpuFlops(2.0*n*k)); | |
| 59 | 66638 | PetscFunctionReturn(PETSC_SUCCESS); | |
| 60 | } | ||
| 61 | |||
| 62 | /* | ||
| 63 | A(:,s:e-1) := A*B(:,s:e-1) | ||
| 64 | */ | ||
| 65 | 3357 | PetscErrorCode BVMultInPlace_BLAS_CUDA(BV,PetscInt m_,PetscInt k_,PetscInt s,PetscInt e,PetscScalar *d_A,PetscInt lda_,const PetscScalar *d_B,PetscInt ldb_,PetscBool btrans) | |
| 66 | { | ||
| 67 | 3357 | const PetscScalar *d_B1; | |
| 68 | 3357 | PetscScalar *d_work,sone=1.0,szero=0.0; | |
| 69 | 3357 | PetscCuBLASInt m=0,n=0,k=0,l=0,lda=0,ldb=0,bs=BLOCKSIZE; | |
| 70 | 3357 | size_t freemem,totmem; | |
| 71 | 3357 | cublasHandle_t cublasv2handle; | |
| 72 | 3357 | cublasOperation_t bt; | |
| 73 | |||
| 74 | 3357 | PetscFunctionBegin; | |
| 75 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
3357 | PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); |
| 76 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
3357 | PetscCall(PetscCuBLASIntCast(m_,&m)); |
| 77 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
3357 | PetscCall(PetscCuBLASIntCast(e-s,&n)); |
| 78 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
3357 | PetscCall(PetscCuBLASIntCast(k_,&k)); |
| 79 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
3357 | PetscCall(PetscCuBLASIntCast(lda_,&lda)); |
| 80 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
3357 | PetscCall(PetscCuBLASIntCast(ldb_,&ldb)); |
| 81 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
3357 | PetscCall(PetscLogGpuTimeBegin()); |
| 82 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
3357 | if (PetscUnlikely(btrans)) { |
| 83 | ✗ | d_B1 = d_B+s; | |
| 84 | ✗ | bt = CUBLAS_OP_C; | |
| 85 | } else { | ||
| 86 | 3357 | d_B1 = d_B+s*ldb; | |
| 87 | 3357 | bt = CUBLAS_OP_N; | |
| 88 | } | ||
| 89 | /* try to allocate the whole matrix */ | ||
| 90 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
3357 | PetscCallCUDA(cudaMemGetInfo(&freemem,&totmem)); |
| 91 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
3357 | if (freemem>=lda*n*sizeof(PetscScalar)) { |
| 92 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
3357 | PetscCallCUDA(cudaMalloc((void**)&d_work,lda*n*sizeof(PetscScalar))); |
| 93 |
1/10✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
|
3357 | PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,bt,m,n,k,&sone,d_A,lda,d_B1,ldb,&szero,d_work,lda)); |
| 94 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
3357 | PetscCallCUDA(cudaMemcpy2D(d_A+s*lda,lda*sizeof(PetscScalar),d_work,lda*sizeof(PetscScalar),m*sizeof(PetscScalar),n,cudaMemcpyDeviceToDevice)); |
| 95 | } else { | ||
| 96 | ✗ | PetscCall(PetscCuBLASIntCast(freemem/(m*sizeof(PetscScalar)),&bs)); | |
| 97 | ✗ | PetscCallCUDA(cudaMalloc((void**)&d_work,bs*n*sizeof(PetscScalar))); | |
| 98 | ✗ | PetscCall(PetscCuBLASIntCast(m % bs,&l)); | |
| 99 | ✗ | if (l) { | |
| 100 | ✗ | PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,bt,l,n,k,&sone,d_A,lda,d_B1,ldb,&szero,d_work,l)); | |
| 101 | ✗ | PetscCallCUDA(cudaMemcpy2D(d_A+s*lda,lda*sizeof(PetscScalar),d_work,l*sizeof(PetscScalar),l*sizeof(PetscScalar),n,cudaMemcpyDeviceToDevice)); | |
| 102 | } | ||
| 103 | ✗ | for (;l<m;l+=bs) { | |
| 104 | ✗ | PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,bt,bs,n,k,&sone,d_A+l,lda,d_B1,ldb,&szero,d_work,bs)); | |
| 105 | ✗ | PetscCallCUDA(cudaMemcpy2D(d_A+l+s*lda,lda*sizeof(PetscScalar),d_work,bs*sizeof(PetscScalar),bs*sizeof(PetscScalar),n,cudaMemcpyDeviceToDevice)); | |
| 106 | } | ||
| 107 | } | ||
| 108 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
3357 | PetscCall(PetscLogGpuTimeEnd()); |
| 109 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
3357 | PetscCallCUDA(cudaFree(d_work)); |
| 110 | 3357 | PetscCall(PetscLogGpuFlops(2.0*m*n*k)); | |
| 111 | 3357 | PetscFunctionReturn(PETSC_SUCCESS); | |
| 112 | } | ||
| 113 | |||
| 114 | /* | ||
| 115 | B := alpha*A + beta*B | ||
| 116 | */ | ||
| 117 | 274 | PetscErrorCode BVAXPY_BLAS_CUDA(BV,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *d_A,PetscInt lda_,PetscScalar beta,PetscScalar *d_B,PetscInt ldb_) | |
| 118 | { | ||
| 119 | 274 | PetscCuBLASInt n=0,k=0,lda=0,ldb=0; | |
| 120 | 274 | cublasHandle_t cublasv2handle; | |
| 121 | |||
| 122 | 274 | PetscFunctionBegin; | |
| 123 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
274 | PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); |
| 124 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
274 | PetscCall(PetscCuBLASIntCast(n_,&n)); |
| 125 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
274 | PetscCall(PetscCuBLASIntCast(k_,&k)); |
| 126 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
274 | PetscCall(PetscCuBLASIntCast(lda_,&lda)); |
| 127 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
274 | PetscCall(PetscCuBLASIntCast(ldb_,&ldb)); |
| 128 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
274 | PetscCall(PetscLogGpuTimeBegin()); |
| 129 |
1/10✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
|
274 | PetscCallCUBLAS(cublasXgeam(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,k,&alpha,d_A,lda,&beta,d_B,ldb,d_B,ldb)); |
| 130 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
274 | PetscCall(PetscLogGpuTimeEnd()); |
| 131 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
415 | PetscCall(PetscLogGpuFlops((beta==(PetscScalar)1.0)?2.0*n*k:3.0*n*k)); |
| 132 | 274 | PetscFunctionReturn(PETSC_SUCCESS); | |
| 133 | } | ||
| 134 | |||
| 135 | /* | ||
| 136 | C := A'*B | ||
| 137 | |||
| 138 | C is a CPU array | ||
| 139 | */ | ||
| 140 | 5285 | PetscErrorCode BVDot_BLAS_CUDA(BV bv,PetscInt m_,PetscInt n_,PetscInt k_,const PetscScalar *d_A,PetscInt lda_,const PetscScalar *d_B,PetscInt ldb_,PetscScalar *C,PetscInt ldc_,PetscBool mpi) | |
| 141 | { | ||
| 142 | 5285 | PetscScalar *d_work,sone=1.0,szero=0.0,*CC; | |
| 143 | 5285 | PetscInt j; | |
| 144 | 5285 | PetscCuBLASInt m=0,n=0,k=0,lda=0,ldb=0,ldc=0; | |
| 145 | 5285 | PetscMPIInt len; | |
| 146 | 5285 | cublasHandle_t cublasv2handle; | |
| 147 | |||
| 148 | 5285 | PetscFunctionBegin; | |
| 149 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
5285 | PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); |
| 150 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
5285 | PetscCall(PetscCuBLASIntCast(m_,&m)); |
| 151 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
5285 | PetscCall(PetscCuBLASIntCast(n_,&n)); |
| 152 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
5285 | PetscCall(PetscCuBLASIntCast(k_,&k)); |
| 153 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
5285 | PetscCall(PetscCuBLASIntCast(lda_,&lda)); |
| 154 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
5285 | PetscCall(PetscCuBLASIntCast(ldb_,&ldb)); |
| 155 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
5285 | PetscCall(PetscCuBLASIntCast(ldc_,&ldc)); |
| 156 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
5285 | PetscCallCUDA(cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar))); |
| 157 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
5285 | if (mpi) { |
| 158 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
608 | if (ldc==m) { |
| 159 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
128 | PetscCall(BVAllocateWork_Private(bv,m*n)); |
| 160 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
128 | if (k) { |
| 161 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
128 | PetscCall(PetscLogGpuTimeBegin()); |
| 162 |
1/10✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
|
128 | PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,lda,d_B,ldb,&szero,d_work,ldc)); |
| 163 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
128 | PetscCall(PetscLogGpuTimeEnd()); |
| 164 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
128 | PetscCallCUDA(cudaMemcpy(bv->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost)); |
| 165 | 128 | PetscCall(PetscLogGpuToCpu(m*n*sizeof(PetscScalar))); | |
| 166 | ✗ | } else PetscCall(PetscArrayzero(bv->work,m*n)); | |
| 167 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
128 | PetscCall(PetscMPIIntCast(m*n,&len)); |
| 168 |
2/6✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
|
128 | PetscCallMPI(MPIU_Allreduce(bv->work,C,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv))); |
| 169 | } else { | ||
| 170 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
480 | PetscCall(BVAllocateWork_Private(bv,2*m*n)); |
| 171 | 480 | CC = bv->work+m*n; | |
| 172 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
480 | if (k) { |
| 173 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
480 | PetscCall(PetscLogGpuTimeBegin()); |
| 174 |
1/10✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
|
480 | PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,lda,d_B,ldb,&szero,d_work,m)); |
| 175 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
480 | PetscCall(PetscLogGpuTimeEnd()); |
| 176 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
480 | PetscCallCUDA(cudaMemcpy(bv->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost)); |
| 177 | 480 | PetscCall(PetscLogGpuToCpu(m*n*sizeof(PetscScalar))); | |
| 178 | ✗ | } else PetscCall(PetscArrayzero(bv->work,m*n)); | |
| 179 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
480 | PetscCall(PetscMPIIntCast(m*n,&len)); |
| 180 |
2/6✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
|
480 | PetscCallMPI(MPIU_Allreduce(bv->work,CC,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv))); |
| 181 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 2 times.
|
2592 | for (j=0;j<n;j++) PetscCall(PetscArraycpy(C+j*ldc,CC+j*m,m)); |
| 182 | } | ||
| 183 | } else { | ||
| 184 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
4677 | if (k) { |
| 185 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
4677 | PetscCall(BVAllocateWork_Private(bv,m*n)); |
| 186 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
4677 | PetscCall(PetscLogGpuTimeBegin()); |
| 187 |
1/10✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
|
4677 | PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,lda,d_B,ldb,&szero,d_work,m)); |
| 188 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
4677 | PetscCall(PetscLogGpuTimeEnd()); |
| 189 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
4677 | PetscCallCUDA(cudaMemcpy(bv->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost)); |
| 190 | 4677 | PetscCall(PetscLogGpuToCpu(m*n*sizeof(PetscScalar))); | |
| 191 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 2 times.
|
31585 | for (j=0;j<n;j++) PetscCall(PetscArraycpy(C+j*ldc,bv->work+j*m,m)); |
| 192 | } | ||
| 193 | } | ||
| 194 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
5285 | PetscCallCUDA(cudaFree(d_work)); |
| 195 | 5285 | PetscCall(PetscLogGpuFlops(2.0*m*n*k)); | |
| 196 | 5285 | PetscFunctionReturn(PETSC_SUCCESS); | |
| 197 | } | ||
| 198 | |||
| 199 | /* | ||
| 200 | y := A'*x | ||
| 201 | |||
| 202 | y is a CPU array, if NULL bv->buffer is used as a workspace | ||
| 203 | */ | ||
| 204 | 39850 | PetscErrorCode BVDotVec_BLAS_CUDA(BV bv,PetscInt n_,PetscInt k_,const PetscScalar *d_A,PetscInt lda_,const PetscScalar *d_x,PetscScalar *y,PetscBool mpi) | |
| 205 | { | ||
| 206 | 39850 | PetscScalar *d_work,szero=0.0,sone=1.0,*yy; | |
| 207 | 39850 | PetscCuBLASInt n=0,k=0,lda=0,one=1; | |
| 208 | 39850 | PetscMPIInt len; | |
| 209 | 39850 | cublasHandle_t cublasv2handle; | |
| 210 | |||
| 211 | 39850 | PetscFunctionBegin; | |
| 212 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
39850 | PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); |
| 213 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
39850 | PetscCall(PetscCuBLASIntCast(n_,&n)); |
| 214 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
39850 | PetscCall(PetscCuBLASIntCast(k_,&k)); |
| 215 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
39850 | PetscCall(PetscCuBLASIntCast(lda_,&lda)); |
| 216 |
3/4✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
39850 | if (!y) PetscCall(VecCUDAGetArrayWrite(bv->buffer,&d_work)); |
| 217 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
7788 | else PetscCallCUDA(cudaMalloc((void**)&d_work,k*sizeof(PetscScalar))); |
| 218 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
39850 | if (mpi) { |
| 219 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
976 | PetscCall(BVAllocateWork_Private(bv,k)); |
| 220 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
976 | if (n) { |
| 221 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
976 | PetscCall(PetscLogGpuTimeBegin()); |
| 222 |
1/10✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
|
976 | PetscCallCUBLAS(cublasXgemv(cublasv2handle,CUBLAS_OP_C,n,k,&sone,d_A,lda,d_x,one,&szero,d_work,one)); |
| 223 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
976 | PetscCall(PetscLogGpuTimeEnd()); |
| 224 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
976 | PetscCallCUDA(cudaMemcpy(bv->work,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost)); |
| 225 | 976 | PetscCall(PetscLogGpuToCpu(k*sizeof(PetscScalar))); | |
| 226 | ✗ | } else PetscCall(PetscArrayzero(bv->work,k)); | |
| 227 | /* reduction */ | ||
| 228 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
976 | PetscCall(PetscMPIIntCast(k,&len)); |
| 229 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
976 | if (!y) { |
| 230 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
512 | if (use_gpu_aware_mpi) { /* case 1: reduce on GPU using a temporary buffer */ |
| 231 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
512 | PetscCallCUDA(cudaMalloc((void**)&yy,k*sizeof(PetscScalar))); |
| 232 |
2/6✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
|
512 | PetscCallMPI(MPIU_Allreduce(d_work,yy,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv))); |
| 233 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
512 | PetscCallCUDA(cudaMemcpy(d_work,yy,k*sizeof(PetscScalar),cudaMemcpyDeviceToDevice)); |
| 234 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
512 | PetscCallCUDA(cudaFree(yy)); |
| 235 | } else { /* case 2: reduce on CPU, copy result back to GPU */ | ||
| 236 | ✗ | PetscCall(BVAllocateWork_Private(bv,2*k)); | |
| 237 | ✗ | yy = bv->work+k; | |
| 238 | ✗ | PetscCallCUDA(cudaMemcpy(bv->work,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost)); | |
| 239 | ✗ | PetscCall(PetscLogGpuToCpu(k*sizeof(PetscScalar))); | |
| 240 | ✗ | PetscCallMPI(MPIU_Allreduce(bv->work,yy,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv))); | |
| 241 | ✗ | PetscCallCUDA(cudaMemcpy(d_work,yy,k*sizeof(PetscScalar),cudaMemcpyHostToDevice)); | |
| 242 | ✗ | PetscCall(PetscLogCpuToGpu(k*sizeof(PetscScalar))); | |
| 243 | } | ||
| 244 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
512 | PetscCall(VecCUDARestoreArrayWrite(bv->buffer,&d_work)); |
| 245 | } else { /* case 3: user-provided array y, reduce on CPU */ | ||
| 246 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
464 | PetscCallCUDA(cudaFree(d_work)); |
| 247 |
2/6✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
|
464 | PetscCallMPI(MPIU_Allreduce(bv->work,y,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv))); |
| 248 | } | ||
| 249 | } else { | ||
| 250 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
38874 | if (n) { |
| 251 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
38874 | PetscCall(PetscLogGpuTimeBegin()); |
| 252 |
1/10✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
|
38874 | PetscCallCUBLAS(cublasXgemv(cublasv2handle,CUBLAS_OP_C,n,k,&sone,d_A,lda,d_x,one,&szero,d_work,one)); |
| 253 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
38874 | PetscCall(PetscLogGpuTimeEnd()); |
| 254 | } | ||
| 255 |
3/4✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
38874 | if (!y) PetscCall(VecCUDARestoreArrayWrite(bv->buffer,&d_work)); |
| 256 | else { | ||
| 257 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
7324 | PetscCallCUDA(cudaMemcpy(y,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost)); |
| 258 | 7324 | PetscCall(PetscLogGpuToCpu(k*sizeof(PetscScalar))); | |
| 259 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
7324 | PetscCallCUDA(cudaFree(d_work)); |
| 260 | } | ||
| 261 | } | ||
| 262 | 39850 | PetscCall(PetscLogGpuFlops(2.0*n*k)); | |
| 263 | 39850 | PetscFunctionReturn(PETSC_SUCCESS); | |
| 264 | } | ||
| 265 | |||
| 266 | /* | ||
| 267 | Scale n scalars | ||
| 268 | */ | ||
| 269 | 25132 | PetscErrorCode BVScale_BLAS_CUDA(BV,PetscInt n_,PetscScalar *d_A,PetscScalar alpha) | |
| 270 | { | ||
| 271 | 25132 | PetscCuBLASInt n=0,one=1; | |
| 272 | 25132 | cublasHandle_t cublasv2handle; | |
| 273 | |||
| 274 | 25132 | PetscFunctionBegin; | |
| 275 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
25132 | PetscCall(PetscCuBLASIntCast(n_,&n)); |
| 276 |
5/8✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 2 times.
✗ Branch 4 not taken.
✓ Branch 5 taken 1 times.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
|
37496 | if (PetscUnlikely(alpha == (PetscScalar)0.0)) PetscCallCUDA(cudaMemset(d_A,0,n*sizeof(PetscScalar))); |
| 277 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
37286 | else if (alpha != (PetscScalar)1.0) { |
| 278 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
24922 | PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); |
| 279 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
24922 | PetscCall(PetscLogGpuTimeBegin()); |
| 280 |
1/10✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
|
24922 | PetscCallCUBLAS(cublasXscal(cublasv2handle,n,&alpha,d_A,one)); |
| 281 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
24922 | PetscCall(PetscLogGpuTimeEnd()); |
| 282 | 24922 | PetscCall(PetscLogGpuFlops(1.0*n)); | |
| 283 | } | ||
| 284 | PetscFunctionReturn(PETSC_SUCCESS); | ||
| 285 | } | ||
| 286 | |||
| 287 | /* | ||
| 288 | Compute 2-norm of vector consisting of n scalars | ||
| 289 | */ | ||
| 290 | 2526 | PetscErrorCode BVNorm_BLAS_CUDA(BV,PetscInt n_,const PetscScalar *d_A,PetscReal *nrm) | |
| 291 | { | ||
| 292 | 2526 | PetscCuBLASInt n=0,one=1; | |
| 293 | 2526 | cublasHandle_t cublasv2handle; | |
| 294 | |||
| 295 | 2526 | PetscFunctionBegin; | |
| 296 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
2526 | PetscCall(PetscCuBLASIntCast(n_,&n)); |
| 297 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
2526 | PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); |
| 298 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
2526 | PetscCall(PetscLogGpuTimeBegin()); |
| 299 |
1/10✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
|
2526 | PetscCallCUBLAS(cublasXnrm2(cublasv2handle,n,d_A,one,nrm)); |
| 300 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
2526 | PetscCall(PetscLogGpuTimeEnd()); |
| 301 | 2526 | PetscCall(PetscLogGpuFlops(2.0*n)); | |
| 302 | 2526 | PetscFunctionReturn(PETSC_SUCCESS); | |
| 303 | } | ||
| 304 | |||
| 305 | /* | ||
| 306 | Normalize the columns of A | ||
| 307 | */ | ||
| 308 | 496 | PetscErrorCode BVNormalize_BLAS_CUDA(BV,PetscInt m_,PetscInt n_,PetscScalar *d_A,PetscInt lda_,PetscScalar *eigi) | |
| 309 | { | ||
| 310 | 496 | PetscInt j,k; | |
| 311 | 496 | PetscReal nrm,nrm1; | |
| 312 | 496 | PetscScalar alpha; | |
| 313 | 496 | PetscCuBLASInt m=0,one=1; | |
| 314 | 496 | cublasHandle_t cublasv2handle; | |
| 315 | |||
| 316 | 496 | PetscFunctionBegin; | |
| 317 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
496 | PetscCall(PetscCuBLASIntCast(m_,&m)); |
| 318 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
496 | PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); |
| 319 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
496 | PetscCall(PetscLogGpuTimeBegin()); |
| 320 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
9480 | for (j=0;j<n_;j++) { |
| 321 | 8984 | k = 1; | |
| 322 | #if !defined(PETSC_USE_COMPLEX) | ||
| 323 |
4/4✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 1 times.
|
4598 | if (eigi && eigi[j] != 0.0) k = 2; |
| 324 | #endif | ||
| 325 |
1/10✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
|
8984 | PetscCallCUBLAS(cublasXnrm2(cublasv2handle,m,d_A+j*lda_,one,&nrm)); |
| 326 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
|
8984 | if (k==2) { |
| 327 |
1/10✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
|
4 | PetscCallCUBLAS(cublasXnrm2(cublasv2handle,m,d_A+(j+1)*lda_,one,&nrm1)); |
| 328 | 4 | nrm = SlepcAbs(nrm,nrm1); | |
| 329 | } | ||
| 330 | 8984 | alpha = 1.0/nrm; | |
| 331 |
1/10✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
|
8984 | PetscCallCUBLAS(cublasXscal(cublasv2handle,m,&alpha,d_A+j*lda_,one)); |
| 332 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
|
8984 | if (k==2) { |
| 333 |
1/10✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
|
4 | PetscCallCUBLAS(cublasXscal(cublasv2handle,m,&alpha,d_A+(j+1)*lda_,one)); |
| 334 | j++; | ||
| 335 | } | ||
| 336 | } | ||
| 337 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
496 | PetscCall(PetscLogGpuTimeEnd()); |
| 338 | 496 | PetscCall(PetscLogGpuFlops(3.0*m*n_)); | |
| 339 | 496 | PetscFunctionReturn(PETSC_SUCCESS); | |
| 340 | } | ||
| 341 | |||
| 342 | /* | ||
| 343 | BV_CleanCoefficients_CUDA - Sets to zero all entries of column j of the bv buffer | ||
| 344 | */ | ||
| 345 | 23914 | PetscErrorCode BV_CleanCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h) | |
| 346 | { | ||
| 347 | 23914 | PetscScalar *d_hh,*d_a; | |
| 348 | 23914 | PetscInt i; | |
| 349 | |||
| 350 | 23914 | PetscFunctionBegin; | |
| 351 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
23914 | if (!h) { |
| 352 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
22259 | PetscCall(VecCUDAGetArray(bv->buffer,&d_a)); |
| 353 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
22259 | PetscCall(PetscLogGpuTimeBegin()); |
| 354 | 22259 | d_hh = d_a + j*(bv->nc+bv->m); | |
| 355 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
22259 | PetscCallCUDA(cudaMemset(d_hh,0,(bv->nc+j)*sizeof(PetscScalar))); |
| 356 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
22259 | PetscCall(PetscLogGpuTimeEnd()); |
| 357 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
22259 | PetscCall(VecCUDARestoreArray(bv->buffer,&d_a)); |
| 358 | } else { /* cpu memory */ | ||
| 359 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
13553 | for (i=0;i<bv->nc+j;i++) h[i] = 0.0; |
| 360 | } | ||
| 361 | PetscFunctionReturn(PETSC_SUCCESS); | ||
| 362 | } | ||
| 363 | |||
| 364 | /* | ||
| 365 | BV_AddCoefficients_CUDA - Add the contents of the scratch (0-th column) of the bv buffer | ||
| 366 | into column j of the bv buffer | ||
| 367 | */ | ||
| 368 | 34464 | PetscErrorCode BV_AddCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscScalar *c) | |
| 369 | { | ||
| 370 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
|
34464 | PetscScalar *d_h,*d_c,sone=1.0; |
| 371 | 34464 | PetscInt i; | |
| 372 | 34464 | PetscCuBLASInt idx=0,one=1; | |
| 373 | 34464 | cublasHandle_t cublasv2handle; | |
| 374 | |||
| 375 | 34464 | PetscFunctionBegin; | |
| 376 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
34464 | if (!h) { |
| 377 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
32514 | PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); |
| 378 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
32514 | PetscCall(VecCUDAGetArray(bv->buffer,&d_c)); |
| 379 | 32514 | d_h = d_c + j*(bv->nc+bv->m); | |
| 380 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
32514 | PetscCall(PetscCuBLASIntCast(bv->nc+j,&idx)); |
| 381 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
32514 | PetscCall(PetscLogGpuTimeBegin()); |
| 382 |
1/10✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
|
32514 | PetscCallCUBLAS(cublasXaxpy(cublasv2handle,idx,&sone,d_c,one,d_h,one)); |
| 383 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
32514 | PetscCall(PetscLogGpuTimeEnd()); |
| 384 | 32514 | PetscCall(PetscLogGpuFlops(1.0*(bv->nc+j))); | |
| 385 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
32514 | PetscCall(VecCUDARestoreArray(bv->buffer,&d_c)); |
| 386 | } else { /* cpu memory */ | ||
| 387 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
15766 | for (i=0;i<bv->nc+j;i++) h[i] += c[i]; |
| 388 | 1950 | PetscCall(PetscLogFlops(1.0*(bv->nc+j))); | |
| 389 | } | ||
| 390 | PetscFunctionReturn(PETSC_SUCCESS); | ||
| 391 | } | ||
| 392 | |||
| 393 | /* | ||
| 394 | BV_SetValue_CUDA - Sets value in row j (counted after the constraints) of column k | ||
| 395 | of the coefficients array | ||
| 396 | */ | ||
| 397 | 25123 | PetscErrorCode BV_SetValue_CUDA(BV bv,PetscInt j,PetscInt k,PetscScalar *h,PetscScalar value) | |
| 398 | { | ||
| 399 | 25123 | PetscScalar *d_h,*a; | |
| 400 | |||
| 401 | 25123 | PetscFunctionBegin; | |
| 402 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
25123 | if (!h) { |
| 403 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
24723 | PetscCall(VecCUDAGetArray(bv->buffer,&a)); |
| 404 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
24723 | PetscCall(PetscLogGpuTimeBegin()); |
| 405 | 24723 | d_h = a + k*(bv->nc+bv->m) + bv->nc+j; | |
| 406 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
24723 | PetscCallCUDA(cudaMemcpy(d_h,&value,sizeof(PetscScalar),cudaMemcpyHostToDevice)); |
| 407 | 24723 | PetscCall(PetscLogCpuToGpu(sizeof(PetscScalar))); | |
| 408 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
24723 | PetscCall(PetscLogGpuTimeEnd()); |
| 409 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
24723 | PetscCall(VecCUDARestoreArray(bv->buffer,&a)); |
| 410 | } else { /* cpu memory */ | ||
| 411 | 400 | h[bv->nc+j] = value; | |
| 412 | } | ||
| 413 | PetscFunctionReturn(PETSC_SUCCESS); | ||
| 414 | } | ||
| 415 | |||
| 416 | /* | ||
| 417 | BV_SquareSum_CUDA - Returns the value h'*h, where h represents the contents of the | ||
| 418 | coefficients array (up to position j) | ||
| 419 | */ | ||
| 420 | 33564 | PetscErrorCode BV_SquareSum_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscReal *sum) | |
| 421 | { | ||
| 422 | 33564 | const PetscScalar *d_h; | |
| 423 | 33564 | PetscScalar dot; | |
| 424 | 33564 | PetscInt i; | |
| 425 | 33564 | PetscCuBLASInt idx=0,one=1; | |
| 426 | 33564 | cublasHandle_t cublasv2handle; | |
| 427 | |||
| 428 | 33564 | PetscFunctionBegin; | |
| 429 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
33564 | if (!h) { |
| 430 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
31846 | PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); |
| 431 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
31846 | PetscCall(VecCUDAGetArrayRead(bv->buffer,&d_h)); |
| 432 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
31846 | PetscCall(PetscCuBLASIntCast(bv->nc+j,&idx)); |
| 433 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
31846 | PetscCall(PetscLogGpuTimeBegin()); |
| 434 |
1/10✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
|
31846 | PetscCallCUBLAS(cublasXdot(cublasv2handle,idx,d_h,one,d_h,one,&dot)); |
| 435 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
31846 | PetscCall(PetscLogGpuTimeEnd()); |
| 436 | 31846 | PetscCall(PetscLogGpuFlops(2.0*(bv->nc+j))); | |
| 437 | 31846 | *sum = PetscRealPart(dot); | |
| 438 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
31846 | PetscCall(VecCUDARestoreArrayRead(bv->buffer,&d_h)); |
| 439 | } else { /* cpu memory */ | ||
| 440 | 1718 | *sum = 0.0; | |
| 441 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
14798 | for (i=0;i<bv->nc+j;i++) *sum += PetscRealPart(h[i]*PetscConj(h[i])); |
| 442 | 1718 | PetscCall(PetscLogFlops(2.0*(bv->nc+j))); | |
| 443 | } | ||
| 444 | PetscFunctionReturn(PETSC_SUCCESS); | ||
| 445 | } | ||
| 446 | |||
| 447 | /* pointwise multiplication */ | ||
| 448 | 184 | static __global__ void PointwiseMult_kernel(PetscInt xcount,PetscScalar *a,const PetscScalar *b,PetscInt n) | |
| 449 | ✗ | { | |
| 450 | PetscInt x; | ||
| 451 | |||
| 452 | x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x; | ||
| 453 | if (x<n) a[x] *= PetscRealPart(b[x]); | ||
| 454 | 184 | } | |
| 455 | |||
| 456 | /* pointwise division */ | ||
| 457 | 184 | static __global__ void PointwiseDiv_kernel(PetscInt xcount,PetscScalar *a,const PetscScalar *b,PetscInt n) | |
| 458 | ✗ | { | |
| 459 | PetscInt x; | ||
| 460 | |||
| 461 | x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x; | ||
| 462 | if (x<n) a[x] /= PetscRealPart(b[x]); | ||
| 463 | 184 | } | |
| 464 | |||
| 465 | /* | ||
| 466 | BV_ApplySignature_CUDA - Computes the pointwise product h*omega, where h represents | ||
| 467 | the contents of the coefficients array (up to position j) and omega is the signature; | ||
| 468 | if inverse=TRUE then the operation is h/omega | ||
| 469 | */ | ||
| 470 | 392 | PetscErrorCode BV_ApplySignature_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscBool inverse) | |
| 471 | { | ||
| 472 | 392 | PetscScalar *d_h; | |
| 473 | 392 | const PetscScalar *d_omega,*omega; | |
| 474 | 392 | PetscInt i,xcount; | |
| 475 | 392 | dim3 blocks3d, threads3d; | |
| 476 | |||
| 477 | 392 | PetscFunctionBegin; | |
| 478 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
392 | if (!(bv->nc+j)) PetscFunctionReturn(PETSC_SUCCESS); |
| 479 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
368 | if (!h) { |
| 480 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
368 | PetscCall(VecCUDAGetArray(bv->buffer,&d_h)); |
| 481 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
368 | PetscCall(VecCUDAGetArrayRead(bv->omega,&d_omega)); |
| 482 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
368 | PetscCall(SlepcKernelSetGrid1D(bv->nc+j,&blocks3d,&threads3d,&xcount)); |
| 483 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
368 | PetscCall(PetscLogGpuTimeBegin()); |
| 484 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
368 | if (inverse) { |
| 485 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 2 times.
|
368 | for (i=0;i<xcount;i++) PointwiseDiv_kernel<<<blocks3d,threads3d>>>(i,d_h,d_omega,bv->nc+j); |
| 486 | } else { | ||
| 487 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 2 times.
|
368 | for (i=0;i<xcount;i++) PointwiseMult_kernel<<<blocks3d,threads3d>>>(i,d_h,d_omega,bv->nc+j); |
| 488 | } | ||
| 489 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
368 | PetscCallCUDA(cudaGetLastError()); |
| 490 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
368 | PetscCall(PetscLogGpuTimeEnd()); |
| 491 | 368 | PetscCall(PetscLogGpuFlops(1.0*(bv->nc+j))); | |
| 492 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
368 | PetscCall(VecCUDARestoreArrayRead(bv->omega,&d_omega)); |
| 493 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
368 | PetscCall(VecCUDARestoreArray(bv->buffer,&d_h)); |
| 494 | } else { | ||
| 495 | ✗ | PetscCall(VecGetArrayRead(bv->omega,&omega)); | |
| 496 | ✗ | if (inverse) for (i=0;i<bv->nc+j;i++) h[i] /= PetscRealPart(omega[i]); | |
| 497 | ✗ | else for (i=0;i<bv->nc+j;i++) h[i] *= PetscRealPart(omega[i]); | |
| 498 | ✗ | PetscCall(VecRestoreArrayRead(bv->omega,&omega)); | |
| 499 | ✗ | PetscCall(PetscLogFlops(1.0*(bv->nc+j))); | |
| 500 | } | ||
| 501 | PetscFunctionReturn(PETSC_SUCCESS); | ||
| 502 | } | ||
| 503 | |||
| 504 | /* | ||
| 505 | BV_SquareRoot_CUDA - Returns the square root of position j (counted after the constraints) | ||
| 506 | of the coefficients array | ||
| 507 | */ | ||
| 508 | 31944 | PetscErrorCode BV_SquareRoot_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscReal *beta) | |
| 509 | { | ||
| 510 | 31944 | const PetscScalar *d_h; | |
| 511 | 31944 | PetscScalar hh; | |
| 512 | |||
| 513 | 31944 | PetscFunctionBegin; | |
| 514 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
31944 | if (!h) { |
| 515 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
31944 | PetscCall(VecCUDAGetArrayRead(bv->buffer,&d_h)); |
| 516 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
31944 | PetscCall(PetscLogGpuTimeBegin()); |
| 517 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
31944 | PetscCallCUDA(cudaMemcpy(&hh,d_h+bv->nc+j,sizeof(PetscScalar),cudaMemcpyDeviceToHost)); |
| 518 | 31944 | PetscCall(PetscLogGpuToCpu(sizeof(PetscScalar))); | |
| 519 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
31944 | PetscCall(PetscLogGpuTimeEnd()); |
| 520 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
31944 | PetscCall(BV_SafeSqrt(bv,hh,beta)); |
| 521 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
31944 | PetscCall(VecCUDARestoreArrayRead(bv->buffer,&d_h)); |
| 522 | ✗ | } else PetscCall(BV_SafeSqrt(bv,h[bv->nc+j],beta)); | |
| 523 | PetscFunctionReturn(PETSC_SUCCESS); | ||
| 524 | } | ||
| 525 | |||
| 526 | /* | ||
| 527 | BV_StoreCoefficients_CUDA - Copy the contents of the coefficients array to an array dest | ||
| 528 | provided by the caller (only values from l to j are copied) | ||
| 529 | */ | ||
| 530 | 1321 | PetscErrorCode BV_StoreCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscScalar *dest) | |
| 531 | { | ||
| 532 | 1321 | const PetscScalar *d_h,*d_a; | |
| 533 | 1321 | PetscInt i; | |
| 534 | |||
| 535 | 1321 | PetscFunctionBegin; | |
| 536 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
1321 | if (!h) { |
| 537 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
1321 | PetscCall(VecCUDAGetArrayRead(bv->buffer,&d_a)); |
| 538 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
1321 | PetscCall(PetscLogGpuTimeBegin()); |
| 539 | 1321 | d_h = d_a + j*(bv->nc+bv->m)+bv->nc; | |
| 540 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
1321 | PetscCallCUDA(cudaMemcpy(dest-bv->l,d_h,(j-bv->l)*sizeof(PetscScalar),cudaMemcpyDeviceToHost)); |
| 541 | 1321 | PetscCall(PetscLogGpuToCpu((j-bv->l)*sizeof(PetscScalar))); | |
| 542 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
1321 | PetscCall(PetscLogGpuTimeEnd()); |
| 543 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
1321 | PetscCall(VecCUDARestoreArrayRead(bv->buffer,&d_a)); |
| 544 | } else { | ||
| 545 | ✗ | for (i=bv->l;i<j;i++) dest[i-bv->l] = h[bv->nc+i]; | |
| 546 | } | ||
| 547 | PetscFunctionReturn(PETSC_SUCCESS); | ||
| 548 | } | ||
| 549 |