| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | /* | ||
| 2 | - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | ||
| 3 | SLEPc - Scalable Library for Eigenvalue Problem Computations | ||
| 4 | Copyright (c) 2002-, Universitat Politecnica de Valencia, Spain | ||
| 5 | |||
| 6 | This file is part of SLEPc. | ||
| 7 | SLEPc is distributed under a 2-clause BSD license (see LICENSE). | ||
| 8 | - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | ||
| 9 | */ | ||
| 10 | /* | ||
| 11 | HIP-related code common to several BV impls | ||
| 12 | */ | ||
| 13 | |||
| 14 | #include <slepc/private/bvimpl.h> | ||
| 15 | #include <slepccupmblas.h> | ||
| 16 | |||
| 17 | #define BLOCKSIZE 64 | ||
| 18 | |||
| 19 | /* | ||
| 20 | C := alpha*A*B + beta*C | ||
| 21 | */ | ||
| 22 | ✗ | PetscErrorCode BVMult_BLAS_HIP(BV,PetscInt m_,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *d_A,PetscInt lda_,const PetscScalar *d_B,PetscInt ldb_,PetscScalar beta,PetscScalar *d_C,PetscInt ldc_) | |
| 23 | { | ||
| 24 | ✗ | PetscHipBLASInt m=0,n=0,k=0,lda=0,ldb=0,ldc=0; | |
| 25 | ✗ | hipblasHandle_t hipblashandle; | |
| 26 | |||
| 27 | PetscFunctionBegin; | ||
| 28 | ✗ | PetscCall(PetscHIPBLASGetHandle(&hipblashandle)); | |
| 29 | ✗ | PetscCall(PetscHipBLASIntCast(m_,&m)); | |
| 30 | ✗ | PetscCall(PetscHipBLASIntCast(n_,&n)); | |
| 31 | ✗ | PetscCall(PetscHipBLASIntCast(k_,&k)); | |
| 32 | ✗ | PetscCall(PetscHipBLASIntCast(lda_,&lda)); | |
| 33 | ✗ | PetscCall(PetscHipBLASIntCast(ldb_,&ldb)); | |
| 34 | ✗ | PetscCall(PetscHipBLASIntCast(ldc_,&ldc)); | |
| 35 | ✗ | PetscCall(PetscLogGpuTimeBegin()); | |
| 36 | ✗ | PetscCallHIPBLAS(hipblasXgemm(hipblashandle,HIPBLAS_OP_N,HIPBLAS_OP_N,m,n,k,&alpha,d_A,lda,d_B,ldb,&beta,d_C,ldc)); | |
| 37 | ✗ | PetscCall(PetscLogGpuTimeEnd()); | |
| 38 | ✗ | PetscCall(PetscLogGpuFlops(2.0*m*n*k)); | |
| 39 | ✗ | PetscFunctionReturn(PETSC_SUCCESS); | |
| 40 | ✗ | } | |
| 41 | |||
| 42 | /* | ||
| 43 | y := alpha*A*x + beta*y | ||
| 44 | */ | ||
| 45 | 4403601413398079328 | PetscErrorCode BVMultVec_BLAS_HIP(BV,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *d_A,PetscInt lda_,const PetscScalar *d_x,PetscScalar beta,PetscScalar *d_y) | |
| 46 | { | ||
| 47 | 4403601413398079328 | PetscHipBLASInt n=0,k=0,lda=0,one=1; | |
| 48 | 4403601413398079328 | hipblasHandle_t hipblashandle; | |
| 49 | |||
| 50 | PetscFunctionBegin; | ||
| 51 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 2 times.
|
4403601413398079328 | PetscCall(PetscHIPBLASGetHandle(&hipblashandle)); |
| 52 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 2 times.
|
4403601413398079328 | PetscCall(PetscHipBLASIntCast(n_,&n)); |
| 53 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 2 times.
|
4403601413398079328 | PetscCall(PetscHipBLASIntCast(k_,&k)); |
| 54 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 2 times.
|
4403601413398079328 | PetscCall(PetscHipBLASIntCast(lda_,&lda)); |
| 55 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 2 times.
|
4403601413398079328 | PetscCall(PetscLogGpuTimeBegin()); |
| 56 |
3/8✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✓ Branch 6 taken 1 times.
✓ Branch 7 taken 2 times.
|
4403601413398079328 | PetscCallHIPBLAS(hipblasXgemv(hipblashandle,HIPBLAS_OP_N,n,k,&alpha,d_A,lda,d_x,one,&beta,d_y,one)); |
| 57 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 2 times.
|
4403601413398079328 | PetscCall(PetscLogGpuTimeEnd()); |
| 58 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 2 times.
|
4403601413398079328 | PetscCall(PetscLogGpuFlops(2.0*n*k)); |
| 59 | 4403601413398079328 | PetscFunctionReturn(PETSC_SUCCESS); | |
| 60 | 15325409517198366880 | } | |
| 61 | |||
| 62 | /* | ||
| 63 | A(:,s:e-1) := A*B(:,s:e-1) | ||
| 64 | */ | ||
| 65 | 16227252309694873600 | PetscErrorCode BVMultInPlace_BLAS_HIP(BV,PetscInt m_,PetscInt k_,PetscInt s,PetscInt e,PetscScalar *d_A,PetscInt lda_,const PetscScalar *d_B,PetscInt ldb_,PetscBool btrans) | |
| 66 | { | ||
| 67 | 16227252309694873600 | const PetscScalar *d_B1; | |
| 68 | 16227252309694873600 | PetscScalar *d_work,sone=1.0,szero=0.0; | |
| 69 | 16227252309694873600 | PetscHipBLASInt m=0,n=0,k=0,l=0,lda=0,ldb=0,bs=BLOCKSIZE; | |
| 70 | 16227252309694873600 | size_t freemem,totmem; | |
| 71 | 16227252309694873600 | hipblasHandle_t hipblashandle; | |
| 72 | 16227252309694873600 | hipblasOperation_t bt; | |
| 73 | |||
| 74 | PetscFunctionBegin; | ||
| 75 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
16227252309694873600 | PetscCall(PetscHIPBLASGetHandle(&hipblashandle)); |
| 76 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
16227252309694873600 | PetscCall(PetscHipBLASIntCast(m_,&m)); |
| 77 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
16227252309694873600 | PetscCall(PetscHipBLASIntCast(e-s,&n)); |
| 78 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
16227252309694873600 | PetscCall(PetscHipBLASIntCast(k_,&k)); |
| 79 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
16227252309694873600 | PetscCall(PetscHipBLASIntCast(lda_,&lda)); |
| 80 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
16227252309694873600 | PetscCall(PetscHipBLASIntCast(ldb_,&ldb)); |
| 81 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
16227252309694873600 | PetscCall(PetscLogGpuTimeBegin()); |
| 82 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
16227252309694873600 | if (PetscUnlikely(btrans)) { |
| 83 | ✗ | d_B1 = d_B+s; | |
| 84 | ✗ | bt = HIPBLAS_OP_C; | |
| 85 | ✗ | } else { | |
| 86 | 16227252309694873600 | d_B1 = d_B+s*ldb; | |
| 87 | 16227252309694873600 | bt = HIPBLAS_OP_N; | |
| 88 | } | ||
| 89 | /* try to allocate the whole matrix */ | ||
| 90 |
2/8✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 2 times.
|
16227252309694873600 | PetscCallHIP(hipMemGetInfo(&freemem,&totmem)); |
| 91 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
16227252309694873600 | if (freemem>=lda*n*sizeof(PetscScalar)) { |
| 92 |
2/8✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 2 times.
|
16227252309694873600 | PetscCallHIP(hipMalloc((void**)&d_work,lda*n*sizeof(PetscScalar))); |
| 93 |
2/8✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 2 times.
|
16227252309694873600 | PetscCallHIPBLAS(hipblasXgemm(hipblashandle,HIPBLAS_OP_N,bt,m,n,k,&sone,d_A,lda,d_B1,ldb,&szero,d_work,lda)); |
| 94 |
2/8✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 2 times.
|
16227252309694873600 | PetscCallHIP(hipMemcpy2D(d_A+s*lda,lda*sizeof(PetscScalar),d_work,lda*sizeof(PetscScalar),m*sizeof(PetscScalar),n,hipMemcpyDeviceToDevice)); |
| 95 | 16227252309694873600 | } else { | |
| 96 | ✗ | PetscCall(PetscHipBLASIntCast(freemem/(m*sizeof(PetscScalar)),&bs)); | |
| 97 | ✗ | PetscCallHIP(hipMalloc((void**)&d_work,bs*n*sizeof(PetscScalar))); | |
| 98 | ✗ | PetscCall(PetscHipBLASIntCast(m % bs,&l)); | |
| 99 | ✗ | if (l) { | |
| 100 | ✗ | PetscCallHIPBLAS(hipblasXgemm(hipblashandle,HIPBLAS_OP_N,bt,l,n,k,&sone,d_A,lda,d_B1,ldb,&szero,d_work,l)); | |
| 101 | ✗ | PetscCallHIP(hipMemcpy2D(d_A+s*lda,lda*sizeof(PetscScalar),d_work,l*sizeof(PetscScalar),l*sizeof(PetscScalar),n,hipMemcpyDeviceToDevice)); | |
| 102 | ✗ | } | |
| 103 | ✗ | for (;l<m;l+=bs) { | |
| 104 | ✗ | PetscCallHIPBLAS(hipblasXgemm(hipblashandle,HIPBLAS_OP_N,bt,bs,n,k,&sone,d_A+l,lda,d_B1,ldb,&szero,d_work,bs)); | |
| 105 | ✗ | PetscCallHIP(hipMemcpy2D(d_A+l+s*lda,lda*sizeof(PetscScalar),d_work,bs*sizeof(PetscScalar),bs*sizeof(PetscScalar),n,hipMemcpyDeviceToDevice)); | |
| 106 | ✗ | } | |
| 107 | } | ||
| 108 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
16227252309694873600 | PetscCall(PetscLogGpuTimeEnd()); |
| 109 |
2/8✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 2 times.
|
16227252309694873600 | PetscCallHIP(hipFree(d_work)); |
| 110 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
16227252309694873600 | PetscCall(PetscLogGpuFlops(2.0*m*n*k)); |
| 111 | 16227252309694873600 | PetscFunctionReturn(PETSC_SUCCESS); | |
| 112 | 16227252309694873600 | } | |
| 113 | |||
| 114 | /* | ||
| 115 | B := alpha*A + beta*B | ||
| 116 | */ | ||
| 117 | ✗ | PetscErrorCode BVAXPY_BLAS_HIP(BV,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *d_A,PetscInt lda_,PetscScalar beta,PetscScalar *d_B,PetscInt ldb_) | |
| 118 | { | ||
| 119 | ✗ | PetscHipBLASInt n=0,k=0,lda=0,ldb=0; | |
| 120 | ✗ | hipblasHandle_t hipblashandle; | |
| 121 | |||
| 122 | PetscFunctionBegin; | ||
| 123 | ✗ | PetscCall(PetscHIPBLASGetHandle(&hipblashandle)); | |
| 124 | ✗ | PetscCall(PetscHipBLASIntCast(n_,&n)); | |
| 125 | ✗ | PetscCall(PetscHipBLASIntCast(k_,&k)); | |
| 126 | ✗ | PetscCall(PetscHipBLASIntCast(lda_,&lda)); | |
| 127 | ✗ | PetscCall(PetscHipBLASIntCast(ldb_,&ldb)); | |
| 128 | ✗ | PetscCall(PetscLogGpuTimeBegin()); | |
| 129 | ✗ | PetscCallHIPBLAS(hipblasXgeam(hipblashandle,HIPBLAS_OP_N,HIPBLAS_OP_N,n,k,&alpha,d_A,lda,&beta,d_B,ldb,d_B,ldb)); | |
| 130 | ✗ | PetscCall(PetscLogGpuTimeEnd()); | |
| 131 | ✗ | PetscCall(PetscLogGpuFlops((beta==(PetscScalar)1.0)?2.0*n*k:3.0*n*k)); | |
| 132 | ✗ | PetscFunctionReturn(PETSC_SUCCESS); | |
| 133 | ✗ | } | |
| 134 | |||
| 135 | /* | ||
| 136 | C := A'*B | ||
| 137 | |||
| 138 | C is a CPU array | ||
| 139 | */ | ||
| 140 | ✗ | PetscErrorCode BVDot_BLAS_HIP(BV bv,PetscInt m_,PetscInt n_,PetscInt k_,const PetscScalar *d_A,PetscInt lda_,const PetscScalar *d_B,PetscInt ldb_,PetscScalar *C,PetscInt ldc_,PetscBool mpi) | |
| 141 | { | ||
| 142 | ✗ | PetscScalar *d_work,sone=1.0,szero=0.0,*CC; | |
| 143 | ✗ | PetscInt j; | |
| 144 | ✗ | PetscHipBLASInt m=0,n=0,k=0,lda=0,ldb=0,ldc=0; | |
| 145 | ✗ | PetscMPIInt len; | |
| 146 | ✗ | hipblasHandle_t hipblashandle; | |
| 147 | |||
| 148 | PetscFunctionBegin; | ||
| 149 | ✗ | PetscCall(PetscHIPBLASGetHandle(&hipblashandle)); | |
| 150 | ✗ | PetscCall(PetscHipBLASIntCast(m_,&m)); | |
| 151 | ✗ | PetscCall(PetscHipBLASIntCast(n_,&n)); | |
| 152 | ✗ | PetscCall(PetscHipBLASIntCast(k_,&k)); | |
| 153 | ✗ | PetscCall(PetscHipBLASIntCast(lda_,&lda)); | |
| 154 | ✗ | PetscCall(PetscHipBLASIntCast(ldb_,&ldb)); | |
| 155 | ✗ | PetscCall(PetscHipBLASIntCast(ldc_,&ldc)); | |
| 156 | ✗ | PetscCallHIP(hipMalloc((void**)&d_work,m*n*sizeof(PetscScalar))); | |
| 157 | ✗ | if (mpi) { | |
| 158 | ✗ | if (ldc==m) { | |
| 159 | ✗ | PetscCall(BVAllocateWork_Private(bv,m*n)); | |
| 160 | ✗ | if (k) { | |
| 161 | ✗ | PetscCall(PetscLogGpuTimeBegin()); | |
| 162 | ✗ | PetscCallHIPBLAS(hipblasXgemm(hipblashandle,HIPBLAS_OP_C,HIPBLAS_OP_N,m,n,k,&sone,d_A,lda,d_B,ldb,&szero,d_work,ldc)); | |
| 163 | ✗ | PetscCall(PetscLogGpuTimeEnd()); | |
| 164 | ✗ | PetscCallHIP(hipMemcpy(bv->work,d_work,m*n*sizeof(PetscScalar),hipMemcpyDeviceToHost)); | |
| 165 | ✗ | PetscCall(PetscLogGpuToCpu(m*n*sizeof(PetscScalar))); | |
| 166 | ✗ | } else PetscCall(PetscArrayzero(bv->work,m*n)); | |
| 167 | ✗ | PetscCall(PetscMPIIntCast(m*n,&len)); | |
| 168 | ✗ | PetscCallMPI(MPIU_Allreduce(bv->work,C,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv))); | |
| 169 | ✗ | } else { | |
| 170 | ✗ | PetscCall(BVAllocateWork_Private(bv,2*m*n)); | |
| 171 | ✗ | CC = bv->work+m*n; | |
| 172 | ✗ | if (k) { | |
| 173 | ✗ | PetscCall(PetscLogGpuTimeBegin()); | |
| 174 | ✗ | PetscCallHIPBLAS(hipblasXgemm(hipblashandle,HIPBLAS_OP_C,HIPBLAS_OP_N,m,n,k,&sone,d_A,lda,d_B,ldb,&szero,d_work,m)); | |
| 175 | ✗ | PetscCall(PetscLogGpuTimeEnd()); | |
| 176 | ✗ | PetscCallHIP(hipMemcpy(bv->work,d_work,m*n*sizeof(PetscScalar),hipMemcpyDeviceToHost)); | |
| 177 | ✗ | PetscCall(PetscLogGpuToCpu(m*n*sizeof(PetscScalar))); | |
| 178 | ✗ | } else PetscCall(PetscArrayzero(bv->work,m*n)); | |
| 179 | ✗ | PetscCall(PetscMPIIntCast(m*n,&len)); | |
| 180 | ✗ | PetscCallMPI(MPIU_Allreduce(bv->work,CC,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv))); | |
| 181 | ✗ | for (j=0;j<n;j++) PetscCall(PetscArraycpy(C+j*ldc,CC+j*m,m)); | |
| 182 | } | ||
| 183 | ✗ | } else { | |
| 184 | ✗ | if (k) { | |
| 185 | ✗ | PetscCall(BVAllocateWork_Private(bv,m*n)); | |
| 186 | ✗ | PetscCall(PetscLogGpuTimeBegin()); | |
| 187 | ✗ | PetscCallHIPBLAS(hipblasXgemm(hipblashandle,HIPBLAS_OP_C,HIPBLAS_OP_N,m,n,k,&sone,d_A,lda,d_B,ldb,&szero,d_work,m)); | |
| 188 | ✗ | PetscCall(PetscLogGpuTimeEnd()); | |
| 189 | ✗ | PetscCallHIP(hipMemcpy(bv->work,d_work,m*n*sizeof(PetscScalar),hipMemcpyDeviceToHost)); | |
| 190 | ✗ | PetscCall(PetscLogGpuToCpu(m*n*sizeof(PetscScalar))); | |
| 191 | ✗ | for (j=0;j<n;j++) PetscCall(PetscArraycpy(C+j*ldc,bv->work+j*m,m)); | |
| 192 | ✗ | } | |
| 193 | } | ||
| 194 | ✗ | PetscCallHIP(hipFree(d_work)); | |
| 195 | ✗ | PetscCall(PetscLogGpuFlops(2.0*m*n*k)); | |
| 196 | ✗ | PetscFunctionReturn(PETSC_SUCCESS); | |
| 197 | ✗ | } | |
| 198 | |||
| 199 | /* | ||
| 200 | y := A'*x | ||
| 201 | |||
| 202 | y is a CPU array, if NULL bv->buffer is used as a workspace | ||
| 203 | */ | ||
| 204 | 14459712507201182024 | PetscErrorCode BVDotVec_BLAS_HIP(BV bv,PetscInt n_,PetscInt k_,const PetscScalar *d_A,PetscInt lda_,const PetscScalar *d_x,PetscScalar *y,PetscBool mpi) | |
| 205 | { | ||
| 206 | 14459712507201182024 | PetscScalar *d_work,szero=0.0,sone=1.0,*yy; | |
| 207 | 14459712507201182024 | PetscHipBLASInt n=0,k=0,lda=0,one=1; | |
| 208 | 14459712507201182024 | PetscMPIInt len; | |
| 209 | 14459712507201182024 | hipblasHandle_t hipblashandle; | |
| 210 | |||
| 211 | PetscFunctionBegin; | ||
| 212 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 2 times.
|
14459712507201182024 | PetscCall(PetscHIPBLASGetHandle(&hipblashandle)); |
| 213 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 2 times.
|
14459712507201182024 | PetscCall(PetscHipBLASIntCast(n_,&n)); |
| 214 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 2 times.
|
14459712507201182024 | PetscCall(PetscHipBLASIntCast(k_,&k)); |
| 215 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 2 times.
|
14459712507201182024 | PetscCall(PetscHipBLASIntCast(lda_,&lda)); |
| 216 |
4/6✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✓ Branch 4 taken 1 times.
✓ Branch 5 taken 2 times.
|
14459712507201182024 | if (!y) PetscCall(VecHIPGetArrayWrite(bv->buffer,&d_work)); |
| 217 | ✗ | else PetscCallHIP(hipMalloc((void**)&d_work,k*sizeof(PetscScalar))); | |
| 218 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
23683102943695899208 | if (mpi) { |
| 219 | ✗ | PetscCall(BVAllocateWork_Private(bv,k)); | |
| 220 | ✗ | if (n) { | |
| 221 | ✗ | PetscCall(PetscLogGpuTimeBegin()); | |
| 222 | ✗ | PetscCallHIPBLAS(hipblasXgemv(hipblashandle,HIPBLAS_OP_C,n,k,&sone,d_A,lda,d_x,one,&szero,d_work,one)); | |
| 223 | ✗ | PetscCall(PetscLogGpuTimeEnd()); | |
| 224 | ✗ | PetscCallHIP(hipMemcpy(bv->work,d_work,k*sizeof(PetscScalar),hipMemcpyDeviceToHost)); | |
| 225 | ✗ | PetscCall(PetscLogGpuToCpu(k*sizeof(PetscScalar))); | |
| 226 | ✗ | } else PetscCall(PetscArrayzero(bv->work,k)); | |
| 227 | /* reduction */ | ||
| 228 | ✗ | PetscCall(PetscMPIIntCast(k,&len)); | |
| 229 | ✗ | if (!y) { | |
| 230 | ✗ | if (use_gpu_aware_mpi) { /* case 1: reduce on GPU using a temporary buffer */ | |
| 231 | ✗ | PetscCallHIP(hipMalloc((void**)&yy,k*sizeof(PetscScalar))); | |
| 232 | ✗ | PetscCallMPI(MPIU_Allreduce(d_work,yy,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv))); | |
| 233 | ✗ | PetscCallHIP(hipMemcpy(d_work,yy,k*sizeof(PetscScalar),hipMemcpyDeviceToDevice)); | |
| 234 | ✗ | PetscCallHIP(hipFree(yy)); | |
| 235 | ✗ | } else { /* case 2: reduce on CPU, copy result back to GPU */ | |
| 236 | ✗ | PetscCall(BVAllocateWork_Private(bv,2*k)); | |
| 237 | ✗ | yy = bv->work+k; | |
| 238 | ✗ | PetscCallHIP(hipMemcpy(bv->work,d_work,k*sizeof(PetscScalar),hipMemcpyDeviceToHost)); | |
| 239 | ✗ | PetscCall(PetscLogGpuToCpu(k*sizeof(PetscScalar))); | |
| 240 | ✗ | PetscCallMPI(MPIU_Allreduce(bv->work,yy,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv))); | |
| 241 | ✗ | PetscCallHIP(hipMemcpy(d_work,yy,k*sizeof(PetscScalar),hipMemcpyHostToDevice)); | |
| 242 | ✗ | PetscCall(PetscLogCpuToGpu(k*sizeof(PetscScalar))); | |
| 243 | } | ||
| 244 | ✗ | PetscCall(VecHIPRestoreArrayWrite(bv->buffer,&d_work)); | |
| 245 | ✗ | } else { /* case 3: user-provided array y, reduce on CPU */ | |
| 246 | ✗ | PetscCallHIP(hipFree(d_work)); | |
| 247 | ✗ | PetscCallMPI(MPIU_Allreduce(bv->work,y,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv))); | |
| 248 | } | ||
| 249 | ✗ | } else { | |
| 250 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 2 times.
|
14459712507201182024 | if (n) { |
| 251 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 2 times.
|
14459712507201182024 | PetscCall(PetscLogGpuTimeBegin()); |
| 252 |
3/8✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✓ Branch 6 taken 1 times.
✓ Branch 7 taken 2 times.
|
14459712507201182024 | PetscCallHIPBLAS(hipblasXgemv(hipblashandle,HIPBLAS_OP_C,n,k,&sone,d_A,lda,d_x,one,&szero,d_work,one)); |
| 253 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 2 times.
|
14459712507201182024 | PetscCall(PetscLogGpuTimeEnd()); |
| 254 | 14459712507201182024 | } | |
| 255 |
4/6✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✓ Branch 4 taken 1 times.
✓ Branch 5 taken 2 times.
|
23683102943695899208 | if (!y) PetscCall(VecHIPRestoreArrayWrite(bv->buffer,&d_work)); |
| 256 | else { | ||
| 257 | ✗ | PetscCallHIP(hipMemcpy(y,d_work,k*sizeof(PetscScalar),hipMemcpyDeviceToHost)); | |
| 258 | ✗ | PetscCall(PetscLogGpuToCpu(k*sizeof(PetscScalar))); | |
| 259 | ✗ | PetscCallHIP(hipFree(d_work)); | |
| 260 | } | ||
| 261 | } | ||
| 262 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
14459712507201182024 | PetscCall(PetscLogGpuFlops(2.0*n*k)); |
| 263 | 14459712507201182024 | PetscFunctionReturn(PETSC_SUCCESS); | |
| 264 | 9848127686793471688 | } | |
| 265 | |||
| 266 | /* | ||
| 267 | Scale n scalars | ||
| 268 | */ | ||
| 269 | 4511865541610169816 | PetscErrorCode BVScale_BLAS_HIP(BV,PetscInt n_,PetscScalar *d_A,PetscScalar alpha) | |
| 270 | { | ||
| 271 | 4511865541610169816 | PetscHipBLASInt n=0,one=1; | |
| 272 | 4511865541610169816 | hipblasHandle_t hipblashandle; | |
| 273 | |||
| 274 | PetscFunctionBegin; | ||
| 275 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 2 times.
|
4511865541610169816 | PetscCall(PetscHipBLASIntCast(n_,&n)); |
| 276 |
1/10✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
|
4511865541610169816 | if (PetscUnlikely(alpha == (PetscScalar)0.0)) PetscCallHIP(hipMemset(d_A,0,n*sizeof(PetscScalar))); |
| 277 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
4511865541610169816 | else if (alpha != (PetscScalar)1.0) { |
| 278 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 2 times.
|
4511865541610169816 | PetscCall(PetscHIPBLASGetHandle(&hipblashandle)); |
| 279 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 2 times.
|
4511865541610169816 | PetscCall(PetscLogGpuTimeBegin()); |
| 280 |
3/8✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 2 times.
|
4511865541610169816 | PetscCallHIPBLAS(hipblasXscal(hipblashandle,n,&alpha,d_A,one)); |
| 281 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 2 times.
|
4511865541610169816 | PetscCall(PetscLogGpuTimeEnd()); |
| 282 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 2 times.
|
4511865541610169816 | PetscCall(PetscLogGpuFlops(1.0*n)); |
| 283 | 4511865541610169816 | } | |
| 284 | 32381622605808933416 | PetscFunctionReturn(PETSC_SUCCESS); | |
| 285 | 21760763893513104376 | } | |
| 286 | |||
| 287 | /* | ||
| 288 | Compute 2-norm of vector consisting of n scalars | ||
| 289 | */ | ||
| 290 | 216 | PetscErrorCode BVNorm_BLAS_HIP(BV,PetscInt n_,const PetscScalar *d_A,PetscReal *nrm) | |
| 291 | { | ||
| 292 | 216 | PetscHipBLASInt n=0,one=1; | |
| 293 | 216 | hipblasHandle_t hipblashandle; | |
| 294 | |||
| 295 | PetscFunctionBegin; | ||
| 296 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
216 | PetscCall(PetscHipBLASIntCast(n_,&n)); |
| 297 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
216 | PetscCall(PetscHIPBLASGetHandle(&hipblashandle)); |
| 298 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
216 | PetscCall(PetscLogGpuTimeBegin()); |
| 299 |
2/8✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 2 times.
|
216 | PetscCallHIPBLAS(hipblasXnrm2(hipblashandle,n,d_A,one,nrm)); |
| 300 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
216 | PetscCall(PetscLogGpuTimeEnd()); |
| 301 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
216 | PetscCall(PetscLogGpuFlops(2.0*n)); |
| 302 | 216 | PetscFunctionReturn(PETSC_SUCCESS); | |
| 303 | 216 | } | |
| 304 | |||
| 305 | /* | ||
| 306 | Normalize the columns of A | ||
| 307 | */ | ||
| 308 | 8668499095851106304 | PetscErrorCode BVNormalize_BLAS_HIP(BV,PetscInt m_,PetscInt n_,PetscScalar *d_A,PetscInt lda_,PetscScalar *eigi) | |
| 309 | { | ||
| 310 | 8668499095851106304 | PetscInt j,k; | |
| 311 | 8668499095851106304 | PetscReal nrm,nrm1; | |
| 312 | 8668499095851106304 | PetscScalar alpha; | |
| 313 | 8668499095851106304 | PetscHipBLASInt m=0,one=1; | |
| 314 | 8668499095851106304 | hipblasHandle_t hipblashandle; | |
| 315 | |||
| 316 | PetscFunctionBegin; | ||
| 317 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
8668499095851106304 | PetscCall(PetscHipBLASIntCast(m_,&m)); |
| 318 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
8668499095851106304 | PetscCall(PetscHIPBLASGetHandle(&hipblashandle)); |
| 319 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
8668499095851106304 | PetscCall(PetscLogGpuTimeBegin()); |
| 320 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
24895751405545979904 | for (j=0;j<n_;j++) { |
| 321 | 16227252309694873600 | k = 1; | |
| 322 | #if !defined(PETSC_USE_COMPLEX) | ||
| 323 |
2/4✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✗ Branch 3 not taken.
|
7003880272840097792 | if (eigi && eigi[j] != 0.0) k = 2; |
| 324 | #endif | ||
| 325 |
2/8✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 2 times.
|
16227252309694873600 | PetscCallHIPBLAS(hipblasXnrm2(hipblashandle,m,d_A+j*lda_,one,&nrm)); |
| 326 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
16227252309694873600 | if (k==2) { |
| 327 | ✗ | PetscCallHIPBLAS(hipblasXnrm2(hipblashandle,m,d_A+(j+1)*lda_,one,&nrm1)); | |
| 328 | ✗ | nrm = SlepcAbs(nrm,nrm1); | |
| 329 | ✗ | } | |
| 330 | 16227252309694873600 | alpha = 1.0/nrm; | |
| 331 |
2/8✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 2 times.
|
16227252309694873600 | PetscCallHIPBLAS(hipblasXscal(hipblashandle,m,&alpha,d_A+j*lda_,one)); |
| 332 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
16227252309694873600 | if (k==2) { |
| 333 | ✗ | PetscCallHIPBLAS(hipblasXscal(hipblashandle,m,&alpha,d_A+(j+1)*lda_,one)); | |
| 334 | ✗ | j++; | |
| 335 | ✗ | } | |
| 336 | 16227252309694873600 | } | |
| 337 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
8668499095851106304 | PetscCall(PetscLogGpuTimeEnd()); |
| 338 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
8668499095851106304 | PetscCall(PetscLogGpuFlops(3.0*m*n_)); |
| 339 | 8668499095851106304 | PetscFunctionReturn(PETSC_SUCCESS); | |
| 340 | 8668499095851106304 | } | |
| 341 | |||
| 342 | /* | ||
| 343 | BV_CleanCoefficients_HIP - Sets to zero all entries of column j of the bv buffer | ||
| 344 | */ | ||
| 345 | 4511865581170845360 | PetscErrorCode BV_CleanCoefficients_HIP(BV bv,PetscInt j,PetscScalar *h) | |
| 346 | { | ||
| 347 | 4511865581170845360 | PetscScalar *d_hh,*d_a; | |
| 348 | 4511865581170845360 | PetscInt i; | |
| 349 | |||
| 350 | PetscFunctionBegin; | ||
| 351 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
4511865581170845360 | if (!h) { |
| 352 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 2 times.
|
4511865581170845360 | PetscCall(VecHIPGetArray(bv->buffer,&d_a)); |
| 353 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
4511865581170845360 | PetscCall(PetscLogGpuTimeBegin()); |
| 354 | 4511865581170845360 | d_hh = d_a + j*(bv->nc+bv->m); | |
| 355 |
3/8✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 2 times.
|
4511865581170845360 | PetscCallHIP(hipMemset(d_hh,0,(bv->nc+j)*sizeof(PetscScalar))); |
| 356 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 2 times.
|
4511865581170845360 | PetscCall(PetscLogGpuTimeEnd()); |
| 357 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 2 times.
|
4511865581170845360 | PetscCall(VecHIPRestoreArray(bv->buffer,&d_a)); |
| 358 | 4511865581170845360 | } else { /* cpu memory */ | |
| 359 | ✗ | for (i=0;i<bv->nc+j;i++) h[i] = 0.0; | |
| 360 | } | ||
| 361 | 32381622566248257872 | PetscFunctionReturn(PETSC_SUCCESS); | |
| 362 | 3713302083118505008 | } | |
| 363 | |||
| 364 | /* | ||
| 365 | BV_AddCoefficients_HIP - Add the contents of the scratch (0-th column) of the bv buffer | ||
| 366 | into column j of the bv buffer | ||
| 367 | */ | ||
| 368 | 14459712507201182024 | PetscErrorCode BV_AddCoefficients_HIP(BV bv,PetscInt j,PetscScalar *h,PetscScalar *c) | |
| 369 | { | ||
| 370 | 14459712507201182024 | PetscScalar *d_h,*d_c,sone=1.0; | |
| 371 | 14459712507201182024 | PetscInt i; | |
| 372 | 14459712507201182024 | PetscHipBLASInt idx=0,one=1; | |
| 373 | 14459712507201182024 | hipblasHandle_t hipblashandle; | |
| 374 | |||
| 375 | PetscFunctionBegin; | ||
| 376 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
14459712507201182024 | if (!h) { |
| 377 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 2 times.
|
14459712507201182024 | PetscCall(PetscHIPBLASGetHandle(&hipblashandle)); |
| 378 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
14459712507201182024 | PetscCall(VecHIPGetArray(bv->buffer,&d_c)); |
| 379 | 14459712507201182024 | d_h = d_c + j*(bv->nc+bv->m); | |
| 380 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
14459712507201182024 | PetscCall(PetscHipBLASIntCast(bv->nc+j,&idx)); |
| 381 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 2 times.
|
14459712507201182024 | PetscCall(PetscLogGpuTimeBegin()); |
| 382 |
3/8✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✓ Branch 6 taken 1 times.
✓ Branch 7 taken 2 times.
|
14459712507201182024 | PetscCallHIPBLAS(hipblasXaxpy(hipblashandle,idx,&sone,d_c,one,d_h,one)); |
| 383 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
14459712507201182024 | PetscCall(PetscLogGpuTimeEnd()); |
| 384 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
14459712507201182024 | PetscCall(PetscLogGpuFlops(1.0*(bv->nc+j))); |
| 385 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 2 times.
|
14459712507201182024 | PetscCall(VecHIPRestoreArray(bv->buffer,&d_c)); |
| 386 | 14459712507201182024 | } else { /* cpu memory */ | |
| 387 | ✗ | for (i=0;i<bv->nc+j;i++) h[i] += c[i]; | |
| 388 | ✗ | PetscCall(PetscLogFlops(1.0*(bv->nc+j))); | |
| 389 | } | ||
| 390 | 19071407725448540616 | PetscFunctionReturn(PETSC_SUCCESS); | |
| 391 | 14459749306481064776 | } | |
| 392 | |||
| 393 | /* | ||
| 394 | BV_SetValue_HIP - Sets value in row j (counted after the constraints) of column k | ||
| 395 | of the coefficients array | ||
| 396 | */ | ||
| 397 | 4511865581170845360 | PetscErrorCode BV_SetValue_HIP(BV bv,PetscInt j,PetscInt k,PetscScalar *h,PetscScalar value) | |
| 398 | { | ||
| 399 | 4511865581170845360 | PetscScalar *d_h,*a; | |
| 400 | |||
| 401 | PetscFunctionBegin; | ||
| 402 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
4511865581170845360 | if (!h) { |
| 403 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 2 times.
|
4511865581170845360 | PetscCall(VecHIPGetArray(bv->buffer,&a)); |
| 404 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 2 times.
|
4511865581170845360 | PetscCall(PetscLogGpuTimeBegin()); |
| 405 | 4511865581170845360 | d_h = a + k*(bv->nc+bv->m) + bv->nc+j; | |
| 406 |
3/8✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 2 times.
|
4511865581170845360 | PetscCallHIP(hipMemcpy(d_h,&value,sizeof(PetscScalar),hipMemcpyHostToDevice)); |
| 407 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 2 times.
|
4511865581170845360 | PetscCall(PetscLogCpuToGpu(sizeof(PetscScalar))); |
| 408 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 2 times.
|
4511865581170845360 | PetscCall(PetscLogGpuTimeEnd()); |
| 409 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 2 times.
|
4511865581170845360 | PetscCall(VecHIPRestoreArray(bv->buffer,&a)); |
| 410 | 4511865581170845360 | } else { /* cpu memory */ | |
| 411 | ✗ | h[bv->nc+j] = value; | |
| 412 | } | ||
| 413 | 32381622566248257872 | PetscFunctionReturn(PETSC_SUCCESS); | |
| 414 | 21760764407801886448 | } | |
| 415 | |||
| 416 | /* | ||
| 417 | BV_SquareSum_HIP - Returns the value h'*h, where h represents the contents of the | ||
| 418 | coefficients array (up to position j) | ||
| 419 | */ | ||
| 420 | 14459712507201182024 | PetscErrorCode BV_SquareSum_HIP(BV bv,PetscInt j,PetscScalar *h,PetscReal *sum) | |
| 421 | { | ||
| 422 | 14459712507201182024 | const PetscScalar *d_h; | |
| 423 | 14459712507201182024 | PetscScalar dot; | |
| 424 | 14459712507201182024 | PetscInt i; | |
| 425 | 14459712507201182024 | PetscHipBLASInt idx=0,one=1; | |
| 426 | 14459712507201182024 | hipblasHandle_t hipblashandle; | |
| 427 | |||
| 428 | PetscFunctionBegin; | ||
| 429 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
14459712507201182024 | if (!h) { |
| 430 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 2 times.
|
14459712507201182024 | PetscCall(PetscHIPBLASGetHandle(&hipblashandle)); |
| 431 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
14459712507201182024 | PetscCall(VecHIPGetArrayRead(bv->buffer,&d_h)); |
| 432 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
14459712507201182024 | PetscCall(PetscHipBLASIntCast(bv->nc+j,&idx)); |
| 433 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 2 times.
|
14459712507201182024 | PetscCall(PetscLogGpuTimeBegin()); |
| 434 |
3/8✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✓ Branch 6 taken 1 times.
✓ Branch 7 taken 2 times.
|
14459712507201182024 | PetscCallHIPBLAS(hipblasXdot(hipblashandle,idx,d_h,one,d_h,one,&dot)); |
| 435 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
14459712507201182024 | PetscCall(PetscLogGpuTimeEnd()); |
| 436 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
14459712507201182024 | PetscCall(PetscLogGpuFlops(2.0*(bv->nc+j))); |
| 437 | 14459712507201182024 | *sum = PetscRealPart(dot); | |
| 438 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 2 times.
|
14459712507201182024 | PetscCall(VecHIPRestoreArrayRead(bv->buffer,&d_h)); |
| 439 | 14459712507201182024 | } else { /* cpu memory */ | |
| 440 | ✗ | *sum = 0.0; | |
| 441 | ✗ | for (i=0;i<bv->nc+j;i++) *sum += PetscRealPart(h[i]*PetscConj(h[i])); | |
| 442 | ✗ | PetscCall(PetscLogFlops(2.0*(bv->nc+j))); | |
| 443 | } | ||
| 444 | 19071407725448540616 | PetscFunctionReturn(PETSC_SUCCESS); | |
| 445 | 14459749306481064776 | } | |
| 446 | |||
| 447 | /* pointwise multiplication */ | ||
| 448 | static __global__ void PointwiseMult_kernel(PetscInt xcount,PetscScalar *a,const PetscScalar *b,PetscInt n) | ||
| 449 | { | ||
| 450 | PetscInt x; | ||
| 451 | |||
| 452 | x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x; | ||
| 453 | if (x<n) a[x] *= PetscRealPart(b[x]); | ||
| 454 | } | ||
| 455 | |||
| 456 | /* pointwise division */ | ||
| 457 | static __global__ void PointwiseDiv_kernel(PetscInt xcount,PetscScalar *a,const PetscScalar *b,PetscInt n) | ||
| 458 | { | ||
| 459 | PetscInt x; | ||
| 460 | |||
| 461 | x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x; | ||
| 462 | if (x<n) a[x] /= PetscRealPart(b[x]); | ||
| 463 | } | ||
| 464 | |||
| 465 | /* | ||
| 466 | BV_ApplySignature_HIP - Computes the pointwise product h*omega, where h represents | ||
| 467 | the contents of the coefficients array (up to position j) and omega is the signature; | ||
| 468 | if inverse=TRUE then the operation is h/omega | ||
| 469 | */ | ||
| 470 | ✗ | PetscErrorCode BV_ApplySignature_HIP(BV bv,PetscInt j,PetscScalar *h,PetscBool inverse) | |
| 471 | { | ||
| 472 | ✗ | PetscScalar *d_h; | |
| 473 | ✗ | const PetscScalar *d_omega,*omega; | |
| 474 | ✗ | PetscInt i,xcount; | |
| 475 | ✗ | dim3 blocks3d, threads3d; | |
| 476 | |||
| 477 | PetscFunctionBegin; | ||
| 478 | ✗ | if (!(bv->nc+j)) PetscFunctionReturn(PETSC_SUCCESS); | |
| 479 | ✗ | if (!h) { | |
| 480 | ✗ | PetscCall(VecHIPGetArray(bv->buffer,&d_h)); | |
| 481 | ✗ | PetscCall(VecHIPGetArrayRead(bv->omega,&d_omega)); | |
| 482 | ✗ | PetscCall(SlepcKernelSetGrid1D(bv->nc+j,&blocks3d,&threads3d,&xcount)); | |
| 483 | ✗ | PetscCall(PetscLogGpuTimeBegin()); | |
| 484 | ✗ | if (inverse) { | |
| 485 | ✗ | for (i=0;i<xcount;i++) PointwiseDiv_kernel<<<blocks3d,threads3d,0,0>>>(i,d_h,d_omega,bv->nc+j); | |
| 486 | ✗ | } else { | |
| 487 | ✗ | for (i=0;i<xcount;i++) PointwiseMult_kernel<<<blocks3d,threads3d,0,0>>>(i,d_h,d_omega,bv->nc+j); | |
| 488 | } | ||
| 489 | ✗ | PetscCallHIP(hipGetLastError()); | |
| 490 | ✗ | PetscCall(PetscLogGpuTimeEnd()); | |
| 491 | ✗ | PetscCall(PetscLogGpuFlops(1.0*(bv->nc+j))); | |
| 492 | ✗ | PetscCall(VecHIPRestoreArrayRead(bv->omega,&d_omega)); | |
| 493 | ✗ | PetscCall(VecHIPRestoreArray(bv->buffer,&d_h)); | |
| 494 | ✗ | } else { | |
| 495 | ✗ | PetscCall(VecGetArrayRead(bv->omega,&omega)); | |
| 496 | ✗ | if (inverse) for (i=0;i<bv->nc+j;i++) h[i] /= PetscRealPart(omega[i]); | |
| 497 | ✗ | else for (i=0;i<bv->nc+j;i++) h[i] *= PetscRealPart(omega[i]); | |
| 498 | ✗ | PetscCall(VecRestoreArrayRead(bv->omega,&omega)); | |
| 499 | ✗ | PetscCall(PetscLogFlops(1.0*(bv->nc+j))); | |
| 500 | } | ||
| 501 | ✗ | PetscFunctionReturn(PETSC_SUCCESS); | |
| 502 | ✗ | } | |
| 503 | |||
| 504 | /* | ||
| 505 | BV_SquareRoot_HIP - Returns the square root of position j (counted after the constraints) | ||
| 506 | of the coefficients array | ||
| 507 | */ | ||
| 508 | 14459712507201182024 | PetscErrorCode BV_SquareRoot_HIP(BV bv,PetscInt j,PetscScalar *h,PetscReal *beta) | |
| 509 | { | ||
| 510 | 14459712507201182024 | const PetscScalar *d_h; | |
| 511 | 14459712507201182024 | PetscScalar hh; | |
| 512 | |||
| 513 | PetscFunctionBegin; | ||
| 514 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
14459712507201182024 | if (!h) { |
| 515 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 2 times.
|
14459712507201182024 | PetscCall(VecHIPGetArrayRead(bv->buffer,&d_h)); |
| 516 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 2 times.
|
14459712507201182024 | PetscCall(PetscLogGpuTimeBegin()); |
| 517 |
3/8✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✓ Branch 6 taken 1 times.
✓ Branch 7 taken 2 times.
|
14459712507201182024 | PetscCallHIP(hipMemcpy(&hh,d_h+bv->nc+j,sizeof(PetscScalar),hipMemcpyDeviceToHost)); |
| 518 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 2 times.
|
14459712507201182024 | PetscCall(PetscLogGpuToCpu(sizeof(PetscScalar))); |
| 519 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 2 times.
|
14459712507201182024 | PetscCall(PetscLogGpuTimeEnd()); |
| 520 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 2 times.
|
14459712507201182024 | PetscCall(BV_SafeSqrt(bv,hh,beta)); |
| 521 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 2 times.
|
14459712507201182024 | PetscCall(VecHIPRestoreArrayRead(bv->buffer,&d_h)); |
| 522 |
0/4✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
14459712507201182024 | } else PetscCall(BV_SafeSqrt(bv,h[bv->nc+j],beta)); |
| 523 | 19071407725448540616 | PetscFunctionReturn(PETSC_SUCCESS); | |
| 524 | 9848090887513588936 | } | |
| 525 | |||
| 526 | /* | ||
| 527 | BV_StoreCoefficients_HIP - Copy the contents of the coefficients array to an array dest | ||
| 528 | provided by the caller (only values from l to j are copied) | ||
| 529 | */ | ||
| 530 | 4511867954811371520 | PetscErrorCode BV_StoreCoefficients_HIP(BV bv,PetscInt j,PetscScalar *h,PetscScalar *dest) | |
| 531 | { | ||
| 532 | 4511867954811371520 | const PetscScalar *d_h,*d_a; | |
| 533 | 4511867954811371520 | PetscInt i; | |
| 534 | |||
| 535 | PetscFunctionBegin; | ||
| 536 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
4511867954811371520 | if (!h) { |
| 537 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 2 times.
|
4511867954811371520 | PetscCall(VecHIPGetArrayRead(bv->buffer,&d_a)); |
| 538 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
4511867954811371520 | PetscCall(PetscLogGpuTimeBegin()); |
| 539 | 4511867954811371520 | d_h = d_a + j*(bv->nc+bv->m)+bv->nc; | |
| 540 |
3/8✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 2 times.
|
4511867954811371520 | PetscCallHIP(hipMemcpy(dest-bv->l,d_h,(j-bv->l)*sizeof(PetscScalar),hipMemcpyDeviceToHost)); |
| 541 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 2 times.
|
4511867954811371520 | PetscCall(PetscLogGpuToCpu((j-bv->l)*sizeof(PetscScalar))); |
| 542 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 2 times.
|
4511867954811371520 | PetscCall(PetscLogGpuTimeEnd()); |
| 543 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
4511867954811371520 | PetscCall(VecHIPRestoreArrayRead(bv->buffer,&d_a)); |
| 544 | 4511867954811371520 | } else { | |
| 545 | ✗ | for (i=bv->l;i<j;i++) dest[i-bv->l] = h[bv->nc+i]; | |
| 546 | } | ||
| 547 | 32381620192607731712 | PetscFunctionReturn(PETSC_SUCCESS); | |
| 548 | 3713323445883240448 | } | |
| 549 |