Line | Branch | Exec | Source |
---|---|---|---|
1 | /* | ||
2 | - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | ||
3 | SLEPc - Scalable Library for Eigenvalue Problem Computations | ||
4 | Copyright (c) 2002-, Universitat Politecnica de Valencia, Spain | ||
5 | |||
6 | This file is part of SLEPc. | ||
7 | SLEPc is distributed under a 2-clause BSD license (see LICENSE). | ||
8 | - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | ||
9 | */ | ||
10 | /* | ||
11 | HIP-related code common to several BV impls | ||
12 | */ | ||
13 | |||
14 | #include <slepc/private/bvimpl.h> | ||
15 | #include <slepccupmblas.h> | ||
16 | |||
17 | #define BLOCKSIZE 64 | ||
18 | |||
19 | /* | ||
20 | C := alpha*A*B + beta*C | ||
21 | */ | ||
22 | ✗ | PetscErrorCode BVMult_BLAS_HIP(BV,PetscInt m_,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *d_A,PetscInt lda_,const PetscScalar *d_B,PetscInt ldb_,PetscScalar beta,PetscScalar *d_C,PetscInt ldc_) | |
23 | { | ||
24 | ✗ | PetscHipBLASInt m=0,n=0,k=0,lda=0,ldb=0,ldc=0; | |
25 | ✗ | hipblasHandle_t hipblashandle; | |
26 | |||
27 | PetscFunctionBegin; | ||
28 | ✗ | PetscCall(PetscHIPBLASGetHandle(&hipblashandle)); | |
29 | ✗ | PetscCall(PetscHipBLASIntCast(m_,&m)); | |
30 | ✗ | PetscCall(PetscHipBLASIntCast(n_,&n)); | |
31 | ✗ | PetscCall(PetscHipBLASIntCast(k_,&k)); | |
32 | ✗ | PetscCall(PetscHipBLASIntCast(lda_,&lda)); | |
33 | ✗ | PetscCall(PetscHipBLASIntCast(ldb_,&ldb)); | |
34 | ✗ | PetscCall(PetscHipBLASIntCast(ldc_,&ldc)); | |
35 | ✗ | PetscCall(PetscLogGpuTimeBegin()); | |
36 | ✗ | PetscCallHIPBLAS(hipblasXgemm(hipblashandle,HIPBLAS_OP_N,HIPBLAS_OP_N,m,n,k,&alpha,d_A,lda,d_B,ldb,&beta,d_C,ldc)); | |
37 | ✗ | PetscCall(PetscLogGpuTimeEnd()); | |
38 | ✗ | PetscCall(PetscLogGpuFlops(2.0*m*n*k)); | |
39 | ✗ | PetscFunctionReturn(PETSC_SUCCESS); | |
40 | ✗ | } | |
41 | |||
42 | /* | ||
43 | y := alpha*A*x + beta*y | ||
44 | */ | ||
45 | 8200246420960204592 | PetscErrorCode BVMultVec_BLAS_HIP(BV,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *d_A,PetscInt lda_,const PetscScalar *d_x,PetscScalar beta,PetscScalar *d_y) | |
46 | { | ||
47 | 8200246420960204592 | PetscHipBLASInt n=0,k=0,lda=0,one=1; | |
48 | 8200246420960204592 | hipblasHandle_t hipblashandle; | |
49 | |||
50 | PetscFunctionBegin; | ||
51 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 2 times.
|
8200246420960204592 | PetscCall(PetscHIPBLASGetHandle(&hipblashandle)); |
52 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 2 times.
|
8200246420960204592 | PetscCall(PetscHipBLASIntCast(n_,&n)); |
53 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 2 times.
|
8200246420960204592 | PetscCall(PetscHipBLASIntCast(k_,&k)); |
54 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 2 times.
|
8200246420960204592 | PetscCall(PetscHipBLASIntCast(lda_,&lda)); |
55 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 2 times.
|
8200246420960204592 | PetscCall(PetscLogGpuTimeBegin()); |
56 |
3/8✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✓ Branch 6 taken 1 times.
✓ Branch 7 taken 2 times.
|
8200246420960204592 | PetscCallHIPBLAS(hipblasXgemv(hipblashandle,HIPBLAS_OP_N,n,k,&alpha,d_A,lda,d_x,one,&beta,d_y,one)); |
57 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 2 times.
|
8200246420960204592 | PetscCall(PetscLogGpuTimeEnd()); |
58 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 2 times.
|
8200246420960204592 | PetscCall(PetscLogGpuFlops(2.0*n*k)); |
59 | 8200246420960204592 | PetscFunctionReturn(PETSC_SUCCESS); | |
60 | 8170016003200158160 | } | |
61 | |||
62 | /* | ||
63 | A(:,s:e-1) := A*B(:,s:e-1) | ||
64 | */ | ||
65 | 1981614622368595968 | PetscErrorCode BVMultInPlace_BLAS_HIP(BV,PetscInt m_,PetscInt k_,PetscInt s,PetscInt e,PetscScalar *d_A,PetscInt lda_,const PetscScalar *d_B,PetscInt ldb_,PetscBool btrans) | |
66 | { | ||
67 | 1981614622368595968 | const PetscScalar *d_B1; | |
68 | 1981614622368595968 | PetscScalar *d_work,sone=1.0,szero=0.0; | |
69 | 1981614622368595968 | PetscHipBLASInt m=0,n=0,k=0,l=0,lda=0,ldb=0,bs=BLOCKSIZE; | |
70 | 1981614622368595968 | size_t freemem,totmem; | |
71 | 1981614622368595968 | hipblasHandle_t hipblashandle; | |
72 | 1981614622368595968 | hipblasOperation_t bt; | |
73 | |||
74 | PetscFunctionBegin; | ||
75 |
2/4✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 1 times.
|
1981614622368595968 | PetscCall(PetscHIPBLASGetHandle(&hipblashandle)); |
76 |
2/4✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 1 times.
|
1981614622368595968 | PetscCall(PetscHipBLASIntCast(m_,&m)); |
77 |
2/4✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 1 times.
|
1981614622368595968 | PetscCall(PetscHipBLASIntCast(e-s,&n)); |
78 |
2/4✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 1 times.
|
1981614622368595968 | PetscCall(PetscHipBLASIntCast(k_,&k)); |
79 |
2/4✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 1 times.
|
1981614622368595968 | PetscCall(PetscHipBLASIntCast(lda_,&lda)); |
80 |
2/4✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 1 times.
|
1981614622368595968 | PetscCall(PetscHipBLASIntCast(ldb_,&ldb)); |
81 |
2/4✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 1 times.
|
1981614622368595968 | PetscCall(PetscLogGpuTimeBegin()); |
82 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
|
1981614622368595968 | if (PetscUnlikely(btrans)) { |
83 | ✗ | d_B1 = d_B+s; | |
84 | ✗ | bt = HIPBLAS_OP_C; | |
85 | ✗ | } else { | |
86 | 1981614622368595968 | d_B1 = d_B+s*ldb; | |
87 | 1981614622368595968 | bt = HIPBLAS_OP_N; | |
88 | } | ||
89 | /* try to allocate the whole matrix */ | ||
90 |
2/8✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 1 times.
|
1981614622368595968 | PetscCallHIP(hipMemGetInfo(&freemem,&totmem)); |
91 |
1/2✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
|
1981614622368595968 | if (freemem>=lda*n*sizeof(PetscScalar)) { |
92 |
2/8✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 1 times.
|
1981614622368595968 | PetscCallHIP(hipMalloc((void**)&d_work,lda*n*sizeof(PetscScalar))); |
93 |
2/8✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 1 times.
|
1981614622368595968 | PetscCallHIPBLAS(hipblasXgemm(hipblashandle,HIPBLAS_OP_N,bt,m,n,k,&sone,d_A,lda,d_B1,ldb,&szero,d_work,lda)); |
94 |
2/8✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 1 times.
|
1981614622368595968 | PetscCallHIP(hipMemcpy2D(d_A+s*lda,lda*sizeof(PetscScalar),d_work,lda*sizeof(PetscScalar),m*sizeof(PetscScalar),n,hipMemcpyDeviceToDevice)); |
95 | 1981614622368595968 | } else { | |
96 | ✗ | PetscCall(PetscHipBLASIntCast(freemem/(m*sizeof(PetscScalar)),&bs)); | |
97 | ✗ | PetscCallHIP(hipMalloc((void**)&d_work,bs*n*sizeof(PetscScalar))); | |
98 | ✗ | PetscCall(PetscHipBLASIntCast(m % bs,&l)); | |
99 | ✗ | if (l) { | |
100 | ✗ | PetscCallHIPBLAS(hipblasXgemm(hipblashandle,HIPBLAS_OP_N,bt,l,n,k,&sone,d_A,lda,d_B1,ldb,&szero,d_work,l)); | |
101 | ✗ | PetscCallHIP(hipMemcpy2D(d_A+s*lda,lda*sizeof(PetscScalar),d_work,l*sizeof(PetscScalar),l*sizeof(PetscScalar),n,hipMemcpyDeviceToDevice)); | |
102 | ✗ | } | |
103 | ✗ | for (;l<m;l+=bs) { | |
104 | ✗ | PetscCallHIPBLAS(hipblasXgemm(hipblashandle,HIPBLAS_OP_N,bt,bs,n,k,&sone,d_A+l,lda,d_B1,ldb,&szero,d_work,bs)); | |
105 | ✗ | PetscCallHIP(hipMemcpy2D(d_A+l+s*lda,lda*sizeof(PetscScalar),d_work,bs*sizeof(PetscScalar),bs*sizeof(PetscScalar),n,hipMemcpyDeviceToDevice)); | |
106 | ✗ | } | |
107 | } | ||
108 |
2/4✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 1 times.
|
1981614622368595968 | PetscCall(PetscLogGpuTimeEnd()); |
109 |
2/8✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 1 times.
|
1981614622368595968 | PetscCallHIP(hipFree(d_work)); |
110 |
2/4✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 1 times.
|
1981614622368595968 | PetscCall(PetscLogGpuFlops(2.0*m*n*k)); |
111 | 1981614622368595968 | PetscFunctionReturn(PETSC_SUCCESS); | |
112 | 1981614622368595968 | } | |
113 | |||
114 | /* | ||
115 | B := alpha*A + beta*B | ||
116 | */ | ||
117 | ✗ | PetscErrorCode BVAXPY_BLAS_HIP(BV,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *d_A,PetscInt lda_,PetscScalar beta,PetscScalar *d_B,PetscInt ldb_) | |
118 | { | ||
119 | ✗ | PetscHipBLASInt n=0,k=0,lda=0,ldb=0; | |
120 | ✗ | hipblasHandle_t hipblashandle; | |
121 | |||
122 | PetscFunctionBegin; | ||
123 | ✗ | PetscCall(PetscHIPBLASGetHandle(&hipblashandle)); | |
124 | ✗ | PetscCall(PetscHipBLASIntCast(n_,&n)); | |
125 | ✗ | PetscCall(PetscHipBLASIntCast(k_,&k)); | |
126 | ✗ | PetscCall(PetscHipBLASIntCast(lda_,&lda)); | |
127 | ✗ | PetscCall(PetscHipBLASIntCast(ldb_,&ldb)); | |
128 | ✗ | PetscCall(PetscLogGpuTimeBegin()); | |
129 | ✗ | PetscCallHIPBLAS(hipblasXgeam(hipblashandle,HIPBLAS_OP_N,HIPBLAS_OP_N,n,k,&alpha,d_A,lda,&beta,d_B,ldb,d_B,ldb)); | |
130 | ✗ | PetscCall(PetscLogGpuTimeEnd()); | |
131 | ✗ | PetscCall(PetscLogGpuFlops((beta==(PetscScalar)1.0)?2.0*n*k:3.0*n*k)); | |
132 | ✗ | PetscFunctionReturn(PETSC_SUCCESS); | |
133 | ✗ | } | |
134 | |||
135 | /* | ||
136 | C := A'*B | ||
137 | |||
138 | C is a CPU array | ||
139 | */ | ||
140 | ✗ | PetscErrorCode BVDot_BLAS_HIP(BV bv,PetscInt m_,PetscInt n_,PetscInt k_,const PetscScalar *d_A,PetscInt lda_,const PetscScalar *d_B,PetscInt ldb_,PetscScalar *C,PetscInt ldc_,PetscBool mpi) | |
141 | { | ||
142 | ✗ | PetscScalar *d_work,sone=1.0,szero=0.0,*CC; | |
143 | ✗ | PetscInt j; | |
144 | ✗ | PetscHipBLASInt m=0,n=0,k=0,lda=0,ldb=0,ldc=0; | |
145 | ✗ | PetscMPIInt len; | |
146 | ✗ | hipblasHandle_t hipblashandle; | |
147 | |||
148 | PetscFunctionBegin; | ||
149 | ✗ | PetscCall(PetscHIPBLASGetHandle(&hipblashandle)); | |
150 | ✗ | PetscCall(PetscHipBLASIntCast(m_,&m)); | |
151 | ✗ | PetscCall(PetscHipBLASIntCast(n_,&n)); | |
152 | ✗ | PetscCall(PetscHipBLASIntCast(k_,&k)); | |
153 | ✗ | PetscCall(PetscHipBLASIntCast(lda_,&lda)); | |
154 | ✗ | PetscCall(PetscHipBLASIntCast(ldb_,&ldb)); | |
155 | ✗ | PetscCall(PetscHipBLASIntCast(ldc_,&ldc)); | |
156 | ✗ | PetscCallHIP(hipMalloc((void**)&d_work,m*n*sizeof(PetscScalar))); | |
157 | ✗ | if (mpi) { | |
158 | ✗ | if (ldc==m) { | |
159 | ✗ | PetscCall(BVAllocateWork_Private(bv,m*n)); | |
160 | ✗ | if (k) { | |
161 | ✗ | PetscCall(PetscLogGpuTimeBegin()); | |
162 | ✗ | PetscCallHIPBLAS(hipblasXgemm(hipblashandle,HIPBLAS_OP_C,HIPBLAS_OP_N,m,n,k,&sone,d_A,lda,d_B,ldb,&szero,d_work,ldc)); | |
163 | ✗ | PetscCall(PetscLogGpuTimeEnd()); | |
164 | ✗ | PetscCallHIP(hipMemcpy(bv->work,d_work,m*n*sizeof(PetscScalar),hipMemcpyDeviceToHost)); | |
165 | ✗ | PetscCall(PetscLogGpuToCpu(m*n*sizeof(PetscScalar))); | |
166 | ✗ | } else PetscCall(PetscArrayzero(bv->work,m*n)); | |
167 | ✗ | PetscCall(PetscMPIIntCast(m*n,&len)); | |
168 | ✗ | PetscCallMPI(MPIU_Allreduce(bv->work,C,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv))); | |
169 | ✗ | } else { | |
170 | ✗ | PetscCall(BVAllocateWork_Private(bv,2*m*n)); | |
171 | ✗ | CC = bv->work+m*n; | |
172 | ✗ | if (k) { | |
173 | ✗ | PetscCall(PetscLogGpuTimeBegin()); | |
174 | ✗ | PetscCallHIPBLAS(hipblasXgemm(hipblashandle,HIPBLAS_OP_C,HIPBLAS_OP_N,m,n,k,&sone,d_A,lda,d_B,ldb,&szero,d_work,m)); | |
175 | ✗ | PetscCall(PetscLogGpuTimeEnd()); | |
176 | ✗ | PetscCallHIP(hipMemcpy(bv->work,d_work,m*n*sizeof(PetscScalar),hipMemcpyDeviceToHost)); | |
177 | ✗ | PetscCall(PetscLogGpuToCpu(m*n*sizeof(PetscScalar))); | |
178 | ✗ | } else PetscCall(PetscArrayzero(bv->work,m*n)); | |
179 | ✗ | PetscCall(PetscMPIIntCast(m*n,&len)); | |
180 | ✗ | PetscCallMPI(MPIU_Allreduce(bv->work,CC,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv))); | |
181 | ✗ | for (j=0;j<n;j++) PetscCall(PetscArraycpy(C+j*ldc,CC+j*m,m)); | |
182 | } | ||
183 | ✗ | } else { | |
184 | ✗ | if (k) { | |
185 | ✗ | PetscCall(BVAllocateWork_Private(bv,m*n)); | |
186 | ✗ | PetscCall(PetscLogGpuTimeBegin()); | |
187 | ✗ | PetscCallHIPBLAS(hipblasXgemm(hipblashandle,HIPBLAS_OP_C,HIPBLAS_OP_N,m,n,k,&sone,d_A,lda,d_B,ldb,&szero,d_work,m)); | |
188 | ✗ | PetscCall(PetscLogGpuTimeEnd()); | |
189 | ✗ | PetscCallHIP(hipMemcpy(bv->work,d_work,m*n*sizeof(PetscScalar),hipMemcpyDeviceToHost)); | |
190 | ✗ | PetscCall(PetscLogGpuToCpu(m*n*sizeof(PetscScalar))); | |
191 | ✗ | for (j=0;j<n;j++) PetscCall(PetscArraycpy(C+j*ldc,bv->work+j*m,m)); | |
192 | ✗ | } | |
193 | } | ||
194 | ✗ | PetscCallHIP(hipFree(d_work)); | |
195 | ✗ | PetscCall(PetscLogGpuFlops(2.0*m*n*k)); | |
196 | ✗ | PetscFunctionReturn(PETSC_SUCCESS); | |
197 | ✗ | } | |
198 | |||
199 | /* | ||
200 | y := A'*x | ||
201 | |||
202 | y is a CPU array, if NULL bv->buffer is used as a workspace | ||
203 | */ | ||
204 | 3576595478344653636 | PetscErrorCode BVDotVec_BLAS_HIP(BV bv,PetscInt n_,PetscInt k_,const PetscScalar *d_A,PetscInt lda_,const PetscScalar *d_x,PetscScalar *y,PetscBool mpi) | |
205 | { | ||
206 | 3576595478344653636 | PetscScalar *d_work,szero=0.0,sone=1.0,*yy; | |
207 | 3576595478344653636 | PetscHipBLASInt n=0,k=0,lda=0,one=1; | |
208 | 3576595478344653636 | PetscMPIInt len; | |
209 | 3576595478344653636 | hipblasHandle_t hipblashandle; | |
210 | |||
211 | PetscFunctionBegin; | ||
212 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
3576595478344653636 | PetscCall(PetscHIPBLASGetHandle(&hipblashandle)); |
213 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
3576595478344653636 | PetscCall(PetscHipBLASIntCast(n_,&n)); |
214 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
3576595478344653636 | PetscCall(PetscHipBLASIntCast(k_,&k)); |
215 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
3576595478344653636 | PetscCall(PetscHipBLASIntCast(lda_,&lda)); |
216 |
3/6✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 2 times.
|
3576595478344653636 | if (!y) PetscCall(VecHIPGetArrayWrite(bv->buffer,&d_work)); |
217 | ✗ | else PetscCallHIP(hipMalloc((void**)&d_work,k*sizeof(PetscScalar))); | |
218 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
3576595478344653636 | if (mpi) { |
219 | ✗ | PetscCall(BVAllocateWork_Private(bv,k)); | |
220 | ✗ | if (n) { | |
221 | ✗ | PetscCall(PetscLogGpuTimeBegin()); | |
222 | ✗ | PetscCallHIPBLAS(hipblasXgemv(hipblashandle,HIPBLAS_OP_C,n,k,&sone,d_A,lda,d_x,one,&szero,d_work,one)); | |
223 | ✗ | PetscCall(PetscLogGpuTimeEnd()); | |
224 | ✗ | PetscCallHIP(hipMemcpy(bv->work,d_work,k*sizeof(PetscScalar),hipMemcpyDeviceToHost)); | |
225 | ✗ | PetscCall(PetscLogGpuToCpu(k*sizeof(PetscScalar))); | |
226 | ✗ | } else PetscCall(PetscArrayzero(bv->work,k)); | |
227 | /* reduction */ | ||
228 | ✗ | PetscCall(PetscMPIIntCast(k,&len)); | |
229 | ✗ | if (!y) { | |
230 | ✗ | if (use_gpu_aware_mpi) { /* case 1: reduce on GPU using a temporary buffer */ | |
231 | ✗ | PetscCallHIP(hipMalloc((void**)&yy,k*sizeof(PetscScalar))); | |
232 | ✗ | PetscCallMPI(MPIU_Allreduce(d_work,yy,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv))); | |
233 | ✗ | PetscCallHIP(hipMemcpy(d_work,yy,k*sizeof(PetscScalar),hipMemcpyDeviceToDevice)); | |
234 | ✗ | PetscCallHIP(hipFree(yy)); | |
235 | ✗ | } else { /* case 2: reduce on CPU, copy result back to GPU */ | |
236 | ✗ | PetscCall(BVAllocateWork_Private(bv,2*k)); | |
237 | ✗ | yy = bv->work+k; | |
238 | ✗ | PetscCallHIP(hipMemcpy(bv->work,d_work,k*sizeof(PetscScalar),hipMemcpyDeviceToHost)); | |
239 | ✗ | PetscCall(PetscLogGpuToCpu(k*sizeof(PetscScalar))); | |
240 | ✗ | PetscCallMPI(MPIU_Allreduce(bv->work,yy,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv))); | |
241 | ✗ | PetscCallHIP(hipMemcpy(d_work,yy,k*sizeof(PetscScalar),hipMemcpyHostToDevice)); | |
242 | ✗ | PetscCall(PetscLogCpuToGpu(k*sizeof(PetscScalar))); | |
243 | } | ||
244 | ✗ | PetscCall(VecHIPRestoreArrayWrite(bv->buffer,&d_work)); | |
245 | ✗ | } else { /* case 3: user-provided array y, reduce on CPU */ | |
246 | ✗ | PetscCallHIP(hipFree(d_work)); | |
247 | ✗ | PetscCallMPI(MPIU_Allreduce(bv->work,y,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv))); | |
248 | } | ||
249 | ✗ | } else { | |
250 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
3576595478344653636 | if (n) { |
251 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
3576595478344653636 | PetscCall(PetscLogGpuTimeBegin()); |
252 |
2/8✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 2 times.
|
3576595478344653636 | PetscCallHIPBLAS(hipblasXgemv(hipblashandle,HIPBLAS_OP_C,n,k,&sone,d_A,lda,d_x,one,&szero,d_work,one)); |
253 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
3576595478344653636 | PetscCall(PetscLogGpuTimeEnd()); |
254 | 3576595478344653636 | } | |
255 |
3/6✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 2 times.
|
3576595478344653636 | if (!y) PetscCall(VecHIPRestoreArrayWrite(bv->buffer,&d_work)); |
256 | else { | ||
257 | ✗ | PetscCallHIP(hipMemcpy(y,d_work,k*sizeof(PetscScalar),hipMemcpyDeviceToHost)); | |
258 | ✗ | PetscCall(PetscLogGpuToCpu(k*sizeof(PetscScalar))); | |
259 | ✗ | PetscCallHIP(hipFree(d_work)); | |
260 | } | ||
261 | } | ||
262 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
3576595478344653636 | PetscCall(PetscLogGpuFlops(2.0*n*k)); |
263 | 3576595478344653636 | PetscFunctionReturn(PETSC_SUCCESS); | |
264 | 3576595478344653636 | } | |
265 | |||
266 | /* | ||
267 | Scale n scalars | ||
268 | */ | ||
269 | 4881539584850144524 | PetscErrorCode BVScale_BLAS_HIP(BV,PetscInt n_,PetscScalar *d_A,PetscScalar alpha) | |
270 | { | ||
271 | 4881539584850144524 | PetscHipBLASInt n=0,one=1; | |
272 | 4881539584850144524 | hipblasHandle_t hipblashandle; | |
273 | |||
274 | PetscFunctionBegin; | ||
275 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 2 times.
|
4881539584850144524 | PetscCall(PetscHipBLASIntCast(n_,&n)); |
276 |
1/10✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
|
4881539584850144524 | if (PetscUnlikely(alpha == (PetscScalar)0.0)) PetscCallHIP(hipMemset(d_A,0,n*sizeof(PetscScalar))); |
277 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
4881539584850144524 | else if (alpha != (PetscScalar)1.0) { |
278 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 2 times.
|
4881539584850144524 | PetscCall(PetscHIPBLASGetHandle(&hipblashandle)); |
279 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 2 times.
|
4881539584850144524 | PetscCall(PetscLogGpuTimeBegin()); |
280 |
3/8✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✓ Branch 6 taken 1 times.
✓ Branch 7 taken 2 times.
|
4881539584850144524 | PetscCallHIPBLAS(hipblasXscal(hipblashandle,n,&alpha,d_A,one)); |
281 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 2 times.
|
4881539584850144524 | PetscCall(PetscLogGpuTimeEnd()); |
282 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 2 times.
|
4881539584850144524 | PetscCall(PetscLogGpuFlops(1.0*n)); |
283 | 4881539584850144524 | } | |
284 | 13565218636481703924 | PetscFunctionReturn(PETSC_SUCCESS); | |
285 | 1643381673510846076 | } | |
286 | |||
287 | /* | ||
288 | Compute 2-norm of vector consisting of n scalars | ||
289 | */ | ||
290 | 396 | PetscErrorCode BVNorm_BLAS_HIP(BV,PetscInt n_,const PetscScalar *d_A,PetscReal *nrm) | |
291 | { | ||
292 | 396 | PetscHipBLASInt n=0,one=1; | |
293 | 396 | hipblasHandle_t hipblashandle; | |
294 | |||
295 | PetscFunctionBegin; | ||
296 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
396 | PetscCall(PetscHipBLASIntCast(n_,&n)); |
297 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
396 | PetscCall(PetscHIPBLASGetHandle(&hipblashandle)); |
298 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
396 | PetscCall(PetscLogGpuTimeBegin()); |
299 |
2/8✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 2 times.
|
396 | PetscCallHIPBLAS(hipblasXnrm2(hipblashandle,n,d_A,one,nrm)); |
300 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
396 | PetscCall(PetscLogGpuTimeEnd()); |
301 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
396 | PetscCall(PetscLogGpuFlops(2.0*n)); |
302 | 396 | PetscFunctionReturn(PETSC_SUCCESS); | |
303 | 396 | } | |
304 | |||
305 | /* | ||
306 | Normalize the columns of A | ||
307 | */ | ||
308 | 5107089674019536896 | PetscErrorCode BVNormalize_BLAS_HIP(BV,PetscInt m_,PetscInt n_,PetscScalar *d_A,PetscInt lda_,PetscScalar *eigi) | |
309 | { | ||
310 | 5107089674019536896 | PetscInt j,k; | |
311 | 5107089674019536896 | PetscReal nrm,nrm1; | |
312 | 5107089674019536896 | PetscScalar alpha; | |
313 | 5107089674019536896 | PetscHipBLASInt m=0,one=1; | |
314 | 5107089674019536896 | hipblasHandle_t hipblashandle; | |
315 | |||
316 | PetscFunctionBegin; | ||
317 |
2/4✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 1 times.
|
5107089674019536896 | PetscCall(PetscHipBLASIntCast(m_,&m)); |
318 |
2/4✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 1 times.
|
5107089674019536896 | PetscCall(PetscHIPBLASGetHandle(&hipblashandle)); |
319 |
2/4✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 1 times.
|
5107089674019536896 | PetscCall(PetscLogGpuTimeBegin()); |
320 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
|
7088704296388132864 | for (j=0;j<n_;j++) { |
321 | 1981614622368595968 | k = 1; | |
322 | #if !defined(PETSC_USE_COMPLEX) | ||
323 |
2/4✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✗ Branch 3 not taken.
|
1981614622368595968 | if (eigi && eigi[j] != 0.0) k = 2; |
324 | #endif | ||
325 |
2/8✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 1 times.
|
1981614622368595968 | PetscCallHIPBLAS(hipblasXnrm2(hipblashandle,m,d_A+j*lda_,one,&nrm)); |
326 |
1/2✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
|
1981614622368595968 | if (k==2) { |
327 | ✗ | PetscCallHIPBLAS(hipblasXnrm2(hipblashandle,m,d_A+(j+1)*lda_,one,&nrm1)); | |
328 | ✗ | nrm = SlepcAbs(nrm,nrm1); | |
329 | ✗ | } | |
330 | 1981614622368595968 | alpha = 1.0/nrm; | |
331 |
2/8✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 1 times.
|
1981614622368595968 | PetscCallHIPBLAS(hipblasXscal(hipblashandle,m,&alpha,d_A+j*lda_,one)); |
332 |
1/2✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
|
1981614622368595968 | if (k==2) { |
333 | ✗ | PetscCallHIPBLAS(hipblasXscal(hipblashandle,m,&alpha,d_A+(j+1)*lda_,one)); | |
334 | ✗ | j++; | |
335 | ✗ | } | |
336 | 1981614622368595968 | } | |
337 |
2/4✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 1 times.
|
5107089674019536896 | PetscCall(PetscLogGpuTimeEnd()); |
338 |
2/4✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 1 times.
|
5107089674019536896 | PetscCall(PetscLogGpuFlops(3.0*m*n_)); |
339 | 5107089674019536896 | PetscFunctionReturn(PETSC_SUCCESS); | |
340 | 5107089674019536896 | } | |
341 | |||
342 | /* | ||
343 | BV_CleanCoefficients_HIP - Sets to zero all entries of column j of the bv buffer | ||
344 | */ | ||
345 | 4881539469691333528 | PetscErrorCode BV_CleanCoefficients_HIP(BV bv,PetscInt j,PetscScalar *h) | |
346 | { | ||
347 | 4881539469691333528 | PetscScalar *d_hh,*d_a; | |
348 | 4881539469691333528 | PetscInt i; | |
349 | |||
350 | PetscFunctionBegin; | ||
351 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
4881539469691333528 | if (!h) { |
352 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 2 times.
|
4881539469691333528 | PetscCall(VecHIPGetArray(bv->buffer,&d_a)); |
353 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
4881539469691333528 | PetscCall(PetscLogGpuTimeBegin()); |
354 | 4881539469691333528 | d_hh = d_a + j*(bv->nc+bv->m); | |
355 |
3/8✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✓ Branch 6 taken 1 times.
✓ Branch 7 taken 2 times.
|
4881539469691333528 | PetscCallHIP(hipMemset(d_hh,0,(bv->nc+j)*sizeof(PetscScalar))); |
356 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 2 times.
|
4881539469691333528 | PetscCall(PetscLogGpuTimeEnd()); |
357 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 2 times.
|
4881539469691333528 | PetscCall(VecHIPRestoreArray(bv->buffer,&d_a)); |
358 | 4881539469691333528 | } else { /* cpu memory */ | |
359 | ✗ | for (i=0;i<bv->nc+j;i++) h[i] = 0.0; | |
360 | } | ||
361 | 13565218519712280168 | PetscFunctionReturn(PETSC_SUCCESS); | |
362 | 2722767522356016856 | } | |
363 | |||
364 | /* | ||
365 | BV_AddCoefficients_HIP - Add the contents of the scratch (0-th column) of the bv buffer | ||
366 | into column j of the bv buffer | ||
367 | */ | ||
368 | 3576595478344653636 | PetscErrorCode BV_AddCoefficients_HIP(BV bv,PetscInt j,PetscScalar *h,PetscScalar *c) | |
369 | { | ||
370 | 3576595478344653636 | PetscScalar *d_h,*d_c,sone=1.0; | |
371 | 3576595478344653636 | PetscInt i; | |
372 | 3576595478344653636 | PetscHipBLASInt idx=0,one=1; | |
373 | 3576595478344653636 | hipblasHandle_t hipblashandle; | |
374 | |||
375 | PetscFunctionBegin; | ||
376 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
3576595478344653636 | if (!h) { |
377 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
3576595478344653636 | PetscCall(PetscHIPBLASGetHandle(&hipblashandle)); |
378 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
3576595478344653636 | PetscCall(VecHIPGetArray(bv->buffer,&d_c)); |
379 | 3576595478344653636 | d_h = d_c + j*(bv->nc+bv->m); | |
380 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
3576595478344653636 | PetscCall(PetscHipBLASIntCast(bv->nc+j,&idx)); |
381 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
3576595478344653636 | PetscCall(PetscLogGpuTimeBegin()); |
382 |
2/8✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 2 times.
|
3576595478344653636 | PetscCallHIPBLAS(hipblasXaxpy(hipblashandle,idx,&sone,d_c,one,d_h,one)); |
383 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
3576595478344653636 | PetscCall(PetscLogGpuTimeEnd()); |
384 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
3576595478344653636 | PetscCall(PetscLogGpuFlops(1.0*(bv->nc+j))); |
385 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
3576595478344653636 | PetscCall(VecHIPRestoreArray(bv->buffer,&d_c)); |
386 | 3576595478344653636 | } else { /* cpu memory */ | |
387 | ✗ | for (i=0;i<bv->nc+j;i++) h[i] += c[i]; | |
388 | ✗ | PetscCall(PetscLogFlops(1.0*(bv->nc+j))); | |
389 | } | ||
390 | 3576595478344653636 | PetscFunctionReturn(PETSC_SUCCESS); | |
391 | 3576595478344653636 | } | |
392 | |||
393 | /* | ||
394 | BV_SetValue_HIP - Sets value in row j (counted after the constraints) of column k | ||
395 | of the coefficients array | ||
396 | */ | ||
397 | 4881539469691333528 | PetscErrorCode BV_SetValue_HIP(BV bv,PetscInt j,PetscInt k,PetscScalar *h,PetscScalar value) | |
398 | { | ||
399 | 4881539469691333528 | PetscScalar *d_h,*a; | |
400 | |||
401 | PetscFunctionBegin; | ||
402 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
4881539469691333528 | if (!h) { |
403 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 2 times.
|
4881539469691333528 | PetscCall(VecHIPGetArray(bv->buffer,&a)); |
404 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 2 times.
|
4881539469691333528 | PetscCall(PetscLogGpuTimeBegin()); |
405 | 4881539469691333528 | d_h = a + k*(bv->nc+bv->m) + bv->nc+j; | |
406 |
3/8✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✓ Branch 6 taken 1 times.
✓ Branch 7 taken 2 times.
|
4881539469691333528 | PetscCallHIP(hipMemcpy(d_h,&value,sizeof(PetscScalar),hipMemcpyHostToDevice)); |
407 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 2 times.
|
4881539469691333528 | PetscCall(PetscLogCpuToGpu(sizeof(PetscScalar))); |
408 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 2 times.
|
4881539469691333528 | PetscCall(PetscLogGpuTimeEnd()); |
409 |
3/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 2 times.
|
4881539469691333528 | PetscCall(VecHIPRestoreArray(bv->buffer,&a)); |
410 | 4881539469691333528 | } else { /* cpu memory */ | |
411 | ✗ | h[bv->nc+j] = value; | |
412 | } | ||
413 | 13565218519712280168 | PetscFunctionReturn(PETSC_SUCCESS); | |
414 | 1643381548688358520 | } | |
415 | |||
416 | /* | ||
417 | BV_SquareSum_HIP - Returns the value h'*h, where h represents the contents of the | ||
418 | coefficients array (up to position j) | ||
419 | */ | ||
420 | 3576595478344653636 | PetscErrorCode BV_SquareSum_HIP(BV bv,PetscInt j,PetscScalar *h,PetscReal *sum) | |
421 | { | ||
422 | 3576595478344653636 | const PetscScalar *d_h; | |
423 | 3576595478344653636 | PetscScalar dot; | |
424 | 3576595478344653636 | PetscInt i; | |
425 | 3576595478344653636 | PetscHipBLASInt idx=0,one=1; | |
426 | 3576595478344653636 | hipblasHandle_t hipblashandle; | |
427 | |||
428 | PetscFunctionBegin; | ||
429 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
3576595478344653636 | if (!h) { |
430 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
3576595478344653636 | PetscCall(PetscHIPBLASGetHandle(&hipblashandle)); |
431 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
3576595478344653636 | PetscCall(VecHIPGetArrayRead(bv->buffer,&d_h)); |
432 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
3576595478344653636 | PetscCall(PetscHipBLASIntCast(bv->nc+j,&idx)); |
433 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
3576595478344653636 | PetscCall(PetscLogGpuTimeBegin()); |
434 |
2/8✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 2 times.
|
3576595478344653636 | PetscCallHIPBLAS(hipblasXdot(hipblashandle,idx,d_h,one,d_h,one,&dot)); |
435 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
3576595478344653636 | PetscCall(PetscLogGpuTimeEnd()); |
436 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
3576595478344653636 | PetscCall(PetscLogGpuFlops(2.0*(bv->nc+j))); |
437 | 3576595478344653636 | *sum = PetscRealPart(dot); | |
438 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
3576595478344653636 | PetscCall(VecHIPRestoreArrayRead(bv->buffer,&d_h)); |
439 | 3576595478344653636 | } else { /* cpu memory */ | |
440 | ✗ | *sum = 0.0; | |
441 | ✗ | for (i=0;i<bv->nc+j;i++) *sum += PetscRealPart(h[i]*PetscConj(h[i])); | |
442 | ✗ | PetscCall(PetscLogFlops(2.0*(bv->nc+j))); | |
443 | } | ||
444 | 3576595478344653636 | PetscFunctionReturn(PETSC_SUCCESS); | |
445 | 3576595478344653636 | } | |
446 | |||
447 | /* pointwise multiplication */ | ||
448 | static __global__ void PointwiseMult_kernel(PetscInt xcount,PetscScalar *a,const PetscScalar *b,PetscInt n) | ||
449 | { | ||
450 | PetscInt x; | ||
451 | |||
452 | x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x; | ||
453 | if (x<n) a[x] *= PetscRealPart(b[x]); | ||
454 | } | ||
455 | |||
456 | /* pointwise division */ | ||
457 | static __global__ void PointwiseDiv_kernel(PetscInt xcount,PetscScalar *a,const PetscScalar *b,PetscInt n) | ||
458 | { | ||
459 | PetscInt x; | ||
460 | |||
461 | x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x; | ||
462 | if (x<n) a[x] /= PetscRealPart(b[x]); | ||
463 | } | ||
464 | |||
465 | /* | ||
466 | BV_ApplySignature_HIP - Computes the pointwise product h*omega, where h represents | ||
467 | the contents of the coefficients array (up to position j) and omega is the signature; | ||
468 | if inverse=TRUE then the operation is h/omega | ||
469 | */ | ||
470 | ✗ | PetscErrorCode BV_ApplySignature_HIP(BV bv,PetscInt j,PetscScalar *h,PetscBool inverse) | |
471 | { | ||
472 | ✗ | PetscScalar *d_h; | |
473 | ✗ | const PetscScalar *d_omega,*omega; | |
474 | ✗ | PetscInt i,xcount; | |
475 | ✗ | dim3 blocks3d, threads3d; | |
476 | |||
477 | PetscFunctionBegin; | ||
478 | ✗ | if (!(bv->nc+j)) PetscFunctionReturn(PETSC_SUCCESS); | |
479 | ✗ | if (!h) { | |
480 | ✗ | PetscCall(VecHIPGetArray(bv->buffer,&d_h)); | |
481 | ✗ | PetscCall(VecHIPGetArrayRead(bv->omega,&d_omega)); | |
482 | ✗ | PetscCall(SlepcKernelSetGrid1D(bv->nc+j,&blocks3d,&threads3d,&xcount)); | |
483 | ✗ | PetscCall(PetscLogGpuTimeBegin()); | |
484 | ✗ | if (inverse) { | |
485 | ✗ | for (i=0;i<xcount;i++) PointwiseDiv_kernel<<<blocks3d,threads3d,0,0>>>(i,d_h,d_omega,bv->nc+j); | |
486 | ✗ | } else { | |
487 | ✗ | for (i=0;i<xcount;i++) PointwiseMult_kernel<<<blocks3d,threads3d,0,0>>>(i,d_h,d_omega,bv->nc+j); | |
488 | } | ||
489 | ✗ | PetscCallHIP(hipGetLastError()); | |
490 | ✗ | PetscCall(PetscLogGpuTimeEnd()); | |
491 | ✗ | PetscCall(PetscLogGpuFlops(1.0*(bv->nc+j))); | |
492 | ✗ | PetscCall(VecHIPRestoreArrayRead(bv->omega,&d_omega)); | |
493 | ✗ | PetscCall(VecHIPRestoreArray(bv->buffer,&d_h)); | |
494 | ✗ | } else { | |
495 | ✗ | PetscCall(VecGetArrayRead(bv->omega,&omega)); | |
496 | ✗ | if (inverse) for (i=0;i<bv->nc+j;i++) h[i] /= PetscRealPart(omega[i]); | |
497 | ✗ | else for (i=0;i<bv->nc+j;i++) h[i] *= PetscRealPart(omega[i]); | |
498 | ✗ | PetscCall(VecRestoreArrayRead(bv->omega,&omega)); | |
499 | ✗ | PetscCall(PetscLogFlops(1.0*(bv->nc+j))); | |
500 | } | ||
501 | ✗ | PetscFunctionReturn(PETSC_SUCCESS); | |
502 | ✗ | } | |
503 | |||
504 | /* | ||
505 | BV_SquareRoot_HIP - Returns the square root of position j (counted after the constraints) | ||
506 | of the coefficients array | ||
507 | */ | ||
508 | 3576595478344653636 | PetscErrorCode BV_SquareRoot_HIP(BV bv,PetscInt j,PetscScalar *h,PetscReal *beta) | |
509 | { | ||
510 | 3576595478344653636 | const PetscScalar *d_h; | |
511 | 3576595478344653636 | PetscScalar hh; | |
512 | |||
513 | PetscFunctionBegin; | ||
514 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
3576595478344653636 | if (!h) { |
515 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
3576595478344653636 | PetscCall(VecHIPGetArrayRead(bv->buffer,&d_h)); |
516 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
3576595478344653636 | PetscCall(PetscLogGpuTimeBegin()); |
517 |
2/8✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 2 times.
|
3576595478344653636 | PetscCallHIP(hipMemcpy(&hh,d_h+bv->nc+j,sizeof(PetscScalar),hipMemcpyDeviceToHost)); |
518 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
3576595478344653636 | PetscCall(PetscLogGpuToCpu(sizeof(PetscScalar))); |
519 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
3576595478344653636 | PetscCall(PetscLogGpuTimeEnd()); |
520 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
3576595478344653636 | PetscCall(BV_SafeSqrt(bv,hh,beta)); |
521 |
2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
|
3576595478344653636 | PetscCall(VecHIPRestoreArrayRead(bv->buffer,&d_h)); |
522 |
0/4✗ Branch 0 not taken.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
3576595478344653636 | } else PetscCall(BV_SafeSqrt(bv,h[bv->nc+j],beta)); |
523 | 3576595478344653636 | PetscFunctionReturn(PETSC_SUCCESS); | |
524 | 3576595478344653636 | } | |
525 | |||
526 | /* | ||
527 | BV_StoreCoefficients_HIP - Copy the contents of the coefficients array to an array dest | ||
528 | provided by the caller (only values from l to j are copied) | ||
529 | */ | ||
530 | 4881532560162684928 | PetscErrorCode BV_StoreCoefficients_HIP(BV bv,PetscInt j,PetscScalar *h,PetscScalar *dest) | |
531 | { | ||
532 | 4881532560162684928 | const PetscScalar *d_h,*d_a; | |
533 | 4881532560162684928 | PetscInt i; | |
534 | |||
535 | PetscFunctionBegin; | ||
536 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
|
4881532560162684928 | if (!h) { |
537 |
3/4✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 1 times.
|
4881532560162684928 | PetscCall(VecHIPGetArrayRead(bv->buffer,&d_a)); |
538 |
2/4✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 1 times.
|
4881532560162684928 | PetscCall(PetscLogGpuTimeBegin()); |
539 | 4881532560162684928 | d_h = d_a + j*(bv->nc+bv->m)+bv->nc; | |
540 |
3/8✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✓ Branch 6 taken 1 times.
✓ Branch 7 taken 1 times.
|
4881532560162684928 | PetscCallHIP(hipMemcpy(dest-bv->l,d_h,(j-bv->l)*sizeof(PetscScalar),hipMemcpyDeviceToHost)); |
541 |
3/4✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 1 times.
|
4881532560162684928 | PetscCall(PetscLogGpuToCpu((j-bv->l)*sizeof(PetscScalar))); |
542 |
3/4✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 1 times.
|
4881532560162684928 | PetscCall(PetscLogGpuTimeEnd()); |
543 |
2/4✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 1 times.
|
4881532560162684928 | PetscCall(VecHIPRestoreArrayRead(bv->buffer,&d_a)); |
544 | 4881532560162684928 | } else { | |
545 | ✗ | for (i=bv->l;i<j;i++) dest[i-bv->l] = h[bv->nc+i]; | |
546 | } | ||
547 | 13565211513546866688 | PetscFunctionReturn(PETSC_SUCCESS); | |
548 | 2722760226280308736 | } | |
549 |