Actual source code: bvcuda.cu
 
   slepc-3.22.2 2024-12-02
   
  1: /*
  2:    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  3:    SLEPc - Scalable Library for Eigenvalue Problem Computations
  4:    Copyright (c) 2002-, Universitat Politecnica de Valencia, Spain
  6:    This file is part of SLEPc.
  7:    SLEPc is distributed under a 2-clause BSD license (see LICENSE).
  8:    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  9: */
 10: /*
 11:    CUDA-related code common to several BV impls
 12: */
 14: #include <slepc/private/bvimpl.h>
 15: #include <slepccupmblas.h>
 17: #define BLOCKSIZE 64
 19: /*
 20:     C := alpha*A*B + beta*C
 21: */
 22: PetscErrorCode BVMult_BLAS_CUDA(BV,PetscInt m_,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *d_A,PetscInt lda_,const PetscScalar *d_B,PetscInt ldb_,PetscScalar beta,PetscScalar *d_C,PetscInt ldc_)
 23: {
 24:   PetscCuBLASInt    m=0,n=0,k=0,lda=0,ldb=0,ldc=0;
 25:   cublasHandle_t    cublasv2handle;
 27:   PetscFunctionBegin;
 28:   PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
 29:   PetscCall(PetscCuBLASIntCast(m_,&m));
 30:   PetscCall(PetscCuBLASIntCast(n_,&n));
 31:   PetscCall(PetscCuBLASIntCast(k_,&k));
 32:   PetscCall(PetscCuBLASIntCast(lda_,&lda));
 33:   PetscCall(PetscCuBLASIntCast(ldb_,&ldb));
 34:   PetscCall(PetscCuBLASIntCast(ldc_,&ldc));
 35:   PetscCall(PetscLogGpuTimeBegin());
 36:   PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,m,n,k,&alpha,d_A,lda,d_B,ldb,&beta,d_C,ldc));
 37:   PetscCall(PetscLogGpuTimeEnd());
 38:   PetscCall(PetscLogGpuFlops(2.0*m*n*k));
 39:   PetscFunctionReturn(PETSC_SUCCESS);
 40: }
 42: /*
 43:     y := alpha*A*x + beta*y
 44: */
 45: PetscErrorCode BVMultVec_BLAS_CUDA(BV,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *d_A,PetscInt lda_,const PetscScalar *d_x,PetscScalar beta,PetscScalar *d_y)
 46: {
 47:   PetscCuBLASInt    n=0,k=0,lda=0,one=1;
 48:   cublasHandle_t    cublasv2handle;
 50:   PetscFunctionBegin;
 51:   PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
 52:   PetscCall(PetscCuBLASIntCast(n_,&n));
 53:   PetscCall(PetscCuBLASIntCast(k_,&k));
 54:   PetscCall(PetscCuBLASIntCast(lda_,&lda));
 55:   PetscCall(PetscLogGpuTimeBegin());
 56:   PetscCallCUBLAS(cublasXgemv(cublasv2handle,CUBLAS_OP_N,n,k,&alpha,d_A,lda,d_x,one,&beta,d_y,one));
 57:   PetscCall(PetscLogGpuTimeEnd());
 58:   PetscCall(PetscLogGpuFlops(2.0*n*k));
 59:   PetscFunctionReturn(PETSC_SUCCESS);
 60: }
 62: /*
 63:     A(:,s:e-1) := A*B(:,s:e-1)
 64: */
 65: PetscErrorCode BVMultInPlace_BLAS_CUDA(BV,PetscInt m_,PetscInt k_,PetscInt s,PetscInt e,PetscScalar *d_A,PetscInt lda_,const PetscScalar *d_B,PetscInt ldb_,PetscBool btrans)
 66: {
 67:   const PetscScalar *d_B1;
 68:   PetscScalar       *d_work,sone=1.0,szero=0.0;
 69:   PetscCuBLASInt    m=0,n=0,k=0,l=0,lda=0,ldb=0,bs=BLOCKSIZE;
 70:   size_t            freemem,totmem;
 71:   cublasHandle_t    cublasv2handle;
 72:   cublasOperation_t bt;
 74:   PetscFunctionBegin;
 75:   PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
 76:   PetscCall(PetscCuBLASIntCast(m_,&m));
 77:   PetscCall(PetscCuBLASIntCast(e-s,&n));
 78:   PetscCall(PetscCuBLASIntCast(k_,&k));
 79:   PetscCall(PetscCuBLASIntCast(lda_,&lda));
 80:   PetscCall(PetscCuBLASIntCast(ldb_,&ldb));
 81:   PetscCall(PetscLogGpuTimeBegin());
 82:   if (PetscUnlikely(btrans)) {
 83:     d_B1 = d_B+s;
 84:     bt   = CUBLAS_OP_C;
 85:   } else {
 86:     d_B1 = d_B+s*ldb;
 87:     bt   = CUBLAS_OP_N;
 88:   }
 89:   /* try to allocate the whole matrix */
 90:   PetscCallCUDA(cudaMemGetInfo(&freemem,&totmem));
 91:   if (freemem>=lda*n*sizeof(PetscScalar)) {
 92:     PetscCallCUDA(cudaMalloc((void**)&d_work,lda*n*sizeof(PetscScalar)));
 93:     PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,bt,m,n,k,&sone,d_A,lda,d_B1,ldb,&szero,d_work,lda));
 94:     PetscCallCUDA(cudaMemcpy2D(d_A+s*lda,lda*sizeof(PetscScalar),d_work,lda*sizeof(PetscScalar),m*sizeof(PetscScalar),n,cudaMemcpyDeviceToDevice));
 95:   } else {
 96:     PetscCall(PetscCuBLASIntCast(freemem/(m*sizeof(PetscScalar)),&bs));
 97:     PetscCallCUDA(cudaMalloc((void**)&d_work,bs*n*sizeof(PetscScalar)));
 98:     PetscCall(PetscCuBLASIntCast(m % bs,&l));
 99:     if (l) {
100:       PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,bt,l,n,k,&sone,d_A,lda,d_B1,ldb,&szero,d_work,l));
101:       PetscCallCUDA(cudaMemcpy2D(d_A+s*lda,lda*sizeof(PetscScalar),d_work,l*sizeof(PetscScalar),l*sizeof(PetscScalar),n,cudaMemcpyDeviceToDevice));
102:     }
103:     for (;l<m;l+=bs) {
104:       PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,bt,bs,n,k,&sone,d_A+l,lda,d_B1,ldb,&szero,d_work,bs));
105:       PetscCallCUDA(cudaMemcpy2D(d_A+l+s*lda,lda*sizeof(PetscScalar),d_work,bs*sizeof(PetscScalar),bs*sizeof(PetscScalar),n,cudaMemcpyDeviceToDevice));
106:     }
107:   }
108:   PetscCall(PetscLogGpuTimeEnd());
109:   PetscCallCUDA(cudaFree(d_work));
110:   PetscCall(PetscLogGpuFlops(2.0*m*n*k));
111:   PetscFunctionReturn(PETSC_SUCCESS);
112: }
114: /*
115:     B := alpha*A + beta*B
116: */
117: PetscErrorCode BVAXPY_BLAS_CUDA(BV,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *d_A,PetscInt lda_,PetscScalar beta,PetscScalar *d_B,PetscInt ldb_)
118: {
119:   PetscCuBLASInt n=0,k=0,lda=0,ldb=0;
120:   cublasHandle_t cublasv2handle;
122:   PetscFunctionBegin;
123:   PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
124:   PetscCall(PetscCuBLASIntCast(n_,&n));
125:   PetscCall(PetscCuBLASIntCast(k_,&k));
126:   PetscCall(PetscCuBLASIntCast(lda_,&lda));
127:   PetscCall(PetscCuBLASIntCast(ldb_,&ldb));
128:   PetscCall(PetscLogGpuTimeBegin());
129:   PetscCallCUBLAS(cublasXgeam(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,k,&alpha,d_A,lda,&beta,d_B,ldb,d_B,ldb));
130:   PetscCall(PetscLogGpuTimeEnd());
131:   PetscCall(PetscLogGpuFlops((beta==(PetscScalar)1.0)?2.0*n*k:3.0*n*k));
132:   PetscFunctionReturn(PETSC_SUCCESS);
133: }
135: /*
136:     C := A'*B
138:     C is a CPU array
139: */
140: PetscErrorCode BVDot_BLAS_CUDA(BV bv,PetscInt m_,PetscInt n_,PetscInt k_,const PetscScalar *d_A,PetscInt lda_,const PetscScalar *d_B,PetscInt ldb_,PetscScalar *C,PetscInt ldc_,PetscBool mpi)
141: {
142:   PetscScalar       *d_work,sone=1.0,szero=0.0,*CC;
143:   PetscInt          j;
144:   PetscCuBLASInt    m=0,n=0,k=0,lda=0,ldb=0,ldc=0;
145:   PetscMPIInt       len;
146:   cublasHandle_t    cublasv2handle;
148:   PetscFunctionBegin;
149:   PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
150:   PetscCall(PetscCuBLASIntCast(m_,&m));
151:   PetscCall(PetscCuBLASIntCast(n_,&n));
152:   PetscCall(PetscCuBLASIntCast(k_,&k));
153:   PetscCall(PetscCuBLASIntCast(lda_,&lda));
154:   PetscCall(PetscCuBLASIntCast(ldb_,&ldb));
155:   PetscCall(PetscCuBLASIntCast(ldc_,&ldc));
156:   PetscCallCUDA(cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar)));
157:   if (mpi) {
158:     if (ldc==m) {
159:       PetscCall(BVAllocateWork_Private(bv,m*n));
160:       if (k) {
161:         PetscCall(PetscLogGpuTimeBegin());
162:         PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,lda,d_B,ldb,&szero,d_work,ldc));
163:         PetscCall(PetscLogGpuTimeEnd());
164:         PetscCallCUDA(cudaMemcpy(bv->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
165:         PetscCall(PetscLogGpuToCpu(m*n*sizeof(PetscScalar)));
166:       } else PetscCall(PetscArrayzero(bv->work,m*n));
167:       PetscCall(PetscMPIIntCast(m*n,&len));
168:       PetscCallMPI(MPIU_Allreduce(bv->work,C,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv)));
169:     } else {
170:       PetscCall(BVAllocateWork_Private(bv,2*m*n));
171:       CC = bv->work+m*n;
172:       if (k) {
173:         PetscCall(PetscLogGpuTimeBegin());
174:         PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,lda,d_B,ldb,&szero,d_work,m));
175:         PetscCall(PetscLogGpuTimeEnd());
176:         PetscCallCUDA(cudaMemcpy(bv->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
177:         PetscCall(PetscLogGpuToCpu(m*n*sizeof(PetscScalar)));
178:       } else PetscCall(PetscArrayzero(bv->work,m*n));
179:       PetscCall(PetscMPIIntCast(m*n,&len));
180:       PetscCallMPI(MPIU_Allreduce(bv->work,CC,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv)));
181:       for (j=0;j<n;j++) PetscCall(PetscArraycpy(C+j*ldc,CC+j*m,m));
182:     }
183:   } else {
184:     if (k) {
185:       PetscCall(BVAllocateWork_Private(bv,m*n));
186:       PetscCall(PetscLogGpuTimeBegin());
187:       PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,lda,d_B,ldb,&szero,d_work,m));
188:       PetscCall(PetscLogGpuTimeEnd());
189:       PetscCallCUDA(cudaMemcpy(bv->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
190:       PetscCall(PetscLogGpuToCpu(m*n*sizeof(PetscScalar)));
191:       for (j=0;j<n;j++) PetscCall(PetscArraycpy(C+j*ldc,bv->work+j*m,m));
192:     }
193:   }
194:   PetscCallCUDA(cudaFree(d_work));
195:   PetscCall(PetscLogGpuFlops(2.0*m*n*k));
196:   PetscFunctionReturn(PETSC_SUCCESS);
197: }
199: /*
200:     y := A'*x
202:     y is a CPU array, if NULL bv->buffer is used as a workspace
203: */
204: PetscErrorCode BVDotVec_BLAS_CUDA(BV bv,PetscInt n_,PetscInt k_,const PetscScalar *d_A,PetscInt lda_,const PetscScalar *d_x,PetscScalar *y,PetscBool mpi)
205: {
206:   PetscScalar       *d_work,szero=0.0,sone=1.0,*yy;
207:   PetscCuBLASInt    n=0,k=0,lda=0,one=1;
208:   PetscMPIInt       len;
209:   cublasHandle_t    cublasv2handle;
211:   PetscFunctionBegin;
212:   PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
213:   PetscCall(PetscCuBLASIntCast(n_,&n));
214:   PetscCall(PetscCuBLASIntCast(k_,&k));
215:   PetscCall(PetscCuBLASIntCast(lda_,&lda));
216:   if (!y) PetscCall(VecCUDAGetArrayWrite(bv->buffer,&d_work));
217:   else PetscCallCUDA(cudaMalloc((void**)&d_work,k*sizeof(PetscScalar)));
218:   if (mpi) {
219:     PetscCall(BVAllocateWork_Private(bv,k));
220:     if (n) {
221:       PetscCall(PetscLogGpuTimeBegin());
222:       PetscCallCUBLAS(cublasXgemv(cublasv2handle,CUBLAS_OP_C,n,k,&sone,d_A,lda,d_x,one,&szero,d_work,one));
223:       PetscCall(PetscLogGpuTimeEnd());
224:       PetscCallCUDA(cudaMemcpy(bv->work,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
225:       PetscCall(PetscLogGpuToCpu(k*sizeof(PetscScalar)));
226:     } else PetscCall(PetscArrayzero(bv->work,k));
227:     /* reduction */
228:     PetscCall(PetscMPIIntCast(k,&len));
229:     if (!y) {
230:       if (use_gpu_aware_mpi) {  /* case 1: reduce on GPU using a temporary buffer */
231:         PetscCallCUDA(cudaMalloc((void**)&yy,k*sizeof(PetscScalar)));
232:         PetscCallMPI(MPIU_Allreduce(d_work,yy,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv)));
233:         PetscCallCUDA(cudaMemcpy(d_work,yy,k*sizeof(PetscScalar),cudaMemcpyDeviceToDevice));
234:         PetscCallCUDA(cudaFree(yy));
235:       } else {  /* case 2: reduce on CPU, copy result back to GPU */
236:         PetscCall(BVAllocateWork_Private(bv,2*k));
237:         yy = bv->work+k;
238:         PetscCallCUDA(cudaMemcpy(bv->work,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
239:         PetscCall(PetscLogGpuToCpu(k*sizeof(PetscScalar)));
240:         PetscCallMPI(MPIU_Allreduce(bv->work,yy,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv)));
241:         PetscCallCUDA(cudaMemcpy(d_work,yy,k*sizeof(PetscScalar),cudaMemcpyHostToDevice));
242:         PetscCall(PetscLogCpuToGpu(k*sizeof(PetscScalar)));
243:       }
244:       PetscCall(VecCUDARestoreArrayWrite(bv->buffer,&d_work));
245:     } else {  /* case 3: user-provided array y, reduce on CPU */
246:       PetscCallCUDA(cudaFree(d_work));
247:       PetscCallMPI(MPIU_Allreduce(bv->work,y,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv)));
248:     }
249:   } else {
250:     if (n) {
251:       PetscCall(PetscLogGpuTimeBegin());
252:       PetscCallCUBLAS(cublasXgemv(cublasv2handle,CUBLAS_OP_C,n,k,&sone,d_A,lda,d_x,one,&szero,d_work,one));
253:       PetscCall(PetscLogGpuTimeEnd());
254:     }
255:     if (!y) PetscCall(VecCUDARestoreArrayWrite(bv->buffer,&d_work));
256:     else {
257:       PetscCallCUDA(cudaMemcpy(y,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
258:       PetscCall(PetscLogGpuToCpu(k*sizeof(PetscScalar)));
259:       PetscCallCUDA(cudaFree(d_work));
260:     }
261:   }
262:   PetscCall(PetscLogGpuFlops(2.0*n*k));
263:   PetscFunctionReturn(PETSC_SUCCESS);
264: }
266: /*
267:     Scale n scalars
268: */
269: PetscErrorCode BVScale_BLAS_CUDA(BV,PetscInt n_,PetscScalar *d_A,PetscScalar alpha)
270: {
271:   PetscCuBLASInt n=0,one=1;
272:   cublasHandle_t cublasv2handle;
274:   PetscFunctionBegin;
275:   PetscCall(PetscCuBLASIntCast(n_,&n));
276:   if (PetscUnlikely(alpha == (PetscScalar)0.0)) PetscCallCUDA(cudaMemset(d_A,0,n*sizeof(PetscScalar)));
277:   else if (alpha != (PetscScalar)1.0) {
278:     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
279:     PetscCall(PetscLogGpuTimeBegin());
280:     PetscCallCUBLAS(cublasXscal(cublasv2handle,n,&alpha,d_A,one));
281:     PetscCall(PetscLogGpuTimeEnd());
282:     PetscCall(PetscLogGpuFlops(1.0*n));
283:   }
284:   PetscFunctionReturn(PETSC_SUCCESS);
285: }
287: /*
288:     Compute 2-norm of vector consisting of n scalars
289: */
290: PetscErrorCode BVNorm_BLAS_CUDA(BV,PetscInt n_,const PetscScalar *d_A,PetscReal *nrm)
291: {
292:   PetscCuBLASInt n=0,one=1;
293:   cublasHandle_t cublasv2handle;
295:   PetscFunctionBegin;
296:   PetscCall(PetscCuBLASIntCast(n_,&n));
297:   PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
298:   PetscCall(PetscLogGpuTimeBegin());
299:   PetscCallCUBLAS(cublasXnrm2(cublasv2handle,n,d_A,one,nrm));
300:   PetscCall(PetscLogGpuTimeEnd());
301:   PetscCall(PetscLogGpuFlops(2.0*n));
302:   PetscFunctionReturn(PETSC_SUCCESS);
303: }
305: /*
306:     Normalize the columns of A
307: */
308: PetscErrorCode BVNormalize_BLAS_CUDA(BV,PetscInt m_,PetscInt n_,PetscScalar *d_A,PetscInt lda_,PetscScalar *eigi)
309: {
310:   PetscInt       j,k;
311:   PetscReal      nrm,nrm1;
312:   PetscScalar    alpha;
313:   PetscCuBLASInt m=0,one=1;
314:   cublasHandle_t cublasv2handle;
316:   PetscFunctionBegin;
317:   PetscCall(PetscCuBLASIntCast(m_,&m));
318:   PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
319:   PetscCall(PetscLogGpuTimeBegin());
320:   for (j=0;j<n_;j++) {
321:     k = 1;
322: #if !defined(PETSC_USE_COMPLEX)
323:     if (eigi && eigi[j] != 0.0) k = 2;
324: #endif
325:     PetscCallCUBLAS(cublasXnrm2(cublasv2handle,m,d_A+j*lda_,one,&nrm));
326:     if (k==2) {
327:       PetscCallCUBLAS(cublasXnrm2(cublasv2handle,m,d_A+(j+1)*lda_,one,&nrm1));
328:       nrm = SlepcAbs(nrm,nrm1);
329:     }
330:     alpha = 1.0/nrm;
331:     PetscCallCUBLAS(cublasXscal(cublasv2handle,m,&alpha,d_A+j*lda_,one));
332:     if (k==2) {
333:       PetscCallCUBLAS(cublasXscal(cublasv2handle,m,&alpha,d_A+(j+1)*lda_,one));
334:       j++;
335:     }
336:   }
337:   PetscCall(PetscLogGpuTimeEnd());
338:   PetscCall(PetscLogGpuFlops(3.0*m*n_));
339:   PetscFunctionReturn(PETSC_SUCCESS);
340: }
342: /*
343:    BV_CleanCoefficients_CUDA - Sets to zero all entries of column j of the bv buffer
344: */
345: PetscErrorCode BV_CleanCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h)
346: {
347:   PetscScalar    *d_hh,*d_a;
348:   PetscInt       i;
350:   PetscFunctionBegin;
351:   if (!h) {
352:     PetscCall(VecCUDAGetArray(bv->buffer,&d_a));
353:     PetscCall(PetscLogGpuTimeBegin());
354:     d_hh = d_a + j*(bv->nc+bv->m);
355:     PetscCallCUDA(cudaMemset(d_hh,0,(bv->nc+j)*sizeof(PetscScalar)));
356:     PetscCall(PetscLogGpuTimeEnd());
357:     PetscCall(VecCUDARestoreArray(bv->buffer,&d_a));
358:   } else { /* cpu memory */
359:     for (i=0;i<bv->nc+j;i++) h[i] = 0.0;
360:   }
361:   PetscFunctionReturn(PETSC_SUCCESS);
362: }
364: /*
365:    BV_AddCoefficients_CUDA - Add the contents of the scratch (0-th column) of the bv buffer
366:    into column j of the bv buffer
367:  */
368: PetscErrorCode BV_AddCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscScalar *c)
369: {
370:   PetscScalar    *d_h,*d_c,sone=1.0;
371:   PetscInt       i;
372:   PetscCuBLASInt idx=0,one=1;
373:   cublasHandle_t cublasv2handle;
375:   PetscFunctionBegin;
376:   if (!h) {
377:     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
378:     PetscCall(VecCUDAGetArray(bv->buffer,&d_c));
379:     d_h = d_c + j*(bv->nc+bv->m);
380:     PetscCall(PetscCuBLASIntCast(bv->nc+j,&idx));
381:     PetscCall(PetscLogGpuTimeBegin());
382:     PetscCallCUBLAS(cublasXaxpy(cublasv2handle,idx,&sone,d_c,one,d_h,one));
383:     PetscCall(PetscLogGpuTimeEnd());
384:     PetscCall(PetscLogGpuFlops(1.0*(bv->nc+j)));
385:     PetscCall(VecCUDARestoreArray(bv->buffer,&d_c));
386:   } else { /* cpu memory */
387:     for (i=0;i<bv->nc+j;i++) h[i] += c[i];
388:     PetscCall(PetscLogFlops(1.0*(bv->nc+j)));
389:   }
390:   PetscFunctionReturn(PETSC_SUCCESS);
391: }
393: /*
394:    BV_SetValue_CUDA - Sets value in row j (counted after the constraints) of column k
395:    of the coefficients array
396: */
397: PetscErrorCode BV_SetValue_CUDA(BV bv,PetscInt j,PetscInt k,PetscScalar *h,PetscScalar value)
398: {
399:   PetscScalar    *d_h,*a;
401:   PetscFunctionBegin;
402:   if (!h) {
403:     PetscCall(VecCUDAGetArray(bv->buffer,&a));
404:     PetscCall(PetscLogGpuTimeBegin());
405:     d_h = a + k*(bv->nc+bv->m) + bv->nc+j;
406:     PetscCallCUDA(cudaMemcpy(d_h,&value,sizeof(PetscScalar),cudaMemcpyHostToDevice));
407:     PetscCall(PetscLogCpuToGpu(sizeof(PetscScalar)));
408:     PetscCall(PetscLogGpuTimeEnd());
409:     PetscCall(VecCUDARestoreArray(bv->buffer,&a));
410:   } else { /* cpu memory */
411:     h[bv->nc+j] = value;
412:   }
413:   PetscFunctionReturn(PETSC_SUCCESS);
414: }
416: /*
417:    BV_SquareSum_CUDA - Returns the value h'*h, where h represents the contents of the
418:    coefficients array (up to position j)
419: */
420: PetscErrorCode BV_SquareSum_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscReal *sum)
421: {
422:   const PetscScalar *d_h;
423:   PetscScalar       dot;
424:   PetscInt          i;
425:   PetscCuBLASInt    idx=0,one=1;
426:   cublasHandle_t    cublasv2handle;
428:   PetscFunctionBegin;
429:   if (!h) {
430:     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
431:     PetscCall(VecCUDAGetArrayRead(bv->buffer,&d_h));
432:     PetscCall(PetscCuBLASIntCast(bv->nc+j,&idx));
433:     PetscCall(PetscLogGpuTimeBegin());
434:     PetscCallCUBLAS(cublasXdot(cublasv2handle,idx,d_h,one,d_h,one,&dot));
435:     PetscCall(PetscLogGpuTimeEnd());
436:     PetscCall(PetscLogGpuFlops(2.0*(bv->nc+j)));
437:     *sum = PetscRealPart(dot);
438:     PetscCall(VecCUDARestoreArrayRead(bv->buffer,&d_h));
439:   } else { /* cpu memory */
440:     *sum = 0.0;
441:     for (i=0;i<bv->nc+j;i++) *sum += PetscRealPart(h[i]*PetscConj(h[i]));
442:     PetscCall(PetscLogFlops(2.0*(bv->nc+j)));
443:   }
444:   PetscFunctionReturn(PETSC_SUCCESS);
445: }
447: /* pointwise multiplication */
448: static __global__ void PointwiseMult_kernel(PetscInt xcount,PetscScalar *a,const PetscScalar *b,PetscInt n)
449: {
450:   PetscInt x;
452:   x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;
453:   if (x<n) a[x] *= PetscRealPart(b[x]);
454: }
456: /* pointwise division */
457: static __global__ void PointwiseDiv_kernel(PetscInt xcount,PetscScalar *a,const PetscScalar *b,PetscInt n)
458: {
459:   PetscInt x;
461:   x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;
462:   if (x<n) a[x] /= PetscRealPart(b[x]);
463: }
465: /*
466:    BV_ApplySignature_CUDA - Computes the pointwise product h*omega, where h represents
467:    the contents of the coefficients array (up to position j) and omega is the signature;
468:    if inverse=TRUE then the operation is h/omega
469: */
470: PetscErrorCode BV_ApplySignature_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscBool inverse)
471: {
472:   PetscScalar       *d_h;
473:   const PetscScalar *d_omega,*omega;
474:   PetscInt          i,xcount;
475:   dim3              blocks3d, threads3d;
477:   PetscFunctionBegin;
478:   if (!(bv->nc+j)) PetscFunctionReturn(PETSC_SUCCESS);
479:   if (!h) {
480:     PetscCall(VecCUDAGetArray(bv->buffer,&d_h));
481:     PetscCall(VecCUDAGetArrayRead(bv->omega,&d_omega));
482:     PetscCall(SlepcKernelSetGrid1D(bv->nc+j,&blocks3d,&threads3d,&xcount));
483:     PetscCall(PetscLogGpuTimeBegin());
484:     if (inverse) {
485:       for (i=0;i<xcount;i++) PointwiseDiv_kernel<<<blocks3d,threads3d>>>(i,d_h,d_omega,bv->nc+j);
486:     } else {
487:       for (i=0;i<xcount;i++) PointwiseMult_kernel<<<blocks3d,threads3d>>>(i,d_h,d_omega,bv->nc+j);
488:     }
489:     PetscCallCUDA(cudaGetLastError());
490:     PetscCall(PetscLogGpuTimeEnd());
491:     PetscCall(PetscLogGpuFlops(1.0*(bv->nc+j)));
492:     PetscCall(VecCUDARestoreArrayRead(bv->omega,&d_omega));
493:     PetscCall(VecCUDARestoreArray(bv->buffer,&d_h));
494:   } else {
495:     PetscCall(VecGetArrayRead(bv->omega,&omega));
496:     if (inverse) for (i=0;i<bv->nc+j;i++) h[i] /= PetscRealPart(omega[i]);
497:     else for (i=0;i<bv->nc+j;i++) h[i] *= PetscRealPart(omega[i]);
498:     PetscCall(VecRestoreArrayRead(bv->omega,&omega));
499:     PetscCall(PetscLogFlops(1.0*(bv->nc+j)));
500:   }
501:   PetscFunctionReturn(PETSC_SUCCESS);
502: }
504: /*
505:    BV_SquareRoot_CUDA - Returns the square root of position j (counted after the constraints)
506:    of the coefficients array
507: */
508: PetscErrorCode BV_SquareRoot_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscReal *beta)
509: {
510:   const PetscScalar *d_h;
511:   PetscScalar       hh;
513:   PetscFunctionBegin;
514:   if (!h) {
515:     PetscCall(VecCUDAGetArrayRead(bv->buffer,&d_h));
516:     PetscCall(PetscLogGpuTimeBegin());
517:     PetscCallCUDA(cudaMemcpy(&hh,d_h+bv->nc+j,sizeof(PetscScalar),cudaMemcpyDeviceToHost));
518:     PetscCall(PetscLogGpuToCpu(sizeof(PetscScalar)));
519:     PetscCall(PetscLogGpuTimeEnd());
520:     PetscCall(BV_SafeSqrt(bv,hh,beta));
521:     PetscCall(VecCUDARestoreArrayRead(bv->buffer,&d_h));
522:   } else PetscCall(BV_SafeSqrt(bv,h[bv->nc+j],beta));
523:   PetscFunctionReturn(PETSC_SUCCESS);
524: }
526: /*
527:    BV_StoreCoefficients_CUDA - Copy the contents of the coefficients array to an array dest
528:    provided by the caller (only values from l to j are copied)
529: */
530: PetscErrorCode BV_StoreCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscScalar *dest)
531: {
532:   const PetscScalar *d_h,*d_a;
533:   PetscInt          i;
535:   PetscFunctionBegin;
536:   if (!h) {
537:     PetscCall(VecCUDAGetArrayRead(bv->buffer,&d_a));
538:     PetscCall(PetscLogGpuTimeBegin());
539:     d_h = d_a + j*(bv->nc+bv->m)+bv->nc;
540:     PetscCallCUDA(cudaMemcpy(dest-bv->l,d_h,(j-bv->l)*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
541:     PetscCall(PetscLogGpuToCpu((j-bv->l)*sizeof(PetscScalar)));
542:     PetscCall(PetscLogGpuTimeEnd());
543:     PetscCall(VecCUDARestoreArrayRead(bv->buffer,&d_a));
544:   } else {
545:     for (i=bv->l;i<j;i++) dest[i-bv->l] = h[bv->nc+i];
546:   }
547:   PetscFunctionReturn(PETSC_SUCCESS);
548: }