Actual source code: sveccuda.cu
slepc-3.13.3 2020-06-14
/*
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   SLEPc - Scalable Library for Eigenvalue Problem Computations
   Copyright (c) 2002-2020, Universitat Politecnica de Valencia, Spain

   This file is part of SLEPc.
   SLEPc is distributed under a 2-clause BSD license (see LICENSE).
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
*/
/*
   BV implemented as a single Vec (CUDA version)
*/
#include <slepc/private/bvimpl.h>
#include "../src/sys/classes/bv/impls/svec/svec.h"
#include <petsccublas.h>
#if defined(PETSC_USE_COMPLEX)
#include <thrust/device_ptr.h>
#include <thrust/transform.h>   /* thrust::transform, used by ConjugateCudaArray below */
#endif
#define BLOCKSIZE 64
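/*
   BLOCKSIZE is the initial row-block size for the blocked in-place update in
   BVMultInPlace_Svec_CUDA; it is recomputed from the free GPU memory when the
   whole work matrix does not fit on the device.
*/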
/*
    B := alpha*A + beta*B

    A,B are nxk (ld=n)
*/
static PetscErrorCode BVAXPY_BLAS_CUDA(BV bv,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *d_A,PetscScalar beta,PetscScalar *d_B)
{
  PetscErrorCode ierr;
  PetscBLASInt   m,one=1;
  cublasStatus_t cberr;
  cublasHandle_t cublasv2handle;

  PetscFunctionBegin;
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(n_*k_,&m);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (beta!=(PetscScalar)1.0) {
    cberr = cublasXscal(cublasv2handle,m,&beta,d_B,one);CHKERRCUBLAS(cberr);
    ierr = PetscLogGpuFlops(1.0*m);CHKERRQ(ierr);
  }
  cberr = cublasXaxpy(cublasv2handle,m,&alpha,d_A,one,d_B,one);CHKERRCUBLAS(cberr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*m);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
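/*
   Note: BVMult_Svec_CUDA below falls back to this helper when Q == NULL,
   i.e. the whole update reduces to one scal (if beta != 1) plus one axpy of
   length n*k, since A and B are stored contiguously with leading dimension n.
*/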
/*
    C := alpha*A*B + beta*C
*/
PetscErrorCode BVMult_Svec_CUDA(BV Y,PetscScalar alpha,PetscScalar beta,BV X,Mat Q)
{
  PetscErrorCode    ierr;
  BV_SVEC           *y = (BV_SVEC*)Y->data,*x = (BV_SVEC*)X->data;
  const PetscScalar *d_px,*d_A;
  PetscScalar       *d_py,*q,*d_q,*d_B,*d_C;
  PetscInt          ldq,mq;
  PetscBLASInt      m,n,k,ldq_;
  cublasStatus_t    cberr;
  cudaError_t       cerr;
  cublasHandle_t    cublasv2handle;

  PetscFunctionBegin;
  if (!Y->n) PetscFunctionReturn(0);
  ierr = VecCUDAGetArrayRead(x->v,&d_px);CHKERRQ(ierr);
  if (beta==(PetscScalar)0.0) {
    ierr = VecCUDAGetArrayWrite(y->v,&d_py);CHKERRQ(ierr);
  } else {
    ierr = VecCUDAGetArray(y->v,&d_py);CHKERRQ(ierr);
  }
  d_A = d_px+(X->nc+X->l)*X->n;
  d_C = d_py+(Y->nc+Y->l)*Y->n;
  if (Q) {
    ierr = PetscBLASIntCast(Y->n,&m);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(Y->k-Y->l,&n);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(X->k-X->l,&k);CHKERRQ(ierr);
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = MatGetSize(Q,&ldq,&mq);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(ldq,&ldq_);CHKERRQ(ierr);
    ierr = MatDenseGetArray(Q,&q);CHKERRQ(ierr);
    cerr = cudaMalloc((void**)&d_q,ldq*mq*sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(d_q,q,ldq*mq*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = PetscLogCpuToGpu(ldq*mq*sizeof(PetscScalar));CHKERRQ(ierr);
    d_B = d_q+Y->l*ldq+X->l;
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,m,n,k,&alpha,d_A,m,d_B,ldq_,&beta,d_C,m);CHKERRCUBLAS(cberr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = MatDenseRestoreArray(Q,&q);CHKERRQ(ierr);
    cerr = cudaFree(d_q);CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(2.0*m*n*k);CHKERRQ(ierr);
  } else {
    ierr = BVAXPY_BLAS_CUDA(Y,Y->n,Y->k-Y->l,alpha,d_A,beta,d_C);CHKERRQ(ierr);
  }
  ierr = VecCUDARestoreArrayRead(x->v,&d_px);CHKERRQ(ierr);
  if (beta==(PetscScalar)0.0) {
    ierr = VecCUDARestoreArrayWrite(y->v,&d_py);CHKERRQ(ierr);
  } else {
    ierr = VecCUDARestoreArray(y->v,&d_py);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
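/*
   Note: Q is a host-side dense matrix, so its array is copied to a temporary
   device buffer on every call; the gemm then uses only the active block of Q,
   starting at row X->l and column Y->l.
*/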
/*
    y := alpha*A*x + beta*y
*/
PetscErrorCode BVMultVec_Svec_CUDA(BV X,PetscScalar alpha,PetscScalar beta,Vec y,PetscScalar *q)
{
  PetscErrorCode    ierr;
  BV_SVEC           *x = (BV_SVEC*)X->data;
  const PetscScalar *d_px,*d_A;
  PetscScalar       *d_py,*d_q,*d_x,*d_y;
  PetscBLASInt      n,k,one=1;
  cublasStatus_t    cberr;
  cublasHandle_t    cublasv2handle;
  cudaError_t       cerr;

  PetscFunctionBegin;
  ierr = PetscBLASIntCast(X->n,&n);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(X->k-X->l,&k);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(x->v,&d_px);CHKERRQ(ierr);
  if (beta==(PetscScalar)0.0) {
    ierr = VecCUDAGetArrayWrite(y,&d_py);CHKERRQ(ierr);
  } else {
    ierr = VecCUDAGetArray(y,&d_py);CHKERRQ(ierr);
  }
  if (!q) {
    ierr = VecCUDAGetArray(X->buffer,&d_q);CHKERRQ(ierr);
  } else {
    cerr = cudaMalloc((void**)&d_q,k*sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(d_q,q,k*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = PetscLogCpuToGpu(k*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  d_A = d_px+(X->nc+X->l)*X->n;
  d_x = d_q;
  d_y = d_py;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  cberr = cublasXgemv(cublasv2handle,CUBLAS_OP_N,n,k,&alpha,d_A,n,d_x,one,&beta,d_y,one);CHKERRCUBLAS(cberr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(x->v,&d_px);CHKERRQ(ierr);
  if (beta==(PetscScalar)0.0) {
    ierr = VecCUDARestoreArrayWrite(y,&d_py);CHKERRQ(ierr);
  } else {
    ierr = VecCUDARestoreArray(y,&d_py);CHKERRQ(ierr);
  }
  if (!q) {
    ierr = VecCUDARestoreArray(X->buffer,&d_q);CHKERRQ(ierr);
  } else {
    cerr = cudaFree(d_q);CHKERRCUDA(cerr);
  }
  ierr = PetscLogGpuFlops(2.0*n*k);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
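/*
   Note: when q == NULL the coefficient vector is taken from X->buffer, which
   already lives on the GPU; otherwise the host array q is staged through a
   temporary device buffer of length k.
*/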
/*
    A(:,s:e-1) := A*B(:,s:e-1)
*/
PetscErrorCode BVMultInPlace_Svec_CUDA(BV V,Mat Q,PetscInt s,PetscInt e)
{
  PetscErrorCode ierr;
  BV_SVEC        *ctx = (BV_SVEC*)V->data;
  PetscScalar    *d_pv,*q,*d_q,*d_A,*d_B,*d_work,sone=1.0,szero=0.0;
  PetscInt       j,ldq,nq;
  PetscBLASInt   m,n,k,l,ldq_,bs=BLOCKSIZE;
  cublasStatus_t cberr;
  size_t         freemem,totmem;
  cublasHandle_t cublasv2handle;
  cudaError_t    cerr;

  PetscFunctionBegin;
  if (!V->n) PetscFunctionReturn(0);
  ierr = PetscBLASIntCast(V->n,&m);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(e-s,&n);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(V->k-V->l,&k);CHKERRQ(ierr);
  ierr = MatGetSize(Q,&ldq,&nq);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(ldq,&ldq_);CHKERRQ(ierr);
  ierr = VecCUDAGetArray(ctx->v,&d_pv);CHKERRQ(ierr);
  ierr = MatDenseGetArray(Q,&q);CHKERRQ(ierr);
  cerr = cudaMalloc((void**)&d_q,ldq*nq*sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(d_q,q,ldq*nq*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  ierr = PetscLogCpuToGpu(ldq*nq*sizeof(PetscScalar));CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* try to allocate the whole matrix */
  cerr = cudaMemGetInfo(&freemem,&totmem);CHKERRCUDA(cerr);
  if (freemem>=m*n*sizeof(PetscScalar)) {
    cerr = cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar));CHKERRCUDA(cerr);
    d_A = d_pv+(V->nc+V->l)*m;
    d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,m,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,m);CHKERRCUBLAS(cberr);
    for (j=0;j<n;j++) {
      cerr = cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*m),m*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
    }
  } else {
    bs = freemem/(m*sizeof(PetscScalar));
    cerr = cudaMalloc((void**)&d_work,bs*n*sizeof(PetscScalar));CHKERRCUDA(cerr);
    l = m % bs;
    if (l) {
      d_A = d_pv+(V->nc+V->l)*m;
      d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,l,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,l);CHKERRCUBLAS(cberr);
      for (j=0;j<n;j++) {
        cerr = cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*l),l*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
      }
    }
    for (;l<m;l+=bs) {
      d_A = d_pv+(V->nc+V->l)*m+l;
      d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,bs,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,bs);CHKERRCUBLAS(cberr);
      for (j=0;j<n;j++) {
        cerr = cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*bs),bs*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
      }
    }
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = MatDenseRestoreArray(Q,&q);CHKERRQ(ierr);
  cerr = cudaFree(d_q);CHKERRCUDA(cerr);
  cerr = cudaFree(d_work);CHKERRCUDA(cerr);
  ierr = VecCUDARestoreArray(ctx->v,&d_pv);CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*m*n*k);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
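/*
   The update above is in-place in A, and gemm cannot alias its output with an
   input, so the product goes to a work array and is copied back column by
   column. When the full m-by-n work array does not fit in free GPU memory,
   the rows are processed in blocks: first a remainder block of m % bs rows,
   then full blocks of bs rows, each with its own gemm on a slice of A taken
   with leading dimension m.
*/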
/*
    A(:,s:e-1) := A*B(s:e-1,:)'
*/
PetscErrorCode BVMultInPlaceTranspose_Svec_CUDA(BV V,Mat Q,PetscInt s,PetscInt e)
{
  PetscErrorCode ierr;
  BV_SVEC        *ctx = (BV_SVEC*)V->data;
  PetscScalar    *d_pv,*q,*d_q,*d_A,*d_B,*d_work,sone=1.0,szero=0.0;
  PetscInt       j,ldq,nq;
  PetscBLASInt   m,n,k,ldq_;
  cublasStatus_t cberr;
  cublasHandle_t cublasv2handle;
  cudaError_t    cerr;

  PetscFunctionBegin;
  if (!V->n) PetscFunctionReturn(0);
  ierr = PetscBLASIntCast(V->n,&m);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(e-s,&n);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(V->k-V->l,&k);CHKERRQ(ierr);
  ierr = MatGetSize(Q,&ldq,&nq);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(ldq,&ldq_);CHKERRQ(ierr);
  ierr = VecCUDAGetArray(ctx->v,&d_pv);CHKERRQ(ierr);
  ierr = MatDenseGetArray(Q,&q);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  cerr = cudaMalloc((void**)&d_q,ldq*nq*sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(d_q,q,ldq*nq*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  ierr = PetscLogCpuToGpu(ldq*nq*sizeof(PetscScalar));CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  cerr = cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  d_A = d_pv+(V->nc+V->l)*m;
  d_B = d_q+V->l*ldq+s;
  cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_C,m,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,m);CHKERRCUBLAS(cberr);
  for (j=0;j<n;j++) {
    cerr = cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*m),m*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = MatDenseRestoreArray(Q,&q);CHKERRQ(ierr);
  cerr = cudaFree(d_q);CHKERRCUDA(cerr);
  cerr = cudaFree(d_work);CHKERRCUDA(cerr);
  ierr = VecCUDARestoreArray(ctx->v,&d_pv);CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*m*n*k);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
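/*
   Same pattern as BVMultInPlace_Svec_CUDA, but Q enters the product
   conjugate-transposed (CUBLAS_OP_C) and no blocked fallback is attempted,
   so the m-by-n work array is always allocated in full.
*/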
/*
    C := A'*B
*/
PetscErrorCode BVDot_Svec_CUDA(BV X,BV Y,Mat M)
{
  PetscErrorCode    ierr;
  BV_SVEC           *x = (BV_SVEC*)X->data,*y = (BV_SVEC*)Y->data;
  const PetscScalar *d_px,*d_py,*d_A,*d_B;
  PetscScalar       *pm,*d_work,sone=1.0,szero=0.0,*C,*CC;
  PetscInt          j,ldm;
  PetscBLASInt      m,n,k,ldm_;
  PetscMPIInt       len;
  cublasStatus_t    cberr;
  cublasHandle_t    cublasv2handle;
  cudaError_t       cerr;

  PetscFunctionBegin;
  ierr = PetscBLASIntCast(Y->k-Y->l,&m);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(X->k-X->l,&n);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(X->n,&k);CHKERRQ(ierr);
  ierr = MatGetSize(M,&ldm,NULL);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(ldm,&ldm_);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(x->v,&d_px);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(y->v,&d_py);CHKERRQ(ierr);
  ierr = MatDenseGetArray(M,&pm);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  cerr = cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  d_A = d_py+(Y->nc+Y->l)*Y->n;
  d_B = d_px+(X->nc+X->l)*X->n;
  C = pm+X->l*ldm+Y->l;
  if (x->mpi) {
    if (ldm==m) {
      ierr = BVAllocateWork_Private(X,m*n);CHKERRQ(ierr);
      if (k) {
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,ldm_);CHKERRCUBLAS(cberr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
        cerr = cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
        ierr = PetscLogGpuToCpu(m*n*sizeof(PetscScalar));CHKERRQ(ierr);
      } else {
        ierr = PetscArrayzero(X->work,m*n);CHKERRQ(ierr);
      }
      ierr = PetscMPIIntCast(m*n,&len);CHKERRQ(ierr);
      ierr = MPI_Allreduce(X->work,C,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X));CHKERRQ(ierr);
    } else {
      ierr = BVAllocateWork_Private(X,2*m*n);CHKERRQ(ierr);
      CC = X->work+m*n;
      if (k) {
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,m);CHKERRCUBLAS(cberr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
        cerr = cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
        ierr = PetscLogGpuToCpu(m*n*sizeof(PetscScalar));CHKERRQ(ierr);
      } else {
        ierr = PetscArrayzero(X->work,m*n);CHKERRQ(ierr);
      }
      ierr = PetscMPIIntCast(m*n,&len);CHKERRQ(ierr);
      ierr = MPI_Allreduce(X->work,CC,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X));CHKERRQ(ierr);
      for (j=0;j<n;j++) {
        ierr = PetscArraycpy(C+j*ldm,CC+j*m,m);CHKERRQ(ierr);
      }
    }
  } else {
    if (k) {
      ierr = BVAllocateWork_Private(X,m*n);CHKERRQ(ierr);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,m);CHKERRCUBLAS(cberr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      cerr = cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      ierr = PetscLogGpuToCpu(m*n*sizeof(PetscScalar));CHKERRQ(ierr);
      for (j=0;j<n;j++) {
        ierr = PetscArraycpy(C+j*ldm,X->work+j*m,m);CHKERRQ(ierr);
      }
    }
  }
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  cerr = cudaFree(d_work);CHKERRCUDA(cerr);
  ierr = MatDenseRestoreArray(M,&pm);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(x->v,&d_px);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(y->v,&d_py);CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*m*n*k);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
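/*
   In the parallel case (x->mpi) each process computes its local m-by-n block
   of Y'*X on the GPU, copies it to the host, and the blocks are summed with
   MPI_Allreduce; when the leading dimension of M differs from m, the reduced
   result is copied into M column by column.
*/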
#if defined(PETSC_USE_COMPLEX)
struct conjugate
{
  __host__ __device__
  PetscScalar operator()(PetscScalar x)
  {
    return PetscConj(x);
  }
};

PetscErrorCode ConjugateCudaArray(PetscScalar *a,PetscInt n)
{
  cudaError_t                     cerr;
  thrust::device_ptr<PetscScalar> ptr;

  PetscFunctionBegin;
  try {
    ptr = thrust::device_pointer_cast(a);
    thrust::transform(ptr,ptr+n,ptr,conjugate());
    cerr = WaitForGPU();CHKERRCUDA(cerr);
  } catch (char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s",ex);
  }
  PetscFunctionReturn(0);
}
#endif
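/*
   Why the extra conjugation: in complex arithmetic the dot products below are
   computed as a row vector w := x'*A via a 1-by-k gemm, whose entries are

      w_j = sum_i conj(x_i)*A(i,j) = conj((A'*x)_j),

   so the desired y := A'*x is obtained by conjugating the gemm result in
   place with ConjugateCudaArray.
*/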
/*
    y := A'*x computed as y' := x'*A
*/
PetscErrorCode BVDotVec_Svec_CUDA(BV X,Vec y,PetscScalar *q)
{
  PetscErrorCode    ierr;
  BV_SVEC           *x = (BV_SVEC*)X->data;
  const PetscScalar *d_A,*d_x,*d_px,*d_py;
  PetscScalar       *d_work,szero=0.0,sone=1.0,*qq=q;
  PetscBLASInt      n,k,one=1;
  PetscMPIInt       len;
  Vec               z = y;
  cublasStatus_t    cberr;
  cublasHandle_t    cublasv2handle;
  cudaError_t       cerr;

  PetscFunctionBegin;
  ierr = PetscBLASIntCast(X->n,&n);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(X->k-X->l,&k);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  if (X->matrix) {
    ierr = BV_IPMatMult(X,y);CHKERRQ(ierr);
    z = X->Bx;
  }
  ierr = VecCUDAGetArrayRead(x->v,&d_px);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(z,&d_py);CHKERRQ(ierr);
  if (!q) {
    ierr = VecCUDAGetArrayWrite(X->buffer,&d_work);CHKERRQ(ierr);
  } else {
    cerr = cudaMalloc((void**)&d_work,k*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
  d_A = d_px+(X->nc+X->l)*X->n;
  d_x = d_py;
  if (x->mpi) {
    ierr = BVAllocateWork_Private(X,k);CHKERRQ(ierr);
    if (n) {
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if defined(PETSC_USE_COMPLEX)
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
      ierr = ConjugateCudaArray(d_work,k);CHKERRQ(ierr);
#else
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
#endif
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      cerr = cudaMemcpy(X->work,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      ierr = PetscLogGpuToCpu(k*sizeof(PetscScalar));CHKERRQ(ierr);
    } else {
      ierr = PetscArrayzero(X->work,k);CHKERRQ(ierr);
    }
    if (!q) {
      ierr = VecCUDARestoreArrayWrite(X->buffer,&d_work);CHKERRQ(ierr);
      ierr = VecGetArray(X->buffer,&qq);CHKERRQ(ierr);
    } else {
      cerr = cudaFree(d_work);CHKERRCUDA(cerr);
    }
    ierr = PetscMPIIntCast(k,&len);CHKERRQ(ierr);
    ierr = MPI_Allreduce(X->work,qq,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X));CHKERRQ(ierr);
    if (!q) { ierr = VecRestoreArray(X->buffer,&qq);CHKERRQ(ierr); }
  } else {
    if (n) {
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if defined(PETSC_USE_COMPLEX)
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
      ierr = ConjugateCudaArray(d_work,k);CHKERRQ(ierr);
#else
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
#endif
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    }
    if (!q) {
      ierr = VecCUDARestoreArrayWrite(X->buffer,&d_work);CHKERRQ(ierr);
    } else {
      cerr = cudaMemcpy(q,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      ierr = PetscLogGpuToCpu(k*sizeof(PetscScalar));CHKERRQ(ierr);
      cerr = cudaFree(d_work);CHKERRCUDA(cerr);
    }
  }
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = VecCUDARestoreArrayRead(z,&d_py);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(x->v,&d_px);CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*n*k);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
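/*
   As in BVDot_Svec_CUDA, the parallel branch reduces the local products with
   MPI_Allreduce; when q == NULL the reduced coefficients end up in X->buffer
   instead of a caller-provided array.
*/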
/*
    y := A'*x computed as y' := x'*A
*/
PetscErrorCode BVDotVec_Local_Svec_CUDA(BV X,Vec y,PetscScalar *m)
{
  PetscErrorCode    ierr;
  BV_SVEC           *x = (BV_SVEC*)X->data;
  const PetscScalar *d_A,*d_x,*d_px,*d_py;
  PetscScalar       *d_y,szero=0.0,sone=1.0;
  PetscBLASInt      n,k,one=1;
  Vec               z = y;
  cublasStatus_t    cberr;
  cublasHandle_t    cublasv2handle;
  cudaError_t       cerr;

  PetscFunctionBegin;
  ierr = PetscBLASIntCast(X->n,&n);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(X->k-X->l,&k);CHKERRQ(ierr);
  if (X->matrix) {
    ierr = BV_IPMatMult(X,y);CHKERRQ(ierr);
    z = X->Bx;
  }
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(x->v,&d_px);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(z,&d_py);CHKERRQ(ierr);
  d_A = d_px+(X->nc+X->l)*X->n;
  d_x = d_py;
  if (n) {
    cerr = cudaMalloc((void**)&d_y,k*sizeof(PetscScalar));CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if defined(PETSC_USE_COMPLEX)
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,n,&szero,d_y,one);CHKERRCUBLAS(cberr);
    ierr = ConjugateCudaArray(d_y,k);CHKERRQ(ierr);
#else
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,n,&szero,d_y,one);CHKERRCUBLAS(cberr);
#endif
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    cerr = cudaMemcpy(m,d_y,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(k*sizeof(PetscScalar));CHKERRQ(ierr);
    cerr = cudaFree(d_y);CHKERRCUDA(cerr);
  }
  ierr = VecCUDARestoreArrayRead(z,&d_py);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(x->v,&d_px);CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*n*k);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
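/*
   Local variant: same product, but the result array m holds only this
   process's contribution; any reduction across processes is left to the
   caller.
*/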
/*
    Scale n scalars
*/
PetscErrorCode BVScale_Svec_CUDA(BV bv,PetscInt j,PetscScalar alpha)
{
  PetscErrorCode ierr;
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscScalar    *d_array,*d_A;
  PetscBLASInt   n,one=1;
  cublasStatus_t cberr;
  cublasHandle_t cublasv2handle;
  cudaError_t    cerr;

  PetscFunctionBegin;
  ierr = VecCUDAGetArray(ctx->v,&d_array);CHKERRQ(ierr);
  if (j<0) {
    d_A = d_array+(bv->nc+bv->l)*bv->n;
    ierr = PetscBLASIntCast((bv->k-bv->l)*bv->n,&n);CHKERRQ(ierr);
  } else {
    d_A = d_array+(bv->nc+j)*bv->n;
    ierr = PetscBLASIntCast(bv->n,&n);CHKERRQ(ierr);
  }
  if (alpha == (PetscScalar)0.0) {
    cerr = cudaMemset(d_A,0,n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  } else if (alpha != (PetscScalar)1.0) {
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    cberr = cublasXscal(cublasv2handle,n,&alpha,d_A,one);CHKERRCUBLAS(cberr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = PetscLogGpuFlops(1.0*n);CHKERRQ(ierr);
  }
  ierr = VecCUDARestoreArray(ctx->v,&d_array);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
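/*
   j < 0 scales all active columns (l..k-1) at once, j >= 0 a single column;
   alpha == 0 is handled with cudaMemset and alpha == 1 is a no-op.
*/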
PetscErrorCode BVMatMult_Svec_CUDA(BV V,Mat A,BV W)
{
  PetscErrorCode    ierr;
  BV_SVEC           *v = (BV_SVEC*)V->data,*w = (BV_SVEC*)W->data;
  const PetscScalar *d_pv;
  PetscScalar       *d_pw;
  PetscInt          j;

  PetscFunctionBegin;
  ierr = VecCUDAGetArrayRead(v->v,&d_pv);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(w->v,&d_pw);CHKERRQ(ierr);
  for (j=0;j<V->k-V->l;j++) {
    ierr = VecCUDAPlaceArray(V->cv[1],(PetscScalar*)d_pv+(V->nc+V->l+j)*V->n);CHKERRQ(ierr);
    ierr = VecCUDAPlaceArray(W->cv[1],d_pw+(W->nc+W->l+j)*W->n);CHKERRQ(ierr);
    ierr = MatMult(A,V->cv[1],W->cv[1]);CHKERRQ(ierr);
    ierr = VecCUDAResetArray(V->cv[1]);CHKERRQ(ierr);
    ierr = VecCUDAResetArray(W->cv[1]);CHKERRQ(ierr);
  }
  ierr = VecCUDARestoreArrayRead(v->v,&d_pv);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(w->v,&d_pw);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
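/*
   MatMult is applied one column at a time: the work Vecs V->cv[1] and
   W->cv[1] have their device arrays temporarily pointed (VecCUDAPlaceArray)
   at column j of V and W, so no column data is ever copied.
*/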
PetscErrorCode BVCopy_Svec_CUDA(BV V,BV W)
{
  PetscErrorCode    ierr;
  BV_SVEC           *v = (BV_SVEC*)V->data,*w = (BV_SVEC*)W->data;
  const PetscScalar *d_pv,*d_pvc;
  PetscScalar       *d_pw,*d_pwc;
  cudaError_t       cerr;

  PetscFunctionBegin;
  ierr = VecCUDAGetArrayRead(v->v,&d_pv);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(w->v,&d_pw);CHKERRQ(ierr);
  d_pvc = d_pv+(V->nc+V->l)*V->n;
  d_pwc = d_pw+(W->nc+W->l)*W->n;
  cerr = cudaMemcpy(d_pwc,d_pvc,(V->k-V->l)*V->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
  ierr = VecCUDARestoreArrayRead(v->v,&d_pv);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(w->v,&d_pw);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
PetscErrorCode BVCopyColumn_Svec_CUDA(BV V,PetscInt j,PetscInt i)
{
  PetscErrorCode ierr;
  BV_SVEC        *v = (BV_SVEC*)V->data;
  PetscScalar    *d_pv;
  cudaError_t    cerr;

  PetscFunctionBegin;
  ierr = VecCUDAGetArray(v->v,&d_pv);CHKERRQ(ierr);
  cerr = cudaMemcpy(d_pv+(V->nc+i)*V->n,d_pv+(V->nc+j)*V->n,V->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
  ierr = VecCUDARestoreArray(v->v,&d_pv);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
PetscErrorCode BVResize_Svec_CUDA(BV bv,PetscInt m,PetscBool copy)
{
  PetscErrorCode    ierr;
  BV_SVEC           *ctx = (BV_SVEC*)bv->data;
  const PetscScalar *d_pv;
  PetscScalar       *d_pnew,*d_ptr;
  PetscInt          bs,lsplit;
  Vec               vnew,vpar;
  char              str[50];
  cudaError_t       cerr;
  BV                parent;

  PetscFunctionBegin;
  if (bv->issplit==2) {
    parent = bv->splitparent;
    lsplit = parent->lsplit;
    vpar = ((BV_SVEC*)parent->data)->v;
    ierr = VecCUDAResetArray(ctx->v);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(vpar,&d_ptr);CHKERRQ(ierr);
    ierr = VecCUDAPlaceArray(ctx->v,d_ptr+lsplit*bv->n);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(vpar,&d_ptr);CHKERRQ(ierr);
  } else if (!bv->issplit) {
    ierr = VecGetBlockSize(bv->t,&bs);CHKERRQ(ierr);
    ierr = VecCreate(PetscObjectComm((PetscObject)bv->t),&vnew);CHKERRQ(ierr);
    ierr = VecSetType(vnew,((PetscObject)bv->t)->type_name);CHKERRQ(ierr);
    ierr = VecSetSizes(vnew,m*bv->n,PETSC_DECIDE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(vnew,bs);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)bv,(PetscObject)vnew);CHKERRQ(ierr);
    if (((PetscObject)bv)->name) {
      ierr = PetscSNPrintf(str,50,"%s_0",((PetscObject)bv)->name);CHKERRQ(ierr);
      ierr = PetscObjectSetName((PetscObject)vnew,str);CHKERRQ(ierr);
    }
    if (copy) {
      ierr = VecCUDAGetArrayRead(ctx->v,&d_pv);CHKERRQ(ierr);
      ierr = VecCUDAGetArrayWrite(vnew,&d_pnew);CHKERRQ(ierr);
      cerr = cudaMemcpy(d_pnew,d_pv,PetscMin(m,bv->m)*bv->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
      ierr = VecCUDARestoreArrayRead(ctx->v,&d_pv);CHKERRQ(ierr);
      ierr = VecCUDARestoreArrayWrite(vnew,&d_pnew);CHKERRQ(ierr);
    }
    ierr = VecDestroy(&ctx->v);CHKERRQ(ierr);
    ctx->v = vnew;
  }
  PetscFunctionReturn(0);
}
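/*
   For the right half of a split BV (issplit==2) no storage is allocated: the
   Vec is simply re-pointed into the parent's device array at offset
   lsplit*bv->n. A regular BV gets a freshly created Vec, with the old
   columns optionally copied on the GPU.
*/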
PetscErrorCode BVGetColumn_Svec_CUDA(BV bv,PetscInt j,Vec *v)
{
  PetscErrorCode ierr;
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscScalar    *d_pv;
  PetscInt       l;

  PetscFunctionBegin;
  l = BVAvailableVec;
  ierr = VecCUDAGetArray(ctx->v,&d_pv);CHKERRQ(ierr);
  ierr = VecCUDAPlaceArray(bv->cv[l],d_pv+(bv->nc+j)*bv->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
PetscErrorCode BVRestoreColumn_Svec_CUDA(BV bv,PetscInt j,Vec *v)
{
  PetscErrorCode ierr;
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscInt       l;

  PetscFunctionBegin;
  l = (j==bv->ci[0])? 0: 1;
  ierr = VecCUDAResetArray(bv->cv[l]);CHKERRQ(ierr);
  ierr = VecCUDARestoreArray(ctx->v,NULL);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
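/*
   BVGetColumn/BVRestoreColumn work on the pair of container Vecs
   bv->cv[0..1]: BVAvailableVec picks a free one on Get, and on Restore the
   index is recovered from the cached column ids bv->ci[].
*/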
PetscErrorCode BVRestoreSplit_Svec_CUDA(BV bv,BV *L,BV *R)
{
  PetscErrorCode    ierr;
  Vec               v;
  const PetscScalar *d_pv;
  PetscObjectState  lstate,rstate;
  PetscBool         change=PETSC_FALSE;

  PetscFunctionBegin;
  /* force sync flag to PETSC_CUDA_BOTH */
  if (L) {
    ierr = PetscObjectStateGet((PetscObject)*L,&lstate);CHKERRQ(ierr);
    if (lstate != bv->lstate) {
      v = ((BV_SVEC*)bv->L->data)->v;
      ierr = VecCUDAGetArrayRead(v,&d_pv);CHKERRQ(ierr);
      ierr = VecCUDARestoreArrayRead(v,&d_pv);CHKERRQ(ierr);
      change = PETSC_TRUE;
    }
  }
  if (R) {
    ierr = PetscObjectStateGet((PetscObject)*R,&rstate);CHKERRQ(ierr);
    if (rstate != bv->rstate) {
      v = ((BV_SVEC*)bv->R->data)->v;
      ierr = VecCUDAGetArrayRead(v,&d_pv);CHKERRQ(ierr);
      ierr = VecCUDARestoreArrayRead(v,&d_pv);CHKERRQ(ierr);
      change = PETSC_TRUE;
    }
  }
  if (change) {
    v = ((BV_SVEC*)bv->data)->v;
    ierr = VecCUDAGetArray(v,(PetscScalar**)&d_pv);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(v,(PetscScalar**)&d_pv);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
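/*
   Minimal usage sketch (illustrative, not part of this file; n, k, alpha,
   beta, y and q are placeholders): these kernels are dispatched automatically
   when a BV of type BVSVEC is created from a CUDA template vector, e.g.

      Vec t;
      BV  bv;
      VecCreateMPICUDA(PETSC_COMM_WORLD,n,PETSC_DECIDE,&t);
      BVCreate(PETSC_COMM_WORLD,&bv);
      BVSetSizesFromVec(bv,t,k);
      BVSetType(bv,BVSVEC);              -- single-Vec storage
      BVMultVec(bv,alpha,beta,y,q);      -- ends up in BVMultVec_Svec_CUDA

   The Vec y must also be of a CUDA type for the GPU path to be taken.
*/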