Actual source code: mpicuda.cu

/*
   This file contains routines for parallel vector operations.
 */
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX

#include <petscconf.h>
#include <../src/vec/vec/impls/mpi/pvecimpl.h>
#include <petsc/private/cudavecimpl.h>

/*MC
   VECCUDA - VECCUDA = "cuda" - A VECSEQCUDA on a single-process communicator, and VECMPICUDA otherwise.

   Options Database Keys:
. -vec_type cuda - sets the vector type to VECCUDA during a call to VecSetFromOptions()

   Level: beginner

.seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateMPIWithArray(), VECSEQCUDA, VECMPICUDA, VECSTANDARD, VecType, VecCreateMPI(), VecSetPinnedMemoryMin()
M*/

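/*
   A minimal usage sketch (assuming a PETSc build configured with CUDA): VECCUDA is
   normally selected either through -vec_type cuda with VecSetFromOptions(), or
   explicitly with VecSetType(), after which VecCreate_CUDA() below dispatches to
   VECSEQCUDA or VECMPICUDA depending on the communicator size.

     Vec            v;
     PetscErrorCode ierr;

     ierr = VecCreate(PETSC_COMM_WORLD,&v);CHKERRQ(ierr);
     ierr = VecSetSizes(v,PETSC_DECIDE,100);CHKERRQ(ierr);
     ierr = VecSetType(v,VECCUDA);CHKERRQ(ierr);
     ierr = VecSet(v,1.0);CHKERRQ(ierr);
     ierr = VecDestroy(&v);CHKERRQ(ierr);
*/
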
PetscErrorCode VecDestroy_MPICUDA(Vec v)
{
  Vec_MPI        *vecmpi = (Vec_MPI*)v->data;
  Vec_CUDA       *veccuda;
  PetscErrorCode ierr;
  cudaError_t    err;

  PetscFunctionBegin;
  if (v->spptr) {
    veccuda = (Vec_CUDA*)v->spptr;
    if (veccuda->GPUarray_allocated) {
      err = cudaFree(veccuda->GPUarray_allocated);CHKERRCUDA(err);
      veccuda->GPUarray_allocated = NULL;
    }
    if (veccuda->stream) {
      err = cudaStreamDestroy(veccuda->stream);CHKERRCUDA(err);
    }
    if (v->pinned_memory) {
      /* the host array was allocated as pinned memory, so it must be freed through the CUDA host allocator */
      ierr = PetscMallocSetCUDAHost();CHKERRQ(ierr);
      ierr = PetscFree(vecmpi->array_allocated);CHKERRQ(ierr);
      ierr = PetscMallocResetCUDAHost();CHKERRQ(ierr);
      v->pinned_memory = PETSC_FALSE;
    }
    ierr = PetscFree(v->spptr);CHKERRQ(ierr);
  }
  ierr = VecDestroy_MPI(v);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode VecNorm_MPICUDA(Vec xin,NormType type,PetscReal *z)
{
  PetscReal      sum,work = 0.0;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (type == NORM_2 || type == NORM_FROBENIUS) {
    ierr = VecNorm_SeqCUDA(xin,NORM_2,&work);CHKERRQ(ierr);
    work *= work;
    ierr = MPIU_Allreduce(&work,&sum,1,MPIU_REAL,MPIU_SUM,PetscObjectComm((PetscObject)xin));CHKERRMPI(ierr);
    *z = PetscSqrtReal(sum);
  } else if (type == NORM_1) {
    /* Find the local part */
    ierr = VecNorm_SeqCUDA(xin,NORM_1,&work);CHKERRQ(ierr);
    /* Find the global sum */
    ierr = MPIU_Allreduce(&work,z,1,MPIU_REAL,MPIU_SUM,PetscObjectComm((PetscObject)xin));CHKERRMPI(ierr);
  } else if (type == NORM_INFINITY) {
    /* Find the local max */
    ierr = VecNorm_SeqCUDA(xin,NORM_INFINITY,&work);CHKERRQ(ierr);
    /* Find the global max */
    ierr = MPIU_Allreduce(&work,z,1,MPIU_REAL,MPIU_MAX,PetscObjectComm((PetscObject)xin));CHKERRMPI(ierr);
  } else if (type == NORM_1_AND_2) {
    PetscReal temp[2];
    ierr = VecNorm_SeqCUDA(xin,NORM_1,temp);CHKERRQ(ierr);
    ierr = VecNorm_SeqCUDA(xin,NORM_2,temp+1);CHKERRQ(ierr);
    temp[1] = temp[1]*temp[1];
    ierr = MPIU_Allreduce(temp,z,2,MPIU_REAL,MPIU_SUM,PetscObjectComm((PetscObject)xin));CHKERRMPI(ierr);
    z[1] = PetscSqrtReal(z[1]);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode VecDot_MPICUDA(Vec xin,Vec yin,PetscScalar *z)
{
  PetscScalar    sum,work;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = VecDot_SeqCUDA(xin,yin,&work);CHKERRQ(ierr);
  ierr = MPIU_Allreduce(&work,&sum,1,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)xin));CHKERRMPI(ierr);
  *z = sum;
  PetscFunctionReturn(0);
}

PetscErrorCode VecTDot_MPICUDA(Vec xin,Vec yin,PetscScalar *z)
{
  PetscScalar    sum,work;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = VecTDot_SeqCUDA(xin,yin,&work);CHKERRQ(ierr);
  ierr = MPIU_Allreduce(&work,&sum,1,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)xin));CHKERRMPI(ierr);
  *z = sum;
  PetscFunctionReturn(0);
}

PetscErrorCode VecMDot_MPICUDA(Vec xin,PetscInt nv,const Vec y[],PetscScalar *z)
{
  PetscScalar    awork[128],*work = awork;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* use the stack buffer when possible, heap-allocate only for many vectors */
  if (nv > 128) {
    ierr = PetscMalloc1(nv,&work);CHKERRQ(ierr);
  }
  ierr = VecMDot_SeqCUDA(xin,nv,y,work);CHKERRQ(ierr);
  ierr = MPIU_Allreduce(work,z,nv,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)xin));CHKERRMPI(ierr);
  if (nv > 128) {
    ierr = PetscFree(work);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*MC
   VECMPICUDA - VECMPICUDA = "mpicuda" - The basic parallel vector, modified to use CUDA

   Options Database Keys:
. -vec_type mpicuda - sets the vector type to VECMPICUDA during a call to VecSetFromOptions()

   Level: beginner

.seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateMPIWithArray(), VECMPI, VecType, VecCreateMPI(), VecSetPinnedMemoryMin()
M*/

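/*
   A minimal sketch of creating a VECMPICUDA directly (assuming a CUDA-enabled
   build; the local length n is a placeholder chosen by the caller):

     Vec            v;
     PetscInt       n = 10;                 (local length on this rank)
     PetscErrorCode ierr;

     ierr = VecCreate(PETSC_COMM_WORLD,&v);CHKERRQ(ierr);
     ierr = VecSetSizes(v,n,PETSC_DETERMINE);CHKERRQ(ierr);
     ierr = VecSetType(v,VECMPICUDA);CHKERRQ(ierr);
*/
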
PetscErrorCode VecDuplicate_MPICUDA(Vec win,Vec *v)
{
  PetscErrorCode ierr;
  Vec_MPI        *vw,*w = (Vec_MPI*)win->data;
  PetscScalar    *array;

  PetscFunctionBegin;
  ierr = VecCreate(PetscObjectComm((PetscObject)win),v);CHKERRQ(ierr);
  ierr = PetscLayoutReference(win->map,&(*v)->map);CHKERRQ(ierr);

  ierr = VecCreate_MPICUDA_Private(*v,PETSC_TRUE,w->nghost,0);CHKERRQ(ierr);
  vw   = (Vec_MPI*)(*v)->data;
  ierr = PetscMemcpy((*v)->ops,win->ops,sizeof(struct _VecOps));CHKERRQ(ierr);

  /* save local representation of the parallel vector (and scatter) if it exists */
  if (w->localrep) {
    ierr = VecGetArray(*v,&array);CHKERRQ(ierr);
    ierr = VecCreateSeqWithArray(PETSC_COMM_SELF,1,win->map->n+w->nghost,array,&vw->localrep);CHKERRQ(ierr);
    ierr = PetscMemcpy(vw->localrep->ops,w->localrep->ops,sizeof(struct _VecOps));CHKERRQ(ierr);
    ierr = VecRestoreArray(*v,&array);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)*v,(PetscObject)vw->localrep);CHKERRQ(ierr);
    vw->localupdate = w->localupdate;
    if (vw->localupdate) {
      ierr = PetscObjectReference((PetscObject)vw->localupdate);CHKERRQ(ierr);
    }
  }

  /* New vector should inherit stashing property of parent */
  (*v)->stash.donotstash   = win->stash.donotstash;
  (*v)->stash.ignorenegidx = win->stash.ignorenegidx;

  /* change type_name appropriately */
  ierr = VecCUDAAllocateCheck(*v);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)(*v),VECMPICUDA);CHKERRQ(ierr);

  ierr = PetscObjectListDuplicate(((PetscObject)win)->olist,&((PetscObject)(*v))->olist);CHKERRQ(ierr);
  ierr = PetscFunctionListDuplicate(((PetscObject)win)->qlist,&((PetscObject)(*v))->qlist);CHKERRQ(ierr);
  (*v)->map->bs   = PetscAbs(win->map->bs);
  (*v)->bstash.bs = win->bstash.bs;
  PetscFunctionReturn(0);
}

PetscErrorCode VecDotNorm2_MPICUDA(Vec s,Vec t,PetscScalar *dp,PetscScalar *nm)
{
  PetscErrorCode ierr;
  PetscScalar    work[2],sum[2];

  PetscFunctionBegin;
  ierr = VecDotNorm2_SeqCUDA(s,t,work,work+1);CHKERRQ(ierr);
  ierr = MPIU_Allreduce(work,sum,2,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)s));CHKERRMPI(ierr);
  *dp = sum[0];
  *nm = sum[1];
  PetscFunctionReturn(0);
}

PetscErrorCode VecCreate_MPICUDA(Vec vv)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(vv->map);CHKERRQ(ierr);
  ierr = VecCUDAAllocateCheck(vv);CHKERRQ(ierr);
  ierr = VecCreate_MPICUDA_Private(vv,PETSC_FALSE,0,((Vec_CUDA*)vv->spptr)->GPUarray_allocated);CHKERRQ(ierr);
  ierr = VecCUDAAllocateCheckHost(vv);CHKERRQ(ierr);
  /* zero both the device and the host copies so they start out consistent */
  ierr = VecSet(vv,0.0);CHKERRQ(ierr);
  ierr = VecSet_Seq(vv,0.0);CHKERRQ(ierr);
  vv->offloadmask = PETSC_OFFLOAD_BOTH;
  PetscFunctionReturn(0);
}

PetscErrorCode VecCreate_CUDA(Vec v)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)v),&size);CHKERRMPI(ierr);
  if (size == 1) {
    ierr = VecSetType(v,VECSEQCUDA);CHKERRQ(ierr);
  } else {
    ierr = VecSetType(v,VECMPICUDA);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*@C
   VecCreateMPICUDAWithArray - Creates a parallel, array-style vector,
   where the user provides the GPU array space to store the vector values.

   Collective

   Input Parameters:
+  comm - the MPI communicator to use
.  bs - block size, same meaning as VecSetBlockSize()
.  n - local vector length, cannot be PETSC_DECIDE
.  N - global vector length (or PETSC_DECIDE to have it calculated)
-  array - the user provided GPU array to store the vector values

   Output Parameter:
.  vv - the vector

   Notes:
   Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
   same type as an existing vector.

   If the user-provided array is NULL, then VecCUDAPlaceArray() can be used
   at a later stage to SET the array for storing the vector values.

   PETSc does NOT free the array when the vector is destroyed via VecDestroy().
   The user should not free the array until the vector is destroyed.

   Level: intermediate

.seealso: VecCreateSeqCUDAWithArray(), VecCreateMPIWithArray(), VecCreateSeqWithArray(),
          VecCreate(), VecDuplicate(), VecDuplicateVecs(), VecCreateGhost(),
          VecCreateMPI(), VecCreateGhostWithArray(), VecPlaceArray()

@*/
PetscErrorCode VecCreateMPICUDAWithArray(MPI_Comm comm,PetscInt bs,PetscInt n,PetscInt N,const PetscScalar array[],Vec *vv)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (n == PETSC_DECIDE) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Must set local size of vector");
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = VecCreate(comm,vv);CHKERRQ(ierr);
  ierr = VecSetSizes(*vv,n,N);CHKERRQ(ierr);
  ierr = VecSetBlockSize(*vv,bs);CHKERRQ(ierr);
  ierr = VecCreate_MPICUDA_Private(*vv,PETSC_FALSE,0,array);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

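/*
   A usage sketch for VecCreateMPICUDAWithArray() (assuming a CUDA-enabled build;
   n is a caller-chosen local length). The device buffer must outlive the vector,
   since VecDestroy() does not free user-provided arrays:

     PetscScalar    *gpuarray;
     Vec            v;
     cudaError_t    err;
     PetscErrorCode ierr;

     err  = cudaMalloc((void**)&gpuarray,n*sizeof(PetscScalar));CHKERRCUDA(err);
     ierr = VecCreateMPICUDAWithArray(PETSC_COMM_WORLD,1,n,PETSC_DECIDE,gpuarray,&v);CHKERRQ(ierr);
     ierr = VecSet(v,1.0);CHKERRQ(ierr);
     ierr = VecDestroy(&v);CHKERRQ(ierr);
     err  = cudaFree(gpuarray);CHKERRCUDA(err);
*/
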
/*@C
   VecCreateMPICUDAWithArrays - Creates a parallel, array-style vector,
   where the user provides the CPU and/or GPU array space to store the vector values.

   Collective

   Input Parameters:
+  comm - the MPI communicator to use
.  bs - block size, same meaning as VecSetBlockSize()
.  n - local vector length, cannot be PETSC_DECIDE
.  N - global vector length (or PETSC_DECIDE to have it calculated)
.  cpuarray - the user provided CPU array to store the vector values
-  gpuarray - the user provided GPU array to store the vector values

   Output Parameter:
.  vv - the vector

   Notes:
   If both cpuarray and gpuarray are provided, the caller must ensure that
   the provided arrays have identical values.

   Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
   same type as an existing vector.

   PETSc does NOT free the provided arrays when the vector is destroyed via
   VecDestroy(). The user should not free the arrays until the vector is
   destroyed.

   Level: intermediate

.seealso: VecCreateSeqCUDAWithArrays(), VecCreateMPIWithArray(), VecCreateSeqWithArray(),
          VecCreate(), VecDuplicate(), VecDuplicateVecs(), VecCreateGhost(),
          VecCreateMPI(), VecCreateGhostWithArray(), VecCUDAPlaceArray(), VecPlaceArray(),
          VecCUDAAllocateCheckHost()
@*/
PetscErrorCode VecCreateMPICUDAWithArrays(MPI_Comm comm,PetscInt bs,PetscInt n,PetscInt N,const PetscScalar cpuarray[],const PetscScalar gpuarray[],Vec *vv)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = VecCreateMPICUDAWithArray(comm,bs,n,N,gpuarray,vv);CHKERRQ(ierr);

  /* record which copies of the data are valid based on which arrays were supplied */
  if (cpuarray && gpuarray) {
    Vec_MPI *s = (Vec_MPI*)((*vv)->data);
    s->array           = (PetscScalar*)cpuarray;
    (*vv)->offloadmask = PETSC_OFFLOAD_BOTH;
  } else if (cpuarray) {
    Vec_MPI *s = (Vec_MPI*)((*vv)->data);
    s->array           = (PetscScalar*)cpuarray;
    (*vv)->offloadmask = PETSC_OFFLOAD_CPU;
  } else if (gpuarray) {
    (*vv)->offloadmask = PETSC_OFFLOAD_GPU;
  } else {
    (*vv)->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }

  PetscFunctionReturn(0);
}

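/*
   A usage sketch for VecCreateMPICUDAWithArrays() (assuming a CUDA-enabled build;
   n is a caller-chosen local length). When both arrays are passed they must hold
   identical values, and the vector then starts with offloadmask PETSC_OFFLOAD_BOTH;
   here both buffers are zero-filled to satisfy that requirement:

     PetscScalar    *cpuarray,*gpuarray;
     Vec            v;
     cudaError_t    err;
     PetscErrorCode ierr;

     ierr = PetscCalloc1(n,&cpuarray);CHKERRQ(ierr);
     err  = cudaMalloc((void**)&gpuarray,n*sizeof(PetscScalar));CHKERRCUDA(err);
     err  = cudaMemset(gpuarray,0,n*sizeof(PetscScalar));CHKERRCUDA(err);
     ierr = VecCreateMPICUDAWithArrays(PETSC_COMM_WORLD,1,n,PETSC_DECIDE,cpuarray,gpuarray,&v);CHKERRQ(ierr);
     ierr = VecDestroy(&v);CHKERRQ(ierr);
     err  = cudaFree(gpuarray);CHKERRCUDA(err);
     ierr = PetscFree(cpuarray);CHKERRQ(ierr);
*/
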
PetscErrorCode VecMax_MPICUDA(Vec xin,PetscInt *idx,PetscReal *z)
{
  PetscErrorCode ierr;
  PetscReal      work;

  PetscFunctionBegin;
  /* Find the local max */
  ierr = VecMax_SeqCUDA(xin,idx,&work);CHKERRQ(ierr);
  if (!idx) {
    /* Find the global max */
    ierr = MPIU_Allreduce(&work,z,1,MPIU_REAL,MPIU_MAX,PetscObjectComm((PetscObject)xin));CHKERRMPI(ierr);
  } else {
    PetscReal work2[2],z2[2];
    PetscInt  rstart;

    rstart   = xin->map->rstart;
    work2[0] = work;
    work2[1] = *idx + rstart; /* convert the local index to a global index */
    ierr = MPIU_Allreduce(work2,z2,2,MPIU_REAL,MPIU_MAXINDEX_OP,PetscObjectComm((PetscObject)xin));CHKERRMPI(ierr);
    *z   = z2[0];
    *idx = (PetscInt)z2[1];
  }
  PetscFunctionReturn(0);
}

PetscErrorCode VecMin_MPICUDA(Vec xin,PetscInt *idx,PetscReal *z)
{
  PetscErrorCode ierr;
  PetscReal      work;

  PetscFunctionBegin;
  /* Find the local min */
  ierr = VecMin_SeqCUDA(xin,idx,&work);CHKERRQ(ierr);
  if (!idx) {
    /* Find the global min */
    ierr = MPIU_Allreduce(&work,z,1,MPIU_REAL,MPIU_MIN,PetscObjectComm((PetscObject)xin));CHKERRMPI(ierr);
  } else {
    PetscReal work2[2],z2[2];
    PetscInt  rstart;

    ierr = VecGetOwnershipRange(xin,&rstart,NULL);CHKERRQ(ierr);
    work2[0] = work;
    work2[1] = *idx + rstart; /* convert the local index to a global index */
    ierr = MPIU_Allreduce(work2,z2,2,MPIU_REAL,MPIU_MININDEX_OP,PetscObjectComm((PetscObject)xin));CHKERRMPI(ierr);
    *z   = z2[0];
    *idx = (PetscInt)z2[1];
  }
  PetscFunctionReturn(0);
}

PetscErrorCode VecBindToCPU_MPICUDA(Vec V,PetscBool pin)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  V->boundtocpu = pin;
  if (pin) {
    ierr = VecCUDACopyFromGPU(V);CHKERRQ(ierr);
    V->offloadmask = PETSC_OFFLOAD_CPU; /* since the CPU code will likely change values in the vector */
    V->ops->dotnorm2               = NULL;
    V->ops->waxpy                  = VecWAXPY_Seq;
    V->ops->dot                    = VecDot_MPI;
    V->ops->mdot                   = VecMDot_MPI;
    V->ops->tdot                   = VecTDot_MPI;
    V->ops->norm                   = VecNorm_MPI;
    V->ops->scale                  = VecScale_Seq;
    V->ops->copy                   = VecCopy_Seq;
    V->ops->set                    = VecSet_Seq;
    V->ops->swap                   = VecSwap_Seq;
    V->ops->axpy                   = VecAXPY_Seq;
    V->ops->axpby                  = VecAXPBY_Seq;
    V->ops->maxpy                  = VecMAXPY_Seq;
    V->ops->aypx                   = VecAYPX_Seq;
    V->ops->axpbypcz               = VecAXPBYPCZ_Seq;
    V->ops->pointwisemult          = VecPointwiseMult_Seq;
    V->ops->setrandom              = VecSetRandom_Seq;
    V->ops->placearray             = VecPlaceArray_Seq;
    V->ops->replacearray           = VecReplaceArray_SeqCUDA;
    V->ops->resetarray             = VecResetArray_Seq;
    V->ops->dot_local              = VecDot_Seq;
    V->ops->tdot_local             = VecTDot_Seq;
    V->ops->norm_local             = VecNorm_Seq;
    V->ops->mdot_local             = VecMDot_Seq;
    V->ops->pointwisedivide        = VecPointwiseDivide_Seq;
    V->ops->getlocalvector         = NULL;
    V->ops->restorelocalvector     = NULL;
    V->ops->getlocalvectorread     = NULL;
    V->ops->restorelocalvectorread = NULL;
    V->ops->getarraywrite          = NULL;
    V->ops->max                    = VecMax_MPI;
    V->ops->min                    = VecMin_MPI;
    /* default random number generator */
    ierr = PetscFree(V->defaultrandtype);CHKERRQ(ierr);
    ierr = PetscStrallocpy(PETSCRANDER48,&V->defaultrandtype);CHKERRQ(ierr);
  } else {
    V->ops->dotnorm2               = VecDotNorm2_MPICUDA;
    V->ops->waxpy                  = VecWAXPY_SeqCUDA;
    V->ops->duplicate              = VecDuplicate_MPICUDA;
    V->ops->dot                    = VecDot_MPICUDA;
    V->ops->mdot                   = VecMDot_MPICUDA;
    V->ops->tdot                   = VecTDot_MPICUDA;
    V->ops->norm                   = VecNorm_MPICUDA;
    V->ops->scale                  = VecScale_SeqCUDA;
    V->ops->copy                   = VecCopy_SeqCUDA;
    V->ops->set                    = VecSet_SeqCUDA;
    V->ops->swap                   = VecSwap_SeqCUDA;
    V->ops->axpy                   = VecAXPY_SeqCUDA;
    V->ops->axpby                  = VecAXPBY_SeqCUDA;
    V->ops->maxpy                  = VecMAXPY_SeqCUDA;
    V->ops->aypx                   = VecAYPX_SeqCUDA;
    V->ops->axpbypcz               = VecAXPBYPCZ_SeqCUDA;
    V->ops->pointwisemult          = VecPointwiseMult_SeqCUDA;
    V->ops->setrandom              = VecSetRandom_SeqCUDA;
    V->ops->placearray             = VecPlaceArray_SeqCUDA;
    V->ops->replacearray           = VecReplaceArray_SeqCUDA;
    V->ops->resetarray             = VecResetArray_SeqCUDA;
    V->ops->dot_local              = VecDot_SeqCUDA;
    V->ops->tdot_local             = VecTDot_SeqCUDA;
    V->ops->norm_local             = VecNorm_SeqCUDA;
    V->ops->mdot_local             = VecMDot_SeqCUDA;
    V->ops->destroy                = VecDestroy_MPICUDA;
    V->ops->pointwisedivide        = VecPointwiseDivide_SeqCUDA;
    V->ops->getlocalvector         = VecGetLocalVector_SeqCUDA;
    V->ops->restorelocalvector     = VecRestoreLocalVector_SeqCUDA;
    V->ops->getlocalvectorread     = VecGetLocalVector_SeqCUDA;
    V->ops->restorelocalvectorread = VecRestoreLocalVector_SeqCUDA;
    V->ops->getarraywrite          = VecGetArrayWrite_SeqCUDA;
    V->ops->getarray               = VecGetArray_SeqCUDA;
    V->ops->restorearray           = VecRestoreArray_SeqCUDA;
    V->ops->getarrayandmemtype     = VecGetArrayAndMemType_SeqCUDA;
    V->ops->restorearrayandmemtype = VecRestoreArrayAndMemType_SeqCUDA;
    V->ops->max                    = VecMax_MPICUDA;
    V->ops->min                    = VecMin_MPICUDA;
    /* default random number generator */
    ierr = PetscFree(V->defaultrandtype);CHKERRQ(ierr);
    ierr = PetscStrallocpy(PETSCCURAND,&V->defaultrandtype);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode VecCreate_MPICUDA_Private(Vec vv,PetscBool alloc,PetscInt nghost,const PetscScalar array[])
{
  PetscErrorCode ierr;
  Vec_CUDA       *veccuda;

  PetscFunctionBegin;
  ierr = VecCreate_MPI_Private(vv,PETSC_FALSE,0,0);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)vv,VECMPICUDA);CHKERRQ(ierr);

  ierr = VecBindToCPU_MPICUDA(vv,PETSC_FALSE);CHKERRQ(ierr);
  vv->ops->bindtocpu = VecBindToCPU_MPICUDA;

  /* Later, functions check for the Vec_CUDA structure existence, so do not create it without array */
  if (alloc && !array) {
    ierr = VecCUDAAllocateCheck(vv);CHKERRQ(ierr);
    ierr = VecCUDAAllocateCheckHost(vv);CHKERRQ(ierr);
    ierr = VecSet(vv,0.0);CHKERRQ(ierr);
    ierr = VecSet_Seq(vv,0.0);CHKERRQ(ierr);
    vv->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  if (array) {
    if (!vv->spptr) {
      PetscReal pinned_memory_min;
      PetscBool flag;
      /* Cannot use PetscNew() here because spptr is void* */
      ierr = PetscMalloc(sizeof(Vec_CUDA),&vv->spptr);CHKERRQ(ierr);
      veccuda = (Vec_CUDA*)vv->spptr;
      veccuda->stream = 0; /* using default stream */
      veccuda->GPUarray_allocated = 0;
      vv->minimum_bytes_pinned_memory = 0;

      /* Need to parse the command line here for the minimum size at which to use pinned memory
         for host allocations. Note: this same code is duplicated in VecCreate_SeqCUDA_Private()
         and VecCUDAAllocateCheck(). Is there a good way to avoid this? */
      ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)vv),((PetscObject)vv)->prefix,"VECCUDA Options","Vec");CHKERRQ(ierr);
      pinned_memory_min = vv->minimum_bytes_pinned_memory;
      ierr = PetscOptionsReal("-vec_pinned_memory_min","Minimum size (in bytes) for an allocation to use pinned memory on host","VecSetPinnedMemoryMin",pinned_memory_min,&pinned_memory_min,&flag);CHKERRQ(ierr);
      if (flag) vv->minimum_bytes_pinned_memory = pinned_memory_min;
      ierr = PetscOptionsEnd();CHKERRQ(ierr);
    }
    veccuda = (Vec_CUDA*)vv->spptr;
    veccuda->GPUarray = (PetscScalar*)array;
    vv->offloadmask = PETSC_OFFLOAD_GPU;
  }
  PetscFunctionReturn(0);
}