Skip to content

Commit

Permalink
[feat] implement photon sharing on GPU, not working, stalls, #72
Browse files Browse the repository at this point in the history
  • Loading branch information
fangq committed Sep 24, 2024
1 parent 2a7e9dd commit 749f0eb
Show file tree
Hide file tree
Showing 10 changed files with 252 additions and 160 deletions.
4 changes: 2 additions & 2 deletions mmclab/example/demo_photon_sharing.m
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
cfg.tstart = 0;
cfg.tend = 5e-9;
cfg.tstep = 1e-10;
cfg.gpuid = -1;
cfg.gpuid = 1;
cfg.method = 'elem';
cfg.debuglevel = 'TP';

Expand All @@ -30,7 +30,7 @@
pat(16:25, :, 5) = 1;
pat(:, 16:25, 5) = 1; % pattern with a bright cross

cfg.srcpattern = pat;
cfg.srcpattern = permute(pat, [3 1 2]);

%% run the simulation

Expand Down
4 changes: 2 additions & 2 deletions mmclab/mmclab.m
Original file line number Diff line number Diff line change
Expand Up @@ -105,8 +105,8 @@
% by srcpos, srcpos+srcparam1(1:3) and srcpos+srcparam2(1:3)
% 'pattern' - a 3D quadrilateral pattern illumination, same as above, except
% srcparam1(4) and srcparam2(4) specify the pattern array x/y dimensions,
% and srcpattern is a floating-point pattern array, with values between [0-1].
% if cfg.srcnum>1, srcpattern must be a floating-point array with
% and srcpattern is a double-precision pattern array, with values between [0-1].
% if cfg.srcnum>1, srcpattern must be a 3-D double-precision array with
% a dimension of [srcnum srcparam1(4) srcparam2(4)]
% Example: <demo_photon_sharing.m>
% 'fourier' - spatial frequency domain source, similar to 'planar', except
Expand Down
31 changes: 22 additions & 9 deletions src/mmc_cl_host.c
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
cl_mem* gweight, *gdref, *gdetphoton, *gseed, *genergy, *greporter, *gdebugdata; /*read-write buffers*/
cl_mem* gprogress = NULL, *gdetected = NULL, *gphotonseed = NULL; /*write-only buffers*/

cl_uint meshlen = ((cfg->method == rtBLBadouelGrid) ? cfg->crop0.z : mesh->ne) << cfg->nbuffer; // use 4 copies to reduce racing
cl_uint meshlen = ((cfg->method == rtBLBadouelGrid) ? cfg->crop0.z : mesh->ne) * cfg->srcnum;
cfg->crop0.w = meshlen * cfg->maxgate; // offset for the second buffer

cl_float* field, *dref = NULL;
Expand All @@ -103,6 +103,8 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
char opt[MAX_PATH_LENGTH + 1] = {'\0'};
cl_uint detreclen = (2 + ((cfg->ismomentum) > 0)) * mesh->prop + (cfg->issaveexit > 0) * 6 + 1;
cl_uint hostdetreclen = detreclen + 1;
int sharedmemsize = 0;

GPUInfo* gpu = NULL;
float4* propdet;

Expand All @@ -122,7 +124,7 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
mesh->nn, mesh->ne, mesh->nf, {{mesh->nmin.x, mesh->nmin.y, mesh->nmin.z}}, cfg->nout,
cfg->roulettesize, cfg->srcnum, {{cfg->crop0.x, cfg->crop0.y, cfg->crop0.z, cfg->crop0.w}},
mesh->srcelemlen, {{cfg->bary0.x, cfg->bary0.y, cfg->bary0.z, cfg->bary0.w}},
cfg->e0, cfg->isextdet, meshlen, cfg->nbuffer, (mesh->prop + 1 + cfg->isextdet) + cfg->detnum,
cfg->e0, cfg->isextdet, meshlen, (mesh->prop + 1 + cfg->isextdet) + cfg->detnum,
(MIN((MAX_PROP - param.maxpropdet), ((mesh->ne) << 2)) >> 2), /*max count of elem normal data in const mem*/
cfg->issaveseed, cfg->seed, cfg->maxjumpdebug
};
Expand All @@ -138,6 +140,16 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
mcx_error(-99, (char*)("Unable to find devices!"), __FILE__, __LINE__);
}

if (cfg->issavedet) {
sharedmemsize = sizeof(cl_float) * detreclen;
}

param.reclen = sharedmemsize / sizeof(float);

if (cfg->srctype == stPattern && cfg->srcnum > 1) {
sharedmemsize += sizeof(cl_float) * cfg->srcnum;
}

cl_context_properties cps[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0};

/* Use NULL for backward compatibility */
Expand Down Expand Up @@ -324,9 +336,9 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
OCL_ASSERT(((greporter[i] = clCreateBuffer(mcxcontext, RW_MEM, sizeof(MCXReporter), &reporter, &status), status)));

if (cfg->srctype == MCX_SRC_PATTERN) {
OCL_ASSERT(((gsrcpattern[i] = clCreateBuffer(mcxcontext, RO_MEM, sizeof(float) * (int)(cfg->srcparam1.w * cfg->srcparam2.w), cfg->srcpattern, &status), status)));
OCL_ASSERT(((gsrcpattern[i] = clCreateBuffer(mcxcontext, RO_MEM, sizeof(float) * (int)(cfg->srcparam1.w * cfg->srcparam2.w * cfg->srcnum), cfg->srcpattern, &status), status)));
} else if (cfg->srctype == MCX_SRC_PATTERN3D) {
OCL_ASSERT(((gsrcpattern[i] = clCreateBuffer(mcxcontext, RO_MEM, sizeof(float) * (int)(cfg->srcparam1.x * cfg->srcparam1.y * cfg->srcparam1.z), cfg->srcpattern, &status), status)));
OCL_ASSERT(((gsrcpattern[i] = clCreateBuffer(mcxcontext, RO_MEM, sizeof(float) * (int)(cfg->srcparam1.x * cfg->srcparam1.y * cfg->srcparam1.z * cfg->srcnum), cfg->srcpattern, &status), status)));
} else {
gsrcpattern[i] = NULL;
}
Expand Down Expand Up @@ -417,6 +429,10 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
sprintf(opt + strlen(opt), " -DUSE_BLBADOUEL");
}

if (cfg->srctype == stPattern && cfg->srcnum > 1) {
sprintf(opt + strlen(opt), " -DUSE_PHOTON_SHARING");
}

if (gpu[0].vendor == dvNVIDIA) {
sprintf(opt + strlen(opt), " -DUSE_NVIDIA_GPU");
}
Expand Down Expand Up @@ -497,7 +513,7 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
OCL_ASSERT((clSetKernelArg(mcxkernel[i], 0, sizeof(cl_uint), (void*)&threadphoton)));
OCL_ASSERT((clSetKernelArg(mcxkernel[i], 1, sizeof(cl_uint), (void*)&oddphotons)));
//OCL_ASSERT((clSetKernelArg(mcxkernel[i], 2, sizeof(cl_mem), (void*)(gparam+i))));
OCL_ASSERT((clSetKernelArg(mcxkernel[i], 3, cfg->issavedet ? sizeof(cl_float) * ((int)gpu[i].autoblock)*detreclen : sizeof(int), NULL)));
OCL_ASSERT((clSetKernelArg(mcxkernel[i], 3, sharedmemsize * (int)gpu[i].autothread, NULL)));
OCL_ASSERT((clSetKernelArg(mcxkernel[i], 4, sizeof(cl_mem), (void*)(gproperty + i))));
OCL_ASSERT((clSetKernelArg(mcxkernel[i], 5, sizeof(cl_mem), (void*)(gnode + i))));
OCL_ASSERT((clSetKernelArg(mcxkernel[i], 6, sizeof(cl_mem), (void*)(gelem + i))));
Expand Down Expand Up @@ -708,7 +724,7 @@ is more than what your have specified (%d), please use the -H option to specify
mcx_fflush(cfg->flog);

for (i = 0; i < fieldlen; i++) { //accumulate field, can be done in the GPU
field[(i >> cfg->nbuffer)] += rawfield[i] + rawfield[i + fieldlen]; //+rawfield[i+fieldlen];
field[i] += rawfield[i] + rawfield[i + fieldlen]; //+rawfield[i+fieldlen];
}

free(rawfield);
Expand All @@ -732,9 +748,6 @@ is more than what your have specified (%d), please use the -H option to specify
}// iteration
}// time gates

fieldlen = (fieldlen >> cfg->nbuffer);
field = realloc(field, sizeof(field[0]) * fieldlen);

if (cfg->exportfield) {
if (cfg->basisorder == 0 || cfg->method == rtBLBadouelGrid) {
for (i = 0; i < fieldlen; i++) {
Expand Down
1 change: 0 additions & 1 deletion src/mmc_cl_host.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,6 @@ typedef struct PRE_ALIGN(32) GPU_mcconfig {
cl_int e0;
cl_int isextdet;
cl_int framelen;
cl_uint nbuffer;
cl_uint maxpropdet;
cl_uint normbuf;
cl_int issaveseed;
Expand Down
Loading

0 comments on commit 749f0eb

Please sign in to comment.