X-Git-Url: http://git.rrze.uni-erlangen.de/gitweb/?p=LbmBenchmarkKernelsPublic.git;a=blobdiff_plain;f=src%2FBenchKernelD3Q19Common.c;h=8f756493fe62d744c3cdee663dcd2bb5dad744f9;hp=e697bc8a573a2d5fb646a0686ed2157805c829f8;hb=e3f82424829ebb623343ce0092238f83b4a1b8c2;hpb=ecf590ae9bb13ba2b2f01c3bf7a53056a8b1467b diff --git a/src/BenchKernelD3Q19Common.c b/src/BenchKernelD3Q19Common.c index e697bc8..8f75649 100644 --- a/src/BenchKernelD3Q19Common.c +++ b/src/BenchKernelD3Q19Common.c @@ -32,6 +32,9 @@ #include #include +#ifdef _OPENMP + #include +#endif // Forward definition. void FNAME(D3Q19Kernel)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd); @@ -224,8 +227,8 @@ static void ParseParameters(Parameters * params, int * blk) int tmp = strtol(params->KernelArgs[++i], NULL, 0); - if (tmp <= 0) { - printf("ERROR: blocking parameter must be > 0.\n"); + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); exit(1); } @@ -236,8 +239,8 @@ static void ParseParameters(Parameters * params, int * blk) int tmp = strtol(params->KernelArgs[++i], NULL, 0); - if (tmp <= 0) { - printf("ERROR: blocking parameter must be > 0.\n"); + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); exit(1); } @@ -248,8 +251,8 @@ static void ParseParameters(Parameters * params, int * blk) int tmp = strtol(params->KernelArgs[++i], NULL, 0); - if (tmp <= 0) { - printf("ERROR: blocking parameter must be > 0.\n"); + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); exit(1); } @@ -260,8 +263,8 @@ static void ParseParameters(Parameters * params, int * blk) int tmp = strtol(params->KernelArgs[++i], NULL, 0); - if (tmp <= 0) { - printf("ERROR: blocking parameter must be > 0.\n"); + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); exit(1); } @@ -285,7 +288,7 @@ static void ParseParameters(Parameters * params, int * blk) } -void FNAME(D3Q19BlkInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params) +static void D3Q19InternalInit(int blocking, LatticeDesc * ld, KernelData ** kernelData, Parameters * params) { KernelDataEx * kdex = NULL; MemAlloc((void **)&kdex, sizeof(KernelDataEx)); @@ -332,7 +335,17 @@ void FNAME(D3Q19BlkInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters PdfT * pdfs[2]; - ParseParameters(params, blk); + if (blocking) { + ParseParameters(params, blk); + } + else { + if (params->nKernelArgs != 0) { + printf("ERROR: unknown kernel parameter.\n"); + printf("This kernels accepts no parameters.\n"); + exit(1); + } + } + if (blk[0] == 0) blk[0] = gX; if (blk[1] == 0) blk[1] = gY; @@ -348,8 +361,13 @@ void FNAME(D3Q19BlkInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters nCells, 2 * sizeof(PdfT) * nCells * N_D3Q19, 2 * sizeof(PdfT) * nCells * N_D3Q19 / 1024.0 / 1024.0); - MemAlloc((void **)&pdfs[0], sizeof(PdfT) * nCells * N_D3Q19); - MemAlloc((void **)&pdfs[1], sizeof(PdfT) * nCells * N_D3Q19); + // MemAlloc((void **)&pdfs[0], sizeof(PdfT) * nCells * N_D3Q19); + // MemAlloc((void **)&pdfs[1], sizeof(PdfT) * nCells * N_D3Q19); + MemAllocAligned((void **)&pdfs[0], sizeof(PdfT) * nCells * N_D3Q19, 2 * 1024 * 1024); + MemAllocAligned((void **)&pdfs[1], sizeof(PdfT) * nCells * N_D3Q19, 2 * 1024 * 1024); + + printf("# pdfs[0] = %p\n", pdfs[0]); + printf("# pdfs[1] = %p\n", pdfs[1]); kd->Pdfs[0] = pdfs[0]; kd->Pdfs[1] = pdfs[1]; @@ -358,39 +376,82 @@ void FNAME(D3Q19BlkInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters // This depends on the chosen data layout. // The structure of the loop should resemble the same "execution layout" // as in the kernel! -#ifdef _OPENMP - #pragma omp parallel for collapse(3) -#endif - for (int bZ = 0; bZ < gZ; bZ += blk[2]) { - for (int bY = 0; bY < gY; bY += blk[1]) { - for (int bX = 0; bX < gX; bX += blk[0]) { + if (blocking) { + int nX = ld->Dims[0]; + int nY = ld->Dims[1]; + int nZ = ld->Dims[2]; - // Must do everything here, else it would break collapse. - int eZ = MIN(bZ + blk[2], gZ); - int eY = MIN(bY + blk[1], gY); - int eX = MIN(bX + blk[0], gX); + int nThreads = 1; - for (int z = bZ; z < eZ; ++z) { - for (int y = bY; y < eY; ++y) { - for (int x = bX; x < eX; ++x) { + #ifdef _OPENMP + nThreads = omp_get_max_threads(); + #endif - for (int d = 0; d < N_D3Q19; ++d) { - pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = 1.0; - pdfs[1][P_INDEX_5(gDims, x, y, z, d)] = 1.0; - } + #ifdef _OPENMP + #pragma omp parallel for default(none) \ + shared(pdfs, gDims, nX, nY, nZ, oX, oY, oZ, blk, nThreads) + #endif + for (int i = 0; i < nThreads; ++i) { + + int threadStartX = nX / nThreads * i; + int threadEndX = nX / nThreads * (i + 1); + if (nX % nThreads > 0) { + if (nX % nThreads > i) { + threadStartX += i; + threadEndX += i + 1; + } + else { + threadStartX += nX % nThreads; + threadEndX += nX % nThreads; + } + } + + for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) { + for (int bY = oY; bY < nY + oY; bY += blk[1]) { + for (int bZ = oZ; bZ < nZ + oZ; bZ += blk[2]) { + + int eZ = MIN(bZ + blk[2], nZ + oZ); + int eY = MIN(bY + blk[1], nY + oY); + int eX = MIN(bX + blk[0], threadEndX + oX); + + for (int x = bX; x < eX; ++x) { + for (int y = bY; y < eY; ++y) { + for (int z = bZ; z < eZ; ++z) { + for (int d = 0; d < N_D3Q19; ++d) { + pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = 1.0; + pdfs[1][P_INDEX_5(gDims, x, y, z, d)] = 1.0; + } + } + } } } } } } + + } + else { + #ifdef _OPENMP + #pragma omp parallel for collapse(3) + #endif + for (int x = 0; x < gX; ++x) { + for (int y = 0; y < gY; ++y) { + for (int z = 0; z < gZ; ++z) { + for (int d = 0; d < N_D3Q19; ++d) { + pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = 1.0; + pdfs[1][P_INDEX_5(gDims, x, y, z, d)] = 1.0; + } + } + } + } } // Initialize all PDFs to some standard value. - for (int z = 0; z < gZ; ++z) { + for (int x = 0; x < gX; ++x) { for (int y = 0; y < gY; ++y) { - for (int x = 0; x < gX; ++x) { + for (int z = 0; z < gZ; ++z) { for (int d = 0; d < N_D3Q19; ++d) { pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = 0.0; pdfs[1][P_INDEX_5(gDims, x, y, z, d)] = 0.0; @@ -417,9 +478,9 @@ void FNAME(D3Q19BlkInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters // TODO: apply blocking? - for (int z = 0; z < lZ; ++z) { + for (int x = 0; x < lX; ++x) { for (int y = 0; y < lY; ++y) { - for (int x = 0; x < lX; ++x) { + for (int z = 0; z < lZ; ++z) { if (ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) { for (int d = 0; d < N_D3Q19; ++d) { @@ -471,9 +532,9 @@ void FNAME(D3Q19BlkInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters int srcIndex; int dstIndex; - for (int z = 0; z < lZ; ++z) { + for (int x = 0; x < lX; ++x) { for (int y = 0; y < lY; ++y) { - for (int x = 0; x < lX; ++x) { + for (int z = 0; z < lZ; ++z) { if (ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) { for (int d = 0; d < N_D3Q19; ++d) { @@ -598,7 +659,12 @@ void FNAME(D3Q19BlkInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters kd->BoundaryConditionsGetPdf = FNAME(BcGetPdf); kd->BoundaryConditionsSetPdf = FNAME(BcSetPdf); - kd->Kernel = FNAME(D3Q19BlkKernel); + if (blocking) { + kd->Kernel = FNAME(D3Q19BlkKernel); + } + else { + kd->Kernel = FNAME(D3Q19Kernel); + } kd->DstPdfs = NULL; kd->PdfsActive = kd->Pdfs[0]; @@ -606,6 +672,13 @@ void FNAME(D3Q19BlkInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters return; } + +void FNAME(D3Q19BlkInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params) +{ + D3Q19InternalInit(1, ld, kernelData, params); +} + + void FNAME(D3Q19BlkDeinit)(LatticeDesc * ld, KernelData ** kernelData) { MemFree((void **) & ((*kernelData)->Pdfs[0])); @@ -624,27 +697,7 @@ void FNAME(D3Q19BlkDeinit)(LatticeDesc * ld, KernelData ** kernelData) void FNAME(D3Q19Init)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params) { - Parameters p; - - if (params->nKernelArgs != 0) { - printf("ERROR: unknown kernel parameter.\n"); - printf("This kernels accepts no parameters.\n"); - exit(1); - } - - // Setup an empty parameters structure. - p.nArgs = params->nArgs; - p.Args = params->Args; - p.nKernelArgs = 0; - p.KernelArgs = NULL; - - // Call init routine for blocking kernel and override the - // kernel function to be called later on. - FNAME(D3Q19BlkInit)(ld, kernelData, &p); - - (*kernelData)->Kernel = FNAME(D3Q19Kernel); - - return; + D3Q19InternalInit(0, ld, kernelData, params); }