#include <inttypes.h>
#include <math.h>
+#ifdef _OPENMP
+ #include <omp.h>
+#endif
// Forward definition.
void FNAME(D3Q19Kernel)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd);
int tmp = strtol(params->KernelArgs[++i], NULL, 0);
- if (tmp <= 0) {
- printf("ERROR: blocking parameter must be > 0.\n");
+ if (tmp < 0) {
+ printf("ERROR: blocking parameter must be >= 0.\n");
exit(1);
}
int tmp = strtol(params->KernelArgs[++i], NULL, 0);
- if (tmp <= 0) {
- printf("ERROR: blocking parameter must be > 0.\n");
+ if (tmp < 0) {
+ printf("ERROR: blocking parameter must be >= 0.\n");
exit(1);
}
int tmp = strtol(params->KernelArgs[++i], NULL, 0);
- if (tmp <= 0) {
- printf("ERROR: blocking parameter must be > 0.\n");
+ if (tmp < 0) {
+ printf("ERROR: blocking parameter must be >= 0.\n");
exit(1);
}
int tmp = strtol(params->KernelArgs[++i], NULL, 0);
- if (tmp <= 0) {
- printf("ERROR: blocking parameter must be > 0.\n");
+ if (tmp < 0) {
+ printf("ERROR: blocking parameter must be >= 0.\n");
exit(1);
}
}
-void FNAME(D3Q19BlkInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
+static void D3Q19InternalInit(int blocking, LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
{
KernelDataEx * kdex = NULL;
MemAlloc((void **)&kdex, sizeof(KernelDataEx));
PdfT * pdfs[2];
- ParseParameters(params, blk);
+ if (blocking) {
+ ParseParameters(params, blk);
+ }
+ else {
+ if (params->nKernelArgs != 0) {
+ printf("ERROR: unknown kernel parameter.\n");
+ printf("This kernels accepts no parameters.\n");
+ exit(1);
+ }
+ }
+
if (blk[0] == 0) blk[0] = gX;
if (blk[1] == 0) blk[1] = gY;
nCells, 2 * sizeof(PdfT) * nCells * N_D3Q19,
2 * sizeof(PdfT) * nCells * N_D3Q19 / 1024.0 / 1024.0);
- MemAlloc((void **)&pdfs[0], sizeof(PdfT) * nCells * N_D3Q19);
- MemAlloc((void **)&pdfs[1], sizeof(PdfT) * nCells * N_D3Q19);
+ // MemAlloc((void **)&pdfs[0], sizeof(PdfT) * nCells * N_D3Q19);
+ // MemAlloc((void **)&pdfs[1], sizeof(PdfT) * nCells * N_D3Q19);
+ MemAllocAligned((void **)&pdfs[0], sizeof(PdfT) * nCells * N_D3Q19, 2 * 1024 * 1024);
+ MemAllocAligned((void **)&pdfs[1], sizeof(PdfT) * nCells * N_D3Q19, 2 * 1024 * 1024);
+
+ printf("# pdfs[0] = %p\n", pdfs[0]);
+ printf("# pdfs[1] = %p\n", pdfs[1]);
kd->Pdfs[0] = pdfs[0];
kd->Pdfs[1] = pdfs[1];
// This depends on the chosen data layout.
// The structure of the loop should resemble the same "execution layout"
// as in the kernel!
-#ifdef _OPENMP
- #pragma omp parallel for collapse(3)
-#endif
- for (int bZ = 0; bZ < gZ; bZ += blk[2]) {
- for (int bY = 0; bY < gY; bY += blk[1]) {
- for (int bX = 0; bX < gX; bX += blk[0]) {
+ if (blocking) {
+ int nX = ld->Dims[0];
+ int nY = ld->Dims[1];
+ int nZ = ld->Dims[2];
- // Must do everything here, else it would break collapse.
- int eZ = MIN(bZ + blk[2], gZ);
- int eY = MIN(bY + blk[1], gY);
- int eX = MIN(bX + blk[0], gX);
+ int nThreads = 1;
- for (int z = bZ; z < eZ; ++z) {
- for (int y = bY; y < eY; ++y) {
- for (int x = bX; x < eX; ++x) {
+ #ifdef _OPENMP
+ nThreads = omp_get_max_threads();
+ #endif
- for (int d = 0; d < N_D3Q19; ++d) {
- pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = 1.0;
- pdfs[1][P_INDEX_5(gDims, x, y, z, d)] = 1.0;
- }
+ #ifdef _OPENMP
+ #pragma omp parallel for default(none) \
+ shared(pdfs, gDims, nX, nY, nZ, oX, oY, oZ, blk, nThreads)
+ #endif
+ for (int i = 0; i < nThreads; ++i) {
+
+ int threadStartX = nX / nThreads * i;
+ int threadEndX = nX / nThreads * (i + 1);
+ if (nX % nThreads > 0) {
+ if (nX % nThreads > i) {
+ threadStartX += i;
+ threadEndX += i + 1;
+ }
+ else {
+ threadStartX += nX % nThreads;
+ threadEndX += nX % nThreads;
+ }
+ }
+
+ for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) {
+ for (int bY = oY; bY < nY + oY; bY += blk[1]) {
+ for (int bZ = oZ; bZ < nZ + oZ; bZ += blk[2]) {
+
+ int eZ = MIN(bZ + blk[2], nZ + oZ);
+ int eY = MIN(bY + blk[1], nY + oY);
+ int eX = MIN(bX + blk[0], threadEndX + oX);
+
+ for (int x = bX; x < eX; ++x) {
+ for (int y = bY; y < eY; ++y) {
+ for (int z = bZ; z < eZ; ++z) {
+ for (int d = 0; d < N_D3Q19; ++d) {
+ pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = 1.0;
+ pdfs[1][P_INDEX_5(gDims, x, y, z, d)] = 1.0;
+ }
+ }
+ }
}
}
}
}
}
+
+ }
+ else {
+ #ifdef _OPENMP
+ #pragma omp parallel for collapse(3)
+ #endif
+ for (int x = 0; x < gX; ++x) {
+ for (int y = 0; y < gY; ++y) {
+ for (int z = 0; z < gZ; ++z) {
+ for (int d = 0; d < N_D3Q19; ++d) {
+ pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = 1.0;
+ pdfs[1][P_INDEX_5(gDims, x, y, z, d)] = 1.0;
+ }
+ }
+ }
+ }
}
// Initialize all PDFs to some standard value.
- for (int z = 0; z < gZ; ++z) {
+ for (int x = 0; x < gX; ++x) {
for (int y = 0; y < gY; ++y) {
- for (int x = 0; x < gX; ++x) {
+ for (int z = 0; z < gZ; ++z) {
for (int d = 0; d < N_D3Q19; ++d) {
pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = 0.0;
pdfs[1][P_INDEX_5(gDims, x, y, z, d)] = 0.0;
// TODO: apply blocking?
- for (int z = 0; z < lZ; ++z) {
+ for (int x = 0; x < lX; ++x) {
for (int y = 0; y < lY; ++y) {
- for (int x = 0; x < lX; ++x) {
+ for (int z = 0; z < lZ; ++z) {
if (ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) {
for (int d = 0; d < N_D3Q19; ++d) {
int srcIndex;
int dstIndex;
- for (int z = 0; z < lZ; ++z) {
+ for (int x = 0; x < lX; ++x) {
for (int y = 0; y < lY; ++y) {
- for (int x = 0; x < lX; ++x) {
+ for (int z = 0; z < lZ; ++z) {
if (ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) {
for (int d = 0; d < N_D3Q19; ++d) {
kd->BoundaryConditionsGetPdf = FNAME(BcGetPdf);
kd->BoundaryConditionsSetPdf = FNAME(BcSetPdf);
- kd->Kernel = FNAME(D3Q19BlkKernel);
+ if (blocking) {
+ kd->Kernel = FNAME(D3Q19BlkKernel);
+ }
+ else {
+ kd->Kernel = FNAME(D3Q19Kernel);
+ }
kd->DstPdfs = NULL;
kd->PdfsActive = kd->Pdfs[0];
return;
}
+
+void FNAME(D3Q19BlkInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
+{
+ D3Q19InternalInit(1, ld, kernelData, params);
+}
+
+
void FNAME(D3Q19BlkDeinit)(LatticeDesc * ld, KernelData ** kernelData)
{
MemFree((void **) & ((*kernelData)->Pdfs[0]));
void FNAME(D3Q19Init)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
{
- Parameters p;
-
- if (params->nKernelArgs != 0) {
- printf("ERROR: unknown kernel parameter.\n");
- printf("This kernels accepts no parameters.\n");
- exit(1);
- }
-
- // Setup an empty parameters structure.
- p.nArgs = params->nArgs;
- p.Args = params->Args;
- p.nKernelArgs = 0;
- p.KernelArgs = NULL;
-
- // Call init routine for blocking kernel and override the
- // kernel function to be called later on.
- FNAME(D3Q19BlkInit)(ld, kernelData, &p);
-
- (*kernelData)->Kernel = FNAME(D3Q19Kernel);
-
- return;
+ D3Q19InternalInit(0, ld, kernelData, params);
}