1 // --------------------------------------------------------------------------
4 // Markus Wittmann, 2016-2017
5 // RRZE, University of Erlangen-Nuremberg, Germany
6 // markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
9 // LSS, University of Erlangen-Nuremberg, Germany
11 // This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
13 // LbmBenchKernels is free software: you can redistribute it and/or modify
14 // it under the terms of the GNU General Public License as published by
15 // the Free Software Foundation, either version 3 of the License, or
16 // (at your option) any later version.
18 // LbmBenchKernels is distributed in the hope that it will be useful,
19 // but WITHOUT ANY WARRANTY; without even the implied warranty of
20 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 // GNU General Public License for more details.
23 // You should have received a copy of the GNU General Public License
24 // along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>.
26 // --------------------------------------------------------------------------
27 #include "BenchKernelD3Q19Common.h"
39 // Forward definition.
40 void FNAME(D3Q19Kernel)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd);
42 void FNAME(D3Q19BlkKernel)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd);
46 static void FNAME(BcGetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT * pdf)
49 Assert(kd->PdfsActive != NULL);
50 Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
56 Assert(x < kd->Dims[0]);
57 Assert(y < kd->Dims[1]);
58 Assert(z < kd->Dims[2]);
60 Assert(dir < N_D3Q19);
62 int oX = kd->Offsets[0];
63 int oY = kd->Offsets[1];
64 int oZ = kd->Offsets[2];
66 #ifdef PROP_MODEL_PUSH
71 int nx = x - D3Q19_X[dir];
72 int ny = y - D3Q19_Y[dir];
73 int nz = z - D3Q19_Z[dir];
76 #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
77 *pdf = kd->PdfsActive[I(nx + oX, ny + oY, nz + oZ, dir)];
83 static void FNAME(BcSetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT pdf)
86 Assert(kd->PdfsActive != NULL);
87 Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
91 Assert(x < kd->Dims[0]);
92 Assert(y < kd->Dims[1]);
93 Assert(z < kd->Dims[2]);
95 Assert(dir < N_D3Q19);
97 int oX = kd->Offsets[0];
98 int oY = kd->Offsets[1];
99 int oZ = kd->Offsets[2];
101 #ifdef PROP_MODEL_PUSH
105 #elif PROP_MODEL_PULL
106 int nx = x - D3Q19_X[dir];
107 int ny = y - D3Q19_Y[dir];
108 int nz = z - D3Q19_Z[dir];
111 #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
112 kd->PdfsActive[I(nx + oX, ny + oY, nz + oZ, dir)] = pdf;
120 static void FNAME(GetNode)(KernelData * kd, int x, int y, int z, PdfT * pdfs)
123 Assert(kd->PdfsActive != NULL);
124 Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
125 Assert(pdfs != NULL);
129 Assert(x < kd->Dims[0]);
130 Assert(y < kd->Dims[1]);
131 Assert(z < kd->Dims[2]);
133 int oX = kd->Offsets[0];
134 int oY = kd->Offsets[1];
135 int oZ = kd->Offsets[2];
138 #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
139 #ifdef PROP_MODEL_PUSH
140 #define X(name, idx, idxinv, _x, _y, _z) pdfs[idx] = kd->PdfsActive[I(x + oX, y + oY, z + oZ, idx)];
141 #elif PROP_MODEL_PULL
142 #define X(name, idx, idxinv, _x, _y, _z) pdfs[idx] = kd->PdfsActive[I(x + oX - (_x), y + oY - (_y), z + oZ - (_z), idx)];
150 for (int d = 0; d < 19; ++d) {
151 if (isnan(pdfs[d])) {
152 printf("%d %d %d %d nan! get node\n", x, y, z, d);
154 for (int d2 = 0; d2 < 19; ++d2) {
155 printf("%d: %e\n", d2, pdfs[d2]);
168 static void FNAME(SetNode)(KernelData * kd, int x, int y, int z, PdfT * pdfs)
171 Assert(kd->PdfsActive != NULL);
172 Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
173 Assert(pdfs != NULL);
178 Assert(x < kd->Dims[0]);
179 Assert(y < kd->Dims[1]);
180 Assert(z < kd->Dims[2]);
182 int oX = kd->Offsets[0];
183 int oY = kd->Offsets[1];
184 int oZ = kd->Offsets[2];
186 #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
187 #ifdef PROP_MODEL_PUSH
188 #define X(name, idx, idxinv, _x, _y, _z) kd->PdfsActive[I(x + oX, y + oY, z + oZ, idx)] = pdfs[idx];
189 #elif PROP_MODEL_PULL
190 #define X(name, idx, idxinv, _x, _y, _z) kd->PdfsActive[I(x + oX - (_x), y + oY - (_y), z + oZ - (_z), idx)] = pdfs[idx];
200 static void ParameterUsage()
202 printf("Kernel parameters:\n");
203 printf(" [-blk <n>] [-blk-[xyz] <n>]\n");
208 static void ParseParameters(Parameters * params, int * blk)
212 blk[0] = 0; blk[1] = 0; blk[2] = 0;
214 #define ARG_IS(param) (!strcmp(params->KernelArgs[i], param))
215 #define NEXT_ARG_PRESENT() \
217 if (i + 1 >= params->nKernelArgs) { \
218 printf("ERROR: argument %s requires a parameter.\n", params->KernelArgs[i]); \
224 for (int i = 0; i < params->nKernelArgs; ++i) {
225 if (ARG_IS("-blk") || ARG_IS("--blk")) {
228 int tmp = strtol(params->KernelArgs[++i], NULL, 0);
231 printf("ERROR: blocking parameter must be >= 0.\n");
235 blk[0] = blk[1] = blk[2] = tmp;
237 else if (ARG_IS("-blk-x") || ARG_IS("--blk-x")) {
240 int tmp = strtol(params->KernelArgs[++i], NULL, 0);
243 printf("ERROR: blocking parameter must be >= 0.\n");
249 else if (ARG_IS("-blk-y") || ARG_IS("--blk-y")) {
252 int tmp = strtol(params->KernelArgs[++i], NULL, 0);
255 printf("ERROR: blocking parameter must be >= 0.\n");
261 else if (ARG_IS("-blk-z") || ARG_IS("--blk-z")) {
264 int tmp = strtol(params->KernelArgs[++i], NULL, 0);
267 printf("ERROR: blocking parameter must be >= 0.\n");
273 else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) {
278 printf("ERROR: unknown kernel parameter.\n");
285 #undef NEXT_ARG_PRESENT
291 static void D3Q19InternalInit(int blocking, LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
293 KernelDataEx * kdex = NULL;
294 MemAlloc((void **)&kdex, sizeof(KernelDataEx));
296 kdex->Blk[0] = 0; kdex->Blk[1] = 0; kdex->Blk[2] = 0;
298 KernelData * kd = &kdex->kd;
301 kd->nObstIndices = ld->nObst;
303 // Ajust the dimensions according to padding, if used.
304 kd->Dims[0] = ld->Dims[0];
305 kd->Dims[1] = ld->Dims[1];
306 kd->Dims[2] = ld->Dims[2];
309 int * lDims = ld->Dims;
310 int * gDims = kd->GlobalDims;
312 gDims[0] = lDims[0] + 2;
313 gDims[1] = lDims[1] + 2;
314 gDims[2] = lDims[2] + 2;
328 int oX = kd->Offsets[0];
329 int oY = kd->Offsets[1];
330 int oZ = kd->Offsets[2];
334 int nCells = gX * gY * gZ;
339 ParseParameters(params, blk);
342 if (params->nKernelArgs != 0) {
343 printf("ERROR: unknown kernel parameter.\n");
344 printf("This kernels accepts no parameters.\n");
350 if (blk[0] == 0) blk[0] = gX;
351 if (blk[1] == 0) blk[1] = gY;
352 if (blk[2] == 0) blk[2] = gZ;
354 printf("# blocking x: %3d y: %3d z: %3d\n", blk[0], blk[1], blk[2]);
357 kdex->Blk[0] = blk[0]; kdex->Blk[1] = blk[1]; kdex->Blk[2] = blk[2];
360 printf("# allocating data for %d LB nodes with padding (%lu bytes = %f MiB for both lattices)\n",
361 nCells, 2 * sizeof(PdfT) * nCells * N_D3Q19,
362 2 * sizeof(PdfT) * nCells * N_D3Q19 / 1024.0 / 1024.0);
364 // MemAlloc((void **)&pdfs[0], sizeof(PdfT) * nCells * N_D3Q19);
365 // MemAlloc((void **)&pdfs[1], sizeof(PdfT) * nCells * N_D3Q19);
366 MemAllocAligned((void **)&pdfs[0], sizeof(PdfT) * nCells * N_D3Q19, 2 * 1024 * 1024);
367 MemAllocAligned((void **)&pdfs[1], sizeof(PdfT) * nCells * N_D3Q19, 2 * 1024 * 1024);
369 printf("# pdfs[0] = %p\n", pdfs[0]);
370 printf("# pdfs[1] = %p\n", pdfs[1]);
372 kd->Pdfs[0] = pdfs[0];
373 kd->Pdfs[1] = pdfs[1];
375 // Initialize PDFs with some (arbitrary) data for correct NUMA placement.
376 // This depends on the chosen data layout.
377 // The structure of the loop should resemble the same "execution layout"
381 int nX = ld->Dims[0];
382 int nY = ld->Dims[1];
383 int nZ = ld->Dims[2];
388 nThreads = omp_get_max_threads();
392 #pragma omp parallel for default(none) \
393 shared(pdfs, gDims, nX, nY, nZ, oX, oY, oZ, blk, nThreads)
395 for (int i = 0; i < nThreads; ++i) {
397 int threadStartX = nX / nThreads * i;
398 int threadEndX = nX / nThreads * (i + 1);
400 if (nX % nThreads > 0) {
401 if (nX % nThreads > i) {
406 threadStartX += nX % nThreads;
407 threadEndX += nX % nThreads;
411 for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) {
412 for (int bY = oY; bY < nY + oY; bY += blk[1]) {
413 for (int bZ = oZ; bZ < nZ + oZ; bZ += blk[2]) {
415 int eZ = MIN(bZ + blk[2], nZ + oZ);
416 int eY = MIN(bY + blk[1], nY + oY);
417 int eX = MIN(bX + blk[0], threadEndX + oX);
419 for (int x = bX; x < eX; ++x) {
420 for (int y = bY; y < eY; ++y) {
421 for (int z = bZ; z < eZ; ++z) {
422 for (int d = 0; d < N_D3Q19; ++d) {
423 pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = 1.0;
424 pdfs[1][P_INDEX_5(gDims, x, y, z, d)] = 1.0;
437 #pragma omp parallel for collapse(3)
439 for (int x = 0; x < gX; ++x) {
440 for (int y = 0; y < gY; ++y) {
441 for (int z = 0; z < gZ; ++z) {
442 for (int d = 0; d < N_D3Q19; ++d) {
443 pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = 1.0;
444 pdfs[1][P_INDEX_5(gDims, x, y, z, d)] = 1.0;
451 // Initialize all PDFs to some standard value.
452 for (int x = 0; x < gX; ++x) {
453 for (int y = 0; y < gY; ++y) {
454 for (int z = 0; z < gZ; ++z) {
455 for (int d = 0; d < N_D3Q19; ++d) {
456 pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = 0.0;
457 pdfs[1][P_INDEX_5(gDims, x, y, z, d)] = 0.0;
464 // Count how many *PDFs* need bounce back treatment.
466 uint64_t nPdfs = ((uint64_t)19) * gX * gY * gZ;
468 if (nPdfs > ((2LU << 31) - 1)) {
469 printf("ERROR: number of PDFs exceed 2^31.\n");
473 // Compiler bug? Incorrect computation of nBounceBackPdfs when using icc 15.0.2.
474 // Works when declaring nBounceBackPdfs as int64_t or using volatile.
475 volatile int nBounceBackPdfs = 0;
476 // int64_t nBounceBackPdfs = 0;
477 int nx, ny, nz, px, py, pz;
479 // TODO: apply blocking?
481 for (int x = 0; x < lX; ++x) {
482 for (int y = 0; y < lY; ++y) {
483 for (int z = 0; z < lZ; ++z) {
485 if (ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) {
486 for (int d = 0; d < N_D3Q19; ++d) {
487 #ifdef PROP_MODEL_PUSH
491 #elif PROP_MODEL_PULL
496 #error PROP_MODEL_NAME unknown.
498 // Check if neighbor is inside the lattice.
499 // if(nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) {
502 if ((nx < 0 || nx >= lX) && ld->PeriodicX) {
503 ++nBounceBackPdfs; // Compiler bug --> see above
505 else if ((ny < 0 || ny >= lY) && ld->PeriodicY) {
506 ++nBounceBackPdfs; // Compiler bug --> see above
508 else if ((nz < 0 || nz >= lZ) && ld->PeriodicZ) {
509 ++nBounceBackPdfs; // Compiler bug --> see above
511 else if (nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) {
514 else if (ld->Lattice[L_INDEX_4(lDims, nx, ny, nz)] == LAT_CELL_OBSTACLE) {
515 ++nBounceBackPdfs; // Compiler bug --> see above
524 printf("# allocating %d indices for bounce back pdfs (%s for source and destination array)\n", nBounceBackPdfs, ByteToHuman(sizeof(int) * nBounceBackPdfs * 2));
526 MemAlloc((void **) & (kd->BounceBackPdfsSrc), sizeof(int) * nBounceBackPdfs + 100);
527 MemAlloc((void **) & (kd->BounceBackPdfsDst), sizeof(int) * nBounceBackPdfs + 100);
529 kd->nBounceBackPdfs = nBounceBackPdfs;
535 for (int x = 0; x < lX; ++x) {
536 for (int y = 0; y < lY; ++y) {
537 for (int z = 0; z < lZ; ++z) {
539 if (ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) {
540 for (int d = 0; d < N_D3Q19; ++d) {
541 #ifdef PROP_MODEL_PUSH
545 #elif PROP_MODEL_PULL
550 #error PROP_MODEL_NAME unknown.
553 if ( ((nx < 0 || nx >= lX) && ld->PeriodicX) ||
554 ((ny < 0 || ny >= lY) && ld->PeriodicY) ||
555 ((nz < 0 || nz >= lZ) && ld->PeriodicZ)
557 // Implement periodic boundary in X direction.
559 // If the target node reached through propagation is outside the lattice
560 // the kernel stores it in some buffer around the domain.
561 // From this position the PDF must be transported to the other side of the
564 // Take PDF from outside the domain.
596 if (ld->Lattice[L_INDEX_4(lDims, px, py, pz)] == LAT_CELL_OBSTACLE) {
597 #ifdef PROP_MODEL_PUSH
598 srcIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, d);
599 dstIndex = P_INDEX_5(gDims, x + oX, y + oY, z + oZ, D3Q19_INV[d]);
600 #elif PROP_MODEL_PULL
601 srcIndex = P_INDEX_5(gDims, x + oX, y + oY, z + oZ, D3Q19_INV[d]);
602 dstIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, d);
607 #ifdef PROP_MODEL_PUSH
608 srcIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, d);
609 // Put it on the other side back into the domain.
610 dstIndex = P_INDEX_5(gDims, px + oX, py + oY, pz + oZ, d);
611 #elif PROP_MODEL_PULL
612 srcIndex = P_INDEX_5(gDims, px + oX, py + oY, pz + oZ, d);
613 // Put it on the other side back into the ghost layer.
614 dstIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, d);
617 VerifyMsg(nBounceBackPdfs < kd->nBounceBackPdfs, "nBBPdfs %d < kd->nBBPdfs %d xyz: %d %d %d d: %d\n", nBounceBackPdfs, kd->nBounceBackPdfs, x, y, z, d);
621 kd->BounceBackPdfsSrc[nBounceBackPdfs] = srcIndex;
622 kd->BounceBackPdfsDst[nBounceBackPdfs] = dstIndex;
627 else if (nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) {
630 else if (ld->Lattice[L_INDEX_4(lDims, nx, ny, nz)] == LAT_CELL_OBSTACLE) {
631 #ifdef PROP_MODEL_PUSH
632 srcIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, d);
633 dstIndex = P_INDEX_5(gDims, x + oX, y + oY, z + oZ, D3Q19_INV[d]);
634 #elif PROP_MODEL_PULL
635 srcIndex = P_INDEX_5(gDims, x + oX, y + oY, z + oZ, D3Q19_INV[d]);
636 dstIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, d);
637 // srcIndex = P_INDEX_5(gDims, x + oX, y + oY, z + oZ, d);
638 // dstIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, D3Q19_INV[d]);
641 VerifyMsg(nBounceBackPdfs < kd->nBounceBackPdfs, "nBBPdfs %d < kd->nBBPdfs %d xyz: %d %d %d d: %d\n", nBounceBackPdfs, kd->nBounceBackPdfs, x, y, z, d);
643 kd->BounceBackPdfsSrc[nBounceBackPdfs] = srcIndex;
644 kd->BounceBackPdfsDst[nBounceBackPdfs] = dstIndex;
655 // Fill remaining KernelData structures
656 kd->GetNode = FNAME(GetNode);
657 kd->SetNode = FNAME(SetNode);
659 kd->BoundaryConditionsGetPdf = FNAME(BcGetPdf);
660 kd->BoundaryConditionsSetPdf = FNAME(BcSetPdf);
663 kd->Kernel = FNAME(D3Q19BlkKernel);
666 kd->Kernel = FNAME(D3Q19Kernel);
670 kd->PdfsActive = kd->Pdfs[0];
676 void FNAME(D3Q19BlkInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
678 D3Q19InternalInit(1, ld, kernelData, params);
682 void FNAME(D3Q19BlkDeinit)(LatticeDesc * ld, KernelData ** kernelData)
684 MemFree((void **) & ((*kernelData)->Pdfs[0]));
685 MemFree((void **) & ((*kernelData)->Pdfs[1]));
687 MemFree((void **) & ((*kernelData)->BounceBackPdfsSrc));
688 MemFree((void **) & ((*kernelData)->BounceBackPdfsDst));
690 MemFree((void **)kernelData);
695 // Kernels without blocking perform the same initialization/deinitialization as with
696 // blocking, except that a different kernel is called. Hence, no arguments are allowed.
698 void FNAME(D3Q19Init)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
700 D3Q19InternalInit(0, ld, kernelData, params);
704 void FNAME(D3Q19Deinit)(LatticeDesc * ld, KernelData ** kernelData)
706 FNAME(D3Q19BlkDeinit)(ld, kernelData);