1 // --------------------------------------------------------------------------
4 // Markus Wittmann, 2016-2017
5 // RRZE, University of Erlangen-Nuremberg, Germany
6 // markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
9 // LSS, University of Erlangen-Nuremberg, Germany
11 // This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
13 // LbmBenchKernels is free software: you can redistribute it and/or modify
14 // it under the terms of the GNU General Public License as published by
15 // the Free Software Foundation, either version 3 of the License, or
16 // (at your option) any later version.
18 // LbmBenchKernels is distributed in the hope that it will be useful,
19 // but WITHOUT ANY WARRANTY; without even the implied warranty of
20 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 // GNU General Public License for more details.
23 // You should have received a copy of the GNU General Public License
24 // along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>.
26 // --------------------------------------------------------------------------
27 #include "BenchKernelD3Q19AaCommon.h"
39 // Forward definition.
40 void FNAME(D3Q19AaKernel)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd);
42 void FNAME(D3Q19AaBlkKernel)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd);
46 static void FNAME(BcGetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT * pdf)
49 Assert(kd->PdfsActive != NULL);
50 Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
53 Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
54 Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
55 Assert(dir >= 0); Assert(dir < N_D3Q19);
57 KernelDataAa * kda = KDA(kd);
59 int oX = kd->Offsets[0];
60 int oY = kd->Offsets[1];
61 int oZ = kd->Offsets[2];
63 if (kda->Iteration % 2 == 0) {
64 // Pdfs are stored inverse, local PDFs are located in remote nodes
65 int nx = x - D3Q19_X[dir];
66 int ny = y - D3Q19_Y[dir];
67 int nz = z - D3Q19_Z[dir];
69 #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
70 *pdf = kd->PdfsActive[I(nx + oX, ny + oY, nz + oZ, D3Q19_INV[dir])];
78 #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
79 *pdf = kd->PdfsActive[I(nx + oX, ny + oY, nz + oZ, dir)];
87 static void FNAME(BcSetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT pdf)
90 Assert(kd->PdfsActive != NULL);
91 Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
93 Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
94 Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
95 Assert(dir >= 0); Assert(dir < N_D3Q19);
97 KernelDataAa * kda = KDA(kd);
99 int oX = kd->Offsets[0];
100 int oY = kd->Offsets[1];
101 int oZ = kd->Offsets[2];
103 if (kda->Iteration % 2 == 0) {
104 // Pdfs are stored inverse, local PDFs are located in remote nodes
105 int nx = x - D3Q19_X[dir];
106 int ny = y - D3Q19_Y[dir];
107 int nz = z - D3Q19_Z[dir];
109 #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
110 pdf = kd->PdfsActive[I(nx + oX, ny + oY, nz + oZ, D3Q19_INV[dir])] = pdf;
118 #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
119 kd->PdfsActive[I(nx + oX, ny + oY, nz + oZ, dir)] = pdf;
127 static void FNAME(GetNode)(KernelData * kd, int x, int y, int z, PdfT * pdfs)
130 Assert(kd->PdfsActive != NULL);
131 Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
132 Assert(pdfs != NULL);
134 Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
135 Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
137 KernelDataAa * kda = KDA(kd);
139 int oX = kd->Offsets[0];
140 int oY = kd->Offsets[1];
141 int oZ = kd->Offsets[2];
144 if (kda->Iteration % 2 == 0) {
145 // Pdfs are stored inverse, local PDFs are located in remote nodes
147 #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
148 #define X(name, idx, idxinv, _x, _y, _z) pdfs[idx] = kd->PdfsActive[I(x + oX - _x, y + oY - _y, z + oZ - _z, D3Q19_INV[idx])];
154 #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
155 #define X(name, idx, idxinv, _x, _y, _z) pdfs[idx] = kd->PdfsActive[I(x + oX, y + oY, z + oZ, idx)];
164 for (int d = 0; d < 19; ++d) {
165 if (isnan(pdfs[d])) {
166 printf("%d %d %d %d nan! get node\n", x, y, z, d);
168 for (int d2 = 0; d2 < 19; ++d2) {
169 printf("%d: %e\n", d2, pdfs[d2]);
182 static void FNAME(SetNode)(KernelData * kd, int x, int y, int z, PdfT * pdfs)
185 Assert(kd->PdfsActive != NULL);
186 Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
187 Assert(pdfs != NULL);
189 Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
190 Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
192 KernelDataAa * kda = KDA(kd);
194 int oX = kd->Offsets[0];
195 int oY = kd->Offsets[1];
196 int oZ = kd->Offsets[2];
198 if (kda->Iteration % 2 == 0) {
199 // Pdfs are stored inverse, local PDFs are located in remote nodes
201 #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
202 #define X(name, idx, idxinv, _x, _y, _z) kd->PdfsActive[I(x + oX - _x, y + oY - _y, z + oZ - _z, D3Q19_INV[idx])] = pdfs[idx];
208 #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
209 #define X(name, idx, idxinv, _x, _y, _z) kd->PdfsActive[I(x + oX, y + oY, z + oZ, idx)] = pdfs[idx];
218 static void ParameterUsage()
220 printf("Kernel parameters:\n");
221 printf(" [-blk <n>] [-blk-[xyz] <n>]\n");
226 static void ParseParameters(Parameters * params, int * blk)
230 blk[0] = 0; blk[1] = 0; blk[2] = 0;
232 #define ARG_IS(param) (!strcmp(params->KernelArgs[i], param))
233 #define NEXT_ARG_PRESENT() \
235 if (i + 1 >= params->nKernelArgs) { \
236 printf("ERROR: argument %s requires a parameter.\n", params->KernelArgs[i]); \
242 for (int i = 0; i < params->nKernelArgs; ++i) {
243 if (ARG_IS("-blk") || ARG_IS("--blk")) {
246 int tmp = strtol(params->KernelArgs[++i], NULL, 0);
249 printf("ERROR: blocking parameter must be >= 0.\n");
253 blk[0] = blk[1] = blk[2] = tmp;
255 else if (ARG_IS("-blk-x") || ARG_IS("--blk-x")) {
258 int tmp = strtol(params->KernelArgs[++i], NULL, 0);
261 printf("ERROR: blocking parameter must be >= 0.\n");
267 else if (ARG_IS("-blk-y") || ARG_IS("--blk-y")) {
270 int tmp = strtol(params->KernelArgs[++i], NULL, 0);
273 printf("ERROR: blocking parameter must be >= 0.\n");
279 else if (ARG_IS("-blk-z") || ARG_IS("--blk-z")) {
282 int tmp = strtol(params->KernelArgs[++i], NULL, 0);
285 printf("ERROR: blocking parameter must be >= 0.\n");
291 else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) {
296 printf("ERROR: unknown kernel parameter.\n");
303 #undef NEXT_ARG_PRESENT
309 void FNAME(D3Q19AaInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
311 KernelDataAa * kda = NULL;
312 MemAlloc((void **)&kda, sizeof(KernelDataAa));
314 kda->Blk[0] = 0; kda->Blk[1] = 0; kda->Blk[2] = 0;
317 KernelData * kd = &kda->kd;
320 kd->nObstIndices = ld->nObst;
322 // Ajust the dimensions according to padding, if used.
323 kd->Dims[0] = ld->Dims[0];
324 kd->Dims[1] = ld->Dims[1];
325 kd->Dims[2] = ld->Dims[2];
328 int * lDims = ld->Dims;
329 int * gDims = kd->GlobalDims;
331 gDims[0] = lDims[0] + 2;
332 gDims[1] = lDims[1] + 2;
333 gDims[2] = lDims[2] + 2;
347 int oX = kd->Offsets[0];
348 int oY = kd->Offsets[1];
349 int oZ = kd->Offsets[2];
353 int nCells = gX * gY * gZ;
355 PdfT * pdfs[2] = { NULL, NULL };
357 ParseParameters(params, blk);
359 if (blk[0] == 0) blk[0] = gX;
360 if (blk[1] == 0) blk[1] = gY;
361 if (blk[2] == 0) blk[2] = gZ;
363 printf("# blocking x: %3d y: %3d z: %3d\n", blk[0], blk[1], blk[2]);
366 kda->Blk[0] = blk[0]; kda->Blk[1] = blk[1]; kda->Blk[2] = blk[2];
369 printf("# allocating data for %d LB nodes with padding (%lu bytes = %f MiB for the single lattice)\n",
371 sizeof(PdfT) * nCells * N_D3Q19,
372 sizeof(PdfT) * nCells * N_D3Q19 / 1024.0 / 1024.0);
376 MemAllocAligned((void **)&pdfs[0], sizeof(PdfT) * nCells * N_D3Q19, PAGE_4K);
378 kd->Pdfs[0] = pdfs[0];
382 // Initialize PDFs with some (arbitrary) data for correct NUMA placement.
383 // This depends on the chosen data layout.
384 // The structure of the loop should resemble the same "execution layout"
389 nThreads = omp_get_max_threads();
393 #pragma omp parallel for \
394 shared(gDims, pdfs, \
395 oX, oY, oZ, lX, lY, lZ, blk, nThreads, ld)
397 for (int i = 0; i < nThreads; ++i) {
399 int threadStartX = lX / nThreads * i;
400 int threadEndX = lX / nThreads * (i + 1);
402 if (lX % nThreads > 0) {
403 if (lX % nThreads > i) {
408 threadStartX += lX % nThreads;
409 threadEndX += lX % nThreads;
413 for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) {
414 for (int bY = oY; bY < lY + oY; bY += blk[1]) {
415 for (int bZ = oZ; bZ < lZ + oZ; bZ += blk[2]) {
417 int eX = MIN(bX + blk[0], threadEndX + oX);
418 int eY = MIN(bY + blk[1], lY + oY);
419 int eZ = MIN(bZ + blk[2], lZ + oZ);
421 // printf("%d: %d-%d %d-%d %d-%d %d - %d\n", omp_get_thread_num(), bZ, eZ, bY, eY, bX, eX, threadStartX, threadEndX);
423 for (int x = bX; x < eX; ++x) {
424 for (int y = bY; y < eY; ++y) {
425 for (int z = bZ; z < eZ; ++z) {
427 for (int d = 0; d < N_D3Q19; ++d) {
428 pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = -1.0;
436 // Initialize all PDFs to some standard value.
437 for (int x = oX; x < lX + oX; ++x) {
438 for (int y = oY; y < lY + oY; ++y) {
439 for (int z = oZ; z < lZ + oZ; ++z) {
440 for (int d = 0; d < N_D3Q19; ++d) {
441 pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = 0.0;
446 // Count how many *PDFs* need bounce back treatment.
448 uint64_t nPdfs = ((uint64_t)19) * gX * gY * gZ;
450 if (nPdfs > ((2LU << 31) - 1)) {
451 printf("ERROR: number of PDFs exceed 2^31.\n");
455 // Compiler bug? Incorrect computation of nBounceBackPdfs when using icc 15.0.2.
456 // Works when declaring nBounceBackPdfs as int64_t or using volatile.
457 volatile int nBounceBackPdfs = 0;
458 // int64_t nBounceBackPdfs = 0;
459 int nx, ny, nz, px, py, pz;
462 for (int x = 0; x < lX; ++x) {
463 for (int y = 0; y < lY; ++y) {
464 for (int z = 0; z < lZ; ++z) {
466 if (ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) {
467 for (int d = 0; d < N_D3Q19; ++d) {
472 // Check if neighbor is inside the lattice.
473 // if(nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) {
476 if ((nx < 0 || nx >= lX) && ld->PeriodicX) {
477 ++nBounceBackPdfs; // Compiler bug --> see above
479 else if ((ny < 0 || ny >= lY) && ld->PeriodicY) {
480 ++nBounceBackPdfs; // Compiler bug --> see above
482 else if ((nz < 0 || nz >= lZ) && ld->PeriodicZ) {
483 ++nBounceBackPdfs; // Compiler bug --> see above
485 else if (nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) {
488 else if (ld->Lattice[L_INDEX_4(lDims, nx, ny, nz)] == LAT_CELL_OBSTACLE) {
489 ++nBounceBackPdfs; // Compiler bug --> see above
497 printf("# allocating %d indices for bounce back pdfs (%s for source and destination array)\n", nBounceBackPdfs, ByteToHuman(sizeof(int) * nBounceBackPdfs * 2));
499 MemAlloc((void **) & (kd->BounceBackPdfsSrc), sizeof(int) * nBounceBackPdfs + 100);
500 MemAlloc((void **) & (kd->BounceBackPdfsDst), sizeof(int) * nBounceBackPdfs + 100);
502 kd->nBounceBackPdfs = nBounceBackPdfs;
508 // TODO: currently this is not NUMA-aware
509 // - maybe use the same blocking as for lattice initialization?
510 // - do place the bounce back index vector parallel?
512 for (int x = 0; x < lX; ++x) {
513 for (int y = 0; y < lY; ++y) {
514 for (int z = 0; z < lZ; ++z) {
516 if (ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) {
517 for (int d = 0; d < N_D3Q19; ++d) {
522 if ( ((nx < 0 || nx >= lX) && ld->PeriodicX) ||
523 ((ny < 0 || ny >= lY) && ld->PeriodicY) ||
524 ((nz < 0 || nz >= lZ) && ld->PeriodicZ)
528 // We assume we have finished odd time step (accessing neighbor PDFs) and are
529 // before executing the even time step (accessing local PDFs only).
531 // Assuming we are at the most east position of the lattice. Through the odd
532 // time step propagation has put a PDF in the east slot of the ghost cell east
533 // of us, i.e. nx, ny, nz. We copy it to the east slot of the most west node.
535 // In case of transition from even to odd time step , src and dest must be
569 if (ld->Lattice[L_INDEX_4(lDims, px, py, pz)] == LAT_CELL_OBSTACLE) {
570 // See description of bounce back handling below.
571 srcIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, d);
572 dstIndex = P_INDEX_5(gDims, x + oX, y + oY, z + oZ, D3Q19_INV[d]);
576 srcIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, d);
577 // Put it on the other side back into the domain.
578 dstIndex = P_INDEX_5(gDims, px + oX, py + oY, pz + oZ, d);
580 VerifyMsg(nBounceBackPdfs < kd->nBounceBackPdfs, "nBBPdfs %d < kd->nBBPdfs %d xyz: %d %d %d d: %d\n", nBounceBackPdfs, kd->nBounceBackPdfs, x, y, z, d);
584 kd->BounceBackPdfsSrc[nBounceBackPdfs] = srcIndex;
585 kd->BounceBackPdfsDst[nBounceBackPdfs] = dstIndex;
590 else if (nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) {
593 else if (ld->Lattice[L_INDEX_4(lDims, nx, ny, nz)] == LAT_CELL_OBSTACLE) {
594 // Depending on the time step we are in we have to exchange src and dst index.
596 // We build the list for the case, when we have finished odd time step
597 // (accessing neighbor PDFs) and before we start with the even time step
598 // (accessing local PDFs only).
600 // Assume our neighbor east of us, i.e. nx, ny, nz, is an obstacle cell.
601 // Then we have to move the east PDF from the obstacle to our west position,
602 // i.e. the inverse of east.
604 // In case of transition from even to odd time step src and dest just
605 // have to be exchanged.
607 srcIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, d);
608 dstIndex = P_INDEX_5(gDims, x + oX, y + oY, z + oZ, D3Q19_INV[d]);
610 VerifyMsg(nBounceBackPdfs < kd->nBounceBackPdfs, "nBBPdfs %d < kd->nBBPdfs %d xyz: %d %d %d d: %d\n", nBounceBackPdfs, kd->nBounceBackPdfs, x, y, z, d);
612 kd->BounceBackPdfsSrc[nBounceBackPdfs] = srcIndex;
613 kd->BounceBackPdfsDst[nBounceBackPdfs] = dstIndex;
624 // Fill remaining KernelData structures
625 kd->GetNode = FNAME(GetNode);
626 kd->SetNode = FNAME(SetNode);
628 kd->BoundaryConditionsGetPdf = FNAME(BcGetPdf);
629 kd->BoundaryConditionsSetPdf = FNAME(BcSetPdf);
631 kd->Kernel = FNAME(D3Q19AaKernel);
634 kd->PdfsActive = kd->Pdfs[0];
639 void FNAME(D3Q19AaDeinit)(LatticeDesc * ld, KernelData ** kernelData)
641 MemFree((void **) & ((*kernelData)->Pdfs[0]));
642 // MemFree((void **) & ((*kernelData)->Pdfs[1]));
644 MemFree((void **) & ((*kernelData)->BounceBackPdfsSrc));
645 MemFree((void **) & ((*kernelData)->BounceBackPdfsDst));
647 MemFree((void **)kernelData);