1 // --------------------------------------------------------------------------
4 // Markus Wittmann, 2016-2017
5 // RRZE, University of Erlangen-Nuremberg, Germany
6 // markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
9 // LSS, University of Erlangen-Nuremberg, Germany
11 // This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
13 // LbmBenchKernels is free software: you can redistribute it and/or modify
14 // it under the terms of the GNU General Public License as published by
15 // the Free Software Foundation, either version 3 of the License, or
16 // (at your option) any later version.
18 // LbmBenchKernels is distributed in the hope that it will be useful,
19 // but WITHOUT ANY WARRANTY; without even the implied warranty of
20 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 // GNU General Public License for more details.
23 // You should have received a copy of the GNU General Public License
24 // along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>.
26 // --------------------------------------------------------------------------
27 #include "BenchKernelD3Q19AaVecCommon.h"
40 // Forward declaration of the kernel entry point (defined outside this section); D3Q19AaVecInit stores it in kd->Kernel.
41 void FNAME(D3Q19AaVecKernel)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd);
// Reads a single PDF of node (x, y, z) for direction dir from the active
// lattice (kd->PdfsActive) and stores it in *pdf.
//
// Checked with Assert: kd->PdfsActive is one of kd->Pdfs[0..1], the coordinates
// lie in [0, kd->Dims[i]) and dir lies in [0, N_D3Q19).
// kd->Offsets is added to the coordinates before indexing kd->GlobalDims via
// P_INDEX_5.
//
// NOTE(review): the lines declaring nx/ny/nz for the second access are not
// visible in this view - confirm against the full file.
44 static void FNAME(BcGetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT * pdf)
47 Assert(kd->PdfsActive != NULL);
48 Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
51 Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
52 Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
53 Assert(dir >= 0); Assert(dir < N_D3Q19);
// The iteration counter lives in the kernel specific KernelDataAa.
55 KernelDataAa * kda = KDA(kd);
// Offsets translate local coordinates into indices of the padded lattice.
57 int oX = kd->Offsets[0];
58 int oY = kd->Offsets[1];
59 int oZ = kd->Offsets[2];
// The parity of the iteration counter selects where the PDF is stored.
61 if (kda->Iteration % 2 == 0) {
62 // Pdfs are stored inverse, local PDFs are located in remote nodes
63 int nx = x - D3Q19_X[dir];
64 int ny = y - D3Q19_Y[dir];
65 int nz = z - D3Q19_Z[dir];
67 #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
// Fetch the inverse direction slot of the node at (x, y, z) - e_dir.
68 *pdf = kd->PdfsActive[I(nx + oX, ny + oY, nz + oZ, D3Q19_INV[dir])];
// Second access: reads slot dir at (nx, ny, nz) without direction inversion.
76 #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
77 *pdf = kd->PdfsActive[I(nx + oX, ny + oY, nz + oZ, dir)];
// Writes a single PDF value for node (x, y, z) and direction dir into the
// active lattice (kd->PdfsActive). Counterpart of BcGetPdf.
//
// Checked with Assert: kd->PdfsActive is one of kd->Pdfs[0..1], the coordinates
// lie in [0, kd->Dims[i]) and dir lies in [0, N_D3Q19).
// kd->Offsets is added to the coordinates before indexing kd->GlobalDims via
// P_INDEX_5. pdf is passed by value and only read.
//
// NOTE(review): the lines declaring nx/ny/nz for the second store are not
// visible in this view - confirm against the full file.
85 static void FNAME(BcSetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT pdf)
88 Assert(kd->PdfsActive != NULL);
89 Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
91 Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
92 Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
93 Assert(dir >= 0); Assert(dir < N_D3Q19);
// The iteration counter lives in the kernel specific KernelDataAa.
95 KernelDataAa * kda = KDA(kd);
// Offsets translate local coordinates into indices of the padded lattice.
97 int oX = kd->Offsets[0];
98 int oY = kd->Offsets[1];
99 int oZ = kd->Offsets[2];
// The parity of the iteration counter selects where the PDF is stored.
101 if (kda->Iteration % 2 == 0) {
102 // Pdfs are stored inverse, local PDFs are located in remote nodes
103 int nx = x - D3Q19_X[dir];
104 int ny = y - D3Q19_Y[dir];
105 int nz = z - D3Q19_Z[dir];
107 #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
// Plain store into the inverse direction slot of the node at (x, y, z) - e_dir,
// written the same way as the store of the other branch.
108 kd->PdfsActive[I(nx + oX, ny + oY, nz + oZ, D3Q19_INV[dir])] = pdf;
// Second store: writes slot dir at (nx, ny, nz) without direction inversion.
116 #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
117 kd->PdfsActive[I(nx + oX, ny + oY, nz + oZ, dir)] = pdf;
// Copies the PDFs of node (x, y, z) from the active lattice into pdfs[],
// which is indexed by direction.
//
// Checked with Assert: kd->PdfsActive is one of kd->Pdfs[0..1], pdfs is not
// NULL and the coordinates lie in [0, kd->Dims[i]).
//
// The X(...) macros below define the statement for one direction; presumably
// they are expanded once per direction by a list macro that is not visible in
// this view - confirm against the full file.
125 static void FNAME(GetNode)(KernelData * kd, int x, int y, int z, PdfT * pdfs)
128 Assert(kd->PdfsActive != NULL);
129 Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
130 Assert(pdfs != NULL);
132 Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
133 Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
// The iteration counter lives in the kernel specific KernelDataAa.
135 KernelDataAa * kda = KDA(kd);
// Offsets translate local coordinates into indices of the padded lattice.
137 int oX = kd->Offsets[0];
138 int oY = kd->Offsets[1];
139 int oZ = kd->Offsets[2];
// The parity of the iteration counter selects where the PDFs are stored.
142 if (kda->Iteration % 2 == 0) {
143 // Pdfs are stored inverse, local PDFs are located in remote nodes
145 #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
// Direction idx is read from the inverse slot of the node shifted by (-_x, -_y, -_z).
146 #define X(name, idx, idxinv, _x, _y, _z) pdfs[idx] = kd->PdfsActive[I(x + oX - _x, y + oY - _y, z + oZ - _z, D3Q19_INV[idx])];
152 #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
// Second variant: direction idx is read from slot idx of the node itself.
153 #define X(name, idx, idxinv, _x, _y, _z) pdfs[idx] = kd->PdfsActive[I(x + oX, y + oY, z + oZ, idx)];
// Diagnostic: report the first NaN entry and dump all values of the node.
// NOTE(review): whether this code is compiled in cannot be determined from the
// visible lines; it also hard-codes 19 instead of using N_D3Q19.
162 for (int d = 0; d < 19; ++d) {
163 if (isnan(pdfs[d])) {
164 printf("%d %d %d %d nan! get node\n", x, y, z, d);
166 for (int d2 = 0; d2 < 19; ++d2) {
167 printf("%d: %e\n", d2, pdfs[d2]);
// Writes the values of pdfs[], indexed by direction, into the active lattice
// for node (x, y, z). Counterpart of GetNode.
//
// Checked with Assert: kd->PdfsActive is one of kd->Pdfs[0..1], pdfs is not
// NULL and the coordinates lie in [0, kd->Dims[i]).
//
// The X(...) macros below define the statement for one direction; presumably
// they are expanded once per direction by a list macro that is not visible in
// this view - confirm against the full file.
180 static void FNAME(SetNode)(KernelData * kd, int x, int y, int z, PdfT * pdfs)
183 Assert(kd->PdfsActive != NULL);
184 Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
185 Assert(pdfs != NULL);
187 Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
188 Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
// The iteration counter lives in the kernel specific KernelDataAa.
190 KernelDataAa * kda = KDA(kd);
// Offsets translate local coordinates into indices of the padded lattice.
192 int oX = kd->Offsets[0];
193 int oY = kd->Offsets[1];
194 int oZ = kd->Offsets[2];
// The parity of the iteration counter selects where the PDFs are stored.
196 if (kda->Iteration % 2 == 0) {
197 // Pdfs are stored inverse, local PDFs are located in remote nodes
199 #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
// Direction idx is written to the inverse slot of the node shifted by (-_x, -_y, -_z).
200 #define X(name, idx, idxinv, _x, _y, _z) kd->PdfsActive[I(x + oX - _x, y + oY - _y, z + oZ - _z, D3Q19_INV[idx])] = pdfs[idx];
206 #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
// Second variant: direction idx is written to slot idx of the node itself.
207 #define X(name, idx, idxinv, _x, _y, _z) kd->PdfsActive[I(x + oX, y + oY, z + oZ, idx)] = pdfs[idx];
// Prints the kernel specific command line options (blocking factors) to stdout.
216 static void ParameterUsage()
218 printf("Kernel parameters:\n");
219 printf(" [-blk <n>] [-blk-[xyz] <n>]\n");
// Parses the kernel specific arguments params->KernelArgs[0 .. nKernelArgs-1]
// and stores the requested blocking factors in blk[0..2] (x, y, z).
//
// All three factors are reset to 0 before parsing. Recognized options (with
// one or two leading dashes): blk sets all three factors, blk-x / blk-y /
// blk-z presumably set a single factor (their assignments are not visible in
// this view). -h, -help and --help are recognized as help requests; any other
// argument is reported as an unknown kernel parameter.
//
// NOTE(review): values are converted with strtol(..., NULL, 0) (base detected
// from the prefix) and narrowed from long to int without an end pointer or
// range check, so malformed or oversized input is not detected here.
224 static void ParseParameters(Parameters * params, int * blk)
228 blk[0] = 0; blk[1] = 0; blk[2] = 0;
// ARG_IS compares the current argument i with a literal option name.
230 #define ARG_IS(param) (!strcmp(params->KernelArgs[i], param))
// NEXT_ARG_PRESENT reports an error when option i is the last argument and therefore has no value.
231 #define NEXT_ARG_PRESENT() \
233 if (i + 1 >= params->nKernelArgs) { \
234 printf("ERROR: argument %s requires a parameter.\n", params->KernelArgs[i]); \
240 for (int i = 0; i < params->nKernelArgs; ++i) {
// Same blocking factor for all three dimensions.
241 if (ARG_IS("-blk") || ARG_IS("--blk")) {
// ++i consumes the value that follows the option.
244 int tmp = strtol(params->KernelArgs[++i], NULL, 0);
247 printf("ERROR: blocking parameter must be >= 0.\n");
251 blk[0] = blk[1] = blk[2] = tmp;
// Blocking factor options for a single dimension.
253 else if (ARG_IS("-blk-x") || ARG_IS("--blk-x")) {
256 int tmp = strtol(params->KernelArgs[++i], NULL, 0);
259 printf("ERROR: blocking parameter must be >= 0.\n");
265 else if (ARG_IS("-blk-y") || ARG_IS("--blk-y")) {
268 int tmp = strtol(params->KernelArgs[++i], NULL, 0);
271 printf("ERROR: blocking parameter must be >= 0.\n");
277 else if (ARG_IS("-blk-z") || ARG_IS("--blk-z")) {
280 int tmp = strtol(params->KernelArgs[++i], NULL, 0);
283 printf("ERROR: blocking parameter must be >= 0.\n");
// Help request.
289 else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) {
// Anything else is rejected.
294 printf("ERROR: unknown kernel parameter.\n");
// The helper macros are local to this function.
301 #undef NEXT_ARG_PRESENT
// Allocates and initializes the kernel data of the D3Q19 "AaVec" kernel.
//
// ld      lattice description: dimensions, obstacle map, periodicity flags
// params  supplies the kernel specific arguments (blocking factors)
//
// Steps visible in this function:
//  - allocate the KernelDataAa structure and derive the padded (global)
//    lattice dimensions from ld->Dims,
//  - parse the blocking factors; the z factor is rounded down to a multiple of
//    VSIZE and a factor of 0 is replaced by the global dimension,
//  - allocate one PDF lattice (kd->Pdfs[0]) with PAGE_4K alignment, touch it
//    blockwise in an OpenMP parallel loop and then set all inner PDFs to 0.0,
//  - count the PDFs that need periodic or bounce back treatment, allocate the
//    source/destination index arrays and fill them,
//  - register the node accessors, the boundary condition accessors and the
//    kernel, and make kd->Pdfs[0] the active lattice.
//
// NOTE(review): several declarations and statements (e.g. lX/lY/lZ, gX/gY/gZ,
// blk, nThreads, srcIndex/dstIndex, the assignments of nx/ny/nz and px/py/pz)
// are not visible in this view - confirm details against the full file.
307 void FNAME(D3Q19AaVecInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
309 KernelDataAa * kda = NULL;
310 MemAlloc((void **)&kda, sizeof(KernelDataAa));
// Blocking factors start at zero; the final values are stored after parsing.
312 kda->Blk[0] = 0; kda->Blk[1] = 0; kda->Blk[2] = 0;
// The generic KernelData is a member of the kernel specific structure.
315 KernelData * kd = &kda->kd;
318 kd->nObstIndices = ld->nObst;
320 // Adjust the dimensions according to padding, if used.
321 kd->Dims[0] = ld->Dims[0];
322 kd->Dims[1] = ld->Dims[1];
323 kd->Dims[2] = ld->Dims[2];
326 int * lDims = ld->Dims;
327 int * gDims = kd->GlobalDims;
331 // TODO: only add enough ghost cells so we can compute everything vectorized.
332 gDims[0] = lDims[0] + 2;
333 gDims[1] = lDims[1] + 2;
334 // TODO: fix this for aa-vec2-soa
335 gDims[2] = lDims[2] + 2 + VSIZE - 2; // one ghost cell in front, one in the back, plus at most two at the back for VSIZE = 4
349 int oX = kd->Offsets[0];
350 int oY = kd->Offsets[1];
351 int oZ = kd->Offsets[2];
// Number of nodes of the padded lattice.
355 int nCells = gX * gY * gZ;
357 PdfT * pdfs[2] = { NULL, NULL };
359 ParseParameters(params, blk);
// Round the z blocking factor down to a multiple of the vector length.
361 if (blk[2] % VSIZE != 0) {
362 blk[2] -= blk[2] % VSIZE;
363 printf("WARNING: blocking factor for z direction must be a multiple of VSIZE = %d, adjusting it to %d.\n", VSIZE, blk[2]);
// A blocking factor of 0 means: no blocking in that dimension.
366 if (blk[0] == 0) blk[0] = gX;
367 if (blk[1] == 0) blk[1] = gY;
368 if (blk[2] == 0) blk[2] = gZ;
370 printf("# blocking x: %3d y: %3d z: %3d\n", blk[0], blk[1], blk[2]);
372 kda->Blk[0] = blk[0]; kda->Blk[1] = blk[1]; kda->Blk[2] = blk[2];
// The byte count is a size_t and is therefore printed with %zu.
375 printf("# allocating data for %d LB nodes with padding (%zu bytes = %f MiB for the single lattice)\n",
377 sizeof(PdfT) * nCells * N_D3Q19,
378 sizeof(PdfT) * nCells * N_D3Q19 / 1024.0 / 1024.0);
// Only one lattice is allocated; pdfs[1] stays NULL in the visible code.
382 MemAllocAligned((void **)&pdfs[0], sizeof(PdfT) * nCells * N_D3Q19, PAGE_4K);
384 kd->Pdfs[0] = pdfs[0];
388 // Initialize PDFs with some (arbitrary) data for correct NUMA placement.
389 // This depends on the chosen data layout.
390 // The structure of the loop should resemble the same "execution layout"
395 nThreads = omp_get_max_threads();
399 #pragma omp parallel for \
400 shared(gDims, pdfs, \
401 oX, oY, oZ, lX, lY, lZ, blk, nThreads, ld)
// One loop iteration per thread; each iteration touches its own range of x.
403 for (int i = 0; i < nThreads; ++i) {
// Split the x dimension evenly between the threads ...
405 int threadStartX = lX / nThreads * i;
406 int threadEndX = lX / nThreads * (i + 1);
// ... and account for the remainder lX % nThreads.
// NOTE(review): the adjustment for i < lX % nThreads is not visible in this view.
408 if (lX % nThreads > 0) {
409 if (lX % nThreads > i) {
414 threadStartX += lX % nThreads;
415 threadEndX += lX % nThreads;
// Walk over the blocks of this thread.
419 for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) {
420 for (int bY = oY; bY < lY + oY; bY += blk[1]) {
421 for (int bZ = oZ; bZ < lZ + oZ; bZ += blk[2]) {
// Clip the block end at the end of the thread's range / the lattice.
423 int eX = MIN(bX + blk[0], threadEndX + oX);
424 int eY = MIN(bY + blk[1], lY + oY);
425 int eZ = MIN(bZ + blk[2], lZ + oZ);
427 // printf("%d: %d-%d %d-%d %d-%d %d - %d\n", omp_get_thread_num(), bZ, eZ, bY, eY, bX, eX, threadStartX, threadEndX);
429 for (int x = bX; x < eX; ++x) {
430 for (int y = bY; y < eY; ++y) {
431 for (int z = bZ; z < eZ; ++z) {
433 for (int d = 0; d < N_D3Q19; ++d) {
// Arbitrary placeholder value; overwritten by the loop that follows.
434 pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = -50.0;
442 // Initialize all PDFs to some standard value.
443 for (int x = oX; x < lX + oX; ++x) {
444 for (int y = oY; y < lY + oY; ++y) {
445 for (int z = oZ; z < lZ + oZ; ++z) {
446 for (int d = 0; d < N_D3Q19; ++d) {
447 pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = 0.0;
452 // Count how many *PDFs* need bounce back treatment.
454 uint64_t nPdfs = ((uint64_t)19) * gX * gY * gZ;
// Bounce back indices are stored as int, so the total number of PDFs must not
// exceed 2^31 - 1. The limit is computed in uint64_t to be independent of the
// width of unsigned long.
456 if (nPdfs > (((uint64_t)1 << 31) - 1)) {
457 printf("ERROR: number of PDFs exceed 2^31.\n");
461 // Compiler bug? Incorrect computation of nBounceBackPdfs when using icc 15.0.2.
462 // Works when declaring nBounceBackPdfs as int64_t or using volatile.
463 volatile int nBounceBackPdfs = 0;
464 // int64_t nBounceBackPdfs = 0;
465 int nx, ny, nz, px, py, pz;
// First pass: only count the entries so that the index arrays can be sized.
468 for (int x = 0; x < lX; ++x) {
469 for (int y = 0; y < lY; ++y) {
470 for (int z = 0; z < lZ; ++z) {
// Obstacle nodes themselves get no entries.
472 if (ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) {
473 for (int d = 0; d < N_D3Q19; ++d) {
478 // Check if neighbor is inside the lattice.
479 // if(nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) {
// A neighbor outside the lattice in a periodic dimension needs one entry.
482 if ((nx < 0 || nx >= lX) && ld->PeriodicX) {
483 ++nBounceBackPdfs; // Compiler bug --> see above
485 else if ((ny < 0 || ny >= lY) && ld->PeriodicY) {
486 ++nBounceBackPdfs; // Compiler bug --> see above
488 else if ((nz < 0 || nz >= lZ) && ld->PeriodicZ) {
489 ++nBounceBackPdfs; // Compiler bug --> see above
// A neighbor outside the lattice in a non-periodic dimension is not counted.
491 else if (nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) {
// A neighbor inside the lattice that is an obstacle needs one entry.
494 else if (ld->Lattice[L_INDEX_4(lDims, nx, ny, nz)] == LAT_CELL_OBSTACLE) {
495 ++nBounceBackPdfs; // Compiler bug --> see above
503 printf("# allocating %d indices for bounce back pdfs (%s for source and destination array)\n", nBounceBackPdfs, ByteToHuman(sizeof(int) * nBounceBackPdfs * 2));
// Both index arrays get 100 extra bytes beyond the counted entries.
505 MemAlloc((void **) & (kd->BounceBackPdfsSrc), sizeof(int) * nBounceBackPdfs + 100);
506 MemAlloc((void **) & (kd->BounceBackPdfsDst), sizeof(int) * nBounceBackPdfs + 100);
508 kd->nBounceBackPdfs = nBounceBackPdfs;
514 // TODO: currently this is not NUMA-aware
515 // - maybe use the same blocking as for lattice initialization?
516 // - do place the bounce back index vector parallel?
// Second pass: same traversal as the counting pass, now storing the indices.
// NOTE(review): nBounceBackPdfs is used as the write position below; its reset
// is not visible in this view.
518 for (int x = 0; x < lX; ++x) {
519 for (int y = 0; y < lY; ++y) {
520 for (int z = 0; z < lZ; ++z) {
522 if (ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) {
523 for (int d = 0; d < N_D3Q19; ++d) {
// Neighbor leaves the lattice through a periodic boundary.
528 if ( ((nx < 0 || nx >= lX) && ld->PeriodicX) ||
529 ((ny < 0 || ny >= lY) && ld->PeriodicY) ||
530 ((nz < 0 || nz >= lZ) && ld->PeriodicZ)
534 // We assume we have finished odd time step (accessing neighbor PDFs) and are
535 // before executing the even time step (accessing local PDFs only).
537 // Assuming we are at the most east position of the lattice. Through the odd
538 // time step propagation has put a PDF in the east slot of the ghost cell east
539 // of us, i.e. nx, ny, nz. We copy it to the east slot of the most west node.
541 // In case of transition from even to odd time step , src and dest must be
// (px, py, pz) is the node on the opposite side of the domain; if it is an
// obstacle the PDF is bounced back instead of being wrapped around.
575 if (ld->Lattice[L_INDEX_4(lDims, px, py, pz)] == LAT_CELL_OBSTACLE) {
576 // See description of bounce back handling below.
577 srcIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, d);
578 dstIndex = P_INDEX_5(gDims, x + oX, y + oY, z + oZ, D3Q19_INV[d]);
582 srcIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, d);
583 // Put it on the other side back into the domain.
584 dstIndex = P_INDEX_5(gDims, px + oX, py + oY, pz + oZ, d);
// Guard against writing more entries than were counted in the first pass.
586 VerifyMsg(nBounceBackPdfs < kd->nBounceBackPdfs, "nBBPdfs %d < kd->nBBPdfs %d xyz: %d %d %d d: %d\n", nBounceBackPdfs, kd->nBounceBackPdfs, x, y, z, d);
590 kd->BounceBackPdfsSrc[nBounceBackPdfs] = srcIndex;
591 kd->BounceBackPdfsDst[nBounceBackPdfs] = dstIndex;
// Neighbor outside the lattice in a non-periodic dimension: no entry.
596 else if (nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) {
599 else if (ld->Lattice[L_INDEX_4(lDims, nx, ny, nz)] == LAT_CELL_OBSTACLE) {
600 // Depending on the time step we are in we have to exchange src and dst index.
602 // We build the list for the case, when we have finished odd time step
603 // (accessing neighbor PDFs) and before we start with the even time step
604 // (accessing local PDFs only).
606 // Assume our neighbor east of us, i.e. nx, ny, nz, is an obstacle cell.
607 // Then we have to move the east PDF from the obstacle to our west position,
608 // i.e. the inverse of east.
610 // In case of transition from even to odd time step src and dest just
611 // have to be exchanged.
613 srcIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, d);
614 dstIndex = P_INDEX_5(gDims, x + oX, y + oY, z + oZ, D3Q19_INV[d]);
// Guard against writing more entries than were counted in the first pass.
616 VerifyMsg(nBounceBackPdfs < kd->nBounceBackPdfs, "nBBPdfs %d < kd->nBBPdfs %d xyz: %d %d %d d: %d\n", nBounceBackPdfs, kd->nBounceBackPdfs, x, y, z, d);
618 kd->BounceBackPdfsSrc[nBounceBackPdfs] = srcIndex;
619 kd->BounceBackPdfsDst[nBounceBackPdfs] = dstIndex;
630 // Fill remaining KernelData structures
631 kd->GetNode = FNAME(GetNode);
632 kd->SetNode = FNAME(SetNode);
634 kd->BoundaryConditionsGetPdf = FNAME(BcGetPdf);
635 kd->BoundaryConditionsSetPdf = FNAME(BcSetPdf);
637 kd->Kernel = FNAME(D3Q19AaVecKernel);
// The single allocated lattice is the active one.
640 kd->PdfsActive = kd->Pdfs[0];
// Releases the memory referenced by *kernelData: the PDF lattice Pdfs[0], the
// two bounce back index arrays and finally the kernel data structure itself.
// ld is not used in the visible lines.
//
// NOTE(review): *kernelData is freed directly; presumably this relies on the
// KernelData being located at the start of the KernelDataAa allocated in
// D3Q19AaVecInit - confirm against the structure definition.
645 void FNAME(D3Q19AaVecDeinit)(LatticeDesc * ld, KernelData ** kernelData)
647 MemFree((void **) & ((*kernelData)->Pdfs[0]));
649 MemFree((void **) & ((*kernelData)->BounceBackPdfsSrc));
650 MemFree((void **) & ((*kernelData)->BounceBackPdfsDst));
652 MemFree((void **)kernelData);