1 // --------------------------------------------------------------------------
4 // Markus Wittmann, 2016-2017
5 // RRZE, University of Erlangen-Nuremberg, Germany
6 // markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
9 // LSS, University of Erlangen-Nuremberg, Germany
11 // This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
13 // LbmBenchKernels is free software: you can redistribute it and/or modify
14 // it under the terms of the GNU General Public License as published by
15 // the Free Software Foundation, either version 3 of the License, or
16 // (at your option) any later version.
18 // LbmBenchKernels is distributed in the hope that it will be useful,
19 // but WITHOUT ANY WARRANTY; without even the implied warranty of
20 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 // GNU General Public License for more details.
23 // You should have received a copy of the GNU General Public License
24 // along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>.
26 // --------------------------------------------------------------------------
27 #include "BenchKernelD3Q19ListAaCommon.h"
36 // Forward declaration of the kernel entry point. The init function in this
// file stores it in kd->Kernel.
37 void FNAME(D3Q19ListAaKernel)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd);
42 // -----------------------------------------------------------------------
43 // Functions which are used as callbacks by the kernel to read or write PDFs and nodes.
// Reads a single PDF of the node at (x, y, z) for direction dir from
// kd->PdfsActive and stores it in *pdf.
//
// kd is reinterpreted as a KernelDataList to reach Iteration, Grid, AdjList
// and nCells. When kdl->Iteration is even, the node index is taken from Grid
// and the value is fetched either through the adjacency list (using the
// inverse direction) or from the node's own slot.
// NOTE(review): the guard that selects between the adjacency-list read and the
// own-slot read is presumably `dir != D3Q19_C`, as in BCSetPdf -- confirm.
46 static void FNAME(BCGetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT * pdf)
// The active PDF array must be set and must be one of the two registered arrays.
49 Assert(kd->PdfsActive != NULL);
50 Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
// Coordinates must lie inside kd->Dims and dir must be a valid D3Q19 direction.
53 Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
54 Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
55 Assert(dir >= 0); Assert(dir < N_D3Q19);
57 KernelDataList * kdl = (KernelDataList *)kd;
59 if (kdl->Iteration % 2 == 0) {
60 // Pdfs are stored inverse, local PDFs are located in remote nodes
// Translate the lattice coordinates into the node index stored in Grid.
62 uint32_t nodeIndex = KDL(kd)->Grid[L_INDEX_4(kd->Dims, x, y, z)];
// Each node owns N_D3Q19_IDX consecutive entries in the adjacency list.
65 uint32_t adjListIndex = nodeIndex * N_D3Q19_IDX;
// Fetch the value through the adjacency entry of the inverse direction.
67 *pdf = kd->PdfsActive[KDL(kd)->AdjList[adjListIndex + D3Q19_INV[dir]]];
// Fetch the value from this node's own slot for direction dir.
70 *pdf = kd->PdfsActive[P_INDEX_3(KDL(kd)->nCells, nodeIndex, dir)];
// Fetch the value at the index P_INDEX_5 computes from (x, y, z, dir).
// NOTE(review): presumably the path taken for odd iterations -- confirm.
75 *pdf = kd->PdfsActive[P_INDEX_5(KDL(kd), x, y, z, dir)];
// Writes the value pdf into kd->PdfsActive for the node at (x, y, z) and
// direction dir. Counterpart of BCGetPdf; it uses the same addressing.
//
// When kdl->Iteration is even, the node index is taken from Grid. For
// dir != D3Q19_C the value is written through the adjacency list entry of the
// inverse direction.
82 static void FNAME(BCSetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT pdf)
// The active PDF array must be set and must be one of the two registered arrays.
85 Assert(kd->PdfsActive != NULL);
86 Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
// Coordinates must lie inside kd->Dims and dir must be a valid D3Q19 direction.
87 Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
88 Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
89 Assert(dir >= 0); Assert(dir < N_D3Q19);
// Diagnostic reporting the coordinates, direction index and direction name.
// NOTE(review): presumably guarded by a NaN check on pdf -- confirm when this fires.
92 printf("ERROR: setting nan %d %d %d %d %s\n", x, y, z, dir, D3Q19_NAMES[dir]);
97 KernelDataList * kdl = (KernelDataList *)kd;
99 if (kdl->Iteration % 2 == 0) {
100 // Pdfs are stored inverse, local PDFs are located in remote nodes
// Translate the lattice coordinates into the node index stored in Grid.
102 uint32_t nodeIndex = KDL(kd)->Grid[L_INDEX_4(kd->Dims, x, y, z)];
// Non-center directions are addressed through the adjacency list.
104 if (dir != D3Q19_C) {
105 uint32_t adjListIndex = nodeIndex * N_D3Q19_IDX;
// Store through the adjacency entry of the inverse direction.
107 kd->PdfsActive[KDL(kd)->AdjList[adjListIndex + D3Q19_INV[dir]]] = pdf;
// Store into this node's own slot for direction dir.
// NOTE(review): presumably the center-direction case -- confirm.
110 kd->PdfsActive[P_INDEX_3(KDL(kd)->nCells, nodeIndex, dir)] = pdf;
// Store at the index P_INDEX_5 computes from (x, y, z, dir).
// NOTE(review): presumably the path taken for odd iterations -- confirm.
115 kd->PdfsActive[P_INDEX_5(KDL(kd), x, y, z, dir)] = pdf;
// Copies the PDFs of the node at (x, y, z) from kd->PdfsActive into the
// caller-provided array pdfs, indexed by direction.
//
// When kdl->Iteration is even, the center PDF is read from the node's own slot
// and the X() macro reads every other direction through the adjacency list
// using the inverse direction index. The second X() definition reads each
// direction at the index given by P_INDEX_5.
// NOTE(review): the second X() definition is presumably used for odd
// iterations -- confirm.
122 static void GetNode(KernelData * kd, int x, int y, int z, PdfT * pdfs)
// The active PDF array must be set and must be one of the two registered arrays.
125 Assert(kd->PdfsActive != NULL);
126 Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
127 Assert(pdfs != NULL);
// Coordinates must lie inside kd->Dims.
128 Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
129 Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
131 KernelDataList * kdl = (KernelDataList *)kd;
133 if(kdl->Iteration % 2 == 0){
// Node index from Grid; each node owns N_D3Q19_IDX adjacency entries.
135 uint32_t nodeIndex = kdl->Grid[L_INDEX_4(kdl->kd.Dims, x, y, z)];
136 uint32_t adjListIndex = nodeIndex * N_D3Q19_IDX;
138 // Load PDFs of local cell: pdf_N = src[adjList[adjListIndex + D3Q19_S]]; ...
// The center PDF is read directly from the node's own slot.
139 pdfs[D3Q19_C] = kd->PdfsActive[P_INDEX_3(kdl->nCells, nodeIndex, D3Q19_C)];
// Per-direction read through the adjacency entry of the inverse direction.
141 #define X(name, idx, idxinv, _x, _y, _z) pdfs[idx] = kd->PdfsActive[kdl->AdjList[adjListIndex + idxinv]];
// Per-direction read at the index P_INDEX_5 computes from the coordinates.
147 #define I(x, y, z, dir) P_INDEX_5(KDL(kd), (x), (y), (z), (dir))
148 #define X(name, idx, idxinv, _x, _y, _z) pdfs[idx] = kd->PdfsActive[I(x, y, z, idx)];
// Debug check: on the first NaN or infinite entry, dump all values of the node.
// NOTE(review): the literal 19 presumably equals N_D3Q19 -- confirm.
157 for (int d = 0; d < 19; ++d) {
158 if(isnan(pdfs[d]) || isinf(pdfs[d])) {
159 printf("%d %d %d %d nan! get node\n", x, y, z, d);
160 for (int d2 = 0; d2 < 19; ++d2) {
161 printf("%d: %e\n", d2, pdfs[d2]);
// Writes the PDFs in pdfs (indexed by direction) into kd->PdfsActive for the
// node at (x, y, z). Counterpart of GetNode; it uses the same addressing.
//
// When kdl->Iteration is even, the center PDF is written to the node's own
// slot and the X() macro writes every other direction through the adjacency
// list using the inverse direction index. The second X() definition writes
// each direction at the index given by P_INDEX_5.
// NOTE(review): the second X() definition is presumably used for odd
// iterations -- confirm.
172 static void SetNode(KernelData * kd, int x, int y, int z, PdfT * pdfs)
// The active PDF array must be set and must be one of the two registered arrays.
175 Assert(kd->PdfsActive != NULL);
176 Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
177 Assert(pdfs != NULL);
// Coordinates must lie inside kd->Dims.
179 Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
180 Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
// Debug dump of all values of the node that is about to be written.
// NOTE(review): the message says "get node" although this is SetNode; it looks
// copied from GetNode -- confirm and adjust the text.
// NOTE(review): the literal 19 presumably equals N_D3Q19 -- confirm.
184 for (int d = 0; d < 19; ++d) {
186 printf("%d %d %d %d nan! get node\n", x, y, z, d);
187 for (int d2 = 0; d2 < 19; ++d2) {
188 printf("%d: %e\n", d2, pdfs[d2]);
195 KernelDataList * kdl = (KernelDataList *)kd;
197 if(kdl->Iteration % 2 == 0){
// Node index from Grid; each node owns N_D3Q19_IDX adjacency entries.
199 uint32_t nodeIndex = kdl->Grid[L_INDEX_4(kdl->kd.Dims, x, y, z)];
200 uint32_t adjListIndex = nodeIndex * N_D3Q19_IDX;
202 // Store PDFs of local cell: dst[adjList[adjListIndex + D3Q19_S]] = pdf_N; ...
// The center PDF is written directly to the node's own slot.
203 kd->PdfsActive[P_INDEX_3(kdl->nCells, nodeIndex, D3Q19_C)] = pdfs[D3Q19_C];
// Per-direction write through the adjacency entry of the inverse direction.
205 #define X(name, idx, idxinv, _x, _y, _z) kd->PdfsActive[kdl->AdjList[adjListIndex + idxinv]] = pdfs[idx];
// Per-direction write at the index P_INDEX_5 computes from the coordinates.
211 #define I(x, y, z, dir) P_INDEX_5(KDL(kd), (x), (y), (z), (dir))
212 #define X(name, idx, idxinv, _x, _y, _z) kd->PdfsActive[I(x, y, z, idx)] = pdfs[idx];
// Prints the kernel-specific command line parameters to stdout. The -pad
// option is only listed when DATA_LAYOUT_SOA is defined.
222 static void ParameterUsage()
224 printf("Kernel parameters:\n");
225 printf(" [-blk <n>] [-blk-[xyz] <n>]\n");
226 #ifdef DATA_LAYOUT_SOA
227 printf(" [-pad auto|modulus_1+offset_1(,modulus_n+offset_n)*]\n");
// Parses the kernel-specific arguments in params->KernelArgs.
//
//   params   supplies KernelArgs and nKernelArgs.
//   blk      receives the blocking factors for x, y and z; all three start at 0.
//   padInfo  receives the result of PadInfoFromStr() for -pad (only when
//            DATA_LAYOUT_SOA is defined).
//
// Recognized options: -blk (sets all three factors), -blk-x, -blk-y, -blk-z,
// -pad and -h/-help/--help. Each option is also accepted with a leading "--".
// Anything else is reported as an unknown kernel parameter.
//
// NOTE(review): the strtol() result (a long) is stored in an int and strtol is
// called with a NULL end pointer, so non-numeric input is indistinguishable
// from 0 and out-of-range values are not detected -- confirm this is acceptable.
232 static void ParseParameters(Parameters * params, int * blk, PadInfo ** padInfo)
// A blocking factor of 0 is the default for every dimension.
236 blk[0] = 0; blk[1] = 0; blk[2] = 0;
// ARG_IS compares the current argument with an option name.
// NEXT_ARG_PRESENT reports an error when the current option is the last
// argument and therefore has no value following it.
239 #define ARG_IS(param) (!strcmp(params->KernelArgs[i], param))
240 #define NEXT_ARG_PRESENT() \
242 if (i + 1 >= params->nKernelArgs) { \
243 printf("ERROR: argument %s requires a parameter.\n", params->KernelArgs[i]); \
249 for (int i = 0; i < params->nKernelArgs; ++i) {
250 if (ARG_IS("-blk") || ARG_IS("--blk")) {
// Base 0 lets strtol accept decimal, octal (0...) and hexadecimal (0x...) input.
253 int tmp = strtol(params->KernelArgs[++i], NULL, 0);
256 printf("ERROR: blocking parameter must be >= 0.\n");
// The same blocking factor is applied to all three dimensions.
260 blk[0] = blk[1] = blk[2] = tmp;
262 else if (ARG_IS("-blk-x") || ARG_IS("--blk-x")) {
265 int tmp = strtol(params->KernelArgs[++i], NULL, 0);
268 printf("ERROR: blocking parameter must be >= 0.\n");
274 else if (ARG_IS("-blk-y") || ARG_IS("--blk-y")) {
277 int tmp = strtol(params->KernelArgs[++i], NULL, 0);
280 printf("ERROR: blocking parameter must be >= 0.\n");
286 else if (ARG_IS("-blk-z") || ARG_IS("--blk-z")) {
289 int tmp = strtol(params->KernelArgs[++i], NULL, 0);
292 printf("ERROR: blocking parameter must be >= 0.\n");
298 #ifdef DATA_LAYOUT_SOA
299 else if (ARG_IS("-pad") || ARG_IS("--pad")) {
// The padding specification string is handed to PadInfoFromStr unchanged.
302 *padInfo = PadInfoFromStr(params->KernelArgs[++i]);
305 else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) {
310 printf("ERROR: unknown kernel parameter.\n");
317 #undef NEXT_ARG_PRESENT
// Sets up the kernel data for the list-based D3Q19 AA kernel.
//
//   ld          lattice description; Dims, nFluid, Lattice and the Periodic*
//               flags are read.
//   kernelData  receives the kernel data (NOTE(review): the assignment itself
//               is not shown here -- confirm).
//   params      kernel arguments, forwarded to ParseParameters().
//
// Steps, in order:
//   1. allocate a KernelDataList and reset its KernelData fields,
//   2. parse blocking/padding parameters,
//   3. allocate and initialize the PDF array,
//   4. number all non-obstacle cells block by block (Grid and Coords),
//   5. build the adjacency list holding, per node and non-center direction,
//      the destination index into the PDF array,
//   6. install the accessor callbacks and the kernel function.
322 void FNAME(D3Q19ListAaInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
325 KernelDataList * kdl;
// NOTE(review): the result of this MemAlloc call is not checked here, unlike the
// grid/coords/adjList allocations below -- confirm MemAlloc aborts on failure.
326 MemAlloc((void **)&kdl, sizeof(KernelDataList));
// kd is the same object viewed through the KernelData type.
328 kd = (KernelData *)kdl;
// Reset fields to "not set" values before they are filled in below.
334 kd->PdfsActive = NULL;
340 kd->GlobalDims[0] = -1;
341 kd->GlobalDims[1] = -1;
342 kd->GlobalDims[2] = -1;
347 kd->ObstIndices = NULL;
348 kd->nObstIndices = -1;
349 kd->BounceBackPdfsSrc = NULL;
350 kd->BounceBackPdfsDst = NULL;
351 kd->nBounceBackPdfs = -1;
361 PadInfo * padInfo = NULL;
// Fills blk with the blocking factors and, if requested, padInfo.
363 ParseParameters(params, blk, &padInfo);
365 // Adjust the dimensions according to padding, if used.
366 kd->Dims[0] = kd->GlobalDims[0] = ld->Dims[0];
367 kd->Dims[1] = kd->GlobalDims[1] = ld->Dims[1];
368 kd->Dims[2] = kd->GlobalDims[2] = ld->Dims[2];
370 int * lDims = ld->Dims;
// NOTE(review): the product is computed in int; large lattices could overflow -- confirm limits.
376 int nTotalCells = lX * lY * lZ;
// nCells is the (possibly padded) number of cells per direction in the PDF
// array; nFluid is the number of fluid nodes reported by the lattice.
377 int nCells = ld->nFluid;
378 int nFluid = ld->nFluid;
380 #ifdef DATA_LAYOUT_SOA
// Padding may enlarge nCells; the padding description is released afterwards.
382 nCells = PadCellsAndReport(nCells, sizeof(PdfT), &padInfo);
383 PadInfoFree(padInfo); padInfo = NULL;
387 // TODO: check nCells/nFluid do not exceed 2^31. This actually has to be
388 // done during lattice setup.
389 kdl->nCells = nCells;
390 kdl->nFluid = nFluid;
// A blocking factor of 0 means "no blocking": use the full lattice extent.
394 if (blk[0] == 0) blk[0] = lX;
395 if (blk[1] == 0) blk[1] = lY;
396 if (blk[2] == 0) blk[2] = lZ;
398 printf("# blocking x: %3d y: %3d z: %3d\n", blk[0], blk[1], blk[2]);
// NOTE(review): the byte count is a size_t expression printed with %lu; %zu is
// the matching conversion specifier.
// NOTE(review): the reported size is doubled ("both lattices"), but only
// Pdfs[0] is allocated below -- confirm the message.
400 printf("# allocating data for %d fluid LB nodes with padding (%lu bytes = %f MiB for both lattices)\n",
401 nCells, 2 * sizeof(PdfT) * nCells * N_D3Q19,
402 2 * sizeof(PdfT) * nCells * N_D3Q19 / 1024.0 / 1024.0);
// One array holding N_D3Q19 values for each of the nCells cells.
// NOTE(review): the result of this MemAlloc call is not checked here -- confirm
// MemAlloc aborts on failure.
404 MemAlloc((void **)&pdfs[0], sizeof(PdfT) * nCells * N_D3Q19);
406 kd->Pdfs[0] = pdfs[0];
408 // Initialize PDFs with some (arbitrary) data for correct NUMA placement.
409 // Here we touch only the fluid nodes as this loop is OpenMP parallel and
410 // we want the same scheduling as in the kernel.
412 #pragma omp parallel for
414 for (int i = 0; i < nFluid; ++i) { for(int d = 0; d < N_D3Q19; ++d) {
415 pdfs[0][P_INDEX_3(nCells, i, d)] = 1.0;
418 // Initialize all PDFs to some standard value.
419 for (int i = 0; i < nFluid; ++i) { for(int d = 0; d < N_D3Q19; ++d) {
420 pdfs[0][P_INDEX_3(nCells, i, d)] = 0.0;
423 // ----------------------------------------------------------------------
424 // create grid which will hold the index numbers of the fluid nodes
// NOTE(review): size_t argument printed with %lu; %zu is the matching specifier.
428 if (MemAlloc((void **)&grid, nTotalCells * sizeof(uint32_t))) {
429 printf("ERROR: allocating grid for numbering failed: %lu bytes.\n", nTotalCells * sizeof(uint32_t));
// Mark every lattice cell as having no node index yet (all bits set).
437 for(int z = 0; z < lZ; ++z) {
438 for(int y = 0; y < lY; ++y) {
439 for(int x = 0; x < lX; ++x) {
441 latticeIndex = L_INDEX_4(ld->Dims, x, y, z);
443 grid[latticeIndex] = ~0;
449 // ----------------------------------------------------------------------
450 // generate numbering over grid
// Coords stores the x, y and z coordinate of every fluid node.
// NOTE(review): size_t argument printed with %lu; %zu is the matching specifier.
454 if (MemAlloc((void **)&coords, nFluid * sizeof(uint32_t) * 3)) {
455 printf("ERROR: allocating coords array failed: %lu bytes.\n", nFluid * sizeof(uint32_t) * 3);
459 kdl->Coords = coords;
461 // Index for the PDF nodes can start at 0 as we distinguish solid and fluid nodes
462 // through the ld->Lattice array.
465 // Blocking is implemented via setup of the adjacency list. The kernel later will
466 // walk through the lattice blocked automatically.
// Outer loops step over blocks of size blk[0] x blk[1] x blk[2].
467 for (int bX = 0; bX < lX; bX += blk[0]) {
468 for (int bY = 0; bY < lY; bY += blk[1]) {
469 for (int bZ = 0; bZ < lZ; bZ += blk[2]) {
// Clamp the block end to the lattice extent for partial blocks at the border.
471 int eX = MIN(bX + blk[0], lX);
472 int eY = MIN(bY + blk[1], lY);
473 int eZ = MIN(bZ + blk[2], lZ);
// Inner loops visit every cell of the current block.
475 for (int x = bX; x < eX; ++x) {
476 for (int y = bY; y < eY; ++y) {
477 for (int z = bZ; z < eZ; ++z) {
479 latticeIndex = L_INDEX_4(lDims, x, y, z);
// Every non-obstacle cell gets the current counter value as node index and
// has its coordinates recorded.
481 if (ld->Lattice[latticeIndex] != LAT_CELL_OBSTACLE) {
482 grid[latticeIndex] = counter;
484 coords[C_INDEX_X(counter)] = x;
485 coords[C_INDEX_Y(counter)] = y;
486 coords[C_INDEX_Z(counter)] = z;
// The number of numbered cells must match the fluid count of the lattice.
493 Verify(counter == nFluid);
497 // AdjList only requires 18 instead of 19 entries per node, as
498 // the center PDF needs no addressing.
// NOTE(review): size_t argument printed with %lu; %zu is the matching specifier.
499 if (MemAlloc((void **)&adjList, nFluid * sizeof(uint32_t) * N_D3Q19_IDX)) {
500 printf("ERROR: allocating adjList array failed: %lu bytes.\n", nFluid * sizeof(uint32_t) * N_D3Q19_IDX);
504 kdl->AdjList = adjList;
508 uint32_t neighborIndex;
// nx/ny/nz: neighbor coordinates; px/py/pz: coordinates used for the lookups
// in the periodic case below.
511 int nx, ny, nz, px, py, pz;
513 // Loop over all fluid nodes and compute the indices to the neighboring
514 // PDFs for the configured data layout (AoS/SoA).
516 #pragma omp parallel for
// Pre-fill the adjacency list; -1 stored in a uint32_t entry has all bits set.
518 for (int index = 0; index < nFluid; ++index) {
519 for (int d = 0; d < N_D3Q19_IDX; ++d) {
520 adjList[index * N_D3Q19_IDX + d] = -1;
524 // #ifdef _OPENMP --> add line continuation
525 // #pragma omp parallel for default(none)
526 // shared(nFluid, nCells, coords, D3Q19_INV, D3Q19_X, D3Q19_Y, D3Q19_Z,
528 // lDims, grid, ld, lX, lY, lZ, adjList)
529 // private(x, y, z, nx, ny, nz, neighborIndex, dstIndex)
531 for (int index = 0; index < nFluid; ++index) {
// Recover the lattice coordinates of this fluid node.
532 x = coords[C_INDEX_X(index)];
533 y = coords[C_INDEX_Y(index)];
534 z = coords[C_INDEX_Z(index)];
536 Assert(x >= 0 && x < lX);
537 Assert(y >= 0 && y < lY);
538 Assert(z >= 0 && z < lZ);
540 Assert(ld->Lattice[L_INDEX_4(lDims, x, y, z)] != LAT_CELL_OBSTACLE);
542 // Loop over all directions except the center one.
543 for(int d = 0; d < N_D3Q19 - 1; ++d) {
544 Assert(d != D3Q19_C);
// The propagation model is selected at compile time; anything other than
// push or pull is a build error.
546 #ifdef PROP_MODEL_PUSH
551 #elif PROP_MODEL_PULL
556 #error No implementation for this PROP_MODEL_NAME.
558 // If the neighbor is outside the lattice in a direction that has a
559 // periodic boundary then we need to wrap around.
560 if ( ((nx < 0 || nx >= lX) && ld->PeriodicX) ||
561 ((ny < 0 || ny >= lY) && ld->PeriodicY) ||
562 ((nz < 0 || nz >= lZ) && ld->PeriodicZ)
// Cell at (px, py, pz) is an obstacle: point at this node's own slot of the
// inverse direction.
594 if (ld->Lattice[L_INDEX_4(lDims, px, py, pz)] == LAT_CELL_OBSTACLE) {
595 dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
// Otherwise point at the slot for direction d of the node at (px, py, pz).
598 neighborIndex = grid[L_INDEX_4(lDims, px, py, pz)];
600 AssertMsg(neighborIndex != ~0, "Neighbor has no Index. (%d %d %d) direction %s (%d)\n", px, py, pz, D3Q19_NAMES[d], d);
602 dstIndex = P_INDEX_3(nCells, neighborIndex, d);
// Neighbor lies outside the lattice: point at this node's own slot of the
// inverse direction.
605 else if (nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) {
606 dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
// Neighbor is an obstacle: same redirection to the own inverse slot.
608 else if (ld->Lattice[L_INDEX_4(lDims, nx, ny, nz)] == LAT_CELL_OBSTACLE) {
609 dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
// Regular fluid neighbor: point at its slot for direction d.
612 neighborIndex = grid[L_INDEX_4(lDims, nx, ny, nz)];
614 Assert(neighborIndex != ~0);
616 dstIndex = P_INDEX_3(nCells, neighborIndex, d);
// The destination must be a valid index into the PDF array.
619 Assert(dstIndex >= 0);
620 Assert(dstIndex < nCells * N_D3Q19);
622 adjList[index * N_D3Q19_IDX + d] = dstIndex;
627 // Fill remaining KernelData structures
628 kd->GetNode = GetNode;
629 kd->SetNode = SetNode;
631 kd->BoundaryConditionsGetPdf = FNAME(BCGetPdf);
632 kd->BoundaryConditionsSetPdf = FNAME(BCSetPdf);
634 kd->Kernel = FNAME(D3Q19ListAaKernel);
// The single allocated PDF array is the active one.
637 kd->PdfsActive = kd->Pdfs[0];
// Releases the memory owned by the kernel data: the PDF array Pdfs[0], the
// list-specific arrays AdjList, Coords and Grid, and finally the kernel data
// structure itself. The member arrays are freed before the structure that
// holds their pointers.
//
//   ld          not used by the statements shown here.
//   kernelData  address of the kernel data pointer; it is passed to MemFree last.
642 void FNAME(D3Q19ListAaDeinit)(LatticeDesc * ld, KernelData ** kernelData)
// Same object, viewed through the list-specific type.
644 KernelDataList ** kdl = (KernelDataList **)kernelData;
646 MemFree((void **)&((*kernelData)->Pdfs[0]));
648 MemFree((void **)&((*kdl)->AdjList));
649 MemFree((void **)&((*kdl)->Coords));
650 MemFree((void **)&((*kdl)->Grid));
652 MemFree((void **)kernelData);