src/BenchKernelD3Q19ListPullSplitNtCommon.c

   1 // --------------------------------------------------------------------------
   2 //
   3 // Copyright
   4 //   Markus Wittmann, 2016-2017
   5 //   RRZE, University of Erlangen-Nuremberg, Germany
   6 //   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
   7 //
   8 //   Viktor Haag, 2016
   9 //   LSS, University of Erlangen-Nuremberg, Germany
  10 //
  11 //  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
  12 //
  13 //  LbmBenchKernels is free software: you can redistribute it and/or modify
  14 //  it under the terms of the GNU General Public License as published by
  15 //  the Free Software Foundation, either version 3 of the License, or
  16 //  (at your option) any later version.
  17 //
  18 //  LbmBenchKernels is distributed in the hope that it will be useful,
  19 //  but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 //  GNU General Public License for more details.
  22 //
  23 //  You should have received a copy of the GNU General Public License
  24 //  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
  25 //
  26 // --------------------------------------------------------------------------
  27 #include "BenchKernelD3Q19ListPullSplitNtCommon.h"
  28
  29 #include "Memory.h"
  30 #include "Vtk.h"
  31 #include "Vector.h"
  32 #include "Padding.h"
  33
  34
  35 #include <math.h>
  36
  37 #ifdef _OPENMP
  38         #include <omp.h>
  39 #endif
  40
  41 // Forward definition.
  42 void FNAME(KernelPullSplitNt1S)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd);
  43 void FNAME(KernelPullSplitNt2S)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd);
  44
  45 void FNAME(KernelPullSplitNtRia1S)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd);
  46 void FNAME(KernelPullSplitNtRia2S)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd);
  47
  48
  49
  50
  51 // -----------------------------------------------------------------------
  52 // Functions which are used as callback by the kernel to read or write
  53 // PDFs and nodes.
  54
  55 static void FNAME(BCGetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT * pdf)
  56 {
  57         Assert(kd != NULL);
  58         Assert(kd->PdfsActive != NULL);
  59         Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
  60         Assert(pdf != NULL);
  61
  62         Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
  63         Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
  64         Assert(dir >= 0); Assert(dir < N_D3Q19);
  65
  66         // The relevant PDFs here are the ones, which will get streamed in later
  67         // during propagation. So we must return the *remote* PDFs.
  68         uint32_t nodeIndex = KDL(kd)->Grid[L_INDEX_4(kd->Dims, x, y, z)];
  69
  70         if (dir != D3Q19_C) {
  71
  72                 uint32_t adjListIndex = nodeIndex * N_D3Q19_IDX;
  73
  74                 *pdf = kd->PdfsActive[KDL(kd)->AdjList[adjListIndex + dir]];
  75         }
  76         else {
  77                 *pdf = kd->PdfsActive[P_INDEX_3(KDL(kd)->nCells, nodeIndex, dir)];
  78
  79         }
  80
  81         return;
  82 }
  83
  84 static void FNAME(BCSetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT pdf)
  85 {
  86         Assert(kd != NULL);
  87         Assert(kd->PdfsActive != NULL);
  88         Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
  89         Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
  90         Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
  91         Assert(dir >= 0); Assert(dir < N_D3Q19);
  92
  93         if (isnan(pdf)) {
  94                 printf("ERROR: setting nan %d %d %d %d %s\n", x, y, z, dir, D3Q19_NAMES[dir]);
  95                 DEBUG_BREAK_POINT();
  96                 exit(1);
  97         }
  98
  99         // The relevant PDFs here are the ones, which will get streamed in later
 100         // during propagation. So we must set this *remote* PDFs.
 101         uint32_t nodeIndex = KDL(kd)->Grid[L_INDEX_4(kd->Dims, x, y, z)];
 102
 103         if (dir != D3Q19_C) {
 104
 105                 uint32_t adjListIndex = nodeIndex * N_D3Q19_IDX;
 106
 107                 kd->PdfsActive[KDL(kd)->AdjList[adjListIndex + dir]] = pdf;
 108         }
 109         else {
 110                 kd->PdfsActive[P_INDEX_3(KDL(kd)->nCells, nodeIndex, dir)] = pdf;
 111
 112         }
 113
 114         return;
 115 }
 116
 117
 118 static void GetNode(KernelData * kd, int x, int y, int z, PdfT * pdfs)
 119 {
 120         Assert(kd != NULL);
 121         Assert(kd->PdfsActive != NULL);
 122         Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
 123         Assert(pdfs != NULL);
 124         Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
 125         Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
 126
 127         PdfT sum = 0.0;
 128
 129         // TODO: pull scheme?
 130
 131         #define I(x, y, z, dir) P_INDEX_5(KDL(kd), (x), (y), (z), (dir))
 132         #define X(name, idx, idxinv, _x, _y, _z)        pdfs[idx] = kd->PdfsActive[I(x, y, z, idx)]; sum += pdfs[idx];
 133         D3Q19_LIST
 134         #undef X
 135         #undef I
 136
 137 #ifdef DETECT_NANS
 138         for (int d = 0; d < 19; ++d) {
 139                 if(isnan(pdfs[d]) || isinf(pdfs[d])) {
 140                         printf("%d %d %d %d nan! get node\n", x, y, z, d);
 141                                                 for (int d2 = 0; d2 < 19; ++d2) {
 142                                                         printf("%d: %e\n", d2, pdfs[d2]);
 143                                                 }
 144                         exit(1);
 145                 }
 146         }
 147 #endif
 148         return;
 149 }
 150
 151
 152 static void SetNode(KernelData * kd, int x, int y, int z, PdfT * pdfs)
 153 {
 154         Assert(kd != NULL);
 155         Assert(kd->PdfsActive != NULL);
 156         Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
 157         Assert(pdfs != NULL);
 158
 159         Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
 160         Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
 161
 162 #ifdef DETECT_NANS
 163         for (int d = 0; d < 19; ++d) {
 164                 if(isnan(pdfs[d])) {
 165                         printf("%d %d %d %d nan! get node\n", x, y, z, d);
 166                                                 for (int d2 = 0; d2 < 19; ++d2) {
 167                                                         printf("%d: %e\n", d2, pdfs[d2]);
 168                                                 }
 169                         exit(1);
 170                 }
 171         }
 172 #endif
 173
 174         // TODO: pull scheme?
 175         #define I(x, y, z, dir) P_INDEX_5(KDL(kd), (x), (y), (z), (dir))
 176         #define X(name, idx, idxinv, _x, _y, _z)        kd->PdfsActive[I(x, y, z, idx)] = pdfs[idx];
 177         D3Q19_LIST
 178         #undef X
 179         #undef I
 180
 181         return;
 182 }
 183
 184 static void ParameterUsage()
 185 {
 186         printf("Kernel parameters:\n");
 187         printf("  [-blk <n>] [-blk-[xyz] <n>] [-n-tmp-array <n>]\n");
 188         printf("  [-pad auto|modulus_1+offset_1(,modulus_n+offset_n)*]\n");
 189
 190         return;
 191 }
 192
 193 static void ParseParameters(Parameters * params, int * blk, int * nTmpArray, PadInfo ** padInfo)
 194 {
 195         Assert(blk != NULL);
 196
 197         blk[0] = 0; blk[1] = 0; blk[2] = 0;
 198         *nTmpArray = 152 - (152 % VSIZE);
 199         *padInfo = NULL;
 200
 201         #define ARG_IS(param)                   (!strcmp(params->KernelArgs[i], param))
 202         #define NEXT_ARG_PRESENT() \
 203                 do { \
 204                         if (i + 1 >= params->nKernelArgs) { \
 205                                 printf("ERROR: argument %s requires a parameter.\n", params->KernelArgs[i]); \
 206                                 exit(1); \
 207                         } \
 208                 } while (0)
 209
 210
 211         for (int i = 0; i < params->nKernelArgs; ++i) {
 212                 if (ARG_IS("-blk") || ARG_IS("--blk")) {
 213                         NEXT_ARG_PRESENT();
 214
 215                         int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 216
 217                         if (tmp < 0) {
 218                                 printf("ERROR: blocking parameter must be >= 0.\n");
 219                                 exit(1);
 220                         }
 221
 222                         blk[0] = blk[1] = blk[2] = tmp;
 223                 }
 224                 else if (ARG_IS("-blk-x") || ARG_IS("--blk-x")) {
 225                         NEXT_ARG_PRESENT();
 226
 227                         int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 228
 229                         if (tmp < 0) {
 230                                 printf("ERROR: blocking parameter must be >= 0.\n");
 231                                 exit(1);
 232                         }
 233
 234                         blk[0] = tmp;
 235                 }
 236                 else if (ARG_IS("-blk-y") || ARG_IS("--blk-y")) {
 237                         NEXT_ARG_PRESENT();
 238
 239                         int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 240
 241                         if (tmp < 0) {
 242                                 printf("ERROR: blocking parameter must be >= 0.\n");
 243                                 exit(1);
 244                         }
 245
 246                         blk[1] = tmp;
 247                 }
 248                 else if (ARG_IS("-blk-z") || ARG_IS("--blk-z")) {
 249                         NEXT_ARG_PRESENT();
 250
 251                         int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 252
 253                         if (tmp < 0) {
 254                                 printf("ERROR: blocking parameter must be >= 0.\n");
 255                                 exit(1);
 256                         }
 257
 258                         blk[2] = tmp;
 259                 }
 260                 else if (ARG_IS("-n-tmp-array") || ARG_IS("--n-tmp-array")) {
 261                         NEXT_ARG_PRESENT();
 262
 263                         int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 264
 265                         if (tmp <= 0) {
 266                                 printf("ERROR: -n-tmp-array parameter must be > 0.\n");
 267                                 exit(1);
 268                         }
 269
 270                         if (tmp % VSIZE != 0) {
 271                                 printf("ERROR: value for -n-tmp-array must be a multiple of %d.\n", VSIZE);
 272                                 exit(1);
 273                         }
 274
 275                         *nTmpArray = tmp;
 276                 }
 277                 else if (ARG_IS("-pad") || ARG_IS("--pad")) {
 278                         NEXT_ARG_PRESENT();
 279
 280                         *padInfo = PadInfoFromStr(params->KernelArgs[++i]);
 281                 }
 282                 else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) {
 283                         ParameterUsage();
 284                         exit(1);
 285                 }
 286                 else {
 287                         printf("ERROR: unknown kernel parameter.\n");
 288                         ParameterUsage();
 289                         exit(1);
 290                 }
 291         }
 292
 293         #undef ARG_IS
 294         #undef NEXT_ARG_PRESENT
 295
 296         return;
 297 }
 298
 299 static void SetupConsecNodes(LatticeDesc * ld, KernelDataListRia * kdlr, int nThreads)
 300 {
 301         Assert(ld != NULL);
 302         Assert(kdlr != NULL);
 303         Assert(nThreads > 0);
 304
 305         uint32_t * adjList = kdlr->kdl.AdjList;
 306
 307         uint32_t nConsecNodes = 0;
 308         uint32_t consecIndex = 0;
 309
 310         int nFluid = kdlr->kdl.nFluid;
 311
 312         uint32_t * consecThreadIndices = (uint32_t *)malloc(sizeof(uint32_t) * (nThreads + 1));
 313
 314         int nNodesPerThread = nFluid / nThreads;
 315
 316         for (int i = 0; i < nThreads; ++i) {
 317                 consecThreadIndices[i] = i * nNodesPerThread + MinI(i, nFluid % nThreads);
 318         }
 319         consecThreadIndices[nThreads] = -1;
 320
 321         int indexThread = 1;
 322
 323         // We execute following code two times.
 324         // - The first time to get the count of how many entries we need for the
 325         //   consecNodes array.
 326         // - The second time to fill the array.
 327
 328         // Loop over adjacency list of all nodes.
 329     // Compare if adjacent nodes share the same access pattern.
 330         for (int index = 1; index < nFluid; ++index) {
 331
 332                 int different = 0;
 333
 334                 // Loop over all directions except the center one.
 335                 for(int d = 0; d < N_D3Q19 - 1; ++d) {
 336                         Assert(d != D3Q19_C);
 337
 338                         if (adjList[index * N_D3Q19_IDX + d] != adjList[(index - 1) * N_D3Q19_IDX + d] + 1) {
 339                                 // Different access pattern.
 340                                 different = 1;
 341                                 break;
 342                         }
 343                 }
 344
 345                 if (consecThreadIndices[indexThread] == index) {
 346                         // We are at a thread boundary. Starting from this index the fluids
 347                         // belong to another thread. Force a break, if nodes are consecutive.
 348                         ++indexThread;
 349                         different = 1;
 350                 }
 351
 352                 if (different) {
 353                         ++consecIndex;
 354                 }
 355         }
 356
 357         if (nFluid > 0) {
 358                 nConsecNodes = consecIndex + 1;
 359         }
 360
 361         uint32_t * consecNodes;
 362         MemAlloc((void **)&consecNodes, sizeof(uint32_t) * nConsecNodes);
 363
 364         consecIndex = 0;
 365
 366         if (nFluid > 0) {
 367                 consecNodes[consecIndex] = 1;
 368         }
 369
 370         indexThread = 1;
 371         consecThreadIndices[0] = 0;
 372
 373         // Loop over adjacency list of all nodes.
 374     // Compare if adjacent nodes share the same access pattern.
 375         for (int index = 1; index < nFluid; ++index) {
 376
 377                 int different = 0;
 378
 379                 // Loop over all directions except the center one.
 380                 for(int d = 0; d < N_D3Q19 - 1; ++d) {
 381                         Assert(d != D3Q19_C);
 382
 383                         if (adjList[index * N_D3Q19_IDX + d] != adjList[(index - 1) * N_D3Q19_IDX + d] + 1) {
 384                                 // Different access pattern.
 385                                 different = 1;
 386                                 break;
 387                         }
 388                 }
 389
 390                 if (consecThreadIndices[indexThread] == index) {
 391                         // We are at a thread boundary. Starting from this index the fluids
 392                         // belong to another thread. Force a break, if nodes are consecutive.
 393                         consecThreadIndices[indexThread] = consecIndex + 1;
 394                         ++indexThread;
 395                         different = 1;
 396                 }
 397
 398                 if (different) {
 399                         ++consecIndex;
 400                         Assert(consecIndex < nConsecNodes);
 401                         consecNodes[consecIndex] = 1;
 402                 }
 403                 else {
 404                         Assert(consecIndex < nConsecNodes);
 405                         consecNodes[consecIndex] += 1;
 406                 }
 407         }
 408
 409
 410         kdlr->ConsecNodes = consecNodes;
 411         kdlr->nConsecNodes = nConsecNodes;
 412
 413         kdlr->ConsecThreadIndices  = consecThreadIndices;
 414         kdlr->nConsecThreadIndices = nThreads;
 415
 416         // printf("# total fluid nodes: %d   consecutive blocks: %d\n", nFluid, nConsecNodes);
 417
 418         return;
 419 }
 420
 421
 422 static void FNAME(Init)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
 423 {
 424         KernelData * kd;
 425         KernelDataList * kdl;
 426         KernelDataListRia * kdlr;
 427         MemAlloc((void **)&kdlr, sizeof(KernelDataListRia));
 428
 429         kd = (KernelData *)kdlr;
 430         kdl = KDL(kdlr);
 431
 432         *kernelData = kd;
 433
 434 #ifdef DEBUG
 435         kd->Pdfs[0] = NULL;
 436         kd->Pdfs[1] = NULL;
 437         kd->PdfsActive = NULL;
 438         kd->DstPdfs = NULL;
 439         kd->SrcPdfs = NULL;
 440         kd->Dims[0] = -1;
 441         kd->Dims[1] = -1;
 442         kd->Dims[2] = -1;
 443         kd->GlobalDims[0] = -1;
 444         kd->GlobalDims[1] = -1;
 445         kd->GlobalDims[2] = -1;
 446         kd->Offsets[0] = -1;
 447         kd->Offsets[1] = -1;
 448         kd->Offsets[2] = -1;
 449
 450         kd->ObstIndices = NULL;
 451         kd->nObstIndices = -1;
 452         kd->BounceBackPdfsSrc = NULL;
 453         kd->BounceBackPdfsDst = NULL;
 454         kd->nBounceBackPdfs = -1;
 455
 456         kdl->AdjList = NULL;
 457         kdl->Coords = NULL;
 458         kdl->Grid = NULL;
 459         kdl->nCells = -1;
 460         kdl->nFluid = -1;
 461
 462         kdlr->ConsecNodes = NULL;
 463         kdlr->nConsecNodes = 0;
 464         kdlr->ConsecThreadIndices = NULL;
 465         kdlr->nConsecThreadIndices = 0;
 466 #endif
 467
 468         int blk[3] = { 0 };
 469         PadInfo * padInfo = NULL;
 470
 471         ParseParameters(params, blk, &kdlr->nTmpArray, &padInfo);
 472
 473         // Ajust the dimensions according to padding, if used.
 474         kd->Dims[0] = kd->GlobalDims[0] = ld->Dims[0];
 475         kd->Dims[1] = kd->GlobalDims[1] = ld->Dims[1];
 476         kd->Dims[2] = kd->GlobalDims[2] = ld->Dims[2];
 477
 478         int * lDims = ld->Dims;
 479
 480         int lX = lDims[0];
 481         int lY = lDims[1];
 482         int lZ = lDims[2];
 483
 484         int nTotalCells = lX * lY * lZ;
 485         int nCells = ld->nFluid;
 486         int nFluid = ld->nFluid;
 487
 488         {
 489                 nCells = PadCellsAndReport(nCells, sizeof(PdfT), &padInfo);
 490                 PadInfoFree(padInfo); padInfo = NULL;
 491         }
 492
 493         // We padd each stream of a PDF array for a complete cache line.
 494         int nElementsPerCl = 64 / sizeof(PdfT);
 495
 496         nCells = nCells + (nElementsPerCl - nCells % nElementsPerCl);
 497
 498         Assert(nCells % VSIZE == 0);
 499
 500         kdl->nCells = nCells;
 501         kdl->nFluid = nFluid;
 502
 503         PdfT * pdfs[2];
 504
 505         if (blk[0] == 0) blk[0] = lX;
 506         if (blk[1] == 0) blk[1] = lY;
 507         if (blk[2] == 0) blk[2] = lZ;
 508
 509         printf("# blocking               x: %3d y: %3d z: %3d\n", blk[0], blk[1], blk[2]);
 510         printf("# temporary array size:  %d PDFs, %lu b\n", kdlr->nTmpArray, kdlr->nTmpArray * sizeof(PdfT) * 23);
 511
 512         double latMiB      = nCells * sizeof(PdfT) * N_D3Q19 / 1024.0 / 1024.0;
 513         double latFluidMib = nFluid * sizeof(PdfT) * N_D3Q19 / 1024.0 / 1024.0;
 514         double latPadMib   = (nCells - nFluid) * sizeof(PdfT) * N_D3Q19 / 1024.0 / 1024.0;
 515
 516         printf("# lattice size:          %e MiB total: %e MiB\n", latMiB,      latMiB * 2);
 517         printf("# fluid lattice size:    %e MiB total: %e MiB\n", latFluidMib, latFluidMib * 2);
 518         printf("# lattice padding:       %e MiB total: %e MiB\n", latPadMib,   latPadMib * 2);
 519
 520 #define PAGE_4K         4096
 521
 522         printf("# aligning lattices to:  %d b\n", PAGE_4K);
 523
 524         MemAllocAligned((void **)&pdfs[0], sizeof(PdfT) * nCells * N_D3Q19, PAGE_4K);
 525         MemAllocAligned((void **)&pdfs[1], sizeof(PdfT) * nCells * N_D3Q19, PAGE_4K);
 526
 527         kd->Pdfs[0] = pdfs[0];
 528         kd->Pdfs[1] = pdfs[1];
 529
 530         // Initialize PDFs with some (arbitrary) data for correct NUMA placement.
 531         // Here we touch only the fluid nodes as this loop is OpenMP parallel and
 532         // we want the same scheduling as in the kernel.
 533         #ifdef _OPENMP
 534                 #pragma omp parallel for
 535         #endif
 536         for (int i = 0; i < nFluid; ++i) { for(int d = 0; d < N_D3Q19; ++d) {
 537                 pdfs[0][P_INDEX_3(nCells, i, d)] = 1.0;
 538                 pdfs[1][P_INDEX_3(nCells, i, d)] = 1.0;
 539         } }
 540
 541         // Initialize all PDFs to some standard value.
 542         for (int i = 0; i < nFluid; ++i) { for(int d = 0; d < N_D3Q19; ++d) {
 543                 pdfs[0][P_INDEX_3(nCells, i, d)] = 0.0;
 544                 pdfs[1][P_INDEX_3(nCells, i, d)] = 0.0;
 545         } }
 546
 547         // ----------------------------------------------------------------------
 548         // create grid which will hold the index numbers of the fluid nodes
 549
 550         uint32_t * grid;
 551
 552         if (MemAlloc((void **)&grid, nTotalCells * sizeof(uint32_t))) {
 553                 printf("ERROR: allocating grid for numbering failed: %lu bytes.\n", nTotalCells * sizeof(uint32_t));
 554                 exit(1);
 555         }
 556         kdl->Grid = grid;
 557
 558         int latticeIndex;
 559
 560 #ifdef DEBUG
 561         for(int z = 0; z < lZ; ++z) {
 562                 for(int y = 0; y < lY; ++y) {
 563                         for(int x = 0; x < lX; ++x) {
 564
 565                                 latticeIndex = L_INDEX_4(ld->Dims, x, y, z);
 566
 567                                 grid[latticeIndex] = ~0;
 568                         }
 569                 }
 570         }
 571 #endif
 572
 573         // ----------------------------------------------------------------------
 574         // generate numbering over grid
 575
 576         uint32_t * coords;
 577
 578         if (MemAlloc((void **)&coords, nFluid * sizeof(uint32_t) * 3)) {
 579                 printf("ERROR: allocating coords array failed: %lu bytes.\n", nFluid * sizeof(uint32_t) * 3);
 580                 exit(1);
 581         }
 582
 583         kdl->Coords = coords;
 584
 585         // Index for the PDF nodes can start at 0 as we distinguish solid and fluid nodes
 586         // through the ld->Lattice array.
 587         int counter = 0;
 588
 589         // Blocking is implemented via setup of the adjacency list. The kernel later will
 590         // walk through the lattice blocked automatically.
 591         for (int bX = 0; bX < lX; bX += blk[0]) {
 592         for (int bY = 0; bY < lY; bY += blk[1]) {
 593         for (int bZ = 0; bZ < lZ; bZ += blk[2]) {
 594
 595                 int eX = MIN(bX + blk[0], lX);
 596                 int eY = MIN(bY + blk[1], lY);
 597                 int eZ = MIN(bZ + blk[2], lZ);
 598
 599
 600                 for (int x = bX; x < eX; ++x) {
 601                 for (int y = bY; y < eY; ++y) {
 602                 for (int z = bZ; z < eZ; ++z) {
 603
 604                         latticeIndex = L_INDEX_4(lDims, x, y, z);
 605
 606                         if (ld->Lattice[latticeIndex] != LAT_CELL_OBSTACLE) {
 607                                 grid[latticeIndex] = counter;
 608
 609                                 coords[C_INDEX_X(counter)] = x;
 610                                 coords[C_INDEX_Y(counter)] = y;
 611                                 coords[C_INDEX_Z(counter)] = z;
 612
 613                                 ++counter;
 614                         }
 615                 } } }
 616         } } }
 617
 618         Verify(counter == nFluid);
 619
 620         uint32_t * adjList;
 621
 622         double indexMib = nFluid * sizeof(uint32_t) * N_D3Q19_IDX / 1024.0 / 1024.0;
 623
 624         printf("# index size:            %e MiB\n", indexMib);
 625
 626
 627         // AdjList only requires 18 instead of 19 entries per node, as
 628         // the center PDF needs no addressing.
 629         if (MemAlloc((void **)&adjList, nFluid * sizeof(uint32_t) * N_D3Q19_IDX)) {
 630                 printf("ERROR: allocating adjList array failed: %lu bytes.\n", nFluid * sizeof(uint32_t) * N_D3Q19_IDX);
 631                 exit(1);
 632         }
 633
 634         kdl->AdjList = adjList;
 635
 636         int x, y, z;
 637
 638         uint32_t neighborIndex;
 639         uint32_t dstIndex;
 640
 641         int nx, ny, nz, px, py, pz;
 642
 643         // Loop over all fluid nodes and compute the indices to the neighboring
 644         // PDFs for configured data layout (AoS/SoA).
 645         // Parallelized loop to ensure correct NUMA placement.
 646         #ifdef _OPENMP
 647                 #pragma omp parallel for
 648         #endif
 649         for (int index = 0; index < nFluid; ++index) {
 650                 for (int d = 0; d < N_D3Q19_IDX; ++d) {
 651                         adjList[index * N_D3Q19_IDX + d] = -1;
 652                 }
 653         }
 654
 655         // #ifdef _OPENMP  --> add line continuation
 656         //      #pragma omp parallel for default(none)
 657         //              shared(nFluid, nCells, coords, D3Q19_INV, D3Q19_X, D3Q19_Y, D3Q19_Z,
 658         //                              stderr,
 659         //                              lDims, grid, ld, lX, lY, lZ, adjList)
 660         //              private(x, y, z, nx, ny, nz, neighborIndex, dstIndex)
 661         // #endif
 662         for (int index = 0; index < nFluid; ++index) {
 663                 x = coords[C_INDEX_X(index)];
 664                 y = coords[C_INDEX_Y(index)];
 665                 z = coords[C_INDEX_Z(index)];
 666
 667                 Assert(x >= 0 && x < lX);
 668                 Assert(y >= 0 && y < lY);
 669                 Assert(z >= 0 && z < lZ);
 670
 671                 Assert(ld->Lattice[L_INDEX_4(lDims, x, y, z)] != LAT_CELL_OBSTACLE);
 672
 673                 // Loop over all directions except the center one.
 674                 for(int d = 0; d < N_D3Q19 - 1; ++d) {
 675                         Assert(d != D3Q19_C);
 676
 677 #ifdef PROP_MODEL_PUSH
 678                         nx = x + D3Q19_X[d];
 679                         ny = y + D3Q19_Y[d];
 680                         nz = z + D3Q19_Z[d];
 681
 682 #elif PROP_MODEL_PULL
 683                         nx = x - D3Q19_X[d];
 684                         ny = y - D3Q19_Y[d];
 685                         nz = z - D3Q19_Z[d];
 686 #else
 687                         #error No implementation for this PROP_MODEL_NAME.
 688 #endif
 689                         // If the neighbor is outside the latcie in X direction and we have a
 690                         // periodic boundary then we need to wrap around.
 691                         if (    ((nx < 0 || nx >= lX) && ld->PeriodicX) ||
 692                                         ((ny < 0 || ny >= lY) && ld->PeriodicY) ||
 693                                         ((nz < 0 || nz >= lZ) && ld->PeriodicZ)
 694                                                                                                                                 ){
 695                                 // x periodic
 696
 697                                 if (nx < 0) {
 698                                         px = lX - 1;
 699                                 }
 700                                 else if (nx >= lX) {
 701                                         px = 0;
 702                                 } else {
 703                                         px = nx;
 704                                 }
 705                                 // y periodic
 706                                 if (ny < 0) {
 707                                         py = lY - 1;
 708                                 }
 709                                 else if (ny >= lY) {
 710                                         py = 0;
 711                                 } else {
 712                                         py = ny;
 713                                 }
 714
 715                                 // z periodic
 716                                 if (nz < 0) {
 717                                         pz = lZ - 1;
 718                                 }
 719                                 else if (nz >= lZ) {
 720                                         pz = 0;
 721                                 } else {
 722                                         pz = nz;
 723                                 }
 724
 725                                 if (ld->Lattice[L_INDEX_4(lDims, px, py, pz)] == LAT_CELL_OBSTACLE) {
 726                                         dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
 727                                 }
 728                                 else {
 729                                         neighborIndex = grid[L_INDEX_4(lDims, px, py, pz)];
 730
 731                                         AssertMsg(neighborIndex != ~0, "Neighbor has no Index. (%d %d %d) direction %s (%d)\n", px, py, pz, D3Q19_NAMES[d], d);
 732
 733                                         dstIndex = P_INDEX_3(nCells, neighborIndex, d);
 734                                 }
 735                         }
 736                         else if (nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) {
 737                                 dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
 738                         }
 739                         else if (ld->Lattice[L_INDEX_4(lDims, nx, ny, nz)] == LAT_CELL_OBSTACLE) {
 740                                 dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
 741                         }
 742                         else {
 743                                 neighborIndex = grid[L_INDEX_4(lDims, nx, ny, nz)];
 744
 745                                 Assert(neighborIndex != ~0);
 746
 747                                 dstIndex = P_INDEX_3(nCells, neighborIndex, d);
 748                         }
 749
 750                         Assert(dstIndex >= 0);
 751                         Assert(dstIndex < nCells * N_D3Q19);
 752
 753                         adjList[index * N_D3Q19_IDX + d] = dstIndex;
 754                 }
 755         }
 756
 757         int nThreads = 1;
 758
 759 #ifdef _OPENMP
 760         nThreads = omp_get_max_threads();
 761 #endif
 762
 763         SetupConsecNodes(ld, KDLR(kd), nThreads);
 764
 765
 766         // Fill remaining KernelData structures
 767         kd->GetNode = GetNode;
 768         kd->SetNode = SetNode;
 769
 770         kd->BoundaryConditionsGetPdf = FNAME(BCGetPdf);
 771         kd->BoundaryConditionsSetPdf = FNAME(BCSetPdf);
 772
 773         kd->Kernel = NULL; // FNAME(KernelPullSplitNt2S);
 774
 775         kd->DstPdfs = NULL;
 776         kd->PdfsActive = kd->Pdfs[0];
 777
 778         return;
 779 }
 780
 781 void FNAME(D3Q19ListPullSplitNt1SInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
 782 {
 783         FNAME(Init)(ld, kernelData, params);
 784         (*kernelData)->Kernel = FNAME(KernelPullSplitNt1S);
 785
 786         double loopBalance  = 2.0 * 19 * sizeof(PdfT) + (18 * 4.0);
 787         printf("# loop balance:          %.2f B/FLUP\n", loopBalance);
 788 }
 789
 790 void FNAME(D3Q19ListPullSplitNt2SInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
 791 {
 792         FNAME(Init)(ld, kernelData, params);
 793         (*kernelData)->Kernel = FNAME(KernelPullSplitNt2S);
 794
 795         double loopBalance  = 2.0 * 19 * sizeof(PdfT) + (18 * 4.0);
 796         printf("# loop balance:          %.2f B/FLUP\n", loopBalance);
 797 }
 798
 799
 800 void FNAME(D3Q19ListPullSplitNtDeinit)(LatticeDesc * ld, KernelData ** kernelData)
 801 {
 802         KernelDataListRia ** kdlr = (KernelDataListRia **)kernelData;
 803
 804         MemFree((void **)&((*kdlr)->ConsecNodes));
 805
 806         if ((*kdlr)->ConsecThreadIndices != NULL) {
 807                 MemFree((void **)&((*kdlr)->ConsecThreadIndices));
 808         }
 809
 810         KernelDataList ** kdl = (KernelDataList **)kernelData;
 811
 812         MemFree((void **)&((*kdl)->AdjList));
 813         MemFree((void **)&((*kdl)->Coords));
 814         MemFree((void **)&((*kdl)->Grid));
 815
 816         MemFree((void **)&((*kernelData)->Pdfs[0]));
 817         MemFree((void **)&((*kernelData)->Pdfs[1]));
 818
 819         MemFree((void **)kernelData);
 820         return;
 821 }
 822