src/BenchKernelD3Q19ListAaRiaCommon.c

   1 // --------------------------------------------------------------------------
   2 //
   3 // Copyright
   4 //   Markus Wittmann, 2016-2017
   5 //   RRZE, University of Erlangen-Nuremberg, Germany
   6 //   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
   7 //
   8 //   Viktor Haag, 2016
   9 //   LSS, University of Erlangen-Nuremberg, Germany
  10 //
  11 //  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
  12 //
  13 //  LbmBenchKernels is free software: you can redistribute it and/or modify
  14 //  it under the terms of the GNU General Public License as published by
  15 //  the Free Software Foundation, either version 3 of the License, or
  16 //  (at your option) any later version.
  17 //
  18 //  LbmBenchKernels is distributed in the hope that it will be useful,
  19 //  but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 //  GNU General Public License for more details.
  22 //
  23 //  You should have received a copy of the GNU General Public License
  24 //  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
  25 //
  26 // --------------------------------------------------------------------------
  27 #include "BenchKernelD3Q19ListAaRiaCommon.h"
  28
  29 #include "Memory.h"
  30 #include "Vtk.h"
  31
  32 #include <math.h>
  33
  34 #ifdef _OPENMP
  35         #include <omp.h>
  36 #endif
  37
  38 // Forward definition.
  39 void FNAME(D3Q19ListAaRiaKernel)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd);
  40
  41
  42
  43
  44 // -----------------------------------------------------------------------
  45 // Functions which are used as callback by the kernel to read or write
  46 // PDFs and nodes.
  47
  48 static void FNAME(BCGetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT * pdf)
  49 {
  50         Assert(kd != NULL);
  51         Assert(kd->PdfsActive != NULL);
  52         Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
  53         Assert(pdf != NULL);
  54
  55         Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
  56         Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
  57         Assert(dir >= 0); Assert(dir < N_D3Q19);
  58
  59         KernelDataList * kdl = (KernelDataList *)kd;
  60
  61         if (kdl->Iteration % 2 == 0) {
  62                 // Pdfs are stored inverse, local PDFs are located in remote nodes
  63
  64                 uint32_t nodeIndex = KDL(kd)->Grid[L_INDEX_4(kd->Dims, x, y, z)];
  65
  66                 if (dir != D3Q19_C) {
  67                         uint32_t adjListIndex = nodeIndex * N_D3Q19_IDX;
  68
  69                         *pdf = kd->PdfsActive[KDL(kd)->AdjList[adjListIndex + D3Q19_INV[dir]]];
  70                 }
  71                 else {
  72                         *pdf = kd->PdfsActive[P_INDEX_3(KDL(kd)->nCells, nodeIndex, dir)];
  73                 }
  74
  75         }
  76         else {
  77                 *pdf = kd->PdfsActive[P_INDEX_5(KDL(kd), x, y, z, dir)];
  78         }
  79
  80
  81         return;
  82 }
  83
  84 static void FNAME(BCSetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT pdf)
  85 {
  86         Assert(kd != NULL);
  87         Assert(kd->PdfsActive != NULL);
  88         Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
  89         Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
  90         Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
  91         Assert(dir >= 0); Assert(dir < N_D3Q19);
  92
  93 #if 0
  94         if (isnan(pdf)) {
  95                 printf("ERROR: setting nan %d %d %d %d %s\n", x, y, z, dir, D3Q19_NAMES[dir]);
  96                 DEBUG_BREAK_POINT();
  97                 exit(1);
  98         }
  99 #endif
 100
 101         KernelDataList * kdl = (KernelDataList *)kd;
 102
 103         if (kdl->Iteration % 2 == 0) {
 104                 // Pdfs are stored inverse, local PDFs are located in remote nodes
 105
 106                 uint32_t nodeIndex = KDL(kd)->Grid[L_INDEX_4(kd->Dims, x, y, z)];
 107
 108                 if (dir != D3Q19_C) {
 109                         uint32_t adjListIndex = nodeIndex * N_D3Q19_IDX;
 110
 111                         kd->PdfsActive[KDL(kd)->AdjList[adjListIndex + D3Q19_INV[dir]]] = pdf;
 112                 }
 113                 else {
 114                         kd->PdfsActive[P_INDEX_3(KDL(kd)->nCells, nodeIndex, dir)] = pdf;
 115                 }
 116
 117         }
 118         else {
 119                 kd->PdfsActive[P_INDEX_5(KDL(kd), x, y, z, dir)] = pdf;
 120         }
 121
 122         return;
 123 }
 124
 125
 126 static void GetNode(KernelData * kd, int x, int y, int z, PdfT * pdfs)
 127 {
 128         Assert(kd != NULL);
 129         Assert(kd->PdfsActive != NULL);
 130         Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
 131         Assert(pdfs != NULL);
 132         Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
 133         Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
 134
 135         KernelDataList * kdl = (KernelDataList *)kd;
 136
 137         if(kdl->Iteration % 2 == 0){
 138
 139                 uint32_t nodeIndex = kdl->Grid[L_INDEX_4(kdl->kd.Dims, x, y, z)];
 140                 uint32_t adjListIndex = nodeIndex * N_D3Q19_IDX;
 141
 142                 // Load PDFs of local cell: pdf_N = src[adjList[adjListIndex + D3Q19_S]]; ...
 143                 pdfs[D3Q19_C] = kd->PdfsActive[P_INDEX_3(kdl->nCells, nodeIndex, D3Q19_C)];
 144
 145                 #define X(name, idx, idxinv, _x, _y, _z)        pdfs[idx] = kd->PdfsActive[kdl->AdjList[adjListIndex + idxinv]];
 146                 D3Q19_LIST_WO_C
 147                 #undef X
 148
 149         } else {
 150
 151                 #define I(x, y, z, dir) P_INDEX_5(KDL(kd), (x), (y), (z), (dir))
 152                 #define X(name, idx, idxinv, _x, _y, _z)        pdfs[idx] = kd->PdfsActive[I(x, y, z, idx)];
 153                 D3Q19_LIST
 154                 #undef X
 155                 #undef I
 156
 157         }
 158
 159 #if 0
 160         for (int d = 0; d < 19; ++d) {
 161                 if(isnan(pdfs[d]) || isinf(pdfs[d])) {
 162                         printf("%d %d %d %d nan! get node\n", x, y, z, d);
 163                                                 for (int d2 = 0; d2 < 19; ++d2) {
 164                                                         printf("%d: %e\n", d2, pdfs[d2]);
 165                                                 }
 166                         exit(1);
 167                 }
 168         }
 169 #endif
 170
 171         return;
 172 }
 173
 174
 175 static void SetNode(KernelData * kd, int x, int y, int z, PdfT * pdfs)
 176 {
 177         Assert(kd != NULL);
 178         Assert(kd->PdfsActive != NULL);
 179         Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
 180         Assert(pdfs != NULL);
 181
 182         Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
 183         Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
 184
 185 #if 0
 186         for (int d = 0; d < 19; ++d) {
 187                 if(isnan(pdfs[d])) {
 188                         printf("%d %d %d %d nan! get node\n", x, y, z, d);
 189                                                 for (int d2 = 0; d2 < 19; ++d2) {
 190                                                         printf("%d: %e\n", d2, pdfs[d2]);
 191                                                 }
 192                         exit(1);
 193                 }
 194         }
 195 #endif
 196
 197         KernelDataList * kdl = (KernelDataList *)kd;
 198
 199         if(kdl->Iteration % 2 == 0){
 200
 201                 uint32_t nodeIndex = kdl->Grid[L_INDEX_4(kdl->kd.Dims, x, y, z)];
 202                 uint32_t adjListIndex = nodeIndex * N_D3Q19_IDX;
 203
 204                 // Load PDFs of local cell: pdf_N = src[adjList[adjListIndex + D3Q19_S]]; ...
 205                 kd->PdfsActive[P_INDEX_3(kdl->nCells, nodeIndex, D3Q19_C)] = pdfs[D3Q19_C];
 206
 207                 #define X(name, idx, idxinv, _x, _y, _z)        kd->PdfsActive[kdl->AdjList[adjListIndex + idxinv]] = pdfs[idx];
 208                 D3Q19_LIST_WO_C
 209                 #undef X
 210
 211         } else {
 212
 213                 #define I(x, y, z, dir) P_INDEX_5(KDL(kd), (x), (y), (z), (dir))
 214                 #define X(name, idx, idxinv, _x, _y, _z)        kd->PdfsActive[I(x, y, z, idx)] = pdfs[idx];
 215                 D3Q19_LIST
 216                 #undef X
 217                 #undef I
 218
 219         }
 220
 221         return;
 222 }
 223
 224 static void ParameterUsage()
 225 {
 226         printf("Kernel parameters:\n");
 227         printf("  [-blk <n>] [-blk-[xyz] <n>]\n");
 228
 229         return;
 230 }
 231
 232 static void ParseParameters(Parameters * params, int * blk)
 233 {
 234         Assert(blk != NULL);
 235
 236         blk[0] = 0; blk[1] = 0; blk[2] = 0;
 237
 238         #define ARG_IS(param)                   (!strcmp(params->KernelArgs[i], param))
 239         #define NEXT_ARG_PRESENT() \
 240                 do { \
 241                         if (i + 1 >= params->nKernelArgs) { \
 242                                 printf("ERROR: argument %s requires a parameter.\n", params->KernelArgs[i]); \
 243                                 exit(1); \
 244                         } \
 245                 } while (0)
 246
 247
 248         for (int i = 0; i < params->nKernelArgs; ++i) {
 249                 if (ARG_IS("-blk") || ARG_IS("--blk")) {
 250                         NEXT_ARG_PRESENT();
 251
 252                         int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 253
 254                         if (tmp <= 0) {
 255                                 printf("ERROR: blocking parameter must be > 0.\n");
 256                                 exit(1);
 257                         }
 258
 259                         blk[0] = blk[1] = blk[2] = tmp;
 260                 }
 261                 else if (ARG_IS("-blk-x") || ARG_IS("--blk-x")) {
 262                         NEXT_ARG_PRESENT();
 263
 264                         int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 265
 266                         if (tmp <= 0) {
 267                                 printf("ERROR: blocking parameter must be > 0.\n");
 268                                 exit(1);
 269                         }
 270
 271                         blk[0] = tmp;
 272                 }
 273                 else if (ARG_IS("-blk-y") || ARG_IS("--blk-y")) {
 274                         NEXT_ARG_PRESENT();
 275
 276                         int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 277
 278                         if (tmp <= 0) {
 279                                 printf("ERROR: blocking parameter must be > 0.\n");
 280                                 exit(1);
 281                         }
 282
 283                         blk[1] = tmp;
 284                 }
 285                 else if (ARG_IS("-blk-z") || ARG_IS("--blk-z")) {
 286                         NEXT_ARG_PRESENT();
 287
 288                         int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 289
 290                         if (tmp <= 0) {
 291                                 printf("ERROR: blocking parameter must be > 0.\n");
 292                                 exit(1);
 293                         }
 294
 295                         blk[2] = tmp;
 296                 }
 297                 else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) {
 298                         ParameterUsage();
 299                         exit(1);
 300                 }
 301                 else {
 302                         printf("ERROR: unknown kernel parameter.\n");
 303                         ParameterUsage();
 304                         exit(1);
 305                 }
 306         }
 307
 308         #undef ARG_IS
 309         #undef NEXT_ARG_PRESENT
 310
 311         return;
 312 }
 313
 314 static void SetupConsecNodes(LatticeDesc * ld, KernelDataListRia * kdlr, int nThreads)
 315 {
 316         Assert(ld != NULL);
 317         Assert(kdlr != NULL);
 318         Assert(nThreads > 0);
 319
 320         uint32_t * adjList = kdlr->kdl.AdjList;
 321
 322         uint32_t nConsecNodes = 0;
 323         uint32_t consecIndex = 0;
 324
 325         int nFluid = kdlr->kdl.nFluid;
 326
 327         uint32_t * consecThreadIndices = (uint32_t *)malloc(sizeof(uint32_t) * (nThreads + 1));
 328         int * fluidNodeThreadIndices = (int *)malloc(sizeof(int) * (nThreads + 1));
 329
 330         int nNodesPerThread = nFluid / nThreads;
 331
 332         for (int i = 0; i < nThreads; ++i) {
 333                 consecThreadIndices[i]     = i * nNodesPerThread + MinI(i, nFluid % nThreads);
 334                 fluidNodeThreadIndices[i] = consecThreadIndices[i];
 335         }
 336         consecThreadIndices[nThreads]     = -1;
 337         fluidNodeThreadIndices[nThreads] = nFluid;
 338
 339         int indexThread = 1;
 340
 341         // We execute following code two times.
 342         // - The first time to get the count of how many entries we need for the
 343         //   consecNodes array.
 344         // - The second time to fill the array.
 345
 346         // Loop over adjacency list of all nodes.
 347     // Compare if adjacent nodes share the same access pattern.
 348         for (int index = 1; index < nFluid; ++index) {
 349
 350                 int different = 0;
 351
 352                 // Loop over all directions except the center one.
 353                 for(int d = 0; d < N_D3Q19 - 1; ++d) {
 354                         Assert(d != D3Q19_C);
 355
 356                         if (adjList[index * N_D3Q19_IDX + d] != adjList[(index - 1) * N_D3Q19_IDX + d] + 1) {
 357                                 // Different access pattern.
 358                                 different = 1;
 359                                 break;
 360                         }
 361                 }
 362
 363                 if (consecThreadIndices[indexThread] == index) {
 364                         // We are at a thread boundary. Starting from this index the fluids
 365                         // belong to another thread. Force a break, if nodes are consecutive.
 366                         ++indexThread;
 367                         different = 1;
 368                 }
 369
 370                 if (different) {
 371                         ++consecIndex;
 372                 }
 373         }
 374
 375         if (nFluid > 0) {
 376                 nConsecNodes = consecIndex + 1;
 377         }
 378
 379         uint32_t * consecNodes;
 380         MemAlloc((void **)&consecNodes, sizeof(uint32_t) * nConsecNodes);
 381
 382         consecIndex = 0;
 383
 384         if (nFluid > 0) {
 385                 consecNodes[consecIndex] = 1;
 386         }
 387
 388         indexThread = 1;
 389         consecThreadIndices[0] = 0;
 390
 391         // Loop over adjacency list of all nodes.
 392     // Compare if adjacent nodes share the same access pattern.
 393         for (int index = 1; index < nFluid; ++index) {
 394
 395                 int different = 0;
 396
 397                 // Loop over all directions except the center one.
 398                 for(int d = 0; d < N_D3Q19 - 1; ++d) {
 399                         Assert(d != D3Q19_C);
 400
 401                         if (adjList[index * N_D3Q19_IDX + d] != adjList[(index - 1) * N_D3Q19_IDX + d] + 1) {
 402                                 // Different access pattern.
 403                                 different = 1;
 404                                 break;
 405                         }
 406                 }
 407
 408                 if (consecThreadIndices[indexThread] == index) {
 409                         // We are at a thread boundary. Starting from this index the fluids
 410                         // belong to another thread. Force a break, if nodes are consecutive.
 411                         consecThreadIndices[indexThread] = consecIndex + 1;
 412                         ++indexThread;
 413                         different = 1;
 414                 }
 415
 416                 if (different) {
 417                         ++consecIndex;
 418                         Assert(consecIndex < nConsecNodes);
 419                         consecNodes[consecIndex] = 1;
 420                 }
 421                 else {
 422                         Assert(consecIndex < nConsecNodes);
 423                         consecNodes[consecIndex] += 1;
 424                 }
 425         }
 426
 427
 428         kdlr->ConsecNodes = consecNodes;
 429         kdlr->nConsecNodes = nConsecNodes;
 430
 431         kdlr->ConsecThreadIndices  = consecThreadIndices;
 432         kdlr->nConsecThreadIndices = nThreads;
 433
 434         kdlr->FluidNodeThreadIndices = fluidNodeThreadIndices;
 435         kdlr->nFluidNodeThreadIndices = nThreads;
 436
 437         printf("# total fluid nodes: %d   consecutive blocks: %d\n", nFluid, nConsecNodes);
 438
 439         return;
 440 }
 441
 442 void FNAME(D3Q19ListAaRiaInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
 443 {
 444         KernelData * kd;
 445         KernelDataList * kdl;
 446         KernelDataListRia * kdlr;
 447         MemAlloc((void **)&kdlr, sizeof(KernelDataListRia));
 448
 449         kd = (KernelData *)kdlr;
 450         kdl = KDL(kdlr);
 451
 452         *kernelData = kd;
 453
 454 #ifdef DEBUG
 455         kd->Pdfs[0] = NULL;
 456         kd->Pdfs[1] = NULL;
 457         kd->PdfsActive = NULL;
 458         kd->DstPdfs = NULL;
 459         kd->SrcPdfs = NULL;
 460         kd->Dims[0] = -1;
 461         kd->Dims[1] = -1;
 462         kd->Dims[2] = -1;
 463         kd->GlobalDims[0] = -1;
 464         kd->GlobalDims[1] = -1;
 465         kd->GlobalDims[2] = -1;
 466         kd->Offsets[0] = -1;
 467         kd->Offsets[1] = -1;
 468         kd->Offsets[2] = -1;
 469
 470         kd->ObstIndices = NULL;
 471         kd->nObstIndices = -1;
 472         kd->BounceBackPdfsSrc = NULL;
 473         kd->BounceBackPdfsDst = NULL;
 474         kd->nBounceBackPdfs = -1;
 475
 476         kdl->AdjList = NULL;
 477         kdl->Coords = NULL;
 478         kdl->Grid = NULL;
 479         kdl->nCells = -1;
 480         kdl->nFluid = -1;
 481
 482         kdlr->ConsecNodes = NULL;
 483         kdlr->nConsecNodes = 0;
 484         kdlr->ConsecThreadIndices = NULL;
 485         kdlr->nConsecThreadIndices = 0;
 486 #endif
 487
 488         // Ajust the dimensions according to padding, if used.
 489         kd->Dims[0] = kd->GlobalDims[0] = ld->Dims[0];
 490         kd->Dims[1] = kd->GlobalDims[1] = ld->Dims[1];
 491         kd->Dims[2] = kd->GlobalDims[2] = ld->Dims[2];
 492
 493         int * lDims = ld->Dims;
 494
 495         int lX = lDims[0];
 496         int lY = lDims[1];
 497         int lZ = lDims[2];
 498
 499         int nTotalCells = lX * lY * lZ;
 500         int nCells = ld->nFluid; // TODO: + padding
 501         int nFluid = ld->nFluid;
 502
 503         kdl->nCells = nCells;
 504         kdl->nFluid = nFluid;
 505
 506         PdfT * pdfs[2];
 507
 508         int blk[3] = { 0 };
 509
 510         ParseParameters(params, blk);
 511
 512         if (blk[0] == 0) blk[0] = lX;
 513         if (blk[1] == 0) blk[1] = lY;
 514         if (blk[2] == 0) blk[2] = lZ;
 515
 516         printf("# blocking               x: %3d y: %3d z: %3d\n", blk[0], blk[1], blk[2]);
 517
 518         double latMiB      = nCells * sizeof(PdfT) * N_D3Q19 / 1024.0 / 1024.0;
 519         double latFluidMib = nFluid * sizeof(PdfT) * N_D3Q19 / 1024.0 / 1024.0;
 520         double latPadMib   = (nCells - nFluid) * sizeof(PdfT) * N_D3Q19 / 1024.0 / 1024.0;
 521
 522         printf("# lattice size:          %e MiB\n", latMiB);
 523         printf("# fluid lattice size:    %e MiB\n", latFluidMib);
 524         printf("# lattice padding:       %e MiB\n", latPadMib);
 525
 526 #define PAGE_4K         4096
 527
 528         printf("# aligning lattices to:  %d b\n", PAGE_4K);
 529
 530         MemAllocAligned((void **)&pdfs[0], sizeof(PdfT) * nCells * N_D3Q19, PAGE_4K);
 531
 532         kd->Pdfs[0] = pdfs[0];
 533
 534         // Initialize PDFs with some (arbitrary) data for correct NUMA placement.
 535         // Here we touch only the fluid nodes as this loop is OpenMP parallel and
 536         // we want the same scheduling as in the kernel.
 537         #ifdef _OPENMP
 538                 #pragma omp parallel for
 539         #endif
 540         for (int i = 0; i < nFluid; ++i) { for(int d = 0; d < N_D3Q19; ++d) {
 541                 pdfs[0][P_INDEX_3(nCells, i, d)] = 1.0;
 542         } }
 543
 544         // Initialize all PDFs to some standard value.
 545         for (int i = 0; i < nFluid; ++i) { for(int d = 0; d < N_D3Q19; ++d) {
 546                 pdfs[0][P_INDEX_3(nCells, i, d)] = 0.0;
 547         } }
 548
 549         // ----------------------------------------------------------------------
 550         // create grid which will hold the index numbers of the fluid nodes
 551
 552         uint32_t * grid;
 553
 554         if (MemAlloc((void **)&grid, nTotalCells * sizeof(uint32_t))) {
 555                 printf("ERROR: allocating grid for numbering failed: %lu bytes.\n", nTotalCells * sizeof(uint32_t));
 556                 exit(1);
 557         }
 558         kdl->Grid = grid;
 559
 560         int latticeIndex;
 561
 562 #ifdef DEBUG
 563         for(int z = 0; z < lZ; ++z) {
 564                 for(int y = 0; y < lY; ++y) {
 565                         for(int x = 0; x < lX; ++x) {
 566
 567                                 latticeIndex = L_INDEX_4(ld->Dims, x, y, z);
 568
 569                                 grid[latticeIndex] = ~0;
 570                         }
 571                 }
 572         }
 573 #endif
 574
 575         // ----------------------------------------------------------------------
 576         // generate numbering over grid
 577
 578         uint32_t * coords;
 579
 580         if (MemAlloc((void **)&coords, nFluid * sizeof(uint32_t) * 3)) {
 581                 printf("ERROR: allocating coords array failed: %lu bytes.\n", nFluid * sizeof(uint32_t) * 3);
 582                 exit(1);
 583         }
 584
 585         kdl->Coords = coords;
 586
 587         // Index for the PDF nodes can start at 0 as we distinguish solid and fluid nodes
 588         // through the ld->Lattice array.
 589         int counter = 0;
 590
 591         // Blocking is implemented via setup of the adjacency list. The kernel later will
 592         // walk through the lattice blocked automatically.
 593         for (int bZ = 0; bZ < lZ; bZ += blk[2]) {
 594         for (int bY = 0; bY < lY; bY += blk[1]) {
 595         for (int bX = 0; bX < lX; bX += blk[0]) {
 596
 597                 int eX = MIN(bX + blk[0], lX);
 598                 int eY = MIN(bY + blk[1], lY);
 599                 int eZ = MIN(bZ + blk[2], lZ);
 600
 601
 602                 for (int z = bZ; z < eZ; ++z) {
 603                 for (int y = bY; y < eY; ++y) {
 604                 for (int x = bX; x < eX; ++x) {
 605
 606                         latticeIndex = L_INDEX_4(lDims, x, y, z);
 607
 608                         if (ld->Lattice[latticeIndex] != LAT_CELL_OBSTACLE) {
 609                                 grid[latticeIndex] = counter;
 610
 611                                 coords[C_INDEX_X(counter)] = x;
 612                                 coords[C_INDEX_Y(counter)] = y;
 613                                 coords[C_INDEX_Z(counter)] = z;
 614
 615                                 ++counter;
 616                         }
 617                 } } }
 618         } } }
 619
 620         Verify(counter == nFluid);
 621
 622         uint32_t * adjList;
 623
 624         double indexMib = nFluid * sizeof(uint32_t) * N_D3Q19_IDX / 1024.0 / 1024.0;
 625
 626         printf("# index size:            %e MiB\n", indexMib);
 627
 628         // AdjList only requires 18 instead of 19 entries per node, as
 629         // the center PDF needs no addressing.
 630         if (MemAlloc((void **)&adjList, nFluid * sizeof(uint32_t) * N_D3Q19_IDX)) {
 631                 printf("ERROR: allocating adjList array failed: %lu bytes.\n", nFluid * sizeof(uint32_t) * N_D3Q19_IDX);
 632                 exit(1);
 633         }
 634
 635         kdl->AdjList = adjList;
 636
 637         int x, y, z;
 638
 639         uint32_t neighborIndex;
 640         uint32_t dstIndex;
 641
 642         int nx, ny, nz, px, py, pz;
 643
 644         // Loop over all fluid nodes and compute the indices to the neighboring
 645         // PDFs for configured data layout (AoS/SoA).
 646         // TODO: Parallelized loop to ensure correct NUMA placement.
 647         // #ifdef _OPENMP  --> add line continuation
 648         //      #pragma omp parallel for default(none)
 649         //              shared(nFluid, nCells, coords, D3Q19_INV, D3Q19_X, D3Q19_Y, D3Q19_Z,
 650         //                              stderr,
 651         //                              lDims, grid, ld, lX, lY, lZ, adjList)
 652         //              private(x, y, z, nx, ny, nz, neighborIndex, dstIndex)
 653         // #endif
 654         for (int index = 0; index < nFluid; ++index) {
 655                 x = coords[C_INDEX_X(index)];
 656                 y = coords[C_INDEX_Y(index)];
 657                 z = coords[C_INDEX_Z(index)];
 658
 659                 Assert(x >= 0 && x < lX);
 660                 Assert(y >= 0 && y < lY);
 661                 Assert(z >= 0 && z < lZ);
 662
 663                 Assert(ld->Lattice[L_INDEX_4(lDims, x, y, z)] != LAT_CELL_OBSTACLE);
 664
 665                 // Loop over all directions except the center one.
 666                 for(int d = 0; d < N_D3Q19 - 1; ++d) {
 667                         Assert(d != D3Q19_C);
 668
 669 #ifdef PROP_MODEL_PUSH
 670                         nx = x + D3Q19_X[d];
 671                         ny = y + D3Q19_Y[d];
 672                         nz = z + D3Q19_Z[d];
 673
 674 #elif PROP_MODEL_PULL
 675                         nx = x - D3Q19_X[d];
 676                         ny = y - D3Q19_Y[d];
 677                         nz = z - D3Q19_Z[d];
 678 #else
 679                         #error No implementation for this PROP_MODEL_NAME.
 680 #endif
 681                         // If the neighbor is outside the latcie in X direction and we have a
 682                         // periodic boundary then we need to wrap around.
 683                         if (    ((nx < 0 || nx >= lX) && ld->PeriodicX) ||
 684                                         ((ny < 0 || ny >= lY) && ld->PeriodicY) ||
 685                                         ((nz < 0 || nz >= lZ) && ld->PeriodicZ)
 686                                                                                                                                 ){
 687                                 // x periodic
 688
 689                                 if (nx < 0) {
 690                                         px = lX - 1;
 691                                 }
 692                                 else if (nx >= lX) {
 693                                         px = 0;
 694                                 } else {
 695                                         px = nx;
 696                                 }
 697                                 // y periodic
 698                                 if (ny < 0) {
 699                                         py = lY - 1;
 700                                 }
 701                                 else if (ny >= lY) {
 702                                         py = 0;
 703                                 } else {
 704                                         py = ny;
 705                                 }
 706
 707                                 // z periodic
 708                                 if (nz < 0) {
 709                                         pz = lZ - 1;
 710                                 }
 711                                 else if (nz >= lZ) {
 712                                         pz = 0;
 713                                 } else {
 714                                         pz = nz;
 715                                 }
 716
 717                                 if (ld->Lattice[L_INDEX_4(lDims, px, py, pz)] == LAT_CELL_OBSTACLE) {
 718                                         dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
 719                                 }
 720                                 else {
 721                                         neighborIndex = grid[L_INDEX_4(lDims, px, py, pz)];
 722
 723                                         AssertMsg(neighborIndex != ~0, "Neighbor has no Index. (%d %d %d) direction %s (%d)\n", px, py, pz, D3Q19_NAMES[d], d);
 724
 725                                         dstIndex = P_INDEX_3(nCells, neighborIndex, d);
 726                                 }
 727                         }
 728                         else if (nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) {
 729                                 dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
 730                         }
 731                         else if (ld->Lattice[L_INDEX_4(lDims, nx, ny, nz)] == LAT_CELL_OBSTACLE) {
 732                                 dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
 733                         }
 734                         else {
 735                                 neighborIndex = grid[L_INDEX_4(lDims, nx, ny, nz)];
 736
 737                                 Assert(neighborIndex != ~0);
 738
 739                                 dstIndex = P_INDEX_3(nCells, neighborIndex, d);
 740                         }
 741
 742                         Assert(dstIndex >= 0);
 743                         Assert(dstIndex < nCells * N_D3Q19);
 744
 745                         adjList[index * N_D3Q19_IDX + d] = dstIndex;
 746                 }
 747         }
 748
 749         int nThreads = 1;
 750
 751 #ifdef _OPENMP
 752         nThreads = omp_get_max_threads();
 753 #endif
 754
 755         SetupConsecNodes(ld, KDLR(kd), nThreads);
 756
 757         double loopBalanceEven = 2.0 * 19 * sizeof(PdfT);
 758         double loopBalanceOdd  = 2.0 * 19 * sizeof(PdfT) + (double)kdlr->nConsecNodes / nFluid * (18 * 4.0 + 4.0);
 759         double loopBalance     = (loopBalanceEven + loopBalanceOdd) / 2.0;
 760
 761         printf("# loop balance:          %.2f B/FLUP  even: %.2f B/FLUP  odd %.2f B/FLUP\n",
 762                         loopBalance, loopBalanceEven, loopBalanceOdd);
 763
 764         // Fill remaining KernelData structures
 765         kd->GetNode = GetNode;
 766         kd->SetNode = SetNode;
 767
 768         kd->BoundaryConditionsGetPdf = FNAME(BCGetPdf);
 769         kd->BoundaryConditionsSetPdf = FNAME(BCSetPdf);
 770
 771         kd->Kernel = FNAME(D3Q19ListAaRiaKernel);
 772
 773         kd->DstPdfs = NULL;
 774         kd->PdfsActive = kd->Pdfs[0];
 775
 776         return;
 777 }
 778
 779 void FNAME(D3Q19ListAaRiaDeinit)(LatticeDesc * ld, KernelData ** kernelData)
 780 {
 781         KernelDataListRia ** kdlr = (KernelDataListRia **)kernelData;
 782
 783         MemFree((void **)&((*kdlr)->ConsecNodes));
 784
 785         if ((*kdlr)->ConsecThreadIndices != NULL) {
 786                 MemFree((void **)&((*kdlr)->ConsecThreadIndices));
 787         }
 788
 789         if ((*kdlr)->FluidNodeThreadIndices != NULL) {
 790                 MemFree((void **)&((*kdlr)->FluidNodeThreadIndices));
 791         }
 792
 793         KernelDataList ** kdl = (KernelDataList **)kernelData;
 794
 795         MemFree((void **)&((*kdl)->AdjList));
 796         MemFree((void **)&((*kdl)->Coords));
 797         MemFree((void **)&((*kdl)->Grid));
 798
 799         MemFree((void **)&((*kernelData)->Pdfs[0]));
 800
 801         MemFree((void **)kernelData);
 802         return;
 803 }
 804