src/BenchKernelD3Q19ListAaPvGatherHybridCommon.c

   1 // --------------------------------------------------------------------------
   2 //
   3 // Copyright
   4 //   Markus Wittmann, 2016-2017
   5 //   RRZE, University of Erlangen-Nuremberg, Germany
   6 //   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
   7 //
   8 //   Viktor Haag, 2016
   9 //   LSS, University of Erlangen-Nuremberg, Germany
  10 //
  11 //   Michael Hussnaetter, 2017-2018
  12 //   University of Erlangen-Nuremberg, Germany
  13 //   michael.hussnaetter -at- fau.de
  14 //
  15 //  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
  16 //
  17 //  LbmBenchKernels is free software: you can redistribute it and/or modify
  18 //  it under the terms of the GNU General Public License as published by
  19 //  the Free Software Foundation, either version 3 of the License, or
  20 //  (at your option) any later version.
  21 //
  22 //  LbmBenchKernels is distributed in the hope that it will be useful,
  23 //  but WITHOUT ANY WARRANTY; without even the implied warranty of
  24 //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  25 //  GNU General Public License for more details.
  26 //
  27 //  You should have received a copy of the GNU General Public License
  28 //  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
  29 //
  30 // --------------------------------------------------------------------------
  31 #include "BenchKernelD3Q19ListAaPvGatherHybridCommon.h"
  32
  33 #include "Memory.h"
  34 #include "Vtk.h"
  35 #include "Vector.h"
  36
  37 #include <math.h>
  38
  39 #ifdef _OPENMP
  40 #include <omp.h>
  41 #endif
  42
  43 #define PAGE_4K         4096
  44
  45 #if ALLOC_ADJ_LIST_IN_HBM == 1
  46 #define ADJ_LIST_ALLOCATOR HbwAllocAligned
  47 #define ADJ_LIST_FREE HbwFree
  48 #else
  49 #define ADJ_LIST_ALLOCATOR MemAllocAligned
  50 #define ADJ_LIST_FREE MemFree
  51 #endif
  52
  53 #if ALLOC_PDF_IN_HBM == 1
  54 #define PDF_ALLOCATOR HbwAllocAligned
  55 #define PDF_FREE HbwFree
  56 #else
  57 #define PDF_ALLOCATOR MemAllocAligned
  58 #define PDF_FREE MemFree
  59 #endif
  60
  61 // Forward definition.
  62 void FNAME(D3Q19ListAaPvGatherHybridKernel)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd);
  63
  64
  65
  66 // -----------------------------------------------------------------------
  67 // Functions which are used as callback by the kernel to read or write
  68 // PDFs and nodes.
  69
  70 static void FNAME(BCGetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT * pdf)
  71 {
  72         Assert(kd != NULL);
  73         Assert(kd->PdfsActive != NULL);
  74         Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
  75         Assert(pdf != NULL);
  76
  77         Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
  78         Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
  79         Assert(dir >= 0); Assert(dir < N_D3Q19);
  80
  81         KernelDataList * kdl = KDL(kd);
  82         uint32_t * adjList = kdl->AdjList;
  83
  84         if (kdl->Iteration % 2 == 0) {
  85                 // Pdfs are stored inverse, local PDFs are located in remote nodes
  86
  87                 // getting node index
  88                 uint32_t index = kdl->Grid[L_INDEX_4(kd->Dims, x, y, z)];
  89
  90                 if (dir != D3Q19_C) {
  91                         #define ADJ_LIST(dir) adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (dir * VSIZE) + (index % VSIZE)]
  92                         *pdf = kd->PdfsActive[ADJ_LIST(D3Q19_INV[dir])];
  93                         #undef ADJ_LIST
  94                 }
  95                 else {
  96                         *pdf = kd->PdfsActive[P_INDEX_3(kdl->nCells, index, dir)];
  97                 }
  98
  99         }
 100         else {
 101                 *pdf = kd->PdfsActive[P_INDEX_5(kdl, x, y, z, dir)];
 102         }
 103
 104         return;
 105 }
 106
 107 static void FNAME(BCSetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT pdf)
 108 {
 109         Assert(kd != NULL);
 110         Assert(kd->PdfsActive != NULL);
 111         Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
 112         Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
 113         Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
 114         Assert(dir >= 0); Assert(dir < N_D3Q19);
 115
 116         if (isnan(pdf)) {
 117                 printf("ERROR: setting nan %d %d %d %d %s\n", x, y, z, dir, D3Q19_NAMES[dir]);
 118                 DEBUG_BREAK_POINT();
 119                 exit(1);
 120         }
 121
 122         KernelDataList * kdl = KDL(kd);
 123         uint32_t * adjList = kdl->AdjList;
 124
 125         if (kdl->Iteration % 2 == 0) {
 126                 // Pdfs are stored inverse, local PDFs are located in remote nodes
 127
 128                 // getting node index
 129                 uint32_t index = kdl->Grid[L_INDEX_4(kd->Dims, x, y, z)];
 130
 131                 if (dir != D3Q19_C) {
 132                         #define ADJ_LIST(dir) adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (dir * VSIZE) + (index % VSIZE)]
 133                         kd->PdfsActive[ADJ_LIST(D3Q19_INV[dir])] = pdf;
 134                         #undef ADJ_LIST
 135                 } else {
 136                         kd->PdfsActive[P_INDEX_3(kdl->nCells, index, dir)] = pdf;
 137                 }
 138
 139         } else {
 140                 kd->PdfsActive[P_INDEX_5(kdl, x, y, z, dir)] = pdf;
 141         }
 142
 143         return;
 144 }
 145
 146
 147 static void GetNode(KernelData * kd, int x, int y, int z, PdfT * pdfs)
 148 {
 149         Assert(kd != NULL);
 150         Assert(kd->PdfsActive != NULL);
 151         Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
 152         Assert(pdfs != NULL);
 153         Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
 154         Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
 155
 156         KernelDataList * kdl = KDL(kd);
 157         uint32_t * adjList = kdl->AdjList;
 158
 159         if(kdl->Iteration % 2 == 0){
 160
 161                 uint32_t index = kdl->Grid[L_INDEX_4(kdl->kd.Dims, x, y, z)];
 162
 163                 // Load PDFs of local cell: pdf_N = src[adjList[adjListIndex + D3Q19_S]]; ...
 164                 #define ADJ_LIST(dir) adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (dir * VSIZE) + (index % VSIZE)]
 165                 #define X(name, idx, idxinv, _x, _y, _z)        pdfs[idx] = kd->PdfsActive[ADJ_LIST(idxinv)];
 166                 D3Q19_LIST_WO_C
 167                         #undef X
 168                         #undef ADJ_LIST
 169                         pdfs[D3Q19_C] = kd->PdfsActive[P_INDEX_3(kdl->nCells, index, D3Q19_C)];
 170
 171         } else {
 172
 173                 #define I(x, y, z, dir) P_INDEX_5(KDL(kd), (x), (y), (z), (dir))
 174                 #define X(name, idx, idxinv, _x, _y, _z)        pdfs[idx] = kd->PdfsActive[I(x, y, z, idx)];
 175                 D3Q19_LIST
 176                         #undef X
 177                         #undef I
 178
 179         }
 180
 181         for (int d = 0; d < 19; ++d) {
 182                 if(isnan(pdfs[d]) || isinf(pdfs[d])) {
 183                         printf("%d %d %d %d nan! get node\n", x, y, z, d);
 184                         for (int d2 = 0; d2 < 19; ++d2) {
 185                                 printf("%d: %e\n", d2, pdfs[d2]);
 186                         }
 187                         exit(1);
 188                 }
 189         }
 190
 191         return;
 192 }
 193
 194
 195 static void SetNode(KernelData * kd, int x, int y, int z, PdfT * pdfs)
 196 {
 197         Assert(kd != NULL);
 198         Assert(kd->PdfsActive != NULL);
 199         Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
 200         Assert(pdfs != NULL);
 201
 202         Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
 203         Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
 204
 205         for (int d = 0; d < 19; ++d) {
 206                 if(isnan(pdfs[d])) {
 207                         printf("%d %d %d %d nan! get node\n", x, y, z, d);
 208                         for (int d2 = 0; d2 < 19; ++d2) {
 209                                 printf("%d: %e\n", d2, pdfs[d2]);
 210                         }
 211                         exit(1);
 212                 }
 213         }
 214
 215         KernelDataList * kdl = KDL(kd);
 216         uint32_t * adjList = kdl->AdjList;
 217
 218         if(kdl->Iteration % 2 == 0){
 219
 220                 uint32_t index = kdl->Grid[L_INDEX_4(kdl->kd.Dims, x, y, z)];
 221
 222                 // Load PDFs of local cell: pdf_N = src[adjList[adjListIndex + D3Q19_S]]; ...
 223                 kd->PdfsActive[P_INDEX_3(kdl->nCells, index, D3Q19_C)] = pdfs[D3Q19_C];
 224
 225                 #define ADJ_LIST(dir) adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (dir * VSIZE) + (index % VSIZE)]
 226                 #define X(name, idx, idxinv, _x, _y, _z)        kd->PdfsActive[ADJ_LIST(idxinv)] = pdfs[idx];
 227                 D3Q19_LIST_WO_C
 228                         #undef X
 229                         #undef ADJ_LIST
 230
 231         } else {
 232
 233                 #define I(x, y, z, dir) P_INDEX_5(KDL(kd), (x), (y), (z), (dir))
 234                 #define X(name, idx, idxinv, _x, _y, _z)        kd->PdfsActive[I(x, y, z, idx)] = pdfs[idx];
 235                 D3Q19_LIST
 236                         #undef X
 237                         #undef I
 238
 239         }
 240
 241         return;
 242 }
 243
 244 static void ParameterUsage()
 245 {
 246         printf("Kernel parameters:\n");
 247         printf("  [-blk <n>] [-blk-[xyz] <n>]\n");
 248
 249         return;
 250 }
 251
 252 static void ParseParameters(Parameters * params, int * blk)
 253 {
 254         Assert(blk != NULL);
 255
 256         blk[0] = 0; blk[1] = 0; blk[2] = 0;
 257
 258         #define ARG_IS(param)                   (!strcmp(params->KernelArgs[i], param))
 259         #define NEXT_ARG_PRESENT() \
 260         do { \
 261                 if (i + 1 >= params->nKernelArgs) { \
 262                         printf("ERROR: argument %s requires a parameter.\n", params->KernelArgs[i]); \
 263                         exit(1); \
 264                 } \
 265         } while (0)
 266
 267
 268         for (int i = 0; i < params->nKernelArgs; ++i) {
 269                 if (ARG_IS("-blk") || ARG_IS("--blk")) {
 270                         NEXT_ARG_PRESENT();
 271
 272                         int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 273
 274                         if (tmp <= 0) {
 275                                 printf("ERROR: blocking parameter must be > 0.\n");
 276                                 exit(1);
 277                         }
 278
 279                         blk[0] = blk[1] = blk[2] = tmp;
 280                 }
 281                 else if (ARG_IS("-blk-x") || ARG_IS("--blk-x")) {
 282                         NEXT_ARG_PRESENT();
 283
 284                         int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 285
 286                         if (tmp <= 0) {
 287                                 printf("ERROR: blocking parameter must be > 0.\n");
 288                                 exit(1);
 289                         }
 290
 291                         blk[0] = tmp;
 292                 }
 293                 else if (ARG_IS("-blk-y") || ARG_IS("--blk-y")) {
 294                         NEXT_ARG_PRESENT();
 295
 296                         int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 297
 298                         if (tmp <= 0) {
 299                                 printf("ERROR: blocking parameter must be > 0.\n");
 300                                 exit(1);
 301                         }
 302
 303                         blk[1] = tmp;
 304                 }
 305                 else if (ARG_IS("-blk-z") || ARG_IS("--blk-z")) {
 306                         NEXT_ARG_PRESENT();
 307
 308                         int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 309
 310                         if (tmp <= 0) {
 311                                 printf("ERROR: blocking parameter must be > 0.\n");
 312                                 exit(1);
 313                         }
 314
 315                         blk[2] = tmp;
 316                 }
 317                 else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) {
 318                         ParameterUsage();
 319                         exit(1);
 320                 }
 321                 else {
 322                         printf("ERROR: unknown kernel parameter.\n");
 323                         ParameterUsage();
 324                         exit(1);
 325                 }
 326         }
 327
 328         #undef ARG_IS
 329         #undef NEXT_ARG_PRESENT
 330
 331         return;
 332 }
 333
 334 static void SetuploopStartIndices(LatticeDesc * ld, KernelDataListRia * kdlr, int nThreads)
 335 {
 336         //#define ADJ_LIST(dir) adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (dir * VSIZE) + (index % VSIZE)]
 337         Assert(ld != NULL);
 338         Assert(kdlr != NULL);
 339         Assert(nThreads > 0);
 340
 341         //uint32_t * adjList = kdlr->kdl.AdjList;
 342         uint32_t * adjList = kdlr->kdl.AdjList;
 343
 344         uint32_t nLoopStartIndices = 0;
 345         uint32_t loopStartIndex = 2;
 346
 347         int nFluid = kdlr->kdl.nFluid;
 348
 349         int * oddKernelThreadStartIndices = (int *)malloc(sizeof(int) * (nThreads + 1));
 350
 351         int nNodesPerThread = nFluid / nThreads;
 352         //printf("nodesPerThread: %d\n", nNodesPerThread);
 353
 354         for (int i = 0; i < nThreads; ++i) {
 355                 oddKernelThreadStartIndices[i] = i * nNodesPerThread + MinI(i, nFluid % nThreads);
 356         }
 357
 358         oddKernelThreadStartIndices[nThreads] = nFluid;
 359
 360         /*
 361            for (int i = 0; i <= nThreads; ++i) {
 362            printf("oddKernelThreadStartIndices[%d] = %d\n", i, oddKernelThreadStartIndices[i]);
 363            }
 364            */
 365
 366         int threadIndex = 1;
 367
 368         // We execute following code two times.
 369         // - The first time to get the count of how many entries we need for the
 370         //   loopStartIndices array.
 371         // - The second time to fill the array.
 372
 373         // Loop over adjacency list of all nodes.
 374         // Compare if adjacent nodes within one cache line share the same access pattern.
 375         // First vectorized access is assumed to be consecutive (-> may be loaded with regular load).
 376
 377         int lastCacheLineConsecutive = 1;
 378
 379         for (int fluidBaseIndex = 1; fluidBaseIndex < nFluid + 1; fluidBaseIndex += VSIZE) {
 380
 381                 int currentCacheLineConsecutive = 1;
 382
 383                 // Loop over all directions except the center one.
 384                 for(int d = 0; d < N_D3Q19 - 1; ++d) {
 385                         Assert(d != D3Q19_C);
 386
 387                         // check if cache line itself has consecutive memory access pattern
 388                         for(int inChunkIndex = 0; (inChunkIndex < VSIZE - 1) && ((fluidBaseIndex + inChunkIndex) < nFluid); ++inChunkIndex) {
 389                                 int index = fluidBaseIndex + inChunkIndex;
 390
 391                                 Assert(index < nFluid);
 392
 393                                 #define ADJ_LIST(idx, dir) adjList[((idx) - ((idx) % VSIZE)) * N_D3Q19_IDX + ((dir) * VSIZE) + ((idx) % VSIZE)]
 394                                 //if (adjList[index * N_D3Q19_IDX + d] != adjList[(index - 1) * N_D3Q19_IDX + d] + 1)
 395                                 if (ADJ_LIST(index, d) != ADJ_LIST(index-1, d) + 1) {
 396                                         //printf("no match for index: %d\n", d);
 397                                         //printf("ADJ_LlST(%d,%d) = %d  !=  %d = ADJ_LlST(%d,%d) + 1\n", index, d, ADJ_LIST(index,d), ADJ_LIST(index-1,d), index-1, d);
 398                                         // Different access pattern.
 399                                         currentCacheLineConsecutive = 0;
 400                                         break;
 401                                 }
 402                                 #undef ADJ_LIST
 403
 404                         }
 405
 406                         if(!currentCacheLineConsecutive){
 407                                 break; //exit from nested loop
 408                         }
 409
 410                 }
 411
 412                 int interCacheLineConsecutive = 1;
 413
 414                 if(currentCacheLineConsecutive && lastCacheLineConsecutive){
 415                         // check if cache line has consecutive memory access pattern to last entry of previous cache line
 416                         int lastIdxOfPreviousCacheLine = fluidBaseIndex - 2;
 417                         if (lastIdxOfPreviousCacheLine > 0) {
 418                                 for(int d = 0; d < N_D3Q19 - 1; ++d) {
 419                                         Assert(d != D3Q19_C);
 420                                         #define ADJ_LIST(idx, dir) adjList[((idx) - ((idx) % VSIZE)) * N_D3Q19_IDX + ((dir) * VSIZE) + ((idx) % VSIZE)]
 421                                         if (ADJ_LIST(fluidBaseIndex-1, d) != ADJ_LIST(lastIdxOfPreviousCacheLine, d) + 1) {
 422                                                 // Different access pattern.
 423                                                 //printf("not interCacheConsecutive\n");
 424                                                 interCacheLineConsecutive = 0;
 425                                                 break;
 426                                         }
 427                                         #undef ADJ_LIST
 428
 429                                 }
 430                         }
 431                 }
 432                 int threadBoundaryIndex = oddKernelThreadStartIndices[threadIndex];
 433                 if (fluidBaseIndex - 1 <= threadBoundaryIndex &&
 434                                 threadBoundaryIndex < fluidBaseIndex + VSIZE - 1) {
 435                         // Current cache line contains thread boundary.
 436                         // These cache lines are treated by scalar peel and
 437                         // reminder loops in kernel of every thread.
 438                         // TODO maybe replace these loops with masked gather / scatter?!
 439                         if (loopStartIndex % 2 == 0) { // current index would be gather/scatter index
 440                                 ++loopStartIndex; // reserving gather/scatter index
 441                         }
 442                         ++loopStartIndex; // reserving space for start remainder loop of thread n
 443
 444                         if (threadIndex < nThreads){
 445                                 ++loopStartIndex; // reserving space for starting peel loop of thread n+1
 446
 447                                 if (fluidBaseIndex - 1 == threadBoundaryIndex){
 448                                         if(!currentCacheLineConsecutive){
 449                                                 ++loopStartIndex;
 450                                         }
 451                                 }
 452                                 else {
 453                                         currentCacheLineConsecutive = 1;
 454                                 }
 455
 456                                 //++loopStartIndex; // reserving space for ending peel loop / starting load/store of thread n+1
 457                                 ++loopStartIndex; // 1st load/store end == 1st gather/scatter start OR 1st gather/scatter end == 2nd load/start start
 458                         }
 459                         ++threadIndex;
 460                 }
 461                 else {
 462                         // We are not at a thread boundary.
 463                         if (currentCacheLineConsecutive) {
 464                                 if(lastCacheLineConsecutive && !interCacheLineConsecutive){
 465                                         loopStartIndex+=2;
 466                                 }
 467                                 else if(!lastCacheLineConsecutive){
 468                                         ++loopStartIndex;
 469                                 }
 470                         }
 471                         else {
 472                                 if(lastCacheLineConsecutive){
 473                                         ++loopStartIndex;
 474                                 }
 475                         }
 476                 }
 477
 478                 // treating special case when last thread has no remainder loop
 479                 if (oddKernelThreadStartIndices[nThreads] == fluidBaseIndex + VSIZE - 1) {
 480                         //printf("--> special case 111. loopStartIndex: %d \n", loopStartIndex);
 481                         if (loopStartIndex % 2 != 0) { // current index is gather/scatter end and load/store start index
 482                                 ++loopStartIndex; //set load/store end (gather/scatter start) to same value as scalar remainder start => no more access to gather/scatter loop
 483                         }
 484
 485                         ++loopStartIndex; // gather/scatter end and scalar remainder start
 486                         ++loopStartIndex; // scalar remainder end and scalar peel start
 487
 488                 }
 489
 490                 lastCacheLineConsecutive = currentCacheLineConsecutive;
 491         }
 492
 493         if (nFluid > 0) {
 494                 nLoopStartIndices = loopStartIndex;
 495         }
 496
 497         int * loopStartIndices;
 498         unsigned long loopStartIndicesByte = (nLoopStartIndices + 1) * sizeof(int);
 499
 500         printf("# Loop Start Index Array Allocation:\n");
 501         printf("#   elements:      \t\t%d\n",   nLoopStartIndices + 1);
 502         printf("#   size:          \t\t%e MiB\n", loopStartIndicesByte / 1024.0 / 1024.0);
 503         printf("#   alignment:     \t\t%d b\n",   PAGE_4K);
 504
 505         if (MemAllocAligned((void **)&loopStartIndices, loopStartIndicesByte, PAGE_4K)) {
 506                 printf("ERROR: allocating loopStartIndices array with MemAllocAligned failed: %lu bytes.\n", loopStartIndicesByte);
 507                 exit(1);
 508         }
 509         else {
 510                 printf("#   allocator: \t\t\tMemAllocAligned()\n");
 511         }
 512
 513         oddKernelThreadStartIndices[0] = 0;
 514         loopStartIndices[0] = 0; //first scalar loop would start with 0
 515         loopStartIndices[1] = 0; //no peel loop expected -> first load/store loop may start at index==0
 516         loopStartIndices[2] = 0; //may not be set in case first access is gather/scatter -> therefore its set here
 517
 518         // resetting values to default
 519         threadIndex = 1;
 520         lastCacheLineConsecutive = 1;
 521         loopStartIndex = 2;
 522
 523         // Loop over adjacency list of all nodes.
 524         // Compare if adjacent nodes share the same access pattern.
 525
 526         int indexAccumulator = 0;
 527
 528         // for statistical reasons:
 529         int gatherAccumulator = 0;
 530         int loadAccumulator = 0;
 531         int scalarLookups = 0;
 532         int loadLookups = 0;
 533
 534
 535         for (int fluidBaseIndex = 1; fluidBaseIndex < nFluid + 1; fluidBaseIndex += VSIZE) {
 536                 int currentCacheLineConsecutive = 1;
 537                 //printf("fluidbaseIndex: %d\n", fluidBaseIndex);
 538                 // Loop over all directions except the center one.
 539                 for(int d = 0; d < N_D3Q19 - 1; ++d) {
 540                         Assert(d != D3Q19_C);
 541
 542                         // check if cache line itself has consecutive memory access pattern
 543                         for(int inChunkIndex = 0; (inChunkIndex < VSIZE - 1) && ((fluidBaseIndex + inChunkIndex) < nFluid); ++inChunkIndex){
 544                                 int index = fluidBaseIndex + inChunkIndex;
 545
 546                                 Assert(index < nFluid);
 547
 548                                 #define ADJ_LIST(idx, dir) adjList[((idx) - ((idx) % VSIZE)) * N_D3Q19_IDX + ((dir) * VSIZE) + ((idx) % VSIZE)]
 549                                 //if (adjList[index * N_D3Q19_IDX + d] != adjList[(index - 1) * N_D3Q19_IDX + d] + 1)
 550                                 if (ADJ_LIST(index, d) != ADJ_LIST(index-1, d) + 1) {
 551                                         // Different access pattern.
 552                                         currentCacheLineConsecutive = 0;
 553                                         break;
 554                                 }
 555                                 #undef ADJ_LIST
 556                         }
 557
 558                         if(!currentCacheLineConsecutive){
 559                                 break; //exit from nested loop
 560                         }
 561                 }
 562
 563                 int interCacheLineConsecutive = 1;
 564
 565                 if(currentCacheLineConsecutive && lastCacheLineConsecutive){
 566                         // check if cache line has consecutive memory access pattern to last entry of previous cache line
 567                         int lastIdxOfPreviousCacheLine = fluidBaseIndex - 2;
 568                         if (lastIdxOfPreviousCacheLine > 0) {
 569                                 for(int d = 0; d < N_D3Q19 - 1; ++d) {
 570                                         Assert(d != D3Q19_C);
 571                                         #define ADJ_LIST(idx, dir) adjList[((idx) - ((idx) % VSIZE)) * N_D3Q19_IDX + ((dir) * VSIZE) + ((idx) % VSIZE)]
 572                                         if (ADJ_LIST(fluidBaseIndex-1, d) != ADJ_LIST(lastIdxOfPreviousCacheLine, d) + 1) {
 573                                                 // Different access pattern.
 574                                                 interCacheLineConsecutive = 0;
 575                                                 break;
 576                                         }
 577                                         #undef ADJ_LIST
 578
 579                                 }
 580                         }
 581                 }
 582
 583                 int threadBoundaryIndex = oddKernelThreadStartIndices[threadIndex];
 584                 //if (fluidBaseIndex > 3500)
 585                 //      printf("threadBoundaryIndex: %d  fluidBaseIndex-1: %d fluidBaseIndex + VSIZE - 1: %d\n", threadBoundaryIndex, fluidBaseIndex-1, fluidBaseIndex + VSIZE -1);
 586
 587                 if (fluidBaseIndex - 1 <= threadBoundaryIndex &&
 588                                 threadBoundaryIndex < fluidBaseIndex + VSIZE - 1) {
 589                         // Current cache line contains thread boundary.
 590                         // These cache lines are treated by scalar peel and
 591                         // reminder loops in kernel of every thread.
 592                         // TODO maybe replace these loops with masked gather / scatter?!
 593                         if (loopStartIndex % 2 == 0) { // current index would be gather/scatter index
 594                                 //loopStartIndices[loopStartIndex] = fluidBaseIndex - 1; //same value as scalar remainder start => no more access to gather/scatter loop
 595                                 loopStartIndices[loopStartIndex] = indexAccumulator; //same value as scalar remainder start => no more access to gather/scatter loop
 596                                 ++loopStartIndex;
 597                         }
 598
 599                         //loopStartIndices[loopStartIndex] = fluidBaseIndex - 1; // gather/scatter end and scalar remainder start
 600                         loopStartIndices[loopStartIndex] = indexAccumulator; // gather/scatter end and scalar remainder start
 601                         ++loopStartIndex;
 602
 603                         // starting indices of thread n+1
 604                         loopStartIndices[loopStartIndex] = threadBoundaryIndex; // scalar remainder of thread n end and scalar peel of thread n+1 start
 605                         oddKernelThreadStartIndices[threadIndex] = loopStartIndex; // thread start is where scalar peel starts
 606
 607                         if (threadIndex < nThreads){
 608                                 indexAccumulator = ((threadBoundaryIndex + VSIZE - 1) / VSIZE ) * VSIZE; // rounding towards next multiple of VSIZE
 609                                 ++loopStartIndex;
 610                                 loopStartIndices[loopStartIndex] = indexAccumulator; // scalar peel end and 1st load/store start
 611
 612                                 // treating special case when there is no peel / remainder loop
 613                                 if (fluidBaseIndex - 1 == threadBoundaryIndex){
 614                                         if(!currentCacheLineConsecutive){
 615                                                 ++loopStartIndex;
 616                                                 loopStartIndices[loopStartIndex] = indexAccumulator; // 1st load/store end and 1st gather/scatter start
 617                                                 gatherAccumulator += VSIZE;
 618                                         }
 619                                         else {
 620                                                 loadLookups += VSIZE;
 621                                         }
 622                                         indexAccumulator += VSIZE;
 623                                 }
 624                                 else {
 625                                         scalarLookups += VSIZE;
 626                                         currentCacheLineConsecutive = 1;
 627                                 }
 628
 629                                 ++loopStartIndex; // 1st load/store end == 1st gather/scatter start OR 1st gather/scatter end == 2nd load/start start
 630                                 loopStartIndices[loopStartIndex] = indexAccumulator; // 1st load/store end == 1st gather/scatter start OR 1st gather/scatter end == 2nd load/start start
 631                         }
 632                         ++threadIndex;
 633
 634                 }
 635                 else {
 636                         // We are not at a thread boundary.
 637                         int print = 0;
 638                         if (currentCacheLineConsecutive) {
 639                                 loadAccumulator += VSIZE;
 640
 641                                 if(lastCacheLineConsecutive && !interCacheLineConsecutive){
 642                                         loadLookups += VSIZE;
 643                                         if (print)
 644                                                 printf("#1 loopStartIndex: %d\n", loopStartIndex);
 645                                         // loopStartIndices[loopStartIndex] is not incremented since pointers need to be fetched again.
 646                                         // loopStartIndices[loopStartIndex + 1] (-> start Load/Store and end Gather/Scatter)
 647                                         // gets same value as loopStartIndices[loopStartindex] (-> start Gather/Scatter)
 648                                         // this ensures that no gather/scatter iteration is executed
 649                                         ++loopStartIndex;
 650                                         loopStartIndices[loopStartIndex] = indexAccumulator;
 651
 652                                         // loopStartIndices[loopStartIndex + 2] (-> start Gather/Scatter and end Load/Store)
 653                                         // gets set to have one Load/Store iteration
 654                                         ++loopStartIndex;
 655                                         indexAccumulator+=VSIZE;
 656                                         loopStartIndices[loopStartIndex] = indexAccumulator;
 657
 658                                 }
 659                                 else if(!lastCacheLineConsecutive){
 660                                         loadLookups += VSIZE;
 661                                         if (print)
 662                                                 printf("#2 loopStartIndex: %d\n", loopStartIndex);
 663                                         ++loopStartIndex;
 664                                         indexAccumulator+=VSIZE;
 665                                         loopStartIndices[loopStartIndex] = indexAccumulator;
 666                                 }
 667                                 else { // (lastCacheLineConsecutive && interCacheLineConsecutive)
 668                                         if (print)
 669                                                 printf("#3 loopStartIndex: %d\n", loopStartIndex);
 670                                         indexAccumulator+=VSIZE;
 671                                         loopStartIndices[loopStartIndex] = indexAccumulator;
 672                                 }
 673                         }
 674                         else {
 675                                 gatherAccumulator += VSIZE;
 676                                 if(lastCacheLineConsecutive){
 677                                         if (print)
 678                                                 printf("#4 loopStartIndex: %d\n", loopStartIndex);
 679                                         ++loopStartIndex;
 680                                         indexAccumulator+=VSIZE;
 681                                         loopStartIndices[loopStartIndex] = indexAccumulator;
 682                                 }
 683                                 else { // lastCacheLine without not consecutive memory access pattern
 684                                         if (print)
 685                                                 printf("#5 loopStartIndex: %d\n", loopStartIndex);
 686                                         indexAccumulator+=VSIZE;
 687                                         loopStartIndices[loopStartIndex] = indexAccumulator;
 688                                 }
 689                         }
 690                 }
 691
 692                 // treating special case when last thread has no remainder loop
 693                 if (oddKernelThreadStartIndices[nThreads] == fluidBaseIndex + VSIZE - 1) {
 694                         //printf("--> special case. indexAccumulator: %d\n", indexAccumulator);
 695                         if (loopStartIndex % 2 != 0) { // current index is gather/scatter end and load/store start index
 696                                 ++loopStartIndex;
 697                                 loopStartIndices[loopStartIndex] = indexAccumulator; //set load/store end (gather/scatter start) to same value as scalar remainder start => no more access to gather/scatter loop
 698                         }
 699
 700                         ++loopStartIndex;
 701                         loopStartIndices[loopStartIndex] = indexAccumulator; // gather/scatter end and scalar remainder start
 702                         ++loopStartIndex;
 703                         loopStartIndices[loopStartIndex] = indexAccumulator; // scalar remainder end and scalar peel start
 704
 705                         oddKernelThreadStartIndices[threadIndex] = loopStartIndex; // thread start is where scalar peel starts
 706                 }
 707
 708                 lastCacheLineConsecutive = currentCacheLineConsecutive;
 709
 710         }
 711
 712         if (nLoopStartIndices != loopStartIndex){
 713                 printf("ERROR: nLoopStartIndices unequal loopStartIndex!\n");
 714         }
 715
 716         /*
 717         printf("loopStartIndices:\n");
 718         for(int i = 0; i <= nLoopStartIndices; ++i){
 719                 printf("%d ", loopStartIndices[i]);
 720         }
 721         printf("\n");
 722         printf("oddKernelThreadStartIndices:\n");
 723         for(int i = 0; i <= nThreads; ++i){
 724                 printf("%d ", oddKernelThreadStartIndices[i]);
 725         }
 726         printf("\n");
 727         */
 728
 729         kdlr->loopStartIndices = loopStartIndices;
 730         kdlr->nLoopStartIndices = nLoopStartIndices;
 731
 732         kdlr->oddKernelThreadStartIndices  = oddKernelThreadStartIndices;
 733         kdlr->nOddKernelThreadStartIndices = nThreads;
 734
 735         printf("#   vload/vstore nodes:  \t% 10d   \t(%3.4f %% of total fluid nodes)\n", loadAccumulator, ((double) loadAccumulator / (double) nFluid) * 100);
 736         printf("#   gather/scatter nodes:\t% 10d   \t(%3.4f %% of total fluid nodes)\n", gatherAccumulator, ((double) gatherAccumulator / (double) nFluid) * 100.0);
 737         printf("#   vload/vstore lookups:\t% 10d \n", loadLookups * (N_D3Q19 - 1));
 738         printf("#   gather/scatter lookups:\t% 10d \n", gatherAccumulator * (N_D3Q19 - 1));
 739         printf("#   scalar lookups:      \t% 10d \n", scalarLookups * (N_D3Q19 - 1));
 740
 741         double loopBalanceEven = 2.0 * 19 * sizeof(PdfT);
 742         double loopBalanceOdd  = 2.0 * 19 * sizeof(PdfT) /* actual PDFs */
 743                 + (((double)(gatherAccumulator + loadLookups + scalarLookups)) / nFluid) * sizeof(int) * (N_D3Q19 - 1) /* AdjList */
 744                 + (nLoopStartIndices / nFluid) * sizeof(int); // one lookup to loopStartIndices
 745
 746         double loopBalance     = (loopBalanceEven + loopBalanceOdd) / 2.0;
 747
 748         kdlr->kdl.kd.LoopBalance = loopBalance;
 749
 750         printf("# loop balance:\n");
 751         printf("#   even timestep:  \t\t%.2f B/FLUP\n", loopBalanceEven);
 752         printf("#   odd timestep:   \t\t%.2f B/FLUP\n", loopBalanceOdd);
 753         printf("#   average:        \t\t%.2f B/FLUP\n", loopBalance);
 754
 755         return;
 756 }
 757
 758 void FNAME(D3Q19ListAaPvGatherHybridInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
 759 {
 760         KernelData * kd;
 761         KernelDataList * kdl;
 762         KernelDataListRia * kdlr;
 763         MemAlloc((void **)&kdlr, sizeof(KernelDataListRia));
 764
 765         kd = (KernelData *)kdlr;
 766         kdl = KDL(kdlr);
 767
 768         *kernelData = kd;
 769
 770 #ifdef DEBUG
 771         kd->Pdfs[0] = NULL;
 772         kd->Pdfs[1] = NULL;
 773         kd->PdfsActive = NULL;
 774         kd->DstPdfs = NULL;
 775         kd->SrcPdfs = NULL;
 776         kd->Dims[0] = -1;
 777         kd->Dims[1] = -1;
 778         kd->Dims[2] = -1;
 779         kd->GlobalDims[0] = -1;
 780         kd->GlobalDims[1] = -1;
 781         kd->GlobalDims[2] = -1;
 782         kd->Offsets[0] = -1;
 783         kd->Offsets[1] = -1;
 784         kd->Offsets[2] = -1;
 785
 786         kd->ObstIndices = NULL;
 787         kd->nObstIndices = -1;
 788         kd->BounceBackPdfsSrc = NULL;
 789         kd->BounceBackPdfsDst = NULL;
 790         kd->nBounceBackPdfs = -1;
 791
 792         kdl->AdjList = NULL;
 793         kdl->Coords = NULL;
 794         kdl->Grid = NULL;
 795         kdl->nCells = -1;
 796         kdl->nFluid = -1;
 797
 798         kdlr->loopStartIndices = NULL;
 799         kdlr->nLoopStartIndices = 0;
 800         kdlr->oddKernelThreadStartIndices = NULL;
 801         kdlr->nOddKernelThreadStartIndices = 0;
 802 #endif
 803
 804         kdl->Iteration = -1;
 805
 806         // Ajust the dimensions according to padding, if used.
 807         kd->Dims[0] = kd->GlobalDims[0] = ld->Dims[0];
 808         kd->Dims[1] = kd->GlobalDims[1] = ld->Dims[1];
 809         kd->Dims[2] = kd->GlobalDims[2] = ld->Dims[2];
 810
 811         int * lDims = ld->Dims;
 812
 813         int lX = lDims[0];
 814         int lY = lDims[1];
 815         int lZ = lDims[2];
 816
 817         int nTotalCells = lX * lY * lZ;
 818         int nCells = ld->nFluid; // TODO: + padding
 819         int nFluid = ld->nFluid;
 820
 821         // TODO: check nCells/nFluid do not exceed 2^31. This actually has to be
 822         // done during lattice setup.
 823         kdl->nCells = nCells;
 824         kdl->nFluid = nFluid;
 825
 826         PdfT * pdfs[2];
 827
 828         int blk[3] = { 0 };
 829
 830         ParseParameters(params, blk);
 831
 832         if (blk[0] == 0) blk[0] = lX;
 833         if (blk[1] == 0) blk[1] = lY;
 834         if (blk[2] == 0) blk[2] = lZ;
 835
 836         printf("# blocking:            \t\tx: %3d y: %3d z: %3d\n", blk[0], blk[1], blk[2]);
 837
 838         unsigned long latByte      = nCells * sizeof(PdfT) * N_D3Q19;
 839         unsigned long latFluidByte = nFluid * sizeof(PdfT) * N_D3Q19;
 840         unsigned long latPadByte   = (nCells - nFluid) * sizeof(PdfT) * N_D3Q19;
 841
 842         printf("# Lattice Array Allocation:\n");
 843         printf("#   lattice size:      \t\t%e MiB\n", latByte      / 1024.0 / 1024.0);
 844         printf("#   fluid lattice size:\t\t%e MiB\n", latFluidByte / 1024.0 / 1024.0);
 845         printf("#   lattice padding:   \t\t%e MiB\n", latPadByte   / 1024.0 / 1024.0);
 846
 847
 848         printf("#   alignment:         \t\t%d b\n", PAGE_4K);
 849
 850         if (PDF_ALLOCATOR((void **)&pdfs[0], latFluidByte, PAGE_4K)) {
 851                 printf("ERROR: allocating PDF array with %s() failed: %lu bytes.\n", STRINGIFY(PDF_ALLOCATOR), latFluidByte);
 852                 exit(1);
 853         }
 854         else {
 855                 printf("#   allocator: \t\t\t%s()\n", STRINGIFY(PDF_ALLOCATOR));
 856         }
 857
 858         kd->Pdfs[0] = pdfs[0];
 859
 860         // Initialize PDFs with some (arbitrary) data for correct NUMA placement.
 861         // Here we touch only the fluid nodes as this loop is OpenMP parallel and
 862         // we want the same scheduling as in the kernel.
 863         #ifdef _OPENMP
 864         #pragma omp parallel for
 865         #endif
 866         for (int i = 0; i < nFluid; ++i) { for(int d = 0; d < N_D3Q19; ++d) {
 867                 pdfs[0][P_INDEX_3(nCells, i, d)] = 1.0;
 868         } }
 869
 870         // Initialize all PDFs to some standard value.
 871         for (int i = 0; i < nFluid; ++i) { for(int d = 0; d < N_D3Q19; ++d) {
 872                 pdfs[0][P_INDEX_3(nCells, i, d)] = 0.0;
 873         } }
 874
 875         // ----------------------------------------------------------------------
 876         // create grid which will hold the index numbers of the fluid nodes
 877
 878         uint32_t * grid;
 879
 880         if (MemAlloc((void **)&grid, nTotalCells * sizeof(uint32_t))) {
 881                 printf("ERROR: allocating grid for numbering failed: %lu bytes.\n", nTotalCells * sizeof(uint32_t));
 882                 exit(1);
 883         }
 884         kdl->Grid = grid;
 885
 886         int latticeIndex;
 887
 888 #ifdef DEBUG
 889         for(int z = 0; z < lZ; ++z) {
 890                 for(int y = 0; y < lY; ++y) {
 891                         for(int x = 0; x < lX; ++x) {
 892
 893                                 latticeIndex = L_INDEX_4(ld->Dims, x, y, z);
 894
 895                                 grid[latticeIndex] = ~0;
 896                         }
 897                 }
 898         }
 899 #endif
 900
 901         // ----------------------------------------------------------------------
 902         // generate numbering over grid
 903
 904         uint32_t * coords;
 905
 906         if (MemAlloc((void **)&coords, nFluid * sizeof(uint32_t) * 3)) {
 907                 printf("ERROR: allocating coords array failed: %lu bytes.\n", nFluid * sizeof(uint32_t) * 3);
 908                 exit(1);
 909         }
 910
 911         kdl->Coords = coords;
 912
 913         // Index for the PDF nodes can start at 0 as we distinguish solid and fluid nodes
 914         // through the ld->Lattice array.
 915         int counter = 0;
 916
 917         // Blocking is implemented via setup of the adjacency list. The kernel later will
 918         // walk through the lattice blocked automatically.
 919         for (int bZ = 0; bZ < lZ; bZ += blk[2]) {
 920                 for (int bY = 0; bY < lY; bY += blk[1]) {
 921                         for (int bX = 0; bX < lX; bX += blk[0]) {
 922
 923                                 int eX = MIN(bX + blk[0], lX);
 924                                 int eY = MIN(bY + blk[1], lY);
 925                                 int eZ = MIN(bZ + blk[2], lZ);
 926
 927
 928                                 for (int z = bZ; z < eZ; ++z) {
 929                                         for (int y = bY; y < eY; ++y) {
 930                                                 for (int x = bX; x < eX; ++x) {
 931
 932                                                         latticeIndex = L_INDEX_4(lDims, x, y, z);
 933
 934                                                         if (ld->Lattice[latticeIndex] != LAT_CELL_OBSTACLE) {
 935                                                                 grid[latticeIndex] = counter;
 936
 937                                                                 coords[C_INDEX_X(counter)] = x;
 938                                                                 coords[C_INDEX_Y(counter)] = y;
 939                                                                 coords[C_INDEX_Z(counter)] = z;
 940
 941                                                                 ++counter;
 942                                                         }
 943                                                 } } }
 944                         } } }
 945
 946         Verify(counter == nFluid);
 947
 948         uint32_t * adjList;
 949
 950         // AoSoA addressing for adjList needs padding for (nFluid % VSIZE) != 0
 951         int nFluid_padded = ((nFluid + VSIZE - 1) / VSIZE) * VSIZE;
 952         unsigned long adjListBytes = nFluid_padded * sizeof(int) * N_D3Q19_IDX;
 953
 954         printf("# Adjacency List Allocation:\n");
 955         printf("#   size:              \t\t%e MiB\n", adjListBytes / 1024.0 / 1024.0);
 956         printf("#   alignment:         \t\t%d b\n", PAGE_4K);
 957
 958         // AdjList only requires 18 instead of 19 entries per node, as
 959         // the center PDF needs no addressing.
 960         if (ADJ_LIST_ALLOCATOR((void **)&adjList, adjListBytes, PAGE_4K)) {
 961                 printf("ERROR: allocating adjList array with %s() failed: %lu bytes.\n", STRINGIFY(ADJ_LIST_ALLOCATOR), adjListBytes);
 962                 exit(1);
 963         }
 964         else {
 965                 printf("#   allocator: \t\t\t%s()\n", STRINGIFY(ADJ_LIST_ALLOCATOR));
 966         }
 967
 968         for (int i = 0; i < nFluid_padded; ++i){
 969                 adjList[i] = -1;
 970         }
 971
 972         kdl->AdjList = adjList;
 973
 974         int x, y, z;
 975
 976         uint32_t neighborIndex;
 977         uint32_t dstIndex;
 978
 979         int nx, ny, nz, px, py, pz;
 980
 981         // Loop over all fluid nodes and compute the indices to the neighboring
 982         // PDFs for configured data layout (AoS/SoA).
 983         // Parallelized loop to ensure correct NUMA placement.
 984         // #ifdef _OPENMP  --> add line continuation
 985         //      #pragma omp parallel for default(none)
 986         //              shared(nFluid, nCells, coords, D3Q19_INV, D3Q19_X, D3Q19_Y, D3Q19_Z,
 987         //                              stderr,
 988         //                              lDims, grid, ld, lX, lY, lZ, adjList)
 989         //              private(x, y, z, nx, ny, nz, neighborIndex, dstIndex)
 990         // #endif
 991
 992         for (int fluidBaseIndex = 0; fluidBaseIndex < nFluid; fluidBaseIndex+=VSIZE) {
 993
 994
 995                 // Loop over all directions except the center one.
 996                 for(int d = 0; d < N_D3Q19 - 1; ++d) {
 997                         Assert(d != D3Q19_C);
 998
 999                         for(int inChunkIndex = 0; (inChunkIndex < VSIZE) && ((fluidBaseIndex + inChunkIndex) < nFluid); ++inChunkIndex){
1000                                 int index = fluidBaseIndex + inChunkIndex;
1001
1002                                 Assert(index < nFluid);
1003
1004                                 x = coords[C_INDEX_X(index)];
1005                                 y = coords[C_INDEX_Y(index)];
1006                                 z = coords[C_INDEX_Z(index)];
1007
1008                                 Assert(x >= 0 && x < lX);
1009                                 Assert(y >= 0 && y < lY);
1010                                 Assert(z >= 0 && z < lZ);
1011
1012                                 Assert(ld->Lattice[L_INDEX_4(lDims, x, y, z)] != LAT_CELL_OBSTACLE);
1013
1014 #ifdef PROP_MODEL_PUSH
1015                                 nx = x + D3Q19_X[d];
1016                                 ny = y + D3Q19_Y[d];
1017                                 nz = z + D3Q19_Z[d];
1018
1019 #elif PROP_MODEL_PULL
1020                                 nx = x - D3Q19_X[d];
1021                                 ny = y - D3Q19_Y[d];
1022                                 nz = z - D3Q19_Z[d];
1023 #else
1024                                 #error No implementation for this PROP_MODEL_NAME.
1025 #endif
1026                                 // If the neighbor is outside the latice in X direction and we have a
1027                                 // periodic boundary then we need to wrap around.
1028                                 if (    ((nx < 0 || nx >= lX) && ld->PeriodicX) ||
1029                                                 ((ny < 0 || ny >= lY) && ld->PeriodicY) ||
1030                                                 ((nz < 0 || nz >= lZ) && ld->PeriodicZ)
1031                                    ){
1032                                         // x periodic
1033
1034                                         if (nx < 0) {
1035                                                 px = lX - 1;
1036                                         }
1037                                         else if (nx >= lX) {
1038                                                 px = 0;
1039                                         } else {
1040                                                 px = nx;
1041                                         }
1042                                         // y periodic
1043                                         if (ny < 0) {
1044                                                 py = lY - 1;
1045                                         }
1046                                         else if (ny >= lY) {
1047                                                 py = 0;
1048                                         } else {
1049                                                 py = ny;
1050                                         }
1051
1052                                         // z periodic
1053                                         if (nz < 0) {
1054                                                 pz = lZ - 1;
1055                                         }
1056                                         else if (nz >= lZ) {
1057                                                 pz = 0;
1058                                         } else {
1059                                                 pz = nz;
1060                                         }
1061
1062                                         if (ld->Lattice[L_INDEX_4(lDims, px, py, pz)] == LAT_CELL_OBSTACLE) {
1063                                                 dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
1064                                         }
1065                                         else {
1066                                                 neighborIndex = grid[L_INDEX_4(lDims, px, py, pz)];
1067
1068                                                 AssertMsg(neighborIndex != ~0, "Neighbor has no Index. (%d %d %d) direction %s (%d)\n", px, py, pz, D3Q19_NAMES[d], d);
1069
1070                                                 dstIndex = P_INDEX_3(nCells, neighborIndex, d);
1071                                         }
1072                                 }
1073                                 else if (nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) {
1074                                         dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
1075                                 }
1076                                 else if (ld->Lattice[L_INDEX_4(lDims, nx, ny, nz)] == LAT_CELL_OBSTACLE) {
1077                                         dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
1078                                 }
1079                                 else {
1080                                         neighborIndex = grid[L_INDEX_4(lDims, nx, ny, nz)];
1081
1082                                         Assert(neighborIndex != ~0);
1083
1084                                         dstIndex = P_INDEX_3(nCells, neighborIndex, d);
1085                                 }
1086
1087                                 Assert(dstIndex >= 0);
1088                                 Assert(dstIndex < nCells * N_D3Q19);
1089
1090                                 adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (d * VSIZE) + (index % VSIZE)] = dstIndex;
1091                         }
1092                 }
1093         }
1094
1095 // Sets unused adjList entries to some extreme value which triggers and SIGSEG, whenever these values are accidently accessed.
1096         for(int index = nFluid; index < nFluid_padded; ++index){
1097                 for(int d = 0; d < N_D3Q19 - 1; ++d) {
1098                         adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (d * VSIZE) + (index % VSIZE)] = -10*1000*1000;
1099                 }
1100         }
1101
1102 /*
1103         printf("============\n");
1104         for (int i = 0; i < nFluid_padded * (N_D3Q19_IDX + 20);){
1105                 for (int j = 0; j < VSIZE; ++j){
1106                         printf("%d ",adjList[i]);
1107                         ++i;
1108                 }
1109                 printf("\n");
1110         }
1111            for(int dir = 0; dir < N_D3Q19; ++dir){
1112            printf("dir: %d\n",dir);
1113            for(int baseIndex = 0; baseIndex < nFluid + VSIZE; baseIndex+=VSIZE){
1114            for(int i = 0; i < VSIZE; ++i){
1115            int index = baseIndex + i;
1116
1117            printf("%d ", adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (dir * VSIZE) + (index % VSIZE)]);
1118            }
1119            printf("\n");
1120            }
1121            printf("\n");
1122            }
1123         printf("============\n");
1124 */
1125
1126
1127         int nThreads = 1;
1128
1129 #ifdef _OPENMP
1130         nThreads = omp_get_max_threads();
1131 #endif
1132
1133         SetuploopStartIndices(ld, KDLR(kd), nThreads);
1134
1135         // Fill remaining KernelData structures
1136         kd->GetNode = GetNode;
1137         kd->SetNode = SetNode;
1138
1139         kd->BoundaryConditionsGetPdf = FNAME(BCGetPdf);
1140         kd->BoundaryConditionsSetPdf = FNAME(BCSetPdf);
1141
1142         kd->Kernel = FNAME(D3Q19ListAaPvGatherHybridKernel);
1143
1144         kd->DstPdfs = NULL;
1145         kd->PdfsActive = kd->Pdfs[0];
1146
1147         return;
1148 }
1149
1150 void FNAME(D3Q19ListAaPvGatherHybridDeinit)(LatticeDesc * ld, KernelData ** kernelData)
1151 {
1152         KernelDataListRia ** kdlr = (KernelDataListRia **)kernelData;
1153
1154         MemFree((void **)&((*kdlr)->loopStartIndices));
1155
1156         if ((*kdlr)->oddKernelThreadStartIndices != NULL) {
1157                 MemFree((void **)&((*kdlr)->oddKernelThreadStartIndices));
1158         }
1159
1160         KernelDataList ** kdl = (KernelDataList **)kernelData;
1161
1162         ADJ_LIST_FREE((void **)&((*kdl)->AdjList));
1163
1164         MemFree((void **)&((*kdl)->Coords));
1165         MemFree((void **)&((*kdl)->Grid));
1166
1167         PDF_FREE((void **)&((*kernelData)->Pdfs[0]));
1168
1169         MemFree((void **)kernelData);
1170         return;
1171 }
1172 #undef PAGE_4K
1173 #undef ADJ_LIST