src/BenchKernelD3Q19ListAaPvGatherCommon.c

   1 // --------------------------------------------------------------------------
   2 //
   3 // Copyright
   4 //   Markus Wittmann, 2016-2017
   5 //   RRZE, University of Erlangen-Nuremberg, Germany
   6 //   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
   7 //
   8 //   Viktor Haag, 2016
   9 //   LSS, University of Erlangen-Nuremberg, Germany
  10 //
  11 //   Michael Hussnaetter, 2017-2018
  12 //   University of Erlangen-Nuremberg, Germany
  13 //   michael.hussnaetter -at- fau.de
  14 //
  15 //  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
  16 //
  17 //  LbmBenchKernels is free software: you can redistribute it and/or modify
  18 //  it under the terms of the GNU General Public License as published by
  19 //  the Free Software Foundation, either version 3 of the License, or
  20 //  (at your option) any later version.
  21 //
  22 //  LbmBenchKernels is distributed in the hope that it will be useful,
  23 //  but WITHOUT ANY WARRANTY; without even the implied warranty of
  24 //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  25 //  GNU General Public License for more details.
  26 //
  27 //  You should have received a copy of the GNU General Public License
  28 //  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
  29 //
  30 // --------------------------------------------------------------------------
  31 #include "BenchKernelD3Q19ListAaPvGatherCommon.h"
  32
  33 #include "Memory.h"
  34 #include "Vector.h"
  35 #include "Vtk.h"
  36
  37 #include <math.h>
  38
  39 #ifdef _OPENMP
  40         #include <omp.h>
  41 #endif
  42
  43 #define PAGE_4K         4096
  44
  45 #if ALLOC_ADJ_LIST_IN_HBM == 1
  46         #define ADJ_LIST_ALLOCATOR HbwAllocAligned
  47         #define ADJ_LIST_FREE HbwFree
  48 #else
  49         #define ADJ_LIST_ALLOCATOR MemAllocAligned
  50         #define ADJ_LIST_FREE MemFree
  51 #endif
  52
  53 #if ALLOC_PDF_IN_HBM == 1
  54         #define PDF_ALLOCATOR HbwAllocAligned
  55         #define PDF_FREE HbwFree
  56 #else
  57         #define PDF_ALLOCATOR MemAllocAligned
  58         #define PDF_FREE MemFree
  59 #endif
  60
  61 // Forward definition.
  62 void FNAME(D3Q19ListAaPvGatherKernel)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd);
  63
  64
  65 // -----------------------------------------------------------------------
  66 // Functions which are used as callback by the kernel to read or write
  67 // PDFs and nodes.
  68
  69
  70 static void FNAME(BCGetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT * pdf)
  71 {
  72         Assert(kd != NULL);
  73         Assert(kd->PdfsActive != NULL);
  74         Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
  75         Assert(pdf != NULL);
  76
  77         Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
  78         Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
  79         Assert(dir >= 0); Assert(dir < N_D3Q19);
  80
  81         KernelDataList * kdl = KDL(kd);
  82         uint32_t * adjList = kdl->AdjList;
  83
  84         if (kdl->Iteration % 2 == 0) {
  85                 // Pdfs are stored inverse, local PDFs are located in remote nodes
  86
  87                 // getting node index
  88                 uint32_t index = kdl->Grid[L_INDEX_4(kd->Dims, x, y, z)];
  89
  90                 if (dir != D3Q19_C) {
  91                         #define ADJ_LIST(dir) adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (dir * VSIZE) + (index % VSIZE)]
  92                         *pdf = kd->PdfsActive[ADJ_LIST(D3Q19_INV[dir])];
  93                         #undef ADJ_LIST
  94                 }
  95                 else {
  96                         *pdf = kd->PdfsActive[P_INDEX_3(kdl->nCells, index, dir)];
  97                 }
  98
  99         }
 100         else {
 101                 *pdf = kd->PdfsActive[P_INDEX_5(kdl, x, y, z, dir)];
 102         }
 103
 104         return;
 105 }
 106
 107 static void FNAME(BCSetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT pdf)
 108 {
 109         Assert(kd != NULL);
 110         Assert(kd->PdfsActive != NULL);
 111         Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
 112         Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
 113         Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
 114         Assert(dir >= 0); Assert(dir < N_D3Q19);
 115
 116         if (isnan(pdf)) {
 117                 printf("ERROR: setting nan %d %d %d %d %s\n", x, y, z, dir, D3Q19_NAMES[dir]);
 118                 DEBUG_BREAK_POINT();
 119                 exit(1);
 120         }
 121
 122         KernelDataList * kdl = KDL(kd);
 123         uint32_t * adjList = kdl->AdjList;
 124
 125         if (kdl->Iteration % 2 == 0) {
 126                 // Pdfs are stored inverse, local PDFs are located in remote nodes
 127
 128                 // getting node index
 129                 uint32_t index = kdl->Grid[L_INDEX_4(kd->Dims, x, y, z)];
 130
 131                 if (dir != D3Q19_C) {
 132                         #define ADJ_LIST(dir) adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (dir * VSIZE) + (index % VSIZE)]
 133                         kd->PdfsActive[ADJ_LIST(D3Q19_INV[dir])] = pdf;
 134                         #undef ADJ_LIST
 135                 } else {
 136                         kd->PdfsActive[P_INDEX_3(kdl->nCells, index, dir)] = pdf;
 137                 }
 138
 139         } else {
 140                 kd->PdfsActive[P_INDEX_5(kdl, x, y, z, dir)] = pdf;
 141         }
 142
 143         return;
 144 }
 145
 146
 147 static void GetNode(KernelData * kd, int x, int y, int z, PdfT * pdfs)
 148 {
 149         Assert(kd != NULL);
 150         Assert(kd->PdfsActive != NULL);
 151         Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
 152         Assert(pdfs != NULL);
 153         Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
 154         Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
 155
 156         KernelDataList * kdl = KDL(kd);
 157         uint32_t * adjList = kdl->AdjList;
 158
 159         if(kdl->Iteration % 2 == 0){
 160
 161                 uint32_t index = kdl->Grid[L_INDEX_4(kdl->kd.Dims, x, y, z)];
 162
 163                 // Load PDFs of local cell: pdf_N = src[adjList[adjListIndex + D3Q19_S]]; ...
 164                 #define ADJ_LIST(dir) adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (dir * VSIZE) + (index % VSIZE)]
 165                 #define X(name, idx, idxinv, _x, _y, _z)        pdfs[idx] = kd->PdfsActive[ADJ_LIST(idxinv)];
 166                         D3Q19_LIST_WO_C
 167                 #undef X
 168                 #undef ADJ_LIST
 169                 pdfs[D3Q19_C] = kd->PdfsActive[P_INDEX_3(kdl->nCells, index, D3Q19_C)];
 170
 171         } else {
 172
 173                 #define I(x, y, z, dir) P_INDEX_5(KDL(kd), (x), (y), (z), (dir))
 174                 #define X(name, idx, idxinv, _x, _y, _z)        pdfs[idx] = kd->PdfsActive[I(x, y, z, idx)];
 175                         D3Q19_LIST
 176                 #undef X
 177                 #undef I
 178
 179         }
 180
 181         for (int d = 0; d < 19; ++d) {
 182                 if(isnan(pdfs[d]) || isinf(pdfs[d])) {
 183                         printf("%d %d %d %d nan! get node\n", x, y, z, d);
 184                         for (int d2 = 0; d2 < 19; ++d2) {
 185                                 printf("%d: %e\n", d2, pdfs[d2]);
 186                         }
 187                         exit(1);
 188                 }
 189         }
 190
 191         return;
 192 }
 193
 194
 195 static void SetNode(KernelData * kd, int x, int y, int z, PdfT * pdfs)
 196 {
 197         Assert(kd != NULL);
 198         Assert(kd->PdfsActive != NULL);
 199         Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
 200         Assert(pdfs != NULL);
 201
 202         Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
 203         Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
 204
 205         for (int d = 0; d < 19; ++d) {
 206                 if(isnan(pdfs[d])) {
 207                         printf("%d %d %d %d nan! get node\n", x, y, z, d);
 208                         for (int d2 = 0; d2 < 19; ++d2) {
 209                                 printf("%d: %e\n", d2, pdfs[d2]);
 210                         }
 211                         exit(1);
 212                 }
 213         }
 214
 215         KernelDataList * kdl = KDL(kd);
 216         uint32_t * adjList = kdl->AdjList;
 217
 218         if(kdl->Iteration % 2 == 0){
 219
 220                 uint32_t index = kdl->Grid[L_INDEX_4(kdl->kd.Dims, x, y, z)];
 221
 222                 // Load PDFs of local cell: pdf_N = src[adjList[adjListIndex + D3Q19_S]]; ...
 223                 kd->PdfsActive[P_INDEX_3(kdl->nCells, index, D3Q19_C)] = pdfs[D3Q19_C];
 224
 225                 #define ADJ_LIST(dir) adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (dir * VSIZE) + (index % VSIZE)]
 226                 #define X(name, idx, idxinv, _x, _y, _z)        kd->PdfsActive[ADJ_LIST(idxinv)] = pdfs[idx];
 227                         D3Q19_LIST_WO_C
 228                 #undef X
 229                 #undef ADJ_LIST
 230
 231         } else {
 232
 233                 #define I(x, y, z, dir) P_INDEX_5(KDL(kd), (x), (y), (z), (dir))
 234                 #define X(name, idx, idxinv, _x, _y, _z)        kd->PdfsActive[I(x, y, z, idx)] = pdfs[idx];
 235                         D3Q19_LIST
 236                 #undef X
 237                 #undef I
 238
 239         }
 240
 241         return;
 242 }
 243
 244 static void ParameterUsage()
 245 {
 246         printf("Kernel parameters:\n");
 247         printf("  [-blk <n>] [-blk-[xyz] <n>]\n");
 248
 249         return;
 250 }
 251
 252 static void ParseParameters(Parameters * params, int * blk)
 253 {
 254         Assert(blk != NULL);
 255
 256         blk[0] = 0; blk[1] = 0; blk[2] = 0;
 257
 258         #define ARG_IS(param)                   (!strcmp(params->KernelArgs[i], param))
 259         #define NEXT_ARG_PRESENT() \
 260                 do { \
 261                         if (i + 1 >= params->nKernelArgs) { \
 262                                 printf("ERROR: argument %s requires a parameter.\n", params->KernelArgs[i]); \
 263                                 exit(1); \
 264                         } \
 265                 } while (0)
 266
 267
 268         for (int i = 0; i < params->nKernelArgs; ++i) {
 269                 if (ARG_IS("-blk") || ARG_IS("--blk")) {
 270                         NEXT_ARG_PRESENT();
 271
 272                         int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 273
 274                         if (tmp <= 0) {
 275                                 printf("ERROR: blocking parameter must be > 0.\n");
 276                                 exit(1);
 277                         }
 278
 279                         blk[0] = blk[1] = blk[2] = tmp;
 280                 }
 281                 else if (ARG_IS("-blk-x") || ARG_IS("--blk-x")) {
 282                         NEXT_ARG_PRESENT();
 283
 284                         int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 285
 286                         if (tmp <= 0) {
 287                                 printf("ERROR: blocking parameter must be > 0.\n");
 288                                 exit(1);
 289                         }
 290
 291                         blk[0] = tmp;
 292                 }
 293                 else if (ARG_IS("-blk-y") || ARG_IS("--blk-y")) {
 294                         NEXT_ARG_PRESENT();
 295
 296                         int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 297
 298                         if (tmp <= 0) {
 299                                 printf("ERROR: blocking parameter must be > 0.\n");
 300                                 exit(1);
 301                         }
 302
 303                         blk[1] = tmp;
 304                 }
 305                 else if (ARG_IS("-blk-z") || ARG_IS("--blk-z")) {
 306                         NEXT_ARG_PRESENT();
 307
 308                         int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 309
 310                         if (tmp <= 0) {
 311                                 printf("ERROR: blocking parameter must be > 0.\n");
 312                                 exit(1);
 313                         }
 314
 315                         blk[2] = tmp;
 316                 }
 317                 else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) {
 318                         ParameterUsage();
 319                         exit(1);
 320                 }
 321                 else {
 322                         printf("ERROR: unknown kernel parameter.\n");
 323                         ParameterUsage();
 324                         exit(1);
 325                 }
 326         }
 327
 328         #undef ARG_IS
 329         #undef NEXT_ARG_PRESENT
 330
 331         return;
 332 }
 333
 334 static void SetupConsecNodes(LatticeDesc * ld, KernelDataListRia * kdlr, int nThreads)
 335 {
 336         Assert(ld != NULL);
 337         Assert(kdlr != NULL);
 338         Assert(nThreads > 0);
 339
 340         uint32_t * adjList = kdlr->kdl.AdjList;
 341
 342         uint32_t nConsecNodes = 0;
 343         uint32_t consecIndex = 0;
 344
 345         int nFluid = kdlr->kdl.nFluid;
 346
 347         uint32_t * consecThreadIndices = (uint32_t *)malloc(sizeof(uint32_t) * (nThreads + 1));
 348
 349         int nNodesPerThread = nFluid / nThreads;
 350
 351         for (int i = 0; i < nThreads; ++i) {
 352                 consecThreadIndices[i] = i * nNodesPerThread + MinI(i, nFluid % nThreads);
 353         }
 354         consecThreadIndices[nThreads] = nFluid;
 355
 356         int indexThread = 1;
 357         int similarPatterns = 0;
 358         int wasLastChunkThreadBoundary = 0;
 359         // We execute following code two times.
 360         // - The first time to get the count of how many entries we need for the
 361         //   consecNodes array.
 362         // - The second time to fill the array.
 363
 364         // Loop over adjacency list of all nodes.
 365     // Compare if adjacent nodes share the same access pattern.
 366         for (int fluidBaseIndex = VSIZE; fluidBaseIndex < nFluid; fluidBaseIndex += VSIZE) {
 367
 368                 int hasSimilarAccessPattern = 1;
 369
 370                 // Loop over all directions except the center one.
 371                 for(int d = 0; d < N_D3Q19 - 1; ++d) {
 372                         Assert(d != D3Q19_C);
 373
 374                         // check if cache line itself has consecutive memory access pattern
 375                         for(int inChunkIndex = 0; (inChunkIndex < VSIZE) && ((fluidBaseIndex + inChunkIndex) < nFluid); ++inChunkIndex){
 376                                 int index = fluidBaseIndex + inChunkIndex;
 377
 378                                 Assert(index < nFluid);
 379
 380                                 #define ADJ_LIST(idx, dir) adjList[((idx) - ((idx) % VSIZE)) * N_D3Q19_IDX + ((dir) * VSIZE) + ((idx) % VSIZE)]
 381                                 //if (adjList[index * N_D3Q19_IDX + d] != adjList[(index - 1) * N_D3Q19_IDX + d] + 1)
 382                                 if (ADJ_LIST(index, d) != ADJ_LIST(index-VSIZE, d) + VSIZE) {
 383                                         //printf("different @: ADJ_LST(%d,%d)=%d != %d=ADJ_LST(%d, %d) + VSIZE\n", index, d, ADJ_LIST(index,d), ADJ_LIST(index-VSIZE,d) + VSIZE, index-VSIZE, d);
 384                                         // Different access pattern.
 385                                         hasSimilarAccessPattern = 0;
 386                                         break;
 387                                 }
 388                                 #undef ADJ_LIST
 389                         }
 390
 391                         if(!hasSimilarAccessPattern){
 392                                 break; //exit from nested loop
 393                         }
 394                 }
 395
 396                 long threadBoundaryIndex = consecThreadIndices[indexThread];
 397                 if (fluidBaseIndex <= threadBoundaryIndex &&
 398                                 threadBoundaryIndex < fluidBaseIndex + VSIZE) {
 399                         // Current chunk contains thread boundary.
 400                         // These chunks are treated by scalar peel and reminder loops
 401                         // in kernel of every thread to ensure VSIZE aligned access to
 402                         // adjacency list
 403
 404                         // final cells of current thread
 405                         ++consecIndex;
 406
 407                         // first cells of next thread
 408                         ++indexThread;
 409                         ++consecIndex;
 410
 411                         wasLastChunkThreadBoundary = 1;
 412                 }
 413                 else {
 414                         // We are not at a thread boundary
 415                         if (hasSimilarAccessPattern && !wasLastChunkThreadBoundary){
 416                                 ++similarPatterns;
 417                         }
 418                         else {
 419                                 ++consecIndex;
 420                         }
 421
 422                         wasLastChunkThreadBoundary = 0;
 423
 424                         /*
 425                         if (!hasSimilarAccessPattern) {
 426                                 ++consecIndex;
 427                         }
 428                         else {
 429                                 ++similarPatterns;
 430                         }
 431                         */
 432                 }
 433         }
 434
 435         if (nFluid > 0) {
 436                 nConsecNodes = consecIndex + 1;
 437         }
 438
 439         uint32_t * consecNodes;
 440         MemAlloc((void **)&consecNodes, sizeof(uint32_t) * nConsecNodes);
 441
 442         unsigned long consecNodesByte = (nConsecNodes) * sizeof(uint32_t);
 443
 444         printf("# Consec. Nodes Array Allocation:\n");
 445         printf("#   similar patterns\t\t%d\n", similarPatterns);
 446         printf("#   elements:      \t\t%d\n",   nConsecNodes);
 447         printf("#   size:          \t\t%e MiB\n", consecNodesByte / 1024.0 / 1024.0);
 448         printf("#   alignment:     \t\t%d b\n",   PAGE_4K);
 449
 450         if (MemAllocAligned((void **)&consecNodesByte, consecNodesByte, PAGE_4K)) {
 451                         printf("ERROR: allocating consecNodes array with MemAllocAligned failed: %lu bytes.\n", consecNodesByte);
 452                         exit(1);
 453         }
 454         else {
 455                 printf("#   allocator: \t\t\tMemAllocAligned()\n");
 456         }
 457
 458         consecIndex = 0;
 459
 460         if (nFluid > 0) {
 461                 consecNodes[consecIndex] = VSIZE;
 462         }
 463
 464         indexThread = 1;
 465         consecThreadIndices[0] = 0;
 466
 467         //add first chunk manually to enable backward check for consecutive pattern
 468         consecNodes[consecIndex] = VSIZE;
 469
 470         wasLastChunkThreadBoundary = 0;
 471
 472         // Loop over adjacency list of all nodes.
 473     // Compare if access pattern does not change on chunk level
 474         // Since gather instructions are used, access pattern may not be consecutive
 475         for (int fluidBaseIndex = VSIZE; fluidBaseIndex < nFluid; fluidBaseIndex += VSIZE) {
 476
 477                 int hasSimilarAccessPattern = 1;
 478
 479                 // Loop over all directions except the center one.
 480                 for(int d = 0; d < N_D3Q19 - 1; ++d) {
 481                         Assert(d != D3Q19_C);
 482
 483                         // check if cache line itself has consecutive memory access pattern
 484                         for(int inChunkIndex = 0; (inChunkIndex < VSIZE) && ((fluidBaseIndex + inChunkIndex) < nFluid); ++inChunkIndex){
 485                                 int index = fluidBaseIndex + inChunkIndex;
 486
 487                                 Assert(index < nFluid);
 488
 489                                 #define ADJ_LIST(idx, dir) adjList[((idx) - ((idx) % VSIZE)) * N_D3Q19_IDX + ((dir) * VSIZE) + ((idx) % VSIZE)]
 490                                 //if (adjList[index * N_D3Q19_IDX + d] != adjList[(index - 1) * N_D3Q19_IDX + d] + 1)
 491                                 if (ADJ_LIST(index, d) != ADJ_LIST(index-VSIZE, d) + VSIZE) {
 492                                         // Different access pattern.
 493                                         hasSimilarAccessPattern = 0;
 494                                         break;
 495                                 }
 496                                 #undef ADJ_LIST
 497                         }
 498
 499                         if(!hasSimilarAccessPattern){
 500                                 break; //exit from nested loop
 501                         }
 502                 }
 503
 504                 long threadBoundaryIndex = consecThreadIndices[indexThread];
 505                 if (fluidBaseIndex <= threadBoundaryIndex &&
 506                                 threadBoundaryIndex < fluidBaseIndex + VSIZE) {
 507                         // Current chunk contains thread boundary.
 508                         // These chunks are treated by scalar peel and reminder loops
 509                         // in kernel of every thread to ensure VSIZE aligned access to
 510                         // adjacency list
 511
 512                         // final cells of current thread
 513                         ++consecIndex;
 514                         //consecThreadIndices[indexThread] = consecIndex;
 515                         consecNodes[consecIndex] = threadBoundaryIndex - fluidBaseIndex;
 516
 517
 518                         // first cells of next thread
 519                         ++consecIndex;
 520                         consecThreadIndices[indexThread] = consecIndex;
 521                         consecNodes[consecIndex] = (fluidBaseIndex + VSIZE) - threadBoundaryIndex;
 522                         ++indexThread;
 523
 524                         wasLastChunkThreadBoundary = 1;
 525
 526                 }
 527                 else {
 528                         // We are not at a thread boundary
 529                         if (hasSimilarAccessPattern && !wasLastChunkThreadBoundary){
 530                                 Assert(consecIndex < nConsecNodes);
 531                                 consecNodes[consecIndex] += VSIZE;
 532                         }
 533                         else {
 534                                 ++consecIndex;
 535                                 Assert(consecIndex < nConsecNodes);
 536                                 consecNodes[consecIndex] = VSIZE;
 537                         }
 538
 539                         /*
 540                         if (!hasSimilarAccessPattern) {
 541                                 ++consecIndex;
 542                                 Assert(consecIndex < nConsecNodes);
 543                                 consecNodes[consecIndex] = VSIZE;
 544                         }
 545                         else {
 546                                 Assert(consecIndex < nConsecNodes);
 547                                 consecNodes[consecIndex] += VSIZE;
 548                         }
 549                         */
 550                         wasLastChunkThreadBoundary = 0;
 551
 552                 }
 553         }
 554
 555         /*
 556         printf("consecNodes:\n");
 557         for(int i = 0; i < nConsecNodes + 5; ++i){
 558                 printf("%d ", consecNodes[i]);
 559         }
 560         printf("\n");
 561         */
 562         /*
 563         printf("consecThreadIndices:\n");
 564         for(int i = 0; i < nThreads + 5; ++i){
 565                 printf("%d ", consecThreadIndices[i]);
 566         }
 567         printf("\n");
 568         */
 569
 570         kdlr->ConsecNodes = consecNodes;
 571         kdlr->nConsecNodes = nConsecNodes;
 572
 573         kdlr->ConsecThreadIndices  = consecThreadIndices;
 574         kdlr->nConsecThreadIndices = nThreads;
 575
 576         double loopBalanceEven = 2.0 * 19 * sizeof(PdfT);
 577         //N_D3Q19 - 1: C lookup not required, +1: transfer of consecValue
 578         double loopBalanceOdd  = 2.0 * 19 * sizeof(PdfT) + ((double)nConsecNodes *((N_D3Q19 - 1) * VSIZE + 1)) / nFluid * sizeof(int);
 579         double loopBalance     = (loopBalanceEven + loopBalanceOdd) / 2.0;
 580
 581         kdlr->kdl.kd.LoopBalance = loopBalance;
 582
 583         printf("# loop balance:\n");
 584         printf("#   even timestep:  \t\t%.2f B/FLUP\n", loopBalanceEven);
 585         printf("#   odd timestep:   \t\t%.2f B/FLUP\n", loopBalanceOdd);
 586         printf("#   average:        \t\t%.2f B/FLUP\n", loopBalance);
 587
 588         return;
 589 }
 590
 591 void FNAME(D3Q19ListAaPvGatherInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
 592 {
 593         KernelData * kd;
 594         KernelDataList * kdl;
 595         KernelDataListRia * kdlr;
 596         MemAlloc((void **)&kdlr, sizeof(KernelDataListRia));
 597
 598         kd = (KernelData *)kdlr;
 599         kdl = KDL(kdlr);
 600
 601         *kernelData = kd;
 602
 603 #ifdef DEBUG
 604         kd->Pdfs[0] = NULL;
 605         kd->Pdfs[1] = NULL;
 606         kd->PdfsActive = NULL;
 607         kd->DstPdfs = NULL;
 608         kd->SrcPdfs = NULL;
 609         kd->Dims[0] = -1;
 610         kd->Dims[1] = -1;
 611         kd->Dims[2] = -1;
 612         kd->GlobalDims[0] = -1;
 613         kd->GlobalDims[1] = -1;
 614         kd->GlobalDims[2] = -1;
 615         kd->Offsets[0] = -1;
 616         kd->Offsets[1] = -1;
 617         kd->Offsets[2] = -1;
 618
 619         kd->ObstIndices = NULL;
 620         kd->nObstIndices = -1;
 621         kd->BounceBackPdfsSrc = NULL;
 622         kd->BounceBackPdfsDst = NULL;
 623         kd->nBounceBackPdfs = -1;
 624
 625         kdl->AdjList = NULL;
 626         kdl->Coords = NULL;
 627         kdl->Grid = NULL;
 628         kdl->nCells = -1;
 629         kdl->nFluid = -1;
 630
 631         kdlr->ConsecNodes = NULL;
 632         kdlr->nConsecNodes = 0;
 633         kdlr->ConsecThreadIndices = NULL;
 634         kdlr->nConsecThreadIndices = 0;
 635 #endif
 636
 637         // Ajust the dimensions according to padding, if used.
 638         kd->Dims[0] = kd->GlobalDims[0] = ld->Dims[0];
 639         kd->Dims[1] = kd->GlobalDims[1] = ld->Dims[1];
 640         kd->Dims[2] = kd->GlobalDims[2] = ld->Dims[2];
 641
 642         int * lDims = ld->Dims;
 643
 644         int lX = lDims[0];
 645         int lY = lDims[1];
 646         int lZ = lDims[2];
 647
 648         int nTotalCells = lX * lY * lZ;
 649         int nCells = ld->nFluid; // TODO: + padding
 650         int nFluid = ld->nFluid;
 651
 652         // TODO: check nCells/nFluid do not exceed 2^31. This actually has to be
 653         // done during lattice setup.
 654         kdl->nCells = nCells;
 655         kdl->nFluid = nFluid;
 656
 657         PdfT * pdfs[2];
 658
 659         int blk[3] = { 0 };
 660
 661         ParseParameters(params, blk);
 662
 663         if (blk[0] == 0) blk[0] = lX;
 664         if (blk[1] == 0) blk[1] = lY;
 665         if (blk[2] == 0) blk[2] = lZ;
 666
 667         printf("# blocking:            \t\tx: %3d y: %3d z: %3d\n", blk[0], blk[1], blk[2]);
 668
 669         unsigned long latByte      = nCells * sizeof(PdfT) * N_D3Q19;
 670         unsigned long latFluidByte = nFluid * sizeof(PdfT) * N_D3Q19;
 671         unsigned long latPadByte   = (nCells - nFluid) * sizeof(PdfT) * N_D3Q19;
 672
 673         printf("# Lattice Array Allocation:\n");
 674         printf("#   lattice size:      \t\t%e MiB\n", latByte      / 1024.0 / 1024.0);
 675         printf("#   fluid lattice size:\t\t%e MiB\n", latFluidByte / 1024.0 / 1024.0);
 676         printf("#   lattice padding:   \t\t%e MiB\n", latPadByte   / 1024.0 / 1024.0);
 677
 678
 679         printf("#   alignment:         \t\t%d b\n", PAGE_4K);
 680
 681         if (PDF_ALLOCATOR((void **)&pdfs[0], latFluidByte, PAGE_4K)) {
 682                 printf("ERROR: allocating PDF array with %s() failed: %lu bytes.\n", STRINGIFY(PDF_ALLOCATOR), latFluidByte);
 683                 exit(1);
 684         }
 685         else {
 686                 printf("#   allocator: \t\t\t%s()\n", STRINGIFY(PDF_ALLOCATOR));
 687         }
 688
 689         kd->Pdfs[0] = pdfs[0];
 690
 691         // Initialize PDFs with some (arbitrary) data for correct NUMA placement.
 692         // Here we touch only the fluid nodes as this loop is OpenMP parallel and
 693         // we want the same scheduling as in the kernel.
 694         #ifdef _OPENMP
 695                 #pragma omp parallel for
 696         #endif
 697         for (int i = 0; i < nFluid; ++i) { for(int d = 0; d < N_D3Q19; ++d) {
 698                 pdfs[0][P_INDEX_3(nCells, i, d)] = 1.0;
 699         } }
 700
 701         // Initialize all PDFs to some standard value.
 702         for (int i = 0; i < nFluid; ++i) { for(int d = 0; d < N_D3Q19; ++d) {
 703                 pdfs[0][P_INDEX_3(nCells, i, d)] = 0.0;
 704         } }
 705
 706         // ----------------------------------------------------------------------
 707         // create grid which will hold the index numbers of the fluid nodes
 708
 709         uint32_t * grid;
 710
 711         if (MemAlloc((void **)&grid, nTotalCells * sizeof(uint32_t))) {
 712                 printf("ERROR: allocating grid for numbering failed: %lu bytes.\n", nTotalCells * sizeof(uint32_t));
 713                 exit(1);
 714         }
 715         kdl->Grid = grid;
 716
 717         int latticeIndex;
 718
 719 #ifdef DEBUG
 720         for(int z = 0; z < lZ; ++z) {
 721                 for(int y = 0; y < lY; ++y) {
 722                         for(int x = 0; x < lX; ++x) {
 723
 724                                 latticeIndex = L_INDEX_4(ld->Dims, x, y, z);
 725
 726                                 grid[latticeIndex] = ~0;
 727                         }
 728                 }
 729         }
 730 #endif
 731
 732         // ----------------------------------------------------------------------
 733         // generate numbering over grid
 734
 735         uint32_t * coords;
 736
 737         if (MemAlloc((void **)&coords, nFluid * sizeof(uint32_t) * 3)) {
 738                 printf("ERROR: allocating coords array failed: %lu bytes.\n", nFluid * sizeof(uint32_t) * 3);
 739                 exit(1);
 740         }
 741
 742         kdl->Coords = coords;
 743
 744         // Index for the PDF nodes can start at 0 as we distinguish solid and fluid nodes
 745         // through the ld->Lattice array.
 746         int counter = 0;
 747
 748         // Blocking is implemented via setup of the adjacency list. The kernel later will
 749         // walk through the lattice blocked automatically.
 750         for (int bZ = 0; bZ < lZ; bZ += blk[2]) {
 751         for (int bY = 0; bY < lY; bY += blk[1]) {
 752         for (int bX = 0; bX < lX; bX += blk[0]) {
 753
 754                 int eX = MIN(bX + blk[0], lX);
 755                 int eY = MIN(bY + blk[1], lY);
 756                 int eZ = MIN(bZ + blk[2], lZ);
 757
 758
 759                 for (int z = bZ; z < eZ; ++z) {
 760                 for (int y = bY; y < eY; ++y) {
 761                 for (int x = bX; x < eX; ++x) {
 762
 763                         latticeIndex = L_INDEX_4(lDims, x, y, z);
 764
 765                         if (ld->Lattice[latticeIndex] != LAT_CELL_OBSTACLE) {
 766                                 grid[latticeIndex] = counter;
 767
 768                                 coords[C_INDEX_X(counter)] = x;
 769                                 coords[C_INDEX_Y(counter)] = y;
 770                                 coords[C_INDEX_Z(counter)] = z;
 771
 772                                 ++counter;
 773                         }
 774                 } } }
 775         } } }
 776
 777         Verify(counter == nFluid);
 778         uint32_t * adjList;
 779
 780         // AoSoA addressing for adjList needs padding for (nFluid % VSIZE) != 0
 781         unsigned long adjListBytes = nFluid * sizeof(int) * N_D3Q19_IDX;
 782
 783         printf("# Adjacency List Allocation:\n");
 784         printf("#   size:              \t\t%e MiB\n", adjListBytes / 1024.0 / 1024.0);
 785         printf("#   alignment:         \t\t%d b\n", PAGE_4K);
 786
 787         // AdjList only requires 18 instead of 19 entries per node, as
 788         // the center PDF needs no addressing.
 789         if (ADJ_LIST_ALLOCATOR((void **)&adjList, adjListBytes, PAGE_4K)) {
 790                 printf("ERROR: allocating adjList array with %s() failed: %lu bytes.\n", STRINGIFY(ADJ_LIST_ALLOCATOR), adjListBytes);
 791                 exit(1);
 792         }
 793         else {
 794                 printf("#   allocator: \t\t\t%s()\n", STRINGIFY(ADJ_LIST_ALLOCATOR));
 795         }
 796
 797         kdl->AdjList = adjList;
 798
 799         int x, y, z;
 800
 801         uint32_t neighborIndex;
 802         uint32_t dstIndex;
 803
 804         int nx, ny, nz, px, py, pz;
 805
 806         // Loop over all fluid nodes and compute the indices to the neighboring
 807         // PDFs for configured data layout (AoS/SoA).
 808         // Parallelized loop to ensure correct NUMA placement.
 809         // #ifdef _OPENMP  --> add line continuation
 810         //      #pragma omp parallel for default(none)
 811         //              shared(nFluid, nCells, coords, D3Q19_INV, D3Q19_X, D3Q19_Y, D3Q19_Z,
 812         //                              stderr,
 813         //                              lDims, grid, ld, lX, lY, lZ, adjList)
 814         //              private(x, y, z, nx, ny, nz, neighborIndex, dstIndex)
 815         // #endif
 816         for (int fluidBaseIndex = 0; fluidBaseIndex < nFluid; fluidBaseIndex+=VSIZE) {
 817
 818
 819                 // Loop over all directions except the center one.
 820                 for(int d = 0; d < N_D3Q19 - 1; ++d) {
 821                         Assert(d != D3Q19_C);
 822
 823                         for(int inChunkIndex = 0; (inChunkIndex < VSIZE) && ((fluidBaseIndex + inChunkIndex) < nFluid); ++inChunkIndex){
 824                                 int index = fluidBaseIndex + inChunkIndex;
 825
 826                                 Assert(index < nFluid);
 827
 828                                 x = coords[C_INDEX_X(index)];
 829                                 y = coords[C_INDEX_Y(index)];
 830                                 z = coords[C_INDEX_Z(index)];
 831
 832                                 Assert(x >= 0 && x < lX);
 833                                 Assert(y >= 0 && y < lY);
 834                                 Assert(z >= 0 && z < lZ);
 835
 836                                 Assert(ld->Lattice[L_INDEX_4(lDims, x, y, z)] != LAT_CELL_OBSTACLE);
 837
 838 #ifdef PROP_MODEL_PUSH
 839                                 nx = x + D3Q19_X[d];
 840                                 ny = y + D3Q19_Y[d];
 841                                 nz = z + D3Q19_Z[d];
 842
 843 #elif PROP_MODEL_PULL
 844                                 nx = x - D3Q19_X[d];
 845                                 ny = y - D3Q19_Y[d];
 846                                 nz = z - D3Q19_Z[d];
 847 #else
 848                                 #error No implementation for this PROP_MODEL_NAME.
 849 #endif
 850                                 // If the neighbor is outside the latice in X direction and we have a
 851                                 // periodic boundary then we need to wrap around.
 852                                 if (    ((nx < 0 || nx >= lX) && ld->PeriodicX) ||
 853                                                 ((ny < 0 || ny >= lY) && ld->PeriodicY) ||
 854                                                 ((nz < 0 || nz >= lZ) && ld->PeriodicZ)
 855                                    ){
 856                                         // x periodic
 857
 858                                         if (nx < 0) {
 859                                                 px = lX - 1;
 860                                         }
 861                                         else if (nx >= lX) {
 862                                                 px = 0;
 863                                         } else {
 864                                                 px = nx;
 865                                         }
 866                                         // y periodic
 867                                         if (ny < 0) {
 868                                                 py = lY - 1;
 869                                         }
 870                                         else if (ny >= lY) {
 871                                                 py = 0;
 872                                         } else {
 873                                                 py = ny;
 874                                         }
 875
 876                                         // z periodic
 877                                         if (nz < 0) {
 878                                                 pz = lZ - 1;
 879                                         }
 880                                         else if (nz >= lZ) {
 881                                                 pz = 0;
 882                                         } else {
 883                                                 pz = nz;
 884                                         }
 885
 886                                         if (ld->Lattice[L_INDEX_4(lDims, px, py, pz)] == LAT_CELL_OBSTACLE) {
 887                                                 dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
 888                                         }
 889                                         else {
 890                                                 neighborIndex = grid[L_INDEX_4(lDims, px, py, pz)];
 891
 892                                                 AssertMsg(neighborIndex != ~0, "Neighbor has no Index. (%d %d %d) direction %s (%d)\n", px, py, pz, D3Q19_NAMES[d], d);
 893
 894                                                 dstIndex = P_INDEX_3(nCells, neighborIndex, d);
 895                                         }
 896                                 }
 897                                 else if (nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) {
 898                                         dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
 899                                 }
 900                                 else if (ld->Lattice[L_INDEX_4(lDims, nx, ny, nz)] == LAT_CELL_OBSTACLE) {
 901                                         dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
 902                                 }
 903                                 else {
 904                                         neighborIndex = grid[L_INDEX_4(lDims, nx, ny, nz)];
 905
 906                                         Assert(neighborIndex != ~0);
 907
 908                                         dstIndex = P_INDEX_3(nCells, neighborIndex, d);
 909                                 }
 910
 911                                 Assert(dstIndex >= 0);
 912                                 Assert(dstIndex < nCells * N_D3Q19);
 913
 914                                 adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (d * VSIZE) + (index % VSIZE)] = dstIndex;
 915                         }
 916                 }
 917         }
 918
 919         /*
 920         printf("============\n");
 921         for(int baseIndex = 0; baseIndex < nFluid; baseIndex+=VSIZE){
 922                 for(int i = 0; i < VSIZE; ++i){
 923                         int index = baseIndex + i;
 924
 925                         printf("%d ", adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (0 * VSIZE) + (index % VSIZE)]);
 926                 }
 927                 printf("\n");
 928         }
 929         printf("============\n");
 930         */
 931
 932         int nThreads = 1;
 933
 934 #ifdef _OPENMP
 935         nThreads = omp_get_max_threads();
 936 #endif
 937
 938         SetupConsecNodes(ld, KDLR(kd), nThreads);
 939
 940         // Fill remaining KernelData structures
 941         kd->GetNode = GetNode;
 942         kd->SetNode = SetNode;
 943
 944         kd->BoundaryConditionsGetPdf = FNAME(BCGetPdf);
 945         kd->BoundaryConditionsSetPdf = FNAME(BCSetPdf);
 946
 947         kd->Kernel = FNAME(D3Q19ListAaPvGatherKernel);
 948
 949         kd->DstPdfs = NULL;
 950         kd->PdfsActive = kd->Pdfs[0];
 951
 952         return;
 953 }
 954
 955 void FNAME(D3Q19ListAaPvGatherDeinit)(LatticeDesc * ld, KernelData ** kernelData)
 956 {
 957         KernelDataListRia ** kdlr = (KernelDataListRia **)kernelData;
 958
 959         MemFree((void **)&((*kdlr)->ConsecNodes));
 960
 961         if ((*kdlr)->ConsecThreadIndices != NULL) {
 962                 MemFree((void **)&((*kdlr)->ConsecThreadIndices));
 963         }
 964
 965         KernelDataList ** kdl = (KernelDataList **)kernelData;
 966
 967         ADJ_LIST_FREE((void **)&((*kdl)->AdjList));
 968
 969         MemFree((void **)&((*kdl)->Coords));
 970         MemFree((void **)&((*kdl)->Grid));
 971
 972         PDF_FREE((void **)&((*kernelData)->Pdfs[0]));
 973
 974         MemFree((void **)kernelData);
 975         return;
 976 }
 977