merge with kernels from MH's master thesis
[LbmBenchmarkKernelsPublic.git] / src / BenchKernelD3Q19ListAaPvGatherCommon.c
1 // --------------------------------------------------------------------------
2 //
3 // Copyright
4 //   Markus Wittmann, 2016-2017
5 //   RRZE, University of Erlangen-Nuremberg, Germany
6 //   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
7 //
8 //   Viktor Haag, 2016
9 //   LSS, University of Erlangen-Nuremberg, Germany
10 //
11 //   Michael Hussnaetter, 2017-2018
12 //   University of Erlangen-Nuremberg, Germany
13 //   michael.hussnaetter -at- fau.de
14 //
15 //  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
16 //
17 //  LbmBenchKernels is free software: you can redistribute it and/or modify
18 //  it under the terms of the GNU General Public License as published by
19 //  the Free Software Foundation, either version 3 of the License, or
20 //  (at your option) any later version.
21 //
22 //  LbmBenchKernels is distributed in the hope that it will be useful,
23 //  but WITHOUT ANY WARRANTY; without even the implied warranty of
24 //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
25 //  GNU General Public License for more details.
26 //
27 //  You should have received a copy of the GNU General Public License
28 //  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
29 //
30 // --------------------------------------------------------------------------
31 #include "BenchKernelD3Q19ListAaPvGatherCommon.h"
32
33 #include "Memory.h"
34 #include "Vector.h"
35 #include "Vtk.h"
36
37 #include <math.h>
38
39 #ifdef _OPENMP
40         #include <omp.h>
41 #endif
42
43 #define PAGE_4K         4096
44
45 #if ALLOC_ADJ_LIST_IN_HBM == 1
46         #define ADJ_LIST_ALLOCATOR HbwAllocAligned
47         #define ADJ_LIST_FREE HbwFree
48 #else
49         #define ADJ_LIST_ALLOCATOR MemAllocAligned
50         #define ADJ_LIST_FREE MemFree
51 #endif
52
53 #if ALLOC_PDF_IN_HBM == 1
54         #define PDF_ALLOCATOR HbwAllocAligned
55         #define PDF_FREE HbwFree
56 #else
57         #define PDF_ALLOCATOR MemAllocAligned
58         #define PDF_FREE MemFree
59 #endif
60
61 // Forward definition.
62 void FNAME(D3Q19ListAaPvGatherKernel)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd);
63
64
65 // -----------------------------------------------------------------------
66 // Functions which are used as callback by the kernel to read or write
67 // PDFs and nodes.
68
69
70 static void FNAME(BCGetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT * pdf)
71 {
72         Assert(kd != NULL);
73         Assert(kd->PdfsActive != NULL);
74         Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
75         Assert(pdf != NULL);
76
77         Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
78         Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
79         Assert(dir >= 0); Assert(dir < N_D3Q19);
80
81         KernelDataList * kdl = KDL(kd);
82         uint32_t * adjList = kdl->AdjList;
83
84         if (kdl->Iteration % 2 == 0) {
85                 // Pdfs are stored inverse, local PDFs are located in remote nodes
86
87                 // getting node index
88                 uint32_t index = kdl->Grid[L_INDEX_4(kd->Dims, x, y, z)];
89
90                 if (dir != D3Q19_C) {
91                         #define ADJ_LIST(dir) adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (dir * VSIZE) + (index % VSIZE)]
92                         *pdf = kd->PdfsActive[ADJ_LIST(D3Q19_INV[dir])];
93                         #undef ADJ_LIST
94                 }
95                 else {
96                         *pdf = kd->PdfsActive[P_INDEX_3(kdl->nCells, index, dir)];
97                 }
98
99         }
100         else {
101                 *pdf = kd->PdfsActive[P_INDEX_5(kdl, x, y, z, dir)];
102         }
103
104         return;
105 }
106
107 static void FNAME(BCSetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT pdf)
108 {
109         Assert(kd != NULL);
110         Assert(kd->PdfsActive != NULL);
111         Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
112         Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
113         Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
114         Assert(dir >= 0); Assert(dir < N_D3Q19);
115
116         if (isnan(pdf)) {
117                 printf("ERROR: setting nan %d %d %d %d %s\n", x, y, z, dir, D3Q19_NAMES[dir]);
118                 DEBUG_BREAK_POINT();
119                 exit(1);
120         }
121
122         KernelDataList * kdl = KDL(kd);
123         uint32_t * adjList = kdl->AdjList;
124
125         if (kdl->Iteration % 2 == 0) {
126                 // Pdfs are stored inverse, local PDFs are located in remote nodes
127
128                 // getting node index
129                 uint32_t index = kdl->Grid[L_INDEX_4(kd->Dims, x, y, z)];
130
131                 if (dir != D3Q19_C) {
132                         #define ADJ_LIST(dir) adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (dir * VSIZE) + (index % VSIZE)]
133                         kd->PdfsActive[ADJ_LIST(D3Q19_INV[dir])] = pdf;
134                         #undef ADJ_LIST
135                 } else {
136                         kd->PdfsActive[P_INDEX_3(kdl->nCells, index, dir)] = pdf;
137                 }
138
139         } else {
140                 kd->PdfsActive[P_INDEX_5(kdl, x, y, z, dir)] = pdf;
141         }
142
143         return;
144 }
145
146
147 static void GetNode(KernelData * kd, int x, int y, int z, PdfT * pdfs)
148 {
149         Assert(kd != NULL);
150         Assert(kd->PdfsActive != NULL);
151         Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
152         Assert(pdfs != NULL);
153         Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
154         Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
155
156         KernelDataList * kdl = KDL(kd);
157         uint32_t * adjList = kdl->AdjList;
158
159         if(kdl->Iteration % 2 == 0){
160
161                 uint32_t index = kdl->Grid[L_INDEX_4(kdl->kd.Dims, x, y, z)];
162
163                 // Load PDFs of local cell: pdf_N = src[adjList[adjListIndex + D3Q19_S]]; ...
164                 #define ADJ_LIST(dir) adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (dir * VSIZE) + (index % VSIZE)]
165                 #define X(name, idx, idxinv, _x, _y, _z)        pdfs[idx] = kd->PdfsActive[ADJ_LIST(idxinv)];
166                         D3Q19_LIST_WO_C
167                 #undef X
168                 #undef ADJ_LIST
169                 pdfs[D3Q19_C] = kd->PdfsActive[P_INDEX_3(kdl->nCells, index, D3Q19_C)];
170
171         } else {
172
173                 #define I(x, y, z, dir) P_INDEX_5(KDL(kd), (x), (y), (z), (dir))
174                 #define X(name, idx, idxinv, _x, _y, _z)        pdfs[idx] = kd->PdfsActive[I(x, y, z, idx)];
175                         D3Q19_LIST
176                 #undef X
177                 #undef I
178
179         }
180
181         for (int d = 0; d < 19; ++d) {
182                 if(isnan(pdfs[d]) || isinf(pdfs[d])) {
183                         printf("%d %d %d %d nan! get node\n", x, y, z, d);
184                         for (int d2 = 0; d2 < 19; ++d2) {
185                                 printf("%d: %e\n", d2, pdfs[d2]);
186                         }
187                         exit(1);
188                 }
189         }
190
191         return;
192 }
193
194
195 static void SetNode(KernelData * kd, int x, int y, int z, PdfT * pdfs)
196 {
197         Assert(kd != NULL);
198         Assert(kd->PdfsActive != NULL);
199         Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
200         Assert(pdfs != NULL);
201
202         Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
203         Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
204
205         for (int d = 0; d < 19; ++d) {
206                 if(isnan(pdfs[d])) {
207                         printf("%d %d %d %d nan! get node\n", x, y, z, d);
208                         for (int d2 = 0; d2 < 19; ++d2) {
209                                 printf("%d: %e\n", d2, pdfs[d2]);
210                         }
211                         exit(1);
212                 }
213         }
214
215         KernelDataList * kdl = KDL(kd);
216         uint32_t * adjList = kdl->AdjList;
217
218         if(kdl->Iteration % 2 == 0){
219
220                 uint32_t index = kdl->Grid[L_INDEX_4(kdl->kd.Dims, x, y, z)];
221
222                 // Load PDFs of local cell: pdf_N = src[adjList[adjListIndex + D3Q19_S]]; ...
223                 kd->PdfsActive[P_INDEX_3(kdl->nCells, index, D3Q19_C)] = pdfs[D3Q19_C];
224
225                 #define ADJ_LIST(dir) adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (dir * VSIZE) + (index % VSIZE)]
226                 #define X(name, idx, idxinv, _x, _y, _z)        kd->PdfsActive[ADJ_LIST(idxinv)] = pdfs[idx];
227                         D3Q19_LIST_WO_C
228                 #undef X
229                 #undef ADJ_LIST
230
231         } else {
232
233                 #define I(x, y, z, dir) P_INDEX_5(KDL(kd), (x), (y), (z), (dir))
234                 #define X(name, idx, idxinv, _x, _y, _z)        kd->PdfsActive[I(x, y, z, idx)] = pdfs[idx];
235                         D3Q19_LIST
236                 #undef X
237                 #undef I
238
239         }
240
241         return;
242 }
243
244 static void ParameterUsage()
245 {
246         printf("Kernel parameters:\n");
247         printf("  [-blk <n>] [-blk-[xyz] <n>]\n");
248
249         return;
250 }
251
252 static void ParseParameters(Parameters * params, int * blk)
253 {
254         Assert(blk != NULL);
255
256         blk[0] = 0; blk[1] = 0; blk[2] = 0;
257
258         #define ARG_IS(param)                   (!strcmp(params->KernelArgs[i], param))
259         #define NEXT_ARG_PRESENT() \
260                 do { \
261                         if (i + 1 >= params->nKernelArgs) { \
262                                 printf("ERROR: argument %s requires a parameter.\n", params->KernelArgs[i]); \
263                                 exit(1); \
264                         } \
265                 } while (0)
266
267
268         for (int i = 0; i < params->nKernelArgs; ++i) {
269                 if (ARG_IS("-blk") || ARG_IS("--blk")) {
270                         NEXT_ARG_PRESENT();
271
272                         int tmp = strtol(params->KernelArgs[++i], NULL, 0);
273
274                         if (tmp <= 0) {
275                                 printf("ERROR: blocking parameter must be > 0.\n");
276                                 exit(1);
277                         }
278
279                         blk[0] = blk[1] = blk[2] = tmp;
280                 }
281                 else if (ARG_IS("-blk-x") || ARG_IS("--blk-x")) {
282                         NEXT_ARG_PRESENT();
283
284                         int tmp = strtol(params->KernelArgs[++i], NULL, 0);
285
286                         if (tmp <= 0) {
287                                 printf("ERROR: blocking parameter must be > 0.\n");
288                                 exit(1);
289                         }
290
291                         blk[0] = tmp;
292                 }
293                 else if (ARG_IS("-blk-y") || ARG_IS("--blk-y")) {
294                         NEXT_ARG_PRESENT();
295
296                         int tmp = strtol(params->KernelArgs[++i], NULL, 0);
297
298                         if (tmp <= 0) {
299                                 printf("ERROR: blocking parameter must be > 0.\n");
300                                 exit(1);
301                         }
302
303                         blk[1] = tmp;
304                 }
305                 else if (ARG_IS("-blk-z") || ARG_IS("--blk-z")) {
306                         NEXT_ARG_PRESENT();
307
308                         int tmp = strtol(params->KernelArgs[++i], NULL, 0);
309
310                         if (tmp <= 0) {
311                                 printf("ERROR: blocking parameter must be > 0.\n");
312                                 exit(1);
313                         }
314
315                         blk[2] = tmp;
316                 }
317                 else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) {
318                         ParameterUsage();
319                         exit(1);
320                 }
321                 else {
322                         printf("ERROR: unknown kernel parameter.\n");
323                         ParameterUsage();
324                         exit(1);
325                 }
326         }
327
328         #undef ARG_IS
329         #undef NEXT_ARG_PRESENT
330
331         return;
332 }
333
334 static void SetupConsecNodes(LatticeDesc * ld, KernelDataListRia * kdlr, int nThreads)
335 {
336         Assert(ld != NULL);
337         Assert(kdlr != NULL);
338         Assert(nThreads > 0);
339
340         uint32_t * adjList = kdlr->kdl.AdjList;
341
342         uint32_t nConsecNodes = 0;
343         uint32_t consecIndex = 0;
344
345         int nFluid = kdlr->kdl.nFluid;
346
347         uint32_t * consecThreadIndices = (uint32_t *)malloc(sizeof(uint32_t) * (nThreads + 1));
348
349         int nNodesPerThread = nFluid / nThreads;
350
351         for (int i = 0; i < nThreads; ++i) {
352                 consecThreadIndices[i] = i * nNodesPerThread + MinI(i, nFluid % nThreads);
353         }
354         consecThreadIndices[nThreads] = nFluid;
355
356         int indexThread = 1;
357         int similarPatterns = 0;
358         int wasLastChunkThreadBoundary = 0;
359         // We execute following code two times.
360         // - The first time to get the count of how many entries we need for the
361         //   consecNodes array.
362         // - The second time to fill the array.
363
364         // Loop over adjacency list of all nodes.
365     // Compare if adjacent nodes share the same access pattern.
366         for (int fluidBaseIndex = VSIZE; fluidBaseIndex < nFluid; fluidBaseIndex += VSIZE) {
367
368                 int hasSimilarAccessPattern = 1;
369
370                 // Loop over all directions except the center one.
371                 for(int d = 0; d < N_D3Q19 - 1; ++d) {
372                         Assert(d != D3Q19_C);
373
374                         // check if cache line itself has consecutive memory access pattern
375                         for(int inChunkIndex = 0; (inChunkIndex < VSIZE) && ((fluidBaseIndex + inChunkIndex) < nFluid); ++inChunkIndex){
376                                 int index = fluidBaseIndex + inChunkIndex;
377
378                                 Assert(index < nFluid);
379
380                                 #define ADJ_LIST(idx, dir) adjList[((idx) - ((idx) % VSIZE)) * N_D3Q19_IDX + ((dir) * VSIZE) + ((idx) % VSIZE)]
381                                 //if (adjList[index * N_D3Q19_IDX + d] != adjList[(index - 1) * N_D3Q19_IDX + d] + 1)
382                                 if (ADJ_LIST(index, d) != ADJ_LIST(index-VSIZE, d) + VSIZE) {
383                                         //printf("different @: ADJ_LST(%d,%d)=%d != %d=ADJ_LST(%d, %d) + VSIZE\n", index, d, ADJ_LIST(index,d), ADJ_LIST(index-VSIZE,d) + VSIZE, index-VSIZE, d);
384                                         // Different access pattern.
385                                         hasSimilarAccessPattern = 0;
386                                         break;
387                                 }
388                                 #undef ADJ_LIST
389                         }
390
391                         if(!hasSimilarAccessPattern){
392                                 break; //exit from nested loop
393                         }
394                 }
395
396                 long threadBoundaryIndex = consecThreadIndices[indexThread];
397                 if (fluidBaseIndex <= threadBoundaryIndex &&
398                                 threadBoundaryIndex < fluidBaseIndex + VSIZE) {
399                         // Current chunk contains thread boundary.
400                         // These chunks are treated by scalar peel and reminder loops
401                         // in kernel of every thread to ensure VSIZE aligned access to
402                         // adjacency list
403
404                         // final cells of current thread
405                         ++consecIndex;
406
407                         // first cells of next thread
408                         ++indexThread;
409                         ++consecIndex;
410
411                         wasLastChunkThreadBoundary = 1;
412                 }
413                 else {
414                         // We are not at a thread boundary
415                         if (hasSimilarAccessPattern && !wasLastChunkThreadBoundary){
416                                 ++similarPatterns;
417                         }
418                         else {
419                                 ++consecIndex;
420                         }
421
422                         wasLastChunkThreadBoundary = 0;
423
424                         /*
425                         if (!hasSimilarAccessPattern) {
426                                 ++consecIndex;
427                         }
428                         else {
429                                 ++similarPatterns;
430                         }
431                         */
432                 }
433         }
434
435         if (nFluid > 0) {
436                 nConsecNodes = consecIndex + 1;
437         }
438
439         uint32_t * consecNodes;
440         MemAlloc((void **)&consecNodes, sizeof(uint32_t) * nConsecNodes);
441
442         unsigned long consecNodesByte = (nConsecNodes) * sizeof(uint32_t);
443
444         printf("# Consec. Nodes Array Allocation:\n");
445         printf("#   similar patterns\t\t%d\n", similarPatterns);
446         printf("#   elements:      \t\t%d\n",   nConsecNodes);
447         printf("#   size:          \t\t%e MiB\n", consecNodesByte / 1024.0 / 1024.0);
448         printf("#   alignment:     \t\t%d b\n",   PAGE_4K);
449
450         if (MemAllocAligned((void **)&consecNodesByte, consecNodesByte, PAGE_4K)) {
451                         printf("ERROR: allocating consecNodes array with MemAllocAligned failed: %lu bytes.\n", consecNodesByte);
452                         exit(1);
453         }
454         else {
455                 printf("#   allocator: \t\t\tMemAllocAligned()\n");
456         }
457
458         consecIndex = 0;
459
460         if (nFluid > 0) {
461                 consecNodes[consecIndex] = VSIZE;
462         }
463
464         indexThread = 1;
465         consecThreadIndices[0] = 0;
466
467         //add first chunk manually to enable backward check for consecutive pattern
468         consecNodes[consecIndex] = VSIZE;
469
470         wasLastChunkThreadBoundary = 0;
471
472         // Loop over adjacency list of all nodes.
473     // Compare if access pattern does not change on chunk level
474         // Since gather instructions are used, access pattern may not be consecutive
475         for (int fluidBaseIndex = VSIZE; fluidBaseIndex < nFluid; fluidBaseIndex += VSIZE) {
476
477                 int hasSimilarAccessPattern = 1;
478
479                 // Loop over all directions except the center one.
480                 for(int d = 0; d < N_D3Q19 - 1; ++d) {
481                         Assert(d != D3Q19_C);
482
483                         // check if cache line itself has consecutive memory access pattern
484                         for(int inChunkIndex = 0; (inChunkIndex < VSIZE) && ((fluidBaseIndex + inChunkIndex) < nFluid); ++inChunkIndex){
485                                 int index = fluidBaseIndex + inChunkIndex;
486
487                                 Assert(index < nFluid);
488
489                                 #define ADJ_LIST(idx, dir) adjList[((idx) - ((idx) % VSIZE)) * N_D3Q19_IDX + ((dir) * VSIZE) + ((idx) % VSIZE)]
490                                 //if (adjList[index * N_D3Q19_IDX + d] != adjList[(index - 1) * N_D3Q19_IDX + d] + 1)
491                                 if (ADJ_LIST(index, d) != ADJ_LIST(index-VSIZE, d) + VSIZE) {
492                                         // Different access pattern.
493                                         hasSimilarAccessPattern = 0;
494                                         break;
495                                 }
496                                 #undef ADJ_LIST
497                         }
498
499                         if(!hasSimilarAccessPattern){
500                                 break; //exit from nested loop
501                         }
502                 }
503
504                 long threadBoundaryIndex = consecThreadIndices[indexThread];
505                 if (fluidBaseIndex <= threadBoundaryIndex &&
506                                 threadBoundaryIndex < fluidBaseIndex + VSIZE) {
507                         // Current chunk contains thread boundary.
508                         // These chunks are treated by scalar peel and reminder loops
509                         // in kernel of every thread to ensure VSIZE aligned access to
510                         // adjacency list
511
512                         // final cells of current thread
513                         ++consecIndex;
514                         //consecThreadIndices[indexThread] = consecIndex;
515                         consecNodes[consecIndex] = threadBoundaryIndex - fluidBaseIndex;
516
517
518                         // first cells of next thread
519                         ++consecIndex;
520                         consecThreadIndices[indexThread] = consecIndex;
521                         consecNodes[consecIndex] = (fluidBaseIndex + VSIZE) - threadBoundaryIndex;
522                         ++indexThread;
523
524                         wasLastChunkThreadBoundary = 1;
525
526                 }
527                 else {
528                         // We are not at a thread boundary
529                         if (hasSimilarAccessPattern && !wasLastChunkThreadBoundary){
530                                 Assert(consecIndex < nConsecNodes);
531                                 consecNodes[consecIndex] += VSIZE;
532                         }
533                         else {
534                                 ++consecIndex;
535                                 Assert(consecIndex < nConsecNodes);
536                                 consecNodes[consecIndex] = VSIZE;
537                         }
538
539                         /*
540                         if (!hasSimilarAccessPattern) {
541                                 ++consecIndex;
542                                 Assert(consecIndex < nConsecNodes);
543                                 consecNodes[consecIndex] = VSIZE;
544                         }
545                         else {
546                                 Assert(consecIndex < nConsecNodes);
547                                 consecNodes[consecIndex] += VSIZE;
548                         }
549                         */
550                         wasLastChunkThreadBoundary = 0;
551
552                 }
553         }
554
555         /*
556         printf("consecNodes:\n");
557         for(int i = 0; i < nConsecNodes + 5; ++i){
558                 printf("%d ", consecNodes[i]);
559         }
560         printf("\n");
561         */
562         /*
563         printf("consecThreadIndices:\n");
564         for(int i = 0; i < nThreads + 5; ++i){
565                 printf("%d ", consecThreadIndices[i]);
566         }
567         printf("\n");
568         */
569
570         kdlr->ConsecNodes = consecNodes;
571         kdlr->nConsecNodes = nConsecNodes;
572
573         kdlr->ConsecThreadIndices  = consecThreadIndices;
574         kdlr->nConsecThreadIndices = nThreads;
575
576         double loopBalanceEven = 2.0 * 19 * sizeof(PdfT);
577         //N_D3Q19 - 1: C lookup not required, +1: transfer of consecValue
578         double loopBalanceOdd  = 2.0 * 19 * sizeof(PdfT) + ((double)nConsecNodes *((N_D3Q19 - 1) * VSIZE + 1)) / nFluid * sizeof(int);
579         double loopBalance     = (loopBalanceEven + loopBalanceOdd) / 2.0;
580
581         kdlr->kdl.kd.LoopBalance = loopBalance;
582
583         printf("# loop balance:\n");
584         printf("#   even timestep:  \t\t%.2f B/FLUP\n", loopBalanceEven);
585         printf("#   odd timestep:   \t\t%.2f B/FLUP\n", loopBalanceOdd);
586         printf("#   average:        \t\t%.2f B/FLUP\n", loopBalance);
587
588         return;
589 }
590
591 void FNAME(D3Q19ListAaPvGatherInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
592 {
593         KernelData * kd;
594         KernelDataList * kdl;
595         KernelDataListRia * kdlr;
596         MemAlloc((void **)&kdlr, sizeof(KernelDataListRia));
597
598         kd = (KernelData *)kdlr;
599         kdl = KDL(kdlr);
600
601         *kernelData = kd;
602
603 #ifdef DEBUG
604         kd->Pdfs[0] = NULL;
605         kd->Pdfs[1] = NULL;
606         kd->PdfsActive = NULL;
607         kd->DstPdfs = NULL;
608         kd->SrcPdfs = NULL;
609         kd->Dims[0] = -1;
610         kd->Dims[1] = -1;
611         kd->Dims[2] = -1;
612         kd->GlobalDims[0] = -1;
613         kd->GlobalDims[1] = -1;
614         kd->GlobalDims[2] = -1;
615         kd->Offsets[0] = -1;
616         kd->Offsets[1] = -1;
617         kd->Offsets[2] = -1;
618
619         kd->ObstIndices = NULL;
620         kd->nObstIndices = -1;
621         kd->BounceBackPdfsSrc = NULL;
622         kd->BounceBackPdfsDst = NULL;
623         kd->nBounceBackPdfs = -1;
624
625         kdl->AdjList = NULL;
626         kdl->Coords = NULL;
627         kdl->Grid = NULL;
628         kdl->nCells = -1;
629         kdl->nFluid = -1;
630
631         kdlr->ConsecNodes = NULL;
632         kdlr->nConsecNodes = 0;
633         kdlr->ConsecThreadIndices = NULL;
634         kdlr->nConsecThreadIndices = 0;
635 #endif
636
637         // Ajust the dimensions according to padding, if used.
638         kd->Dims[0] = kd->GlobalDims[0] = ld->Dims[0];
639         kd->Dims[1] = kd->GlobalDims[1] = ld->Dims[1];
640         kd->Dims[2] = kd->GlobalDims[2] = ld->Dims[2];
641
642         int * lDims = ld->Dims;
643
644         int lX = lDims[0];
645         int lY = lDims[1];
646         int lZ = lDims[2];
647
648         int nTotalCells = lX * lY * lZ;
649         int nCells = ld->nFluid; // TODO: + padding
650         int nFluid = ld->nFluid;
651
652         // TODO: check nCells/nFluid do not exceed 2^31. This actually has to be
653         // done during lattice setup.
654         kdl->nCells = nCells;
655         kdl->nFluid = nFluid;
656
657         PdfT * pdfs[2];
658
659         int blk[3] = { 0 };
660
661         ParseParameters(params, blk);
662
663         if (blk[0] == 0) blk[0] = lX;
664         if (blk[1] == 0) blk[1] = lY;
665         if (blk[2] == 0) blk[2] = lZ;
666
667         printf("# blocking:            \t\tx: %3d y: %3d z: %3d\n", blk[0], blk[1], blk[2]);
668
669         unsigned long latByte      = nCells * sizeof(PdfT) * N_D3Q19;
670         unsigned long latFluidByte = nFluid * sizeof(PdfT) * N_D3Q19;
671         unsigned long latPadByte   = (nCells - nFluid) * sizeof(PdfT) * N_D3Q19;
672
673         printf("# Lattice Array Allocation:\n");
674         printf("#   lattice size:      \t\t%e MiB\n", latByte      / 1024.0 / 1024.0);
675         printf("#   fluid lattice size:\t\t%e MiB\n", latFluidByte / 1024.0 / 1024.0);
676         printf("#   lattice padding:   \t\t%e MiB\n", latPadByte   / 1024.0 / 1024.0);
677
678
679         printf("#   alignment:         \t\t%d b\n", PAGE_4K);
680
681         if (PDF_ALLOCATOR((void **)&pdfs[0], latFluidByte, PAGE_4K)) {
682                 printf("ERROR: allocating PDF array with %s() failed: %lu bytes.\n", STRINGIFY(PDF_ALLOCATOR), latFluidByte);
683                 exit(1);
684         }
685         else {
686                 printf("#   allocator: \t\t\t%s()\n", STRINGIFY(PDF_ALLOCATOR));
687         }
688
689         kd->Pdfs[0] = pdfs[0];
690
691         // Initialize PDFs with some (arbitrary) data for correct NUMA placement.
692         // Here we touch only the fluid nodes as this loop is OpenMP parallel and
693         // we want the same scheduling as in the kernel.
694         #ifdef _OPENMP
695                 #pragma omp parallel for
696         #endif
697         for (int i = 0; i < nFluid; ++i) { for(int d = 0; d < N_D3Q19; ++d) {
698                 pdfs[0][P_INDEX_3(nCells, i, d)] = 1.0;
699         } }
700
701         // Initialize all PDFs to some standard value.
702         for (int i = 0; i < nFluid; ++i) { for(int d = 0; d < N_D3Q19; ++d) {
703                 pdfs[0][P_INDEX_3(nCells, i, d)] = 0.0;
704         } }
705
706         // ----------------------------------------------------------------------
707         // create grid which will hold the index numbers of the fluid nodes
708
709         uint32_t * grid;
710
711         if (MemAlloc((void **)&grid, nTotalCells * sizeof(uint32_t))) {
712                 printf("ERROR: allocating grid for numbering failed: %lu bytes.\n", nTotalCells * sizeof(uint32_t));
713                 exit(1);
714         }
715         kdl->Grid = grid;
716
717         int latticeIndex;
718
719 #ifdef DEBUG
720         for(int z = 0; z < lZ; ++z) {
721                 for(int y = 0; y < lY; ++y) {
722                         for(int x = 0; x < lX; ++x) {
723
724                                 latticeIndex = L_INDEX_4(ld->Dims, x, y, z);
725
726                                 grid[latticeIndex] = ~0;
727                         }
728                 }
729         }
730 #endif
731
732         // ----------------------------------------------------------------------
733         // generate numbering over grid
734
735         uint32_t * coords;
736
737         if (MemAlloc((void **)&coords, nFluid * sizeof(uint32_t) * 3)) {
738                 printf("ERROR: allocating coords array failed: %lu bytes.\n", nFluid * sizeof(uint32_t) * 3);
739                 exit(1);
740         }
741
742         kdl->Coords = coords;
743
744         // Index for the PDF nodes can start at 0 as we distinguish solid and fluid nodes
745         // through the ld->Lattice array.
746         int counter = 0;
747
748         // Blocking is implemented via setup of the adjacency list. The kernel later will
749         // walk through the lattice blocked automatically.
750         for (int bZ = 0; bZ < lZ; bZ += blk[2]) {
751         for (int bY = 0; bY < lY; bY += blk[1]) {
752         for (int bX = 0; bX < lX; bX += blk[0]) {
753
754                 int eX = MIN(bX + blk[0], lX);
755                 int eY = MIN(bY + blk[1], lY);
756                 int eZ = MIN(bZ + blk[2], lZ);
757
758
759                 for (int z = bZ; z < eZ; ++z) {
760                 for (int y = bY; y < eY; ++y) {
761                 for (int x = bX; x < eX; ++x) {
762
763                         latticeIndex = L_INDEX_4(lDims, x, y, z);
764
765                         if (ld->Lattice[latticeIndex] != LAT_CELL_OBSTACLE) {
766                                 grid[latticeIndex] = counter;
767
768                                 coords[C_INDEX_X(counter)] = x;
769                                 coords[C_INDEX_Y(counter)] = y;
770                                 coords[C_INDEX_Z(counter)] = z;
771
772                                 ++counter;
773                         }
774                 } } }
775         } } }
776
777         Verify(counter == nFluid);
778         uint32_t * adjList;
779
780         // AoSoA addressing for adjList needs padding for (nFluid % VSIZE) != 0
781         unsigned long adjListBytes = nFluid * sizeof(int) * N_D3Q19_IDX;
782
783         printf("# Adjacency List Allocation:\n");
784         printf("#   size:              \t\t%e MiB\n", adjListBytes / 1024.0 / 1024.0);
785         printf("#   alignment:         \t\t%d b\n", PAGE_4K);
786
787         // AdjList only requires 18 instead of 19 entries per node, as
788         // the center PDF needs no addressing.
789         if (ADJ_LIST_ALLOCATOR((void **)&adjList, adjListBytes, PAGE_4K)) {
790                 printf("ERROR: allocating adjList array with %s() failed: %lu bytes.\n", STRINGIFY(ADJ_LIST_ALLOCATOR), adjListBytes);
791                 exit(1);
792         }
793         else {
794                 printf("#   allocator: \t\t\t%s()\n", STRINGIFY(ADJ_LIST_ALLOCATOR));
795         }
796
797         kdl->AdjList = adjList;
798
799         int x, y, z;
800
801         uint32_t neighborIndex;
802         uint32_t dstIndex;
803
804         int nx, ny, nz, px, py, pz;
805
806         // Loop over all fluid nodes and compute the indices to the neighboring
807         // PDFs for configured data layout (AoS/SoA).
808         // Parallelized loop to ensure correct NUMA placement.
809         // #ifdef _OPENMP  --> add line continuation
810         //      #pragma omp parallel for default(none)
811         //              shared(nFluid, nCells, coords, D3Q19_INV, D3Q19_X, D3Q19_Y, D3Q19_Z,
812         //                              stderr,
813         //                              lDims, grid, ld, lX, lY, lZ, adjList)
814         //              private(x, y, z, nx, ny, nz, neighborIndex, dstIndex)
815         // #endif
816         for (int fluidBaseIndex = 0; fluidBaseIndex < nFluid; fluidBaseIndex+=VSIZE) {
817
818
819                 // Loop over all directions except the center one.
820                 for(int d = 0; d < N_D3Q19 - 1; ++d) {
821                         Assert(d != D3Q19_C);
822
823                         for(int inChunkIndex = 0; (inChunkIndex < VSIZE) && ((fluidBaseIndex + inChunkIndex) < nFluid); ++inChunkIndex){
824                                 int index = fluidBaseIndex + inChunkIndex;
825
826                                 Assert(index < nFluid);
827
828                                 x = coords[C_INDEX_X(index)];
829                                 y = coords[C_INDEX_Y(index)];
830                                 z = coords[C_INDEX_Z(index)];
831
832                                 Assert(x >= 0 && x < lX);
833                                 Assert(y >= 0 && y < lY);
834                                 Assert(z >= 0 && z < lZ);
835
836                                 Assert(ld->Lattice[L_INDEX_4(lDims, x, y, z)] != LAT_CELL_OBSTACLE);
837
838 #ifdef PROP_MODEL_PUSH
839                                 nx = x + D3Q19_X[d];
840                                 ny = y + D3Q19_Y[d];
841                                 nz = z + D3Q19_Z[d];
842
843 #elif PROP_MODEL_PULL
844                                 nx = x - D3Q19_X[d];
845                                 ny = y - D3Q19_Y[d];
846                                 nz = z - D3Q19_Z[d];
847 #else
848                                 #error No implementation for this PROP_MODEL_NAME.
849 #endif
850                                 // If the neighbor is outside the latice in X direction and we have a
851                                 // periodic boundary then we need to wrap around.
852                                 if (    ((nx < 0 || nx >= lX) && ld->PeriodicX) ||
853                                                 ((ny < 0 || ny >= lY) && ld->PeriodicY) ||
854                                                 ((nz < 0 || nz >= lZ) && ld->PeriodicZ)
855                                    ){
856                                         // x periodic
857
858                                         if (nx < 0) {
859                                                 px = lX - 1;
860                                         }
861                                         else if (nx >= lX) {
862                                                 px = 0;
863                                         } else {
864                                                 px = nx;
865                                         }
866                                         // y periodic
867                                         if (ny < 0) {
868                                                 py = lY - 1;
869                                         }
870                                         else if (ny >= lY) {
871                                                 py = 0;
872                                         } else {
873                                                 py = ny;
874                                         }
875
876                                         // z periodic
877                                         if (nz < 0) {
878                                                 pz = lZ - 1;
879                                         }
880                                         else if (nz >= lZ) {
881                                                 pz = 0;
882                                         } else {
883                                                 pz = nz;
884                                         }
885
886                                         if (ld->Lattice[L_INDEX_4(lDims, px, py, pz)] == LAT_CELL_OBSTACLE) {
887                                                 dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
888                                         }
889                                         else {
890                                                 neighborIndex = grid[L_INDEX_4(lDims, px, py, pz)];
891
892                                                 AssertMsg(neighborIndex != ~0, "Neighbor has no Index. (%d %d %d) direction %s (%d)\n", px, py, pz, D3Q19_NAMES[d], d);
893
894                                                 dstIndex = P_INDEX_3(nCells, neighborIndex, d);
895                                         }
896                                 }
897                                 else if (nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) {
898                                         dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
899                                 }
900                                 else if (ld->Lattice[L_INDEX_4(lDims, nx, ny, nz)] == LAT_CELL_OBSTACLE) {
901                                         dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
902                                 }
903                                 else {
904                                         neighborIndex = grid[L_INDEX_4(lDims, nx, ny, nz)];
905
906                                         Assert(neighborIndex != ~0);
907
908                                         dstIndex = P_INDEX_3(nCells, neighborIndex, d);
909                                 }
910
911                                 Assert(dstIndex >= 0);
912                                 Assert(dstIndex < nCells * N_D3Q19);
913
914                                 adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (d * VSIZE) + (index % VSIZE)] = dstIndex;
915                         }
916                 }
917         }
918
919         /*
920         printf("============\n");
921         for(int baseIndex = 0; baseIndex < nFluid; baseIndex+=VSIZE){
922                 for(int i = 0; i < VSIZE; ++i){
923                         int index = baseIndex + i;
924
925                         printf("%d ", adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (0 * VSIZE) + (index % VSIZE)]);
926                 }
927                 printf("\n");
928         }
929         printf("============\n");
930         */
931
932         int nThreads = 1;
933
934 #ifdef _OPENMP
935         nThreads = omp_get_max_threads();
936 #endif
937
938         SetupConsecNodes(ld, KDLR(kd), nThreads);
939
940         // Fill remaining KernelData structures
941         kd->GetNode = GetNode;
942         kd->SetNode = SetNode;
943
944         kd->BoundaryConditionsGetPdf = FNAME(BCGetPdf);
945         kd->BoundaryConditionsSetPdf = FNAME(BCSetPdf);
946
947         kd->Kernel = FNAME(D3Q19ListAaPvGatherKernel);
948
949         kd->DstPdfs = NULL;
950         kd->PdfsActive = kd->Pdfs[0];
951
952         return;
953 }
954
955 void FNAME(D3Q19ListAaPvGatherDeinit)(LatticeDesc * ld, KernelData ** kernelData)
956 {
957         KernelDataListRia ** kdlr = (KernelDataListRia **)kernelData;
958
959         MemFree((void **)&((*kdlr)->ConsecNodes));
960
961         if ((*kdlr)->ConsecThreadIndices != NULL) {
962                 MemFree((void **)&((*kdlr)->ConsecThreadIndices));
963         }
964
965         KernelDataList ** kdl = (KernelDataList **)kernelData;
966
967         ADJ_LIST_FREE((void **)&((*kdl)->AdjList));
968
969         MemFree((void **)&((*kdl)->Coords));
970         MemFree((void **)&((*kdl)->Grid));
971
972         PDF_FREE((void **)&((*kernelData)->Pdfs[0]));
973
974         MemFree((void **)kernelData);
975         return;
976 }
977
This page took 0.288637 seconds and 4 git commands to generate.