add citation information
[LbmBenchmarkKernelsPublic.git] / src / BenchKernelD3Q19ListAaPvGatherAoSoACommon.c
CommitLineData
8cafd9ea
MW
1// --------------------------------------------------------------------------
2//
3// Copyright
4// Markus Wittmann, 2016-2017
5// RRZE, University of Erlangen-Nuremberg, Germany
6// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
7//
8// Viktor Haag, 2016
9// LSS, University of Erlangen-Nuremberg, Germany
10//
11// Michael Hussnaetter, 2017-2018
12// University of Erlangen-Nuremberg, Germany
13// michael.hussnaetter -at- fau.de
14//
15// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
16//
17// LbmBenchKernels is free software: you can redistribute it and/or modify
18// it under the terms of the GNU General Public License as published by
19// the Free Software Foundation, either version 3 of the License, or
20// (at your option) any later version.
21//
22// LbmBenchKernels is distributed in the hope that it will be useful,
23// but WITHOUT ANY WARRANTY; without even the implied warranty of
24// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25// GNU General Public License for more details.
26//
27// You should have received a copy of the GNU General Public License
28// along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>.
29//
30// --------------------------------------------------------------------------
31#include "BenchKernelD3Q19ListAaPvGatherAoSoACommon.h"
32
33#include "Memory.h"
34#include "Vector.h"
35#include "Vtk.h"
36
37#include <math.h>
38
39#ifdef _OPENMP
40 #include <omp.h>
41#endif
42
43#define PAGE_4K 4096
44
45#if ALLOC_ADJ_LIST_IN_HBM == 1
46 #define ADJ_LIST_ALLOCATOR HbwAllocAligned
47 #define ADJ_LIST_FREE HbwFree
48#else
49 #define ADJ_LIST_ALLOCATOR MemAllocAligned
50 #define ADJ_LIST_FREE MemFree
51#endif
52
53#if ALLOC_PDF_IN_HBM == 1
54 #define PDF_ALLOCATOR HbwAllocAligned
55 #define PDF_FREE HbwFree
56#else
57 #define PDF_ALLOCATOR MemAllocAligned
58 #define PDF_FREE MemFree
59#endif
60
61// Forward definition.
62void FNAME(D3Q19ListAaPvGatherAoSoAKernel)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd);
63
64
65// -----------------------------------------------------------------------
66// Functions which are used as callback by the kernel to read or write
67// PDFs and nodes.
68
69
70static void FNAME(BCGetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT * pdf)
71{
72 Assert(kd != NULL);
73 Assert(kd->PdfsActive != NULL);
74 Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
75 Assert(pdf != NULL);
76
77 Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
78 Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
79 Assert(dir >= 0); Assert(dir < N_D3Q19);
80
81 KernelDataList * kdl = KDL(kd);
82 uint32_t * adjList = kdl->AdjList;
83
84 if (kdl->Iteration % 2 == 0) {
85 // Pdfs are stored inverse, local PDFs are located in remote nodes
86
87 // getting node index
88 uint32_t index = kdl->Grid[L_INDEX_4(kd->Dims, x, y, z)];
89
90 if (dir != D3Q19_C) {
91 #define ADJ_LIST(dir) adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (dir * VSIZE) + (index % VSIZE)]
92 *pdf = kd->PdfsActive[ADJ_LIST(D3Q19_INV[dir])];
93 #undef ADJ_LIST
94 }
95 else {
96 *pdf = kd->PdfsActive[P_INDEX_3(kdl->nCells, index, dir)];
97 }
98
99 }
100 else {
101 *pdf = kd->PdfsActive[P_INDEX_5(kdl, x, y, z, dir)];
102 }
103
104 return;
105}
106
107static void FNAME(BCSetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT pdf)
108{
109 Assert(kd != NULL);
110 Assert(kd->PdfsActive != NULL);
111 Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
112 Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
113 Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
114 Assert(dir >= 0); Assert(dir < N_D3Q19);
115
116 if (isnan(pdf)) {
117 printf("ERROR: setting nan %d %d %d %d %s\n", x, y, z, dir, D3Q19_NAMES[dir]);
118 DEBUG_BREAK_POINT();
119 exit(1);
120 }
121
122 KernelDataList * kdl = KDL(kd);
123 uint32_t * adjList = kdl->AdjList;
124
125 if (kdl->Iteration % 2 == 0) {
126 // Pdfs are stored inverse, local PDFs are located in remote nodes
127
128 // getting node index
129 uint32_t index = kdl->Grid[L_INDEX_4(kd->Dims, x, y, z)];
130
131 if (dir != D3Q19_C) {
132 #define ADJ_LIST(dir) adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (dir * VSIZE) + (index % VSIZE)]
133 kd->PdfsActive[ADJ_LIST(D3Q19_INV[dir])] = pdf;
134 #undef ADJ_LIST
135 } else {
136 kd->PdfsActive[P_INDEX_3(kdl->nCells, index, dir)] = pdf;
137 }
138
139 } else {
140 kd->PdfsActive[P_INDEX_5(kdl, x, y, z, dir)] = pdf;
141 }
142
143 return;
144}
145
146
147static void GetNode(KernelData * kd, int x, int y, int z, PdfT * pdfs)
148{
149 Assert(kd != NULL);
150 Assert(kd->PdfsActive != NULL);
151 Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
152 Assert(pdfs != NULL);
153 Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
154 Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
155
156 KernelDataList * kdl = KDL(kd);
157 uint32_t * adjList = kdl->AdjList;
158
159 if(kdl->Iteration % 2 == 0){
160
161 uint32_t index = kdl->Grid[L_INDEX_4(kdl->kd.Dims, x, y, z)];
162
163 // Load PDFs of local cell: pdf_N = src[adjList[adjListIndex + D3Q19_S]]; ...
164 #define ADJ_LIST(dir) adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (dir * VSIZE) + (index % VSIZE)]
165 #define X(name, idx, idxinv, _x, _y, _z) pdfs[idx] = kd->PdfsActive[ADJ_LIST(idxinv)];
166 D3Q19_LIST_WO_C
167 #undef X
168 #undef ADJ_LIST
169 pdfs[D3Q19_C] = kd->PdfsActive[P_INDEX_3(kdl->nCells, index, D3Q19_C)];
170
171 } else {
172
173 #define I(x, y, z, dir) P_INDEX_5(KDL(kd), (x), (y), (z), (dir))
174 #define X(name, idx, idxinv, _x, _y, _z) pdfs[idx] = kd->PdfsActive[I(x, y, z, idx)];
175 D3Q19_LIST
176 #undef X
177 #undef I
178
179 }
180
181 for (int d = 0; d < 19; ++d) {
182 if(isnan(pdfs[d]) || isinf(pdfs[d])) {
183 printf("%d %d %d %d nan! get node\n", x, y, z, d);
184 for (int d2 = 0; d2 < 19; ++d2) {
185 printf("%d: %e\n", d2, pdfs[d2]);
186 }
187 exit(1);
188 }
189 }
190
191 return;
192}
193
194
195static void SetNode(KernelData * kd, int x, int y, int z, PdfT * pdfs)
196{
197 Assert(kd != NULL);
198 Assert(kd->PdfsActive != NULL);
199 Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
200 Assert(pdfs != NULL);
201
202 Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
203 Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
204
205 for (int d = 0; d < 19; ++d) {
206 if(isnan(pdfs[d])) {
207 printf("%d %d %d %d nan! get node\n", x, y, z, d);
208 for (int d2 = 0; d2 < 19; ++d2) {
209 printf("%d: %e\n", d2, pdfs[d2]);
210 }
211 exit(1);
212 }
213 }
214
215 KernelDataList * kdl = KDL(kd);
216 uint32_t * adjList = kdl->AdjList;
217
218 if(kdl->Iteration % 2 == 0){
219
220 uint32_t index = kdl->Grid[L_INDEX_4(kdl->kd.Dims, x, y, z)];
221
222 // Load PDFs of local cell: pdf_N = src[adjList[adjListIndex + D3Q19_S]]; ...
223 kd->PdfsActive[P_INDEX_3(kdl->nCells, index, D3Q19_C)] = pdfs[D3Q19_C];
224
225 #define ADJ_LIST(dir) adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (dir * VSIZE) + (index % VSIZE)]
226 #define X(name, idx, idxinv, _x, _y, _z) kd->PdfsActive[ADJ_LIST(idxinv)] = pdfs[idx];
227 D3Q19_LIST_WO_C
228 #undef X
229 #undef ADJ_LIST
230
231 } else {
232
233 #define I(x, y, z, dir) P_INDEX_5(KDL(kd), (x), (y), (z), (dir))
234 #define X(name, idx, idxinv, _x, _y, _z) kd->PdfsActive[I(x, y, z, idx)] = pdfs[idx];
235 D3Q19_LIST
236 #undef X
237 #undef I
238
239 }
240
241 return;
242}
243
244static void ParameterUsage()
245{
246 printf("Kernel parameters:\n");
247 printf(" [-blk <n>] [-blk-[xyz] <n>]\n");
248
249 return;
250}
251
252static void ParseParameters(Parameters * params, int * blk)
253{
254 Assert(blk != NULL);
255
256 blk[0] = 0; blk[1] = 0; blk[2] = 0;
257
258 #define ARG_IS(param) (!strcmp(params->KernelArgs[i], param))
259 #define NEXT_ARG_PRESENT() \
260 do { \
261 if (i + 1 >= params->nKernelArgs) { \
262 printf("ERROR: argument %s requires a parameter.\n", params->KernelArgs[i]); \
263 exit(1); \
264 } \
265 } while (0)
266
267
268 for (int i = 0; i < params->nKernelArgs; ++i) {
269 if (ARG_IS("-blk") || ARG_IS("--blk")) {
270 NEXT_ARG_PRESENT();
271
272 int tmp = strtol(params->KernelArgs[++i], NULL, 0);
273
274 if (tmp <= 0) {
275 printf("ERROR: blocking parameter must be > 0.\n");
276 exit(1);
277 }
278
279 blk[0] = blk[1] = blk[2] = tmp;
280 }
281 else if (ARG_IS("-blk-x") || ARG_IS("--blk-x")) {
282 NEXT_ARG_PRESENT();
283
284 int tmp = strtol(params->KernelArgs[++i], NULL, 0);
285
286 if (tmp <= 0) {
287 printf("ERROR: blocking parameter must be > 0.\n");
288 exit(1);
289 }
290
291 blk[0] = tmp;
292 }
293 else if (ARG_IS("-blk-y") || ARG_IS("--blk-y")) {
294 NEXT_ARG_PRESENT();
295
296 int tmp = strtol(params->KernelArgs[++i], NULL, 0);
297
298 if (tmp <= 0) {
299 printf("ERROR: blocking parameter must be > 0.\n");
300 exit(1);
301 }
302
303 blk[1] = tmp;
304 }
305 else if (ARG_IS("-blk-z") || ARG_IS("--blk-z")) {
306 NEXT_ARG_PRESENT();
307
308 int tmp = strtol(params->KernelArgs[++i], NULL, 0);
309
310 if (tmp <= 0) {
311 printf("ERROR: blocking parameter must be > 0.\n");
312 exit(1);
313 }
314
315 blk[2] = tmp;
316 }
317 else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) {
318 ParameterUsage();
319 exit(1);
320 }
321 else {
322 printf("ERROR: unknown kernel parameter.\n");
323 ParameterUsage();
324 exit(1);
325 }
326 }
327
328 #undef ARG_IS
329 #undef NEXT_ARG_PRESENT
330
331 return;
332}
333
334static void SetupConsecNodes(LatticeDesc * ld, KernelDataListRia * kdlr, int nThreads)
335{
336 Assert(ld != NULL);
337 Assert(kdlr != NULL);
338 Assert(nThreads > 0);
339
340 uint32_t * adjList = kdlr->kdl.AdjList;
341
342 uint32_t nConsecNodes = 0;
343 uint32_t consecIndex = 0;
344
345 int nFluid = kdlr->kdl.nFluid;
346
347 uint32_t * consecThreadIndices = (uint32_t *)malloc(sizeof(uint32_t) * (nThreads + 1));
348
349 int nNodesPerThread = nFluid / nThreads;
350
351 for (int i = 0; i < nThreads; ++i) {
352 consecThreadIndices[i] = i * nNodesPerThread + MinI(i, nFluid % nThreads);
353 }
354 consecThreadIndices[nThreads] = nFluid;
355
356 int indexThread = 1;
357 int similarPatterns = 0;
358 int wasLastChunkThreadBoundary = 0;
359 // We execute following code two times.
360 // - The first time to get the count of how many entries we need for the
361 // consecNodes array.
362 // - The second time to fill the array.
363
364 // Loop over adjacency list of all nodes.
365 // Compare if adjacent nodes share the same access pattern.
366 for (int fluidBaseIndex = VSIZE; fluidBaseIndex < nFluid; fluidBaseIndex += VSIZE) {
367
368 int hasSimilarAccessPattern = 1;
369
370 // Loop over all directions except the center one.
371 for(int d = 0; d < N_D3Q19 - 1; ++d) {
372 Assert(d != D3Q19_C);
373
374 // check if cache line itself has consecutive memory access pattern
375 for(int inChunkIndex = 0; (inChunkIndex < VSIZE) && ((fluidBaseIndex + inChunkIndex) < nFluid); ++inChunkIndex){
376 int index = fluidBaseIndex + inChunkIndex;
377
378 Assert(index < nFluid);
379
380 #define ADJ_LIST(idx, dir) adjList[((idx) - ((idx) % VSIZE)) * N_D3Q19_IDX + ((dir) * VSIZE) + ((idx) % VSIZE)]
381 //if (ADJ_LIST(index, d) != ADJ_LIST(index-VSIZE, d) + VSIZE) {
382 if (ADJ_LIST(index, d) != ADJ_LIST(index-VSIZE, d) + VSIZE * N_D3Q19_IDX + VSIZE) {
383 //printf("different @: ADJ_LST(%d,%d)=%d != %d=ADJ_LST(%d, %d) + VSIZE\n", index, d, ADJ_LIST(index,d), ADJ_LIST(index-VSIZE,d) + VSIZE * N_D3Q19_IDX + VSIZE, index-VSIZE, d);
384 // Different access pattern.
385 hasSimilarAccessPattern = 0;
386 break;
387 }
388 #undef ADJ_LIST
389 }
390
391 if(!hasSimilarAccessPattern){
392 break; //exit from nested loop
393 }
394 }
395
396 long threadBoundaryIndex = consecThreadIndices[indexThread];
397 if (fluidBaseIndex <= threadBoundaryIndex &&
398 threadBoundaryIndex < fluidBaseIndex + VSIZE) {
399 // Current chunk contains thread boundary.
400 // These chunks are treated by scalar peel and reminder loops
401 // in kernel of every thread to ensure VSIZE aligned access to
402 // adjacency list
403
404 // final cells of current thread
405 ++consecIndex;
406
407 // first cells of next thread
408 ++indexThread;
409 ++consecIndex;
410
411 wasLastChunkThreadBoundary = 1;
412 }
413 else {
414 // We are not at a thread boundary
415 if (hasSimilarAccessPattern && !wasLastChunkThreadBoundary){
416 ++similarPatterns;
417 }
418 else {
419 ++consecIndex;
420 }
421
422 wasLastChunkThreadBoundary = 0;
423
424 /*
425 if (!hasSimilarAccessPattern) {
426 ++consecIndex;
427 }
428 else {
429 ++similarPatterns;
430 }
431 */
432 }
433 }
434
435 if (nFluid > 0) {
436 nConsecNodes = consecIndex + 1;
437 }
438
439 uint32_t * consecNodes;
440 MemAlloc((void **)&consecNodes, sizeof(uint32_t) * nConsecNodes);
441
442 unsigned long consecNodesByte = (nConsecNodes) * sizeof(uint32_t);
443
444 printf("# Consec. Nodes Array Allocation:\n");
445 printf("# similar patterns\t\t%d\n", similarPatterns);
446 printf("# elements: \t\t%d\n", nConsecNodes);
447 printf("# size: \t\t%e MiB\n", consecNodesByte / 1024.0 / 1024.0);
448 printf("# alignment: \t\t%d b\n", PAGE_4K);
449
450 if (MemAllocAligned((void **)&consecNodesByte, consecNodesByte, PAGE_4K)) {
451 printf("ERROR: allocating consecNodes array with MemAllocAligned failed: %lu bytes.\n", consecNodesByte);
452 exit(1);
453 }
454 else {
455 printf("# allocator: \t\t\tMemAllocAligned()\n");
456 }
457
458 consecIndex = 0;
459
460 if (nFluid > 0) {
461 consecNodes[consecIndex] = VSIZE;
462 }
463
464 indexThread = 1;
465 consecThreadIndices[0] = 0;
466
467 //add first chunk manually to enable backward check for consecutive pattern
468 consecNodes[consecIndex] = VSIZE;
469
470 wasLastChunkThreadBoundary = 0;
471
472 // Loop over adjacency list of all nodes.
473 // Compare if access pattern does not change on chunk level
474 // Since gather instructions are used, access pattern may not be consecutive
475 for (int fluidBaseIndex = VSIZE; fluidBaseIndex < nFluid; fluidBaseIndex += VSIZE) {
476
477 int hasSimilarAccessPattern = 1;
478
479 // Loop over all directions except the center one.
480 for(int d = 0; d < N_D3Q19 - 1; ++d) {
481 Assert(d != D3Q19_C);
482
483 // check if cache line itself has consecutive memory access pattern
484 for(int inChunkIndex = 0; (inChunkIndex < VSIZE) && ((fluidBaseIndex + inChunkIndex) < nFluid); ++inChunkIndex){
485 int index = fluidBaseIndex + inChunkIndex;
486
487 Assert(index < nFluid);
488
489 #define ADJ_LIST(idx, dir) adjList[((idx) - ((idx) % VSIZE)) * N_D3Q19_IDX + ((dir) * VSIZE) + ((idx) % VSIZE)]
490 //if (ADJ_LIST(index, d) != ADJ_LIST(index-VSIZE, d) + VSIZE) {
491 if (ADJ_LIST(index, d) != ADJ_LIST(index-VSIZE, d) + VSIZE * N_D3Q19_IDX + VSIZE) {
492 // Different access pattern.
493 hasSimilarAccessPattern = 0;
494 break;
495 }
496 #undef ADJ_LIST
497 }
498
499 if(!hasSimilarAccessPattern){
500 break; //exit from nested loop
501 }
502 }
503
504 long threadBoundaryIndex = consecThreadIndices[indexThread];
505 if (fluidBaseIndex <= threadBoundaryIndex &&
506 threadBoundaryIndex < fluidBaseIndex + VSIZE) {
507 // Current chunk contains thread boundary.
508 // These chunks are treated by scalar peel and reminder loops
509 // in kernel of every thread to ensure VSIZE aligned access to
510 // adjacency list
511
512 // final cells of current thread
513 ++consecIndex;
514 //consecThreadIndices[indexThread] = consecIndex;
515 consecNodes[consecIndex] = threadBoundaryIndex - fluidBaseIndex;
516
517
518 // first cells of next thread
519 ++consecIndex;
520 consecThreadIndices[indexThread] = consecIndex;
521 consecNodes[consecIndex] = (fluidBaseIndex + VSIZE) - threadBoundaryIndex;
522 ++indexThread;
523
524 wasLastChunkThreadBoundary = 1;
525
526 }
527 else {
528 // We are not at a thread boundary
529 if (hasSimilarAccessPattern && !wasLastChunkThreadBoundary){
530 Assert(consecIndex < nConsecNodes);
531 consecNodes[consecIndex] += VSIZE;
532 }
533 else {
534 ++consecIndex;
535 Assert(consecIndex < nConsecNodes);
536 consecNodes[consecIndex] = VSIZE;
537 }
538
539 /*
540 if (!hasSimilarAccessPattern) {
541 ++consecIndex;
542 Assert(consecIndex < nConsecNodes);
543 consecNodes[consecIndex] = VSIZE;
544 }
545 else {
546 Assert(consecIndex < nConsecNodes);
547 consecNodes[consecIndex] += VSIZE;
548 }
549 */
550 wasLastChunkThreadBoundary = 0;
551
552 }
553 }
554
555 /*
556 printf("consecNodes:\n");
557 for(int i = 0; i < nConsecNodes + 5; ++i){
558 printf("%d ", consecNodes[i]);
559 }
560 printf("\n");
561 */
562 /*
563 printf("consecThreadIndices:\n");
564 for(int i = 0; i < nThreads + 5; ++i){
565 printf("%d ", consecThreadIndices[i]);
566 }
567 printf("\n");
568 */
569
570 kdlr->ConsecNodes = consecNodes;
571 kdlr->nConsecNodes = nConsecNodes;
572
573 kdlr->ConsecThreadIndices = consecThreadIndices;
574 kdlr->nConsecThreadIndices = nThreads;
575
576 double loopBalanceEven = 2.0 * 19 * sizeof(PdfT);
577 //N_D3Q19 - 1: C lookup not required, +1: transfer of consecValue
578 double loopBalanceOdd = 2.0 * 19 * sizeof(PdfT) + ((double)nConsecNodes *((N_D3Q19 - 1) * VSIZE + 1)) / nFluid * sizeof(int);
579 double loopBalance = (loopBalanceEven + loopBalanceOdd) / 2.0;
580
581 kdlr->kdl.kd.LoopBalance = loopBalance;
582
583 printf("# loop balance:\n");
584 printf("# even timestep: \t\t%.2f B/FLUP\n", loopBalanceEven);
585 printf("# odd timestep: \t\t%.2f B/FLUP\n", loopBalanceOdd);
586 printf("# average: \t\t%.2f B/FLUP\n", loopBalance);
587
588 return;
589}
590
591void FNAME(D3Q19ListAaPvGatherAoSoAInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
592{
593 KernelData * kd;
594 KernelDataList * kdl;
595 KernelDataListRia * kdlr;
596 MemAlloc((void **)&kdlr, sizeof(KernelDataListRia));
597
598 kd = (KernelData *)kdlr;
599 kdl = KDL(kdlr);
600
601 *kernelData = kd;
602
603#ifdef DEBUG
604 kd->Pdfs[0] = NULL;
605 kd->Pdfs[1] = NULL;
606 kd->PdfsActive = NULL;
607 kd->DstPdfs = NULL;
608 kd->SrcPdfs = NULL;
609 kd->Dims[0] = -1;
610 kd->Dims[1] = -1;
611 kd->Dims[2] = -1;
612 kd->GlobalDims[0] = -1;
613 kd->GlobalDims[1] = -1;
614 kd->GlobalDims[2] = -1;
615 kd->Offsets[0] = -1;
616 kd->Offsets[1] = -1;
617 kd->Offsets[2] = -1;
618
619 kd->ObstIndices = NULL;
620 kd->nObstIndices = -1;
621 kd->BounceBackPdfsSrc = NULL;
622 kd->BounceBackPdfsDst = NULL;
623 kd->nBounceBackPdfs = -1;
624
625 kdl->AdjList = NULL;
626 kdl->Coords = NULL;
627 kdl->Grid = NULL;
628 kdl->nCells = -1;
629 kdl->nFluid = -1;
630
631 kdlr->ConsecNodes = NULL;
632 kdlr->nConsecNodes = 0;
633 kdlr->ConsecThreadIndices = NULL;
634 kdlr->nConsecThreadIndices = 0;
635#endif
636
637 // Ajust the dimensions according to padding, if used.
638 kd->Dims[0] = kd->GlobalDims[0] = ld->Dims[0];
639 kd->Dims[1] = kd->GlobalDims[1] = ld->Dims[1];
640 kd->Dims[2] = kd->GlobalDims[2] = ld->Dims[2];
641
642 int * lDims = ld->Dims;
643
644 int lX = lDims[0];
645 int lY = lDims[1];
646 int lZ = lDims[2];
647
648 int nTotalCells = lX * lY * lZ;
649 int nCells = ld->nFluid; // TODO: + padding
650 int nFluid = ld->nFluid;
651
652 // TODO: check nCells/nFluid do not exceed 2^31. This actually has to be
653 // done during lattice setup.
654 kdl->nCells = nCells;
655 kdl->nFluid = nFluid;
656
657 PdfT * pdfs[2];
658
659 int blk[3] = { 0 };
660
661 ParseParameters(params, blk);
662
663 if (blk[0] == 0) blk[0] = lX;
664 if (blk[1] == 0) blk[1] = lY;
665 if (blk[2] == 0) blk[2] = lZ;
666
667 printf("# blocking: \t\tx: %3d y: %3d z: %3d\n", blk[0], blk[1], blk[2]);
668
669 unsigned long latByte = nCells * sizeof(PdfT) * N_D3Q19;
670 unsigned long latFluidByte = nFluid * sizeof(PdfT) * N_D3Q19;
671 unsigned long latPadByte = (nCells - nFluid) * sizeof(PdfT) * N_D3Q19;
672
673 printf("# Lattice Array Allocation:\n");
674 printf("# lattice size: \t\t%e MiB\n", latByte / 1024.0 / 1024.0);
675 printf("# fluid lattice size:\t\t%e MiB\n", latFluidByte / 1024.0 / 1024.0);
676 printf("# lattice padding: \t\t%e MiB\n", latPadByte / 1024.0 / 1024.0);
677
678
679 printf("# alignment: \t\t%d b\n", PAGE_4K);
680
681 if (PDF_ALLOCATOR((void **)&pdfs[0], latFluidByte, PAGE_4K)) {
682 printf("ERROR: allocating PDF array with %s() failed: %lu bytes.\n", STRINGIFY(PDF_ALLOCATOR), latFluidByte);
683 exit(1);
684 }
685 else {
686 printf("# allocator: \t\t\t%s()\n", STRINGIFY(PDF_ALLOCATOR));
687 }
688
689 kd->Pdfs[0] = pdfs[0];
690
691 // Initialize PDFs with some (arbitrary) data for correct NUMA placement.
692 // Here we touch only the fluid nodes as this loop is OpenMP parallel and
693 // we want the same scheduling as in the kernel.
694 #ifdef _OPENMP
695 #pragma omp parallel for
696 #endif
697 for (int i = 0; i < nFluid; ++i) { for(int d = 0; d < N_D3Q19; ++d) {
698 pdfs[0][P_INDEX_3(nCells, i, d)] = 1.0;
699 } }
700
701 // Initialize all PDFs to some standard value.
702 for (int i = 0; i < nFluid; ++i) { for(int d = 0; d < N_D3Q19; ++d) {
703 pdfs[0][P_INDEX_3(nCells, i, d)] = 0.0;
704 } }
705
706 // ----------------------------------------------------------------------
707 // create grid which will hold the index numbers of the fluid nodes
708
709 uint32_t * grid;
710
711 if (MemAlloc((void **)&grid, nTotalCells * sizeof(uint32_t))) {
712 printf("ERROR: allocating grid for numbering failed: %lu bytes.\n", nTotalCells * sizeof(uint32_t));
713 exit(1);
714 }
715 kdl->Grid = grid;
716
717 int latticeIndex;
718
719#ifdef DEBUG
720 for(int z = 0; z < lZ; ++z) {
721 for(int y = 0; y < lY; ++y) {
722 for(int x = 0; x < lX; ++x) {
723
724 latticeIndex = L_INDEX_4(ld->Dims, x, y, z);
725
726 grid[latticeIndex] = ~0;
727 }
728 }
729 }
730#endif
731
732 // ----------------------------------------------------------------------
733 // generate numbering over grid
734
735 uint32_t * coords;
736
737 if (MemAlloc((void **)&coords, nFluid * sizeof(uint32_t) * 3)) {
738 printf("ERROR: allocating coords array failed: %lu bytes.\n", nFluid * sizeof(uint32_t) * 3);
739 exit(1);
740 }
741
742 kdl->Coords = coords;
743
744 // Index for the PDF nodes can start at 0 as we distinguish solid and fluid nodes
745 // through the ld->Lattice array.
746 int counter = 0;
747
748 // Blocking is implemented via setup of the adjacency list. The kernel later will
749 // walk through the lattice blocked automatically.
750 for (int bZ = 0; bZ < lZ; bZ += blk[2]) {
751 for (int bY = 0; bY < lY; bY += blk[1]) {
752 for (int bX = 0; bX < lX; bX += blk[0]) {
753
754 int eZ = MIN(bZ + blk[2], lZ);
755 int eY = MIN(bY + blk[1], lY);
756 int eX = MIN(bX + blk[0], lX);
757
758 for (int z = bZ; z < eZ; ++z) {
759 for (int y = bY; y < eY; ++y) {
760 for (int x = bX; x < eX; ++x) {
761
762 latticeIndex = L_INDEX_4(lDims, x, y, z);
763
764 if (ld->Lattice[latticeIndex] != LAT_CELL_OBSTACLE) {
765 grid[latticeIndex] = counter;
766
767 coords[C_INDEX_X(counter)] = x;
768 coords[C_INDEX_Y(counter)] = y;
769 coords[C_INDEX_Z(counter)] = z;
770
771 ++counter;
772 }
773 } } }
774 } } }
775
776 Verify(counter == nFluid);
777 uint32_t * adjList;
778
779 // AoSoA addressing for adjList needs padding for (nFluid % VSIZE) != 0
780 unsigned long adjListBytes = nFluid * sizeof(int) * N_D3Q19_IDX;
781
782 printf("# Adjacency List Allocation:\n");
783 printf("# size: \t\t%e MiB\n", adjListBytes / 1024.0 / 1024.0);
784 printf("# alignment: \t\t%d b\n", PAGE_4K);
785
786 // AdjList only requires 18 instead of 19 entries per node, as
787 // the center PDF needs no addressing.
788 if (ADJ_LIST_ALLOCATOR((void **)&adjList, adjListBytes, PAGE_4K)) {
789 printf("ERROR: allocating adjList array with %s() failed: %lu bytes.\n", STRINGIFY(ADJ_LIST_ALLOCATOR), adjListBytes);
790 exit(1);
791 }
792 else {
793 printf("# allocator: \t\t\t%s()\n", STRINGIFY(ADJ_LIST_ALLOCATOR));
794 }
795
796 kdl->AdjList = adjList;
797
798 int x, y, z;
799
800 uint32_t neighborIndex;
801 uint32_t dstIndex;
802
803 int nx, ny, nz, px, py, pz;
804
805 // Loop over all fluid nodes and compute the indices to the neighboring
806 // PDFs for configured data layout (AoS/SoA).
807 // Parallelized loop to ensure correct NUMA placement.
808 // #ifdef _OPENMP --> add line continuation
809 // #pragma omp parallel for default(none)
810 // shared(nFluid, nCells, coords, D3Q19_INV, D3Q19_X, D3Q19_Y, D3Q19_Z,
811 // stderr,
812 // lDims, grid, ld, lX, lY, lZ, adjList)
813 // private(x, y, z, nx, ny, nz, neighborIndex, dstIndex)
814 // #endif
815 for (int fluidBaseIndex = 0; fluidBaseIndex < nFluid; fluidBaseIndex+=VSIZE) {
816
817
818 // Loop over all directions except the center one.
819 for(int d = 0; d < N_D3Q19 - 1; ++d) {
820 Assert(d != D3Q19_C);
821
822 for(int inChunkIndex = 0; (inChunkIndex < VSIZE) && ((fluidBaseIndex + inChunkIndex) < nFluid); ++inChunkIndex){
823 int index = fluidBaseIndex + inChunkIndex;
824
825 Assert(index < nFluid);
826
827 x = coords[C_INDEX_X(index)];
828 y = coords[C_INDEX_Y(index)];
829 z = coords[C_INDEX_Z(index)];
830
831 Assert(x >= 0 && x < lX);
832 Assert(y >= 0 && y < lY);
833 Assert(z >= 0 && z < lZ);
834
835 Assert(ld->Lattice[L_INDEX_4(lDims, x, y, z)] != LAT_CELL_OBSTACLE);
836
837#ifdef PROP_MODEL_PUSH
838 nx = x + D3Q19_X[d];
839 ny = y + D3Q19_Y[d];
840 nz = z + D3Q19_Z[d];
841
842#elif PROP_MODEL_PULL
843 nx = x - D3Q19_X[d];
844 ny = y - D3Q19_Y[d];
845 nz = z - D3Q19_Z[d];
846#else
847 #error No implementation for this PROP_MODEL_NAME.
848#endif
849 // If the neighbor is outside the latice in X direction and we have a
850 // periodic boundary then we need to wrap around.
851 if ( ((nx < 0 || nx >= lX) && ld->PeriodicX) ||
852 ((ny < 0 || ny >= lY) && ld->PeriodicY) ||
853 ((nz < 0 || nz >= lZ) && ld->PeriodicZ)
854 ){
855 // x periodic
856
857 if (nx < 0) {
858 px = lX - 1;
859 }
860 else if (nx >= lX) {
861 px = 0;
862 } else {
863 px = nx;
864 }
865 // y periodic
866 if (ny < 0) {
867 py = lY - 1;
868 }
869 else if (ny >= lY) {
870 py = 0;
871 } else {
872 py = ny;
873 }
874
875 // z periodic
876 if (nz < 0) {
877 pz = lZ - 1;
878 }
879 else if (nz >= lZ) {
880 pz = 0;
881 } else {
882 pz = nz;
883 }
884
885 if (ld->Lattice[L_INDEX_4(lDims, px, py, pz)] == LAT_CELL_OBSTACLE) {
886 dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
887 }
888 else {
889 neighborIndex = grid[L_INDEX_4(lDims, px, py, pz)];
890
891 AssertMsg(neighborIndex != ~0, "Neighbor has no Index. (%d %d %d) direction %s (%d)\n", px, py, pz, D3Q19_NAMES[d], d);
892
893 dstIndex = P_INDEX_3(nCells, neighborIndex, d);
894 }
895 }
896 else if (nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) {
897 dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
898 }
899 else if (ld->Lattice[L_INDEX_4(lDims, nx, ny, nz)] == LAT_CELL_OBSTACLE) {
900 dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
901 }
902 else {
903 neighborIndex = grid[L_INDEX_4(lDims, nx, ny, nz)];
904
905 Assert(neighborIndex != ~0);
906
907 dstIndex = P_INDEX_3(nCells, neighborIndex, d);
908 }
909
910 Assert(dstIndex >= 0);
911 Assert(dstIndex < nCells * N_D3Q19);
912
913 adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (d * VSIZE) + (index % VSIZE)] = dstIndex;
914 }
915 }
916 }
917
918 /*
919 printf("============\n");
920 for(int baseIndex = 0; baseIndex < nFluid; baseIndex+=VSIZE){
921 for(int i = 0; i < VSIZE; ++i){
922 int index = baseIndex + i;
923
924 printf("%d ", adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (0 * VSIZE) + (index % VSIZE)]);
925 }
926 printf("\n");
927 }
928 printf("============\n");
929*/
930
931 int nThreads = 1;
932
933#ifdef _OPENMP
934 nThreads = omp_get_max_threads();
935#endif
936
937 SetupConsecNodes(ld, KDLR(kd), nThreads);
938
939 // Fill remaining KernelData structures
940 kd->GetNode = GetNode;
941 kd->SetNode = SetNode;
942
943 kd->BoundaryConditionsGetPdf = FNAME(BCGetPdf);
944 kd->BoundaryConditionsSetPdf = FNAME(BCSetPdf);
945
946 kd->Kernel = FNAME(D3Q19ListAaPvGatherAoSoAKernel);
947
948 kd->DstPdfs = NULL;
949 kd->PdfsActive = kd->Pdfs[0];
950
951 return;
952}
953
954void FNAME(D3Q19ListAaPvGatherAoSoADeinit)(LatticeDesc * ld, KernelData ** kernelData)
955{
956 KernelDataListRia ** kdlr = (KernelDataListRia **)kernelData;
957
958 MemFree((void **)&((*kdlr)->ConsecNodes));
959
960 if ((*kdlr)->ConsecThreadIndices != NULL) {
961 MemFree((void **)&((*kdlr)->ConsecThreadIndices));
962 }
963
964 KernelDataList ** kdl = (KernelDataList **)kernelData;
965
966 ADJ_LIST_FREE((void **)&((*kdl)->AdjList));
967
968 MemFree((void **)&((*kdl)->Coords));
969 MemFree((void **)&((*kdl)->Grid));
970
971 PDF_FREE((void **)&((*kernelData)->Pdfs[0]));
972
973 MemFree((void **)kernelData);
974 return;
975}
976
This page took 0.12255 seconds and 5 git commands to generate.