add citation information
[LbmBenchmarkKernelsPublic.git] / src / BenchKernelD3Q19ListPullSplitNtCommon.c
CommitLineData
10988083
MW
1// --------------------------------------------------------------------------
2//
3// Copyright
4// Markus Wittmann, 2016-2017
5// RRZE, University of Erlangen-Nuremberg, Germany
6// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
7//
8// Viktor Haag, 2016
9// LSS, University of Erlangen-Nuremberg, Germany
10//
11// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
12//
13// LbmBenchKernels is free software: you can redistribute it and/or modify
14// it under the terms of the GNU General Public License as published by
15// the Free Software Foundation, either version 3 of the License, or
16// (at your option) any later version.
17//
18// LbmBenchKernels is distributed in the hope that it will be useful,
19// but WITHOUT ANY WARRANTY; without even the implied warranty of
20// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21// GNU General Public License for more details.
22//
23// You should have received a copy of the GNU General Public License
24// along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>.
25//
26// --------------------------------------------------------------------------
27#include "BenchKernelD3Q19ListPullSplitNtCommon.h"
28
29#include "Memory.h"
30#include "Vtk.h"
31#include "Vector.h"
e3f82424
MW
32#include "Padding.h"
33
10988083
MW
34
35#include <math.h>
36
37#ifdef _OPENMP
38 #include <omp.h>
39#endif
40
41// Forward definition.
42void FNAME(KernelPullSplitNt1S)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd);
43void FNAME(KernelPullSplitNt2S)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd);
44
45void FNAME(KernelPullSplitNtRia1S)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd);
46void FNAME(KernelPullSplitNtRia2S)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd);
47
48
49
50
51// -----------------------------------------------------------------------
52// Functions which are used as callback by the kernel to read or write
53// PDFs and nodes.
54
55static void FNAME(BCGetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT * pdf)
56{
57 Assert(kd != NULL);
58 Assert(kd->PdfsActive != NULL);
59 Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
60 Assert(pdf != NULL);
61
62 Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
63 Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
64 Assert(dir >= 0); Assert(dir < N_D3Q19);
65
66 // The relevant PDFs here are the ones, which will get streamed in later
67 // during propagation. So we must return the *remote* PDFs.
68 uint32_t nodeIndex = KDL(kd)->Grid[L_INDEX_4(kd->Dims, x, y, z)];
69
70 if (dir != D3Q19_C) {
71
72 uint32_t adjListIndex = nodeIndex * N_D3Q19_IDX;
73
74 *pdf = kd->PdfsActive[KDL(kd)->AdjList[adjListIndex + dir]];
75 }
76 else {
77 *pdf = kd->PdfsActive[P_INDEX_3(KDL(kd)->nCells, nodeIndex, dir)];
78
79 }
80
81 return;
82}
83
84static void FNAME(BCSetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT pdf)
85{
86 Assert(kd != NULL);
87 Assert(kd->PdfsActive != NULL);
88 Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
89 Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
90 Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
91 Assert(dir >= 0); Assert(dir < N_D3Q19);
92
93 if (isnan(pdf)) {
94 printf("ERROR: setting nan %d %d %d %d %s\n", x, y, z, dir, D3Q19_NAMES[dir]);
95 DEBUG_BREAK_POINT();
96 exit(1);
97 }
98
99 // The relevant PDFs here are the ones, which will get streamed in later
100 // during propagation. So we must set this *remote* PDFs.
101 uint32_t nodeIndex = KDL(kd)->Grid[L_INDEX_4(kd->Dims, x, y, z)];
102
103 if (dir != D3Q19_C) {
104
105 uint32_t adjListIndex = nodeIndex * N_D3Q19_IDX;
106
107 kd->PdfsActive[KDL(kd)->AdjList[adjListIndex + dir]] = pdf;
108 }
109 else {
110 kd->PdfsActive[P_INDEX_3(KDL(kd)->nCells, nodeIndex, dir)] = pdf;
111
112 }
113
114 return;
115}
116
117
118static void GetNode(KernelData * kd, int x, int y, int z, PdfT * pdfs)
119{
120 Assert(kd != NULL);
121 Assert(kd->PdfsActive != NULL);
122 Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
123 Assert(pdfs != NULL);
124 Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
125 Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
126
127 PdfT sum = 0.0;
128
129 // TODO: pull scheme?
130
131 #define I(x, y, z, dir) P_INDEX_5(KDL(kd), (x), (y), (z), (dir))
132 #define X(name, idx, idxinv, _x, _y, _z) pdfs[idx] = kd->PdfsActive[I(x, y, z, idx)]; sum += pdfs[idx];
133 D3Q19_LIST
134 #undef X
135 #undef I
136
137#ifdef DETECT_NANS
138 for (int d = 0; d < 19; ++d) {
139 if(isnan(pdfs[d]) || isinf(pdfs[d])) {
140 printf("%d %d %d %d nan! get node\n", x, y, z, d);
141 for (int d2 = 0; d2 < 19; ++d2) {
142 printf("%d: %e\n", d2, pdfs[d2]);
143 }
144 exit(1);
145 }
146 }
147#endif
148 return;
149}
150
151
152static void SetNode(KernelData * kd, int x, int y, int z, PdfT * pdfs)
153{
154 Assert(kd != NULL);
155 Assert(kd->PdfsActive != NULL);
156 Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
157 Assert(pdfs != NULL);
158
159 Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
160 Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
161
162#ifdef DETECT_NANS
163 for (int d = 0; d < 19; ++d) {
164 if(isnan(pdfs[d])) {
165 printf("%d %d %d %d nan! get node\n", x, y, z, d);
166 for (int d2 = 0; d2 < 19; ++d2) {
167 printf("%d: %e\n", d2, pdfs[d2]);
168 }
169 exit(1);
170 }
171 }
172#endif
173
174 // TODO: pull scheme?
175 #define I(x, y, z, dir) P_INDEX_5(KDL(kd), (x), (y), (z), (dir))
176 #define X(name, idx, idxinv, _x, _y, _z) kd->PdfsActive[I(x, y, z, idx)] = pdfs[idx];
177 D3Q19_LIST
178 #undef X
179 #undef I
180
181 return;
182}
183
184static void ParameterUsage()
185{
186 printf("Kernel parameters:\n");
187 printf(" [-blk <n>] [-blk-[xyz] <n>] [-n-tmp-array <n>]\n");
e3f82424 188 printf(" [-pad auto|modulus_1+offset_1(,modulus_n+offset_n)*]\n");
10988083
MW
189
190 return;
191}
192
e3f82424 193static void ParseParameters(Parameters * params, int * blk, int * nTmpArray, PadInfo ** padInfo)
10988083
MW
194{
195 Assert(blk != NULL);
196
197 blk[0] = 0; blk[1] = 0; blk[2] = 0;
9e0051cb 198 *nTmpArray = 152 - (152 % VSIZE);
e3f82424 199 *padInfo = NULL;
10988083
MW
200
201 #define ARG_IS(param) (!strcmp(params->KernelArgs[i], param))
202 #define NEXT_ARG_PRESENT() \
203 do { \
204 if (i + 1 >= params->nKernelArgs) { \
205 printf("ERROR: argument %s requires a parameter.\n", params->KernelArgs[i]); \
206 exit(1); \
207 } \
208 } while (0)
209
210
211 for (int i = 0; i < params->nKernelArgs; ++i) {
212 if (ARG_IS("-blk") || ARG_IS("--blk")) {
213 NEXT_ARG_PRESENT();
214
215 int tmp = strtol(params->KernelArgs[++i], NULL, 0);
216
e3f82424
MW
217 if (tmp < 0) {
218 printf("ERROR: blocking parameter must be >= 0.\n");
10988083
MW
219 exit(1);
220 }
221
222 blk[0] = blk[1] = blk[2] = tmp;
223 }
224 else if (ARG_IS("-blk-x") || ARG_IS("--blk-x")) {
225 NEXT_ARG_PRESENT();
226
227 int tmp = strtol(params->KernelArgs[++i], NULL, 0);
228
e3f82424
MW
229 if (tmp < 0) {
230 printf("ERROR: blocking parameter must be >= 0.\n");
10988083
MW
231 exit(1);
232 }
233
234 blk[0] = tmp;
235 }
236 else if (ARG_IS("-blk-y") || ARG_IS("--blk-y")) {
237 NEXT_ARG_PRESENT();
238
239 int tmp = strtol(params->KernelArgs[++i], NULL, 0);
240
e3f82424
MW
241 if (tmp < 0) {
242 printf("ERROR: blocking parameter must be >= 0.\n");
10988083
MW
243 exit(1);
244 }
245
246 blk[1] = tmp;
247 }
248 else if (ARG_IS("-blk-z") || ARG_IS("--blk-z")) {
249 NEXT_ARG_PRESENT();
250
251 int tmp = strtol(params->KernelArgs[++i], NULL, 0);
252
e3f82424
MW
253 if (tmp < 0) {
254 printf("ERROR: blocking parameter must be >= 0.\n");
10988083
MW
255 exit(1);
256 }
257
258 blk[2] = tmp;
259 }
260 else if (ARG_IS("-n-tmp-array") || ARG_IS("--n-tmp-array")) {
261 NEXT_ARG_PRESENT();
262
263 int tmp = strtol(params->KernelArgs[++i], NULL, 0);
264
265 if (tmp <= 0) {
266 printf("ERROR: -n-tmp-array parameter must be > 0.\n");
267 exit(1);
268 }
269
270 if (tmp % VSIZE != 0) {
271 printf("ERROR: value for -n-tmp-array must be a multiple of %d.\n", VSIZE);
272 exit(1);
273 }
274
275 *nTmpArray = tmp;
276 }
e3f82424
MW
277 else if (ARG_IS("-pad") || ARG_IS("--pad")) {
278 NEXT_ARG_PRESENT();
279
280 *padInfo = PadInfoFromStr(params->KernelArgs[++i]);
281 }
10988083
MW
282 else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) {
283 ParameterUsage();
284 exit(1);
285 }
286 else {
287 printf("ERROR: unknown kernel parameter.\n");
288 ParameterUsage();
289 exit(1);
290 }
291 }
292
293 #undef ARG_IS
294 #undef NEXT_ARG_PRESENT
295
296 return;
297}
298
299static void SetupConsecNodes(LatticeDesc * ld, KernelDataListRia * kdlr, int nThreads)
300{
301 Assert(ld != NULL);
302 Assert(kdlr != NULL);
303 Assert(nThreads > 0);
304
305 uint32_t * adjList = kdlr->kdl.AdjList;
306
307 uint32_t nConsecNodes = 0;
308 uint32_t consecIndex = 0;
309
310 int nFluid = kdlr->kdl.nFluid;
311
312 uint32_t * consecThreadIndices = (uint32_t *)malloc(sizeof(uint32_t) * (nThreads + 1));
313
314 int nNodesPerThread = nFluid / nThreads;
315
316 for (int i = 0; i < nThreads; ++i) {
317 consecThreadIndices[i] = i * nNodesPerThread + MinI(i, nFluid % nThreads);
318 }
319 consecThreadIndices[nThreads] = -1;
320
321 int indexThread = 1;
322
323 // We execute following code two times.
324 // - The first time to get the count of how many entries we need for the
325 // consecNodes array.
326 // - The second time to fill the array.
327
328 // Loop over adjacency list of all nodes.
329 // Compare if adjacent nodes share the same access pattern.
330 for (int index = 1; index < nFluid; ++index) {
331
332 int different = 0;
333
334 // Loop over all directions except the center one.
335 for(int d = 0; d < N_D3Q19 - 1; ++d) {
336 Assert(d != D3Q19_C);
337
338 if (adjList[index * N_D3Q19_IDX + d] != adjList[(index - 1) * N_D3Q19_IDX + d] + 1) {
339 // Different access pattern.
340 different = 1;
341 break;
342 }
343 }
344
345 if (consecThreadIndices[indexThread] == index) {
346 // We are at a thread boundary. Starting from this index the fluids
347 // belong to another thread. Force a break, if nodes are consecutive.
348 ++indexThread;
349 different = 1;
350 }
351
352 if (different) {
353 ++consecIndex;
354 }
355 }
356
357 if (nFluid > 0) {
358 nConsecNodes = consecIndex + 1;
359 }
360
361 uint32_t * consecNodes;
362 MemAlloc((void **)&consecNodes, sizeof(uint32_t) * nConsecNodes);
363
364 consecIndex = 0;
365
366 if (nFluid > 0) {
367 consecNodes[consecIndex] = 1;
368 }
369
370 indexThread = 1;
371 consecThreadIndices[0] = 0;
372
373 // Loop over adjacency list of all nodes.
374 // Compare if adjacent nodes share the same access pattern.
375 for (int index = 1; index < nFluid; ++index) {
376
377 int different = 0;
378
379 // Loop over all directions except the center one.
380 for(int d = 0; d < N_D3Q19 - 1; ++d) {
381 Assert(d != D3Q19_C);
382
383 if (adjList[index * N_D3Q19_IDX + d] != adjList[(index - 1) * N_D3Q19_IDX + d] + 1) {
384 // Different access pattern.
385 different = 1;
386 break;
387 }
388 }
389
390 if (consecThreadIndices[indexThread] == index) {
391 // We are at a thread boundary. Starting from this index the fluids
392 // belong to another thread. Force a break, if nodes are consecutive.
393 consecThreadIndices[indexThread] = consecIndex + 1;
394 ++indexThread;
395 different = 1;
396 }
397
398 if (different) {
399 ++consecIndex;
400 Assert(consecIndex < nConsecNodes);
401 consecNodes[consecIndex] = 1;
402 }
403 else {
404 Assert(consecIndex < nConsecNodes);
405 consecNodes[consecIndex] += 1;
406 }
407 }
408
409
410 kdlr->ConsecNodes = consecNodes;
411 kdlr->nConsecNodes = nConsecNodes;
412
413 kdlr->ConsecThreadIndices = consecThreadIndices;
414 kdlr->nConsecThreadIndices = nThreads;
415
416 // printf("# total fluid nodes: %d consecutive blocks: %d\n", nFluid, nConsecNodes);
417
418 return;
419}
420
421
422static void FNAME(Init)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
423{
424 KernelData * kd;
425 KernelDataList * kdl;
426 KernelDataListRia * kdlr;
427 MemAlloc((void **)&kdlr, sizeof(KernelDataListRia));
428
429 kd = (KernelData *)kdlr;
430 kdl = KDL(kdlr);
431
432 *kernelData = kd;
433
434#ifdef DEBUG
435 kd->Pdfs[0] = NULL;
436 kd->Pdfs[1] = NULL;
437 kd->PdfsActive = NULL;
438 kd->DstPdfs = NULL;
439 kd->SrcPdfs = NULL;
440 kd->Dims[0] = -1;
441 kd->Dims[1] = -1;
442 kd->Dims[2] = -1;
443 kd->GlobalDims[0] = -1;
444 kd->GlobalDims[1] = -1;
445 kd->GlobalDims[2] = -1;
446 kd->Offsets[0] = -1;
447 kd->Offsets[1] = -1;
448 kd->Offsets[2] = -1;
449
450 kd->ObstIndices = NULL;
451 kd->nObstIndices = -1;
452 kd->BounceBackPdfsSrc = NULL;
453 kd->BounceBackPdfsDst = NULL;
454 kd->nBounceBackPdfs = -1;
455
456 kdl->AdjList = NULL;
457 kdl->Coords = NULL;
458 kdl->Grid = NULL;
459 kdl->nCells = -1;
460 kdl->nFluid = -1;
461
462 kdlr->ConsecNodes = NULL;
463 kdlr->nConsecNodes = 0;
464 kdlr->ConsecThreadIndices = NULL;
465 kdlr->nConsecThreadIndices = 0;
466#endif
467
e3f82424
MW
468 int blk[3] = { 0 };
469 PadInfo * padInfo = NULL;
470
471 ParseParameters(params, blk, &kdlr->nTmpArray, &padInfo);
472
10988083
MW
473 // Ajust the dimensions according to padding, if used.
474 kd->Dims[0] = kd->GlobalDims[0] = ld->Dims[0];
475 kd->Dims[1] = kd->GlobalDims[1] = ld->Dims[1];
476 kd->Dims[2] = kd->GlobalDims[2] = ld->Dims[2];
477
478 int * lDims = ld->Dims;
479
480 int lX = lDims[0];
481 int lY = lDims[1];
482 int lZ = lDims[2];
483
484 int nTotalCells = lX * lY * lZ;
485 int nCells = ld->nFluid;
486 int nFluid = ld->nFluid;
487
e3f82424
MW
488 {
489 nCells = PadCellsAndReport(nCells, sizeof(PdfT), &padInfo);
490 PadInfoFree(padInfo); padInfo = NULL;
491 }
492
10988083 493 // We padd each stream of a PDF array for a complete cache line.
9e0051cb
MW
494 int nElementsPerCl = 64 / sizeof(PdfT);
495
496 nCells = nCells + (nElementsPerCl - nCells % nElementsPerCl);
10988083
MW
497
498 Assert(nCells % VSIZE == 0);
499
500 kdl->nCells = nCells;
501 kdl->nFluid = nFluid;
502
503 PdfT * pdfs[2];
504
10988083
MW
505 if (blk[0] == 0) blk[0] = lX;
506 if (blk[1] == 0) blk[1] = lY;
507 if (blk[2] == 0) blk[2] = lZ;
508
509 printf("# blocking x: %3d y: %3d z: %3d\n", blk[0], blk[1], blk[2]);
510 printf("# temporary array size: %d PDFs, %lu b\n", kdlr->nTmpArray, kdlr->nTmpArray * sizeof(PdfT) * 23);
511
512 double latMiB = nCells * sizeof(PdfT) * N_D3Q19 / 1024.0 / 1024.0;
513 double latFluidMib = nFluid * sizeof(PdfT) * N_D3Q19 / 1024.0 / 1024.0;
514 double latPadMib = (nCells - nFluid) * sizeof(PdfT) * N_D3Q19 / 1024.0 / 1024.0;
515
516 printf("# lattice size: %e MiB total: %e MiB\n", latMiB, latMiB * 2);
517 printf("# fluid lattice size: %e MiB total: %e MiB\n", latFluidMib, latFluidMib * 2);
518 printf("# lattice padding: %e MiB total: %e MiB\n", latPadMib, latPadMib * 2);
519
520#define PAGE_4K 4096
521
522 printf("# aligning lattices to: %d b\n", PAGE_4K);
523
524 MemAllocAligned((void **)&pdfs[0], sizeof(PdfT) * nCells * N_D3Q19, PAGE_4K);
525 MemAllocAligned((void **)&pdfs[1], sizeof(PdfT) * nCells * N_D3Q19, PAGE_4K);
526
527 kd->Pdfs[0] = pdfs[0];
528 kd->Pdfs[1] = pdfs[1];
529
530 // Initialize PDFs with some (arbitrary) data for correct NUMA placement.
531 // Here we touch only the fluid nodes as this loop is OpenMP parallel and
532 // we want the same scheduling as in the kernel.
533 #ifdef _OPENMP
534 #pragma omp parallel for
535 #endif
536 for (int i = 0; i < nFluid; ++i) { for(int d = 0; d < N_D3Q19; ++d) {
537 pdfs[0][P_INDEX_3(nCells, i, d)] = 1.0;
538 pdfs[1][P_INDEX_3(nCells, i, d)] = 1.0;
539 } }
540
541 // Initialize all PDFs to some standard value.
542 for (int i = 0; i < nFluid; ++i) { for(int d = 0; d < N_D3Q19; ++d) {
543 pdfs[0][P_INDEX_3(nCells, i, d)] = 0.0;
544 pdfs[1][P_INDEX_3(nCells, i, d)] = 0.0;
545 } }
546
547 // ----------------------------------------------------------------------
548 // create grid which will hold the index numbers of the fluid nodes
549
550 uint32_t * grid;
551
552 if (MemAlloc((void **)&grid, nTotalCells * sizeof(uint32_t))) {
553 printf("ERROR: allocating grid for numbering failed: %lu bytes.\n", nTotalCells * sizeof(uint32_t));
554 exit(1);
555 }
556 kdl->Grid = grid;
557
558 int latticeIndex;
559
560#ifdef DEBUG
561 for(int z = 0; z < lZ; ++z) {
562 for(int y = 0; y < lY; ++y) {
563 for(int x = 0; x < lX; ++x) {
564
565 latticeIndex = L_INDEX_4(ld->Dims, x, y, z);
566
567 grid[latticeIndex] = ~0;
568 }
569 }
570 }
571#endif
572
573 // ----------------------------------------------------------------------
574 // generate numbering over grid
575
576 uint32_t * coords;
577
578 if (MemAlloc((void **)&coords, nFluid * sizeof(uint32_t) * 3)) {
579 printf("ERROR: allocating coords array failed: %lu bytes.\n", nFluid * sizeof(uint32_t) * 3);
580 exit(1);
581 }
582
583 kdl->Coords = coords;
584
585 // Index for the PDF nodes can start at 0 as we distinguish solid and fluid nodes
586 // through the ld->Lattice array.
587 int counter = 0;
588
589 // Blocking is implemented via setup of the adjacency list. The kernel later will
590 // walk through the lattice blocked automatically.
10988083 591 for (int bX = 0; bX < lX; bX += blk[0]) {
e3f82424
MW
592 for (int bY = 0; bY < lY; bY += blk[1]) {
593 for (int bZ = 0; bZ < lZ; bZ += blk[2]) {
10988083
MW
594
595 int eX = MIN(bX + blk[0], lX);
596 int eY = MIN(bY + blk[1], lY);
597 int eZ = MIN(bZ + blk[2], lZ);
598
599
10988083 600 for (int x = bX; x < eX; ++x) {
e3f82424
MW
601 for (int y = bY; y < eY; ++y) {
602 for (int z = bZ; z < eZ; ++z) {
10988083
MW
603
604 latticeIndex = L_INDEX_4(lDims, x, y, z);
605
606 if (ld->Lattice[latticeIndex] != LAT_CELL_OBSTACLE) {
607 grid[latticeIndex] = counter;
608
609 coords[C_INDEX_X(counter)] = x;
610 coords[C_INDEX_Y(counter)] = y;
611 coords[C_INDEX_Z(counter)] = z;
612
613 ++counter;
614 }
615 } } }
616 } } }
617
618 Verify(counter == nFluid);
619
620 uint32_t * adjList;
621
622 double indexMib = nFluid * sizeof(uint32_t) * N_D3Q19_IDX / 1024.0 / 1024.0;
623
624 printf("# index size: %e MiB\n", indexMib);
625
626
627 // AdjList only requires 18 instead of 19 entries per node, as
628 // the center PDF needs no addressing.
629 if (MemAlloc((void **)&adjList, nFluid * sizeof(uint32_t) * N_D3Q19_IDX)) {
630 printf("ERROR: allocating adjList array failed: %lu bytes.\n", nFluid * sizeof(uint32_t) * N_D3Q19_IDX);
631 exit(1);
632 }
633
634 kdl->AdjList = adjList;
635
636 int x, y, z;
637
638 uint32_t neighborIndex;
639 uint32_t dstIndex;
640
641 int nx, ny, nz, px, py, pz;
642
643 // Loop over all fluid nodes and compute the indices to the neighboring
644 // PDFs for configured data layout (AoS/SoA).
645 // Parallelized loop to ensure correct NUMA placement.
e3f82424
MW
646 #ifdef _OPENMP
647 #pragma omp parallel for
648 #endif
649 for (int index = 0; index < nFluid; ++index) {
650 for (int d = 0; d < N_D3Q19_IDX; ++d) {
651 adjList[index * N_D3Q19_IDX + d] = -1;
652 }
653 }
654
10988083
MW
655 // #ifdef _OPENMP --> add line continuation
656 // #pragma omp parallel for default(none)
657 // shared(nFluid, nCells, coords, D3Q19_INV, D3Q19_X, D3Q19_Y, D3Q19_Z,
658 // stderr,
659 // lDims, grid, ld, lX, lY, lZ, adjList)
660 // private(x, y, z, nx, ny, nz, neighborIndex, dstIndex)
661 // #endif
662 for (int index = 0; index < nFluid; ++index) {
663 x = coords[C_INDEX_X(index)];
664 y = coords[C_INDEX_Y(index)];
665 z = coords[C_INDEX_Z(index)];
666
667 Assert(x >= 0 && x < lX);
668 Assert(y >= 0 && y < lY);
669 Assert(z >= 0 && z < lZ);
670
671 Assert(ld->Lattice[L_INDEX_4(lDims, x, y, z)] != LAT_CELL_OBSTACLE);
672
673 // Loop over all directions except the center one.
674 for(int d = 0; d < N_D3Q19 - 1; ++d) {
675 Assert(d != D3Q19_C);
676
677#ifdef PROP_MODEL_PUSH
678 nx = x + D3Q19_X[d];
679 ny = y + D3Q19_Y[d];
680 nz = z + D3Q19_Z[d];
681
682#elif PROP_MODEL_PULL
683 nx = x - D3Q19_X[d];
684 ny = y - D3Q19_Y[d];
685 nz = z - D3Q19_Z[d];
686#else
687 #error No implementation for this PROP_MODEL_NAME.
688#endif
689 // If the neighbor is outside the latcie in X direction and we have a
690 // periodic boundary then we need to wrap around.
691 if ( ((nx < 0 || nx >= lX) && ld->PeriodicX) ||
692 ((ny < 0 || ny >= lY) && ld->PeriodicY) ||
693 ((nz < 0 || nz >= lZ) && ld->PeriodicZ)
694 ){
695 // x periodic
696
697 if (nx < 0) {
698 px = lX - 1;
699 }
700 else if (nx >= lX) {
701 px = 0;
702 } else {
703 px = nx;
704 }
705 // y periodic
706 if (ny < 0) {
707 py = lY - 1;
708 }
709 else if (ny >= lY) {
710 py = 0;
711 } else {
712 py = ny;
713 }
714
715 // z periodic
716 if (nz < 0) {
717 pz = lZ - 1;
718 }
719 else if (nz >= lZ) {
720 pz = 0;
721 } else {
722 pz = nz;
723 }
724
725 if (ld->Lattice[L_INDEX_4(lDims, px, py, pz)] == LAT_CELL_OBSTACLE) {
726 dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
727 }
728 else {
729 neighborIndex = grid[L_INDEX_4(lDims, px, py, pz)];
730
731 AssertMsg(neighborIndex != ~0, "Neighbor has no Index. (%d %d %d) direction %s (%d)\n", px, py, pz, D3Q19_NAMES[d], d);
732
733 dstIndex = P_INDEX_3(nCells, neighborIndex, d);
734 }
735 }
736 else if (nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) {
737 dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
738 }
739 else if (ld->Lattice[L_INDEX_4(lDims, nx, ny, nz)] == LAT_CELL_OBSTACLE) {
740 dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
741 }
742 else {
743 neighborIndex = grid[L_INDEX_4(lDims, nx, ny, nz)];
744
745 Assert(neighborIndex != ~0);
746
747 dstIndex = P_INDEX_3(nCells, neighborIndex, d);
748 }
749
750 Assert(dstIndex >= 0);
751 Assert(dstIndex < nCells * N_D3Q19);
752
753 adjList[index * N_D3Q19_IDX + d] = dstIndex;
754 }
755 }
756
757 int nThreads = 1;
758
759#ifdef _OPENMP
760 nThreads = omp_get_max_threads();
761#endif
762
763 SetupConsecNodes(ld, KDLR(kd), nThreads);
764
765
766 // Fill remaining KernelData structures
767 kd->GetNode = GetNode;
768 kd->SetNode = SetNode;
769
770 kd->BoundaryConditionsGetPdf = FNAME(BCGetPdf);
771 kd->BoundaryConditionsSetPdf = FNAME(BCSetPdf);
772
773 kd->Kernel = NULL; // FNAME(KernelPullSplitNt2S);
774
775 kd->DstPdfs = NULL;
776 kd->PdfsActive = kd->Pdfs[0];
777
778 return;
779}
780
781void FNAME(D3Q19ListPullSplitNt1SInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
782{
783 FNAME(Init)(ld, kernelData, params);
784 (*kernelData)->Kernel = FNAME(KernelPullSplitNt1S);
785
786 double loopBalance = 2.0 * 19 * sizeof(PdfT) + (18 * 4.0);
787 printf("# loop balance: %.2f B/FLUP\n", loopBalance);
788}
789
790void FNAME(D3Q19ListPullSplitNt2SInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
791{
792 FNAME(Init)(ld, kernelData, params);
793 (*kernelData)->Kernel = FNAME(KernelPullSplitNt2S);
794
795 double loopBalance = 2.0 * 19 * sizeof(PdfT) + (18 * 4.0);
796 printf("# loop balance: %.2f B/FLUP\n", loopBalance);
797}
798
799
800void FNAME(D3Q19ListPullSplitNtDeinit)(LatticeDesc * ld, KernelData ** kernelData)
801{
802 KernelDataListRia ** kdlr = (KernelDataListRia **)kernelData;
803
804 MemFree((void **)&((*kdlr)->ConsecNodes));
805
806 if ((*kdlr)->ConsecThreadIndices != NULL) {
807 MemFree((void **)&((*kdlr)->ConsecThreadIndices));
808 }
809
810 KernelDataList ** kdl = (KernelDataList **)kernelData;
811
812 MemFree((void **)&((*kdl)->AdjList));
813 MemFree((void **)&((*kdl)->Coords));
814 MemFree((void **)&((*kdl)->Grid));
815
816 MemFree((void **)&((*kernelData)->Pdfs[0]));
817 MemFree((void **)&((*kernelData)->Pdfs[1]));
818
819 MemFree((void **)kernelData);
820 return;
821}
822
This page took 0.134311 seconds and 5 git commands to generate.