version 0.1
[LbmBenchmarkKernelsPublic.git] / src / BenchKernelD3Q19ListAaPvCommon.c
CommitLineData
10988083
MW
1// --------------------------------------------------------------------------
2//
3// Copyright
4// Markus Wittmann, 2016-2017
5// RRZE, University of Erlangen-Nuremberg, Germany
6// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
7//
8// Viktor Haag, 2016
9// LSS, University of Erlangen-Nuremberg, Germany
10//
11// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
12//
13// LbmBenchKernels is free software: you can redistribute it and/or modify
14// it under the terms of the GNU General Public License as published by
15// the Free Software Foundation, either version 3 of the License, or
16// (at your option) any later version.
17//
18// LbmBenchKernels is distributed in the hope that it will be useful,
19// but WITHOUT ANY WARRANTY; without even the implied warranty of
20// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21// GNU General Public License for more details.
22//
23// You should have received a copy of the GNU General Public License
24// along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>.
25//
26// --------------------------------------------------------------------------
27#include "BenchKernelD3Q19ListAaPvCommon.h"
28
29#include "Memory.h"
30#include "Vtk.h"
31
32#include <math.h>
33
34#ifdef _OPENMP
35 #include <omp.h>
36#endif
37
38// Forward definition.
39void FNAME(D3Q19ListAaPvKernel)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd);
40
41
42
43
44// -----------------------------------------------------------------------
45// Functions which are used as callback by the kernel to read or write
46// PDFs and nodes.
47
48static void FNAME(BCGetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT * pdf)
49{
50 Assert(kd != NULL);
51 Assert(kd->PdfsActive != NULL);
52 Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
53 Assert(pdf != NULL);
54
55 Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
56 Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
57 Assert(dir >= 0); Assert(dir < N_D3Q19);
58
59 KernelDataList * kdl = (KernelDataList *)kd;
60
61 if (kdl->Iteration % 2 == 0) {
62 // Pdfs are stored inverse, local PDFs are located in remote nodes
63
64 uint32_t nodeIndex = KDL(kd)->Grid[L_INDEX_4(kd->Dims, x, y, z)];
65
66 if (dir != D3Q19_C) {
67 uint32_t adjListIndex = nodeIndex * N_D3Q19_IDX;
68
69 *pdf = kd->PdfsActive[KDL(kd)->AdjList[adjListIndex + D3Q19_INV[dir]]];
70 }
71 else {
72 *pdf = kd->PdfsActive[P_INDEX_3(KDL(kd)->nCells, nodeIndex, dir)];
73 }
74
75 }
76 else {
77 *pdf = kd->PdfsActive[P_INDEX_5(KDL(kd), x, y, z, dir)];
78 }
79
80
81 return;
82}
83
84static void FNAME(BCSetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT pdf)
85{
86 Assert(kd != NULL);
87 Assert(kd->PdfsActive != NULL);
88 Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
89 Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
90 Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
91 Assert(dir >= 0); Assert(dir < N_D3Q19);
92
93#if 0
94 if (isnan(pdf)) {
95 printf("ERROR: setting nan %d %d %d %d %s\n", x, y, z, dir, D3Q19_NAMES[dir]);
96 DEBUG_BREAK_POINT();
97 exit(1);
98 }
99#endif
100
101 KernelDataList * kdl = (KernelDataList *)kd;
102
103 if (kdl->Iteration % 2 == 0) {
104 // Pdfs are stored inverse, local PDFs are located in remote nodes
105
106 uint32_t nodeIndex = KDL(kd)->Grid[L_INDEX_4(kd->Dims, x, y, z)];
107
108 if (dir != D3Q19_C) {
109 uint32_t adjListIndex = nodeIndex * N_D3Q19_IDX;
110
111 kd->PdfsActive[KDL(kd)->AdjList[adjListIndex + D3Q19_INV[dir]]] = pdf;
112 }
113 else {
114 kd->PdfsActive[P_INDEX_3(KDL(kd)->nCells, nodeIndex, dir)] = pdf;
115 }
116
117 }
118 else {
119 kd->PdfsActive[P_INDEX_5(KDL(kd), x, y, z, dir)] = pdf;
120 }
121
122 return;
123}
124
125
126static void GetNode(KernelData * kd, int x, int y, int z, PdfT * pdfs)
127{
128 Assert(kd != NULL);
129 Assert(kd->PdfsActive != NULL);
130 Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
131 Assert(pdfs != NULL);
132 Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
133 Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
134
135 KernelDataList * kdl = (KernelDataList *)kd;
136
137 if(kdl->Iteration % 2 == 0){
138
139 uint32_t nodeIndex = kdl->Grid[L_INDEX_4(kdl->kd.Dims, x, y, z)];
140 uint32_t adjListIndex = nodeIndex * N_D3Q19_IDX;
141
142 // Load PDFs of local cell: pdf_N = src[adjList[adjListIndex + D3Q19_S]]; ...
143 pdfs[D3Q19_C] = kd->PdfsActive[P_INDEX_3(kdl->nCells, nodeIndex, D3Q19_C)];
144
145 #define X(name, idx, idxinv, _x, _y, _z) pdfs[idx] = kd->PdfsActive[kdl->AdjList[adjListIndex + idxinv]];
146 D3Q19_LIST_WO_C
147 #undef X
148
149 } else {
150
151 #define I(x, y, z, dir) P_INDEX_5(KDL(kd), (x), (y), (z), (dir))
152 #define X(name, idx, idxinv, _x, _y, _z) pdfs[idx] = kd->PdfsActive[I(x, y, z, idx)];
153 D3Q19_LIST
154 #undef X
155 #undef I
156
157 }
158
159#if 0
160 for (int d = 0; d < 19; ++d) {
161 if(isnan(pdfs[d]) || isinf(pdfs[d])) {
162 printf("%d %d %d %d nan! get node\n", x, y, z, d);
163 for (int d2 = 0; d2 < 19; ++d2) {
164 printf("%d: %e\n", d2, pdfs[d2]);
165 }
166 exit(1);
167 }
168 }
169#endif
170
171 return;
172}
173
174
175static void SetNode(KernelData * kd, int x, int y, int z, PdfT * pdfs)
176{
177 Assert(kd != NULL);
178 Assert(kd->PdfsActive != NULL);
179 Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
180 Assert(pdfs != NULL);
181
182 Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
183 Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
184
185#if 0
186 for (int d = 0; d < 19; ++d) {
187 if(isnan(pdfs[d])) {
188 printf("%d %d %d %d nan! get node\n", x, y, z, d);
189 for (int d2 = 0; d2 < 19; ++d2) {
190 printf("%d: %e\n", d2, pdfs[d2]);
191 }
192 exit(1);
193 }
194 }
195#endif
196
197 KernelDataList * kdl = (KernelDataList *)kd;
198
199 if(kdl->Iteration % 2 == 0){
200
201 uint32_t nodeIndex = kdl->Grid[L_INDEX_4(kdl->kd.Dims, x, y, z)];
202 uint32_t adjListIndex = nodeIndex * N_D3Q19_IDX;
203
204 // Load PDFs of local cell: pdf_N = src[adjList[adjListIndex + D3Q19_S]]; ...
205 kd->PdfsActive[P_INDEX_3(kdl->nCells, nodeIndex, D3Q19_C)] = pdfs[D3Q19_C];
206
207 #define X(name, idx, idxinv, _x, _y, _z) kd->PdfsActive[kdl->AdjList[adjListIndex + idxinv]] = pdfs[idx];
208 D3Q19_LIST_WO_C
209 #undef X
210
211 } else {
212
213 #define I(x, y, z, dir) P_INDEX_5(KDL(kd), (x), (y), (z), (dir))
214 #define X(name, idx, idxinv, _x, _y, _z) kd->PdfsActive[I(x, y, z, idx)] = pdfs[idx];
215 D3Q19_LIST
216 #undef X
217 #undef I
218
219 }
220
221 return;
222}
223
224static void ParameterUsage()
225{
226 printf("Kernel parameters:\n");
227 printf(" [-blk <n>] [-blk-[xyz] <n>]\n");
228
229 return;
230}
231
232static void ParseParameters(Parameters * params, int * blk)
233{
234 Assert(blk != NULL);
235
236 blk[0] = 0; blk[1] = 0; blk[2] = 0;
237
238 #define ARG_IS(param) (!strcmp(params->KernelArgs[i], param))
239 #define NEXT_ARG_PRESENT() \
240 do { \
241 if (i + 1 >= params->nKernelArgs) { \
242 printf("ERROR: argument %s requires a parameter.\n", params->KernelArgs[i]); \
243 exit(1); \
244 } \
245 } while (0)
246
247
248 for (int i = 0; i < params->nKernelArgs; ++i) {
249 if (ARG_IS("-blk") || ARG_IS("--blk")) {
250 NEXT_ARG_PRESENT();
251
252 int tmp = strtol(params->KernelArgs[++i], NULL, 0);
253
254 if (tmp <= 0) {
255 printf("ERROR: blocking parameter must be > 0.\n");
256 exit(1);
257 }
258
259 blk[0] = blk[1] = blk[2] = tmp;
260 }
261 else if (ARG_IS("-blk-x") || ARG_IS("--blk-x")) {
262 NEXT_ARG_PRESENT();
263
264 int tmp = strtol(params->KernelArgs[++i], NULL, 0);
265
266 if (tmp <= 0) {
267 printf("ERROR: blocking parameter must be > 0.\n");
268 exit(1);
269 }
270
271 blk[0] = tmp;
272 }
273 else if (ARG_IS("-blk-y") || ARG_IS("--blk-y")) {
274 NEXT_ARG_PRESENT();
275
276 int tmp = strtol(params->KernelArgs[++i], NULL, 0);
277
278 if (tmp <= 0) {
279 printf("ERROR: blocking parameter must be > 0.\n");
280 exit(1);
281 }
282
283 blk[1] = tmp;
284 }
285 else if (ARG_IS("-blk-z") || ARG_IS("--blk-z")) {
286 NEXT_ARG_PRESENT();
287
288 int tmp = strtol(params->KernelArgs[++i], NULL, 0);
289
290 if (tmp <= 0) {
291 printf("ERROR: blocking parameter must be > 0.\n");
292 exit(1);
293 }
294
295 blk[2] = tmp;
296 }
297 else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) {
298 ParameterUsage();
299 exit(1);
300 }
301 else {
302 printf("ERROR: unknown kernel parameter.\n");
303 ParameterUsage();
304 exit(1);
305 }
306 }
307
308 #undef ARG_IS
309 #undef NEXT_ARG_PRESENT
310
311 return;
312}
313
314static void SetupConsecNodes(LatticeDesc * ld, KernelDataListRia * kdlr, int nThreads)
315{
316 Assert(ld != NULL);
317 Assert(kdlr != NULL);
318 Assert(nThreads > 0);
319
320 uint32_t * adjList = kdlr->kdl.AdjList;
321
322 uint32_t nConsecNodes = 0;
323 uint32_t consecIndex = 0;
324
325 int nFluid = kdlr->kdl.nFluid;
326
327 uint32_t * consecThreadIndices = (uint32_t *)malloc(sizeof(uint32_t) * (nThreads + 1));
328 int * fluidNodeThreadIndices = (int *)malloc(sizeof(int) * (nThreads + 1));
329
330 int nNodesPerThread = nFluid / nThreads;
331
332 for (int i = 0; i < nThreads; ++i) {
333 consecThreadIndices[i] = i * nNodesPerThread + MinI(i, nFluid % nThreads);
334 fluidNodeThreadIndices[i] = consecThreadIndices[i];
335 }
336 consecThreadIndices[nThreads] = -1;
337 fluidNodeThreadIndices[nThreads] = nFluid;
338
339 int indexThread = 1;
340
341 // We execute following code two times.
342 // - The first time to get the count of how many entries we need for the
343 // consecNodes array.
344 // - The second time to fill the array.
345
346 // Loop over adjacency list of all nodes.
347 // Compare if adjacent nodes share the same access pattern.
348 for (int index = 1; index < nFluid; ++index) {
349
350 int different = 0;
351
352 // Loop over all directions except the center one.
353 for(int d = 0; d < N_D3Q19 - 1; ++d) {
354 Assert(d != D3Q19_C);
355
356 if (adjList[index * N_D3Q19_IDX + d] != adjList[(index - 1) * N_D3Q19_IDX + d] + 1) {
357 // Different access pattern.
358 different = 1;
359 break;
360 }
361 }
362
363 if (consecThreadIndices[indexThread] == index) {
364 // We are at a thread boundary. Starting from this index the fluids
365 // belong to another thread. Force a break, if nodes are consecutive.
366 ++indexThread;
367 different = 1;
368 }
369
370 if (different) {
371 ++consecIndex;
372 }
373 }
374
375 if (nFluid > 0) {
376 nConsecNodes = consecIndex + 1;
377 }
378
379 uint32_t * consecNodes;
380 MemAlloc((void **)&consecNodes, sizeof(uint32_t) * nConsecNodes);
381
382 consecIndex = 0;
383
384 if (nFluid > 0) {
385 consecNodes[consecIndex] = 1;
386 }
387
388 indexThread = 1;
389 consecThreadIndices[0] = 0;
390
391 // Loop over adjacency list of all nodes.
392 // Compare if adjacent nodes share the same access pattern.
393 for (int index = 1; index < nFluid; ++index) {
394
395 int different = 0;
396
397 // Loop over all directions except the center one.
398 for(int d = 0; d < N_D3Q19 - 1; ++d) {
399 Assert(d != D3Q19_C);
400
401 if (adjList[index * N_D3Q19_IDX + d] != adjList[(index - 1) * N_D3Q19_IDX + d] + 1) {
402 // Different access pattern.
403 different = 1;
404 break;
405 }
406 }
407
408 if (consecThreadIndices[indexThread] == index) {
409 // We are at a thread boundary. Starting from this index the fluids
410 // belong to another thread. Force a break, if nodes are consecutive.
411 consecThreadIndices[indexThread] = consecIndex + 1;
412 ++indexThread;
413 different = 1;
414 }
415
416 if (different) {
417 ++consecIndex;
418 Assert(consecIndex < nConsecNodes);
419 consecNodes[consecIndex] = 1;
420 }
421 else {
422 Assert(consecIndex < nConsecNodes);
423 consecNodes[consecIndex] += 1;
424 }
425 }
426
427
428 kdlr->ConsecNodes = consecNodes;
429 kdlr->nConsecNodes = nConsecNodes;
430
431 kdlr->ConsecThreadIndices = consecThreadIndices;
432 kdlr->nConsecThreadIndices = nThreads;
433
434 kdlr->FluidNodeThreadIndices = fluidNodeThreadIndices;
435 kdlr->nFluidNodeThreadIndices = nThreads;
436
437 printf("# total fluid nodes: %d consecutive blocks: %d\n", nFluid, nConsecNodes);
438
439 uint32_t vwidth[] = {2, 4, 8, 16, 32};
440 uint32_t vectorizable[] = {0, 0, 0, 0, 0};
441
442 for (int i = 0; i < nConsecNodes; ++i) {
443 for (int k = 0; k < N_ELEMS(vwidth); ++k) {
444 vectorizable[k] += consecNodes[i] / vwidth[k];
445 }
446 }
447
448 printf("# vectorizable fraction of fluid node updates:\n");
449 for (int i = 0; i < N_ELEMS(vwidth); ++i) {
450
451 printf("# vector width: %2d %6.2f %% (%u/%u fluid nodes)\n",
452 vwidth[i], (double)vectorizable[i] * vwidth[i] / nFluid * 100.0,
453 vectorizable[i] * vwidth[i], nFluid);
454 }
455
456 return;
457}
458
459void FNAME(D3Q19ListAaPvInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
460{
461 KernelData * kd;
462 KernelDataList * kdl;
463 KernelDataListRia * kdlr;
464 MemAlloc((void **)&kdlr, sizeof(KernelDataListRia));
465
466 kd = (KernelData *)kdlr;
467 kdl = KDL(kdlr);
468
469 *kernelData = kd;
470
471#ifdef DEBUG
472 kd->Pdfs[0] = NULL;
473 kd->Pdfs[1] = NULL;
474 kd->PdfsActive = NULL;
475 kd->DstPdfs = NULL;
476 kd->SrcPdfs = NULL;
477 kd->Dims[0] = -1;
478 kd->Dims[1] = -1;
479 kd->Dims[2] = -1;
480 kd->GlobalDims[0] = -1;
481 kd->GlobalDims[1] = -1;
482 kd->GlobalDims[2] = -1;
483 kd->Offsets[0] = -1;
484 kd->Offsets[1] = -1;
485 kd->Offsets[2] = -1;
486
487 kd->ObstIndices = NULL;
488 kd->nObstIndices = -1;
489 kd->BounceBackPdfsSrc = NULL;
490 kd->BounceBackPdfsDst = NULL;
491 kd->nBounceBackPdfs = -1;
492
493 kdl->AdjList = NULL;
494 kdl->Coords = NULL;
495 kdl->Grid = NULL;
496 kdl->nCells = -1;
497 kdl->nFluid = -1;
498
499 kdlr->ConsecNodes = NULL;
500 kdlr->nConsecNodes = 0;
501 kdlr->ConsecThreadIndices = NULL;
502 kdlr->nConsecThreadIndices = 0;
503#endif
504
505 // Ajust the dimensions according to padding, if used.
506 kd->Dims[0] = kd->GlobalDims[0] = ld->Dims[0];
507 kd->Dims[1] = kd->GlobalDims[1] = ld->Dims[1];
508 kd->Dims[2] = kd->GlobalDims[2] = ld->Dims[2];
509
510 int * lDims = ld->Dims;
511
512 int lX = lDims[0];
513 int lY = lDims[1];
514 int lZ = lDims[2];
515
516 int nTotalCells = lX * lY * lZ;
517 int nCells = ld->nFluid; // TODO: + padding
518 int nFluid = ld->nFluid;
519
520 kdl->nCells = nCells;
521 kdl->nFluid = nFluid;
522
523 PdfT * pdfs[2];
524
525 int blk[3] = { 0 };
526
527 ParseParameters(params, blk);
528
529 if (blk[0] == 0) blk[0] = lX;
530 if (blk[1] == 0) blk[1] = lY;
531 if (blk[2] == 0) blk[2] = lZ;
532
533 printf("# blocking x: %3d y: %3d z: %3d\n", blk[0], blk[1], blk[2]);
534
535 double latMiB = nCells * sizeof(PdfT) * N_D3Q19 / 1024.0 / 1024.0;
536 double latFluidMib = nFluid * sizeof(PdfT) * N_D3Q19 / 1024.0 / 1024.0;
537 double latPadMib = (nCells - nFluid) * sizeof(PdfT) * N_D3Q19 / 1024.0 / 1024.0;
538
539 printf("# lattice size: %e MiB\n", latMiB);
540 printf("# fluid lattice size: %e MiB\n", latFluidMib);
541 printf("# lattice padding: %e MiB\n", latPadMib);
542
543#define PAGE_4K 4096
544
545 printf("# aligning lattices to: %d b\n", PAGE_4K);
546
547 MemAllocAligned((void **)&pdfs[0], sizeof(PdfT) * nCells * N_D3Q19, PAGE_4K);
548
549 kd->Pdfs[0] = pdfs[0];
550
551 // Initialize PDFs with some (arbitrary) data for correct NUMA placement.
552 // Here we touch only the fluid nodes as this loop is OpenMP parallel and
553 // we want the same scheduling as in the kernel.
554 #ifdef _OPENMP
555 #pragma omp parallel for
556 #endif
557 for (int i = 0; i < nFluid; ++i) { for(int d = 0; d < N_D3Q19; ++d) {
558 pdfs[0][P_INDEX_3(nCells, i, d)] = 1.0;
559 } }
560
561 // Initialize all PDFs to some standard value.
562 for (int i = 0; i < nFluid; ++i) { for(int d = 0; d < N_D3Q19; ++d) {
563 pdfs[0][P_INDEX_3(nCells, i, d)] = 0.0;
564 } }
565
566 // ----------------------------------------------------------------------
567 // create grid which will hold the index numbers of the fluid nodes
568
569 uint32_t * grid;
570
571 if (MemAlloc((void **)&grid, nTotalCells * sizeof(uint32_t))) {
572 printf("ERROR: allocating grid for numbering failed: %lu bytes.\n", nTotalCells * sizeof(uint32_t));
573 exit(1);
574 }
575 kdl->Grid = grid;
576
577 int latticeIndex;
578
579#ifdef DEBUG
580 for(int z = 0; z < lZ; ++z) {
581 for(int y = 0; y < lY; ++y) {
582 for(int x = 0; x < lX; ++x) {
583
584 latticeIndex = L_INDEX_4(ld->Dims, x, y, z);
585
586 grid[latticeIndex] = ~0;
587 }
588 }
589 }
590#endif
591
592 // ----------------------------------------------------------------------
593 // generate numbering over grid
594
595 uint32_t * coords;
596
597 if (MemAlloc((void **)&coords, nFluid * sizeof(uint32_t) * 3)) {
598 printf("ERROR: allocating coords array failed: %lu bytes.\n", nFluid * sizeof(uint32_t) * 3);
599 exit(1);
600 }
601
602 kdl->Coords = coords;
603
604 // Index for the PDF nodes can start at 0 as we distinguish solid and fluid nodes
605 // through the ld->Lattice array.
606 int counter = 0;
607
608 // Blocking is implemented via setup of the adjacency list. The kernel later will
609 // walk through the lattice blocked automatically.
610 for (int bZ = 0; bZ < lZ; bZ += blk[2]) {
611 for (int bY = 0; bY < lY; bY += blk[1]) {
612 for (int bX = 0; bX < lX; bX += blk[0]) {
613
614 int eX = MIN(bX + blk[0], lX);
615 int eY = MIN(bY + blk[1], lY);
616 int eZ = MIN(bZ + blk[2], lZ);
617
618
619 for (int z = bZ; z < eZ; ++z) {
620 for (int y = bY; y < eY; ++y) {
621 for (int x = bX; x < eX; ++x) {
622
623 latticeIndex = L_INDEX_4(lDims, x, y, z);
624
625 if (ld->Lattice[latticeIndex] != LAT_CELL_OBSTACLE) {
626 grid[latticeIndex] = counter;
627
628 coords[C_INDEX_X(counter)] = x;
629 coords[C_INDEX_Y(counter)] = y;
630 coords[C_INDEX_Z(counter)] = z;
631
632 ++counter;
633 }
634 } } }
635 } } }
636
637 Verify(counter == nFluid);
638
639 uint32_t * adjList;
640
641 double indexMib = nFluid * sizeof(uint32_t) * N_D3Q19_IDX / 1024.0 / 1024.0;
642
643 printf("# index size: %e MiB\n", indexMib);
644
645 // AdjList only requires 18 instead of 19 entries per node, as
646 // the center PDF needs no addressing.
647 if (MemAlloc((void **)&adjList, nFluid * sizeof(uint32_t) * N_D3Q19_IDX)) {
648 printf("ERROR: allocating adjList array failed: %lu bytes.\n", nFluid * sizeof(uint32_t) * N_D3Q19_IDX);
649 exit(1);
650 }
651
652 kdl->AdjList = adjList;
653
654 int x, y, z;
655
656 uint32_t neighborIndex;
657 uint32_t dstIndex;
658
659 int nx, ny, nz, px, py, pz;
660
661 // Loop over all fluid nodes and compute the indices to the neighboring
662 // PDFs for configured data layout (AoS/SoA).
663 // TODO: Parallelized loop to ensure correct NUMA placement.
664 // #ifdef _OPENMP --> add line continuation
665 // #pragma omp parallel for default(none)
666 // shared(nFluid, nCells, coords, D3Q19_INV, D3Q19_X, D3Q19_Y, D3Q19_Z,
667 // stderr,
668 // lDims, grid, ld, lX, lY, lZ, adjList)
669 // private(x, y, z, nx, ny, nz, neighborIndex, dstIndex)
670 // #endif
671 for (int index = 0; index < nFluid; ++index) {
672 x = coords[C_INDEX_X(index)];
673 y = coords[C_INDEX_Y(index)];
674 z = coords[C_INDEX_Z(index)];
675
676 Assert(x >= 0 && x < lX);
677 Assert(y >= 0 && y < lY);
678 Assert(z >= 0 && z < lZ);
679
680 Assert(ld->Lattice[L_INDEX_4(lDims, x, y, z)] != LAT_CELL_OBSTACLE);
681
682 // Loop over all directions except the center one.
683 for(int d = 0; d < N_D3Q19 - 1; ++d) {
684 Assert(d != D3Q19_C);
685
686#ifdef PROP_MODEL_PUSH
687 nx = x + D3Q19_X[d];
688 ny = y + D3Q19_Y[d];
689 nz = z + D3Q19_Z[d];
690
691#elif PROP_MODEL_PULL
692 nx = x - D3Q19_X[d];
693 ny = y - D3Q19_Y[d];
694 nz = z - D3Q19_Z[d];
695#else
696 #error No implementation for this PROP_MODEL_NAME.
697#endif
698 // If the neighbor is outside the latcie in X direction and we have a
699 // periodic boundary then we need to wrap around.
700 if ( ((nx < 0 || nx >= lX) && ld->PeriodicX) ||
701 ((ny < 0 || ny >= lY) && ld->PeriodicY) ||
702 ((nz < 0 || nz >= lZ) && ld->PeriodicZ)
703 ){
704 // x periodic
705
706 if (nx < 0) {
707 px = lX - 1;
708 }
709 else if (nx >= lX) {
710 px = 0;
711 } else {
712 px = nx;
713 }
714 // y periodic
715 if (ny < 0) {
716 py = lY - 1;
717 }
718 else if (ny >= lY) {
719 py = 0;
720 } else {
721 py = ny;
722 }
723
724 // z periodic
725 if (nz < 0) {
726 pz = lZ - 1;
727 }
728 else if (nz >= lZ) {
729 pz = 0;
730 } else {
731 pz = nz;
732 }
733
734 if (ld->Lattice[L_INDEX_4(lDims, px, py, pz)] == LAT_CELL_OBSTACLE) {
735 dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
736 }
737 else {
738 neighborIndex = grid[L_INDEX_4(lDims, px, py, pz)];
739
740 AssertMsg(neighborIndex != ~0, "Neighbor has no Index. (%d %d %d) direction %s (%d)\n", px, py, pz, D3Q19_NAMES[d], d);
741
742 dstIndex = P_INDEX_3(nCells, neighborIndex, d);
743 }
744 }
745 else if (nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) {
746 dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
747 }
748 else if (ld->Lattice[L_INDEX_4(lDims, nx, ny, nz)] == LAT_CELL_OBSTACLE) {
749 dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
750 }
751 else {
752 neighborIndex = grid[L_INDEX_4(lDims, nx, ny, nz)];
753
754 Assert(neighborIndex != ~0);
755
756 dstIndex = P_INDEX_3(nCells, neighborIndex, d);
757 }
758
759 Assert(dstIndex >= 0);
760 Assert(dstIndex < nCells * N_D3Q19);
761
762 adjList[index * N_D3Q19_IDX + d] = dstIndex;
763 }
764 }
765
766 int nThreads = 1;
767
768#ifdef _OPENMP
769 nThreads = omp_get_max_threads();
770#endif
771
772 SetupConsecNodes(ld, KDLR(kd), nThreads);
773
774 double loopBalanceEven = 2.0 * 19 * sizeof(PdfT);
775 double loopBalanceOdd = 2.0 * 19 * sizeof(PdfT) + (double)kdlr->nConsecNodes / nFluid * (18 * 4.0 + 4.0);
776 double loopBalance = (loopBalanceEven + loopBalanceOdd) / 2.0;
777
778 printf("# loop balance: %.2f B/FLUP even: %.2f B/FLUP odd %.2f B/FLUP\n",
779 loopBalance, loopBalanceEven, loopBalanceOdd);
780
781 // Fill remaining KernelData structures
782 kd->GetNode = GetNode;
783 kd->SetNode = SetNode;
784
785 kd->BoundaryConditionsGetPdf = FNAME(BCGetPdf);
786 kd->BoundaryConditionsSetPdf = FNAME(BCSetPdf);
787
788 kd->Kernel = FNAME(D3Q19ListAaPvKernel);
789
790 kd->DstPdfs = NULL;
791 kd->PdfsActive = kd->Pdfs[0];
792
793 return;
794}
795
796void FNAME(D3Q19ListAaPvDeinit)(LatticeDesc * ld, KernelData ** kernelData)
797{
798 KernelDataListRia ** kdlr = (KernelDataListRia **)kernelData;
799
800 MemFree((void **)&((*kdlr)->ConsecNodes));
801
802 if ((*kdlr)->ConsecThreadIndices != NULL) {
803 MemFree((void **)&((*kdlr)->ConsecThreadIndices));
804 }
805
806 if ((*kdlr)->FluidNodeThreadIndices != NULL) {
807 MemFree((void **)&((*kdlr)->FluidNodeThreadIndices));
808 }
809
810 KernelDataList ** kdl = (KernelDataList **)kernelData;
811
812 MemFree((void **)&((*kdl)->AdjList));
813 MemFree((void **)&((*kdl)->Coords));
814 MemFree((void **)&((*kdl)->Grid));
815
816 MemFree((void **)&((*kernelData)->Pdfs[0]));
817
818 MemFree((void **)kernelData);
819 return;
820}
821
This page took 0.109059 seconds and 5 git commands to generate.