merge with kernels from MH's master thesis
[LbmBenchmarkKernelsPublic.git] / src / BenchKernelD3Q19ListAaPvGatherHybridCommon.c
1 // --------------------------------------------------------------------------
2 //
3 // Copyright
4 //   Markus Wittmann, 2016-2017
5 //   RRZE, University of Erlangen-Nuremberg, Germany
6 //   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
7 //
8 //   Viktor Haag, 2016
9 //   LSS, University of Erlangen-Nuremberg, Germany
10 //
11 //   Michael Hussnaetter, 2017-2018
12 //   University of Erlangen-Nuremberg, Germany
13 //   michael.hussnaetter -at- fau.de
14 //
15 //  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
16 //
17 //  LbmBenchKernels is free software: you can redistribute it and/or modify
18 //  it under the terms of the GNU General Public License as published by
19 //  the Free Software Foundation, either version 3 of the License, or
20 //  (at your option) any later version.
21 //
22 //  LbmBenchKernels is distributed in the hope that it will be useful,
23 //  but WITHOUT ANY WARRANTY; without even the implied warranty of
24 //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
25 //  GNU General Public License for more details.
26 //
27 //  You should have received a copy of the GNU General Public License
28 //  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
29 //
30 // --------------------------------------------------------------------------
31 #include "BenchKernelD3Q19ListAaPvGatherHybridCommon.h"
32
33 #include "Memory.h"
34 #include "Vtk.h"
35 #include "Vector.h"
36
37 #include <math.h>
38
39 #ifdef _OPENMP
40 #include <omp.h>
41 #endif
42
43 #define PAGE_4K         4096
44
45 #if ALLOC_ADJ_LIST_IN_HBM == 1
46 #define ADJ_LIST_ALLOCATOR HbwAllocAligned
47 #define ADJ_LIST_FREE HbwFree
48 #else
49 #define ADJ_LIST_ALLOCATOR MemAllocAligned
50 #define ADJ_LIST_FREE MemFree
51 #endif
52
53 #if ALLOC_PDF_IN_HBM == 1
54 #define PDF_ALLOCATOR HbwAllocAligned
55 #define PDF_FREE HbwFree
56 #else
57 #define PDF_ALLOCATOR MemAllocAligned
58 #define PDF_FREE MemFree
59 #endif
60
61 // Forward definition.
62 void FNAME(D3Q19ListAaPvGatherHybridKernel)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd);
63
64
65
66 // -----------------------------------------------------------------------
67 // Functions which are used as callback by the kernel to read or write
68 // PDFs and nodes.
69
70 static void FNAME(BCGetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT * pdf)
71 {
72         Assert(kd != NULL);
73         Assert(kd->PdfsActive != NULL);
74         Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
75         Assert(pdf != NULL);
76
77         Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
78         Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
79         Assert(dir >= 0); Assert(dir < N_D3Q19);
80
81         KernelDataList * kdl = KDL(kd);
82         uint32_t * adjList = kdl->AdjList;
83
84         if (kdl->Iteration % 2 == 0) {
85                 // Pdfs are stored inverse, local PDFs are located in remote nodes
86
87                 // getting node index
88                 uint32_t index = kdl->Grid[L_INDEX_4(kd->Dims, x, y, z)];
89
90                 if (dir != D3Q19_C) {
91                         #define ADJ_LIST(dir) adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (dir * VSIZE) + (index % VSIZE)]
92                         *pdf = kd->PdfsActive[ADJ_LIST(D3Q19_INV[dir])];
93                         #undef ADJ_LIST
94                 }
95                 else {
96                         *pdf = kd->PdfsActive[P_INDEX_3(kdl->nCells, index, dir)];
97                 }
98
99         }
100         else {
101                 *pdf = kd->PdfsActive[P_INDEX_5(kdl, x, y, z, dir)];
102         }
103
104         return;
105 }
106
107 static void FNAME(BCSetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT pdf)
108 {
109         Assert(kd != NULL);
110         Assert(kd->PdfsActive != NULL);
111         Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
112         Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
113         Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
114         Assert(dir >= 0); Assert(dir < N_D3Q19);
115
116         if (isnan(pdf)) {
117                 printf("ERROR: setting nan %d %d %d %d %s\n", x, y, z, dir, D3Q19_NAMES[dir]);
118                 DEBUG_BREAK_POINT();
119                 exit(1);
120         }
121
122         KernelDataList * kdl = KDL(kd);
123         uint32_t * adjList = kdl->AdjList;
124
125         if (kdl->Iteration % 2 == 0) {
126                 // Pdfs are stored inverse, local PDFs are located in remote nodes
127
128                 // getting node index
129                 uint32_t index = kdl->Grid[L_INDEX_4(kd->Dims, x, y, z)];
130
131                 if (dir != D3Q19_C) {
132                         #define ADJ_LIST(dir) adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (dir * VSIZE) + (index % VSIZE)]
133                         kd->PdfsActive[ADJ_LIST(D3Q19_INV[dir])] = pdf;
134                         #undef ADJ_LIST
135                 } else {
136                         kd->PdfsActive[P_INDEX_3(kdl->nCells, index, dir)] = pdf;
137                 }
138
139         } else {
140                 kd->PdfsActive[P_INDEX_5(kdl, x, y, z, dir)] = pdf;
141         }
142
143         return;
144 }
145
146
147 static void GetNode(KernelData * kd, int x, int y, int z, PdfT * pdfs)
148 {
149         Assert(kd != NULL);
150         Assert(kd->PdfsActive != NULL);
151         Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
152         Assert(pdfs != NULL);
153         Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
154         Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
155
156         KernelDataList * kdl = KDL(kd);
157         uint32_t * adjList = kdl->AdjList;
158
159         if(kdl->Iteration % 2 == 0){
160
161                 uint32_t index = kdl->Grid[L_INDEX_4(kdl->kd.Dims, x, y, z)];
162
163                 // Load PDFs of local cell: pdf_N = src[adjList[adjListIndex + D3Q19_S]]; ...
164                 #define ADJ_LIST(dir) adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (dir * VSIZE) + (index % VSIZE)]
165                 #define X(name, idx, idxinv, _x, _y, _z)        pdfs[idx] = kd->PdfsActive[ADJ_LIST(idxinv)];
166                 D3Q19_LIST_WO_C
167                         #undef X
168                         #undef ADJ_LIST
169                         pdfs[D3Q19_C] = kd->PdfsActive[P_INDEX_3(kdl->nCells, index, D3Q19_C)];
170
171         } else {
172
173                 #define I(x, y, z, dir) P_INDEX_5(KDL(kd), (x), (y), (z), (dir))
174                 #define X(name, idx, idxinv, _x, _y, _z)        pdfs[idx] = kd->PdfsActive[I(x, y, z, idx)];
175                 D3Q19_LIST
176                         #undef X
177                         #undef I
178
179         }
180
181         for (int d = 0; d < 19; ++d) {
182                 if(isnan(pdfs[d]) || isinf(pdfs[d])) {
183                         printf("%d %d %d %d nan! get node\n", x, y, z, d);
184                         for (int d2 = 0; d2 < 19; ++d2) {
185                                 printf("%d: %e\n", d2, pdfs[d2]);
186                         }
187                         exit(1);
188                 }
189         }
190
191         return;
192 }
193
194
195 static void SetNode(KernelData * kd, int x, int y, int z, PdfT * pdfs)
196 {
197         Assert(kd != NULL);
198         Assert(kd->PdfsActive != NULL);
199         Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
200         Assert(pdfs != NULL);
201
202         Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
203         Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
204
205         for (int d = 0; d < 19; ++d) {
206                 if(isnan(pdfs[d])) {
207                         printf("%d %d %d %d nan! get node\n", x, y, z, d);
208                         for (int d2 = 0; d2 < 19; ++d2) {
209                                 printf("%d: %e\n", d2, pdfs[d2]);
210                         }
211                         exit(1);
212                 }
213         }
214
215         KernelDataList * kdl = KDL(kd);
216         uint32_t * adjList = kdl->AdjList;
217
218         if(kdl->Iteration % 2 == 0){
219
220                 uint32_t index = kdl->Grid[L_INDEX_4(kdl->kd.Dims, x, y, z)];
221
222                 // Load PDFs of local cell: pdf_N = src[adjList[adjListIndex + D3Q19_S]]; ...
223                 kd->PdfsActive[P_INDEX_3(kdl->nCells, index, D3Q19_C)] = pdfs[D3Q19_C];
224
225                 #define ADJ_LIST(dir) adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (dir * VSIZE) + (index % VSIZE)]
226                 #define X(name, idx, idxinv, _x, _y, _z)        kd->PdfsActive[ADJ_LIST(idxinv)] = pdfs[idx];
227                 D3Q19_LIST_WO_C
228                         #undef X
229                         #undef ADJ_LIST
230
231         } else {
232
233                 #define I(x, y, z, dir) P_INDEX_5(KDL(kd), (x), (y), (z), (dir))
234                 #define X(name, idx, idxinv, _x, _y, _z)        kd->PdfsActive[I(x, y, z, idx)] = pdfs[idx];
235                 D3Q19_LIST
236                         #undef X
237                         #undef I
238
239         }
240
241         return;
242 }
243
244 static void ParameterUsage()
245 {
246         printf("Kernel parameters:\n");
247         printf("  [-blk <n>] [-blk-[xyz] <n>]\n");
248
249         return;
250 }
251
252 static void ParseParameters(Parameters * params, int * blk)
253 {
254         Assert(blk != NULL);
255
256         blk[0] = 0; blk[1] = 0; blk[2] = 0;
257
258         #define ARG_IS(param)                   (!strcmp(params->KernelArgs[i], param))
259         #define NEXT_ARG_PRESENT() \
260         do { \
261                 if (i + 1 >= params->nKernelArgs) { \
262                         printf("ERROR: argument %s requires a parameter.\n", params->KernelArgs[i]); \
263                         exit(1); \
264                 } \
265         } while (0)
266
267
268         for (int i = 0; i < params->nKernelArgs; ++i) {
269                 if (ARG_IS("-blk") || ARG_IS("--blk")) {
270                         NEXT_ARG_PRESENT();
271
272                         int tmp = strtol(params->KernelArgs[++i], NULL, 0);
273
274                         if (tmp <= 0) {
275                                 printf("ERROR: blocking parameter must be > 0.\n");
276                                 exit(1);
277                         }
278
279                         blk[0] = blk[1] = blk[2] = tmp;
280                 }
281                 else if (ARG_IS("-blk-x") || ARG_IS("--blk-x")) {
282                         NEXT_ARG_PRESENT();
283
284                         int tmp = strtol(params->KernelArgs[++i], NULL, 0);
285
286                         if (tmp <= 0) {
287                                 printf("ERROR: blocking parameter must be > 0.\n");
288                                 exit(1);
289                         }
290
291                         blk[0] = tmp;
292                 }
293                 else if (ARG_IS("-blk-y") || ARG_IS("--blk-y")) {
294                         NEXT_ARG_PRESENT();
295
296                         int tmp = strtol(params->KernelArgs[++i], NULL, 0);
297
298                         if (tmp <= 0) {
299                                 printf("ERROR: blocking parameter must be > 0.\n");
300                                 exit(1);
301                         }
302
303                         blk[1] = tmp;
304                 }
305                 else if (ARG_IS("-blk-z") || ARG_IS("--blk-z")) {
306                         NEXT_ARG_PRESENT();
307
308                         int tmp = strtol(params->KernelArgs[++i], NULL, 0);
309
310                         if (tmp <= 0) {
311                                 printf("ERROR: blocking parameter must be > 0.\n");
312                                 exit(1);
313                         }
314
315                         blk[2] = tmp;
316                 }
317                 else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) {
318                         ParameterUsage();
319                         exit(1);
320                 }
321                 else {
322                         printf("ERROR: unknown kernel parameter.\n");
323                         ParameterUsage();
324                         exit(1);
325                 }
326         }
327
328         #undef ARG_IS
329         #undef NEXT_ARG_PRESENT
330
331         return;
332 }
333
334 static void SetuploopStartIndices(LatticeDesc * ld, KernelDataListRia * kdlr, int nThreads)
335 {
336         //#define ADJ_LIST(dir) adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (dir * VSIZE) + (index % VSIZE)]
337         Assert(ld != NULL);
338         Assert(kdlr != NULL);
339         Assert(nThreads > 0);
340
341         //uint32_t * adjList = kdlr->kdl.AdjList;
342         uint32_t * adjList = kdlr->kdl.AdjList;
343
344         uint32_t nLoopStartIndices = 0;
345         uint32_t loopStartIndex = 2;
346
347         int nFluid = kdlr->kdl.nFluid;
348
349         int * oddKernelThreadStartIndices = (int *)malloc(sizeof(int) * (nThreads + 1));
350
351         int nNodesPerThread = nFluid / nThreads;
352         //printf("nodesPerThread: %d\n", nNodesPerThread);
353
354         for (int i = 0; i < nThreads; ++i) {
355                 oddKernelThreadStartIndices[i] = i * nNodesPerThread + MinI(i, nFluid % nThreads);
356         }
357
358         oddKernelThreadStartIndices[nThreads] = nFluid;
359
360         /*
361            for (int i = 0; i <= nThreads; ++i) {
362            printf("oddKernelThreadStartIndices[%d] = %d\n", i, oddKernelThreadStartIndices[i]);
363            }
364            */
365
366         int threadIndex = 1;
367
368         // We execute following code two times.
369         // - The first time to get the count of how many entries we need for the
370         //   loopStartIndices array.
371         // - The second time to fill the array.
372
373         // Loop over adjacency list of all nodes.
374         // Compare if adjacent nodes within one cache line share the same access pattern.
375         // First vectorized access is assumed to be consecutive (-> may be loaded with regular load).
376
377         int lastCacheLineConsecutive = 1;
378
379         for (int fluidBaseIndex = 1; fluidBaseIndex < nFluid + 1; fluidBaseIndex += VSIZE) {
380
381                 int currentCacheLineConsecutive = 1;
382
383                 // Loop over all directions except the center one.
384                 for(int d = 0; d < N_D3Q19 - 1; ++d) {
385                         Assert(d != D3Q19_C);
386
387                         // check if cache line itself has consecutive memory access pattern
388                         for(int inChunkIndex = 0; (inChunkIndex < VSIZE - 1) && ((fluidBaseIndex + inChunkIndex) < nFluid); ++inChunkIndex) {
389                                 int index = fluidBaseIndex + inChunkIndex;
390
391                                 Assert(index < nFluid);
392
393                                 #define ADJ_LIST(idx, dir) adjList[((idx) - ((idx) % VSIZE)) * N_D3Q19_IDX + ((dir) * VSIZE) + ((idx) % VSIZE)]
394                                 //if (adjList[index * N_D3Q19_IDX + d] != adjList[(index - 1) * N_D3Q19_IDX + d] + 1)
395                                 if (ADJ_LIST(index, d) != ADJ_LIST(index-1, d) + 1) {
396                                         //printf("no match for index: %d\n", d);
397                                         //printf("ADJ_LlST(%d,%d) = %d  !=  %d = ADJ_LlST(%d,%d) + 1\n", index, d, ADJ_LIST(index,d), ADJ_LIST(index-1,d), index-1, d);
398                                         // Different access pattern.
399                                         currentCacheLineConsecutive = 0;
400                                         break;
401                                 }
402                                 #undef ADJ_LIST
403
404                         }
405
406                         if(!currentCacheLineConsecutive){
407                                 break; //exit from nested loop
408                         }
409
410                 }
411
412                 int interCacheLineConsecutive = 1;
413
414                 if(currentCacheLineConsecutive && lastCacheLineConsecutive){
415                         // check if cache line has consecutive memory access pattern to last entry of previous cache line
416                         int lastIdxOfPreviousCacheLine = fluidBaseIndex - 2;
417                         if (lastIdxOfPreviousCacheLine > 0) {
418                                 for(int d = 0; d < N_D3Q19 - 1; ++d) {
419                                         Assert(d != D3Q19_C);
420                                         #define ADJ_LIST(idx, dir) adjList[((idx) - ((idx) % VSIZE)) * N_D3Q19_IDX + ((dir) * VSIZE) + ((idx) % VSIZE)]
421                                         if (ADJ_LIST(fluidBaseIndex-1, d) != ADJ_LIST(lastIdxOfPreviousCacheLine, d) + 1) {
422                                                 // Different access pattern.
423                                                 //printf("not interCacheConsecutive\n");
424                                                 interCacheLineConsecutive = 0;
425                                                 break;
426                                         }
427                                         #undef ADJ_LIST
428
429                                 }
430                         }
431                 }
432                 int threadBoundaryIndex = oddKernelThreadStartIndices[threadIndex];
433                 if (fluidBaseIndex - 1 <= threadBoundaryIndex &&
434                                 threadBoundaryIndex < fluidBaseIndex + VSIZE - 1) {
435                         // Current cache line contains thread boundary.
436                         // These cache lines are treated by scalar peel and
437                         // reminder loops in kernel of every thread.
438                         // TODO maybe replace these loops with masked gather / scatter?!
439                         if (loopStartIndex % 2 == 0) { // current index would be gather/scatter index
440                                 ++loopStartIndex; // reserving gather/scatter index
441                         }
442                         ++loopStartIndex; // reserving space for start remainder loop of thread n
443
444                         if (threadIndex < nThreads){
445                                 ++loopStartIndex; // reserving space for starting peel loop of thread n+1
446
447                                 if (fluidBaseIndex - 1 == threadBoundaryIndex){
448                                         if(!currentCacheLineConsecutive){
449                                                 ++loopStartIndex;
450                                         }
451                                 }
452                                 else {
453                                         currentCacheLineConsecutive = 1;
454                                 }
455
456                                 //++loopStartIndex; // reserving space for ending peel loop / starting load/store of thread n+1
457                                 ++loopStartIndex; // 1st load/store end == 1st gather/scatter start OR 1st gather/scatter end == 2nd load/start start
458                         }
459                         ++threadIndex;
460                 }
461                 else {
462                         // We are not at a thread boundary.
463                         if (currentCacheLineConsecutive) {
464                                 if(lastCacheLineConsecutive && !interCacheLineConsecutive){
465                                         loopStartIndex+=2;
466                                 }
467                                 else if(!lastCacheLineConsecutive){
468                                         ++loopStartIndex;
469                                 }
470                         }
471                         else {
472                                 if(lastCacheLineConsecutive){
473                                         ++loopStartIndex;
474                                 }
475                         }
476                 }
477
478                 // treating special case when last thread has no remainder loop
479                 if (oddKernelThreadStartIndices[nThreads] == fluidBaseIndex + VSIZE - 1) {
480                         //printf("--> special case 111. loopStartIndex: %d \n", loopStartIndex);
481                         if (loopStartIndex % 2 != 0) { // current index is gather/scatter end and load/store start index
482                                 ++loopStartIndex; //set load/store end (gather/scatter start) to same value as scalar remainder start => no more access to gather/scatter loop
483                         }
484
485                         ++loopStartIndex; // gather/scatter end and scalar remainder start
486                         ++loopStartIndex; // scalar remainder end and scalar peel start
487
488                 }
489
490                 lastCacheLineConsecutive = currentCacheLineConsecutive;
491         }
492
493         if (nFluid > 0) {
494                 nLoopStartIndices = loopStartIndex;
495         }
496
497         int * loopStartIndices;
498         unsigned long loopStartIndicesByte = (nLoopStartIndices + 1) * sizeof(int);
499
500         printf("# Loop Start Index Array Allocation:\n");
501         printf("#   elements:      \t\t%d\n",   nLoopStartIndices + 1);
502         printf("#   size:          \t\t%e MiB\n", loopStartIndicesByte / 1024.0 / 1024.0);
503         printf("#   alignment:     \t\t%d b\n",   PAGE_4K);
504
505         if (MemAllocAligned((void **)&loopStartIndices, loopStartIndicesByte, PAGE_4K)) {
506                 printf("ERROR: allocating loopStartIndices array with MemAllocAligned failed: %lu bytes.\n", loopStartIndicesByte);
507                 exit(1);
508         }
509         else {
510                 printf("#   allocator: \t\t\tMemAllocAligned()\n");
511         }
512
513         oddKernelThreadStartIndices[0] = 0;
514         loopStartIndices[0] = 0; //first scalar loop would start with 0
515         loopStartIndices[1] = 0; //no peel loop expected -> first load/store loop may start at index==0
516         loopStartIndices[2] = 0; //may not be set in case first access is gather/scatter -> therefore its set here
517
518         // resetting values to default
519         threadIndex = 1;
520         lastCacheLineConsecutive = 1;
521         loopStartIndex = 2;
522
523         // Loop over adjacency list of all nodes.
524         // Compare if adjacent nodes share the same access pattern.
525
526         int indexAccumulator = 0;
527
528         // for statistical reasons:
529         int gatherAccumulator = 0;
530         int loadAccumulator = 0;
531         int scalarLookups = 0;
532         int loadLookups = 0;
533
534
535         for (int fluidBaseIndex = 1; fluidBaseIndex < nFluid + 1; fluidBaseIndex += VSIZE) {
536                 int currentCacheLineConsecutive = 1;
537                 //printf("fluidbaseIndex: %d\n", fluidBaseIndex);
538                 // Loop over all directions except the center one.
539                 for(int d = 0; d < N_D3Q19 - 1; ++d) {
540                         Assert(d != D3Q19_C);
541
542                         // check if cache line itself has consecutive memory access pattern
543                         for(int inChunkIndex = 0; (inChunkIndex < VSIZE - 1) && ((fluidBaseIndex + inChunkIndex) < nFluid); ++inChunkIndex){
544                                 int index = fluidBaseIndex + inChunkIndex;
545
546                                 Assert(index < nFluid);
547
548                                 #define ADJ_LIST(idx, dir) adjList[((idx) - ((idx) % VSIZE)) * N_D3Q19_IDX + ((dir) * VSIZE) + ((idx) % VSIZE)]
549                                 //if (adjList[index * N_D3Q19_IDX + d] != adjList[(index - 1) * N_D3Q19_IDX + d] + 1)
550                                 if (ADJ_LIST(index, d) != ADJ_LIST(index-1, d) + 1) {
551                                         // Different access pattern.
552                                         currentCacheLineConsecutive = 0;
553                                         break;
554                                 }
555                                 #undef ADJ_LIST
556                         }
557
558                         if(!currentCacheLineConsecutive){
559                                 break; //exit from nested loop
560                         }
561                 }
562
563                 int interCacheLineConsecutive = 1;
564
565                 if(currentCacheLineConsecutive && lastCacheLineConsecutive){
566                         // check if cache line has consecutive memory access pattern to last entry of previous cache line
567                         int lastIdxOfPreviousCacheLine = fluidBaseIndex - 2;
568                         if (lastIdxOfPreviousCacheLine > 0) {
569                                 for(int d = 0; d < N_D3Q19 - 1; ++d) {
570                                         Assert(d != D3Q19_C);
571                                         #define ADJ_LIST(idx, dir) adjList[((idx) - ((idx) % VSIZE)) * N_D3Q19_IDX + ((dir) * VSIZE) + ((idx) % VSIZE)]
572                                         if (ADJ_LIST(fluidBaseIndex-1, d) != ADJ_LIST(lastIdxOfPreviousCacheLine, d) + 1) {
573                                                 // Different access pattern.
574                                                 interCacheLineConsecutive = 0;
575                                                 break;
576                                         }
577                                         #undef ADJ_LIST
578
579                                 }
580                         }
581                 }
582
583                 int threadBoundaryIndex = oddKernelThreadStartIndices[threadIndex];
584                 //if (fluidBaseIndex > 3500)
585                 //      printf("threadBoundaryIndex: %d  fluidBaseIndex-1: %d fluidBaseIndex + VSIZE - 1: %d\n", threadBoundaryIndex, fluidBaseIndex-1, fluidBaseIndex + VSIZE -1);
586
587                 if (fluidBaseIndex - 1 <= threadBoundaryIndex &&
588                                 threadBoundaryIndex < fluidBaseIndex + VSIZE - 1) {
589                         // Current cache line contains thread boundary.
590                         // These cache lines are treated by scalar peel and
591                         // reminder loops in kernel of every thread.
592                         // TODO maybe replace these loops with masked gather / scatter?!
593                         if (loopStartIndex % 2 == 0) { // current index would be gather/scatter index
594                                 //loopStartIndices[loopStartIndex] = fluidBaseIndex - 1; //same value as scalar remainder start => no more access to gather/scatter loop
595                                 loopStartIndices[loopStartIndex] = indexAccumulator; //same value as scalar remainder start => no more access to gather/scatter loop
596                                 ++loopStartIndex;
597                         }
598
599                         //loopStartIndices[loopStartIndex] = fluidBaseIndex - 1; // gather/scatter end and scalar remainder start
600                         loopStartIndices[loopStartIndex] = indexAccumulator; // gather/scatter end and scalar remainder start
601                         ++loopStartIndex;
602
603                         // starting indices of thread n+1
604                         loopStartIndices[loopStartIndex] = threadBoundaryIndex; // scalar remainder of thread n end and scalar peel of thread n+1 start
605                         oddKernelThreadStartIndices[threadIndex] = loopStartIndex; // thread start is where scalar peel starts
606
607                         if (threadIndex < nThreads){
608                                 indexAccumulator = ((threadBoundaryIndex + VSIZE - 1) / VSIZE ) * VSIZE; // rounding towards next multiple of VSIZE
609                                 ++loopStartIndex;
610                                 loopStartIndices[loopStartIndex] = indexAccumulator; // scalar peel end and 1st load/store start
611
612                                 // treating special case when there is no peel / remainder loop
613                                 if (fluidBaseIndex - 1 == threadBoundaryIndex){
614                                         if(!currentCacheLineConsecutive){
615                                                 ++loopStartIndex;
616                                                 loopStartIndices[loopStartIndex] = indexAccumulator; // 1st load/store end and 1st gather/scatter start
617                                                 gatherAccumulator += VSIZE;
618                                         }
619                                         else {
620                                                 loadLookups += VSIZE;
621                                         }
622                                         indexAccumulator += VSIZE;
623                                 }
624                                 else {
625                                         scalarLookups += VSIZE;
626                                         currentCacheLineConsecutive = 1;
627                                 }
628
629                                 ++loopStartIndex; // 1st load/store end == 1st gather/scatter start OR 1st gather/scatter end == 2nd load/start start
630                                 loopStartIndices[loopStartIndex] = indexAccumulator; // 1st load/store end == 1st gather/scatter start OR 1st gather/scatter end == 2nd load/start start
631                         }
632                         ++threadIndex;
633
634                 }
635                 else {
636                         // We are not at a thread boundary.
637                         int print = 0;
638                         if (currentCacheLineConsecutive) {
639                                 loadAccumulator += VSIZE;
640
641                                 if(lastCacheLineConsecutive && !interCacheLineConsecutive){
642                                         loadLookups += VSIZE;
643                                         if (print)
644                                                 printf("#1 loopStartIndex: %d\n", loopStartIndex);
645                                         // loopStartIndices[loopStartIndex] is not incremented since pointers need to be fetched again.
646                                         // loopStartIndices[loopStartIndex + 1] (-> start Load/Store and end Gather/Scatter)
647                                         // gets same value as loopStartIndices[loopStartindex] (-> start Gather/Scatter)
648                                         // this ensures that no gather/scatter iteration is executed
649                                         ++loopStartIndex;
650                                         loopStartIndices[loopStartIndex] = indexAccumulator;
651
652                                         // loopStartIndices[loopStartIndex + 2] (-> start Gather/Scatter and end Load/Store)
653                                         // gets set to have one Load/Store iteration
654                                         ++loopStartIndex;
655                                         indexAccumulator+=VSIZE;
656                                         loopStartIndices[loopStartIndex] = indexAccumulator;
657
658                                 }
659                                 else if(!lastCacheLineConsecutive){
660                                         loadLookups += VSIZE;
661                                         if (print)
662                                                 printf("#2 loopStartIndex: %d\n", loopStartIndex);
663                                         ++loopStartIndex;
664                                         indexAccumulator+=VSIZE;
665                                         loopStartIndices[loopStartIndex] = indexAccumulator;
666                                 }
667                                 else { // (lastCacheLineConsecutive && interCacheLineConsecutive)
668                                         if (print)
669                                                 printf("#3 loopStartIndex: %d\n", loopStartIndex);
670                                         indexAccumulator+=VSIZE;
671                                         loopStartIndices[loopStartIndex] = indexAccumulator;
672                                 }
673                         }
674                         else {
675                                 gatherAccumulator += VSIZE;
676                                 if(lastCacheLineConsecutive){
677                                         if (print)
678                                                 printf("#4 loopStartIndex: %d\n", loopStartIndex);
679                                         ++loopStartIndex;
680                                         indexAccumulator+=VSIZE;
681                                         loopStartIndices[loopStartIndex] = indexAccumulator;
682                                 }
683                                 else { // lastCacheLine without not consecutive memory access pattern
684                                         if (print)
685                                                 printf("#5 loopStartIndex: %d\n", loopStartIndex);
686                                         indexAccumulator+=VSIZE;
687                                         loopStartIndices[loopStartIndex] = indexAccumulator;
688                                 }
689                         }
690                 }
691
692                 // treating special case when last thread has no remainder loop
693                 if (oddKernelThreadStartIndices[nThreads] == fluidBaseIndex + VSIZE - 1) {
694                         //printf("--> special case. indexAccumulator: %d\n", indexAccumulator);
695                         if (loopStartIndex % 2 != 0) { // current index is gather/scatter end and load/store start index
696                                 ++loopStartIndex;
697                                 loopStartIndices[loopStartIndex] = indexAccumulator; //set load/store end (gather/scatter start) to same value as scalar remainder start => no more access to gather/scatter loop
698                         }
699
700                         ++loopStartIndex;
701                         loopStartIndices[loopStartIndex] = indexAccumulator; // gather/scatter end and scalar remainder start
702                         ++loopStartIndex;
703                         loopStartIndices[loopStartIndex] = indexAccumulator; // scalar remainder end and scalar peel start
704
705                         oddKernelThreadStartIndices[threadIndex] = loopStartIndex; // thread start is where scalar peel starts
706                 }
707
708                 lastCacheLineConsecutive = currentCacheLineConsecutive;
709
710         }
711
712         if (nLoopStartIndices != loopStartIndex){
713                 printf("ERROR: nLoopStartIndices unequal loopStartIndex!\n");
714         }
715
716         /*
717         printf("loopStartIndices:\n");
718         for(int i = 0; i <= nLoopStartIndices; ++i){
719                 printf("%d ", loopStartIndices[i]);
720         }
721         printf("\n");
722         printf("oddKernelThreadStartIndices:\n");
723         for(int i = 0; i <= nThreads; ++i){
724                 printf("%d ", oddKernelThreadStartIndices[i]);
725         }
726         printf("\n");
727         */
728
729         kdlr->loopStartIndices = loopStartIndices;
730         kdlr->nLoopStartIndices = nLoopStartIndices;
731
732         kdlr->oddKernelThreadStartIndices  = oddKernelThreadStartIndices;
733         kdlr->nOddKernelThreadStartIndices = nThreads;
734
735         printf("#   vload/vstore nodes:  \t% 10d   \t(%3.4f %% of total fluid nodes)\n", loadAccumulator, ((double) loadAccumulator / (double) nFluid) * 100);
736         printf("#   gather/scatter nodes:\t% 10d   \t(%3.4f %% of total fluid nodes)\n", gatherAccumulator, ((double) gatherAccumulator / (double) nFluid) * 100.0);
737         printf("#   vload/vstore lookups:\t% 10d \n", loadLookups * (N_D3Q19 - 1));
738         printf("#   gather/scatter lookups:\t% 10d \n", gatherAccumulator * (N_D3Q19 - 1));
739         printf("#   scalar lookups:      \t% 10d \n", scalarLookups * (N_D3Q19 - 1));
740
741         double loopBalanceEven = 2.0 * 19 * sizeof(PdfT);
742         double loopBalanceOdd  = 2.0 * 19 * sizeof(PdfT) /* actual PDFs */
743                 + (((double)(gatherAccumulator + loadLookups + scalarLookups)) / nFluid) * sizeof(int) * (N_D3Q19 - 1) /* AdjList */
744                 + (nLoopStartIndices / nFluid) * sizeof(int); // one lookup to loopStartIndices
745
746         double loopBalance     = (loopBalanceEven + loopBalanceOdd) / 2.0;
747
748         kdlr->kdl.kd.LoopBalance = loopBalance;
749
750         printf("# loop balance:\n");
751         printf("#   even timestep:  \t\t%.2f B/FLUP\n", loopBalanceEven);
752         printf("#   odd timestep:   \t\t%.2f B/FLUP\n", loopBalanceOdd);
753         printf("#   average:        \t\t%.2f B/FLUP\n", loopBalance);
754
755         return;
756 }
757
758 void FNAME(D3Q19ListAaPvGatherHybridInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
759 {
760         KernelData * kd;
761         KernelDataList * kdl;
762         KernelDataListRia * kdlr;
763         MemAlloc((void **)&kdlr, sizeof(KernelDataListRia));
764
765         kd = (KernelData *)kdlr;
766         kdl = KDL(kdlr);
767
768         *kernelData = kd;
769
770 #ifdef DEBUG
771         kd->Pdfs[0] = NULL;
772         kd->Pdfs[1] = NULL;
773         kd->PdfsActive = NULL;
774         kd->DstPdfs = NULL;
775         kd->SrcPdfs = NULL;
776         kd->Dims[0] = -1;
777         kd->Dims[1] = -1;
778         kd->Dims[2] = -1;
779         kd->GlobalDims[0] = -1;
780         kd->GlobalDims[1] = -1;
781         kd->GlobalDims[2] = -1;
782         kd->Offsets[0] = -1;
783         kd->Offsets[1] = -1;
784         kd->Offsets[2] = -1;
785
786         kd->ObstIndices = NULL;
787         kd->nObstIndices = -1;
788         kd->BounceBackPdfsSrc = NULL;
789         kd->BounceBackPdfsDst = NULL;
790         kd->nBounceBackPdfs = -1;
791
792         kdl->AdjList = NULL;
793         kdl->Coords = NULL;
794         kdl->Grid = NULL;
795         kdl->nCells = -1;
796         kdl->nFluid = -1;
797
798         kdlr->loopStartIndices = NULL;
799         kdlr->nLoopStartIndices = 0;
800         kdlr->oddKernelThreadStartIndices = NULL;
801         kdlr->nOddKernelThreadStartIndices = 0;
802 #endif
803
804         kdl->Iteration = -1;
805
806         // Ajust the dimensions according to padding, if used.
807         kd->Dims[0] = kd->GlobalDims[0] = ld->Dims[0];
808         kd->Dims[1] = kd->GlobalDims[1] = ld->Dims[1];
809         kd->Dims[2] = kd->GlobalDims[2] = ld->Dims[2];
810
811         int * lDims = ld->Dims;
812
813         int lX = lDims[0];
814         int lY = lDims[1];
815         int lZ = lDims[2];
816
817         int nTotalCells = lX * lY * lZ;
818         int nCells = ld->nFluid; // TODO: + padding
819         int nFluid = ld->nFluid;
820
821         // TODO: check nCells/nFluid do not exceed 2^31. This actually has to be
822         // done during lattice setup.
823         kdl->nCells = nCells;
824         kdl->nFluid = nFluid;
825
826         PdfT * pdfs[2];
827
828         int blk[3] = { 0 };
829
830         ParseParameters(params, blk);
831
832         if (blk[0] == 0) blk[0] = lX;
833         if (blk[1] == 0) blk[1] = lY;
834         if (blk[2] == 0) blk[2] = lZ;
835
836         printf("# blocking:            \t\tx: %3d y: %3d z: %3d\n", blk[0], blk[1], blk[2]);
837
838         unsigned long latByte      = nCells * sizeof(PdfT) * N_D3Q19;
839         unsigned long latFluidByte = nFluid * sizeof(PdfT) * N_D3Q19;
840         unsigned long latPadByte   = (nCells - nFluid) * sizeof(PdfT) * N_D3Q19;
841
842         printf("# Lattice Array Allocation:\n");
843         printf("#   lattice size:      \t\t%e MiB\n", latByte      / 1024.0 / 1024.0);
844         printf("#   fluid lattice size:\t\t%e MiB\n", latFluidByte / 1024.0 / 1024.0);
845         printf("#   lattice padding:   \t\t%e MiB\n", latPadByte   / 1024.0 / 1024.0);
846
847
848         printf("#   alignment:         \t\t%d b\n", PAGE_4K);
849
850         if (PDF_ALLOCATOR((void **)&pdfs[0], latFluidByte, PAGE_4K)) {
851                 printf("ERROR: allocating PDF array with %s() failed: %lu bytes.\n", STRINGIFY(PDF_ALLOCATOR), latFluidByte);
852                 exit(1);
853         }
854         else {
855                 printf("#   allocator: \t\t\t%s()\n", STRINGIFY(PDF_ALLOCATOR));
856         }
857
858         kd->Pdfs[0] = pdfs[0];
859
860         // Initialize PDFs with some (arbitrary) data for correct NUMA placement.
861         // Here we touch only the fluid nodes as this loop is OpenMP parallel and
862         // we want the same scheduling as in the kernel.
863         #ifdef _OPENMP
864         #pragma omp parallel for
865         #endif
866         for (int i = 0; i < nFluid; ++i) { for(int d = 0; d < N_D3Q19; ++d) {
867                 pdfs[0][P_INDEX_3(nCells, i, d)] = 1.0;
868         } }
869
870         // Initialize all PDFs to some standard value.
871         for (int i = 0; i < nFluid; ++i) { for(int d = 0; d < N_D3Q19; ++d) {
872                 pdfs[0][P_INDEX_3(nCells, i, d)] = 0.0;
873         } }
874
875         // ----------------------------------------------------------------------
876         // create grid which will hold the index numbers of the fluid nodes
877
878         uint32_t * grid;
879
880         if (MemAlloc((void **)&grid, nTotalCells * sizeof(uint32_t))) {
881                 printf("ERROR: allocating grid for numbering failed: %lu bytes.\n", nTotalCells * sizeof(uint32_t));
882                 exit(1);
883         }
884         kdl->Grid = grid;
885
886         int latticeIndex;
887
888 #ifdef DEBUG
889         for(int z = 0; z < lZ; ++z) {
890                 for(int y = 0; y < lY; ++y) {
891                         for(int x = 0; x < lX; ++x) {
892
893                                 latticeIndex = L_INDEX_4(ld->Dims, x, y, z);
894
895                                 grid[latticeIndex] = ~0;
896                         }
897                 }
898         }
899 #endif
900
901         // ----------------------------------------------------------------------
902         // generate numbering over grid
903
904         uint32_t * coords;
905
906         if (MemAlloc((void **)&coords, nFluid * sizeof(uint32_t) * 3)) {
907                 printf("ERROR: allocating coords array failed: %lu bytes.\n", nFluid * sizeof(uint32_t) * 3);
908                 exit(1);
909         }
910
911         kdl->Coords = coords;
912
913         // Index for the PDF nodes can start at 0 as we distinguish solid and fluid nodes
914         // through the ld->Lattice array.
915         int counter = 0;
916
917         // Blocking is implemented via setup of the adjacency list. The kernel later will
918         // walk through the lattice blocked automatically.
919         for (int bZ = 0; bZ < lZ; bZ += blk[2]) {
920                 for (int bY = 0; bY < lY; bY += blk[1]) {
921                         for (int bX = 0; bX < lX; bX += blk[0]) {
922
923                                 int eX = MIN(bX + blk[0], lX);
924                                 int eY = MIN(bY + blk[1], lY);
925                                 int eZ = MIN(bZ + blk[2], lZ);
926
927
928                                 for (int z = bZ; z < eZ; ++z) {
929                                         for (int y = bY; y < eY; ++y) {
930                                                 for (int x = bX; x < eX; ++x) {
931
932                                                         latticeIndex = L_INDEX_4(lDims, x, y, z);
933
934                                                         if (ld->Lattice[latticeIndex] != LAT_CELL_OBSTACLE) {
935                                                                 grid[latticeIndex] = counter;
936
937                                                                 coords[C_INDEX_X(counter)] = x;
938                                                                 coords[C_INDEX_Y(counter)] = y;
939                                                                 coords[C_INDEX_Z(counter)] = z;
940
941                                                                 ++counter;
942                                                         }
943                                                 } } }
944                         } } }
945
946         Verify(counter == nFluid);
947
948         uint32_t * adjList;
949
950         // AoSoA addressing for adjList needs padding for (nFluid % VSIZE) != 0
951         int nFluid_padded = ((nFluid + VSIZE - 1) / VSIZE) * VSIZE;
952         unsigned long adjListBytes = nFluid_padded * sizeof(int) * N_D3Q19_IDX;
953
954         printf("# Adjacency List Allocation:\n");
955         printf("#   size:              \t\t%e MiB\n", adjListBytes / 1024.0 / 1024.0);
956         printf("#   alignment:         \t\t%d b\n", PAGE_4K);
957
958         // AdjList only requires 18 instead of 19 entries per node, as
959         // the center PDF needs no addressing.
960         if (ADJ_LIST_ALLOCATOR((void **)&adjList, adjListBytes, PAGE_4K)) {
961                 printf("ERROR: allocating adjList array with %s() failed: %lu bytes.\n", STRINGIFY(ADJ_LIST_ALLOCATOR), adjListBytes);
962                 exit(1);
963         }
964         else {
965                 printf("#   allocator: \t\t\t%s()\n", STRINGIFY(ADJ_LIST_ALLOCATOR));
966         }
967
968         for (int i = 0; i < nFluid_padded; ++i){
969                 adjList[i] = -1;
970         }
971
972         kdl->AdjList = adjList;
973
974         int x, y, z;
975
976         uint32_t neighborIndex;
977         uint32_t dstIndex;
978
979         int nx, ny, nz, px, py, pz;
980
981         // Loop over all fluid nodes and compute the indices to the neighboring
982         // PDFs for configured data layout (AoS/SoA).
983         // Parallelized loop to ensure correct NUMA placement.
984         // #ifdef _OPENMP  --> add line continuation
985         //      #pragma omp parallel for default(none)
986         //              shared(nFluid, nCells, coords, D3Q19_INV, D3Q19_X, D3Q19_Y, D3Q19_Z,
987         //                              stderr,
988         //                              lDims, grid, ld, lX, lY, lZ, adjList)
989         //              private(x, y, z, nx, ny, nz, neighborIndex, dstIndex)
990         // #endif
991
992         for (int fluidBaseIndex = 0; fluidBaseIndex < nFluid; fluidBaseIndex+=VSIZE) {
993
994
995                 // Loop over all directions except the center one.
996                 for(int d = 0; d < N_D3Q19 - 1; ++d) {
997                         Assert(d != D3Q19_C);
998
999                         for(int inChunkIndex = 0; (inChunkIndex < VSIZE) && ((fluidBaseIndex + inChunkIndex) < nFluid); ++inChunkIndex){
1000                                 int index = fluidBaseIndex + inChunkIndex;
1001
1002                                 Assert(index < nFluid);
1003
1004                                 x = coords[C_INDEX_X(index)];
1005                                 y = coords[C_INDEX_Y(index)];
1006                                 z = coords[C_INDEX_Z(index)];
1007
1008                                 Assert(x >= 0 && x < lX);
1009                                 Assert(y >= 0 && y < lY);
1010                                 Assert(z >= 0 && z < lZ);
1011
1012                                 Assert(ld->Lattice[L_INDEX_4(lDims, x, y, z)] != LAT_CELL_OBSTACLE);
1013
1014 #ifdef PROP_MODEL_PUSH
1015                                 nx = x + D3Q19_X[d];
1016                                 ny = y + D3Q19_Y[d];
1017                                 nz = z + D3Q19_Z[d];
1018
1019 #elif PROP_MODEL_PULL
1020                                 nx = x - D3Q19_X[d];
1021                                 ny = y - D3Q19_Y[d];
1022                                 nz = z - D3Q19_Z[d];
1023 #else
1024                                 #error No implementation for this PROP_MODEL_NAME.
1025 #endif
1026                                 // If the neighbor is outside the latice in X direction and we have a
1027                                 // periodic boundary then we need to wrap around.
1028                                 if (    ((nx < 0 || nx >= lX) && ld->PeriodicX) ||
1029                                                 ((ny < 0 || ny >= lY) && ld->PeriodicY) ||
1030                                                 ((nz < 0 || nz >= lZ) && ld->PeriodicZ)
1031                                    ){
1032                                         // x periodic
1033
1034                                         if (nx < 0) {
1035                                                 px = lX - 1;
1036                                         }
1037                                         else if (nx >= lX) {
1038                                                 px = 0;
1039                                         } else {
1040                                                 px = nx;
1041                                         }
1042                                         // y periodic
1043                                         if (ny < 0) {
1044                                                 py = lY - 1;
1045                                         }
1046                                         else if (ny >= lY) {
1047                                                 py = 0;
1048                                         } else {
1049                                                 py = ny;
1050                                         }
1051
1052                                         // z periodic
1053                                         if (nz < 0) {
1054                                                 pz = lZ - 1;
1055                                         }
1056                                         else if (nz >= lZ) {
1057                                                 pz = 0;
1058                                         } else {
1059                                                 pz = nz;
1060                                         }
1061
1062                                         if (ld->Lattice[L_INDEX_4(lDims, px, py, pz)] == LAT_CELL_OBSTACLE) {
1063                                                 dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
1064                                         }
1065                                         else {
1066                                                 neighborIndex = grid[L_INDEX_4(lDims, px, py, pz)];
1067
1068                                                 AssertMsg(neighborIndex != ~0, "Neighbor has no Index. (%d %d %d) direction %s (%d)\n", px, py, pz, D3Q19_NAMES[d], d);
1069
1070                                                 dstIndex = P_INDEX_3(nCells, neighborIndex, d);
1071                                         }
1072                                 }
1073                                 else if (nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) {
1074                                         dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
1075                                 }
1076                                 else if (ld->Lattice[L_INDEX_4(lDims, nx, ny, nz)] == LAT_CELL_OBSTACLE) {
1077                                         dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
1078                                 }
1079                                 else {
1080                                         neighborIndex = grid[L_INDEX_4(lDims, nx, ny, nz)];
1081
1082                                         Assert(neighborIndex != ~0);
1083
1084                                         dstIndex = P_INDEX_3(nCells, neighborIndex, d);
1085                                 }
1086
1087                                 Assert(dstIndex >= 0);
1088                                 Assert(dstIndex < nCells * N_D3Q19);
1089
1090                                 adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (d * VSIZE) + (index % VSIZE)] = dstIndex;
1091                         }
1092                 }
1093         }
1094
1095 // Sets unused adjList entries to some extreme value which triggers and SIGSEG, whenever these values are accidently accessed.
1096         for(int index = nFluid; index < nFluid_padded; ++index){
1097                 for(int d = 0; d < N_D3Q19 - 1; ++d) {
1098                         adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (d * VSIZE) + (index % VSIZE)] = -10*1000*1000;
1099                 }
1100         }
1101
1102 /*
1103         printf("============\n");
1104         for (int i = 0; i < nFluid_padded * (N_D3Q19_IDX + 20);){
1105                 for (int j = 0; j < VSIZE; ++j){
1106                         printf("%d ",adjList[i]);
1107                         ++i;
1108                 }
1109                 printf("\n");
1110         }
1111            for(int dir = 0; dir < N_D3Q19; ++dir){
1112            printf("dir: %d\n",dir);
1113            for(int baseIndex = 0; baseIndex < nFluid + VSIZE; baseIndex+=VSIZE){
1114            for(int i = 0; i < VSIZE; ++i){
1115            int index = baseIndex + i;
1116
1117            printf("%d ", adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (dir * VSIZE) + (index % VSIZE)]);
1118            }
1119            printf("\n");
1120            }
1121            printf("\n");
1122            }
1123         printf("============\n");
1124 */
1125
1126
1127         int nThreads = 1;
1128
1129 #ifdef _OPENMP
1130         nThreads = omp_get_max_threads();
1131 #endif
1132
1133         SetuploopStartIndices(ld, KDLR(kd), nThreads);
1134
1135         // Fill remaining KernelData structures
1136         kd->GetNode = GetNode;
1137         kd->SetNode = SetNode;
1138
1139         kd->BoundaryConditionsGetPdf = FNAME(BCGetPdf);
1140         kd->BoundaryConditionsSetPdf = FNAME(BCSetPdf);
1141
1142         kd->Kernel = FNAME(D3Q19ListAaPvGatherHybridKernel);
1143
1144         kd->DstPdfs = NULL;
1145         kd->PdfsActive = kd->Pdfs[0];
1146
1147         return;
1148 }
1149
1150 void FNAME(D3Q19ListAaPvGatherHybridDeinit)(LatticeDesc * ld, KernelData ** kernelData)
1151 {
1152         KernelDataListRia ** kdlr = (KernelDataListRia **)kernelData;
1153
1154         MemFree((void **)&((*kdlr)->loopStartIndices));
1155
1156         if ((*kdlr)->oddKernelThreadStartIndices != NULL) {
1157                 MemFree((void **)&((*kdlr)->oddKernelThreadStartIndices));
1158         }
1159
1160         KernelDataList ** kdl = (KernelDataList **)kernelData;
1161
1162         ADJ_LIST_FREE((void **)&((*kdl)->AdjList));
1163
1164         MemFree((void **)&((*kdl)->Coords));
1165         MemFree((void **)&((*kdl)->Grid));
1166
1167         PDF_FREE((void **)&((*kernelData)->Pdfs[0]));
1168
1169         MemFree((void **)kernelData);
1170         return;
1171 }
1172 #undef PAGE_4K
1173 #undef ADJ_LIST
This page took 0.330513 seconds and 4 git commands to generate.