add citation information
[LbmBenchmarkKernelsPublic.git] / src / BenchKernelD3Q19ListAaPvGatherHybridCommon.c
CommitLineData
8cafd9ea
MW
1// --------------------------------------------------------------------------
2//
3// Copyright
4// Markus Wittmann, 2016-2017
5// RRZE, University of Erlangen-Nuremberg, Germany
6// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
7//
8// Viktor Haag, 2016
9// LSS, University of Erlangen-Nuremberg, Germany
10//
11// Michael Hussnaetter, 2017-2018
12// University of Erlangen-Nuremberg, Germany
13// michael.hussnaetter -at- fau.de
14//
15// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
16//
17// LbmBenchKernels is free software: you can redistribute it and/or modify
18// it under the terms of the GNU General Public License as published by
19// the Free Software Foundation, either version 3 of the License, or
20// (at your option) any later version.
21//
22// LbmBenchKernels is distributed in the hope that it will be useful,
23// but WITHOUT ANY WARRANTY; without even the implied warranty of
24// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25// GNU General Public License for more details.
26//
27// You should have received a copy of the GNU General Public License
28// along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>.
29//
30// --------------------------------------------------------------------------
31#include "BenchKernelD3Q19ListAaPvGatherHybridCommon.h"
32
33#include "Memory.h"
34#include "Vtk.h"
35#include "Vector.h"
36
37#include <math.h>
38
39#ifdef _OPENMP
40#include <omp.h>
41#endif
42
43#define PAGE_4K 4096
44
45#if ALLOC_ADJ_LIST_IN_HBM == 1
46#define ADJ_LIST_ALLOCATOR HbwAllocAligned
47#define ADJ_LIST_FREE HbwFree
48#else
49#define ADJ_LIST_ALLOCATOR MemAllocAligned
50#define ADJ_LIST_FREE MemFree
51#endif
52
53#if ALLOC_PDF_IN_HBM == 1
54#define PDF_ALLOCATOR HbwAllocAligned
55#define PDF_FREE HbwFree
56#else
57#define PDF_ALLOCATOR MemAllocAligned
58#define PDF_FREE MemFree
59#endif
60
61// Forward definition.
62void FNAME(D3Q19ListAaPvGatherHybridKernel)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd);
63
64
65
66// -----------------------------------------------------------------------
67// Functions which are used as callback by the kernel to read or write
68// PDFs and nodes.
69
70static void FNAME(BCGetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT * pdf)
71{
72 Assert(kd != NULL);
73 Assert(kd->PdfsActive != NULL);
74 Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
75 Assert(pdf != NULL);
76
77 Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
78 Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
79 Assert(dir >= 0); Assert(dir < N_D3Q19);
80
81 KernelDataList * kdl = KDL(kd);
82 uint32_t * adjList = kdl->AdjList;
83
84 if (kdl->Iteration % 2 == 0) {
85 // Pdfs are stored inverse, local PDFs are located in remote nodes
86
87 // getting node index
88 uint32_t index = kdl->Grid[L_INDEX_4(kd->Dims, x, y, z)];
89
90 if (dir != D3Q19_C) {
91 #define ADJ_LIST(dir) adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (dir * VSIZE) + (index % VSIZE)]
92 *pdf = kd->PdfsActive[ADJ_LIST(D3Q19_INV[dir])];
93 #undef ADJ_LIST
94 }
95 else {
96 *pdf = kd->PdfsActive[P_INDEX_3(kdl->nCells, index, dir)];
97 }
98
99 }
100 else {
101 *pdf = kd->PdfsActive[P_INDEX_5(kdl, x, y, z, dir)];
102 }
103
104 return;
105}
106
107static void FNAME(BCSetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT pdf)
108{
109 Assert(kd != NULL);
110 Assert(kd->PdfsActive != NULL);
111 Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
112 Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
113 Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
114 Assert(dir >= 0); Assert(dir < N_D3Q19);
115
116 if (isnan(pdf)) {
117 printf("ERROR: setting nan %d %d %d %d %s\n", x, y, z, dir, D3Q19_NAMES[dir]);
118 DEBUG_BREAK_POINT();
119 exit(1);
120 }
121
122 KernelDataList * kdl = KDL(kd);
123 uint32_t * adjList = kdl->AdjList;
124
125 if (kdl->Iteration % 2 == 0) {
126 // Pdfs are stored inverse, local PDFs are located in remote nodes
127
128 // getting node index
129 uint32_t index = kdl->Grid[L_INDEX_4(kd->Dims, x, y, z)];
130
131 if (dir != D3Q19_C) {
132 #define ADJ_LIST(dir) adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (dir * VSIZE) + (index % VSIZE)]
133 kd->PdfsActive[ADJ_LIST(D3Q19_INV[dir])] = pdf;
134 #undef ADJ_LIST
135 } else {
136 kd->PdfsActive[P_INDEX_3(kdl->nCells, index, dir)] = pdf;
137 }
138
139 } else {
140 kd->PdfsActive[P_INDEX_5(kdl, x, y, z, dir)] = pdf;
141 }
142
143 return;
144}
145
146
147static void GetNode(KernelData * kd, int x, int y, int z, PdfT * pdfs)
148{
149 Assert(kd != NULL);
150 Assert(kd->PdfsActive != NULL);
151 Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
152 Assert(pdfs != NULL);
153 Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
154 Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
155
156 KernelDataList * kdl = KDL(kd);
157 uint32_t * adjList = kdl->AdjList;
158
159 if(kdl->Iteration % 2 == 0){
160
161 uint32_t index = kdl->Grid[L_INDEX_4(kdl->kd.Dims, x, y, z)];
162
163 // Load PDFs of local cell: pdf_N = src[adjList[adjListIndex + D3Q19_S]]; ...
164 #define ADJ_LIST(dir) adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (dir * VSIZE) + (index % VSIZE)]
165 #define X(name, idx, idxinv, _x, _y, _z) pdfs[idx] = kd->PdfsActive[ADJ_LIST(idxinv)];
166 D3Q19_LIST_WO_C
167 #undef X
168 #undef ADJ_LIST
169 pdfs[D3Q19_C] = kd->PdfsActive[P_INDEX_3(kdl->nCells, index, D3Q19_C)];
170
171 } else {
172
173 #define I(x, y, z, dir) P_INDEX_5(KDL(kd), (x), (y), (z), (dir))
174 #define X(name, idx, idxinv, _x, _y, _z) pdfs[idx] = kd->PdfsActive[I(x, y, z, idx)];
175 D3Q19_LIST
176 #undef X
177 #undef I
178
179 }
180
181 for (int d = 0; d < 19; ++d) {
182 if(isnan(pdfs[d]) || isinf(pdfs[d])) {
183 printf("%d %d %d %d nan! get node\n", x, y, z, d);
184 for (int d2 = 0; d2 < 19; ++d2) {
185 printf("%d: %e\n", d2, pdfs[d2]);
186 }
187 exit(1);
188 }
189 }
190
191 return;
192}
193
194
195static void SetNode(KernelData * kd, int x, int y, int z, PdfT * pdfs)
196{
197 Assert(kd != NULL);
198 Assert(kd->PdfsActive != NULL);
199 Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
200 Assert(pdfs != NULL);
201
202 Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
203 Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
204
205 for (int d = 0; d < 19; ++d) {
206 if(isnan(pdfs[d])) {
207 printf("%d %d %d %d nan! get node\n", x, y, z, d);
208 for (int d2 = 0; d2 < 19; ++d2) {
209 printf("%d: %e\n", d2, pdfs[d2]);
210 }
211 exit(1);
212 }
213 }
214
215 KernelDataList * kdl = KDL(kd);
216 uint32_t * adjList = kdl->AdjList;
217
218 if(kdl->Iteration % 2 == 0){
219
220 uint32_t index = kdl->Grid[L_INDEX_4(kdl->kd.Dims, x, y, z)];
221
222 // Load PDFs of local cell: pdf_N = src[adjList[adjListIndex + D3Q19_S]]; ...
223 kd->PdfsActive[P_INDEX_3(kdl->nCells, index, D3Q19_C)] = pdfs[D3Q19_C];
224
225 #define ADJ_LIST(dir) adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (dir * VSIZE) + (index % VSIZE)]
226 #define X(name, idx, idxinv, _x, _y, _z) kd->PdfsActive[ADJ_LIST(idxinv)] = pdfs[idx];
227 D3Q19_LIST_WO_C
228 #undef X
229 #undef ADJ_LIST
230
231 } else {
232
233 #define I(x, y, z, dir) P_INDEX_5(KDL(kd), (x), (y), (z), (dir))
234 #define X(name, idx, idxinv, _x, _y, _z) kd->PdfsActive[I(x, y, z, idx)] = pdfs[idx];
235 D3Q19_LIST
236 #undef X
237 #undef I
238
239 }
240
241 return;
242}
243
244static void ParameterUsage()
245{
246 printf("Kernel parameters:\n");
247 printf(" [-blk <n>] [-blk-[xyz] <n>]\n");
248
249 return;
250}
251
252static void ParseParameters(Parameters * params, int * blk)
253{
254 Assert(blk != NULL);
255
256 blk[0] = 0; blk[1] = 0; blk[2] = 0;
257
258 #define ARG_IS(param) (!strcmp(params->KernelArgs[i], param))
259 #define NEXT_ARG_PRESENT() \
260 do { \
261 if (i + 1 >= params->nKernelArgs) { \
262 printf("ERROR: argument %s requires a parameter.\n", params->KernelArgs[i]); \
263 exit(1); \
264 } \
265 } while (0)
266
267
268 for (int i = 0; i < params->nKernelArgs; ++i) {
269 if (ARG_IS("-blk") || ARG_IS("--blk")) {
270 NEXT_ARG_PRESENT();
271
272 int tmp = strtol(params->KernelArgs[++i], NULL, 0);
273
274 if (tmp <= 0) {
275 printf("ERROR: blocking parameter must be > 0.\n");
276 exit(1);
277 }
278
279 blk[0] = blk[1] = blk[2] = tmp;
280 }
281 else if (ARG_IS("-blk-x") || ARG_IS("--blk-x")) {
282 NEXT_ARG_PRESENT();
283
284 int tmp = strtol(params->KernelArgs[++i], NULL, 0);
285
286 if (tmp <= 0) {
287 printf("ERROR: blocking parameter must be > 0.\n");
288 exit(1);
289 }
290
291 blk[0] = tmp;
292 }
293 else if (ARG_IS("-blk-y") || ARG_IS("--blk-y")) {
294 NEXT_ARG_PRESENT();
295
296 int tmp = strtol(params->KernelArgs[++i], NULL, 0);
297
298 if (tmp <= 0) {
299 printf("ERROR: blocking parameter must be > 0.\n");
300 exit(1);
301 }
302
303 blk[1] = tmp;
304 }
305 else if (ARG_IS("-blk-z") || ARG_IS("--blk-z")) {
306 NEXT_ARG_PRESENT();
307
308 int tmp = strtol(params->KernelArgs[++i], NULL, 0);
309
310 if (tmp <= 0) {
311 printf("ERROR: blocking parameter must be > 0.\n");
312 exit(1);
313 }
314
315 blk[2] = tmp;
316 }
317 else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) {
318 ParameterUsage();
319 exit(1);
320 }
321 else {
322 printf("ERROR: unknown kernel parameter.\n");
323 ParameterUsage();
324 exit(1);
325 }
326 }
327
328 #undef ARG_IS
329 #undef NEXT_ARG_PRESENT
330
331 return;
332}
333
334static void SetuploopStartIndices(LatticeDesc * ld, KernelDataListRia * kdlr, int nThreads)
335{
336 //#define ADJ_LIST(dir) adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (dir * VSIZE) + (index % VSIZE)]
337 Assert(ld != NULL);
338 Assert(kdlr != NULL);
339 Assert(nThreads > 0);
340
341 //uint32_t * adjList = kdlr->kdl.AdjList;
342 uint32_t * adjList = kdlr->kdl.AdjList;
343
344 uint32_t nLoopStartIndices = 0;
345 uint32_t loopStartIndex = 2;
346
347 int nFluid = kdlr->kdl.nFluid;
348
349 int * oddKernelThreadStartIndices = (int *)malloc(sizeof(int) * (nThreads + 1));
350
351 int nNodesPerThread = nFluid / nThreads;
352 //printf("nodesPerThread: %d\n", nNodesPerThread);
353
354 for (int i = 0; i < nThreads; ++i) {
355 oddKernelThreadStartIndices[i] = i * nNodesPerThread + MinI(i, nFluid % nThreads);
356 }
357
358 oddKernelThreadStartIndices[nThreads] = nFluid;
359
360 /*
361 for (int i = 0; i <= nThreads; ++i) {
362 printf("oddKernelThreadStartIndices[%d] = %d\n", i, oddKernelThreadStartIndices[i]);
363 }
364 */
365
366 int threadIndex = 1;
367
368 // We execute following code two times.
369 // - The first time to get the count of how many entries we need for the
370 // loopStartIndices array.
371 // - The second time to fill the array.
372
373 // Loop over adjacency list of all nodes.
374 // Compare if adjacent nodes within one cache line share the same access pattern.
375 // First vectorized access is assumed to be consecutive (-> may be loaded with regular load).
376
377 int lastCacheLineConsecutive = 1;
378
379 for (int fluidBaseIndex = 1; fluidBaseIndex < nFluid + 1; fluidBaseIndex += VSIZE) {
380
381 int currentCacheLineConsecutive = 1;
382
383 // Loop over all directions except the center one.
384 for(int d = 0; d < N_D3Q19 - 1; ++d) {
385 Assert(d != D3Q19_C);
386
387 // check if cache line itself has consecutive memory access pattern
388 for(int inChunkIndex = 0; (inChunkIndex < VSIZE - 1) && ((fluidBaseIndex + inChunkIndex) < nFluid); ++inChunkIndex) {
389 int index = fluidBaseIndex + inChunkIndex;
390
391 Assert(index < nFluid);
392
393 #define ADJ_LIST(idx, dir) adjList[((idx) - ((idx) % VSIZE)) * N_D3Q19_IDX + ((dir) * VSIZE) + ((idx) % VSIZE)]
394 //if (adjList[index * N_D3Q19_IDX + d] != adjList[(index - 1) * N_D3Q19_IDX + d] + 1)
395 if (ADJ_LIST(index, d) != ADJ_LIST(index-1, d) + 1) {
396 //printf("no match for index: %d\n", d);
397 //printf("ADJ_LlST(%d,%d) = %d != %d = ADJ_LlST(%d,%d) + 1\n", index, d, ADJ_LIST(index,d), ADJ_LIST(index-1,d), index-1, d);
398 // Different access pattern.
399 currentCacheLineConsecutive = 0;
400 break;
401 }
402 #undef ADJ_LIST
403
404 }
405
406 if(!currentCacheLineConsecutive){
407 break; //exit from nested loop
408 }
409
410 }
411
412 int interCacheLineConsecutive = 1;
413
414 if(currentCacheLineConsecutive && lastCacheLineConsecutive){
415 // check if cache line has consecutive memory access pattern to last entry of previous cache line
416 int lastIdxOfPreviousCacheLine = fluidBaseIndex - 2;
417 if (lastIdxOfPreviousCacheLine > 0) {
418 for(int d = 0; d < N_D3Q19 - 1; ++d) {
419 Assert(d != D3Q19_C);
420 #define ADJ_LIST(idx, dir) adjList[((idx) - ((idx) % VSIZE)) * N_D3Q19_IDX + ((dir) * VSIZE) + ((idx) % VSIZE)]
421 if (ADJ_LIST(fluidBaseIndex-1, d) != ADJ_LIST(lastIdxOfPreviousCacheLine, d) + 1) {
422 // Different access pattern.
423 //printf("not interCacheConsecutive\n");
424 interCacheLineConsecutive = 0;
425 break;
426 }
427 #undef ADJ_LIST
428
429 }
430 }
431 }
432 int threadBoundaryIndex = oddKernelThreadStartIndices[threadIndex];
433 if (fluidBaseIndex - 1 <= threadBoundaryIndex &&
434 threadBoundaryIndex < fluidBaseIndex + VSIZE - 1) {
435 // Current cache line contains thread boundary.
436 // These cache lines are treated by scalar peel and
437 // reminder loops in kernel of every thread.
438 // TODO maybe replace these loops with masked gather / scatter?!
439 if (loopStartIndex % 2 == 0) { // current index would be gather/scatter index
440 ++loopStartIndex; // reserving gather/scatter index
441 }
442 ++loopStartIndex; // reserving space for start remainder loop of thread n
443
444 if (threadIndex < nThreads){
445 ++loopStartIndex; // reserving space for starting peel loop of thread n+1
446
447 if (fluidBaseIndex - 1 == threadBoundaryIndex){
448 if(!currentCacheLineConsecutive){
449 ++loopStartIndex;
450 }
451 }
452 else {
453 currentCacheLineConsecutive = 1;
454 }
455
456 //++loopStartIndex; // reserving space for ending peel loop / starting load/store of thread n+1
457 ++loopStartIndex; // 1st load/store end == 1st gather/scatter start OR 1st gather/scatter end == 2nd load/start start
458 }
459 ++threadIndex;
460 }
461 else {
462 // We are not at a thread boundary.
463 if (currentCacheLineConsecutive) {
464 if(lastCacheLineConsecutive && !interCacheLineConsecutive){
465 loopStartIndex+=2;
466 }
467 else if(!lastCacheLineConsecutive){
468 ++loopStartIndex;
469 }
470 }
471 else {
472 if(lastCacheLineConsecutive){
473 ++loopStartIndex;
474 }
475 }
476 }
477
478 // treating special case when last thread has no remainder loop
479 if (oddKernelThreadStartIndices[nThreads] == fluidBaseIndex + VSIZE - 1) {
480 //printf("--> special case 111. loopStartIndex: %d \n", loopStartIndex);
481 if (loopStartIndex % 2 != 0) { // current index is gather/scatter end and load/store start index
482 ++loopStartIndex; //set load/store end (gather/scatter start) to same value as scalar remainder start => no more access to gather/scatter loop
483 }
484
485 ++loopStartIndex; // gather/scatter end and scalar remainder start
486 ++loopStartIndex; // scalar remainder end and scalar peel start
487
488 }
489
490 lastCacheLineConsecutive = currentCacheLineConsecutive;
491 }
492
493 if (nFluid > 0) {
494 nLoopStartIndices = loopStartIndex;
495 }
496
497 int * loopStartIndices;
498 unsigned long loopStartIndicesByte = (nLoopStartIndices + 1) * sizeof(int);
499
500 printf("# Loop Start Index Array Allocation:\n");
501 printf("# elements: \t\t%d\n", nLoopStartIndices + 1);
502 printf("# size: \t\t%e MiB\n", loopStartIndicesByte / 1024.0 / 1024.0);
503 printf("# alignment: \t\t%d b\n", PAGE_4K);
504
505 if (MemAllocAligned((void **)&loopStartIndices, loopStartIndicesByte, PAGE_4K)) {
506 printf("ERROR: allocating loopStartIndices array with MemAllocAligned failed: %lu bytes.\n", loopStartIndicesByte);
507 exit(1);
508 }
509 else {
510 printf("# allocator: \t\t\tMemAllocAligned()\n");
511 }
512
513 oddKernelThreadStartIndices[0] = 0;
514 loopStartIndices[0] = 0; //first scalar loop would start with 0
515 loopStartIndices[1] = 0; //no peel loop expected -> first load/store loop may start at index==0
516 loopStartIndices[2] = 0; //may not be set in case first access is gather/scatter -> therefore its set here
517
518 // resetting values to default
519 threadIndex = 1;
520 lastCacheLineConsecutive = 1;
521 loopStartIndex = 2;
522
523 // Loop over adjacency list of all nodes.
524 // Compare if adjacent nodes share the same access pattern.
525
526 int indexAccumulator = 0;
527
528 // for statistical reasons:
529 int gatherAccumulator = 0;
530 int loadAccumulator = 0;
531 int scalarLookups = 0;
532 int loadLookups = 0;
533
534
535 for (int fluidBaseIndex = 1; fluidBaseIndex < nFluid + 1; fluidBaseIndex += VSIZE) {
536 int currentCacheLineConsecutive = 1;
537 //printf("fluidbaseIndex: %d\n", fluidBaseIndex);
538 // Loop over all directions except the center one.
539 for(int d = 0; d < N_D3Q19 - 1; ++d) {
540 Assert(d != D3Q19_C);
541
542 // check if cache line itself has consecutive memory access pattern
543 for(int inChunkIndex = 0; (inChunkIndex < VSIZE - 1) && ((fluidBaseIndex + inChunkIndex) < nFluid); ++inChunkIndex){
544 int index = fluidBaseIndex + inChunkIndex;
545
546 Assert(index < nFluid);
547
548 #define ADJ_LIST(idx, dir) adjList[((idx) - ((idx) % VSIZE)) * N_D3Q19_IDX + ((dir) * VSIZE) + ((idx) % VSIZE)]
549 //if (adjList[index * N_D3Q19_IDX + d] != adjList[(index - 1) * N_D3Q19_IDX + d] + 1)
550 if (ADJ_LIST(index, d) != ADJ_LIST(index-1, d) + 1) {
551 // Different access pattern.
552 currentCacheLineConsecutive = 0;
553 break;
554 }
555 #undef ADJ_LIST
556 }
557
558 if(!currentCacheLineConsecutive){
559 break; //exit from nested loop
560 }
561 }
562
563 int interCacheLineConsecutive = 1;
564
565 if(currentCacheLineConsecutive && lastCacheLineConsecutive){
566 // check if cache line has consecutive memory access pattern to last entry of previous cache line
567 int lastIdxOfPreviousCacheLine = fluidBaseIndex - 2;
568 if (lastIdxOfPreviousCacheLine > 0) {
569 for(int d = 0; d < N_D3Q19 - 1; ++d) {
570 Assert(d != D3Q19_C);
571 #define ADJ_LIST(idx, dir) adjList[((idx) - ((idx) % VSIZE)) * N_D3Q19_IDX + ((dir) * VSIZE) + ((idx) % VSIZE)]
572 if (ADJ_LIST(fluidBaseIndex-1, d) != ADJ_LIST(lastIdxOfPreviousCacheLine, d) + 1) {
573 // Different access pattern.
574 interCacheLineConsecutive = 0;
575 break;
576 }
577 #undef ADJ_LIST
578
579 }
580 }
581 }
582
583 int threadBoundaryIndex = oddKernelThreadStartIndices[threadIndex];
584 //if (fluidBaseIndex > 3500)
585 // printf("threadBoundaryIndex: %d fluidBaseIndex-1: %d fluidBaseIndex + VSIZE - 1: %d\n", threadBoundaryIndex, fluidBaseIndex-1, fluidBaseIndex + VSIZE -1);
586
587 if (fluidBaseIndex - 1 <= threadBoundaryIndex &&
588 threadBoundaryIndex < fluidBaseIndex + VSIZE - 1) {
589 // Current cache line contains thread boundary.
590 // These cache lines are treated by scalar peel and
591 // reminder loops in kernel of every thread.
592 // TODO maybe replace these loops with masked gather / scatter?!
593 if (loopStartIndex % 2 == 0) { // current index would be gather/scatter index
594 //loopStartIndices[loopStartIndex] = fluidBaseIndex - 1; //same value as scalar remainder start => no more access to gather/scatter loop
595 loopStartIndices[loopStartIndex] = indexAccumulator; //same value as scalar remainder start => no more access to gather/scatter loop
596 ++loopStartIndex;
597 }
598
599 //loopStartIndices[loopStartIndex] = fluidBaseIndex - 1; // gather/scatter end and scalar remainder start
600 loopStartIndices[loopStartIndex] = indexAccumulator; // gather/scatter end and scalar remainder start
601 ++loopStartIndex;
602
603 // starting indices of thread n+1
604 loopStartIndices[loopStartIndex] = threadBoundaryIndex; // scalar remainder of thread n end and scalar peel of thread n+1 start
605 oddKernelThreadStartIndices[threadIndex] = loopStartIndex; // thread start is where scalar peel starts
606
607 if (threadIndex < nThreads){
608 indexAccumulator = ((threadBoundaryIndex + VSIZE - 1) / VSIZE ) * VSIZE; // rounding towards next multiple of VSIZE
609 ++loopStartIndex;
610 loopStartIndices[loopStartIndex] = indexAccumulator; // scalar peel end and 1st load/store start
611
612 // treating special case when there is no peel / remainder loop
613 if (fluidBaseIndex - 1 == threadBoundaryIndex){
614 if(!currentCacheLineConsecutive){
615 ++loopStartIndex;
616 loopStartIndices[loopStartIndex] = indexAccumulator; // 1st load/store end and 1st gather/scatter start
617 gatherAccumulator += VSIZE;
618 }
619 else {
620 loadLookups += VSIZE;
621 }
622 indexAccumulator += VSIZE;
623 }
624 else {
625 scalarLookups += VSIZE;
626 currentCacheLineConsecutive = 1;
627 }
628
629 ++loopStartIndex; // 1st load/store end == 1st gather/scatter start OR 1st gather/scatter end == 2nd load/start start
630 loopStartIndices[loopStartIndex] = indexAccumulator; // 1st load/store end == 1st gather/scatter start OR 1st gather/scatter end == 2nd load/start start
631 }
632 ++threadIndex;
633
634 }
635 else {
636 // We are not at a thread boundary.
637 int print = 0;
638 if (currentCacheLineConsecutive) {
639 loadAccumulator += VSIZE;
640
641 if(lastCacheLineConsecutive && !interCacheLineConsecutive){
642 loadLookups += VSIZE;
643 if (print)
644 printf("#1 loopStartIndex: %d\n", loopStartIndex);
645 // loopStartIndices[loopStartIndex] is not incremented since pointers need to be fetched again.
646 // loopStartIndices[loopStartIndex + 1] (-> start Load/Store and end Gather/Scatter)
647 // gets same value as loopStartIndices[loopStartindex] (-> start Gather/Scatter)
648 // this ensures that no gather/scatter iteration is executed
649 ++loopStartIndex;
650 loopStartIndices[loopStartIndex] = indexAccumulator;
651
652 // loopStartIndices[loopStartIndex + 2] (-> start Gather/Scatter and end Load/Store)
653 // gets set to have one Load/Store iteration
654 ++loopStartIndex;
655 indexAccumulator+=VSIZE;
656 loopStartIndices[loopStartIndex] = indexAccumulator;
657
658 }
659 else if(!lastCacheLineConsecutive){
660 loadLookups += VSIZE;
661 if (print)
662 printf("#2 loopStartIndex: %d\n", loopStartIndex);
663 ++loopStartIndex;
664 indexAccumulator+=VSIZE;
665 loopStartIndices[loopStartIndex] = indexAccumulator;
666 }
667 else { // (lastCacheLineConsecutive && interCacheLineConsecutive)
668 if (print)
669 printf("#3 loopStartIndex: %d\n", loopStartIndex);
670 indexAccumulator+=VSIZE;
671 loopStartIndices[loopStartIndex] = indexAccumulator;
672 }
673 }
674 else {
675 gatherAccumulator += VSIZE;
676 if(lastCacheLineConsecutive){
677 if (print)
678 printf("#4 loopStartIndex: %d\n", loopStartIndex);
679 ++loopStartIndex;
680 indexAccumulator+=VSIZE;
681 loopStartIndices[loopStartIndex] = indexAccumulator;
682 }
683 else { // lastCacheLine without not consecutive memory access pattern
684 if (print)
685 printf("#5 loopStartIndex: %d\n", loopStartIndex);
686 indexAccumulator+=VSIZE;
687 loopStartIndices[loopStartIndex] = indexAccumulator;
688 }
689 }
690 }
691
692 // treating special case when last thread has no remainder loop
693 if (oddKernelThreadStartIndices[nThreads] == fluidBaseIndex + VSIZE - 1) {
694 //printf("--> special case. indexAccumulator: %d\n", indexAccumulator);
695 if (loopStartIndex % 2 != 0) { // current index is gather/scatter end and load/store start index
696 ++loopStartIndex;
697 loopStartIndices[loopStartIndex] = indexAccumulator; //set load/store end (gather/scatter start) to same value as scalar remainder start => no more access to gather/scatter loop
698 }
699
700 ++loopStartIndex;
701 loopStartIndices[loopStartIndex] = indexAccumulator; // gather/scatter end and scalar remainder start
702 ++loopStartIndex;
703 loopStartIndices[loopStartIndex] = indexAccumulator; // scalar remainder end and scalar peel start
704
705 oddKernelThreadStartIndices[threadIndex] = loopStartIndex; // thread start is where scalar peel starts
706 }
707
708 lastCacheLineConsecutive = currentCacheLineConsecutive;
709
710 }
711
712 if (nLoopStartIndices != loopStartIndex){
713 printf("ERROR: nLoopStartIndices unequal loopStartIndex!\n");
714 }
715
716 /*
717 printf("loopStartIndices:\n");
718 for(int i = 0; i <= nLoopStartIndices; ++i){
719 printf("%d ", loopStartIndices[i]);
720 }
721 printf("\n");
722 printf("oddKernelThreadStartIndices:\n");
723 for(int i = 0; i <= nThreads; ++i){
724 printf("%d ", oddKernelThreadStartIndices[i]);
725 }
726 printf("\n");
727 */
728
729 kdlr->loopStartIndices = loopStartIndices;
730 kdlr->nLoopStartIndices = nLoopStartIndices;
731
732 kdlr->oddKernelThreadStartIndices = oddKernelThreadStartIndices;
733 kdlr->nOddKernelThreadStartIndices = nThreads;
734
735 printf("# vload/vstore nodes: \t% 10d \t(%3.4f %% of total fluid nodes)\n", loadAccumulator, ((double) loadAccumulator / (double) nFluid) * 100);
736 printf("# gather/scatter nodes:\t% 10d \t(%3.4f %% of total fluid nodes)\n", gatherAccumulator, ((double) gatherAccumulator / (double) nFluid) * 100.0);
737 printf("# vload/vstore lookups:\t% 10d \n", loadLookups * (N_D3Q19 - 1));
738 printf("# gather/scatter lookups:\t% 10d \n", gatherAccumulator * (N_D3Q19 - 1));
739 printf("# scalar lookups: \t% 10d \n", scalarLookups * (N_D3Q19 - 1));
740
741 double loopBalanceEven = 2.0 * 19 * sizeof(PdfT);
742 double loopBalanceOdd = 2.0 * 19 * sizeof(PdfT) /* actual PDFs */
743 + (((double)(gatherAccumulator + loadLookups + scalarLookups)) / nFluid) * sizeof(int) * (N_D3Q19 - 1) /* AdjList */
744 + (nLoopStartIndices / nFluid) * sizeof(int); // one lookup to loopStartIndices
745
746 double loopBalance = (loopBalanceEven + loopBalanceOdd) / 2.0;
747
748 kdlr->kdl.kd.LoopBalance = loopBalance;
749
750 printf("# loop balance:\n");
751 printf("# even timestep: \t\t%.2f B/FLUP\n", loopBalanceEven);
752 printf("# odd timestep: \t\t%.2f B/FLUP\n", loopBalanceOdd);
753 printf("# average: \t\t%.2f B/FLUP\n", loopBalance);
754
755 return;
756}
757
758void FNAME(D3Q19ListAaPvGatherHybridInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
759{
760 KernelData * kd;
761 KernelDataList * kdl;
762 KernelDataListRia * kdlr;
763 MemAlloc((void **)&kdlr, sizeof(KernelDataListRia));
764
765 kd = (KernelData *)kdlr;
766 kdl = KDL(kdlr);
767
768 *kernelData = kd;
769
770#ifdef DEBUG
771 kd->Pdfs[0] = NULL;
772 kd->Pdfs[1] = NULL;
773 kd->PdfsActive = NULL;
774 kd->DstPdfs = NULL;
775 kd->SrcPdfs = NULL;
776 kd->Dims[0] = -1;
777 kd->Dims[1] = -1;
778 kd->Dims[2] = -1;
779 kd->GlobalDims[0] = -1;
780 kd->GlobalDims[1] = -1;
781 kd->GlobalDims[2] = -1;
782 kd->Offsets[0] = -1;
783 kd->Offsets[1] = -1;
784 kd->Offsets[2] = -1;
785
786 kd->ObstIndices = NULL;
787 kd->nObstIndices = -1;
788 kd->BounceBackPdfsSrc = NULL;
789 kd->BounceBackPdfsDst = NULL;
790 kd->nBounceBackPdfs = -1;
791
792 kdl->AdjList = NULL;
793 kdl->Coords = NULL;
794 kdl->Grid = NULL;
795 kdl->nCells = -1;
796 kdl->nFluid = -1;
797
798 kdlr->loopStartIndices = NULL;
799 kdlr->nLoopStartIndices = 0;
800 kdlr->oddKernelThreadStartIndices = NULL;
801 kdlr->nOddKernelThreadStartIndices = 0;
802#endif
803
804 kdl->Iteration = -1;
805
806 // Ajust the dimensions according to padding, if used.
807 kd->Dims[0] = kd->GlobalDims[0] = ld->Dims[0];
808 kd->Dims[1] = kd->GlobalDims[1] = ld->Dims[1];
809 kd->Dims[2] = kd->GlobalDims[2] = ld->Dims[2];
810
811 int * lDims = ld->Dims;
812
813 int lX = lDims[0];
814 int lY = lDims[1];
815 int lZ = lDims[2];
816
817 int nTotalCells = lX * lY * lZ;
818 int nCells = ld->nFluid; // TODO: + padding
819 int nFluid = ld->nFluid;
820
821 // TODO: check nCells/nFluid do not exceed 2^31. This actually has to be
822 // done during lattice setup.
823 kdl->nCells = nCells;
824 kdl->nFluid = nFluid;
825
826 PdfT * pdfs[2];
827
828 int blk[3] = { 0 };
829
830 ParseParameters(params, blk);
831
832 if (blk[0] == 0) blk[0] = lX;
833 if (blk[1] == 0) blk[1] = lY;
834 if (blk[2] == 0) blk[2] = lZ;
835
836 printf("# blocking: \t\tx: %3d y: %3d z: %3d\n", blk[0], blk[1], blk[2]);
837
838 unsigned long latByte = nCells * sizeof(PdfT) * N_D3Q19;
839 unsigned long latFluidByte = nFluid * sizeof(PdfT) * N_D3Q19;
840 unsigned long latPadByte = (nCells - nFluid) * sizeof(PdfT) * N_D3Q19;
841
842 printf("# Lattice Array Allocation:\n");
843 printf("# lattice size: \t\t%e MiB\n", latByte / 1024.0 / 1024.0);
844 printf("# fluid lattice size:\t\t%e MiB\n", latFluidByte / 1024.0 / 1024.0);
845 printf("# lattice padding: \t\t%e MiB\n", latPadByte / 1024.0 / 1024.0);
846
847
848 printf("# alignment: \t\t%d b\n", PAGE_4K);
849
850 if (PDF_ALLOCATOR((void **)&pdfs[0], latFluidByte, PAGE_4K)) {
851 printf("ERROR: allocating PDF array with %s() failed: %lu bytes.\n", STRINGIFY(PDF_ALLOCATOR), latFluidByte);
852 exit(1);
853 }
854 else {
855 printf("# allocator: \t\t\t%s()\n", STRINGIFY(PDF_ALLOCATOR));
856 }
857
858 kd->Pdfs[0] = pdfs[0];
859
860 // Initialize PDFs with some (arbitrary) data for correct NUMA placement.
861 // Here we touch only the fluid nodes as this loop is OpenMP parallel and
862 // we want the same scheduling as in the kernel.
863 #ifdef _OPENMP
864 #pragma omp parallel for
865 #endif
866 for (int i = 0; i < nFluid; ++i) { for(int d = 0; d < N_D3Q19; ++d) {
867 pdfs[0][P_INDEX_3(nCells, i, d)] = 1.0;
868 } }
869
870 // Initialize all PDFs to some standard value.
871 for (int i = 0; i < nFluid; ++i) { for(int d = 0; d < N_D3Q19; ++d) {
872 pdfs[0][P_INDEX_3(nCells, i, d)] = 0.0;
873 } }
874
875 // ----------------------------------------------------------------------
876 // create grid which will hold the index numbers of the fluid nodes
877
878 uint32_t * grid;
879
880 if (MemAlloc((void **)&grid, nTotalCells * sizeof(uint32_t))) {
881 printf("ERROR: allocating grid for numbering failed: %lu bytes.\n", nTotalCells * sizeof(uint32_t));
882 exit(1);
883 }
884 kdl->Grid = grid;
885
886 int latticeIndex;
887
888#ifdef DEBUG
889 for(int z = 0; z < lZ; ++z) {
890 for(int y = 0; y < lY; ++y) {
891 for(int x = 0; x < lX; ++x) {
892
893 latticeIndex = L_INDEX_4(ld->Dims, x, y, z);
894
895 grid[latticeIndex] = ~0;
896 }
897 }
898 }
899#endif
900
901 // ----------------------------------------------------------------------
902 // generate numbering over grid
903
904 uint32_t * coords;
905
906 if (MemAlloc((void **)&coords, nFluid * sizeof(uint32_t) * 3)) {
907 printf("ERROR: allocating coords array failed: %lu bytes.\n", nFluid * sizeof(uint32_t) * 3);
908 exit(1);
909 }
910
911 kdl->Coords = coords;
912
913 // Index for the PDF nodes can start at 0 as we distinguish solid and fluid nodes
914 // through the ld->Lattice array.
915 int counter = 0;
916
917 // Blocking is implemented via setup of the adjacency list. The kernel later will
918 // walk through the lattice blocked automatically.
919 for (int bZ = 0; bZ < lZ; bZ += blk[2]) {
920 for (int bY = 0; bY < lY; bY += blk[1]) {
921 for (int bX = 0; bX < lX; bX += blk[0]) {
922
923 int eX = MIN(bX + blk[0], lX);
924 int eY = MIN(bY + blk[1], lY);
925 int eZ = MIN(bZ + blk[2], lZ);
926
927
928 for (int z = bZ; z < eZ; ++z) {
929 for (int y = bY; y < eY; ++y) {
930 for (int x = bX; x < eX; ++x) {
931
932 latticeIndex = L_INDEX_4(lDims, x, y, z);
933
934 if (ld->Lattice[latticeIndex] != LAT_CELL_OBSTACLE) {
935 grid[latticeIndex] = counter;
936
937 coords[C_INDEX_X(counter)] = x;
938 coords[C_INDEX_Y(counter)] = y;
939 coords[C_INDEX_Z(counter)] = z;
940
941 ++counter;
942 }
943 } } }
944 } } }
945
946 Verify(counter == nFluid);
947
948 uint32_t * adjList;
949
950 // AoSoA addressing for adjList needs padding for (nFluid % VSIZE) != 0
951 int nFluid_padded = ((nFluid + VSIZE - 1) / VSIZE) * VSIZE;
952 unsigned long adjListBytes = nFluid_padded * sizeof(int) * N_D3Q19_IDX;
953
954 printf("# Adjacency List Allocation:\n");
955 printf("# size: \t\t%e MiB\n", adjListBytes / 1024.0 / 1024.0);
956 printf("# alignment: \t\t%d b\n", PAGE_4K);
957
958 // AdjList only requires 18 instead of 19 entries per node, as
959 // the center PDF needs no addressing.
960 if (ADJ_LIST_ALLOCATOR((void **)&adjList, adjListBytes, PAGE_4K)) {
961 printf("ERROR: allocating adjList array with %s() failed: %lu bytes.\n", STRINGIFY(ADJ_LIST_ALLOCATOR), adjListBytes);
962 exit(1);
963 }
964 else {
965 printf("# allocator: \t\t\t%s()\n", STRINGIFY(ADJ_LIST_ALLOCATOR));
966 }
967
968 for (int i = 0; i < nFluid_padded; ++i){
969 adjList[i] = -1;
970 }
971
972 kdl->AdjList = adjList;
973
974 int x, y, z;
975
976 uint32_t neighborIndex;
977 uint32_t dstIndex;
978
979 int nx, ny, nz, px, py, pz;
980
981 // Loop over all fluid nodes and compute the indices to the neighboring
982 // PDFs for configured data layout (AoS/SoA).
983 // Parallelized loop to ensure correct NUMA placement.
984 // #ifdef _OPENMP --> add line continuation
985 // #pragma omp parallel for default(none)
986 // shared(nFluid, nCells, coords, D3Q19_INV, D3Q19_X, D3Q19_Y, D3Q19_Z,
987 // stderr,
988 // lDims, grid, ld, lX, lY, lZ, adjList)
989 // private(x, y, z, nx, ny, nz, neighborIndex, dstIndex)
990 // #endif
991
992 for (int fluidBaseIndex = 0; fluidBaseIndex < nFluid; fluidBaseIndex+=VSIZE) {
993
994
995 // Loop over all directions except the center one.
996 for(int d = 0; d < N_D3Q19 - 1; ++d) {
997 Assert(d != D3Q19_C);
998
999 for(int inChunkIndex = 0; (inChunkIndex < VSIZE) && ((fluidBaseIndex + inChunkIndex) < nFluid); ++inChunkIndex){
1000 int index = fluidBaseIndex + inChunkIndex;
1001
1002 Assert(index < nFluid);
1003
1004 x = coords[C_INDEX_X(index)];
1005 y = coords[C_INDEX_Y(index)];
1006 z = coords[C_INDEX_Z(index)];
1007
1008 Assert(x >= 0 && x < lX);
1009 Assert(y >= 0 && y < lY);
1010 Assert(z >= 0 && z < lZ);
1011
1012 Assert(ld->Lattice[L_INDEX_4(lDims, x, y, z)] != LAT_CELL_OBSTACLE);
1013
1014#ifdef PROP_MODEL_PUSH
1015 nx = x + D3Q19_X[d];
1016 ny = y + D3Q19_Y[d];
1017 nz = z + D3Q19_Z[d];
1018
1019#elif PROP_MODEL_PULL
1020 nx = x - D3Q19_X[d];
1021 ny = y - D3Q19_Y[d];
1022 nz = z - D3Q19_Z[d];
1023#else
1024 #error No implementation for this PROP_MODEL_NAME.
1025#endif
1026 // If the neighbor is outside the latice in X direction and we have a
1027 // periodic boundary then we need to wrap around.
1028 if ( ((nx < 0 || nx >= lX) && ld->PeriodicX) ||
1029 ((ny < 0 || ny >= lY) && ld->PeriodicY) ||
1030 ((nz < 0 || nz >= lZ) && ld->PeriodicZ)
1031 ){
1032 // x periodic
1033
1034 if (nx < 0) {
1035 px = lX - 1;
1036 }
1037 else if (nx >= lX) {
1038 px = 0;
1039 } else {
1040 px = nx;
1041 }
1042 // y periodic
1043 if (ny < 0) {
1044 py = lY - 1;
1045 }
1046 else if (ny >= lY) {
1047 py = 0;
1048 } else {
1049 py = ny;
1050 }
1051
1052 // z periodic
1053 if (nz < 0) {
1054 pz = lZ - 1;
1055 }
1056 else if (nz >= lZ) {
1057 pz = 0;
1058 } else {
1059 pz = nz;
1060 }
1061
1062 if (ld->Lattice[L_INDEX_4(lDims, px, py, pz)] == LAT_CELL_OBSTACLE) {
1063 dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
1064 }
1065 else {
1066 neighborIndex = grid[L_INDEX_4(lDims, px, py, pz)];
1067
1068 AssertMsg(neighborIndex != ~0, "Neighbor has no Index. (%d %d %d) direction %s (%d)\n", px, py, pz, D3Q19_NAMES[d], d);
1069
1070 dstIndex = P_INDEX_3(nCells, neighborIndex, d);
1071 }
1072 }
1073 else if (nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) {
1074 dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
1075 }
1076 else if (ld->Lattice[L_INDEX_4(lDims, nx, ny, nz)] == LAT_CELL_OBSTACLE) {
1077 dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
1078 }
1079 else {
1080 neighborIndex = grid[L_INDEX_4(lDims, nx, ny, nz)];
1081
1082 Assert(neighborIndex != ~0);
1083
1084 dstIndex = P_INDEX_3(nCells, neighborIndex, d);
1085 }
1086
1087 Assert(dstIndex >= 0);
1088 Assert(dstIndex < nCells * N_D3Q19);
1089
1090 adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (d * VSIZE) + (index % VSIZE)] = dstIndex;
1091 }
1092 }
1093 }
1094
1095// Sets unused adjList entries to some extreme value which triggers and SIGSEG, whenever these values are accidently accessed.
1096 for(int index = nFluid; index < nFluid_padded; ++index){
1097 for(int d = 0; d < N_D3Q19 - 1; ++d) {
1098 adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (d * VSIZE) + (index % VSIZE)] = -10*1000*1000;
1099 }
1100 }
1101
1102/*
1103 printf("============\n");
1104 for (int i = 0; i < nFluid_padded * (N_D3Q19_IDX + 20);){
1105 for (int j = 0; j < VSIZE; ++j){
1106 printf("%d ",adjList[i]);
1107 ++i;
1108 }
1109 printf("\n");
1110 }
1111 for(int dir = 0; dir < N_D3Q19; ++dir){
1112 printf("dir: %d\n",dir);
1113 for(int baseIndex = 0; baseIndex < nFluid + VSIZE; baseIndex+=VSIZE){
1114 for(int i = 0; i < VSIZE; ++i){
1115 int index = baseIndex + i;
1116
1117 printf("%d ", adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (dir * VSIZE) + (index % VSIZE)]);
1118 }
1119 printf("\n");
1120 }
1121 printf("\n");
1122 }
1123 printf("============\n");
1124*/
1125
1126
1127 int nThreads = 1;
1128
1129#ifdef _OPENMP
1130 nThreads = omp_get_max_threads();
1131#endif
1132
1133 SetuploopStartIndices(ld, KDLR(kd), nThreads);
1134
1135 // Fill remaining KernelData structures
1136 kd->GetNode = GetNode;
1137 kd->SetNode = SetNode;
1138
1139 kd->BoundaryConditionsGetPdf = FNAME(BCGetPdf);
1140 kd->BoundaryConditionsSetPdf = FNAME(BCSetPdf);
1141
1142 kd->Kernel = FNAME(D3Q19ListAaPvGatherHybridKernel);
1143
1144 kd->DstPdfs = NULL;
1145 kd->PdfsActive = kd->Pdfs[0];
1146
1147 return;
1148}
1149
1150void FNAME(D3Q19ListAaPvGatherHybridDeinit)(LatticeDesc * ld, KernelData ** kernelData)
1151{
1152 KernelDataListRia ** kdlr = (KernelDataListRia **)kernelData;
1153
1154 MemFree((void **)&((*kdlr)->loopStartIndices));
1155
1156 if ((*kdlr)->oddKernelThreadStartIndices != NULL) {
1157 MemFree((void **)&((*kdlr)->oddKernelThreadStartIndices));
1158 }
1159
1160 KernelDataList ** kdl = (KernelDataList **)kernelData;
1161
1162 ADJ_LIST_FREE((void **)&((*kdl)->AdjList));
1163
1164 MemFree((void **)&((*kdl)->Coords));
1165 MemFree((void **)&((*kdl)->Grid));
1166
1167 PDF_FREE((void **)&((*kernelData)->Pdfs[0]));
1168
1169 MemFree((void **)kernelData);
1170 return;
1171}
1172#undef PAGE_4K
1173#undef ADJ_LIST
This page took 0.202121 seconds and 5 git commands to generate.