merge with kernels from MH's master thesis
[LbmBenchmarkKernelsPublic.git] / src / BenchKernelD3Q19ListPullSplitNt.c
CommitLineData
10988083
MW
1// --------------------------------------------------------------------------
2//
3// Copyright
4// Markus Wittmann, 2016-2017
5// RRZE, University of Erlangen-Nuremberg, Germany
6// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
7//
8// Viktor Haag, 2016
9// LSS, University of Erlangen-Nuremberg, Germany
10//
11// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
12//
13// LbmBenchKernels is free software: you can redistribute it and/or modify
14// it under the terms of the GNU General Public License as published by
15// the Free Software Foundation, either version 3 of the License, or
16// (at your option) any later version.
17//
18// LbmBenchKernels is distributed in the hope that it will be useful,
19// but WITHOUT ANY WARRANTY; without even the implied warranty of
20// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21// GNU General Public License for more details.
22//
23// You should have received a copy of the GNU General Public License
24// along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>.
25//
26// --------------------------------------------------------------------------
27#include "BenchKernelD3Q19ListPullSplitNtCommon.h"
28
29#include "Memory.h"
30#include "Vtk.h"
31#include "Vector.h"
e3f82424 32#include "LikwidIf.h"
10988083
MW
33
34#include <inttypes.h>
35#include <math.h>
36
37#ifdef _OPENMP
38 #include <omp.h>
39#endif
40
41#define TMP_UX 18
42#define TMP_UY 19
43#define TMP_UZ 20
44#define TMP_W1 21
45#define TMP_W2 22
46
47#define N_TMP 23
48
49#define TMP_INDEX(tmp_index, tmp_dir) nTmpArray * (tmp_dir) + (tmp_index)
50
51void FNAME(KernelPullSplitNt1S)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd)
52{
53
54 Assert(ld != NULL);
55 Assert(kernelData != NULL);
56 Assert(cd != NULL);
57
0fde6e45
MW
58 Assert(cd->Omega > F(0.0));
59 Assert(cd->Omega < F(2.0));
10988083
MW
60
61 KernelData * kd = (KernelData *)kernelData;
62 KernelDataList * kdl = KDL(kernelData);
63 KernelDataListRia * kdlr = KDLR(kernelData);
64
65 PdfT omega = cd->Omega;
66 const PdfT omegaEven = omega;
67
0fde6e45
MW
68 PdfT magicParam = F(1.0) / F(12.0);
69 const PdfT omegaOdd = F(1.0) / (F(0.5) + magicParam / (F(1.0) / omega - F(0.5)));
10988083
MW
70
71
0fde6e45
MW
72 const PdfT w_0 = F(1.0) / F( 3.0);
73 const PdfT w_1 = F(1.0) / F(18.0);
74 const PdfT w_2 = F(1.0) / F(36.0);
10988083 75
0fde6e45
MW
76 const PdfT w_1_x3 = w_1 * F(3.0); const PdfT w_1_nine_half = w_1 * F(9.0) / F(2.0);
77 const PdfT w_2_x3 = w_2 * F(3.0); const PdfT w_2_nine_half = w_2 * F(9.0) / F(2.0);
10988083
MW
78
79 const VPDFT vw_1_x3 = VSET(w_1_x3);
80 const VPDFT vw_2_x3 = VSET(w_2_x3);
81
82 const VPDFT vw_1_nine_half = VSET(w_1_nine_half);
83 const VPDFT vw_2_nine_half = VSET(w_2_nine_half);
84
85 const VPDFT vomegaEven = VSET(omegaEven);
86 const VPDFT vomegaOdd = VSET(omegaOdd);
87
0fde6e45 88 const VPDFT voneHalf = VSET(F(0.5));
10988083
MW
89
90 // uint32_t nConsecNodes = kdlr->nConsecNodes;
91 // uint32_t * consecNodes = kdlr->ConsecNodes;
92 // uint32_t consecIndex = 0;
93 // uint32_t consecValue = 0;
94
95 PdfT * src = kd->Pdfs[0];
96 PdfT * dst = kd->Pdfs[1];
97 PdfT * tmp;
98
99 int maxIterations = cd->MaxIterations;
100
101 int nFluid = kdl->nFluid;
102 int nCells = kdl->nCells;
103
104 int nTmpArray = kdlr->nTmpArray;
105
106 Assert(nTmpArray % VSIZE == 0);
107
108 uint32_t * adjList = kdl->AdjList;
109
110 #ifdef VTK_OUTPUT
111 if (cd->VtkOutput) {
112 kd->PdfsActive = src;
113 VtkWrite(ld, kd, cd, -1);
114 }
115 #endif
116
117 #ifdef STATISTICS
118 kd->PdfsActive = src;
119 KernelStatistics(kd, ld, cd, 0);
120 #endif
121
8cafd9ea
MW
122 X_KERNEL_START(kernelData);
123
124 X_LIKWID_START("list-pull-split-nt-1s");
e3f82424 125
10988083
MW
126 #ifdef _OPENMP
127 #pragma omp parallel default(none) \
128 shared(nFluid, nCells, kd, kdl, adjList, src, dst, \
129 cd, maxIterations, ld, tmp, nTmpArray, \
130 stderr )
131 #endif
132 {
133 uint32_t adjListIndex;
134
135 PdfT ux, uy, uz, ui;
136 VPDFT vux, vuy, vuz, vui;
137
138 #define X(name, idx, idxinv, x, y, z) PdfT JOIN(pdf_,name);
139 D3Q19_LIST
140 #undef X
141 VPDFT vpdf_a, vpdf_b;
142
143 PdfT evenPart, oddPart, dir_indep_trm, dens;
144 PdfT w_1_indep, w_2_indep;
145 VPDFT vevenPart, voddPart;
146 VPDFT vw_1_indep, vw_2_indep;
147
148 int indexMax;
149
150 PdfT * tmpArray;
151 MemAllocAligned((void **)&tmpArray, sizeof(PdfT) * nTmpArray * N_TMP, VSIZE * sizeof(PdfT));
152
153 int nThreads = 1;
154 int threadId = 0;
155
156#ifdef _OPENMP
157 nThreads = omp_get_max_threads();
158 threadId = omp_get_thread_num();
159#endif
160
161 int nCellsThread = nFluid / nThreads;
162 int blIndexStart = threadId * nCellsThread;
163
164 if (threadId < nFluid % nThreads) {
165 blIndexStart += threadId;
166 nCellsThread += 1;
167 }
168 else {
169 blIndexStart += nFluid % nThreads;
170 }
171
172 int blIndexStop = blIndexStart + nCellsThread;
173
174 // We have three loops:
175 // 1. Peeling to ensure alignment for non-temporal stores in loop 2 is correct.
176 // 2. Vectorized handling of nodes.
177 // 3. Remaining nodes, less than vector size.
178
179 unsigned long addrStart = (unsigned long)&(src[P_INDEX_3(nCells, blIndexStart, 0)]);
180 int nCellsUnaligned = (VSIZE - (int)((addrStart / sizeof(PdfT)) % VSIZE)) % VSIZE;
181
182 int nCellsVectorized = nCellsThread - nCellsUnaligned;
183 nCellsVectorized = nCellsVectorized - (nCellsVectorized % VSIZE);
184
185 int blIndexVec = blIndexStart + nCellsUnaligned;
186 int blIndexRemaining = blIndexStart + nCellsUnaligned + nCellsVectorized;
187
188 // printf("%d [%d, %d, %d, %d[\n", threadId, blIndexStart, blIndexVec, blIndexRemaining, blIndexStop);
189
190 for(int iter = 0; iter < maxIterations; ++iter) {
191
e3f82424 192
10988083
MW
193#if 1
194 #define INDEX_START blIndexStart
195 #define INDEX_STOP blIndexVec
196 #include "BenchKernelD3Q19ListPullSplitNt1SScalar.h"
197
198 #define INDEX_START blIndexVec
199 #define INDEX_STOP blIndexRemaining
200 #include "BenchKernelD3Q19ListPullSplitNt1SIntrinsics.h"
201
202 #define INDEX_START blIndexRemaining
203 #define INDEX_STOP blIndexStop
204 #include "BenchKernelD3Q19ListPullSplitNt1SScalar.h"
205#else
206 #define INDEX_START blIndexStart
207 #define INDEX_STOP blIndexStop
208 #include "BenchKernelD3Q19ListPullSplitNt1SScalar.h"
209#endif
e3f82424
MW
210
211
10988083
MW
212 #pragma omp barrier
213
214 #pragma omp single
215 {
216 #ifdef VERIFICATION
217 kd->PdfsActive = dst;
218 KernelAddBodyForce(kd, ld, cd);
219 #endif
220
221 #ifdef VTK_OUTPUT
222 if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) {
223 kd->PdfsActive = dst;
224 VtkWrite(ld, kd, cd, iter);
225 }
226 #endif
227
228 #ifdef STATISTICS
229 kd->PdfsActive = dst;
230 KernelStatistics(kd, ld, cd, iter);
231 #endif
232
233 // swap grids
234 tmp = src;
235 src = dst;
236 dst = tmp;
237 }
238
239 #pragma omp barrier
240
241 } // for (int iter = 0; ...
242
243 MemFree((void **)&tmpArray);
244 }
245
e3f82424
MW
246
247 X_LIKWID_STOP("list-pull-split-nt-1s");
248
8cafd9ea
MW
249 X_KERNEL_END(kernelData);
250
10988083
MW
251#ifdef VTK_OUTPUT
252 if (cd->VtkOutput) {
253 kd->PdfsActive = src;
254 VtkWrite(ld, kd, cd, maxIterations);
255 }
256#endif
257
258#ifdef STATISTICS
259 kd->PdfsActive = src;
260 KernelStatistics(kd, ld, cd, maxIterations);
261#endif
262
263 return;
264}
265
266void FNAME(KernelPullSplitNt2S)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd)
267{
268
269 Assert(ld != NULL);
270 Assert(kernelData != NULL);
271 Assert(cd != NULL);
272
0fde6e45
MW
273 Assert(cd->Omega > F(0.0));
274 Assert(cd->Omega < F(2.0));
10988083
MW
275
276 KernelData * kd = (KernelData *)kernelData;
277 KernelDataList * kdl = KDL(kernelData);
278 KernelDataListRia * kdlr = KDLR(kernelData);
279
280 PdfT omega = cd->Omega;
281 const PdfT omegaEven = omega;
282
0fde6e45
MW
283 PdfT magicParam = F(1.0) / F(12.0);
284 const PdfT omegaOdd = F(1.0) / (F(0.5) + magicParam / (F(1.0) / omega - F(0.5)));
10988083 285
0fde6e45
MW
286 const PdfT w_0 = F(1.0) / F( 3.0);
287 const PdfT w_1 = F(1.0) / F(18.0);
288 const PdfT w_2 = F(1.0) / F(36.0);
10988083 289
0fde6e45
MW
290 const PdfT w_1_x3 = w_1 * F(3.0); const PdfT w_1_nine_half = w_1 * F(9.0) / F(2.0);
291 const PdfT w_2_x3 = w_2 * F(3.0); const PdfT w_2_nine_half = w_2 * F(9.0) / F(2.0);
10988083
MW
292
293 const VPDFT vw_1_x3 = VSET(w_1_x3);
294 const VPDFT vw_2_x3 = VSET(w_2_x3);
295
296 const VPDFT vw_1_nine_half = VSET(w_1_nine_half);
297 const VPDFT vw_2_nine_half = VSET(w_2_nine_half);
298
299 const VPDFT vomegaEven = VSET(omegaEven);
300 const VPDFT vomegaOdd = VSET(omegaOdd);
301
0fde6e45 302 const VPDFT voneHalf = VSET(F(0.5));
10988083
MW
303
304 // uint32_t nConsecNodes = kdlr->nConsecNodes;
305 // uint32_t * consecNodes = kdlr->ConsecNodes;
306 // uint32_t consecIndex = 0;
307 // uint32_t consecValue = 0;
308
309 PdfT * src = kd->Pdfs[0];
310 PdfT * dst = kd->Pdfs[1];
311 PdfT * tmp;
312
313 int maxIterations = cd->MaxIterations;
314
315 int nFluid = kdl->nFluid;
316 int nCells = kdl->nCells;
317
318 int nTmpArray = kdlr->nTmpArray;
319
320 Assert(nTmpArray % VSIZE == 0);
321
322 uint32_t * adjList = kdl->AdjList;
323
324 #ifdef VTK_OUTPUT
325 if (cd->VtkOutput) {
326 kd->PdfsActive = src;
327 VtkWrite(ld, kd, cd, -1);
328 }
329 #endif
330
331 #ifdef STATISTICS
332 kd->PdfsActive = src;
333 KernelStatistics(kd, ld, cd, 0);
334 #endif
335
e3f82424 336
8cafd9ea
MW
337 X_KERNEL_START(kernelData);
338
339 X_LIKWID_START("list-pull-split-nt-2s");
e3f82424
MW
340
341
10988083
MW
342 #ifdef _OPENMP
343 #pragma omp parallel default(none) \
344 shared(nFluid, nCells, kd, kdl, adjList, src, dst, \
345 cd, maxIterations, ld, tmp, nTmpArray, \
346 stderr )
347 #endif
348 {
349 uint32_t adjListIndex;
350
351 PdfT ux, uy, uz, ui;
352 VPDFT vux, vuy, vuz, vui;
353
354 #define X(name, idx, idxinv, x, y, z) PdfT JOIN(pdf_,name);
355 D3Q19_LIST
356 #undef X
357 VPDFT vpdf_a, vpdf_b;
358
359 PdfT evenPart, oddPart, dir_indep_trm, dens;
360 PdfT w_1_indep, w_2_indep;
361 VPDFT vevenPart, voddPart;
362 VPDFT vw_1_indep, vw_2_indep;
363
364 int indexMax;
365
366 PdfT * tmpArray;
367 MemAlloc((void **)&tmpArray, sizeof(PdfT) * nTmpArray * N_TMP);
368
369 int nThreads = 1;
370 int threadId = 0;
371
372#ifdef _OPENMP
373 nThreads = omp_get_max_threads();
374 threadId = omp_get_thread_num();
375#endif
376
377 int nCellsThread = nFluid / nThreads;
378 int blIndexStart = threadId * nCellsThread;
379
380 if (threadId < nFluid % nThreads) {
381 blIndexStart += threadId;
382 nCellsThread += 1;
383 }
384 else {
385 blIndexStart += nFluid % nThreads;
386 }
387
388 int blIndexStop = blIndexStart + nCellsThread;
389
390 // We have three loops:
391 // 1. Peeling to ensure alignment for non-temporal stores in loop 2 is correct.
392 // 2. Vectorized handling of nodes.
393 // 3. Remaining nodes, less than vector size.
394
395 unsigned long addrStart = (unsigned long)&(src[P_INDEX_3(nCells, blIndexStart, 0)]);
396 int nCellsUnaligned = (VSIZE - (int)((addrStart / sizeof(PdfT)) % VSIZE)) % VSIZE;
397
398 int nCellsVectorized = nCellsThread - nCellsUnaligned;
399 nCellsVectorized = nCellsVectorized - (nCellsVectorized % VSIZE);
400
401 int blIndexVec = blIndexStart + nCellsUnaligned;
402 int blIndexRemaining = blIndexStart + nCellsUnaligned + nCellsVectorized;
403
404 // printf("%d [%d, %d, %d, %d[\n", threadId, blIndexStart, blIndexVec, blIndexRemaining, blIndexStop);
405
406 for(int iter = 0; iter < maxIterations; ++iter) {
407
408#if 1
409 #define INDEX_START blIndexStart
410 #define INDEX_STOP blIndexVec
411 #include "BenchKernelD3Q19ListPullSplitNt2SScalar.h"
412
413 #define INDEX_START blIndexVec
414 #define INDEX_STOP blIndexRemaining
415 #include "BenchKernelD3Q19ListPullSplitNt2SIntrinsics.h"
416
417 #define INDEX_START blIndexRemaining
418 #define INDEX_STOP blIndexStop
419 #include "BenchKernelD3Q19ListPullSplitNt2SScalar.h"
420#else
421 #define INDEX_START blIndexStart
422 #define INDEX_STOP blIndexStop
423 #include "BenchKernelD3Q19ListPullSplitNt2SScalar.h"
424#endif
425 #pragma omp barrier
426
e3f82424 427
10988083
MW
428 #pragma omp single
429 {
430 #ifdef VERIFICATION
431 kd->PdfsActive = dst;
432 KernelAddBodyForce(kd, ld, cd);
433 #endif
434
435 #ifdef VTK_OUTPUT
436 if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) {
437 kd->PdfsActive = dst;
438 VtkWrite(ld, kd, cd, iter);
439 }
440 #endif
441
442 #ifdef STATISTICS
443 kd->PdfsActive = dst;
444 KernelStatistics(kd, ld, cd, iter);
445 #endif
446
447 // swap grids
448 tmp = src;
449 src = dst;
450 dst = tmp;
451 }
452
453 #pragma omp barrier
454
455 } // for (int iter = 0; ...
456
457 MemFree((void **)&tmpArray);
458 }
459
8cafd9ea
MW
460 X_LIKWID_STOP("list-pull-split-nt-2s");
461
462 X_KERNEL_END(kernelData);
e3f82424 463
10988083
MW
464#ifdef VTK_OUTPUT
465 if (cd->VtkOutput) {
466 kd->PdfsActive = src;
467 VtkWrite(ld, kd, cd, maxIterations);
468 }
469#endif
470
471#ifdef STATISTICS
472 kd->PdfsActive = src;
473 KernelStatistics(kd, ld, cd, maxIterations);
474#endif
475
476 return;
477}
478
This page took 0.102818 seconds and 5 git commands to generate.