add single precision, add aa-vec-sl-soa kernel, updated doc
[LbmBenchmarkKernelsPublic.git] / src / BenchKernelD3Q19ListPullSplitNt.c
CommitLineData
10988083
MW
1// --------------------------------------------------------------------------
2//
3// Copyright
4// Markus Wittmann, 2016-2017
5// RRZE, University of Erlangen-Nuremberg, Germany
6// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
7//
8// Viktor Haag, 2016
9// LSS, University of Erlangen-Nuremberg, Germany
10//
11// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
12//
13// LbmBenchKernels is free software: you can redistribute it and/or modify
14// it under the terms of the GNU General Public License as published by
15// the Free Software Foundation, either version 3 of the License, or
16// (at your option) any later version.
17//
18// LbmBenchKernels is distributed in the hope that it will be useful,
19// but WITHOUT ANY WARRANTY; without even the implied warranty of
20// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21// GNU General Public License for more details.
22//
23// You should have received a copy of the GNU General Public License
24// along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>.
25//
26// --------------------------------------------------------------------------
27#include "BenchKernelD3Q19ListPullSplitNtCommon.h"
28
29#include "Memory.h"
30#include "Vtk.h"
31#include "Vector.h"
e3f82424 32#include "LikwidIf.h"
10988083
MW
33
34#include <inttypes.h>
35#include <math.h>
36
37#ifdef _OPENMP
38 #include <omp.h>
39#endif
40
41#define TMP_UX 18
42#define TMP_UY 19
43#define TMP_UZ 20
44#define TMP_W1 21
45#define TMP_W2 22
46
47#define N_TMP 23
48
49#define TMP_INDEX(tmp_index, tmp_dir) nTmpArray * (tmp_dir) + (tmp_index)
50
51void FNAME(KernelPullSplitNt1S)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd)
52{
53
54 Assert(ld != NULL);
55 Assert(kernelData != NULL);
56 Assert(cd != NULL);
57
0fde6e45
MW
58 Assert(cd->Omega > F(0.0));
59 Assert(cd->Omega < F(2.0));
10988083
MW
60
61 KernelData * kd = (KernelData *)kernelData;
62 KernelDataList * kdl = KDL(kernelData);
63 KernelDataListRia * kdlr = KDLR(kernelData);
64
65 PdfT omega = cd->Omega;
66 const PdfT omegaEven = omega;
67
0fde6e45
MW
68 PdfT magicParam = F(1.0) / F(12.0);
69 const PdfT omegaOdd = F(1.0) / (F(0.5) + magicParam / (F(1.0) / omega - F(0.5)));
10988083
MW
70
71
0fde6e45
MW
72 const PdfT w_0 = F(1.0) / F( 3.0);
73 const PdfT w_1 = F(1.0) / F(18.0);
74 const PdfT w_2 = F(1.0) / F(36.0);
10988083 75
0fde6e45
MW
76 const PdfT w_1_x3 = w_1 * F(3.0); const PdfT w_1_nine_half = w_1 * F(9.0) / F(2.0);
77 const PdfT w_2_x3 = w_2 * F(3.0); const PdfT w_2_nine_half = w_2 * F(9.0) / F(2.0);
10988083
MW
78
79 const VPDFT vw_1_x3 = VSET(w_1_x3);
80 const VPDFT vw_2_x3 = VSET(w_2_x3);
81
82 const VPDFT vw_1_nine_half = VSET(w_1_nine_half);
83 const VPDFT vw_2_nine_half = VSET(w_2_nine_half);
84
85 const VPDFT vomegaEven = VSET(omegaEven);
86 const VPDFT vomegaOdd = VSET(omegaOdd);
87
0fde6e45 88 const VPDFT voneHalf = VSET(F(0.5));
10988083
MW
89
90 // uint32_t nConsecNodes = kdlr->nConsecNodes;
91 // uint32_t * consecNodes = kdlr->ConsecNodes;
92 // uint32_t consecIndex = 0;
93 // uint32_t consecValue = 0;
94
95 PdfT * src = kd->Pdfs[0];
96 PdfT * dst = kd->Pdfs[1];
97 PdfT * tmp;
98
99 int maxIterations = cd->MaxIterations;
100
101 int nFluid = kdl->nFluid;
102 int nCells = kdl->nCells;
103
104 int nTmpArray = kdlr->nTmpArray;
105
106 Assert(nTmpArray % VSIZE == 0);
107
108 uint32_t * adjList = kdl->AdjList;
109
110 #ifdef VTK_OUTPUT
111 if (cd->VtkOutput) {
112 kd->PdfsActive = src;
113 VtkWrite(ld, kd, cd, -1);
114 }
115 #endif
116
117 #ifdef STATISTICS
118 kd->PdfsActive = src;
119 KernelStatistics(kd, ld, cd, 0);
120 #endif
121
e3f82424
MW
122
123 X_LIKWID_START("list-pull-split-nt-1s");
10988083
MW
124 #ifdef _OPENMP
125 #pragma omp parallel default(none) \
126 shared(nFluid, nCells, kd, kdl, adjList, src, dst, \
127 cd, maxIterations, ld, tmp, nTmpArray, \
128 stderr )
129 #endif
130 {
131 uint32_t adjListIndex;
132
133 PdfT ux, uy, uz, ui;
134 VPDFT vux, vuy, vuz, vui;
135
136 #define X(name, idx, idxinv, x, y, z) PdfT JOIN(pdf_,name);
137 D3Q19_LIST
138 #undef X
139 VPDFT vpdf_a, vpdf_b;
140
141 PdfT evenPart, oddPart, dir_indep_trm, dens;
142 PdfT w_1_indep, w_2_indep;
143 VPDFT vevenPart, voddPart;
144 VPDFT vw_1_indep, vw_2_indep;
145
146 int indexMax;
147
148 PdfT * tmpArray;
149 MemAllocAligned((void **)&tmpArray, sizeof(PdfT) * nTmpArray * N_TMP, VSIZE * sizeof(PdfT));
150
151 int nThreads = 1;
152 int threadId = 0;
153
154#ifdef _OPENMP
155 nThreads = omp_get_max_threads();
156 threadId = omp_get_thread_num();
157#endif
158
159 int nCellsThread = nFluid / nThreads;
160 int blIndexStart = threadId * nCellsThread;
161
162 if (threadId < nFluid % nThreads) {
163 blIndexStart += threadId;
164 nCellsThread += 1;
165 }
166 else {
167 blIndexStart += nFluid % nThreads;
168 }
169
170 int blIndexStop = blIndexStart + nCellsThread;
171
172 // We have three loops:
173 // 1. Peeling to ensure alignment for non-temporal stores in loop 2 is correct.
174 // 2. Vectorized handling of nodes.
175 // 3. Remaining nodes, less than vector size.
176
177 unsigned long addrStart = (unsigned long)&(src[P_INDEX_3(nCells, blIndexStart, 0)]);
178 int nCellsUnaligned = (VSIZE - (int)((addrStart / sizeof(PdfT)) % VSIZE)) % VSIZE;
179
180 int nCellsVectorized = nCellsThread - nCellsUnaligned;
181 nCellsVectorized = nCellsVectorized - (nCellsVectorized % VSIZE);
182
183 int blIndexVec = blIndexStart + nCellsUnaligned;
184 int blIndexRemaining = blIndexStart + nCellsUnaligned + nCellsVectorized;
185
186 // printf("%d [%d, %d, %d, %d[\n", threadId, blIndexStart, blIndexVec, blIndexRemaining, blIndexStop);
187
188 for(int iter = 0; iter < maxIterations; ++iter) {
189
e3f82424 190
10988083
MW
191#if 1
192 #define INDEX_START blIndexStart
193 #define INDEX_STOP blIndexVec
194 #include "BenchKernelD3Q19ListPullSplitNt1SScalar.h"
195
196 #define INDEX_START blIndexVec
197 #define INDEX_STOP blIndexRemaining
198 #include "BenchKernelD3Q19ListPullSplitNt1SIntrinsics.h"
199
200 #define INDEX_START blIndexRemaining
201 #define INDEX_STOP blIndexStop
202 #include "BenchKernelD3Q19ListPullSplitNt1SScalar.h"
203#else
204 #define INDEX_START blIndexStart
205 #define INDEX_STOP blIndexStop
206 #include "BenchKernelD3Q19ListPullSplitNt1SScalar.h"
207#endif
e3f82424
MW
208
209
10988083
MW
210 #pragma omp barrier
211
212 #pragma omp single
213 {
214 #ifdef VERIFICATION
215 kd->PdfsActive = dst;
216 KernelAddBodyForce(kd, ld, cd);
217 #endif
218
219 #ifdef VTK_OUTPUT
220 if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) {
221 kd->PdfsActive = dst;
222 VtkWrite(ld, kd, cd, iter);
223 }
224 #endif
225
226 #ifdef STATISTICS
227 kd->PdfsActive = dst;
228 KernelStatistics(kd, ld, cd, iter);
229 #endif
230
231 // swap grids
232 tmp = src;
233 src = dst;
234 dst = tmp;
235 }
236
237 #pragma omp barrier
238
239 } // for (int iter = 0; ...
240
241 MemFree((void **)&tmpArray);
242 }
243
e3f82424
MW
244
245 X_LIKWID_STOP("list-pull-split-nt-1s");
246
10988083
MW
247#ifdef VTK_OUTPUT
248 if (cd->VtkOutput) {
249 kd->PdfsActive = src;
250 VtkWrite(ld, kd, cd, maxIterations);
251 }
252#endif
253
254#ifdef STATISTICS
255 kd->PdfsActive = src;
256 KernelStatistics(kd, ld, cd, maxIterations);
257#endif
258
259 return;
260}
261
262void FNAME(KernelPullSplitNt2S)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd)
263{
264
265 Assert(ld != NULL);
266 Assert(kernelData != NULL);
267 Assert(cd != NULL);
268
0fde6e45
MW
269 Assert(cd->Omega > F(0.0));
270 Assert(cd->Omega < F(2.0));
10988083
MW
271
272 KernelData * kd = (KernelData *)kernelData;
273 KernelDataList * kdl = KDL(kernelData);
274 KernelDataListRia * kdlr = KDLR(kernelData);
275
276 PdfT omega = cd->Omega;
277 const PdfT omegaEven = omega;
278
0fde6e45
MW
279 PdfT magicParam = F(1.0) / F(12.0);
280 const PdfT omegaOdd = F(1.0) / (F(0.5) + magicParam / (F(1.0) / omega - F(0.5)));
10988083 281
0fde6e45
MW
282 const PdfT w_0 = F(1.0) / F( 3.0);
283 const PdfT w_1 = F(1.0) / F(18.0);
284 const PdfT w_2 = F(1.0) / F(36.0);
10988083 285
0fde6e45
MW
286 const PdfT w_1_x3 = w_1 * F(3.0); const PdfT w_1_nine_half = w_1 * F(9.0) / F(2.0);
287 const PdfT w_2_x3 = w_2 * F(3.0); const PdfT w_2_nine_half = w_2 * F(9.0) / F(2.0);
10988083
MW
288
289 const VPDFT vw_1_x3 = VSET(w_1_x3);
290 const VPDFT vw_2_x3 = VSET(w_2_x3);
291
292 const VPDFT vw_1_nine_half = VSET(w_1_nine_half);
293 const VPDFT vw_2_nine_half = VSET(w_2_nine_half);
294
295 const VPDFT vomegaEven = VSET(omegaEven);
296 const VPDFT vomegaOdd = VSET(omegaOdd);
297
0fde6e45 298 const VPDFT voneHalf = VSET(F(0.5));
10988083
MW
299
300 // uint32_t nConsecNodes = kdlr->nConsecNodes;
301 // uint32_t * consecNodes = kdlr->ConsecNodes;
302 // uint32_t consecIndex = 0;
303 // uint32_t consecValue = 0;
304
305 PdfT * src = kd->Pdfs[0];
306 PdfT * dst = kd->Pdfs[1];
307 PdfT * tmp;
308
309 int maxIterations = cd->MaxIterations;
310
311 int nFluid = kdl->nFluid;
312 int nCells = kdl->nCells;
313
314 int nTmpArray = kdlr->nTmpArray;
315
316 Assert(nTmpArray % VSIZE == 0);
317
318 uint32_t * adjList = kdl->AdjList;
319
320 #ifdef VTK_OUTPUT
321 if (cd->VtkOutput) {
322 kd->PdfsActive = src;
323 VtkWrite(ld, kd, cd, -1);
324 }
325 #endif
326
327 #ifdef STATISTICS
328 kd->PdfsActive = src;
329 KernelStatistics(kd, ld, cd, 0);
330 #endif
331
e3f82424
MW
332
333 X_LIKWID_START("list-pull-split-nt-2s");
334
335
10988083
MW
336 #ifdef _OPENMP
337 #pragma omp parallel default(none) \
338 shared(nFluid, nCells, kd, kdl, adjList, src, dst, \
339 cd, maxIterations, ld, tmp, nTmpArray, \
340 stderr )
341 #endif
342 {
343 uint32_t adjListIndex;
344
345 PdfT ux, uy, uz, ui;
346 VPDFT vux, vuy, vuz, vui;
347
348 #define X(name, idx, idxinv, x, y, z) PdfT JOIN(pdf_,name);
349 D3Q19_LIST
350 #undef X
351 VPDFT vpdf_a, vpdf_b;
352
353 PdfT evenPart, oddPart, dir_indep_trm, dens;
354 PdfT w_1_indep, w_2_indep;
355 VPDFT vevenPart, voddPart;
356 VPDFT vw_1_indep, vw_2_indep;
357
358 int indexMax;
359
360 PdfT * tmpArray;
361 MemAlloc((void **)&tmpArray, sizeof(PdfT) * nTmpArray * N_TMP);
362
363 int nThreads = 1;
364 int threadId = 0;
365
366#ifdef _OPENMP
367 nThreads = omp_get_max_threads();
368 threadId = omp_get_thread_num();
369#endif
370
371 int nCellsThread = nFluid / nThreads;
372 int blIndexStart = threadId * nCellsThread;
373
374 if (threadId < nFluid % nThreads) {
375 blIndexStart += threadId;
376 nCellsThread += 1;
377 }
378 else {
379 blIndexStart += nFluid % nThreads;
380 }
381
382 int blIndexStop = blIndexStart + nCellsThread;
383
384 // We have three loops:
385 // 1. Peeling to ensure alignment for non-temporal stores in loop 2 is correct.
386 // 2. Vectorized handling of nodes.
387 // 3. Remaining nodes, less than vector size.
388
389 unsigned long addrStart = (unsigned long)&(src[P_INDEX_3(nCells, blIndexStart, 0)]);
390 int nCellsUnaligned = (VSIZE - (int)((addrStart / sizeof(PdfT)) % VSIZE)) % VSIZE;
391
392 int nCellsVectorized = nCellsThread - nCellsUnaligned;
393 nCellsVectorized = nCellsVectorized - (nCellsVectorized % VSIZE);
394
395 int blIndexVec = blIndexStart + nCellsUnaligned;
396 int blIndexRemaining = blIndexStart + nCellsUnaligned + nCellsVectorized;
397
398 // printf("%d [%d, %d, %d, %d[\n", threadId, blIndexStart, blIndexVec, blIndexRemaining, blIndexStop);
399
400 for(int iter = 0; iter < maxIterations; ++iter) {
401
402#if 1
403 #define INDEX_START blIndexStart
404 #define INDEX_STOP blIndexVec
405 #include "BenchKernelD3Q19ListPullSplitNt2SScalar.h"
406
407 #define INDEX_START blIndexVec
408 #define INDEX_STOP blIndexRemaining
409 #include "BenchKernelD3Q19ListPullSplitNt2SIntrinsics.h"
410
411 #define INDEX_START blIndexRemaining
412 #define INDEX_STOP blIndexStop
413 #include "BenchKernelD3Q19ListPullSplitNt2SScalar.h"
414#else
415 #define INDEX_START blIndexStart
416 #define INDEX_STOP blIndexStop
417 #include "BenchKernelD3Q19ListPullSplitNt2SScalar.h"
418#endif
419 #pragma omp barrier
420
e3f82424 421
10988083
MW
422 #pragma omp single
423 {
424 #ifdef VERIFICATION
425 kd->PdfsActive = dst;
426 KernelAddBodyForce(kd, ld, cd);
427 #endif
428
429 #ifdef VTK_OUTPUT
430 if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) {
431 kd->PdfsActive = dst;
432 VtkWrite(ld, kd, cd, iter);
433 }
434 #endif
435
436 #ifdef STATISTICS
437 kd->PdfsActive = dst;
438 KernelStatistics(kd, ld, cd, iter);
439 #endif
440
441 // swap grids
442 tmp = src;
443 src = dst;
444 dst = tmp;
445 }
446
447 #pragma omp barrier
448
449 } // for (int iter = 0; ...
450
451 MemFree((void **)&tmpArray);
452 }
453
e3f82424
MW
454 X_LIKWID_STOP("list-pull-split-nt-2s");
455
10988083
MW
456#ifdef VTK_OUTPUT
457 if (cd->VtkOutput) {
458 kd->PdfsActive = src;
459 VtkWrite(ld, kd, cd, maxIterations);
460 }
461#endif
462
463#ifdef STATISTICS
464 kd->PdfsActive = src;
465 KernelStatistics(kd, ld, cd, maxIterations);
466#endif
467
468 return;
469}
470
This page took 0.083823 seconds and 5 git commands to generate.