version 0.1
[LbmBenchmarkKernelsPublic.git] / src / BenchKernelD3Q19ListPullSplitNt.c
CommitLineData
10988083
MW
1// --------------------------------------------------------------------------
2//
3// Copyright
4// Markus Wittmann, 2016-2017
5// RRZE, University of Erlangen-Nuremberg, Germany
6// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
7//
8// Viktor Haag, 2016
9// LSS, University of Erlangen-Nuremberg, Germany
10//
11// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
12//
13// LbmBenchKernels is free software: you can redistribute it and/or modify
14// it under the terms of the GNU General Public License as published by
15// the Free Software Foundation, either version 3 of the License, or
16// (at your option) any later version.
17//
18// LbmBenchKernels is distributed in the hope that it will be useful,
19// but WITHOUT ANY WARRANTY; without even the implied warranty of
20// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21// GNU General Public License for more details.
22//
23// You should have received a copy of the GNU General Public License
24// along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>.
25//
26// --------------------------------------------------------------------------
27#include "BenchKernelD3Q19ListPullSplitNtCommon.h"
28
29#include "Memory.h"
30#include "Vtk.h"
31#include "Vector.h"
32
33#include <inttypes.h>
34#include <math.h>
35
36#ifdef _OPENMP
37 #include <omp.h>
38#endif
39
40#define TMP_UX 18
41#define TMP_UY 19
42#define TMP_UZ 20
43#define TMP_W1 21
44#define TMP_W2 22
45
46#define N_TMP 23
47
48#define TMP_INDEX(tmp_index, tmp_dir) nTmpArray * (tmp_dir) + (tmp_index)
49
50void FNAME(KernelPullSplitNt1S)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd)
51{
52
53 Assert(ld != NULL);
54 Assert(kernelData != NULL);
55 Assert(cd != NULL);
56
57 Assert(cd->Omega > 0.0);
58 Assert(cd->Omega < 2.0);
59
60 KernelData * kd = (KernelData *)kernelData;
61 KernelDataList * kdl = KDL(kernelData);
62 KernelDataListRia * kdlr = KDLR(kernelData);
63
64 PdfT omega = cd->Omega;
65 const PdfT omegaEven = omega;
66
67 PdfT magicParam = 1.0 / 12.0;
68 const PdfT omegaOdd = 1.0 / (0.5 + magicParam / (1.0 / omega - 0.5));
69
70
71 const PdfT w_0 = 1.0 / 3.0;
72 const PdfT w_1 = 1.0 / 18.0;
73 const PdfT w_2 = 1.0 / 36.0;
74
75 const PdfT w_1_x3 = w_1 * 3.0; const PdfT w_1_nine_half = w_1 * 9.0 / 2.0;
76 const PdfT w_2_x3 = w_2 * 3.0; const PdfT w_2_nine_half = w_2 * 9.0 / 2.0;
77
78 const VPDFT vw_1_x3 = VSET(w_1_x3);
79 const VPDFT vw_2_x3 = VSET(w_2_x3);
80
81 const VPDFT vw_1_nine_half = VSET(w_1_nine_half);
82 const VPDFT vw_2_nine_half = VSET(w_2_nine_half);
83
84 const VPDFT vomegaEven = VSET(omegaEven);
85 const VPDFT vomegaOdd = VSET(omegaOdd);
86
87 const VPDFT voneHalf = VSET(0.5);
88
89 // uint32_t nConsecNodes = kdlr->nConsecNodes;
90 // uint32_t * consecNodes = kdlr->ConsecNodes;
91 // uint32_t consecIndex = 0;
92 // uint32_t consecValue = 0;
93
94 PdfT * src = kd->Pdfs[0];
95 PdfT * dst = kd->Pdfs[1];
96 PdfT * tmp;
97
98 int maxIterations = cd->MaxIterations;
99
100 int nFluid = kdl->nFluid;
101 int nCells = kdl->nCells;
102
103 int nTmpArray = kdlr->nTmpArray;
104
105 Assert(nTmpArray % VSIZE == 0);
106
107 uint32_t * adjList = kdl->AdjList;
108
109 #ifdef VTK_OUTPUT
110 if (cd->VtkOutput) {
111 kd->PdfsActive = src;
112 VtkWrite(ld, kd, cd, -1);
113 }
114 #endif
115
116 #ifdef STATISTICS
117 kd->PdfsActive = src;
118 KernelStatistics(kd, ld, cd, 0);
119 #endif
120
121 #ifdef _OPENMP
122 #pragma omp parallel default(none) \
123 shared(nFluid, nCells, kd, kdl, adjList, src, dst, \
124 cd, maxIterations, ld, tmp, nTmpArray, \
125 stderr )
126 #endif
127 {
128 uint32_t adjListIndex;
129
130 PdfT ux, uy, uz, ui;
131 VPDFT vux, vuy, vuz, vui;
132
133 #define X(name, idx, idxinv, x, y, z) PdfT JOIN(pdf_,name);
134 D3Q19_LIST
135 #undef X
136 VPDFT vpdf_a, vpdf_b;
137
138 PdfT evenPart, oddPart, dir_indep_trm, dens;
139 PdfT w_1_indep, w_2_indep;
140 VPDFT vevenPart, voddPart;
141 VPDFT vw_1_indep, vw_2_indep;
142
143 int indexMax;
144
145 PdfT * tmpArray;
146 MemAllocAligned((void **)&tmpArray, sizeof(PdfT) * nTmpArray * N_TMP, VSIZE * sizeof(PdfT));
147
148 int nThreads = 1;
149 int threadId = 0;
150
151#ifdef _OPENMP
152 nThreads = omp_get_max_threads();
153 threadId = omp_get_thread_num();
154#endif
155
156 int nCellsThread = nFluid / nThreads;
157 int blIndexStart = threadId * nCellsThread;
158
159 if (threadId < nFluid % nThreads) {
160 blIndexStart += threadId;
161 nCellsThread += 1;
162 }
163 else {
164 blIndexStart += nFluid % nThreads;
165 }
166
167 int blIndexStop = blIndexStart + nCellsThread;
168
169 // We have three loops:
170 // 1. Peeling to ensure alignment for non-temporal stores in loop 2 is correct.
171 // 2. Vectorized handling of nodes.
172 // 3. Remaining nodes, less than vector size.
173
174 unsigned long addrStart = (unsigned long)&(src[P_INDEX_3(nCells, blIndexStart, 0)]);
175 int nCellsUnaligned = (VSIZE - (int)((addrStart / sizeof(PdfT)) % VSIZE)) % VSIZE;
176
177 int nCellsVectorized = nCellsThread - nCellsUnaligned;
178 nCellsVectorized = nCellsVectorized - (nCellsVectorized % VSIZE);
179
180 int blIndexVec = blIndexStart + nCellsUnaligned;
181 int blIndexRemaining = blIndexStart + nCellsUnaligned + nCellsVectorized;
182
183 // printf("%d [%d, %d, %d, %d[\n", threadId, blIndexStart, blIndexVec, blIndexRemaining, blIndexStop);
184
185 for(int iter = 0; iter < maxIterations; ++iter) {
186
187#if 1
188 #define INDEX_START blIndexStart
189 #define INDEX_STOP blIndexVec
190 #include "BenchKernelD3Q19ListPullSplitNt1SScalar.h"
191
192 #define INDEX_START blIndexVec
193 #define INDEX_STOP blIndexRemaining
194 #include "BenchKernelD3Q19ListPullSplitNt1SIntrinsics.h"
195
196 #define INDEX_START blIndexRemaining
197 #define INDEX_STOP blIndexStop
198 #include "BenchKernelD3Q19ListPullSplitNt1SScalar.h"
199#else
200 #define INDEX_START blIndexStart
201 #define INDEX_STOP blIndexStop
202 #include "BenchKernelD3Q19ListPullSplitNt1SScalar.h"
203#endif
204 #pragma omp barrier
205
206 #pragma omp single
207 {
208 #ifdef VERIFICATION
209 kd->PdfsActive = dst;
210 KernelAddBodyForce(kd, ld, cd);
211 #endif
212
213 #ifdef VTK_OUTPUT
214 if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) {
215 kd->PdfsActive = dst;
216 VtkWrite(ld, kd, cd, iter);
217 }
218 #endif
219
220 #ifdef STATISTICS
221 kd->PdfsActive = dst;
222 KernelStatistics(kd, ld, cd, iter);
223 #endif
224
225 // swap grids
226 tmp = src;
227 src = dst;
228 dst = tmp;
229 }
230
231 #pragma omp barrier
232
233 } // for (int iter = 0; ...
234
235 MemFree((void **)&tmpArray);
236 }
237
238#ifdef VTK_OUTPUT
239 if (cd->VtkOutput) {
240 kd->PdfsActive = src;
241 VtkWrite(ld, kd, cd, maxIterations);
242 }
243#endif
244
245#ifdef STATISTICS
246 kd->PdfsActive = src;
247 KernelStatistics(kd, ld, cd, maxIterations);
248#endif
249
250 return;
251}
252
253void FNAME(KernelPullSplitNt2S)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd)
254{
255
256 Assert(ld != NULL);
257 Assert(kernelData != NULL);
258 Assert(cd != NULL);
259
260 Assert(cd->Omega > 0.0);
261 Assert(cd->Omega < 2.0);
262
263 KernelData * kd = (KernelData *)kernelData;
264 KernelDataList * kdl = KDL(kernelData);
265 KernelDataListRia * kdlr = KDLR(kernelData);
266
267 PdfT omega = cd->Omega;
268 const PdfT omegaEven = omega;
269
270 PdfT magicParam = 1.0 / 12.0;
271 const PdfT omegaOdd = 1.0 / (0.5 + magicParam / (1.0 / omega - 0.5));
272
273
274 const PdfT w_0 = 1.0 / 3.0;
275 const PdfT w_1 = 1.0 / 18.0;
276 const PdfT w_2 = 1.0 / 36.0;
277
278 const PdfT w_1_x3 = w_1 * 3.0; const PdfT w_1_nine_half = w_1 * 9.0 / 2.0;
279 const PdfT w_2_x3 = w_2 * 3.0; const PdfT w_2_nine_half = w_2 * 9.0 / 2.0;
280
281 const VPDFT vw_1_x3 = VSET(w_1_x3);
282 const VPDFT vw_2_x3 = VSET(w_2_x3);
283
284 const VPDFT vw_1_nine_half = VSET(w_1_nine_half);
285 const VPDFT vw_2_nine_half = VSET(w_2_nine_half);
286
287 const VPDFT vomegaEven = VSET(omegaEven);
288 const VPDFT vomegaOdd = VSET(omegaOdd);
289
290 const VPDFT voneHalf = VSET(0.5);
291
292 // uint32_t nConsecNodes = kdlr->nConsecNodes;
293 // uint32_t * consecNodes = kdlr->ConsecNodes;
294 // uint32_t consecIndex = 0;
295 // uint32_t consecValue = 0;
296
297 PdfT * src = kd->Pdfs[0];
298 PdfT * dst = kd->Pdfs[1];
299 PdfT * tmp;
300
301 int maxIterations = cd->MaxIterations;
302
303 int nFluid = kdl->nFluid;
304 int nCells = kdl->nCells;
305
306 int nTmpArray = kdlr->nTmpArray;
307
308 Assert(nTmpArray % VSIZE == 0);
309
310 uint32_t * adjList = kdl->AdjList;
311
312 #ifdef VTK_OUTPUT
313 if (cd->VtkOutput) {
314 kd->PdfsActive = src;
315 VtkWrite(ld, kd, cd, -1);
316 }
317 #endif
318
319 #ifdef STATISTICS
320 kd->PdfsActive = src;
321 KernelStatistics(kd, ld, cd, 0);
322 #endif
323
324 #ifdef _OPENMP
325 #pragma omp parallel default(none) \
326 shared(nFluid, nCells, kd, kdl, adjList, src, dst, \
327 cd, maxIterations, ld, tmp, nTmpArray, \
328 stderr )
329 #endif
330 {
331 uint32_t adjListIndex;
332
333 PdfT ux, uy, uz, ui;
334 VPDFT vux, vuy, vuz, vui;
335
336 #define X(name, idx, idxinv, x, y, z) PdfT JOIN(pdf_,name);
337 D3Q19_LIST
338 #undef X
339 VPDFT vpdf_a, vpdf_b;
340
341 PdfT evenPart, oddPart, dir_indep_trm, dens;
342 PdfT w_1_indep, w_2_indep;
343 VPDFT vevenPart, voddPart;
344 VPDFT vw_1_indep, vw_2_indep;
345
346 int indexMax;
347
348 PdfT * tmpArray;
349 MemAlloc((void **)&tmpArray, sizeof(PdfT) * nTmpArray * N_TMP);
350
351 int nThreads = 1;
352 int threadId = 0;
353
354#ifdef _OPENMP
355 nThreads = omp_get_max_threads();
356 threadId = omp_get_thread_num();
357#endif
358
359 int nCellsThread = nFluid / nThreads;
360 int blIndexStart = threadId * nCellsThread;
361
362 if (threadId < nFluid % nThreads) {
363 blIndexStart += threadId;
364 nCellsThread += 1;
365 }
366 else {
367 blIndexStart += nFluid % nThreads;
368 }
369
370 int blIndexStop = blIndexStart + nCellsThread;
371
372 // We have three loops:
373 // 1. Peeling to ensure alignment for non-temporal stores in loop 2 is correct.
374 // 2. Vectorized handling of nodes.
375 // 3. Remaining nodes, less than vector size.
376
377 unsigned long addrStart = (unsigned long)&(src[P_INDEX_3(nCells, blIndexStart, 0)]);
378 int nCellsUnaligned = (VSIZE - (int)((addrStart / sizeof(PdfT)) % VSIZE)) % VSIZE;
379
380 int nCellsVectorized = nCellsThread - nCellsUnaligned;
381 nCellsVectorized = nCellsVectorized - (nCellsVectorized % VSIZE);
382
383 int blIndexVec = blIndexStart + nCellsUnaligned;
384 int blIndexRemaining = blIndexStart + nCellsUnaligned + nCellsVectorized;
385
386 // printf("%d [%d, %d, %d, %d[\n", threadId, blIndexStart, blIndexVec, blIndexRemaining, blIndexStop);
387
388 for(int iter = 0; iter < maxIterations; ++iter) {
389
390#if 1
391 #define INDEX_START blIndexStart
392 #define INDEX_STOP blIndexVec
393 #include "BenchKernelD3Q19ListPullSplitNt2SScalar.h"
394
395 #define INDEX_START blIndexVec
396 #define INDEX_STOP blIndexRemaining
397 #include "BenchKernelD3Q19ListPullSplitNt2SIntrinsics.h"
398
399 #define INDEX_START blIndexRemaining
400 #define INDEX_STOP blIndexStop
401 #include "BenchKernelD3Q19ListPullSplitNt2SScalar.h"
402#else
403 #define INDEX_START blIndexStart
404 #define INDEX_STOP blIndexStop
405 #include "BenchKernelD3Q19ListPullSplitNt2SScalar.h"
406#endif
407 #pragma omp barrier
408
409 #pragma omp single
410 {
411 #ifdef VERIFICATION
412 kd->PdfsActive = dst;
413 KernelAddBodyForce(kd, ld, cd);
414 #endif
415
416 #ifdef VTK_OUTPUT
417 if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) {
418 kd->PdfsActive = dst;
419 VtkWrite(ld, kd, cd, iter);
420 }
421 #endif
422
423 #ifdef STATISTICS
424 kd->PdfsActive = dst;
425 KernelStatistics(kd, ld, cd, iter);
426 #endif
427
428 // swap grids
429 tmp = src;
430 src = dst;
431 dst = tmp;
432 }
433
434 #pragma omp barrier
435
436 } // for (int iter = 0; ...
437
438 MemFree((void **)&tmpArray);
439 }
440
441#ifdef VTK_OUTPUT
442 if (cd->VtkOutput) {
443 kd->PdfsActive = src;
444 VtkWrite(ld, kd, cd, maxIterations);
445 }
446#endif
447
448#ifdef STATISTICS
449 kd->PdfsActive = src;
450 KernelStatistics(kd, ld, cd, maxIterations);
451#endif
452
453 return;
454}
455
This page took 0.091478 seconds and 5 git commands to generate.