1 // --------------------------------------------------------------------------
4 // Markus Wittmann, 2016-2017
5 // RRZE, University of Erlangen-Nuremberg, Germany
6 // markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
9 // LSS, University of Erlangen-Nuremberg, Germany
11 // This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
13 // LbmBenchKernels is free software: you can redistribute it and/or modify
14 // it under the terms of the GNU General Public License as published by
15 // the Free Software Foundation, either version 3 of the License, or
16 // (at your option) any later version.
18 // LbmBenchKernels is distributed in the hope that it will be useful,
19 // but WITHOUT ANY WARRANTY; without even the implied warranty of
20 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 // GNU General Public License for more details.
23 // You should have received a copy of the GNU General Public License
24 // along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>.
26 // --------------------------------------------------------------------------
27 #include "BenchKernelD3Q19ListPullSplitNtCommon.h"
// Linear offset of element tmp_index of direction tmp_dir in a scratch array
// that stores nTmpArray consecutive elements per direction. The expansion reads
// a variable named nTmpArray, which must be in scope wherever the macro is used.
// The whole replacement list is parenthesized so that the macro stays correct
// when it is embedded in a larger expression (e.g. scaled or used with %).
#define TMP_INDEX(tmp_index, tmp_dir) (nTmpArray * (tmp_dir) + (tmp_index))
// Time-stepping driver for the kernel variant that is instrumented below under
// the LIKWID marker name "list-pull-split-nt-1s".
//
// ld         - lattice description; in the visible code it is only forwarded to
//              VtkWrite, KernelStatistics and KernelAddBodyForce.
// kernelData - kernel state; used directly as KernelData and through the
//              KDL()/KDLR() accessors as KernelDataList / KernelDataListRia.
// cd         - case parameters; Omega, MaxIterations, VtkOutput and VtkModulus
//              are read here.
//
// NOTE(review): several lines of this definition (braces, preprocessor
// conditionals, some declarations, part of the OpenMP clause list) are not
// visible in this view. Comments below describe only what the visible lines do.
51 void FNAME(KernelPullSplitNt1S)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd)
// Precondition checks: non-NULL kernel data and a relaxation rate strictly
// between 0 and 2.
55 Assert(kernelData != NULL);
58 Assert(cd->Omega > 0.0);
59 Assert(cd->Omega < 2.0);
// Three typed views of the same kernelData pointer.
61 KernelData * kd = (KernelData *)kernelData;
62 KernelDataList * kdl = KDL(kernelData);
63 KernelDataListRia * kdlr = KDLR(kernelData);
// Two relaxation rates: omegaEven is omega itself, omegaOdd is derived from
// omega and magicParam (presumably the even/odd rates of a two-relaxation-time
// collision - confirm). Because Omega < 2.0 is asserted above, the term
// (1.0 / omega - 0.5) is strictly positive, so the inner division is safe.
65 PdfT omega = cd->Omega;
66 const PdfT omegaEven = omega;
68 PdfT magicParam = 1.0 / 12.0;
69 const PdfT omegaOdd = 1.0 / (0.5 + magicParam / (1.0 / omega - 0.5));
// Weight constants (presumably the D3Q19 lattice weights - confirm) and
// pre-multiplied variants of w_1 and w_2. w_0 is not used in the lines visible
// here; it may be consumed by the included kernel headers.
72 const PdfT w_0 = 1.0 / 3.0;
73 const PdfT w_1 = 1.0 / 18.0;
74 const PdfT w_2 = 1.0 / 36.0;
76 const PdfT w_1_x3 = w_1 * 3.0; const PdfT w_1_nine_half = w_1 * 9.0 / 2.0;
77 const PdfT w_2_x3 = w_2 * 3.0; const PdfT w_2_nine_half = w_2 * 9.0 / 2.0;
// Vector-typed counterparts of the scalar constants, built with VSET (defined
// elsewhere; presumably it replicates the value into every lane - confirm).
79 const VPDFT vw_1_x3 = VSET(w_1_x3);
80 const VPDFT vw_2_x3 = VSET(w_2_x3);
82 const VPDFT vw_1_nine_half = VSET(w_1_nine_half);
83 const VPDFT vw_2_nine_half = VSET(w_2_nine_half);
85 const VPDFT vomegaEven = VSET(omegaEven);
86 const VPDFT vomegaOdd = VSET(omegaOdd);
88 const VPDFT voneHalf = VSET(0.5);
90 // uint32_t nConsecNodes = kdlr->nConsecNodes;
91 // uint32_t * consecNodes = kdlr->ConsecNodes;
92 // uint32_t consecIndex = 0;
93 // uint32_t consecValue = 0;
// The two PDF arrays of the kernel data: index 0 is used as source, index 1 as
// destination.
95 PdfT * src = kd->Pdfs[0];
96 PdfT * dst = kd->Pdfs[1];
99 int maxIterations = cd->MaxIterations;
101 int nFluid = kdl->nFluid;
102 int nCells = kdl->nCells;
// Number of elements per direction in the per-thread scratch array; it must be
// a whole number of vectors.
104 int nTmpArray = kdlr->nTmpArray;
106 Assert(nTmpArray % VSIZE == 0);
108 uint32_t * adjList = kdl->AdjList;
// Output / statistics of the initial state, taken from the source array.
// NOTE(review): these calls are presumably guarded by conditionals that are not
// visible in this view.
112 kd->PdfsActive = src;
113 VtkWrite(ld, kd, cd, -1);
118 kd->PdfsActive = src;
119 KernelStatistics(kd, ld, cd, 0);
123 X_LIKWID_START("list-pull-split-nt-1s");
// Parallel region with default(none): every variable used inside must appear in
// a data-sharing clause. The clause list continues on lines not visible here.
// NOTE(review): `tmp` is listed as shared but no declaration of it is visible in
// this view - confirm it exists.
125 #pragma omp parallel default(none) \
126 shared(nFluid, nCells, kd, kdl, adjList, src, dst, \
127 cd, maxIterations, ld, tmp, nTmpArray, \
131 uint32_t adjListIndex;
134 VPDFT vux, vuy, vuz, vui;
// X-macro that declares one scalar PdfT variable named pdf_<name> per entry of
// the list it is applied to (the application itself is not visible here).
136 #define X(name, idx, idxinv, x, y, z) PdfT JOIN(pdf_,name);
139 VPDFT vpdf_a, vpdf_b;
141 PdfT evenPart, oddPart, dir_indep_trm, dens;
142 PdfT w_1_indep, w_2_indep;
143 VPDFT vevenPart, voddPart;
144 VPDFT vw_1_indep, vw_2_indep;
// Scratch buffer of nTmpArray * N_TMP PdfT elements, aligned to one vector
// (VSIZE * sizeof(PdfT) bytes). It is released with MemFree after the time loop.
149 MemAllocAligned((void **)&tmpArray, sizeof(PdfT) * nTmpArray * N_TMP, VSIZE * sizeof(PdfT));
// NOTE(review): omp_get_max_threads() is an upper bound on the team size; inside
// a parallel region omp_get_num_threads() reports the actual team size. If the
// runtime delivers fewer threads than the maximum, the partitioning below would
// leave part of [0, nFluid) unprocessed - confirm this is acceptable.
155 nThreads = omp_get_max_threads();
156 threadId = omp_get_thread_num();
// Static partitioning of the nFluid nodes: every thread gets nFluid / nThreads
// nodes and the start index is shifted to account for the remainder
// (nFluid % nThreads). The remaining lines of both branches are not visible in
// this view.
159 int nCellsThread = nFluid / nThreads;
160 int blIndexStart = threadId * nCellsThread;
162 if (threadId < nFluid % nThreads) {
163 blIndexStart += threadId;
167 blIndexStart += nFluid % nThreads;
170 int blIndexStop = blIndexStart + nCellsThread;
172 // We have three loops:
173 // 1. Peeling to ensure alignment for non-temporal stores in loop 2 is correct.
174 // 2. Vectorized handling of nodes.
175 // 3. Remaining nodes, less than vector size.
// nCellsUnaligned is the number of PdfT elements between this thread's first
// element and the next address that is a multiple of VSIZE elements (0 if it is
// already on such a boundary).
// NOTE(review): the address is taken from `src`, while the comment above talks
// about stores - confirm that src and the store target share the same alignment.
// NOTE(review): casting a pointer to unsigned long can truncate on platforms
// where long is narrower than a pointer; uintptr_t would be the portable type.
177 unsigned long addrStart = (unsigned long)&(src[P_INDEX_3(nCells, blIndexStart, 0)]);
178 int nCellsUnaligned = (VSIZE - (int)((addrStart / sizeof(PdfT)) % VSIZE)) % VSIZE;
// Nodes left after peeling, rounded down to a whole number of vectors.
// NOTE(review): if nCellsThread < nCellsUnaligned the difference is negative and
// the rounding turns it into 0 (C remainder truncates toward zero), so
// blIndexVec ends up greater than blIndexStop. No clamp is visible here -
// confirm the peel loop cannot run past this thread's range.
180 int nCellsVectorized = nCellsThread - nCellsUnaligned;
181 nCellsVectorized = nCellsVectorized - (nCellsVectorized % VSIZE);
// Resulting sub-ranges: [blIndexStart, blIndexVec) peel,
// [blIndexVec, blIndexRemaining) vectorized, [blIndexRemaining, blIndexStop) tail.
183 int blIndexVec = blIndexStart + nCellsUnaligned;
184 int blIndexRemaining = blIndexStart + nCellsUnaligned + nCellsVectorized;
186 // printf("%d [%d, %d, %d, %d[\n", threadId, blIndexStart, blIndexVec, blIndexRemaining, blIndexStop);
// Time loop. Each included header is expanded for the index range given by the
// INDEX_START / INDEX_STOP macros defined just before it; the header contents
// are not visible here.
188 for(int iter = 0; iter < maxIterations; ++iter) {
// Loop 1: scalar peel up to the first aligned node.
192 #define INDEX_START blIndexStart
193 #define INDEX_STOP blIndexVec
194 #include "BenchKernelD3Q19ListPullSplitNt1SScalar.h"
// Loop 2: intrinsics version over the aligned, vector-sized part.
196 #define INDEX_START blIndexVec
197 #define INDEX_STOP blIndexRemaining
198 #include "BenchKernelD3Q19ListPullSplitNt1SIntrinsics.h"
// Loop 3: scalar tail.
200 #define INDEX_START blIndexRemaining
201 #define INDEX_STOP blIndexStop
202 #include "BenchKernelD3Q19ListPullSplitNt1SScalar.h"
// Scalar version over the thread's whole range.
// NOTE(review): presumably an alternative to the three loops above, selected by
// a preprocessor conditional that is not visible in this view - confirm.
204 #define INDEX_START blIndexStart
205 #define INDEX_STOP blIndexStop
206 #include "BenchKernelD3Q19ListPullSplitNt1SScalar.h"
// Per-iteration post-processing on the destination array: body force, VTK
// output every VtkModulus-th iteration when VtkOutput is set, and statistics.
215 kd->PdfsActive = dst;
216 KernelAddBodyForce(kd, ld, cd);
220 if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) {
221 kd->PdfsActive = dst;
222 VtkWrite(ld, kd, cd, iter);
227 kd->PdfsActive = dst;
228 KernelStatistics(kd, ld, cd, iter);
239 } // for (int iter = 0; ...
// Release the scratch buffer allocated before the time loop.
241 MemFree((void **)&tmpArray);
245 X_LIKWID_STOP("list-pull-split-nt-1s");
// Output / statistics of the final state, read through `src`.
// NOTE(review): reading the result from `src` presumably relies on a src/dst
// swap per iteration that is not visible in this view - confirm.
249 kd->PdfsActive = src;
250 VtkWrite(ld, kd, cd, maxIterations);
255 kd->PdfsActive = src;
256 KernelStatistics(kd, ld, cd, maxIterations);
// Time-stepping driver for the kernel variant that is instrumented below under
// the LIKWID marker name "list-pull-split-nt-2s". It includes the *Nt2S* kernel
// headers and allocates its scratch buffer with MemAlloc.
//
// ld         - lattice description; in the visible code it is only forwarded to
//              VtkWrite, KernelStatistics and KernelAddBodyForce.
// kernelData - kernel state; used directly as KernelData and through the
//              KDL()/KDLR() accessors as KernelDataList / KernelDataListRia.
// cd         - case parameters; Omega, MaxIterations, VtkOutput and VtkModulus
//              are read here.
//
// NOTE(review): several lines of this definition (braces, preprocessor
// conditionals, some declarations, part of the OpenMP clause list, the closing
// lines) are not visible in this view. Comments below describe only what the
// visible lines do.
262 void FNAME(KernelPullSplitNt2S)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd)
// Precondition checks: non-NULL kernel data and a relaxation rate strictly
// between 0 and 2.
266 Assert(kernelData != NULL);
269 Assert(cd->Omega > 0.0);
270 Assert(cd->Omega < 2.0);
// Three typed views of the same kernelData pointer.
272 KernelData * kd = (KernelData *)kernelData;
273 KernelDataList * kdl = KDL(kernelData);
274 KernelDataListRia * kdlr = KDLR(kernelData);
// Two relaxation rates: omegaEven is omega itself, omegaOdd is derived from
// omega and magicParam (presumably the even/odd rates of a two-relaxation-time
// collision - confirm). Because Omega < 2.0 is asserted above, the term
// (1.0 / omega - 0.5) is strictly positive, so the inner division is safe.
276 PdfT omega = cd->Omega;
277 const PdfT omegaEven = omega;
279 PdfT magicParam = 1.0 / 12.0;
280 const PdfT omegaOdd = 1.0 / (0.5 + magicParam / (1.0 / omega - 0.5));
// Weight constants (presumably the D3Q19 lattice weights - confirm) and
// pre-multiplied variants of w_1 and w_2. w_0 is not used in the lines visible
// here; it may be consumed by the included kernel headers.
283 const PdfT w_0 = 1.0 / 3.0;
284 const PdfT w_1 = 1.0 / 18.0;
285 const PdfT w_2 = 1.0 / 36.0;
287 const PdfT w_1_x3 = w_1 * 3.0; const PdfT w_1_nine_half = w_1 * 9.0 / 2.0;
288 const PdfT w_2_x3 = w_2 * 3.0; const PdfT w_2_nine_half = w_2 * 9.0 / 2.0;
// Vector-typed counterparts of the scalar constants, built with VSET (defined
// elsewhere; presumably it replicates the value into every lane - confirm).
290 const VPDFT vw_1_x3 = VSET(w_1_x3);
291 const VPDFT vw_2_x3 = VSET(w_2_x3);
293 const VPDFT vw_1_nine_half = VSET(w_1_nine_half);
294 const VPDFT vw_2_nine_half = VSET(w_2_nine_half);
296 const VPDFT vomegaEven = VSET(omegaEven);
297 const VPDFT vomegaOdd = VSET(omegaOdd);
299 const VPDFT voneHalf = VSET(0.5);
301 // uint32_t nConsecNodes = kdlr->nConsecNodes;
302 // uint32_t * consecNodes = kdlr->ConsecNodes;
303 // uint32_t consecIndex = 0;
304 // uint32_t consecValue = 0;
// The two PDF arrays of the kernel data: index 0 is used as source, index 1 as
// destination.
306 PdfT * src = kd->Pdfs[0];
307 PdfT * dst = kd->Pdfs[1];
310 int maxIterations = cd->MaxIterations;
312 int nFluid = kdl->nFluid;
313 int nCells = kdl->nCells;
// Number of elements per direction in the per-thread scratch array; it must be
// a whole number of vectors.
315 int nTmpArray = kdlr->nTmpArray;
317 Assert(nTmpArray % VSIZE == 0);
319 uint32_t * adjList = kdl->AdjList;
// Output / statistics of the initial state, taken from the source array.
// NOTE(review): these calls are presumably guarded by conditionals that are not
// visible in this view.
323 kd->PdfsActive = src;
324 VtkWrite(ld, kd, cd, -1);
329 kd->PdfsActive = src;
330 KernelStatistics(kd, ld, cd, 0);
334 X_LIKWID_START("list-pull-split-nt-2s");
// Parallel region with default(none): every variable used inside must appear in
// a data-sharing clause. The clause list continues on lines not visible here.
// NOTE(review): `tmp` is listed as shared but no declaration of it is visible in
// this view - confirm it exists.
338 #pragma omp parallel default(none) \
339 shared(nFluid, nCells, kd, kdl, adjList, src, dst, \
340 cd, maxIterations, ld, tmp, nTmpArray, \
344 uint32_t adjListIndex;
347 VPDFT vux, vuy, vuz, vui;
// X-macro that declares one scalar PdfT variable named pdf_<name> per entry of
// the list it is applied to (the application itself is not visible here).
349 #define X(name, idx, idxinv, x, y, z) PdfT JOIN(pdf_,name);
352 VPDFT vpdf_a, vpdf_b;
354 PdfT evenPart, oddPart, dir_indep_trm, dens;
355 PdfT w_1_indep, w_2_indep;
356 VPDFT vevenPart, voddPart;
357 VPDFT vw_1_indep, vw_2_indep;
// Scratch buffer of nTmpArray * N_TMP PdfT elements, released with MemFree after
// the time loop.
// NOTE(review): this uses MemAlloc without an alignment argument, whereas
// KernelPullSplitNt1S allocates the same buffer with MemAllocAligned and a
// vector-sized alignment. If the Nt2S intrinsics kernel accesses tmpArray with
// aligned vector loads/stores, the alignment is required here too - confirm.
362 MemAlloc((void **)&tmpArray, sizeof(PdfT) * nTmpArray * N_TMP);
// NOTE(review): omp_get_max_threads() is an upper bound on the team size; inside
// a parallel region omp_get_num_threads() reports the actual team size. If the
// runtime delivers fewer threads than the maximum, the partitioning below would
// leave part of [0, nFluid) unprocessed - confirm this is acceptable.
368 nThreads = omp_get_max_threads();
369 threadId = omp_get_thread_num();
// Static partitioning of the nFluid nodes: every thread gets nFluid / nThreads
// nodes and the start index is shifted to account for the remainder
// (nFluid % nThreads). The remaining lines of both branches are not visible in
// this view.
372 int nCellsThread = nFluid / nThreads;
373 int blIndexStart = threadId * nCellsThread;
375 if (threadId < nFluid % nThreads) {
376 blIndexStart += threadId;
380 blIndexStart += nFluid % nThreads;
383 int blIndexStop = blIndexStart + nCellsThread;
385 // We have three loops:
386 // 1. Peeling to ensure alignment for non-temporal stores in loop 2 is correct.
387 // 2. Vectorized handling of nodes.
388 // 3. Remaining nodes, less than vector size.
// nCellsUnaligned is the number of PdfT elements between this thread's first
// element and the next address that is a multiple of VSIZE elements (0 if it is
// already on such a boundary).
// NOTE(review): the address is taken from `src`, while the comment above talks
// about stores - confirm that src and the store target share the same alignment.
// NOTE(review): casting a pointer to unsigned long can truncate on platforms
// where long is narrower than a pointer; uintptr_t would be the portable type.
390 unsigned long addrStart = (unsigned long)&(src[P_INDEX_3(nCells, blIndexStart, 0)]);
391 int nCellsUnaligned = (VSIZE - (int)((addrStart / sizeof(PdfT)) % VSIZE)) % VSIZE;
// Nodes left after peeling, rounded down to a whole number of vectors.
// NOTE(review): if nCellsThread < nCellsUnaligned the difference is negative and
// the rounding turns it into 0 (C remainder truncates toward zero), so
// blIndexVec ends up greater than blIndexStop. No clamp is visible here -
// confirm the peel loop cannot run past this thread's range.
393 int nCellsVectorized = nCellsThread - nCellsUnaligned;
394 nCellsVectorized = nCellsVectorized - (nCellsVectorized % VSIZE);
// Resulting sub-ranges: [blIndexStart, blIndexVec) peel,
// [blIndexVec, blIndexRemaining) vectorized, [blIndexRemaining, blIndexStop) tail.
396 int blIndexVec = blIndexStart + nCellsUnaligned;
397 int blIndexRemaining = blIndexStart + nCellsUnaligned + nCellsVectorized;
399 // printf("%d [%d, %d, %d, %d[\n", threadId, blIndexStart, blIndexVec, blIndexRemaining, blIndexStop);
// Time loop. Each included header is expanded for the index range given by the
// INDEX_START / INDEX_STOP macros defined just before it; the header contents
// are not visible here.
401 for(int iter = 0; iter < maxIterations; ++iter) {
// Loop 1: scalar peel up to the first aligned node.
404 #define INDEX_START blIndexStart
405 #define INDEX_STOP blIndexVec
406 #include "BenchKernelD3Q19ListPullSplitNt2SScalar.h"
// Loop 2: intrinsics version over the aligned, vector-sized part.
408 #define INDEX_START blIndexVec
409 #define INDEX_STOP blIndexRemaining
410 #include "BenchKernelD3Q19ListPullSplitNt2SIntrinsics.h"
// Loop 3: scalar tail.
412 #define INDEX_START blIndexRemaining
413 #define INDEX_STOP blIndexStop
414 #include "BenchKernelD3Q19ListPullSplitNt2SScalar.h"
// Scalar version over the thread's whole range.
// NOTE(review): presumably an alternative to the three loops above, selected by
// a preprocessor conditional that is not visible in this view - confirm.
416 #define INDEX_START blIndexStart
417 #define INDEX_STOP blIndexStop
418 #include "BenchKernelD3Q19ListPullSplitNt2SScalar.h"
// Per-iteration post-processing on the destination array: body force, VTK
// output every VtkModulus-th iteration when VtkOutput is set, and statistics.
426 kd->PdfsActive = dst;
427 KernelAddBodyForce(kd, ld, cd);
431 if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) {
432 kd->PdfsActive = dst;
433 VtkWrite(ld, kd, cd, iter);
438 kd->PdfsActive = dst;
439 KernelStatistics(kd, ld, cd, iter);
450 } // for (int iter = 0; ...
// Release the scratch buffer allocated before the time loop.
452 MemFree((void **)&tmpArray);
455 X_LIKWID_STOP("list-pull-split-nt-2s");
// Output / statistics of the final state, read through `src`.
// NOTE(review): reading the result from `src` presumably relies on a src/dst
// swap per iteration that is not visible in this view - confirm.
459 kd->PdfsActive = src;
460 VtkWrite(ld, kd, cd, maxIterations);
465 kd->PdfsActive = src;
466 KernelStatistics(kd, ld, cd, maxIterations);