Commit | Line | Data |
---|---|---|
10988083 MW |
1 | // -------------------------------------------------------------------------- |
2 | // | |
3 | // Copyright | |
4 | // Markus Wittmann, 2016-2017 | |
5 | // RRZE, University of Erlangen-Nuremberg, Germany | |
6 | // markus.wittmann -at- fau.de or hpc -at- rrze.fau.de | |
7 | // | |
8 | // Viktor Haag, 2016 | |
9 | // LSS, University of Erlangen-Nuremberg, Germany | |
10 | // | |
11 | // This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). | |
12 | // | |
13 | // LbmBenchKernels is free software: you can redistribute it and/or modify | |
14 | // it under the terms of the GNU General Public License as published by | |
15 | // the Free Software Foundation, either version 3 of the License, or | |
16 | // (at your option) any later version. | |
17 | // | |
18 | // LbmBenchKernels is distributed in the hope that it will be useful, | |
19 | // but WITHOUT ANY WARRANTY; without even the implied warranty of | |
20 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
21 | // GNU General Public License for more details. | |
22 | // | |
23 | // You should have received a copy of the GNU General Public License | |
24 | // along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>. | |
25 | // | |
26 | // -------------------------------------------------------------------------- | |
27 | #include "BenchKernelD3Q19ListPullSplitNtCommon.h" | |
28 | ||
29 | #include "Memory.h" | |
30 | #include "Vtk.h" | |
31 | #include "Vector.h" | |
e3f82424 | 32 | #include "LikwidIf.h" |
10988083 MW |
33 | |
34 | #include <inttypes.h> | |
35 | #include <math.h> | |
36 | ||
37 | #ifdef _OPENMP | |
38 | #include <omp.h> | |
39 | #endif | |
40 | ||
41 | #define TMP_UX 18 | |
42 | #define TMP_UY 19 | |
43 | #define TMP_UZ 20 | |
44 | #define TMP_W1 21 | |
45 | #define TMP_W2 22 | |
46 | ||
47 | #define N_TMP 23 | |
48 | ||
49 | #define TMP_INDEX(tmp_index, tmp_dir) nTmpArray * (tmp_dir) + (tmp_index) | |
50 | ||
51 | void FNAME(KernelPullSplitNt1S)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd) | |
52 | { | |
53 | ||
54 | Assert(ld != NULL); | |
55 | Assert(kernelData != NULL); | |
56 | Assert(cd != NULL); | |
57 | ||
0fde6e45 MW |
58 | Assert(cd->Omega > F(0.0)); |
59 | Assert(cd->Omega < F(2.0)); | |
10988083 MW |
60 | |
61 | KernelData * kd = (KernelData *)kernelData; | |
62 | KernelDataList * kdl = KDL(kernelData); | |
63 | KernelDataListRia * kdlr = KDLR(kernelData); | |
64 | ||
65 | PdfT omega = cd->Omega; | |
66 | const PdfT omegaEven = omega; | |
67 | ||
0fde6e45 MW |
68 | PdfT magicParam = F(1.0) / F(12.0); |
69 | const PdfT omegaOdd = F(1.0) / (F(0.5) + magicParam / (F(1.0) / omega - F(0.5))); | |
10988083 MW |
70 | |
71 | ||
0fde6e45 MW |
72 | const PdfT w_0 = F(1.0) / F( 3.0); |
73 | const PdfT w_1 = F(1.0) / F(18.0); | |
74 | const PdfT w_2 = F(1.0) / F(36.0); | |
10988083 | 75 | |
0fde6e45 MW |
76 | const PdfT w_1_x3 = w_1 * F(3.0); const PdfT w_1_nine_half = w_1 * F(9.0) / F(2.0); |
77 | const PdfT w_2_x3 = w_2 * F(3.0); const PdfT w_2_nine_half = w_2 * F(9.0) / F(2.0); | |
10988083 MW |
78 | |
79 | const VPDFT vw_1_x3 = VSET(w_1_x3); | |
80 | const VPDFT vw_2_x3 = VSET(w_2_x3); | |
81 | ||
82 | const VPDFT vw_1_nine_half = VSET(w_1_nine_half); | |
83 | const VPDFT vw_2_nine_half = VSET(w_2_nine_half); | |
84 | ||
85 | const VPDFT vomegaEven = VSET(omegaEven); | |
86 | const VPDFT vomegaOdd = VSET(omegaOdd); | |
87 | ||
0fde6e45 | 88 | const VPDFT voneHalf = VSET(F(0.5)); |
10988083 MW |
89 | |
90 | // uint32_t nConsecNodes = kdlr->nConsecNodes; | |
91 | // uint32_t * consecNodes = kdlr->ConsecNodes; | |
92 | // uint32_t consecIndex = 0; | |
93 | // uint32_t consecValue = 0; | |
94 | ||
95 | PdfT * src = kd->Pdfs[0]; | |
96 | PdfT * dst = kd->Pdfs[1]; | |
97 | PdfT * tmp; | |
98 | ||
99 | int maxIterations = cd->MaxIterations; | |
100 | ||
101 | int nFluid = kdl->nFluid; | |
102 | int nCells = kdl->nCells; | |
103 | ||
104 | int nTmpArray = kdlr->nTmpArray; | |
105 | ||
106 | Assert(nTmpArray % VSIZE == 0); | |
107 | ||
108 | uint32_t * adjList = kdl->AdjList; | |
109 | ||
110 | #ifdef VTK_OUTPUT | |
111 | if (cd->VtkOutput) { | |
112 | kd->PdfsActive = src; | |
113 | VtkWrite(ld, kd, cd, -1); | |
114 | } | |
115 | #endif | |
116 | ||
117 | #ifdef STATISTICS | |
118 | kd->PdfsActive = src; | |
119 | KernelStatistics(kd, ld, cd, 0); | |
120 | #endif | |
121 | ||
8cafd9ea MW |
122 | X_KERNEL_START(kernelData); |
123 | ||
124 | X_LIKWID_START("list-pull-split-nt-1s"); | |
e3f82424 | 125 | |
10988083 MW |
126 | #ifdef _OPENMP |
127 | #pragma omp parallel default(none) \ | |
128 | shared(nFluid, nCells, kd, kdl, adjList, src, dst, \ | |
129 | cd, maxIterations, ld, tmp, nTmpArray, \ | |
130 | stderr ) | |
131 | #endif | |
132 | { | |
133 | uint32_t adjListIndex; | |
134 | ||
135 | PdfT ux, uy, uz, ui; | |
136 | VPDFT vux, vuy, vuz, vui; | |
137 | ||
138 | #define X(name, idx, idxinv, x, y, z) PdfT JOIN(pdf_,name); | |
139 | D3Q19_LIST | |
140 | #undef X | |
141 | VPDFT vpdf_a, vpdf_b; | |
142 | ||
143 | PdfT evenPart, oddPart, dir_indep_trm, dens; | |
144 | PdfT w_1_indep, w_2_indep; | |
145 | VPDFT vevenPart, voddPart; | |
146 | VPDFT vw_1_indep, vw_2_indep; | |
147 | ||
148 | int indexMax; | |
149 | ||
150 | PdfT * tmpArray; | |
151 | MemAllocAligned((void **)&tmpArray, sizeof(PdfT) * nTmpArray * N_TMP, VSIZE * sizeof(PdfT)); | |
152 | ||
153 | int nThreads = 1; | |
154 | int threadId = 0; | |
155 | ||
156 | #ifdef _OPENMP | |
157 | nThreads = omp_get_max_threads(); | |
158 | threadId = omp_get_thread_num(); | |
159 | #endif | |
160 | ||
161 | int nCellsThread = nFluid / nThreads; | |
162 | int blIndexStart = threadId * nCellsThread; | |
163 | ||
164 | if (threadId < nFluid % nThreads) { | |
165 | blIndexStart += threadId; | |
166 | nCellsThread += 1; | |
167 | } | |
168 | else { | |
169 | blIndexStart += nFluid % nThreads; | |
170 | } | |
171 | ||
172 | int blIndexStop = blIndexStart + nCellsThread; | |
173 | ||
174 | // We have three loops: | |
175 | // 1. Peeling to ensure alignment for non-temporal stores in loop 2 is correct. | |
176 | // 2. Vectorized handling of nodes. | |
177 | // 3. Remaining nodes, less than vector size. | |
178 | ||
179 | unsigned long addrStart = (unsigned long)&(src[P_INDEX_3(nCells, blIndexStart, 0)]); | |
180 | int nCellsUnaligned = (VSIZE - (int)((addrStart / sizeof(PdfT)) % VSIZE)) % VSIZE; | |
181 | ||
182 | int nCellsVectorized = nCellsThread - nCellsUnaligned; | |
183 | nCellsVectorized = nCellsVectorized - (nCellsVectorized % VSIZE); | |
184 | ||
185 | int blIndexVec = blIndexStart + nCellsUnaligned; | |
186 | int blIndexRemaining = blIndexStart + nCellsUnaligned + nCellsVectorized; | |
187 | ||
188 | // printf("%d [%d, %d, %d, %d[\n", threadId, blIndexStart, blIndexVec, blIndexRemaining, blIndexStop); | |
189 | ||
190 | for(int iter = 0; iter < maxIterations; ++iter) { | |
191 | ||
e3f82424 | 192 | |
10988083 MW |
193 | #if 1 |
194 | #define INDEX_START blIndexStart | |
195 | #define INDEX_STOP blIndexVec | |
196 | #include "BenchKernelD3Q19ListPullSplitNt1SScalar.h" | |
197 | ||
198 | #define INDEX_START blIndexVec | |
199 | #define INDEX_STOP blIndexRemaining | |
200 | #include "BenchKernelD3Q19ListPullSplitNt1SIntrinsics.h" | |
201 | ||
202 | #define INDEX_START blIndexRemaining | |
203 | #define INDEX_STOP blIndexStop | |
204 | #include "BenchKernelD3Q19ListPullSplitNt1SScalar.h" | |
205 | #else | |
206 | #define INDEX_START blIndexStart | |
207 | #define INDEX_STOP blIndexStop | |
208 | #include "BenchKernelD3Q19ListPullSplitNt1SScalar.h" | |
209 | #endif | |
e3f82424 MW |
210 | |
211 | ||
10988083 MW |
212 | #pragma omp barrier |
213 | ||
214 | #pragma omp single | |
215 | { | |
216 | #ifdef VERIFICATION | |
217 | kd->PdfsActive = dst; | |
218 | KernelAddBodyForce(kd, ld, cd); | |
219 | #endif | |
220 | ||
221 | #ifdef VTK_OUTPUT | |
222 | if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) { | |
223 | kd->PdfsActive = dst; | |
224 | VtkWrite(ld, kd, cd, iter); | |
225 | } | |
226 | #endif | |
227 | ||
228 | #ifdef STATISTICS | |
229 | kd->PdfsActive = dst; | |
230 | KernelStatistics(kd, ld, cd, iter); | |
231 | #endif | |
232 | ||
233 | // swap grids | |
234 | tmp = src; | |
235 | src = dst; | |
236 | dst = tmp; | |
237 | } | |
238 | ||
239 | #pragma omp barrier | |
240 | ||
241 | } // for (int iter = 0; ... | |
242 | ||
243 | MemFree((void **)&tmpArray); | |
244 | } | |
245 | ||
e3f82424 MW |
246 | |
247 | X_LIKWID_STOP("list-pull-split-nt-1s"); | |
248 | ||
8cafd9ea MW |
249 | X_KERNEL_END(kernelData); |
250 | ||
10988083 MW |
251 | #ifdef VTK_OUTPUT |
252 | if (cd->VtkOutput) { | |
253 | kd->PdfsActive = src; | |
254 | VtkWrite(ld, kd, cd, maxIterations); | |
255 | } | |
256 | #endif | |
257 | ||
258 | #ifdef STATISTICS | |
259 | kd->PdfsActive = src; | |
260 | KernelStatistics(kd, ld, cd, maxIterations); | |
261 | #endif | |
262 | ||
263 | return; | |
264 | } | |
265 | ||
266 | void FNAME(KernelPullSplitNt2S)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd) | |
267 | { | |
268 | ||
269 | Assert(ld != NULL); | |
270 | Assert(kernelData != NULL); | |
271 | Assert(cd != NULL); | |
272 | ||
0fde6e45 MW |
273 | Assert(cd->Omega > F(0.0)); |
274 | Assert(cd->Omega < F(2.0)); | |
10988083 MW |
275 | |
276 | KernelData * kd = (KernelData *)kernelData; | |
277 | KernelDataList * kdl = KDL(kernelData); | |
278 | KernelDataListRia * kdlr = KDLR(kernelData); | |
279 | ||
280 | PdfT omega = cd->Omega; | |
281 | const PdfT omegaEven = omega; | |
282 | ||
0fde6e45 MW |
283 | PdfT magicParam = F(1.0) / F(12.0); |
284 | const PdfT omegaOdd = F(1.0) / (F(0.5) + magicParam / (F(1.0) / omega - F(0.5))); | |
10988083 | 285 | |
0fde6e45 MW |
286 | const PdfT w_0 = F(1.0) / F( 3.0); |
287 | const PdfT w_1 = F(1.0) / F(18.0); | |
288 | const PdfT w_2 = F(1.0) / F(36.0); | |
10988083 | 289 | |
0fde6e45 MW |
290 | const PdfT w_1_x3 = w_1 * F(3.0); const PdfT w_1_nine_half = w_1 * F(9.0) / F(2.0); |
291 | const PdfT w_2_x3 = w_2 * F(3.0); const PdfT w_2_nine_half = w_2 * F(9.0) / F(2.0); | |
10988083 MW |
292 | |
293 | const VPDFT vw_1_x3 = VSET(w_1_x3); | |
294 | const VPDFT vw_2_x3 = VSET(w_2_x3); | |
295 | ||
296 | const VPDFT vw_1_nine_half = VSET(w_1_nine_half); | |
297 | const VPDFT vw_2_nine_half = VSET(w_2_nine_half); | |
298 | ||
299 | const VPDFT vomegaEven = VSET(omegaEven); | |
300 | const VPDFT vomegaOdd = VSET(omegaOdd); | |
301 | ||
0fde6e45 | 302 | const VPDFT voneHalf = VSET(F(0.5)); |
10988083 MW |
303 | |
304 | // uint32_t nConsecNodes = kdlr->nConsecNodes; | |
305 | // uint32_t * consecNodes = kdlr->ConsecNodes; | |
306 | // uint32_t consecIndex = 0; | |
307 | // uint32_t consecValue = 0; | |
308 | ||
309 | PdfT * src = kd->Pdfs[0]; | |
310 | PdfT * dst = kd->Pdfs[1]; | |
311 | PdfT * tmp; | |
312 | ||
313 | int maxIterations = cd->MaxIterations; | |
314 | ||
315 | int nFluid = kdl->nFluid; | |
316 | int nCells = kdl->nCells; | |
317 | ||
318 | int nTmpArray = kdlr->nTmpArray; | |
319 | ||
320 | Assert(nTmpArray % VSIZE == 0); | |
321 | ||
322 | uint32_t * adjList = kdl->AdjList; | |
323 | ||
324 | #ifdef VTK_OUTPUT | |
325 | if (cd->VtkOutput) { | |
326 | kd->PdfsActive = src; | |
327 | VtkWrite(ld, kd, cd, -1); | |
328 | } | |
329 | #endif | |
330 | ||
331 | #ifdef STATISTICS | |
332 | kd->PdfsActive = src; | |
333 | KernelStatistics(kd, ld, cd, 0); | |
334 | #endif | |
335 | ||
e3f82424 | 336 | |
8cafd9ea MW |
337 | X_KERNEL_START(kernelData); |
338 | ||
339 | X_LIKWID_START("list-pull-split-nt-2s"); | |
e3f82424 MW |
340 | |
341 | ||
10988083 MW |
342 | #ifdef _OPENMP |
343 | #pragma omp parallel default(none) \ | |
344 | shared(nFluid, nCells, kd, kdl, adjList, src, dst, \ | |
345 | cd, maxIterations, ld, tmp, nTmpArray, \ | |
346 | stderr ) | |
347 | #endif | |
348 | { | |
349 | uint32_t adjListIndex; | |
350 | ||
351 | PdfT ux, uy, uz, ui; | |
352 | VPDFT vux, vuy, vuz, vui; | |
353 | ||
354 | #define X(name, idx, idxinv, x, y, z) PdfT JOIN(pdf_,name); | |
355 | D3Q19_LIST | |
356 | #undef X | |
357 | VPDFT vpdf_a, vpdf_b; | |
358 | ||
359 | PdfT evenPart, oddPart, dir_indep_trm, dens; | |
360 | PdfT w_1_indep, w_2_indep; | |
361 | VPDFT vevenPart, voddPart; | |
362 | VPDFT vw_1_indep, vw_2_indep; | |
363 | ||
364 | int indexMax; | |
365 | ||
366 | PdfT * tmpArray; | |
367 | MemAlloc((void **)&tmpArray, sizeof(PdfT) * nTmpArray * N_TMP); | |
368 | ||
369 | int nThreads = 1; | |
370 | int threadId = 0; | |
371 | ||
372 | #ifdef _OPENMP | |
373 | nThreads = omp_get_max_threads(); | |
374 | threadId = omp_get_thread_num(); | |
375 | #endif | |
376 | ||
377 | int nCellsThread = nFluid / nThreads; | |
378 | int blIndexStart = threadId * nCellsThread; | |
379 | ||
380 | if (threadId < nFluid % nThreads) { | |
381 | blIndexStart += threadId; | |
382 | nCellsThread += 1; | |
383 | } | |
384 | else { | |
385 | blIndexStart += nFluid % nThreads; | |
386 | } | |
387 | ||
388 | int blIndexStop = blIndexStart + nCellsThread; | |
389 | ||
390 | // We have three loops: | |
391 | // 1. Peeling to ensure alignment for non-temporal stores in loop 2 is correct. | |
392 | // 2. Vectorized handling of nodes. | |
393 | // 3. Remaining nodes, less than vector size. | |
394 | ||
395 | unsigned long addrStart = (unsigned long)&(src[P_INDEX_3(nCells, blIndexStart, 0)]); | |
396 | int nCellsUnaligned = (VSIZE - (int)((addrStart / sizeof(PdfT)) % VSIZE)) % VSIZE; | |
397 | ||
398 | int nCellsVectorized = nCellsThread - nCellsUnaligned; | |
399 | nCellsVectorized = nCellsVectorized - (nCellsVectorized % VSIZE); | |
400 | ||
401 | int blIndexVec = blIndexStart + nCellsUnaligned; | |
402 | int blIndexRemaining = blIndexStart + nCellsUnaligned + nCellsVectorized; | |
403 | ||
404 | // printf("%d [%d, %d, %d, %d[\n", threadId, blIndexStart, blIndexVec, blIndexRemaining, blIndexStop); | |
405 | ||
406 | for(int iter = 0; iter < maxIterations; ++iter) { | |
407 | ||
408 | #if 1 | |
409 | #define INDEX_START blIndexStart | |
410 | #define INDEX_STOP blIndexVec | |
411 | #include "BenchKernelD3Q19ListPullSplitNt2SScalar.h" | |
412 | ||
413 | #define INDEX_START blIndexVec | |
414 | #define INDEX_STOP blIndexRemaining | |
415 | #include "BenchKernelD3Q19ListPullSplitNt2SIntrinsics.h" | |
416 | ||
417 | #define INDEX_START blIndexRemaining | |
418 | #define INDEX_STOP blIndexStop | |
419 | #include "BenchKernelD3Q19ListPullSplitNt2SScalar.h" | |
420 | #else | |
421 | #define INDEX_START blIndexStart | |
422 | #define INDEX_STOP blIndexStop | |
423 | #include "BenchKernelD3Q19ListPullSplitNt2SScalar.h" | |
424 | #endif | |
425 | #pragma omp barrier | |
426 | ||
e3f82424 | 427 | |
10988083 MW |
428 | #pragma omp single |
429 | { | |
430 | #ifdef VERIFICATION | |
431 | kd->PdfsActive = dst; | |
432 | KernelAddBodyForce(kd, ld, cd); | |
433 | #endif | |
434 | ||
435 | #ifdef VTK_OUTPUT | |
436 | if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) { | |
437 | kd->PdfsActive = dst; | |
438 | VtkWrite(ld, kd, cd, iter); | |
439 | } | |
440 | #endif | |
441 | ||
442 | #ifdef STATISTICS | |
443 | kd->PdfsActive = dst; | |
444 | KernelStatistics(kd, ld, cd, iter); | |
445 | #endif | |
446 | ||
447 | // swap grids | |
448 | tmp = src; | |
449 | src = dst; | |
450 | dst = tmp; | |
451 | } | |
452 | ||
453 | #pragma omp barrier | |
454 | ||
455 | } // for (int iter = 0; ... | |
456 | ||
457 | MemFree((void **)&tmpArray); | |
458 | } | |
459 | ||
8cafd9ea MW |
460 | X_LIKWID_STOP("list-pull-split-nt-2s"); |
461 | ||
462 | X_KERNEL_END(kernelData); | |
e3f82424 | 463 | |
10988083 MW |
464 | #ifdef VTK_OUTPUT |
465 | if (cd->VtkOutput) { | |
466 | kd->PdfsActive = src; | |
467 | VtkWrite(ld, kd, cd, maxIterations); | |
468 | } | |
469 | #endif | |
470 | ||
471 | #ifdef STATISTICS | |
472 | kd->PdfsActive = src; | |
473 | KernelStatistics(kd, ld, cd, maxIterations); | |
474 | #endif | |
475 | ||
476 | return; | |
477 | } | |
478 |