Commit | Line | Data |
---|---|---|
10988083 MW |
1 | // -------------------------------------------------------------------------- |
2 | // | |
3 | // Copyright | |
4 | // Markus Wittmann, 2016-2017 | |
5 | // RRZE, University of Erlangen-Nuremberg, Germany | |
6 | // markus.wittmann -at- fau.de or hpc -at- rrze.fau.de | |
7 | // | |
8 | // Viktor Haag, 2016 | |
9 | // LSS, University of Erlangen-Nuremberg, Germany | |
10 | // | |
11 | // This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). | |
12 | // | |
13 | // LbmBenchKernels is free software: you can redistribute it and/or modify | |
14 | // it under the terms of the GNU General Public License as published by | |
15 | // the Free Software Foundation, either version 3 of the License, or | |
16 | // (at your option) any later version. | |
17 | // | |
18 | // LbmBenchKernels is distributed in the hope that it will be useful, | |
19 | // but WITHOUT ANY WARRANTY; without even the implied warranty of | |
20 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
21 | // GNU General Public License for more details. | |
22 | // | |
23 | // You should have received a copy of the GNU General Public License | |
24 | // along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>. | |
25 | // | |
26 | // -------------------------------------------------------------------------- | |
27 | #include "BenchKernelD3Q19ListPullSplitNtCommon.h" | |
28 | ||
29 | #include "Memory.h" | |
30 | #include "Vtk.h" | |
31 | #include "Vector.h" | |
e3f82424 | 32 | #include "LikwidIf.h" |
10988083 MW |
33 | |
34 | #include <inttypes.h> | |
35 | #include <math.h> | |
36 | ||
37 | #ifdef _OPENMP | |
38 | #include <omp.h> | |
39 | #endif | |
40 | ||
41 | #define TMP_UX 18 | |
42 | #define TMP_UY 19 | |
43 | #define TMP_UZ 20 | |
44 | #define TMP_W1 21 | |
45 | #define TMP_W2 22 | |
46 | ||
47 | #define N_TMP 23 | |
48 | ||
49 | #define TMP_INDEX(tmp_index, tmp_dir) nTmpArray * (tmp_dir) + (tmp_index) | |
50 | ||
51 | void FNAME(KernelPullSplitNt1S)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd) | |
52 | { | |
53 | ||
54 | Assert(ld != NULL); | |
55 | Assert(kernelData != NULL); | |
56 | Assert(cd != NULL); | |
57 | ||
0fde6e45 MW |
58 | Assert(cd->Omega > F(0.0)); |
59 | Assert(cd->Omega < F(2.0)); | |
10988083 MW |
60 | |
61 | KernelData * kd = (KernelData *)kernelData; | |
62 | KernelDataList * kdl = KDL(kernelData); | |
63 | KernelDataListRia * kdlr = KDLR(kernelData); | |
64 | ||
65 | PdfT omega = cd->Omega; | |
66 | const PdfT omegaEven = omega; | |
67 | ||
0fde6e45 MW |
68 | PdfT magicParam = F(1.0) / F(12.0); |
69 | const PdfT omegaOdd = F(1.0) / (F(0.5) + magicParam / (F(1.0) / omega - F(0.5))); | |
10988083 MW |
70 | |
71 | ||
0fde6e45 MW |
72 | const PdfT w_0 = F(1.0) / F( 3.0); |
73 | const PdfT w_1 = F(1.0) / F(18.0); | |
74 | const PdfT w_2 = F(1.0) / F(36.0); | |
10988083 | 75 | |
0fde6e45 MW |
76 | const PdfT w_1_x3 = w_1 * F(3.0); const PdfT w_1_nine_half = w_1 * F(9.0) / F(2.0); |
77 | const PdfT w_2_x3 = w_2 * F(3.0); const PdfT w_2_nine_half = w_2 * F(9.0) / F(2.0); | |
10988083 MW |
78 | |
79 | const VPDFT vw_1_x3 = VSET(w_1_x3); | |
80 | const VPDFT vw_2_x3 = VSET(w_2_x3); | |
81 | ||
82 | const VPDFT vw_1_nine_half = VSET(w_1_nine_half); | |
83 | const VPDFT vw_2_nine_half = VSET(w_2_nine_half); | |
84 | ||
85 | const VPDFT vomegaEven = VSET(omegaEven); | |
86 | const VPDFT vomegaOdd = VSET(omegaOdd); | |
87 | ||
0fde6e45 | 88 | const VPDFT voneHalf = VSET(F(0.5)); |
10988083 MW |
89 | |
90 | // uint32_t nConsecNodes = kdlr->nConsecNodes; | |
91 | // uint32_t * consecNodes = kdlr->ConsecNodes; | |
92 | // uint32_t consecIndex = 0; | |
93 | // uint32_t consecValue = 0; | |
94 | ||
95 | PdfT * src = kd->Pdfs[0]; | |
96 | PdfT * dst = kd->Pdfs[1]; | |
97 | PdfT * tmp; | |
98 | ||
99 | int maxIterations = cd->MaxIterations; | |
100 | ||
101 | int nFluid = kdl->nFluid; | |
102 | int nCells = kdl->nCells; | |
103 | ||
104 | int nTmpArray = kdlr->nTmpArray; | |
105 | ||
106 | Assert(nTmpArray % VSIZE == 0); | |
107 | ||
108 | uint32_t * adjList = kdl->AdjList; | |
109 | ||
110 | #ifdef VTK_OUTPUT | |
111 | if (cd->VtkOutput) { | |
112 | kd->PdfsActive = src; | |
113 | VtkWrite(ld, kd, cd, -1); | |
114 | } | |
115 | #endif | |
116 | ||
117 | #ifdef STATISTICS | |
118 | kd->PdfsActive = src; | |
119 | KernelStatistics(kd, ld, cd, 0); | |
120 | #endif | |
121 | ||
e3f82424 MW |
122 | |
123 | X_LIKWID_START("list-pull-split-nt-1s"); | |
10988083 MW |
124 | #ifdef _OPENMP |
125 | #pragma omp parallel default(none) \ | |
126 | shared(nFluid, nCells, kd, kdl, adjList, src, dst, \ | |
127 | cd, maxIterations, ld, tmp, nTmpArray, \ | |
128 | stderr ) | |
129 | #endif | |
130 | { | |
131 | uint32_t adjListIndex; | |
132 | ||
133 | PdfT ux, uy, uz, ui; | |
134 | VPDFT vux, vuy, vuz, vui; | |
135 | ||
136 | #define X(name, idx, idxinv, x, y, z) PdfT JOIN(pdf_,name); | |
137 | D3Q19_LIST | |
138 | #undef X | |
139 | VPDFT vpdf_a, vpdf_b; | |
140 | ||
141 | PdfT evenPart, oddPart, dir_indep_trm, dens; | |
142 | PdfT w_1_indep, w_2_indep; | |
143 | VPDFT vevenPart, voddPart; | |
144 | VPDFT vw_1_indep, vw_2_indep; | |
145 | ||
146 | int indexMax; | |
147 | ||
148 | PdfT * tmpArray; | |
149 | MemAllocAligned((void **)&tmpArray, sizeof(PdfT) * nTmpArray * N_TMP, VSIZE * sizeof(PdfT)); | |
150 | ||
151 | int nThreads = 1; | |
152 | int threadId = 0; | |
153 | ||
154 | #ifdef _OPENMP | |
155 | nThreads = omp_get_max_threads(); | |
156 | threadId = omp_get_thread_num(); | |
157 | #endif | |
158 | ||
159 | int nCellsThread = nFluid / nThreads; | |
160 | int blIndexStart = threadId * nCellsThread; | |
161 | ||
162 | if (threadId < nFluid % nThreads) { | |
163 | blIndexStart += threadId; | |
164 | nCellsThread += 1; | |
165 | } | |
166 | else { | |
167 | blIndexStart += nFluid % nThreads; | |
168 | } | |
169 | ||
170 | int blIndexStop = blIndexStart + nCellsThread; | |
171 | ||
172 | // We have three loops: | |
173 | // 1. Peeling to ensure alignment for non-temporal stores in loop 2 is correct. | |
174 | // 2. Vectorized handling of nodes. | |
175 | // 3. Remaining nodes, less than vector size. | |
176 | ||
177 | unsigned long addrStart = (unsigned long)&(src[P_INDEX_3(nCells, blIndexStart, 0)]); | |
178 | int nCellsUnaligned = (VSIZE - (int)((addrStart / sizeof(PdfT)) % VSIZE)) % VSIZE; | |
179 | ||
180 | int nCellsVectorized = nCellsThread - nCellsUnaligned; | |
181 | nCellsVectorized = nCellsVectorized - (nCellsVectorized % VSIZE); | |
182 | ||
183 | int blIndexVec = blIndexStart + nCellsUnaligned; | |
184 | int blIndexRemaining = blIndexStart + nCellsUnaligned + nCellsVectorized; | |
185 | ||
186 | // printf("%d [%d, %d, %d, %d[\n", threadId, blIndexStart, blIndexVec, blIndexRemaining, blIndexStop); | |
187 | ||
188 | for(int iter = 0; iter < maxIterations; ++iter) { | |
189 | ||
e3f82424 | 190 | |
10988083 MW |
191 | #if 1 |
192 | #define INDEX_START blIndexStart | |
193 | #define INDEX_STOP blIndexVec | |
194 | #include "BenchKernelD3Q19ListPullSplitNt1SScalar.h" | |
195 | ||
196 | #define INDEX_START blIndexVec | |
197 | #define INDEX_STOP blIndexRemaining | |
198 | #include "BenchKernelD3Q19ListPullSplitNt1SIntrinsics.h" | |
199 | ||
200 | #define INDEX_START blIndexRemaining | |
201 | #define INDEX_STOP blIndexStop | |
202 | #include "BenchKernelD3Q19ListPullSplitNt1SScalar.h" | |
203 | #else | |
204 | #define INDEX_START blIndexStart | |
205 | #define INDEX_STOP blIndexStop | |
206 | #include "BenchKernelD3Q19ListPullSplitNt1SScalar.h" | |
207 | #endif | |
e3f82424 MW |
208 | |
209 | ||
10988083 MW |
210 | #pragma omp barrier |
211 | ||
212 | #pragma omp single | |
213 | { | |
214 | #ifdef VERIFICATION | |
215 | kd->PdfsActive = dst; | |
216 | KernelAddBodyForce(kd, ld, cd); | |
217 | #endif | |
218 | ||
219 | #ifdef VTK_OUTPUT | |
220 | if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) { | |
221 | kd->PdfsActive = dst; | |
222 | VtkWrite(ld, kd, cd, iter); | |
223 | } | |
224 | #endif | |
225 | ||
226 | #ifdef STATISTICS | |
227 | kd->PdfsActive = dst; | |
228 | KernelStatistics(kd, ld, cd, iter); | |
229 | #endif | |
230 | ||
231 | // swap grids | |
232 | tmp = src; | |
233 | src = dst; | |
234 | dst = tmp; | |
235 | } | |
236 | ||
237 | #pragma omp barrier | |
238 | ||
239 | } // for (int iter = 0; ... | |
240 | ||
241 | MemFree((void **)&tmpArray); | |
242 | } | |
243 | ||
e3f82424 MW |
244 | |
245 | X_LIKWID_STOP("list-pull-split-nt-1s"); | |
246 | ||
10988083 MW |
247 | #ifdef VTK_OUTPUT |
248 | if (cd->VtkOutput) { | |
249 | kd->PdfsActive = src; | |
250 | VtkWrite(ld, kd, cd, maxIterations); | |
251 | } | |
252 | #endif | |
253 | ||
254 | #ifdef STATISTICS | |
255 | kd->PdfsActive = src; | |
256 | KernelStatistics(kd, ld, cd, maxIterations); | |
257 | #endif | |
258 | ||
259 | return; | |
260 | } | |
261 | ||
262 | void FNAME(KernelPullSplitNt2S)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd) | |
263 | { | |
264 | ||
265 | Assert(ld != NULL); | |
266 | Assert(kernelData != NULL); | |
267 | Assert(cd != NULL); | |
268 | ||
0fde6e45 MW |
269 | Assert(cd->Omega > F(0.0)); |
270 | Assert(cd->Omega < F(2.0)); | |
10988083 MW |
271 | |
272 | KernelData * kd = (KernelData *)kernelData; | |
273 | KernelDataList * kdl = KDL(kernelData); | |
274 | KernelDataListRia * kdlr = KDLR(kernelData); | |
275 | ||
276 | PdfT omega = cd->Omega; | |
277 | const PdfT omegaEven = omega; | |
278 | ||
0fde6e45 MW |
279 | PdfT magicParam = F(1.0) / F(12.0); |
280 | const PdfT omegaOdd = F(1.0) / (F(0.5) + magicParam / (F(1.0) / omega - F(0.5))); | |
10988083 | 281 | |
0fde6e45 MW |
282 | const PdfT w_0 = F(1.0) / F( 3.0); |
283 | const PdfT w_1 = F(1.0) / F(18.0); | |
284 | const PdfT w_2 = F(1.0) / F(36.0); | |
10988083 | 285 | |
0fde6e45 MW |
286 | const PdfT w_1_x3 = w_1 * F(3.0); const PdfT w_1_nine_half = w_1 * F(9.0) / F(2.0); |
287 | const PdfT w_2_x3 = w_2 * F(3.0); const PdfT w_2_nine_half = w_2 * F(9.0) / F(2.0); | |
10988083 MW |
288 | |
289 | const VPDFT vw_1_x3 = VSET(w_1_x3); | |
290 | const VPDFT vw_2_x3 = VSET(w_2_x3); | |
291 | ||
292 | const VPDFT vw_1_nine_half = VSET(w_1_nine_half); | |
293 | const VPDFT vw_2_nine_half = VSET(w_2_nine_half); | |
294 | ||
295 | const VPDFT vomegaEven = VSET(omegaEven); | |
296 | const VPDFT vomegaOdd = VSET(omegaOdd); | |
297 | ||
0fde6e45 | 298 | const VPDFT voneHalf = VSET(F(0.5)); |
10988083 MW |
299 | |
300 | // uint32_t nConsecNodes = kdlr->nConsecNodes; | |
301 | // uint32_t * consecNodes = kdlr->ConsecNodes; | |
302 | // uint32_t consecIndex = 0; | |
303 | // uint32_t consecValue = 0; | |
304 | ||
305 | PdfT * src = kd->Pdfs[0]; | |
306 | PdfT * dst = kd->Pdfs[1]; | |
307 | PdfT * tmp; | |
308 | ||
309 | int maxIterations = cd->MaxIterations; | |
310 | ||
311 | int nFluid = kdl->nFluid; | |
312 | int nCells = kdl->nCells; | |
313 | ||
314 | int nTmpArray = kdlr->nTmpArray; | |
315 | ||
316 | Assert(nTmpArray % VSIZE == 0); | |
317 | ||
318 | uint32_t * adjList = kdl->AdjList; | |
319 | ||
320 | #ifdef VTK_OUTPUT | |
321 | if (cd->VtkOutput) { | |
322 | kd->PdfsActive = src; | |
323 | VtkWrite(ld, kd, cd, -1); | |
324 | } | |
325 | #endif | |
326 | ||
327 | #ifdef STATISTICS | |
328 | kd->PdfsActive = src; | |
329 | KernelStatistics(kd, ld, cd, 0); | |
330 | #endif | |
331 | ||
e3f82424 MW |
332 | |
333 | X_LIKWID_START("list-pull-split-nt-2s"); | |
334 | ||
335 | ||
10988083 MW |
336 | #ifdef _OPENMP |
337 | #pragma omp parallel default(none) \ | |
338 | shared(nFluid, nCells, kd, kdl, adjList, src, dst, \ | |
339 | cd, maxIterations, ld, tmp, nTmpArray, \ | |
340 | stderr ) | |
341 | #endif | |
342 | { | |
343 | uint32_t adjListIndex; | |
344 | ||
345 | PdfT ux, uy, uz, ui; | |
346 | VPDFT vux, vuy, vuz, vui; | |
347 | ||
348 | #define X(name, idx, idxinv, x, y, z) PdfT JOIN(pdf_,name); | |
349 | D3Q19_LIST | |
350 | #undef X | |
351 | VPDFT vpdf_a, vpdf_b; | |
352 | ||
353 | PdfT evenPart, oddPart, dir_indep_trm, dens; | |
354 | PdfT w_1_indep, w_2_indep; | |
355 | VPDFT vevenPart, voddPart; | |
356 | VPDFT vw_1_indep, vw_2_indep; | |
357 | ||
358 | int indexMax; | |
359 | ||
360 | PdfT * tmpArray; | |
361 | MemAlloc((void **)&tmpArray, sizeof(PdfT) * nTmpArray * N_TMP); | |
362 | ||
363 | int nThreads = 1; | |
364 | int threadId = 0; | |
365 | ||
366 | #ifdef _OPENMP | |
367 | nThreads = omp_get_max_threads(); | |
368 | threadId = omp_get_thread_num(); | |
369 | #endif | |
370 | ||
371 | int nCellsThread = nFluid / nThreads; | |
372 | int blIndexStart = threadId * nCellsThread; | |
373 | ||
374 | if (threadId < nFluid % nThreads) { | |
375 | blIndexStart += threadId; | |
376 | nCellsThread += 1; | |
377 | } | |
378 | else { | |
379 | blIndexStart += nFluid % nThreads; | |
380 | } | |
381 | ||
382 | int blIndexStop = blIndexStart + nCellsThread; | |
383 | ||
384 | // We have three loops: | |
385 | // 1. Peeling to ensure alignment for non-temporal stores in loop 2 is correct. | |
386 | // 2. Vectorized handling of nodes. | |
387 | // 3. Remaining nodes, less than vector size. | |
388 | ||
389 | unsigned long addrStart = (unsigned long)&(src[P_INDEX_3(nCells, blIndexStart, 0)]); | |
390 | int nCellsUnaligned = (VSIZE - (int)((addrStart / sizeof(PdfT)) % VSIZE)) % VSIZE; | |
391 | ||
392 | int nCellsVectorized = nCellsThread - nCellsUnaligned; | |
393 | nCellsVectorized = nCellsVectorized - (nCellsVectorized % VSIZE); | |
394 | ||
395 | int blIndexVec = blIndexStart + nCellsUnaligned; | |
396 | int blIndexRemaining = blIndexStart + nCellsUnaligned + nCellsVectorized; | |
397 | ||
398 | // printf("%d [%d, %d, %d, %d[\n", threadId, blIndexStart, blIndexVec, blIndexRemaining, blIndexStop); | |
399 | ||
400 | for(int iter = 0; iter < maxIterations; ++iter) { | |
401 | ||
402 | #if 1 | |
403 | #define INDEX_START blIndexStart | |
404 | #define INDEX_STOP blIndexVec | |
405 | #include "BenchKernelD3Q19ListPullSplitNt2SScalar.h" | |
406 | ||
407 | #define INDEX_START blIndexVec | |
408 | #define INDEX_STOP blIndexRemaining | |
409 | #include "BenchKernelD3Q19ListPullSplitNt2SIntrinsics.h" | |
410 | ||
411 | #define INDEX_START blIndexRemaining | |
412 | #define INDEX_STOP blIndexStop | |
413 | #include "BenchKernelD3Q19ListPullSplitNt2SScalar.h" | |
414 | #else | |
415 | #define INDEX_START blIndexStart | |
416 | #define INDEX_STOP blIndexStop | |
417 | #include "BenchKernelD3Q19ListPullSplitNt2SScalar.h" | |
418 | #endif | |
419 | #pragma omp barrier | |
420 | ||
e3f82424 | 421 | |
10988083 MW |
422 | #pragma omp single |
423 | { | |
424 | #ifdef VERIFICATION | |
425 | kd->PdfsActive = dst; | |
426 | KernelAddBodyForce(kd, ld, cd); | |
427 | #endif | |
428 | ||
429 | #ifdef VTK_OUTPUT | |
430 | if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) { | |
431 | kd->PdfsActive = dst; | |
432 | VtkWrite(ld, kd, cd, iter); | |
433 | } | |
434 | #endif | |
435 | ||
436 | #ifdef STATISTICS | |
437 | kd->PdfsActive = dst; | |
438 | KernelStatistics(kd, ld, cd, iter); | |
439 | #endif | |
440 | ||
441 | // swap grids | |
442 | tmp = src; | |
443 | src = dst; | |
444 | dst = tmp; | |
445 | } | |
446 | ||
447 | #pragma omp barrier | |
448 | ||
449 | } // for (int iter = 0; ... | |
450 | ||
451 | MemFree((void **)&tmpArray); | |
452 | } | |
453 | ||
e3f82424 MW |
454 | X_LIKWID_STOP("list-pull-split-nt-2s"); |
455 | ||
10988083 MW |
456 | #ifdef VTK_OUTPUT |
457 | if (cd->VtkOutput) { | |
458 | kd->PdfsActive = src; | |
459 | VtkWrite(ld, kd, cd, maxIterations); | |
460 | } | |
461 | #endif | |
462 | ||
463 | #ifdef STATISTICS | |
464 | kd->PdfsActive = src; | |
465 | KernelStatistics(kd, ld, cd, maxIterations); | |
466 | #endif | |
467 | ||
468 | return; | |
469 | } | |
470 |