Commit | Line | Data |
---|---|---|
10988083 MW |
1 | // -------------------------------------------------------------------------- |
2 | // | |
3 | // Copyright | |
4 | // Markus Wittmann, 2016-2017 | |
5 | // RRZE, University of Erlangen-Nuremberg, Germany | |
6 | // markus.wittmann -at- fau.de or hpc -at- rrze.fau.de | |
7 | // | |
8 | // Viktor Haag, 2016 | |
9 | // LSS, University of Erlangen-Nuremberg, Germany | |
10 | // | |
11 | // This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). | |
12 | // | |
13 | // LbmBenchKernels is free software: you can redistribute it and/or modify | |
14 | // it under the terms of the GNU General Public License as published by | |
15 | // the Free Software Foundation, either version 3 of the License, or | |
16 | // (at your option) any later version. | |
17 | // | |
18 | // LbmBenchKernels is distributed in the hope that it will be useful, | |
19 | // but WITHOUT ANY WARRANTY; without even the implied warranty of | |
20 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
21 | // GNU General Public License for more details. | |
22 | // | |
23 | // You should have received a copy of the GNU General Public License | |
24 | // along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>. | |
25 | // | |
26 | // -------------------------------------------------------------------------- | |
27 | #include "BenchKernelD3Q19ListPullSplitNtCommon.h" | |
28 | ||
29 | #include "Memory.h" | |
30 | #include "Vtk.h" | |
31 | #include "Vector.h" | |
e3f82424 | 32 | #include "LikwidIf.h" |
10988083 MW |
33 | |
34 | #include <inttypes.h> | |
35 | #include <math.h> | |
36 | ||
37 | #ifdef _OPENMP | |
38 | #include <omp.h> | |
39 | #endif | |
40 | ||
41 | #define TMP_UX 18 | |
42 | #define TMP_UY 19 | |
43 | #define TMP_UZ 20 | |
44 | #define TMP_W1 21 | |
45 | #define TMP_W2 22 | |
46 | ||
47 | #define N_TMP 23 | |
48 | ||
49 | #define TMP_INDEX(tmp_index, tmp_dir) nTmpArray * (tmp_dir) + (tmp_index) | |
50 | ||
51 | void FNAME(KernelPullSplitNt1S)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd) | |
52 | { | |
53 | ||
54 | Assert(ld != NULL); | |
55 | Assert(kernelData != NULL); | |
56 | Assert(cd != NULL); | |
57 | ||
58 | Assert(cd->Omega > 0.0); | |
59 | Assert(cd->Omega < 2.0); | |
60 | ||
61 | KernelData * kd = (KernelData *)kernelData; | |
62 | KernelDataList * kdl = KDL(kernelData); | |
63 | KernelDataListRia * kdlr = KDLR(kernelData); | |
64 | ||
65 | PdfT omega = cd->Omega; | |
66 | const PdfT omegaEven = omega; | |
67 | ||
68 | PdfT magicParam = 1.0 / 12.0; | |
69 | const PdfT omegaOdd = 1.0 / (0.5 + magicParam / (1.0 / omega - 0.5)); | |
70 | ||
71 | ||
72 | const PdfT w_0 = 1.0 / 3.0; | |
73 | const PdfT w_1 = 1.0 / 18.0; | |
74 | const PdfT w_2 = 1.0 / 36.0; | |
75 | ||
76 | const PdfT w_1_x3 = w_1 * 3.0; const PdfT w_1_nine_half = w_1 * 9.0 / 2.0; | |
77 | const PdfT w_2_x3 = w_2 * 3.0; const PdfT w_2_nine_half = w_2 * 9.0 / 2.0; | |
78 | ||
79 | const VPDFT vw_1_x3 = VSET(w_1_x3); | |
80 | const VPDFT vw_2_x3 = VSET(w_2_x3); | |
81 | ||
82 | const VPDFT vw_1_nine_half = VSET(w_1_nine_half); | |
83 | const VPDFT vw_2_nine_half = VSET(w_2_nine_half); | |
84 | ||
85 | const VPDFT vomegaEven = VSET(omegaEven); | |
86 | const VPDFT vomegaOdd = VSET(omegaOdd); | |
87 | ||
88 | const VPDFT voneHalf = VSET(0.5); | |
89 | ||
90 | // uint32_t nConsecNodes = kdlr->nConsecNodes; | |
91 | // uint32_t * consecNodes = kdlr->ConsecNodes; | |
92 | // uint32_t consecIndex = 0; | |
93 | // uint32_t consecValue = 0; | |
94 | ||
95 | PdfT * src = kd->Pdfs[0]; | |
96 | PdfT * dst = kd->Pdfs[1]; | |
97 | PdfT * tmp; | |
98 | ||
99 | int maxIterations = cd->MaxIterations; | |
100 | ||
101 | int nFluid = kdl->nFluid; | |
102 | int nCells = kdl->nCells; | |
103 | ||
104 | int nTmpArray = kdlr->nTmpArray; | |
105 | ||
106 | Assert(nTmpArray % VSIZE == 0); | |
107 | ||
108 | uint32_t * adjList = kdl->AdjList; | |
109 | ||
110 | #ifdef VTK_OUTPUT | |
111 | if (cd->VtkOutput) { | |
112 | kd->PdfsActive = src; | |
113 | VtkWrite(ld, kd, cd, -1); | |
114 | } | |
115 | #endif | |
116 | ||
117 | #ifdef STATISTICS | |
118 | kd->PdfsActive = src; | |
119 | KernelStatistics(kd, ld, cd, 0); | |
120 | #endif | |
121 | ||
e3f82424 MW |
122 | |
123 | X_LIKWID_START("list-pull-split-nt-1s"); | |
10988083 MW |
124 | #ifdef _OPENMP |
125 | #pragma omp parallel default(none) \ | |
126 | shared(nFluid, nCells, kd, kdl, adjList, src, dst, \ | |
127 | cd, maxIterations, ld, tmp, nTmpArray, \ | |
128 | stderr ) | |
129 | #endif | |
130 | { | |
131 | uint32_t adjListIndex; | |
132 | ||
133 | PdfT ux, uy, uz, ui; | |
134 | VPDFT vux, vuy, vuz, vui; | |
135 | ||
136 | #define X(name, idx, idxinv, x, y, z) PdfT JOIN(pdf_,name); | |
137 | D3Q19_LIST | |
138 | #undef X | |
139 | VPDFT vpdf_a, vpdf_b; | |
140 | ||
141 | PdfT evenPart, oddPart, dir_indep_trm, dens; | |
142 | PdfT w_1_indep, w_2_indep; | |
143 | VPDFT vevenPart, voddPart; | |
144 | VPDFT vw_1_indep, vw_2_indep; | |
145 | ||
146 | int indexMax; | |
147 | ||
148 | PdfT * tmpArray; | |
149 | MemAllocAligned((void **)&tmpArray, sizeof(PdfT) * nTmpArray * N_TMP, VSIZE * sizeof(PdfT)); | |
150 | ||
151 | int nThreads = 1; | |
152 | int threadId = 0; | |
153 | ||
154 | #ifdef _OPENMP | |
155 | nThreads = omp_get_max_threads(); | |
156 | threadId = omp_get_thread_num(); | |
157 | #endif | |
158 | ||
159 | int nCellsThread = nFluid / nThreads; | |
160 | int blIndexStart = threadId * nCellsThread; | |
161 | ||
162 | if (threadId < nFluid % nThreads) { | |
163 | blIndexStart += threadId; | |
164 | nCellsThread += 1; | |
165 | } | |
166 | else { | |
167 | blIndexStart += nFluid % nThreads; | |
168 | } | |
169 | ||
170 | int blIndexStop = blIndexStart + nCellsThread; | |
171 | ||
172 | // We have three loops: | |
173 | // 1. Peeling to ensure alignment for non-temporal stores in loop 2 is correct. | |
174 | // 2. Vectorized handling of nodes. | |
175 | // 3. Remaining nodes, less than vector size. | |
176 | ||
177 | unsigned long addrStart = (unsigned long)&(src[P_INDEX_3(nCells, blIndexStart, 0)]); | |
178 | int nCellsUnaligned = (VSIZE - (int)((addrStart / sizeof(PdfT)) % VSIZE)) % VSIZE; | |
179 | ||
180 | int nCellsVectorized = nCellsThread - nCellsUnaligned; | |
181 | nCellsVectorized = nCellsVectorized - (nCellsVectorized % VSIZE); | |
182 | ||
183 | int blIndexVec = blIndexStart + nCellsUnaligned; | |
184 | int blIndexRemaining = blIndexStart + nCellsUnaligned + nCellsVectorized; | |
185 | ||
186 | // printf("%d [%d, %d, %d, %d[\n", threadId, blIndexStart, blIndexVec, blIndexRemaining, blIndexStop); | |
187 | ||
188 | for(int iter = 0; iter < maxIterations; ++iter) { | |
189 | ||
e3f82424 | 190 | |
10988083 MW |
191 | #if 1 |
192 | #define INDEX_START blIndexStart | |
193 | #define INDEX_STOP blIndexVec | |
194 | #include "BenchKernelD3Q19ListPullSplitNt1SScalar.h" | |
195 | ||
196 | #define INDEX_START blIndexVec | |
197 | #define INDEX_STOP blIndexRemaining | |
198 | #include "BenchKernelD3Q19ListPullSplitNt1SIntrinsics.h" | |
199 | ||
200 | #define INDEX_START blIndexRemaining | |
201 | #define INDEX_STOP blIndexStop | |
202 | #include "BenchKernelD3Q19ListPullSplitNt1SScalar.h" | |
203 | #else | |
204 | #define INDEX_START blIndexStart | |
205 | #define INDEX_STOP blIndexStop | |
206 | #include "BenchKernelD3Q19ListPullSplitNt1SScalar.h" | |
207 | #endif | |
e3f82424 MW |
208 | |
209 | ||
10988083 MW |
210 | #pragma omp barrier |
211 | ||
212 | #pragma omp single | |
213 | { | |
214 | #ifdef VERIFICATION | |
215 | kd->PdfsActive = dst; | |
216 | KernelAddBodyForce(kd, ld, cd); | |
217 | #endif | |
218 | ||
219 | #ifdef VTK_OUTPUT | |
220 | if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) { | |
221 | kd->PdfsActive = dst; | |
222 | VtkWrite(ld, kd, cd, iter); | |
223 | } | |
224 | #endif | |
225 | ||
226 | #ifdef STATISTICS | |
227 | kd->PdfsActive = dst; | |
228 | KernelStatistics(kd, ld, cd, iter); | |
229 | #endif | |
230 | ||
231 | // swap grids | |
232 | tmp = src; | |
233 | src = dst; | |
234 | dst = tmp; | |
235 | } | |
236 | ||
237 | #pragma omp barrier | |
238 | ||
239 | } // for (int iter = 0; ... | |
240 | ||
241 | MemFree((void **)&tmpArray); | |
242 | } | |
243 | ||
e3f82424 MW |
244 | |
245 | X_LIKWID_STOP("list-pull-split-nt-1s"); | |
246 | ||
10988083 MW |
247 | #ifdef VTK_OUTPUT |
248 | if (cd->VtkOutput) { | |
249 | kd->PdfsActive = src; | |
250 | VtkWrite(ld, kd, cd, maxIterations); | |
251 | } | |
252 | #endif | |
253 | ||
254 | #ifdef STATISTICS | |
255 | kd->PdfsActive = src; | |
256 | KernelStatistics(kd, ld, cd, maxIterations); | |
257 | #endif | |
258 | ||
259 | return; | |
260 | } | |
261 | ||
262 | void FNAME(KernelPullSplitNt2S)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd) | |
263 | { | |
264 | ||
265 | Assert(ld != NULL); | |
266 | Assert(kernelData != NULL); | |
267 | Assert(cd != NULL); | |
268 | ||
269 | Assert(cd->Omega > 0.0); | |
270 | Assert(cd->Omega < 2.0); | |
271 | ||
272 | KernelData * kd = (KernelData *)kernelData; | |
273 | KernelDataList * kdl = KDL(kernelData); | |
274 | KernelDataListRia * kdlr = KDLR(kernelData); | |
275 | ||
276 | PdfT omega = cd->Omega; | |
277 | const PdfT omegaEven = omega; | |
278 | ||
279 | PdfT magicParam = 1.0 / 12.0; | |
280 | const PdfT omegaOdd = 1.0 / (0.5 + magicParam / (1.0 / omega - 0.5)); | |
281 | ||
282 | ||
283 | const PdfT w_0 = 1.0 / 3.0; | |
284 | const PdfT w_1 = 1.0 / 18.0; | |
285 | const PdfT w_2 = 1.0 / 36.0; | |
286 | ||
287 | const PdfT w_1_x3 = w_1 * 3.0; const PdfT w_1_nine_half = w_1 * 9.0 / 2.0; | |
288 | const PdfT w_2_x3 = w_2 * 3.0; const PdfT w_2_nine_half = w_2 * 9.0 / 2.0; | |
289 | ||
290 | const VPDFT vw_1_x3 = VSET(w_1_x3); | |
291 | const VPDFT vw_2_x3 = VSET(w_2_x3); | |
292 | ||
293 | const VPDFT vw_1_nine_half = VSET(w_1_nine_half); | |
294 | const VPDFT vw_2_nine_half = VSET(w_2_nine_half); | |
295 | ||
296 | const VPDFT vomegaEven = VSET(omegaEven); | |
297 | const VPDFT vomegaOdd = VSET(omegaOdd); | |
298 | ||
299 | const VPDFT voneHalf = VSET(0.5); | |
300 | ||
301 | // uint32_t nConsecNodes = kdlr->nConsecNodes; | |
302 | // uint32_t * consecNodes = kdlr->ConsecNodes; | |
303 | // uint32_t consecIndex = 0; | |
304 | // uint32_t consecValue = 0; | |
305 | ||
306 | PdfT * src = kd->Pdfs[0]; | |
307 | PdfT * dst = kd->Pdfs[1]; | |
308 | PdfT * tmp; | |
309 | ||
310 | int maxIterations = cd->MaxIterations; | |
311 | ||
312 | int nFluid = kdl->nFluid; | |
313 | int nCells = kdl->nCells; | |
314 | ||
315 | int nTmpArray = kdlr->nTmpArray; | |
316 | ||
317 | Assert(nTmpArray % VSIZE == 0); | |
318 | ||
319 | uint32_t * adjList = kdl->AdjList; | |
320 | ||
321 | #ifdef VTK_OUTPUT | |
322 | if (cd->VtkOutput) { | |
323 | kd->PdfsActive = src; | |
324 | VtkWrite(ld, kd, cd, -1); | |
325 | } | |
326 | #endif | |
327 | ||
328 | #ifdef STATISTICS | |
329 | kd->PdfsActive = src; | |
330 | KernelStatistics(kd, ld, cd, 0); | |
331 | #endif | |
332 | ||
e3f82424 MW |
333 | |
334 | X_LIKWID_START("list-pull-split-nt-2s"); | |
335 | ||
336 | ||
10988083 MW |
337 | #ifdef _OPENMP |
338 | #pragma omp parallel default(none) \ | |
339 | shared(nFluid, nCells, kd, kdl, adjList, src, dst, \ | |
340 | cd, maxIterations, ld, tmp, nTmpArray, \ | |
341 | stderr ) | |
342 | #endif | |
343 | { | |
344 | uint32_t adjListIndex; | |
345 | ||
346 | PdfT ux, uy, uz, ui; | |
347 | VPDFT vux, vuy, vuz, vui; | |
348 | ||
349 | #define X(name, idx, idxinv, x, y, z) PdfT JOIN(pdf_,name); | |
350 | D3Q19_LIST | |
351 | #undef X | |
352 | VPDFT vpdf_a, vpdf_b; | |
353 | ||
354 | PdfT evenPart, oddPart, dir_indep_trm, dens; | |
355 | PdfT w_1_indep, w_2_indep; | |
356 | VPDFT vevenPart, voddPart; | |
357 | VPDFT vw_1_indep, vw_2_indep; | |
358 | ||
359 | int indexMax; | |
360 | ||
361 | PdfT * tmpArray; | |
362 | MemAlloc((void **)&tmpArray, sizeof(PdfT) * nTmpArray * N_TMP); | |
363 | ||
364 | int nThreads = 1; | |
365 | int threadId = 0; | |
366 | ||
367 | #ifdef _OPENMP | |
368 | nThreads = omp_get_max_threads(); | |
369 | threadId = omp_get_thread_num(); | |
370 | #endif | |
371 | ||
372 | int nCellsThread = nFluid / nThreads; | |
373 | int blIndexStart = threadId * nCellsThread; | |
374 | ||
375 | if (threadId < nFluid % nThreads) { | |
376 | blIndexStart += threadId; | |
377 | nCellsThread += 1; | |
378 | } | |
379 | else { | |
380 | blIndexStart += nFluid % nThreads; | |
381 | } | |
382 | ||
383 | int blIndexStop = blIndexStart + nCellsThread; | |
384 | ||
385 | // We have three loops: | |
386 | // 1. Peeling to ensure alignment for non-temporal stores in loop 2 is correct. | |
387 | // 2. Vectorized handling of nodes. | |
388 | // 3. Remaining nodes, less than vector size. | |
389 | ||
390 | unsigned long addrStart = (unsigned long)&(src[P_INDEX_3(nCells, blIndexStart, 0)]); | |
391 | int nCellsUnaligned = (VSIZE - (int)((addrStart / sizeof(PdfT)) % VSIZE)) % VSIZE; | |
392 | ||
393 | int nCellsVectorized = nCellsThread - nCellsUnaligned; | |
394 | nCellsVectorized = nCellsVectorized - (nCellsVectorized % VSIZE); | |
395 | ||
396 | int blIndexVec = blIndexStart + nCellsUnaligned; | |
397 | int blIndexRemaining = blIndexStart + nCellsUnaligned + nCellsVectorized; | |
398 | ||
399 | // printf("%d [%d, %d, %d, %d[\n", threadId, blIndexStart, blIndexVec, blIndexRemaining, blIndexStop); | |
400 | ||
401 | for(int iter = 0; iter < maxIterations; ++iter) { | |
402 | ||
403 | #if 1 | |
404 | #define INDEX_START blIndexStart | |
405 | #define INDEX_STOP blIndexVec | |
406 | #include "BenchKernelD3Q19ListPullSplitNt2SScalar.h" | |
407 | ||
408 | #define INDEX_START blIndexVec | |
409 | #define INDEX_STOP blIndexRemaining | |
410 | #include "BenchKernelD3Q19ListPullSplitNt2SIntrinsics.h" | |
411 | ||
412 | #define INDEX_START blIndexRemaining | |
413 | #define INDEX_STOP blIndexStop | |
414 | #include "BenchKernelD3Q19ListPullSplitNt2SScalar.h" | |
415 | #else | |
416 | #define INDEX_START blIndexStart | |
417 | #define INDEX_STOP blIndexStop | |
418 | #include "BenchKernelD3Q19ListPullSplitNt2SScalar.h" | |
419 | #endif | |
420 | #pragma omp barrier | |
421 | ||
e3f82424 | 422 | |
10988083 MW |
423 | #pragma omp single |
424 | { | |
425 | #ifdef VERIFICATION | |
426 | kd->PdfsActive = dst; | |
427 | KernelAddBodyForce(kd, ld, cd); | |
428 | #endif | |
429 | ||
430 | #ifdef VTK_OUTPUT | |
431 | if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) { | |
432 | kd->PdfsActive = dst; | |
433 | VtkWrite(ld, kd, cd, iter); | |
434 | } | |
435 | #endif | |
436 | ||
437 | #ifdef STATISTICS | |
438 | kd->PdfsActive = dst; | |
439 | KernelStatistics(kd, ld, cd, iter); | |
440 | #endif | |
441 | ||
442 | // swap grids | |
443 | tmp = src; | |
444 | src = dst; | |
445 | dst = tmp; | |
446 | } | |
447 | ||
448 | #pragma omp barrier | |
449 | ||
450 | } // for (int iter = 0; ... | |
451 | ||
452 | MemFree((void **)&tmpArray); | |
453 | } | |
454 | ||
e3f82424 MW |
455 | X_LIKWID_STOP("list-pull-split-nt-2s"); |
456 | ||
10988083 MW |
457 | #ifdef VTK_OUTPUT |
458 | if (cd->VtkOutput) { | |
459 | kd->PdfsActive = src; | |
460 | VtkWrite(ld, kd, cd, maxIterations); | |
461 | } | |
462 | #endif | |
463 | ||
464 | #ifdef STATISTICS | |
465 | kd->PdfsActive = src; | |
466 | KernelStatistics(kd, ld, cd, maxIterations); | |
467 | #endif | |
468 | ||
469 | return; | |
470 | } | |
471 |