Commit | Line | Data |
---|---|---|
10988083 MW |
1 | // -------------------------------------------------------------------------- |
2 | // | |
3 | // Copyright | |
4 | // Markus Wittmann, 2016-2017 | |
5 | // RRZE, University of Erlangen-Nuremberg, Germany | |
6 | // markus.wittmann -at- fau.de or hpc -at- rrze.fau.de | |
7 | // | |
8 | // Viktor Haag, 2016 | |
9 | // LSS, University of Erlangen-Nuremberg, Germany | |
10 | // | |
11 | // This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). | |
12 | // | |
13 | // LbmBenchKernels is free software: you can redistribute it and/or modify | |
14 | // it under the terms of the GNU General Public License as published by | |
15 | // the Free Software Foundation, either version 3 of the License, or | |
16 | // (at your option) any later version. | |
17 | // | |
18 | // LbmBenchKernels is distributed in the hope that it will be useful, | |
19 | // but WITHOUT ANY WARRANTY; without even the implied warranty of | |
20 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
21 | // GNU General Public License for more details. | |
22 | // | |
23 | // You should have received a copy of the GNU General Public License | |
24 | // along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>. | |
25 | // | |
26 | // -------------------------------------------------------------------------- | |
27 | #include "BenchKernelD3Q19ListPullSplitNtCommon.h" | |
28 | ||
29 | #include "Memory.h" | |
30 | #include "Vtk.h" | |
31 | #include "Vector.h" | |
32 | ||
33 | #include <inttypes.h> | |
34 | #include <math.h> | |
35 | ||
36 | #ifdef _OPENMP | |
37 | #include <omp.h> | |
38 | #endif | |
39 | ||
40 | #define TMP_UX 18 | |
41 | #define TMP_UY 19 | |
42 | #define TMP_UZ 20 | |
43 | #define TMP_W1 21 | |
44 | #define TMP_W2 22 | |
45 | ||
46 | #define N_TMP 23 | |
47 | ||
48 | #define TMP_INDEX(tmp_index, tmp_dir) nTmpArray * (tmp_dir) + (tmp_index) | |
49 | ||
50 | void FNAME(KernelPullSplitNt1S)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd) | |
51 | { | |
52 | ||
53 | Assert(ld != NULL); | |
54 | Assert(kernelData != NULL); | |
55 | Assert(cd != NULL); | |
56 | ||
57 | Assert(cd->Omega > 0.0); | |
58 | Assert(cd->Omega < 2.0); | |
59 | ||
60 | KernelData * kd = (KernelData *)kernelData; | |
61 | KernelDataList * kdl = KDL(kernelData); | |
62 | KernelDataListRia * kdlr = KDLR(kernelData); | |
63 | ||
64 | PdfT omega = cd->Omega; | |
65 | const PdfT omegaEven = omega; | |
66 | ||
67 | PdfT magicParam = 1.0 / 12.0; | |
68 | const PdfT omegaOdd = 1.0 / (0.5 + magicParam / (1.0 / omega - 0.5)); | |
69 | ||
70 | ||
71 | const PdfT w_0 = 1.0 / 3.0; | |
72 | const PdfT w_1 = 1.0 / 18.0; | |
73 | const PdfT w_2 = 1.0 / 36.0; | |
74 | ||
75 | const PdfT w_1_x3 = w_1 * 3.0; const PdfT w_1_nine_half = w_1 * 9.0 / 2.0; | |
76 | const PdfT w_2_x3 = w_2 * 3.0; const PdfT w_2_nine_half = w_2 * 9.0 / 2.0; | |
77 | ||
78 | const VPDFT vw_1_x3 = VSET(w_1_x3); | |
79 | const VPDFT vw_2_x3 = VSET(w_2_x3); | |
80 | ||
81 | const VPDFT vw_1_nine_half = VSET(w_1_nine_half); | |
82 | const VPDFT vw_2_nine_half = VSET(w_2_nine_half); | |
83 | ||
84 | const VPDFT vomegaEven = VSET(omegaEven); | |
85 | const VPDFT vomegaOdd = VSET(omegaOdd); | |
86 | ||
87 | const VPDFT voneHalf = VSET(0.5); | |
88 | ||
89 | // uint32_t nConsecNodes = kdlr->nConsecNodes; | |
90 | // uint32_t * consecNodes = kdlr->ConsecNodes; | |
91 | // uint32_t consecIndex = 0; | |
92 | // uint32_t consecValue = 0; | |
93 | ||
94 | PdfT * src = kd->Pdfs[0]; | |
95 | PdfT * dst = kd->Pdfs[1]; | |
96 | PdfT * tmp; | |
97 | ||
98 | int maxIterations = cd->MaxIterations; | |
99 | ||
100 | int nFluid = kdl->nFluid; | |
101 | int nCells = kdl->nCells; | |
102 | ||
103 | int nTmpArray = kdlr->nTmpArray; | |
104 | ||
105 | Assert(nTmpArray % VSIZE == 0); | |
106 | ||
107 | uint32_t * adjList = kdl->AdjList; | |
108 | ||
109 | #ifdef VTK_OUTPUT | |
110 | if (cd->VtkOutput) { | |
111 | kd->PdfsActive = src; | |
112 | VtkWrite(ld, kd, cd, -1); | |
113 | } | |
114 | #endif | |
115 | ||
116 | #ifdef STATISTICS | |
117 | kd->PdfsActive = src; | |
118 | KernelStatistics(kd, ld, cd, 0); | |
119 | #endif | |
120 | ||
121 | #ifdef _OPENMP | |
122 | #pragma omp parallel default(none) \ | |
123 | shared(nFluid, nCells, kd, kdl, adjList, src, dst, \ | |
124 | cd, maxIterations, ld, tmp, nTmpArray, \ | |
125 | stderr ) | |
126 | #endif | |
127 | { | |
128 | uint32_t adjListIndex; | |
129 | ||
130 | PdfT ux, uy, uz, ui; | |
131 | VPDFT vux, vuy, vuz, vui; | |
132 | ||
133 | #define X(name, idx, idxinv, x, y, z) PdfT JOIN(pdf_,name); | |
134 | D3Q19_LIST | |
135 | #undef X | |
136 | VPDFT vpdf_a, vpdf_b; | |
137 | ||
138 | PdfT evenPart, oddPart, dir_indep_trm, dens; | |
139 | PdfT w_1_indep, w_2_indep; | |
140 | VPDFT vevenPart, voddPart; | |
141 | VPDFT vw_1_indep, vw_2_indep; | |
142 | ||
143 | int indexMax; | |
144 | ||
145 | PdfT * tmpArray; | |
146 | MemAllocAligned((void **)&tmpArray, sizeof(PdfT) * nTmpArray * N_TMP, VSIZE * sizeof(PdfT)); | |
147 | ||
148 | int nThreads = 1; | |
149 | int threadId = 0; | |
150 | ||
151 | #ifdef _OPENMP | |
152 | nThreads = omp_get_max_threads(); | |
153 | threadId = omp_get_thread_num(); | |
154 | #endif | |
155 | ||
156 | int nCellsThread = nFluid / nThreads; | |
157 | int blIndexStart = threadId * nCellsThread; | |
158 | ||
159 | if (threadId < nFluid % nThreads) { | |
160 | blIndexStart += threadId; | |
161 | nCellsThread += 1; | |
162 | } | |
163 | else { | |
164 | blIndexStart += nFluid % nThreads; | |
165 | } | |
166 | ||
167 | int blIndexStop = blIndexStart + nCellsThread; | |
168 | ||
169 | // We have three loops: | |
170 | // 1. Peeling to ensure alignment for non-temporal stores in loop 2 is correct. | |
171 | // 2. Vectorized handling of nodes. | |
172 | // 3. Remaining nodes, less than vector size. | |
173 | ||
174 | unsigned long addrStart = (unsigned long)&(src[P_INDEX_3(nCells, blIndexStart, 0)]); | |
175 | int nCellsUnaligned = (VSIZE - (int)((addrStart / sizeof(PdfT)) % VSIZE)) % VSIZE; | |
176 | ||
177 | int nCellsVectorized = nCellsThread - nCellsUnaligned; | |
178 | nCellsVectorized = nCellsVectorized - (nCellsVectorized % VSIZE); | |
179 | ||
180 | int blIndexVec = blIndexStart + nCellsUnaligned; | |
181 | int blIndexRemaining = blIndexStart + nCellsUnaligned + nCellsVectorized; | |
182 | ||
183 | // printf("%d [%d, %d, %d, %d[\n", threadId, blIndexStart, blIndexVec, blIndexRemaining, blIndexStop); | |
184 | ||
185 | for(int iter = 0; iter < maxIterations; ++iter) { | |
186 | ||
187 | #if 1 | |
188 | #define INDEX_START blIndexStart | |
189 | #define INDEX_STOP blIndexVec | |
190 | #include "BenchKernelD3Q19ListPullSplitNt1SScalar.h" | |
191 | ||
192 | #define INDEX_START blIndexVec | |
193 | #define INDEX_STOP blIndexRemaining | |
194 | #include "BenchKernelD3Q19ListPullSplitNt1SIntrinsics.h" | |
195 | ||
196 | #define INDEX_START blIndexRemaining | |
197 | #define INDEX_STOP blIndexStop | |
198 | #include "BenchKernelD3Q19ListPullSplitNt1SScalar.h" | |
199 | #else | |
200 | #define INDEX_START blIndexStart | |
201 | #define INDEX_STOP blIndexStop | |
202 | #include "BenchKernelD3Q19ListPullSplitNt1SScalar.h" | |
203 | #endif | |
204 | #pragma omp barrier | |
205 | ||
206 | #pragma omp single | |
207 | { | |
208 | #ifdef VERIFICATION | |
209 | kd->PdfsActive = dst; | |
210 | KernelAddBodyForce(kd, ld, cd); | |
211 | #endif | |
212 | ||
213 | #ifdef VTK_OUTPUT | |
214 | if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) { | |
215 | kd->PdfsActive = dst; | |
216 | VtkWrite(ld, kd, cd, iter); | |
217 | } | |
218 | #endif | |
219 | ||
220 | #ifdef STATISTICS | |
221 | kd->PdfsActive = dst; | |
222 | KernelStatistics(kd, ld, cd, iter); | |
223 | #endif | |
224 | ||
225 | // swap grids | |
226 | tmp = src; | |
227 | src = dst; | |
228 | dst = tmp; | |
229 | } | |
230 | ||
231 | #pragma omp barrier | |
232 | ||
233 | } // for (int iter = 0; ... | |
234 | ||
235 | MemFree((void **)&tmpArray); | |
236 | } | |
237 | ||
238 | #ifdef VTK_OUTPUT | |
239 | if (cd->VtkOutput) { | |
240 | kd->PdfsActive = src; | |
241 | VtkWrite(ld, kd, cd, maxIterations); | |
242 | } | |
243 | #endif | |
244 | ||
245 | #ifdef STATISTICS | |
246 | kd->PdfsActive = src; | |
247 | KernelStatistics(kd, ld, cd, maxIterations); | |
248 | #endif | |
249 | ||
250 | return; | |
251 | } | |
252 | ||
253 | void FNAME(KernelPullSplitNt2S)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd) | |
254 | { | |
255 | ||
256 | Assert(ld != NULL); | |
257 | Assert(kernelData != NULL); | |
258 | Assert(cd != NULL); | |
259 | ||
260 | Assert(cd->Omega > 0.0); | |
261 | Assert(cd->Omega < 2.0); | |
262 | ||
263 | KernelData * kd = (KernelData *)kernelData; | |
264 | KernelDataList * kdl = KDL(kernelData); | |
265 | KernelDataListRia * kdlr = KDLR(kernelData); | |
266 | ||
267 | PdfT omega = cd->Omega; | |
268 | const PdfT omegaEven = omega; | |
269 | ||
270 | PdfT magicParam = 1.0 / 12.0; | |
271 | const PdfT omegaOdd = 1.0 / (0.5 + magicParam / (1.0 / omega - 0.5)); | |
272 | ||
273 | ||
274 | const PdfT w_0 = 1.0 / 3.0; | |
275 | const PdfT w_1 = 1.0 / 18.0; | |
276 | const PdfT w_2 = 1.0 / 36.0; | |
277 | ||
278 | const PdfT w_1_x3 = w_1 * 3.0; const PdfT w_1_nine_half = w_1 * 9.0 / 2.0; | |
279 | const PdfT w_2_x3 = w_2 * 3.0; const PdfT w_2_nine_half = w_2 * 9.0 / 2.0; | |
280 | ||
281 | const VPDFT vw_1_x3 = VSET(w_1_x3); | |
282 | const VPDFT vw_2_x3 = VSET(w_2_x3); | |
283 | ||
284 | const VPDFT vw_1_nine_half = VSET(w_1_nine_half); | |
285 | const VPDFT vw_2_nine_half = VSET(w_2_nine_half); | |
286 | ||
287 | const VPDFT vomegaEven = VSET(omegaEven); | |
288 | const VPDFT vomegaOdd = VSET(omegaOdd); | |
289 | ||
290 | const VPDFT voneHalf = VSET(0.5); | |
291 | ||
292 | // uint32_t nConsecNodes = kdlr->nConsecNodes; | |
293 | // uint32_t * consecNodes = kdlr->ConsecNodes; | |
294 | // uint32_t consecIndex = 0; | |
295 | // uint32_t consecValue = 0; | |
296 | ||
297 | PdfT * src = kd->Pdfs[0]; | |
298 | PdfT * dst = kd->Pdfs[1]; | |
299 | PdfT * tmp; | |
300 | ||
301 | int maxIterations = cd->MaxIterations; | |
302 | ||
303 | int nFluid = kdl->nFluid; | |
304 | int nCells = kdl->nCells; | |
305 | ||
306 | int nTmpArray = kdlr->nTmpArray; | |
307 | ||
308 | Assert(nTmpArray % VSIZE == 0); | |
309 | ||
310 | uint32_t * adjList = kdl->AdjList; | |
311 | ||
312 | #ifdef VTK_OUTPUT | |
313 | if (cd->VtkOutput) { | |
314 | kd->PdfsActive = src; | |
315 | VtkWrite(ld, kd, cd, -1); | |
316 | } | |
317 | #endif | |
318 | ||
319 | #ifdef STATISTICS | |
320 | kd->PdfsActive = src; | |
321 | KernelStatistics(kd, ld, cd, 0); | |
322 | #endif | |
323 | ||
324 | #ifdef _OPENMP | |
325 | #pragma omp parallel default(none) \ | |
326 | shared(nFluid, nCells, kd, kdl, adjList, src, dst, \ | |
327 | cd, maxIterations, ld, tmp, nTmpArray, \ | |
328 | stderr ) | |
329 | #endif | |
330 | { | |
331 | uint32_t adjListIndex; | |
332 | ||
333 | PdfT ux, uy, uz, ui; | |
334 | VPDFT vux, vuy, vuz, vui; | |
335 | ||
336 | #define X(name, idx, idxinv, x, y, z) PdfT JOIN(pdf_,name); | |
337 | D3Q19_LIST | |
338 | #undef X | |
339 | VPDFT vpdf_a, vpdf_b; | |
340 | ||
341 | PdfT evenPart, oddPart, dir_indep_trm, dens; | |
342 | PdfT w_1_indep, w_2_indep; | |
343 | VPDFT vevenPart, voddPart; | |
344 | VPDFT vw_1_indep, vw_2_indep; | |
345 | ||
346 | int indexMax; | |
347 | ||
348 | PdfT * tmpArray; | |
349 | MemAlloc((void **)&tmpArray, sizeof(PdfT) * nTmpArray * N_TMP); | |
350 | ||
351 | int nThreads = 1; | |
352 | int threadId = 0; | |
353 | ||
354 | #ifdef _OPENMP | |
355 | nThreads = omp_get_max_threads(); | |
356 | threadId = omp_get_thread_num(); | |
357 | #endif | |
358 | ||
359 | int nCellsThread = nFluid / nThreads; | |
360 | int blIndexStart = threadId * nCellsThread; | |
361 | ||
362 | if (threadId < nFluid % nThreads) { | |
363 | blIndexStart += threadId; | |
364 | nCellsThread += 1; | |
365 | } | |
366 | else { | |
367 | blIndexStart += nFluid % nThreads; | |
368 | } | |
369 | ||
370 | int blIndexStop = blIndexStart + nCellsThread; | |
371 | ||
372 | // We have three loops: | |
373 | // 1. Peeling to ensure alignment for non-temporal stores in loop 2 is correct. | |
374 | // 2. Vectorized handling of nodes. | |
375 | // 3. Remaining nodes, less than vector size. | |
376 | ||
377 | unsigned long addrStart = (unsigned long)&(src[P_INDEX_3(nCells, blIndexStart, 0)]); | |
378 | int nCellsUnaligned = (VSIZE - (int)((addrStart / sizeof(PdfT)) % VSIZE)) % VSIZE; | |
379 | ||
380 | int nCellsVectorized = nCellsThread - nCellsUnaligned; | |
381 | nCellsVectorized = nCellsVectorized - (nCellsVectorized % VSIZE); | |
382 | ||
383 | int blIndexVec = blIndexStart + nCellsUnaligned; | |
384 | int blIndexRemaining = blIndexStart + nCellsUnaligned + nCellsVectorized; | |
385 | ||
386 | // printf("%d [%d, %d, %d, %d[\n", threadId, blIndexStart, blIndexVec, blIndexRemaining, blIndexStop); | |
387 | ||
388 | for(int iter = 0; iter < maxIterations; ++iter) { | |
389 | ||
390 | #if 1 | |
391 | #define INDEX_START blIndexStart | |
392 | #define INDEX_STOP blIndexVec | |
393 | #include "BenchKernelD3Q19ListPullSplitNt2SScalar.h" | |
394 | ||
395 | #define INDEX_START blIndexVec | |
396 | #define INDEX_STOP blIndexRemaining | |
397 | #include "BenchKernelD3Q19ListPullSplitNt2SIntrinsics.h" | |
398 | ||
399 | #define INDEX_START blIndexRemaining | |
400 | #define INDEX_STOP blIndexStop | |
401 | #include "BenchKernelD3Q19ListPullSplitNt2SScalar.h" | |
402 | #else | |
403 | #define INDEX_START blIndexStart | |
404 | #define INDEX_STOP blIndexStop | |
405 | #include "BenchKernelD3Q19ListPullSplitNt2SScalar.h" | |
406 | #endif | |
407 | #pragma omp barrier | |
408 | ||
409 | #pragma omp single | |
410 | { | |
411 | #ifdef VERIFICATION | |
412 | kd->PdfsActive = dst; | |
413 | KernelAddBodyForce(kd, ld, cd); | |
414 | #endif | |
415 | ||
416 | #ifdef VTK_OUTPUT | |
417 | if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) { | |
418 | kd->PdfsActive = dst; | |
419 | VtkWrite(ld, kd, cd, iter); | |
420 | } | |
421 | #endif | |
422 | ||
423 | #ifdef STATISTICS | |
424 | kd->PdfsActive = dst; | |
425 | KernelStatistics(kd, ld, cd, iter); | |
426 | #endif | |
427 | ||
428 | // swap grids | |
429 | tmp = src; | |
430 | src = dst; | |
431 | dst = tmp; | |
432 | } | |
433 | ||
434 | #pragma omp barrier | |
435 | ||
436 | } // for (int iter = 0; ... | |
437 | ||
438 | MemFree((void **)&tmpArray); | |
439 | } | |
440 | ||
441 | #ifdef VTK_OUTPUT | |
442 | if (cd->VtkOutput) { | |
443 | kd->PdfsActive = src; | |
444 | VtkWrite(ld, kd, cd, maxIterations); | |
445 | } | |
446 | #endif | |
447 | ||
448 | #ifdef STATISTICS | |
449 | kd->PdfsActive = src; | |
450 | KernelStatistics(kd, ld, cd, maxIterations); | |
451 | #endif | |
452 | ||
453 | return; | |
454 | } | |
455 |