Commit | Line | Data |
---|---|---|
0fde6e45 MW |
1 | // -------------------------------------------------------------------------- |
2 | // | |
3 | // Copyright | |
4 | // Markus Wittmann, 2016-2017 | |
5 | // RRZE, University of Erlangen-Nuremberg, Germany | |
6 | // markus.wittmann -at- fau.de or hpc -at- rrze.fau.de | |
7 | // | |
8 | // Viktor Haag, 2016 | |
9 | // LSS, University of Erlangen-Nuremberg, Germany | |
10 | // | |
11 | // This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). | |
12 | // | |
13 | // LbmBenchKernels is free software: you can redistribute it and/or modify | |
14 | // it under the terms of the GNU General Public License as published by | |
15 | // the Free Software Foundation, either version 3 of the License, or | |
16 | // (at your option) any later version. | |
17 | // | |
18 | // LbmBenchKernels is distributed in the hope that it will be useful, | |
19 | // but WITHOUT ANY WARRANTY; without even the implied warranty of | |
20 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
21 | // GNU General Public License for more details. | |
22 | // | |
23 | // You should have received a copy of the GNU General Public License | |
24 | // along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>. | |
25 | // | |
26 | // -------------------------------------------------------------------------- | |
27 | #include "BenchKernelD3Q19AaVecCommon.h" | |
28 | ||
29 | #include "Memory.h" | |
30 | #include "Vtk.h" | |
31 | #include "LikwidIf.h" | |
32 | #include "Vector.h" | |
33 | #include "Vector.h" | |
34 | ||
35 | #include <inttypes.h> | |
36 | #include <math.h> | |
37 | ||
38 | #ifdef _OPENMP | |
39 | #include <omp.h> | |
40 | #endif | |
41 | ||
42 | static void KernelEven(LatticeDesc * ld, KernelData * kd, CaseData * cd); | |
43 | static void KernelOddVecSl(LatticeDesc * ld, KernelData * kd, CaseData * cd); | |
44 | ||
45 | #if 1 // {{{ | |
46 | void DumpPdfs(LatticeDesc * ld, KernelData * kd, int zStart, int zStop, int iter, const char * prefix, int dir) | |
47 | { | |
48 | int * gDims = kd->GlobalDims; | |
49 | ||
50 | int nX = gDims[0]; | |
51 | int nY = gDims[1]; | |
52 | // int nZ = gDims[2]; | |
53 | ||
54 | PdfT pdfs[N_D3Q19]; | |
55 | ||
56 | int localZStart = zStart; | |
57 | int localZStop = zStop; | |
58 | ||
59 | if (localZStart == -1) localZStart = 0; | |
60 | if (localZStop == -1) localZStop = gDims[2] - 1; | |
61 | ||
62 | printf("D iter: %d dir: %d %s\n", iter, dir, D3Q19_NAMES[dir]); | |
63 | ||
64 | // for (int dir = 0; dir < 19; ++dir) { | |
65 | for (int z = localZStop; z >= localZStart; --z) { | |
66 | printf("D [%2d][%2d][%s] plane % 2d\n", iter, dir, prefix, z); | |
67 | ||
68 | for(int y = 0; y < nY; ++y) { | |
69 | // for(int y = 2; y < nY - 2; ++y) { | |
70 | printf("D [%2d][%2d][%s] %2d ", iter, dir, prefix, y); | |
71 | ||
72 | for(int x = 0; x < nX; ++x) { | |
73 | ||
74 | if (1) { // ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) { | |
75 | ||
76 | #define I(x, y, z, dir) P_INDEX_5(gDims, (x), (y), (z), (dir)) | |
77 | pdfs[dir] = kd->PdfsActive[I(x, y, z, dir)]; | |
78 | #undef I | |
79 | } | |
80 | else { | |
81 | pdfs[dir] = -1.0; | |
82 | } | |
83 | ||
84 | printf("%.16e ", pdfs[dir]); | |
85 | // printf("%08.0f ", pdfs[dir]); | |
86 | } | |
87 | ||
88 | printf("\n"); | |
89 | } | |
90 | } | |
91 | // } | |
92 | } | |
93 | #endif // }}} | |
94 | ||
95 | void FNAME(D3Q19AaVecSlKernel)(LatticeDesc * ld, KernelData * kd, CaseData * cd) | |
96 | { | |
97 | Assert(ld != NULL); | |
98 | Assert(kd != NULL); | |
99 | Assert(cd != NULL); | |
100 | ||
101 | Assert(cd->Omega > 0.0); | |
102 | Assert(cd->Omega < 2.0); | |
103 | ||
104 | KernelDataAa * kda = KDA(kd); | |
105 | ||
106 | PdfT * src = kd->PdfsActive; | |
107 | ||
108 | int maxIterations = cd->MaxIterations; | |
109 | ||
110 | #ifdef VTK_OUTPUT | |
111 | if (cd->VtkOutput) { | |
112 | kd->PdfsActive = src; | |
113 | VtkWrite(ld, kd, cd, -1); | |
114 | } | |
115 | #endif | |
116 | ||
117 | #ifdef STATISTICS | |
118 | kd->PdfsActive = src; | |
119 | KernelStatistics(kd, ld, cd, 0); | |
120 | #endif | |
121 | ||
122 | Assert((maxIterations % 2) == 0); | |
123 | ||
8cafd9ea MW |
124 | X_KERNEL_START(kd); |
125 | ||
0fde6e45 MW |
126 | #ifdef _OPENMP |
127 | #pragma omp parallel default(none) shared(kda, kd, ld, cd, src, maxIterations) | |
128 | #endif | |
129 | { | |
130 | for (int iter = 0; iter < maxIterations; iter += 2) { | |
131 | ||
132 | // -------------------------------------------------------------------- | |
133 | // even time step | |
134 | // -------------------------------------------------------------------- | |
135 | ||
136 | X_LIKWID_START("aa-vec-even"); | |
137 | ||
138 | KernelEven(ld, kd, cd); | |
139 | #ifdef _OPENMP | |
140 | #pragma omp barrier | |
141 | #endif | |
142 | ||
143 | X_LIKWID_STOP("aa-vec-even"); | |
144 | ||
145 | // Fixup bounce back PDFs. | |
146 | #ifdef _OPENMP | |
147 | #pragma omp for | |
148 | #endif | |
149 | #ifdef INTEL_OPT_DIRECTIVES | |
150 | #pragma ivdep | |
151 | #endif | |
152 | for (int i = 0; i < kd->nBounceBackPdfs; ++i) { | |
153 | src[kd->BounceBackPdfsSrc[i]] = src[kd->BounceBackPdfsDst[i]]; | |
154 | } | |
155 | ||
156 | #ifdef _OPENMP | |
157 | #pragma omp single | |
158 | #endif | |
159 | { | |
160 | // save current iteration | |
161 | kda->Iteration = iter; | |
162 | ||
163 | #ifdef VERIFICATION | |
164 | kd->PdfsActive = src; | |
165 | KernelAddBodyForce(kd, ld, cd); | |
166 | #endif | |
167 | ||
168 | #ifdef VTK_OUTPUT | |
169 | if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) { | |
170 | kd->PdfsActive = src; | |
171 | VtkWrite(ld, kd, cd, iter); | |
172 | } | |
173 | #endif | |
174 | ||
175 | #ifdef STATISTICS | |
176 | kd->PdfsActive = src; | |
177 | KernelStatistics(kd, ld, cd, iter); | |
178 | #endif | |
179 | } | |
180 | #ifdef _OPENMP | |
181 | #pragma omp barrier | |
182 | #endif | |
183 | ||
184 | ||
185 | // -------------------------------------------------------------------- | |
186 | // odd time step | |
187 | // -------------------------------------------------------------------- | |
188 | ||
189 | X_LIKWID_START("aa-vec-odd"); | |
190 | ||
191 | ||
192 | KernelOddVecSl(ld, kd, cd); | |
193 | #ifdef _OPENMP | |
194 | #pragma omp barrier | |
195 | #endif | |
196 | ||
197 | // Stop counters before bounce back. Else computing loop balance will | |
198 | // be incorrect. | |
199 | ||
200 | X_LIKWID_STOP("aa-vec-odd"); | |
201 | ||
202 | // Fixup bounce back PDFs. | |
203 | #ifdef _OPENMP | |
204 | #pragma omp for | |
205 | #endif | |
206 | #ifdef INTEL_OPT_DIRECTIVES | |
207 | #pragma ivdep | |
208 | #endif | |
209 | for (int i = 0; i < kd->nBounceBackPdfs; ++i) { | |
210 | src[kd->BounceBackPdfsDst[i]] = src[kd->BounceBackPdfsSrc[i]]; | |
211 | } | |
212 | ||
213 | #ifdef _OPENMP | |
214 | #pragma omp single | |
215 | #endif | |
216 | { | |
217 | // save current iteration | |
218 | kda->Iteration = iter + 1; | |
219 | ||
220 | #ifdef VERIFICATION | |
221 | kd->PdfsActive = src; | |
222 | KernelAddBodyForce(kd, ld, cd); | |
223 | #endif | |
224 | ||
225 | #ifdef VTK_OUTPUT | |
226 | if (cd->VtkOutput && ((iter + 1) % cd->VtkModulus) == 0) { | |
227 | kd->PdfsActive = src; | |
228 | VtkWrite(ld, kd, cd, iter + 1); | |
229 | } | |
230 | #endif | |
231 | ||
232 | #ifdef STATISTICS | |
233 | kd->PdfsActive = src; | |
234 | KernelStatistics(kd, ld, cd, iter + 1); | |
235 | #endif | |
236 | } | |
237 | #ifdef _OPENMP | |
238 | #pragma omp barrier | |
239 | #endif | |
240 | } // for (int iter = 0; ... | |
241 | } // omp parallel | |
242 | ||
8cafd9ea MW |
243 | X_KERNEL_END(kd); |
244 | ||
0fde6e45 MW |
245 | #ifdef VTK_OUTPUT |
246 | ||
247 | if (cd->VtkOutput) { | |
248 | kd->PdfsActive = src; | |
249 | VtkWrite(ld, kd, cd, maxIterations); | |
250 | } | |
251 | ||
252 | #endif | |
253 | ||
254 | return; | |
255 | } | |
256 | ||
257 | static void KernelEven(LatticeDesc * ld, KernelData * kd, CaseData * cd) // {{{ | |
258 | { | |
259 | Assert(ld != NULL); | |
260 | Assert(kd != NULL); | |
261 | Assert(cd != NULL); | |
262 | ||
263 | Assert(cd->Omega > F(0.0)); | |
264 | Assert(cd->Omega < F(2.0)); | |
265 | ||
266 | KernelDataAa * kda = KDA(kd); | |
267 | ||
268 | int nX = ld->Dims[0]; | |
269 | int nY = ld->Dims[1]; | |
270 | int nZ = ld->Dims[2]; | |
271 | ||
272 | int * gDims = kd->GlobalDims; | |
273 | ||
274 | int oX = kd->Offsets[0]; | |
275 | int oY = kd->Offsets[1]; | |
276 | int oZ = kd->Offsets[2]; | |
277 | ||
278 | int blk[3]; | |
279 | blk[0] = kda->Blk[0]; | |
280 | blk[1] = kda->Blk[1]; | |
281 | blk[2] = kda->Blk[2]; | |
282 | ||
283 | PdfT omega = cd->Omega; | |
284 | PdfT omegaEven = omega; | |
285 | ||
286 | PdfT magicParam = F(1.0) / F(12.0); | |
287 | PdfT omegaOdd = F(1.0) / (F(0.5) + magicParam / (F(1.0) / omega - F(0.5))); | |
288 | ||
289 | const PdfT w_0 = F(1.0) / F( 3.0); | |
290 | const PdfT w_1 = F(1.0) / F(18.0); | |
291 | const PdfT w_2 = F(1.0) / F(36.0); | |
292 | ||
293 | const PdfT w_1_x3 = w_1 * F(3.0); const PdfT w_1_nine_half = w_1 * F(9.0) / F(2.0); | |
294 | const PdfT w_2_x3 = w_2 * F(3.0); const PdfT w_2_nine_half = w_2 * F(9.0) / F(2.0); | |
295 | ||
296 | ||
297 | VPDFT VONE_HALF = VSET(F(0.5)); | |
298 | VPDFT VTHREE_HALF = VSET(F(3.0) / F(2.0)); | |
299 | ||
300 | VPDFT vw_1_indep, vw_2_indep; | |
301 | VPDFT vw_0 = VSET(w_0); | |
302 | VPDFT vw_1 = VSET(w_1); | |
303 | VPDFT vw_2 = VSET(w_2); | |
304 | ||
305 | VPDFT vw_1_x3 = VSET(w_1_x3); | |
306 | VPDFT vw_2_x3 = VSET(w_2_x3); | |
307 | VPDFT vw_1_nine_half = VSET(w_1_nine_half); | |
308 | VPDFT vw_2_nine_half = VSET(w_2_nine_half); | |
309 | ||
310 | VPDFT vui, vux, vuy, vuz, vdens; | |
311 | ||
312 | VPDFT vevenPart, voddPart, vdir_indep_trm; | |
313 | ||
314 | VPDFT vomegaEven = VSET(omegaEven); | |
315 | VPDFT vomegaOdd = VSET(omegaOdd); | |
316 | ||
317 | VPDFT vpdf_a, vpdf_b; | |
318 | ||
319 | // Declare pdf_N, pdf_E, pdf_S, pdf_W, ... | |
320 | #define X(name, idx, idxinv, x, y, z) VPDFT JOIN(vpdf_,name); PdfT * JOIN(ppdf_,name); | |
321 | D3Q19_LIST | |
322 | #undef X | |
323 | ||
324 | PdfT * src = kd->Pdfs[0]; | |
325 | ||
326 | int nThreads = 1; | |
327 | int threadId = 0; | |
328 | ||
329 | #ifdef _OPENMP | |
330 | nThreads = omp_get_max_threads(); | |
331 | threadId = omp_get_thread_num(); | |
332 | #endif | |
333 | ||
334 | const int nodesPlane = gDims[1] * gDims[2]; | |
335 | const int nodesCol = gDims[2]; | |
336 | ||
337 | #define I(x, y, z, dir) P_INDEX_5(gDims, (x), (y), (z), (dir)) | |
338 | ||
339 | // TODO: make inline function out of macros. | |
340 | ||
341 | #define IMPLODE(_x, _y, _z) (nodesPlane * (_x) + nodesCol * (_y) + (_z)) | |
342 | #define EXPLODE(index, _x, _y, _z) _x = index / (nodesPlane); _y = (index - nodesPlane * (_x)) / nodesCol; _z = index - nodesPlane * (_x) - nodesCol * (_y); | |
343 | ||
344 | int startX = oX; | |
345 | int startY = oY; | |
346 | int startZ = oZ; | |
347 | ||
348 | int indexStart = IMPLODE(startX, startY, startZ); | |
349 | int indexEnd = IMPLODE(startX + nX - 1, startY + nY - 1, startZ + nZ - 1); | |
350 | ||
351 | // How many cells as multiples of VSIZE do we have (rounded up)? | |
352 | int nVCells = (indexEnd - indexStart + 1 + VSIZE - 1) / VSIZE; | |
353 | ||
354 | int threadStart = nVCells / nThreads * threadId; | |
355 | int threadEnd = nVCells / nThreads * (threadId + 1); | |
356 | ||
357 | if (nVCells % nThreads > threadId) { | |
358 | threadStart += threadId; | |
359 | threadEnd += threadId + 1; | |
360 | } | |
361 | else { | |
362 | threadStart += nVCells % nThreads; | |
363 | threadEnd += nVCells % nThreads; | |
364 | } | |
365 | ||
366 | threadStart *= VSIZE; | |
367 | threadEnd *= VSIZE; | |
368 | ||
369 | // As threadStart/End is now in the granularity of cells we add the start offset. | |
370 | threadStart += indexStart; | |
371 | threadEnd += indexStart; | |
372 | ||
373 | EXPLODE(threadStart, startX, startY, startZ); | |
374 | ||
375 | #undef EXPLODE | |
376 | #undef IMPLODE | |
377 | ||
378 | #define X(name, idx, idxinv, _x, _y, _z) JOIN(ppdf_,name) = &src[I(startX, startY, startZ, idx)]; | |
379 | D3Q19_LIST | |
380 | #undef X | |
381 | ||
382 | // printf("e thread %d idx start: %d end: %d thread start: %d end: %d\n", | |
383 | // threadId, indexStart, indexEnd, threadStart, threadEnd); | |
384 | ||
385 | ||
8cafd9ea | 386 | for (int i = threadStart; i < threadEnd; i += VSIZE) { // LOOP aa-vec-sl-even |
0fde6e45 MW |
387 | |
388 | // Load PDFs of local cell: pdf_N = src[I(x, y, z, D3Q19_N)]; ... | |
389 | // #define X(name, idx, idxinv, _x, _y, _z) JOIN(vpdf_,name) = VLDU(&src[I(x, y, z, idx)]); | |
390 | #define X(name, idx, idxinv, _x, _y, _z) JOIN(vpdf_,name) = VLDU(JOIN(ppdf_,name)); | |
391 | D3Q19_LIST | |
392 | #undef X | |
393 | ||
394 | ||
395 | vux = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_E,VADD(vpdf_NE,vpdf_SE)),VADD(vpdf_TE,vpdf_BE)),vpdf_W),vpdf_NW),vpdf_SW),vpdf_TW),vpdf_BW); | |
396 | vuy = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_N,VADD(vpdf_NE,vpdf_NW)),VADD(vpdf_TN,vpdf_BN)),vpdf_S),vpdf_SE),vpdf_SW),vpdf_TS),vpdf_BS); | |
397 | vuz = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_T,VADD(vpdf_TE,vpdf_TW)),VADD(vpdf_TN,vpdf_TS)),vpdf_B),vpdf_BE),vpdf_BW),vpdf_BN),vpdf_BS); | |
398 | ||
399 | vdens = VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(vpdf_C,VADD(vpdf_N,vpdf_E)),VADD(vpdf_S,vpdf_W)),VADD(vpdf_NE,vpdf_SE)), | |
400 | VADD(vpdf_SW,vpdf_NW)),VADD(vpdf_T,vpdf_TN)),VADD(vpdf_TE,vpdf_TS)),VADD(vpdf_TW,vpdf_B)), | |
401 | VADD(vpdf_BN,vpdf_BE)),VADD(vpdf_BS,vpdf_BW)); | |
402 | ||
403 | vdir_indep_trm = VSUB(vdens,VMUL(VADD(VADD(VMUL(vux,vux),VMUL(vuy,vuy)),VMUL(vuz,vuz)),VTHREE_HALF)); | |
404 | ||
405 | VSTU(ppdf_C, VSUB(vpdf_C,VMUL(vomegaEven,VSUB(vpdf_C,VMUL(vw_0,vdir_indep_trm))))); | |
406 | ||
407 | vw_1_indep = VMUL(vw_1,vdir_indep_trm); | |
408 | vw_2_indep = VMUL(vw_2,vdir_indep_trm); | |
409 | ||
410 | #if defined(LOOP_1) || defined(LOOP_2) | |
411 | #error Loop macros are not allowed to be defined here. | |
412 | #endif | |
413 | ||
414 | #define LOOP_1(_dir1, _dir2, _vel) \ | |
415 | vui = _vel; \ | |
416 | vpdf_a = JOIN(vpdf_,_dir1); \ | |
417 | vpdf_b = JOIN(vpdf_,_dir2); \ | |
418 | \ | |
419 | vevenPart = VMUL(vomegaEven, VSUB(VSUB(VMUL(VONE_HALF, VADD(vpdf_a, vpdf_b)), VMUL(vui, VMUL(vui, vw_1_nine_half))), vw_1_indep)); \ | |
420 | voddPart = VMUL(vomegaOdd, VSUB( VMUL(VONE_HALF, VSUB(vpdf_a, vpdf_b)), VMUL(vui, vw_1_x3))); \ | |
421 | \ | |
422 | VSTU(JOIN(ppdf_,_dir2), VSUB(VSUB(vpdf_a, vevenPart), voddPart)); \ | |
423 | VSTU(JOIN(ppdf_,_dir1), VADD(VSUB(vpdf_b, vevenPart), voddPart)); | |
424 | ||
425 | #define LOOP_2(_dir1, _dir2, _expr) \ | |
426 | vui = _expr; \ | |
427 | vpdf_a = JOIN(vpdf_,_dir1); \ | |
428 | vpdf_b = JOIN(vpdf_,_dir2); \ | |
429 | \ | |
430 | vevenPart = VMUL(vomegaEven, VSUB(VSUB(VMUL(VONE_HALF, VADD(vpdf_a, vpdf_b)), VMUL(vui, VMUL(vui, vw_2_nine_half))), vw_2_indep)); \ | |
431 | voddPart = VMUL(vomegaOdd, VSUB( VMUL(VONE_HALF, VSUB(vpdf_a, vpdf_b)), VMUL(vui, vw_2_x3))); \ | |
432 | \ | |
433 | VSTU(JOIN(ppdf_,_dir2), VSUB(VSUB(vpdf_a, vevenPart), voddPart)); \ | |
434 | VSTU(JOIN(ppdf_,_dir1), VADD(VSUB(vpdf_b, vevenPart), voddPart)); | |
435 | ||
436 | LOOP_1(N, S, vuy); | |
437 | LOOP_1(E, W, vux); | |
438 | LOOP_1(T, B, vuz); | |
439 | ||
440 | LOOP_2(NW, SE, VSUB(vuy, vux)); | |
441 | LOOP_2(NE, SW, VADD(vuy, vux)); | |
442 | LOOP_2(TW, BE, VSUB(vuz, vux)); | |
443 | LOOP_2(TE, BW, VADD(vuz, vux)); | |
444 | LOOP_2(TS, BN, VSUB(vuz, vuy)); | |
445 | LOOP_2(TN, BS, VADD(vuz, vuy)); | |
446 | ||
447 | #undef LOOP_1 | |
448 | #undef LOOP_2 | |
449 | ||
450 | #define X(name, idx, idxinv, _x, _y, _z) JOIN(ppdf_,name) += VSIZE; | |
451 | D3Q19_LIST | |
452 | #undef X | |
453 | } | |
454 | ||
455 | #undef I | |
456 | ||
457 | return; | |
458 | } // }}} | |
459 | ||
460 | ||
461 | static void KernelOddVecSl(LatticeDesc * ld, KernelData * kd, CaseData * cd) // {{{ | |
462 | { | |
463 | Assert(ld != NULL); | |
464 | Assert(kd != NULL); | |
465 | Assert(cd != NULL); | |
466 | ||
467 | Assert(cd->Omega > 0.0); | |
468 | Assert(cd->Omega < F(2.0)); | |
469 | ||
470 | KernelDataAa * kda = KDA(kd); | |
471 | ||
472 | int nX = ld->Dims[0]; | |
473 | int nY = ld->Dims[1]; | |
474 | int nZ = ld->Dims[2]; | |
475 | ||
476 | int * gDims = kd->GlobalDims; | |
477 | ||
478 | int oX = kd->Offsets[0]; | |
479 | int oY = kd->Offsets[1]; | |
480 | int oZ = kd->Offsets[2]; | |
481 | ||
482 | int blk[3]; | |
483 | blk[0] = kda->Blk[0]; | |
484 | blk[1] = kda->Blk[1]; | |
485 | blk[2] = kda->Blk[2]; | |
486 | ||
487 | PdfT omega = cd->Omega; | |
488 | PdfT omegaEven = omega; | |
489 | ||
490 | PdfT magicParam = F(1.0) / F(12.0); | |
491 | PdfT omegaOdd = F(1.0) / (F(0.5) + magicParam / (F(1.0) / omega - F(0.5))); | |
492 | ||
493 | const PdfT w_0 = F(1.0) / F( 3.0); | |
494 | const PdfT w_1 = F(1.0) / F(18.0); | |
495 | const PdfT w_2 = F(1.0) / F(36.0); | |
496 | ||
497 | const PdfT w_1_x3 = w_1 * F(3.0); const PdfT w_1_nine_half = w_1 * F(9.0) / F(2.0); | |
498 | const PdfT w_2_x3 = w_2 * F(3.0); const PdfT w_2_nine_half = w_2 * F(9.0) / F(2.0); | |
499 | ||
500 | VPDFT VONE_HALF = VSET(F(0.5)); | |
501 | VPDFT VTHREE_HALF = VSET(F(3.0) / F(2.0)); | |
502 | ||
503 | VPDFT vw_1_indep, vw_2_indep; | |
504 | VPDFT vw_0 = VSET(w_0); | |
505 | VPDFT vw_1 = VSET(w_1); | |
506 | VPDFT vw_2 = VSET(w_2); | |
507 | ||
508 | VPDFT vw_1_x3 = VSET(w_1_x3); | |
509 | VPDFT vw_2_x3 = VSET(w_2_x3); | |
510 | VPDFT vw_1_nine_half = VSET(w_1_nine_half); | |
511 | VPDFT vw_2_nine_half = VSET(w_2_nine_half); | |
512 | ||
513 | VPDFT vui, vux, vuy, vuz, vdens; | |
514 | ||
515 | VPDFT vevenPart, voddPart, vdir_indep_trm; | |
516 | ||
517 | VPDFT vomegaEven = VSET(omegaEven); | |
518 | VPDFT vomegaOdd = VSET(omegaOdd); | |
519 | ||
520 | VPDFT vpdf_a, vpdf_b; | |
521 | ||
522 | // Declare pdf_N, pdf_E, pdf_S, pdf_W, ... | |
523 | #define X(name, idx, idxinv, x, y, z) VPDFT JOIN(vpdf_,name); PdfT * JOIN(ppdf_,idx); | |
524 | D3Q19_LIST | |
525 | #undef X | |
526 | ||
527 | PdfT * src = kd->Pdfs[0]; | |
528 | ||
529 | int nThreads = 1; | |
530 | int threadId = 0; | |
531 | ||
532 | #ifdef _OPENMP | |
533 | nThreads = omp_get_max_threads(); | |
534 | threadId = omp_get_thread_num(); | |
535 | #endif | |
536 | ||
537 | const int nodesPlane = gDims[1] * gDims[2]; | |
538 | const int nodesCol = gDims[2]; | |
539 | ||
540 | #define I(x, y, z, dir) P_INDEX_5(gDims, (x), (y), (z), (dir)) | |
541 | ||
542 | // TODO: make inline function out of macros. | |
543 | ||
544 | #define IMPLODE(_x, _y, _z) (nodesPlane * (_x) + nodesCol * (_y) + (_z)) | |
545 | #define EXPLODE(index, _x, _y, _z) _x = index / (nodesPlane); _y = (index - nodesPlane * (_x)) / nodesCol; _z = index - nodesPlane * (_x) - nodesCol * (_y); | |
546 | ||
547 | int startX = oX; | |
548 | int startY = oY; | |
549 | int startZ = oZ; | |
550 | ||
551 | int indexStart = IMPLODE(startX, startY, startZ); | |
552 | int indexEnd = IMPLODE(startX + nX - 1, startY + nY - 1, startZ + nZ - 1); | |
553 | ||
554 | // How many multiples of VSIZE cells (rounded up) do we have? | |
555 | int nVCells = (indexEnd - indexStart + 1 + VSIZE - 1) / VSIZE; | |
556 | ||
557 | int threadStart = nVCells / nThreads * threadId; | |
558 | int threadEnd = nVCells / nThreads * (threadId + 1); | |
559 | ||
560 | if (nVCells % nThreads > threadId) { | |
561 | threadStart += threadId; | |
562 | threadEnd += threadId + 1; | |
563 | } | |
564 | else { | |
565 | threadStart += nVCells % nThreads; | |
566 | threadEnd += nVCells % nThreads; | |
567 | } | |
568 | ||
569 | threadStart *= VSIZE; | |
570 | threadEnd *= VSIZE; | |
571 | ||
572 | // As threadStart/End is now in the granularity of cells we add the start offset. | |
573 | threadStart += indexStart; | |
574 | threadEnd += indexStart; | |
575 | ||
576 | EXPLODE(threadStart, startX, startY, startZ); | |
577 | ||
578 | #undef EXPLODE | |
579 | #undef IMPLODE | |
580 | ||
581 | // printf("o thread %d idx start: %d end: %d thread start: %d end: %d\n", | |
582 | // threadId, indexStart, indexEnd, threadStart, threadEnd); | |
583 | ||
584 | #define X(name, idx, idxinv, _x, _y, _z) JOIN(ppdf_,idx) = &src[I(startX + _x, startY + _y, startZ + _z, idx)]; | |
585 | D3Q19_LIST | |
586 | #undef X | |
587 | ||
588 | #if DEBUG_EXTENDED | |
589 | ||
590 | #define X(name, idx, idxinv, x, y, z) PdfT * JOIN(ppdf_start_,idx), * JOIN(ppdf_end_,idx); | |
591 | D3Q19_LIST | |
592 | #undef X | |
593 | ||
594 | #define X(name, idx, idxinv, _x, _y, _z) JOIN(ppdf_start_,idx) = &src[I(startX + _x, startY + _y, startZ + _z, idx)]; | |
595 | D3Q19_LIST | |
596 | #undef X | |
597 | ||
598 | #define X(name, idx, idxinv, _x, _y, _z) JOIN(ppdf_end_,idx) = &src[I(startX + nX - 1 + _x, startY + nY - 1 + _y, startZ + nZ - 1 + _z, idx)]; | |
599 | D3Q19_LIST | |
600 | #undef X | |
601 | ||
602 | #if 0 | |
603 | #define X(name, idx, idxinv, _x, _y, _z) printf("%2s ppdf_%d = %p (%d %d %d) (%d %d %d)\n", STRINGIFY(name), idx, JOIN(ppdf_,idx), \ | |
604 | startX , startY , startZ , startX + _x, startY + _y, startZ + _z); | |
605 | D3Q19_LIST | |
606 | #undef X | |
607 | #endif | |
608 | ||
609 | #endif // DEBUG_EXTENDED | |
610 | ||
611 | ||
8cafd9ea | 612 | for (int i = threadStart; i < threadEnd; i += VSIZE) { // LOOP aa-vec-sl-odd |
0fde6e45 MW |
613 | |
614 | #if DEBUG_EXTENDED | |
615 | #define X(name, idx, idxinv, _x, _y, _z) Assert((unsigned long)(JOIN(ppdf_,idx)) >= (unsigned long)(JOIN(ppdf_start_,idx))); Assert((unsigned long)(JOIN(ppdf_,idx)) <= (unsigned long)(JOIN(ppdf_end_,idx))); | |
616 | D3Q19_LIST | |
617 | #undef X | |
618 | #endif | |
619 | ||
620 | #define X(name, idx, idxinv, _x, _y, _z) JOIN(vpdf_,name) = VLDU(JOIN(ppdf_,idxinv)); | |
621 | D3Q19_LIST | |
622 | #undef X | |
623 | ||
624 | vux = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_E,VADD(vpdf_NE,vpdf_SE)),VADD(vpdf_TE,vpdf_BE)),vpdf_W),vpdf_NW),vpdf_SW),vpdf_TW),vpdf_BW); | |
625 | vuy = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_N,VADD(vpdf_NE,vpdf_NW)),VADD(vpdf_TN,vpdf_BN)),vpdf_S),vpdf_SE),vpdf_SW),vpdf_TS),vpdf_BS); | |
626 | vuz = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_T,VADD(vpdf_TE,vpdf_TW)),VADD(vpdf_TN,vpdf_TS)),vpdf_B),vpdf_BE),vpdf_BW),vpdf_BN),vpdf_BS); | |
627 | ||
628 | vdens = VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(vpdf_C,VADD(vpdf_N,vpdf_E)),VADD(vpdf_S,vpdf_W)),VADD(vpdf_NE,vpdf_SE)), | |
629 | VADD(vpdf_SW,vpdf_NW)),VADD(vpdf_T,vpdf_TN)),VADD(vpdf_TE,vpdf_TS)),VADD(vpdf_TW,vpdf_B)),VADD(vpdf_BN,vpdf_BE)),VADD(vpdf_BS,vpdf_BW)); | |
630 | ||
631 | vdir_indep_trm = VSUB(vdens,VMUL(VADD(VADD(VMUL(vux,vux),VMUL(vuy,vuy)),VMUL(vuz,vuz)),VTHREE_HALF)); | |
632 | ||
633 | // ppdf_18 is the pointer to the center pdfs. | |
634 | VSTU(ppdf_18, VSUB(vpdf_C,VMUL(vomegaEven,VSUB(vpdf_C,VMUL(vw_0,vdir_indep_trm))))); | |
635 | ||
636 | vw_1_indep = VMUL(vw_1,vdir_indep_trm); | |
637 | vw_2_indep = VMUL(vw_2,vdir_indep_trm); | |
638 | ||
639 | #if defined(LOOP_1) || defined(LOOP_2) | |
640 | #error Loop macros are not allowed to be defined here. | |
641 | #endif | |
642 | ||
643 | #define LOOP_1(_dir1, _dir2, _idx1, _idx2, _vel) \ | |
644 | vui = _vel; \ | |
645 | vpdf_a = JOIN(vpdf_,_dir1); \ | |
646 | vpdf_b = JOIN(vpdf_,_dir2); \ | |
647 | \ | |
648 | vevenPart = VMUL(vomegaEven, VSUB(VSUB(VMUL(VONE_HALF, VADD(vpdf_a, vpdf_b)), VMUL(vui, VMUL(vui, vw_1_nine_half))), vw_1_indep)); \ | |
649 | voddPart = VMUL(vomegaOdd, VSUB( VMUL(VONE_HALF, VSUB(vpdf_a, vpdf_b)), VMUL(vui, vw_1_x3))); \ | |
650 | \ | |
651 | VSTU(JOIN(ppdf_,_idx1), VSUB(VSUB(vpdf_a, vevenPart), voddPart)); \ | |
652 | VSTU(JOIN(ppdf_,_idx2), VADD(VSUB(vpdf_b, vevenPart), voddPart)); | |
653 | ||
654 | #define LOOP_2(_dir1, _dir2, _idx1, _idx2, _expr) \ | |
655 | vui = _expr; \ | |
656 | vpdf_a = JOIN(vpdf_,_dir1); \ | |
657 | vpdf_b = JOIN(vpdf_,_dir2); \ | |
658 | \ | |
659 | vevenPart = VMUL(vomegaEven, VSUB(VSUB(VMUL(VONE_HALF, VADD(vpdf_a, vpdf_b)), VMUL(vui, VMUL(vui, vw_2_nine_half))), vw_2_indep)); \ | |
660 | voddPart = VMUL(vomegaOdd, VSUB( VMUL(VONE_HALF, VSUB(vpdf_a, vpdf_b)), VMUL(vui, vw_2_x3))); \ | |
661 | \ | |
662 | VSTU(JOIN(ppdf_,_idx1), VSUB(VSUB(vpdf_a, vevenPart), voddPart)); \ | |
663 | VSTU(JOIN(ppdf_,_idx2), VADD(VSUB(vpdf_b, vevenPart), voddPart)); | |
664 | ||
665 | ||
666 | LOOP_1(N, S, D3Q19_N, D3Q19_S, vuy); | |
667 | LOOP_1(E, W, D3Q19_E, D3Q19_W, vux); | |
668 | LOOP_1(T, B, D3Q19_T, D3Q19_B, vuz); | |
669 | ||
670 | LOOP_2(NW, SE, D3Q19_NW, D3Q19_SE, VSUB(vuy, vux)); | |
671 | LOOP_2(NE, SW, D3Q19_NE, D3Q19_SW, VADD(vuy, vux)); | |
672 | LOOP_2(TW, BE, D3Q19_TW, D3Q19_BE, VSUB(vuz, vux)); | |
673 | LOOP_2(TE, BW, D3Q19_TE, D3Q19_BW, VADD(vuz, vux)); | |
674 | LOOP_2(TS, BN, D3Q19_TS, D3Q19_BN, VSUB(vuz, vuy)); | |
675 | LOOP_2(TN, BS, D3Q19_TN, D3Q19_BS, VADD(vuz, vuy)); | |
676 | ||
677 | #define X(name, idx, idxinv, _x, _y, _z) JOIN(ppdf_,idx) += VSIZE; | |
678 | D3Q19_LIST | |
679 | #undef X | |
680 | } | |
681 | ||
682 | #undef I | |
683 | ||
684 | return; | |
685 | ||
686 | } // }}} |