Commit | Line | Data |
---|---|---|
0fde6e45 MW |
1 | // -------------------------------------------------------------------------- |
2 | // | |
3 | // Copyright | |
4 | // Markus Wittmann, 2016-2017 | |
5 | // RRZE, University of Erlangen-Nuremberg, Germany | |
6 | // markus.wittmann -at- fau.de or hpc -at- rrze.fau.de | |
7 | // | |
8 | // Viktor Haag, 2016 | |
9 | // LSS, University of Erlangen-Nuremberg, Germany | |
10 | // | |
11 | // This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). | |
12 | // | |
13 | // LbmBenchKernels is free software: you can redistribute it and/or modify | |
14 | // it under the terms of the GNU General Public License as published by | |
15 | // the Free Software Foundation, either version 3 of the License, or | |
16 | // (at your option) any later version. | |
17 | // | |
18 | // LbmBenchKernels is distributed in the hope that it will be useful, | |
19 | // but WITHOUT ANY WARRANTY; without even the implied warranty of | |
20 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
21 | // GNU General Public License for more details. | |
22 | // | |
23 | // You should have received a copy of the GNU General Public License | |
24 | // along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>. | |
25 | // | |
26 | // -------------------------------------------------------------------------- | |
27 | #include "BenchKernelD3Q19AaVecCommon.h" | |
28 | ||
29 | #include "Memory.h" | |
30 | #include "Vtk.h" | |
31 | #include "LikwidIf.h" | |
32 | #include "Vector.h" | |
33 | #include "Vector.h" | |
34 | ||
35 | #include <inttypes.h> | |
36 | #include <math.h> | |
37 | ||
38 | #ifdef _OPENMP | |
39 | #include <omp.h> | |
40 | #endif | |
41 | ||
42 | static void KernelEven(LatticeDesc * ld, KernelData * kd, CaseData * cd); | |
43 | static void KernelOddVecSl(LatticeDesc * ld, KernelData * kd, CaseData * cd); | |
44 | ||
45 | #if 1 // {{{ | |
46 | void DumpPdfs(LatticeDesc * ld, KernelData * kd, int zStart, int zStop, int iter, const char * prefix, int dir) | |
47 | { | |
48 | int * gDims = kd->GlobalDims; | |
49 | ||
50 | int nX = gDims[0]; | |
51 | int nY = gDims[1]; | |
52 | // int nZ = gDims[2]; | |
53 | ||
54 | PdfT pdfs[N_D3Q19]; | |
55 | ||
56 | int localZStart = zStart; | |
57 | int localZStop = zStop; | |
58 | ||
59 | if (localZStart == -1) localZStart = 0; | |
60 | if (localZStop == -1) localZStop = gDims[2] - 1; | |
61 | ||
62 | printf("D iter: %d dir: %d %s\n", iter, dir, D3Q19_NAMES[dir]); | |
63 | ||
64 | // for (int dir = 0; dir < 19; ++dir) { | |
65 | for (int z = localZStop; z >= localZStart; --z) { | |
66 | printf("D [%2d][%2d][%s] plane % 2d\n", iter, dir, prefix, z); | |
67 | ||
68 | for(int y = 0; y < nY; ++y) { | |
69 | // for(int y = 2; y < nY - 2; ++y) { | |
70 | printf("D [%2d][%2d][%s] %2d ", iter, dir, prefix, y); | |
71 | ||
72 | for(int x = 0; x < nX; ++x) { | |
73 | ||
74 | if (1) { // ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) { | |
75 | ||
76 | #define I(x, y, z, dir) P_INDEX_5(gDims, (x), (y), (z), (dir)) | |
77 | pdfs[dir] = kd->PdfsActive[I(x, y, z, dir)]; | |
78 | #undef I | |
79 | } | |
80 | else { | |
81 | pdfs[dir] = -1.0; | |
82 | } | |
83 | ||
84 | printf("%.16e ", pdfs[dir]); | |
85 | // printf("%08.0f ", pdfs[dir]); | |
86 | } | |
87 | ||
88 | printf("\n"); | |
89 | } | |
90 | } | |
91 | // } | |
92 | } | |
93 | #endif // }}} | |
94 | ||
95 | void FNAME(D3Q19AaVecSlKernel)(LatticeDesc * ld, KernelData * kd, CaseData * cd) | |
96 | { | |
97 | Assert(ld != NULL); | |
98 | Assert(kd != NULL); | |
99 | Assert(cd != NULL); | |
100 | ||
101 | Assert(cd->Omega > 0.0); | |
102 | Assert(cd->Omega < 2.0); | |
103 | ||
104 | KernelDataAa * kda = KDA(kd); | |
105 | ||
106 | PdfT * src = kd->PdfsActive; | |
107 | ||
108 | int maxIterations = cd->MaxIterations; | |
109 | ||
110 | #ifdef VTK_OUTPUT | |
111 | if (cd->VtkOutput) { | |
112 | kd->PdfsActive = src; | |
113 | VtkWrite(ld, kd, cd, -1); | |
114 | } | |
115 | #endif | |
116 | ||
117 | #ifdef STATISTICS | |
118 | kd->PdfsActive = src; | |
119 | KernelStatistics(kd, ld, cd, 0); | |
120 | #endif | |
121 | ||
122 | Assert((maxIterations % 2) == 0); | |
123 | ||
124 | #ifdef _OPENMP | |
125 | #pragma omp parallel default(none) shared(kda, kd, ld, cd, src, maxIterations) | |
126 | #endif | |
127 | { | |
128 | for (int iter = 0; iter < maxIterations; iter += 2) { | |
129 | ||
130 | // -------------------------------------------------------------------- | |
131 | // even time step | |
132 | // -------------------------------------------------------------------- | |
133 | ||
134 | X_LIKWID_START("aa-vec-even"); | |
135 | ||
136 | KernelEven(ld, kd, cd); | |
137 | #ifdef _OPENMP | |
138 | #pragma omp barrier | |
139 | #endif | |
140 | ||
141 | X_LIKWID_STOP("aa-vec-even"); | |
142 | ||
143 | // Fixup bounce back PDFs. | |
144 | #ifdef _OPENMP | |
145 | #pragma omp for | |
146 | #endif | |
147 | #ifdef INTEL_OPT_DIRECTIVES | |
148 | #pragma ivdep | |
149 | #endif | |
150 | for (int i = 0; i < kd->nBounceBackPdfs; ++i) { | |
151 | src[kd->BounceBackPdfsSrc[i]] = src[kd->BounceBackPdfsDst[i]]; | |
152 | } | |
153 | ||
154 | #ifdef _OPENMP | |
155 | #pragma omp single | |
156 | #endif | |
157 | { | |
158 | // save current iteration | |
159 | kda->Iteration = iter; | |
160 | ||
161 | #ifdef VERIFICATION | |
162 | kd->PdfsActive = src; | |
163 | KernelAddBodyForce(kd, ld, cd); | |
164 | #endif | |
165 | ||
166 | #ifdef VTK_OUTPUT | |
167 | if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) { | |
168 | kd->PdfsActive = src; | |
169 | VtkWrite(ld, kd, cd, iter); | |
170 | } | |
171 | #endif | |
172 | ||
173 | #ifdef STATISTICS | |
174 | kd->PdfsActive = src; | |
175 | KernelStatistics(kd, ld, cd, iter); | |
176 | #endif | |
177 | } | |
178 | #ifdef _OPENMP | |
179 | #pragma omp barrier | |
180 | #endif | |
181 | ||
182 | ||
183 | // -------------------------------------------------------------------- | |
184 | // odd time step | |
185 | // -------------------------------------------------------------------- | |
186 | ||
187 | X_LIKWID_START("aa-vec-odd"); | |
188 | ||
189 | ||
190 | KernelOddVecSl(ld, kd, cd); | |
191 | #ifdef _OPENMP | |
192 | #pragma omp barrier | |
193 | #endif | |
194 | ||
195 | // Stop counters before bounce back. Else computing loop balance will | |
196 | // be incorrect. | |
197 | ||
198 | X_LIKWID_STOP("aa-vec-odd"); | |
199 | ||
200 | // Fixup bounce back PDFs. | |
201 | #ifdef _OPENMP | |
202 | #pragma omp for | |
203 | #endif | |
204 | #ifdef INTEL_OPT_DIRECTIVES | |
205 | #pragma ivdep | |
206 | #endif | |
207 | for (int i = 0; i < kd->nBounceBackPdfs; ++i) { | |
208 | src[kd->BounceBackPdfsDst[i]] = src[kd->BounceBackPdfsSrc[i]]; | |
209 | } | |
210 | ||
211 | #ifdef _OPENMP | |
212 | #pragma omp single | |
213 | #endif | |
214 | { | |
215 | // save current iteration | |
216 | kda->Iteration = iter + 1; | |
217 | ||
218 | #ifdef VERIFICATION | |
219 | kd->PdfsActive = src; | |
220 | KernelAddBodyForce(kd, ld, cd); | |
221 | #endif | |
222 | ||
223 | #ifdef VTK_OUTPUT | |
224 | if (cd->VtkOutput && ((iter + 1) % cd->VtkModulus) == 0) { | |
225 | kd->PdfsActive = src; | |
226 | VtkWrite(ld, kd, cd, iter + 1); | |
227 | } | |
228 | #endif | |
229 | ||
230 | #ifdef STATISTICS | |
231 | kd->PdfsActive = src; | |
232 | KernelStatistics(kd, ld, cd, iter + 1); | |
233 | #endif | |
234 | } | |
235 | #ifdef _OPENMP | |
236 | #pragma omp barrier | |
237 | #endif | |
238 | } // for (int iter = 0; ... | |
239 | } // omp parallel | |
240 | ||
241 | #ifdef VTK_OUTPUT | |
242 | ||
243 | if (cd->VtkOutput) { | |
244 | kd->PdfsActive = src; | |
245 | VtkWrite(ld, kd, cd, maxIterations); | |
246 | } | |
247 | ||
248 | #endif | |
249 | ||
250 | return; | |
251 | } | |
252 | ||
253 | static void KernelEven(LatticeDesc * ld, KernelData * kd, CaseData * cd) // {{{ | |
254 | { | |
255 | Assert(ld != NULL); | |
256 | Assert(kd != NULL); | |
257 | Assert(cd != NULL); | |
258 | ||
259 | Assert(cd->Omega > F(0.0)); | |
260 | Assert(cd->Omega < F(2.0)); | |
261 | ||
262 | KernelDataAa * kda = KDA(kd); | |
263 | ||
264 | int nX = ld->Dims[0]; | |
265 | int nY = ld->Dims[1]; | |
266 | int nZ = ld->Dims[2]; | |
267 | ||
268 | int * gDims = kd->GlobalDims; | |
269 | ||
270 | int oX = kd->Offsets[0]; | |
271 | int oY = kd->Offsets[1]; | |
272 | int oZ = kd->Offsets[2]; | |
273 | ||
274 | int blk[3]; | |
275 | blk[0] = kda->Blk[0]; | |
276 | blk[1] = kda->Blk[1]; | |
277 | blk[2] = kda->Blk[2]; | |
278 | ||
279 | PdfT omega = cd->Omega; | |
280 | PdfT omegaEven = omega; | |
281 | ||
282 | PdfT magicParam = F(1.0) / F(12.0); | |
283 | PdfT omegaOdd = F(1.0) / (F(0.5) + magicParam / (F(1.0) / omega - F(0.5))); | |
284 | ||
285 | const PdfT w_0 = F(1.0) / F( 3.0); | |
286 | const PdfT w_1 = F(1.0) / F(18.0); | |
287 | const PdfT w_2 = F(1.0) / F(36.0); | |
288 | ||
289 | const PdfT w_1_x3 = w_1 * F(3.0); const PdfT w_1_nine_half = w_1 * F(9.0) / F(2.0); | |
290 | const PdfT w_2_x3 = w_2 * F(3.0); const PdfT w_2_nine_half = w_2 * F(9.0) / F(2.0); | |
291 | ||
292 | ||
293 | VPDFT VONE_HALF = VSET(F(0.5)); | |
294 | VPDFT VTHREE_HALF = VSET(F(3.0) / F(2.0)); | |
295 | ||
296 | VPDFT vw_1_indep, vw_2_indep; | |
297 | VPDFT vw_0 = VSET(w_0); | |
298 | VPDFT vw_1 = VSET(w_1); | |
299 | VPDFT vw_2 = VSET(w_2); | |
300 | ||
301 | VPDFT vw_1_x3 = VSET(w_1_x3); | |
302 | VPDFT vw_2_x3 = VSET(w_2_x3); | |
303 | VPDFT vw_1_nine_half = VSET(w_1_nine_half); | |
304 | VPDFT vw_2_nine_half = VSET(w_2_nine_half); | |
305 | ||
306 | VPDFT vui, vux, vuy, vuz, vdens; | |
307 | ||
308 | VPDFT vevenPart, voddPart, vdir_indep_trm; | |
309 | ||
310 | VPDFT vomegaEven = VSET(omegaEven); | |
311 | VPDFT vomegaOdd = VSET(omegaOdd); | |
312 | ||
313 | VPDFT vpdf_a, vpdf_b; | |
314 | ||
315 | // Declare pdf_N, pdf_E, pdf_S, pdf_W, ... | |
316 | #define X(name, idx, idxinv, x, y, z) VPDFT JOIN(vpdf_,name); PdfT * JOIN(ppdf_,name); | |
317 | D3Q19_LIST | |
318 | #undef X | |
319 | ||
320 | PdfT * src = kd->Pdfs[0]; | |
321 | ||
322 | int nThreads = 1; | |
323 | int threadId = 0; | |
324 | ||
325 | #ifdef _OPENMP | |
326 | nThreads = omp_get_max_threads(); | |
327 | threadId = omp_get_thread_num(); | |
328 | #endif | |
329 | ||
330 | const int nodesPlane = gDims[1] * gDims[2]; | |
331 | const int nodesCol = gDims[2]; | |
332 | ||
333 | #define I(x, y, z, dir) P_INDEX_5(gDims, (x), (y), (z), (dir)) | |
334 | ||
335 | // TODO: make inline function out of macros. | |
336 | ||
337 | #define IMPLODE(_x, _y, _z) (nodesPlane * (_x) + nodesCol * (_y) + (_z)) | |
338 | #define EXPLODE(index, _x, _y, _z) _x = index / (nodesPlane); _y = (index - nodesPlane * (_x)) / nodesCol; _z = index - nodesPlane * (_x) - nodesCol * (_y); | |
339 | ||
340 | int startX = oX; | |
341 | int startY = oY; | |
342 | int startZ = oZ; | |
343 | ||
344 | int indexStart = IMPLODE(startX, startY, startZ); | |
345 | int indexEnd = IMPLODE(startX + nX - 1, startY + nY - 1, startZ + nZ - 1); | |
346 | ||
347 | // How many cells as multiples of VSIZE do we have (rounded up)? | |
348 | int nVCells = (indexEnd - indexStart + 1 + VSIZE - 1) / VSIZE; | |
349 | ||
350 | int threadStart = nVCells / nThreads * threadId; | |
351 | int threadEnd = nVCells / nThreads * (threadId + 1); | |
352 | ||
353 | if (nVCells % nThreads > threadId) { | |
354 | threadStart += threadId; | |
355 | threadEnd += threadId + 1; | |
356 | } | |
357 | else { | |
358 | threadStart += nVCells % nThreads; | |
359 | threadEnd += nVCells % nThreads; | |
360 | } | |
361 | ||
362 | threadStart *= VSIZE; | |
363 | threadEnd *= VSIZE; | |
364 | ||
365 | // As threadStart/End is now in the granularity of cells we add the start offset. | |
366 | threadStart += indexStart; | |
367 | threadEnd += indexStart; | |
368 | ||
369 | EXPLODE(threadStart, startX, startY, startZ); | |
370 | ||
371 | #undef EXPLODE | |
372 | #undef IMPLODE | |
373 | ||
374 | #define X(name, idx, idxinv, _x, _y, _z) JOIN(ppdf_,name) = &src[I(startX, startY, startZ, idx)]; | |
375 | D3Q19_LIST | |
376 | #undef X | |
377 | ||
378 | // printf("e thread %d idx start: %d end: %d thread start: %d end: %d\n", | |
379 | // threadId, indexStart, indexEnd, threadStart, threadEnd); | |
380 | ||
381 | ||
382 | for (int i = threadStart; i < threadEnd; i += VSIZE) { | |
383 | ||
384 | // Load PDFs of local cell: pdf_N = src[I(x, y, z, D3Q19_N)]; ... | |
385 | // #define X(name, idx, idxinv, _x, _y, _z) JOIN(vpdf_,name) = VLDU(&src[I(x, y, z, idx)]); | |
386 | #define X(name, idx, idxinv, _x, _y, _z) JOIN(vpdf_,name) = VLDU(JOIN(ppdf_,name)); | |
387 | D3Q19_LIST | |
388 | #undef X | |
389 | ||
390 | ||
391 | vux = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_E,VADD(vpdf_NE,vpdf_SE)),VADD(vpdf_TE,vpdf_BE)),vpdf_W),vpdf_NW),vpdf_SW),vpdf_TW),vpdf_BW); | |
392 | vuy = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_N,VADD(vpdf_NE,vpdf_NW)),VADD(vpdf_TN,vpdf_BN)),vpdf_S),vpdf_SE),vpdf_SW),vpdf_TS),vpdf_BS); | |
393 | vuz = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_T,VADD(vpdf_TE,vpdf_TW)),VADD(vpdf_TN,vpdf_TS)),vpdf_B),vpdf_BE),vpdf_BW),vpdf_BN),vpdf_BS); | |
394 | ||
395 | vdens = VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(vpdf_C,VADD(vpdf_N,vpdf_E)),VADD(vpdf_S,vpdf_W)),VADD(vpdf_NE,vpdf_SE)), | |
396 | VADD(vpdf_SW,vpdf_NW)),VADD(vpdf_T,vpdf_TN)),VADD(vpdf_TE,vpdf_TS)),VADD(vpdf_TW,vpdf_B)), | |
397 | VADD(vpdf_BN,vpdf_BE)),VADD(vpdf_BS,vpdf_BW)); | |
398 | ||
399 | vdir_indep_trm = VSUB(vdens,VMUL(VADD(VADD(VMUL(vux,vux),VMUL(vuy,vuy)),VMUL(vuz,vuz)),VTHREE_HALF)); | |
400 | ||
401 | VSTU(ppdf_C, VSUB(vpdf_C,VMUL(vomegaEven,VSUB(vpdf_C,VMUL(vw_0,vdir_indep_trm))))); | |
402 | ||
403 | vw_1_indep = VMUL(vw_1,vdir_indep_trm); | |
404 | vw_2_indep = VMUL(vw_2,vdir_indep_trm); | |
405 | ||
406 | #if defined(LOOP_1) || defined(LOOP_2) | |
407 | #error Loop macros are not allowed to be defined here. | |
408 | #endif | |
409 | ||
410 | #define LOOP_1(_dir1, _dir2, _vel) \ | |
411 | vui = _vel; \ | |
412 | vpdf_a = JOIN(vpdf_,_dir1); \ | |
413 | vpdf_b = JOIN(vpdf_,_dir2); \ | |
414 | \ | |
415 | vevenPart = VMUL(vomegaEven, VSUB(VSUB(VMUL(VONE_HALF, VADD(vpdf_a, vpdf_b)), VMUL(vui, VMUL(vui, vw_1_nine_half))), vw_1_indep)); \ | |
416 | voddPart = VMUL(vomegaOdd, VSUB( VMUL(VONE_HALF, VSUB(vpdf_a, vpdf_b)), VMUL(vui, vw_1_x3))); \ | |
417 | \ | |
418 | VSTU(JOIN(ppdf_,_dir2), VSUB(VSUB(vpdf_a, vevenPart), voddPart)); \ | |
419 | VSTU(JOIN(ppdf_,_dir1), VADD(VSUB(vpdf_b, vevenPart), voddPart)); | |
420 | ||
421 | #define LOOP_2(_dir1, _dir2, _expr) \ | |
422 | vui = _expr; \ | |
423 | vpdf_a = JOIN(vpdf_,_dir1); \ | |
424 | vpdf_b = JOIN(vpdf_,_dir2); \ | |
425 | \ | |
426 | vevenPart = VMUL(vomegaEven, VSUB(VSUB(VMUL(VONE_HALF, VADD(vpdf_a, vpdf_b)), VMUL(vui, VMUL(vui, vw_2_nine_half))), vw_2_indep)); \ | |
427 | voddPart = VMUL(vomegaOdd, VSUB( VMUL(VONE_HALF, VSUB(vpdf_a, vpdf_b)), VMUL(vui, vw_2_x3))); \ | |
428 | \ | |
429 | VSTU(JOIN(ppdf_,_dir2), VSUB(VSUB(vpdf_a, vevenPart), voddPart)); \ | |
430 | VSTU(JOIN(ppdf_,_dir1), VADD(VSUB(vpdf_b, vevenPart), voddPart)); | |
431 | ||
432 | LOOP_1(N, S, vuy); | |
433 | LOOP_1(E, W, vux); | |
434 | LOOP_1(T, B, vuz); | |
435 | ||
436 | LOOP_2(NW, SE, VSUB(vuy, vux)); | |
437 | LOOP_2(NE, SW, VADD(vuy, vux)); | |
438 | LOOP_2(TW, BE, VSUB(vuz, vux)); | |
439 | LOOP_2(TE, BW, VADD(vuz, vux)); | |
440 | LOOP_2(TS, BN, VSUB(vuz, vuy)); | |
441 | LOOP_2(TN, BS, VADD(vuz, vuy)); | |
442 | ||
443 | #undef LOOP_1 | |
444 | #undef LOOP_2 | |
445 | ||
446 | #define X(name, idx, idxinv, _x, _y, _z) JOIN(ppdf_,name) += VSIZE; | |
447 | D3Q19_LIST | |
448 | #undef X | |
449 | } | |
450 | ||
451 | #undef I | |
452 | ||
453 | return; | |
454 | } // }}} | |
455 | ||
456 | ||
457 | static void KernelOddVecSl(LatticeDesc * ld, KernelData * kd, CaseData * cd) // {{{ | |
458 | { | |
459 | Assert(ld != NULL); | |
460 | Assert(kd != NULL); | |
461 | Assert(cd != NULL); | |
462 | ||
463 | Assert(cd->Omega > 0.0); | |
464 | Assert(cd->Omega < F(2.0)); | |
465 | ||
466 | KernelDataAa * kda = KDA(kd); | |
467 | ||
468 | int nX = ld->Dims[0]; | |
469 | int nY = ld->Dims[1]; | |
470 | int nZ = ld->Dims[2]; | |
471 | ||
472 | int * gDims = kd->GlobalDims; | |
473 | ||
474 | int oX = kd->Offsets[0]; | |
475 | int oY = kd->Offsets[1]; | |
476 | int oZ = kd->Offsets[2]; | |
477 | ||
478 | int blk[3]; | |
479 | blk[0] = kda->Blk[0]; | |
480 | blk[1] = kda->Blk[1]; | |
481 | blk[2] = kda->Blk[2]; | |
482 | ||
483 | PdfT omega = cd->Omega; | |
484 | PdfT omegaEven = omega; | |
485 | ||
486 | PdfT magicParam = F(1.0) / F(12.0); | |
487 | PdfT omegaOdd = F(1.0) / (F(0.5) + magicParam / (F(1.0) / omega - F(0.5))); | |
488 | ||
489 | const PdfT w_0 = F(1.0) / F( 3.0); | |
490 | const PdfT w_1 = F(1.0) / F(18.0); | |
491 | const PdfT w_2 = F(1.0) / F(36.0); | |
492 | ||
493 | const PdfT w_1_x3 = w_1 * F(3.0); const PdfT w_1_nine_half = w_1 * F(9.0) / F(2.0); | |
494 | const PdfT w_2_x3 = w_2 * F(3.0); const PdfT w_2_nine_half = w_2 * F(9.0) / F(2.0); | |
495 | ||
496 | VPDFT VONE_HALF = VSET(F(0.5)); | |
497 | VPDFT VTHREE_HALF = VSET(F(3.0) / F(2.0)); | |
498 | ||
499 | VPDFT vw_1_indep, vw_2_indep; | |
500 | VPDFT vw_0 = VSET(w_0); | |
501 | VPDFT vw_1 = VSET(w_1); | |
502 | VPDFT vw_2 = VSET(w_2); | |
503 | ||
504 | VPDFT vw_1_x3 = VSET(w_1_x3); | |
505 | VPDFT vw_2_x3 = VSET(w_2_x3); | |
506 | VPDFT vw_1_nine_half = VSET(w_1_nine_half); | |
507 | VPDFT vw_2_nine_half = VSET(w_2_nine_half); | |
508 | ||
509 | VPDFT vui, vux, vuy, vuz, vdens; | |
510 | ||
511 | VPDFT vevenPart, voddPart, vdir_indep_trm; | |
512 | ||
513 | VPDFT vomegaEven = VSET(omegaEven); | |
514 | VPDFT vomegaOdd = VSET(omegaOdd); | |
515 | ||
516 | VPDFT vpdf_a, vpdf_b; | |
517 | ||
518 | // Declare pdf_N, pdf_E, pdf_S, pdf_W, ... | |
519 | #define X(name, idx, idxinv, x, y, z) VPDFT JOIN(vpdf_,name); PdfT * JOIN(ppdf_,idx); | |
520 | D3Q19_LIST | |
521 | #undef X | |
522 | ||
523 | PdfT * src = kd->Pdfs[0]; | |
524 | ||
525 | int nThreads = 1; | |
526 | int threadId = 0; | |
527 | ||
528 | #ifdef _OPENMP | |
529 | nThreads = omp_get_max_threads(); | |
530 | threadId = omp_get_thread_num(); | |
531 | #endif | |
532 | ||
533 | const int nodesPlane = gDims[1] * gDims[2]; | |
534 | const int nodesCol = gDims[2]; | |
535 | ||
536 | #define I(x, y, z, dir) P_INDEX_5(gDims, (x), (y), (z), (dir)) | |
537 | ||
538 | // TODO: make inline function out of macros. | |
539 | ||
540 | #define IMPLODE(_x, _y, _z) (nodesPlane * (_x) + nodesCol * (_y) + (_z)) | |
541 | #define EXPLODE(index, _x, _y, _z) _x = index / (nodesPlane); _y = (index - nodesPlane * (_x)) / nodesCol; _z = index - nodesPlane * (_x) - nodesCol * (_y); | |
542 | ||
543 | int startX = oX; | |
544 | int startY = oY; | |
545 | int startZ = oZ; | |
546 | ||
547 | int indexStart = IMPLODE(startX, startY, startZ); | |
548 | int indexEnd = IMPLODE(startX + nX - 1, startY + nY - 1, startZ + nZ - 1); | |
549 | ||
550 | // How many multiples of VSIZE cells (rounded up) do we have? | |
551 | int nVCells = (indexEnd - indexStart + 1 + VSIZE - 1) / VSIZE; | |
552 | ||
553 | int threadStart = nVCells / nThreads * threadId; | |
554 | int threadEnd = nVCells / nThreads * (threadId + 1); | |
555 | ||
556 | if (nVCells % nThreads > threadId) { | |
557 | threadStart += threadId; | |
558 | threadEnd += threadId + 1; | |
559 | } | |
560 | else { | |
561 | threadStart += nVCells % nThreads; | |
562 | threadEnd += nVCells % nThreads; | |
563 | } | |
564 | ||
565 | threadStart *= VSIZE; | |
566 | threadEnd *= VSIZE; | |
567 | ||
568 | // As threadStart/End is now in the granularity of cells we add the start offset. | |
569 | threadStart += indexStart; | |
570 | threadEnd += indexStart; | |
571 | ||
572 | EXPLODE(threadStart, startX, startY, startZ); | |
573 | ||
574 | #undef EXPLODE | |
575 | #undef IMPLODE | |
576 | ||
577 | // printf("o thread %d idx start: %d end: %d thread start: %d end: %d\n", | |
578 | // threadId, indexStart, indexEnd, threadStart, threadEnd); | |
579 | ||
580 | #define X(name, idx, idxinv, _x, _y, _z) JOIN(ppdf_,idx) = &src[I(startX + _x, startY + _y, startZ + _z, idx)]; | |
581 | D3Q19_LIST | |
582 | #undef X | |
583 | ||
584 | #if DEBUG_EXTENDED | |
585 | ||
586 | #define X(name, idx, idxinv, x, y, z) PdfT * JOIN(ppdf_start_,idx), * JOIN(ppdf_end_,idx); | |
587 | D3Q19_LIST | |
588 | #undef X | |
589 | ||
590 | #define X(name, idx, idxinv, _x, _y, _z) JOIN(ppdf_start_,idx) = &src[I(startX + _x, startY + _y, startZ + _z, idx)]; | |
591 | D3Q19_LIST | |
592 | #undef X | |
593 | ||
594 | #define X(name, idx, idxinv, _x, _y, _z) JOIN(ppdf_end_,idx) = &src[I(startX + nX - 1 + _x, startY + nY - 1 + _y, startZ + nZ - 1 + _z, idx)]; | |
595 | D3Q19_LIST | |
596 | #undef X | |
597 | ||
598 | #if 0 | |
599 | #define X(name, idx, idxinv, _x, _y, _z) printf("%2s ppdf_%d = %p (%d %d %d) (%d %d %d)\n", STRINGIFY(name), idx, JOIN(ppdf_,idx), \ | |
600 | startX , startY , startZ , startX + _x, startY + _y, startZ + _z); | |
601 | D3Q19_LIST | |
602 | #undef X | |
603 | #endif | |
604 | ||
605 | #endif // DEBUG_EXTENDED | |
606 | ||
607 | ||
608 | for (int i = threadStart; i < threadEnd; i += VSIZE) { | |
609 | ||
610 | #if DEBUG_EXTENDED | |
611 | #define X(name, idx, idxinv, _x, _y, _z) Assert((unsigned long)(JOIN(ppdf_,idx)) >= (unsigned long)(JOIN(ppdf_start_,idx))); Assert((unsigned long)(JOIN(ppdf_,idx)) <= (unsigned long)(JOIN(ppdf_end_,idx))); | |
612 | D3Q19_LIST | |
613 | #undef X | |
614 | #endif | |
615 | ||
616 | #define X(name, idx, idxinv, _x, _y, _z) JOIN(vpdf_,name) = VLDU(JOIN(ppdf_,idxinv)); | |
617 | D3Q19_LIST | |
618 | #undef X | |
619 | ||
620 | vux = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_E,VADD(vpdf_NE,vpdf_SE)),VADD(vpdf_TE,vpdf_BE)),vpdf_W),vpdf_NW),vpdf_SW),vpdf_TW),vpdf_BW); | |
621 | vuy = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_N,VADD(vpdf_NE,vpdf_NW)),VADD(vpdf_TN,vpdf_BN)),vpdf_S),vpdf_SE),vpdf_SW),vpdf_TS),vpdf_BS); | |
622 | vuz = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_T,VADD(vpdf_TE,vpdf_TW)),VADD(vpdf_TN,vpdf_TS)),vpdf_B),vpdf_BE),vpdf_BW),vpdf_BN),vpdf_BS); | |
623 | ||
624 | vdens = VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(vpdf_C,VADD(vpdf_N,vpdf_E)),VADD(vpdf_S,vpdf_W)),VADD(vpdf_NE,vpdf_SE)), | |
625 | VADD(vpdf_SW,vpdf_NW)),VADD(vpdf_T,vpdf_TN)),VADD(vpdf_TE,vpdf_TS)),VADD(vpdf_TW,vpdf_B)),VADD(vpdf_BN,vpdf_BE)),VADD(vpdf_BS,vpdf_BW)); | |
626 | ||
627 | vdir_indep_trm = VSUB(vdens,VMUL(VADD(VADD(VMUL(vux,vux),VMUL(vuy,vuy)),VMUL(vuz,vuz)),VTHREE_HALF)); | |
628 | ||
629 | // ppdf_18 is the pointer to the center pdfs. | |
630 | VSTU(ppdf_18, VSUB(vpdf_C,VMUL(vomegaEven,VSUB(vpdf_C,VMUL(vw_0,vdir_indep_trm))))); | |
631 | ||
632 | vw_1_indep = VMUL(vw_1,vdir_indep_trm); | |
633 | vw_2_indep = VMUL(vw_2,vdir_indep_trm); | |
634 | ||
635 | #if defined(LOOP_1) || defined(LOOP_2) | |
636 | #error Loop macros are not allowed to be defined here. | |
637 | #endif | |
638 | ||
639 | #define LOOP_1(_dir1, _dir2, _idx1, _idx2, _vel) \ | |
640 | vui = _vel; \ | |
641 | vpdf_a = JOIN(vpdf_,_dir1); \ | |
642 | vpdf_b = JOIN(vpdf_,_dir2); \ | |
643 | \ | |
644 | vevenPart = VMUL(vomegaEven, VSUB(VSUB(VMUL(VONE_HALF, VADD(vpdf_a, vpdf_b)), VMUL(vui, VMUL(vui, vw_1_nine_half))), vw_1_indep)); \ | |
645 | voddPart = VMUL(vomegaOdd, VSUB( VMUL(VONE_HALF, VSUB(vpdf_a, vpdf_b)), VMUL(vui, vw_1_x3))); \ | |
646 | \ | |
647 | VSTU(JOIN(ppdf_,_idx1), VSUB(VSUB(vpdf_a, vevenPart), voddPart)); \ | |
648 | VSTU(JOIN(ppdf_,_idx2), VADD(VSUB(vpdf_b, vevenPart), voddPart)); | |
649 | ||
650 | #define LOOP_2(_dir1, _dir2, _idx1, _idx2, _expr) \ | |
651 | vui = _expr; \ | |
652 | vpdf_a = JOIN(vpdf_,_dir1); \ | |
653 | vpdf_b = JOIN(vpdf_,_dir2); \ | |
654 | \ | |
655 | vevenPart = VMUL(vomegaEven, VSUB(VSUB(VMUL(VONE_HALF, VADD(vpdf_a, vpdf_b)), VMUL(vui, VMUL(vui, vw_2_nine_half))), vw_2_indep)); \ | |
656 | voddPart = VMUL(vomegaOdd, VSUB( VMUL(VONE_HALF, VSUB(vpdf_a, vpdf_b)), VMUL(vui, vw_2_x3))); \ | |
657 | \ | |
658 | VSTU(JOIN(ppdf_,_idx1), VSUB(VSUB(vpdf_a, vevenPart), voddPart)); \ | |
659 | VSTU(JOIN(ppdf_,_idx2), VADD(VSUB(vpdf_b, vevenPart), voddPart)); | |
660 | ||
661 | ||
662 | LOOP_1(N, S, D3Q19_N, D3Q19_S, vuy); | |
663 | LOOP_1(E, W, D3Q19_E, D3Q19_W, vux); | |
664 | LOOP_1(T, B, D3Q19_T, D3Q19_B, vuz); | |
665 | ||
666 | LOOP_2(NW, SE, D3Q19_NW, D3Q19_SE, VSUB(vuy, vux)); | |
667 | LOOP_2(NE, SW, D3Q19_NE, D3Q19_SW, VADD(vuy, vux)); | |
668 | LOOP_2(TW, BE, D3Q19_TW, D3Q19_BE, VSUB(vuz, vux)); | |
669 | LOOP_2(TE, BW, D3Q19_TE, D3Q19_BW, VADD(vuz, vux)); | |
670 | LOOP_2(TS, BN, D3Q19_TS, D3Q19_BN, VSUB(vuz, vuy)); | |
671 | LOOP_2(TN, BS, D3Q19_TN, D3Q19_BS, VADD(vuz, vuy)); | |
672 | ||
673 | #define X(name, idx, idxinv, _x, _y, _z) JOIN(ppdf_,idx) += VSIZE; | |
674 | D3Q19_LIST | |
675 | #undef X | |
676 | } | |
677 | ||
678 | #undef I | |
679 | ||
680 | return; | |
681 | ||
682 | } // }}} |