add single precision, add aa-vec-sl-soa kernel, updated doc
[LbmBenchmarkKernelsPublic.git] / src / BenchKernelD3Q19.c
CommitLineData
10988083
MW
1// --------------------------------------------------------------------------
2//
3// Copyright
4// Markus Wittmann, 2016-2017
5// RRZE, University of Erlangen-Nuremberg, Germany
6// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
7//
8// Viktor Haag, 2016
9// LSS, University of Erlangen-Nuremberg, Germany
10//
11// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
12//
13// LbmBenchKernels is free software: you can redistribute it and/or modify
14// it under the terms of the GNU General Public License as published by
15// the Free Software Foundation, either version 3 of the License, or
16// (at your option) any later version.
17//
18// LbmBenchKernels is distributed in the hope that it will be useful,
19// but WITHOUT ANY WARRANTY; without even the implied warranty of
20// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21// GNU General Public License for more details.
22//
23// You should have received a copy of the GNU General Public License
24// along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>.
25//
26// --------------------------------------------------------------------------
27#include "BenchKernelD3Q19Common.h"
28
29#include "Memory.h"
30#include "Vtk.h"
e3f82424 31#include "LikwidIf.h"
10988083
MW
32
33#include <inttypes.h>
34#include <math.h>
35
36#ifdef _OPENMP
37 #include <omp.h>
38#endif
39
40void FNAME(D3Q19Kernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd)
41{
42 Assert(ld != NULL);
43 Assert(kernelData != NULL);
44 Assert(cd != NULL);
45
0fde6e45
MW
46 Assert(cd->Omega > F(0.0));
47 Assert(cd->Omega < F(2.0));
10988083
MW
48
49 KernelData * kd = (KernelData *)kernelData;
50
51
52 int nX = ld->Dims[0];
53 int nY = ld->Dims[1];
54 int nZ = ld->Dims[2];
55
56 int * gDims = kd->GlobalDims;
57
58 int oX = kd->Offsets[0];
59 int oY = kd->Offsets[1];
60 int oZ = kd->Offsets[2];
61
62 PdfT omega = cd->Omega;
63 PdfT omegaEven = omega;
0fde6e45
MW
64 // PdfT omegaOdd = 8.0*((F(2.0)-omegaEven)/(8.0-omegaEven)); //"standard" trt odd relaxation parameter
65 PdfT magicParam = F(1.0) / F(12.0);
66 // 1/ 4: best stability;
67 // 1/12: removes third-order advection error (best advection);
68 // 1/ 6: removes fourth-order diffusion error (best diffusion);
69 // 3/16: exact location of bounce back for poiseuille flow
70 PdfT omegaOdd = F(1.0) / (F(0.5) + magicParam / (F(1.0) / omega - F(0.5)));
10988083 71
0fde6e45
MW
72 PdfT evenPart = F(0.0);
73 PdfT oddPart = F(0.0);
74 PdfT dir_indep_trm = F(0.0);
10988083 75
0fde6e45
MW
76 PdfT w_0 = F(1.0) / F( 3.0);
77 PdfT w_1 = F(1.0) / F(18.0);
78 PdfT w_2 = F(1.0) / F(36.0);
10988083 79
0fde6e45
MW
80 PdfT w_1_x3 = w_1 * F(3.0); PdfT w_1_nine_half = w_1 * F(9.0)/F(2.0); PdfT w_1_indep = F(0.0);
81 PdfT w_2_x3 = w_2 * F(3.0); PdfT w_2_nine_half = w_2 * F(9.0)/F(2.0); PdfT w_2_indep = F(0.0);
10988083
MW
82
83 PdfT ux, uy, uz, ui;
84 PdfT dens;
85
86 // Declare pdf_N, pdf_E, pdf_S, pdf_W, ...
87 #define X(name, idx, idxinv, x, y, z) PdfT JOIN(pdf_,name);
88 D3Q19_LIST
89 #undef X
90
91 PdfT * src = kd->Pdfs[0];
92 PdfT * dst = kd->Pdfs[1];
93 PdfT * tmp;
94
95 int maxIterations = cd->MaxIterations;
96
97 #ifdef VTK_OUTPUT
98 if (cd->VtkOutput) {
99 kd->PdfsActive = src;
100 VtkWrite(ld, kd, cd, 0);
101 }
102 #endif
103
104 for (int iter = 0; iter < maxIterations; ++iter) {
105
e3f82424
MW
106 X_LIKWID_START("os");
107
10988083 108 #ifdef _OPENMP
0fde6e45 109 #pragma omp parallel for collapse(2) default(none) \
10988083
MW
110 shared(gDims,src, dst, w_0, w_1, w_2, omegaEven, omegaOdd, \
111 w_1_x3, w_2_x3, w_1_nine_half, w_2_nine_half, cd, \
112 oX, oY, oZ, nX, nY, nZ) \
113 private(ux, uy, uz, ui, dens, dir_indep_trm, \
114 pdf_C, \
115 pdf_N, pdf_E, pdf_S, pdf_W, \
116 pdf_NE, pdf_SE, pdf_SW, pdf_NW, \
117 pdf_T, pdf_TN, pdf_TE, pdf_TS, pdf_TW, \
118 pdf_B, pdf_BN, pdf_BE, pdf_BS, pdf_BW, \
119 evenPart, oddPart, w_1_indep, w_2_indep)
120 #endif
e3f82424 121 for (int x = oX; x < nX + oX; ++x) {
10988083 122 for (int y = oY; y < nY + oY; ++y) {
0fde6e45
MW
123 #ifdef INTEL_OPT_DIRECTIVES
124 #pragma ivdep
125 #pragma vector always
126 #pragma simd
127 #endif
e3f82424 128 for (int z = oZ; z < nZ + oZ; ++z) {
10988083
MW
129 #define I(x, y, z, dir) P_INDEX_5(gDims, (x), (y), (z), (dir))
130
131#ifdef PROP_MODEL_PUSH
132
133 // Load PDFs of local cell: pdf_N = src[I(x, y, z, D3Q19_N)]; ...
134 #define X(name, idx, idxinv, _x, _y, _z) JOIN(pdf_,name) = src[I(x, y, z, idx)];
135 //if (isnan(JOIN(pdf_,name))) { printf("iter: %d %d %d %d %d %s nan\n", iter, x-oX, y-oY, z-oZ, idx, D3Q19_NAMES[idx]); exit(1);}
136 D3Q19_LIST
137 #undef X
138
139#elif PROP_MODEL_PULL
140
141 // Load PDFs of local cell: pdf_N = src[I(x, y, z, D3Q19_N)]; ...
142 #define X(name, idx, idxinv, _x, _y, _z) JOIN(pdf_,name) = src[I(x - _x, y - _y, z - _z, idx)];
143 //if (isnan(JOIN(pdf_,name))) { printf("iter: %d %d %d %d %d %s nan\n", iter, x-oX, y-oY, z-oZ, idx, D3Q19_NAMES[idx]); exit(1);}
144 D3Q19_LIST
145 #undef X
146
147#else
148 #error No implementation for PROP_MODEL_NAME.
149#endif
150
151 // #define LID_DRIVEN_CAVITY
152
153 #ifdef LID_DRIVEN_CAVITY
154
155 if (z == nZ - 4 + oZ && x > 3 + oX && x < (nX - 4 + oX) && y > 3 + oY && y < (nY - 4 + oY)) {
0fde6e45
MW
156 ux = F(0.1 * 0.577);
157 uy = F(0.0);
158 uz = F(0.0);
10988083
MW
159
160 } else {
161 #endif
162 ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE -
163 pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW;
164 uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN -
165 pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS;
166 uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS -
167 pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS;
168 #ifdef LID_DRIVEN_CAVITY
169 }
170
171 #endif
172
173 dens = pdf_C +
174 pdf_N + pdf_E + pdf_S + pdf_W +
175 pdf_NE + pdf_SE + pdf_SW + pdf_NW +
176 pdf_T + pdf_TN + pdf_TE + pdf_TS + pdf_TW +
177 pdf_B + pdf_BN + pdf_BE + pdf_BS + pdf_BW;
178
0fde6e45 179 dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz) * F(3.0) / F(2.0);
10988083
MW
180
181#ifdef PROP_MODEL_PUSH
182
183 // direction: w_0
184 dst[I(x, y, z, D3Q19_C)] = pdf_C - omegaEven*(pdf_C - w_0*dir_indep_trm);
185
186 // direction: w_1
187 w_1_indep = w_1*dir_indep_trm;
188
189 ui = uy;
0fde6e45
MW
190 evenPart = omegaEven*( F(0.5)*(pdf_N + pdf_S) - ui*ui*w_1_nine_half - w_1_indep );
191 oddPart = omegaOdd*(F(0.5)*(pdf_N - pdf_S) - ui*w_1_x3 );
10988083
MW
192 dst[I(x, y + 1, z, D3Q19_N)] = pdf_N - evenPart - oddPart;
193 dst[I(x, y - 1, z, D3Q19_S)] = pdf_S - evenPart + oddPart;
194
195 ui = ux;
0fde6e45
MW
196 evenPart = omegaEven*( F(0.5)*(pdf_E + pdf_W) - ui*ui*w_1_nine_half - w_1_indep );
197 oddPart = omegaOdd*(F(0.5)*(pdf_E - pdf_W) - ui*w_1_x3 );
10988083
MW
198 dst[I(x + 1, y, z, D3Q19_E)] = pdf_E - evenPart - oddPart;
199 dst[I(x - 1, y, z, D3Q19_W)] = pdf_W - evenPart + oddPart;
200
201 ui = uz;
0fde6e45
MW
202 evenPart = omegaEven*( F(0.5)*(pdf_T + pdf_B) - ui*ui*w_1_nine_half - w_1_indep );
203 oddPart = omegaOdd*(F(0.5)*(pdf_T - pdf_B) - ui*w_1_x3 );
10988083
MW
204 dst[I(x, y, z + 1, D3Q19_T)] = pdf_T - evenPart - oddPart;
205 dst[I(x, y, z - 1, D3Q19_B)] = pdf_B - evenPart + oddPart;
206
207 // direction: w_2
208 w_2_indep = w_2*dir_indep_trm;
209
210 ui = -ux + uy;
0fde6e45
MW
211 evenPart = omegaEven*( F(0.5)*(pdf_NW + pdf_SE) - ui*ui*w_2_nine_half - w_2_indep );
212 oddPart = omegaOdd*(F(0.5)*(pdf_NW - pdf_SE) - ui*w_2_x3 );
10988083
MW
213 dst[I(x - 1, y + 1, z, D3Q19_NW)] = pdf_NW - evenPart - oddPart;
214 dst[I(x + 1, y - 1, z, D3Q19_SE)] = pdf_SE - evenPart + oddPart;
215
216 ui = ux + uy;
0fde6e45
MW
217 evenPart = omegaEven*( F(0.5)*(pdf_NE + pdf_SW) - ui*ui*w_2_nine_half - w_2_indep );
218 oddPart = omegaOdd*(F(0.5)*(pdf_NE - pdf_SW) - ui*w_2_x3 );
10988083
MW
219 dst[I(x + 1, y + 1, z, D3Q19_NE)] = pdf_NE - evenPart - oddPart;
220 dst[I(x - 1, y - 1, z, D3Q19_SW)] = pdf_SW - evenPart + oddPart;
221
222 ui = -ux + uz;
0fde6e45
MW
223 evenPart = omegaEven*( F(0.5)*(pdf_TW + pdf_BE) - ui*ui*w_2_nine_half - w_2_indep );
224 oddPart = omegaOdd*(F(0.5)*(pdf_TW - pdf_BE) - ui*w_2_x3 );
10988083
MW
225 dst[I(x - 1, y, z + 1, D3Q19_TW)] = pdf_TW - evenPart - oddPart;
226 dst[I(x + 1, y, z - 1, D3Q19_BE)] = pdf_BE - evenPart + oddPart;
227
228 ui = ux + uz;
0fde6e45
MW
229 evenPart = omegaEven*( F(0.5)*(pdf_TE + pdf_BW) - ui*ui*w_2_nine_half - w_2_indep );
230 oddPart = omegaOdd*(F(0.5)*(pdf_TE - pdf_BW) - ui*w_2_x3 );
10988083
MW
231 dst[I(x + 1, y, z + 1, D3Q19_TE)] = pdf_TE - evenPart - oddPart;
232 dst[I(x - 1, y, z - 1, D3Q19_BW)] = pdf_BW - evenPart + oddPart;
233
234 ui = -uy + uz;
0fde6e45
MW
235 evenPart = omegaEven*( F(0.5)*(pdf_TS + pdf_BN) - ui*ui*w_2_nine_half - w_2_indep );
236 oddPart = omegaOdd*(F(0.5)*(pdf_TS - pdf_BN) - ui*w_2_x3 );
10988083
MW
237 dst[I(x, y - 1, z + 1, D3Q19_TS)] = pdf_TS - evenPart - oddPart;
238 dst[I(x, y + 1, z - 1, D3Q19_BN)] = pdf_BN - evenPart + oddPart;
239
240 ui = uy + uz;
0fde6e45
MW
241 evenPart = omegaEven*( F(0.5)*(pdf_TN + pdf_BS) - ui*ui*w_2_nine_half - w_2_indep );
242 oddPart = omegaOdd*(F(0.5)*(pdf_TN - pdf_BS) - ui*w_2_x3 );
10988083
MW
243 dst[I(x, y + 1, z + 1, D3Q19_TN)] = pdf_TN - evenPart - oddPart;
244 dst[I(x, y - 1, z - 1, D3Q19_BS)] = pdf_BS - evenPart + oddPart;
245
246#elif PROP_MODEL_PULL
247
248 // direction: w_0
249 dst[I(x, y, z, D3Q19_C)] = pdf_C - omegaEven*(pdf_C - w_0*dir_indep_trm);
250
251 // direction: w_1
252 w_1_indep = w_1*dir_indep_trm;
253
254 ui = uy;
0fde6e45
MW
255 evenPart = omegaEven*( F(0.5)*(pdf_N + pdf_S) - ui*ui*w_1_nine_half - w_1_indep );
256 oddPart = omegaOdd*(F(0.5)*(pdf_N - pdf_S) - ui*w_1_x3 );
10988083
MW
257 dst[I(x, y, z, D3Q19_N)] = pdf_N - evenPart - oddPart;
258 dst[I(x, y, z, D3Q19_S)] = pdf_S - evenPart + oddPart;
259
260 ui = ux;
0fde6e45
MW
261 evenPart = omegaEven*( F(0.5)*(pdf_E + pdf_W) - ui*ui*w_1_nine_half - w_1_indep );
262 oddPart = omegaOdd*(F(0.5)*(pdf_E - pdf_W) - ui*w_1_x3 );
10988083
MW
263 dst[I(x, y, z, D3Q19_E)] = pdf_E - evenPart - oddPart;
264 dst[I(x, y, z, D3Q19_W)] = pdf_W - evenPart + oddPart;
265
266 ui = uz;
0fde6e45
MW
267 evenPart = omegaEven*( F(0.5)*(pdf_T + pdf_B) - ui*ui*w_1_nine_half - w_1_indep );
268 oddPart = omegaOdd*(F(0.5)*(pdf_T - pdf_B) - ui*w_1_x3 );
10988083
MW
269 dst[I(x, y, z, D3Q19_T)] = pdf_T - evenPart - oddPart;
270 dst[I(x, y, z, D3Q19_B)] = pdf_B - evenPart + oddPart;
271
272 // direction: w_2
273 w_2_indep = w_2*dir_indep_trm;
274
275 ui = -ux + uy;
0fde6e45
MW
276 evenPart = omegaEven*( F(0.5)*(pdf_NW + pdf_SE) - ui*ui*w_2_nine_half - w_2_indep );
277 oddPart = omegaOdd*(F(0.5)*(pdf_NW - pdf_SE) - ui*w_2_x3 );
10988083
MW
278 dst[I(x, y, z, D3Q19_NW)] = pdf_NW - evenPart - oddPart;
279 dst[I(x, y, z, D3Q19_SE)] = pdf_SE - evenPart + oddPart;
280
281 ui = ux + uy;
0fde6e45
MW
282 evenPart = omegaEven*( F(0.5)*(pdf_NE + pdf_SW) - ui*ui*w_2_nine_half - w_2_indep );
283 oddPart = omegaOdd*(F(0.5)*(pdf_NE - pdf_SW) - ui*w_2_x3 );
10988083
MW
284 dst[I(x, y, z, D3Q19_NE)] = pdf_NE - evenPart - oddPart;
285 dst[I(x, y, z, D3Q19_SW)] = pdf_SW - evenPart + oddPart;
286
287 ui = -ux + uz;
0fde6e45
MW
288 evenPart = omegaEven*( F(0.5)*(pdf_TW + pdf_BE) - ui*ui*w_2_nine_half - w_2_indep );
289 oddPart = omegaOdd*(F(0.5)*(pdf_TW - pdf_BE) - ui*w_2_x3 );
10988083
MW
290 dst[I(x, y, z, D3Q19_TW)] = pdf_TW - evenPart - oddPart;
291 dst[I(x, y, z, D3Q19_BE)] = pdf_BE - evenPart + oddPart;
292
293 ui = ux + uz;
0fde6e45
MW
294 evenPart = omegaEven*( F(0.5)*(pdf_TE + pdf_BW) - ui*ui*w_2_nine_half - w_2_indep );
295 oddPart = omegaOdd*(F(0.5)*(pdf_TE - pdf_BW) - ui*w_2_x3 );
10988083
MW
296 dst[I(x, y, z, D3Q19_TE)] = pdf_TE - evenPart - oddPart;
297 dst[I(x, y, z, D3Q19_BW)] = pdf_BW - evenPart + oddPart;
298
299 ui = -uy + uz;
0fde6e45
MW
300 evenPart = omegaEven*( F(0.5)*(pdf_TS + pdf_BN) - ui*ui*w_2_nine_half - w_2_indep );
301 oddPart = omegaOdd*(F(0.5)*(pdf_TS - pdf_BN) - ui*w_2_x3 );
10988083
MW
302 dst[I(x, y, z, D3Q19_TS)] = pdf_TS - evenPart - oddPart;
303 dst[I(x, y, z, D3Q19_BN)] = pdf_BN - evenPart + oddPart;
304
305 ui = uy + uz;
0fde6e45
MW
306 evenPart = omegaEven*( F(0.5)*(pdf_TN + pdf_BS) - ui*ui*w_2_nine_half - w_2_indep );
307 oddPart = omegaOdd*(F(0.5)*(pdf_TN - pdf_BS) - ui*w_2_x3 );
10988083
MW
308 dst[I(x, y, z, D3Q19_TN)] = pdf_TN - evenPart - oddPart;
309 dst[I(x, y, z, D3Q19_BS)] = pdf_BS - evenPart + oddPart;
310
311#else
312 #error No implementation for PROP_MODEL_NAME.
313#endif
314
315 #undef I
316 }
317 }
318 } // z, y, x (from inner to outer)
319
e3f82424
MW
320 // Stop counters before bounce back. Else computing loop balance will be incorrect.
321 X_LIKWID_STOP("os");
322
10988083
MW
323 // Fixup bounce back PDFs.
324 #ifdef _OPENMP
325 #pragma omp parallel for default(none) \
326 shared(kd, dst)
327 #endif
328 for (int i = 0; i < kd->nBounceBackPdfs; ++i) {
329 dst[kd->BounceBackPdfsDst[i]] = dst[kd->BounceBackPdfsSrc[i]];
330 }
331
332 #ifdef VERIFICATION
333 kd->PdfsActive = dst;
334 KernelAddBodyForce(kd, ld, cd);
335 #endif
336
337 #ifdef VTK_OUTPUT
338
339 if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) {
340 kd->PdfsActive = dst;
341 VtkWrite(ld, kd, cd, iter);
342 }
343
344 #endif
345
346 #ifdef STATISTICS
347 kd->PdfsActive = dst;
348 KernelStatistics(kd, ld, cd, iter);
349 #endif
350
351 // swap grids
352 tmp = src;
353 src = dst;
354 dst = tmp;
355
356 } // for (int iter = 0; ...
357
358 #ifdef VTK_OUTPUT
359
360 if (cd->VtkOutput) {
361 kd->PdfsActive = src;
362 VtkWrite(ld, kd, cd, maxIterations);
363 }
364
365 #endif
366
367 return;
368}
369
370
371void FNAME(D3Q19BlkKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd)
372{
373 Assert(ld != NULL);
374 Assert(kernelData != NULL);
375 Assert(cd != NULL);
376
0fde6e45
MW
377 Assert(cd->Omega > F(0.0));
378 Assert(cd->Omega < F(2.0));
10988083
MW
379
380 KernelData * kd = (KernelData *)kernelData;
381
382
383 int nX = ld->Dims[0];
384 int nY = ld->Dims[1];
385 int nZ = ld->Dims[2];
386
387 int * gDims = kd->GlobalDims;
388
389 int oX = kd->Offsets[0];
390 int oY = kd->Offsets[1];
391 int oZ = kd->Offsets[2];
392
393 KernelDataEx * kdex = (KernelDataEx *)kd;
394
395 int blk[3];
396 blk[0] = kdex->Blk[0];
397 blk[1] = kdex->Blk[1];
398 blk[2] = kdex->Blk[2];
399
400 PdfT omega = cd->Omega;
401 PdfT omegaEven = omega;
0fde6e45
MW
402 // PdfT omegaOdd = 8.0*((F(2.0)-omegaEven)/(8.0-omegaEven)); //"standard" trt odd relaxation parameter
403 PdfT magicParam = F(1.0)/F(12.0);
404 // 1/ 4: best stability;
405 // 1/12: removes third-order advection error (best advection);
406 // 1/ 6: removes fourth-order diffusion error (best diffusion);
407 // 3/16: exact location of bounce back for poiseuille flow
408 PdfT omegaOdd = F(1.0) / (F(0.5) + magicParam / (F(1.0) / omega - F(0.5)));
10988083 409
0fde6e45
MW
410 PdfT evenPart = F(0.0);
411 PdfT oddPart = F(0.0);
412 PdfT dir_indep_trm = F(0.0);
10988083 413
0fde6e45
MW
414 PdfT w_0 = F(1.0) / F( 3.0);
415 PdfT w_1 = F(1.0) / F(18.0);
416 PdfT w_2 = F(1.0) / F(36.0);
10988083 417
0fde6e45
MW
418 PdfT w_1_x3 = w_1 * F(3.0); PdfT w_1_nine_half = w_1 * F(9.0)/F(2.0); PdfT w_1_indep = F(0.0);
419 PdfT w_2_x3 = w_2 * F(3.0); PdfT w_2_nine_half = w_2 * F(9.0)/F(2.0); PdfT w_2_indep = F(0.0);
10988083
MW
420
421 PdfT ux, uy, uz, ui;
422 PdfT dens;
423
424 // Declare pdf_N, pdf_E, pdf_S, pdf_W, ...
425 #define X(name, idx, idxinv, x, y, z) PdfT JOIN(pdf_,name);
426 D3Q19_LIST
427 #undef X
428
429 PdfT * src = kd->Pdfs[0];
430 PdfT * dst = kd->Pdfs[1];
431 PdfT * tmp;
432
433 int maxIterations = cd->MaxIterations;
434
435 #ifdef VTK_OUTPUT
436 if (cd->VtkOutput) {
437 kd->PdfsActive = src;
438 VtkWrite(ld, kd, cd, 0);
439 }
440 #endif
441
442 int nThreads = 1;
443
444 #ifdef _OPENMP
445 nThreads = omp_get_max_threads();
446 #endif
447
448 for (int iter = 0; iter < maxIterations; ++iter) {
449
10988083 450 #ifdef _OPENMP
e3f82424 451 #pragma omp parallel default(none) \
10988083
MW
452 shared(gDims,src, dst, w_0, w_1, w_2, omegaEven, omegaOdd, \
453 w_1_x3, w_2_x3, w_1_nine_half, w_2_nine_half, cd, \
454 oX, oY, oZ, nX, nY, nZ, blk, nThreads) \
455 private(ux, uy, uz, ui, dens, dir_indep_trm, \
456 pdf_C, \
457 pdf_N, pdf_E, pdf_S, pdf_W, \
458 pdf_NE, pdf_SE, pdf_SW, pdf_NW, \
459 pdf_T, pdf_TN, pdf_TE, pdf_TS, pdf_TW, \
460 pdf_B, pdf_BN, pdf_BE, pdf_BS, pdf_BW, \
461 evenPart, oddPart, w_1_indep, w_2_indep)
462 #endif
e3f82424
MW
463 {
464 X_LIKWID_START("blk-os");
10988083 465
e3f82424 466 int threadId = omp_get_thread_num();
10988083 467
e3f82424
MW
468 int threadStartX = nX / nThreads * threadId;
469 int threadEndX = nX / nThreads * (threadId + 1);
10988083
MW
470
471 if (nX % nThreads > 0) {
e3f82424
MW
472 if (nX % nThreads > threadId) {
473 threadStartX += threadId;
474 threadEndX += threadId + 1;
10988083
MW
475 }
476 else {
477 threadStartX += nX % nThreads;
478 threadEndX += nX % nThreads;
479 }
480 }
481
e3f82424 482 for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) {
10988083 483 for (int bY = oY; bY < nY + oY; bY += blk[1]) {
e3f82424 484 for (int bZ = oZ; bZ < nZ + oZ; bZ += blk[2]) {
10988083
MW
485
486 // Must do everything here, else it would break collapse.
487 int eZ = MIN(bZ + blk[2], nZ + oZ);
488 int eY = MIN(bY + blk[1], nY + oY);
489 int eX = MIN(bX + blk[0], threadEndX + oX);
490
e3f82424 491 for (int x = bX; x < eX; ++x) {
10988083 492 for (int y = bY; y < eY; ++y) {
0fde6e45
MW
493 #ifdef INTEL_OPT_DIRECTIVES
494 #pragma ivdep
495 #pragma vector always
496 #pragma simd
497 #endif
e3f82424 498 for (int z = bZ; z < eZ; ++z) {
10988083
MW
499
500 #define I(x, y, z, dir) P_INDEX_5(gDims, (x), (y), (z), (dir))
501
502#ifdef PROP_MODEL_PUSH
503
504 // Load PDFs of local cell: pdf_N = src[I(x, y, z, D3Q19_N)]; ...
505 #define X(name, idx, idxinv, _x, _y, _z) JOIN(pdf_,name) = src[I(x, y, z, idx)];
10988083
MW
506 D3Q19_LIST
507 #undef X
508
509#elif PROP_MODEL_PULL
510
511 // Load PDFs of local cell: pdf_N = src[I(x, y, z, D3Q19_N)]; ...
512 #define X(name, idx, idxinv, _x, _y, _z) JOIN(pdf_,name) = src[I(x - _x, y - _y, z - _z, idx)];
10988083
MW
513 D3Q19_LIST
514 #undef X
515
516#else
517 #error No implementation for PROP_MODEL_NAME.
518#endif
519
520 // #define LID_DRIVEN_CAVITY
521
522 #ifdef LID_DRIVEN_CAVITY
523
524 if (z == nZ - 4 + oZ && x > 3 + oX && x < (nX - 4 + oX) && y > 3 + oY && y < (nY - 4 + oY)) {
0fde6e45 525 ux = 0.1 * F(0.5)77;
10988083
MW
526 uy = 0.0;
527 uz = 0.0;
528
529 } else {
530 #endif
531 ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE -
e3f82424 532 pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW;
10988083 533 uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN -
e3f82424 534 pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS;
10988083 535 uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS -
e3f82424 536 pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS;
10988083
MW
537 #ifdef LID_DRIVEN_CAVITY
538 }
539
540 #endif
541
542 dens = pdf_C +
543 pdf_N + pdf_E + pdf_S + pdf_W +
544 pdf_NE + pdf_SE + pdf_SW + pdf_NW +
545 pdf_T + pdf_TN + pdf_TE + pdf_TS + pdf_TW +
546 pdf_B + pdf_BN + pdf_BE + pdf_BS + pdf_BW;
547
0fde6e45 548 dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz) * F(3.0) / F(2.0);
10988083
MW
549
550#ifdef PROP_MODEL_PUSH
551
552 // direction: w_0
553 dst[I(x, y, z, D3Q19_C)] = pdf_C - omegaEven*(pdf_C - w_0*dir_indep_trm);
554
555 // direction: w_1
556 w_1_indep = w_1*dir_indep_trm;
557
558 ui = uy;
0fde6e45
MW
559 evenPart = omegaEven*( F(0.5)*(pdf_N + pdf_S) - ui*ui*w_1_nine_half - w_1_indep );
560 oddPart = omegaOdd*(F(0.5)*(pdf_N - pdf_S) - ui*w_1_x3 );
10988083
MW
561 dst[I(x, y + 1, z, D3Q19_N)] = pdf_N - evenPart - oddPart;
562 dst[I(x, y - 1, z, D3Q19_S)] = pdf_S - evenPart + oddPart;
563
564 ui = ux;
0fde6e45
MW
565 evenPart = omegaEven*( F(0.5)*(pdf_E + pdf_W) - ui*ui*w_1_nine_half - w_1_indep );
566 oddPart = omegaOdd*(F(0.5)*(pdf_E - pdf_W) - ui*w_1_x3 );
10988083
MW
567 dst[I(x + 1, y, z, D3Q19_E)] = pdf_E - evenPart - oddPart;
568 dst[I(x - 1, y, z, D3Q19_W)] = pdf_W - evenPart + oddPart;
569
570 ui = uz;
0fde6e45
MW
571 evenPart = omegaEven*( F(0.5)*(pdf_T + pdf_B) - ui*ui*w_1_nine_half - w_1_indep );
572 oddPart = omegaOdd*(F(0.5)*(pdf_T - pdf_B) - ui*w_1_x3 );
10988083
MW
573 dst[I(x, y, z + 1, D3Q19_T)] = pdf_T - evenPart - oddPart;
574 dst[I(x, y, z - 1, D3Q19_B)] = pdf_B - evenPart + oddPart;
575
576 // direction: w_2
577 w_2_indep = w_2*dir_indep_trm;
578
579 ui = -ux + uy;
0fde6e45
MW
580 evenPart = omegaEven*( F(0.5)*(pdf_NW + pdf_SE) - ui*ui*w_2_nine_half - w_2_indep );
581 oddPart = omegaOdd*(F(0.5)*(pdf_NW - pdf_SE) - ui*w_2_x3 );
10988083
MW
582 dst[I(x - 1, y + 1, z, D3Q19_NW)] = pdf_NW - evenPart - oddPart;
583 dst[I(x + 1, y - 1, z, D3Q19_SE)] = pdf_SE - evenPart + oddPart;
584
585 ui = ux + uy;
0fde6e45
MW
586 evenPart = omegaEven*( F(0.5)*(pdf_NE + pdf_SW) - ui*ui*w_2_nine_half - w_2_indep );
587 oddPart = omegaOdd*(F(0.5)*(pdf_NE - pdf_SW) - ui*w_2_x3 );
10988083
MW
588 dst[I(x + 1, y + 1, z, D3Q19_NE)] = pdf_NE - evenPart - oddPart;
589 dst[I(x - 1, y - 1, z, D3Q19_SW)] = pdf_SW - evenPart + oddPart;
590
591 ui = -ux + uz;
0fde6e45
MW
592 evenPart = omegaEven*( F(0.5)*(pdf_TW + pdf_BE) - ui*ui*w_2_nine_half - w_2_indep );
593 oddPart = omegaOdd*(F(0.5)*(pdf_TW - pdf_BE) - ui*w_2_x3 );
10988083
MW
594 dst[I(x - 1, y, z + 1, D3Q19_TW)] = pdf_TW - evenPart - oddPart;
595 dst[I(x + 1, y, z - 1, D3Q19_BE)] = pdf_BE - evenPart + oddPart;
596
597 ui = ux + uz;
0fde6e45
MW
598 evenPart = omegaEven*( F(0.5)*(pdf_TE + pdf_BW) - ui*ui*w_2_nine_half - w_2_indep );
599 oddPart = omegaOdd*(F(0.5)*(pdf_TE - pdf_BW) - ui*w_2_x3 );
10988083
MW
600 dst[I(x + 1, y, z + 1, D3Q19_TE)] = pdf_TE - evenPart - oddPart;
601 dst[I(x - 1, y, z - 1, D3Q19_BW)] = pdf_BW - evenPart + oddPart;
602
603 ui = -uy + uz;
0fde6e45
MW
604 evenPart = omegaEven*( F(0.5)*(pdf_TS + pdf_BN) - ui*ui*w_2_nine_half - w_2_indep );
605 oddPart = omegaOdd*(F(0.5)*(pdf_TS - pdf_BN) - ui*w_2_x3 );
10988083
MW
606 dst[I(x, y - 1, z + 1, D3Q19_TS)] = pdf_TS - evenPart - oddPart;
607 dst[I(x, y + 1, z - 1, D3Q19_BN)] = pdf_BN - evenPart + oddPart;
608
609 ui = uy + uz;
0fde6e45
MW
610 evenPart = omegaEven*( F(0.5)*(pdf_TN + pdf_BS) - ui*ui*w_2_nine_half - w_2_indep );
611 oddPart = omegaOdd*(F(0.5)*(pdf_TN - pdf_BS) - ui*w_2_x3 );
10988083
MW
612 dst[I(x, y + 1, z + 1, D3Q19_TN)] = pdf_TN - evenPart - oddPart;
613 dst[I(x, y - 1, z - 1, D3Q19_BS)] = pdf_BS - evenPart + oddPart;
614
615#elif PROP_MODEL_PULL
616
617 // direction: w_0
618 dst[I(x, y, z, D3Q19_C)] = pdf_C - omegaEven*(pdf_C - w_0*dir_indep_trm);
619
620 // direction: w_1
621 w_1_indep = w_1*dir_indep_trm;
622
623 ui = uy;
0fde6e45
MW
624 evenPart = omegaEven*( F(0.5)*(pdf_N + pdf_S) - ui*ui*w_1_nine_half - w_1_indep );
625 oddPart = omegaOdd*(F(0.5)*(pdf_N - pdf_S) - ui*w_1_x3 );
10988083
MW
626 dst[I(x, y, z, D3Q19_N)] = pdf_N - evenPart - oddPart;
627 dst[I(x, y, z, D3Q19_S)] = pdf_S - evenPart + oddPart;
628
629 ui = ux;
0fde6e45
MW
630 evenPart = omegaEven*( F(0.5)*(pdf_E + pdf_W) - ui*ui*w_1_nine_half - w_1_indep );
631 oddPart = omegaOdd*(F(0.5)*(pdf_E - pdf_W) - ui*w_1_x3 );
10988083
MW
632 dst[I(x, y, z, D3Q19_E)] = pdf_E - evenPart - oddPart;
633 dst[I(x, y, z, D3Q19_W)] = pdf_W - evenPart + oddPart;
634
635 ui = uz;
0fde6e45
MW
636 evenPart = omegaEven*( F(0.5)*(pdf_T + pdf_B) - ui*ui*w_1_nine_half - w_1_indep );
637 oddPart = omegaOdd*(F(0.5)*(pdf_T - pdf_B) - ui*w_1_x3 );
10988083
MW
638 dst[I(x, y, z, D3Q19_T)] = pdf_T - evenPart - oddPart;
639 dst[I(x, y, z, D3Q19_B)] = pdf_B - evenPart + oddPart;
640
641 // direction: w_2
642 w_2_indep = w_2*dir_indep_trm;
643
644 ui = -ux + uy;
0fde6e45
MW
645 evenPart = omegaEven*( F(0.5)*(pdf_NW + pdf_SE) - ui*ui*w_2_nine_half - w_2_indep );
646 oddPart = omegaOdd*(F(0.5)*(pdf_NW - pdf_SE) - ui*w_2_x3 );
10988083
MW
647 dst[I(x, y, z, D3Q19_NW)] = pdf_NW - evenPart - oddPart;
648 dst[I(x, y, z, D3Q19_SE)] = pdf_SE - evenPart + oddPart;
649
650 ui = ux + uy;
0fde6e45
MW
651 evenPart = omegaEven*( F(0.5)*(pdf_NE + pdf_SW) - ui*ui*w_2_nine_half - w_2_indep );
652 oddPart = omegaOdd*(F(0.5)*(pdf_NE - pdf_SW) - ui*w_2_x3 );
10988083
MW
653 dst[I(x, y, z, D3Q19_NE)] = pdf_NE - evenPart - oddPart;
654 dst[I(x, y, z, D3Q19_SW)] = pdf_SW - evenPart + oddPart;
655
656 ui = -ux + uz;
0fde6e45
MW
657 evenPart = omegaEven*( F(0.5)*(pdf_TW + pdf_BE) - ui*ui*w_2_nine_half - w_2_indep );
658 oddPart = omegaOdd*(F(0.5)*(pdf_TW - pdf_BE) - ui*w_2_x3 );
10988083
MW
659 dst[I(x, y, z, D3Q19_TW)] = pdf_TW - evenPart - oddPart;
660 dst[I(x, y, z, D3Q19_BE)] = pdf_BE - evenPart + oddPart;
661
662 ui = ux + uz;
0fde6e45
MW
663 evenPart = omegaEven*( F(0.5)*(pdf_TE + pdf_BW) - ui*ui*w_2_nine_half - w_2_indep );
664 oddPart = omegaOdd*(F(0.5)*(pdf_TE - pdf_BW) - ui*w_2_x3 );
10988083
MW
665 dst[I(x, y, z, D3Q19_TE)] = pdf_TE - evenPart - oddPart;
666 dst[I(x, y, z, D3Q19_BW)] = pdf_BW - evenPart + oddPart;
667
668 ui = -uy + uz;
0fde6e45
MW
669 evenPart = omegaEven*( F(0.5)*(pdf_TS + pdf_BN) - ui*ui*w_2_nine_half - w_2_indep );
670 oddPart = omegaOdd*(F(0.5)*(pdf_TS - pdf_BN) - ui*w_2_x3 );
10988083
MW
671 dst[I(x, y, z, D3Q19_TS)] = pdf_TS - evenPart - oddPart;
672 dst[I(x, y, z, D3Q19_BN)] = pdf_BN - evenPart + oddPart;
673
674 ui = uy + uz;
0fde6e45
MW
675 evenPart = omegaEven*( F(0.5)*(pdf_TN + pdf_BS) - ui*ui*w_2_nine_half - w_2_indep );
676 oddPart = omegaOdd*(F(0.5)*(pdf_TN - pdf_BS) - ui*w_2_x3 );
10988083
MW
677 dst[I(x, y, z, D3Q19_TN)] = pdf_TN - evenPart - oddPart;
678 dst[I(x, y, z, D3Q19_BS)] = pdf_BS - evenPart + oddPart;
679
680#else
681 #error No implementation for PROP_MODEL_NAME.
682#endif
683
684 #undef I
685 }
686 }
687 } // z, y, x (from inner to outer)
688 }
689 }
690 } // z, y, x (from inner to outer)
691
e3f82424
MW
692 X_LIKWID_STOP("blk-os");
693 } // parallel region
694
695 // Stop counters before bounce back. Else computing loop balance will be incorrect.
10988083
MW
696
697 // Fixup bounce back PDFs.
698 #ifdef _OPENMP
699 #pragma omp parallel for default(none) \
700 shared(kd, dst)
701 #endif
702 for (int i = 0; i < kd->nBounceBackPdfs; ++i) {
703 dst[kd->BounceBackPdfsDst[i]] = dst[kd->BounceBackPdfsSrc[i]];
704 }
705
706 #ifdef VERIFICATION
707 kd->PdfsActive = dst;
708 KernelAddBodyForce(kd, ld, cd);
709 #endif
710
711 #ifdef VTK_OUTPUT
712
713 if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) {
714 kd->PdfsActive = dst;
715 VtkWrite(ld, kd, cd, iter);
716 }
717
718 #endif
719
720 #ifdef STATISTICS
721 kd->PdfsActive = dst;
722 KernelStatistics(kd, ld, cd, iter);
723 #endif
724
725 // swap grids
726 tmp = src;
727 src = dst;
728 dst = tmp;
729
730 } // for (int iter = 0; ...
731
732 #ifdef VTK_OUTPUT
733
734 if (cd->VtkOutput) {
735 kd->PdfsActive = src;
736 VtkWrite(ld, kd, cd, maxIterations);
737 }
738
739 #endif
740
741 return;
742}
743
This page took 0.161543 seconds and 5 git commands to generate.