1 // --------------------------------------------------------------------------
4 // Markus Wittmann, 2016-2017
5 // RRZE, University of Erlangen-Nuremberg, Germany
6 // markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
9 // LSS, University of Erlangen-Nuremberg, Germany
11 // This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
13 // LbmBenchKernels is free software: you can redistribute it and/or modify
14 // it under the terms of the GNU General Public License as published by
15 // the Free Software Foundation, either version 3 of the License, or
16 // (at your option) any later version.
18 // LbmBenchKernels is distributed in the hope that it will be useful,
19 // but WITHOUT ANY WARRANTY; without even the implied warranty of
20 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 // GNU General Public License for more details.
23 // You should have received a copy of the GNU General Public License
24 // along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>.
26 // --------------------------------------------------------------------------
27 #include "BenchKernelD3Q19AaCommon.h"
40 void FNAME(D3Q19AaKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd)
43 Assert(kernelData != NULL);
46 Assert(cd->Omega > 0.0);
47 Assert(cd->Omega < 2.0);
49 KernelData * kd = (KernelData *)kernelData;
56 int * gDims = kd->GlobalDims;
58 int oX = kd->Offsets[0];
59 int oY = kd->Offsets[1];
60 int oZ = kd->Offsets[2];
62 KernelDataAa * kda = KDA(kd);
69 PdfT omega = cd->Omega;
70 PdfT omegaEven = omega;
71 PdfT magicParam = 1.0 / 12.0;
72 // 1/4: best stability;
73 // 1/12: removes third-order advection error (best advection);
74 // 1/6: removes fourth-order diffusion error (best diffusion);
75 // 3/16: exact location of bounce back for poiseuille flow
77 PdfT omegaOdd = 1.0/( 0.5 + magicParam/(1.0/omega - 0.5) );
81 PdfT dir_indep_trm = 0.0;
84 PdfT w_1 = 1.0 / 18.0;
85 PdfT w_2 = 1.0 / 36.0;
87 PdfT w_1_x3 = w_1 * 3.0; PdfT w_1_nine_half = w_1 * 9.0/2.0; PdfT w_1_indep = 0.0;
88 PdfT w_2_x3 = w_2 * 3.0; PdfT w_2_nine_half = w_2 * 9.0/2.0; PdfT w_2_indep = 0.0;
93 // Declare pdf_N, pdf_E, pdf_S, pdf_W, ...
94 #define X(name, idx, idxinv, x, y, z) PdfT JOIN(pdf_,name);
98 PdfT * src = kd->Pdfs[0];
100 int maxIterations = cd->MaxIterations;
104 kd->PdfsActive = src;
105 VtkWrite(ld, kd, cd, -1);
110 kd->PdfsActive = src;
111 KernelStatistics(kd, ld, cd, 0);
118 nThreads = omp_get_max_threads();
121 for (int iter = 0; iter < maxIterations; iter += 2) {
123 // --------------------------------------------------------------------
126 X_LIKWID_START("aa-even");
131 #pragma omp parallel for default(none) \
132 shared(gDims,src, w_0, w_1, w_2, omegaEven, omegaOdd, \
133 w_1_x3, w_2_x3, w_1_nine_half, w_2_nine_half, cd, \
134 oX, oY, oZ, nX, nY, nZ, blk, nThreads, ld) \
135 private(ux, uy, uz, ui, dens, dir_indep_trm, \
137 pdf_N, pdf_E, pdf_S, pdf_W, \
138 pdf_NE, pdf_SE, pdf_SW, pdf_NW, \
139 pdf_T, pdf_TN, pdf_TE, pdf_TS, pdf_TW, \
140 pdf_B, pdf_BN, pdf_BE, pdf_BS, pdf_BW, \
141 evenPart, oddPart, w_1_indep, w_2_indep)
144 for (int i = 0; i < nThreads; ++i) {
146 int threadStartX = nX / nThreads * i;
147 int threadEndX = nX / nThreads * (i + 1);
149 if (nX % nThreads > 0) {
150 if (nX % nThreads > i) {
155 threadStartX += nX % nThreads;
156 threadEndX += nX % nThreads;
160 for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) {
161 for (int bY = oY; bY < nY + oY; bY += blk[1]) {
162 for (int bZ = oZ; bZ < nZ + oZ; bZ += blk[2]) {
164 int eX = MIN(bX + blk[0], threadEndX + oX);
165 int eY = MIN(bY + blk[1], nY + oY);
166 int eZ = MIN(bZ + blk[2], nZ + oZ);
168 // printf("%d: %d-%d %d-%d %d-%d %d - %d\n", omp_get_thread_num(), bZ, eZ, bY, eY, bX, eX, threadStartX, threadEndX);
170 for (int x = bX; x < eX; ++x) {
171 for (int y = bY; y < eY; ++y) {
172 for (int z = bZ; z < eZ; ++z) {
175 if (ld->Lattice[L_INDEX_4(ld->Dims, x - oX, y - oY, z - oZ)] == LAT_CELL_OBSTACLE) {
179 #define I(x, y, z, dir) P_INDEX_5(gDims, (x), (y), (z), (dir))
182 // Load PDFs of local cell: pdf_N = src[I(x, y, z, D3Q19_N)]; ...
183 #define X(name, idx, idxinv, _x, _y, _z) JOIN(pdf_,name) = src[I(x, y, z, idx)];
187 // #define LID_DRIVEN_CAVITY
189 #ifdef LID_DRIVEN_CAVITY
191 if (z == nZ - 4 + oZ && x > 3 + oX && x < (nX - 4 + oX) && y > 3 + oY && y < (nY - 4 + oY)) {
198 ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE -
199 pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW;
200 uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN -
201 pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS;
202 uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS -
203 pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS;
204 #ifdef LID_DRIVEN_CAVITY
209 pdf_N + pdf_E + pdf_S + pdf_W +
210 pdf_NE + pdf_SE + pdf_SW + pdf_NW +
211 pdf_T + pdf_TN + pdf_TE + pdf_TS + pdf_TW +
212 pdf_B + pdf_BN + pdf_BE + pdf_BS + pdf_BW;
214 dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz)*3.0/2.0;
217 src[I(x, y, z, D3Q19_C)] = pdf_C - omegaEven*(pdf_C - w_0*dir_indep_trm);
220 w_1_indep = w_1*dir_indep_trm;
223 evenPart = omegaEven*( 0.5*(pdf_N + pdf_S) - ui*ui*w_1_nine_half - w_1_indep );
224 oddPart = omegaOdd*(0.5*(pdf_N - pdf_S) - ui*w_1_x3 );
225 src[I(x, y, z, D3Q19_S)] = pdf_N - evenPart - oddPart;
226 src[I(x, y, z, D3Q19_N)] = pdf_S - evenPart + oddPart;
229 evenPart = omegaEven*( 0.5*(pdf_E + pdf_W) - ui*ui*w_1_nine_half - w_1_indep );
230 oddPart = omegaOdd*(0.5*(pdf_E - pdf_W) - ui*w_1_x3 );
231 src[I(x, y, z, D3Q19_W)] = pdf_E - evenPart - oddPart;
232 src[I(x, y, z, D3Q19_E)] = pdf_W - evenPart + oddPart;
235 evenPart = omegaEven*( 0.5*(pdf_T + pdf_B) - ui*ui*w_1_nine_half - w_1_indep );
236 oddPart = omegaOdd*(0.5*(pdf_T - pdf_B) - ui*w_1_x3 );
237 src[I(x, y, z, D3Q19_B)] = pdf_T - evenPart - oddPart;
238 src[I(x, y, z, D3Q19_T)] = pdf_B - evenPart + oddPart;
241 w_2_indep = w_2*dir_indep_trm;
244 evenPart = omegaEven*( 0.5*(pdf_NW + pdf_SE) - ui*ui*w_2_nine_half - w_2_indep );
245 oddPart = omegaOdd*(0.5*(pdf_NW - pdf_SE) - ui*w_2_x3 );
246 src[I(x, y, z, D3Q19_SE)] = pdf_NW - evenPart - oddPart;
247 src[I(x, y, z, D3Q19_NW)] = pdf_SE - evenPart + oddPart;
250 evenPart = omegaEven*( 0.5*(pdf_NE + pdf_SW) - ui*ui*w_2_nine_half - w_2_indep );
251 oddPart = omegaOdd*(0.5*(pdf_NE - pdf_SW) - ui*w_2_x3 );
252 src[I(x, y, z, D3Q19_SW)] = pdf_NE - evenPart - oddPart;
253 src[I(x, y, z, D3Q19_NE)] = pdf_SW - evenPart + oddPart;
256 evenPart = omegaEven*( 0.5*(pdf_TW + pdf_BE) - ui*ui*w_2_nine_half - w_2_indep );
257 oddPart = omegaOdd*(0.5*(pdf_TW - pdf_BE) - ui*w_2_x3 );
258 src[I(x, y, z, D3Q19_BE)] = pdf_TW - evenPart - oddPart;
259 src[I(x, y, z, D3Q19_TW)] = pdf_BE - evenPart + oddPart;
262 evenPart = omegaEven*( 0.5*(pdf_TE + pdf_BW) - ui*ui*w_2_nine_half - w_2_indep );
263 oddPart = omegaOdd*(0.5*(pdf_TE - pdf_BW) - ui*w_2_x3 );
264 src[I(x, y, z, D3Q19_BW)] = pdf_TE - evenPart - oddPart;
265 src[I(x, y, z, D3Q19_TE)] = pdf_BW - evenPart + oddPart;
268 evenPart = omegaEven*( 0.5*(pdf_TS + pdf_BN) - ui*ui*w_2_nine_half - w_2_indep );
269 oddPart = omegaOdd*(0.5*(pdf_TS - pdf_BN) - ui*w_2_x3 );
270 src[I(x, y, z, D3Q19_BN)] = pdf_TS - evenPart - oddPart;
271 src[I(x, y, z, D3Q19_TS)] = pdf_BN - evenPart + oddPart;
274 evenPart = omegaEven*( 0.5*(pdf_TN + pdf_BS) - ui*ui*w_2_nine_half - w_2_indep );
275 oddPart = omegaOdd*(0.5*(pdf_TN - pdf_BS) - ui*w_2_x3 );
276 src[I(x, y, z, D3Q19_BS)] = pdf_TN - evenPart - oddPart;
277 src[I(x, y, z, D3Q19_TN)] = pdf_BS - evenPart + oddPart;
280 } } } // z, y, x (from inner to outer)
281 } } } // z, y, x (from inner to outer)
283 } // loop over threads
287 X_LIKWID_STOP("aa-even");
290 kd->PdfsActive = src;
291 KernelStatistics(kd, ld, cd, iter);
294 // Fixup bounce back PDFs.
296 #pragma omp parallel for default(none) \
299 for (int i = 0; i < kd->nBounceBackPdfs; ++i) {
300 src[kd->BounceBackPdfsSrc[i]] = src[kd->BounceBackPdfsDst[i]];
303 // save current iteration
304 kda->Iteration = iter;
307 kd->PdfsActive = src;
308 KernelAddBodyForce(kd, ld, cd);
312 if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) {
313 kd->PdfsActive = src;
314 VtkWrite(ld, kd, cd, iter);
319 kd->PdfsActive = src;
320 KernelStatistics(kd, ld, cd, iter);
323 // --------------------------------------------------------------------
327 X_LIKWID_START("aa-odd");
332 #pragma omp parallel for default(none) \
333 shared(gDims,src, w_0, w_1, w_2, omegaEven, omegaOdd, \
334 w_1_x3, w_2_x3, w_1_nine_half, w_2_nine_half, cd, \
335 oX, oY, oZ, nX, nY, nZ, blk, nThreads) \
336 private(ux, uy, uz, ui, dens, dir_indep_trm, \
338 pdf_N, pdf_E, pdf_S, pdf_W, \
339 pdf_NE, pdf_SE, pdf_SW, pdf_NW, \
340 pdf_T, pdf_TN, pdf_TE, pdf_TS, pdf_TW, \
341 pdf_B, pdf_BN, pdf_BE, pdf_BS, pdf_BW, \
342 evenPart, oddPart, w_1_indep, w_2_indep)
345 for (int i = 0; i < nThreads; ++i) {
347 int threadStartX = nX / nThreads * i;
348 int threadEndX = nX / nThreads * (i + 1);
350 if (nX % nThreads > 0) {
351 if (nX % nThreads > i) {
356 threadStartX += nX % nThreads;
357 threadEndX += nX % nThreads;
361 for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) {
362 for (int bY = oY; bY < nY + oY; bY += blk[1]) {
363 for (int bZ = oZ; bZ < nZ + oZ; bZ += blk[2]) {
365 // Must do everything here, else it would break collapse.
366 int eZ = MIN(bZ + blk[2], nZ + oZ);
367 int eY = MIN(bY + blk[1], nY + oY);
368 int eX = MIN(bX + blk[0], threadEndX + oX);
370 for (int x = bX; x < eX; ++x) {
371 for (int y = bY; y < eY; ++y) {
372 for (int z = bZ; z < eZ; ++z) {
374 #define I(x, y, z, dir) P_INDEX_5(gDims, (x), (y), (z), (dir))
// Load PDFs from the neighboring cells, stored in the inverse direction slot: pdf_N = src[I(x, y - 1, z, D3Q19_S)]; ...
377 #define X(name, idx, idxinv, _x, _y, _z) JOIN(pdf_,name) = src[I(x - _x, y - _y, z - _z, idxinv)];
382 // #define LID_DRIVEN_CAVITY
384 #ifdef LID_DRIVEN_CAVITY
386 if (z == nZ - 4 + oZ && x > 3 + oX && x < (nX - 4 + oX) && y > 3 + oY && y < (nY - 4 + oY)) {
393 ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE -
394 pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW;
395 uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN -
396 pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS;
397 uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS -
398 pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS;
399 #ifdef LID_DRIVEN_CAVITY
404 pdf_N + pdf_E + pdf_S + pdf_W +
405 pdf_NE + pdf_SE + pdf_SW + pdf_NW +
406 pdf_T + pdf_TN + pdf_TE + pdf_TS + pdf_TW +
407 pdf_B + pdf_BN + pdf_BE + pdf_BS + pdf_BW;
409 dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz)*3.0/2.0;
412 src[I(x, y, z, D3Q19_C)] = pdf_C - omegaEven*(pdf_C - w_0*dir_indep_trm);
415 w_1_indep = w_1*dir_indep_trm;
418 evenPart = omegaEven*( 0.5*(pdf_N + pdf_S) - ui*ui*w_1_nine_half - w_1_indep );
419 oddPart = omegaOdd*(0.5*(pdf_N - pdf_S) - ui*w_1_x3 );
420 src[I(x, y + 1, z, D3Q19_N)] = pdf_N - evenPart - oddPart;
421 src[I(x, y - 1, z, D3Q19_S)] = pdf_S - evenPart + oddPart;
424 evenPart = omegaEven*( 0.5*(pdf_E + pdf_W) - ui*ui*w_1_nine_half - w_1_indep );
425 oddPart = omegaOdd*(0.5*(pdf_E - pdf_W) - ui*w_1_x3 );
426 src[I(x + 1, y, z, D3Q19_E)] = pdf_E - evenPart - oddPart;
427 src[I(x - 1, y, z, D3Q19_W)] = pdf_W - evenPart + oddPart;
430 evenPart = omegaEven*( 0.5*(pdf_T + pdf_B) - ui*ui*w_1_nine_half - w_1_indep );
431 oddPart = omegaOdd*(0.5*(pdf_T - pdf_B) - ui*w_1_x3 );
432 src[I(x, y, z + 1, D3Q19_T)] = pdf_T - evenPart - oddPart;
433 src[I(x, y, z - 1, D3Q19_B)] = pdf_B - evenPart + oddPart;
436 w_2_indep = w_2*dir_indep_trm;
439 evenPart = omegaEven*( 0.5*(pdf_NW + pdf_SE) - ui*ui*w_2_nine_half - w_2_indep );
440 oddPart = omegaOdd*(0.5*(pdf_NW - pdf_SE) - ui*w_2_x3 );
441 src[I(x - 1, y + 1, z, D3Q19_NW)] = pdf_NW - evenPart - oddPart;
442 src[I(x + 1, y - 1, z, D3Q19_SE)] = pdf_SE - evenPart + oddPart;
445 evenPart = omegaEven*( 0.5*(pdf_NE + pdf_SW) - ui*ui*w_2_nine_half - w_2_indep );
446 oddPart = omegaOdd*(0.5*(pdf_NE - pdf_SW) - ui*w_2_x3 );
447 src[I(x + 1, y + 1, z, D3Q19_NE)] = pdf_NE - evenPart - oddPart;
448 src[I(x - 1, y - 1, z, D3Q19_SW)] = pdf_SW - evenPart + oddPart;
451 evenPart = omegaEven*( 0.5*(pdf_TW + pdf_BE) - ui*ui*w_2_nine_half - w_2_indep );
452 oddPart = omegaOdd*(0.5*(pdf_TW - pdf_BE) - ui*w_2_x3 );
453 src[I(x - 1, y, z + 1, D3Q19_TW)] = pdf_TW - evenPart - oddPart;
454 src[I(x + 1, y, z - 1, D3Q19_BE)] = pdf_BE - evenPart + oddPart;
457 evenPart = omegaEven*( 0.5*(pdf_TE + pdf_BW) - ui*ui*w_2_nine_half - w_2_indep );
458 oddPart = omegaOdd*(0.5*(pdf_TE - pdf_BW) - ui*w_2_x3 );
459 src[I(x + 1, y, z + 1, D3Q19_TE)] = pdf_TE - evenPart - oddPart;
460 src[I(x - 1, y, z - 1, D3Q19_BW)] = pdf_BW - evenPart + oddPart;
463 evenPart = omegaEven*( 0.5*(pdf_TS + pdf_BN) - ui*ui*w_2_nine_half - w_2_indep );
464 oddPart = omegaOdd*(0.5*(pdf_TS - pdf_BN) - ui*w_2_x3 );
465 src[I(x, y - 1, z + 1, D3Q19_TS)] = pdf_TS - evenPart - oddPart;
466 src[I(x, y + 1, z - 1, D3Q19_BN)] = pdf_BN - evenPart + oddPart;
469 evenPart = omegaEven*( 0.5*(pdf_TN + pdf_BS) - ui*ui*w_2_nine_half - w_2_indep );
470 oddPart = omegaOdd*(0.5*(pdf_TN - pdf_BS) - ui*w_2_x3 );
471 src[I(x, y + 1, z + 1, D3Q19_TN)] = pdf_TN - evenPart - oddPart;
472 src[I(x, y - 1, z - 1, D3Q19_BS)] = pdf_BS - evenPart + oddPart;
476 } } } // z, y, x (from inner to outer)
477 } } } // z, y, x (from inner to outer)
478 } // loop over threads
482 // Stop counters before bounce back. Else computing loop balance will be incorrect.
484 X_LIKWID_STOP("aa-odd");
486 // Fixup bounce back PDFs.
488 #pragma omp parallel for default(none) \
491 for (int i = 0; i < kd->nBounceBackPdfs; ++i) {
492 src[kd->BounceBackPdfsDst[i]] = src[kd->BounceBackPdfsSrc[i]];
495 // save current iteration
496 kda->Iteration = iter + 1;
499 kd->PdfsActive = src;
500 KernelAddBodyForce(kd, ld, cd);
504 if (cd->VtkOutput && (iter + 1 % cd->VtkModulus) == 0) {
505 kd->PdfsActive = src;
506 VtkWrite(ld, kd, cd, iter + 1);
511 kd->PdfsActive = src;
512 KernelStatistics(kd, ld, cd, iter + 1);
516 } // for (int iter = 0; ...
521 kd->PdfsActive = src;
522 VtkWrite(ld, kd, cd, maxIterations);