add single precision, add aa-vec-sl-soa kernel, updated doc
[LbmBenchmarkKernelsPublic.git] / src / BenchKernelD3Q19ListAaPv.c
CommitLineData
10988083
MW
1// --------------------------------------------------------------------------
2//
3// Copyright
4// Markus Wittmann, 2016-2017
5// RRZE, University of Erlangen-Nuremberg, Germany
6// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
7//
8// Viktor Haag, 2016
9// LSS, University of Erlangen-Nuremberg, Germany
10//
11// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
12//
13// LbmBenchKernels is free software: you can redistribute it and/or modify
14// it under the terms of the GNU General Public License as published by
15// the Free Software Foundation, either version 3 of the License, or
16// (at your option) any later version.
17//
18// LbmBenchKernels is distributed in the hope that it will be useful,
19// but WITHOUT ANY WARRANTY; without even the implied warranty of
20// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21// GNU General Public License for more details.
22//
23// You should have received a copy of the GNU General Public License
24// along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>.
25//
26// --------------------------------------------------------------------------
27#include "BenchKernelD3Q19ListAaPvCommon.h"
28
29#include "Memory.h"
30#include "Vtk.h"
31#include "Vector.h"
32#include "LikwidIf.h"
33
34#include <inttypes.h>
35#include <math.h>
36
37#ifdef _OPENMP
38 #include <omp.h>
39#endif
40
41
42static void KernelEven(LatticeDesc * ld, KernelData * kernelData, CaseData * cd);
43static void KernelOdd( LatticeDesc * ld, KernelData * kernelData, CaseData * cd);
44
45void FNAME(D3Q19ListAaPvKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd)
46{
47
48 Assert(ld != NULL);
49 Assert(kernelData != NULL);
50 Assert(cd != NULL);
51
0fde6e45
MW
52 Assert(cd->Omega > F(0.0));
53 Assert(cd->Omega < F(2.0));;
10988083
MW
54
55#if defined(VTK_OUTPUT) || defined(STATISTICS) || defined(VERIFICATION)
56 KernelData * kd = (KernelData *)kernelData;
57 KernelDataList * kdl = KDL(kernelData);
58#endif
59
60 int maxIterations = cd->MaxIterations;
61
62 int nThreads = 1;
63#ifdef _OPENMP
64 nThreads = omp_get_max_threads();
65#endif
66
67 #ifdef VTK_OUTPUT
68 if (cd->VtkOutput) {
69 kd->PdfsActive = kd->Pdfs[0];
70 VtkWrite(ld, kd, cd, -1);
71 }
72 #endif
73
74 #ifdef STATISTICS
75 kd->PdfsActive = kd->Pdfs[0];
76 KernelStatistics(kd, ld, cd, 0);
77 #endif
78
79 // TODO: outer openmp parallel
80
81 for(int iter = 0; iter < maxIterations; iter += 2) {
82
83 // ---------------------------------------------------
84 // even time step
85 // ---------------------------------------------------
86
87 X_LIKWID_START("list-aa-pv-even");
88
89 #ifdef _OPENMP
90 #pragma omp parallel default(none) shared(ld, kernelData, cd)
91 #endif
92 {
93 KernelEven(ld, kernelData, cd);
94 }
95
96 X_LIKWID_STOP("list-aa-pv-even");
97
98 #ifdef VERIFICATION
99 kdl->Iteration = iter;
100 kd->PdfsActive = kd->Pdfs[0];
101 KernelAddBodyForce(kd, ld, cd);
102 #endif
103
104 // ---------------------------------------------------
105 // odd time step
106 // ---------------------------------------------------
107
108 X_LIKWID_START("list-aa-pv-odd");
109
110 #ifdef _OPENMP
111 #pragma omp parallel default(none) shared(ld, kernelData, cd)
112 #endif
113 {
114 KernelOdd(ld, kernelData, cd);
115 }
116
117 X_LIKWID_STOP("list-aa-pv-odd");
118
119
120 #ifdef VERIFICATION
121 kdl->Iteration = iter + 1;
122 kd->PdfsActive = kd->Pdfs[0];
123 KernelAddBodyForce(kd, ld, cd);
124 #endif
125
126 #ifdef VTK_OUTPUT
127 if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) {
128 kdl->Iteration = iter + 1;
129 kd->PdfsActive = kd->Pdfs[0];
130 VtkWrite(ld, kd, cd, iter);
131 }
132 #endif
133
134 #ifdef STATISTICS
135 kdl->Iteration = iter + 1;
136 kd->PdfsActive = kd->Pdfs[0];
137 KernelStatistics(kd, ld, cd, iter);
138 #endif
139
140 } // for (int iter = 0; ...
141
142#ifdef VTK_OUTPUT
143 if (cd->VtkOutput) {
144 kd->PdfsActive = kd->Pdfs[0];
145 VtkWrite(ld, kd, cd, maxIterations);
146 }
147#endif
148
149#ifdef STATISTICS
150 kd->PdfsActive = kd->Pdfs[0];
151 KernelStatistics(kd, ld, cd, maxIterations);
152#endif
153
154 return;
155}
156
157static void KernelEven(LatticeDesc * ld, KernelData * kernelData, CaseData * cd)
158{
159 Assert(ld != NULL);
160 Assert(kernelData != NULL);
161 Assert(cd != NULL);
162
0fde6e45
MW
163 Assert(cd->Omega > F(0.0));
164 Assert(cd->Omega < F(2.0));
10988083
MW
165
166 KernelData * kd = (KernelData *)kernelData;
167 KernelDataList * kdl = KDL(kernelData);
168 KernelDataListRia * kdlr = KDLR(kernelData);
169
170 PdfT omega = cd->Omega;
171 PdfT omegaEven = omega;
172
0fde6e45
MW
173 PdfT magicParam = F(1.0) / F(12.0);
174 PdfT omegaOdd = F(1.0) / (F(0.5) + magicParam / (F(1.0) / omega - F(0.5)));
10988083 175
0fde6e45
MW
176 PdfT evenPart = F(0.0);
177 PdfT oddPart = F(0.0);
178 PdfT dir_indep_trm = F(0.0);
10988083 179
0fde6e45
MW
180 const PdfT w_0 = F(1.0) / F( 3.0);
181 const PdfT w_1 = F(1.0) / F(18.0);
182 const PdfT w_2 = F(1.0) / F(36.0);
10988083 183
0fde6e45
MW
184 const PdfT w_1_x3 = w_1 * F(3.0); const PdfT w_1_nine_half = w_1 * F(9.0) / F(2.0); PdfT w_1_indep = F(0.0);
185 const PdfT w_2_x3 = w_2 * F(3.0); const PdfT w_2_nine_half = w_2 * F(9.0) / F(2.0); PdfT w_2_indep = F(0.0);
10988083
MW
186
187 PdfT ui;
188
189 PdfT ux, uy, uz;
190 PdfT dens;
191
192
0fde6e45
MW
193 VPDFT VONE_HALF = VSET(F(0.5));
194 VPDFT VTHREE_HALF = VSET(F(3.0) / F(2.0));
10988083
MW
195
196 VPDFT vw_1_indep, vw_2_indep;
197 VPDFT vw_0 = VSET(w_0);
198 VPDFT vw_1 = VSET(w_1);
199 VPDFT vw_2 = VSET(w_2);
200
201 VPDFT vw_1_x3 = VSET(w_1_x3);
202 VPDFT vw_2_x3 = VSET(w_2_x3);
203 VPDFT vw_1_nine_half = VSET(w_1_nine_half);
204 VPDFT vw_2_nine_half = VSET(w_2_nine_half);
205
206 VPDFT vui, vux, vuy, vuz, vdens;
207
208 VPDFT vevenPart, voddPart, vdir_indep_trm;
209
210 VPDFT vomegaEven = VSET(omegaEven);
211 VPDFT vomegaOdd = VSET(omegaOdd);
212
213 // Declare pdf_N, pdf_E, pdf_S, pdf_W, ...
214 #define X(name, idx, idxinv, x, y, z) PdfT JOIN(pdf_,name); VPDFT JOIN(vpdf_,name);
215 D3Q19_LIST
216 #undef X
217
218 PdfT * src = kd->Pdfs[0];
219
220 int nCells = kdl->nCells;
221
222 int threadId = 0;
223#ifdef _OPENMP
224 threadId = omp_get_thread_num();
225#endif
226
227 int * threadIndices = kdlr->FluidNodeThreadIndices;
228
229 int nFluidThread = threadIndices[threadId + 1] - threadIndices[threadId];
230 int nFluidVec = nFluidThread - (nFluidThread % VSIZE);
231
232 int indexStartVec = threadIndices[threadId];
233 int indexStopVec = threadIndices[threadId] + nFluidVec;
234 int indexStop = threadIndices[threadId] + nFluidThread;
235
236 #define I(index, dir) P_INDEX_3((nCells), (index), (dir))
237
238 for (int index = indexStartVec; index < indexStopVec; index += VSIZE) {
239
240
241 #define X(name, idx, idxinv, _x, _y, _z) JOIN(vpdf_,name) = VLDU(&src[I(index, idx)]);
242 D3Q19_LIST
243 #undef X
244
245
246 //vux = vpdf_E + vpdf_NE + vpdf_SE + vpdf_TE + vpdf_BE -
247 // vpdf_W - vpdf_NW - vpdf_SW - vpdf_TW - vpdf_BW;
248 vux = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_E,VADD(vpdf_NE,vpdf_SE)),VADD(vpdf_TE,vpdf_BE)),vpdf_W),vpdf_NW),vpdf_SW),vpdf_TW),vpdf_BW);
249 //vuy = vpdf_N + vpdf_NE + vpdf_NW + vpdf_TN + vpdf_BN -
250 // vpdf_S - vpdf_SE - vpdf_SW - vpdf_TS - vpdf_BS;
251 vuy = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_N,VADD(vpdf_NE,vpdf_NW)),VADD(vpdf_TN,vpdf_BN)),vpdf_S),vpdf_SE),vpdf_SW),vpdf_TS),vpdf_BS);
252 //vuz = vpdf_T + vpdf_TE + vpdf_TW + vpdf_TN + vpdf_TS -
253 // vpdf_B - vpdf_BE - vpdf_BW - vpdf_BN - vpdf_BS;
254 vuz = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_T,VADD(vpdf_TE,vpdf_TW)),VADD(vpdf_TN,vpdf_TS)),vpdf_B),vpdf_BE),vpdf_BW),vpdf_BN),vpdf_BS);
255
256 //vdens = vpdf_C +
257 // vpdf_N + vpdf_E + vpdf_S + vpdf_W +
258 // vpdf_NE + vpdf_SE + vpdf_SW + vpdf_NW +
259 // vpdf_T + vpdf_TN + vpdf_TE + vpdf_TS + vpdf_TW +
260 // vpdf_B + vpdf_BN + vpdf_BE + vpdf_BS + vpdf_BW;
261 vdens = VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(vpdf_C,VADD(vpdf_N,vpdf_E)),VADD(vpdf_S,vpdf_W)),VADD(vpdf_NE,vpdf_SE)),VADD(vpdf_SW,vpdf_NW)),VADD(vpdf_T,vpdf_TN)),VADD(vpdf_TE,vpdf_TS)),VADD(vpdf_TW,vpdf_B)),VADD(vpdf_BN,vpdf_BE)),VADD(vpdf_BS,vpdf_BW));
262
263 //vdir_indep_trm = vdens - (vux * vux + vuy * vuy + vuz * vuz) * VTHREE_HALF;
264 vdir_indep_trm = VSUB(vdens,VMUL(VADD(VADD(VMUL(vux,vux),VMUL(vuy,vuy)),VMUL(vuz,vuz)),VTHREE_HALF));
265
266 //src[I(index, D3Q19_C) ] =[UA] vpdf_C - vomegaEven * (vpdf_C - vw_0 * vdir_indep_trm);
267 VSTU(&src[I(index, D3Q19_C)],VSUB(vpdf_C,VMUL(vomegaEven,VSUB(vpdf_C,VMUL(vw_0,vdir_indep_trm)))));
268
269 //vw_1_indep = vw_1 * vdir_indep_trm;
270 vw_1_indep = VMUL(vw_1,vdir_indep_trm);
271
272 //vui = vuy;
273 vui = vuy;
274 //vevenPart = vomegaEven * (VONE_HALF * (vpdf_N + vpdf_S) - vui * vui * vw_1_nine_half - vw_1_indep);
275 vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_N,vpdf_S)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep));
276 //voddPart = vomegaOdd * (VONE_HALF * (vpdf_N - vpdf_S) - vui * vw_1_x3);
277 voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_N,vpdf_S)),VMUL(vui,vw_1_x3)));
278 //src[I(index, D3Q19_S)] =[UA] vpdf_N - vevenPart - voddPart;
279 VSTU(&src[I(index, D3Q19_S)],VSUB(VSUB(vpdf_N,vevenPart),voddPart));
280 //src[I(index, D3Q19_N)] =[UA] vpdf_S - vevenPart + voddPart;
281 VSTU(&src[I(index, D3Q19_N)],VADD(VSUB(vpdf_S,vevenPart),voddPart));
282
283 //vui = vux;
284 vui = vux;
285 //vevenPart = vomegaEven * (VONE_HALF * (vpdf_E + vpdf_W) - vui * vui * vw_1_nine_half - vw_1_indep);
286 vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_E,vpdf_W)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep));
287 //voddPart = vomegaOdd * (VONE_HALF * (vpdf_E - vpdf_W) - vui * vw_1_x3 );
288 voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_E,vpdf_W)),VMUL(vui,vw_1_x3)));
289 //src[I(index, D3Q19_W)] =[UA] vpdf_E - vevenPart - voddPart;
290 VSTU(&src[I(index, D3Q19_W)],VSUB(VSUB(vpdf_E,vevenPart),voddPart));
291 //src[I(index, D3Q19_E)] =[UA] vpdf_W - vevenPart + voddPart;
292 VSTU(&src[I(index, D3Q19_E)],VADD(VSUB(vpdf_W,vevenPart),voddPart));
293
294 //vui = vuz;
295 vui = vuz;
296 //vevenPart = vomegaEven * (VONE_HALF * (vpdf_T + vpdf_B) - vui * vui * vw_1_nine_half - vw_1_indep);
297 vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_T,vpdf_B)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep));
298 //voddPart = vomegaOdd * (VONE_HALF * (vpdf_T - vpdf_B) - vui * vw_1_x3);
299 voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_T,vpdf_B)),VMUL(vui,vw_1_x3)));
300 //src[I(index, D3Q19_B)] =[UA] vpdf_T - vevenPart - voddPart;
301 VSTU(&src[I(index, D3Q19_B)],VSUB(VSUB(vpdf_T,vevenPart),voddPart));
302 //src[I(index, D3Q19_T)] =[UA] vpdf_B - vevenPart + voddPart;
303 VSTU(&src[I(index, D3Q19_T)],VADD(VSUB(vpdf_B,vevenPart),voddPart));
304
305 //vw_2_indep = vw_2 * vdir_indep_trm;
306 vw_2_indep = VMUL(vw_2,vdir_indep_trm);
307
308 //vui = vuy - vux;
309 vui = VSUB(vuy,vux);
310 //vevenPart = vomegaEven * (VONE_HALF * (vpdf_NW + vpdf_SE) - vui * vui * vw_2_nine_half - vw_2_indep);
311 vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_NW,vpdf_SE)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
312 //voddPart = vomegaOdd * (VONE_HALF * (vpdf_NW - vpdf_SE) - vui * vw_2_x3);
313 voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_NW,vpdf_SE)),VMUL(vui,vw_2_x3)));
314 //src[I(index, D3Q19_SE)] =[UA] vpdf_NW - vevenPart - voddPart;
315 VSTU(&src[I(index, D3Q19_SE)],VSUB(VSUB(vpdf_NW,vevenPart),voddPart));
316 //src[I(index, D3Q19_NW)] =[UA] vpdf_SE - vevenPart + voddPart;
317 VSTU(&src[I(index, D3Q19_NW)],VADD(VSUB(vpdf_SE,vevenPart),voddPart));
318
319 //vui = vux + vuy;
320 vui = VADD(vux,vuy);
321 //vevenPart = vomegaEven * (VONE_HALF * (vpdf_NE + vpdf_SW) - vui * vui * vw_2_nine_half - vw_2_indep);
322 vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_NE,vpdf_SW)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
323 //voddPart = vomegaOdd * (VONE_HALF * (vpdf_NE - vpdf_SW) - vui * vw_2_x3);
324 voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_NE,vpdf_SW)),VMUL(vui,vw_2_x3)));
325 //src[I(index, D3Q19_SW)] =[UA] vpdf_NE - vevenPart - voddPart;
326 VSTU(&src[I(index, D3Q19_SW)],VSUB(VSUB(vpdf_NE,vevenPart),voddPart));
327 //src[I(index, D3Q19_NE)] =[UA] vpdf_SW - vevenPart + voddPart;
328 VSTU(&src[I(index, D3Q19_NE)],VADD(VSUB(vpdf_SW,vevenPart),voddPart));
329
330 //vui = vuz - vux;
331 vui = VSUB(vuz,vux);
332 //vevenPart = vomegaEven * (VONE_HALF * (vpdf_TW + vpdf_BE) - vui * vui * vw_2_nine_half - vw_2_indep);
333 vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TW,vpdf_BE)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
334 //voddPart = vomegaOdd * (VONE_HALF * (vpdf_TW - vpdf_BE) - vui * vw_2_x3);
335 voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TW,vpdf_BE)),VMUL(vui,vw_2_x3)));
336 //src[I(index, D3Q19_BE)] =[UA] vpdf_TW - vevenPart - voddPart;
337 VSTU(&src[I(index, D3Q19_BE)],VSUB(VSUB(vpdf_TW,vevenPart),voddPart));
338 //src[I(index, D3Q19_TW)] =[UA] vpdf_BE - vevenPart + voddPart;
339 VSTU(&src[I(index, D3Q19_TW)],VADD(VSUB(vpdf_BE,vevenPart),voddPart));
340
341 //vui = vux + vuz;
342 vui = VADD(vux,vuz);
343 //vevenPart = vomegaEven * (VONE_HALF * (vpdf_TE + vpdf_BW) - vui * vui * vw_2_nine_half - vw_2_indep);
344 vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TE,vpdf_BW)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
345 //voddPart = vomegaOdd * (VONE_HALF * (vpdf_TE - vpdf_BW) - vui * vw_2_x3);
346 voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TE,vpdf_BW)),VMUL(vui,vw_2_x3)));
347 //src[I(index, D3Q19_BW)] =[UA] vpdf_TE - vevenPart - voddPart;
348 VSTU(&src[I(index, D3Q19_BW)],VSUB(VSUB(vpdf_TE,vevenPart),voddPart));
349 //src[I(index, D3Q19_TE)] =[UA] vpdf_BW - vevenPart + voddPart;
350 VSTU(&src[I(index, D3Q19_TE)],VADD(VSUB(vpdf_BW,vevenPart),voddPart));
351
352 //vui = vuz - vuy;
353 vui = VSUB(vuz,vuy);
354 //vevenPart = vomegaEven * (VONE_HALF * (vpdf_TS + vpdf_BN) - vui * vui * vw_2_nine_half - vw_2_indep);
355 vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TS,vpdf_BN)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
356 //voddPart = vomegaOdd * (VONE_HALF * (vpdf_TS - vpdf_BN) - vui * vw_2_x3);
357 voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TS,vpdf_BN)),VMUL(vui,vw_2_x3)));
358 //src[I(index, D3Q19_BN)] =[UA] vpdf_TS - vevenPart - voddPart;
359 VSTU(&src[I(index, D3Q19_BN)],VSUB(VSUB(vpdf_TS,vevenPart),voddPart));
360 //src[I(index, D3Q19_TS)] =[UA] vpdf_BN - vevenPart + voddPart;
361 VSTU(&src[I(index, D3Q19_TS)],VADD(VSUB(vpdf_BN,vevenPart),voddPart));
362
363 //vui = vuy + vuz;
364 vui = VADD(vuy,vuz);
365 //vevenPart = vomegaEven * (VONE_HALF * (vpdf_TN + vpdf_BS) - vui * vui * vw_2_nine_half - vw_2_indep);
366 vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TN,vpdf_BS)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
367 //voddPart = vomegaOdd * (VONE_HALF * (vpdf_TN - vpdf_BS) - vui * vw_2_x3);
368 voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TN,vpdf_BS)),VMUL(vui,vw_2_x3)));
369 //src[I(index, D3Q19_BS)] =[UA] vpdf_TN - vevenPart - voddPart;
370 VSTU(&src[I(index, D3Q19_BS)],VSUB(VSUB(vpdf_TN,vevenPart),voddPart));
371 //src[I(index, D3Q19_TN)] =[UA] vpdf_BS - vevenPart + voddPart;
372 VSTU(&src[I(index, D3Q19_TN)],VADD(VSUB(vpdf_BS,vevenPart),voddPart));
373
374 } // loop over fluid nodes
375
376 for (int index = indexStopVec; index < indexStop; ++index) {
377
378 #define X(name, idx, idxinv, _x, _y, _z) JOIN(pdf_,name) = src[I(index, idx)];
379 D3Q19_LIST
380 #undef X
381
382
383 ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE -
384 pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW;
385 uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN -
386 pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS;
387 uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS -
388 pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS;
389
390 dens = pdf_C +
391 pdf_N + pdf_E + pdf_S + pdf_W +
392 pdf_NE + pdf_SE + pdf_SW + pdf_NW +
393 pdf_T + pdf_TN + pdf_TE + pdf_TS + pdf_TW +
394 pdf_B + pdf_BN + pdf_BE + pdf_BS + pdf_BW;
395
0fde6e45 396 dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz) * F(3.0) / F(2.0);
10988083
MW
397
398 // direction: w_0
399 src[I(index, D3Q19_C) ] = pdf_C - omegaEven*(pdf_C - w_0*dir_indep_trm);
400
401 // direction: w_1
402 w_1_indep = w_1*dir_indep_trm;
403
404 ui = uy;
0fde6e45
MW
405 evenPart = omegaEven*( F(0.5)*(pdf_N + pdf_S) - ui*ui*w_1_nine_half - w_1_indep );
406 oddPart = omegaOdd*(F(0.5)*(pdf_N - pdf_S) - ui*w_1_x3 );
10988083
MW
407 src[I(index, D3Q19_S)] = pdf_N - evenPart - oddPart;
408 src[I(index, D3Q19_N)] = pdf_S - evenPart + oddPart;
409
410 ui = ux;
0fde6e45
MW
411 evenPart = omegaEven*( F(0.5)*(pdf_E + pdf_W) - ui*ui*w_1_nine_half - w_1_indep );
412 oddPart = omegaOdd*(F(0.5)*(pdf_E - pdf_W) - ui*w_1_x3 );
10988083
MW
413 src[I(index, D3Q19_W)] = pdf_E - evenPart - oddPart;
414 src[I(index, D3Q19_E)] = pdf_W - evenPart + oddPart;
415
416 ui = uz;
0fde6e45
MW
417 evenPart = omegaEven*( F(0.5)*(pdf_T + pdf_B) - ui*ui*w_1_nine_half - w_1_indep );
418 oddPart = omegaOdd*(F(0.5)*(pdf_T - pdf_B) - ui*w_1_x3 );
10988083
MW
419 src[I(index, D3Q19_B)] = pdf_T - evenPart - oddPart;
420 src[I(index, D3Q19_T)] = pdf_B - evenPart + oddPart;
421
422 // direction: w_2
423 w_2_indep = w_2*dir_indep_trm;
424
425 ui = -ux + uy;
0fde6e45
MW
426 evenPart = omegaEven*( F(0.5)*(pdf_NW + pdf_SE) - ui*ui*w_2_nine_half - w_2_indep );
427 oddPart = omegaOdd*(F(0.5)*(pdf_NW - pdf_SE) - ui*w_2_x3 );
10988083
MW
428 src[I(index, D3Q19_SE)] = pdf_NW - evenPart - oddPart;
429 src[I(index, D3Q19_NW)] = pdf_SE - evenPart + oddPart;
430
431 ui = ux + uy;
0fde6e45
MW
432 evenPart = omegaEven*( F(0.5)*(pdf_NE + pdf_SW) - ui*ui*w_2_nine_half - w_2_indep );
433 oddPart = omegaOdd*(F(0.5)*(pdf_NE - pdf_SW) - ui*w_2_x3 );
10988083
MW
434 src[I(index, D3Q19_SW)] = pdf_NE - evenPart - oddPart;
435 src[I(index, D3Q19_NE)] = pdf_SW - evenPart + oddPart;
436
437 ui = -ux + uz;
0fde6e45
MW
438 evenPart = omegaEven*( F(0.5)*(pdf_TW + pdf_BE) - ui*ui*w_2_nine_half - w_2_indep );
439 oddPart = omegaOdd*(F(0.5)*(pdf_TW - pdf_BE) - ui*w_2_x3 );
10988083
MW
440 src[I(index, D3Q19_BE)] = pdf_TW - evenPart - oddPart;
441 src[I(index, D3Q19_TW)] = pdf_BE - evenPart + oddPart;
442
443 ui = ux + uz;
0fde6e45
MW
444 evenPart = omegaEven*( F(0.5)*(pdf_TE + pdf_BW) - ui*ui*w_2_nine_half - w_2_indep );
445 oddPart = omegaOdd*(F(0.5)*(pdf_TE - pdf_BW) - ui*w_2_x3 );
10988083
MW
446 src[I(index, D3Q19_BW)] = pdf_TE - evenPart - oddPart;
447 src[I(index, D3Q19_TE)] = pdf_BW - evenPart + oddPart;
448
449 ui = -uy + uz;
0fde6e45
MW
450 evenPart = omegaEven*( F(0.5)*(pdf_TS + pdf_BN) - ui*ui*w_2_nine_half - w_2_indep );
451 oddPart = omegaOdd*(F(0.5)*(pdf_TS - pdf_BN) - ui*w_2_x3 );
10988083
MW
452 src[I(index, D3Q19_BN)] = pdf_TS - evenPart - oddPart;
453 src[I(index, D3Q19_TS)] = pdf_BN - evenPart + oddPart;
454
455 ui = uy + uz;
0fde6e45
MW
456 evenPart = omegaEven*( F(0.5)*(pdf_TN + pdf_BS) - ui*ui*w_2_nine_half - w_2_indep );
457 oddPart = omegaOdd*(F(0.5)*(pdf_TN - pdf_BS) - ui*w_2_x3 );
10988083
MW
458 src[I(index, D3Q19_BS)] = pdf_TN - evenPart - oddPart;
459 src[I(index, D3Q19_TN)] = pdf_BS - evenPart + oddPart;
460
461 } // loop over fluid nodes
462
463 #undef I
464
465 return;
466}
467
468static void KernelOdd(LatticeDesc * ld, KernelData * kernelData, CaseData * cd)
469{
470
471 Assert(ld != NULL);
472 Assert(kernelData != NULL);
473 Assert(cd != NULL);
474
0fde6e45
MW
475 Assert(cd->Omega > F(0.0));
476 Assert(cd->Omega < F(2.0));
10988083
MW
477
478 KernelData * kd = (KernelData *)kernelData;
479 KernelDataList * kdl = KDL(kernelData);
480 KernelDataListRia * kdlr = KDLR(kernelData);
481
482 PdfT omega = cd->Omega;
483 PdfT omegaEven = omega;
484
0fde6e45
MW
485 PdfT magicParam = F(1.0) / F(12.0);
486 PdfT omegaOdd = F(1.0) / (F(0.5) + magicParam / (F(1.0) / omega - F(0.5)));
10988083 487
0fde6e45
MW
488 PdfT evenPart = F(0.0);
489 PdfT oddPart = F(0.0);
490 PdfT dir_indep_trm = F(0.0);
10988083 491
0fde6e45
MW
492 const PdfT w_0 = F(1.0) / F( 3.0);
493 const PdfT w_1 = F(1.0) / F(18.0);
494 const PdfT w_2 = F(1.0) / F(36.0);
10988083 495
0fde6e45
MW
496 const PdfT w_1_x3 = w_1 * F(3.0); const PdfT w_1_nine_half = w_1 * F(9.0) / F(2.0); PdfT w_1_indep = F(0.0);
497 const PdfT w_2_x3 = w_2 * F(3.0); const PdfT w_2_nine_half = w_2 * F(9.0) / F(2.0); PdfT w_2_indep = F(0.0);
10988083
MW
498
499 PdfT ui;
500
501 PdfT ux, uy, uz;
502 PdfT dens;
503
504
0fde6e45
MW
505 VPDFT VONE_HALF = VSET(F(0.5));
506 VPDFT VTHREE_HALF = VSET(F(3.0) / F(2.0));
10988083
MW
507
508 VPDFT vw_1_indep, vw_2_indep;
509 VPDFT vw_0 = VSET(w_0);
510 VPDFT vw_1 = VSET(w_1);
511 VPDFT vw_2 = VSET(w_2);
512
513 VPDFT vw_1_x3 = VSET(w_1_x3);
514 VPDFT vw_2_x3 = VSET(w_2_x3);
515 VPDFT vw_1_nine_half = VSET(w_1_nine_half);
516 VPDFT vw_2_nine_half = VSET(w_2_nine_half);
517
518 VPDFT vui, vux, vuy, vuz, vdens;
519
520 VPDFT vevenPart, voddPart, vdir_indep_trm;
521
522 VPDFT vomegaEven = VSET(omegaEven);
523 VPDFT vomegaOdd = VSET(omegaOdd);
524
525
526 // Declare pdf_N, pdf_E, pdf_S, pdf_W, ...
527 #define X(name, idx, idxinv, x, y, z) PdfT JOIN(pdf_,name); VPDFT JOIN(vpdf_,name);
528 D3Q19_LIST
529 #undef X
530
531 // Declare pointers to pdfs ppdf_N, ppdf_E, ppdf_S, ppdf_W, ...
532 #define X(name, idx, idxinv, x, y, z) PdfT * JOIN(ppdf_,name) = NULL;
533 D3Q19_LIST
534 #undef X
535
536 uint32_t nConsecNodes = kdlr->nConsecNodes;
537 uint32_t * consecNodes = kdlr->ConsecNodes;
538 uint32_t consecIndex = 0;
539 uint32_t consecValue = 0;
540
541#ifndef DEBUG
542 UNUSED(nConsecNodes);
543#endif
544
545 PdfT * src = kd->Pdfs[0];
546
547 int nCells = kdl->nCells;
548
549 uint32_t adjListIndex;
550 uint32_t * adjList = kdl->AdjList;
551
552 int threadId = 0;
553
554 #ifdef _OPENMP
555 threadId = omp_get_thread_num();
556 #endif
557
558 consecIndex = kdlr->ConsecThreadIndices[threadId];
559 consecValue = 0;
560
561 int * threadIndices = kdlr->FluidNodeThreadIndices;
562
563 int nFluidThread = threadIndices[threadId + 1] - threadIndices[threadId];
564
565 int indexStart = threadIndices[threadId];
566 int indexStop = threadIndices[threadId] + nFluidThread;
567
568 #define I(index, dir) P_INDEX_3((nCells), (index), (dir))
569
570 #define ADJ_LIST(dir) adjList[adjListIndex + (dir)]
571
572 int pointerOffset = 1;
573
574 for (int index = indexStart; index < indexStop; index += 1) {
575
576 if (consecValue > 0) {
577 --consecValue;
578 // Increment all pdf pointers by an offset. If the previous iteration was
579 // scalar, increment only by one. If the previous iteration was vectorized,
580 // increment by the vector width. These offsets are set in the corresponding
581 // if branches.
582 #define X(name, idx, idxinv, _x, _y, _z) JOIN(ppdf_,name) += pointerOffset;
583 D3Q19_LIST
584 #undef X
585 }
586 else {
587 Assert(consecIndex < nConsecNodes);
588
589 consecValue = consecNodes[consecIndex] - 1;
590 // Load new pointers to PDFs of local cell:
591
592 adjListIndex = index * N_D3Q19_IDX;
593
594 #define X(name, idx, idxinv, _x, _y, _z) JOIN(ppdf_,name) = &(src[adjList[adjListIndex + idxinv]]);
595 D3Q19_LIST_WO_C
596 #undef X
597
598 ppdf_C = &(src[P_INDEX_3(nCells, index, D3Q19_C)]);
599 ++consecIndex;
600 }
601
602 #define X(name, idx, idxinv, _x, _y, _z) JOIN(pdf_,name) = *JOIN(ppdf_,name);
603 D3Q19_LIST
604 #undef X
605
606 if (consecValue >= (VSIZE - 1)) {
607 // Vectorized part.
608
609 #define X(name, idx, idxinv, _x, _y, _z) JOIN(vpdf_,name) = VLDU(JOIN(ppdf_,name));
610 D3Q19_LIST_WO_C
611 #undef X
612
613 vpdf_C = VLDU(ppdf_C);
614
615 //vux = vpdf_E + vpdf_NE + vpdf_SE + vpdf_TE + vpdf_BE -
616 // vpdf_W - vpdf_NW - vpdf_SW - vpdf_TW - vpdf_BW;
617 vux = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_E,VADD(vpdf_NE,vpdf_SE)),VADD(vpdf_TE,vpdf_BE)),vpdf_W),vpdf_NW),vpdf_SW),vpdf_TW),vpdf_BW);
618 //vuy = vpdf_N + vpdf_NE + vpdf_NW + vpdf_TN + vpdf_BN -
619 // vpdf_S - vpdf_SE - vpdf_SW - vpdf_TS - vpdf_BS;
620 vuy = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_N,VADD(vpdf_NE,vpdf_NW)),VADD(vpdf_TN,vpdf_BN)),vpdf_S),vpdf_SE),vpdf_SW),vpdf_TS),vpdf_BS);
621 //vuz = vpdf_T + vpdf_TE + vpdf_TW + vpdf_TN + vpdf_TS -
622 // vpdf_B - vpdf_BE - vpdf_BW - vpdf_BN - vpdf_BS;
623 vuz = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_T,VADD(vpdf_TE,vpdf_TW)),VADD(vpdf_TN,vpdf_TS)),vpdf_B),vpdf_BE),vpdf_BW),vpdf_BN),vpdf_BS);
624
625 //vdens = vpdf_C +
626 // vpdf_N + vpdf_E + vpdf_S + vpdf_W +
627 // vpdf_NE + vpdf_SE + vpdf_SW + vpdf_NW +
628 // vpdf_T + vpdf_TN + vpdf_TE + vpdf_TS + vpdf_TW +
629 // vpdf_B + vpdf_BN + vpdf_BE + vpdf_BS + vpdf_BW;
630 vdens = VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(vpdf_C,VADD(vpdf_N,vpdf_E)),VADD(vpdf_S,vpdf_W)),VADD(vpdf_NE,vpdf_SE)),
631 VADD(vpdf_SW,vpdf_NW)),VADD(vpdf_T,vpdf_TN)),VADD(vpdf_TE,vpdf_TS)),VADD(vpdf_TW,vpdf_B)),VADD(vpdf_BN,vpdf_BE)),VADD(vpdf_BS,vpdf_BW));
632
633 //vdir_indep_trm = vdens - (vux * vux + vuy * vuy + vuz * vuz) * VTHREE_HALF;
634 vdir_indep_trm = VSUB(vdens,VMUL(VADD(VADD(VMUL(vux,vux),VMUL(vuy,vuy)),VMUL(vuz,vuz)),VTHREE_HALF));
635
636 adjListIndex = index * N_D3Q19_IDX;
637
638 //src[I(index, D3Q19_C)] =[UA] vpdf_C - vomegaEven * (vpdf_C - vw_0 * vdir_indep_trm);
639 VSTU(&src[I(index, D3Q19_C)],VSUB(vpdf_C,VMUL(vomegaEven,VSUB(vpdf_C,VMUL(vw_0,vdir_indep_trm)))));
640
641 //vw_1_indep = vw_1 * vdir_indep_trm;
642 vw_1_indep = VMUL(vw_1,vdir_indep_trm);
643
644 //vui = vuy;
645 vui = vuy;
646 //vevenPart = vomegaEven * (VONE_HALF * (vpdf_N + vpdf_S) - vui * vui * vw_1_nine_half - vw_1_indep);
647 vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_N,vpdf_S)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep));
648 //voddPart = vomegaOdd * (VONE_HALF * (vpdf_N - vpdf_S) - vui * vw_1_x3);
649 voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_N,vpdf_S)),VMUL(vui,vw_1_x3)));
650 //src[ADJ_LIST(D3Q19_N)] =[UA] vpdf_N - vevenPart - voddPart;
651 VSTU(ppdf_S, VSUB(VSUB(vpdf_N,vevenPart),voddPart));
652 //src[ADJ_LIST(D3Q19_S)] =[UA] vpdf_S - vevenPart + voddPart;
653 VSTU(ppdf_N, VADD(VSUB(vpdf_S,vevenPart),voddPart));
654
655 //vui = vux;
656 vui = vux;
657 //vevenPart = vomegaEven * (VONE_HALF * (vpdf_E + vpdf_W) - vui * vui * vw_1_nine_half - vw_1_indep);
658 vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_E,vpdf_W)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep));
659 //voddPart = vomegaOdd * (VONE_HALF * (vpdf_E - vpdf_W) - vui * vw_1_x3);
660 voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_E,vpdf_W)),VMUL(vui,vw_1_x3)));
661 //src[ADJ_LIST(D3Q19_E)] =[UA] vpdf_E - vevenPart - voddPart;
662 VSTU(ppdf_W, VSUB(VSUB(vpdf_E,vevenPart),voddPart));
663 //src[ADJ_LIST(D3Q19_W)] =[UA] vpdf_W - vevenPart + voddPart;
664 VSTU(ppdf_E, VADD(VSUB(vpdf_W,vevenPart),voddPart));
665
666 //vui = vuz;
667 vui = vuz;
668 //vevenPart = vomegaEven * (VONE_HALF * (vpdf_T + vpdf_B) - vui * vui * vw_1_nine_half - vw_1_indep);
669 vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_T,vpdf_B)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep));
670 //voddPart = vomegaOdd * (VONE_HALF * (vpdf_T - vpdf_B) - vui * vw_1_x3);
671 voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_T,vpdf_B)),VMUL(vui,vw_1_x3)));
672 //src[ADJ_LIST(D3Q19_T)] =[UA] vpdf_T - vevenPart - voddPart;
673 VSTU(ppdf_B, VSUB(VSUB(vpdf_T,vevenPart),voddPart));
674 //src[ADJ_LIST(D3Q19_B)] =[UA] vpdf_B - vevenPart + voddPart;
675 VSTU(ppdf_T, VADD(VSUB(vpdf_B,vevenPart),voddPart));
676
677 //vw_2_indep = vw_2 * vdir_indep_trm;
678 vw_2_indep = VMUL(vw_2,vdir_indep_trm);
679
680 //vui = vuy - vux;
681 vui = VSUB(vuy,vux);
682 //vevenPart = vomegaEven * (VONE_HALF * (vpdf_NW + vpdf_SE) - vui * vui * vw_2_nine_half - vw_2_indep);
683 vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_NW,vpdf_SE)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
684 //voddPart = vomegaOdd * (VONE_HALF * (vpdf_NW - vpdf_SE) - vui * vw_2_x3);
685 voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_NW,vpdf_SE)),VMUL(vui,vw_2_x3)));
686 //src[ADJ_LIST(D3Q19_NW)] =[UA] vpdf_NW - vevenPart - voddPart;
687 VSTU(ppdf_SE, VSUB(VSUB(vpdf_NW,vevenPart),voddPart));
688 //src[ADJ_LIST(D3Q19_SE)] =[UA] vpdf_SE - vevenPart + voddPart;
689 VSTU(ppdf_NW, VADD(VSUB(vpdf_SE,vevenPart),voddPart));
690
691 //vui = vux + vuy;
692 vui = VADD(vux,vuy);
693 //vevenPart = vomegaEven * (VONE_HALF * (vpdf_NE + vpdf_SW) - vui * vui * vw_2_nine_half - vw_2_indep);
694 vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_NE,vpdf_SW)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
695 //voddPart = vomegaOdd * (VONE_HALF * (vpdf_NE - vpdf_SW) - vui * vw_2_x3);
696 voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_NE,vpdf_SW)),VMUL(vui,vw_2_x3)));
697 //src[ADJ_LIST(D3Q19_NE)] =[UA] vpdf_NE - vevenPart - voddPart;
698 VSTU(ppdf_SW, VSUB(VSUB(vpdf_NE,vevenPart),voddPart));
699 //src[ADJ_LIST(D3Q19_SW)] =[UA] vpdf_SW - vevenPart + voddPart;
700 VSTU(ppdf_NE, VADD(VSUB(vpdf_SW,vevenPart),voddPart));
701
702 //vui = vuz - vux;
703 vui = VSUB(vuz,vux);
704 //vevenPart = vomegaEven * (VONE_HALF * (vpdf_TW + vpdf_BE) - vui * vui * vw_2_nine_half - vw_2_indep);
705 vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TW,vpdf_BE)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
706 //voddPart = vomegaOdd * (VONE_HALF * (vpdf_TW - vpdf_BE) - vui * vw_2_x3);
707 voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TW,vpdf_BE)),VMUL(vui,vw_2_x3)));
708 //src[ADJ_LIST(D3Q19_TW)] =[UA] vpdf_TW - vevenPart - voddPart;
709 VSTU(ppdf_BE, VSUB(VSUB(vpdf_TW,vevenPart),voddPart));
710 //src[ADJ_LIST(D3Q19_BE)] =[UA] vpdf_BE - vevenPart + voddPart;
711 VSTU(ppdf_TW, VADD(VSUB(vpdf_BE,vevenPart),voddPart));
712
713 //vui = vux + vuz;
714 vui = VADD(vux,vuz);
715 //vevenPart = vomegaEven * (VONE_HALF * (vpdf_TE + vpdf_BW) - vui * vui * vw_2_nine_half - vw_2_indep);
716 vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TE,vpdf_BW)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
717 //voddPart = vomegaOdd * (VONE_HALF * (vpdf_TE - vpdf_BW) - vui * vw_2_x3);
718 voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TE,vpdf_BW)),VMUL(vui,vw_2_x3)));
719 //src[ADJ_LIST(D3Q19_TE)] =[UA] vpdf_TE - vevenPart - voddPart;
720 VSTU(ppdf_BW, VSUB(VSUB(vpdf_TE,vevenPart),voddPart));
721 //src[ADJ_LIST(D3Q19_BW)] =[UA] vpdf_BW - vevenPart + voddPart;
722 VSTU(ppdf_TE, VADD(VSUB(vpdf_BW,vevenPart),voddPart));
723
724 //vui = vuz - vuy;
725 vui = VSUB(vuz,vuy);
726 //vevenPart = vomegaEven * (VONE_HALF * (vpdf_TS + vpdf_BN) - vui * vui * vw_2_nine_half - vw_2_indep);
727 vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TS,vpdf_BN)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
728 //voddPart = vomegaOdd * (VONE_HALF * (vpdf_TS - vpdf_BN) - vui * vw_2_x3);
729 voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TS,vpdf_BN)),VMUL(vui,vw_2_x3)));
730 //src[ADJ_LIST(D3Q19_TS)] =[UA] vpdf_TS - vevenPart - voddPart;
e3f82424 731 VSTU(ppdf_BN, VSUB(VSUB(vpdf_TS,vevenPart),voddPart));
10988083 732 //src[ADJ_LIST(D3Q19_BN)] =[UA] vpdf_BN - vevenPart + voddPart;
e3f82424 733 VSTU(ppdf_TS, VADD(VSUB(vpdf_BN,vevenPart),voddPart));
10988083
MW
734
735 //vui = vuy + vuz;
736 vui = VADD(vuy,vuz);
737 //vevenPart = vomegaEven * (VONE_HALF * (vpdf_TN + vpdf_BS) - vui * vui * vw_2_nine_half - vw_2_indep);
738 vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TN,vpdf_BS)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
739 //voddPart = vomegaOdd * (VONE_HALF * (vpdf_TN - vpdf_BS) - vui * vw_2_x3);
740 voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TN,vpdf_BS)),VMUL(vui,vw_2_x3)));
741 //src[ADJ_LIST(D3Q19_TN)] =[UA] vpdf_TN - vevenPart - voddPart;
742 VSTU(ppdf_BS, VSUB(VSUB(vpdf_TN,vevenPart),voddPart));
743 //src[ADJ_LIST(D3Q19_BS)] =[UA] vpdf_BS - vevenPart + voddPart;
744 VSTU(ppdf_TN, VADD(VSUB(vpdf_BS,vevenPart),voddPart));
745
746 consecValue -= (VSIZE - 1);
747 index += (VSIZE - 1);
748 pointerOffset = VSIZE;
749
750 }
751 else {
752 // Scalar part.
753
754 #define X(name, idx, idxinv, _x, _y, _z) JOIN(pdf_,name) = *(JOIN(ppdf_,name));
755 D3Q19_LIST_WO_C
756 #undef X
757
758 pdf_C = *ppdf_C;
759
760 ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE -
761 pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW;
762 uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN -
763 pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS;
764 uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS -
765 pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS;
766
767 dens = pdf_C +
768 pdf_N + pdf_E + pdf_S + pdf_W +
769 pdf_NE + pdf_SE + pdf_SW + pdf_NW +
770 pdf_T + pdf_TN + pdf_TE + pdf_TS + pdf_TW +
771 pdf_B + pdf_BN + pdf_BE + pdf_BS + pdf_BW;
772
0fde6e45 773 dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz) * F(3.0) / F(2.0);
10988083
MW
774
775 adjListIndex = index * N_D3Q19_IDX;
776
777 // direction: w_0
778 src[I(index, D3Q19_C) ] = pdf_C - omegaEven * (pdf_C - w_0 * dir_indep_trm);
779
780 // direction: w_1
781 w_1_indep = w_1 * dir_indep_trm;
782
783 ui = uy;
0fde6e45
MW
784 evenPart = omegaEven * (F(0.5) * (pdf_N + pdf_S) - ui * ui * w_1_nine_half - w_1_indep);
785 oddPart = omegaOdd * (F(0.5) * (pdf_N - pdf_S) - ui * w_1_x3);
e3f82424
MW
786 *ppdf_S = pdf_N - evenPart - oddPart;
787 *ppdf_N = pdf_S - evenPart + oddPart;
10988083
MW
788
789 ui = ux;
0fde6e45
MW
790 evenPart = omegaEven * (F(0.5) * (pdf_E + pdf_W) - ui * ui * w_1_nine_half - w_1_indep);
791 oddPart = omegaOdd * (F(0.5) * (pdf_E - pdf_W) - ui * w_1_x3);
e3f82424
MW
792 *ppdf_W = pdf_E - evenPart - oddPart;
793 *ppdf_E = pdf_W - evenPart + oddPart;
10988083
MW
794
795 ui = uz;
0fde6e45
MW
796 evenPart = omegaEven * (F(0.5) * (pdf_T + pdf_B) - ui * ui * w_1_nine_half - w_1_indep);
797 oddPart = omegaOdd * (F(0.5) * (pdf_T - pdf_B) - ui * w_1_x3);
e3f82424
MW
798 *ppdf_B = pdf_T - evenPart - oddPart;
799 *ppdf_T = pdf_B - evenPart + oddPart;
10988083
MW
800
801 // direction: w_2
802 w_2_indep = w_2 * dir_indep_trm;
803
804 ui = -ux + uy;
0fde6e45
MW
805 evenPart = omegaEven * (F(0.5) * (pdf_NW + pdf_SE) - ui * ui * w_2_nine_half - w_2_indep);
806 oddPart = omegaOdd * (F(0.5) * (pdf_NW - pdf_SE) - ui * w_2_x3);
e3f82424
MW
807 *ppdf_SE = pdf_NW - evenPart - oddPart;
808 *ppdf_NW = pdf_SE - evenPart + oddPart;
10988083
MW
809
810 ui = ux + uy;
0fde6e45
MW
811 evenPart = omegaEven * (F(0.5) * (pdf_NE + pdf_SW) - ui * ui * w_2_nine_half - w_2_indep);
812 oddPart = omegaOdd * (F(0.5) * (pdf_NE - pdf_SW) - ui * w_2_x3);
e3f82424
MW
813 *ppdf_SW = pdf_NE - evenPart - oddPart;
814 *ppdf_NE = pdf_SW - evenPart + oddPart;
10988083
MW
815
816 ui = -ux + uz;
0fde6e45
MW
817 evenPart = omegaEven * (F(0.5) * (pdf_TW + pdf_BE) - ui * ui * w_2_nine_half - w_2_indep);
818 oddPart = omegaOdd * (F(0.5) * (pdf_TW - pdf_BE) - ui * w_2_x3);
e3f82424
MW
819 *ppdf_BE = pdf_TW - evenPart - oddPart;
820 *ppdf_TW = pdf_BE - evenPart + oddPart;
10988083
MW
821
822 ui = ux + uz;
0fde6e45
MW
823 evenPart = omegaEven * (F(0.5) * (pdf_TE + pdf_BW) - ui * ui * w_2_nine_half - w_2_indep);
824 oddPart = omegaOdd * (F(0.5) * (pdf_TE - pdf_BW) - ui * w_2_x3);
e3f82424
MW
825 *ppdf_BW = pdf_TE - evenPart - oddPart;
826 *ppdf_TE = pdf_BW - evenPart + oddPart;
10988083
MW
827
828 ui = -uy + uz;
0fde6e45
MW
829 evenPart = omegaEven * (F(0.5) * (pdf_TS + pdf_BN) - ui * ui * w_2_nine_half - w_2_indep);
830 oddPart = omegaOdd * (F(0.5) * (pdf_TS - pdf_BN) - ui * w_2_x3);
e3f82424
MW
831 *ppdf_BN = pdf_TS - evenPart - oddPart;
832 *ppdf_TS = pdf_BN - evenPart + oddPart;
10988083
MW
833
834 ui = uy + uz;
0fde6e45
MW
835 evenPart = omegaEven * (F(0.5) * (pdf_TN + pdf_BS) - ui * ui * w_2_nine_half - w_2_indep);
836 oddPart = omegaOdd * (F(0.5) * (pdf_TN - pdf_BS) - ui * w_2_x3);
e3f82424
MW
837 *ppdf_BS = pdf_TN - evenPart - oddPart;
838 *ppdf_TN = pdf_BS - evenPart + oddPart;
10988083
MW
839
840 pointerOffset = 1;
841 }
842
843 } // loop over fluid nodes
844
845 #undef ADJ_LIST
846 #undef I
847}
This page took 0.151577 seconds and 5 git commands to generate.