8ae0c2cd1cf735c6b36990c74050dc0795a20293
[LbmBenchmarkKernelsPublic.git] / src / BenchKernelD3Q19ListAaPv.c
1 // --------------------------------------------------------------------------
2 //
3 // Copyright
4 //   Markus Wittmann, 2016-2017
5 //   RRZE, University of Erlangen-Nuremberg, Germany
6 //   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
7 //
8 //   Viktor Haag, 2016
9 //   LSS, University of Erlangen-Nuremberg, Germany
10 //
11 //  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
12 //
13 //  LbmBenchKernels is free software: you can redistribute it and/or modify
14 //  it under the terms of the GNU General Public License as published by
15 //  the Free Software Foundation, either version 3 of the License, or
16 //  (at your option) any later version.
17 //
18 //  LbmBenchKernels is distributed in the hope that it will be useful,
19 //  but WITHOUT ANY WARRANTY; without even the implied warranty of
20 //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21 //  GNU General Public License for more details.
22 //
23 //  You should have received a copy of the GNU General Public License
24 //  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
25 //
26 // --------------------------------------------------------------------------
27 #include "BenchKernelD3Q19ListAaPvCommon.h"
28
29 #include "Memory.h"
30 #include "Vtk.h"
31 #include "Vector.h"
32 #include "LikwidIf.h"
33
34 #include <inttypes.h>
35 #include <math.h>
36
37 #ifdef _OPENMP
38         #include <omp.h>
39 #endif
40
41
42 static void KernelEven(LatticeDesc * ld, KernelData * kernelData, CaseData * cd);
43 static void KernelOdd( LatticeDesc * ld, KernelData * kernelData, CaseData * cd);
44
45 void FNAME(D3Q19ListAaPvKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd)
46 {
47
48         Assert(ld != NULL);
49         Assert(kernelData != NULL);
50         Assert(cd != NULL);
51
52         Assert(cd->Omega > F(0.0));
53         Assert(cd->Omega < F(2.0));;
54
55 #if defined(VTK_OUTPUT) || defined(STATISTICS) || defined(VERIFICATION)
56         KernelData     * kd  = (KernelData *)kernelData;
57         KernelDataList * kdl = KDL(kernelData);
58 #endif
59
60         int maxIterations = cd->MaxIterations;
61
62         int nThreads = 1;
63 #ifdef _OPENMP
64         nThreads = omp_get_max_threads();
65 #endif
66
67         #ifdef VTK_OUTPUT
68                 if (cd->VtkOutput) {
69                         kd->PdfsActive = kd->Pdfs[0];
70                         VtkWrite(ld, kd, cd, -1);
71                 }
72         #endif
73
74         #ifdef STATISTICS
75                 kd->PdfsActive = kd->Pdfs[0];
76                 KernelStatistics(kd, ld, cd, 0);
77         #endif
78
79         // TODO: outer openmp parallel
80
81         for(int iter = 0; iter < maxIterations; iter += 2) {
82
83                 // ---------------------------------------------------
84                 // even time step
85                 // ---------------------------------------------------
86
87                 X_LIKWID_START("list-aa-pv-even");
88
89                 #ifdef _OPENMP
90                         #pragma omp parallel default(none) shared(ld, kernelData, cd)
91                 #endif
92                 {
93                         KernelEven(ld, kernelData, cd);
94                 }
95
96                 X_LIKWID_STOP("list-aa-pv-even");
97
98                 #ifdef VERIFICATION
99                         kdl->Iteration = iter;
100                         kd->PdfsActive = kd->Pdfs[0];
101                         KernelAddBodyForce(kd, ld, cd);
102                 #endif
103
104                 // ---------------------------------------------------
105                 // odd time step
106                 // ---------------------------------------------------
107
108                 X_LIKWID_START("list-aa-pv-odd");
109
110                 #ifdef _OPENMP
111                         #pragma omp parallel default(none) shared(ld, kernelData, cd)
112                 #endif
113                 {
114                         KernelOdd(ld, kernelData, cd);
115                 }
116
117                 X_LIKWID_STOP("list-aa-pv-odd");
118
119
120                 #ifdef VERIFICATION
121                         kdl->Iteration = iter + 1;
122                         kd->PdfsActive = kd->Pdfs[0];
123                         KernelAddBodyForce(kd, ld, cd);
124                 #endif
125
126                 #ifdef VTK_OUTPUT
127                         if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) {
128                                 kdl->Iteration = iter + 1;
129                                 kd->PdfsActive = kd->Pdfs[0];
130                                 VtkWrite(ld, kd, cd, iter);
131                         }
132                 #endif
133
134                 #ifdef STATISTICS
135                         kdl->Iteration = iter + 1;
136                         kd->PdfsActive = kd->Pdfs[0];
137                         KernelStatistics(kd, ld, cd, iter);
138                 #endif
139
140         } // for (int iter = 0; ...
141
142 #ifdef VTK_OUTPUT
143         if (cd->VtkOutput) {
144                 kd->PdfsActive = kd->Pdfs[0];
145                 VtkWrite(ld, kd, cd, maxIterations);
146         }
147 #endif
148
149 #ifdef STATISTICS
150         kd->PdfsActive = kd->Pdfs[0];
151         KernelStatistics(kd, ld, cd, maxIterations);
152 #endif
153
154         return;
155 }
156
157 static void KernelEven(LatticeDesc * ld, KernelData * kernelData, CaseData * cd)
158 {
159         Assert(ld != NULL);
160         Assert(kernelData != NULL);
161         Assert(cd != NULL);
162
163         Assert(cd->Omega > F(0.0));
164         Assert(cd->Omega < F(2.0));
165
166         KernelData        * kd   = (KernelData *)kernelData;
167         KernelDataList    * kdl  = KDL(kernelData);
168         KernelDataListRia * kdlr = KDLR(kernelData);
169
170         PdfT omega = cd->Omega;
171         PdfT omegaEven = omega;
172
173         PdfT magicParam = F(1.0) / F(12.0);
174         PdfT omegaOdd = F(1.0) / (F(0.5) + magicParam / (F(1.0) / omega - F(0.5)));
175
176         PdfT evenPart = F(0.0);
177         PdfT oddPart = F(0.0);
178         PdfT dir_indep_trm = F(0.0);
179
180         const PdfT w_0 = F(1.0) / F( 3.0);
181         const PdfT w_1 = F(1.0) / F(18.0);
182         const PdfT w_2 = F(1.0) / F(36.0);
183
184         const PdfT w_1_x3 = w_1 * F(3.0);       const PdfT w_1_nine_half = w_1 * F(9.0) / F(2.0);       PdfT w_1_indep = F(0.0);
185         const PdfT w_2_x3 = w_2 * F(3.0);       const PdfT w_2_nine_half = w_2 * F(9.0) / F(2.0);       PdfT w_2_indep = F(0.0);
186
187         PdfT ui;
188
189         PdfT ux, uy, uz;
190         PdfT dens;
191
192
193         VPDFT VONE_HALF = VSET(F(0.5));
194         VPDFT VTHREE_HALF = VSET(F(3.0) / F(2.0));
195
196         VPDFT vw_1_indep, vw_2_indep;
197         VPDFT vw_0 = VSET(w_0);
198         VPDFT vw_1 = VSET(w_1);
199         VPDFT vw_2 = VSET(w_2);
200
201         VPDFT vw_1_x3 = VSET(w_1_x3);
202         VPDFT vw_2_x3 = VSET(w_2_x3);
203         VPDFT vw_1_nine_half = VSET(w_1_nine_half);
204         VPDFT vw_2_nine_half = VSET(w_2_nine_half);
205
206         VPDFT vui, vux, vuy, vuz, vdens;
207
208         VPDFT vevenPart, voddPart, vdir_indep_trm;
209
210         VPDFT vomegaEven = VSET(omegaEven);
211         VPDFT vomegaOdd  = VSET(omegaOdd);
212
213         // Declare pdf_N, pdf_E, pdf_S, pdf_W, ...
214         #define X(name, idx, idxinv, x, y, z)   PdfT JOIN(pdf_,name); VPDFT JOIN(vpdf_,name);
215         D3Q19_LIST
216         #undef X
217
218         PdfT * src = kd->Pdfs[0];
219
220         int nCells = kdl->nCells;
221
222         int threadId = 0;
223 #ifdef _OPENMP
224         threadId =  omp_get_thread_num();
225 #endif
226
227         int * threadIndices = kdlr->FluidNodeThreadIndices;
228
229         int nFluidThread = threadIndices[threadId + 1] - threadIndices[threadId];
230         int nFluidVec    = nFluidThread - (nFluidThread % VSIZE);
231
232         int indexStartVec = threadIndices[threadId];
233         int indexStopVec  = threadIndices[threadId] + nFluidVec;
234         int indexStop     = threadIndices[threadId] + nFluidThread;
235
236         #define I(index, dir)   P_INDEX_3((nCells), (index), (dir))
237
238         for (int index = indexStartVec; index < indexStopVec; index += VSIZE) {
239
240
241                 #define X(name, idx, idxinv, _x, _y, _z)        JOIN(vpdf_,name) = VLDU(&src[I(index, idx)]);
242                 D3Q19_LIST
243                 #undef X
244
245
246                 //vux = vpdf_E + vpdf_NE + vpdf_SE + vpdf_TE + vpdf_BE -
247                 //           vpdf_W - vpdf_NW - vpdf_SW - vpdf_TW - vpdf_BW;
248                 vux = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_E,VADD(vpdf_NE,vpdf_SE)),VADD(vpdf_TE,vpdf_BE)),vpdf_W),vpdf_NW),vpdf_SW),vpdf_TW),vpdf_BW);
249                 //vuy = vpdf_N + vpdf_NE + vpdf_NW + vpdf_TN + vpdf_BN -
250                 //           vpdf_S - vpdf_SE - vpdf_SW - vpdf_TS - vpdf_BS;
251                 vuy = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_N,VADD(vpdf_NE,vpdf_NW)),VADD(vpdf_TN,vpdf_BN)),vpdf_S),vpdf_SE),vpdf_SW),vpdf_TS),vpdf_BS);
252                 //vuz = vpdf_T + vpdf_TE + vpdf_TW + vpdf_TN + vpdf_TS -
253                 //           vpdf_B - vpdf_BE - vpdf_BW - vpdf_BN - vpdf_BS;
254                 vuz = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_T,VADD(vpdf_TE,vpdf_TW)),VADD(vpdf_TN,vpdf_TS)),vpdf_B),vpdf_BE),vpdf_BW),vpdf_BN),vpdf_BS);
255
256                 //vdens = vpdf_C +
257                 //          vpdf_N  + vpdf_E  + vpdf_S  + vpdf_W  +
258                 //          vpdf_NE + vpdf_SE + vpdf_SW + vpdf_NW +
259                 //          vpdf_T  + vpdf_TN + vpdf_TE + vpdf_TS + vpdf_TW +
260                 //          vpdf_B  + vpdf_BN + vpdf_BE + vpdf_BS + vpdf_BW;
261                 vdens = VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(vpdf_C,VADD(vpdf_N,vpdf_E)),VADD(vpdf_S,vpdf_W)),VADD(vpdf_NE,vpdf_SE)),VADD(vpdf_SW,vpdf_NW)),VADD(vpdf_T,vpdf_TN)),VADD(vpdf_TE,vpdf_TS)),VADD(vpdf_TW,vpdf_B)),VADD(vpdf_BN,vpdf_BE)),VADD(vpdf_BS,vpdf_BW));
262
263                 //vdir_indep_trm = vdens - (vux * vux + vuy * vuy + vuz * vuz) * VTHREE_HALF;
264                 vdir_indep_trm = VSUB(vdens,VMUL(VADD(VADD(VMUL(vux,vux),VMUL(vuy,vuy)),VMUL(vuz,vuz)),VTHREE_HALF));
265
266                 //src[I(index, D3Q19_C)             ]  =[UA] vpdf_C - vomegaEven * (vpdf_C - vw_0 * vdir_indep_trm);
267                 VSTU(&src[I(index, D3Q19_C)],VSUB(vpdf_C,VMUL(vomegaEven,VSUB(vpdf_C,VMUL(vw_0,vdir_indep_trm)))));
268
269                 //vw_1_indep = vw_1 * vdir_indep_trm;
270                 vw_1_indep = VMUL(vw_1,vdir_indep_trm);
271
272                 //vui = vuy;
273                 vui = vuy;
274                 //vevenPart = vomegaEven * (VONE_HALF * (vpdf_N + vpdf_S) - vui * vui * vw_1_nine_half - vw_1_indep);
275                 vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_N,vpdf_S)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep));
276                 //voddPart  = vomegaOdd  * (VONE_HALF * (vpdf_N - vpdf_S) - vui * vw_1_x3);
277                 voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_N,vpdf_S)),VMUL(vui,vw_1_x3)));
278                 //src[I(index, D3Q19_S)]  =[UA] vpdf_N - vevenPart - voddPart;
279                 VSTU(&src[I(index, D3Q19_S)],VSUB(VSUB(vpdf_N,vevenPart),voddPart));
280                 //src[I(index, D3Q19_N)]  =[UA] vpdf_S - vevenPart + voddPart;
281                 VSTU(&src[I(index, D3Q19_N)],VADD(VSUB(vpdf_S,vevenPart),voddPart));
282
283                 //vui = vux;
284                 vui = vux;
285                 //vevenPart = vomegaEven * (VONE_HALF * (vpdf_E + vpdf_W) - vui * vui * vw_1_nine_half - vw_1_indep);
286                 vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_E,vpdf_W)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep));
287                 //voddPart =  vomegaOdd  * (VONE_HALF * (vpdf_E - vpdf_W) - vui * vw_1_x3 );
288                 voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_E,vpdf_W)),VMUL(vui,vw_1_x3)));
289                 //src[I(index, D3Q19_W)]  =[UA] vpdf_E - vevenPart - voddPart;
290                 VSTU(&src[I(index, D3Q19_W)],VSUB(VSUB(vpdf_E,vevenPart),voddPart));
291                 //src[I(index, D3Q19_E)]  =[UA] vpdf_W - vevenPart + voddPart;
292                 VSTU(&src[I(index, D3Q19_E)],VADD(VSUB(vpdf_W,vevenPart),voddPart));
293
294                 //vui = vuz;
295                 vui = vuz;
296                 //vevenPart = vomegaEven * (VONE_HALF * (vpdf_T + vpdf_B) - vui * vui * vw_1_nine_half - vw_1_indep);
297                 vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_T,vpdf_B)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep));
298                 //voddPart  = vomegaOdd  * (VONE_HALF * (vpdf_T - vpdf_B) - vui * vw_1_x3);
299                 voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_T,vpdf_B)),VMUL(vui,vw_1_x3)));
300                 //src[I(index, D3Q19_B)]  =[UA] vpdf_T - vevenPart - voddPart;
301                 VSTU(&src[I(index, D3Q19_B)],VSUB(VSUB(vpdf_T,vevenPart),voddPart));
302                 //src[I(index, D3Q19_T)]  =[UA] vpdf_B - vevenPart + voddPart;
303                 VSTU(&src[I(index, D3Q19_T)],VADD(VSUB(vpdf_B,vevenPart),voddPart));
304
305                 //vw_2_indep = vw_2 * vdir_indep_trm;
306                 vw_2_indep = VMUL(vw_2,vdir_indep_trm);
307
308                 //vui = vuy - vux;
309                 vui = VSUB(vuy,vux);
310                 //vevenPart = vomegaEven * (VONE_HALF * (vpdf_NW + vpdf_SE) - vui * vui * vw_2_nine_half - vw_2_indep);
311                 vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_NW,vpdf_SE)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
312                 //voddPart  = vomegaOdd  * (VONE_HALF * (vpdf_NW - vpdf_SE) - vui * vw_2_x3);
313                 voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_NW,vpdf_SE)),VMUL(vui,vw_2_x3)));
314                 //src[I(index, D3Q19_SE)] =[UA] vpdf_NW - vevenPart - voddPart;
315                 VSTU(&src[I(index, D3Q19_SE)],VSUB(VSUB(vpdf_NW,vevenPart),voddPart));
316                 //src[I(index, D3Q19_NW)] =[UA] vpdf_SE - vevenPart + voddPart;
317                 VSTU(&src[I(index, D3Q19_NW)],VADD(VSUB(vpdf_SE,vevenPart),voddPart));
318
319                 //vui = vux + vuy;
320                 vui = VADD(vux,vuy);
321                 //vevenPart = vomegaEven * (VONE_HALF * (vpdf_NE + vpdf_SW) - vui * vui * vw_2_nine_half - vw_2_indep);
322                 vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_NE,vpdf_SW)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
323                 //voddPart  = vomegaOdd  * (VONE_HALF * (vpdf_NE - vpdf_SW) - vui * vw_2_x3);
324                 voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_NE,vpdf_SW)),VMUL(vui,vw_2_x3)));
325                 //src[I(index, D3Q19_SW)] =[UA] vpdf_NE - vevenPart - voddPart;
326                 VSTU(&src[I(index, D3Q19_SW)],VSUB(VSUB(vpdf_NE,vevenPart),voddPart));
327                 //src[I(index, D3Q19_NE)] =[UA] vpdf_SW - vevenPart + voddPart;
328                 VSTU(&src[I(index, D3Q19_NE)],VADD(VSUB(vpdf_SW,vevenPart),voddPart));
329
330                 //vui = vuz - vux;
331                 vui = VSUB(vuz,vux);
332                 //vevenPart = vomegaEven * (VONE_HALF * (vpdf_TW + vpdf_BE) - vui * vui * vw_2_nine_half - vw_2_indep);
333                 vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TW,vpdf_BE)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
334                 //voddPart  = vomegaOdd  * (VONE_HALF * (vpdf_TW - vpdf_BE) - vui * vw_2_x3);
335                 voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TW,vpdf_BE)),VMUL(vui,vw_2_x3)));
336                 //src[I(index, D3Q19_BE)] =[UA] vpdf_TW - vevenPart - voddPart;
337                 VSTU(&src[I(index, D3Q19_BE)],VSUB(VSUB(vpdf_TW,vevenPart),voddPart));
338                 //src[I(index, D3Q19_TW)] =[UA] vpdf_BE - vevenPart + voddPart;
339                 VSTU(&src[I(index, D3Q19_TW)],VADD(VSUB(vpdf_BE,vevenPart),voddPart));
340
341                 //vui = vux + vuz;
342                 vui = VADD(vux,vuz);
343                 //vevenPart = vomegaEven * (VONE_HALF * (vpdf_TE + vpdf_BW) - vui * vui * vw_2_nine_half - vw_2_indep);
344                 vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TE,vpdf_BW)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
345                 //voddPart  = vomegaOdd  * (VONE_HALF * (vpdf_TE - vpdf_BW) - vui * vw_2_x3);
346                 voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TE,vpdf_BW)),VMUL(vui,vw_2_x3)));
347                 //src[I(index, D3Q19_BW)] =[UA] vpdf_TE - vevenPart - voddPart;
348                 VSTU(&src[I(index, D3Q19_BW)],VSUB(VSUB(vpdf_TE,vevenPart),voddPart));
349                 //src[I(index, D3Q19_TE)] =[UA] vpdf_BW - vevenPart + voddPart;
350                 VSTU(&src[I(index, D3Q19_TE)],VADD(VSUB(vpdf_BW,vevenPart),voddPart));
351
352                 //vui = vuz - vuy;
353                 vui = VSUB(vuz,vuy);
354                 //vevenPart = vomegaEven * (VONE_HALF * (vpdf_TS + vpdf_BN) - vui * vui * vw_2_nine_half - vw_2_indep);
355                 vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TS,vpdf_BN)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
356                 //voddPart  = vomegaOdd  * (VONE_HALF * (vpdf_TS - vpdf_BN) - vui * vw_2_x3);
357                 voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TS,vpdf_BN)),VMUL(vui,vw_2_x3)));
358                 //src[I(index, D3Q19_BN)] =[UA] vpdf_TS - vevenPart - voddPart;
359                 VSTU(&src[I(index, D3Q19_BN)],VSUB(VSUB(vpdf_TS,vevenPart),voddPart));
360                 //src[I(index, D3Q19_TS)] =[UA] vpdf_BN - vevenPart + voddPart;
361                 VSTU(&src[I(index, D3Q19_TS)],VADD(VSUB(vpdf_BN,vevenPart),voddPart));
362
363                 //vui = vuy + vuz;
364                 vui = VADD(vuy,vuz);
365                 //vevenPart = vomegaEven * (VONE_HALF * (vpdf_TN + vpdf_BS) - vui * vui * vw_2_nine_half - vw_2_indep);
366                 vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TN,vpdf_BS)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
367                 //voddPart  = vomegaOdd  * (VONE_HALF * (vpdf_TN - vpdf_BS) - vui * vw_2_x3);
368                 voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TN,vpdf_BS)),VMUL(vui,vw_2_x3)));
369                 //src[I(index, D3Q19_BS)] =[UA] vpdf_TN - vevenPart - voddPart;
370                 VSTU(&src[I(index, D3Q19_BS)],VSUB(VSUB(vpdf_TN,vevenPart),voddPart));
371                 //src[I(index, D3Q19_TN)] =[UA] vpdf_BS - vevenPart + voddPart;
372                 VSTU(&src[I(index, D3Q19_TN)],VADD(VSUB(vpdf_BS,vevenPart),voddPart));
373
374         } // loop over fluid nodes
375
376         for (int index = indexStopVec; index < indexStop; ++index) {
377
378                 #define X(name, idx, idxinv, _x, _y, _z)        JOIN(pdf_,name) = src[I(index, idx)];
379                 D3Q19_LIST
380                 #undef X
381
382
383                 ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE -
384                          pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW;
385                 uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN -
386                          pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS;
387                 uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS -
388                          pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS;
389
390                 dens = pdf_C +
391                            pdf_N  + pdf_E  + pdf_S  + pdf_W  +
392                            pdf_NE + pdf_SE + pdf_SW + pdf_NW +
393                            pdf_T  + pdf_TN + pdf_TE + pdf_TS + pdf_TW +
394                            pdf_B  + pdf_BN + pdf_BE + pdf_BS + pdf_BW;
395
396                 dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz) * F(3.0) / F(2.0);
397
398                 // direction: w_0
399                 src[I(index, D3Q19_C)             ]  = pdf_C - omegaEven*(pdf_C - w_0*dir_indep_trm);
400
401                 // direction: w_1
402                 w_1_indep = w_1*dir_indep_trm;
403
404                 ui = uy;
405                 evenPart = omegaEven*( F(0.5)*(pdf_N + pdf_S) - ui*ui*w_1_nine_half - w_1_indep );
406                 oddPart = omegaOdd*(F(0.5)*(pdf_N - pdf_S) - ui*w_1_x3 );
407                 src[I(index, D3Q19_S)]  = pdf_N - evenPart - oddPart;
408                 src[I(index, D3Q19_N)]  = pdf_S - evenPart + oddPart;
409
410                 ui = ux;
411                 evenPart = omegaEven*( F(0.5)*(pdf_E + pdf_W) - ui*ui*w_1_nine_half - w_1_indep );
412                 oddPart = omegaOdd*(F(0.5)*(pdf_E - pdf_W) - ui*w_1_x3 );
413                 src[I(index, D3Q19_W)]  = pdf_E - evenPart - oddPart;
414                 src[I(index, D3Q19_E)]  = pdf_W - evenPart + oddPart;
415
416                 ui = uz;
417                 evenPart = omegaEven*( F(0.5)*(pdf_T + pdf_B) - ui*ui*w_1_nine_half - w_1_indep );
418                 oddPart = omegaOdd*(F(0.5)*(pdf_T - pdf_B) - ui*w_1_x3 );
419                 src[I(index, D3Q19_B)]  = pdf_T - evenPart - oddPart;
420                 src[I(index, D3Q19_T)]  = pdf_B - evenPart + oddPart;
421
422                 // direction: w_2
423                 w_2_indep = w_2*dir_indep_trm;
424
425                 ui = -ux + uy;
426                 evenPart = omegaEven*( F(0.5)*(pdf_NW + pdf_SE) - ui*ui*w_2_nine_half - w_2_indep );
427                 oddPart = omegaOdd*(F(0.5)*(pdf_NW - pdf_SE) - ui*w_2_x3 );
428                 src[I(index, D3Q19_SE)] = pdf_NW - evenPart - oddPart;
429                 src[I(index, D3Q19_NW)] = pdf_SE - evenPart + oddPart;
430
431                 ui = ux + uy;
432                 evenPart = omegaEven*( F(0.5)*(pdf_NE + pdf_SW) - ui*ui*w_2_nine_half - w_2_indep );
433                 oddPart = omegaOdd*(F(0.5)*(pdf_NE - pdf_SW) - ui*w_2_x3 );
434                 src[I(index, D3Q19_SW)] = pdf_NE - evenPart - oddPart;
435                 src[I(index, D3Q19_NE)] = pdf_SW - evenPart + oddPart;
436
437                 ui = -ux + uz;
438                 evenPart = omegaEven*( F(0.5)*(pdf_TW + pdf_BE) - ui*ui*w_2_nine_half - w_2_indep );
439                 oddPart = omegaOdd*(F(0.5)*(pdf_TW - pdf_BE) - ui*w_2_x3 );
440                 src[I(index, D3Q19_BE)] = pdf_TW - evenPart - oddPart;
441                 src[I(index, D3Q19_TW)] = pdf_BE - evenPart + oddPart;
442
443                 ui = ux + uz;
444                 evenPart = omegaEven*( F(0.5)*(pdf_TE + pdf_BW) - ui*ui*w_2_nine_half - w_2_indep );
445                 oddPart = omegaOdd*(F(0.5)*(pdf_TE - pdf_BW) - ui*w_2_x3 );
446                 src[I(index, D3Q19_BW)] = pdf_TE - evenPart - oddPart;
447                 src[I(index, D3Q19_TE)] = pdf_BW - evenPart + oddPart;
448
449                 ui = -uy + uz;
450                 evenPart = omegaEven*( F(0.5)*(pdf_TS + pdf_BN) - ui*ui*w_2_nine_half - w_2_indep );
451                 oddPart = omegaOdd*(F(0.5)*(pdf_TS - pdf_BN) - ui*w_2_x3 );
452                 src[I(index, D3Q19_BN)] = pdf_TS - evenPart - oddPart;
453                 src[I(index, D3Q19_TS)] = pdf_BN - evenPart + oddPart;
454
455                 ui = uy + uz;
456                 evenPart = omegaEven*( F(0.5)*(pdf_TN + pdf_BS) - ui*ui*w_2_nine_half - w_2_indep );
457                 oddPart = omegaOdd*(F(0.5)*(pdf_TN - pdf_BS) - ui*w_2_x3 );
458                 src[I(index, D3Q19_BS)] = pdf_TN - evenPart - oddPart;
459                 src[I(index, D3Q19_TN)] = pdf_BS - evenPart + oddPart;
460
461         } // loop over fluid nodes
462
463         #undef I
464
465         return;
466 }
467
468 static void KernelOdd(LatticeDesc * ld, KernelData * kernelData, CaseData * cd)
469 {
470
471         Assert(ld != NULL);
472         Assert(kernelData != NULL);
473         Assert(cd != NULL);
474
475         Assert(cd->Omega > F(0.0));
476         Assert(cd->Omega < F(2.0));
477
478         KernelData        * kd   = (KernelData *)kernelData;
479         KernelDataList    * kdl  = KDL(kernelData);
480         KernelDataListRia * kdlr = KDLR(kernelData);
481
482         PdfT omega = cd->Omega;
483         PdfT omegaEven = omega;
484
485         PdfT magicParam = F(1.0) / F(12.0);
486         PdfT omegaOdd = F(1.0) / (F(0.5) + magicParam / (F(1.0) / omega - F(0.5)));
487
488         PdfT evenPart = F(0.0);
489         PdfT oddPart = F(0.0);
490         PdfT dir_indep_trm = F(0.0);
491
492         const PdfT w_0 = F(1.0) / F( 3.0);
493         const PdfT w_1 = F(1.0) / F(18.0);
494         const PdfT w_2 = F(1.0) / F(36.0);
495
496         const PdfT w_1_x3 = w_1 * F(3.0);       const PdfT w_1_nine_half = w_1 * F(9.0) / F(2.0);       PdfT w_1_indep = F(0.0);
497         const PdfT w_2_x3 = w_2 * F(3.0);       const PdfT w_2_nine_half = w_2 * F(9.0) / F(2.0);       PdfT w_2_indep = F(0.0);
498
499         PdfT ui;
500
501         PdfT ux, uy, uz;
502         PdfT dens;
503
504
505         VPDFT VONE_HALF = VSET(F(0.5));
506         VPDFT VTHREE_HALF = VSET(F(3.0) / F(2.0));
507
508         VPDFT vw_1_indep, vw_2_indep;
509         VPDFT vw_0 = VSET(w_0);
510         VPDFT vw_1 = VSET(w_1);
511         VPDFT vw_2 = VSET(w_2);
512
513         VPDFT vw_1_x3 = VSET(w_1_x3);
514         VPDFT vw_2_x3 = VSET(w_2_x3);
515         VPDFT vw_1_nine_half = VSET(w_1_nine_half);
516         VPDFT vw_2_nine_half = VSET(w_2_nine_half);
517
518         VPDFT vui, vux, vuy, vuz, vdens;
519
520         VPDFT vevenPart, voddPart, vdir_indep_trm;
521
522         VPDFT vomegaEven = VSET(omegaEven);
523         VPDFT vomegaOdd  = VSET(omegaOdd);
524
525
526         // Declare pdf_N, pdf_E, pdf_S, pdf_W, ...
527         #define X(name, idx, idxinv, x, y, z)   PdfT JOIN(pdf_,name); VPDFT JOIN(vpdf_,name);
528         D3Q19_LIST
529         #undef X
530
531         // Declare pointers to pdfs ppdf_N, ppdf_E, ppdf_S, ppdf_W, ...
532         #define X(name, idx, idxinv, x, y, z)   PdfT * JOIN(ppdf_,name) = NULL;
533         D3Q19_LIST
534         #undef X
535
536         uint32_t nConsecNodes = kdlr->nConsecNodes;
537         uint32_t * consecNodes = kdlr->ConsecNodes;
538         uint32_t consecIndex = 0;
539         uint32_t consecValue = 0;
540
541 #ifndef DEBUG
542         UNUSED(nConsecNodes);
543 #endif
544
545         PdfT * src = kd->Pdfs[0];
546
547         int nCells = kdl->nCells;
548
549         uint32_t adjListIndex;
550         uint32_t * adjList = kdl->AdjList;
551
552         int threadId = 0;
553
554         #ifdef _OPENMP
555                 threadId = omp_get_thread_num();
556         #endif
557
558         consecIndex = kdlr->ConsecThreadIndices[threadId];
559         consecValue = 0;
560
561         int * threadIndices = kdlr->FluidNodeThreadIndices;
562
563         int nFluidThread = threadIndices[threadId + 1] - threadIndices[threadId];
564
565         int indexStart = threadIndices[threadId];
566         int indexStop  = threadIndices[threadId] + nFluidThread;
567
568         #define I(index, dir)   P_INDEX_3((nCells), (index), (dir))
569
570         #define ADJ_LIST(dir) adjList[adjListIndex + (dir)]
571
572         int pointerOffset = 1;
573
574         for (int index = indexStart; index < indexStop; index += 1) {
575
576                 if (consecValue > 0) {
577                         --consecValue;
578                         // Increment all pdf pointers by an offset. If the previous iteration was
579                         // scalar, increment only by one. If the previous iteration was vectorized,
580                         // increment by the vector width. These offsets are set in the corresponding
581                         // if branches.
582                         #define X(name, idx, idxinv, _x, _y, _z)        JOIN(ppdf_,name) += pointerOffset;
583                         D3Q19_LIST
584                         #undef X
585                 }
586                 else {
587                         Assert(consecIndex < nConsecNodes);
588
589                         consecValue = consecNodes[consecIndex] - 1;
590                         // Load new pointers to PDFs of local cell:
591
592                         adjListIndex = index * N_D3Q19_IDX;
593
594                         #define X(name, idx, idxinv, _x, _y, _z)        JOIN(ppdf_,name) = &(src[adjList[adjListIndex + idxinv]]);
595                         D3Q19_LIST_WO_C
596                         #undef X
597
598                         ppdf_C = &(src[P_INDEX_3(nCells, index, D3Q19_C)]);
599                         ++consecIndex;
600                 }
601
602                 #define X(name, idx, idxinv, _x, _y, _z)        JOIN(pdf_,name) = *JOIN(ppdf_,name);
603                 D3Q19_LIST
604                 #undef X
605
606                 if (consecValue >= (VSIZE - 1)) {
607                         // Vectorized part.
608
609                         #define X(name, idx, idxinv, _x, _y, _z)        JOIN(vpdf_,name) = VLDU(JOIN(ppdf_,name));
610                         D3Q19_LIST_WO_C
611                         #undef X
612
613                         vpdf_C = VLDU(ppdf_C);
614
615                         //vux = vpdf_E + vpdf_NE + vpdf_SE + vpdf_TE + vpdf_BE -
616                         //      vpdf_W - vpdf_NW - vpdf_SW - vpdf_TW - vpdf_BW;
617                         vux = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_E,VADD(vpdf_NE,vpdf_SE)),VADD(vpdf_TE,vpdf_BE)),vpdf_W),vpdf_NW),vpdf_SW),vpdf_TW),vpdf_BW);
618                         //vuy = vpdf_N + vpdf_NE + vpdf_NW + vpdf_TN + vpdf_BN -
619                         //      vpdf_S - vpdf_SE - vpdf_SW - vpdf_TS - vpdf_BS;
620                         vuy = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_N,VADD(vpdf_NE,vpdf_NW)),VADD(vpdf_TN,vpdf_BN)),vpdf_S),vpdf_SE),vpdf_SW),vpdf_TS),vpdf_BS);
621                         //vuz = vpdf_T + vpdf_TE + vpdf_TW + vpdf_TN + vpdf_TS -
622                         //      vpdf_B - vpdf_BE - vpdf_BW - vpdf_BN - vpdf_BS;
623                         vuz = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_T,VADD(vpdf_TE,vpdf_TW)),VADD(vpdf_TN,vpdf_TS)),vpdf_B),vpdf_BE),vpdf_BW),vpdf_BN),vpdf_BS);
624
625                         //vdens = vpdf_C +
626                         //        vpdf_N  + vpdf_E  + vpdf_S  + vpdf_W  +
627                         //        vpdf_NE + vpdf_SE + vpdf_SW + vpdf_NW +
628                         //        vpdf_T  + vpdf_TN + vpdf_TE + vpdf_TS + vpdf_TW +
629                         //        vpdf_B  + vpdf_BN + vpdf_BE + vpdf_BS + vpdf_BW;
630                         vdens = VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(vpdf_C,VADD(vpdf_N,vpdf_E)),VADD(vpdf_S,vpdf_W)),VADD(vpdf_NE,vpdf_SE)),
631                                                  VADD(vpdf_SW,vpdf_NW)),VADD(vpdf_T,vpdf_TN)),VADD(vpdf_TE,vpdf_TS)),VADD(vpdf_TW,vpdf_B)),VADD(vpdf_BN,vpdf_BE)),VADD(vpdf_BS,vpdf_BW));
632
633                         //vdir_indep_trm = vdens - (vux * vux + vuy * vuy + vuz * vuz) * VTHREE_HALF;
634                         vdir_indep_trm = VSUB(vdens,VMUL(VADD(VADD(VMUL(vux,vux),VMUL(vuy,vuy)),VMUL(vuz,vuz)),VTHREE_HALF));
635
636                         adjListIndex = index * N_D3Q19_IDX;
637
638                         //src[I(index, D3Q19_C)]  =[UA] vpdf_C - vomegaEven * (vpdf_C - vw_0 * vdir_indep_trm);
639                         VSTU(&src[I(index, D3Q19_C)],VSUB(vpdf_C,VMUL(vomegaEven,VSUB(vpdf_C,VMUL(vw_0,vdir_indep_trm)))));
640
641                         //vw_1_indep = vw_1 * vdir_indep_trm;
642                         vw_1_indep = VMUL(vw_1,vdir_indep_trm);
643
644                         //vui = vuy;
645                         vui = vuy;
646                         //vevenPart = vomegaEven * (VONE_HALF * (vpdf_N + vpdf_S) - vui * vui * vw_1_nine_half - vw_1_indep);
647                         vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_N,vpdf_S)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep));
648                         //voddPart  = vomegaOdd  * (VONE_HALF * (vpdf_N - vpdf_S) - vui * vw_1_x3);
649                         voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_N,vpdf_S)),VMUL(vui,vw_1_x3)));
650                         //src[ADJ_LIST(D3Q19_N)]  =[UA] vpdf_N - vevenPart - voddPart;
651                         VSTU(ppdf_S, VSUB(VSUB(vpdf_N,vevenPart),voddPart));
652                         //src[ADJ_LIST(D3Q19_S)]  =[UA] vpdf_S - vevenPart + voddPart;
653                         VSTU(ppdf_N, VADD(VSUB(vpdf_S,vevenPart),voddPart));
654
655                         //vui = vux;
656                         vui = vux;
657                         //vevenPart = vomegaEven * (VONE_HALF * (vpdf_E + vpdf_W) - vui * vui * vw_1_nine_half - vw_1_indep);
658                         vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_E,vpdf_W)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep));
659                         //voddPart  = vomegaOdd  * (VONE_HALF * (vpdf_E - vpdf_W) - vui * vw_1_x3);
660                         voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_E,vpdf_W)),VMUL(vui,vw_1_x3)));
661                         //src[ADJ_LIST(D3Q19_E)]  =[UA] vpdf_E - vevenPart - voddPart;
662                         VSTU(ppdf_W, VSUB(VSUB(vpdf_E,vevenPart),voddPart));
663                         //src[ADJ_LIST(D3Q19_W)]  =[UA] vpdf_W - vevenPart + voddPart;
664                         VSTU(ppdf_E, VADD(VSUB(vpdf_W,vevenPart),voddPart));
665
666                         //vui = vuz;
667                         vui = vuz;
668                         //vevenPart = vomegaEven * (VONE_HALF * (vpdf_T + vpdf_B) - vui * vui * vw_1_nine_half - vw_1_indep);
669                         vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_T,vpdf_B)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep));
670                         //voddPart  = vomegaOdd  * (VONE_HALF * (vpdf_T - vpdf_B) - vui * vw_1_x3);
671                         voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_T,vpdf_B)),VMUL(vui,vw_1_x3)));
672                         //src[ADJ_LIST(D3Q19_T)]  =[UA] vpdf_T - vevenPart - voddPart;
673                         VSTU(ppdf_B, VSUB(VSUB(vpdf_T,vevenPart),voddPart));
674                         //src[ADJ_LIST(D3Q19_B)]  =[UA] vpdf_B - vevenPart + voddPart;
675                         VSTU(ppdf_T, VADD(VSUB(vpdf_B,vevenPart),voddPart));
676
677                         //vw_2_indep = vw_2 * vdir_indep_trm;
678                         vw_2_indep = VMUL(vw_2,vdir_indep_trm);
679
680                         //vui = vuy - vux;
681                         vui = VSUB(vuy,vux);
682                         //vevenPart = vomegaEven * (VONE_HALF * (vpdf_NW + vpdf_SE) - vui * vui * vw_2_nine_half - vw_2_indep);
683                         vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_NW,vpdf_SE)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
684                         //voddPart  = vomegaOdd  * (VONE_HALF * (vpdf_NW - vpdf_SE) - vui * vw_2_x3);
685                         voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_NW,vpdf_SE)),VMUL(vui,vw_2_x3)));
686                         //src[ADJ_LIST(D3Q19_NW)] =[UA] vpdf_NW - vevenPart - voddPart;
687                         VSTU(ppdf_SE, VSUB(VSUB(vpdf_NW,vevenPart),voddPart));
688                         //src[ADJ_LIST(D3Q19_SE)] =[UA] vpdf_SE - vevenPart + voddPart;
689                         VSTU(ppdf_NW, VADD(VSUB(vpdf_SE,vevenPart),voddPart));
690
691                         //vui = vux + vuy;
692                         vui = VADD(vux,vuy);
693                         //vevenPart = vomegaEven * (VONE_HALF * (vpdf_NE + vpdf_SW) - vui * vui * vw_2_nine_half - vw_2_indep);
694                         vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_NE,vpdf_SW)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
695                         //voddPart  = vomegaOdd  * (VONE_HALF * (vpdf_NE - vpdf_SW) - vui * vw_2_x3);
696                         voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_NE,vpdf_SW)),VMUL(vui,vw_2_x3)));
697                         //src[ADJ_LIST(D3Q19_NE)] =[UA] vpdf_NE - vevenPart - voddPart;
698                         VSTU(ppdf_SW, VSUB(VSUB(vpdf_NE,vevenPart),voddPart));
699                         //src[ADJ_LIST(D3Q19_SW)] =[UA] vpdf_SW - vevenPart + voddPart;
700                         VSTU(ppdf_NE, VADD(VSUB(vpdf_SW,vevenPart),voddPart));
701
702                         //vui = vuz - vux;
703                         vui = VSUB(vuz,vux);
704                         //vevenPart = vomegaEven * (VONE_HALF * (vpdf_TW + vpdf_BE) - vui * vui * vw_2_nine_half - vw_2_indep);
705                         vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TW,vpdf_BE)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
706                         //voddPart  = vomegaOdd  * (VONE_HALF * (vpdf_TW - vpdf_BE) - vui * vw_2_x3);
707                         voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TW,vpdf_BE)),VMUL(vui,vw_2_x3)));
708                         //src[ADJ_LIST(D3Q19_TW)] =[UA] vpdf_TW - vevenPart - voddPart;
709                         VSTU(ppdf_BE, VSUB(VSUB(vpdf_TW,vevenPart),voddPart));
710                         //src[ADJ_LIST(D3Q19_BE)] =[UA] vpdf_BE - vevenPart + voddPart;
711                         VSTU(ppdf_TW, VADD(VSUB(vpdf_BE,vevenPart),voddPart));
712
713                         //vui = vux + vuz;
714                         vui = VADD(vux,vuz);
715                         //vevenPart = vomegaEven * (VONE_HALF * (vpdf_TE + vpdf_BW) - vui * vui * vw_2_nine_half - vw_2_indep);
716                         vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TE,vpdf_BW)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
717                         //voddPart  = vomegaOdd  * (VONE_HALF * (vpdf_TE - vpdf_BW) - vui * vw_2_x3);
718                         voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TE,vpdf_BW)),VMUL(vui,vw_2_x3)));
719                         //src[ADJ_LIST(D3Q19_TE)] =[UA] vpdf_TE - vevenPart - voddPart;
720                         VSTU(ppdf_BW, VSUB(VSUB(vpdf_TE,vevenPart),voddPart));
721                         //src[ADJ_LIST(D3Q19_BW)] =[UA] vpdf_BW - vevenPart + voddPart;
722                         VSTU(ppdf_TE, VADD(VSUB(vpdf_BW,vevenPart),voddPart));
723
724                         //vui = vuz - vuy;
725                         vui = VSUB(vuz,vuy);
726                         //vevenPart = vomegaEven * (VONE_HALF * (vpdf_TS + vpdf_BN) - vui * vui * vw_2_nine_half - vw_2_indep);
727                         vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TS,vpdf_BN)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
728                         //voddPart  = vomegaOdd  * (VONE_HALF * (vpdf_TS - vpdf_BN) - vui * vw_2_x3);
729                         voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TS,vpdf_BN)),VMUL(vui,vw_2_x3)));
730                         //src[ADJ_LIST(D3Q19_TS)] =[UA] vpdf_TS - vevenPart - voddPart;
731                         VSTU(ppdf_BN, VSUB(VSUB(vpdf_TS,vevenPart),voddPart));
732                         //src[ADJ_LIST(D3Q19_BN)] =[UA] vpdf_BN - vevenPart + voddPart;
733                         VSTU(ppdf_TS, VADD(VSUB(vpdf_BN,vevenPart),voddPart));
734
735                         //vui = vuy + vuz;
736                         vui = VADD(vuy,vuz);
737                         //vevenPart = vomegaEven * (VONE_HALF * (vpdf_TN + vpdf_BS) - vui * vui * vw_2_nine_half - vw_2_indep);
738                         vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TN,vpdf_BS)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
739                         //voddPart  = vomegaOdd  * (VONE_HALF * (vpdf_TN - vpdf_BS) - vui * vw_2_x3);
740                         voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TN,vpdf_BS)),VMUL(vui,vw_2_x3)));
741                         //src[ADJ_LIST(D3Q19_TN)] =[UA] vpdf_TN - vevenPart - voddPart;
742                         VSTU(ppdf_BS, VSUB(VSUB(vpdf_TN,vevenPart),voddPart));
743                         //src[ADJ_LIST(D3Q19_BS)] =[UA] vpdf_BS - vevenPart + voddPart;
744                         VSTU(ppdf_TN, VADD(VSUB(vpdf_BS,vevenPart),voddPart));
745
746                         consecValue   -= (VSIZE - 1);
747                         index         += (VSIZE - 1);
748                         pointerOffset  = VSIZE;
749
750                 }
751                 else {
752                         // Scalar part.
753
754                         #define X(name, idx, idxinv, _x, _y, _z)        JOIN(pdf_,name) = *(JOIN(ppdf_,name));
755                         D3Q19_LIST_WO_C
756                         #undef X
757
758                         pdf_C = *ppdf_C;
759
760                         ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE -
761                                  pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW;
762                         uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN -
763                                  pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS;
764                         uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS -
765                                  pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS;
766
767                         dens = pdf_C +
768                                    pdf_N  + pdf_E  + pdf_S  + pdf_W  +
769                                    pdf_NE + pdf_SE + pdf_SW + pdf_NW +
770                                    pdf_T  + pdf_TN + pdf_TE + pdf_TS + pdf_TW +
771                                    pdf_B  + pdf_BN + pdf_BE + pdf_BS + pdf_BW;
772
773                         dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz) * F(3.0) / F(2.0);
774
775                         adjListIndex = index * N_D3Q19_IDX;
776
777                         // direction: w_0
778                         src[I(index, D3Q19_C)             ]  = pdf_C - omegaEven * (pdf_C - w_0 * dir_indep_trm);
779
780                         // direction: w_1
781                         w_1_indep = w_1 * dir_indep_trm;
782
783                         ui = uy;
784                         evenPart = omegaEven * (F(0.5) * (pdf_N + pdf_S) - ui * ui * w_1_nine_half - w_1_indep);
785                         oddPart  = omegaOdd  * (F(0.5) * (pdf_N - pdf_S) - ui * w_1_x3);
786                         *ppdf_S  = pdf_N - evenPart - oddPart;
787                         *ppdf_N  = pdf_S - evenPart + oddPart;
788
789                         ui = ux;
790                         evenPart = omegaEven * (F(0.5) * (pdf_E + pdf_W) - ui * ui * w_1_nine_half - w_1_indep);
791                         oddPart  = omegaOdd  * (F(0.5) * (pdf_E - pdf_W) - ui * w_1_x3);
792                         *ppdf_W  = pdf_E - evenPart - oddPart;
793                         *ppdf_E  = pdf_W - evenPart + oddPart;
794
795                         ui = uz;
796                         evenPart = omegaEven * (F(0.5) * (pdf_T + pdf_B) - ui * ui * w_1_nine_half - w_1_indep);
797                         oddPart  = omegaOdd  * (F(0.5) * (pdf_T - pdf_B) - ui * w_1_x3);
798                         *ppdf_B  = pdf_T - evenPart - oddPart;
799                         *ppdf_T  = pdf_B - evenPart + oddPart;
800
801                         // direction: w_2
802                         w_2_indep = w_2 * dir_indep_trm;
803
804                         ui = -ux + uy;
805                         evenPart = omegaEven * (F(0.5) * (pdf_NW + pdf_SE) - ui * ui * w_2_nine_half - w_2_indep);
806                         oddPart  = omegaOdd  * (F(0.5) * (pdf_NW - pdf_SE) - ui * w_2_x3);
807                         *ppdf_SE = pdf_NW - evenPart - oddPart;
808                         *ppdf_NW = pdf_SE - evenPart + oddPart;
809
810                         ui = ux + uy;
811                         evenPart = omegaEven * (F(0.5) * (pdf_NE + pdf_SW) - ui * ui * w_2_nine_half - w_2_indep);
812                         oddPart  = omegaOdd  * (F(0.5) * (pdf_NE - pdf_SW) - ui * w_2_x3);
813                         *ppdf_SW = pdf_NE - evenPart - oddPart;
814                         *ppdf_NE = pdf_SW - evenPart + oddPart;
815
816                         ui = -ux + uz;
817                         evenPart = omegaEven * (F(0.5) * (pdf_TW + pdf_BE) - ui * ui * w_2_nine_half - w_2_indep);
818                         oddPart  = omegaOdd  * (F(0.5) * (pdf_TW - pdf_BE) - ui * w_2_x3);
819                         *ppdf_BE = pdf_TW - evenPart - oddPart;
820                         *ppdf_TW = pdf_BE - evenPart + oddPart;
821
822                         ui = ux + uz;
823                         evenPart = omegaEven * (F(0.5) * (pdf_TE + pdf_BW) - ui * ui * w_2_nine_half - w_2_indep);
824                         oddPart  = omegaOdd  * (F(0.5) * (pdf_TE - pdf_BW) - ui * w_2_x3);
825                         *ppdf_BW = pdf_TE - evenPart - oddPart;
826                         *ppdf_TE = pdf_BW - evenPart + oddPart;
827
828                         ui = -uy + uz;
829                         evenPart = omegaEven * (F(0.5) * (pdf_TS + pdf_BN) - ui * ui * w_2_nine_half - w_2_indep);
830                         oddPart  = omegaOdd  * (F(0.5) * (pdf_TS - pdf_BN) - ui * w_2_x3);
831                         *ppdf_BN = pdf_TS - evenPart - oddPart;
832                         *ppdf_TS = pdf_BN - evenPart + oddPart;
833
834                         ui = uy + uz;
835                         evenPart = omegaEven * (F(0.5) * (pdf_TN + pdf_BS) - ui * ui * w_2_nine_half - w_2_indep);
836                         oddPart  = omegaOdd  * (F(0.5) * (pdf_TN - pdf_BS) - ui * w_2_x3);
837                         *ppdf_BS = pdf_TN - evenPart - oddPart;
838                         *ppdf_TN = pdf_BS - evenPart + oddPart;
839
840                         pointerOffset = 1;
841                 }
842
843         } // loop over fluid nodes
844
845         #undef ADJ_LIST
846         #undef I
847 }
This page took 0.085689 seconds and 3 git commands to generate.