Commit | Line | Data |
---|---|---|
10988083 MW |
1 | // -------------------------------------------------------------------------- |
2 | // | |
3 | // Copyright | |
4 | // Markus Wittmann, 2016-2017 | |
5 | // RRZE, University of Erlangen-Nuremberg, Germany | |
6 | // markus.wittmann -at- fau.de or hpc -at- rrze.fau.de | |
7 | // | |
8 | // Viktor Haag, 2016 | |
9 | // LSS, University of Erlangen-Nuremberg, Germany | |
10 | // | |
11 | // This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). | |
12 | // | |
13 | // LbmBenchKernels is free software: you can redistribute it and/or modify | |
14 | // it under the terms of the GNU General Public License as published by | |
15 | // the Free Software Foundation, either version 3 of the License, or | |
16 | // (at your option) any later version. | |
17 | // | |
18 | // LbmBenchKernels is distributed in the hope that it will be useful, | |
19 | // but WITHOUT ANY WARRANTY; without even the implied warranty of | |
20 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
21 | // GNU General Public License for more details. | |
22 | // | |
23 | // You should have received a copy of the GNU General Public License | |
24 | // along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>. | |
25 | // | |
26 | // -------------------------------------------------------------------------- | |
27 | #include "BenchKernelD3Q19ListAaPvCommon.h" | |
28 | ||
29 | #include "Memory.h" | |
30 | #include "Vtk.h" | |
31 | #include "Vector.h" | |
32 | #include "LikwidIf.h" | |
33 | ||
34 | #include <inttypes.h> | |
35 | #include <math.h> | |
36 | ||
37 | #ifdef _OPENMP | |
38 | #include <omp.h> | |
39 | #endif | |
40 | ||
41 | ||
42 | static void KernelEven(LatticeDesc * ld, KernelData * kernelData, CaseData * cd); | |
43 | static void KernelOdd( LatticeDesc * ld, KernelData * kernelData, CaseData * cd); | |
44 | ||
45 | void FNAME(D3Q19ListAaPvKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd) | |
46 | { | |
47 | ||
48 | Assert(ld != NULL); | |
49 | Assert(kernelData != NULL); | |
50 | Assert(cd != NULL); | |
51 | ||
0fde6e45 MW |
52 | Assert(cd->Omega > F(0.0)); |
53 | Assert(cd->Omega < F(2.0));; | |
10988083 MW |
54 | |
55 | #if defined(VTK_OUTPUT) || defined(STATISTICS) || defined(VERIFICATION) | |
56 | KernelData * kd = (KernelData *)kernelData; | |
57 | KernelDataList * kdl = KDL(kernelData); | |
58 | #endif | |
59 | ||
60 | int maxIterations = cd->MaxIterations; | |
61 | ||
62 | int nThreads = 1; | |
63 | #ifdef _OPENMP | |
64 | nThreads = omp_get_max_threads(); | |
65 | #endif | |
66 | ||
67 | #ifdef VTK_OUTPUT | |
68 | if (cd->VtkOutput) { | |
69 | kd->PdfsActive = kd->Pdfs[0]; | |
70 | VtkWrite(ld, kd, cd, -1); | |
71 | } | |
72 | #endif | |
73 | ||
74 | #ifdef STATISTICS | |
75 | kd->PdfsActive = kd->Pdfs[0]; | |
76 | KernelStatistics(kd, ld, cd, 0); | |
77 | #endif | |
78 | ||
8cafd9ea MW |
79 | X_KERNEL_START(kernelData); |
80 | ||
10988083 MW |
81 | // TODO: outer openmp parallel |
82 | ||
83 | for(int iter = 0; iter < maxIterations; iter += 2) { | |
84 | ||
85 | // --------------------------------------------------- | |
86 | // even time step | |
87 | // --------------------------------------------------- | |
88 | ||
89 | X_LIKWID_START("list-aa-pv-even"); | |
90 | ||
91 | #ifdef _OPENMP | |
92 | #pragma omp parallel default(none) shared(ld, kernelData, cd) | |
93 | #endif | |
94 | { | |
95 | KernelEven(ld, kernelData, cd); | |
96 | } | |
97 | ||
98 | X_LIKWID_STOP("list-aa-pv-even"); | |
99 | ||
100 | #ifdef VERIFICATION | |
101 | kdl->Iteration = iter; | |
102 | kd->PdfsActive = kd->Pdfs[0]; | |
103 | KernelAddBodyForce(kd, ld, cd); | |
104 | #endif | |
105 | ||
106 | // --------------------------------------------------- | |
107 | // odd time step | |
108 | // --------------------------------------------------- | |
109 | ||
110 | X_LIKWID_START("list-aa-pv-odd"); | |
111 | ||
112 | #ifdef _OPENMP | |
113 | #pragma omp parallel default(none) shared(ld, kernelData, cd) | |
114 | #endif | |
115 | { | |
116 | KernelOdd(ld, kernelData, cd); | |
117 | } | |
118 | ||
119 | X_LIKWID_STOP("list-aa-pv-odd"); | |
120 | ||
121 | ||
122 | #ifdef VERIFICATION | |
123 | kdl->Iteration = iter + 1; | |
124 | kd->PdfsActive = kd->Pdfs[0]; | |
125 | KernelAddBodyForce(kd, ld, cd); | |
126 | #endif | |
127 | ||
128 | #ifdef VTK_OUTPUT | |
129 | if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) { | |
130 | kdl->Iteration = iter + 1; | |
131 | kd->PdfsActive = kd->Pdfs[0]; | |
132 | VtkWrite(ld, kd, cd, iter); | |
133 | } | |
134 | #endif | |
135 | ||
136 | #ifdef STATISTICS | |
137 | kdl->Iteration = iter + 1; | |
138 | kd->PdfsActive = kd->Pdfs[0]; | |
139 | KernelStatistics(kd, ld, cd, iter); | |
140 | #endif | |
141 | ||
142 | } // for (int iter = 0; ... | |
143 | ||
8cafd9ea MW |
144 | X_KERNEL_END(kernelData); |
145 | ||
10988083 MW |
146 | #ifdef VTK_OUTPUT |
147 | if (cd->VtkOutput) { | |
148 | kd->PdfsActive = kd->Pdfs[0]; | |
149 | VtkWrite(ld, kd, cd, maxIterations); | |
150 | } | |
151 | #endif | |
152 | ||
153 | #ifdef STATISTICS | |
154 | kd->PdfsActive = kd->Pdfs[0]; | |
155 | KernelStatistics(kd, ld, cd, maxIterations); | |
156 | #endif | |
157 | ||
158 | return; | |
159 | } | |
160 | ||
161 | static void KernelEven(LatticeDesc * ld, KernelData * kernelData, CaseData * cd) | |
162 | { | |
163 | Assert(ld != NULL); | |
164 | Assert(kernelData != NULL); | |
165 | Assert(cd != NULL); | |
166 | ||
0fde6e45 MW |
167 | Assert(cd->Omega > F(0.0)); |
168 | Assert(cd->Omega < F(2.0)); | |
10988083 MW |
169 | |
170 | KernelData * kd = (KernelData *)kernelData; | |
171 | KernelDataList * kdl = KDL(kernelData); | |
172 | KernelDataListRia * kdlr = KDLR(kernelData); | |
173 | ||
174 | PdfT omega = cd->Omega; | |
175 | PdfT omegaEven = omega; | |
176 | ||
0fde6e45 MW |
177 | PdfT magicParam = F(1.0) / F(12.0); |
178 | PdfT omegaOdd = F(1.0) / (F(0.5) + magicParam / (F(1.0) / omega - F(0.5))); | |
10988083 | 179 | |
0fde6e45 MW |
180 | PdfT evenPart = F(0.0); |
181 | PdfT oddPart = F(0.0); | |
182 | PdfT dir_indep_trm = F(0.0); | |
10988083 | 183 | |
0fde6e45 MW |
184 | const PdfT w_0 = F(1.0) / F( 3.0); |
185 | const PdfT w_1 = F(1.0) / F(18.0); | |
186 | const PdfT w_2 = F(1.0) / F(36.0); | |
10988083 | 187 | |
0fde6e45 MW |
188 | const PdfT w_1_x3 = w_1 * F(3.0); const PdfT w_1_nine_half = w_1 * F(9.0) / F(2.0); PdfT w_1_indep = F(0.0); |
189 | const PdfT w_2_x3 = w_2 * F(3.0); const PdfT w_2_nine_half = w_2 * F(9.0) / F(2.0); PdfT w_2_indep = F(0.0); | |
10988083 MW |
190 | |
191 | PdfT ui; | |
192 | ||
193 | PdfT ux, uy, uz; | |
194 | PdfT dens; | |
195 | ||
196 | ||
0fde6e45 MW |
197 | VPDFT VONE_HALF = VSET(F(0.5)); |
198 | VPDFT VTHREE_HALF = VSET(F(3.0) / F(2.0)); | |
10988083 MW |
199 | |
200 | VPDFT vw_1_indep, vw_2_indep; | |
201 | VPDFT vw_0 = VSET(w_0); | |
202 | VPDFT vw_1 = VSET(w_1); | |
203 | VPDFT vw_2 = VSET(w_2); | |
204 | ||
205 | VPDFT vw_1_x3 = VSET(w_1_x3); | |
206 | VPDFT vw_2_x3 = VSET(w_2_x3); | |
207 | VPDFT vw_1_nine_half = VSET(w_1_nine_half); | |
208 | VPDFT vw_2_nine_half = VSET(w_2_nine_half); | |
209 | ||
210 | VPDFT vui, vux, vuy, vuz, vdens; | |
211 | ||
212 | VPDFT vevenPart, voddPart, vdir_indep_trm; | |
213 | ||
214 | VPDFT vomegaEven = VSET(omegaEven); | |
215 | VPDFT vomegaOdd = VSET(omegaOdd); | |
216 | ||
217 | // Declare pdf_N, pdf_E, pdf_S, pdf_W, ... | |
218 | #define X(name, idx, idxinv, x, y, z) PdfT JOIN(pdf_,name); VPDFT JOIN(vpdf_,name); | |
219 | D3Q19_LIST | |
220 | #undef X | |
221 | ||
222 | PdfT * src = kd->Pdfs[0]; | |
223 | ||
224 | int nCells = kdl->nCells; | |
225 | ||
226 | int threadId = 0; | |
227 | #ifdef _OPENMP | |
228 | threadId = omp_get_thread_num(); | |
229 | #endif | |
230 | ||
231 | int * threadIndices = kdlr->FluidNodeThreadIndices; | |
232 | ||
233 | int nFluidThread = threadIndices[threadId + 1] - threadIndices[threadId]; | |
234 | int nFluidVec = nFluidThread - (nFluidThread % VSIZE); | |
235 | ||
236 | int indexStartVec = threadIndices[threadId]; | |
237 | int indexStopVec = threadIndices[threadId] + nFluidVec; | |
238 | int indexStop = threadIndices[threadId] + nFluidThread; | |
239 | ||
240 | #define I(index, dir) P_INDEX_3((nCells), (index), (dir)) | |
241 | ||
8cafd9ea | 242 | for (int index = indexStartVec; index < indexStopVec; index += VSIZE) { // LOOP list-aa-pv-even-vec |
10988083 MW |
243 | |
244 | ||
245 | #define X(name, idx, idxinv, _x, _y, _z) JOIN(vpdf_,name) = VLDU(&src[I(index, idx)]); | |
246 | D3Q19_LIST | |
247 | #undef X | |
248 | ||
249 | ||
250 | //vux = vpdf_E + vpdf_NE + vpdf_SE + vpdf_TE + vpdf_BE - | |
251 | // vpdf_W - vpdf_NW - vpdf_SW - vpdf_TW - vpdf_BW; | |
252 | vux = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_E,VADD(vpdf_NE,vpdf_SE)),VADD(vpdf_TE,vpdf_BE)),vpdf_W),vpdf_NW),vpdf_SW),vpdf_TW),vpdf_BW); | |
253 | //vuy = vpdf_N + vpdf_NE + vpdf_NW + vpdf_TN + vpdf_BN - | |
254 | // vpdf_S - vpdf_SE - vpdf_SW - vpdf_TS - vpdf_BS; | |
255 | vuy = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_N,VADD(vpdf_NE,vpdf_NW)),VADD(vpdf_TN,vpdf_BN)),vpdf_S),vpdf_SE),vpdf_SW),vpdf_TS),vpdf_BS); | |
256 | //vuz = vpdf_T + vpdf_TE + vpdf_TW + vpdf_TN + vpdf_TS - | |
257 | // vpdf_B - vpdf_BE - vpdf_BW - vpdf_BN - vpdf_BS; | |
258 | vuz = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_T,VADD(vpdf_TE,vpdf_TW)),VADD(vpdf_TN,vpdf_TS)),vpdf_B),vpdf_BE),vpdf_BW),vpdf_BN),vpdf_BS); | |
259 | ||
260 | //vdens = vpdf_C + | |
261 | // vpdf_N + vpdf_E + vpdf_S + vpdf_W + | |
262 | // vpdf_NE + vpdf_SE + vpdf_SW + vpdf_NW + | |
263 | // vpdf_T + vpdf_TN + vpdf_TE + vpdf_TS + vpdf_TW + | |
264 | // vpdf_B + vpdf_BN + vpdf_BE + vpdf_BS + vpdf_BW; | |
265 | vdens = VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(vpdf_C,VADD(vpdf_N,vpdf_E)),VADD(vpdf_S,vpdf_W)),VADD(vpdf_NE,vpdf_SE)),VADD(vpdf_SW,vpdf_NW)),VADD(vpdf_T,vpdf_TN)),VADD(vpdf_TE,vpdf_TS)),VADD(vpdf_TW,vpdf_B)),VADD(vpdf_BN,vpdf_BE)),VADD(vpdf_BS,vpdf_BW)); | |
266 | ||
267 | //vdir_indep_trm = vdens - (vux * vux + vuy * vuy + vuz * vuz) * VTHREE_HALF; | |
268 | vdir_indep_trm = VSUB(vdens,VMUL(VADD(VADD(VMUL(vux,vux),VMUL(vuy,vuy)),VMUL(vuz,vuz)),VTHREE_HALF)); | |
269 | ||
270 | //src[I(index, D3Q19_C) ] =[UA] vpdf_C - vomegaEven * (vpdf_C - vw_0 * vdir_indep_trm); | |
271 | VSTU(&src[I(index, D3Q19_C)],VSUB(vpdf_C,VMUL(vomegaEven,VSUB(vpdf_C,VMUL(vw_0,vdir_indep_trm))))); | |
272 | ||
273 | //vw_1_indep = vw_1 * vdir_indep_trm; | |
274 | vw_1_indep = VMUL(vw_1,vdir_indep_trm); | |
275 | ||
276 | //vui = vuy; | |
277 | vui = vuy; | |
278 | //vevenPart = vomegaEven * (VONE_HALF * (vpdf_N + vpdf_S) - vui * vui * vw_1_nine_half - vw_1_indep); | |
279 | vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_N,vpdf_S)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep)); | |
280 | //voddPart = vomegaOdd * (VONE_HALF * (vpdf_N - vpdf_S) - vui * vw_1_x3); | |
281 | voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_N,vpdf_S)),VMUL(vui,vw_1_x3))); | |
282 | //src[I(index, D3Q19_S)] =[UA] vpdf_N - vevenPart - voddPart; | |
283 | VSTU(&src[I(index, D3Q19_S)],VSUB(VSUB(vpdf_N,vevenPart),voddPart)); | |
284 | //src[I(index, D3Q19_N)] =[UA] vpdf_S - vevenPart + voddPart; | |
285 | VSTU(&src[I(index, D3Q19_N)],VADD(VSUB(vpdf_S,vevenPart),voddPart)); | |
286 | ||
287 | //vui = vux; | |
288 | vui = vux; | |
289 | //vevenPart = vomegaEven * (VONE_HALF * (vpdf_E + vpdf_W) - vui * vui * vw_1_nine_half - vw_1_indep); | |
290 | vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_E,vpdf_W)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep)); | |
291 | //voddPart = vomegaOdd * (VONE_HALF * (vpdf_E - vpdf_W) - vui * vw_1_x3 ); | |
292 | voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_E,vpdf_W)),VMUL(vui,vw_1_x3))); | |
293 | //src[I(index, D3Q19_W)] =[UA] vpdf_E - vevenPart - voddPart; | |
294 | VSTU(&src[I(index, D3Q19_W)],VSUB(VSUB(vpdf_E,vevenPart),voddPart)); | |
295 | //src[I(index, D3Q19_E)] =[UA] vpdf_W - vevenPart + voddPart; | |
296 | VSTU(&src[I(index, D3Q19_E)],VADD(VSUB(vpdf_W,vevenPart),voddPart)); | |
297 | ||
298 | //vui = vuz; | |
299 | vui = vuz; | |
300 | //vevenPart = vomegaEven * (VONE_HALF * (vpdf_T + vpdf_B) - vui * vui * vw_1_nine_half - vw_1_indep); | |
301 | vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_T,vpdf_B)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep)); | |
302 | //voddPart = vomegaOdd * (VONE_HALF * (vpdf_T - vpdf_B) - vui * vw_1_x3); | |
303 | voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_T,vpdf_B)),VMUL(vui,vw_1_x3))); | |
304 | //src[I(index, D3Q19_B)] =[UA] vpdf_T - vevenPart - voddPart; | |
305 | VSTU(&src[I(index, D3Q19_B)],VSUB(VSUB(vpdf_T,vevenPart),voddPart)); | |
306 | //src[I(index, D3Q19_T)] =[UA] vpdf_B - vevenPart + voddPart; | |
307 | VSTU(&src[I(index, D3Q19_T)],VADD(VSUB(vpdf_B,vevenPart),voddPart)); | |
308 | ||
309 | //vw_2_indep = vw_2 * vdir_indep_trm; | |
310 | vw_2_indep = VMUL(vw_2,vdir_indep_trm); | |
311 | ||
312 | //vui = vuy - vux; | |
313 | vui = VSUB(vuy,vux); | |
314 | //vevenPart = vomegaEven * (VONE_HALF * (vpdf_NW + vpdf_SE) - vui * vui * vw_2_nine_half - vw_2_indep); | |
315 | vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_NW,vpdf_SE)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); | |
316 | //voddPart = vomegaOdd * (VONE_HALF * (vpdf_NW - vpdf_SE) - vui * vw_2_x3); | |
317 | voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_NW,vpdf_SE)),VMUL(vui,vw_2_x3))); | |
318 | //src[I(index, D3Q19_SE)] =[UA] vpdf_NW - vevenPart - voddPart; | |
319 | VSTU(&src[I(index, D3Q19_SE)],VSUB(VSUB(vpdf_NW,vevenPart),voddPart)); | |
320 | //src[I(index, D3Q19_NW)] =[UA] vpdf_SE - vevenPart + voddPart; | |
321 | VSTU(&src[I(index, D3Q19_NW)],VADD(VSUB(vpdf_SE,vevenPart),voddPart)); | |
322 | ||
323 | //vui = vux + vuy; | |
324 | vui = VADD(vux,vuy); | |
325 | //vevenPart = vomegaEven * (VONE_HALF * (vpdf_NE + vpdf_SW) - vui * vui * vw_2_nine_half - vw_2_indep); | |
326 | vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_NE,vpdf_SW)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); | |
327 | //voddPart = vomegaOdd * (VONE_HALF * (vpdf_NE - vpdf_SW) - vui * vw_2_x3); | |
328 | voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_NE,vpdf_SW)),VMUL(vui,vw_2_x3))); | |
329 | //src[I(index, D3Q19_SW)] =[UA] vpdf_NE - vevenPart - voddPart; | |
330 | VSTU(&src[I(index, D3Q19_SW)],VSUB(VSUB(vpdf_NE,vevenPart),voddPart)); | |
331 | //src[I(index, D3Q19_NE)] =[UA] vpdf_SW - vevenPart + voddPart; | |
332 | VSTU(&src[I(index, D3Q19_NE)],VADD(VSUB(vpdf_SW,vevenPart),voddPart)); | |
333 | ||
334 | //vui = vuz - vux; | |
335 | vui = VSUB(vuz,vux); | |
336 | //vevenPart = vomegaEven * (VONE_HALF * (vpdf_TW + vpdf_BE) - vui * vui * vw_2_nine_half - vw_2_indep); | |
337 | vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TW,vpdf_BE)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); | |
338 | //voddPart = vomegaOdd * (VONE_HALF * (vpdf_TW - vpdf_BE) - vui * vw_2_x3); | |
339 | voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TW,vpdf_BE)),VMUL(vui,vw_2_x3))); | |
340 | //src[I(index, D3Q19_BE)] =[UA] vpdf_TW - vevenPart - voddPart; | |
341 | VSTU(&src[I(index, D3Q19_BE)],VSUB(VSUB(vpdf_TW,vevenPart),voddPart)); | |
342 | //src[I(index, D3Q19_TW)] =[UA] vpdf_BE - vevenPart + voddPart; | |
343 | VSTU(&src[I(index, D3Q19_TW)],VADD(VSUB(vpdf_BE,vevenPart),voddPart)); | |
344 | ||
345 | //vui = vux + vuz; | |
346 | vui = VADD(vux,vuz); | |
347 | //vevenPart = vomegaEven * (VONE_HALF * (vpdf_TE + vpdf_BW) - vui * vui * vw_2_nine_half - vw_2_indep); | |
348 | vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TE,vpdf_BW)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); | |
349 | //voddPart = vomegaOdd * (VONE_HALF * (vpdf_TE - vpdf_BW) - vui * vw_2_x3); | |
350 | voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TE,vpdf_BW)),VMUL(vui,vw_2_x3))); | |
351 | //src[I(index, D3Q19_BW)] =[UA] vpdf_TE - vevenPart - voddPart; | |
352 | VSTU(&src[I(index, D3Q19_BW)],VSUB(VSUB(vpdf_TE,vevenPart),voddPart)); | |
353 | //src[I(index, D3Q19_TE)] =[UA] vpdf_BW - vevenPart + voddPart; | |
354 | VSTU(&src[I(index, D3Q19_TE)],VADD(VSUB(vpdf_BW,vevenPart),voddPart)); | |
355 | ||
356 | //vui = vuz - vuy; | |
357 | vui = VSUB(vuz,vuy); | |
358 | //vevenPart = vomegaEven * (VONE_HALF * (vpdf_TS + vpdf_BN) - vui * vui * vw_2_nine_half - vw_2_indep); | |
359 | vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TS,vpdf_BN)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); | |
360 | //voddPart = vomegaOdd * (VONE_HALF * (vpdf_TS - vpdf_BN) - vui * vw_2_x3); | |
361 | voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TS,vpdf_BN)),VMUL(vui,vw_2_x3))); | |
362 | //src[I(index, D3Q19_BN)] =[UA] vpdf_TS - vevenPart - voddPart; | |
363 | VSTU(&src[I(index, D3Q19_BN)],VSUB(VSUB(vpdf_TS,vevenPart),voddPart)); | |
364 | //src[I(index, D3Q19_TS)] =[UA] vpdf_BN - vevenPart + voddPart; | |
365 | VSTU(&src[I(index, D3Q19_TS)],VADD(VSUB(vpdf_BN,vevenPart),voddPart)); | |
366 | ||
367 | //vui = vuy + vuz; | |
368 | vui = VADD(vuy,vuz); | |
369 | //vevenPart = vomegaEven * (VONE_HALF * (vpdf_TN + vpdf_BS) - vui * vui * vw_2_nine_half - vw_2_indep); | |
370 | vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TN,vpdf_BS)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); | |
371 | //voddPart = vomegaOdd * (VONE_HALF * (vpdf_TN - vpdf_BS) - vui * vw_2_x3); | |
372 | voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TN,vpdf_BS)),VMUL(vui,vw_2_x3))); | |
373 | //src[I(index, D3Q19_BS)] =[UA] vpdf_TN - vevenPart - voddPart; | |
374 | VSTU(&src[I(index, D3Q19_BS)],VSUB(VSUB(vpdf_TN,vevenPart),voddPart)); | |
375 | //src[I(index, D3Q19_TN)] =[UA] vpdf_BS - vevenPart + voddPart; | |
376 | VSTU(&src[I(index, D3Q19_TN)],VADD(VSUB(vpdf_BS,vevenPart),voddPart)); | |
377 | ||
378 | } // loop over fluid nodes | |
379 | ||
8cafd9ea | 380 | for (int index = indexStopVec; index < indexStop; ++index) { // LOOP list-aa-pv-even-scalar |
10988083 MW |
381 | |
382 | #define X(name, idx, idxinv, _x, _y, _z) JOIN(pdf_,name) = src[I(index, idx)]; | |
383 | D3Q19_LIST | |
384 | #undef X | |
385 | ||
386 | ||
387 | ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE - | |
388 | pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW; | |
389 | uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN - | |
390 | pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS; | |
391 | uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS - | |
392 | pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS; | |
393 | ||
394 | dens = pdf_C + | |
395 | pdf_N + pdf_E + pdf_S + pdf_W + | |
396 | pdf_NE + pdf_SE + pdf_SW + pdf_NW + | |
397 | pdf_T + pdf_TN + pdf_TE + pdf_TS + pdf_TW + | |
398 | pdf_B + pdf_BN + pdf_BE + pdf_BS + pdf_BW; | |
399 | ||
0fde6e45 | 400 | dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz) * F(3.0) / F(2.0); |
10988083 MW |
401 | |
402 | // direction: w_0 | |
403 | src[I(index, D3Q19_C) ] = pdf_C - omegaEven*(pdf_C - w_0*dir_indep_trm); | |
404 | ||
405 | // direction: w_1 | |
406 | w_1_indep = w_1*dir_indep_trm; | |
407 | ||
408 | ui = uy; | |
0fde6e45 MW |
409 | evenPart = omegaEven*( F(0.5)*(pdf_N + pdf_S) - ui*ui*w_1_nine_half - w_1_indep ); |
410 | oddPart = omegaOdd*(F(0.5)*(pdf_N - pdf_S) - ui*w_1_x3 ); | |
10988083 MW |
411 | src[I(index, D3Q19_S)] = pdf_N - evenPart - oddPart; |
412 | src[I(index, D3Q19_N)] = pdf_S - evenPart + oddPart; | |
413 | ||
414 | ui = ux; | |
0fde6e45 MW |
415 | evenPart = omegaEven*( F(0.5)*(pdf_E + pdf_W) - ui*ui*w_1_nine_half - w_1_indep ); |
416 | oddPart = omegaOdd*(F(0.5)*(pdf_E - pdf_W) - ui*w_1_x3 ); | |
10988083 MW |
417 | src[I(index, D3Q19_W)] = pdf_E - evenPart - oddPart; |
418 | src[I(index, D3Q19_E)] = pdf_W - evenPart + oddPart; | |
419 | ||
420 | ui = uz; | |
0fde6e45 MW |
421 | evenPart = omegaEven*( F(0.5)*(pdf_T + pdf_B) - ui*ui*w_1_nine_half - w_1_indep ); |
422 | oddPart = omegaOdd*(F(0.5)*(pdf_T - pdf_B) - ui*w_1_x3 ); | |
10988083 MW |
423 | src[I(index, D3Q19_B)] = pdf_T - evenPart - oddPart; |
424 | src[I(index, D3Q19_T)] = pdf_B - evenPart + oddPart; | |
425 | ||
426 | // direction: w_2 | |
427 | w_2_indep = w_2*dir_indep_trm; | |
428 | ||
429 | ui = -ux + uy; | |
0fde6e45 MW |
430 | evenPart = omegaEven*( F(0.5)*(pdf_NW + pdf_SE) - ui*ui*w_2_nine_half - w_2_indep ); |
431 | oddPart = omegaOdd*(F(0.5)*(pdf_NW - pdf_SE) - ui*w_2_x3 ); | |
10988083 MW |
432 | src[I(index, D3Q19_SE)] = pdf_NW - evenPart - oddPart; |
433 | src[I(index, D3Q19_NW)] = pdf_SE - evenPart + oddPart; | |
434 | ||
435 | ui = ux + uy; | |
0fde6e45 MW |
436 | evenPart = omegaEven*( F(0.5)*(pdf_NE + pdf_SW) - ui*ui*w_2_nine_half - w_2_indep ); |
437 | oddPart = omegaOdd*(F(0.5)*(pdf_NE - pdf_SW) - ui*w_2_x3 ); | |
10988083 MW |
438 | src[I(index, D3Q19_SW)] = pdf_NE - evenPart - oddPart; |
439 | src[I(index, D3Q19_NE)] = pdf_SW - evenPart + oddPart; | |
440 | ||
441 | ui = -ux + uz; | |
0fde6e45 MW |
442 | evenPart = omegaEven*( F(0.5)*(pdf_TW + pdf_BE) - ui*ui*w_2_nine_half - w_2_indep ); |
443 | oddPart = omegaOdd*(F(0.5)*(pdf_TW - pdf_BE) - ui*w_2_x3 ); | |
10988083 MW |
444 | src[I(index, D3Q19_BE)] = pdf_TW - evenPart - oddPart; |
445 | src[I(index, D3Q19_TW)] = pdf_BE - evenPart + oddPart; | |
446 | ||
447 | ui = ux + uz; | |
0fde6e45 MW |
448 | evenPart = omegaEven*( F(0.5)*(pdf_TE + pdf_BW) - ui*ui*w_2_nine_half - w_2_indep ); |
449 | oddPart = omegaOdd*(F(0.5)*(pdf_TE - pdf_BW) - ui*w_2_x3 ); | |
10988083 MW |
450 | src[I(index, D3Q19_BW)] = pdf_TE - evenPart - oddPart; |
451 | src[I(index, D3Q19_TE)] = pdf_BW - evenPart + oddPart; | |
452 | ||
453 | ui = -uy + uz; | |
0fde6e45 MW |
454 | evenPart = omegaEven*( F(0.5)*(pdf_TS + pdf_BN) - ui*ui*w_2_nine_half - w_2_indep ); |
455 | oddPart = omegaOdd*(F(0.5)*(pdf_TS - pdf_BN) - ui*w_2_x3 ); | |
10988083 MW |
456 | src[I(index, D3Q19_BN)] = pdf_TS - evenPart - oddPart; |
457 | src[I(index, D3Q19_TS)] = pdf_BN - evenPart + oddPart; | |
458 | ||
459 | ui = uy + uz; | |
0fde6e45 MW |
460 | evenPart = omegaEven*( F(0.5)*(pdf_TN + pdf_BS) - ui*ui*w_2_nine_half - w_2_indep ); |
461 | oddPart = omegaOdd*(F(0.5)*(pdf_TN - pdf_BS) - ui*w_2_x3 ); | |
10988083 MW |
462 | src[I(index, D3Q19_BS)] = pdf_TN - evenPart - oddPart; |
463 | src[I(index, D3Q19_TN)] = pdf_BS - evenPart + oddPart; | |
464 | ||
465 | } // loop over fluid nodes | |
466 | ||
467 | #undef I | |
468 | ||
469 | return; | |
470 | } | |
471 | ||
472 | static void KernelOdd(LatticeDesc * ld, KernelData * kernelData, CaseData * cd) | |
473 | { | |
474 | ||
475 | Assert(ld != NULL); | |
476 | Assert(kernelData != NULL); | |
477 | Assert(cd != NULL); | |
478 | ||
0fde6e45 MW |
479 | Assert(cd->Omega > F(0.0)); |
480 | Assert(cd->Omega < F(2.0)); | |
10988083 MW |
481 | |
482 | KernelData * kd = (KernelData *)kernelData; | |
483 | KernelDataList * kdl = KDL(kernelData); | |
484 | KernelDataListRia * kdlr = KDLR(kernelData); | |
485 | ||
486 | PdfT omega = cd->Omega; | |
487 | PdfT omegaEven = omega; | |
488 | ||
0fde6e45 MW |
489 | PdfT magicParam = F(1.0) / F(12.0); |
490 | PdfT omegaOdd = F(1.0) / (F(0.5) + magicParam / (F(1.0) / omega - F(0.5))); | |
10988083 | 491 | |
0fde6e45 MW |
492 | PdfT evenPart = F(0.0); |
493 | PdfT oddPart = F(0.0); | |
494 | PdfT dir_indep_trm = F(0.0); | |
10988083 | 495 | |
0fde6e45 MW |
496 | const PdfT w_0 = F(1.0) / F( 3.0); |
497 | const PdfT w_1 = F(1.0) / F(18.0); | |
498 | const PdfT w_2 = F(1.0) / F(36.0); | |
10988083 | 499 | |
0fde6e45 MW |
500 | const PdfT w_1_x3 = w_1 * F(3.0); const PdfT w_1_nine_half = w_1 * F(9.0) / F(2.0); PdfT w_1_indep = F(0.0); |
501 | const PdfT w_2_x3 = w_2 * F(3.0); const PdfT w_2_nine_half = w_2 * F(9.0) / F(2.0); PdfT w_2_indep = F(0.0); | |
10988083 MW |
502 | |
503 | PdfT ui; | |
504 | ||
505 | PdfT ux, uy, uz; | |
506 | PdfT dens; | |
507 | ||
508 | ||
0fde6e45 MW |
509 | VPDFT VONE_HALF = VSET(F(0.5)); |
510 | VPDFT VTHREE_HALF = VSET(F(3.0) / F(2.0)); | |
10988083 MW |
511 | |
512 | VPDFT vw_1_indep, vw_2_indep; | |
513 | VPDFT vw_0 = VSET(w_0); | |
514 | VPDFT vw_1 = VSET(w_1); | |
515 | VPDFT vw_2 = VSET(w_2); | |
516 | ||
517 | VPDFT vw_1_x3 = VSET(w_1_x3); | |
518 | VPDFT vw_2_x3 = VSET(w_2_x3); | |
519 | VPDFT vw_1_nine_half = VSET(w_1_nine_half); | |
520 | VPDFT vw_2_nine_half = VSET(w_2_nine_half); | |
521 | ||
522 | VPDFT vui, vux, vuy, vuz, vdens; | |
523 | ||
524 | VPDFT vevenPart, voddPart, vdir_indep_trm; | |
525 | ||
526 | VPDFT vomegaEven = VSET(omegaEven); | |
527 | VPDFT vomegaOdd = VSET(omegaOdd); | |
528 | ||
529 | ||
530 | // Declare pdf_N, pdf_E, pdf_S, pdf_W, ... | |
531 | #define X(name, idx, idxinv, x, y, z) PdfT JOIN(pdf_,name); VPDFT JOIN(vpdf_,name); | |
532 | D3Q19_LIST | |
533 | #undef X | |
534 | ||
535 | // Declare pointers to pdfs ppdf_N, ppdf_E, ppdf_S, ppdf_W, ... | |
536 | #define X(name, idx, idxinv, x, y, z) PdfT * JOIN(ppdf_,name) = NULL; | |
537 | D3Q19_LIST | |
538 | #undef X | |
539 | ||
540 | uint32_t nConsecNodes = kdlr->nConsecNodes; | |
541 | uint32_t * consecNodes = kdlr->ConsecNodes; | |
542 | uint32_t consecIndex = 0; | |
543 | uint32_t consecValue = 0; | |
544 | ||
545 | #ifndef DEBUG | |
546 | UNUSED(nConsecNodes); | |
547 | #endif | |
548 | ||
549 | PdfT * src = kd->Pdfs[0]; | |
550 | ||
551 | int nCells = kdl->nCells; | |
552 | ||
553 | uint32_t adjListIndex; | |
554 | uint32_t * adjList = kdl->AdjList; | |
555 | ||
556 | int threadId = 0; | |
557 | ||
558 | #ifdef _OPENMP | |
559 | threadId = omp_get_thread_num(); | |
560 | #endif | |
561 | ||
562 | consecIndex = kdlr->ConsecThreadIndices[threadId]; | |
563 | consecValue = 0; | |
564 | ||
565 | int * threadIndices = kdlr->FluidNodeThreadIndices; | |
566 | ||
567 | int nFluidThread = threadIndices[threadId + 1] - threadIndices[threadId]; | |
568 | ||
569 | int indexStart = threadIndices[threadId]; | |
570 | int indexStop = threadIndices[threadId] + nFluidThread; | |
571 | ||
572 | #define I(index, dir) P_INDEX_3((nCells), (index), (dir)) | |
573 | ||
574 | #define ADJ_LIST(dir) adjList[adjListIndex + (dir)] | |
575 | ||
576 | int pointerOffset = 1; | |
577 | ||
578 | for (int index = indexStart; index < indexStop; index += 1) { | |
579 | ||
580 | if (consecValue > 0) { | |
581 | --consecValue; | |
582 | // Increment all pdf pointers by an offset. If the previous iteration was | |
583 | // scalar, increment only by one. If the previous iteration was vectorized, | |
584 | // increment by the vector width. These offsets are set in the corresponding | |
585 | // if branches. | |
586 | #define X(name, idx, idxinv, _x, _y, _z) JOIN(ppdf_,name) += pointerOffset; | |
587 | D3Q19_LIST | |
588 | #undef X | |
589 | } | |
590 | else { | |
591 | Assert(consecIndex < nConsecNodes); | |
592 | ||
593 | consecValue = consecNodes[consecIndex] - 1; | |
594 | // Load new pointers to PDFs of local cell: | |
595 | ||
596 | adjListIndex = index * N_D3Q19_IDX; | |
597 | ||
598 | #define X(name, idx, idxinv, _x, _y, _z) JOIN(ppdf_,name) = &(src[adjList[adjListIndex + idxinv]]); | |
599 | D3Q19_LIST_WO_C | |
600 | #undef X | |
601 | ||
602 | ppdf_C = &(src[P_INDEX_3(nCells, index, D3Q19_C)]); | |
603 | ++consecIndex; | |
604 | } | |
605 | ||
606 | #define X(name, idx, idxinv, _x, _y, _z) JOIN(pdf_,name) = *JOIN(ppdf_,name); | |
607 | D3Q19_LIST | |
608 | #undef X | |
609 | ||
610 | if (consecValue >= (VSIZE - 1)) { | |
611 | // Vectorized part. | |
612 | ||
613 | #define X(name, idx, idxinv, _x, _y, _z) JOIN(vpdf_,name) = VLDU(JOIN(ppdf_,name)); | |
614 | D3Q19_LIST_WO_C | |
615 | #undef X | |
616 | ||
617 | vpdf_C = VLDU(ppdf_C); | |
618 | ||
619 | //vux = vpdf_E + vpdf_NE + vpdf_SE + vpdf_TE + vpdf_BE - | |
620 | // vpdf_W - vpdf_NW - vpdf_SW - vpdf_TW - vpdf_BW; | |
621 | vux = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_E,VADD(vpdf_NE,vpdf_SE)),VADD(vpdf_TE,vpdf_BE)),vpdf_W),vpdf_NW),vpdf_SW),vpdf_TW),vpdf_BW); | |
622 | //vuy = vpdf_N + vpdf_NE + vpdf_NW + vpdf_TN + vpdf_BN - | |
623 | // vpdf_S - vpdf_SE - vpdf_SW - vpdf_TS - vpdf_BS; | |
624 | vuy = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_N,VADD(vpdf_NE,vpdf_NW)),VADD(vpdf_TN,vpdf_BN)),vpdf_S),vpdf_SE),vpdf_SW),vpdf_TS),vpdf_BS); | |
625 | //vuz = vpdf_T + vpdf_TE + vpdf_TW + vpdf_TN + vpdf_TS - | |
626 | // vpdf_B - vpdf_BE - vpdf_BW - vpdf_BN - vpdf_BS; | |
627 | vuz = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_T,VADD(vpdf_TE,vpdf_TW)),VADD(vpdf_TN,vpdf_TS)),vpdf_B),vpdf_BE),vpdf_BW),vpdf_BN),vpdf_BS); | |
628 | ||
629 | //vdens = vpdf_C + | |
630 | // vpdf_N + vpdf_E + vpdf_S + vpdf_W + | |
631 | // vpdf_NE + vpdf_SE + vpdf_SW + vpdf_NW + | |
632 | // vpdf_T + vpdf_TN + vpdf_TE + vpdf_TS + vpdf_TW + | |
633 | // vpdf_B + vpdf_BN + vpdf_BE + vpdf_BS + vpdf_BW; | |
634 | vdens = VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(vpdf_C,VADD(vpdf_N,vpdf_E)),VADD(vpdf_S,vpdf_W)),VADD(vpdf_NE,vpdf_SE)), | |
635 | VADD(vpdf_SW,vpdf_NW)),VADD(vpdf_T,vpdf_TN)),VADD(vpdf_TE,vpdf_TS)),VADD(vpdf_TW,vpdf_B)),VADD(vpdf_BN,vpdf_BE)),VADD(vpdf_BS,vpdf_BW)); | |
636 | ||
637 | //vdir_indep_trm = vdens - (vux * vux + vuy * vuy + vuz * vuz) * VTHREE_HALF; | |
638 | vdir_indep_trm = VSUB(vdens,VMUL(VADD(VADD(VMUL(vux,vux),VMUL(vuy,vuy)),VMUL(vuz,vuz)),VTHREE_HALF)); | |
639 | ||
640 | adjListIndex = index * N_D3Q19_IDX; | |
641 | ||
642 | //src[I(index, D3Q19_C)] =[UA] vpdf_C - vomegaEven * (vpdf_C - vw_0 * vdir_indep_trm); | |
643 | VSTU(&src[I(index, D3Q19_C)],VSUB(vpdf_C,VMUL(vomegaEven,VSUB(vpdf_C,VMUL(vw_0,vdir_indep_trm))))); | |
644 | ||
645 | //vw_1_indep = vw_1 * vdir_indep_trm; | |
646 | vw_1_indep = VMUL(vw_1,vdir_indep_trm); | |
647 | ||
648 | //vui = vuy; | |
649 | vui = vuy; | |
650 | //vevenPart = vomegaEven * (VONE_HALF * (vpdf_N + vpdf_S) - vui * vui * vw_1_nine_half - vw_1_indep); | |
651 | vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_N,vpdf_S)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep)); | |
652 | //voddPart = vomegaOdd * (VONE_HALF * (vpdf_N - vpdf_S) - vui * vw_1_x3); | |
653 | voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_N,vpdf_S)),VMUL(vui,vw_1_x3))); | |
654 | //src[ADJ_LIST(D3Q19_N)] =[UA] vpdf_N - vevenPart - voddPart; | |
655 | VSTU(ppdf_S, VSUB(VSUB(vpdf_N,vevenPart),voddPart)); | |
656 | //src[ADJ_LIST(D3Q19_S)] =[UA] vpdf_S - vevenPart + voddPart; | |
657 | VSTU(ppdf_N, VADD(VSUB(vpdf_S,vevenPart),voddPart)); | |
658 | ||
659 | //vui = vux; | |
660 | vui = vux; | |
661 | //vevenPart = vomegaEven * (VONE_HALF * (vpdf_E + vpdf_W) - vui * vui * vw_1_nine_half - vw_1_indep); | |
662 | vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_E,vpdf_W)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep)); | |
663 | //voddPart = vomegaOdd * (VONE_HALF * (vpdf_E - vpdf_W) - vui * vw_1_x3); | |
664 | voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_E,vpdf_W)),VMUL(vui,vw_1_x3))); | |
665 | //src[ADJ_LIST(D3Q19_E)] =[UA] vpdf_E - vevenPart - voddPart; | |
666 | VSTU(ppdf_W, VSUB(VSUB(vpdf_E,vevenPart),voddPart)); | |
667 | //src[ADJ_LIST(D3Q19_W)] =[UA] vpdf_W - vevenPart + voddPart; | |
668 | VSTU(ppdf_E, VADD(VSUB(vpdf_W,vevenPart),voddPart)); | |
669 | ||
670 | //vui = vuz; | |
671 | vui = vuz; | |
672 | //vevenPart = vomegaEven * (VONE_HALF * (vpdf_T + vpdf_B) - vui * vui * vw_1_nine_half - vw_1_indep); | |
673 | vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_T,vpdf_B)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep)); | |
674 | //voddPart = vomegaOdd * (VONE_HALF * (vpdf_T - vpdf_B) - vui * vw_1_x3); | |
675 | voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_T,vpdf_B)),VMUL(vui,vw_1_x3))); | |
676 | //src[ADJ_LIST(D3Q19_T)] =[UA] vpdf_T - vevenPart - voddPart; | |
677 | VSTU(ppdf_B, VSUB(VSUB(vpdf_T,vevenPart),voddPart)); | |
678 | //src[ADJ_LIST(D3Q19_B)] =[UA] vpdf_B - vevenPart + voddPart; | |
679 | VSTU(ppdf_T, VADD(VSUB(vpdf_B,vevenPart),voddPart)); | |
680 | ||
681 | //vw_2_indep = vw_2 * vdir_indep_trm; | |
682 | vw_2_indep = VMUL(vw_2,vdir_indep_trm); | |
683 | ||
684 | //vui = vuy - vux; | |
685 | vui = VSUB(vuy,vux); | |
686 | //vevenPart = vomegaEven * (VONE_HALF * (vpdf_NW + vpdf_SE) - vui * vui * vw_2_nine_half - vw_2_indep); | |
687 | vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_NW,vpdf_SE)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); | |
688 | //voddPart = vomegaOdd * (VONE_HALF * (vpdf_NW - vpdf_SE) - vui * vw_2_x3); | |
689 | voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_NW,vpdf_SE)),VMUL(vui,vw_2_x3))); | |
690 | //src[ADJ_LIST(D3Q19_NW)] =[UA] vpdf_NW - vevenPart - voddPart; | |
691 | VSTU(ppdf_SE, VSUB(VSUB(vpdf_NW,vevenPart),voddPart)); | |
692 | //src[ADJ_LIST(D3Q19_SE)] =[UA] vpdf_SE - vevenPart + voddPart; | |
693 | VSTU(ppdf_NW, VADD(VSUB(vpdf_SE,vevenPart),voddPart)); | |
694 | ||
695 | //vui = vux + vuy; | |
696 | vui = VADD(vux,vuy); | |
697 | //vevenPart = vomegaEven * (VONE_HALF * (vpdf_NE + vpdf_SW) - vui * vui * vw_2_nine_half - vw_2_indep); | |
698 | vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_NE,vpdf_SW)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); | |
699 | //voddPart = vomegaOdd * (VONE_HALF * (vpdf_NE - vpdf_SW) - vui * vw_2_x3); | |
700 | voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_NE,vpdf_SW)),VMUL(vui,vw_2_x3))); | |
701 | //src[ADJ_LIST(D3Q19_NE)] =[UA] vpdf_NE - vevenPart - voddPart; | |
702 | VSTU(ppdf_SW, VSUB(VSUB(vpdf_NE,vevenPart),voddPart)); | |
703 | //src[ADJ_LIST(D3Q19_SW)] =[UA] vpdf_SW - vevenPart + voddPart; | |
704 | VSTU(ppdf_NE, VADD(VSUB(vpdf_SW,vevenPart),voddPart)); | |
705 | ||
706 | //vui = vuz - vux; | |
707 | vui = VSUB(vuz,vux); | |
708 | //vevenPart = vomegaEven * (VONE_HALF * (vpdf_TW + vpdf_BE) - vui * vui * vw_2_nine_half - vw_2_indep); | |
709 | vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TW,vpdf_BE)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); | |
710 | //voddPart = vomegaOdd * (VONE_HALF * (vpdf_TW - vpdf_BE) - vui * vw_2_x3); | |
711 | voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TW,vpdf_BE)),VMUL(vui,vw_2_x3))); | |
712 | //src[ADJ_LIST(D3Q19_TW)] =[UA] vpdf_TW - vevenPart - voddPart; | |
713 | VSTU(ppdf_BE, VSUB(VSUB(vpdf_TW,vevenPart),voddPart)); | |
714 | //src[ADJ_LIST(D3Q19_BE)] =[UA] vpdf_BE - vevenPart + voddPart; | |
715 | VSTU(ppdf_TW, VADD(VSUB(vpdf_BE,vevenPart),voddPart)); | |
716 | ||
717 | //vui = vux + vuz; | |
718 | vui = VADD(vux,vuz); | |
719 | //vevenPart = vomegaEven * (VONE_HALF * (vpdf_TE + vpdf_BW) - vui * vui * vw_2_nine_half - vw_2_indep); | |
720 | vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TE,vpdf_BW)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); | |
721 | //voddPart = vomegaOdd * (VONE_HALF * (vpdf_TE - vpdf_BW) - vui * vw_2_x3); | |
722 | voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TE,vpdf_BW)),VMUL(vui,vw_2_x3))); | |
723 | //src[ADJ_LIST(D3Q19_TE)] =[UA] vpdf_TE - vevenPart - voddPart; | |
724 | VSTU(ppdf_BW, VSUB(VSUB(vpdf_TE,vevenPart),voddPart)); | |
725 | //src[ADJ_LIST(D3Q19_BW)] =[UA] vpdf_BW - vevenPart + voddPart; | |
726 | VSTU(ppdf_TE, VADD(VSUB(vpdf_BW,vevenPart),voddPart)); | |
727 | ||
728 | //vui = vuz - vuy; | |
729 | vui = VSUB(vuz,vuy); | |
730 | //vevenPart = vomegaEven * (VONE_HALF * (vpdf_TS + vpdf_BN) - vui * vui * vw_2_nine_half - vw_2_indep); | |
731 | vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TS,vpdf_BN)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); | |
732 | //voddPart = vomegaOdd * (VONE_HALF * (vpdf_TS - vpdf_BN) - vui * vw_2_x3); | |
733 | voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TS,vpdf_BN)),VMUL(vui,vw_2_x3))); | |
734 | //src[ADJ_LIST(D3Q19_TS)] =[UA] vpdf_TS - vevenPart - voddPart; | |
e3f82424 | 735 | VSTU(ppdf_BN, VSUB(VSUB(vpdf_TS,vevenPart),voddPart)); |
10988083 | 736 | //src[ADJ_LIST(D3Q19_BN)] =[UA] vpdf_BN - vevenPart + voddPart; |
e3f82424 | 737 | VSTU(ppdf_TS, VADD(VSUB(vpdf_BN,vevenPart),voddPart)); |
10988083 MW |
738 | |
739 | //vui = vuy + vuz; | |
740 | vui = VADD(vuy,vuz); | |
741 | //vevenPart = vomegaEven * (VONE_HALF * (vpdf_TN + vpdf_BS) - vui * vui * vw_2_nine_half - vw_2_indep); | |
742 | vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TN,vpdf_BS)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); | |
743 | //voddPart = vomegaOdd * (VONE_HALF * (vpdf_TN - vpdf_BS) - vui * vw_2_x3); | |
744 | voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TN,vpdf_BS)),VMUL(vui,vw_2_x3))); | |
745 | //src[ADJ_LIST(D3Q19_TN)] =[UA] vpdf_TN - vevenPart - voddPart; | |
746 | VSTU(ppdf_BS, VSUB(VSUB(vpdf_TN,vevenPart),voddPart)); | |
747 | //src[ADJ_LIST(D3Q19_BS)] =[UA] vpdf_BS - vevenPart + voddPart; | |
748 | VSTU(ppdf_TN, VADD(VSUB(vpdf_BS,vevenPart),voddPart)); | |
749 | ||
750 | consecValue -= (VSIZE - 1); | |
751 | index += (VSIZE - 1); | |
752 | pointerOffset = VSIZE; | |
753 | ||
754 | } | |
755 | else { | |
756 | // Scalar part. | |
757 | ||
758 | #define X(name, idx, idxinv, _x, _y, _z) JOIN(pdf_,name) = *(JOIN(ppdf_,name)); | |
759 | D3Q19_LIST_WO_C | |
760 | #undef X | |
761 | ||
762 | pdf_C = *ppdf_C; | |
763 | ||
764 | ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE - | |
765 | pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW; | |
766 | uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN - | |
767 | pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS; | |
768 | uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS - | |
769 | pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS; | |
770 | ||
771 | dens = pdf_C + | |
772 | pdf_N + pdf_E + pdf_S + pdf_W + | |
773 | pdf_NE + pdf_SE + pdf_SW + pdf_NW + | |
774 | pdf_T + pdf_TN + pdf_TE + pdf_TS + pdf_TW + | |
775 | pdf_B + pdf_BN + pdf_BE + pdf_BS + pdf_BW; | |
776 | ||
0fde6e45 | 777 | dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz) * F(3.0) / F(2.0); |
10988083 MW |
778 | |
779 | adjListIndex = index * N_D3Q19_IDX; | |
780 | ||
781 | // direction: w_0 | |
782 | src[I(index, D3Q19_C) ] = pdf_C - omegaEven * (pdf_C - w_0 * dir_indep_trm); | |
783 | ||
784 | // direction: w_1 | |
785 | w_1_indep = w_1 * dir_indep_trm; | |
786 | ||
787 | ui = uy; | |
0fde6e45 MW |
788 | evenPart = omegaEven * (F(0.5) * (pdf_N + pdf_S) - ui * ui * w_1_nine_half - w_1_indep); |
789 | oddPart = omegaOdd * (F(0.5) * (pdf_N - pdf_S) - ui * w_1_x3); | |
e3f82424 MW |
790 | *ppdf_S = pdf_N - evenPart - oddPart; |
791 | *ppdf_N = pdf_S - evenPart + oddPart; | |
10988083 MW |
792 | |
793 | ui = ux; | |
0fde6e45 MW |
794 | evenPart = omegaEven * (F(0.5) * (pdf_E + pdf_W) - ui * ui * w_1_nine_half - w_1_indep); |
795 | oddPart = omegaOdd * (F(0.5) * (pdf_E - pdf_W) - ui * w_1_x3); | |
e3f82424 MW |
796 | *ppdf_W = pdf_E - evenPart - oddPart; |
797 | *ppdf_E = pdf_W - evenPart + oddPart; | |
10988083 MW |
798 | |
799 | ui = uz; | |
0fde6e45 MW |
800 | evenPart = omegaEven * (F(0.5) * (pdf_T + pdf_B) - ui * ui * w_1_nine_half - w_1_indep); |
801 | oddPart = omegaOdd * (F(0.5) * (pdf_T - pdf_B) - ui * w_1_x3); | |
e3f82424 MW |
802 | *ppdf_B = pdf_T - evenPart - oddPart; |
803 | *ppdf_T = pdf_B - evenPart + oddPart; | |
10988083 MW |
804 | |
805 | // direction: w_2 | |
806 | w_2_indep = w_2 * dir_indep_trm; | |
807 | ||
808 | ui = -ux + uy; | |
0fde6e45 MW |
809 | evenPart = omegaEven * (F(0.5) * (pdf_NW + pdf_SE) - ui * ui * w_2_nine_half - w_2_indep); |
810 | oddPart = omegaOdd * (F(0.5) * (pdf_NW - pdf_SE) - ui * w_2_x3); | |
e3f82424 MW |
811 | *ppdf_SE = pdf_NW - evenPart - oddPart; |
812 | *ppdf_NW = pdf_SE - evenPart + oddPart; | |
10988083 MW |
813 | |
814 | ui = ux + uy; | |
0fde6e45 MW |
815 | evenPart = omegaEven * (F(0.5) * (pdf_NE + pdf_SW) - ui * ui * w_2_nine_half - w_2_indep); |
816 | oddPart = omegaOdd * (F(0.5) * (pdf_NE - pdf_SW) - ui * w_2_x3); | |
e3f82424 MW |
817 | *ppdf_SW = pdf_NE - evenPart - oddPart; |
818 | *ppdf_NE = pdf_SW - evenPart + oddPart; | |
10988083 MW |
819 | |
820 | ui = -ux + uz; | |
0fde6e45 MW |
821 | evenPart = omegaEven * (F(0.5) * (pdf_TW + pdf_BE) - ui * ui * w_2_nine_half - w_2_indep); |
822 | oddPart = omegaOdd * (F(0.5) * (pdf_TW - pdf_BE) - ui * w_2_x3); | |
e3f82424 MW |
823 | *ppdf_BE = pdf_TW - evenPart - oddPart; |
824 | *ppdf_TW = pdf_BE - evenPart + oddPart; | |
10988083 MW |
825 | |
826 | ui = ux + uz; | |
0fde6e45 MW |
827 | evenPart = omegaEven * (F(0.5) * (pdf_TE + pdf_BW) - ui * ui * w_2_nine_half - w_2_indep); |
828 | oddPart = omegaOdd * (F(0.5) * (pdf_TE - pdf_BW) - ui * w_2_x3); | |
e3f82424 MW |
829 | *ppdf_BW = pdf_TE - evenPart - oddPart; |
830 | *ppdf_TE = pdf_BW - evenPart + oddPart; | |
10988083 MW |
831 | |
832 | ui = -uy + uz; | |
0fde6e45 MW |
833 | evenPart = omegaEven * (F(0.5) * (pdf_TS + pdf_BN) - ui * ui * w_2_nine_half - w_2_indep); |
834 | oddPart = omegaOdd * (F(0.5) * (pdf_TS - pdf_BN) - ui * w_2_x3); | |
e3f82424 MW |
835 | *ppdf_BN = pdf_TS - evenPart - oddPart; |
836 | *ppdf_TS = pdf_BN - evenPart + oddPart; | |
10988083 MW |
837 | |
838 | ui = uy + uz; | |
0fde6e45 MW |
839 | evenPart = omegaEven * (F(0.5) * (pdf_TN + pdf_BS) - ui * ui * w_2_nine_half - w_2_indep); |
840 | oddPart = omegaOdd * (F(0.5) * (pdf_TN - pdf_BS) - ui * w_2_x3); | |
e3f82424 MW |
841 | *ppdf_BS = pdf_TN - evenPart - oddPart; |
842 | *ppdf_TN = pdf_BS - evenPart + oddPart; | |
10988083 MW |
843 | |
844 | pointerOffset = 1; | |
845 | } | |
846 | ||
847 | } // loop over fluid nodes | |
848 | ||
849 | #undef ADJ_LIST | |
850 | #undef I | |
851 | } |