Commit | Line | Data |
---|---|---|
10988083 MW |
1 | // -------------------------------------------------------------------------- |
2 | // | |
3 | // Copyright | |
4 | // Markus Wittmann, 2016-2017 | |
5 | // RRZE, University of Erlangen-Nuremberg, Germany | |
6 | // markus.wittmann -at- fau.de or hpc -at- rrze.fau.de | |
7 | // | |
8 | // Viktor Haag, 2016 | |
9 | // LSS, University of Erlangen-Nuremberg, Germany | |
10 | // | |
11 | // This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). | |
12 | // | |
13 | // LbmBenchKernels is free software: you can redistribute it and/or modify | |
14 | // it under the terms of the GNU General Public License as published by | |
15 | // the Free Software Foundation, either version 3 of the License, or | |
16 | // (at your option) any later version. | |
17 | // | |
18 | // LbmBenchKernels is distributed in the hope that it will be useful, | |
19 | // but WITHOUT ANY WARRANTY; without even the implied warranty of | |
20 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
21 | // GNU General Public License for more details. | |
22 | // | |
23 | // You should have received a copy of the GNU General Public License | |
24 | // along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>. | |
25 | // | |
26 | // -------------------------------------------------------------------------- | |
27 | #include "BenchKernelD3Q19ListAaPvCommon.h" | |
28 | ||
29 | #include "Memory.h" | |
30 | #include "Vtk.h" | |
31 | #include "Vector.h" | |
32 | #include "LikwidIf.h" | |
33 | ||
34 | #include <inttypes.h> | |
35 | #include <math.h> | |
36 | ||
37 | #ifdef _OPENMP | |
38 | #include <omp.h> | |
39 | #endif | |
40 | ||
41 | ||
42 | static void KernelEven(LatticeDesc * ld, KernelData * kernelData, CaseData * cd); | |
43 | static void KernelOdd( LatticeDesc * ld, KernelData * kernelData, CaseData * cd); | |
44 | ||
45 | void FNAME(D3Q19ListAaPvKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd) | |
46 | { | |
47 | ||
48 | Assert(ld != NULL); | |
49 | Assert(kernelData != NULL); | |
50 | Assert(cd != NULL); | |
51 | ||
0fde6e45 MW |
52 | Assert(cd->Omega > F(0.0)); |
53 | Assert(cd->Omega < F(2.0));; | |
10988083 MW |
54 | |
55 | #if defined(VTK_OUTPUT) || defined(STATISTICS) || defined(VERIFICATION) | |
56 | KernelData * kd = (KernelData *)kernelData; | |
57 | KernelDataList * kdl = KDL(kernelData); | |
58 | #endif | |
59 | ||
60 | int maxIterations = cd->MaxIterations; | |
61 | ||
62 | int nThreads = 1; | |
63 | #ifdef _OPENMP | |
64 | nThreads = omp_get_max_threads(); | |
65 | #endif | |
66 | ||
67 | #ifdef VTK_OUTPUT | |
68 | if (cd->VtkOutput) { | |
69 | kd->PdfsActive = kd->Pdfs[0]; | |
70 | VtkWrite(ld, kd, cd, -1); | |
71 | } | |
72 | #endif | |
73 | ||
74 | #ifdef STATISTICS | |
75 | kd->PdfsActive = kd->Pdfs[0]; | |
76 | KernelStatistics(kd, ld, cd, 0); | |
77 | #endif | |
78 | ||
79 | // TODO: outer openmp parallel | |
80 | ||
81 | for(int iter = 0; iter < maxIterations; iter += 2) { | |
82 | ||
83 | // --------------------------------------------------- | |
84 | // even time step | |
85 | // --------------------------------------------------- | |
86 | ||
87 | X_LIKWID_START("list-aa-pv-even"); | |
88 | ||
89 | #ifdef _OPENMP | |
90 | #pragma omp parallel default(none) shared(ld, kernelData, cd) | |
91 | #endif | |
92 | { | |
93 | KernelEven(ld, kernelData, cd); | |
94 | } | |
95 | ||
96 | X_LIKWID_STOP("list-aa-pv-even"); | |
97 | ||
98 | #ifdef VERIFICATION | |
99 | kdl->Iteration = iter; | |
100 | kd->PdfsActive = kd->Pdfs[0]; | |
101 | KernelAddBodyForce(kd, ld, cd); | |
102 | #endif | |
103 | ||
104 | // --------------------------------------------------- | |
105 | // odd time step | |
106 | // --------------------------------------------------- | |
107 | ||
108 | X_LIKWID_START("list-aa-pv-odd"); | |
109 | ||
110 | #ifdef _OPENMP | |
111 | #pragma omp parallel default(none) shared(ld, kernelData, cd) | |
112 | #endif | |
113 | { | |
114 | KernelOdd(ld, kernelData, cd); | |
115 | } | |
116 | ||
117 | X_LIKWID_STOP("list-aa-pv-odd"); | |
118 | ||
119 | ||
120 | #ifdef VERIFICATION | |
121 | kdl->Iteration = iter + 1; | |
122 | kd->PdfsActive = kd->Pdfs[0]; | |
123 | KernelAddBodyForce(kd, ld, cd); | |
124 | #endif | |
125 | ||
126 | #ifdef VTK_OUTPUT | |
127 | if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) { | |
128 | kdl->Iteration = iter + 1; | |
129 | kd->PdfsActive = kd->Pdfs[0]; | |
130 | VtkWrite(ld, kd, cd, iter); | |
131 | } | |
132 | #endif | |
133 | ||
134 | #ifdef STATISTICS | |
135 | kdl->Iteration = iter + 1; | |
136 | kd->PdfsActive = kd->Pdfs[0]; | |
137 | KernelStatistics(kd, ld, cd, iter); | |
138 | #endif | |
139 | ||
140 | } // for (int iter = 0; ... | |
141 | ||
142 | #ifdef VTK_OUTPUT | |
143 | if (cd->VtkOutput) { | |
144 | kd->PdfsActive = kd->Pdfs[0]; | |
145 | VtkWrite(ld, kd, cd, maxIterations); | |
146 | } | |
147 | #endif | |
148 | ||
149 | #ifdef STATISTICS | |
150 | kd->PdfsActive = kd->Pdfs[0]; | |
151 | KernelStatistics(kd, ld, cd, maxIterations); | |
152 | #endif | |
153 | ||
154 | return; | |
155 | } | |
156 | ||
157 | static void KernelEven(LatticeDesc * ld, KernelData * kernelData, CaseData * cd) | |
158 | { | |
159 | Assert(ld != NULL); | |
160 | Assert(kernelData != NULL); | |
161 | Assert(cd != NULL); | |
162 | ||
0fde6e45 MW |
163 | Assert(cd->Omega > F(0.0)); |
164 | Assert(cd->Omega < F(2.0)); | |
10988083 MW |
165 | |
166 | KernelData * kd = (KernelData *)kernelData; | |
167 | KernelDataList * kdl = KDL(kernelData); | |
168 | KernelDataListRia * kdlr = KDLR(kernelData); | |
169 | ||
170 | PdfT omega = cd->Omega; | |
171 | PdfT omegaEven = omega; | |
172 | ||
0fde6e45 MW |
173 | PdfT magicParam = F(1.0) / F(12.0); |
174 | PdfT omegaOdd = F(1.0) / (F(0.5) + magicParam / (F(1.0) / omega - F(0.5))); | |
10988083 | 175 | |
0fde6e45 MW |
176 | PdfT evenPart = F(0.0); |
177 | PdfT oddPart = F(0.0); | |
178 | PdfT dir_indep_trm = F(0.0); | |
10988083 | 179 | |
0fde6e45 MW |
180 | const PdfT w_0 = F(1.0) / F( 3.0); |
181 | const PdfT w_1 = F(1.0) / F(18.0); | |
182 | const PdfT w_2 = F(1.0) / F(36.0); | |
10988083 | 183 | |
0fde6e45 MW |
184 | const PdfT w_1_x3 = w_1 * F(3.0); const PdfT w_1_nine_half = w_1 * F(9.0) / F(2.0); PdfT w_1_indep = F(0.0); |
185 | const PdfT w_2_x3 = w_2 * F(3.0); const PdfT w_2_nine_half = w_2 * F(9.0) / F(2.0); PdfT w_2_indep = F(0.0); | |
10988083 MW |
186 | |
187 | PdfT ui; | |
188 | ||
189 | PdfT ux, uy, uz; | |
190 | PdfT dens; | |
191 | ||
192 | ||
0fde6e45 MW |
193 | VPDFT VONE_HALF = VSET(F(0.5)); |
194 | VPDFT VTHREE_HALF = VSET(F(3.0) / F(2.0)); | |
10988083 MW |
195 | |
196 | VPDFT vw_1_indep, vw_2_indep; | |
197 | VPDFT vw_0 = VSET(w_0); | |
198 | VPDFT vw_1 = VSET(w_1); | |
199 | VPDFT vw_2 = VSET(w_2); | |
200 | ||
201 | VPDFT vw_1_x3 = VSET(w_1_x3); | |
202 | VPDFT vw_2_x3 = VSET(w_2_x3); | |
203 | VPDFT vw_1_nine_half = VSET(w_1_nine_half); | |
204 | VPDFT vw_2_nine_half = VSET(w_2_nine_half); | |
205 | ||
206 | VPDFT vui, vux, vuy, vuz, vdens; | |
207 | ||
208 | VPDFT vevenPart, voddPart, vdir_indep_trm; | |
209 | ||
210 | VPDFT vomegaEven = VSET(omegaEven); | |
211 | VPDFT vomegaOdd = VSET(omegaOdd); | |
212 | ||
213 | // Declare pdf_N, pdf_E, pdf_S, pdf_W, ... | |
214 | #define X(name, idx, idxinv, x, y, z) PdfT JOIN(pdf_,name); VPDFT JOIN(vpdf_,name); | |
215 | D3Q19_LIST | |
216 | #undef X | |
217 | ||
218 | PdfT * src = kd->Pdfs[0]; | |
219 | ||
220 | int nCells = kdl->nCells; | |
221 | ||
222 | int threadId = 0; | |
223 | #ifdef _OPENMP | |
224 | threadId = omp_get_thread_num(); | |
225 | #endif | |
226 | ||
227 | int * threadIndices = kdlr->FluidNodeThreadIndices; | |
228 | ||
229 | int nFluidThread = threadIndices[threadId + 1] - threadIndices[threadId]; | |
230 | int nFluidVec = nFluidThread - (nFluidThread % VSIZE); | |
231 | ||
232 | int indexStartVec = threadIndices[threadId]; | |
233 | int indexStopVec = threadIndices[threadId] + nFluidVec; | |
234 | int indexStop = threadIndices[threadId] + nFluidThread; | |
235 | ||
236 | #define I(index, dir) P_INDEX_3((nCells), (index), (dir)) | |
237 | ||
238 | for (int index = indexStartVec; index < indexStopVec; index += VSIZE) { | |
239 | ||
240 | ||
241 | #define X(name, idx, idxinv, _x, _y, _z) JOIN(vpdf_,name) = VLDU(&src[I(index, idx)]); | |
242 | D3Q19_LIST | |
243 | #undef X | |
244 | ||
245 | ||
246 | //vux = vpdf_E + vpdf_NE + vpdf_SE + vpdf_TE + vpdf_BE - | |
247 | // vpdf_W - vpdf_NW - vpdf_SW - vpdf_TW - vpdf_BW; | |
248 | vux = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_E,VADD(vpdf_NE,vpdf_SE)),VADD(vpdf_TE,vpdf_BE)),vpdf_W),vpdf_NW),vpdf_SW),vpdf_TW),vpdf_BW); | |
249 | //vuy = vpdf_N + vpdf_NE + vpdf_NW + vpdf_TN + vpdf_BN - | |
250 | // vpdf_S - vpdf_SE - vpdf_SW - vpdf_TS - vpdf_BS; | |
251 | vuy = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_N,VADD(vpdf_NE,vpdf_NW)),VADD(vpdf_TN,vpdf_BN)),vpdf_S),vpdf_SE),vpdf_SW),vpdf_TS),vpdf_BS); | |
252 | //vuz = vpdf_T + vpdf_TE + vpdf_TW + vpdf_TN + vpdf_TS - | |
253 | // vpdf_B - vpdf_BE - vpdf_BW - vpdf_BN - vpdf_BS; | |
254 | vuz = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_T,VADD(vpdf_TE,vpdf_TW)),VADD(vpdf_TN,vpdf_TS)),vpdf_B),vpdf_BE),vpdf_BW),vpdf_BN),vpdf_BS); | |
255 | ||
256 | //vdens = vpdf_C + | |
257 | // vpdf_N + vpdf_E + vpdf_S + vpdf_W + | |
258 | // vpdf_NE + vpdf_SE + vpdf_SW + vpdf_NW + | |
259 | // vpdf_T + vpdf_TN + vpdf_TE + vpdf_TS + vpdf_TW + | |
260 | // vpdf_B + vpdf_BN + vpdf_BE + vpdf_BS + vpdf_BW; | |
261 | vdens = VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(vpdf_C,VADD(vpdf_N,vpdf_E)),VADD(vpdf_S,vpdf_W)),VADD(vpdf_NE,vpdf_SE)),VADD(vpdf_SW,vpdf_NW)),VADD(vpdf_T,vpdf_TN)),VADD(vpdf_TE,vpdf_TS)),VADD(vpdf_TW,vpdf_B)),VADD(vpdf_BN,vpdf_BE)),VADD(vpdf_BS,vpdf_BW)); | |
262 | ||
263 | //vdir_indep_trm = vdens - (vux * vux + vuy * vuy + vuz * vuz) * VTHREE_HALF; | |
264 | vdir_indep_trm = VSUB(vdens,VMUL(VADD(VADD(VMUL(vux,vux),VMUL(vuy,vuy)),VMUL(vuz,vuz)),VTHREE_HALF)); | |
265 | ||
266 | //src[I(index, D3Q19_C) ] =[UA] vpdf_C - vomegaEven * (vpdf_C - vw_0 * vdir_indep_trm); | |
267 | VSTU(&src[I(index, D3Q19_C)],VSUB(vpdf_C,VMUL(vomegaEven,VSUB(vpdf_C,VMUL(vw_0,vdir_indep_trm))))); | |
268 | ||
269 | //vw_1_indep = vw_1 * vdir_indep_trm; | |
270 | vw_1_indep = VMUL(vw_1,vdir_indep_trm); | |
271 | ||
272 | //vui = vuy; | |
273 | vui = vuy; | |
274 | //vevenPart = vomegaEven * (VONE_HALF * (vpdf_N + vpdf_S) - vui * vui * vw_1_nine_half - vw_1_indep); | |
275 | vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_N,vpdf_S)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep)); | |
276 | //voddPart = vomegaOdd * (VONE_HALF * (vpdf_N - vpdf_S) - vui * vw_1_x3); | |
277 | voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_N,vpdf_S)),VMUL(vui,vw_1_x3))); | |
278 | //src[I(index, D3Q19_S)] =[UA] vpdf_N - vevenPart - voddPart; | |
279 | VSTU(&src[I(index, D3Q19_S)],VSUB(VSUB(vpdf_N,vevenPart),voddPart)); | |
280 | //src[I(index, D3Q19_N)] =[UA] vpdf_S - vevenPart + voddPart; | |
281 | VSTU(&src[I(index, D3Q19_N)],VADD(VSUB(vpdf_S,vevenPart),voddPart)); | |
282 | ||
283 | //vui = vux; | |
284 | vui = vux; | |
285 | //vevenPart = vomegaEven * (VONE_HALF * (vpdf_E + vpdf_W) - vui * vui * vw_1_nine_half - vw_1_indep); | |
286 | vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_E,vpdf_W)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep)); | |
287 | //voddPart = vomegaOdd * (VONE_HALF * (vpdf_E - vpdf_W) - vui * vw_1_x3 ); | |
288 | voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_E,vpdf_W)),VMUL(vui,vw_1_x3))); | |
289 | //src[I(index, D3Q19_W)] =[UA] vpdf_E - vevenPart - voddPart; | |
290 | VSTU(&src[I(index, D3Q19_W)],VSUB(VSUB(vpdf_E,vevenPart),voddPart)); | |
291 | //src[I(index, D3Q19_E)] =[UA] vpdf_W - vevenPart + voddPart; | |
292 | VSTU(&src[I(index, D3Q19_E)],VADD(VSUB(vpdf_W,vevenPart),voddPart)); | |
293 | ||
294 | //vui = vuz; | |
295 | vui = vuz; | |
296 | //vevenPart = vomegaEven * (VONE_HALF * (vpdf_T + vpdf_B) - vui * vui * vw_1_nine_half - vw_1_indep); | |
297 | vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_T,vpdf_B)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep)); | |
298 | //voddPart = vomegaOdd * (VONE_HALF * (vpdf_T - vpdf_B) - vui * vw_1_x3); | |
299 | voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_T,vpdf_B)),VMUL(vui,vw_1_x3))); | |
300 | //src[I(index, D3Q19_B)] =[UA] vpdf_T - vevenPart - voddPart; | |
301 | VSTU(&src[I(index, D3Q19_B)],VSUB(VSUB(vpdf_T,vevenPart),voddPart)); | |
302 | //src[I(index, D3Q19_T)] =[UA] vpdf_B - vevenPart + voddPart; | |
303 | VSTU(&src[I(index, D3Q19_T)],VADD(VSUB(vpdf_B,vevenPart),voddPart)); | |
304 | ||
305 | //vw_2_indep = vw_2 * vdir_indep_trm; | |
306 | vw_2_indep = VMUL(vw_2,vdir_indep_trm); | |
307 | ||
308 | //vui = vuy - vux; | |
309 | vui = VSUB(vuy,vux); | |
310 | //vevenPart = vomegaEven * (VONE_HALF * (vpdf_NW + vpdf_SE) - vui * vui * vw_2_nine_half - vw_2_indep); | |
311 | vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_NW,vpdf_SE)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); | |
312 | //voddPart = vomegaOdd * (VONE_HALF * (vpdf_NW - vpdf_SE) - vui * vw_2_x3); | |
313 | voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_NW,vpdf_SE)),VMUL(vui,vw_2_x3))); | |
314 | //src[I(index, D3Q19_SE)] =[UA] vpdf_NW - vevenPart - voddPart; | |
315 | VSTU(&src[I(index, D3Q19_SE)],VSUB(VSUB(vpdf_NW,vevenPart),voddPart)); | |
316 | //src[I(index, D3Q19_NW)] =[UA] vpdf_SE - vevenPart + voddPart; | |
317 | VSTU(&src[I(index, D3Q19_NW)],VADD(VSUB(vpdf_SE,vevenPart),voddPart)); | |
318 | ||
319 | //vui = vux + vuy; | |
320 | vui = VADD(vux,vuy); | |
321 | //vevenPart = vomegaEven * (VONE_HALF * (vpdf_NE + vpdf_SW) - vui * vui * vw_2_nine_half - vw_2_indep); | |
322 | vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_NE,vpdf_SW)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); | |
323 | //voddPart = vomegaOdd * (VONE_HALF * (vpdf_NE - vpdf_SW) - vui * vw_2_x3); | |
324 | voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_NE,vpdf_SW)),VMUL(vui,vw_2_x3))); | |
325 | //src[I(index, D3Q19_SW)] =[UA] vpdf_NE - vevenPart - voddPart; | |
326 | VSTU(&src[I(index, D3Q19_SW)],VSUB(VSUB(vpdf_NE,vevenPart),voddPart)); | |
327 | //src[I(index, D3Q19_NE)] =[UA] vpdf_SW - vevenPart + voddPart; | |
328 | VSTU(&src[I(index, D3Q19_NE)],VADD(VSUB(vpdf_SW,vevenPart),voddPart)); | |
329 | ||
330 | //vui = vuz - vux; | |
331 | vui = VSUB(vuz,vux); | |
332 | //vevenPart = vomegaEven * (VONE_HALF * (vpdf_TW + vpdf_BE) - vui * vui * vw_2_nine_half - vw_2_indep); | |
333 | vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TW,vpdf_BE)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); | |
334 | //voddPart = vomegaOdd * (VONE_HALF * (vpdf_TW - vpdf_BE) - vui * vw_2_x3); | |
335 | voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TW,vpdf_BE)),VMUL(vui,vw_2_x3))); | |
336 | //src[I(index, D3Q19_BE)] =[UA] vpdf_TW - vevenPart - voddPart; | |
337 | VSTU(&src[I(index, D3Q19_BE)],VSUB(VSUB(vpdf_TW,vevenPart),voddPart)); | |
338 | //src[I(index, D3Q19_TW)] =[UA] vpdf_BE - vevenPart + voddPart; | |
339 | VSTU(&src[I(index, D3Q19_TW)],VADD(VSUB(vpdf_BE,vevenPart),voddPart)); | |
340 | ||
341 | //vui = vux + vuz; | |
342 | vui = VADD(vux,vuz); | |
343 | //vevenPart = vomegaEven * (VONE_HALF * (vpdf_TE + vpdf_BW) - vui * vui * vw_2_nine_half - vw_2_indep); | |
344 | vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TE,vpdf_BW)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); | |
345 | //voddPart = vomegaOdd * (VONE_HALF * (vpdf_TE - vpdf_BW) - vui * vw_2_x3); | |
346 | voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TE,vpdf_BW)),VMUL(vui,vw_2_x3))); | |
347 | //src[I(index, D3Q19_BW)] =[UA] vpdf_TE - vevenPart - voddPart; | |
348 | VSTU(&src[I(index, D3Q19_BW)],VSUB(VSUB(vpdf_TE,vevenPart),voddPart)); | |
349 | //src[I(index, D3Q19_TE)] =[UA] vpdf_BW - vevenPart + voddPart; | |
350 | VSTU(&src[I(index, D3Q19_TE)],VADD(VSUB(vpdf_BW,vevenPart),voddPart)); | |
351 | ||
352 | //vui = vuz - vuy; | |
353 | vui = VSUB(vuz,vuy); | |
354 | //vevenPart = vomegaEven * (VONE_HALF * (vpdf_TS + vpdf_BN) - vui * vui * vw_2_nine_half - vw_2_indep); | |
355 | vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TS,vpdf_BN)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); | |
356 | //voddPart = vomegaOdd * (VONE_HALF * (vpdf_TS - vpdf_BN) - vui * vw_2_x3); | |
357 | voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TS,vpdf_BN)),VMUL(vui,vw_2_x3))); | |
358 | //src[I(index, D3Q19_BN)] =[UA] vpdf_TS - vevenPart - voddPart; | |
359 | VSTU(&src[I(index, D3Q19_BN)],VSUB(VSUB(vpdf_TS,vevenPart),voddPart)); | |
360 | //src[I(index, D3Q19_TS)] =[UA] vpdf_BN - vevenPart + voddPart; | |
361 | VSTU(&src[I(index, D3Q19_TS)],VADD(VSUB(vpdf_BN,vevenPart),voddPart)); | |
362 | ||
363 | //vui = vuy + vuz; | |
364 | vui = VADD(vuy,vuz); | |
365 | //vevenPart = vomegaEven * (VONE_HALF * (vpdf_TN + vpdf_BS) - vui * vui * vw_2_nine_half - vw_2_indep); | |
366 | vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TN,vpdf_BS)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); | |
367 | //voddPart = vomegaOdd * (VONE_HALF * (vpdf_TN - vpdf_BS) - vui * vw_2_x3); | |
368 | voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TN,vpdf_BS)),VMUL(vui,vw_2_x3))); | |
369 | //src[I(index, D3Q19_BS)] =[UA] vpdf_TN - vevenPart - voddPart; | |
370 | VSTU(&src[I(index, D3Q19_BS)],VSUB(VSUB(vpdf_TN,vevenPart),voddPart)); | |
371 | //src[I(index, D3Q19_TN)] =[UA] vpdf_BS - vevenPart + voddPart; | |
372 | VSTU(&src[I(index, D3Q19_TN)],VADD(VSUB(vpdf_BS,vevenPart),voddPart)); | |
373 | ||
374 | } // loop over fluid nodes | |
375 | ||
376 | for (int index = indexStopVec; index < indexStop; ++index) { | |
377 | ||
378 | #define X(name, idx, idxinv, _x, _y, _z) JOIN(pdf_,name) = src[I(index, idx)]; | |
379 | D3Q19_LIST | |
380 | #undef X | |
381 | ||
382 | ||
383 | ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE - | |
384 | pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW; | |
385 | uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN - | |
386 | pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS; | |
387 | uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS - | |
388 | pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS; | |
389 | ||
390 | dens = pdf_C + | |
391 | pdf_N + pdf_E + pdf_S + pdf_W + | |
392 | pdf_NE + pdf_SE + pdf_SW + pdf_NW + | |
393 | pdf_T + pdf_TN + pdf_TE + pdf_TS + pdf_TW + | |
394 | pdf_B + pdf_BN + pdf_BE + pdf_BS + pdf_BW; | |
395 | ||
0fde6e45 | 396 | dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz) * F(3.0) / F(2.0); |
10988083 MW |
397 | |
398 | // direction: w_0 | |
399 | src[I(index, D3Q19_C) ] = pdf_C - omegaEven*(pdf_C - w_0*dir_indep_trm); | |
400 | ||
401 | // direction: w_1 | |
402 | w_1_indep = w_1*dir_indep_trm; | |
403 | ||
404 | ui = uy; | |
0fde6e45 MW |
405 | evenPart = omegaEven*( F(0.5)*(pdf_N + pdf_S) - ui*ui*w_1_nine_half - w_1_indep ); |
406 | oddPart = omegaOdd*(F(0.5)*(pdf_N - pdf_S) - ui*w_1_x3 ); | |
10988083 MW |
407 | src[I(index, D3Q19_S)] = pdf_N - evenPart - oddPart; |
408 | src[I(index, D3Q19_N)] = pdf_S - evenPart + oddPart; | |
409 | ||
410 | ui = ux; | |
0fde6e45 MW |
411 | evenPart = omegaEven*( F(0.5)*(pdf_E + pdf_W) - ui*ui*w_1_nine_half - w_1_indep ); |
412 | oddPart = omegaOdd*(F(0.5)*(pdf_E - pdf_W) - ui*w_1_x3 ); | |
10988083 MW |
413 | src[I(index, D3Q19_W)] = pdf_E - evenPart - oddPart; |
414 | src[I(index, D3Q19_E)] = pdf_W - evenPart + oddPart; | |
415 | ||
416 | ui = uz; | |
0fde6e45 MW |
417 | evenPart = omegaEven*( F(0.5)*(pdf_T + pdf_B) - ui*ui*w_1_nine_half - w_1_indep ); |
418 | oddPart = omegaOdd*(F(0.5)*(pdf_T - pdf_B) - ui*w_1_x3 ); | |
10988083 MW |
419 | src[I(index, D3Q19_B)] = pdf_T - evenPart - oddPart; |
420 | src[I(index, D3Q19_T)] = pdf_B - evenPart + oddPart; | |
421 | ||
422 | // direction: w_2 | |
423 | w_2_indep = w_2*dir_indep_trm; | |
424 | ||
425 | ui = -ux + uy; | |
0fde6e45 MW |
426 | evenPart = omegaEven*( F(0.5)*(pdf_NW + pdf_SE) - ui*ui*w_2_nine_half - w_2_indep ); |
427 | oddPart = omegaOdd*(F(0.5)*(pdf_NW - pdf_SE) - ui*w_2_x3 ); | |
10988083 MW |
428 | src[I(index, D3Q19_SE)] = pdf_NW - evenPart - oddPart; |
429 | src[I(index, D3Q19_NW)] = pdf_SE - evenPart + oddPart; | |
430 | ||
431 | ui = ux + uy; | |
0fde6e45 MW |
432 | evenPart = omegaEven*( F(0.5)*(pdf_NE + pdf_SW) - ui*ui*w_2_nine_half - w_2_indep ); |
433 | oddPart = omegaOdd*(F(0.5)*(pdf_NE - pdf_SW) - ui*w_2_x3 ); | |
10988083 MW |
434 | src[I(index, D3Q19_SW)] = pdf_NE - evenPart - oddPart; |
435 | src[I(index, D3Q19_NE)] = pdf_SW - evenPart + oddPart; | |
436 | ||
437 | ui = -ux + uz; | |
0fde6e45 MW |
438 | evenPart = omegaEven*( F(0.5)*(pdf_TW + pdf_BE) - ui*ui*w_2_nine_half - w_2_indep ); |
439 | oddPart = omegaOdd*(F(0.5)*(pdf_TW - pdf_BE) - ui*w_2_x3 ); | |
10988083 MW |
440 | src[I(index, D3Q19_BE)] = pdf_TW - evenPart - oddPart; |
441 | src[I(index, D3Q19_TW)] = pdf_BE - evenPart + oddPart; | |
442 | ||
443 | ui = ux + uz; | |
0fde6e45 MW |
444 | evenPart = omegaEven*( F(0.5)*(pdf_TE + pdf_BW) - ui*ui*w_2_nine_half - w_2_indep ); |
445 | oddPart = omegaOdd*(F(0.5)*(pdf_TE - pdf_BW) - ui*w_2_x3 ); | |
10988083 MW |
446 | src[I(index, D3Q19_BW)] = pdf_TE - evenPart - oddPart; |
447 | src[I(index, D3Q19_TE)] = pdf_BW - evenPart + oddPart; | |
448 | ||
449 | ui = -uy + uz; | |
0fde6e45 MW |
450 | evenPart = omegaEven*( F(0.5)*(pdf_TS + pdf_BN) - ui*ui*w_2_nine_half - w_2_indep ); |
451 | oddPart = omegaOdd*(F(0.5)*(pdf_TS - pdf_BN) - ui*w_2_x3 ); | |
10988083 MW |
452 | src[I(index, D3Q19_BN)] = pdf_TS - evenPart - oddPart; |
453 | src[I(index, D3Q19_TS)] = pdf_BN - evenPart + oddPart; | |
454 | ||
455 | ui = uy + uz; | |
0fde6e45 MW |
456 | evenPart = omegaEven*( F(0.5)*(pdf_TN + pdf_BS) - ui*ui*w_2_nine_half - w_2_indep ); |
457 | oddPart = omegaOdd*(F(0.5)*(pdf_TN - pdf_BS) - ui*w_2_x3 ); | |
10988083 MW |
458 | src[I(index, D3Q19_BS)] = pdf_TN - evenPart - oddPart; |
459 | src[I(index, D3Q19_TN)] = pdf_BS - evenPart + oddPart; | |
460 | ||
461 | } // loop over fluid nodes | |
462 | ||
463 | #undef I | |
464 | ||
465 | return; | |
466 | } | |
467 | ||
468 | static void KernelOdd(LatticeDesc * ld, KernelData * kernelData, CaseData * cd) | |
469 | { | |
470 | ||
471 | Assert(ld != NULL); | |
472 | Assert(kernelData != NULL); | |
473 | Assert(cd != NULL); | |
474 | ||
0fde6e45 MW |
475 | Assert(cd->Omega > F(0.0)); |
476 | Assert(cd->Omega < F(2.0)); | |
10988083 MW |
477 | |
478 | KernelData * kd = (KernelData *)kernelData; | |
479 | KernelDataList * kdl = KDL(kernelData); | |
480 | KernelDataListRia * kdlr = KDLR(kernelData); | |
481 | ||
482 | PdfT omega = cd->Omega; | |
483 | PdfT omegaEven = omega; | |
484 | ||
0fde6e45 MW |
485 | PdfT magicParam = F(1.0) / F(12.0); |
486 | PdfT omegaOdd = F(1.0) / (F(0.5) + magicParam / (F(1.0) / omega - F(0.5))); | |
10988083 | 487 | |
0fde6e45 MW |
488 | PdfT evenPart = F(0.0); |
489 | PdfT oddPart = F(0.0); | |
490 | PdfT dir_indep_trm = F(0.0); | |
10988083 | 491 | |
0fde6e45 MW |
492 | const PdfT w_0 = F(1.0) / F( 3.0); |
493 | const PdfT w_1 = F(1.0) / F(18.0); | |
494 | const PdfT w_2 = F(1.0) / F(36.0); | |
10988083 | 495 | |
0fde6e45 MW |
496 | const PdfT w_1_x3 = w_1 * F(3.0); const PdfT w_1_nine_half = w_1 * F(9.0) / F(2.0); PdfT w_1_indep = F(0.0); |
497 | const PdfT w_2_x3 = w_2 * F(3.0); const PdfT w_2_nine_half = w_2 * F(9.0) / F(2.0); PdfT w_2_indep = F(0.0); | |
10988083 MW |
498 | |
499 | PdfT ui; | |
500 | ||
501 | PdfT ux, uy, uz; | |
502 | PdfT dens; | |
503 | ||
504 | ||
0fde6e45 MW |
505 | VPDFT VONE_HALF = VSET(F(0.5)); |
506 | VPDFT VTHREE_HALF = VSET(F(3.0) / F(2.0)); | |
10988083 MW |
507 | |
508 | VPDFT vw_1_indep, vw_2_indep; | |
509 | VPDFT vw_0 = VSET(w_0); | |
510 | VPDFT vw_1 = VSET(w_1); | |
511 | VPDFT vw_2 = VSET(w_2); | |
512 | ||
513 | VPDFT vw_1_x3 = VSET(w_1_x3); | |
514 | VPDFT vw_2_x3 = VSET(w_2_x3); | |
515 | VPDFT vw_1_nine_half = VSET(w_1_nine_half); | |
516 | VPDFT vw_2_nine_half = VSET(w_2_nine_half); | |
517 | ||
518 | VPDFT vui, vux, vuy, vuz, vdens; | |
519 | ||
520 | VPDFT vevenPart, voddPart, vdir_indep_trm; | |
521 | ||
522 | VPDFT vomegaEven = VSET(omegaEven); | |
523 | VPDFT vomegaOdd = VSET(omegaOdd); | |
524 | ||
525 | ||
526 | // Declare pdf_N, pdf_E, pdf_S, pdf_W, ... | |
527 | #define X(name, idx, idxinv, x, y, z) PdfT JOIN(pdf_,name); VPDFT JOIN(vpdf_,name); | |
528 | D3Q19_LIST | |
529 | #undef X | |
530 | ||
531 | // Declare pointers to pdfs ppdf_N, ppdf_E, ppdf_S, ppdf_W, ... | |
532 | #define X(name, idx, idxinv, x, y, z) PdfT * JOIN(ppdf_,name) = NULL; | |
533 | D3Q19_LIST | |
534 | #undef X | |
535 | ||
536 | uint32_t nConsecNodes = kdlr->nConsecNodes; | |
537 | uint32_t * consecNodes = kdlr->ConsecNodes; | |
538 | uint32_t consecIndex = 0; | |
539 | uint32_t consecValue = 0; | |
540 | ||
541 | #ifndef DEBUG | |
542 | UNUSED(nConsecNodes); | |
543 | #endif | |
544 | ||
545 | PdfT * src = kd->Pdfs[0]; | |
546 | ||
547 | int nCells = kdl->nCells; | |
548 | ||
549 | uint32_t adjListIndex; | |
550 | uint32_t * adjList = kdl->AdjList; | |
551 | ||
552 | int threadId = 0; | |
553 | ||
554 | #ifdef _OPENMP | |
555 | threadId = omp_get_thread_num(); | |
556 | #endif | |
557 | ||
558 | consecIndex = kdlr->ConsecThreadIndices[threadId]; | |
559 | consecValue = 0; | |
560 | ||
561 | int * threadIndices = kdlr->FluidNodeThreadIndices; | |
562 | ||
563 | int nFluidThread = threadIndices[threadId + 1] - threadIndices[threadId]; | |
564 | ||
565 | int indexStart = threadIndices[threadId]; | |
566 | int indexStop = threadIndices[threadId] + nFluidThread; | |
567 | ||
568 | #define I(index, dir) P_INDEX_3((nCells), (index), (dir)) | |
569 | ||
570 | #define ADJ_LIST(dir) adjList[adjListIndex + (dir)] | |
571 | ||
572 | int pointerOffset = 1; | |
573 | ||
574 | for (int index = indexStart; index < indexStop; index += 1) { | |
575 | ||
576 | if (consecValue > 0) { | |
577 | --consecValue; | |
578 | // Increment all pdf pointers by an offset. If the previous iteration was | |
579 | // scalar, increment only by one. If the previous iteration was vectorized, | |
580 | // increment by the vector width. These offsets are set in the corresponding | |
581 | // if branches. | |
582 | #define X(name, idx, idxinv, _x, _y, _z) JOIN(ppdf_,name) += pointerOffset; | |
583 | D3Q19_LIST | |
584 | #undef X | |
585 | } | |
586 | else { | |
587 | Assert(consecIndex < nConsecNodes); | |
588 | ||
589 | consecValue = consecNodes[consecIndex] - 1; | |
590 | // Load new pointers to PDFs of local cell: | |
591 | ||
592 | adjListIndex = index * N_D3Q19_IDX; | |
593 | ||
594 | #define X(name, idx, idxinv, _x, _y, _z) JOIN(ppdf_,name) = &(src[adjList[adjListIndex + idxinv]]); | |
595 | D3Q19_LIST_WO_C | |
596 | #undef X | |
597 | ||
598 | ppdf_C = &(src[P_INDEX_3(nCells, index, D3Q19_C)]); | |
599 | ++consecIndex; | |
600 | } | |
601 | ||
602 | #define X(name, idx, idxinv, _x, _y, _z) JOIN(pdf_,name) = *JOIN(ppdf_,name); | |
603 | D3Q19_LIST | |
604 | #undef X | |
605 | ||
606 | if (consecValue >= (VSIZE - 1)) { | |
607 | // Vectorized part. | |
608 | ||
609 | #define X(name, idx, idxinv, _x, _y, _z) JOIN(vpdf_,name) = VLDU(JOIN(ppdf_,name)); | |
610 | D3Q19_LIST_WO_C | |
611 | #undef X | |
612 | ||
613 | vpdf_C = VLDU(ppdf_C); | |
614 | ||
615 | //vux = vpdf_E + vpdf_NE + vpdf_SE + vpdf_TE + vpdf_BE - | |
616 | // vpdf_W - vpdf_NW - vpdf_SW - vpdf_TW - vpdf_BW; | |
617 | vux = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_E,VADD(vpdf_NE,vpdf_SE)),VADD(vpdf_TE,vpdf_BE)),vpdf_W),vpdf_NW),vpdf_SW),vpdf_TW),vpdf_BW); | |
618 | //vuy = vpdf_N + vpdf_NE + vpdf_NW + vpdf_TN + vpdf_BN - | |
619 | // vpdf_S - vpdf_SE - vpdf_SW - vpdf_TS - vpdf_BS; | |
620 | vuy = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_N,VADD(vpdf_NE,vpdf_NW)),VADD(vpdf_TN,vpdf_BN)),vpdf_S),vpdf_SE),vpdf_SW),vpdf_TS),vpdf_BS); | |
621 | //vuz = vpdf_T + vpdf_TE + vpdf_TW + vpdf_TN + vpdf_TS - | |
622 | // vpdf_B - vpdf_BE - vpdf_BW - vpdf_BN - vpdf_BS; | |
623 | vuz = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_T,VADD(vpdf_TE,vpdf_TW)),VADD(vpdf_TN,vpdf_TS)),vpdf_B),vpdf_BE),vpdf_BW),vpdf_BN),vpdf_BS); | |
624 | ||
625 | //vdens = vpdf_C + | |
626 | // vpdf_N + vpdf_E + vpdf_S + vpdf_W + | |
627 | // vpdf_NE + vpdf_SE + vpdf_SW + vpdf_NW + | |
628 | // vpdf_T + vpdf_TN + vpdf_TE + vpdf_TS + vpdf_TW + | |
629 | // vpdf_B + vpdf_BN + vpdf_BE + vpdf_BS + vpdf_BW; | |
630 | vdens = VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(vpdf_C,VADD(vpdf_N,vpdf_E)),VADD(vpdf_S,vpdf_W)),VADD(vpdf_NE,vpdf_SE)), | |
631 | VADD(vpdf_SW,vpdf_NW)),VADD(vpdf_T,vpdf_TN)),VADD(vpdf_TE,vpdf_TS)),VADD(vpdf_TW,vpdf_B)),VADD(vpdf_BN,vpdf_BE)),VADD(vpdf_BS,vpdf_BW)); | |
632 | ||
633 | //vdir_indep_trm = vdens - (vux * vux + vuy * vuy + vuz * vuz) * VTHREE_HALF; | |
634 | vdir_indep_trm = VSUB(vdens,VMUL(VADD(VADD(VMUL(vux,vux),VMUL(vuy,vuy)),VMUL(vuz,vuz)),VTHREE_HALF)); | |
635 | ||
636 | adjListIndex = index * N_D3Q19_IDX; | |
637 | ||
638 | //src[I(index, D3Q19_C)] =[UA] vpdf_C - vomegaEven * (vpdf_C - vw_0 * vdir_indep_trm); | |
639 | VSTU(&src[I(index, D3Q19_C)],VSUB(vpdf_C,VMUL(vomegaEven,VSUB(vpdf_C,VMUL(vw_0,vdir_indep_trm))))); | |
640 | ||
641 | //vw_1_indep = vw_1 * vdir_indep_trm; | |
642 | vw_1_indep = VMUL(vw_1,vdir_indep_trm); | |
643 | ||
644 | //vui = vuy; | |
645 | vui = vuy; | |
646 | //vevenPart = vomegaEven * (VONE_HALF * (vpdf_N + vpdf_S) - vui * vui * vw_1_nine_half - vw_1_indep); | |
647 | vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_N,vpdf_S)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep)); | |
648 | //voddPart = vomegaOdd * (VONE_HALF * (vpdf_N - vpdf_S) - vui * vw_1_x3); | |
649 | voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_N,vpdf_S)),VMUL(vui,vw_1_x3))); | |
650 | //src[ADJ_LIST(D3Q19_N)] =[UA] vpdf_N - vevenPart - voddPart; | |
651 | VSTU(ppdf_S, VSUB(VSUB(vpdf_N,vevenPart),voddPart)); | |
652 | //src[ADJ_LIST(D3Q19_S)] =[UA] vpdf_S - vevenPart + voddPart; | |
653 | VSTU(ppdf_N, VADD(VSUB(vpdf_S,vevenPart),voddPart)); | |
654 | ||
655 | //vui = vux; | |
656 | vui = vux; | |
657 | //vevenPart = vomegaEven * (VONE_HALF * (vpdf_E + vpdf_W) - vui * vui * vw_1_nine_half - vw_1_indep); | |
658 | vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_E,vpdf_W)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep)); | |
659 | //voddPart = vomegaOdd * (VONE_HALF * (vpdf_E - vpdf_W) - vui * vw_1_x3); | |
660 | voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_E,vpdf_W)),VMUL(vui,vw_1_x3))); | |
661 | //src[ADJ_LIST(D3Q19_E)] =[UA] vpdf_E - vevenPart - voddPart; | |
662 | VSTU(ppdf_W, VSUB(VSUB(vpdf_E,vevenPart),voddPart)); | |
663 | //src[ADJ_LIST(D3Q19_W)] =[UA] vpdf_W - vevenPart + voddPart; | |
664 | VSTU(ppdf_E, VADD(VSUB(vpdf_W,vevenPart),voddPart)); | |
665 | ||
666 | //vui = vuz; | |
667 | vui = vuz; | |
668 | //vevenPart = vomegaEven * (VONE_HALF * (vpdf_T + vpdf_B) - vui * vui * vw_1_nine_half - vw_1_indep); | |
669 | vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_T,vpdf_B)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep)); | |
670 | //voddPart = vomegaOdd * (VONE_HALF * (vpdf_T - vpdf_B) - vui * vw_1_x3); | |
671 | voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_T,vpdf_B)),VMUL(vui,vw_1_x3))); | |
672 | //src[ADJ_LIST(D3Q19_T)] =[UA] vpdf_T - vevenPart - voddPart; | |
673 | VSTU(ppdf_B, VSUB(VSUB(vpdf_T,vevenPart),voddPart)); | |
674 | //src[ADJ_LIST(D3Q19_B)] =[UA] vpdf_B - vevenPart + voddPart; | |
675 | VSTU(ppdf_T, VADD(VSUB(vpdf_B,vevenPart),voddPart)); | |
676 | ||
677 | //vw_2_indep = vw_2 * vdir_indep_trm; | |
678 | vw_2_indep = VMUL(vw_2,vdir_indep_trm); | |
679 | ||
680 | //vui = vuy - vux; | |
681 | vui = VSUB(vuy,vux); | |
682 | //vevenPart = vomegaEven * (VONE_HALF * (vpdf_NW + vpdf_SE) - vui * vui * vw_2_nine_half - vw_2_indep); | |
683 | vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_NW,vpdf_SE)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); | |
684 | //voddPart = vomegaOdd * (VONE_HALF * (vpdf_NW - vpdf_SE) - vui * vw_2_x3); | |
685 | voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_NW,vpdf_SE)),VMUL(vui,vw_2_x3))); | |
686 | //src[ADJ_LIST(D3Q19_NW)] =[UA] vpdf_NW - vevenPart - voddPart; | |
687 | VSTU(ppdf_SE, VSUB(VSUB(vpdf_NW,vevenPart),voddPart)); | |
688 | //src[ADJ_LIST(D3Q19_SE)] =[UA] vpdf_SE - vevenPart + voddPart; | |
689 | VSTU(ppdf_NW, VADD(VSUB(vpdf_SE,vevenPart),voddPart)); | |
690 | ||
691 | //vui = vux + vuy; | |
692 | vui = VADD(vux,vuy); | |
693 | //vevenPart = vomegaEven * (VONE_HALF * (vpdf_NE + vpdf_SW) - vui * vui * vw_2_nine_half - vw_2_indep); | |
694 | vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_NE,vpdf_SW)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); | |
695 | //voddPart = vomegaOdd * (VONE_HALF * (vpdf_NE - vpdf_SW) - vui * vw_2_x3); | |
696 | voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_NE,vpdf_SW)),VMUL(vui,vw_2_x3))); | |
697 | //src[ADJ_LIST(D3Q19_NE)] =[UA] vpdf_NE - vevenPart - voddPart; | |
698 | VSTU(ppdf_SW, VSUB(VSUB(vpdf_NE,vevenPart),voddPart)); | |
699 | //src[ADJ_LIST(D3Q19_SW)] =[UA] vpdf_SW - vevenPart + voddPart; | |
700 | VSTU(ppdf_NE, VADD(VSUB(vpdf_SW,vevenPart),voddPart)); | |
701 | ||
702 | //vui = vuz - vux; | |
703 | vui = VSUB(vuz,vux); | |
704 | //vevenPart = vomegaEven * (VONE_HALF * (vpdf_TW + vpdf_BE) - vui * vui * vw_2_nine_half - vw_2_indep); | |
705 | vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TW,vpdf_BE)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); | |
706 | //voddPart = vomegaOdd * (VONE_HALF * (vpdf_TW - vpdf_BE) - vui * vw_2_x3); | |
707 | voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TW,vpdf_BE)),VMUL(vui,vw_2_x3))); | |
708 | //src[ADJ_LIST(D3Q19_TW)] =[UA] vpdf_TW - vevenPart - voddPart; | |
709 | VSTU(ppdf_BE, VSUB(VSUB(vpdf_TW,vevenPart),voddPart)); | |
710 | //src[ADJ_LIST(D3Q19_BE)] =[UA] vpdf_BE - vevenPart + voddPart; | |
711 | VSTU(ppdf_TW, VADD(VSUB(vpdf_BE,vevenPart),voddPart)); | |
712 | ||
713 | //vui = vux + vuz; | |
714 | vui = VADD(vux,vuz); | |
715 | //vevenPart = vomegaEven * (VONE_HALF * (vpdf_TE + vpdf_BW) - vui * vui * vw_2_nine_half - vw_2_indep); | |
716 | vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TE,vpdf_BW)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); | |
717 | //voddPart = vomegaOdd * (VONE_HALF * (vpdf_TE - vpdf_BW) - vui * vw_2_x3); | |
718 | voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TE,vpdf_BW)),VMUL(vui,vw_2_x3))); | |
719 | //src[ADJ_LIST(D3Q19_TE)] =[UA] vpdf_TE - vevenPart - voddPart; | |
720 | VSTU(ppdf_BW, VSUB(VSUB(vpdf_TE,vevenPart),voddPart)); | |
721 | //src[ADJ_LIST(D3Q19_BW)] =[UA] vpdf_BW - vevenPart + voddPart; | |
722 | VSTU(ppdf_TE, VADD(VSUB(vpdf_BW,vevenPart),voddPart)); | |
723 | ||
724 | //vui = vuz - vuy; | |
725 | vui = VSUB(vuz,vuy); | |
726 | //vevenPart = vomegaEven * (VONE_HALF * (vpdf_TS + vpdf_BN) - vui * vui * vw_2_nine_half - vw_2_indep); | |
727 | vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TS,vpdf_BN)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); | |
728 | //voddPart = vomegaOdd * (VONE_HALF * (vpdf_TS - vpdf_BN) - vui * vw_2_x3); | |
729 | voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TS,vpdf_BN)),VMUL(vui,vw_2_x3))); | |
730 | //src[ADJ_LIST(D3Q19_TS)] =[UA] vpdf_TS - vevenPart - voddPart; | |
e3f82424 | 731 | VSTU(ppdf_BN, VSUB(VSUB(vpdf_TS,vevenPart),voddPart)); |
10988083 | 732 | //src[ADJ_LIST(D3Q19_BN)] =[UA] vpdf_BN - vevenPart + voddPart; |
e3f82424 | 733 | VSTU(ppdf_TS, VADD(VSUB(vpdf_BN,vevenPart),voddPart)); |
10988083 MW |
734 | |
735 | //vui = vuy + vuz; | |
736 | vui = VADD(vuy,vuz); | |
737 | //vevenPart = vomegaEven * (VONE_HALF * (vpdf_TN + vpdf_BS) - vui * vui * vw_2_nine_half - vw_2_indep); | |
738 | vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TN,vpdf_BS)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); | |
739 | //voddPart = vomegaOdd * (VONE_HALF * (vpdf_TN - vpdf_BS) - vui * vw_2_x3); | |
740 | voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TN,vpdf_BS)),VMUL(vui,vw_2_x3))); | |
741 | //src[ADJ_LIST(D3Q19_TN)] =[UA] vpdf_TN - vevenPart - voddPart; | |
742 | VSTU(ppdf_BS, VSUB(VSUB(vpdf_TN,vevenPart),voddPart)); | |
743 | //src[ADJ_LIST(D3Q19_BS)] =[UA] vpdf_BS - vevenPart + voddPart; | |
744 | VSTU(ppdf_TN, VADD(VSUB(vpdf_BS,vevenPart),voddPart)); | |
745 | ||
746 | consecValue -= (VSIZE - 1); | |
747 | index += (VSIZE - 1); | |
748 | pointerOffset = VSIZE; | |
749 | ||
750 | } | |
751 | else { | |
752 | // Scalar part. | |
753 | ||
754 | #define X(name, idx, idxinv, _x, _y, _z) JOIN(pdf_,name) = *(JOIN(ppdf_,name)); | |
755 | D3Q19_LIST_WO_C | |
756 | #undef X | |
757 | ||
758 | pdf_C = *ppdf_C; | |
759 | ||
760 | ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE - | |
761 | pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW; | |
762 | uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN - | |
763 | pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS; | |
764 | uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS - | |
765 | pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS; | |
766 | ||
767 | dens = pdf_C + | |
768 | pdf_N + pdf_E + pdf_S + pdf_W + | |
769 | pdf_NE + pdf_SE + pdf_SW + pdf_NW + | |
770 | pdf_T + pdf_TN + pdf_TE + pdf_TS + pdf_TW + | |
771 | pdf_B + pdf_BN + pdf_BE + pdf_BS + pdf_BW; | |
772 | ||
0fde6e45 | 773 | dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz) * F(3.0) / F(2.0); |
10988083 MW |
774 | |
775 | adjListIndex = index * N_D3Q19_IDX; | |
776 | ||
777 | // direction: w_0 | |
778 | src[I(index, D3Q19_C) ] = pdf_C - omegaEven * (pdf_C - w_0 * dir_indep_trm); | |
779 | ||
780 | // direction: w_1 | |
781 | w_1_indep = w_1 * dir_indep_trm; | |
782 | ||
783 | ui = uy; | |
0fde6e45 MW |
784 | evenPart = omegaEven * (F(0.5) * (pdf_N + pdf_S) - ui * ui * w_1_nine_half - w_1_indep); |
785 | oddPart = omegaOdd * (F(0.5) * (pdf_N - pdf_S) - ui * w_1_x3); | |
e3f82424 MW |
786 | *ppdf_S = pdf_N - evenPart - oddPart; |
787 | *ppdf_N = pdf_S - evenPart + oddPart; | |
10988083 MW |
788 | |
789 | ui = ux; | |
0fde6e45 MW |
790 | evenPart = omegaEven * (F(0.5) * (pdf_E + pdf_W) - ui * ui * w_1_nine_half - w_1_indep); |
791 | oddPart = omegaOdd * (F(0.5) * (pdf_E - pdf_W) - ui * w_1_x3); | |
e3f82424 MW |
792 | *ppdf_W = pdf_E - evenPart - oddPart; |
793 | *ppdf_E = pdf_W - evenPart + oddPart; | |
10988083 MW |
794 | |
795 | ui = uz; | |
0fde6e45 MW |
796 | evenPart = omegaEven * (F(0.5) * (pdf_T + pdf_B) - ui * ui * w_1_nine_half - w_1_indep); |
797 | oddPart = omegaOdd * (F(0.5) * (pdf_T - pdf_B) - ui * w_1_x3); | |
e3f82424 MW |
798 | *ppdf_B = pdf_T - evenPart - oddPart; |
799 | *ppdf_T = pdf_B - evenPart + oddPart; | |
10988083 MW |
800 | |
801 | // direction: w_2 | |
802 | w_2_indep = w_2 * dir_indep_trm; | |
803 | ||
804 | ui = -ux + uy; | |
0fde6e45 MW |
805 | evenPart = omegaEven * (F(0.5) * (pdf_NW + pdf_SE) - ui * ui * w_2_nine_half - w_2_indep); |
806 | oddPart = omegaOdd * (F(0.5) * (pdf_NW - pdf_SE) - ui * w_2_x3); | |
e3f82424 MW |
807 | *ppdf_SE = pdf_NW - evenPart - oddPart; |
808 | *ppdf_NW = pdf_SE - evenPart + oddPart; | |
10988083 MW |
809 | |
810 | ui = ux + uy; | |
0fde6e45 MW |
811 | evenPart = omegaEven * (F(0.5) * (pdf_NE + pdf_SW) - ui * ui * w_2_nine_half - w_2_indep); |
812 | oddPart = omegaOdd * (F(0.5) * (pdf_NE - pdf_SW) - ui * w_2_x3); | |
e3f82424 MW |
813 | *ppdf_SW = pdf_NE - evenPart - oddPart; |
814 | *ppdf_NE = pdf_SW - evenPart + oddPart; | |
10988083 MW |
815 | |
816 | ui = -ux + uz; | |
0fde6e45 MW |
817 | evenPart = omegaEven * (F(0.5) * (pdf_TW + pdf_BE) - ui * ui * w_2_nine_half - w_2_indep); |
818 | oddPart = omegaOdd * (F(0.5) * (pdf_TW - pdf_BE) - ui * w_2_x3); | |
e3f82424 MW |
819 | *ppdf_BE = pdf_TW - evenPart - oddPart; |
820 | *ppdf_TW = pdf_BE - evenPart + oddPart; | |
10988083 MW |
821 | |
822 | ui = ux + uz; | |
0fde6e45 MW |
823 | evenPart = omegaEven * (F(0.5) * (pdf_TE + pdf_BW) - ui * ui * w_2_nine_half - w_2_indep); |
824 | oddPart = omegaOdd * (F(0.5) * (pdf_TE - pdf_BW) - ui * w_2_x3); | |
e3f82424 MW |
825 | *ppdf_BW = pdf_TE - evenPart - oddPart; |
826 | *ppdf_TE = pdf_BW - evenPart + oddPart; | |
10988083 MW |
827 | |
828 | ui = -uy + uz; | |
0fde6e45 MW |
829 | evenPart = omegaEven * (F(0.5) * (pdf_TS + pdf_BN) - ui * ui * w_2_nine_half - w_2_indep); |
830 | oddPart = omegaOdd * (F(0.5) * (pdf_TS - pdf_BN) - ui * w_2_x3); | |
e3f82424 MW |
831 | *ppdf_BN = pdf_TS - evenPart - oddPart; |
832 | *ppdf_TS = pdf_BN - evenPart + oddPart; | |
10988083 MW |
833 | |
834 | ui = uy + uz; | |
0fde6e45 MW |
835 | evenPart = omegaEven * (F(0.5) * (pdf_TN + pdf_BS) - ui * ui * w_2_nine_half - w_2_indep); |
836 | oddPart = omegaOdd * (F(0.5) * (pdf_TN - pdf_BS) - ui * w_2_x3); | |
e3f82424 MW |
837 | *ppdf_BS = pdf_TN - evenPart - oddPart; |
838 | *ppdf_TN = pdf_BS - evenPart + oddPart; | |
10988083 MW |
839 | |
840 | pointerOffset = 1; | |
841 | } | |
842 | ||
843 | } // loop over fluid nodes | |
844 | ||
845 | #undef ADJ_LIST | |
846 | #undef I | |
847 | } |