f79e6cd5495495808ddb63723677e266acc68937
[LbmBenchmarkKernelsPublic.git] / src / BenchKernelD3Q19AaVec.c
1 // --------------------------------------------------------------------------
2 //
3 // Copyright
4 //   Markus Wittmann, 2016-2017
5 //   RRZE, University of Erlangen-Nuremberg, Germany
6 //   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
7 //
8 //   Viktor Haag, 2016
9 //   LSS, University of Erlangen-Nuremberg, Germany
10 //
11 //  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
12 //
13 //  LbmBenchKernels is free software: you can redistribute it and/or modify
14 //  it under the terms of the GNU General Public License as published by
15 //  the Free Software Foundation, either version 3 of the License, or
16 //  (at your option) any later version.
17 //
18 //  LbmBenchKernels is distributed in the hope that it will be useful,
19 //  but WITHOUT ANY WARRANTY; without even the implied warranty of
20 //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21 //  GNU General Public License for more details.
22 //
23 //  You should have received a copy of the GNU General Public License
24 //  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
25 //
26 // --------------------------------------------------------------------------
27 #include "BenchKernelD3Q19AaVecCommon.h"
28
29 #include "Memory.h"
30 #include "Vtk.h"
31 #include "LikwidIf.h"
32 #include "Vector.h"
33 #include "Vector.h"
34
35 #include <inttypes.h>
36 #include <math.h>
37
38 #ifdef _OPENMP
39         #include <omp.h>
40 #endif
41
42 static void KernelEven(LatticeDesc * ld, KernelData * kd, CaseData * cd);
43 static void KernelOdd( LatticeDesc * ld, KernelData * kd, CaseData * cd);
44
45 #if 0 // {{{
46 void DumpPdfs(LatticeDesc * ld, KernelData * kd, int zStart, int zStop, int iter, const char * prefix)
47 {
48         return;
49
50         int * lDims = ld->Dims;
51         int * gDims = kd->GlobalDims;
52
53         int nX = gDims[0];
54         int nY = gDims[1];
55         int nZ = gDims[2];
56
57         PdfT pdfs[N_D3Q19];
58
59         int localZStart = zStart;
60         int localZStop  = zStop;
61
62         if (localZStart == -1) localZStart = 0;
63         if (localZStop  == -1) localZStop  = gDims[2] - 1;
64
65         printf("D iter: %d\n", iter);
66
67         for (int dir = 0; dir < 19; ++dir) {
68                 for (int z = localZStop; z >= localZStart; --z) {
69                         printf("D [%2d][%2d][%s] plane % 2d\n", iter, dir, prefix, z);
70
71                         for(int y = 0; y < nY; ++y) {
72                                 printf("D [%2d][%2d][%s] %2d  ", iter, dir, prefix, y);
73
74                                 for(int x = 0; x < nX; ++x) {
75
76                                         if (1) { // ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) {
77
78                                                 #define I(x, y, z, dir) P_INDEX_5(gDims, (x), (y), (z), (dir))
79                                                 pdfs[dir] = kd->PdfsActive[I(x, y, z, dir)];
80                                                 #undef I
81 //                                              kd->GetNode(kd, x, y, z, pdfs);
82                                         }
83                                         else {
84                                                 pdfs[dir] = -F(1.0);
85                                         }
86
87                                         printf("%.16e ", pdfs[dir]);
88                                 }
89
90                                 printf("\n");
91                         }
92                 }
93         }
94 }
95 #endif  // }}}
96
97 void FNAME(D3Q19AaVecKernel)(LatticeDesc * ld, KernelData * kd, CaseData * cd)
98 {
99         Assert(ld != NULL);
100         Assert(kd != NULL);
101         Assert(cd != NULL);
102
103         Assert(cd->Omega > F(0.0));
104         Assert(cd->Omega < F(2.0));
105
106         KernelDataAa * kda = KDA(kd);
107
108         PdfT * src = kd->PdfsActive;
109
110         int maxIterations = cd->MaxIterations;
111
112         #ifdef VTK_OUTPUT
113                 if (cd->VtkOutput) {
114                         kd->PdfsActive = src;
115                         VtkWrite(ld, kd, cd, -1);
116                 }
117         #endif
118
119         #ifdef STATISTICS
120                 kd->PdfsActive = src;
121                 KernelStatistics(kd, ld, cd, 0);
122         #endif
123
124         Assert((maxIterations % 2) == 0);
125
126         for (int iter = 0; iter < maxIterations; iter += 2) {
127
128                 // --------------------------------------------------------------------
129                 // even time step
130                 // --------------------------------------------------------------------
131
132                 X_LIKWID_START("aa-vec-even");
133
134                 #pragma omp parallel
135                 {
136                         KernelEven(ld, kd, cd);
137                 }
138
139                 X_LIKWID_STOP("aa-vec-even");
140
141                 // Fixup bounce back PDFs.
142                 #ifdef _OPENMP
143                         #pragma omp parallel for default(none) \
144                                         shared(kd, src)
145                 #endif
146                 for (int i = 0; i < kd->nBounceBackPdfs; ++i) {
147                         src[kd->BounceBackPdfsSrc[i]] = src[kd->BounceBackPdfsDst[i]];
148                 }
149
150                 // save current iteration
151                 kda->Iteration = iter;
152
153                 #ifdef VERIFICATION
154                         kd->PdfsActive = src;
155                         KernelAddBodyForce(kd, ld, cd);
156                 #endif
157
158                 #ifdef VTK_OUTPUT
159                         if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) {
160                                 kd->PdfsActive = src;
161                                 VtkWrite(ld, kd, cd, iter);
162                         }
163                 #endif
164
165                 #ifdef STATISTICS
166                         kd->PdfsActive = src;
167                         KernelStatistics(kd, ld, cd, iter);
168                 #endif
169
170                 // --------------------------------------------------------------------
171                 // odd time step
172                 // --------------------------------------------------------------------
173
174                 X_LIKWID_START("aa-vec-odd");
175
176                 #pragma omp parallel
177                 {
178                         KernelOdd(ld, kd, cd);
179                 }
180
181                 // Stop counters before bounce back. Else computing loop balance will
182                 // be incorrect.
183
184                 X_LIKWID_STOP("aa-vec-odd");
185
186                 // Fixup bounce back PDFs.
187                 #ifdef _OPENMP
188                         #pragma omp parallel for default(none) \
189                                         shared(kd, src)
190                 #endif
191                 for (int i = 0; i < kd->nBounceBackPdfs; ++i) {
192                         src[kd->BounceBackPdfsDst[i]] = src[kd->BounceBackPdfsSrc[i]];
193                 }
194
195                 // save current iteration
196                 kda->Iteration = iter + 1;
197
198                 #ifdef VERIFICATION
199                         kd->PdfsActive = src;
200                         KernelAddBodyForce(kd, ld, cd);
201                 #endif
202
203                 #ifdef VTK_OUTPUT
204                         if (cd->VtkOutput && ((iter + 1) % cd->VtkModulus) == 0) {
205                                 kd->PdfsActive = src;
206                                 VtkWrite(ld, kd, cd, iter + 1);
207                         }
208                 #endif
209
210                 #ifdef STATISTICS
211                         kd->PdfsActive = src;
212                         KernelStatistics(kd, ld, cd, iter + 1);
213                 #endif // }}}
214
215
216         } // for (int iter = 0; ...
217
218         #ifdef VTK_OUTPUT
219
220         if (cd->VtkOutput) {
221                 kd->PdfsActive = src;
222                 VtkWrite(ld, kd, cd, maxIterations);
223         }
224
225         #endif
226
227         return;
228 }
229
230 static void KernelEven(LatticeDesc * ld, KernelData * kd, CaseData * cd) // {{{
231 {
232         Assert(ld != NULL);
233         Assert(kd != NULL);
234         Assert(cd != NULL);
235
236         Assert(cd->Omega > F(0.0));
237         Assert(cd->Omega < F(2.0));
238
239         KernelDataAa * kda  = KDA(kd);
240
241         int nX = ld->Dims[0];
242         int nY = ld->Dims[1];
243         int nZ = ld->Dims[2];
244
245         int * gDims = kd->GlobalDims;
246
247         int oX = kd->Offsets[0];
248         int oY = kd->Offsets[1];
249         int oZ = kd->Offsets[2];
250
251         int blk[3];
252         blk[0] = kda->Blk[0];
253         blk[1] = kda->Blk[1];
254         blk[2] = kda->Blk[2];
255
256         PdfT omega = cd->Omega;
257         PdfT omegaEven = omega;
258
259         PdfT magicParam = F(1.0) / F(12.0);
260         PdfT omegaOdd = F(1.0) / (F(0.5) + magicParam / (F(1.0) / omega - F(0.5)));
261
262         const PdfT w_0 = F(1.0) /  F(3.0);
263         const PdfT w_1 = F(1.0) / F(18.0);
264         const PdfT w_2 = F(1.0) / F(36.0);
265
266         const PdfT w_1_x3 = w_1 * F(3.0);       const PdfT w_1_nine_half = w_1 * F(9.0) / F(2.0);
267         const PdfT w_2_x3 = w_2 * F(3.0);       const PdfT w_2_nine_half = w_2 * F(9.0) / F(2.0);
268
269
270         VPDFT VONE_HALF   = VSET(F(0.5));
271         VPDFT VTHREE_HALF = VSET(F(3.0) / F(2.0));
272
273         VPDFT vw_1_indep, vw_2_indep;
274         VPDFT vw_0 = VSET(w_0);
275         VPDFT vw_1 = VSET(w_1);
276         VPDFT vw_2 = VSET(w_2);
277
278         VPDFT vw_1_x3 = VSET(w_1_x3);
279         VPDFT vw_2_x3 = VSET(w_2_x3);
280         VPDFT vw_1_nine_half = VSET(w_1_nine_half);
281         VPDFT vw_2_nine_half = VSET(w_2_nine_half);
282
283         VPDFT vui, vux, vuy, vuz, vdens;
284
285         VPDFT vevenPart, voddPart, vdir_indep_trm;
286
287         VPDFT vomegaEven = VSET(omegaEven);
288         VPDFT vomegaOdd  = VSET(omegaOdd);
289
290         // Declare pdf_N, pdf_E, pdf_S, pdf_W, ...
291         #define X(name, idx, idxinv, x, y, z)   VPDFT JOIN(vpdf_,name);
292                 D3Q19_LIST
293         #undef X
294
295         PdfT * src = kd->Pdfs[0];
296
297         int nThreads = 1;
298         int threadId = 0;
299
300         #ifdef _OPENMP
301                 nThreads = omp_get_max_threads();
302                 threadId = omp_get_thread_num();
303         #endif
304
305         // TODO: Currently only a 1-D decomposition is applied. For achritectures
306         //       with a lot of cores we want at least 2-D.
307
308         int threadStartX = nX / nThreads * threadId;
309         int threadEndX   = nX / nThreads * (threadId + 1);
310
311         if (nX % nThreads > 0) {
312                 if (nX % nThreads > threadId) {
313                         threadStartX += threadId;
314                         threadEndX   += threadId + 1;
315                 }
316                 else {
317                         threadStartX += nX % nThreads;
318                         threadEndX   += nX % nThreads;
319                 }
320         }
321
322         AssertMsg((blk[2] % VSIZE == 0) || blk[2] >= nZ, "Blocking in z direction must be a multiple of VSIZE = %d or larger than z dimension.", VSIZE);
323
324         for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) {
325         for (int bY = oY; bY < nY + oY; bY += blk[1]) {
326         for (int bZ = oZ; bZ < nZ + oZ; bZ += blk[2]) {
327
328                 int eX = MIN(bX + blk[0], threadEndX + oX);
329                 int eY = MIN(bY + blk[1], nY + oY);
330                 int eZ = MIN(bZ + blk[2], nZ + oZ);
331
332                 for (int x = bX; x < eX; x += 1) {
333                 for (int y = bY; y < eY; y += 1) {
334                 for (int z = bZ; z < eZ; z += VSIZE) {
335
336                         #define I(x, y, z, dir) P_INDEX_5(gDims, (x), (y), (z), (dir))
337
338                         // Load PDFs of local cell: pdf_N = src[I(x, y, z, D3Q19_N)]; ...
339                         #define X(name, idx, idxinv, _x, _y, _z)        JOIN(vpdf_,name) = VLDU(&src[I(x, y, z, idx)]);
340                                 D3Q19_LIST
341                         #undef X
342
343
344                         vux = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_E,VADD(vpdf_NE,vpdf_SE)),VADD(vpdf_TE,vpdf_BE)),vpdf_W),vpdf_NW),vpdf_SW),vpdf_TW),vpdf_BW);
345                         vuy = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_N,VADD(vpdf_NE,vpdf_NW)),VADD(vpdf_TN,vpdf_BN)),vpdf_S),vpdf_SE),vpdf_SW),vpdf_TS),vpdf_BS);
346                         vuz = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_T,VADD(vpdf_TE,vpdf_TW)),VADD(vpdf_TN,vpdf_TS)),vpdf_B),vpdf_BE),vpdf_BW),vpdf_BN),vpdf_BS);
347
348                         vdens = VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(vpdf_C,VADD(vpdf_N,vpdf_E)),VADD(vpdf_S,vpdf_W)),VADD(vpdf_NE,vpdf_SE)),
349                                         VADD(vpdf_SW,vpdf_NW)),VADD(vpdf_T,vpdf_TN)),VADD(vpdf_TE,vpdf_TS)),VADD(vpdf_TW,vpdf_B)),
350                                         VADD(vpdf_BN,vpdf_BE)),VADD(vpdf_BS,vpdf_BW));
351
352                         vdir_indep_trm = VSUB(vdens,VMUL(VADD(VADD(VMUL(vux,vux),VMUL(vuy,vuy)),VMUL(vuz,vuz)),VTHREE_HALF));
353
354                         VSTU(&src[I(x, y, z, D3Q19_C)],VSUB(vpdf_C,VMUL(vomegaEven,VSUB(vpdf_C,VMUL(vw_0,vdir_indep_trm)))));
355
356                         vw_1_indep = VMUL(vw_1,vdir_indep_trm);
357
358                         vui = vuy;
359                         vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_N,vpdf_S)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep));
360                         voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_N,vpdf_S)),VMUL(vui,vw_1_x3)));
361                         VSTU(&src[I(x, y, z, D3Q19_S)],VSUB(VSUB(vpdf_N,vevenPart),voddPart));
362                         VSTU(&src[I(x, y, z, D3Q19_N)],VADD(VSUB(vpdf_S,vevenPart),voddPart));
363
364                         vui = vux;
365                         vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_E,vpdf_W)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep));
366                         voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_E,vpdf_W)),VMUL(vui,vw_1_x3)));
367                         VSTU(&src[I(x, y, z, D3Q19_W)],VSUB(VSUB(vpdf_E,vevenPart),voddPart));
368                         VSTU(&src[I(x, y, z, D3Q19_E)],VADD(VSUB(vpdf_W,vevenPart),voddPart));
369
370                         vui = vuz;
371                         vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_T,vpdf_B)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep));
372                         voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_T,vpdf_B)),VMUL(vui,vw_1_x3)));
373                         VSTU(&src[I(x, y, z, D3Q19_B)],VSUB(VSUB(vpdf_T,vevenPart),voddPart));
374                         VSTU(&src[I(x, y, z, D3Q19_T)],VADD(VSUB(vpdf_B,vevenPart),voddPart));
375
376                         vw_2_indep = VMUL(vw_2,vdir_indep_trm);
377
378                         vui = VSUB(vuy,vux);
379                         vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_NW,vpdf_SE)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
380                         voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_NW,vpdf_SE)),VMUL(vui,vw_2_x3)));
381                         VSTU(&src[I(x, y, z, D3Q19_SE)],VSUB(VSUB(vpdf_NW,vevenPart),voddPart));
382                         VSTU(&src[I(x, y, z, D3Q19_NW)],VADD(VSUB(vpdf_SE,vevenPart),voddPart));
383
384                         vui = VADD(vux,vuy);
385                         vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_NE,vpdf_SW)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
386                         voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_NE,vpdf_SW)),VMUL(vui,vw_2_x3)));
387                         VSTU(&src[I(x, y, z, D3Q19_SW)],VSUB(VSUB(vpdf_NE,vevenPart),voddPart));
388                         VSTU(&src[I(x, y, z, D3Q19_NE)],VADD(VSUB(vpdf_SW,vevenPart),voddPart));
389
390                         vui = VSUB(vuz,vux);
391                         vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TW,vpdf_BE)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
392                         voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TW,vpdf_BE)),VMUL(vui,vw_2_x3)));
393                         VSTU(&src[I(x, y, z, D3Q19_BE)],VSUB(VSUB(vpdf_TW,vevenPart),voddPart));
394                         VSTU(&src[I(x, y, z, D3Q19_TW)],VADD(VSUB(vpdf_BE,vevenPart),voddPart));
395
396                         vui = VADD(vux,vuz);
397                         vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TE,vpdf_BW)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
398                         voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TE,vpdf_BW)),VMUL(vui,vw_2_x3)));
399                         VSTU(&src[I(x, y, z, D3Q19_BW)],VSUB(VSUB(vpdf_TE,vevenPart),voddPart));
400                         VSTU(&src[I(x, y, z, D3Q19_TE)],VADD(VSUB(vpdf_BW,vevenPart),voddPart));
401
402                         vui = VSUB(vuz,vuy);
403                         vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TS,vpdf_BN)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
404                         voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TS,vpdf_BN)),VMUL(vui,vw_2_x3)));
405                         VSTU(&src[I(x, y, z, D3Q19_BN)],VSUB(VSUB(vpdf_TS,vevenPart),voddPart));
406                         VSTU(&src[I(x, y, z, D3Q19_TS)],VADD(VSUB(vpdf_BN,vevenPart),voddPart));
407
408                         vui = VADD(vuy,vuz);
409                         vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TN,vpdf_BS)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
410                         voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TN,vpdf_BS)),VMUL(vui,vw_2_x3)));
411                         VSTU(&src[I(x, y, z, D3Q19_BS)],VSUB(VSUB(vpdf_TN,vevenPart),voddPart));
412                         VSTU(&src[I(x, y, z, D3Q19_TN)],VADD(VSUB(vpdf_BS,vevenPart),voddPart));
413
414                         #undef I
415                 } } } // x, y, z
416         } } } // blocked x, y, z
417
418
419
420         return;
421 } // }}}
422
423
424 static void KernelOdd(LatticeDesc * ld, KernelData * kd, CaseData * cd)  // {{{
425 {
426         Assert(ld != NULL);
427         Assert(kd != NULL);
428         Assert(cd != NULL);
429
430         Assert(cd->Omega > F(0.0));
431         Assert(cd->Omega < F(2.0));
432
433         KernelDataAa * kda  = KDA(kd);
434
435         int nX = ld->Dims[0];
436         int nY = ld->Dims[1];
437         int nZ = ld->Dims[2];
438
439         int * gDims = kd->GlobalDims;
440
441         int oX = kd->Offsets[0];
442         int oY = kd->Offsets[1];
443         int oZ = kd->Offsets[2];
444
445         int blk[3];
446         blk[0] = kda->Blk[0];
447         blk[1] = kda->Blk[1];
448         blk[2] = kda->Blk[2];
449
450         PdfT omega = cd->Omega;
451         PdfT omegaEven = omega;
452
453         PdfT magicParam = F(1.0) / F(12.0);
454         PdfT omegaOdd = F(1.0) / (F(0.5) + magicParam / (F(1.0) / omega - F(0.5)));
455
456         const PdfT w_0 = F(1.0) /  F(3.0);
457         const PdfT w_1 = F(1.0) / F(18.0);
458         const PdfT w_2 = F(1.0) / F(36.0);
459
460         const PdfT w_1_x3 = w_1 * F(3.0);       const PdfT w_1_nine_half = w_1 * F(9.0) / F(2.0);
461         const PdfT w_2_x3 = w_2 * F(3.0);       const PdfT w_2_nine_half = w_2 * F(9.0) / F(2.0);
462
463         VPDFT VONE_HALF   = VSET(F(0.5));
464         VPDFT VTHREE_HALF = VSET(F(3.0) / F(2.0));
465
466         VPDFT vw_1_indep, vw_2_indep;
467         VPDFT vw_0 = VSET(w_0);
468         VPDFT vw_1 = VSET(w_1);
469         VPDFT vw_2 = VSET(w_2);
470
471         VPDFT vw_1_x3 = VSET(w_1_x3);
472         VPDFT vw_2_x3 = VSET(w_2_x3);
473         VPDFT vw_1_nine_half = VSET(w_1_nine_half);
474         VPDFT vw_2_nine_half = VSET(w_2_nine_half);
475
476         VPDFT vui, vux, vuy, vuz, vdens;
477
478         VPDFT vevenPart, voddPart, vdir_indep_trm;
479
480         VPDFT vomegaEven = VSET(omegaEven);
481         VPDFT vomegaOdd  = VSET(omegaOdd);
482
483         // Declare pdf_N, pdf_E, pdf_S, pdf_W, ...
484         #define X(name, idx, idxinv, x, y, z)           VPDFT JOIN(vpdf_,name);
485                 D3Q19_LIST
486         #undef X
487
488         PdfT * src = kd->Pdfs[0];
489
490         int threadId = 0;
491         int nThreads = 1;
492
493         #ifdef _OPENMP
494                 threadId = omp_get_thread_num();
495                 nThreads = omp_get_max_threads();
496         #endif
497
498         // TODO: Currently only a 1-D decomposition is applied. For achritectures
499         //       with a lot of cores we want at least 2-D.
500         int threadStartX = nX / nThreads * threadId;
501         int threadEndX   = nX / nThreads * (threadId + 1);
502
503         if (nX % nThreads > 0) {
504                 if (nX % nThreads > threadId) {
505                         threadStartX += threadId;
506                         threadEndX   += threadId + 1;
507                 }
508                 else {
509                         threadStartX += nX % nThreads;
510                         threadEndX   += nX % nThreads;
511                 }
512         }
513
514         AssertMsg((blk[2] % VSIZE == 0) || blk[2] >= nZ, "Blocking in z direction must be a multiple of VSIZE = %d or larger than z dimension.", VSIZE);
515
516         for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) {
517         for (int bY = oY; bY < nY + oY; bY += blk[1]) {
518         for (int bZ = oZ; bZ < nZ + oZ; bZ += blk[2]) {
519
520                 int eX = MIN(bX + blk[0], threadEndX + oX);
521                 int eY = MIN(bY + blk[1], nY + oY);
522                 int eZ = MIN(bZ + blk[2], nZ + oZ);
523
524                 for (int x = bX; x < eX; ++x) {
525                 for (int y = bY; y < eY; ++y) {
526                 for (int z = bZ; z < eZ; z += VSIZE) {
527
528                         #define I(x, y, z, dir) P_INDEX_5(gDims, (x), (y), (z), (dir))
529
530
531                         #define X(name, idx, idxinv, _x, _y, _z)        JOIN(vpdf_,name) = VLDU(&src[I(x - _x, y - _y, z - _z, idxinv)]);
532                         D3Q19_LIST
533                         #undef X
534
535                         vux = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_E,VADD(vpdf_NE,vpdf_SE)),VADD(vpdf_TE,vpdf_BE)),vpdf_W),vpdf_NW),vpdf_SW),vpdf_TW),vpdf_BW);
536                         vuy = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_N,VADD(vpdf_NE,vpdf_NW)),VADD(vpdf_TN,vpdf_BN)),vpdf_S),vpdf_SE),vpdf_SW),vpdf_TS),vpdf_BS);
537                         vuz = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_T,VADD(vpdf_TE,vpdf_TW)),VADD(vpdf_TN,vpdf_TS)),vpdf_B),vpdf_BE),vpdf_BW),vpdf_BN),vpdf_BS);
538
539                         vdens = VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(vpdf_C,VADD(vpdf_N,vpdf_E)),VADD(vpdf_S,vpdf_W)),VADD(vpdf_NE,vpdf_SE)),
540                                                  VADD(vpdf_SW,vpdf_NW)),VADD(vpdf_T,vpdf_TN)),VADD(vpdf_TE,vpdf_TS)),VADD(vpdf_TW,vpdf_B)),VADD(vpdf_BN,vpdf_BE)),VADD(vpdf_BS,vpdf_BW));
541
542                         vdir_indep_trm = VSUB(vdens,VMUL(VADD(VADD(VMUL(vux,vux),VMUL(vuy,vuy)),VMUL(vuz,vuz)),VTHREE_HALF));
543
544                         VSTU(&src[I(x, y, z, D3Q19_C)],VSUB(vpdf_C,VMUL(vomegaEven,VSUB(vpdf_C,VMUL(vw_0,vdir_indep_trm)))));
545
546                         vw_1_indep = VMUL(vw_1,vdir_indep_trm);
547
548                         vui = vuy;
549                         vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_N,vpdf_S)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep));
550                         voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_N,vpdf_S)),VMUL(vui,vw_1_x3)));
551                         VSTU(&src[I(x, y + 1,  z, D3Q19_N)], VSUB(VSUB(vpdf_N,vevenPart),voddPart));
552                         VSTU(&src[I(x, y - 1,  z, D3Q19_S)], VADD(VSUB(vpdf_S,vevenPart),voddPart));
553
554                         vui = vux;
555                         vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_E,vpdf_W)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep));
556                         voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_E,vpdf_W)),VMUL(vui,vw_1_x3)));
557                         VSTU(&src[I(x + 1, y, z, D3Q19_E)], VSUB(VSUB(vpdf_E,vevenPart),voddPart));
558                         VSTU(&src[I(x - 1, y, z, D3Q19_W)], VADD(VSUB(vpdf_W,vevenPart),voddPart));
559
560                         vui = vuz;
561                         vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_T,vpdf_B)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep));
562                         voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_T,vpdf_B)),VMUL(vui,vw_1_x3)));
563                         VSTU(&src[I(x, y, z + 1, D3Q19_T)], VSUB(VSUB(vpdf_T,vevenPart),voddPart));
564                         VSTU(&src[I(x, y, z - 1, D3Q19_B)], VADD(VSUB(vpdf_B,vevenPart),voddPart));
565
566                         vw_2_indep = VMUL(vw_2,vdir_indep_trm);
567
568                         vui = VSUB(vuy,vux);
569                         vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_NW,vpdf_SE)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
570                         voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_NW,vpdf_SE)),VMUL(vui,vw_2_x3)));
571                         VSTU(&src[I(x - 1, y + 1, z, D3Q19_NW)], VSUB(VSUB(vpdf_NW,vevenPart),voddPart));
572                         VSTU(&src[I(x + 1, y - 1, z, D3Q19_SE)], VADD(VSUB(vpdf_SE,vevenPart),voddPart));
573
574                         vui = VADD(vux,vuy);
575                         vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_NE,vpdf_SW)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
576                         voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_NE,vpdf_SW)),VMUL(vui,vw_2_x3)));
577                         VSTU(&src[I(x + 1, y + 1, z, D3Q19_NE)], VSUB(VSUB(vpdf_NE,vevenPart),voddPart));
578                         VSTU(&src[I(x - 1, y - 1, z, D3Q19_SW)], VADD(VSUB(vpdf_SW,vevenPart),voddPart));
579
580                         vui = VSUB(vuz,vux);
581                         vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TW,vpdf_BE)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
582                         voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TW,vpdf_BE)),VMUL(vui,vw_2_x3)));
583                         VSTU(&src[I(x - 1, y, z + 1, D3Q19_TW)], VSUB(VSUB(vpdf_TW,vevenPart),voddPart));
584                         VSTU(&src[I(x + 1, y, z - 1, D3Q19_BE)], VADD(VSUB(vpdf_BE,vevenPart),voddPart));
585
586                         vui = VADD(vux,vuz);
587                         vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TE,vpdf_BW)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
588                         voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TE,vpdf_BW)),VMUL(vui,vw_2_x3)));
589                         VSTU(&src[I(x + 1, y, z + 1, D3Q19_TE)], VSUB(VSUB(vpdf_TE,vevenPart),voddPart));
590                         VSTU(&src[I(x - 1, y, z - 1, D3Q19_BW)], VADD(VSUB(vpdf_BW,vevenPart),voddPart));
591
592                         vui = VSUB(vuz,vuy);
593                         vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TS,vpdf_BN)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
594                         voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TS,vpdf_BN)),VMUL(vui,vw_2_x3)));
595                         VSTU(&src[I(x, y - 1, z + 1, D3Q19_TS)], VSUB(VSUB(vpdf_TS,vevenPart),voddPart));
596                         VSTU(&src[I(x, y + 1, z - 1, D3Q19_BN)], VADD(VSUB(vpdf_BN,vevenPart),voddPart));
597
598                         vui = VADD(vuy,vuz);
599                         vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TN,vpdf_BS)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
600                         voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TN,vpdf_BS)),VMUL(vui,vw_2_x3)));
601                         VSTU(&src[I(x, y + 1, z + 1, D3Q19_TN)], VSUB(VSUB(vpdf_TN,vevenPart),voddPart));
602                         VSTU(&src[I(x, y - 1, z - 1, D3Q19_BS)], VADD(VSUB(vpdf_BS,vevenPart),voddPart));
603
604                         #undef I
605                 } } } // x, y, z
606         } } } // blocked x, y, z
607
608         return;
609
610 }  // }}}
This page took 0.10781 seconds and 3 git commands to generate.