merge with kernels from MH's master thesis
[LbmBenchmarkKernelsPublic.git] / src / BenchKernelD3Q19AaVec.c
1 // --------------------------------------------------------------------------
2 //
3 // Copyright
4 //   Markus Wittmann, 2016-2017
5 //   RRZE, University of Erlangen-Nuremberg, Germany
6 //   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
7 //
8 //   Viktor Haag, 2016
9 //   LSS, University of Erlangen-Nuremberg, Germany
10 //
11 //  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
12 //
13 //  LbmBenchKernels is free software: you can redistribute it and/or modify
14 //  it under the terms of the GNU General Public License as published by
15 //  the Free Software Foundation, either version 3 of the License, or
16 //  (at your option) any later version.
17 //
18 //  LbmBenchKernels is distributed in the hope that it will be useful,
19 //  but WITHOUT ANY WARRANTY; without even the implied warranty of
20 //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21 //  GNU General Public License for more details.
22 //
23 //  You should have received a copy of the GNU General Public License
24 //  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
25 //
26 // --------------------------------------------------------------------------
27 #include "BenchKernelD3Q19AaVecCommon.h"
28
29 #include "Memory.h"
30 #include "Vtk.h"
31 #include "LikwidIf.h"
32 #include "Vector.h"
33 #include "Vector.h"
34
35 #include <inttypes.h>
36 #include <math.h>
37
38 #ifdef _OPENMP
39         #include <omp.h>
40 #endif
41
42 static void KernelEven(LatticeDesc * ld, KernelData * kd, CaseData * cd);
43 static void KernelOdd( LatticeDesc * ld, KernelData * kd, CaseData * cd);
44
45 #if 0 // {{{
46 void DumpPdfs(LatticeDesc * ld, KernelData * kd, int zStart, int zStop, int iter, const char * prefix)
47 {
48         return;
49
50         int * lDims = ld->Dims;
51         int * gDims = kd->GlobalDims;
52
53         int nX = gDims[0];
54         int nY = gDims[1];
55         int nZ = gDims[2];
56
57         PdfT pdfs[N_D3Q19];
58
59         int localZStart = zStart;
60         int localZStop  = zStop;
61
62         if (localZStart == -1) localZStart = 0;
63         if (localZStop  == -1) localZStop  = gDims[2] - 1;
64
65         printf("D iter: %d\n", iter);
66
67         for (int dir = 0; dir < 19; ++dir) {
68                 for (int z = localZStop; z >= localZStart; --z) {
69                         printf("D [%2d][%2d][%s] plane % 2d\n", iter, dir, prefix, z);
70
71                         for(int y = 0; y < nY; ++y) {
72                                 printf("D [%2d][%2d][%s] %2d  ", iter, dir, prefix, y);
73
74                                 for(int x = 0; x < nX; ++x) {
75
76                                         if (1) { // ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) {
77
78                                                 #define I(x, y, z, dir) P_INDEX_5(gDims, (x), (y), (z), (dir))
79                                                 pdfs[dir] = kd->PdfsActive[I(x, y, z, dir)];
80                                                 #undef I
81 //                                              kd->GetNode(kd, x, y, z, pdfs);
82                                         }
83                                         else {
84                                                 pdfs[dir] = -F(1.0);
85                                         }
86
87                                         printf("%.16e ", pdfs[dir]);
88                                 }
89
90                                 printf("\n");
91                         }
92                 }
93         }
94 }
95 #endif  // }}}
96
97 void FNAME(D3Q19AaVecKernel)(LatticeDesc * ld, KernelData * kd, CaseData * cd)
98 {
99         Assert(ld != NULL);
100         Assert(kd != NULL);
101         Assert(cd != NULL);
102
103         Assert(cd->Omega > F(0.0));
104         Assert(cd->Omega < F(2.0));
105
106         KernelDataAa * kda = KDA(kd);
107
108         PdfT * src = kd->PdfsActive;
109
110         int maxIterations = cd->MaxIterations;
111
112         #ifdef VTK_OUTPUT
113                 if (cd->VtkOutput) {
114                         kd->PdfsActive = src;
115                         VtkWrite(ld, kd, cd, -1);
116                 }
117         #endif
118
119         #ifdef STATISTICS
120                 kd->PdfsActive = src;
121                 KernelStatistics(kd, ld, cd, 0);
122         #endif
123
124         Assert((maxIterations % 2) == 0);
125
126         X_KERNEL_START(kd);
127
128         for (int iter = 0; iter < maxIterations; iter += 2) {
129
130                 // --------------------------------------------------------------------
131                 // even time step
132                 // --------------------------------------------------------------------
133
134                 X_LIKWID_START("aa-vec-even");
135
136                 #pragma omp parallel
137                 {
138                         KernelEven(ld, kd, cd);
139                 }
140
141                 X_LIKWID_STOP("aa-vec-even");
142
143                 // Fixup bounce back PDFs.
144                 #ifdef _OPENMP
145                         #pragma omp parallel for default(none) \
146                                         shared(kd, src)
147                 #endif
148                 for (int i = 0; i < kd->nBounceBackPdfs; ++i) {
149                         src[kd->BounceBackPdfsSrc[i]] = src[kd->BounceBackPdfsDst[i]];
150                 }
151
152                 // save current iteration
153                 kda->Iteration = iter;
154
155                 #ifdef VERIFICATION
156                         kd->PdfsActive = src;
157                         KernelAddBodyForce(kd, ld, cd);
158                 #endif
159
160                 #ifdef VTK_OUTPUT
161                         if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) {
162                                 kd->PdfsActive = src;
163                                 VtkWrite(ld, kd, cd, iter);
164                         }
165                 #endif
166
167                 #ifdef STATISTICS
168                         kd->PdfsActive = src;
169                         KernelStatistics(kd, ld, cd, iter);
170                 #endif
171
172                 // --------------------------------------------------------------------
173                 // odd time step
174                 // --------------------------------------------------------------------
175
176                 X_LIKWID_START("aa-vec-odd");
177
178                 #pragma omp parallel
179                 {
180                         KernelOdd(ld, kd, cd);
181                 }
182
183                 // Stop counters before bounce back. Else computing loop balance will
184                 // be incorrect.
185
186                 X_LIKWID_STOP("aa-vec-odd");
187
188                 // Fixup bounce back PDFs.
189                 #ifdef _OPENMP
190                         #pragma omp parallel for default(none) \
191                                         shared(kd, src)
192                 #endif
193                 for (int i = 0; i < kd->nBounceBackPdfs; ++i) {
194                         src[kd->BounceBackPdfsDst[i]] = src[kd->BounceBackPdfsSrc[i]];
195                 }
196
197                 // save current iteration
198                 kda->Iteration = iter + 1;
199
200                 #ifdef VERIFICATION
201                         kd->PdfsActive = src;
202                         KernelAddBodyForce(kd, ld, cd);
203                 #endif
204
205                 #ifdef VTK_OUTPUT
206                         if (cd->VtkOutput && ((iter + 1) % cd->VtkModulus) == 0) {
207                                 kd->PdfsActive = src;
208                                 VtkWrite(ld, kd, cd, iter + 1);
209                         }
210                 #endif
211
212                 #ifdef STATISTICS
213                         kd->PdfsActive = src;
214                         KernelStatistics(kd, ld, cd, iter + 1);
215                 #endif // }}}
216
217
218         } // for (int iter = 0; ...
219
220         X_KERNEL_END(kd);
221
222         #ifdef VTK_OUTPUT
223
224         if (cd->VtkOutput) {
225                 kd->PdfsActive = src;
226                 VtkWrite(ld, kd, cd, maxIterations);
227         }
228
229         #endif
230
231         return;
232 }
233
234 static void KernelEven(LatticeDesc * ld, KernelData * kd, CaseData * cd) // {{{
235 {
236         Assert(ld != NULL);
237         Assert(kd != NULL);
238         Assert(cd != NULL);
239
240         Assert(cd->Omega > F(0.0));
241         Assert(cd->Omega < F(2.0));
242
243         KernelDataAa * kda  = KDA(kd);
244
245         int nX = ld->Dims[0];
246         int nY = ld->Dims[1];
247         int nZ = ld->Dims[2];
248
249         int * gDims = kd->GlobalDims;
250
251         int oX = kd->Offsets[0];
252         int oY = kd->Offsets[1];
253         int oZ = kd->Offsets[2];
254
255         int blk[3];
256         blk[0] = kda->Blk[0];
257         blk[1] = kda->Blk[1];
258         blk[2] = kda->Blk[2];
259
260         PdfT omega = cd->Omega;
261         PdfT omegaEven = omega;
262
263         PdfT magicParam = F(1.0) / F(12.0);
264         PdfT omegaOdd = F(1.0) / (F(0.5) + magicParam / (F(1.0) / omega - F(0.5)));
265
266         const PdfT w_0 = F(1.0) /  F(3.0);
267         const PdfT w_1 = F(1.0) / F(18.0);
268         const PdfT w_2 = F(1.0) / F(36.0);
269
270         const PdfT w_1_x3 = w_1 * F(3.0);       const PdfT w_1_nine_half = w_1 * F(9.0) / F(2.0);
271         const PdfT w_2_x3 = w_2 * F(3.0);       const PdfT w_2_nine_half = w_2 * F(9.0) / F(2.0);
272
273
274         VPDFT VONE_HALF   = VSET(F(0.5));
275         VPDFT VTHREE_HALF = VSET(F(3.0) / F(2.0));
276
277         VPDFT vw_1_indep, vw_2_indep;
278         VPDFT vw_0 = VSET(w_0);
279         VPDFT vw_1 = VSET(w_1);
280         VPDFT vw_2 = VSET(w_2);
281
282         VPDFT vw_1_x3 = VSET(w_1_x3);
283         VPDFT vw_2_x3 = VSET(w_2_x3);
284         VPDFT vw_1_nine_half = VSET(w_1_nine_half);
285         VPDFT vw_2_nine_half = VSET(w_2_nine_half);
286
287         VPDFT vui, vux, vuy, vuz, vdens;
288
289         VPDFT vevenPart, voddPart, vdir_indep_trm;
290
291         VPDFT vomegaEven = VSET(omegaEven);
292         VPDFT vomegaOdd  = VSET(omegaOdd);
293
294         // Declare pdf_N, pdf_E, pdf_S, pdf_W, ...
295         #define X(name, idx, idxinv, x, y, z)   VPDFT JOIN(vpdf_,name);
296                 D3Q19_LIST
297         #undef X
298
299         PdfT * src = kd->Pdfs[0];
300
301         int nThreads = 1;
302         int threadId = 0;
303
304         #ifdef _OPENMP
305                 nThreads = omp_get_max_threads();
306                 threadId = omp_get_thread_num();
307         #endif
308
309         // TODO: Currently only a 1-D decomposition is applied. For achritectures
310         //       with a lot of cores we want at least 2-D.
311
312         int threadStartX = nX / nThreads * threadId;
313         int threadEndX   = nX / nThreads * (threadId + 1);
314
315         if (nX % nThreads > 0) {
316                 if (nX % nThreads > threadId) {
317                         threadStartX += threadId;
318                         threadEndX   += threadId + 1;
319                 }
320                 else {
321                         threadStartX += nX % nThreads;
322                         threadEndX   += nX % nThreads;
323                 }
324         }
325
326         AssertMsg((blk[2] % VSIZE == 0) || blk[2] >= nZ, "Blocking in z direction must be a multiple of VSIZE = %d or larger than z dimension.", VSIZE);
327
328         for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) {
329         for (int bY = oY; bY < nY + oY; bY += blk[1]) {
330         for (int bZ = oZ; bZ < nZ + oZ; bZ += blk[2]) {
331
332                 int eX = MIN(bX + blk[0], threadEndX + oX);
333                 int eY = MIN(bY + blk[1], nY + oY);
334                 int eZ = MIN(bZ + blk[2], nZ + oZ);
335
336                 for (int x = bX; x < eX; x += 1) {
337                 for (int y = bY; y < eY; y += 1) {
338                 for (int z = bZ; z < eZ; z += VSIZE) { // LOOP aa-vec-even
339
340                         #define I(x, y, z, dir) P_INDEX_5(gDims, (x), (y), (z), (dir))
341
342                         // Load PDFs of local cell: pdf_N = src[I(x, y, z, D3Q19_N)]; ...
343                         #define X(name, idx, idxinv, _x, _y, _z)        JOIN(vpdf_,name) = VLDU(&src[I(x, y, z, idx)]);
344                                 D3Q19_LIST
345                         #undef X
346
347
348                         vux = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_E,VADD(vpdf_NE,vpdf_SE)),VADD(vpdf_TE,vpdf_BE)),vpdf_W),vpdf_NW),vpdf_SW),vpdf_TW),vpdf_BW);
349                         vuy = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_N,VADD(vpdf_NE,vpdf_NW)),VADD(vpdf_TN,vpdf_BN)),vpdf_S),vpdf_SE),vpdf_SW),vpdf_TS),vpdf_BS);
350                         vuz = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_T,VADD(vpdf_TE,vpdf_TW)),VADD(vpdf_TN,vpdf_TS)),vpdf_B),vpdf_BE),vpdf_BW),vpdf_BN),vpdf_BS);
351
352                         vdens = VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(vpdf_C,VADD(vpdf_N,vpdf_E)),VADD(vpdf_S,vpdf_W)),VADD(vpdf_NE,vpdf_SE)),
353                                         VADD(vpdf_SW,vpdf_NW)),VADD(vpdf_T,vpdf_TN)),VADD(vpdf_TE,vpdf_TS)),VADD(vpdf_TW,vpdf_B)),
354                                         VADD(vpdf_BN,vpdf_BE)),VADD(vpdf_BS,vpdf_BW));
355
356                         vdir_indep_trm = VSUB(vdens,VMUL(VADD(VADD(VMUL(vux,vux),VMUL(vuy,vuy)),VMUL(vuz,vuz)),VTHREE_HALF));
357
358                         VSTU(&src[I(x, y, z, D3Q19_C)],VSUB(vpdf_C,VMUL(vomegaEven,VSUB(vpdf_C,VMUL(vw_0,vdir_indep_trm)))));
359
360                         vw_1_indep = VMUL(vw_1,vdir_indep_trm);
361
362                         vui = vuy;
363                         vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_N,vpdf_S)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep));
364                         voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_N,vpdf_S)),VMUL(vui,vw_1_x3)));
365                         VSTU(&src[I(x, y, z, D3Q19_S)],VSUB(VSUB(vpdf_N,vevenPart),voddPart));
366                         VSTU(&src[I(x, y, z, D3Q19_N)],VADD(VSUB(vpdf_S,vevenPart),voddPart));
367
368                         vui = vux;
369                         vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_E,vpdf_W)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep));
370                         voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_E,vpdf_W)),VMUL(vui,vw_1_x3)));
371                         VSTU(&src[I(x, y, z, D3Q19_W)],VSUB(VSUB(vpdf_E,vevenPart),voddPart));
372                         VSTU(&src[I(x, y, z, D3Q19_E)],VADD(VSUB(vpdf_W,vevenPart),voddPart));
373
374                         vui = vuz;
375                         vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_T,vpdf_B)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep));
376                         voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_T,vpdf_B)),VMUL(vui,vw_1_x3)));
377                         VSTU(&src[I(x, y, z, D3Q19_B)],VSUB(VSUB(vpdf_T,vevenPart),voddPart));
378                         VSTU(&src[I(x, y, z, D3Q19_T)],VADD(VSUB(vpdf_B,vevenPart),voddPart));
379
380                         vw_2_indep = VMUL(vw_2,vdir_indep_trm);
381
382                         vui = VSUB(vuy,vux);
383                         vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_NW,vpdf_SE)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
384                         voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_NW,vpdf_SE)),VMUL(vui,vw_2_x3)));
385                         VSTU(&src[I(x, y, z, D3Q19_SE)],VSUB(VSUB(vpdf_NW,vevenPart),voddPart));
386                         VSTU(&src[I(x, y, z, D3Q19_NW)],VADD(VSUB(vpdf_SE,vevenPart),voddPart));
387
388                         vui = VADD(vux,vuy);
389                         vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_NE,vpdf_SW)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
390                         voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_NE,vpdf_SW)),VMUL(vui,vw_2_x3)));
391                         VSTU(&src[I(x, y, z, D3Q19_SW)],VSUB(VSUB(vpdf_NE,vevenPart),voddPart));
392                         VSTU(&src[I(x, y, z, D3Q19_NE)],VADD(VSUB(vpdf_SW,vevenPart),voddPart));
393
394                         vui = VSUB(vuz,vux);
395                         vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TW,vpdf_BE)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
396                         voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TW,vpdf_BE)),VMUL(vui,vw_2_x3)));
397                         VSTU(&src[I(x, y, z, D3Q19_BE)],VSUB(VSUB(vpdf_TW,vevenPart),voddPart));
398                         VSTU(&src[I(x, y, z, D3Q19_TW)],VADD(VSUB(vpdf_BE,vevenPart),voddPart));
399
400                         vui = VADD(vux,vuz);
401                         vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TE,vpdf_BW)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
402                         voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TE,vpdf_BW)),VMUL(vui,vw_2_x3)));
403                         VSTU(&src[I(x, y, z, D3Q19_BW)],VSUB(VSUB(vpdf_TE,vevenPart),voddPart));
404                         VSTU(&src[I(x, y, z, D3Q19_TE)],VADD(VSUB(vpdf_BW,vevenPart),voddPart));
405
406                         vui = VSUB(vuz,vuy);
407                         vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TS,vpdf_BN)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
408                         voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TS,vpdf_BN)),VMUL(vui,vw_2_x3)));
409                         VSTU(&src[I(x, y, z, D3Q19_BN)],VSUB(VSUB(vpdf_TS,vevenPart),voddPart));
410                         VSTU(&src[I(x, y, z, D3Q19_TS)],VADD(VSUB(vpdf_BN,vevenPart),voddPart));
411
412                         vui = VADD(vuy,vuz);
413                         vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TN,vpdf_BS)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
414                         voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TN,vpdf_BS)),VMUL(vui,vw_2_x3)));
415                         VSTU(&src[I(x, y, z, D3Q19_BS)],VSUB(VSUB(vpdf_TN,vevenPart),voddPart));
416                         VSTU(&src[I(x, y, z, D3Q19_TN)],VADD(VSUB(vpdf_BS,vevenPart),voddPart));
417
418                         #undef I
419                 } } } // x, y, z
420         } } } // blocked x, y, z
421
422
423
424         return;
425 } // }}}
426
427
428 static void KernelOdd(LatticeDesc * ld, KernelData * kd, CaseData * cd)  // {{{
429 {
430         Assert(ld != NULL);
431         Assert(kd != NULL);
432         Assert(cd != NULL);
433
434         Assert(cd->Omega > F(0.0));
435         Assert(cd->Omega < F(2.0));
436
437         KernelDataAa * kda  = KDA(kd);
438
439         int nX = ld->Dims[0];
440         int nY = ld->Dims[1];
441         int nZ = ld->Dims[2];
442
443         int * gDims = kd->GlobalDims;
444
445         int oX = kd->Offsets[0];
446         int oY = kd->Offsets[1];
447         int oZ = kd->Offsets[2];
448
449         int blk[3];
450         blk[0] = kda->Blk[0];
451         blk[1] = kda->Blk[1];
452         blk[2] = kda->Blk[2];
453
454         PdfT omega = cd->Omega;
455         PdfT omegaEven = omega;
456
457         PdfT magicParam = F(1.0) / F(12.0);
458         PdfT omegaOdd = F(1.0) / (F(0.5) + magicParam / (F(1.0) / omega - F(0.5)));
459
460         const PdfT w_0 = F(1.0) /  F(3.0);
461         const PdfT w_1 = F(1.0) / F(18.0);
462         const PdfT w_2 = F(1.0) / F(36.0);
463
464         const PdfT w_1_x3 = w_1 * F(3.0);       const PdfT w_1_nine_half = w_1 * F(9.0) / F(2.0);
465         const PdfT w_2_x3 = w_2 * F(3.0);       const PdfT w_2_nine_half = w_2 * F(9.0) / F(2.0);
466
467         VPDFT VONE_HALF   = VSET(F(0.5));
468         VPDFT VTHREE_HALF = VSET(F(3.0) / F(2.0));
469
470         VPDFT vw_1_indep, vw_2_indep;
471         VPDFT vw_0 = VSET(w_0);
472         VPDFT vw_1 = VSET(w_1);
473         VPDFT vw_2 = VSET(w_2);
474
475         VPDFT vw_1_x3 = VSET(w_1_x3);
476         VPDFT vw_2_x3 = VSET(w_2_x3);
477         VPDFT vw_1_nine_half = VSET(w_1_nine_half);
478         VPDFT vw_2_nine_half = VSET(w_2_nine_half);
479
480         VPDFT vui, vux, vuy, vuz, vdens;
481
482         VPDFT vevenPart, voddPart, vdir_indep_trm;
483
484         VPDFT vomegaEven = VSET(omegaEven);
485         VPDFT vomegaOdd  = VSET(omegaOdd);
486
487         // Declare pdf_N, pdf_E, pdf_S, pdf_W, ...
488         #define X(name, idx, idxinv, x, y, z)           VPDFT JOIN(vpdf_,name);
489                 D3Q19_LIST
490         #undef X
491
492         PdfT * src = kd->Pdfs[0];
493
494         int threadId = 0;
495         int nThreads = 1;
496
497         #ifdef _OPENMP
498                 threadId = omp_get_thread_num();
499                 nThreads = omp_get_max_threads();
500         #endif
501
502         // TODO: Currently only a 1-D decomposition is applied. For achritectures
503         //       with a lot of cores we want at least 2-D.
504         int threadStartX = nX / nThreads * threadId;
505         int threadEndX   = nX / nThreads * (threadId + 1);
506
507         if (nX % nThreads > 0) {
508                 if (nX % nThreads > threadId) {
509                         threadStartX += threadId;
510                         threadEndX   += threadId + 1;
511                 }
512                 else {
513                         threadStartX += nX % nThreads;
514                         threadEndX   += nX % nThreads;
515                 }
516         }
517
518         AssertMsg((blk[2] % VSIZE == 0) || blk[2] >= nZ, "Blocking in z direction must be a multiple of VSIZE = %d or larger than z dimension.", VSIZE);
519
520         for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) {
521         for (int bY = oY; bY < nY + oY; bY += blk[1]) {
522         for (int bZ = oZ; bZ < nZ + oZ; bZ += blk[2]) {
523
524                 int eX = MIN(bX + blk[0], threadEndX + oX);
525                 int eY = MIN(bY + blk[1], nY + oY);
526                 int eZ = MIN(bZ + blk[2], nZ + oZ);
527
528                 for (int x = bX; x < eX; ++x) {
529                 for (int y = bY; y < eY; ++y) {
530                 for (int z = bZ; z < eZ; z += VSIZE) { // LOOP aa-vec-odd
531
532                         #define I(x, y, z, dir) P_INDEX_5(gDims, (x), (y), (z), (dir))
533
534
535                         #define X(name, idx, idxinv, _x, _y, _z)        JOIN(vpdf_,name) = VLDU(&src[I(x - _x, y - _y, z - _z, idxinv)]);
536                         D3Q19_LIST
537                         #undef X
538
539                         vux = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_E,VADD(vpdf_NE,vpdf_SE)),VADD(vpdf_TE,vpdf_BE)),vpdf_W),vpdf_NW),vpdf_SW),vpdf_TW),vpdf_BW);
540                         vuy = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_N,VADD(vpdf_NE,vpdf_NW)),VADD(vpdf_TN,vpdf_BN)),vpdf_S),vpdf_SE),vpdf_SW),vpdf_TS),vpdf_BS);
541                         vuz = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_T,VADD(vpdf_TE,vpdf_TW)),VADD(vpdf_TN,vpdf_TS)),vpdf_B),vpdf_BE),vpdf_BW),vpdf_BN),vpdf_BS);
542
543                         vdens = VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(vpdf_C,VADD(vpdf_N,vpdf_E)),VADD(vpdf_S,vpdf_W)),VADD(vpdf_NE,vpdf_SE)),
544                                                  VADD(vpdf_SW,vpdf_NW)),VADD(vpdf_T,vpdf_TN)),VADD(vpdf_TE,vpdf_TS)),VADD(vpdf_TW,vpdf_B)),VADD(vpdf_BN,vpdf_BE)),VADD(vpdf_BS,vpdf_BW));
545
546                         vdir_indep_trm = VSUB(vdens,VMUL(VADD(VADD(VMUL(vux,vux),VMUL(vuy,vuy)),VMUL(vuz,vuz)),VTHREE_HALF));
547
548                         VSTU(&src[I(x, y, z, D3Q19_C)],VSUB(vpdf_C,VMUL(vomegaEven,VSUB(vpdf_C,VMUL(vw_0,vdir_indep_trm)))));
549
550                         vw_1_indep = VMUL(vw_1,vdir_indep_trm);
551
552                         vui = vuy;
553                         vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_N,vpdf_S)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep));
554                         voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_N,vpdf_S)),VMUL(vui,vw_1_x3)));
555                         VSTU(&src[I(x, y + 1,  z, D3Q19_N)], VSUB(VSUB(vpdf_N,vevenPart),voddPart));
556                         VSTU(&src[I(x, y - 1,  z, D3Q19_S)], VADD(VSUB(vpdf_S,vevenPart),voddPart));
557
558                         vui = vux;
559                         vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_E,vpdf_W)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep));
560                         voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_E,vpdf_W)),VMUL(vui,vw_1_x3)));
561                         VSTU(&src[I(x + 1, y, z, D3Q19_E)], VSUB(VSUB(vpdf_E,vevenPart),voddPart));
562                         VSTU(&src[I(x - 1, y, z, D3Q19_W)], VADD(VSUB(vpdf_W,vevenPart),voddPart));
563
564                         vui = vuz;
565                         vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_T,vpdf_B)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep));
566                         voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_T,vpdf_B)),VMUL(vui,vw_1_x3)));
567                         VSTU(&src[I(x, y, z + 1, D3Q19_T)], VSUB(VSUB(vpdf_T,vevenPart),voddPart));
568                         VSTU(&src[I(x, y, z - 1, D3Q19_B)], VADD(VSUB(vpdf_B,vevenPart),voddPart));
569
570                         vw_2_indep = VMUL(vw_2,vdir_indep_trm);
571
572                         vui = VSUB(vuy,vux);
573                         vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_NW,vpdf_SE)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
574                         voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_NW,vpdf_SE)),VMUL(vui,vw_2_x3)));
575                         VSTU(&src[I(x - 1, y + 1, z, D3Q19_NW)], VSUB(VSUB(vpdf_NW,vevenPart),voddPart));
576                         VSTU(&src[I(x + 1, y - 1, z, D3Q19_SE)], VADD(VSUB(vpdf_SE,vevenPart),voddPart));
577
578                         vui = VADD(vux,vuy);
579                         vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_NE,vpdf_SW)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
580                         voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_NE,vpdf_SW)),VMUL(vui,vw_2_x3)));
581                         VSTU(&src[I(x + 1, y + 1, z, D3Q19_NE)], VSUB(VSUB(vpdf_NE,vevenPart),voddPart));
582                         VSTU(&src[I(x - 1, y - 1, z, D3Q19_SW)], VADD(VSUB(vpdf_SW,vevenPart),voddPart));
583
584                         vui = VSUB(vuz,vux);
585                         vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TW,vpdf_BE)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
586                         voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TW,vpdf_BE)),VMUL(vui,vw_2_x3)));
587                         VSTU(&src[I(x - 1, y, z + 1, D3Q19_TW)], VSUB(VSUB(vpdf_TW,vevenPart),voddPart));
588                         VSTU(&src[I(x + 1, y, z - 1, D3Q19_BE)], VADD(VSUB(vpdf_BE,vevenPart),voddPart));
589
590                         vui = VADD(vux,vuz);
591                         vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TE,vpdf_BW)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
592                         voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TE,vpdf_BW)),VMUL(vui,vw_2_x3)));
593                         VSTU(&src[I(x + 1, y, z + 1, D3Q19_TE)], VSUB(VSUB(vpdf_TE,vevenPart),voddPart));
594                         VSTU(&src[I(x - 1, y, z - 1, D3Q19_BW)], VADD(VSUB(vpdf_BW,vevenPart),voddPart));
595
596                         vui = VSUB(vuz,vuy);
597                         vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TS,vpdf_BN)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
598                         voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TS,vpdf_BN)),VMUL(vui,vw_2_x3)));
599                         VSTU(&src[I(x, y - 1, z + 1, D3Q19_TS)], VSUB(VSUB(vpdf_TS,vevenPart),voddPart));
600                         VSTU(&src[I(x, y + 1, z - 1, D3Q19_BN)], VADD(VSUB(vpdf_BN,vevenPart),voddPart));
601
602                         vui = VADD(vuy,vuz);
603                         vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TN,vpdf_BS)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
604                         voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TN,vpdf_BS)),VMUL(vui,vw_2_x3)));
605                         VSTU(&src[I(x, y + 1, z + 1, D3Q19_TN)], VSUB(VSUB(vpdf_TN,vevenPart),voddPart));
606                         VSTU(&src[I(x, y - 1, z - 1, D3Q19_BS)], VADD(VSUB(vpdf_BS,vevenPart),voddPart));
607
608                         #undef I
609                 } } } // x, y, z
610         } } } // blocked x, y, z
611
612         return;
613
614 }  // }}}
This page took 0.095958 seconds and 4 git commands to generate.