merge with kernels from MH's master thesis
[LbmBenchmarkKernelsPublic.git] / src / BenchKernelD3Q19ListPullSplitNt.c
1 // --------------------------------------------------------------------------
2 //
3 // Copyright
4 //   Markus Wittmann, 2016-2017
5 //   RRZE, University of Erlangen-Nuremberg, Germany
6 //   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
7 //
8 //   Viktor Haag, 2016
9 //   LSS, University of Erlangen-Nuremberg, Germany
10 //
11 //  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
12 //
13 //  LbmBenchKernels is free software: you can redistribute it and/or modify
14 //  it under the terms of the GNU General Public License as published by
15 //  the Free Software Foundation, either version 3 of the License, or
16 //  (at your option) any later version.
17 //
18 //  LbmBenchKernels is distributed in the hope that it will be useful,
19 //  but WITHOUT ANY WARRANTY; without even the implied warranty of
20 //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21 //  GNU General Public License for more details.
22 //
23 //  You should have received a copy of the GNU General Public License
24 //  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
25 //
26 // --------------------------------------------------------------------------
27 #include "BenchKernelD3Q19ListPullSplitNtCommon.h"
28
29 #include "Memory.h"
30 #include "Vtk.h"
31 #include "Vector.h"
32 #include "LikwidIf.h"
33
34 #include <inttypes.h>
35 #include <math.h>
36
37 #ifdef _OPENMP
38         #include <omp.h>
39 #endif
40
41 #define TMP_UX 18
42 #define TMP_UY 19
43 #define TMP_UZ 20
44 #define TMP_W1 21
45 #define TMP_W2 22
46
47 #define N_TMP 23
48
49 #define TMP_INDEX(tmp_index, tmp_dir)   nTmpArray * (tmp_dir) + (tmp_index)
50
51 void FNAME(KernelPullSplitNt1S)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd)
52 {
53
54         Assert(ld != NULL);
55         Assert(kernelData != NULL);
56         Assert(cd != NULL);
57
58         Assert(cd->Omega > F(0.0));
59         Assert(cd->Omega < F(2.0));
60
61         KernelData        * kd   = (KernelData *)kernelData;
62         KernelDataList    * kdl  = KDL(kernelData);
63         KernelDataListRia * kdlr = KDLR(kernelData);
64
65         PdfT omega = cd->Omega;
66         const PdfT omegaEven = omega;
67
68         PdfT magicParam = F(1.0) / F(12.0);
69         const PdfT omegaOdd = F(1.0) / (F(0.5) + magicParam / (F(1.0) / omega - F(0.5)));
70
71
72         const PdfT w_0 = F(1.0) / F( 3.0);
73         const PdfT w_1 = F(1.0) / F(18.0);
74         const PdfT w_2 = F(1.0) / F(36.0);
75
76         const PdfT w_1_x3 = w_1 * F(3.0);       const PdfT w_1_nine_half = w_1 * F(9.0) / F(2.0);
77         const PdfT w_2_x3 = w_2 * F(3.0);       const PdfT w_2_nine_half = w_2 * F(9.0) / F(2.0);
78
79         const VPDFT vw_1_x3 = VSET(w_1_x3);
80         const VPDFT vw_2_x3 = VSET(w_2_x3);
81
82         const VPDFT vw_1_nine_half = VSET(w_1_nine_half);
83         const VPDFT vw_2_nine_half = VSET(w_2_nine_half);
84
85         const VPDFT vomegaEven = VSET(omegaEven);
86         const VPDFT vomegaOdd  = VSET(omegaOdd);
87
88         const VPDFT voneHalf = VSET(F(0.5));
89
90         // uint32_t nConsecNodes = kdlr->nConsecNodes;
91         // uint32_t * consecNodes = kdlr->ConsecNodes;
92         // uint32_t consecIndex = 0;
93         // uint32_t consecValue = 0;
94
95         PdfT * src = kd->Pdfs[0];
96         PdfT * dst = kd->Pdfs[1];
97         PdfT * tmp;
98
99         int maxIterations  = cd->MaxIterations;
100
101         int nFluid         = kdl->nFluid;
102         int nCells         = kdl->nCells;
103
104         int nTmpArray      = kdlr->nTmpArray;
105
106         Assert(nTmpArray % VSIZE == 0);
107
108         uint32_t * adjList = kdl->AdjList;
109
110         #ifdef VTK_OUTPUT
111                 if (cd->VtkOutput) {
112                         kd->PdfsActive = src;
113                         VtkWrite(ld, kd, cd, -1);
114                 }
115         #endif
116
117         #ifdef STATISTICS
118                 kd->PdfsActive = src;
119                 KernelStatistics(kd, ld, cd, 0);
120         #endif
121
122         X_KERNEL_START(kernelData);
123
124         X_LIKWID_START("list-pull-split-nt-1s");
125
126         #ifdef _OPENMP
127                 #pragma omp parallel default(none) \
128                         shared(nFluid, nCells, kd, kdl, adjList, src, dst, \
129                         cd, maxIterations, ld, tmp, nTmpArray, \
130                         stderr )
131         #endif
132         {
133                 uint32_t adjListIndex;
134
135                 PdfT ux, uy, uz, ui;
136                 VPDFT vux, vuy, vuz, vui;
137
138                 #define X(name, idx, idxinv, x, y, z)   PdfT JOIN(pdf_,name);
139                 D3Q19_LIST
140                 #undef X
141                 VPDFT vpdf_a, vpdf_b;
142
143                 PdfT evenPart, oddPart, dir_indep_trm, dens;
144                 PdfT w_1_indep, w_2_indep;
145                 VPDFT vevenPart, voddPart;
146                 VPDFT vw_1_indep, vw_2_indep;
147
148                 int indexMax;
149
150                 PdfT * tmpArray;
151                 MemAllocAligned((void **)&tmpArray, sizeof(PdfT) * nTmpArray * N_TMP, VSIZE * sizeof(PdfT));
152
153                 int nThreads = 1;
154                 int threadId = 0;
155
156 #ifdef _OPENMP
157                 nThreads = omp_get_max_threads();
158                 threadId = omp_get_thread_num();
159 #endif
160
161                 int nCellsThread = nFluid / nThreads;
162                 int blIndexStart = threadId * nCellsThread;
163
164                 if (threadId < nFluid % nThreads) {
165                         blIndexStart += threadId;
166                         nCellsThread += 1;
167                 }
168                 else {
169                         blIndexStart += nFluid % nThreads;
170                 }
171
172                 int blIndexStop = blIndexStart + nCellsThread;
173
174                 // We have three loops:
175                 // 1. Peeling to ensure alignment for non-temporal stores in loop 2 is correct.
176                 // 2. Vectorized handling of nodes.
177                 // 3. Remaining nodes, less than vector size.
178
179                 unsigned long addrStart = (unsigned long)&(src[P_INDEX_3(nCells, blIndexStart, 0)]);
180                 int nCellsUnaligned = (VSIZE - (int)((addrStart / sizeof(PdfT)) % VSIZE)) % VSIZE;
181
182                 int nCellsVectorized = nCellsThread - nCellsUnaligned;
183                 nCellsVectorized = nCellsVectorized - (nCellsVectorized % VSIZE);
184
185                 int blIndexVec       = blIndexStart + nCellsUnaligned;
186                 int blIndexRemaining = blIndexStart + nCellsUnaligned + nCellsVectorized;
187
188                 // printf("%d [%d, %d, %d, %d[\n", threadId, blIndexStart, blIndexVec, blIndexRemaining, blIndexStop);
189
190                 for(int iter = 0; iter < maxIterations; ++iter) {
191
192
193 #if 1
194                         #define INDEX_START     blIndexStart
195                         #define INDEX_STOP  blIndexVec
196                         #include "BenchKernelD3Q19ListPullSplitNt1SScalar.h"
197
198                         #define INDEX_START blIndexVec
199                         #define INDEX_STOP  blIndexRemaining
200                         #include "BenchKernelD3Q19ListPullSplitNt1SIntrinsics.h"
201
202                         #define INDEX_START blIndexRemaining
203                         #define INDEX_STOP      blIndexStop
204                         #include "BenchKernelD3Q19ListPullSplitNt1SScalar.h"
205 #else
206                         #define INDEX_START blIndexStart
207                         #define INDEX_STOP      blIndexStop
208                         #include "BenchKernelD3Q19ListPullSplitNt1SScalar.h"
209 #endif
210
211
212                         #pragma omp barrier
213
214                         #pragma omp single
215                         {
216                                 #ifdef VERIFICATION
217                                         kd->PdfsActive = dst;
218                                         KernelAddBodyForce(kd, ld, cd);
219                                 #endif
220
221                                 #ifdef VTK_OUTPUT
222                                         if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) {
223                                                 kd->PdfsActive = dst;
224                                                 VtkWrite(ld, kd, cd, iter);
225                                         }
226                                 #endif
227
228                                 #ifdef STATISTICS
229                                         kd->PdfsActive = dst;
230                                         KernelStatistics(kd, ld, cd, iter);
231                                 #endif
232
233                                 // swap grids
234                                 tmp = src;
235                                 src = dst;
236                                 dst = tmp;
237                         }
238
239                         #pragma omp barrier
240
241                 } // for (int iter = 0; ...
242
243                 MemFree((void **)&tmpArray);
244         }
245
246
247         X_LIKWID_STOP("list-pull-split-nt-1s");
248
249         X_KERNEL_END(kernelData);
250
251 #ifdef VTK_OUTPUT
252         if (cd->VtkOutput) {
253                 kd->PdfsActive = src;
254                 VtkWrite(ld, kd, cd, maxIterations);
255         }
256 #endif
257
258 #ifdef STATISTICS
259         kd->PdfsActive = src;
260         KernelStatistics(kd, ld, cd, maxIterations);
261 #endif
262
263         return;
264 }
265
266 void FNAME(KernelPullSplitNt2S)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd)
267 {
268
269         Assert(ld != NULL);
270         Assert(kernelData != NULL);
271         Assert(cd != NULL);
272
273         Assert(cd->Omega > F(0.0));
274         Assert(cd->Omega < F(2.0));
275
276         KernelData        * kd   = (KernelData *)kernelData;
277         KernelDataList    * kdl  = KDL(kernelData);
278         KernelDataListRia * kdlr = KDLR(kernelData);
279
280         PdfT omega = cd->Omega;
281         const PdfT omegaEven = omega;
282
283         PdfT magicParam = F(1.0) / F(12.0);
284         const PdfT omegaOdd = F(1.0) / (F(0.5) + magicParam / (F(1.0) / omega - F(0.5)));
285
286         const PdfT w_0 = F(1.0) / F( 3.0);
287         const PdfT w_1 = F(1.0) / F(18.0);
288         const PdfT w_2 = F(1.0) / F(36.0);
289
290         const PdfT w_1_x3 = w_1 * F(3.0);       const PdfT w_1_nine_half = w_1 * F(9.0) / F(2.0);
291         const PdfT w_2_x3 = w_2 * F(3.0);       const PdfT w_2_nine_half = w_2 * F(9.0) / F(2.0);
292
293         const VPDFT vw_1_x3 = VSET(w_1_x3);
294         const VPDFT vw_2_x3 = VSET(w_2_x3);
295
296         const VPDFT vw_1_nine_half = VSET(w_1_nine_half);
297         const VPDFT vw_2_nine_half = VSET(w_2_nine_half);
298
299         const VPDFT vomegaEven = VSET(omegaEven);
300         const VPDFT vomegaOdd  = VSET(omegaOdd);
301
302         const VPDFT voneHalf = VSET(F(0.5));
303
304         // uint32_t nConsecNodes = kdlr->nConsecNodes;
305         // uint32_t * consecNodes = kdlr->ConsecNodes;
306         // uint32_t consecIndex = 0;
307         // uint32_t consecValue = 0;
308
309         PdfT * src = kd->Pdfs[0];
310         PdfT * dst = kd->Pdfs[1];
311         PdfT * tmp;
312
313         int maxIterations  = cd->MaxIterations;
314
315         int nFluid         = kdl->nFluid;
316         int nCells         = kdl->nCells;
317
318         int nTmpArray      = kdlr->nTmpArray;
319
320         Assert(nTmpArray % VSIZE == 0);
321
322         uint32_t * adjList = kdl->AdjList;
323
324         #ifdef VTK_OUTPUT
325                 if (cd->VtkOutput) {
326                         kd->PdfsActive = src;
327                         VtkWrite(ld, kd, cd, -1);
328                 }
329         #endif
330
331         #ifdef STATISTICS
332                 kd->PdfsActive = src;
333                 KernelStatistics(kd, ld, cd, 0);
334         #endif
335
336
337         X_KERNEL_START(kernelData);
338
339         X_LIKWID_START("list-pull-split-nt-2s");
340
341
342         #ifdef _OPENMP
343                 #pragma omp parallel default(none) \
344                         shared(nFluid, nCells, kd, kdl, adjList, src, dst, \
345                         cd, maxIterations, ld, tmp, nTmpArray, \
346                         stderr )
347         #endif
348         {
349                 uint32_t adjListIndex;
350
351                 PdfT ux, uy, uz, ui;
352                 VPDFT vux, vuy, vuz, vui;
353
354                 #define X(name, idx, idxinv, x, y, z)   PdfT JOIN(pdf_,name);
355                 D3Q19_LIST
356                 #undef X
357                 VPDFT vpdf_a, vpdf_b;
358
359                 PdfT evenPart, oddPart, dir_indep_trm, dens;
360                 PdfT w_1_indep, w_2_indep;
361                 VPDFT vevenPart, voddPart;
362                 VPDFT vw_1_indep, vw_2_indep;
363
364                 int indexMax;
365
366                 PdfT * tmpArray;
367                 MemAlloc((void **)&tmpArray, sizeof(PdfT) * nTmpArray * N_TMP);
368
369                 int nThreads = 1;
370                 int threadId = 0;
371
372 #ifdef _OPENMP
373                 nThreads = omp_get_max_threads();
374                 threadId = omp_get_thread_num();
375 #endif
376
377                 int nCellsThread = nFluid / nThreads;
378                 int blIndexStart = threadId * nCellsThread;
379
380                 if (threadId < nFluid % nThreads) {
381                         blIndexStart += threadId;
382                         nCellsThread += 1;
383                 }
384                 else {
385                         blIndexStart += nFluid % nThreads;
386                 }
387
388                 int blIndexStop = blIndexStart + nCellsThread;
389
390                 // We have three loops:
391                 // 1. Peeling to ensure alignment for non-temporal stores in loop 2 is correct.
392                 // 2. Vectorized handling of nodes.
393                 // 3. Remaining nodes, less than vector size.
394
395                 unsigned long addrStart = (unsigned long)&(src[P_INDEX_3(nCells, blIndexStart, 0)]);
396                 int nCellsUnaligned = (VSIZE - (int)((addrStart / sizeof(PdfT)) % VSIZE)) % VSIZE;
397
398                 int nCellsVectorized = nCellsThread - nCellsUnaligned;
399                 nCellsVectorized = nCellsVectorized - (nCellsVectorized % VSIZE);
400
401                 int blIndexVec       = blIndexStart + nCellsUnaligned;
402                 int blIndexRemaining = blIndexStart + nCellsUnaligned + nCellsVectorized;
403
404                 // printf("%d [%d, %d, %d, %d[\n", threadId, blIndexStart, blIndexVec, blIndexRemaining, blIndexStop);
405
406                 for(int iter = 0; iter < maxIterations; ++iter) {
407
408 #if 1
409                         #define INDEX_START     blIndexStart
410                         #define INDEX_STOP  blIndexVec
411                         #include "BenchKernelD3Q19ListPullSplitNt2SScalar.h"
412
413                         #define INDEX_START blIndexVec
414                         #define INDEX_STOP  blIndexRemaining
415                         #include "BenchKernelD3Q19ListPullSplitNt2SIntrinsics.h"
416
417                         #define INDEX_START blIndexRemaining
418                         #define INDEX_STOP      blIndexStop
419                         #include "BenchKernelD3Q19ListPullSplitNt2SScalar.h"
420 #else
421                         #define INDEX_START blIndexStart
422                         #define INDEX_STOP      blIndexStop
423                         #include "BenchKernelD3Q19ListPullSplitNt2SScalar.h"
424 #endif
425                         #pragma omp barrier
426
427
428                         #pragma omp single
429                         {
430                                 #ifdef VERIFICATION
431                                         kd->PdfsActive = dst;
432                                         KernelAddBodyForce(kd, ld, cd);
433                                 #endif
434
435                                 #ifdef VTK_OUTPUT
436                                         if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) {
437                                                 kd->PdfsActive = dst;
438                                                 VtkWrite(ld, kd, cd, iter);
439                                         }
440                                 #endif
441
442                                 #ifdef STATISTICS
443                                         kd->PdfsActive = dst;
444                                         KernelStatistics(kd, ld, cd, iter);
445                                 #endif
446
447                                 // swap grids
448                                 tmp = src;
449                                 src = dst;
450                                 dst = tmp;
451                         }
452
453                         #pragma omp barrier
454
455                 } // for (int iter = 0; ...
456
457                 MemFree((void **)&tmpArray);
458         }
459
460         X_LIKWID_STOP("list-pull-split-nt-2s");
461
462         X_KERNEL_END(kernelData);
463
464 #ifdef VTK_OUTPUT
465         if (cd->VtkOutput) {
466                 kd->PdfsActive = src;
467                 VtkWrite(ld, kd, cd, maxIterations);
468         }
469 #endif
470
471 #ifdef STATISTICS
472         kd->PdfsActive = src;
473         KernelStatistics(kd, ld, cd, maxIterations);
474 #endif
475
476         return;
477 }
478
This page took 0.07817 seconds and 4 git commands to generate.