f6174069936c2a403fefd5f6c45a4ed67fb17204
[LbmBenchmarkKernelsPublic.git] / src / BenchKernelD3Q19ListPullSplitNt.c
1 // --------------------------------------------------------------------------
2 //
3 // Copyright
4 //   Markus Wittmann, 2016-2017
5 //   RRZE, University of Erlangen-Nuremberg, Germany
6 //   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
7 //
8 //   Viktor Haag, 2016
9 //   LSS, University of Erlangen-Nuremberg, Germany
10 //
11 //  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
12 //
13 //  LbmBenchKernels is free software: you can redistribute it and/or modify
14 //  it under the terms of the GNU General Public License as published by
15 //  the Free Software Foundation, either version 3 of the License, or
16 //  (at your option) any later version.
17 //
18 //  LbmBenchKernels is distributed in the hope that it will be useful,
19 //  but WITHOUT ANY WARRANTY; without even the implied warranty of
20 //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21 //  GNU General Public License for more details.
22 //
23 //  You should have received a copy of the GNU General Public License
24 //  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
25 //
26 // --------------------------------------------------------------------------
27 #include "BenchKernelD3Q19ListPullSplitNtCommon.h"
28
29 #include "Memory.h"
30 #include "Vtk.h"
31 #include "Vector.h"
32 #include "LikwidIf.h"
33
34 #include <inttypes.h>
35 #include <math.h>
36
37 #ifdef _OPENMP
38         #include <omp.h>
39 #endif
40
41 #define TMP_UX 18
42 #define TMP_UY 19
43 #define TMP_UZ 20
44 #define TMP_W1 21
45 #define TMP_W2 22
46
47 #define N_TMP 23
48
49 #define TMP_INDEX(tmp_index, tmp_dir)   nTmpArray * (tmp_dir) + (tmp_index)
50
51 void FNAME(KernelPullSplitNt1S)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd)
52 {
53
54         Assert(ld != NULL);
55         Assert(kernelData != NULL);
56         Assert(cd != NULL);
57
58         Assert(cd->Omega > F(0.0));
59         Assert(cd->Omega < F(2.0));
60
61         KernelData        * kd   = (KernelData *)kernelData;
62         KernelDataList    * kdl  = KDL(kernelData);
63         KernelDataListRia * kdlr = KDLR(kernelData);
64
65         PdfT omega = cd->Omega;
66         const PdfT omegaEven = omega;
67
68         PdfT magicParam = F(1.0) / F(12.0);
69         const PdfT omegaOdd = F(1.0) / (F(0.5) + magicParam / (F(1.0) / omega - F(0.5)));
70
71
72         const PdfT w_0 = F(1.0) / F( 3.0);
73         const PdfT w_1 = F(1.0) / F(18.0);
74         const PdfT w_2 = F(1.0) / F(36.0);
75
76         const PdfT w_1_x3 = w_1 * F(3.0);       const PdfT w_1_nine_half = w_1 * F(9.0) / F(2.0);
77         const PdfT w_2_x3 = w_2 * F(3.0);       const PdfT w_2_nine_half = w_2 * F(9.0) / F(2.0);
78
79         const VPDFT vw_1_x3 = VSET(w_1_x3);
80         const VPDFT vw_2_x3 = VSET(w_2_x3);
81
82         const VPDFT vw_1_nine_half = VSET(w_1_nine_half);
83         const VPDFT vw_2_nine_half = VSET(w_2_nine_half);
84
85         const VPDFT vomegaEven = VSET(omegaEven);
86         const VPDFT vomegaOdd  = VSET(omegaOdd);
87
88         const VPDFT voneHalf = VSET(F(0.5));
89
90         // uint32_t nConsecNodes = kdlr->nConsecNodes;
91         // uint32_t * consecNodes = kdlr->ConsecNodes;
92         // uint32_t consecIndex = 0;
93         // uint32_t consecValue = 0;
94
95         PdfT * src = kd->Pdfs[0];
96         PdfT * dst = kd->Pdfs[1];
97         PdfT * tmp;
98
99         int maxIterations  = cd->MaxIterations;
100
101         int nFluid         = kdl->nFluid;
102         int nCells         = kdl->nCells;
103
104         int nTmpArray      = kdlr->nTmpArray;
105
106         Assert(nTmpArray % VSIZE == 0);
107
108         uint32_t * adjList = kdl->AdjList;
109
110         #ifdef VTK_OUTPUT
111                 if (cd->VtkOutput) {
112                         kd->PdfsActive = src;
113                         VtkWrite(ld, kd, cd, -1);
114                 }
115         #endif
116
117         #ifdef STATISTICS
118                 kd->PdfsActive = src;
119                 KernelStatistics(kd, ld, cd, 0);
120         #endif
121
122
123                         X_LIKWID_START("list-pull-split-nt-1s");
124         #ifdef _OPENMP
125                 #pragma omp parallel default(none) \
126                         shared(nFluid, nCells, kd, kdl, adjList, src, dst, \
127                         cd, maxIterations, ld, tmp, nTmpArray, \
128                         stderr )
129         #endif
130         {
131                 uint32_t adjListIndex;
132
133                 PdfT ux, uy, uz, ui;
134                 VPDFT vux, vuy, vuz, vui;
135
136                 #define X(name, idx, idxinv, x, y, z)   PdfT JOIN(pdf_,name);
137                 D3Q19_LIST
138                 #undef X
139                 VPDFT vpdf_a, vpdf_b;
140
141                 PdfT evenPart, oddPart, dir_indep_trm, dens;
142                 PdfT w_1_indep, w_2_indep;
143                 VPDFT vevenPart, voddPart;
144                 VPDFT vw_1_indep, vw_2_indep;
145
146                 int indexMax;
147
148                 PdfT * tmpArray;
149                 MemAllocAligned((void **)&tmpArray, sizeof(PdfT) * nTmpArray * N_TMP, VSIZE * sizeof(PdfT));
150
151                 int nThreads = 1;
152                 int threadId = 0;
153
154 #ifdef _OPENMP
155                 nThreads = omp_get_max_threads();
156                 threadId = omp_get_thread_num();
157 #endif
158
159                 int nCellsThread = nFluid / nThreads;
160                 int blIndexStart = threadId * nCellsThread;
161
162                 if (threadId < nFluid % nThreads) {
163                         blIndexStart += threadId;
164                         nCellsThread += 1;
165                 }
166                 else {
167                         blIndexStart += nFluid % nThreads;
168                 }
169
170                 int blIndexStop = blIndexStart + nCellsThread;
171
172                 // We have three loops:
173                 // 1. Peeling to ensure alignment for non-temporal stores in loop 2 is correct.
174                 // 2. Vectorized handling of nodes.
175                 // 3. Remaining nodes, less than vector size.
176
177                 unsigned long addrStart = (unsigned long)&(src[P_INDEX_3(nCells, blIndexStart, 0)]);
178                 int nCellsUnaligned = (VSIZE - (int)((addrStart / sizeof(PdfT)) % VSIZE)) % VSIZE;
179
180                 int nCellsVectorized = nCellsThread - nCellsUnaligned;
181                 nCellsVectorized = nCellsVectorized - (nCellsVectorized % VSIZE);
182
183                 int blIndexVec       = blIndexStart + nCellsUnaligned;
184                 int blIndexRemaining = blIndexStart + nCellsUnaligned + nCellsVectorized;
185
186                 // printf("%d [%d, %d, %d, %d[\n", threadId, blIndexStart, blIndexVec, blIndexRemaining, blIndexStop);
187
188                 for(int iter = 0; iter < maxIterations; ++iter) {
189
190
191 #if 1
192                         #define INDEX_START     blIndexStart
193                         #define INDEX_STOP  blIndexVec
194                         #include "BenchKernelD3Q19ListPullSplitNt1SScalar.h"
195
196                         #define INDEX_START blIndexVec
197                         #define INDEX_STOP  blIndexRemaining
198                         #include "BenchKernelD3Q19ListPullSplitNt1SIntrinsics.h"
199
200                         #define INDEX_START blIndexRemaining
201                         #define INDEX_STOP      blIndexStop
202                         #include "BenchKernelD3Q19ListPullSplitNt1SScalar.h"
203 #else
204                         #define INDEX_START blIndexStart
205                         #define INDEX_STOP      blIndexStop
206                         #include "BenchKernelD3Q19ListPullSplitNt1SScalar.h"
207 #endif
208
209
210                         #pragma omp barrier
211
212                         #pragma omp single
213                         {
214                                 #ifdef VERIFICATION
215                                         kd->PdfsActive = dst;
216                                         KernelAddBodyForce(kd, ld, cd);
217                                 #endif
218
219                                 #ifdef VTK_OUTPUT
220                                         if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) {
221                                                 kd->PdfsActive = dst;
222                                                 VtkWrite(ld, kd, cd, iter);
223                                         }
224                                 #endif
225
226                                 #ifdef STATISTICS
227                                         kd->PdfsActive = dst;
228                                         KernelStatistics(kd, ld, cd, iter);
229                                 #endif
230
231                                 // swap grids
232                                 tmp = src;
233                                 src = dst;
234                                 dst = tmp;
235                         }
236
237                         #pragma omp barrier
238
239                 } // for (int iter = 0; ...
240
241                 MemFree((void **)&tmpArray);
242         }
243
244
245         X_LIKWID_STOP("list-pull-split-nt-1s");
246
247 #ifdef VTK_OUTPUT
248         if (cd->VtkOutput) {
249                 kd->PdfsActive = src;
250                 VtkWrite(ld, kd, cd, maxIterations);
251         }
252 #endif
253
254 #ifdef STATISTICS
255         kd->PdfsActive = src;
256         KernelStatistics(kd, ld, cd, maxIterations);
257 #endif
258
259         return;
260 }
261
262 void FNAME(KernelPullSplitNt2S)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd)
263 {
264
265         Assert(ld != NULL);
266         Assert(kernelData != NULL);
267         Assert(cd != NULL);
268
269         Assert(cd->Omega > F(0.0));
270         Assert(cd->Omega < F(2.0));
271
272         KernelData        * kd   = (KernelData *)kernelData;
273         KernelDataList    * kdl  = KDL(kernelData);
274         KernelDataListRia * kdlr = KDLR(kernelData);
275
276         PdfT omega = cd->Omega;
277         const PdfT omegaEven = omega;
278
279         PdfT magicParam = F(1.0) / F(12.0);
280         const PdfT omegaOdd = F(1.0) / (F(0.5) + magicParam / (F(1.0) / omega - F(0.5)));
281
282         const PdfT w_0 = F(1.0) / F( 3.0);
283         const PdfT w_1 = F(1.0) / F(18.0);
284         const PdfT w_2 = F(1.0) / F(36.0);
285
286         const PdfT w_1_x3 = w_1 * F(3.0);       const PdfT w_1_nine_half = w_1 * F(9.0) / F(2.0);
287         const PdfT w_2_x3 = w_2 * F(3.0);       const PdfT w_2_nine_half = w_2 * F(9.0) / F(2.0);
288
289         const VPDFT vw_1_x3 = VSET(w_1_x3);
290         const VPDFT vw_2_x3 = VSET(w_2_x3);
291
292         const VPDFT vw_1_nine_half = VSET(w_1_nine_half);
293         const VPDFT vw_2_nine_half = VSET(w_2_nine_half);
294
295         const VPDFT vomegaEven = VSET(omegaEven);
296         const VPDFT vomegaOdd  = VSET(omegaOdd);
297
298         const VPDFT voneHalf = VSET(F(0.5));
299
300         // uint32_t nConsecNodes = kdlr->nConsecNodes;
301         // uint32_t * consecNodes = kdlr->ConsecNodes;
302         // uint32_t consecIndex = 0;
303         // uint32_t consecValue = 0;
304
305         PdfT * src = kd->Pdfs[0];
306         PdfT * dst = kd->Pdfs[1];
307         PdfT * tmp;
308
309         int maxIterations  = cd->MaxIterations;
310
311         int nFluid         = kdl->nFluid;
312         int nCells         = kdl->nCells;
313
314         int nTmpArray      = kdlr->nTmpArray;
315
316         Assert(nTmpArray % VSIZE == 0);
317
318         uint32_t * adjList = kdl->AdjList;
319
320         #ifdef VTK_OUTPUT
321                 if (cd->VtkOutput) {
322                         kd->PdfsActive = src;
323                         VtkWrite(ld, kd, cd, -1);
324                 }
325         #endif
326
327         #ifdef STATISTICS
328                 kd->PdfsActive = src;
329                 KernelStatistics(kd, ld, cd, 0);
330         #endif
331
332
333                         X_LIKWID_START("list-pull-split-nt-2s");
334
335
336         #ifdef _OPENMP
337                 #pragma omp parallel default(none) \
338                         shared(nFluid, nCells, kd, kdl, adjList, src, dst, \
339                         cd, maxIterations, ld, tmp, nTmpArray, \
340                         stderr )
341         #endif
342         {
343                 uint32_t adjListIndex;
344
345                 PdfT ux, uy, uz, ui;
346                 VPDFT vux, vuy, vuz, vui;
347
348                 #define X(name, idx, idxinv, x, y, z)   PdfT JOIN(pdf_,name);
349                 D3Q19_LIST
350                 #undef X
351                 VPDFT vpdf_a, vpdf_b;
352
353                 PdfT evenPart, oddPart, dir_indep_trm, dens;
354                 PdfT w_1_indep, w_2_indep;
355                 VPDFT vevenPart, voddPart;
356                 VPDFT vw_1_indep, vw_2_indep;
357
358                 int indexMax;
359
360                 PdfT * tmpArray;
361                 MemAlloc((void **)&tmpArray, sizeof(PdfT) * nTmpArray * N_TMP);
362
363                 int nThreads = 1;
364                 int threadId = 0;
365
366 #ifdef _OPENMP
367                 nThreads = omp_get_max_threads();
368                 threadId = omp_get_thread_num();
369 #endif
370
371                 int nCellsThread = nFluid / nThreads;
372                 int blIndexStart = threadId * nCellsThread;
373
374                 if (threadId < nFluid % nThreads) {
375                         blIndexStart += threadId;
376                         nCellsThread += 1;
377                 }
378                 else {
379                         blIndexStart += nFluid % nThreads;
380                 }
381
382                 int blIndexStop = blIndexStart + nCellsThread;
383
384                 // We have three loops:
385                 // 1. Peeling to ensure alignment for non-temporal stores in loop 2 is correct.
386                 // 2. Vectorized handling of nodes.
387                 // 3. Remaining nodes, less than vector size.
388
389                 unsigned long addrStart = (unsigned long)&(src[P_INDEX_3(nCells, blIndexStart, 0)]);
390                 int nCellsUnaligned = (VSIZE - (int)((addrStart / sizeof(PdfT)) % VSIZE)) % VSIZE;
391
392                 int nCellsVectorized = nCellsThread - nCellsUnaligned;
393                 nCellsVectorized = nCellsVectorized - (nCellsVectorized % VSIZE);
394
395                 int blIndexVec       = blIndexStart + nCellsUnaligned;
396                 int blIndexRemaining = blIndexStart + nCellsUnaligned + nCellsVectorized;
397
398                 // printf("%d [%d, %d, %d, %d[\n", threadId, blIndexStart, blIndexVec, blIndexRemaining, blIndexStop);
399
400                 for(int iter = 0; iter < maxIterations; ++iter) {
401
402 #if 1
403                         #define INDEX_START     blIndexStart
404                         #define INDEX_STOP  blIndexVec
405                         #include "BenchKernelD3Q19ListPullSplitNt2SScalar.h"
406
407                         #define INDEX_START blIndexVec
408                         #define INDEX_STOP  blIndexRemaining
409                         #include "BenchKernelD3Q19ListPullSplitNt2SIntrinsics.h"
410
411                         #define INDEX_START blIndexRemaining
412                         #define INDEX_STOP      blIndexStop
413                         #include "BenchKernelD3Q19ListPullSplitNt2SScalar.h"
414 #else
415                         #define INDEX_START blIndexStart
416                         #define INDEX_STOP      blIndexStop
417                         #include "BenchKernelD3Q19ListPullSplitNt2SScalar.h"
418 #endif
419                         #pragma omp barrier
420
421
422                         #pragma omp single
423                         {
424                                 #ifdef VERIFICATION
425                                         kd->PdfsActive = dst;
426                                         KernelAddBodyForce(kd, ld, cd);
427                                 #endif
428
429                                 #ifdef VTK_OUTPUT
430                                         if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) {
431                                                 kd->PdfsActive = dst;
432                                                 VtkWrite(ld, kd, cd, iter);
433                                         }
434                                 #endif
435
436                                 #ifdef STATISTICS
437                                         kd->PdfsActive = dst;
438                                         KernelStatistics(kd, ld, cd, iter);
439                                 #endif
440
441                                 // swap grids
442                                 tmp = src;
443                                 src = dst;
444                                 dst = tmp;
445                         }
446
447                         #pragma omp barrier
448
449                 } // for (int iter = 0; ...
450
451                 MemFree((void **)&tmpArray);
452         }
453
454                         X_LIKWID_STOP("list-pull-split-nt-2s");
455
456 #ifdef VTK_OUTPUT
457         if (cd->VtkOutput) {
458                 kd->PdfsActive = src;
459                 VtkWrite(ld, kd, cd, maxIterations);
460         }
461 #endif
462
463 #ifdef STATISTICS
464         kd->PdfsActive = src;
465         KernelStatistics(kd, ld, cd, maxIterations);
466 #endif
467
468         return;
469 }
470
This page took 0.061918 seconds and 3 git commands to generate.