version 0.1
[LbmBenchmarkKernelsPublic.git] / src / BenchKernelD3Q19ListPullSplitNt.c
1 // --------------------------------------------------------------------------
2 //
3 // Copyright
4 //   Markus Wittmann, 2016-2017
5 //   RRZE, University of Erlangen-Nuremberg, Germany
6 //   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
7 //
8 //   Viktor Haag, 2016
9 //   LSS, University of Erlangen-Nuremberg, Germany
10 //
11 //  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
12 //
13 //  LbmBenchKernels is free software: you can redistribute it and/or modify
14 //  it under the terms of the GNU General Public License as published by
15 //  the Free Software Foundation, either version 3 of the License, or
16 //  (at your option) any later version.
17 //
18 //  LbmBenchKernels is distributed in the hope that it will be useful,
19 //  but WITHOUT ANY WARRANTY; without even the implied warranty of
20 //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21 //  GNU General Public License for more details.
22 //
23 //  You should have received a copy of the GNU General Public License
24 //  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
25 //
26 // --------------------------------------------------------------------------
27 #include "BenchKernelD3Q19ListPullSplitNtCommon.h"
28
29 #include "Memory.h"
30 #include "Vtk.h"
31 #include "Vector.h"
32
33 #include <inttypes.h>
34 #include <math.h>
35
36 #ifdef _OPENMP
37         #include <omp.h>
38 #endif
39
40 #define TMP_UX 18
41 #define TMP_UY 19
42 #define TMP_UZ 20
43 #define TMP_W1 21
44 #define TMP_W2 22
45
46 #define N_TMP 23
47
48 #define TMP_INDEX(tmp_index, tmp_dir)   nTmpArray * (tmp_dir) + (tmp_index)
49
50 void FNAME(KernelPullSplitNt1S)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd)
51 {
52
53         Assert(ld != NULL);
54         Assert(kernelData != NULL);
55         Assert(cd != NULL);
56
57         Assert(cd->Omega > 0.0);
58         Assert(cd->Omega < 2.0);
59
60         KernelData        * kd   = (KernelData *)kernelData;
61         KernelDataList    * kdl  = KDL(kernelData);
62         KernelDataListRia * kdlr = KDLR(kernelData);
63
64         PdfT omega = cd->Omega;
65         const PdfT omegaEven = omega;
66
67         PdfT magicParam = 1.0 / 12.0;
68         const PdfT omegaOdd = 1.0 / (0.5 + magicParam / (1.0 / omega - 0.5));
69
70
71         const PdfT w_0 = 1.0 /  3.0;
72         const PdfT w_1 = 1.0 / 18.0;
73         const PdfT w_2 = 1.0 / 36.0;
74
75         const PdfT w_1_x3 = w_1 * 3.0;  const PdfT w_1_nine_half = w_1 * 9.0 / 2.0;
76         const PdfT w_2_x3 = w_2 * 3.0;  const PdfT w_2_nine_half = w_2 * 9.0 / 2.0;
77
78         const VPDFT vw_1_x3 = VSET(w_1_x3);
79         const VPDFT vw_2_x3 = VSET(w_2_x3);
80
81         const VPDFT vw_1_nine_half = VSET(w_1_nine_half);
82         const VPDFT vw_2_nine_half = VSET(w_2_nine_half);
83
84         const VPDFT vomegaEven = VSET(omegaEven);
85         const VPDFT vomegaOdd  = VSET(omegaOdd);
86
87         const VPDFT voneHalf = VSET(0.5);
88
89         // uint32_t nConsecNodes = kdlr->nConsecNodes;
90         // uint32_t * consecNodes = kdlr->ConsecNodes;
91         // uint32_t consecIndex = 0;
92         // uint32_t consecValue = 0;
93
94         PdfT * src = kd->Pdfs[0];
95         PdfT * dst = kd->Pdfs[1];
96         PdfT * tmp;
97
98         int maxIterations  = cd->MaxIterations;
99
100         int nFluid         = kdl->nFluid;
101         int nCells         = kdl->nCells;
102
103         int nTmpArray      = kdlr->nTmpArray;
104
105         Assert(nTmpArray % VSIZE == 0);
106
107         uint32_t * adjList = kdl->AdjList;
108
109         #ifdef VTK_OUTPUT
110                 if (cd->VtkOutput) {
111                         kd->PdfsActive = src;
112                         VtkWrite(ld, kd, cd, -1);
113                 }
114         #endif
115
116         #ifdef STATISTICS
117                 kd->PdfsActive = src;
118                 KernelStatistics(kd, ld, cd, 0);
119         #endif
120
121         #ifdef _OPENMP
122                 #pragma omp parallel default(none) \
123                         shared(nFluid, nCells, kd, kdl, adjList, src, dst, \
124                         cd, maxIterations, ld, tmp, nTmpArray, \
125                         stderr )
126         #endif
127         {
128                 uint32_t adjListIndex;
129
130                 PdfT ux, uy, uz, ui;
131                 VPDFT vux, vuy, vuz, vui;
132
133                 #define X(name, idx, idxinv, x, y, z)   PdfT JOIN(pdf_,name);
134                 D3Q19_LIST
135                 #undef X
136                 VPDFT vpdf_a, vpdf_b;
137
138                 PdfT evenPart, oddPart, dir_indep_trm, dens;
139                 PdfT w_1_indep, w_2_indep;
140                 VPDFT vevenPart, voddPart;
141                 VPDFT vw_1_indep, vw_2_indep;
142
143                 int indexMax;
144
145                 PdfT * tmpArray;
146                 MemAllocAligned((void **)&tmpArray, sizeof(PdfT) * nTmpArray * N_TMP, VSIZE * sizeof(PdfT));
147
148                 int nThreads = 1;
149                 int threadId = 0;
150
151 #ifdef _OPENMP
152                 nThreads = omp_get_max_threads();
153                 threadId = omp_get_thread_num();
154 #endif
155
156                 int nCellsThread = nFluid / nThreads;
157                 int blIndexStart = threadId * nCellsThread;
158
159                 if (threadId < nFluid % nThreads) {
160                         blIndexStart += threadId;
161                         nCellsThread += 1;
162                 }
163                 else {
164                         blIndexStart += nFluid % nThreads;
165                 }
166
167                 int blIndexStop = blIndexStart + nCellsThread;
168
169                 // We have three loops:
170                 // 1. Peeling to ensure alignment for non-temporal stores in loop 2 is correct.
171                 // 2. Vectorized handling of nodes.
172                 // 3. Remaining nodes, less than vector size.
173
174                 unsigned long addrStart = (unsigned long)&(src[P_INDEX_3(nCells, blIndexStart, 0)]);
175                 int nCellsUnaligned = (VSIZE - (int)((addrStart / sizeof(PdfT)) % VSIZE)) % VSIZE;
176
177                 int nCellsVectorized = nCellsThread - nCellsUnaligned;
178                 nCellsVectorized = nCellsVectorized - (nCellsVectorized % VSIZE);
179
180                 int blIndexVec       = blIndexStart + nCellsUnaligned;
181                 int blIndexRemaining = blIndexStart + nCellsUnaligned + nCellsVectorized;
182
183                 // printf("%d [%d, %d, %d, %d[\n", threadId, blIndexStart, blIndexVec, blIndexRemaining, blIndexStop);
184
185                 for(int iter = 0; iter < maxIterations; ++iter) {
186
187 #if 1
188                         #define INDEX_START     blIndexStart
189                         #define INDEX_STOP  blIndexVec
190                         #include "BenchKernelD3Q19ListPullSplitNt1SScalar.h"
191
192                         #define INDEX_START blIndexVec
193                         #define INDEX_STOP  blIndexRemaining
194                         #include "BenchKernelD3Q19ListPullSplitNt1SIntrinsics.h"
195
196                         #define INDEX_START blIndexRemaining
197                         #define INDEX_STOP      blIndexStop
198                         #include "BenchKernelD3Q19ListPullSplitNt1SScalar.h"
199 #else
200                         #define INDEX_START blIndexStart
201                         #define INDEX_STOP      blIndexStop
202                         #include "BenchKernelD3Q19ListPullSplitNt1SScalar.h"
203 #endif
204                         #pragma omp barrier
205
206                         #pragma omp single
207                         {
208                                 #ifdef VERIFICATION
209                                         kd->PdfsActive = dst;
210                                         KernelAddBodyForce(kd, ld, cd);
211                                 #endif
212
213                                 #ifdef VTK_OUTPUT
214                                         if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) {
215                                                 kd->PdfsActive = dst;
216                                                 VtkWrite(ld, kd, cd, iter);
217                                         }
218                                 #endif
219
220                                 #ifdef STATISTICS
221                                         kd->PdfsActive = dst;
222                                         KernelStatistics(kd, ld, cd, iter);
223                                 #endif
224
225                                 // swap grids
226                                 tmp = src;
227                                 src = dst;
228                                 dst = tmp;
229                         }
230
231                         #pragma omp barrier
232
233                 } // for (int iter = 0; ...
234
235                 MemFree((void **)&tmpArray);
236         }
237
238 #ifdef VTK_OUTPUT
239         if (cd->VtkOutput) {
240                 kd->PdfsActive = src;
241                 VtkWrite(ld, kd, cd, maxIterations);
242         }
243 #endif
244
245 #ifdef STATISTICS
246         kd->PdfsActive = src;
247         KernelStatistics(kd, ld, cd, maxIterations);
248 #endif
249
250         return;
251 }
252
253 void FNAME(KernelPullSplitNt2S)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd)
254 {
255
256         Assert(ld != NULL);
257         Assert(kernelData != NULL);
258         Assert(cd != NULL);
259
260         Assert(cd->Omega > 0.0);
261         Assert(cd->Omega < 2.0);
262
263         KernelData        * kd   = (KernelData *)kernelData;
264         KernelDataList    * kdl  = KDL(kernelData);
265         KernelDataListRia * kdlr = KDLR(kernelData);
266
267         PdfT omega = cd->Omega;
268         const PdfT omegaEven = omega;
269
270         PdfT magicParam = 1.0 / 12.0;
271         const PdfT omegaOdd = 1.0 / (0.5 + magicParam / (1.0 / omega - 0.5));
272
273
274         const PdfT w_0 = 1.0 /  3.0;
275         const PdfT w_1 = 1.0 / 18.0;
276         const PdfT w_2 = 1.0 / 36.0;
277
278         const PdfT w_1_x3 = w_1 * 3.0;  const PdfT w_1_nine_half = w_1 * 9.0 / 2.0;
279         const PdfT w_2_x3 = w_2 * 3.0;  const PdfT w_2_nine_half = w_2 * 9.0 / 2.0;
280
281         const VPDFT vw_1_x3 = VSET(w_1_x3);
282         const VPDFT vw_2_x3 = VSET(w_2_x3);
283
284         const VPDFT vw_1_nine_half = VSET(w_1_nine_half);
285         const VPDFT vw_2_nine_half = VSET(w_2_nine_half);
286
287         const VPDFT vomegaEven = VSET(omegaEven);
288         const VPDFT vomegaOdd  = VSET(omegaOdd);
289
290         const VPDFT voneHalf = VSET(0.5);
291
292         // uint32_t nConsecNodes = kdlr->nConsecNodes;
293         // uint32_t * consecNodes = kdlr->ConsecNodes;
294         // uint32_t consecIndex = 0;
295         // uint32_t consecValue = 0;
296
297         PdfT * src = kd->Pdfs[0];
298         PdfT * dst = kd->Pdfs[1];
299         PdfT * tmp;
300
301         int maxIterations  = cd->MaxIterations;
302
303         int nFluid         = kdl->nFluid;
304         int nCells         = kdl->nCells;
305
306         int nTmpArray      = kdlr->nTmpArray;
307
308         Assert(nTmpArray % VSIZE == 0);
309
310         uint32_t * adjList = kdl->AdjList;
311
312         #ifdef VTK_OUTPUT
313                 if (cd->VtkOutput) {
314                         kd->PdfsActive = src;
315                         VtkWrite(ld, kd, cd, -1);
316                 }
317         #endif
318
319         #ifdef STATISTICS
320                 kd->PdfsActive = src;
321                 KernelStatistics(kd, ld, cd, 0);
322         #endif
323
324         #ifdef _OPENMP
325                 #pragma omp parallel default(none) \
326                         shared(nFluid, nCells, kd, kdl, adjList, src, dst, \
327                         cd, maxIterations, ld, tmp, nTmpArray, \
328                         stderr )
329         #endif
330         {
331                 uint32_t adjListIndex;
332
333                 PdfT ux, uy, uz, ui;
334                 VPDFT vux, vuy, vuz, vui;
335
336                 #define X(name, idx, idxinv, x, y, z)   PdfT JOIN(pdf_,name);
337                 D3Q19_LIST
338                 #undef X
339                 VPDFT vpdf_a, vpdf_b;
340
341                 PdfT evenPart, oddPart, dir_indep_trm, dens;
342                 PdfT w_1_indep, w_2_indep;
343                 VPDFT vevenPart, voddPart;
344                 VPDFT vw_1_indep, vw_2_indep;
345
346                 int indexMax;
347
348                 PdfT * tmpArray;
349                 MemAlloc((void **)&tmpArray, sizeof(PdfT) * nTmpArray * N_TMP);
350
351                 int nThreads = 1;
352                 int threadId = 0;
353
354 #ifdef _OPENMP
355                 nThreads = omp_get_max_threads();
356                 threadId = omp_get_thread_num();
357 #endif
358
359                 int nCellsThread = nFluid / nThreads;
360                 int blIndexStart = threadId * nCellsThread;
361
362                 if (threadId < nFluid % nThreads) {
363                         blIndexStart += threadId;
364                         nCellsThread += 1;
365                 }
366                 else {
367                         blIndexStart += nFluid % nThreads;
368                 }
369
370                 int blIndexStop = blIndexStart + nCellsThread;
371
372                 // We have three loops:
373                 // 1. Peeling to ensure alignment for non-temporal stores in loop 2 is correct.
374                 // 2. Vectorized handling of nodes.
375                 // 3. Remaining nodes, less than vector size.
376
377                 unsigned long addrStart = (unsigned long)&(src[P_INDEX_3(nCells, blIndexStart, 0)]);
378                 int nCellsUnaligned = (VSIZE - (int)((addrStart / sizeof(PdfT)) % VSIZE)) % VSIZE;
379
380                 int nCellsVectorized = nCellsThread - nCellsUnaligned;
381                 nCellsVectorized = nCellsVectorized - (nCellsVectorized % VSIZE);
382
383                 int blIndexVec       = blIndexStart + nCellsUnaligned;
384                 int blIndexRemaining = blIndexStart + nCellsUnaligned + nCellsVectorized;
385
386                 // printf("%d [%d, %d, %d, %d[\n", threadId, blIndexStart, blIndexVec, blIndexRemaining, blIndexStop);
387
388                 for(int iter = 0; iter < maxIterations; ++iter) {
389
390 #if 1
391                         #define INDEX_START     blIndexStart
392                         #define INDEX_STOP  blIndexVec
393                         #include "BenchKernelD3Q19ListPullSplitNt2SScalar.h"
394
395                         #define INDEX_START blIndexVec
396                         #define INDEX_STOP  blIndexRemaining
397                         #include "BenchKernelD3Q19ListPullSplitNt2SIntrinsics.h"
398
399                         #define INDEX_START blIndexRemaining
400                         #define INDEX_STOP      blIndexStop
401                         #include "BenchKernelD3Q19ListPullSplitNt2SScalar.h"
402 #else
403                         #define INDEX_START blIndexStart
404                         #define INDEX_STOP      blIndexStop
405                         #include "BenchKernelD3Q19ListPullSplitNt2SScalar.h"
406 #endif
407                         #pragma omp barrier
408
409                         #pragma omp single
410                         {
411                                 #ifdef VERIFICATION
412                                         kd->PdfsActive = dst;
413                                         KernelAddBodyForce(kd, ld, cd);
414                                 #endif
415
416                                 #ifdef VTK_OUTPUT
417                                         if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) {
418                                                 kd->PdfsActive = dst;
419                                                 VtkWrite(ld, kd, cd, iter);
420                                         }
421                                 #endif
422
423                                 #ifdef STATISTICS
424                                         kd->PdfsActive = dst;
425                                         KernelStatistics(kd, ld, cd, iter);
426                                 #endif
427
428                                 // swap grids
429                                 tmp = src;
430                                 src = dst;
431                                 dst = tmp;
432                         }
433
434                         #pragma omp barrier
435
436                 } // for (int iter = 0; ...
437
438                 MemFree((void **)&tmpArray);
439         }
440
441 #ifdef VTK_OUTPUT
442         if (cd->VtkOutput) {
443                 kd->PdfsActive = src;
444                 VtkWrite(ld, kd, cd, maxIterations);
445         }
446 #endif
447
448 #ifdef STATISTICS
449         kd->PdfsActive = src;
450         KernelStatistics(kd, ld, cd, maxIterations);
451 #endif
452
453         return;
454 }
455
This page took 0.065256 seconds and 4 git commands to generate.