src/BenchKernelD3Q19ListPullSplitNt.c

   1 // --------------------------------------------------------------------------
   2 //
   3 // Copyright
   4 //   Markus Wittmann, 2016-2017
   5 //   RRZE, University of Erlangen-Nuremberg, Germany
   6 //   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
   7 //
   8 //   Viktor Haag, 2016
   9 //   LSS, University of Erlangen-Nuremberg, Germany
  10 //
  11 //  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
  12 //
  13 //  LbmBenchKernels is free software: you can redistribute it and/or modify
  14 //  it under the terms of the GNU General Public License as published by
  15 //  the Free Software Foundation, either version 3 of the License, or
  16 //  (at your option) any later version.
  17 //
  18 //  LbmBenchKernels is distributed in the hope that it will be useful,
  19 //  but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 //  GNU General Public License for more details.
  22 //
  23 //  You should have received a copy of the GNU General Public License
  24 //  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
  25 //
  26 // --------------------------------------------------------------------------
  27 #include "BenchKernelD3Q19ListPullSplitNtCommon.h"
  28
  29 #include "Memory.h"
  30 #include "Vtk.h"
  31 #include "Vector.h"
  32 #include "LikwidIf.h"
  33
  34 #include <inttypes.h>
  35 #include <math.h>
  36
  37 #ifdef _OPENMP
  38         #include <omp.h>
  39 #endif
  40
  41 #define TMP_UX 18
  42 #define TMP_UY 19
  43 #define TMP_UZ 20
  44 #define TMP_W1 21
  45 #define TMP_W2 22
  46
  47 #define N_TMP 23
  48
  49 #define TMP_INDEX(tmp_index, tmp_dir)   nTmpArray * (tmp_dir) + (tmp_index)
  50
  51 void FNAME(KernelPullSplitNt1S)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd)
  52 {
  53
  54         Assert(ld != NULL);
  55         Assert(kernelData != NULL);
  56         Assert(cd != NULL);
  57
  58         Assert(cd->Omega > F(0.0));
  59         Assert(cd->Omega < F(2.0));
  60
  61         KernelData        * kd   = (KernelData *)kernelData;
  62         KernelDataList    * kdl  = KDL(kernelData);
  63         KernelDataListRia * kdlr = KDLR(kernelData);
  64
  65         PdfT omega = cd->Omega;
  66         const PdfT omegaEven = omega;
  67
  68         PdfT magicParam = F(1.0) / F(12.0);
  69         const PdfT omegaOdd = F(1.0) / (F(0.5) + magicParam / (F(1.0) / omega - F(0.5)));
  70
  71
  72         const PdfT w_0 = F(1.0) / F( 3.0);
  73         const PdfT w_1 = F(1.0) / F(18.0);
  74         const PdfT w_2 = F(1.0) / F(36.0);
  75
  76         const PdfT w_1_x3 = w_1 * F(3.0);       const PdfT w_1_nine_half = w_1 * F(9.0) / F(2.0);
  77         const PdfT w_2_x3 = w_2 * F(3.0);       const PdfT w_2_nine_half = w_2 * F(9.0) / F(2.0);
  78
  79         const VPDFT vw_1_x3 = VSET(w_1_x3);
  80         const VPDFT vw_2_x3 = VSET(w_2_x3);
  81
  82         const VPDFT vw_1_nine_half = VSET(w_1_nine_half);
  83         const VPDFT vw_2_nine_half = VSET(w_2_nine_half);
  84
  85         const VPDFT vomegaEven = VSET(omegaEven);
  86         const VPDFT vomegaOdd  = VSET(omegaOdd);
  87
  88         const VPDFT voneHalf = VSET(F(0.5));
  89
  90         // uint32_t nConsecNodes = kdlr->nConsecNodes;
  91         // uint32_t * consecNodes = kdlr->ConsecNodes;
  92         // uint32_t consecIndex = 0;
  93         // uint32_t consecValue = 0;
  94
  95         PdfT * src = kd->Pdfs[0];
  96         PdfT * dst = kd->Pdfs[1];
  97         PdfT * tmp;
  98
  99         int maxIterations  = cd->MaxIterations;
 100
 101         int nFluid         = kdl->nFluid;
 102         int nCells         = kdl->nCells;
 103
 104         int nTmpArray      = kdlr->nTmpArray;
 105
 106         Assert(nTmpArray % VSIZE == 0);
 107
 108         uint32_t * adjList = kdl->AdjList;
 109
 110         #ifdef VTK_OUTPUT
 111                 if (cd->VtkOutput) {
 112                         kd->PdfsActive = src;
 113                         VtkWrite(ld, kd, cd, -1);
 114                 }
 115         #endif
 116
 117         #ifdef STATISTICS
 118                 kd->PdfsActive = src;
 119                 KernelStatistics(kd, ld, cd, 0);
 120         #endif
 121
 122         X_KERNEL_START(kernelData);
 123
 124         X_LIKWID_START("list-pull-split-nt-1s");
 125
 126         #ifdef _OPENMP
 127                 #pragma omp parallel default(none) \
 128                         shared(nFluid, nCells, kd, kdl, adjList, src, dst, \
 129                         cd, maxIterations, ld, tmp, nTmpArray, \
 130                         stderr )
 131         #endif
 132         {
 133                 uint32_t adjListIndex;
 134
 135                 PdfT ux, uy, uz, ui;
 136                 VPDFT vux, vuy, vuz, vui;
 137
 138                 #define X(name, idx, idxinv, x, y, z)   PdfT JOIN(pdf_,name);
 139                 D3Q19_LIST
 140                 #undef X
 141                 VPDFT vpdf_a, vpdf_b;
 142
 143                 PdfT evenPart, oddPart, dir_indep_trm, dens;
 144                 PdfT w_1_indep, w_2_indep;
 145                 VPDFT vevenPart, voddPart;
 146                 VPDFT vw_1_indep, vw_2_indep;
 147
 148                 int indexMax;
 149
 150                 PdfT * tmpArray;
 151                 MemAllocAligned((void **)&tmpArray, sizeof(PdfT) * nTmpArray * N_TMP, VSIZE * sizeof(PdfT));
 152
 153                 int nThreads = 1;
 154                 int threadId = 0;
 155
 156 #ifdef _OPENMP
 157                 nThreads = omp_get_max_threads();
 158                 threadId = omp_get_thread_num();
 159 #endif
 160
 161                 int nCellsThread = nFluid / nThreads;
 162                 int blIndexStart = threadId * nCellsThread;
 163
 164                 if (threadId < nFluid % nThreads) {
 165                         blIndexStart += threadId;
 166                         nCellsThread += 1;
 167                 }
 168                 else {
 169                         blIndexStart += nFluid % nThreads;
 170                 }
 171
 172                 int blIndexStop = blIndexStart + nCellsThread;
 173
 174                 // We have three loops:
 175                 // 1. Peeling to ensure alignment for non-temporal stores in loop 2 is correct.
 176                 // 2. Vectorized handling of nodes.
 177                 // 3. Remaining nodes, less than vector size.
 178
 179                 unsigned long addrStart = (unsigned long)&(src[P_INDEX_3(nCells, blIndexStart, 0)]);
 180                 int nCellsUnaligned = (VSIZE - (int)((addrStart / sizeof(PdfT)) % VSIZE)) % VSIZE;
 181
 182                 int nCellsVectorized = nCellsThread - nCellsUnaligned;
 183                 nCellsVectorized = nCellsVectorized - (nCellsVectorized % VSIZE);
 184
 185                 int blIndexVec       = blIndexStart + nCellsUnaligned;
 186                 int blIndexRemaining = blIndexStart + nCellsUnaligned + nCellsVectorized;
 187
 188                 // printf("%d [%d, %d, %d, %d[\n", threadId, blIndexStart, blIndexVec, blIndexRemaining, blIndexStop);
 189
 190                 for(int iter = 0; iter < maxIterations; ++iter) {
 191
 192
 193 #if 1
 194                         #define INDEX_START     blIndexStart
 195                         #define INDEX_STOP  blIndexVec
 196                         #include "BenchKernelD3Q19ListPullSplitNt1SScalar.h"
 197
 198                         #define INDEX_START blIndexVec
 199                         #define INDEX_STOP  blIndexRemaining
 200                         #include "BenchKernelD3Q19ListPullSplitNt1SIntrinsics.h"
 201
 202                         #define INDEX_START blIndexRemaining
 203                         #define INDEX_STOP      blIndexStop
 204                         #include "BenchKernelD3Q19ListPullSplitNt1SScalar.h"
 205 #else
 206                         #define INDEX_START blIndexStart
 207                         #define INDEX_STOP      blIndexStop
 208                         #include "BenchKernelD3Q19ListPullSplitNt1SScalar.h"
 209 #endif
 210
 211
 212                         #pragma omp barrier
 213
 214                         #pragma omp single
 215                         {
 216                                 #ifdef VERIFICATION
 217                                         kd->PdfsActive = dst;
 218                                         KernelAddBodyForce(kd, ld, cd);
 219                                 #endif
 220
 221                                 #ifdef VTK_OUTPUT
 222                                         if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) {
 223                                                 kd->PdfsActive = dst;
 224                                                 VtkWrite(ld, kd, cd, iter);
 225                                         }
 226                                 #endif
 227
 228                                 #ifdef STATISTICS
 229                                         kd->PdfsActive = dst;
 230                                         KernelStatistics(kd, ld, cd, iter);
 231                                 #endif
 232
 233                                 // swap grids
 234                                 tmp = src;
 235                                 src = dst;
 236                                 dst = tmp;
 237                         }
 238
 239                         #pragma omp barrier
 240
 241                 } // for (int iter = 0; ...
 242
 243                 MemFree((void **)&tmpArray);
 244         }
 245
 246
 247         X_LIKWID_STOP("list-pull-split-nt-1s");
 248
 249         X_KERNEL_END(kernelData);
 250
 251 #ifdef VTK_OUTPUT
 252         if (cd->VtkOutput) {
 253                 kd->PdfsActive = src;
 254                 VtkWrite(ld, kd, cd, maxIterations);
 255         }
 256 #endif
 257
 258 #ifdef STATISTICS
 259         kd->PdfsActive = src;
 260         KernelStatistics(kd, ld, cd, maxIterations);
 261 #endif
 262
 263         return;
 264 }
 265
 266 void FNAME(KernelPullSplitNt2S)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd)
 267 {
 268
 269         Assert(ld != NULL);
 270         Assert(kernelData != NULL);
 271         Assert(cd != NULL);
 272
 273         Assert(cd->Omega > F(0.0));
 274         Assert(cd->Omega < F(2.0));
 275
 276         KernelData        * kd   = (KernelData *)kernelData;
 277         KernelDataList    * kdl  = KDL(kernelData);
 278         KernelDataListRia * kdlr = KDLR(kernelData);
 279
 280         PdfT omega = cd->Omega;
 281         const PdfT omegaEven = omega;
 282
 283         PdfT magicParam = F(1.0) / F(12.0);
 284         const PdfT omegaOdd = F(1.0) / (F(0.5) + magicParam / (F(1.0) / omega - F(0.5)));
 285
 286         const PdfT w_0 = F(1.0) / F( 3.0);
 287         const PdfT w_1 = F(1.0) / F(18.0);
 288         const PdfT w_2 = F(1.0) / F(36.0);
 289
 290         const PdfT w_1_x3 = w_1 * F(3.0);       const PdfT w_1_nine_half = w_1 * F(9.0) / F(2.0);
 291         const PdfT w_2_x3 = w_2 * F(3.0);       const PdfT w_2_nine_half = w_2 * F(9.0) / F(2.0);
 292
 293         const VPDFT vw_1_x3 = VSET(w_1_x3);
 294         const VPDFT vw_2_x3 = VSET(w_2_x3);
 295
 296         const VPDFT vw_1_nine_half = VSET(w_1_nine_half);
 297         const VPDFT vw_2_nine_half = VSET(w_2_nine_half);
 298
 299         const VPDFT vomegaEven = VSET(omegaEven);
 300         const VPDFT vomegaOdd  = VSET(omegaOdd);
 301
 302         const VPDFT voneHalf = VSET(F(0.5));
 303
 304         // uint32_t nConsecNodes = kdlr->nConsecNodes;
 305         // uint32_t * consecNodes = kdlr->ConsecNodes;
 306         // uint32_t consecIndex = 0;
 307         // uint32_t consecValue = 0;
 308
 309         PdfT * src = kd->Pdfs[0];
 310         PdfT * dst = kd->Pdfs[1];
 311         PdfT * tmp;
 312
 313         int maxIterations  = cd->MaxIterations;
 314
 315         int nFluid         = kdl->nFluid;
 316         int nCells         = kdl->nCells;
 317
 318         int nTmpArray      = kdlr->nTmpArray;
 319
 320         Assert(nTmpArray % VSIZE == 0);
 321
 322         uint32_t * adjList = kdl->AdjList;
 323
 324         #ifdef VTK_OUTPUT
 325                 if (cd->VtkOutput) {
 326                         kd->PdfsActive = src;
 327                         VtkWrite(ld, kd, cd, -1);
 328                 }
 329         #endif
 330
 331         #ifdef STATISTICS
 332                 kd->PdfsActive = src;
 333                 KernelStatistics(kd, ld, cd, 0);
 334         #endif
 335
 336
 337         X_KERNEL_START(kernelData);
 338
 339         X_LIKWID_START("list-pull-split-nt-2s");
 340
 341
 342         #ifdef _OPENMP
 343                 #pragma omp parallel default(none) \
 344                         shared(nFluid, nCells, kd, kdl, adjList, src, dst, \
 345                         cd, maxIterations, ld, tmp, nTmpArray, \
 346                         stderr )
 347         #endif
 348         {
 349                 uint32_t adjListIndex;
 350
 351                 PdfT ux, uy, uz, ui;
 352                 VPDFT vux, vuy, vuz, vui;
 353
 354                 #define X(name, idx, idxinv, x, y, z)   PdfT JOIN(pdf_,name);
 355                 D3Q19_LIST
 356                 #undef X
 357                 VPDFT vpdf_a, vpdf_b;
 358
 359                 PdfT evenPart, oddPart, dir_indep_trm, dens;
 360                 PdfT w_1_indep, w_2_indep;
 361                 VPDFT vevenPart, voddPart;
 362                 VPDFT vw_1_indep, vw_2_indep;
 363
 364                 int indexMax;
 365
 366                 PdfT * tmpArray;
 367                 MemAlloc((void **)&tmpArray, sizeof(PdfT) * nTmpArray * N_TMP);
 368
 369                 int nThreads = 1;
 370                 int threadId = 0;
 371
 372 #ifdef _OPENMP
 373                 nThreads = omp_get_max_threads();
 374                 threadId = omp_get_thread_num();
 375 #endif
 376
 377                 int nCellsThread = nFluid / nThreads;
 378                 int blIndexStart = threadId * nCellsThread;
 379
 380                 if (threadId < nFluid % nThreads) {
 381                         blIndexStart += threadId;
 382                         nCellsThread += 1;
 383                 }
 384                 else {
 385                         blIndexStart += nFluid % nThreads;
 386                 }
 387
 388                 int blIndexStop = blIndexStart + nCellsThread;
 389
 390                 // We have three loops:
 391                 // 1. Peeling to ensure alignment for non-temporal stores in loop 2 is correct.
 392                 // 2. Vectorized handling of nodes.
 393                 // 3. Remaining nodes, less than vector size.
 394
 395                 unsigned long addrStart = (unsigned long)&(src[P_INDEX_3(nCells, blIndexStart, 0)]);
 396                 int nCellsUnaligned = (VSIZE - (int)((addrStart / sizeof(PdfT)) % VSIZE)) % VSIZE;
 397
 398                 int nCellsVectorized = nCellsThread - nCellsUnaligned;
 399                 nCellsVectorized = nCellsVectorized - (nCellsVectorized % VSIZE);
 400
 401                 int blIndexVec       = blIndexStart + nCellsUnaligned;
 402                 int blIndexRemaining = blIndexStart + nCellsUnaligned + nCellsVectorized;
 403
 404                 // printf("%d [%d, %d, %d, %d[\n", threadId, blIndexStart, blIndexVec, blIndexRemaining, blIndexStop);
 405
 406                 for(int iter = 0; iter < maxIterations; ++iter) {
 407
 408 #if 1
 409                         #define INDEX_START     blIndexStart
 410                         #define INDEX_STOP  blIndexVec
 411                         #include "BenchKernelD3Q19ListPullSplitNt2SScalar.h"
 412
 413                         #define INDEX_START blIndexVec
 414                         #define INDEX_STOP  blIndexRemaining
 415                         #include "BenchKernelD3Q19ListPullSplitNt2SIntrinsics.h"
 416
 417                         #define INDEX_START blIndexRemaining
 418                         #define INDEX_STOP      blIndexStop
 419                         #include "BenchKernelD3Q19ListPullSplitNt2SScalar.h"
 420 #else
 421                         #define INDEX_START blIndexStart
 422                         #define INDEX_STOP      blIndexStop
 423                         #include "BenchKernelD3Q19ListPullSplitNt2SScalar.h"
 424 #endif
 425                         #pragma omp barrier
 426
 427
 428                         #pragma omp single
 429                         {
 430                                 #ifdef VERIFICATION
 431                                         kd->PdfsActive = dst;
 432                                         KernelAddBodyForce(kd, ld, cd);
 433                                 #endif
 434
 435                                 #ifdef VTK_OUTPUT
 436                                         if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) {
 437                                                 kd->PdfsActive = dst;
 438                                                 VtkWrite(ld, kd, cd, iter);
 439                                         }
 440                                 #endif
 441
 442                                 #ifdef STATISTICS
 443                                         kd->PdfsActive = dst;
 444                                         KernelStatistics(kd, ld, cd, iter);
 445                                 #endif
 446
 447                                 // swap grids
 448                                 tmp = src;
 449                                 src = dst;
 450                                 dst = tmp;
 451                         }
 452
 453                         #pragma omp barrier
 454
 455                 } // for (int iter = 0; ...
 456
 457                 MemFree((void **)&tmpArray);
 458         }
 459
 460         X_LIKWID_STOP("list-pull-split-nt-2s");
 461
 462         X_KERNEL_END(kernelData);
 463
 464 #ifdef VTK_OUTPUT
 465         if (cd->VtkOutput) {
 466                 kd->PdfsActive = src;
 467                 VtkWrite(ld, kd, cd, maxIterations);
 468         }
 469 #endif
 470
 471 #ifdef STATISTICS
 472         kd->PdfsActive = src;
 473         KernelStatistics(kd, ld, cd, maxIterations);
 474 #endif
 475
 476         return;
 477 }
 478