src/BenchKernelD3Q19ListPullSplitNt.c

   1 // --------------------------------------------------------------------------
   2 //
   3 // Copyright
   4 //   Markus Wittmann, 2016-2017
   5 //   RRZE, University of Erlangen-Nuremberg, Germany
   6 //   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
   7 //
   8 //   Viktor Haag, 2016
   9 //   LSS, University of Erlangen-Nuremberg, Germany
  10 //
  11 //  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
  12 //
  13 //  LbmBenchKernels is free software: you can redistribute it and/or modify
  14 //  it under the terms of the GNU General Public License as published by
  15 //  the Free Software Foundation, either version 3 of the License, or
  16 //  (at your option) any later version.
  17 //
  18 //  LbmBenchKernels is distributed in the hope that it will be useful,
  19 //  but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 //  GNU General Public License for more details.
  22 //
  23 //  You should have received a copy of the GNU General Public License
  24 //  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
  25 //
  26 // --------------------------------------------------------------------------
  27 #include "BenchKernelD3Q19ListPullSplitNtCommon.h"
  28
  29 #include "Memory.h"
  30 #include "Vtk.h"
  31 #include "Vector.h"
  32
  33 #include <inttypes.h>
  34 #include <math.h>
  35
  36 #ifdef _OPENMP
  37         #include <omp.h>
  38 #endif
  39
  40 #define TMP_UX 18
  41 #define TMP_UY 19
  42 #define TMP_UZ 20
  43 #define TMP_W1 21
  44 #define TMP_W2 22
  45
  46 #define N_TMP 23
  47
  48 #define TMP_INDEX(tmp_index, tmp_dir)   nTmpArray * (tmp_dir) + (tmp_index)
  49
  50 void FNAME(KernelPullSplitNt1S)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd)
  51 {
  52
  53         Assert(ld != NULL);
  54         Assert(kernelData != NULL);
  55         Assert(cd != NULL);
  56
  57         Assert(cd->Omega > 0.0);
  58         Assert(cd->Omega < 2.0);
  59
  60         KernelData        * kd   = (KernelData *)kernelData;
  61         KernelDataList    * kdl  = KDL(kernelData);
  62         KernelDataListRia * kdlr = KDLR(kernelData);
  63
  64         PdfT omega = cd->Omega;
  65         const PdfT omegaEven = omega;
  66
  67         PdfT magicParam = 1.0 / 12.0;
  68         const PdfT omegaOdd = 1.0 / (0.5 + magicParam / (1.0 / omega - 0.5));
  69
  70
  71         const PdfT w_0 = 1.0 /  3.0;
  72         const PdfT w_1 = 1.0 / 18.0;
  73         const PdfT w_2 = 1.0 / 36.0;
  74
  75         const PdfT w_1_x3 = w_1 * 3.0;  const PdfT w_1_nine_half = w_1 * 9.0 / 2.0;
  76         const PdfT w_2_x3 = w_2 * 3.0;  const PdfT w_2_nine_half = w_2 * 9.0 / 2.0;
  77
  78         const VPDFT vw_1_x3 = VSET(w_1_x3);
  79         const VPDFT vw_2_x3 = VSET(w_2_x3);
  80
  81         const VPDFT vw_1_nine_half = VSET(w_1_nine_half);
  82         const VPDFT vw_2_nine_half = VSET(w_2_nine_half);
  83
  84         const VPDFT vomegaEven = VSET(omegaEven);
  85         const VPDFT vomegaOdd  = VSET(omegaOdd);
  86
  87         const VPDFT voneHalf = VSET(0.5);
  88
  89         // uint32_t nConsecNodes = kdlr->nConsecNodes;
  90         // uint32_t * consecNodes = kdlr->ConsecNodes;
  91         // uint32_t consecIndex = 0;
  92         // uint32_t consecValue = 0;
  93
  94         PdfT * src = kd->Pdfs[0];
  95         PdfT * dst = kd->Pdfs[1];
  96         PdfT * tmp;
  97
  98         int maxIterations  = cd->MaxIterations;
  99
 100         int nFluid         = kdl->nFluid;
 101         int nCells         = kdl->nCells;
 102
 103         int nTmpArray      = kdlr->nTmpArray;
 104
 105         Assert(nTmpArray % VSIZE == 0);
 106
 107         uint32_t * adjList = kdl->AdjList;
 108
 109         #ifdef VTK_OUTPUT
 110                 if (cd->VtkOutput) {
 111                         kd->PdfsActive = src;
 112                         VtkWrite(ld, kd, cd, -1);
 113                 }
 114         #endif
 115
 116         #ifdef STATISTICS
 117                 kd->PdfsActive = src;
 118                 KernelStatistics(kd, ld, cd, 0);
 119         #endif
 120
 121         #ifdef _OPENMP
 122                 #pragma omp parallel default(none) \
 123                         shared(nFluid, nCells, kd, kdl, adjList, src, dst, \
 124                         cd, maxIterations, ld, tmp, nTmpArray, \
 125                         stderr )
 126         #endif
 127         {
 128                 uint32_t adjListIndex;
 129
 130                 PdfT ux, uy, uz, ui;
 131                 VPDFT vux, vuy, vuz, vui;
 132
 133                 #define X(name, idx, idxinv, x, y, z)   PdfT JOIN(pdf_,name);
 134                 D3Q19_LIST
 135                 #undef X
 136                 VPDFT vpdf_a, vpdf_b;
 137
 138                 PdfT evenPart, oddPart, dir_indep_trm, dens;
 139                 PdfT w_1_indep, w_2_indep;
 140                 VPDFT vevenPart, voddPart;
 141                 VPDFT vw_1_indep, vw_2_indep;
 142
 143                 int indexMax;
 144
 145                 PdfT * tmpArray;
 146                 MemAllocAligned((void **)&tmpArray, sizeof(PdfT) * nTmpArray * N_TMP, VSIZE * sizeof(PdfT));
 147
 148                 int nThreads = 1;
 149                 int threadId = 0;
 150
 151 #ifdef _OPENMP
 152                 nThreads = omp_get_max_threads();
 153                 threadId = omp_get_thread_num();
 154 #endif
 155
 156                 int nCellsThread = nFluid / nThreads;
 157                 int blIndexStart = threadId * nCellsThread;
 158
 159                 if (threadId < nFluid % nThreads) {
 160                         blIndexStart += threadId;
 161                         nCellsThread += 1;
 162                 }
 163                 else {
 164                         blIndexStart += nFluid % nThreads;
 165                 }
 166
 167                 int blIndexStop = blIndexStart + nCellsThread;
 168
 169                 // We have three loops:
 170                 // 1. Peeling to ensure alignment for non-temporal stores in loop 2 is correct.
 171                 // 2. Vectorized handling of nodes.
 172                 // 3. Remaining nodes, less than vector size.
 173
 174                 unsigned long addrStart = (unsigned long)&(src[P_INDEX_3(nCells, blIndexStart, 0)]);
 175                 int nCellsUnaligned = (VSIZE - (int)((addrStart / sizeof(PdfT)) % VSIZE)) % VSIZE;
 176
 177                 int nCellsVectorized = nCellsThread - nCellsUnaligned;
 178                 nCellsVectorized = nCellsVectorized - (nCellsVectorized % VSIZE);
 179
 180                 int blIndexVec       = blIndexStart + nCellsUnaligned;
 181                 int blIndexRemaining = blIndexStart + nCellsUnaligned + nCellsVectorized;
 182
 183                 // printf("%d [%d, %d, %d, %d[\n", threadId, blIndexStart, blIndexVec, blIndexRemaining, blIndexStop);
 184
 185                 for(int iter = 0; iter < maxIterations; ++iter) {
 186
 187 #if 1
 188                         #define INDEX_START     blIndexStart
 189                         #define INDEX_STOP  blIndexVec
 190                         #include "BenchKernelD3Q19ListPullSplitNt1SScalar.h"
 191
 192                         #define INDEX_START blIndexVec
 193                         #define INDEX_STOP  blIndexRemaining
 194                         #include "BenchKernelD3Q19ListPullSplitNt1SIntrinsics.h"
 195
 196                         #define INDEX_START blIndexRemaining
 197                         #define INDEX_STOP      blIndexStop
 198                         #include "BenchKernelD3Q19ListPullSplitNt1SScalar.h"
 199 #else
 200                         #define INDEX_START blIndexStart
 201                         #define INDEX_STOP      blIndexStop
 202                         #include "BenchKernelD3Q19ListPullSplitNt1SScalar.h"
 203 #endif
 204                         #pragma omp barrier
 205
 206                         #pragma omp single
 207                         {
 208                                 #ifdef VERIFICATION
 209                                         kd->PdfsActive = dst;
 210                                         KernelAddBodyForce(kd, ld, cd);
 211                                 #endif
 212
 213                                 #ifdef VTK_OUTPUT
 214                                         if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) {
 215                                                 kd->PdfsActive = dst;
 216                                                 VtkWrite(ld, kd, cd, iter);
 217                                         }
 218                                 #endif
 219
 220                                 #ifdef STATISTICS
 221                                         kd->PdfsActive = dst;
 222                                         KernelStatistics(kd, ld, cd, iter);
 223                                 #endif
 224
 225                                 // swap grids
 226                                 tmp = src;
 227                                 src = dst;
 228                                 dst = tmp;
 229                         }
 230
 231                         #pragma omp barrier
 232
 233                 } // for (int iter = 0; ...
 234
 235                 MemFree((void **)&tmpArray);
 236         }
 237
 238 #ifdef VTK_OUTPUT
 239         if (cd->VtkOutput) {
 240                 kd->PdfsActive = src;
 241                 VtkWrite(ld, kd, cd, maxIterations);
 242         }
 243 #endif
 244
 245 #ifdef STATISTICS
 246         kd->PdfsActive = src;
 247         KernelStatistics(kd, ld, cd, maxIterations);
 248 #endif
 249
 250         return;
 251 }
 252
 253 void FNAME(KernelPullSplitNt2S)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd)
 254 {
 255
 256         Assert(ld != NULL);
 257         Assert(kernelData != NULL);
 258         Assert(cd != NULL);
 259
 260         Assert(cd->Omega > 0.0);
 261         Assert(cd->Omega < 2.0);
 262
 263         KernelData        * kd   = (KernelData *)kernelData;
 264         KernelDataList    * kdl  = KDL(kernelData);
 265         KernelDataListRia * kdlr = KDLR(kernelData);
 266
 267         PdfT omega = cd->Omega;
 268         const PdfT omegaEven = omega;
 269
 270         PdfT magicParam = 1.0 / 12.0;
 271         const PdfT omegaOdd = 1.0 / (0.5 + magicParam / (1.0 / omega - 0.5));
 272
 273
 274         const PdfT w_0 = 1.0 /  3.0;
 275         const PdfT w_1 = 1.0 / 18.0;
 276         const PdfT w_2 = 1.0 / 36.0;
 277
 278         const PdfT w_1_x3 = w_1 * 3.0;  const PdfT w_1_nine_half = w_1 * 9.0 / 2.0;
 279         const PdfT w_2_x3 = w_2 * 3.0;  const PdfT w_2_nine_half = w_2 * 9.0 / 2.0;
 280
 281         const VPDFT vw_1_x3 = VSET(w_1_x3);
 282         const VPDFT vw_2_x3 = VSET(w_2_x3);
 283
 284         const VPDFT vw_1_nine_half = VSET(w_1_nine_half);
 285         const VPDFT vw_2_nine_half = VSET(w_2_nine_half);
 286
 287         const VPDFT vomegaEven = VSET(omegaEven);
 288         const VPDFT vomegaOdd  = VSET(omegaOdd);
 289
 290         const VPDFT voneHalf = VSET(0.5);
 291
 292         // uint32_t nConsecNodes = kdlr->nConsecNodes;
 293         // uint32_t * consecNodes = kdlr->ConsecNodes;
 294         // uint32_t consecIndex = 0;
 295         // uint32_t consecValue = 0;
 296
 297         PdfT * src = kd->Pdfs[0];
 298         PdfT * dst = kd->Pdfs[1];
 299         PdfT * tmp;
 300
 301         int maxIterations  = cd->MaxIterations;
 302
 303         int nFluid         = kdl->nFluid;
 304         int nCells         = kdl->nCells;
 305
 306         int nTmpArray      = kdlr->nTmpArray;
 307
 308         Assert(nTmpArray % VSIZE == 0);
 309
 310         uint32_t * adjList = kdl->AdjList;
 311
 312         #ifdef VTK_OUTPUT
 313                 if (cd->VtkOutput) {
 314                         kd->PdfsActive = src;
 315                         VtkWrite(ld, kd, cd, -1);
 316                 }
 317         #endif
 318
 319         #ifdef STATISTICS
 320                 kd->PdfsActive = src;
 321                 KernelStatistics(kd, ld, cd, 0);
 322         #endif
 323
 324         #ifdef _OPENMP
 325                 #pragma omp parallel default(none) \
 326                         shared(nFluid, nCells, kd, kdl, adjList, src, dst, \
 327                         cd, maxIterations, ld, tmp, nTmpArray, \
 328                         stderr )
 329         #endif
 330         {
 331                 uint32_t adjListIndex;
 332
 333                 PdfT ux, uy, uz, ui;
 334                 VPDFT vux, vuy, vuz, vui;
 335
 336                 #define X(name, idx, idxinv, x, y, z)   PdfT JOIN(pdf_,name);
 337                 D3Q19_LIST
 338                 #undef X
 339                 VPDFT vpdf_a, vpdf_b;
 340
 341                 PdfT evenPart, oddPart, dir_indep_trm, dens;
 342                 PdfT w_1_indep, w_2_indep;
 343                 VPDFT vevenPart, voddPart;
 344                 VPDFT vw_1_indep, vw_2_indep;
 345
 346                 int indexMax;
 347
 348                 PdfT * tmpArray;
 349                 MemAlloc((void **)&tmpArray, sizeof(PdfT) * nTmpArray * N_TMP);
 350
 351                 int nThreads = 1;
 352                 int threadId = 0;
 353
 354 #ifdef _OPENMP
 355                 nThreads = omp_get_max_threads();
 356                 threadId = omp_get_thread_num();
 357 #endif
 358
 359                 int nCellsThread = nFluid / nThreads;
 360                 int blIndexStart = threadId * nCellsThread;
 361
 362                 if (threadId < nFluid % nThreads) {
 363                         blIndexStart += threadId;
 364                         nCellsThread += 1;
 365                 }
 366                 else {
 367                         blIndexStart += nFluid % nThreads;
 368                 }
 369
 370                 int blIndexStop = blIndexStart + nCellsThread;
 371
 372                 // We have three loops:
 373                 // 1. Peeling to ensure alignment for non-temporal stores in loop 2 is correct.
 374                 // 2. Vectorized handling of nodes.
 375                 // 3. Remaining nodes, less than vector size.
 376
 377                 unsigned long addrStart = (unsigned long)&(src[P_INDEX_3(nCells, blIndexStart, 0)]);
 378                 int nCellsUnaligned = (VSIZE - (int)((addrStart / sizeof(PdfT)) % VSIZE)) % VSIZE;
 379
 380                 int nCellsVectorized = nCellsThread - nCellsUnaligned;
 381                 nCellsVectorized = nCellsVectorized - (nCellsVectorized % VSIZE);
 382
 383                 int blIndexVec       = blIndexStart + nCellsUnaligned;
 384                 int blIndexRemaining = blIndexStart + nCellsUnaligned + nCellsVectorized;
 385
 386                 // printf("%d [%d, %d, %d, %d[\n", threadId, blIndexStart, blIndexVec, blIndexRemaining, blIndexStop);
 387
 388                 for(int iter = 0; iter < maxIterations; ++iter) {
 389
 390 #if 1
 391                         #define INDEX_START     blIndexStart
 392                         #define INDEX_STOP  blIndexVec
 393                         #include "BenchKernelD3Q19ListPullSplitNt2SScalar.h"
 394
 395                         #define INDEX_START blIndexVec
 396                         #define INDEX_STOP  blIndexRemaining
 397                         #include "BenchKernelD3Q19ListPullSplitNt2SIntrinsics.h"
 398
 399                         #define INDEX_START blIndexRemaining
 400                         #define INDEX_STOP      blIndexStop
 401                         #include "BenchKernelD3Q19ListPullSplitNt2SScalar.h"
 402 #else
 403                         #define INDEX_START blIndexStart
 404                         #define INDEX_STOP      blIndexStop
 405                         #include "BenchKernelD3Q19ListPullSplitNt2SScalar.h"
 406 #endif
 407                         #pragma omp barrier
 408
 409                         #pragma omp single
 410                         {
 411                                 #ifdef VERIFICATION
 412                                         kd->PdfsActive = dst;
 413                                         KernelAddBodyForce(kd, ld, cd);
 414                                 #endif
 415
 416                                 #ifdef VTK_OUTPUT
 417                                         if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) {
 418                                                 kd->PdfsActive = dst;
 419                                                 VtkWrite(ld, kd, cd, iter);
 420                                         }
 421                                 #endif
 422
 423                                 #ifdef STATISTICS
 424                                         kd->PdfsActive = dst;
 425                                         KernelStatistics(kd, ld, cd, iter);
 426                                 #endif
 427
 428                                 // swap grids
 429                                 tmp = src;
 430                                 src = dst;
 431                                 dst = tmp;
 432                         }
 433
 434                         #pragma omp barrier
 435
 436                 } // for (int iter = 0; ...
 437
 438                 MemFree((void **)&tmpArray);
 439         }
 440
 441 #ifdef VTK_OUTPUT
 442         if (cd->VtkOutput) {
 443                 kd->PdfsActive = src;
 444                 VtkWrite(ld, kd, cd, maxIterations);
 445         }
 446 #endif
 447
 448 #ifdef STATISTICS
 449         kd->PdfsActive = src;
 450         KernelStatistics(kd, ld, cd, maxIterations);
 451 #endif
 452
 453         return;
 454 }
 455