X-Git-Url: http://git.rrze.uni-erlangen.de/gitweb/?p=LbmBenchmarkKernelsPublic.git;a=blobdiff_plain;f=src%2FBenchKernelD3Q19ListPullSplitNt.c;fp=src%2FBenchKernelD3Q19ListPullSplitNt.c;h=dfab54afbdc574c3936d146e28e9925e479a1088;hp=0000000000000000000000000000000000000000;hb=109880839321408644c94a34eb31208460b9f46d;hpb=42cf91486fb5c1ad178b3d21935a1be563e5fa39 diff --git a/src/BenchKernelD3Q19ListPullSplitNt.c b/src/BenchKernelD3Q19ListPullSplitNt.c new file mode 100644 index 0000000..dfab54a --- /dev/null +++ b/src/BenchKernelD3Q19ListPullSplitNt.c @@ -0,0 +1,455 @@ +// -------------------------------------------------------------------------- +// +// Copyright +// Markus Wittmann, 2016-2017 +// RRZE, University of Erlangen-Nuremberg, Germany +// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de +// +// Viktor Haag, 2016 +// LSS, University of Erlangen-Nuremberg, Germany +// +// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). +// +// LbmBenchKernels is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// LbmBenchKernels is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with LbmBenchKernels. If not, see . +// +// -------------------------------------------------------------------------- +#include "BenchKernelD3Q19ListPullSplitNtCommon.h" + +#include "Memory.h" +#include "Vtk.h" +#include "Vector.h" + +#include +#include + +#ifdef _OPENMP + #include +#endif + +#define TMP_UX 18 +#define TMP_UY 19 +#define TMP_UZ 20 +#define TMP_W1 21 +#define TMP_W2 22 + +#define N_TMP 23 + +#define TMP_INDEX(tmp_index, tmp_dir) nTmpArray * (tmp_dir) + (tmp_index) + +void FNAME(KernelPullSplitNt1S)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd) +{ + + Assert(ld != NULL); + Assert(kernelData != NULL); + Assert(cd != NULL); + + Assert(cd->Omega > 0.0); + Assert(cd->Omega < 2.0); + + KernelData * kd = (KernelData *)kernelData; + KernelDataList * kdl = KDL(kernelData); + KernelDataListRia * kdlr = KDLR(kernelData); + + PdfT omega = cd->Omega; + const PdfT omegaEven = omega; + + PdfT magicParam = 1.0 / 12.0; + const PdfT omegaOdd = 1.0 / (0.5 + magicParam / (1.0 / omega - 0.5)); + + + const PdfT w_0 = 1.0 / 3.0; + const PdfT w_1 = 1.0 / 18.0; + const PdfT w_2 = 1.0 / 36.0; + + const PdfT w_1_x3 = w_1 * 3.0; const PdfT w_1_nine_half = w_1 * 9.0 / 2.0; + const PdfT w_2_x3 = w_2 * 3.0; const PdfT w_2_nine_half = w_2 * 9.0 / 2.0; + + const VPDFT vw_1_x3 = VSET(w_1_x3); + const VPDFT vw_2_x3 = VSET(w_2_x3); + + const VPDFT vw_1_nine_half = VSET(w_1_nine_half); + const VPDFT vw_2_nine_half = VSET(w_2_nine_half); + + const VPDFT vomegaEven = VSET(omegaEven); + const VPDFT vomegaOdd = VSET(omegaOdd); + + const VPDFT voneHalf = VSET(0.5); + + // uint32_t nConsecNodes = kdlr->nConsecNodes; + // uint32_t * consecNodes = kdlr->ConsecNodes; + // uint32_t consecIndex = 0; + // uint32_t consecValue = 0; + + PdfT * src = kd->Pdfs[0]; + PdfT * dst = kd->Pdfs[1]; + PdfT * tmp; + + int maxIterations = cd->MaxIterations; + + int nFluid = kdl->nFluid; + int nCells = kdl->nCells; + + int nTmpArray = kdlr->nTmpArray; + + Assert(nTmpArray % VSIZE == 0); + + uint32_t * adjList = kdl->AdjList; + + #ifdef VTK_OUTPUT + if (cd->VtkOutput) { + kd->PdfsActive = src; + VtkWrite(ld, kd, cd, -1); + } + #endif + + #ifdef STATISTICS + kd->PdfsActive = src; + KernelStatistics(kd, ld, cd, 0); + #endif + + #ifdef _OPENMP + #pragma omp parallel default(none) \ + shared(nFluid, nCells, kd, kdl, adjList, src, dst, \ + cd, maxIterations, ld, tmp, nTmpArray, \ + stderr ) + #endif + { + uint32_t adjListIndex; + + PdfT ux, uy, uz, ui; + VPDFT vux, vuy, vuz, vui; + + #define X(name, idx, idxinv, x, y, z) PdfT JOIN(pdf_,name); + D3Q19_LIST + #undef X + VPDFT vpdf_a, vpdf_b; + + PdfT evenPart, oddPart, dir_indep_trm, dens; + PdfT w_1_indep, w_2_indep; + VPDFT vevenPart, voddPart; + VPDFT vw_1_indep, vw_2_indep; + + int indexMax; + + PdfT * tmpArray; + MemAllocAligned((void **)&tmpArray, sizeof(PdfT) * nTmpArray * N_TMP, VSIZE * sizeof(PdfT)); + + int nThreads = 1; + int threadId = 0; + +#ifdef _OPENMP + nThreads = omp_get_max_threads(); + threadId = omp_get_thread_num(); +#endif + + int nCellsThread = nFluid / nThreads; + int blIndexStart = threadId * nCellsThread; + + if (threadId < nFluid % nThreads) { + blIndexStart += threadId; + nCellsThread += 1; + } + else { + blIndexStart += nFluid % nThreads; + } + + int blIndexStop = blIndexStart + nCellsThread; + + // We have three loops: + // 1. Peeling to ensure alignment for non-temporal stores in loop 2 is correct. + // 2. Vectorized handling of nodes. + // 3. Remaining nodes, less than vector size. + + unsigned long addrStart = (unsigned long)&(src[P_INDEX_3(nCells, blIndexStart, 0)]); + int nCellsUnaligned = (VSIZE - (int)((addrStart / sizeof(PdfT)) % VSIZE)) % VSIZE; + + int nCellsVectorized = nCellsThread - nCellsUnaligned; + nCellsVectorized = nCellsVectorized - (nCellsVectorized % VSIZE); + + int blIndexVec = blIndexStart + nCellsUnaligned; + int blIndexRemaining = blIndexStart + nCellsUnaligned + nCellsVectorized; + + // printf("%d [%d, %d, %d, %d[\n", threadId, blIndexStart, blIndexVec, blIndexRemaining, blIndexStop); + + for(int iter = 0; iter < maxIterations; ++iter) { + +#if 1 + #define INDEX_START blIndexStart + #define INDEX_STOP blIndexVec + #include "BenchKernelD3Q19ListPullSplitNt1SScalar.h" + + #define INDEX_START blIndexVec + #define INDEX_STOP blIndexRemaining + #include "BenchKernelD3Q19ListPullSplitNt1SIntrinsics.h" + + #define INDEX_START blIndexRemaining + #define INDEX_STOP blIndexStop + #include "BenchKernelD3Q19ListPullSplitNt1SScalar.h" +#else + #define INDEX_START blIndexStart + #define INDEX_STOP blIndexStop + #include "BenchKernelD3Q19ListPullSplitNt1SScalar.h" +#endif + #pragma omp barrier + + #pragma omp single + { + #ifdef VERIFICATION + kd->PdfsActive = dst; + KernelAddBodyForce(kd, ld, cd); + #endif + + #ifdef VTK_OUTPUT + if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) { + kd->PdfsActive = dst; + VtkWrite(ld, kd, cd, iter); + } + #endif + + #ifdef STATISTICS + kd->PdfsActive = dst; + KernelStatistics(kd, ld, cd, iter); + #endif + + // swap grids + tmp = src; + src = dst; + dst = tmp; + } + + #pragma omp barrier + + } // for (int iter = 0; ... + + MemFree((void **)&tmpArray); + } + +#ifdef VTK_OUTPUT + if (cd->VtkOutput) { + kd->PdfsActive = src; + VtkWrite(ld, kd, cd, maxIterations); + } +#endif + +#ifdef STATISTICS + kd->PdfsActive = src; + KernelStatistics(kd, ld, cd, maxIterations); +#endif + + return; +} + +void FNAME(KernelPullSplitNt2S)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd) +{ + + Assert(ld != NULL); + Assert(kernelData != NULL); + Assert(cd != NULL); + + Assert(cd->Omega > 0.0); + Assert(cd->Omega < 2.0); + + KernelData * kd = (KernelData *)kernelData; + KernelDataList * kdl = KDL(kernelData); + KernelDataListRia * kdlr = KDLR(kernelData); + + PdfT omega = cd->Omega; + const PdfT omegaEven = omega; + + PdfT magicParam = 1.0 / 12.0; + const PdfT omegaOdd = 1.0 / (0.5 + magicParam / (1.0 / omega - 0.5)); + + + const PdfT w_0 = 1.0 / 3.0; + const PdfT w_1 = 1.0 / 18.0; + const PdfT w_2 = 1.0 / 36.0; + + const PdfT w_1_x3 = w_1 * 3.0; const PdfT w_1_nine_half = w_1 * 9.0 / 2.0; + const PdfT w_2_x3 = w_2 * 3.0; const PdfT w_2_nine_half = w_2 * 9.0 / 2.0; + + const VPDFT vw_1_x3 = VSET(w_1_x3); + const VPDFT vw_2_x3 = VSET(w_2_x3); + + const VPDFT vw_1_nine_half = VSET(w_1_nine_half); + const VPDFT vw_2_nine_half = VSET(w_2_nine_half); + + const VPDFT vomegaEven = VSET(omegaEven); + const VPDFT vomegaOdd = VSET(omegaOdd); + + const VPDFT voneHalf = VSET(0.5); + + // uint32_t nConsecNodes = kdlr->nConsecNodes; + // uint32_t * consecNodes = kdlr->ConsecNodes; + // uint32_t consecIndex = 0; + // uint32_t consecValue = 0; + + PdfT * src = kd->Pdfs[0]; + PdfT * dst = kd->Pdfs[1]; + PdfT * tmp; + + int maxIterations = cd->MaxIterations; + + int nFluid = kdl->nFluid; + int nCells = kdl->nCells; + + int nTmpArray = kdlr->nTmpArray; + + Assert(nTmpArray % VSIZE == 0); + + uint32_t * adjList = kdl->AdjList; + + #ifdef VTK_OUTPUT + if (cd->VtkOutput) { + kd->PdfsActive = src; + VtkWrite(ld, kd, cd, -1); + } + #endif + + #ifdef STATISTICS + kd->PdfsActive = src; + KernelStatistics(kd, ld, cd, 0); + #endif + + #ifdef _OPENMP + #pragma omp parallel default(none) \ + shared(nFluid, nCells, kd, kdl, adjList, src, dst, \ + cd, maxIterations, ld, tmp, nTmpArray, \ + stderr ) + #endif + { + uint32_t adjListIndex; + + PdfT ux, uy, uz, ui; + VPDFT vux, vuy, vuz, vui; + + #define X(name, idx, idxinv, x, y, z) PdfT JOIN(pdf_,name); + D3Q19_LIST + #undef X + VPDFT vpdf_a, vpdf_b; + + PdfT evenPart, oddPart, dir_indep_trm, dens; + PdfT w_1_indep, w_2_indep; + VPDFT vevenPart, voddPart; + VPDFT vw_1_indep, vw_2_indep; + + int indexMax; + + PdfT * tmpArray; + MemAlloc((void **)&tmpArray, sizeof(PdfT) * nTmpArray * N_TMP); + + int nThreads = 1; + int threadId = 0; + +#ifdef _OPENMP + nThreads = omp_get_max_threads(); + threadId = omp_get_thread_num(); +#endif + + int nCellsThread = nFluid / nThreads; + int blIndexStart = threadId * nCellsThread; + + if (threadId < nFluid % nThreads) { + blIndexStart += threadId; + nCellsThread += 1; + } + else { + blIndexStart += nFluid % nThreads; + } + + int blIndexStop = blIndexStart + nCellsThread; + + // We have three loops: + // 1. Peeling to ensure alignment for non-temporal stores in loop 2 is correct. + // 2. Vectorized handling of nodes. + // 3. Remaining nodes, less than vector size. + + unsigned long addrStart = (unsigned long)&(src[P_INDEX_3(nCells, blIndexStart, 0)]); + int nCellsUnaligned = (VSIZE - (int)((addrStart / sizeof(PdfT)) % VSIZE)) % VSIZE; + + int nCellsVectorized = nCellsThread - nCellsUnaligned; + nCellsVectorized = nCellsVectorized - (nCellsVectorized % VSIZE); + + int blIndexVec = blIndexStart + nCellsUnaligned; + int blIndexRemaining = blIndexStart + nCellsUnaligned + nCellsVectorized; + + // printf("%d [%d, %d, %d, %d[\n", threadId, blIndexStart, blIndexVec, blIndexRemaining, blIndexStop); + + for(int iter = 0; iter < maxIterations; ++iter) { + +#if 1 + #define INDEX_START blIndexStart + #define INDEX_STOP blIndexVec + #include "BenchKernelD3Q19ListPullSplitNt2SScalar.h" + + #define INDEX_START blIndexVec + #define INDEX_STOP blIndexRemaining + #include "BenchKernelD3Q19ListPullSplitNt2SIntrinsics.h" + + #define INDEX_START blIndexRemaining + #define INDEX_STOP blIndexStop + #include "BenchKernelD3Q19ListPullSplitNt2SScalar.h" +#else + #define INDEX_START blIndexStart + #define INDEX_STOP blIndexStop + #include "BenchKernelD3Q19ListPullSplitNt2SScalar.h" +#endif + #pragma omp barrier + + #pragma omp single + { + #ifdef VERIFICATION + kd->PdfsActive = dst; + KernelAddBodyForce(kd, ld, cd); + #endif + + #ifdef VTK_OUTPUT + if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) { + kd->PdfsActive = dst; + VtkWrite(ld, kd, cd, iter); + } + #endif + + #ifdef STATISTICS + kd->PdfsActive = dst; + KernelStatistics(kd, ld, cd, iter); + #endif + + // swap grids + tmp = src; + src = dst; + dst = tmp; + } + + #pragma omp barrier + + } // for (int iter = 0; ... + + MemFree((void **)&tmpArray); + } + +#ifdef VTK_OUTPUT + if (cd->VtkOutput) { + kd->PdfsActive = src; + VtkWrite(ld, kd, cd, maxIterations); + } +#endif + +#ifdef STATISTICS + kd->PdfsActive = src; + KernelStatistics(kd, ld, cd, maxIterations); +#endif + + return; +} +