From e3f82424829ebb623343ce0092238f83b4a1b8c2 Mon Sep 17 00:00:00 2001 From: Markus Wittmann <markus.wittmann@fau.de> Date: Thu, 2 Nov 2017 15:54:11 +0100 Subject: [PATCH] bulk commit - add AA pattern full array kernels - add padding for list kernels - transposed loops --- doc/Makefile | 2 +- doc/html/main.html | 411 ++++++++++-- doc/main.css | 46 ++ doc/main.rst | 194 +++++- src/Base.h | 14 +- src/BenchKernelD3Q19.c | 66 +- src/BenchKernelD3Q19Aa.c | 529 ++++++++++++++++ src/BenchKernelD3Q19Aa.h | 41 ++ src/BenchKernelD3Q19AaCommon.c | 651 +++++++++++++++++++ src/BenchKernelD3Q19AaCommon.h | 89 +++ src/BenchKernelD3Q19AaVec.c | 610 ++++++++++++++++++ src/BenchKernelD3Q19AaVec.h | 38 ++ src/BenchKernelD3Q19AaVecCommon.c | 656 ++++++++++++++++++++ src/BenchKernelD3Q19AaVecCommon.h | 93 +++ src/BenchKernelD3Q19Common.c | 167 +++-- src/BenchKernelD3Q19Common.h | 4 +- src/BenchKernelD3Q19List.c | 4 + src/BenchKernelD3Q19ListAaCommon.c | 68 +- src/BenchKernelD3Q19ListAaPv.c | 40 +- src/BenchKernelD3Q19ListAaPvCommon.c | 65 +- src/BenchKernelD3Q19ListAaRiaCommon.c | 64 +- src/BenchKernelD3Q19ListCommon.c | 68 +- src/BenchKernelD3Q19ListPullSplitNt.c | 16 + src/BenchKernelD3Q19ListPullSplitNtCommon.c | 59 +- src/Geometry.c | 49 +- src/Geometry.h | 3 +- src/Kernel.h | 2 +- src/KernelFunctions.h | 18 + src/Lattice.c | 59 ++ src/Lattice.h | 1 + src/Main.c | 6 +- src/Makefile | 45 +- src/Memory.c | 26 + src/Padding.c | 300 +++++++++ src/Padding.h | 19 + src/Pinning.c | 67 +- src/Pinning.h | 9 +- src/test.sh | 2 +- 38 files changed, 4228 insertions(+), 373 deletions(-) create mode 100644 doc/main.css create mode 100644 src/BenchKernelD3Q19Aa.c create mode 100644 src/BenchKernelD3Q19Aa.h create mode 100644 src/BenchKernelD3Q19AaCommon.c create mode 100644 src/BenchKernelD3Q19AaCommon.h create mode 100644 src/BenchKernelD3Q19AaVec.c create mode 100644 src/BenchKernelD3Q19AaVec.h create mode 100644 src/BenchKernelD3Q19AaVecCommon.c create mode 100644 src/BenchKernelD3Q19AaVecCommon.h create mode 100644 src/Lattice.c create mode 100644 src/Padding.c create mode 100644 src/Padding.h diff --git a/doc/Makefile b/doc/Makefile index 690e4eb..6417a6e 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -34,5 +34,5 @@ main: main.rst #main.css [ -d html ] || mkdir -p html # rst2html --stylesheet=html4css1.css,main.css $< html/$@.html - rst2html --stylesheet=html4css1.css $< html/$@.html + rst2html --stylesheet=html4css1.css,main.css $< html/$@.html diff --git a/doc/html/main.html b/doc/html/main.html index 36579cc..511f6b2 100644 --- a/doc/html/main.html +++ b/doc/html/main.html @@ -335,6 +335,56 @@ h4 tt.docutils, h5 tt.docutils, h6 tt.docutils { ul.auto-toc { list-style-type: none } +</style> +<style type="text/css"> + + +h1, h2, h3, h4, h5, h6 { + font-family: sans-serif; + font-size: 100%; + background-color: #dcdcdc; +} + +h1.title { + background-color: gray; + color: white +} + +table.footnote { + padding-left: 0.5ex; +} + +table.citation { + padding-left: 0.5ex +} + +td.label { + width: 10%; +} + +table, table.docutils, td, th { + border: 0; +} + +table.citation, table.footnote { + width: 100%; +} + +th { + background-color: lavender ; +} + +tr:nth-child(even) { + xxbackground-color: aliceblue; + background-color: white; +} +tr:nth-child(odd) { + xxbackground-color: lavender; + background-color: whitesmoke; +} + + + </style> </head> <body> @@ -375,15 +425,23 @@ ul.auto-toc { <li><a class="reference internal" href="#benchmarking" id="id4">1.2 Benchmarking</a></li> <li><a class="reference internal" href="#release-and-verification" id="id5">1.3 Release and Verification</a></li> <li><a class="reference internal" href="#compilers" id="id6">1.4 Compilers</a></li> -<li><a class="reference internal" href="#options-summary" id="id7">1.5 Options Summary</a></li> +<li><a class="reference internal" href="#cleaning" id="id7">1.5 Cleaning</a></li> +<li><a class="reference internal" href="#options-summary" id="id8">1.6 Options Summary</a></li> </ul> </li> -<li><a class="reference internal" href="#invocation" id="id8">2 Invocation</a><ul class="auto-toc"> -<li><a class="reference internal" href="#command-line-parameters" id="id9">2.1 Command Line Parameters</a></li> +<li><a class="reference internal" href="#invocation" id="id9">2 Invocation</a><ul class="auto-toc"> +<li><a class="reference internal" href="#command-line-parameters" id="id10">2.1 Command Line Parameters</a></li> +<li><a class="reference internal" href="#kernels" id="id11">2.2 Kernels</a></li> </ul> </li> -<li><a class="reference internal" href="#id1" id="id10">3 Benchmarking</a></li> -<li><a class="reference internal" href="#acknowledgements" id="id11">4 Acknowledgements</a></li> +<li><a class="reference internal" href="#id1" id="id12">3 Benchmarking</a><ul class="auto-toc"> +<li><a class="reference internal" href="#padding" id="id13">3.1 Padding</a></li> +</ul> +</li> +<li><a class="reference internal" href="#geometries" id="id14">4 Geometries</a></li> +<li><a class="reference internal" href="#results" id="id15">5 Results</a></li> +<li><a class="reference internal" href="#licence" id="id16">6 Licence</a></li> +<li><a class="reference internal" href="#acknowledgements" id="id17">7 Acknowledgements</a></li> </ul> </div> <div class="section" id="compilation"> @@ -399,15 +457,15 @@ depending on compiler and build configuration.</p> <div class="section" id="debug-and-verification"> <h2><a class="toc-backref" href="#id3">1.1 Debug and Verification</a></h2> <pre class="literal-block"> -make +make BUILD=debug BENCHMARK=off </pre> -<p>Running <tt class="docutils literal">make</tt> without any arguments builds the debug version (BUILD=debug) of +<p>Running <tt class="docutils literal">make</tt> with <tt class="docutils literal">BUILD=debug</tt> builds the debug version of the benchmark kernels, where no optimizations are performed, line numbers and debug symbols are included as well as <tt class="docutils literal">DEBUG</tt> will be defined. The resulting binary will be found in the <tt class="docutils literal">bin</tt> subdirectory and named <tt class="docutils literal"><span class="pre">lbmbenchk-linux-<compiler>-debug</span></tt>.</p> -<p>Without any further specification the binary includes verification -(<tt class="docutils literal">VERIFICATION=on</tt>), statistics (<tt class="docutils literal">STATISTICS</tt>), and VTK output +<p>Specifying <tt class="docutils literal">BENCHMARK=off</tt> turns on verification +(<tt class="docutils literal">VERIFICATION=on</tt>), statistics (<tt class="docutils literal">STATISTICS=on</tt>), and VTK output (<tt class="docutils literal">VTK_OUTPUT=on</tt>) enabled.</p> <p>Please note that the generated binary will therefore exhibit a poor performance.</p> @@ -416,9 +474,10 @@ exhibit a poor performance.</p> <h2><a class="toc-backref" href="#id4">1.2 Benchmarking</a></h2> <p>To generate a binary for benchmarking run make with</p> <pre class="literal-block"> -make BENCHMARK=on BUILD=release +make </pre> -<p>Here BUILD=release turns optimizations on and BENCHMARK=on disables +<p>As default <tt class="docutils literal">BENCHMARK=on</tt> and <tt class="docutils literal">BUILD=release</tt> is set, where +BUILD=release turns optimizations on and <tt class="docutils literal">BENCHMARK=on</tt> disables verfification, statistics, and VTK output.</p> </div> <div class="section" id="release-and-verification"> @@ -426,7 +485,7 @@ verfification, statistics, and VTK output.</p> <p>Verification with the debug builds can be extremely slow. Hence verification capabilities can be build with release builds:</p> <pre class="literal-block"> -make BUILD=release +make BENCHMARK=off </pre> </div> <div class="section" id="compilers"> @@ -435,8 +494,23 @@ make BUILD=release both configuration can be chosen via <tt class="docutils literal"><span class="pre">CONFIG=linux-gcc</span></tt> or <tt class="docutils literal"><span class="pre">CONFIG=linux-intel</span></tt>.</p> </div> +<div class="section" id="cleaning"> +<h2><a class="toc-backref" href="#id7">1.5 Cleaning</a></h2> +<p>For each configuration and build (debug/release) a subdirectory under the +<tt class="docutils literal">src/obj</tt> directory is created where the dependency and object files are +stored. +With</p> +<pre class="literal-block"> +make CONFIG=... BUILD=... clean +</pre> +<p>a specific combination is select and cleaned, whereas with</p> +<pre class="literal-block"> +make clean-all +</pre> +<p>all object and dependency files are deleted.</p> +</div> <div class="section" id="options-summary"> -<h2><a class="toc-backref" href="#id7">1.5 Options Summary</a></h2> +<h2><a class="toc-backref" href="#id8">1.6 Options Summary</a></h2> <p>Options that can be specified when building the framework with make:</p> <table border="1" class="docutils"> <colgroup> @@ -451,19 +525,14 @@ both configuration can be chosen via <tt class="docutils literal"><span class="p <td>default</td> <td>description</td> </tr> -<tr><td>TARCH</td> -<td>--</td> -<td>--</td> -<td>Via TARCH the architecture the compiler generates code for can be overriden. The value depends on the chose compiler.</td> -</tr> <tr><td>BENCHMARK</td> <td>on, off</td> -<td>off</td> -<td>If enabled, disables VERIFICATION, STATISTICS, VTK_OUTPUT.</td> +<td>on</td> +<td>If enabled, disables VERIFICATION, STATISTICS, VTK_OUTPUT. If disabled enables the three former options.</td> </tr> <tr><td>BUILD</td> <td>debug, release</td> -<td>debug</td> +<td>release</td> <td>No optimization, debug symbols, DEBUG defined.</td> </tr> <tr><td>CONFIG</td> @@ -486,6 +555,11 @@ both configuration can be chosen via <tt class="docutils literal"><span class="p <td>off</td> <td>View statistics, like density etc, during simulation.</td> </tr> +<tr><td>TARCH</td> +<td>--</td> +<td>--</td> +<td>Via TARCH the architecture the compiler generates code for can be overridden. The value depends on the chosen compiler.</td> +</tr> <tr><td>VERIFICATION</td> <td>on, off</td> <td>off</td> @@ -501,16 +575,18 @@ both configuration can be chosen via <tt class="docutils literal"><span class="p </div> </div> <div class="section" id="invocation"> -<h1><a class="toc-backref" href="#id8">2 Invocation</a></h1> +<h1><a class="toc-backref" href="#id9">2 Invocation</a></h1> <p>Running the binary will print among the GPL licence header a line like the following:</p> -<blockquote> -LBM Benchmark Kernels 0.1, compiled Jul 5 2017 21:59:22, type: verification</blockquote> +<pre class="literal-block"> +LBM Benchmark Kernels 0.1, compiled Jul 5 2017 21:59:22, type: verification +</pre> <p>if verfication was enabled during compilation or</p> -<blockquote> -LBM Benchmark Kernels 0.1, compiled Jul 5 2017 21:59:22, type: benchmark</blockquote> +<pre class="literal-block"> +LBM Benchmark Kernels 0.1, compiled Jul 5 2017 21:59:22, type: benchmark +</pre> <p>if verfication was disabled during compilation.</p> <div class="section" id="command-line-parameters"> -<h2><a class="toc-backref" href="#id9">2.1 Command Line Parameters</a></h2> +<h2><a class="toc-backref" href="#id10">2.1 Command Line Parameters</a></h2> <p>Running the binary with <tt class="docutils literal"><span class="pre">-h</span></tt> list all available parameters:</p> <pre class="literal-block"> Usage: @@ -540,7 +616,7 @@ $ bin/lbmbenchk-linux-intel-release -verfiy -dims 32x32x32 <p>Kernel specific parameters can be opatained via selecting the specific kernel and passing <tt class="docutils literal"><span class="pre">-h</span></tt> as parameter:</p> <pre class="literal-block"> -$ bin/lbmbenchk-linux-intel-release -kernel -- -h +$ bin/lbmbenchk-linux-intel-release -kernel kernel-name -- -h ... Kernel parameters: [-blk <n>] [-blk-[xyz] <n>] @@ -574,14 +650,225 @@ Available kernels to benchmark: blk-pull-aos </pre> </div> +<div class="section" id="kernels"> +<h2><a class="toc-backref" href="#id11">2.2 Kernels</a></h2> +<p>The following list shortly describes available kernels:</p> +<ul class="simple"> +<li>push-soa/push-aos/pull-soa/pull-aos: +Unoptimized kernels (but stream/collide are already fused) using two grids as +source and destination. Implement push/pull semantics as well structure of +arrays (soa) or array of structures (aos) layout.</li> +<li>blk-push-soa/blk-push-aos/blk-pull-soa/blk-pull-aos: +The same as the unoptimized kernels without the blk prefix, except that they support +spatial blocking, i.e. loop blocking of the three loops used to iterate over +the lattice. Here manual work sharing for OpenMP is used.</li> +<li>list-push-soa/list-push-aos/list-pull-soa/list-pull-aos: +The same as the unoptimized kernels without the list prefix, but for indirect addressing. +Here only a 1D vector of is used to store the fluid nodes, omitting the +obstacles. An adjacency list is used to recover the neighborhood associations.</li> +<li>list-pull-split-nt-1s-soa/list-pull-split-nt-2s-soa: +Optimized variant of list-pull-soa. Chunks of the lattice are processed as +once. Postcollision values are written back via nontemporal stores in 18 (1s) +or 9 (2s) loops.</li> +<li>list-aa-aos/list-aa-soa: +Unoptimized implementation of the AA pattern for the 1D vector with adjacency +list. Supported are array of structures (aos) and structure of arrays (soa) +data layout is supported.</li> +<li>list-aa-ria-soa: +Implementation of AA pattern with intrinsics for the 1D vector with adjacency +list. Furthermore it contains a vectorized even time step and run length +coding to reduce the loop balance of the odd time step.</li> +<li>list-aa-pv-soa: +All optimizations of list-aa-ria-soa. Additional with partial vectorization +of the odd time step.</li> +</ul> +<p>Note that all array of structures (aos) kernels might require blocking +(depending on the domain size) to reach the performance of their structure of +arrays (soa) counter parts.</p> +<p>The following table summarizes the properties of the kernels. Here <strong>D</strong> means +direct addressing, i.e. full array, <strong>I</strong> means indirect addressing, i.e. 1D +vector with adjacency list, <strong>x</strong> means supported, whereas <strong>--</strong> means unsupported. +The loop balance B_l is computed for D3Q19 model with double precision floating +point for PDFs (8 byte) and 4 byte integers for the index (adjacency list). +As list-aa-ria-soa and list-aa-pv-soa support run length coding their effective +loop balance depends on the geometry. The effective loop balance is printed +during each run.</p> +<table border="1" class="docutils"> +<colgroup> +<col width="29%" /> +<col width="14%" /> +<col width="14%" /> +<col width="6%" /> +<col width="10%" /> +<col width="10%" /> +<col width="16%" /> +</colgroup> +<thead valign="bottom"> +<tr><th class="head">kernel name</th> +<th class="head">prop. step</th> +<th class="head">data layout</th> +<th class="head">addr.</th> +<th class="head">parallel</th> +<th class="head">blocking</th> +<th class="head">B_l [B/FLUP]</th> +</tr> +</thead> +<tbody valign="top"> +<tr><td>push-soa</td> +<td>OS</td> +<td>SoA</td> +<td>D</td> +<td>x</td> +<td>--</td> +<td>456</td> +</tr> +<tr><td>push-aos</td> +<td>OS</td> +<td>AoS</td> +<td>D</td> +<td>x</td> +<td>--</td> +<td>456</td> +</tr> +<tr><td>pull-soa</td> +<td>OS</td> +<td>SoA</td> +<td>D</td> +<td>x</td> +<td>--</td> +<td>456</td> +</tr> +<tr><td>pull-aos</td> +<td>OS</td> +<td>AoS</td> +<td>D</td> +<td>x</td> +<td>--</td> +<td>456</td> +</tr> +<tr><td>blk-push-soa</td> +<td>OS</td> +<td>SoA</td> +<td>D</td> +<td>x</td> +<td>x</td> +<td>456</td> +</tr> +<tr><td>blk-push-aos</td> +<td>OS</td> +<td>AoS</td> +<td>D</td> +<td>x</td> +<td>x</td> +<td>456</td> +</tr> +<tr><td>blk-pull-soa</td> +<td>OS</td> +<td>SoA</td> +<td>D</td> +<td>x</td> +<td>x</td> +<td>456</td> +</tr> +<tr><td>blk-pull-aos</td> +<td>OS</td> +<td>AoS</td> +<td>D</td> +<td>x</td> +<td>x</td> +<td>456</td> +</tr> +<tr><td>list-push-soa</td> +<td>OS</td> +<td>SoA</td> +<td>I</td> +<td>x</td> +<td>x</td> +<td>528</td> +</tr> +<tr><td>list-push-aos</td> +<td>OS</td> +<td>AoS</td> +<td>I</td> +<td>x</td> +<td>x</td> +<td>528</td> +</tr> +<tr><td>list-pull-soa</td> +<td>OS</td> +<td>SoA</td> +<td>I</td> +<td>x</td> +<td>x</td> +<td>528</td> +</tr> +<tr><td>list-pull-aos</td> +<td>OS</td> +<td>AoS</td> +<td>I</td> +<td>x</td> +<td>x</td> +<td>528</td> +</tr> +<tr><td>list-pull-split-nt-1s</td> +<td>OS</td> +<td>SoA</td> +<td>I</td> +<td>x</td> +<td>x</td> +<td>376</td> +</tr> +<tr><td>list-pull-split-nt-2s</td> +<td>OS</td> +<td>SoA</td> +<td>I</td> +<td>x</td> +<td>x</td> +<td>376</td> +</tr> +<tr><td>list-aa-soa</td> +<td>AA</td> +<td>SoA</td> +<td>I</td> +<td>x</td> +<td>x</td> +<td>340</td> +</tr> +<tr><td>list-aa-aos</td> +<td>AA</td> +<td>AoS</td> +<td>I</td> +<td>x</td> +<td>x</td> +<td>340</td> +</tr> +<tr><td>list-aa-ria-soa</td> +<td>AA</td> +<td>SoA</td> +<td>I</td> +<td>x</td> +<td>x</td> +<td>304-342</td> +</tr> +<tr><td>list-aa-pv-soa</td> +<td>AA</td> +<td>SoA</td> +<td>I</td> +<td>x</td> +<td>x</td> +<td>304-342</td> +</tr> +</tbody> +</table> +</div> </div> <div class="section" id="id1"> -<h1><a class="toc-backref" href="#id10">3 Benchmarking</a></h1> +<h1><a class="toc-backref" href="#id12">3 Benchmarking</a></h1> <p>Correct benchmarking is a nontrivial task. Whenever benchmark results should be created make sure the binary was compiled with:</p> <ul class="simple"> -<li><tt class="docutils literal">BENCHMARK=on</tt> and</li> -<li><tt class="docutils literal">BUILD=release</tt> and</li> +<li><tt class="docutils literal">BENCHMARK=on</tt> (default if not overriden) and</li> +<li><tt class="docutils literal">BUILD=release</tt> (default if not overriden) and</li> <li>the correct ISA for macros is used, selected via <tt class="docutils literal">ISA</tt> and</li> <li>use <tt class="docutils literal">TARCH</tt> to specify the architecture the compiler generates code for.</li> </ul> @@ -594,7 +881,13 @@ $ bin/lbmbenchk-linux-intel-release ... -t 10 -pin $(seq -s , 0 9) <ul class="simple"> <li>transparent huge pages: when allocating memory small 4 KiB pages might be replaced with larger ones. This is in general a good thing, but if this is -really the case, depends on the system settings.</li> +really the case, depends on the system settings (check e.g. the status of +<tt class="docutils literal">/sys/kernel/mm/transparent_hugepage/enabled</tt>). +Currently <tt class="docutils literal">madvise(MADV_HUGEPAGE)</tt> is used for allocations which are aligned to +a 4 KiB page, which should be the case for the lattices. +This should result in huge pages except THP is disabled on the machine. +(NOTE: madvise() is used if <tt class="docutils literal">HAVE_HUGE_PAGES</tt> is defined, which is currently +hard coded defined in <tt class="docutils literal">Memory.c</tt>).</li> <li>CPU/core frequency: For reproducible results the frequency of all cores should be fixed.</li> <li>NUMA placement policy: The benchmark assumes a first touch policy, which @@ -604,19 +897,63 @@ used is already full memory might be allocated in a remote domain. Accesses to remote domains typically have a higher latency and lower bandwidth.</li> <li>System load: interference with other application, espcially on desktop systems should be avoided.</li> -<li>Padding: most kernels do not care about padding against cache or TLB -thrashing. Even if the number of (fluid) nodes suggest everything is fine, -through parallelization still problems might occur.</li> +<li>Padding: For SoA based kernels the number of (fluid) nodes is automatically +adjusted so that no cache or TLB thrashing should occur. The parameters are +optimized for current Intel based systems. For more details look into the +padding section.</li> <li>CPU dispatcher function: the compiler might add different versions of a function for different ISA extensions. Make sure the code you might think is executed is actually the code which is executed.</li> </ul> +<div class="section" id="padding"> +<h2><a class="toc-backref" href="#id13">3.1 Padding</a></h2> +<p>With correct padding cache and TLB thrashing can be avoided. Therefore the +number of (fluid) nodes used in the data layout is artificially increased.</p> +<p>Currently automatic padding is active for kernels which support it. It can be +controlled via the kernel parameter (i.e. parameter after the <tt class="docutils literal"><span class="pre">--</span></tt>) +<tt class="docutils literal"><span class="pre">-pad</span></tt>. Supported values are <tt class="docutils literal">auto</tt> (default), <tt class="docutils literal">no</tt> (to disable padding), +or a manual padding.</p> +<p>Automatic padding tries to avoid cache and TLB thrashing and pads for a 32 +entry (huge pages) TLB with 8 sets and a 512 set (L2) cache. This reflects the +parameters of current Intel based processors.</p> +<p>Manual padding is done via a padding string and has the format +<tt class="docutils literal"><span class="pre">mod_1+offset_1(,mod_n+offset_n)</span></tt>, which specifies numbers of bytes. +SoA data layouts can exhibit TLB thrashing. Therefore we want to distribute the +19 pages with one lattice (36 with two lattices) we are concurrently accessing +over as much sets in the TLB as possible. +This is controlled by the distance between the accessed pages, which is the +number of (fluid) nodes in between them and can be adjusted by adding further +(fluid) nodes. +We want the distance d (in bytes) between two accessed pages to be e.g. +<strong>d % (PAGE_SIZE * TLB_SETS) = PAGE_SIZE</strong>. +This would distribute the pages evenly over the sets. Hereby <strong>PAGE_SIZE * TLB_SETS</strong> +would be our <tt class="docutils literal">mod_1</tt> and <strong>PAGE_SIZE</strong> (after the =) our <tt class="docutils literal">offset_1</tt>. +Measurements show that with only a quarter of half of a page size as offset +higher performance is achieved, which is done by automatic padding. +On top of this padding more paddings can be added. They are just added to the +padding string and are separated by commas.</p> +<p>A zero modulus in the padding string has a special meaning. Here the +corresponding offset is just added to the number of nodes. A padding string +like <tt class="docutils literal"><span class="pre">-pad</span> 0+16</tt> would at a static padding of two nodes (one node = 8 b).</p> +</div> +</div> +<div class="section" id="geometries"> +<h1><a class="toc-backref" href="#id14">4 Geometries</a></h1> +<p>TODO: supported geometries: channel, pipe, blocks</p> +</div> +<div class="section" id="results"> +<h1><a class="toc-backref" href="#id15">5 Results</a></h1> +<p>TODO</p> +</div> +<div class="section" id="licence"> +<h1><a class="toc-backref" href="#id16">6 Licence</a></h1> +<p>The Lattice Boltzmann Benchmark Kernels are licensed under GPLv3.</p> </div> <div class="section" id="acknowledgements"> -<h1><a class="toc-backref" href="#id11">4 Acknowledgements</a></h1> +<h1><a class="toc-backref" href="#id17">7 Acknowledgements</a></h1> <p>This work was funded by BMBF, grant no. 01IH15003A (project SKAMPY).</p> <p>This work was funded by KONWHIR project OMI4PAPS.</p> -<p>Document was generated at 2017-10-26 09:43.</p> +<p>Document was generated at 2017-11-02 15:33.</p> </div> </div> </body> diff --git a/doc/main.css b/doc/main.css new file mode 100644 index 0000000..27d3f26 --- /dev/null +++ b/doc/main.css @@ -0,0 +1,46 @@ + +h1, h2, h3, h4, h5, h6 { + font-family: sans-serif; + font-size: 100%; + background-color: #dcdcdc; +} + +h1.title { + background-color: gray; + color: white +} + +table.footnote { + padding-left: 0.5ex; +} + +table.citation { + padding-left: 0.5ex +} + +td.label { + width: 10%; +} + +table, table.docutils, td, th { + border: 0; +} + +table.citation, table.footnote { + width: 100%; +} + +th { + background-color: lavender ; +} + +tr:nth-child(even) { + xxbackground-color: aliceblue; + background-color: white; +} +tr:nth-child(odd) { + xxbackground-color: lavender; + background-color: whitesmoke; +} + + diff --git a/doc/main.rst b/doc/main.rst index 9ad56ba..528b339 100644 --- a/doc/main.rst +++ b/doc/main.rst @@ -54,16 +54,16 @@ Debug and Verification :: - make + make BUILD=debug BENCHMARK=off -Running ``make`` without any arguments builds the debug version (BUILD=debug) of +Running ``make`` with ``BUILD=debug`` builds the debug version of the benchmark kernels, where no optimizations are performed, line numbers and debug symbols are included as well as ``DEBUG`` will be defined. The resulting binary will be found in the ``bin`` subdirectory and named ``lbmbenchk-linux-<compiler>-debug``. -Without any further specification the binary includes verification -(``VERIFICATION=on``), statistics (``STATISTICS``), and VTK output +Specifying ``BENCHMARK=off`` turns on verification +(``VERIFICATION=on``), statistics (``STATISTICS=on``), and VTK output (``VTK_OUTPUT=on``) enabled. Please note that the generated binary will therefore @@ -74,9 +74,10 @@ Benchmarking To generate a binary for benchmarking run make with :: - make BENCHMARK=on BUILD=release + make -Here BUILD=release turns optimizations on and BENCHMARK=on disables +As default ``BENCHMARK=on`` and ``BUILD=release`` is set, where +BUILD=release turns optimizations on and ``BENCHMARK=on`` disables verfification, statistics, and VTK output. Release and Verification @@ -85,7 +86,7 @@ Release and Verification Verification with the debug builds can be extremely slow. Hence verification capabilities can be build with release builds: :: - make BUILD=release + make BENCHMARK=off Compilers --------- @@ -94,6 +95,24 @@ Currently only the GCC and Intel compiler under Linux are supported. Between both configuration can be chosen via ``CONFIG=linux-gcc`` or ``CONFIG=linux-intel``. + +Cleaning +-------- + +For each configuration and build (debug/release) a subdirectory under the +``src/obj`` directory is created where the dependency and object files are +stored. +With :: + + make CONFIG=... BUILD=... clean + +a specific combination is select and cleaned, whereas with :: + + make clean-all + +all object and dependency files are deleted. + + Options Summary --------------- @@ -102,13 +121,13 @@ Options that can be specified when building the framework with make: ============= ======================= ============ ========================================================== name values default description ------------- ----------------------- ------------ ---------------------------------------------------------- -TARCH -- -- Via TARCH the architecture the compiler generates code for can be overriden. The value depends on the chose compiler. -BENCHMARK on, off off If enabled, disables VERIFICATION, STATISTICS, VTK_OUTPUT. -BUILD debug, release debug No optimization, debug symbols, DEBUG defined. +BENCHMARK on, off on If enabled, disables VERIFICATION, STATISTICS, VTK_OUTPUT. If disabled enables the three former options. +BUILD debug, release release No optimization, debug symbols, DEBUG defined. CONFIG linux-gcc, linux-intel linux-intel Select GCC or Intel compiler. ISA avx, sse avx Determines which ISA extension is used for macro definitions. This is *not* the architecture the compiler generates code for. OPENMP on, off on OpenMP, i.\,e.\. threading support. STATISTICS on, off off View statistics, like density etc, during simulation. +TARCH -- -- Via TARCH the architecture the compiler generates code for can be overridden. The value depends on the chosen compiler. VERIFICATION on, off off Turn verification on/off. VTK_OUTPUT on, off off Enable/Disable VTK file output. ============= ======================= ============ ========================================================== @@ -116,11 +135,11 @@ VTK_OUTPUT on, off off Enable/Disable VTK file outpu Invocation ========== -Running the binary will print among the GPL licence header a line like the following: +Running the binary will print among the GPL licence header a line like the following: :: LBM Benchmark Kernels 0.1, compiled Jul 5 2017 21:59:22, type: verification -if verfication was enabled during compilation or +if verfication was enabled during compilation or :: LBM Benchmark Kernels 0.1, compiled Jul 5 2017 21:59:22, type: benchmark @@ -159,7 +178,7 @@ iterations, etc, which can afterward be override, e.g.: :: Kernel specific parameters can be opatained via selecting the specific kernel and passing ``-h`` as parameter: :: - $ bin/lbmbenchk-linux-intel-release -kernel -- -h + $ bin/lbmbenchk-linux-intel-release -kernel kernel-name -- -h ... Kernel parameters: [-blk <n>] [-blk-[xyz] <n>] @@ -193,6 +212,82 @@ A list of all available kernels can be obtained via ``-list``: :: blk-pull-soa blk-pull-aos +Kernels +------- + +The following list shortly describes available kernels: + +- push-soa/push-aos/pull-soa/pull-aos: + Unoptimized kernels (but stream/collide are already fused) using two grids as + source and destination. Implement push/pull semantics as well structure of + arrays (soa) or array of structures (aos) layout. + +- blk-push-soa/blk-push-aos/blk-pull-soa/blk-pull-aos: + The same as the unoptimized kernels without the blk prefix, except that they support + spatial blocking, i.e. loop blocking of the three loops used to iterate over + the lattice. Here manual work sharing for OpenMP is used. + +- list-push-soa/list-push-aos/list-pull-soa/list-pull-aos: + The same as the unoptimized kernels without the list prefix, but for indirect addressing. + Here only a 1D vector of is used to store the fluid nodes, omitting the + obstacles. An adjacency list is used to recover the neighborhood associations. + +- list-pull-split-nt-1s-soa/list-pull-split-nt-2s-soa: + Optimized variant of list-pull-soa. Chunks of the lattice are processed as + once. Postcollision values are written back via nontemporal stores in 18 (1s) + or 9 (2s) loops. + +- list-aa-aos/list-aa-soa: + Unoptimized implementation of the AA pattern for the 1D vector with adjacency + list. Supported are array of structures (aos) and structure of arrays (soa) + data layout is supported. + +- list-aa-ria-soa: + Implementation of AA pattern with intrinsics for the 1D vector with adjacency + list. Furthermore it contains a vectorized even time step and run length + coding to reduce the loop balance of the odd time step. + +- list-aa-pv-soa: + All optimizations of list-aa-ria-soa. Additional with partial vectorization + of the odd time step. + + +Note that all array of structures (aos) kernels might require blocking +(depending on the domain size) to reach the performance of their structure of +arrays (soa) counter parts. + +The following table summarizes the properties of the kernels. Here **D** means +direct addressing, i.e. full array, **I** means indirect addressing, i.e. 1D +vector with adjacency list, **x** means supported, whereas **--** means unsupported. +The loop balance B_l is computed for D3Q19 model with double precision floating +point for PDFs (8 byte) and 4 byte integers for the index (adjacency list). +As list-aa-ria-soa and list-aa-pv-soa support run length coding their effective +loop balance depends on the geometry. The effective loop balance is printed +during each run. + + +====================== =========== =========== ===== ======== ======== ============ +kernel name prop. step data layout addr. parallel blocking B_l [B/FLUP] +====================== =========== =========== ===== ======== ======== ============ +push-soa OS SoA D x -- 456 +push-aos OS AoS D x -- 456 +pull-soa OS SoA D x -- 456 +pull-aos OS AoS D x -- 456 +blk-push-soa OS SoA D x x 456 +blk-push-aos OS AoS D x x 456 +blk-pull-soa OS SoA D x x 456 +blk-pull-aos OS AoS D x x 456 +list-push-soa OS SoA I x x 528 +list-push-aos OS AoS I x x 528 +list-pull-soa OS SoA I x x 528 +list-pull-aos OS AoS I x x 528 +list-pull-split-nt-1s OS SoA I x x 376 +list-pull-split-nt-2s OS SoA I x x 376 +list-aa-soa AA SoA I x x 340 +list-aa-aos AA AoS I x x 340 +list-aa-ria-soa AA SoA I x x 304-342 +list-aa-pv-soa AA SoA I x x 304-342 +====================== =========== =========== ===== ======== ======== ============ Benchmarking ============ @@ -200,8 +295,8 @@ Benchmarking Correct benchmarking is a nontrivial task. Whenever benchmark results should be created make sure the binary was compiled with: -- ``BENCHMARK=on`` and -- ``BUILD=release`` and +- ``BENCHMARK=on`` (default if not overriden) and +- ``BUILD=release`` (default if not overriden) and - the correct ISA for macros is used, selected via ``ISA`` and - use ``TARCH`` to specify the architecture the compiler generates code for. @@ -214,7 +309,13 @@ Things the binary does nor check or controll: - transparent huge pages: when allocating memory small 4 KiB pages might be replaced with larger ones. This is in general a good thing, but if this is - really the case, depends on the system settings. + really the case, depends on the system settings (check e.g. the status of + ``/sys/kernel/mm/transparent_hugepage/enabled``). + Currently ``madvise(MADV_HUGEPAGE)`` is used for allocations which are aligned to + a 4 KiB page, which should be the case for the lattices. + This should result in huge pages except THP is disabled on the machine. + (NOTE: madvise() is used if ``HAVE_HUGE_PAGES`` is defined, which is currently + hard coded defined in ``Memory.c``). - CPU/core frequency: For reproducible results the frequency of all cores should be fixed. @@ -228,14 +329,69 @@ Things the binary does nor check or controll: - System load: interference with other application, espcially on desktop systems should be avoided. -- Padding: most kernels do not care about padding against cache or TLB - thrashing. Even if the number of (fluid) nodes suggest everything is fine, - through parallelization still problems might occur. +- Padding: For SoA based kernels the number of (fluid) nodes is automatically + adjusted so that no cache or TLB thrashing should occur. The parameters are + optimized for current Intel based systems. For more details look into the + padding section. - CPU dispatcher function: the compiler might add different versions of a function for different ISA extensions. Make sure the code you might think is executed is actually the code which is executed. +Padding +------- + +With correct padding cache and TLB thrashing can be avoided. Therefore the +number of (fluid) nodes used in the data layout is artificially increased. + +Currently automatic padding is active for kernels which support it. It can be +controlled via the kernel parameter (i.e. parameter after the ``--``) +``-pad``. Supported values are ``auto`` (default), ``no`` (to disable padding), +or a manual padding. + +Automatic padding tries to avoid cache and TLB thrashing and pads for a 32 +entry (huge pages) TLB with 8 sets and a 512 set (L2) cache. This reflects the +parameters of current Intel based processors. + +Manual padding is done via a padding string and has the format +``mod_1+offset_1(,mod_n+offset_n)``, which specifies numbers of bytes. +SoA data layouts can exhibit TLB thrashing. Therefore we want to distribute the +19 pages with one lattice (36 with two lattices) we are concurrently accessing +over as much sets in the TLB as possible. +This is controlled by the distance between the accessed pages, which is the +number of (fluid) nodes in between them and can be adjusted by adding further +(fluid) nodes. +We want the distance d (in bytes) between two accessed pages to be e.g. +**d % (PAGE_SIZE * TLB_SETS) = PAGE_SIZE**. +This would distribute the pages evenly over the sets. Hereby **PAGE_SIZE * TLB_SETS** +would be our ``mod_1`` and **PAGE_SIZE** (after the =) our ``offset_1``. +Measurements show that with only a quarter of half of a page size as offset +higher performance is achieved, which is done by automatic padding. +On top of this padding more paddings can be added. They are just added to the +padding string and are separated by commas. + +A zero modulus in the padding string has a special meaning. Here the +corresponding offset is just added to the number of nodes. A padding string +like ``-pad 0+16`` would at a static padding of two nodes (one node = 8 b). + + +Geometries +========== + +TODO: supported geometries: channel, pipe, blocks + + +Results +======= + +TODO + + +Licence +======= + +The Lattice Boltzmann Benchmark Kernels are licensed under GPLv3. + Acknowledgements ================ diff --git a/src/Base.h b/src/Base.h index 27f9116..ae61082 100644 --- a/src/Base.h +++ b/src/Base.h @@ -44,8 +44,6 @@ static inline double Time() } -#define TOOL_NAME "lbmbenchk" - #define STRINGIFYX(x) #x #define STRINGIFY(x) STRINGIFYX(x) @@ -117,17 +115,17 @@ static inline double Time() } while (0) #define Print(formatString, ...) \ - fprintf(stdout, SHC_MAGENTA "[" TOOL_NAME "] " SHC_NC formatString, ##__VA_ARGS__) + fprintf(stdout, SHC_MAGENTA formatString SHC_NC, ##__VA_ARGS__) #define Warning(formatString, ...) \ - fprintf(stdout, "[" TOOL_NAME "] WARNING: " formatString, ##__VA_ARGS__) + fprintf(stdout, SHC_BROWN "WARNING: " SHC_NC formatString, ##__VA_ARGS__) #define Error(formatString, ...) \ - fprintf(stderr, SHC_RED "[" TOOL_NAME "] ERROR: " formatString SHC_NC , ##__VA_ARGS__) - + fprintf(stderr, SHC_RED "ERROR: " formatString SHC_NC , ##__VA_ARGS__) +/* #define DebugPrint(formatString, ...) \ - fprintf(stderr, "[" TOOL_NAME "] DEBUG: " formatString, ##__VA_ARGS__) - + fprintf(stderr, "DEBUG: " formatString, ##__VA_ARGS__) +*/ #ifndef NO_SHELL_COLORS // or "\e" diff --git a/src/BenchKernelD3Q19.c b/src/BenchKernelD3Q19.c index 8203d2a..bd6d43c 100644 --- a/src/BenchKernelD3Q19.c +++ b/src/BenchKernelD3Q19.c @@ -28,6 +28,7 @@ #include "Memory.h" #include "Vtk.h" +#include "LikwidIf.h" #include <inttypes.h> #include <math.h> @@ -98,6 +99,8 @@ void FNAME(D3Q19Kernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd for (int iter = 0; iter < maxIterations; ++iter) { + X_LIKWID_START("os"); + #ifdef _OPENMP #pragma omp parallel for collapse(3) default(none) \ shared(gDims,src, dst, w_0, w_1, w_2, omegaEven, omegaOdd, \ @@ -112,9 +115,9 @@ void FNAME(D3Q19Kernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd evenPart, oddPart, w_1_indep, w_2_indep) #endif - for (int z = oZ; z < nZ + oZ; ++z) { + for (int x = oX; x < nX + oX; ++x) { for (int y = oY; y < nY + oY; ++y) { - for (int x = oX; x < nX + oX; ++x) { + for (int z = oZ; z < nZ + oZ; ++z) { #define I(x, y, z, dir) P_INDEX_5(gDims, (x), (y), (z), (dir)) #ifdef PROP_MODEL_PUSH @@ -306,6 +309,9 @@ void FNAME(D3Q19Kernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd } } // z, y, x (from inner to outer) + // Stop counters before bounce back. Else computing loop balance will be incorrect. + X_LIKWID_STOP("os"); + // Fixup bounce back PDFs. #ifdef _OPENMP #pragma omp parallel for default(none) \ @@ -429,21 +435,8 @@ void FNAME(D3Q19BlkKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * for (int iter = 0; iter < maxIterations; ++iter) { - // #ifdef _OPENMP --> add line continuation - // #pragma omp parallel for collapse(3) default(none) - // shared(gDims,src, dst, w_0, w_1, w_2, omegaEven, omegaOdd, - // w_1_x3, w_2_x3, w_1_nine_half, w_2_nine_half, cd, - // oX, oY, oZ, nX, nY, nZ, blk) - // private(ux, uy, uz, ui, dens, dir_indep_trm, - // pdf_C, - // pdf_N, pdf_E, pdf_S, pdf_W, - // pdf_NE, pdf_SE, pdf_SW, pdf_NW, - // pdf_T, pdf_TN, pdf_TE, pdf_TS, pdf_TW, - // pdf_B, pdf_BN, pdf_BE, pdf_BS, pdf_BW, - // evenPart, oddPart, w_1_indep, w_2_indep) - // #endif #ifdef _OPENMP - #pragma omp parallel for default(none) \ + #pragma omp parallel default(none) \ shared(gDims,src, dst, w_0, w_1, w_2, omegaEven, omegaOdd, \ w_1_x3, w_2_x3, w_1_nine_half, w_2_nine_half, cd, \ oX, oY, oZ, nX, nY, nZ, blk, nThreads) \ @@ -455,16 +448,18 @@ void FNAME(D3Q19BlkKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * pdf_B, pdf_BN, pdf_BE, pdf_BS, pdf_BW, \ evenPart, oddPart, w_1_indep, w_2_indep) #endif + { + X_LIKWID_START("blk-os"); - for (int i = 0; i < nThreads; ++i) { + int threadId = omp_get_thread_num(); - int threadStartX = nX / nThreads * i; - int threadEndX = nX / nThreads * (i + 1); + int threadStartX = nX / nThreads * threadId; + int threadEndX = nX / nThreads * (threadId + 1); if (nX % nThreads > 0) { - if (nX % nThreads > i) { - threadStartX += i; - threadEndX += i + 1; + if (nX % nThreads > threadId) { + threadStartX += threadId; + threadEndX += threadId + 1; } else { threadStartX += nX % nThreads; @@ -472,24 +467,18 @@ void FNAME(D3Q19BlkKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * } } - // for (int z = oZ; z < nZ + oZ; ++z) { - // for (int y = oY; y < nY + oY; ++y) { - // for (int x = oX; x < nX + oX; ++x) { - for (int bZ = oZ; bZ < nZ + oZ; bZ += blk[2]) { + for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) { for (int bY = oY; bY < nY + oY; bY += blk[1]) { - for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) { - // for (int bX = oX; bX < nX + oX; bX += blk[0]) { + for (int bZ = oZ; bZ < nZ + oZ; bZ += blk[2]) { // Must do everything here, else it would break collapse. int eZ = MIN(bZ + blk[2], nZ + oZ); int eY = MIN(bY + blk[1], nY + oY); int eX = MIN(bX + blk[0], threadEndX + oX); -// printf("%d: %d-%d %d-%d %d-%d %d - %d\n", omp_get_thread_num(), bZ, eZ, bY, eY, bX, eX, threadStartX, threadEndX); - - for (int z = bZ; z < eZ; ++z) { + for (int x = bX; x < eX; ++x) { for (int y = bY; y < eY; ++y) { - for (int x = bX; x < eX; ++x) { + for (int z = bZ; z < eZ; ++z) { #define I(x, y, z, dir) P_INDEX_5(gDims, (x), (y), (z), (dir)) @@ -497,7 +486,6 @@ void FNAME(D3Q19BlkKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * // Load PDFs of local cell: pdf_N = src[I(x, y, z, D3Q19_N)]; ... #define X(name, idx, idxinv, _x, _y, _z) JOIN(pdf_,name) = src[I(x, y, z, idx)]; - //if (isnan(JOIN(pdf_,name))) { printf("iter: %d %d %d %d %d %s nan\n", iter, x-oX, y-oY, z-oZ, idx, D3Q19_NAMES[idx]); exit(1);} D3Q19_LIST #undef X @@ -505,7 +493,6 @@ void FNAME(D3Q19BlkKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * // Load PDFs of local cell: pdf_N = src[I(x, y, z, D3Q19_N)]; ... #define X(name, idx, idxinv, _x, _y, _z) JOIN(pdf_,name) = src[I(x - _x, y - _y, z - _z, idx)]; - //if (isnan(JOIN(pdf_,name))) { printf("iter: %d %d %d %d %d %s nan\n", iter, x-oX, y-oY, z-oZ, idx, D3Q19_NAMES[idx]); exit(1);} D3Q19_LIST #undef X @@ -525,11 +512,11 @@ void FNAME(D3Q19BlkKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * } else { #endif ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE - - pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW; + pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW; uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN - - pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS; + pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS; uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS - - pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS; + pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS; #ifdef LID_DRIVEN_CAVITY } @@ -685,7 +672,10 @@ void FNAME(D3Q19BlkKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * } } // z, y, x (from inner to outer) - } // loop over threads + X_LIKWID_STOP("blk-os"); + } // parallel region + + // Stop counters before bounce back. Else computing loop balance will be incorrect. // Fixup bounce back PDFs. #ifdef _OPENMP diff --git a/src/BenchKernelD3Q19Aa.c b/src/BenchKernelD3Q19Aa.c new file mode 100644 index 0000000..87a9e33 --- /dev/null +++ b/src/BenchKernelD3Q19Aa.c @@ -0,0 +1,529 @@ +// -------------------------------------------------------------------------- +// +// Copyright +// Markus Wittmann, 2016-2017 +// RRZE, University of Erlangen-Nuremberg, Germany +// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de +// +// Viktor Haag, 2016 +// LSS, University of Erlangen-Nuremberg, Germany +// +// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). +// +// LbmBenchKernels is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// LbmBenchKernels is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>. +// +// -------------------------------------------------------------------------- +#include "BenchKernelD3Q19AaCommon.h" + +#include "Memory.h" +#include "Vtk.h" +#include "LikwidIf.h" + +#include <inttypes.h> +#include <math.h> + +#ifdef _OPENMP + #include <omp.h> +#endif + +void FNAME(D3Q19AaKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd) +{ + Assert(ld != NULL); + Assert(kernelData != NULL); + Assert(cd != NULL); + + Assert(cd->Omega > 0.0); + Assert(cd->Omega < 2.0); + + KernelData * kd = (KernelData *)kernelData; + + + int nX = ld->Dims[0]; + int nY = ld->Dims[1]; + int nZ = ld->Dims[2]; + + int * gDims = kd->GlobalDims; + + int oX = kd->Offsets[0]; + int oY = kd->Offsets[1]; + int oZ = kd->Offsets[2]; + + KernelDataAa * kda = KDA(kd); + + int blk[3]; + blk[0] = kda->Blk[0]; + blk[1] = kda->Blk[1]; + blk[2] = kda->Blk[2]; + + PdfT omega = cd->Omega; + PdfT omegaEven = omega; + PdfT magicParam = 1.0 / 12.0; + // 1/4: best stability; + // 1/12: removes third-order advection error (best advection); + // 1/6: removes fourth-order diffusion error (best diffusion); + // 3/16: exact location of bounce back for poiseuille flow + + PdfT omegaOdd = 1.0/( 0.5 + magicParam/(1.0/omega - 0.5) ); + + PdfT evenPart = 0.0; + PdfT oddPart = 0.0; + PdfT dir_indep_trm = 0.0; + + PdfT w_0 = 1.0 / 3.0; + PdfT w_1 = 1.0 / 18.0; + PdfT w_2 = 1.0 / 36.0; + + PdfT w_1_x3 = w_1 * 3.0; PdfT w_1_nine_half = w_1 * 9.0/2.0; PdfT w_1_indep = 0.0; + PdfT w_2_x3 = w_2 * 3.0; PdfT w_2_nine_half = w_2 * 9.0/2.0; PdfT w_2_indep = 0.0; + + PdfT ux, uy, uz, ui; + PdfT dens; + + // Declare pdf_N, pdf_E, pdf_S, pdf_W, ... + #define X(name, idx, idxinv, x, y, z) PdfT JOIN(pdf_,name); + D3Q19_LIST + #undef X + + PdfT * src = kd->Pdfs[0]; + + int maxIterations = cd->MaxIterations; + + #ifdef VTK_OUTPUT + if (cd->VtkOutput) { + kd->PdfsActive = src; + VtkWrite(ld, kd, cd, -1); + } + #endif + + #ifdef STATISTICS + kd->PdfsActive = src; + KernelStatistics(kd, ld, cd, 0); + #endif + + + int nThreads = 1; + + #ifdef _OPENMP + nThreads = omp_get_max_threads(); + #endif + + for (int iter = 0; iter < maxIterations; iter += 2) { + + // -------------------------------------------------------------------- + // even time step + + X_LIKWID_START("aa-even"); + + // {{{ + + #ifdef _OPENMP + #pragma omp parallel for default(none) \ + shared(gDims,src, w_0, w_1, w_2, omegaEven, omegaOdd, \ + w_1_x3, w_2_x3, w_1_nine_half, w_2_nine_half, cd, \ + oX, oY, oZ, nX, nY, nZ, blk, nThreads, ld) \ + private(ux, uy, uz, ui, dens, dir_indep_trm, \ + pdf_C, \ + pdf_N, pdf_E, pdf_S, pdf_W, \ + pdf_NE, pdf_SE, pdf_SW, pdf_NW, \ + pdf_T, pdf_TN, pdf_TE, pdf_TS, pdf_TW, \ + pdf_B, pdf_BN, pdf_BE, pdf_BS, pdf_BW, \ + evenPart, oddPart, w_1_indep, w_2_indep) + #endif + + for (int i = 0; i < nThreads; ++i) { + + int threadStartX = nX / nThreads * i; + int threadEndX = nX / nThreads * (i + 1); + + if (nX % nThreads > 0) { + if (nX % nThreads > i) { + threadStartX += i; + threadEndX += i + 1; + } + else { + threadStartX += nX % nThreads; + threadEndX += nX % nThreads; + } + } + + for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) { + for (int bY = oY; bY < nY + oY; bY += blk[1]) { + for (int bZ = oZ; bZ < nZ + oZ; bZ += blk[2]) { + + int eX = MIN(bX + blk[0], threadEndX + oX); + int eY = MIN(bY + blk[1], nY + oY); + int eZ = MIN(bZ + blk[2], nZ + oZ); + + // printf("%d: %d-%d %d-%d %d-%d %d - %d\n", omp_get_thread_num(), bZ, eZ, bY, eY, bX, eX, threadStartX, threadEndX); + + for (int x = bX; x < eX; ++x) { + for (int y = bY; y < eY; ++y) { + for (int z = bZ; z < eZ; ++z) { + + + if (ld->Lattice[L_INDEX_4(ld->Dims, x - oX, y - oY, z - oZ)] == LAT_CELL_OBSTACLE) { + continue; + } + + #define I(x, y, z, dir) P_INDEX_5(gDims, (x), (y), (z), (dir)) + + + // Load PDFs of local cell: pdf_N = src[I(x, y, z, D3Q19_N)]; ... + #define X(name, idx, idxinv, _x, _y, _z) JOIN(pdf_,name) = src[I(x, y, z, idx)]; + D3Q19_LIST + #undef X + +// #define LID_DRIVEN_CAVITY + +#ifdef LID_DRIVEN_CAVITY + + if (z == nZ - 4 + oZ && x > 3 + oX && x < (nX - 4 + oX) && y > 3 + oY && y < (nY - 4 + oY)) { + ux = 0.1 * 0.577; + uy = 0.0; + uz = 0.0; + + } else { + #endif + ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE - + pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW; + uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN - + pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS; + uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS - + pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS; + #ifdef LID_DRIVEN_CAVITY + } + #endif + + dens = pdf_C + + pdf_N + pdf_E + pdf_S + pdf_W + + pdf_NE + pdf_SE + pdf_SW + pdf_NW + + pdf_T + pdf_TN + pdf_TE + pdf_TS + pdf_TW + + pdf_B + pdf_BN + pdf_BE + pdf_BS + pdf_BW; + + dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz)*3.0/2.0; + + // direction: w_0 + src[I(x, y, z, D3Q19_C)] = pdf_C - omegaEven*(pdf_C - w_0*dir_indep_trm); + + // direction: w_1 + w_1_indep = w_1*dir_indep_trm; + + ui = uy; + evenPart = omegaEven*( 0.5*(pdf_N + pdf_S) - ui*ui*w_1_nine_half - w_1_indep ); + oddPart = omegaOdd*(0.5*(pdf_N - pdf_S) - ui*w_1_x3 ); + src[I(x, y, z, D3Q19_S)] = pdf_N - evenPart - oddPart; + src[I(x, y, z, D3Q19_N)] = pdf_S - evenPart + oddPart; + + ui = ux; + evenPart = omegaEven*( 0.5*(pdf_E + pdf_W) - ui*ui*w_1_nine_half - w_1_indep ); + oddPart = omegaOdd*(0.5*(pdf_E - pdf_W) - ui*w_1_x3 ); + src[I(x, y, z, D3Q19_W)] = pdf_E - evenPart - oddPart; + src[I(x, y, z, D3Q19_E)] = pdf_W - evenPart + oddPart; + + ui = uz; + evenPart = omegaEven*( 0.5*(pdf_T + pdf_B) - ui*ui*w_1_nine_half - w_1_indep ); + oddPart = omegaOdd*(0.5*(pdf_T - pdf_B) - ui*w_1_x3 ); + src[I(x, y, z, D3Q19_B)] = pdf_T - evenPart - oddPart; + src[I(x, y, z, D3Q19_T)] = pdf_B - evenPart + oddPart; + + // direction: w_2 + w_2_indep = w_2*dir_indep_trm; + + ui = -ux + uy; + evenPart = omegaEven*( 0.5*(pdf_NW + pdf_SE) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_NW - pdf_SE) - ui*w_2_x3 ); + src[I(x, y, z, D3Q19_SE)] = pdf_NW - evenPart - oddPart; + src[I(x, y, z, D3Q19_NW)] = pdf_SE - evenPart + oddPart; + + ui = ux + uy; + evenPart = omegaEven*( 0.5*(pdf_NE + pdf_SW) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_NE - pdf_SW) - ui*w_2_x3 ); + src[I(x, y, z, D3Q19_SW)] = pdf_NE - evenPart - oddPart; + src[I(x, y, z, D3Q19_NE)] = pdf_SW - evenPart + oddPart; + + ui = -ux + uz; + evenPart = omegaEven*( 0.5*(pdf_TW + pdf_BE) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TW - pdf_BE) - ui*w_2_x3 ); + src[I(x, y, z, D3Q19_BE)] = pdf_TW - evenPart - oddPart; + src[I(x, y, z, D3Q19_TW)] = pdf_BE - evenPart + oddPart; + + ui = ux + uz; + evenPart = omegaEven*( 0.5*(pdf_TE + pdf_BW) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TE - pdf_BW) - ui*w_2_x3 ); + src[I(x, y, z, D3Q19_BW)] = pdf_TE - evenPart - oddPart; + src[I(x, y, z, D3Q19_TE)] = pdf_BW - evenPart + oddPart; + + ui = -uy + uz; + evenPart = omegaEven*( 0.5*(pdf_TS + pdf_BN) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TS - pdf_BN) - ui*w_2_x3 ); + src[I(x, y, z, D3Q19_BN)] = pdf_TS - evenPart - oddPart; + src[I(x, y, z, D3Q19_TS)] = pdf_BN - evenPart + oddPart; + + ui = uy + uz; + evenPart = omegaEven*( 0.5*(pdf_TN + pdf_BS) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TN - pdf_BS) - ui*w_2_x3 ); + src[I(x, y, z, D3Q19_BS)] = pdf_TN - evenPart - oddPart; + src[I(x, y, z, D3Q19_TN)] = pdf_BS - evenPart + oddPart; + + #undef I + } } } // z, y, x (from inner to outer) + } } } // z, y, x (from inner to outer) + + } // loop over threads + + // }}} + + X_LIKWID_STOP("aa-even"); + + #ifdef STATISTICS + kd->PdfsActive = src; + KernelStatistics(kd, ld, cd, iter); + #endif + + // Fixup bounce back PDFs. + #ifdef _OPENMP + #pragma omp parallel for default(none) \ + shared(kd, src) + #endif + for (int i = 0; i < kd->nBounceBackPdfs; ++i) { + src[kd->BounceBackPdfsSrc[i]] = src[kd->BounceBackPdfsDst[i]]; + } + + // save current iteration + kda->Iteration = iter; + + #ifdef VERIFICATION + kd->PdfsActive = src; + KernelAddBodyForce(kd, ld, cd); + #endif + + #ifdef VTK_OUTPUT + if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) { + kd->PdfsActive = src; + VtkWrite(ld, kd, cd, iter); + } + #endif + + #ifdef STATISTICS + kd->PdfsActive = src; + KernelStatistics(kd, ld, cd, iter); + #endif + + // -------------------------------------------------------------------- + // odd time step + + + X_LIKWID_START("aa-odd"); + + // {{{ + + #ifdef _OPENMP + #pragma omp parallel for default(none) \ + shared(gDims,src, w_0, w_1, w_2, omegaEven, omegaOdd, \ + w_1_x3, w_2_x3, w_1_nine_half, w_2_nine_half, cd, \ + oX, oY, oZ, nX, nY, nZ, blk, nThreads) \ + private(ux, uy, uz, ui, dens, dir_indep_trm, \ + pdf_C, \ + pdf_N, pdf_E, pdf_S, pdf_W, \ + pdf_NE, pdf_SE, pdf_SW, pdf_NW, \ + pdf_T, pdf_TN, pdf_TE, pdf_TS, pdf_TW, \ + pdf_B, pdf_BN, pdf_BE, pdf_BS, pdf_BW, \ + evenPart, oddPart, w_1_indep, w_2_indep) + #endif + + for (int i = 0; i < nThreads; ++i) { + + int threadStartX = nX / nThreads * i; + int threadEndX = nX / nThreads * (i + 1); + + if (nX % nThreads > 0) { + if (nX % nThreads > i) { + threadStartX += i; + threadEndX += i + 1; + } + else { + threadStartX += nX % nThreads; + threadEndX += nX % nThreads; + } + } + + for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) { + for (int bY = oY; bY < nY + oY; bY += blk[1]) { + for (int bZ = oZ; bZ < nZ + oZ; bZ += blk[2]) { + + // Must do everything here, else it would break collapse. + int eZ = MIN(bZ + blk[2], nZ + oZ); + int eY = MIN(bY + blk[1], nY + oY); + int eX = MIN(bX + blk[0], threadEndX + oX); + + for (int x = bX; x < eX; ++x) { + for (int y = bY; y < eY; ++y) { + for (int z = bZ; z < eZ; ++z) { + + #define I(x, y, z, dir) P_INDEX_5(gDims, (x), (y), (z), (dir)) + + // Load PDFs of local cell: pdf_N = src[I(x, y, z, D3Q19_N)]; ... + #define X(name, idx, idxinv, _x, _y, _z) JOIN(pdf_,name) = src[I(x - _x, y - _y, z - _z, idxinv)]; + D3Q19_LIST + #undef X + + +// #define LID_DRIVEN_CAVITY + +#ifdef LID_DRIVEN_CAVITY + + if (z == nZ - 4 + oZ && x > 3 + oX && x < (nX - 4 + oX) && y > 3 + oY && y < (nY - 4 + oY)) { + ux = 0.1 * 0.577; + uy = 0.0; + uz = 0.0; + + } else { +#endif + ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE - + pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW; + uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN - + pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS; + uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS - + pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS; +#ifdef LID_DRIVEN_CAVITY + } +#endif + + dens = pdf_C + + pdf_N + pdf_E + pdf_S + pdf_W + + pdf_NE + pdf_SE + pdf_SW + pdf_NW + + pdf_T + pdf_TN + pdf_TE + pdf_TS + pdf_TW + + pdf_B + pdf_BN + pdf_BE + pdf_BS + pdf_BW; + + dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz)*3.0/2.0; + + // direction: w_0 + src[I(x, y, z, D3Q19_C)] = pdf_C - omegaEven*(pdf_C - w_0*dir_indep_trm); + + // direction: w_1 + w_1_indep = w_1*dir_indep_trm; + + ui = uy; + evenPart = omegaEven*( 0.5*(pdf_N + pdf_S) - ui*ui*w_1_nine_half - w_1_indep ); + oddPart = omegaOdd*(0.5*(pdf_N - pdf_S) - ui*w_1_x3 ); + src[I(x, y + 1, z, D3Q19_N)] = pdf_N - evenPart - oddPart; + src[I(x, y - 1, z, D3Q19_S)] = pdf_S - evenPart + oddPart; + + ui = ux; + evenPart = omegaEven*( 0.5*(pdf_E + pdf_W) - ui*ui*w_1_nine_half - w_1_indep ); + oddPart = omegaOdd*(0.5*(pdf_E - pdf_W) - ui*w_1_x3 ); + src[I(x + 1, y, z, D3Q19_E)] = pdf_E - evenPart - oddPart; + src[I(x - 1, y, z, D3Q19_W)] = pdf_W - evenPart + oddPart; + + ui = uz; + evenPart = omegaEven*( 0.5*(pdf_T + pdf_B) - ui*ui*w_1_nine_half - w_1_indep ); + oddPart = omegaOdd*(0.5*(pdf_T - pdf_B) - ui*w_1_x3 ); + src[I(x, y, z + 1, D3Q19_T)] = pdf_T - evenPart - oddPart; + src[I(x, y, z - 1, D3Q19_B)] = pdf_B - evenPart + oddPart; + + // direction: w_2 + w_2_indep = w_2*dir_indep_trm; + + ui = -ux + uy; + evenPart = omegaEven*( 0.5*(pdf_NW + pdf_SE) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_NW - pdf_SE) - ui*w_2_x3 ); + src[I(x - 1, y + 1, z, D3Q19_NW)] = pdf_NW - evenPart - oddPart; + src[I(x + 1, y - 1, z, D3Q19_SE)] = pdf_SE - evenPart + oddPart; + + ui = ux + uy; + evenPart = omegaEven*( 0.5*(pdf_NE + pdf_SW) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_NE - pdf_SW) - ui*w_2_x3 ); + src[I(x + 1, y + 1, z, D3Q19_NE)] = pdf_NE - evenPart - oddPart; + src[I(x - 1, y - 1, z, D3Q19_SW)] = pdf_SW - evenPart + oddPart; + + ui = -ux + uz; + evenPart = omegaEven*( 0.5*(pdf_TW + pdf_BE) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TW - pdf_BE) - ui*w_2_x3 ); + src[I(x - 1, y, z + 1, D3Q19_TW)] = pdf_TW - evenPart - oddPart; + src[I(x + 1, y, z - 1, D3Q19_BE)] = pdf_BE - evenPart + oddPart; + + ui = ux + uz; + evenPart = omegaEven*( 0.5*(pdf_TE + pdf_BW) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TE - pdf_BW) - ui*w_2_x3 ); + src[I(x + 1, y, z + 1, D3Q19_TE)] = pdf_TE - evenPart - oddPart; + src[I(x - 1, y, z - 1, D3Q19_BW)] = pdf_BW - evenPart + oddPart; + + ui = -uy + uz; + evenPart = omegaEven*( 0.5*(pdf_TS + pdf_BN) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TS - pdf_BN) - ui*w_2_x3 ); + src[I(x, y - 1, z + 1, D3Q19_TS)] = pdf_TS - evenPart - oddPart; + src[I(x, y + 1, z - 1, D3Q19_BN)] = pdf_BN - evenPart + oddPart; + + ui = uy + uz; + evenPart = omegaEven*( 0.5*(pdf_TN + pdf_BS) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TN - pdf_BS) - ui*w_2_x3 ); + src[I(x, y + 1, z + 1, D3Q19_TN)] = pdf_TN - evenPart - oddPart; + src[I(x, y - 1, z - 1, D3Q19_BS)] = pdf_BS - evenPart + oddPart; + + + #undef I + } } } // z, y, x (from inner to outer) + } } } // z, y, x (from inner to outer) + } // loop over threads + + // }}} + + // Stop counters before bounce back. Else computing loop balance will be incorrect. + + X_LIKWID_STOP("aa-odd"); + + // Fixup bounce back PDFs. + #ifdef _OPENMP + #pragma omp parallel for default(none) \ + shared(kd, src) + #endif + for (int i = 0; i < kd->nBounceBackPdfs; ++i) { + src[kd->BounceBackPdfsDst[i]] = src[kd->BounceBackPdfsSrc[i]]; + } + + // save current iteration + kda->Iteration = iter + 1; + + #ifdef VERIFICATION + kd->PdfsActive = src; + KernelAddBodyForce(kd, ld, cd); + #endif + + #ifdef VTK_OUTPUT + if (cd->VtkOutput && (iter + 1 % cd->VtkModulus) == 0) { + kd->PdfsActive = src; + VtkWrite(ld, kd, cd, iter + 1); + } + #endif + + #ifdef STATISTICS + kd->PdfsActive = src; + KernelStatistics(kd, ld, cd, iter + 1); + #endif // }}} + + + } // for (int iter = 0; ... + + #ifdef VTK_OUTPUT + + if (cd->VtkOutput) { + kd->PdfsActive = src; + VtkWrite(ld, kd, cd, maxIterations); + } + + #endif + + return; +} + diff --git a/src/BenchKernelD3Q19Aa.h b/src/BenchKernelD3Q19Aa.h new file mode 100644 index 0000000..d0605d1 --- /dev/null +++ b/src/BenchKernelD3Q19Aa.h @@ -0,0 +1,41 @@ +// -------------------------------------------------------------------------- +// +// Copyright +// Markus Wittmann, 2016-2017 +// RRZE, University of Erlangen-Nuremberg, Germany +// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de +// +// Viktor Haag, 2016 +// LSS, University of Erlangen-Nuremberg, Germany +// +// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). +// +// LbmBenchKernels is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// LbmBenchKernels is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>. +// +// -------------------------------------------------------------------------- +#ifndef __BENCH_KERNEL_D3Q19_AA__ +#define __BENCH_KERNEL_D3Q19_AA__ + +#include "Kernel.h" + + +void D3Q19AaInit_AaSoA(LatticeDesc * ld, KernelData ** kernelData, Parameters * params); +void D3Q19AaInit_AaAoS(LatticeDesc * ld, KernelData ** kernelData, Parameters * params); + +void D3Q19AaDeinit_AaSoA(LatticeDesc * ld, KernelData ** kernelData); +void D3Q19AaDeinit_AaAoS(LatticeDesc * ld, KernelData ** kernelData); + + + +#endif // __BENCH_KERNEL_D3Q19__ diff --git a/src/BenchKernelD3Q19AaCommon.c b/src/BenchKernelD3Q19AaCommon.c new file mode 100644 index 0000000..fbdcbc1 --- /dev/null +++ b/src/BenchKernelD3Q19AaCommon.c @@ -0,0 +1,651 @@ +// -------------------------------------------------------------------------- +// +// Copyright +// Markus Wittmann, 2016-2017 +// RRZE, University of Erlangen-Nuremberg, Germany +// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de +// +// Viktor Haag, 2016 +// LSS, University of Erlangen-Nuremberg, Germany +// +// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). +// +// LbmBenchKernels is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// LbmBenchKernels is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>. +// +// -------------------------------------------------------------------------- +#include "BenchKernelD3Q19AaCommon.h" + +#include "Memory.h" +#include "Vtk.h" + +#include <inttypes.h> +#include <math.h> + +#ifdef _OPENMP + #include <omp.h> +#endif + +// Forward definition. +void FNAME(D3Q19AaKernel)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd); + +void FNAME(D3Q19AaBlkKernel)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd); + + + +static void FNAME(BcGetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT * pdf) +{ + Assert(kd != NULL); + Assert(kd->PdfsActive != NULL); + Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]); + Assert(pdf != NULL); + + Assert(x >= 0); Assert(y >= 0); Assert(z >= 0); + Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]); + Assert(dir >= 0); Assert(dir < N_D3Q19); + + KernelDataAa * kda = KDA(kd); + + int oX = kd->Offsets[0]; + int oY = kd->Offsets[1]; + int oZ = kd->Offsets[2]; + + if (kda->Iteration % 2 == 0) { + // Pdfs are stored inverse, local PDFs are located in remote nodes + int nx = x - D3Q19_X[dir]; + int ny = y - D3Q19_Y[dir]; + int nz = z - D3Q19_Z[dir]; + + #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir)) + *pdf = kd->PdfsActive[I(nx + oX, ny + oY, nz + oZ, D3Q19_INV[dir])]; + #undef I + } + else { + int nx = x; + int ny = y; + int nz = z; + + #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir)) + *pdf = kd->PdfsActive[I(nx + oX, ny + oY, nz + oZ, dir)]; + #undef I + } + + + return; +} + +static void FNAME(BcSetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT pdf) +{ + Assert(kd != NULL); + Assert(kd->PdfsActive != NULL); + Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]); + + Assert(x >= 0); Assert(y >= 0); Assert(z >= 0); + Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]); + Assert(dir >= 0); Assert(dir < N_D3Q19); + + KernelDataAa * kda = KDA(kd); + + int oX = kd->Offsets[0]; + int oY = kd->Offsets[1]; + int oZ = kd->Offsets[2]; + + if (kda->Iteration % 2 == 0) { + // Pdfs are stored inverse, local PDFs are located in remote nodes + int nx = x - D3Q19_X[dir]; + int ny = y - D3Q19_Y[dir]; + int nz = z - D3Q19_Z[dir]; + + #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir)) + pdf = kd->PdfsActive[I(nx + oX, ny + oY, nz + oZ, D3Q19_INV[dir])] = pdf; + #undef I + } + else { + int nx = x; + int ny = y; + int nz = z; + + #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir)) + kd->PdfsActive[I(nx + oX, ny + oY, nz + oZ, dir)] = pdf; + #undef I + } + + return; +} + + +static void FNAME(GetNode)(KernelData * kd, int x, int y, int z, PdfT * pdfs) +{ + Assert(kd != NULL); + Assert(kd->PdfsActive != NULL); + Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]); + Assert(pdfs != NULL); + + Assert(x >= 0); Assert(y >= 0); Assert(z >= 0); + Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]); + + KernelDataAa * kda = KDA(kd); + + int oX = kd->Offsets[0]; + int oY = kd->Offsets[1]; + int oZ = kd->Offsets[2]; + + + if (kda->Iteration % 2 == 0) { + // Pdfs are stored inverse, local PDFs are located in remote nodes + + #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir)) + #define X(name, idx, idxinv, _x, _y, _z) pdfs[idx] = kd->PdfsActive[I(x + oX - _x, y + oY - _y, z + oZ - _z, D3Q19_INV[idx])]; + D3Q19_LIST + #undef X + #undef I + } + else { + #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir)) + #define X(name, idx, idxinv, _x, _y, _z) pdfs[idx] = kd->PdfsActive[I(x + oX, y + oY, z + oZ, idx)]; + D3Q19_LIST + #undef X + #undef I + + } + +#if 0 // DETECT NANs + + for (int d = 0; d < 19; ++d) { + if (isnan(pdfs[d])) { + printf("%d %d %d %d nan! get node\n", x, y, z, d); + + for (int d2 = 0; d2 < 19; ++d2) { + printf("%d: %e\n", d2, pdfs[d2]); + } + + exit(1); + } + } + +#endif + + return; +} + + +static void FNAME(SetNode)(KernelData * kd, int x, int y, int z, PdfT * pdfs) +{ + Assert(kd != NULL); + Assert(kd->PdfsActive != NULL); + Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]); + Assert(pdfs != NULL); + + Assert(x >= 0); Assert(y >= 0); Assert(z >= 0); + Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]); + + KernelDataAa * kda = KDA(kd); + + int oX = kd->Offsets[0]; + int oY = kd->Offsets[1]; + int oZ = kd->Offsets[2]; + + if (kda->Iteration % 2 == 0) { + // Pdfs are stored inverse, local PDFs are located in remote nodes + + #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir)) + #define X(name, idx, idxinv, _x, _y, _z) kd->PdfsActive[I(x + oX - _x, y + oY - _y, z + oZ - _z, D3Q19_INV[idx])] = pdfs[idx]; + D3Q19_LIST + #undef X + #undef I + } + else { + #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir)) + #define X(name, idx, idxinv, _x, _y, _z) kd->PdfsActive[I(x + oX, y + oY, z + oZ, idx)] = pdfs[idx]; + D3Q19_LIST + #undef X + #undef I + } + return; +} + + +static void ParameterUsage() +{ + printf("Kernel parameters:\n"); + printf(" [-blk <n>] [-blk-[xyz] <n>]\n"); + + return; +} + +static void ParseParameters(Parameters * params, int * blk) +{ + Assert(blk != NULL); + + blk[0] = 0; blk[1] = 0; blk[2] = 0; + + #define ARG_IS(param) (!strcmp(params->KernelArgs[i], param)) + #define NEXT_ARG_PRESENT() \ + do { \ + if (i + 1 >= params->nKernelArgs) { \ + printf("ERROR: argument %s requires a parameter.\n", params->KernelArgs[i]); \ + exit(1); \ + } \ + } while (0) + + + for (int i = 0; i < params->nKernelArgs; ++i) { + if (ARG_IS("-blk") || ARG_IS("--blk")) { + NEXT_ARG_PRESENT(); + + int tmp = strtol(params->KernelArgs[++i], NULL, 0); + + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); + exit(1); + } + + blk[0] = blk[1] = blk[2] = tmp; + } + else if (ARG_IS("-blk-x") || ARG_IS("--blk-x")) { + NEXT_ARG_PRESENT(); + + int tmp = strtol(params->KernelArgs[++i], NULL, 0); + + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); + exit(1); + } + + blk[0] = tmp; + } + else if (ARG_IS("-blk-y") || ARG_IS("--blk-y")) { + NEXT_ARG_PRESENT(); + + int tmp = strtol(params->KernelArgs[++i], NULL, 0); + + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); + exit(1); + } + + blk[1] = tmp; + } + else if (ARG_IS("-blk-z") || ARG_IS("--blk-z")) { + NEXT_ARG_PRESENT(); + + int tmp = strtol(params->KernelArgs[++i], NULL, 0); + + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); + exit(1); + } + + blk[2] = tmp; + } + else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) { + ParameterUsage(); + exit(1); + } + else { + printf("ERROR: unknown kernel parameter.\n"); + ParameterUsage(); + exit(1); + } + } + + #undef ARG_IS + #undef NEXT_ARG_PRESENT + + return; +} + + +void FNAME(D3Q19AaInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params) +{ + KernelDataAa * kda = NULL; + MemAlloc((void **)&kda, sizeof(KernelDataAa)); + + kda->Blk[0] = 0; kda->Blk[1] = 0; kda->Blk[2] = 0; + kda->Iteration = -1; + + KernelData * kd = &kda->kd; + *kernelData = kd; + + kd->nObstIndices = ld->nObst; + + // Ajust the dimensions according to padding, if used. + kd->Dims[0] = ld->Dims[0]; + kd->Dims[1] = ld->Dims[1]; + kd->Dims[2] = ld->Dims[2]; + + + int * lDims = ld->Dims; + int * gDims = kd->GlobalDims; + + gDims[0] = lDims[0] + 2; + gDims[1] = lDims[1] + 2; + gDims[2] = lDims[2] + 2; + + kd->Offsets[0] = 1; + kd->Offsets[1] = 1; + kd->Offsets[2] = 1; + + int lX = lDims[0]; + int lY = lDims[1]; + int lZ = lDims[2]; + + int gX = gDims[0]; + int gY = gDims[1]; + int gZ = gDims[2]; + + int oX = kd->Offsets[0]; + int oY = kd->Offsets[1]; + int oZ = kd->Offsets[2]; + + int blk[3] = { 0 }; + + int nCells = gX * gY * gZ; + + PdfT * pdfs[2] = { NULL, NULL }; + + ParseParameters(params, blk); + + if (blk[0] == 0) blk[0] = gX; + if (blk[1] == 0) blk[1] = gY; + if (blk[2] == 0) blk[2] = gZ; + + printf("# blocking x: %3d y: %3d z: %3d\n", blk[0], blk[1], blk[2]); + + + kda->Blk[0] = blk[0]; kda->Blk[1] = blk[1]; kda->Blk[2] = blk[2]; + + + printf("# allocating data for %d LB nodes with padding (%lu bytes = %f MiB for the single lattice)\n", + nCells, + sizeof(PdfT) * nCells * N_D3Q19, + sizeof(PdfT) * nCells * N_D3Q19 / 1024.0 / 1024.0); + +#define PAGE_4K 4096 + + MemAllocAligned((void **)&pdfs[0], sizeof(PdfT) * nCells * N_D3Q19, PAGE_4K); + + kd->Pdfs[0] = pdfs[0]; + kd->Pdfs[1] = NULL; + + + // Initialize PDFs with some (arbitrary) data for correct NUMA placement. + // This depends on the chosen data layout. + // The structure of the loop should resemble the same "execution layout" + // as in the kernel! + + int nThreads; +#ifdef _OPENMP + nThreads = omp_get_max_threads(); +#endif + +#ifdef _OPENMP + #pragma omp parallel for \ + shared(gDims, pdfs, \ + oX, oY, oZ, lX, lY, lZ, blk, nThreads, ld) +#endif + for (int i = 0; i < nThreads; ++i) { + + int threadStartX = lX / nThreads * i; + int threadEndX = lX / nThreads * (i + 1); + + if (lX % nThreads > 0) { + if (lX % nThreads > i) { + threadStartX += i; + threadEndX += i + 1; + } + else { + threadStartX += lX % nThreads; + threadEndX += lX % nThreads; + } + } + + for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) { + for (int bY = oY; bY < lY + oY; bY += blk[1]) { + for (int bZ = oZ; bZ < lZ + oZ; bZ += blk[2]) { + + int eX = MIN(bX + blk[0], threadEndX + oX); + int eY = MIN(bY + blk[1], lY + oY); + int eZ = MIN(bZ + blk[2], lZ + oZ); + + // printf("%d: %d-%d %d-%d %d-%d %d - %d\n", omp_get_thread_num(), bZ, eZ, bY, eY, bX, eX, threadStartX, threadEndX); + + for (int x = bX; x < eX; ++x) { + for (int y = bY; y < eY; ++y) { + for (int z = bZ; z < eZ; ++z) { + + for (int d = 0; d < N_D3Q19; ++d) { + pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = -1.0; + } + + } } } // x, y, z + } } } // bx, by, bz + } + + + // Initialize all PDFs to some standard value. + for (int x = oX; x < lX + oX; ++x) { + for (int y = oY; y < lY + oY; ++y) { + for (int z = oZ; z < lZ + oZ; ++z) { + for (int d = 0; d < N_D3Q19; ++d) { + pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = 0.0; + } + } } } // x, y, z + + + // Count how many *PDFs* need bounce back treatment. + + uint64_t nPdfs = ((uint64_t)19) * gX * gY * gZ; + + if (nPdfs > ((2LU << 31) - 1)) { + printf("ERROR: number of PDFs exceed 2^31.\n"); + exit(1); + } + + // Compiler bug? Incorrect computation of nBounceBackPdfs when using icc 15.0.2. + // Works when declaring nBounceBackPdfs as int64_t or using volatile. + volatile int nBounceBackPdfs = 0; + // int64_t nBounceBackPdfs = 0; + int nx, ny, nz, px, py, pz; + + + for (int x = 0; x < lX; ++x) { + for (int y = 0; y < lY; ++y) { + for (int z = 0; z < lZ; ++z) { + + if (ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) { + for (int d = 0; d < N_D3Q19; ++d) { + nx = x - D3Q19_X[d]; + ny = y - D3Q19_Y[d]; + nz = z - D3Q19_Z[d]; + + // Check if neighbor is inside the lattice. + // if(nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) { + // continue; + // } + if ((nx < 0 || nx >= lX) && ld->PeriodicX) { + ++nBounceBackPdfs; // Compiler bug --> see above + } + else if ((ny < 0 || ny >= lY) && ld->PeriodicY) { + ++nBounceBackPdfs; // Compiler bug --> see above + } + else if ((nz < 0 || nz >= lZ) && ld->PeriodicZ) { + ++nBounceBackPdfs; // Compiler bug --> see above + } + else if (nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) { + continue; + } + else if (ld->Lattice[L_INDEX_4(lDims, nx, ny, nz)] == LAT_CELL_OBSTACLE) { + ++nBounceBackPdfs; // Compiler bug --> see above + } + } + } + } + } + } + + printf("# allocating %d indices for bounce back pdfs (%s for source and destination array)\n", nBounceBackPdfs, ByteToHuman(sizeof(int) * nBounceBackPdfs * 2)); + + MemAlloc((void **) & (kd->BounceBackPdfsSrc), sizeof(int) * nBounceBackPdfs + 100); + MemAlloc((void **) & (kd->BounceBackPdfsDst), sizeof(int) * nBounceBackPdfs + 100); + + kd->nBounceBackPdfs = nBounceBackPdfs; + nBounceBackPdfs = 0; + + int srcIndex; + int dstIndex; + + // TODO: currently this is not NUMA-aware + // - maybe use the same blocking as for lattice initialization? + // - do place the bounce back index vector parallel? + + for (int x = 0; x < lX; ++x) { + for (int y = 0; y < lY; ++y) { + for (int z = 0; z < lZ; ++z) { + + if (ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) { + for (int d = 0; d < N_D3Q19; ++d) { + nx = x + D3Q19_X[d]; + ny = y + D3Q19_Y[d]; + nz = z + D3Q19_Z[d]; + + if ( ((nx < 0 || nx >= lX) && ld->PeriodicX) || + ((ny < 0 || ny >= lY) && ld->PeriodicY) || + ((nz < 0 || nz >= lZ) && ld->PeriodicZ) + ){ + // For periodicity: + + // We assume we have finished odd time step (accessing neighbor PDFs) and are + // before executing the even time step (accessing local PDFs only). + + // Assuming we are at the most east position of the lattice. Through the odd + // time step propagation has put a PDF in the east slot of the ghost cell east + // of us, i.e. nx, ny, nz. We copy it to the east slot of the most west node. + + // In case of transition from even to odd time step , src and dest must be + // exchanged. + + + // x periodic + if (nx < 0) { + px = lX - 1; + } + else if (nx >= lX) { + px = 0; + } else { + px = nx; + } + + // y periodic + if (ny < 0) { + py = lY - 1; + } + else if (ny >= lY) { + py = 0; + } else { + py = ny; + } + + // z periodic + if (nz < 0) { + pz = lZ - 1; + } + else if (nz >= lZ) { + pz = 0; + } else { + pz = nz; + } + + if (ld->Lattice[L_INDEX_4(lDims, px, py, pz)] == LAT_CELL_OBSTACLE) { + // See description of bounce back handling below. + srcIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, d); + dstIndex = P_INDEX_5(gDims, x + oX, y + oY, z + oZ, D3Q19_INV[d]); + } + else { + + srcIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, d); + // Put it on the other side back into the domain. + dstIndex = P_INDEX_5(gDims, px + oX, py + oY, pz + oZ, d); + + VerifyMsg(nBounceBackPdfs < kd->nBounceBackPdfs, "nBBPdfs %d < kd->nBBPdfs %d xyz: %d %d %d d: %d\n", nBounceBackPdfs, kd->nBounceBackPdfs, x, y, z, d); + + } + + kd->BounceBackPdfsSrc[nBounceBackPdfs] = srcIndex; + kd->BounceBackPdfsDst[nBounceBackPdfs] = dstIndex; + + ++nBounceBackPdfs; + + } + else if (nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) { + continue; + } + else if (ld->Lattice[L_INDEX_4(lDims, nx, ny, nz)] == LAT_CELL_OBSTACLE) { + // Depending on the time step we are in we have to exchange src and dst index. + + // We build the list for the case, when we have finished odd time step + // (accessing neighbor PDFs) and before we start with the even time step + // (accessing local PDFs only). + + // Assume our neighbor east of us, i.e. nx, ny, nz, is an obstacle cell. + // Then we have to move the east PDF from the obstacle to our west position, + // i.e. the inverse of east. + + // In case of transition from even to odd time step src and dest just + // have to be exchanged. + + srcIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, d); + dstIndex = P_INDEX_5(gDims, x + oX, y + oY, z + oZ, D3Q19_INV[d]); + + VerifyMsg(nBounceBackPdfs < kd->nBounceBackPdfs, "nBBPdfs %d < kd->nBBPdfs %d xyz: %d %d %d d: %d\n", nBounceBackPdfs, kd->nBounceBackPdfs, x, y, z, d); + + kd->BounceBackPdfsSrc[nBounceBackPdfs] = srcIndex; + kd->BounceBackPdfsDst[nBounceBackPdfs] = dstIndex; + + ++nBounceBackPdfs; + } + } + } + } + } + } + + + // Fill remaining KernelData structures + kd->GetNode = FNAME(GetNode); + kd->SetNode = FNAME(SetNode); + + kd->BoundaryConditionsGetPdf = FNAME(BcGetPdf); + kd->BoundaryConditionsSetPdf = FNAME(BcSetPdf); + + kd->Kernel = FNAME(D3Q19AaKernel); + + kd->DstPdfs = NULL; + kd->PdfsActive = kd->Pdfs[0]; + + return; +} + +void FNAME(D3Q19AaDeinit)(LatticeDesc * ld, KernelData ** kernelData) +{ + MemFree((void **) & ((*kernelData)->Pdfs[0])); + // MemFree((void **) & ((*kernelData)->Pdfs[1])); + + MemFree((void **) & ((*kernelData)->BounceBackPdfsSrc)); + MemFree((void **) & ((*kernelData)->BounceBackPdfsDst)); + + MemFree((void **)kernelData); + + return; +} + diff --git a/src/BenchKernelD3Q19AaCommon.h b/src/BenchKernelD3Q19AaCommon.h new file mode 100644 index 0000000..151e5a0 --- /dev/null +++ b/src/BenchKernelD3Q19AaCommon.h @@ -0,0 +1,89 @@ +// -------------------------------------------------------------------------- +// +// Copyright +// Markus Wittmann, 2016-2017 +// RRZE, University of Erlangen-Nuremberg, Germany +// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de +// +// Viktor Haag, 2016 +// LSS, University of Erlangen-Nuremberg, Germany +// +// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). +// +// LbmBenchKernels is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// LbmBenchKernels is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>. +// +// -------------------------------------------------------------------------- +#ifndef __BENCH_KERNEL_D3Q19_AA_COMMON_H__ +#define __BENCH_KERNEL_D3Q19_AA_COMMON_H__ + +#include "Kernel.h" + +typedef struct KernelDataAa_ +{ + KernelData kd; + int Blk[3]; // Blocking in X, Y, and Z direction, value of 0 disables blocking. + int Iteration; // Current iteration number. +} KernelDataAa; + +// Macro for casting KernelData * to KernelDataAa *. +#define KDA(_x_) ((KernelDataAa *)(_x_)) + +// Build a function name extended by the propagation model name and the data layout. +// FNANEM(test) will be expanded to test_PushSoA if DATA_LAYOUT_NAME is defined +// as SoA and PROP_MODEL is defined as Push. +#define FNAME(functionName) JOIN(JOIN(functionName,_),JOIN(PROP_MODEL_NAME,DATA_LAYOUT_NAME)) + +#ifndef DATA_LAYOUT_NAME + #error DATA_LAYOUT_NAME must be defined +#endif + +#ifndef PROP_MODEL_NAME + #error PROP_MODEL_NAME must be defined +#endif + +// ----------------------------------------------------------------------- +// Index function for accesssing PDF array for different data layouts. + +#define P_INDEX_5 FNAME(PINDEX5) + +static inline int FNAME(PINDEX5)(int dims[3], int x, int y, int z, int d) +{ + Assert(dims[0] > 0); + Assert(dims[1] > 0); + Assert(dims[2] > 0); + + Assert(x >= 0); + Assert(x < dims[0]); + Assert(y >= 0); + Assert(y < dims[1]); + Assert(z >= 0); + Assert(z < dims[2]); + Assert(d >= 0); +#ifdef D3Q19 + Assert(d < N_D3Q19); +#else +#error Not implemented for this discretization. +#endif + +#ifdef DATA_LAYOUT_SOA + return d * dims[0] * dims[1] * dims[2] + x * dims[1] * dims[2] + y * dims[2] + z; +#elif DATA_LAYOUT_AOS + return x * dims[1] * dims[2] * N_D3Q19 + y * dims[2] * N_D3Q19 + z * N_D3Q19 + d; +#else +#error P_INDEX_5 function no implemented for chosen data layout. +#endif +} + +#endif // __BENCH_KERNEL_D3Q19_AA_COMMON_H__ + diff --git a/src/BenchKernelD3Q19AaVec.c b/src/BenchKernelD3Q19AaVec.c new file mode 100644 index 0000000..2642c1c --- /dev/null +++ b/src/BenchKernelD3Q19AaVec.c @@ -0,0 +1,610 @@ +// -------------------------------------------------------------------------- +// +// Copyright +// Markus Wittmann, 2016-2017 +// RRZE, University of Erlangen-Nuremberg, Germany +// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de +// +// Viktor Haag, 2016 +// LSS, University of Erlangen-Nuremberg, Germany +// +// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). +// +// LbmBenchKernels is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// LbmBenchKernels is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>. +// +// -------------------------------------------------------------------------- +#include "BenchKernelD3Q19AaVecCommon.h" + +#include "Memory.h" +#include "Vtk.h" +#include "LikwidIf.h" +#include "Vector.h" +#include "Vector.h" + +#include <inttypes.h> +#include <math.h> + +#ifdef _OPENMP + #include <omp.h> +#endif + +static void KernelEven(LatticeDesc * ld, KernelData * kd, CaseData * cd); +static void KernelOdd( LatticeDesc * ld, KernelData * kd, CaseData * cd); + +#if 0 // {{{ +void DumpPdfs(LatticeDesc * ld, KernelData * kd, int zStart, int zStop, int iter, const char * prefix) +{ + return; + + int * lDims = ld->Dims; + int * gDims = kd->GlobalDims; + + int nX = gDims[0]; + int nY = gDims[1]; + int nZ = gDims[2]; + + PdfT pdfs[N_D3Q19]; + + int localZStart = zStart; + int localZStop = zStop; + + if (localZStart == -1) localZStart = 0; + if (localZStop == -1) localZStop = gDims[2] - 1; + + printf("D iter: %d\n", iter); + + for (int dir = 0; dir < 19; ++dir) { + for (int z = localZStop; z >= localZStart; --z) { + printf("D [%2d][%2d][%s] plane % 2d\n", iter, dir, prefix, z); + + for(int y = 0; y < nY; ++y) { + printf("D [%2d][%2d][%s] %2d ", iter, dir, prefix, y); + + for(int x = 0; x < nX; ++x) { + + if (1) { // ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) { + + #define I(x, y, z, dir) P_INDEX_5(gDims, (x), (y), (z), (dir)) + pdfs[dir] = kd->PdfsActive[I(x, y, z, dir)]; + #undef I +// kd->GetNode(kd, x, y, z, pdfs); + } + else { + pdfs[dir] = -1.0; + } + + printf("%.16e ", pdfs[dir]); + } + + printf("\n"); + } + } + } +} +#endif // }}} + +void FNAME(D3Q19AaVecKernel)(LatticeDesc * ld, KernelData * kd, CaseData * cd) +{ + Assert(ld != NULL); + Assert(kd != NULL); + Assert(cd != NULL); + + Assert(cd->Omega > 0.0); + Assert(cd->Omega < 2.0); + + KernelDataAa * kda = KDA(kd); + + PdfT * src = kd->PdfsActive; + + int maxIterations = cd->MaxIterations; + + #ifdef VTK_OUTPUT + if (cd->VtkOutput) { + kd->PdfsActive = src; + VtkWrite(ld, kd, cd, -1); + } + #endif + + #ifdef STATISTICS + kd->PdfsActive = src; + KernelStatistics(kd, ld, cd, 0); + #endif + + Assert((maxIterations % 2) == 0); + + for (int iter = 0; iter < maxIterations; iter += 2) { + + // -------------------------------------------------------------------- + // even time step + // -------------------------------------------------------------------- + + X_LIKWID_START("aa-vec-even"); + + #pragma omp parallel + { + KernelEven(ld, kd, cd); + } + + X_LIKWID_STOP("aa-vec-even"); + + // Fixup bounce back PDFs. + #ifdef _OPENMP + #pragma omp parallel for default(none) \ + shared(kd, src) + #endif + for (int i = 0; i < kd->nBounceBackPdfs; ++i) { + src[kd->BounceBackPdfsSrc[i]] = src[kd->BounceBackPdfsDst[i]]; + } + + // save current iteration + kda->Iteration = iter; + + #ifdef VERIFICATION + kd->PdfsActive = src; + KernelAddBodyForce(kd, ld, cd); + #endif + + #ifdef VTK_OUTPUT + if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) { + kd->PdfsActive = src; + VtkWrite(ld, kd, cd, iter); + } + #endif + + #ifdef STATISTICS + kd->PdfsActive = src; + KernelStatistics(kd, ld, cd, iter); + #endif + + // -------------------------------------------------------------------- + // odd time step + // -------------------------------------------------------------------- + + X_LIKWID_START("aa-vec-odd"); + + #pragma omp parallel + { + KernelOdd(ld, kd, cd); + } + + // Stop counters before bounce back. Else computing loop balance will + // be incorrect. + + X_LIKWID_STOP("aa-vec-odd"); + + // Fixup bounce back PDFs. + #ifdef _OPENMP + #pragma omp parallel for default(none) \ + shared(kd, src) + #endif + for (int i = 0; i < kd->nBounceBackPdfs; ++i) { + src[kd->BounceBackPdfsDst[i]] = src[kd->BounceBackPdfsSrc[i]]; + } + + // save current iteration + kda->Iteration = iter + 1; + + #ifdef VERIFICATION + kd->PdfsActive = src; + KernelAddBodyForce(kd, ld, cd); + #endif + + #ifdef VTK_OUTPUT + if (cd->VtkOutput && ((iter + 1) % cd->VtkModulus) == 0) { + kd->PdfsActive = src; + VtkWrite(ld, kd, cd, iter + 1); + } + #endif + + #ifdef STATISTICS + kd->PdfsActive = src; + KernelStatistics(kd, ld, cd, iter + 1); + #endif // }}} + + + } // for (int iter = 0; ... + + #ifdef VTK_OUTPUT + + if (cd->VtkOutput) { + kd->PdfsActive = src; + VtkWrite(ld, kd, cd, maxIterations); + } + + #endif + + return; +} + +static void KernelEven(LatticeDesc * ld, KernelData * kd, CaseData * cd) // {{{ +{ + Assert(ld != NULL); + Assert(kd != NULL); + Assert(cd != NULL); + + Assert(cd->Omega > 0.0); + Assert(cd->Omega < 2.0); + + KernelDataAa * kda = KDA(kd); + + int nX = ld->Dims[0]; + int nY = ld->Dims[1]; + int nZ = ld->Dims[2]; + + int * gDims = kd->GlobalDims; + + int oX = kd->Offsets[0]; + int oY = kd->Offsets[1]; + int oZ = kd->Offsets[2]; + + int blk[3]; + blk[0] = kda->Blk[0]; + blk[1] = kda->Blk[1]; + blk[2] = kda->Blk[2]; + + PdfT omega = cd->Omega; + PdfT omegaEven = omega; + + PdfT magicParam = 1.0 / 12.0; + PdfT omegaOdd = 1.0 / (0.5 + magicParam / (1.0 / omega - 0.5)); + + const PdfT w_0 = 1.0 / 3.0; + const PdfT w_1 = 1.0 / 18.0; + const PdfT w_2 = 1.0 / 36.0; + + const PdfT w_1_x3 = w_1 * 3.0; const PdfT w_1_nine_half = w_1 * 9.0 / 2.0; + const PdfT w_2_x3 = w_2 * 3.0; const PdfT w_2_nine_half = w_2 * 9.0 / 2.0; + + + VPDFT VONE_HALF = VSET(0.5); + VPDFT VTHREE_HALF = VSET(3.0 / 2.0); + + VPDFT vw_1_indep, vw_2_indep; + VPDFT vw_0 = VSET(w_0); + VPDFT vw_1 = VSET(w_1); + VPDFT vw_2 = VSET(w_2); + + VPDFT vw_1_x3 = VSET(w_1_x3); + VPDFT vw_2_x3 = VSET(w_2_x3); + VPDFT vw_1_nine_half = VSET(w_1_nine_half); + VPDFT vw_2_nine_half = VSET(w_2_nine_half); + + VPDFT vui, vux, vuy, vuz, vdens; + + VPDFT vevenPart, voddPart, vdir_indep_trm; + + VPDFT vomegaEven = VSET(omegaEven); + VPDFT vomegaOdd = VSET(omegaOdd); + + // Declare pdf_N, pdf_E, pdf_S, pdf_W, ... + #define X(name, idx, idxinv, x, y, z) VPDFT JOIN(vpdf_,name); + D3Q19_LIST + #undef X + + PdfT * src = kd->Pdfs[0]; + + int nThreads = 1; + int threadId = 0; + + #ifdef _OPENMP + nThreads = omp_get_max_threads(); + threadId = omp_get_thread_num(); + #endif + + // TODO: Currently only a 1-D decomposition is applied. For achritectures + // with a lot of cores we want at least 2-D. + + int threadStartX = nX / nThreads * threadId; + int threadEndX = nX / nThreads * (threadId + 1); + + if (nX % nThreads > 0) { + if (nX % nThreads > threadId) { + threadStartX += threadId; + threadEndX += threadId + 1; + } + else { + threadStartX += nX % nThreads; + threadEndX += nX % nThreads; + } + } + + AssertMsg((blk[2] % VSIZE == 0) || blk[2] >= nZ, "Blocking in z direction must be a multiple of VSIZE = %d or larger than z dimension.", VSIZE); + + for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) { + for (int bY = oY; bY < nY + oY; bY += blk[1]) { + for (int bZ = oZ; bZ < nZ + oZ; bZ += blk[2]) { + + int eX = MIN(bX + blk[0], threadEndX + oX); + int eY = MIN(bY + blk[1], nY + oY); + int eZ = MIN(bZ + blk[2], nZ + oZ); + + for (int x = bX; x < eX; x += 1) { + for (int y = bY; y < eY; y += 1) { + for (int z = bZ; z < eZ; z += VSIZE) { + + #define I(x, y, z, dir) P_INDEX_5(gDims, (x), (y), (z), (dir)) + + // Load PDFs of local cell: pdf_N = src[I(x, y, z, D3Q19_N)]; ... + #define X(name, idx, idxinv, _x, _y, _z) JOIN(vpdf_,name) = VLDU(&src[I(x, y, z, idx)]); + D3Q19_LIST + #undef X + + + vux = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_E,VADD(vpdf_NE,vpdf_SE)),VADD(vpdf_TE,vpdf_BE)),vpdf_W),vpdf_NW),vpdf_SW),vpdf_TW),vpdf_BW); + vuy = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_N,VADD(vpdf_NE,vpdf_NW)),VADD(vpdf_TN,vpdf_BN)),vpdf_S),vpdf_SE),vpdf_SW),vpdf_TS),vpdf_BS); + vuz = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_T,VADD(vpdf_TE,vpdf_TW)),VADD(vpdf_TN,vpdf_TS)),vpdf_B),vpdf_BE),vpdf_BW),vpdf_BN),vpdf_BS); + + vdens = VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(vpdf_C,VADD(vpdf_N,vpdf_E)),VADD(vpdf_S,vpdf_W)),VADD(vpdf_NE,vpdf_SE)), + VADD(vpdf_SW,vpdf_NW)),VADD(vpdf_T,vpdf_TN)),VADD(vpdf_TE,vpdf_TS)),VADD(vpdf_TW,vpdf_B)), + VADD(vpdf_BN,vpdf_BE)),VADD(vpdf_BS,vpdf_BW)); + + vdir_indep_trm = VSUB(vdens,VMUL(VADD(VADD(VMUL(vux,vux),VMUL(vuy,vuy)),VMUL(vuz,vuz)),VTHREE_HALF)); + + VSTU(&src[I(x, y, z, D3Q19_C)],VSUB(vpdf_C,VMUL(vomegaEven,VSUB(vpdf_C,VMUL(vw_0,vdir_indep_trm))))); + + vw_1_indep = VMUL(vw_1,vdir_indep_trm); + + vui = vuy; + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_N,vpdf_S)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep)); + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_N,vpdf_S)),VMUL(vui,vw_1_x3))); + VSTU(&src[I(x, y, z, D3Q19_S)],VSUB(VSUB(vpdf_N,vevenPart),voddPart)); + VSTU(&src[I(x, y, z, D3Q19_N)],VADD(VSUB(vpdf_S,vevenPart),voddPart)); + + vui = vux; + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_E,vpdf_W)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep)); + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_E,vpdf_W)),VMUL(vui,vw_1_x3))); + VSTU(&src[I(x, y, z, D3Q19_W)],VSUB(VSUB(vpdf_E,vevenPart),voddPart)); + VSTU(&src[I(x, y, z, D3Q19_E)],VADD(VSUB(vpdf_W,vevenPart),voddPart)); + + vui = vuz; + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_T,vpdf_B)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep)); + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_T,vpdf_B)),VMUL(vui,vw_1_x3))); + VSTU(&src[I(x, y, z, D3Q19_B)],VSUB(VSUB(vpdf_T,vevenPart),voddPart)); + VSTU(&src[I(x, y, z, D3Q19_T)],VADD(VSUB(vpdf_B,vevenPart),voddPart)); + + vw_2_indep = VMUL(vw_2,vdir_indep_trm); + + vui = VSUB(vuy,vux); + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_NW,vpdf_SE)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_NW,vpdf_SE)),VMUL(vui,vw_2_x3))); + VSTU(&src[I(x, y, z, D3Q19_SE)],VSUB(VSUB(vpdf_NW,vevenPart),voddPart)); + VSTU(&src[I(x, y, z, D3Q19_NW)],VADD(VSUB(vpdf_SE,vevenPart),voddPart)); + + vui = VADD(vux,vuy); + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_NE,vpdf_SW)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_NE,vpdf_SW)),VMUL(vui,vw_2_x3))); + VSTU(&src[I(x, y, z, D3Q19_SW)],VSUB(VSUB(vpdf_NE,vevenPart),voddPart)); + VSTU(&src[I(x, y, z, D3Q19_NE)],VADD(VSUB(vpdf_SW,vevenPart),voddPart)); + + vui = VSUB(vuz,vux); + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TW,vpdf_BE)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TW,vpdf_BE)),VMUL(vui,vw_2_x3))); + VSTU(&src[I(x, y, z, D3Q19_BE)],VSUB(VSUB(vpdf_TW,vevenPart),voddPart)); + VSTU(&src[I(x, y, z, D3Q19_TW)],VADD(VSUB(vpdf_BE,vevenPart),voddPart)); + + vui = VADD(vux,vuz); + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TE,vpdf_BW)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TE,vpdf_BW)),VMUL(vui,vw_2_x3))); + VSTU(&src[I(x, y, z, D3Q19_BW)],VSUB(VSUB(vpdf_TE,vevenPart),voddPart)); + VSTU(&src[I(x, y, z, D3Q19_TE)],VADD(VSUB(vpdf_BW,vevenPart),voddPart)); + + vui = VSUB(vuz,vuy); + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TS,vpdf_BN)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TS,vpdf_BN)),VMUL(vui,vw_2_x3))); + VSTU(&src[I(x, y, z, D3Q19_BN)],VSUB(VSUB(vpdf_TS,vevenPart),voddPart)); + VSTU(&src[I(x, y, z, D3Q19_TS)],VADD(VSUB(vpdf_BN,vevenPart),voddPart)); + + vui = VADD(vuy,vuz); + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TN,vpdf_BS)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TN,vpdf_BS)),VMUL(vui,vw_2_x3))); + VSTU(&src[I(x, y, z, D3Q19_BS)],VSUB(VSUB(vpdf_TN,vevenPart),voddPart)); + VSTU(&src[I(x, y, z, D3Q19_TN)],VADD(VSUB(vpdf_BS,vevenPart),voddPart)); + + #undef I + } } } // x, y, z + } } } // blocked x, y, z + + + + return; +} // }}} + + +static void KernelOdd(LatticeDesc * ld, KernelData * kd, CaseData * cd) // {{{ +{ + Assert(ld != NULL); + Assert(kd != NULL); + Assert(cd != NULL); + + Assert(cd->Omega > 0.0); + Assert(cd->Omega < 2.0); + + KernelDataAa * kda = KDA(kd); + + int nX = ld->Dims[0]; + int nY = ld->Dims[1]; + int nZ = ld->Dims[2]; + + int * gDims = kd->GlobalDims; + + int oX = kd->Offsets[0]; + int oY = kd->Offsets[1]; + int oZ = kd->Offsets[2]; + + int blk[3]; + blk[0] = kda->Blk[0]; + blk[1] = kda->Blk[1]; + blk[2] = kda->Blk[2]; + + PdfT omega = cd->Omega; + PdfT omegaEven = omega; + + PdfT magicParam = 1.0 / 12.0; + PdfT omegaOdd = 1.0 / (0.5 + magicParam / (1.0 / omega - 0.5)); + + const PdfT w_0 = 1.0 / 3.0; + const PdfT w_1 = 1.0 / 18.0; + const PdfT w_2 = 1.0 / 36.0; + + const PdfT w_1_x3 = w_1 * 3.0; const PdfT w_1_nine_half = w_1 * 9.0 / 2.0; + const PdfT w_2_x3 = w_2 * 3.0; const PdfT w_2_nine_half = w_2 * 9.0 / 2.0; + + VPDFT VONE_HALF = VSET(0.5); + VPDFT VTHREE_HALF = VSET(3.0 / 2.0); + + VPDFT vw_1_indep, vw_2_indep; + VPDFT vw_0 = VSET(w_0); + VPDFT vw_1 = VSET(w_1); + VPDFT vw_2 = VSET(w_2); + + VPDFT vw_1_x3 = VSET(w_1_x3); + VPDFT vw_2_x3 = VSET(w_2_x3); + VPDFT vw_1_nine_half = VSET(w_1_nine_half); + VPDFT vw_2_nine_half = VSET(w_2_nine_half); + + VPDFT vui, vux, vuy, vuz, vdens; + + VPDFT vevenPart, voddPart, vdir_indep_trm; + + VPDFT vomegaEven = VSET(omegaEven); + VPDFT vomegaOdd = VSET(omegaOdd); + + // Declare pdf_N, pdf_E, pdf_S, pdf_W, ... + #define X(name, idx, idxinv, x, y, z) VPDFT JOIN(vpdf_,name); + D3Q19_LIST + #undef X + + PdfT * src = kd->Pdfs[0]; + + int threadId = 0; + int nThreads = 1; + + #ifdef _OPENMP + threadId = omp_get_thread_num(); + nThreads = omp_get_max_threads(); + #endif + + // TODO: Currently only a 1-D decomposition is applied. For achritectures + // with a lot of cores we want at least 2-D. + int threadStartX = nX / nThreads * threadId; + int threadEndX = nX / nThreads * (threadId + 1); + + if (nX % nThreads > 0) { + if (nX % nThreads > threadId) { + threadStartX += threadId; + threadEndX += threadId + 1; + } + else { + threadStartX += nX % nThreads; + threadEndX += nX % nThreads; + } + } + + AssertMsg((blk[2] % VSIZE == 0) || blk[2] >= nZ, "Blocking in z direction must be a multiple of VSIZE = %d or larger than z dimension.", VSIZE); + + for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) { + for (int bY = oY; bY < nY + oY; bY += blk[1]) { + for (int bZ = oZ; bZ < nZ + oZ; bZ += blk[2]) { + + int eX = MIN(bX + blk[0], threadEndX + oX); + int eY = MIN(bY + blk[1], nY + oY); + int eZ = MIN(bZ + blk[2], nZ + oZ); + + for (int x = bX; x < eX; ++x) { + for (int y = bY; y < eY; ++y) { + for (int z = bZ; z < eZ; z += VSIZE) { + + #define I(x, y, z, dir) P_INDEX_5(gDims, (x), (y), (z), (dir)) + + + #define X(name, idx, idxinv, _x, _y, _z) JOIN(vpdf_,name) = VLDU(&src[I(x - _x, y - _y, z - _z, idxinv)]); + D3Q19_LIST + #undef X + + vux = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_E,VADD(vpdf_NE,vpdf_SE)),VADD(vpdf_TE,vpdf_BE)),vpdf_W),vpdf_NW),vpdf_SW),vpdf_TW),vpdf_BW); + vuy = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_N,VADD(vpdf_NE,vpdf_NW)),VADD(vpdf_TN,vpdf_BN)),vpdf_S),vpdf_SE),vpdf_SW),vpdf_TS),vpdf_BS); + vuz = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_T,VADD(vpdf_TE,vpdf_TW)),VADD(vpdf_TN,vpdf_TS)),vpdf_B),vpdf_BE),vpdf_BW),vpdf_BN),vpdf_BS); + + vdens = VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(vpdf_C,VADD(vpdf_N,vpdf_E)),VADD(vpdf_S,vpdf_W)),VADD(vpdf_NE,vpdf_SE)), + VADD(vpdf_SW,vpdf_NW)),VADD(vpdf_T,vpdf_TN)),VADD(vpdf_TE,vpdf_TS)),VADD(vpdf_TW,vpdf_B)),VADD(vpdf_BN,vpdf_BE)),VADD(vpdf_BS,vpdf_BW)); + + vdir_indep_trm = VSUB(vdens,VMUL(VADD(VADD(VMUL(vux,vux),VMUL(vuy,vuy)),VMUL(vuz,vuz)),VTHREE_HALF)); + + VSTU(&src[I(x, y, z, D3Q19_C)],VSUB(vpdf_C,VMUL(vomegaEven,VSUB(vpdf_C,VMUL(vw_0,vdir_indep_trm))))); + + vw_1_indep = VMUL(vw_1,vdir_indep_trm); + + vui = vuy; + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_N,vpdf_S)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep)); + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_N,vpdf_S)),VMUL(vui,vw_1_x3))); + VSTU(&src[I(x, y + 1, z, D3Q19_N)], VSUB(VSUB(vpdf_N,vevenPart),voddPart)); + VSTU(&src[I(x, y - 1, z, D3Q19_S)], VADD(VSUB(vpdf_S,vevenPart),voddPart)); + + vui = vux; + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_E,vpdf_W)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep)); + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_E,vpdf_W)),VMUL(vui,vw_1_x3))); + VSTU(&src[I(x + 1, y, z, D3Q19_E)], VSUB(VSUB(vpdf_E,vevenPart),voddPart)); + VSTU(&src[I(x - 1, y, z, D3Q19_W)], VADD(VSUB(vpdf_W,vevenPart),voddPart)); + + vui = vuz; + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_T,vpdf_B)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep)); + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_T,vpdf_B)),VMUL(vui,vw_1_x3))); + VSTU(&src[I(x, y, z + 1, D3Q19_T)], VSUB(VSUB(vpdf_T,vevenPart),voddPart)); + VSTU(&src[I(x, y, z - 1, D3Q19_B)], VADD(VSUB(vpdf_B,vevenPart),voddPart)); + + vw_2_indep = VMUL(vw_2,vdir_indep_trm); + + vui = VSUB(vuy,vux); + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_NW,vpdf_SE)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_NW,vpdf_SE)),VMUL(vui,vw_2_x3))); + VSTU(&src[I(x - 1, y + 1, z, D3Q19_NW)], VSUB(VSUB(vpdf_NW,vevenPart),voddPart)); + VSTU(&src[I(x + 1, y - 1, z, D3Q19_SE)], VADD(VSUB(vpdf_SE,vevenPart),voddPart)); + + vui = VADD(vux,vuy); + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_NE,vpdf_SW)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_NE,vpdf_SW)),VMUL(vui,vw_2_x3))); + VSTU(&src[I(x + 1, y + 1, z, D3Q19_NE)], VSUB(VSUB(vpdf_NE,vevenPart),voddPart)); + VSTU(&src[I(x - 1, y - 1, z, D3Q19_SW)], VADD(VSUB(vpdf_SW,vevenPart),voddPart)); + + vui = VSUB(vuz,vux); + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TW,vpdf_BE)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TW,vpdf_BE)),VMUL(vui,vw_2_x3))); + VSTU(&src[I(x - 1, y, z + 1, D3Q19_TW)], VSUB(VSUB(vpdf_TW,vevenPart),voddPart)); + VSTU(&src[I(x + 1, y, z - 1, D3Q19_BE)], VADD(VSUB(vpdf_BE,vevenPart),voddPart)); + + vui = VADD(vux,vuz); + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TE,vpdf_BW)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TE,vpdf_BW)),VMUL(vui,vw_2_x3))); + VSTU(&src[I(x + 1, y, z + 1, D3Q19_TE)], VSUB(VSUB(vpdf_TE,vevenPart),voddPart)); + VSTU(&src[I(x - 1, y, z - 1, D3Q19_BW)], VADD(VSUB(vpdf_BW,vevenPart),voddPart)); + + vui = VSUB(vuz,vuy); + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TS,vpdf_BN)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TS,vpdf_BN)),VMUL(vui,vw_2_x3))); + VSTU(&src[I(x, y - 1, z + 1, D3Q19_TS)], VSUB(VSUB(vpdf_TS,vevenPart),voddPart)); + VSTU(&src[I(x, y + 1, z - 1, D3Q19_BN)], VADD(VSUB(vpdf_BN,vevenPart),voddPart)); + + vui = VADD(vuy,vuz); + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TN,vpdf_BS)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TN,vpdf_BS)),VMUL(vui,vw_2_x3))); + VSTU(&src[I(x, y + 1, z + 1, D3Q19_TN)], VSUB(VSUB(vpdf_TN,vevenPart),voddPart)); + VSTU(&src[I(x, y - 1, z - 1, D3Q19_BS)], VADD(VSUB(vpdf_BS,vevenPart),voddPart)); + + #undef I + } } } // x, y, z + } } } // blocked x, y, z + + return; + +} // }}} diff --git a/src/BenchKernelD3Q19AaVec.h b/src/BenchKernelD3Q19AaVec.h new file mode 100644 index 0000000..a125b04 --- /dev/null +++ b/src/BenchKernelD3Q19AaVec.h @@ -0,0 +1,38 @@ +// -------------------------------------------------------------------------- +// +// Copyright +// Markus Wittmann, 2016-2017 +// RRZE, University of Erlangen-Nuremberg, Germany +// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de +// +// Viktor Haag, 2016 +// LSS, University of Erlangen-Nuremberg, Germany +// +// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). +// +// LbmBenchKernels is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// LbmBenchKernels is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>. +// +// -------------------------------------------------------------------------- +#ifndef __BENCH_KERNEL_D3Q19_AA_VEC__ +#define __BENCH_KERNEL_D3Q19_AA_VEC__ + +#include "Kernel.h" + + +void D3Q19AaVecInit_AaSoA(LatticeDesc * ld, KernelData ** kernelData, Parameters * params); +void D3Q19AaVecDeinit_AaSoA(LatticeDesc * ld, KernelData ** kernelData); + + + +#endif // __BENCH_KERNEL_D3Q19_AA_VEC__ diff --git a/src/BenchKernelD3Q19AaVecCommon.c b/src/BenchKernelD3Q19AaVecCommon.c new file mode 100644 index 0000000..57a9fae --- /dev/null +++ b/src/BenchKernelD3Q19AaVecCommon.c @@ -0,0 +1,656 @@ +// -------------------------------------------------------------------------- +// +// Copyright +// Markus Wittmann, 2016-2017 +// RRZE, University of Erlangen-Nuremberg, Germany +// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de +// +// Viktor Haag, 2016 +// LSS, University of Erlangen-Nuremberg, Germany +// +// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). +// +// LbmBenchKernels is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// LbmBenchKernels is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>. +// +// -------------------------------------------------------------------------- +#include "BenchKernelD3Q19AaVecCommon.h" + +#include "Memory.h" +#include "Vtk.h" +#include "Vector.h" + +#include <inttypes.h> +#include <math.h> + +#ifdef _OPENMP + #include <omp.h> +#endif + +// Forward definition. +void FNAME(D3Q19AaVecKernel)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd); + + +static void FNAME(BcGetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT * pdf) +{ + Assert(kd != NULL); + Assert(kd->PdfsActive != NULL); + Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]); + Assert(pdf != NULL); + + Assert(x >= 0); Assert(y >= 0); Assert(z >= 0); + Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]); + Assert(dir >= 0); Assert(dir < N_D3Q19); + + KernelDataAa * kda = KDA(kd); + + int oX = kd->Offsets[0]; + int oY = kd->Offsets[1]; + int oZ = kd->Offsets[2]; + + if (kda->Iteration % 2 == 0) { + // Pdfs are stored inverse, local PDFs are located in remote nodes + int nx = x - D3Q19_X[dir]; + int ny = y - D3Q19_Y[dir]; + int nz = z - D3Q19_Z[dir]; + + #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir)) + *pdf = kd->PdfsActive[I(nx + oX, ny + oY, nz + oZ, D3Q19_INV[dir])]; + #undef I + } + else { + int nx = x; + int ny = y; + int nz = z; + + #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir)) + *pdf = kd->PdfsActive[I(nx + oX, ny + oY, nz + oZ, dir)]; + #undef I + } + + + return; +} + +static void FNAME(BcSetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT pdf) +{ + Assert(kd != NULL); + Assert(kd->PdfsActive != NULL); + Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]); + + Assert(x >= 0); Assert(y >= 0); Assert(z >= 0); + Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]); + Assert(dir >= 0); Assert(dir < N_D3Q19); + + KernelDataAa * kda = KDA(kd); + + int oX = kd->Offsets[0]; + int oY = kd->Offsets[1]; + int oZ = kd->Offsets[2]; + + if (kda->Iteration % 2 == 0) { + // Pdfs are stored inverse, local PDFs are located in remote nodes + int nx = x - D3Q19_X[dir]; + int ny = y - D3Q19_Y[dir]; + int nz = z - D3Q19_Z[dir]; + + #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir)) + pdf = kd->PdfsActive[I(nx + oX, ny + oY, nz + oZ, D3Q19_INV[dir])] = pdf; + #undef I + } + else { + int nx = x; + int ny = y; + int nz = z; + + #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir)) + kd->PdfsActive[I(nx + oX, ny + oY, nz + oZ, dir)] = pdf; + #undef I + } + + return; +} + + +static void FNAME(GetNode)(KernelData * kd, int x, int y, int z, PdfT * pdfs) +{ + Assert(kd != NULL); + Assert(kd->PdfsActive != NULL); + Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]); + Assert(pdfs != NULL); + + Assert(x >= 0); Assert(y >= 0); Assert(z >= 0); + Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]); + + KernelDataAa * kda = KDA(kd); + + int oX = kd->Offsets[0]; + int oY = kd->Offsets[1]; + int oZ = kd->Offsets[2]; + + + if (kda->Iteration % 2 == 0) { + // Pdfs are stored inverse, local PDFs are located in remote nodes + + #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir)) + #define X(name, idx, idxinv, _x, _y, _z) pdfs[idx] = kd->PdfsActive[I(x + oX - _x, y + oY - _y, z + oZ - _z, D3Q19_INV[idx])]; + D3Q19_LIST + #undef X + #undef I + } + else { + #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir)) + #define X(name, idx, idxinv, _x, _y, _z) pdfs[idx] = kd->PdfsActive[I(x + oX, y + oY, z + oZ, idx)]; + D3Q19_LIST + #undef X + #undef I + + } + +#if 0 // DETECT NANs + + for (int d = 0; d < 19; ++d) { + if (isnan(pdfs[d])) { + printf("%d %d %d %d nan! get node\n", x, y, z, d); + + for (int d2 = 0; d2 < 19; ++d2) { + printf("%d: %e\n", d2, pdfs[d2]); + } + + exit(1); + } + } + +#endif + + return; +} + + +static void FNAME(SetNode)(KernelData * kd, int x, int y, int z, PdfT * pdfs) +{ + Assert(kd != NULL); + Assert(kd->PdfsActive != NULL); + Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]); + Assert(pdfs != NULL); + + Assert(x >= 0); Assert(y >= 0); Assert(z >= 0); + Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]); + + KernelDataAa * kda = KDA(kd); + + int oX = kd->Offsets[0]; + int oY = kd->Offsets[1]; + int oZ = kd->Offsets[2]; + + if (kda->Iteration % 2 == 0) { + // Pdfs are stored inverse, local PDFs are located in remote nodes + + #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir)) + #define X(name, idx, idxinv, _x, _y, _z) kd->PdfsActive[I(x + oX - _x, y + oY - _y, z + oZ - _z, D3Q19_INV[idx])] = pdfs[idx]; + D3Q19_LIST + #undef X + #undef I + } + else { + #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir)) + #define X(name, idx, idxinv, _x, _y, _z) kd->PdfsActive[I(x + oX, y + oY, z + oZ, idx)] = pdfs[idx]; + D3Q19_LIST + #undef X + #undef I + } + return; +} + + +static void ParameterUsage() +{ + printf("Kernel parameters:\n"); + printf(" [-blk <n>] [-blk-[xyz] <n>]\n"); + + return; +} + +static void ParseParameters(Parameters * params, int * blk) +{ + Assert(blk != NULL); + + blk[0] = 0; blk[1] = 0; blk[2] = 0; + + #define ARG_IS(param) (!strcmp(params->KernelArgs[i], param)) + #define NEXT_ARG_PRESENT() \ + do { \ + if (i + 1 >= params->nKernelArgs) { \ + printf("ERROR: argument %s requires a parameter.\n", params->KernelArgs[i]); \ + exit(1); \ + } \ + } while (0) + + + for (int i = 0; i < params->nKernelArgs; ++i) { + if (ARG_IS("-blk") || ARG_IS("--blk")) { + NEXT_ARG_PRESENT(); + + int tmp = strtol(params->KernelArgs[++i], NULL, 0); + + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); + exit(1); + } + + blk[0] = blk[1] = blk[2] = tmp; + } + else if (ARG_IS("-blk-x") || ARG_IS("--blk-x")) { + NEXT_ARG_PRESENT(); + + int tmp = strtol(params->KernelArgs[++i], NULL, 0); + + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); + exit(1); + } + + blk[0] = tmp; + } + else if (ARG_IS("-blk-y") || ARG_IS("--blk-y")) { + NEXT_ARG_PRESENT(); + + int tmp = strtol(params->KernelArgs[++i], NULL, 0); + + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); + exit(1); + } + + blk[1] = tmp; + } + else if (ARG_IS("-blk-z") || ARG_IS("--blk-z")) { + NEXT_ARG_PRESENT(); + + int tmp = strtol(params->KernelArgs[++i], NULL, 0); + + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); + exit(1); + } + + blk[2] = tmp; + } + else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) { + ParameterUsage(); + exit(1); + } + else { + printf("ERROR: unknown kernel parameter.\n"); + ParameterUsage(); + exit(1); + } + } + + #undef ARG_IS + #undef NEXT_ARG_PRESENT + + return; +} + + +void FNAME(D3Q19AaVecInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params) +{ + KernelDataAa * kda = NULL; + MemAlloc((void **)&kda, sizeof(KernelDataAa)); + + kda->Blk[0] = 0; kda->Blk[1] = 0; kda->Blk[2] = 0; + kda->Iteration = -1; + + KernelData * kd = &kda->kd; + *kernelData = kd; + + kd->nObstIndices = ld->nObst; + + // Ajust the dimensions according to padding, if used. + kd->Dims[0] = ld->Dims[0]; + kd->Dims[1] = ld->Dims[1]; + kd->Dims[2] = ld->Dims[2]; + + + int * lDims = ld->Dims; + int * gDims = kd->GlobalDims; + + Assert(VSIZE <= 4); + + // TODO: only add enough ghost cells so we can compute everything vectorized. + gDims[0] = lDims[0] + 2; + gDims[1] = lDims[1] + 2; + // TODO: fix this for aa-vec2-soa + gDims[2] = lDims[2] + 4; // one ghost cell in front, one in the back, plus at most two at the back for VSIZE = 4 + + kd->Offsets[0] = 1; + kd->Offsets[1] = 1; + kd->Offsets[2] = 1; + + int lX = lDims[0]; + int lY = lDims[1]; + int lZ = lDims[2]; + + int gX = gDims[0]; + int gY = gDims[1]; + int gZ = gDims[2]; + + int oX = kd->Offsets[0]; + int oY = kd->Offsets[1]; + int oZ = kd->Offsets[2]; + + int blk[3] = { 0 }; + + int nCells = gX * gY * gZ; + + PdfT * pdfs[2] = { NULL, NULL }; + + ParseParameters(params, blk); + + if (blk[2] % VSIZE != 0) { + blk[2] -= blk[2] % VSIZE; + printf("WARNING: blocking factor for z direction must be a multiple of VSIZE = %d, adjusting it to %d.\n", VSIZE, blk[2]); + } + + if (blk[0] == 0) blk[0] = gX; + if (blk[1] == 0) blk[1] = gY; + if (blk[2] == 0) blk[2] = gZ; + + printf("# blocking x: %3d y: %3d z: %3d\n", blk[0], blk[1], blk[2]); + + kda->Blk[0] = blk[0]; kda->Blk[1] = blk[1]; kda->Blk[2] = blk[2]; + + + printf("# allocating data for %d LB nodes with padding (%lu bytes = %f MiB for the single lattice)\n", + nCells, + sizeof(PdfT) * nCells * N_D3Q19, + sizeof(PdfT) * nCells * N_D3Q19 / 1024.0 / 1024.0); + +#define PAGE_4K 4096 + + MemAllocAligned((void **)&pdfs[0], sizeof(PdfT) * nCells * N_D3Q19, PAGE_4K); + + kd->Pdfs[0] = pdfs[0]; + kd->Pdfs[1] = NULL; + + + // Initialize PDFs with some (arbitrary) data for correct NUMA placement. + // This depends on the chosen data layout. + // The structure of the loop should resemble the same "execution layout" + // as in the kernel! + + int nThreads; +#ifdef _OPENMP + nThreads = omp_get_max_threads(); +#endif + +#ifdef _OPENMP + #pragma omp parallel for \ + shared(gDims, pdfs, \ + oX, oY, oZ, lX, lY, lZ, blk, nThreads, ld) +#endif + for (int i = 0; i < nThreads; ++i) { + + int threadStartX = lX / nThreads * i; + int threadEndX = lX / nThreads * (i + 1); + + if (lX % nThreads > 0) { + if (lX % nThreads > i) { + threadStartX += i; + threadEndX += i + 1; + } + else { + threadStartX += lX % nThreads; + threadEndX += lX % nThreads; + } + } + + for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) { + for (int bY = oY; bY < lY + oY; bY += blk[1]) { + for (int bZ = oZ; bZ < lZ + oZ; bZ += blk[2]) { + + int eX = MIN(bX + blk[0], threadEndX + oX); + int eY = MIN(bY + blk[1], lY + oY); + int eZ = MIN(bZ + blk[2], lZ + oZ); + + // printf("%d: %d-%d %d-%d %d-%d %d - %d\n", omp_get_thread_num(), bZ, eZ, bY, eY, bX, eX, threadStartX, threadEndX); + + for (int x = bX; x < eX; ++x) { + for (int y = bY; y < eY; ++y) { + for (int z = bZ; z < eZ; ++z) { + + for (int d = 0; d < N_D3Q19; ++d) { + pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = -50.0; + } + + } } } // x, y, z + } } } // bx, by, bz + } + + + // Initialize all PDFs to some standard value. + for (int x = oX; x < lX + oX; ++x) { + for (int y = oY; y < lY + oY; ++y) { + for (int z = oZ; z < lZ + oZ; ++z) { + for (int d = 0; d < N_D3Q19; ++d) { + pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = 0.0; + } + } } } // x, y, z + + + // Count how many *PDFs* need bounce back treatment. + + uint64_t nPdfs = ((uint64_t)19) * gX * gY * gZ; + + if (nPdfs > ((2LU << 31) - 1)) { + printf("ERROR: number of PDFs exceed 2^31.\n"); + exit(1); + } + + // Compiler bug? Incorrect computation of nBounceBackPdfs when using icc 15.0.2. + // Works when declaring nBounceBackPdfs as int64_t or using volatile. + volatile int nBounceBackPdfs = 0; + // int64_t nBounceBackPdfs = 0; + int nx, ny, nz, px, py, pz; + + + for (int x = 0; x < lX; ++x) { + for (int y = 0; y < lY; ++y) { + for (int z = 0; z < lZ; ++z) { + + if (ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) { + for (int d = 0; d < N_D3Q19; ++d) { + nx = x - D3Q19_X[d]; + ny = y - D3Q19_Y[d]; + nz = z - D3Q19_Z[d]; + + // Check if neighbor is inside the lattice. + // if(nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) { + // continue; + // } + if ((nx < 0 || nx >= lX) && ld->PeriodicX) { + ++nBounceBackPdfs; // Compiler bug --> see above + } + else if ((ny < 0 || ny >= lY) && ld->PeriodicY) { + ++nBounceBackPdfs; // Compiler bug --> see above + } + else if ((nz < 0 || nz >= lZ) && ld->PeriodicZ) { + ++nBounceBackPdfs; // Compiler bug --> see above + } + else if (nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) { + continue; + } + else if (ld->Lattice[L_INDEX_4(lDims, nx, ny, nz)] == LAT_CELL_OBSTACLE) { + ++nBounceBackPdfs; // Compiler bug --> see above + } + } + } + } + } + } + + printf("# allocating %d indices for bounce back pdfs (%s for source and destination array)\n", nBounceBackPdfs, ByteToHuman(sizeof(int) * nBounceBackPdfs * 2)); + + MemAlloc((void **) & (kd->BounceBackPdfsSrc), sizeof(int) * nBounceBackPdfs + 100); + MemAlloc((void **) & (kd->BounceBackPdfsDst), sizeof(int) * nBounceBackPdfs + 100); + + kd->nBounceBackPdfs = nBounceBackPdfs; + nBounceBackPdfs = 0; + + int srcIndex; + int dstIndex; + + // TODO: currently this is not NUMA-aware + // - maybe use the same blocking as for lattice initialization? + // - do place the bounce back index vector parallel? + + for (int x = 0; x < lX; ++x) { + for (int y = 0; y < lY; ++y) { + for (int z = 0; z < lZ; ++z) { + + if (ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) { + for (int d = 0; d < N_D3Q19; ++d) { + nx = x + D3Q19_X[d]; + ny = y + D3Q19_Y[d]; + nz = z + D3Q19_Z[d]; + + if ( ((nx < 0 || nx >= lX) && ld->PeriodicX) || + ((ny < 0 || ny >= lY) && ld->PeriodicY) || + ((nz < 0 || nz >= lZ) && ld->PeriodicZ) + ){ + // For periodicity: + + // We assume we have finished odd time step (accessing neighbor PDFs) and are + // before executing the even time step (accessing local PDFs only). + + // Assuming we are at the most east position of the lattice. Through the odd + // time step propagation has put a PDF in the east slot of the ghost cell east + // of us, i.e. nx, ny, nz. We copy it to the east slot of the most west node. + + // In case of transition from even to odd time step , src and dest must be + // exchanged. + + + // x periodic + if (nx < 0) { + px = lX - 1; + } + else if (nx >= lX) { + px = 0; + } else { + px = nx; + } + + // y periodic + if (ny < 0) { + py = lY - 1; + } + else if (ny >= lY) { + py = 0; + } else { + py = ny; + } + + // z periodic + if (nz < 0) { + pz = lZ - 1; + } + else if (nz >= lZ) { + pz = 0; + } else { + pz = nz; + } + + if (ld->Lattice[L_INDEX_4(lDims, px, py, pz)] == LAT_CELL_OBSTACLE) { + // See description of bounce back handling below. + srcIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, d); + dstIndex = P_INDEX_5(gDims, x + oX, y + oY, z + oZ, D3Q19_INV[d]); + } + else { + + srcIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, d); + // Put it on the other side back into the domain. + dstIndex = P_INDEX_5(gDims, px + oX, py + oY, pz + oZ, d); + + VerifyMsg(nBounceBackPdfs < kd->nBounceBackPdfs, "nBBPdfs %d < kd->nBBPdfs %d xyz: %d %d %d d: %d\n", nBounceBackPdfs, kd->nBounceBackPdfs, x, y, z, d); + + } + + kd->BounceBackPdfsSrc[nBounceBackPdfs] = srcIndex; + kd->BounceBackPdfsDst[nBounceBackPdfs] = dstIndex; + + ++nBounceBackPdfs; + + } + else if (nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) { + continue; + } + else if (ld->Lattice[L_INDEX_4(lDims, nx, ny, nz)] == LAT_CELL_OBSTACLE) { + // Depending on the time step we are in we have to exchange src and dst index. + + // We build the list for the case, when we have finished odd time step + // (accessing neighbor PDFs) and before we start with the even time step + // (accessing local PDFs only). + + // Assume our neighbor east of us, i.e. nx, ny, nz, is an obstacle cell. + // Then we have to move the east PDF from the obstacle to our west position, + // i.e. the inverse of east. + + // In case of transition from even to odd time step src and dest just + // have to be exchanged. + + srcIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, d); + dstIndex = P_INDEX_5(gDims, x + oX, y + oY, z + oZ, D3Q19_INV[d]); + + VerifyMsg(nBounceBackPdfs < kd->nBounceBackPdfs, "nBBPdfs %d < kd->nBBPdfs %d xyz: %d %d %d d: %d\n", nBounceBackPdfs, kd->nBounceBackPdfs, x, y, z, d); + + kd->BounceBackPdfsSrc[nBounceBackPdfs] = srcIndex; + kd->BounceBackPdfsDst[nBounceBackPdfs] = dstIndex; + + ++nBounceBackPdfs; + } + } + } + } + } + } + + + // Fill remaining KernelData structures + kd->GetNode = FNAME(GetNode); + kd->SetNode = FNAME(SetNode); + + kd->BoundaryConditionsGetPdf = FNAME(BcGetPdf); + kd->BoundaryConditionsSetPdf = FNAME(BcSetPdf); + + kd->Kernel = FNAME(D3Q19AaVecKernel); + + kd->DstPdfs = NULL; + kd->PdfsActive = kd->Pdfs[0]; + + return; +} + +void FNAME(D3Q19AaVecDeinit)(LatticeDesc * ld, KernelData ** kernelData) +{ + MemFree((void **) & ((*kernelData)->Pdfs[0])); + + MemFree((void **) & ((*kernelData)->BounceBackPdfsSrc)); + MemFree((void **) & ((*kernelData)->BounceBackPdfsDst)); + + MemFree((void **)kernelData); + + return; +} + diff --git a/src/BenchKernelD3Q19AaVecCommon.h b/src/BenchKernelD3Q19AaVecCommon.h new file mode 100644 index 0000000..d416d07 --- /dev/null +++ b/src/BenchKernelD3Q19AaVecCommon.h @@ -0,0 +1,93 @@ +// -------------------------------------------------------------------------- +// +// Copyright +// Markus Wittmann, 2016-2017 +// RRZE, University of Erlangen-Nuremberg, Germany +// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de +// +// Viktor Haag, 2016 +// LSS, University of Erlangen-Nuremberg, Germany +// +// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). +// +// LbmBenchKernels is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// LbmBenchKernels is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>. +// +// -------------------------------------------------------------------------- +#ifndef __BENCH_KERNEL_D3Q19_AA_VEC_COMMON_H__ +#define __BENCH_KERNEL_D3Q19_AA_VEC_COMMON_H__ + +// #include "BenchKernelD3Q19AaCommon.h" + +#include "Kernel.h" + +typedef struct KernelDataAa_ +{ + KernelData kd; + int Blk[3]; // Blocking in X, Y, and Z direction, value of 0 disables blocking. + int Iteration; // Current iteration number. +} KernelDataAa; + +// Macro for casting KernelData * to KernelDataAa *. +#define KDA(_x_) ((KernelDataAa *)(_x_)) + +// Build a function name extended by the propagation model name and the data layout. +// FNANEM(test) will be expanded to test_PushSoA if DATA_LAYOUT_NAME is defined +// as SoA and PROP_MODEL is defined as Push. +#define FNAME(functionName) JOIN(JOIN(functionName,_),JOIN(PROP_MODEL_NAME,DATA_LAYOUT_NAME)) + +#ifndef DATA_LAYOUT_SOA + #error Only DATA_LAYOUT_SOA is supported. +#endif + +#ifndef PROP_MODEL_AA + #error Only PROP_MODEL_AA is supported. +#endif + +// ----------------------------------------------------------------------- +// Index function for accesssing PDF array for different data layouts. + +#define P_INDEX_5 FNAME(PINDEX5) + +static inline int FNAME(PINDEX5)(int dims[3], int x, int y, int z, int d) +{ + Assert(dims[0] > 0); + Assert(dims[1] > 0); + Assert(dims[2] > 0); + + Assert(x >= 0); + Assert(x < dims[0]); + Assert(y >= 0); + Assert(y < dims[1]); + Assert(z >= 0); + Assert(z < dims[2]); + Assert(d >= 0); +#ifdef D3Q19 + Assert(d < N_D3Q19); +#else + #error Not implemented for this discretization. +#endif + +#ifdef DATA_LAYOUT_SOA + return d * dims[0] * dims[1] * dims[2] + x * dims[1] * dims[2] + y * dims[2] + z; +// #elif DATA_LAYOUT_AOS +// return x * dims[1] * dims[2] * N_D3Q19 + y * dims[2] * N_D3Q19 + z * N_D3Q19 + d; +#else + #error P_INDEX_5 function no implemented for chosen data layout. +#endif +} + + + +#endif // __BENCH_KERNEL_D3Q19_AA_VEC_COMMON_H__ + diff --git a/src/BenchKernelD3Q19Common.c b/src/BenchKernelD3Q19Common.c index e697bc8..8f75649 100644 --- a/src/BenchKernelD3Q19Common.c +++ b/src/BenchKernelD3Q19Common.c @@ -32,6 +32,9 @@ #include <inttypes.h> #include <math.h> +#ifdef _OPENMP + #include <omp.h> +#endif // Forward definition. void FNAME(D3Q19Kernel)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd); @@ -224,8 +227,8 @@ static void ParseParameters(Parameters * params, int * blk) int tmp = strtol(params->KernelArgs[++i], NULL, 0); - if (tmp <= 0) { - printf("ERROR: blocking parameter must be > 0.\n"); + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); exit(1); } @@ -236,8 +239,8 @@ static void ParseParameters(Parameters * params, int * blk) int tmp = strtol(params->KernelArgs[++i], NULL, 0); - if (tmp <= 0) { - printf("ERROR: blocking parameter must be > 0.\n"); + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); exit(1); } @@ -248,8 +251,8 @@ static void ParseParameters(Parameters * params, int * blk) int tmp = strtol(params->KernelArgs[++i], NULL, 0); - if (tmp <= 0) { - printf("ERROR: blocking parameter must be > 0.\n"); + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); exit(1); } @@ -260,8 +263,8 @@ static void ParseParameters(Parameters * params, int * blk) int tmp = strtol(params->KernelArgs[++i], NULL, 0); - if (tmp <= 0) { - printf("ERROR: blocking parameter must be > 0.\n"); + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); exit(1); } @@ -285,7 +288,7 @@ static void ParseParameters(Parameters * params, int * blk) } -void FNAME(D3Q19BlkInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params) +static void D3Q19InternalInit(int blocking, LatticeDesc * ld, KernelData ** kernelData, Parameters * params) { KernelDataEx * kdex = NULL; MemAlloc((void **)&kdex, sizeof(KernelDataEx)); @@ -332,7 +335,17 @@ void FNAME(D3Q19BlkInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters PdfT * pdfs[2]; - ParseParameters(params, blk); + if (blocking) { + ParseParameters(params, blk); + } + else { + if (params->nKernelArgs != 0) { + printf("ERROR: unknown kernel parameter.\n"); + printf("This kernels accepts no parameters.\n"); + exit(1); + } + } + if (blk[0] == 0) blk[0] = gX; if (blk[1] == 0) blk[1] = gY; @@ -348,8 +361,13 @@ void FNAME(D3Q19BlkInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters nCells, 2 * sizeof(PdfT) * nCells * N_D3Q19, 2 * sizeof(PdfT) * nCells * N_D3Q19 / 1024.0 / 1024.0); - MemAlloc((void **)&pdfs[0], sizeof(PdfT) * nCells * N_D3Q19); - MemAlloc((void **)&pdfs[1], sizeof(PdfT) * nCells * N_D3Q19); + // MemAlloc((void **)&pdfs[0], sizeof(PdfT) * nCells * N_D3Q19); + // MemAlloc((void **)&pdfs[1], sizeof(PdfT) * nCells * N_D3Q19); + MemAllocAligned((void **)&pdfs[0], sizeof(PdfT) * nCells * N_D3Q19, 2 * 1024 * 1024); + MemAllocAligned((void **)&pdfs[1], sizeof(PdfT) * nCells * N_D3Q19, 2 * 1024 * 1024); + + printf("# pdfs[0] = %p\n", pdfs[0]); + printf("# pdfs[1] = %p\n", pdfs[1]); kd->Pdfs[0] = pdfs[0]; kd->Pdfs[1] = pdfs[1]; @@ -358,39 +376,82 @@ void FNAME(D3Q19BlkInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters // This depends on the chosen data layout. // The structure of the loop should resemble the same "execution layout" // as in the kernel! -#ifdef _OPENMP - #pragma omp parallel for collapse(3) -#endif - for (int bZ = 0; bZ < gZ; bZ += blk[2]) { - for (int bY = 0; bY < gY; bY += blk[1]) { - for (int bX = 0; bX < gX; bX += blk[0]) { + if (blocking) { + int nX = ld->Dims[0]; + int nY = ld->Dims[1]; + int nZ = ld->Dims[2]; - // Must do everything here, else it would break collapse. - int eZ = MIN(bZ + blk[2], gZ); - int eY = MIN(bY + blk[1], gY); - int eX = MIN(bX + blk[0], gX); + int nThreads = 1; - for (int z = bZ; z < eZ; ++z) { - for (int y = bY; y < eY; ++y) { - for (int x = bX; x < eX; ++x) { + #ifdef _OPENMP + nThreads = omp_get_max_threads(); + #endif - for (int d = 0; d < N_D3Q19; ++d) { - pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = 1.0; - pdfs[1][P_INDEX_5(gDims, x, y, z, d)] = 1.0; - } + #ifdef _OPENMP + #pragma omp parallel for default(none) \ + shared(pdfs, gDims, nX, nY, nZ, oX, oY, oZ, blk, nThreads) + #endif + for (int i = 0; i < nThreads; ++i) { + + int threadStartX = nX / nThreads * i; + int threadEndX = nX / nThreads * (i + 1); + if (nX % nThreads > 0) { + if (nX % nThreads > i) { + threadStartX += i; + threadEndX += i + 1; + } + else { + threadStartX += nX % nThreads; + threadEndX += nX % nThreads; + } + } + + for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) { + for (int bY = oY; bY < nY + oY; bY += blk[1]) { + for (int bZ = oZ; bZ < nZ + oZ; bZ += blk[2]) { + + int eZ = MIN(bZ + blk[2], nZ + oZ); + int eY = MIN(bY + blk[1], nY + oY); + int eX = MIN(bX + blk[0], threadEndX + oX); + + for (int x = bX; x < eX; ++x) { + for (int y = bY; y < eY; ++y) { + for (int z = bZ; z < eZ; ++z) { + for (int d = 0; d < N_D3Q19; ++d) { + pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = 1.0; + pdfs[1][P_INDEX_5(gDims, x, y, z, d)] = 1.0; + } + } + } } } } } } + + } + else { + #ifdef _OPENMP + #pragma omp parallel for collapse(3) + #endif + for (int x = 0; x < gX; ++x) { + for (int y = 0; y < gY; ++y) { + for (int z = 0; z < gZ; ++z) { + for (int d = 0; d < N_D3Q19; ++d) { + pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = 1.0; + pdfs[1][P_INDEX_5(gDims, x, y, z, d)] = 1.0; + } + } + } + } } // Initialize all PDFs to some standard value. - for (int z = 0; z < gZ; ++z) { + for (int x = 0; x < gX; ++x) { for (int y = 0; y < gY; ++y) { - for (int x = 0; x < gX; ++x) { + for (int z = 0; z < gZ; ++z) { for (int d = 0; d < N_D3Q19; ++d) { pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = 0.0; pdfs[1][P_INDEX_5(gDims, x, y, z, d)] = 0.0; @@ -417,9 +478,9 @@ void FNAME(D3Q19BlkInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters // TODO: apply blocking? - for (int z = 0; z < lZ; ++z) { + for (int x = 0; x < lX; ++x) { for (int y = 0; y < lY; ++y) { - for (int x = 0; x < lX; ++x) { + for (int z = 0; z < lZ; ++z) { if (ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) { for (int d = 0; d < N_D3Q19; ++d) { @@ -471,9 +532,9 @@ void FNAME(D3Q19BlkInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters int srcIndex; int dstIndex; - for (int z = 0; z < lZ; ++z) { + for (int x = 0; x < lX; ++x) { for (int y = 0; y < lY; ++y) { - for (int x = 0; x < lX; ++x) { + for (int z = 0; z < lZ; ++z) { if (ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) { for (int d = 0; d < N_D3Q19; ++d) { @@ -598,7 +659,12 @@ void FNAME(D3Q19BlkInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters kd->BoundaryConditionsGetPdf = FNAME(BcGetPdf); kd->BoundaryConditionsSetPdf = FNAME(BcSetPdf); - kd->Kernel = FNAME(D3Q19BlkKernel); + if (blocking) { + kd->Kernel = FNAME(D3Q19BlkKernel); + } + else { + kd->Kernel = FNAME(D3Q19Kernel); + } kd->DstPdfs = NULL; kd->PdfsActive = kd->Pdfs[0]; @@ -606,6 +672,13 @@ void FNAME(D3Q19BlkInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters return; } + +void FNAME(D3Q19BlkInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params) +{ + D3Q19InternalInit(1, ld, kernelData, params); +} + + void FNAME(D3Q19BlkDeinit)(LatticeDesc * ld, KernelData ** kernelData) { MemFree((void **) & ((*kernelData)->Pdfs[0])); @@ -624,27 +697,7 @@ void FNAME(D3Q19BlkDeinit)(LatticeDesc * ld, KernelData ** kernelData) void FNAME(D3Q19Init)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params) { - Parameters p; - - if (params->nKernelArgs != 0) { - printf("ERROR: unknown kernel parameter.\n"); - printf("This kernels accepts no parameters.\n"); - exit(1); - } - - // Setup an empty parameters structure. - p.nArgs = params->nArgs; - p.Args = params->Args; - p.nKernelArgs = 0; - p.KernelArgs = NULL; - - // Call init routine for blocking kernel and override the - // kernel function to be called later on. - FNAME(D3Q19BlkInit)(ld, kernelData, &p); - - (*kernelData)->Kernel = FNAME(D3Q19Kernel); - - return; + D3Q19InternalInit(0, ld, kernelData, params); } diff --git a/src/BenchKernelD3Q19Common.h b/src/BenchKernelD3Q19Common.h index cceddca..268bfed 100644 --- a/src/BenchKernelD3Q19Common.h +++ b/src/BenchKernelD3Q19Common.h @@ -74,9 +74,9 @@ static inline int FNAME(PINDEX5)(int dims[3], int x, int y, int z, int d) #endif #ifdef DATA_LAYOUT_SOA - return d * dims[0] * dims[1] * dims[2] + z * dims[0] * dims[1] + y * dims[0] + x; + return d * dims[0] * dims[1] * dims[2] + x * dims[1] * dims[2] + y * dims[2] + z; #elif DATA_LAYOUT_AOS - return z * dims[0] * dims[1] * N_D3Q19 + y * dims[0] * N_D3Q19 + x * N_D3Q19 + d; + return x * dims[1] * dims[2] * N_D3Q19 + y * dims[2] * N_D3Q19 + z * N_D3Q19 + d; #else #error P_INDEX_5 function no implemented for chosen data layout. #endif diff --git a/src/BenchKernelD3Q19List.c b/src/BenchKernelD3Q19List.c index 7922b7a..e01853a 100644 --- a/src/BenchKernelD3Q19List.c +++ b/src/BenchKernelD3Q19List.c @@ -28,6 +28,7 @@ #include "Memory.h" #include "Vtk.h" +#include "LikwidIf.h" #include <inttypes.h> #include <math.h> @@ -97,6 +98,7 @@ void FNAME(D3Q19ListKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData for(int iter = 0; iter < maxIterations; ++iter) { + X_LIKWID_START("list-os"); #ifdef _OPENMP #pragma omp parallel for default(none) \ @@ -307,6 +309,8 @@ void FNAME(D3Q19ListKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData #undef I } // loop over fluid nodes + X_LIKWID_STOP("list-os"); + #ifdef VERIFICATION kd->PdfsActive = dst; KernelAddBodyForce(kd, ld, cd); diff --git a/src/BenchKernelD3Q19ListAaCommon.c b/src/BenchKernelD3Q19ListAaCommon.c index d2bcb52..c1e48a5 100644 --- a/src/BenchKernelD3Q19ListAaCommon.c +++ b/src/BenchKernelD3Q19ListAaCommon.c @@ -28,6 +28,7 @@ #include "Memory.h" #include "Vtk.h" +#include "Padding.h" #include <math.h> @@ -222,15 +223,18 @@ static void ParameterUsage() { printf("Kernel parameters:\n"); printf(" [-blk <n>] [-blk-[xyz] <n>]\n"); - +#ifdef DATA_LAYOUT_SOA + printf(" [-pad auto|modulus_1+offset_1(,modulus_n+offset_n)*]\n"); +#endif return; } -static void ParseParameters(Parameters * params, int * blk) +static void ParseParameters(Parameters * params, int * blk, PadInfo ** padInfo) { Assert(blk != NULL); blk[0] = 0; blk[1] = 0; blk[2] = 0; + *padInfo = NULL; #define ARG_IS(param) (!strcmp(params->KernelArgs[i], param)) #define NEXT_ARG_PRESENT() \ @@ -248,8 +252,8 @@ static void ParseParameters(Parameters * params, int * blk) int tmp = strtol(params->KernelArgs[++i], NULL, 0); - if (tmp <= 0) { - printf("ERROR: blocking parameter must be > 0.\n"); + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); exit(1); } @@ -260,8 +264,8 @@ static void ParseParameters(Parameters * params, int * blk) int tmp = strtol(params->KernelArgs[++i], NULL, 0); - if (tmp <= 0) { - printf("ERROR: blocking parameter must be > 0.\n"); + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); exit(1); } @@ -272,8 +276,8 @@ static void ParseParameters(Parameters * params, int * blk) int tmp = strtol(params->KernelArgs[++i], NULL, 0); - if (tmp <= 0) { - printf("ERROR: blocking parameter must be > 0.\n"); + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); exit(1); } @@ -284,13 +288,20 @@ static void ParseParameters(Parameters * params, int * blk) int tmp = strtol(params->KernelArgs[++i], NULL, 0); - if (tmp <= 0) { - printf("ERROR: blocking parameter must be > 0.\n"); + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); exit(1); } blk[2] = tmp; } +#ifdef DATA_LAYOUT_SOA + else if (ARG_IS("-pad") || ARG_IS("--pad")) { + NEXT_ARG_PRESENT(); + + *padInfo = PadInfoFromStr(params->KernelArgs[++i]); + } +#endif else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) { ParameterUsage(); exit(1); @@ -346,6 +357,11 @@ void FNAME(D3Q19ListAaInit)(LatticeDesc * ld, KernelData ** kernelData, Paramete kdl->nFluid = -1; #endif + int blk[3] = { 0 }; + PadInfo * padInfo = NULL; + + ParseParameters(params, blk, &padInfo); + // Ajust the dimensions according to padding, if used. kd->Dims[0] = kd->GlobalDims[0] = ld->Dims[0]; kd->Dims[1] = kd->GlobalDims[1] = ld->Dims[1]; @@ -358,18 +374,21 @@ void FNAME(D3Q19ListAaInit)(LatticeDesc * ld, KernelData ** kernelData, Paramete int lZ = lDims[2]; int nTotalCells = lX * lY * lZ; - int nCells = ld->nFluid; // TODO: + padding + int nCells = ld->nFluid; int nFluid = ld->nFluid; +#ifdef DATA_LAYOUT_SOA + { + nCells = PadCellsAndReport(nCells, sizeof(PdfT), &padInfo); + PadInfoFree(padInfo); padInfo = NULL; + } +#endif + kdl->nCells = nCells; kdl->nFluid = nFluid; PdfT * pdfs[2]; - int blk[3] = { 0 }; - - ParseParameters(params, blk); - if (blk[0] == 0) blk[0] = lX; if (blk[1] == 0) blk[1] = lY; if (blk[2] == 0) blk[2] = lZ; @@ -443,18 +462,17 @@ void FNAME(D3Q19ListAaInit)(LatticeDesc * ld, KernelData ** kernelData, Paramete // Blocking is implemented via setup of the adjacency list. The kernel later will // walk through the lattice blocked automatically. - for (int bZ = 0; bZ < lZ; bZ += blk[2]) { - for (int bY = 0; bY < lY; bY += blk[1]) { for (int bX = 0; bX < lX; bX += blk[0]) { + for (int bY = 0; bY < lY; bY += blk[1]) { + for (int bZ = 0; bZ < lZ; bZ += blk[2]) { int eX = MIN(bX + blk[0], lX); int eY = MIN(bY + blk[1], lY); int eZ = MIN(bZ + blk[2], lZ); - - for (int z = bZ; z < eZ; ++z) { - for (int y = bY; y < eY; ++y) { for (int x = bX; x < eX; ++x) { + for (int y = bY; y < eY; ++y) { + for (int z = bZ; z < eZ; ++z) { latticeIndex = L_INDEX_4(lDims, x, y, z); @@ -492,7 +510,15 @@ void FNAME(D3Q19ListAaInit)(LatticeDesc * ld, KernelData ** kernelData, Paramete // Loop over all fluid nodes and compute the indices to the neighboring // PDFs for configure data layout (AoS/SoA). - // TODO: Parallelized loop to ensure correct NUMA placement. + #ifdef _OPENMP + #pragma omp parallel for + #endif + for (int index = 0; index < nFluid; ++index) { + for (int d = 0; d < N_D3Q19_IDX; ++d) { + adjList[index * N_D3Q19_IDX + d] = -1; + } + } + // #ifdef _OPENMP --> add line continuation // #pragma omp parallel for default(none) // shared(nFluid, nCells, coords, D3Q19_INV, D3Q19_X, D3Q19_Y, D3Q19_Z, diff --git a/src/BenchKernelD3Q19ListAaPv.c b/src/BenchKernelD3Q19ListAaPv.c index c522252..b1dee16 100644 --- a/src/BenchKernelD3Q19ListAaPv.c +++ b/src/BenchKernelD3Q19ListAaPv.c @@ -728,9 +728,9 @@ static void KernelOdd(LatticeDesc * ld, KernelData * kernelData, CaseData * cd) //voddPart = vomegaOdd * (VONE_HALF * (vpdf_TS - vpdf_BN) - vui * vw_2_x3); voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TS,vpdf_BN)),VMUL(vui,vw_2_x3))); //src[ADJ_LIST(D3Q19_TS)] =[UA] vpdf_TS - vevenPart - voddPart; - VSTU(ppdf_TN, VSUB(VSUB(vpdf_TS,vevenPart),voddPart)); + VSTU(ppdf_BN, VSUB(VSUB(vpdf_TS,vevenPart),voddPart)); //src[ADJ_LIST(D3Q19_BN)] =[UA] vpdf_BN - vevenPart + voddPart; - VSTU(ppdf_BN, VADD(VSUB(vpdf_BN,vevenPart),voddPart)); + VSTU(ppdf_TS, VADD(VSUB(vpdf_BN,vevenPart),voddPart)); //vui = vuy + vuz; vui = VADD(vuy,vuz); @@ -783,20 +783,20 @@ static void KernelOdd(LatticeDesc * ld, KernelData * kernelData, CaseData * cd) ui = uy; evenPart = omegaEven * (0.5 * (pdf_N + pdf_S) - ui * ui * w_1_nine_half - w_1_indep); oddPart = omegaOdd * (0.5 * (pdf_N - pdf_S) - ui * w_1_x3); - src[ADJ_LIST(D3Q19_N)] = pdf_N - evenPart - oddPart; - src[ADJ_LIST(D3Q19_S)] = pdf_S - evenPart + oddPart; + *ppdf_S = pdf_N - evenPart - oddPart; + *ppdf_N = pdf_S - evenPart + oddPart; ui = ux; evenPart = omegaEven * (0.5 * (pdf_E + pdf_W) - ui * ui * w_1_nine_half - w_1_indep); oddPart = omegaOdd * (0.5 * (pdf_E - pdf_W) - ui * w_1_x3); - src[ADJ_LIST(D3Q19_E)] = pdf_E - evenPart - oddPart; - src[ADJ_LIST(D3Q19_W)] = pdf_W - evenPart + oddPart; + *ppdf_W = pdf_E - evenPart - oddPart; + *ppdf_E = pdf_W - evenPart + oddPart; ui = uz; evenPart = omegaEven * (0.5 * (pdf_T + pdf_B) - ui * ui * w_1_nine_half - w_1_indep); oddPart = omegaOdd * (0.5 * (pdf_T - pdf_B) - ui * w_1_x3); - src[ADJ_LIST(D3Q19_T)] = pdf_T - evenPart - oddPart; - src[ADJ_LIST(D3Q19_B)] = pdf_B - evenPart + oddPart; + *ppdf_B = pdf_T - evenPart - oddPart; + *ppdf_T = pdf_B - evenPart + oddPart; // direction: w_2 w_2_indep = w_2 * dir_indep_trm; @@ -804,38 +804,38 @@ static void KernelOdd(LatticeDesc * ld, KernelData * kernelData, CaseData * cd) ui = -ux + uy; evenPart = omegaEven * (0.5 * (pdf_NW + pdf_SE) - ui * ui * w_2_nine_half - w_2_indep); oddPart = omegaOdd * (0.5 * (pdf_NW - pdf_SE) - ui * w_2_x3); - src[ADJ_LIST(D3Q19_NW)] = pdf_NW - evenPart - oddPart; - src[ADJ_LIST(D3Q19_SE)] = pdf_SE - evenPart + oddPart; + *ppdf_SE = pdf_NW - evenPart - oddPart; + *ppdf_NW = pdf_SE - evenPart + oddPart; ui = ux + uy; evenPart = omegaEven * (0.5 * (pdf_NE + pdf_SW) - ui * ui * w_2_nine_half - w_2_indep); oddPart = omegaOdd * (0.5 * (pdf_NE - pdf_SW) - ui * w_2_x3); - src[ADJ_LIST(D3Q19_NE)] = pdf_NE - evenPart - oddPart; - src[ADJ_LIST(D3Q19_SW)] = pdf_SW - evenPart + oddPart; + *ppdf_SW = pdf_NE - evenPart - oddPart; + *ppdf_NE = pdf_SW - evenPart + oddPart; ui = -ux + uz; evenPart = omegaEven * (0.5 * (pdf_TW + pdf_BE) - ui * ui * w_2_nine_half - w_2_indep); oddPart = omegaOdd * (0.5 * (pdf_TW - pdf_BE) - ui * w_2_x3); - src[ADJ_LIST(D3Q19_TW)] = pdf_TW - evenPart - oddPart; - src[ADJ_LIST(D3Q19_BE)] = pdf_BE - evenPart + oddPart; + *ppdf_BE = pdf_TW - evenPart - oddPart; + *ppdf_TW = pdf_BE - evenPart + oddPart; ui = ux + uz; evenPart = omegaEven * (0.5 * (pdf_TE + pdf_BW) - ui * ui * w_2_nine_half - w_2_indep); oddPart = omegaOdd * (0.5 * (pdf_TE - pdf_BW) - ui * w_2_x3); - src[ADJ_LIST(D3Q19_TE)] = pdf_TE - evenPart - oddPart; - src[ADJ_LIST(D3Q19_BW)] = pdf_BW - evenPart + oddPart; + *ppdf_BW = pdf_TE - evenPart - oddPart; + *ppdf_TE = pdf_BW - evenPart + oddPart; ui = -uy + uz; evenPart = omegaEven * (0.5 * (pdf_TS + pdf_BN) - ui * ui * w_2_nine_half - w_2_indep); oddPart = omegaOdd * (0.5 * (pdf_TS - pdf_BN) - ui * w_2_x3); - src[ADJ_LIST(D3Q19_TS)] = pdf_TS - evenPart - oddPart; - src[ADJ_LIST(D3Q19_BN)] = pdf_BN - evenPart + oddPart; + *ppdf_BN = pdf_TS - evenPart - oddPart; + *ppdf_TS = pdf_BN - evenPart + oddPart; ui = uy + uz; evenPart = omegaEven * (0.5 * (pdf_TN + pdf_BS) - ui * ui * w_2_nine_half - w_2_indep); oddPart = omegaOdd * (0.5 * (pdf_TN - pdf_BS) - ui * w_2_x3); - src[ADJ_LIST(D3Q19_TN)] = pdf_TN - evenPart - oddPart; - src[ADJ_LIST(D3Q19_BS)] = pdf_BS - evenPart + oddPart; + *ppdf_BS = pdf_TN - evenPart - oddPart; + *ppdf_TN = pdf_BS - evenPart + oddPart; pointerOffset = 1; } diff --git a/src/BenchKernelD3Q19ListAaPvCommon.c b/src/BenchKernelD3Q19ListAaPvCommon.c index 9cdf0e1..7b479cf 100644 --- a/src/BenchKernelD3Q19ListAaPvCommon.c +++ b/src/BenchKernelD3Q19ListAaPvCommon.c @@ -28,6 +28,7 @@ #include "Memory.h" #include "Vtk.h" +#include "Padding.h" #include <math.h> @@ -225,15 +226,19 @@ static void ParameterUsage() { printf("Kernel parameters:\n"); printf(" [-blk <n>] [-blk-[xyz] <n>]\n"); +#ifdef DATA_LAYOUT_SOA + printf(" [-pad auto|modulus_1+offset_1(,modulus_n+offset_n)*]\n"); +#endif return; } -static void ParseParameters(Parameters * params, int * blk) +static void ParseParameters(Parameters * params, int * blk, PadInfo ** padInfo) { Assert(blk != NULL); blk[0] = 0; blk[1] = 0; blk[2] = 0; + *padInfo = NULL; #define ARG_IS(param) (!strcmp(params->KernelArgs[i], param)) #define NEXT_ARG_PRESENT() \ @@ -251,8 +256,8 @@ static void ParseParameters(Parameters * params, int * blk) int tmp = strtol(params->KernelArgs[++i], NULL, 0); - if (tmp <= 0) { - printf("ERROR: blocking parameter must be > 0.\n"); + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); exit(1); } @@ -263,8 +268,8 @@ static void ParseParameters(Parameters * params, int * blk) int tmp = strtol(params->KernelArgs[++i], NULL, 0); - if (tmp <= 0) { - printf("ERROR: blocking parameter must be > 0.\n"); + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); exit(1); } @@ -275,8 +280,8 @@ static void ParseParameters(Parameters * params, int * blk) int tmp = strtol(params->KernelArgs[++i], NULL, 0); - if (tmp <= 0) { - printf("ERROR: blocking parameter must be > 0.\n"); + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); exit(1); } @@ -287,13 +292,20 @@ static void ParseParameters(Parameters * params, int * blk) int tmp = strtol(params->KernelArgs[++i], NULL, 0); - if (tmp <= 0) { - printf("ERROR: blocking parameter must be > 0.\n"); + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); exit(1); } blk[2] = tmp; } +#ifdef DATA_LAYOUT_SOA + else if (ARG_IS("-pad") || ARG_IS("--pad")) { + NEXT_ARG_PRESENT(); + + *padInfo = PadInfoFromStr(params->KernelArgs[++i]); + } +#endif else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) { ParameterUsage(); exit(1); @@ -502,6 +514,11 @@ void FNAME(D3Q19ListAaPvInit)(LatticeDesc * ld, KernelData ** kernelData, Parame kdlr->nConsecThreadIndices = 0; #endif + int blk[3] = { 0 }; + PadInfo * padInfo = NULL; + + ParseParameters(params, blk, &padInfo); + // Ajust the dimensions according to padding, if used. kd->Dims[0] = kd->GlobalDims[0] = ld->Dims[0]; kd->Dims[1] = kd->GlobalDims[1] = ld->Dims[1]; @@ -514,17 +531,21 @@ void FNAME(D3Q19ListAaPvInit)(LatticeDesc * ld, KernelData ** kernelData, Parame int lZ = lDims[2]; int nTotalCells = lX * lY * lZ; - int nCells = ld->nFluid; // TODO: + padding + int nCells = ld->nFluid; int nFluid = ld->nFluid; +#ifdef DATA_LAYOUT_SOA + { + nCells = PadCellsAndReport(nCells, sizeof(PdfT), &padInfo); + PadInfoFree(padInfo); padInfo = NULL; + } +#endif + kdl->nCells = nCells; kdl->nFluid = nFluid; PdfT * pdfs[2]; - int blk[3] = { 0 }; - - ParseParameters(params, blk); if (blk[0] == 0) blk[0] = lX; if (blk[1] == 0) blk[1] = lY; @@ -607,18 +628,18 @@ void FNAME(D3Q19ListAaPvInit)(LatticeDesc * ld, KernelData ** kernelData, Parame // Blocking is implemented via setup of the adjacency list. The kernel later will // walk through the lattice blocked automatically. - for (int bZ = 0; bZ < lZ; bZ += blk[2]) { - for (int bY = 0; bY < lY; bY += blk[1]) { for (int bX = 0; bX < lX; bX += blk[0]) { + for (int bY = 0; bY < lY; bY += blk[1]) { + for (int bZ = 0; bZ < lZ; bZ += blk[2]) { int eX = MIN(bX + blk[0], lX); int eY = MIN(bY + blk[1], lY); int eZ = MIN(bZ + blk[2], lZ); - for (int z = bZ; z < eZ; ++z) { - for (int y = bY; y < eY; ++y) { for (int x = bX; x < eX; ++x) { + for (int y = bY; y < eY; ++y) { + for (int z = bZ; z < eZ; ++z) { latticeIndex = L_INDEX_4(lDims, x, y, z); @@ -660,7 +681,15 @@ void FNAME(D3Q19ListAaPvInit)(LatticeDesc * ld, KernelData ** kernelData, Parame // Loop over all fluid nodes and compute the indices to the neighboring // PDFs for configured data layout (AoS/SoA). - // TODO: Parallelized loop to ensure correct NUMA placement. + #ifdef _OPENMP + #pragma omp parallel for + #endif + for (int index = 0; index < nFluid; ++index) { + for (int d = 0; d < N_D3Q19_IDX; ++d) { + adjList[index * N_D3Q19_IDX + d] = -1; + } + } + // #ifdef _OPENMP --> add line continuation // #pragma omp parallel for default(none) // shared(nFluid, nCells, coords, D3Q19_INV, D3Q19_X, D3Q19_Y, D3Q19_Z, diff --git a/src/BenchKernelD3Q19ListAaRiaCommon.c b/src/BenchKernelD3Q19ListAaRiaCommon.c index 7faf37b..c6ba13a 100644 --- a/src/BenchKernelD3Q19ListAaRiaCommon.c +++ b/src/BenchKernelD3Q19ListAaRiaCommon.c @@ -28,6 +28,7 @@ #include "Memory.h" #include "Vtk.h" +#include "Padding.h" #include <math.h> @@ -225,15 +226,19 @@ static void ParameterUsage() { printf("Kernel parameters:\n"); printf(" [-blk <n>] [-blk-[xyz] <n>]\n"); +#ifdef DATA_LAYOUT_SOA + printf(" [-pad auto|modulus_1+offset_1(,modulus_n+offset_n)*]\n"); +#endif return; } -static void ParseParameters(Parameters * params, int * blk) +static void ParseParameters(Parameters * params, int * blk, PadInfo ** padInfo) { Assert(blk != NULL); blk[0] = 0; blk[1] = 0; blk[2] = 0; + *padInfo = NULL; #define ARG_IS(param) (!strcmp(params->KernelArgs[i], param)) #define NEXT_ARG_PRESENT() \ @@ -251,8 +256,8 @@ static void ParseParameters(Parameters * params, int * blk) int tmp = strtol(params->KernelArgs[++i], NULL, 0); - if (tmp <= 0) { - printf("ERROR: blocking parameter must be > 0.\n"); + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); exit(1); } @@ -263,8 +268,8 @@ static void ParseParameters(Parameters * params, int * blk) int tmp = strtol(params->KernelArgs[++i], NULL, 0); - if (tmp <= 0) { - printf("ERROR: blocking parameter must be > 0.\n"); + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); exit(1); } @@ -275,8 +280,8 @@ static void ParseParameters(Parameters * params, int * blk) int tmp = strtol(params->KernelArgs[++i], NULL, 0); - if (tmp <= 0) { - printf("ERROR: blocking parameter must be > 0.\n"); + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); exit(1); } @@ -287,13 +292,20 @@ static void ParseParameters(Parameters * params, int * blk) int tmp = strtol(params->KernelArgs[++i], NULL, 0); - if (tmp <= 0) { - printf("ERROR: blocking parameter must be > 0.\n"); + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); exit(1); } blk[2] = tmp; } +#ifdef DATA_LAYOUT_SOA + else if (ARG_IS("-pad") || ARG_IS("--pad")) { + NEXT_ARG_PRESENT(); + + *padInfo = PadInfoFromStr(params->KernelArgs[++i]); + } +#endif else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) { ParameterUsage(); exit(1); @@ -485,6 +497,11 @@ void FNAME(D3Q19ListAaRiaInit)(LatticeDesc * ld, KernelData ** kernelData, Param kdlr->nConsecThreadIndices = 0; #endif + int blk[3] = { 0 }; + PadInfo * padInfo = NULL; + + ParseParameters(params, blk, &padInfo); + // Ajust the dimensions according to padding, if used. kd->Dims[0] = kd->GlobalDims[0] = ld->Dims[0]; kd->Dims[1] = kd->GlobalDims[1] = ld->Dims[1]; @@ -500,14 +517,18 @@ void FNAME(D3Q19ListAaRiaInit)(LatticeDesc * ld, KernelData ** kernelData, Param int nCells = ld->nFluid; // TODO: + padding int nFluid = ld->nFluid; +#ifdef DATA_LAYOUT_SOA + { + nCells = PadCellsAndReport(nCells, sizeof(PdfT), &padInfo); + PadInfoFree(padInfo); padInfo = NULL; + } +#endif + kdl->nCells = nCells; kdl->nFluid = nFluid; PdfT * pdfs[2]; - int blk[3] = { 0 }; - - ParseParameters(params, blk); if (blk[0] == 0) blk[0] = lX; if (blk[1] == 0) blk[1] = lY; @@ -590,18 +611,17 @@ void FNAME(D3Q19ListAaRiaInit)(LatticeDesc * ld, KernelData ** kernelData, Param // Blocking is implemented via setup of the adjacency list. The kernel later will // walk through the lattice blocked automatically. - for (int bZ = 0; bZ < lZ; bZ += blk[2]) { - for (int bY = 0; bY < lY; bY += blk[1]) { for (int bX = 0; bX < lX; bX += blk[0]) { + for (int bY = 0; bY < lY; bY += blk[1]) { + for (int bZ = 0; bZ < lZ; bZ += blk[2]) { int eX = MIN(bX + blk[0], lX); int eY = MIN(bY + blk[1], lY); int eZ = MIN(bZ + blk[2], lZ); - - for (int z = bZ; z < eZ; ++z) { - for (int y = bY; y < eY; ++y) { for (int x = bX; x < eX; ++x) { + for (int y = bY; y < eY; ++y) { + for (int z = bZ; z < eZ; ++z) { latticeIndex = L_INDEX_4(lDims, x, y, z); @@ -643,7 +663,15 @@ void FNAME(D3Q19ListAaRiaInit)(LatticeDesc * ld, KernelData ** kernelData, Param // Loop over all fluid nodes and compute the indices to the neighboring // PDFs for configured data layout (AoS/SoA). - // TODO: Parallelized loop to ensure correct NUMA placement. + #ifdef _OPENMP + #pragma omp parallel for + #endif + for (int index = 0; index < nFluid; ++index) { + for (int d = 0; d < N_D3Q19_IDX; ++d) { + adjList[index * N_D3Q19_IDX + d] = -1; + } + } + // #ifdef _OPENMP --> add line continuation // #pragma omp parallel for default(none) // shared(nFluid, nCells, coords, D3Q19_INV, D3Q19_X, D3Q19_Y, D3Q19_Z, diff --git a/src/BenchKernelD3Q19ListCommon.c b/src/BenchKernelD3Q19ListCommon.c index 4f97bae..f66e379 100644 --- a/src/BenchKernelD3Q19ListCommon.c +++ b/src/BenchKernelD3Q19ListCommon.c @@ -28,6 +28,7 @@ #include "Memory.h" #include "Vtk.h" +#include "Padding.h" #include <math.h> @@ -200,15 +201,19 @@ static void ParameterUsage() { printf("Kernel parameters:\n"); printf(" [-blk <n>] [-blk-[xyz] <n>]\n"); +#ifdef DATA_LAYOUT_SOA + printf(" [-pad auto|modulus_1+offset_1(,modulus_n+offset_n)*]\n"); +#endif return; } -static void ParseParameters(Parameters * params, int * blk) +static void ParseParameters(Parameters * params, int * blk, PadInfo ** padInfo) { Assert(blk != NULL); blk[0] = 0; blk[1] = 0; blk[2] = 0; + *padInfo = NULL; #define ARG_IS(param) (!strcmp(params->KernelArgs[i], param)) #define NEXT_ARG_PRESENT() \ @@ -226,8 +231,8 @@ static void ParseParameters(Parameters * params, int * blk) int tmp = strtol(params->KernelArgs[++i], NULL, 0); - if (tmp <= 0) { - printf("ERROR: blocking parameter must be > 0.\n"); + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); exit(1); } @@ -238,8 +243,8 @@ static void ParseParameters(Parameters * params, int * blk) int tmp = strtol(params->KernelArgs[++i], NULL, 0); - if (tmp <= 0) { - printf("ERROR: blocking parameter must be > 0.\n"); + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); exit(1); } @@ -250,8 +255,8 @@ static void ParseParameters(Parameters * params, int * blk) int tmp = strtol(params->KernelArgs[++i], NULL, 0); - if (tmp <= 0) { - printf("ERROR: blocking parameter must be > 0.\n"); + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); exit(1); } @@ -262,8 +267,8 @@ static void ParseParameters(Parameters * params, int * blk) int tmp = strtol(params->KernelArgs[++i], NULL, 0); - if (tmp <= 0) { - printf("ERROR: blocking parameter must be > 0.\n"); + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); exit(1); } @@ -273,6 +278,13 @@ static void ParseParameters(Parameters * params, int * blk) ParameterUsage(); exit(1); } +#ifdef DATA_LAYOUT_SOA + else if (ARG_IS("-pad") || ARG_IS("--pad")) { + NEXT_ARG_PRESENT(); + + *padInfo = PadInfoFromStr(params->KernelArgs[++i]); + } +#endif else { printf("ERROR: unknown kernel parameter.\n"); ParameterUsage(); @@ -324,6 +336,12 @@ void FNAME(D3Q19ListInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters kdl->nFluid = -1; #endif + int blk[3] = { 0 }; + PadInfo * padInfo = NULL; + + ParseParameters(params, blk, &padInfo); + + // Ajust the dimensions according to padding, if used. kd->Dims[0] = kd->GlobalDims[0] = ld->Dims[0]; kd->Dims[1] = kd->GlobalDims[1] = ld->Dims[1]; @@ -336,18 +354,21 @@ void FNAME(D3Q19ListInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters int lZ = lDims[2]; int nTotalCells = lX * lY * lZ; - int nCells = ld->nFluid; // TODO: + padding + int nCells = ld->nFluid; int nFluid = ld->nFluid; +#ifdef DATA_LAYOUT_SOA + { + nCells = PadCellsAndReport(nCells, sizeof(PdfT), &padInfo); + PadInfoFree(padInfo); padInfo = NULL; + } +#endif + kdl->nCells = nCells; kdl->nFluid = nFluid; PdfT * pdfs[2]; - int blk[3] = { 0 }; - - ParseParameters(params, blk); - if (blk[0] == 0) blk[0] = lX; if (blk[1] == 0) blk[1] = lY; if (blk[2] == 0) blk[2] = lZ; @@ -425,18 +446,17 @@ void FNAME(D3Q19ListInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters // Blocking is implemented via setup of the adjacency list. The kernel later will // walk through the lattice blocked automatically. - for (int bZ = 0; bZ < lZ; bZ += blk[2]) { - for (int bY = 0; bY < lY; bY += blk[1]) { for (int bX = 0; bX < lX; bX += blk[0]) { + for (int bY = 0; bY < lY; bY += blk[1]) { + for (int bZ = 0; bZ < lZ; bZ += blk[2]) { int eX = MIN(bX + blk[0], lX); int eY = MIN(bY + blk[1], lY); int eZ = MIN(bZ + blk[2], lZ); - - for (int z = bZ; z < eZ; ++z) { - for (int y = bY; y < eY; ++y) { for (int x = bX; x < eX; ++x) { + for (int y = bY; y < eY; ++y) { + for (int z = bZ; z < eZ; ++z) { latticeIndex = L_INDEX_4(lDims, x, y, z); @@ -474,7 +494,15 @@ void FNAME(D3Q19ListInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters // Loop over all fluid nodes and compute the indices to the neighboring // PDFs for configured data layout (AoS/SoA). - // TODO: Parallelized loop to ensure correct NUMA placement. + #ifdef _OPENMP + #pragma omp parallel for + #endif + for (int index = 0; index < nFluid; ++index) { + for (int d = 0; d < N_D3Q19_IDX; ++d) { + adjList[index * N_D3Q19_IDX + d] = -1; + } + } + // #ifdef _OPENMP --> add line continuation // #pragma omp parallel for default(none) // shared(nFluid, nCells, coords, D3Q19_INV, D3Q19_X, D3Q19_Y, D3Q19_Z, diff --git a/src/BenchKernelD3Q19ListPullSplitNt.c b/src/BenchKernelD3Q19ListPullSplitNt.c index dfab54a..0132dc9 100644 --- a/src/BenchKernelD3Q19ListPullSplitNt.c +++ b/src/BenchKernelD3Q19ListPullSplitNt.c @@ -29,6 +29,7 @@ #include "Memory.h" #include "Vtk.h" #include "Vector.h" +#include "LikwidIf.h" #include <inttypes.h> #include <math.h> @@ -118,6 +119,8 @@ void FNAME(KernelPullSplitNt1S)(LatticeDesc * ld, KernelData * kernelData, CaseD KernelStatistics(kd, ld, cd, 0); #endif + + X_LIKWID_START("list-pull-split-nt-1s"); #ifdef _OPENMP #pragma omp parallel default(none) \ shared(nFluid, nCells, kd, kdl, adjList, src, dst, \ @@ -184,6 +187,7 @@ void FNAME(KernelPullSplitNt1S)(LatticeDesc * ld, KernelData * kernelData, CaseD for(int iter = 0; iter < maxIterations; ++iter) { + #if 1 #define INDEX_START blIndexStart #define INDEX_STOP blIndexVec @@ -201,6 +205,8 @@ void FNAME(KernelPullSplitNt1S)(LatticeDesc * ld, KernelData * kernelData, CaseD #define INDEX_STOP blIndexStop #include "BenchKernelD3Q19ListPullSplitNt1SScalar.h" #endif + + #pragma omp barrier #pragma omp single @@ -235,6 +241,9 @@ void FNAME(KernelPullSplitNt1S)(LatticeDesc * ld, KernelData * kernelData, CaseD MemFree((void **)&tmpArray); } + + X_LIKWID_STOP("list-pull-split-nt-1s"); + #ifdef VTK_OUTPUT if (cd->VtkOutput) { kd->PdfsActive = src; @@ -321,6 +330,10 @@ void FNAME(KernelPullSplitNt2S)(LatticeDesc * ld, KernelData * kernelData, CaseD KernelStatistics(kd, ld, cd, 0); #endif + + X_LIKWID_START("list-pull-split-nt-2s"); + + #ifdef _OPENMP #pragma omp parallel default(none) \ shared(nFluid, nCells, kd, kdl, adjList, src, dst, \ @@ -406,6 +419,7 @@ void FNAME(KernelPullSplitNt2S)(LatticeDesc * ld, KernelData * kernelData, CaseD #endif #pragma omp barrier + #pragma omp single { #ifdef VERIFICATION @@ -438,6 +452,8 @@ void FNAME(KernelPullSplitNt2S)(LatticeDesc * ld, KernelData * kernelData, CaseD MemFree((void **)&tmpArray); } + X_LIKWID_STOP("list-pull-split-nt-2s"); + #ifdef VTK_OUTPUT if (cd->VtkOutput) { kd->PdfsActive = src; diff --git a/src/BenchKernelD3Q19ListPullSplitNtCommon.c b/src/BenchKernelD3Q19ListPullSplitNtCommon.c index b5df14b..8c8da88 100644 --- a/src/BenchKernelD3Q19ListPullSplitNtCommon.c +++ b/src/BenchKernelD3Q19ListPullSplitNtCommon.c @@ -29,6 +29,8 @@ #include "Memory.h" #include "Vtk.h" #include "Vector.h" +#include "Padding.h" + #include <math.h> @@ -183,16 +185,18 @@ static void ParameterUsage() { printf("Kernel parameters:\n"); printf(" [-blk <n>] [-blk-[xyz] <n>] [-n-tmp-array <n>]\n"); + printf(" [-pad auto|modulus_1+offset_1(,modulus_n+offset_n)*]\n"); return; } -static void ParseParameters(Parameters * params, int * blk, int * nTmpArray) +static void ParseParameters(Parameters * params, int * blk, int * nTmpArray, PadInfo ** padInfo) { Assert(blk != NULL); blk[0] = 0; blk[1] = 0; blk[2] = 0; *nTmpArray = 152; + *padInfo = NULL; #define ARG_IS(param) (!strcmp(params->KernelArgs[i], param)) #define NEXT_ARG_PRESENT() \ @@ -210,8 +214,8 @@ static void ParseParameters(Parameters * params, int * blk, int * nTmpArray) int tmp = strtol(params->KernelArgs[++i], NULL, 0); - if (tmp <= 0) { - printf("ERROR: blocking parameter must be > 0.\n"); + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); exit(1); } @@ -222,8 +226,8 @@ static void ParseParameters(Parameters * params, int * blk, int * nTmpArray) int tmp = strtol(params->KernelArgs[++i], NULL, 0); - if (tmp <= 0) { - printf("ERROR: blocking parameter must be > 0.\n"); + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); exit(1); } @@ -234,8 +238,8 @@ static void ParseParameters(Parameters * params, int * blk, int * nTmpArray) int tmp = strtol(params->KernelArgs[++i], NULL, 0); - if (tmp <= 0) { - printf("ERROR: blocking parameter must be > 0.\n"); + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); exit(1); } @@ -246,8 +250,8 @@ static void ParseParameters(Parameters * params, int * blk, int * nTmpArray) int tmp = strtol(params->KernelArgs[++i], NULL, 0); - if (tmp <= 0) { - printf("ERROR: blocking parameter must be > 0.\n"); + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); exit(1); } @@ -270,6 +274,11 @@ static void ParseParameters(Parameters * params, int * blk, int * nTmpArray) *nTmpArray = tmp; } + else if (ARG_IS("-pad") || ARG_IS("--pad")) { + NEXT_ARG_PRESENT(); + + *padInfo = PadInfoFromStr(params->KernelArgs[++i]); + } else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) { ParameterUsage(); exit(1); @@ -456,6 +465,11 @@ static void FNAME(Init)(LatticeDesc * ld, KernelData ** kernelData, Parameters * kdlr->nConsecThreadIndices = 0; #endif + int blk[3] = { 0 }; + PadInfo * padInfo = NULL; + + ParseParameters(params, blk, &kdlr->nTmpArray, &padInfo); + // Ajust the dimensions according to padding, if used. kd->Dims[0] = kd->GlobalDims[0] = ld->Dims[0]; kd->Dims[1] = kd->GlobalDims[1] = ld->Dims[1]; @@ -471,8 +485,12 @@ static void FNAME(Init)(LatticeDesc * ld, KernelData ** kernelData, Parameters * int nCells = ld->nFluid; int nFluid = ld->nFluid; + { + nCells = PadCellsAndReport(nCells, sizeof(PdfT), &padInfo); + PadInfoFree(padInfo); padInfo = NULL; + } + // We padd each stream of a PDF array for a complete cache line. - // TODO: padding for L1/L2 and TLB. nCells = nCells + (8 - nCells % 8); Assert(nCells % VSIZE == 0); @@ -482,10 +500,6 @@ static void FNAME(Init)(LatticeDesc * ld, KernelData ** kernelData, Parameters * PdfT * pdfs[2]; - int blk[3] = { 0 }; - - ParseParameters(params, blk, &kdlr->nTmpArray); - if (blk[0] == 0) blk[0] = lX; if (blk[1] == 0) blk[1] = lY; if (blk[2] == 0) blk[2] = lZ; @@ -572,18 +586,18 @@ static void FNAME(Init)(LatticeDesc * ld, KernelData ** kernelData, Parameters * // Blocking is implemented via setup of the adjacency list. The kernel later will // walk through the lattice blocked automatically. - for (int bZ = 0; bZ < lZ; bZ += blk[2]) { - for (int bY = 0; bY < lY; bY += blk[1]) { for (int bX = 0; bX < lX; bX += blk[0]) { + for (int bY = 0; bY < lY; bY += blk[1]) { + for (int bZ = 0; bZ < lZ; bZ += blk[2]) { int eX = MIN(bX + blk[0], lX); int eY = MIN(bY + blk[1], lY); int eZ = MIN(bZ + blk[2], lZ); - for (int z = bZ; z < eZ; ++z) { - for (int y = bY; y < eY; ++y) { for (int x = bX; x < eX; ++x) { + for (int y = bY; y < eY; ++y) { + for (int z = bZ; z < eZ; ++z) { latticeIndex = L_INDEX_4(lDims, x, y, z); @@ -627,6 +641,15 @@ static void FNAME(Init)(LatticeDesc * ld, KernelData ** kernelData, Parameters * // Loop over all fluid nodes and compute the indices to the neighboring // PDFs for configured data layout (AoS/SoA). // Parallelized loop to ensure correct NUMA placement. + #ifdef _OPENMP + #pragma omp parallel for + #endif + for (int index = 0; index < nFluid; ++index) { + for (int d = 0; d < N_D3Q19_IDX; ++d) { + adjList[index * N_D3Q19_IDX + d] = -1; + } + } + // #ifdef _OPENMP --> add line continuation // #pragma omp parallel for default(none) // shared(nFluid, nCells, coords, D3Q19_INV, D3Q19_X, D3Q19_Y, D3Q19_Z, diff --git a/src/Geometry.c b/src/Geometry.c index 31c985a..0450c56 100644 --- a/src/Geometry.c +++ b/src/Geometry.c @@ -72,6 +72,9 @@ void GeoCreateByStr(const char * geometryType, int dims[3], int periodic[3], Lat typeDetails = &tmp; } + else if (strncasecmp("fluid", geometryType, 5) == 0) { + type = GEO_TYPE_FLUID; + } else { printf("ERROR: unknown geometry specified.\n"); Verify(0); @@ -99,7 +102,7 @@ void GeoCreateByType(GEO_TYPES type, void * typeDetails, int dims[3], int period Assert(type >= GEO_TYPE_MIN); Assert(type <= GEO_TYPE_MAX); - const char * geoTypeStr[] = { "box", "channel", "pipe", "blocks" }; + const char * geoTypeStr[] = { "box", "channel", "pipe", "blocks", "fluid" }; printf("# geometry: %d x %d x %d nodes, type %d %s\n", dims[0], dims[1], dims[2], type, geoTypeStr[type]); @@ -127,6 +130,10 @@ void GeoCreateByType(GEO_TYPES type, void * typeDetails, int dims[3], int period if (type == GEO_TYPE_CHANNEL || type == GEO_TYPE_BLOCKS || type == GEO_TYPE_PIPE) { periodic[0] = 1; } + else if (type == GEO_TYPE_FLUID) { + // Actually this geometry is non-periodic, but we need fluid at the boundary. + periodic[0] = 1; periodic[1] = 1; periodic[2] = 1; + } // Walls or periodic on first and last x plane. for (int z = 0; z < dims[2]; ++z) { @@ -168,7 +175,7 @@ void GeoCreateByType(GEO_TYPES type, void * typeDetails, int dims[3], int period } if (type == GEO_TYPE_CHANNEL) { - periodic[0] = 1; + // Nothing todo here. } else if (type == GEO_TYPE_PIPE) { #define SQR(a) ((a)*(a)) @@ -192,34 +199,38 @@ void GeoCreateByType(GEO_TYPES type, void * typeDetails, int dims[3], int period int blockSize = *((int *)typeDetails); +#if 0 if (blockSize == 0) { blockSize = 8; } +#endif + if (blockSize > 0) { - int dimMin = dims[0]; + int dimMin = dims[0]; - if (dims[1] < dimMin) dimMin = dims[1]; - if (dims[2] < dimMin) dimMin = dims[2]; + if (dims[1] < dimMin) dimMin = dims[1]; + if (dims[2] < dimMin) dimMin = dims[2]; - if (blockSize < 0 || blockSize > dimMin / 2) { - printf("ERROR: block size for geometry must be > 0 and smaller than half of the smalest dimension.\n"); - // TODO: find a better solution for handling errors in here. - Verify(0); - } + if (blockSize < 0 || blockSize > dimMin / 2) { + printf("ERROR: block size for geometry must be > 0 and smaller than half of the smalest dimension.\n"); + // TODO: find a better solution for handling errors in here. + Verify(0); + } - // Number of blocks in x, y, and z direction. - int nbx = blockSize, nby = blockSize, nbz = blockSize; + // Number of blocks in x, y, and z direction. + int nbx = blockSize, nby = blockSize, nbz = blockSize; - for (int z = 0; z < dims[2]; ++z) { - if ((z % (2 * nbz)) < nbz) continue; + for (int z = 0; z < dims[2]; ++z) { + if ((z % (2 * nbz)) < nbz) continue; - for (int y = 0; y < dims[1]; ++y) { - if ((y % (2 * nby)) < nby) continue; + for (int y = 0; y < dims[1]; ++y) { + if ((y % (2 * nby)) < nby) continue; - for (int x = 0; x < dims[0]; ++x) { + for (int x = 0; x < dims[0]; ++x) { - if ((x % (2 * nbx)) >= nbx) { - lattice[L_INDEX_4(dims, x, y, z)] = LAT_CELL_OBSTACLE; + if ((x % (2 * nbx)) >= nbx) { + lattice[L_INDEX_4(dims, x, y, z)] = LAT_CELL_OBSTACLE; + } } } } diff --git a/src/Geometry.h b/src/Geometry.h index 11d72ff..a9bf260 100644 --- a/src/Geometry.h +++ b/src/Geometry.h @@ -37,7 +37,8 @@ typedef enum GEO_TYPES_ { GEO_TYPE_PIPE = 2, GEO_TYPE_BLOCKS = 3, // Expects a pointer to an integer, holding the // value of the block size as type detail. - GEO_TYPE_MAX = 3 + GEO_TYPE_FLUID = 4, + GEO_TYPE_MAX = 4 } GEO_TYPES; diff --git a/src/Kernel.h b/src/Kernel.h index eddae64..99e126b 100644 --- a/src/Kernel.h +++ b/src/Kernel.h @@ -56,7 +56,7 @@ #endif #ifdef PROP_MODEL_AA - #define PROP_MODEL_NAME AA + #define PROP_MODEL_NAME Aa #endif diff --git a/src/KernelFunctions.h b/src/KernelFunctions.h index 2ee063a..557c653 100644 --- a/src/KernelFunctions.h +++ b/src/KernelFunctions.h @@ -28,6 +28,8 @@ #define __KERNEL_FUNCTIONS_H__ #include "BenchKernelD3Q19.h" +#include "BenchKernelD3Q19Aa.h" +#include "BenchKernelD3Q19AaVec.h" #include "BenchKernelD3Q19List.h" #include "BenchKernelD3Q19ListAa.h" #include "BenchKernelD3Q19ListAaRia.h" @@ -133,6 +135,22 @@ KernelFunctions g_kernels[] = .Init = D3Q19BlkInit_PullAoS, .Deinit = D3Q19BlkDeinit_PullAoS }, + { + .Name = "aa-aos", + .Init = D3Q19AaInit_AaAoS, + .Deinit = D3Q19AaDeinit_AaAoS + }, + { + .Name = "aa-soa", + .Init = D3Q19AaInit_AaSoA, + .Deinit = D3Q19AaDeinit_AaSoA + }, + { + .Name = "aa-vec-soa", + .Init = D3Q19AaVecInit_AaSoA, + .Deinit = D3Q19AaVecDeinit_AaSoA + } + }; #endif // __KERNEL_FUNCTIONS_H__ diff --git a/src/Lattice.c b/src/Lattice.c new file mode 100644 index 0000000..bc218eb --- /dev/null +++ b/src/Lattice.c @@ -0,0 +1,59 @@ +// -------------------------------------------------------------------------- +// +// Copyright +// Markus Wittmann, 2016-2017 +// RRZE, University of Erlangen-Nuremberg, Germany +// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de +// +// Viktor Haag, 2016 +// LSS, University of Erlangen-Nuremberg, Germany +// +// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). +// +// LbmBenchKernels is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// LbmBenchKernels is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>. +// +// -------------------------------------------------------------------------- +#include "Lattice.h" + +// Dumps the layers [zStart, zStop] of lattice as ASCII. +// Specify zStart = -1 and zStop = -1 as begin/end of lattice. + +void LatDumpAscii(LatticeDesc * ld, int zStart, int zStop) +{ + Assert(ld != NULL); + + const char strLatCellType[] = "X.IxO"; // X = Obstacle, . = Fluid, I = inlet, O = outlet + + int localZStart = zStart; + int localZStop = zStop; + + int * dims = ld->Dims; + LatticeT * lattice = ld->Lattice; + + if (localZStart == -1) localZStart = 0; + if (localZStop == -1) localZStop = dims[2] - 1; + + for (int z = localZStop; z >= localZStart; --z) { + printf("plane % 2d\n", z); + + for (int y = dims[1] - 1; y >= 0; --y) { + printf(" %2d ", y); + for (int x = 0; x < dims[0]; ++x) { + printf("%c", strLatCellType[lattice[L_INDEX_4(dims, x, y, z)]]); + } + printf("\n"); + } + } +} + diff --git a/src/Lattice.h b/src/Lattice.h index 99b3fc3..c23b70f 100644 --- a/src/Lattice.h +++ b/src/Lattice.h @@ -74,5 +74,6 @@ static inline int L_INDEX_4(int dims[3], int x, int y, int z) return z * dims[0] * dims[1] + y * dims[0] + x; } +void LatDumpAscii(LatticeDesc * ld, int zStart, int zStop); #endif // __LATTICE_H__ diff --git a/src/Main.c b/src/Main.c index 1525ad5..65a4035 100644 --- a/src/Main.c +++ b/src/Main.c @@ -468,7 +468,7 @@ int main(int argc, char * argv[]) int threadId = omp_get_thread_num(); int err; - err = PinCurrentThreadByCpuList(pinString, 0, 0, threadId); + err = PinCurrentThreadByCpuList(pinString, threadId); if (err) { printf("ERROR [thread %d]: pinning failed.\n", threadId); @@ -557,9 +557,9 @@ int main(int argc, char * argv[]) perf, nThreads, duration, cd.MaxIterations, ld.nFluid / 1e6, geometryType, kernelToUse, #ifdef VERIFICATION - "# VERIFICATION" + "VERIFICATION" #else - "# benchmark" + "B" #endif ); diff --git a/src/Makefile b/src/Makefile index a9889a5..99ca902 100644 --- a/src/Makefile +++ b/src/Makefile @@ -29,7 +29,9 @@ # CONFIG ?= linux-gcc CONFIG ?= linux-intel -BUILD ?= debug +BUILD ?= release + +BENCHMARK ?= on # If we compile for BENCHMARK all verifcation and statistics are disabled, # if not specified otherwise. @@ -37,13 +39,12 @@ ifeq (on,$(BENCHMARK)) VERIFICATION ?= off VTK_OUTPUT ?= off STATISTICS ?= off +else + VERIFICATION ?= on + STATISTICS ?= on + VTK_OUTPUT ?= on endif -VERIFICATION ?= on - -STATISTICS ?= on - -VTK_OUTPUT ?= on COLOR ?= on @@ -95,7 +96,7 @@ DEP_DIR = obj/$(CONFIG)-$(BUILD)$(TAG)-dep # Sources to consider. SOURCES_C = Main.c Memory.c Geometry.c Kernel.c \ - Vtk.c Pinning.c + Vtk.c Pinning.c Padding.c Lattice.c # ------------------------------------------------------------------------ # NO CHANGE BELOW SHOULD BE NEEDED @@ -127,7 +128,13 @@ OBJ_C = $(foreach SOURCE,$(SOURCES_C),$(OBJECT_DIR)/$(SOURCE:%.c=%.o)) \ $(OBJECT_DIR)/BenchKernelD3Q19ListAaPv_PushSoA.o \ $(OBJECT_DIR)/BenchKernelD3Q19ListAaPvCommon_PushSoA.o \ $(OBJECT_DIR)/BenchKernelD3Q19ListPullSplitNt_PullSoA.o \ - $(OBJECT_DIR)/BenchKernelD3Q19ListPullSplitNtCommon_PullSoA.o + $(OBJECT_DIR)/BenchKernelD3Q19ListPullSplitNtCommon_PullSoA.o \ + $(OBJECT_DIR)/BenchKernelD3Q19Aa_AaAoS.o \ + $(OBJECT_DIR)/BenchKernelD3Q19AaCommon_AaAoS.o \ + $(OBJECT_DIR)/BenchKernelD3Q19Aa_AaSoA.o \ + $(OBJECT_DIR)/BenchKernelD3Q19AaCommon_AaSoA.o \ + $(OBJECT_DIR)/BenchKernelD3Q19AaVec_AaSoA.o \ + $(OBJECT_DIR)/BenchKernelD3Q19AaVecCommon_AaSoA.o OBJ = $(OBJ_C) @@ -203,9 +210,22 @@ ifeq (on,$(LIKWID)) LD_LIBS += $(LIKWID_LIB) -llikwid endif +# ARCH can only be assigned a string without a space. The space is escaped as +# a comma which we have to replace here. + +ifdef TARCH + ARCH_W_COMMA := $(TARCH) + + COMMA_ := , + override TARCH := $(subst $(COMMA_), ,$(TARCH)) + + $(info $(shell echo -e "\n$(COLOR_GREEN)INFO: Automatically expanding comma in TARCH variable: TARCH=$(TARCH).$(COLOR_NO)\n")) + +endif + .phony: all clean clean-all -$(info $(shell $(ECHO_E) "# Configuration: CONFIG=$(COLOR_CYAN)$(CONFIG)$(COLOR_NO) BUILD=$(COLOR_CYAN)$(BUILD)$(COLOR_NO) VERIFICATION=$(COLOR_CYAN)$(VERIFICATION)$(COLOR_NO) STATISTICS=$(COLOR_CYAN)$(STATISTICS)$(COLOR_NO) VTK_OUTPUT=$(COLOR_CYAN)$(VTK_OUTPUT)$(COLOR_NO) OPENMP=$(COLOR_CYAN)$(OPENMP)$(COLOR_NO) ISA=$(COLOR_CYAN)$(ISA)$(COLOR_NO) LIKWID=$(COLOR_CYAN)$(LIKWID)$(COLOR_NO) building $(.DEFAULT_GOAL)...")) +$(info $(shell $(ECHO_E) "# Configuration: CONFIG=$(COLOR_CYAN)$(CONFIG)$(COLOR_NO) BUILD=$(COLOR_CYAN)$(BUILD)$(COLOR_NO) VERIFICATION=$(COLOR_CYAN)$(VERIFICATION)$(COLOR_NO) STATISTICS=$(COLOR_CYAN)$(STATISTICS)$(COLOR_NO) VTK_OUTPUT=$(COLOR_CYAN)$(VTK_OUTPUT)$(COLOR_NO) OPENMP=$(COLOR_CYAN)$(OPENMP)$(COLOR_NO) ISA=$(COLOR_CYAN)$(ISA)$(COLOR_NO) LIKWID=$(COLOR_CYAN)$(LIKWID)$(COLOR_NO) TARCH=$(COLOR_CYAN)$(TARCH)$(COLOR_NO) building $(.DEFAULT_GOAL)...")) $(info # Object dir: $(OBJECT_DIR)) @@ -265,6 +285,13 @@ $(OBJECT_DIR)/%_AoS.o: %.c $(REBUILD_DEPS) @$(ECHO_E) "compiling: $(COLOR_CYAN)$@$(COLOR_NO) $(COLOR_MAGENTA)DATA_LAYOUT_AOS$(COLOR_NO)" $(CC) $(strip $(C_FLAGS)) $(strip $(PP_FLAGS)) $(D)DATA_LAYOUT_AOS -c $< -o $@ +$(OBJECT_DIR)/%_AaAoS.o: %.c $(REBUILD_DEPS) + @$(ECHO_E) "compiling: $(COLOR_CYAN)$@$(COLOR_NO) $(COLOR_MAGENTA)DATA_LAYOUT_AOS$(COLOR_NO) $(COLOR_MAGENTA)PROP_MODEL_AA$(COLOR_NO)" + $(CC) $(strip $(C_FLAGS)) $(strip $(PP_FLAGS)) $(D)DATA_LAYOUT_AOS $(D)PROP_MODEL_AA -c $< -o $@ + +$(OBJECT_DIR)/%_AaSoA.o: %.c $(REBUILD_DEPS) + @$(ECHO_E) "compiling: $(COLOR_CYAN)$@$(COLOR_NO) $(COLOR_MAGENTA)DATA_LAYOUT_SOA$(COLOR_NO) $(COLOR_MAGENTA)PROP_MODEL_AA$(COLOR_NO)" + $(CC) $(strip $(C_FLAGS)) $(strip $(PP_FLAGS)) $(D)DATA_LAYOUT_SOA $(D)PROP_MODEL_AA -c $< -o $@ $(OBJECT_DIR)/%.o: %.c $(REBUILD_DEPS) @$(ECHO_E) "compiling: $(COLOR_CYAN)$@$(COLOR_NO)" diff --git a/src/Memory.c b/src/Memory.c index d9d1f94..e7dda39 100644 --- a/src/Memory.c +++ b/src/Memory.c @@ -24,11 +24,24 @@ // along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>. // // -------------------------------------------------------------------------- +// TODO: make configurable +#define HAVE_HUGE_PAGES + + +#ifdef HAVE_HUGE_PAGES +#define _BSD_SOURCE +#endif + #include <stdio.h> #include <stdlib.h> #include <string.h> // strerror #include <errno.h> + +#ifdef HAVE_HUGE_PAGES +#include <sys/mman.h> // madvise +#endif + #include "Base.h" #include "Memory.h" @@ -60,6 +73,19 @@ int MemAllocAligned(void ** ptr, size_t bytesToAlloc, size_t alignmentBytes) exit(1); } +#ifdef HAVE_HUGE_PAGES + + if (alignmentBytes % 4096 == 0) { + ret = madvise(*ptr, bytesToAlloc, MADV_HUGEPAGE); + + if (ret != 0) { + DEBUG_BREAK_POINT(); + Error("madvise(%p, %lu, MADV_HUGEPAGE) failed: %d - %s.\n", *ptr, bytesToAlloc, errno, strerror(errno)); + exit(1); + } + } +#endif + return 0; } diff --git a/src/Padding.c b/src/Padding.c new file mode 100644 index 0000000..97a98ea --- /dev/null +++ b/src/Padding.c @@ -0,0 +1,300 @@ +#include "Base.h" +#include "Kernel.h" +#include "Padding.h" + +#include <errno.h> +#include <limits.h> + +// Generates a PadInfo struct from padStr. If padStr is NULL then a NULL +// pointer is returned. +// +// Padding string is either "auto" or in the format of +// <modulus>+<remainder>(,<modulus>+<remainder>)*. +// +// If modulus is 0 then the remainder will just be added and no modulo will be +// applied. +// +// Returned PadInfo structs must be freed with PadInfoFree. + +PadInfo * PadInfoFromStr(const char * padStr) +{ + if (padStr == NULL) { + return NULL; + } + else if (!strncmp("auto", padStr, 4)) { + + PadInfo * padInfo = (PadInfo *)malloc(sizeof(PadInfo) + 4 * sizeof(int)); + Assert(padInfo != NULL); + + padInfo->nEntries = 2; + padInfo->Modulus = (int *)(padInfo + 1); + padInfo->Offset = ((int *)(padInfo + 1)) + 2; + + // Intel TLB 2 MiB pages + padInfo->Modulus[0] = 2 * 1024 * 1024 * 8; + padInfo->Offset[0] = 2 * 1024 * 1024 + 512; + + // Intel L2, 256 sets + padInfo->Modulus[1] = 256 * 64; + padInfo->Offset[1] = 128; + + return padInfo; + } + else if (!strncmp("no", padStr, 2) || !strncmp("off", padStr, 3)) { + PadInfo * padInfo = (PadInfo *)malloc(sizeof(PadInfo)); + Assert(padInfo != NULL); + + padInfo->nEntries = 0; + padInfo->Modulus = NULL; + padInfo->Offset = NULL; + + return padInfo; + } + + // Count number of commas. Number of commas + 1 gives us the number of + // padding entries. + const char * tmp = padStr; + int nCommas = 0; + + while (*tmp != 0x00) { + if (*tmp == ',') ++nCommas; + ++tmp; + } + + // Number of padding entries = nCommas + 1. + size_t padInfoSize = sizeof(PadInfo) + (nCommas + 1) * 2 * sizeof(int); + PadInfo * padInfo = (PadInfo *)malloc(padInfoSize); + Assert(padInfo != NULL); + + memset(padInfo, 0x00, padInfoSize); + + padInfo->Modulus = (int *)(padInfo + 1); + padInfo->Offset = ((int *)(padInfo + 1)) + nCommas + 1; + + tmp = padStr; + + int padEntryIndex = 0; + int entriesSeen = 0; + + int modulusSeen = 0; + int offsetSeen = 0; + + // TODO: parsing is currently a mess. We assume a fixed format, which + // should be easier to parse now. + while (*tmp != 0x00) { + + if (*tmp == ',') { + // Check if modulus was seen in this section, then move to the next. + + if (!modulusSeen) { + fprintf(stderr, "ERROR: modulus missing before next section in padding string.\n"); + exit(1); + } + + modulusSeen = 0; + offsetSeen = 0; + ++padEntryIndex; + ++tmp; + } + else if (*tmp == '+') { + // Check if modulus was seen in this section and no offset yet. + + if (!modulusSeen) { + fprintf(stderr, "ERROR: modulus missing before offset in padding string.\n"); + exit(1); + } + + if (offsetSeen) { + fprintf(stderr, "ERROR: offset is only allowed to be specified once per section in padding string.\n"); + exit(1); + } + + ++tmp; + } + else if (*tmp >= '0' && *tmp <= '9') { + // Parse number and check for all the errors. + + char * endPtr; + + errno = 0; + + int value = strtol(tmp, &endPtr, 10); + + if ((value == LONG_MIN || value == LONG_MAX) && errno != 0) { + fprintf(stderr, "ERROR: over- or underflow in modulus/offset of padding string.\n"); + exit(1); + } + else if (value == 0 && errno != 0) { + fprintf(stderr, "ERROR: error parsing modulus/offset of padding string: %d - %s\n", errno, strerror(errno)); + exit(1); + } + else if (tmp == endPtr) { + // No digits found: empty string or invalid pad string. + fprintf(stderr, "ERROR: modulus or offset missing in padding string.\n"); + exit(1); + } + + if (value < 0) { + fprintf(stderr, "ERROR: modulus and offset must be >= 0 in padding string.\n"); + exit(1); + } + + if (!modulusSeen) { + if (offsetSeen) { + fprintf(stderr, "ERROR: offset already seen, but not modulus in padding string.\n"); + exit(1); + } + + Verify(padEntryIndex < (nCommas + 1)); + + padInfo->Modulus[padEntryIndex] = value; + + modulusSeen = 1; + ++entriesSeen; + } + else { + if (offsetSeen) { + fprintf(stderr, "ERROR: offset is only allowed to be specified once per section in padding string.\n"); + exit(1); + } + + Verify(padEntryIndex < (nCommas + 1)); + + padInfo->Offset[padEntryIndex] = value; + + offsetSeen = 1; + } + // No increment of tmp needed, as endPtr points already to the first + // character which is not part of the number. + tmp = endPtr; + } + else { + fprintf(stderr, "ERROR: padding string contains invalid characters.\n"); + exit(1); + } + + } + + if (entriesSeen != nCommas + 1) { + fprintf(stderr, "ERROR: did not find all padding entries.\n"); + exit(1); + } + + for (int i = 0; i < nCommas + 1; ++i) { + if (padInfo->Offset[i] >= padInfo->Modulus[i]) { + fprintf(stderr, "ERROR: offset in padding entry %d is equal or larger than modulus.\n", i); + exit(1); + } + } + + padInfo->nEntries = entriesSeen; + + return padInfo; +} + +int PadCells(int nCells, int cellSizeBytes, PadInfo ** padInfoPtr) +{ + Assert(padInfoPtr != NULL); + + // If the padInfo is NULL determine if we use the "auto" configuration. + const int defaultAutoPadding = 1; + + + if (*padInfoPtr == NULL) { + if (!defaultAutoPadding) { + return nCells; + } + + *padInfoPtr = PadInfoFromStr("auto"); + } + + PadInfo * padInfo = *padInfoPtr; + + Assert(padInfo->nEntries >= 0); + + for (int i = 0; i < padInfo->nEntries; ++i) { + Assert(padInfo->Modulus[i] >= 0); + Assert(padInfo->Offset[i] >= 0); + + int nPadCells = 0; + + if (padInfo->Modulus[i] == 0) { + // When the modulus is just zero then we only add the offset. + nPadCells = padInfo->Offset[i] * cellSizeBytes; + } + else { + int nModCells = padInfo->Modulus[i] / cellSizeBytes; + + if (nModCells == 0) { + fprintf(stderr, "ERROR: modulus of %d byte in padding entry %d becomes zero for PDF size %d byte.\n", + padInfo->Modulus[i], i, cellSizeBytes); + exit(1); + } + + int nOffsetCells = padInfo->Offset[i] / cellSizeBytes; + int nRemainder = nCells % nModCells; + + nPadCells = (nOffsetCells + nModCells - nRemainder) % nModCells; + } + + nCells += nPadCells; + } + + return nCells; +} + +void PadInfoPrint(PadInfo * padInfo, FILE * f, const char * prefix) +{ + Assert(padInfo != NULL); + Assert(padInfo->nEntries >= 0); + Assert(f != NULL); + Assert(prefix != NULL); + + for (int i = 0; i < padInfo->nEntries; ++i) { + fprintf(f, "%sm: %10d b o: %10d b m: %f KiB o: %f KiB m: %f MiB o: %f MiB\n", + prefix, + padInfo->Modulus[i], padInfo->Offset[i], + padInfo->Modulus[i] / 1024.0, padInfo->Offset[i] / 1024.0, + padInfo->Modulus[i] / 1024.0 / 1024.0, padInfo->Offset[i] / 1024.0 / 1024.0); + } + + return; +} + +void PadInfoFree(PadInfo * padInfo) +{ + if (padInfo != NULL) { + free(padInfo); + } + + return; +} + +// Returns the new padded cell size and reports if padding was performed and how. + +int PadCellsAndReport(int nCells, int cellSizeBytes, PadInfo ** padInfoPtr) +{ + // Apply padding. + int nNewCells = PadCells(nCells, sizeof(PdfT), padInfoPtr); + + if (nCells != nNewCells) { + printf("# padding info:\n"); + PadInfoPrint(*padInfoPtr, stdout, "# "); + // int nPaddedCells = nNewCells - nCells; + // printf("# padding per dir.: %10d nodes (%f MiB)\n", nPaddedCells, nPaddedCells / 1024.0 / 1024.0 * sizeof(PdfT)); + // printf("# padding total: %10d nodes (%f MiB)\n", 19 * nPaddedCells, 19 * nPaddedCells / 1024.0 / 1024.0 * sizeof(PdfT)); + + int nPadCells = nNewCells - nCells; + + printf("#\n# padding %d nodes with %d nodes, %f MiB per direction, %f MiB in total\n", + nCells, + nPadCells, + nPadCells * sizeof(PdfT) / 1024.0 / 1024.0, + nPadCells * sizeof(PdfT) / 1024.0 / 1024.0 * 19); + } + else { + printf("# padding info: no padding used\n"); + } + + return nNewCells; +} diff --git a/src/Padding.h b/src/Padding.h new file mode 100644 index 0000000..55fa33e --- /dev/null +++ b/src/Padding.h @@ -0,0 +1,19 @@ +#ifndef __PADDING_H__ +#define __PADDING_H__ + +typedef struct PadInfo_ +{ + int nEntries; + int * Modulus; + int * Offset; +} PadInfo; + +PadInfo * PadInfoFromStr(const char * padStr); + +int PadCells(int nCells, int cellSizeBytes, PadInfo ** padInfo); +int PadCellsAndReport(int nCells, int cellSizeBytes, PadInfo ** padInfoPtr); + +void PadInfoPrint(PadInfo * padInfo, FILE * f, const char * prefix); +void PadInfoFree(PadInfo * padInfo); + +#endif // __PADDING_H__ diff --git a/src/Pinning.c b/src/Pinning.c index 0cf70ea..de9be3f 100644 --- a/src/Pinning.c +++ b/src/Pinning.c @@ -34,9 +34,6 @@ #include "Base.h" #include "Pinning.h" - - - // ----------------------------------------------------------------------- // // Binds the calling thread to specified core. @@ -73,8 +70,7 @@ int PinCurrentThreadToCore(int coreNumber) // // ----------------------------------------------------------------------- -int PinCurrentThreadByEnvVar(const char * envVarName, - int mpiRank, int nodeRank, int threadNumber) +int PinCurrentThreadByEnvVar(const char * envVarName, int threadNumber) { const char * envVarValue; int core; @@ -82,14 +78,12 @@ int PinCurrentThreadByEnvVar(const char * envVarName, envVarValue = getenv(envVarName); if (envVarValue == NULL) { - if (mpiRank == 0) { - Print("skip pinning: env var %s not set\n", envVarName); - } + printf("WARNING: skip pinning: env var %s not set\n", envVarName); return 0; } - core = PinParseCpuList(envVarValue, mpiRank, nodeRank, threadNumber); + core = PinParseCpuList(envVarValue, threadNumber); if (core < 0) { return core; @@ -107,20 +101,17 @@ int PinCurrentThreadByEnvVar(const char * envVarName, // // ----------------------------------------------------------------------- -int PinCurrentThreadByCpuList(const char * cpuList, - int mpiRank, int nodeRank, int threadNumber) +int PinCurrentThreadByCpuList(const char * cpuList, int threadNumber) { int core; if (cpuList == NULL) { - if (mpiRank == 0) { - printf("ERROR: cpu list is NULL.\n"); - } + printf("ERROR: cpu list is NULL.\n"); exit(1); } - core = PinParseCpuList(cpuList, mpiRank, nodeRank, threadNumber); + core = PinParseCpuList(cpuList, threadNumber); if (core < 0) { return core; @@ -133,31 +124,13 @@ int PinCurrentThreadByCpuList(const char * cpuList, // ----------------------------------------------------------------------- // // Parses the provided cpu list and returns the core number for the -// specified MPI rank, local rank, and thread. -// -// The cpu list has for example a format of: 0,1,2 or 0,1,2_3,4,5 -// -// Blocks (0,1,2 or 3,4,5) separated by "_" specify pinning inside a -// node rank. The first block maps to node rank 1, the second to node -// rank 2, etc. -// -// Inside a block the core numbers specify where the threads should -// be pinned to. They are separated by "," and the first number maps -// to the first core, the second number to the second core, etc. -// -// For example: 0,2,4_6,8,10 +// specified thread. // -// Node rank 0 thread 0 pinned to core 0 -// 0 1 2 -// 0 2 4 -// 1 0 6 -// 1 1 8 -// 1 2 10 +// The cpu list has for example a format of: 0,1,2 // // ----------------------------------------------------------------------- -int PinParseCpuList(const char * cpuList, - int mpiRank, int nodeRank, int threadNumber) +int PinParseCpuList(const char * cpuList, int threadNumber) { int cpu = -1; @@ -168,8 +141,8 @@ int PinParseCpuList(const char * cpuList, const char * c = cpuList; // Ensure only valid characters are in the cpu list. - // Cpu list is in the format of "0,1,2_3,4,5". - while (((*c >= '0' && *c <= '9') || *c == ',' || *c == '_')) { + // Cpu list is in the format of "0,1,2,3,4,5". + while (((*c >= '0' && *c <= '9') || *c == ',')) { ++c; } @@ -180,19 +153,6 @@ int PinParseCpuList(const char * cpuList, c = cpuList; - int i = 0; - - // Move variable c after the "nodeRank"th "_" in the cpu list. - while (i < nodeRank && *c != 0x00) { - if (*c == '_') ++i; - ++c; - } - - if (i != nodeRank || *c < '0' || *c > '9') { - // Cpu list for this node rank not found. - return -3; - } - // Now find the core for the specified thread. int t = 0; @@ -201,11 +161,6 @@ int PinParseCpuList(const char * cpuList, if (*c == ',') { ++t; } - else if (*c == '_') { - // Unexpected character at this position. - break; - } - ++c; } diff --git a/src/Pinning.h b/src/Pinning.h index 40225c1..877048d 100644 --- a/src/Pinning.h +++ b/src/Pinning.h @@ -30,14 +30,11 @@ int PinCurrentThreadToCore(int coreNumber); -int PinParseCpuList(const char * cpuList, - int mpiRank, int nodeRank, int threadNumber); +int PinParseCpuList(const char * cpuList, int threadNumber); -int PinCurrentThreadByEnvVar(const char * envVarName, - int mpiRank, int nodeRank, int threadNumber); +int PinCurrentThreadByEnvVar(const char * envVarName, int threadNumber); -int PinCurrentThreadByCpuList(const char * cpuList, - int mpiRank, int nodeRank, int threadNumber); +int PinCurrentThreadByCpuList(const char * cpuList, int threadNumber); int PinCurrentCore(); diff --git a/src/test.sh b/src/test.sh index 7a76f51..a0e8ad6 100755 --- a/src/test.sh +++ b/src/test.sh @@ -54,7 +54,7 @@ Config="$1" make clean-all make -j CONFIG=$Config TAG=$XTag-debug -make -j CONFIG=$Config BUILD=$Build TAG=$XTag-v +make -j CONFIG=$Config BUILD=$Build TAG=$XTag-v VERIFICATION=on make -j CONFIG=$Config BUILD=$Build TAG=$XTag-b BENCHMARK=on BinaryV="../bin/lbmbenchk-$Config-$Build$XTag-v" -- 2.43.0