#main.css
[ -d html ] || mkdir -p html
# rst2html --stylesheet=html4css1.css,main.css $< html/$@.html
- rst2html --stylesheet=html4css1.css $< html/$@.html
+ rst2html --stylesheet=html4css1.css,main.css $< html/$@.html
ul.auto-toc {
list-style-type: none }
+</style>
+<style type="text/css">
+
+
+h1, h2, h3, h4, h5, h6 {
+ font-family: sans-serif;
+ font-size: 100%;
+ background-color: #dcdcdc;
+}
+
+h1.title {
+ background-color: gray;
+ color: white
+}
+
+table.footnote {
+ padding-left: 0.5ex;
+}
+
+table.citation {
+ padding-left: 0.5ex
+}
+
+td.label {
+ width: 10%;
+}
+
+table, table.docutils, td, th {
+ border: 0;
+}
+
+table.citation, table.footnote {
+ width: 100%;
+}
+
+th {
+ background-color: lavender ;
+}
+
+tr:nth-child(even) {
+ xxbackground-color: aliceblue;
+ background-color: white;
+}
+tr:nth-child(odd) {
+ xxbackground-color: lavender;
+ background-color: whitesmoke;
+}
+
+
+
</style>
</head>
<body>
<li><a class="reference internal" href="#benchmarking" id="id4">1.2 Benchmarking</a></li>
<li><a class="reference internal" href="#release-and-verification" id="id5">1.3 Release and Verification</a></li>
<li><a class="reference internal" href="#compilers" id="id6">1.4 Compilers</a></li>
-<li><a class="reference internal" href="#options-summary" id="id7">1.5 Options Summary</a></li>
+<li><a class="reference internal" href="#cleaning" id="id7">1.5 Cleaning</a></li>
+<li><a class="reference internal" href="#options-summary" id="id8">1.6 Options Summary</a></li>
</ul>
</li>
-<li><a class="reference internal" href="#invocation" id="id8">2 Invocation</a><ul class="auto-toc">
-<li><a class="reference internal" href="#command-line-parameters" id="id9">2.1 Command Line Parameters</a></li>
+<li><a class="reference internal" href="#invocation" id="id9">2 Invocation</a><ul class="auto-toc">
+<li><a class="reference internal" href="#command-line-parameters" id="id10">2.1 Command Line Parameters</a></li>
+<li><a class="reference internal" href="#kernels" id="id11">2.2 Kernels</a></li>
</ul>
</li>
-<li><a class="reference internal" href="#id1" id="id10">3 Benchmarking</a></li>
-<li><a class="reference internal" href="#acknowledgements" id="id11">4 Acknowledgements</a></li>
+<li><a class="reference internal" href="#id1" id="id12">3 Benchmarking</a><ul class="auto-toc">
+<li><a class="reference internal" href="#padding" id="id13">3.1 Padding</a></li>
+</ul>
+</li>
+<li><a class="reference internal" href="#geometries" id="id14">4 Geometries</a></li>
+<li><a class="reference internal" href="#results" id="id15">5 Results</a></li>
+<li><a class="reference internal" href="#licence" id="id16">6 Licence</a></li>
+<li><a class="reference internal" href="#acknowledgements" id="id17">7 Acknowledgements</a></li>
</ul>
</div>
<div class="section" id="compilation">
<div class="section" id="debug-and-verification">
<h2><a class="toc-backref" href="#id3">1.1 Debug and Verification</a></h2>
<pre class="literal-block">
-make
+make BUILD=debug BENCHMARK=off
</pre>
-<p>Running <tt class="docutils literal">make</tt> without any arguments builds the debug version (BUILD=debug) of
+<p>Running <tt class="docutils literal">make</tt> with <tt class="docutils literal">BUILD=debug</tt> builds the debug version of
the benchmark kernels, where no optimizations are performed, line numbers and
debug symbols are included as well as <tt class="docutils literal">DEBUG</tt> will be defined. The resulting
binary will be found in the <tt class="docutils literal">bin</tt> subdirectory and named
<tt class="docutils literal"><span class="pre">lbmbenchk-linux-<compiler>-debug</span></tt>.</p>
-<p>Without any further specification the binary includes verification
-(<tt class="docutils literal">VERIFICATION=on</tt>), statistics (<tt class="docutils literal">STATISTICS</tt>), and VTK output
+<p>Specifying <tt class="docutils literal">BENCHMARK=off</tt> turns on verification
+(<tt class="docutils literal">VERIFICATION=on</tt>), statistics (<tt class="docutils literal">STATISTICS=on</tt>), and VTK output
(<tt class="docutils literal">VTK_OUTPUT=on</tt>) enabled.</p>
<p>Please note that the generated binary will therefore
exhibit a poor performance.</p>
<h2><a class="toc-backref" href="#id4">1.2 Benchmarking</a></h2>
<p>To generate a binary for benchmarking run make with</p>
<pre class="literal-block">
-make BENCHMARK=on BUILD=release
+make
</pre>
-<p>Here BUILD=release turns optimizations on and BENCHMARK=on disables
+<p>As default <tt class="docutils literal">BENCHMARK=on</tt> and <tt class="docutils literal">BUILD=release</tt> is set, where
+BUILD=release turns optimizations on and <tt class="docutils literal">BENCHMARK=on</tt> disables
verfification, statistics, and VTK output.</p>
</div>
<div class="section" id="release-and-verification">
<p>Verification with the debug builds can be extremely slow. Hence verification
capabilities can be build with release builds:</p>
<pre class="literal-block">
-make BUILD=release
+make BENCHMARK=off
</pre>
</div>
<div class="section" id="compilers">
both configuration can be chosen via <tt class="docutils literal"><span class="pre">CONFIG=linux-gcc</span></tt> or
<tt class="docutils literal"><span class="pre">CONFIG=linux-intel</span></tt>.</p>
</div>
+<div class="section" id="cleaning">
+<h2><a class="toc-backref" href="#id7">1.5 Cleaning</a></h2>
+<p>For each configuration and build (debug/release) a subdirectory under the
+<tt class="docutils literal">src/obj</tt> directory is created where the dependency and object files are
+stored.
+With</p>
+<pre class="literal-block">
+make CONFIG=... BUILD=... clean
+</pre>
+<p>a specific combination is select and cleaned, whereas with</p>
+<pre class="literal-block">
+make clean-all
+</pre>
+<p>all object and dependency files are deleted.</p>
+</div>
<div class="section" id="options-summary">
-<h2><a class="toc-backref" href="#id7">1.5 Options Summary</a></h2>
+<h2><a class="toc-backref" href="#id8">1.6 Options Summary</a></h2>
<p>Options that can be specified when building the framework with make:</p>
<table border="1" class="docutils">
<colgroup>
<td>default</td>
<td>description</td>
</tr>
-<tr><td>TARCH</td>
-<td>--</td>
-<td>--</td>
-<td>Via TARCH the architecture the compiler generates code for can be overriden. The value depends on the chose compiler.</td>
-</tr>
<tr><td>BENCHMARK</td>
<td>on, off</td>
-<td>off</td>
-<td>If enabled, disables VERIFICATION, STATISTICS, VTK_OUTPUT.</td>
+<td>on</td>
+<td>If enabled, disables VERIFICATION, STATISTICS, VTK_OUTPUT. If disabled enables the three former options.</td>
</tr>
<tr><td>BUILD</td>
<td>debug, release</td>
-<td>debug</td>
+<td>release</td>
<td>No optimization, debug symbols, DEBUG defined.</td>
</tr>
<tr><td>CONFIG</td>
<td>off</td>
<td>View statistics, like density etc, during simulation.</td>
</tr>
+<tr><td>TARCH</td>
+<td>--</td>
+<td>--</td>
+<td>Via TARCH the architecture the compiler generates code for can be overridden. The value depends on the chosen compiler.</td>
+</tr>
<tr><td>VERIFICATION</td>
<td>on, off</td>
<td>off</td>
</div>
</div>
<div class="section" id="invocation">
-<h1><a class="toc-backref" href="#id8">2 Invocation</a></h1>
+<h1><a class="toc-backref" href="#id9">2 Invocation</a></h1>
<p>Running the binary will print among the GPL licence header a line like the following:</p>
-<blockquote>
-LBM Benchmark Kernels 0.1, compiled Jul 5 2017 21:59:22, type: verification</blockquote>
+<pre class="literal-block">
+LBM Benchmark Kernels 0.1, compiled Jul 5 2017 21:59:22, type: verification
+</pre>
<p>if verfication was enabled during compilation or</p>
-<blockquote>
-LBM Benchmark Kernels 0.1, compiled Jul 5 2017 21:59:22, type: benchmark</blockquote>
+<pre class="literal-block">
+LBM Benchmark Kernels 0.1, compiled Jul 5 2017 21:59:22, type: benchmark
+</pre>
<p>if verfication was disabled during compilation.</p>
<div class="section" id="command-line-parameters">
-<h2><a class="toc-backref" href="#id9">2.1 Command Line Parameters</a></h2>
+<h2><a class="toc-backref" href="#id10">2.1 Command Line Parameters</a></h2>
<p>Running the binary with <tt class="docutils literal"><span class="pre">-h</span></tt> list all available parameters:</p>
<pre class="literal-block">
Usage:
<p>Kernel specific parameters can be opatained via selecting the specific kernel
and passing <tt class="docutils literal"><span class="pre">-h</span></tt> as parameter:</p>
<pre class="literal-block">
-$ bin/lbmbenchk-linux-intel-release -kernel -- -h
+$ bin/lbmbenchk-linux-intel-release -kernel kernel-name -- -h
...
Kernel parameters:
[-blk <n>] [-blk-[xyz] <n>]
blk-pull-aos
</pre>
</div>
+<div class="section" id="kernels">
+<h2><a class="toc-backref" href="#id11">2.2 Kernels</a></h2>
+<p>The following list shortly describes available kernels:</p>
+<ul class="simple">
+<li>push-soa/push-aos/pull-soa/pull-aos:
+Unoptimized kernels (but stream/collide are already fused) using two grids as
+source and destination. Implement push/pull semantics as well structure of
+arrays (soa) or array of structures (aos) layout.</li>
+<li>blk-push-soa/blk-push-aos/blk-pull-soa/blk-pull-aos:
+The same as the unoptimized kernels without the blk prefix, except that they support
+spatial blocking, i.e. loop blocking of the three loops used to iterate over
+the lattice. Here manual work sharing for OpenMP is used.</li>
+<li>list-push-soa/list-push-aos/list-pull-soa/list-pull-aos:
+The same as the unoptimized kernels without the list prefix, but for indirect addressing.
+Here only a 1D vector of is used to store the fluid nodes, omitting the
+obstacles. An adjacency list is used to recover the neighborhood associations.</li>
+<li>list-pull-split-nt-1s-soa/list-pull-split-nt-2s-soa:
+Optimized variant of list-pull-soa. Chunks of the lattice are processed as
+once. Postcollision values are written back via nontemporal stores in 18 (1s)
+or 9 (2s) loops.</li>
+<li>list-aa-aos/list-aa-soa:
+Unoptimized implementation of the AA pattern for the 1D vector with adjacency
+list. Supported are array of structures (aos) and structure of arrays (soa)
+data layout is supported.</li>
+<li>list-aa-ria-soa:
+Implementation of AA pattern with intrinsics for the 1D vector with adjacency
+list. Furthermore it contains a vectorized even time step and run length
+coding to reduce the loop balance of the odd time step.</li>
+<li>list-aa-pv-soa:
+All optimizations of list-aa-ria-soa. Additional with partial vectorization
+of the odd time step.</li>
+</ul>
+<p>Note that all array of structures (aos) kernels might require blocking
+(depending on the domain size) to reach the performance of their structure of
+arrays (soa) counter parts.</p>
+<p>The following table summarizes the properties of the kernels. Here <strong>D</strong> means
+direct addressing, i.e. full array, <strong>I</strong> means indirect addressing, i.e. 1D
+vector with adjacency list, <strong>x</strong> means supported, whereas <strong>--</strong> means unsupported.
+The loop balance B_l is computed for D3Q19 model with double precision floating
+point for PDFs (8 byte) and 4 byte integers for the index (adjacency list).
+As list-aa-ria-soa and list-aa-pv-soa support run length coding their effective
+loop balance depends on the geometry. The effective loop balance is printed
+during each run.</p>
+<table border="1" class="docutils">
+<colgroup>
+<col width="29%" />
+<col width="14%" />
+<col width="14%" />
+<col width="6%" />
+<col width="10%" />
+<col width="10%" />
+<col width="16%" />
+</colgroup>
+<thead valign="bottom">
+<tr><th class="head">kernel name</th>
+<th class="head">prop. step</th>
+<th class="head">data layout</th>
+<th class="head">addr.</th>
+<th class="head">parallel</th>
+<th class="head">blocking</th>
+<th class="head">B_l [B/FLUP]</th>
+</tr>
+</thead>
+<tbody valign="top">
+<tr><td>push-soa</td>
+<td>OS</td>
+<td>SoA</td>
+<td>D</td>
+<td>x</td>
+<td>--</td>
+<td>456</td>
+</tr>
+<tr><td>push-aos</td>
+<td>OS</td>
+<td>AoS</td>
+<td>D</td>
+<td>x</td>
+<td>--</td>
+<td>456</td>
+</tr>
+<tr><td>pull-soa</td>
+<td>OS</td>
+<td>SoA</td>
+<td>D</td>
+<td>x</td>
+<td>--</td>
+<td>456</td>
+</tr>
+<tr><td>pull-aos</td>
+<td>OS</td>
+<td>AoS</td>
+<td>D</td>
+<td>x</td>
+<td>--</td>
+<td>456</td>
+</tr>
+<tr><td>blk-push-soa</td>
+<td>OS</td>
+<td>SoA</td>
+<td>D</td>
+<td>x</td>
+<td>x</td>
+<td>456</td>
+</tr>
+<tr><td>blk-push-aos</td>
+<td>OS</td>
+<td>AoS</td>
+<td>D</td>
+<td>x</td>
+<td>x</td>
+<td>456</td>
+</tr>
+<tr><td>blk-pull-soa</td>
+<td>OS</td>
+<td>SoA</td>
+<td>D</td>
+<td>x</td>
+<td>x</td>
+<td>456</td>
+</tr>
+<tr><td>blk-pull-aos</td>
+<td>OS</td>
+<td>AoS</td>
+<td>D</td>
+<td>x</td>
+<td>x</td>
+<td>456</td>
+</tr>
+<tr><td>list-push-soa</td>
+<td>OS</td>
+<td>SoA</td>
+<td>I</td>
+<td>x</td>
+<td>x</td>
+<td>528</td>
+</tr>
+<tr><td>list-push-aos</td>
+<td>OS</td>
+<td>AoS</td>
+<td>I</td>
+<td>x</td>
+<td>x</td>
+<td>528</td>
+</tr>
+<tr><td>list-pull-soa</td>
+<td>OS</td>
+<td>SoA</td>
+<td>I</td>
+<td>x</td>
+<td>x</td>
+<td>528</td>
+</tr>
+<tr><td>list-pull-aos</td>
+<td>OS</td>
+<td>AoS</td>
+<td>I</td>
+<td>x</td>
+<td>x</td>
+<td>528</td>
+</tr>
+<tr><td>list-pull-split-nt-1s</td>
+<td>OS</td>
+<td>SoA</td>
+<td>I</td>
+<td>x</td>
+<td>x</td>
+<td>376</td>
+</tr>
+<tr><td>list-pull-split-nt-2s</td>
+<td>OS</td>
+<td>SoA</td>
+<td>I</td>
+<td>x</td>
+<td>x</td>
+<td>376</td>
+</tr>
+<tr><td>list-aa-soa</td>
+<td>AA</td>
+<td>SoA</td>
+<td>I</td>
+<td>x</td>
+<td>x</td>
+<td>340</td>
+</tr>
+<tr><td>list-aa-aos</td>
+<td>AA</td>
+<td>AoS</td>
+<td>I</td>
+<td>x</td>
+<td>x</td>
+<td>340</td>
+</tr>
+<tr><td>list-aa-ria-soa</td>
+<td>AA</td>
+<td>SoA</td>
+<td>I</td>
+<td>x</td>
+<td>x</td>
+<td>304-342</td>
+</tr>
+<tr><td>list-aa-pv-soa</td>
+<td>AA</td>
+<td>SoA</td>
+<td>I</td>
+<td>x</td>
+<td>x</td>
+<td>304-342</td>
+</tr>
+</tbody>
+</table>
+</div>
</div>
<div class="section" id="id1">
-<h1><a class="toc-backref" href="#id10">3 Benchmarking</a></h1>
+<h1><a class="toc-backref" href="#id12">3 Benchmarking</a></h1>
<p>Correct benchmarking is a nontrivial task. Whenever benchmark results should be
created make sure the binary was compiled with:</p>
<ul class="simple">
-<li><tt class="docutils literal">BENCHMARK=on</tt> and</li>
-<li><tt class="docutils literal">BUILD=release</tt> and</li>
+<li><tt class="docutils literal">BENCHMARK=on</tt> (default if not overriden) and</li>
+<li><tt class="docutils literal">BUILD=release</tt> (default if not overriden) and</li>
<li>the correct ISA for macros is used, selected via <tt class="docutils literal">ISA</tt> and</li>
<li>use <tt class="docutils literal">TARCH</tt> to specify the architecture the compiler generates code for.</li>
</ul>
<ul class="simple">
<li>transparent huge pages: when allocating memory small 4 KiB pages might be
replaced with larger ones. This is in general a good thing, but if this is
-really the case, depends on the system settings.</li>
+really the case, depends on the system settings (check e.g. the status of
+<tt class="docutils literal">/sys/kernel/mm/transparent_hugepage/enabled</tt>).
+Currently <tt class="docutils literal">madvise(MADV_HUGEPAGE)</tt> is used for allocations which are aligned to
+a 4 KiB page, which should be the case for the lattices.
+This should result in huge pages except THP is disabled on the machine.
+(NOTE: madvise() is used if <tt class="docutils literal">HAVE_HUGE_PAGES</tt> is defined, which is currently
+hard coded defined in <tt class="docutils literal">Memory.c</tt>).</li>
<li>CPU/core frequency: For reproducible results the frequency of all cores
should be fixed.</li>
<li>NUMA placement policy: The benchmark assumes a first touch policy, which
to remote domains typically have a higher latency and lower bandwidth.</li>
<li>System load: interference with other application, espcially on desktop
systems should be avoided.</li>
-<li>Padding: most kernels do not care about padding against cache or TLB
-thrashing. Even if the number of (fluid) nodes suggest everything is fine,
-through parallelization still problems might occur.</li>
+<li>Padding: For SoA based kernels the number of (fluid) nodes is automatically
+adjusted so that no cache or TLB thrashing should occur. The parameters are
+optimized for current Intel based systems. For more details look into the
+padding section.</li>
<li>CPU dispatcher function: the compiler might add different versions of a
function for different ISA extensions. Make sure the code you might think is
executed is actually the code which is executed.</li>
</ul>
+<div class="section" id="padding">
+<h2><a class="toc-backref" href="#id13">3.1 Padding</a></h2>
+<p>With correct padding cache and TLB thrashing can be avoided. Therefore the
+number of (fluid) nodes used in the data layout is artificially increased.</p>
+<p>Currently automatic padding is active for kernels which support it. It can be
+controlled via the kernel parameter (i.e. parameter after the <tt class="docutils literal"><span class="pre">--</span></tt>)
+<tt class="docutils literal"><span class="pre">-pad</span></tt>. Supported values are <tt class="docutils literal">auto</tt> (default), <tt class="docutils literal">no</tt> (to disable padding),
+or a manual padding.</p>
+<p>Automatic padding tries to avoid cache and TLB thrashing and pads for a 32
+entry (huge pages) TLB with 8 sets and a 512 set (L2) cache. This reflects the
+parameters of current Intel based processors.</p>
+<p>Manual padding is done via a padding string and has the format
+<tt class="docutils literal"><span class="pre">mod_1+offset_1(,mod_n+offset_n)</span></tt>, which specifies numbers of bytes.
+SoA data layouts can exhibit TLB thrashing. Therefore we want to distribute the
+19 pages with one lattice (36 with two lattices) we are concurrently accessing
+over as much sets in the TLB as possible.
+This is controlled by the distance between the accessed pages, which is the
+number of (fluid) nodes in between them and can be adjusted by adding further
+(fluid) nodes.
+We want the distance d (in bytes) between two accessed pages to be e.g.
+<strong>d % (PAGE_SIZE * TLB_SETS) = PAGE_SIZE</strong>.
+This would distribute the pages evenly over the sets. Hereby <strong>PAGE_SIZE * TLB_SETS</strong>
+would be our <tt class="docutils literal">mod_1</tt> and <strong>PAGE_SIZE</strong> (after the =) our <tt class="docutils literal">offset_1</tt>.
+Measurements show that with only a quarter of half of a page size as offset
+higher performance is achieved, which is done by automatic padding.
+On top of this padding more paddings can be added. They are just added to the
+padding string and are separated by commas.</p>
+<p>A zero modulus in the padding string has a special meaning. Here the
+corresponding offset is just added to the number of nodes. A padding string
+like <tt class="docutils literal"><span class="pre">-pad</span> 0+16</tt> would at a static padding of two nodes (one node = 8 b).</p>
+</div>
+</div>
+<div class="section" id="geometries">
+<h1><a class="toc-backref" href="#id14">4 Geometries</a></h1>
+<p>TODO: supported geometries: channel, pipe, blocks</p>
+</div>
+<div class="section" id="results">
+<h1><a class="toc-backref" href="#id15">5 Results</a></h1>
+<p>TODO</p>
+</div>
+<div class="section" id="licence">
+<h1><a class="toc-backref" href="#id16">6 Licence</a></h1>
+<p>The Lattice Boltzmann Benchmark Kernels are licensed under GPLv3.</p>
</div>
<div class="section" id="acknowledgements">
-<h1><a class="toc-backref" href="#id11">4 Acknowledgements</a></h1>
+<h1><a class="toc-backref" href="#id17">7 Acknowledgements</a></h1>
<p>This work was funded by BMBF, grant no. 01IH15003A (project SKAMPY).</p>
<p>This work was funded by KONWHIR project OMI4PAPS.</p>
-<p>Document was generated at 2017-10-26 09:43.</p>
+<p>Document was generated at 2017-11-02 15:33.</p>
</div>
</div>
</body>
--- /dev/null
+
+h1, h2, h3, h4, h5, h6 {
+ font-family: sans-serif;
+ font-size: 100%;
+ background-color: #dcdcdc;
+}
+
+h1.title {
+ background-color: gray;
+ color: white
+}
+
+table.footnote {
+ padding-left: 0.5ex;
+}
+
+table.citation {
+ padding-left: 0.5ex
+}
+
+td.label {
+ width: 10%;
+}
+
+table, table.docutils, td, th {
+ border: 0;
+}
+
+table.citation, table.footnote {
+ width: 100%;
+}
+
+th {
+ background-color: lavender ;
+}
+
+tr:nth-child(even) {
+ xxbackground-color: aliceblue;
+ background-color: white;
+}
+tr:nth-child(odd) {
+ xxbackground-color: lavender;
+ background-color: whitesmoke;
+}
+
+
::
- make
+ make BUILD=debug BENCHMARK=off
-Running ``make`` without any arguments builds the debug version (BUILD=debug) of
+Running ``make`` with ``BUILD=debug`` builds the debug version of
the benchmark kernels, where no optimizations are performed, line numbers and
debug symbols are included as well as ``DEBUG`` will be defined. The resulting
binary will be found in the ``bin`` subdirectory and named
``lbmbenchk-linux-<compiler>-debug``.
-Without any further specification the binary includes verification
-(``VERIFICATION=on``), statistics (``STATISTICS``), and VTK output
+Specifying ``BENCHMARK=off`` turns on verification
+(``VERIFICATION=on``), statistics (``STATISTICS=on``), and VTK output
(``VTK_OUTPUT=on``) enabled.
Please note that the generated binary will therefore
To generate a binary for benchmarking run make with ::
- make BENCHMARK=on BUILD=release
+ make
-Here BUILD=release turns optimizations on and BENCHMARK=on disables
+As default ``BENCHMARK=on`` and ``BUILD=release`` is set, where
+BUILD=release turns optimizations on and ``BENCHMARK=on`` disables
verfification, statistics, and VTK output.
Release and Verification
Verification with the debug builds can be extremely slow. Hence verification
capabilities can be build with release builds: ::
- make BUILD=release
+ make BENCHMARK=off
Compilers
---------
both configuration can be chosen via ``CONFIG=linux-gcc`` or
``CONFIG=linux-intel``.
+
+Cleaning
+--------
+
+For each configuration and build (debug/release) a subdirectory under the
+``src/obj`` directory is created where the dependency and object files are
+stored.
+With ::
+
+ make CONFIG=... BUILD=... clean
+
+a specific combination is select and cleaned, whereas with ::
+
+ make clean-all
+
+all object and dependency files are deleted.
+
+
Options Summary
---------------
============= ======================= ============ ==========================================================
name values default description
------------- ----------------------- ------------ ----------------------------------------------------------
-TARCH -- -- Via TARCH the architecture the compiler generates code for can be overriden. The value depends on the chose compiler.
-BENCHMARK on, off off If enabled, disables VERIFICATION, STATISTICS, VTK_OUTPUT.
-BUILD debug, release debug No optimization, debug symbols, DEBUG defined.
+BENCHMARK on, off on If enabled, disables VERIFICATION, STATISTICS, VTK_OUTPUT. If disabled enables the three former options.
+BUILD debug, release release No optimization, debug symbols, DEBUG defined.
CONFIG linux-gcc, linux-intel linux-intel Select GCC or Intel compiler.
ISA avx, sse avx Determines which ISA extension is used for macro definitions. This is *not* the architecture the compiler generates code for.
OPENMP on, off on OpenMP, i.\,e.\. threading support.
STATISTICS on, off off View statistics, like density etc, during simulation.
+TARCH -- -- Via TARCH the architecture the compiler generates code for can be overridden. The value depends on the chosen compiler.
VERIFICATION on, off off Turn verification on/off.
VTK_OUTPUT on, off off Enable/Disable VTK file output.
============= ======================= ============ ==========================================================
Invocation
==========
-Running the binary will print among the GPL licence header a line like the following:
+Running the binary will print among the GPL licence header a line like the following: ::
LBM Benchmark Kernels 0.1, compiled Jul 5 2017 21:59:22, type: verification
-if verfication was enabled during compilation or
+if verfication was enabled during compilation or ::
LBM Benchmark Kernels 0.1, compiled Jul 5 2017 21:59:22, type: benchmark
Kernel specific parameters can be opatained via selecting the specific kernel
and passing ``-h`` as parameter: ::
- $ bin/lbmbenchk-linux-intel-release -kernel -- -h
+ $ bin/lbmbenchk-linux-intel-release -kernel kernel-name -- -h
...
Kernel parameters:
[-blk <n>] [-blk-[xyz] <n>]
blk-pull-soa
blk-pull-aos
+Kernels
+-------
+
+The following list shortly describes available kernels:
+
+- push-soa/push-aos/pull-soa/pull-aos:
+ Unoptimized kernels (but stream/collide are already fused) using two grids as
+ source and destination. Implement push/pull semantics as well structure of
+ arrays (soa) or array of structures (aos) layout.
+
+- blk-push-soa/blk-push-aos/blk-pull-soa/blk-pull-aos:
+ The same as the unoptimized kernels without the blk prefix, except that they support
+ spatial blocking, i.e. loop blocking of the three loops used to iterate over
+ the lattice. Here manual work sharing for OpenMP is used.
+
+- list-push-soa/list-push-aos/list-pull-soa/list-pull-aos:
+ The same as the unoptimized kernels without the list prefix, but for indirect addressing.
+ Here only a 1D vector of is used to store the fluid nodes, omitting the
+ obstacles. An adjacency list is used to recover the neighborhood associations.
+
+- list-pull-split-nt-1s-soa/list-pull-split-nt-2s-soa:
+ Optimized variant of list-pull-soa. Chunks of the lattice are processed as
+ once. Postcollision values are written back via nontemporal stores in 18 (1s)
+ or 9 (2s) loops.
+
+- list-aa-aos/list-aa-soa:
+ Unoptimized implementation of the AA pattern for the 1D vector with adjacency
+ list. Supported are array of structures (aos) and structure of arrays (soa)
+ data layout is supported.
+
+- list-aa-ria-soa:
+ Implementation of AA pattern with intrinsics for the 1D vector with adjacency
+ list. Furthermore it contains a vectorized even time step and run length
+ coding to reduce the loop balance of the odd time step.
+
+- list-aa-pv-soa:
+ All optimizations of list-aa-ria-soa. Additional with partial vectorization
+ of the odd time step.
+
+
+Note that all array of structures (aos) kernels might require blocking
+(depending on the domain size) to reach the performance of their structure of
+arrays (soa) counter parts.
+
+The following table summarizes the properties of the kernels. Here **D** means
+direct addressing, i.e. full array, **I** means indirect addressing, i.e. 1D
+vector with adjacency list, **x** means supported, whereas **--** means unsupported.
+The loop balance B_l is computed for D3Q19 model with double precision floating
+point for PDFs (8 byte) and 4 byte integers for the index (adjacency list).
+As list-aa-ria-soa and list-aa-pv-soa support run length coding their effective
+loop balance depends on the geometry. The effective loop balance is printed
+during each run.
+
+
+====================== =========== =========== ===== ======== ======== ============
+kernel name prop. step data layout addr. parallel blocking B_l [B/FLUP]
+====================== =========== =========== ===== ======== ======== ============
+push-soa OS SoA D x -- 456
+push-aos OS AoS D x -- 456
+pull-soa OS SoA D x -- 456
+pull-aos OS AoS D x -- 456
+blk-push-soa OS SoA D x x 456
+blk-push-aos OS AoS D x x 456
+blk-pull-soa OS SoA D x x 456
+blk-pull-aos OS AoS D x x 456
+list-push-soa OS SoA I x x 528
+list-push-aos OS AoS I x x 528
+list-pull-soa OS SoA I x x 528
+list-pull-aos OS AoS I x x 528
+list-pull-split-nt-1s OS SoA I x x 376
+list-pull-split-nt-2s OS SoA I x x 376
+list-aa-soa AA SoA I x x 340
+list-aa-aos AA AoS I x x 340
+list-aa-ria-soa AA SoA I x x 304-342
+list-aa-pv-soa AA SoA I x x 304-342
+====================== =========== =========== ===== ======== ======== ============
Benchmarking
============
Correct benchmarking is a nontrivial task. Whenever benchmark results should be
created make sure the binary was compiled with:
-- ``BENCHMARK=on`` and
-- ``BUILD=release`` and
+- ``BENCHMARK=on`` (default if not overriden) and
+- ``BUILD=release`` (default if not overriden) and
- the correct ISA for macros is used, selected via ``ISA`` and
- use ``TARCH`` to specify the architecture the compiler generates code for.
- transparent huge pages: when allocating memory small 4 KiB pages might be
replaced with larger ones. This is in general a good thing, but if this is
- really the case, depends on the system settings.
+ really the case, depends on the system settings (check e.g. the status of
+ ``/sys/kernel/mm/transparent_hugepage/enabled``).
+ Currently ``madvise(MADV_HUGEPAGE)`` is used for allocations which are aligned to
+ a 4 KiB page, which should be the case for the lattices.
+ This should result in huge pages except THP is disabled on the machine.
+ (NOTE: madvise() is used if ``HAVE_HUGE_PAGES`` is defined, which is currently
+ hard coded defined in ``Memory.c``).
- CPU/core frequency: For reproducible results the frequency of all cores
should be fixed.
- System load: interference with other application, espcially on desktop
systems should be avoided.
-- Padding: most kernels do not care about padding against cache or TLB
- thrashing. Even if the number of (fluid) nodes suggest everything is fine,
- through parallelization still problems might occur.
+- Padding: For SoA based kernels the number of (fluid) nodes is automatically
+ adjusted so that no cache or TLB thrashing should occur. The parameters are
+ optimized for current Intel based systems. For more details look into the
+ padding section.
- CPU dispatcher function: the compiler might add different versions of a
function for different ISA extensions. Make sure the code you might think is
executed is actually the code which is executed.
+Padding
+-------
+
+With correct padding cache and TLB thrashing can be avoided. Therefore the
+number of (fluid) nodes used in the data layout is artificially increased.
+
+Currently automatic padding is active for kernels which support it. It can be
+controlled via the kernel parameter (i.e. parameter after the ``--``)
+``-pad``. Supported values are ``auto`` (default), ``no`` (to disable padding),
+or a manual padding.
+
+Automatic padding tries to avoid cache and TLB thrashing and pads for a 32
+entry (huge pages) TLB with 8 sets and a 512 set (L2) cache. This reflects the
+parameters of current Intel based processors.
+
+Manual padding is done via a padding string and has the format
+``mod_1+offset_1(,mod_n+offset_n)``, which specifies numbers of bytes.
+SoA data layouts can exhibit TLB thrashing. Therefore we want to distribute the
+19 pages with one lattice (36 with two lattices) we are concurrently accessing
+over as much sets in the TLB as possible.
+This is controlled by the distance between the accessed pages, which is the
+number of (fluid) nodes in between them and can be adjusted by adding further
+(fluid) nodes.
+We want the distance d (in bytes) between two accessed pages to be e.g.
+**d % (PAGE_SIZE * TLB_SETS) = PAGE_SIZE**.
+This would distribute the pages evenly over the sets. Hereby **PAGE_SIZE * TLB_SETS**
+would be our ``mod_1`` and **PAGE_SIZE** (after the =) our ``offset_1``.
+Measurements show that with only a quarter of half of a page size as offset
+higher performance is achieved, which is done by automatic padding.
+On top of this padding more paddings can be added. They are just added to the
+padding string and are separated by commas.
+
+A zero modulus in the padding string has a special meaning. Here the
+corresponding offset is just added to the number of nodes. A padding string
+like ``-pad 0+16`` would at a static padding of two nodes (one node = 8 b).
+
+
+Geometries
+==========
+
+TODO: supported geometries: channel, pipe, blocks
+
+
+Results
+=======
+
+TODO
+
+
+Licence
+=======
+
+The Lattice Boltzmann Benchmark Kernels are licensed under GPLv3.
+
Acknowledgements
================
}
-#define TOOL_NAME "lbmbenchk"
-
#define STRINGIFYX(x) #x
#define STRINGIFY(x) STRINGIFYX(x)
} while (0)
#define Print(formatString, ...) \
- fprintf(stdout, SHC_MAGENTA "[" TOOL_NAME "] " SHC_NC formatString, ##__VA_ARGS__)
+ fprintf(stdout, SHC_MAGENTA formatString SHC_NC, ##__VA_ARGS__)
#define Warning(formatString, ...) \
- fprintf(stdout, "[" TOOL_NAME "] WARNING: " formatString, ##__VA_ARGS__)
+ fprintf(stdout, SHC_BROWN "WARNING: " SHC_NC formatString, ##__VA_ARGS__)
#define Error(formatString, ...) \
- fprintf(stderr, SHC_RED "[" TOOL_NAME "] ERROR: " formatString SHC_NC , ##__VA_ARGS__)
-
+ fprintf(stderr, SHC_RED "ERROR: " formatString SHC_NC , ##__VA_ARGS__)
+/*
#define DebugPrint(formatString, ...) \
- fprintf(stderr, "[" TOOL_NAME "] DEBUG: " formatString, ##__VA_ARGS__)
-
+ fprintf(stderr, "DEBUG: " formatString, ##__VA_ARGS__)
+*/
#ifndef NO_SHELL_COLORS
// or "\e"
#include "Memory.h"
#include "Vtk.h"
+#include "LikwidIf.h"
#include <inttypes.h>
#include <math.h>
for (int iter = 0; iter < maxIterations; ++iter) {
+ X_LIKWID_START("os");
+
#ifdef _OPENMP
#pragma omp parallel for collapse(3) default(none) \
shared(gDims,src, dst, w_0, w_1, w_2, omegaEven, omegaOdd, \
evenPart, oddPart, w_1_indep, w_2_indep)
#endif
- for (int z = oZ; z < nZ + oZ; ++z) {
+ for (int x = oX; x < nX + oX; ++x) {
for (int y = oY; y < nY + oY; ++y) {
- for (int x = oX; x < nX + oX; ++x) {
+ for (int z = oZ; z < nZ + oZ; ++z) {
#define I(x, y, z, dir) P_INDEX_5(gDims, (x), (y), (z), (dir))
#ifdef PROP_MODEL_PUSH
}
} // z, y, x (from inner to outer)
+ // Stop counters before bounce back. Else computing loop balance will be incorrect.
+ X_LIKWID_STOP("os");
+
// Fixup bounce back PDFs.
#ifdef _OPENMP
#pragma omp parallel for default(none) \
for (int iter = 0; iter < maxIterations; ++iter) {
- // #ifdef _OPENMP --> add line continuation
- // #pragma omp parallel for collapse(3) default(none)
- // shared(gDims,src, dst, w_0, w_1, w_2, omegaEven, omegaOdd,
- // w_1_x3, w_2_x3, w_1_nine_half, w_2_nine_half, cd,
- // oX, oY, oZ, nX, nY, nZ, blk)
- // private(ux, uy, uz, ui, dens, dir_indep_trm,
- // pdf_C,
- // pdf_N, pdf_E, pdf_S, pdf_W,
- // pdf_NE, pdf_SE, pdf_SW, pdf_NW,
- // pdf_T, pdf_TN, pdf_TE, pdf_TS, pdf_TW,
- // pdf_B, pdf_BN, pdf_BE, pdf_BS, pdf_BW,
- // evenPart, oddPart, w_1_indep, w_2_indep)
- // #endif
#ifdef _OPENMP
- #pragma omp parallel for default(none) \
+ #pragma omp parallel default(none) \
shared(gDims,src, dst, w_0, w_1, w_2, omegaEven, omegaOdd, \
w_1_x3, w_2_x3, w_1_nine_half, w_2_nine_half, cd, \
oX, oY, oZ, nX, nY, nZ, blk, nThreads) \
pdf_B, pdf_BN, pdf_BE, pdf_BS, pdf_BW, \
evenPart, oddPart, w_1_indep, w_2_indep)
#endif
+ {
+ X_LIKWID_START("blk-os");
- for (int i = 0; i < nThreads; ++i) {
+ int threadId = omp_get_thread_num();
- int threadStartX = nX / nThreads * i;
- int threadEndX = nX / nThreads * (i + 1);
+ int threadStartX = nX / nThreads * threadId;
+ int threadEndX = nX / nThreads * (threadId + 1);
if (nX % nThreads > 0) {
- if (nX % nThreads > i) {
- threadStartX += i;
- threadEndX += i + 1;
+ if (nX % nThreads > threadId) {
+ threadStartX += threadId;
+ threadEndX += threadId + 1;
}
else {
threadStartX += nX % nThreads;
}
}
- // for (int z = oZ; z < nZ + oZ; ++z) {
- // for (int y = oY; y < nY + oY; ++y) {
- // for (int x = oX; x < nX + oX; ++x) {
- for (int bZ = oZ; bZ < nZ + oZ; bZ += blk[2]) {
+ for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) {
for (int bY = oY; bY < nY + oY; bY += blk[1]) {
- for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) {
- // for (int bX = oX; bX < nX + oX; bX += blk[0]) {
+ for (int bZ = oZ; bZ < nZ + oZ; bZ += blk[2]) {
// Must do everything here, else it would break collapse.
int eZ = MIN(bZ + blk[2], nZ + oZ);
int eY = MIN(bY + blk[1], nY + oY);
int eX = MIN(bX + blk[0], threadEndX + oX);
-// printf("%d: %d-%d %d-%d %d-%d %d - %d\n", omp_get_thread_num(), bZ, eZ, bY, eY, bX, eX, threadStartX, threadEndX);
-
- for (int z = bZ; z < eZ; ++z) {
+ for (int x = bX; x < eX; ++x) {
for (int y = bY; y < eY; ++y) {
- for (int x = bX; x < eX; ++x) {
+ for (int z = bZ; z < eZ; ++z) {
#define I(x, y, z, dir) P_INDEX_5(gDims, (x), (y), (z), (dir))
// Load PDFs of local cell: pdf_N = src[I(x, y, z, D3Q19_N)]; ...
#define X(name, idx, idxinv, _x, _y, _z) JOIN(pdf_,name) = src[I(x, y, z, idx)];
- //if (isnan(JOIN(pdf_,name))) { printf("iter: %d %d %d %d %d %s nan\n", iter, x-oX, y-oY, z-oZ, idx, D3Q19_NAMES[idx]); exit(1);}
D3Q19_LIST
#undef X
// Load PDFs of local cell: pdf_N = src[I(x, y, z, D3Q19_N)]; ...
#define X(name, idx, idxinv, _x, _y, _z) JOIN(pdf_,name) = src[I(x - _x, y - _y, z - _z, idx)];
- //if (isnan(JOIN(pdf_,name))) { printf("iter: %d %d %d %d %d %s nan\n", iter, x-oX, y-oY, z-oZ, idx, D3Q19_NAMES[idx]); exit(1);}
D3Q19_LIST
#undef X
} else {
#endif
ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE -
- pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW;
+ pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW;
uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN -
- pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS;
+ pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS;
uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS -
- pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS;
+ pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS;
#ifdef LID_DRIVEN_CAVITY
}
}
} // z, y, x (from inner to outer)
- } // loop over threads
+ X_LIKWID_STOP("blk-os");
+ } // parallel region
+
+ // Stop counters before bounce back. Else computing loop balance will be incorrect.
// Fixup bounce back PDFs.
#ifdef _OPENMP
--- /dev/null
+// --------------------------------------------------------------------------
+//
+// Copyright
+// Markus Wittmann, 2016-2017
+// RRZE, University of Erlangen-Nuremberg, Germany
+// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+// Viktor Haag, 2016
+// LSS, University of Erlangen-Nuremberg, Germany
+//
+// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+// LbmBenchKernels is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// LbmBenchKernels is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#include "BenchKernelD3Q19AaCommon.h"
+
+#include "Memory.h"
+#include "Vtk.h"
+#include "LikwidIf.h"
+
+#include <inttypes.h>
+#include <math.h>
+
+#ifdef _OPENMP
+ #include <omp.h>
+#endif
+
+void FNAME(D3Q19AaKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd)
+{
+ Assert(ld != NULL);
+ Assert(kernelData != NULL);
+ Assert(cd != NULL);
+
+ Assert(cd->Omega > 0.0);
+ Assert(cd->Omega < 2.0);
+
+ KernelData * kd = (KernelData *)kernelData;
+
+
+ int nX = ld->Dims[0];
+ int nY = ld->Dims[1];
+ int nZ = ld->Dims[2];
+
+ int * gDims = kd->GlobalDims;
+
+ int oX = kd->Offsets[0];
+ int oY = kd->Offsets[1];
+ int oZ = kd->Offsets[2];
+
+ KernelDataAa * kda = KDA(kd);
+
+ int blk[3];
+ blk[0] = kda->Blk[0];
+ blk[1] = kda->Blk[1];
+ blk[2] = kda->Blk[2];
+
+ PdfT omega = cd->Omega;
+ PdfT omegaEven = omega;
+ PdfT magicParam = 1.0 / 12.0;
+ // 1/4: best stability;
+ // 1/12: removes third-order advection error (best advection);
+ // 1/6: removes fourth-order diffusion error (best diffusion);
+ // 3/16: exact location of bounce back for poiseuille flow
+
+ PdfT omegaOdd = 1.0/( 0.5 + magicParam/(1.0/omega - 0.5) );
+
+ PdfT evenPart = 0.0;
+ PdfT oddPart = 0.0;
+ PdfT dir_indep_trm = 0.0;
+
+ PdfT w_0 = 1.0 / 3.0;
+ PdfT w_1 = 1.0 / 18.0;
+ PdfT w_2 = 1.0 / 36.0;
+
+ PdfT w_1_x3 = w_1 * 3.0; PdfT w_1_nine_half = w_1 * 9.0/2.0; PdfT w_1_indep = 0.0;
+ PdfT w_2_x3 = w_2 * 3.0; PdfT w_2_nine_half = w_2 * 9.0/2.0; PdfT w_2_indep = 0.0;
+
+ PdfT ux, uy, uz, ui;
+ PdfT dens;
+
+ // Declare pdf_N, pdf_E, pdf_S, pdf_W, ...
+ #define X(name, idx, idxinv, x, y, z) PdfT JOIN(pdf_,name);
+ D3Q19_LIST
+ #undef X
+
+ PdfT * src = kd->Pdfs[0];
+
+ int maxIterations = cd->MaxIterations;
+
+ #ifdef VTK_OUTPUT
+ if (cd->VtkOutput) {
+ kd->PdfsActive = src;
+ VtkWrite(ld, kd, cd, -1);
+ }
+ #endif
+
+ #ifdef STATISTICS
+ kd->PdfsActive = src;
+ KernelStatistics(kd, ld, cd, 0);
+ #endif
+
+
+ int nThreads = 1;
+
+ #ifdef _OPENMP
+ nThreads = omp_get_max_threads();
+ #endif
+
+ for (int iter = 0; iter < maxIterations; iter += 2) {
+
+ // --------------------------------------------------------------------
+ // even time step
+
+ X_LIKWID_START("aa-even");
+
+ // {{{
+
+ #ifdef _OPENMP
+ #pragma omp parallel for default(none) \
+ shared(gDims,src, w_0, w_1, w_2, omegaEven, omegaOdd, \
+ w_1_x3, w_2_x3, w_1_nine_half, w_2_nine_half, cd, \
+ oX, oY, oZ, nX, nY, nZ, blk, nThreads, ld) \
+ private(ux, uy, uz, ui, dens, dir_indep_trm, \
+ pdf_C, \
+ pdf_N, pdf_E, pdf_S, pdf_W, \
+ pdf_NE, pdf_SE, pdf_SW, pdf_NW, \
+ pdf_T, pdf_TN, pdf_TE, pdf_TS, pdf_TW, \
+ pdf_B, pdf_BN, pdf_BE, pdf_BS, pdf_BW, \
+ evenPart, oddPart, w_1_indep, w_2_indep)
+ #endif
+
+ for (int i = 0; i < nThreads; ++i) {
+
+ int threadStartX = nX / nThreads * i;
+ int threadEndX = nX / nThreads * (i + 1);
+
+ if (nX % nThreads > 0) {
+ if (nX % nThreads > i) {
+ threadStartX += i;
+ threadEndX += i + 1;
+ }
+ else {
+ threadStartX += nX % nThreads;
+ threadEndX += nX % nThreads;
+ }
+ }
+
+ for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) {
+ for (int bY = oY; bY < nY + oY; bY += blk[1]) {
+ for (int bZ = oZ; bZ < nZ + oZ; bZ += blk[2]) {
+
+ int eX = MIN(bX + blk[0], threadEndX + oX);
+ int eY = MIN(bY + blk[1], nY + oY);
+ int eZ = MIN(bZ + blk[2], nZ + oZ);
+
+ // printf("%d: %d-%d %d-%d %d-%d %d - %d\n", omp_get_thread_num(), bZ, eZ, bY, eY, bX, eX, threadStartX, threadEndX);
+
+ for (int x = bX; x < eX; ++x) {
+ for (int y = bY; y < eY; ++y) {
+ for (int z = bZ; z < eZ; ++z) {
+
+
+ if (ld->Lattice[L_INDEX_4(ld->Dims, x - oX, y - oY, z - oZ)] == LAT_CELL_OBSTACLE) {
+ continue;
+ }
+
+ #define I(x, y, z, dir) P_INDEX_5(gDims, (x), (y), (z), (dir))
+
+
+ // Load PDFs of local cell: pdf_N = src[I(x, y, z, D3Q19_N)]; ...
+ #define X(name, idx, idxinv, _x, _y, _z) JOIN(pdf_,name) = src[I(x, y, z, idx)];
+ D3Q19_LIST
+ #undef X
+
+// #define LID_DRIVEN_CAVITY
+
+#ifdef LID_DRIVEN_CAVITY
+
+ if (z == nZ - 4 + oZ && x > 3 + oX && x < (nX - 4 + oX) && y > 3 + oY && y < (nY - 4 + oY)) {
+ ux = 0.1 * 0.577;
+ uy = 0.0;
+ uz = 0.0;
+
+ } else {
+ #endif
+ ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE -
+ pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW;
+ uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN -
+ pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS;
+ uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS -
+ pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS;
+ #ifdef LID_DRIVEN_CAVITY
+ }
+ #endif
+
+ dens = pdf_C +
+ pdf_N + pdf_E + pdf_S + pdf_W +
+ pdf_NE + pdf_SE + pdf_SW + pdf_NW +
+ pdf_T + pdf_TN + pdf_TE + pdf_TS + pdf_TW +
+ pdf_B + pdf_BN + pdf_BE + pdf_BS + pdf_BW;
+
+ dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz)*3.0/2.0;
+
+ // direction: w_0
+ src[I(x, y, z, D3Q19_C)] = pdf_C - omegaEven*(pdf_C - w_0*dir_indep_trm);
+
+ // direction: w_1
+ w_1_indep = w_1*dir_indep_trm;
+
+ ui = uy;
+ evenPart = omegaEven*( 0.5*(pdf_N + pdf_S) - ui*ui*w_1_nine_half - w_1_indep );
+ oddPart = omegaOdd*(0.5*(pdf_N - pdf_S) - ui*w_1_x3 );
+ src[I(x, y, z, D3Q19_S)] = pdf_N - evenPart - oddPart;
+ src[I(x, y, z, D3Q19_N)] = pdf_S - evenPart + oddPart;
+
+ ui = ux;
+ evenPart = omegaEven*( 0.5*(pdf_E + pdf_W) - ui*ui*w_1_nine_half - w_1_indep );
+ oddPart = omegaOdd*(0.5*(pdf_E - pdf_W) - ui*w_1_x3 );
+ src[I(x, y, z, D3Q19_W)] = pdf_E - evenPart - oddPart;
+ src[I(x, y, z, D3Q19_E)] = pdf_W - evenPart + oddPart;
+
+ ui = uz;
+ evenPart = omegaEven*( 0.5*(pdf_T + pdf_B) - ui*ui*w_1_nine_half - w_1_indep );
+ oddPart = omegaOdd*(0.5*(pdf_T - pdf_B) - ui*w_1_x3 );
+ src[I(x, y, z, D3Q19_B)] = pdf_T - evenPart - oddPart;
+ src[I(x, y, z, D3Q19_T)] = pdf_B - evenPart + oddPart;
+
+ // direction: w_2
+ w_2_indep = w_2*dir_indep_trm;
+
+ ui = -ux + uy;
+ evenPart = omegaEven*( 0.5*(pdf_NW + pdf_SE) - ui*ui*w_2_nine_half - w_2_indep );
+ oddPart = omegaOdd*(0.5*(pdf_NW - pdf_SE) - ui*w_2_x3 );
+ src[I(x, y, z, D3Q19_SE)] = pdf_NW - evenPart - oddPart;
+ src[I(x, y, z, D3Q19_NW)] = pdf_SE - evenPart + oddPart;
+
+ ui = ux + uy;
+ evenPart = omegaEven*( 0.5*(pdf_NE + pdf_SW) - ui*ui*w_2_nine_half - w_2_indep );
+ oddPart = omegaOdd*(0.5*(pdf_NE - pdf_SW) - ui*w_2_x3 );
+ src[I(x, y, z, D3Q19_SW)] = pdf_NE - evenPart - oddPart;
+ src[I(x, y, z, D3Q19_NE)] = pdf_SW - evenPart + oddPart;
+
+ ui = -ux + uz;
+ evenPart = omegaEven*( 0.5*(pdf_TW + pdf_BE) - ui*ui*w_2_nine_half - w_2_indep );
+ oddPart = omegaOdd*(0.5*(pdf_TW - pdf_BE) - ui*w_2_x3 );
+ src[I(x, y, z, D3Q19_BE)] = pdf_TW - evenPart - oddPart;
+ src[I(x, y, z, D3Q19_TW)] = pdf_BE - evenPart + oddPart;
+
+ ui = ux + uz;
+ evenPart = omegaEven*( 0.5*(pdf_TE + pdf_BW) - ui*ui*w_2_nine_half - w_2_indep );
+ oddPart = omegaOdd*(0.5*(pdf_TE - pdf_BW) - ui*w_2_x3 );
+ src[I(x, y, z, D3Q19_BW)] = pdf_TE - evenPart - oddPart;
+ src[I(x, y, z, D3Q19_TE)] = pdf_BW - evenPart + oddPart;
+
+ ui = -uy + uz;
+ evenPart = omegaEven*( 0.5*(pdf_TS + pdf_BN) - ui*ui*w_2_nine_half - w_2_indep );
+ oddPart = omegaOdd*(0.5*(pdf_TS - pdf_BN) - ui*w_2_x3 );
+ src[I(x, y, z, D3Q19_BN)] = pdf_TS - evenPart - oddPart;
+ src[I(x, y, z, D3Q19_TS)] = pdf_BN - evenPart + oddPart;
+
+ ui = uy + uz;
+ evenPart = omegaEven*( 0.5*(pdf_TN + pdf_BS) - ui*ui*w_2_nine_half - w_2_indep );
+ oddPart = omegaOdd*(0.5*(pdf_TN - pdf_BS) - ui*w_2_x3 );
+ src[I(x, y, z, D3Q19_BS)] = pdf_TN - evenPart - oddPart;
+ src[I(x, y, z, D3Q19_TN)] = pdf_BS - evenPart + oddPart;
+
+ #undef I
+ } } } // z, y, x (from inner to outer)
+ } } } // z, y, x (from inner to outer)
+
+ } // loop over threads
+
+ // }}}
+
+ X_LIKWID_STOP("aa-even");
+
+ #ifdef STATISTICS
+ kd->PdfsActive = src;
+ KernelStatistics(kd, ld, cd, iter);
+ #endif
+
+ // Fixup bounce back PDFs.
+ #ifdef _OPENMP
+ #pragma omp parallel for default(none) \
+ shared(kd, src)
+ #endif
+ for (int i = 0; i < kd->nBounceBackPdfs; ++i) {
+ src[kd->BounceBackPdfsSrc[i]] = src[kd->BounceBackPdfsDst[i]];
+ }
+
+ // save current iteration
+ kda->Iteration = iter;
+
+ #ifdef VERIFICATION
+ kd->PdfsActive = src;
+ KernelAddBodyForce(kd, ld, cd);
+ #endif
+
+ #ifdef VTK_OUTPUT
+ if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) {
+ kd->PdfsActive = src;
+ VtkWrite(ld, kd, cd, iter);
+ }
+ #endif
+
+ #ifdef STATISTICS
+ kd->PdfsActive = src;
+ KernelStatistics(kd, ld, cd, iter);
+ #endif
+
+ // --------------------------------------------------------------------
+ // odd time step
+
+
+ X_LIKWID_START("aa-odd");
+
+ // {{{
+
+ #ifdef _OPENMP
+ #pragma omp parallel for default(none) \
+ shared(gDims,src, w_0, w_1, w_2, omegaEven, omegaOdd, \
+ w_1_x3, w_2_x3, w_1_nine_half, w_2_nine_half, cd, \
+ oX, oY, oZ, nX, nY, nZ, blk, nThreads) \
+ private(ux, uy, uz, ui, dens, dir_indep_trm, \
+ pdf_C, \
+ pdf_N, pdf_E, pdf_S, pdf_W, \
+ pdf_NE, pdf_SE, pdf_SW, pdf_NW, \
+ pdf_T, pdf_TN, pdf_TE, pdf_TS, pdf_TW, \
+ pdf_B, pdf_BN, pdf_BE, pdf_BS, pdf_BW, \
+ evenPart, oddPart, w_1_indep, w_2_indep)
+ #endif
+
+ for (int i = 0; i < nThreads; ++i) {
+
+ int threadStartX = nX / nThreads * i;
+ int threadEndX = nX / nThreads * (i + 1);
+
+ if (nX % nThreads > 0) {
+ if (nX % nThreads > i) {
+ threadStartX += i;
+ threadEndX += i + 1;
+ }
+ else {
+ threadStartX += nX % nThreads;
+ threadEndX += nX % nThreads;
+ }
+ }
+
+ for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) {
+ for (int bY = oY; bY < nY + oY; bY += blk[1]) {
+ for (int bZ = oZ; bZ < nZ + oZ; bZ += blk[2]) {
+
+ // Must do everything here, else it would break collapse.
+ int eZ = MIN(bZ + blk[2], nZ + oZ);
+ int eY = MIN(bY + blk[1], nY + oY);
+ int eX = MIN(bX + blk[0], threadEndX + oX);
+
+ for (int x = bX; x < eX; ++x) {
+ for (int y = bY; y < eY; ++y) {
+ for (int z = bZ; z < eZ; ++z) {
+
+ #define I(x, y, z, dir) P_INDEX_5(gDims, (x), (y), (z), (dir))
+
+ // Load PDFs of local cell: pdf_N = src[I(x, y, z, D3Q19_N)]; ...
+ #define X(name, idx, idxinv, _x, _y, _z) JOIN(pdf_,name) = src[I(x - _x, y - _y, z - _z, idxinv)];
+ D3Q19_LIST
+ #undef X
+
+
+// #define LID_DRIVEN_CAVITY
+
+#ifdef LID_DRIVEN_CAVITY
+
+ if (z == nZ - 4 + oZ && x > 3 + oX && x < (nX - 4 + oX) && y > 3 + oY && y < (nY - 4 + oY)) {
+ ux = 0.1 * 0.577;
+ uy = 0.0;
+ uz = 0.0;
+
+ } else {
+#endif
+ ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE -
+ pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW;
+ uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN -
+ pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS;
+ uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS -
+ pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS;
+#ifdef LID_DRIVEN_CAVITY
+ }
+#endif
+
+ dens = pdf_C +
+ pdf_N + pdf_E + pdf_S + pdf_W +
+ pdf_NE + pdf_SE + pdf_SW + pdf_NW +
+ pdf_T + pdf_TN + pdf_TE + pdf_TS + pdf_TW +
+ pdf_B + pdf_BN + pdf_BE + pdf_BS + pdf_BW;
+
+ dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz)*3.0/2.0;
+
+ // direction: w_0
+ src[I(x, y, z, D3Q19_C)] = pdf_C - omegaEven*(pdf_C - w_0*dir_indep_trm);
+
+ // direction: w_1
+ w_1_indep = w_1*dir_indep_trm;
+
+ ui = uy;
+ evenPart = omegaEven*( 0.5*(pdf_N + pdf_S) - ui*ui*w_1_nine_half - w_1_indep );
+ oddPart = omegaOdd*(0.5*(pdf_N - pdf_S) - ui*w_1_x3 );
+ src[I(x, y + 1, z, D3Q19_N)] = pdf_N - evenPart - oddPart;
+ src[I(x, y - 1, z, D3Q19_S)] = pdf_S - evenPart + oddPart;
+
+ ui = ux;
+ evenPart = omegaEven*( 0.5*(pdf_E + pdf_W) - ui*ui*w_1_nine_half - w_1_indep );
+ oddPart = omegaOdd*(0.5*(pdf_E - pdf_W) - ui*w_1_x3 );
+ src[I(x + 1, y, z, D3Q19_E)] = pdf_E - evenPart - oddPart;
+ src[I(x - 1, y, z, D3Q19_W)] = pdf_W - evenPart + oddPart;
+
+ ui = uz;
+ evenPart = omegaEven*( 0.5*(pdf_T + pdf_B) - ui*ui*w_1_nine_half - w_1_indep );
+ oddPart = omegaOdd*(0.5*(pdf_T - pdf_B) - ui*w_1_x3 );
+ src[I(x, y, z + 1, D3Q19_T)] = pdf_T - evenPart - oddPart;
+ src[I(x, y, z - 1, D3Q19_B)] = pdf_B - evenPart + oddPart;
+
+ // direction: w_2
+ w_2_indep = w_2*dir_indep_trm;
+
+ ui = -ux + uy;
+ evenPart = omegaEven*( 0.5*(pdf_NW + pdf_SE) - ui*ui*w_2_nine_half - w_2_indep );
+ oddPart = omegaOdd*(0.5*(pdf_NW - pdf_SE) - ui*w_2_x3 );
+ src[I(x - 1, y + 1, z, D3Q19_NW)] = pdf_NW - evenPart - oddPart;
+ src[I(x + 1, y - 1, z, D3Q19_SE)] = pdf_SE - evenPart + oddPart;
+
+ ui = ux + uy;
+ evenPart = omegaEven*( 0.5*(pdf_NE + pdf_SW) - ui*ui*w_2_nine_half - w_2_indep );
+ oddPart = omegaOdd*(0.5*(pdf_NE - pdf_SW) - ui*w_2_x3 );
+ src[I(x + 1, y + 1, z, D3Q19_NE)] = pdf_NE - evenPart - oddPart;
+ src[I(x - 1, y - 1, z, D3Q19_SW)] = pdf_SW - evenPart + oddPart;
+
+ ui = -ux + uz;
+ evenPart = omegaEven*( 0.5*(pdf_TW + pdf_BE) - ui*ui*w_2_nine_half - w_2_indep );
+ oddPart = omegaOdd*(0.5*(pdf_TW - pdf_BE) - ui*w_2_x3 );
+ src[I(x - 1, y, z + 1, D3Q19_TW)] = pdf_TW - evenPart - oddPart;
+ src[I(x + 1, y, z - 1, D3Q19_BE)] = pdf_BE - evenPart + oddPart;
+
+ ui = ux + uz;
+ evenPart = omegaEven*( 0.5*(pdf_TE + pdf_BW) - ui*ui*w_2_nine_half - w_2_indep );
+ oddPart = omegaOdd*(0.5*(pdf_TE - pdf_BW) - ui*w_2_x3 );
+ src[I(x + 1, y, z + 1, D3Q19_TE)] = pdf_TE - evenPart - oddPart;
+ src[I(x - 1, y, z - 1, D3Q19_BW)] = pdf_BW - evenPart + oddPart;
+
+ ui = -uy + uz;
+ evenPart = omegaEven*( 0.5*(pdf_TS + pdf_BN) - ui*ui*w_2_nine_half - w_2_indep );
+ oddPart = omegaOdd*(0.5*(pdf_TS - pdf_BN) - ui*w_2_x3 );
+ src[I(x, y - 1, z + 1, D3Q19_TS)] = pdf_TS - evenPart - oddPart;
+ src[I(x, y + 1, z - 1, D3Q19_BN)] = pdf_BN - evenPart + oddPart;
+
+ ui = uy + uz;
+ evenPart = omegaEven*( 0.5*(pdf_TN + pdf_BS) - ui*ui*w_2_nine_half - w_2_indep );
+ oddPart = omegaOdd*(0.5*(pdf_TN - pdf_BS) - ui*w_2_x3 );
+ src[I(x, y + 1, z + 1, D3Q19_TN)] = pdf_TN - evenPart - oddPart;
+ src[I(x, y - 1, z - 1, D3Q19_BS)] = pdf_BS - evenPart + oddPart;
+
+
+ #undef I
+ } } } // z, y, x (from inner to outer)
+ } } } // z, y, x (from inner to outer)
+ } // loop over threads
+
+ // }}}
+
+ // Stop counters before bounce back. Else computing loop balance will be incorrect.
+
+ X_LIKWID_STOP("aa-odd");
+
+ // Fixup bounce back PDFs.
+ #ifdef _OPENMP
+ #pragma omp parallel for default(none) \
+ shared(kd, src)
+ #endif
+ for (int i = 0; i < kd->nBounceBackPdfs; ++i) {
+ src[kd->BounceBackPdfsDst[i]] = src[kd->BounceBackPdfsSrc[i]];
+ }
+
+ // save current iteration
+ kda->Iteration = iter + 1;
+
+ #ifdef VERIFICATION
+ kd->PdfsActive = src;
+ KernelAddBodyForce(kd, ld, cd);
+ #endif
+
+ #ifdef VTK_OUTPUT
+ if (cd->VtkOutput && (iter + 1 % cd->VtkModulus) == 0) {
+ kd->PdfsActive = src;
+ VtkWrite(ld, kd, cd, iter + 1);
+ }
+ #endif
+
+ #ifdef STATISTICS
+ kd->PdfsActive = src;
+ KernelStatistics(kd, ld, cd, iter + 1);
+ #endif // }}}
+
+
+ } // for (int iter = 0; ...
+
+ #ifdef VTK_OUTPUT
+
+ if (cd->VtkOutput) {
+ kd->PdfsActive = src;
+ VtkWrite(ld, kd, cd, maxIterations);
+ }
+
+ #endif
+
+ return;
+}
+
--- /dev/null
+// --------------------------------------------------------------------------
+//
+// Copyright
+// Markus Wittmann, 2016-2017
+// RRZE, University of Erlangen-Nuremberg, Germany
+// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+// Viktor Haag, 2016
+// LSS, University of Erlangen-Nuremberg, Germany
+//
+// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+// LbmBenchKernels is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// LbmBenchKernels is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#ifndef __BENCH_KERNEL_D3Q19_AA__
+#define __BENCH_KERNEL_D3Q19_AA__
+
+#include "Kernel.h"
+
+
+void D3Q19AaInit_AaSoA(LatticeDesc * ld, KernelData ** kernelData, Parameters * params);
+void D3Q19AaInit_AaAoS(LatticeDesc * ld, KernelData ** kernelData, Parameters * params);
+
+void D3Q19AaDeinit_AaSoA(LatticeDesc * ld, KernelData ** kernelData);
+void D3Q19AaDeinit_AaAoS(LatticeDesc * ld, KernelData ** kernelData);
+
+
+
+#endif // __BENCH_KERNEL_D3Q19__
--- /dev/null
+// --------------------------------------------------------------------------
+//
+// Copyright
+// Markus Wittmann, 2016-2017
+// RRZE, University of Erlangen-Nuremberg, Germany
+// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+// Viktor Haag, 2016
+// LSS, University of Erlangen-Nuremberg, Germany
+//
+// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+// LbmBenchKernels is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// LbmBenchKernels is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#include "BenchKernelD3Q19AaCommon.h"
+
+#include "Memory.h"
+#include "Vtk.h"
+
+#include <inttypes.h>
+#include <math.h>
+
+#ifdef _OPENMP
+ #include <omp.h>
+#endif
+
+// Forward definition.
+void FNAME(D3Q19AaKernel)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd);
+
+void FNAME(D3Q19AaBlkKernel)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd);
+
+
+
+static void FNAME(BcGetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT * pdf)
+{
+ Assert(kd != NULL);
+ Assert(kd->PdfsActive != NULL);
+ Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+ Assert(pdf != NULL);
+
+ Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
+ Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
+ Assert(dir >= 0); Assert(dir < N_D3Q19);
+
+ KernelDataAa * kda = KDA(kd);
+
+ int oX = kd->Offsets[0];
+ int oY = kd->Offsets[1];
+ int oZ = kd->Offsets[2];
+
+ if (kda->Iteration % 2 == 0) {
+ // Pdfs are stored inverse, local PDFs are located in remote nodes
+ int nx = x - D3Q19_X[dir];
+ int ny = y - D3Q19_Y[dir];
+ int nz = z - D3Q19_Z[dir];
+
+ #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
+ *pdf = kd->PdfsActive[I(nx + oX, ny + oY, nz + oZ, D3Q19_INV[dir])];
+ #undef I
+ }
+ else {
+ int nx = x;
+ int ny = y;
+ int nz = z;
+
+ #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
+ *pdf = kd->PdfsActive[I(nx + oX, ny + oY, nz + oZ, dir)];
+ #undef I
+ }
+
+
+ return;
+}
+
+static void FNAME(BcSetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT pdf)
+{
+ Assert(kd != NULL);
+ Assert(kd->PdfsActive != NULL);
+ Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+
+ Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
+ Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
+ Assert(dir >= 0); Assert(dir < N_D3Q19);
+
+ KernelDataAa * kda = KDA(kd);
+
+ int oX = kd->Offsets[0];
+ int oY = kd->Offsets[1];
+ int oZ = kd->Offsets[2];
+
+ if (kda->Iteration % 2 == 0) {
+ // Pdfs are stored inverse, local PDFs are located in remote nodes
+ int nx = x - D3Q19_X[dir];
+ int ny = y - D3Q19_Y[dir];
+ int nz = z - D3Q19_Z[dir];
+
+ #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
+ pdf = kd->PdfsActive[I(nx + oX, ny + oY, nz + oZ, D3Q19_INV[dir])] = pdf;
+ #undef I
+ }
+ else {
+ int nx = x;
+ int ny = y;
+ int nz = z;
+
+ #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
+ kd->PdfsActive[I(nx + oX, ny + oY, nz + oZ, dir)] = pdf;
+ #undef I
+ }
+
+ return;
+}
+
+
+static void FNAME(GetNode)(KernelData * kd, int x, int y, int z, PdfT * pdfs)
+{
+ Assert(kd != NULL);
+ Assert(kd->PdfsActive != NULL);
+ Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+ Assert(pdfs != NULL);
+
+ Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
+ Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
+
+ KernelDataAa * kda = KDA(kd);
+
+ int oX = kd->Offsets[0];
+ int oY = kd->Offsets[1];
+ int oZ = kd->Offsets[2];
+
+
+ if (kda->Iteration % 2 == 0) {
+ // Pdfs are stored inverse, local PDFs are located in remote nodes
+
+ #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
+ #define X(name, idx, idxinv, _x, _y, _z) pdfs[idx] = kd->PdfsActive[I(x + oX - _x, y + oY - _y, z + oZ - _z, D3Q19_INV[idx])];
+ D3Q19_LIST
+ #undef X
+ #undef I
+ }
+ else {
+ #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
+ #define X(name, idx, idxinv, _x, _y, _z) pdfs[idx] = kd->PdfsActive[I(x + oX, y + oY, z + oZ, idx)];
+ D3Q19_LIST
+ #undef X
+ #undef I
+
+ }
+
+#if 0 // DETECT NANs
+
+ for (int d = 0; d < 19; ++d) {
+ if (isnan(pdfs[d])) {
+ printf("%d %d %d %d nan! get node\n", x, y, z, d);
+
+ for (int d2 = 0; d2 < 19; ++d2) {
+ printf("%d: %e\n", d2, pdfs[d2]);
+ }
+
+ exit(1);
+ }
+ }
+
+#endif
+
+ return;
+}
+
+
+static void FNAME(SetNode)(KernelData * kd, int x, int y, int z, PdfT * pdfs)
+{
+ Assert(kd != NULL);
+ Assert(kd->PdfsActive != NULL);
+ Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+ Assert(pdfs != NULL);
+
+ Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
+ Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
+
+ KernelDataAa * kda = KDA(kd);
+
+ int oX = kd->Offsets[0];
+ int oY = kd->Offsets[1];
+ int oZ = kd->Offsets[2];
+
+ if (kda->Iteration % 2 == 0) {
+ // Pdfs are stored inverse, local PDFs are located in remote nodes
+
+ #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
+ #define X(name, idx, idxinv, _x, _y, _z) kd->PdfsActive[I(x + oX - _x, y + oY - _y, z + oZ - _z, D3Q19_INV[idx])] = pdfs[idx];
+ D3Q19_LIST
+ #undef X
+ #undef I
+ }
+ else {
+ #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
+ #define X(name, idx, idxinv, _x, _y, _z) kd->PdfsActive[I(x + oX, y + oY, z + oZ, idx)] = pdfs[idx];
+ D3Q19_LIST
+ #undef X
+ #undef I
+ }
+ return;
+}
+
+
+static void ParameterUsage()
+{
+ printf("Kernel parameters:\n");
+ printf(" [-blk <n>] [-blk-[xyz] <n>]\n");
+
+ return;
+}
+
+static void ParseParameters(Parameters * params, int * blk)
+{
+ Assert(blk != NULL);
+
+ blk[0] = 0; blk[1] = 0; blk[2] = 0;
+
+ #define ARG_IS(param) (!strcmp(params->KernelArgs[i], param))
+ #define NEXT_ARG_PRESENT() \
+ do { \
+ if (i + 1 >= params->nKernelArgs) { \
+ printf("ERROR: argument %s requires a parameter.\n", params->KernelArgs[i]); \
+ exit(1); \
+ } \
+ } while (0)
+
+
+ for (int i = 0; i < params->nKernelArgs; ++i) {
+ if (ARG_IS("-blk") || ARG_IS("--blk")) {
+ NEXT_ARG_PRESENT();
+
+ int tmp = strtol(params->KernelArgs[++i], NULL, 0);
+
+ if (tmp < 0) {
+ printf("ERROR: blocking parameter must be >= 0.\n");
+ exit(1);
+ }
+
+ blk[0] = blk[1] = blk[2] = tmp;
+ }
+ else if (ARG_IS("-blk-x") || ARG_IS("--blk-x")) {
+ NEXT_ARG_PRESENT();
+
+ int tmp = strtol(params->KernelArgs[++i], NULL, 0);
+
+ if (tmp < 0) {
+ printf("ERROR: blocking parameter must be >= 0.\n");
+ exit(1);
+ }
+
+ blk[0] = tmp;
+ }
+ else if (ARG_IS("-blk-y") || ARG_IS("--blk-y")) {
+ NEXT_ARG_PRESENT();
+
+ int tmp = strtol(params->KernelArgs[++i], NULL, 0);
+
+ if (tmp < 0) {
+ printf("ERROR: blocking parameter must be >= 0.\n");
+ exit(1);
+ }
+
+ blk[1] = tmp;
+ }
+ else if (ARG_IS("-blk-z") || ARG_IS("--blk-z")) {
+ NEXT_ARG_PRESENT();
+
+ int tmp = strtol(params->KernelArgs[++i], NULL, 0);
+
+ if (tmp < 0) {
+ printf("ERROR: blocking parameter must be >= 0.\n");
+ exit(1);
+ }
+
+ blk[2] = tmp;
+ }
+ else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) {
+ ParameterUsage();
+ exit(1);
+ }
+ else {
+ printf("ERROR: unknown kernel parameter.\n");
+ ParameterUsage();
+ exit(1);
+ }
+ }
+
+ #undef ARG_IS
+ #undef NEXT_ARG_PRESENT
+
+ return;
+}
+
+
+void FNAME(D3Q19AaInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
+{
+ KernelDataAa * kda = NULL;
+ MemAlloc((void **)&kda, sizeof(KernelDataAa));
+
+ kda->Blk[0] = 0; kda->Blk[1] = 0; kda->Blk[2] = 0;
+ kda->Iteration = -1;
+
+ KernelData * kd = &kda->kd;
+ *kernelData = kd;
+
+ kd->nObstIndices = ld->nObst;
+
+ // Ajust the dimensions according to padding, if used.
+ kd->Dims[0] = ld->Dims[0];
+ kd->Dims[1] = ld->Dims[1];
+ kd->Dims[2] = ld->Dims[2];
+
+
+ int * lDims = ld->Dims;
+ int * gDims = kd->GlobalDims;
+
+ gDims[0] = lDims[0] + 2;
+ gDims[1] = lDims[1] + 2;
+ gDims[2] = lDims[2] + 2;
+
+ kd->Offsets[0] = 1;
+ kd->Offsets[1] = 1;
+ kd->Offsets[2] = 1;
+
+ int lX = lDims[0];
+ int lY = lDims[1];
+ int lZ = lDims[2];
+
+ int gX = gDims[0];
+ int gY = gDims[1];
+ int gZ = gDims[2];
+
+ int oX = kd->Offsets[0];
+ int oY = kd->Offsets[1];
+ int oZ = kd->Offsets[2];
+
+ int blk[3] = { 0 };
+
+ int nCells = gX * gY * gZ;
+
+ PdfT * pdfs[2] = { NULL, NULL };
+
+ ParseParameters(params, blk);
+
+ if (blk[0] == 0) blk[0] = gX;
+ if (blk[1] == 0) blk[1] = gY;
+ if (blk[2] == 0) blk[2] = gZ;
+
+ printf("# blocking x: %3d y: %3d z: %3d\n", blk[0], blk[1], blk[2]);
+
+
+ kda->Blk[0] = blk[0]; kda->Blk[1] = blk[1]; kda->Blk[2] = blk[2];
+
+
+ printf("# allocating data for %d LB nodes with padding (%lu bytes = %f MiB for the single lattice)\n",
+ nCells,
+ sizeof(PdfT) * nCells * N_D3Q19,
+ sizeof(PdfT) * nCells * N_D3Q19 / 1024.0 / 1024.0);
+
+#define PAGE_4K 4096
+
+ MemAllocAligned((void **)&pdfs[0], sizeof(PdfT) * nCells * N_D3Q19, PAGE_4K);
+
+ kd->Pdfs[0] = pdfs[0];
+ kd->Pdfs[1] = NULL;
+
+
+ // Initialize PDFs with some (arbitrary) data for correct NUMA placement.
+ // This depends on the chosen data layout.
+ // The structure of the loop should resemble the same "execution layout"
+ // as in the kernel!
+
+ int nThreads;
+#ifdef _OPENMP
+ nThreads = omp_get_max_threads();
+#endif
+
+#ifdef _OPENMP
+ #pragma omp parallel for \
+ shared(gDims, pdfs, \
+ oX, oY, oZ, lX, lY, lZ, blk, nThreads, ld)
+#endif
+ for (int i = 0; i < nThreads; ++i) {
+
+ int threadStartX = lX / nThreads * i;
+ int threadEndX = lX / nThreads * (i + 1);
+
+ if (lX % nThreads > 0) {
+ if (lX % nThreads > i) {
+ threadStartX += i;
+ threadEndX += i + 1;
+ }
+ else {
+ threadStartX += lX % nThreads;
+ threadEndX += lX % nThreads;
+ }
+ }
+
+ for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) {
+ for (int bY = oY; bY < lY + oY; bY += blk[1]) {
+ for (int bZ = oZ; bZ < lZ + oZ; bZ += blk[2]) {
+
+ int eX = MIN(bX + blk[0], threadEndX + oX);
+ int eY = MIN(bY + blk[1], lY + oY);
+ int eZ = MIN(bZ + blk[2], lZ + oZ);
+
+ // printf("%d: %d-%d %d-%d %d-%d %d - %d\n", omp_get_thread_num(), bZ, eZ, bY, eY, bX, eX, threadStartX, threadEndX);
+
+ for (int x = bX; x < eX; ++x) {
+ for (int y = bY; y < eY; ++y) {
+ for (int z = bZ; z < eZ; ++z) {
+
+ for (int d = 0; d < N_D3Q19; ++d) {
+ pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = -1.0;
+ }
+
+ } } } // x, y, z
+ } } } // bx, by, bz
+ }
+
+
+ // Initialize all PDFs to some standard value.
+ for (int x = oX; x < lX + oX; ++x) {
+ for (int y = oY; y < lY + oY; ++y) {
+ for (int z = oZ; z < lZ + oZ; ++z) {
+ for (int d = 0; d < N_D3Q19; ++d) {
+ pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = 0.0;
+ }
+ } } } // x, y, z
+
+
+ // Count how many *PDFs* need bounce back treatment.
+
+ uint64_t nPdfs = ((uint64_t)19) * gX * gY * gZ;
+
+ if (nPdfs > ((2LU << 31) - 1)) {
+ printf("ERROR: number of PDFs exceed 2^31.\n");
+ exit(1);
+ }
+
+ // Compiler bug? Incorrect computation of nBounceBackPdfs when using icc 15.0.2.
+ // Works when declaring nBounceBackPdfs as int64_t or using volatile.
+ volatile int nBounceBackPdfs = 0;
+ // int64_t nBounceBackPdfs = 0;
+ int nx, ny, nz, px, py, pz;
+
+
+ for (int x = 0; x < lX; ++x) {
+ for (int y = 0; y < lY; ++y) {
+ for (int z = 0; z < lZ; ++z) {
+
+ if (ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) {
+ for (int d = 0; d < N_D3Q19; ++d) {
+ nx = x - D3Q19_X[d];
+ ny = y - D3Q19_Y[d];
+ nz = z - D3Q19_Z[d];
+
+ // Check if neighbor is inside the lattice.
+ // if(nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) {
+ // continue;
+ // }
+ if ((nx < 0 || nx >= lX) && ld->PeriodicX) {
+ ++nBounceBackPdfs; // Compiler bug --> see above
+ }
+ else if ((ny < 0 || ny >= lY) && ld->PeriodicY) {
+ ++nBounceBackPdfs; // Compiler bug --> see above
+ }
+ else if ((nz < 0 || nz >= lZ) && ld->PeriodicZ) {
+ ++nBounceBackPdfs; // Compiler bug --> see above
+ }
+ else if (nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) {
+ continue;
+ }
+ else if (ld->Lattice[L_INDEX_4(lDims, nx, ny, nz)] == LAT_CELL_OBSTACLE) {
+ ++nBounceBackPdfs; // Compiler bug --> see above
+ }
+ }
+ }
+ }
+ }
+ }
+
+ printf("# allocating %d indices for bounce back pdfs (%s for source and destination array)\n", nBounceBackPdfs, ByteToHuman(sizeof(int) * nBounceBackPdfs * 2));
+
+ MemAlloc((void **) & (kd->BounceBackPdfsSrc), sizeof(int) * nBounceBackPdfs + 100);
+ MemAlloc((void **) & (kd->BounceBackPdfsDst), sizeof(int) * nBounceBackPdfs + 100);
+
+ kd->nBounceBackPdfs = nBounceBackPdfs;
+ nBounceBackPdfs = 0;
+
+ int srcIndex;
+ int dstIndex;
+
+ // TODO: currently this is not NUMA-aware
+ // - maybe use the same blocking as for lattice initialization?
+ // - do place the bounce back index vector parallel?
+
+ for (int x = 0; x < lX; ++x) {
+ for (int y = 0; y < lY; ++y) {
+ for (int z = 0; z < lZ; ++z) {
+
+ if (ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) {
+ for (int d = 0; d < N_D3Q19; ++d) {
+ nx = x + D3Q19_X[d];
+ ny = y + D3Q19_Y[d];
+ nz = z + D3Q19_Z[d];
+
+ if ( ((nx < 0 || nx >= lX) && ld->PeriodicX) ||
+ ((ny < 0 || ny >= lY) && ld->PeriodicY) ||
+ ((nz < 0 || nz >= lZ) && ld->PeriodicZ)
+ ){
+ // For periodicity:
+
+ // We assume we have finished odd time step (accessing neighbor PDFs) and are
+ // before executing the even time step (accessing local PDFs only).
+
+ // Assuming we are at the most east position of the lattice. Through the odd
+ // time step propagation has put a PDF in the east slot of the ghost cell east
+ // of us, i.e. nx, ny, nz. We copy it to the east slot of the most west node.
+
+ // In case of transition from even to odd time step , src and dest must be
+ // exchanged.
+
+
+ // x periodic
+ if (nx < 0) {
+ px = lX - 1;
+ }
+ else if (nx >= lX) {
+ px = 0;
+ } else {
+ px = nx;
+ }
+
+ // y periodic
+ if (ny < 0) {
+ py = lY - 1;
+ }
+ else if (ny >= lY) {
+ py = 0;
+ } else {
+ py = ny;
+ }
+
+ // z periodic
+ if (nz < 0) {
+ pz = lZ - 1;
+ }
+ else if (nz >= lZ) {
+ pz = 0;
+ } else {
+ pz = nz;
+ }
+
+ if (ld->Lattice[L_INDEX_4(lDims, px, py, pz)] == LAT_CELL_OBSTACLE) {
+ // See description of bounce back handling below.
+ srcIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, d);
+ dstIndex = P_INDEX_5(gDims, x + oX, y + oY, z + oZ, D3Q19_INV[d]);
+ }
+ else {
+
+ srcIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, d);
+ // Put it on the other side back into the domain.
+ dstIndex = P_INDEX_5(gDims, px + oX, py + oY, pz + oZ, d);
+
+ VerifyMsg(nBounceBackPdfs < kd->nBounceBackPdfs, "nBBPdfs %d < kd->nBBPdfs %d xyz: %d %d %d d: %d\n", nBounceBackPdfs, kd->nBounceBackPdfs, x, y, z, d);
+
+ }
+
+ kd->BounceBackPdfsSrc[nBounceBackPdfs] = srcIndex;
+ kd->BounceBackPdfsDst[nBounceBackPdfs] = dstIndex;
+
+ ++nBounceBackPdfs;
+
+ }
+ else if (nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) {
+ continue;
+ }
+ else if (ld->Lattice[L_INDEX_4(lDims, nx, ny, nz)] == LAT_CELL_OBSTACLE) {
+ // Depending on the time step we are in we have to exchange src and dst index.
+
+ // We build the list for the case, when we have finished odd time step
+ // (accessing neighbor PDFs) and before we start with the even time step
+ // (accessing local PDFs only).
+
+ // Assume our neighbor east of us, i.e. nx, ny, nz, is an obstacle cell.
+ // Then we have to move the east PDF from the obstacle to our west position,
+ // i.e. the inverse of east.
+
+ // In case of transition from even to odd time step src and dest just
+ // have to be exchanged.
+
+ srcIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, d);
+ dstIndex = P_INDEX_5(gDims, x + oX, y + oY, z + oZ, D3Q19_INV[d]);
+
+ VerifyMsg(nBounceBackPdfs < kd->nBounceBackPdfs, "nBBPdfs %d < kd->nBBPdfs %d xyz: %d %d %d d: %d\n", nBounceBackPdfs, kd->nBounceBackPdfs, x, y, z, d);
+
+ kd->BounceBackPdfsSrc[nBounceBackPdfs] = srcIndex;
+ kd->BounceBackPdfsDst[nBounceBackPdfs] = dstIndex;
+
+ ++nBounceBackPdfs;
+ }
+ }
+ }
+ }
+ }
+ }
+
+
+ // Fill remaining KernelData structures
+ kd->GetNode = FNAME(GetNode);
+ kd->SetNode = FNAME(SetNode);
+
+ kd->BoundaryConditionsGetPdf = FNAME(BcGetPdf);
+ kd->BoundaryConditionsSetPdf = FNAME(BcSetPdf);
+
+ kd->Kernel = FNAME(D3Q19AaKernel);
+
+ kd->DstPdfs = NULL;
+ kd->PdfsActive = kd->Pdfs[0];
+
+ return;
+}
+
+void FNAME(D3Q19AaDeinit)(LatticeDesc * ld, KernelData ** kernelData)
+{
+ MemFree((void **) & ((*kernelData)->Pdfs[0]));
+ // MemFree((void **) & ((*kernelData)->Pdfs[1]));
+
+ MemFree((void **) & ((*kernelData)->BounceBackPdfsSrc));
+ MemFree((void **) & ((*kernelData)->BounceBackPdfsDst));
+
+ MemFree((void **)kernelData);
+
+ return;
+}
+
--- /dev/null
+// --------------------------------------------------------------------------
+//
+// Copyright
+// Markus Wittmann, 2016-2017
+// RRZE, University of Erlangen-Nuremberg, Germany
+// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+// Viktor Haag, 2016
+// LSS, University of Erlangen-Nuremberg, Germany
+//
+// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+// LbmBenchKernels is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// LbmBenchKernels is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#ifndef __BENCH_KERNEL_D3Q19_AA_COMMON_H__
+#define __BENCH_KERNEL_D3Q19_AA_COMMON_H__
+
+#include "Kernel.h"
+
+typedef struct KernelDataAa_
+{
+ KernelData kd;
+ int Blk[3]; // Blocking in X, Y, and Z direction, value of 0 disables blocking.
+ int Iteration; // Current iteration number.
+} KernelDataAa;
+
+// Macro for casting KernelData * to KernelDataAa *.
+#define KDA(_x_) ((KernelDataAa *)(_x_))
+
+// Build a function name extended by the propagation model name and the data layout.
+// FNANEM(test) will be expanded to test_PushSoA if DATA_LAYOUT_NAME is defined
+// as SoA and PROP_MODEL is defined as Push.
+#define FNAME(functionName) JOIN(JOIN(functionName,_),JOIN(PROP_MODEL_NAME,DATA_LAYOUT_NAME))
+
+#ifndef DATA_LAYOUT_NAME
+ #error DATA_LAYOUT_NAME must be defined
+#endif
+
+#ifndef PROP_MODEL_NAME
+ #error PROP_MODEL_NAME must be defined
+#endif
+
+// -----------------------------------------------------------------------
+// Index function for accesssing PDF array for different data layouts.
+
+#define P_INDEX_5 FNAME(PINDEX5)
+
+static inline int FNAME(PINDEX5)(int dims[3], int x, int y, int z, int d)
+{
+ Assert(dims[0] > 0);
+ Assert(dims[1] > 0);
+ Assert(dims[2] > 0);
+
+ Assert(x >= 0);
+ Assert(x < dims[0]);
+ Assert(y >= 0);
+ Assert(y < dims[1]);
+ Assert(z >= 0);
+ Assert(z < dims[2]);
+ Assert(d >= 0);
+#ifdef D3Q19
+ Assert(d < N_D3Q19);
+#else
+#error Not implemented for this discretization.
+#endif
+
+#ifdef DATA_LAYOUT_SOA
+ return d * dims[0] * dims[1] * dims[2] + x * dims[1] * dims[2] + y * dims[2] + z;
+#elif DATA_LAYOUT_AOS
+ return x * dims[1] * dims[2] * N_D3Q19 + y * dims[2] * N_D3Q19 + z * N_D3Q19 + d;
+#else
+#error P_INDEX_5 function no implemented for chosen data layout.
+#endif
+}
+
+#endif // __BENCH_KERNEL_D3Q19_AA_COMMON_H__
+
--- /dev/null
+// --------------------------------------------------------------------------
+//
+// Copyright
+// Markus Wittmann, 2016-2017
+// RRZE, University of Erlangen-Nuremberg, Germany
+// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+// Viktor Haag, 2016
+// LSS, University of Erlangen-Nuremberg, Germany
+//
+// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+// LbmBenchKernels is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// LbmBenchKernels is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#include "BenchKernelD3Q19AaVecCommon.h"
+
+#include "Memory.h"
+#include "Vtk.h"
+#include "LikwidIf.h"
+#include "Vector.h"
+#include "Vector.h"
+
+#include <inttypes.h>
+#include <math.h>
+
+#ifdef _OPENMP
+ #include <omp.h>
+#endif
+
+static void KernelEven(LatticeDesc * ld, KernelData * kd, CaseData * cd);
+static void KernelOdd( LatticeDesc * ld, KernelData * kd, CaseData * cd);
+
+#if 0 // {{{
+void DumpPdfs(LatticeDesc * ld, KernelData * kd, int zStart, int zStop, int iter, const char * prefix)
+{
+ return;
+
+ int * lDims = ld->Dims;
+ int * gDims = kd->GlobalDims;
+
+ int nX = gDims[0];
+ int nY = gDims[1];
+ int nZ = gDims[2];
+
+ PdfT pdfs[N_D3Q19];
+
+ int localZStart = zStart;
+ int localZStop = zStop;
+
+ if (localZStart == -1) localZStart = 0;
+ if (localZStop == -1) localZStop = gDims[2] - 1;
+
+ printf("D iter: %d\n", iter);
+
+ for (int dir = 0; dir < 19; ++dir) {
+ for (int z = localZStop; z >= localZStart; --z) {
+ printf("D [%2d][%2d][%s] plane % 2d\n", iter, dir, prefix, z);
+
+ for(int y = 0; y < nY; ++y) {
+ printf("D [%2d][%2d][%s] %2d ", iter, dir, prefix, y);
+
+ for(int x = 0; x < nX; ++x) {
+
+ if (1) { // ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) {
+
+ #define I(x, y, z, dir) P_INDEX_5(gDims, (x), (y), (z), (dir))
+ pdfs[dir] = kd->PdfsActive[I(x, y, z, dir)];
+ #undef I
+// kd->GetNode(kd, x, y, z, pdfs);
+ }
+ else {
+ pdfs[dir] = -1.0;
+ }
+
+ printf("%.16e ", pdfs[dir]);
+ }
+
+ printf("\n");
+ }
+ }
+ }
+}
+#endif // }}}
+
+void FNAME(D3Q19AaVecKernel)(LatticeDesc * ld, KernelData * kd, CaseData * cd)
+{
+ Assert(ld != NULL);
+ Assert(kd != NULL);
+ Assert(cd != NULL);
+
+ Assert(cd->Omega > 0.0);
+ Assert(cd->Omega < 2.0);
+
+ KernelDataAa * kda = KDA(kd);
+
+ PdfT * src = kd->PdfsActive;
+
+ int maxIterations = cd->MaxIterations;
+
+ #ifdef VTK_OUTPUT
+ if (cd->VtkOutput) {
+ kd->PdfsActive = src;
+ VtkWrite(ld, kd, cd, -1);
+ }
+ #endif
+
+ #ifdef STATISTICS
+ kd->PdfsActive = src;
+ KernelStatistics(kd, ld, cd, 0);
+ #endif
+
+ Assert((maxIterations % 2) == 0);
+
+ for (int iter = 0; iter < maxIterations; iter += 2) {
+
+ // --------------------------------------------------------------------
+ // even time step
+ // --------------------------------------------------------------------
+
+ X_LIKWID_START("aa-vec-even");
+
+ #pragma omp parallel
+ {
+ KernelEven(ld, kd, cd);
+ }
+
+ X_LIKWID_STOP("aa-vec-even");
+
+ // Fixup bounce back PDFs.
+ #ifdef _OPENMP
+ #pragma omp parallel for default(none) \
+ shared(kd, src)
+ #endif
+ for (int i = 0; i < kd->nBounceBackPdfs; ++i) {
+ src[kd->BounceBackPdfsSrc[i]] = src[kd->BounceBackPdfsDst[i]];
+ }
+
+ // save current iteration
+ kda->Iteration = iter;
+
+ #ifdef VERIFICATION
+ kd->PdfsActive = src;
+ KernelAddBodyForce(kd, ld, cd);
+ #endif
+
+ #ifdef VTK_OUTPUT
+ if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) {
+ kd->PdfsActive = src;
+ VtkWrite(ld, kd, cd, iter);
+ }
+ #endif
+
+ #ifdef STATISTICS
+ kd->PdfsActive = src;
+ KernelStatistics(kd, ld, cd, iter);
+ #endif
+
+ // --------------------------------------------------------------------
+ // odd time step
+ // --------------------------------------------------------------------
+
+ X_LIKWID_START("aa-vec-odd");
+
+ #pragma omp parallel
+ {
+ KernelOdd(ld, kd, cd);
+ }
+
+ // Stop counters before bounce back. Else computing loop balance will
+ // be incorrect.
+
+ X_LIKWID_STOP("aa-vec-odd");
+
+ // Fixup bounce back PDFs.
+ #ifdef _OPENMP
+ #pragma omp parallel for default(none) \
+ shared(kd, src)
+ #endif
+ for (int i = 0; i < kd->nBounceBackPdfs; ++i) {
+ src[kd->BounceBackPdfsDst[i]] = src[kd->BounceBackPdfsSrc[i]];
+ }
+
+ // save current iteration
+ kda->Iteration = iter + 1;
+
+ #ifdef VERIFICATION
+ kd->PdfsActive = src;
+ KernelAddBodyForce(kd, ld, cd);
+ #endif
+
+ #ifdef VTK_OUTPUT
+ if (cd->VtkOutput && ((iter + 1) % cd->VtkModulus) == 0) {
+ kd->PdfsActive = src;
+ VtkWrite(ld, kd, cd, iter + 1);
+ }
+ #endif
+
+ #ifdef STATISTICS
+ kd->PdfsActive = src;
+ KernelStatistics(kd, ld, cd, iter + 1);
+ #endif // }}}
+
+
+ } // for (int iter = 0; ...
+
+ #ifdef VTK_OUTPUT
+
+ if (cd->VtkOutput) {
+ kd->PdfsActive = src;
+ VtkWrite(ld, kd, cd, maxIterations);
+ }
+
+ #endif
+
+ return;
+}
+
+static void KernelEven(LatticeDesc * ld, KernelData * kd, CaseData * cd) // {{{
+{
+ Assert(ld != NULL);
+ Assert(kd != NULL);
+ Assert(cd != NULL);
+
+ Assert(cd->Omega > 0.0);
+ Assert(cd->Omega < 2.0);
+
+ KernelDataAa * kda = KDA(kd);
+
+ int nX = ld->Dims[0];
+ int nY = ld->Dims[1];
+ int nZ = ld->Dims[2];
+
+ int * gDims = kd->GlobalDims;
+
+ int oX = kd->Offsets[0];
+ int oY = kd->Offsets[1];
+ int oZ = kd->Offsets[2];
+
+ int blk[3];
+ blk[0] = kda->Blk[0];
+ blk[1] = kda->Blk[1];
+ blk[2] = kda->Blk[2];
+
+ PdfT omega = cd->Omega;
+ PdfT omegaEven = omega;
+
+ PdfT magicParam = 1.0 / 12.0;
+ PdfT omegaOdd = 1.0 / (0.5 + magicParam / (1.0 / omega - 0.5));
+
+ const PdfT w_0 = 1.0 / 3.0;
+ const PdfT w_1 = 1.0 / 18.0;
+ const PdfT w_2 = 1.0 / 36.0;
+
+ const PdfT w_1_x3 = w_1 * 3.0; const PdfT w_1_nine_half = w_1 * 9.0 / 2.0;
+ const PdfT w_2_x3 = w_2 * 3.0; const PdfT w_2_nine_half = w_2 * 9.0 / 2.0;
+
+
+ VPDFT VONE_HALF = VSET(0.5);
+ VPDFT VTHREE_HALF = VSET(3.0 / 2.0);
+
+ VPDFT vw_1_indep, vw_2_indep;
+ VPDFT vw_0 = VSET(w_0);
+ VPDFT vw_1 = VSET(w_1);
+ VPDFT vw_2 = VSET(w_2);
+
+ VPDFT vw_1_x3 = VSET(w_1_x3);
+ VPDFT vw_2_x3 = VSET(w_2_x3);
+ VPDFT vw_1_nine_half = VSET(w_1_nine_half);
+ VPDFT vw_2_nine_half = VSET(w_2_nine_half);
+
+ VPDFT vui, vux, vuy, vuz, vdens;
+
+ VPDFT vevenPart, voddPart, vdir_indep_trm;
+
+ VPDFT vomegaEven = VSET(omegaEven);
+ VPDFT vomegaOdd = VSET(omegaOdd);
+
+ // Declare pdf_N, pdf_E, pdf_S, pdf_W, ...
+ #define X(name, idx, idxinv, x, y, z) VPDFT JOIN(vpdf_,name);
+ D3Q19_LIST
+ #undef X
+
+ PdfT * src = kd->Pdfs[0];
+
+ int nThreads = 1;
+ int threadId = 0;
+
+ #ifdef _OPENMP
+ nThreads = omp_get_max_threads();
+ threadId = omp_get_thread_num();
+ #endif
+
+ // TODO: Currently only a 1-D decomposition is applied. For achritectures
+ // with a lot of cores we want at least 2-D.
+
+ int threadStartX = nX / nThreads * threadId;
+ int threadEndX = nX / nThreads * (threadId + 1);
+
+ if (nX % nThreads > 0) {
+ if (nX % nThreads > threadId) {
+ threadStartX += threadId;
+ threadEndX += threadId + 1;
+ }
+ else {
+ threadStartX += nX % nThreads;
+ threadEndX += nX % nThreads;
+ }
+ }
+
+ AssertMsg((blk[2] % VSIZE == 0) || blk[2] >= nZ, "Blocking in z direction must be a multiple of VSIZE = %d or larger than z dimension.", VSIZE);
+
+ for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) {
+ for (int bY = oY; bY < nY + oY; bY += blk[1]) {
+ for (int bZ = oZ; bZ < nZ + oZ; bZ += blk[2]) {
+
+ int eX = MIN(bX + blk[0], threadEndX + oX);
+ int eY = MIN(bY + blk[1], nY + oY);
+ int eZ = MIN(bZ + blk[2], nZ + oZ);
+
+ for (int x = bX; x < eX; x += 1) {
+ for (int y = bY; y < eY; y += 1) {
+ for (int z = bZ; z < eZ; z += VSIZE) {
+
+ #define I(x, y, z, dir) P_INDEX_5(gDims, (x), (y), (z), (dir))
+
+ // Load PDFs of local cell: pdf_N = src[I(x, y, z, D3Q19_N)]; ...
+ #define X(name, idx, idxinv, _x, _y, _z) JOIN(vpdf_,name) = VLDU(&src[I(x, y, z, idx)]);
+ D3Q19_LIST
+ #undef X
+
+
+ vux = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_E,VADD(vpdf_NE,vpdf_SE)),VADD(vpdf_TE,vpdf_BE)),vpdf_W),vpdf_NW),vpdf_SW),vpdf_TW),vpdf_BW);
+ vuy = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_N,VADD(vpdf_NE,vpdf_NW)),VADD(vpdf_TN,vpdf_BN)),vpdf_S),vpdf_SE),vpdf_SW),vpdf_TS),vpdf_BS);
+ vuz = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_T,VADD(vpdf_TE,vpdf_TW)),VADD(vpdf_TN,vpdf_TS)),vpdf_B),vpdf_BE),vpdf_BW),vpdf_BN),vpdf_BS);
+
+ vdens = VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(vpdf_C,VADD(vpdf_N,vpdf_E)),VADD(vpdf_S,vpdf_W)),VADD(vpdf_NE,vpdf_SE)),
+ VADD(vpdf_SW,vpdf_NW)),VADD(vpdf_T,vpdf_TN)),VADD(vpdf_TE,vpdf_TS)),VADD(vpdf_TW,vpdf_B)),
+ VADD(vpdf_BN,vpdf_BE)),VADD(vpdf_BS,vpdf_BW));
+
+ vdir_indep_trm = VSUB(vdens,VMUL(VADD(VADD(VMUL(vux,vux),VMUL(vuy,vuy)),VMUL(vuz,vuz)),VTHREE_HALF));
+
+ VSTU(&src[I(x, y, z, D3Q19_C)],VSUB(vpdf_C,VMUL(vomegaEven,VSUB(vpdf_C,VMUL(vw_0,vdir_indep_trm)))));
+
+ vw_1_indep = VMUL(vw_1,vdir_indep_trm);
+
+ vui = vuy;
+ vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_N,vpdf_S)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep));
+ voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_N,vpdf_S)),VMUL(vui,vw_1_x3)));
+ VSTU(&src[I(x, y, z, D3Q19_S)],VSUB(VSUB(vpdf_N,vevenPart),voddPart));
+ VSTU(&src[I(x, y, z, D3Q19_N)],VADD(VSUB(vpdf_S,vevenPart),voddPart));
+
+ vui = vux;
+ vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_E,vpdf_W)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep));
+ voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_E,vpdf_W)),VMUL(vui,vw_1_x3)));
+ VSTU(&src[I(x, y, z, D3Q19_W)],VSUB(VSUB(vpdf_E,vevenPart),voddPart));
+ VSTU(&src[I(x, y, z, D3Q19_E)],VADD(VSUB(vpdf_W,vevenPart),voddPart));
+
+ vui = vuz;
+ vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_T,vpdf_B)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep));
+ voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_T,vpdf_B)),VMUL(vui,vw_1_x3)));
+ VSTU(&src[I(x, y, z, D3Q19_B)],VSUB(VSUB(vpdf_T,vevenPart),voddPart));
+ VSTU(&src[I(x, y, z, D3Q19_T)],VADD(VSUB(vpdf_B,vevenPart),voddPart));
+
+ vw_2_indep = VMUL(vw_2,vdir_indep_trm);
+
+ vui = VSUB(vuy,vux);
+ vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_NW,vpdf_SE)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
+ voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_NW,vpdf_SE)),VMUL(vui,vw_2_x3)));
+ VSTU(&src[I(x, y, z, D3Q19_SE)],VSUB(VSUB(vpdf_NW,vevenPart),voddPart));
+ VSTU(&src[I(x, y, z, D3Q19_NW)],VADD(VSUB(vpdf_SE,vevenPart),voddPart));
+
+ vui = VADD(vux,vuy);
+ vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_NE,vpdf_SW)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
+ voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_NE,vpdf_SW)),VMUL(vui,vw_2_x3)));
+ VSTU(&src[I(x, y, z, D3Q19_SW)],VSUB(VSUB(vpdf_NE,vevenPart),voddPart));
+ VSTU(&src[I(x, y, z, D3Q19_NE)],VADD(VSUB(vpdf_SW,vevenPart),voddPart));
+
+ vui = VSUB(vuz,vux);
+ vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TW,vpdf_BE)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
+ voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TW,vpdf_BE)),VMUL(vui,vw_2_x3)));
+ VSTU(&src[I(x, y, z, D3Q19_BE)],VSUB(VSUB(vpdf_TW,vevenPart),voddPart));
+ VSTU(&src[I(x, y, z, D3Q19_TW)],VADD(VSUB(vpdf_BE,vevenPart),voddPart));
+
+ vui = VADD(vux,vuz);
+ vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TE,vpdf_BW)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
+ voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TE,vpdf_BW)),VMUL(vui,vw_2_x3)));
+ VSTU(&src[I(x, y, z, D3Q19_BW)],VSUB(VSUB(vpdf_TE,vevenPart),voddPart));
+ VSTU(&src[I(x, y, z, D3Q19_TE)],VADD(VSUB(vpdf_BW,vevenPart),voddPart));
+
+ vui = VSUB(vuz,vuy);
+ vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TS,vpdf_BN)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
+ voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TS,vpdf_BN)),VMUL(vui,vw_2_x3)));
+ VSTU(&src[I(x, y, z, D3Q19_BN)],VSUB(VSUB(vpdf_TS,vevenPart),voddPart));
+ VSTU(&src[I(x, y, z, D3Q19_TS)],VADD(VSUB(vpdf_BN,vevenPart),voddPart));
+
+ vui = VADD(vuy,vuz);
+ vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TN,vpdf_BS)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
+ voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TN,vpdf_BS)),VMUL(vui,vw_2_x3)));
+ VSTU(&src[I(x, y, z, D3Q19_BS)],VSUB(VSUB(vpdf_TN,vevenPart),voddPart));
+ VSTU(&src[I(x, y, z, D3Q19_TN)],VADD(VSUB(vpdf_BS,vevenPart),voddPart));
+
+ #undef I
+ } } } // x, y, z
+ } } } // blocked x, y, z
+
+
+
+ return;
+} // }}}
+
+
+static void KernelOdd(LatticeDesc * ld, KernelData * kd, CaseData * cd) // {{{
+{
+ Assert(ld != NULL);
+ Assert(kd != NULL);
+ Assert(cd != NULL);
+
+ Assert(cd->Omega > 0.0);
+ Assert(cd->Omega < 2.0);
+
+ KernelDataAa * kda = KDA(kd);
+
+ int nX = ld->Dims[0];
+ int nY = ld->Dims[1];
+ int nZ = ld->Dims[2];
+
+ int * gDims = kd->GlobalDims;
+
+ int oX = kd->Offsets[0];
+ int oY = kd->Offsets[1];
+ int oZ = kd->Offsets[2];
+
+ int blk[3];
+ blk[0] = kda->Blk[0];
+ blk[1] = kda->Blk[1];
+ blk[2] = kda->Blk[2];
+
+ PdfT omega = cd->Omega;
+ PdfT omegaEven = omega;
+
+ PdfT magicParam = 1.0 / 12.0;
+ PdfT omegaOdd = 1.0 / (0.5 + magicParam / (1.0 / omega - 0.5));
+
+ const PdfT w_0 = 1.0 / 3.0;
+ const PdfT w_1 = 1.0 / 18.0;
+ const PdfT w_2 = 1.0 / 36.0;
+
+ const PdfT w_1_x3 = w_1 * 3.0; const PdfT w_1_nine_half = w_1 * 9.0 / 2.0;
+ const PdfT w_2_x3 = w_2 * 3.0; const PdfT w_2_nine_half = w_2 * 9.0 / 2.0;
+
+ VPDFT VONE_HALF = VSET(0.5);
+ VPDFT VTHREE_HALF = VSET(3.0 / 2.0);
+
+ VPDFT vw_1_indep, vw_2_indep;
+ VPDFT vw_0 = VSET(w_0);
+ VPDFT vw_1 = VSET(w_1);
+ VPDFT vw_2 = VSET(w_2);
+
+ VPDFT vw_1_x3 = VSET(w_1_x3);
+ VPDFT vw_2_x3 = VSET(w_2_x3);
+ VPDFT vw_1_nine_half = VSET(w_1_nine_half);
+ VPDFT vw_2_nine_half = VSET(w_2_nine_half);
+
+ VPDFT vui, vux, vuy, vuz, vdens;
+
+ VPDFT vevenPart, voddPart, vdir_indep_trm;
+
+ VPDFT vomegaEven = VSET(omegaEven);
+ VPDFT vomegaOdd = VSET(omegaOdd);
+
+ // Declare pdf_N, pdf_E, pdf_S, pdf_W, ...
+ #define X(name, idx, idxinv, x, y, z) VPDFT JOIN(vpdf_,name);
+ D3Q19_LIST
+ #undef X
+
+ PdfT * src = kd->Pdfs[0];
+
+ int threadId = 0;
+ int nThreads = 1;
+
+ #ifdef _OPENMP
+ threadId = omp_get_thread_num();
+ nThreads = omp_get_max_threads();
+ #endif
+
+ // TODO: Currently only a 1-D decomposition is applied. For achritectures
+ // with a lot of cores we want at least 2-D.
+ int threadStartX = nX / nThreads * threadId;
+ int threadEndX = nX / nThreads * (threadId + 1);
+
+ if (nX % nThreads > 0) {
+ if (nX % nThreads > threadId) {
+ threadStartX += threadId;
+ threadEndX += threadId + 1;
+ }
+ else {
+ threadStartX += nX % nThreads;
+ threadEndX += nX % nThreads;
+ }
+ }
+
+ AssertMsg((blk[2] % VSIZE == 0) || blk[2] >= nZ, "Blocking in z direction must be a multiple of VSIZE = %d or larger than z dimension.", VSIZE);
+
+ for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) {
+ for (int bY = oY; bY < nY + oY; bY += blk[1]) {
+ for (int bZ = oZ; bZ < nZ + oZ; bZ += blk[2]) {
+
+ int eX = MIN(bX + blk[0], threadEndX + oX);
+ int eY = MIN(bY + blk[1], nY + oY);
+ int eZ = MIN(bZ + blk[2], nZ + oZ);
+
+ for (int x = bX; x < eX; ++x) {
+ for (int y = bY; y < eY; ++y) {
+ for (int z = bZ; z < eZ; z += VSIZE) {
+
+ #define I(x, y, z, dir) P_INDEX_5(gDims, (x), (y), (z), (dir))
+
+
+ #define X(name, idx, idxinv, _x, _y, _z) JOIN(vpdf_,name) = VLDU(&src[I(x - _x, y - _y, z - _z, idxinv)]);
+ D3Q19_LIST
+ #undef X
+
+ vux = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_E,VADD(vpdf_NE,vpdf_SE)),VADD(vpdf_TE,vpdf_BE)),vpdf_W),vpdf_NW),vpdf_SW),vpdf_TW),vpdf_BW);
+ vuy = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_N,VADD(vpdf_NE,vpdf_NW)),VADD(vpdf_TN,vpdf_BN)),vpdf_S),vpdf_SE),vpdf_SW),vpdf_TS),vpdf_BS);
+ vuz = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_T,VADD(vpdf_TE,vpdf_TW)),VADD(vpdf_TN,vpdf_TS)),vpdf_B),vpdf_BE),vpdf_BW),vpdf_BN),vpdf_BS);
+
+ vdens = VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(vpdf_C,VADD(vpdf_N,vpdf_E)),VADD(vpdf_S,vpdf_W)),VADD(vpdf_NE,vpdf_SE)),
+ VADD(vpdf_SW,vpdf_NW)),VADD(vpdf_T,vpdf_TN)),VADD(vpdf_TE,vpdf_TS)),VADD(vpdf_TW,vpdf_B)),VADD(vpdf_BN,vpdf_BE)),VADD(vpdf_BS,vpdf_BW));
+
+ vdir_indep_trm = VSUB(vdens,VMUL(VADD(VADD(VMUL(vux,vux),VMUL(vuy,vuy)),VMUL(vuz,vuz)),VTHREE_HALF));
+
+ VSTU(&src[I(x, y, z, D3Q19_C)],VSUB(vpdf_C,VMUL(vomegaEven,VSUB(vpdf_C,VMUL(vw_0,vdir_indep_trm)))));
+
+ vw_1_indep = VMUL(vw_1,vdir_indep_trm);
+
+ vui = vuy;
+ vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_N,vpdf_S)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep));
+ voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_N,vpdf_S)),VMUL(vui,vw_1_x3)));
+ VSTU(&src[I(x, y + 1, z, D3Q19_N)], VSUB(VSUB(vpdf_N,vevenPart),voddPart));
+ VSTU(&src[I(x, y - 1, z, D3Q19_S)], VADD(VSUB(vpdf_S,vevenPart),voddPart));
+
+ vui = vux;
+ vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_E,vpdf_W)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep));
+ voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_E,vpdf_W)),VMUL(vui,vw_1_x3)));
+ VSTU(&src[I(x + 1, y, z, D3Q19_E)], VSUB(VSUB(vpdf_E,vevenPart),voddPart));
+ VSTU(&src[I(x - 1, y, z, D3Q19_W)], VADD(VSUB(vpdf_W,vevenPart),voddPart));
+
+ vui = vuz;
+ vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_T,vpdf_B)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep));
+ voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_T,vpdf_B)),VMUL(vui,vw_1_x3)));
+ VSTU(&src[I(x, y, z + 1, D3Q19_T)], VSUB(VSUB(vpdf_T,vevenPart),voddPart));
+ VSTU(&src[I(x, y, z - 1, D3Q19_B)], VADD(VSUB(vpdf_B,vevenPart),voddPart));
+
+ vw_2_indep = VMUL(vw_2,vdir_indep_trm);
+
+ vui = VSUB(vuy,vux);
+ vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_NW,vpdf_SE)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
+ voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_NW,vpdf_SE)),VMUL(vui,vw_2_x3)));
+ VSTU(&src[I(x - 1, y + 1, z, D3Q19_NW)], VSUB(VSUB(vpdf_NW,vevenPart),voddPart));
+ VSTU(&src[I(x + 1, y - 1, z, D3Q19_SE)], VADD(VSUB(vpdf_SE,vevenPart),voddPart));
+
+ vui = VADD(vux,vuy);
+ vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_NE,vpdf_SW)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
+ voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_NE,vpdf_SW)),VMUL(vui,vw_2_x3)));
+ VSTU(&src[I(x + 1, y + 1, z, D3Q19_NE)], VSUB(VSUB(vpdf_NE,vevenPart),voddPart));
+ VSTU(&src[I(x - 1, y - 1, z, D3Q19_SW)], VADD(VSUB(vpdf_SW,vevenPart),voddPart));
+
+ vui = VSUB(vuz,vux);
+ vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TW,vpdf_BE)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
+ voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TW,vpdf_BE)),VMUL(vui,vw_2_x3)));
+ VSTU(&src[I(x - 1, y, z + 1, D3Q19_TW)], VSUB(VSUB(vpdf_TW,vevenPart),voddPart));
+ VSTU(&src[I(x + 1, y, z - 1, D3Q19_BE)], VADD(VSUB(vpdf_BE,vevenPart),voddPart));
+
+ vui = VADD(vux,vuz);
+ vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TE,vpdf_BW)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
+ voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TE,vpdf_BW)),VMUL(vui,vw_2_x3)));
+ VSTU(&src[I(x + 1, y, z + 1, D3Q19_TE)], VSUB(VSUB(vpdf_TE,vevenPart),voddPart));
+ VSTU(&src[I(x - 1, y, z - 1, D3Q19_BW)], VADD(VSUB(vpdf_BW,vevenPart),voddPart));
+
+ vui = VSUB(vuz,vuy);
+ vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TS,vpdf_BN)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
+ voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TS,vpdf_BN)),VMUL(vui,vw_2_x3)));
+ VSTU(&src[I(x, y - 1, z + 1, D3Q19_TS)], VSUB(VSUB(vpdf_TS,vevenPart),voddPart));
+ VSTU(&src[I(x, y + 1, z - 1, D3Q19_BN)], VADD(VSUB(vpdf_BN,vevenPart),voddPart));
+
+ vui = VADD(vuy,vuz);
+ vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TN,vpdf_BS)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
+ voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TN,vpdf_BS)),VMUL(vui,vw_2_x3)));
+ VSTU(&src[I(x, y + 1, z + 1, D3Q19_TN)], VSUB(VSUB(vpdf_TN,vevenPart),voddPart));
+ VSTU(&src[I(x, y - 1, z - 1, D3Q19_BS)], VADD(VSUB(vpdf_BS,vevenPart),voddPart));
+
+ #undef I
+ } } } // x, y, z
+ } } } // blocked x, y, z
+
+ return;
+
+} // }}}
--- /dev/null
+// --------------------------------------------------------------------------
+//
+// Copyright
+// Markus Wittmann, 2016-2017
+// RRZE, University of Erlangen-Nuremberg, Germany
+// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+// Viktor Haag, 2016
+// LSS, University of Erlangen-Nuremberg, Germany
+//
+// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+// LbmBenchKernels is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// LbmBenchKernels is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#ifndef __BENCH_KERNEL_D3Q19_AA_VEC__
+#define __BENCH_KERNEL_D3Q19_AA_VEC__
+
+#include "Kernel.h"
+
+
+void D3Q19AaVecInit_AaSoA(LatticeDesc * ld, KernelData ** kernelData, Parameters * params);
+void D3Q19AaVecDeinit_AaSoA(LatticeDesc * ld, KernelData ** kernelData);
+
+
+
+#endif // __BENCH_KERNEL_D3Q19_AA_VEC__
--- /dev/null
+// --------------------------------------------------------------------------
+//
+// Copyright
+// Markus Wittmann, 2016-2017
+// RRZE, University of Erlangen-Nuremberg, Germany
+// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+// Viktor Haag, 2016
+// LSS, University of Erlangen-Nuremberg, Germany
+//
+// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+// LbmBenchKernels is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// LbmBenchKernels is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#include "BenchKernelD3Q19AaVecCommon.h"
+
+#include "Memory.h"
+#include "Vtk.h"
+#include "Vector.h"
+
+#include <inttypes.h>
+#include <math.h>
+
+#ifdef _OPENMP
+ #include <omp.h>
+#endif
+
+// Forward definition.
+void FNAME(D3Q19AaVecKernel)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd);
+
+
+static void FNAME(BcGetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT * pdf)
+{
+ Assert(kd != NULL);
+ Assert(kd->PdfsActive != NULL);
+ Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+ Assert(pdf != NULL);
+
+ Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
+ Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
+ Assert(dir >= 0); Assert(dir < N_D3Q19);
+
+ KernelDataAa * kda = KDA(kd);
+
+ int oX = kd->Offsets[0];
+ int oY = kd->Offsets[1];
+ int oZ = kd->Offsets[2];
+
+ if (kda->Iteration % 2 == 0) {
+ // Pdfs are stored inverse, local PDFs are located in remote nodes
+ int nx = x - D3Q19_X[dir];
+ int ny = y - D3Q19_Y[dir];
+ int nz = z - D3Q19_Z[dir];
+
+ #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
+ *pdf = kd->PdfsActive[I(nx + oX, ny + oY, nz + oZ, D3Q19_INV[dir])];
+ #undef I
+ }
+ else {
+ int nx = x;
+ int ny = y;
+ int nz = z;
+
+ #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
+ *pdf = kd->PdfsActive[I(nx + oX, ny + oY, nz + oZ, dir)];
+ #undef I
+ }
+
+
+ return;
+}
+
+static void FNAME(BcSetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT pdf)
+{
+ Assert(kd != NULL);
+ Assert(kd->PdfsActive != NULL);
+ Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+
+ Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
+ Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
+ Assert(dir >= 0); Assert(dir < N_D3Q19);
+
+ KernelDataAa * kda = KDA(kd);
+
+ int oX = kd->Offsets[0];
+ int oY = kd->Offsets[1];
+ int oZ = kd->Offsets[2];
+
+ if (kda->Iteration % 2 == 0) {
+ // Pdfs are stored inverse, local PDFs are located in remote nodes
+ int nx = x - D3Q19_X[dir];
+ int ny = y - D3Q19_Y[dir];
+ int nz = z - D3Q19_Z[dir];
+
+ #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
+ pdf = kd->PdfsActive[I(nx + oX, ny + oY, nz + oZ, D3Q19_INV[dir])] = pdf;
+ #undef I
+ }
+ else {
+ int nx = x;
+ int ny = y;
+ int nz = z;
+
+ #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
+ kd->PdfsActive[I(nx + oX, ny + oY, nz + oZ, dir)] = pdf;
+ #undef I
+ }
+
+ return;
+}
+
+
+static void FNAME(GetNode)(KernelData * kd, int x, int y, int z, PdfT * pdfs)
+{
+ Assert(kd != NULL);
+ Assert(kd->PdfsActive != NULL);
+ Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+ Assert(pdfs != NULL);
+
+ Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
+ Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
+
+ KernelDataAa * kda = KDA(kd);
+
+ int oX = kd->Offsets[0];
+ int oY = kd->Offsets[1];
+ int oZ = kd->Offsets[2];
+
+
+ if (kda->Iteration % 2 == 0) {
+ // Pdfs are stored inverse, local PDFs are located in remote nodes
+
+ #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
+ #define X(name, idx, idxinv, _x, _y, _z) pdfs[idx] = kd->PdfsActive[I(x + oX - _x, y + oY - _y, z + oZ - _z, D3Q19_INV[idx])];
+ D3Q19_LIST
+ #undef X
+ #undef I
+ }
+ else {
+ #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
+ #define X(name, idx, idxinv, _x, _y, _z) pdfs[idx] = kd->PdfsActive[I(x + oX, y + oY, z + oZ, idx)];
+ D3Q19_LIST
+ #undef X
+ #undef I
+
+ }
+
+#if 0 // DETECT NANs
+
+ for (int d = 0; d < 19; ++d) {
+ if (isnan(pdfs[d])) {
+ printf("%d %d %d %d nan! get node\n", x, y, z, d);
+
+ for (int d2 = 0; d2 < 19; ++d2) {
+ printf("%d: %e\n", d2, pdfs[d2]);
+ }
+
+ exit(1);
+ }
+ }
+
+#endif
+
+ return;
+}
+
+
+static void FNAME(SetNode)(KernelData * kd, int x, int y, int z, PdfT * pdfs)
+{
+ Assert(kd != NULL);
+ Assert(kd->PdfsActive != NULL);
+ Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+ Assert(pdfs != NULL);
+
+ Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
+ Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
+
+ KernelDataAa * kda = KDA(kd);
+
+ int oX = kd->Offsets[0];
+ int oY = kd->Offsets[1];
+ int oZ = kd->Offsets[2];
+
+ if (kda->Iteration % 2 == 0) {
+ // Pdfs are stored inverse, local PDFs are located in remote nodes
+
+ #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
+ #define X(name, idx, idxinv, _x, _y, _z) kd->PdfsActive[I(x + oX - _x, y + oY - _y, z + oZ - _z, D3Q19_INV[idx])] = pdfs[idx];
+ D3Q19_LIST
+ #undef X
+ #undef I
+ }
+ else {
+ #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
+ #define X(name, idx, idxinv, _x, _y, _z) kd->PdfsActive[I(x + oX, y + oY, z + oZ, idx)] = pdfs[idx];
+ D3Q19_LIST
+ #undef X
+ #undef I
+ }
+ return;
+}
+
+
+static void ParameterUsage()
+{
+ printf("Kernel parameters:\n");
+ printf(" [-blk <n>] [-blk-[xyz] <n>]\n");
+
+ return;
+}
+
+static void ParseParameters(Parameters * params, int * blk)
+{
+ Assert(blk != NULL);
+
+ blk[0] = 0; blk[1] = 0; blk[2] = 0;
+
+ #define ARG_IS(param) (!strcmp(params->KernelArgs[i], param))
+ #define NEXT_ARG_PRESENT() \
+ do { \
+ if (i + 1 >= params->nKernelArgs) { \
+ printf("ERROR: argument %s requires a parameter.\n", params->KernelArgs[i]); \
+ exit(1); \
+ } \
+ } while (0)
+
+
+ for (int i = 0; i < params->nKernelArgs; ++i) {
+ if (ARG_IS("-blk") || ARG_IS("--blk")) {
+ NEXT_ARG_PRESENT();
+
+ int tmp = strtol(params->KernelArgs[++i], NULL, 0);
+
+ if (tmp < 0) {
+ printf("ERROR: blocking parameter must be >= 0.\n");
+ exit(1);
+ }
+
+ blk[0] = blk[1] = blk[2] = tmp;
+ }
+ else if (ARG_IS("-blk-x") || ARG_IS("--blk-x")) {
+ NEXT_ARG_PRESENT();
+
+ int tmp = strtol(params->KernelArgs[++i], NULL, 0);
+
+ if (tmp < 0) {
+ printf("ERROR: blocking parameter must be >= 0.\n");
+ exit(1);
+ }
+
+ blk[0] = tmp;
+ }
+ else if (ARG_IS("-blk-y") || ARG_IS("--blk-y")) {
+ NEXT_ARG_PRESENT();
+
+ int tmp = strtol(params->KernelArgs[++i], NULL, 0);
+
+ if (tmp < 0) {
+ printf("ERROR: blocking parameter must be >= 0.\n");
+ exit(1);
+ }
+
+ blk[1] = tmp;
+ }
+ else if (ARG_IS("-blk-z") || ARG_IS("--blk-z")) {
+ NEXT_ARG_PRESENT();
+
+ int tmp = strtol(params->KernelArgs[++i], NULL, 0);
+
+ if (tmp < 0) {
+ printf("ERROR: blocking parameter must be >= 0.\n");
+ exit(1);
+ }
+
+ blk[2] = tmp;
+ }
+ else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) {
+ ParameterUsage();
+ exit(1);
+ }
+ else {
+ printf("ERROR: unknown kernel parameter.\n");
+ ParameterUsage();
+ exit(1);
+ }
+ }
+
+ #undef ARG_IS
+ #undef NEXT_ARG_PRESENT
+
+ return;
+}
+
+
+void FNAME(D3Q19AaVecInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
+{
+ KernelDataAa * kda = NULL;
+ MemAlloc((void **)&kda, sizeof(KernelDataAa));
+
+ kda->Blk[0] = 0; kda->Blk[1] = 0; kda->Blk[2] = 0;
+ kda->Iteration = -1;
+
+ KernelData * kd = &kda->kd;
+ *kernelData = kd;
+
+ kd->nObstIndices = ld->nObst;
+
+ // Ajust the dimensions according to padding, if used.
+ kd->Dims[0] = ld->Dims[0];
+ kd->Dims[1] = ld->Dims[1];
+ kd->Dims[2] = ld->Dims[2];
+
+
+ int * lDims = ld->Dims;
+ int * gDims = kd->GlobalDims;
+
+ Assert(VSIZE <= 4);
+
+ // TODO: only add enough ghost cells so we can compute everything vectorized.
+ gDims[0] = lDims[0] + 2;
+ gDims[1] = lDims[1] + 2;
+ // TODO: fix this for aa-vec2-soa
+ gDims[2] = lDims[2] + 4; // one ghost cell in front, one in the back, plus at most two at the back for VSIZE = 4
+
+ kd->Offsets[0] = 1;
+ kd->Offsets[1] = 1;
+ kd->Offsets[2] = 1;
+
+ int lX = lDims[0];
+ int lY = lDims[1];
+ int lZ = lDims[2];
+
+ int gX = gDims[0];
+ int gY = gDims[1];
+ int gZ = gDims[2];
+
+ int oX = kd->Offsets[0];
+ int oY = kd->Offsets[1];
+ int oZ = kd->Offsets[2];
+
+ int blk[3] = { 0 };
+
+ int nCells = gX * gY * gZ;
+
+ PdfT * pdfs[2] = { NULL, NULL };
+
+ ParseParameters(params, blk);
+
+ if (blk[2] % VSIZE != 0) {
+ blk[2] -= blk[2] % VSIZE;
+ printf("WARNING: blocking factor for z direction must be a multiple of VSIZE = %d, adjusting it to %d.\n", VSIZE, blk[2]);
+ }
+
+ if (blk[0] == 0) blk[0] = gX;
+ if (blk[1] == 0) blk[1] = gY;
+ if (blk[2] == 0) blk[2] = gZ;
+
+ printf("# blocking x: %3d y: %3d z: %3d\n", blk[0], blk[1], blk[2]);
+
+ kda->Blk[0] = blk[0]; kda->Blk[1] = blk[1]; kda->Blk[2] = blk[2];
+
+
+ printf("# allocating data for %d LB nodes with padding (%lu bytes = %f MiB for the single lattice)\n",
+ nCells,
+ sizeof(PdfT) * nCells * N_D3Q19,
+ sizeof(PdfT) * nCells * N_D3Q19 / 1024.0 / 1024.0);
+
+#define PAGE_4K 4096
+
+ MemAllocAligned((void **)&pdfs[0], sizeof(PdfT) * nCells * N_D3Q19, PAGE_4K);
+
+ kd->Pdfs[0] = pdfs[0];
+ kd->Pdfs[1] = NULL;
+
+
+ // Initialize PDFs with some (arbitrary) data for correct NUMA placement.
+ // This depends on the chosen data layout.
+ // The structure of the loop should resemble the same "execution layout"
+ // as in the kernel!
+
+ int nThreads;
+#ifdef _OPENMP
+ nThreads = omp_get_max_threads();
+#endif
+
+#ifdef _OPENMP
+ #pragma omp parallel for \
+ shared(gDims, pdfs, \
+ oX, oY, oZ, lX, lY, lZ, blk, nThreads, ld)
+#endif
+ for (int i = 0; i < nThreads; ++i) {
+
+ int threadStartX = lX / nThreads * i;
+ int threadEndX = lX / nThreads * (i + 1);
+
+ if (lX % nThreads > 0) {
+ if (lX % nThreads > i) {
+ threadStartX += i;
+ threadEndX += i + 1;
+ }
+ else {
+ threadStartX += lX % nThreads;
+ threadEndX += lX % nThreads;
+ }
+ }
+
+ for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) {
+ for (int bY = oY; bY < lY + oY; bY += blk[1]) {
+ for (int bZ = oZ; bZ < lZ + oZ; bZ += blk[2]) {
+
+ int eX = MIN(bX + blk[0], threadEndX + oX);
+ int eY = MIN(bY + blk[1], lY + oY);
+ int eZ = MIN(bZ + blk[2], lZ + oZ);
+
+ // printf("%d: %d-%d %d-%d %d-%d %d - %d\n", omp_get_thread_num(), bZ, eZ, bY, eY, bX, eX, threadStartX, threadEndX);
+
+ for (int x = bX; x < eX; ++x) {
+ for (int y = bY; y < eY; ++y) {
+ for (int z = bZ; z < eZ; ++z) {
+
+ for (int d = 0; d < N_D3Q19; ++d) {
+ pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = -50.0;
+ }
+
+ } } } // x, y, z
+ } } } // bx, by, bz
+ }
+
+
+ // Initialize all PDFs to some standard value.
+ for (int x = oX; x < lX + oX; ++x) {
+ for (int y = oY; y < lY + oY; ++y) {
+ for (int z = oZ; z < lZ + oZ; ++z) {
+ for (int d = 0; d < N_D3Q19; ++d) {
+ pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = 0.0;
+ }
+ } } } // x, y, z
+
+
+ // Count how many *PDFs* need bounce back treatment.
+
+ uint64_t nPdfs = ((uint64_t)19) * gX * gY * gZ;
+
+ if (nPdfs > ((2LU << 31) - 1)) {
+ printf("ERROR: number of PDFs exceed 2^31.\n");
+ exit(1);
+ }
+
+ // Compiler bug? Incorrect computation of nBounceBackPdfs when using icc 15.0.2.
+ // Works when declaring nBounceBackPdfs as int64_t or using volatile.
+ volatile int nBounceBackPdfs = 0;
+ // int64_t nBounceBackPdfs = 0;
+ int nx, ny, nz, px, py, pz;
+
+
+ for (int x = 0; x < lX; ++x) {
+ for (int y = 0; y < lY; ++y) {
+ for (int z = 0; z < lZ; ++z) {
+
+ if (ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) {
+ for (int d = 0; d < N_D3Q19; ++d) {
+ nx = x - D3Q19_X[d];
+ ny = y - D3Q19_Y[d];
+ nz = z - D3Q19_Z[d];
+
+ // Check if neighbor is inside the lattice.
+ // if(nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) {
+ // continue;
+ // }
+ if ((nx < 0 || nx >= lX) && ld->PeriodicX) {
+ ++nBounceBackPdfs; // Compiler bug --> see above
+ }
+ else if ((ny < 0 || ny >= lY) && ld->PeriodicY) {
+ ++nBounceBackPdfs; // Compiler bug --> see above
+ }
+ else if ((nz < 0 || nz >= lZ) && ld->PeriodicZ) {
+ ++nBounceBackPdfs; // Compiler bug --> see above
+ }
+ else if (nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) {
+ continue;
+ }
+ else if (ld->Lattice[L_INDEX_4(lDims, nx, ny, nz)] == LAT_CELL_OBSTACLE) {
+ ++nBounceBackPdfs; // Compiler bug --> see above
+ }
+ }
+ }
+ }
+ }
+ }
+
+ printf("# allocating %d indices for bounce back pdfs (%s for source and destination array)\n", nBounceBackPdfs, ByteToHuman(sizeof(int) * nBounceBackPdfs * 2));
+
+ MemAlloc((void **) & (kd->BounceBackPdfsSrc), sizeof(int) * nBounceBackPdfs + 100);
+ MemAlloc((void **) & (kd->BounceBackPdfsDst), sizeof(int) * nBounceBackPdfs + 100);
+
+ kd->nBounceBackPdfs = nBounceBackPdfs;
+ nBounceBackPdfs = 0;
+
+ int srcIndex;
+ int dstIndex;
+
+ // TODO: currently this is not NUMA-aware
+ // - maybe use the same blocking as for lattice initialization?
+ // - do place the bounce back index vector parallel?
+
+ for (int x = 0; x < lX; ++x) {
+ for (int y = 0; y < lY; ++y) {
+ for (int z = 0; z < lZ; ++z) {
+
+ if (ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) {
+ for (int d = 0; d < N_D3Q19; ++d) {
+ nx = x + D3Q19_X[d];
+ ny = y + D3Q19_Y[d];
+ nz = z + D3Q19_Z[d];
+
+ if ( ((nx < 0 || nx >= lX) && ld->PeriodicX) ||
+ ((ny < 0 || ny >= lY) && ld->PeriodicY) ||
+ ((nz < 0 || nz >= lZ) && ld->PeriodicZ)
+ ){
+ // For periodicity:
+
+ // We assume we have finished odd time step (accessing neighbor PDFs) and are
+ // before executing the even time step (accessing local PDFs only).
+
+ // Assuming we are at the most east position of the lattice. Through the odd
+ // time step propagation has put a PDF in the east slot of the ghost cell east
+ // of us, i.e. nx, ny, nz. We copy it to the east slot of the most west node.
+
+ // In case of transition from even to odd time step , src and dest must be
+ // exchanged.
+
+
+ // x periodic
+ if (nx < 0) {
+ px = lX - 1;
+ }
+ else if (nx >= lX) {
+ px = 0;
+ } else {
+ px = nx;
+ }
+
+ // y periodic
+ if (ny < 0) {
+ py = lY - 1;
+ }
+ else if (ny >= lY) {
+ py = 0;
+ } else {
+ py = ny;
+ }
+
+ // z periodic
+ if (nz < 0) {
+ pz = lZ - 1;
+ }
+ else if (nz >= lZ) {
+ pz = 0;
+ } else {
+ pz = nz;
+ }
+
+ if (ld->Lattice[L_INDEX_4(lDims, px, py, pz)] == LAT_CELL_OBSTACLE) {
+ // See description of bounce back handling below.
+ srcIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, d);
+ dstIndex = P_INDEX_5(gDims, x + oX, y + oY, z + oZ, D3Q19_INV[d]);
+ }
+ else {
+
+ srcIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, d);
+ // Put it on the other side back into the domain.
+ dstIndex = P_INDEX_5(gDims, px + oX, py + oY, pz + oZ, d);
+
+ VerifyMsg(nBounceBackPdfs < kd->nBounceBackPdfs, "nBBPdfs %d < kd->nBBPdfs %d xyz: %d %d %d d: %d\n", nBounceBackPdfs, kd->nBounceBackPdfs, x, y, z, d);
+
+ }
+
+ kd->BounceBackPdfsSrc[nBounceBackPdfs] = srcIndex;
+ kd->BounceBackPdfsDst[nBounceBackPdfs] = dstIndex;
+
+ ++nBounceBackPdfs;
+
+ }
+ else if (nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) {
+ continue;
+ }
+ else if (ld->Lattice[L_INDEX_4(lDims, nx, ny, nz)] == LAT_CELL_OBSTACLE) {
+ // Depending on the time step we are in we have to exchange src and dst index.
+
+ // We build the list for the case, when we have finished odd time step
+ // (accessing neighbor PDFs) and before we start with the even time step
+ // (accessing local PDFs only).
+
+ // Assume our neighbor east of us, i.e. nx, ny, nz, is an obstacle cell.
+ // Then we have to move the east PDF from the obstacle to our west position,
+ // i.e. the inverse of east.
+
+ // In case of transition from even to odd time step src and dest just
+ // have to be exchanged.
+
+ srcIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, d);
+ dstIndex = P_INDEX_5(gDims, x + oX, y + oY, z + oZ, D3Q19_INV[d]);
+
+ VerifyMsg(nBounceBackPdfs < kd->nBounceBackPdfs, "nBBPdfs %d < kd->nBBPdfs %d xyz: %d %d %d d: %d\n", nBounceBackPdfs, kd->nBounceBackPdfs, x, y, z, d);
+
+ kd->BounceBackPdfsSrc[nBounceBackPdfs] = srcIndex;
+ kd->BounceBackPdfsDst[nBounceBackPdfs] = dstIndex;
+
+ ++nBounceBackPdfs;
+ }
+ }
+ }
+ }
+ }
+ }
+
+
+ // Fill remaining KernelData structures
+ kd->GetNode = FNAME(GetNode);
+ kd->SetNode = FNAME(SetNode);
+
+ kd->BoundaryConditionsGetPdf = FNAME(BcGetPdf);
+ kd->BoundaryConditionsSetPdf = FNAME(BcSetPdf);
+
+ kd->Kernel = FNAME(D3Q19AaVecKernel);
+
+ kd->DstPdfs = NULL;
+ kd->PdfsActive = kd->Pdfs[0];
+
+ return;
+}
+
+void FNAME(D3Q19AaVecDeinit)(LatticeDesc * ld, KernelData ** kernelData)
+{
+ MemFree((void **) & ((*kernelData)->Pdfs[0]));
+
+ MemFree((void **) & ((*kernelData)->BounceBackPdfsSrc));
+ MemFree((void **) & ((*kernelData)->BounceBackPdfsDst));
+
+ MemFree((void **)kernelData);
+
+ return;
+}
+
--- /dev/null
+// --------------------------------------------------------------------------
+//
+// Copyright
+// Markus Wittmann, 2016-2017
+// RRZE, University of Erlangen-Nuremberg, Germany
+// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+// Viktor Haag, 2016
+// LSS, University of Erlangen-Nuremberg, Germany
+//
+// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+// LbmBenchKernels is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// LbmBenchKernels is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#ifndef __BENCH_KERNEL_D3Q19_AA_VEC_COMMON_H__
+#define __BENCH_KERNEL_D3Q19_AA_VEC_COMMON_H__
+
+// #include "BenchKernelD3Q19AaCommon.h"
+
+#include "Kernel.h"
+
+typedef struct KernelDataAa_
+{
+ KernelData kd;
+ int Blk[3]; // Blocking in X, Y, and Z direction, value of 0 disables blocking.
+ int Iteration; // Current iteration number.
+} KernelDataAa;
+
+// Macro for casting KernelData * to KernelDataAa *.
+#define KDA(_x_) ((KernelDataAa *)(_x_))
+
+// Build a function name extended by the propagation model name and the data layout.
+// FNANEM(test) will be expanded to test_PushSoA if DATA_LAYOUT_NAME is defined
+// as SoA and PROP_MODEL is defined as Push.
+#define FNAME(functionName) JOIN(JOIN(functionName,_),JOIN(PROP_MODEL_NAME,DATA_LAYOUT_NAME))
+
+#ifndef DATA_LAYOUT_SOA
+ #error Only DATA_LAYOUT_SOA is supported.
+#endif
+
+#ifndef PROP_MODEL_AA
+ #error Only PROP_MODEL_AA is supported.
+#endif
+
+// -----------------------------------------------------------------------
+// Index function for accesssing PDF array for different data layouts.
+
+#define P_INDEX_5 FNAME(PINDEX5)
+
+static inline int FNAME(PINDEX5)(int dims[3], int x, int y, int z, int d)
+{
+ Assert(dims[0] > 0);
+ Assert(dims[1] > 0);
+ Assert(dims[2] > 0);
+
+ Assert(x >= 0);
+ Assert(x < dims[0]);
+ Assert(y >= 0);
+ Assert(y < dims[1]);
+ Assert(z >= 0);
+ Assert(z < dims[2]);
+ Assert(d >= 0);
+#ifdef D3Q19
+ Assert(d < N_D3Q19);
+#else
+ #error Not implemented for this discretization.
+#endif
+
+#ifdef DATA_LAYOUT_SOA
+ return d * dims[0] * dims[1] * dims[2] + x * dims[1] * dims[2] + y * dims[2] + z;
+// #elif DATA_LAYOUT_AOS
+// return x * dims[1] * dims[2] * N_D3Q19 + y * dims[2] * N_D3Q19 + z * N_D3Q19 + d;
+#else
+ #error P_INDEX_5 function no implemented for chosen data layout.
+#endif
+}
+
+
+
+#endif // __BENCH_KERNEL_D3Q19_AA_VEC_COMMON_H__
+
#include <inttypes.h>
#include <math.h>
+#ifdef _OPENMP
+ #include <omp.h>
+#endif
// Forward definition.
void FNAME(D3Q19Kernel)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd);
int tmp = strtol(params->KernelArgs[++i], NULL, 0);
- if (tmp <= 0) {
- printf("ERROR: blocking parameter must be > 0.\n");
+ if (tmp < 0) {
+ printf("ERROR: blocking parameter must be >= 0.\n");
exit(1);
}
int tmp = strtol(params->KernelArgs[++i], NULL, 0);
- if (tmp <= 0) {
- printf("ERROR: blocking parameter must be > 0.\n");
+ if (tmp < 0) {
+ printf("ERROR: blocking parameter must be >= 0.\n");
exit(1);
}
int tmp = strtol(params->KernelArgs[++i], NULL, 0);
- if (tmp <= 0) {
- printf("ERROR: blocking parameter must be > 0.\n");
+ if (tmp < 0) {
+ printf("ERROR: blocking parameter must be >= 0.\n");
exit(1);
}
int tmp = strtol(params->KernelArgs[++i], NULL, 0);
- if (tmp <= 0) {
- printf("ERROR: blocking parameter must be > 0.\n");
+ if (tmp < 0) {
+ printf("ERROR: blocking parameter must be >= 0.\n");
exit(1);
}
}
-void FNAME(D3Q19BlkInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
+static void D3Q19InternalInit(int blocking, LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
{
KernelDataEx * kdex = NULL;
MemAlloc((void **)&kdex, sizeof(KernelDataEx));
PdfT * pdfs[2];
- ParseParameters(params, blk);
+ if (blocking) {
+ ParseParameters(params, blk);
+ }
+ else {
+ if (params->nKernelArgs != 0) {
+ printf("ERROR: unknown kernel parameter.\n");
+ printf("This kernels accepts no parameters.\n");
+ exit(1);
+ }
+ }
+
if (blk[0] == 0) blk[0] = gX;
if (blk[1] == 0) blk[1] = gY;
nCells, 2 * sizeof(PdfT) * nCells * N_D3Q19,
2 * sizeof(PdfT) * nCells * N_D3Q19 / 1024.0 / 1024.0);
- MemAlloc((void **)&pdfs[0], sizeof(PdfT) * nCells * N_D3Q19);
- MemAlloc((void **)&pdfs[1], sizeof(PdfT) * nCells * N_D3Q19);
+ // MemAlloc((void **)&pdfs[0], sizeof(PdfT) * nCells * N_D3Q19);
+ // MemAlloc((void **)&pdfs[1], sizeof(PdfT) * nCells * N_D3Q19);
+ MemAllocAligned((void **)&pdfs[0], sizeof(PdfT) * nCells * N_D3Q19, 2 * 1024 * 1024);
+ MemAllocAligned((void **)&pdfs[1], sizeof(PdfT) * nCells * N_D3Q19, 2 * 1024 * 1024);
+
+ printf("# pdfs[0] = %p\n", pdfs[0]);
+ printf("# pdfs[1] = %p\n", pdfs[1]);
kd->Pdfs[0] = pdfs[0];
kd->Pdfs[1] = pdfs[1];
// This depends on the chosen data layout.
// The structure of the loop should resemble the same "execution layout"
// as in the kernel!
-#ifdef _OPENMP
- #pragma omp parallel for collapse(3)
-#endif
- for (int bZ = 0; bZ < gZ; bZ += blk[2]) {
- for (int bY = 0; bY < gY; bY += blk[1]) {
- for (int bX = 0; bX < gX; bX += blk[0]) {
+ if (blocking) {
+ int nX = ld->Dims[0];
+ int nY = ld->Dims[1];
+ int nZ = ld->Dims[2];
- // Must do everything here, else it would break collapse.
- int eZ = MIN(bZ + blk[2], gZ);
- int eY = MIN(bY + blk[1], gY);
- int eX = MIN(bX + blk[0], gX);
+ int nThreads = 1;
- for (int z = bZ; z < eZ; ++z) {
- for (int y = bY; y < eY; ++y) {
- for (int x = bX; x < eX; ++x) {
+ #ifdef _OPENMP
+ nThreads = omp_get_max_threads();
+ #endif
- for (int d = 0; d < N_D3Q19; ++d) {
- pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = 1.0;
- pdfs[1][P_INDEX_5(gDims, x, y, z, d)] = 1.0;
- }
+ #ifdef _OPENMP
+ #pragma omp parallel for default(none) \
+ shared(pdfs, gDims, nX, nY, nZ, oX, oY, oZ, blk, nThreads)
+ #endif
+ for (int i = 0; i < nThreads; ++i) {
+
+ int threadStartX = nX / nThreads * i;
+ int threadEndX = nX / nThreads * (i + 1);
+ if (nX % nThreads > 0) {
+ if (nX % nThreads > i) {
+ threadStartX += i;
+ threadEndX += i + 1;
+ }
+ else {
+ threadStartX += nX % nThreads;
+ threadEndX += nX % nThreads;
+ }
+ }
+
+ for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) {
+ for (int bY = oY; bY < nY + oY; bY += blk[1]) {
+ for (int bZ = oZ; bZ < nZ + oZ; bZ += blk[2]) {
+
+ int eZ = MIN(bZ + blk[2], nZ + oZ);
+ int eY = MIN(bY + blk[1], nY + oY);
+ int eX = MIN(bX + blk[0], threadEndX + oX);
+
+ for (int x = bX; x < eX; ++x) {
+ for (int y = bY; y < eY; ++y) {
+ for (int z = bZ; z < eZ; ++z) {
+ for (int d = 0; d < N_D3Q19; ++d) {
+ pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = 1.0;
+ pdfs[1][P_INDEX_5(gDims, x, y, z, d)] = 1.0;
+ }
+ }
+ }
}
}
}
}
}
+
+ }
+ else {
+ #ifdef _OPENMP
+ #pragma omp parallel for collapse(3)
+ #endif
+ for (int x = 0; x < gX; ++x) {
+ for (int y = 0; y < gY; ++y) {
+ for (int z = 0; z < gZ; ++z) {
+ for (int d = 0; d < N_D3Q19; ++d) {
+ pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = 1.0;
+ pdfs[1][P_INDEX_5(gDims, x, y, z, d)] = 1.0;
+ }
+ }
+ }
+ }
}
// Initialize all PDFs to some standard value.
- for (int z = 0; z < gZ; ++z) {
+ for (int x = 0; x < gX; ++x) {
for (int y = 0; y < gY; ++y) {
- for (int x = 0; x < gX; ++x) {
+ for (int z = 0; z < gZ; ++z) {
for (int d = 0; d < N_D3Q19; ++d) {
pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = 0.0;
pdfs[1][P_INDEX_5(gDims, x, y, z, d)] = 0.0;
// TODO: apply blocking?
- for (int z = 0; z < lZ; ++z) {
+ for (int x = 0; x < lX; ++x) {
for (int y = 0; y < lY; ++y) {
- for (int x = 0; x < lX; ++x) {
+ for (int z = 0; z < lZ; ++z) {
if (ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) {
for (int d = 0; d < N_D3Q19; ++d) {
int srcIndex;
int dstIndex;
- for (int z = 0; z < lZ; ++z) {
+ for (int x = 0; x < lX; ++x) {
for (int y = 0; y < lY; ++y) {
- for (int x = 0; x < lX; ++x) {
+ for (int z = 0; z < lZ; ++z) {
if (ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) {
for (int d = 0; d < N_D3Q19; ++d) {
kd->BoundaryConditionsGetPdf = FNAME(BcGetPdf);
kd->BoundaryConditionsSetPdf = FNAME(BcSetPdf);
- kd->Kernel = FNAME(D3Q19BlkKernel);
+ if (blocking) {
+ kd->Kernel = FNAME(D3Q19BlkKernel);
+ }
+ else {
+ kd->Kernel = FNAME(D3Q19Kernel);
+ }
kd->DstPdfs = NULL;
kd->PdfsActive = kd->Pdfs[0];
return;
}
+
+void FNAME(D3Q19BlkInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
+{
+ D3Q19InternalInit(1, ld, kernelData, params);
+}
+
+
void FNAME(D3Q19BlkDeinit)(LatticeDesc * ld, KernelData ** kernelData)
{
MemFree((void **) & ((*kernelData)->Pdfs[0]));
void FNAME(D3Q19Init)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
{
- Parameters p;
-
- if (params->nKernelArgs != 0) {
- printf("ERROR: unknown kernel parameter.\n");
- printf("This kernels accepts no parameters.\n");
- exit(1);
- }
-
- // Setup an empty parameters structure.
- p.nArgs = params->nArgs;
- p.Args = params->Args;
- p.nKernelArgs = 0;
- p.KernelArgs = NULL;
-
- // Call init routine for blocking kernel and override the
- // kernel function to be called later on.
- FNAME(D3Q19BlkInit)(ld, kernelData, &p);
-
- (*kernelData)->Kernel = FNAME(D3Q19Kernel);
-
- return;
+ D3Q19InternalInit(0, ld, kernelData, params);
}
#endif
#ifdef DATA_LAYOUT_SOA
- return d * dims[0] * dims[1] * dims[2] + z * dims[0] * dims[1] + y * dims[0] + x;
+ return d * dims[0] * dims[1] * dims[2] + x * dims[1] * dims[2] + y * dims[2] + z;
#elif DATA_LAYOUT_AOS
- return z * dims[0] * dims[1] * N_D3Q19 + y * dims[0] * N_D3Q19 + x * N_D3Q19 + d;
+ return x * dims[1] * dims[2] * N_D3Q19 + y * dims[2] * N_D3Q19 + z * N_D3Q19 + d;
#else
#error P_INDEX_5 function no implemented for chosen data layout.
#endif
#include "Memory.h"
#include "Vtk.h"
+#include "LikwidIf.h"
#include <inttypes.h>
#include <math.h>
for(int iter = 0; iter < maxIterations; ++iter) {
+ X_LIKWID_START("list-os");
#ifdef _OPENMP
#pragma omp parallel for default(none) \
#undef I
} // loop over fluid nodes
+ X_LIKWID_STOP("list-os");
+
#ifdef VERIFICATION
kd->PdfsActive = dst;
KernelAddBodyForce(kd, ld, cd);
#include "Memory.h"
#include "Vtk.h"
+#include "Padding.h"
#include <math.h>
{
printf("Kernel parameters:\n");
printf(" [-blk <n>] [-blk-[xyz] <n>]\n");
-
+#ifdef DATA_LAYOUT_SOA
+ printf(" [-pad auto|modulus_1+offset_1(,modulus_n+offset_n)*]\n");
+#endif
return;
}
-static void ParseParameters(Parameters * params, int * blk)
+static void ParseParameters(Parameters * params, int * blk, PadInfo ** padInfo)
{
Assert(blk != NULL);
blk[0] = 0; blk[1] = 0; blk[2] = 0;
+ *padInfo = NULL;
#define ARG_IS(param) (!strcmp(params->KernelArgs[i], param))
#define NEXT_ARG_PRESENT() \
int tmp = strtol(params->KernelArgs[++i], NULL, 0);
- if (tmp <= 0) {
- printf("ERROR: blocking parameter must be > 0.\n");
+ if (tmp < 0) {
+ printf("ERROR: blocking parameter must be >= 0.\n");
exit(1);
}
int tmp = strtol(params->KernelArgs[++i], NULL, 0);
- if (tmp <= 0) {
- printf("ERROR: blocking parameter must be > 0.\n");
+ if (tmp < 0) {
+ printf("ERROR: blocking parameter must be >= 0.\n");
exit(1);
}
int tmp = strtol(params->KernelArgs[++i], NULL, 0);
- if (tmp <= 0) {
- printf("ERROR: blocking parameter must be > 0.\n");
+ if (tmp < 0) {
+ printf("ERROR: blocking parameter must be >= 0.\n");
exit(1);
}
int tmp = strtol(params->KernelArgs[++i], NULL, 0);
- if (tmp <= 0) {
- printf("ERROR: blocking parameter must be > 0.\n");
+ if (tmp < 0) {
+ printf("ERROR: blocking parameter must be >= 0.\n");
exit(1);
}
blk[2] = tmp;
}
+#ifdef DATA_LAYOUT_SOA
+ else if (ARG_IS("-pad") || ARG_IS("--pad")) {
+ NEXT_ARG_PRESENT();
+
+ *padInfo = PadInfoFromStr(params->KernelArgs[++i]);
+ }
+#endif
else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) {
ParameterUsage();
exit(1);
kdl->nFluid = -1;
#endif
+ int blk[3] = { 0 };
+ PadInfo * padInfo = NULL;
+
+ ParseParameters(params, blk, &padInfo);
+
// Ajust the dimensions according to padding, if used.
kd->Dims[0] = kd->GlobalDims[0] = ld->Dims[0];
kd->Dims[1] = kd->GlobalDims[1] = ld->Dims[1];
int lZ = lDims[2];
int nTotalCells = lX * lY * lZ;
- int nCells = ld->nFluid; // TODO: + padding
+ int nCells = ld->nFluid;
int nFluid = ld->nFluid;
+#ifdef DATA_LAYOUT_SOA
+ {
+ nCells = PadCellsAndReport(nCells, sizeof(PdfT), &padInfo);
+ PadInfoFree(padInfo); padInfo = NULL;
+ }
+#endif
+
kdl->nCells = nCells;
kdl->nFluid = nFluid;
PdfT * pdfs[2];
- int blk[3] = { 0 };
-
- ParseParameters(params, blk);
-
if (blk[0] == 0) blk[0] = lX;
if (blk[1] == 0) blk[1] = lY;
if (blk[2] == 0) blk[2] = lZ;
// Blocking is implemented via setup of the adjacency list. The kernel later will
// walk through the lattice blocked automatically.
- for (int bZ = 0; bZ < lZ; bZ += blk[2]) {
- for (int bY = 0; bY < lY; bY += blk[1]) {
for (int bX = 0; bX < lX; bX += blk[0]) {
+ for (int bY = 0; bY < lY; bY += blk[1]) {
+ for (int bZ = 0; bZ < lZ; bZ += blk[2]) {
int eX = MIN(bX + blk[0], lX);
int eY = MIN(bY + blk[1], lY);
int eZ = MIN(bZ + blk[2], lZ);
-
- for (int z = bZ; z < eZ; ++z) {
- for (int y = bY; y < eY; ++y) {
for (int x = bX; x < eX; ++x) {
+ for (int y = bY; y < eY; ++y) {
+ for (int z = bZ; z < eZ; ++z) {
latticeIndex = L_INDEX_4(lDims, x, y, z);
// Loop over all fluid nodes and compute the indices to the neighboring
// PDFs for configure data layout (AoS/SoA).
- // TODO: Parallelized loop to ensure correct NUMA placement.
+ #ifdef _OPENMP
+ #pragma omp parallel for
+ #endif
+ for (int index = 0; index < nFluid; ++index) {
+ for (int d = 0; d < N_D3Q19_IDX; ++d) {
+ adjList[index * N_D3Q19_IDX + d] = -1;
+ }
+ }
+
// #ifdef _OPENMP --> add line continuation
// #pragma omp parallel for default(none)
// shared(nFluid, nCells, coords, D3Q19_INV, D3Q19_X, D3Q19_Y, D3Q19_Z,
//voddPart = vomegaOdd * (VONE_HALF * (vpdf_TS - vpdf_BN) - vui * vw_2_x3);
voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TS,vpdf_BN)),VMUL(vui,vw_2_x3)));
//src[ADJ_LIST(D3Q19_TS)] =[UA] vpdf_TS - vevenPart - voddPart;
- VSTU(ppdf_TN, VSUB(VSUB(vpdf_TS,vevenPart),voddPart));
+ VSTU(ppdf_BN, VSUB(VSUB(vpdf_TS,vevenPart),voddPart));
//src[ADJ_LIST(D3Q19_BN)] =[UA] vpdf_BN - vevenPart + voddPart;
- VSTU(ppdf_BN, VADD(VSUB(vpdf_BN,vevenPart),voddPart));
+ VSTU(ppdf_TS, VADD(VSUB(vpdf_BN,vevenPart),voddPart));
//vui = vuy + vuz;
vui = VADD(vuy,vuz);
ui = uy;
evenPart = omegaEven * (0.5 * (pdf_N + pdf_S) - ui * ui * w_1_nine_half - w_1_indep);
oddPart = omegaOdd * (0.5 * (pdf_N - pdf_S) - ui * w_1_x3);
- src[ADJ_LIST(D3Q19_N)] = pdf_N - evenPart - oddPart;
- src[ADJ_LIST(D3Q19_S)] = pdf_S - evenPart + oddPart;
+ *ppdf_S = pdf_N - evenPart - oddPart;
+ *ppdf_N = pdf_S - evenPart + oddPart;
ui = ux;
evenPart = omegaEven * (0.5 * (pdf_E + pdf_W) - ui * ui * w_1_nine_half - w_1_indep);
oddPart = omegaOdd * (0.5 * (pdf_E - pdf_W) - ui * w_1_x3);
- src[ADJ_LIST(D3Q19_E)] = pdf_E - evenPart - oddPart;
- src[ADJ_LIST(D3Q19_W)] = pdf_W - evenPart + oddPart;
+ *ppdf_W = pdf_E - evenPart - oddPart;
+ *ppdf_E = pdf_W - evenPart + oddPart;
ui = uz;
evenPart = omegaEven * (0.5 * (pdf_T + pdf_B) - ui * ui * w_1_nine_half - w_1_indep);
oddPart = omegaOdd * (0.5 * (pdf_T - pdf_B) - ui * w_1_x3);
- src[ADJ_LIST(D3Q19_T)] = pdf_T - evenPart - oddPart;
- src[ADJ_LIST(D3Q19_B)] = pdf_B - evenPart + oddPart;
+ *ppdf_B = pdf_T - evenPart - oddPart;
+ *ppdf_T = pdf_B - evenPart + oddPart;
// direction: w_2
w_2_indep = w_2 * dir_indep_trm;
ui = -ux + uy;
evenPart = omegaEven * (0.5 * (pdf_NW + pdf_SE) - ui * ui * w_2_nine_half - w_2_indep);
oddPart = omegaOdd * (0.5 * (pdf_NW - pdf_SE) - ui * w_2_x3);
- src[ADJ_LIST(D3Q19_NW)] = pdf_NW - evenPart - oddPart;
- src[ADJ_LIST(D3Q19_SE)] = pdf_SE - evenPart + oddPart;
+ *ppdf_SE = pdf_NW - evenPart - oddPart;
+ *ppdf_NW = pdf_SE - evenPart + oddPart;
ui = ux + uy;
evenPart = omegaEven * (0.5 * (pdf_NE + pdf_SW) - ui * ui * w_2_nine_half - w_2_indep);
oddPart = omegaOdd * (0.5 * (pdf_NE - pdf_SW) - ui * w_2_x3);
- src[ADJ_LIST(D3Q19_NE)] = pdf_NE - evenPart - oddPart;
- src[ADJ_LIST(D3Q19_SW)] = pdf_SW - evenPart + oddPart;
+ *ppdf_SW = pdf_NE - evenPart - oddPart;
+ *ppdf_NE = pdf_SW - evenPart + oddPart;
ui = -ux + uz;
evenPart = omegaEven * (0.5 * (pdf_TW + pdf_BE) - ui * ui * w_2_nine_half - w_2_indep);
oddPart = omegaOdd * (0.5 * (pdf_TW - pdf_BE) - ui * w_2_x3);
- src[ADJ_LIST(D3Q19_TW)] = pdf_TW - evenPart - oddPart;
- src[ADJ_LIST(D3Q19_BE)] = pdf_BE - evenPart + oddPart;
+ *ppdf_BE = pdf_TW - evenPart - oddPart;
+ *ppdf_TW = pdf_BE - evenPart + oddPart;
ui = ux + uz;
evenPart = omegaEven * (0.5 * (pdf_TE + pdf_BW) - ui * ui * w_2_nine_half - w_2_indep);
oddPart = omegaOdd * (0.5 * (pdf_TE - pdf_BW) - ui * w_2_x3);
- src[ADJ_LIST(D3Q19_TE)] = pdf_TE - evenPart - oddPart;
- src[ADJ_LIST(D3Q19_BW)] = pdf_BW - evenPart + oddPart;
+ *ppdf_BW = pdf_TE - evenPart - oddPart;
+ *ppdf_TE = pdf_BW - evenPart + oddPart;
ui = -uy + uz;
evenPart = omegaEven * (0.5 * (pdf_TS + pdf_BN) - ui * ui * w_2_nine_half - w_2_indep);
oddPart = omegaOdd * (0.5 * (pdf_TS - pdf_BN) - ui * w_2_x3);
- src[ADJ_LIST(D3Q19_TS)] = pdf_TS - evenPart - oddPart;
- src[ADJ_LIST(D3Q19_BN)] = pdf_BN - evenPart + oddPart;
+ *ppdf_BN = pdf_TS - evenPart - oddPart;
+ *ppdf_TS = pdf_BN - evenPart + oddPart;
ui = uy + uz;
evenPart = omegaEven * (0.5 * (pdf_TN + pdf_BS) - ui * ui * w_2_nine_half - w_2_indep);
oddPart = omegaOdd * (0.5 * (pdf_TN - pdf_BS) - ui * w_2_x3);
- src[ADJ_LIST(D3Q19_TN)] = pdf_TN - evenPart - oddPart;
- src[ADJ_LIST(D3Q19_BS)] = pdf_BS - evenPart + oddPart;
+ *ppdf_BS = pdf_TN - evenPart - oddPart;
+ *ppdf_TN = pdf_BS - evenPart + oddPart;
pointerOffset = 1;
}
#include "Memory.h"
#include "Vtk.h"
+#include "Padding.h"
#include <math.h>
{
printf("Kernel parameters:\n");
printf(" [-blk <n>] [-blk-[xyz] <n>]\n");
+#ifdef DATA_LAYOUT_SOA
+ printf(" [-pad auto|modulus_1+offset_1(,modulus_n+offset_n)*]\n");
+#endif
return;
}
-static void ParseParameters(Parameters * params, int * blk)
+static void ParseParameters(Parameters * params, int * blk, PadInfo ** padInfo)
{
Assert(blk != NULL);
blk[0] = 0; blk[1] = 0; blk[2] = 0;
+ *padInfo = NULL;
#define ARG_IS(param) (!strcmp(params->KernelArgs[i], param))
#define NEXT_ARG_PRESENT() \
int tmp = strtol(params->KernelArgs[++i], NULL, 0);
- if (tmp <= 0) {
- printf("ERROR: blocking parameter must be > 0.\n");
+ if (tmp < 0) {
+ printf("ERROR: blocking parameter must be >= 0.\n");
exit(1);
}
int tmp = strtol(params->KernelArgs[++i], NULL, 0);
- if (tmp <= 0) {
- printf("ERROR: blocking parameter must be > 0.\n");
+ if (tmp < 0) {
+ printf("ERROR: blocking parameter must be >= 0.\n");
exit(1);
}
int tmp = strtol(params->KernelArgs[++i], NULL, 0);
- if (tmp <= 0) {
- printf("ERROR: blocking parameter must be > 0.\n");
+ if (tmp < 0) {
+ printf("ERROR: blocking parameter must be >= 0.\n");
exit(1);
}
int tmp = strtol(params->KernelArgs[++i], NULL, 0);
- if (tmp <= 0) {
- printf("ERROR: blocking parameter must be > 0.\n");
+ if (tmp < 0) {
+ printf("ERROR: blocking parameter must be >= 0.\n");
exit(1);
}
blk[2] = tmp;
}
+#ifdef DATA_LAYOUT_SOA
+ else if (ARG_IS("-pad") || ARG_IS("--pad")) {
+ NEXT_ARG_PRESENT();
+
+ *padInfo = PadInfoFromStr(params->KernelArgs[++i]);
+ }
+#endif
else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) {
ParameterUsage();
exit(1);
kdlr->nConsecThreadIndices = 0;
#endif
+ int blk[3] = { 0 };
+ PadInfo * padInfo = NULL;
+
+ ParseParameters(params, blk, &padInfo);
+
// Ajust the dimensions according to padding, if used.
kd->Dims[0] = kd->GlobalDims[0] = ld->Dims[0];
kd->Dims[1] = kd->GlobalDims[1] = ld->Dims[1];
int lZ = lDims[2];
int nTotalCells = lX * lY * lZ;
- int nCells = ld->nFluid; // TODO: + padding
+ int nCells = ld->nFluid;
int nFluid = ld->nFluid;
+#ifdef DATA_LAYOUT_SOA
+ {
+ nCells = PadCellsAndReport(nCells, sizeof(PdfT), &padInfo);
+ PadInfoFree(padInfo); padInfo = NULL;
+ }
+#endif
+
kdl->nCells = nCells;
kdl->nFluid = nFluid;
PdfT * pdfs[2];
- int blk[3] = { 0 };
-
- ParseParameters(params, blk);
if (blk[0] == 0) blk[0] = lX;
if (blk[1] == 0) blk[1] = lY;
// Blocking is implemented via setup of the adjacency list. The kernel later will
// walk through the lattice blocked automatically.
- for (int bZ = 0; bZ < lZ; bZ += blk[2]) {
- for (int bY = 0; bY < lY; bY += blk[1]) {
for (int bX = 0; bX < lX; bX += blk[0]) {
+ for (int bY = 0; bY < lY; bY += blk[1]) {
+ for (int bZ = 0; bZ < lZ; bZ += blk[2]) {
int eX = MIN(bX + blk[0], lX);
int eY = MIN(bY + blk[1], lY);
int eZ = MIN(bZ + blk[2], lZ);
- for (int z = bZ; z < eZ; ++z) {
- for (int y = bY; y < eY; ++y) {
for (int x = bX; x < eX; ++x) {
+ for (int y = bY; y < eY; ++y) {
+ for (int z = bZ; z < eZ; ++z) {
latticeIndex = L_INDEX_4(lDims, x, y, z);
// Loop over all fluid nodes and compute the indices to the neighboring
// PDFs for configured data layout (AoS/SoA).
- // TODO: Parallelized loop to ensure correct NUMA placement.
+ #ifdef _OPENMP
+ #pragma omp parallel for
+ #endif
+ for (int index = 0; index < nFluid; ++index) {
+ for (int d = 0; d < N_D3Q19_IDX; ++d) {
+ adjList[index * N_D3Q19_IDX + d] = -1;
+ }
+ }
+
// #ifdef _OPENMP --> add line continuation
// #pragma omp parallel for default(none)
// shared(nFluid, nCells, coords, D3Q19_INV, D3Q19_X, D3Q19_Y, D3Q19_Z,
#include "Memory.h"
#include "Vtk.h"
+#include "Padding.h"
#include <math.h>
{
printf("Kernel parameters:\n");
printf(" [-blk <n>] [-blk-[xyz] <n>]\n");
+#ifdef DATA_LAYOUT_SOA
+ printf(" [-pad auto|modulus_1+offset_1(,modulus_n+offset_n)*]\n");
+#endif
return;
}
-static void ParseParameters(Parameters * params, int * blk)
+static void ParseParameters(Parameters * params, int * blk, PadInfo ** padInfo)
{
Assert(blk != NULL);
blk[0] = 0; blk[1] = 0; blk[2] = 0;
+ *padInfo = NULL;
#define ARG_IS(param) (!strcmp(params->KernelArgs[i], param))
#define NEXT_ARG_PRESENT() \
int tmp = strtol(params->KernelArgs[++i], NULL, 0);
- if (tmp <= 0) {
- printf("ERROR: blocking parameter must be > 0.\n");
+ if (tmp < 0) {
+ printf("ERROR: blocking parameter must be >= 0.\n");
exit(1);
}
int tmp = strtol(params->KernelArgs[++i], NULL, 0);
- if (tmp <= 0) {
- printf("ERROR: blocking parameter must be > 0.\n");
+ if (tmp < 0) {
+ printf("ERROR: blocking parameter must be >= 0.\n");
exit(1);
}
int tmp = strtol(params->KernelArgs[++i], NULL, 0);
- if (tmp <= 0) {
- printf("ERROR: blocking parameter must be > 0.\n");
+ if (tmp < 0) {
+ printf("ERROR: blocking parameter must be >= 0.\n");
exit(1);
}
int tmp = strtol(params->KernelArgs[++i], NULL, 0);
- if (tmp <= 0) {
- printf("ERROR: blocking parameter must be > 0.\n");
+ if (tmp < 0) {
+ printf("ERROR: blocking parameter must be >= 0.\n");
exit(1);
}
blk[2] = tmp;
}
+#ifdef DATA_LAYOUT_SOA
+ else if (ARG_IS("-pad") || ARG_IS("--pad")) {
+ NEXT_ARG_PRESENT();
+
+ *padInfo = PadInfoFromStr(params->KernelArgs[++i]);
+ }
+#endif
else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) {
ParameterUsage();
exit(1);
kdlr->nConsecThreadIndices = 0;
#endif
+ int blk[3] = { 0 };
+ PadInfo * padInfo = NULL;
+
+ ParseParameters(params, blk, &padInfo);
+
// Ajust the dimensions according to padding, if used.
kd->Dims[0] = kd->GlobalDims[0] = ld->Dims[0];
kd->Dims[1] = kd->GlobalDims[1] = ld->Dims[1];
int nCells = ld->nFluid; // TODO: + padding
int nFluid = ld->nFluid;
+#ifdef DATA_LAYOUT_SOA
+ {
+ nCells = PadCellsAndReport(nCells, sizeof(PdfT), &padInfo);
+ PadInfoFree(padInfo); padInfo = NULL;
+ }
+#endif
+
kdl->nCells = nCells;
kdl->nFluid = nFluid;
PdfT * pdfs[2];
- int blk[3] = { 0 };
-
- ParseParameters(params, blk);
if (blk[0] == 0) blk[0] = lX;
if (blk[1] == 0) blk[1] = lY;
// Blocking is implemented via setup of the adjacency list. The kernel later will
// walk through the lattice blocked automatically.
- for (int bZ = 0; bZ < lZ; bZ += blk[2]) {
- for (int bY = 0; bY < lY; bY += blk[1]) {
for (int bX = 0; bX < lX; bX += blk[0]) {
+ for (int bY = 0; bY < lY; bY += blk[1]) {
+ for (int bZ = 0; bZ < lZ; bZ += blk[2]) {
int eX = MIN(bX + blk[0], lX);
int eY = MIN(bY + blk[1], lY);
int eZ = MIN(bZ + blk[2], lZ);
-
- for (int z = bZ; z < eZ; ++z) {
- for (int y = bY; y < eY; ++y) {
for (int x = bX; x < eX; ++x) {
+ for (int y = bY; y < eY; ++y) {
+ for (int z = bZ; z < eZ; ++z) {
latticeIndex = L_INDEX_4(lDims, x, y, z);
// Loop over all fluid nodes and compute the indices to the neighboring
// PDFs for configured data layout (AoS/SoA).
- // TODO: Parallelized loop to ensure correct NUMA placement.
+ #ifdef _OPENMP
+ #pragma omp parallel for
+ #endif
+ for (int index = 0; index < nFluid; ++index) {
+ for (int d = 0; d < N_D3Q19_IDX; ++d) {
+ adjList[index * N_D3Q19_IDX + d] = -1;
+ }
+ }
+
// #ifdef _OPENMP --> add line continuation
// #pragma omp parallel for default(none)
// shared(nFluid, nCells, coords, D3Q19_INV, D3Q19_X, D3Q19_Y, D3Q19_Z,
#include "Memory.h"
#include "Vtk.h"
+#include "Padding.h"
#include <math.h>
{
printf("Kernel parameters:\n");
printf(" [-blk <n>] [-blk-[xyz] <n>]\n");
+#ifdef DATA_LAYOUT_SOA
+ printf(" [-pad auto|modulus_1+offset_1(,modulus_n+offset_n)*]\n");
+#endif
return;
}
-static void ParseParameters(Parameters * params, int * blk)
+static void ParseParameters(Parameters * params, int * blk, PadInfo ** padInfo)
{
Assert(blk != NULL);
blk[0] = 0; blk[1] = 0; blk[2] = 0;
+ *padInfo = NULL;
#define ARG_IS(param) (!strcmp(params->KernelArgs[i], param))
#define NEXT_ARG_PRESENT() \
int tmp = strtol(params->KernelArgs[++i], NULL, 0);
- if (tmp <= 0) {
- printf("ERROR: blocking parameter must be > 0.\n");
+ if (tmp < 0) {
+ printf("ERROR: blocking parameter must be >= 0.\n");
exit(1);
}
int tmp = strtol(params->KernelArgs[++i], NULL, 0);
- if (tmp <= 0) {
- printf("ERROR: blocking parameter must be > 0.\n");
+ if (tmp < 0) {
+ printf("ERROR: blocking parameter must be >= 0.\n");
exit(1);
}
int tmp = strtol(params->KernelArgs[++i], NULL, 0);
- if (tmp <= 0) {
- printf("ERROR: blocking parameter must be > 0.\n");
+ if (tmp < 0) {
+ printf("ERROR: blocking parameter must be >= 0.\n");
exit(1);
}
int tmp = strtol(params->KernelArgs[++i], NULL, 0);
- if (tmp <= 0) {
- printf("ERROR: blocking parameter must be > 0.\n");
+ if (tmp < 0) {
+ printf("ERROR: blocking parameter must be >= 0.\n");
exit(1);
}
ParameterUsage();
exit(1);
}
+#ifdef DATA_LAYOUT_SOA
+ else if (ARG_IS("-pad") || ARG_IS("--pad")) {
+ NEXT_ARG_PRESENT();
+
+ *padInfo = PadInfoFromStr(params->KernelArgs[++i]);
+ }
+#endif
else {
printf("ERROR: unknown kernel parameter.\n");
ParameterUsage();
kdl->nFluid = -1;
#endif
+ int blk[3] = { 0 };
+ PadInfo * padInfo = NULL;
+
+ ParseParameters(params, blk, &padInfo);
+
+
// Ajust the dimensions according to padding, if used.
kd->Dims[0] = kd->GlobalDims[0] = ld->Dims[0];
kd->Dims[1] = kd->GlobalDims[1] = ld->Dims[1];
int lZ = lDims[2];
int nTotalCells = lX * lY * lZ;
- int nCells = ld->nFluid; // TODO: + padding
+ int nCells = ld->nFluid;
int nFluid = ld->nFluid;
+#ifdef DATA_LAYOUT_SOA
+ {
+ nCells = PadCellsAndReport(nCells, sizeof(PdfT), &padInfo);
+ PadInfoFree(padInfo); padInfo = NULL;
+ }
+#endif
+
kdl->nCells = nCells;
kdl->nFluid = nFluid;
PdfT * pdfs[2];
- int blk[3] = { 0 };
-
- ParseParameters(params, blk);
-
if (blk[0] == 0) blk[0] = lX;
if (blk[1] == 0) blk[1] = lY;
if (blk[2] == 0) blk[2] = lZ;
// Blocking is implemented via setup of the adjacency list. The kernel later will
// walk through the lattice blocked automatically.
- for (int bZ = 0; bZ < lZ; bZ += blk[2]) {
- for (int bY = 0; bY < lY; bY += blk[1]) {
for (int bX = 0; bX < lX; bX += blk[0]) {
+ for (int bY = 0; bY < lY; bY += blk[1]) {
+ for (int bZ = 0; bZ < lZ; bZ += blk[2]) {
int eX = MIN(bX + blk[0], lX);
int eY = MIN(bY + blk[1], lY);
int eZ = MIN(bZ + blk[2], lZ);
-
- for (int z = bZ; z < eZ; ++z) {
- for (int y = bY; y < eY; ++y) {
for (int x = bX; x < eX; ++x) {
+ for (int y = bY; y < eY; ++y) {
+ for (int z = bZ; z < eZ; ++z) {
latticeIndex = L_INDEX_4(lDims, x, y, z);
// Loop over all fluid nodes and compute the indices to the neighboring
// PDFs for configured data layout (AoS/SoA).
- // TODO: Parallelized loop to ensure correct NUMA placement.
+ #ifdef _OPENMP
+ #pragma omp parallel for
+ #endif
+ for (int index = 0; index < nFluid; ++index) {
+ for (int d = 0; d < N_D3Q19_IDX; ++d) {
+ adjList[index * N_D3Q19_IDX + d] = -1;
+ }
+ }
+
// #ifdef _OPENMP --> add line continuation
// #pragma omp parallel for default(none)
// shared(nFluid, nCells, coords, D3Q19_INV, D3Q19_X, D3Q19_Y, D3Q19_Z,
#include "Memory.h"
#include "Vtk.h"
#include "Vector.h"
+#include "LikwidIf.h"
#include <inttypes.h>
#include <math.h>
KernelStatistics(kd, ld, cd, 0);
#endif
+
+ X_LIKWID_START("list-pull-split-nt-1s");
#ifdef _OPENMP
#pragma omp parallel default(none) \
shared(nFluid, nCells, kd, kdl, adjList, src, dst, \
for(int iter = 0; iter < maxIterations; ++iter) {
+
#if 1
#define INDEX_START blIndexStart
#define INDEX_STOP blIndexVec
#define INDEX_STOP blIndexStop
#include "BenchKernelD3Q19ListPullSplitNt1SScalar.h"
#endif
+
+
#pragma omp barrier
#pragma omp single
MemFree((void **)&tmpArray);
}
+
+ X_LIKWID_STOP("list-pull-split-nt-1s");
+
#ifdef VTK_OUTPUT
if (cd->VtkOutput) {
kd->PdfsActive = src;
KernelStatistics(kd, ld, cd, 0);
#endif
+
+ X_LIKWID_START("list-pull-split-nt-2s");
+
+
#ifdef _OPENMP
#pragma omp parallel default(none) \
shared(nFluid, nCells, kd, kdl, adjList, src, dst, \
#endif
#pragma omp barrier
+
#pragma omp single
{
#ifdef VERIFICATION
MemFree((void **)&tmpArray);
}
+ X_LIKWID_STOP("list-pull-split-nt-2s");
+
#ifdef VTK_OUTPUT
if (cd->VtkOutput) {
kd->PdfsActive = src;
#include "Memory.h"
#include "Vtk.h"
#include "Vector.h"
+#include "Padding.h"
+
#include <math.h>
{
printf("Kernel parameters:\n");
printf(" [-blk <n>] [-blk-[xyz] <n>] [-n-tmp-array <n>]\n");
+ printf(" [-pad auto|modulus_1+offset_1(,modulus_n+offset_n)*]\n");
return;
}
-static void ParseParameters(Parameters * params, int * blk, int * nTmpArray)
+static void ParseParameters(Parameters * params, int * blk, int * nTmpArray, PadInfo ** padInfo)
{
Assert(blk != NULL);
blk[0] = 0; blk[1] = 0; blk[2] = 0;
*nTmpArray = 152;
+ *padInfo = NULL;
#define ARG_IS(param) (!strcmp(params->KernelArgs[i], param))
#define NEXT_ARG_PRESENT() \
int tmp = strtol(params->KernelArgs[++i], NULL, 0);
- if (tmp <= 0) {
- printf("ERROR: blocking parameter must be > 0.\n");
+ if (tmp < 0) {
+ printf("ERROR: blocking parameter must be >= 0.\n");
exit(1);
}
int tmp = strtol(params->KernelArgs[++i], NULL, 0);
- if (tmp <= 0) {
- printf("ERROR: blocking parameter must be > 0.\n");
+ if (tmp < 0) {
+ printf("ERROR: blocking parameter must be >= 0.\n");
exit(1);
}
int tmp = strtol(params->KernelArgs[++i], NULL, 0);
- if (tmp <= 0) {
- printf("ERROR: blocking parameter must be > 0.\n");
+ if (tmp < 0) {
+ printf("ERROR: blocking parameter must be >= 0.\n");
exit(1);
}
int tmp = strtol(params->KernelArgs[++i], NULL, 0);
- if (tmp <= 0) {
- printf("ERROR: blocking parameter must be > 0.\n");
+ if (tmp < 0) {
+ printf("ERROR: blocking parameter must be >= 0.\n");
exit(1);
}
*nTmpArray = tmp;
}
+ else if (ARG_IS("-pad") || ARG_IS("--pad")) {
+ NEXT_ARG_PRESENT();
+
+ *padInfo = PadInfoFromStr(params->KernelArgs[++i]);
+ }
else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) {
ParameterUsage();
exit(1);
kdlr->nConsecThreadIndices = 0;
#endif
+ int blk[3] = { 0 };
+ PadInfo * padInfo = NULL;
+
+ ParseParameters(params, blk, &kdlr->nTmpArray, &padInfo);
+
// Ajust the dimensions according to padding, if used.
kd->Dims[0] = kd->GlobalDims[0] = ld->Dims[0];
kd->Dims[1] = kd->GlobalDims[1] = ld->Dims[1];
int nCells = ld->nFluid;
int nFluid = ld->nFluid;
+ {
+ nCells = PadCellsAndReport(nCells, sizeof(PdfT), &padInfo);
+ PadInfoFree(padInfo); padInfo = NULL;
+ }
+
// We padd each stream of a PDF array for a complete cache line.
- // TODO: padding for L1/L2 and TLB.
nCells = nCells + (8 - nCells % 8);
Assert(nCells % VSIZE == 0);
PdfT * pdfs[2];
- int blk[3] = { 0 };
-
- ParseParameters(params, blk, &kdlr->nTmpArray);
-
if (blk[0] == 0) blk[0] = lX;
if (blk[1] == 0) blk[1] = lY;
if (blk[2] == 0) blk[2] = lZ;
// Blocking is implemented via setup of the adjacency list. The kernel later will
// walk through the lattice blocked automatically.
- for (int bZ = 0; bZ < lZ; bZ += blk[2]) {
- for (int bY = 0; bY < lY; bY += blk[1]) {
for (int bX = 0; bX < lX; bX += blk[0]) {
+ for (int bY = 0; bY < lY; bY += blk[1]) {
+ for (int bZ = 0; bZ < lZ; bZ += blk[2]) {
int eX = MIN(bX + blk[0], lX);
int eY = MIN(bY + blk[1], lY);
int eZ = MIN(bZ + blk[2], lZ);
- for (int z = bZ; z < eZ; ++z) {
- for (int y = bY; y < eY; ++y) {
for (int x = bX; x < eX; ++x) {
+ for (int y = bY; y < eY; ++y) {
+ for (int z = bZ; z < eZ; ++z) {
latticeIndex = L_INDEX_4(lDims, x, y, z);
// Loop over all fluid nodes and compute the indices to the neighboring
// PDFs for configured data layout (AoS/SoA).
// Parallelized loop to ensure correct NUMA placement.
+ #ifdef _OPENMP
+ #pragma omp parallel for
+ #endif
+ for (int index = 0; index < nFluid; ++index) {
+ for (int d = 0; d < N_D3Q19_IDX; ++d) {
+ adjList[index * N_D3Q19_IDX + d] = -1;
+ }
+ }
+
// #ifdef _OPENMP --> add line continuation
// #pragma omp parallel for default(none)
// shared(nFluid, nCells, coords, D3Q19_INV, D3Q19_X, D3Q19_Y, D3Q19_Z,
typeDetails = &tmp;
}
+ else if (strncasecmp("fluid", geometryType, 5) == 0) {
+ type = GEO_TYPE_FLUID;
+ }
else {
printf("ERROR: unknown geometry specified.\n");
Verify(0);
Assert(type >= GEO_TYPE_MIN);
Assert(type <= GEO_TYPE_MAX);
- const char * geoTypeStr[] = { "box", "channel", "pipe", "blocks" };
+ const char * geoTypeStr[] = { "box", "channel", "pipe", "blocks", "fluid" };
printf("# geometry: %d x %d x %d nodes, type %d %s\n", dims[0], dims[1], dims[2], type, geoTypeStr[type]);
if (type == GEO_TYPE_CHANNEL || type == GEO_TYPE_BLOCKS || type == GEO_TYPE_PIPE) {
periodic[0] = 1;
}
+ else if (type == GEO_TYPE_FLUID) {
+ // Actually this geometry is non-periodic, but we need fluid at the boundary.
+ periodic[0] = 1; periodic[1] = 1; periodic[2] = 1;
+ }
// Walls or periodic on first and last x plane.
for (int z = 0; z < dims[2]; ++z) {
}
if (type == GEO_TYPE_CHANNEL) {
- periodic[0] = 1;
+ // Nothing todo here.
}
else if (type == GEO_TYPE_PIPE) {
#define SQR(a) ((a)*(a))
int blockSize = *((int *)typeDetails);
+#if 0
if (blockSize == 0) {
blockSize = 8;
}
+#endif
+ if (blockSize > 0) {
- int dimMin = dims[0];
+ int dimMin = dims[0];
- if (dims[1] < dimMin) dimMin = dims[1];
- if (dims[2] < dimMin) dimMin = dims[2];
+ if (dims[1] < dimMin) dimMin = dims[1];
+ if (dims[2] < dimMin) dimMin = dims[2];
- if (blockSize < 0 || blockSize > dimMin / 2) {
- printf("ERROR: block size for geometry must be > 0 and smaller than half of the smalest dimension.\n");
- // TODO: find a better solution for handling errors in here.
- Verify(0);
- }
+ if (blockSize < 0 || blockSize > dimMin / 2) {
+ printf("ERROR: block size for geometry must be > 0 and smaller than half of the smalest dimension.\n");
+ // TODO: find a better solution for handling errors in here.
+ Verify(0);
+ }
- // Number of blocks in x, y, and z direction.
- int nbx = blockSize, nby = blockSize, nbz = blockSize;
+ // Number of blocks in x, y, and z direction.
+ int nbx = blockSize, nby = blockSize, nbz = blockSize;
- for (int z = 0; z < dims[2]; ++z) {
- if ((z % (2 * nbz)) < nbz) continue;
+ for (int z = 0; z < dims[2]; ++z) {
+ if ((z % (2 * nbz)) < nbz) continue;
- for (int y = 0; y < dims[1]; ++y) {
- if ((y % (2 * nby)) < nby) continue;
+ for (int y = 0; y < dims[1]; ++y) {
+ if ((y % (2 * nby)) < nby) continue;
- for (int x = 0; x < dims[0]; ++x) {
+ for (int x = 0; x < dims[0]; ++x) {
- if ((x % (2 * nbx)) >= nbx) {
- lattice[L_INDEX_4(dims, x, y, z)] = LAT_CELL_OBSTACLE;
+ if ((x % (2 * nbx)) >= nbx) {
+ lattice[L_INDEX_4(dims, x, y, z)] = LAT_CELL_OBSTACLE;
+ }
}
}
}
GEO_TYPE_PIPE = 2,
GEO_TYPE_BLOCKS = 3, // Expects a pointer to an integer, holding the
// value of the block size as type detail.
- GEO_TYPE_MAX = 3
+ GEO_TYPE_FLUID = 4,
+ GEO_TYPE_MAX = 4
} GEO_TYPES;
#endif
#ifdef PROP_MODEL_AA
- #define PROP_MODEL_NAME AA
+ #define PROP_MODEL_NAME Aa
#endif
#define __KERNEL_FUNCTIONS_H__
#include "BenchKernelD3Q19.h"
+#include "BenchKernelD3Q19Aa.h"
+#include "BenchKernelD3Q19AaVec.h"
#include "BenchKernelD3Q19List.h"
#include "BenchKernelD3Q19ListAa.h"
#include "BenchKernelD3Q19ListAaRia.h"
.Init = D3Q19BlkInit_PullAoS,
.Deinit = D3Q19BlkDeinit_PullAoS
},
+ {
+ .Name = "aa-aos",
+ .Init = D3Q19AaInit_AaAoS,
+ .Deinit = D3Q19AaDeinit_AaAoS
+ },
+ {
+ .Name = "aa-soa",
+ .Init = D3Q19AaInit_AaSoA,
+ .Deinit = D3Q19AaDeinit_AaSoA
+ },
+ {
+ .Name = "aa-vec-soa",
+ .Init = D3Q19AaVecInit_AaSoA,
+ .Deinit = D3Q19AaVecDeinit_AaSoA
+ }
+
};
#endif // __KERNEL_FUNCTIONS_H__
--- /dev/null
+// --------------------------------------------------------------------------
+//
+// Copyright
+// Markus Wittmann, 2016-2017
+// RRZE, University of Erlangen-Nuremberg, Germany
+// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+// Viktor Haag, 2016
+// LSS, University of Erlangen-Nuremberg, Germany
+//
+// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+// LbmBenchKernels is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// LbmBenchKernels is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#include "Lattice.h"
+
+// Dumps the layers [zStart, zStop] of lattice as ASCII.
+// Specify zStart = -1 and zStop = -1 as begin/end of lattice.
+
+void LatDumpAscii(LatticeDesc * ld, int zStart, int zStop)
+{
+ Assert(ld != NULL);
+
+ const char strLatCellType[] = "X.IxO"; // X = Obstacle, . = Fluid, I = inlet, O = outlet
+
+ int localZStart = zStart;
+ int localZStop = zStop;
+
+ int * dims = ld->Dims;
+ LatticeT * lattice = ld->Lattice;
+
+ if (localZStart == -1) localZStart = 0;
+ if (localZStop == -1) localZStop = dims[2] - 1;
+
+ for (int z = localZStop; z >= localZStart; --z) {
+ printf("plane % 2d\n", z);
+
+ for (int y = dims[1] - 1; y >= 0; --y) {
+ printf(" %2d ", y);
+ for (int x = 0; x < dims[0]; ++x) {
+ printf("%c", strLatCellType[lattice[L_INDEX_4(dims, x, y, z)]]);
+ }
+ printf("\n");
+ }
+ }
+}
+
return z * dims[0] * dims[1] + y * dims[0] + x;
}
+void LatDumpAscii(LatticeDesc * ld, int zStart, int zStop);
#endif // __LATTICE_H__
int threadId = omp_get_thread_num();
int err;
- err = PinCurrentThreadByCpuList(pinString, 0, 0, threadId);
+ err = PinCurrentThreadByCpuList(pinString, threadId);
if (err) {
printf("ERROR [thread %d]: pinning failed.\n", threadId);
perf, nThreads, duration, cd.MaxIterations, ld.nFluid / 1e6,
geometryType, kernelToUse,
#ifdef VERIFICATION
- "# VERIFICATION"
+ "VERIFICATION"
#else
- "# benchmark"
+ "B"
#endif
);
# CONFIG ?= linux-gcc
CONFIG ?= linux-intel
-BUILD ?= debug
+BUILD ?= release
+
+BENCHMARK ?= on
# If we compile for BENCHMARK all verifcation and statistics are disabled,
# if not specified otherwise.
VERIFICATION ?= off
VTK_OUTPUT ?= off
STATISTICS ?= off
+else
+ VERIFICATION ?= on
+ STATISTICS ?= on
+ VTK_OUTPUT ?= on
endif
-VERIFICATION ?= on
-
-STATISTICS ?= on
-
-VTK_OUTPUT ?= on
COLOR ?= on
# Sources to consider.
SOURCES_C = Main.c Memory.c Geometry.c Kernel.c \
- Vtk.c Pinning.c
+ Vtk.c Pinning.c Padding.c Lattice.c
# ------------------------------------------------------------------------
# NO CHANGE BELOW SHOULD BE NEEDED
$(OBJECT_DIR)/BenchKernelD3Q19ListAaPv_PushSoA.o \
$(OBJECT_DIR)/BenchKernelD3Q19ListAaPvCommon_PushSoA.o \
$(OBJECT_DIR)/BenchKernelD3Q19ListPullSplitNt_PullSoA.o \
- $(OBJECT_DIR)/BenchKernelD3Q19ListPullSplitNtCommon_PullSoA.o
+ $(OBJECT_DIR)/BenchKernelD3Q19ListPullSplitNtCommon_PullSoA.o \
+ $(OBJECT_DIR)/BenchKernelD3Q19Aa_AaAoS.o \
+ $(OBJECT_DIR)/BenchKernelD3Q19AaCommon_AaAoS.o \
+ $(OBJECT_DIR)/BenchKernelD3Q19Aa_AaSoA.o \
+ $(OBJECT_DIR)/BenchKernelD3Q19AaCommon_AaSoA.o \
+ $(OBJECT_DIR)/BenchKernelD3Q19AaVec_AaSoA.o \
+ $(OBJECT_DIR)/BenchKernelD3Q19AaVecCommon_AaSoA.o
OBJ = $(OBJ_C)
LD_LIBS += $(LIKWID_LIB) -llikwid
endif
+# ARCH can only be assigned a string without a space. The space is escaped as
+# a comma which we have to replace here.
+
+ifdef TARCH
+ ARCH_W_COMMA := $(TARCH)
+
+ COMMA_ := ,
+ override TARCH := $(subst $(COMMA_), ,$(TARCH))
+
+ $(info $(shell echo -e "\n$(COLOR_GREEN)INFO: Automatically expanding comma in TARCH variable: TARCH=$(TARCH).$(COLOR_NO)\n"))
+
+endif
+
.phony: all clean clean-all
-$(info $(shell $(ECHO_E) "# Configuration: CONFIG=$(COLOR_CYAN)$(CONFIG)$(COLOR_NO) BUILD=$(COLOR_CYAN)$(BUILD)$(COLOR_NO) VERIFICATION=$(COLOR_CYAN)$(VERIFICATION)$(COLOR_NO) STATISTICS=$(COLOR_CYAN)$(STATISTICS)$(COLOR_NO) VTK_OUTPUT=$(COLOR_CYAN)$(VTK_OUTPUT)$(COLOR_NO) OPENMP=$(COLOR_CYAN)$(OPENMP)$(COLOR_NO) ISA=$(COLOR_CYAN)$(ISA)$(COLOR_NO) LIKWID=$(COLOR_CYAN)$(LIKWID)$(COLOR_NO) building $(.DEFAULT_GOAL)..."))
+$(info $(shell $(ECHO_E) "# Configuration: CONFIG=$(COLOR_CYAN)$(CONFIG)$(COLOR_NO) BUILD=$(COLOR_CYAN)$(BUILD)$(COLOR_NO) VERIFICATION=$(COLOR_CYAN)$(VERIFICATION)$(COLOR_NO) STATISTICS=$(COLOR_CYAN)$(STATISTICS)$(COLOR_NO) VTK_OUTPUT=$(COLOR_CYAN)$(VTK_OUTPUT)$(COLOR_NO) OPENMP=$(COLOR_CYAN)$(OPENMP)$(COLOR_NO) ISA=$(COLOR_CYAN)$(ISA)$(COLOR_NO) LIKWID=$(COLOR_CYAN)$(LIKWID)$(COLOR_NO) TARCH=$(COLOR_CYAN)$(TARCH)$(COLOR_NO) building $(.DEFAULT_GOAL)..."))
$(info # Object dir: $(OBJECT_DIR))
@$(ECHO_E) "compiling: $(COLOR_CYAN)$@$(COLOR_NO) $(COLOR_MAGENTA)DATA_LAYOUT_AOS$(COLOR_NO)"
$(CC) $(strip $(C_FLAGS)) $(strip $(PP_FLAGS)) $(D)DATA_LAYOUT_AOS -c $< -o $@
+$(OBJECT_DIR)/%_AaAoS.o: %.c $(REBUILD_DEPS)
+ @$(ECHO_E) "compiling: $(COLOR_CYAN)$@$(COLOR_NO) $(COLOR_MAGENTA)DATA_LAYOUT_AOS$(COLOR_NO) $(COLOR_MAGENTA)PROP_MODEL_AA$(COLOR_NO)"
+ $(CC) $(strip $(C_FLAGS)) $(strip $(PP_FLAGS)) $(D)DATA_LAYOUT_AOS $(D)PROP_MODEL_AA -c $< -o $@
+
+$(OBJECT_DIR)/%_AaSoA.o: %.c $(REBUILD_DEPS)
+ @$(ECHO_E) "compiling: $(COLOR_CYAN)$@$(COLOR_NO) $(COLOR_MAGENTA)DATA_LAYOUT_SOA$(COLOR_NO) $(COLOR_MAGENTA)PROP_MODEL_AA$(COLOR_NO)"
+ $(CC) $(strip $(C_FLAGS)) $(strip $(PP_FLAGS)) $(D)DATA_LAYOUT_SOA $(D)PROP_MODEL_AA -c $< -o $@
$(OBJECT_DIR)/%.o: %.c $(REBUILD_DEPS)
@$(ECHO_E) "compiling: $(COLOR_CYAN)$@$(COLOR_NO)"
// along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>.
//
// --------------------------------------------------------------------------
+// TODO: make configurable
+#define HAVE_HUGE_PAGES
+
+
+#ifdef HAVE_HUGE_PAGES
+#define _BSD_SOURCE
+#endif
+
#include <stdio.h>
#include <stdlib.h>
#include <string.h> // strerror
#include <errno.h>
+
+#ifdef HAVE_HUGE_PAGES
+#include <sys/mman.h> // madvise
+#endif
+
#include "Base.h"
#include "Memory.h"
exit(1);
}
+#ifdef HAVE_HUGE_PAGES
+
+ if (alignmentBytes % 4096 == 0) {
+ ret = madvise(*ptr, bytesToAlloc, MADV_HUGEPAGE);
+
+ if (ret != 0) {
+ DEBUG_BREAK_POINT();
+ Error("madvise(%p, %lu, MADV_HUGEPAGE) failed: %d - %s.\n", *ptr, bytesToAlloc, errno, strerror(errno));
+ exit(1);
+ }
+ }
+#endif
+
return 0;
}
--- /dev/null
+#include "Base.h"
+#include "Kernel.h"
+#include "Padding.h"
+
+#include <errno.h>
+#include <limits.h>
+
+// Generates a PadInfo struct from padStr. If padStr is NULL then a NULL
+// pointer is returned.
+//
+// Padding string is either "auto" or in the format of
+// <modulus>+<remainder>(,<modulus>+<remainder>)*.
+//
+// If modulus is 0 then the remainder will just be added and no modulo will be
+// applied.
+//
+// Returned PadInfo structs must be freed with PadInfoFree.
+
+PadInfo * PadInfoFromStr(const char * padStr)
+{
+ if (padStr == NULL) {
+ return NULL;
+ }
+ else if (!strncmp("auto", padStr, 4)) {
+
+ PadInfo * padInfo = (PadInfo *)malloc(sizeof(PadInfo) + 4 * sizeof(int));
+ Assert(padInfo != NULL);
+
+ padInfo->nEntries = 2;
+ padInfo->Modulus = (int *)(padInfo + 1);
+ padInfo->Offset = ((int *)(padInfo + 1)) + 2;
+
+ // Intel TLB 2 MiB pages
+ padInfo->Modulus[0] = 2 * 1024 * 1024 * 8;
+ padInfo->Offset[0] = 2 * 1024 * 1024 + 512;
+
+ // Intel L2, 256 sets
+ padInfo->Modulus[1] = 256 * 64;
+ padInfo->Offset[1] = 128;
+
+ return padInfo;
+ }
+ else if (!strncmp("no", padStr, 2) || !strncmp("off", padStr, 3)) {
+ PadInfo * padInfo = (PadInfo *)malloc(sizeof(PadInfo));
+ Assert(padInfo != NULL);
+
+ padInfo->nEntries = 0;
+ padInfo->Modulus = NULL;
+ padInfo->Offset = NULL;
+
+ return padInfo;
+ }
+
+ // Count number of commas. Number of commas + 1 gives us the number of
+ // padding entries.
+ const char * tmp = padStr;
+ int nCommas = 0;
+
+ while (*tmp != 0x00) {
+ if (*tmp == ',') ++nCommas;
+ ++tmp;
+ }
+
+ // Number of padding entries = nCommas + 1.
+ size_t padInfoSize = sizeof(PadInfo) + (nCommas + 1) * 2 * sizeof(int);
+ PadInfo * padInfo = (PadInfo *)malloc(padInfoSize);
+ Assert(padInfo != NULL);
+
+ memset(padInfo, 0x00, padInfoSize);
+
+ padInfo->Modulus = (int *)(padInfo + 1);
+ padInfo->Offset = ((int *)(padInfo + 1)) + nCommas + 1;
+
+ tmp = padStr;
+
+ int padEntryIndex = 0;
+ int entriesSeen = 0;
+
+ int modulusSeen = 0;
+ int offsetSeen = 0;
+
+ // TODO: parsing is currently a mess. We assume a fixed format, which
+ // should be easier to parse now.
+ while (*tmp != 0x00) {
+
+ if (*tmp == ',') {
+ // Check if modulus was seen in this section, then move to the next.
+
+ if (!modulusSeen) {
+ fprintf(stderr, "ERROR: modulus missing before next section in padding string.\n");
+ exit(1);
+ }
+
+ modulusSeen = 0;
+ offsetSeen = 0;
+ ++padEntryIndex;
+ ++tmp;
+ }
+ else if (*tmp == '+') {
+ // Check if modulus was seen in this section and no offset yet.
+
+ if (!modulusSeen) {
+ fprintf(stderr, "ERROR: modulus missing before offset in padding string.\n");
+ exit(1);
+ }
+
+ if (offsetSeen) {
+ fprintf(stderr, "ERROR: offset is only allowed to be specified once per section in padding string.\n");
+ exit(1);
+ }
+
+ ++tmp;
+ }
+ else if (*tmp >= '0' && *tmp <= '9') {
+ // Parse number and check for all the errors.
+
+ char * endPtr;
+
+ errno = 0;
+
+ int value = strtol(tmp, &endPtr, 10);
+
+ if ((value == LONG_MIN || value == LONG_MAX) && errno != 0) {
+ fprintf(stderr, "ERROR: over- or underflow in modulus/offset of padding string.\n");
+ exit(1);
+ }
+ else if (value == 0 && errno != 0) {
+ fprintf(stderr, "ERROR: error parsing modulus/offset of padding string: %d - %s\n", errno, strerror(errno));
+ exit(1);
+ }
+ else if (tmp == endPtr) {
+ // No digits found: empty string or invalid pad string.
+ fprintf(stderr, "ERROR: modulus or offset missing in padding string.\n");
+ exit(1);
+ }
+
+ if (value < 0) {
+ fprintf(stderr, "ERROR: modulus and offset must be >= 0 in padding string.\n");
+ exit(1);
+ }
+
+ if (!modulusSeen) {
+ if (offsetSeen) {
+ fprintf(stderr, "ERROR: offset already seen, but not modulus in padding string.\n");
+ exit(1);
+ }
+
+ Verify(padEntryIndex < (nCommas + 1));
+
+ padInfo->Modulus[padEntryIndex] = value;
+
+ modulusSeen = 1;
+ ++entriesSeen;
+ }
+ else {
+ if (offsetSeen) {
+ fprintf(stderr, "ERROR: offset is only allowed to be specified once per section in padding string.\n");
+ exit(1);
+ }
+
+ Verify(padEntryIndex < (nCommas + 1));
+
+ padInfo->Offset[padEntryIndex] = value;
+
+ offsetSeen = 1;
+ }
+ // No increment of tmp needed, as endPtr points already to the first
+ // character which is not part of the number.
+ tmp = endPtr;
+ }
+ else {
+ fprintf(stderr, "ERROR: padding string contains invalid characters.\n");
+ exit(1);
+ }
+
+ }
+
+ if (entriesSeen != nCommas + 1) {
+ fprintf(stderr, "ERROR: did not find all padding entries.\n");
+ exit(1);
+ }
+
+ for (int i = 0; i < nCommas + 1; ++i) {
+ if (padInfo->Offset[i] >= padInfo->Modulus[i]) {
+ fprintf(stderr, "ERROR: offset in padding entry %d is equal or larger than modulus.\n", i);
+ exit(1);
+ }
+ }
+
+ padInfo->nEntries = entriesSeen;
+
+ return padInfo;
+}
+
+int PadCells(int nCells, int cellSizeBytes, PadInfo ** padInfoPtr)
+{
+ Assert(padInfoPtr != NULL);
+
+ // If the padInfo is NULL determine if we use the "auto" configuration.
+ const int defaultAutoPadding = 1;
+
+
+ if (*padInfoPtr == NULL) {
+ if (!defaultAutoPadding) {
+ return nCells;
+ }
+
+ *padInfoPtr = PadInfoFromStr("auto");
+ }
+
+ PadInfo * padInfo = *padInfoPtr;
+
+ Assert(padInfo->nEntries >= 0);
+
+ for (int i = 0; i < padInfo->nEntries; ++i) {
+ Assert(padInfo->Modulus[i] >= 0);
+ Assert(padInfo->Offset[i] >= 0);
+
+ int nPadCells = 0;
+
+ if (padInfo->Modulus[i] == 0) {
+ // When the modulus is just zero then we only add the offset.
+ nPadCells = padInfo->Offset[i] * cellSizeBytes;
+ }
+ else {
+ int nModCells = padInfo->Modulus[i] / cellSizeBytes;
+
+ if (nModCells == 0) {
+ fprintf(stderr, "ERROR: modulus of %d byte in padding entry %d becomes zero for PDF size %d byte.\n",
+ padInfo->Modulus[i], i, cellSizeBytes);
+ exit(1);
+ }
+
+ int nOffsetCells = padInfo->Offset[i] / cellSizeBytes;
+ int nRemainder = nCells % nModCells;
+
+ nPadCells = (nOffsetCells + nModCells - nRemainder) % nModCells;
+ }
+
+ nCells += nPadCells;
+ }
+
+ return nCells;
+}
+
+void PadInfoPrint(PadInfo * padInfo, FILE * f, const char * prefix)
+{
+ Assert(padInfo != NULL);
+ Assert(padInfo->nEntries >= 0);
+ Assert(f != NULL);
+ Assert(prefix != NULL);
+
+ for (int i = 0; i < padInfo->nEntries; ++i) {
+ fprintf(f, "%sm: %10d b o: %10d b m: %f KiB o: %f KiB m: %f MiB o: %f MiB\n",
+ prefix,
+ padInfo->Modulus[i], padInfo->Offset[i],
+ padInfo->Modulus[i] / 1024.0, padInfo->Offset[i] / 1024.0,
+ padInfo->Modulus[i] / 1024.0 / 1024.0, padInfo->Offset[i] / 1024.0 / 1024.0);
+ }
+
+ return;
+}
+
+void PadInfoFree(PadInfo * padInfo)
+{
+ if (padInfo != NULL) {
+ free(padInfo);
+ }
+
+ return;
+}
+
+// Returns the new padded cell size and reports if padding was performed and how.
+
+int PadCellsAndReport(int nCells, int cellSizeBytes, PadInfo ** padInfoPtr)
+{
+ // Apply padding.
+ int nNewCells = PadCells(nCells, sizeof(PdfT), padInfoPtr);
+
+ if (nCells != nNewCells) {
+ printf("# padding info:\n");
+ PadInfoPrint(*padInfoPtr, stdout, "# ");
+ // int nPaddedCells = nNewCells - nCells;
+ // printf("# padding per dir.: %10d nodes (%f MiB)\n", nPaddedCells, nPaddedCells / 1024.0 / 1024.0 * sizeof(PdfT));
+ // printf("# padding total: %10d nodes (%f MiB)\n", 19 * nPaddedCells, 19 * nPaddedCells / 1024.0 / 1024.0 * sizeof(PdfT));
+
+ int nPadCells = nNewCells - nCells;
+
+ printf("#\n# padding %d nodes with %d nodes, %f MiB per direction, %f MiB in total\n",
+ nCells,
+ nPadCells,
+ nPadCells * sizeof(PdfT) / 1024.0 / 1024.0,
+ nPadCells * sizeof(PdfT) / 1024.0 / 1024.0 * 19);
+ }
+ else {
+ printf("# padding info: no padding used\n");
+ }
+
+ return nNewCells;
+}
--- /dev/null
+#ifndef __PADDING_H__
+#define __PADDING_H__
+
+typedef struct PadInfo_
+{
+ int nEntries;
+ int * Modulus;
+ int * Offset;
+} PadInfo;
+
+PadInfo * PadInfoFromStr(const char * padStr);
+
+int PadCells(int nCells, int cellSizeBytes, PadInfo ** padInfo);
+int PadCellsAndReport(int nCells, int cellSizeBytes, PadInfo ** padInfoPtr);
+
+void PadInfoPrint(PadInfo * padInfo, FILE * f, const char * prefix);
+void PadInfoFree(PadInfo * padInfo);
+
+#endif // __PADDING_H__
#include "Base.h"
#include "Pinning.h"
-
-
-
// -----------------------------------------------------------------------
//
// Binds the calling thread to specified core.
//
// -----------------------------------------------------------------------
-int PinCurrentThreadByEnvVar(const char * envVarName,
- int mpiRank, int nodeRank, int threadNumber)
+int PinCurrentThreadByEnvVar(const char * envVarName, int threadNumber)
{
const char * envVarValue;
int core;
envVarValue = getenv(envVarName);
if (envVarValue == NULL) {
- if (mpiRank == 0) {
- Print("skip pinning: env var %s not set\n", envVarName);
- }
+ printf("WARNING: skip pinning: env var %s not set\n", envVarName);
return 0;
}
- core = PinParseCpuList(envVarValue, mpiRank, nodeRank, threadNumber);
+ core = PinParseCpuList(envVarValue, threadNumber);
if (core < 0) {
return core;
//
// -----------------------------------------------------------------------
-int PinCurrentThreadByCpuList(const char * cpuList,
- int mpiRank, int nodeRank, int threadNumber)
+int PinCurrentThreadByCpuList(const char * cpuList, int threadNumber)
{
int core;
if (cpuList == NULL) {
- if (mpiRank == 0) {
- printf("ERROR: cpu list is NULL.\n");
- }
+ printf("ERROR: cpu list is NULL.\n");
exit(1);
}
- core = PinParseCpuList(cpuList, mpiRank, nodeRank, threadNumber);
+ core = PinParseCpuList(cpuList, threadNumber);
if (core < 0) {
return core;
// -----------------------------------------------------------------------
//
// Parses the provided cpu list and returns the core number for the
-// specified MPI rank, local rank, and thread.
-//
-// The cpu list has for example a format of: 0,1,2 or 0,1,2_3,4,5
-//
-// Blocks (0,1,2 or 3,4,5) separated by "_" specify pinning inside a
-// node rank. The first block maps to node rank 1, the second to node
-// rank 2, etc.
-//
-// Inside a block the core numbers specify where the threads should
-// be pinned to. They are separated by "," and the first number maps
-// to the first core, the second number to the second core, etc.
-//
-// For example: 0,2,4_6,8,10
+// specified thread.
//
-// Node rank 0 thread 0 pinned to core 0
-// 0 1 2
-// 0 2 4
-// 1 0 6
-// 1 1 8
-// 1 2 10
+// The cpu list has for example a format of: 0,1,2
//
// -----------------------------------------------------------------------
-int PinParseCpuList(const char * cpuList,
- int mpiRank, int nodeRank, int threadNumber)
+int PinParseCpuList(const char * cpuList, int threadNumber)
{
int cpu = -1;
const char * c = cpuList;
// Ensure only valid characters are in the cpu list.
- // Cpu list is in the format of "0,1,2_3,4,5".
- while (((*c >= '0' && *c <= '9') || *c == ',' || *c == '_')) {
+ // Cpu list is in the format of "0,1,2,3,4,5".
+ while (((*c >= '0' && *c <= '9') || *c == ',')) {
++c;
}
c = cpuList;
- int i = 0;
-
- // Move variable c after the "nodeRank"th "_" in the cpu list.
- while (i < nodeRank && *c != 0x00) {
- if (*c == '_') ++i;
- ++c;
- }
-
- if (i != nodeRank || *c < '0' || *c > '9') {
- // Cpu list for this node rank not found.
- return -3;
- }
-
// Now find the core for the specified thread.
int t = 0;
if (*c == ',') {
++t;
}
- else if (*c == '_') {
- // Unexpected character at this position.
- break;
- }
-
++c;
}
int PinCurrentThreadToCore(int coreNumber);
-int PinParseCpuList(const char * cpuList,
- int mpiRank, int nodeRank, int threadNumber);
+int PinParseCpuList(const char * cpuList, int threadNumber);
-int PinCurrentThreadByEnvVar(const char * envVarName,
- int mpiRank, int nodeRank, int threadNumber);
+int PinCurrentThreadByEnvVar(const char * envVarName, int threadNumber);
-int PinCurrentThreadByCpuList(const char * cpuList,
- int mpiRank, int nodeRank, int threadNumber);
+int PinCurrentThreadByCpuList(const char * cpuList, int threadNumber);
int PinCurrentCore();
make clean-all
make -j CONFIG=$Config TAG=$XTag-debug
-make -j CONFIG=$Config BUILD=$Build TAG=$XTag-v
+make -j CONFIG=$Config BUILD=$Build TAG=$XTag-v VERIFICATION=on
make -j CONFIG=$Config BUILD=$Build TAG=$XTag-b BENCHMARK=on
BinaryV="../bin/lbmbenchk-$Config-$Build$XTag-v"