bulk commit
authorMarkus Wittmann <markus.wittmann@fau.de>
Thu, 2 Nov 2017 14:54:11 +0000 (15:54 +0100)
committerMarkus Wittmann <markus.wittmann@fau.de>
Thu, 2 Nov 2017 14:54:11 +0000 (15:54 +0100)
- add AA pattern full array kernels
- add padding for list kernels
- transposed loops

38 files changed:
doc/Makefile
doc/html/main.html
doc/main.css [new file with mode: 0644]
doc/main.rst
src/Base.h
src/BenchKernelD3Q19.c
src/BenchKernelD3Q19Aa.c [new file with mode: 0644]
src/BenchKernelD3Q19Aa.h [new file with mode: 0644]
src/BenchKernelD3Q19AaCommon.c [new file with mode: 0644]
src/BenchKernelD3Q19AaCommon.h [new file with mode: 0644]
src/BenchKernelD3Q19AaVec.c [new file with mode: 0644]
src/BenchKernelD3Q19AaVec.h [new file with mode: 0644]
src/BenchKernelD3Q19AaVecCommon.c [new file with mode: 0644]
src/BenchKernelD3Q19AaVecCommon.h [new file with mode: 0644]
src/BenchKernelD3Q19Common.c
src/BenchKernelD3Q19Common.h
src/BenchKernelD3Q19List.c
src/BenchKernelD3Q19ListAaCommon.c
src/BenchKernelD3Q19ListAaPv.c
src/BenchKernelD3Q19ListAaPvCommon.c
src/BenchKernelD3Q19ListAaRiaCommon.c
src/BenchKernelD3Q19ListCommon.c
src/BenchKernelD3Q19ListPullSplitNt.c
src/BenchKernelD3Q19ListPullSplitNtCommon.c
src/Geometry.c
src/Geometry.h
src/Kernel.h
src/KernelFunctions.h
src/Lattice.c [new file with mode: 0644]
src/Lattice.h
src/Main.c
src/Makefile
src/Memory.c
src/Padding.c [new file with mode: 0644]
src/Padding.h [new file with mode: 0644]
src/Pinning.c
src/Pinning.h
src/test.sh

index 690e4eb0bce050add7ca4b24b9e1a438534626ce..6417a6e5ecf3ccb1902483dfb6fbda688dc872f6 100644 (file)
@@ -34,5 +34,5 @@ main: main.rst
 #main.css
        [ -d html ] || mkdir -p html
 #    rst2html --stylesheet=html4css1.css,main.css $< html/$@.html
-       rst2html --stylesheet=html4css1.css $< html/$@.html
+       rst2html --stylesheet=html4css1.css,main.css $< html/$@.html
 
index 36579ccbf7719cfdebdce12ad9b4cd4732c548ba..511f6b2dcf0bfd7cd2be208fc20e3d3f83c391e3 100644 (file)
@@ -335,6 +335,56 @@ h4 tt.docutils, h5 tt.docutils, h6 tt.docutils {
 ul.auto-toc {
   list-style-type: none }
 
+</style>
+<style type="text/css">
+
+
+h1, h2, h3, h4, h5, h6 {
+  font-family: sans-serif;
+  font-size: 100%;
+  background-color: #dcdcdc;
+}
+
+h1.title {
+  background-color: gray;
+  color: white
+}
+
+table.footnote {
+  padding-left: 0.5ex;
+}
+
+table.citation {
+  padding-left: 0.5ex
+}
+
+td.label {
+  width: 10%;
+}
+
+table, table.docutils, td, th {
+  border: 0;
+}
+
+table.citation, table.footnote {
+  width: 100%;
+}
+
+th {
+    background-color: lavender ;
+}
+
+tr:nth-child(even) {
+    xxbackground-color: aliceblue;
+    background-color: white; 
+}
+tr:nth-child(odd) {
+    xxbackground-color: lavender;
+    background-color: whitesmoke;
+}
+
+
+
 </style>
 </head>
 <body>
@@ -375,15 +425,23 @@ ul.auto-toc {
 <li><a class="reference internal" href="#benchmarking" id="id4">1.2&nbsp;&nbsp;&nbsp;Benchmarking</a></li>
 <li><a class="reference internal" href="#release-and-verification" id="id5">1.3&nbsp;&nbsp;&nbsp;Release and Verification</a></li>
 <li><a class="reference internal" href="#compilers" id="id6">1.4&nbsp;&nbsp;&nbsp;Compilers</a></li>
-<li><a class="reference internal" href="#options-summary" id="id7">1.5&nbsp;&nbsp;&nbsp;Options Summary</a></li>
+<li><a class="reference internal" href="#cleaning" id="id7">1.5&nbsp;&nbsp;&nbsp;Cleaning</a></li>
+<li><a class="reference internal" href="#options-summary" id="id8">1.6&nbsp;&nbsp;&nbsp;Options Summary</a></li>
 </ul>
 </li>
-<li><a class="reference internal" href="#invocation" id="id8">2&nbsp;&nbsp;&nbsp;Invocation</a><ul class="auto-toc">
-<li><a class="reference internal" href="#command-line-parameters" id="id9">2.1&nbsp;&nbsp;&nbsp;Command Line Parameters</a></li>
+<li><a class="reference internal" href="#invocation" id="id9">2&nbsp;&nbsp;&nbsp;Invocation</a><ul class="auto-toc">
+<li><a class="reference internal" href="#command-line-parameters" id="id10">2.1&nbsp;&nbsp;&nbsp;Command Line Parameters</a></li>
+<li><a class="reference internal" href="#kernels" id="id11">2.2&nbsp;&nbsp;&nbsp;Kernels</a></li>
 </ul>
 </li>
-<li><a class="reference internal" href="#id1" id="id10">3&nbsp;&nbsp;&nbsp;Benchmarking</a></li>
-<li><a class="reference internal" href="#acknowledgements" id="id11">4&nbsp;&nbsp;&nbsp;Acknowledgements</a></li>
+<li><a class="reference internal" href="#id1" id="id12">3&nbsp;&nbsp;&nbsp;Benchmarking</a><ul class="auto-toc">
+<li><a class="reference internal" href="#padding" id="id13">3.1&nbsp;&nbsp;&nbsp;Padding</a></li>
+</ul>
+</li>
+<li><a class="reference internal" href="#geometries" id="id14">4&nbsp;&nbsp;&nbsp;Geometries</a></li>
+<li><a class="reference internal" href="#results" id="id15">5&nbsp;&nbsp;&nbsp;Results</a></li>
+<li><a class="reference internal" href="#licence" id="id16">6&nbsp;&nbsp;&nbsp;Licence</a></li>
+<li><a class="reference internal" href="#acknowledgements" id="id17">7&nbsp;&nbsp;&nbsp;Acknowledgements</a></li>
 </ul>
 </div>
 <div class="section" id="compilation">
@@ -399,15 +457,15 @@ depending on compiler and build configuration.</p>
 <div class="section" id="debug-and-verification">
 <h2><a class="toc-backref" href="#id3">1.1&nbsp;&nbsp;&nbsp;Debug and Verification</a></h2>
 <pre class="literal-block">
-make
+make BUILD=debug BENCHMARK=off
 </pre>
-<p>Running <tt class="docutils literal">make</tt> without any arguments builds the debug version (BUILD=debug) of
+<p>Running <tt class="docutils literal">make</tt> with <tt class="docutils literal">BUILD=debug</tt> builds the debug version of
 the benchmark kernels, where no optimizations are performed,  line numbers and
 debug symbols are included as well as <tt class="docutils literal">DEBUG</tt> will be defined.  The resulting
 binary will be found in the <tt class="docutils literal">bin</tt> subdirectory and named
 <tt class="docutils literal"><span class="pre">lbmbenchk-linux-&lt;compiler&gt;-debug</span></tt>.</p>
-<p>Without any further specification the binary includes verification
-(<tt class="docutils literal">VERIFICATION=on</tt>), statistics (<tt class="docutils literal">STATISTICS</tt>), and VTK output
+<p>Specifying <tt class="docutils literal">BENCHMARK=off</tt> turns on verification
+(<tt class="docutils literal">VERIFICATION=on</tt>), statistics (<tt class="docutils literal">STATISTICS=on</tt>), and VTK output
 (<tt class="docutils literal">VTK_OUTPUT=on</tt>) enabled.</p>
 <p>Please note that the generated binary will therefore
 exhibit a poor performance.</p>
@@ -416,9 +474,10 @@ exhibit a poor performance.</p>
 <h2><a class="toc-backref" href="#id4">1.2&nbsp;&nbsp;&nbsp;Benchmarking</a></h2>
 <p>To generate a binary for benchmarking run make with</p>
 <pre class="literal-block">
-make BENCHMARK=on BUILD=release
+make
 </pre>
-<p>Here BUILD=release turns optimizations on and BENCHMARK=on disables
+<p>As default <tt class="docutils literal">BENCHMARK=on</tt> and <tt class="docutils literal">BUILD=release</tt> is set, where
+BUILD=release turns optimizations on and <tt class="docutils literal">BENCHMARK=on</tt> disables
 verfification, statistics, and VTK output.</p>
 </div>
 <div class="section" id="release-and-verification">
@@ -426,7 +485,7 @@ verfification, statistics, and VTK output.</p>
 <p>Verification with the debug builds can be extremely slow. Hence verification
 capabilities can be build with release builds:</p>
 <pre class="literal-block">
-make BUILD=release
+make BENCHMARK=off
 </pre>
 </div>
 <div class="section" id="compilers">
@@ -435,8 +494,23 @@ make BUILD=release
 both configuration can be chosen via <tt class="docutils literal"><span class="pre">CONFIG=linux-gcc</span></tt> or
 <tt class="docutils literal"><span class="pre">CONFIG=linux-intel</span></tt>.</p>
 </div>
+<div class="section" id="cleaning">
+<h2><a class="toc-backref" href="#id7">1.5&nbsp;&nbsp;&nbsp;Cleaning</a></h2>
+<p>For each configuration and build (debug/release) a subdirectory under the
+<tt class="docutils literal">src/obj</tt> directory is created where the dependency and object files are
+stored.
+With</p>
+<pre class="literal-block">
+make CONFIG=... BUILD=... clean
+</pre>
+<p>a specific combination is select and cleaned, whereas with</p>
+<pre class="literal-block">
+make clean-all
+</pre>
+<p>all object and dependency files are deleted.</p>
+</div>
 <div class="section" id="options-summary">
-<h2><a class="toc-backref" href="#id7">1.5&nbsp;&nbsp;&nbsp;Options Summary</a></h2>
+<h2><a class="toc-backref" href="#id8">1.6&nbsp;&nbsp;&nbsp;Options Summary</a></h2>
 <p>Options that can be specified when building the framework with make:</p>
 <table border="1" class="docutils">
 <colgroup>
@@ -451,19 +525,14 @@ both configuration can be chosen via <tt class="docutils literal"><span class="p
 <td>default</td>
 <td>description</td>
 </tr>
-<tr><td>TARCH</td>
-<td>--</td>
-<td>--</td>
-<td>Via TARCH the architecture the compiler generates code for can be overriden. The value depends on the chose compiler.</td>
-</tr>
 <tr><td>BENCHMARK</td>
 <td>on, off</td>
-<td>off</td>
-<td>If enabled, disables VERIFICATION, STATISTICS, VTK_OUTPUT.</td>
+<td>on</td>
+<td>If enabled, disables VERIFICATION, STATISTICS, VTK_OUTPUT. If disabled enables the three former options.</td>
 </tr>
 <tr><td>BUILD</td>
 <td>debug, release</td>
-<td>debug</td>
+<td>release</td>
 <td>No optimization, debug symbols, DEBUG defined.</td>
 </tr>
 <tr><td>CONFIG</td>
@@ -486,6 +555,11 @@ both configuration can be chosen via <tt class="docutils literal"><span class="p
 <td>off</td>
 <td>View statistics, like density etc, during simulation.</td>
 </tr>
+<tr><td>TARCH</td>
+<td>--</td>
+<td>--</td>
+<td>Via TARCH the architecture the compiler generates code for can be overridden. The value depends on the chosen compiler.</td>
+</tr>
 <tr><td>VERIFICATION</td>
 <td>on, off</td>
 <td>off</td>
@@ -501,16 +575,18 @@ both configuration can be chosen via <tt class="docutils literal"><span class="p
 </div>
 </div>
 <div class="section" id="invocation">
-<h1><a class="toc-backref" href="#id8">2&nbsp;&nbsp;&nbsp;Invocation</a></h1>
+<h1><a class="toc-backref" href="#id9">2&nbsp;&nbsp;&nbsp;Invocation</a></h1>
 <p>Running the binary will print among the GPL licence header a line like the following:</p>
-<blockquote>
-LBM Benchmark Kernels 0.1, compiled Jul  5 2017 21:59:22, type: verification</blockquote>
+<pre class="literal-block">
+LBM Benchmark Kernels 0.1, compiled Jul  5 2017 21:59:22, type: verification
+</pre>
 <p>if verfication was enabled during compilation or</p>
-<blockquote>
-LBM Benchmark Kernels 0.1, compiled Jul  5 2017 21:59:22, type: benchmark</blockquote>
+<pre class="literal-block">
+LBM Benchmark Kernels 0.1, compiled Jul  5 2017 21:59:22, type: benchmark
+</pre>
 <p>if verfication was disabled during compilation.</p>
 <div class="section" id="command-line-parameters">
-<h2><a class="toc-backref" href="#id9">2.1&nbsp;&nbsp;&nbsp;Command Line Parameters</a></h2>
+<h2><a class="toc-backref" href="#id10">2.1&nbsp;&nbsp;&nbsp;Command Line Parameters</a></h2>
 <p>Running the binary with <tt class="docutils literal"><span class="pre">-h</span></tt> list all available parameters:</p>
 <pre class="literal-block">
 Usage:
@@ -540,7 +616,7 @@ $ bin/lbmbenchk-linux-intel-release -verfiy -dims 32x32x32
 <p>Kernel specific parameters can be opatained via selecting the specific kernel
 and passing <tt class="docutils literal"><span class="pre">-h</span></tt> as parameter:</p>
 <pre class="literal-block">
-$ bin/lbmbenchk-linux-intel-release -kernel -- -h
+$ bin/lbmbenchk-linux-intel-release -kernel kernel-name -- -h
 ...
 Kernel parameters:
 [-blk &lt;n&gt;] [-blk-[xyz] &lt;n&gt;]
@@ -574,14 +650,225 @@ Available kernels to benchmark:
    blk-pull-aos
 </pre>
 </div>
+<div class="section" id="kernels">
+<h2><a class="toc-backref" href="#id11">2.2&nbsp;&nbsp;&nbsp;Kernels</a></h2>
+<p>The following list shortly describes available kernels:</p>
+<ul class="simple">
+<li>push-soa/push-aos/pull-soa/pull-aos:
+Unoptimized kernels (but stream/collide are already fused) using two grids as
+source and destination. Implement push/pull semantics as well structure of
+arrays (soa) or array of structures (aos) layout.</li>
+<li>blk-push-soa/blk-push-aos/blk-pull-soa/blk-pull-aos:
+The same as the unoptimized kernels without the blk prefix, except that they support
+spatial blocking, i.e. loop blocking of the three loops used to iterate over
+the lattice. Here manual work sharing for OpenMP is used.</li>
+<li>list-push-soa/list-push-aos/list-pull-soa/list-pull-aos:
+The same as the unoptimized kernels without the list prefix, but for indirect addressing.
+Here only a 1D vector of is used to store the fluid nodes, omitting the
+obstacles. An adjacency list is used to recover the neighborhood associations.</li>
+<li>list-pull-split-nt-1s-soa/list-pull-split-nt-2s-soa:
+Optimized variant of list-pull-soa. Chunks of the lattice are processed as
+once. Postcollision values are written back via nontemporal stores in 18 (1s)
+or 9 (2s) loops.</li>
+<li>list-aa-aos/list-aa-soa:
+Unoptimized implementation of the AA pattern for the 1D vector with adjacency
+list. Supported are array of structures (aos) and structure of arrays (soa)
+data layout is supported.</li>
+<li>list-aa-ria-soa:
+Implementation of AA pattern with intrinsics for the 1D vector with adjacency
+list. Furthermore it contains a vectorized even time step and run length
+coding to reduce the loop balance of the odd time step.</li>
+<li>list-aa-pv-soa:
+All optimizations of list-aa-ria-soa. Additional with partial vectorization
+of the odd time step.</li>
+</ul>
+<p>Note that all array of structures (aos) kernels might require blocking
+(depending on the domain size) to reach the performance of their structure of
+arrays (soa) counter parts.</p>
+<p>The following table summarizes the properties of the kernels. Here <strong>D</strong> means
+direct addressing, i.e. full array, <strong>I</strong> means indirect addressing, i.e. 1D
+vector with adjacency list, <strong>x</strong> means supported, whereas <strong>--</strong> means unsupported.
+The loop balance B_l is computed for D3Q19 model with double precision floating
+point for PDFs (8 byte) and 4 byte integers for the index (adjacency list).
+As list-aa-ria-soa and list-aa-pv-soa support run length coding their effective
+loop balance depends on the geometry. The effective loop balance is printed
+during each run.</p>
+<table border="1" class="docutils">
+<colgroup>
+<col width="29%" />
+<col width="14%" />
+<col width="14%" />
+<col width="6%" />
+<col width="10%" />
+<col width="10%" />
+<col width="16%" />
+</colgroup>
+<thead valign="bottom">
+<tr><th class="head">kernel name</th>
+<th class="head">prop. step</th>
+<th class="head">data layout</th>
+<th class="head">addr.</th>
+<th class="head">parallel</th>
+<th class="head">blocking</th>
+<th class="head">B_l [B/FLUP]</th>
+</tr>
+</thead>
+<tbody valign="top">
+<tr><td>push-soa</td>
+<td>OS</td>
+<td>SoA</td>
+<td>D</td>
+<td>x</td>
+<td>--</td>
+<td>456</td>
+</tr>
+<tr><td>push-aos</td>
+<td>OS</td>
+<td>AoS</td>
+<td>D</td>
+<td>x</td>
+<td>--</td>
+<td>456</td>
+</tr>
+<tr><td>pull-soa</td>
+<td>OS</td>
+<td>SoA</td>
+<td>D</td>
+<td>x</td>
+<td>--</td>
+<td>456</td>
+</tr>
+<tr><td>pull-aos</td>
+<td>OS</td>
+<td>AoS</td>
+<td>D</td>
+<td>x</td>
+<td>--</td>
+<td>456</td>
+</tr>
+<tr><td>blk-push-soa</td>
+<td>OS</td>
+<td>SoA</td>
+<td>D</td>
+<td>x</td>
+<td>x</td>
+<td>456</td>
+</tr>
+<tr><td>blk-push-aos</td>
+<td>OS</td>
+<td>AoS</td>
+<td>D</td>
+<td>x</td>
+<td>x</td>
+<td>456</td>
+</tr>
+<tr><td>blk-pull-soa</td>
+<td>OS</td>
+<td>SoA</td>
+<td>D</td>
+<td>x</td>
+<td>x</td>
+<td>456</td>
+</tr>
+<tr><td>blk-pull-aos</td>
+<td>OS</td>
+<td>AoS</td>
+<td>D</td>
+<td>x</td>
+<td>x</td>
+<td>456</td>
+</tr>
+<tr><td>list-push-soa</td>
+<td>OS</td>
+<td>SoA</td>
+<td>I</td>
+<td>x</td>
+<td>x</td>
+<td>528</td>
+</tr>
+<tr><td>list-push-aos</td>
+<td>OS</td>
+<td>AoS</td>
+<td>I</td>
+<td>x</td>
+<td>x</td>
+<td>528</td>
+</tr>
+<tr><td>list-pull-soa</td>
+<td>OS</td>
+<td>SoA</td>
+<td>I</td>
+<td>x</td>
+<td>x</td>
+<td>528</td>
+</tr>
+<tr><td>list-pull-aos</td>
+<td>OS</td>
+<td>AoS</td>
+<td>I</td>
+<td>x</td>
+<td>x</td>
+<td>528</td>
+</tr>
+<tr><td>list-pull-split-nt-1s</td>
+<td>OS</td>
+<td>SoA</td>
+<td>I</td>
+<td>x</td>
+<td>x</td>
+<td>376</td>
+</tr>
+<tr><td>list-pull-split-nt-2s</td>
+<td>OS</td>
+<td>SoA</td>
+<td>I</td>
+<td>x</td>
+<td>x</td>
+<td>376</td>
+</tr>
+<tr><td>list-aa-soa</td>
+<td>AA</td>
+<td>SoA</td>
+<td>I</td>
+<td>x</td>
+<td>x</td>
+<td>340</td>
+</tr>
+<tr><td>list-aa-aos</td>
+<td>AA</td>
+<td>AoS</td>
+<td>I</td>
+<td>x</td>
+<td>x</td>
+<td>340</td>
+</tr>
+<tr><td>list-aa-ria-soa</td>
+<td>AA</td>
+<td>SoA</td>
+<td>I</td>
+<td>x</td>
+<td>x</td>
+<td>304-342</td>
+</tr>
+<tr><td>list-aa-pv-soa</td>
+<td>AA</td>
+<td>SoA</td>
+<td>I</td>
+<td>x</td>
+<td>x</td>
+<td>304-342</td>
+</tr>
+</tbody>
+</table>
+</div>
 </div>
 <div class="section" id="id1">
-<h1><a class="toc-backref" href="#id10">3&nbsp;&nbsp;&nbsp;Benchmarking</a></h1>
+<h1><a class="toc-backref" href="#id12">3&nbsp;&nbsp;&nbsp;Benchmarking</a></h1>
 <p>Correct benchmarking is a nontrivial task. Whenever benchmark results should be
 created make sure the binary was compiled with:</p>
 <ul class="simple">
-<li><tt class="docutils literal">BENCHMARK=on</tt> and</li>
-<li><tt class="docutils literal">BUILD=release</tt> and</li>
+<li><tt class="docutils literal">BENCHMARK=on</tt> (default if not overriden) and</li>
+<li><tt class="docutils literal">BUILD=release</tt> (default if not overriden) and</li>
 <li>the correct ISA for macros is used, selected via <tt class="docutils literal">ISA</tt> and</li>
 <li>use <tt class="docutils literal">TARCH</tt> to specify the architecture the compiler generates code for.</li>
 </ul>
@@ -594,7 +881,13 @@ $ bin/lbmbenchk-linux-intel-release ... -t 10 -pin $(seq -s , 0 9)
 <ul class="simple">
 <li>transparent huge pages: when allocating memory small 4 KiB pages might be
 replaced with larger ones. This is in general a good thing, but if this is
-really the case, depends on the system settings.</li>
+really the case, depends on the system settings (check e.g. the status of
+<tt class="docutils literal">/sys/kernel/mm/transparent_hugepage/enabled</tt>).
+Currently <tt class="docutils literal">madvise(MADV_HUGEPAGE)</tt> is used for allocations which are aligned to
+a 4 KiB page, which should be the case for the lattices.
+This should result in huge pages except THP is disabled on the machine.
+(NOTE: madvise() is used if <tt class="docutils literal">HAVE_HUGE_PAGES</tt> is defined, which is currently
+hard coded defined in <tt class="docutils literal">Memory.c</tt>).</li>
 <li>CPU/core frequency: For reproducible results the frequency of all cores
 should be fixed.</li>
 <li>NUMA placement policy: The benchmark assumes a first touch policy, which
@@ -604,19 +897,63 @@ used is already full memory might be allocated in a remote domain. Accesses
 to remote domains typically have a higher latency and lower bandwidth.</li>
 <li>System load: interference with other application, espcially on desktop
 systems should be avoided.</li>
-<li>Padding: most kernels do not care about padding against cache or TLB
-thrashing. Even if the number of (fluid) nodes suggest everything is fine,
-through parallelization still problems might occur.</li>
+<li>Padding: For SoA based kernels the number of (fluid) nodes is automatically
+adjusted so that no cache or TLB thrashing should occur. The parameters are
+optimized for current Intel based systems. For more details look into the
+padding section.</li>
 <li>CPU dispatcher function: the compiler might add different versions of a
 function for different ISA extensions. Make sure the code you might think is
 executed is actually the code which is executed.</li>
 </ul>
+<div class="section" id="padding">
+<h2><a class="toc-backref" href="#id13">3.1&nbsp;&nbsp;&nbsp;Padding</a></h2>
+<p>With correct padding cache and TLB thrashing can be avoided. Therefore the
+number of (fluid) nodes used in the data layout is artificially increased.</p>
+<p>Currently automatic padding is active for kernels which support it. It can be
+controlled via the kernel parameter (i.e. parameter after the <tt class="docutils literal"><span class="pre">--</span></tt>)
+<tt class="docutils literal"><span class="pre">-pad</span></tt>. Supported values are <tt class="docutils literal">auto</tt> (default), <tt class="docutils literal">no</tt> (to disable padding),
+or a manual padding.</p>
+<p>Automatic padding tries to avoid cache and TLB thrashing and pads for a 32
+entry (huge pages) TLB with 8 sets and a 512 set (L2) cache. This reflects the
+parameters of current Intel based processors.</p>
+<p>Manual padding is done via a padding string and has the format
+<tt class="docutils literal"><span class="pre">mod_1+offset_1(,mod_n+offset_n)</span></tt>, which specifies numbers of bytes.
+SoA data layouts can exhibit TLB thrashing. Therefore we want to distribute the
+19 pages with one lattice (36 with two lattices) we are concurrently accessing
+over as much sets in the TLB as possible.
+This is controlled by the distance between the accessed pages, which is the
+number of (fluid) nodes in between them and can be adjusted by adding further
+(fluid) nodes.
+We want the distance d (in bytes) between two accessed pages to be e.g.
+<strong>d % (PAGE_SIZE * TLB_SETS) = PAGE_SIZE</strong>.
+This would distribute the pages evenly over the sets. Hereby <strong>PAGE_SIZE * TLB_SETS</strong>
+would be our <tt class="docutils literal">mod_1</tt> and <strong>PAGE_SIZE</strong> (after the =) our <tt class="docutils literal">offset_1</tt>.
+Measurements show that with only a quarter of half of a page size as offset
+higher performance is achieved, which is done by automatic padding.
+On top of this padding more paddings can be added. They are just added to the
+padding string and are separated by commas.</p>
+<p>A zero modulus in the padding string has a special meaning. Here the
+corresponding offset is just added to the number of nodes. A padding string
+like <tt class="docutils literal"><span class="pre">-pad</span> 0+16</tt> would at a static padding of two nodes (one node = 8 b).</p>
+</div>
+</div>
+<div class="section" id="geometries">
+<h1><a class="toc-backref" href="#id14">4&nbsp;&nbsp;&nbsp;Geometries</a></h1>
+<p>TODO: supported geometries: channel, pipe, blocks</p>
+</div>
+<div class="section" id="results">
+<h1><a class="toc-backref" href="#id15">5&nbsp;&nbsp;&nbsp;Results</a></h1>
+<p>TODO</p>
+</div>
+<div class="section" id="licence">
+<h1><a class="toc-backref" href="#id16">6&nbsp;&nbsp;&nbsp;Licence</a></h1>
+<p>The Lattice Boltzmann Benchmark Kernels are licensed under GPLv3.</p>
 </div>
 <div class="section" id="acknowledgements">
-<h1><a class="toc-backref" href="#id11">4&nbsp;&nbsp;&nbsp;Acknowledgements</a></h1>
+<h1><a class="toc-backref" href="#id17">7&nbsp;&nbsp;&nbsp;Acknowledgements</a></h1>
 <p>This work was funded by BMBF, grant no. 01IH15003A (project SKAMPY).</p>
 <p>This work was funded by KONWHIR project OMI4PAPS.</p>
-<p>Document was generated at 2017-10-26 09:43.</p>
+<p>Document was generated at 2017-11-02 15:33.</p>
 </div>
 </div>
 </body>
diff --git a/doc/main.css b/doc/main.css
new file mode 100644 (file)
index 0000000..27d3f26
--- /dev/null
@@ -0,0 +1,46 @@
+
+h1, h2, h3, h4, h5, h6 {
+  font-family: sans-serif;
+  font-size: 100%;
+  background-color: #dcdcdc;
+}
+
+h1.title {
+  background-color: gray;
+  color: white
+}
+
+table.footnote {
+  padding-left: 0.5ex;
+}
+
+table.citation {
+  padding-left: 0.5ex
+}
+
+td.label {
+  width: 10%;
+}
+
+table, table.docutils, td, th {
+  border: 0;
+}
+
+table.citation, table.footnote {
+  width: 100%;
+}
+
+th {
+    background-color: lavender ;
+}
+
+tr:nth-child(even) {
+    xxbackground-color: aliceblue;
+    background-color: white; 
+}
+tr:nth-child(odd) {
+    xxbackground-color: lavender;
+    background-color: whitesmoke;
+}
+
+
index 9ad56ba03c7f461b30bd3ab1249714445371f1bc..528b339be1c37a3bd522bba6256905b16d916428 100644 (file)
@@ -54,16 +54,16 @@ Debug and Verification
 
 :: 
 
-  make
+  make BUILD=debug BENCHMARK=off
 
-Running ``make`` without any arguments builds the debug version (BUILD=debug) of
+Running ``make`` with ``BUILD=debug`` builds the debug version of
 the benchmark kernels, where no optimizations are performed,  line numbers and
 debug symbols are included as well as ``DEBUG`` will be defined.  The resulting
 binary will be found in the ``bin`` subdirectory and named
 ``lbmbenchk-linux-<compiler>-debug``.
  
-Without any further specification the binary includes verification
-(``VERIFICATION=on``), statistics (``STATISTICS``), and VTK output
+Specifying ``BENCHMARK=off`` turns on verification
+(``VERIFICATION=on``), statistics (``STATISTICS=on``), and VTK output
 (``VTK_OUTPUT=on``) enabled. 
 
 Please note that the generated binary will therefore
@@ -74,9 +74,10 @@ Benchmarking
 
 To generate a binary for benchmarking run make with ::
 
-  make BENCHMARK=on BUILD=release
+  make 
 
-Here BUILD=release turns optimizations on and BENCHMARK=on disables
+As default ``BENCHMARK=on`` and ``BUILD=release`` is set, where
+BUILD=release turns optimizations on and ``BENCHMARK=on`` disables
 verfification, statistics, and VTK output.
 
 Release and Verification
@@ -85,7 +86,7 @@ Release and Verification
 Verification with the debug builds can be extremely slow. Hence verification
 capabilities can be build with release builds: ::
 
-  make BUILD=release
+  make BENCHMARK=off 
 
 Compilers
 ---------
@@ -94,6 +95,24 @@ Currently only the GCC and Intel compiler under Linux are supported. Between
 both configuration can be chosen via ``CONFIG=linux-gcc`` or
 ``CONFIG=linux-intel``.
 
+
+Cleaning
+--------
+
+For each configuration and build (debug/release) a subdirectory under the
+``src/obj`` directory is created where the dependency and object files are
+stored.
+With ::
+
+  make CONFIG=... BUILD=... clean
+
+a specific combination is select and cleaned, whereas with ::
+
+  make clean-all
+
+all object and dependency files are deleted.
+
+
 Options Summary
 ---------------
 
@@ -102,13 +121,13 @@ Options that can be specified when building the framework with make:
 ============= ======================= ============ ==========================================================
 name          values                  default      description
 ------------- ----------------------- ------------ ----------------------------------------------------------
-TARCH         --                      --           Via TARCH the architecture the compiler generates code for can be overriden. The value depends on the chose compiler.
-BENCHMARK     on, off                 off          If enabled, disables VERIFICATION, STATISTICS, VTK_OUTPUT.
-BUILD         debug, release          debug        No optimization, debug symbols, DEBUG defined.
+BENCHMARK     on, off                 on           If enabled, disables VERIFICATION, STATISTICS, VTK_OUTPUT. If disabled enables the three former options.
+BUILD         debug, release          release      No optimization, debug symbols, DEBUG defined.
 CONFIG        linux-gcc, linux-intel  linux-intel  Select GCC or Intel compiler. 
 ISA           avx, sse                avx          Determines which ISA extension is used for macro definitions. This is *not* the architecture the compiler generates code for.
 OPENMP        on, off                 on           OpenMP, i.\,e.\. threading support.
 STATISTICS    on, off                 off          View statistics, like density etc, during simulation. 
+TARCH         --                      --           Via TARCH the architecture the compiler generates code for can be overridden. The value depends on the chosen compiler.
 VERIFICATION  on, off                 off          Turn verification on/off.
 VTK_OUTPUT    on, off                 off          Enable/Disable VTK file output.
 ============= ======================= ============ ==========================================================
@@ -116,11 +135,11 @@ VTK_OUTPUT    on, off                 off          Enable/Disable VTK file outpu
 Invocation
 ==========
 
-Running the binary will print among the GPL licence header a line like the following:
+Running the binary will print among the GPL licence header a line like the following: ::
  
   LBM Benchmark Kernels 0.1, compiled Jul  5 2017 21:59:22, type: verification
 
-if verfication was enabled during compilation or
+if verfication was enabled during compilation or ::
 
   LBM Benchmark Kernels 0.1, compiled Jul  5 2017 21:59:22, type: benchmark
 
@@ -159,7 +178,7 @@ iterations, etc, which can afterward be override, e.g.: ::
 Kernel specific parameters can be opatained via selecting the specific kernel
 and passing ``-h`` as parameter: ::
 
-  $ bin/lbmbenchk-linux-intel-release -kernel -- -h
+  $ bin/lbmbenchk-linux-intel-release -kernel kernel-name -- -h
   ...
   Kernel parameters:
   [-blk <n>] [-blk-[xyz] <n>]
@@ -193,6 +212,82 @@ A list of all available kernels can be obtained via ``-list``: ::
      blk-pull-soa
      blk-pull-aos
 
+Kernels
+-------
+
+The following list shortly describes available kernels:
+
+- push-soa/push-aos/pull-soa/pull-aos:
+  Unoptimized kernels (but stream/collide are already fused) using two grids as
+  source and destination. Implement push/pull semantics as well structure of
+  arrays (soa) or array of structures (aos) layout.
+
+- blk-push-soa/blk-push-aos/blk-pull-soa/blk-pull-aos:
+  The same as the unoptimized kernels without the blk prefix, except that they support
+  spatial blocking, i.e. loop blocking of the three loops used to iterate over
+  the lattice. Here manual work sharing for OpenMP is used.
+
+- list-push-soa/list-push-aos/list-pull-soa/list-pull-aos:
+  The same as the unoptimized kernels without the list prefix, but for indirect addressing.
+  Here only a 1D vector of is used to store the fluid nodes, omitting the
+  obstacles. An adjacency list is used to recover the neighborhood associations.
+
+- list-pull-split-nt-1s-soa/list-pull-split-nt-2s-soa:
+  Optimized variant of list-pull-soa. Chunks of the lattice are processed as
+  once. Postcollision values are written back via nontemporal stores in 18 (1s)
+  or 9 (2s) loops.
+
+- list-aa-aos/list-aa-soa:
+  Unoptimized implementation of the AA pattern for the 1D vector with adjacency
+  list. Supported are array of structures (aos) and structure of arrays (soa)
+  data layout is supported.
+
+- list-aa-ria-soa:
+  Implementation of AA pattern with intrinsics for the 1D vector with adjacency
+  list. Furthermore it contains a vectorized even time step and run length
+  coding to reduce the loop balance of the odd time step.
+
+- list-aa-pv-soa:
+  All optimizations of list-aa-ria-soa. Additional with partial vectorization
+  of the odd time step.
+
+
+Note that all array of structures (aos) kernels might require blocking
+(depending on the domain size) to reach the performance of their structure of
+arrays (soa) counter parts.
+
+The following table summarizes the properties of the kernels. Here **D** means
+direct addressing, i.e. full array, **I** means indirect addressing, i.e. 1D
+vector with adjacency list, **x** means supported, whereas **--** means unsupported.
+The loop balance B_l is computed for D3Q19 model with double precision floating
+point for PDFs (8 byte) and 4 byte integers for the index (adjacency list).
+As list-aa-ria-soa and list-aa-pv-soa support run length coding their effective
+loop balance depends on the geometry. The effective loop balance is printed
+during each run.
+
+
+====================== =========== =========== ===== ======== ======== ============
+kernel name            prop. step  data layout addr. parallel blocking B_l [B/FLUP]
+====================== =========== =========== ===== ======== ======== ============
+push-soa               OS          SoA         D     x         --      456
+push-aos               OS          AoS         D     x         --      456
+pull-soa               OS          SoA         D     x         --      456
+pull-aos               OS          AoS         D     x         --      456
+blk-push-soa           OS          SoA         D     x         x       456
+blk-push-aos           OS          AoS         D     x         x       456
+blk-pull-soa           OS          SoA         D     x         x       456
+blk-pull-aos           OS          AoS         D     x         x       456
+list-push-soa          OS          SoA         I     x         x       528
+list-push-aos          OS          AoS         I     x         x       528
+list-pull-soa          OS          SoA         I     x         x       528
+list-pull-aos          OS          AoS         I     x         x       528
+list-pull-split-nt-1s  OS          SoA         I     x         x       376
+list-pull-split-nt-2s  OS          SoA         I     x         x       376
+list-aa-soa            AA          SoA         I     x         x       340
+list-aa-aos            AA          AoS         I     x         x       340
+list-aa-ria-soa        AA          SoA         I     x         x       304-342
+list-aa-pv-soa         AA          SoA         I     x         x       304-342
+====================== =========== =========== ===== ======== ======== ============
 
 Benchmarking
 ============
@@ -200,8 +295,8 @@ Benchmarking
 Correct benchmarking is a nontrivial task. Whenever benchmark results should be
 created make sure the binary was compiled with:  
 
-- ``BENCHMARK=on`` and
-- ``BUILD=release`` and 
+- ``BENCHMARK=on`` (default if not overriden) and
+- ``BUILD=release`` (default if not overriden) and 
 - the correct ISA for macros is used, selected via ``ISA`` and
 - use ``TARCH`` to specify the architecture the compiler generates code for.
 
@@ -214,7 +309,13 @@ Things the binary does nor check or controll:
 
 - transparent huge pages: when allocating memory small 4 KiB pages might be
   replaced with larger ones. This is in general a good thing, but if this is
-  really the case, depends on the system settings.
+  really the case, depends on the system settings (check e.g. the status of
+  ``/sys/kernel/mm/transparent_hugepage/enabled``).
+  Currently ``madvise(MADV_HUGEPAGE)`` is used for allocations which are aligned to
+  a 4 KiB page, which should be the case for the lattices. 
+  This should result in huge pages except THP is disabled on the machine.
+  (NOTE: madvise() is used if ``HAVE_HUGE_PAGES`` is defined, which is currently
+  hard coded defined in ``Memory.c``).
 
 - CPU/core frequency: For reproducible results the frequency of all cores
   should be fixed.
@@ -228,14 +329,69 @@ Things the binary does nor check or controll:
 - System load: interference with other application, espcially on desktop
   systems should be avoided.
 
-- Padding: most kernels do not care about padding against cache or TLB
-  thrashing. Even if the number of (fluid) nodes suggest everything is fine,
-  through parallelization still problems might occur.
+- Padding: For SoA based kernels the number of (fluid) nodes is automatically
+  adjusted so that no cache or TLB thrashing should occur. The parameters are
+  optimized for current Intel based systems. For more details look into the
+  padding section.
 
 - CPU dispatcher function: the compiler might add different versions of a
   function for different ISA extensions. Make sure the code you might think is
   executed is actually the code which is executed.
 
+Padding
+-------
+
+With correct padding cache and TLB thrashing can be avoided. Therefore the
+number of (fluid) nodes used in the data layout is artificially increased.
+
+Currently automatic padding is active for kernels which support it. It can be
+controlled via the kernel parameter (i.e. parameter after the ``--``)
+``-pad``. Supported values are ``auto`` (default), ``no`` (to disable padding),
+or a manual padding.
+
+Automatic padding tries to avoid cache and TLB thrashing and pads for a 32
+entry (huge pages) TLB with 8 sets and a 512 set (L2) cache. This reflects the
+parameters of current Intel based processors.
+
+Manual padding is done via a padding string and has the format
+``mod_1+offset_1(,mod_n+offset_n)``, which specifies numbers of bytes.
+SoA data layouts can exhibit TLB thrashing. Therefore we want to distribute the
+19 pages with one lattice (36 with two lattices) we are concurrently accessing
+over as much sets in the TLB as possible.
+This is controlled by the distance between the accessed pages, which is the
+number of (fluid) nodes in between them and can be adjusted by adding further
+(fluid) nodes.
+We want the distance d (in bytes) between two accessed pages to be e.g. 
+**d % (PAGE_SIZE * TLB_SETS) = PAGE_SIZE**. 
+This would distribute the pages evenly over the sets. Hereby **PAGE_SIZE * TLB_SETS**
+would be our ``mod_1`` and **PAGE_SIZE** (after the =) our ``offset_1``.
+Measurements show that with only a quarter of half of a page size as offset
+higher performance is achieved, which is done by automatic padding.
+On top of this padding more paddings can be added. They are just added to the
+padding string and are separated by commas.
+
+A zero modulus in the padding string has a special meaning. Here the
+corresponding offset is just added to the number of nodes. A padding string
+like ``-pad 0+16`` would at a static padding of two nodes (one node = 8 b).
+
+
+Geometries
+==========
+
+TODO: supported geometries: channel, pipe, blocks
+
+
+Results
+=======
+
+TODO
+
+
+Licence
+=======
+
+The Lattice Boltzmann Benchmark Kernels are licensed under GPLv3.
+
 
 Acknowledgements
 ================
index 27f91165d8c33d44c19a6503973e35f5aa9ecba8..ae610824182c7a3efd2d742c26ccb81a70edcb9c 100644 (file)
@@ -44,8 +44,6 @@ static inline double Time()
 }
 
 
-#define TOOL_NAME "lbmbenchk"
-
 #define STRINGIFYX(x)   #x
 #define STRINGIFY(x)    STRINGIFYX(x)
 
@@ -117,17 +115,17 @@ static inline double Time()
         } while (0)
 
     #define Print(formatString, ...) \
-        fprintf(stdout, SHC_MAGENTA "[" TOOL_NAME "] " SHC_NC formatString, ##__VA_ARGS__)
+        fprintf(stdout, SHC_MAGENTA formatString SHC_NC, ##__VA_ARGS__)
 
     #define Warning(formatString, ...) \
-        fprintf(stdout, "[" TOOL_NAME "] WARNING: " formatString, ##__VA_ARGS__)
+        fprintf(stdout, SHC_BROWN "WARNING: " SHC_NC formatString, ##__VA_ARGS__)
 
     #define Error(formatString, ...) \
-        fprintf(stderr, SHC_RED "[" TOOL_NAME "] ERROR: " formatString SHC_NC , ##__VA_ARGS__)
-
+        fprintf(stderr, SHC_RED "ERROR: " formatString SHC_NC , ##__VA_ARGS__)
+/*
     #define DebugPrint(formatString, ...) \
-        fprintf(stderr, "[" TOOL_NAME "] DEBUG: " formatString, ##__VA_ARGS__)
-
+        fprintf(stderr, "DEBUG: " formatString, ##__VA_ARGS__)
+*/
     #ifndef NO_SHELL_COLORS
 
         // or "\e"
index 8203d2a9682da774f4b0f4878ce2bdb3b90afcbd..bd6d43c3ac374a04f9a7dd2a60dd1f22c288ccc3 100644 (file)
@@ -28,6 +28,7 @@
 
 #include "Memory.h"
 #include "Vtk.h"
+#include "LikwidIf.h"
 
 #include <inttypes.h>
 #include <math.h>
@@ -98,6 +99,8 @@ void FNAME(D3Q19Kernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd
 
        for (int iter = 0; iter < maxIterations; ++iter) {
 
+               X_LIKWID_START("os");
+
                #ifdef _OPENMP
                #pragma omp parallel for collapse(3) default(none) \
                                shared(gDims,src, dst, w_0, w_1, w_2, omegaEven, omegaOdd, \
@@ -112,9 +115,9 @@ void FNAME(D3Q19Kernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd
                                        evenPart, oddPart, w_1_indep, w_2_indep)
                #endif
 
-               for (int z = oZ; z < nZ + oZ; ++z) {
+               for (int x = oX; x < nX + oX; ++x) {
                        for (int y = oY; y < nY + oY; ++y) {
-                               for (int x = oX; x < nX + oX; ++x) {
+                               for (int z = oZ; z < nZ + oZ; ++z) {
                                        #define I(x, y, z, dir) P_INDEX_5(gDims, (x), (y), (z), (dir))
 
 #ifdef PROP_MODEL_PUSH
@@ -306,6 +309,9 @@ void FNAME(D3Q19Kernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd
                        }
                } // z, y, x (from inner to outer)
 
+               // Stop counters before bounce back. Else computing loop balance will be incorrect.
+               X_LIKWID_STOP("os");
+
                // Fixup bounce back PDFs.
                #ifdef _OPENMP
                #pragma omp parallel for default(none) \
@@ -429,21 +435,8 @@ void FNAME(D3Q19BlkKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData *
 
        for (int iter = 0; iter < maxIterations; ++iter) {
 
-               // #ifdef _OPENMP --> add line continuation
-               // #pragma omp parallel for collapse(3) default(none)
-               //              shared(gDims,src, dst, w_0, w_1, w_2, omegaEven, omegaOdd,
-               //              w_1_x3, w_2_x3, w_1_nine_half, w_2_nine_half, cd,
-               //              oX, oY, oZ, nX, nY, nZ, blk)
-               //              private(ux, uy, uz, ui, dens, dir_indep_trm,
-               //                      pdf_C,
-               //                      pdf_N, pdf_E, pdf_S, pdf_W,
-               //                      pdf_NE, pdf_SE, pdf_SW, pdf_NW,
-               //                      pdf_T, pdf_TN, pdf_TE, pdf_TS, pdf_TW,
-               //                      pdf_B, pdf_BN, pdf_BE, pdf_BS, pdf_BW,
-               //                      evenPart, oddPart, w_1_indep, w_2_indep)
-               // #endif
                #ifdef _OPENMP
-               #pragma omp parallel for default(none) \
+               #pragma omp parallel default(none) \
                                shared(gDims,src, dst, w_0, w_1, w_2, omegaEven, omegaOdd, \
                                w_1_x3, w_2_x3, w_1_nine_half, w_2_nine_half, cd, \
                                oX, oY, oZ, nX, nY, nZ, blk, nThreads) \
@@ -455,16 +448,18 @@ void FNAME(D3Q19BlkKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData *
                                        pdf_B, pdf_BN, pdf_BE, pdf_BS, pdf_BW, \
                                        evenPart, oddPart, w_1_indep, w_2_indep)
                #endif
+               {
+                       X_LIKWID_START("blk-os");
 
-               for (int i = 0; i < nThreads; ++i) {
+                       int threadId = omp_get_thread_num();
 
-                       int threadStartX = nX / nThreads * i;
-                       int threadEndX   = nX / nThreads * (i + 1);
+                       int threadStartX = nX / nThreads * threadId;
+                       int threadEndX   = nX / nThreads * (threadId + 1);
 
                        if (nX % nThreads > 0) {
-                               if (nX % nThreads > i) {
-                                       threadStartX += i;
-                                       threadEndX   += i + 1;
+                               if (nX % nThreads > threadId) {
+                                       threadStartX += threadId;
+                                       threadEndX   += threadId + 1;
                                }
                                else {
                                        threadStartX += nX % nThreads;
@@ -472,24 +467,18 @@ void FNAME(D3Q19BlkKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData *
                                }
                        }
 
-               // for (int z = oZ; z < nZ + oZ; ++z) {
-               //      for (int y = oY; y < nY + oY; ++y) {
-               //              for (int x = oX; x < nX + oX; ++x) {
-               for (int bZ = oZ; bZ < nZ + oZ; bZ += blk[2]) {
+               for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) {
                        for (int bY = oY; bY < nY + oY; bY += blk[1]) {
-                               for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) {
-                               // for (int bX = oX; bX < nX + oX; bX += blk[0]) {
+                               for (int bZ = oZ; bZ < nZ + oZ; bZ += blk[2]) {
 
                                        // Must do everything here, else it would break collapse.
                                        int eZ = MIN(bZ + blk[2], nZ + oZ);
                                        int eY = MIN(bY + blk[1], nY + oY);
                                        int eX = MIN(bX + blk[0], threadEndX + oX);
 
-//                                     printf("%d: %d-%d  %d-%d  %d-%d  %d - %d\n", omp_get_thread_num(), bZ, eZ, bY, eY, bX, eX, threadStartX, threadEndX);
-
-                                       for (int z = bZ; z < eZ; ++z) {
+                                       for (int x = bX; x < eX; ++x) {
                                                for (int y = bY; y < eY; ++y) {
-                                                       for (int x = bX; x < eX; ++x) {
+                                                       for (int z = bZ; z < eZ; ++z) {
 
                                        #define I(x, y, z, dir) P_INDEX_5(gDims, (x), (y), (z), (dir))
 
@@ -497,7 +486,6 @@ void FNAME(D3Q19BlkKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData *
 
                                        // Load PDFs of local cell: pdf_N = src[I(x, y, z, D3Q19_N)]; ...
                                        #define X(name, idx, idxinv, _x, _y, _z)        JOIN(pdf_,name) = src[I(x, y, z, idx)];
-                                       //if (isnan(JOIN(pdf_,name))) { printf("iter: %d %d %d %d %d %s nan\n", iter, x-oX, y-oY, z-oZ, idx, D3Q19_NAMES[idx]); exit(1);}
                                        D3Q19_LIST
                                        #undef X
 
@@ -505,7 +493,6 @@ void FNAME(D3Q19BlkKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData *
 
                                        // Load PDFs of local cell: pdf_N = src[I(x, y, z, D3Q19_N)]; ...
                                        #define X(name, idx, idxinv, _x, _y, _z)        JOIN(pdf_,name) = src[I(x - _x, y - _y, z - _z, idx)];
-                                       //if (isnan(JOIN(pdf_,name))) { printf("iter: %d %d %d %d %d %s nan\n", iter, x-oX, y-oY, z-oZ, idx, D3Q19_NAMES[idx]); exit(1);}
                                        D3Q19_LIST
                                        #undef X
 
@@ -525,11 +512,11 @@ void FNAME(D3Q19BlkKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData *
                                        } else {
                                        #endif
                                                ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE -
-                                                       pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW;
+                                                        pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW;
                                                uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN -
-                                                       pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS;
+                                                        pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS;
                                                uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS -
-                                                       pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS;
+                                                        pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS;
                                        #ifdef LID_DRIVEN_CAVITY
                                        }
 
@@ -685,7 +672,10 @@ void FNAME(D3Q19BlkKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData *
                        }
                } // z, y, x (from inner to outer)
 
-               } // loop over threads
+                       X_LIKWID_STOP("blk-os");
+               } // parallel region
+
+               // Stop counters before bounce back. Else computing loop balance will be incorrect.
 
                // Fixup bounce back PDFs.
                #ifdef _OPENMP
diff --git a/src/BenchKernelD3Q19Aa.c b/src/BenchKernelD3Q19Aa.c
new file mode 100644 (file)
index 0000000..87a9e33
--- /dev/null
@@ -0,0 +1,529 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#include "BenchKernelD3Q19AaCommon.h"
+
+#include "Memory.h"
+#include "Vtk.h"
+#include "LikwidIf.h"
+
+#include <inttypes.h>
+#include <math.h>
+
+#ifdef _OPENMP
+       #include <omp.h>
+#endif
+
+void FNAME(D3Q19AaKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd)
+{
+       Assert(ld != NULL);
+       Assert(kernelData != NULL);
+       Assert(cd != NULL);
+
+       Assert(cd->Omega > 0.0);
+       Assert(cd->Omega < 2.0);
+
+       KernelData * kd = (KernelData *)kernelData;
+
+
+       int nX = ld->Dims[0];
+       int nY = ld->Dims[1];
+       int nZ = ld->Dims[2];
+
+       int * gDims = kd->GlobalDims;
+
+       int oX = kd->Offsets[0];
+       int oY = kd->Offsets[1];
+       int oZ = kd->Offsets[2];
+
+       KernelDataAa * kda = KDA(kd);
+
+       int blk[3];
+       blk[0] = kda->Blk[0];
+       blk[1] = kda->Blk[1];
+       blk[2] = kda->Blk[2];
+
+       PdfT omega = cd->Omega;
+       PdfT omegaEven = omega;
+       PdfT magicParam = 1.0 / 12.0;
+       //  1/4: best stability;
+       // 1/12: removes third-order advection error (best advection);
+       //  1/6: removes fourth-order diffusion error (best diffusion);
+       // 3/16: exact location of bounce back for poiseuille flow
+
+       PdfT omegaOdd = 1.0/( 0.5 + magicParam/(1.0/omega - 0.5) );
+
+       PdfT evenPart = 0.0;
+       PdfT oddPart = 0.0;
+       PdfT dir_indep_trm = 0.0;
+
+       PdfT w_0 = 1.0 /  3.0;
+       PdfT w_1 = 1.0 / 18.0;
+       PdfT w_2 = 1.0 / 36.0;
+
+       PdfT w_1_x3 = w_1 * 3.0;        PdfT w_1_nine_half = w_1 * 9.0/2.0;     PdfT w_1_indep = 0.0;
+       PdfT w_2_x3 = w_2 * 3.0;        PdfT w_2_nine_half = w_2 * 9.0/2.0;     PdfT w_2_indep = 0.0;
+
+       PdfT ux, uy, uz, ui;
+       PdfT dens;
+
+       // Declare pdf_N, pdf_E, pdf_S, pdf_W, ...
+       #define X(name, idx, idxinv, x, y, z)   PdfT JOIN(pdf_,name);
+       D3Q19_LIST
+       #undef X
+
+       PdfT * src = kd->Pdfs[0];
+
+       int maxIterations = cd->MaxIterations;
+
+       #ifdef VTK_OUTPUT
+               if (cd->VtkOutput) {
+                       kd->PdfsActive = src;
+                       VtkWrite(ld, kd, cd, -1);
+               }
+       #endif
+
+       #ifdef STATISTICS
+               kd->PdfsActive = src;
+               KernelStatistics(kd, ld, cd, 0);
+       #endif
+
+
+       int nThreads = 1;
+
+       #ifdef _OPENMP
+       nThreads = omp_get_max_threads();
+       #endif
+
+       for (int iter = 0; iter < maxIterations; iter += 2) {
+
+               // --------------------------------------------------------------------
+               // even time step
+
+               X_LIKWID_START("aa-even");
+
+               // {{{
+
+               #ifdef _OPENMP
+               #pragma omp parallel for default(none) \
+                               shared(gDims,src, w_0, w_1, w_2, omegaEven, omegaOdd, \
+                               w_1_x3, w_2_x3, w_1_nine_half, w_2_nine_half, cd, \
+                               oX, oY, oZ, nX, nY, nZ, blk, nThreads, ld) \
+                               private(ux, uy, uz, ui, dens, dir_indep_trm, \
+                                       pdf_C, \
+                                       pdf_N, pdf_E, pdf_S, pdf_W, \
+                                       pdf_NE, pdf_SE, pdf_SW, pdf_NW, \
+                                       pdf_T, pdf_TN, pdf_TE, pdf_TS, pdf_TW, \
+                                       pdf_B, pdf_BN, pdf_BE, pdf_BS, pdf_BW, \
+                                       evenPart, oddPart, w_1_indep, w_2_indep)
+               #endif
+
+               for (int i = 0; i < nThreads; ++i) {
+
+                       int threadStartX = nX / nThreads * i;
+                       int threadEndX   = nX / nThreads * (i + 1);
+
+                       if (nX % nThreads > 0) {
+                               if (nX % nThreads > i) {
+                                       threadStartX += i;
+                                       threadEndX   += i + 1;
+                               }
+                               else {
+                                       threadStartX += nX % nThreads;
+                                       threadEndX   += nX % nThreads;
+                               }
+                       }
+
+                       for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) {
+                       for (int bY = oY; bY < nY + oY; bY += blk[1]) {
+                       for (int bZ = oZ; bZ < nZ + oZ; bZ += blk[2]) {
+
+                               int eX = MIN(bX + blk[0], threadEndX + oX);
+                               int eY = MIN(bY + blk[1], nY + oY);
+                               int eZ = MIN(bZ + blk[2], nZ + oZ);
+
+                               // printf("%d: %d-%d  %d-%d  %d-%d  %d - %d\n", omp_get_thread_num(), bZ, eZ, bY, eY, bX, eX, threadStartX, threadEndX);
+
+                               for (int x = bX; x < eX; ++x) {
+                               for (int y = bY; y < eY; ++y) {
+                               for (int z = bZ; z < eZ; ++z) {
+
+
+                                       if (ld->Lattice[L_INDEX_4(ld->Dims, x - oX, y - oY, z - oZ)] == LAT_CELL_OBSTACLE) {
+                                               continue;
+                                       }
+
+                                       #define I(x, y, z, dir) P_INDEX_5(gDims, (x), (y), (z), (dir))
+
+
+                                       // Load PDFs of local cell: pdf_N = src[I(x, y, z, D3Q19_N)]; ...
+                                       #define X(name, idx, idxinv, _x, _y, _z)        JOIN(pdf_,name) = src[I(x, y, z, idx)];
+                                       D3Q19_LIST
+                                       #undef X
+
+// #define LID_DRIVEN_CAVITY
+
+#ifdef LID_DRIVEN_CAVITY
+
+                                       if (z == nZ - 4 + oZ && x > 3 + oX && x < (nX - 4 + oX) && y > 3 + oY  && y < (nY - 4 + oY)) {
+                                               ux = 0.1 * 0.577;
+                                               uy = 0.0;
+                                               uz = 0.0;
+
+                                       } else {
+       #endif
+                                               ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE -
+                                                        pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW;
+                                               uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN -
+                                                        pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS;
+                                               uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS -
+                                                        pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS;
+       #ifdef LID_DRIVEN_CAVITY
+                                       }
+       #endif
+
+                                       dens = pdf_C +
+                                                       pdf_N  + pdf_E  + pdf_S  + pdf_W  +
+                                                       pdf_NE + pdf_SE + pdf_SW + pdf_NW +
+                                                       pdf_T  + pdf_TN + pdf_TE + pdf_TS + pdf_TW +
+                                                       pdf_B  + pdf_BN + pdf_BE + pdf_BS + pdf_BW;
+
+                                       dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz)*3.0/2.0;
+
+                                       // direction: w_0
+                                       src[I(x,   y,   z, D3Q19_C)]  = pdf_C - omegaEven*(pdf_C - w_0*dir_indep_trm);
+
+                                       // direction: w_1
+                                       w_1_indep = w_1*dir_indep_trm;
+
+                                       ui = uy;
+                                       evenPart = omegaEven*( 0.5*(pdf_N + pdf_S) - ui*ui*w_1_nine_half - w_1_indep );
+                                       oddPart = omegaOdd*(0.5*(pdf_N - pdf_S) - ui*w_1_x3 );
+                                       src[I(x, y, z, D3Q19_S)]  = pdf_N - evenPart - oddPart;
+                                       src[I(x, y, z, D3Q19_N)]  = pdf_S - evenPart + oddPart;
+
+                                       ui = ux;
+                                       evenPart = omegaEven*( 0.5*(pdf_E + pdf_W) - ui*ui*w_1_nine_half - w_1_indep );
+                                       oddPart = omegaOdd*(0.5*(pdf_E - pdf_W) - ui*w_1_x3 );
+                                       src[I(x, y, z, D3Q19_W)]  = pdf_E - evenPart - oddPart;
+                                       src[I(x, y, z, D3Q19_E)]  = pdf_W - evenPart + oddPart;
+
+                                       ui = uz;
+                                       evenPart = omegaEven*( 0.5*(pdf_T + pdf_B) - ui*ui*w_1_nine_half - w_1_indep );
+                                       oddPart = omegaOdd*(0.5*(pdf_T - pdf_B) - ui*w_1_x3 );
+                                       src[I(x, y, z, D3Q19_B)]  = pdf_T - evenPart - oddPart;
+                                       src[I(x, y, z, D3Q19_T)]  = pdf_B - evenPart + oddPart;
+
+                                       // direction: w_2
+                                       w_2_indep = w_2*dir_indep_trm;
+
+                                       ui = -ux + uy;
+                                       evenPart = omegaEven*( 0.5*(pdf_NW + pdf_SE) - ui*ui*w_2_nine_half - w_2_indep );
+                                       oddPart = omegaOdd*(0.5*(pdf_NW - pdf_SE) - ui*w_2_x3 );
+                                       src[I(x, y, z, D3Q19_SE)] = pdf_NW - evenPart - oddPart;
+                                       src[I(x, y, z, D3Q19_NW)] = pdf_SE - evenPart + oddPart;
+
+                                       ui = ux + uy;
+                                       evenPart = omegaEven*( 0.5*(pdf_NE + pdf_SW) - ui*ui*w_2_nine_half - w_2_indep );
+                                       oddPart = omegaOdd*(0.5*(pdf_NE - pdf_SW) - ui*w_2_x3 );
+                                       src[I(x, y, z, D3Q19_SW)] = pdf_NE - evenPart - oddPart;
+                                       src[I(x, y, z, D3Q19_NE)] = pdf_SW - evenPart + oddPart;
+
+                                       ui = -ux + uz;
+                                       evenPart = omegaEven*( 0.5*(pdf_TW + pdf_BE) - ui*ui*w_2_nine_half - w_2_indep );
+                                       oddPart = omegaOdd*(0.5*(pdf_TW - pdf_BE) - ui*w_2_x3 );
+                                       src[I(x, y, z, D3Q19_BE)] = pdf_TW - evenPart - oddPart;
+                                       src[I(x, y, z, D3Q19_TW)] = pdf_BE - evenPart + oddPart;
+
+                                       ui = ux + uz;
+                                       evenPart = omegaEven*( 0.5*(pdf_TE + pdf_BW) - ui*ui*w_2_nine_half - w_2_indep );
+                                       oddPart = omegaOdd*(0.5*(pdf_TE - pdf_BW) - ui*w_2_x3 );
+                                       src[I(x, y, z, D3Q19_BW)] = pdf_TE - evenPart - oddPart;
+                                       src[I(x, y, z, D3Q19_TE)] = pdf_BW - evenPart + oddPart;
+
+                                       ui = -uy + uz;
+                                       evenPart = omegaEven*( 0.5*(pdf_TS + pdf_BN) - ui*ui*w_2_nine_half - w_2_indep );
+                                       oddPart = omegaOdd*(0.5*(pdf_TS - pdf_BN) - ui*w_2_x3 );
+                                       src[I(x, y, z, D3Q19_BN)] = pdf_TS - evenPart - oddPart;
+                                       src[I(x, y, z, D3Q19_TS)] = pdf_BN - evenPart + oddPart;
+
+                                       ui = uy + uz;
+                                       evenPart = omegaEven*( 0.5*(pdf_TN + pdf_BS) - ui*ui*w_2_nine_half - w_2_indep );
+                                       oddPart = omegaOdd*(0.5*(pdf_TN - pdf_BS) - ui*w_2_x3 );
+                                       src[I(x, y, z, D3Q19_BS)] = pdf_TN - evenPart - oddPart;
+                                       src[I(x, y, z, D3Q19_TN)] = pdf_BS - evenPart + oddPart;
+
+                                       #undef I
+                               } } } // z, y, x (from inner to outer)
+                       } } } // z, y, x (from inner to outer)
+
+               } // loop over threads
+
+               // }}}
+
+               X_LIKWID_STOP("aa-even");
+
+               #ifdef STATISTICS
+               kd->PdfsActive = src;
+               KernelStatistics(kd, ld, cd, iter);
+               #endif
+
+               // Fixup bounce back PDFs.
+               #ifdef _OPENMP
+               #pragma omp parallel for default(none) \
+                               shared(kd, src)
+               #endif
+               for (int i = 0; i < kd->nBounceBackPdfs; ++i) {
+                       src[kd->BounceBackPdfsSrc[i]] = src[kd->BounceBackPdfsDst[i]];
+               }
+
+               // save current iteration
+               kda->Iteration = iter;
+
+               #ifdef VERIFICATION
+               kd->PdfsActive = src;
+               KernelAddBodyForce(kd, ld, cd);
+               #endif
+
+               #ifdef VTK_OUTPUT
+               if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) {
+                       kd->PdfsActive = src;
+                       VtkWrite(ld, kd, cd, iter);
+               }
+               #endif
+
+               #ifdef STATISTICS
+               kd->PdfsActive = src;
+               KernelStatistics(kd, ld, cd, iter);
+               #endif
+
+               // --------------------------------------------------------------------
+               // odd time step
+
+
+               X_LIKWID_START("aa-odd");
+
+               // {{{
+
+               #ifdef _OPENMP
+               #pragma omp parallel for default(none) \
+                               shared(gDims,src, w_0, w_1, w_2, omegaEven, omegaOdd, \
+                               w_1_x3, w_2_x3, w_1_nine_half, w_2_nine_half, cd, \
+                               oX, oY, oZ, nX, nY, nZ, blk, nThreads) \
+                               private(ux, uy, uz, ui, dens, dir_indep_trm, \
+                                       pdf_C, \
+                                       pdf_N, pdf_E, pdf_S, pdf_W, \
+                                       pdf_NE, pdf_SE, pdf_SW, pdf_NW, \
+                                       pdf_T, pdf_TN, pdf_TE, pdf_TS, pdf_TW, \
+                                       pdf_B, pdf_BN, pdf_BE, pdf_BS, pdf_BW, \
+                                       evenPart, oddPart, w_1_indep, w_2_indep)
+               #endif
+
+               for (int i = 0; i < nThreads; ++i) {
+
+                       int threadStartX = nX / nThreads * i;
+                       int threadEndX   = nX / nThreads * (i + 1);
+
+                       if (nX % nThreads > 0) {
+                               if (nX % nThreads > i) {
+                                       threadStartX += i;
+                                       threadEndX   += i + 1;
+                               }
+                               else {
+                                       threadStartX += nX % nThreads;
+                                       threadEndX   += nX % nThreads;
+                               }
+                       }
+
+                       for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) {
+                       for (int bY = oY; bY < nY + oY; bY += blk[1]) {
+                       for (int bZ = oZ; bZ < nZ + oZ; bZ += blk[2]) {
+
+                               // Must do everything here, else it would break collapse.
+                               int eZ = MIN(bZ + blk[2], nZ + oZ);
+                               int eY = MIN(bY + blk[1], nY + oY);
+                               int eX = MIN(bX + blk[0], threadEndX + oX);
+
+                               for (int x = bX; x < eX; ++x) {
+                               for (int y = bY; y < eY; ++y) {
+                               for (int z = bZ; z < eZ; ++z) {
+
+                                       #define I(x, y, z, dir) P_INDEX_5(gDims, (x), (y), (z), (dir))
+
+                                       // Load PDFs of local cell: pdf_N = src[I(x, y, z, D3Q19_N)]; ...
+                                       #define X(name, idx, idxinv, _x, _y, _z)        JOIN(pdf_,name) = src[I(x - _x, y - _y, z - _z, idxinv)];
+                                       D3Q19_LIST
+                                       #undef X
+
+
+// #define LID_DRIVEN_CAVITY
+
+#ifdef LID_DRIVEN_CAVITY
+
+                                       if (z == nZ - 4 + oZ && x > 3 + oX && x < (nX - 4 + oX) && y > 3 + oY  && y < (nY - 4 + oY)) {
+                                               ux = 0.1 * 0.577;
+                                               uy = 0.0;
+                                               uz = 0.0;
+
+                                       } else {
+#endif
+                                               ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE -
+                                                        pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW;
+                                               uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN -
+                                                        pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS;
+                                               uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS -
+                                                        pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS;
+#ifdef LID_DRIVEN_CAVITY
+                                       }
+#endif
+
+                                       dens = pdf_C +
+                                                       pdf_N  + pdf_E  + pdf_S  + pdf_W  +
+                                                       pdf_NE + pdf_SE + pdf_SW + pdf_NW +
+                                                       pdf_T  + pdf_TN + pdf_TE + pdf_TS + pdf_TW +
+                                                       pdf_B  + pdf_BN + pdf_BE + pdf_BS + pdf_BW;
+
+                                       dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz)*3.0/2.0;
+
+                                       // direction: w_0
+                                       src[I(x,   y,   z, D3Q19_C)]  = pdf_C - omegaEven*(pdf_C - w_0*dir_indep_trm);
+
+                                       // direction: w_1
+                                       w_1_indep = w_1*dir_indep_trm;
+
+                                       ui = uy;
+                                       evenPart = omegaEven*( 0.5*(pdf_N + pdf_S) - ui*ui*w_1_nine_half - w_1_indep );
+                                       oddPart = omegaOdd*(0.5*(pdf_N - pdf_S) - ui*w_1_x3 );
+                                       src[I(x, y + 1,   z, D3Q19_N)]  = pdf_N - evenPart - oddPart;
+                                       src[I(x, y - 1,   z, D3Q19_S)]  = pdf_S - evenPart + oddPart;
+
+                                       ui = ux;
+                                       evenPart = omegaEven*( 0.5*(pdf_E + pdf_W) - ui*ui*w_1_nine_half - w_1_indep );
+                                       oddPart = omegaOdd*(0.5*(pdf_E - pdf_W) - ui*w_1_x3 );
+                                       src[I(x + 1,   y,   z, D3Q19_E)]  = pdf_E - evenPart - oddPart;
+                                       src[I(x - 1,   y,   z, D3Q19_W)]  = pdf_W - evenPart + oddPart;
+
+                                       ui = uz;
+                                       evenPart = omegaEven*( 0.5*(pdf_T + pdf_B) - ui*ui*w_1_nine_half - w_1_indep );
+                                       oddPart = omegaOdd*(0.5*(pdf_T - pdf_B) - ui*w_1_x3 );
+                                       src[I(x,   y, z + 1, D3Q19_T)]  = pdf_T - evenPart - oddPart;
+                                       src[I(x,   y, z - 1, D3Q19_B)]  = pdf_B - evenPart + oddPart;
+
+                                       // direction: w_2
+                                       w_2_indep = w_2*dir_indep_trm;
+
+                                       ui = -ux + uy;
+                                       evenPart = omegaEven*( 0.5*(pdf_NW + pdf_SE) - ui*ui*w_2_nine_half - w_2_indep );
+                                       oddPart = omegaOdd*(0.5*(pdf_NW - pdf_SE) - ui*w_2_x3 );
+                                       src[I(x - 1, y + 1,   z, D3Q19_NW)] = pdf_NW - evenPart - oddPart;
+                                       src[I(x + 1, y - 1,   z, D3Q19_SE)] = pdf_SE - evenPart + oddPart;
+
+                                       ui = ux + uy;
+                                       evenPart = omegaEven*( 0.5*(pdf_NE + pdf_SW) - ui*ui*w_2_nine_half - w_2_indep );
+                                       oddPart = omegaOdd*(0.5*(pdf_NE - pdf_SW) - ui*w_2_x3 );
+                                       src[I(x + 1, y + 1,   z, D3Q19_NE)] = pdf_NE - evenPart - oddPart;
+                                       src[I(x - 1, y - 1,   z, D3Q19_SW)] = pdf_SW - evenPart + oddPart;
+
+                                       ui = -ux + uz;
+                                       evenPart = omegaEven*( 0.5*(pdf_TW + pdf_BE) - ui*ui*w_2_nine_half - w_2_indep );
+                                       oddPart = omegaOdd*(0.5*(pdf_TW - pdf_BE) - ui*w_2_x3 );
+                                       src[I(x - 1,   y, z + 1, D3Q19_TW)] = pdf_TW - evenPart - oddPart;
+                                       src[I(x + 1,   y, z - 1, D3Q19_BE)] = pdf_BE - evenPart + oddPart;
+
+                                       ui = ux + uz;
+                                       evenPart = omegaEven*( 0.5*(pdf_TE + pdf_BW) - ui*ui*w_2_nine_half - w_2_indep );
+                                       oddPart = omegaOdd*(0.5*(pdf_TE - pdf_BW) - ui*w_2_x3 );
+                                       src[I(x + 1,   y, z + 1, D3Q19_TE)] = pdf_TE - evenPart - oddPart;
+                                       src[I(x - 1,   y, z - 1, D3Q19_BW)] = pdf_BW - evenPart + oddPart;
+
+                                       ui = -uy + uz;
+                                       evenPart = omegaEven*( 0.5*(pdf_TS + pdf_BN) - ui*ui*w_2_nine_half - w_2_indep );
+                                       oddPart = omegaOdd*(0.5*(pdf_TS - pdf_BN) - ui*w_2_x3 );
+                                       src[I(x, y - 1, z + 1, D3Q19_TS)] = pdf_TS - evenPart - oddPart;
+                                       src[I(x, y + 1, z - 1, D3Q19_BN)] = pdf_BN - evenPart + oddPart;
+
+                                       ui = uy + uz;
+                                       evenPart = omegaEven*( 0.5*(pdf_TN + pdf_BS) - ui*ui*w_2_nine_half - w_2_indep );
+                                       oddPart = omegaOdd*(0.5*(pdf_TN - pdf_BS) - ui*w_2_x3 );
+                                       src[I(x, y + 1, z + 1, D3Q19_TN)] = pdf_TN - evenPart - oddPart;
+                                       src[I(x, y - 1, z - 1, D3Q19_BS)] = pdf_BS - evenPart + oddPart;
+
+
+                                       #undef I
+                               } } } // z, y, x (from inner to outer)
+                       } } } // z, y, x (from inner to outer)
+               } // loop over threads
+
+               // }}}
+
+               // Stop counters before bounce back. Else computing loop balance will be incorrect.
+
+               X_LIKWID_STOP("aa-odd");
+
+               // Fixup bounce back PDFs.
+               #ifdef _OPENMP
+               #pragma omp parallel for default(none) \
+                               shared(kd, src)
+               #endif
+               for (int i = 0; i < kd->nBounceBackPdfs; ++i) {
+                       src[kd->BounceBackPdfsDst[i]] = src[kd->BounceBackPdfsSrc[i]];
+               }
+
+               // save current iteration
+               kda->Iteration = iter + 1;
+
+               #ifdef VERIFICATION
+               kd->PdfsActive = src;
+               KernelAddBodyForce(kd, ld, cd);
+               #endif
+
+               #ifdef VTK_OUTPUT
+               if (cd->VtkOutput && (iter + 1 % cd->VtkModulus) == 0) {
+                       kd->PdfsActive = src;
+                       VtkWrite(ld, kd, cd, iter + 1);
+               }
+               #endif
+
+               #ifdef STATISTICS
+               kd->PdfsActive = src;
+               KernelStatistics(kd, ld, cd, iter + 1);
+               #endif // }}}
+
+
+       } // for (int iter = 0; ...
+
+       #ifdef VTK_OUTPUT
+
+       if (cd->VtkOutput) {
+               kd->PdfsActive = src;
+               VtkWrite(ld, kd, cd, maxIterations);
+       }
+
+       #endif
+
+       return;
+}
+
diff --git a/src/BenchKernelD3Q19Aa.h b/src/BenchKernelD3Q19Aa.h
new file mode 100644 (file)
index 0000000..d0605d1
--- /dev/null
@@ -0,0 +1,41 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#ifndef __BENCH_KERNEL_D3Q19_AA__
+#define __BENCH_KERNEL_D3Q19_AA__
+
+#include "Kernel.h"
+
+
+void D3Q19AaInit_AaSoA(LatticeDesc * ld, KernelData ** kernelData, Parameters * params);
+void D3Q19AaInit_AaAoS(LatticeDesc * ld, KernelData ** kernelData, Parameters * params);
+
+void D3Q19AaDeinit_AaSoA(LatticeDesc * ld, KernelData ** kernelData);
+void D3Q19AaDeinit_AaAoS(LatticeDesc * ld, KernelData ** kernelData);
+
+
+
+#endif // __BENCH_KERNEL_D3Q19__
diff --git a/src/BenchKernelD3Q19AaCommon.c b/src/BenchKernelD3Q19AaCommon.c
new file mode 100644 (file)
index 0000000..fbdcbc1
--- /dev/null
@@ -0,0 +1,651 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#include "BenchKernelD3Q19AaCommon.h"
+
+#include "Memory.h"
+#include "Vtk.h"
+
+#include <inttypes.h>
+#include <math.h>
+
+#ifdef _OPENMP
+       #include <omp.h>
+#endif
+
+// Forward definition.
+void FNAME(D3Q19AaKernel)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd);
+
+void FNAME(D3Q19AaBlkKernel)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd);
+
+
+
+static void FNAME(BcGetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT * pdf)
+{
+       Assert(kd != NULL);
+       Assert(kd->PdfsActive != NULL);
+       Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+       Assert(pdf != NULL);
+
+       Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
+       Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
+       Assert(dir >= 0); Assert(dir < N_D3Q19);
+
+       KernelDataAa * kda = KDA(kd);
+
+       int oX = kd->Offsets[0];
+       int oY = kd->Offsets[1];
+       int oZ = kd->Offsets[2];
+
+       if (kda->Iteration % 2 == 0) {
+               // Pdfs are stored inverse, local PDFs are located in remote nodes
+               int nx = x - D3Q19_X[dir];
+               int ny = y - D3Q19_Y[dir];
+               int nz = z - D3Q19_Z[dir];
+
+               #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
+               *pdf = kd->PdfsActive[I(nx + oX, ny + oY, nz + oZ, D3Q19_INV[dir])];
+               #undef I
+       }
+       else {
+               int nx = x;
+               int ny = y;
+               int nz = z;
+
+               #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
+               *pdf = kd->PdfsActive[I(nx + oX, ny + oY, nz + oZ, dir)];
+               #undef I
+       }
+
+
+       return;
+}
+
+static void FNAME(BcSetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT pdf)
+{
+       Assert(kd != NULL);
+       Assert(kd->PdfsActive != NULL);
+       Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+
+       Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
+       Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
+       Assert(dir >= 0); Assert(dir < N_D3Q19);
+
+       KernelDataAa * kda = KDA(kd);
+
+       int oX = kd->Offsets[0];
+       int oY = kd->Offsets[1];
+       int oZ = kd->Offsets[2];
+
+       if (kda->Iteration % 2 == 0) {
+               // Pdfs are stored inverse, local PDFs are located in remote nodes
+               int nx = x - D3Q19_X[dir];
+               int ny = y - D3Q19_Y[dir];
+               int nz = z - D3Q19_Z[dir];
+
+               #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
+               pdf = kd->PdfsActive[I(nx + oX, ny + oY, nz + oZ, D3Q19_INV[dir])] = pdf;
+               #undef I
+       }
+       else {
+               int nx = x;
+               int ny = y;
+               int nz = z;
+
+               #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
+               kd->PdfsActive[I(nx + oX, ny + oY, nz + oZ, dir)] = pdf;
+               #undef I
+       }
+
+       return;
+}
+
+
+static void FNAME(GetNode)(KernelData * kd, int x, int y, int z, PdfT * pdfs)
+{
+       Assert(kd != NULL);
+       Assert(kd->PdfsActive != NULL);
+       Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+       Assert(pdfs != NULL);
+
+       Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
+       Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
+
+       KernelDataAa * kda = KDA(kd);
+
+       int oX = kd->Offsets[0];
+       int oY = kd->Offsets[1];
+       int oZ = kd->Offsets[2];
+
+
+       if (kda->Iteration % 2 == 0) {
+               // Pdfs are stored inverse, local PDFs are located in remote nodes
+
+               #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
+               #define X(name, idx, idxinv, _x, _y, _z)        pdfs[idx] = kd->PdfsActive[I(x + oX - _x, y + oY - _y, z + oZ - _z, D3Q19_INV[idx])];
+               D3Q19_LIST
+               #undef X
+               #undef I
+       }
+       else {
+               #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
+               #define X(name, idx, idxinv, _x, _y, _z)        pdfs[idx] = kd->PdfsActive[I(x + oX, y + oY, z + oZ, idx)];
+               D3Q19_LIST
+               #undef X
+               #undef I
+
+       }
+
+#if 0          // DETECT NANs
+
+       for (int d = 0; d < 19; ++d) {
+               if (isnan(pdfs[d])) {
+                       printf("%d %d %d %d nan! get node\n", x, y, z, d);
+
+                       for (int d2 = 0; d2 < 19; ++d2) {
+                               printf("%d: %e\n", d2, pdfs[d2]);
+                       }
+
+                       exit(1);
+               }
+       }
+
+#endif
+
+       return;
+}
+
+
+static void FNAME(SetNode)(KernelData * kd, int x, int y, int z, PdfT * pdfs)
+{
+       Assert(kd != NULL);
+       Assert(kd->PdfsActive != NULL);
+       Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+       Assert(pdfs != NULL);
+
+       Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
+       Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
+
+       KernelDataAa * kda = KDA(kd);
+
+       int oX = kd->Offsets[0];
+       int oY = kd->Offsets[1];
+       int oZ = kd->Offsets[2];
+
+       if (kda->Iteration % 2 == 0) {
+               // Pdfs are stored inverse, local PDFs are located in remote nodes
+
+               #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
+               #define X(name, idx, idxinv, _x, _y, _z)        kd->PdfsActive[I(x + oX - _x, y + oY - _y, z + oZ - _z, D3Q19_INV[idx])] = pdfs[idx];
+               D3Q19_LIST
+               #undef X
+               #undef I
+       }
+       else {
+               #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
+               #define X(name, idx, idxinv, _x, _y, _z)        kd->PdfsActive[I(x + oX, y + oY, z + oZ, idx)] = pdfs[idx];
+               D3Q19_LIST
+               #undef X
+               #undef I
+       }
+       return;
+}
+
+
+static void ParameterUsage()
+{
+       printf("Kernel parameters:\n");
+       printf("  [-blk <n>] [-blk-[xyz] <n>]\n");
+
+       return;
+}
+
+static void ParseParameters(Parameters * params, int * blk)
+{
+       Assert(blk != NULL);
+
+       blk[0] = 0; blk[1] = 0; blk[2] = 0;
+
+       #define ARG_IS(param)                   (!strcmp(params->KernelArgs[i], param))
+       #define NEXT_ARG_PRESENT() \
+               do { \
+                       if (i + 1 >= params->nKernelArgs) { \
+                               printf("ERROR: argument %s requires a parameter.\n", params->KernelArgs[i]); \
+                               exit(1); \
+                       } \
+               } while (0)
+
+
+       for (int i = 0; i < params->nKernelArgs; ++i) {
+               if (ARG_IS("-blk") || ARG_IS("--blk")) {
+                       NEXT_ARG_PRESENT();
+
+                       int tmp = strtol(params->KernelArgs[++i], NULL, 0);
+
+                       if (tmp < 0) {
+                               printf("ERROR: blocking parameter must be >= 0.\n");
+                               exit(1);
+                       }
+
+                       blk[0] = blk[1] = blk[2] = tmp;
+               }
+               else if (ARG_IS("-blk-x") || ARG_IS("--blk-x")) {
+                       NEXT_ARG_PRESENT();
+
+                       int tmp = strtol(params->KernelArgs[++i], NULL, 0);
+
+                       if (tmp < 0) {
+                               printf("ERROR: blocking parameter must be >= 0.\n");
+                               exit(1);
+                       }
+
+                       blk[0] = tmp;
+               }
+               else if (ARG_IS("-blk-y") || ARG_IS("--blk-y")) {
+                       NEXT_ARG_PRESENT();
+
+                       int tmp = strtol(params->KernelArgs[++i], NULL, 0);
+
+                       if (tmp < 0) {
+                               printf("ERROR: blocking parameter must be >= 0.\n");
+                               exit(1);
+                       }
+
+                       blk[1] = tmp;
+               }
+               else if (ARG_IS("-blk-z") || ARG_IS("--blk-z")) {
+                       NEXT_ARG_PRESENT();
+
+                       int tmp = strtol(params->KernelArgs[++i], NULL, 0);
+
+                       if (tmp < 0) {
+                               printf("ERROR: blocking parameter must be >= 0.\n");
+                               exit(1);
+                       }
+
+                       blk[2] = tmp;
+               }
+               else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) {
+                       ParameterUsage();
+                       exit(1);
+               }
+               else {
+                       printf("ERROR: unknown kernel parameter.\n");
+                       ParameterUsage();
+                       exit(1);
+               }
+       }
+
+       #undef ARG_IS
+       #undef NEXT_ARG_PRESENT
+
+       return;
+}
+
+
+void FNAME(D3Q19AaInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
+{
+       KernelDataAa * kda = NULL;
+       MemAlloc((void **)&kda, sizeof(KernelDataAa));
+
+       kda->Blk[0] = 0; kda->Blk[1] = 0; kda->Blk[2] = 0;
+       kda->Iteration = -1;
+
+       KernelData * kd = &kda->kd;
+       *kernelData = kd;
+
+       kd->nObstIndices = ld->nObst;
+
+       // Ajust the dimensions according to padding, if used.
+       kd->Dims[0] = ld->Dims[0];
+       kd->Dims[1] = ld->Dims[1];
+       kd->Dims[2] = ld->Dims[2];
+
+
+       int * lDims = ld->Dims;
+       int * gDims = kd->GlobalDims;
+
+       gDims[0] = lDims[0] + 2;
+       gDims[1] = lDims[1] + 2;
+       gDims[2] = lDims[2] + 2;
+
+       kd->Offsets[0] = 1;
+       kd->Offsets[1] = 1;
+       kd->Offsets[2] = 1;
+
+       int lX = lDims[0];
+       int lY = lDims[1];
+       int lZ = lDims[2];
+
+       int gX = gDims[0];
+       int gY = gDims[1];
+       int gZ = gDims[2];
+
+       int oX = kd->Offsets[0];
+       int oY = kd->Offsets[1];
+       int oZ = kd->Offsets[2];
+
+       int blk[3] = { 0 };
+
+       int nCells = gX * gY * gZ;
+
+       PdfT * pdfs[2] = { NULL, NULL };
+
+       ParseParameters(params, blk);
+
+       if (blk[0] == 0) blk[0] = gX;
+       if (blk[1] == 0) blk[1] = gY;
+       if (blk[2] == 0) blk[2] = gZ;
+
+       printf("# blocking x: %3d y: %3d z: %3d\n", blk[0], blk[1], blk[2]);
+
+
+       kda->Blk[0] = blk[0]; kda->Blk[1] = blk[1]; kda->Blk[2] = blk[2];
+
+
+       printf("# allocating data for %d LB nodes with padding (%lu bytes = %f MiB for the single lattice)\n",
+              nCells,
+                  sizeof(PdfT) * nCells * N_D3Q19,
+              sizeof(PdfT) * nCells * N_D3Q19 / 1024.0 / 1024.0);
+
+#define PAGE_4K                4096
+
+       MemAllocAligned((void **)&pdfs[0], sizeof(PdfT) * nCells * N_D3Q19, PAGE_4K);
+
+       kd->Pdfs[0] = pdfs[0];
+       kd->Pdfs[1] = NULL;
+
+
+       // Initialize PDFs with some (arbitrary) data for correct NUMA placement.
+       // This depends on the chosen data layout.
+       // The structure of the loop should resemble the same "execution layout"
+       // as in the kernel!
+
+       int nThreads;
+#ifdef _OPENMP
+       nThreads = omp_get_max_threads();
+#endif
+
+#ifdef _OPENMP
+       #pragma omp parallel for \
+                               shared(gDims, pdfs, \
+                               oX, oY, oZ, lX, lY, lZ, blk, nThreads, ld)
+#endif
+       for (int i = 0; i < nThreads; ++i) {
+
+               int threadStartX = lX / nThreads * i;
+               int threadEndX   = lX / nThreads * (i + 1);
+
+               if (lX % nThreads > 0) {
+                       if (lX % nThreads > i) {
+                               threadStartX += i;
+                               threadEndX   += i + 1;
+                       }
+                       else {
+                               threadStartX += lX % nThreads;
+                               threadEndX   += lX % nThreads;
+                       }
+               }
+
+               for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) {
+               for (int bY = oY; bY < lY + oY; bY += blk[1]) {
+               for (int bZ = oZ; bZ < lZ + oZ; bZ += blk[2]) {
+
+                       int eX = MIN(bX + blk[0], threadEndX + oX);
+                       int eY = MIN(bY + blk[1], lY + oY);
+                       int eZ = MIN(bZ + blk[2], lZ + oZ);
+
+                       // printf("%d: %d-%d  %d-%d  %d-%d  %d - %d\n", omp_get_thread_num(), bZ, eZ, bY, eY, bX, eX, threadStartX, threadEndX);
+
+                       for (int x = bX; x < eX; ++x) {
+                       for (int y = bY; y < eY; ++y) {
+                       for (int z = bZ; z < eZ; ++z) {
+
+                               for (int d = 0; d < N_D3Q19; ++d) {
+                                       pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = -1.0;
+                               }
+
+                       } } } // x, y, z
+               } } } // bx, by, bz
+       }
+
+
+       // Initialize all PDFs to some standard value.
+       for (int x = oX; x < lX + oX; ++x) {
+       for (int y = oY; y < lY + oY; ++y) {
+       for (int z = oZ; z < lZ + oZ; ++z) {
+               for (int d = 0; d < N_D3Q19; ++d) {
+                       pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = 0.0;
+               }
+       } } } // x, y, z
+
+
+       // Count how many *PDFs* need bounce back treatment.
+
+       uint64_t nPdfs = ((uint64_t)19) * gX * gY * gZ;
+
+       if (nPdfs > ((2LU << 31) - 1)) {
+               printf("ERROR: number of PDFs exceed 2^31.\n");
+               exit(1);
+       }
+
+       // Compiler bug? Incorrect computation of nBounceBackPdfs when using icc 15.0.2.
+       // Works when declaring nBounceBackPdfs as int64_t or using volatile.
+       volatile int nBounceBackPdfs = 0;
+       // int64_t nBounceBackPdfs = 0;
+       int nx, ny, nz, px, py, pz;
+
+
+       for (int x = 0; x < lX; ++x) {
+               for (int y = 0; y < lY; ++y) {
+                       for (int z = 0; z < lZ; ++z) {
+
+                               if (ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) {
+                                       for (int d = 0; d < N_D3Q19; ++d) {
+                                               nx = x - D3Q19_X[d];
+                                               ny = y - D3Q19_Y[d];
+                                               nz = z - D3Q19_Z[d];
+
+                                               // Check if neighbor is inside the lattice.
+                                               // if(nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) {
+                                               //      continue;
+                                               // }
+                                               if ((nx < 0 || nx >= lX) && ld->PeriodicX) {
+                                                       ++nBounceBackPdfs; // Compiler bug --> see above
+                                               }
+                                               else if ((ny < 0 || ny >= lY) && ld->PeriodicY) {
+                                                       ++nBounceBackPdfs; // Compiler bug --> see above
+                                               }
+                                               else if ((nz < 0 || nz >= lZ) && ld->PeriodicZ) {
+                                                       ++nBounceBackPdfs; // Compiler bug --> see above
+                                               }
+                                               else if (nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) {
+                                                       continue;
+                                               }
+                                               else if (ld->Lattice[L_INDEX_4(lDims, nx, ny, nz)] == LAT_CELL_OBSTACLE) {
+                                                       ++nBounceBackPdfs; // Compiler bug --> see above
+                                               }
+                                       }
+                               }
+                       }
+               }
+       }
+
+       printf("# allocating %d indices for bounce back pdfs (%s for source and destination array)\n", nBounceBackPdfs, ByteToHuman(sizeof(int) * nBounceBackPdfs * 2));
+
+       MemAlloc((void **) & (kd->BounceBackPdfsSrc), sizeof(int) * nBounceBackPdfs + 100);
+       MemAlloc((void **) & (kd->BounceBackPdfsDst), sizeof(int) * nBounceBackPdfs + 100);
+
+       kd->nBounceBackPdfs = nBounceBackPdfs;
+       nBounceBackPdfs = 0;
+
+       int srcIndex;
+       int dstIndex;
+
+       // TODO: currently this is not NUMA-aware
+    //       - maybe use the same blocking as for lattice initialization?
+       //       - do place the bounce back index vector parallel?
+
+       for (int x = 0; x < lX; ++x) {
+               for (int y = 0; y < lY; ++y) {
+                       for (int z = 0; z < lZ; ++z) {
+
+                               if (ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) {
+                                       for (int d = 0; d < N_D3Q19; ++d) {
+                                               nx = x + D3Q19_X[d];
+                                               ny = y + D3Q19_Y[d];
+                                               nz = z + D3Q19_Z[d];
+
+                                               if (    ((nx < 0 || nx >= lX) && ld->PeriodicX) ||
+                                                               ((ny < 0 || ny >= lY) && ld->PeriodicY) ||
+                                                               ((nz < 0 || nz >= lZ) && ld->PeriodicZ)
+                                               ){
+                                                       // For periodicity:
+
+                                                       // We assume we have finished odd time step (accessing neighbor PDFs) and are
+                                                       // before executing the even time step (accessing local PDFs only).
+
+                                                       // Assuming we are at the most east position of the lattice. Through the odd
+                                                       // time step propagation has put a PDF in the east slot of the ghost cell east
+                                                       // of us, i.e. nx, ny, nz.  We copy it to the east slot of the most west node.
+
+                                                       // In case of transition from even to odd time step , src and dest must be
+                                                       // exchanged.
+
+
+                                                       // x periodic
+                                                       if (nx < 0) {
+                                                               px = lX - 1;
+                                                       }
+                                                       else if (nx >= lX) {
+                                                               px = 0;
+                                                       } else {
+                                                               px = nx;
+                                                       }
+
+                                                       // y periodic
+                                                       if (ny < 0) {
+                                                               py = lY - 1;
+                                                       }
+                                                       else if (ny >= lY) {
+                                                               py = 0;
+                                                       } else {
+                                                               py = ny;
+                                                       }
+
+                                                       // z periodic
+                                                       if (nz < 0) {
+                                                               pz = lZ - 1;
+                                                       }
+                                                       else if (nz >= lZ) {
+                                                               pz = 0;
+                                                       } else {
+                                                               pz = nz;
+                                                       }
+
+                                                       if (ld->Lattice[L_INDEX_4(lDims, px, py, pz)] == LAT_CELL_OBSTACLE) {
+                                                               // See description of bounce back handling below.
+                                                               srcIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, d);
+                                                               dstIndex = P_INDEX_5(gDims,  x + oX,  y + oY,  z + oZ, D3Q19_INV[d]);
+                                                       }
+                                                       else {
+
+                                                               srcIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, d);
+                                                               // Put it on the other side back into the domain.
+                                                               dstIndex = P_INDEX_5(gDims, px + oX, py + oY, pz + oZ, d);
+
+                                                               VerifyMsg(nBounceBackPdfs < kd->nBounceBackPdfs, "nBBPdfs %d < kd->nBBPdfs %d  xyz: %d %d %d d: %d\n", nBounceBackPdfs, kd->nBounceBackPdfs, x, y, z, d);
+
+                                                       }
+
+                                                       kd->BounceBackPdfsSrc[nBounceBackPdfs] = srcIndex;
+                                                       kd->BounceBackPdfsDst[nBounceBackPdfs] = dstIndex;
+
+                                                       ++nBounceBackPdfs;
+
+                                               }
+                                               else if (nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) {
+                                                       continue;
+                                               }
+                                               else if (ld->Lattice[L_INDEX_4(lDims, nx, ny, nz)] == LAT_CELL_OBSTACLE) {
+                                                       // Depending on the time step we are in we have to exchange src and dst index.
+
+                                                       // We build the list for the case, when we have finished odd time step
+                                                       // (accessing neighbor PDFs) and before we start with the even time step
+                                                       // (accessing local PDFs only).
+
+                                                       // Assume our neighbor east of us, i.e. nx, ny, nz, is an obstacle cell.
+                                                       // Then we have to move the east PDF from the obstacle to our west position,
+                                                       // i.e. the inverse of east.
+
+                                                       // In case of transition from even to odd time step src and dest just
+                                                       // have to be exchanged.
+
+                                                       srcIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, d);
+                                                       dstIndex = P_INDEX_5(gDims,  x + oX,  y + oY,  z + oZ, D3Q19_INV[d]);
+
+                                                       VerifyMsg(nBounceBackPdfs < kd->nBounceBackPdfs, "nBBPdfs %d < kd->nBBPdfs %d  xyz: %d %d %d d: %d\n", nBounceBackPdfs, kd->nBounceBackPdfs, x, y, z, d);
+
+                                                       kd->BounceBackPdfsSrc[nBounceBackPdfs] = srcIndex;
+                                                       kd->BounceBackPdfsDst[nBounceBackPdfs] = dstIndex;
+
+                                                       ++nBounceBackPdfs;
+                                               }
+                                       }
+                               }
+                       }
+               }
+       }
+
+
+       // Fill remaining KernelData structures
+       kd->GetNode = FNAME(GetNode);
+       kd->SetNode = FNAME(SetNode);
+
+       kd->BoundaryConditionsGetPdf = FNAME(BcGetPdf);
+       kd->BoundaryConditionsSetPdf = FNAME(BcSetPdf);
+
+       kd->Kernel = FNAME(D3Q19AaKernel);
+
+       kd->DstPdfs = NULL;
+       kd->PdfsActive = kd->Pdfs[0];
+
+       return;
+}
+
+void FNAME(D3Q19AaDeinit)(LatticeDesc * ld, KernelData ** kernelData)
+{
+       MemFree((void **) & ((*kernelData)->Pdfs[0]));
+       // MemFree((void **) & ((*kernelData)->Pdfs[1]));
+
+       MemFree((void **) & ((*kernelData)->BounceBackPdfsSrc));
+       MemFree((void **) & ((*kernelData)->BounceBackPdfsDst));
+
+       MemFree((void **)kernelData);
+
+       return;
+}
+
diff --git a/src/BenchKernelD3Q19AaCommon.h b/src/BenchKernelD3Q19AaCommon.h
new file mode 100644 (file)
index 0000000..151e5a0
--- /dev/null
@@ -0,0 +1,89 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#ifndef __BENCH_KERNEL_D3Q19_AA_COMMON_H__
+#define __BENCH_KERNEL_D3Q19_AA_COMMON_H__
+
+#include "Kernel.h"
+
+typedef struct KernelDataAa_
+{
+       KernelData kd;
+       int Blk[3];                     // Blocking in X, Y, and Z direction, value of 0 disables blocking.
+       int Iteration;          // Current iteration number.
+} KernelDataAa;
+
+// Macro for casting KernelData * to KernelDataAa *.
+#define KDA(_x_)       ((KernelDataAa *)(_x_))
+
+// Build a function name extended by the propagation model name and the data layout.
+// FNANEM(test) will be expanded to test_PushSoA if DATA_LAYOUT_NAME is defined
+// as SoA and PROP_MODEL is defined as Push.
+#define FNAME(functionName)    JOIN(JOIN(functionName,_),JOIN(PROP_MODEL_NAME,DATA_LAYOUT_NAME))
+
+#ifndef DATA_LAYOUT_NAME
+       #error DATA_LAYOUT_NAME must be defined
+#endif
+
+#ifndef PROP_MODEL_NAME
+       #error PROP_MODEL_NAME must be defined
+#endif
+
+// -----------------------------------------------------------------------
+// Index function for accesssing PDF array for different data layouts.
+
+#define P_INDEX_5              FNAME(PINDEX5)
+
+static inline int FNAME(PINDEX5)(int dims[3], int x, int y, int z, int d)
+{
+       Assert(dims[0] > 0);
+       Assert(dims[1] > 0);
+       Assert(dims[2] > 0);
+
+       Assert(x >= 0);
+       Assert(x < dims[0]);
+       Assert(y >= 0);
+       Assert(y < dims[1]);
+       Assert(z >= 0);
+       Assert(z < dims[2]);
+       Assert(d >= 0);
+#ifdef D3Q19
+       Assert(d < N_D3Q19);
+#else
+#error Not implemented for this discretization.
+#endif
+
+#ifdef DATA_LAYOUT_SOA
+       return d * dims[0] * dims[1] * dims[2] + x * dims[1] * dims[2] + y * dims[2] + z;
+#elif DATA_LAYOUT_AOS
+       return x * dims[1] * dims[2] * N_D3Q19 + y * dims[2] * N_D3Q19 + z * N_D3Q19 + d;
+#else
+#error P_INDEX_5 function no implemented for chosen data layout.
+#endif
+}
+
+#endif // __BENCH_KERNEL_D3Q19_AA_COMMON_H__
+
diff --git a/src/BenchKernelD3Q19AaVec.c b/src/BenchKernelD3Q19AaVec.c
new file mode 100644 (file)
index 0000000..2642c1c
--- /dev/null
@@ -0,0 +1,610 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#include "BenchKernelD3Q19AaVecCommon.h"
+
+#include "Memory.h"
+#include "Vtk.h"
+#include "LikwidIf.h"
+#include "Vector.h"
+#include "Vector.h"
+
+#include <inttypes.h>
+#include <math.h>
+
+#ifdef _OPENMP
+       #include <omp.h>
+#endif
+
+static void KernelEven(LatticeDesc * ld, KernelData * kd, CaseData * cd);
+static void KernelOdd( LatticeDesc * ld, KernelData * kd, CaseData * cd);
+
+#if 0 // {{{
+void DumpPdfs(LatticeDesc * ld, KernelData * kd, int zStart, int zStop, int iter, const char * prefix)
+{
+       return;
+
+       int * lDims = ld->Dims;
+       int * gDims = kd->GlobalDims;
+
+       int nX = gDims[0];
+       int nY = gDims[1];
+       int nZ = gDims[2];
+
+       PdfT pdfs[N_D3Q19];
+
+       int localZStart = zStart;
+       int localZStop  = zStop;
+
+       if (localZStart == -1) localZStart = 0;
+       if (localZStop  == -1) localZStop  = gDims[2] - 1;
+
+       printf("D iter: %d\n", iter);
+
+       for (int dir = 0; dir < 19; ++dir) {
+               for (int z = localZStop; z >= localZStart; --z) {
+                       printf("D [%2d][%2d][%s] plane % 2d\n", iter, dir, prefix, z);
+
+                       for(int y = 0; y < nY; ++y) {
+                               printf("D [%2d][%2d][%s] %2d  ", iter, dir, prefix, y);
+
+                               for(int x = 0; x < nX; ++x) {
+
+                                       if (1) { // ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) {
+
+                                               #define I(x, y, z, dir) P_INDEX_5(gDims, (x), (y), (z), (dir))
+                                               pdfs[dir] = kd->PdfsActive[I(x, y, z, dir)];
+                                               #undef I
+//                                             kd->GetNode(kd, x, y, z, pdfs);
+                                       }
+                                       else {
+                                               pdfs[dir] = -1.0;
+                                       }
+
+                                       printf("%.16e ", pdfs[dir]);
+                               }
+
+                               printf("\n");
+                       }
+               }
+       }
+}
+#endif  // }}}
+
+void FNAME(D3Q19AaVecKernel)(LatticeDesc * ld, KernelData * kd, CaseData * cd)
+{
+       Assert(ld != NULL);
+       Assert(kd != NULL);
+       Assert(cd != NULL);
+
+       Assert(cd->Omega > 0.0);
+       Assert(cd->Omega < 2.0);
+
+       KernelDataAa * kda = KDA(kd);
+
+       PdfT * src = kd->PdfsActive;
+
+       int maxIterations = cd->MaxIterations;
+
+       #ifdef VTK_OUTPUT
+               if (cd->VtkOutput) {
+                       kd->PdfsActive = src;
+                       VtkWrite(ld, kd, cd, -1);
+               }
+       #endif
+
+       #ifdef STATISTICS
+               kd->PdfsActive = src;
+               KernelStatistics(kd, ld, cd, 0);
+       #endif
+
+       Assert((maxIterations % 2) == 0);
+
+       for (int iter = 0; iter < maxIterations; iter += 2) {
+
+               // --------------------------------------------------------------------
+               // even time step
+               // --------------------------------------------------------------------
+
+               X_LIKWID_START("aa-vec-even");
+
+               #pragma omp parallel
+               {
+                       KernelEven(ld, kd, cd);
+               }
+
+               X_LIKWID_STOP("aa-vec-even");
+
+               // Fixup bounce back PDFs.
+               #ifdef _OPENMP
+                       #pragma omp parallel for default(none) \
+                                       shared(kd, src)
+               #endif
+               for (int i = 0; i < kd->nBounceBackPdfs; ++i) {
+                       src[kd->BounceBackPdfsSrc[i]] = src[kd->BounceBackPdfsDst[i]];
+               }
+
+               // save current iteration
+               kda->Iteration = iter;
+
+               #ifdef VERIFICATION
+                       kd->PdfsActive = src;
+                       KernelAddBodyForce(kd, ld, cd);
+               #endif
+
+               #ifdef VTK_OUTPUT
+                       if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) {
+                               kd->PdfsActive = src;
+                               VtkWrite(ld, kd, cd, iter);
+                       }
+               #endif
+
+               #ifdef STATISTICS
+                       kd->PdfsActive = src;
+                       KernelStatistics(kd, ld, cd, iter);
+               #endif
+
+               // --------------------------------------------------------------------
+               // odd time step
+               // --------------------------------------------------------------------
+
+               X_LIKWID_START("aa-vec-odd");
+
+               #pragma omp parallel
+               {
+                       KernelOdd(ld, kd, cd);
+               }
+
+               // Stop counters before bounce back. Else computing loop balance will
+               // be incorrect.
+
+               X_LIKWID_STOP("aa-vec-odd");
+
+               // Fixup bounce back PDFs.
+               #ifdef _OPENMP
+                       #pragma omp parallel for default(none) \
+                                       shared(kd, src)
+               #endif
+               for (int i = 0; i < kd->nBounceBackPdfs; ++i) {
+                       src[kd->BounceBackPdfsDst[i]] = src[kd->BounceBackPdfsSrc[i]];
+               }
+
+               // save current iteration
+               kda->Iteration = iter + 1;
+
+               #ifdef VERIFICATION
+                       kd->PdfsActive = src;
+                       KernelAddBodyForce(kd, ld, cd);
+               #endif
+
+               #ifdef VTK_OUTPUT
+                       if (cd->VtkOutput && ((iter + 1) % cd->VtkModulus) == 0) {
+                               kd->PdfsActive = src;
+                               VtkWrite(ld, kd, cd, iter + 1);
+                       }
+               #endif
+
+               #ifdef STATISTICS
+                       kd->PdfsActive = src;
+                       KernelStatistics(kd, ld, cd, iter + 1);
+               #endif // }}}
+
+
+       } // for (int iter = 0; ...
+
+       #ifdef VTK_OUTPUT
+
+       if (cd->VtkOutput) {
+               kd->PdfsActive = src;
+               VtkWrite(ld, kd, cd, maxIterations);
+       }
+
+       #endif
+
+       return;
+}
+
+static void KernelEven(LatticeDesc * ld, KernelData * kd, CaseData * cd) // {{{
+{
+       Assert(ld != NULL);
+       Assert(kd != NULL);
+       Assert(cd != NULL);
+
+       Assert(cd->Omega > 0.0);
+       Assert(cd->Omega < 2.0);
+
+       KernelDataAa * kda  = KDA(kd);
+
+       int nX = ld->Dims[0];
+       int nY = ld->Dims[1];
+       int nZ = ld->Dims[2];
+
+       int * gDims = kd->GlobalDims;
+
+       int oX = kd->Offsets[0];
+       int oY = kd->Offsets[1];
+       int oZ = kd->Offsets[2];
+
+       int blk[3];
+       blk[0] = kda->Blk[0];
+       blk[1] = kda->Blk[1];
+       blk[2] = kda->Blk[2];
+
+       PdfT omega = cd->Omega;
+       PdfT omegaEven = omega;
+
+       PdfT magicParam = 1.0 / 12.0;
+       PdfT omegaOdd = 1.0 / (0.5 + magicParam / (1.0 / omega - 0.5));
+
+       const PdfT w_0 = 1.0 /  3.0;
+       const PdfT w_1 = 1.0 / 18.0;
+       const PdfT w_2 = 1.0 / 36.0;
+
+       const PdfT w_1_x3 = w_1 * 3.0;  const PdfT w_1_nine_half = w_1 * 9.0 / 2.0;
+       const PdfT w_2_x3 = w_2 * 3.0;  const PdfT w_2_nine_half = w_2 * 9.0 / 2.0;
+
+
+       VPDFT VONE_HALF   = VSET(0.5);
+       VPDFT VTHREE_HALF = VSET(3.0 / 2.0);
+
+       VPDFT vw_1_indep, vw_2_indep;
+       VPDFT vw_0 = VSET(w_0);
+       VPDFT vw_1 = VSET(w_1);
+       VPDFT vw_2 = VSET(w_2);
+
+       VPDFT vw_1_x3 = VSET(w_1_x3);
+       VPDFT vw_2_x3 = VSET(w_2_x3);
+       VPDFT vw_1_nine_half = VSET(w_1_nine_half);
+       VPDFT vw_2_nine_half = VSET(w_2_nine_half);
+
+       VPDFT vui, vux, vuy, vuz, vdens;
+
+       VPDFT vevenPart, voddPart, vdir_indep_trm;
+
+       VPDFT vomegaEven = VSET(omegaEven);
+       VPDFT vomegaOdd  = VSET(omegaOdd);
+
+       // Declare pdf_N, pdf_E, pdf_S, pdf_W, ...
+       #define X(name, idx, idxinv, x, y, z)   VPDFT JOIN(vpdf_,name);
+               D3Q19_LIST
+       #undef X
+
+       PdfT * src = kd->Pdfs[0];
+
+       int nThreads = 1;
+       int threadId = 0;
+
+       #ifdef _OPENMP
+               nThreads = omp_get_max_threads();
+               threadId = omp_get_thread_num();
+       #endif
+
+       // TODO: Currently only a 1-D decomposition is applied. For achritectures
+       //       with a lot of cores we want at least 2-D.
+
+       int threadStartX = nX / nThreads * threadId;
+       int threadEndX   = nX / nThreads * (threadId + 1);
+
+       if (nX % nThreads > 0) {
+               if (nX % nThreads > threadId) {
+                       threadStartX += threadId;
+                       threadEndX   += threadId + 1;
+               }
+               else {
+                       threadStartX += nX % nThreads;
+                       threadEndX   += nX % nThreads;
+               }
+       }
+
+       AssertMsg((blk[2] % VSIZE == 0) || blk[2] >= nZ, "Blocking in z direction must be a multiple of VSIZE = %d or larger than z dimension.", VSIZE);
+
+       for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) {
+       for (int bY = oY; bY < nY + oY; bY += blk[1]) {
+       for (int bZ = oZ; bZ < nZ + oZ; bZ += blk[2]) {
+
+               int eX = MIN(bX + blk[0], threadEndX + oX);
+               int eY = MIN(bY + blk[1], nY + oY);
+               int eZ = MIN(bZ + blk[2], nZ + oZ);
+
+               for (int x = bX; x < eX; x += 1) {
+               for (int y = bY; y < eY; y += 1) {
+               for (int z = bZ; z < eZ; z += VSIZE) {
+
+                       #define I(x, y, z, dir) P_INDEX_5(gDims, (x), (y), (z), (dir))
+
+                       // Load PDFs of local cell: pdf_N = src[I(x, y, z, D3Q19_N)]; ...
+                       #define X(name, idx, idxinv, _x, _y, _z)        JOIN(vpdf_,name) = VLDU(&src[I(x, y, z, idx)]);
+                               D3Q19_LIST
+                       #undef X
+
+
+                       vux = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_E,VADD(vpdf_NE,vpdf_SE)),VADD(vpdf_TE,vpdf_BE)),vpdf_W),vpdf_NW),vpdf_SW),vpdf_TW),vpdf_BW);
+                       vuy = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_N,VADD(vpdf_NE,vpdf_NW)),VADD(vpdf_TN,vpdf_BN)),vpdf_S),vpdf_SE),vpdf_SW),vpdf_TS),vpdf_BS);
+                       vuz = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_T,VADD(vpdf_TE,vpdf_TW)),VADD(vpdf_TN,vpdf_TS)),vpdf_B),vpdf_BE),vpdf_BW),vpdf_BN),vpdf_BS);
+
+                       vdens = VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(vpdf_C,VADD(vpdf_N,vpdf_E)),VADD(vpdf_S,vpdf_W)),VADD(vpdf_NE,vpdf_SE)),
+                                       VADD(vpdf_SW,vpdf_NW)),VADD(vpdf_T,vpdf_TN)),VADD(vpdf_TE,vpdf_TS)),VADD(vpdf_TW,vpdf_B)),
+                                       VADD(vpdf_BN,vpdf_BE)),VADD(vpdf_BS,vpdf_BW));
+
+                       vdir_indep_trm = VSUB(vdens,VMUL(VADD(VADD(VMUL(vux,vux),VMUL(vuy,vuy)),VMUL(vuz,vuz)),VTHREE_HALF));
+
+                       VSTU(&src[I(x, y, z, D3Q19_C)],VSUB(vpdf_C,VMUL(vomegaEven,VSUB(vpdf_C,VMUL(vw_0,vdir_indep_trm)))));
+
+                       vw_1_indep = VMUL(vw_1,vdir_indep_trm);
+
+                       vui = vuy;
+                       vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_N,vpdf_S)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep));
+                       voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_N,vpdf_S)),VMUL(vui,vw_1_x3)));
+                       VSTU(&src[I(x, y, z, D3Q19_S)],VSUB(VSUB(vpdf_N,vevenPart),voddPart));
+                       VSTU(&src[I(x, y, z, D3Q19_N)],VADD(VSUB(vpdf_S,vevenPart),voddPart));
+
+                       vui = vux;
+                       vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_E,vpdf_W)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep));
+                       voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_E,vpdf_W)),VMUL(vui,vw_1_x3)));
+                       VSTU(&src[I(x, y, z, D3Q19_W)],VSUB(VSUB(vpdf_E,vevenPart),voddPart));
+                       VSTU(&src[I(x, y, z, D3Q19_E)],VADD(VSUB(vpdf_W,vevenPart),voddPart));
+
+                       vui = vuz;
+                       vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_T,vpdf_B)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep));
+                       voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_T,vpdf_B)),VMUL(vui,vw_1_x3)));
+                       VSTU(&src[I(x, y, z, D3Q19_B)],VSUB(VSUB(vpdf_T,vevenPart),voddPart));
+                       VSTU(&src[I(x, y, z, D3Q19_T)],VADD(VSUB(vpdf_B,vevenPart),voddPart));
+
+                       vw_2_indep = VMUL(vw_2,vdir_indep_trm);
+
+                       vui = VSUB(vuy,vux);
+                       vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_NW,vpdf_SE)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
+                       voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_NW,vpdf_SE)),VMUL(vui,vw_2_x3)));
+                       VSTU(&src[I(x, y, z, D3Q19_SE)],VSUB(VSUB(vpdf_NW,vevenPart),voddPart));
+                       VSTU(&src[I(x, y, z, D3Q19_NW)],VADD(VSUB(vpdf_SE,vevenPart),voddPart));
+
+                       vui = VADD(vux,vuy);
+                       vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_NE,vpdf_SW)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
+                       voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_NE,vpdf_SW)),VMUL(vui,vw_2_x3)));
+                       VSTU(&src[I(x, y, z, D3Q19_SW)],VSUB(VSUB(vpdf_NE,vevenPart),voddPart));
+                       VSTU(&src[I(x, y, z, D3Q19_NE)],VADD(VSUB(vpdf_SW,vevenPart),voddPart));
+
+                       vui = VSUB(vuz,vux);
+                       vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TW,vpdf_BE)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
+                       voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TW,vpdf_BE)),VMUL(vui,vw_2_x3)));
+                       VSTU(&src[I(x, y, z, D3Q19_BE)],VSUB(VSUB(vpdf_TW,vevenPart),voddPart));
+                       VSTU(&src[I(x, y, z, D3Q19_TW)],VADD(VSUB(vpdf_BE,vevenPart),voddPart));
+
+                       vui = VADD(vux,vuz);
+                       vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TE,vpdf_BW)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
+                       voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TE,vpdf_BW)),VMUL(vui,vw_2_x3)));
+                       VSTU(&src[I(x, y, z, D3Q19_BW)],VSUB(VSUB(vpdf_TE,vevenPart),voddPart));
+                       VSTU(&src[I(x, y, z, D3Q19_TE)],VADD(VSUB(vpdf_BW,vevenPart),voddPart));
+
+                       vui = VSUB(vuz,vuy);
+                       vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TS,vpdf_BN)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
+                       voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TS,vpdf_BN)),VMUL(vui,vw_2_x3)));
+                       VSTU(&src[I(x, y, z, D3Q19_BN)],VSUB(VSUB(vpdf_TS,vevenPart),voddPart));
+                       VSTU(&src[I(x, y, z, D3Q19_TS)],VADD(VSUB(vpdf_BN,vevenPart),voddPart));
+
+                       vui = VADD(vuy,vuz);
+                       vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TN,vpdf_BS)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
+                       voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TN,vpdf_BS)),VMUL(vui,vw_2_x3)));
+                       VSTU(&src[I(x, y, z, D3Q19_BS)],VSUB(VSUB(vpdf_TN,vevenPart),voddPart));
+                       VSTU(&src[I(x, y, z, D3Q19_TN)],VADD(VSUB(vpdf_BS,vevenPart),voddPart));
+
+                       #undef I
+               } } } // x, y, z
+       } } } // blocked x, y, z
+
+
+
+       return;
+} // }}}
+
+
+static void KernelOdd(LatticeDesc * ld, KernelData * kd, CaseData * cd)  // {{{
+{
+       Assert(ld != NULL);
+       Assert(kd != NULL);
+       Assert(cd != NULL);
+
+       Assert(cd->Omega > 0.0);
+       Assert(cd->Omega < 2.0);
+
+       KernelDataAa * kda  = KDA(kd);
+
+       int nX = ld->Dims[0];
+       int nY = ld->Dims[1];
+       int nZ = ld->Dims[2];
+
+       int * gDims = kd->GlobalDims;
+
+       int oX = kd->Offsets[0];
+       int oY = kd->Offsets[1];
+       int oZ = kd->Offsets[2];
+
+       int blk[3];
+       blk[0] = kda->Blk[0];
+       blk[1] = kda->Blk[1];
+       blk[2] = kda->Blk[2];
+
+       PdfT omega = cd->Omega;
+       PdfT omegaEven = omega;
+
+       PdfT magicParam = 1.0 / 12.0;
+       PdfT omegaOdd = 1.0 / (0.5 + magicParam / (1.0 / omega - 0.5));
+
+       const PdfT w_0 = 1.0 /  3.0;
+       const PdfT w_1 = 1.0 / 18.0;
+       const PdfT w_2 = 1.0 / 36.0;
+
+       const PdfT w_1_x3 = w_1 * 3.0;  const PdfT w_1_nine_half = w_1 * 9.0 / 2.0;
+       const PdfT w_2_x3 = w_2 * 3.0;  const PdfT w_2_nine_half = w_2 * 9.0 / 2.0;
+
+       VPDFT VONE_HALF   = VSET(0.5);
+       VPDFT VTHREE_HALF = VSET(3.0 / 2.0);
+
+       VPDFT vw_1_indep, vw_2_indep;
+       VPDFT vw_0 = VSET(w_0);
+       VPDFT vw_1 = VSET(w_1);
+       VPDFT vw_2 = VSET(w_2);
+
+       VPDFT vw_1_x3 = VSET(w_1_x3);
+       VPDFT vw_2_x3 = VSET(w_2_x3);
+       VPDFT vw_1_nine_half = VSET(w_1_nine_half);
+       VPDFT vw_2_nine_half = VSET(w_2_nine_half);
+
+       VPDFT vui, vux, vuy, vuz, vdens;
+
+       VPDFT vevenPart, voddPart, vdir_indep_trm;
+
+       VPDFT vomegaEven = VSET(omegaEven);
+       VPDFT vomegaOdd  = VSET(omegaOdd);
+
+       // Declare pdf_N, pdf_E, pdf_S, pdf_W, ...
+       #define X(name, idx, idxinv, x, y, z)           VPDFT JOIN(vpdf_,name);
+               D3Q19_LIST
+       #undef X
+
+       PdfT * src = kd->Pdfs[0];
+
+       int threadId = 0;
+       int nThreads = 1;
+
+       #ifdef _OPENMP
+               threadId = omp_get_thread_num();
+               nThreads = omp_get_max_threads();
+       #endif
+
+       // TODO: Currently only a 1-D decomposition is applied. For achritectures
+       //       with a lot of cores we want at least 2-D.
+       int threadStartX = nX / nThreads * threadId;
+       int threadEndX   = nX / nThreads * (threadId + 1);
+
+       if (nX % nThreads > 0) {
+               if (nX % nThreads > threadId) {
+                       threadStartX += threadId;
+                       threadEndX   += threadId + 1;
+               }
+               else {
+                       threadStartX += nX % nThreads;
+                       threadEndX   += nX % nThreads;
+               }
+       }
+
+       AssertMsg((blk[2] % VSIZE == 0) || blk[2] >= nZ, "Blocking in z direction must be a multiple of VSIZE = %d or larger than z dimension.", VSIZE);
+
+       for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) {
+       for (int bY = oY; bY < nY + oY; bY += blk[1]) {
+       for (int bZ = oZ; bZ < nZ + oZ; bZ += blk[2]) {
+
+               int eX = MIN(bX + blk[0], threadEndX + oX);
+               int eY = MIN(bY + blk[1], nY + oY);
+               int eZ = MIN(bZ + blk[2], nZ + oZ);
+
+               for (int x = bX; x < eX; ++x) {
+               for (int y = bY; y < eY; ++y) {
+               for (int z = bZ; z < eZ; z += VSIZE) {
+
+                       #define I(x, y, z, dir) P_INDEX_5(gDims, (x), (y), (z), (dir))
+
+
+                       #define X(name, idx, idxinv, _x, _y, _z)        JOIN(vpdf_,name) = VLDU(&src[I(x - _x, y - _y, z - _z, idxinv)]);
+                       D3Q19_LIST
+                       #undef X
+
+                       vux = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_E,VADD(vpdf_NE,vpdf_SE)),VADD(vpdf_TE,vpdf_BE)),vpdf_W),vpdf_NW),vpdf_SW),vpdf_TW),vpdf_BW);
+                       vuy = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_N,VADD(vpdf_NE,vpdf_NW)),VADD(vpdf_TN,vpdf_BN)),vpdf_S),vpdf_SE),vpdf_SW),vpdf_TS),vpdf_BS);
+                       vuz = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_T,VADD(vpdf_TE,vpdf_TW)),VADD(vpdf_TN,vpdf_TS)),vpdf_B),vpdf_BE),vpdf_BW),vpdf_BN),vpdf_BS);
+
+                       vdens = VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(vpdf_C,VADD(vpdf_N,vpdf_E)),VADD(vpdf_S,vpdf_W)),VADD(vpdf_NE,vpdf_SE)),
+                                                VADD(vpdf_SW,vpdf_NW)),VADD(vpdf_T,vpdf_TN)),VADD(vpdf_TE,vpdf_TS)),VADD(vpdf_TW,vpdf_B)),VADD(vpdf_BN,vpdf_BE)),VADD(vpdf_BS,vpdf_BW));
+
+                       vdir_indep_trm = VSUB(vdens,VMUL(VADD(VADD(VMUL(vux,vux),VMUL(vuy,vuy)),VMUL(vuz,vuz)),VTHREE_HALF));
+
+                       VSTU(&src[I(x, y, z, D3Q19_C)],VSUB(vpdf_C,VMUL(vomegaEven,VSUB(vpdf_C,VMUL(vw_0,vdir_indep_trm)))));
+
+                       vw_1_indep = VMUL(vw_1,vdir_indep_trm);
+
+                       vui = vuy;
+                       vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_N,vpdf_S)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep));
+                       voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_N,vpdf_S)),VMUL(vui,vw_1_x3)));
+                       VSTU(&src[I(x, y + 1,  z, D3Q19_N)], VSUB(VSUB(vpdf_N,vevenPart),voddPart));
+                       VSTU(&src[I(x, y - 1,  z, D3Q19_S)], VADD(VSUB(vpdf_S,vevenPart),voddPart));
+
+                       vui = vux;
+                       vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_E,vpdf_W)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep));
+                       voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_E,vpdf_W)),VMUL(vui,vw_1_x3)));
+                       VSTU(&src[I(x + 1, y, z, D3Q19_E)], VSUB(VSUB(vpdf_E,vevenPart),voddPart));
+                       VSTU(&src[I(x - 1, y, z, D3Q19_W)], VADD(VSUB(vpdf_W,vevenPart),voddPart));
+
+                       vui = vuz;
+                       vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_T,vpdf_B)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep));
+                       voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_T,vpdf_B)),VMUL(vui,vw_1_x3)));
+                       VSTU(&src[I(x, y, z + 1, D3Q19_T)], VSUB(VSUB(vpdf_T,vevenPart),voddPart));
+                       VSTU(&src[I(x, y, z - 1, D3Q19_B)], VADD(VSUB(vpdf_B,vevenPart),voddPart));
+
+                       vw_2_indep = VMUL(vw_2,vdir_indep_trm);
+
+                       vui = VSUB(vuy,vux);
+                       vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_NW,vpdf_SE)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
+                       voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_NW,vpdf_SE)),VMUL(vui,vw_2_x3)));
+                       VSTU(&src[I(x - 1, y + 1, z, D3Q19_NW)], VSUB(VSUB(vpdf_NW,vevenPart),voddPart));
+                       VSTU(&src[I(x + 1, y - 1, z, D3Q19_SE)], VADD(VSUB(vpdf_SE,vevenPart),voddPart));
+
+                       vui = VADD(vux,vuy);
+                       vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_NE,vpdf_SW)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
+                       voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_NE,vpdf_SW)),VMUL(vui,vw_2_x3)));
+                       VSTU(&src[I(x + 1, y + 1, z, D3Q19_NE)], VSUB(VSUB(vpdf_NE,vevenPart),voddPart));
+                       VSTU(&src[I(x - 1, y - 1, z, D3Q19_SW)], VADD(VSUB(vpdf_SW,vevenPart),voddPart));
+
+                       vui = VSUB(vuz,vux);
+                       vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TW,vpdf_BE)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
+                       voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TW,vpdf_BE)),VMUL(vui,vw_2_x3)));
+                       VSTU(&src[I(x - 1, y, z + 1, D3Q19_TW)], VSUB(VSUB(vpdf_TW,vevenPart),voddPart));
+                       VSTU(&src[I(x + 1, y, z - 1, D3Q19_BE)], VADD(VSUB(vpdf_BE,vevenPart),voddPart));
+
+                       vui = VADD(vux,vuz);
+                       vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TE,vpdf_BW)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
+                       voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TE,vpdf_BW)),VMUL(vui,vw_2_x3)));
+                       VSTU(&src[I(x + 1, y, z + 1, D3Q19_TE)], VSUB(VSUB(vpdf_TE,vevenPart),voddPart));
+                       VSTU(&src[I(x - 1, y, z - 1, D3Q19_BW)], VADD(VSUB(vpdf_BW,vevenPart),voddPart));
+
+                       vui = VSUB(vuz,vuy);
+                       vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TS,vpdf_BN)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
+                       voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TS,vpdf_BN)),VMUL(vui,vw_2_x3)));
+                       VSTU(&src[I(x, y - 1, z + 1, D3Q19_TS)], VSUB(VSUB(vpdf_TS,vevenPart),voddPart));
+                       VSTU(&src[I(x, y + 1, z - 1, D3Q19_BN)], VADD(VSUB(vpdf_BN,vevenPart),voddPart));
+
+                       vui = VADD(vuy,vuz);
+                       vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TN,vpdf_BS)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
+                       voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TN,vpdf_BS)),VMUL(vui,vw_2_x3)));
+                       VSTU(&src[I(x, y + 1, z + 1, D3Q19_TN)], VSUB(VSUB(vpdf_TN,vevenPart),voddPart));
+                       VSTU(&src[I(x, y - 1, z - 1, D3Q19_BS)], VADD(VSUB(vpdf_BS,vevenPart),voddPart));
+
+                       #undef I
+               } } } // x, y, z
+       } } } // blocked x, y, z
+
+       return;
+
+}  // }}}
diff --git a/src/BenchKernelD3Q19AaVec.h b/src/BenchKernelD3Q19AaVec.h
new file mode 100644 (file)
index 0000000..a125b04
--- /dev/null
@@ -0,0 +1,38 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#ifndef __BENCH_KERNEL_D3Q19_AA_VEC__
+#define __BENCH_KERNEL_D3Q19_AA_VEC__
+
+#include "Kernel.h"
+
+
+void D3Q19AaVecInit_AaSoA(LatticeDesc * ld, KernelData ** kernelData, Parameters * params);
+void D3Q19AaVecDeinit_AaSoA(LatticeDesc * ld, KernelData ** kernelData);
+
+
+
+#endif // __BENCH_KERNEL_D3Q19_AA_VEC__
diff --git a/src/BenchKernelD3Q19AaVecCommon.c b/src/BenchKernelD3Q19AaVecCommon.c
new file mode 100644 (file)
index 0000000..57a9fae
--- /dev/null
@@ -0,0 +1,656 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#include "BenchKernelD3Q19AaVecCommon.h"
+
+#include "Memory.h"
+#include "Vtk.h"
+#include "Vector.h"
+
+#include <inttypes.h>
+#include <math.h>
+
+#ifdef _OPENMP
+       #include <omp.h>
+#endif
+
+// Forward definition.
+void FNAME(D3Q19AaVecKernel)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd);
+
+
+static void FNAME(BcGetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT * pdf)
+{
+       Assert(kd != NULL);
+       Assert(kd->PdfsActive != NULL);
+       Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+       Assert(pdf != NULL);
+
+       Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
+       Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
+       Assert(dir >= 0); Assert(dir < N_D3Q19);
+
+       KernelDataAa * kda = KDA(kd);
+
+       int oX = kd->Offsets[0];
+       int oY = kd->Offsets[1];
+       int oZ = kd->Offsets[2];
+
+       if (kda->Iteration % 2 == 0) {
+               // Pdfs are stored inverse, local PDFs are located in remote nodes
+               int nx = x - D3Q19_X[dir];
+               int ny = y - D3Q19_Y[dir];
+               int nz = z - D3Q19_Z[dir];
+
+               #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
+               *pdf = kd->PdfsActive[I(nx + oX, ny + oY, nz + oZ, D3Q19_INV[dir])];
+               #undef I
+       }
+       else {
+               int nx = x;
+               int ny = y;
+               int nz = z;
+
+               #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
+               *pdf = kd->PdfsActive[I(nx + oX, ny + oY, nz + oZ, dir)];
+               #undef I
+       }
+
+
+       return;
+}
+
+static void FNAME(BcSetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT pdf)
+{
+       Assert(kd != NULL);
+       Assert(kd->PdfsActive != NULL);
+       Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+
+       Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
+       Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
+       Assert(dir >= 0); Assert(dir < N_D3Q19);
+
+       KernelDataAa * kda = KDA(kd);
+
+       int oX = kd->Offsets[0];
+       int oY = kd->Offsets[1];
+       int oZ = kd->Offsets[2];
+
+       if (kda->Iteration % 2 == 0) {
+               // Pdfs are stored inverse, local PDFs are located in remote nodes
+               int nx = x - D3Q19_X[dir];
+               int ny = y - D3Q19_Y[dir];
+               int nz = z - D3Q19_Z[dir];
+
+               #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
+               pdf = kd->PdfsActive[I(nx + oX, ny + oY, nz + oZ, D3Q19_INV[dir])] = pdf;
+               #undef I
+       }
+       else {
+               int nx = x;
+               int ny = y;
+               int nz = z;
+
+               #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
+               kd->PdfsActive[I(nx + oX, ny + oY, nz + oZ, dir)] = pdf;
+               #undef I
+       }
+
+       return;
+}
+
+
+static void FNAME(GetNode)(KernelData * kd, int x, int y, int z, PdfT * pdfs)
+{
+       Assert(kd != NULL);
+       Assert(kd->PdfsActive != NULL);
+       Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+       Assert(pdfs != NULL);
+
+       Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
+       Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
+
+       KernelDataAa * kda = KDA(kd);
+
+       int oX = kd->Offsets[0];
+       int oY = kd->Offsets[1];
+       int oZ = kd->Offsets[2];
+
+
+       if (kda->Iteration % 2 == 0) {
+               // Pdfs are stored inverse, local PDFs are located in remote nodes
+
+               #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
+               #define X(name, idx, idxinv, _x, _y, _z)        pdfs[idx] = kd->PdfsActive[I(x + oX - _x, y + oY - _y, z + oZ - _z, D3Q19_INV[idx])];
+               D3Q19_LIST
+               #undef X
+               #undef I
+       }
+       else {
+               #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
+               #define X(name, idx, idxinv, _x, _y, _z)        pdfs[idx] = kd->PdfsActive[I(x + oX, y + oY, z + oZ, idx)];
+               D3Q19_LIST
+               #undef X
+               #undef I
+
+       }
+
+#if 0          // DETECT NANs
+
+       for (int d = 0; d < 19; ++d) {
+               if (isnan(pdfs[d])) {
+                       printf("%d %d %d %d nan! get node\n", x, y, z, d);
+
+                       for (int d2 = 0; d2 < 19; ++d2) {
+                               printf("%d: %e\n", d2, pdfs[d2]);
+                       }
+
+                       exit(1);
+               }
+       }
+
+#endif
+
+       return;
+}
+
+
+static void FNAME(SetNode)(KernelData * kd, int x, int y, int z, PdfT * pdfs)
+{
+       Assert(kd != NULL);
+       Assert(kd->PdfsActive != NULL);
+       Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+       Assert(pdfs != NULL);
+
+       Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
+       Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
+
+       KernelDataAa * kda = KDA(kd);
+
+       int oX = kd->Offsets[0];
+       int oY = kd->Offsets[1];
+       int oZ = kd->Offsets[2];
+
+       if (kda->Iteration % 2 == 0) {
+               // Pdfs are stored inverse, local PDFs are located in remote nodes
+
+               #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
+               #define X(name, idx, idxinv, _x, _y, _z)        kd->PdfsActive[I(x + oX - _x, y + oY - _y, z + oZ - _z, D3Q19_INV[idx])] = pdfs[idx];
+               D3Q19_LIST
+               #undef X
+               #undef I
+       }
+       else {
+               #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
+               #define X(name, idx, idxinv, _x, _y, _z)        kd->PdfsActive[I(x + oX, y + oY, z + oZ, idx)] = pdfs[idx];
+               D3Q19_LIST
+               #undef X
+               #undef I
+       }
+       return;
+}
+
+
+static void ParameterUsage()
+{
+       printf("Kernel parameters:\n");
+       printf("  [-blk <n>] [-blk-[xyz] <n>]\n");
+
+       return;
+}
+
+static void ParseParameters(Parameters * params, int * blk)
+{
+       Assert(blk != NULL);
+
+       blk[0] = 0; blk[1] = 0; blk[2] = 0;
+
+       #define ARG_IS(param)                   (!strcmp(params->KernelArgs[i], param))
+       #define NEXT_ARG_PRESENT() \
+               do { \
+                       if (i + 1 >= params->nKernelArgs) { \
+                               printf("ERROR: argument %s requires a parameter.\n", params->KernelArgs[i]); \
+                               exit(1); \
+                       } \
+               } while (0)
+
+
+       for (int i = 0; i < params->nKernelArgs; ++i) {
+               if (ARG_IS("-blk") || ARG_IS("--blk")) {
+                       NEXT_ARG_PRESENT();
+
+                       int tmp = strtol(params->KernelArgs[++i], NULL, 0);
+
+                       if (tmp < 0) {
+                               printf("ERROR: blocking parameter must be >= 0.\n");
+                               exit(1);
+                       }
+
+                       blk[0] = blk[1] = blk[2] = tmp;
+               }
+               else if (ARG_IS("-blk-x") || ARG_IS("--blk-x")) {
+                       NEXT_ARG_PRESENT();
+
+                       int tmp = strtol(params->KernelArgs[++i], NULL, 0);
+
+                       if (tmp < 0) {
+                               printf("ERROR: blocking parameter must be >= 0.\n");
+                               exit(1);
+                       }
+
+                       blk[0] = tmp;
+               }
+               else if (ARG_IS("-blk-y") || ARG_IS("--blk-y")) {
+                       NEXT_ARG_PRESENT();
+
+                       int tmp = strtol(params->KernelArgs[++i], NULL, 0);
+
+                       if (tmp < 0) {
+                               printf("ERROR: blocking parameter must be >= 0.\n");
+                               exit(1);
+                       }
+
+                       blk[1] = tmp;
+               }
+               else if (ARG_IS("-blk-z") || ARG_IS("--blk-z")) {
+                       NEXT_ARG_PRESENT();
+
+                       int tmp = strtol(params->KernelArgs[++i], NULL, 0);
+
+                       if (tmp < 0) {
+                               printf("ERROR: blocking parameter must be >= 0.\n");
+                               exit(1);
+                       }
+
+                       blk[2] = tmp;
+               }
+               else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) {
+                       ParameterUsage();
+                       exit(1);
+               }
+               else {
+                       printf("ERROR: unknown kernel parameter.\n");
+                       ParameterUsage();
+                       exit(1);
+               }
+       }
+
+       #undef ARG_IS
+       #undef NEXT_ARG_PRESENT
+
+       return;
+}
+
+
+void FNAME(D3Q19AaVecInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
+{
+       KernelDataAa * kda = NULL;
+       MemAlloc((void **)&kda, sizeof(KernelDataAa));
+
+       kda->Blk[0] = 0; kda->Blk[1] = 0; kda->Blk[2] = 0;
+       kda->Iteration = -1;
+
+       KernelData * kd = &kda->kd;
+       *kernelData = kd;
+
+       kd->nObstIndices = ld->nObst;
+
+       // Ajust the dimensions according to padding, if used.
+       kd->Dims[0] = ld->Dims[0];
+       kd->Dims[1] = ld->Dims[1];
+       kd->Dims[2] = ld->Dims[2];
+
+
+       int * lDims = ld->Dims;
+       int * gDims = kd->GlobalDims;
+
+       Assert(VSIZE <= 4);
+
+       // TODO: only add enough ghost cells so we can compute everything vectorized.
+       gDims[0] = lDims[0] + 2;
+       gDims[1] = lDims[1] + 2;
+       // TODO: fix this for aa-vec2-soa
+       gDims[2] = lDims[2] + 4; // one ghost cell in front, one in the back, plus at most two at the back for VSIZE = 4
+
+       kd->Offsets[0] = 1;
+       kd->Offsets[1] = 1;
+       kd->Offsets[2] = 1;
+
+       int lX = lDims[0];
+       int lY = lDims[1];
+       int lZ = lDims[2];
+
+       int gX = gDims[0];
+       int gY = gDims[1];
+       int gZ = gDims[2];
+
+       int oX = kd->Offsets[0];
+       int oY = kd->Offsets[1];
+       int oZ = kd->Offsets[2];
+
+       int blk[3] = { 0 };
+
+       int nCells = gX * gY * gZ;
+
+       PdfT * pdfs[2] = { NULL, NULL };
+
+       ParseParameters(params, blk);
+
+       if (blk[2] % VSIZE != 0) {
+               blk[2] -= blk[2] % VSIZE;
+               printf("WARNING: blocking factor for z direction must be a multiple of VSIZE = %d, adjusting it to %d.\n", VSIZE, blk[2]);
+       }
+
+       if (blk[0] == 0) blk[0] = gX;
+       if (blk[1] == 0) blk[1] = gY;
+       if (blk[2] == 0) blk[2] = gZ;
+
+       printf("# blocking x: %3d y: %3d z: %3d\n", blk[0], blk[1], blk[2]);
+
+       kda->Blk[0] = blk[0]; kda->Blk[1] = blk[1]; kda->Blk[2] = blk[2];
+
+
+       printf("# allocating data for %d LB nodes with padding (%lu bytes = %f MiB for the single lattice)\n",
+              nCells,
+                  sizeof(PdfT) * nCells * N_D3Q19,
+              sizeof(PdfT) * nCells * N_D3Q19 / 1024.0 / 1024.0);
+
+#define PAGE_4K                4096
+
+       MemAllocAligned((void **)&pdfs[0], sizeof(PdfT) * nCells * N_D3Q19, PAGE_4K);
+
+       kd->Pdfs[0] = pdfs[0];
+       kd->Pdfs[1] = NULL;
+
+
+       // Initialize PDFs with some (arbitrary) data for correct NUMA placement.
+       // This depends on the chosen data layout.
+       // The structure of the loop should resemble the same "execution layout"
+       // as in the kernel!
+
+       int nThreads;
+#ifdef _OPENMP
+       nThreads = omp_get_max_threads();
+#endif
+
+#ifdef _OPENMP
+       #pragma omp parallel for \
+                               shared(gDims, pdfs, \
+                               oX, oY, oZ, lX, lY, lZ, blk, nThreads, ld)
+#endif
+       for (int i = 0; i < nThreads; ++i) {
+
+               int threadStartX = lX / nThreads * i;
+               int threadEndX   = lX / nThreads * (i + 1);
+
+               if (lX % nThreads > 0) {
+                       if (lX % nThreads > i) {
+                               threadStartX += i;
+                               threadEndX   += i + 1;
+                       }
+                       else {
+                               threadStartX += lX % nThreads;
+                               threadEndX   += lX % nThreads;
+                       }
+               }
+
+               for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) {
+               for (int bY = oY; bY < lY + oY; bY += blk[1]) {
+               for (int bZ = oZ; bZ < lZ + oZ; bZ += blk[2]) {
+
+                       int eX = MIN(bX + blk[0], threadEndX + oX);
+                       int eY = MIN(bY + blk[1], lY + oY);
+                       int eZ = MIN(bZ + blk[2], lZ + oZ);
+
+                       // printf("%d: %d-%d  %d-%d  %d-%d  %d - %d\n", omp_get_thread_num(), bZ, eZ, bY, eY, bX, eX, threadStartX, threadEndX);
+
+                       for (int x = bX; x < eX; ++x) {
+                       for (int y = bY; y < eY; ++y) {
+                       for (int z = bZ; z < eZ; ++z) {
+
+                               for (int d = 0; d < N_D3Q19; ++d) {
+                                       pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = -50.0;
+                               }
+
+                       } } } // x, y, z
+               } } } // bx, by, bz
+       }
+
+
+       // Initialize all PDFs to some standard value.
+       for (int x = oX; x < lX + oX; ++x) {
+       for (int y = oY; y < lY + oY; ++y) {
+       for (int z = oZ; z < lZ + oZ; ++z) {
+               for (int d = 0; d < N_D3Q19; ++d) {
+                       pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = 0.0;
+               }
+       } } } // x, y, z
+
+
+       // Count how many *PDFs* need bounce back treatment.
+
+       uint64_t nPdfs = ((uint64_t)19) * gX * gY * gZ;
+
+       if (nPdfs > ((2LU << 31) - 1)) {
+               printf("ERROR: number of PDFs exceed 2^31.\n");
+               exit(1);
+       }
+
+       // Compiler bug? Incorrect computation of nBounceBackPdfs when using icc 15.0.2.
+       // Works when declaring nBounceBackPdfs as int64_t or using volatile.
+       volatile int nBounceBackPdfs = 0;
+       // int64_t nBounceBackPdfs = 0;
+       int nx, ny, nz, px, py, pz;
+
+
+       for (int x = 0; x < lX; ++x) {
+               for (int y = 0; y < lY; ++y) {
+                       for (int z = 0; z < lZ; ++z) {
+
+                               if (ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) {
+                                       for (int d = 0; d < N_D3Q19; ++d) {
+                                               nx = x - D3Q19_X[d];
+                                               ny = y - D3Q19_Y[d];
+                                               nz = z - D3Q19_Z[d];
+
+                                               // Check if neighbor is inside the lattice.
+                                               // if(nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) {
+                                               //      continue;
+                                               // }
+                                               if ((nx < 0 || nx >= lX) && ld->PeriodicX) {
+                                                       ++nBounceBackPdfs; // Compiler bug --> see above
+                                               }
+                                               else if ((ny < 0 || ny >= lY) && ld->PeriodicY) {
+                                                       ++nBounceBackPdfs; // Compiler bug --> see above
+                                               }
+                                               else if ((nz < 0 || nz >= lZ) && ld->PeriodicZ) {
+                                                       ++nBounceBackPdfs; // Compiler bug --> see above
+                                               }
+                                               else if (nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) {
+                                                       continue;
+                                               }
+                                               else if (ld->Lattice[L_INDEX_4(lDims, nx, ny, nz)] == LAT_CELL_OBSTACLE) {
+                                                       ++nBounceBackPdfs; // Compiler bug --> see above
+                                               }
+                                       }
+                               }
+                       }
+               }
+       }
+
+       printf("# allocating %d indices for bounce back pdfs (%s for source and destination array)\n", nBounceBackPdfs, ByteToHuman(sizeof(int) * nBounceBackPdfs * 2));
+
+       MemAlloc((void **) & (kd->BounceBackPdfsSrc), sizeof(int) * nBounceBackPdfs + 100);
+       MemAlloc((void **) & (kd->BounceBackPdfsDst), sizeof(int) * nBounceBackPdfs + 100);
+
+       kd->nBounceBackPdfs = nBounceBackPdfs;
+       nBounceBackPdfs = 0;
+
+       int srcIndex;
+       int dstIndex;
+
+       // TODO: currently this is not NUMA-aware
+    //       - maybe use the same blocking as for lattice initialization?
+       //       - do place the bounce back index vector parallel?
+
+       for (int x = 0; x < lX; ++x) {
+               for (int y = 0; y < lY; ++y) {
+                       for (int z = 0; z < lZ; ++z) {
+
+                               if (ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) {
+                                       for (int d = 0; d < N_D3Q19; ++d) {
+                                               nx = x + D3Q19_X[d];
+                                               ny = y + D3Q19_Y[d];
+                                               nz = z + D3Q19_Z[d];
+
+                                               if (    ((nx < 0 || nx >= lX) && ld->PeriodicX) ||
+                                                               ((ny < 0 || ny >= lY) && ld->PeriodicY) ||
+                                                               ((nz < 0 || nz >= lZ) && ld->PeriodicZ)
+                                               ){
+                                                       // For periodicity:
+
+                                                       // We assume we have finished odd time step (accessing neighbor PDFs) and are
+                                                       // before executing the even time step (accessing local PDFs only).
+
+                                                       // Assuming we are at the most east position of the lattice. Through the odd
+                                                       // time step propagation has put a PDF in the east slot of the ghost cell east
+                                                       // of us, i.e. nx, ny, nz.  We copy it to the east slot of the most west node.
+
+                                                       // In case of transition from even to odd time step , src and dest must be
+                                                       // exchanged.
+
+
+                                                       // x periodic
+                                                       if (nx < 0) {
+                                                               px = lX - 1;
+                                                       }
+                                                       else if (nx >= lX) {
+                                                               px = 0;
+                                                       } else {
+                                                               px = nx;
+                                                       }
+
+                                                       // y periodic
+                                                       if (ny < 0) {
+                                                               py = lY - 1;
+                                                       }
+                                                       else if (ny >= lY) {
+                                                               py = 0;
+                                                       } else {
+                                                               py = ny;
+                                                       }
+
+                                                       // z periodic
+                                                       if (nz < 0) {
+                                                               pz = lZ - 1;
+                                                       }
+                                                       else if (nz >= lZ) {
+                                                               pz = 0;
+                                                       } else {
+                                                               pz = nz;
+                                                       }
+
+                                                       if (ld->Lattice[L_INDEX_4(lDims, px, py, pz)] == LAT_CELL_OBSTACLE) {
+                                                               // See description of bounce back handling below.
+                                                               srcIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, d);
+                                                               dstIndex = P_INDEX_5(gDims,  x + oX,  y + oY,  z + oZ, D3Q19_INV[d]);
+                                                       }
+                                                       else {
+
+                                                               srcIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, d);
+                                                               // Put it on the other side back into the domain.
+                                                               dstIndex = P_INDEX_5(gDims, px + oX, py + oY, pz + oZ, d);
+
+                                                               VerifyMsg(nBounceBackPdfs < kd->nBounceBackPdfs, "nBBPdfs %d < kd->nBBPdfs %d  xyz: %d %d %d d: %d\n", nBounceBackPdfs, kd->nBounceBackPdfs, x, y, z, d);
+
+                                                       }
+
+                                                       kd->BounceBackPdfsSrc[nBounceBackPdfs] = srcIndex;
+                                                       kd->BounceBackPdfsDst[nBounceBackPdfs] = dstIndex;
+
+                                                       ++nBounceBackPdfs;
+
+                                               }
+                                               else if (nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) {
+                                                       continue;
+                                               }
+                                               else if (ld->Lattice[L_INDEX_4(lDims, nx, ny, nz)] == LAT_CELL_OBSTACLE) {
+                                                       // Depending on the time step we are in we have to exchange src and dst index.
+
+                                                       // We build the list for the case, when we have finished odd time step
+                                                       // (accessing neighbor PDFs) and before we start with the even time step
+                                                       // (accessing local PDFs only).
+
+                                                       // Assume our neighbor east of us, i.e. nx, ny, nz, is an obstacle cell.
+                                                       // Then we have to move the east PDF from the obstacle to our west position,
+                                                       // i.e. the inverse of east.
+
+                                                       // In case of transition from even to odd time step src and dest just
+                                                       // have to be exchanged.
+
+                                                       srcIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, d);
+                                                       dstIndex = P_INDEX_5(gDims,  x + oX,  y + oY,  z + oZ, D3Q19_INV[d]);
+
+                                                       VerifyMsg(nBounceBackPdfs < kd->nBounceBackPdfs, "nBBPdfs %d < kd->nBBPdfs %d  xyz: %d %d %d d: %d\n", nBounceBackPdfs, kd->nBounceBackPdfs, x, y, z, d);
+
+                                                       kd->BounceBackPdfsSrc[nBounceBackPdfs] = srcIndex;
+                                                       kd->BounceBackPdfsDst[nBounceBackPdfs] = dstIndex;
+
+                                                       ++nBounceBackPdfs;
+                                               }
+                                       }
+                               }
+                       }
+               }
+       }
+
+
+       // Fill remaining KernelData structures
+       kd->GetNode = FNAME(GetNode);
+       kd->SetNode = FNAME(SetNode);
+
+       kd->BoundaryConditionsGetPdf = FNAME(BcGetPdf);
+       kd->BoundaryConditionsSetPdf = FNAME(BcSetPdf);
+
+       kd->Kernel = FNAME(D3Q19AaVecKernel);
+
+       kd->DstPdfs = NULL;
+       kd->PdfsActive = kd->Pdfs[0];
+
+       return;
+}
+
+void FNAME(D3Q19AaVecDeinit)(LatticeDesc * ld, KernelData ** kernelData)
+{
+       MemFree((void **) & ((*kernelData)->Pdfs[0]));
+
+       MemFree((void **) & ((*kernelData)->BounceBackPdfsSrc));
+       MemFree((void **) & ((*kernelData)->BounceBackPdfsDst));
+
+       MemFree((void **)kernelData);
+
+       return;
+}
+
diff --git a/src/BenchKernelD3Q19AaVecCommon.h b/src/BenchKernelD3Q19AaVecCommon.h
new file mode 100644 (file)
index 0000000..d416d07
--- /dev/null
@@ -0,0 +1,93 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#ifndef __BENCH_KERNEL_D3Q19_AA_VEC_COMMON_H__
+#define __BENCH_KERNEL_D3Q19_AA_VEC_COMMON_H__
+
+// #include "BenchKernelD3Q19AaCommon.h"
+
+#include "Kernel.h"
+
+typedef struct KernelDataAa_
+{
+       KernelData kd;
+       int Blk[3];                     // Blocking in X, Y, and Z direction, value of 0 disables blocking.
+       int Iteration;          // Current iteration number.
+} KernelDataAa;
+
+// Macro for casting KernelData * to KernelDataAa *.
+#define KDA(_x_)       ((KernelDataAa *)(_x_))
+
+// Build a function name extended by the propagation model name and the data layout.
+// FNANEM(test) will be expanded to test_PushSoA if DATA_LAYOUT_NAME is defined
+// as SoA and PROP_MODEL is defined as Push.
+#define FNAME(functionName)    JOIN(JOIN(functionName,_),JOIN(PROP_MODEL_NAME,DATA_LAYOUT_NAME))
+
+#ifndef DATA_LAYOUT_SOA
+       #error Only DATA_LAYOUT_SOA is supported.
+#endif
+
+#ifndef PROP_MODEL_AA
+       #error Only PROP_MODEL_AA is supported.
+#endif
+
+// -----------------------------------------------------------------------
+// Index function for accesssing PDF array for different data layouts.
+
+#define P_INDEX_5              FNAME(PINDEX5)
+
+static inline int FNAME(PINDEX5)(int dims[3], int x, int y, int z, int d)
+{
+       Assert(dims[0] > 0);
+       Assert(dims[1] > 0);
+       Assert(dims[2] > 0);
+
+       Assert(x >= 0);
+       Assert(x < dims[0]);
+       Assert(y >= 0);
+       Assert(y < dims[1]);
+       Assert(z >= 0);
+       Assert(z < dims[2]);
+       Assert(d >= 0);
+#ifdef D3Q19
+       Assert(d < N_D3Q19);
+#else
+       #error Not implemented for this discretization.
+#endif
+
+#ifdef DATA_LAYOUT_SOA
+       return d * dims[0] * dims[1] * dims[2] + x * dims[1] * dims[2] + y * dims[2] + z;
+// #elif DATA_LAYOUT_AOS
+//     return x * dims[1] * dims[2] * N_D3Q19 + y * dims[2] * N_D3Q19 + z * N_D3Q19 + d;
+#else
+       #error P_INDEX_5 function no implemented for chosen data layout.
+#endif
+}
+
+
+
+#endif // __BENCH_KERNEL_D3Q19_AA_VEC_COMMON_H__
+
index e697bc8a573a2d5fb646a0686ed2157805c829f8..8f756493fe62d744c3cdee663dcd2bb5dad744f9 100644 (file)
@@ -32,6 +32,9 @@
 #include <inttypes.h>
 #include <math.h>
 
+#ifdef _OPENMP
+       #include <omp.h>
+#endif
 
 // Forward definition.
 void FNAME(D3Q19Kernel)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd);
@@ -224,8 +227,8 @@ static void ParseParameters(Parameters * params, int * blk)
 
                        int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 
-                       if (tmp <= 0) {
-                               printf("ERROR: blocking parameter must be > 0.\n");
+                       if (tmp < 0) {
+                               printf("ERROR: blocking parameter must be >= 0.\n");
                                exit(1);
                        }
 
@@ -236,8 +239,8 @@ static void ParseParameters(Parameters * params, int * blk)
 
                        int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 
-                       if (tmp <= 0) {
-                               printf("ERROR: blocking parameter must be > 0.\n");
+                       if (tmp < 0) {
+                               printf("ERROR: blocking parameter must be >= 0.\n");
                                exit(1);
                        }
 
@@ -248,8 +251,8 @@ static void ParseParameters(Parameters * params, int * blk)
 
                        int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 
-                       if (tmp <= 0) {
-                               printf("ERROR: blocking parameter must be > 0.\n");
+                       if (tmp < 0) {
+                               printf("ERROR: blocking parameter must be >= 0.\n");
                                exit(1);
                        }
 
@@ -260,8 +263,8 @@ static void ParseParameters(Parameters * params, int * blk)
 
                        int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 
-                       if (tmp <= 0) {
-                               printf("ERROR: blocking parameter must be > 0.\n");
+                       if (tmp < 0) {
+                               printf("ERROR: blocking parameter must be >= 0.\n");
                                exit(1);
                        }
 
@@ -285,7 +288,7 @@ static void ParseParameters(Parameters * params, int * blk)
 }
 
 
-void FNAME(D3Q19BlkInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
+static void D3Q19InternalInit(int blocking, LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
 {
        KernelDataEx * kdex = NULL;
        MemAlloc((void **)&kdex, sizeof(KernelDataEx));
@@ -332,7 +335,17 @@ void FNAME(D3Q19BlkInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters
 
        PdfT * pdfs[2];
 
-       ParseParameters(params, blk);
+       if (blocking) {
+               ParseParameters(params, blk);
+       }
+       else {
+               if (params->nKernelArgs != 0) {
+                       printf("ERROR: unknown kernel parameter.\n");
+                       printf("This kernels accepts no parameters.\n");
+                       exit(1);
+               }
+       }
+
 
        if (blk[0] == 0) blk[0] = gX;
        if (blk[1] == 0) blk[1] = gY;
@@ -348,8 +361,13 @@ void FNAME(D3Q19BlkInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters
               nCells, 2 * sizeof(PdfT) * nCells * N_D3Q19,
               2 * sizeof(PdfT) * nCells * N_D3Q19 / 1024.0 / 1024.0);
 
-       MemAlloc((void **)&pdfs[0], sizeof(PdfT) * nCells * N_D3Q19);
-       MemAlloc((void **)&pdfs[1], sizeof(PdfT) * nCells * N_D3Q19);
+       // MemAlloc((void **)&pdfs[0], sizeof(PdfT) * nCells * N_D3Q19);
+       // MemAlloc((void **)&pdfs[1], sizeof(PdfT) * nCells * N_D3Q19);
+       MemAllocAligned((void **)&pdfs[0], sizeof(PdfT) * nCells * N_D3Q19, 2 * 1024 * 1024);
+       MemAllocAligned((void **)&pdfs[1], sizeof(PdfT) * nCells * N_D3Q19, 2 * 1024 * 1024);
+
+       printf("# pdfs[0] = %p\n", pdfs[0]);
+       printf("# pdfs[1] = %p\n", pdfs[1]);
 
        kd->Pdfs[0] = pdfs[0];
        kd->Pdfs[1] = pdfs[1];
@@ -358,39 +376,82 @@ void FNAME(D3Q19BlkInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters
        // This depends on the chosen data layout.
        // The structure of the loop should resemble the same "execution layout"
        // as in the kernel!
-#ifdef _OPENMP
-       #pragma omp parallel for collapse(3)
-#endif
 
-       for (int bZ = 0; bZ < gZ; bZ += blk[2]) {
-               for (int bY = 0; bY < gY; bY += blk[1]) {
-                       for (int bX = 0; bX < gX; bX += blk[0]) {
+       if (blocking) {
+               int nX = ld->Dims[0];
+               int nY = ld->Dims[1];
+               int nZ = ld->Dims[2];
 
-                               // Must do everything here, else it would break collapse.
-                               int eZ = MIN(bZ + blk[2], gZ);
-                               int eY = MIN(bY + blk[1], gY);
-                               int eX = MIN(bX + blk[0], gX);
+               int nThreads = 1;
 
-                               for (int z = bZ; z < eZ; ++z) {
-                                       for (int y = bY; y < eY; ++y) {
-                                               for (int x = bX; x < eX; ++x) {
+               #ifdef _OPENMP
+               nThreads = omp_get_max_threads();
+               #endif
 
-                                                       for (int d = 0; d < N_D3Q19; ++d) {
-                                                               pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = 1.0;
-                                                               pdfs[1][P_INDEX_5(gDims, x, y, z, d)] = 1.0;
-                                                       }
+               #ifdef _OPENMP
+                       #pragma omp parallel for default(none) \
+                               shared(pdfs, gDims, nX, nY, nZ, oX, oY, oZ, blk, nThreads)
+               #endif
+               for (int i = 0; i < nThreads; ++i) {
+
+                       int threadStartX = nX / nThreads * i;
+                       int threadEndX   = nX / nThreads * (i + 1);
 
+                       if (nX % nThreads > 0) {
+                               if (nX % nThreads > i) {
+                                       threadStartX += i;
+                                       threadEndX   += i + 1;
+                               }
+                               else {
+                                       threadStartX += nX % nThreads;
+                                       threadEndX   += nX % nThreads;
+                               }
+                       }
+
+                       for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) {
+                               for (int bY = oY; bY < nY + oY; bY += blk[1]) {
+                                       for (int bZ = oZ; bZ < nZ + oZ; bZ += blk[2]) {
+
+                                               int eZ = MIN(bZ + blk[2], nZ + oZ);
+                                               int eY = MIN(bY + blk[1], nY + oY);
+                                               int eX = MIN(bX + blk[0], threadEndX + oX);
+
+                                               for (int x = bX; x < eX; ++x) {
+                                                       for (int y = bY; y < eY; ++y) {
+                                                               for (int z = bZ; z < eZ; ++z) {
+                                                                       for (int d = 0; d < N_D3Q19; ++d) {
+                                                                               pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = 1.0;
+                                                                               pdfs[1][P_INDEX_5(gDims, x, y, z, d)] = 1.0;
+                                                                       }
+                                                               }
+                                                       }
                                                }
                                        }
                                }
                        }
                }
+
+       }
+       else {
+               #ifdef _OPENMP
+                       #pragma omp parallel for collapse(3)
+               #endif
+               for (int x = 0; x < gX; ++x) {
+                       for (int y = 0; y < gY; ++y) {
+                               for (int z = 0; z < gZ; ++z) {
+                                       for (int d = 0; d < N_D3Q19; ++d) {
+                                               pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = 1.0;
+                                               pdfs[1][P_INDEX_5(gDims, x, y, z, d)] = 1.0;
+                                       }
+                               }
+                       }
+               }
        }
 
        // Initialize all PDFs to some standard value.
-       for (int z = 0; z < gZ; ++z) {
+       for (int x = 0; x < gX; ++x) {
                for (int y = 0; y < gY; ++y) {
-                       for (int x = 0; x < gX; ++x) {
+                       for (int z = 0; z < gZ; ++z) {
                                for (int d = 0; d < N_D3Q19; ++d) {
                                        pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = 0.0;
                                        pdfs[1][P_INDEX_5(gDims, x, y, z, d)] = 0.0;
@@ -417,9 +478,9 @@ void FNAME(D3Q19BlkInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters
 
        // TODO: apply blocking?
 
-       for (int z = 0; z < lZ; ++z) {
+       for (int x = 0; x < lX; ++x) {
                for (int y = 0; y < lY; ++y) {
-                       for (int x = 0; x < lX; ++x) {
+                       for (int z = 0; z < lZ; ++z) {
 
                                if (ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) {
                                        for (int d = 0; d < N_D3Q19; ++d) {
@@ -471,9 +532,9 @@ void FNAME(D3Q19BlkInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters
        int srcIndex;
        int dstIndex;
 
-       for (int z = 0; z < lZ; ++z) {
+       for (int x = 0; x < lX; ++x) {
                for (int y = 0; y < lY; ++y) {
-                       for (int x = 0; x < lX; ++x) {
+                       for (int z = 0; z < lZ; ++z) {
 
                                if (ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) {
                                        for (int d = 0; d < N_D3Q19; ++d) {
@@ -598,7 +659,12 @@ void FNAME(D3Q19BlkInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters
        kd->BoundaryConditionsGetPdf = FNAME(BcGetPdf);
        kd->BoundaryConditionsSetPdf = FNAME(BcSetPdf);
 
-       kd->Kernel = FNAME(D3Q19BlkKernel);
+       if (blocking) {
+               kd->Kernel = FNAME(D3Q19BlkKernel);
+       }
+       else {
+               kd->Kernel = FNAME(D3Q19Kernel);
+       }
 
        kd->DstPdfs = NULL;
        kd->PdfsActive = kd->Pdfs[0];
@@ -606,6 +672,13 @@ void FNAME(D3Q19BlkInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters
        return;
 }
 
+
+void FNAME(D3Q19BlkInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
+{
+       D3Q19InternalInit(1, ld, kernelData, params);
+}
+
+
 void FNAME(D3Q19BlkDeinit)(LatticeDesc * ld, KernelData ** kernelData)
 {
        MemFree((void **) & ((*kernelData)->Pdfs[0]));
@@ -624,27 +697,7 @@ void FNAME(D3Q19BlkDeinit)(LatticeDesc * ld, KernelData ** kernelData)
 
 void FNAME(D3Q19Init)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
 {
-       Parameters p;
-
-       if (params->nKernelArgs != 0) {
-               printf("ERROR: unknown kernel parameter.\n");
-               printf("This kernels accepts no parameters.\n");
-               exit(1);
-       }
-
-       // Setup an empty parameters structure.
-       p.nArgs        = params->nArgs;
-       p.Args         = params->Args;
-       p.nKernelArgs  = 0;
-       p.KernelArgs   = NULL;
-
-       // Call init routine for blocking kernel and override the
-       // kernel function to be called later on.
-       FNAME(D3Q19BlkInit)(ld, kernelData, &p);
-
-       (*kernelData)->Kernel = FNAME(D3Q19Kernel);
-
-       return;
+       D3Q19InternalInit(0, ld, kernelData, params);
 
 }
 
index cceddca1f40362f8ca930b55cc75fb734ee7eee2..268bfed0ebfb0584c174f5ff26a3bd2bc78dfe71 100644 (file)
@@ -74,9 +74,9 @@ static inline int FNAME(PINDEX5)(int dims[3], int x, int y, int z, int d)
 #endif
 
 #ifdef DATA_LAYOUT_SOA
-       return d * dims[0] * dims[1] * dims[2] + z * dims[0] * dims[1] + y * dims[0] + x;
+       return d * dims[0] * dims[1] * dims[2] + x * dims[1] * dims[2] + y * dims[2] + z;
 #elif DATA_LAYOUT_AOS
-       return z * dims[0] * dims[1] * N_D3Q19 + y * dims[0] * N_D3Q19 + x * N_D3Q19 + d;
+       return x * dims[1] * dims[2] * N_D3Q19 + y * dims[2] * N_D3Q19 + z * N_D3Q19 + d;
 #else
 #error P_INDEX_5 function no implemented for chosen data layout.
 #endif
index 7922b7adda23242276ebe2e27147cf84074849c0..e01853aab7cf668a25f5fce990120c8d6e6c73c0 100644 (file)
@@ -28,6 +28,7 @@
 
 #include "Memory.h"
 #include "Vtk.h"
+#include "LikwidIf.h"
 
 #include <inttypes.h>
 #include <math.h>
@@ -97,6 +98,7 @@ void FNAME(D3Q19ListKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData
        for(int iter = 0; iter < maxIterations; ++iter) {
 
 
+       X_LIKWID_START("list-os");
 
        #ifdef _OPENMP
                #pragma omp parallel for default(none) \
@@ -307,6 +309,8 @@ void FNAME(D3Q19ListKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData
                        #undef I
                } // loop over fluid nodes
 
+               X_LIKWID_STOP("list-os");
+
                #ifdef VERIFICATION
                        kd->PdfsActive = dst;
                        KernelAddBodyForce(kd, ld, cd);
index d2bcb5270f92b0ce748db910c5e260d5d9a336f3..c1e48a5aec8b0e053fb93f9323f3b6070791c003 100644 (file)
@@ -28,6 +28,7 @@
 
 #include "Memory.h"
 #include "Vtk.h"
+#include "Padding.h"
 
 #include <math.h>
 
@@ -222,15 +223,18 @@ static void ParameterUsage()
 {
        printf("Kernel parameters:\n");
        printf("  [-blk <n>] [-blk-[xyz] <n>]\n");
-
+#ifdef DATA_LAYOUT_SOA
+       printf("  [-pad auto|modulus_1+offset_1(,modulus_n+offset_n)*]\n");
+#endif
        return;
 }
 
-static void ParseParameters(Parameters * params, int * blk)
+static void ParseParameters(Parameters * params, int * blk, PadInfo ** padInfo)
 {
        Assert(blk != NULL);
 
        blk[0] = 0; blk[1] = 0; blk[2] = 0;
+       *padInfo = NULL;
 
        #define ARG_IS(param)                   (!strcmp(params->KernelArgs[i], param))
        #define NEXT_ARG_PRESENT() \
@@ -248,8 +252,8 @@ static void ParseParameters(Parameters * params, int * blk)
 
                        int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 
-                       if (tmp <= 0) {
-                               printf("ERROR: blocking parameter must be > 0.\n");
+                       if (tmp < 0) {
+                               printf("ERROR: blocking parameter must be >= 0.\n");
                                exit(1);
                        }
 
@@ -260,8 +264,8 @@ static void ParseParameters(Parameters * params, int * blk)
 
                        int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 
-                       if (tmp <= 0) {
-                               printf("ERROR: blocking parameter must be > 0.\n");
+                       if (tmp < 0) {
+                               printf("ERROR: blocking parameter must be >= 0.\n");
                                exit(1);
                        }
 
@@ -272,8 +276,8 @@ static void ParseParameters(Parameters * params, int * blk)
 
                        int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 
-                       if (tmp <= 0) {
-                               printf("ERROR: blocking parameter must be > 0.\n");
+                       if (tmp < 0) {
+                               printf("ERROR: blocking parameter must be >= 0.\n");
                                exit(1);
                        }
 
@@ -284,13 +288,20 @@ static void ParseParameters(Parameters * params, int * blk)
 
                        int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 
-                       if (tmp <= 0) {
-                               printf("ERROR: blocking parameter must be > 0.\n");
+                       if (tmp < 0) {
+                               printf("ERROR: blocking parameter must be >= 0.\n");
                                exit(1);
                        }
 
                        blk[2] = tmp;
                }
+#ifdef DATA_LAYOUT_SOA
+               else if (ARG_IS("-pad") || ARG_IS("--pad")) {
+                       NEXT_ARG_PRESENT();
+
+                       *padInfo = PadInfoFromStr(params->KernelArgs[++i]);
+               }
+#endif
                else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) {
                        ParameterUsage();
                        exit(1);
@@ -346,6 +357,11 @@ void FNAME(D3Q19ListAaInit)(LatticeDesc * ld, KernelData ** kernelData, Paramete
        kdl->nFluid = -1;
 #endif
 
+       int blk[3] = { 0 };
+       PadInfo * padInfo = NULL;
+
+       ParseParameters(params, blk, &padInfo);
+
        // Ajust the dimensions according to padding, if used.
        kd->Dims[0] = kd->GlobalDims[0] = ld->Dims[0];
        kd->Dims[1] = kd->GlobalDims[1] = ld->Dims[1];
@@ -358,18 +374,21 @@ void FNAME(D3Q19ListAaInit)(LatticeDesc * ld, KernelData ** kernelData, Paramete
        int lZ = lDims[2];
 
        int nTotalCells = lX * lY * lZ;
-       int nCells = ld->nFluid; // TODO: + padding
+       int nCells = ld->nFluid;
        int nFluid = ld->nFluid;
 
+#ifdef DATA_LAYOUT_SOA
+       {
+               nCells = PadCellsAndReport(nCells, sizeof(PdfT), &padInfo);
+               PadInfoFree(padInfo); padInfo = NULL;
+       }
+#endif
+
        kdl->nCells = nCells;
        kdl->nFluid = nFluid;
 
        PdfT * pdfs[2];
 
-       int blk[3] = { 0 };
-
-       ParseParameters(params, blk);
-
        if (blk[0] == 0) blk[0] = lX;
        if (blk[1] == 0) blk[1] = lY;
        if (blk[2] == 0) blk[2] = lZ;
@@ -443,18 +462,17 @@ void FNAME(D3Q19ListAaInit)(LatticeDesc * ld, KernelData ** kernelData, Paramete
 
        // Blocking is implemented via setup of the adjacency list. The kernel later will
        // walk through the lattice blocked automatically.
-       for (int bZ = 0; bZ < lZ; bZ += blk[2]) {
-       for (int bY = 0; bY < lY; bY += blk[1]) {
        for (int bX = 0; bX < lX; bX += blk[0]) {
+       for (int bY = 0; bY < lY; bY += blk[1]) {
+       for (int bZ = 0; bZ < lZ; bZ += blk[2]) {
 
                int eX = MIN(bX + blk[0], lX);
                int eY = MIN(bY + blk[1], lY);
                int eZ = MIN(bZ + blk[2], lZ);
 
-
-               for (int z = bZ; z < eZ; ++z) {
-               for (int y = bY; y < eY; ++y) {
                for (int x = bX; x < eX; ++x) {
+               for (int y = bY; y < eY; ++y) {
+               for (int z = bZ; z < eZ; ++z) {
 
                        latticeIndex = L_INDEX_4(lDims, x, y, z);
 
@@ -492,7 +510,15 @@ void FNAME(D3Q19ListAaInit)(LatticeDesc * ld, KernelData ** kernelData, Paramete
 
        // Loop over all fluid nodes and compute the indices to the neighboring
        // PDFs for configure data layout (AoS/SoA).
-       // TODO: Parallelized loop to ensure correct NUMA placement.
+       #ifdef _OPENMP
+               #pragma omp parallel for
+       #endif
+       for (int index = 0; index < nFluid; ++index) {
+               for (int d = 0; d < N_D3Q19_IDX; ++d) {
+                       adjList[index * N_D3Q19_IDX + d] = -1;
+               }
+       }
+
        // #ifdef _OPENMP --> add line continuation
        //      #pragma omp parallel for default(none)
        //              shared(nFluid, nCells, coords, D3Q19_INV, D3Q19_X, D3Q19_Y, D3Q19_Z,
index c5222524617a23fd1f7aedbe733458aa80ddfede..b1dee16a35dc5ab8c36e6a7cea8848edf282d0f3 100644 (file)
@@ -728,9 +728,9 @@ static void KernelOdd(LatticeDesc * ld, KernelData * kernelData, CaseData * cd)
                        //voddPart  = vomegaOdd  * (VONE_HALF * (vpdf_TS - vpdf_BN) - vui * vw_2_x3);
                        voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TS,vpdf_BN)),VMUL(vui,vw_2_x3)));
                        //src[ADJ_LIST(D3Q19_TS)] =[UA] vpdf_TS - vevenPart - voddPart;
-                       VSTU(ppdf_TN, VSUB(VSUB(vpdf_TS,vevenPart),voddPart));
+                       VSTU(ppdf_BN, VSUB(VSUB(vpdf_TS,vevenPart),voddPart));
                        //src[ADJ_LIST(D3Q19_BN)] =[UA] vpdf_BN - vevenPart + voddPart;
-                       VSTU(ppdf_BN, VADD(VSUB(vpdf_BN,vevenPart),voddPart));
+                       VSTU(ppdf_TS, VADD(VSUB(vpdf_BN,vevenPart),voddPart));
 
                        //vui = vuy + vuz;
                        vui = VADD(vuy,vuz);
@@ -783,20 +783,20 @@ static void KernelOdd(LatticeDesc * ld, KernelData * kernelData, CaseData * cd)
                        ui = uy;
                        evenPart = omegaEven * (0.5 * (pdf_N + pdf_S) - ui * ui * w_1_nine_half - w_1_indep);
                        oddPart  = omegaOdd  * (0.5 * (pdf_N - pdf_S) - ui * w_1_x3);
-                       src[ADJ_LIST(D3Q19_N)]  = pdf_N - evenPart - oddPart;
-                       src[ADJ_LIST(D3Q19_S)]  = pdf_S - evenPart + oddPart;
+                       *ppdf_S  = pdf_N - evenPart - oddPart;
+                       *ppdf_N  = pdf_S - evenPart + oddPart;
 
                        ui = ux;
                        evenPart = omegaEven * (0.5 * (pdf_E + pdf_W) - ui * ui * w_1_nine_half - w_1_indep);
                        oddPart  = omegaOdd  * (0.5 * (pdf_E - pdf_W) - ui * w_1_x3);
-                       src[ADJ_LIST(D3Q19_E)]  = pdf_E - evenPart - oddPart;
-                       src[ADJ_LIST(D3Q19_W)]  = pdf_W - evenPart + oddPart;
+                       *ppdf_W  = pdf_E - evenPart - oddPart;
+                       *ppdf_E  = pdf_W - evenPart + oddPart;
 
                        ui = uz;
                        evenPart = omegaEven * (0.5 * (pdf_T + pdf_B) - ui * ui * w_1_nine_half - w_1_indep);
                        oddPart  = omegaOdd  * (0.5 * (pdf_T - pdf_B) - ui * w_1_x3);
-                       src[ADJ_LIST(D3Q19_T)]  = pdf_T - evenPart - oddPart;
-                       src[ADJ_LIST(D3Q19_B)]  = pdf_B - evenPart + oddPart;
+                       *ppdf_B  = pdf_T - evenPart - oddPart;
+                       *ppdf_T  = pdf_B - evenPart + oddPart;
 
                        // direction: w_2
                        w_2_indep = w_2 * dir_indep_trm;
@@ -804,38 +804,38 @@ static void KernelOdd(LatticeDesc * ld, KernelData * kernelData, CaseData * cd)
                        ui = -ux + uy;
                        evenPart = omegaEven * (0.5 * (pdf_NW + pdf_SE) - ui * ui * w_2_nine_half - w_2_indep);
                        oddPart  = omegaOdd  * (0.5 * (pdf_NW - pdf_SE) - ui * w_2_x3);
-                       src[ADJ_LIST(D3Q19_NW)] = pdf_NW - evenPart - oddPart;
-                       src[ADJ_LIST(D3Q19_SE)] = pdf_SE - evenPart + oddPart;
+                       *ppdf_SE = pdf_NW - evenPart - oddPart;
+                       *ppdf_NW = pdf_SE - evenPart + oddPart;
 
                        ui = ux + uy;
                        evenPart = omegaEven * (0.5 * (pdf_NE + pdf_SW) - ui * ui * w_2_nine_half - w_2_indep);
                        oddPart  = omegaOdd  * (0.5 * (pdf_NE - pdf_SW) - ui * w_2_x3);
-                       src[ADJ_LIST(D3Q19_NE)] = pdf_NE - evenPart - oddPart;
-                       src[ADJ_LIST(D3Q19_SW)] = pdf_SW - evenPart + oddPart;
+                       *ppdf_SW = pdf_NE - evenPart - oddPart;
+                       *ppdf_NE = pdf_SW - evenPart + oddPart;
 
                        ui = -ux + uz;
                        evenPart = omegaEven * (0.5 * (pdf_TW + pdf_BE) - ui * ui * w_2_nine_half - w_2_indep);
                        oddPart  = omegaOdd  * (0.5 * (pdf_TW - pdf_BE) - ui * w_2_x3);
-                       src[ADJ_LIST(D3Q19_TW)] = pdf_TW - evenPart - oddPart;
-                       src[ADJ_LIST(D3Q19_BE)] = pdf_BE - evenPart + oddPart;
+                       *ppdf_BE = pdf_TW - evenPart - oddPart;
+                       *ppdf_TW = pdf_BE - evenPart + oddPart;
 
                        ui = ux + uz;
                        evenPart = omegaEven * (0.5 * (pdf_TE + pdf_BW) - ui * ui * w_2_nine_half - w_2_indep);
                        oddPart  = omegaOdd  * (0.5 * (pdf_TE - pdf_BW) - ui * w_2_x3);
-                       src[ADJ_LIST(D3Q19_TE)] = pdf_TE - evenPart - oddPart;
-                       src[ADJ_LIST(D3Q19_BW)] = pdf_BW - evenPart + oddPart;
+                       *ppdf_BW = pdf_TE - evenPart - oddPart;
+                       *ppdf_TE = pdf_BW - evenPart + oddPart;
 
                        ui = -uy + uz;
                        evenPart = omegaEven * (0.5 * (pdf_TS + pdf_BN) - ui * ui * w_2_nine_half - w_2_indep);
                        oddPart  = omegaOdd  * (0.5 * (pdf_TS - pdf_BN) - ui * w_2_x3);
-                       src[ADJ_LIST(D3Q19_TS)] = pdf_TS - evenPart - oddPart;
-                       src[ADJ_LIST(D3Q19_BN)] = pdf_BN - evenPart + oddPart;
+                       *ppdf_BN = pdf_TS - evenPart - oddPart;
+                       *ppdf_TS = pdf_BN - evenPart + oddPart;
 
                        ui = uy + uz;
                        evenPart = omegaEven * (0.5 * (pdf_TN + pdf_BS) - ui * ui * w_2_nine_half - w_2_indep);
                        oddPart  = omegaOdd  * (0.5 * (pdf_TN - pdf_BS) - ui * w_2_x3);
-                       src[ADJ_LIST(D3Q19_TN)] = pdf_TN - evenPart - oddPart;
-                       src[ADJ_LIST(D3Q19_BS)] = pdf_BS - evenPart + oddPart;
+                       *ppdf_BS = pdf_TN - evenPart - oddPart;
+                       *ppdf_TN = pdf_BS - evenPart + oddPart;
 
                        pointerOffset = 1;
                }
index 9cdf0e13ab057c297da8aa0053b5fe27d8445f91..7b479cfa62aea674796eb98a458d2b642af7a9b6 100644 (file)
@@ -28,6 +28,7 @@
 
 #include "Memory.h"
 #include "Vtk.h"
+#include "Padding.h"
 
 #include <math.h>
 
@@ -225,15 +226,19 @@ static void ParameterUsage()
 {
        printf("Kernel parameters:\n");
        printf("  [-blk <n>] [-blk-[xyz] <n>]\n");
+#ifdef DATA_LAYOUT_SOA
+       printf("  [-pad auto|modulus_1+offset_1(,modulus_n+offset_n)*]\n");
+#endif
 
        return;
 }
 
-static void ParseParameters(Parameters * params, int * blk)
+static void ParseParameters(Parameters * params, int * blk, PadInfo ** padInfo)
 {
        Assert(blk != NULL);
 
        blk[0] = 0; blk[1] = 0; blk[2] = 0;
+       *padInfo = NULL;
 
        #define ARG_IS(param)                   (!strcmp(params->KernelArgs[i], param))
        #define NEXT_ARG_PRESENT() \
@@ -251,8 +256,8 @@ static void ParseParameters(Parameters * params, int * blk)
 
                        int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 
-                       if (tmp <= 0) {
-                               printf("ERROR: blocking parameter must be > 0.\n");
+                       if (tmp < 0) {
+                               printf("ERROR: blocking parameter must be >= 0.\n");
                                exit(1);
                        }
 
@@ -263,8 +268,8 @@ static void ParseParameters(Parameters * params, int * blk)
 
                        int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 
-                       if (tmp <= 0) {
-                               printf("ERROR: blocking parameter must be > 0.\n");
+                       if (tmp < 0) {
+                               printf("ERROR: blocking parameter must be >= 0.\n");
                                exit(1);
                        }
 
@@ -275,8 +280,8 @@ static void ParseParameters(Parameters * params, int * blk)
 
                        int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 
-                       if (tmp <= 0) {
-                               printf("ERROR: blocking parameter must be > 0.\n");
+                       if (tmp < 0) {
+                               printf("ERROR: blocking parameter must be >= 0.\n");
                                exit(1);
                        }
 
@@ -287,13 +292,20 @@ static void ParseParameters(Parameters * params, int * blk)
 
                        int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 
-                       if (tmp <= 0) {
-                               printf("ERROR: blocking parameter must be > 0.\n");
+                       if (tmp < 0) {
+                               printf("ERROR: blocking parameter must be >= 0.\n");
                                exit(1);
                        }
 
                        blk[2] = tmp;
                }
+#ifdef DATA_LAYOUT_SOA
+               else if (ARG_IS("-pad") || ARG_IS("--pad")) {
+                       NEXT_ARG_PRESENT();
+
+                       *padInfo = PadInfoFromStr(params->KernelArgs[++i]);
+               }
+#endif
                else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) {
                        ParameterUsage();
                        exit(1);
@@ -502,6 +514,11 @@ void FNAME(D3Q19ListAaPvInit)(LatticeDesc * ld, KernelData ** kernelData, Parame
        kdlr->nConsecThreadIndices = 0;
 #endif
 
+       int blk[3] = { 0 };
+       PadInfo * padInfo = NULL;
+
+       ParseParameters(params, blk, &padInfo);
+
        // Ajust the dimensions according to padding, if used.
        kd->Dims[0] = kd->GlobalDims[0] = ld->Dims[0];
        kd->Dims[1] = kd->GlobalDims[1] = ld->Dims[1];
@@ -514,17 +531,21 @@ void FNAME(D3Q19ListAaPvInit)(LatticeDesc * ld, KernelData ** kernelData, Parame
        int lZ = lDims[2];
 
        int nTotalCells = lX * lY * lZ;
-       int nCells = ld->nFluid; // TODO: + padding
+       int nCells = ld->nFluid;
        int nFluid = ld->nFluid;
 
+#ifdef DATA_LAYOUT_SOA
+       {
+               nCells = PadCellsAndReport(nCells, sizeof(PdfT), &padInfo);
+               PadInfoFree(padInfo); padInfo = NULL;
+       }
+#endif
+
        kdl->nCells = nCells;
        kdl->nFluid = nFluid;
 
        PdfT * pdfs[2];
 
-       int blk[3] = { 0 };
-
-       ParseParameters(params, blk);
 
        if (blk[0] == 0) blk[0] = lX;
        if (blk[1] == 0) blk[1] = lY;
@@ -607,18 +628,18 @@ void FNAME(D3Q19ListAaPvInit)(LatticeDesc * ld, KernelData ** kernelData, Parame
 
        // Blocking is implemented via setup of the adjacency list. The kernel later will
        // walk through the lattice blocked automatically.
-       for (int bZ = 0; bZ < lZ; bZ += blk[2]) {
-       for (int bY = 0; bY < lY; bY += blk[1]) {
        for (int bX = 0; bX < lX; bX += blk[0]) {
+       for (int bY = 0; bY < lY; bY += blk[1]) {
+       for (int bZ = 0; bZ < lZ; bZ += blk[2]) {
 
                int eX = MIN(bX + blk[0], lX);
                int eY = MIN(bY + blk[1], lY);
                int eZ = MIN(bZ + blk[2], lZ);
 
 
-               for (int z = bZ; z < eZ; ++z) {
-               for (int y = bY; y < eY; ++y) {
                for (int x = bX; x < eX; ++x) {
+               for (int y = bY; y < eY; ++y) {
+               for (int z = bZ; z < eZ; ++z) {
 
                        latticeIndex = L_INDEX_4(lDims, x, y, z);
 
@@ -660,7 +681,15 @@ void FNAME(D3Q19ListAaPvInit)(LatticeDesc * ld, KernelData ** kernelData, Parame
 
        // Loop over all fluid nodes and compute the indices to the neighboring
        // PDFs for configured data layout (AoS/SoA).
-       // TODO: Parallelized loop to ensure correct NUMA placement.
+       #ifdef _OPENMP
+               #pragma omp parallel for
+       #endif
+       for (int index = 0; index < nFluid; ++index) {
+               for (int d = 0; d < N_D3Q19_IDX; ++d) {
+                       adjList[index * N_D3Q19_IDX + d] = -1;
+               }
+       }
+
        // #ifdef _OPENMP  --> add line continuation
        //      #pragma omp parallel for default(none)
        //              shared(nFluid, nCells, coords, D3Q19_INV, D3Q19_X, D3Q19_Y, D3Q19_Z,
index 7faf37b581a60e7333c7065c83cdd5c3723224e5..c6ba13a5ce386b8640c1693cc814b4209655ed5b 100644 (file)
@@ -28,6 +28,7 @@
 
 #include "Memory.h"
 #include "Vtk.h"
+#include "Padding.h"
 
 #include <math.h>
 
@@ -225,15 +226,19 @@ static void ParameterUsage()
 {
        printf("Kernel parameters:\n");
        printf("  [-blk <n>] [-blk-[xyz] <n>]\n");
+#ifdef DATA_LAYOUT_SOA
+       printf("  [-pad auto|modulus_1+offset_1(,modulus_n+offset_n)*]\n");
+#endif
 
        return;
 }
 
-static void ParseParameters(Parameters * params, int * blk)
+static void ParseParameters(Parameters * params, int * blk, PadInfo ** padInfo)
 {
        Assert(blk != NULL);
 
        blk[0] = 0; blk[1] = 0; blk[2] = 0;
+       *padInfo = NULL;
 
        #define ARG_IS(param)                   (!strcmp(params->KernelArgs[i], param))
        #define NEXT_ARG_PRESENT() \
@@ -251,8 +256,8 @@ static void ParseParameters(Parameters * params, int * blk)
 
                        int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 
-                       if (tmp <= 0) {
-                               printf("ERROR: blocking parameter must be > 0.\n");
+                       if (tmp < 0) {
+                               printf("ERROR: blocking parameter must be >= 0.\n");
                                exit(1);
                        }
 
@@ -263,8 +268,8 @@ static void ParseParameters(Parameters * params, int * blk)
 
                        int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 
-                       if (tmp <= 0) {
-                               printf("ERROR: blocking parameter must be > 0.\n");
+                       if (tmp < 0) {
+                               printf("ERROR: blocking parameter must be >= 0.\n");
                                exit(1);
                        }
 
@@ -275,8 +280,8 @@ static void ParseParameters(Parameters * params, int * blk)
 
                        int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 
-                       if (tmp <= 0) {
-                               printf("ERROR: blocking parameter must be > 0.\n");
+                       if (tmp < 0) {
+                               printf("ERROR: blocking parameter must be >= 0.\n");
                                exit(1);
                        }
 
@@ -287,13 +292,20 @@ static void ParseParameters(Parameters * params, int * blk)
 
                        int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 
-                       if (tmp <= 0) {
-                               printf("ERROR: blocking parameter must be > 0.\n");
+                       if (tmp < 0) {
+                               printf("ERROR: blocking parameter must be >= 0.\n");
                                exit(1);
                        }
 
                        blk[2] = tmp;
                }
+#ifdef DATA_LAYOUT_SOA
+               else if (ARG_IS("-pad") || ARG_IS("--pad")) {
+                       NEXT_ARG_PRESENT();
+
+                       *padInfo = PadInfoFromStr(params->KernelArgs[++i]);
+               }
+#endif
                else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) {
                        ParameterUsage();
                        exit(1);
@@ -485,6 +497,11 @@ void FNAME(D3Q19ListAaRiaInit)(LatticeDesc * ld, KernelData ** kernelData, Param
        kdlr->nConsecThreadIndices = 0;
 #endif
 
+       int blk[3] = { 0 };
+       PadInfo * padInfo = NULL;
+
+       ParseParameters(params, blk, &padInfo);
+
        // Ajust the dimensions according to padding, if used.
        kd->Dims[0] = kd->GlobalDims[0] = ld->Dims[0];
        kd->Dims[1] = kd->GlobalDims[1] = ld->Dims[1];
@@ -500,14 +517,18 @@ void FNAME(D3Q19ListAaRiaInit)(LatticeDesc * ld, KernelData ** kernelData, Param
        int nCells = ld->nFluid; // TODO: + padding
        int nFluid = ld->nFluid;
 
+#ifdef DATA_LAYOUT_SOA
+       {
+               nCells = PadCellsAndReport(nCells, sizeof(PdfT), &padInfo);
+               PadInfoFree(padInfo); padInfo = NULL;
+       }
+#endif
+
        kdl->nCells = nCells;
        kdl->nFluid = nFluid;
 
        PdfT * pdfs[2];
 
-       int blk[3] = { 0 };
-
-       ParseParameters(params, blk);
 
        if (blk[0] == 0) blk[0] = lX;
        if (blk[1] == 0) blk[1] = lY;
@@ -590,18 +611,17 @@ void FNAME(D3Q19ListAaRiaInit)(LatticeDesc * ld, KernelData ** kernelData, Param
 
        // Blocking is implemented via setup of the adjacency list. The kernel later will
        // walk through the lattice blocked automatically.
-       for (int bZ = 0; bZ < lZ; bZ += blk[2]) {
-       for (int bY = 0; bY < lY; bY += blk[1]) {
        for (int bX = 0; bX < lX; bX += blk[0]) {
+       for (int bY = 0; bY < lY; bY += blk[1]) {
+       for (int bZ = 0; bZ < lZ; bZ += blk[2]) {
 
                int eX = MIN(bX + blk[0], lX);
                int eY = MIN(bY + blk[1], lY);
                int eZ = MIN(bZ + blk[2], lZ);
 
-
-               for (int z = bZ; z < eZ; ++z) {
-               for (int y = bY; y < eY; ++y) {
                for (int x = bX; x < eX; ++x) {
+               for (int y = bY; y < eY; ++y) {
+               for (int z = bZ; z < eZ; ++z) {
 
                        latticeIndex = L_INDEX_4(lDims, x, y, z);
 
@@ -643,7 +663,15 @@ void FNAME(D3Q19ListAaRiaInit)(LatticeDesc * ld, KernelData ** kernelData, Param
 
        // Loop over all fluid nodes and compute the indices to the neighboring
        // PDFs for configured data layout (AoS/SoA).
-       // TODO: Parallelized loop to ensure correct NUMA placement.
+       #ifdef _OPENMP
+               #pragma omp parallel for
+       #endif
+       for (int index = 0; index < nFluid; ++index) {
+               for (int d = 0; d < N_D3Q19_IDX; ++d) {
+                       adjList[index * N_D3Q19_IDX + d] = -1;
+               }
+       }
+
        // #ifdef _OPENMP  --> add line continuation
        //      #pragma omp parallel for default(none)
        //              shared(nFluid, nCells, coords, D3Q19_INV, D3Q19_X, D3Q19_Y, D3Q19_Z,
index 4f97bae167bdf4b12c2b907d6b4d70fdba30b482..f66e379a8c8d699d8bb598b1a59f2e4b56d28f4e 100644 (file)
@@ -28,6 +28,7 @@
 
 #include "Memory.h"
 #include "Vtk.h"
+#include "Padding.h"
 
 #include <math.h>
 
@@ -200,15 +201,19 @@ static void ParameterUsage()
 {
        printf("Kernel parameters:\n");
        printf("  [-blk <n>] [-blk-[xyz] <n>]\n");
+#ifdef DATA_LAYOUT_SOA
+       printf("  [-pad auto|modulus_1+offset_1(,modulus_n+offset_n)*]\n");
+#endif
 
        return;
 }
 
-static void ParseParameters(Parameters * params, int * blk)
+static void ParseParameters(Parameters * params, int * blk, PadInfo ** padInfo)
 {
        Assert(blk != NULL);
 
        blk[0] = 0; blk[1] = 0; blk[2] = 0;
+       *padInfo = NULL;
 
        #define ARG_IS(param)                   (!strcmp(params->KernelArgs[i], param))
        #define NEXT_ARG_PRESENT() \
@@ -226,8 +231,8 @@ static void ParseParameters(Parameters * params, int * blk)
 
                        int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 
-                       if (tmp <= 0) {
-                               printf("ERROR: blocking parameter must be > 0.\n");
+                       if (tmp < 0) {
+                               printf("ERROR: blocking parameter must be >= 0.\n");
                                exit(1);
                        }
 
@@ -238,8 +243,8 @@ static void ParseParameters(Parameters * params, int * blk)
 
                        int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 
-                       if (tmp <= 0) {
-                               printf("ERROR: blocking parameter must be > 0.\n");
+                       if (tmp < 0) {
+                               printf("ERROR: blocking parameter must be >= 0.\n");
                                exit(1);
                        }
 
@@ -250,8 +255,8 @@ static void ParseParameters(Parameters * params, int * blk)
 
                        int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 
-                       if (tmp <= 0) {
-                               printf("ERROR: blocking parameter must be > 0.\n");
+                       if (tmp < 0) {
+                               printf("ERROR: blocking parameter must be >= 0.\n");
                                exit(1);
                        }
 
@@ -262,8 +267,8 @@ static void ParseParameters(Parameters * params, int * blk)
 
                        int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 
-                       if (tmp <= 0) {
-                               printf("ERROR: blocking parameter must be > 0.\n");
+                       if (tmp < 0) {
+                               printf("ERROR: blocking parameter must be >= 0.\n");
                                exit(1);
                        }
 
@@ -273,6 +278,13 @@ static void ParseParameters(Parameters * params, int * blk)
                        ParameterUsage();
                        exit(1);
                }
+#ifdef DATA_LAYOUT_SOA
+               else if (ARG_IS("-pad") || ARG_IS("--pad")) {
+                       NEXT_ARG_PRESENT();
+
+                       *padInfo = PadInfoFromStr(params->KernelArgs[++i]);
+               }
+#endif
                else {
                        printf("ERROR: unknown kernel parameter.\n");
                        ParameterUsage();
@@ -324,6 +336,12 @@ void FNAME(D3Q19ListInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters
        kdl->nFluid = -1;
 #endif
 
+       int blk[3] = { 0 };
+       PadInfo * padInfo = NULL;
+
+       ParseParameters(params, blk, &padInfo);
+
+
        // Ajust the dimensions according to padding, if used.
        kd->Dims[0] = kd->GlobalDims[0] = ld->Dims[0];
        kd->Dims[1] = kd->GlobalDims[1] = ld->Dims[1];
@@ -336,18 +354,21 @@ void FNAME(D3Q19ListInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters
        int lZ = lDims[2];
 
        int nTotalCells = lX * lY * lZ;
-       int nCells = ld->nFluid; // TODO: + padding
+       int nCells = ld->nFluid;
        int nFluid = ld->nFluid;
 
+#ifdef DATA_LAYOUT_SOA
+       {
+               nCells = PadCellsAndReport(nCells, sizeof(PdfT), &padInfo);
+               PadInfoFree(padInfo); padInfo = NULL;
+       }
+#endif
+
        kdl->nCells = nCells;
        kdl->nFluid = nFluid;
 
        PdfT * pdfs[2];
 
-       int blk[3] = { 0 };
-
-       ParseParameters(params, blk);
-
        if (blk[0] == 0) blk[0] = lX;
        if (blk[1] == 0) blk[1] = lY;
        if (blk[2] == 0) blk[2] = lZ;
@@ -425,18 +446,17 @@ void FNAME(D3Q19ListInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters
 
        // Blocking is implemented via setup of the adjacency list. The kernel later will
        // walk through the lattice blocked automatically.
-       for (int bZ = 0; bZ < lZ; bZ += blk[2]) {
-       for (int bY = 0; bY < lY; bY += blk[1]) {
        for (int bX = 0; bX < lX; bX += blk[0]) {
+       for (int bY = 0; bY < lY; bY += blk[1]) {
+       for (int bZ = 0; bZ < lZ; bZ += blk[2]) {
 
                int eX = MIN(bX + blk[0], lX);
                int eY = MIN(bY + blk[1], lY);
                int eZ = MIN(bZ + blk[2], lZ);
 
-
-               for (int z = bZ; z < eZ; ++z) {
-               for (int y = bY; y < eY; ++y) {
                for (int x = bX; x < eX; ++x) {
+               for (int y = bY; y < eY; ++y) {
+               for (int z = bZ; z < eZ; ++z) {
 
                        latticeIndex = L_INDEX_4(lDims, x, y, z);
 
@@ -474,7 +494,15 @@ void FNAME(D3Q19ListInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters
 
        // Loop over all fluid nodes and compute the indices to the neighboring
        // PDFs for configured data layout (AoS/SoA).
-       // TODO: Parallelized loop to ensure correct NUMA placement.
+       #ifdef _OPENMP
+               #pragma omp parallel for
+       #endif
+       for (int index = 0; index < nFluid; ++index) {
+               for (int d = 0; d < N_D3Q19_IDX; ++d) {
+                       adjList[index * N_D3Q19_IDX + d] = -1;
+               }
+       }
+
        // #ifdef _OPENMP  --> add line continuation
        //      #pragma omp parallel for default(none)
        //              shared(nFluid, nCells, coords, D3Q19_INV, D3Q19_X, D3Q19_Y, D3Q19_Z,
index dfab54afbdc574c3936d146e28e9925e479a1088..0132dc9058f26d7f18f153e8a0c11f37d38efa74 100644 (file)
@@ -29,6 +29,7 @@
 #include "Memory.h"
 #include "Vtk.h"
 #include "Vector.h"
+#include "LikwidIf.h"
 
 #include <inttypes.h>
 #include <math.h>
@@ -118,6 +119,8 @@ void FNAME(KernelPullSplitNt1S)(LatticeDesc * ld, KernelData * kernelData, CaseD
                KernelStatistics(kd, ld, cd, 0);
        #endif
 
+
+                       X_LIKWID_START("list-pull-split-nt-1s");
        #ifdef _OPENMP
                #pragma omp parallel default(none) \
                        shared(nFluid, nCells, kd, kdl, adjList, src, dst, \
@@ -184,6 +187,7 @@ void FNAME(KernelPullSplitNt1S)(LatticeDesc * ld, KernelData * kernelData, CaseD
 
                for(int iter = 0; iter < maxIterations; ++iter) {
 
+
 #if 1
                        #define INDEX_START     blIndexStart
                        #define INDEX_STOP  blIndexVec
@@ -201,6 +205,8 @@ void FNAME(KernelPullSplitNt1S)(LatticeDesc * ld, KernelData * kernelData, CaseD
                        #define INDEX_STOP      blIndexStop
                        #include "BenchKernelD3Q19ListPullSplitNt1SScalar.h"
 #endif
+
+
                        #pragma omp barrier
 
                        #pragma omp single
@@ -235,6 +241,9 @@ void FNAME(KernelPullSplitNt1S)(LatticeDesc * ld, KernelData * kernelData, CaseD
                MemFree((void **)&tmpArray);
        }
 
+
+       X_LIKWID_STOP("list-pull-split-nt-1s");
+
 #ifdef VTK_OUTPUT
        if (cd->VtkOutput) {
                kd->PdfsActive = src;
@@ -321,6 +330,10 @@ void FNAME(KernelPullSplitNt2S)(LatticeDesc * ld, KernelData * kernelData, CaseD
                KernelStatistics(kd, ld, cd, 0);
        #endif
 
+
+                       X_LIKWID_START("list-pull-split-nt-2s");
+
+
        #ifdef _OPENMP
                #pragma omp parallel default(none) \
                        shared(nFluid, nCells, kd, kdl, adjList, src, dst, \
@@ -406,6 +419,7 @@ void FNAME(KernelPullSplitNt2S)(LatticeDesc * ld, KernelData * kernelData, CaseD
 #endif
                        #pragma omp barrier
 
+
                        #pragma omp single
                        {
                                #ifdef VERIFICATION
@@ -438,6 +452,8 @@ void FNAME(KernelPullSplitNt2S)(LatticeDesc * ld, KernelData * kernelData, CaseD
                MemFree((void **)&tmpArray);
        }
 
+                       X_LIKWID_STOP("list-pull-split-nt-2s");
+
 #ifdef VTK_OUTPUT
        if (cd->VtkOutput) {
                kd->PdfsActive = src;
index b5df14b2a300b28f8ec0cd782a6e4f6aaf5627a1..8c8da885578c21b5b6883619536464e993f7f8c9 100644 (file)
@@ -29,6 +29,8 @@
 #include "Memory.h"
 #include "Vtk.h"
 #include "Vector.h"
+#include "Padding.h"
+
 
 #include <math.h>
 
@@ -183,16 +185,18 @@ static void ParameterUsage()
 {
        printf("Kernel parameters:\n");
        printf("  [-blk <n>] [-blk-[xyz] <n>] [-n-tmp-array <n>]\n");
+       printf("  [-pad auto|modulus_1+offset_1(,modulus_n+offset_n)*]\n");
 
        return;
 }
 
-static void ParseParameters(Parameters * params, int * blk, int * nTmpArray)
+static void ParseParameters(Parameters * params, int * blk, int * nTmpArray, PadInfo ** padInfo)
 {
        Assert(blk != NULL);
 
        blk[0] = 0; blk[1] = 0; blk[2] = 0;
        *nTmpArray = 152;
+       *padInfo = NULL;
 
        #define ARG_IS(param)                   (!strcmp(params->KernelArgs[i], param))
        #define NEXT_ARG_PRESENT() \
@@ -210,8 +214,8 @@ static void ParseParameters(Parameters * params, int * blk, int * nTmpArray)
 
                        int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 
-                       if (tmp <= 0) {
-                               printf("ERROR: blocking parameter must be > 0.\n");
+                       if (tmp < 0) {
+                               printf("ERROR: blocking parameter must be >= 0.\n");
                                exit(1);
                        }
 
@@ -222,8 +226,8 @@ static void ParseParameters(Parameters * params, int * blk, int * nTmpArray)
 
                        int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 
-                       if (tmp <= 0) {
-                               printf("ERROR: blocking parameter must be > 0.\n");
+                       if (tmp < 0) {
+                               printf("ERROR: blocking parameter must be >= 0.\n");
                                exit(1);
                        }
 
@@ -234,8 +238,8 @@ static void ParseParameters(Parameters * params, int * blk, int * nTmpArray)
 
                        int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 
-                       if (tmp <= 0) {
-                               printf("ERROR: blocking parameter must be > 0.\n");
+                       if (tmp < 0) {
+                               printf("ERROR: blocking parameter must be >= 0.\n");
                                exit(1);
                        }
 
@@ -246,8 +250,8 @@ static void ParseParameters(Parameters * params, int * blk, int * nTmpArray)
 
                        int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 
-                       if (tmp <= 0) {
-                               printf("ERROR: blocking parameter must be > 0.\n");
+                       if (tmp < 0) {
+                               printf("ERROR: blocking parameter must be >= 0.\n");
                                exit(1);
                        }
 
@@ -270,6 +274,11 @@ static void ParseParameters(Parameters * params, int * blk, int * nTmpArray)
 
                        *nTmpArray = tmp;
                }
+               else if (ARG_IS("-pad") || ARG_IS("--pad")) {
+                       NEXT_ARG_PRESENT();
+
+                       *padInfo = PadInfoFromStr(params->KernelArgs[++i]);
+               }
                else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) {
                        ParameterUsage();
                        exit(1);
@@ -456,6 +465,11 @@ static void FNAME(Init)(LatticeDesc * ld, KernelData ** kernelData, Parameters *
        kdlr->nConsecThreadIndices = 0;
 #endif
 
+       int blk[3] = { 0 };
+       PadInfo * padInfo = NULL;
+
+       ParseParameters(params, blk, &kdlr->nTmpArray, &padInfo);
+
        // Ajust the dimensions according to padding, if used.
        kd->Dims[0] = kd->GlobalDims[0] = ld->Dims[0];
        kd->Dims[1] = kd->GlobalDims[1] = ld->Dims[1];
@@ -471,8 +485,12 @@ static void FNAME(Init)(LatticeDesc * ld, KernelData ** kernelData, Parameters *
        int nCells = ld->nFluid;
        int nFluid = ld->nFluid;
 
+       {
+               nCells = PadCellsAndReport(nCells, sizeof(PdfT), &padInfo);
+               PadInfoFree(padInfo); padInfo = NULL;
+       }
+
        // We padd each stream of a PDF array for a complete cache line.
-       // TODO: padding for L1/L2 and TLB.
        nCells = nCells + (8 - nCells % 8);
 
        Assert(nCells % VSIZE == 0);
@@ -482,10 +500,6 @@ static void FNAME(Init)(LatticeDesc * ld, KernelData ** kernelData, Parameters *
 
        PdfT * pdfs[2];
 
-       int blk[3] = { 0 };
-
-       ParseParameters(params, blk, &kdlr->nTmpArray);
-
        if (blk[0] == 0) blk[0] = lX;
        if (blk[1] == 0) blk[1] = lY;
        if (blk[2] == 0) blk[2] = lZ;
@@ -572,18 +586,18 @@ static void FNAME(Init)(LatticeDesc * ld, KernelData ** kernelData, Parameters *
 
        // Blocking is implemented via setup of the adjacency list. The kernel later will
        // walk through the lattice blocked automatically.
-       for (int bZ = 0; bZ < lZ; bZ += blk[2]) {
-       for (int bY = 0; bY < lY; bY += blk[1]) {
        for (int bX = 0; bX < lX; bX += blk[0]) {
+       for (int bY = 0; bY < lY; bY += blk[1]) {
+       for (int bZ = 0; bZ < lZ; bZ += blk[2]) {
 
                int eX = MIN(bX + blk[0], lX);
                int eY = MIN(bY + blk[1], lY);
                int eZ = MIN(bZ + blk[2], lZ);
 
 
-               for (int z = bZ; z < eZ; ++z) {
-               for (int y = bY; y < eY; ++y) {
                for (int x = bX; x < eX; ++x) {
+               for (int y = bY; y < eY; ++y) {
+               for (int z = bZ; z < eZ; ++z) {
 
                        latticeIndex = L_INDEX_4(lDims, x, y, z);
 
@@ -627,6 +641,15 @@ static void FNAME(Init)(LatticeDesc * ld, KernelData ** kernelData, Parameters *
        // Loop over all fluid nodes and compute the indices to the neighboring
        // PDFs for configured data layout (AoS/SoA).
        // Parallelized loop to ensure correct NUMA placement.
+       #ifdef _OPENMP
+               #pragma omp parallel for
+       #endif
+       for (int index = 0; index < nFluid; ++index) {
+               for (int d = 0; d < N_D3Q19_IDX; ++d) {
+                       adjList[index * N_D3Q19_IDX + d] = -1;
+               }
+       }
+
        // #ifdef _OPENMP  --> add line continuation
        //      #pragma omp parallel for default(none)
        //              shared(nFluid, nCells, coords, D3Q19_INV, D3Q19_X, D3Q19_Y, D3Q19_Z,
index 31c985abc25a115c6e4055afde86cc41f00931b6..0450c56d81fb6fea2b988bf16f7c0864b4e21f61 100644 (file)
@@ -72,6 +72,9 @@ void GeoCreateByStr(const char * geometryType, int dims[3], int periodic[3], Lat
 
                typeDetails = &tmp;
        }
+       else if (strncasecmp("fluid", geometryType, 5) == 0) {
+               type = GEO_TYPE_FLUID;
+       }
        else {
                printf("ERROR: unknown geometry specified.\n");
                Verify(0);
@@ -99,7 +102,7 @@ void GeoCreateByType(GEO_TYPES type, void * typeDetails, int dims[3], int period
        Assert(type >= GEO_TYPE_MIN);
        Assert(type <= GEO_TYPE_MAX);
 
-       const char * geoTypeStr[] = { "box", "channel", "pipe", "blocks" };
+       const char * geoTypeStr[] = { "box", "channel", "pipe", "blocks", "fluid" };
 
        printf("# geometry: %d x %d x %d nodes, type %d %s\n", dims[0], dims[1], dims[2], type, geoTypeStr[type]);
 
@@ -127,6 +130,10 @@ void GeoCreateByType(GEO_TYPES type, void * typeDetails, int dims[3], int period
        if (type == GEO_TYPE_CHANNEL || type == GEO_TYPE_BLOCKS || type == GEO_TYPE_PIPE) {
                periodic[0] = 1;
        }
+       else if (type == GEO_TYPE_FLUID) {
+               // Actually this geometry is non-periodic, but we need fluid at the boundary.
+               periodic[0] = 1; periodic[1] = 1; periodic[2] = 1;
+       }
 
        // Walls or periodic on first and last x plane.
        for (int z = 0; z < dims[2]; ++z) {
@@ -168,7 +175,7 @@ void GeoCreateByType(GEO_TYPES type, void * typeDetails, int dims[3], int period
        }
 
        if (type == GEO_TYPE_CHANNEL) {
-               periodic[0] = 1;
+               // Nothing todo here.
        }
        else if (type == GEO_TYPE_PIPE) {
                #define SQR(a) ((a)*(a))
@@ -192,34 +199,38 @@ void GeoCreateByType(GEO_TYPES type, void * typeDetails, int dims[3], int period
 
                int blockSize = *((int *)typeDetails);
 
+#if 0
                if (blockSize == 0) {
                        blockSize = 8;
                }
+#endif
+               if (blockSize > 0) {
 
-               int dimMin = dims[0];
+                       int dimMin = dims[0];
 
-               if (dims[1] < dimMin) dimMin = dims[1];
-               if (dims[2] < dimMin) dimMin = dims[2];
+                       if (dims[1] < dimMin) dimMin = dims[1];
+                       if (dims[2] < dimMin) dimMin = dims[2];
 
-               if (blockSize < 0 || blockSize > dimMin / 2) {
-                       printf("ERROR: block size for geometry must be > 0 and smaller than half of the smalest dimension.\n");
-                       // TODO: find a better solution for handling errors in here.
-                       Verify(0);
-               }
+                       if (blockSize < 0 || blockSize > dimMin / 2) {
+                               printf("ERROR: block size for geometry must be > 0 and smaller than half of the smalest dimension.\n");
+                               // TODO: find a better solution for handling errors in here.
+                               Verify(0);
+                       }
 
-               // Number of blocks in x, y, and z direction.
-               int nbx = blockSize, nby = blockSize, nbz = blockSize;
+                       // Number of blocks in x, y, and z direction.
+                       int nbx = blockSize, nby = blockSize, nbz = blockSize;
 
-               for (int z = 0; z < dims[2]; ++z) {
-                               if ((z % (2 * nbz)) < nbz) continue;
+                       for (int z = 0; z < dims[2]; ++z) {
+                                       if ((z % (2 * nbz)) < nbz) continue;
 
-                       for (int y = 0; y < dims[1]; ++y) {
-                               if ((y % (2 * nby)) < nby) continue;
+                               for (int y = 0; y < dims[1]; ++y) {
+                                       if ((y % (2 * nby)) < nby) continue;
 
-                               for (int x = 0; x < dims[0]; ++x) {
+                                       for (int x = 0; x < dims[0]; ++x) {
 
-                                       if ((x % (2 * nbx)) >= nbx) {
-                                               lattice[L_INDEX_4(dims, x, y, z)]       = LAT_CELL_OBSTACLE;
+                                               if ((x % (2 * nbx)) >= nbx) {
+                                                       lattice[L_INDEX_4(dims, x, y, z)]       = LAT_CELL_OBSTACLE;
+                                               }
                                        }
                                }
                        }
index 11d72ff8f015cfc8f62dccf34ed252f54708b3d3..a9bf2601368935ee0ea3796e088fbc07197a47c7 100644 (file)
@@ -37,7 +37,8 @@ typedef enum GEO_TYPES_ {
        GEO_TYPE_PIPE = 2,
        GEO_TYPE_BLOCKS = 3,    // Expects a pointer to an integer, holding the
                                                    // value of the block size as type detail.
-       GEO_TYPE_MAX = 3
+       GEO_TYPE_FLUID = 4,
+       GEO_TYPE_MAX = 4
 } GEO_TYPES;
 
 
index eddae640df9b3d08fec9840cba5b02813c548457..99e126b720b943ff5ffa4e92651a0811128c6b73 100644 (file)
@@ -56,7 +56,7 @@
 #endif
 
 #ifdef PROP_MODEL_AA
-       #define PROP_MODEL_NAME AA
+       #define PROP_MODEL_NAME Aa
 #endif
 
 
index 2ee063abba75c60f064ea553fc93f3a711307222..557c6535b024186131b3fb9d9d3c9c046865b3d3 100644 (file)
@@ -28,6 +28,8 @@
 #define __KERNEL_FUNCTIONS_H__
 
 #include "BenchKernelD3Q19.h"
+#include "BenchKernelD3Q19Aa.h"
+#include "BenchKernelD3Q19AaVec.h"
 #include "BenchKernelD3Q19List.h"
 #include "BenchKernelD3Q19ListAa.h"
 #include "BenchKernelD3Q19ListAaRia.h"
@@ -133,6 +135,22 @@ KernelFunctions g_kernels[] =
                .Init   = D3Q19BlkInit_PullAoS,
                .Deinit = D3Q19BlkDeinit_PullAoS
        },
+       {
+               .Name   = "aa-aos",
+               .Init   = D3Q19AaInit_AaAoS,
+               .Deinit = D3Q19AaDeinit_AaAoS
+       },
+       {
+               .Name   = "aa-soa",
+               .Init   = D3Q19AaInit_AaSoA,
+               .Deinit = D3Q19AaDeinit_AaSoA
+       },
+       {
+               .Name   = "aa-vec-soa",
+               .Init   = D3Q19AaVecInit_AaSoA,
+               .Deinit = D3Q19AaVecDeinit_AaSoA
+       }
+
 };
 
 #endif // __KERNEL_FUNCTIONS_H__
diff --git a/src/Lattice.c b/src/Lattice.c
new file mode 100644 (file)
index 0000000..bc218eb
--- /dev/null
@@ -0,0 +1,59 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#include "Lattice.h"
+
+// Dumps the layers [zStart, zStop] of lattice as ASCII.
+// Specify zStart = -1 and zStop = -1 as begin/end of lattice.
+
+void LatDumpAscii(LatticeDesc * ld, int zStart, int zStop)
+{
+       Assert(ld != NULL);
+
+       const char strLatCellType[] = "X.IxO"; // X = Obstacle, . = Fluid, I = inlet, O = outlet
+
+       int localZStart = zStart;
+       int localZStop  = zStop;
+
+       int * dims = ld->Dims;
+       LatticeT * lattice = ld->Lattice;
+
+       if (localZStart == -1) localZStart = 0;
+       if (localZStop  == -1) localZStop  = dims[2] - 1;
+
+       for (int z = localZStop; z >= localZStart; --z) {
+               printf("plane % 2d\n", z);
+
+               for (int y = dims[1] - 1; y >= 0; --y) {
+                       printf(" %2d  ", y);
+                       for (int x = 0; x < dims[0]; ++x) {
+                               printf("%c", strLatCellType[lattice[L_INDEX_4(dims, x, y, z)]]);
+                       }
+                       printf("\n");
+               }
+       }
+}
+
index 99b3fc30a641c75d9977a26371935ee6d9fb8777..c23b70f9a9a42cd807181c65402a18c6c6733c14 100644 (file)
@@ -74,5 +74,6 @@ static inline int L_INDEX_4(int dims[3], int x, int y, int z)
        return z * dims[0] * dims[1] + y * dims[0] + x;
 }
 
+void LatDumpAscii(LatticeDesc * ld, int zStart, int zStop);
 
 #endif // __LATTICE_H__
index 1525ad5ed6f0b65c722df7af198bd5a4bb8c37a3..65a403597131433876505c064ed42d295d884978 100644 (file)
@@ -468,7 +468,7 @@ int main(int argc, char * argv[])
                        int threadId = omp_get_thread_num();
                        int err;
 
-                       err = PinCurrentThreadByCpuList(pinString, 0, 0, threadId);
+                       err = PinCurrentThreadByCpuList(pinString, threadId);
 
                        if (err) {
                                printf("ERROR [thread %d]: pinning failed.\n", threadId);
@@ -557,9 +557,9 @@ int main(int argc, char * argv[])
                perf, nThreads, duration, cd.MaxIterations, ld.nFluid / 1e6,
                geometryType, kernelToUse,
 #ifdef VERIFICATION
-               "VERIFICATION"
+               "VERIFICATION"
 #else
-               "# benchmark"
+               "B"
 #endif
        );
 
index a9889a5c4494fccbda0e9c6f0c986a93c848a58c..99ca90283d74e95a55a68da41dffce443f422de2 100644 (file)
@@ -29,7 +29,9 @@
 # CONFIG ?= linux-gcc
 CONFIG ?= linux-intel
 
-BUILD ?= debug
+BUILD ?= release
+
+BENCHMARK ?= on
 
 # If we compile for BENCHMARK all verifcation and statistics are disabled,
 # if not specified otherwise.
@@ -37,13 +39,12 @@ ifeq (on,$(BENCHMARK))
   VERIFICATION ?= off
   VTK_OUTPUT   ?= off
   STATISTICS   ?= off
+else
+  VERIFICATION ?= on
+  STATISTICS   ?= on
+  VTK_OUTPUT   ?= on
 endif
 
-VERIFICATION ?= on
-
-STATISTICS ?= on
-
-VTK_OUTPUT ?= on
 
 COLOR ?= on
 
@@ -95,7 +96,7 @@ DEP_DIR         = obj/$(CONFIG)-$(BUILD)$(TAG)-dep
 
 # Sources to consider.
 SOURCES_C = Main.c Memory.c Geometry.c Kernel.c \
-            Vtk.c Pinning.c
+            Vtk.c Pinning.c Padding.c Lattice.c
 
 # ------------------------------------------------------------------------
 # NO CHANGE BELOW SHOULD BE NEEDED
@@ -127,7 +128,13 @@ OBJ_C = $(foreach SOURCE,$(SOURCES_C),$(OBJECT_DIR)/$(SOURCE:%.c=%.o)) \
                $(OBJECT_DIR)/BenchKernelD3Q19ListAaPv_PushSoA.o \
                $(OBJECT_DIR)/BenchKernelD3Q19ListAaPvCommon_PushSoA.o \
                $(OBJECT_DIR)/BenchKernelD3Q19ListPullSplitNt_PullSoA.o \
-               $(OBJECT_DIR)/BenchKernelD3Q19ListPullSplitNtCommon_PullSoA.o
+               $(OBJECT_DIR)/BenchKernelD3Q19ListPullSplitNtCommon_PullSoA.o \
+               $(OBJECT_DIR)/BenchKernelD3Q19Aa_AaAoS.o \
+               $(OBJECT_DIR)/BenchKernelD3Q19AaCommon_AaAoS.o \
+               $(OBJECT_DIR)/BenchKernelD3Q19Aa_AaSoA.o \
+               $(OBJECT_DIR)/BenchKernelD3Q19AaCommon_AaSoA.o \
+               $(OBJECT_DIR)/BenchKernelD3Q19AaVec_AaSoA.o \
+               $(OBJECT_DIR)/BenchKernelD3Q19AaVecCommon_AaSoA.o
 
 OBJ = $(OBJ_C)
 
@@ -203,9 +210,22 @@ ifeq (on,$(LIKWID))
   LD_LIBS  += $(LIKWID_LIB) -llikwid
 endif
 
+# ARCH can only be assigned a string without a space. The space is escaped as
+# a comma which we have to replace here.
+
+ifdef TARCH
+  ARCH_W_COMMA := $(TARCH)
+
+  COMMA_ := ,
+  override TARCH := $(subst $(COMMA_), ,$(TARCH))
+
+  $(info $(shell echo -e "\n$(COLOR_GREEN)INFO: Automatically expanding comma in TARCH variable:  TARCH=$(TARCH).$(COLOR_NO)\n"))
+
+endif
+
 .phony: all clean clean-all
 
-$(info $(shell $(ECHO_E) "# Configuration:   CONFIG=$(COLOR_CYAN)$(CONFIG)$(COLOR_NO)   BUILD=$(COLOR_CYAN)$(BUILD)$(COLOR_NO)   VERIFICATION=$(COLOR_CYAN)$(VERIFICATION)$(COLOR_NO)   STATISTICS=$(COLOR_CYAN)$(STATISTICS)$(COLOR_NO)   VTK_OUTPUT=$(COLOR_CYAN)$(VTK_OUTPUT)$(COLOR_NO)  OPENMP=$(COLOR_CYAN)$(OPENMP)$(COLOR_NO)   ISA=$(COLOR_CYAN)$(ISA)$(COLOR_NO)   LIKWID=$(COLOR_CYAN)$(LIKWID)$(COLOR_NO)   building $(.DEFAULT_GOAL)..."))
+$(info $(shell $(ECHO_E) "# Configuration:   CONFIG=$(COLOR_CYAN)$(CONFIG)$(COLOR_NO)   BUILD=$(COLOR_CYAN)$(BUILD)$(COLOR_NO)   VERIFICATION=$(COLOR_CYAN)$(VERIFICATION)$(COLOR_NO)   STATISTICS=$(COLOR_CYAN)$(STATISTICS)$(COLOR_NO)   VTK_OUTPUT=$(COLOR_CYAN)$(VTK_OUTPUT)$(COLOR_NO)  OPENMP=$(COLOR_CYAN)$(OPENMP)$(COLOR_NO)   ISA=$(COLOR_CYAN)$(ISA)$(COLOR_NO)   LIKWID=$(COLOR_CYAN)$(LIKWID)$(COLOR_NO)   TARCH=$(COLOR_CYAN)$(TARCH)$(COLOR_NO)   building $(.DEFAULT_GOAL)..."))
 
 
 $(info # Object dir:      $(OBJECT_DIR))
@@ -265,6 +285,13 @@ $(OBJECT_DIR)/%_AoS.o: %.c $(REBUILD_DEPS)
        @$(ECHO_E) "compiling: $(COLOR_CYAN)$@$(COLOR_NO)    $(COLOR_MAGENTA)DATA_LAYOUT_AOS$(COLOR_NO)"
        $(CC) $(strip $(C_FLAGS)) $(strip $(PP_FLAGS)) $(D)DATA_LAYOUT_AOS -c $< -o $@
 
+$(OBJECT_DIR)/%_AaAoS.o: %.c $(REBUILD_DEPS)
+       @$(ECHO_E) "compiling: $(COLOR_CYAN)$@$(COLOR_NO)    $(COLOR_MAGENTA)DATA_LAYOUT_AOS$(COLOR_NO) $(COLOR_MAGENTA)PROP_MODEL_AA$(COLOR_NO)"
+       $(CC) $(strip $(C_FLAGS)) $(strip $(PP_FLAGS)) $(D)DATA_LAYOUT_AOS $(D)PROP_MODEL_AA -c $< -o $@
+
+$(OBJECT_DIR)/%_AaSoA.o: %.c $(REBUILD_DEPS)
+       @$(ECHO_E) "compiling: $(COLOR_CYAN)$@$(COLOR_NO)    $(COLOR_MAGENTA)DATA_LAYOUT_SOA$(COLOR_NO) $(COLOR_MAGENTA)PROP_MODEL_AA$(COLOR_NO)"
+       $(CC) $(strip $(C_FLAGS)) $(strip $(PP_FLAGS)) $(D)DATA_LAYOUT_SOA $(D)PROP_MODEL_AA -c $< -o $@
 
 $(OBJECT_DIR)/%.o: %.c $(REBUILD_DEPS)
        @$(ECHO_E) "compiling: $(COLOR_CYAN)$@$(COLOR_NO)"
index d9d1f942c7e11c3c8ddf05e4d2d5ab58fac1f24e..e7dda39419bb03f143bf6777186d755ed3717a64 100644 (file)
 //  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
 //
 // --------------------------------------------------------------------------
+// TODO: make configurable
+#define HAVE_HUGE_PAGES
+
+
+#ifdef HAVE_HUGE_PAGES
+#define _BSD_SOURCE
+#endif
+
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>  // strerror
 #include <errno.h>
 
+
+#ifdef HAVE_HUGE_PAGES
+#include <sys/mman.h> // madvise
+#endif
+
 #include "Base.h"
 #include "Memory.h"
 
@@ -60,6 +73,19 @@ int MemAllocAligned(void ** ptr, size_t bytesToAlloc, size_t alignmentBytes)
                exit(1);
        }
 
+#ifdef HAVE_HUGE_PAGES
+
+       if (alignmentBytes % 4096 == 0) {
+               ret = madvise(*ptr, bytesToAlloc, MADV_HUGEPAGE);
+
+               if (ret != 0) {
+                       DEBUG_BREAK_POINT();
+                       Error("madvise(%p, %lu, MADV_HUGEPAGE) failed: %d - %s.\n", *ptr, bytesToAlloc, errno, strerror(errno));
+                       exit(1);
+               }
+       }
+#endif
+
        return 0;
 }
 
diff --git a/src/Padding.c b/src/Padding.c
new file mode 100644 (file)
index 0000000..97a98ea
--- /dev/null
@@ -0,0 +1,300 @@
+#include "Base.h"
+#include "Kernel.h"
+#include "Padding.h"
+
+#include <errno.h>
+#include <limits.h>
+
+// Generates a PadInfo struct from padStr. If padStr is NULL then a NULL
+// pointer is returned.
+//
+// Padding string is either "auto" or in the format of
+// <modulus>+<remainder>(,<modulus>+<remainder>)*.
+//
+// If modulus is 0 then the remainder will just be added and no modulo will be
+// applied.
+//
+// Returned PadInfo structs must be freed with PadInfoFree.
+
+PadInfo * PadInfoFromStr(const char * padStr)
+{
+       if (padStr == NULL) {
+               return NULL;
+       }
+       else if (!strncmp("auto", padStr, 4)) {
+
+               PadInfo * padInfo = (PadInfo *)malloc(sizeof(PadInfo) + 4 * sizeof(int));
+               Assert(padInfo != NULL);
+
+               padInfo->nEntries = 2;
+               padInfo->Modulus = (int *)(padInfo + 1);
+               padInfo->Offset = ((int *)(padInfo + 1)) + 2;
+
+               // Intel TLB 2 MiB pages
+               padInfo->Modulus[0] = 2 * 1024 * 1024 * 8;
+               padInfo->Offset[0]  = 2 * 1024 * 1024 + 512;
+
+               // Intel L2, 256 sets
+               padInfo->Modulus[1] = 256 * 64;
+               padInfo->Offset[1]  = 128;
+
+               return padInfo;
+       }
+       else if (!strncmp("no", padStr, 2) || !strncmp("off", padStr, 3)) {
+               PadInfo * padInfo = (PadInfo *)malloc(sizeof(PadInfo));
+               Assert(padInfo != NULL);
+
+               padInfo->nEntries = 0;
+               padInfo->Modulus  = NULL;
+               padInfo->Offset   = NULL;
+
+               return padInfo;
+       }
+
+       // Count number of commas. Number of commas + 1 gives us the number of
+       // padding entries.
+       const char * tmp = padStr;
+       int nCommas = 0;
+
+       while (*tmp != 0x00) {
+               if (*tmp == ',') ++nCommas;
+               ++tmp;
+       }
+
+       // Number of padding entries = nCommas + 1.
+       size_t padInfoSize = sizeof(PadInfo) + (nCommas + 1) * 2 * sizeof(int);
+       PadInfo * padInfo = (PadInfo *)malloc(padInfoSize);
+       Assert(padInfo != NULL);
+
+       memset(padInfo, 0x00, padInfoSize);
+
+       padInfo->Modulus = (int *)(padInfo + 1);
+       padInfo->Offset = ((int *)(padInfo + 1)) + nCommas + 1;
+
+       tmp = padStr;
+
+       int padEntryIndex = 0;
+       int entriesSeen   = 0;
+
+       int modulusSeen   = 0;
+       int offsetSeen    = 0;
+
+       // TODO: parsing is currently a mess. We assume a fixed format, which
+       // should be easier to parse now.
+       while (*tmp != 0x00) {
+
+               if (*tmp == ',') {
+                       // Check if modulus was seen in this section, then move to the next.
+
+                       if (!modulusSeen) {
+                               fprintf(stderr, "ERROR: modulus missing before next section in padding string.\n");
+                               exit(1);
+                       }
+
+                       modulusSeen = 0;
+                       offsetSeen  = 0;
+                       ++padEntryIndex;
+                       ++tmp;
+               }
+               else if (*tmp == '+') {
+                       // Check if modulus was seen in this section and no offset yet.
+
+                       if (!modulusSeen) {
+                               fprintf(stderr, "ERROR: modulus missing before offset in padding string.\n");
+                               exit(1);
+                       }
+
+                       if (offsetSeen) {
+                               fprintf(stderr, "ERROR: offset is only allowed to be specified once per section in padding string.\n");
+                               exit(1);
+                       }
+
+                       ++tmp;
+               }
+               else if (*tmp >= '0' && *tmp <= '9') {
+                       // Parse number and check for all the errors.
+
+                       char * endPtr;
+
+                       errno = 0;
+
+                       int value = strtol(tmp, &endPtr, 10);
+
+                       if ((value == LONG_MIN || value == LONG_MAX) && errno != 0) {
+                               fprintf(stderr, "ERROR: over- or underflow in modulus/offset of padding string.\n");
+                               exit(1);
+                       }
+                       else if (value == 0 && errno != 0) {
+                               fprintf(stderr, "ERROR: error parsing modulus/offset of padding string: %d - %s\n", errno, strerror(errno));
+                               exit(1);
+                       }
+                       else if (tmp == endPtr) {
+                               // No digits found: empty string or invalid pad string.
+                               fprintf(stderr, "ERROR: modulus or offset missing in padding string.\n");
+                               exit(1);
+                       }
+
+                       if (value < 0) {
+                               fprintf(stderr, "ERROR: modulus and offset must be >= 0 in padding string.\n");
+                               exit(1);
+                       }
+
+                       if (!modulusSeen) {
+                               if (offsetSeen) {
+                                       fprintf(stderr, "ERROR: offset already seen, but not modulus in padding string.\n");
+                                       exit(1);
+                               }
+
+                               Verify(padEntryIndex < (nCommas + 1));
+
+                               padInfo->Modulus[padEntryIndex] = value;
+
+                               modulusSeen = 1;
+                               ++entriesSeen;
+                       }
+                       else {
+                               if (offsetSeen) {
+                                       fprintf(stderr, "ERROR: offset is only allowed to be specified once per section in padding string.\n");
+                                       exit(1);
+                               }
+
+                               Verify(padEntryIndex < (nCommas + 1));
+
+                               padInfo->Offset[padEntryIndex] = value;
+
+                               offsetSeen = 1;
+                       }
+                       // No increment of tmp needed, as endPtr points already to the first
+                       // character which is not part of the number.
+                       tmp = endPtr;
+               }
+               else {
+                       fprintf(stderr, "ERROR: padding string contains invalid characters.\n");
+                       exit(1);
+               }
+
+       }
+
+       if (entriesSeen != nCommas + 1) {
+               fprintf(stderr, "ERROR: did not find all padding entries.\n");
+               exit(1);
+       }
+
+       for (int i = 0; i < nCommas + 1; ++i) {
+               if (padInfo->Offset[i] >= padInfo->Modulus[i]) {
+                       fprintf(stderr, "ERROR: offset in padding entry %d is equal or larger than modulus.\n", i);
+                       exit(1);
+               }
+       }
+
+       padInfo->nEntries = entriesSeen;
+
+       return padInfo;
+}
+
+int PadCells(int nCells, int cellSizeBytes, PadInfo ** padInfoPtr)
+{
+       Assert(padInfoPtr != NULL);
+
+       // If the padInfo is NULL determine if we use the "auto" configuration.
+       const int defaultAutoPadding = 1;
+
+
+       if (*padInfoPtr == NULL) {
+               if (!defaultAutoPadding) {
+                       return nCells;
+               }
+
+               *padInfoPtr = PadInfoFromStr("auto");
+       }
+
+       PadInfo * padInfo = *padInfoPtr;
+
+       Assert(padInfo->nEntries >= 0);
+
+       for (int i = 0; i < padInfo->nEntries; ++i) {
+               Assert(padInfo->Modulus[i] >=  0);
+               Assert(padInfo->Offset[i]  >= 0);
+
+               int nPadCells = 0;
+
+               if (padInfo->Modulus[i] == 0) {
+                       // When the modulus is just zero then we only add the offset.
+                       nPadCells = padInfo->Offset[i] * cellSizeBytes;
+               }
+               else {
+                       int nModCells    = padInfo->Modulus[i] / cellSizeBytes;
+
+                       if (nModCells == 0) {
+                               fprintf(stderr, "ERROR: modulus of %d byte in padding entry %d becomes zero for PDF size %d byte.\n",
+                                               padInfo->Modulus[i], i, cellSizeBytes);
+                               exit(1);
+                       }
+
+                       int nOffsetCells = padInfo->Offset[i]  / cellSizeBytes;
+                       int nRemainder   = nCells % nModCells;
+
+                       nPadCells = (nOffsetCells + nModCells - nRemainder) % nModCells;
+               }
+
+               nCells += nPadCells;
+       }
+
+       return nCells;
+}
+
+void PadInfoPrint(PadInfo * padInfo, FILE * f, const char * prefix)
+{
+       Assert(padInfo != NULL);
+       Assert(padInfo->nEntries >= 0);
+       Assert(f != NULL);
+       Assert(prefix != NULL);
+
+       for (int i = 0; i < padInfo->nEntries; ++i) {
+               fprintf(f, "%sm: %10d b  o: %10d b   m: %f KiB  o: %f KiB   m: %f MiB  o: %f MiB\n",
+                       prefix,
+                       padInfo->Modulus[i],                   padInfo->Offset[i],
+                       padInfo->Modulus[i] / 1024.0,          padInfo->Offset[i] / 1024.0,
+                       padInfo->Modulus[i] / 1024.0 / 1024.0, padInfo->Offset[i] / 1024.0 / 1024.0);
+       }
+
+       return;
+}
+
+void PadInfoFree(PadInfo * padInfo)
+{
+       if (padInfo != NULL) {
+               free(padInfo);
+       }
+
+       return;
+}
+
+// Returns the new padded cell size and reports if padding was performed and how.
+
+int PadCellsAndReport(int nCells, int cellSizeBytes, PadInfo ** padInfoPtr)
+{
+       // Apply padding.
+       int nNewCells = PadCells(nCells, sizeof(PdfT), padInfoPtr);
+
+       if (nCells != nNewCells) {
+               printf("# padding info:\n");
+               PadInfoPrint(*padInfoPtr, stdout, "#                        ");
+               // int nPaddedCells = nNewCells - nCells;
+               // printf("# padding per dir.:      %10d nodes (%f MiB)\n", nPaddedCells, nPaddedCells / 1024.0 / 1024.0 * sizeof(PdfT));
+               // printf("# padding total:         %10d nodes (%f MiB)\n", 19 * nPaddedCells, 19 * nPaddedCells / 1024.0 / 1024.0 * sizeof(PdfT));
+
+               int nPadCells = nNewCells - nCells;
+
+               printf("#\n# padding %d nodes with %d nodes, %f MiB per direction, %f MiB in total\n",
+                       nCells,
+                       nPadCells,
+                       nPadCells * sizeof(PdfT) / 1024.0 / 1024.0,
+                       nPadCells * sizeof(PdfT) / 1024.0 / 1024.0 * 19);
+       }
+       else {
+               printf("# padding info:          no padding used\n");
+       }
+
+       return nNewCells;
+}
diff --git a/src/Padding.h b/src/Padding.h
new file mode 100644 (file)
index 0000000..55fa33e
--- /dev/null
@@ -0,0 +1,19 @@
+#ifndef __PADDING_H__
+#define __PADDING_H__
+
+typedef struct PadInfo_
+{
+       int nEntries;
+       int * Modulus;
+       int * Offset;
+} PadInfo;
+
+PadInfo * PadInfoFromStr(const char * padStr);
+
+int PadCells(int nCells, int cellSizeBytes, PadInfo ** padInfo);
+int PadCellsAndReport(int nCells, int cellSizeBytes, PadInfo ** padInfoPtr);
+
+void PadInfoPrint(PadInfo * padInfo, FILE * f, const char * prefix);
+void PadInfoFree(PadInfo * padInfo);
+
+#endif // __PADDING_H__
index 0cf70eafbddd61e8aa2968f544a1d1f499ad6eb0..de9be3ffdd1f79c055b26efa02abb8a09e1ffd1b 100644 (file)
@@ -34,9 +34,6 @@
 #include "Base.h"
 #include "Pinning.h"
 
-
-
-
 // -----------------------------------------------------------------------
 //
 // Binds the calling thread to specified core.
@@ -73,8 +70,7 @@ int PinCurrentThreadToCore(int coreNumber)
 //
 // -----------------------------------------------------------------------
 
-int PinCurrentThreadByEnvVar(const char * envVarName,
-       int mpiRank, int nodeRank, int threadNumber)
+int PinCurrentThreadByEnvVar(const char * envVarName, int threadNumber)
 {
        const char * envVarValue;
        int core;
@@ -82,14 +78,12 @@ int PinCurrentThreadByEnvVar(const char * envVarName,
        envVarValue = getenv(envVarName);
 
        if (envVarValue == NULL) {
-               if (mpiRank == 0) {
-                       Print("skip pinning: env var %s not set\n", envVarName);
-               }
+               printf("WARNING: skip pinning: env var %s not set\n", envVarName);
 
                return 0;
        }
 
-       core = PinParseCpuList(envVarValue, mpiRank, nodeRank, threadNumber);
+       core = PinParseCpuList(envVarValue, threadNumber);
 
        if (core < 0) {
                return core;
@@ -107,20 +101,17 @@ int PinCurrentThreadByEnvVar(const char * envVarName,
 //
 // -----------------------------------------------------------------------
 
-int PinCurrentThreadByCpuList(const char * cpuList,
-       int mpiRank, int nodeRank, int threadNumber)
+int PinCurrentThreadByCpuList(const char * cpuList, int threadNumber)
 {
        int core;
 
        if (cpuList == NULL) {
-               if (mpiRank == 0) {
-                       printf("ERROR: cpu list is NULL.\n");
-               }
+               printf("ERROR: cpu list is NULL.\n");
 
                exit(1);
        }
 
-       core = PinParseCpuList(cpuList, mpiRank, nodeRank, threadNumber);
+       core = PinParseCpuList(cpuList, threadNumber);
 
        if (core < 0) {
                return core;
@@ -133,31 +124,13 @@ int PinCurrentThreadByCpuList(const char * cpuList,
 // -----------------------------------------------------------------------
 //
 // Parses the provided cpu list and returns the core number for the
-// specified MPI rank, local rank, and thread.
-//
-// The cpu list has for example a format of: 0,1,2 or 0,1,2_3,4,5
-//
-// Blocks (0,1,2 or 3,4,5) separated by "_" specify pinning inside a
-// node rank. The first block maps to node rank 1, the second to node
-// rank 2, etc.
-//
-// Inside a block the core numbers specify where the threads should
-// be pinned to. They are separated by "," and the first number maps
-// to the first core, the second number to the second core, etc.
-//
-// For example: 0,2,4_6,8,10
+// specified thread.
 //
-// Node rank 0 thread 0 pinned to core  0
-//           0        1                 2
-//           0        2                 4
-//           1        0                 6
-//           1        1                 8
-//           1        2                10
+// The cpu list has for example a format of: 0,1,2
 //
 // -----------------------------------------------------------------------
 
-int PinParseCpuList(const char * cpuList,
-               int mpiRank, int nodeRank, int threadNumber)
+int PinParseCpuList(const char * cpuList, int threadNumber)
 {
        int cpu = -1;
 
@@ -168,8 +141,8 @@ int PinParseCpuList(const char * cpuList,
        const char * c = cpuList;
 
        // Ensure only valid characters are in the cpu list.
-       // Cpu list is in the format of "0,1,2_3,4,5".
-       while (((*c >= '0' && *c <= '9') || *c == ',' || *c == '_')) {
+       // Cpu list is in the format of "0,1,2,3,4,5".
+       while (((*c >= '0' && *c <= '9') || *c == ',')) {
                ++c;
        }
 
@@ -180,19 +153,6 @@ int PinParseCpuList(const char * cpuList,
 
        c = cpuList;
 
-       int i = 0;
-
-       // Move variable c after the "nodeRank"th "_" in the cpu list.
-       while (i < nodeRank && *c != 0x00) {
-       if (*c == '_') ++i;
-               ++c;
-       }
-
-       if (i != nodeRank || *c < '0' || *c > '9') {
-               // Cpu list for this node rank not found.
-               return -3;
-       }
-
        // Now find the core for the specified thread.
 
        int t = 0;
@@ -201,11 +161,6 @@ int PinParseCpuList(const char * cpuList,
                if (*c == ',') {
                        ++t;
                }
-               else if (*c == '_') {
-                       // Unexpected character at this position.
-                       break;
-               }
-
                ++c;
        }
 
index 40225c1eea5ec1ec81748b58d886c6658156db02..877048de32b41a8cf33d9a2295dd433a7152d5af 100644 (file)
 
 int PinCurrentThreadToCore(int coreNumber);
 
-int PinParseCpuList(const char * cpuList,
-                                       int mpiRank, int nodeRank, int threadNumber);
+int PinParseCpuList(const char * cpuList, int threadNumber);
 
-int PinCurrentThreadByEnvVar(const char * envVarName,
-       int mpiRank, int nodeRank, int threadNumber);
+int PinCurrentThreadByEnvVar(const char * envVarName, int threadNumber);
 
-int PinCurrentThreadByCpuList(const char * cpuList,
-       int mpiRank, int nodeRank, int threadNumber);
+int PinCurrentThreadByCpuList(const char * cpuList, int threadNumber);
 
 int PinCurrentCore();
 
index 7a76f51b8ca6321bcf79b087c62c41a9acc12f8a..a0e8ad67fc3575cb346b321651ce4cfb1c47f869 100755 (executable)
@@ -54,7 +54,7 @@ Config="$1"
 
 make clean-all
 make -j CONFIG=$Config TAG=$XTag-debug
-make -j CONFIG=$Config BUILD=$Build TAG=$XTag-v
+make -j CONFIG=$Config BUILD=$Build TAG=$XTag-v VERIFICATION=on
 make -j CONFIG=$Config BUILD=$Build TAG=$XTag-b BENCHMARK=on
 
 BinaryV="../bin/lbmbenchk-$Config-$Build$XTag-v"
This page took 0.58376 seconds and 5 git commands to generate.