From e3f82424829ebb623343ce0092238f83b4a1b8c2 Mon Sep 17 00:00:00 2001
From: Markus Wittmann <markus.wittmann@fau.de>
Date: Thu, 2 Nov 2017 15:54:11 +0100
Subject: [PATCH] bulk commit

- add AA pattern full array kernels
- add padding for list kernels
- transposed loops
---
 doc/Makefile                                |   2 +-
 doc/html/main.html                          | 411 ++++++++++--
 doc/main.css                                |  46 ++
 doc/main.rst                                | 194 +++++-
 src/Base.h                                  |  14 +-
 src/BenchKernelD3Q19.c                      |  66 +-
 src/BenchKernelD3Q19Aa.c                    | 529 ++++++++++++++++
 src/BenchKernelD3Q19Aa.h                    |  41 ++
 src/BenchKernelD3Q19AaCommon.c              | 651 +++++++++++++++++++
 src/BenchKernelD3Q19AaCommon.h              |  89 +++
 src/BenchKernelD3Q19AaVec.c                 | 610 ++++++++++++++++++
 src/BenchKernelD3Q19AaVec.h                 |  38 ++
 src/BenchKernelD3Q19AaVecCommon.c           | 656 ++++++++++++++++++++
 src/BenchKernelD3Q19AaVecCommon.h           |  93 +++
 src/BenchKernelD3Q19Common.c                | 167 +++--
 src/BenchKernelD3Q19Common.h                |   4 +-
 src/BenchKernelD3Q19List.c                  |   4 +
 src/BenchKernelD3Q19ListAaCommon.c          |  68 +-
 src/BenchKernelD3Q19ListAaPv.c              |  40 +-
 src/BenchKernelD3Q19ListAaPvCommon.c        |  65 +-
 src/BenchKernelD3Q19ListAaRiaCommon.c       |  64 +-
 src/BenchKernelD3Q19ListCommon.c            |  68 +-
 src/BenchKernelD3Q19ListPullSplitNt.c       |  16 +
 src/BenchKernelD3Q19ListPullSplitNtCommon.c |  59 +-
 src/Geometry.c                              |  49 +-
 src/Geometry.h                              |   3 +-
 src/Kernel.h                                |   2 +-
 src/KernelFunctions.h                       |  18 +
 src/Lattice.c                               |  59 ++
 src/Lattice.h                               |   1 +
 src/Main.c                                  |   6 +-
 src/Makefile                                |  45 +-
 src/Memory.c                                |  26 +
 src/Padding.c                               | 300 +++++++++
 src/Padding.h                               |  19 +
 src/Pinning.c                               |  67 +-
 src/Pinning.h                               |   9 +-
 src/test.sh                                 |   2 +-
 38 files changed, 4228 insertions(+), 373 deletions(-)
 create mode 100644 doc/main.css
 create mode 100644 src/BenchKernelD3Q19Aa.c
 create mode 100644 src/BenchKernelD3Q19Aa.h
 create mode 100644 src/BenchKernelD3Q19AaCommon.c
 create mode 100644 src/BenchKernelD3Q19AaCommon.h
 create mode 100644 src/BenchKernelD3Q19AaVec.c
 create mode 100644 src/BenchKernelD3Q19AaVec.h
 create mode 100644 src/BenchKernelD3Q19AaVecCommon.c
 create mode 100644 src/BenchKernelD3Q19AaVecCommon.h
 create mode 100644 src/Lattice.c
 create mode 100644 src/Padding.c
 create mode 100644 src/Padding.h

diff --git a/doc/Makefile b/doc/Makefile
index 690e4eb..6417a6e 100644
--- a/doc/Makefile
+++ b/doc/Makefile
@@ -34,5 +34,5 @@ main: main.rst
 #main.css
 	[ -d html ] || mkdir -p html
 #    rst2html --stylesheet=html4css1.css,main.css $< html/$@.html
-	rst2html --stylesheet=html4css1.css $< html/$@.html
+	rst2html --stylesheet=html4css1.css,main.css $< html/$@.html
 
diff --git a/doc/html/main.html b/doc/html/main.html
index 36579cc..511f6b2 100644
--- a/doc/html/main.html
+++ b/doc/html/main.html
@@ -335,6 +335,56 @@ h4 tt.docutils, h5 tt.docutils, h6 tt.docutils {
 ul.auto-toc {
   list-style-type: none }
 
+</style>
+<style type="text/css">
+
+
+h1, h2, h3, h4, h5, h6 {
+  font-family: sans-serif;
+  font-size: 100%;
+  background-color: #dcdcdc;
+}
+
+h1.title {
+  background-color: gray;
+  color: white
+}
+
+table.footnote {
+  padding-left: 0.5ex;
+}
+
+table.citation {
+  padding-left: 0.5ex
+}
+
+td.label {
+  width: 10%;
+}
+
+table, table.docutils, td, th {
+  border: 0;
+}
+
+table.citation, table.footnote {
+  width: 100%;
+}
+
+th {
+    background-color: lavender ;
+}
+
+tr:nth-child(even) {
+    xxbackground-color: aliceblue;
+    background-color: white; 
+}
+tr:nth-child(odd) {
+    xxbackground-color: lavender;
+    background-color: whitesmoke;
+}
+
+
+
 </style>
 </head>
 <body>
@@ -375,15 +425,23 @@ ul.auto-toc {
 <li><a class="reference internal" href="#benchmarking" id="id4">1.2&nbsp;&nbsp;&nbsp;Benchmarking</a></li>
 <li><a class="reference internal" href="#release-and-verification" id="id5">1.3&nbsp;&nbsp;&nbsp;Release and Verification</a></li>
 <li><a class="reference internal" href="#compilers" id="id6">1.4&nbsp;&nbsp;&nbsp;Compilers</a></li>
-<li><a class="reference internal" href="#options-summary" id="id7">1.5&nbsp;&nbsp;&nbsp;Options Summary</a></li>
+<li><a class="reference internal" href="#cleaning" id="id7">1.5&nbsp;&nbsp;&nbsp;Cleaning</a></li>
+<li><a class="reference internal" href="#options-summary" id="id8">1.6&nbsp;&nbsp;&nbsp;Options Summary</a></li>
 </ul>
 </li>
-<li><a class="reference internal" href="#invocation" id="id8">2&nbsp;&nbsp;&nbsp;Invocation</a><ul class="auto-toc">
-<li><a class="reference internal" href="#command-line-parameters" id="id9">2.1&nbsp;&nbsp;&nbsp;Command Line Parameters</a></li>
+<li><a class="reference internal" href="#invocation" id="id9">2&nbsp;&nbsp;&nbsp;Invocation</a><ul class="auto-toc">
+<li><a class="reference internal" href="#command-line-parameters" id="id10">2.1&nbsp;&nbsp;&nbsp;Command Line Parameters</a></li>
+<li><a class="reference internal" href="#kernels" id="id11">2.2&nbsp;&nbsp;&nbsp;Kernels</a></li>
 </ul>
 </li>
-<li><a class="reference internal" href="#id1" id="id10">3&nbsp;&nbsp;&nbsp;Benchmarking</a></li>
-<li><a class="reference internal" href="#acknowledgements" id="id11">4&nbsp;&nbsp;&nbsp;Acknowledgements</a></li>
+<li><a class="reference internal" href="#id1" id="id12">3&nbsp;&nbsp;&nbsp;Benchmarking</a><ul class="auto-toc">
+<li><a class="reference internal" href="#padding" id="id13">3.1&nbsp;&nbsp;&nbsp;Padding</a></li>
+</ul>
+</li>
+<li><a class="reference internal" href="#geometries" id="id14">4&nbsp;&nbsp;&nbsp;Geometries</a></li>
+<li><a class="reference internal" href="#results" id="id15">5&nbsp;&nbsp;&nbsp;Results</a></li>
+<li><a class="reference internal" href="#licence" id="id16">6&nbsp;&nbsp;&nbsp;Licence</a></li>
+<li><a class="reference internal" href="#acknowledgements" id="id17">7&nbsp;&nbsp;&nbsp;Acknowledgements</a></li>
 </ul>
 </div>
 <div class="section" id="compilation">
@@ -399,15 +457,15 @@ depending on compiler and build configuration.</p>
 <div class="section" id="debug-and-verification">
 <h2><a class="toc-backref" href="#id3">1.1&nbsp;&nbsp;&nbsp;Debug and Verification</a></h2>
 <pre class="literal-block">
-make
+make BUILD=debug BENCHMARK=off
 </pre>
-<p>Running <tt class="docutils literal">make</tt> without any arguments builds the debug version (BUILD=debug) of
+<p>Running <tt class="docutils literal">make</tt> with <tt class="docutils literal">BUILD=debug</tt> builds the debug version of
 the benchmark kernels, where no optimizations are performed,  line numbers and
 debug symbols are included as well as <tt class="docutils literal">DEBUG</tt> will be defined.  The resulting
 binary will be found in the <tt class="docutils literal">bin</tt> subdirectory and named
 <tt class="docutils literal"><span class="pre">lbmbenchk-linux-&lt;compiler&gt;-debug</span></tt>.</p>
-<p>Without any further specification the binary includes verification
-(<tt class="docutils literal">VERIFICATION=on</tt>), statistics (<tt class="docutils literal">STATISTICS</tt>), and VTK output
+<p>Specifying <tt class="docutils literal">BENCHMARK=off</tt> turns on verification
+(<tt class="docutils literal">VERIFICATION=on</tt>), statistics (<tt class="docutils literal">STATISTICS=on</tt>), and VTK output
 (<tt class="docutils literal">VTK_OUTPUT=on</tt>) enabled.</p>
 <p>Please note that the generated binary will therefore
 exhibit a poor performance.</p>
@@ -416,9 +474,10 @@ exhibit a poor performance.</p>
 <h2><a class="toc-backref" href="#id4">1.2&nbsp;&nbsp;&nbsp;Benchmarking</a></h2>
 <p>To generate a binary for benchmarking run make with</p>
 <pre class="literal-block">
-make BENCHMARK=on BUILD=release
+make
 </pre>
-<p>Here BUILD=release turns optimizations on and BENCHMARK=on disables
+<p>As default <tt class="docutils literal">BENCHMARK=on</tt> and <tt class="docutils literal">BUILD=release</tt> is set, where
+BUILD=release turns optimizations on and <tt class="docutils literal">BENCHMARK=on</tt> disables
 verfification, statistics, and VTK output.</p>
 </div>
 <div class="section" id="release-and-verification">
@@ -426,7 +485,7 @@ verfification, statistics, and VTK output.</p>
 <p>Verification with the debug builds can be extremely slow. Hence verification
 capabilities can be build with release builds:</p>
 <pre class="literal-block">
-make BUILD=release
+make BENCHMARK=off
 </pre>
 </div>
 <div class="section" id="compilers">
@@ -435,8 +494,23 @@ make BUILD=release
 both configuration can be chosen via <tt class="docutils literal"><span class="pre">CONFIG=linux-gcc</span></tt> or
 <tt class="docutils literal"><span class="pre">CONFIG=linux-intel</span></tt>.</p>
 </div>
+<div class="section" id="cleaning">
+<h2><a class="toc-backref" href="#id7">1.5&nbsp;&nbsp;&nbsp;Cleaning</a></h2>
+<p>For each configuration and build (debug/release) a subdirectory under the
+<tt class="docutils literal">src/obj</tt> directory is created where the dependency and object files are
+stored.
+With</p>
+<pre class="literal-block">
+make CONFIG=... BUILD=... clean
+</pre>
+<p>a specific combination is select and cleaned, whereas with</p>
+<pre class="literal-block">
+make clean-all
+</pre>
+<p>all object and dependency files are deleted.</p>
+</div>
 <div class="section" id="options-summary">
-<h2><a class="toc-backref" href="#id7">1.5&nbsp;&nbsp;&nbsp;Options Summary</a></h2>
+<h2><a class="toc-backref" href="#id8">1.6&nbsp;&nbsp;&nbsp;Options Summary</a></h2>
 <p>Options that can be specified when building the framework with make:</p>
 <table border="1" class="docutils">
 <colgroup>
@@ -451,19 +525,14 @@ both configuration can be chosen via <tt class="docutils literal"><span class="p
 <td>default</td>
 <td>description</td>
 </tr>
-<tr><td>TARCH</td>
-<td>--</td>
-<td>--</td>
-<td>Via TARCH the architecture the compiler generates code for can be overriden. The value depends on the chose compiler.</td>
-</tr>
 <tr><td>BENCHMARK</td>
 <td>on, off</td>
-<td>off</td>
-<td>If enabled, disables VERIFICATION, STATISTICS, VTK_OUTPUT.</td>
+<td>on</td>
+<td>If enabled, disables VERIFICATION, STATISTICS, VTK_OUTPUT. If disabled enables the three former options.</td>
 </tr>
 <tr><td>BUILD</td>
 <td>debug, release</td>
-<td>debug</td>
+<td>release</td>
 <td>No optimization, debug symbols, DEBUG defined.</td>
 </tr>
 <tr><td>CONFIG</td>
@@ -486,6 +555,11 @@ both configuration can be chosen via <tt class="docutils literal"><span class="p
 <td>off</td>
 <td>View statistics, like density etc, during simulation.</td>
 </tr>
+<tr><td>TARCH</td>
+<td>--</td>
+<td>--</td>
+<td>Via TARCH the architecture the compiler generates code for can be overridden. The value depends on the chosen compiler.</td>
+</tr>
 <tr><td>VERIFICATION</td>
 <td>on, off</td>
 <td>off</td>
@@ -501,16 +575,18 @@ both configuration can be chosen via <tt class="docutils literal"><span class="p
 </div>
 </div>
 <div class="section" id="invocation">
-<h1><a class="toc-backref" href="#id8">2&nbsp;&nbsp;&nbsp;Invocation</a></h1>
+<h1><a class="toc-backref" href="#id9">2&nbsp;&nbsp;&nbsp;Invocation</a></h1>
 <p>Running the binary will print among the GPL licence header a line like the following:</p>
-<blockquote>
-LBM Benchmark Kernels 0.1, compiled Jul  5 2017 21:59:22, type: verification</blockquote>
+<pre class="literal-block">
+LBM Benchmark Kernels 0.1, compiled Jul  5 2017 21:59:22, type: verification
+</pre>
 <p>if verfication was enabled during compilation or</p>
-<blockquote>
-LBM Benchmark Kernels 0.1, compiled Jul  5 2017 21:59:22, type: benchmark</blockquote>
+<pre class="literal-block">
+LBM Benchmark Kernels 0.1, compiled Jul  5 2017 21:59:22, type: benchmark
+</pre>
 <p>if verfication was disabled during compilation.</p>
 <div class="section" id="command-line-parameters">
-<h2><a class="toc-backref" href="#id9">2.1&nbsp;&nbsp;&nbsp;Command Line Parameters</a></h2>
+<h2><a class="toc-backref" href="#id10">2.1&nbsp;&nbsp;&nbsp;Command Line Parameters</a></h2>
 <p>Running the binary with <tt class="docutils literal"><span class="pre">-h</span></tt> list all available parameters:</p>
 <pre class="literal-block">
 Usage:
@@ -540,7 +616,7 @@ $ bin/lbmbenchk-linux-intel-release -verfiy -dims 32x32x32
 <p>Kernel specific parameters can be opatained via selecting the specific kernel
 and passing <tt class="docutils literal"><span class="pre">-h</span></tt> as parameter:</p>
 <pre class="literal-block">
-$ bin/lbmbenchk-linux-intel-release -kernel -- -h
+$ bin/lbmbenchk-linux-intel-release -kernel kernel-name -- -h
 ...
 Kernel parameters:
 [-blk &lt;n&gt;] [-blk-[xyz] &lt;n&gt;]
@@ -574,14 +650,225 @@ Available kernels to benchmark:
    blk-pull-aos
 </pre>
 </div>
+<div class="section" id="kernels">
+<h2><a class="toc-backref" href="#id11">2.2&nbsp;&nbsp;&nbsp;Kernels</a></h2>
+<p>The following list shortly describes available kernels:</p>
+<ul class="simple">
+<li>push-soa/push-aos/pull-soa/pull-aos:
+Unoptimized kernels (but stream/collide are already fused) using two grids as
+source and destination. Implement push/pull semantics as well structure of
+arrays (soa) or array of structures (aos) layout.</li>
+<li>blk-push-soa/blk-push-aos/blk-pull-soa/blk-pull-aos:
+The same as the unoptimized kernels without the blk prefix, except that they support
+spatial blocking, i.e. loop blocking of the three loops used to iterate over
+the lattice. Here manual work sharing for OpenMP is used.</li>
+<li>list-push-soa/list-push-aos/list-pull-soa/list-pull-aos:
+The same as the unoptimized kernels without the list prefix, but for indirect addressing.
+Here only a 1D vector of is used to store the fluid nodes, omitting the
+obstacles. An adjacency list is used to recover the neighborhood associations.</li>
+<li>list-pull-split-nt-1s-soa/list-pull-split-nt-2s-soa:
+Optimized variant of list-pull-soa. Chunks of the lattice are processed as
+once. Postcollision values are written back via nontemporal stores in 18 (1s)
+or 9 (2s) loops.</li>
+<li>list-aa-aos/list-aa-soa:
+Unoptimized implementation of the AA pattern for the 1D vector with adjacency
+list. Supported are array of structures (aos) and structure of arrays (soa)
+data layout is supported.</li>
+<li>list-aa-ria-soa:
+Implementation of AA pattern with intrinsics for the 1D vector with adjacency
+list. Furthermore it contains a vectorized even time step and run length
+coding to reduce the loop balance of the odd time step.</li>
+<li>list-aa-pv-soa:
+All optimizations of list-aa-ria-soa. Additional with partial vectorization
+of the odd time step.</li>
+</ul>
+<p>Note that all array of structures (aos) kernels might require blocking
+(depending on the domain size) to reach the performance of their structure of
+arrays (soa) counter parts.</p>
+<p>The following table summarizes the properties of the kernels. Here <strong>D</strong> means
+direct addressing, i.e. full array, <strong>I</strong> means indirect addressing, i.e. 1D
+vector with adjacency list, <strong>x</strong> means supported, whereas <strong>--</strong> means unsupported.
+The loop balance B_l is computed for D3Q19 model with double precision floating
+point for PDFs (8 byte) and 4 byte integers for the index (adjacency list).
+As list-aa-ria-soa and list-aa-pv-soa support run length coding their effective
+loop balance depends on the geometry. The effective loop balance is printed
+during each run.</p>
+<table border="1" class="docutils">
+<colgroup>
+<col width="29%" />
+<col width="14%" />
+<col width="14%" />
+<col width="6%" />
+<col width="10%" />
+<col width="10%" />
+<col width="16%" />
+</colgroup>
+<thead valign="bottom">
+<tr><th class="head">kernel name</th>
+<th class="head">prop. step</th>
+<th class="head">data layout</th>
+<th class="head">addr.</th>
+<th class="head">parallel</th>
+<th class="head">blocking</th>
+<th class="head">B_l [B/FLUP]</th>
+</tr>
+</thead>
+<tbody valign="top">
+<tr><td>push-soa</td>
+<td>OS</td>
+<td>SoA</td>
+<td>D</td>
+<td>x</td>
+<td>--</td>
+<td>456</td>
+</tr>
+<tr><td>push-aos</td>
+<td>OS</td>
+<td>AoS</td>
+<td>D</td>
+<td>x</td>
+<td>--</td>
+<td>456</td>
+</tr>
+<tr><td>pull-soa</td>
+<td>OS</td>
+<td>SoA</td>
+<td>D</td>
+<td>x</td>
+<td>--</td>
+<td>456</td>
+</tr>
+<tr><td>pull-aos</td>
+<td>OS</td>
+<td>AoS</td>
+<td>D</td>
+<td>x</td>
+<td>--</td>
+<td>456</td>
+</tr>
+<tr><td>blk-push-soa</td>
+<td>OS</td>
+<td>SoA</td>
+<td>D</td>
+<td>x</td>
+<td>x</td>
+<td>456</td>
+</tr>
+<tr><td>blk-push-aos</td>
+<td>OS</td>
+<td>AoS</td>
+<td>D</td>
+<td>x</td>
+<td>x</td>
+<td>456</td>
+</tr>
+<tr><td>blk-pull-soa</td>
+<td>OS</td>
+<td>SoA</td>
+<td>D</td>
+<td>x</td>
+<td>x</td>
+<td>456</td>
+</tr>
+<tr><td>blk-pull-aos</td>
+<td>OS</td>
+<td>AoS</td>
+<td>D</td>
+<td>x</td>
+<td>x</td>
+<td>456</td>
+</tr>
+<tr><td>list-push-soa</td>
+<td>OS</td>
+<td>SoA</td>
+<td>I</td>
+<td>x</td>
+<td>x</td>
+<td>528</td>
+</tr>
+<tr><td>list-push-aos</td>
+<td>OS</td>
+<td>AoS</td>
+<td>I</td>
+<td>x</td>
+<td>x</td>
+<td>528</td>
+</tr>
+<tr><td>list-pull-soa</td>
+<td>OS</td>
+<td>SoA</td>
+<td>I</td>
+<td>x</td>
+<td>x</td>
+<td>528</td>
+</tr>
+<tr><td>list-pull-aos</td>
+<td>OS</td>
+<td>AoS</td>
+<td>I</td>
+<td>x</td>
+<td>x</td>
+<td>528</td>
+</tr>
+<tr><td>list-pull-split-nt-1s</td>
+<td>OS</td>
+<td>SoA</td>
+<td>I</td>
+<td>x</td>
+<td>x</td>
+<td>376</td>
+</tr>
+<tr><td>list-pull-split-nt-2s</td>
+<td>OS</td>
+<td>SoA</td>
+<td>I</td>
+<td>x</td>
+<td>x</td>
+<td>376</td>
+</tr>
+<tr><td>list-aa-soa</td>
+<td>AA</td>
+<td>SoA</td>
+<td>I</td>
+<td>x</td>
+<td>x</td>
+<td>340</td>
+</tr>
+<tr><td>list-aa-aos</td>
+<td>AA</td>
+<td>AoS</td>
+<td>I</td>
+<td>x</td>
+<td>x</td>
+<td>340</td>
+</tr>
+<tr><td>list-aa-ria-soa</td>
+<td>AA</td>
+<td>SoA</td>
+<td>I</td>
+<td>x</td>
+<td>x</td>
+<td>304-342</td>
+</tr>
+<tr><td>list-aa-pv-soa</td>
+<td>AA</td>
+<td>SoA</td>
+<td>I</td>
+<td>x</td>
+<td>x</td>
+<td>304-342</td>
+</tr>
+</tbody>
+</table>
+</div>
 </div>
 <div class="section" id="id1">
-<h1><a class="toc-backref" href="#id10">3&nbsp;&nbsp;&nbsp;Benchmarking</a></h1>
+<h1><a class="toc-backref" href="#id12">3&nbsp;&nbsp;&nbsp;Benchmarking</a></h1>
 <p>Correct benchmarking is a nontrivial task. Whenever benchmark results should be
 created make sure the binary was compiled with:</p>
 <ul class="simple">
-<li><tt class="docutils literal">BENCHMARK=on</tt> and</li>
-<li><tt class="docutils literal">BUILD=release</tt> and</li>
+<li><tt class="docutils literal">BENCHMARK=on</tt> (default if not overriden) and</li>
+<li><tt class="docutils literal">BUILD=release</tt> (default if not overriden) and</li>
 <li>the correct ISA for macros is used, selected via <tt class="docutils literal">ISA</tt> and</li>
 <li>use <tt class="docutils literal">TARCH</tt> to specify the architecture the compiler generates code for.</li>
 </ul>
@@ -594,7 +881,13 @@ $ bin/lbmbenchk-linux-intel-release ... -t 10 -pin $(seq -s , 0 9)
 <ul class="simple">
 <li>transparent huge pages: when allocating memory small 4 KiB pages might be
 replaced with larger ones. This is in general a good thing, but if this is
-really the case, depends on the system settings.</li>
+really the case, depends on the system settings (check e.g. the status of
+<tt class="docutils literal">/sys/kernel/mm/transparent_hugepage/enabled</tt>).
+Currently <tt class="docutils literal">madvise(MADV_HUGEPAGE)</tt> is used for allocations which are aligned to
+a 4 KiB page, which should be the case for the lattices.
+This should result in huge pages except THP is disabled on the machine.
+(NOTE: madvise() is used if <tt class="docutils literal">HAVE_HUGE_PAGES</tt> is defined, which is currently
+hard coded defined in <tt class="docutils literal">Memory.c</tt>).</li>
 <li>CPU/core frequency: For reproducible results the frequency of all cores
 should be fixed.</li>
 <li>NUMA placement policy: The benchmark assumes a first touch policy, which
@@ -604,19 +897,63 @@ used is already full memory might be allocated in a remote domain. Accesses
 to remote domains typically have a higher latency and lower bandwidth.</li>
 <li>System load: interference with other application, espcially on desktop
 systems should be avoided.</li>
-<li>Padding: most kernels do not care about padding against cache or TLB
-thrashing. Even if the number of (fluid) nodes suggest everything is fine,
-through parallelization still problems might occur.</li>
+<li>Padding: For SoA based kernels the number of (fluid) nodes is automatically
+adjusted so that no cache or TLB thrashing should occur. The parameters are
+optimized for current Intel based systems. For more details look into the
+padding section.</li>
 <li>CPU dispatcher function: the compiler might add different versions of a
 function for different ISA extensions. Make sure the code you might think is
 executed is actually the code which is executed.</li>
 </ul>
+<div class="section" id="padding">
+<h2><a class="toc-backref" href="#id13">3.1&nbsp;&nbsp;&nbsp;Padding</a></h2>
+<p>With correct padding cache and TLB thrashing can be avoided. Therefore the
+number of (fluid) nodes used in the data layout is artificially increased.</p>
+<p>Currently automatic padding is active for kernels which support it. It can be
+controlled via the kernel parameter (i.e. parameter after the <tt class="docutils literal"><span class="pre">--</span></tt>)
+<tt class="docutils literal"><span class="pre">-pad</span></tt>. Supported values are <tt class="docutils literal">auto</tt> (default), <tt class="docutils literal">no</tt> (to disable padding),
+or a manual padding.</p>
+<p>Automatic padding tries to avoid cache and TLB thrashing and pads for a 32
+entry (huge pages) TLB with 8 sets and a 512 set (L2) cache. This reflects the
+parameters of current Intel based processors.</p>
+<p>Manual padding is done via a padding string and has the format
+<tt class="docutils literal"><span class="pre">mod_1+offset_1(,mod_n+offset_n)</span></tt>, which specifies numbers of bytes.
+SoA data layouts can exhibit TLB thrashing. Therefore we want to distribute the
+19 pages with one lattice (36 with two lattices) we are concurrently accessing
+over as much sets in the TLB as possible.
+This is controlled by the distance between the accessed pages, which is the
+number of (fluid) nodes in between them and can be adjusted by adding further
+(fluid) nodes.
+We want the distance d (in bytes) between two accessed pages to be e.g.
+<strong>d % (PAGE_SIZE * TLB_SETS) = PAGE_SIZE</strong>.
+This would distribute the pages evenly over the sets. Hereby <strong>PAGE_SIZE * TLB_SETS</strong>
+would be our <tt class="docutils literal">mod_1</tt> and <strong>PAGE_SIZE</strong> (after the =) our <tt class="docutils literal">offset_1</tt>.
+Measurements show that with only a quarter of half of a page size as offset
+higher performance is achieved, which is done by automatic padding.
+On top of this padding more paddings can be added. They are just added to the
+padding string and are separated by commas.</p>
+<p>A zero modulus in the padding string has a special meaning. Here the
+corresponding offset is just added to the number of nodes. A padding string
+like <tt class="docutils literal"><span class="pre">-pad</span> 0+16</tt> would at a static padding of two nodes (one node = 8 b).</p>
+</div>
+</div>
+<div class="section" id="geometries">
+<h1><a class="toc-backref" href="#id14">4&nbsp;&nbsp;&nbsp;Geometries</a></h1>
+<p>TODO: supported geometries: channel, pipe, blocks</p>
+</div>
+<div class="section" id="results">
+<h1><a class="toc-backref" href="#id15">5&nbsp;&nbsp;&nbsp;Results</a></h1>
+<p>TODO</p>
+</div>
+<div class="section" id="licence">
+<h1><a class="toc-backref" href="#id16">6&nbsp;&nbsp;&nbsp;Licence</a></h1>
+<p>The Lattice Boltzmann Benchmark Kernels are licensed under GPLv3.</p>
 </div>
 <div class="section" id="acknowledgements">
-<h1><a class="toc-backref" href="#id11">4&nbsp;&nbsp;&nbsp;Acknowledgements</a></h1>
+<h1><a class="toc-backref" href="#id17">7&nbsp;&nbsp;&nbsp;Acknowledgements</a></h1>
 <p>This work was funded by BMBF, grant no. 01IH15003A (project SKAMPY).</p>
 <p>This work was funded by KONWHIR project OMI4PAPS.</p>
-<p>Document was generated at 2017-10-26 09:43.</p>
+<p>Document was generated at 2017-11-02 15:33.</p>
 </div>
 </div>
 </body>
diff --git a/doc/main.css b/doc/main.css
new file mode 100644
index 0000000..27d3f26
--- /dev/null
+++ b/doc/main.css
@@ -0,0 +1,46 @@
+
+h1, h2, h3, h4, h5, h6 {
+  font-family: sans-serif;
+  font-size: 100%;
+  background-color: #dcdcdc;
+}
+
+h1.title {
+  background-color: gray;
+  color: white
+}
+
+table.footnote {
+  padding-left: 0.5ex;
+}
+
+table.citation {
+  padding-left: 0.5ex
+}
+
+td.label {
+  width: 10%;
+}
+
+table, table.docutils, td, th {
+  border: 0;
+}
+
+table.citation, table.footnote {
+  width: 100%;
+}
+
+th {
+    background-color: lavender ;
+}
+
+tr:nth-child(even) {
+    xxbackground-color: aliceblue;
+    background-color: white; 
+}
+tr:nth-child(odd) {
+    xxbackground-color: lavender;
+    background-color: whitesmoke;
+}
+
+
diff --git a/doc/main.rst b/doc/main.rst
index 9ad56ba..528b339 100644
--- a/doc/main.rst
+++ b/doc/main.rst
@@ -54,16 +54,16 @@ Debug and Verification
 
 :: 
 
-  make
+  make BUILD=debug BENCHMARK=off
 
-Running ``make`` without any arguments builds the debug version (BUILD=debug) of
+Running ``make`` with ``BUILD=debug`` builds the debug version of
 the benchmark kernels, where no optimizations are performed,  line numbers and
 debug symbols are included as well as ``DEBUG`` will be defined.  The resulting
 binary will be found in the ``bin`` subdirectory and named
 ``lbmbenchk-linux-<compiler>-debug``.
  
-Without any further specification the binary includes verification
-(``VERIFICATION=on``), statistics (``STATISTICS``), and VTK output
+Specifying ``BENCHMARK=off`` turns on verification
+(``VERIFICATION=on``), statistics (``STATISTICS=on``), and VTK output
 (``VTK_OUTPUT=on``) enabled. 
 
 Please note that the generated binary will therefore
@@ -74,9 +74,10 @@ Benchmarking
 
 To generate a binary for benchmarking run make with ::
 
-  make BENCHMARK=on BUILD=release
+  make 
 
-Here BUILD=release turns optimizations on and BENCHMARK=on disables
+As default ``BENCHMARK=on`` and ``BUILD=release`` is set, where
+BUILD=release turns optimizations on and ``BENCHMARK=on`` disables
 verfification, statistics, and VTK output.
 
 Release and Verification
@@ -85,7 +86,7 @@ Release and Verification
 Verification with the debug builds can be extremely slow. Hence verification
 capabilities can be build with release builds: ::
 
-  make BUILD=release
+  make BENCHMARK=off 
 
 Compilers
 ---------
@@ -94,6 +95,24 @@ Currently only the GCC and Intel compiler under Linux are supported. Between
 both configuration can be chosen via ``CONFIG=linux-gcc`` or
 ``CONFIG=linux-intel``.
 
+
+Cleaning
+--------
+
+For each configuration and build (debug/release) a subdirectory under the
+``src/obj`` directory is created where the dependency and object files are
+stored.
+With ::
+
+  make CONFIG=... BUILD=... clean
+
+a specific combination is select and cleaned, whereas with ::
+
+  make clean-all
+
+all object and dependency files are deleted.
+
+
 Options Summary
 ---------------
 
@@ -102,13 +121,13 @@ Options that can be specified when building the framework with make:
 ============= ======================= ============ ==========================================================
 name          values                  default      description
 ------------- ----------------------- ------------ ----------------------------------------------------------
-TARCH         --                      --           Via TARCH the architecture the compiler generates code for can be overriden. The value depends on the chose compiler.
-BENCHMARK     on, off                 off          If enabled, disables VERIFICATION, STATISTICS, VTK_OUTPUT.
-BUILD         debug, release          debug        No optimization, debug symbols, DEBUG defined.
+BENCHMARK     on, off                 on           If enabled, disables VERIFICATION, STATISTICS, VTK_OUTPUT. If disabled enables the three former options.
+BUILD         debug, release          release      No optimization, debug symbols, DEBUG defined.
 CONFIG        linux-gcc, linux-intel  linux-intel  Select GCC or Intel compiler. 
 ISA           avx, sse                avx          Determines which ISA extension is used for macro definitions. This is *not* the architecture the compiler generates code for.
 OPENMP        on, off                 on           OpenMP, i.\,e.\. threading support.
 STATISTICS    on, off                 off          View statistics, like density etc, during simulation. 
+TARCH         --                      --           Via TARCH the architecture the compiler generates code for can be overridden. The value depends on the chosen compiler.
 VERIFICATION  on, off                 off          Turn verification on/off.
 VTK_OUTPUT    on, off                 off          Enable/Disable VTK file output.
 ============= ======================= ============ ==========================================================
@@ -116,11 +135,11 @@ VTK_OUTPUT    on, off                 off          Enable/Disable VTK file outpu
 Invocation
 ==========
 
-Running the binary will print among the GPL licence header a line like the following:
+Running the binary will print among the GPL licence header a line like the following: ::
  
   LBM Benchmark Kernels 0.1, compiled Jul  5 2017 21:59:22, type: verification
 
-if verfication was enabled during compilation or
+if verfication was enabled during compilation or ::
 
   LBM Benchmark Kernels 0.1, compiled Jul  5 2017 21:59:22, type: benchmark
 
@@ -159,7 +178,7 @@ iterations, etc, which can afterward be override, e.g.: ::
 Kernel specific parameters can be opatained via selecting the specific kernel
 and passing ``-h`` as parameter: ::
 
-  $ bin/lbmbenchk-linux-intel-release -kernel -- -h
+  $ bin/lbmbenchk-linux-intel-release -kernel kernel-name -- -h
   ...
   Kernel parameters:
   [-blk <n>] [-blk-[xyz] <n>]
@@ -193,6 +212,82 @@ A list of all available kernels can be obtained via ``-list``: ::
      blk-pull-soa
      blk-pull-aos
 
+Kernels
+-------
+
+The following list shortly describes available kernels:
+
+- push-soa/push-aos/pull-soa/pull-aos:
+  Unoptimized kernels (but stream/collide are already fused) using two grids as
+  source and destination. Implement push/pull semantics as well structure of
+  arrays (soa) or array of structures (aos) layout.
+
+- blk-push-soa/blk-push-aos/blk-pull-soa/blk-pull-aos:
+  The same as the unoptimized kernels without the blk prefix, except that they support
+  spatial blocking, i.e. loop blocking of the three loops used to iterate over
+  the lattice. Here manual work sharing for OpenMP is used.
+
+- list-push-soa/list-push-aos/list-pull-soa/list-pull-aos:
+  The same as the unoptimized kernels without the list prefix, but for indirect addressing.
+  Here only a 1D vector of is used to store the fluid nodes, omitting the
+  obstacles. An adjacency list is used to recover the neighborhood associations.
+
+- list-pull-split-nt-1s-soa/list-pull-split-nt-2s-soa:
+  Optimized variant of list-pull-soa. Chunks of the lattice are processed as
+  once. Postcollision values are written back via nontemporal stores in 18 (1s)
+  or 9 (2s) loops.
+
+- list-aa-aos/list-aa-soa:
+  Unoptimized implementation of the AA pattern for the 1D vector with adjacency
+  list. Supported are array of structures (aos) and structure of arrays (soa)
+  data layout is supported.
+
+- list-aa-ria-soa:
+  Implementation of AA pattern with intrinsics for the 1D vector with adjacency
+  list. Furthermore it contains a vectorized even time step and run length
+  coding to reduce the loop balance of the odd time step.
+
+- list-aa-pv-soa:
+  All optimizations of list-aa-ria-soa. Additional with partial vectorization
+  of the odd time step.
+
+
+Note that all array of structures (aos) kernels might require blocking
+(depending on the domain size) to reach the performance of their structure of
+arrays (soa) counter parts.
+
+The following table summarizes the properties of the kernels. Here **D** means
+direct addressing, i.e. full array, **I** means indirect addressing, i.e. 1D
+vector with adjacency list, **x** means supported, whereas **--** means unsupported.
+The loop balance B_l is computed for D3Q19 model with double precision floating
+point for PDFs (8 byte) and 4 byte integers for the index (adjacency list).
+As list-aa-ria-soa and list-aa-pv-soa support run length coding their effective
+loop balance depends on the geometry. The effective loop balance is printed
+during each run.
+
+
+====================== =========== =========== ===== ======== ======== ============
+kernel name            prop. step  data layout addr. parallel blocking B_l [B/FLUP]
+====================== =========== =========== ===== ======== ======== ============
+push-soa               OS          SoA         D     x         --      456
+push-aos               OS          AoS         D     x         --      456
+pull-soa               OS          SoA         D     x         --      456
+pull-aos               OS          AoS         D     x         --      456
+blk-push-soa           OS          SoA         D     x         x       456
+blk-push-aos           OS          AoS         D     x         x       456
+blk-pull-soa           OS          SoA         D     x         x       456
+blk-pull-aos           OS          AoS         D     x         x       456
+list-push-soa          OS          SoA         I     x         x       528
+list-push-aos          OS          AoS         I     x         x       528
+list-pull-soa          OS          SoA         I     x         x       528
+list-pull-aos          OS          AoS         I     x         x       528
+list-pull-split-nt-1s  OS          SoA         I     x         x       376
+list-pull-split-nt-2s  OS          SoA         I     x         x       376
+list-aa-soa            AA          SoA         I     x         x       340
+list-aa-aos            AA          AoS         I     x         x       340
+list-aa-ria-soa        AA          SoA         I     x         x       304-342
+list-aa-pv-soa         AA          SoA         I     x         x       304-342
+====================== =========== =========== ===== ======== ======== ============
 
 Benchmarking
 ============
@@ -200,8 +295,8 @@ Benchmarking
 Correct benchmarking is a nontrivial task. Whenever benchmark results should be
 created make sure the binary was compiled with:  
 
-- ``BENCHMARK=on`` and
-- ``BUILD=release`` and 
+- ``BENCHMARK=on`` (default if not overriden) and
+- ``BUILD=release`` (default if not overriden) and 
 - the correct ISA for macros is used, selected via ``ISA`` and
 - use ``TARCH`` to specify the architecture the compiler generates code for.
 
@@ -214,7 +309,13 @@ Things the binary does nor check or controll:
 
 - transparent huge pages: when allocating memory small 4 KiB pages might be
   replaced with larger ones. This is in general a good thing, but if this is
-  really the case, depends on the system settings.
+  really the case, depends on the system settings (check e.g. the status of
+  ``/sys/kernel/mm/transparent_hugepage/enabled``).
+  Currently ``madvise(MADV_HUGEPAGE)`` is used for allocations which are aligned to
+  a 4 KiB page, which should be the case for the lattices. 
+  This should result in huge pages except THP is disabled on the machine.
+  (NOTE: madvise() is used if ``HAVE_HUGE_PAGES`` is defined, which is currently
+  hard coded defined in ``Memory.c``).
 
 - CPU/core frequency: For reproducible results the frequency of all cores
   should be fixed.
@@ -228,14 +329,69 @@ Things the binary does nor check or controll:
 - System load: interference with other application, espcially on desktop
   systems should be avoided.
 
-- Padding: most kernels do not care about padding against cache or TLB
-  thrashing. Even if the number of (fluid) nodes suggest everything is fine,
-  through parallelization still problems might occur.
+- Padding: For SoA based kernels the number of (fluid) nodes is automatically
+  adjusted so that no cache or TLB thrashing should occur. The parameters are
+  optimized for current Intel based systems. For more details look into the
+  padding section.
 
 - CPU dispatcher function: the compiler might add different versions of a
   function for different ISA extensions. Make sure the code you might think is
   executed is actually the code which is executed.
 
+Padding
+-------
+
+With correct padding cache and TLB thrashing can be avoided. Therefore the
+number of (fluid) nodes used in the data layout is artificially increased.
+
+Currently automatic padding is active for kernels which support it. It can be
+controlled via the kernel parameter (i.e. parameter after the ``--``)
+``-pad``. Supported values are ``auto`` (default), ``no`` (to disable padding),
+or a manual padding.
+
+Automatic padding tries to avoid cache and TLB thrashing and pads for a 32
+entry (huge pages) TLB with 8 sets and a 512 set (L2) cache. This reflects the
+parameters of current Intel based processors.
+
+Manual padding is done via a padding string and has the format
+``mod_1+offset_1(,mod_n+offset_n)``, which specifies numbers of bytes.
+SoA data layouts can exhibit TLB thrashing. Therefore we want to distribute the
+19 pages with one lattice (36 with two lattices) we are concurrently accessing
+over as much sets in the TLB as possible.
+This is controlled by the distance between the accessed pages, which is the
+number of (fluid) nodes in between them and can be adjusted by adding further
+(fluid) nodes.
+We want the distance d (in bytes) between two accessed pages to be e.g. 
+**d % (PAGE_SIZE * TLB_SETS) = PAGE_SIZE**. 
+This would distribute the pages evenly over the sets. Hereby **PAGE_SIZE * TLB_SETS**
+would be our ``mod_1`` and **PAGE_SIZE** (after the =) our ``offset_1``.
+Measurements show that with only a quarter of half of a page size as offset
+higher performance is achieved, which is done by automatic padding.
+On top of this padding more paddings can be added. They are just added to the
+padding string and are separated by commas.
+
+A zero modulus in the padding string has a special meaning. Here the
+corresponding offset is just added to the number of nodes. A padding string
+like ``-pad 0+16`` would at a static padding of two nodes (one node = 8 b).
+
+
+Geometries
+==========
+
+TODO: supported geometries: channel, pipe, blocks
+
+
+Results
+=======
+
+TODO
+
+
+Licence
+=======
+
+The Lattice Boltzmann Benchmark Kernels are licensed under GPLv3.
+
 
 Acknowledgements
 ================
diff --git a/src/Base.h b/src/Base.h
index 27f9116..ae61082 100644
--- a/src/Base.h
+++ b/src/Base.h
@@ -44,8 +44,6 @@ static inline double Time()
 }
 
 
-#define TOOL_NAME "lbmbenchk"
-
 #define STRINGIFYX(x)   #x
 #define STRINGIFY(x)    STRINGIFYX(x)
 
@@ -117,17 +115,17 @@ static inline double Time()
         } while (0)
 
     #define Print(formatString, ...) \
-        fprintf(stdout, SHC_MAGENTA "[" TOOL_NAME "] " SHC_NC formatString, ##__VA_ARGS__)
+        fprintf(stdout, SHC_MAGENTA formatString SHC_NC, ##__VA_ARGS__)
 
     #define Warning(formatString, ...) \
-        fprintf(stdout, "[" TOOL_NAME "] WARNING: " formatString, ##__VA_ARGS__)
+        fprintf(stdout, SHC_BROWN "WARNING: " SHC_NC formatString, ##__VA_ARGS__)
 
     #define Error(formatString, ...) \
-        fprintf(stderr, SHC_RED "[" TOOL_NAME "] ERROR: " formatString SHC_NC , ##__VA_ARGS__)
-
+        fprintf(stderr, SHC_RED "ERROR: " formatString SHC_NC , ##__VA_ARGS__)
+/*
     #define DebugPrint(formatString, ...) \
-        fprintf(stderr, "[" TOOL_NAME "] DEBUG: " formatString, ##__VA_ARGS__)
-
+        fprintf(stderr, "DEBUG: " formatString, ##__VA_ARGS__)
+*/
     #ifndef NO_SHELL_COLORS
 
         // or "\e"
diff --git a/src/BenchKernelD3Q19.c b/src/BenchKernelD3Q19.c
index 8203d2a..bd6d43c 100644
--- a/src/BenchKernelD3Q19.c
+++ b/src/BenchKernelD3Q19.c
@@ -28,6 +28,7 @@
 
 #include "Memory.h"
 #include "Vtk.h"
+#include "LikwidIf.h"
 
 #include <inttypes.h>
 #include <math.h>
@@ -98,6 +99,8 @@ void FNAME(D3Q19Kernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd
 
 	for (int iter = 0; iter < maxIterations; ++iter) {
 
+		X_LIKWID_START("os");
+
 		#ifdef _OPENMP
 		#pragma omp parallel for collapse(3) default(none) \
 				shared(gDims,src, dst, w_0, w_1, w_2, omegaEven, omegaOdd, \
@@ -112,9 +115,9 @@ void FNAME(D3Q19Kernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd
 				  	evenPart, oddPart, w_1_indep, w_2_indep)
 		#endif
 
-		for (int z = oZ; z < nZ + oZ; ++z) {
+		for (int x = oX; x < nX + oX; ++x) {
 			for (int y = oY; y < nY + oY; ++y) {
-				for (int x = oX; x < nX + oX; ++x) {
+				for (int z = oZ; z < nZ + oZ; ++z) {
 					#define I(x, y, z, dir)	P_INDEX_5(gDims, (x), (y), (z), (dir))
 
 #ifdef PROP_MODEL_PUSH
@@ -306,6 +309,9 @@ void FNAME(D3Q19Kernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd
 			}
 		} // z, y, x (from inner to outer)
 
+		// Stop counters before bounce back. Else computing loop balance will be incorrect.
+		X_LIKWID_STOP("os");
+
 		// Fixup bounce back PDFs.
 		#ifdef _OPENMP
 		#pragma omp parallel for default(none) \
@@ -429,21 +435,8 @@ void FNAME(D3Q19BlkKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData *
 
 	for (int iter = 0; iter < maxIterations; ++iter) {
 
-		// #ifdef _OPENMP --> add line continuation
-		// #pragma omp parallel for collapse(3) default(none)
-		// 		shared(gDims,src, dst, w_0, w_1, w_2, omegaEven, omegaOdd,
-		// 		w_1_x3, w_2_x3, w_1_nine_half, w_2_nine_half, cd,
-		// 		oX, oY, oZ, nX, nY, nZ, blk)
-		// 		private(ux, uy, uz, ui, dens, dir_indep_trm,
-		// 			pdf_C,
-		// 		  	pdf_N, pdf_E, pdf_S, pdf_W,
-		// 		  	pdf_NE, pdf_SE, pdf_SW, pdf_NW,
-		// 		  	pdf_T, pdf_TN, pdf_TE, pdf_TS, pdf_TW,
-		// 		  	pdf_B, pdf_BN, pdf_BE, pdf_BS, pdf_BW,
-		// 		  	evenPart, oddPart, w_1_indep, w_2_indep)
-		// #endif
 		#ifdef _OPENMP
-		#pragma omp parallel for default(none) \
+		#pragma omp parallel default(none) \
 				shared(gDims,src, dst, w_0, w_1, w_2, omegaEven, omegaOdd, \
 				w_1_x3, w_2_x3, w_1_nine_half, w_2_nine_half, cd, \
 				oX, oY, oZ, nX, nY, nZ, blk, nThreads) \
@@ -455,16 +448,18 @@ void FNAME(D3Q19BlkKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData *
 				  	pdf_B, pdf_BN, pdf_BE, pdf_BS, pdf_BW, \
 				  	evenPart, oddPart, w_1_indep, w_2_indep)
 		#endif
+		{
+			X_LIKWID_START("blk-os");
 
-		for (int i = 0; i < nThreads; ++i) {
+			int threadId = omp_get_thread_num();
 
-			int threadStartX = nX / nThreads * i;
-			int threadEndX   = nX / nThreads * (i + 1);
+			int threadStartX = nX / nThreads * threadId;
+			int threadEndX   = nX / nThreads * (threadId + 1);
 
 			if (nX % nThreads > 0) {
-				if (nX % nThreads > i) {
-					threadStartX += i;
-					threadEndX   += i + 1;
+				if (nX % nThreads > threadId) {
+					threadStartX += threadId;
+					threadEndX   += threadId + 1;
 				}
 				else {
 					threadStartX += nX % nThreads;
@@ -472,24 +467,18 @@ void FNAME(D3Q19BlkKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData *
 				}
 			}
 
-		// for (int z = oZ; z < nZ + oZ; ++z) {
-		// 	for (int y = oY; y < nY + oY; ++y) {
-		// 		for (int x = oX; x < nX + oX; ++x) {
-		for (int bZ = oZ; bZ < nZ + oZ; bZ += blk[2]) {
+		for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) {
 			for (int bY = oY; bY < nY + oY; bY += blk[1]) {
-				for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) {
-				// for (int bX = oX; bX < nX + oX; bX += blk[0]) {
+				for (int bZ = oZ; bZ < nZ + oZ; bZ += blk[2]) {
 
 					// Must do everything here, else it would break collapse.
 					int eZ = MIN(bZ + blk[2], nZ + oZ);
 					int eY = MIN(bY + blk[1], nY + oY);
 					int eX = MIN(bX + blk[0], threadEndX + oX);
 
-//					printf("%d: %d-%d  %d-%d  %d-%d  %d - %d\n", omp_get_thread_num(), bZ, eZ, bY, eY, bX, eX, threadStartX, threadEndX);
-
-					for (int z = bZ; z < eZ; ++z) {
+					for (int x = bX; x < eX; ++x) {
 						for (int y = bY; y < eY; ++y) {
-							for (int x = bX; x < eX; ++x) {
+							for (int z = bZ; z < eZ; ++z) {
 
 					#define I(x, y, z, dir)	P_INDEX_5(gDims, (x), (y), (z), (dir))
 
@@ -497,7 +486,6 @@ void FNAME(D3Q19BlkKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData *
 
 					// Load PDFs of local cell: pdf_N = src[I(x, y, z, D3Q19_N)]; ...
 					#define X(name, idx, idxinv, _x, _y, _z)	JOIN(pdf_,name) = src[I(x, y, z, idx)];
-					//if (isnan(JOIN(pdf_,name))) { printf("iter: %d %d %d %d %d %s nan\n", iter, x-oX, y-oY, z-oZ, idx, D3Q19_NAMES[idx]); exit(1);}
 					D3Q19_LIST
 					#undef X
 
@@ -505,7 +493,6 @@ void FNAME(D3Q19BlkKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData *
 
 					// Load PDFs of local cell: pdf_N = src[I(x, y, z, D3Q19_N)]; ...
 					#define X(name, idx, idxinv, _x, _y, _z)	JOIN(pdf_,name) = src[I(x - _x, y - _y, z - _z, idx)];
-					//if (isnan(JOIN(pdf_,name))) { printf("iter: %d %d %d %d %d %s nan\n", iter, x-oX, y-oY, z-oZ, idx, D3Q19_NAMES[idx]); exit(1);}
 					D3Q19_LIST
 					#undef X
 
@@ -525,11 +512,11 @@ void FNAME(D3Q19BlkKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData *
 					} else {
 					#endif
 						ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE -
-							pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW;
+							 pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW;
 						uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN -
-							pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS;
+							 pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS;
 						uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS -
-							pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS;
+							 pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS;
 					#ifdef LID_DRIVEN_CAVITY
 					}
 
@@ -685,7 +672,10 @@ void FNAME(D3Q19BlkKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData *
 			}
 		} // z, y, x (from inner to outer)
 
-		} // loop over threads
+			X_LIKWID_STOP("blk-os");
+		} // parallel region
+
+		// Stop counters before bounce back. Else computing loop balance will be incorrect.
 
 		// Fixup bounce back PDFs.
 		#ifdef _OPENMP
diff --git a/src/BenchKernelD3Q19Aa.c b/src/BenchKernelD3Q19Aa.c
new file mode 100644
index 0000000..87a9e33
--- /dev/null
+++ b/src/BenchKernelD3Q19Aa.c
@@ -0,0 +1,529 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#include "BenchKernelD3Q19AaCommon.h"
+
+#include "Memory.h"
+#include "Vtk.h"
+#include "LikwidIf.h"
+
+#include <inttypes.h>
+#include <math.h>
+
+#ifdef _OPENMP
+	#include <omp.h>
+#endif
+
+void FNAME(D3Q19AaKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd)
+{
+	Assert(ld != NULL);
+	Assert(kernelData != NULL);
+	Assert(cd != NULL);
+
+	Assert(cd->Omega > 0.0);
+	Assert(cd->Omega < 2.0);
+
+	KernelData * kd = (KernelData *)kernelData;
+
+
+	int nX = ld->Dims[0];
+	int nY = ld->Dims[1];
+	int nZ = ld->Dims[2];
+
+	int * gDims = kd->GlobalDims;
+
+	int oX = kd->Offsets[0];
+	int oY = kd->Offsets[1];
+	int oZ = kd->Offsets[2];
+
+	KernelDataAa * kda = KDA(kd);
+
+	int blk[3];
+	blk[0] = kda->Blk[0];
+	blk[1] = kda->Blk[1];
+	blk[2] = kda->Blk[2];
+
+	PdfT omega = cd->Omega;
+	PdfT omegaEven = omega;
+	PdfT magicParam = 1.0 / 12.0;
+	//  1/4: best stability;
+	// 1/12: removes third-order advection error (best advection);
+	//  1/6: removes fourth-order diffusion error (best diffusion);
+	// 3/16: exact location of bounce back for poiseuille flow
+
+	PdfT omegaOdd = 1.0/( 0.5 + magicParam/(1.0/omega - 0.5) );
+
+	PdfT evenPart = 0.0;
+	PdfT oddPart = 0.0;
+	PdfT dir_indep_trm = 0.0;
+
+	PdfT w_0 = 1.0 /  3.0;
+	PdfT w_1 = 1.0 / 18.0;
+	PdfT w_2 = 1.0 / 36.0;
+
+	PdfT w_1_x3 = w_1 * 3.0;	PdfT w_1_nine_half = w_1 * 9.0/2.0;	PdfT w_1_indep = 0.0;
+	PdfT w_2_x3 = w_2 * 3.0;	PdfT w_2_nine_half = w_2 * 9.0/2.0;	PdfT w_2_indep = 0.0;
+
+	PdfT ux, uy, uz, ui;
+	PdfT dens;
+
+	// Declare pdf_N, pdf_E, pdf_S, pdf_W, ...
+	#define X(name, idx, idxinv, x, y, z)	PdfT JOIN(pdf_,name);
+	D3Q19_LIST
+	#undef X
+
+	PdfT * src = kd->Pdfs[0];
+
+	int maxIterations = cd->MaxIterations;
+
+	#ifdef VTK_OUTPUT
+		if (cd->VtkOutput) {
+			kd->PdfsActive = src;
+			VtkWrite(ld, kd, cd, -1);
+		}
+	#endif
+
+	#ifdef STATISTICS
+		kd->PdfsActive = src;
+		KernelStatistics(kd, ld, cd, 0);
+	#endif
+
+
+	int nThreads = 1;
+
+	#ifdef _OPENMP
+	nThreads = omp_get_max_threads();
+	#endif
+
+	for (int iter = 0; iter < maxIterations; iter += 2) {
+
+		// --------------------------------------------------------------------
+		// even time step
+
+		X_LIKWID_START("aa-even");
+
+		// {{{
+
+		#ifdef _OPENMP
+		#pragma omp parallel for default(none) \
+				shared(gDims,src, w_0, w_1, w_2, omegaEven, omegaOdd, \
+				w_1_x3, w_2_x3, w_1_nine_half, w_2_nine_half, cd, \
+				oX, oY, oZ, nX, nY, nZ, blk, nThreads, ld) \
+				private(ux, uy, uz, ui, dens, dir_indep_trm, \
+					pdf_C, \
+				  	pdf_N, pdf_E, pdf_S, pdf_W, \
+				  	pdf_NE, pdf_SE, pdf_SW, pdf_NW, \
+				  	pdf_T, pdf_TN, pdf_TE, pdf_TS, pdf_TW, \
+				  	pdf_B, pdf_BN, pdf_BE, pdf_BS, pdf_BW, \
+				  	evenPart, oddPart, w_1_indep, w_2_indep)
+		#endif
+
+		for (int i = 0; i < nThreads; ++i) {
+
+			int threadStartX = nX / nThreads * i;
+			int threadEndX   = nX / nThreads * (i + 1);
+
+			if (nX % nThreads > 0) {
+				if (nX % nThreads > i) {
+					threadStartX += i;
+					threadEndX   += i + 1;
+				}
+				else {
+					threadStartX += nX % nThreads;
+					threadEndX   += nX % nThreads;
+				}
+			}
+
+			for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) {
+			for (int bY = oY; bY < nY + oY; bY += blk[1]) {
+			for (int bZ = oZ; bZ < nZ + oZ; bZ += blk[2]) {
+
+				int eX = MIN(bX + blk[0], threadEndX + oX);
+				int eY = MIN(bY + blk[1], nY + oY);
+				int eZ = MIN(bZ + blk[2], nZ + oZ);
+
+				// printf("%d: %d-%d  %d-%d  %d-%d  %d - %d\n", omp_get_thread_num(), bZ, eZ, bY, eY, bX, eX, threadStartX, threadEndX);
+
+				for (int x = bX; x < eX; ++x) {
+				for (int y = bY; y < eY; ++y) {
+				for (int z = bZ; z < eZ; ++z) {
+
+
+					if (ld->Lattice[L_INDEX_4(ld->Dims, x - oX, y - oY, z - oZ)] == LAT_CELL_OBSTACLE) {
+						continue;
+					}
+
+					#define I(x, y, z, dir)	P_INDEX_5(gDims, (x), (y), (z), (dir))
+
+
+					// Load PDFs of local cell: pdf_N = src[I(x, y, z, D3Q19_N)]; ...
+					#define X(name, idx, idxinv, _x, _y, _z)	JOIN(pdf_,name) = src[I(x, y, z, idx)];
+					D3Q19_LIST
+					#undef X
+
+// #define LID_DRIVEN_CAVITY
+
+#ifdef LID_DRIVEN_CAVITY
+
+					if (z == nZ - 4 + oZ && x > 3 + oX && x < (nX - 4 + oX) && y > 3 + oY  && y < (nY - 4 + oY)) {
+						ux = 0.1 * 0.577;
+						uy = 0.0;
+						uz = 0.0;
+
+					} else {
+	#endif
+						ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE -
+							 pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW;
+						uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN -
+							 pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS;
+						uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS -
+							 pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS;
+	#ifdef LID_DRIVEN_CAVITY
+					}
+	#endif
+
+					dens = pdf_C +
+							pdf_N  + pdf_E  + pdf_S  + pdf_W  +
+							pdf_NE + pdf_SE + pdf_SW + pdf_NW +
+							pdf_T  + pdf_TN + pdf_TE + pdf_TS + pdf_TW +
+							pdf_B  + pdf_BN + pdf_BE + pdf_BS + pdf_BW;
+
+					dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz)*3.0/2.0;
+
+					// direction: w_0
+					src[I(x,   y,   z, D3Q19_C)]  = pdf_C - omegaEven*(pdf_C - w_0*dir_indep_trm);
+
+					// direction: w_1
+					w_1_indep = w_1*dir_indep_trm;
+
+					ui = uy;
+					evenPart = omegaEven*( 0.5*(pdf_N + pdf_S) - ui*ui*w_1_nine_half - w_1_indep );
+					oddPart = omegaOdd*(0.5*(pdf_N - pdf_S) - ui*w_1_x3 );
+					src[I(x, y, z, D3Q19_S)]  = pdf_N - evenPart - oddPart;
+					src[I(x, y, z, D3Q19_N)]  = pdf_S - evenPart + oddPart;
+
+					ui = ux;
+					evenPart = omegaEven*( 0.5*(pdf_E + pdf_W) - ui*ui*w_1_nine_half - w_1_indep );
+					oddPart = omegaOdd*(0.5*(pdf_E - pdf_W) - ui*w_1_x3 );
+					src[I(x, y, z, D3Q19_W)]  = pdf_E - evenPart - oddPart;
+					src[I(x, y, z, D3Q19_E)]  = pdf_W - evenPart + oddPart;
+
+					ui = uz;
+					evenPart = omegaEven*( 0.5*(pdf_T + pdf_B) - ui*ui*w_1_nine_half - w_1_indep );
+					oddPart = omegaOdd*(0.5*(pdf_T - pdf_B) - ui*w_1_x3 );
+					src[I(x, y, z, D3Q19_B)]  = pdf_T - evenPart - oddPart;
+					src[I(x, y, z, D3Q19_T)]  = pdf_B - evenPart + oddPart;
+
+					// direction: w_2
+					w_2_indep = w_2*dir_indep_trm;
+
+					ui = -ux + uy;
+					evenPart = omegaEven*( 0.5*(pdf_NW + pdf_SE) - ui*ui*w_2_nine_half - w_2_indep );
+					oddPart = omegaOdd*(0.5*(pdf_NW - pdf_SE) - ui*w_2_x3 );
+					src[I(x, y, z, D3Q19_SE)] = pdf_NW - evenPart - oddPart;
+					src[I(x, y, z, D3Q19_NW)] = pdf_SE - evenPart + oddPart;
+
+					ui = ux + uy;
+					evenPart = omegaEven*( 0.5*(pdf_NE + pdf_SW) - ui*ui*w_2_nine_half - w_2_indep );
+					oddPart = omegaOdd*(0.5*(pdf_NE - pdf_SW) - ui*w_2_x3 );
+					src[I(x, y, z, D3Q19_SW)] = pdf_NE - evenPart - oddPart;
+					src[I(x, y, z, D3Q19_NE)] = pdf_SW - evenPart + oddPart;
+
+					ui = -ux + uz;
+					evenPart = omegaEven*( 0.5*(pdf_TW + pdf_BE) - ui*ui*w_2_nine_half - w_2_indep );
+					oddPart = omegaOdd*(0.5*(pdf_TW - pdf_BE) - ui*w_2_x3 );
+					src[I(x, y, z, D3Q19_BE)] = pdf_TW - evenPart - oddPart;
+					src[I(x, y, z, D3Q19_TW)] = pdf_BE - evenPart + oddPart;
+
+					ui = ux + uz;
+					evenPart = omegaEven*( 0.5*(pdf_TE + pdf_BW) - ui*ui*w_2_nine_half - w_2_indep );
+					oddPart = omegaOdd*(0.5*(pdf_TE - pdf_BW) - ui*w_2_x3 );
+					src[I(x, y, z, D3Q19_BW)] = pdf_TE - evenPart - oddPart;
+					src[I(x, y, z, D3Q19_TE)] = pdf_BW - evenPart + oddPart;
+
+					ui = -uy + uz;
+					evenPart = omegaEven*( 0.5*(pdf_TS + pdf_BN) - ui*ui*w_2_nine_half - w_2_indep );
+					oddPart = omegaOdd*(0.5*(pdf_TS - pdf_BN) - ui*w_2_x3 );
+					src[I(x, y, z, D3Q19_BN)] = pdf_TS - evenPart - oddPart;
+					src[I(x, y, z, D3Q19_TS)] = pdf_BN - evenPart + oddPart;
+
+					ui = uy + uz;
+					evenPart = omegaEven*( 0.5*(pdf_TN + pdf_BS) - ui*ui*w_2_nine_half - w_2_indep );
+					oddPart = omegaOdd*(0.5*(pdf_TN - pdf_BS) - ui*w_2_x3 );
+					src[I(x, y, z, D3Q19_BS)] = pdf_TN - evenPart - oddPart;
+					src[I(x, y, z, D3Q19_TN)] = pdf_BS - evenPart + oddPart;
+
+					#undef I
+				} } } // z, y, x (from inner to outer)
+			} } } // z, y, x (from inner to outer)
+
+		} // loop over threads
+
+		// }}}
+
+		X_LIKWID_STOP("aa-even");
+
+		#ifdef STATISTICS
+		kd->PdfsActive = src;
+		KernelStatistics(kd, ld, cd, iter);
+		#endif
+
+		// Fixup bounce back PDFs.
+		#ifdef _OPENMP
+		#pragma omp parallel for default(none) \
+				shared(kd, src)
+		#endif
+		for (int i = 0; i < kd->nBounceBackPdfs; ++i) {
+			src[kd->BounceBackPdfsSrc[i]] = src[kd->BounceBackPdfsDst[i]];
+		}
+
+		// save current iteration
+		kda->Iteration = iter;
+
+		#ifdef VERIFICATION
+		kd->PdfsActive = src;
+		KernelAddBodyForce(kd, ld, cd);
+		#endif
+
+		#ifdef VTK_OUTPUT
+		if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) {
+			kd->PdfsActive = src;
+			VtkWrite(ld, kd, cd, iter);
+		}
+		#endif
+
+		#ifdef STATISTICS
+		kd->PdfsActive = src;
+		KernelStatistics(kd, ld, cd, iter);
+		#endif
+
+		// --------------------------------------------------------------------
+		// odd time step
+
+
+		X_LIKWID_START("aa-odd");
+
+		// {{{
+
+		#ifdef _OPENMP
+		#pragma omp parallel for default(none) \
+				shared(gDims,src, w_0, w_1, w_2, omegaEven, omegaOdd, \
+				w_1_x3, w_2_x3, w_1_nine_half, w_2_nine_half, cd, \
+				oX, oY, oZ, nX, nY, nZ, blk, nThreads) \
+				private(ux, uy, uz, ui, dens, dir_indep_trm, \
+					pdf_C, \
+				  	pdf_N, pdf_E, pdf_S, pdf_W, \
+				  	pdf_NE, pdf_SE, pdf_SW, pdf_NW, \
+				  	pdf_T, pdf_TN, pdf_TE, pdf_TS, pdf_TW, \
+				  	pdf_B, pdf_BN, pdf_BE, pdf_BS, pdf_BW, \
+				  	evenPart, oddPart, w_1_indep, w_2_indep)
+		#endif
+
+		for (int i = 0; i < nThreads; ++i) {
+
+			int threadStartX = nX / nThreads * i;
+			int threadEndX   = nX / nThreads * (i + 1);
+
+			if (nX % nThreads > 0) {
+				if (nX % nThreads > i) {
+					threadStartX += i;
+					threadEndX   += i + 1;
+				}
+				else {
+					threadStartX += nX % nThreads;
+					threadEndX   += nX % nThreads;
+				}
+			}
+
+			for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) {
+			for (int bY = oY; bY < nY + oY; bY += blk[1]) {
+			for (int bZ = oZ; bZ < nZ + oZ; bZ += blk[2]) {
+
+				// Must do everything here, else it would break collapse.
+				int eZ = MIN(bZ + blk[2], nZ + oZ);
+				int eY = MIN(bY + blk[1], nY + oY);
+				int eX = MIN(bX + blk[0], threadEndX + oX);
+
+				for (int x = bX; x < eX; ++x) {
+				for (int y = bY; y < eY; ++y) {
+				for (int z = bZ; z < eZ; ++z) {
+
+					#define I(x, y, z, dir)	P_INDEX_5(gDims, (x), (y), (z), (dir))
+
+					// Load PDFs of local cell: pdf_N = src[I(x, y, z, D3Q19_N)]; ...
+					#define X(name, idx, idxinv, _x, _y, _z)	JOIN(pdf_,name) = src[I(x - _x, y - _y, z - _z, idxinv)];
+					D3Q19_LIST
+					#undef X
+
+
+// #define LID_DRIVEN_CAVITY
+
+#ifdef LID_DRIVEN_CAVITY
+
+					if (z == nZ - 4 + oZ && x > 3 + oX && x < (nX - 4 + oX) && y > 3 + oY  && y < (nY - 4 + oY)) {
+						ux = 0.1 * 0.577;
+						uy = 0.0;
+						uz = 0.0;
+
+					} else {
+#endif
+						ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE -
+							 pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW;
+						uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN -
+							 pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS;
+						uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS -
+							 pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS;
+#ifdef LID_DRIVEN_CAVITY
+					}
+#endif
+
+					dens = pdf_C +
+							pdf_N  + pdf_E  + pdf_S  + pdf_W  +
+							pdf_NE + pdf_SE + pdf_SW + pdf_NW +
+							pdf_T  + pdf_TN + pdf_TE + pdf_TS + pdf_TW +
+							pdf_B  + pdf_BN + pdf_BE + pdf_BS + pdf_BW;
+
+					dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz)*3.0/2.0;
+
+					// direction: w_0
+					src[I(x,   y,   z, D3Q19_C)]  = pdf_C - omegaEven*(pdf_C - w_0*dir_indep_trm);
+
+					// direction: w_1
+					w_1_indep = w_1*dir_indep_trm;
+
+					ui = uy;
+					evenPart = omegaEven*( 0.5*(pdf_N + pdf_S) - ui*ui*w_1_nine_half - w_1_indep );
+					oddPart = omegaOdd*(0.5*(pdf_N - pdf_S) - ui*w_1_x3 );
+					src[I(x, y + 1,   z, D3Q19_N)]  = pdf_N - evenPart - oddPart;
+					src[I(x, y - 1,   z, D3Q19_S)]  = pdf_S - evenPart + oddPart;
+
+					ui = ux;
+					evenPart = omegaEven*( 0.5*(pdf_E + pdf_W) - ui*ui*w_1_nine_half - w_1_indep );
+					oddPart = omegaOdd*(0.5*(pdf_E - pdf_W) - ui*w_1_x3 );
+					src[I(x + 1,   y,   z, D3Q19_E)]  = pdf_E - evenPart - oddPart;
+					src[I(x - 1,   y,   z, D3Q19_W)]  = pdf_W - evenPart + oddPart;
+
+					ui = uz;
+					evenPart = omegaEven*( 0.5*(pdf_T + pdf_B) - ui*ui*w_1_nine_half - w_1_indep );
+					oddPart = omegaOdd*(0.5*(pdf_T - pdf_B) - ui*w_1_x3 );
+					src[I(x,   y, z + 1, D3Q19_T)]  = pdf_T - evenPart - oddPart;
+					src[I(x,   y, z - 1, D3Q19_B)]  = pdf_B - evenPart + oddPart;
+
+					// direction: w_2
+					w_2_indep = w_2*dir_indep_trm;
+
+					ui = -ux + uy;
+					evenPart = omegaEven*( 0.5*(pdf_NW + pdf_SE) - ui*ui*w_2_nine_half - w_2_indep );
+					oddPart = omegaOdd*(0.5*(pdf_NW - pdf_SE) - ui*w_2_x3 );
+					src[I(x - 1, y + 1,   z, D3Q19_NW)] = pdf_NW - evenPart - oddPart;
+					src[I(x + 1, y - 1,   z, D3Q19_SE)] = pdf_SE - evenPart + oddPart;
+
+					ui = ux + uy;
+					evenPart = omegaEven*( 0.5*(pdf_NE + pdf_SW) - ui*ui*w_2_nine_half - w_2_indep );
+					oddPart = omegaOdd*(0.5*(pdf_NE - pdf_SW) - ui*w_2_x3 );
+					src[I(x + 1, y + 1,   z, D3Q19_NE)] = pdf_NE - evenPart - oddPart;
+					src[I(x - 1, y - 1,   z, D3Q19_SW)] = pdf_SW - evenPart + oddPart;
+
+					ui = -ux + uz;
+					evenPart = omegaEven*( 0.5*(pdf_TW + pdf_BE) - ui*ui*w_2_nine_half - w_2_indep );
+					oddPart = omegaOdd*(0.5*(pdf_TW - pdf_BE) - ui*w_2_x3 );
+					src[I(x - 1,   y, z + 1, D3Q19_TW)] = pdf_TW - evenPart - oddPart;
+					src[I(x + 1,   y, z - 1, D3Q19_BE)] = pdf_BE - evenPart + oddPart;
+
+					ui = ux + uz;
+					evenPart = omegaEven*( 0.5*(pdf_TE + pdf_BW) - ui*ui*w_2_nine_half - w_2_indep );
+					oddPart = omegaOdd*(0.5*(pdf_TE - pdf_BW) - ui*w_2_x3 );
+					src[I(x + 1,   y, z + 1, D3Q19_TE)] = pdf_TE - evenPart - oddPart;
+					src[I(x - 1,   y, z - 1, D3Q19_BW)] = pdf_BW - evenPart + oddPart;
+
+					ui = -uy + uz;
+					evenPart = omegaEven*( 0.5*(pdf_TS + pdf_BN) - ui*ui*w_2_nine_half - w_2_indep );
+					oddPart = omegaOdd*(0.5*(pdf_TS - pdf_BN) - ui*w_2_x3 );
+					src[I(x, y - 1, z + 1, D3Q19_TS)] = pdf_TS - evenPart - oddPart;
+					src[I(x, y + 1, z - 1, D3Q19_BN)] = pdf_BN - evenPart + oddPart;
+
+					ui = uy + uz;
+					evenPart = omegaEven*( 0.5*(pdf_TN + pdf_BS) - ui*ui*w_2_nine_half - w_2_indep );
+					oddPart = omegaOdd*(0.5*(pdf_TN - pdf_BS) - ui*w_2_x3 );
+					src[I(x, y + 1, z + 1, D3Q19_TN)] = pdf_TN - evenPart - oddPart;
+					src[I(x, y - 1, z - 1, D3Q19_BS)] = pdf_BS - evenPart + oddPart;
+
+
+					#undef I
+				} } } // z, y, x (from inner to outer)
+			} } } // z, y, x (from inner to outer)
+		} // loop over threads
+
+		// }}}
+
+		// Stop counters before bounce back. Else computing loop balance will be incorrect.
+
+		X_LIKWID_STOP("aa-odd");
+
+		// Fixup bounce back PDFs.
+		#ifdef _OPENMP
+		#pragma omp parallel for default(none) \
+				shared(kd, src)
+		#endif
+		for (int i = 0; i < kd->nBounceBackPdfs; ++i) {
+			src[kd->BounceBackPdfsDst[i]] = src[kd->BounceBackPdfsSrc[i]];
+		}
+
+		// save current iteration
+		kda->Iteration = iter + 1;
+
+		#ifdef VERIFICATION
+		kd->PdfsActive = src;
+		KernelAddBodyForce(kd, ld, cd);
+		#endif
+
+		#ifdef VTK_OUTPUT
+		if (cd->VtkOutput && (iter + 1 % cd->VtkModulus) == 0) {
+			kd->PdfsActive = src;
+			VtkWrite(ld, kd, cd, iter + 1);
+		}
+		#endif
+
+		#ifdef STATISTICS
+		kd->PdfsActive = src;
+		KernelStatistics(kd, ld, cd, iter + 1);
+		#endif // }}}
+
+
+	} // for (int iter = 0; ...
+
+	#ifdef VTK_OUTPUT
+
+	if (cd->VtkOutput) {
+		kd->PdfsActive = src;
+		VtkWrite(ld, kd, cd, maxIterations);
+	}
+
+	#endif
+
+	return;
+}
+
diff --git a/src/BenchKernelD3Q19Aa.h b/src/BenchKernelD3Q19Aa.h
new file mode 100644
index 0000000..d0605d1
--- /dev/null
+++ b/src/BenchKernelD3Q19Aa.h
@@ -0,0 +1,41 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#ifndef __BENCH_KERNEL_D3Q19_AA__
+#define __BENCH_KERNEL_D3Q19_AA__
+
+#include "Kernel.h"
+
+
+void D3Q19AaInit_AaSoA(LatticeDesc * ld, KernelData ** kernelData, Parameters * params);
+void D3Q19AaInit_AaAoS(LatticeDesc * ld, KernelData ** kernelData, Parameters * params);
+
+void D3Q19AaDeinit_AaSoA(LatticeDesc * ld, KernelData ** kernelData);
+void D3Q19AaDeinit_AaAoS(LatticeDesc * ld, KernelData ** kernelData);
+
+
+
+#endif // __BENCH_KERNEL_D3Q19__
diff --git a/src/BenchKernelD3Q19AaCommon.c b/src/BenchKernelD3Q19AaCommon.c
new file mode 100644
index 0000000..fbdcbc1
--- /dev/null
+++ b/src/BenchKernelD3Q19AaCommon.c
@@ -0,0 +1,651 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#include "BenchKernelD3Q19AaCommon.h"
+
+#include "Memory.h"
+#include "Vtk.h"
+
+#include <inttypes.h>
+#include <math.h>
+
+#ifdef _OPENMP
+	#include <omp.h>
+#endif
+
+// Forward definition.
+void FNAME(D3Q19AaKernel)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd);
+
+void FNAME(D3Q19AaBlkKernel)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd);
+
+
+
+static void FNAME(BcGetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT * pdf)
+{
+	Assert(kd != NULL);
+	Assert(kd->PdfsActive != NULL);
+	Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+	Assert(pdf != NULL);
+
+	Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
+	Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
+	Assert(dir >= 0); Assert(dir < N_D3Q19);
+
+	KernelDataAa * kda = KDA(kd);
+
+	int oX = kd->Offsets[0];
+	int oY = kd->Offsets[1];
+	int oZ = kd->Offsets[2];
+
+	if (kda->Iteration % 2 == 0) {
+		// Pdfs are stored inverse, local PDFs are located in remote nodes
+		int nx = x - D3Q19_X[dir];
+		int ny = y - D3Q19_Y[dir];
+		int nz = z - D3Q19_Z[dir];
+
+		#define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
+		*pdf = kd->PdfsActive[I(nx + oX, ny + oY, nz + oZ, D3Q19_INV[dir])];
+		#undef I
+	}
+	else {
+		int nx = x;
+		int ny = y;
+		int nz = z;
+
+		#define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
+		*pdf = kd->PdfsActive[I(nx + oX, ny + oY, nz + oZ, dir)];
+		#undef I
+	}
+
+
+	return;
+}
+
+static void FNAME(BcSetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT pdf)
+{
+	Assert(kd != NULL);
+	Assert(kd->PdfsActive != NULL);
+	Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+
+	Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
+	Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
+	Assert(dir >= 0); Assert(dir < N_D3Q19);
+
+	KernelDataAa * kda = KDA(kd);
+
+	int oX = kd->Offsets[0];
+	int oY = kd->Offsets[1];
+	int oZ = kd->Offsets[2];
+
+	if (kda->Iteration % 2 == 0) {
+		// Pdfs are stored inverse, local PDFs are located in remote nodes
+		int nx = x - D3Q19_X[dir];
+		int ny = y - D3Q19_Y[dir];
+		int nz = z - D3Q19_Z[dir];
+
+		#define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
+		pdf = kd->PdfsActive[I(nx + oX, ny + oY, nz + oZ, D3Q19_INV[dir])] = pdf;
+		#undef I
+	}
+	else {
+		int nx = x;
+		int ny = y;
+		int nz = z;
+
+		#define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
+		kd->PdfsActive[I(nx + oX, ny + oY, nz + oZ, dir)] = pdf;
+		#undef I
+	}
+
+	return;
+}
+
+
+static void FNAME(GetNode)(KernelData * kd, int x, int y, int z, PdfT * pdfs)
+{
+	Assert(kd != NULL);
+	Assert(kd->PdfsActive != NULL);
+	Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+	Assert(pdfs != NULL);
+
+	Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
+	Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
+
+	KernelDataAa * kda = KDA(kd);
+
+	int oX = kd->Offsets[0];
+	int oY = kd->Offsets[1];
+	int oZ = kd->Offsets[2];
+
+
+	if (kda->Iteration % 2 == 0) {
+		// Pdfs are stored inverse, local PDFs are located in remote nodes
+
+		#define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
+		#define X(name, idx, idxinv, _x, _y, _z)	pdfs[idx] = kd->PdfsActive[I(x + oX - _x, y + oY - _y, z + oZ - _z, D3Q19_INV[idx])];
+		D3Q19_LIST
+		#undef X
+		#undef I
+	}
+	else {
+		#define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
+		#define X(name, idx, idxinv, _x, _y, _z)	pdfs[idx] = kd->PdfsActive[I(x + oX, y + oY, z + oZ, idx)];
+		D3Q19_LIST
+		#undef X
+		#undef I
+
+	}
+
+#if 0		// DETECT NANs
+
+	for (int d = 0; d < 19; ++d) {
+		if (isnan(pdfs[d])) {
+			printf("%d %d %d %d nan! get node\n", x, y, z, d);
+
+			for (int d2 = 0; d2 < 19; ++d2) {
+				printf("%d: %e\n", d2, pdfs[d2]);
+			}
+
+			exit(1);
+		}
+	}
+
+#endif
+
+	return;
+}
+
+
+static void FNAME(SetNode)(KernelData * kd, int x, int y, int z, PdfT * pdfs)
+{
+	Assert(kd != NULL);
+	Assert(kd->PdfsActive != NULL);
+	Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+	Assert(pdfs != NULL);
+
+	Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
+	Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
+
+	KernelDataAa * kda = KDA(kd);
+
+	int oX = kd->Offsets[0];
+	int oY = kd->Offsets[1];
+	int oZ = kd->Offsets[2];
+
+	if (kda->Iteration % 2 == 0) {
+		// Pdfs are stored inverse, local PDFs are located in remote nodes
+
+		#define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
+		#define X(name, idx, idxinv, _x, _y, _z)	kd->PdfsActive[I(x + oX - _x, y + oY - _y, z + oZ - _z, D3Q19_INV[idx])] = pdfs[idx];
+		D3Q19_LIST
+		#undef X
+		#undef I
+	}
+	else {
+		#define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
+		#define X(name, idx, idxinv, _x, _y, _z)	kd->PdfsActive[I(x + oX, y + oY, z + oZ, idx)] = pdfs[idx];
+		D3Q19_LIST
+		#undef X
+		#undef I
+	}
+	return;
+}
+
+
+static void ParameterUsage()
+{
+	printf("Kernel parameters:\n");
+	printf("  [-blk <n>] [-blk-[xyz] <n>]\n");
+
+	return;
+}
+
+static void ParseParameters(Parameters * params, int * blk)
+{
+	Assert(blk != NULL);
+
+	blk[0] = 0; blk[1] = 0; blk[2] = 0;
+
+	#define ARG_IS(param)                   (!strcmp(params->KernelArgs[i], param))
+	#define NEXT_ARG_PRESENT() \
+		do { \
+			if (i + 1 >= params->nKernelArgs) { \
+				printf("ERROR: argument %s requires a parameter.\n", params->KernelArgs[i]); \
+				exit(1); \
+			} \
+		} while (0)
+
+
+	for (int i = 0; i < params->nKernelArgs; ++i) {
+		if (ARG_IS("-blk") || ARG_IS("--blk")) {
+			NEXT_ARG_PRESENT();
+
+			int tmp = strtol(params->KernelArgs[++i], NULL, 0);
+
+			if (tmp < 0) {
+				printf("ERROR: blocking parameter must be >= 0.\n");
+				exit(1);
+			}
+
+			blk[0] = blk[1] = blk[2] = tmp;
+		}
+		else if (ARG_IS("-blk-x") || ARG_IS("--blk-x")) {
+			NEXT_ARG_PRESENT();
+
+			int tmp = strtol(params->KernelArgs[++i], NULL, 0);
+
+			if (tmp < 0) {
+				printf("ERROR: blocking parameter must be >= 0.\n");
+				exit(1);
+			}
+
+			blk[0] = tmp;
+		}
+		else if (ARG_IS("-blk-y") || ARG_IS("--blk-y")) {
+			NEXT_ARG_PRESENT();
+
+			int tmp = strtol(params->KernelArgs[++i], NULL, 0);
+
+			if (tmp < 0) {
+				printf("ERROR: blocking parameter must be >= 0.\n");
+				exit(1);
+			}
+
+			blk[1] = tmp;
+		}
+		else if (ARG_IS("-blk-z") || ARG_IS("--blk-z")) {
+			NEXT_ARG_PRESENT();
+
+			int tmp = strtol(params->KernelArgs[++i], NULL, 0);
+
+			if (tmp < 0) {
+				printf("ERROR: blocking parameter must be >= 0.\n");
+				exit(1);
+			}
+
+			blk[2] = tmp;
+		}
+		else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) {
+			ParameterUsage();
+			exit(1);
+		}
+		else {
+			printf("ERROR: unknown kernel parameter.\n");
+			ParameterUsage();
+			exit(1);
+		}
+	}
+
+	#undef ARG_IS
+	#undef NEXT_ARG_PRESENT
+
+	return;
+}
+
+
+void FNAME(D3Q19AaInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
+{
+	KernelDataAa * kda = NULL;
+	MemAlloc((void **)&kda, sizeof(KernelDataAa));
+
+	kda->Blk[0] = 0; kda->Blk[1] = 0; kda->Blk[2] = 0;
+	kda->Iteration = -1;
+
+	KernelData * kd = &kda->kd;
+	*kernelData = kd;
+
+	kd->nObstIndices = ld->nObst;
+
+	// Ajust the dimensions according to padding, if used.
+	kd->Dims[0] = ld->Dims[0];
+	kd->Dims[1] = ld->Dims[1];
+	kd->Dims[2] = ld->Dims[2];
+
+
+	int * lDims = ld->Dims;
+	int * gDims = kd->GlobalDims;
+
+	gDims[0] = lDims[0] + 2;
+	gDims[1] = lDims[1] + 2;
+	gDims[2] = lDims[2] + 2;
+
+	kd->Offsets[0] = 1;
+	kd->Offsets[1] = 1;
+	kd->Offsets[2] = 1;
+
+	int lX = lDims[0];
+	int lY = lDims[1];
+	int lZ = lDims[2];
+
+	int gX = gDims[0];
+	int gY = gDims[1];
+	int gZ = gDims[2];
+
+	int oX = kd->Offsets[0];
+	int oY = kd->Offsets[1];
+	int oZ = kd->Offsets[2];
+
+	int blk[3] = { 0 };
+
+	int nCells = gX * gY * gZ;
+
+	PdfT * pdfs[2] = { NULL, NULL };
+
+	ParseParameters(params, blk);
+
+	if (blk[0] == 0) blk[0] = gX;
+	if (blk[1] == 0) blk[1] = gY;
+	if (blk[2] == 0) blk[2] = gZ;
+
+	printf("# blocking x: %3d y: %3d z: %3d\n", blk[0], blk[1], blk[2]);
+
+
+	kda->Blk[0] = blk[0]; kda->Blk[1] = blk[1]; kda->Blk[2] = blk[2];
+
+
+	printf("# allocating data for %d LB nodes with padding (%lu bytes = %f MiB for the single lattice)\n",
+	       nCells,
+		   sizeof(PdfT) * nCells * N_D3Q19,
+	       sizeof(PdfT) * nCells * N_D3Q19 / 1024.0 / 1024.0);
+
+#define PAGE_4K		4096
+
+	MemAllocAligned((void **)&pdfs[0], sizeof(PdfT) * nCells * N_D3Q19, PAGE_4K);
+
+	kd->Pdfs[0] = pdfs[0];
+	kd->Pdfs[1] = NULL;
+
+
+	// Initialize PDFs with some (arbitrary) data for correct NUMA placement.
+	// This depends on the chosen data layout.
+	// The structure of the loop should resemble the same "execution layout"
+	// as in the kernel!
+
+	int nThreads;
+#ifdef _OPENMP
+	nThreads = omp_get_max_threads();
+#endif
+
+#ifdef _OPENMP
+	#pragma omp parallel for \
+				shared(gDims, pdfs, \
+				oX, oY, oZ, lX, lY, lZ, blk, nThreads, ld)
+#endif
+	for (int i = 0; i < nThreads; ++i) {
+
+		int threadStartX = lX / nThreads * i;
+		int threadEndX   = lX / nThreads * (i + 1);
+
+		if (lX % nThreads > 0) {
+			if (lX % nThreads > i) {
+				threadStartX += i;
+				threadEndX   += i + 1;
+			}
+			else {
+				threadStartX += lX % nThreads;
+				threadEndX   += lX % nThreads;
+			}
+		}
+
+		for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) {
+		for (int bY = oY; bY < lY + oY; bY += blk[1]) {
+		for (int bZ = oZ; bZ < lZ + oZ; bZ += blk[2]) {
+
+			int eX = MIN(bX + blk[0], threadEndX + oX);
+			int eY = MIN(bY + blk[1], lY + oY);
+			int eZ = MIN(bZ + blk[2], lZ + oZ);
+
+			// printf("%d: %d-%d  %d-%d  %d-%d  %d - %d\n", omp_get_thread_num(), bZ, eZ, bY, eY, bX, eX, threadStartX, threadEndX);
+
+			for (int x = bX; x < eX; ++x) {
+			for (int y = bY; y < eY; ++y) {
+			for (int z = bZ; z < eZ; ++z) {
+
+				for (int d = 0; d < N_D3Q19; ++d) {
+					pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = -1.0;
+				}
+
+			} } } // x, y, z
+		} } } // bx, by, bz
+	}
+
+
+	// Initialize all PDFs to some standard value.
+	for (int x = oX; x < lX + oX; ++x) {
+	for (int y = oY; y < lY + oY; ++y) {
+	for (int z = oZ; z < lZ + oZ; ++z) {
+		for (int d = 0; d < N_D3Q19; ++d) {
+			pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = 0.0;
+		}
+	} } } // x, y, z
+
+
+	// Count how many *PDFs* need bounce back treatment.
+
+	uint64_t nPdfs = ((uint64_t)19) * gX * gY * gZ;
+
+	if (nPdfs > ((2LU << 31) - 1)) {
+		printf("ERROR: number of PDFs exceed 2^31.\n");
+		exit(1);
+	}
+
+	// Compiler bug? Incorrect computation of nBounceBackPdfs when using icc 15.0.2.
+	// Works when declaring nBounceBackPdfs as int64_t or using volatile.
+	volatile int nBounceBackPdfs = 0;
+	// int64_t nBounceBackPdfs = 0;
+	int nx, ny, nz, px, py, pz;
+
+
+	for (int x = 0; x < lX; ++x) {
+		for (int y = 0; y < lY; ++y) {
+			for (int z = 0; z < lZ; ++z) {
+
+				if (ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) {
+					for (int d = 0; d < N_D3Q19; ++d) {
+						nx = x - D3Q19_X[d];
+						ny = y - D3Q19_Y[d];
+						nz = z - D3Q19_Z[d];
+
+						// Check if neighbor is inside the lattice.
+						// if(nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) {
+						// 	continue;
+						// }
+						if ((nx < 0 || nx >= lX) && ld->PeriodicX) {
+							++nBounceBackPdfs; // Compiler bug --> see above
+						}
+						else if ((ny < 0 || ny >= lY) && ld->PeriodicY) {
+							++nBounceBackPdfs; // Compiler bug --> see above
+						}
+						else if ((nz < 0 || nz >= lZ) && ld->PeriodicZ) {
+							++nBounceBackPdfs; // Compiler bug --> see above
+						}
+						else if (nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) {
+							continue;
+						}
+						else if (ld->Lattice[L_INDEX_4(lDims, nx, ny, nz)] == LAT_CELL_OBSTACLE) {
+							++nBounceBackPdfs; // Compiler bug --> see above
+						}
+					}
+				}
+			}
+		}
+	}
+
+	printf("# allocating %d indices for bounce back pdfs (%s for source and destination array)\n", nBounceBackPdfs, ByteToHuman(sizeof(int) * nBounceBackPdfs * 2));
+
+	MemAlloc((void **) & (kd->BounceBackPdfsSrc), sizeof(int) * nBounceBackPdfs + 100);
+	MemAlloc((void **) & (kd->BounceBackPdfsDst), sizeof(int) * nBounceBackPdfs + 100);
+
+	kd->nBounceBackPdfs = nBounceBackPdfs;
+	nBounceBackPdfs = 0;
+
+	int srcIndex;
+	int dstIndex;
+
+	// TODO: currently this is not NUMA-aware
+    //       - maybe use the same blocking as for lattice initialization?
+	//       - do place the bounce back index vector parallel?
+
+	for (int x = 0; x < lX; ++x) {
+		for (int y = 0; y < lY; ++y) {
+			for (int z = 0; z < lZ; ++z) {
+
+				if (ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) {
+					for (int d = 0; d < N_D3Q19; ++d) {
+						nx = x + D3Q19_X[d];
+						ny = y + D3Q19_Y[d];
+						nz = z + D3Q19_Z[d];
+
+						if ( 	((nx < 0 || nx >= lX) && ld->PeriodicX) ||
+								((ny < 0 || ny >= lY) && ld->PeriodicY) ||
+								((nz < 0 || nz >= lZ) && ld->PeriodicZ)
+						){
+							// For periodicity:
+
+							// We assume we have finished odd time step (accessing neighbor PDFs) and are
+							// before executing the even time step (accessing local PDFs only).
+
+							// Assuming we are at the most east position of the lattice. Through the odd
+							// time step propagation has put a PDF in the east slot of the ghost cell east
+							// of us, i.e. nx, ny, nz.  We copy it to the east slot of the most west node.
+
+							// In case of transition from even to odd time step , src and dest must be
+							// exchanged.
+
+
+							// x periodic
+							if (nx < 0) {
+								px = lX - 1;
+							}
+							else if (nx >= lX) {
+								px = 0;
+							} else {
+								px = nx;
+							}
+
+							// y periodic
+							if (ny < 0) {
+								py = lY - 1;
+							}
+							else if (ny >= lY) {
+								py = 0;
+							} else {
+								py = ny;
+							}
+
+							// z periodic
+							if (nz < 0) {
+								pz = lZ - 1;
+							}
+							else if (nz >= lZ) {
+								pz = 0;
+							} else {
+								pz = nz;
+							}
+
+							if (ld->Lattice[L_INDEX_4(lDims, px, py, pz)] == LAT_CELL_OBSTACLE) {
+								// See description of bounce back handling below.
+								srcIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, d);
+								dstIndex = P_INDEX_5(gDims,  x + oX,  y + oY,  z + oZ, D3Q19_INV[d]);
+							}
+							else {
+
+								srcIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, d);
+								// Put it on the other side back into the domain.
+								dstIndex = P_INDEX_5(gDims, px + oX, py + oY, pz + oZ, d);
+
+								VerifyMsg(nBounceBackPdfs < kd->nBounceBackPdfs, "nBBPdfs %d < kd->nBBPdfs %d  xyz: %d %d %d d: %d\n", nBounceBackPdfs, kd->nBounceBackPdfs, x, y, z, d);
+
+							}
+
+							kd->BounceBackPdfsSrc[nBounceBackPdfs] = srcIndex;
+							kd->BounceBackPdfsDst[nBounceBackPdfs] = dstIndex;
+
+							++nBounceBackPdfs;
+
+						}
+						else if (nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) {
+							continue;
+						}
+						else if (ld->Lattice[L_INDEX_4(lDims, nx, ny, nz)] == LAT_CELL_OBSTACLE) {
+							// Depending on the time step we are in we have to exchange src and dst index.
+
+							// We build the list for the case, when we have finished odd time step
+							// (accessing neighbor PDFs) and before we start with the even time step
+							// (accessing local PDFs only).
+
+							// Assume our neighbor east of us, i.e. nx, ny, nz, is an obstacle cell.
+							// Then we have to move the east PDF from the obstacle to our west position,
+							// i.e. the inverse of east.
+
+							// In case of transition from even to odd time step src and dest just
+							// have to be exchanged.
+
+							srcIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, d);
+							dstIndex = P_INDEX_5(gDims,  x + oX,  y + oY,  z + oZ, D3Q19_INV[d]);
+
+							VerifyMsg(nBounceBackPdfs < kd->nBounceBackPdfs, "nBBPdfs %d < kd->nBBPdfs %d  xyz: %d %d %d d: %d\n", nBounceBackPdfs, kd->nBounceBackPdfs, x, y, z, d);
+
+							kd->BounceBackPdfsSrc[nBounceBackPdfs] = srcIndex;
+							kd->BounceBackPdfsDst[nBounceBackPdfs] = dstIndex;
+
+							++nBounceBackPdfs;
+						}
+					}
+				}
+			}
+		}
+	}
+
+
+	// Fill remaining KernelData structures
+	kd->GetNode = FNAME(GetNode);
+	kd->SetNode = FNAME(SetNode);
+
+	kd->BoundaryConditionsGetPdf = FNAME(BcGetPdf);
+	kd->BoundaryConditionsSetPdf = FNAME(BcSetPdf);
+
+	kd->Kernel = FNAME(D3Q19AaKernel);
+
+	kd->DstPdfs = NULL;
+	kd->PdfsActive = kd->Pdfs[0];
+
+	return;
+}
+
+void FNAME(D3Q19AaDeinit)(LatticeDesc * ld, KernelData ** kernelData)
+{
+	MemFree((void **) & ((*kernelData)->Pdfs[0]));
+	// MemFree((void **) & ((*kernelData)->Pdfs[1]));
+
+	MemFree((void **) & ((*kernelData)->BounceBackPdfsSrc));
+	MemFree((void **) & ((*kernelData)->BounceBackPdfsDst));
+
+	MemFree((void **)kernelData);
+
+	return;
+}
+
diff --git a/src/BenchKernelD3Q19AaCommon.h b/src/BenchKernelD3Q19AaCommon.h
new file mode 100644
index 0000000..151e5a0
--- /dev/null
+++ b/src/BenchKernelD3Q19AaCommon.h
@@ -0,0 +1,89 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#ifndef __BENCH_KERNEL_D3Q19_AA_COMMON_H__
+#define __BENCH_KERNEL_D3Q19_AA_COMMON_H__
+
+#include "Kernel.h"
+
+typedef struct KernelDataAa_
+{
+	KernelData kd;
+	int Blk[3];			// Blocking in X, Y, and Z direction, value of 0 disables blocking.
+	int Iteration;		// Current iteration number.
+} KernelDataAa;
+
+// Macro for casting KernelData * to KernelDataAa *.
+#define KDA(_x_)	((KernelDataAa *)(_x_))
+
+// Build a function name extended by the propagation model name and the data layout.
+// FNANEM(test) will be expanded to test_PushSoA if DATA_LAYOUT_NAME is defined
+// as SoA and PROP_MODEL is defined as Push.
+#define FNAME(functionName)	JOIN(JOIN(functionName,_),JOIN(PROP_MODEL_NAME,DATA_LAYOUT_NAME))
+
+#ifndef DATA_LAYOUT_NAME
+	#error DATA_LAYOUT_NAME must be defined
+#endif
+
+#ifndef PROP_MODEL_NAME
+	#error PROP_MODEL_NAME must be defined
+#endif
+
+// -----------------------------------------------------------------------
+// Index function for accesssing PDF array for different data layouts.
+
+#define P_INDEX_5		FNAME(PINDEX5)
+
+static inline int FNAME(PINDEX5)(int dims[3], int x, int y, int z, int d)
+{
+	Assert(dims[0] > 0);
+	Assert(dims[1] > 0);
+	Assert(dims[2] > 0);
+
+	Assert(x >= 0);
+	Assert(x < dims[0]);
+	Assert(y >= 0);
+	Assert(y < dims[1]);
+	Assert(z >= 0);
+	Assert(z < dims[2]);
+	Assert(d >= 0);
+#ifdef D3Q19
+	Assert(d < N_D3Q19);
+#else
+#error Not implemented for this discretization.
+#endif
+
+#ifdef DATA_LAYOUT_SOA
+	return d * dims[0] * dims[1] * dims[2] + x * dims[1] * dims[2] + y * dims[2] + z;
+#elif DATA_LAYOUT_AOS
+	return x * dims[1] * dims[2] * N_D3Q19 + y * dims[2] * N_D3Q19 + z * N_D3Q19 + d;
+#else
+#error P_INDEX_5 function no implemented for chosen data layout.
+#endif
+}
+
+#endif // __BENCH_KERNEL_D3Q19_AA_COMMON_H__
+
diff --git a/src/BenchKernelD3Q19AaVec.c b/src/BenchKernelD3Q19AaVec.c
new file mode 100644
index 0000000..2642c1c
--- /dev/null
+++ b/src/BenchKernelD3Q19AaVec.c
@@ -0,0 +1,610 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#include "BenchKernelD3Q19AaVecCommon.h"
+
+#include "Memory.h"
+#include "Vtk.h"
+#include "LikwidIf.h"
+#include "Vector.h"
+#include "Vector.h"
+
+#include <inttypes.h>
+#include <math.h>
+
+#ifdef _OPENMP
+	#include <omp.h>
+#endif
+
+static void KernelEven(LatticeDesc * ld, KernelData * kd, CaseData * cd);
+static void KernelOdd( LatticeDesc * ld, KernelData * kd, CaseData * cd);
+
+#if 0 // {{{
+void DumpPdfs(LatticeDesc * ld, KernelData * kd, int zStart, int zStop, int iter, const char * prefix)
+{
+	return;
+
+	int * lDims = ld->Dims;
+	int * gDims = kd->GlobalDims;
+
+	int nX = gDims[0];
+	int nY = gDims[1];
+	int nZ = gDims[2];
+
+	PdfT pdfs[N_D3Q19];
+
+	int localZStart = zStart;
+	int localZStop  = zStop;
+
+	if (localZStart == -1) localZStart = 0;
+	if (localZStop  == -1) localZStop  = gDims[2] - 1;
+
+	printf("D iter: %d\n", iter);
+
+	for (int dir = 0; dir < 19; ++dir) {
+		for (int z = localZStop; z >= localZStart; --z) {
+			printf("D [%2d][%2d][%s] plane % 2d\n", iter, dir, prefix, z);
+
+			for(int y = 0; y < nY; ++y) {
+				printf("D [%2d][%2d][%s] %2d  ", iter, dir, prefix, y);
+
+				for(int x = 0; x < nX; ++x) {
+
+					if (1) { // ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) {
+
+						#define I(x, y, z, dir)	P_INDEX_5(gDims, (x), (y), (z), (dir))
+						pdfs[dir] = kd->PdfsActive[I(x, y, z, dir)];
+						#undef I
+//						kd->GetNode(kd, x, y, z, pdfs);
+					}
+					else {
+						pdfs[dir] = -1.0;
+					}
+
+					printf("%.16e ", pdfs[dir]);
+				}
+
+				printf("\n");
+			}
+		}
+	}
+}
+#endif  // }}}
+
+void FNAME(D3Q19AaVecKernel)(LatticeDesc * ld, KernelData * kd, CaseData * cd)
+{
+	Assert(ld != NULL);
+	Assert(kd != NULL);
+	Assert(cd != NULL);
+
+	Assert(cd->Omega > 0.0);
+	Assert(cd->Omega < 2.0);
+
+	KernelDataAa * kda = KDA(kd);
+
+	PdfT * src = kd->PdfsActive;
+
+	int maxIterations = cd->MaxIterations;
+
+	#ifdef VTK_OUTPUT
+		if (cd->VtkOutput) {
+			kd->PdfsActive = src;
+			VtkWrite(ld, kd, cd, -1);
+		}
+	#endif
+
+	#ifdef STATISTICS
+		kd->PdfsActive = src;
+		KernelStatistics(kd, ld, cd, 0);
+	#endif
+
+	Assert((maxIterations % 2) == 0);
+
+	for (int iter = 0; iter < maxIterations; iter += 2) {
+
+		// --------------------------------------------------------------------
+		// even time step
+		// --------------------------------------------------------------------
+
+		X_LIKWID_START("aa-vec-even");
+
+		#pragma omp parallel
+		{
+			KernelEven(ld, kd, cd);
+		}
+
+		X_LIKWID_STOP("aa-vec-even");
+
+		// Fixup bounce back PDFs.
+		#ifdef _OPENMP
+			#pragma omp parallel for default(none) \
+					shared(kd, src)
+		#endif
+		for (int i = 0; i < kd->nBounceBackPdfs; ++i) {
+			src[kd->BounceBackPdfsSrc[i]] = src[kd->BounceBackPdfsDst[i]];
+		}
+
+		// save current iteration
+		kda->Iteration = iter;
+
+		#ifdef VERIFICATION
+			kd->PdfsActive = src;
+			KernelAddBodyForce(kd, ld, cd);
+		#endif
+
+		#ifdef VTK_OUTPUT
+			if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) {
+				kd->PdfsActive = src;
+				VtkWrite(ld, kd, cd, iter);
+			}
+		#endif
+
+		#ifdef STATISTICS
+			kd->PdfsActive = src;
+			KernelStatistics(kd, ld, cd, iter);
+		#endif
+
+		// --------------------------------------------------------------------
+		// odd time step
+		// --------------------------------------------------------------------
+
+		X_LIKWID_START("aa-vec-odd");
+
+		#pragma omp parallel
+		{
+			KernelOdd(ld, kd, cd);
+		}
+
+		// Stop counters before bounce back. Else computing loop balance will
+		// be incorrect.
+
+		X_LIKWID_STOP("aa-vec-odd");
+
+		// Fixup bounce back PDFs.
+		#ifdef _OPENMP
+			#pragma omp parallel for default(none) \
+					shared(kd, src)
+		#endif
+		for (int i = 0; i < kd->nBounceBackPdfs; ++i) {
+			src[kd->BounceBackPdfsDst[i]] = src[kd->BounceBackPdfsSrc[i]];
+		}
+
+		// save current iteration
+		kda->Iteration = iter + 1;
+
+		#ifdef VERIFICATION
+			kd->PdfsActive = src;
+			KernelAddBodyForce(kd, ld, cd);
+		#endif
+
+		#ifdef VTK_OUTPUT
+			if (cd->VtkOutput && ((iter + 1) % cd->VtkModulus) == 0) {
+				kd->PdfsActive = src;
+				VtkWrite(ld, kd, cd, iter + 1);
+			}
+		#endif
+
+		#ifdef STATISTICS
+			kd->PdfsActive = src;
+			KernelStatistics(kd, ld, cd, iter + 1);
+		#endif // }}}
+
+
+	} // for (int iter = 0; ...
+
+	#ifdef VTK_OUTPUT
+
+	if (cd->VtkOutput) {
+		kd->PdfsActive = src;
+		VtkWrite(ld, kd, cd, maxIterations);
+	}
+
+	#endif
+
+	return;
+}
+
+static void KernelEven(LatticeDesc * ld, KernelData * kd, CaseData * cd) // {{{
+{
+	Assert(ld != NULL);
+	Assert(kd != NULL);
+	Assert(cd != NULL);
+
+	Assert(cd->Omega > 0.0);
+	Assert(cd->Omega < 2.0);
+
+	KernelDataAa * kda  = KDA(kd);
+
+	int nX = ld->Dims[0];
+	int nY = ld->Dims[1];
+	int nZ = ld->Dims[2];
+
+	int * gDims = kd->GlobalDims;
+
+	int oX = kd->Offsets[0];
+	int oY = kd->Offsets[1];
+	int oZ = kd->Offsets[2];
+
+	int blk[3];
+	blk[0] = kda->Blk[0];
+	blk[1] = kda->Blk[1];
+	blk[2] = kda->Blk[2];
+
+	PdfT omega = cd->Omega;
+	PdfT omegaEven = omega;
+
+	PdfT magicParam = 1.0 / 12.0;
+	PdfT omegaOdd = 1.0 / (0.5 + magicParam / (1.0 / omega - 0.5));
+
+	const PdfT w_0 = 1.0 /  3.0;
+	const PdfT w_1 = 1.0 / 18.0;
+	const PdfT w_2 = 1.0 / 36.0;
+
+	const PdfT w_1_x3 = w_1 * 3.0;	const PdfT w_1_nine_half = w_1 * 9.0 / 2.0;
+	const PdfT w_2_x3 = w_2 * 3.0;	const PdfT w_2_nine_half = w_2 * 9.0 / 2.0;
+
+
+	VPDFT VONE_HALF   = VSET(0.5);
+	VPDFT VTHREE_HALF = VSET(3.0 / 2.0);
+
+	VPDFT vw_1_indep, vw_2_indep;
+	VPDFT vw_0 = VSET(w_0);
+	VPDFT vw_1 = VSET(w_1);
+	VPDFT vw_2 = VSET(w_2);
+
+	VPDFT vw_1_x3 = VSET(w_1_x3);
+	VPDFT vw_2_x3 = VSET(w_2_x3);
+	VPDFT vw_1_nine_half = VSET(w_1_nine_half);
+	VPDFT vw_2_nine_half = VSET(w_2_nine_half);
+
+	VPDFT vui, vux, vuy, vuz, vdens;
+
+	VPDFT vevenPart, voddPart, vdir_indep_trm;
+
+	VPDFT vomegaEven = VSET(omegaEven);
+	VPDFT vomegaOdd  = VSET(omegaOdd);
+
+	// Declare pdf_N, pdf_E, pdf_S, pdf_W, ...
+	#define X(name, idx, idxinv, x, y, z)	VPDFT JOIN(vpdf_,name);
+		D3Q19_LIST
+	#undef X
+
+	PdfT * src = kd->Pdfs[0];
+
+	int nThreads = 1;
+	int threadId = 0;
+
+	#ifdef _OPENMP
+		nThreads = omp_get_max_threads();
+		threadId = omp_get_thread_num();
+	#endif
+
+	// TODO: Currently only a 1-D decomposition is applied. For achritectures
+	//       with a lot of cores we want at least 2-D.
+
+	int threadStartX = nX / nThreads * threadId;
+	int threadEndX   = nX / nThreads * (threadId + 1);
+
+	if (nX % nThreads > 0) {
+		if (nX % nThreads > threadId) {
+			threadStartX += threadId;
+			threadEndX   += threadId + 1;
+		}
+		else {
+			threadStartX += nX % nThreads;
+			threadEndX   += nX % nThreads;
+		}
+	}
+
+	AssertMsg((blk[2] % VSIZE == 0) || blk[2] >= nZ, "Blocking in z direction must be a multiple of VSIZE = %d or larger than z dimension.", VSIZE);
+
+	for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) {
+	for (int bY = oY; bY < nY + oY; bY += blk[1]) {
+	for (int bZ = oZ; bZ < nZ + oZ; bZ += blk[2]) {
+
+		int eX = MIN(bX + blk[0], threadEndX + oX);
+		int eY = MIN(bY + blk[1], nY + oY);
+		int eZ = MIN(bZ + blk[2], nZ + oZ);
+
+		for (int x = bX; x < eX; x += 1) {
+		for (int y = bY; y < eY; y += 1) {
+		for (int z = bZ; z < eZ; z += VSIZE) {
+
+			#define I(x, y, z, dir)	P_INDEX_5(gDims, (x), (y), (z), (dir))
+
+			// Load PDFs of local cell: pdf_N = src[I(x, y, z, D3Q19_N)]; ...
+			#define X(name, idx, idxinv, _x, _y, _z)	JOIN(vpdf_,name) = VLDU(&src[I(x, y, z, idx)]);
+				D3Q19_LIST
+			#undef X
+
+
+			vux = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_E,VADD(vpdf_NE,vpdf_SE)),VADD(vpdf_TE,vpdf_BE)),vpdf_W),vpdf_NW),vpdf_SW),vpdf_TW),vpdf_BW);
+			vuy = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_N,VADD(vpdf_NE,vpdf_NW)),VADD(vpdf_TN,vpdf_BN)),vpdf_S),vpdf_SE),vpdf_SW),vpdf_TS),vpdf_BS);
+			vuz = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_T,VADD(vpdf_TE,vpdf_TW)),VADD(vpdf_TN,vpdf_TS)),vpdf_B),vpdf_BE),vpdf_BW),vpdf_BN),vpdf_BS);
+
+			vdens = VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(vpdf_C,VADD(vpdf_N,vpdf_E)),VADD(vpdf_S,vpdf_W)),VADD(vpdf_NE,vpdf_SE)),
+					VADD(vpdf_SW,vpdf_NW)),VADD(vpdf_T,vpdf_TN)),VADD(vpdf_TE,vpdf_TS)),VADD(vpdf_TW,vpdf_B)),
+					VADD(vpdf_BN,vpdf_BE)),VADD(vpdf_BS,vpdf_BW));
+
+			vdir_indep_trm = VSUB(vdens,VMUL(VADD(VADD(VMUL(vux,vux),VMUL(vuy,vuy)),VMUL(vuz,vuz)),VTHREE_HALF));
+
+			VSTU(&src[I(x, y, z, D3Q19_C)],VSUB(vpdf_C,VMUL(vomegaEven,VSUB(vpdf_C,VMUL(vw_0,vdir_indep_trm)))));
+
+			vw_1_indep = VMUL(vw_1,vdir_indep_trm);
+
+			vui = vuy;
+			vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_N,vpdf_S)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep));
+			voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_N,vpdf_S)),VMUL(vui,vw_1_x3)));
+			VSTU(&src[I(x, y, z, D3Q19_S)],VSUB(VSUB(vpdf_N,vevenPart),voddPart));
+			VSTU(&src[I(x, y, z, D3Q19_N)],VADD(VSUB(vpdf_S,vevenPart),voddPart));
+
+			vui = vux;
+			vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_E,vpdf_W)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep));
+			voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_E,vpdf_W)),VMUL(vui,vw_1_x3)));
+			VSTU(&src[I(x, y, z, D3Q19_W)],VSUB(VSUB(vpdf_E,vevenPart),voddPart));
+			VSTU(&src[I(x, y, z, D3Q19_E)],VADD(VSUB(vpdf_W,vevenPart),voddPart));
+
+			vui = vuz;
+			vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_T,vpdf_B)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep));
+			voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_T,vpdf_B)),VMUL(vui,vw_1_x3)));
+			VSTU(&src[I(x, y, z, D3Q19_B)],VSUB(VSUB(vpdf_T,vevenPart),voddPart));
+			VSTU(&src[I(x, y, z, D3Q19_T)],VADD(VSUB(vpdf_B,vevenPart),voddPart));
+
+			vw_2_indep = VMUL(vw_2,vdir_indep_trm);
+
+			vui = VSUB(vuy,vux);
+			vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_NW,vpdf_SE)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
+			voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_NW,vpdf_SE)),VMUL(vui,vw_2_x3)));
+			VSTU(&src[I(x, y, z, D3Q19_SE)],VSUB(VSUB(vpdf_NW,vevenPart),voddPart));
+			VSTU(&src[I(x, y, z, D3Q19_NW)],VADD(VSUB(vpdf_SE,vevenPart),voddPart));
+
+			vui = VADD(vux,vuy);
+			vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_NE,vpdf_SW)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
+			voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_NE,vpdf_SW)),VMUL(vui,vw_2_x3)));
+			VSTU(&src[I(x, y, z, D3Q19_SW)],VSUB(VSUB(vpdf_NE,vevenPart),voddPart));
+			VSTU(&src[I(x, y, z, D3Q19_NE)],VADD(VSUB(vpdf_SW,vevenPart),voddPart));
+
+			vui = VSUB(vuz,vux);
+			vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TW,vpdf_BE)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
+			voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TW,vpdf_BE)),VMUL(vui,vw_2_x3)));
+			VSTU(&src[I(x, y, z, D3Q19_BE)],VSUB(VSUB(vpdf_TW,vevenPart),voddPart));
+			VSTU(&src[I(x, y, z, D3Q19_TW)],VADD(VSUB(vpdf_BE,vevenPart),voddPart));
+
+			vui = VADD(vux,vuz);
+			vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TE,vpdf_BW)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
+			voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TE,vpdf_BW)),VMUL(vui,vw_2_x3)));
+			VSTU(&src[I(x, y, z, D3Q19_BW)],VSUB(VSUB(vpdf_TE,vevenPart),voddPart));
+			VSTU(&src[I(x, y, z, D3Q19_TE)],VADD(VSUB(vpdf_BW,vevenPart),voddPart));
+
+			vui = VSUB(vuz,vuy);
+			vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TS,vpdf_BN)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
+			voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TS,vpdf_BN)),VMUL(vui,vw_2_x3)));
+			VSTU(&src[I(x, y, z, D3Q19_BN)],VSUB(VSUB(vpdf_TS,vevenPart),voddPart));
+			VSTU(&src[I(x, y, z, D3Q19_TS)],VADD(VSUB(vpdf_BN,vevenPart),voddPart));
+
+			vui = VADD(vuy,vuz);
+			vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TN,vpdf_BS)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
+			voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TN,vpdf_BS)),VMUL(vui,vw_2_x3)));
+			VSTU(&src[I(x, y, z, D3Q19_BS)],VSUB(VSUB(vpdf_TN,vevenPart),voddPart));
+			VSTU(&src[I(x, y, z, D3Q19_TN)],VADD(VSUB(vpdf_BS,vevenPart),voddPart));
+
+			#undef I
+		} } } // x, y, z
+	} } } // blocked x, y, z
+
+
+
+	return;
+} // }}}
+
+
+static void KernelOdd(LatticeDesc * ld, KernelData * kd, CaseData * cd)  // {{{
+{
+	Assert(ld != NULL);
+	Assert(kd != NULL);
+	Assert(cd != NULL);
+
+	Assert(cd->Omega > 0.0);
+	Assert(cd->Omega < 2.0);
+
+	KernelDataAa * kda  = KDA(kd);
+
+	int nX = ld->Dims[0];
+	int nY = ld->Dims[1];
+	int nZ = ld->Dims[2];
+
+	int * gDims = kd->GlobalDims;
+
+	int oX = kd->Offsets[0];
+	int oY = kd->Offsets[1];
+	int oZ = kd->Offsets[2];
+
+	int blk[3];
+	blk[0] = kda->Blk[0];
+	blk[1] = kda->Blk[1];
+	blk[2] = kda->Blk[2];
+
+	PdfT omega = cd->Omega;
+	PdfT omegaEven = omega;
+
+	PdfT magicParam = 1.0 / 12.0;
+	PdfT omegaOdd = 1.0 / (0.5 + magicParam / (1.0 / omega - 0.5));
+
+	const PdfT w_0 = 1.0 /  3.0;
+	const PdfT w_1 = 1.0 / 18.0;
+	const PdfT w_2 = 1.0 / 36.0;
+
+	const PdfT w_1_x3 = w_1 * 3.0;	const PdfT w_1_nine_half = w_1 * 9.0 / 2.0;
+	const PdfT w_2_x3 = w_2 * 3.0;	const PdfT w_2_nine_half = w_2 * 9.0 / 2.0;
+
+	VPDFT VONE_HALF   = VSET(0.5);
+	VPDFT VTHREE_HALF = VSET(3.0 / 2.0);
+
+	VPDFT vw_1_indep, vw_2_indep;
+	VPDFT vw_0 = VSET(w_0);
+	VPDFT vw_1 = VSET(w_1);
+	VPDFT vw_2 = VSET(w_2);
+
+	VPDFT vw_1_x3 = VSET(w_1_x3);
+	VPDFT vw_2_x3 = VSET(w_2_x3);
+	VPDFT vw_1_nine_half = VSET(w_1_nine_half);
+	VPDFT vw_2_nine_half = VSET(w_2_nine_half);
+
+	VPDFT vui, vux, vuy, vuz, vdens;
+
+	VPDFT vevenPart, voddPart, vdir_indep_trm;
+
+	VPDFT vomegaEven = VSET(omegaEven);
+	VPDFT vomegaOdd  = VSET(omegaOdd);
+
+	// Declare pdf_N, pdf_E, pdf_S, pdf_W, ...
+	#define X(name, idx, idxinv, x, y, z) 		VPDFT JOIN(vpdf_,name);
+		D3Q19_LIST
+	#undef X
+
+	PdfT * src = kd->Pdfs[0];
+
+	int threadId = 0;
+	int nThreads = 1;
+
+	#ifdef _OPENMP
+		threadId = omp_get_thread_num();
+		nThreads = omp_get_max_threads();
+	#endif
+
+	// TODO: Currently only a 1-D decomposition is applied. For achritectures
+	//       with a lot of cores we want at least 2-D.
+	int threadStartX = nX / nThreads * threadId;
+	int threadEndX   = nX / nThreads * (threadId + 1);
+
+	if (nX % nThreads > 0) {
+		if (nX % nThreads > threadId) {
+			threadStartX += threadId;
+			threadEndX   += threadId + 1;
+		}
+		else {
+			threadStartX += nX % nThreads;
+			threadEndX   += nX % nThreads;
+		}
+	}
+
+	AssertMsg((blk[2] % VSIZE == 0) || blk[2] >= nZ, "Blocking in z direction must be a multiple of VSIZE = %d or larger than z dimension.", VSIZE);
+
+	for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) {
+	for (int bY = oY; bY < nY + oY; bY += blk[1]) {
+	for (int bZ = oZ; bZ < nZ + oZ; bZ += blk[2]) {
+
+		int eX = MIN(bX + blk[0], threadEndX + oX);
+		int eY = MIN(bY + blk[1], nY + oY);
+		int eZ = MIN(bZ + blk[2], nZ + oZ);
+
+		for (int x = bX; x < eX; ++x) {
+		for (int y = bY; y < eY; ++y) {
+		for (int z = bZ; z < eZ; z += VSIZE) {
+
+			#define I(x, y, z, dir)	P_INDEX_5(gDims, (x), (y), (z), (dir))
+
+
+			#define X(name, idx, idxinv, _x, _y, _z)	JOIN(vpdf_,name) = VLDU(&src[I(x - _x, y - _y, z - _z, idxinv)]);
+			D3Q19_LIST
+			#undef X
+
+			vux = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_E,VADD(vpdf_NE,vpdf_SE)),VADD(vpdf_TE,vpdf_BE)),vpdf_W),vpdf_NW),vpdf_SW),vpdf_TW),vpdf_BW);
+			vuy = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_N,VADD(vpdf_NE,vpdf_NW)),VADD(vpdf_TN,vpdf_BN)),vpdf_S),vpdf_SE),vpdf_SW),vpdf_TS),vpdf_BS);
+			vuz = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_T,VADD(vpdf_TE,vpdf_TW)),VADD(vpdf_TN,vpdf_TS)),vpdf_B),vpdf_BE),vpdf_BW),vpdf_BN),vpdf_BS);
+
+			vdens = VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(vpdf_C,VADD(vpdf_N,vpdf_E)),VADD(vpdf_S,vpdf_W)),VADD(vpdf_NE,vpdf_SE)),
+						 VADD(vpdf_SW,vpdf_NW)),VADD(vpdf_T,vpdf_TN)),VADD(vpdf_TE,vpdf_TS)),VADD(vpdf_TW,vpdf_B)),VADD(vpdf_BN,vpdf_BE)),VADD(vpdf_BS,vpdf_BW));
+
+			vdir_indep_trm = VSUB(vdens,VMUL(VADD(VADD(VMUL(vux,vux),VMUL(vuy,vuy)),VMUL(vuz,vuz)),VTHREE_HALF));
+
+			VSTU(&src[I(x, y, z, D3Q19_C)],VSUB(vpdf_C,VMUL(vomegaEven,VSUB(vpdf_C,VMUL(vw_0,vdir_indep_trm)))));
+
+			vw_1_indep = VMUL(vw_1,vdir_indep_trm);
+
+			vui = vuy;
+			vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_N,vpdf_S)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep));
+			voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_N,vpdf_S)),VMUL(vui,vw_1_x3)));
+			VSTU(&src[I(x, y + 1,  z, D3Q19_N)], VSUB(VSUB(vpdf_N,vevenPart),voddPart));
+			VSTU(&src[I(x, y - 1,  z, D3Q19_S)], VADD(VSUB(vpdf_S,vevenPart),voddPart));
+
+			vui = vux;
+			vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_E,vpdf_W)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep));
+			voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_E,vpdf_W)),VMUL(vui,vw_1_x3)));
+			VSTU(&src[I(x + 1, y, z, D3Q19_E)], VSUB(VSUB(vpdf_E,vevenPart),voddPart));
+			VSTU(&src[I(x - 1, y, z, D3Q19_W)], VADD(VSUB(vpdf_W,vevenPart),voddPart));
+
+			vui = vuz;
+			vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_T,vpdf_B)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep));
+			voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_T,vpdf_B)),VMUL(vui,vw_1_x3)));
+			VSTU(&src[I(x, y, z + 1, D3Q19_T)], VSUB(VSUB(vpdf_T,vevenPart),voddPart));
+			VSTU(&src[I(x, y, z - 1, D3Q19_B)], VADD(VSUB(vpdf_B,vevenPart),voddPart));
+
+			vw_2_indep = VMUL(vw_2,vdir_indep_trm);
+
+			vui = VSUB(vuy,vux);
+			vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_NW,vpdf_SE)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
+			voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_NW,vpdf_SE)),VMUL(vui,vw_2_x3)));
+			VSTU(&src[I(x - 1, y + 1, z, D3Q19_NW)], VSUB(VSUB(vpdf_NW,vevenPart),voddPart));
+			VSTU(&src[I(x + 1, y - 1, z, D3Q19_SE)], VADD(VSUB(vpdf_SE,vevenPart),voddPart));
+
+			vui = VADD(vux,vuy);
+			vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_NE,vpdf_SW)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
+			voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_NE,vpdf_SW)),VMUL(vui,vw_2_x3)));
+			VSTU(&src[I(x + 1, y + 1, z, D3Q19_NE)], VSUB(VSUB(vpdf_NE,vevenPart),voddPart));
+			VSTU(&src[I(x - 1, y - 1, z, D3Q19_SW)], VADD(VSUB(vpdf_SW,vevenPart),voddPart));
+
+			vui = VSUB(vuz,vux);
+			vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TW,vpdf_BE)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
+			voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TW,vpdf_BE)),VMUL(vui,vw_2_x3)));
+			VSTU(&src[I(x - 1, y, z + 1, D3Q19_TW)], VSUB(VSUB(vpdf_TW,vevenPart),voddPart));
+			VSTU(&src[I(x + 1, y, z - 1, D3Q19_BE)], VADD(VSUB(vpdf_BE,vevenPart),voddPart));
+
+			vui = VADD(vux,vuz);
+			vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TE,vpdf_BW)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
+			voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TE,vpdf_BW)),VMUL(vui,vw_2_x3)));
+			VSTU(&src[I(x + 1, y, z + 1, D3Q19_TE)], VSUB(VSUB(vpdf_TE,vevenPart),voddPart));
+			VSTU(&src[I(x - 1, y, z - 1, D3Q19_BW)], VADD(VSUB(vpdf_BW,vevenPart),voddPart));
+
+			vui = VSUB(vuz,vuy);
+			vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TS,vpdf_BN)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
+			voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TS,vpdf_BN)),VMUL(vui,vw_2_x3)));
+			VSTU(&src[I(x, y - 1, z + 1, D3Q19_TS)], VSUB(VSUB(vpdf_TS,vevenPart),voddPart));
+			VSTU(&src[I(x, y + 1, z - 1, D3Q19_BN)], VADD(VSUB(vpdf_BN,vevenPart),voddPart));
+
+			vui = VADD(vuy,vuz);
+			vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TN,vpdf_BS)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
+			voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TN,vpdf_BS)),VMUL(vui,vw_2_x3)));
+			VSTU(&src[I(x, y + 1, z + 1, D3Q19_TN)], VSUB(VSUB(vpdf_TN,vevenPart),voddPart));
+			VSTU(&src[I(x, y - 1, z - 1, D3Q19_BS)], VADD(VSUB(vpdf_BS,vevenPart),voddPart));
+
+			#undef I
+		} } } // x, y, z
+	} } } // blocked x, y, z
+
+	return;
+
+}  // }}}
diff --git a/src/BenchKernelD3Q19AaVec.h b/src/BenchKernelD3Q19AaVec.h
new file mode 100644
index 0000000..a125b04
--- /dev/null
+++ b/src/BenchKernelD3Q19AaVec.h
@@ -0,0 +1,38 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#ifndef __BENCH_KERNEL_D3Q19_AA_VEC__
+#define __BENCH_KERNEL_D3Q19_AA_VEC__
+
+#include "Kernel.h"
+
+
+void D3Q19AaVecInit_AaSoA(LatticeDesc * ld, KernelData ** kernelData, Parameters * params);
+void D3Q19AaVecDeinit_AaSoA(LatticeDesc * ld, KernelData ** kernelData);
+
+
+
+#endif // __BENCH_KERNEL_D3Q19_AA_VEC__
diff --git a/src/BenchKernelD3Q19AaVecCommon.c b/src/BenchKernelD3Q19AaVecCommon.c
new file mode 100644
index 0000000..57a9fae
--- /dev/null
+++ b/src/BenchKernelD3Q19AaVecCommon.c
@@ -0,0 +1,656 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#include "BenchKernelD3Q19AaVecCommon.h"
+
+#include "Memory.h"
+#include "Vtk.h"
+#include "Vector.h"
+
+#include <inttypes.h>
+#include <math.h>
+
+#ifdef _OPENMP
+	#include <omp.h>
+#endif
+
+// Forward definition.
+void FNAME(D3Q19AaVecKernel)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd);
+
+
+static void FNAME(BcGetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT * pdf)
+{
+	Assert(kd != NULL);
+	Assert(kd->PdfsActive != NULL);
+	Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+	Assert(pdf != NULL);
+
+	Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
+	Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
+	Assert(dir >= 0); Assert(dir < N_D3Q19);
+
+	KernelDataAa * kda = KDA(kd);
+
+	int oX = kd->Offsets[0];
+	int oY = kd->Offsets[1];
+	int oZ = kd->Offsets[2];
+
+	if (kda->Iteration % 2 == 0) {
+		// Pdfs are stored inverse, local PDFs are located in remote nodes
+		int nx = x - D3Q19_X[dir];
+		int ny = y - D3Q19_Y[dir];
+		int nz = z - D3Q19_Z[dir];
+
+		#define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
+		*pdf = kd->PdfsActive[I(nx + oX, ny + oY, nz + oZ, D3Q19_INV[dir])];
+		#undef I
+	}
+	else {
+		int nx = x;
+		int ny = y;
+		int nz = z;
+
+		#define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
+		*pdf = kd->PdfsActive[I(nx + oX, ny + oY, nz + oZ, dir)];
+		#undef I
+	}
+
+
+	return;
+}
+
+static void FNAME(BcSetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT pdf)
+{
+	Assert(kd != NULL);
+	Assert(kd->PdfsActive != NULL);
+	Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+
+	Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
+	Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
+	Assert(dir >= 0); Assert(dir < N_D3Q19);
+
+	KernelDataAa * kda = KDA(kd);
+
+	int oX = kd->Offsets[0];
+	int oY = kd->Offsets[1];
+	int oZ = kd->Offsets[2];
+
+	if (kda->Iteration % 2 == 0) {
+		// Pdfs are stored inverse, local PDFs are located in remote nodes
+		int nx = x - D3Q19_X[dir];
+		int ny = y - D3Q19_Y[dir];
+		int nz = z - D3Q19_Z[dir];
+
+		#define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
+		pdf = kd->PdfsActive[I(nx + oX, ny + oY, nz + oZ, D3Q19_INV[dir])] = pdf;
+		#undef I
+	}
+	else {
+		int nx = x;
+		int ny = y;
+		int nz = z;
+
+		#define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
+		kd->PdfsActive[I(nx + oX, ny + oY, nz + oZ, dir)] = pdf;
+		#undef I
+	}
+
+	return;
+}
+
+
+static void FNAME(GetNode)(KernelData * kd, int x, int y, int z, PdfT * pdfs)
+{
+	Assert(kd != NULL);
+	Assert(kd->PdfsActive != NULL);
+	Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+	Assert(pdfs != NULL);
+
+	Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
+	Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
+
+	KernelDataAa * kda = KDA(kd);
+
+	int oX = kd->Offsets[0];
+	int oY = kd->Offsets[1];
+	int oZ = kd->Offsets[2];
+
+
+	if (kda->Iteration % 2 == 0) {
+		// Pdfs are stored inverse, local PDFs are located in remote nodes
+
+		#define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
+		#define X(name, idx, idxinv, _x, _y, _z)	pdfs[idx] = kd->PdfsActive[I(x + oX - _x, y + oY - _y, z + oZ - _z, D3Q19_INV[idx])];
+		D3Q19_LIST
+		#undef X
+		#undef I
+	}
+	else {
+		#define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
+		#define X(name, idx, idxinv, _x, _y, _z)	pdfs[idx] = kd->PdfsActive[I(x + oX, y + oY, z + oZ, idx)];
+		D3Q19_LIST
+		#undef X
+		#undef I
+
+	}
+
+#if 0		// DETECT NANs
+
+	for (int d = 0; d < 19; ++d) {
+		if (isnan(pdfs[d])) {
+			printf("%d %d %d %d nan! get node\n", x, y, z, d);
+
+			for (int d2 = 0; d2 < 19; ++d2) {
+				printf("%d: %e\n", d2, pdfs[d2]);
+			}
+
+			exit(1);
+		}
+	}
+
+#endif
+
+	return;
+}
+
+
+static void FNAME(SetNode)(KernelData * kd, int x, int y, int z, PdfT * pdfs)
+{
+	Assert(kd != NULL);
+	Assert(kd->PdfsActive != NULL);
+	Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+	Assert(pdfs != NULL);
+
+	Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
+	Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
+
+	KernelDataAa * kda = KDA(kd);
+
+	int oX = kd->Offsets[0];
+	int oY = kd->Offsets[1];
+	int oZ = kd->Offsets[2];
+
+	if (kda->Iteration % 2 == 0) {
+		// Pdfs are stored inverse, local PDFs are located in remote nodes
+
+		#define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
+		#define X(name, idx, idxinv, _x, _y, _z)	kd->PdfsActive[I(x + oX - _x, y + oY - _y, z + oZ - _z, D3Q19_INV[idx])] = pdfs[idx];
+		D3Q19_LIST
+		#undef X
+		#undef I
+	}
+	else {
+		#define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
+		#define X(name, idx, idxinv, _x, _y, _z)	kd->PdfsActive[I(x + oX, y + oY, z + oZ, idx)] = pdfs[idx];
+		D3Q19_LIST
+		#undef X
+		#undef I
+	}
+	return;
+}
+
+
+static void ParameterUsage()
+{
+	printf("Kernel parameters:\n");
+	printf("  [-blk <n>] [-blk-[xyz] <n>]\n");
+
+	return;
+}
+
+static void ParseParameters(Parameters * params, int * blk)
+{
+	Assert(blk != NULL);
+
+	blk[0] = 0; blk[1] = 0; blk[2] = 0;
+
+	#define ARG_IS(param)                   (!strcmp(params->KernelArgs[i], param))
+	#define NEXT_ARG_PRESENT() \
+		do { \
+			if (i + 1 >= params->nKernelArgs) { \
+				printf("ERROR: argument %s requires a parameter.\n", params->KernelArgs[i]); \
+				exit(1); \
+			} \
+		} while (0)
+
+
+	for (int i = 0; i < params->nKernelArgs; ++i) {
+		if (ARG_IS("-blk") || ARG_IS("--blk")) {
+			NEXT_ARG_PRESENT();
+
+			int tmp = strtol(params->KernelArgs[++i], NULL, 0);
+
+			if (tmp < 0) {
+				printf("ERROR: blocking parameter must be >= 0.\n");
+				exit(1);
+			}
+
+			blk[0] = blk[1] = blk[2] = tmp;
+		}
+		else if (ARG_IS("-blk-x") || ARG_IS("--blk-x")) {
+			NEXT_ARG_PRESENT();
+
+			int tmp = strtol(params->KernelArgs[++i], NULL, 0);
+
+			if (tmp < 0) {
+				printf("ERROR: blocking parameter must be >= 0.\n");
+				exit(1);
+			}
+
+			blk[0] = tmp;
+		}
+		else if (ARG_IS("-blk-y") || ARG_IS("--blk-y")) {
+			NEXT_ARG_PRESENT();
+
+			int tmp = strtol(params->KernelArgs[++i], NULL, 0);
+
+			if (tmp < 0) {
+				printf("ERROR: blocking parameter must be >= 0.\n");
+				exit(1);
+			}
+
+			blk[1] = tmp;
+		}
+		else if (ARG_IS("-blk-z") || ARG_IS("--blk-z")) {
+			NEXT_ARG_PRESENT();
+
+			int tmp = strtol(params->KernelArgs[++i], NULL, 0);
+
+			if (tmp < 0) {
+				printf("ERROR: blocking parameter must be >= 0.\n");
+				exit(1);
+			}
+
+			blk[2] = tmp;
+		}
+		else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) {
+			ParameterUsage();
+			exit(1);
+		}
+		else {
+			printf("ERROR: unknown kernel parameter.\n");
+			ParameterUsage();
+			exit(1);
+		}
+	}
+
+	#undef ARG_IS
+	#undef NEXT_ARG_PRESENT
+
+	return;
+}
+
+
+void FNAME(D3Q19AaVecInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
+{
+	KernelDataAa * kda = NULL;
+	MemAlloc((void **)&kda, sizeof(KernelDataAa));
+
+	kda->Blk[0] = 0; kda->Blk[1] = 0; kda->Blk[2] = 0;
+	kda->Iteration = -1;
+
+	KernelData * kd = &kda->kd;
+	*kernelData = kd;
+
+	kd->nObstIndices = ld->nObst;
+
+	// Ajust the dimensions according to padding, if used.
+	kd->Dims[0] = ld->Dims[0];
+	kd->Dims[1] = ld->Dims[1];
+	kd->Dims[2] = ld->Dims[2];
+
+
+	int * lDims = ld->Dims;
+	int * gDims = kd->GlobalDims;
+
+	Assert(VSIZE <= 4);
+
+	// TODO: only add enough ghost cells so we can compute everything vectorized.
+	gDims[0] = lDims[0] + 2;
+	gDims[1] = lDims[1] + 2;
+	// TODO: fix this for aa-vec2-soa
+	gDims[2] = lDims[2] + 4; // one ghost cell in front, one in the back, plus at most two at the back for VSIZE = 4
+
+	kd->Offsets[0] = 1;
+	kd->Offsets[1] = 1;
+	kd->Offsets[2] = 1;
+
+	int lX = lDims[0];
+	int lY = lDims[1];
+	int lZ = lDims[2];
+
+	int gX = gDims[0];
+	int gY = gDims[1];
+	int gZ = gDims[2];
+
+	int oX = kd->Offsets[0];
+	int oY = kd->Offsets[1];
+	int oZ = kd->Offsets[2];
+
+	int blk[3] = { 0 };
+
+	int nCells = gX * gY * gZ;
+
+	PdfT * pdfs[2] = { NULL, NULL };
+
+	ParseParameters(params, blk);
+
+	if (blk[2] % VSIZE != 0) {
+		blk[2] -= blk[2] % VSIZE;
+		printf("WARNING: blocking factor for z direction must be a multiple of VSIZE = %d, adjusting it to %d.\n", VSIZE, blk[2]);
+	}
+
+	if (blk[0] == 0) blk[0] = gX;
+	if (blk[1] == 0) blk[1] = gY;
+	if (blk[2] == 0) blk[2] = gZ;
+
+	printf("# blocking x: %3d y: %3d z: %3d\n", blk[0], blk[1], blk[2]);
+
+	kda->Blk[0] = blk[0]; kda->Blk[1] = blk[1]; kda->Blk[2] = blk[2];
+
+
+	printf("# allocating data for %d LB nodes with padding (%lu bytes = %f MiB for the single lattice)\n",
+	       nCells,
+		   sizeof(PdfT) * nCells * N_D3Q19,
+	       sizeof(PdfT) * nCells * N_D3Q19 / 1024.0 / 1024.0);
+
+#define PAGE_4K		4096
+
+	MemAllocAligned((void **)&pdfs[0], sizeof(PdfT) * nCells * N_D3Q19, PAGE_4K);
+
+	kd->Pdfs[0] = pdfs[0];
+	kd->Pdfs[1] = NULL;
+
+
+	// Initialize PDFs with some (arbitrary) data for correct NUMA placement.
+	// This depends on the chosen data layout.
+	// The structure of the loop should resemble the same "execution layout"
+	// as in the kernel!
+
+	int nThreads;
+#ifdef _OPENMP
+	nThreads = omp_get_max_threads();
+#endif
+
+#ifdef _OPENMP
+	#pragma omp parallel for \
+				shared(gDims, pdfs, \
+				oX, oY, oZ, lX, lY, lZ, blk, nThreads, ld)
+#endif
+	for (int i = 0; i < nThreads; ++i) {
+
+		int threadStartX = lX / nThreads * i;
+		int threadEndX   = lX / nThreads * (i + 1);
+
+		if (lX % nThreads > 0) {
+			if (lX % nThreads > i) {
+				threadStartX += i;
+				threadEndX   += i + 1;
+			}
+			else {
+				threadStartX += lX % nThreads;
+				threadEndX   += lX % nThreads;
+			}
+		}
+
+		for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) {
+		for (int bY = oY; bY < lY + oY; bY += blk[1]) {
+		for (int bZ = oZ; bZ < lZ + oZ; bZ += blk[2]) {
+
+			int eX = MIN(bX + blk[0], threadEndX + oX);
+			int eY = MIN(bY + blk[1], lY + oY);
+			int eZ = MIN(bZ + blk[2], lZ + oZ);
+
+			// printf("%d: %d-%d  %d-%d  %d-%d  %d - %d\n", omp_get_thread_num(), bZ, eZ, bY, eY, bX, eX, threadStartX, threadEndX);
+
+			for (int x = bX; x < eX; ++x) {
+			for (int y = bY; y < eY; ++y) {
+			for (int z = bZ; z < eZ; ++z) {
+
+				for (int d = 0; d < N_D3Q19; ++d) {
+					pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = -50.0;
+				}
+
+			} } } // x, y, z
+		} } } // bx, by, bz
+	}
+
+
+	// Initialize all PDFs to some standard value.
+	for (int x = oX; x < lX + oX; ++x) {
+	for (int y = oY; y < lY + oY; ++y) {
+	for (int z = oZ; z < lZ + oZ; ++z) {
+		for (int d = 0; d < N_D3Q19; ++d) {
+			pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = 0.0;
+		}
+	} } } // x, y, z
+
+
+	// Count how many *PDFs* need bounce back treatment.
+
+	uint64_t nPdfs = ((uint64_t)19) * gX * gY * gZ;
+
+	if (nPdfs > ((2LU << 31) - 1)) {
+		printf("ERROR: number of PDFs exceed 2^31.\n");
+		exit(1);
+	}
+
+	// Compiler bug? Incorrect computation of nBounceBackPdfs when using icc 15.0.2.
+	// Works when declaring nBounceBackPdfs as int64_t or using volatile.
+	volatile int nBounceBackPdfs = 0;
+	// int64_t nBounceBackPdfs = 0;
+	int nx, ny, nz, px, py, pz;
+
+
+	for (int x = 0; x < lX; ++x) {
+		for (int y = 0; y < lY; ++y) {
+			for (int z = 0; z < lZ; ++z) {
+
+				if (ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) {
+					for (int d = 0; d < N_D3Q19; ++d) {
+						nx = x - D3Q19_X[d];
+						ny = y - D3Q19_Y[d];
+						nz = z - D3Q19_Z[d];
+
+						// Check if neighbor is inside the lattice.
+						// if(nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) {
+						// 	continue;
+						// }
+						if ((nx < 0 || nx >= lX) && ld->PeriodicX) {
+							++nBounceBackPdfs; // Compiler bug --> see above
+						}
+						else if ((ny < 0 || ny >= lY) && ld->PeriodicY) {
+							++nBounceBackPdfs; // Compiler bug --> see above
+						}
+						else if ((nz < 0 || nz >= lZ) && ld->PeriodicZ) {
+							++nBounceBackPdfs; // Compiler bug --> see above
+						}
+						else if (nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) {
+							continue;
+						}
+						else if (ld->Lattice[L_INDEX_4(lDims, nx, ny, nz)] == LAT_CELL_OBSTACLE) {
+							++nBounceBackPdfs; // Compiler bug --> see above
+						}
+					}
+				}
+			}
+		}
+	}
+
+	printf("# allocating %d indices for bounce back pdfs (%s for source and destination array)\n", nBounceBackPdfs, ByteToHuman(sizeof(int) * nBounceBackPdfs * 2));
+
+	MemAlloc((void **) & (kd->BounceBackPdfsSrc), sizeof(int) * nBounceBackPdfs + 100);
+	MemAlloc((void **) & (kd->BounceBackPdfsDst), sizeof(int) * nBounceBackPdfs + 100);
+
+	kd->nBounceBackPdfs = nBounceBackPdfs;
+	nBounceBackPdfs = 0;
+
+	int srcIndex;
+	int dstIndex;
+
+	// TODO: currently this is not NUMA-aware
+    //       - maybe use the same blocking as for lattice initialization?
+	//       - do place the bounce back index vector parallel?
+
+	for (int x = 0; x < lX; ++x) {
+		for (int y = 0; y < lY; ++y) {
+			for (int z = 0; z < lZ; ++z) {
+
+				if (ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) {
+					for (int d = 0; d < N_D3Q19; ++d) {
+						nx = x + D3Q19_X[d];
+						ny = y + D3Q19_Y[d];
+						nz = z + D3Q19_Z[d];
+
+						if ( 	((nx < 0 || nx >= lX) && ld->PeriodicX) ||
+								((ny < 0 || ny >= lY) && ld->PeriodicY) ||
+								((nz < 0 || nz >= lZ) && ld->PeriodicZ)
+						){
+							// For periodicity:
+
+							// We assume we have finished odd time step (accessing neighbor PDFs) and are
+							// before executing the even time step (accessing local PDFs only).
+
+							// Assuming we are at the most east position of the lattice. Through the odd
+							// time step propagation has put a PDF in the east slot of the ghost cell east
+							// of us, i.e. nx, ny, nz.  We copy it to the east slot of the most west node.
+
+							// In case of transition from even to odd time step , src and dest must be
+							// exchanged.
+
+
+							// x periodic
+							if (nx < 0) {
+								px = lX - 1;
+							}
+							else if (nx >= lX) {
+								px = 0;
+							} else {
+								px = nx;
+							}
+
+							// y periodic
+							if (ny < 0) {
+								py = lY - 1;
+							}
+							else if (ny >= lY) {
+								py = 0;
+							} else {
+								py = ny;
+							}
+
+							// z periodic
+							if (nz < 0) {
+								pz = lZ - 1;
+							}
+							else if (nz >= lZ) {
+								pz = 0;
+							} else {
+								pz = nz;
+							}
+
+							if (ld->Lattice[L_INDEX_4(lDims, px, py, pz)] == LAT_CELL_OBSTACLE) {
+								// See description of bounce back handling below.
+								srcIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, d);
+								dstIndex = P_INDEX_5(gDims,  x + oX,  y + oY,  z + oZ, D3Q19_INV[d]);
+							}
+							else {
+
+								srcIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, d);
+								// Put it on the other side back into the domain.
+								dstIndex = P_INDEX_5(gDims, px + oX, py + oY, pz + oZ, d);
+
+								VerifyMsg(nBounceBackPdfs < kd->nBounceBackPdfs, "nBBPdfs %d < kd->nBBPdfs %d  xyz: %d %d %d d: %d\n", nBounceBackPdfs, kd->nBounceBackPdfs, x, y, z, d);
+
+							}
+
+							kd->BounceBackPdfsSrc[nBounceBackPdfs] = srcIndex;
+							kd->BounceBackPdfsDst[nBounceBackPdfs] = dstIndex;
+
+							++nBounceBackPdfs;
+
+						}
+						else if (nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) {
+							continue;
+						}
+						else if (ld->Lattice[L_INDEX_4(lDims, nx, ny, nz)] == LAT_CELL_OBSTACLE) {
+							// Depending on the time step we are in we have to exchange src and dst index.
+
+							// We build the list for the case, when we have finished odd time step
+							// (accessing neighbor PDFs) and before we start with the even time step
+							// (accessing local PDFs only).
+
+							// Assume our neighbor east of us, i.e. nx, ny, nz, is an obstacle cell.
+							// Then we have to move the east PDF from the obstacle to our west position,
+							// i.e. the inverse of east.
+
+							// In case of transition from even to odd time step src and dest just
+							// have to be exchanged.
+
+							srcIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, d);
+							dstIndex = P_INDEX_5(gDims,  x + oX,  y + oY,  z + oZ, D3Q19_INV[d]);
+
+							VerifyMsg(nBounceBackPdfs < kd->nBounceBackPdfs, "nBBPdfs %d < kd->nBBPdfs %d  xyz: %d %d %d d: %d\n", nBounceBackPdfs, kd->nBounceBackPdfs, x, y, z, d);
+
+							kd->BounceBackPdfsSrc[nBounceBackPdfs] = srcIndex;
+							kd->BounceBackPdfsDst[nBounceBackPdfs] = dstIndex;
+
+							++nBounceBackPdfs;
+						}
+					}
+				}
+			}
+		}
+	}
+
+
+	// Fill remaining KernelData structures
+	kd->GetNode = FNAME(GetNode);
+	kd->SetNode = FNAME(SetNode);
+
+	kd->BoundaryConditionsGetPdf = FNAME(BcGetPdf);
+	kd->BoundaryConditionsSetPdf = FNAME(BcSetPdf);
+
+	kd->Kernel = FNAME(D3Q19AaVecKernel);
+
+	kd->DstPdfs = NULL;
+	kd->PdfsActive = kd->Pdfs[0];
+
+	return;
+}
+
+void FNAME(D3Q19AaVecDeinit)(LatticeDesc * ld, KernelData ** kernelData)
+{
+	MemFree((void **) & ((*kernelData)->Pdfs[0]));
+
+	MemFree((void **) & ((*kernelData)->BounceBackPdfsSrc));
+	MemFree((void **) & ((*kernelData)->BounceBackPdfsDst));
+
+	MemFree((void **)kernelData);
+
+	return;
+}
+
diff --git a/src/BenchKernelD3Q19AaVecCommon.h b/src/BenchKernelD3Q19AaVecCommon.h
new file mode 100644
index 0000000..d416d07
--- /dev/null
+++ b/src/BenchKernelD3Q19AaVecCommon.h
@@ -0,0 +1,93 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#ifndef __BENCH_KERNEL_D3Q19_AA_VEC_COMMON_H__
+#define __BENCH_KERNEL_D3Q19_AA_VEC_COMMON_H__
+
+// #include "BenchKernelD3Q19AaCommon.h"
+
+#include "Kernel.h"
+
+typedef struct KernelDataAa_
+{
+	KernelData kd;
+	int Blk[3];			// Blocking in X, Y, and Z direction, value of 0 disables blocking.
+	int Iteration;		// Current iteration number.
+} KernelDataAa;
+
+// Macro for casting KernelData * to KernelDataAa *.
+#define KDA(_x_)	((KernelDataAa *)(_x_))
+
+// Build a function name extended by the propagation model name and the data layout.
+// FNANEM(test) will be expanded to test_PushSoA if DATA_LAYOUT_NAME is defined
+// as SoA and PROP_MODEL is defined as Push.
+#define FNAME(functionName)	JOIN(JOIN(functionName,_),JOIN(PROP_MODEL_NAME,DATA_LAYOUT_NAME))
+
+#ifndef DATA_LAYOUT_SOA
+	#error Only DATA_LAYOUT_SOA is supported.
+#endif
+
+#ifndef PROP_MODEL_AA
+	#error Only PROP_MODEL_AA is supported.
+#endif
+
+// -----------------------------------------------------------------------
+// Index function for accesssing PDF array for different data layouts.
+
+#define P_INDEX_5		FNAME(PINDEX5)
+
+static inline int FNAME(PINDEX5)(int dims[3], int x, int y, int z, int d)
+{
+	Assert(dims[0] > 0);
+	Assert(dims[1] > 0);
+	Assert(dims[2] > 0);
+
+	Assert(x >= 0);
+	Assert(x < dims[0]);
+	Assert(y >= 0);
+	Assert(y < dims[1]);
+	Assert(z >= 0);
+	Assert(z < dims[2]);
+	Assert(d >= 0);
+#ifdef D3Q19
+	Assert(d < N_D3Q19);
+#else
+	#error Not implemented for this discretization.
+#endif
+
+#ifdef DATA_LAYOUT_SOA
+	return d * dims[0] * dims[1] * dims[2] + x * dims[1] * dims[2] + y * dims[2] + z;
+// #elif DATA_LAYOUT_AOS
+// 	return x * dims[1] * dims[2] * N_D3Q19 + y * dims[2] * N_D3Q19 + z * N_D3Q19 + d;
+#else
+	#error P_INDEX_5 function no implemented for chosen data layout.
+#endif
+}
+
+
+
+#endif // __BENCH_KERNEL_D3Q19_AA_VEC_COMMON_H__
+
diff --git a/src/BenchKernelD3Q19Common.c b/src/BenchKernelD3Q19Common.c
index e697bc8..8f75649 100644
--- a/src/BenchKernelD3Q19Common.c
+++ b/src/BenchKernelD3Q19Common.c
@@ -32,6 +32,9 @@
 #include <inttypes.h>
 #include <math.h>
 
+#ifdef _OPENMP
+	#include <omp.h>
+#endif
 
 // Forward definition.
 void FNAME(D3Q19Kernel)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd);
@@ -224,8 +227,8 @@ static void ParseParameters(Parameters * params, int * blk)
 
 			int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 
-			if (tmp <= 0) {
-				printf("ERROR: blocking parameter must be > 0.\n");
+			if (tmp < 0) {
+				printf("ERROR: blocking parameter must be >= 0.\n");
 				exit(1);
 			}
 
@@ -236,8 +239,8 @@ static void ParseParameters(Parameters * params, int * blk)
 
 			int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 
-			if (tmp <= 0) {
-				printf("ERROR: blocking parameter must be > 0.\n");
+			if (tmp < 0) {
+				printf("ERROR: blocking parameter must be >= 0.\n");
 				exit(1);
 			}
 
@@ -248,8 +251,8 @@ static void ParseParameters(Parameters * params, int * blk)
 
 			int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 
-			if (tmp <= 0) {
-				printf("ERROR: blocking parameter must be > 0.\n");
+			if (tmp < 0) {
+				printf("ERROR: blocking parameter must be >= 0.\n");
 				exit(1);
 			}
 
@@ -260,8 +263,8 @@ static void ParseParameters(Parameters * params, int * blk)
 
 			int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 
-			if (tmp <= 0) {
-				printf("ERROR: blocking parameter must be > 0.\n");
+			if (tmp < 0) {
+				printf("ERROR: blocking parameter must be >= 0.\n");
 				exit(1);
 			}
 
@@ -285,7 +288,7 @@ static void ParseParameters(Parameters * params, int * blk)
 }
 
 
-void FNAME(D3Q19BlkInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
+static void D3Q19InternalInit(int blocking, LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
 {
 	KernelDataEx * kdex = NULL;
 	MemAlloc((void **)&kdex, sizeof(KernelDataEx));
@@ -332,7 +335,17 @@ void FNAME(D3Q19BlkInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters
 
 	PdfT * pdfs[2];
 
-	ParseParameters(params, blk);
+	if (blocking) {
+		ParseParameters(params, blk);
+	}
+	else {
+		if (params->nKernelArgs != 0) {
+			printf("ERROR: unknown kernel parameter.\n");
+			printf("This kernels accepts no parameters.\n");
+			exit(1);
+		}
+	}
+
 
 	if (blk[0] == 0) blk[0] = gX;
 	if (blk[1] == 0) blk[1] = gY;
@@ -348,8 +361,13 @@ void FNAME(D3Q19BlkInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters
 	       nCells, 2 * sizeof(PdfT) * nCells * N_D3Q19,
 	       2 * sizeof(PdfT) * nCells * N_D3Q19 / 1024.0 / 1024.0);
 
-	MemAlloc((void **)&pdfs[0], sizeof(PdfT) * nCells * N_D3Q19);
-	MemAlloc((void **)&pdfs[1], sizeof(PdfT) * nCells * N_D3Q19);
+	// MemAlloc((void **)&pdfs[0], sizeof(PdfT) * nCells * N_D3Q19);
+	// MemAlloc((void **)&pdfs[1], sizeof(PdfT) * nCells * N_D3Q19);
+	MemAllocAligned((void **)&pdfs[0], sizeof(PdfT) * nCells * N_D3Q19, 2 * 1024 * 1024);
+	MemAllocAligned((void **)&pdfs[1], sizeof(PdfT) * nCells * N_D3Q19, 2 * 1024 * 1024);
+
+	printf("# pdfs[0] = %p\n", pdfs[0]);
+	printf("# pdfs[1] = %p\n", pdfs[1]);
 
 	kd->Pdfs[0] = pdfs[0];
 	kd->Pdfs[1] = pdfs[1];
@@ -358,39 +376,82 @@ void FNAME(D3Q19BlkInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters
 	// This depends on the chosen data layout.
 	// The structure of the loop should resemble the same "execution layout"
 	// as in the kernel!
-#ifdef _OPENMP
-	#pragma omp parallel for collapse(3)
-#endif
 
-	for (int bZ = 0; bZ < gZ; bZ += blk[2]) {
-		for (int bY = 0; bY < gY; bY += blk[1]) {
-			for (int bX = 0; bX < gX; bX += blk[0]) {
+	if (blocking) {
+		int nX = ld->Dims[0];
+		int nY = ld->Dims[1];
+		int nZ = ld->Dims[2];
 
-				// Must do everything here, else it would break collapse.
-				int eZ = MIN(bZ + blk[2], gZ);
-				int eY = MIN(bY + blk[1], gY);
-				int eX = MIN(bX + blk[0], gX);
+		int nThreads = 1;
 
-				for (int z = bZ; z < eZ; ++z) {
-					for (int y = bY; y < eY; ++y) {
-						for (int x = bX; x < eX; ++x) {
+		#ifdef _OPENMP
+		nThreads = omp_get_max_threads();
+		#endif
 
-							for (int d = 0; d < N_D3Q19; ++d) {
-								pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = 1.0;
-								pdfs[1][P_INDEX_5(gDims, x, y, z, d)] = 1.0;
-							}
+		#ifdef _OPENMP
+			#pragma omp parallel for default(none) \
+				shared(pdfs, gDims, nX, nY, nZ, oX, oY, oZ, blk, nThreads)
+		#endif
+		for (int i = 0; i < nThreads; ++i) {
+
+			int threadStartX = nX / nThreads * i;
+			int threadEndX   = nX / nThreads * (i + 1);
 
+			if (nX % nThreads > 0) {
+				if (nX % nThreads > i) {
+					threadStartX += i;
+					threadEndX   += i + 1;
+				}
+				else {
+					threadStartX += nX % nThreads;
+					threadEndX   += nX % nThreads;
+				}
+			}
+
+			for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) {
+				for (int bY = oY; bY < nY + oY; bY += blk[1]) {
+					for (int bZ = oZ; bZ < nZ + oZ; bZ += blk[2]) {
+
+						int eZ = MIN(bZ + blk[2], nZ + oZ);
+						int eY = MIN(bY + blk[1], nY + oY);
+						int eX = MIN(bX + blk[0], threadEndX + oX);
+
+						for (int x = bX; x < eX; ++x) {
+							for (int y = bY; y < eY; ++y) {
+								for (int z = bZ; z < eZ; ++z) {
+									for (int d = 0; d < N_D3Q19; ++d) {
+										pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = 1.0;
+										pdfs[1][P_INDEX_5(gDims, x, y, z, d)] = 1.0;
+									}
+								}
+							}
 						}
 					}
 				}
 			}
 		}
+
+	}
+	else {
+		#ifdef _OPENMP
+			#pragma omp parallel for collapse(3)
+		#endif
+		for (int x = 0; x < gX; ++x) {
+			for (int y = 0; y < gY; ++y) {
+				for (int z = 0; z < gZ; ++z) {
+					for (int d = 0; d < N_D3Q19; ++d) {
+						pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = 1.0;
+						pdfs[1][P_INDEX_5(gDims, x, y, z, d)] = 1.0;
+					}
+				}
+			}
+		}
 	}
 
 	// Initialize all PDFs to some standard value.
-	for (int z = 0; z < gZ; ++z) {
+	for (int x = 0; x < gX; ++x) {
 		for (int y = 0; y < gY; ++y) {
-			for (int x = 0; x < gX; ++x) {
+			for (int z = 0; z < gZ; ++z) {
 				for (int d = 0; d < N_D3Q19; ++d) {
 					pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = 0.0;
 					pdfs[1][P_INDEX_5(gDims, x, y, z, d)] = 0.0;
@@ -417,9 +478,9 @@ void FNAME(D3Q19BlkInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters
 
 	// TODO: apply blocking?
 
-	for (int z = 0; z < lZ; ++z) {
+	for (int x = 0; x < lX; ++x) {
 		for (int y = 0; y < lY; ++y) {
-			for (int x = 0; x < lX; ++x) {
+			for (int z = 0; z < lZ; ++z) {
 
 				if (ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) {
 					for (int d = 0; d < N_D3Q19; ++d) {
@@ -471,9 +532,9 @@ void FNAME(D3Q19BlkInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters
 	int srcIndex;
 	int dstIndex;
 
-	for (int z = 0; z < lZ; ++z) {
+	for (int x = 0; x < lX; ++x) {
 		for (int y = 0; y < lY; ++y) {
-			for (int x = 0; x < lX; ++x) {
+			for (int z = 0; z < lZ; ++z) {
 
 				if (ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) {
 					for (int d = 0; d < N_D3Q19; ++d) {
@@ -598,7 +659,12 @@ void FNAME(D3Q19BlkInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters
 	kd->BoundaryConditionsGetPdf = FNAME(BcGetPdf);
 	kd->BoundaryConditionsSetPdf = FNAME(BcSetPdf);
 
-	kd->Kernel = FNAME(D3Q19BlkKernel);
+	if (blocking) {
+		kd->Kernel = FNAME(D3Q19BlkKernel);
+	}
+	else {
+		kd->Kernel = FNAME(D3Q19Kernel);
+	}
 
 	kd->DstPdfs = NULL;
 	kd->PdfsActive = kd->Pdfs[0];
@@ -606,6 +672,13 @@ void FNAME(D3Q19BlkInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters
 	return;
 }
 
+
+void FNAME(D3Q19BlkInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
+{
+	D3Q19InternalInit(1, ld, kernelData, params);
+}
+
+
 void FNAME(D3Q19BlkDeinit)(LatticeDesc * ld, KernelData ** kernelData)
 {
 	MemFree((void **) & ((*kernelData)->Pdfs[0]));
@@ -624,27 +697,7 @@ void FNAME(D3Q19BlkDeinit)(LatticeDesc * ld, KernelData ** kernelData)
 
 void FNAME(D3Q19Init)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
 {
-	Parameters p;
-
-	if (params->nKernelArgs != 0) {
-		printf("ERROR: unknown kernel parameter.\n");
-		printf("This kernels accepts no parameters.\n");
-		exit(1);
-	}
-
-	// Setup an empty parameters structure.
-	p.nArgs        = params->nArgs;
-	p.Args         = params->Args;
-	p.nKernelArgs  = 0;
-	p.KernelArgs   = NULL;
-
-	// Call init routine for blocking kernel and override the
-	// kernel function to be called later on.
-	FNAME(D3Q19BlkInit)(ld, kernelData, &p);
-
-	(*kernelData)->Kernel = FNAME(D3Q19Kernel);
-
-	return;
+	D3Q19InternalInit(0, ld, kernelData, params);
 
 }
 
diff --git a/src/BenchKernelD3Q19Common.h b/src/BenchKernelD3Q19Common.h
index cceddca..268bfed 100644
--- a/src/BenchKernelD3Q19Common.h
+++ b/src/BenchKernelD3Q19Common.h
@@ -74,9 +74,9 @@ static inline int FNAME(PINDEX5)(int dims[3], int x, int y, int z, int d)
 #endif
 
 #ifdef DATA_LAYOUT_SOA
-	return d * dims[0] * dims[1] * dims[2] + z * dims[0] * dims[1] + y * dims[0] + x;
+	return d * dims[0] * dims[1] * dims[2] + x * dims[1] * dims[2] + y * dims[2] + z;
 #elif DATA_LAYOUT_AOS
-	return z * dims[0] * dims[1] * N_D3Q19 + y * dims[0] * N_D3Q19 + x * N_D3Q19 + d;
+	return x * dims[1] * dims[2] * N_D3Q19 + y * dims[2] * N_D3Q19 + z * N_D3Q19 + d;
 #else
 #error P_INDEX_5 function no implemented for chosen data layout.
 #endif
diff --git a/src/BenchKernelD3Q19List.c b/src/BenchKernelD3Q19List.c
index 7922b7a..e01853a 100644
--- a/src/BenchKernelD3Q19List.c
+++ b/src/BenchKernelD3Q19List.c
@@ -28,6 +28,7 @@
 
 #include "Memory.h"
 #include "Vtk.h"
+#include "LikwidIf.h"
 
 #include <inttypes.h>
 #include <math.h>
@@ -97,6 +98,7 @@ void FNAME(D3Q19ListKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData
 	for(int iter = 0; iter < maxIterations; ++iter) {
 
 
+	X_LIKWID_START("list-os");
 
 	#ifdef _OPENMP
 		#pragma omp parallel for default(none) \
@@ -307,6 +309,8 @@ void FNAME(D3Q19ListKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData
 			#undef I
 		} // loop over fluid nodes
 
+		X_LIKWID_STOP("list-os");
+
 		#ifdef VERIFICATION
 			kd->PdfsActive = dst;
 			KernelAddBodyForce(kd, ld, cd);
diff --git a/src/BenchKernelD3Q19ListAaCommon.c b/src/BenchKernelD3Q19ListAaCommon.c
index d2bcb52..c1e48a5 100644
--- a/src/BenchKernelD3Q19ListAaCommon.c
+++ b/src/BenchKernelD3Q19ListAaCommon.c
@@ -28,6 +28,7 @@
 
 #include "Memory.h"
 #include "Vtk.h"
+#include "Padding.h"
 
 #include <math.h>
 
@@ -222,15 +223,18 @@ static void ParameterUsage()
 {
 	printf("Kernel parameters:\n");
 	printf("  [-blk <n>] [-blk-[xyz] <n>]\n");
-
+#ifdef DATA_LAYOUT_SOA
+	printf("  [-pad auto|modulus_1+offset_1(,modulus_n+offset_n)*]\n");
+#endif
 	return;
 }
 
-static void ParseParameters(Parameters * params, int * blk)
+static void ParseParameters(Parameters * params, int * blk, PadInfo ** padInfo)
 {
 	Assert(blk != NULL);
 
 	blk[0] = 0; blk[1] = 0; blk[2] = 0;
+	*padInfo = NULL;
 
 	#define ARG_IS(param)                   (!strcmp(params->KernelArgs[i], param))
 	#define NEXT_ARG_PRESENT() \
@@ -248,8 +252,8 @@ static void ParseParameters(Parameters * params, int * blk)
 
 			int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 
-			if (tmp <= 0) {
-				printf("ERROR: blocking parameter must be > 0.\n");
+			if (tmp < 0) {
+				printf("ERROR: blocking parameter must be >= 0.\n");
 				exit(1);
 			}
 
@@ -260,8 +264,8 @@ static void ParseParameters(Parameters * params, int * blk)
 
 			int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 
-			if (tmp <= 0) {
-				printf("ERROR: blocking parameter must be > 0.\n");
+			if (tmp < 0) {
+				printf("ERROR: blocking parameter must be >= 0.\n");
 				exit(1);
 			}
 
@@ -272,8 +276,8 @@ static void ParseParameters(Parameters * params, int * blk)
 
 			int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 
-			if (tmp <= 0) {
-				printf("ERROR: blocking parameter must be > 0.\n");
+			if (tmp < 0) {
+				printf("ERROR: blocking parameter must be >= 0.\n");
 				exit(1);
 			}
 
@@ -284,13 +288,20 @@ static void ParseParameters(Parameters * params, int * blk)
 
 			int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 
-			if (tmp <= 0) {
-				printf("ERROR: blocking parameter must be > 0.\n");
+			if (tmp < 0) {
+				printf("ERROR: blocking parameter must be >= 0.\n");
 				exit(1);
 			}
 
 			blk[2] = tmp;
 		}
+#ifdef DATA_LAYOUT_SOA
+		else if (ARG_IS("-pad") || ARG_IS("--pad")) {
+			NEXT_ARG_PRESENT();
+
+			*padInfo = PadInfoFromStr(params->KernelArgs[++i]);
+		}
+#endif
 		else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) {
 			ParameterUsage();
 			exit(1);
@@ -346,6 +357,11 @@ void FNAME(D3Q19ListAaInit)(LatticeDesc * ld, KernelData ** kernelData, Paramete
 	kdl->nFluid = -1;
 #endif
 
+	int blk[3] = { 0 };
+	PadInfo * padInfo = NULL;
+
+	ParseParameters(params, blk, &padInfo);
+
 	// Ajust the dimensions according to padding, if used.
 	kd->Dims[0] = kd->GlobalDims[0] = ld->Dims[0];
 	kd->Dims[1] = kd->GlobalDims[1] = ld->Dims[1];
@@ -358,18 +374,21 @@ void FNAME(D3Q19ListAaInit)(LatticeDesc * ld, KernelData ** kernelData, Paramete
 	int lZ = lDims[2];
 
 	int nTotalCells = lX * lY * lZ;
-	int nCells = ld->nFluid; // TODO: + padding
+	int nCells = ld->nFluid;
 	int nFluid = ld->nFluid;
 
+#ifdef DATA_LAYOUT_SOA
+	{
+		nCells = PadCellsAndReport(nCells, sizeof(PdfT), &padInfo);
+		PadInfoFree(padInfo); padInfo = NULL;
+	}
+#endif
+
 	kdl->nCells = nCells;
 	kdl->nFluid = nFluid;
 
 	PdfT * pdfs[2];
 
-	int blk[3] = { 0 };
-
-	ParseParameters(params, blk);
-
 	if (blk[0] == 0) blk[0] = lX;
 	if (blk[1] == 0) blk[1] = lY;
 	if (blk[2] == 0) blk[2] = lZ;
@@ -443,18 +462,17 @@ void FNAME(D3Q19ListAaInit)(LatticeDesc * ld, KernelData ** kernelData, Paramete
 
 	// Blocking is implemented via setup of the adjacency list. The kernel later will
 	// walk through the lattice blocked automatically.
-	for (int bZ = 0; bZ < lZ; bZ += blk[2]) {
-	for (int bY = 0; bY < lY; bY += blk[1]) {
 	for (int bX = 0; bX < lX; bX += blk[0]) {
+	for (int bY = 0; bY < lY; bY += blk[1]) {
+	for (int bZ = 0; bZ < lZ; bZ += blk[2]) {
 
 		int eX = MIN(bX + blk[0], lX);
 		int eY = MIN(bY + blk[1], lY);
 		int eZ = MIN(bZ + blk[2], lZ);
 
-
-		for (int z = bZ; z < eZ; ++z) {
-		for (int y = bY; y < eY; ++y) {
 		for (int x = bX; x < eX; ++x) {
+		for (int y = bY; y < eY; ++y) {
+		for (int z = bZ; z < eZ; ++z) {
 
 			latticeIndex = L_INDEX_4(lDims, x, y, z);
 
@@ -492,7 +510,15 @@ void FNAME(D3Q19ListAaInit)(LatticeDesc * ld, KernelData ** kernelData, Paramete
 
 	// Loop over all fluid nodes and compute the indices to the neighboring
 	// PDFs for configure data layout (AoS/SoA).
-	// TODO: Parallelized loop to ensure correct NUMA placement.
+	#ifdef _OPENMP
+		#pragma omp parallel for
+	#endif
+	for (int index = 0; index < nFluid; ++index) {
+		for (int d = 0; d < N_D3Q19_IDX; ++d) {
+			adjList[index * N_D3Q19_IDX + d] = -1;
+		}
+	}
+
 	// #ifdef _OPENMP --> add line continuation
 	// 	#pragma omp parallel for default(none)
 	// 		shared(nFluid, nCells, coords, D3Q19_INV, D3Q19_X, D3Q19_Y, D3Q19_Z,
diff --git a/src/BenchKernelD3Q19ListAaPv.c b/src/BenchKernelD3Q19ListAaPv.c
index c522252..b1dee16 100644
--- a/src/BenchKernelD3Q19ListAaPv.c
+++ b/src/BenchKernelD3Q19ListAaPv.c
@@ -728,9 +728,9 @@ static void KernelOdd(LatticeDesc * ld, KernelData * kernelData, CaseData * cd)
 			//voddPart  = vomegaOdd  * (VONE_HALF * (vpdf_TS - vpdf_BN) - vui * vw_2_x3);
 			voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TS,vpdf_BN)),VMUL(vui,vw_2_x3)));
 			//src[ADJ_LIST(D3Q19_TS)] =[UA] vpdf_TS - vevenPart - voddPart;
-			VSTU(ppdf_TN, VSUB(VSUB(vpdf_TS,vevenPart),voddPart));
+			VSTU(ppdf_BN, VSUB(VSUB(vpdf_TS,vevenPart),voddPart));
 			//src[ADJ_LIST(D3Q19_BN)] =[UA] vpdf_BN - vevenPart + voddPart;
-			VSTU(ppdf_BN, VADD(VSUB(vpdf_BN,vevenPart),voddPart));
+			VSTU(ppdf_TS, VADD(VSUB(vpdf_BN,vevenPart),voddPart));
 
 			//vui = vuy + vuz;
 			vui = VADD(vuy,vuz);
@@ -783,20 +783,20 @@ static void KernelOdd(LatticeDesc * ld, KernelData * kernelData, CaseData * cd)
 			ui = uy;
 			evenPart = omegaEven * (0.5 * (pdf_N + pdf_S) - ui * ui * w_1_nine_half - w_1_indep);
 			oddPart  = omegaOdd  * (0.5 * (pdf_N - pdf_S) - ui * w_1_x3);
-			src[ADJ_LIST(D3Q19_N)]  = pdf_N - evenPart - oddPart;
-			src[ADJ_LIST(D3Q19_S)]  = pdf_S - evenPart + oddPart;
+			*ppdf_S  = pdf_N - evenPart - oddPart;
+			*ppdf_N  = pdf_S - evenPart + oddPart;
 
 			ui = ux;
 			evenPart = omegaEven * (0.5 * (pdf_E + pdf_W) - ui * ui * w_1_nine_half - w_1_indep);
 			oddPart  = omegaOdd  * (0.5 * (pdf_E - pdf_W) - ui * w_1_x3);
-			src[ADJ_LIST(D3Q19_E)]  = pdf_E - evenPart - oddPart;
-			src[ADJ_LIST(D3Q19_W)]  = pdf_W - evenPart + oddPart;
+			*ppdf_W  = pdf_E - evenPart - oddPart;
+			*ppdf_E  = pdf_W - evenPart + oddPart;
 
 			ui = uz;
 			evenPart = omegaEven * (0.5 * (pdf_T + pdf_B) - ui * ui * w_1_nine_half - w_1_indep);
 			oddPart  = omegaOdd  * (0.5 * (pdf_T - pdf_B) - ui * w_1_x3);
-			src[ADJ_LIST(D3Q19_T)]  = pdf_T - evenPart - oddPart;
-			src[ADJ_LIST(D3Q19_B)]  = pdf_B - evenPart + oddPart;
+			*ppdf_B  = pdf_T - evenPart - oddPart;
+			*ppdf_T  = pdf_B - evenPart + oddPart;
 
 			// direction: w_2
 			w_2_indep = w_2 * dir_indep_trm;
@@ -804,38 +804,38 @@ static void KernelOdd(LatticeDesc * ld, KernelData * kernelData, CaseData * cd)
 			ui = -ux + uy;
 			evenPart = omegaEven * (0.5 * (pdf_NW + pdf_SE) - ui * ui * w_2_nine_half - w_2_indep);
 			oddPart  = omegaOdd  * (0.5 * (pdf_NW - pdf_SE) - ui * w_2_x3);
-			src[ADJ_LIST(D3Q19_NW)] = pdf_NW - evenPart - oddPart;
-			src[ADJ_LIST(D3Q19_SE)] = pdf_SE - evenPart + oddPart;
+			*ppdf_SE = pdf_NW - evenPart - oddPart;
+			*ppdf_NW = pdf_SE - evenPart + oddPart;
 
 			ui = ux + uy;
 			evenPart = omegaEven * (0.5 * (pdf_NE + pdf_SW) - ui * ui * w_2_nine_half - w_2_indep);
 			oddPart  = omegaOdd  * (0.5 * (pdf_NE - pdf_SW) - ui * w_2_x3);
-			src[ADJ_LIST(D3Q19_NE)] = pdf_NE - evenPart - oddPart;
-			src[ADJ_LIST(D3Q19_SW)] = pdf_SW - evenPart + oddPart;
+			*ppdf_SW = pdf_NE - evenPart - oddPart;
+			*ppdf_NE = pdf_SW - evenPart + oddPart;
 
 			ui = -ux + uz;
 			evenPart = omegaEven * (0.5 * (pdf_TW + pdf_BE) - ui * ui * w_2_nine_half - w_2_indep);
 			oddPart  = omegaOdd  * (0.5 * (pdf_TW - pdf_BE) - ui * w_2_x3);
-			src[ADJ_LIST(D3Q19_TW)] = pdf_TW - evenPart - oddPart;
-			src[ADJ_LIST(D3Q19_BE)] = pdf_BE - evenPart + oddPart;
+			*ppdf_BE = pdf_TW - evenPart - oddPart;
+			*ppdf_TW = pdf_BE - evenPart + oddPart;
 
 			ui = ux + uz;
 			evenPart = omegaEven * (0.5 * (pdf_TE + pdf_BW) - ui * ui * w_2_nine_half - w_2_indep);
 			oddPart  = omegaOdd  * (0.5 * (pdf_TE - pdf_BW) - ui * w_2_x3);
-			src[ADJ_LIST(D3Q19_TE)] = pdf_TE - evenPart - oddPart;
-			src[ADJ_LIST(D3Q19_BW)] = pdf_BW - evenPart + oddPart;
+			*ppdf_BW = pdf_TE - evenPart - oddPart;
+			*ppdf_TE = pdf_BW - evenPart + oddPart;
 
 			ui = -uy + uz;
 			evenPart = omegaEven * (0.5 * (pdf_TS + pdf_BN) - ui * ui * w_2_nine_half - w_2_indep);
 			oddPart  = omegaOdd  * (0.5 * (pdf_TS - pdf_BN) - ui * w_2_x3);
-			src[ADJ_LIST(D3Q19_TS)] = pdf_TS - evenPart - oddPart;
-			src[ADJ_LIST(D3Q19_BN)] = pdf_BN - evenPart + oddPart;
+			*ppdf_BN = pdf_TS - evenPart - oddPart;
+			*ppdf_TS = pdf_BN - evenPart + oddPart;
 
 			ui = uy + uz;
 			evenPart = omegaEven * (0.5 * (pdf_TN + pdf_BS) - ui * ui * w_2_nine_half - w_2_indep);
 			oddPart  = omegaOdd  * (0.5 * (pdf_TN - pdf_BS) - ui * w_2_x3);
-			src[ADJ_LIST(D3Q19_TN)] = pdf_TN - evenPart - oddPart;
-			src[ADJ_LIST(D3Q19_BS)] = pdf_BS - evenPart + oddPart;
+			*ppdf_BS = pdf_TN - evenPart - oddPart;
+			*ppdf_TN = pdf_BS - evenPart + oddPart;
 
 			pointerOffset = 1;
 		}
diff --git a/src/BenchKernelD3Q19ListAaPvCommon.c b/src/BenchKernelD3Q19ListAaPvCommon.c
index 9cdf0e1..7b479cf 100644
--- a/src/BenchKernelD3Q19ListAaPvCommon.c
+++ b/src/BenchKernelD3Q19ListAaPvCommon.c
@@ -28,6 +28,7 @@
 
 #include "Memory.h"
 #include "Vtk.h"
+#include "Padding.h"
 
 #include <math.h>
 
@@ -225,15 +226,19 @@ static void ParameterUsage()
 {
 	printf("Kernel parameters:\n");
 	printf("  [-blk <n>] [-blk-[xyz] <n>]\n");
+#ifdef DATA_LAYOUT_SOA
+	printf("  [-pad auto|modulus_1+offset_1(,modulus_n+offset_n)*]\n");
+#endif
 
 	return;
 }
 
-static void ParseParameters(Parameters * params, int * blk)
+static void ParseParameters(Parameters * params, int * blk, PadInfo ** padInfo)
 {
 	Assert(blk != NULL);
 
 	blk[0] = 0; blk[1] = 0; blk[2] = 0;
+	*padInfo = NULL;
 
 	#define ARG_IS(param)                   (!strcmp(params->KernelArgs[i], param))
 	#define NEXT_ARG_PRESENT() \
@@ -251,8 +256,8 @@ static void ParseParameters(Parameters * params, int * blk)
 
 			int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 
-			if (tmp <= 0) {
-				printf("ERROR: blocking parameter must be > 0.\n");
+			if (tmp < 0) {
+				printf("ERROR: blocking parameter must be >= 0.\n");
 				exit(1);
 			}
 
@@ -263,8 +268,8 @@ static void ParseParameters(Parameters * params, int * blk)
 
 			int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 
-			if (tmp <= 0) {
-				printf("ERROR: blocking parameter must be > 0.\n");
+			if (tmp < 0) {
+				printf("ERROR: blocking parameter must be >= 0.\n");
 				exit(1);
 			}
 
@@ -275,8 +280,8 @@ static void ParseParameters(Parameters * params, int * blk)
 
 			int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 
-			if (tmp <= 0) {
-				printf("ERROR: blocking parameter must be > 0.\n");
+			if (tmp < 0) {
+				printf("ERROR: blocking parameter must be >= 0.\n");
 				exit(1);
 			}
 
@@ -287,13 +292,20 @@ static void ParseParameters(Parameters * params, int * blk)
 
 			int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 
-			if (tmp <= 0) {
-				printf("ERROR: blocking parameter must be > 0.\n");
+			if (tmp < 0) {
+				printf("ERROR: blocking parameter must be >= 0.\n");
 				exit(1);
 			}
 
 			blk[2] = tmp;
 		}
+#ifdef DATA_LAYOUT_SOA
+		else if (ARG_IS("-pad") || ARG_IS("--pad")) {
+			NEXT_ARG_PRESENT();
+
+			*padInfo = PadInfoFromStr(params->KernelArgs[++i]);
+		}
+#endif
 		else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) {
 			ParameterUsage();
 			exit(1);
@@ -502,6 +514,11 @@ void FNAME(D3Q19ListAaPvInit)(LatticeDesc * ld, KernelData ** kernelData, Parame
 	kdlr->nConsecThreadIndices = 0;
 #endif
 
+	int blk[3] = { 0 };
+	PadInfo * padInfo = NULL;
+
+	ParseParameters(params, blk, &padInfo);
+
 	// Ajust the dimensions according to padding, if used.
 	kd->Dims[0] = kd->GlobalDims[0] = ld->Dims[0];
 	kd->Dims[1] = kd->GlobalDims[1] = ld->Dims[1];
@@ -514,17 +531,21 @@ void FNAME(D3Q19ListAaPvInit)(LatticeDesc * ld, KernelData ** kernelData, Parame
 	int lZ = lDims[2];
 
 	int nTotalCells = lX * lY * lZ;
-	int nCells = ld->nFluid; // TODO: + padding
+	int nCells = ld->nFluid;
 	int nFluid = ld->nFluid;
 
+#ifdef DATA_LAYOUT_SOA
+	{
+		nCells = PadCellsAndReport(nCells, sizeof(PdfT), &padInfo);
+		PadInfoFree(padInfo); padInfo = NULL;
+	}
+#endif
+
 	kdl->nCells = nCells;
 	kdl->nFluid = nFluid;
 
 	PdfT * pdfs[2];
 
-	int blk[3] = { 0 };
-
-	ParseParameters(params, blk);
 
 	if (blk[0] == 0) blk[0] = lX;
 	if (blk[1] == 0) blk[1] = lY;
@@ -607,18 +628,18 @@ void FNAME(D3Q19ListAaPvInit)(LatticeDesc * ld, KernelData ** kernelData, Parame
 
 	// Blocking is implemented via setup of the adjacency list. The kernel later will
 	// walk through the lattice blocked automatically.
-	for (int bZ = 0; bZ < lZ; bZ += blk[2]) {
-	for (int bY = 0; bY < lY; bY += blk[1]) {
 	for (int bX = 0; bX < lX; bX += blk[0]) {
+	for (int bY = 0; bY < lY; bY += blk[1]) {
+	for (int bZ = 0; bZ < lZ; bZ += blk[2]) {
 
 		int eX = MIN(bX + blk[0], lX);
 		int eY = MIN(bY + blk[1], lY);
 		int eZ = MIN(bZ + blk[2], lZ);
 
 
-		for (int z = bZ; z < eZ; ++z) {
-		for (int y = bY; y < eY; ++y) {
 		for (int x = bX; x < eX; ++x) {
+		for (int y = bY; y < eY; ++y) {
+		for (int z = bZ; z < eZ; ++z) {
 
 			latticeIndex = L_INDEX_4(lDims, x, y, z);
 
@@ -660,7 +681,15 @@ void FNAME(D3Q19ListAaPvInit)(LatticeDesc * ld, KernelData ** kernelData, Parame
 
 	// Loop over all fluid nodes and compute the indices to the neighboring
 	// PDFs for configured data layout (AoS/SoA).
-	// TODO: Parallelized loop to ensure correct NUMA placement.
+	#ifdef _OPENMP
+		#pragma omp parallel for
+	#endif
+	for (int index = 0; index < nFluid; ++index) {
+		for (int d = 0; d < N_D3Q19_IDX; ++d) {
+			adjList[index * N_D3Q19_IDX + d] = -1;
+		}
+	}
+
 	// #ifdef _OPENMP  --> add line continuation
 	// 	#pragma omp parallel for default(none)
 	// 		shared(nFluid, nCells, coords, D3Q19_INV, D3Q19_X, D3Q19_Y, D3Q19_Z,
diff --git a/src/BenchKernelD3Q19ListAaRiaCommon.c b/src/BenchKernelD3Q19ListAaRiaCommon.c
index 7faf37b..c6ba13a 100644
--- a/src/BenchKernelD3Q19ListAaRiaCommon.c
+++ b/src/BenchKernelD3Q19ListAaRiaCommon.c
@@ -28,6 +28,7 @@
 
 #include "Memory.h"
 #include "Vtk.h"
+#include "Padding.h"
 
 #include <math.h>
 
@@ -225,15 +226,19 @@ static void ParameterUsage()
 {
 	printf("Kernel parameters:\n");
 	printf("  [-blk <n>] [-blk-[xyz] <n>]\n");
+#ifdef DATA_LAYOUT_SOA
+	printf("  [-pad auto|modulus_1+offset_1(,modulus_n+offset_n)*]\n");
+#endif
 
 	return;
 }
 
-static void ParseParameters(Parameters * params, int * blk)
+static void ParseParameters(Parameters * params, int * blk, PadInfo ** padInfo)
 {
 	Assert(blk != NULL);
 
 	blk[0] = 0; blk[1] = 0; blk[2] = 0;
+	*padInfo = NULL;
 
 	#define ARG_IS(param)                   (!strcmp(params->KernelArgs[i], param))
 	#define NEXT_ARG_PRESENT() \
@@ -251,8 +256,8 @@ static void ParseParameters(Parameters * params, int * blk)
 
 			int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 
-			if (tmp <= 0) {
-				printf("ERROR: blocking parameter must be > 0.\n");
+			if (tmp < 0) {
+				printf("ERROR: blocking parameter must be >= 0.\n");
 				exit(1);
 			}
 
@@ -263,8 +268,8 @@ static void ParseParameters(Parameters * params, int * blk)
 
 			int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 
-			if (tmp <= 0) {
-				printf("ERROR: blocking parameter must be > 0.\n");
+			if (tmp < 0) {
+				printf("ERROR: blocking parameter must be >= 0.\n");
 				exit(1);
 			}
 
@@ -275,8 +280,8 @@ static void ParseParameters(Parameters * params, int * blk)
 
 			int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 
-			if (tmp <= 0) {
-				printf("ERROR: blocking parameter must be > 0.\n");
+			if (tmp < 0) {
+				printf("ERROR: blocking parameter must be >= 0.\n");
 				exit(1);
 			}
 
@@ -287,13 +292,20 @@ static void ParseParameters(Parameters * params, int * blk)
 
 			int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 
-			if (tmp <= 0) {
-				printf("ERROR: blocking parameter must be > 0.\n");
+			if (tmp < 0) {
+				printf("ERROR: blocking parameter must be >= 0.\n");
 				exit(1);
 			}
 
 			blk[2] = tmp;
 		}
+#ifdef DATA_LAYOUT_SOA
+		else if (ARG_IS("-pad") || ARG_IS("--pad")) {
+			NEXT_ARG_PRESENT();
+
+			*padInfo = PadInfoFromStr(params->KernelArgs[++i]);
+		}
+#endif
 		else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) {
 			ParameterUsage();
 			exit(1);
@@ -485,6 +497,11 @@ void FNAME(D3Q19ListAaRiaInit)(LatticeDesc * ld, KernelData ** kernelData, Param
 	kdlr->nConsecThreadIndices = 0;
 #endif
 
+	int blk[3] = { 0 };
+	PadInfo * padInfo = NULL;
+
+	ParseParameters(params, blk, &padInfo);
+
 	// Ajust the dimensions according to padding, if used.
 	kd->Dims[0] = kd->GlobalDims[0] = ld->Dims[0];
 	kd->Dims[1] = kd->GlobalDims[1] = ld->Dims[1];
@@ -500,14 +517,18 @@ void FNAME(D3Q19ListAaRiaInit)(LatticeDesc * ld, KernelData ** kernelData, Param
 	int nCells = ld->nFluid; // TODO: + padding
 	int nFluid = ld->nFluid;
 
+#ifdef DATA_LAYOUT_SOA
+	{
+		nCells = PadCellsAndReport(nCells, sizeof(PdfT), &padInfo);
+		PadInfoFree(padInfo); padInfo = NULL;
+	}
+#endif
+
 	kdl->nCells = nCells;
 	kdl->nFluid = nFluid;
 
 	PdfT * pdfs[2];
 
-	int blk[3] = { 0 };
-
-	ParseParameters(params, blk);
 
 	if (blk[0] == 0) blk[0] = lX;
 	if (blk[1] == 0) blk[1] = lY;
@@ -590,18 +611,17 @@ void FNAME(D3Q19ListAaRiaInit)(LatticeDesc * ld, KernelData ** kernelData, Param
 
 	// Blocking is implemented via setup of the adjacency list. The kernel later will
 	// walk through the lattice blocked automatically.
-	for (int bZ = 0; bZ < lZ; bZ += blk[2]) {
-	for (int bY = 0; bY < lY; bY += blk[1]) {
 	for (int bX = 0; bX < lX; bX += blk[0]) {
+	for (int bY = 0; bY < lY; bY += blk[1]) {
+	for (int bZ = 0; bZ < lZ; bZ += blk[2]) {
 
 		int eX = MIN(bX + blk[0], lX);
 		int eY = MIN(bY + blk[1], lY);
 		int eZ = MIN(bZ + blk[2], lZ);
 
-
-		for (int z = bZ; z < eZ; ++z) {
-		for (int y = bY; y < eY; ++y) {
 		for (int x = bX; x < eX; ++x) {
+		for (int y = bY; y < eY; ++y) {
+		for (int z = bZ; z < eZ; ++z) {
 
 			latticeIndex = L_INDEX_4(lDims, x, y, z);
 
@@ -643,7 +663,15 @@ void FNAME(D3Q19ListAaRiaInit)(LatticeDesc * ld, KernelData ** kernelData, Param
 
 	// Loop over all fluid nodes and compute the indices to the neighboring
 	// PDFs for configured data layout (AoS/SoA).
-	// TODO: Parallelized loop to ensure correct NUMA placement.
+	#ifdef _OPENMP
+		#pragma omp parallel for
+	#endif
+	for (int index = 0; index < nFluid; ++index) {
+		for (int d = 0; d < N_D3Q19_IDX; ++d) {
+			adjList[index * N_D3Q19_IDX + d] = -1;
+		}
+	}
+
 	// #ifdef _OPENMP  --> add line continuation
 	// 	#pragma omp parallel for default(none)
 	// 		shared(nFluid, nCells, coords, D3Q19_INV, D3Q19_X, D3Q19_Y, D3Q19_Z,
diff --git a/src/BenchKernelD3Q19ListCommon.c b/src/BenchKernelD3Q19ListCommon.c
index 4f97bae..f66e379 100644
--- a/src/BenchKernelD3Q19ListCommon.c
+++ b/src/BenchKernelD3Q19ListCommon.c
@@ -28,6 +28,7 @@
 
 #include "Memory.h"
 #include "Vtk.h"
+#include "Padding.h"
 
 #include <math.h>
 
@@ -200,15 +201,19 @@ static void ParameterUsage()
 {
 	printf("Kernel parameters:\n");
 	printf("  [-blk <n>] [-blk-[xyz] <n>]\n");
+#ifdef DATA_LAYOUT_SOA
+	printf("  [-pad auto|modulus_1+offset_1(,modulus_n+offset_n)*]\n");
+#endif
 
 	return;
 }
 
-static void ParseParameters(Parameters * params, int * blk)
+static void ParseParameters(Parameters * params, int * blk, PadInfo ** padInfo)
 {
 	Assert(blk != NULL);
 
 	blk[0] = 0; blk[1] = 0; blk[2] = 0;
+	*padInfo = NULL;
 
 	#define ARG_IS(param)                   (!strcmp(params->KernelArgs[i], param))
 	#define NEXT_ARG_PRESENT() \
@@ -226,8 +231,8 @@ static void ParseParameters(Parameters * params, int * blk)
 
 			int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 
-			if (tmp <= 0) {
-				printf("ERROR: blocking parameter must be > 0.\n");
+			if (tmp < 0) {
+				printf("ERROR: blocking parameter must be >= 0.\n");
 				exit(1);
 			}
 
@@ -238,8 +243,8 @@ static void ParseParameters(Parameters * params, int * blk)
 
 			int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 
-			if (tmp <= 0) {
-				printf("ERROR: blocking parameter must be > 0.\n");
+			if (tmp < 0) {
+				printf("ERROR: blocking parameter must be >= 0.\n");
 				exit(1);
 			}
 
@@ -250,8 +255,8 @@ static void ParseParameters(Parameters * params, int * blk)
 
 			int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 
-			if (tmp <= 0) {
-				printf("ERROR: blocking parameter must be > 0.\n");
+			if (tmp < 0) {
+				printf("ERROR: blocking parameter must be >= 0.\n");
 				exit(1);
 			}
 
@@ -262,8 +267,8 @@ static void ParseParameters(Parameters * params, int * blk)
 
 			int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 
-			if (tmp <= 0) {
-				printf("ERROR: blocking parameter must be > 0.\n");
+			if (tmp < 0) {
+				printf("ERROR: blocking parameter must be >= 0.\n");
 				exit(1);
 			}
 
@@ -273,6 +278,13 @@ static void ParseParameters(Parameters * params, int * blk)
 			ParameterUsage();
 			exit(1);
 		}
+#ifdef DATA_LAYOUT_SOA
+		else if (ARG_IS("-pad") || ARG_IS("--pad")) {
+			NEXT_ARG_PRESENT();
+
+			*padInfo = PadInfoFromStr(params->KernelArgs[++i]);
+		}
+#endif
 		else {
 			printf("ERROR: unknown kernel parameter.\n");
 			ParameterUsage();
@@ -324,6 +336,12 @@ void FNAME(D3Q19ListInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters
 	kdl->nFluid = -1;
 #endif
 
+	int blk[3] = { 0 };
+	PadInfo * padInfo = NULL;
+
+	ParseParameters(params, blk, &padInfo);
+
+
 	// Ajust the dimensions according to padding, if used.
 	kd->Dims[0] = kd->GlobalDims[0] = ld->Dims[0];
 	kd->Dims[1] = kd->GlobalDims[1] = ld->Dims[1];
@@ -336,18 +354,21 @@ void FNAME(D3Q19ListInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters
 	int lZ = lDims[2];
 
 	int nTotalCells = lX * lY * lZ;
-	int nCells = ld->nFluid; // TODO: + padding
+	int nCells = ld->nFluid;
 	int nFluid = ld->nFluid;
 
+#ifdef DATA_LAYOUT_SOA
+	{
+		nCells = PadCellsAndReport(nCells, sizeof(PdfT), &padInfo);
+		PadInfoFree(padInfo); padInfo = NULL;
+	}
+#endif
+
 	kdl->nCells = nCells;
 	kdl->nFluid = nFluid;
 
 	PdfT * pdfs[2];
 
-	int blk[3] = { 0 };
-
-	ParseParameters(params, blk);
-
 	if (blk[0] == 0) blk[0] = lX;
 	if (blk[1] == 0) blk[1] = lY;
 	if (blk[2] == 0) blk[2] = lZ;
@@ -425,18 +446,17 @@ void FNAME(D3Q19ListInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters
 
 	// Blocking is implemented via setup of the adjacency list. The kernel later will
 	// walk through the lattice blocked automatically.
-	for (int bZ = 0; bZ < lZ; bZ += blk[2]) {
-	for (int bY = 0; bY < lY; bY += blk[1]) {
 	for (int bX = 0; bX < lX; bX += blk[0]) {
+	for (int bY = 0; bY < lY; bY += blk[1]) {
+	for (int bZ = 0; bZ < lZ; bZ += blk[2]) {
 
 		int eX = MIN(bX + blk[0], lX);
 		int eY = MIN(bY + blk[1], lY);
 		int eZ = MIN(bZ + blk[2], lZ);
 
-
-		for (int z = bZ; z < eZ; ++z) {
-		for (int y = bY; y < eY; ++y) {
 		for (int x = bX; x < eX; ++x) {
+		for (int y = bY; y < eY; ++y) {
+		for (int z = bZ; z < eZ; ++z) {
 
 			latticeIndex = L_INDEX_4(lDims, x, y, z);
 
@@ -474,7 +494,15 @@ void FNAME(D3Q19ListInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters
 
 	// Loop over all fluid nodes and compute the indices to the neighboring
 	// PDFs for configured data layout (AoS/SoA).
-	// TODO: Parallelized loop to ensure correct NUMA placement.
+	#ifdef _OPENMP
+		#pragma omp parallel for
+	#endif
+	for (int index = 0; index < nFluid; ++index) {
+		for (int d = 0; d < N_D3Q19_IDX; ++d) {
+			adjList[index * N_D3Q19_IDX + d] = -1;
+		}
+	}
+
 	// #ifdef _OPENMP  --> add line continuation
 	// 	#pragma omp parallel for default(none)
 	// 		shared(nFluid, nCells, coords, D3Q19_INV, D3Q19_X, D3Q19_Y, D3Q19_Z,
diff --git a/src/BenchKernelD3Q19ListPullSplitNt.c b/src/BenchKernelD3Q19ListPullSplitNt.c
index dfab54a..0132dc9 100644
--- a/src/BenchKernelD3Q19ListPullSplitNt.c
+++ b/src/BenchKernelD3Q19ListPullSplitNt.c
@@ -29,6 +29,7 @@
 #include "Memory.h"
 #include "Vtk.h"
 #include "Vector.h"
+#include "LikwidIf.h"
 
 #include <inttypes.h>
 #include <math.h>
@@ -118,6 +119,8 @@ void FNAME(KernelPullSplitNt1S)(LatticeDesc * ld, KernelData * kernelData, CaseD
 		KernelStatistics(kd, ld, cd, 0);
 	#endif
 
+
+			X_LIKWID_START("list-pull-split-nt-1s");
 	#ifdef _OPENMP
 		#pragma omp parallel default(none) \
 			shared(nFluid, nCells, kd, kdl, adjList, src, dst, \
@@ -184,6 +187,7 @@ void FNAME(KernelPullSplitNt1S)(LatticeDesc * ld, KernelData * kernelData, CaseD
 
 		for(int iter = 0; iter < maxIterations; ++iter) {
 
+
 #if 1
 			#define INDEX_START	blIndexStart
 			#define INDEX_STOP  blIndexVec
@@ -201,6 +205,8 @@ void FNAME(KernelPullSplitNt1S)(LatticeDesc * ld, KernelData * kernelData, CaseD
 			#define INDEX_STOP	blIndexStop
 			#include "BenchKernelD3Q19ListPullSplitNt1SScalar.h"
 #endif
+
+
 			#pragma omp barrier
 
 			#pragma omp single
@@ -235,6 +241,9 @@ void FNAME(KernelPullSplitNt1S)(LatticeDesc * ld, KernelData * kernelData, CaseD
 		MemFree((void **)&tmpArray);
 	}
 
+
+	X_LIKWID_STOP("list-pull-split-nt-1s");
+
 #ifdef VTK_OUTPUT
 	if (cd->VtkOutput) {
 		kd->PdfsActive = src;
@@ -321,6 +330,10 @@ void FNAME(KernelPullSplitNt2S)(LatticeDesc * ld, KernelData * kernelData, CaseD
 		KernelStatistics(kd, ld, cd, 0);
 	#endif
 
+
+			X_LIKWID_START("list-pull-split-nt-2s");
+
+
 	#ifdef _OPENMP
 		#pragma omp parallel default(none) \
 			shared(nFluid, nCells, kd, kdl, adjList, src, dst, \
@@ -406,6 +419,7 @@ void FNAME(KernelPullSplitNt2S)(LatticeDesc * ld, KernelData * kernelData, CaseD
 #endif
 			#pragma omp barrier
 
+
 			#pragma omp single
 			{
 				#ifdef VERIFICATION
@@ -438,6 +452,8 @@ void FNAME(KernelPullSplitNt2S)(LatticeDesc * ld, KernelData * kernelData, CaseD
 		MemFree((void **)&tmpArray);
 	}
 
+			X_LIKWID_STOP("list-pull-split-nt-2s");
+
 #ifdef VTK_OUTPUT
 	if (cd->VtkOutput) {
 		kd->PdfsActive = src;
diff --git a/src/BenchKernelD3Q19ListPullSplitNtCommon.c b/src/BenchKernelD3Q19ListPullSplitNtCommon.c
index b5df14b..8c8da88 100644
--- a/src/BenchKernelD3Q19ListPullSplitNtCommon.c
+++ b/src/BenchKernelD3Q19ListPullSplitNtCommon.c
@@ -29,6 +29,8 @@
 #include "Memory.h"
 #include "Vtk.h"
 #include "Vector.h"
+#include "Padding.h"
+
 
 #include <math.h>
 
@@ -183,16 +185,18 @@ static void ParameterUsage()
 {
 	printf("Kernel parameters:\n");
 	printf("  [-blk <n>] [-blk-[xyz] <n>] [-n-tmp-array <n>]\n");
+	printf("  [-pad auto|modulus_1+offset_1(,modulus_n+offset_n)*]\n");
 
 	return;
 }
 
-static void ParseParameters(Parameters * params, int * blk, int * nTmpArray)
+static void ParseParameters(Parameters * params, int * blk, int * nTmpArray, PadInfo ** padInfo)
 {
 	Assert(blk != NULL);
 
 	blk[0] = 0; blk[1] = 0; blk[2] = 0;
 	*nTmpArray = 152;
+	*padInfo = NULL;
 
 	#define ARG_IS(param)                   (!strcmp(params->KernelArgs[i], param))
 	#define NEXT_ARG_PRESENT() \
@@ -210,8 +214,8 @@ static void ParseParameters(Parameters * params, int * blk, int * nTmpArray)
 
 			int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 
-			if (tmp <= 0) {
-				printf("ERROR: blocking parameter must be > 0.\n");
+			if (tmp < 0) {
+				printf("ERROR: blocking parameter must be >= 0.\n");
 				exit(1);
 			}
 
@@ -222,8 +226,8 @@ static void ParseParameters(Parameters * params, int * blk, int * nTmpArray)
 
 			int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 
-			if (tmp <= 0) {
-				printf("ERROR: blocking parameter must be > 0.\n");
+			if (tmp < 0) {
+				printf("ERROR: blocking parameter must be >= 0.\n");
 				exit(1);
 			}
 
@@ -234,8 +238,8 @@ static void ParseParameters(Parameters * params, int * blk, int * nTmpArray)
 
 			int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 
-			if (tmp <= 0) {
-				printf("ERROR: blocking parameter must be > 0.\n");
+			if (tmp < 0) {
+				printf("ERROR: blocking parameter must be >= 0.\n");
 				exit(1);
 			}
 
@@ -246,8 +250,8 @@ static void ParseParameters(Parameters * params, int * blk, int * nTmpArray)
 
 			int tmp = strtol(params->KernelArgs[++i], NULL, 0);
 
-			if (tmp <= 0) {
-				printf("ERROR: blocking parameter must be > 0.\n");
+			if (tmp < 0) {
+				printf("ERROR: blocking parameter must be >= 0.\n");
 				exit(1);
 			}
 
@@ -270,6 +274,11 @@ static void ParseParameters(Parameters * params, int * blk, int * nTmpArray)
 
 			*nTmpArray = tmp;
 		}
+		else if (ARG_IS("-pad") || ARG_IS("--pad")) {
+			NEXT_ARG_PRESENT();
+
+			*padInfo = PadInfoFromStr(params->KernelArgs[++i]);
+		}
 		else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) {
 			ParameterUsage();
 			exit(1);
@@ -456,6 +465,11 @@ static void FNAME(Init)(LatticeDesc * ld, KernelData ** kernelData, Parameters *
 	kdlr->nConsecThreadIndices = 0;
 #endif
 
+	int blk[3] = { 0 };
+	PadInfo * padInfo = NULL;
+
+	ParseParameters(params, blk, &kdlr->nTmpArray, &padInfo);
+
 	// Ajust the dimensions according to padding, if used.
 	kd->Dims[0] = kd->GlobalDims[0] = ld->Dims[0];
 	kd->Dims[1] = kd->GlobalDims[1] = ld->Dims[1];
@@ -471,8 +485,12 @@ static void FNAME(Init)(LatticeDesc * ld, KernelData ** kernelData, Parameters *
 	int nCells = ld->nFluid;
 	int nFluid = ld->nFluid;
 
+	{
+		nCells = PadCellsAndReport(nCells, sizeof(PdfT), &padInfo);
+		PadInfoFree(padInfo); padInfo = NULL;
+	}
+
 	// We padd each stream of a PDF array for a complete cache line.
-	// TODO: padding for L1/L2 and TLB.
 	nCells = nCells + (8 - nCells % 8);
 
 	Assert(nCells % VSIZE == 0);
@@ -482,10 +500,6 @@ static void FNAME(Init)(LatticeDesc * ld, KernelData ** kernelData, Parameters *
 
 	PdfT * pdfs[2];
 
-	int blk[3] = { 0 };
-
-	ParseParameters(params, blk, &kdlr->nTmpArray);
-
 	if (blk[0] == 0) blk[0] = lX;
 	if (blk[1] == 0) blk[1] = lY;
 	if (blk[2] == 0) blk[2] = lZ;
@@ -572,18 +586,18 @@ static void FNAME(Init)(LatticeDesc * ld, KernelData ** kernelData, Parameters *
 
 	// Blocking is implemented via setup of the adjacency list. The kernel later will
 	// walk through the lattice blocked automatically.
-	for (int bZ = 0; bZ < lZ; bZ += blk[2]) {
-	for (int bY = 0; bY < lY; bY += blk[1]) {
 	for (int bX = 0; bX < lX; bX += blk[0]) {
+	for (int bY = 0; bY < lY; bY += blk[1]) {
+	for (int bZ = 0; bZ < lZ; bZ += blk[2]) {
 
 		int eX = MIN(bX + blk[0], lX);
 		int eY = MIN(bY + blk[1], lY);
 		int eZ = MIN(bZ + blk[2], lZ);
 
 
-		for (int z = bZ; z < eZ; ++z) {
-		for (int y = bY; y < eY; ++y) {
 		for (int x = bX; x < eX; ++x) {
+		for (int y = bY; y < eY; ++y) {
+		for (int z = bZ; z < eZ; ++z) {
 
 			latticeIndex = L_INDEX_4(lDims, x, y, z);
 
@@ -627,6 +641,15 @@ static void FNAME(Init)(LatticeDesc * ld, KernelData ** kernelData, Parameters *
 	// Loop over all fluid nodes and compute the indices to the neighboring
 	// PDFs for configured data layout (AoS/SoA).
 	// Parallelized loop to ensure correct NUMA placement.
+	#ifdef _OPENMP
+		#pragma omp parallel for
+	#endif
+	for (int index = 0; index < nFluid; ++index) {
+		for (int d = 0; d < N_D3Q19_IDX; ++d) {
+			adjList[index * N_D3Q19_IDX + d] = -1;
+		}
+	}
+
 	// #ifdef _OPENMP  --> add line continuation
 	// 	#pragma omp parallel for default(none)
 	// 		shared(nFluid, nCells, coords, D3Q19_INV, D3Q19_X, D3Q19_Y, D3Q19_Z,
diff --git a/src/Geometry.c b/src/Geometry.c
index 31c985a..0450c56 100644
--- a/src/Geometry.c
+++ b/src/Geometry.c
@@ -72,6 +72,9 @@ void GeoCreateByStr(const char * geometryType, int dims[3], int periodic[3], Lat
 
 		typeDetails = &tmp;
 	}
+	else if (strncasecmp("fluid", geometryType, 5) == 0) {
+		type = GEO_TYPE_FLUID;
+	}
 	else {
 		printf("ERROR: unknown geometry specified.\n");
 		Verify(0);
@@ -99,7 +102,7 @@ void GeoCreateByType(GEO_TYPES type, void * typeDetails, int dims[3], int period
 	Assert(type >= GEO_TYPE_MIN);
 	Assert(type <= GEO_TYPE_MAX);
 
-	const char * geoTypeStr[] = { "box", "channel", "pipe", "blocks" };
+	const char * geoTypeStr[] = { "box", "channel", "pipe", "blocks", "fluid" };
 
 	printf("# geometry: %d x %d x %d nodes, type %d %s\n", dims[0], dims[1], dims[2], type, geoTypeStr[type]);
 
@@ -127,6 +130,10 @@ void GeoCreateByType(GEO_TYPES type, void * typeDetails, int dims[3], int period
 	if (type == GEO_TYPE_CHANNEL || type == GEO_TYPE_BLOCKS || type == GEO_TYPE_PIPE) {
 		periodic[0] = 1;
 	}
+	else if (type == GEO_TYPE_FLUID) {
+		// Actually this geometry is non-periodic, but we need fluid at the boundary.
+		periodic[0] = 1; periodic[1] = 1; periodic[2] = 1;
+	}
 
 	// Walls or periodic on first and last x plane.
 	for (int z = 0; z < dims[2]; ++z) {
@@ -168,7 +175,7 @@ void GeoCreateByType(GEO_TYPES type, void * typeDetails, int dims[3], int period
 	}
 
 	if (type == GEO_TYPE_CHANNEL) {
-		periodic[0] = 1;
+		// Nothing todo here.
 	}
 	else if (type == GEO_TYPE_PIPE) {
 		#define SQR(a) ((a)*(a))
@@ -192,34 +199,38 @@ void GeoCreateByType(GEO_TYPES type, void * typeDetails, int dims[3], int period
 
 		int blockSize = *((int *)typeDetails);
 
+#if 0
 		if (blockSize == 0) {
 			blockSize = 8;
 		}
+#endif
+		if (blockSize > 0) {
 
-		int dimMin = dims[0];
+			int dimMin = dims[0];
 
-		if (dims[1] < dimMin) dimMin = dims[1];
-		if (dims[2] < dimMin) dimMin = dims[2];
+			if (dims[1] < dimMin) dimMin = dims[1];
+			if (dims[2] < dimMin) dimMin = dims[2];
 
-		if (blockSize < 0 || blockSize > dimMin / 2) {
-			printf("ERROR: block size for geometry must be > 0 and smaller than half of the smalest dimension.\n");
-			// TODO: find a better solution for handling errors in here.
-			Verify(0);
-		}
+			if (blockSize < 0 || blockSize > dimMin / 2) {
+				printf("ERROR: block size for geometry must be > 0 and smaller than half of the smalest dimension.\n");
+				// TODO: find a better solution for handling errors in here.
+				Verify(0);
+			}
 
-		// Number of blocks in x, y, and z direction.
-		int nbx = blockSize, nby = blockSize, nbz = blockSize;
+			// Number of blocks in x, y, and z direction.
+			int nbx = blockSize, nby = blockSize, nbz = blockSize;
 
-		for (int z = 0; z < dims[2]; ++z) {
-				if ((z % (2 * nbz)) < nbz) continue;
+			for (int z = 0; z < dims[2]; ++z) {
+					if ((z % (2 * nbz)) < nbz) continue;
 
-			for (int y = 0; y < dims[1]; ++y) {
-				if ((y % (2 * nby)) < nby) continue;
+				for (int y = 0; y < dims[1]; ++y) {
+					if ((y % (2 * nby)) < nby) continue;
 
-				for (int x = 0; x < dims[0]; ++x) {
+					for (int x = 0; x < dims[0]; ++x) {
 
-					if ((x % (2 * nbx)) >= nbx) {
-						lattice[L_INDEX_4(dims, x, y, z)] 	= LAT_CELL_OBSTACLE;
+						if ((x % (2 * nbx)) >= nbx) {
+							lattice[L_INDEX_4(dims, x, y, z)] 	= LAT_CELL_OBSTACLE;
+						}
 					}
 				}
 			}
diff --git a/src/Geometry.h b/src/Geometry.h
index 11d72ff..a9bf260 100644
--- a/src/Geometry.h
+++ b/src/Geometry.h
@@ -37,7 +37,8 @@ typedef enum GEO_TYPES_ {
 	GEO_TYPE_PIPE = 2,
 	GEO_TYPE_BLOCKS = 3,	// Expects a pointer to an integer, holding the
 						    // value of the block size as type detail.
-	GEO_TYPE_MAX = 3
+	GEO_TYPE_FLUID = 4,
+	GEO_TYPE_MAX = 4
 } GEO_TYPES;
 
 
diff --git a/src/Kernel.h b/src/Kernel.h
index eddae64..99e126b 100644
--- a/src/Kernel.h
+++ b/src/Kernel.h
@@ -56,7 +56,7 @@
 #endif
 
 #ifdef PROP_MODEL_AA
-	#define PROP_MODEL_NAME AA
+	#define PROP_MODEL_NAME Aa
 #endif
 
 
diff --git a/src/KernelFunctions.h b/src/KernelFunctions.h
index 2ee063a..557c653 100644
--- a/src/KernelFunctions.h
+++ b/src/KernelFunctions.h
@@ -28,6 +28,8 @@
 #define __KERNEL_FUNCTIONS_H__
 
 #include "BenchKernelD3Q19.h"
+#include "BenchKernelD3Q19Aa.h"
+#include "BenchKernelD3Q19AaVec.h"
 #include "BenchKernelD3Q19List.h"
 #include "BenchKernelD3Q19ListAa.h"
 #include "BenchKernelD3Q19ListAaRia.h"
@@ -133,6 +135,22 @@ KernelFunctions g_kernels[] =
 		.Init   = D3Q19BlkInit_PullAoS,
 		.Deinit = D3Q19BlkDeinit_PullAoS
 	},
+	{
+		.Name	= "aa-aos",
+		.Init	= D3Q19AaInit_AaAoS,
+		.Deinit	= D3Q19AaDeinit_AaAoS
+	},
+	{
+		.Name	= "aa-soa",
+		.Init	= D3Q19AaInit_AaSoA,
+		.Deinit	= D3Q19AaDeinit_AaSoA
+	},
+	{
+		.Name	= "aa-vec-soa",
+		.Init	= D3Q19AaVecInit_AaSoA,
+		.Deinit	= D3Q19AaVecDeinit_AaSoA
+	}
+
 };
 
 #endif // __KERNEL_FUNCTIONS_H__
diff --git a/src/Lattice.c b/src/Lattice.c
new file mode 100644
index 0000000..bc218eb
--- /dev/null
+++ b/src/Lattice.c
@@ -0,0 +1,59 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#include "Lattice.h"
+
+// Dumps the layers [zStart, zStop] of lattice as ASCII.
+// Specify zStart = -1 and zStop = -1 as begin/end of lattice.
+
+void LatDumpAscii(LatticeDesc * ld, int zStart, int zStop)
+{
+	Assert(ld != NULL);
+
+	const char strLatCellType[] = "X.IxO"; // X = Obstacle, . = Fluid, I = inlet, O = outlet
+
+	int localZStart = zStart;
+	int localZStop  = zStop;
+
+	int * dims = ld->Dims;
+	LatticeT * lattice = ld->Lattice;
+
+	if (localZStart == -1) localZStart = 0;
+	if (localZStop  == -1) localZStop  = dims[2] - 1;
+
+	for (int z = localZStop; z >= localZStart; --z) {
+		printf("plane % 2d\n", z);
+
+		for (int y = dims[1] - 1; y >= 0; --y) {
+			printf(" %2d  ", y);
+			for (int x = 0; x < dims[0]; ++x) {
+				printf("%c", strLatCellType[lattice[L_INDEX_4(dims, x, y, z)]]);
+			}
+			printf("\n");
+		}
+	}
+}
+
diff --git a/src/Lattice.h b/src/Lattice.h
index 99b3fc3..c23b70f 100644
--- a/src/Lattice.h
+++ b/src/Lattice.h
@@ -74,5 +74,6 @@ static inline int L_INDEX_4(int dims[3], int x, int y, int z)
 	return z * dims[0] * dims[1] + y * dims[0] + x;
 }
 
+void LatDumpAscii(LatticeDesc * ld, int zStart, int zStop);
 
 #endif // __LATTICE_H__
diff --git a/src/Main.c b/src/Main.c
index 1525ad5..65a4035 100644
--- a/src/Main.c
+++ b/src/Main.c
@@ -468,7 +468,7 @@ int main(int argc, char * argv[])
 			int threadId = omp_get_thread_num();
 			int err;
 
-			err = PinCurrentThreadByCpuList(pinString, 0, 0, threadId);
+			err = PinCurrentThreadByCpuList(pinString, threadId);
 
 			if (err) {
 				printf("ERROR [thread %d]: pinning failed.\n", threadId);
@@ -557,9 +557,9 @@ int main(int argc, char * argv[])
 		perf, nThreads, duration, cd.MaxIterations, ld.nFluid / 1e6,
 		geometryType, kernelToUse,
 #ifdef VERIFICATION
-		"# VERIFICATION"
+		"VERIFICATION"
 #else
-		"# benchmark"
+		"B"
 #endif
 	);
 
diff --git a/src/Makefile b/src/Makefile
index a9889a5..99ca902 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -29,7 +29,9 @@
 # CONFIG ?= linux-gcc
 CONFIG ?= linux-intel
 
-BUILD ?= debug
+BUILD ?= release
+
+BENCHMARK ?= on
 
 # If we compile for BENCHMARK all verifcation and statistics are disabled,
 # if not specified otherwise.
@@ -37,13 +39,12 @@ ifeq (on,$(BENCHMARK))
   VERIFICATION ?= off
   VTK_OUTPUT   ?= off
   STATISTICS   ?= off
+else
+  VERIFICATION ?= on
+  STATISTICS   ?= on
+  VTK_OUTPUT   ?= on
 endif
 
-VERIFICATION ?= on
-
-STATISTICS ?= on
-
-VTK_OUTPUT ?= on
 
 COLOR ?= on
 
@@ -95,7 +96,7 @@ DEP_DIR         = obj/$(CONFIG)-$(BUILD)$(TAG)-dep
 
 # Sources to consider.
 SOURCES_C = Main.c Memory.c Geometry.c Kernel.c \
-            Vtk.c Pinning.c
+            Vtk.c Pinning.c Padding.c Lattice.c
 
 # ------------------------------------------------------------------------
 # NO CHANGE BELOW SHOULD BE NEEDED
@@ -127,7 +128,13 @@ OBJ_C = $(foreach SOURCE,$(SOURCES_C),$(OBJECT_DIR)/$(SOURCE:%.c=%.o)) \
 		$(OBJECT_DIR)/BenchKernelD3Q19ListAaPv_PushSoA.o \
 		$(OBJECT_DIR)/BenchKernelD3Q19ListAaPvCommon_PushSoA.o \
 		$(OBJECT_DIR)/BenchKernelD3Q19ListPullSplitNt_PullSoA.o \
-		$(OBJECT_DIR)/BenchKernelD3Q19ListPullSplitNtCommon_PullSoA.o
+		$(OBJECT_DIR)/BenchKernelD3Q19ListPullSplitNtCommon_PullSoA.o \
+		$(OBJECT_DIR)/BenchKernelD3Q19Aa_AaAoS.o \
+		$(OBJECT_DIR)/BenchKernelD3Q19AaCommon_AaAoS.o \
+		$(OBJECT_DIR)/BenchKernelD3Q19Aa_AaSoA.o \
+		$(OBJECT_DIR)/BenchKernelD3Q19AaCommon_AaSoA.o \
+		$(OBJECT_DIR)/BenchKernelD3Q19AaVec_AaSoA.o \
+		$(OBJECT_DIR)/BenchKernelD3Q19AaVecCommon_AaSoA.o
 
 OBJ = $(OBJ_C)
 
@@ -203,9 +210,22 @@ ifeq (on,$(LIKWID))
   LD_LIBS  += $(LIKWID_LIB) -llikwid
 endif
 
+# ARCH can only be assigned a string without a space. The space is escaped as
+# a comma which we have to replace here.
+
+ifdef TARCH
+  ARCH_W_COMMA := $(TARCH)
+
+  COMMA_ := ,
+  override TARCH := $(subst $(COMMA_), ,$(TARCH))
+
+  $(info $(shell echo -e "\n$(COLOR_GREEN)INFO: Automatically expanding comma in TARCH variable:  TARCH=$(TARCH).$(COLOR_NO)\n"))
+
+endif
+
 .phony: all clean clean-all
 
-$(info $(shell $(ECHO_E) "# Configuration:   CONFIG=$(COLOR_CYAN)$(CONFIG)$(COLOR_NO)   BUILD=$(COLOR_CYAN)$(BUILD)$(COLOR_NO)   VERIFICATION=$(COLOR_CYAN)$(VERIFICATION)$(COLOR_NO)   STATISTICS=$(COLOR_CYAN)$(STATISTICS)$(COLOR_NO)   VTK_OUTPUT=$(COLOR_CYAN)$(VTK_OUTPUT)$(COLOR_NO)  OPENMP=$(COLOR_CYAN)$(OPENMP)$(COLOR_NO)   ISA=$(COLOR_CYAN)$(ISA)$(COLOR_NO)   LIKWID=$(COLOR_CYAN)$(LIKWID)$(COLOR_NO)   building $(.DEFAULT_GOAL)..."))
+$(info $(shell $(ECHO_E) "# Configuration:   CONFIG=$(COLOR_CYAN)$(CONFIG)$(COLOR_NO)   BUILD=$(COLOR_CYAN)$(BUILD)$(COLOR_NO)   VERIFICATION=$(COLOR_CYAN)$(VERIFICATION)$(COLOR_NO)   STATISTICS=$(COLOR_CYAN)$(STATISTICS)$(COLOR_NO)   VTK_OUTPUT=$(COLOR_CYAN)$(VTK_OUTPUT)$(COLOR_NO)  OPENMP=$(COLOR_CYAN)$(OPENMP)$(COLOR_NO)   ISA=$(COLOR_CYAN)$(ISA)$(COLOR_NO)   LIKWID=$(COLOR_CYAN)$(LIKWID)$(COLOR_NO)   TARCH=$(COLOR_CYAN)$(TARCH)$(COLOR_NO)   building $(.DEFAULT_GOAL)..."))
 
 
 $(info # Object dir:      $(OBJECT_DIR))
@@ -265,6 +285,13 @@ $(OBJECT_DIR)/%_AoS.o: %.c $(REBUILD_DEPS)
 	@$(ECHO_E) "compiling: $(COLOR_CYAN)$@$(COLOR_NO)    $(COLOR_MAGENTA)DATA_LAYOUT_AOS$(COLOR_NO)"
 	$(CC) $(strip $(C_FLAGS)) $(strip $(PP_FLAGS)) $(D)DATA_LAYOUT_AOS -c $< -o $@
 
+$(OBJECT_DIR)/%_AaAoS.o: %.c $(REBUILD_DEPS)
+	@$(ECHO_E) "compiling: $(COLOR_CYAN)$@$(COLOR_NO)    $(COLOR_MAGENTA)DATA_LAYOUT_AOS$(COLOR_NO) $(COLOR_MAGENTA)PROP_MODEL_AA$(COLOR_NO)"
+	$(CC) $(strip $(C_FLAGS)) $(strip $(PP_FLAGS)) $(D)DATA_LAYOUT_AOS $(D)PROP_MODEL_AA -c $< -o $@
+
+$(OBJECT_DIR)/%_AaSoA.o: %.c $(REBUILD_DEPS)
+	@$(ECHO_E) "compiling: $(COLOR_CYAN)$@$(COLOR_NO)    $(COLOR_MAGENTA)DATA_LAYOUT_SOA$(COLOR_NO) $(COLOR_MAGENTA)PROP_MODEL_AA$(COLOR_NO)"
+	$(CC) $(strip $(C_FLAGS)) $(strip $(PP_FLAGS)) $(D)DATA_LAYOUT_SOA $(D)PROP_MODEL_AA -c $< -o $@
 
 $(OBJECT_DIR)/%.o: %.c $(REBUILD_DEPS)
 	@$(ECHO_E) "compiling: $(COLOR_CYAN)$@$(COLOR_NO)"
diff --git a/src/Memory.c b/src/Memory.c
index d9d1f94..e7dda39 100644
--- a/src/Memory.c
+++ b/src/Memory.c
@@ -24,11 +24,24 @@
 //  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
 //
 // --------------------------------------------------------------------------
+// TODO: make configurable
+#define HAVE_HUGE_PAGES
+
+
+#ifdef HAVE_HUGE_PAGES
+#define _BSD_SOURCE
+#endif
+
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>  // strerror
 #include <errno.h>
 
+
+#ifdef HAVE_HUGE_PAGES
+#include <sys/mman.h> // madvise
+#endif
+
 #include "Base.h"
 #include "Memory.h"
 
@@ -60,6 +73,19 @@ int MemAllocAligned(void ** ptr, size_t bytesToAlloc, size_t alignmentBytes)
 		exit(1);
 	}
 
+#ifdef HAVE_HUGE_PAGES
+
+	if (alignmentBytes % 4096 == 0) {
+		ret = madvise(*ptr, bytesToAlloc, MADV_HUGEPAGE);
+
+		if (ret != 0) {
+			DEBUG_BREAK_POINT();
+			Error("madvise(%p, %lu, MADV_HUGEPAGE) failed: %d - %s.\n", *ptr, bytesToAlloc, errno, strerror(errno));
+			exit(1);
+		}
+	}
+#endif
+
 	return 0;
 }
 
diff --git a/src/Padding.c b/src/Padding.c
new file mode 100644
index 0000000..97a98ea
--- /dev/null
+++ b/src/Padding.c
@@ -0,0 +1,300 @@
+#include "Base.h"
+#include "Kernel.h"
+#include "Padding.h"
+
+#include <errno.h>
+#include <limits.h>
+
+// Generates a PadInfo struct from padStr. If padStr is NULL then a NULL
+// pointer is returned.
+//
+// Padding string is either "auto" or in the format of
+// <modulus>+<remainder>(,<modulus>+<remainder>)*.
+//
+// If modulus is 0 then the remainder will just be added and no modulo will be
+// applied.
+//
+// Returned PadInfo structs must be freed with PadInfoFree.
+
+PadInfo * PadInfoFromStr(const char * padStr)
+{
+	if (padStr == NULL) {
+		return NULL;
+	}
+	else if (!strncmp("auto", padStr, 4)) {
+
+		PadInfo * padInfo = (PadInfo *)malloc(sizeof(PadInfo) + 4 * sizeof(int));
+		Assert(padInfo != NULL);
+
+		padInfo->nEntries = 2;
+		padInfo->Modulus = (int *)(padInfo + 1);
+		padInfo->Offset = ((int *)(padInfo + 1)) + 2;
+
+		// Intel TLB 2 MiB pages
+		padInfo->Modulus[0] = 2 * 1024 * 1024 * 8;
+		padInfo->Offset[0]  = 2 * 1024 * 1024 + 512;
+
+		// Intel L2, 256 sets
+		padInfo->Modulus[1] = 256 * 64;
+		padInfo->Offset[1]  = 128;
+
+		return padInfo;
+	}
+	else if (!strncmp("no", padStr, 2) || !strncmp("off", padStr, 3)) {
+		PadInfo * padInfo = (PadInfo *)malloc(sizeof(PadInfo));
+		Assert(padInfo != NULL);
+
+		padInfo->nEntries = 0;
+		padInfo->Modulus  = NULL;
+		padInfo->Offset   = NULL;
+
+		return padInfo;
+	}
+
+	// Count number of commas. Number of commas + 1 gives us the number of
+	// padding entries.
+	const char * tmp = padStr;
+	int nCommas = 0;
+
+	while (*tmp != 0x00) {
+		if (*tmp == ',') ++nCommas;
+		++tmp;
+	}
+
+	// Number of padding entries = nCommas + 1.
+	size_t padInfoSize = sizeof(PadInfo) + (nCommas + 1) * 2 * sizeof(int);
+	PadInfo * padInfo = (PadInfo *)malloc(padInfoSize);
+	Assert(padInfo != NULL);
+
+	memset(padInfo, 0x00, padInfoSize);
+
+	padInfo->Modulus = (int *)(padInfo + 1);
+	padInfo->Offset = ((int *)(padInfo + 1)) + nCommas + 1;
+
+	tmp = padStr;
+
+	int padEntryIndex = 0;
+	int entriesSeen   = 0;
+
+	int modulusSeen   = 0;
+	int offsetSeen    = 0;
+
+	// TODO: parsing is currently a mess. We assume a fixed format, which
+	// should be easier to parse now.
+	while (*tmp != 0x00) {
+
+		if (*tmp == ',') {
+			// Check if modulus was seen in this section, then move to the next.
+
+			if (!modulusSeen) {
+				fprintf(stderr, "ERROR: modulus missing before next section in padding string.\n");
+				exit(1);
+			}
+
+			modulusSeen = 0;
+			offsetSeen  = 0;
+			++padEntryIndex;
+			++tmp;
+		}
+		else if (*tmp == '+') {
+			// Check if modulus was seen in this section and no offset yet.
+
+			if (!modulusSeen) {
+				fprintf(stderr, "ERROR: modulus missing before offset in padding string.\n");
+				exit(1);
+			}
+
+			if (offsetSeen) {
+				fprintf(stderr, "ERROR: offset is only allowed to be specified once per section in padding string.\n");
+				exit(1);
+			}
+
+			++tmp;
+		}
+		else if (*tmp >= '0' && *tmp <= '9') {
+			// Parse number and check for all the errors.
+
+			char * endPtr;
+
+			errno = 0;
+
+			int value = strtol(tmp, &endPtr, 10);
+
+			if ((value == LONG_MIN || value == LONG_MAX) && errno != 0) {
+				fprintf(stderr, "ERROR: over- or underflow in modulus/offset of padding string.\n");
+				exit(1);
+			}
+			else if (value == 0 && errno != 0) {
+				fprintf(stderr, "ERROR: error parsing modulus/offset of padding string: %d - %s\n", errno, strerror(errno));
+				exit(1);
+			}
+			else if (tmp == endPtr) {
+				// No digits found: empty string or invalid pad string.
+				fprintf(stderr, "ERROR: modulus or offset missing in padding string.\n");
+				exit(1);
+			}
+
+			if (value < 0) {
+				fprintf(stderr, "ERROR: modulus and offset must be >= 0 in padding string.\n");
+				exit(1);
+			}
+
+			if (!modulusSeen) {
+				if (offsetSeen) {
+					fprintf(stderr, "ERROR: offset already seen, but not modulus in padding string.\n");
+					exit(1);
+				}
+
+				Verify(padEntryIndex < (nCommas + 1));
+
+				padInfo->Modulus[padEntryIndex] = value;
+
+				modulusSeen = 1;
+				++entriesSeen;
+			}
+			else {
+				if (offsetSeen) {
+					fprintf(stderr, "ERROR: offset is only allowed to be specified once per section in padding string.\n");
+					exit(1);
+				}
+
+				Verify(padEntryIndex < (nCommas + 1));
+
+				padInfo->Offset[padEntryIndex] = value;
+
+				offsetSeen = 1;
+			}
+			// No increment of tmp needed, as endPtr points already to the first
+			// character which is not part of the number.
+			tmp = endPtr;
+		}
+		else {
+			fprintf(stderr, "ERROR: padding string contains invalid characters.\n");
+			exit(1);
+		}
+
+	}
+
+	if (entriesSeen != nCommas + 1) {
+		fprintf(stderr, "ERROR: did not find all padding entries.\n");
+		exit(1);
+	}
+
+	for (int i = 0; i < nCommas + 1; ++i) {
+		if (padInfo->Offset[i] >= padInfo->Modulus[i]) {
+			fprintf(stderr, "ERROR: offset in padding entry %d is equal or larger than modulus.\n", i);
+			exit(1);
+		}
+	}
+
+	padInfo->nEntries = entriesSeen;
+
+	return padInfo;
+}
+
+int PadCells(int nCells, int cellSizeBytes, PadInfo ** padInfoPtr)
+{
+	Assert(padInfoPtr != NULL);
+
+	// If the padInfo is NULL determine if we use the "auto" configuration.
+	const int defaultAutoPadding = 1;
+
+
+	if (*padInfoPtr == NULL) {
+		if (!defaultAutoPadding) {
+			return nCells;
+		}
+
+		*padInfoPtr = PadInfoFromStr("auto");
+	}
+
+	PadInfo * padInfo = *padInfoPtr;
+
+	Assert(padInfo->nEntries >= 0);
+
+	for (int i = 0; i < padInfo->nEntries; ++i) {
+		Assert(padInfo->Modulus[i] >=  0);
+		Assert(padInfo->Offset[i]  >= 0);
+
+		int nPadCells = 0;
+
+		if (padInfo->Modulus[i] == 0) {
+			// When the modulus is just zero then we only add the offset.
+			nPadCells = padInfo->Offset[i] * cellSizeBytes;
+		}
+		else {
+			int nModCells    = padInfo->Modulus[i] / cellSizeBytes;
+
+			if (nModCells == 0) {
+				fprintf(stderr, "ERROR: modulus of %d byte in padding entry %d becomes zero for PDF size %d byte.\n",
+						padInfo->Modulus[i], i, cellSizeBytes);
+				exit(1);
+			}
+
+			int nOffsetCells = padInfo->Offset[i]  / cellSizeBytes;
+			int nRemainder   = nCells % nModCells;
+
+			nPadCells = (nOffsetCells + nModCells - nRemainder) % nModCells;
+		}
+
+		nCells += nPadCells;
+	}
+
+	return nCells;
+}
+
+void PadInfoPrint(PadInfo * padInfo, FILE * f, const char * prefix)
+{
+	Assert(padInfo != NULL);
+	Assert(padInfo->nEntries >= 0);
+	Assert(f != NULL);
+	Assert(prefix != NULL);
+
+	for (int i = 0; i < padInfo->nEntries; ++i) {
+		fprintf(f, "%sm: %10d b  o: %10d b   m: %f KiB  o: %f KiB   m: %f MiB  o: %f MiB\n",
+			prefix,
+			padInfo->Modulus[i],                   padInfo->Offset[i],
+			padInfo->Modulus[i] / 1024.0,          padInfo->Offset[i] / 1024.0,
+			padInfo->Modulus[i] / 1024.0 / 1024.0, padInfo->Offset[i] / 1024.0 / 1024.0);
+	}
+
+	return;
+}
+
+void PadInfoFree(PadInfo * padInfo)
+{
+	if (padInfo != NULL) {
+		free(padInfo);
+	}
+
+	return;
+}
+
+// Returns the new padded cell size and reports if padding was performed and how.
+
+int PadCellsAndReport(int nCells, int cellSizeBytes, PadInfo ** padInfoPtr)
+{
+	// Apply padding.
+	int nNewCells = PadCells(nCells, sizeof(PdfT), padInfoPtr);
+
+	if (nCells != nNewCells) {
+		printf("# padding info:\n");
+		PadInfoPrint(*padInfoPtr, stdout, "#                        ");
+		// int nPaddedCells = nNewCells - nCells;
+		// printf("# padding per dir.:      %10d nodes (%f MiB)\n", nPaddedCells, nPaddedCells / 1024.0 / 1024.0 * sizeof(PdfT));
+		// printf("# padding total:         %10d nodes (%f MiB)\n", 19 * nPaddedCells, 19 * nPaddedCells / 1024.0 / 1024.0 * sizeof(PdfT));
+
+		int nPadCells = nNewCells - nCells;
+
+		printf("#\n# padding %d nodes with %d nodes, %f MiB per direction, %f MiB in total\n",
+			nCells,
+			nPadCells,
+			nPadCells * sizeof(PdfT) / 1024.0 / 1024.0,
+			nPadCells * sizeof(PdfT) / 1024.0 / 1024.0 * 19);
+	}
+	else {
+		printf("# padding info:          no padding used\n");
+	}
+
+	return nNewCells;
+}
diff --git a/src/Padding.h b/src/Padding.h
new file mode 100644
index 0000000..55fa33e
--- /dev/null
+++ b/src/Padding.h
@@ -0,0 +1,19 @@
+#ifndef __PADDING_H__
+#define __PADDING_H__
+
+typedef struct PadInfo_
+{
+	int nEntries;
+	int * Modulus;
+	int * Offset;
+} PadInfo;
+
+PadInfo * PadInfoFromStr(const char * padStr);
+
+int PadCells(int nCells, int cellSizeBytes, PadInfo ** padInfo);
+int PadCellsAndReport(int nCells, int cellSizeBytes, PadInfo ** padInfoPtr);
+
+void PadInfoPrint(PadInfo * padInfo, FILE * f, const char * prefix);
+void PadInfoFree(PadInfo * padInfo);
+
+#endif // __PADDING_H__
diff --git a/src/Pinning.c b/src/Pinning.c
index 0cf70ea..de9be3f 100644
--- a/src/Pinning.c
+++ b/src/Pinning.c
@@ -34,9 +34,6 @@
 #include "Base.h"
 #include "Pinning.h"
 
-
-
-
 // -----------------------------------------------------------------------
 //
 // Binds the calling thread to specified core.
@@ -73,8 +70,7 @@ int PinCurrentThreadToCore(int coreNumber)
 //
 // -----------------------------------------------------------------------
 
-int PinCurrentThreadByEnvVar(const char * envVarName,
-	int mpiRank, int nodeRank, int threadNumber)
+int PinCurrentThreadByEnvVar(const char * envVarName, int threadNumber)
 {
 	const char * envVarValue;
 	int core;
@@ -82,14 +78,12 @@ int PinCurrentThreadByEnvVar(const char * envVarName,
 	envVarValue = getenv(envVarName);
 
 	if (envVarValue == NULL) {
-		if (mpiRank == 0) {
-			Print("skip pinning: env var %s not set\n", envVarName);
-		}
+		printf("WARNING: skip pinning: env var %s not set\n", envVarName);
 
 		return 0;
 	}
 
-	core = PinParseCpuList(envVarValue, mpiRank, nodeRank, threadNumber);
+	core = PinParseCpuList(envVarValue, threadNumber);
 
 	if (core < 0) {
 		return core;
@@ -107,20 +101,17 @@ int PinCurrentThreadByEnvVar(const char * envVarName,
 //
 // -----------------------------------------------------------------------
 
-int PinCurrentThreadByCpuList(const char * cpuList,
-	int mpiRank, int nodeRank, int threadNumber)
+int PinCurrentThreadByCpuList(const char * cpuList, int threadNumber)
 {
 	int core;
 
 	if (cpuList == NULL) {
-		if (mpiRank == 0) {
-			printf("ERROR: cpu list is NULL.\n");
-		}
+		printf("ERROR: cpu list is NULL.\n");
 
 		exit(1);
 	}
 
-	core = PinParseCpuList(cpuList, mpiRank, nodeRank, threadNumber);
+	core = PinParseCpuList(cpuList, threadNumber);
 
 	if (core < 0) {
 		return core;
@@ -133,31 +124,13 @@ int PinCurrentThreadByCpuList(const char * cpuList,
 // -----------------------------------------------------------------------
 //
 // Parses the provided cpu list and returns the core number for the
-// specified MPI rank, local rank, and thread.
-//
-// The cpu list has for example a format of: 0,1,2 or 0,1,2_3,4,5
-//
-// Blocks (0,1,2 or 3,4,5) separated by "_" specify pinning inside a
-// node rank. The first block maps to node rank 1, the second to node
-// rank 2, etc.
-//
-// Inside a block the core numbers specify where the threads should
-// be pinned to. They are separated by "," and the first number maps
-// to the first core, the second number to the second core, etc.
-//
-// For example: 0,2,4_6,8,10
+// specified thread.
 //
-// Node rank 0 thread 0 pinned to core  0
-//           0        1                 2
-//           0        2                 4
-//           1        0                 6
-//           1        1                 8
-//           1        2                10
+// The cpu list has for example a format of: 0,1,2
 //
 // -----------------------------------------------------------------------
 
-int PinParseCpuList(const char * cpuList,
-		int mpiRank, int nodeRank, int threadNumber)
+int PinParseCpuList(const char * cpuList, int threadNumber)
 {
 	int cpu = -1;
 
@@ -168,8 +141,8 @@ int PinParseCpuList(const char * cpuList,
 	const char * c = cpuList;
 
 	// Ensure only valid characters are in the cpu list.
-	// Cpu list is in the format of "0,1,2_3,4,5".
-	while (((*c >= '0' && *c <= '9') || *c == ',' || *c == '_')) {
+	// Cpu list is in the format of "0,1,2,3,4,5".
+	while (((*c >= '0' && *c <= '9') || *c == ',')) {
 		++c;
 	}
 
@@ -180,19 +153,6 @@ int PinParseCpuList(const char * cpuList,
 
 	c = cpuList;
 
-	int i = 0;
-
-	// Move variable c after the "nodeRank"th "_" in the cpu list.
-	while (i < nodeRank && *c != 0x00) {
-    	if (*c == '_') ++i;
-		++c;
-	}
-
-	if (i != nodeRank || *c < '0' || *c > '9') {
-		// Cpu list for this node rank not found.
-		return -3;
-  	}
-
 	// Now find the core for the specified thread.
 
 	int t = 0;
@@ -201,11 +161,6 @@ int PinParseCpuList(const char * cpuList,
 		if (*c == ',') {
 			++t;
 		}
-		else if (*c == '_') {
-			// Unexpected character at this position.
-			break;
-		}
-
 		++c;
 	}
 
diff --git a/src/Pinning.h b/src/Pinning.h
index 40225c1..877048d 100644
--- a/src/Pinning.h
+++ b/src/Pinning.h
@@ -30,14 +30,11 @@
 
 int PinCurrentThreadToCore(int coreNumber);
 
-int PinParseCpuList(const char * cpuList,
-					int mpiRank, int nodeRank, int threadNumber);
+int PinParseCpuList(const char * cpuList, int threadNumber);
 
-int PinCurrentThreadByEnvVar(const char * envVarName,
-	int mpiRank, int nodeRank, int threadNumber);
+int PinCurrentThreadByEnvVar(const char * envVarName, int threadNumber);
 
-int PinCurrentThreadByCpuList(const char * cpuList,
-	int mpiRank, int nodeRank, int threadNumber);
+int PinCurrentThreadByCpuList(const char * cpuList, int threadNumber);
 
 int PinCurrentCore();
 
diff --git a/src/test.sh b/src/test.sh
index 7a76f51..a0e8ad6 100755
--- a/src/test.sh
+++ b/src/test.sh
@@ -54,7 +54,7 @@ Config="$1"
 
 make clean-all
 make -j CONFIG=$Config TAG=$XTag-debug
-make -j CONFIG=$Config BUILD=$Build TAG=$XTag-v
+make -j CONFIG=$Config BUILD=$Build TAG=$XTag-v VERIFICATION=on
 make -j CONFIG=$Config BUILD=$Build TAG=$XTag-b BENCHMARK=on
 
 BinaryV="../bin/lbmbenchk-$Config-$Build$XTag-v"
-- 
2.43.0