From: Markus Wittmann
Date: Tue, 15 May 2018 11:57:54 +0000 (+0200)
Subject: merge with kernels from MH's master thesis
X-Git-Url: http://git.rrze.uni-erlangen.de/gitweb/?p=LbmBenchmarkKernelsPublic.git;a=commitdiff_plain;h=8cafd9ea08a6b1103eab29811227a7ae536dffa6

merge with kernels from MH's master thesis

TODOs:
- LIKWID markers in the gather kernels should use the X_LIKWID interface.
- Old kernels should set the loop balance in the KernelData structure.
- AVX-512 intrinsics for single precision must be implemented.
  - This affects all kernels using intrinsics.
  - NOTE: gather kernels need special attention for the gather intrinsics.
- Extend the documentation to cover the new kernels.
---

diff --git a/doc/main.html b/doc/main.html
index 9f11866..dfd45ec 100644
--- a/doc/main.html
+++ b/doc/main.html
@@ -401,6 +401,10 @@ tr:nth-child(odd) {
Viktor Haag, 2016
LSS, University of Erlangen-Nuremberg, Germany

+Michael Hussnaetter, 2017-2018
+University of Erlangen-Nuremberg, Germany
+michael.hussnaetter -at- fau.de
+

This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).

@@ -581,7 +585,7 @@ make clean-all
 CONFIG  linux-gcc, linux-intel  linux-intel  Select GCC or Intel compiler.
-ISA     avx, sse                avx          Determines which ISA extension is used for macro definitions of the intrinsics. This is not the architecture the compiler generates code for.
+ISA     avx512, avx, sse        avx          Determines which ISA extension is used for macro definitions of the intrinsics. This is not the architecture the compiler generates code for.
@@ -617,6 +621,35 @@ make clean-all

Suboptions for ``ISA=avx512``

name                            values    default  description
SOFTWARE_PREFETCH_LOOKAHEAD_L1  int >= 0  0        Software prefetch lookahead of elements into the L1 cache; the value is multiplied by the vector size (VSIZE).
SOFTWARE_PREFETCH_LOOKAHEAD_L2  int >= 0  0        Software prefetch lookahead of elements into the L2 cache; the value is multiplied by the vector size (VSIZE).

Please note that these options require AVX-512 PF support on the target processor.
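As a worked illustration, assuming double precision on AVX-512 (VSIZE = 8 doubles): a lookahead value of 2 makes the kernel prefetch the PDFs 2 * 8 = 16 elements ahead of the current loop index. A minimal sketch mirroring the prefetch calls in the gather kernels of this commit (the pointer name ppdf_N and the fixed values are illustrative only):

    #include <immintrin.h>

    #define VSIZE 8                           /* AVX-512, PRECISION=dp */
    #define SOFTWARE_PREFETCH_LOOKAHEAD_L1 2  /* normally set at compile time */

    /* Prefetch the PDF element that will be loaded two vector iterations
     * from now (2 * 8 = 16 doubles ahead) into the L1 cache. */
    static void PrefetchAheadL1(double const * ppdf_N)
    {
        _mm_prefetch((char const *)(ppdf_N + SOFTWARE_PREFETCH_LOOKAHEAD_L1 * VSIZE),
                     _MM_HINT_T0);
    }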

@@ -637,7 +670,7 @@ LBM Benchmark Kernels 0.1, compiled Jul 5 2017 21:59:22, type: benchmark
 Usage:
 ./lbmbenchk -list
 ./lbmbenchk
-    [-dims XxYyZ] [-geometry box|channel|pipe|blocks[-<block size>]] [-iterations <iterations>] [-lattice-dump-ascii]
+    [-dims XxYxZ] [-geometry box|channel|pipe|blocks[-<block size>]] [-iterations <iterations>] [-lattice-dump-ascii]
     [-rho-in <density>] [-rho-out <density>] [-omega <omega>] [-kernel <kernel>]
     [-periodic-x]
     [-t <number of threads>]
@@ -952,13 +985,14 @@ created make sure the binary was compiled with:

4.1   Intel Compiler

For the Intel compiler, one can specify the following, depending on the target ISA extension:

  • SSE: TARCH=-xSSE4.2
  • AVX: TARCH=-xAVX
  • AVX2 and FMA: TARCH=-xCORE-AVX2,-fma
  • AVX512: TARCH=-xCORE-AVX512

@@ -974,12 +1008,26 @@ make ISA=avx TARCH=-xCORE-AVX2,-fma

    WARNING: ISA is still set to avx here, as the FMA intrinsics are currently not implemented. This might change in the future.
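    For illustration only (nothing below is part of the source): if the missing FMA
    intrinsics were added to the kernels' vector-macro layer (VADD, VMUL, ...), a
    fused a + b * c could be expressed as follows; the macro name VFMADD is
    hypothetical:

        #include <immintrin.h>

        /* Hypothetical AVX2/FMA mapping: _mm256_fmadd_pd(x, y, z) computes
         * x * y + z, so VFMADD(a, b, c) evaluates b * c + a in one fused
         * instruction. Building it requires TARCH=-xCORE-AVX2,-fma. */
        #define VFMADD(a, b, c) _mm256_fmadd_pd(b, c, a)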

    + +

    Compiling for an architecture supporting AVX-512 (Skylake):

    -make ISA=avx TARCH=-xCORE-AVX512
    +make ISA=avx512 TARCH=-xCORE-AVX512
    +
    +

    Please note that for the AVX512 gather kernels, software prefetching for the
    gather instructions is disabled by default. To enable it, set
    SOFTWARE_PREFETCH_LOOKAHEAD_L1 and/or SOFTWARE_PREFETCH_LOOKAHEAD_L2 to a value
    greater than 0 during compilation. Note that this requires AVX-512 PF support
    from the target processor.
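    The AVX-512 PF requirement comes from the gather-prefetch instruction itself.
    As a sketch (an assumption based on the gather kernels added by this commit;
    the actual macro definitions presumably live in src/Vector.h, which the
    kernels include), the VPG32 macro they call can map onto the AVX-512 PF
    intrinsic:

        #include <immintrin.h>

        /* Sketch, not the repository's actual definition: prefetch the eight
         * doubles addressed by the 32-bit indices in idx, relative to base
         * with the given scale, into the cache level selected by hint.
         * The underlying instruction requires AVX-512 PF (e.g. KNL). */
        #define VPG32(idx, base, scale, hint) \
            _mm512_prefetch_i32gather_pd(idx, base, scale, hint)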

    +

    Compiling for the MIC architecture KNL, which supports AVX-512 and AVX-512 PF:

    +
    +make ISA=avx512 TARCH=-xMIC-AVX512
    +
    +

    or optionally with software prefetch enabled:

    +
    +make ISA=avx512 TARCH=-xMIC-AVX512 SOFTWARE_PREFETCH_LOOKAHEAD_L1=<value> SOFTWARE_PREFETCH_LOOKAHEAD_L2=<value>
     
    -

    -WARNING: ISA is here still set to avx as currently we have no implementation for the
    -AVX512 intrinsics. This might change in the future.

4.2   Pinning

@@ -1232,7 +1280,7 @@
 Roofline: an insightful visual performance model for multicore architectures.
 Commun. ACM, 52(4):65-76, Apr 2009. doi:10.1145/1498765.1498785

-Document was generated at 2018-01-09 11:54.
+Document was generated at 2018-05-10 14:10.

diff --git a/doc/main.rst b/doc/main.rst index 3d9ca9f..3a5c75c 100644 --- a/doc/main.rst +++ b/doc/main.rst @@ -7,6 +7,10 @@ | Viktor Haag, 2016 | LSS, University of Erlangen-Nuremberg, Germany | +| Michael Hussnaetter, 2017-2018 +| University of Erlangen-Nuremberg, Germany +| michael.hussnaetter -at- fau.de +| | This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). | | LbmBenchKernels is free software: you can redistribute it and/or modify @@ -155,8 +159,8 @@ name values default description BENCHMARK on, off on If enabled, disables VERIFICATION, STATISTICS, VTK_OUTPUT. If disabled enables the three former options. BUILD debug, release release debug: no optimization, debug symbols, DEBUG defined. release: optimizations enabled. CONFIG linux-gcc, linux-intel linux-intel Select GCC or Intel compiler. -ISA avx, sse avx Determines which ISA extension is used for macro definitions of the intrinsics. This is *not* the architecture the compiler generates code for. -OPENMP on, off on OpenMP, i.\,e.\. threading support. +ISA avx512, avx, sse avx Determines which ISA extension is used for macro definitions of the intrinsics. This is *not* the architecture the compiler generates code for. +OPENMP on, off on OpenMP, i.e. threading support. PRECISION dp, sp dp Floating point precision used for data type, arithmetic, and intrincics. STATISTICS on, off off View statistics, like density etc, during simulation. TARCH -- -- Via TARCH the architecture the compiler generates code for can be overridden. The value depends on the chosen compiler. @@ -164,6 +168,19 @@ VERIFICATION on, off off Turn verification on/off. VTK_OUTPUT on, off off Enable/Disable VTK file output. ============= ======================= ============ ========================================================== +**Suboptions for ``ISA=avx512``** + +============================== ======== ======== ====================== +name values default description +============================== ======== ======== ====================== +ADJ_LIST_MEM_TYPE HBM - Determines memory location of adjacency list array, DRAM or HBM. +PDF_MEM_TYPE HBM - Determines memory location of PDF array, DRAM or HBM. +SOFTWARE_PREFETCH_LOOKAHEAD_L1 int >= 0 0 Software prefetch lookahead of elements into L1 cache, value is multiplied by vector size (``VSIZE``). +SOFTWARE_PREFETCH_LOOKAHEAD_L2 int >= 0 0 Software prefetch lookahead of elements into L2 cache, value is multiplied by vector size (``VSIZE``). +============================== ======== ======== ====================== + +Please note this options require AVX-512 PF support of the target processor. + Invocation ========== @@ -185,7 +202,7 @@ Running the binary with ``-h`` list all available parameters: :: Usage: ./lbmbenchk -list ./lbmbenchk - [-dims XxYyZ] [-geometry box|channel|pipe|blocks[-]] [-iterations ] [-lattice-dump-ascii] + [-dims XxYxZ] [-geometry box|channel|pipe|blocks[-]] [-iterations ] [-lattice-dump-ascii] [-rho-in ] [-rho-out ] [-kernel ] [-periodic-x] [-t ] @@ -341,7 +358,7 @@ created make sure the binary was compiled with: - ``BENCHMARK=on`` (default if not overriden) and - ``BUILD=release`` (default if not overriden) and -- the correct ISA for macros is used, selected via ``ISA`` and +- the correct ISA for macros (i.e. intrinsics) is used, selected via ``ISA`` and - use ``TARCH`` to specify the architecture the compiler generates code for. 
Intel Compiler @@ -349,6 +366,7 @@ Intel Compiler For the Intel compiler one can specify depending on the target ISA extension: +- SSE: ``TARCH=-xSSE4.2`` - AVX: ``TARCH=-xAVX`` - AVX2 and FMA: ``TARCH=-xCORE-AVX2,-fma`` - AVX512: ``TARCH=-xCORE-AVX512`` @@ -367,12 +385,30 @@ WARNING: ISA is here still set to ``avx`` as currently we have the FMA intrinsic implemented. This might change in the future. +.. TODO: add isa=avx512 and add docu for knl + +.. TODO: kein prefetching wenn AVX-512 PF nicht unterstuetz wird + Compiling for an architecture supporting AVX-512 (Skylake): :: - make ISA=avx TARCH=-xCORE-AVX512 + make ISA=avx512 TARCH=-xCORE-AVX512 + +Please note that for the AVX512 gather kernels software prefetching for the +gather instructions is disabled per default. +To enable it set ``SOFTWARE_PREFETCH_LOOKAHEAD_L1`` and/or +``SOFTWARE_PREFETCH_LOOKAHEAD_L2`` to a value greater than ``0`` during +compilation. Note that this requires AVX-512 PF support from the target +processor. + +Compiling for MIC architecture KNL supporting AVX-512 and AVX-512 PF:: + + make ISA=avx512 TARCH=-xMIC-AVX512 + +or optionally with software prefetch enabled:: + + make ISA=avx512 TARCH=-xMIC-AVX512 SOFTWARE_PREFETCH_LOOKAHEAD_L1= SOFTWARE_PREFETCH_LOOKAHEAD_L2= + -WARNING: ISA is here still set to ``avx`` as currently we have no implementation for the -AVX512 intrinsics. This might change in the future. Pinning diff --git a/src/BenchKernelD3Q19.c b/src/BenchKernelD3Q19.c index 4852948..62989c8 100644 --- a/src/BenchKernelD3Q19.c +++ b/src/BenchKernelD3Q19.c @@ -101,6 +101,8 @@ void FNAME(D3Q19Kernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd } #endif + X_KERNEL_START(kernelData); + for (int iter = 0; iter < maxIterations; ++iter) { X_LIKWID_START("os"); @@ -125,7 +127,7 @@ void FNAME(D3Q19Kernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd #pragma vector always #pragma simd #endif - for (int z = oZ; z < nZ + oZ; ++z) { + for (int z = oZ; z < nZ + oZ; ++z) { // LOOP os #define I(x, y, z, dir) P_INDEX_5(gDims, (x), (y), (z), (dir)) #ifdef PROP_MODEL_PUSH @@ -320,6 +322,7 @@ void FNAME(D3Q19Kernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd // Stop counters before bounce back. Else computing loop balance will be incorrect. X_LIKWID_STOP("os"); + // Fixup bounce back PDFs. #ifdef _OPENMP #pragma omp parallel for default(none) \ @@ -355,6 +358,8 @@ void FNAME(D3Q19Kernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd } // for (int iter = 0; ... + X_KERNEL_END(kernelData); + #ifdef VTK_OUTPUT if (cd->VtkOutput) { @@ -445,6 +450,8 @@ void FNAME(D3Q19BlkKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * nThreads = omp_get_max_threads(); #endif + X_KERNEL_START(kernelData); + for (int iter = 0; iter < maxIterations; ++iter) { #ifdef _OPENMP @@ -729,6 +736,8 @@ void FNAME(D3Q19BlkKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * } // for (int iter = 0; ... 
+ X_KERNEL_END(kernelData); + #ifdef VTK_OUTPUT if (cd->VtkOutput) { diff --git a/src/BenchKernelD3Q19Aa.c b/src/BenchKernelD3Q19Aa.c index a6f73fc..b8ee577 100644 --- a/src/BenchKernelD3Q19Aa.c +++ b/src/BenchKernelD3Q19Aa.c @@ -118,6 +118,8 @@ void FNAME(D3Q19AaKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * nThreads = omp_get_max_threads(); #endif + X_KERNEL_START(kernelData); + for (int iter = 0; iter < maxIterations; iter += 2) { // -------------------------------------------------------------------- @@ -174,7 +176,7 @@ void FNAME(D3Q19AaKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * #pragma vector always #pragma simd #endif - for (int z = bZ; z < eZ; ++z) { + for (int z = bZ; z < eZ; ++z) { // LOOP aa-even if (ld->Lattice[L_INDEX_4(ld->Dims, x - oX, y - oY, z - oZ)] == LAT_CELL_OBSTACLE) { @@ -382,7 +384,7 @@ void FNAME(D3Q19AaKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * #pragma vector always #pragma simd #endif - for (int z = bZ; z < eZ; ++z) { + for (int z = bZ; z < eZ; ++z) { // LOOP aa-odd #define I(x, y, z, dir) P_INDEX_5(gDims, (x), (y), (z), (dir)) @@ -531,6 +533,8 @@ void FNAME(D3Q19AaKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * } // for (int iter = 0; ... + X_KERNEL_END(kernelData); + #ifdef VTK_OUTPUT if (cd->VtkOutput) { diff --git a/src/BenchKernelD3Q19AaVec.c b/src/BenchKernelD3Q19AaVec.c index f79e6cd..f67bccb 100644 --- a/src/BenchKernelD3Q19AaVec.c +++ b/src/BenchKernelD3Q19AaVec.c @@ -123,6 +123,8 @@ void FNAME(D3Q19AaVecKernel)(LatticeDesc * ld, KernelData * kd, CaseData * cd) Assert((maxIterations % 2) == 0); + X_KERNEL_START(kd); + for (int iter = 0; iter < maxIterations; iter += 2) { // -------------------------------------------------------------------- @@ -215,6 +217,8 @@ void FNAME(D3Q19AaVecKernel)(LatticeDesc * ld, KernelData * kd, CaseData * cd) } // for (int iter = 0; ... + X_KERNEL_END(kd); + #ifdef VTK_OUTPUT if (cd->VtkOutput) { @@ -331,7 +335,7 @@ static void KernelEven(LatticeDesc * ld, KernelData * kd, CaseData * cd) // {{{ for (int x = bX; x < eX; x += 1) { for (int y = bY; y < eY; y += 1) { - for (int z = bZ; z < eZ; z += VSIZE) { + for (int z = bZ; z < eZ; z += VSIZE) { // LOOP aa-vec-even #define I(x, y, z, dir) P_INDEX_5(gDims, (x), (y), (z), (dir)) @@ -523,7 +527,7 @@ static void KernelOdd(LatticeDesc * ld, KernelData * kd, CaseData * cd) // {{{ for (int x = bX; x < eX; ++x) { for (int y = bY; y < eY; ++y) { - for (int z = bZ; z < eZ; z += VSIZE) { + for (int z = bZ; z < eZ; z += VSIZE) { // LOOP aa-vec-odd #define I(x, y, z, dir) P_INDEX_5(gDims, (x), (y), (z), (dir)) diff --git a/src/BenchKernelD3Q19AaVecSl.c b/src/BenchKernelD3Q19AaVecSl.c index 885a065..989e3d0 100644 --- a/src/BenchKernelD3Q19AaVecSl.c +++ b/src/BenchKernelD3Q19AaVecSl.c @@ -121,6 +121,8 @@ void FNAME(D3Q19AaVecSlKernel)(LatticeDesc * ld, KernelData * kd, CaseData * cd) Assert((maxIterations % 2) == 0); + X_KERNEL_START(kd); + #ifdef _OPENMP #pragma omp parallel default(none) shared(kda, kd, ld, cd, src, maxIterations) #endif @@ -238,6 +240,8 @@ void FNAME(D3Q19AaVecSlKernel)(LatticeDesc * ld, KernelData * kd, CaseData * cd) } // for (int iter = 0; ... 
} // omp parallel + X_KERNEL_END(kd); + #ifdef VTK_OUTPUT if (cd->VtkOutput) { @@ -379,7 +383,7 @@ static void KernelEven(LatticeDesc * ld, KernelData * kd, CaseData * cd) // {{{ // threadId, indexStart, indexEnd, threadStart, threadEnd); - for (int i = threadStart; i < threadEnd; i += VSIZE) { + for (int i = threadStart; i < threadEnd; i += VSIZE) { // LOOP aa-vec-sl-even // Load PDFs of local cell: pdf_N = src[I(x, y, z, D3Q19_N)]; ... // #define X(name, idx, idxinv, _x, _y, _z) JOIN(vpdf_,name) = VLDU(&src[I(x, y, z, idx)]); @@ -605,7 +609,7 @@ startX , startY , startZ , startX + _x, startY + _y, startZ + _z); #endif // DEBUG_EXTENDED - for (int i = threadStart; i < threadEnd; i += VSIZE) { + for (int i = threadStart; i < threadEnd; i += VSIZE) { // LOOP aa-vec-sl-odd #if DEBUG_EXTENDED #define X(name, idx, idxinv, _x, _y, _z) Assert((unsigned long)(JOIN(ppdf_,idx)) >= (unsigned long)(JOIN(ppdf_start_,idx))); Assert((unsigned long)(JOIN(ppdf_,idx)) <= (unsigned long)(JOIN(ppdf_end_,idx))); diff --git a/src/BenchKernelD3Q19List.c b/src/BenchKernelD3Q19List.c index 4adb858..0cb98f8 100644 --- a/src/BenchKernelD3Q19List.c +++ b/src/BenchKernelD3Q19List.c @@ -93,16 +93,15 @@ void FNAME(D3Q19ListKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData KernelStatistics(kd, ld, cd, 0); #endif - // TODO: outer openmp parallel - for(int iter = 0; iter < maxIterations; ++iter) { - + X_KERNEL_START(kernelData); X_LIKWID_START("list-os"); + // TODO: outer openmp parallel #ifdef _OPENMP - #pragma omp parallel for default(none) \ + #pragma omp parallel default(none) \ shared(nFluid, nCells, kd, kdl, adjList, src, dst, w_0, w_1, w_2, omegaEven, omegaOdd, \ - w_1_x3, w_2_x3, w_1_nine_half, w_2_nine_half, cd) \ + w_1_x3, w_2_x3, w_1_nine_half, w_2_nine_half, cd, ld, tmp, maxIterations ) \ private(ux, uy, uz, ui, dens, dir_indep_trm, adjListIndex, \ pdf_C, \ pdf_N, pdf_E, pdf_S, pdf_W, \ @@ -111,10 +110,16 @@ void FNAME(D3Q19ListKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData pdf_B, pdf_BN, pdf_BE, pdf_BS, pdf_BW, \ evenPart, oddPart, w_1_indep, w_2_indep) #endif +{ + for(int iter = 0; iter < maxIterations; ++iter) { + + + #ifdef INTEL_OPT_DIRECTIVES - #pragma ivdep #endif - for (int index = 0; index < nFluid; ++index) { + #pragma omp for + #pragma novector + for (int index = 0; index < nFluid; ++index) { // LOOP list-os #define I(index, dir) P_INDEX_3((nCells), (index), (dir)) @@ -311,31 +316,36 @@ void FNAME(D3Q19ListKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData #undef I } // loop over fluid nodes - X_LIKWID_STOP("list-os"); + #pragma omp single + { + #ifdef VERIFICATION + kd->PdfsActive = dst; + KernelAddBodyForce(kd, ld, cd); + #endif - #ifdef VERIFICATION - kd->PdfsActive = dst; - KernelAddBodyForce(kd, ld, cd); - #endif + #ifdef VTK_OUTPUT + if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) { + kd->PdfsActive = dst; + VtkWrite(ld, kd, cd, iter); + } + #endif - #ifdef VTK_OUTPUT - if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) { + #ifdef STATISTICS kd->PdfsActive = dst; - VtkWrite(ld, kd, cd, iter); - } - #endif + KernelStatistics(kd, ld, cd, iter); + #endif - #ifdef STATISTICS - kd->PdfsActive = dst; - KernelStatistics(kd, ld, cd, iter); - #endif + // swap grids + tmp = src; + src = dst; + dst = tmp; + } + } + } // for (int iter = 0; ... - // swap grids - tmp = src; - src = dst; - dst = tmp; + X_LIKWID_STOP("list-os"); - } // for (int iter = 0; ... 
+ X_KERNEL_END(kernelData); #ifdef VTK_OUTPUT if (cd->VtkOutput) { diff --git a/src/BenchKernelD3Q19ListAa.c b/src/BenchKernelD3Q19ListAa.c index 045a396..2021311 100644 --- a/src/BenchKernelD3Q19ListAa.c +++ b/src/BenchKernelD3Q19ListAa.c @@ -98,6 +98,8 @@ void FNAME(D3Q19ListAaKernel)(LatticeDesc * ld, KernelData * kernelData, CaseDat KernelStatistics(kd, ld, cd, 0); #endif + X_KERNEL_START(kernelData); + // TODO: outer openmp parallel for(int iter = 0; iter < maxIterations; iter += 2) { @@ -122,7 +124,7 @@ void FNAME(D3Q19ListAaKernel)(LatticeDesc * ld, KernelData * kernelData, CaseDat #pragma vector always #pragma simd #endif - for (int index = 0; index < nFluid; ++index) { + for (int index = 0; index < nFluid; ++index) { // LOOP list-aa-even #define I(index, dir) P_INDEX_3((nCells), (index), (dir)) @@ -259,7 +261,7 @@ void FNAME(D3Q19ListAaKernel)(LatticeDesc * ld, KernelData * kernelData, CaseDat #ifdef INTEL_OPT_DIRECTIVES #pragma ivdep #endif - for (int index = 0; index < nFluid; ++index) { + for (int index = 0; index < nFluid; ++index) { // LOOP list-aa-odd adjListIndex = index * N_D3Q19_IDX; @@ -397,6 +399,8 @@ void FNAME(D3Q19ListAaKernel)(LatticeDesc * ld, KernelData * kernelData, CaseDat } // for (int iter = 0; ... + X_KERNEL_END(kernelData); + #ifdef VTK_OUTPUT if (cd->VtkOutput) { diff --git a/src/BenchKernelD3Q19ListAaCommon.c b/src/BenchKernelD3Q19ListAaCommon.c index c1e48a5..7aa45ff 100644 --- a/src/BenchKernelD3Q19ListAaCommon.c +++ b/src/BenchKernelD3Q19ListAaCommon.c @@ -384,6 +384,8 @@ void FNAME(D3Q19ListAaInit)(LatticeDesc * ld, KernelData ** kernelData, Paramete } #endif + // TODO: check nCells/nFluid do not exceed 2^31. This actually has to be + // done during lattice setup. kdl->nCells = nCells; kdl->nFluid = nFluid; diff --git a/src/BenchKernelD3Q19ListAaCommon.h b/src/BenchKernelD3Q19ListAaCommon.h index 11ec5ff..f33c512 100644 --- a/src/BenchKernelD3Q19ListAaCommon.h +++ b/src/BenchKernelD3Q19ListAaCommon.h @@ -8,6 +8,10 @@ // Viktor Haag, 2016 // LSS, University of Erlangen-Nuremberg, Germany // +// Michael Hussnaetter, 2017-2018 +// University of Erlangen-Nuremberg, Germany +// michael.hussnaetter -at- fau.de +// // This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). // // LbmBenchKernels is free software: you can redistribute it and/or modify @@ -89,6 +93,10 @@ static inline int FNAME(PINDEX3)(int nCells, int cellIndex, int d) return d * nCells + cellIndex; #elif DATA_LAYOUT_AOS return cellIndex * N_D3Q19 + d; +#elif DATA_LAYOUT_AOSOA + return (cellIndex - (cellIndex % AOSOA_BLOCK_SIZE)) * N_D3Q19 + + (d * AOSOA_BLOCK_SIZE) + + (cellIndex % AOSOA_BLOCK_SIZE); #else #error P_INDEX_3 function not implemented for chosen data layout. #endif diff --git a/src/BenchKernelD3Q19ListAaPv.c b/src/BenchKernelD3Q19ListAaPv.c index 8ae0c2c..4590823 100644 --- a/src/BenchKernelD3Q19ListAaPv.c +++ b/src/BenchKernelD3Q19ListAaPv.c @@ -76,6 +76,8 @@ void FNAME(D3Q19ListAaPvKernel)(LatticeDesc * ld, KernelData * kernelData, CaseD KernelStatistics(kd, ld, cd, 0); #endif + X_KERNEL_START(kernelData); + // TODO: outer openmp parallel for(int iter = 0; iter < maxIterations; iter += 2) { @@ -139,6 +141,8 @@ void FNAME(D3Q19ListAaPvKernel)(LatticeDesc * ld, KernelData * kernelData, CaseD } // for (int iter = 0; ... 
+ X_KERNEL_END(kernelData); + #ifdef VTK_OUTPUT if (cd->VtkOutput) { kd->PdfsActive = kd->Pdfs[0]; @@ -235,7 +239,7 @@ static void KernelEven(LatticeDesc * ld, KernelData * kernelData, CaseData * cd) #define I(index, dir) P_INDEX_3((nCells), (index), (dir)) - for (int index = indexStartVec; index < indexStopVec; index += VSIZE) { + for (int index = indexStartVec; index < indexStopVec; index += VSIZE) { // LOOP list-aa-pv-even-vec #define X(name, idx, idxinv, _x, _y, _z) JOIN(vpdf_,name) = VLDU(&src[I(index, idx)]); @@ -373,7 +377,7 @@ static void KernelEven(LatticeDesc * ld, KernelData * kernelData, CaseData * cd) } // loop over fluid nodes - for (int index = indexStopVec; index < indexStop; ++index) { + for (int index = indexStopVec; index < indexStop; ++index) { // LOOP list-aa-pv-even-scalar #define X(name, idx, idxinv, _x, _y, _z) JOIN(pdf_,name) = src[I(index, idx)]; D3Q19_LIST diff --git a/src/BenchKernelD3Q19ListAaPvGather.c b/src/BenchKernelD3Q19ListAaPvGather.c new file mode 100644 index 0000000..961b7a0 --- /dev/null +++ b/src/BenchKernelD3Q19ListAaPvGather.c @@ -0,0 +1,761 @@ +// -------------------------------------------------------------------------- +// +// Copyright +// Markus Wittmann, 2016-2017 +// RRZE, University of Erlangen-Nuremberg, Germany +// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de +// +// Viktor Haag, 2016 +// LSS, University of Erlangen-Nuremberg, Germany +// +// Michael Hussnaetter, 2017-2018 +// University of Erlangen-Nuremberg, Germany +// michael.hussnaetter -at- fau.de +// +// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). +// +// LbmBenchKernels is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// LbmBenchKernels is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with LbmBenchKernels. If not, see . 
+// +// -------------------------------------------------------------------------- +#include "BenchKernelD3Q19ListAaPvGatherCommon.h" + +#include "Base.h" +#include "Memory.h" +#include "Vtk.h" +#include "Vector.h" + +#include +#include + +#ifdef _OPENMP +#include +#endif + +#ifdef LIKWID_PERFMON +#include +#else +#define LIKWID_MARKER_INIT +#define LIKWID_MARKER_THREADINIT +#define LIKWID_MARKER_SWITCH +#define LIKWID_MARKER_REGISTER(regionTag) +#define LIKWID_MARKER_START(regionTag) +#define LIKWID_MARKER_STOP(regionTag) +#define LIKWID_MARKER_CLOSE +#define LIKWID_MARKER_GET(regionTag, nevents, events, time, count) +#endif + +//enable software prefetchting for vectorized gather/scatter loop in odd kernel +#ifndef SOFTWARE_PREFETCH_LOOKAHEAD_L2 +#define SOFTWARE_PREFETCH_LOOKAHEAD_L2 (0) //prefetchting X SIMD widths ahead +#endif + +#ifndef SOFTWARE_PREFETCH_LOOKAHEAD_L1 +#define SOFTWARE_PREFETCH_LOOKAHEAD_L1 (0) //prefetchting X SIMD widths ahead +#endif + +static void KernelEven(LatticeDesc * ld, KernelData * kernelData, CaseData * cd, int * threadIndices); +static void KernelOdd( LatticeDesc * ld, KernelData * kernelData, CaseData * cd, int * threadIndices); + +void FNAME(D3Q19ListAaPvGatherKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd) +{ + + Assert(ld != NULL); + Assert(kernelData != NULL); + Assert(cd != NULL); + + Assert(cd->Omega > 0.0); + Assert(cd->Omega < 2.0); + +#if defined(VTK_OUTPUT) || defined(STATISTICS) || defined(VERIFICATION) + KernelData * kd = (KernelData *)kernelData; +#endif + KernelDataList * kdl = KDL(kernelData); + + int maxIterations = cd->MaxIterations; + int nFluid = kdl->nFluid; + + printf("\n"); +#if (SOFTWARE_PREFETCH_LOOKAHEAD_L2 > 0) || (SOFTWARE_PREFETCH_LOOKAHEAD_L1 > 0) + printf("# Software prefetching enabled:\n"); + printf("# Load/Store prefetch lookahead L2: \t%d\n", SOFTWARE_PREFETCH_LOOKAHEAD_L2); + printf("# Load/Store prefetch lookahead L1: \t%d\n", SOFTWARE_PREFETCH_LOOKAHEAD_L1); +#else + printf("# Software prefetching disabled.\n"); +#endif + printf("\n"); + + int nThreads = 1; +#ifdef _OPENMP + nThreads = omp_get_max_threads(); +#endif + + int * threadIndices = (int *)malloc(sizeof(int) * (nThreads + 1)); + for (int i = 0; i < nThreads; ++i) { + threadIndices[i] = i * (nFluid / nThreads) + MinI(i, nFluid % nThreads); + } + + threadIndices[nThreads] = nFluid; + +#ifdef VTK_OUTPUT + if (cd->VtkOutput) { + kd->PdfsActive = kd->Pdfs[0]; + VtkWrite(ld, kd, cd, -1); + } +#endif + +#ifdef STATISTICS + kd->PdfsActive = kd->Pdfs[0]; + KernelStatistics(kd, ld, cd, 0); +#endif + + LIKWID_MARKER_INIT; + + // TODO: outer openmp parallel + + X_KERNEL_START(kernelData); + + LIKWID_MARKER_START("OuterLoop"); + for(int iter = 0; iter < maxIterations; iter += 2) { + + // even time step + +#ifdef _OPENMP +#pragma omp parallel default(none) shared(ld, kernelData, cd, threadIndices) +#endif + { + KernelEven(ld, kernelData, cd, threadIndices); + } + + +#ifdef VERIFICATION + kdl->Iteration = iter; + kd->PdfsActive = kd->Pdfs[0]; + KernelAddBodyForce(kd, ld, cd); +#endif + + // odd time step + +#ifdef _OPENMP +#pragma omp parallel default(none) shared(ld, kernelData, cd, threadIndices) +#endif + { + KernelOdd(ld, kernelData, cd, threadIndices); + } + + +#ifdef VERIFICATION + kdl->Iteration = iter + 1; + kd->PdfsActive = kd->Pdfs[0]; + KernelAddBodyForce(kd, ld, cd); +#endif + +#ifdef VTK_OUTPUT + if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) { + kdl->Iteration = iter + 1; + kd->PdfsActive = kd->Pdfs[0]; + VtkWrite(ld, kd, cd, iter); + } 
+#endif + +#ifdef STATISTICS + kdl->Iteration = iter + 1; + kd->PdfsActive = kd->Pdfs[0]; + KernelStatistics(kd, ld, cd, iter); +#endif + + } // for (int iter = 0; ... + LIKWID_MARKER_STOP("OuterLoop"); + + X_KERNEL_END(kernelData); + +#ifdef VTK_OUTPUT + if (cd->VtkOutput) { + kd->PdfsActive = kd->Pdfs[0]; + VtkWrite(ld, kd, cd, maxIterations); + } +#endif + +#ifdef STATISTICS + kd->PdfsActive = kd->Pdfs[0]; + KernelStatistics(kd, ld, cd, maxIterations); +#endif + + LIKWID_MARKER_CLOSE; + free(threadIndices); + + return; +} + +static void KernelEven(LatticeDesc * ld, KernelData * kernelData, CaseData * cd, int * threadIndices) +{ + Assert(ld != NULL); + Assert(kernelData != NULL); + Assert(cd != NULL); + + Assert(cd->Omega > 0.0); + Assert(cd->Omega < 2.0); + + KernelData * kd = (KernelData *)kernelData; + KernelDataList * kdl = KDL(kernelData); + + PdfT omega = cd->Omega; + PdfT omegaEven = omega; + + PdfT magicParam = 1.0 / 12.0; + PdfT omegaOdd = 1.0 / (0.5 + magicParam / (1.0 / omega - 0.5)); + + PdfT evenPart = 0.0; + PdfT oddPart = 0.0; + PdfT dir_indep_trm = 0.0; + + const PdfT w_0 = 1.0 / 3.0; + const PdfT w_1 = 1.0 / 18.0; + const PdfT w_2 = 1.0 / 36.0; + + const PdfT w_1_x3 = w_1 * 3.0; const PdfT w_1_nine_half = w_1 * 9.0 / 2.0; PdfT w_1_indep = 0.0; + const PdfT w_2_x3 = w_2 * 3.0; const PdfT w_2_nine_half = w_2 * 9.0 / 2.0; PdfT w_2_indep = 0.0; + + PdfT ux, uy, uz, ui; + PdfT dens; + + VPDFT VONE_HALF = VSET(0.5); + VPDFT VTHREE_HALF = VSET(3.0 / 2.0); + + VPDFT vw_1_indep, vw_2_indep; + VPDFT vw_0 = VSET(w_0); + VPDFT vw_1 = VSET(w_1); + VPDFT vw_2 = VSET(w_2); + + VPDFT vw_1_x3 = VSET(w_1_x3); + VPDFT vw_2_x3 = VSET(w_2_x3); + VPDFT vw_1_nine_half = VSET(w_1_nine_half); + VPDFT vw_2_nine_half = VSET(w_2_nine_half); + + VPDFT vui, vux, vuy, vuz, vdens; + + VPDFT vevenPart, voddPart, vdir_indep_trm; + + VPDFT vomegaEven = VSET(omegaEven); + VPDFT vomegaOdd = VSET(omegaOdd); + + // Declare pdf_N, pdf_E, pdf_S, pdf_W, ... 
+ #define X(name, idx, idxinv, x, y, z) \ + PdfT JOIN(pdf_,name); \ + PdfT * JOIN(ppdf_,name); \ + VPDFT JOIN(vpdf_,name); + D3Q19_LIST + #undef X + + PdfT * src = kd->Pdfs[0]; + + int nCells = kdl->nCells; + + int threadId = 0; +#ifdef _OPENMP + threadId = omp_get_thread_num(); +#endif + + int nFluidThread = threadIndices[threadId + 1] - threadIndices[threadId]; + int nFluidVec = nFluidThread - (nFluidThread % VSIZE); + + int indexStartVec = threadIndices[threadId]; + int indexStopVec = threadIndices[threadId] + nFluidVec; + int indexStop = threadIndices[threadId] + nFluidThread; + + #define I(index, dir) P_INDEX_3((nCells), (index), (dir)) + + #define X(name, idx, idxinv, _x, _y, _z) JOIN(ppdf_,name) = &(src[I(indexStartVec, idx)]); + D3Q19_LIST + #undef X + + for (int index = indexStartVec; index < indexStopVec; index += VSIZE) { + + #if (SOFTWARE_PREFETCH_LOOKAHEAD_L2 > 0) + #define X(name, idx, idxinv, _x, _y, _z) _mm_prefetch((char const *)(JOIN(ppdf_,name) + SOFTWARE_PREFETCH_LOOKAHEAD_L2 * VSIZE), _MM_HINT_T1); + D3Q19_LIST + #undef X + #endif + + #if (SOFTWARE_PREFETCH_LOOKAHEAD_L1 > 0) + #define X(name, idx, idxinv, _x, _y, _z) _mm_prefetch((char const *)(JOIN(ppdf_,name) + SOFTWARE_PREFETCH_LOOKAHEAD_L1 * VSIZE), _MM_HINT_T0); + D3Q19_LIST + #undef X + #endif + + #define X(name, idx, idxinv, _x, _y, _z) JOIN(vpdf_,name) = VLDU(JOIN(ppdf_,name)); + D3Q19_LIST + #undef X + + //vux = vpdf_E + vpdf_NE + vpdf_SE + vpdf_TE + vpdf_BE - + // vpdf_W - vpdf_NW - vpdf_SW - vpdf_TW - vpdf_BW; + vux = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_E,VADD(vpdf_NE,vpdf_SE)),VADD(vpdf_TE,vpdf_BE)),vpdf_W),vpdf_NW),vpdf_SW),vpdf_TW),vpdf_BW); + //vuy = vpdf_N + vpdf_NE + vpdf_NW + vpdf_TN + vpdf_BN - + // vpdf_S - vpdf_SE - vpdf_SW - vpdf_TS - vpdf_BS; + vuy = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_N,VADD(vpdf_NE,vpdf_NW)),VADD(vpdf_TN,vpdf_BN)),vpdf_S),vpdf_SE),vpdf_SW),vpdf_TS),vpdf_BS); + //vuz = vpdf_T + vpdf_TE + vpdf_TW + vpdf_TN + vpdf_TS - + // vpdf_B - vpdf_BE - vpdf_BW - vpdf_BN - vpdf_BS; + vuz = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_T,VADD(vpdf_TE,vpdf_TW)),VADD(vpdf_TN,vpdf_TS)),vpdf_B),vpdf_BE),vpdf_BW),vpdf_BN),vpdf_BS); + + //vdens = vpdf_C + + // vpdf_N + vpdf_E + vpdf_S + vpdf_W + + // vpdf_NE + vpdf_SE + vpdf_SW + vpdf_NW + + // vpdf_T + vpdf_TN + vpdf_TE + vpdf_TS + vpdf_TW + + // vpdf_B + vpdf_BN + vpdf_BE + vpdf_BS + vpdf_BW; + vdens = VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(vpdf_C,VADD(vpdf_N,vpdf_E)),VADD(vpdf_S,vpdf_W)),VADD(vpdf_NE,vpdf_SE)),VADD(vpdf_SW,vpdf_NW)),VADD(vpdf_T,vpdf_TN)),VADD(vpdf_TE,vpdf_TS)),VADD(vpdf_TW,vpdf_B)),VADD(vpdf_BN,vpdf_BE)),VADD(vpdf_BS,vpdf_BW)); + + //vdir_indep_trm = vdens - (vux * vux + vuy * vuy + vuz * vuz) * VTHREE_HALF; + vdir_indep_trm = VSUB(vdens,VMUL(VADD(VADD(VMUL(vux,vux),VMUL(vuy,vuy)),VMUL(vuz,vuz)),VTHREE_HALF)); + + //src[I(index, D3Q19_C)] =[UA] vpdf_C - vomegaEven * (vpdf_C - vw_0 * vdir_indep_trm); + VSTU(ppdf_C,VSUB(vpdf_C,VMUL(vomegaEven,VSUB(vpdf_C,VMUL(vw_0,vdir_indep_trm))))); + + //vw_1_indep = vw_1 * vdir_indep_trm; + vw_1_indep = VMUL(vw_1,vdir_indep_trm); + + #define COLLIDE_AA_V(tmpVui, dir1, dir2) \ + vui = tmpVui; \ + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(JOIN(vpdf_,dir1),JOIN(vpdf_,dir2))),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep));\ + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(JOIN(vpdf_,dir1),JOIN(vpdf_,dir2))),VMUL(vui,vw_1_x3)));\ + VSTU(JOIN(ppdf_,dir2),VSUB(VSUB(JOIN(vpdf_,dir1),vevenPart),voddPart));\ + 
VSTU(JOIN(ppdf_,dir1),VADD(VSUB(JOIN(vpdf_,dir2),vevenPart),voddPart)); + + COLLIDE_AA_V(vuy, N, S) + COLLIDE_AA_V(vux, E, W) + COLLIDE_AA_V(vuz, T, B) + + #undef COLLIDE_AA_V + + //vw_2_indep = vw_2 * vdir_indep_trm; + vw_2_indep = VMUL(vw_2,vdir_indep_trm); + + // collide axis unaligned pdfs vectorized + #define COLLIDE_UA_V(tmpVui, dir1, dir2) \ + vui = tmpVui; \ + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(JOIN(vpdf_,dir1),JOIN(vpdf_,dir2))),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));\ + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(JOIN(vpdf_,dir1),JOIN(vpdf_,dir2))),VMUL(vui,vw_2_x3)));\ + VSTU(JOIN(ppdf_,dir2),VSUB(VSUB(JOIN(vpdf_,dir1),vevenPart),voddPart)); \ + VSTU(JOIN(ppdf_,dir1),VADD(VSUB(JOIN(vpdf_,dir2),vevenPart),voddPart)); + + COLLIDE_UA_V(VSUB(vuy,vux), NW, SE) + COLLIDE_UA_V(VADD(vux,vuy), NE, SW) + COLLIDE_UA_V(VSUB(vuz,vux), TW, BE) + COLLIDE_UA_V(VADD(vux,vuz), TE, BW) + COLLIDE_UA_V(VSUB(vuz,vuy), TS, BN) + COLLIDE_UA_V(VADD(vuy,vuz), TN, BS) + + #undef COLLIDE_UA_V + + #define X(name, idx, idxinv, _x, _y, _z) JOIN(ppdf_,name) +=VSIZE; + D3Q19_LIST + #undef X + } // loop over fluid nodes + + for (int index = indexStopVec; index < indexStop; ++index) { + + #define X(name, idx, idxinv, _x, _y, _z) JOIN(pdf_,name) = *(JOIN(ppdf_,name)); + D3Q19_LIST + #undef X + + ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE - + pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW; + uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN - + pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS; + uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS - + pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS; + + dens = pdf_C + + pdf_N + pdf_E + pdf_S + pdf_W + + pdf_NE + pdf_SE + pdf_SW + pdf_NW + + pdf_T + pdf_TN + pdf_TE + pdf_TS + pdf_TW + + pdf_B + pdf_BN + pdf_BE + pdf_BS + pdf_BW; + + dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz)*3.0/2.0; + + // direction: w_0 + *ppdf_C = pdf_C - omegaEven*(pdf_C - w_0*dir_indep_trm); + + // direction: w_1 + w_1_indep = w_1*dir_indep_trm; + + #define COLLIDE_AA_S(tmpUi, dir1, dir2) \ + ui = tmpUi; \ + evenPart = omegaEven * (0.5 * (JOIN(pdf_,dir1) + JOIN(pdf_,dir2)) - ui * ui * w_1_nine_half - w_1_indep); \ + oddPart = omegaOdd * (0.5 * (JOIN(pdf_,dir1) - JOIN(pdf_,dir2)) - ui * w_1_x3); \ + *(JOIN(ppdf_,dir2)) = JOIN(pdf_,dir1) - evenPart - oddPart; \ + *(JOIN(ppdf_,dir1)) = JOIN(pdf_,dir2) - evenPart + oddPart; + + COLLIDE_AA_S(uy, N, S) + COLLIDE_AA_S(ux, E, W) + COLLIDE_AA_S(uz, T, B) + + #undef COLLIDE_AA_S + + // direction: w_2 + w_2_indep = w_2*dir_indep_trm; + + #define COLLIDE_UA_S(tmpUi, dir1, dir2) \ + ui = tmpUi; \ + evenPart = omegaEven * (0.5 * (JOIN(pdf_,dir1) + JOIN(pdf_,dir2)) - ui * ui * w_2_nine_half - w_2_indep); \ + oddPart = omegaOdd * (0.5 * (JOIN(pdf_,dir1) - JOIN(pdf_,dir2)) - ui * w_2_x3); \ + *(JOIN(ppdf_,dir2)) = JOIN(pdf_,dir1) - evenPart - oddPart; \ + *(JOIN(ppdf_,dir1)) = JOIN(pdf_,dir2) - evenPart + oddPart; + + COLLIDE_UA_S((-ux + uy), NW, SE) + COLLIDE_UA_S(( ux + uy), NE, SW) + COLLIDE_UA_S((-ux + uz), TW, BE) + COLLIDE_UA_S(( ux + uz), TE, BW) + COLLIDE_UA_S((-uy + uz), TS, BN) + COLLIDE_UA_S(( uy + uz), TN, BS) + + #undef COLLIDE_UA_S + + #define X(name, idx, idxinv, _x, _y, _z) JOIN(ppdf_,name)++; + D3Q19_LIST + #undef X + } // loop over fluid nodes + + #undef I + + return; +} + +static void KernelOdd(LatticeDesc * ld, KernelData * kernelData, CaseData * cd, int * threadIndices) +{ + + Assert(ld != NULL); + Assert(kernelData != NULL); + Assert(cd != NULL); + + Assert(cd->Omega > 0.0); + Assert(cd->Omega < 2.0); + + 
KernelData * kd = (KernelData *)kernelData; + KernelDataList * kdl = KDL(kernelData); + KernelDataListRia * kdlr = KDLR(kernelData); + PdfT omega = cd->Omega; + PdfT omegaEven = omega; + + PdfT magicParam = 1.0 / 12.0; + PdfT omegaOdd = 1.0 / (0.5 + magicParam / (1.0 / omega - 0.5)); + + PdfT evenPart = 0.0; + PdfT oddPart = 0.0; + PdfT dir_indep_trm = 0.0; + + const PdfT w_0 = 1.0 / 3.0; + const PdfT w_1 = 1.0 / 18.0; + const PdfT w_2 = 1.0 / 36.0; + + const PdfT w_1_x3 = w_1 * 3.0; const PdfT w_1_nine_half = w_1 * 9.0 / 2.0; PdfT w_1_indep = 0.0; + const PdfT w_2_x3 = w_2 * 3.0; const PdfT w_2_nine_half = w_2 * 9.0 / 2.0; PdfT w_2_indep = 0.0; + + PdfT ux, uy, uz, ui; + PdfT dens; + + VPDFT VONE_HALF = VSET(0.5); + VPDFT VTHREE_HALF = VSET(3.0 / 2.0); + + VPDFT vw_1_indep, vw_2_indep; + VPDFT vw_0 = VSET(w_0); + VPDFT vw_1 = VSET(w_1); + VPDFT vw_2 = VSET(w_2); + + VPDFT vw_1_x3 = VSET(w_1_x3); + VPDFT vw_2_x3 = VSET(w_2_x3); + VPDFT vw_1_nine_half = VSET(w_1_nine_half); + VPDFT vw_2_nine_half = VSET(w_2_nine_half); + + VPDFT vux, vuy, vuz, vui; + VPDFT vdens; + + VPDFT vevenPart, voddPart, vdir_indep_trm; + + VPDFT vomegaEven = VSET(omegaEven); + VPDFT vomegaOdd = VSET(omegaOdd); + + // Declare pdf_N, pdf_E, pdf_S, pdf_W, ... + #define X(name, idx, idxinv, x, y, z) \ + PdfT JOIN(pdf_,name) = 0; \ + PdfT * JOIN(ppdf_,name) = NULL; \ + VPDFT JOIN(vpdf_,name); + D3Q19_LIST + #undef X + + #define X(name, idx, idxinv, x, y, z) \ + __m256i JOIN(vgatheridx_,name) = _mm256_set1_epi32(0); + D3Q19_LIST_WO_C + #undef X + + __m256i vgatherinc = _mm256_set1_epi32(VSIZE); + + uint32_t * consecNodes = kdlr->ConsecNodes; + uint32_t consecIndex = 0; + uint32_t consecValue = 0; + + PdfT * src = kd->Pdfs[0]; + + int nCells = kdl->nCells; + + int adjListIndex; + uint32_t * adjList = kdl->AdjList; + + int threadId = 0; + +#ifdef _OPENMP + threadId = omp_get_thread_num(); +#endif + consecIndex = kdlr->ConsecThreadIndices[threadId]; + consecValue = 0; + + int nFluidThread = threadIndices[threadId + 1] - threadIndices[threadId]; + + int indexStart = threadIndices[threadId]; + int indexStop = threadIndices[threadId] + nFluidThread; + + #define I(index, dir) P_INDEX_3((nCells), (index), (dir)) + #define ADJ_LIST(dir) adjList[adjListIndex + (dir * VSIZE)] + + int offset_ppdf_C = -1; //dummy init to detect wrong usage. + + for (int index = indexStart; index < indexStop; index += 1) { + + if (consecValue > 0) { + --consecValue; + // Increment all pdf pointers by an offset. If the previous iteration was + // scalar, increment only by one. If the previous iteration was vectorized, + // increment by the vector width. These offsets are set in the corresponding + // if branches. + + //increment offsets + + #define X(name, idx, idxinv, _x, _y, _z) JOIN(vgatheridx_,name) = VADDI32(JOIN(vgatheridx_,name), vgatherinc); + D3Q19_LIST_WO_C + #undef X + + ppdf_C += offset_ppdf_C; + + } + else { + // Load new pointers to PDFs of local cell: + Assert(consecIndex < nConsecNodes); + + consecValue = consecNodes[consecIndex] - 1; + + adjListIndex = (index - (index % VSIZE)) * N_D3Q19_IDX + (index % VSIZE); + #define X(name, idx, idxinv, _x, _y, _z) JOIN(vgatheridx_,name) = VLIU(&(ADJ_LIST(idxinv))); + D3Q19_LIST_WO_C + #undef X + + ppdf_C = &(src[P_INDEX_3(nCells, index, D3Q19_C)]); + ++consecIndex; + } + + if (consecValue >= (VSIZE - 1)) { + // Vectorized part. 
+ #if (SOFTWARE_PREFETCH_LOOKAHEAD_L2 > 0) + int const indexPrefetchL2 = index + VSIZE * SOFTWARE_PREFETCH_LOOKAHEAD_L2; + // make sure that adjList access is never out of bounds since it is an actual memory access and no prefetch + if (indexPrefetchL2 < indexStop){ + // update pointers from adjacency list only if necessary + if (consecValue >= (SOFTWARE_PREFETCH_LOOKAHEAD_L2 * VSIZE + VSIZE - 1)) { + #define INCR_PTR(name) (VADDI32(JOIN(vgatheridx_,name), VMULI32(vgatherinc, VSETI32(SOFTWARE_PREFETCH_LOOKAHEAD_L2)))) + #define X(name, idx, idxinv, _x, _y, _z) VPG32(INCR_PTR(name), (char const *) src, 8, _MM_HINT_T1); + D3Q19_LIST_WO_C + #undef X + #undef INCR_PTR + } + else { + adjListIndex = (indexPrefetchL2 - (indexPrefetchL2 % VSIZE)) * N_D3Q19_IDX + (indexPrefetchL2 % VSIZE); + #define X(name, idx, idxinv, _x, _y, _z) VPG32(VLIU(&ADJ_LIST(idxinv)), (char const *) src, 8, _MM_HINT_T1); + D3Q19_LIST_WO_C + #undef X + } + + _mm_prefetch((char const *) &(src[P_INDEX_3(nCells, indexPrefetchL2, D3Q19_C)]), _MM_HINT_T1); + } + #endif + + #if (SOFTWARE_PREFETCH_LOOKAHEAD_L1 > 0) + int const indexPrefetchL1 = index + VSIZE * SOFTWARE_PREFETCH_LOOKAHEAD_L1; + // make sure that adjList access is never out of bounds since it is an actual memory access and no prefetch + if (indexPrefetchL1 < indexStop){ + // update pointers from adjacency list only if necessary + if (consecValue > (SOFTWARE_PREFETCH_LOOKAHEAD_L1 * VSIZE + VSIZE - 1)) { + #define INCR_PTR(name) (VADDI32(JOIN(vgatheridx_,name), VMULI32(vgatherinc, VSETI32(SOFTWARE_PREFETCH_LOOKAHEAD_L1)))) + #define X(name, idx, idxinv, _x, _y, _z) VPG32(INCR_PTR(name), (char const *) src, 8, _MM_HINT_T0); + D3Q19_LIST_WO_C + #undef X + #undef INCR_PTR + } + else { + adjListIndex = (indexPrefetchL1 - (indexPrefetchL1 % VSIZE)) * N_D3Q19_IDX + (indexPrefetchL1 % VSIZE); + #define X(name, idx, idxinv, _x, _y, _z) VPG32(VLIU(&ADJ_LIST(idxinv)), (char const *) src, 8, _MM_HINT_T0); + D3Q19_LIST_WO_C + #undef X + } + + _mm_prefetch((char const *) &(src[P_INDEX_3(nCells, indexPrefetchL1, D3Q19_C)]), _MM_HINT_T0); + } + #endif + + #define X(name, idx, idxinv, _x, _y, _z) JOIN(vpdf_,name) = VG32(JOIN(vgatheridx_,name), src, 8); + D3Q19_LIST_WO_C + #undef X + + vpdf_C = VLDU(ppdf_C); + + //vux = vpdf_E + vpdf_NE + vpdf_SE + vpdf_TE + vpdf_BE - + // vpdf_W - vpdf_NW - vpdf_SW - vpdf_TW - vpdf_BW; + vux = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_E,VADD(vpdf_NE,vpdf_SE)),VADD(vpdf_TE,vpdf_BE)),vpdf_W),vpdf_NW),vpdf_SW),vpdf_TW),vpdf_BW); + //vuy = vpdf_N + vpdf_NE + vpdf_NW + vpdf_TN + vpdf_BN - + // vpdf_S - vpdf_SE - vpdf_SW - vpdf_TS - vpdf_BS; + vuy = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_N,VADD(vpdf_NE,vpdf_NW)),VADD(vpdf_TN,vpdf_BN)),vpdf_S),vpdf_SE),vpdf_SW),vpdf_TS),vpdf_BS); + //vuz = vpdf_T + vpdf_TE + vpdf_TW + vpdf_TN + vpdf_TS - + // vpdf_B - vpdf_BE - vpdf_BW - vpdf_BN - vpdf_BS; + vuz = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_T,VADD(vpdf_TE,vpdf_TW)),VADD(vpdf_TN,vpdf_TS)),vpdf_B),vpdf_BE),vpdf_BW),vpdf_BN),vpdf_BS); + + //vdens = vpdf_C + + // vpdf_N + vpdf_E + vpdf_S + vpdf_W + + // vpdf_NE + vpdf_SE + vpdf_SW + vpdf_NW + + // vpdf_T + vpdf_TN + vpdf_TE + vpdf_TS + vpdf_TW + + // vpdf_B + vpdf_BN + vpdf_BE + vpdf_BS + vpdf_BW; + vdens = VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(vpdf_C,VADD(vpdf_N,vpdf_E)),VADD(vpdf_S,vpdf_W)),VADD(vpdf_NE,vpdf_SE)), + VADD(vpdf_SW,vpdf_NW)),VADD(vpdf_T,vpdf_TN)),VADD(vpdf_TE,vpdf_TS)),VADD(vpdf_TW,vpdf_B)),VADD(vpdf_BN,vpdf_BE)),VADD(vpdf_BS,vpdf_BW)); + + //vdir_indep_trm = vdens - (vux * vux + vuy * vuy + 
vuz * vuz) * VTHREE_HALF; + vdir_indep_trm = VSUB(vdens,VMUL(VADD(VADD(VMUL(vux,vux),VMUL(vuy,vuy)),VMUL(vuz,vuz)),VTHREE_HALF)); + + //src[I(index, D3Q19_C)] =[UA] vpdf_C - vomegaEven * (vpdf_C - vw_0 * vdir_indep_trm); + VSTU(ppdf_C,VSUB(vpdf_C,VMUL(vomegaEven,VSUB(vpdf_C,VMUL(vw_0,vdir_indep_trm))))); + + // collide axis aligend pdfs vectorized + #define SCAT(offsets, vsrc) VS32(src, offsets, vsrc, 8) + + //vw_1_indep = vw_1 * vdir_indep_trm; + vw_1_indep = VMUL(vw_1,vdir_indep_trm); + + // collide axis aligend pdfs vectorized + #define COLLIDE_AA_V(tmpVui, dir1, dir2) \ + vui = tmpVui; \ + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(JOIN(vpdf_,dir1),JOIN(vpdf_,dir2))),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep));\ + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(JOIN(vpdf_,dir1),JOIN(vpdf_,dir2))),VMUL(vui,vw_1_x3)));\ + SCAT(JOIN(vgatheridx_,dir2),VSUB(VSUB(JOIN(vpdf_,dir1),vevenPart),voddPart));\ + SCAT(JOIN(vgatheridx_,dir1),VADD(VSUB(JOIN(vpdf_,dir2),vevenPart),voddPart)); + + COLLIDE_AA_V(vuy, N, S) + COLLIDE_AA_V(vux, E, W) + COLLIDE_AA_V(vuz, T, B) + + #undef COLLIDE_AA_V + + //vw_2_indep = vw_2 * vdir_indep_trm; + vw_2_indep = VMUL(vw_2,vdir_indep_trm); + + // collide axis unaligned pdfs vectorized + #define COLLIDE_UA_V(tmpVui, dir1, dir2) \ + vui = tmpVui; \ + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(JOIN(vpdf_,dir1),JOIN(vpdf_,dir2))),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));\ + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(JOIN(vpdf_,dir1),JOIN(vpdf_,dir2))),VMUL(vui,vw_2_x3)));\ + SCAT(JOIN(vgatheridx_,dir2),VSUB(VSUB(JOIN(vpdf_,dir1),vevenPart),voddPart)); \ + SCAT(JOIN(vgatheridx_,dir1),VADD(VSUB(JOIN(vpdf_,dir2),vevenPart),voddPart)); + + COLLIDE_UA_V(VSUB(vuy,vux), NW, SE) + COLLIDE_UA_V(VADD(vux,vuy), NE, SW) + COLLIDE_UA_V(VSUB(vuz,vux), TW, BE) + COLLIDE_UA_V(VADD(vux,vuz), TE, BW) + COLLIDE_UA_V(VSUB(vuz,vuy), TS, BN) + COLLIDE_UA_V(VADD(vuy,vuz), TN, BS) + + #undef COLLIDE_UA_V + #undef SCAT + + consecValue -= (VSIZE - 1); + index += (VSIZE - 1); + offset_ppdf_C = VSIZE; + + } + else { + // Scalar part. 
+ + adjListIndex = (index - (index % VSIZE)) * N_D3Q19_IDX + (index % VSIZE); + #define X(name, idx, idxinv, _x, _y, _z) JOIN(ppdf_,name) = &(src[ADJ_LIST(idxinv)]); + D3Q19_LIST_WO_C + #undef X + #define X(name, idx, idxinv, _x, _y, _z) JOIN(pdf_,name) = *(JOIN(ppdf_,name)); + D3Q19_LIST_WO_C + #undef X + + pdf_C = *ppdf_C; + + ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE - + pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW; + uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN - + pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS; + uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS - + pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS; + + dens = pdf_C + + pdf_N + pdf_E + pdf_S + pdf_W + + pdf_NE + pdf_SE + pdf_SW + pdf_NW + + pdf_T + pdf_TN + pdf_TE + pdf_TS + pdf_TW + + pdf_B + pdf_BN + pdf_BE + pdf_BS + pdf_BW; + + dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz)*3.0/2.0; + + // direction: w_0 + *ppdf_C = pdf_C - omegaEven * (pdf_C - w_0 * dir_indep_trm); + + // direction: w_1 + w_1_indep = w_1 * dir_indep_trm; + + #define COLLIDE_AA_S(tmpUi, dir1, dir2) \ + ui = tmpUi; \ + evenPart = omegaEven * (0.5 * (JOIN(pdf_,dir1) + JOIN(pdf_,dir2)) - ui * ui * w_1_nine_half - w_1_indep); \ + oddPart = omegaOdd * (0.5 * (JOIN(pdf_,dir1) - JOIN(pdf_,dir2)) - ui * w_1_x3); \ + *(JOIN(ppdf_,dir2)) = JOIN(pdf_,dir1) - evenPart - oddPart; \ + *(JOIN(ppdf_,dir1)) = JOIN(pdf_,dir2) - evenPart + oddPart; + + COLLIDE_AA_S(uy, N, S) + COLLIDE_AA_S(ux, E, W) + COLLIDE_AA_S(uz, T, B) + + #undef COLLIDE_AA_S + + // direction: w_2 + w_2_indep = w_2 * dir_indep_trm; + + #define COLLIDE_UA_S(tmpUi, dir1, dir2) \ + ui = tmpUi; \ + evenPart = omegaEven * (0.5 * (JOIN(pdf_,dir1) + JOIN(pdf_,dir2)) - ui * ui * w_2_nine_half - w_2_indep); \ + oddPart = omegaOdd * (0.5 * (JOIN(pdf_,dir1) - JOIN(pdf_,dir2)) - ui * w_2_x3); \ + *(JOIN(ppdf_,dir2)) = JOIN(pdf_,dir1) - evenPart - oddPart; \ + *(JOIN(ppdf_,dir1)) = JOIN(pdf_,dir2) - evenPart + oddPart; + + COLLIDE_UA_S((-ux + uy), NW, SE) + COLLIDE_UA_S(( ux + uy), NE, SW) + COLLIDE_UA_S((-ux + uz), TW, BE) + COLLIDE_UA_S(( ux + uz), TE, BW) + COLLIDE_UA_S((-uy + uz), TS, BN) + COLLIDE_UA_S(( uy + uz), TN, BS) + + #undef COLLIDE_UA_S + + offset_ppdf_C = 1; + } + + } // loop over fluid nodes + +#undef ADJ_LIST +#undef I +} diff --git a/src/BenchKernelD3Q19ListAaPvGather.h b/src/BenchKernelD3Q19ListAaPvGather.h new file mode 100644 index 0000000..e9ae4b0 --- /dev/null +++ b/src/BenchKernelD3Q19ListAaPvGather.h @@ -0,0 +1,44 @@ +// -------------------------------------------------------------------------- +// +// Copyright +// Markus Wittmann, 2016-2017 +// RRZE, University of Erlangen-Nuremberg, Germany +// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de +// +// Viktor Haag, 2016 +// LSS, University of Erlangen-Nuremberg, Germany +// +// Michael Hussnaetter, 2017-2018 +// University of Erlangen-Nuremberg, Germany +// michael.hussnaetter -at- fau.de +// +// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). +// +// LbmBenchKernels is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// LbmBenchKernels is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. 
+// +// You should have received a copy of the GNU General Public License +// along with LbmBenchKernels. If not, see . +// +// -------------------------------------------------------------------------- +#ifndef __BENCH_KERNEL_D3Q19_LIST_AA_PV_GATHER__ +#define __BENCH_KERNEL_D3Q19_LIST_AA_PV_GATHER__ + +#include "Kernel.h" + +void D3Q19ListAaPvGatherInit_PushSoA(LatticeDesc * ld, KernelData ** kernelData, Parameters * params); +void D3Q19ListAaPvGatherInit_PushAoSoA(LatticeDesc * ld, KernelData ** kernelData, Parameters * params); + +void D3Q19ListAaPvGatherDeinit_PushSoA(LatticeDesc * ld, KernelData ** kernelData); +void D3Q19ListAaPvGatherDeinit_PushAoSoA(LatticeDesc * ld, KernelData ** kernelData); + + + +#endif // __BENCH_KERNEL_D3Q19_LIST_AA_PV_GATHER__ diff --git a/src/BenchKernelD3Q19ListAaPvGatherAoSoA.c b/src/BenchKernelD3Q19ListAaPvGatherAoSoA.c new file mode 100644 index 0000000..478c5d7 --- /dev/null +++ b/src/BenchKernelD3Q19ListAaPvGatherAoSoA.c @@ -0,0 +1,834 @@ +// -------------------------------------------------------------------------- +// +// Copyright +// Markus Wittmann, 2016-2017 +// RRZE, University of Erlangen-Nuremberg, Germany +// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de +// +// Viktor Haag, 2016 +// LSS, University of Erlangen-Nuremberg, Germany +// +// Michael Hussnaetter, 2017-2018 +// University of Erlangen-Nuremberg, Germany +// michael.hussnaetter -at- fau.de +// +// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). +// +// LbmBenchKernels is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// LbmBenchKernels is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with LbmBenchKernels. If not, see . 
+// +// -------------------------------------------------------------------------- +#include "BenchKernelD3Q19ListAaPvGatherAoSoACommon.h" + +#include "Base.h" +#include "Memory.h" +#include "Vtk.h" +#include "Vector.h" + +#include +#include + +#ifdef _OPENMP +#include +#endif + +#ifdef LIKWID_PERFMON +#include +#else +#define LIKWID_MARKER_INIT +#define LIKWID_MARKER_THREADINIT +#define LIKWID_MARKER_SWITCH +#define LIKWID_MARKER_REGISTER(regionTag) +#define LIKWID_MARKER_START(regionTag) +#define LIKWID_MARKER_STOP(regionTag) +#define LIKWID_MARKER_CLOSE +#define LIKWID_MARKER_GET(regionTag, nevents, events, time, count) +#endif + +//enable software prefetchting for vectorized gather/scatter loop in odd kernel +#ifndef SOFTWARE_PREFETCH_LOOKAHEAD_L2 +#define SOFTWARE_PREFETCH_LOOKAHEAD_L2 (0) //prefetchting X SIMD widths ahead +#endif + +#ifndef SOFTWARE_PREFETCH_LOOKAHEAD_L1 +#define SOFTWARE_PREFETCH_LOOKAHEAD_L1 (0) //prefetchting X SIMD widths ahead +#endif + +static void KernelEven(LatticeDesc * ld, KernelData * kernelData, CaseData * cd, int * threadIndices); +static void KernelOdd( LatticeDesc * ld, KernelData * kernelData, CaseData * cd, int * threadIndices); + +void FNAME(D3Q19ListAaPvGatherAoSoAKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd) +{ + + Assert(ld != NULL); + Assert(kernelData != NULL); + Assert(cd != NULL); + + Assert(cd->Omega > 0.0); + Assert(cd->Omega < 2.0); + +#if defined(VTK_OUTPUT) || defined(STATISTICS) || defined(VERIFICATION) + KernelData * kd = (KernelData *)kernelData; +#endif + KernelDataList * kdl = KDL(kernelData); + + int maxIterations = cd->MaxIterations; + int nFluid = kdl->nFluid; + + printf("\n"); +#if (SOFTWARE_PREFETCH_LOOKAHEAD_L2 > 0) || (SOFTWARE_PREFETCH_LOOKAHEAD_L1 > 0) + printf("# Software prefetching enabled:\n"); + printf("# Gather/Scatter prefetch lookahead L2: \t%d\n", SOFTWARE_PREFETCH_LOOKAHEAD_L2); + printf("# Gather/Scatter prefetch lookahead L1: \t%d\n", SOFTWARE_PREFETCH_LOOKAHEAD_L1); +#else + printf("# Software prefetching disabled.\n"); +#endif + printf("\n"); + + int nThreads = 1; +#ifdef _OPENMP + nThreads = omp_get_max_threads(); +#endif + + int * threadIndices = (int *)malloc(sizeof(int) * (nThreads + 1)); + for (int i = 0; i < nThreads; ++i) { + threadIndices[i] = i * (nFluid / nThreads) + MinI(i, nFluid % nThreads); + } + + threadIndices[nThreads] = nFluid; + +#ifdef VTK_OUTPUT + if (cd->VtkOutput) { + kd->PdfsActive = kd->Pdfs[0]; + VtkWrite(ld, kd, cd, -1); + } +#endif + +#ifdef STATISTICS + kd->PdfsActive = kd->Pdfs[0]; + KernelStatistics(kd, ld, cd, 0); +#endif + + LIKWID_MARKER_INIT; + + X_KERNEL_START(kernelData); + + // TODO: outer openmp parallel + + LIKWID_MARKER_START("OuterLoop"); + for(int iter = 0; iter < maxIterations; iter += 2) { + + // even time step + +#ifdef _OPENMP +#pragma omp parallel default(none) shared(ld, kernelData, cd, threadIndices) +#endif + { + KernelEven(ld, kernelData, cd, threadIndices); + } + + +#ifdef VERIFICATION + kdl->Iteration = iter; + kd->PdfsActive = kd->Pdfs[0]; + KernelAddBodyForce(kd, ld, cd); +#endif + + // odd time step + +#ifdef _OPENMP +#pragma omp parallel default(none) shared(ld, kernelData, cd, threadIndices) +#endif + { + KernelOdd(ld, kernelData, cd, threadIndices); + } + + +#ifdef VERIFICATION + kdl->Iteration = iter + 1; + kd->PdfsActive = kd->Pdfs[0]; + KernelAddBodyForce(kd, ld, cd); +#endif + +#ifdef VTK_OUTPUT + if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) { + kdl->Iteration = iter + 1; + kd->PdfsActive = kd->Pdfs[0]; + VtkWrite(ld, 
kd, cd, iter); + } +#endif + +#ifdef STATISTICS + kdl->Iteration = iter + 1; + kd->PdfsActive = kd->Pdfs[0]; + KernelStatistics(kd, ld, cd, iter); +#endif + + } // for (int iter = 0; ... + LIKWID_MARKER_STOP("OuterLoop"); + + X_KERNEL_END(kernelData); + +#ifdef VTK_OUTPUT + if (cd->VtkOutput) { + kd->PdfsActive = kd->Pdfs[0]; + VtkWrite(ld, kd, cd, maxIterations); + } +#endif + +#ifdef STATISTICS + kd->PdfsActive = kd->Pdfs[0]; + KernelStatistics(kd, ld, cd, maxIterations); +#endif + + LIKWID_MARKER_CLOSE; + free(threadIndices); + + return; +} + +static void KernelEven(LatticeDesc * ld, KernelData * kernelData, CaseData * cd, int * threadIndices) +{ + Assert(ld != NULL); + Assert(kernelData != NULL); + Assert(cd != NULL); + + Assert(cd->Omega > 0.0); + Assert(cd->Omega < 2.0); + + KernelData * kd = (KernelData *)kernelData; + KernelDataList * kdl = KDL(kernelData); + + PdfT omega = cd->Omega; + PdfT omegaEven = omega; + + PdfT magicParam = 1.0 / 12.0; + PdfT omegaOdd = 1.0 / (0.5 + magicParam / (1.0 / omega - 0.5)); + + PdfT evenPart = 0.0; + PdfT oddPart = 0.0; + PdfT dir_indep_trm = 0.0; + + const PdfT w_0 = 1.0 / 3.0; + const PdfT w_1 = 1.0 / 18.0; + const PdfT w_2 = 1.0 / 36.0; + + const PdfT w_1_x3 = w_1 * 3.0; const PdfT w_1_nine_half = w_1 * 9.0 / 2.0; PdfT w_1_indep = 0.0; + const PdfT w_2_x3 = w_2 * 3.0; const PdfT w_2_nine_half = w_2 * 9.0 / 2.0; PdfT w_2_indep = 0.0; + + PdfT ux, uy, uz, ui; + PdfT dens; + + VPDFT VONE_HALF = VSET(0.5); + VPDFT VTHREE_HALF = VSET(3.0 / 2.0); + + VPDFT vw_1_indep, vw_2_indep; + VPDFT vw_0 = VSET(w_0); + VPDFT vw_1 = VSET(w_1); + VPDFT vw_2 = VSET(w_2); + + VPDFT vw_1_x3 = VSET(w_1_x3); + VPDFT vw_2_x3 = VSET(w_2_x3); + VPDFT vw_1_nine_half = VSET(w_1_nine_half); + VPDFT vw_2_nine_half = VSET(w_2_nine_half); + + VPDFT vui, vux, vuy, vuz, vdens; + + VPDFT vevenPart, voddPart, vdir_indep_trm; + + VPDFT vomegaEven = VSET(omegaEven); + VPDFT vomegaOdd = VSET(omegaOdd); + + // Declare pdf_N, pdf_E, pdf_S, pdf_W, ... 
+ #define X(name, idx, idxinv, x, y, z) \ + PdfT JOIN(pdf_,name); \ + PdfT * JOIN(ppdf_,name); \ + VPDFT JOIN(vpdf_,name); + D3Q19_LIST + #undef X + + PdfT * src = kd->Pdfs[0]; + + int nCells = kdl->nCells; + + int threadId = 0; +#ifdef _OPENMP + threadId = omp_get_thread_num(); +#endif + + + int indexStart = threadIndices[threadId]; + int nFluidThread = threadIndices[threadId + 1] - threadIndices[threadId]; + int indexStop = indexStart + nFluidThread; + + int indexStartVec = ((indexStart + VSIZE - 1) / VSIZE) * VSIZE; + int nFluidVec = (indexStop / VSIZE) * VSIZE - indexStartVec; + int indexStopVec = indexStartVec + nFluidVec; + + #define I(index, dir) P_INDEX_3((nCells), (index), (dir)) + + #define X(name, idx, idxinv, _x, _y, _z) JOIN(ppdf_,name) = &(src[I(indexStart, idx)]); + D3Q19_LIST + #undef X + + for (int index = indexStart; index < indexStartVec; ++index) { + + #define X(name, idx, idxinv, _x, _y, _z) JOIN(pdf_,name) = *(JOIN(ppdf_,name)); + D3Q19_LIST + #undef X + + ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE - + pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW; + uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN - + pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS; + uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS - + pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS; + + dens = pdf_C + + pdf_N + pdf_E + pdf_S + pdf_W + + pdf_NE + pdf_SE + pdf_SW + pdf_NW + + pdf_T + pdf_TN + pdf_TE + pdf_TS + pdf_TW + + pdf_B + pdf_BN + pdf_BE + pdf_BS + pdf_BW; + + dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz)*3.0/2.0; + + // direction: w_0 + *ppdf_C = pdf_C - omegaEven*(pdf_C - w_0*dir_indep_trm); + + // direction: w_1 + w_1_indep = w_1*dir_indep_trm; + + #define COLLIDE_AA_S(tmpUi, dir1, dir2) \ + ui = tmpUi; \ + evenPart = omegaEven * (0.5 * (JOIN(pdf_,dir1) + JOIN(pdf_,dir2)) - ui * ui * w_1_nine_half - w_1_indep); \ + oddPart = omegaOdd * (0.5 * (JOIN(pdf_,dir1) - JOIN(pdf_,dir2)) - ui * w_1_x3); \ + *(JOIN(ppdf_,dir2)) = JOIN(pdf_,dir1) - evenPart - oddPart; \ + *(JOIN(ppdf_,dir1)) = JOIN(pdf_,dir2) - evenPart + oddPart; + + COLLIDE_AA_S(uy, N, S) + COLLIDE_AA_S(ux, E, W) + COLLIDE_AA_S(uz, T, B) + + #undef COLLIDE_AA_S + + // direction: w_2 + w_2_indep = w_2*dir_indep_trm; + + #define COLLIDE_UA_S(tmpUi, dir1, dir2) \ + ui = tmpUi; \ + evenPart = omegaEven * (0.5 * (JOIN(pdf_,dir1) + JOIN(pdf_,dir2)) - ui * ui * w_2_nine_half - w_2_indep); \ + oddPart = omegaOdd * (0.5 * (JOIN(pdf_,dir1) - JOIN(pdf_,dir2)) - ui * w_2_x3); \ + *(JOIN(ppdf_,dir2)) = JOIN(pdf_,dir1) - evenPart - oddPart; \ + *(JOIN(ppdf_,dir1)) = JOIN(pdf_,dir2) - evenPart + oddPart; + + COLLIDE_UA_S((-ux + uy), NW, SE) + COLLIDE_UA_S(( ux + uy), NE, SW) + COLLIDE_UA_S((-ux + uz), TW, BE) + COLLIDE_UA_S(( ux + uz), TE, BW) + COLLIDE_UA_S((-uy + uz), TS, BN) + COLLIDE_UA_S(( uy + uz), TN, BS) + + #undef COLLIDE_UA_S + + #define X(name, idx, idxinv, _x, _y, _z) JOIN(ppdf_,name)++; + D3Q19_LIST + #undef X + } + + #define X(name, idx, idxinv, _x, _y, _z) JOIN(ppdf_,name) = &(src[I(indexStartVec, idx)]); + D3Q19_LIST + #undef X + + for (int index = indexStartVec; index < indexStopVec; index += VSIZE) { + + #if (SOFTWARE_PREFETCH_LOOKAHEAD_L2 > 0) + #define X(name, idx, idxinv, _x, _y, _z) _mm_prefetch((char const *)(JOIN(ppdf_,name) + SOFTWARE_PREFETCH_LOOKAHEAD_L2 * VSIZE * N_D3Q19), _MM_HINT_T1); + D3Q19_LIST + #undef X + #endif + + #if (SOFTWARE_PREFETCH_LOOKAHEAD_L1 > 0) + #define X(name, idx, idxinv, _x, _y, _z) _mm_prefetch((char const *)(JOIN(ppdf_,name) + SOFTWARE_PREFETCH_LOOKAHEAD_L1 * VSIZE * N_D3Q19), _MM_HINT_T0); + D3Q19_LIST + 
#undef X + #endif + + #define X(name, idx, idxinv, _x, _y, _z) JOIN(vpdf_,name) = VLDU(JOIN(ppdf_,name)); + D3Q19_LIST + #undef X + + //vux = vpdf_E + vpdf_NE + vpdf_SE + vpdf_TE + vpdf_BE - + // vpdf_W - vpdf_NW - vpdf_SW - vpdf_TW - vpdf_BW; + vux = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_E,VADD(vpdf_NE,vpdf_SE)),VADD(vpdf_TE,vpdf_BE)),vpdf_W),vpdf_NW),vpdf_SW),vpdf_TW),vpdf_BW); + //vuy = vpdf_N + vpdf_NE + vpdf_NW + vpdf_TN + vpdf_BN - + // vpdf_S - vpdf_SE - vpdf_SW - vpdf_TS - vpdf_BS; + vuy = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_N,VADD(vpdf_NE,vpdf_NW)),VADD(vpdf_TN,vpdf_BN)),vpdf_S),vpdf_SE),vpdf_SW),vpdf_TS),vpdf_BS); + //vuz = vpdf_T + vpdf_TE + vpdf_TW + vpdf_TN + vpdf_TS - + // vpdf_B - vpdf_BE - vpdf_BW - vpdf_BN - vpdf_BS; + vuz = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_T,VADD(vpdf_TE,vpdf_TW)),VADD(vpdf_TN,vpdf_TS)),vpdf_B),vpdf_BE),vpdf_BW),vpdf_BN),vpdf_BS); + + //vdens = vpdf_C + + // vpdf_N + vpdf_E + vpdf_S + vpdf_W + + // vpdf_NE + vpdf_SE + vpdf_SW + vpdf_NW + + // vpdf_T + vpdf_TN + vpdf_TE + vpdf_TS + vpdf_TW + + // vpdf_B + vpdf_BN + vpdf_BE + vpdf_BS + vpdf_BW; + vdens = VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(vpdf_C,VADD(vpdf_N,vpdf_E)),VADD(vpdf_S,vpdf_W)),VADD(vpdf_NE,vpdf_SE)),VADD(vpdf_SW,vpdf_NW)),VADD(vpdf_T,vpdf_TN)),VADD(vpdf_TE,vpdf_TS)),VADD(vpdf_TW,vpdf_B)),VADD(vpdf_BN,vpdf_BE)),VADD(vpdf_BS,vpdf_BW)); + + //vdir_indep_trm = vdens - (vux * vux + vuy * vuy + vuz * vuz) * VTHREE_HALF; + vdir_indep_trm = VSUB(vdens,VMUL(VADD(VADD(VMUL(vux,vux),VMUL(vuy,vuy)),VMUL(vuz,vuz)),VTHREE_HALF)); + + //src[I(index, D3Q19_C)] =[UA] vpdf_C - vomegaEven * (vpdf_C - vw_0 * vdir_indep_trm); + VSTU(ppdf_C,VSUB(vpdf_C,VMUL(vomegaEven,VSUB(vpdf_C,VMUL(vw_0,vdir_indep_trm))))); + + //vw_1_indep = vw_1 * vdir_indep_trm; + vw_1_indep = VMUL(vw_1,vdir_indep_trm); + + #define COLLIDE_AA_V(tmpVui, dir1, dir2) \ + vui = tmpVui; \ + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(JOIN(vpdf_,dir1),JOIN(vpdf_,dir2))),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep));\ + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(JOIN(vpdf_,dir1),JOIN(vpdf_,dir2))),VMUL(vui,vw_1_x3)));\ + VSTU(JOIN(ppdf_,dir2),VSUB(VSUB(JOIN(vpdf_,dir1),vevenPart),voddPart));\ + VSTU(JOIN(ppdf_,dir1),VADD(VSUB(JOIN(vpdf_,dir2),vevenPart),voddPart)); + + COLLIDE_AA_V(vuy, N, S) + COLLIDE_AA_V(vux, E, W) + COLLIDE_AA_V(vuz, T, B) + + #undef COLLIDE_AA_V + + //vw_2_indep = vw_2 * vdir_indep_trm; + vw_2_indep = VMUL(vw_2,vdir_indep_trm); + + // collide axis unaligned pdfs vectorized + #define COLLIDE_UA_V(tmpVui, dir1, dir2) \ + vui = tmpVui; \ + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(JOIN(vpdf_,dir1),JOIN(vpdf_,dir2))),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));\ + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(JOIN(vpdf_,dir1),JOIN(vpdf_,dir2))),VMUL(vui,vw_2_x3)));\ + VSTU(JOIN(ppdf_,dir2),VSUB(VSUB(JOIN(vpdf_,dir1),vevenPart),voddPart)); \ + VSTU(JOIN(ppdf_,dir1),VADD(VSUB(JOIN(vpdf_,dir2),vevenPart),voddPart)); + + COLLIDE_UA_V(VSUB(vuy,vux), NW, SE) + COLLIDE_UA_V(VADD(vux,vuy), NE, SW) + COLLIDE_UA_V(VSUB(vuz,vux), TW, BE) + COLLIDE_UA_V(VADD(vux,vuz), TE, BW) + COLLIDE_UA_V(VSUB(vuz,vuy), TS, BN) + COLLIDE_UA_V(VADD(vuy,vuz), TN, BS) + + #undef COLLIDE_UA_V + + #define X(name, idx, idxinv, _x, _y, _z) JOIN(ppdf_,name) +=(VSIZE * N_D3Q19); + D3Q19_LIST + #undef X + } // loop over fluid nodes + + for (int index = indexStopVec; index < indexStop; ++index) { + + #define X(name, idx, idxinv, _x, _y, _z) JOIN(pdf_,name) = *(JOIN(ppdf_,name)); + D3Q19_LIST + #undef 
X + + ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE - + pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW; + uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN - + pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS; + uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS - + pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS; + + dens = pdf_C + + pdf_N + pdf_E + pdf_S + pdf_W + + pdf_NE + pdf_SE + pdf_SW + pdf_NW + + pdf_T + pdf_TN + pdf_TE + pdf_TS + pdf_TW + + pdf_B + pdf_BN + pdf_BE + pdf_BS + pdf_BW; + + dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz)*3.0/2.0; + + // direction: w_0 + *ppdf_C = pdf_C - omegaEven*(pdf_C - w_0*dir_indep_trm); + + // direction: w_1 + w_1_indep = w_1*dir_indep_trm; + + #define COLLIDE_AA_S(tmpUi, dir1, dir2) \ + ui = tmpUi; \ + evenPart = omegaEven * (0.5 * (JOIN(pdf_,dir1) + JOIN(pdf_,dir2)) - ui * ui * w_1_nine_half - w_1_indep); \ + oddPart = omegaOdd * (0.5 * (JOIN(pdf_,dir1) - JOIN(pdf_,dir2)) - ui * w_1_x3); \ + *(JOIN(ppdf_,dir2)) = JOIN(pdf_,dir1) - evenPart - oddPart; \ + *(JOIN(ppdf_,dir1)) = JOIN(pdf_,dir2) - evenPart + oddPart; + + COLLIDE_AA_S(uy, N, S) + COLLIDE_AA_S(ux, E, W) + COLLIDE_AA_S(uz, T, B) + + #undef COLLIDE_AA_S + + // direction: w_2 + w_2_indep = w_2*dir_indep_trm; + + #define COLLIDE_UA_S(tmpUi, dir1, dir2) \ + ui = tmpUi; \ + evenPart = omegaEven * (0.5 * (JOIN(pdf_,dir1) + JOIN(pdf_,dir2)) - ui * ui * w_2_nine_half - w_2_indep); \ + oddPart = omegaOdd * (0.5 * (JOIN(pdf_,dir1) - JOIN(pdf_,dir2)) - ui * w_2_x3); \ + *(JOIN(ppdf_,dir2)) = JOIN(pdf_,dir1) - evenPart - oddPart; \ + *(JOIN(ppdf_,dir1)) = JOIN(pdf_,dir2) - evenPart + oddPart; + + COLLIDE_UA_S((-ux + uy), NW, SE) + COLLIDE_UA_S(( ux + uy), NE, SW) + COLLIDE_UA_S((-ux + uz), TW, BE) + COLLIDE_UA_S(( ux + uz), TE, BW) + COLLIDE_UA_S((-uy + uz), TS, BN) + COLLIDE_UA_S(( uy + uz), TN, BS) + + #undef COLLIDE_UA_S + + #define X(name, idx, idxinv, _x, _y, _z) JOIN(ppdf_,name)++; + D3Q19_LIST + #undef X + } // loop over fluid nodes + + #undef I + + return; +} + +static void KernelOdd(LatticeDesc * ld, KernelData * kernelData, CaseData * cd, int * threadIndices) +{ + + Assert(ld != NULL); + Assert(kernelData != NULL); + Assert(cd != NULL); + + Assert(cd->Omega > 0.0); + Assert(cd->Omega < 2.0); + + KernelData * kd = (KernelData *)kernelData; + KernelDataList * kdl = KDL(kernelData); + KernelDataListRia * kdlr = KDLR(kernelData); + PdfT omega = cd->Omega; + PdfT omegaEven = omega; + + PdfT magicParam = 1.0 / 12.0; + PdfT omegaOdd = 1.0 / (0.5 + magicParam / (1.0 / omega - 0.5)); + + PdfT evenPart = 0.0; + PdfT oddPart = 0.0; + PdfT dir_indep_trm = 0.0; + + const PdfT w_0 = 1.0 / 3.0; + const PdfT w_1 = 1.0 / 18.0; + const PdfT w_2 = 1.0 / 36.0; + + const PdfT w_1_x3 = w_1 * 3.0; const PdfT w_1_nine_half = w_1 * 9.0 / 2.0; PdfT w_1_indep = 0.0; + const PdfT w_2_x3 = w_2 * 3.0; const PdfT w_2_nine_half = w_2 * 9.0 / 2.0; PdfT w_2_indep = 0.0; + + PdfT ux, uy, uz, ui; + PdfT dens; + + VPDFT VONE_HALF = VSET(0.5); + VPDFT VTHREE_HALF = VSET(3.0 / 2.0); + + VPDFT vw_1_indep, vw_2_indep; + VPDFT vw_0 = VSET(w_0); + VPDFT vw_1 = VSET(w_1); + VPDFT vw_2 = VSET(w_2); + + VPDFT vw_1_x3 = VSET(w_1_x3); + VPDFT vw_2_x3 = VSET(w_2_x3); + VPDFT vw_1_nine_half = VSET(w_1_nine_half); + VPDFT vw_2_nine_half = VSET(w_2_nine_half); + + VPDFT vux, vuy, vuz, vui; + VPDFT vdens; + + VPDFT vevenPart, voddPart, vdir_indep_trm; + + VPDFT vomegaEven = VSET(omegaEven); + VPDFT vomegaOdd = VSET(omegaOdd); + + // Declare pdf_N, pdf_E, pdf_S, pdf_W, ... 
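+	// For each direction the X macro below expands to three definitions,
+	// e.g. for N (north):
+	//
+	//   PdfT   pdf_N  = 0;     // scalar PDF value for the peel/remainder path
+	//   PdfT * ppdf_N = NULL;  // pointer to this PDF in the source array
+	//   VPDFT  vpdf_N;         // vector register holding VSIZE such PDFs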
+ #define X(name, idx, idxinv, x, y, z) \ + PdfT JOIN(pdf_,name) = 0; \ + PdfT * JOIN(ppdf_,name) = NULL; \ + VPDFT JOIN(vpdf_,name); + D3Q19_LIST + #undef X + + #define X(name, idx, idxinv, x, y, z) \ + __m256i JOIN(vgatheridx_,name) = _mm256_set1_epi32(0); + D3Q19_LIST_WO_C + #undef X + + __m256i vgatherinc = VSETI32(VSIZE * N_D3Q19); + + uint32_t * consecNodes = kdlr->ConsecNodes; + uint32_t consecIndex = 0; + uint32_t consecValue = 0; + + PdfT * src = kd->Pdfs[0]; + + int nCells = kdl->nCells; + + int adjListIndex; + uint32_t * adjList = kdl->AdjList; + + int threadId = 0; + +#ifdef _OPENMP + threadId = omp_get_thread_num(); +#endif + consecIndex = kdlr->ConsecThreadIndices[threadId]; + consecValue = 0; + + int nFluidThread = threadIndices[threadId + 1] - threadIndices[threadId]; + + int indexStart = threadIndices[threadId]; + int indexStop = threadIndices[threadId] + nFluidThread; + + #define I(index, dir) P_INDEX_3((nCells), (index), (dir)) + #define ADJ_LIST(dir) adjList[adjListIndex + (dir * VSIZE)] + + int offset_ppdf_C = -1; //dummy init to detect errors + + for (int index = indexStart; index < indexStop; index += 1) { + + if (consecValue > 0) { + --consecValue; + // Increment all pdf pointers by an offset. If the previous iteration was + // scalar, increment only by one. If the previous iteration was vectorized, + // increment by the vector width. These offsets are set in the corresponding + // if branches. + + //increment offsets + + #define X(name, idx, idxinv, _x, _y, _z) JOIN(vgatheridx_,name) = VADDI32(JOIN(vgatheridx_,name), vgatherinc); + D3Q19_LIST_WO_C + #undef X + + //printVector(vgatheridx_N); + + ppdf_C += offset_ppdf_C; + + } + else { + // Load new pointers to PDFs of local cell: + Assert(consecIndex < nConsecNodes); + + consecValue = consecNodes[consecIndex] - 1; + + adjListIndex = (index - (index % VSIZE)) * N_D3Q19_IDX + (index % VSIZE); + #define X(name, idx, idxinv, _x, _y, _z) JOIN(vgatheridx_,name) = VLIU(&(ADJ_LIST(idxinv))); + D3Q19_LIST_WO_C + #undef X + + ppdf_C = &(src[P_INDEX_3(nCells, index, D3Q19_C)]); + ++consecIndex; + } + + if (consecValue >= (VSIZE - 1)) { + // Vectorized part. 
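+			// At least VSIZE - 1 further cells share this chunk's access
+			// pattern, so a full SIMD chunk is processed at once: the 18
+			// non-center PDFs are fetched with gather loads (VG32) through
+			// the 32-bit index vectors vgatheridx_* and written back with
+			// scatter stores; only the center PDF goes through ppdf_C.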
+ + #if (SOFTWARE_PREFETCH_LOOKAHEAD_L2 > 0) + int const indexPrefetchL2 = index + VSIZE * SOFTWARE_PREFETCH_LOOKAHEAD_L2; + // make sure that adjList access is never out of bounds since it is an actual memory access and no prefetch + if (indexPrefetchL2 < indexStop){ + // update pointers from adjacency list only if necessary + if (consecValue >= (SOFTWARE_PREFETCH_LOOKAHEAD_L2 * VSIZE + VSIZE - 1)) { + #define INCR_PTR(name) (VADDI32(JOIN(vgatheridx_,name), VMULI32(vgatherinc, VSETI32(SOFTWARE_PREFETCH_LOOKAHEAD_L2)))) + #define X(name, idx, idxinv, _x, _y, _z) VPG32(INCR_PTR(name), (char const *) src, 8, _MM_HINT_T1); + D3Q19_LIST_WO_C + #undef X + #undef INCR_PTR + } + else { + adjListIndex = (indexPrefetchL2 - (indexPrefetchL2 % VSIZE)) * N_D3Q19_IDX + (indexPrefetchL2 % VSIZE); + #define X(name, idx, idxinv, _x, _y, _z) VPG32(VLIU(&ADJ_LIST(idxinv)), (char const *) src, 8, _MM_HINT_T1); + D3Q19_LIST_WO_C + #undef X + } + + _mm_prefetch((char const *) &(src[P_INDEX_3(nCells, indexPrefetchL2, D3Q19_C)]), _MM_HINT_T1); + } + #endif + + #if (SOFTWARE_PREFETCH_LOOKAHEAD_L1 > 0) + int const indexPrefetchL1 = index + VSIZE * SOFTWARE_PREFETCH_LOOKAHEAD_L1; + // make sure that adjList access is never out of bounds since it is an actual memory access and no prefetch + if (indexPrefetchL1 < indexStop){ + // update pointers from adjacency list only if necessary + if (consecValue > (SOFTWARE_PREFETCH_LOOKAHEAD_L1 * VSIZE + VSIZE - 1)) { + #define INCR_PTR(name) (VADDI32(JOIN(vgatheridx_,name), VMULI32(vgatherinc, VSETI32(SOFTWARE_PREFETCH_LOOKAHEAD_L1)))) + #define X(name, idx, idxinv, _x, _y, _z) VPG32(INCR_PTR(name), (char const *) src, 8, _MM_HINT_T0); + D3Q19_LIST_WO_C + #undef X + #undef INCR_PTR + } + else { + adjListIndex = (indexPrefetchL1 - (indexPrefetchL1 % VSIZE)) * N_D3Q19_IDX + (indexPrefetchL1 % VSIZE); + #define X(name, idx, idxinv, _x, _y, _z) VPG32(VLIU(&ADJ_LIST(idxinv)), (char const *) src, 8, _MM_HINT_T0); + D3Q19_LIST_WO_C + #undef X + } + + _mm_prefetch((char const *) &(src[P_INDEX_3(nCells, indexPrefetchL1, D3Q19_C)]), _MM_HINT_T0); + } + #endif + + #define X(name, idx, idxinv, _x, _y, _z) JOIN(vpdf_,name) = VG32(JOIN(vgatheridx_,name), src, 8); + D3Q19_LIST_WO_C + #undef X + + vpdf_C = VLDU(ppdf_C); + + //vux = vpdf_E + vpdf_NE + vpdf_SE + vpdf_TE + vpdf_BE - + // vpdf_W - vpdf_NW - vpdf_SW - vpdf_TW - vpdf_BW; + vux = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_E,VADD(vpdf_NE,vpdf_SE)),VADD(vpdf_TE,vpdf_BE)),vpdf_W),vpdf_NW),vpdf_SW),vpdf_TW),vpdf_BW); + //vuy = vpdf_N + vpdf_NE + vpdf_NW + vpdf_TN + vpdf_BN - + // vpdf_S - vpdf_SE - vpdf_SW - vpdf_TS - vpdf_BS; + vuy = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_N,VADD(vpdf_NE,vpdf_NW)),VADD(vpdf_TN,vpdf_BN)),vpdf_S),vpdf_SE),vpdf_SW),vpdf_TS),vpdf_BS); + //vuz = vpdf_T + vpdf_TE + vpdf_TW + vpdf_TN + vpdf_TS - + // vpdf_B - vpdf_BE - vpdf_BW - vpdf_BN - vpdf_BS; + vuz = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_T,VADD(vpdf_TE,vpdf_TW)),VADD(vpdf_TN,vpdf_TS)),vpdf_B),vpdf_BE),vpdf_BW),vpdf_BN),vpdf_BS); + + //vdens = vpdf_C + + // vpdf_N + vpdf_E + vpdf_S + vpdf_W + + // vpdf_NE + vpdf_SE + vpdf_SW + vpdf_NW + + // vpdf_T + vpdf_TN + vpdf_TE + vpdf_TS + vpdf_TW + + // vpdf_B + vpdf_BN + vpdf_BE + vpdf_BS + vpdf_BW; + vdens = VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(vpdf_C,VADD(vpdf_N,vpdf_E)),VADD(vpdf_S,vpdf_W)),VADD(vpdf_NE,vpdf_SE)), + VADD(vpdf_SW,vpdf_NW)),VADD(vpdf_T,vpdf_TN)),VADD(vpdf_TE,vpdf_TS)),VADD(vpdf_TW,vpdf_B)),VADD(vpdf_BN,vpdf_BE)),VADD(vpdf_BS,vpdf_BW)); + + //vdir_indep_trm = vdens - (vux * vux + vuy * vuy 
+			//		+ vuz * vuz) * VTHREE_HALF;
+			vdir_indep_trm = VSUB(vdens,VMUL(VADD(VADD(VMUL(vux,vux),VMUL(vuy,vuy)),VMUL(vuz,vuz)),VTHREE_HALF));
+
+			//src[I(index, D3Q19_C)] =[UA] vpdf_C - vomegaEven * (vpdf_C - vw_0 * vdir_indep_trm);
+			VSTU(ppdf_C,VSUB(vpdf_C,VMUL(vomegaEven,VSUB(vpdf_C,VMUL(vw_0,vdir_indep_trm)))));
+
+			// scatter-store PDFs back to memory through the gather index vectors
+			#define SCAT(offsets, vsrc) VS32(src, offsets, vsrc, 8)
+
+			//vw_1_indep = vw_1 * vdir_indep_trm;
+			vw_1_indep = VMUL(vw_1,vdir_indep_trm);
+
+			// collide axis aligned pdfs vectorized
+			#define COLLIDE_AA_V(tmpVui, dir1, dir2) \
+				vui = tmpVui; \
+				vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(JOIN(vpdf_,dir1),JOIN(vpdf_,dir2))),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep));\
+				voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(JOIN(vpdf_,dir1),JOIN(vpdf_,dir2))),VMUL(vui,vw_1_x3)));\
+				SCAT(JOIN(vgatheridx_,dir2),VSUB(VSUB(JOIN(vpdf_,dir1),vevenPart),voddPart));\
+				SCAT(JOIN(vgatheridx_,dir1),VADD(VSUB(JOIN(vpdf_,dir2),vevenPart),voddPart));
+
+			COLLIDE_AA_V(vuy, N, S)
+			COLLIDE_AA_V(vux, E, W)
+			COLLIDE_AA_V(vuz, T, B)
+
+			#undef COLLIDE_AA_V
+
+			//vw_2_indep = vw_2 * vdir_indep_trm;
+			vw_2_indep = VMUL(vw_2,vdir_indep_trm);
+
+			// collide axis unaligned pdfs vectorized
+			#define COLLIDE_UA_V(tmpVui, dir1, dir2) \
+				vui = tmpVui; \
+				vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(JOIN(vpdf_,dir1),JOIN(vpdf_,dir2))),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));\
+				voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(JOIN(vpdf_,dir1),JOIN(vpdf_,dir2))),VMUL(vui,vw_2_x3)));\
+				SCAT(JOIN(vgatheridx_,dir2),VSUB(VSUB(JOIN(vpdf_,dir1),vevenPart),voddPart)); \
+				SCAT(JOIN(vgatheridx_,dir1),VADD(VSUB(JOIN(vpdf_,dir2),vevenPart),voddPart));
+
+			COLLIDE_UA_V(VSUB(vuy,vux), NW, SE)
+			COLLIDE_UA_V(VADD(vux,vuy), NE, SW)
+			COLLIDE_UA_V(VSUB(vuz,vux), TW, BE)
+			COLLIDE_UA_V(VADD(vux,vuz), TE, BW)
+			COLLIDE_UA_V(VSUB(vuz,vuy), TS, BN)
+			COLLIDE_UA_V(VADD(vuy,vuz), TN, BS)
+
+			#undef COLLIDE_UA_V
+			#undef SCAT
+
+			consecValue -= (VSIZE - 1);
+			index       += (VSIZE - 1);
+			offset_ppdf_C = VSIZE * N_D3Q19;
+
+		}
+		else {
+			// Scalar part.
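+			// Fewer than VSIZE cells with a common access pattern remain
+			// (this happens around thread boundaries), so the PDFs are
+			// accessed one cell at a time through per-direction pointers
+			// taken from the adjacency list; offset_ppdf_C = 1 then
+			// advances the center pointer by a single cell next iteration.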
+ + adjListIndex = (index - (index % VSIZE)) * N_D3Q19_IDX + (index % VSIZE); + #define X(name, idx, idxinv, _x, _y, _z) JOIN(ppdf_,name) = &(src[ADJ_LIST(idxinv)]); + D3Q19_LIST_WO_C + #undef X + #define X(name, idx, idxinv, _x, _y, _z) JOIN(pdf_,name) = *(JOIN(ppdf_,name)); + D3Q19_LIST_WO_C + #undef X + + pdf_C = *ppdf_C; + + ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE - + pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW; + uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN - + pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS; + uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS - + pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS; + + dens = pdf_C + + pdf_N + pdf_E + pdf_S + pdf_W + + pdf_NE + pdf_SE + pdf_SW + pdf_NW + + pdf_T + pdf_TN + pdf_TE + pdf_TS + pdf_TW + + pdf_B + pdf_BN + pdf_BE + pdf_BS + pdf_BW; + + dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz)*3.0/2.0; + + // direction: w_0 + *ppdf_C = pdf_C - omegaEven * (pdf_C - w_0 * dir_indep_trm); + + // direction: w_1 + w_1_indep = w_1 * dir_indep_trm; + + #define COLLIDE_AA_S(tmpUi, dir1, dir2) \ + ui = tmpUi; \ + evenPart = omegaEven * (0.5 * (JOIN(pdf_,dir1) + JOIN(pdf_,dir2)) - ui * ui * w_1_nine_half - w_1_indep); \ + oddPart = omegaOdd * (0.5 * (JOIN(pdf_,dir1) - JOIN(pdf_,dir2)) - ui * w_1_x3); \ + *(JOIN(ppdf_,dir2)) = JOIN(pdf_,dir1) - evenPart - oddPart; \ + *(JOIN(ppdf_,dir1)) = JOIN(pdf_,dir2) - evenPart + oddPart; + + COLLIDE_AA_S(uy, N, S) + COLLIDE_AA_S(ux, E, W) + COLLIDE_AA_S(uz, T, B) + + #undef COLLIDE_AA_S + + // direction: w_2 + w_2_indep = w_2 * dir_indep_trm; + + #define COLLIDE_UA_S(tmpUi, dir1, dir2) \ + ui = tmpUi; \ + evenPart = omegaEven * (0.5 * (JOIN(pdf_,dir1) + JOIN(pdf_,dir2)) - ui * ui * w_2_nine_half - w_2_indep); \ + oddPart = omegaOdd * (0.5 * (JOIN(pdf_,dir1) - JOIN(pdf_,dir2)) - ui * w_2_x3); \ + *(JOIN(ppdf_,dir2)) = JOIN(pdf_,dir1) - evenPart - oddPart; \ + *(JOIN(ppdf_,dir1)) = JOIN(pdf_,dir2) - evenPart + oddPart; + + COLLIDE_UA_S((-ux + uy), NW, SE) + COLLIDE_UA_S(( ux + uy), NE, SW) + COLLIDE_UA_S((-ux + uz), TW, BE) + COLLIDE_UA_S(( ux + uz), TE, BW) + COLLIDE_UA_S((-uy + uz), TS, BN) + COLLIDE_UA_S(( uy + uz), TN, BS) + + #undef COLLIDE_UA_S + + offset_ppdf_C = 1; + } + + } // loop over fluid nodes + +#undef ADJ_LIST +#undef I +} diff --git a/src/BenchKernelD3Q19ListAaPvGatherAoSoA.h b/src/BenchKernelD3Q19ListAaPvGatherAoSoA.h new file mode 100644 index 0000000..370ad4a --- /dev/null +++ b/src/BenchKernelD3Q19ListAaPvGatherAoSoA.h @@ -0,0 +1,42 @@ +// -------------------------------------------------------------------------- +// +// Copyright +// Markus Wittmann, 2016-2017 +// RRZE, University of Erlangen-Nuremberg, Germany +// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de +// +// Viktor Haag, 2016 +// LSS, University of Erlangen-Nuremberg, Germany +// +// Michael Hussnaetter, 2017-2018 +// University of Erlangen-Nuremberg, Germany +// michael.hussnaetter -at- fau.de +// +// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). +// +// LbmBenchKernels is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// LbmBenchKernels is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. 
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#ifndef __BENCH_KERNEL_D3Q19_LIST_AA_PV_GATHER_AOSOA__
+#define __BENCH_KERNEL_D3Q19_LIST_AA_PV_GATHER_AOSOA__
+
+#include "Kernel.h"
+
+void D3Q19ListAaPvGatherAoSoAInit_PushAoSoA(LatticeDesc * ld, KernelData ** kernelData, Parameters * params);
+
+void D3Q19ListAaPvGatherAoSoADeinit_PushAoSoA(LatticeDesc * ld, KernelData ** kernelData);
+
+
+
+#endif // __BENCH_KERNEL_D3Q19_LIST_AA_PV_GATHER_AOSOA__
diff --git a/src/BenchKernelD3Q19ListAaPvGatherAoSoACommon.c b/src/BenchKernelD3Q19ListAaPvGatherAoSoACommon.c
new file mode 100644
index 0000000..07a552d
--- /dev/null
+++ b/src/BenchKernelD3Q19ListAaPvGatherAoSoACommon.c
@@ -0,0 +1,976 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//   Michael Hussnaetter, 2017-2018
+//   University of Erlangen-Nuremberg, Germany
+//   michael.hussnaetter -at- fau.de
+//
+// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+// LbmBenchKernels is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// LbmBenchKernels is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#include "BenchKernelD3Q19ListAaPvGatherAoSoACommon.h"
+
+#include "Memory.h"
+#include "Vector.h"
+#include "Vtk.h"
+
+#include <math.h>
+
+#ifdef _OPENMP
+	#include <omp.h>
+#endif
+
+#define PAGE_4K 4096
+
+#if ALLOC_ADJ_LIST_IN_HBM == 1
+	#define ADJ_LIST_ALLOCATOR HbwAllocAligned
+	#define ADJ_LIST_FREE      HbwFree
+#else
+	#define ADJ_LIST_ALLOCATOR MemAllocAligned
+	#define ADJ_LIST_FREE      MemFree
+#endif
+
+#if ALLOC_PDF_IN_HBM == 1
+	#define PDF_ALLOCATOR HbwAllocAligned
+	#define PDF_FREE      HbwFree
+#else
+	#define PDF_ALLOCATOR MemAllocAligned
+	#define PDF_FREE      MemFree
+#endif
+
+// Forward definition.
+void FNAME(D3Q19ListAaPvGatherAoSoAKernel)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd);
+
+
+// -----------------------------------------------------------------------
+// Functions which are used as callback by the kernel to read or write
+// PDFs and nodes.
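+//
+// During even time steps the PDFs of a cell are stored at its neighbors
+// (AA pattern), so a PDF in direction dir is found by looking up the
+// inverse direction in the adjacency list; during odd time steps it is
+// stored at the cell itself. With the AoSoA layout the adjacency list
+// entry of node index and direction dir is located at
+//
+//   (index - index % VSIZE) * N_D3Q19_IDX + dir * VSIZE + index % VSIZE
+//
+// i.e. chunk base, plus direction block, plus lane inside the chunk;
+// e.g. with VSIZE = 8 and index = 11 the chunk base is 8 and the lane
+// is 3, so the entry sits at 8 * N_D3Q19_IDX + dir * 8 + 3.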
+ + +static void FNAME(BCGetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT * pdf) +{ + Assert(kd != NULL); + Assert(kd->PdfsActive != NULL); + Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]); + Assert(pdf != NULL); + + Assert(x >= 0); Assert(y >= 0); Assert(z >= 0); + Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]); + Assert(dir >= 0); Assert(dir < N_D3Q19); + + KernelDataList * kdl = KDL(kd); + uint32_t * adjList = kdl->AdjList; + + if (kdl->Iteration % 2 == 0) { + // Pdfs are stored inverse, local PDFs are located in remote nodes + + // getting node index + uint32_t index = kdl->Grid[L_INDEX_4(kd->Dims, x, y, z)]; + + if (dir != D3Q19_C) { + #define ADJ_LIST(dir) adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (dir * VSIZE) + (index % VSIZE)] + *pdf = kd->PdfsActive[ADJ_LIST(D3Q19_INV[dir])]; + #undef ADJ_LIST + } + else { + *pdf = kd->PdfsActive[P_INDEX_3(kdl->nCells, index, dir)]; + } + + } + else { + *pdf = kd->PdfsActive[P_INDEX_5(kdl, x, y, z, dir)]; + } + + return; +} + +static void FNAME(BCSetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT pdf) +{ + Assert(kd != NULL); + Assert(kd->PdfsActive != NULL); + Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]); + Assert(x >= 0); Assert(y >= 0); Assert(z >= 0); + Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]); + Assert(dir >= 0); Assert(dir < N_D3Q19); + + if (isnan(pdf)) { + printf("ERROR: setting nan %d %d %d %d %s\n", x, y, z, dir, D3Q19_NAMES[dir]); + DEBUG_BREAK_POINT(); + exit(1); + } + + KernelDataList * kdl = KDL(kd); + uint32_t * adjList = kdl->AdjList; + + if (kdl->Iteration % 2 == 0) { + // Pdfs are stored inverse, local PDFs are located in remote nodes + + // getting node index + uint32_t index = kdl->Grid[L_INDEX_4(kd->Dims, x, y, z)]; + + if (dir != D3Q19_C) { + #define ADJ_LIST(dir) adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (dir * VSIZE) + (index % VSIZE)] + kd->PdfsActive[ADJ_LIST(D3Q19_INV[dir])] = pdf; + #undef ADJ_LIST + } else { + kd->PdfsActive[P_INDEX_3(kdl->nCells, index, dir)] = pdf; + } + + } else { + kd->PdfsActive[P_INDEX_5(kdl, x, y, z, dir)] = pdf; + } + + return; +} + + +static void GetNode(KernelData * kd, int x, int y, int z, PdfT * pdfs) +{ + Assert(kd != NULL); + Assert(kd->PdfsActive != NULL); + Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]); + Assert(pdfs != NULL); + Assert(x >= 0); Assert(y >= 0); Assert(z >= 0); + Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]); + + KernelDataList * kdl = KDL(kd); + uint32_t * adjList = kdl->AdjList; + + if(kdl->Iteration % 2 == 0){ + + uint32_t index = kdl->Grid[L_INDEX_4(kdl->kd.Dims, x, y, z)]; + + // Load PDFs of local cell: pdf_N = src[adjList[adjListIndex + D3Q19_S]]; ... + #define ADJ_LIST(dir) adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (dir * VSIZE) + (index % VSIZE)] + #define X(name, idx, idxinv, _x, _y, _z) pdfs[idx] = kd->PdfsActive[ADJ_LIST(idxinv)]; + D3Q19_LIST_WO_C + #undef X + #undef ADJ_LIST + pdfs[D3Q19_C] = kd->PdfsActive[P_INDEX_3(kdl->nCells, index, D3Q19_C)]; + + } else { + + #define I(x, y, z, dir) P_INDEX_5(KDL(kd), (x), (y), (z), (dir)) + #define X(name, idx, idxinv, _x, _y, _z) pdfs[idx] = kd->PdfsActive[I(x, y, z, idx)]; + D3Q19_LIST + #undef X + #undef I + + } + + for (int d = 0; d < 19; ++d) { + if(isnan(pdfs[d]) || isinf(pdfs[d])) { + printf("%d %d %d %d nan! 
get node\n", x, y, z, d); + for (int d2 = 0; d2 < 19; ++d2) { + printf("%d: %e\n", d2, pdfs[d2]); + } + exit(1); + } + } + + return; +} + + +static void SetNode(KernelData * kd, int x, int y, int z, PdfT * pdfs) +{ + Assert(kd != NULL); + Assert(kd->PdfsActive != NULL); + Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]); + Assert(pdfs != NULL); + + Assert(x >= 0); Assert(y >= 0); Assert(z >= 0); + Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]); + + for (int d = 0; d < 19; ++d) { + if(isnan(pdfs[d])) { + printf("%d %d %d %d nan! get node\n", x, y, z, d); + for (int d2 = 0; d2 < 19; ++d2) { + printf("%d: %e\n", d2, pdfs[d2]); + } + exit(1); + } + } + + KernelDataList * kdl = KDL(kd); + uint32_t * adjList = kdl->AdjList; + + if(kdl->Iteration % 2 == 0){ + + uint32_t index = kdl->Grid[L_INDEX_4(kdl->kd.Dims, x, y, z)]; + + // Load PDFs of local cell: pdf_N = src[adjList[adjListIndex + D3Q19_S]]; ... + kd->PdfsActive[P_INDEX_3(kdl->nCells, index, D3Q19_C)] = pdfs[D3Q19_C]; + + #define ADJ_LIST(dir) adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (dir * VSIZE) + (index % VSIZE)] + #define X(name, idx, idxinv, _x, _y, _z) kd->PdfsActive[ADJ_LIST(idxinv)] = pdfs[idx]; + D3Q19_LIST_WO_C + #undef X + #undef ADJ_LIST + + } else { + + #define I(x, y, z, dir) P_INDEX_5(KDL(kd), (x), (y), (z), (dir)) + #define X(name, idx, idxinv, _x, _y, _z) kd->PdfsActive[I(x, y, z, idx)] = pdfs[idx]; + D3Q19_LIST + #undef X + #undef I + + } + + return; +} + +static void ParameterUsage() +{ + printf("Kernel parameters:\n"); + printf(" [-blk ] [-blk-[xyz] ]\n"); + + return; +} + +static void ParseParameters(Parameters * params, int * blk) +{ + Assert(blk != NULL); + + blk[0] = 0; blk[1] = 0; blk[2] = 0; + + #define ARG_IS(param) (!strcmp(params->KernelArgs[i], param)) + #define NEXT_ARG_PRESENT() \ + do { \ + if (i + 1 >= params->nKernelArgs) { \ + printf("ERROR: argument %s requires a parameter.\n", params->KernelArgs[i]); \ + exit(1); \ + } \ + } while (0) + + + for (int i = 0; i < params->nKernelArgs; ++i) { + if (ARG_IS("-blk") || ARG_IS("--blk")) { + NEXT_ARG_PRESENT(); + + int tmp = strtol(params->KernelArgs[++i], NULL, 0); + + if (tmp <= 0) { + printf("ERROR: blocking parameter must be > 0.\n"); + exit(1); + } + + blk[0] = blk[1] = blk[2] = tmp; + } + else if (ARG_IS("-blk-x") || ARG_IS("--blk-x")) { + NEXT_ARG_PRESENT(); + + int tmp = strtol(params->KernelArgs[++i], NULL, 0); + + if (tmp <= 0) { + printf("ERROR: blocking parameter must be > 0.\n"); + exit(1); + } + + blk[0] = tmp; + } + else if (ARG_IS("-blk-y") || ARG_IS("--blk-y")) { + NEXT_ARG_PRESENT(); + + int tmp = strtol(params->KernelArgs[++i], NULL, 0); + + if (tmp <= 0) { + printf("ERROR: blocking parameter must be > 0.\n"); + exit(1); + } + + blk[1] = tmp; + } + else if (ARG_IS("-blk-z") || ARG_IS("--blk-z")) { + NEXT_ARG_PRESENT(); + + int tmp = strtol(params->KernelArgs[++i], NULL, 0); + + if (tmp <= 0) { + printf("ERROR: blocking parameter must be > 0.\n"); + exit(1); + } + + blk[2] = tmp; + } + else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) { + ParameterUsage(); + exit(1); + } + else { + printf("ERROR: unknown kernel parameter.\n"); + ParameterUsage(); + exit(1); + } + } + + #undef ARG_IS + #undef NEXT_ARG_PRESENT + + return; +} + +static void SetupConsecNodes(LatticeDesc * ld, KernelDataListRia * kdlr, int nThreads) +{ + Assert(ld != NULL); + Assert(kdlr != NULL); + Assert(nThreads > 0); + + uint32_t * adjList = kdlr->kdl.AdjList; + + uint32_t nConsecNodes = 0; + uint32_t 
consecIndex = 0;
+
+	int nFluid = kdlr->kdl.nFluid;
+
+	uint32_t * consecThreadIndices = (uint32_t *)malloc(sizeof(uint32_t) * (nThreads + 1));
+
+	int nNodesPerThread = nFluid / nThreads;
+
+	for (int i = 0; i < nThreads; ++i) {
+		consecThreadIndices[i] = i * nNodesPerThread + MinI(i, nFluid % nThreads);
+	}
+	consecThreadIndices[nThreads] = nFluid;
+
+	int indexThread = 1;
+	int similarPatterns = 0;
+	int wasLastChunkThreadBoundary = 0;
+	// We execute the following code two times.
+	// - The first time to get the count of how many entries we need for the
+	//   consecNodes array.
+	// - The second time to fill the array.
+
+	// Loop over adjacency list of all nodes.
+	// Compare if adjacent nodes share the same access pattern.
+	for (int fluidBaseIndex = VSIZE; fluidBaseIndex < nFluid; fluidBaseIndex += VSIZE) {
+
+		int hasSimilarAccessPattern = 1;
+
+		// Loop over all directions except the center one.
+		for(int d = 0; d < N_D3Q19 - 1; ++d) {
+			Assert(d != D3Q19_C);
+
+			// check if cache line itself has consecutive memory access pattern
+			for(int inChunkIndex = 0; (inChunkIndex < VSIZE) && ((fluidBaseIndex + inChunkIndex) < nFluid); ++inChunkIndex){
+				int index = fluidBaseIndex + inChunkIndex;
+
+				Assert(index < nFluid);
+
+				#define ADJ_LIST(idx, dir) adjList[((idx) - ((idx) % VSIZE)) * N_D3Q19_IDX + ((dir) * VSIZE) + ((idx) % VSIZE)]
+				//if (ADJ_LIST(index, d) != ADJ_LIST(index-VSIZE, d) + VSIZE) {
+				if (ADJ_LIST(index, d) != ADJ_LIST(index-VSIZE, d) + VSIZE * N_D3Q19_IDX + VSIZE) {
+					//printf("different @: ADJ_LST(%d,%d)=%d != %d=ADJ_LST(%d, %d) + VSIZE\n", index, d, ADJ_LIST(index,d), ADJ_LIST(index-VSIZE,d) + VSIZE * N_D3Q19_IDX + VSIZE, index-VSIZE, d);
+					// Different access pattern.
+					hasSimilarAccessPattern = 0;
+					break;
+				}
+				#undef ADJ_LIST
+			}
+
+			if(!hasSimilarAccessPattern){
+				break; //exit from nested loop
+			}
+		}
+
+		long threadBoundaryIndex = consecThreadIndices[indexThread];
+		if (fluidBaseIndex <= threadBoundaryIndex &&
+				threadBoundaryIndex < fluidBaseIndex + VSIZE) {
+			// Current chunk contains thread boundary.
+			// These chunks are treated by scalar peel and remainder loops
+			// in the kernel of every thread to ensure VSIZE aligned access
+			// to the adjacency list.
+
+			// final cells of current thread
+			++consecIndex;
+
+			// first cells of next thread
+			++indexThread;
+			++consecIndex;
+
+			wasLastChunkThreadBoundary = 1;
+		}
+		else {
+			// We are not at a thread boundary.
+			if (hasSimilarAccessPattern && !wasLastChunkThreadBoundary){
+				++similarPatterns;
+			}
+			else {
+				++consecIndex;
+			}
+
+			wasLastChunkThreadBoundary = 0;
+
+			/*
+			if (!hasSimilarAccessPattern) {
+				++consecIndex;
+			}
+			else {
+				++similarPatterns;
+			}
+			*/
+		}
+	}
+
+	if (nFluid > 0) {
+		nConsecNodes = consecIndex + 1;
+	}
+
+	uint32_t * consecNodes;
+
+	unsigned long consecNodesByte = nConsecNodes * sizeof(uint32_t);
+
+	printf("# Consec. Nodes Array Allocation:\n");
+	printf("# similar patterns\t\t%d\n", similarPatterns);
+	printf("# elements: \t\t%d\n", nConsecNodes);
+	printf("# size: \t\t%e MiB\n", consecNodesByte / 1024.0 / 1024.0);
+	printf("# alignment: \t\t%d b\n", PAGE_4K);
+
+	if (MemAllocAligned((void **)&consecNodes, consecNodesByte, PAGE_4K)) {
+		printf("ERROR: allocating consecNodes array with MemAllocAligned failed: %lu bytes.\n", consecNodesByte);
+		exit(1);
+	}
+	else {
+		printf("# allocator: \t\t\tMemAllocAligned()\n");
+	}
+
+	consecIndex = 0;
+
+	if (nFluid > 0) {
+		consecNodes[consecIndex] = VSIZE;
+	}
+
+	indexThread = 1;
+	consecThreadIndices[0] = 0;
+
+	//add first chunk manually to enable backward check for consecutive pattern
+	consecNodes[consecIndex] = VSIZE;
+
+	wasLastChunkThreadBoundary = 0;
+
+	// Loop over adjacency list of all nodes.
+	// Compare if access pattern does not change on chunk level.
+	// Since gather instructions are used, the access pattern does not have
+	// to be consecutive.
+	for (int fluidBaseIndex = VSIZE; fluidBaseIndex < nFluid; fluidBaseIndex += VSIZE) {
+
+		int hasSimilarAccessPattern = 1;
+
+		// Loop over all directions except the center one.
+		for(int d = 0; d < N_D3Q19 - 1; ++d) {
+			Assert(d != D3Q19_C);
+
+			// check if cache line itself has consecutive memory access pattern
+			for(int inChunkIndex = 0; (inChunkIndex < VSIZE) && ((fluidBaseIndex + inChunkIndex) < nFluid); ++inChunkIndex){
+				int index = fluidBaseIndex + inChunkIndex;
+
+				Assert(index < nFluid);
+
+				#define ADJ_LIST(idx, dir) adjList[((idx) - ((idx) % VSIZE)) * N_D3Q19_IDX + ((dir) * VSIZE) + ((idx) % VSIZE)]
+				//if (ADJ_LIST(index, d) != ADJ_LIST(index-VSIZE, d) + VSIZE) {
+				if (ADJ_LIST(index, d) != ADJ_LIST(index-VSIZE, d) + VSIZE * N_D3Q19_IDX + VSIZE) {
+					// Different access pattern.
+					hasSimilarAccessPattern = 0;
+					break;
+				}
+				#undef ADJ_LIST
+			}
+
+			if(!hasSimilarAccessPattern){
+				break; //exit from nested loop
+			}
+		}
+
+		long threadBoundaryIndex = consecThreadIndices[indexThread];
+		if (fluidBaseIndex <= threadBoundaryIndex &&
+				threadBoundaryIndex < fluidBaseIndex + VSIZE) {
+			// Current chunk contains thread boundary.
+ // These chunks are treated by scalar peel and reminder loops + // in kernel of every thread to ensure VSIZE aligned access to + // adjacency list + + // final cells of current thread + ++consecIndex; + //consecThreadIndices[indexThread] = consecIndex; + consecNodes[consecIndex] = threadBoundaryIndex - fluidBaseIndex; + + + // first cells of next thread + ++consecIndex; + consecThreadIndices[indexThread] = consecIndex; + consecNodes[consecIndex] = (fluidBaseIndex + VSIZE) - threadBoundaryIndex; + ++indexThread; + + wasLastChunkThreadBoundary = 1; + + } + else { + // We are not at a thread boundary + if (hasSimilarAccessPattern && !wasLastChunkThreadBoundary){ + Assert(consecIndex < nConsecNodes); + consecNodes[consecIndex] += VSIZE; + } + else { + ++consecIndex; + Assert(consecIndex < nConsecNodes); + consecNodes[consecIndex] = VSIZE; + } + + /* + if (!hasSimilarAccessPattern) { + ++consecIndex; + Assert(consecIndex < nConsecNodes); + consecNodes[consecIndex] = VSIZE; + } + else { + Assert(consecIndex < nConsecNodes); + consecNodes[consecIndex] += VSIZE; + } + */ + wasLastChunkThreadBoundary = 0; + + } + } + + /* + printf("consecNodes:\n"); + for(int i = 0; i < nConsecNodes + 5; ++i){ + printf("%d ", consecNodes[i]); + } + printf("\n"); + */ + /* + printf("consecThreadIndices:\n"); + for(int i = 0; i < nThreads + 5; ++i){ + printf("%d ", consecThreadIndices[i]); + } + printf("\n"); + */ + + kdlr->ConsecNodes = consecNodes; + kdlr->nConsecNodes = nConsecNodes; + + kdlr->ConsecThreadIndices = consecThreadIndices; + kdlr->nConsecThreadIndices = nThreads; + + double loopBalanceEven = 2.0 * 19 * sizeof(PdfT); + //N_D3Q19 - 1: C lookup not required, +1: transfer of consecValue + double loopBalanceOdd = 2.0 * 19 * sizeof(PdfT) + ((double)nConsecNodes *((N_D3Q19 - 1) * VSIZE + 1)) / nFluid * sizeof(int); + double loopBalance = (loopBalanceEven + loopBalanceOdd) / 2.0; + + kdlr->kdl.kd.LoopBalance = loopBalance; + + printf("# loop balance:\n"); + printf("# even timestep: \t\t%.2f B/FLUP\n", loopBalanceEven); + printf("# odd timestep: \t\t%.2f B/FLUP\n", loopBalanceOdd); + printf("# average: \t\t%.2f B/FLUP\n", loopBalance); + + return; +} + +void FNAME(D3Q19ListAaPvGatherAoSoAInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params) +{ + KernelData * kd; + KernelDataList * kdl; + KernelDataListRia * kdlr; + MemAlloc((void **)&kdlr, sizeof(KernelDataListRia)); + + kd = (KernelData *)kdlr; + kdl = KDL(kdlr); + + *kernelData = kd; + +#ifdef DEBUG + kd->Pdfs[0] = NULL; + kd->Pdfs[1] = NULL; + kd->PdfsActive = NULL; + kd->DstPdfs = NULL; + kd->SrcPdfs = NULL; + kd->Dims[0] = -1; + kd->Dims[1] = -1; + kd->Dims[2] = -1; + kd->GlobalDims[0] = -1; + kd->GlobalDims[1] = -1; + kd->GlobalDims[2] = -1; + kd->Offsets[0] = -1; + kd->Offsets[1] = -1; + kd->Offsets[2] = -1; + + kd->ObstIndices = NULL; + kd->nObstIndices = -1; + kd->BounceBackPdfsSrc = NULL; + kd->BounceBackPdfsDst = NULL; + kd->nBounceBackPdfs = -1; + + kdl->AdjList = NULL; + kdl->Coords = NULL; + kdl->Grid = NULL; + kdl->nCells = -1; + kdl->nFluid = -1; + + kdlr->ConsecNodes = NULL; + kdlr->nConsecNodes = 0; + kdlr->ConsecThreadIndices = NULL; + kdlr->nConsecThreadIndices = 0; +#endif + + // Ajust the dimensions according to padding, if used. 
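+	// (No padding is applied yet: nCells is simply set to ld->nFluid
+	// below, see the TODO there.)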
+ kd->Dims[0] = kd->GlobalDims[0] = ld->Dims[0]; + kd->Dims[1] = kd->GlobalDims[1] = ld->Dims[1]; + kd->Dims[2] = kd->GlobalDims[2] = ld->Dims[2]; + + int * lDims = ld->Dims; + + int lX = lDims[0]; + int lY = lDims[1]; + int lZ = lDims[2]; + + int nTotalCells = lX * lY * lZ; + int nCells = ld->nFluid; // TODO: + padding + int nFluid = ld->nFluid; + + // TODO: check nCells/nFluid do not exceed 2^31. This actually has to be + // done during lattice setup. + kdl->nCells = nCells; + kdl->nFluid = nFluid; + + PdfT * pdfs[2]; + + int blk[3] = { 0 }; + + ParseParameters(params, blk); + + if (blk[0] == 0) blk[0] = lX; + if (blk[1] == 0) blk[1] = lY; + if (blk[2] == 0) blk[2] = lZ; + + printf("# blocking: \t\tx: %3d y: %3d z: %3d\n", blk[0], blk[1], blk[2]); + + unsigned long latByte = nCells * sizeof(PdfT) * N_D3Q19; + unsigned long latFluidByte = nFluid * sizeof(PdfT) * N_D3Q19; + unsigned long latPadByte = (nCells - nFluid) * sizeof(PdfT) * N_D3Q19; + + printf("# Lattice Array Allocation:\n"); + printf("# lattice size: \t\t%e MiB\n", latByte / 1024.0 / 1024.0); + printf("# fluid lattice size:\t\t%e MiB\n", latFluidByte / 1024.0 / 1024.0); + printf("# lattice padding: \t\t%e MiB\n", latPadByte / 1024.0 / 1024.0); + + + printf("# alignment: \t\t%d b\n", PAGE_4K); + + if (PDF_ALLOCATOR((void **)&pdfs[0], latFluidByte, PAGE_4K)) { + printf("ERROR: allocating PDF array with %s() failed: %lu bytes.\n", STRINGIFY(PDF_ALLOCATOR), latFluidByte); + exit(1); + } + else { + printf("# allocator: \t\t\t%s()\n", STRINGIFY(PDF_ALLOCATOR)); + } + + kd->Pdfs[0] = pdfs[0]; + + // Initialize PDFs with some (arbitrary) data for correct NUMA placement. + // Here we touch only the fluid nodes as this loop is OpenMP parallel and + // we want the same scheduling as in the kernel. + #ifdef _OPENMP + #pragma omp parallel for + #endif + for (int i = 0; i < nFluid; ++i) { for(int d = 0; d < N_D3Q19; ++d) { + pdfs[0][P_INDEX_3(nCells, i, d)] = 1.0; + } } + + // Initialize all PDFs to some standard value. + for (int i = 0; i < nFluid; ++i) { for(int d = 0; d < N_D3Q19; ++d) { + pdfs[0][P_INDEX_3(nCells, i, d)] = 0.0; + } } + + // ---------------------------------------------------------------------- + // create grid which will hold the index numbers of the fluid nodes + + uint32_t * grid; + + if (MemAlloc((void **)&grid, nTotalCells * sizeof(uint32_t))) { + printf("ERROR: allocating grid for numbering failed: %lu bytes.\n", nTotalCells * sizeof(uint32_t)); + exit(1); + } + kdl->Grid = grid; + + int latticeIndex; + +#ifdef DEBUG + for(int z = 0; z < lZ; ++z) { + for(int y = 0; y < lY; ++y) { + for(int x = 0; x < lX; ++x) { + + latticeIndex = L_INDEX_4(ld->Dims, x, y, z); + + grid[latticeIndex] = ~0; + } + } + } +#endif + + // ---------------------------------------------------------------------- + // generate numbering over grid + + uint32_t * coords; + + if (MemAlloc((void **)&coords, nFluid * sizeof(uint32_t) * 3)) { + printf("ERROR: allocating coords array failed: %lu bytes.\n", nFluid * sizeof(uint32_t) * 3); + exit(1); + } + + kdl->Coords = coords; + + // Index for the PDF nodes can start at 0 as we distinguish solid and fluid nodes + // through the ld->Lattice array. + int counter = 0; + + // Blocking is implemented via setup of the adjacency list. The kernel later will + // walk through the lattice blocked automatically. 
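+	// Illustrative example (assuming an 8x8x8 lattice and -blk 4): the
+	// fluid nodes of the block 0 <= x, y, z < 4 are numbered 0, 1, 2, ...
+	// first, then the block starting at x = 4 follows, and so on; the
+	// adjacency list built below thus encodes the blocked traversal order.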
+ for (int bZ = 0; bZ < lZ; bZ += blk[2]) { + for (int bY = 0; bY < lY; bY += blk[1]) { + for (int bX = 0; bX < lX; bX += blk[0]) { + + int eZ = MIN(bZ + blk[2], lZ); + int eY = MIN(bY + blk[1], lY); + int eX = MIN(bX + blk[0], lX); + + for (int z = bZ; z < eZ; ++z) { + for (int y = bY; y < eY; ++y) { + for (int x = bX; x < eX; ++x) { + + latticeIndex = L_INDEX_4(lDims, x, y, z); + + if (ld->Lattice[latticeIndex] != LAT_CELL_OBSTACLE) { + grid[latticeIndex] = counter; + + coords[C_INDEX_X(counter)] = x; + coords[C_INDEX_Y(counter)] = y; + coords[C_INDEX_Z(counter)] = z; + + ++counter; + } + } } } + } } } + + Verify(counter == nFluid); + uint32_t * adjList; + + // AoSoA addressing for adjList needs padding for (nFluid % VSIZE) != 0 + unsigned long adjListBytes = nFluid * sizeof(int) * N_D3Q19_IDX; + + printf("# Adjacency List Allocation:\n"); + printf("# size: \t\t%e MiB\n", adjListBytes / 1024.0 / 1024.0); + printf("# alignment: \t\t%d b\n", PAGE_4K); + + // AdjList only requires 18 instead of 19 entries per node, as + // the center PDF needs no addressing. + if (ADJ_LIST_ALLOCATOR((void **)&adjList, adjListBytes, PAGE_4K)) { + printf("ERROR: allocating adjList array with %s() failed: %lu bytes.\n", STRINGIFY(ADJ_LIST_ALLOCATOR), adjListBytes); + exit(1); + } + else { + printf("# allocator: \t\t\t%s()\n", STRINGIFY(ADJ_LIST_ALLOCATOR)); + } + + kdl->AdjList = adjList; + + int x, y, z; + + uint32_t neighborIndex; + uint32_t dstIndex; + + int nx, ny, nz, px, py, pz; + + // Loop over all fluid nodes and compute the indices to the neighboring + // PDFs for configured data layout (AoS/SoA). + // Parallelized loop to ensure correct NUMA placement. + // #ifdef _OPENMP --> add line continuation + // #pragma omp parallel for default(none) + // shared(nFluid, nCells, coords, D3Q19_INV, D3Q19_X, D3Q19_Y, D3Q19_Z, + // stderr, + // lDims, grid, ld, lX, lY, lZ, adjList) + // private(x, y, z, nx, ny, nz, neighborIndex, dstIndex) + // #endif + for (int fluidBaseIndex = 0; fluidBaseIndex < nFluid; fluidBaseIndex+=VSIZE) { + + + // Loop over all directions except the center one. + for(int d = 0; d < N_D3Q19 - 1; ++d) { + Assert(d != D3Q19_C); + + for(int inChunkIndex = 0; (inChunkIndex < VSIZE) && ((fluidBaseIndex + inChunkIndex) < nFluid); ++inChunkIndex){ + int index = fluidBaseIndex + inChunkIndex; + + Assert(index < nFluid); + + x = coords[C_INDEX_X(index)]; + y = coords[C_INDEX_Y(index)]; + z = coords[C_INDEX_Z(index)]; + + Assert(x >= 0 && x < lX); + Assert(y >= 0 && y < lY); + Assert(z >= 0 && z < lZ); + + Assert(ld->Lattice[L_INDEX_4(lDims, x, y, z)] != LAT_CELL_OBSTACLE); + +#ifdef PROP_MODEL_PUSH + nx = x + D3Q19_X[d]; + ny = y + D3Q19_Y[d]; + nz = z + D3Q19_Z[d]; + +#elif PROP_MODEL_PULL + nx = x - D3Q19_X[d]; + ny = y - D3Q19_Y[d]; + nz = z - D3Q19_Z[d]; +#else + #error No implementation for this PROP_MODEL_NAME. +#endif + // If the neighbor is outside the latice in X direction and we have a + // periodic boundary then we need to wrap around. 
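+				// Example: for lX = 100 a neighbor coordinate nx = -1 wraps
+				// to px = 99 and nx = 100 wraps to px = 0; the same mapping
+				// is applied independently in y and z below.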
+ if ( ((nx < 0 || nx >= lX) && ld->PeriodicX) || + ((ny < 0 || ny >= lY) && ld->PeriodicY) || + ((nz < 0 || nz >= lZ) && ld->PeriodicZ) + ){ + // x periodic + + if (nx < 0) { + px = lX - 1; + } + else if (nx >= lX) { + px = 0; + } else { + px = nx; + } + // y periodic + if (ny < 0) { + py = lY - 1; + } + else if (ny >= lY) { + py = 0; + } else { + py = ny; + } + + // z periodic + if (nz < 0) { + pz = lZ - 1; + } + else if (nz >= lZ) { + pz = 0; + } else { + pz = nz; + } + + if (ld->Lattice[L_INDEX_4(lDims, px, py, pz)] == LAT_CELL_OBSTACLE) { + dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]); + } + else { + neighborIndex = grid[L_INDEX_4(lDims, px, py, pz)]; + + AssertMsg(neighborIndex != ~0, "Neighbor has no Index. (%d %d %d) direction %s (%d)\n", px, py, pz, D3Q19_NAMES[d], d); + + dstIndex = P_INDEX_3(nCells, neighborIndex, d); + } + } + else if (nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) { + dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]); + } + else if (ld->Lattice[L_INDEX_4(lDims, nx, ny, nz)] == LAT_CELL_OBSTACLE) { + dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]); + } + else { + neighborIndex = grid[L_INDEX_4(lDims, nx, ny, nz)]; + + Assert(neighborIndex != ~0); + + dstIndex = P_INDEX_3(nCells, neighborIndex, d); + } + + Assert(dstIndex >= 0); + Assert(dstIndex < nCells * N_D3Q19); + + adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (d * VSIZE) + (index % VSIZE)] = dstIndex; + } + } + } + + /* + printf("============\n"); + for(int baseIndex = 0; baseIndex < nFluid; baseIndex+=VSIZE){ + for(int i = 0; i < VSIZE; ++i){ + int index = baseIndex + i; + + printf("%d ", adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (0 * VSIZE) + (index % VSIZE)]); + } + printf("\n"); + } + printf("============\n"); +*/ + + int nThreads = 1; + +#ifdef _OPENMP + nThreads = omp_get_max_threads(); +#endif + + SetupConsecNodes(ld, KDLR(kd), nThreads); + + // Fill remaining KernelData structures + kd->GetNode = GetNode; + kd->SetNode = SetNode; + + kd->BoundaryConditionsGetPdf = FNAME(BCGetPdf); + kd->BoundaryConditionsSetPdf = FNAME(BCSetPdf); + + kd->Kernel = FNAME(D3Q19ListAaPvGatherAoSoAKernel); + + kd->DstPdfs = NULL; + kd->PdfsActive = kd->Pdfs[0]; + + return; +} + +void FNAME(D3Q19ListAaPvGatherAoSoADeinit)(LatticeDesc * ld, KernelData ** kernelData) +{ + KernelDataListRia ** kdlr = (KernelDataListRia **)kernelData; + + MemFree((void **)&((*kdlr)->ConsecNodes)); + + if ((*kdlr)->ConsecThreadIndices != NULL) { + MemFree((void **)&((*kdlr)->ConsecThreadIndices)); + } + + KernelDataList ** kdl = (KernelDataList **)kernelData; + + ADJ_LIST_FREE((void **)&((*kdl)->AdjList)); + + MemFree((void **)&((*kdl)->Coords)); + MemFree((void **)&((*kdl)->Grid)); + + PDF_FREE((void **)&((*kernelData)->Pdfs[0])); + + MemFree((void **)kernelData); + return; +} + diff --git a/src/BenchKernelD3Q19ListAaPvGatherAoSoACommon.h b/src/BenchKernelD3Q19ListAaPvGatherAoSoACommon.h new file mode 100644 index 0000000..b85d096 --- /dev/null +++ b/src/BenchKernelD3Q19ListAaPvGatherAoSoACommon.h @@ -0,0 +1,59 @@ +// -------------------------------------------------------------------------- +// +// Copyright +// Markus Wittmann, 2016-2017 +// RRZE, University of Erlangen-Nuremberg, Germany +// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de +// +// Viktor Haag, 2016 +// LSS, University of Erlangen-Nuremberg, Germany +// +// Michael Hussnaetter, 2017-2018 +// University of Erlangen-Nuremberg, Germany +// michael.hussnaetter -at- fau.de +// +// This file is part of the Lattice Boltzmann Benchmark 
Kernels (LbmBenchKernels). +// +// LbmBenchKernels is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// LbmBenchKernels is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with LbmBenchKernels. If not, see . +// +// -------------------------------------------------------------------------- +#ifndef __BENCH_KERNEL_D3Q19_LIST_AA_PV_GATHER_AOSOA_COMMON_H__ +#define __BENCH_KERNEL_D3Q19_LIST_AA_PV_GATHER_AOSOA_COMMON_H__ + +#if !defined(DATA_LAYOUT_AOSOA) + #error List Gather AoSoA works only with DATA_LAYOUT_AOSOA +#endif + +#include "BenchKernelD3Q19ListAaCommon.h" + +typedef struct KernelDataListRia_ { + KernelDataList kdl; + + // Array contains information of how many adjacent nodes share the same access pattern. + uint32_t * ConsecNodes; + uint32_t nConsecNodes; // Number of entries in ConsecNodes array. + + // Array contains (for each thread) an index into ConsecNodes. + uint32_t * ConsecThreadIndices; + // Number of entries in ConsecThreadIndices. + uint32_t nConsecThreadIndices; + +} KernelDataListRia; + +// Macro for casting KernelData * to KernelDataList *. +#define KDLR(_x_) ((KernelDataListRia *)(_x_)) + + +#endif // __BENCH_KERNEL_D3Q19_LIST_AA_PV_GATHER_AOSOA_COMMON_H__ + diff --git a/src/BenchKernelD3Q19ListAaPvGatherCommon.c b/src/BenchKernelD3Q19ListAaPvGatherCommon.c new file mode 100644 index 0000000..a85f542 --- /dev/null +++ b/src/BenchKernelD3Q19ListAaPvGatherCommon.c @@ -0,0 +1,977 @@ +// -------------------------------------------------------------------------- +// +// Copyright +// Markus Wittmann, 2016-2017 +// RRZE, University of Erlangen-Nuremberg, Germany +// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de +// +// Viktor Haag, 2016 +// LSS, University of Erlangen-Nuremberg, Germany +// +// Michael Hussnaetter, 2017-2018 +// University of Erlangen-Nuremberg, Germany +// michael.hussnaetter -at- fau.de +// +// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). +// +// LbmBenchKernels is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// LbmBenchKernels is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with LbmBenchKernels. If not, see . 
+// +// -------------------------------------------------------------------------- +#include "BenchKernelD3Q19ListAaPvGatherCommon.h" + +#include "Memory.h" +#include "Vector.h" +#include "Vtk.h" + +#include + +#ifdef _OPENMP + #include +#endif + +#define PAGE_4K 4096 + +#if ALLOC_ADJ_LIST_IN_HBM == 1 + #define ADJ_LIST_ALLOCATOR HbwAllocAligned + #define ADJ_LIST_FREE HbwFree +#else + #define ADJ_LIST_ALLOCATOR MemAllocAligned + #define ADJ_LIST_FREE MemFree +#endif + +#if ALLOC_PDF_IN_HBM == 1 + #define PDF_ALLOCATOR HbwAllocAligned + #define PDF_FREE HbwFree +#else + #define PDF_ALLOCATOR MemAllocAligned + #define PDF_FREE MemFree +#endif + +// Forward definition. +void FNAME(D3Q19ListAaPvGatherKernel)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd); + + +// ----------------------------------------------------------------------- +// Functions which are used as callback by the kernel to read or write +// PDFs and nodes. + + +static void FNAME(BCGetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT * pdf) +{ + Assert(kd != NULL); + Assert(kd->PdfsActive != NULL); + Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]); + Assert(pdf != NULL); + + Assert(x >= 0); Assert(y >= 0); Assert(z >= 0); + Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]); + Assert(dir >= 0); Assert(dir < N_D3Q19); + + KernelDataList * kdl = KDL(kd); + uint32_t * adjList = kdl->AdjList; + + if (kdl->Iteration % 2 == 0) { + // Pdfs are stored inverse, local PDFs are located in remote nodes + + // getting node index + uint32_t index = kdl->Grid[L_INDEX_4(kd->Dims, x, y, z)]; + + if (dir != D3Q19_C) { + #define ADJ_LIST(dir) adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (dir * VSIZE) + (index % VSIZE)] + *pdf = kd->PdfsActive[ADJ_LIST(D3Q19_INV[dir])]; + #undef ADJ_LIST + } + else { + *pdf = kd->PdfsActive[P_INDEX_3(kdl->nCells, index, dir)]; + } + + } + else { + *pdf = kd->PdfsActive[P_INDEX_5(kdl, x, y, z, dir)]; + } + + return; +} + +static void FNAME(BCSetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT pdf) +{ + Assert(kd != NULL); + Assert(kd->PdfsActive != NULL); + Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]); + Assert(x >= 0); Assert(y >= 0); Assert(z >= 0); + Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]); + Assert(dir >= 0); Assert(dir < N_D3Q19); + + if (isnan(pdf)) { + printf("ERROR: setting nan %d %d %d %d %s\n", x, y, z, dir, D3Q19_NAMES[dir]); + DEBUG_BREAK_POINT(); + exit(1); + } + + KernelDataList * kdl = KDL(kd); + uint32_t * adjList = kdl->AdjList; + + if (kdl->Iteration % 2 == 0) { + // Pdfs are stored inverse, local PDFs are located in remote nodes + + // getting node index + uint32_t index = kdl->Grid[L_INDEX_4(kd->Dims, x, y, z)]; + + if (dir != D3Q19_C) { + #define ADJ_LIST(dir) adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (dir * VSIZE) + (index % VSIZE)] + kd->PdfsActive[ADJ_LIST(D3Q19_INV[dir])] = pdf; + #undef ADJ_LIST + } else { + kd->PdfsActive[P_INDEX_3(kdl->nCells, index, dir)] = pdf; + } + + } else { + kd->PdfsActive[P_INDEX_5(kdl, x, y, z, dir)] = pdf; + } + + return; +} + + +static void GetNode(KernelData * kd, int x, int y, int z, PdfT * pdfs) +{ + Assert(kd != NULL); + Assert(kd->PdfsActive != NULL); + Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]); + Assert(pdfs != NULL); + Assert(x >= 0); Assert(y >= 0); Assert(z >= 0); + Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]); + + KernelDataList * kdl = KDL(kd); + 
uint32_t * adjList = kdl->AdjList; + + if(kdl->Iteration % 2 == 0){ + + uint32_t index = kdl->Grid[L_INDEX_4(kdl->kd.Dims, x, y, z)]; + + // Load PDFs of local cell: pdf_N = src[adjList[adjListIndex + D3Q19_S]]; ... + #define ADJ_LIST(dir) adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (dir * VSIZE) + (index % VSIZE)] + #define X(name, idx, idxinv, _x, _y, _z) pdfs[idx] = kd->PdfsActive[ADJ_LIST(idxinv)]; + D3Q19_LIST_WO_C + #undef X + #undef ADJ_LIST + pdfs[D3Q19_C] = kd->PdfsActive[P_INDEX_3(kdl->nCells, index, D3Q19_C)]; + + } else { + + #define I(x, y, z, dir) P_INDEX_5(KDL(kd), (x), (y), (z), (dir)) + #define X(name, idx, idxinv, _x, _y, _z) pdfs[idx] = kd->PdfsActive[I(x, y, z, idx)]; + D3Q19_LIST + #undef X + #undef I + + } + + for (int d = 0; d < 19; ++d) { + if(isnan(pdfs[d]) || isinf(pdfs[d])) { + printf("%d %d %d %d nan! get node\n", x, y, z, d); + for (int d2 = 0; d2 < 19; ++d2) { + printf("%d: %e\n", d2, pdfs[d2]); + } + exit(1); + } + } + + return; +} + + +static void SetNode(KernelData * kd, int x, int y, int z, PdfT * pdfs) +{ + Assert(kd != NULL); + Assert(kd->PdfsActive != NULL); + Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]); + Assert(pdfs != NULL); + + Assert(x >= 0); Assert(y >= 0); Assert(z >= 0); + Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]); + + for (int d = 0; d < 19; ++d) { + if(isnan(pdfs[d])) { + printf("%d %d %d %d nan! get node\n", x, y, z, d); + for (int d2 = 0; d2 < 19; ++d2) { + printf("%d: %e\n", d2, pdfs[d2]); + } + exit(1); + } + } + + KernelDataList * kdl = KDL(kd); + uint32_t * adjList = kdl->AdjList; + + if(kdl->Iteration % 2 == 0){ + + uint32_t index = kdl->Grid[L_INDEX_4(kdl->kd.Dims, x, y, z)]; + + // Load PDFs of local cell: pdf_N = src[adjList[adjListIndex + D3Q19_S]]; ... 
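+		// (SetNode mirrors GetNode: here the PDFs are stored through the
+		// same inverse-direction indices, not loaded.)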
+ kd->PdfsActive[P_INDEX_3(kdl->nCells, index, D3Q19_C)] = pdfs[D3Q19_C]; + + #define ADJ_LIST(dir) adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (dir * VSIZE) + (index % VSIZE)] + #define X(name, idx, idxinv, _x, _y, _z) kd->PdfsActive[ADJ_LIST(idxinv)] = pdfs[idx]; + D3Q19_LIST_WO_C + #undef X + #undef ADJ_LIST + + } else { + + #define I(x, y, z, dir) P_INDEX_5(KDL(kd), (x), (y), (z), (dir)) + #define X(name, idx, idxinv, _x, _y, _z) kd->PdfsActive[I(x, y, z, idx)] = pdfs[idx]; + D3Q19_LIST + #undef X + #undef I + + } + + return; +} + +static void ParameterUsage() +{ + printf("Kernel parameters:\n"); + printf(" [-blk ] [-blk-[xyz] ]\n"); + + return; +} + +static void ParseParameters(Parameters * params, int * blk) +{ + Assert(blk != NULL); + + blk[0] = 0; blk[1] = 0; blk[2] = 0; + + #define ARG_IS(param) (!strcmp(params->KernelArgs[i], param)) + #define NEXT_ARG_PRESENT() \ + do { \ + if (i + 1 >= params->nKernelArgs) { \ + printf("ERROR: argument %s requires a parameter.\n", params->KernelArgs[i]); \ + exit(1); \ + } \ + } while (0) + + + for (int i = 0; i < params->nKernelArgs; ++i) { + if (ARG_IS("-blk") || ARG_IS("--blk")) { + NEXT_ARG_PRESENT(); + + int tmp = strtol(params->KernelArgs[++i], NULL, 0); + + if (tmp <= 0) { + printf("ERROR: blocking parameter must be > 0.\n"); + exit(1); + } + + blk[0] = blk[1] = blk[2] = tmp; + } + else if (ARG_IS("-blk-x") || ARG_IS("--blk-x")) { + NEXT_ARG_PRESENT(); + + int tmp = strtol(params->KernelArgs[++i], NULL, 0); + + if (tmp <= 0) { + printf("ERROR: blocking parameter must be > 0.\n"); + exit(1); + } + + blk[0] = tmp; + } + else if (ARG_IS("-blk-y") || ARG_IS("--blk-y")) { + NEXT_ARG_PRESENT(); + + int tmp = strtol(params->KernelArgs[++i], NULL, 0); + + if (tmp <= 0) { + printf("ERROR: blocking parameter must be > 0.\n"); + exit(1); + } + + blk[1] = tmp; + } + else if (ARG_IS("-blk-z") || ARG_IS("--blk-z")) { + NEXT_ARG_PRESENT(); + + int tmp = strtol(params->KernelArgs[++i], NULL, 0); + + if (tmp <= 0) { + printf("ERROR: blocking parameter must be > 0.\n"); + exit(1); + } + + blk[2] = tmp; + } + else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) { + ParameterUsage(); + exit(1); + } + else { + printf("ERROR: unknown kernel parameter.\n"); + ParameterUsage(); + exit(1); + } + } + + #undef ARG_IS + #undef NEXT_ARG_PRESENT + + return; +} + +static void SetupConsecNodes(LatticeDesc * ld, KernelDataListRia * kdlr, int nThreads) +{ + Assert(ld != NULL); + Assert(kdlr != NULL); + Assert(nThreads > 0); + + uint32_t * adjList = kdlr->kdl.AdjList; + + uint32_t nConsecNodes = 0; + uint32_t consecIndex = 0; + + int nFluid = kdlr->kdl.nFluid; + + uint32_t * consecThreadIndices = (uint32_t *)malloc(sizeof(uint32_t) * (nThreads + 1)); + + int nNodesPerThread = nFluid / nThreads; + + for (int i = 0; i < nThreads; ++i) { + consecThreadIndices[i] = i * nNodesPerThread + MinI(i, nFluid % nThreads); + } + consecThreadIndices[nThreads] = nFluid; + + int indexThread = 1; + int similarPatterns = 0; + int wasLastChunkThreadBoundary = 0; + // We execute following code two times. + // - The first time to get the count of how many entries we need for the + // consecNodes array. + // - The second time to fill the array. + + // Loop over adjacency list of all nodes. + // Compare if adjacent nodes share the same access pattern. + for (int fluidBaseIndex = VSIZE; fluidBaseIndex < nFluid; fluidBaseIndex += VSIZE) { + + int hasSimilarAccessPattern = 1; + + // Loop over all directions except the center one. 
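+		// The adjacency list holds only the 18 non-center directions; the
+		// center PDF always stays at the node itself and needs no index.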
+        for(int d = 0; d < N_D3Q19 - 1; ++d) {
+            Assert(d != D3Q19_C);
+
+            // check if cache line itself has consecutive memory access pattern
+            for(int inChunkIndex = 0; (inChunkIndex < VSIZE) && ((fluidBaseIndex + inChunkIndex) < nFluid); ++inChunkIndex){
+                int index = fluidBaseIndex + inChunkIndex;
+
+                Assert(index < nFluid);
+
+                #define ADJ_LIST(idx, dir) adjList[((idx) - ((idx) % VSIZE)) * N_D3Q19_IDX + ((dir) * VSIZE) + ((idx) % VSIZE)]
+                //if (adjList[index * N_D3Q19_IDX + d] != adjList[(index - 1) * N_D3Q19_IDX + d] + 1)
+                if (ADJ_LIST(index, d) != ADJ_LIST(index-VSIZE, d) + VSIZE) {
+                    //printf("different @: ADJ_LST(%d,%d)=%d != %d=ADJ_LST(%d, %d) + VSIZE\n", index, d, ADJ_LIST(index,d), ADJ_LIST(index-VSIZE,d) + VSIZE, index-VSIZE, d);
+                    // Different access pattern.
+                    hasSimilarAccessPattern = 0;
+                    break;
+                }
+                #undef ADJ_LIST
+            }
+
+            if(!hasSimilarAccessPattern){
+                break; //exit from nested loop
+            }
+        }
+
+        long threadBoundaryIndex = consecThreadIndices[indexThread];
+        if (fluidBaseIndex <= threadBoundaryIndex &&
+                threadBoundaryIndex < fluidBaseIndex + VSIZE) {
+            // Current chunk contains a thread boundary.
+            // These chunks are treated by scalar peel and remainder loops
+            // in the kernel of every thread to ensure VSIZE-aligned access
+            // to the adjacency list.
+
+            // final cells of current thread
+            ++consecIndex;
+
+            // first cells of next thread
+            ++indexThread;
+            ++consecIndex;
+
+            wasLastChunkThreadBoundary = 1;
+        }
+        else {
+            // We are not at a thread boundary
+            if (hasSimilarAccessPattern && !wasLastChunkThreadBoundary){
+                ++similarPatterns;
+            }
+            else {
+                ++consecIndex;
+            }
+
+            wasLastChunkThreadBoundary = 0;
+
+            /*
+            if (!hasSimilarAccessPattern) {
+                ++consecIndex;
+            }
+            else {
+                ++similarPatterns;
+            }
+            */
+        }
+    }
+
+    if (nFluid > 0) {
+        nConsecNodes = consecIndex + 1;
+    }
+
+    uint32_t * consecNodes;
+
+    unsigned long consecNodesByte = (nConsecNodes) * sizeof(uint32_t);
+
+    printf("# Consec. Nodes Array Allocation:\n");
+    printf("#   similar patterns\t\t%d\n", similarPatterns);
+    printf("#   elements:     \t\t%d\n", nConsecNodes);
+    printf("#   size:         \t\t%e MiB\n", consecNodesByte / 1024.0 / 1024.0);
+    printf("#   alignment:    \t\t%d b\n", PAGE_4K);
+
+    if (MemAllocAligned((void **)&consecNodes, consecNodesByte, PAGE_4K)) {
+        printf("ERROR: allocating consecNodes array with MemAllocAligned failed: %lu bytes.\n", consecNodesByte);
+        exit(1);
+    }
+    else {
+        printf("#   allocator: \t\t\tMemAllocAligned()\n");
+    }
+
+    consecIndex = 0;
+
+    indexThread = 1;
+    consecThreadIndices[0] = 0;
+
+    // Add the first chunk manually to enable the backward check for consecutive patterns.
+    if (nFluid > 0) {
+        consecNodes[consecIndex] = VSIZE;
+    }
+
+    wasLastChunkThreadBoundary = 0;
+
+    // Loop over adjacency list of all nodes.
+    // Compare whether the access pattern changes on chunk level.
+    // Since gather instructions are used, the access pattern need not be consecutive.
+    for (int fluidBaseIndex = VSIZE; fluidBaseIndex < nFluid; fluidBaseIndex += VSIZE) {
+
+        int hasSimilarAccessPattern = 1;
+
+        // Loop over all directions except the center one.
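+        // (Second pass: chunks are classified exactly as in the first pass,
+        // but now the run lengths are recorded. consecNodes[k] accumulates
+        // how many nodes belong to the k-th run of chunks with identical
+        // access pattern; thread boundaries always terminate a run.)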
+        for(int d = 0; d < N_D3Q19 - 1; ++d) {
+            Assert(d != D3Q19_C);
+
+            // check if cache line itself has consecutive memory access pattern
+            for(int inChunkIndex = 0; (inChunkIndex < VSIZE) && ((fluidBaseIndex + inChunkIndex) < nFluid); ++inChunkIndex){
+                int index = fluidBaseIndex + inChunkIndex;
+
+                Assert(index < nFluid);
+
+                #define ADJ_LIST(idx, dir) adjList[((idx) - ((idx) % VSIZE)) * N_D3Q19_IDX + ((dir) * VSIZE) + ((idx) % VSIZE)]
+                //if (adjList[index * N_D3Q19_IDX + d] != adjList[(index - 1) * N_D3Q19_IDX + d] + 1)
+                if (ADJ_LIST(index, d) != ADJ_LIST(index-VSIZE, d) + VSIZE) {
+                    // Different access pattern.
+                    hasSimilarAccessPattern = 0;
+                    break;
+                }
+                #undef ADJ_LIST
+            }
+
+            if(!hasSimilarAccessPattern){
+                break; //exit from nested loop
+            }
+        }
+
+        long threadBoundaryIndex = consecThreadIndices[indexThread];
+        if (fluidBaseIndex <= threadBoundaryIndex &&
+                threadBoundaryIndex < fluidBaseIndex + VSIZE) {
+            // Current chunk contains a thread boundary.
+            // These chunks are treated by scalar peel and remainder loops
+            // in the kernel of every thread to ensure VSIZE-aligned access
+            // to the adjacency list.
+
+            // final cells of current thread
+            ++consecIndex;
+            //consecThreadIndices[indexThread] = consecIndex;
+            consecNodes[consecIndex] = threadBoundaryIndex - fluidBaseIndex;
+
+
+            // first cells of next thread
+            ++consecIndex;
+            consecThreadIndices[indexThread] = consecIndex;
+            consecNodes[consecIndex] = (fluidBaseIndex + VSIZE) - threadBoundaryIndex;
+            ++indexThread;
+
+            wasLastChunkThreadBoundary = 1;
+
+        }
+        else {
+            // We are not at a thread boundary
+            if (hasSimilarAccessPattern && !wasLastChunkThreadBoundary){
+                Assert(consecIndex < nConsecNodes);
+                consecNodes[consecIndex] += VSIZE;
+            }
+            else {
+                ++consecIndex;
+                Assert(consecIndex < nConsecNodes);
+                consecNodes[consecIndex] = VSIZE;
+            }
+
+            /*
+            if (!hasSimilarAccessPattern) {
+                ++consecIndex;
+                Assert(consecIndex < nConsecNodes);
+                consecNodes[consecIndex] = VSIZE;
+            }
+            else {
+                Assert(consecIndex < nConsecNodes);
+                consecNodes[consecIndex] += VSIZE;
+            }
+            */
+            wasLastChunkThreadBoundary = 0;
+
+        }
+    }
+
+    /*
+    printf("consecNodes:\n");
+    for(int i = 0; i < nConsecNodes + 5; ++i){
+        printf("%d ", consecNodes[i]);
+    }
+    printf("\n");
+    */
+    /*
+    printf("consecThreadIndices:\n");
+    for(int i = 0; i < nThreads + 5; ++i){
+        printf("%d ", consecThreadIndices[i]);
+    }
+    printf("\n");
+    */
+
+    kdlr->ConsecNodes = consecNodes;
+    kdlr->nConsecNodes = nConsecNodes;
+
+    kdlr->ConsecThreadIndices = consecThreadIndices;
+    kdlr->nConsecThreadIndices = nThreads;
+
+    double loopBalanceEven = 2.0 * 19 * sizeof(PdfT);
+    //N_D3Q19 - 1: C lookup not required, +1: transfer of consecValue
+    double loopBalanceOdd = 2.0 * 19 * sizeof(PdfT) + ((double)nConsecNodes *((N_D3Q19 - 1) * VSIZE + 1)) / nFluid * sizeof(int);
+    double loopBalance = (loopBalanceEven + loopBalanceOdd) / 2.0;
+
+    kdlr->kdl.kd.LoopBalance = loopBalance;
+
+    printf("# loop balance:\n");
+    printf("#   even timestep: \t\t%.2f B/FLUP\n", loopBalanceEven);
+    printf("#   odd timestep: \t\t%.2f B/FLUP\n", loopBalanceOdd);
+    printf("#   average: \t\t%.2f B/FLUP\n", loopBalance);
+
+    return;
+}
+
+void FNAME(D3Q19ListAaPvGatherInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
+{
+    KernelData * kd;
+    KernelDataList * kdl;
+    KernelDataListRia * kdlr;
+    MemAlloc((void **)&kdlr, sizeof(KernelDataListRia));
+
+    kd = (KernelData *)kdlr;
+    kdl = KDL(kdlr);
+
+    *kernelData = kd;
+
+#ifdef DEBUG
+    kd->Pdfs[0] = NULL;
+    kd->Pdfs[1] = NULL;
+    kd->PdfsActive = NULL;
+    kd->DstPdfs = NULL;
+    kd->SrcPdfs = NULL;
+    kd->Dims[0] = -1;
+    kd->Dims[1] = -1;
+    kd->Dims[2] = -1;
+    kd->GlobalDims[0] = -1;
+    kd->GlobalDims[1] = -1;
+    kd->GlobalDims[2] = -1;
+    kd->Offsets[0] = -1;
+    kd->Offsets[1] = -1;
+    kd->Offsets[2] = -1;
+
+    kd->ObstIndices = NULL;
+    kd->nObstIndices = -1;
+    kd->BounceBackPdfsSrc = NULL;
+    kd->BounceBackPdfsDst = NULL;
+    kd->nBounceBackPdfs = -1;
+
+    kdl->AdjList = NULL;
+    kdl->Coords = NULL;
+    kdl->Grid = NULL;
+    kdl->nCells = -1;
+    kdl->nFluid = -1;
+
+    kdlr->ConsecNodes = NULL;
+    kdlr->nConsecNodes = 0;
+    kdlr->ConsecThreadIndices = NULL;
+    kdlr->nConsecThreadIndices = 0;
+#endif
+
+    // Adjust the dimensions according to padding, if used.
+    kd->Dims[0] = kd->GlobalDims[0] = ld->Dims[0];
+    kd->Dims[1] = kd->GlobalDims[1] = ld->Dims[1];
+    kd->Dims[2] = kd->GlobalDims[2] = ld->Dims[2];
+
+    int * lDims = ld->Dims;
+
+    int lX = lDims[0];
+    int lY = lDims[1];
+    int lZ = lDims[2];
+
+    int nTotalCells = lX * lY * lZ;
+    int nCells = ld->nFluid; // TODO: + padding
+    int nFluid = ld->nFluid;
+
+    // TODO: check nCells/nFluid do not exceed 2^31. This actually has to be
+    // done during lattice setup.
+    kdl->nCells = nCells;
+    kdl->nFluid = nFluid;
+
+    PdfT * pdfs[2];
+
+    int blk[3] = { 0 };
+
+    ParseParameters(params, blk);
+
+    if (blk[0] == 0) blk[0] = lX;
+    if (blk[1] == 0) blk[1] = lY;
+    if (blk[2] == 0) blk[2] = lZ;
+
+    printf("# blocking: \t\tx: %3d y: %3d z: %3d\n", blk[0], blk[1], blk[2]);
+
+    unsigned long latByte      = nCells * sizeof(PdfT) * N_D3Q19;
+    unsigned long latFluidByte = nFluid * sizeof(PdfT) * N_D3Q19;
+    unsigned long latPadByte   = (nCells - nFluid) * sizeof(PdfT) * N_D3Q19;
+
+    printf("# Lattice Array Allocation:\n");
+    printf("#   lattice size:      \t\t%e MiB\n", latByte / 1024.0 / 1024.0);
+    printf("#   fluid lattice size:\t\t%e MiB\n", latFluidByte / 1024.0 / 1024.0);
+    printf("#   lattice padding:   \t\t%e MiB\n", latPadByte / 1024.0 / 1024.0);
+
+
+    printf("#   alignment:         \t\t%d b\n", PAGE_4K);
+
+    if (PDF_ALLOCATOR((void **)&pdfs[0], latFluidByte, PAGE_4K)) {
+        printf("ERROR: allocating PDF array with %s() failed: %lu bytes.\n", STRINGIFY(PDF_ALLOCATOR), latFluidByte);
+        exit(1);
+    }
+    else {
+        printf("#   allocator: \t\t\t%s()\n", STRINGIFY(PDF_ALLOCATOR));
+    }
+
+    kd->Pdfs[0] = pdfs[0];
+
+    // Initialize PDFs with some (arbitrary) data for correct NUMA placement.
+    // Here we touch only the fluid nodes as this loop is OpenMP parallel and
+    // we want the same scheduling as in the kernel.
+    #ifdef _OPENMP
+    #pragma omp parallel for
+    #endif
+    for (int i = 0; i < nFluid; ++i) { for(int d = 0; d < N_D3Q19; ++d) {
+        pdfs[0][P_INDEX_3(nCells, i, d)] = 1.0;
+    } }
+
+    // Initialize all PDFs to some standard value.
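+    // The serial loop below overwrites the values written by the parallel
+    // first-touch initialization above; the page placement established by
+    // the first touch is unaffected. P_INDEX_3(nCells, i, d) maps a (cell,
+    // direction) pair to the linear index of the PDF array for the
+    // configured data layout (SoA or AoSoA).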
+ for (int i = 0; i < nFluid; ++i) { for(int d = 0; d < N_D3Q19; ++d) { + pdfs[0][P_INDEX_3(nCells, i, d)] = 0.0; + } } + + // ---------------------------------------------------------------------- + // create grid which will hold the index numbers of the fluid nodes + + uint32_t * grid; + + if (MemAlloc((void **)&grid, nTotalCells * sizeof(uint32_t))) { + printf("ERROR: allocating grid for numbering failed: %lu bytes.\n", nTotalCells * sizeof(uint32_t)); + exit(1); + } + kdl->Grid = grid; + + int latticeIndex; + +#ifdef DEBUG + for(int z = 0; z < lZ; ++z) { + for(int y = 0; y < lY; ++y) { + for(int x = 0; x < lX; ++x) { + + latticeIndex = L_INDEX_4(ld->Dims, x, y, z); + + grid[latticeIndex] = ~0; + } + } + } +#endif + + // ---------------------------------------------------------------------- + // generate numbering over grid + + uint32_t * coords; + + if (MemAlloc((void **)&coords, nFluid * sizeof(uint32_t) * 3)) { + printf("ERROR: allocating coords array failed: %lu bytes.\n", nFluid * sizeof(uint32_t) * 3); + exit(1); + } + + kdl->Coords = coords; + + // Index for the PDF nodes can start at 0 as we distinguish solid and fluid nodes + // through the ld->Lattice array. + int counter = 0; + + // Blocking is implemented via setup of the adjacency list. The kernel later will + // walk through the lattice blocked automatically. + for (int bZ = 0; bZ < lZ; bZ += blk[2]) { + for (int bY = 0; bY < lY; bY += blk[1]) { + for (int bX = 0; bX < lX; bX += blk[0]) { + + int eX = MIN(bX + blk[0], lX); + int eY = MIN(bY + blk[1], lY); + int eZ = MIN(bZ + blk[2], lZ); + + + for (int z = bZ; z < eZ; ++z) { + for (int y = bY; y < eY; ++y) { + for (int x = bX; x < eX; ++x) { + + latticeIndex = L_INDEX_4(lDims, x, y, z); + + if (ld->Lattice[latticeIndex] != LAT_CELL_OBSTACLE) { + grid[latticeIndex] = counter; + + coords[C_INDEX_X(counter)] = x; + coords[C_INDEX_Y(counter)] = y; + coords[C_INDEX_Z(counter)] = z; + + ++counter; + } + } } } + } } } + + Verify(counter == nFluid); + uint32_t * adjList; + + // AoSoA addressing for adjList needs padding for (nFluid % VSIZE) != 0 + unsigned long adjListBytes = nFluid * sizeof(int) * N_D3Q19_IDX; + + printf("# Adjacency List Allocation:\n"); + printf("# size: \t\t%e MiB\n", adjListBytes / 1024.0 / 1024.0); + printf("# alignment: \t\t%d b\n", PAGE_4K); + + // AdjList only requires 18 instead of 19 entries per node, as + // the center PDF needs no addressing. + if (ADJ_LIST_ALLOCATOR((void **)&adjList, adjListBytes, PAGE_4K)) { + printf("ERROR: allocating adjList array with %s() failed: %lu bytes.\n", STRINGIFY(ADJ_LIST_ALLOCATOR), adjListBytes); + exit(1); + } + else { + printf("# allocator: \t\t\t%s()\n", STRINGIFY(ADJ_LIST_ALLOCATOR)); + } + + kdl->AdjList = adjList; + + int x, y, z; + + uint32_t neighborIndex; + uint32_t dstIndex; + + int nx, ny, nz, px, py, pz; + + // Loop over all fluid nodes and compute the indices to the neighboring + // PDFs for configured data layout (AoS/SoA). + // Parallelized loop to ensure correct NUMA placement. + // #ifdef _OPENMP --> add line continuation + // #pragma omp parallel for default(none) + // shared(nFluid, nCells, coords, D3Q19_INV, D3Q19_X, D3Q19_Y, D3Q19_Z, + // stderr, + // lDims, grid, ld, lX, lY, lZ, adjList) + // private(x, y, z, nx, ny, nz, neighborIndex, dstIndex) + // #endif + for (int fluidBaseIndex = 0; fluidBaseIndex < nFluid; fluidBaseIndex+=VSIZE) { + + + // Loop over all directions except the center one. 
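+        // For every direction the destination index of the propagation step
+        // is precomputed: regular fluid neighbors point into the neighbor's
+        // PDF slot, while obstacle or out-of-domain neighbors fall back to
+        // the inverse direction of the local cell, which implements the
+        // bounce-back rule.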
+        for(int d = 0; d < N_D3Q19 - 1; ++d) {
+            Assert(d != D3Q19_C);
+
+            for(int inChunkIndex = 0; (inChunkIndex < VSIZE) && ((fluidBaseIndex + inChunkIndex) < nFluid); ++inChunkIndex){
+                int index = fluidBaseIndex + inChunkIndex;
+
+                Assert(index < nFluid);
+
+                x = coords[C_INDEX_X(index)];
+                y = coords[C_INDEX_Y(index)];
+                z = coords[C_INDEX_Z(index)];
+
+                Assert(x >= 0 && x < lX);
+                Assert(y >= 0 && y < lY);
+                Assert(z >= 0 && z < lZ);
+
+                Assert(ld->Lattice[L_INDEX_4(lDims, x, y, z)] != LAT_CELL_OBSTACLE);
+
+#ifdef PROP_MODEL_PUSH
+                nx = x + D3Q19_X[d];
+                ny = y + D3Q19_Y[d];
+                nz = z + D3Q19_Z[d];
+
+#elif PROP_MODEL_PULL
+                nx = x - D3Q19_X[d];
+                ny = y - D3Q19_Y[d];
+                nz = z - D3Q19_Z[d];
+#else
+    #error No implementation for this PROP_MODEL_NAME.
+#endif
+                // If the neighbor is outside the lattice and the respective
+                // boundary is periodic, we need to wrap around.
+                if ( ((nx < 0 || nx >= lX) && ld->PeriodicX) ||
+                     ((ny < 0 || ny >= lY) && ld->PeriodicY) ||
+                     ((nz < 0 || nz >= lZ) && ld->PeriodicZ)
+                   ){
+                    // x periodic
+
+                    if (nx < 0) {
+                        px = lX - 1;
+                    }
+                    else if (nx >= lX) {
+                        px = 0;
+                    } else {
+                        px = nx;
+                    }
+                    // y periodic
+                    if (ny < 0) {
+                        py = lY - 1;
+                    }
+                    else if (ny >= lY) {
+                        py = 0;
+                    } else {
+                        py = ny;
+                    }
+
+                    // z periodic
+                    if (nz < 0) {
+                        pz = lZ - 1;
+                    }
+                    else if (nz >= lZ) {
+                        pz = 0;
+                    } else {
+                        pz = nz;
+                    }
+
+                    if (ld->Lattice[L_INDEX_4(lDims, px, py, pz)] == LAT_CELL_OBSTACLE) {
+                        dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
+                    }
+                    else {
+                        neighborIndex = grid[L_INDEX_4(lDims, px, py, pz)];
+
+                        AssertMsg(neighborIndex != ~0, "Neighbor has no Index. (%d %d %d) direction %s (%d)\n", px, py, pz, D3Q19_NAMES[d], d);
+
+                        dstIndex = P_INDEX_3(nCells, neighborIndex, d);
+                    }
+                }
+                else if (nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) {
+                    dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
+                }
+                else if (ld->Lattice[L_INDEX_4(lDims, nx, ny, nz)] == LAT_CELL_OBSTACLE) {
+                    dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
+                }
+                else {
+                    neighborIndex = grid[L_INDEX_4(lDims, nx, ny, nz)];
+
+                    Assert(neighborIndex != ~0);
+
+                    dstIndex = P_INDEX_3(nCells, neighborIndex, d);
+                }
+
+                Assert(dstIndex >= 0);
+                Assert(dstIndex < nCells * N_D3Q19);
+
+                adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (d * VSIZE) + (index % VSIZE)] = dstIndex;
+            }
+        }
+    }
+
+    /*
+    printf("============\n");
+    for(int baseIndex = 0; baseIndex < nFluid; baseIndex+=VSIZE){
+        for(int i = 0; i < VSIZE; ++i){
+            int index = baseIndex + i;
+
+            printf("%d ", adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (0 * VSIZE) + (index % VSIZE)]);
+        }
+        printf("\n");
+    }
+    printf("============\n");
+    */
+
+    int nThreads = 1;
+
+#ifdef _OPENMP
+    nThreads = omp_get_max_threads();
+#endif
+
+    SetupConsecNodes(ld, KDLR(kd), nThreads);
+
+    // Fill remaining KernelData structures
+    kd->GetNode = GetNode;
+    kd->SetNode = SetNode;
+
+    kd->BoundaryConditionsGetPdf = FNAME(BCGetPdf);
+    kd->BoundaryConditionsSetPdf = FNAME(BCSetPdf);
+
+    kd->Kernel = FNAME(D3Q19ListAaPvGatherKernel);
+
+    kd->DstPdfs = NULL;
+    kd->PdfsActive = kd->Pdfs[0];
+
+    return;
+}
+
+void FNAME(D3Q19ListAaPvGatherDeinit)(LatticeDesc * ld, KernelData ** kernelData)
+{
+    KernelDataListRia ** kdlr = (KernelDataListRia **)kernelData;
+
+    MemFree((void **)&((*kdlr)->ConsecNodes));
+
+    if ((*kdlr)->ConsecThreadIndices != NULL) {
+        MemFree((void **)&((*kdlr)->ConsecThreadIndices));
+    }
+
+    KernelDataList ** kdl = (KernelDataList **)kernelData;
+
+    ADJ_LIST_FREE((void **)&((*kdl)->AdjList));
+
+    MemFree((void **)&((*kdl)->Coords));
+    MemFree((void **)&((*kdl)->Grid));
+
+    PDF_FREE((void **)&((*kernelData)->Pdfs[0]));
+
+    MemFree((void **)kernelData);
+    return;
+}
+
diff --git a/src/BenchKernelD3Q19ListAaPvGatherCommon.h b/src/BenchKernelD3Q19ListAaPvGatherCommon.h
new file mode 100644
index 0000000..025042d
--- /dev/null
+++ b/src/BenchKernelD3Q19ListAaPvGatherCommon.h
@@ -0,0 +1,59 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//   Michael Hussnaetter, 2017-2018
+//   University of Erlangen-Nuremberg, Germany
+//   michael.hussnaetter -at- fau.de
+//
+// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+// LbmBenchKernels is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// LbmBenchKernels is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#ifndef __BENCH_KERNEL_D3Q19_LIST_AA_PV_GATHER_COMMON_H__
+#define __BENCH_KERNEL_D3Q19_LIST_AA_PV_GATHER_COMMON_H__
+
+#if !defined(DATA_LAYOUT_SOA) && !defined(DATA_LAYOUT_AOSOA)
+    #error List Gather works only with DATA_LAYOUT_SOA or DATA_LAYOUT_AOSOA
+#endif
+
+#include "BenchKernelD3Q19ListAaCommon.h"
+
+typedef struct KernelDataListRia_ {
+    KernelDataList kdl;
+
+    // Array contains information of how many adjacent nodes share the same access pattern.
+    uint32_t * ConsecNodes;
+    uint32_t nConsecNodes;  // Number of entries in ConsecNodes array.
+
+    // Array contains (for each thread) an index into ConsecNodes.
+    uint32_t * ConsecThreadIndices;
+    // Number of entries in ConsecThreadIndices.
+    uint32_t nConsecThreadIndices;
+
+} KernelDataListRia;
+
+// Macro for casting KernelData * to KernelDataListRia *.
+#define KDLR(_x_) ((KernelDataListRia *)(_x_))
+
+
+#endif // __BENCH_KERNEL_D3Q19_LIST_AA_PV_GATHER_COMMON_H__
+
diff --git a/src/BenchKernelD3Q19ListAaPvGatherHybrid.c b/src/BenchKernelD3Q19ListAaPvGatherHybrid.c
new file mode 100644
index 0000000..0eccb43
--- /dev/null
+++ b/src/BenchKernelD3Q19ListAaPvGatherHybrid.c
@@ -0,0 +1,896 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//   Michael Hussnaetter, 2017-2018
+//   University of Erlangen-Nuremberg, Germany
+//   michael.hussnaetter -at- fau.de
+//
+// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+// LbmBenchKernels is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// LbmBenchKernels is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#include "BenchKernelD3Q19ListAaPvGatherHybridCommon.h"
+
+#include "Base.h"
+#include "Memory.h"
+#include "Vtk.h"
+#include "Vector.h"
+
+#include <inttypes.h>
+#include <math.h>
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#ifdef LIKWID_PERFMON
+#include <likwid.h>
+#else
+#define LIKWID_MARKER_INIT
+#define LIKWID_MARKER_THREADINIT
+#define LIKWID_MARKER_SWITCH
+#define LIKWID_MARKER_REGISTER(regionTag)
+#define LIKWID_MARKER_START(regionTag)
+#define LIKWID_MARKER_STOP(regionTag)
+#define LIKWID_MARKER_CLOSE
+#define LIKWID_MARKER_GET(regionTag, nevents, events, time, count)
+#endif
+
+// Enable software prefetching for the vectorized gather/scatter loop in the odd kernel.
+#ifndef SOFTWARE_PREFETCH_LOOKAHEAD_L2
+#define SOFTWARE_PREFETCH_LOOKAHEAD_L2 (0) // prefetching X loop iterations ahead
+#endif
+
+#ifndef SOFTWARE_PREFETCH_LOOKAHEAD_L1
+#define SOFTWARE_PREFETCH_LOOKAHEAD_L1 (0) // prefetching X loop iterations ahead
+#endif
+
+static void KernelEven(LatticeDesc * ld, KernelData * kernelData, CaseData * cd, int * threadIndices);
+static void KernelOdd( LatticeDesc * ld, KernelData * kernelData, CaseData * cd);
+
+void FNAME(D3Q19ListAaPvGatherHybridKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd)
+{
+    Assert(ld != NULL);
+    Assert(kernelData != NULL);
+    Assert(cd != NULL);
+
+    Assert(cd->Omega > 0.0);
+    Assert(cd->Omega < 2.0);
+
+#if defined(VTK_OUTPUT) || defined(STATISTICS) || defined(VERIFICATION)
+    KernelData * kd = (KernelData *)kernelData;
+#endif
+    KernelDataList * kdl = KDL(kernelData);
+
+    int maxIterations = cd->MaxIterations;
+    int nFluid = kdl->nFluid;
+
+    printf("\n");
+#if (SOFTWARE_PREFETCH_LOOKAHEAD_L2 > 0) || (SOFTWARE_PREFETCH_LOOKAHEAD_L1 > 0)
+    printf("# Software prefetching enabled:\n");
+    printf("#   Gather/Scatter + Load/Store prefetch lookahead L2: \t%d\n", SOFTWARE_PREFETCH_LOOKAHEAD_L2);
+    printf("#   Gather/Scatter + Load/Store prefetch lookahead L1: \t%d\n", SOFTWARE_PREFETCH_LOOKAHEAD_L1);
+#else
+    printf("# Software prefetching disabled.\n");
+#endif
+    printf("\n");
+
+    int nThreads = 1;
+#ifdef _OPENMP
+    nThreads = omp_get_max_threads();
+#endif
+
+    int * threadStartIndices = (int *)malloc(sizeof(int) * (nThreads + 1));
+
+    for (int i = 0; i < nThreads; ++i) {
+        threadStartIndices[i] = i * (nFluid / nThreads) + MinI(i, nFluid % nThreads);
+    }
+
+    threadStartIndices[nThreads] = nFluid;
+
+#ifdef VTK_OUTPUT
+    if (cd->VtkOutput) {
+        kd->PdfsActive = kd->Pdfs[0];
+        VtkWrite(ld, kd, cd, -1);
+    }
+#endif
+
+#ifdef STATISTICS
+    kd->PdfsActive = kd->Pdfs[0];
+    KernelStatistics(kd, ld, cd, 0);
+#endif
+
+    LIKWID_MARKER_INIT;
+
+    // TODO: outer openmp parallel
+
+    X_KERNEL_START(kernelData);
+
+    LIKWID_MARKER_START("OuterLoop");
+
+    for(int iter = 0; iter < maxIterations; iter += 2) {
+
+        // even time step
+
+#ifdef _OPENMP
+#pragma omp parallel default(none) shared(ld, kernelData,
cd, threadStartIndices) +#endif + { + //LIKWID_MARKER_START("KernelEven"); + KernelEven(ld, kernelData, cd, threadStartIndices); + //LIKWID_MARKER_STOP("KernelEven"); + } + + +#ifdef VERIFICATION + kdl->Iteration = iter; + kd->PdfsActive = kd->Pdfs[0]; + KernelAddBodyForce(kd, ld, cd); +#endif + + // odd time step + +#ifdef _OPENMP +#pragma omp parallel default(none) shared(ld, kernelData, cd, threadStartIndices) +#endif + { + //LIKWID_MARKER_START("KernelOdd"); + KernelOdd(ld, kernelData, cd); + //LIKWID_MARKER_STOP("KernelOdd"); + } + + +#ifdef VERIFICATION + kdl->Iteration = iter + 1; + kd->PdfsActive = kd->Pdfs[0]; + KernelAddBodyForce(kd, ld, cd); +#endif + +#ifdef VTK_OUTPUT + if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) { + kdl->Iteration = iter + 1; + kd->PdfsActive = kd->Pdfs[0]; + VtkWrite(ld, kd, cd, iter); + } +#endif + +#ifdef STATISTICS + kdl->Iteration = iter + 1; + kd->PdfsActive = kd->Pdfs[0]; + KernelStatistics(kd, ld, cd, iter); +#endif + + } // for (int iter = 0; ... + LIKWID_MARKER_STOP("OuterLoop"); + + X_KERNEL_END(kernelData); + +#ifdef VTK_OUTPUT + if (cd->VtkOutput) { + kd->PdfsActive = kd->Pdfs[0]; + VtkWrite(ld, kd, cd, maxIterations); + } +#endif + +#ifdef STATISTICS + kd->PdfsActive = kd->Pdfs[0]; + KernelStatistics(kd, ld, cd, maxIterations); +#endif + + LIKWID_MARKER_CLOSE; + free(threadStartIndices); + + return; +} + +static void KernelEven(LatticeDesc * ld, KernelData * kernelData, CaseData * cd, int * threadIndices) +{ + Assert(ld != NULL); + Assert(kernelData != NULL); + Assert(cd != NULL); + + Assert(cd->Omega > 0.0); + Assert(cd->Omega < 2.0); + + KernelData * kd = (KernelData *)kernelData; + KernelDataList * kdl = KDL(kernelData); + + PdfT omega = cd->Omega; + PdfT omegaEven = omega; + + PdfT magicParam = 1.0 / 12.0; + PdfT omegaOdd = 1.0 / (0.5 + magicParam / (1.0 / omega - 0.5)); + + PdfT evenPart = 0.0; + PdfT oddPart = 0.0; + PdfT dir_indep_trm = 0.0; + + const PdfT w_0 = 1.0 / 3.0; + const PdfT w_1 = 1.0 / 18.0; + const PdfT w_2 = 1.0 / 36.0; + + const PdfT w_1_x3 = w_1 * 3.0; const PdfT w_1_nine_half = w_1 * 9.0 / 2.0; PdfT w_1_indep = 0.0; + const PdfT w_2_x3 = w_2 * 3.0; const PdfT w_2_nine_half = w_2 * 9.0 / 2.0; PdfT w_2_indep = 0.0; + + PdfT ux, uy, uz, ui; + PdfT dens; + + VPDFT VONE_HALF = VSET(0.5); + VPDFT VTHREE_HALF = VSET(3.0 / 2.0); + + VPDFT vw_1_indep, vw_2_indep; + VPDFT vw_0 = VSET(w_0); + VPDFT vw_1 = VSET(w_1); + VPDFT vw_2 = VSET(w_2); + + VPDFT vw_1_x3 = VSET(w_1_x3); + VPDFT vw_2_x3 = VSET(w_2_x3); + VPDFT vw_1_nine_half = VSET(w_1_nine_half); + VPDFT vw_2_nine_half = VSET(w_2_nine_half); + + VPDFT vui, vux, vuy, vuz, vdens; + + VPDFT vevenPart, voddPart, vdir_indep_trm; + + VPDFT vomegaEven = VSET(omegaEven); + VPDFT vomegaOdd = VSET(omegaOdd); + + // Declare pdf_N, pdf_E, pdf_S, pdf_W, ... 
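+    // D3Q19_LIST is an X-macro: it expands X(name, idx, idxinv, x, y, z)
+    // once per lattice direction, so the block below declares one scalar
+    // PDF, one pointer, and one SIMD register per direction without
+    // spelling out all 19 directions by hand.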
+ #define X(name, idx, idxinv, x, y, z) \ + PdfT JOIN(pdf_,name); \ + PdfT * JOIN(ppdf_,name); \ + VPDFT JOIN(vpdf_,name); + D3Q19_LIST + #undef X + + PdfT * src = kd->Pdfs[0]; + + int nCells = kdl->nCells; + + int threadId = 0; +#ifdef _OPENMP + threadId = omp_get_thread_num(); +#endif + + int nFluidThread = threadIndices[threadId + 1] - threadIndices[threadId]; + int nFluidVec = nFluidThread - (nFluidThread % VSIZE); + + int indexStartVec = threadIndices[threadId]; + int indexStopVec = threadIndices[threadId] + nFluidVec; + int indexStop = threadIndices[threadId] + nFluidThread; + + #define I(index, dir) P_INDEX_3((nCells), (index), (dir)) + + #define X(name, idx, idxinv, _x, _y, _z) JOIN(ppdf_,name) = &(src[I(indexStartVec, idx)]); + D3Q19_LIST + #undef X + + for (int index = indexStartVec; index < indexStopVec; index += VSIZE) { + + #if (SOFTWARE_PREFETCH_LOOKAHEAD_L2 > 0) + #define X(name, idx, idxinv, _x, _y, _z) _mm_prefetch((char const *)(JOIN(ppdf_,name) + SOFTWARE_PREFETCH_LOOKAHEAD_L2 * VSIZE), _MM_HINT_T1); + D3Q19_LIST + #undef X + #endif + + #if (SOFTWARE_PREFETCH_LOOKAHEAD_L1 > 0) + #define X(name, idx, idxinv, _x, _y, _z) _mm_prefetch((char const *)(JOIN(ppdf_,name) + SOFTWARE_PREFETCH_LOOKAHEAD_L1 * VSIZE), _MM_HINT_T0); + D3Q19_LIST + #undef X + #endif + + #define X(name, idx, idxinv, _x, _y, _z) JOIN(vpdf_,name) = VLDU(JOIN(ppdf_,name)); + D3Q19_LIST + #undef X + + //vux = vpdf_E + vpdf_NE + vpdf_SE + vpdf_TE + vpdf_BE - + // vpdf_W - vpdf_NW - vpdf_SW - vpdf_TW - vpdf_BW; + vux = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_E,VADD(vpdf_NE,vpdf_SE)),VADD(vpdf_TE,vpdf_BE)),vpdf_W),vpdf_NW),vpdf_SW),vpdf_TW),vpdf_BW); + //vuy = vpdf_N + vpdf_NE + vpdf_NW + vpdf_TN + vpdf_BN - + // vpdf_S - vpdf_SE - vpdf_SW - vpdf_TS - vpdf_BS; + vuy = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_N,VADD(vpdf_NE,vpdf_NW)),VADD(vpdf_TN,vpdf_BN)),vpdf_S),vpdf_SE),vpdf_SW),vpdf_TS),vpdf_BS); + //vuz = vpdf_T + vpdf_TE + vpdf_TW + vpdf_TN + vpdf_TS - + // vpdf_B - vpdf_BE - vpdf_BW - vpdf_BN - vpdf_BS; + vuz = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_T,VADD(vpdf_TE,vpdf_TW)),VADD(vpdf_TN,vpdf_TS)),vpdf_B),vpdf_BE),vpdf_BW),vpdf_BN),vpdf_BS); + + //vdens = vpdf_C + + // vpdf_N + vpdf_E + vpdf_S + vpdf_W + + // vpdf_NE + vpdf_SE + vpdf_SW + vpdf_NW + + // vpdf_T + vpdf_TN + vpdf_TE + vpdf_TS + vpdf_TW + + // vpdf_B + vpdf_BN + vpdf_BE + vpdf_BS + vpdf_BW; + vdens = VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(vpdf_C,VADD(vpdf_N,vpdf_E)),VADD(vpdf_S,vpdf_W)),VADD(vpdf_NE,vpdf_SE)),VADD(vpdf_SW,vpdf_NW)),VADD(vpdf_T,vpdf_TN)),VADD(vpdf_TE,vpdf_TS)),VADD(vpdf_TW,vpdf_B)),VADD(vpdf_BN,vpdf_BE)),VADD(vpdf_BS,vpdf_BW)); + + //vdir_indep_trm = vdens - (vux * vux + vuy * vuy + vuz * vuz) * VTHREE_HALF; + vdir_indep_trm = VSUB(vdens,VMUL(VADD(VADD(VMUL(vux,vux),VMUL(vuy,vuy)),VMUL(vuz,vuz)),VTHREE_HALF)); + + //src[I(index, D3Q19_C)] =[UA] vpdf_C - vomegaEven * (vpdf_C - vw_0 * vdir_indep_trm); + VSTU(ppdf_C,VSUB(vpdf_C,VMUL(vomegaEven,VSUB(vpdf_C,VMUL(vw_0,vdir_indep_trm))))); + + //vw_1_indep = vw_1 * vdir_indep_trm; + vw_1_indep = VMUL(vw_1,vdir_indep_trm); + + #define COLLIDE_AA_V(tmpVui, dir1, dir2) \ + vui = tmpVui; \ + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(JOIN(vpdf_,dir1),JOIN(vpdf_,dir2))),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep));\ + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(JOIN(vpdf_,dir1),JOIN(vpdf_,dir2))),VMUL(vui,vw_1_x3)));\ + VSTU(JOIN(ppdf_,dir2),VSUB(VSUB(JOIN(vpdf_,dir1),vevenPart),voddPart));\ + 
VSTU(JOIN(ppdf_,dir1),VADD(VSUB(JOIN(vpdf_,dir2),vevenPart),voddPart)); + + COLLIDE_AA_V(vuy, N, S) + COLLIDE_AA_V(vux, E, W) + COLLIDE_AA_V(vuz, T, B) + + #undef COLLIDE_AA_V + + //vw_2_indep = vw_2 * vdir_indep_trm; + vw_2_indep = VMUL(vw_2,vdir_indep_trm); + + // collide axis unaligned pdfs vectorized + #define COLLIDE_UA_V(tmpVui, dir1, dir2) \ + vui = tmpVui; \ + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(JOIN(vpdf_,dir1),JOIN(vpdf_,dir2))),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));\ + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(JOIN(vpdf_,dir1),JOIN(vpdf_,dir2))),VMUL(vui,vw_2_x3)));\ + VSTU(JOIN(ppdf_,dir2),VSUB(VSUB(JOIN(vpdf_,dir1),vevenPart),voddPart)); \ + VSTU(JOIN(ppdf_,dir1),VADD(VSUB(JOIN(vpdf_,dir2),vevenPart),voddPart)); + + COLLIDE_UA_V(VSUB(vuy,vux), NW, SE) + COLLIDE_UA_V(VADD(vux,vuy), NE, SW) + COLLIDE_UA_V(VSUB(vuz,vux), TW, BE) + COLLIDE_UA_V(VADD(vux,vuz), TE, BW) + COLLIDE_UA_V(VSUB(vuz,vuy), TS, BN) + COLLIDE_UA_V(VADD(vuy,vuz), TN, BS) + + #undef COLLIDE_UA_V + + #define X(name, idx, idxinv, _x, _y, _z) JOIN(ppdf_,name) +=VSIZE; + D3Q19_LIST + #undef X + } // loop over fluid nodes + + for (int index = indexStopVec; index < indexStop; ++index) { + + #define X(name, idx, idxinv, _x, _y, _z) JOIN(pdf_,name) = *(JOIN(ppdf_,name)); + D3Q19_LIST + #undef X + + ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE - + pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW; + uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN - + pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS; + uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS - + pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS; + + dens = pdf_C + + pdf_N + pdf_E + pdf_S + pdf_W + + pdf_NE + pdf_SE + pdf_SW + pdf_NW + + pdf_T + pdf_TN + pdf_TE + pdf_TS + pdf_TW + + pdf_B + pdf_BN + pdf_BE + pdf_BS + pdf_BW; + + dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz)*3.0/2.0; + + // direction: w_0 + *ppdf_C = pdf_C - omegaEven*(pdf_C - w_0*dir_indep_trm); + + // direction: w_1 + w_1_indep = w_1*dir_indep_trm; + + #define COLLIDE_AA_S(tmpUi, dir1, dir2) \ + ui = tmpUi; \ + evenPart = omegaEven * (0.5 * (JOIN(pdf_,dir1) + JOIN(pdf_,dir2)) - ui * ui * w_1_nine_half - w_1_indep); \ + oddPart = omegaOdd * (0.5 * (JOIN(pdf_,dir1) - JOIN(pdf_,dir2)) - ui * w_1_x3); \ + *(JOIN(ppdf_,dir2)) = JOIN(pdf_,dir1) - evenPart - oddPart; \ + *(JOIN(ppdf_,dir1)) = JOIN(pdf_,dir2) - evenPart + oddPart; + + COLLIDE_AA_S(uy, N, S) + COLLIDE_AA_S(ux, E, W) + COLLIDE_AA_S(uz, T, B) + + #undef COLLIDE_AA_S + + // direction: w_2 + w_2_indep = w_2*dir_indep_trm; + + #define COLLIDE_UA_S(tmpUi, dir1, dir2) \ + ui = tmpUi; \ + evenPart = omegaEven * (0.5 * (JOIN(pdf_,dir1) + JOIN(pdf_,dir2)) - ui * ui * w_2_nine_half - w_2_indep); \ + oddPart = omegaOdd * (0.5 * (JOIN(pdf_,dir1) - JOIN(pdf_,dir2)) - ui * w_2_x3); \ + *(JOIN(ppdf_,dir2)) = JOIN(pdf_,dir1) - evenPart - oddPart; \ + *(JOIN(ppdf_,dir1)) = JOIN(pdf_,dir2) - evenPart + oddPart; + + COLLIDE_UA_S((-ux + uy), NW, SE) + COLLIDE_UA_S(( ux + uy), NE, SW) + COLLIDE_UA_S((-ux + uz), TW, BE) + COLLIDE_UA_S(( ux + uz), TE, BW) + COLLIDE_UA_S((-uy + uz), TS, BN) + COLLIDE_UA_S(( uy + uz), TN, BS) + + #undef COLLIDE_UA_S + + #define X(name, idx, idxinv, _x, _y, _z) JOIN(ppdf_,name)++; + D3Q19_LIST + #undef X + } // loop over fluid nodes + + #undef I + + return; +} + + +static void KernelOdd(LatticeDesc * ld, KernelData * kernelData, CaseData * cd) +{ + + Assert(ld != NULL); + Assert(kernelData != NULL); + Assert(cd != NULL); + + Assert(cd->Omega > 0.0); + Assert(cd->Omega < 2.0); + + KernelData * kd = 
(KernelData *)kernelData; + KernelDataList * kdl = KDL(kernelData); + KernelDataListRia * kdlr = KDLR(kernelData); + + PdfT omega = cd->Omega; + PdfT omegaEven = omega; + + PdfT magicParam = 1.0 / 12.0; + PdfT omegaOdd = 1.0 / (0.5 + magicParam / (1.0 / omega - 0.5)); + + PdfT evenPart = 0.0; + PdfT oddPart = 0.0; + PdfT dir_indep_trm = 0.0; + + const PdfT w_0 = 1.0 / 3.0; + const PdfT w_1 = 1.0 / 18.0; + const PdfT w_2 = 1.0 / 36.0; + + const PdfT w_1_x3 = w_1 * 3.0; const PdfT w_1_nine_half = w_1 * 9.0 / 2.0; PdfT w_1_indep = 0.0; + const PdfT w_2_x3 = w_2 * 3.0; const PdfT w_2_nine_half = w_2 * 9.0 / 2.0; PdfT w_2_indep = 0.0; + + PdfT ux, uy, uz, ui; + PdfT dens; + + VPDFT VONE_HALF = VSET(0.5); + VPDFT VTHREE_HALF = VSET(3.0 / 2.0); + + VPDFT vw_1_indep, vw_2_indep; + VPDFT vw_0 = VSET(w_0); + VPDFT vw_1 = VSET(w_1); + VPDFT vw_2 = VSET(w_2); + + VPDFT vw_1_x3 = VSET(w_1_x3); + VPDFT vw_2_x3 = VSET(w_2_x3); + VPDFT vw_1_nine_half = VSET(w_1_nine_half); + VPDFT vw_2_nine_half = VSET(w_2_nine_half); + + VPDFT vux, vuy, vuz, vui; + VPDFT vdens; + + VPDFT vevenPart, voddPart, vdir_indep_trm; + + VPDFT vomegaEven = VSET(omegaEven); + VPDFT vomegaOdd = VSET(omegaOdd); + + + // Declare pdf_N, pdf_E, pdf_S, pdf_W, ... + #define X(name, idx, idxinv, x, y, z) \ + PdfT JOIN(pdf_,name); \ + PdfT * JOIN(ppdf_,name) = NULL; \ + VPDFT JOIN(vpdf_,name); + D3Q19_LIST + #undef X + #define X(name, idx, idxinv, x, y, z) __m256i JOIN(vgatheridx_,name); + D3Q19_LIST_WO_C + #undef X + + PdfT * src = kd->Pdfs[0]; + + int nCells = kdl->nCells; + + int adjListIndex; + uint32_t * adjList = kdl->AdjList; + + int * oddKernelThreadStartIndices = kdlr->oddKernelThreadStartIndices; + int * loopStartIndices = kdlr->loopStartIndices; + + int threadId = 0; + +#ifdef _OPENMP + threadId = omp_get_thread_num(); +#endif + + int threadStartIndex = oddKernelThreadStartIndices[threadId + 0]; + int threadStopIndex = oddKernelThreadStartIndices[threadId + 1]; + + int indexStart = loopStartIndices[threadStartIndex]; + int indexStop = loopStartIndices[threadStopIndex ]; + + int consecStartIndex = threadStartIndex + 1; + int consecStopIndex = threadStopIndex - 1; + + #define I(index, dir) P_INDEX_3((nCells), (index), (dir)) + #define ADJ_LIST(dir) adjList[adjListIndex + (dir * VSIZE)] + + // scalar peel loop + #pragma novector + #pragma loop_count max=7 + for (int index = indexStart; index < loopStartIndices[consecStartIndex]; ++index) { + + adjListIndex = (index - (index % VSIZE)) * N_D3Q19_IDX + (index % VSIZE); + #define X(name, idx, idxinv, _x, _y, _z) JOIN(ppdf_,name) = &(src[ADJ_LIST(idxinv)]); + D3Q19_LIST_WO_C + #undef X + #define X(name, idx, idxinv, _x, _y, _z) JOIN(pdf_,name) = *(JOIN(ppdf_,name)); + D3Q19_LIST_WO_C + #undef X + + ppdf_C = &(src[P_INDEX_3(nCells, index, D3Q19_C)]); + pdf_C = *ppdf_C; + + // macroscopic values + ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE - + pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW; + uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN - + pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS; + uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS - + pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS; + + dens = pdf_C + + pdf_N + pdf_E + pdf_S + pdf_W + + pdf_NE + pdf_SE + pdf_SW + pdf_NW + + pdf_T + pdf_TN + pdf_TE + pdf_TS + pdf_TW + + pdf_B + pdf_BN + pdf_BE + pdf_BS + pdf_BW; + + dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz)*3.0/2.0; + + // collide direction: w_0 + *ppdf_C = pdf_C - omegaEven * (pdf_C - w_0 * dir_indep_trm); + + // collide direction: w_1 + w_1_indep = w_1 * dir_indep_trm; + + #define 
COLLIDE_AA_S(tmpUi, dir1, dir2) \ + ui = tmpUi; \ + evenPart = omegaEven * (0.5 * (JOIN(pdf_,dir1) + JOIN(pdf_,dir2)) - ui * ui * w_1_nine_half - w_1_indep); \ + oddPart = omegaOdd * (0.5 * (JOIN(pdf_,dir1) - JOIN(pdf_,dir2)) - ui * w_1_x3); \ + *(JOIN(ppdf_,dir2)) = JOIN(pdf_,dir1) - evenPart - oddPart; \ + *(JOIN(ppdf_,dir1)) = JOIN(pdf_,dir2) - evenPart + oddPart; + + COLLIDE_AA_S(uy, N, S) + COLLIDE_AA_S(ux, E, W) + COLLIDE_AA_S(uz, T, B) + + #undef COLLIDE_AA_S + + // collide direction: w_2 + w_2_indep = w_2 * dir_indep_trm; + + #define COLLIDE_UA_S(tmpUi, dir1, dir2) \ + ui = tmpUi; \ + evenPart = omegaEven * (0.5 * (JOIN(pdf_,dir1) + JOIN(pdf_,dir2)) - ui * ui * w_2_nine_half - w_2_indep); \ + oddPart = omegaOdd * (0.5 * (JOIN(pdf_,dir1) - JOIN(pdf_,dir2)) - ui * w_2_x3); \ + *(JOIN(ppdf_,dir2)) = JOIN(pdf_,dir1) - evenPart - oddPart; \ + *(JOIN(ppdf_,dir1)) = JOIN(pdf_,dir2) - evenPart + oddPart; + + COLLIDE_UA_S((-ux + uy), NW, SE) + COLLIDE_UA_S(( ux + uy), NE, SW) + COLLIDE_UA_S((-ux + uz), TW, BE) + COLLIDE_UA_S(( ux + uz), TE, BW) + COLLIDE_UA_S((-uy + uz), TS, BN) + COLLIDE_UA_S(( uy + uz), TN, BS) + + #undef COLLIDE_UA_S + + } // scalar peel loop + + // vectorized loop + for (int consecIndex = consecStartIndex; consecIndex < consecStopIndex; /* increment in loop body */) { + + int index = loopStartIndices[consecIndex]; + adjListIndex = (index - (index % VSIZE)) * N_D3Q19_IDX + (index % VSIZE); + #define X(name, idx, idxinv, _x, _y, _z) JOIN(ppdf_,name) = &(src[ADJ_LIST(idxinv)]); + D3Q19_LIST_WO_C + #undef X + ppdf_C = &(src[P_INDEX_3(nCells, index, D3Q19_C)]); + + // loop for consecutive access pattern -> load & store + ++consecIndex; + + for (; index < loopStartIndices[consecIndex]; index+=VSIZE){ + + #if (SOFTWARE_PREFETCH_LOOKAHEAD_L2 > 0) + #define X(name, idx, idxinv, _x, _y, _z) _mm_prefetch((char const *)(JOIN(ppdf_,name) + SOFTWARE_PREFETCH_LOOKAHEAD_L2 * VSIZE), _MM_HINT_T1); + D3Q19_LIST + #undef X + #endif + #if (SOFTWARE_PREFETCH_LOOKAHEAD_L1 > 0) + #define X(name, idx, idxinv, _x, _y, _z) _mm_prefetch((char const *)(JOIN(ppdf_,name) + SOFTWARE_PREFETCH_LOOKAHEAD_L1 * VSIZE), _MM_HINT_T0); + D3Q19_LIST + #undef X + #endif + + #define X(name, idx, idxinv, _x, _y, _z) JOIN(vpdf_,name) = VLDU(JOIN(ppdf_,name)); + D3Q19_LIST + #undef X + + // macroscopic values + //vux = vpdf_E + vpdf_NE + vpdf_SE + vpdf_TE + vpdf_BE - + // vpdf_W - vpdf_NW - vpdf_SW - vpdf_TW - vpdf_BW; + vux = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_E,VADD(vpdf_NE,vpdf_SE)),VADD(vpdf_TE,vpdf_BE)),vpdf_W),vpdf_NW),vpdf_SW),vpdf_TW),vpdf_BW); + //vuy = vpdf_N + vpdf_NE + vpdf_NW + vpdf_TN + vpdf_BN - + // vpdf_S - vpdf_SE - vpdf_SW - vpdf_TS - vpdf_BS; + vuy = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_N,VADD(vpdf_NE,vpdf_NW)),VADD(vpdf_TN,vpdf_BN)),vpdf_S),vpdf_SE),vpdf_SW),vpdf_TS),vpdf_BS); + //vuz = vpdf_T + vpdf_TE + vpdf_TW + vpdf_TN + vpdf_TS - + // vpdf_B - vpdf_BE - vpdf_BW - vpdf_BN - vpdf_BS; + vuz = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_T,VADD(vpdf_TE,vpdf_TW)),VADD(vpdf_TN,vpdf_TS)),vpdf_B),vpdf_BE),vpdf_BW),vpdf_BN),vpdf_BS); + + //vdens = vpdf_C + + // vpdf_N + vpdf_E + vpdf_S + vpdf_W + + // vpdf_NE + vpdf_SE + vpdf_SW + vpdf_NW + + // vpdf_T + vpdf_TN + vpdf_TE + vpdf_TS + vpdf_TW + + // vpdf_B + vpdf_BN + vpdf_BE + vpdf_BS + vpdf_BW; + vdens = VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(vpdf_C,VADD(vpdf_N,vpdf_E)),VADD(vpdf_S,vpdf_W)),VADD(vpdf_NE,vpdf_SE)), + 
VADD(vpdf_SW,vpdf_NW)),VADD(vpdf_T,vpdf_TN)),VADD(vpdf_TE,vpdf_TS)),VADD(vpdf_TW,vpdf_B)),VADD(vpdf_BN,vpdf_BE)),VADD(vpdf_BS,vpdf_BW)); + + //vdir_indep_trm = vdens - (vux * vux + vuy * vuy + vuz * vuz) * VTHREE_HALF; + vdir_indep_trm = VSUB(vdens,VMUL(VADD(VADD(VMUL(vux,vux),VMUL(vuy,vuy)),VMUL(vuz,vuz)),VTHREE_HALF)); + + // collide direction: w_0 + //src[I(index, D3Q19_C)] =[UA] vpdf_C - vomegaEven * (vpdf_C - vw_0 * vdir_indep_trm); + VSTU(ppdf_C,VSUB(vpdf_C,VMUL(vomegaEven,VSUB(vpdf_C,VMUL(vw_0,vdir_indep_trm))))); + + // collide direction: w_1 + vw_1_indep = VMUL(vw_1,vdir_indep_trm); + + #define COLLIDE_AA_V(tmpVui, dir1, dir2) \ + vui = tmpVui; \ + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(JOIN(vpdf_,dir1),JOIN(vpdf_,dir2))),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep));\ + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(JOIN(vpdf_,dir1),JOIN(vpdf_,dir2))),VMUL(vui,vw_1_x3)));\ + VSTU(JOIN(ppdf_,dir2),VSUB(VSUB(JOIN(vpdf_,dir1),vevenPart),voddPart));\ + VSTU(JOIN(ppdf_,dir1),VADD(VSUB(JOIN(vpdf_,dir2),vevenPart),voddPart)); + + COLLIDE_AA_V(vuy, N, S) + COLLIDE_AA_V(vux, E, W) + COLLIDE_AA_V(vuz, T, B) + + #undef COLLIDE_AA_V + + // collide direction: w_2 + vw_2_indep = VMUL(vw_2,vdir_indep_trm); + + #define COLLIDE_UA_V(tmpVui, dir1, dir2) \ + vui = tmpVui; \ + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(JOIN(vpdf_,dir1),JOIN(vpdf_,dir2))),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));\ + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(JOIN(vpdf_,dir1),JOIN(vpdf_,dir2))),VMUL(vui,vw_2_x3)));\ + VSTU(JOIN(ppdf_,dir2),VSUB(VSUB(JOIN(vpdf_,dir1),vevenPart),voddPart)); \ + VSTU(JOIN(ppdf_,dir1),VADD(VSUB(JOIN(vpdf_,dir2),vevenPart),voddPart)); + + COLLIDE_UA_V(VSUB(vuy,vux), NW, SE) + COLLIDE_UA_V(VADD(vux,vuy), NE, SW) + COLLIDE_UA_V(VSUB(vuz,vux), TW, BE) + COLLIDE_UA_V(VADD(vux,vuz), TE, BW) + COLLIDE_UA_V(VSUB(vuz,vuy), TS, BN) + COLLIDE_UA_V(VADD(vuy,vuz), TN, BS) + + #undef COLLIDE_UA_V + + //increment pointer + #define X(name, idx, idxinv, _x, _y, _z) JOIN(ppdf_,name) += VSIZE; + D3Q19_LIST + #undef X + } + + // loop for irregular access pattern -> gather & scatter + index = loopStartIndices[consecIndex]; + ++consecIndex; + for (; index < loopStartIndices[consecIndex]; index+=VSIZE){ + + + #if (SOFTWARE_PREFETCH_LOOKAHEAD_L2 > 0) + int const indexPrefetchL2 = index + VSIZE * SOFTWARE_PREFETCH_LOOKAHEAD_L2; + // make sure that adjList access is never out of bounds since it is an actual memory access and no prefetch + if (indexPrefetchL2 < indexStop){ + adjListIndex = (indexPrefetchL2 - (indexPrefetchL2 % VSIZE)) * N_D3Q19_IDX + (indexPrefetchL2 % VSIZE); + #define X(name, idx, idxinv, _x, _y, _z) VPG32(VLIU(&ADJ_LIST(idxinv)), (char const *) src, 8, _MM_HINT_T1); + D3Q19_LIST_WO_C + #undef X + + _mm_prefetch((char const *) &(src[P_INDEX_3(nCells, indexPrefetchL2, D3Q19_C)]), _MM_HINT_T1); + } + #endif + + #if (SOFTWARE_PREFETCH_LOOKAHEAD_L1 > 0) + int const indexPrefetchL1 = index + VSIZE * SOFTWARE_PREFETCH_LOOKAHEAD_L1; + // make sure that adjList access is never out of bounds since it is an actual memory access and no prefetch + if (indexPrefetchL1 < indexStop){ + adjListIndex = (indexPrefetchL1 - (indexPrefetchL1 % VSIZE)) * N_D3Q19_IDX + (indexPrefetchL1 % VSIZE); + #define X(name, idx, idxinv, _x, _y, _z) VPG32(VLIU(&ADJ_LIST(idxinv)), (char const *) src, 8, _MM_HINT_T0); + D3Q19_LIST_WO_C + #undef X + + _mm_prefetch((char const *) &(src[P_INDEX_3(nCells, indexPrefetchL1, D3Q19_C)]), _MM_HINT_T0); + } + #endif + + adjListIndex = 
(index - (index % VSIZE)) * N_D3Q19_IDX + (index % VSIZE); + #define X(name, idx, idxinv, _x, _y, _z) JOIN(vgatheridx_,name) = VLIU(&(ADJ_LIST(idxinv))); + D3Q19_LIST_WO_C + #undef X + + #define X(name, idx, idxinv, _x, _y, _z) JOIN(vpdf_,name) = VG32(JOIN(vgatheridx_,name), src, 8); + D3Q19_LIST_WO_C + #undef X + + ppdf_C = &(src[P_INDEX_3(nCells, index, D3Q19_C)]); + vpdf_C = VLDU(ppdf_C); + + // macroscopic values + //vux = vpdf_E + vpdf_NE + vpdf_SE + vpdf_TE + vpdf_BE - + // vpdf_W - vpdf_NW - vpdf_SW - vpdf_TW - vpdf_BW; + vux = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_E,VADD(vpdf_NE,vpdf_SE)),VADD(vpdf_TE,vpdf_BE)),vpdf_W),vpdf_NW),vpdf_SW),vpdf_TW),vpdf_BW); + //vuy = vpdf_N + vpdf_NE + vpdf_NW + vpdf_TN + vpdf_BN - + // vpdf_S - vpdf_SE - vpdf_SW - vpdf_TS - vpdf_BS; + vuy = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_N,VADD(vpdf_NE,vpdf_NW)),VADD(vpdf_TN,vpdf_BN)),vpdf_S),vpdf_SE),vpdf_SW),vpdf_TS),vpdf_BS); + //vuz = vpdf_T + vpdf_TE + vpdf_TW + vpdf_TN + vpdf_TS - + // vpdf_B - vpdf_BE - vpdf_BW - vpdf_BN - vpdf_BS; + vuz = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_T,VADD(vpdf_TE,vpdf_TW)),VADD(vpdf_TN,vpdf_TS)),vpdf_B),vpdf_BE),vpdf_BW),vpdf_BN),vpdf_BS); + + //vdens = vpdf_C + + // vpdf_N + vpdf_E + vpdf_S + vpdf_W + + // vpdf_NE + vpdf_SE + vpdf_SW + vpdf_NW + + // vpdf_T + vpdf_TN + vpdf_TE + vpdf_TS + vpdf_TW + + // vpdf_B + vpdf_BN + vpdf_BE + vpdf_BS + vpdf_BW; + vdens = VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(vpdf_C,VADD(vpdf_N,vpdf_E)),VADD(vpdf_S,vpdf_W)),VADD(vpdf_NE,vpdf_SE)), + VADD(vpdf_SW,vpdf_NW)),VADD(vpdf_T,vpdf_TN)),VADD(vpdf_TE,vpdf_TS)),VADD(vpdf_TW,vpdf_B)),VADD(vpdf_BN,vpdf_BE)),VADD(vpdf_BS,vpdf_BW)); + + //vdir_indep_trm = vdens - (vux * vux + vuy * vuy + vuz * vuz) * VTHREE_HALF; + vdir_indep_trm = VSUB(vdens,VMUL(VADD(VADD(VMUL(vux,vux),VMUL(vuy,vuy)),VMUL(vuz,vuz)),VTHREE_HALF)); + + // collide direction: w_0 + //src[I(index, D3Q19_C)] =[UA] vpdf_C - vomegaEven * (vpdf_C - vw_0 * vdir_indep_trm); + VSTU(ppdf_C,VSUB(vpdf_C,VMUL(vomegaEven,VSUB(vpdf_C,VMUL(vw_0,vdir_indep_trm))))); + + + // collide axis aligend pdfs vectorized + #define SCAT(offsets, vsrc) VS32(src, offsets, vsrc, 8) + + // collide direction: w_1 + vw_1_indep = VMUL(vw_1,vdir_indep_trm); + + #define COLLIDE_AA_V(tmpVui, dir1, dir2) \ + vui = tmpVui; \ + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(JOIN(vpdf_,dir1),JOIN(vpdf_,dir2))),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep));\ + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(JOIN(vpdf_,dir1),JOIN(vpdf_,dir2))),VMUL(vui,vw_1_x3)));\ + SCAT(JOIN(vgatheridx_,dir2),VSUB(VSUB(JOIN(vpdf_,dir1),vevenPart),voddPart));\ + SCAT(JOIN(vgatheridx_,dir1),VADD(VSUB(JOIN(vpdf_,dir2),vevenPart),voddPart)); + + COLLIDE_AA_V(vuy, N, S) + COLLIDE_AA_V(vux, E, W) + COLLIDE_AA_V(vuz, T, B) + + #undef COLLIDE_AA_V + + // collide direction: w_2 + vw_2_indep = VMUL(vw_2,vdir_indep_trm); + + #define COLLIDE_UA_V(tmpVui, dir1, dir2) \ + vui = tmpVui; \ + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(JOIN(vpdf_,dir1),JOIN(vpdf_,dir2))),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));\ + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(JOIN(vpdf_,dir1),JOIN(vpdf_,dir2))),VMUL(vui,vw_2_x3)));\ + SCAT(JOIN(vgatheridx_,dir2),VSUB(VSUB(JOIN(vpdf_,dir1),vevenPart),voddPart)); \ + SCAT(JOIN(vgatheridx_,dir1),VADD(VSUB(JOIN(vpdf_,dir2),vevenPart),voddPart)); + + COLLIDE_UA_V(VSUB(vuy,vux), NW, SE) + COLLIDE_UA_V(VADD(vux,vuy), NE, SW) + COLLIDE_UA_V(VSUB(vuz,vux), TW, BE) + COLLIDE_UA_V(VADD(vux,vuz), TE, BW) + 
COLLIDE_UA_V(VSUB(vuz,vuy), TS, BN) + COLLIDE_UA_V(VADD(vuy,vuz), TN, BS) + + #undef COLLIDE_UA_V + #undef SCAT + } // gather & scatter loop + + } // Vectorized Loop + + // Scalar remainder loop + #pragma novector + #pragma loop_count max=7 + for (int index = loopStartIndices[consecStopIndex]; index < indexStop; ++index) { + + adjListIndex = (index - (index % VSIZE)) * N_D3Q19_IDX + (index % VSIZE); + #define X(name, idx, idxinv, _x, _y, _z) JOIN(ppdf_,name) = &(src[ADJ_LIST(idxinv)]); + D3Q19_LIST_WO_C + #undef X + #define X(name, idx, idxinv, _x, _y, _z) JOIN(pdf_,name) = *(JOIN(ppdf_,name)); + D3Q19_LIST_WO_C + #undef X + + ppdf_C = &(src[P_INDEX_3(nCells, index, D3Q19_C)]); + pdf_C = *ppdf_C; + + // macroscopic values + ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE - + pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW; + uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN - + pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS; + uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS - + pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS; + + dens = pdf_C + + pdf_N + pdf_E + pdf_S + pdf_W + + pdf_NE + pdf_SE + pdf_SW + pdf_NW + + pdf_T + pdf_TN + pdf_TE + pdf_TS + pdf_TW + + pdf_B + pdf_BN + pdf_BE + pdf_BS + pdf_BW; + + dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz)*3.0/2.0; + + // collide direction: w_0 + *ppdf_C = pdf_C - omegaEven * (pdf_C - w_0 * dir_indep_trm); + + // collide direction: w_1 + w_1_indep = w_1 * dir_indep_trm; + + #define COLLIDE_AA_S(tmpUi, dir1, dir2) \ + ui = tmpUi; \ + evenPart = omegaEven * (0.5 * (JOIN(pdf_,dir1) + JOIN(pdf_,dir2)) - ui * ui * w_1_nine_half - w_1_indep); \ + oddPart = omegaOdd * (0.5 * (JOIN(pdf_,dir1) - JOIN(pdf_,dir2)) - ui * w_1_x3); \ + *(JOIN(ppdf_,dir2)) = JOIN(pdf_,dir1) - evenPart - oddPart; \ + *(JOIN(ppdf_,dir1)) = JOIN(pdf_,dir2) - evenPart + oddPart; + + COLLIDE_AA_S(uy, N, S) + COLLIDE_AA_S(ux, E, W) + COLLIDE_AA_S(uz, T, B) + + #undef COLLIDE_AA_S + + // collide direction: w_2 + w_2_indep = w_2 * dir_indep_trm; + + #define COLLIDE_UA_S(tmpUi, dir1, dir2) \ + ui = tmpUi; \ + evenPart = omegaEven * (0.5 * (JOIN(pdf_,dir1) + JOIN(pdf_,dir2)) - ui * ui * w_2_nine_half - w_2_indep); \ + oddPart = omegaOdd * (0.5 * (JOIN(pdf_,dir1) - JOIN(pdf_,dir2)) - ui * w_2_x3); \ + *(JOIN(ppdf_,dir2)) = JOIN(pdf_,dir1) - evenPart - oddPart; \ + *(JOIN(ppdf_,dir1)) = JOIN(pdf_,dir2) - evenPart + oddPart; + + COLLIDE_UA_S((-ux + uy), NW, SE) + COLLIDE_UA_S(( ux + uy), NE, SW) + COLLIDE_UA_S((-ux + uz), TW, BE) + COLLIDE_UA_S(( ux + uz), TE, BW) + COLLIDE_UA_S((-uy + uz), TS, BN) + COLLIDE_UA_S(( uy + uz), TN, BS) + + #undef COLLIDE_UA_S + + } // remainder loop over fluid nodes + +#undef SOFTWARE_PREFETCH_LOAD_STORE +#undef SOFTWARE_PREFETCH_LOAD_STORE_HINT +#undef SOFTWARE_PREFETCH_LOAD_STORE_LOOKAHEAD +#undef SOFTWARE_PREFETCH_GATHER_SCATTER +#undef SOFTWARE_PREFETCH_GATHER_SCATTER_HINT +#undef SOFTWARE_PREFETCH_GATHER_SCATTER_LOOKAHEAD +#undef ADJ_LIST +#undef I +} diff --git a/src/BenchKernelD3Q19ListAaPvGatherHybrid.h b/src/BenchKernelD3Q19ListAaPvGatherHybrid.h new file mode 100644 index 0000000..580f9d7 --- /dev/null +++ b/src/BenchKernelD3Q19ListAaPvGatherHybrid.h @@ -0,0 +1,44 @@ +// -------------------------------------------------------------------------- +// +// Copyright +// Markus Wittmann, 2016-2017 +// RRZE, University of Erlangen-Nuremberg, Germany +// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de +// +// Viktor Haag, 2016 +// LSS, University of Erlangen-Nuremberg, Germany +// +// Michael Hussnaetter, 2017-2018 +// University of Erlangen-Nuremberg, Germany 
+//   michael.hussnaetter -at- fau.de
+//
+// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+// LbmBenchKernels is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// LbmBenchKernels is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#ifndef __BENCH_KERNEL_D3Q19_LIST_AA_PV_GATHER_HYBRID__
+#define __BENCH_KERNEL_D3Q19_LIST_AA_PV_GATHER_HYBRID__
+
+#include "Kernel.h"
+
+void D3Q19ListAaPvGatherHybridInit_PushSoA(LatticeDesc * ld, KernelData ** kernelData, Parameters * params);
+void D3Q19ListAaPvGatherHybridInit_PushAoSoA(LatticeDesc * ld, KernelData ** kernelData, Parameters * params);
+
+void D3Q19ListAaPvGatherHybridDeinit_PushSoA(LatticeDesc * ld, KernelData ** kernelData);
+void D3Q19ListAaPvGatherHybridDeinit_PushAoSoA(LatticeDesc * ld, KernelData ** kernelData);
+
+
+
+#endif // __BENCH_KERNEL_D3Q19_LIST_AA_PV_GATHER_HYBRID__
diff --git a/src/BenchKernelD3Q19ListAaPvGatherHybridCommon.c b/src/BenchKernelD3Q19ListAaPvGatherHybridCommon.c
new file mode 100644
index 0000000..eefb031
--- /dev/null
+++ b/src/BenchKernelD3Q19ListAaPvGatherHybridCommon.c
@@ -0,0 +1,1173 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//   Michael Hussnaetter, 2017-2018
+//   University of Erlangen-Nuremberg, Germany
+//   michael.hussnaetter -at- fau.de
+//
+// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+// LbmBenchKernels is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// LbmBenchKernels is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#include "BenchKernelD3Q19ListAaPvGatherHybridCommon.h"
+
+#include "Memory.h"
+#include "Vtk.h"
+#include "Vector.h"
+
+#include <math.h>
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#define PAGE_4K 4096
+
+#if ALLOC_ADJ_LIST_IN_HBM == 1
+#define ADJ_LIST_ALLOCATOR HbwAllocAligned
+#define ADJ_LIST_FREE      HbwFree
+#else
+#define ADJ_LIST_ALLOCATOR MemAllocAligned
+#define ADJ_LIST_FREE      MemFree
+#endif
+
+#if ALLOC_PDF_IN_HBM == 1
+#define PDF_ALLOCATOR PDF HbwAllocAligned
+#define PDF_FREE      HbwFree
+#else
+#define PDF_ALLOCATOR MemAllocAligned
+#define PDF_FREE      MemFree
+#endif
+
+// Forward definition.
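+// FNAME() decorates the kernel name with the configured propagation model
+// and data layout (e.g. _PushSoA or _PushAoSoA, matching the declarations
+// in BenchKernelD3Q19ListAaPvGatherHybrid.h), so one source file can be
+// compiled into several kernel variants.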
+void FNAME(D3Q19ListAaPvGatherHybridKernel)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd); + + + +// ----------------------------------------------------------------------- +// Functions which are used as callback by the kernel to read or write +// PDFs and nodes. + +static void FNAME(BCGetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT * pdf) +{ + Assert(kd != NULL); + Assert(kd->PdfsActive != NULL); + Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]); + Assert(pdf != NULL); + + Assert(x >= 0); Assert(y >= 0); Assert(z >= 0); + Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]); + Assert(dir >= 0); Assert(dir < N_D3Q19); + + KernelDataList * kdl = KDL(kd); + uint32_t * adjList = kdl->AdjList; + + if (kdl->Iteration % 2 == 0) { + // Pdfs are stored inverse, local PDFs are located in remote nodes + + // getting node index + uint32_t index = kdl->Grid[L_INDEX_4(kd->Dims, x, y, z)]; + + if (dir != D3Q19_C) { + #define ADJ_LIST(dir) adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (dir * VSIZE) + (index % VSIZE)] + *pdf = kd->PdfsActive[ADJ_LIST(D3Q19_INV[dir])]; + #undef ADJ_LIST + } + else { + *pdf = kd->PdfsActive[P_INDEX_3(kdl->nCells, index, dir)]; + } + + } + else { + *pdf = kd->PdfsActive[P_INDEX_5(kdl, x, y, z, dir)]; + } + + return; +} + +static void FNAME(BCSetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT pdf) +{ + Assert(kd != NULL); + Assert(kd->PdfsActive != NULL); + Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]); + Assert(x >= 0); Assert(y >= 0); Assert(z >= 0); + Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]); + Assert(dir >= 0); Assert(dir < N_D3Q19); + + if (isnan(pdf)) { + printf("ERROR: setting nan %d %d %d %d %s\n", x, y, z, dir, D3Q19_NAMES[dir]); + DEBUG_BREAK_POINT(); + exit(1); + } + + KernelDataList * kdl = KDL(kd); + uint32_t * adjList = kdl->AdjList; + + if (kdl->Iteration % 2 == 0) { + // Pdfs are stored inverse, local PDFs are located in remote nodes + + // getting node index + uint32_t index = kdl->Grid[L_INDEX_4(kd->Dims, x, y, z)]; + + if (dir != D3Q19_C) { + #define ADJ_LIST(dir) adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (dir * VSIZE) + (index % VSIZE)] + kd->PdfsActive[ADJ_LIST(D3Q19_INV[dir])] = pdf; + #undef ADJ_LIST + } else { + kd->PdfsActive[P_INDEX_3(kdl->nCells, index, dir)] = pdf; + } + + } else { + kd->PdfsActive[P_INDEX_5(kdl, x, y, z, dir)] = pdf; + } + + return; +} + + +static void GetNode(KernelData * kd, int x, int y, int z, PdfT * pdfs) +{ + Assert(kd != NULL); + Assert(kd->PdfsActive != NULL); + Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]); + Assert(pdfs != NULL); + Assert(x >= 0); Assert(y >= 0); Assert(z >= 0); + Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]); + + KernelDataList * kdl = KDL(kd); + uint32_t * adjList = kdl->AdjList; + + if(kdl->Iteration % 2 == 0){ + + uint32_t index = kdl->Grid[L_INDEX_4(kdl->kd.Dims, x, y, z)]; + + // Load PDFs of local cell: pdf_N = src[adjList[adjListIndex + D3Q19_S]]; ... 
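+        // Even time steps follow the AA pattern: PDFs are stored inverse,
+        // i.e. each local PDF is fetched via the adjacency-list entry of its
+        // inverse direction. Odd time steps (else branch) access the
+        // canonical node-local locations via P_INDEX_5.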
+ #define ADJ_LIST(dir) adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (dir * VSIZE) + (index % VSIZE)] + #define X(name, idx, idxinv, _x, _y, _z) pdfs[idx] = kd->PdfsActive[ADJ_LIST(idxinv)]; + D3Q19_LIST_WO_C + #undef X + #undef ADJ_LIST + pdfs[D3Q19_C] = kd->PdfsActive[P_INDEX_3(kdl->nCells, index, D3Q19_C)]; + + } else { + + #define I(x, y, z, dir) P_INDEX_5(KDL(kd), (x), (y), (z), (dir)) + #define X(name, idx, idxinv, _x, _y, _z) pdfs[idx] = kd->PdfsActive[I(x, y, z, idx)]; + D3Q19_LIST + #undef X + #undef I + + } + + for (int d = 0; d < 19; ++d) { + if(isnan(pdfs[d]) || isinf(pdfs[d])) { + printf("%d %d %d %d nan! get node\n", x, y, z, d); + for (int d2 = 0; d2 < 19; ++d2) { + printf("%d: %e\n", d2, pdfs[d2]); + } + exit(1); + } + } + + return; +} + + +static void SetNode(KernelData * kd, int x, int y, int z, PdfT * pdfs) +{ + Assert(kd != NULL); + Assert(kd->PdfsActive != NULL); + Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]); + Assert(pdfs != NULL); + + Assert(x >= 0); Assert(y >= 0); Assert(z >= 0); + Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]); + + for (int d = 0; d < 19; ++d) { + if(isnan(pdfs[d])) { + printf("%d %d %d %d nan! get node\n", x, y, z, d); + for (int d2 = 0; d2 < 19; ++d2) { + printf("%d: %e\n", d2, pdfs[d2]); + } + exit(1); + } + } + + KernelDataList * kdl = KDL(kd); + uint32_t * adjList = kdl->AdjList; + + if(kdl->Iteration % 2 == 0){ + + uint32_t index = kdl->Grid[L_INDEX_4(kdl->kd.Dims, x, y, z)]; + + // Load PDFs of local cell: pdf_N = src[adjList[adjListIndex + D3Q19_S]]; ... + kd->PdfsActive[P_INDEX_3(kdl->nCells, index, D3Q19_C)] = pdfs[D3Q19_C]; + + #define ADJ_LIST(dir) adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (dir * VSIZE) + (index % VSIZE)] + #define X(name, idx, idxinv, _x, _y, _z) kd->PdfsActive[ADJ_LIST(idxinv)] = pdfs[idx]; + D3Q19_LIST_WO_C + #undef X + #undef ADJ_LIST + + } else { + + #define I(x, y, z, dir) P_INDEX_5(KDL(kd), (x), (y), (z), (dir)) + #define X(name, idx, idxinv, _x, _y, _z) kd->PdfsActive[I(x, y, z, idx)] = pdfs[idx]; + D3Q19_LIST + #undef X + #undef I + + } + + return; +} + +static void ParameterUsage() +{ + printf("Kernel parameters:\n"); + printf(" [-blk ] [-blk-[xyz] ]\n"); + + return; +} + +static void ParseParameters(Parameters * params, int * blk) +{ + Assert(blk != NULL); + + blk[0] = 0; blk[1] = 0; blk[2] = 0; + + #define ARG_IS(param) (!strcmp(params->KernelArgs[i], param)) + #define NEXT_ARG_PRESENT() \ + do { \ + if (i + 1 >= params->nKernelArgs) { \ + printf("ERROR: argument %s requires a parameter.\n", params->KernelArgs[i]); \ + exit(1); \ + } \ + } while (0) + + + for (int i = 0; i < params->nKernelArgs; ++i) { + if (ARG_IS("-blk") || ARG_IS("--blk")) { + NEXT_ARG_PRESENT(); + + int tmp = strtol(params->KernelArgs[++i], NULL, 0); + + if (tmp <= 0) { + printf("ERROR: blocking parameter must be > 0.\n"); + exit(1); + } + + blk[0] = blk[1] = blk[2] = tmp; + } + else if (ARG_IS("-blk-x") || ARG_IS("--blk-x")) { + NEXT_ARG_PRESENT(); + + int tmp = strtol(params->KernelArgs[++i], NULL, 0); + + if (tmp <= 0) { + printf("ERROR: blocking parameter must be > 0.\n"); + exit(1); + } + + blk[0] = tmp; + } + else if (ARG_IS("-blk-y") || ARG_IS("--blk-y")) { + NEXT_ARG_PRESENT(); + + int tmp = strtol(params->KernelArgs[++i], NULL, 0); + + if (tmp <= 0) { + printf("ERROR: blocking parameter must be > 0.\n"); + exit(1); + } + + blk[1] = tmp; + } + else if (ARG_IS("-blk-z") || ARG_IS("--blk-z")) { + NEXT_ARG_PRESENT(); + + int tmp = strtol(params->KernelArgs[++i], 
NULL, 0); + + if (tmp <= 0) { + printf("ERROR: blocking parameter must be > 0.\n"); + exit(1); + } + + blk[2] = tmp; + } + else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) { + ParameterUsage(); + exit(1); + } + else { + printf("ERROR: unknown kernel parameter.\n"); + ParameterUsage(); + exit(1); + } + } + + #undef ARG_IS + #undef NEXT_ARG_PRESENT + + return; +} + +static void SetuploopStartIndices(LatticeDesc * ld, KernelDataListRia * kdlr, int nThreads) +{ + //#define ADJ_LIST(dir) adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (dir * VSIZE) + (index % VSIZE)] + Assert(ld != NULL); + Assert(kdlr != NULL); + Assert(nThreads > 0); + + //uint32_t * adjList = kdlr->kdl.AdjList; + uint32_t * adjList = kdlr->kdl.AdjList; + + uint32_t nLoopStartIndices = 0; + uint32_t loopStartIndex = 2; + + int nFluid = kdlr->kdl.nFluid; + + int * oddKernelThreadStartIndices = (int *)malloc(sizeof(int) * (nThreads + 1)); + + int nNodesPerThread = nFluid / nThreads; + //printf("nodesPerThread: %d\n", nNodesPerThread); + + for (int i = 0; i < nThreads; ++i) { + oddKernelThreadStartIndices[i] = i * nNodesPerThread + MinI(i, nFluid % nThreads); + } + + oddKernelThreadStartIndices[nThreads] = nFluid; + + /* + for (int i = 0; i <= nThreads; ++i) { + printf("oddKernelThreadStartIndices[%d] = %d\n", i, oddKernelThreadStartIndices[i]); + } + */ + + int threadIndex = 1; + + // We execute following code two times. + // - The first time to get the count of how many entries we need for the + // loopStartIndices array. + // - The second time to fill the array. + + // Loop over adjacency list of all nodes. + // Compare if adjacent nodes within one cache line share the same access pattern. + // First vectorized access is assumed to be consecutive (-> may be loaded with regular load). + + int lastCacheLineConsecutive = 1; + + for (int fluidBaseIndex = 1; fluidBaseIndex < nFluid + 1; fluidBaseIndex += VSIZE) { + + int currentCacheLineConsecutive = 1; + + // Loop over all directions except the center one. + for(int d = 0; d < N_D3Q19 - 1; ++d) { + Assert(d != D3Q19_C); + + // check if cache line itself has consecutive memory access pattern + for(int inChunkIndex = 0; (inChunkIndex < VSIZE - 1) && ((fluidBaseIndex + inChunkIndex) < nFluid); ++inChunkIndex) { + int index = fluidBaseIndex + inChunkIndex; + + Assert(index < nFluid); + + #define ADJ_LIST(idx, dir) adjList[((idx) - ((idx) % VSIZE)) * N_D3Q19_IDX + ((dir) * VSIZE) + ((idx) % VSIZE)] + //if (adjList[index * N_D3Q19_IDX + d] != adjList[(index - 1) * N_D3Q19_IDX + d] + 1) + if (ADJ_LIST(index, d) != ADJ_LIST(index-1, d) + 1) { + //printf("no match for index: %d\n", d); + //printf("ADJ_LlST(%d,%d) = %d != %d = ADJ_LlST(%d,%d) + 1\n", index, d, ADJ_LIST(index,d), ADJ_LIST(index-1,d), index-1, d); + // Different access pattern. 
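+					// (Editorial note) A cache line counts as consecutive only
+					// if, for every direction, the neighbor index of node i
+					// equals the neighbor index of node i-1 plus one. Only then
+					// can the kernel replace gather/scatter accesses by plain
+					// vector loads/stores for this chunk.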
+ currentCacheLineConsecutive = 0; + break; + } + #undef ADJ_LIST + + } + + if(!currentCacheLineConsecutive){ + break; //exit from nested loop + } + + } + + int interCacheLineConsecutive = 1; + + if(currentCacheLineConsecutive && lastCacheLineConsecutive){ + // check if cache line has consecutive memory access pattern to last entry of previous cache line + int lastIdxOfPreviousCacheLine = fluidBaseIndex - 2; + if (lastIdxOfPreviousCacheLine > 0) { + for(int d = 0; d < N_D3Q19 - 1; ++d) { + Assert(d != D3Q19_C); + #define ADJ_LIST(idx, dir) adjList[((idx) - ((idx) % VSIZE)) * N_D3Q19_IDX + ((dir) * VSIZE) + ((idx) % VSIZE)] + if (ADJ_LIST(fluidBaseIndex-1, d) != ADJ_LIST(lastIdxOfPreviousCacheLine, d) + 1) { + // Different access pattern. + //printf("not interCacheConsecutive\n"); + interCacheLineConsecutive = 0; + break; + } + #undef ADJ_LIST + + } + } + } + int threadBoundaryIndex = oddKernelThreadStartIndices[threadIndex]; + if (fluidBaseIndex - 1 <= threadBoundaryIndex && + threadBoundaryIndex < fluidBaseIndex + VSIZE - 1) { + // Current cache line contains thread boundary. + // These cache lines are treated by scalar peel and + // reminder loops in kernel of every thread. + // TODO maybe replace these loops with masked gather / scatter?! + if (loopStartIndex % 2 == 0) { // current index would be gather/scatter index + ++loopStartIndex; // reserving gather/scatter index + } + ++loopStartIndex; // reserving space for start remainder loop of thread n + + if (threadIndex < nThreads){ + ++loopStartIndex; // reserving space for starting peel loop of thread n+1 + + if (fluidBaseIndex - 1 == threadBoundaryIndex){ + if(!currentCacheLineConsecutive){ + ++loopStartIndex; + } + } + else { + currentCacheLineConsecutive = 1; + } + + //++loopStartIndex; // reserving space for ending peel loop / starting load/store of thread n+1 + ++loopStartIndex; // 1st load/store end == 1st gather/scatter start OR 1st gather/scatter end == 2nd load/start start + } + ++threadIndex; + } + else { + // We are not at a thread boundary. + if (currentCacheLineConsecutive) { + if(lastCacheLineConsecutive && !interCacheLineConsecutive){ + loopStartIndex+=2; + } + else if(!lastCacheLineConsecutive){ + ++loopStartIndex; + } + } + else { + if(lastCacheLineConsecutive){ + ++loopStartIndex; + } + } + } + + // treating special case when last thread has no remainder loop + if (oddKernelThreadStartIndices[nThreads] == fluidBaseIndex + VSIZE - 1) { + //printf("--> special case 111. 
loopStartIndex: %d \n", loopStartIndex); + if (loopStartIndex % 2 != 0) { // current index is gather/scatter end and load/store start index + ++loopStartIndex; //set load/store end (gather/scatter start) to same value as scalar remainder start => no more access to gather/scatter loop + } + + ++loopStartIndex; // gather/scatter end and scalar remainder start + ++loopStartIndex; // scalar remainder end and scalar peel start + + } + + lastCacheLineConsecutive = currentCacheLineConsecutive; + } + + if (nFluid > 0) { + nLoopStartIndices = loopStartIndex; + } + + int * loopStartIndices; + unsigned long loopStartIndicesByte = (nLoopStartIndices + 1) * sizeof(int); + + printf("# Loop Start Index Array Allocation:\n"); + printf("# elements: \t\t%d\n", nLoopStartIndices + 1); + printf("# size: \t\t%e MiB\n", loopStartIndicesByte / 1024.0 / 1024.0); + printf("# alignment: \t\t%d b\n", PAGE_4K); + + if (MemAllocAligned((void **)&loopStartIndices, loopStartIndicesByte, PAGE_4K)) { + printf("ERROR: allocating loopStartIndices array with MemAllocAligned failed: %lu bytes.\n", loopStartIndicesByte); + exit(1); + } + else { + printf("# allocator: \t\t\tMemAllocAligned()\n"); + } + + oddKernelThreadStartIndices[0] = 0; + loopStartIndices[0] = 0; //first scalar loop would start with 0 + loopStartIndices[1] = 0; //no peel loop expected -> first load/store loop may start at index==0 + loopStartIndices[2] = 0; //may not be set in case first access is gather/scatter -> therefore its set here + + // resetting values to default + threadIndex = 1; + lastCacheLineConsecutive = 1; + loopStartIndex = 2; + + // Loop over adjacency list of all nodes. + // Compare if adjacent nodes share the same access pattern. + + int indexAccumulator = 0; + + // for statistical reasons: + int gatherAccumulator = 0; + int loadAccumulator = 0; + int scalarLookups = 0; + int loadLookups = 0; + + + for (int fluidBaseIndex = 1; fluidBaseIndex < nFluid + 1; fluidBaseIndex += VSIZE) { + int currentCacheLineConsecutive = 1; + //printf("fluidbaseIndex: %d\n", fluidBaseIndex); + // Loop over all directions except the center one. + for(int d = 0; d < N_D3Q19 - 1; ++d) { + Assert(d != D3Q19_C); + + // check if cache line itself has consecutive memory access pattern + for(int inChunkIndex = 0; (inChunkIndex < VSIZE - 1) && ((fluidBaseIndex + inChunkIndex) < nFluid); ++inChunkIndex){ + int index = fluidBaseIndex + inChunkIndex; + + Assert(index < nFluid); + + #define ADJ_LIST(idx, dir) adjList[((idx) - ((idx) % VSIZE)) * N_D3Q19_IDX + ((dir) * VSIZE) + ((idx) % VSIZE)] + //if (adjList[index * N_D3Q19_IDX + d] != adjList[(index - 1) * N_D3Q19_IDX + d] + 1) + if (ADJ_LIST(index, d) != ADJ_LIST(index-1, d) + 1) { + // Different access pattern. + currentCacheLineConsecutive = 0; + break; + } + #undef ADJ_LIST + } + + if(!currentCacheLineConsecutive){ + break; //exit from nested loop + } + } + + int interCacheLineConsecutive = 1; + + if(currentCacheLineConsecutive && lastCacheLineConsecutive){ + // check if cache line has consecutive memory access pattern to last entry of previous cache line + int lastIdxOfPreviousCacheLine = fluidBaseIndex - 2; + if (lastIdxOfPreviousCacheLine > 0) { + for(int d = 0; d < N_D3Q19 - 1; ++d) { + Assert(d != D3Q19_C); + #define ADJ_LIST(idx, dir) adjList[((idx) - ((idx) % VSIZE)) * N_D3Q19_IDX + ((dir) * VSIZE) + ((idx) % VSIZE)] + if (ADJ_LIST(fluidBaseIndex-1, d) != ADJ_LIST(lastIdxOfPreviousCacheLine, d) + 1) { + // Different access pattern. 
+ interCacheLineConsecutive = 0; + break; + } + #undef ADJ_LIST + + } + } + } + + int threadBoundaryIndex = oddKernelThreadStartIndices[threadIndex]; + //if (fluidBaseIndex > 3500) + // printf("threadBoundaryIndex: %d fluidBaseIndex-1: %d fluidBaseIndex + VSIZE - 1: %d\n", threadBoundaryIndex, fluidBaseIndex-1, fluidBaseIndex + VSIZE -1); + + if (fluidBaseIndex - 1 <= threadBoundaryIndex && + threadBoundaryIndex < fluidBaseIndex + VSIZE - 1) { + // Current cache line contains thread boundary. + // These cache lines are treated by scalar peel and + // reminder loops in kernel of every thread. + // TODO maybe replace these loops with masked gather / scatter?! + if (loopStartIndex % 2 == 0) { // current index would be gather/scatter index + //loopStartIndices[loopStartIndex] = fluidBaseIndex - 1; //same value as scalar remainder start => no more access to gather/scatter loop + loopStartIndices[loopStartIndex] = indexAccumulator; //same value as scalar remainder start => no more access to gather/scatter loop + ++loopStartIndex; + } + + //loopStartIndices[loopStartIndex] = fluidBaseIndex - 1; // gather/scatter end and scalar remainder start + loopStartIndices[loopStartIndex] = indexAccumulator; // gather/scatter end and scalar remainder start + ++loopStartIndex; + + // starting indices of thread n+1 + loopStartIndices[loopStartIndex] = threadBoundaryIndex; // scalar remainder of thread n end and scalar peel of thread n+1 start + oddKernelThreadStartIndices[threadIndex] = loopStartIndex; // thread start is where scalar peel starts + + if (threadIndex < nThreads){ + indexAccumulator = ((threadBoundaryIndex + VSIZE - 1) / VSIZE ) * VSIZE; // rounding towards next multiple of VSIZE + ++loopStartIndex; + loopStartIndices[loopStartIndex] = indexAccumulator; // scalar peel end and 1st load/store start + + // treating special case when there is no peel / remainder loop + if (fluidBaseIndex - 1 == threadBoundaryIndex){ + if(!currentCacheLineConsecutive){ + ++loopStartIndex; + loopStartIndices[loopStartIndex] = indexAccumulator; // 1st load/store end and 1st gather/scatter start + gatherAccumulator += VSIZE; + } + else { + loadLookups += VSIZE; + } + indexAccumulator += VSIZE; + } + else { + scalarLookups += VSIZE; + currentCacheLineConsecutive = 1; + } + + ++loopStartIndex; // 1st load/store end == 1st gather/scatter start OR 1st gather/scatter end == 2nd load/start start + loopStartIndices[loopStartIndex] = indexAccumulator; // 1st load/store end == 1st gather/scatter start OR 1st gather/scatter end == 2nd load/start start + } + ++threadIndex; + + } + else { + // We are not at a thread boundary. + int print = 0; + if (currentCacheLineConsecutive) { + loadAccumulator += VSIZE; + + if(lastCacheLineConsecutive && !interCacheLineConsecutive){ + loadLookups += VSIZE; + if (print) + printf("#1 loopStartIndex: %d\n", loopStartIndex); + // loopStartIndices[loopStartIndex] is not incremented since pointers need to be fetched again. 
+ // loopStartIndices[loopStartIndex + 1] (-> start Load/Store and end Gather/Scatter) + // gets same value as loopStartIndices[loopStartindex] (-> start Gather/Scatter) + // this ensures that no gather/scatter iteration is executed + ++loopStartIndex; + loopStartIndices[loopStartIndex] = indexAccumulator; + + // loopStartIndices[loopStartIndex + 2] (-> start Gather/Scatter and end Load/Store) + // gets set to have one Load/Store iteration + ++loopStartIndex; + indexAccumulator+=VSIZE; + loopStartIndices[loopStartIndex] = indexAccumulator; + + } + else if(!lastCacheLineConsecutive){ + loadLookups += VSIZE; + if (print) + printf("#2 loopStartIndex: %d\n", loopStartIndex); + ++loopStartIndex; + indexAccumulator+=VSIZE; + loopStartIndices[loopStartIndex] = indexAccumulator; + } + else { // (lastCacheLineConsecutive && interCacheLineConsecutive) + if (print) + printf("#3 loopStartIndex: %d\n", loopStartIndex); + indexAccumulator+=VSIZE; + loopStartIndices[loopStartIndex] = indexAccumulator; + } + } + else { + gatherAccumulator += VSIZE; + if(lastCacheLineConsecutive){ + if (print) + printf("#4 loopStartIndex: %d\n", loopStartIndex); + ++loopStartIndex; + indexAccumulator+=VSIZE; + loopStartIndices[loopStartIndex] = indexAccumulator; + } + else { // lastCacheLine without not consecutive memory access pattern + if (print) + printf("#5 loopStartIndex: %d\n", loopStartIndex); + indexAccumulator+=VSIZE; + loopStartIndices[loopStartIndex] = indexAccumulator; + } + } + } + + // treating special case when last thread has no remainder loop + if (oddKernelThreadStartIndices[nThreads] == fluidBaseIndex + VSIZE - 1) { + //printf("--> special case. indexAccumulator: %d\n", indexAccumulator); + if (loopStartIndex % 2 != 0) { // current index is gather/scatter end and load/store start index + ++loopStartIndex; + loopStartIndices[loopStartIndex] = indexAccumulator; //set load/store end (gather/scatter start) to same value as scalar remainder start => no more access to gather/scatter loop + } + + ++loopStartIndex; + loopStartIndices[loopStartIndex] = indexAccumulator; // gather/scatter end and scalar remainder start + ++loopStartIndex; + loopStartIndices[loopStartIndex] = indexAccumulator; // scalar remainder end and scalar peel start + + oddKernelThreadStartIndices[threadIndex] = loopStartIndex; // thread start is where scalar peel starts + } + + lastCacheLineConsecutive = currentCacheLineConsecutive; + + } + + if (nLoopStartIndices != loopStartIndex){ + printf("ERROR: nLoopStartIndices unequal loopStartIndex!\n"); + } + + /* + printf("loopStartIndices:\n"); + for(int i = 0; i <= nLoopStartIndices; ++i){ + printf("%d ", loopStartIndices[i]); + } + printf("\n"); + printf("oddKernelThreadStartIndices:\n"); + for(int i = 0; i <= nThreads; ++i){ + printf("%d ", oddKernelThreadStartIndices[i]); + } + printf("\n"); + */ + + kdlr->loopStartIndices = loopStartIndices; + kdlr->nLoopStartIndices = nLoopStartIndices; + + kdlr->oddKernelThreadStartIndices = oddKernelThreadStartIndices; + kdlr->nOddKernelThreadStartIndices = nThreads; + + printf("# vload/vstore nodes: \t% 10d \t(%3.4f %% of total fluid nodes)\n", loadAccumulator, ((double) loadAccumulator / (double) nFluid) * 100); + printf("# gather/scatter nodes:\t% 10d \t(%3.4f %% of total fluid nodes)\n", gatherAccumulator, ((double) gatherAccumulator / (double) nFluid) * 100.0); + printf("# vload/vstore lookups:\t% 10d \n", loadLookups * (N_D3Q19 - 1)); + printf("# gather/scatter lookups:\t% 10d \n", gatherAccumulator * (N_D3Q19 - 1)); + printf("# scalar lookups: \t% 
10d \n", scalarLookups * (N_D3Q19 - 1));
+
+	double loopBalanceEven = 2.0 * 19 * sizeof(PdfT);
+	double loopBalanceOdd  = 2.0 * 19 * sizeof(PdfT) /* actual PDFs */
+		+ (((double)(gatherAccumulator + loadLookups + scalarLookups)) / nFluid) * sizeof(int) * (N_D3Q19 - 1) /* AdjList */
+		+ ((double)nLoopStartIndices / nFluid) * sizeof(int); // one lookup to loopStartIndices
+
+	double loopBalance = (loopBalanceEven + loopBalanceOdd) / 2.0;
+
+	kdlr->kdl.kd.LoopBalance = loopBalance;
+
+	printf("# loop balance:\n");
+	printf("#   even timestep: \t\t%.2f B/FLUP\n", loopBalanceEven);
+	printf("#   odd timestep:  \t\t%.2f B/FLUP\n", loopBalanceOdd);
+	printf("#   average:       \t\t%.2f B/FLUP\n", loopBalance);
+
+	return;
+}
+
+void FNAME(D3Q19ListAaPvGatherHybridInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
+{
+	KernelData * kd;
+	KernelDataList * kdl;
+	KernelDataListRia * kdlr;
+	MemAlloc((void **)&kdlr, sizeof(KernelDataListRia));
+
+	kd = (KernelData *)kdlr;
+	kdl = KDL(kdlr);
+
+	*kernelData = kd;
+
+#ifdef DEBUG
+	kd->Pdfs[0] = NULL;
+	kd->Pdfs[1] = NULL;
+	kd->PdfsActive = NULL;
+	kd->DstPdfs = NULL;
+	kd->SrcPdfs = NULL;
+	kd->Dims[0] = -1;
+	kd->Dims[1] = -1;
+	kd->Dims[2] = -1;
+	kd->GlobalDims[0] = -1;
+	kd->GlobalDims[1] = -1;
+	kd->GlobalDims[2] = -1;
+	kd->Offsets[0] = -1;
+	kd->Offsets[1] = -1;
+	kd->Offsets[2] = -1;
+
+	kd->ObstIndices = NULL;
+	kd->nObstIndices = -1;
+	kd->BounceBackPdfsSrc = NULL;
+	kd->BounceBackPdfsDst = NULL;
+	kd->nBounceBackPdfs = -1;
+
+	kdl->AdjList = NULL;
+	kdl->Coords = NULL;
+	kdl->Grid = NULL;
+	kdl->nCells = -1;
+	kdl->nFluid = -1;
+
+	kdlr->loopStartIndices = NULL;
+	kdlr->nLoopStartIndices = 0;
+	kdlr->oddKernelThreadStartIndices = NULL;
+	kdlr->nOddKernelThreadStartIndices = 0;
+#endif
+
+	kdl->Iteration = -1;
+
+	// Adjust the dimensions according to padding, if used.
+	kd->Dims[0] = kd->GlobalDims[0] = ld->Dims[0];
+	kd->Dims[1] = kd->GlobalDims[1] = ld->Dims[1];
+	kd->Dims[2] = kd->GlobalDims[2] = ld->Dims[2];
+
+	int * lDims = ld->Dims;
+
+	int lX = lDims[0];
+	int lY = lDims[1];
+	int lZ = lDims[2];
+
+	int nTotalCells = lX * lY * lZ;
+	int nCells = ld->nFluid; // TODO: + padding
+	int nFluid = ld->nFluid;
+
+	// TODO: check nCells/nFluid do not exceed 2^31. This actually has to be
+	// done during lattice setup.
+	kdl->nCells = nCells;
+	kdl->nFluid = nFluid;
+
+	PdfT * pdfs[2];
+
+	int blk[3] = { 0 };
+
+	ParseParameters(params, blk);
+
+	if (blk[0] == 0) blk[0] = lX;
+	if (blk[1] == 0) blk[1] = lY;
+	if (blk[2] == 0) blk[2] = lZ;
+
+	printf("# blocking:            \t\tx: %3d y: %3d z: %3d\n", blk[0], blk[1], blk[2]);
+
+	unsigned long latByte      = nCells * sizeof(PdfT) * N_D3Q19;
+	unsigned long latFluidByte = nFluid * sizeof(PdfT) * N_D3Q19;
+	unsigned long latPadByte   = (nCells - nFluid) * sizeof(PdfT) * N_D3Q19;
+
+	printf("# Lattice Array Allocation:\n");
+	printf("# lattice size:        \t\t%e MiB\n", latByte      / 1024.0 / 1024.0);
+	printf("# fluid lattice size:  \t\t%e MiB\n", latFluidByte / 1024.0 / 1024.0);
+	printf("# lattice padding:     \t\t%e MiB\n", latPadByte   / 1024.0 / 1024.0);
+
+
+	printf("# alignment:           \t\t%d b\n", PAGE_4K);
+
+	if (PDF_ALLOCATOR((void **)&pdfs[0], latFluidByte, PAGE_4K)) {
+		printf("ERROR: allocating PDF array with %s() failed: %lu bytes.\n", STRINGIFY(PDF_ALLOCATOR), latFluidByte);
+		exit(1);
+	}
+	else {
+		printf("# allocator: \t\t\t%s()\n", STRINGIFY(PDF_ALLOCATOR));
+	}
+
+	kd->Pdfs[0] = pdfs[0];
+
+	// Initialize PDFs with some (arbitrary) data for correct NUMA placement.
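+	// (Editorial note) First-touch placement: a page is mapped into the
+	// memory of the NUMA node whose thread writes it first, so this parallel
+	// initialization must mirror the kernel's loop scheduling for the PDFs
+	// to end up close to the threads that later stream over them.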
+ // Here we touch only the fluid nodes as this loop is OpenMP parallel and + // we want the same scheduling as in the kernel. + #ifdef _OPENMP + #pragma omp parallel for + #endif + for (int i = 0; i < nFluid; ++i) { for(int d = 0; d < N_D3Q19; ++d) { + pdfs[0][P_INDEX_3(nCells, i, d)] = 1.0; + } } + + // Initialize all PDFs to some standard value. + for (int i = 0; i < nFluid; ++i) { for(int d = 0; d < N_D3Q19; ++d) { + pdfs[0][P_INDEX_3(nCells, i, d)] = 0.0; + } } + + // ---------------------------------------------------------------------- + // create grid which will hold the index numbers of the fluid nodes + + uint32_t * grid; + + if (MemAlloc((void **)&grid, nTotalCells * sizeof(uint32_t))) { + printf("ERROR: allocating grid for numbering failed: %lu bytes.\n", nTotalCells * sizeof(uint32_t)); + exit(1); + } + kdl->Grid = grid; + + int latticeIndex; + +#ifdef DEBUG + for(int z = 0; z < lZ; ++z) { + for(int y = 0; y < lY; ++y) { + for(int x = 0; x < lX; ++x) { + + latticeIndex = L_INDEX_4(ld->Dims, x, y, z); + + grid[latticeIndex] = ~0; + } + } + } +#endif + + // ---------------------------------------------------------------------- + // generate numbering over grid + + uint32_t * coords; + + if (MemAlloc((void **)&coords, nFluid * sizeof(uint32_t) * 3)) { + printf("ERROR: allocating coords array failed: %lu bytes.\n", nFluid * sizeof(uint32_t) * 3); + exit(1); + } + + kdl->Coords = coords; + + // Index for the PDF nodes can start at 0 as we distinguish solid and fluid nodes + // through the ld->Lattice array. + int counter = 0; + + // Blocking is implemented via setup of the adjacency list. The kernel later will + // walk through the lattice blocked automatically. + for (int bZ = 0; bZ < lZ; bZ += blk[2]) { + for (int bY = 0; bY < lY; bY += blk[1]) { + for (int bX = 0; bX < lX; bX += blk[0]) { + + int eX = MIN(bX + blk[0], lX); + int eY = MIN(bY + blk[1], lY); + int eZ = MIN(bZ + blk[2], lZ); + + + for (int z = bZ; z < eZ; ++z) { + for (int y = bY; y < eY; ++y) { + for (int x = bX; x < eX; ++x) { + + latticeIndex = L_INDEX_4(lDims, x, y, z); + + if (ld->Lattice[latticeIndex] != LAT_CELL_OBSTACLE) { + grid[latticeIndex] = counter; + + coords[C_INDEX_X(counter)] = x; + coords[C_INDEX_Y(counter)] = y; + coords[C_INDEX_Z(counter)] = z; + + ++counter; + } + } } } + } } } + + Verify(counter == nFluid); + + uint32_t * adjList; + + // AoSoA addressing for adjList needs padding for (nFluid % VSIZE) != 0 + int nFluid_padded = ((nFluid + VSIZE - 1) / VSIZE) * VSIZE; + unsigned long adjListBytes = nFluid_padded * sizeof(int) * N_D3Q19_IDX; + + printf("# Adjacency List Allocation:\n"); + printf("# size: \t\t%e MiB\n", adjListBytes / 1024.0 / 1024.0); + printf("# alignment: \t\t%d b\n", PAGE_4K); + + // AdjList only requires 18 instead of 19 entries per node, as + // the center PDF needs no addressing. + if (ADJ_LIST_ALLOCATOR((void **)&adjList, adjListBytes, PAGE_4K)) { + printf("ERROR: allocating adjList array with %s() failed: %lu bytes.\n", STRINGIFY(ADJ_LIST_ALLOCATOR), adjListBytes); + exit(1); + } + else { + printf("# allocator: \t\t\t%s()\n", STRINGIFY(ADJ_LIST_ALLOCATOR)); + } + + for (int i = 0; i < nFluid_padded; ++i){ + adjList[i] = -1; + } + + kdl->AdjList = adjList; + + int x, y, z; + + uint32_t neighborIndex; + uint32_t dstIndex; + + int nx, ny, nz, px, py, pz; + + // Loop over all fluid nodes and compute the indices to the neighboring + // PDFs for configured data layout (AoS/SoA). + // Parallelized loop to ensure correct NUMA placement. 
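+	// (Editorial note) The per-axis if/else wrap below could equivalently be
+	// written branchlessly as px = (nx + lX) % lX; etc., since the D3Q19
+	// stencil offsets D3Q19_X/Y/Z are all in {-1, 0, 1}.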
+	// #ifdef _OPENMP  --> add line continuation
+	//	#pragma omp parallel for default(none)
+	//		shared(nFluid, nCells, coords, D3Q19_INV, D3Q19_X, D3Q19_Y, D3Q19_Z,
+	//		stderr,
+	//		lDims, grid, ld, lX, lY, lZ, adjList)
+	//		private(x, y, z, nx, ny, nz, neighborIndex, dstIndex)
+	// #endif
+
+	for (int fluidBaseIndex = 0; fluidBaseIndex < nFluid; fluidBaseIndex+=VSIZE) {
+
+
+		// Loop over all directions except the center one.
+		for(int d = 0; d < N_D3Q19 - 1; ++d) {
+			Assert(d != D3Q19_C);
+
+			for(int inChunkIndex = 0; (inChunkIndex < VSIZE) && ((fluidBaseIndex + inChunkIndex) < nFluid); ++inChunkIndex){
+				int index = fluidBaseIndex + inChunkIndex;
+
+				Assert(index < nFluid);
+
+				x = coords[C_INDEX_X(index)];
+				y = coords[C_INDEX_Y(index)];
+				z = coords[C_INDEX_Z(index)];
+
+				Assert(x >= 0 && x < lX);
+				Assert(y >= 0 && y < lY);
+				Assert(z >= 0 && z < lZ);
+
+				Assert(ld->Lattice[L_INDEX_4(lDims, x, y, z)] != LAT_CELL_OBSTACLE);
+
+#ifdef PROP_MODEL_PUSH
+				nx = x + D3Q19_X[d];
+				ny = y + D3Q19_Y[d];
+				nz = z + D3Q19_Z[d];
+
+#elif PROP_MODEL_PULL
+				nx = x - D3Q19_X[d];
+				ny = y - D3Q19_Y[d];
+				nz = z - D3Q19_Z[d];
+#else
+	#error No implementation for this PROP_MODEL_NAME.
+#endif
+				// If the neighbor is outside the lattice and the respective
+				// direction is periodic, we need to wrap around.
+				if (	((nx < 0 || nx >= lX) && ld->PeriodicX) ||
+						((ny < 0 || ny >= lY) && ld->PeriodicY) ||
+						((nz < 0 || nz >= lZ) && ld->PeriodicZ)
+				){
+					// x periodic
+
+					if (nx < 0) {
+						px = lX - 1;
+					}
+					else if (nx >= lX) {
+						px = 0;
+					} else {
+						px = nx;
+					}
+					// y periodic
+					if (ny < 0) {
+						py = lY - 1;
+					}
+					else if (ny >= lY) {
+						py = 0;
+					} else {
+						py = ny;
+					}
+
+					// z periodic
+					if (nz < 0) {
+						pz = lZ - 1;
+					}
+					else if (nz >= lZ) {
+						pz = 0;
+					} else {
+						pz = nz;
+					}
+
+					if (ld->Lattice[L_INDEX_4(lDims, px, py, pz)] == LAT_CELL_OBSTACLE) {
+						dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
+					}
+					else {
+						neighborIndex = grid[L_INDEX_4(lDims, px, py, pz)];
+
+						AssertMsg(neighborIndex != ~0, "Neighbor has no Index. (%d %d %d) direction %s (%d)\n", px, py, pz, D3Q19_NAMES[d], d);
+
+						dstIndex = P_INDEX_3(nCells, neighborIndex, d);
+					}
+				}
+				else if (nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) {
+					dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
+				}
+				else if (ld->Lattice[L_INDEX_4(lDims, nx, ny, nz)] == LAT_CELL_OBSTACLE) {
+					dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
+				}
+				else {
+					neighborIndex = grid[L_INDEX_4(lDims, nx, ny, nz)];
+
+					Assert(neighborIndex != ~0);
+
+					dstIndex = P_INDEX_3(nCells, neighborIndex, d);
+				}
+
+				Assert(dstIndex >= 0);
+				Assert(dstIndex < nCells * N_D3Q19);
+
+				adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (d * VSIZE) + (index % VSIZE)] = dstIndex;
+			}
+		}
+	}
+
+	// Sets unused adjList entries to an extreme value which triggers a SIGSEGV whenever these values are accidentally accessed.
+ for(int index = nFluid; index < nFluid_padded; ++index){ + for(int d = 0; d < N_D3Q19 - 1; ++d) { + adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (d * VSIZE) + (index % VSIZE)] = -10*1000*1000; + } + } + +/* + printf("============\n"); + for (int i = 0; i < nFluid_padded * (N_D3Q19_IDX + 20);){ + for (int j = 0; j < VSIZE; ++j){ + printf("%d ",adjList[i]); + ++i; + } + printf("\n"); + } + for(int dir = 0; dir < N_D3Q19; ++dir){ + printf("dir: %d\n",dir); + for(int baseIndex = 0; baseIndex < nFluid + VSIZE; baseIndex+=VSIZE){ + for(int i = 0; i < VSIZE; ++i){ + int index = baseIndex + i; + + printf("%d ", adjList[(index - (index % VSIZE)) * N_D3Q19_IDX + (dir * VSIZE) + (index % VSIZE)]); + } + printf("\n"); + } + printf("\n"); + } + printf("============\n"); +*/ + + + int nThreads = 1; + +#ifdef _OPENMP + nThreads = omp_get_max_threads(); +#endif + + SetuploopStartIndices(ld, KDLR(kd), nThreads); + + // Fill remaining KernelData structures + kd->GetNode = GetNode; + kd->SetNode = SetNode; + + kd->BoundaryConditionsGetPdf = FNAME(BCGetPdf); + kd->BoundaryConditionsSetPdf = FNAME(BCSetPdf); + + kd->Kernel = FNAME(D3Q19ListAaPvGatherHybridKernel); + + kd->DstPdfs = NULL; + kd->PdfsActive = kd->Pdfs[0]; + + return; +} + +void FNAME(D3Q19ListAaPvGatherHybridDeinit)(LatticeDesc * ld, KernelData ** kernelData) +{ + KernelDataListRia ** kdlr = (KernelDataListRia **)kernelData; + + MemFree((void **)&((*kdlr)->loopStartIndices)); + + if ((*kdlr)->oddKernelThreadStartIndices != NULL) { + MemFree((void **)&((*kdlr)->oddKernelThreadStartIndices)); + } + + KernelDataList ** kdl = (KernelDataList **)kernelData; + + ADJ_LIST_FREE((void **)&((*kdl)->AdjList)); + + MemFree((void **)&((*kdl)->Coords)); + MemFree((void **)&((*kdl)->Grid)); + + PDF_FREE((void **)&((*kernelData)->Pdfs[0])); + + MemFree((void **)kernelData); + return; +} +#undef PAGE_4K +#undef ADJ_LIST diff --git a/src/BenchKernelD3Q19ListAaPvGatherHybridCommon.h b/src/BenchKernelD3Q19ListAaPvGatherHybridCommon.h new file mode 100644 index 0000000..c103dc2 --- /dev/null +++ b/src/BenchKernelD3Q19ListAaPvGatherHybridCommon.h @@ -0,0 +1,62 @@ +// -------------------------------------------------------------------------- +// +// Copyright +// Markus Wittmann, 2016-2017 +// RRZE, University of Erlangen-Nuremberg, Germany +// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de +// +// Viktor Haag, 2016 +// LSS, University of Erlangen-Nuremberg, Germany +// +// Michael Hussnaetter, 2017-2018 +// University of Erlangen-Nuremberg, Germany +// michael.hussnaetter -at- fau.de +// +// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). +// +// LbmBenchKernels is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// LbmBenchKernels is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with LbmBenchKernels. If not, see . 
+// +// -------------------------------------------------------------------------- +#ifndef __BENCH_KERNEL_D3Q19_LIST_AA_PV_GATHER_HYBRID_COMMON_H__ +#define __BENCH_KERNEL_D3Q19_LIST_AA_PV_GATHER_HYBRID_COMMON_H__ + +#if !defined(DATA_LAYOUT_SOA) && !defined(DATA_LAYOUT_AOSOA) + #error List Gather Hybrid works only with DATA_LAYOUT_SOA or DATA_LAYOUT_AOSOA +#endif + +#include "BenchKernelD3Q19ListAaCommon.h" + +typedef struct KernelDataListRia_ { + KernelDataList kdl; + + // Array contains information for loop start indices with the following scheme for every thread: + // scalar peel start | (vectorized load store | vectorized gather scatter) ... | scalar remainder. + // Example for 3 threads with ! indicating thread boundaries + // [sp,vls,vgs,...,vls,vgs,sr ! sp, vls, vgs, ..., vls, vgs, sr ! sp, vls, vgs, ..., vls, vgs, sr] + int * loopStartIndices; + int nLoopStartIndices; // Number of entries in loopStartIndices array. + + // Array contains (for each thread) an index into loopStartIndices. + int * oddKernelThreadStartIndices; + // Number of entries in threadStartIndices + int nOddKernelThreadStartIndices; + +} KernelDataListRia; + +// Macro for casting KernelData * to KernelDataList *. +#define KDLR(_x_) ((KernelDataListRia *)(_x_)) + + +#endif // __BENCH_KERNEL_D3Q19_LIST_AA_PV_GATHER_HYBRID_COMMON_H__ + diff --git a/src/BenchKernelD3Q19ListAaRia.c b/src/BenchKernelD3Q19ListAaRia.c index 245c2a5..0cb4636 100644 --- a/src/BenchKernelD3Q19ListAaRia.c +++ b/src/BenchKernelD3Q19ListAaRia.c @@ -116,6 +116,8 @@ void FNAME(D3Q19ListAaRiaKernel)(LatticeDesc * ld, KernelData * kernelData, Case KernelStatistics(kd, ld, cd, 0); #endif + X_KERNEL_START(kernelData); + // TODO: outer openmp parallel for(int iter = 0; iter < maxIterations; iter += 2) { @@ -139,7 +141,7 @@ void FNAME(D3Q19ListAaRiaKernel)(LatticeDesc * ld, KernelData * kernelData, Case #pragma vector always #pragma simd #endif - for (int index = 0; index < nFluid; ++index) { + for (int index = 0; index < nFluid; ++index) { // LOOP list-aa-ria-even #define I(index, dir) P_INDEX_3((nCells), (index), (dir)) @@ -296,7 +298,7 @@ void FNAME(D3Q19ListAaRiaKernel)(LatticeDesc * ld, KernelData * kernelData, Case int indexStop = threadIndices[threadId] + nFluidThread; // Because of runlength coding iterations are not independent. - for (int index = indexStart; index < indexStop; ++index) { + for (int index = indexStart; index < indexStop; ++index) { // LOOP list-aa-ria-odd #define I(index, dir) P_INDEX_3((nCells), (index), (dir)) @@ -469,6 +471,8 @@ void FNAME(D3Q19ListAaRiaKernel)(LatticeDesc * ld, KernelData * kernelData, Case } // for (int iter = 0; ... 
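+	// (Editorial note) X_KERNEL_START/END (defined in Kernel.h further down
+	// in this patch) bracket the pure kernel run and store the elapsed
+	// wall-clock time in kd->Duration, which Main.c combines with
+	// kd->LoopBalance to report the derived memory volume and bandwidth.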
+ X_KERNEL_END(kernelData); + #ifdef VTK_OUTPUT if (cd->VtkOutput) { kd->PdfsActive = src; diff --git a/src/BenchKernelD3Q19ListPullSplitNt.c b/src/BenchKernelD3Q19ListPullSplitNt.c index f617406..05d1d0a 100644 --- a/src/BenchKernelD3Q19ListPullSplitNt.c +++ b/src/BenchKernelD3Q19ListPullSplitNt.c @@ -119,8 +119,10 @@ void FNAME(KernelPullSplitNt1S)(LatticeDesc * ld, KernelData * kernelData, CaseD KernelStatistics(kd, ld, cd, 0); #endif + X_KERNEL_START(kernelData); + + X_LIKWID_START("list-pull-split-nt-1s"); - X_LIKWID_START("list-pull-split-nt-1s"); #ifdef _OPENMP #pragma omp parallel default(none) \ shared(nFluid, nCells, kd, kdl, adjList, src, dst, \ @@ -244,6 +246,8 @@ void FNAME(KernelPullSplitNt1S)(LatticeDesc * ld, KernelData * kernelData, CaseD X_LIKWID_STOP("list-pull-split-nt-1s"); + X_KERNEL_END(kernelData); + #ifdef VTK_OUTPUT if (cd->VtkOutput) { kd->PdfsActive = src; @@ -330,7 +334,9 @@ void FNAME(KernelPullSplitNt2S)(LatticeDesc * ld, KernelData * kernelData, CaseD #endif - X_LIKWID_START("list-pull-split-nt-2s"); + X_KERNEL_START(kernelData); + + X_LIKWID_START("list-pull-split-nt-2s"); #ifdef _OPENMP @@ -451,7 +457,9 @@ void FNAME(KernelPullSplitNt2S)(LatticeDesc * ld, KernelData * kernelData, CaseD MemFree((void **)&tmpArray); } - X_LIKWID_STOP("list-pull-split-nt-2s"); + X_LIKWID_STOP("list-pull-split-nt-2s"); + + X_KERNEL_END(kernelData); #ifdef VTK_OUTPUT if (cd->VtkOutput) { diff --git a/src/Kernel.h b/src/Kernel.h index 05ada4e..d3f367d 100644 --- a/src/Kernel.h +++ b/src/Kernel.h @@ -8,6 +8,10 @@ // Viktor Haag, 2016 // LSS, University of Erlangen-Nuremberg, Germany // +// Michael Hussnaetter, 2017-2018 +// University of Erlangen-Nuremberg, Germany +// michael.hussnaetter -at- fau.de +// // This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). // // LbmBenchKernels is free software: you can redistribute it and/or modify @@ -47,6 +51,12 @@ #define DATA_LAYOUT_NAME AoS #endif +#ifdef DATA_LAYOUT_AOSOA + #define DATA_LAYOUT_NAME AoSoA + // number of cells consecutively stored (-> array length) + #define AOSOA_BLOCK_SIZE 8 +#endif + #ifdef PROP_MODEL_PUSH #define PROP_MODEL_NAME Push #endif @@ -187,6 +197,8 @@ typedef struct KernelData_ { int * BounceBackPdfsSrc; int * BounceBackPdfsDst; int nBounceBackPdfs; + double LoopBalance; // loopBalance when set in kernel init, Main.c may use it to derive metrics + double Duration; // duration when set in kernel routine, Main.c may use it to derive metrics void (* BoundaryConditionsGetPdf)(struct KernelData_ * kd, int x, int y, int z, int dir, PdfT * pdf); void (* BoundaryConditionsSetPdf)(struct KernelData_ * kd, int x, int y, int z, int dir, PdfT pdf); @@ -198,6 +210,10 @@ typedef struct KernelData_ { } KernelData; + +#define X_KERNEL_START(_kd) do { _kd->Duration = Time(); } while (0) +#define X_KERNEL_END(_kd) do { _kd->Duration = Time() - _kd->Duration; } while (0) + typedef struct Parameters_ { int nArgs; char ** Args; diff --git a/src/KernelFunctions.h b/src/KernelFunctions.h index 6efadd9..9c6b53e 100644 --- a/src/KernelFunctions.h +++ b/src/KernelFunctions.h @@ -8,6 +8,10 @@ // Viktor Haag, 2016 // LSS, University of Erlangen-Nuremberg, Germany // +// Michael Hussnaetter, 2017-2018 +// University of Erlangen-Nuremberg, Germany +// michael.hussnaetter -at- fau.de +// // This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). 
 //
 // LbmBenchKernels is free software: you can redistribute it and/or modify
@@ -35,6 +39,11 @@
 #include "BenchKernelD3Q19ListAa.h"
 #include "BenchKernelD3Q19ListAaRia.h"
 #include "BenchKernelD3Q19ListAaPv.h"
+#ifdef VECTOR_AVX512
+#include "BenchKernelD3Q19ListAaPvGatherAoSoA.h"
+#include "BenchKernelD3Q19ListAaPvGather.h"
+#include "BenchKernelD3Q19ListAaPvGatherHybrid.h"
+#endif
 #include "BenchKernelD3Q19ListPullSplitNt.h"
 
 typedef struct KernelFunctions_
@@ -66,6 +75,24 @@ KernelFunctions g_kernels[] =
 		.Init   = D3Q19ListAaInit_PushAoS,
 		.Deinit = D3Q19ListAaDeinit_PushAoS
 	},
+#ifdef VECTOR_AVX512
+	{
+		.Name   = "list-aa-pv-gather-aosoa",
+		.Init   = D3Q19ListAaPvGatherAoSoAInit_PushAoSoA,
+		.Deinit = D3Q19ListAaPvGatherAoSoADeinit_PushAoSoA
+	},
+	{
+		.Name   = "list-aa-pv-gather-soa",
+		.Init   = D3Q19ListAaPvGatherInit_PushSoA,
+		.Deinit = D3Q19ListAaPvGatherDeinit_PushSoA
+	},
+
+	{
+		.Name   = "list-aa-pv-gather-hybrid-soa",
+		.Init   = D3Q19ListAaPvGatherHybridInit_PushSoA,
+		.Deinit = D3Q19ListAaPvGatherHybridDeinit_PushSoA
+	},
+#endif
 	{
 		.Name   = "list-pull-split-nt-1s-soa",
 		.Init   = D3Q19ListPullSplitNt1SInit_PullSoA,
diff --git a/src/Main.c b/src/Main.c
index 10dfbbe..c9bb590 100644
--- a/src/Main.c
+++ b/src/Main.c
@@ -8,6 +8,10 @@
 //   Viktor Haag, 2016
 //   LSS, University of Erlangen-Nuremberg, Germany
 //
+//   Michael Hussnaetter, 2017-2018
+//   University of Erlangen-Nuremberg, Germany
+//   michael.hussnaetter -at- fau.de
+//
 // This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
 //
 // LbmBenchKernels is free software: you can redistribute it and/or modify
@@ -152,7 +156,7 @@ int main(int argc, char * argv[])
 #define LBM_BENCH_KERNELS_VERSION_MAJOR 0
 #define LBM_BENCH_KERNELS_VERSION_MINOR 1
 
-	printf("Lattice Boltzmann Benchmark Kernels (LbmBenchKernels) Copyright (C) 2016, 2017 LSS, RRZE\n");
+	printf("Lattice Boltzmann Benchmark Kernels (LbmBenchKernels) Copyright (C) 2016, 2017, 2018 LSS, RRZE\n");
 	printf("This program comes with ABSOLUTELY NO WARRANTY; for details see LICENSE.\n");
 	printf("This is free software, and you are welcome to redistribute it under certain conditions.\n");
 	printf("\n");
@@ -346,7 +350,7 @@ int main(int argc, char * argv[])
 			printf("Usage:\n");
 			printf("./lbmbenchk -list\n");
 			printf("./lbmbenchk \n");
-			printf("  [-dims XxYyZ] [-geometry box|channel|pipe|porosity[-<value>]] [-iterations <iterations>] [-lattice-dump-ascii]\n");
+			printf("  [-dims XxYxZ] [-geometry box|channel|pipe|blocks[-<value>]] [-iterations <iterations>] [-lattice-dump-ascii]\n");
 			printf("  [-rho-in <density>] [-rho-out <density>] [-kernel <kernel>]\n");
 			printf("  [-periodic-x]\n");
 #ifdef STATISTICS
@@ -368,7 +372,7 @@ int main(int argc, char * argv[])
 			printf("\n");
 			printf("-dims XxYxZ        Specify geometry dimensions.\n");
 			printf("\n");
-			printf("-geometry porosity-<value>\n");
+			printf("-geometry blocks-<value>\n");
 			printf("                   Geometry with blocks of size <value> regularly laid out.\n");
 			printf("\n");
 			return 1;
@@ -441,7 +445,9 @@ int main(int argc, char * argv[])
 	printf("# - floating point:       UNKNOWN (%lu b)\n", sizeof(PdfT));
 #endif
 
-#ifdef VECTOR_AVX
+#if defined(VECTOR_AVX512)
+	printf("# - intrinsics:           AVX512 (VECTOR_AVX512 defined)\n");
+#elif defined(VECTOR_AVX)
 	printf("# - intrinsics:           AVX (VECTOR_AVX defined)\n");
 #elif defined(VECTOR_SSE)
 	printf("# - intrinsics:           SSE (VECTOR_SSE defined)\n");
@@ -469,11 +475,11 @@ int main(int argc, char * argv[])
 
 	printf("# type:                   %s\n", ld.Name);
 	printf("# dimensions:             %d x %d x %d (x, y, z)\n", ld.Dims[0], ld.Dims[1], ld.Dims[2]);
-	printf("# nodes total:            %d\n", ld.nObst + ld.nFluid);
-	printf("# nodes fluid:
%d (including inlet & outlet)\n", ld.nFluid); - printf("# nodes obstacles: %d\n", ld.nObst); - printf("# nodes inlet: %d\n", ld.nInlet); - printf("# nodes outlet: %d\n", ld.nOutlet); + printf("# nodes total: % 10d\n", ld.nObst + ld.nFluid); + printf("# nodes fluid: % 10d (including inlet & outlet)\n", ld.nFluid); + printf("# nodes obstacles: % 10d\n", ld.nObst); + printf("# nodes inlet: % 10d\n", ld.nInlet); + printf("# nodes outlet: % 10d\n", ld.nOutlet); printf("# periodicity: x: %d y: %d z: %d\n", ld.PeriodicX, ld.PeriodicY, ld.PeriodicZ); #ifdef VTK_OUTPUT @@ -560,13 +566,9 @@ int main(int argc, char * argv[]) X_LIKWID_INIT(); - double timeStart = Time(); - // Call the LBM kernel kd->Kernel(&ld, kd, &cd); - double duration = Time() - timeStart; - X_LIKWID_DEINIT(); // Print some statistics... @@ -577,12 +579,32 @@ int main(int argc, char * argv[]) KernelVerifiy(&ld, kd, &cd, &errorNorm); #endif + double duration = kd->Duration; + double loopBalance = kd->LoopBalance; + double dataVolGByte = loopBalance * ld.nFluid * cd.MaxIterations / 1024. / 1024. / 1024.; + double bandwidthGBytePerS = dataVolGByte / duration; + // Deinitialize kernel by calling its own deinitialization function kf->Deinit(&ld, &kd); - double perf = (double)ld.nFluid * (double)cd.MaxIterations / duration / 1.e6; + printf("#\n"); + printf("# Evaluation Stats\n"); +#ifdef VERIFICATION + printf("# runtype: \t%s \n", "verification"); +#else + printf("# runtype: \t%s \n", "benchmark"); +#endif + printf("# runtime: \t%.3f s\n", duration); + printf("# iterations: \t%d \n", cd.MaxIterations); + printf("# fluid cells: \t%d \n", ld.nFluid); + printf("# Derived metrics\n"); + printf("# MEM data vol.: \t%.2f GByte\n", dataVolGByte); + printf("# MEM bandwidth: \t%.2f GByte/s\n", bandwidthGBytePerS); + printf("# performance: \t%.3f MFLUP/s\n", perf); + + printf("P: %f MFLUP/s t: %d d: %f s iter: %d fnodes: %f x1e6 geo: %s kernel: %s %s %s\n", perf, nThreads, duration, cd.MaxIterations, ld.nFluid / 1e6, geometryType, kernelToUse, diff --git a/src/Makefile b/src/Makefile index 52f0bf8..8eb58d4 100644 --- a/src/Makefile +++ b/src/Makefile @@ -8,6 +8,10 @@ # Viktor Haag, 2016 # LSS, University of Erlangen-Nuremberg, Germany # +# Michael Hussnaetter, 2017-2018 +# University of Erlangen-Nuremberg, Germany +# michael.hussnaetter -at- fau.de +# # This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). # # LbmBenchKernels is free software: you can redistribute it and/or modify @@ -141,6 +145,15 @@ OBJ_C = $(foreach SOURCE,$(SOURCES_C),$(OBJECT_DIR)/$(SOURCE:%.c=%.o)) \ $(OBJECT_DIR)/BenchKernelD3Q19AaVecSl_AaSoA.o \ $(OBJECT_DIR)/BenchKernelD3Q19AaVecSlCommon_AaSoA.o +# Kernels which are using AVX-512 intrinsics. 
+OBJ_C_ISA_AVX_512 = \ + $(OBJECT_DIR)/BenchKernelD3Q19ListAaPvGatherAoSoA_PushAoSoA.o \ + $(OBJECT_DIR)/BenchKernelD3Q19ListAaPvGatherAoSoACommon_PushAoSoA.o \ + $(OBJECT_DIR)/BenchKernelD3Q19ListAaPvGather_PushSoA.o \ + $(OBJECT_DIR)/BenchKernelD3Q19ListAaPvGatherCommon_PushSoA.o \ + $(OBJECT_DIR)/BenchKernelD3Q19ListAaPvGatherHybrid_PushSoA.o \ + $(OBJECT_DIR)/BenchKernelD3Q19ListAaPvGatherHybridCommon_PushSoA.o + OBJ = $(OBJ_C) include ./config.$(CONFIG).mk @@ -200,9 +213,14 @@ ifeq (on,$(LID_DRIVEN_CAVITY)) PP_FLAGS += $(D)LID_DRIVEN_CAVITY endif +ifeq (avx512,$(ISA)) + PP_FLAGS += $(D)VECTOR_AVX512 +endif + ifeq (avx,$(ISA)) PP_FLAGS += $(D)VECTOR_AVX endif + ifeq (sse,$(ISA)) PP_FLAGS += $(D)VECTOR_SSE endif @@ -215,6 +233,32 @@ ifeq (on,$(LIKWID)) LD_LIBS += $(LIKWID_LIB) -llikwid endif +ifeq (HBM,$(ADJ_LIST_MEM_TYPE)) + #MEMKIND is required for this! + MEMKIND = on + C_FLAGS += -DALLOC_ADJ_LIST_IN_HBM=1 +else + C_FLAGS += -DALLOC_ADJ_LIST_IN_HBM=0 +endif + +ifeq (HBM,$(PDF_MEM_TYPE)) + #MEMKIND is required for this! + MEMKIND = on + C_FLAGS += -DALLOC_PDF_IN_HBM=1 +else + C_FLAGS += -DALLOC_PDF_IN_HBM=0 +endif + +ifeq (on, $(MEMKIND)) + C_FLAGS += -DHAVE_MEMKIND + LD_FLAGS += -lmemkind +endif + +# Prefetch settings +SOFTWARE_PREFETCH_LOOKAHEAD_L2 ?= 0 +SOFTWARE_PREFETCH_LOOKAHEAD_L1 ?= 0 +C_FLAGS += -DSOFTWARE_PREFETCH_LOOKAHEAD_L2=$(SOFTWARE_PREFETCH_LOOKAHEAD_L2) -DSOFTWARE_PREFETCH_LOOKAHEAD_L1=$(SOFTWARE_PREFETCH_LOOKAHEAD_L1) + ifeq (dp,$(PRECISION)) PP_FLAGS += $(D)PRECISION_DP @@ -228,6 +272,14 @@ else endif endif +# Use AVX512 intrinsic kernels only when ISA=avx512 and we have double +# precision support. +ifeq (avx512,$(ISA)) + ifeq (dp,$(PRECISION)) + OBJ_C += $(OBJ_C_ISA_AVX_512) + endif +endif + # ARCH can only be assigned a string without a space. The space is escaped as # a comma which we have to replace here. @@ -298,6 +350,10 @@ $(OBJECT_DIR)/%_PushAoS.o: %.c $(REBUILD_DEPS) @$(ECHO_E) "compiling: $(COLOR_CYAN)$@$(COLOR_NO) $(COLOR_MAGENTA)DATA_LAYOUT_AOS$(COLOR_NO) $(COLOR_MAGENTA)PROP_MODEL_PUSH$(COLOR_NO)" $(CC) $(strip $(C_FLAGS)) $(strip $(PP_FLAGS)) $(D)DATA_LAYOUT_AOS $(D)PROP_MODEL_PUSH -c $< -o $@ +$(OBJECT_DIR)/%_PushAoSoA.o: %.c $(REBUILD_DEPS) + @$(ECHO_E) "compiling: $(COLOR_CYAN)$@$(COLOR_NO) $(COLOR_MAGENTA)DATA_LAYOUT_AOSOA$(COLOR_NO) $(COLOR_MAGENTA)PROP_MODEL_PUSH$(COLOR_NO)" + $(Q)$(CC) $(strip $(C_FLAGS)) $(strip $(PP_FLAGS)) $(D)DATA_LAYOUT_AOSOA $(D)PROP_MODEL_PUSH -c $< -o $@ + $(OBJECT_DIR)/%_PullSoA.o: %.c $(REBUILD_DEPS) @$(ECHO_E) "compiling: $(COLOR_CYAN)$@$(COLOR_NO) $(COLOR_MAGENTA)DATA_LAYOUT_SOA$(COLOR_NO) $(COLOR_MAGENTA)PROP_MODEL_PULL$(COLOR_NO)" $(CC) $(strip $(C_FLAGS)) $(strip $(PP_FLAGS)) $(D)DATA_LAYOUT_SOA $(D)PROP_MODEL_PULL -c $< -o $@ @@ -324,7 +380,7 @@ $(OBJECT_DIR)/%.o: %.c $(REBUILD_DEPS) # ------------------------------------------------------------------------ -# Following targets in princibal make only sure, that the directories +# Following targets in principal make only sure, that the directories # are created if the do not exist already. $(DEP_DIR)/.target: diff --git a/src/Memory.c b/src/Memory.c index e7dda39..ef4ccec 100644 --- a/src/Memory.c +++ b/src/Memory.c @@ -8,6 +8,10 @@ // Viktor Haag, 2016 // LSS, University of Erlangen-Nuremberg, Germany // +// Michael Hussnaetter, 2017-2018 +// University of Erlangen-Nuremberg, Germany +// michael.hussnaetter -at- fau.de +// // This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). 
 //
 // LbmBenchKernels is free software: you can redistribute it and/or modify
@@ -38,6 +42,10 @@
 #include
 
+#ifdef HAVE_MEMKIND
+#include <hbwmalloc.h>
+#endif
+
 #ifdef HAVE_HUGE_PAGES
 #include <sys/mman.h>	// madvise
 #endif
@@ -110,3 +118,48 @@ int MemZero(void * ptr, size_t bytesToZero)
 
 	return 0;
 }
+
+#ifdef HAVE_MEMKIND
+int HbwAlloc(void ** ptr, size_t bytesToAlloc)
+{
+	void * tmpPtr;
+
+	tmpPtr = hbw_malloc(bytesToAlloc);
+
+	if (tmpPtr == NULL) { // && bytesToAlloc != 0) {
+		Error("allocation of %lu bytes in HBM failed: %d - %s\n", bytesToAlloc, errno, strerror(errno));
+		exit(1);
+	}
+
+	*ptr = tmpPtr;
+
+	return 0;
+}
+
+int HbwAllocAligned(void ** ptr, size_t bytesToAlloc, size_t alignmentBytes)
+{
+	int ret;
+
+	ret = hbw_posix_memalign(ptr, alignmentBytes, bytesToAlloc);
+
+	if (ret) {
+		Error("allocation of %lu bytes in HBM aligned to %lu bytes failed: %d - %s\n", bytesToAlloc, alignmentBytes, errno, strerror(errno));
+		exit(1);
+	}
+
+	return 0;
+}
+
+
+int HbwFree(void ** ptr)
+{
+	Assert(*ptr != NULL);
+
+	hbw_free(*ptr);
+
+	*ptr = NULL;
+
+	return 0;
+}
+
+#endif // HAVE_MEMKIND
diff --git a/src/Memory.h b/src/Memory.h
index cfe1dfc..092736d 100644
--- a/src/Memory.h
+++ b/src/Memory.h
@@ -8,6 +8,10 @@
 //   Viktor Haag, 2016
 //   LSS, University of Erlangen-Nuremberg, Germany
 //
+//   Michael Hussnaetter, 2017-2018
+//   University of Erlangen-Nuremberg, Germany
+//   michael.hussnaetter -at- fau.de
+//
 // This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
 //
 // LbmBenchKernels is free software: you can redistribute it and/or modify
@@ -35,4 +39,10 @@ int MemFree(void ** ptr);
 
 int MemZero(void * ptr, size_t bytesToZero);
 
+#ifdef HAVE_MEMKIND
+int HbwAlloc(void ** ptr, size_t bytesToAlloc);
+int HbwAllocAligned(void ** ptr, size_t bytesToAlloc, size_t alignmentBytes);
+int HbwFree(void ** ptr);
+#endif
+
 #endif // __MEMORY_H__
diff --git a/src/Vector.h b/src/Vector.h
index af12f77..cc9b7d7 100644
--- a/src/Vector.h
+++ b/src/Vector.h
@@ -1,13 +1,17 @@
 // --------------------------------------------------------------------------
 //
 // Copyright
-//   Markus Wittmann, 2016-2017
+//   Markus Wittmann, 2016-2018
 //   RRZE, University of Erlangen-Nuremberg, Germany
 //   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
 //
 //   Viktor Haag, 2016
 //   LSS, University of Erlangen-Nuremberg, Germany
 //
+//   Michael Hussnaetter, 2017-2018
+//   University of Erlangen-Nuremberg, Germany
+//   michael.hussnaetter -at- fau.de
+//
 // This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
 //
 // LbmBenchKernels is free software: you can redistribute it and/or modify
@@ -27,13 +31,15 @@
 #ifndef __VECTOR_H__
 #define __VECTOR_H__
 
-#if !defined(VECTOR_AVX) && !defined(VECTOR_SSE)
+#if !defined(VECTOR_AVX512) && !defined(VECTOR_AVX) && !defined(VECTOR_SSE)
 	#warning Defining VECTOR_AVX as no ISA extension was selected.
 	#define VECTOR_AVX
 #endif
 
-#if defined(VECTOR_AVX) && defined(VECTOR_SSE)
-	#error Only VECTOR_AVX or VECTOR_SSE can be defined at the same time.
+#if (defined(VECTOR_AVX512) && defined(VECTOR_AVX)) || \
+    (defined(VECTOR_AVX512) && defined(VECTOR_SSE)) || \
+    (defined(VECTOR_AVX)    && defined(VECTOR_SSE))
+	#error Only VECTOR_AVX512 or VECTOR_AVX or VECTOR_SSE can be defined at the same time.
 #endif
 
 #if !defined(PRECISION_DP) && !defined(PRECISION_SP)
@@ -46,6 +52,43 @@
 
 #ifdef PRECISION_DP
 
+	#ifdef VECTOR_AVX512
+
+	#include <immintrin.h>
+	// Vector size in double-precision floating-point numbers.
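+	// (Editorial sketch) Kernels are written against this macro layer
+	// instead of raw intrinsics, e.g.:
+	//     VPDFT w = VSET(omega);
+	//     VST(&dst[i], VMUL(VLD(&src[i]), w));
+	// so the same kernel source also builds for SSE/AVX by selecting the
+	// matching macro block below.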
+	#define VSIZE	8
+
+	#define VPDFT	__m512d
+
+	#define VSET(scalar)	_mm512_set1_pd(scalar)
+	#define VSETI32(scalar)	_mm256_set1_epi32(scalar)
+
+	#define VLD(expr)	_mm512_load_pd(expr)
+	#define VLDU(expr)	_mm512_loadu_pd(expr)
+	#define VLIU(expr)	_mm256_loadu_si256((__m256i const *)expr)
+	#define VLI64(expr)	_mm512_load_epi64(expr)
+
+	#define VST(dst, src)	_mm512_store_pd(dst, src)
+	#define VSTU(dst, src)	_mm512_storeu_pd(dst, src)
+	#define VSTNT(dst, src)	_mm512_stream_pd(dst, src)
+
+	#define VG32(offsets, base, scale)	_mm512_i32gather_pd(offsets, base, scale)
+	#define VG64(offsets, base, scale)	_mm512_i64gather_pd(offsets, base, scale)
+
+	#define VPG32(offsets, base, scale, hint)	_mm512_prefetch_i32gather_pd(offsets, base, scale, hint)
+
+	#define VS32(dst_base, dst_offsets, src, scale)	_mm512_i32scatter_pd(dst_base, dst_offsets, src, scale)
+	#define VS64(dst_base, dst_offsets, src, scale)	_mm512_i64scatter_pd(dst_base, dst_offsets, src, scale)
+
+	#define VPS32(dst_base, dst_offsets, scale, hint)	_mm512_prefetch_i32scatter_pd(dst_base, dst_offsets, scale, hint)
+
+	#define VMUL(a, b)	_mm512_mul_pd(a, b)
+	#define VADD(a, b)	_mm512_add_pd(a, b)
+	#define VADDI32(a,b)	_mm256_add_epi32(a,b)
+	#define VMULI32(a,b)	_mm256_mul_epi32(a,b)
+	#define VSUB(a, b)	_mm512_sub_pd(a, b)
+	#endif
+
 	#ifdef VECTOR_AVX
 
 	#include <immintrin.h>
@@ -91,6 +134,10 @@
 
 #elif defined(PRECISION_SP)
 
+	#ifdef VECTOR_AVX512
+		#error Single precision intrinsic kernels for AVX512 are currently not implemented.
+	#endif
+
 	#ifdef VECTOR_AVX
 
 	#include <immintrin.h>
diff --git a/src/config.linux-gcc.mk b/src/config.linux-gcc.mk
index 41dea5d..9ecce40 100644
--- a/src/config.linux-gcc.mk
+++ b/src/config.linux-gcc.mk
@@ -25,6 +25,7 @@
 #
 # --------------------------------------------------------------------------
 
+# TODO: add AVX512/HBM/Memkind support for GCC
 # ------------------------------------------------------------------------
 # Compiler/linker to use.
diff --git a/src/config.linux-intel.mk b/src/config.linux-intel.mk
index 00d1ca7..88d1786 100644
--- a/src/config.linux-intel.mk
+++ b/src/config.linux-intel.mk
@@ -8,6 +8,10 @@
 #   Viktor Haag, 2016
 #   LSS, University of Erlangen-Nuremberg, Germany
 #
+#   Michael Hussnaetter, 2017-2018
+#   University of Erlangen-Nuremberg, Germany
+#   michael.hussnaetter -at- fau.de
+#
 # This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
 #
 # LbmBenchKernels is free software: you can redistribute it and/or modify
diff --git a/src/test-verification.sh b/src/test-verification.sh
index 1dc8906..8f871b5 100755
--- a/src/test-verification.sh
+++ b/src/test-verification.sh
@@ -55,15 +55,16 @@ function run_kernel
 	local K="$2"         # Kernel name
 	local T="$3"         # Number of threads
 	local KernelArgs="${4:-""}"
+	local BinaryArgs="${5:-""}"
 
 	((++TestsTotal))
 
-	echo -n "$Binary -verify -kernel $K -t $T -pin $(seq -s , 0 $((T-1))) ${KernelArgs:+"-- "}$KernelArgs   "
+	echo -n "$Binary -verify -kernel $K -t $T -pin $(seq -s , 0 $((T-1))) $BinaryArgs ${KernelArgs:+"-- "}$KernelArgs   "
 
 	if [ "$KernelArgs" == "" ]; then
-		$Binary -verify -kernel $K -t $T -pin $(seq -s , 0 $((T-1))) > "$Tmp" 2>&1
+		$Binary -verify -kernel $K -t $T -pin $(seq -s , 0 $((T-1))) $BinaryArgs > "$Tmp" 2>&1
 	else
-		$Binary -verify -kernel $K -t $T -pin $(seq -s , 0 $((T-1))) -- $KernelArgs > "$Tmp" 2>&1
+		$Binary -verify -kernel $K -t $T -pin $(seq -s , 0 $((T-1))) $BinaryArgs -- $KernelArgs > "$Tmp" 2>&1
 	fi
 
 	local ExitCode="$?"
@@ -86,6 +87,7 @@ for K in $("$Binary" -list | tail -n +7); do for T in $(seq 1 $NThreads); do run_kernel "$Binary" "$K" "$T" + # run_kernel "$Binary" "$K" "$T" "" "-dims 17x17x17" # Check in the usage string, if the kernel accepts parameters for blocking. diff --git a/src/test.sh b/src/test.sh index 121de49..aafefc8 100755 --- a/src/test.sh +++ b/src/test.sh @@ -55,14 +55,14 @@ fi Config="$1" -make clean-all - -make -j $NProc PRECISION=dp CONFIG=$Config TAG=$XTag-debug -make -j $NProc PRECISION=dp CONFIG=$Config BUILD=$Build TAG=$XTag-v VERIFICATION=on -make -j $NProc PRECISION=dp CONFIG=$Config BUILD=$Build TAG=$XTag-b BENCHMARK=on - -BinaryVDp="../bin/lbmbenchk-$Config-$Build-dp$XTag-v" -BinaryBDp="../bin/lbmbenchk-$Config-$Build-dp$XTag-b" +# make clean-all +# +# make -j $NProc PRECISION=dp CONFIG=$Config TAG=$XTag-debug +# make -j $NProc PRECISION=dp CONFIG=$Config BUILD=$Build TAG=$XTag-v VERIFICATION=on +# make -j $NProc PRECISION=dp CONFIG=$Config BUILD=$Build TAG=$XTag-b BENCHMARK=on +# +# BinaryVDp="../bin/lbmbenchk-$Config-$Build-dp$XTag-v" +# BinaryBDp="../bin/lbmbenchk-$Config-$Build-dp$XTag-b" make -j $NProc PRECISION=sp CONFIG=$Config TAG=$XTag-debug
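
Editorial appendix (not part of the patch): the AoSoA adjacency-list addressing that the gather kernels above rely on can be checked in isolation. The following standalone C sketch mirrors the ADJ_LIST index formula; VSIZE and N_D3Q19_IDX are taken from the double-precision AVX-512 build above, and the helper name adj_entry is invented here for illustration.

#include <stdio.h>
#include <assert.h>

#define VSIZE        8   // dp AVX-512 vector length, as in Vector.h above
#define N_D3Q19_IDX 18   // neighbor entries per node (the center PDF needs no addressing)

// Offset of the adjacency-list entry for node 'index' and direction 'dir',
// mirroring ADJ_LIST from the kernels above: chunks of VSIZE nodes store
// their neighbor indices direction-major inside the chunk.
static long adj_entry(long index, int dir)
{
    return (index - (index % VSIZE)) * N_D3Q19_IDX + dir * VSIZE + (index % VSIZE);
}

int main(void)
{
    // Within one chunk, the entries of a single direction are contiguous,
    // which is exactly what the consecutiveness test in
    // SetuploopStartIndices() exploits to use vector loads instead of gathers.
    for (int dir = 0; dir < N_D3Q19_IDX; ++dir) {
        for (long i = 1; i < VSIZE; ++i) {
            assert(adj_entry(i, dir) == adj_entry(i - 1, dir) + 1);
        }
    }

    // Worked example: node 11, direction 3 -> (11-3)*18 + 3*8 + 3 = 171.
    printf("adj_entry(11, 3) = %ld\n", adj_entry(11, 3));

    return 0;
}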