X-Git-Url: http://git.rrze.uni-erlangen.de/gitweb/?p=LbmBenchmarkKernelsPublic.git;a=blobdiff_plain;f=doc%2Fhtml%2Fmain.html;fp=doc%2Fhtml%2Fmain.html;h=99f4cb847eb50e75fbbfdbbf68c260645dd031ef;hp=511f6b2dcf0bfd7cd2be208fc20e3d3f83c391e3;hb=0095f461c30075a883df0265a7b831061ee7ebee;hpb=e3f82424829ebb623343ce0092238f83b4a1b8c2

diff --git a/doc/html/main.html b/doc/html/main.html
index 511f6b2..99f4cb8 100644
--- a/doc/html/main.html
+++ b/doc/html/main.html
@@ -420,42 +420,66 @@ tr:nth-child(odd) {
 <div class="contents topic" id="contents">
 <p class="topic-title first">Contents</p>
 <ul class="auto-toc simple">
-<li><a class="reference internal" href="#compilation" id="id2">1&nbsp;&nbsp;&nbsp;Compilation</a><ul class="auto-toc">
-<li><a class="reference internal" href="#debug-and-verification" id="id3">1.1&nbsp;&nbsp;&nbsp;Debug and Verification</a></li>
-<li><a class="reference internal" href="#benchmarking" id="id4">1.2&nbsp;&nbsp;&nbsp;Benchmarking</a></li>
-<li><a class="reference internal" href="#release-and-verification" id="id5">1.3&nbsp;&nbsp;&nbsp;Release and Verification</a></li>
-<li><a class="reference internal" href="#compilers" id="id6">1.4&nbsp;&nbsp;&nbsp;Compilers</a></li>
-<li><a class="reference internal" href="#cleaning" id="id7">1.5&nbsp;&nbsp;&nbsp;Cleaning</a></li>
-<li><a class="reference internal" href="#options-summary" id="id8">1.6&nbsp;&nbsp;&nbsp;Options Summary</a></li>
+<li><a class="reference internal" href="#introduction" id="id5">1&nbsp;&nbsp;&nbsp;Introduction</a></li>
+<li><a class="reference internal" href="#compilation" id="id6">2&nbsp;&nbsp;&nbsp;Compilation</a><ul class="auto-toc">
+<li><a class="reference internal" href="#debug-and-verification" id="id7">2.1&nbsp;&nbsp;&nbsp;Debug and Verification</a></li>
+<li><a class="reference internal" href="#release-and-verification" id="id8">2.2&nbsp;&nbsp;&nbsp;Release and Verification</a></li>
+<li><a class="reference internal" href="#benchmarking" id="id9">2.3&nbsp;&nbsp;&nbsp;Benchmarking</a></li>
+<li><a class="reference internal" href="#compilers" id="id10">2.4&nbsp;&nbsp;&nbsp;Compilers</a></li>
+<li><a class="reference internal" href="#cleaning" id="id11">2.5&nbsp;&nbsp;&nbsp;Cleaning</a></li>
+<li><a class="reference internal" href="#options-summary" id="id12">2.6&nbsp;&nbsp;&nbsp;Options Summary</a></li>
 </ul>
 </li>
-<li><a class="reference internal" href="#invocation" id="id9">2&nbsp;&nbsp;&nbsp;Invocation</a><ul class="auto-toc">
-<li><a class="reference internal" href="#command-line-parameters" id="id10">2.1&nbsp;&nbsp;&nbsp;Command Line Parameters</a></li>
-<li><a class="reference internal" href="#kernels" id="id11">2.2&nbsp;&nbsp;&nbsp;Kernels</a></li>
+<li><a class="reference internal" href="#invocation" id="id13">3&nbsp;&nbsp;&nbsp;Invocation</a><ul class="auto-toc">
+<li><a class="reference internal" href="#command-line-parameters" id="id14">3.1&nbsp;&nbsp;&nbsp;Command Line Parameters</a></li>
+<li><a class="reference internal" href="#kernels" id="id15">3.2&nbsp;&nbsp;&nbsp;Kernels</a></li>
 </ul>
 </li>
-<li><a class="reference internal" href="#id1" id="id12">3&nbsp;&nbsp;&nbsp;Benchmarking</a><ul class="auto-toc">
-<li><a class="reference internal" href="#padding" id="id13">3.1&nbsp;&nbsp;&nbsp;Padding</a></li>
+<li><a class="reference internal" href="#id2" id="id16">4&nbsp;&nbsp;&nbsp;Benchmarking</a><ul class="auto-toc">
+<li><a class="reference internal" href="#intel-compiler" id="id17">4.1&nbsp;&nbsp;&nbsp;Intel Compiler</a></li>
+<li><a class="reference internal" href="#pinning" id="id18">4.2&nbsp;&nbsp;&nbsp;Pinning</a></li>
+<li><a class="reference internal" href="#general-remarks" id="id19">4.3&nbsp;&nbsp;&nbsp;General Remarks</a></li>
+<li><a class="reference internal" href="#padding" id="id20">4.4&nbsp;&nbsp;&nbsp;Padding</a></li>
 </ul>
 </li>
-<li><a class="reference internal" href="#geometries" id="id14">4&nbsp;&nbsp;&nbsp;Geometries</a></li>
-<li><a class="reference internal" href="#results" id="id15">5&nbsp;&nbsp;&nbsp;Results</a></li>
-<li><a class="reference internal" href="#licence" id="id16">6&nbsp;&nbsp;&nbsp;Licence</a></li>
-<li><a class="reference internal" href="#acknowledgements" id="id17">7&nbsp;&nbsp;&nbsp;Acknowledgements</a></li>
+<li><a class="reference internal" href="#geometries" id="id21">5&nbsp;&nbsp;&nbsp;Geometries</a></li>
+<li><a class="reference internal" href="#performance-results" id="id22">6&nbsp;&nbsp;&nbsp;Performance Results</a><ul class="auto-toc">
+<li><a class="reference internal" href="#haswell-intel-xeon-e5-2695-v3" id="id23">6.1&nbsp;&nbsp;&nbsp;Haswell, Intel Xeon E5-2695 v3</a></li>
+<li><a class="reference internal" href="#broadwell-intel-xeon-e5-2630-v4" id="id24">6.2&nbsp;&nbsp;&nbsp;Broadwell, Intel Xeon E5-2630 v4</a></li>
+<li><a class="reference internal" href="#skylake-intel-xeon-gold-6148" id="id25">6.3&nbsp;&nbsp;&nbsp;Skylake, Intel Xeon Gold 6148</a></li>
+</ul>
+</li>
+<li><a class="reference internal" href="#licence" id="id26">7&nbsp;&nbsp;&nbsp;Licence</a></li>
+<li><a class="reference internal" href="#acknowledgements" id="id27">8&nbsp;&nbsp;&nbsp;Acknowledgements</a></li>
+<li><a class="reference internal" href="#bibliography" id="id28">9&nbsp;&nbsp;&nbsp;Bibliography</a></li>
 </ul>
 </div>
+<div class="section" id="introduction">
+<h1><a class="toc-backref" href="#id5">1&nbsp;&nbsp;&nbsp;Introduction</a></h1>
+<p>The lattice Boltzmann (LBM) benchmark kernels are a collection of LBM kernel
+implementations.</p>
+<p><strong>AS SUCH THE LBM BENCHMARK KERNELS ARE NOT A FULLY EQUIPPED CFD SOLVER AND SOLELY
+SERVE THE PURPOSE OF STUDYING POSSIBLE PERFORMANCE OPTIMIZATIONS AND/OR
+EXPERIMENTS.</strong></p>
+<p>Currently all kernels utilize a D3Q19 discretization and the
+two-relaxation-time (TRT) collision operator <a class="citation-reference" href="#ginzburg-2008" id="id1">[ginzburg-2008]</a>.
+All operations are carried out in double precision arithmetic.</p>
+</div>
 <div class="section" id="compilation">
-<h1><a class="toc-backref" href="#id2">1&nbsp;&nbsp;&nbsp;Compilation</a></h1>
+<h1><a class="toc-backref" href="#id6">2&nbsp;&nbsp;&nbsp;Compilation</a></h1>
 <p>The benchmark framework currently supports only Linux systems and the GCC and
 Intel compilers. Every other configuration probably requires adjustment inside
-the code and the makefiles. Further some code might be platform or at least
+the code and the makefiles. Furthermore some code might be platform or at least
 POSIX specific.</p>
 <p>The benchmark can be build via <tt class="docutils literal">make</tt> from the <tt class="docutils literal">src</tt> subdirectory. This will
 generate one binary which hosts all implemented benchmark kernels.</p>
 <p>Binaries are located under the <tt class="docutils literal">bin</tt> subdirectory and will have different names
 depending on compiler and build configuration.</p>
+<p>Compilation can target debug or release builds. Combined with both build types
+verification can be enabled, which increases the runtime and hence is not
+suited for benchmarking.</p>
 <div class="section" id="debug-and-verification">
-<h2><a class="toc-backref" href="#id3">1.1&nbsp;&nbsp;&nbsp;Debug and Verification</a></h2>
+<h2><a class="toc-backref" href="#id7">2.1&nbsp;&nbsp;&nbsp;Debug and Verification</a></h2>
 <pre class="literal-block">
 make BUILD=debug BENCHMARK=off
 </pre>
@@ -470,32 +494,34 @@ binary will be found in the <tt class="docutils literal">bin</tt> subdirectory a
 <p>Please note that the generated binary will therefore
 exhibit a poor performance.</p>
 </div>
+<div class="section" id="release-and-verification">
+<h2><a class="toc-backref" href="#id8">2.2&nbsp;&nbsp;&nbsp;Release and Verification</a></h2>
+<p>Verification with the debug builds can be extremely slow. Hence verification
+capabilities can be built with release builds:</p>
+<pre class="literal-block">
+make BENCHMARK=off
+</pre>
+</div>
 <div class="section" id="benchmarking">
-<h2><a class="toc-backref" href="#id4">1.2&nbsp;&nbsp;&nbsp;Benchmarking</a></h2>
+<h2><a class="toc-backref" href="#id9">2.3&nbsp;&nbsp;&nbsp;Benchmarking</a></h2>
 <p>To generate a binary for benchmarking run make with</p>
 <pre class="literal-block">
 make
 </pre>
 <p>As default <tt class="docutils literal">BENCHMARK=on</tt> and <tt class="docutils literal">BUILD=release</tt> is set, where
-BUILD=release turns optimizations on and <tt class="docutils literal">BENCHMARK=on</tt> disables
+<tt class="docutils literal">BUILD=release</tt> turns optimizations on and <tt class="docutils literal">BENCHMARK=on</tt> disables
 verfification, statistics, and VTK output.</p>
-</div>
-<div class="section" id="release-and-verification">
-<h2><a class="toc-backref" href="#id5">1.3&nbsp;&nbsp;&nbsp;Release and Verification</a></h2>
-<p>Verification with the debug builds can be extremely slow. Hence verification
-capabilities can be build with release builds:</p>
-<pre class="literal-block">
-make BENCHMARK=off
-</pre>
+<p>See Options Summary below for a further description of the options which can be
+applied, e.g. TARCH, as well as the Benchmarking section.</p>
 </div>
 <div class="section" id="compilers">
-<h2><a class="toc-backref" href="#id6">1.4&nbsp;&nbsp;&nbsp;Compilers</a></h2>
+<h2><a class="toc-backref" href="#id10">2.4&nbsp;&nbsp;&nbsp;Compilers</a></h2>
 <p>Currently only the GCC and Intel compiler under Linux are supported. Between
 both configuration can be chosen via <tt class="docutils literal"><span class="pre">CONFIG=linux-gcc</span></tt> or
 <tt class="docutils literal"><span class="pre">CONFIG=linux-intel</span></tt>.</p>
 </div>
 <div class="section" id="cleaning">
-<h2><a class="toc-backref" href="#id7">1.5&nbsp;&nbsp;&nbsp;Cleaning</a></h2>
+<h2><a class="toc-backref" href="#id11">2.5&nbsp;&nbsp;&nbsp;Cleaning</a></h2>
 <p>For each configuration and build (debug/release) a subdirectory under the
 <tt class="docutils literal">src/obj</tt> directory is created where the dependency and object files are
 stored.
@@ -510,21 +536,23 @@ make clean-all
 <p>all object and dependency files are deleted.</p>
 </div>
 <div class="section" id="options-summary">
-<h2><a class="toc-backref" href="#id8">1.6&nbsp;&nbsp;&nbsp;Options Summary</a></h2>
-<p>Options that can be specified when building the framework with make:</p>
+<h2><a class="toc-backref" href="#id12">2.6&nbsp;&nbsp;&nbsp;Options Summary</a></h2>
+<p>Options that can be specified when building the suite with make:</p>
 <table border="1" class="docutils">
 <colgroup>
-<col width="8%" />
-<col width="13%" />
 <col width="7%" />
-<col width="72%" />
+<col width="12%" />
+<col width="6%" />
+<col width="75%" />
 </colgroup>
-<tbody valign="top">
-<tr><td>name</td>
-<td>values</td>
-<td>default</td>
-<td>description</td>
+<thead valign="bottom">
+<tr><th class="head">name</th>
+<th class="head">values</th>
+<th class="head">default</th>
+<th class="head">description</th>
 </tr>
+</thead>
+<tbody valign="top">
 <tr><td>BENCHMARK</td>
 <td>on, off</td>
 <td>on</td>
@@ -533,7 +561,7 @@ make clean-all
 <tr><td>BUILD</td>
 <td>debug, release</td>
 <td>release</td>
-<td>No optimization, debug symbols, DEBUG defined.</td>
+<td>debug: no optimization, debug symbols, DEBUG defined. release: optimizations enabled.</td>
 </tr>
 <tr><td>CONFIG</td>
 <td>linux-gcc, linux-intel</td>
@@ -543,7 +571,7 @@ make clean-all
 <tr><td>ISA</td>
 <td>avx, sse</td>
 <td>avx</td>
-<td>Determines which ISA extension is used for macro definitions. This is <em>not</em> the architecture the compiler generates code for.</td>
+<td>Determines which ISA extension is used for macro definitions of the intrinsics. This is <em>not</em> the architecture the compiler generates code for.</td>
 </tr>
 <tr><td>OPENMP</td>
 <td>on, off</td>
@@ -575,7 +603,7 @@ make clean-all
 </div>
 </div>
 <div class="section" id="invocation">
-<h1><a class="toc-backref" href="#id9">2&nbsp;&nbsp;&nbsp;Invocation</a></h1>
+<h1><a class="toc-backref" href="#id13">3&nbsp;&nbsp;&nbsp;Invocation</a></h1>
 <p>Running the binary will print among the GPL licence header a line like the following:</p>
 <pre class="literal-block">
 LBM Benchmark Kernels 0.1, compiled Jul  5 2017 21:59:22, type: verification
@@ -586,7 +614,7 @@ LBM Benchmark Kernels 0.1, compiled Jul  5 2017 21:59:22, type: benchmark
 </pre>
 <p>if verfication was disabled during compilation.</p>
 <div class="section" id="command-line-parameters">
-<h2><a class="toc-backref" href="#id10">2.1&nbsp;&nbsp;&nbsp;Command Line Parameters</a></h2>
+<h2><a class="toc-backref" href="#id14">3.1&nbsp;&nbsp;&nbsp;Command Line Parameters</a></h2>
 <p>Running the binary with <tt class="docutils literal"><span class="pre">-h</span></tt> list all available parameters:</p>
 <pre class="literal-block">
 Usage:
@@ -613,7 +641,7 @@ iterations, etc, which can afterward be override, e.g.:</p>
 <pre class="literal-block">
 $ bin/lbmbenchk-linux-intel-release -verfiy -dims 32x32x32
 </pre>
-<p>Kernel specific parameters can be opatained via selecting the specific kernel
+<p>Kernel specific parameters can be obtained via selecting the specific kernel
 and passing <tt class="docutils literal"><span class="pre">-h</span></tt> as parameter:</p>
 <pre class="literal-block">
 $ bin/lbmbenchk-linux-intel-release -kernel kernel-name -- -h
@@ -651,7 +679,7 @@ Available kernels to benchmark:
 </pre>
 </div>
 <div class="section" id="kernels">
-<h2><a class="toc-backref" href="#id11">2.2&nbsp;&nbsp;&nbsp;Kernels</a></h2>
+<h2><a class="toc-backref" href="#id15">3.2&nbsp;&nbsp;&nbsp;Kernels</a></h2>
 <p>The following list shortly describes available kernels:</p>
 <ul class="simple">
 <li>push-soa/push-aos/pull-soa/pull-aos:
@@ -862,8 +890,8 @@ during each run.</p>
 </table>
 </div>
 </div>
-<div class="section" id="id1">
-<h1><a class="toc-backref" href="#id12">3&nbsp;&nbsp;&nbsp;Benchmarking</a></h1>
+<div class="section" id="id2">
+<h1><a class="toc-backref" href="#id16">4&nbsp;&nbsp;&nbsp;Benchmarking</a></h1>
 <p>Correct benchmarking is a nontrivial task. Whenever benchmark results should be
 created make sure the binary was compiled with:</p>
 <ul class="simple">
@@ -872,12 +900,43 @@ created make sure the binary was compiled with:</p>
 <li>the correct ISA for macros is used, selected via <tt class="docutils literal">ISA</tt> and</li>
 <li>use <tt class="docutils literal">TARCH</tt> to specify the architecture the compiler generates code for.</li>
 </ul>
+<div class="section" id="intel-compiler">
+<h2><a class="toc-backref" href="#id17">4.1&nbsp;&nbsp;&nbsp;Intel Compiler</a></h2>
+<p>For the Intel compiler one can specify depending on the target ISA extension:</p>
+<ul class="simple">
+<li>AVX:          <tt class="docutils literal"><span class="pre">TARCH=-xAVX</span></tt></li>
+<li>AVX2 and FMA: <tt class="docutils literal"><span class="pre">TARCH=-xCORE-AVX2,-fma</span></tt></li>
+<li>AVX512:       <tt class="docutils literal"><span class="pre">TARCH=-xCORE-AVX512</span></tt></li>
+<li>KNL:          <tt class="docutils literal"><span class="pre">TARCH=-xMIC-AVX512</span></tt></li>
+</ul>
+<p>Compiling for an architecture supporting AVX (Sandy Bridge, Ivy Bridge):</p>
+<pre class="literal-block">
+make ISA=avx TARCH=-xAVX
+</pre>
+<p>Compiling for an architecture supporting AVX2 (Haswell, Broadwell):</p>
+<pre class="literal-block">
+make ISA=avx TARCH=-xCORE-AVX2,-fma
+</pre>
+<p>WARNING: ISA is here still set to <tt class="docutils literal">avx</tt> as the FMA intrinsics are currently not
+implemented. This might change in the future.</p>
+<p>Compiling for an architecture supporting AVX-512 (Skylake):</p>
+<pre class="literal-block">
+make ISA=avx TARCH=-xCORE-AVX512
+</pre>
+<p>WARNING: ISA is here still set to <tt class="docutils literal">avx</tt> as currently we have no implementation for the
+AVX512 intrinsics. This might change in the future.</p>
+</div>
+<div class="section" id="pinning">
+<h2><a class="toc-backref" href="#id18">4.2&nbsp;&nbsp;&nbsp;Pinning</a></h2>
 <p>During benchmarking pinning should be used via the <tt class="docutils literal"><span class="pre">-pin</span></tt> parameter. Running
-a benchmark with 10 threads an pin them to the first 10 cores works like</p>
+a benchmark with 10 threads and pinning them to the first 10 cores works like</p>
 <pre class="literal-block">
 $ bin/lbmbenchk-linux-intel-release ... -t 10 -pin $(seq -s , 0 9)
 </pre>
-<p>Things the binary does nor check or controll:</p>
+</div>
+<div class="section" id="general-remarks">
+<h2><a class="toc-backref" href="#id19">4.3&nbsp;&nbsp;&nbsp;General Remarks</a></h2>
+<p>Things the binary does not check or control:</p>
 <ul class="simple">
 <li>transparent huge pages: when allocating memory small 4 KiB pages might be
 replaced with larger ones. This is in general a good thing, but if this is
@@ -895,7 +954,7 @@ means the memory will be placed at the NUMA domain the touching core is
 associated with. If a different policy is in place or the NUMA domain to be
 used is already full memory might be allocated in a remote domain. Accesses
 to remote domains typically have a higher latency and lower bandwidth.</li>
-<li>System load: interference with other application, espcially on desktop
+<li>System load: interference with other applications, especially on desktop
 systems should be avoided.</li>
 <li>Padding: For SoA based kernels the number of (fluid) nodes is automatically
 adjusted so that no cache or TLB thrashing should occur. The parameters are
@@ -905,8 +964,9 @@ padding section.</li>
 function for different ISA extensions. Make sure the code you might think is
 executed is actually the code which is executed.</li>
 </ul>
+</div>
 <div class="section" id="padding">
-<h2><a class="toc-backref" href="#id13">3.1&nbsp;&nbsp;&nbsp;Padding</a></h2>
+<h2><a class="toc-backref" href="#id20">4.4&nbsp;&nbsp;&nbsp;Padding</a></h2>
 <p>With correct padding cache and TLB thrashing can be avoided. Therefore the
 number of (fluid) nodes used in the data layout is artificially increased.</p>
 <p>Currently automatic padding is active for kernels which support it. It can be
@@ -938,22 +998,912 @@ like <tt class="docutils literal"><span class="pre">-pad</span> 0+16</tt> would
 </div>
 </div>
 <div class="section" id="geometries">
-<h1><a class="toc-backref" href="#id14">4&nbsp;&nbsp;&nbsp;Geometries</a></h1>
-<p>TODO: supported geometries: channel, pipe, blocks</p>
+<h1><a class="toc-backref" href="#id21">5&nbsp;&nbsp;&nbsp;Geometries</a></h1>
+<p>TODO: supported geometries: channel, pipe, blocks, fluid</p>
+</div>
+<div class="section" id="performance-results">
+<h1><a class="toc-backref" href="#id22">6&nbsp;&nbsp;&nbsp;Performance Results</a></h1>
+<p>This section lists performance values measured on several machines for
+different kernels and geometries.
+The <strong>RFM</strong> column denotes the expected performance as predicted by the
+Roofline performance model <a class="citation-reference" href="#williams-2008" id="id3">[williams-2008]</a>.
+For performance prediction of each kernel a memory bandwidth benchmark is used
+which mimics the kernel's memory access pattern and the kernel's loop balance
+(see <a class="citation-reference" href="#kernels" id="id4">[kernels]</a> for details).</p>
+<div class="section" id="haswell-intel-xeon-e5-2695-v3">
+<h2><a class="toc-backref" href="#id23">6.1&nbsp;&nbsp;&nbsp;Haswell, Intel Xeon E5-2695 v3</a></h2>
+<ul class="simple">
+<li>Haswell architecture, AVX2, FMA</li>
+<li>14 cores, 2.3 GHz</li>
+<li>2 x 7 cores in cluster-on-die (CoD) mode enabled</li>
+<li>SMT enabled</li>
+</ul>
+<p>memory bandwidth:</p>
+<ul class="simple">
+<li>copy-19              47.3 GB/s</li>
+<li>copy-19-nt-sl        47.1 GB/s</li>
+<li>update-19            44.0 GB/s</li>
+</ul>
+<p>geometry dimensions:  500x100x100</p>
+<table border="1" class="docutils">
+<colgroup>
+<col width="19%" />
+<col width="7%" />
+<col width="7%" />
+<col width="7%" />
+<col width="7%" />
+<col width="7%" />
+<col width="7%" />
+<col width="7%" />
+<col width="7%" />
+<col width="7%" />
+<col width="7%" />
+<col width="7%" />
+<col width="4%" />
+</colgroup>
+<thead valign="bottom">
+<tr><th class="head">kernel</th>
+<th class="head">pipe</th>
+<th class="head">blocks-2</th>
+<th class="head">blocks-4</th>
+<th class="head">blocks-6</th>
+<th class="head">blocks-8</th>
+<th class="head">blocks-10</th>
+<th class="head">blocks-15</th>
+<th class="head">blocks-16</th>
+<th class="head">blocks-20</th>
+<th class="head">blocks-25</th>
+<th class="head">blocks-32</th>
+<th class="head">RFM</th>
+</tr>
+</thead>
+<tbody valign="top">
+<tr><td>blk-push-aos</td>
+<td>58.82</td>
+<td>49.85</td>
+<td>57.34</td>
+<td>59.90</td>
+<td>61.37</td>
+<td>62.17</td>
+<td>65.30</td>
+<td>64.00</td>
+<td>67.54</td>
+<td>64.46</td>
+<td>69.69</td>
+<td>104</td>
+</tr>
+<tr><td>blk-push-soa</td>
+<td>32.32</td>
+<td>33.46</td>
+<td>34.02</td>
+<td>34.64</td>
+<td>35.06</td>
+<td>35.04</td>
+<td>36.31</td>
+<td>35.44</td>
+<td>37.20</td>
+<td>35.14</td>
+<td>37.95</td>
+<td>104</td>
+</tr>
+<tr><td>blk-pull-aos</td>
+<td>56.97</td>
+<td>51.41</td>
+<td>56.09</td>
+<td>57.92</td>
+<td>59.98</td>
+<td>59.83</td>
+<td>63.37</td>
+<td>61.55</td>
+<td>65.50</td>
+<td>63.11</td>
+<td>67.02</td>
+<td>104</td>
+</tr>
+<tr><td>blk-pull-soa</td>
+<td>49.29</td>
+<td>46.23</td>
+<td>47.50</td>
+<td>51.97</td>
+<td>51.27</td>
+<td>49.52</td>
+<td>55.23</td>
+<td>53.13</td>
+<td>54.50</td>
+<td>49.79</td>
+<td>57.90</td>
+<td>104</td>
+</tr>
+<tr><td>aa-aos</td>
+<td>91.35</td>
+<td>66.14</td>
+<td>76.80</td>
+<td>84.76</td>
+<td>83.63</td>
+<td>91.36</td>
+<td>93.46</td>
+<td>92.62</td>
+<td>93.91</td>
+<td>92.25</td>
+<td>92.93</td>
+<td>145</td>
+</tr>
+<tr><td>aa-soa</td>
+<td>75.51</td>
+<td>65.68</td>
+<td>70.94</td>
+<td>71.36</td>
+<td>73.83</td>
+<td>75.46</td>
+<td>74.84</td>
+<td>79.48</td>
+<td>83.28</td>
+<td>77.70</td>
+<td>82.72</td>
+<td>145</td>
+</tr>
+<tr><td>aa-vec-soa</td>
+<td>93.85</td>
+<td>83.44</td>
+<td>91.58</td>
+<td>93.96</td>
+<td>94.35</td>
+<td>96.62</td>
+<td>101.76</td>
+<td>96.72</td>
+<td>106.37</td>
+<td>102.60</td>
+<td>110.28</td>
+<td>145</td>
+</tr>
+<tr><td>list-push-aos</td>
+<td>80.29</td>
+<td>80.97</td>
+<td>80.95</td>
+<td>81.10</td>
+<td>81.37</td>
+<td>82.44</td>
+<td>81.77</td>
+<td>81.49</td>
+<td>80.72</td>
+<td>81.93</td>
+<td>80.93</td>
+<td>83</td>
+</tr>
+<tr><td>list-push-soa</td>
+<td>47.52</td>
+<td>42.65</td>
+<td>45.28</td>
+<td>46.64</td>
+<td>43.46</td>
+<td>40.59</td>
+<td>44.94</td>
+<td>46.55</td>
+<td>41.53</td>
+<td>45.98</td>
+<td>44.86</td>
+<td>83</td>
+</tr>
+<tr><td>list-pull-aos</td>
+<td>85.30</td>
+<td>82.97</td>
+<td>86.43</td>
+<td>83.42</td>
+<td>86.33</td>
+<td>83.70</td>
+<td>86.43</td>
+<td>83.77</td>
+<td>83.10</td>
+<td>85.89</td>
+<td>84.44</td>
+<td>83</td>
+</tr>
+<tr><td>list-pull-soa</td>
+<td>62.12</td>
+<td>63.61</td>
+<td>63.28</td>
+<td>61.32</td>
+<td>66.72</td>
+<td>62.65</td>
+<td>64.82</td>
+<td>60.49</td>
+<td>58.01</td>
+<td>64.46</td>
+<td>62.52</td>
+<td>83</td>
+</tr>
+<tr><td>list-pull-split-nt-1s-soa</td>
+<td>121.35</td>
+<td>113.77</td>
+<td>115.29</td>
+<td>113.54</td>
+<td>117.00</td>
+<td>116.46</td>
+<td>114.78</td>
+<td>114.54</td>
+<td>110.83</td>
+<td>112.67</td>
+<td>117.85</td>
+<td>125</td>
+</tr>
+<tr><td>list-pull-split-nt-2s-soa</td>
+<td>118.09</td>
+<td>110.48</td>
+<td>112.55</td>
+<td>113.18</td>
+<td>113.44</td>
+<td>111.85</td>
+<td>109.27</td>
+<td>114.41</td>
+<td>110.28</td>
+<td>111.78</td>
+<td>113.74</td>
+<td>125</td>
+</tr>
+<tr><td>list-aa-aos</td>
+<td>121.28</td>
+<td>118.63</td>
+<td>119.00</td>
+<td>118.50</td>
+<td>121.99</td>
+<td>119.11</td>
+<td>118.83</td>
+<td>121.47</td>
+<td>121.62</td>
+<td>126.18</td>
+<td>120.12</td>
+<td>129</td>
+</tr>
+<tr><td>list-aa-soa</td>
+<td>126.34</td>
+<td>116.90</td>
+<td>129.45</td>
+<td>127.12</td>
+<td>129.41</td>
+<td>121.42</td>
+<td>126.19</td>
+<td>126.76</td>
+<td>126.70</td>
+<td>124.40</td>
+<td>125.22</td>
+<td>129</td>
+</tr>
+<tr><td>list-aa-ria-soa</td>
+<td>133.68</td>
+<td>121.82</td>
+<td>126.04</td>
+<td>128.46</td>
+<td>131.15</td>
+<td>132.25</td>
+<td>128.78</td>
+<td>133.50</td>
+<td>126.69</td>
+<td>124.40</td>
+<td>130.37</td>
+<td>145</td>
+</tr>
+<tr><td>list-aa-pv-soa</td>
+<td>146.22</td>
+<td>124.39</td>
+<td>130.73</td>
+<td>136.29</td>
+<td>137.61</td>
+<td>131.21</td>
+<td>138.65</td>
+<td>138.78</td>
+<td>127.02</td>
+<td>132.40</td>
+<td>138.37</td>
+<td>145</td>
+</tr>
+</tbody>
+</table>
+</div>
+<div class="section" id="broadwell-intel-xeon-e5-2630-v4">
+<h2><a class="toc-backref" href="#id24">6.2&nbsp;&nbsp;&nbsp;Broadwell, Intel Xeon E5-2630 v4</a></h2>
+<ul class="simple">
+<li>Broadwell architecture, AVX2, FMA</li>
+<li>10 cores, 2.2 GHz</li>
+<li>SMT disabled</li>
+</ul>
+<p>memory bandwidth:</p>
+<ul class="simple">
+<li>copy-19              48.0 GB/s</li>
+<li>copy-19-nt-sl        48.2 GB/s</li>
+<li>update-19            51.1 GB/s</li>
+</ul>
+<p>geometry dimensions:  500x100x100</p>
+<table border="1" class="docutils">
+<colgroup>
+<col width="19%" />
+<col width="7%" />
+<col width="7%" />
+<col width="7%" />
+<col width="7%" />
+<col width="7%" />
+<col width="7%" />
+<col width="7%" />
+<col width="7%" />
+<col width="7%" />
+<col width="7%" />
+<col width="7%" />
+<col width="5%" />
+</colgroup>
+<thead valign="bottom">
+<tr><th class="head">kernel</th>
+<th class="head">pipe</th>
+<th class="head">blocks-2</th>
+<th class="head">blocks-4</th>
+<th class="head">blocks-6</th>
+<th class="head">blocks-8</th>
+<th class="head">blocks-10</th>
+<th class="head">blocks-15</th>
+<th class="head">blocks-16</th>
+<th class="head">blocks-20</th>
+<th class="head">blocks-25</th>
+<th class="head">blocks-32</th>
+<th class="head">RFM</th>
+</tr>
+</thead>
+<tbody valign="top">
+<tr><td>blk-push-aos</td>
+<td>55.75</td>
+<td>47.62</td>
+<td>54.57</td>
+<td>57.10</td>
+<td>58.49</td>
+<td>59.00</td>
+<td>61.72</td>
+<td>60.56</td>
+<td>64.05</td>
+<td>61.10</td>
+<td>66.03</td>
+<td>105</td>
+</tr>
+<tr><td>blk-push-soa</td>
+<td>30.06</td>
+<td>31.09</td>
+<td>32.13</td>
+<td>32.54</td>
+<td>32.74</td>
+<td>32.72</td>
+<td>33.81</td>
+<td>33.19</td>
+<td>34.90</td>
+<td>33.21</td>
+<td>35.75</td>
+<td>105</td>
+</tr>
+<tr><td>blk-pull-aos</td>
+<td>53.80</td>
+<td>48.61</td>
+<td>53.08</td>
+<td>54.99</td>
+<td>56.08</td>
+<td>56.68</td>
+<td>59.20</td>
+<td>58.12</td>
+<td>61.49</td>
+<td>58.71</td>
+<td>63.45</td>
+<td>105</td>
+</tr>
+<tr><td>blk-pull-soa</td>
+<td>46.96</td>
+<td>46.61</td>
+<td>48.84</td>
+<td>49.70</td>
+<td>50.33</td>
+<td>50.46</td>
+<td>52.36</td>
+<td>51.39</td>
+<td>54.20</td>
+<td>51.61</td>
+<td>55.71</td>
+<td>105</td>
+</tr>
+<tr><td>aa-aos</td>
+<td>91.40</td>
+<td>66.99</td>
+<td>78.47</td>
+<td>83.38</td>
+<td>86.62</td>
+<td>88.62</td>
+<td>92.98</td>
+<td>91.54</td>
+<td>97.08</td>
+<td>94.93</td>
+<td>98.90</td>
+<td>168</td>
+</tr>
+<tr><td>aa-soa</td>
+<td>83.01</td>
+<td>69.96</td>
+<td>75.85</td>
+<td>77.72</td>
+<td>79.01</td>
+<td>79.29</td>
+<td>82.38</td>
+<td>80.11</td>
+<td>85.70</td>
+<td>83.91</td>
+<td>87.69</td>
+<td>168</td>
+</tr>
+<tr><td>aa-vec-soa</td>
+<td>112.03</td>
+<td>96.52</td>
+<td>105.32</td>
+<td>109.76</td>
+<td>112.55</td>
+<td>113.82</td>
+<td>120.55</td>
+<td>118.37</td>
+<td>126.30</td>
+<td>121.37</td>
+<td>131.94</td>
+<td>168</td>
+</tr>
+<tr><td>list-push-aos</td>
+<td>75.13</td>
+<td>74.18</td>
+<td>75.20</td>
+<td>75.42</td>
+<td>75.24</td>
+<td>75.99</td>
+<td>75.80</td>
+<td>75.80</td>
+<td>75.54</td>
+<td>76.22</td>
+<td>76.21</td>
+<td>97</td>
+</tr>
+<tr><td>list-push-soa</td>
+<td>40.99</td>
+<td>38.14</td>
+<td>39.00</td>
+<td>38.89</td>
+<td>38.89</td>
+<td>39.67</td>
+<td>39.87</td>
+<td>39.28</td>
+<td>39.35</td>
+<td>40.08</td>
+<td>40.13</td>
+<td>97</td>
+</tr>
+<tr><td>list-pull-aos</td>
+<td>82.07</td>
+<td>82.88</td>
+<td>83.29</td>
+<td>83.09</td>
+<td>83.32</td>
+<td>83.49</td>
+<td>82.82</td>
+<td>82.88</td>
+<td>83.32</td>
+<td>82.60</td>
+<td>82.93</td>
+<td>97</td>
+</tr>
+<tr><td>list-pull-soa</td>
+<td>62.07</td>
+<td>60.40</td>
+<td>61.89</td>
+<td>61.39</td>
+<td>62.43</td>
+<td>60.90</td>
+<td>60.48</td>
+<td>62.80</td>
+<td>62.50</td>
+<td>61.10</td>
+<td>60.38</td>
+<td>97</td>
+</tr>
+<tr><td>list-pull-split-nt-1s-soa</td>
+<td>125.81</td>
+<td>120.60</td>
+<td>121.96</td>
+<td>122.34</td>
+<td>122.86</td>
+<td>123.53</td>
+<td>123.64</td>
+<td>123.67</td>
+<td>125.94</td>
+<td>124.09</td>
+<td>123.69</td>
+<td>128</td>
+</tr>
+<tr><td>list-pull-split-nt-2s-soa</td>
+<td>122.79</td>
+<td>117.16</td>
+<td>118.86</td>
+<td>119.16</td>
+<td>119.56</td>
+<td>119.99</td>
+<td>120.01</td>
+<td>120.03</td>
+<td>122.64</td>
+<td>120.57</td>
+<td>120.39</td>
+<td>128</td>
+</tr>
+<tr><td>list-aa-aos</td>
+<td>128.13</td>
+<td>127.41</td>
+<td>129.31</td>
+<td>129.07</td>
+<td>129.79</td>
+<td>129.63</td>
+<td>129.67</td>
+<td>129.94</td>
+<td>129.12</td>
+<td>128.41</td>
+<td>129.72</td>
+<td>150</td>
+</tr>
+<tr><td>list-aa-soa</td>
+<td>141.60</td>
+<td>139.78</td>
+<td>141.58</td>
+<td>142.16</td>
+<td>141.94</td>
+<td>141.31</td>
+<td>142.37</td>
+<td>142.25</td>
+<td>142.43</td>
+<td>141.40</td>
+<td>142.26</td>
+<td>150</td>
+</tr>
+<tr><td>list-aa-ria-soa</td>
+<td>141.82</td>
+<td>134.88</td>
+<td>140.15</td>
+<td>140.72</td>
+<td>141.67</td>
+<td>140.51</td>
+<td>141.18</td>
+<td>141.29</td>
+<td>142.97</td>
+<td>141.94</td>
+<td>143.25</td>
+<td>168</td>
+</tr>
+<tr><td>list-aa-pv-soa</td>
+<td>164.79</td>
+<td>140.95</td>
+<td>159.24</td>
+<td>161.78</td>
+<td>162.40</td>
+<td>163.04</td>
+<td>164.69</td>
+<td>164.38</td>
+<td>165.11</td>
+<td>165.75</td>
+<td>166.09</td>
+<td>168</td>
+</tr>
+</tbody>
+</table>
+</div>
+<div class="section" id="skylake-intel-xeon-gold-6148">
+<h2><a class="toc-backref" href="#id25">6.3&nbsp;&nbsp;&nbsp;Skylake, Intel Xeon Gold 6148</a></h2>
+<ul class="simple">
+<li>Skylake architecture, AVX2, FMA, AVX512</li>
+<li>20 cores, 2.4 GHz</li>
+<li>SMT enabled</li>
+</ul>
+<p>memory bandwidth:</p>
+<ul class="simple">
+<li>copy-19                  89.7 GB/s</li>
+<li>copy-19-nt-sl            92.4 GB/s</li>
+<li>update-19                93.6 GB/s</li>
+</ul>
+<p>geometry dimensions:  500x100x100</p>
+<table border="1" class="docutils">
+<colgroup>
+<col width="20%" />
+<col width="7%" />
+<col width="7%" />
+<col width="7%" />
+<col width="7%" />
+<col width="7%" />
+<col width="7%" />
+<col width="7%" />
+<col width="7%" />
+<col width="7%" />
+<col width="7%" />
+<col width="7%" />
+<col width="2%" />
+</colgroup>
+<thead valign="bottom">
+<tr><th class="head">kernel</th>
+<th class="head">pipe</th>
+<th class="head">blocks-2</th>
+<th class="head">blocks-4</th>
+<th class="head">blocks-6</th>
+<th class="head">blocks-8</th>
+<th class="head">blocks-10</th>
+<th class="head">blocks-15</th>
+<th class="head">blocks-16</th>
+<th class="head">blocks-20</th>
+<th class="head">blocks-25</th>
+<th class="head">blocks-32</th>
+<th class="head">RFM</th>
+</tr>
+</thead>
+<tbody valign="top">
+<tr><td>blk-push-aos</td>
+<td>113.01</td>
+<td>93.99</td>
+<td>108.98</td>
+<td>114.65</td>
+<td>117.87</td>
+<td>119.47</td>
+<td>124.95</td>
+<td>122.46</td>
+<td>129.29</td>
+<td>123.87</td>
+<td>133.01</td>
+<td>197</td>
+</tr>
+<tr><td>blk-push-soa</td>
+<td>100.21</td>
+<td>98.87</td>
+<td>103.63</td>
+<td>105.56</td>
+<td>107.02</td>
+<td>107.27</td>
+<td>111.61</td>
+<td>109.83</td>
+<td>116.16</td>
+<td>110.51</td>
+<td>110.29</td>
+<td>197</td>
+</tr>
+<tr><td>blk-pull-aos</td>
+<td>118.45</td>
+<td>102.54</td>
+<td>114.12</td>
+<td>117.82</td>
+<td>122.69</td>
+<td>124.31</td>
+<td>130.58</td>
+<td>127.85</td>
+<td>135.72</td>
+<td>129.65</td>
+<td>139.94</td>
+<td>197</td>
+</tr>
+<tr><td>blk-pull-soa</td>
+<td>82.60</td>
+<td>83.36</td>
+<td>87.13</td>
+<td>88.39</td>
+<td>88.84</td>
+<td>88.96</td>
+<td>92.48</td>
+<td>90.93</td>
+<td>95.79</td>
+<td>91.92</td>
+<td>98.64</td>
+<td>197</td>
+</tr>
+<tr><td>aa-aos</td>
+<td>171.32</td>
+<td>125.43</td>
+<td>147.73</td>
+<td>157.70</td>
+<td>163.35</td>
+<td>167.25</td>
+<td>175.39</td>
+<td>174.20</td>
+<td>182.54</td>
+<td>173.67</td>
+<td>187.76</td>
+<td>308</td>
+</tr>
+<tr><td>aa-soa</td>
+<td>180.85</td>
+<td>152.39</td>
+<td>165.84</td>
+<td>152.59</td>
+<td>171.90</td>
+<td>175.76</td>
+<td>184.94</td>
+<td>182.34</td>
+<td>189.43</td>
+<td>180.30</td>
+<td>193.54</td>
+<td>308</td>
+</tr>
+<tr><td>aa-vec-soa</td>
+<td>208.03</td>
+<td>181.51</td>
+<td>195.86</td>
+<td>203.41</td>
+<td>209.08</td>
+<td>212.34</td>
+<td>224.05</td>
+<td>219.49</td>
+<td>234.31</td>
+<td>225.92</td>
+<td>245.22</td>
+<td>308</td>
+</tr>
+<tr><td>list-push-aos</td>
+<td>158.81</td>
+<td>164.67</td>
+<td>162.93</td>
+<td>163.05</td>
+<td>165.22</td>
+<td>164.31</td>
+<td>164.66</td>
+<td>160.78</td>
+<td>164.07</td>
+<td>165.19</td>
+<td>164.06</td>
+<td>177</td>
+</tr>
+<tr><td>list-push-soa</td>
+<td>134.60</td>
+<td>110.44</td>
+<td>110.17</td>
+<td>132.01</td>
+<td>132.95</td>
+<td>133.46</td>
+<td>134.37</td>
+<td>134.33</td>
+<td>135.12</td>
+<td>134.91</td>
+<td>137.87</td>
+<td>177</td>
+</tr>
+<tr><td>list-pull-aos</td>
+<td>169.61</td>
+<td>170.03</td>
+<td>170.89</td>
+<td>170.90</td>
+<td>171.20</td>
+<td>171.60</td>
+<td>172.09</td>
+<td>171.95</td>
+<td>169.48</td>
+<td>172.08</td>
+<td>171.02</td>
+<td>177</td>
+</tr>
+<tr><td>list-pull-soa</td>
+<td>120.50</td>
+<td>116.73</td>
+<td>118.62</td>
+<td>118.00</td>
+<td>120.99</td>
+<td>118.15</td>
+<td>117.17</td>
+<td>121.41</td>
+<td>120.83</td>
+<td>120.00</td>
+<td>118.74</td>
+<td>177</td>
+</tr>
+<tr><td>list-pull-split-nt-1s-soa</td>
+<td>225.59</td>
+<td>224.18</td>
+<td>225.10</td>
+<td>226.34</td>
+<td>226.01</td>
+<td>230.37</td>
+<td>227.50</td>
+<td>228.42</td>
+<td>227.39</td>
+<td>231.65</td>
+<td>227.35</td>
+<td>246</td>
+</tr>
+<tr><td>list-pull-split-nt-2s-soa</td>
+<td>219.20</td>
+<td>214.63</td>
+<td>217.61</td>
+<td>218.13</td>
+<td>219.07</td>
+<td>221.01</td>
+<td>219.88</td>
+<td>220.09</td>
+<td>220.62</td>
+<td>221.68</td>
+<td>220.58</td>
+<td>246</td>
+</tr>
+<tr><td>list-aa-aos</td>
+<td>241.39</td>
+<td>239.27</td>
+<td>239.53</td>
+<td>242.56</td>
+<td>242.46</td>
+<td>243.00</td>
+<td>242.91</td>
+<td>242.46</td>
+<td>241.24</td>
+<td>242.96</td>
+<td>241.52</td>
+<td>275</td>
+</tr>
+<tr><td>list-aa-soa</td>
+<td>273.73</td>
+<td>268.49</td>
+<td>268.48</td>
+<td>271.79</td>
+<td>275.29</td>
+<td>274.56</td>
+<td>277.18</td>
+<td>272.67</td>
+<td>274.21</td>
+<td>275.24</td>
+<td>278.21</td>
+<td>275</td>
+</tr>
+<tr><td>list-aa-ria-soa</td>
+<td>288.42</td>
+<td>261.89</td>
+<td>273.26</td>
+<td>284.84</td>
+<td>283.88</td>
+<td>288.29</td>
+<td>290.72</td>
+<td>289.81</td>
+<td>293.36</td>
+<td>290.75</td>
+<td>292.93</td>
+<td>308</td>
+</tr>
+<tr><td>list-aa-pv-soa</td>
+<td>303.35</td>
+<td>267.21</td>
+<td>289.18</td>
+<td>294.96</td>
+<td>294.36</td>
+<td>298.16</td>
+<td>300.45</td>
+<td>301.71</td>
+<td>302.37</td>
+<td>302.88</td>
+<td>304.46</td>
+<td>308</td>
+</tr>
+</tbody>
+</table>
 </div>
-<div class="section" id="results">
-<h1><a class="toc-backref" href="#id15">5&nbsp;&nbsp;&nbsp;Results</a></h1>
-<p>TODO</p>
 </div>
 <div class="section" id="licence">
-<h1><a class="toc-backref" href="#id16">6&nbsp;&nbsp;&nbsp;Licence</a></h1>
+<h1><a class="toc-backref" href="#id26">7&nbsp;&nbsp;&nbsp;Licence</a></h1>
 <p>The Lattice Boltzmann Benchmark Kernels are licensed under GPLv3.</p>
 </div>
 <div class="section" id="acknowledgements">
-<h1><a class="toc-backref" href="#id17">7&nbsp;&nbsp;&nbsp;Acknowledgements</a></h1>
+<h1><a class="toc-backref" href="#id27">8&nbsp;&nbsp;&nbsp;Acknowledgements</a></h1>
 <p>This work was funded by BMBF, grant no. 01IH15003A (project SKAMPY).</p>
 <p>This work was funded by KONWHIR project OMI4PAPS.</p>
-<p>Document was generated at 2017-11-02 15:33.</p>
+</div>
+<div class="section" id="bibliography">
+<h1><a class="toc-backref" href="#id28">9&nbsp;&nbsp;&nbsp;Bibliography</a></h1>
+<table class="docutils citation" frame="void" id="ginzburg-2008" rules="none">
+<colgroup><col class="label" /><col /></colgroup>
+<tbody valign="top">
+<tr><td class="label"><a class="fn-backref" href="#id1">[ginzburg-2008]</a></td><td>I. Ginzburg, F. Verhaeghe, and D. d'Humières.
+Two-relaxation-time lattice Boltzmann scheme: About parametrization, velocity, pressure and mixed boundary conditions.
+Commun. Comput. Phys., 3(2):427-478, 2008.</td></tr>
+</tbody>
+</table>
+<table class="docutils citation" frame="void" id="williams-2008" rules="none">
+<colgroup><col class="label" /><col /></colgroup>
+<tbody valign="top">
+<tr><td class="label"><a class="fn-backref" href="#id3">[williams-2008]</a></td><td>S. Williams, A. Waterman, and D. Patterson.
+Roofline: an insightful visual performance model for multicore architectures.
+Commun. ACM, 52(4):65-76, Apr 2009. doi:10.1145/1498765.1498785</td></tr>
+</tbody>
+</table>
+<p>Document was generated at 2017-11-21 15:43.</p>
 </div>
 </div>
 </body>

name	values	default	description
name	values	default	description
BENCHMARK	on, off	on
BUILD	debug, release	release	No optimization, debug symbols, DEBUG defined.	debug: no optimization, debug symbols, DEBUG defined. release: optimizations enabled.
CONFIG	linux-gcc, linux-intel
ISA	avx, sse	avx	Determines which ISA extension is used for macro definitions. This is not the architecture the compiler generates code for.	Determines which ISA extension is used for macro definitions of the intrinsics. This is not the architecture the compiler generates code for.
OPENMP	on, off
kernel	pipe	blocks-2	blocks-4	blocks-6	blocks-8	blocks-10	blocks-15	blocks-16	blocks-20	blocks-25	blocks-32	RFM
blk-push-aos	58.82	49.85	57.34	59.90	61.37	62.17	65.30	64.00	67.54	64.46	69.69	104
blk-push-soa	32.32	33.46	34.02	34.64	35.06	35.04	36.31	35.44	37.20	35.14	37.95	104
blk-pull-aos	56.97	51.41	56.09	57.92	59.98	59.83	63.37	61.55	65.50	63.11	67.02	104
blk-pull-soa	49.29	46.23	47.50	51.97	51.27	49.52	55.23	53.13	54.50	49.79	57.90	104
aa-aos	91.35	66.14	76.80	84.76	83.63	91.36	93.46	92.62	93.91	92.25	92.93	145
aa-soa	75.51	65.68	70.94	71.36	73.83	75.46	74.84	79.48	83.28	77.70	82.72	145
aa-vec-soa	93.85	83.44	91.58	93.96	94.35	96.62	101.76	96.72	106.37	102.60	110.28	145
list-push-aos	80.29	80.97	80.95	81.10	81.37	82.44	81.77	81.49	80.72	81.93	80.93	83
list-push-soa	47.52	42.65	45.28	46.64	43.46	40.59	44.94	46.55	41.53	45.98	44.86	83
list-pull-aos	85.30	82.97	86.43	83.42	86.33	83.70	86.43	83.77	83.10	85.89	84.44	83
list-pull-soa	62.12	63.61	63.28	61.32	66.72	62.65	64.82	60.49	58.01	64.46	62.52	83
list-pull-split-nt-1s-soa	121.35	113.77	115.29	113.54	117.00	116.46	114.78	114.54	110.83	112.67	117.85	125
list-pull-split-nt-2s-soa	118.09	110.48	112.55	113.18	113.44	111.85	109.27	114.41	110.28	111.78	113.74	125
list-aa-aos	121.28	118.63	119.00	118.50	121.99	119.11	118.83	121.47	121.62	126.18	120.12	129
list-aa-soa	126.34	116.90	129.45	127.12	129.41	121.42	126.19	126.76	126.70	124.40	125.22	129
list-aa-ria-soa	133.68	121.82	126.04	128.46	131.15	132.25	128.78	133.50	126.69	124.40	130.37	145
list-aa-pv-soa	146.22	124.39	130.73	136.29	137.61	131.21	138.65	138.78	127.02	132.40	138.37	145
kernel	pipe	blocks-2	blocks-4	blocks-6	blocks-8	blocks-10	blocks-15	blocks-16	blocks-20	blocks-25	blocks-32	RFM
blk-push-aos	55.75	47.62	54.57	57.10	58.49	59.00	61.72	60.56	64.05	61.10	66.03	105
blk-push-soa	30.06	31.09	32.13	32.54	32.74	32.72	33.81	33.19	34.90	33.21	35.75	105
blk-pull-aos	53.80	48.61	53.08	54.99	56.08	56.68	59.20	58.12	61.49	58.71	63.45	105
blk-pull-soa	46.96	46.61	48.84	49.70	50.33	50.46	52.36	51.39	54.20	51.61	55.71	105
aa-aos	91.40	66.99	78.47	83.38	86.62	88.62	92.98	91.54	97.08	94.93	98.90	168
aa-soa	83.01	69.96	75.85	77.72	79.01	79.29	82.38	80.11	85.70	83.91	87.69	168
aa-vec-soa	112.03	96.52	105.32	109.76	112.55	113.82	120.55	118.37	126.30	121.37	131.94	168
list-push-aos	75.13	74.18	75.20	75.42	75.24	75.99	75.80	75.80	75.54	76.22	76.21	97
list-push-soa	40.99	38.14	39.00	38.89	38.89	39.67	39.87	39.28	39.35	40.08	40.13	97
list-pull-aos	82.07	82.88	83.29	83.09	83.32	83.49	82.82	82.88	83.32	82.60	82.93	97
list-pull-soa	62.07	60.40	61.89	61.39	62.43	60.90	60.48	62.80	62.50	61.10	60.38	97
list-pull-split-nt-1s-soa	125.81	120.60	121.96	122.34	122.86	123.53	123.64	123.67	125.94	124.09	123.69	128
list-pull-split-nt-2s-soa	122.79	117.16	118.86	119.16	119.56	119.99	120.01	120.03	122.64	120.57	120.39	128
list-aa-aos	128.13	127.41	129.31	129.07	129.79	129.63	129.67	129.94	129.12	128.41	129.72	150
list-aa-soa	141.60	139.78	141.58	142.16	141.94	141.31	142.37	142.25	142.43	141.40	142.26	150
list-aa-ria-soa	141.82	134.88	140.15	140.72	141.67	140.51	141.18	141.29	142.97	141.94	143.25	168
list-aa-pv-soa	164.79	140.95	159.24	161.78	162.40	163.04	164.69	164.38	165.11	165.75	166.09	168
kernel	pipe	blocks-2	blocks-4	blocks-6	blocks-8	blocks-10	blocks-15	blocks-16	blocks-20	blocks-25	blocks-32	RFM
blk-push-aos	113.01	93.99	108.98	114.65	117.87	119.47	124.95	122.46	129.29	123.87	133.01	197
blk-push-soa	100.21	98.87	103.63	105.56	107.02	107.27	111.61	109.83	116.16	110.51	110.29	197
blk-pull-aos	118.45	102.54	114.12	117.82	122.69	124.31	130.58	127.85	135.72	129.65	139.94	197
blk-pull-soa	82.60	83.36	87.13	88.39	88.84	88.96	92.48	90.93	95.79	91.92	98.64	197
aa-aos	171.32	125.43	147.73	157.70	163.35	167.25	175.39	174.20	182.54	173.67	187.76	308
aa-soa	180.85	152.39	165.84	152.59	171.90	175.76	184.94	182.34	189.43	180.30	193.54	308
aa-vec-soa	208.03	181.51	195.86	203.41	209.08	212.34	224.05	219.49	234.31	225.92	245.22	308
list-push-aos	158.81	164.67	162.93	163.05	165.22	164.31	164.66	160.78	164.07	165.19	164.06	177
list-push-soa	134.60	110.44	110.17	132.01	132.95	133.46	134.37	134.33	135.12	134.91	137.87	177
list-pull-aos	169.61	170.03	170.89	170.90	171.20	171.60	172.09	171.95	169.48	172.08	171.02	177
list-pull-soa	120.50	116.73	118.62	118.00	120.99	118.15	117.17	121.41	120.83	120.00	118.74	177
list-pull-split-nt-1s-soa	225.59	224.18	225.10	226.34	226.01	230.37	227.50	228.42	227.39	231.65	227.35	246
list-pull-split-nt-2s-soa	219.20	214.63	217.61	218.13	219.07	221.01	219.88	220.09	220.62	221.68	220.58	246
list-aa-aos	241.39	239.27	239.53	242.56	242.46	243.00	242.91	242.46	241.24	242.96	241.52	275
list-aa-soa	273.73	268.49	268.48	271.79	275.29	274.56	277.18	272.67	274.21	275.24	278.21	275
list-aa-ria-soa	288.42	261.89	273.26	284.84	283.88	288.29	290.72	289.81	293.36	290.75	292.93	308
list-aa-pv-soa	303.35	267.21	289.18	294.96	294.36	298.16	300.45	301.71	302.37	302.88	304.46	308