add single precision, add aa-vec-sl-soa kernel, updated doc

[LbmBenchmarkKernelsPublic.git] / doc / main.html
diff --git a/doc/main.html b/doc/main.html

new file mode 100644 (file)

index 0000000..9f11866
--- /dev/null
+++ b/doc/main.html
@@ -0,0 +1,1239 @@
+<?xml version="1.0" encoding="utf-8" ?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+<meta name="generator" content="Docutils 0.12: http://docutils.sourceforge.net/" />
+<title>LBM Benchmark Kernels Documentation</title>
+<style type="text/css">
+
+/*
+:Author: David Goodger (goodger@python.org)
+:Id: $Id: html4css1.css 7614 2013-02-21 15:55:51Z milde $
+:Copyright: This stylesheet has been placed in the public domain.
+
+Default cascading style sheet for the HTML output of Docutils.
+
+See http://docutils.sf.net/docs/howto/html-stylesheets.html for how to
+customize this style sheet.
+*/
+
+/* used to remove borders from tables and images */
+.borderless, table.borderless td, table.borderless th {
+  border: 0 }
+
+table.borderless td, table.borderless th {
+  /* Override padding for "table.docutils td" with "! important".
+     The right padding separates the table cells. */
+  padding: 0 0.5em 0 0 ! important }
+
+.first {
+  /* Override more specific margin styles with "! important". */
+  margin-top: 0 ! important }
+
+.last, .with-subtitle {
+  margin-bottom: 0 ! important }
+
+.hidden {
+  display: none }
+
+a.toc-backref {
+  text-decoration: none ;
+  color: black }
+
+blockquote.epigraph {
+  margin: 2em 5em ; }
+
+dl.docutils dd {
+  margin-bottom: 0.5em }
+
+object[type="image/svg+xml"], object[type="application/x-shockwave-flash"] {
+  overflow: hidden;
+}
+
+/* Uncomment (and remove this text!) to get bold-faced definition list terms
+dl.docutils dt {
+  font-weight: bold }
+*/
+
+div.abstract {
+  margin: 2em 5em }
+
+div.abstract p.topic-title {
+  font-weight: bold ;
+  text-align: center }
+
+div.admonition, div.attention, div.caution, div.danger, div.error,
+div.hint, div.important, div.note, div.tip, div.warning {
+  margin: 2em ;
+  border: medium outset ;
+  padding: 1em }
+
+div.admonition p.admonition-title, div.hint p.admonition-title,
+div.important p.admonition-title, div.note p.admonition-title,
+div.tip p.admonition-title {
+  font-weight: bold ;
+  font-family: sans-serif }
+
+div.attention p.admonition-title, div.caution p.admonition-title,
+div.danger p.admonition-title, div.error p.admonition-title,
+div.warning p.admonition-title, .code .error {
+  color: red ;
+  font-weight: bold ;
+  font-family: sans-serif }
+
+/* Uncomment (and remove this text!) to get reduced vertical space in
+   compound paragraphs.
+div.compound .compound-first, div.compound .compound-middle {
+  margin-bottom: 0.5em }
+
+div.compound .compound-last, div.compound .compound-middle {
+  margin-top: 0.5em }
+*/
+
+div.dedication {
+  margin: 2em 5em ;
+  text-align: center ;
+  font-style: italic }
+
+div.dedication p.topic-title {
+  font-weight: bold ;
+  font-style: normal }
+
+div.figure {
+  margin-left: 2em ;
+  margin-right: 2em }
+
+div.footer, div.header {
+  clear: both;
+  font-size: smaller }
+
+div.line-block {
+  display: block ;
+  margin-top: 1em ;
+  margin-bottom: 1em }
+
+div.line-block div.line-block {
+  margin-top: 0 ;
+  margin-bottom: 0 ;
+  margin-left: 1.5em }
+
+div.sidebar {
+  margin: 0 0 0.5em 1em ;
+  border: medium outset ;
+  padding: 1em ;
+  background-color: #ffffee ;
+  width: 40% ;
+  float: right ;
+  clear: right }
+
+div.sidebar p.rubric {
+  font-family: sans-serif ;
+  font-size: medium }
+
+div.system-messages {
+  margin: 5em }
+
+div.system-messages h1 {
+  color: red }
+
+div.system-message {
+  border: medium outset ;
+  padding: 1em }
+
+div.system-message p.system-message-title {
+  color: red ;
+  font-weight: bold }
+
+div.topic {
+  margin: 2em }
+
+h1.section-subtitle, h2.section-subtitle, h3.section-subtitle,
+h4.section-subtitle, h5.section-subtitle, h6.section-subtitle {
+  margin-top: 0.4em }
+
+h1.title {
+  text-align: center }
+
+h2.subtitle {
+  text-align: center }
+
+hr.docutils {
+  width: 75% }
+
+img.align-left, .figure.align-left, object.align-left {
+  clear: left ;
+  float: left ;
+  margin-right: 1em }
+
+img.align-right, .figure.align-right, object.align-right {
+  clear: right ;
+  float: right ;
+  margin-left: 1em }
+
+img.align-center, .figure.align-center, object.align-center {
+  display: block;
+  margin-left: auto;
+  margin-right: auto;
+}
+
+.align-left {
+  text-align: left }
+
+.align-center {
+  clear: both ;
+  text-align: center }
+
+.align-right {
+  text-align: right }
+
+/* reset inner alignment in figures */
+div.align-right {
+  text-align: inherit }
+
+/* div.align-center * { */
+/*   text-align: left } */
+
+ol.simple, ul.simple {
+  margin-bottom: 1em }
+
+ol.arabic {
+  list-style: decimal }
+
+ol.loweralpha {
+  list-style: lower-alpha }
+
+ol.upperalpha {
+  list-style: upper-alpha }
+
+ol.lowerroman {
+  list-style: lower-roman }
+
+ol.upperroman {
+  list-style: upper-roman }
+
+p.attribution {
+  text-align: right ;
+  margin-left: 50% }
+
+p.caption {
+  font-style: italic }
+
+p.credits {
+  font-style: italic ;
+  font-size: smaller }
+
+p.label {
+  white-space: nowrap }
+
+p.rubric {
+  font-weight: bold ;
+  font-size: larger ;
+  color: maroon ;
+  text-align: center }
+
+p.sidebar-title {
+  font-family: sans-serif ;
+  font-weight: bold ;
+  font-size: larger }
+
+p.sidebar-subtitle {
+  font-family: sans-serif ;
+  font-weight: bold }
+
+p.topic-title {
+  font-weight: bold }
+
+pre.address {
+  margin-bottom: 0 ;
+  margin-top: 0 ;
+  font: inherit }
+
+pre.literal-block, pre.doctest-block, pre.math, pre.code {
+  margin-left: 2em ;
+  margin-right: 2em }
+
+pre.code .ln { color: grey; } /* line numbers */
+pre.code, code { background-color: #eeeeee }
+pre.code .comment, code .comment { color: #5C6576 }
+pre.code .keyword, code .keyword { color: #3B0D06; font-weight: bold }
+pre.code .literal.string, code .literal.string { color: #0C5404 }
+pre.code .name.builtin, code .name.builtin { color: #352B84 }
+pre.code .deleted, code .deleted { background-color: #DEB0A1}
+pre.code .inserted, code .inserted { background-color: #A3D289}
+
+span.classifier {
+  font-family: sans-serif ;
+  font-style: oblique }
+
+span.classifier-delimiter {
+  font-family: sans-serif ;
+  font-weight: bold }
+
+span.interpreted {
+  font-family: sans-serif }
+
+span.option {
+  white-space: nowrap }
+
+span.pre {
+  white-space: pre }
+
+span.problematic {
+  color: red }
+
+span.section-subtitle {
+  /* font-size relative to parent (h1..h6 element) */
+  font-size: 80% }
+
+table.citation {
+  border-left: solid 1px gray;
+  margin-left: 1px }
+
+table.docinfo {
+  margin: 2em 4em }
+
+table.docutils {
+  margin-top: 0.5em ;
+  margin-bottom: 0.5em }
+
+table.footnote {
+  border-left: solid 1px black;
+  margin-left: 1px }
+
+table.docutils td, table.docutils th,
+table.docinfo td, table.docinfo th {
+  padding-left: 0.5em ;
+  padding-right: 0.5em ;
+  vertical-align: top }
+
+table.docutils th.field-name, table.docinfo th.docinfo-name {
+  font-weight: bold ;
+  text-align: left ;
+  white-space: nowrap ;
+  padding-left: 0 }
+
+/* "booktabs" style (no vertical lines) */
+table.docutils.booktabs {
+  border: 0px;
+  border-top: 2px solid;
+  border-bottom: 2px solid;
+  border-collapse: collapse;
+}
+table.docutils.booktabs * {
+  border: 0px;
+}
+table.docutils.booktabs th {
+  border-bottom: thin solid;
+  text-align: left;
+}
+
+h1 tt.docutils, h2 tt.docutils, h3 tt.docutils,
+h4 tt.docutils, h5 tt.docutils, h6 tt.docutils {
+  font-size: 100% }
+
+ul.auto-toc {
+  list-style-type: none }
+
+</style>
+<style type="text/css">
+
+
+h1, h2, h3, h4, h5, h6 {
+  font-family: sans-serif;
+  font-size: 100%;
+  background-color: #dcdcdc;
+}
+
+h1.title {
+  background-color: gray;
+  color: white
+}
+
+table.footnote {
+  padding-left: 0.5ex;
+}
+
+table.citation {
+  padding-left: 0.5ex
+}
+
+td.label {
+  width: 10%;
+}
+
+table, table.docutils, td, th {
+  border: 0;
+}
+
+table.citation, table.footnote {
+  width: 100%;
+}
+
+th {
+    background-color: lavender ;
+}
+
+tr:nth-child(even) {
+    xxbackground-color: aliceblue;
+    background-color: white; 
+}
+tr:nth-child(odd) {
+    xxbackground-color: lavender;
+    background-color: whitesmoke;
+}
+
+
+
+</style>
+</head>
+<body>
+<div class="document">
+
+
+<div class="line-block">
+<div class="line">Copyright</div>
+<div class="line-block">
+<div class="line">Markus Wittmann, 2016-2018</div>
+<div class="line">RRZE, University of Erlangen-Nuremberg, Germany</div>
+<div class="line">markus.wittmann -at- fau.de or hpc -at- rrze.fau.de</div>
+<div class="line"><br /></div>
+<div class="line">Viktor Haag, 2016</div>
+<div class="line">LSS, University of Erlangen-Nuremberg, Germany</div>
+<div class="line"><br /></div>
+</div>
+<div class="line">This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).</div>
+<div class="line"><br /></div>
+<div class="line">LbmBenchKernels is free software: you can redistribute it and/or modify</div>
+<div class="line">it under the terms of the GNU General Public License as published by</div>
+<div class="line">the Free Software Foundation, either version 3 of the License, or</div>
+<div class="line">(at your option) any later version.</div>
+<div class="line"><br /></div>
+<div class="line">LbmBenchKernels is distributed in the hope that it will be useful,</div>
+<div class="line">but WITHOUT ANY WARRANTY; without even the implied warranty of</div>
+<div class="line">MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the</div>
+<div class="line">GNU General Public License for more details.</div>
+<div class="line"><br /></div>
+<div class="line">You should have received a copy of the GNU General Public License</div>
+<div class="line">along with LbmBenchKernels.  If not, see &lt;<a class="reference external" href="http://www.gnu.org/licenses/">http://www.gnu.org/licenses/</a>&gt;.</div>
+</div>
+<p><strong>LBM Benchmark Kernels Documentation</strong></p>
+<div class="contents topic" id="contents">
+<p class="topic-title first">Contents</p>
+<ul class="auto-toc simple">
+<li><a class="reference internal" href="#introduction" id="id5">1&nbsp;&nbsp;&nbsp;Introduction</a></li>
+<li><a class="reference internal" href="#compilation" id="id6">2&nbsp;&nbsp;&nbsp;Compilation</a><ul class="auto-toc">
+<li><a class="reference internal" href="#debug-and-verification" id="id7">2.1&nbsp;&nbsp;&nbsp;Debug and Verification</a></li>
+<li><a class="reference internal" href="#release-and-verification" id="id8">2.2&nbsp;&nbsp;&nbsp;Release and Verification</a></li>
+<li><a class="reference internal" href="#benchmarking" id="id9">2.3&nbsp;&nbsp;&nbsp;Benchmarking</a></li>
+<li><a class="reference internal" href="#compilers" id="id10">2.4&nbsp;&nbsp;&nbsp;Compilers</a></li>
+<li><a class="reference internal" href="#floating-point-precision" id="id11">2.5&nbsp;&nbsp;&nbsp;Floating Point Precision</a></li>
+<li><a class="reference internal" href="#cleaning" id="id12">2.6&nbsp;&nbsp;&nbsp;Cleaning</a></li>
+<li><a class="reference internal" href="#options-summary" id="id13">2.7&nbsp;&nbsp;&nbsp;Options Summary</a></li>
+</ul>
+</li>
+<li><a class="reference internal" href="#invocation" id="id14">3&nbsp;&nbsp;&nbsp;Invocation</a><ul class="auto-toc">
+<li><a class="reference internal" href="#command-line-parameters" id="id15">3.1&nbsp;&nbsp;&nbsp;Command Line Parameters</a></li>
+<li><a class="reference internal" href="#kernels" id="id16">3.2&nbsp;&nbsp;&nbsp;Kernels</a></li>
+</ul>
+</li>
+<li><a class="reference internal" href="#id2" id="id17">4&nbsp;&nbsp;&nbsp;Benchmarking</a><ul class="auto-toc">
+<li><a class="reference internal" href="#intel-compiler" id="id18">4.1&nbsp;&nbsp;&nbsp;Intel Compiler</a></li>
+<li><a class="reference internal" href="#pinning" id="id19">4.2&nbsp;&nbsp;&nbsp;Pinning</a></li>
+<li><a class="reference internal" href="#general-remarks" id="id20">4.3&nbsp;&nbsp;&nbsp;General Remarks</a></li>
+<li><a class="reference internal" href="#padding" id="id21">4.4&nbsp;&nbsp;&nbsp;Padding</a></li>
+</ul>
+</li>
+<li><a class="reference internal" href="#geometries" id="id22">5&nbsp;&nbsp;&nbsp;Geometries</a></li>
+<li><a class="reference internal" href="#performance-results" id="id23">6&nbsp;&nbsp;&nbsp;Performance Results</a><ul class="auto-toc">
+<li><a class="reference internal" href="#machine-specifications" id="id24">6.1&nbsp;&nbsp;&nbsp;Machine Specifications</a></li>
+<li><a class="reference internal" href="#single-socket-results" id="id25">6.2&nbsp;&nbsp;&nbsp;Single Socket Results</a></li>
+</ul>
+</li>
+<li><a class="reference internal" href="#licence" id="id26">7&nbsp;&nbsp;&nbsp;Licence</a></li>
+<li><a class="reference internal" href="#acknowledgements" id="id27">8&nbsp;&nbsp;&nbsp;Acknowledgements</a></li>
+<li><a class="reference internal" href="#bibliography" id="id28">9&nbsp;&nbsp;&nbsp;Bibliography</a></li>
+</ul>
+</div>
+<div class="section" id="introduction">
+<h1><a class="toc-backref" href="#id5">1&nbsp;&nbsp;&nbsp;Introduction</a></h1>
+<p>The lattice Boltzmann (LBM) benchmark kernels are a collection of LBM kernel
+implementations.</p>
+<p><strong>AS SUCH THE LBM BENCHMARK KERNELS ARE NOT A FULLY EQUIPPED CFD SOLVER AND SOLELY
+SERVE THE PURPOSE OF STUDYING POSSIBLE PERFORMANCE OPTIMIZATIONS AND/OR
+EXPERIMENTS.</strong></p>
+<p>Currently all kernels utilize a D3Q19 discretization and the
+two-relaxation-time (TRT) collision operator <a class="citation-reference" href="#ginzburg-2008" id="id1">[ginzburg-2008]</a>.
+All operations are carried out in double or single precision arithmetic.</p>
+</div>
+<div class="section" id="compilation">
+<h1><a class="toc-backref" href="#id6">2&nbsp;&nbsp;&nbsp;Compilation</a></h1>
+<p>The benchmark framework currently supports only Linux systems and the GCC and
+Intel compilers. Every other configuration probably requires adjustment inside
+the code and the makefiles. Furthermore some code might be platform or at least
+POSIX specific.</p>
+<p>The benchmark can be built via <tt class="docutils literal">make</tt> from the <tt class="docutils literal">src</tt> subdirectory. This will
+generate one binary which hosts all implemented benchmark kernels.</p>
+<p>Binaries are located under the <tt class="docutils literal">bin</tt> subdirectory and will have different names
+depending on compiler and build configuration.</p>
+<p>Compilation can target debug or release builds. Combined with both build types
+verification can be enabled, which increases the runtime and hence is not
+suited for benchmarking.</p>
+<div class="section" id="debug-and-verification">
+<h2><a class="toc-backref" href="#id7">2.1&nbsp;&nbsp;&nbsp;Debug and Verification</a></h2>
+<pre class="literal-block">
+make BUILD=debug BENCHMARK=off
+</pre>
+<p>Running <tt class="docutils literal">make</tt> with <tt class="docutils literal">BUILD=debug</tt> builds the debug version of
+the benchmark kernels, where no optimizations are performed, line numbers and
+debug symbols are included, and <tt class="docutils literal">DEBUG</tt> is defined.  The resulting
+binary will be found in the <tt class="docutils literal">bin</tt> subdirectory and named
+<tt class="docutils literal"><span class="pre">lbmbenchk-linux-&lt;compiler&gt;-debug</span></tt>.</p>
+<p>Specifying <tt class="docutils literal">BENCHMARK=off</tt> turns on verification
+(<tt class="docutils literal">VERIFICATION=on</tt>), statistics (<tt class="docutils literal">STATISTICS=on</tt>), and VTK output
+(<tt class="docutils literal">VTK_OUTPUT=on</tt>).</p>
+<p>Please note that the generated binary will therefore
+exhibit a poor performance.</p>
+</div>
+<div class="section" id="release-and-verification">
+<h2><a class="toc-backref" href="#id8">2.2&nbsp;&nbsp;&nbsp;Release and Verification</a></h2>
+<p>Verification with the debug builds can be extremely slow. Hence verification
+capabilities can be built with release builds:</p>
+<pre class="literal-block">
+make BENCHMARK=off
+</pre>
+</div>
+<div class="section" id="benchmarking">
+<h2><a class="toc-backref" href="#id9">2.3&nbsp;&nbsp;&nbsp;Benchmarking</a></h2>
+<p>To generate a binary for benchmarking run make with</p>
+<pre class="literal-block">
+make
+</pre>
+<p>As default <tt class="docutils literal">BENCHMARK=on</tt> and <tt class="docutils literal">BUILD=release</tt> is set, where
+<tt class="docutils literal">BUILD=release</tt> turns optimizations on and <tt class="docutils literal">BENCHMARK=on</tt> disables
+verification, statistics, and VTK output.</p>
+<p>See Options Summary below for further description of options which can be
+applied, e.g. TARCH as well as the Benchmarking section.</p>
+</div>
+<div class="section" id="compilers">
+<h2><a class="toc-backref" href="#id10">2.4&nbsp;&nbsp;&nbsp;Compilers</a></h2>
+<p>Currently only the GCC and Intel compilers under Linux are supported. One can
+choose between both configurations via <tt class="docutils literal"><span class="pre">CONFIG=linux-gcc</span></tt> or
+<tt class="docutils literal"><span class="pre">CONFIG=linux-intel</span></tt>.</p>
+</div>
+<div class="section" id="floating-point-precision">
+<h2><a class="toc-backref" href="#id11">2.5&nbsp;&nbsp;&nbsp;Floating Point Precision</a></h2>
+<p>As default double precision data types are used for storing PDFs and floating
+point constants. Furthermore, this is the default for the intrinsics kernels.
+With the <tt class="docutils literal">PRECISION=sp</tt> variable this can be changed to single precision.</p>
+<pre class="literal-block">
+make PRECISION=sp   # build for single precision kernels
+
+make PRECISION=dp   # build for double precision kernels (default)
+</pre>
+</div>
+<div class="section" id="cleaning">
+<h2><a class="toc-backref" href="#id12">2.6&nbsp;&nbsp;&nbsp;Cleaning</a></h2>
+<p>For each configuration and build (debug/release) a subdirectory under the
+<tt class="docutils literal">src/obj</tt> directory is created where the dependency and object files are
+stored.
+With</p>
+<pre class="literal-block">
+make CONFIG=... BUILD=... clean
+</pre>
+<p>a specific combination is selected and cleaned, whereas with</p>
+<pre class="literal-block">
+make clean-all
+</pre>
+<p>all object and dependency files are deleted.</p>
+</div>
+<div class="section" id="options-summary">
+<h2><a class="toc-backref" href="#id13">2.7&nbsp;&nbsp;&nbsp;Options Summary</a></h2>
+<p>Options that can be specified when building the suite with make:</p>
+<table border="1" class="docutils">
+<colgroup>
+<col width="7%" />
+<col width="12%" />
+<col width="6%" />
+<col width="75%" />
+</colgroup>
+<thead valign="bottom">
+<tr><th class="head">name</th>
+<th class="head">values</th>
+<th class="head">default</th>
+<th class="head">description</th>
+</tr>
+</thead>
+<tbody valign="top">
+<tr><td>BENCHMARK</td>
+<td>on, off</td>
+<td>on</td>
+<td>If enabled, disables VERIFICATION, STATISTICS, VTK_OUTPUT. If disabled enables the three former options.</td>
+</tr>
+<tr><td>BUILD</td>
+<td>debug, release</td>
+<td>release</td>
+<td>debug: no optimization, debug symbols, DEBUG defined. release: optimizations enabled.</td>
+</tr>
+<tr><td>CONFIG</td>
+<td>linux-gcc, linux-intel</td>
+<td>linux-intel</td>
+<td>Select GCC or Intel compiler.</td>
+</tr>
+<tr><td>ISA</td>
+<td>avx, sse</td>
+<td>avx</td>
+<td>Determines which ISA extension is used for macro definitions of the intrinsics. This is <em>not</em> the architecture the compiler generates code for.</td>
+</tr>
+<tr><td>OPENMP</td>
+<td>on, off</td>
+<td>on</td>
+<td>OpenMP, i.e. threading support.</td>
+</tr>
+<tr><td>PRECISION</td>
+<td>dp, sp</td>
+<td>dp</td>
+<td>Floating point precision used for data type, arithmetic, and intrinsics.</td>
+</tr>
+<tr><td>STATISTICS</td>
+<td>on, off</td>
+<td>off</td>
+<td>View statistics, like density etc, during simulation.</td>
+</tr>
+<tr><td>TARCH</td>
+<td>--</td>
+<td>--</td>
+<td>Via TARCH the architecture the compiler generates code for can be overridden. The value depends on the chosen compiler.</td>
+</tr>
+<tr><td>VERIFICATION</td>
+<td>on, off</td>
+<td>off</td>
+<td>Turn verification on/off.</td>
+</tr>
+<tr><td>VTK_OUTPUT</td>
+<td>on, off</td>
+<td>off</td>
+<td>Enable/Disable VTK file output.</td>
+</tr>
+</tbody>
+</table>
+</div>
+</div>
+<div class="section" id="invocation">
+<h1><a class="toc-backref" href="#id14">3&nbsp;&nbsp;&nbsp;Invocation</a></h1>
+<p>Running the binary will print, along with the GPL licence header, a line like the following:</p>
+<pre class="literal-block">
+LBM Benchmark Kernels 0.1, compiled Jul  5 2017 21:59:22, type: verification
+</pre>
+<p>if verification was enabled during compilation or</p>
+<pre class="literal-block">
+LBM Benchmark Kernels 0.1, compiled Jul  5 2017 21:59:22, type: benchmark
+</pre>
+<p>if verification was disabled during compilation.</p>
+<div class="section" id="command-line-parameters">
+<h2><a class="toc-backref" href="#id15">3.1&nbsp;&nbsp;&nbsp;Command Line Parameters</a></h2>
+<p>Running the binary with <tt class="docutils literal"><span class="pre">-h</span></tt> lists all available parameters:</p>
+<pre class="literal-block">
+Usage:
+./lbmbenchk -list
+./lbmbenchk
+    [-dims XxYyZ] [-geometry box|channel|pipe|blocks[-&lt;block size&gt;]] [-iterations &lt;iterations&gt;] [-lattice-dump-ascii]
+    [-rho-in &lt;density&gt;] [-rho-out &lt;density] [-omega &lt;omega&gt;] [-kernel &lt;kernel&gt;]
+    [-periodic-x]
+    [-t &lt;number of threads&gt;]
+    [-pin core{,core}*]
+    [-verify]
+    -- &lt;kernel specific parameters&gt;
+
+-list           List available kernels.
+
+-dims XxYxZ     Specify geometry dimensions.
+
+-geometry blocks-&lt;block size&gt;
+                Geometetry with blocks of size &lt;block size&gt; regularily layout out.
+</pre>
+<p>If an option is specified multiple times the last one overrides previous ones.
+This holds also true for <tt class="docutils literal"><span class="pre">-verify</span></tt> which sets geometry dimensions,
+iterations, etc., which can afterwards be overridden, e.g.:</p>
+<pre class="literal-block">
+$ bin/lbmbenchk-linux-intel-release-dp -verify -dims 32x32x32
+</pre>
+<p>Kernel specific parameters can be obtained via selecting the specific kernel
+and passing <tt class="docutils literal"><span class="pre">-h</span></tt> as parameter:</p>
+<pre class="literal-block">
+$ bin/lbmbenchk-linux-intel-release-dp -kernel kernel-name -- -h
+...
+Kernel parameters:
+[-blk &lt;n&gt;] [-blk-[xyz] &lt;n&gt;]
+</pre>
+<p>A list of all available kernels can be obtained via <tt class="docutils literal"><span class="pre">-list</span></tt>:</p>
+<pre class="literal-block">
+$ ../bin/lbmbenchk-linux-gcc-debug-dp -list
+Lattice Boltzmann Benchmark Kernels (LbmBenchKernels) Copyright (C) 2016, 2017 LSS, RRZE
+This program comes with ABSOLUTELY NO WARRANTY; for details see LICENSE.
+This is free software, and you are welcome to redistribute it under certain conditions.
+
+LBM Benchmark Kernels 0.1, compiled Jul  5 2017 21:59:22, type: verification
+Available kernels to benchmark:
+   list-aa-pv-soa
+   list-aa-ria-soa
+   list-aa-soa
+   list-aa-aos
+   list-pull-split-nt-1s-soa
+   list-pull-split-nt-2s-soa
+   list-push-soa
+   list-push-aos
+   list-pull-soa
+   list-pull-aos
+   push-soa
+   push-aos
+   pull-soa
+   pull-aos
+   blk-push-soa
+   blk-push-aos
+   blk-pull-soa
+   blk-pull-aos
+</pre>
+</div>
+<div class="section" id="kernels">
+<h2><a class="toc-backref" href="#id16">3.2&nbsp;&nbsp;&nbsp;Kernels</a></h2>
+<p>The following list briefly describes the available kernels:</p>
+<ul class="simple">
+<li><strong>push-soa/push-aos/pull-soa/pull-aos</strong>:
+Unoptimized kernels (but stream/collide are already fused) using two grids as
+source and destination. Implement push/pull semantics as well as structure of
+arrays (soa) or array of structures (aos) layout.</li>
+<li><strong>blk-push-soa/blk-push-aos/blk-pull-soa/blk-pull-aos</strong>:
+The same as the unoptimized kernels without the blk prefix, except that they support
+spatial blocking, i.e. loop blocking of the three loops used to iterate over
+the lattice. Here manual work sharing for OpenMP is used.</li>
+<li><strong>aa-aos/aa-soa</strong>:
+Straightforward implementation of the AA pattern on full array with blocking support.
+Manual work sharing for OpenMP is used. Domain is partitioned only along the x dimension.</li>
+<li><strong>aa-vec-soa/aa-vec-sl-soa</strong>:
+Optimized AA kernel with intrinsics on full array. aa-vec-sl-soa uses only
+one loop for iterating over the lattice instead of three nested ones.</li>
+<li><strong>list-push-soa/list-push-aos/list-pull-soa/list-pull-aos</strong>:
+The same as the unoptimized kernels without the list prefix, but for indirect addressing.
+Here only a 1D vector is used to store the fluid nodes, omitting the
+obstacles. An adjacency list is used to recover the neighborhood associations.</li>
+<li><strong>list-pull-split-nt-1s-soa/list-pull-split-nt-2s-soa</strong>:
+Optimized variant of list-pull-soa. Chunks of the lattice are processed at
+once. Postcollision values are written back via nontemporal stores in 18 (1s)
+or 9 (2s) loops.</li>
+<li><strong>list-aa-aos/list-aa-soa</strong>:
+Unoptimized implementation of the AA pattern for the 1D vector with adjacency
+list. Both array of structures (aos) and structure of arrays (soa)
+data layouts are supported.</li>
+<li><strong>list-aa-ria-soa</strong>:
+Implementation of AA pattern with intrinsics for the 1D vector with adjacency
+list. Furthermore it contains a vectorized even time step and run length
+coding to reduce the loop balance of the odd time step.</li>
+<li><strong>list-aa-pv-soa</strong>:
+All optimizations of list-aa-ria-soa, plus additional partial vectorization
+of the odd time step.</li>
+</ul>
+<p>Note that all array of structures (aos) kernels might require blocking
+(depending on the domain size) to reach the performance of their structure of
+arrays (soa) counterparts.</p>
+<p>The following table summarizes the properties of the kernels. Here <strong>D</strong> means
+direct addressing, i.e. full array, <strong>I</strong> means indirect addressing, i.e. 1D
+vector with adjacency list, <strong>x</strong> means supported, whereas <strong>--</strong> means unsupported.
+The loop balance B_l is computed for D3Q19 model with <strong>double precision</strong> floating
+point for PDFs (8 byte) and 4 byte integers for the index (adjacency list).
+As list-aa-ria-soa and list-aa-pv-soa support run length coding their effective
+loop balance depends on the geometry. The effective loop balance is printed
+during each run.</p>
+<table border="1" class="docutils">
+<colgroup>
+<col width="29%" />
+<col width="14%" />
+<col width="14%" />
+<col width="6%" />
+<col width="10%" />
+<col width="10%" />
+<col width="16%" />
+</colgroup>
+<thead valign="bottom">
+<tr><th class="head">kernel name</th>
+<th class="head">prop. step</th>
+<th class="head">data layout</th>
+<th class="head">addr.</th>
+<th class="head">parallel</th>
+<th class="head">blocking</th>
+<th class="head">B_l [B/FLUP]</th>
+</tr>
+</thead>
+<tbody valign="top">
+<tr><td>push-soa</td>
+<td>OS</td>
+<td>SoA</td>
+<td>D</td>
+<td>x</td>
+<td>--</td>
+<td>456</td>
+</tr>
+<tr><td>push-aos</td>
+<td>OS</td>
+<td>AoS</td>
+<td>D</td>
+<td>x</td>
+<td>--</td>
+<td>456</td>
+</tr>
+<tr><td>pull-soa</td>
+<td>OS</td>
+<td>SoA</td>
+<td>D</td>
+<td>x</td>
+<td>--</td>
+<td>456</td>
+</tr>
+<tr><td>pull-aos</td>
+<td>OS</td>
+<td>AoS</td>
+<td>D</td>
+<td>x</td>
+<td>--</td>
+<td>456</td>
+</tr>
+<tr><td>blk-push-soa</td>
+<td>OS</td>
+<td>SoA</td>
+<td>D</td>
+<td>x</td>
+<td>x</td>
+<td>456</td>
+</tr>
+<tr><td>blk-push-aos</td>
+<td>OS</td>
+<td>AoS</td>
+<td>D</td>
+<td>x</td>
+<td>x</td>
+<td>456</td>
+</tr>
+<tr><td>blk-pull-soa</td>
+<td>OS</td>
+<td>SoA</td>
+<td>D</td>
+<td>x</td>
+<td>x</td>
+<td>456</td>
+</tr>
+<tr><td>blk-pull-aos</td>
+<td>OS</td>
+<td>AoS</td>
+<td>D</td>
+<td>x</td>
+<td>x</td>
+<td>456</td>
+</tr>
+<tr><td>aa-soa</td>
+<td>AA</td>
+<td>SoA</td>
+<td>D</td>
+<td>x</td>
+<td>x</td>
+<td>304</td>
+</tr>
+<tr><td>aa-aos</td>
+<td>AA</td>
+<td>AoS</td>
+<td>D</td>
+<td>x</td>
+<td>x</td>
+<td>304</td>
+</tr>
+<tr><td>aa-vec-soa</td>
+<td>AA</td>
+<td>SoA</td>
+<td>D</td>
+<td>x</td>
+<td>x</td>
+<td>304</td>
+</tr>
+<tr><td>aa-vec-sl-soa</td>
+<td>AA</td>
+<td>SoA</td>
+<td>D</td>
+<td>x</td>
+<td>x</td>
+<td>304</td>
+</tr>
+<tr><td>list-push-soa</td>
+<td>OS</td>
+<td>SoA</td>
+<td>I</td>
+<td>x</td>
+<td>x</td>
+<td>528</td>
+</tr>
+<tr><td>list-push-aos</td>
+<td>OS</td>
+<td>AoS</td>
+<td>I</td>
+<td>x</td>
+<td>x</td>
+<td>528</td>
+</tr>
+<tr><td>list-pull-soa</td>
+<td>OS</td>
+<td>SoA</td>
+<td>I</td>
+<td>x</td>
+<td>x</td>
+<td>528</td>
+</tr>
+<tr><td>list-pull-aos</td>
+<td>OS</td>
+<td>AoS</td>
+<td>I</td>
+<td>x</td>
+<td>x</td>
+<td>528</td>
+</tr>
+<tr><td>list-pull-split-nt-1s</td>
+<td>OS</td>
+<td>SoA</td>
+<td>I</td>
+<td>x</td>
+<td>x</td>
+<td>376</td>
+</tr>
+<tr><td>list-pull-split-nt-2s</td>
+<td>OS</td>
+<td>SoA</td>
+<td>I</td>
+<td>x</td>
+<td>x</td>
+<td>376</td>
+</tr>
+<tr><td>list-aa-soa</td>
+<td>AA</td>
+<td>SoA</td>
+<td>I</td>
+<td>x</td>
+<td>x</td>
+<td>340</td>
+</tr>
+<tr><td>list-aa-aos</td>
+<td>AA</td>
+<td>AoS</td>
+<td>I</td>
+<td>x</td>
+<td>x</td>
+<td>340</td>
+</tr>
+<tr><td>list-aa-ria-soa</td>
+<td>AA</td>
+<td>SoA</td>
+<td>I</td>
+<td>x</td>
+<td>x</td>
+<td>304-342</td>
+</tr>
+<tr><td>list-aa-pv-soa</td>
+<td>AA</td>
+<td>SoA</td>
+<td>I</td>
+<td>x</td>
+<td>x</td>
+<td>304-342</td>
+</tr>
+</tbody>
+</table>
+</div>
+</div>
+<div class="section" id="id2">
+<h1><a class="toc-backref" href="#id17">4&nbsp;&nbsp;&nbsp;Benchmarking</a></h1>
+<p>Correct benchmarking is a nontrivial task. Whenever benchmark results should be
+created make sure the binary was compiled with:</p>
+<ul class="simple">
+<li><tt class="docutils literal">BENCHMARK=on</tt> (default if not overridden) and</li>
+<li><tt class="docutils literal">BUILD=release</tt> (default if not overridden) and</li>
+<li>the correct ISA for macros is used, selected via <tt class="docutils literal">ISA</tt> and</li>
+<li>use <tt class="docutils literal">TARCH</tt> to specify the architecture the compiler generates code for.</li>
+</ul>
+<div class="section" id="intel-compiler">
+<h2><a class="toc-backref" href="#id18">4.1&nbsp;&nbsp;&nbsp;Intel Compiler</a></h2>
+<p>For the Intel compiler one can specify depending on the target ISA extension:</p>
+<ul class="simple">
+<li>AVX:          <tt class="docutils literal"><span class="pre">TARCH=-xAVX</span></tt></li>
+<li>AVX2 and FMA: <tt class="docutils literal"><span class="pre">TARCH=-xCORE-AVX2,-fma</span></tt></li>
+<li>AVX512:       <tt class="docutils literal"><span class="pre">TARCH=-xCORE-AVX512</span></tt></li>
+<li>KNL:          <tt class="docutils literal"><span class="pre">TARCH=-xMIC-AVX512</span></tt></li>
+</ul>
+<p>Compiling for an architecture supporting AVX (Sandy Bridge, Ivy Bridge):</p>
+<pre class="literal-block">
+make ISA=avx TARCH=-xAVX
+</pre>
+<p>Compiling for an architecture supporting AVX2 (Haswell, Broadwell):</p>
+<pre class="literal-block">
+make ISA=avx TARCH=-xCORE-AVX2,-fma
+</pre>
+<p>WARNING: ISA is here still set to <tt class="docutils literal">avx</tt> as the FMA intrinsics are currently not
+implemented. This might change in the future.</p>
+<p>Compiling for an architecture supporting AVX-512 (Skylake):</p>
+<pre class="literal-block">
+make ISA=avx TARCH=-xCORE-AVX512
+</pre>
+<p>WARNING: ISA is here still set to <tt class="docutils literal">avx</tt> as currently we have no implementation for the
+AVX512 intrinsics. This might change in the future.</p>
+</div>
+<div class="section" id="pinning">
+<h2><a class="toc-backref" href="#id19">4.2&nbsp;&nbsp;&nbsp;Pinning</a></h2>
+<p>During benchmarking pinning should be used via the <tt class="docutils literal"><span class="pre">-pin</span></tt> parameter. Running
+a benchmark with 10 threads and pinning them to the first 10 cores works like this:</p>
+<pre class="literal-block">
+$ bin/lbmbenchk-linux-intel-release-dp ... -t 10 -pin $(seq -s , 0 9)
+</pre>
+</div>
+<div class="section" id="general-remarks">
+<h2><a class="toc-backref" href="#id20">4.3&nbsp;&nbsp;&nbsp;General Remarks</a></h2>
+<p>Things the binary does not check or control:</p>
+<ul class="simple">
+<li>transparent huge pages: when allocating memory small 4 KiB pages might be
+replaced with larger ones. This is in general a good thing, but whether this
+actually happens depends on the system settings (check e.g. the status of
+<tt class="docutils literal">/sys/kernel/mm/transparent_hugepage/enabled</tt>).
+Currently <tt class="docutils literal">madvise(MADV_HUGEPAGE)</tt> is used for allocations which are aligned to
+a 4 KiB page, which should be the case for the lattices.
+This should result in huge pages unless THP is disabled on the machine.
+(NOTE: madvise() is used if <tt class="docutils literal">HAVE_HUGE_PAGES</tt> is defined, which is currently
+hard-coded in <tt class="docutils literal">Memory.c</tt>).</li>
+<li>CPU/core frequency: For reproducible results the frequency of all cores
+should be fixed.</li>
+<li>NUMA placement policy: The benchmark assumes a first touch policy, which
+means the memory will be placed at the NUMA domain the touching core is
+associated with. If a different policy is in place or the NUMA domain to be
+used is already full, memory might be allocated in a remote domain. Accesses
+to remote domains typically have a higher latency and lower bandwidth.</li>
+<li>System load: interference with other applications, especially on desktop
+systems should be avoided.</li>
+<li>Padding: For SoA based kernels the number of (fluid) nodes is automatically
+adjusted so that no cache or TLB thrashing should occur. The parameters are
+optimized for current Intel based systems. For more details look into the
+padding section.</li>
+<li>CPU dispatcher function: the compiler might add different versions of a
+function for different ISA extensions. Make sure the code you might think is
+executed is actually the code which is executed.</li>
+</ul>
+</div>
+<div class="section" id="padding">
+<h2><a class="toc-backref" href="#id21">4.4&nbsp;&nbsp;&nbsp;Padding</a></h2>
+<p>With correct padding cache and TLB thrashing can be avoided. Therefore the
+number of (fluid) nodes used in the data layout is artificially increased.</p>
+<p>Currently automatic padding is active for kernels which support it. It can be
+controlled via the kernel parameter (i.e. parameter after the <tt class="docutils literal"><span class="pre">--</span></tt>)
+<tt class="docutils literal"><span class="pre">-pad</span></tt>. Supported values are <tt class="docutils literal">auto</tt> (default), <tt class="docutils literal">no</tt> (to disable padding),
+or a manual padding.</p>
+<p>Automatic padding tries to avoid cache and TLB thrashing and pads for a 32
+entry (huge pages) TLB with 8 sets and a 512 set (L2) cache. This reflects the
+parameters of current Intel based processors.</p>
+<p>Manual padding is done via a padding string and has the format
+<tt class="docutils literal"><span class="pre">mod_1+offset_1(,mod_n+offset_n)</span></tt>, which specifies numbers of bytes.
+SoA data layouts can exhibit TLB thrashing. Therefore we want to distribute the
+19 pages with one lattice (36 with two lattices) we are concurrently accessing
+over as many sets in the TLB as possible.
+This is controlled by the distance between the accessed pages, which is the
+number of (fluid) nodes in between them and can be adjusted by adding further
+(fluid) nodes.
+We want the distance d (in bytes) between two accessed pages to be e.g.
+<strong>d % (PAGE_SIZE * TLB_SETS) = PAGE_SIZE</strong>.
+This would distribute the pages evenly over the sets. Hereby <strong>PAGE_SIZE * TLB_SETS</strong>
+would be our <tt class="docutils literal">mod_1</tt> and <strong>PAGE_SIZE</strong> (after the =) our <tt class="docutils literal">offset_1</tt>.
+Measurements show that with only a quarter or half of a page size as offset
+higher performance is achieved, which is done by automatic padding.
+On top of this padding more paddings can be added. They are just added to the
+padding string and are separated by commas.</p>
+<p>A zero modulus in the padding string has a special meaning. Here the
+corresponding offset is just added to the number of nodes. A padding string
+like <tt class="docutils literal"><span class="pre">-pad</span> 0+16</tt> would add a static padding of two nodes (one node = 8 bytes).</p>
+</div>
+</div>
+<div class="section" id="geometries">
+<h1><a class="toc-backref" href="#id22">5&nbsp;&nbsp;&nbsp;Geometries</a></h1>
+<p>TODO: supported geometries: channel, pipe, blocks, fluid</p>
+</div>
+<div class="section" id="performance-results">
+<h1><a class="toc-backref" href="#id23">6&nbsp;&nbsp;&nbsp;Performance Results</a></h1>
+<p>This section lists performance values measured on several machines for
+different kernels and geometries and <strong>double precision</strong> floating point data/arithmetic.
+The <strong>RFM</strong> column denotes the expected performance as predicted by the
+Roofline performance model <a class="citation-reference" href="#williams-2008" id="id3">[williams-2008]</a>.
+For performance prediction of each kernel a memory bandwidth benchmark is used
+which mimics the kernel's memory access pattern and the kernel's loop balance
+(see <a class="citation-reference" href="#kernels" id="id4">[kernels]</a> for details).</p>
+<div class="section" id="machine-specifications">
+<h2><a class="toc-backref" href="#id24">6.1&nbsp;&nbsp;&nbsp;Machine Specifications</a></h2>
+<p><strong>Ivy Bridge, Intel Xeon E5-2660 v2</strong></p>
+<ul class="simple">
+<li>Ivy Bridge architecture, AVX</li>
+<li>10 cores, 2.2 GHz</li>
+<li>SMT enabled</li>
+<li>memory bandwidth:<ul>
+<li>copy-19             32.7 GB/s</li>
+<li>copy-19-nt-sl       35.6 GB/s</li>
+<li>update-19           37.4 GB/s</li>
+</ul>
+</li>
+</ul>
+<p><strong>Haswell, Intel Xeon E5-2695 v3</strong></p>
+<ul class="simple">
+<li>Haswell architecture, AVX2, FMA</li>
+<li>14 cores, 2.3 GHz</li>
+<li>2 x 7 cores in cluster-on-die (CoD) mode enabled</li>
+<li>SMT enabled</li>
+<li>memory bandwidth:<ul>
+<li>copy-19              47.3 GB/s</li>
+<li>copy-19-nt-sl        47.1 GB/s</li>
+<li>update-19            44.0 GB/s</li>
+</ul>
+</li>
+</ul>
+<p><strong>Broadwell, Intel Xeon E5-2630 v4</strong></p>
+<ul class="simple">
+<li>Broadwell architecture, AVX2, FMA</li>
+<li>10 cores, 2.2 GHz</li>
+<li>SMT disabled</li>
+<li>memory bandwidth:<ul>
+<li>copy-19              48.0 GB/s</li>
+<li>copy-19-nt-sl        48.2 GB/s</li>
+<li>update-19            51.1 GB/s</li>
+</ul>
+</li>
+</ul>
+<p><strong>Skylake, Intel Xeon Gold 6148</strong></p>
+<p>NOTE: currently we only use AVX2 intrinsics.</p>
+<ul class="simple">
+<li>Skylake server architecture, AVX2, AVX512, 2 FMA units</li>
+<li>20 cores, 2.4 GHz</li>
+<li>SMT enabled</li>
+<li>memory bandwidth:<ul>
+<li>copy-19              89.7 GB/s</li>
+<li>copy-19-nt-sl        92.4 GB/s</li>
+<li>update-19            93.6 GB/s</li>
+</ul>
+</li>
+</ul>
+<p><strong>Zen, AMD EPYC 7451</strong></p>
+<ul class="simple">
+<li>Zen architecture, AVX2, FMA</li>
+<li>24 cores, 2.3 GHz</li>
+<li>SMT enabled</li>
+<li>memory bandwidth:<ul>
+<li>copy-19              111.9 GB/s</li>
+<li>copy-19-nt-sl        111.7 GB/s</li>
+<li>update-19            109.2 GB/s</li>
+</ul>
+</li>
+</ul>
+<p><strong>Zen, AMD Ryzen 7 1700X</strong></p>
+<ul class="simple">
+<li>Zen architecture, AVX2, FMA</li>
+<li>8 cores, 3.4 GHz</li>
+<li>SMT enabled</li>
+<li>memory bandwidth:<ul>
+<li>copy-19              27.2 GB/s</li>
+<li>copy-19-nt-sl        27.1 GB/s</li>
+<li>update-19            26.1 GB/s</li>
+</ul>
+</li>
+</ul>
+</div>
+<div class="section" id="single-socket-results">
+<h2><a class="toc-backref" href="#id25">6.2&nbsp;&nbsp;&nbsp;Single Socket Results</a></h2>
+<ul class="simple">
+<li>Geometry dimensions are for all measurements 500x100x100 nodes.</li>
+<li>Note the <strong>different scaling on the y axis</strong> of the plots!</li>
+</ul>
+<table border="1" class="docutils">
+<colgroup>
+<col width="100%" />
+</colgroup>
+<tbody valign="top">
+<tr><td>Ivy Bridge, Intel Xeon E5-2660 v2, Double Precision</td>
+</tr>
+<tr><td><img alt="perf_emmy_dp" src="images/benchmark-emmy-dp.png" style="width: 1000.0px; height: 250.0px;" /></td>
+</tr>
+<tr><td>Ivy Bridge, Intel Xeon E5-2660 v2, Single Precision</td>
+</tr>
+<tr><td><img alt="perf_emmy_sp" src="images/benchmark-emmy-sp.png" style="width: 1000.0px; height: 250.0px;" /></td>
+</tr>
+<tr><td>Haswell, Intel Xeon E5-2695 v3, Double Precision</td>
+</tr>
+<tr><td><img alt="perf_hasep1_dp" src="images/benchmark-hasep1-dp.png" style="width: 1000.0px; height: 250.0px;" /></td>
+</tr>
+<tr><td>Haswell, Intel Xeon E5-2695 v3, Single Precision</td>
+</tr>
+<tr><td><img alt="perf_hasep1_sp" src="images/benchmark-hasep1-sp.png" style="width: 1000.0px; height: 250.0px;" /></td>
+</tr>
+<tr><td>Broadwell, Intel Xeon E5-2630 v4, Double Precision</td>
+</tr>
+<tr><td><img alt="perf_meggie_dp" src="images/benchmark-meggie-dp.png" style="width: 1000.0px; height: 250.0px;" /></td>
+</tr>
+<tr><td>Broadwell, Intel Xeon E5-2630 v4, Single Precision</td>
+</tr>
+<tr><td><img alt="perf_meggie_sp" src="images/benchmark-meggie-sp.png" style="width: 1000.0px; height: 250.0px;" /></td>
+</tr>
+<tr><td>Skylake, Intel Xeon Gold 6148, Double Precision, <strong>NOTE: currently we only use AVX2 intrinsics.</strong></td>
+</tr>
+<tr><td><img alt="perf_skylakesp2_dp" src="images/benchmark-skylakesp2-dp.png" style="width: 1000.0px; height: 250.0px;" /></td>
+</tr>
+<tr><td>Skylake, Intel Xeon Gold 6148, Single Precision, <strong>NOTE: currently we only use AVX2 intrinsics.</strong></td>
+</tr>
+<tr><td><img alt="perf_skylakesp2_sp" src="images/benchmark-skylakesp2-sp.png" style="width: 1000.0px; height: 250.0px;" /></td>
+</tr>
+<tr><td>Zen, AMD Ryzen 7 1700X, Double Precision</td>
+</tr>
+<tr><td><img alt="perf_summitridge1_dp" src="images/benchmark-summitridge1-dp.png" style="width: 1000.0px; height: 250.0px;" /></td>
+</tr>
+<tr><td>Zen, AMD Ryzen 7 1700X, Single Precision</td>
+</tr>
+<tr><td><img alt="perf_summitridge1_sp" src="images/benchmark-summitridge1-sp.png" style="width: 1000.0px; height: 250.0px;" /></td>
+</tr>
+<tr><td>Zen, AMD EPYC 7451, Double Precision</td>
+</tr>
+<tr><td><img alt="perf_naples1_dp" src="images/benchmark-naples1-dp.png" style="width: 1000.0px; height: 250.0px;" /></td>
+</tr>
+<tr><td>Zen, AMD EPYC 7451, Single Precision</td>
+</tr>
+<tr><td><img alt="perf_naples1_sp" src="images/benchmark-naples1-sp.png" style="width: 1000.0px; height: 250.0px;" /></td>
+</tr>
+</tbody>
+</table>
+</div>
+</div>
+<div class="section" id="licence">
+<h1><a class="toc-backref" href="#id26">7&nbsp;&nbsp;&nbsp;Licence</a></h1>
+<p>The Lattice Boltzmann Benchmark Kernels are licensed under GPLv3.</p>
+</div>
+<div class="section" id="acknowledgements">
+<h1><a class="toc-backref" href="#id27">8&nbsp;&nbsp;&nbsp;Acknowledgements</a></h1>
+<p>This work was funded by BMBF, grant no. 01IH15003A (project SKAMPY).</p>
+<p>This work was funded by KONWHIR project OMI4PAPS.</p>
+</div>
+<div class="section" id="bibliography">
+<h1><a class="toc-backref" href="#id28">9&nbsp;&nbsp;&nbsp;Bibliography</a></h1>
+<table class="docutils citation" frame="void" id="ginzburg-2008" rules="none">
+<colgroup><col class="label" /><col /></colgroup>
+<tbody valign="top">
+<tr><td class="label"><a class="fn-backref" href="#id1">[ginzburg-2008]</a></td><td>I. Ginzburg, F. Verhaeghe, and D. d'Humières.
+Two-relaxation-time lattice Boltzmann scheme: About parametrization, velocity, pressure and mixed boundary conditions.
+Commun. Comput. Phys., 3(2):427-478, 2008.</td></tr>
+</tbody>
+</table>
+<table class="docutils citation" frame="void" id="williams-2008" rules="none">
+<colgroup><col class="label" /><col /></colgroup>
+<tbody valign="top">
+<tr><td class="label"><a class="fn-backref" href="#id3">[williams-2008]</a></td><td>S. Williams, A. Waterman, and D. Patterson.
+Roofline: an insightful visual performance model for multicore architectures.
+Commun. ACM, 52(4):65-76, Apr 2009. doi:10.1145/1498765.1498785</td></tr>
+</tbody>
+</table>
+<p>Document was generated at 2018-01-09 11:54.</p>
+</div>
+</div>
+</body>
+</html>