update README
[LbmBenchmarkKernelsPublic.git] / doc / html / main.html
CommitLineData
ecf590ae
MW
1<?xml version="1.0" encoding="utf-8" ?>
2<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
3<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4<head>
5<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
6<meta name="generator" content="Docutils 0.12: http://docutils.sourceforge.net/" />
7<title>LBM Benchmark Kernels Documentation</title>
8<style type="text/css">
9
10/*
11:Author: David Goodger (goodger@python.org)
12:Id: $Id: html4css1.css 7614 2013-02-21 15:55:51Z milde $
13:Copyright: This stylesheet has been placed in the public domain.
14
15Default cascading style sheet for the HTML output of Docutils.
16
17See http://docutils.sf.net/docs/howto/html-stylesheets.html for how to
18customize this style sheet.
19*/
20
21/* used to remove borders from tables and images */
22.borderless, table.borderless td, table.borderless th {
23 border: 0 }
24
25table.borderless td, table.borderless th {
26 /* Override padding for "table.docutils td" with "! important".
27 The right padding separates the table cells. */
28 padding: 0 0.5em 0 0 ! important }
29
30.first {
31 /* Override more specific margin styles with "! important". */
32 margin-top: 0 ! important }
33
34.last, .with-subtitle {
35 margin-bottom: 0 ! important }
36
37.hidden {
38 display: none }
39
40a.toc-backref {
41 text-decoration: none ;
42 color: black }
43
44blockquote.epigraph {
45 margin: 2em 5em ; }
46
47dl.docutils dd {
48 margin-bottom: 0.5em }
49
50object[type="image/svg+xml"], object[type="application/x-shockwave-flash"] {
51 overflow: hidden;
52}
53
54/* Uncomment (and remove this text!) to get bold-faced definition list terms
55dl.docutils dt {
56 font-weight: bold }
57*/
58
59div.abstract {
60 margin: 2em 5em }
61
62div.abstract p.topic-title {
63 font-weight: bold ;
64 text-align: center }
65
66div.admonition, div.attention, div.caution, div.danger, div.error,
67div.hint, div.important, div.note, div.tip, div.warning {
68 margin: 2em ;
69 border: medium outset ;
70 padding: 1em }
71
72div.admonition p.admonition-title, div.hint p.admonition-title,
73div.important p.admonition-title, div.note p.admonition-title,
74div.tip p.admonition-title {
75 font-weight: bold ;
76 font-family: sans-serif }
77
78div.attention p.admonition-title, div.caution p.admonition-title,
79div.danger p.admonition-title, div.error p.admonition-title,
80div.warning p.admonition-title, .code .error {
81 color: red ;
82 font-weight: bold ;
83 font-family: sans-serif }
84
85/* Uncomment (and remove this text!) to get reduced vertical space in
86 compound paragraphs.
87div.compound .compound-first, div.compound .compound-middle {
88 margin-bottom: 0.5em }
89
90div.compound .compound-last, div.compound .compound-middle {
91 margin-top: 0.5em }
92*/
93
94div.dedication {
95 margin: 2em 5em ;
96 text-align: center ;
97 font-style: italic }
98
99div.dedication p.topic-title {
100 font-weight: bold ;
101 font-style: normal }
102
103div.figure {
104 margin-left: 2em ;
105 margin-right: 2em }
106
107div.footer, div.header {
108 clear: both;
109 font-size: smaller }
110
111div.line-block {
112 display: block ;
113 margin-top: 1em ;
114 margin-bottom: 1em }
115
116div.line-block div.line-block {
117 margin-top: 0 ;
118 margin-bottom: 0 ;
119 margin-left: 1.5em }
120
121div.sidebar {
122 margin: 0 0 0.5em 1em ;
123 border: medium outset ;
124 padding: 1em ;
125 background-color: #ffffee ;
126 width: 40% ;
127 float: right ;
128 clear: right }
129
130div.sidebar p.rubric {
131 font-family: sans-serif ;
132 font-size: medium }
133
134div.system-messages {
135 margin: 5em }
136
137div.system-messages h1 {
138 color: red }
139
140div.system-message {
141 border: medium outset ;
142 padding: 1em }
143
144div.system-message p.system-message-title {
145 color: red ;
146 font-weight: bold }
147
148div.topic {
149 margin: 2em }
150
151h1.section-subtitle, h2.section-subtitle, h3.section-subtitle,
152h4.section-subtitle, h5.section-subtitle, h6.section-subtitle {
153 margin-top: 0.4em }
154
155h1.title {
156 text-align: center }
157
158h2.subtitle {
159 text-align: center }
160
161hr.docutils {
162 width: 75% }
163
164img.align-left, .figure.align-left, object.align-left {
165 clear: left ;
166 float: left ;
167 margin-right: 1em }
168
169img.align-right, .figure.align-right, object.align-right {
170 clear: right ;
171 float: right ;
172 margin-left: 1em }
173
174img.align-center, .figure.align-center, object.align-center {
175 display: block;
176 margin-left: auto;
177 margin-right: auto;
178}
179
180.align-left {
181 text-align: left }
182
183.align-center {
184 clear: both ;
185 text-align: center }
186
187.align-right {
188 text-align: right }
189
190/* reset inner alignment in figures */
191div.align-right {
192 text-align: inherit }
193
194/* div.align-center * { */
195/* text-align: left } */
196
197ol.simple, ul.simple {
198 margin-bottom: 1em }
199
200ol.arabic {
201 list-style: decimal }
202
203ol.loweralpha {
204 list-style: lower-alpha }
205
206ol.upperalpha {
207 list-style: upper-alpha }
208
209ol.lowerroman {
210 list-style: lower-roman }
211
212ol.upperroman {
213 list-style: upper-roman }
214
215p.attribution {
216 text-align: right ;
217 margin-left: 50% }
218
219p.caption {
220 font-style: italic }
221
222p.credits {
223 font-style: italic ;
224 font-size: smaller }
225
226p.label {
227 white-space: nowrap }
228
229p.rubric {
230 font-weight: bold ;
231 font-size: larger ;
232 color: maroon ;
233 text-align: center }
234
235p.sidebar-title {
236 font-family: sans-serif ;
237 font-weight: bold ;
238 font-size: larger }
239
240p.sidebar-subtitle {
241 font-family: sans-serif ;
242 font-weight: bold }
243
244p.topic-title {
245 font-weight: bold }
246
247pre.address {
248 margin-bottom: 0 ;
249 margin-top: 0 ;
250 font: inherit }
251
252pre.literal-block, pre.doctest-block, pre.math, pre.code {
253 margin-left: 2em ;
254 margin-right: 2em }
255
256pre.code .ln { color: grey; } /* line numbers */
257pre.code, code { background-color: #eeeeee }
258pre.code .comment, code .comment { color: #5C6576 }
259pre.code .keyword, code .keyword { color: #3B0D06; font-weight: bold }
260pre.code .literal.string, code .literal.string { color: #0C5404 }
261pre.code .name.builtin, code .name.builtin { color: #352B84 }
262pre.code .deleted, code .deleted { background-color: #DEB0A1}
263pre.code .inserted, code .inserted { background-color: #A3D289}
264
265span.classifier {
266 font-family: sans-serif ;
267 font-style: oblique }
268
269span.classifier-delimiter {
270 font-family: sans-serif ;
271 font-weight: bold }
272
273span.interpreted {
274 font-family: sans-serif }
275
276span.option {
277 white-space: nowrap }
278
279span.pre {
280 white-space: pre }
281
282span.problematic {
283 color: red }
284
285span.section-subtitle {
286 /* font-size relative to parent (h1..h6 element) */
287 font-size: 80% }
288
289table.citation {
290 border-left: solid 1px gray;
291 margin-left: 1px }
292
293table.docinfo {
294 margin: 2em 4em }
295
296table.docutils {
297 margin-top: 0.5em ;
298 margin-bottom: 0.5em }
299
300table.footnote {
301 border-left: solid 1px black;
302 margin-left: 1px }
303
304table.docutils td, table.docutils th,
305table.docinfo td, table.docinfo th {
306 padding-left: 0.5em ;
307 padding-right: 0.5em ;
308 vertical-align: top }
309
310table.docutils th.field-name, table.docinfo th.docinfo-name {
311 font-weight: bold ;
312 text-align: left ;
313 white-space: nowrap ;
314 padding-left: 0 }
315
316/* "booktabs" style (no vertical lines) */
317table.docutils.booktabs {
318 border: 0px;
319 border-top: 2px solid;
320 border-bottom: 2px solid;
321 border-collapse: collapse;
322}
323table.docutils.booktabs * {
324 border: 0px;
325}
326table.docutils.booktabs th {
327 border-bottom: thin solid;
328 text-align: left;
329}
330
331h1 tt.docutils, h2 tt.docutils, h3 tt.docutils,
332h4 tt.docutils, h5 tt.docutils, h6 tt.docutils {
333 font-size: 100% }
334
335ul.auto-toc {
336 list-style-type: none }
337
e3f82424
MW
338</style>
339<style type="text/css">
340
341
342h1, h2, h3, h4, h5, h6 {
343 font-family: sans-serif;
344 font-size: 100%;
345 background-color: #dcdcdc;
346}
347
348h1.title {
349 background-color: gray;
350 color: white
351}
352
353table.footnote {
354 padding-left: 0.5ex;
355}
356
357table.citation {
358 padding-left: 0.5ex
359}
360
361td.label {
362 width: 10%;
363}
364
365table, table.docutils, td, th {
366 border: 0;
367}
368
369table.citation, table.footnote {
370 width: 100%;
371}
372
373th {
374 background-color: lavender ;
375}
376
377tr:nth-child(even) {
378 xxbackground-color: aliceblue;
379 background-color: white;
380}
381tr:nth-child(odd) {
382 xxbackground-color: lavender;
383 background-color: whitesmoke;
384}
385
386
387
ecf590ae
MW
388</style>
389</head>
390<body>
391<div class="document" id="lbm-benchmark-kernels-documentation">
392<h1 class="title">LBM Benchmark Kernels Documentation</h1>
393
394<!-- # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
395#
396# Copyright
397# Markus Wittmann, 2016-2017
398# RRZE, University of Erlangen-Nuremberg, Germany
399# markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
400#
401# Viktor Haag, 2016
402# LSS, University of Erlangen-Nuremberg, Germany
403#
404# This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
405#
406# LbmBenchKernels is free software: you can redistribute it and/or modify
407# it under the terms of the GNU General Public License as published by
408# the Free Software Foundation, either version 3 of the License, or
409# (at your option) any later version.
410#
411# LbmBenchKernels is distributed in the hope that it will be useful,
412# but WITHOUT ANY WARRANTY; without even the implied warranty of
413# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
414# GNU General Public License for more details.
415#
416# You should have received a copy of the GNU General Public License
417# along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>.
418#
419# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
420<div class="contents topic" id="contents">
421<p class="topic-title first">Contents</p>
422<ul class="auto-toc simple">
0095f461
MW
423<li><a class="reference internal" href="#introduction" id="id5">1&nbsp;&nbsp;&nbsp;Introduction</a></li>
424<li><a class="reference internal" href="#compilation" id="id6">2&nbsp;&nbsp;&nbsp;Compilation</a><ul class="auto-toc">
425<li><a class="reference internal" href="#debug-and-verification" id="id7">2.1&nbsp;&nbsp;&nbsp;Debug and Verification</a></li>
426<li><a class="reference internal" href="#release-and-verification" id="id8">2.2&nbsp;&nbsp;&nbsp;Release and Verification</a></li>
427<li><a class="reference internal" href="#benchmarking" id="id9">2.3&nbsp;&nbsp;&nbsp;Benchmarking</a></li>
428<li><a class="reference internal" href="#compilers" id="id10">2.4&nbsp;&nbsp;&nbsp;Compilers</a></li>
429<li><a class="reference internal" href="#cleaning" id="id11">2.5&nbsp;&nbsp;&nbsp;Cleaning</a></li>
430<li><a class="reference internal" href="#options-summary" id="id12">2.6&nbsp;&nbsp;&nbsp;Options Summary</a></li>
ecf590ae
MW
431</ul>
432</li>
0095f461
MW
433<li><a class="reference internal" href="#invocation" id="id13">3&nbsp;&nbsp;&nbsp;Invocation</a><ul class="auto-toc">
434<li><a class="reference internal" href="#command-line-parameters" id="id14">3.1&nbsp;&nbsp;&nbsp;Command Line Parameters</a></li>
435<li><a class="reference internal" href="#kernels" id="id15">3.2&nbsp;&nbsp;&nbsp;Kernels</a></li>
ecf590ae
MW
436</ul>
437</li>
0095f461
MW
438<li><a class="reference internal" href="#id2" id="id16">4&nbsp;&nbsp;&nbsp;Benchmarking</a><ul class="auto-toc">
439<li><a class="reference internal" href="#intel-compiler" id="id17">4.1&nbsp;&nbsp;&nbsp;Intel Compiler</a></li>
440<li><a class="reference internal" href="#pinning" id="id18">4.2&nbsp;&nbsp;&nbsp;Pinning</a></li>
441<li><a class="reference internal" href="#general-remarks" id="id19">4.3&nbsp;&nbsp;&nbsp;General Remarks</a></li>
442<li><a class="reference internal" href="#padding" id="id20">4.4&nbsp;&nbsp;&nbsp;Padding</a></li>
e3f82424
MW
443</ul>
444</li>
0095f461
MW
445<li><a class="reference internal" href="#geometries" id="id21">5&nbsp;&nbsp;&nbsp;Geometries</a></li>
446<li><a class="reference internal" href="#performance-results" id="id22">6&nbsp;&nbsp;&nbsp;Performance Results</a><ul class="auto-toc">
447<li><a class="reference internal" href="#haswell-intel-xeon-e5-2695-v3" id="id23">6.1&nbsp;&nbsp;&nbsp;Haswell, Intel Xeon E5-2695 v3</a></li>
448<li><a class="reference internal" href="#broadwell-intel-xeon-e5-2630-v4" id="id24">6.2&nbsp;&nbsp;&nbsp;Broadwell, Intel Xeon E5-2630 v4</a></li>
449<li><a class="reference internal" href="#skylake-intel-xeon-gold-6148" id="id25">6.3&nbsp;&nbsp;&nbsp;Skylake, Intel Xeon Gold 6148</a></li>
450</ul>
451</li>
452<li><a class="reference internal" href="#licence" id="id26">7&nbsp;&nbsp;&nbsp;Licence</a></li>
453<li><a class="reference internal" href="#acknowledgements" id="id27">8&nbsp;&nbsp;&nbsp;Acknowledgements</a></li>
454<li><a class="reference internal" href="#bibliography" id="id28">9&nbsp;&nbsp;&nbsp;Bibliography</a></li>
ecf590ae
MW
455</ul>
456</div>
0095f461
MW
457<div class="section" id="introduction">
458<h1><a class="toc-backref" href="#id5">1&nbsp;&nbsp;&nbsp;Introduction</a></h1>
459<p>The lattice Boltzmann (LBM) benchmark kernels are a collection of LBM kernel
460implementations.</p>
461<p><strong>AS SUCH THE LBM BENCHMARK KERNELS ARE NOT A FULLY EQUIPPED CFD SOLVER AND SOLELY
462SERVE THE PURPOSE OF STUDYING POSSIBLE PERFORMANCE OPTIMIZATIONS AND/OR
463EXPERIMENTS.</strong></p>
464<p>Currently all kernels utilize a D3Q19 discretization and the
465two-relaxation-time (TRT) collision operator <a class="citation-reference" href="#ginzburg-2008" id="id1">[ginzburg-2008]</a>.
466All operations are carried out in double precision arithmetic.</p>
467</div>
ecf590ae 468<div class="section" id="compilation">
0095f461 469<h1><a class="toc-backref" href="#id6">2&nbsp;&nbsp;&nbsp;Compilation</a></h1>
ecf590ae
MW
470<p>The benchmark framework currently supports only Linux systems and the GCC and
471Intel compilers. Every other configuration probably requires adjustment inside
0095f461 472the code and the makefiles. Furthermore some code might be platform or at least
ecf590ae
MW
473POSIX specific.</p>
474<p>The benchmark can be built via <tt class="docutils literal">make</tt> from the <tt class="docutils literal">src</tt> subdirectory. This will
475generate one binary which hosts all implemented benchmark kernels.</p>
476<p>Binaries are located under the <tt class="docutils literal">bin</tt> subdirectory and will have different names
477depending on compiler and build configuration.</p>
0095f461
MW
478<p>Compilation can target debug or release builds. Combined with both build types
479verification can be enabled, which increases the runtime and hence is not
480suited for benchmarking.</p>
ecf590ae 481<div class="section" id="debug-and-verification">
0095f461 482<h2><a class="toc-backref" href="#id7">2.1&nbsp;&nbsp;&nbsp;Debug and Verification</a></h2>
ecf590ae 483<pre class="literal-block">
e3f82424 484make BUILD=debug BENCHMARK=off
ecf590ae 485</pre>
e3f82424 486<p>Running <tt class="docutils literal">make</tt> with <tt class="docutils literal">BUILD=debug</tt> builds the debug version of
ecf590ae
MW
487the benchmark kernels, where no optimizations are performed, line numbers and
488debug symbols are included, and <tt class="docutils literal">DEBUG</tt> is defined. The resulting
489binary will be found in the <tt class="docutils literal">bin</tt> subdirectory and named
490<tt class="docutils literal"><span class="pre">lbmbenchk-linux-&lt;compiler&gt;-debug</span></tt>.</p>
e3f82424
MW
491<p>Specifying <tt class="docutils literal">BENCHMARK=off</tt> turns on verification
492(<tt class="docutils literal">VERIFICATION=on</tt>), statistics (<tt class="docutils literal">STATISTICS=on</tt>), and VTK output
ecf590ae
MW
493(<tt class="docutils literal">VTK_OUTPUT=on</tt>).</p>
494<p>Please note that the generated binary will therefore
495exhibit a poor performance.</p>
496</div>
0095f461
MW
497<div class="section" id="release-and-verification">
498<h2><a class="toc-backref" href="#id8">2.2&nbsp;&nbsp;&nbsp;Release and Verification</a></h2>
499<p>Verification with the debug builds can be extremely slow. Hence verification
500capabilities can be built with release builds:</p>
501<pre class="literal-block">
502make BENCHMARK=off
503</pre>
504</div>
ecf590ae 505<div class="section" id="benchmarking">
0095f461 506<h2><a class="toc-backref" href="#id9">2.3&nbsp;&nbsp;&nbsp;Benchmarking</a></h2>
ecf590ae
MW
507<p>To generate a binary for benchmarking run make with</p>
508<pre class="literal-block">
e3f82424 509make
ecf590ae 510</pre>
e3f82424 511<p>As default <tt class="docutils literal">BENCHMARK=on</tt> and <tt class="docutils literal">BUILD=release</tt> is set, where
0095f461 512<tt class="docutils literal">BUILD=release</tt> turns optimizations on and <tt class="docutils literal">BENCHMARK=on</tt> disables
ecf590ae 513verification, statistics, and VTK output.</p>
0095f461
MW
514<p>See Options Summary below for further description of options which can be
515applied, e.g. TARCH as well as the Benchmarking section.</p>
ecf590ae
MW
516</div>
517<div class="section" id="compilers">
0095f461 518<h2><a class="toc-backref" href="#id10">2.4&nbsp;&nbsp;&nbsp;Compilers</a></h2>
ecf590ae
MW
519<p>Currently only the GCC and Intel compilers under Linux are supported. Between
520both configurations can be chosen via <tt class="docutils literal"><span class="pre">CONFIG=linux-gcc</span></tt> or
521<tt class="docutils literal"><span class="pre">CONFIG=linux-intel</span></tt>.</p>
522</div>
e3f82424 523<div class="section" id="cleaning">
0095f461 524<h2><a class="toc-backref" href="#id11">2.5&nbsp;&nbsp;&nbsp;Cleaning</a></h2>
e3f82424
MW
525<p>For each configuration and build (debug/release) a subdirectory under the
526<tt class="docutils literal">src/obj</tt> directory is created where the dependency and object files are
527stored.
528With</p>
529<pre class="literal-block">
530make CONFIG=... BUILD=... clean
531</pre>
532<p>a specific combination is selected and cleaned, whereas with</p>
533<pre class="literal-block">
534make clean-all
535</pre>
536<p>all object and dependency files are deleted.</p>
537</div>
ecf590ae 538<div class="section" id="options-summary">
0095f461
MW
539<h2><a class="toc-backref" href="#id12">2.6&nbsp;&nbsp;&nbsp;Options Summary</a></h2>
540<p>Options that can be specified when building the suite with make:</p>
ecf590ae
MW
541<table border="1" class="docutils">
542<colgroup>
ecf590ae 543<col width="7%" />
0095f461
MW
544<col width="12%" />
545<col width="6%" />
546<col width="75%" />
ecf590ae 547</colgroup>
0095f461
MW
548<thead valign="bottom">
549<tr><th class="head">name</th>
550<th class="head">values</th>
551<th class="head">default</th>
552<th class="head">description</th>
ecf590ae 553</tr>
0095f461
MW
554</thead>
555<tbody valign="top">
ecf590ae
MW
556<tr><td>BENCHMARK</td>
557<td>on, off</td>
e3f82424
MW
558<td>on</td>
559<td>If enabled, disables VERIFICATION, STATISTICS, VTK_OUTPUT. If disabled enables the three former options.</td>
ecf590ae
MW
560</tr>
561<tr><td>BUILD</td>
562<td>debug, release</td>
e3f82424 563<td>release</td>
0095f461 564<td>debug: no optimization, debug symbols, DEBUG defined. release: optimizations enabled.</td>
ecf590ae
MW
565</tr>
566<tr><td>CONFIG</td>
567<td>linux-gcc, linux-intel</td>
568<td>linux-intel</td>
569<td>Select GCC or Intel compiler.</td>
570</tr>
571<tr><td>ISA</td>
572<td>avx, sse</td>
573<td>avx</td>
0095f461 574<td>Determines which ISA extension is used for macro definitions of the intrinsics. This is <em>not</em> the architecture the compiler generates code for.</td>
ecf590ae
MW
575</tr>
576<tr><td>OPENMP</td>
577<td>on, off</td>
578<td>on</td>
579<td>OpenMP, i.e. threading support.</td>
580</tr>
581<tr><td>STATISTICS</td>
582<td>on, off</td>
583<td>off</td>
584<td>View statistics, like density etc, during simulation.</td>
585</tr>
e3f82424
MW
586<tr><td>TARCH</td>
587<td>--</td>
588<td>--</td>
589<td>Via TARCH the architecture the compiler generates code for can be overridden. The value depends on the chosen compiler.</td>
590</tr>
ecf590ae
MW
591<tr><td>VERIFICATION</td>
592<td>on, off</td>
593<td>off</td>
594<td>Turn verification on/off.</td>
595</tr>
596<tr><td>VTK_OUTPUT</td>
597<td>on, off</td>
598<td>off</td>
599<td>Enable/Disable VTK file output.</td>
600</tr>
601</tbody>
602</table>
603</div>
604</div>
605<div class="section" id="invocation">
0095f461 606<h1><a class="toc-backref" href="#id13">3&nbsp;&nbsp;&nbsp;Invocation</a></h1>
ecf590ae 607<p>Running the binary will print, along with the GPL licence header, a line like the following:</p>
e3f82424
MW
608<pre class="literal-block">
609LBM Benchmark Kernels 0.1, compiled Jul 5 2017 21:59:22, type: verification
610</pre>
ecf590ae 611<p>if verification was enabled during compilation or</p>
e3f82424
MW
612<pre class="literal-block">
613LBM Benchmark Kernels 0.1, compiled Jul 5 2017 21:59:22, type: benchmark
614</pre>
ecf590ae
MW
615<p>if verification was disabled during compilation.</p>
616<div class="section" id="command-line-parameters">
0095f461 617<h2><a class="toc-backref" href="#id14">3.1&nbsp;&nbsp;&nbsp;Command Line Parameters</a></h2>
ecf590ae
MW
618<p>Running the binary with <tt class="docutils literal"><span class="pre">-h</span></tt> lists all available parameters:</p>
619<pre class="literal-block">
620Usage:
621./lbmbenchk -list
622./lbmbenchk
623 [-dims XxYyZ] [-geometry box|channel|pipe|blocks[-&lt;block size&gt;]] [-iterations &lt;iterations&gt;] [-lattice-dump-ascii]
624 [-rho-in &lt;density&gt;] [-rho-out &lt;density] [-omega &lt;omega&gt;] [-kernel &lt;kernel&gt;]
625 [-periodic-x]
626 [-t &lt;number of threads&gt;]
627 [-pin core{,core}*]
628 [-verify]
629 -- &lt;kernel specific parameters&gt;
630
631-list List available kernels.
632
633-dims XxYxZ Specify geometry dimensions.
634
635-geometry blocks-&lt;block size&gt;
636 Geometetry with blocks of size &lt;block size&gt; regularily layout out.
637</pre>
638<p>If an option is specified multiple times the last one overrides previous ones.
639This holds also true for <tt class="docutils literal"><span class="pre">-verify</span></tt> which sets geometry dimensions,
640iterations, etc, which can afterward be overridden, e.g.:</p>
641<pre class="literal-block">
642$ bin/lbmbenchk-linux-intel-release -verify -dims 32x32x32
643</pre>
0095f461 644<p>Kernel specific parameters can be obtained via selecting the specific kernel
ecf590ae
MW
645and passing <tt class="docutils literal"><span class="pre">-h</span></tt> as parameter:</p>
646<pre class="literal-block">
e3f82424 647$ bin/lbmbenchk-linux-intel-release -kernel kernel-name -- -h
ecf590ae
MW
648...
649Kernel parameters:
650[-blk &lt;n&gt;] [-blk-[xyz] &lt;n&gt;]
651</pre>
652<p>A list of all available kernels can be obtained via <tt class="docutils literal"><span class="pre">-list</span></tt>:</p>
653<pre class="literal-block">
654$ ../bin/lbmbenchk-linux-gcc-debug -list
655Lattice Boltzmann Benchmark Kernels (LbmBenchKernels) Copyright (C) 2016, 2017 LSS, RRZE
656This program comes with ABSOLUTELY NO WARRANTY; for details see LICENSE.
657This is free software, and you are welcome to redistribute it under certain conditions.
658
659LBM Benchmark Kernels 0.1, compiled Jul 5 2017 21:59:22, type: verification
660Available kernels to benchmark:
661 list-aa-pv-soa
662 list-aa-ria-soa
663 list-aa-soa
664 list-aa-aos
665 list-pull-split-nt-1s-soa
666 list-pull-split-nt-2s-soa
667 list-push-soa
668 list-push-aos
669 list-pull-soa
670 list-pull-aos
671 push-soa
672 push-aos
673 pull-soa
674 pull-aos
675 blk-push-soa
676 blk-push-aos
677 blk-pull-soa
678 blk-pull-aos
679</pre>
680</div>
e3f82424 681<div class="section" id="kernels">
0095f461 682<h2><a class="toc-backref" href="#id15">3.2&nbsp;&nbsp;&nbsp;Kernels</a></h2>
e3f82424
MW
683<p>The following list briefly describes the available kernels:</p>
684<ul class="simple">
685<li>push-soa/push-aos/pull-soa/pull-aos:
686Unoptimized kernels (but stream/collide are already fused) using two grids as
687source and destination. Implement push/pull semantics as well structure of
688arrays (soa) or array of structures (aos) layout.</li>
689<li>blk-push-soa/blk-push-aos/blk-pull-soa/blk-pull-aos:
690The same as the unoptimized kernels without the blk prefix, except that they support
691spatial blocking, i.e. loop blocking of the three loops used to iterate over
692the lattice. Here manual work sharing for OpenMP is used.</li>
693<li>list-push-soa/list-push-aos/list-pull-soa/list-pull-aos:
694The same as the unoptimized kernels without the list prefix, but for indirect addressing.
695Here only a 1D vector is used to store the fluid nodes, omitting the
696obstacles. An adjacency list is used to recover the neighborhood associations.</li>
697<li>list-pull-split-nt-1s-soa/list-pull-split-nt-2s-soa:
698Optimized variant of list-pull-soa. Chunks of the lattice are processed as
699once. Postcollision values are written back via nontemporal stores in 18 (1s)
700or 9 (2s) loops.</li>
701<li>list-aa-aos/list-aa-soa:
702Unoptimized implementation of the AA pattern for the 1D vector with adjacency
703list. Supported are array of structures (aos) and structure of arrays (soa)
704data layouts.</li>
705<li>list-aa-ria-soa:
706Implementation of AA pattern with intrinsics for the 1D vector with adjacency
707list. Furthermore it contains a vectorized even time step and run length
708coding to reduce the loop balance of the odd time step.</li>
709<li>list-aa-pv-soa:
710All optimizations of list-aa-ria-soa. Additionally with partial vectorization
711of the odd time step.</li>
712</ul>
713<p>Note that all array of structures (aos) kernels might require blocking
714(depending on the domain size) to reach the performance of their structure of
715arrays (soa) counterparts.</p>
716<p>The following table summarizes the properties of the kernels. Here <strong>D</strong> means
717direct addressing, i.e. full array, <strong>I</strong> means indirect addressing, i.e. 1D
718vector with adjacency list, <strong>x</strong> means supported, whereas <strong>--</strong> means unsupported.
719The loop balance B_l is computed for D3Q19 model with double precision floating
720point for PDFs (8 byte) and 4 byte integers for the index (adjacency list).
721As list-aa-ria-soa and list-aa-pv-soa support run length coding their effective
722loop balance depends on the geometry. The effective loop balance is printed
723during each run.</p>
724<table border="1" class="docutils">
725<colgroup>
726<col width="29%" />
727<col width="14%" />
728<col width="14%" />
729<col width="6%" />
730<col width="10%" />
731<col width="10%" />
732<col width="16%" />
733</colgroup>
734<thead valign="bottom">
735<tr><th class="head">kernel name</th>
736<th class="head">prop. step</th>
737<th class="head">data layout</th>
738<th class="head">addr.</th>
739<th class="head">parallel</th>
740<th class="head">blocking</th>
741<th class="head">B_l [B/FLUP]</th>
742</tr>
743</thead>
744<tbody valign="top">
745<tr><td>push-soa</td>
746<td>OS</td>
747<td>SoA</td>
748<td>D</td>
749<td>x</td>
750<td>--</td>
751<td>456</td>
752</tr>
753<tr><td>push-aos</td>
754<td>OS</td>
755<td>AoS</td>
756<td>D</td>
757<td>x</td>
758<td>--</td>
759<td>456</td>
760</tr>
761<tr><td>pull-soa</td>
762<td>OS</td>
763<td>SoA</td>
764<td>D</td>
765<td>x</td>
766<td>--</td>
767<td>456</td>
768</tr>
769<tr><td>pull-aos</td>
770<td>OS</td>
771<td>AoS</td>
772<td>D</td>
773<td>x</td>
774<td>--</td>
775<td>456</td>
776</tr>
777<tr><td>blk-push-soa</td>
778<td>OS</td>
779<td>SoA</td>
780<td>D</td>
781<td>x</td>
782<td>x</td>
783<td>456</td>
784</tr>
785<tr><td>blk-push-aos</td>
786<td>OS</td>
787<td>AoS</td>
788<td>D</td>
789<td>x</td>
790<td>x</td>
791<td>456</td>
792</tr>
793<tr><td>blk-pull-soa</td>
794<td>OS</td>
795<td>SoA</td>
796<td>D</td>
797<td>x</td>
798<td>x</td>
799<td>456</td>
800</tr>
801<tr><td>blk-pull-aos</td>
802<td>OS</td>
803<td>AoS</td>
804<td>D</td>
805<td>x</td>
806<td>x</td>
807<td>456</td>
808</tr>
809<tr><td>list-push-soa</td>
810<td>OS</td>
811<td>SoA</td>
812<td>I</td>
813<td>x</td>
814<td>x</td>
815<td>528</td>
816</tr>
817<tr><td>list-push-aos</td>
818<td>OS</td>
819<td>AoS</td>
820<td>I</td>
821<td>x</td>
822<td>x</td>
823<td>528</td>
824</tr>
825<tr><td>list-pull-soa</td>
826<td>OS</td>
827<td>SoA</td>
828<td>I</td>
829<td>x</td>
830<td>x</td>
831<td>528</td>
832</tr>
833<tr><td>list-pull-aos</td>
834<td>OS</td>
835<td>AoS</td>
836<td>I</td>
837<td>x</td>
838<td>x</td>
839<td>528</td>
840</tr>
841<tr><td>list-pull-split-nt-1s</td>
842<td>OS</td>
843<td>SoA</td>
844<td>I</td>
845<td>x</td>
846<td>x</td>
847<td>376</td>
848</tr>
849<tr><td>list-pull-split-nt-2s</td>
850<td>OS</td>
851<td>SoA</td>
852<td>I</td>
853<td>x</td>
854<td>x</td>
855<td>376</td>
856</tr>
857<tr><td>list-aa-soa</td>
858<td>AA</td>
859<td>SoA</td>
860<td>I</td>
861<td>x</td>
862<td>x</td>
863<td>340</td>
864</tr>
865<tr><td>list-aa-aos</td>
866<td>AA</td>
867<td>AoS</td>
868<td>I</td>
869<td>x</td>
870<td>x</td>
871<td>340</td>
872</tr>
873<tr><td>list-aa-ria-soa</td>
874<td>AA</td>
875<td>SoA</td>
876<td>I</td>
877<td>x</td>
878<td>x</td>
879<td>304-342</td>
880</tr>
881<tr><td>list-aa-pv-soa</td>
882<td>AA</td>
883<td>SoA</td>
884<td>I</td>
885<td>x</td>
886<td>x</td>
887<td>304-342</td>
888</tr>
889</tbody>
890</table>
891</div>
ecf590ae 892</div>
0095f461
MW
893<div class="section" id="id2">
894<h1><a class="toc-backref" href="#id16">4&nbsp;&nbsp;&nbsp;Benchmarking</a></h1>
ecf590ae
MW
895<p>Correct benchmarking is a nontrivial task. Whenever benchmark results should be
896created make sure the binary was compiled with:</p>
897<ul class="simple">
e3f82424
MW
898<li><tt class="docutils literal">BENCHMARK=on</tt> (default if not overridden) and</li>
899<li><tt class="docutils literal">BUILD=release</tt> (default if not overridden) and</li>
ecf590ae
MW
900<li>the correct ISA for macros is used, selected via <tt class="docutils literal">ISA</tt> and</li>
901<li>use <tt class="docutils literal">TARCH</tt> to specify the architecture the compiler generates code for.</li>
902</ul>
0095f461
MW
903<div class="section" id="intel-compiler">
904<h2><a class="toc-backref" href="#id17">4.1&nbsp;&nbsp;&nbsp;Intel Compiler</a></h2>
905<p>For the Intel compiler one can specify depending on the target ISA extension:</p>
906<ul class="simple">
907<li>AVX: <tt class="docutils literal"><span class="pre">TARCH=-xAVX</span></tt></li>
908<li>AVX2 and FMA: <tt class="docutils literal"><span class="pre">TARCH=-xCORE-AVX2,-fma</span></tt></li>
909<li>AVX512: <tt class="docutils literal"><span class="pre">TARCH=-xCORE-AVX512</span></tt></li>
910<li>KNL: <tt class="docutils literal"><span class="pre">TARCH=-xMIC-AVX512</span></tt></li>
911</ul>
912<p>Compiling for an architecture supporting AVX (Sandy Bridge, Ivy Bridge):</p>
913<pre class="literal-block">
914make ISA=avx TARCH=-xAVX
915</pre>
916<p>Compiling for an architecture supporting AVX2 (Haswell, Broadwell):</p>
917<pre class="literal-block">
918make ISA=avx TARCH=-xCORE-AVX2,-fma
919</pre>
920<p>WARNING: ISA is here still set to <tt class="docutils literal">avx</tt> as currently the FMA intrinsics are not
921implemented. This might change in the future.</p>
922<p>Compiling for an architecture supporting AVX-512 (Skylake):</p>
923<pre class="literal-block">
924make ISA=avx TARCH=-xCORE-AVX512
925</pre>
926<p>WARNING: ISA is here still set to <tt class="docutils literal">avx</tt> as currently we have no implementation for the
927AVX512 intrinsics. This might change in the future.</p>
928</div>
929<div class="section" id="pinning">
930<h2><a class="toc-backref" href="#id18">4.2&nbsp;&nbsp;&nbsp;Pinning</a></h2>
ecf590ae 931<p>During benchmarking pinning should be used via the <tt class="docutils literal"><span class="pre">-pin</span></tt> parameter. Running
0095f461 932a benchmark with 10 threads and pinning them to the first 10 cores works like</p>
ecf590ae
MW
933<pre class="literal-block">
934$ bin/lbmbenchk-linux-intel-release ... -t 10 -pin $(seq -s , 0 9)
935</pre>
0095f461
MW
936</div>
937<div class="section" id="general-remarks">
938<h2><a class="toc-backref" href="#id19">4.3&nbsp;&nbsp;&nbsp;General Remarks</a></h2>
939<p>Things the binary does not check or control:</p>
ecf590ae
MW
940<ul class="simple">
941<li>transparent huge pages: when allocating memory small 4 KiB pages might be
942replaced with larger ones. This is in general a good thing, but if this is
e3f82424
MW
943really the case, depends on the system settings (check e.g. the status of
944<tt class="docutils literal">/sys/kernel/mm/transparent_hugepage/enabled</tt>).
945Currently <tt class="docutils literal">madvise(MADV_HUGEPAGE)</tt> is used for allocations which are aligned to
946a 4 KiB page, which should be the case for the lattices.
947This should result in huge pages unless THP is disabled on the machine.
948(NOTE: madvise() is used if <tt class="docutils literal">HAVE_HUGE_PAGES</tt> is defined, which is currently
949hard-coded in <tt class="docutils literal">Memory.c</tt>).</li>
ecf590ae
MW
950<li>CPU/core frequency: For reproducible results the frequency of all cores
951should be fixed.</li>
952<li>NUMA placement policy: The benchmark assumes a first touch policy, which
953means the memory will be placed at the NUMA domain the touching core is
954associated with. If a different policy is in place or the NUMA domain to be
955used is already full memory might be allocated in a remote domain. Accesses
956to remote domains typically have a higher latency and lower bandwidth.</li>
0095f461 957<li>System load: interference with other applications, especially on desktop
ecf590ae 958systems should be avoided.</li>
e3f82424
MW
959<li>Padding: For SoA based kernels the number of (fluid) nodes is automatically
960adjusted so that no cache or TLB thrashing should occur. The parameters are
961optimized for current Intel based systems. For more details look into the
962padding section.</li>
ecf590ae
MW
963<li>CPU dispatcher function: the compiler might add different versions of a
964function for different ISA extensions. Make sure the code you might think is
965executed is actually the code which is executed.</li>
966</ul>
0095f461 967</div>
e3f82424 968<div class="section" id="padding">
0095f461 969<h2><a class="toc-backref" href="#id20">4.4&nbsp;&nbsp;&nbsp;Padding</a></h2>
e3f82424
MW
970<p>With correct padding cache and TLB thrashing can be avoided. Therefore the
971number of (fluid) nodes used in the data layout is artificially increased.</p>
972<p>Currently automatic padding is active for kernels which support it. It can be
973controlled via the kernel parameter (i.e. parameter after the <tt class="docutils literal"><span class="pre">--</span></tt>)
974<tt class="docutils literal"><span class="pre">-pad</span></tt>. Supported values are <tt class="docutils literal">auto</tt> (default), <tt class="docutils literal">no</tt> (to disable padding),
975or a manual padding.</p>
976<p>Automatic padding tries to avoid cache and TLB thrashing and pads for a 32
977entry (huge pages) TLB with 8 sets and a 512 set (L2) cache. This reflects the
978parameters of current Intel based processors.</p>
979<p>Manual padding is done via a padding string and has the format
980<tt class="docutils literal"><span class="pre">mod_1+offset_1(,mod_n+offset_n)</span></tt>, which specifies numbers of bytes.
981SoA data layouts can exhibit TLB thrashing. Therefore we want to distribute the
98219 pages with one lattice (36 with two lattices) we are concurrently accessing
983over as many sets in the TLB as possible.
984This is controlled by the distance between the accessed pages, which is the
985number of (fluid) nodes in between them and can be adjusted by adding further
986(fluid) nodes.
987We want the distance d (in bytes) between two accessed pages to be e.g.
988<strong>d % (PAGE_SIZE * TLB_SETS) = PAGE_SIZE</strong>.
989This would distribute the pages evenly over the sets. Hereby <strong>PAGE_SIZE * TLB_SETS</strong>
990would be our <tt class="docutils literal">mod_1</tt> and <strong>PAGE_SIZE</strong> (after the =) our <tt class="docutils literal">offset_1</tt>.
991Measurements show that with only a quarter or half of a page size as offset
992higher performance is achieved, which is done by automatic padding.
993On top of this padding more paddings can be added. They are just added to the
994padding string and are separated by commas.</p>
995<p>A zero modulus in the padding string has a special meaning. Here the
996corresponding offset is just added to the number of nodes. A padding string
997like <tt class="docutils literal"><span class="pre">-pad</span> 0+16</tt> would add a static padding of two nodes (one node = 8 bytes).</p>
998</div>
999</div>
1000<div class="section" id="geometries">
0095f461
MW
1001<h1><a class="toc-backref" href="#id21">5&nbsp;&nbsp;&nbsp;Geometries</a></h1>
1002<p>TODO: supported geometries: channel, pipe, blocks, fluid</p>
1003</div>
1004<div class="section" id="performance-results">
1005<h1><a class="toc-backref" href="#id22">6&nbsp;&nbsp;&nbsp;Performance Results</a></h1>
1006<p>This section lists performance values measured on several machines for
1007different kernels and geometries.
1008The <strong>RFM</strong> column denotes the expected performance as predicted by the
1009Roofline performance model <a class="citation-reference" href="#williams-2008" id="id3">[williams-2008]</a>.
1010For performance prediction of each kernel a memory bandwidth benchmark is used
1011which mimics the kernel's memory access pattern and the kernel's loop balance
1012(see <a class="citation-reference" href="#kernels" id="id4">[kernels]</a> for details).</p>
1013<div class="section" id="haswell-intel-xeon-e5-2695-v3">
1014<h2><a class="toc-backref" href="#id23">6.1&nbsp;&nbsp;&nbsp;Haswell, Intel Xeon E5-2695 v3</a></h2>
1015<ul class="simple">
1016<li>Haswell architecture, AVX2, FMA</li>
1017<li>14 cores, 2.3 GHz</li>
1018<li>2 x 7 cores in cluster-on-die (CoD) mode enabled</li>
1019<li>SMT enabled</li>
1020</ul>
1021<p>memory bandwidth:</p>
1022<ul class="simple">
1023<li>copy-19 47.3 GB/s</li>
1024<li>copy-19-nt-sl 47.1 GB/s</li>
1025<li>update-19 44.0 GB/s</li>
1026</ul>
1027<p>geometry dimensions: 500x100x100</p>
1028<table border="1" class="docutils">
1029<colgroup>
1030<col width="19%" />
1031<col width="7%" />
1032<col width="7%" />
1033<col width="7%" />
1034<col width="7%" />
1035<col width="7%" />
1036<col width="7%" />
1037<col width="7%" />
1038<col width="7%" />
1039<col width="7%" />
1040<col width="7%" />
1041<col width="7%" />
1042<col width="4%" />
1043</colgroup>
1044<thead valign="bottom">
1045<tr><th class="head">kernel</th>
1046<th class="head">pipe</th>
1047<th class="head">blocks-2</th>
1048<th class="head">blocks-4</th>
1049<th class="head">blocks-6</th>
1050<th class="head">blocks-8</th>
1051<th class="head">blocks-10</th>
1052<th class="head">blocks-15</th>
1053<th class="head">blocks-16</th>
1054<th class="head">blocks-20</th>
1055<th class="head">blocks-25</th>
1056<th class="head">blocks-32</th>
1057<th class="head">RFM</th>
1058</tr>
1059</thead>
1060<tbody valign="top">
1061<tr><td>blk-push-aos</td>
1062<td>58.82</td>
1063<td>49.85</td>
1064<td>57.34</td>
1065<td>59.90</td>
1066<td>61.37</td>
1067<td>62.17</td>
1068<td>65.30</td>
1069<td>64.00</td>
1070<td>67.54</td>
1071<td>64.46</td>
1072<td>69.69</td>
1073<td>104</td>
1074</tr>
1075<tr><td>blk-push-soa</td>
1076<td>32.32</td>
1077<td>33.46</td>
1078<td>34.02</td>
1079<td>34.64</td>
1080<td>35.06</td>
1081<td>35.04</td>
1082<td>36.31</td>
1083<td>35.44</td>
1084<td>37.20</td>
1085<td>35.14</td>
1086<td>37.95</td>
1087<td>104</td>
1088</tr>
1089<tr><td>blk-pull-aos</td>
1090<td>56.97</td>
1091<td>51.41</td>
1092<td>56.09</td>
1093<td>57.92</td>
1094<td>59.98</td>
1095<td>59.83</td>
1096<td>63.37</td>
1097<td>61.55</td>
1098<td>65.50</td>
1099<td>63.11</td>
1100<td>67.02</td>
1101<td>104</td>
1102</tr>
1103<tr><td>blk-pull-soa</td>
1104<td>49.29</td>
1105<td>46.23</td>
1106<td>47.50</td>
1107<td>51.97</td>
1108<td>51.27</td>
1109<td>49.52</td>
1110<td>55.23</td>
1111<td>53.13</td>
1112<td>54.50</td>
1113<td>49.79</td>
1114<td>57.90</td>
1115<td>104</td>
1116</tr>
1117<tr><td>aa-aos</td>
1118<td>91.35</td>
1119<td>66.14</td>
1120<td>76.80</td>
1121<td>84.76</td>
1122<td>83.63</td>
1123<td>91.36</td>
1124<td>93.46</td>
1125<td>92.62</td>
1126<td>93.91</td>
1127<td>92.25</td>
1128<td>92.93</td>
1129<td>145</td>
1130</tr>
1131<tr><td>aa-soa</td>
1132<td>75.51</td>
1133<td>65.68</td>
1134<td>70.94</td>
1135<td>71.36</td>
1136<td>73.83</td>
1137<td>75.46</td>
1138<td>74.84</td>
1139<td>79.48</td>
1140<td>83.28</td>
1141<td>77.70</td>
1142<td>82.72</td>
1143<td>145</td>
1144</tr>
1145<tr><td>aa-vec-soa</td>
1146<td>93.85</td>
1147<td>83.44</td>
1148<td>91.58</td>
1149<td>93.96</td>
1150<td>94.35</td>
1151<td>96.62</td>
1152<td>101.76</td>
1153<td>96.72</td>
1154<td>106.37</td>
1155<td>102.60</td>
1156<td>110.28</td>
1157<td>145</td>
1158</tr>
1159<tr><td>list-push-aos</td>
1160<td>80.29</td>
1161<td>80.97</td>
1162<td>80.95</td>
1163<td>81.10</td>
1164<td>81.37</td>
1165<td>82.44</td>
1166<td>81.77</td>
1167<td>81.49</td>
1168<td>80.72</td>
1169<td>81.93</td>
1170<td>80.93</td>
1171<td>83</td>
1172</tr>
1173<tr><td>list-push-soa</td>
1174<td>47.52</td>
1175<td>42.65</td>
1176<td>45.28</td>
1177<td>46.64</td>
1178<td>43.46</td>
1179<td>40.59</td>
1180<td>44.94</td>
1181<td>46.55</td>
1182<td>41.53</td>
1183<td>45.98</td>
1184<td>44.86</td>
1185<td>83</td>
1186</tr>
1187<tr><td>list-pull-aos</td>
1188<td>85.30</td>
1189<td>82.97</td>
1190<td>86.43</td>
1191<td>83.42</td>
1192<td>86.33</td>
1193<td>83.70</td>
1194<td>86.43</td>
1195<td>83.77</td>
1196<td>83.10</td>
1197<td>85.89</td>
1198<td>84.44</td>
1199<td>83</td>
1200</tr>
1201<tr><td>list-pull-soa</td>
1202<td>62.12</td>
1203<td>63.61</td>
1204<td>63.28</td>
1205<td>61.32</td>
1206<td>66.72</td>
1207<td>62.65</td>
1208<td>64.82</td>
1209<td>60.49</td>
1210<td>58.01</td>
1211<td>64.46</td>
1212<td>62.52</td>
1213<td>83</td>
1214</tr>
1215<tr><td>list-pull-split-nt-1s-soa</td>
1216<td>121.35</td>
1217<td>113.77</td>
1218<td>115.29</td>
1219<td>113.54</td>
1220<td>117.00</td>
1221<td>116.46</td>
1222<td>114.78</td>
1223<td>114.54</td>
1224<td>110.83</td>
1225<td>112.67</td>
1226<td>117.85</td>
1227<td>125</td>
1228</tr>
1229<tr><td>list-pull-split-nt-2s-soa</td>
1230<td>118.09</td>
1231<td>110.48</td>
1232<td>112.55</td>
1233<td>113.18</td>
1234<td>113.44</td>
1235<td>111.85</td>
1236<td>109.27</td>
1237<td>114.41</td>
1238<td>110.28</td>
1239<td>111.78</td>
1240<td>113.74</td>
1241<td>125</td>
1242</tr>
1243<tr><td>list-aa-aos</td>
1244<td>121.28</td>
1245<td>118.63</td>
1246<td>119.00</td>
1247<td>118.50</td>
1248<td>121.99</td>
1249<td>119.11</td>
1250<td>118.83</td>
1251<td>121.47</td>
1252<td>121.62</td>
1253<td>126.18</td>
1254<td>120.12</td>
1255<td>129</td>
1256</tr>
1257<tr><td>list-aa-soa</td>
1258<td>126.34</td>
1259<td>116.90</td>
1260<td>129.45</td>
1261<td>127.12</td>
1262<td>129.41</td>
1263<td>121.42</td>
1264<td>126.19</td>
1265<td>126.76</td>
1266<td>126.70</td>
1267<td>124.40</td>
1268<td>125.22</td>
1269<td>129</td>
1270</tr>
1271<tr><td>list-aa-ria-soa</td>
1272<td>133.68</td>
1273<td>121.82</td>
1274<td>126.04</td>
1275<td>128.46</td>
1276<td>131.15</td>
1277<td>132.25</td>
1278<td>128.78</td>
1279<td>133.50</td>
1280<td>126.69</td>
1281<td>124.40</td>
1282<td>130.37</td>
1283<td>145</td>
1284</tr>
1285<tr><td>list-aa-pv-soa</td>
1286<td>146.22</td>
1287<td>124.39</td>
1288<td>130.73</td>
1289<td>136.29</td>
1290<td>137.61</td>
1291<td>131.21</td>
1292<td>138.65</td>
1293<td>138.78</td>
1294<td>127.02</td>
1295<td>132.40</td>
1296<td>138.37</td>
1297<td>145</td>
1298</tr>
1299</tbody>
1300</table>
1301</div>
1302<div class="section" id="broadwell-intel-xeon-e5-2630-v4">
1303<h2><a class="toc-backref" href="#id24">6.2&nbsp;&nbsp;&nbsp;Broadwell, Intel Xeon E5-2630 v4</a></h2>
1304<ul class="simple">
1305<li>Broadwell architecture, AVX2, FMA</li>
1306<li>10 cores, 2.2 GHz</li>
1307<li>SMT disabled</li>
1308</ul>
1309<p>memory bandwidth:</p>
1310<ul class="simple">
1311<li>copy-19 48.0 GB/s</li>
1312<li>copy-19-nt-sl 48.2 GB/s</li>
1313<li>update-19 51.1 GB/s</li>
1314</ul>
1315<p>geometry dimensions: 500x100x100</p>
1316<table border="1" class="docutils">
1317<colgroup>
1318<col width="19%" />
1319<col width="7%" />
1320<col width="7%" />
1321<col width="7%" />
1322<col width="7%" />
1323<col width="7%" />
1324<col width="7%" />
1325<col width="7%" />
1326<col width="7%" />
1327<col width="7%" />
1328<col width="7%" />
1329<col width="7%" />
1330<col width="5%" />
1331</colgroup>
1332<thead valign="bottom">
1333<tr><th class="head">kernel</th>
1334<th class="head">pipe</th>
1335<th class="head">blocks-2</th>
1336<th class="head">blocks-4</th>
1337<th class="head">blocks-6</th>
1338<th class="head">blocks-8</th>
1339<th class="head">blocks-10</th>
1340<th class="head">blocks-15</th>
1341<th class="head">blocks-16</th>
1342<th class="head">blocks-20</th>
1343<th class="head">blocks-25</th>
1344<th class="head">blocks-32</th>
1345<th class="head">RFM</th>
1346</tr>
1347</thead>
1348<tbody valign="top">
1349<tr><td>blk-push-aos</td>
1350<td>55.75</td>
1351<td>47.62</td>
1352<td>54.57</td>
1353<td>57.10</td>
1354<td>58.49</td>
1355<td>59.00</td>
1356<td>61.72</td>
1357<td>60.56</td>
1358<td>64.05</td>
1359<td>61.10</td>
1360<td>66.03</td>
1361<td>105</td>
1362</tr>
1363<tr><td>blk-push-soa</td>
1364<td>30.06</td>
1365<td>31.09</td>
1366<td>32.13</td>
1367<td>32.54</td>
1368<td>32.74</td>
1369<td>32.72</td>
1370<td>33.81</td>
1371<td>33.19</td>
1372<td>34.90</td>
1373<td>33.21</td>
1374<td>35.75</td>
1375<td>105</td>
1376</tr>
1377<tr><td>blk-pull-aos</td>
1378<td>53.80</td>
1379<td>48.61</td>
1380<td>53.08</td>
1381<td>54.99</td>
1382<td>56.08</td>
1383<td>56.68</td>
1384<td>59.20</td>
1385<td>58.12</td>
1386<td>61.49</td>
1387<td>58.71</td>
1388<td>63.45</td>
1389<td>105</td>
1390</tr>
1391<tr><td>blk-pull-soa</td>
1392<td>46.96</td>
1393<td>46.61</td>
1394<td>48.84</td>
1395<td>49.70</td>
1396<td>50.33</td>
1397<td>50.46</td>
1398<td>52.36</td>
1399<td>51.39</td>
1400<td>54.20</td>
1401<td>51.61</td>
1402<td>55.71</td>
1403<td>105</td>
1404</tr>
1405<tr><td>aa-aos</td>
1406<td>91.40</td>
1407<td>66.99</td>
1408<td>78.47</td>
1409<td>83.38</td>
1410<td>86.62</td>
1411<td>88.62</td>
1412<td>92.98</td>
1413<td>91.54</td>
1414<td>97.08</td>
1415<td>94.93</td>
1416<td>98.90</td>
1417<td>168</td>
1418</tr>
1419<tr><td>aa-soa</td>
1420<td>83.01</td>
1421<td>69.96</td>
1422<td>75.85</td>
1423<td>77.72</td>
1424<td>79.01</td>
1425<td>79.29</td>
1426<td>82.38</td>
1427<td>80.11</td>
1428<td>85.70</td>
1429<td>83.91</td>
1430<td>87.69</td>
1431<td>168</td>
1432</tr>
1433<tr><td>aa-vec-soa</td>
1434<td>112.03</td>
1435<td>96.52</td>
1436<td>105.32</td>
1437<td>109.76</td>
1438<td>112.55</td>
1439<td>113.82</td>
1440<td>120.55</td>
1441<td>118.37</td>
1442<td>126.30</td>
1443<td>121.37</td>
1444<td>131.94</td>
1445<td>168</td>
1446</tr>
1447<tr><td>list-push-aos</td>
1448<td>75.13</td>
1449<td>74.18</td>
1450<td>75.20</td>
1451<td>75.42</td>
1452<td>75.24</td>
1453<td>75.99</td>
1454<td>75.80</td>
1455<td>75.80</td>
1456<td>75.54</td>
1457<td>76.22</td>
1458<td>76.21</td>
1459<td>97</td>
1460</tr>
1461<tr><td>list-push-soa</td>
1462<td>40.99</td>
1463<td>38.14</td>
1464<td>39.00</td>
1465<td>38.89</td>
1466<td>38.89</td>
1467<td>39.67</td>
1468<td>39.87</td>
1469<td>39.28</td>
1470<td>39.35</td>
1471<td>40.08</td>
1472<td>40.13</td>
1473<td>97</td>
1474</tr>
1475<tr><td>list-pull-aos</td>
1476<td>82.07</td>
1477<td>82.88</td>
1478<td>83.29</td>
1479<td>83.09</td>
1480<td>83.32</td>
1481<td>83.49</td>
1482<td>82.82</td>
1483<td>82.88</td>
1484<td>83.32</td>
1485<td>82.60</td>
1486<td>82.93</td>
1487<td>97</td>
1488</tr>
1489<tr><td>list-pull-soa</td>
1490<td>62.07</td>
1491<td>60.40</td>
1492<td>61.89</td>
1493<td>61.39</td>
1494<td>62.43</td>
1495<td>60.90</td>
1496<td>60.48</td>
1497<td>62.80</td>
1498<td>62.50</td>
1499<td>61.10</td>
1500<td>60.38</td>
1501<td>97</td>
1502</tr>
1503<tr><td>list-pull-split-nt-1s-soa</td>
1504<td>125.81</td>
1505<td>120.60</td>
1506<td>121.96</td>
1507<td>122.34</td>
1508<td>122.86</td>
1509<td>123.53</td>
1510<td>123.64</td>
1511<td>123.67</td>
1512<td>125.94</td>
1513<td>124.09</td>
1514<td>123.69</td>
1515<td>128</td>
1516</tr>
1517<tr><td>list-pull-split-nt-2s-soa</td>
1518<td>122.79</td>
1519<td>117.16</td>
1520<td>118.86</td>
1521<td>119.16</td>
1522<td>119.56</td>
1523<td>119.99</td>
1524<td>120.01</td>
1525<td>120.03</td>
1526<td>122.64</td>
1527<td>120.57</td>
1528<td>120.39</td>
1529<td>128</td>
1530</tr>
1531<tr><td>list-aa-aos</td>
1532<td>128.13</td>
1533<td>127.41</td>
1534<td>129.31</td>
1535<td>129.07</td>
1536<td>129.79</td>
1537<td>129.63</td>
1538<td>129.67</td>
1539<td>129.94</td>
1540<td>129.12</td>
1541<td>128.41</td>
1542<td>129.72</td>
1543<td>150</td>
1544</tr>
1545<tr><td>list-aa-soa</td>
1546<td>141.60</td>
1547<td>139.78</td>
1548<td>141.58</td>
1549<td>142.16</td>
1550<td>141.94</td>
1551<td>141.31</td>
1552<td>142.37</td>
1553<td>142.25</td>
1554<td>142.43</td>
1555<td>141.40</td>
1556<td>142.26</td>
1557<td>150</td>
1558</tr>
1559<tr><td>list-aa-ria-soa</td>
1560<td>141.82</td>
1561<td>134.88</td>
1562<td>140.15</td>
1563<td>140.72</td>
1564<td>141.67</td>
1565<td>140.51</td>
1566<td>141.18</td>
1567<td>141.29</td>
1568<td>142.97</td>
1569<td>141.94</td>
1570<td>143.25</td>
1571<td>168</td>
1572</tr>
1573<tr><td>list-aa-pv-soa</td>
1574<td>164.79</td>
1575<td>140.95</td>
1576<td>159.24</td>
1577<td>161.78</td>
1578<td>162.40</td>
1579<td>163.04</td>
1580<td>164.69</td>
1581<td>164.38</td>
1582<td>165.11</td>
1583<td>165.75</td>
1584<td>166.09</td>
1585<td>168</td>
1586</tr>
1587</tbody>
1588</table>
1589</div>
1590<div class="section" id="skylake-intel-xeon-gold-6148">
1591<h2><a class="toc-backref" href="#id25">6.3&nbsp;&nbsp;&nbsp;Skylake, Intel Xeon Gold 6148</a></h2>
1592<ul class="simple">
1593<li>Skylake architecture, AVX2, FMA, AVX512</li>
1594<li>20 cores, 2.4 GHz</li>
1595<li>SMT enabled</li>
1596</ul>
1597<p>memory bandwidth:</p>
1598<ul class="simple">
1599<li>copy-19 89.7 GB/s</li>
1600<li>copy-19-nt-sl 92.4 GB/s</li>
1601<li>update-19 93.6 GB/s</li>
1602</ul>
1603<p>geometry dimensions: 500x100x100</p>
1604<table border="1" class="docutils">
1605<colgroup>
1606<col width="20%" />
1607<col width="7%" />
1608<col width="7%" />
1609<col width="7%" />
1610<col width="7%" />
1611<col width="7%" />
1612<col width="7%" />
1613<col width="7%" />
1614<col width="7%" />
1615<col width="7%" />
1616<col width="7%" />
1617<col width="7%" />
1618<col width="2%" />
1619</colgroup>
1620<thead valign="bottom">
1621<tr><th class="head">kernel</th>
1622<th class="head">pipe</th>
1623<th class="head">blocks-2</th>
1624<th class="head">blocks-4</th>
1625<th class="head">blocks-6</th>
1626<th class="head">blocks-8</th>
1627<th class="head">blocks-10</th>
1628<th class="head">blocks-15</th>
1629<th class="head">blocks-16</th>
1630<th class="head">blocks-20</th>
1631<th class="head">blocks-25</th>
1632<th class="head">blocks-32</th>
1633<th class="head">RFM</th>
1634</tr>
1635</thead>
1636<tbody valign="top">
1637<tr><td>blk-push-aos</td>
1638<td>113.01</td>
1639<td>93.99</td>
1640<td>108.98</td>
1641<td>114.65</td>
1642<td>117.87</td>
1643<td>119.47</td>
1644<td>124.95</td>
1645<td>122.46</td>
1646<td>129.29</td>
1647<td>123.87</td>
1648<td>133.01</td>
1649<td>197</td>
1650</tr>
1651<tr><td>blk-push-soa</td>
1652<td>100.21</td>
1653<td>98.87</td>
1654<td>103.63</td>
1655<td>105.56</td>
1656<td>107.02</td>
1657<td>107.27</td>
1658<td>111.61</td>
1659<td>109.83</td>
1660<td>116.16</td>
1661<td>110.51</td>
1662<td>110.29</td>
1663<td>197</td>
1664</tr>
1665<tr><td>blk-pull-aos</td>
1666<td>118.45</td>
1667<td>102.54</td>
1668<td>114.12</td>
1669<td>117.82</td>
1670<td>122.69</td>
1671<td>124.31</td>
1672<td>130.58</td>
1673<td>127.85</td>
1674<td>135.72</td>
1675<td>129.65</td>
1676<td>139.94</td>
1677<td>197</td>
1678</tr>
1679<tr><td>blk-pull-soa</td>
1680<td>82.60</td>
1681<td>83.36</td>
1682<td>87.13</td>
1683<td>88.39</td>
1684<td>88.84</td>
1685<td>88.96</td>
1686<td>92.48</td>
1687<td>90.93</td>
1688<td>95.79</td>
1689<td>91.92</td>
1690<td>98.64</td>
1691<td>197</td>
1692</tr>
1693<tr><td>aa-aos</td>
1694<td>171.32</td>
1695<td>125.43</td>
1696<td>147.73</td>
1697<td>157.70</td>
1698<td>163.35</td>
1699<td>167.25</td>
1700<td>175.39</td>
1701<td>174.20</td>
1702<td>182.54</td>
1703<td>173.67</td>
1704<td>187.76</td>
1705<td>308</td>
1706</tr>
1707<tr><td>aa-soa</td>
1708<td>180.85</td>
1709<td>152.39</td>
1710<td>165.84</td>
1711<td>152.59</td>
1712<td>171.90</td>
1713<td>175.76</td>
1714<td>184.94</td>
1715<td>182.34</td>
1716<td>189.43</td>
1717<td>180.30</td>
1718<td>193.54</td>
1719<td>308</td>
1720</tr>
1721<tr><td>aa-vec-soa</td>
1722<td>208.03</td>
1723<td>181.51</td>
1724<td>195.86</td>
1725<td>203.41</td>
1726<td>209.08</td>
1727<td>212.34</td>
1728<td>224.05</td>
1729<td>219.49</td>
1730<td>234.31</td>
1731<td>225.92</td>
1732<td>245.22</td>
1733<td>308</td>
1734</tr>
1735<tr><td>list-push-aos</td>
1736<td>158.81</td>
1737<td>164.67</td>
1738<td>162.93</td>
1739<td>163.05</td>
1740<td>165.22</td>
1741<td>164.31</td>
1742<td>164.66</td>
1743<td>160.78</td>
1744<td>164.07</td>
1745<td>165.19</td>
1746<td>164.06</td>
1747<td>177</td>
1748</tr>
1749<tr><td>list-push-soa</td>
1750<td>134.60</td>
1751<td>110.44</td>
1752<td>110.17</td>
1753<td>132.01</td>
1754<td>132.95</td>
1755<td>133.46</td>
1756<td>134.37</td>
1757<td>134.33</td>
1758<td>135.12</td>
1759<td>134.91</td>
1760<td>137.87</td>
1761<td>177</td>
1762</tr>
1763<tr><td>list-pull-aos</td>
1764<td>169.61</td>
1765<td>170.03</td>
1766<td>170.89</td>
1767<td>170.90</td>
1768<td>171.20</td>
1769<td>171.60</td>
1770<td>172.09</td>
1771<td>171.95</td>
1772<td>169.48</td>
1773<td>172.08</td>
1774<td>171.02</td>
1775<td>177</td>
1776</tr>
1777<tr><td>list-pull-soa</td>
1778<td>120.50</td>
1779<td>116.73</td>
1780<td>118.62</td>
1781<td>118.00</td>
1782<td>120.99</td>
1783<td>118.15</td>
1784<td>117.17</td>
1785<td>121.41</td>
1786<td>120.83</td>
1787<td>120.00</td>
1788<td>118.74</td>
1789<td>177</td>
1790</tr>
1791<tr><td>list-pull-split-nt-1s-soa</td>
1792<td>225.59</td>
1793<td>224.18</td>
1794<td>225.10</td>
1795<td>226.34</td>
1796<td>226.01</td>
1797<td>230.37</td>
1798<td>227.50</td>
1799<td>228.42</td>
1800<td>227.39</td>
1801<td>231.65</td>
1802<td>227.35</td>
1803<td>246</td>
1804</tr>
1805<tr><td>list-pull-split-nt-2s-soa</td>
1806<td>219.20</td>
1807<td>214.63</td>
1808<td>217.61</td>
1809<td>218.13</td>
1810<td>219.07</td>
1811<td>221.01</td>
1812<td>219.88</td>
1813<td>220.09</td>
1814<td>220.62</td>
1815<td>221.68</td>
1816<td>220.58</td>
1817<td>246</td>
1818</tr>
1819<tr><td>list-aa-aos</td>
1820<td>241.39</td>
1821<td>239.27</td>
1822<td>239.53</td>
1823<td>242.56</td>
1824<td>242.46</td>
1825<td>243.00</td>
1826<td>242.91</td>
1827<td>242.46</td>
1828<td>241.24</td>
1829<td>242.96</td>
1830<td>241.52</td>
1831<td>275</td>
1832</tr>
1833<tr><td>list-aa-soa</td>
1834<td>273.73</td>
1835<td>268.49</td>
1836<td>268.48</td>
1837<td>271.79</td>
1838<td>275.29</td>
1839<td>274.56</td>
1840<td>277.18</td>
1841<td>272.67</td>
1842<td>274.21</td>
1843<td>275.24</td>
1844<td>278.21</td>
1845<td>275</td>
1846</tr>
1847<tr><td>list-aa-ria-soa</td>
1848<td>288.42</td>
1849<td>261.89</td>
1850<td>273.26</td>
1851<td>284.84</td>
1852<td>283.88</td>
1853<td>288.29</td>
1854<td>290.72</td>
1855<td>289.81</td>
1856<td>293.36</td>
1857<td>290.75</td>
1858<td>292.93</td>
1859<td>308</td>
1860</tr>
1861<tr><td>list-aa-pv-soa</td>
1862<td>303.35</td>
1863<td>267.21</td>
1864<td>289.18</td>
1865<td>294.96</td>
1866<td>294.36</td>
1867<td>298.16</td>
1868<td>300.45</td>
1869<td>301.71</td>
1870<td>302.37</td>
1871<td>302.88</td>
1872<td>304.46</td>
1873<td>308</td>
1874</tr>
1875</tbody>
1876</table>
e3f82424 1877</div>
e3f82424
MW
1878</div>
1879<div class="section" id="licence">
0095f461 1880<h1><a class="toc-backref" href="#id26">7&nbsp;&nbsp;&nbsp;Licence</a></h1>
e3f82424 1881<p>The Lattice Boltzmann Benchmark Kernels are licensed under GPLv3.</p>
ecf590ae
MW
1882</div>
1883<div class="section" id="acknowledgements">
0095f461 1884<h1><a class="toc-backref" href="#id27">8&nbsp;&nbsp;&nbsp;Acknowledgements</a></h1>
ecf590ae
MW
1885<p>This work was funded by BMBF, grant no. 01IH15003A (project SKAMPY).</p>
1886<p>This work was funded by KONWHIR project OMI4PAPS.</p>
0095f461
MW
1887</div>
1888<div class="section" id="bibliography">
1889<h1><a class="toc-backref" href="#id28">9&nbsp;&nbsp;&nbsp;Bibliography</a></h1>
1890<table class="docutils citation" frame="void" id="ginzburg-2008" rules="none">
1891<colgroup><col class="label" /><col /></colgroup>
1892<tbody valign="top">
1893<tr><td class="label"><a class="fn-backref" href="#id1">[ginzburg-2008]</a></td><td>I. Ginzburg, F. Verhaeghe, and D. d'Humières.
1894Two-relaxation-time lattice Boltzmann scheme: About parametrization, velocity, pressure and mixed boundary conditions.
1895Commun. Comput. Phys., 3(2):427-478, 2008.</td></tr>
1896</tbody>
1897</table>
1898<table class="docutils citation" frame="void" id="williams-2008" rules="none">
1899<colgroup><col class="label" /><col /></colgroup>
1900<tbody valign="top">
1901<tr><td class="label"><a class="fn-backref" href="#id3">[williams-2008]</a></td><td>S. Williams, A. Waterman, and D. Patterson.
1902Roofline: an insightful visual performance model for multicore architectures.
1903Commun. ACM, 52(4):65-76, Apr 2009. doi:10.1145/1498765.1498785</td></tr>
1904</tbody>
1905</table>
1906<p>Document was generated at 2017-11-21 15:43.</p>
ecf590ae
MW
1907</div>
1908</div>
1909</body>
1910</html>
This page took 0.446172 seconds and 5 git commands to generate.