Commit | Line | Data |
---|---|---|
ecf590ae MW |
1 | <?xml version="1.0" encoding="utf-8" ?> |
2 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> | |
3 | <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | |
4 | <head> | |
5 | <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | |
6 | <meta name="generator" content="Docutils 0.12: http://docutils.sourceforge.net/" /> | |
7 | <title>LBM Benchmark Kernels Documentation</title> | |
8 | <style type="text/css"> | |
9 | ||
10 | /* | |
11 | :Author: David Goodger (goodger@python.org) | |
12 | :Id: $Id: html4css1.css 7614 2013-02-21 15:55:51Z milde $ | |
13 | :Copyright: This stylesheet has been placed in the public domain. | |
14 | ||
15 | Default cascading style sheet for the HTML output of Docutils. | |
16 | ||
17 | See http://docutils.sf.net/docs/howto/html-stylesheets.html for how to | |
18 | customize this style sheet. | |
19 | */ | |
20 | ||
21 | /* used to remove borders from tables and images */ | |
22 | .borderless, table.borderless td, table.borderless th { | |
23 | border: 0 } | |
24 | ||
25 | table.borderless td, table.borderless th { | |
26 | /* Override padding for "table.docutils td" with "! important". | |
27 | The right padding separates the table cells. */ | |
28 | padding: 0 0.5em 0 0 ! important } | |
29 | ||
30 | .first { | |
31 | /* Override more specific margin styles with "! important". */ | |
32 | margin-top: 0 ! important } | |
33 | ||
34 | .last, .with-subtitle { | |
35 | margin-bottom: 0 ! important } | |
36 | ||
37 | .hidden { | |
38 | display: none } | |
39 | ||
40 | a.toc-backref { | |
41 | text-decoration: none ; | |
42 | color: black } | |
43 | ||
44 | blockquote.epigraph { | |
45 | margin: 2em 5em ; } | |
46 | ||
47 | dl.docutils dd { | |
48 | margin-bottom: 0.5em } | |
49 | ||
50 | object[type="image/svg+xml"], object[type="application/x-shockwave-flash"] { | |
51 | overflow: hidden; | |
52 | } | |
53 | ||
54 | /* Uncomment (and remove this text!) to get bold-faced definition list terms | |
55 | dl.docutils dt { | |
56 | font-weight: bold } | |
57 | */ | |
58 | ||
59 | div.abstract { | |
60 | margin: 2em 5em } | |
61 | ||
62 | div.abstract p.topic-title { | |
63 | font-weight: bold ; | |
64 | text-align: center } | |
65 | ||
66 | div.admonition, div.attention, div.caution, div.danger, div.error, | |
67 | div.hint, div.important, div.note, div.tip, div.warning { | |
68 | margin: 2em ; | |
69 | border: medium outset ; | |
70 | padding: 1em } | |
71 | ||
72 | div.admonition p.admonition-title, div.hint p.admonition-title, | |
73 | div.important p.admonition-title, div.note p.admonition-title, | |
74 | div.tip p.admonition-title { | |
75 | font-weight: bold ; | |
76 | font-family: sans-serif } | |
77 | ||
78 | div.attention p.admonition-title, div.caution p.admonition-title, | |
79 | div.danger p.admonition-title, div.error p.admonition-title, | |
80 | div.warning p.admonition-title, .code .error { | |
81 | color: red ; | |
82 | font-weight: bold ; | |
83 | font-family: sans-serif } | |
84 | ||
85 | /* Uncomment (and remove this text!) to get reduced vertical space in | |
86 | compound paragraphs. | |
87 | div.compound .compound-first, div.compound .compound-middle { | |
88 | margin-bottom: 0.5em } | |
89 | ||
90 | div.compound .compound-last, div.compound .compound-middle { | |
91 | margin-top: 0.5em } | |
92 | */ | |
93 | ||
94 | div.dedication { | |
95 | margin: 2em 5em ; | |
96 | text-align: center ; | |
97 | font-style: italic } | |
98 | ||
99 | div.dedication p.topic-title { | |
100 | font-weight: bold ; | |
101 | font-style: normal } | |
102 | ||
103 | div.figure { | |
104 | margin-left: 2em ; | |
105 | margin-right: 2em } | |
106 | ||
107 | div.footer, div.header { | |
108 | clear: both; | |
109 | font-size: smaller } | |
110 | ||
111 | div.line-block { | |
112 | display: block ; | |
113 | margin-top: 1em ; | |
114 | margin-bottom: 1em } | |
115 | ||
116 | div.line-block div.line-block { | |
117 | margin-top: 0 ; | |
118 | margin-bottom: 0 ; | |
119 | margin-left: 1.5em } | |
120 | ||
121 | div.sidebar { | |
122 | margin: 0 0 0.5em 1em ; | |
123 | border: medium outset ; | |
124 | padding: 1em ; | |
125 | background-color: #ffffee ; | |
126 | width: 40% ; | |
127 | float: right ; | |
128 | clear: right } | |
129 | ||
130 | div.sidebar p.rubric { | |
131 | font-family: sans-serif ; | |
132 | font-size: medium } | |
133 | ||
134 | div.system-messages { | |
135 | margin: 5em } | |
136 | ||
137 | div.system-messages h1 { | |
138 | color: red } | |
139 | ||
140 | div.system-message { | |
141 | border: medium outset ; | |
142 | padding: 1em } | |
143 | ||
144 | div.system-message p.system-message-title { | |
145 | color: red ; | |
146 | font-weight: bold } | |
147 | ||
148 | div.topic { | |
149 | margin: 2em } | |
150 | ||
151 | h1.section-subtitle, h2.section-subtitle, h3.section-subtitle, | |
152 | h4.section-subtitle, h5.section-subtitle, h6.section-subtitle { | |
153 | margin-top: 0.4em } | |
154 | ||
155 | h1.title { | |
156 | text-align: center } | |
157 | ||
158 | h2.subtitle { | |
159 | text-align: center } | |
160 | ||
161 | hr.docutils { | |
162 | width: 75% } | |
163 | ||
164 | img.align-left, .figure.align-left, object.align-left { | |
165 | clear: left ; | |
166 | float: left ; | |
167 | margin-right: 1em } | |
168 | ||
169 | img.align-right, .figure.align-right, object.align-right { | |
170 | clear: right ; | |
171 | float: right ; | |
172 | margin-left: 1em } | |
173 | ||
174 | img.align-center, .figure.align-center, object.align-center { | |
175 | display: block; | |
176 | margin-left: auto; | |
177 | margin-right: auto; | |
178 | } | |
179 | ||
180 | .align-left { | |
181 | text-align: left } | |
182 | ||
183 | .align-center { | |
184 | clear: both ; | |
185 | text-align: center } | |
186 | ||
187 | .align-right { | |
188 | text-align: right } | |
189 | ||
190 | /* reset inner alignment in figures */ | |
191 | div.align-right { | |
192 | text-align: inherit } | |
193 | ||
194 | /* div.align-center * { */ | |
195 | /* text-align: left } */ | |
196 | ||
197 | ol.simple, ul.simple { | |
198 | margin-bottom: 1em } | |
199 | ||
200 | ol.arabic { | |
201 | list-style: decimal } | |
202 | ||
203 | ol.loweralpha { | |
204 | list-style: lower-alpha } | |
205 | ||
206 | ol.upperalpha { | |
207 | list-style: upper-alpha } | |
208 | ||
209 | ol.lowerroman { | |
210 | list-style: lower-roman } | |
211 | ||
212 | ol.upperroman { | |
213 | list-style: upper-roman } | |
214 | ||
215 | p.attribution { | |
216 | text-align: right ; | |
217 | margin-left: 50% } | |
218 | ||
219 | p.caption { | |
220 | font-style: italic } | |
221 | ||
222 | p.credits { | |
223 | font-style: italic ; | |
224 | font-size: smaller } | |
225 | ||
226 | p.label { | |
227 | white-space: nowrap } | |
228 | ||
229 | p.rubric { | |
230 | font-weight: bold ; | |
231 | font-size: larger ; | |
232 | color: maroon ; | |
233 | text-align: center } | |
234 | ||
235 | p.sidebar-title { | |
236 | font-family: sans-serif ; | |
237 | font-weight: bold ; | |
238 | font-size: larger } | |
239 | ||
240 | p.sidebar-subtitle { | |
241 | font-family: sans-serif ; | |
242 | font-weight: bold } | |
243 | ||
244 | p.topic-title { | |
245 | font-weight: bold } | |
246 | ||
247 | pre.address { | |
248 | margin-bottom: 0 ; | |
249 | margin-top: 0 ; | |
250 | font: inherit } | |
251 | ||
252 | pre.literal-block, pre.doctest-block, pre.math, pre.code { | |
253 | margin-left: 2em ; | |
254 | margin-right: 2em } | |
255 | ||
256 | pre.code .ln { color: grey; } /* line numbers */ | |
257 | pre.code, code { background-color: #eeeeee } | |
258 | pre.code .comment, code .comment { color: #5C6576 } | |
259 | pre.code .keyword, code .keyword { color: #3B0D06; font-weight: bold } | |
260 | pre.code .literal.string, code .literal.string { color: #0C5404 } | |
261 | pre.code .name.builtin, code .name.builtin { color: #352B84 } | |
262 | pre.code .deleted, code .deleted { background-color: #DEB0A1} | |
263 | pre.code .inserted, code .inserted { background-color: #A3D289} | |
264 | ||
265 | span.classifier { | |
266 | font-family: sans-serif ; | |
267 | font-style: oblique } | |
268 | ||
269 | span.classifier-delimiter { | |
270 | font-family: sans-serif ; | |
271 | font-weight: bold } | |
272 | ||
273 | span.interpreted { | |
274 | font-family: sans-serif } | |
275 | ||
276 | span.option { | |
277 | white-space: nowrap } | |
278 | ||
279 | span.pre { | |
280 | white-space: pre } | |
281 | ||
282 | span.problematic { | |
283 | color: red } | |
284 | ||
285 | span.section-subtitle { | |
286 | /* font-size relative to parent (h1..h6 element) */ | |
287 | font-size: 80% } | |
288 | ||
289 | table.citation { | |
290 | border-left: solid 1px gray; | |
291 | margin-left: 1px } | |
292 | ||
293 | table.docinfo { | |
294 | margin: 2em 4em } | |
295 | ||
296 | table.docutils { | |
297 | margin-top: 0.5em ; | |
298 | margin-bottom: 0.5em } | |
299 | ||
300 | table.footnote { | |
301 | border-left: solid 1px black; | |
302 | margin-left: 1px } | |
303 | ||
304 | table.docutils td, table.docutils th, | |
305 | table.docinfo td, table.docinfo th { | |
306 | padding-left: 0.5em ; | |
307 | padding-right: 0.5em ; | |
308 | vertical-align: top } | |
309 | ||
310 | table.docutils th.field-name, table.docinfo th.docinfo-name { | |
311 | font-weight: bold ; | |
312 | text-align: left ; | |
313 | white-space: nowrap ; | |
314 | padding-left: 0 } | |
315 | ||
316 | /* "booktabs" style (no vertical lines) */ | |
317 | table.docutils.booktabs { | |
318 | border: 0px; | |
319 | border-top: 2px solid; | |
320 | border-bottom: 2px solid; | |
321 | border-collapse: collapse; | |
322 | } | |
323 | table.docutils.booktabs * { | |
324 | border: 0px; | |
325 | } | |
326 | table.docutils.booktabs th { | |
327 | border-bottom: thin solid; | |
328 | text-align: left; | |
329 | } | |
330 | ||
331 | h1 tt.docutils, h2 tt.docutils, h3 tt.docutils, | |
332 | h4 tt.docutils, h5 tt.docutils, h6 tt.docutils { | |
333 | font-size: 100% } | |
334 | ||
335 | ul.auto-toc { | |
336 | list-style-type: none } | |
337 | ||
e3f82424 MW |
338 | </style> |
339 | <style type="text/css"> | |
340 | ||
341 | ||
342 | h1, h2, h3, h4, h5, h6 { | |
343 | font-family: sans-serif; | |
344 | font-size: 100%; | |
345 | background-color: #dcdcdc; | |
346 | } | |
347 | ||
348 | h1.title { | |
349 | background-color: gray; | |
350 | color: white | |
351 | } | |
352 | ||
353 | table.footnote { | |
354 | padding-left: 0.5ex; | |
355 | } | |
356 | ||
357 | table.citation { | |
358 | padding-left: 0.5ex | |
359 | } | |
360 | ||
361 | td.label { | |
362 | width: 10%; | |
363 | } | |
364 | ||
365 | table, table.docutils, td, th { | |
366 | border: 0; | |
367 | } | |
368 | ||
369 | table.citation, table.footnote { | |
370 | width: 100%; | |
371 | } | |
372 | ||
373 | th { | |
374 | background-color: lavender ; | |
375 | } | |
376 | ||
377 | tr:nth-child(even) { | |
378 | xxbackground-color: aliceblue; | |
379 | background-color: white; | |
380 | } | |
381 | tr:nth-child(odd) { | |
382 | xxbackground-color: lavender; | |
383 | background-color: whitesmoke; | |
384 | } | |
385 | ||
386 | ||
387 | ||
ecf590ae MW |
388 | </style> |
389 | </head> | |
390 | <body> | |
391 | <div class="document" id="lbm-benchmark-kernels-documentation"> | |
392 | <h1 class="title">LBM Benchmark Kernels Documentation</h1> | |
393 | ||
394 | <!-- # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | |
395 | # | |
396 | # Copyright | |
397 | # Markus Wittmann, 2016-2017 | |
398 | # RRZE, University of Erlangen-Nuremberg, Germany | |
399 | # markus.wittmann -at- fau.de or hpc -at- rrze.fau.de | |
400 | # | |
401 | # Viktor Haag, 2016 | |
402 | # LSS, University of Erlangen-Nuremberg, Germany | |
403 | # | |
404 | # This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). | |
405 | # | |
406 | # LbmBenchKernels is free software: you can redistribute it and/or modify | |
407 | # it under the terms of the GNU General Public License as published by | |
408 | # the Free Software Foundation, either version 3 of the License, or | |
409 | # (at your option) any later version. | |
410 | # | |
411 | # LbmBenchKernels is distributed in the hope that it will be useful, | |
412 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
413 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
414 | # GNU General Public License for more details. | |
415 | # | |
416 | # You should have received a copy of the GNU General Public License | |
417 | # along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>. | |
418 | # | |
419 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - --> | |
420 | <div class="contents topic" id="contents"> | |
421 | <p class="topic-title first">Contents</p> | |
422 | <ul class="auto-toc simple"> | |
0095f461 MW |
423 | <li><a class="reference internal" href="#introduction" id="id5">1 Introduction</a></li> |
424 | <li><a class="reference internal" href="#compilation" id="id6">2 Compilation</a><ul class="auto-toc"> | |
425 | <li><a class="reference internal" href="#debug-and-verification" id="id7">2.1 Debug and Verification</a></li> | |
426 | <li><a class="reference internal" href="#release-and-verification" id="id8">2.2 Release and Verification</a></li> | |
427 | <li><a class="reference internal" href="#benchmarking" id="id9">2.3 Benchmarking</a></li> | |
428 | <li><a class="reference internal" href="#compilers" id="id10">2.4 Compilers</a></li> | |
429 | <li><a class="reference internal" href="#cleaning" id="id11">2.5 Cleaning</a></li> | |
430 | <li><a class="reference internal" href="#options-summary" id="id12">2.6 Options Summary</a></li> | |
ecf590ae MW |
431 | </ul> |
432 | </li> | |
0095f461 MW |
433 | <li><a class="reference internal" href="#invocation" id="id13">3 Invocation</a><ul class="auto-toc"> |
434 | <li><a class="reference internal" href="#command-line-parameters" id="id14">3.1 Command Line Parameters</a></li> | |
435 | <li><a class="reference internal" href="#kernels" id="id15">3.2 Kernels</a></li> | |
ecf590ae MW |
436 | </ul> |
437 | </li> | |
0095f461 MW |
438 | <li><a class="reference internal" href="#id2" id="id16">4 Benchmarking</a><ul class="auto-toc"> |
439 | <li><a class="reference internal" href="#intel-compiler" id="id17">4.1 Intel Compiler</a></li> | |
440 | <li><a class="reference internal" href="#pinning" id="id18">4.2 Pinning</a></li> | |
441 | <li><a class="reference internal" href="#general-remarks" id="id19">4.3 General Remarks</a></li> | |
442 | <li><a class="reference internal" href="#padding" id="id20">4.4 Padding</a></li> | |
e3f82424 MW |
443 | </ul> |
444 | </li> | |
0095f461 MW |
445 | <li><a class="reference internal" href="#geometries" id="id21">5 Geometries</a></li> |
446 | <li><a class="reference internal" href="#performance-results" id="id22">6 Performance Results</a><ul class="auto-toc"> | |
447 | <li><a class="reference internal" href="#haswell-intel-xeon-e5-2695-v3" id="id23">6.1 Haswell, Intel Xeon E5-2695 v3</a></li> | |
448 | <li><a class="reference internal" href="#broadwell-intel-xeon-e5-2630-v4" id="id24">6.2 Broadwell, Intel Xeon E5-2630 v4</a></li> | |
449 | <li><a class="reference internal" href="#skylake-intel-xeon-gold-6148" id="id25">6.3 Skylake, Intel Xeon Gold 6148</a></li> | |
450 | </ul> | |
451 | </li> | |
452 | <li><a class="reference internal" href="#licence" id="id26">7 Licence</a></li> | |
453 | <li><a class="reference internal" href="#acknowledgements" id="id27">8 Acknowledgements</a></li> | |
454 | <li><a class="reference internal" href="#bibliography" id="id28">9 Bibliography</a></li> | |
ecf590ae MW |
455 | </ul> |
456 | </div> | |
0095f461 MW |
457 | <div class="section" id="introduction"> |
458 | <h1><a class="toc-backref" href="#id5">1 Introduction</a></h1> | |
459 | <p>The lattice Boltzmann (LBM) benchmark kernels are a collection of LBM kernel | |
460 | implementations.</p> | |
461 | <p><strong>AS SUCH THE LBM BENCHMARK KERNELS ARE NOT A FULLY EQUIPPED CFD SOLVER AND SOLELY | |
462 | SERVE THE PURPOSE OF STUDYING POSSIBLE PERFORMANCE OPTIMIZATIONS AND/OR | |
463 | EXPERIMENTS.</strong></p> | |
464 | <p>Currently all kernels utilize a D3Q19 discretization and the | |
465 | two-relaxation-time (TRT) collision operator <a class="citation-reference" href="#ginzburg-2008" id="id1">[ginzburg-2008]</a>. | |
466 | All operations are carried out in double precision arithmetic.</p> | |
467 | </div> | |
ecf590ae | 468 | <div class="section" id="compilation"> |
0095f461 | 469 | <h1><a class="toc-backref" href="#id6">2 Compilation</a></h1> |
ecf590ae MW |
470 | <p>The benchmark framework currently supports only Linux systems and the GCC and |
471 | Intel compilers. Every other configuration probably requires adjustment inside | |
0095f461 | 472 | the code and the makefiles. Furthermore some code might be platform or at least |
ecf590ae MW |
473 | POSIX specific.</p> |
474 | <p>The benchmark can be built via <tt class="docutils literal">make</tt> from the <tt class="docutils literal">src</tt> subdirectory. This will | |
475 | generate one binary which hosts all implemented benchmark kernels.</p> | |
476 | <p>Binaries are located under the <tt class="docutils literal">bin</tt> subdirectory and will have different names | |
477 | depending on compiler and build configuration.</p> | |
0095f461 MW |
478 | <p>Compilation can target debug or release builds. Combined with both build types |
479 | verification can be enabled, which increases the runtime and hence is not | |
480 | suited for benchmarking.</p> | |
ecf590ae | 481 | <div class="section" id="debug-and-verification"> |
0095f461 | 482 | <h2><a class="toc-backref" href="#id7">2.1 Debug and Verification</a></h2> |
ecf590ae | 483 | <pre class="literal-block"> |
e3f82424 | 484 | make BUILD=debug BENCHMARK=off |
ecf590ae | 485 | </pre> |
e3f82424 | 486 | <p>Running <tt class="docutils literal">make</tt> with <tt class="docutils literal">BUILD=debug</tt> builds the debug version of |
ecf590ae MW |
487 | the benchmark kernels, where no optimizations are performed, line numbers and |
488 | debug symbols are included, and <tt class="docutils literal">DEBUG</tt> is defined. The resulting | |
489 | binary will be found in the <tt class="docutils literal">bin</tt> subdirectory and named | |
490 | <tt class="docutils literal"><span class="pre">lbmbenchk-linux-&lt;compiler&gt;-debug</span></tt>.</p> | |
e3f82424 MW |
491 | <p>Specifying <tt class="docutils literal">BENCHMARK=off</tt> turns on verification |
492 | (<tt class="docutils literal">VERIFICATION=on</tt>), statistics (<tt class="docutils literal">STATISTICS=on</tt>), and VTK output | |
ecf590ae MW |
493 | (<tt class="docutils literal">VTK_OUTPUT=on</tt>).</p> |
494 | <p>Please note that the generated binary will therefore | |
495 | exhibit a poor performance.</p> | |
496 | </div> | |
0095f461 MW |
497 | <div class="section" id="release-and-verification"> |
498 | <h2><a class="toc-backref" href="#id8">2.2 Release and Verification</a></h2> | |
499 | <p>Verification with the debug builds can be extremely slow. Hence verification | |
500 | capabilities can be built with release builds:</p> | |
501 | <pre class="literal-block"> | |
502 | make BENCHMARK=off | |
503 | </pre> | |
504 | </div> | |
ecf590ae | 505 | <div class="section" id="benchmarking"> |
0095f461 | 506 | <h2><a class="toc-backref" href="#id9">2.3 Benchmarking</a></h2> |
ecf590ae MW |
507 | <p>To generate a binary for benchmarking run make with</p> |
508 | <pre class="literal-block"> | |
e3f82424 | 509 | make |
ecf590ae | 510 | </pre> |
e3f82424 | 511 | <p>By default <tt class="docutils literal">BENCHMARK=on</tt> and <tt class="docutils literal">BUILD=release</tt> are set, where |
0095f461 | 512 | <tt class="docutils literal">BUILD=release</tt> turns optimizations on and <tt class="docutils literal">BENCHMARK=on</tt> disables |
ecf590ae | 513 | verification, statistics, and VTK output.</p> |
0095f461 MW |
514 | <p>See Options Summary below for further description of options which can be |
515 | applied, e.g. TARCH as well as the Benchmarking section.</p> | |
ecf590ae MW |
516 | </div> |
517 | <div class="section" id="compilers"> | |
0095f461 | 518 | <h2><a class="toc-backref" href="#id10">2.4 Compilers</a></h2> |
ecf590ae MW |
519 | <p>Currently only the GCC and Intel compilers under Linux are supported. Either |
520 | configuration can be chosen via <tt class="docutils literal"><span class="pre">CONFIG=linux-gcc</span></tt> or | |
521 | <tt class="docutils literal"><span class="pre">CONFIG=linux-intel</span></tt>.</p> | |
522 | </div> | |
e3f82424 | 523 | <div class="section" id="cleaning"> |
0095f461 | 524 | <h2><a class="toc-backref" href="#id11">2.5 Cleaning</a></h2> |
e3f82424 MW |
525 | <p>For each configuration and build (debug/release) a subdirectory under the |
526 | <tt class="docutils literal">src/obj</tt> directory is created where the dependency and object files are | |
527 | stored. | |
528 | With</p> | |
529 | <pre class="literal-block"> | |
530 | make CONFIG=... BUILD=... clean | |
531 | </pre> | |
532 | <p>a specific combination is selected and cleaned, whereas with</p> | |
533 | <pre class="literal-block"> | |
534 | make clean-all | |
535 | </pre> | |
536 | <p>all object and dependency files are deleted.</p> | |
537 | </div> | |
ecf590ae | 538 | <div class="section" id="options-summary"> |
0095f461 MW |
539 | <h2><a class="toc-backref" href="#id12">2.6 Options Summary</a></h2> |
540 | <p>Options that can be specified when building the suite with make:</p> | |
ecf590ae MW |
541 | <table border="1" class="docutils"> |
542 | <colgroup> | |
ecf590ae | 543 | <col width="7%" /> |
0095f461 MW |
544 | <col width="12%" /> |
545 | <col width="6%" /> | |
546 | <col width="75%" /> | |
ecf590ae | 547 | </colgroup> |
0095f461 MW |
548 | <thead valign="bottom"> |
549 | <tr><th class="head">name</th> | |
550 | <th class="head">values</th> | |
551 | <th class="head">default</th> | |
552 | <th class="head">description</th> | |
ecf590ae | 553 | </tr> |
0095f461 MW |
554 | </thead> |
555 | <tbody valign="top"> | |
ecf590ae MW |
556 | <tr><td>BENCHMARK</td> |
557 | <td>on, off</td> | |
e3f82424 MW |
558 | <td>on</td> |
559 | <td>If enabled, disables VERIFICATION, STATISTICS, VTK_OUTPUT. If disabled enables the three former options.</td> | |
ecf590ae MW |
560 | </tr> |
561 | <tr><td>BUILD</td> | |
562 | <td>debug, release</td> | |
e3f82424 | 563 | <td>release</td> |
0095f461 | 564 | <td>debug: no optimization, debug symbols, DEBUG defined. release: optimizations enabled.</td> |
ecf590ae MW |
565 | </tr> |
566 | <tr><td>CONFIG</td> | |
567 | <td>linux-gcc, linux-intel</td> | |
568 | <td>linux-intel</td> | |
569 | <td>Select GCC or Intel compiler.</td> | |
570 | </tr> | |
571 | <tr><td>ISA</td> | |
572 | <td>avx, sse</td> | |
573 | <td>avx</td> | |
0095f461 | 574 | <td>Determines which ISA extension is used for macro definitions of the intrinsics. This is <em>not</em> the architecture the compiler generates code for.</td> |
ecf590ae MW |
575 | </tr> |
576 | <tr><td>OPENMP</td> | |
577 | <td>on, off</td> | |
578 | <td>on</td> | |
579 | <td>OpenMP, i.e. threading support.</td> | |
580 | </tr> | |
581 | <tr><td>STATISTICS</td> | |
582 | <td>on, off</td> | |
583 | <td>off</td> | |
584 | <td>View statistics, like density etc, during simulation.</td> | |
585 | </tr> | |
e3f82424 MW |
586 | <tr><td>TARCH</td> |
587 | <td>--</td> | |
588 | <td>--</td> | |
589 | <td>Via TARCH the architecture the compiler generates code for can be overridden. The value depends on the chosen compiler.</td> | |
590 | </tr> | |
ecf590ae MW |
591 | <tr><td>VERIFICATION</td> |
592 | <td>on, off</td> | |
593 | <td>off</td> | |
594 | <td>Turn verification on/off.</td> | |
595 | </tr> | |
596 | <tr><td>VTK_OUTPUT</td> | |
597 | <td>on, off</td> | |
598 | <td>off</td> | |
599 | <td>Enable/Disable VTK file output.</td> | |
600 | </tr> | |
601 | </tbody> | |
602 | </table> | |
603 | </div> | |
604 | </div> | |
605 | <div class="section" id="invocation"> | |
0095f461 | 606 | <h1><a class="toc-backref" href="#id13">3 Invocation</a></h1> |
ecf590ae | 607 | <p>Running the binary will print, along with the GPL licence header, a line like the following:</p> |
e3f82424 MW |
608 | <pre class="literal-block"> |
609 | LBM Benchmark Kernels 0.1, compiled Jul 5 2017 21:59:22, type: verification | |
610 | </pre> | |
ecf590ae | 611 | <p>if verification was enabled during compilation or</p> |
e3f82424 MW |
612 | <pre class="literal-block"> |
613 | LBM Benchmark Kernels 0.1, compiled Jul 5 2017 21:59:22, type: benchmark | |
614 | </pre> | |
ecf590ae MW |
615 | <p>if verification was disabled during compilation.</p> |
616 | <div class="section" id="command-line-parameters"> | |
0095f461 | 617 | <h2><a class="toc-backref" href="#id14">3.1 Command Line Parameters</a></h2> |
ecf590ae MW |
618 | <p>Running the binary with <tt class="docutils literal"><span class="pre">-h</span></tt> lists all available parameters:</p> |
619 | <pre class="literal-block"> | |
620 | Usage: | |
621 | ./lbmbenchk -list | |
622 | ./lbmbenchk | |
623 | [-dims XxYxZ] [-geometry box|channel|pipe|blocks[-&lt;block size&gt;]] [-iterations &lt;iterations&gt;] [-lattice-dump-ascii] | |
624 | [-rho-in &lt;density&gt;] [-rho-out &lt;density&gt;] [-omega &lt;omega&gt;] [-kernel &lt;kernel&gt;] | |
625 | [-periodic-x] | |
626 | [-t &lt;number of threads&gt;] | |
627 | [-pin core{,core}*] | |
628 | [-verify] | |
629 | -- &lt;kernel specific parameters&gt; | |
630 | ||
631 | -list List available kernels. | |
632 | ||
633 | -dims XxYxZ Specify geometry dimensions. | |
634 | ||
635 | -geometry blocks-&lt;block size&gt; | |
636 | Geometry with blocks of size &lt;block size&gt; regularly laid out. | |
637 | </pre> | |
638 | <p>If an option is specified multiple times the last one overrides previous ones. | |
639 | This holds also true for <tt class="docutils literal"><span class="pre">-verify</span></tt> which sets geometry dimensions, | |
640 | iterations, etc, which can afterward be overridden, e.g.:</p> | |
641 | <pre class="literal-block"> | |
642 | $ bin/lbmbenchk-linux-intel-release -verify -dims 32x32x32 | |
643 | </pre> | |
0095f461 | 644 | <p>Kernel specific parameters can be obtained via selecting the specific kernel |
ecf590ae MW |
645 | and passing <tt class="docutils literal"><span class="pre">-h</span></tt> as parameter:</p> |
646 | <pre class="literal-block"> | |
e3f82424 | 647 | $ bin/lbmbenchk-linux-intel-release -kernel kernel-name -- -h |
ecf590ae MW |
648 | ... |
649 | Kernel parameters: | |
650 | [-blk &lt;n&gt;] [-blk-[xyz] &lt;n&gt;] | |
651 | </pre> | |
652 | <p>A list of all available kernels can be obtained via <tt class="docutils literal"><span class="pre">-list</span></tt>:</p> | |
653 | <pre class="literal-block"> | |
654 | $ ../bin/lbmbenchk-linux-gcc-debug -list | |
655 | Lattice Boltzmann Benchmark Kernels (LbmBenchKernels) Copyright (C) 2016, 2017 LSS, RRZE | |
656 | This program comes with ABSOLUTELY NO WARRANTY; for details see LICENSE. | |
657 | This is free software, and you are welcome to redistribute it under certain conditions. | |
658 | ||
659 | LBM Benchmark Kernels 0.1, compiled Jul 5 2017 21:59:22, type: verification | |
660 | Available kernels to benchmark: | |
661 | list-aa-pv-soa | |
662 | list-aa-ria-soa | |
663 | list-aa-soa | |
664 | list-aa-aos | |
665 | list-pull-split-nt-1s-soa | |
666 | list-pull-split-nt-2s-soa | |
667 | list-push-soa | |
668 | list-push-aos | |
669 | list-pull-soa | |
670 | list-pull-aos | |
671 | push-soa | |
672 | push-aos | |
673 | pull-soa | |
674 | pull-aos | |
675 | blk-push-soa | |
676 | blk-push-aos | |
677 | blk-pull-soa | |
678 | blk-pull-aos | |
679 | </pre> | |
680 | </div> | |
e3f82424 | 681 | <div class="section" id="kernels"> |
0095f461 | 682 | <h2><a class="toc-backref" href="#id15">3.2 Kernels</a></h2> |
e3f82424 MW |
683 | <p>The following list briefly describes the available kernels:</p> |
684 | <ul class="simple"> | |
685 | <li>push-soa/push-aos/pull-soa/pull-aos: | |
686 | Unoptimized kernels (but stream/collide are already fused) using two grids as | |
687 | source and destination. Implement push/pull semantics as well as structure of | |
688 | arrays (soa) or array of structures (aos) layout.</li> | |
689 | <li>blk-push-soa/blk-push-aos/blk-pull-soa/blk-pull-aos: | |
690 | The same as the unoptimized kernels without the blk prefix, except that they support | |
691 | spatial blocking, i.e. loop blocking of the three loops used to iterate over | |
692 | the lattice. Here manual work sharing for OpenMP is used.</li> | |
693 | <li>list-push-soa/list-push-aos/list-pull-soa/list-pull-aos: | |
694 | The same as the unoptimized kernels without the list prefix, but for indirect addressing. | |
695 | Here only a 1D vector is used to store the fluid nodes, omitting the | |
696 | obstacles. An adjacency list is used to recover the neighborhood associations.</li> | |
697 | <li>list-pull-split-nt-1s-soa/list-pull-split-nt-2s-soa: | |
698 | Optimized variant of list-pull-soa. Chunks of the lattice are processed at | |
699 | once. Postcollision values are written back via nontemporal stores in 18 (1s) | |
700 | or 9 (2s) loops.</li> | |
701 | <li>list-aa-aos/list-aa-soa: | |
702 | Unoptimized implementation of the AA pattern for the 1D vector with adjacency | |
703 | list. Both array of structures (aos) and structure of arrays (soa) | |
704 | data layouts are supported.</li> | |
705 | <li>list-aa-ria-soa: | |
706 | Implementation of AA pattern with intrinsics for the 1D vector with adjacency | |
707 | list. Furthermore it contains a vectorized even time step and run length | |
708 | coding to reduce the loop balance of the odd time step.</li> | |
709 | <li>list-aa-pv-soa: | |
710 | All optimizations of list-aa-ria-soa. Additionally with partial vectorization | |
711 | of the odd time step.</li> | |
712 | </ul> | |
713 | <p>Note that all array of structures (aos) kernels might require blocking | |
714 | (depending on the domain size) to reach the performance of their structure of | |
715 | arrays (soa) counterparts.</p> | |
716 | <p>The following table summarizes the properties of the kernels. Here <strong>D</strong> means | |
717 | direct addressing, i.e. full array, <strong>I</strong> means indirect addressing, i.e. 1D | |
718 | vector with adjacency list, <strong>x</strong> means supported, whereas <strong>--</strong> means unsupported. | |
719 | The loop balance B_l is computed for D3Q19 model with double precision floating | |
720 | point for PDFs (8 byte) and 4 byte integers for the index (adjacency list). | |
721 | As list-aa-ria-soa and list-aa-pv-soa support run length coding their effective | |
722 | loop balance depends on the geometry. The effective loop balance is printed | |
723 | during each run.</p> | |
724 | <table border="1" class="docutils"> | |
725 | <colgroup> | |
726 | <col width="29%" /> | |
727 | <col width="14%" /> | |
728 | <col width="14%" /> | |
729 | <col width="6%" /> | |
730 | <col width="10%" /> | |
731 | <col width="10%" /> | |
732 | <col width="16%" /> | |
733 | </colgroup> | |
734 | <thead valign="bottom"> | |
735 | <tr><th class="head">kernel name</th> | |
736 | <th class="head">prop. step</th> | |
737 | <th class="head">data layout</th> | |
738 | <th class="head">addr.</th> | |
739 | <th class="head">parallel</th> | |
740 | <th class="head">blocking</th> | |
741 | <th class="head">B_l [B/FLUP]</th> | |
742 | </tr> | |
743 | </thead> | |
744 | <tbody valign="top"> | |
745 | <tr><td>push-soa</td> | |
746 | <td>OS</td> | |
747 | <td>SoA</td> | |
748 | <td>D</td> | |
749 | <td>x</td> | |
750 | <td>--</td> | |
751 | <td>456</td> | |
752 | </tr> | |
753 | <tr><td>push-aos</td> | |
754 | <td>OS</td> | |
755 | <td>AoS</td> | |
756 | <td>D</td> | |
757 | <td>x</td> | |
758 | <td>--</td> | |
759 | <td>456</td> | |
760 | </tr> | |
761 | <tr><td>pull-soa</td> | |
762 | <td>OS</td> | |
763 | <td>SoA</td> | |
764 | <td>D</td> | |
765 | <td>x</td> | |
766 | <td>--</td> | |
767 | <td>456</td> | |
768 | </tr> | |
769 | <tr><td>pull-aos</td> | |
770 | <td>OS</td> | |
771 | <td>AoS</td> | |
772 | <td>D</td> | |
773 | <td>x</td> | |
774 | <td>--</td> | |
775 | <td>456</td> | |
776 | </tr> | |
777 | <tr><td>blk-push-soa</td> | |
778 | <td>OS</td> | |
779 | <td>SoA</td> | |
780 | <td>D</td> | |
781 | <td>x</td> | |
782 | <td>x</td> | |
783 | <td>456</td> | |
784 | </tr> | |
785 | <tr><td>blk-push-aos</td> | |
786 | <td>OS</td> | |
787 | <td>AoS</td> | |
788 | <td>D</td> | |
789 | <td>x</td> | |
790 | <td>x</td> | |
791 | <td>456</td> | |
792 | </tr> | |
793 | <tr><td>blk-pull-soa</td> | |
794 | <td>OS</td> | |
795 | <td>SoA</td> | |
796 | <td>D</td> | |
797 | <td>x</td> | |
798 | <td>x</td> | |
799 | <td>456</td> | |
800 | </tr> | |
801 | <tr><td>blk-pull-aos</td> | |
802 | <td>OS</td> | |
803 | <td>AoS</td> | |
804 | <td>D</td> | |
805 | <td>x</td> | |
806 | <td>x</td> | |
807 | <td>456</td> | |
808 | </tr> | |
809 | <tr><td>list-push-soa</td> | |
810 | <td>OS</td> | |
811 | <td>SoA</td> | |
812 | <td>I</td> | |
813 | <td>x</td> | |
814 | <td>x</td> | |
815 | <td>528</td> | |
816 | </tr> | |
817 | <tr><td>list-push-aos</td> | |
818 | <td>OS</td> | |
819 | <td>AoS</td> | |
820 | <td>I</td> | |
821 | <td>x</td> | |
822 | <td>x</td> | |
823 | <td>528</td> | |
824 | </tr> | |
825 | <tr><td>list-pull-soa</td> | |
826 | <td>OS</td> | |
827 | <td>SoA</td> | |
828 | <td>I</td> | |
829 | <td>x</td> | |
830 | <td>x</td> | |
831 | <td>528</td> | |
832 | </tr> | |
833 | <tr><td>list-pull-aos</td> | |
834 | <td>OS</td> | |
835 | <td>AoS</td> | |
836 | <td>I</td> | |
837 | <td>x</td> | |
838 | <td>x</td> | |
839 | <td>528</td> | |
840 | </tr> | |
841 | <tr><td>list-pull-split-nt-1s</td> | |
842 | <td>OS</td> | |
843 | <td>SoA</td> | |
844 | <td>I</td> | |
845 | <td>x</td> | |
846 | <td>x</td> | |
847 | <td>376</td> | |
848 | </tr> | |
849 | <tr><td>list-pull-split-nt-2s</td> | |
850 | <td>OS</td> | |
851 | <td>SoA</td> | |
852 | <td>I</td> | |
853 | <td>x</td> | |
854 | <td>x</td> | |
855 | <td>376</td> | |
856 | </tr> | |
857 | <tr><td>list-aa-soa</td> | |
858 | <td>AA</td> | |
859 | <td>SoA</td> | |
860 | <td>I</td> | |
861 | <td>x</td> | |
862 | <td>x</td> | |
863 | <td>340</td> | |
864 | </tr> | |
865 | <tr><td>list-aa-aos</td> | |
866 | <td>AA</td> | |
867 | <td>AoS</td> | |
868 | <td>I</td> | |
869 | <td>x</td> | |
870 | <td>x</td> | |
871 | <td>340</td> | |
872 | </tr> | |
873 | <tr><td>list-aa-ria-soa</td> | |
874 | <td>AA</td> | |
875 | <td>SoA</td> | |
876 | <td>I</td> | |
877 | <td>x</td> | |
878 | <td>x</td> | |
879 | <td>304-342</td> | |
880 | </tr> | |
881 | <tr><td>list-aa-pv-soa</td> | |
882 | <td>AA</td> | |
883 | <td>SoA</td> | |
884 | <td>I</td> | |
885 | <td>x</td> | |
886 | <td>x</td> | |
887 | <td>304-342</td> | |
888 | </tr> | |
889 | </tbody> | |
890 | </table> | |
891 | </div> | |
ecf590ae | 892 | </div> |
0095f461 MW |
893 | <div class="section" id="id2"> |
894 | <h1><a class="toc-backref" href="#id16">4 Benchmarking</a></h1> | |
ecf590ae MW |
895 | <p>Correct benchmarking is a nontrivial task. Whenever benchmark results should be |
896 | created make sure the binary was compiled with:</p> | |
897 | <ul class="simple"> | |
e3f82424 MW |
898 | <li><tt class="docutils literal">BENCHMARK=on</tt> (default if not overridden) and</li> |
899 | <li><tt class="docutils literal">BUILD=release</tt> (default if not overridden) and</li> | |
ecf590ae MW |
900 | <li>the correct ISA for macros is used, selected via <tt class="docutils literal">ISA</tt> and</li> |
901 | <li>use <tt class="docutils literal">TARCH</tt> to specify the architecture the compiler generates code for.</li> | |
902 | </ul> | |
0095f461 MW |
903 | <div class="section" id="intel-compiler"> |
904 | <h2><a class="toc-backref" href="#id17">4.1 Intel Compiler</a></h2> | |
905 | <p>For the Intel compiler one can specify depending on the target ISA extension:</p> | |
906 | <ul class="simple"> | |
907 | <li>AVX: <tt class="docutils literal"><span class="pre">TARCH=-xAVX</span></tt></li> | |
908 | <li>AVX2 and FMA: <tt class="docutils literal"><span class="pre">TARCH=-xCORE-AVX2,-fma</span></tt></li> | |
909 | <li>AVX512: <tt class="docutils literal"><span class="pre">TARCH=-xCORE-AVX512</span></tt></li> | |
910 | <li>KNL: <tt class="docutils literal"><span class="pre">TARCH=-xMIC-AVX512</span></tt></li> | |
911 | </ul> | |
912 | <p>Compiling for an architecture supporting AVX (Sandy Bridge, Ivy Bridge):</p> | |
913 | <pre class="literal-block"> | |
914 | make ISA=avx TARCH=-xAVX | |
915 | </pre> | |
916 | <p>Compiling for an architecture supporting AVX2 (Haswell, Broadwell):</p> | |
917 | <pre class="literal-block"> | |
918 | make ISA=avx TARCH=-xCORE-AVX2,-fma | |
919 | </pre> | |
920 | <p>WARNING: ISA is here still set to <tt class="docutils literal">avx</tt> as currently the FMA intrinsics are not | |
921 | implemented. This might change in the future.</p> | |
922 | <p>Compiling for an architecture supporting AVX-512 (Skylake):</p> | |
923 | <pre class="literal-block"> | |
924 | make ISA=avx TARCH=-xCORE-AVX512 | |
925 | </pre> | |
926 | <p>WARNING: ISA is here still set to <tt class="docutils literal">avx</tt> as currently we have no implementation for the | |
927 | AVX512 intrinsics. This might change in the future.</p> | |
928 | </div> | |
929 | <div class="section" id="pinning"> | |
930 | <h2><a class="toc-backref" href="#id18">4.2 Pinning</a></h2> | |
ecf590ae | 931 | <p>During benchmarking pinning should be used via the <tt class="docutils literal"><span class="pre">-pin</span></tt> parameter. Running |
0095f461 | 932 | a benchmark with 10 threads and pinning them to the first 10 cores works like</p> |
ecf590ae MW |
933 | <pre class="literal-block"> |
934 | $ bin/lbmbenchk-linux-intel-release ... -t 10 -pin $(seq -s , 0 9) | |
935 | </pre> | |
0095f461 MW |
936 | </div> |
937 | <div class="section" id="general-remarks"> | |
938 | <h2><a class="toc-backref" href="#id19">4.3 General Remarks</a></h2> | |
939 | <p>Things the binary does not check or control:</p> |
ecf590ae MW |
940 | <ul class="simple"> |
941 | <li>transparent huge pages: when allocating memory small 4 KiB pages might be | |
942 | replaced with larger ones. This is in general a good thing, but if this is | |
e3f82424 MW |
943 | really the case, depends on the system settings (check e.g. the status of |
944 | <tt class="docutils literal">/sys/kernel/mm/transparent_hugepage/enabled</tt>). | |
945 | Currently <tt class="docutils literal">madvise(MADV_HUGEPAGE)</tt> is used for allocations which are aligned to | |
946 | a 4 KiB page, which should be the case for the lattices. | |
947 | This should result in huge pages unless THP is disabled on the machine. | |
948 | (NOTE: madvise() is used if <tt class="docutils literal">HAVE_HUGE_PAGES</tt> is defined, which is currently | |
949 | hard-coded in <tt class="docutils literal">Memory.c</tt>).</li> |
ecf590ae MW |
950 | <li>CPU/core frequency: For reproducible results the frequency of all cores |
951 | should be fixed.</li> | |
952 | <li>NUMA placement policy: The benchmark assumes a first touch policy, which | |
953 | means the memory will be placed at the NUMA domain the touching core is | |
954 | associated with. If a different policy is in place or the NUMA domain to be | |
955 | used is already full memory might be allocated in a remote domain. Accesses | |
956 | to remote domains typically have a higher latency and lower bandwidth.</li> | |
0095f461 | 957 | <li>System load: interference with other applications, especially on desktop |
ecf590ae | 958 | systems should be avoided.</li> |
e3f82424 MW |
959 | <li>Padding: For SoA based kernels the number of (fluid) nodes is automatically |
960 | adjusted so that no cache or TLB thrashing should occur. The parameters are | |
961 | optimized for current Intel based systems. For more details look into the | |
962 | padding section.</li> | |
ecf590ae MW |
963 | <li>CPU dispatcher function: the compiler might add different versions of a |
964 | function for different ISA extensions. Make sure the code you might think is | |
965 | executed is actually the code which is executed.</li> | |
966 | </ul> | |
0095f461 | 967 | </div> |
e3f82424 | 968 | <div class="section" id="padding"> |
0095f461 | 969 | <h2><a class="toc-backref" href="#id20">4.4 Padding</a></h2> |
e3f82424 MW |
970 | <p>With correct padding cache and TLB thrashing can be avoided. Therefore the |
971 | number of (fluid) nodes used in the data layout is artificially increased.</p> | |
972 | <p>Currently automatic padding is active for kernels which support it. It can be | |
973 | controlled via the kernel parameter (i.e. parameter after the <tt class="docutils literal"><span class="pre">--</span></tt>) | |
974 | <tt class="docutils literal"><span class="pre">-pad</span></tt>. Supported values are <tt class="docutils literal">auto</tt> (default), <tt class="docutils literal">no</tt> (to disable padding), | |
975 | or a manual padding.</p> | |
976 | <p>Automatic padding tries to avoid cache and TLB thrashing and pads for a 32 | |
977 | entry (huge pages) TLB with 8 sets and a 512 set (L2) cache. This reflects the | |
978 | parameters of current Intel based processors.</p> | |
979 | <p>Manual padding is done via a padding string and has the format | |
980 | <tt class="docutils literal"><span class="pre">mod_1+offset_1(,mod_n+offset_n)</span></tt>, which specifies numbers of bytes. | |
981 | SoA data layouts can exhibit TLB thrashing. Therefore we want to distribute the | |
982 | 19 pages with one lattice (36 with two lattices) we are concurrently accessing | |
983 | over as many sets in the TLB as possible. | |
984 | This is controlled by the distance between the accessed pages, which is the | |
985 | number of (fluid) nodes in between them and can be adjusted by adding further | |
986 | (fluid) nodes. | |
987 | We want the distance d (in bytes) between two accessed pages to be e.g. | |
988 | <strong>d % (PAGE_SIZE * TLB_SETS) = PAGE_SIZE</strong>. | |
989 | This would distribute the pages evenly over the sets. Hereby <strong>PAGE_SIZE * TLB_SETS</strong> | |
990 | would be our <tt class="docutils literal">mod_1</tt> and <strong>PAGE_SIZE</strong> (after the =) our <tt class="docutils literal">offset_1</tt>. | |
991 | Measurements show that with only a quarter or half of a page size as offset | |
992 | higher performance is achieved, which is done by automatic padding. | |
993 | On top of this padding more paddings can be added. They are just added to the | |
994 | padding string and are separated by commas.</p> | |
995 | <p>A zero modulus in the padding string has a special meaning. Here the | |
996 | corresponding offset is just added to the number of nodes. A padding string | |
997 | like <tt class="docutils literal"><span class="pre">-pad</span> 0+16</tt> would add a static padding of two nodes (one node = 8 bytes).</p> | |
998 | </div> | |
999 | </div> | |
1000 | <div class="section" id="geometries"> | |
0095f461 MW |
1001 | <h1><a class="toc-backref" href="#id21">5 Geometries</a></h1> |
1002 | <p>TODO: supported geometries: channel, pipe, blocks, fluid</p> | |
1003 | </div> | |
1004 | <div class="section" id="performance-results"> | |
1005 | <h1><a class="toc-backref" href="#id22">6 Performance Results</a></h1> | |
1006 | <p>This section lists performance values measured on several machines for | |
1007 | different kernels and geometries. | |
1008 | The <strong>RFM</strong> column denotes the expected performance as predicted by the | |
1009 | Roofline performance model <a class="citation-reference" href="#williams-2008" id="id3">[williams-2008]</a>. | |
1010 | For performance prediction of each kernel a memory bandwidth benchmark is used | |
1011 | which mimics the kernel's memory access pattern and the kernel's loop balance | |
1012 | (see <a class="citation-reference" href="#kernels" id="id4">[kernels]</a> for details).</p> | |
1013 | <div class="section" id="haswell-intel-xeon-e5-2695-v3"> | |
1014 | <h2><a class="toc-backref" href="#id23">6.1 Haswell, Intel Xeon E5-2695 v3</a></h2> | |
1015 | <ul class="simple"> | |
1016 | <li>Haswell architecture, AVX2, FMA</li> | |
1017 | <li>14 cores, 2.3 GHz</li> | |
1018 | <li>2 x 7 cores in cluster-on-die (CoD) mode enabled</li> | |
1019 | <li>SMT enabled</li> | |
1020 | </ul> | |
1021 | <p>memory bandwidth:</p> | |
1022 | <ul class="simple"> | |
1023 | <li>copy-19 47.3 GB/s</li> | |
1024 | <li>copy-19-nt-sl 47.1 GB/s</li> | |
1025 | <li>update-19 44.0 GB/s</li> | |
1026 | </ul> | |
1027 | <p>geometry dimensions: 500x100x100</p> | |
1028 | <table border="1" class="docutils"> | |
1029 | <colgroup> | |
1030 | <col width="19%" /> | |
1031 | <col width="7%" /> | |
1032 | <col width="7%" /> | |
1033 | <col width="7%" /> | |
1034 | <col width="7%" /> | |
1035 | <col width="7%" /> | |
1036 | <col width="7%" /> | |
1037 | <col width="7%" /> | |
1038 | <col width="7%" /> | |
1039 | <col width="7%" /> | |
1040 | <col width="7%" /> | |
1041 | <col width="7%" /> | |
1042 | <col width="4%" /> | |
1043 | </colgroup> | |
1044 | <thead valign="bottom"> | |
1045 | <tr><th class="head">kernel</th> | |
1046 | <th class="head">pipe</th> | |
1047 | <th class="head">blocks-2</th> | |
1048 | <th class="head">blocks-4</th> | |
1049 | <th class="head">blocks-6</th> | |
1050 | <th class="head">blocks-8</th> | |
1051 | <th class="head">blocks-10</th> | |
1052 | <th class="head">blocks-15</th> | |
1053 | <th class="head">blocks-16</th> | |
1054 | <th class="head">blocks-20</th> | |
1055 | <th class="head">blocks-25</th> | |
1056 | <th class="head">blocks-32</th> | |
1057 | <th class="head">RFM</th> | |
1058 | </tr> | |
1059 | </thead> | |
1060 | <tbody valign="top"> | |
1061 | <tr><td>blk-push-aos</td> | |
1062 | <td>58.82</td> | |
1063 | <td>49.85</td> | |
1064 | <td>57.34</td> | |
1065 | <td>59.90</td> | |
1066 | <td>61.37</td> | |
1067 | <td>62.17</td> | |
1068 | <td>65.30</td> | |
1069 | <td>64.00</td> | |
1070 | <td>67.54</td> | |
1071 | <td>64.46</td> | |
1072 | <td>69.69</td> | |
1073 | <td>104</td> | |
1074 | </tr> | |
1075 | <tr><td>blk-push-soa</td> | |
1076 | <td>32.32</td> | |
1077 | <td>33.46</td> | |
1078 | <td>34.02</td> | |
1079 | <td>34.64</td> | |
1080 | <td>35.06</td> | |
1081 | <td>35.04</td> | |
1082 | <td>36.31</td> | |
1083 | <td>35.44</td> | |
1084 | <td>37.20</td> | |
1085 | <td>35.14</td> | |
1086 | <td>37.95</td> | |
1087 | <td>104</td> | |
1088 | </tr> | |
1089 | <tr><td>blk-pull-aos</td> | |
1090 | <td>56.97</td> | |
1091 | <td>51.41</td> | |
1092 | <td>56.09</td> | |
1093 | <td>57.92</td> | |
1094 | <td>59.98</td> | |
1095 | <td>59.83</td> | |
1096 | <td>63.37</td> | |
1097 | <td>61.55</td> | |
1098 | <td>65.50</td> | |
1099 | <td>63.11</td> | |
1100 | <td>67.02</td> | |
1101 | <td>104</td> | |
1102 | </tr> | |
1103 | <tr><td>blk-pull-soa</td> | |
1104 | <td>49.29</td> | |
1105 | <td>46.23</td> | |
1106 | <td>47.50</td> | |
1107 | <td>51.97</td> | |
1108 | <td>51.27</td> | |
1109 | <td>49.52</td> | |
1110 | <td>55.23</td> | |
1111 | <td>53.13</td> | |
1112 | <td>54.50</td> | |
1113 | <td>49.79</td> | |
1114 | <td>57.90</td> | |
1115 | <td>104</td> | |
1116 | </tr> | |
1117 | <tr><td>aa-aos</td> | |
1118 | <td>91.35</td> | |
1119 | <td>66.14</td> | |
1120 | <td>76.80</td> | |
1121 | <td>84.76</td> | |
1122 | <td>83.63</td> | |
1123 | <td>91.36</td> | |
1124 | <td>93.46</td> | |
1125 | <td>92.62</td> | |
1126 | <td>93.91</td> | |
1127 | <td>92.25</td> | |
1128 | <td>92.93</td> | |
1129 | <td>145</td> | |
1130 | </tr> | |
1131 | <tr><td>aa-soa</td> | |
1132 | <td>75.51</td> | |
1133 | <td>65.68</td> | |
1134 | <td>70.94</td> | |
1135 | <td>71.36</td> | |
1136 | <td>73.83</td> | |
1137 | <td>75.46</td> | |
1138 | <td>74.84</td> | |
1139 | <td>79.48</td> | |
1140 | <td>83.28</td> | |
1141 | <td>77.70</td> | |
1142 | <td>82.72</td> | |
1143 | <td>145</td> | |
1144 | </tr> | |
1145 | <tr><td>aa-vec-soa</td> | |
1146 | <td>93.85</td> | |
1147 | <td>83.44</td> | |
1148 | <td>91.58</td> | |
1149 | <td>93.96</td> | |
1150 | <td>94.35</td> | |
1151 | <td>96.62</td> | |
1152 | <td>101.76</td> | |
1153 | <td>96.72</td> | |
1154 | <td>106.37</td> | |
1155 | <td>102.60</td> | |
1156 | <td>110.28</td> | |
1157 | <td>145</td> | |
1158 | </tr> | |
1159 | <tr><td>list-push-aos</td> | |
1160 | <td>80.29</td> | |
1161 | <td>80.97</td> | |
1162 | <td>80.95</td> | |
1163 | <td>81.10</td> | |
1164 | <td>81.37</td> | |
1165 | <td>82.44</td> | |
1166 | <td>81.77</td> | |
1167 | <td>81.49</td> | |
1168 | <td>80.72</td> | |
1169 | <td>81.93</td> | |
1170 | <td>80.93</td> | |
1171 | <td>83</td> | |
1172 | </tr> | |
1173 | <tr><td>list-push-soa</td> | |
1174 | <td>47.52</td> | |
1175 | <td>42.65</td> | |
1176 | <td>45.28</td> | |
1177 | <td>46.64</td> | |
1178 | <td>43.46</td> | |
1179 | <td>40.59</td> | |
1180 | <td>44.94</td> | |
1181 | <td>46.55</td> | |
1182 | <td>41.53</td> | |
1183 | <td>45.98</td> | |
1184 | <td>44.86</td> | |
1185 | <td>83</td> | |
1186 | </tr> | |
1187 | <tr><td>list-pull-aos</td> | |
1188 | <td>85.30</td> | |
1189 | <td>82.97</td> | |
1190 | <td>86.43</td> | |
1191 | <td>83.42</td> | |
1192 | <td>86.33</td> | |
1193 | <td>83.70</td> | |
1194 | <td>86.43</td> | |
1195 | <td>83.77</td> | |
1196 | <td>83.10</td> | |
1197 | <td>85.89</td> | |
1198 | <td>84.44</td> | |
1199 | <td>83</td> | |
1200 | </tr> | |
1201 | <tr><td>list-pull-soa</td> | |
1202 | <td>62.12</td> | |
1203 | <td>63.61</td> | |
1204 | <td>63.28</td> | |
1205 | <td>61.32</td> | |
1206 | <td>66.72</td> | |
1207 | <td>62.65</td> | |
1208 | <td>64.82</td> | |
1209 | <td>60.49</td> | |
1210 | <td>58.01</td> | |
1211 | <td>64.46</td> | |
1212 | <td>62.52</td> | |
1213 | <td>83</td> | |
1214 | </tr> | |
1215 | <tr><td>list-pull-split-nt-1s-soa</td> | |
1216 | <td>121.35</td> | |
1217 | <td>113.77</td> | |
1218 | <td>115.29</td> | |
1219 | <td>113.54</td> | |
1220 | <td>117.00</td> | |
1221 | <td>116.46</td> | |
1222 | <td>114.78</td> | |
1223 | <td>114.54</td> | |
1224 | <td>110.83</td> | |
1225 | <td>112.67</td> | |
1226 | <td>117.85</td> | |
1227 | <td>125</td> | |
1228 | </tr> | |
1229 | <tr><td>list-pull-split-nt-2s-soa</td> | |
1230 | <td>118.09</td> | |
1231 | <td>110.48</td> | |
1232 | <td>112.55</td> | |
1233 | <td>113.18</td> | |
1234 | <td>113.44</td> | |
1235 | <td>111.85</td> | |
1236 | <td>109.27</td> | |
1237 | <td>114.41</td> | |
1238 | <td>110.28</td> | |
1239 | <td>111.78</td> | |
1240 | <td>113.74</td> | |
1241 | <td>125</td> | |
1242 | </tr> | |
1243 | <tr><td>list-aa-aos</td> | |
1244 | <td>121.28</td> | |
1245 | <td>118.63</td> | |
1246 | <td>119.00</td> | |
1247 | <td>118.50</td> | |
1248 | <td>121.99</td> | |
1249 | <td>119.11</td> | |
1250 | <td>118.83</td> | |
1251 | <td>121.47</td> | |
1252 | <td>121.62</td> | |
1253 | <td>126.18</td> | |
1254 | <td>120.12</td> | |
1255 | <td>129</td> | |
1256 | </tr> | |
1257 | <tr><td>list-aa-soa</td> | |
1258 | <td>126.34</td> | |
1259 | <td>116.90</td> | |
1260 | <td>129.45</td> | |
1261 | <td>127.12</td> | |
1262 | <td>129.41</td> | |
1263 | <td>121.42</td> | |
1264 | <td>126.19</td> | |
1265 | <td>126.76</td> | |
1266 | <td>126.70</td> | |
1267 | <td>124.40</td> | |
1268 | <td>125.22</td> | |
1269 | <td>129</td> | |
1270 | </tr> | |
1271 | <tr><td>list-aa-ria-soa</td> | |
1272 | <td>133.68</td> | |
1273 | <td>121.82</td> | |
1274 | <td>126.04</td> | |
1275 | <td>128.46</td> | |
1276 | <td>131.15</td> | |
1277 | <td>132.25</td> | |
1278 | <td>128.78</td> | |
1279 | <td>133.50</td> | |
1280 | <td>126.69</td> | |
1281 | <td>124.40</td> | |
1282 | <td>130.37</td> | |
1283 | <td>145</td> | |
1284 | </tr> | |
1285 | <tr><td>list-aa-pv-soa</td> | |
1286 | <td>146.22</td> | |
1287 | <td>124.39</td> | |
1288 | <td>130.73</td> | |
1289 | <td>136.29</td> | |
1290 | <td>137.61</td> | |
1291 | <td>131.21</td> | |
1292 | <td>138.65</td> | |
1293 | <td>138.78</td> | |
1294 | <td>127.02</td> | |
1295 | <td>132.40</td> | |
1296 | <td>138.37</td> | |
1297 | <td>145</td> | |
1298 | </tr> | |
1299 | </tbody> | |
1300 | </table> | |
1301 | </div> | |
1302 | <div class="section" id="broadwell-intel-xeon-e5-2630-v4"> | |
1303 | <h2><a class="toc-backref" href="#id24">6.2 Broadwell, Intel Xeon E5-2630 v4</a></h2> | |
1304 | <ul class="simple"> | |
1305 | <li>Broadwell architecture, AVX2, FMA</li> | |
1306 | <li>10 cores, 2.2 GHz</li> | |
1307 | <li>SMT disabled</li> | |
1308 | </ul> | |
1309 | <p>memory bandwidth:</p> | |
1310 | <ul class="simple"> | |
1311 | <li>copy-19 48.0 GB/s</li> | |
1312 | <li>copy-19-nt-sl 48.2 GB/s</li> | |
1313 | <li>update-19 51.1 GB/s</li> | |
1314 | </ul> | |
1315 | <p>geometry dimensions: 500x100x100</p> | |
1316 | <table border="1" class="docutils"> | |
1317 | <colgroup> | |
1318 | <col width="19%" /> | |
1319 | <col width="7%" /> | |
1320 | <col width="7%" /> | |
1321 | <col width="7%" /> | |
1322 | <col width="7%" /> | |
1323 | <col width="7%" /> | |
1324 | <col width="7%" /> | |
1325 | <col width="7%" /> | |
1326 | <col width="7%" /> | |
1327 | <col width="7%" /> | |
1328 | <col width="7%" /> | |
1329 | <col width="7%" /> | |
1330 | <col width="5%" /> | |
1331 | </colgroup> | |
1332 | <thead valign="bottom"> | |
1333 | <tr><th class="head">kernel</th> | |
1334 | <th class="head">pipe</th> | |
1335 | <th class="head">blocks-2</th> | |
1336 | <th class="head">blocks-4</th> | |
1337 | <th class="head">blocks-6</th> | |
1338 | <th class="head">blocks-8</th> | |
1339 | <th class="head">blocks-10</th> | |
1340 | <th class="head">blocks-15</th> | |
1341 | <th class="head">blocks-16</th> | |
1342 | <th class="head">blocks-20</th> | |
1343 | <th class="head">blocks-25</th> | |
1344 | <th class="head">blocks-32</th> | |
1345 | <th class="head">RFM</th> | |
1346 | </tr> | |
1347 | </thead> | |
1348 | <tbody valign="top"> | |
1349 | <tr><td>blk-push-aos</td> | |
1350 | <td>55.75</td> | |
1351 | <td>47.62</td> | |
1352 | <td>54.57</td> | |
1353 | <td>57.10</td> | |
1354 | <td>58.49</td> | |
1355 | <td>59.00</td> | |
1356 | <td>61.72</td> | |
1357 | <td>60.56</td> | |
1358 | <td>64.05</td> | |
1359 | <td>61.10</td> | |
1360 | <td>66.03</td> | |
1361 | <td>105</td> | |
1362 | </tr> | |
1363 | <tr><td>blk-push-soa</td> | |
1364 | <td>30.06</td> | |
1365 | <td>31.09</td> | |
1366 | <td>32.13</td> | |
1367 | <td>32.54</td> | |
1368 | <td>32.74</td> | |
1369 | <td>32.72</td> | |
1370 | <td>33.81</td> | |
1371 | <td>33.19</td> | |
1372 | <td>34.90</td> | |
1373 | <td>33.21</td> | |
1374 | <td>35.75</td> | |
1375 | <td>105</td> | |
1376 | </tr> | |
1377 | <tr><td>blk-pull-aos</td> | |
1378 | <td>53.80</td> | |
1379 | <td>48.61</td> | |
1380 | <td>53.08</td> | |
1381 | <td>54.99</td> | |
1382 | <td>56.08</td> | |
1383 | <td>56.68</td> | |
1384 | <td>59.20</td> | |
1385 | <td>58.12</td> | |
1386 | <td>61.49</td> | |
1387 | <td>58.71</td> | |
1388 | <td>63.45</td> | |
1389 | <td>105</td> | |
1390 | </tr> | |
1391 | <tr><td>blk-pull-soa</td> | |
1392 | <td>46.96</td> | |
1393 | <td>46.61</td> | |
1394 | <td>48.84</td> | |
1395 | <td>49.70</td> | |
1396 | <td>50.33</td> | |
1397 | <td>50.46</td> | |
1398 | <td>52.36</td> | |
1399 | <td>51.39</td> | |
1400 | <td>54.20</td> | |
1401 | <td>51.61</td> | |
1402 | <td>55.71</td> | |
1403 | <td>105</td> | |
1404 | </tr> | |
1405 | <tr><td>aa-aos</td> | |
1406 | <td>91.40</td> | |
1407 | <td>66.99</td> | |
1408 | <td>78.47</td> | |
1409 | <td>83.38</td> | |
1410 | <td>86.62</td> | |
1411 | <td>88.62</td> | |
1412 | <td>92.98</td> | |
1413 | <td>91.54</td> | |
1414 | <td>97.08</td> | |
1415 | <td>94.93</td> | |
1416 | <td>98.90</td> | |
1417 | <td>168</td> | |
1418 | </tr> | |
1419 | <tr><td>aa-soa</td> | |
1420 | <td>83.01</td> | |
1421 | <td>69.96</td> | |
1422 | <td>75.85</td> | |
1423 | <td>77.72</td> | |
1424 | <td>79.01</td> | |
1425 | <td>79.29</td> | |
1426 | <td>82.38</td> | |
1427 | <td>80.11</td> | |
1428 | <td>85.70</td> | |
1429 | <td>83.91</td> | |
1430 | <td>87.69</td> | |
1431 | <td>168</td> | |
1432 | </tr> | |
1433 | <tr><td>aa-vec-soa</td> | |
1434 | <td>112.03</td> | |
1435 | <td>96.52</td> | |
1436 | <td>105.32</td> | |
1437 | <td>109.76</td> | |
1438 | <td>112.55</td> | |
1439 | <td>113.82</td> | |
1440 | <td>120.55</td> | |
1441 | <td>118.37</td> | |
1442 | <td>126.30</td> | |
1443 | <td>121.37</td> | |
1444 | <td>131.94</td> | |
1445 | <td>168</td> | |
1446 | </tr> | |
1447 | <tr><td>list-push-aos</td> | |
1448 | <td>75.13</td> | |
1449 | <td>74.18</td> | |
1450 | <td>75.20</td> | |
1451 | <td>75.42</td> | |
1452 | <td>75.24</td> | |
1453 | <td>75.99</td> | |
1454 | <td>75.80</td> | |
1455 | <td>75.80</td> | |
1456 | <td>75.54</td> | |
1457 | <td>76.22</td> | |
1458 | <td>76.21</td> | |
1459 | <td>97</td> | |
1460 | </tr> | |
1461 | <tr><td>list-push-soa</td> | |
1462 | <td>40.99</td> | |
1463 | <td>38.14</td> | |
1464 | <td>39.00</td> | |
1465 | <td>38.89</td> | |
1466 | <td>38.89</td> | |
1467 | <td>39.67</td> | |
1468 | <td>39.87</td> | |
1469 | <td>39.28</td> | |
1470 | <td>39.35</td> | |
1471 | <td>40.08</td> | |
1472 | <td>40.13</td> | |
1473 | <td>97</td> | |
1474 | </tr> | |
1475 | <tr><td>list-pull-aos</td> | |
1476 | <td>82.07</td> | |
1477 | <td>82.88</td> | |
1478 | <td>83.29</td> | |
1479 | <td>83.09</td> | |
1480 | <td>83.32</td> | |
1481 | <td>83.49</td> | |
1482 | <td>82.82</td> | |
1483 | <td>82.88</td> | |
1484 | <td>83.32</td> | |
1485 | <td>82.60</td> | |
1486 | <td>82.93</td> | |
1487 | <td>97</td> | |
1488 | </tr> | |
1489 | <tr><td>list-pull-soa</td> | |
1490 | <td>62.07</td> | |
1491 | <td>60.40</td> | |
1492 | <td>61.89</td> | |
1493 | <td>61.39</td> | |
1494 | <td>62.43</td> | |
1495 | <td>60.90</td> | |
1496 | <td>60.48</td> | |
1497 | <td>62.80</td> | |
1498 | <td>62.50</td> | |
1499 | <td>61.10</td> | |
1500 | <td>60.38</td> | |
1501 | <td>97</td> | |
1502 | </tr> | |
1503 | <tr><td>list-pull-split-nt-1s-soa</td> | |
1504 | <td>125.81</td> | |
1505 | <td>120.60</td> | |
1506 | <td>121.96</td> | |
1507 | <td>122.34</td> | |
1508 | <td>122.86</td> | |
1509 | <td>123.53</td> | |
1510 | <td>123.64</td> | |
1511 | <td>123.67</td> | |
1512 | <td>125.94</td> | |
1513 | <td>124.09</td> | |
1514 | <td>123.69</td> | |
1515 | <td>128</td> | |
1516 | </tr> | |
1517 | <tr><td>list-pull-split-nt-2s-soa</td> | |
1518 | <td>122.79</td> | |
1519 | <td>117.16</td> | |
1520 | <td>118.86</td> | |
1521 | <td>119.16</td> | |
1522 | <td>119.56</td> | |
1523 | <td>119.99</td> | |
1524 | <td>120.01</td> | |
1525 | <td>120.03</td> | |
1526 | <td>122.64</td> | |
1527 | <td>120.57</td> | |
1528 | <td>120.39</td> | |
1529 | <td>128</td> | |
1530 | </tr> | |
1531 | <tr><td>list-aa-aos</td> | |
1532 | <td>128.13</td> | |
1533 | <td>127.41</td> | |
1534 | <td>129.31</td> | |
1535 | <td>129.07</td> | |
1536 | <td>129.79</td> | |
1537 | <td>129.63</td> | |
1538 | <td>129.67</td> | |
1539 | <td>129.94</td> | |
1540 | <td>129.12</td> | |
1541 | <td>128.41</td> | |
1542 | <td>129.72</td> | |
1543 | <td>150</td> | |
1544 | </tr> | |
1545 | <tr><td>list-aa-soa</td> | |
1546 | <td>141.60</td> | |
1547 | <td>139.78</td> | |
1548 | <td>141.58</td> | |
1549 | <td>142.16</td> | |
1550 | <td>141.94</td> | |
1551 | <td>141.31</td> | |
1552 | <td>142.37</td> | |
1553 | <td>142.25</td> | |
1554 | <td>142.43</td> | |
1555 | <td>141.40</td> | |
1556 | <td>142.26</td> | |
1557 | <td>150</td> | |
1558 | </tr> | |
1559 | <tr><td>list-aa-ria-soa</td> | |
1560 | <td>141.82</td> | |
1561 | <td>134.88</td> | |
1562 | <td>140.15</td> | |
1563 | <td>140.72</td> | |
1564 | <td>141.67</td> | |
1565 | <td>140.51</td> | |
1566 | <td>141.18</td> | |
1567 | <td>141.29</td> | |
1568 | <td>142.97</td> | |
1569 | <td>141.94</td> | |
1570 | <td>143.25</td> | |
1571 | <td>168</td> | |
1572 | </tr> | |
1573 | <tr><td>list-aa-pv-soa</td> | |
1574 | <td>164.79</td> | |
1575 | <td>140.95</td> | |
1576 | <td>159.24</td> | |
1577 | <td>161.78</td> | |
1578 | <td>162.40</td> | |
1579 | <td>163.04</td> | |
1580 | <td>164.69</td> | |
1581 | <td>164.38</td> | |
1582 | <td>165.11</td> | |
1583 | <td>165.75</td> | |
1584 | <td>166.09</td> | |
1585 | <td>168</td> | |
1586 | </tr> | |
1587 | </tbody> | |
1588 | </table> | |
1589 | </div> | |
1590 | <div class="section" id="skylake-intel-xeon-gold-6148"> | |
1591 | <h2><a class="toc-backref" href="#id25">6.3 Skylake, Intel Xeon Gold 6148</a></h2> | |
1592 | <ul class="simple"> | |
1593 | <li>Skylake architecture, AVX2, FMA, AVX512</li> | |
1594 | <li>20 cores, 2.4 GHz</li> | |
1595 | <li>SMT enabled</li> | |
1596 | </ul> | |
1597 | <p>memory bandwidth:</p> | |
1598 | <ul class="simple"> | |
1599 | <li>copy-19 89.7 GB/s</li> | |
1600 | <li>copy-19-nt-sl 92.4 GB/s</li> | |
1601 | <li>update-19 93.6 GB/s</li> | |
1602 | </ul> | |
1603 | <p>geometry dimensions: 500x100x100</p> | |
1604 | <table border="1" class="docutils"> | |
1605 | <colgroup> | |
1606 | <col width="20%" /> | |
1607 | <col width="7%" /> | |
1608 | <col width="7%" /> | |
1609 | <col width="7%" /> | |
1610 | <col width="7%" /> | |
1611 | <col width="7%" /> | |
1612 | <col width="7%" /> | |
1613 | <col width="7%" /> | |
1614 | <col width="7%" /> | |
1615 | <col width="7%" /> | |
1616 | <col width="7%" /> | |
1617 | <col width="7%" /> | |
1618 | <col width="2%" /> | |
1619 | </colgroup> | |
1620 | <thead valign="bottom"> | |
1621 | <tr><th class="head">kernel</th> | |
1622 | <th class="head">pipe</th> | |
1623 | <th class="head">blocks-2</th> | |
1624 | <th class="head">blocks-4</th> | |
1625 | <th class="head">blocks-6</th> | |
1626 | <th class="head">blocks-8</th> | |
1627 | <th class="head">blocks-10</th> | |
1628 | <th class="head">blocks-15</th> | |
1629 | <th class="head">blocks-16</th> | |
1630 | <th class="head">blocks-20</th> | |
1631 | <th class="head">blocks-25</th> | |
1632 | <th class="head">blocks-32</th> | |
1633 | <th class="head">RFM</th> | |
1634 | </tr> | |
1635 | </thead> | |
1636 | <tbody valign="top"> | |
1637 | <tr><td>blk-push-aos</td> | |
1638 | <td>113.01</td> | |
1639 | <td>93.99</td> | |
1640 | <td>108.98</td> | |
1641 | <td>114.65</td> | |
1642 | <td>117.87</td> | |
1643 | <td>119.47</td> | |
1644 | <td>124.95</td> | |
1645 | <td>122.46</td> | |
1646 | <td>129.29</td> | |
1647 | <td>123.87</td> | |
1648 | <td>133.01</td> | |
1649 | <td>197</td> | |
1650 | </tr> | |
1651 | <tr><td>blk-push-soa</td> | |
1652 | <td>100.21</td> | |
1653 | <td>98.87</td> | |
1654 | <td>103.63</td> | |
1655 | <td>105.56</td> | |
1656 | <td>107.02</td> | |
1657 | <td>107.27</td> | |
1658 | <td>111.61</td> | |
1659 | <td>109.83</td> | |
1660 | <td>116.16</td> | |
1661 | <td>110.51</td> | |
1662 | <td>110.29</td> | |
1663 | <td>197</td> | |
1664 | </tr> | |
1665 | <tr><td>blk-pull-aos</td> | |
1666 | <td>118.45</td> | |
1667 | <td>102.54</td> | |
1668 | <td>114.12</td> | |
1669 | <td>117.82</td> | |
1670 | <td>122.69</td> | |
1671 | <td>124.31</td> | |
1672 | <td>130.58</td> | |
1673 | <td>127.85</td> | |
1674 | <td>135.72</td> | |
1675 | <td>129.65</td> | |
1676 | <td>139.94</td> | |
1677 | <td>197</td> | |
1678 | </tr> | |
1679 | <tr><td>blk-pull-soa</td> | |
1680 | <td>82.60</td> | |
1681 | <td>83.36</td> | |
1682 | <td>87.13</td> | |
1683 | <td>88.39</td> | |
1684 | <td>88.84</td> | |
1685 | <td>88.96</td> | |
1686 | <td>92.48</td> | |
1687 | <td>90.93</td> | |
1688 | <td>95.79</td> | |
1689 | <td>91.92</td> | |
1690 | <td>98.64</td> | |
1691 | <td>197</td> | |
1692 | </tr> | |
1693 | <tr><td>aa-aos</td> | |
1694 | <td>171.32</td> | |
1695 | <td>125.43</td> | |
1696 | <td>147.73</td> | |
1697 | <td>157.70</td> | |
1698 | <td>163.35</td> | |
1699 | <td>167.25</td> | |
1700 | <td>175.39</td> | |
1701 | <td>174.20</td> | |
1702 | <td>182.54</td> | |
1703 | <td>173.67</td> | |
1704 | <td>187.76</td> | |
1705 | <td>308</td> | |
1706 | </tr> | |
1707 | <tr><td>aa-soa</td> | |
1708 | <td>180.85</td> | |
1709 | <td>152.39</td> | |
1710 | <td>165.84</td> | |
1711 | <td>152.59</td> | |
1712 | <td>171.90</td> | |
1713 | <td>175.76</td> | |
1714 | <td>184.94</td> | |
1715 | <td>182.34</td> | |
1716 | <td>189.43</td> | |
1717 | <td>180.30</td> | |
1718 | <td>193.54</td> | |
1719 | <td>308</td> | |
1720 | </tr> | |
1721 | <tr><td>aa-vec-soa</td> | |
1722 | <td>208.03</td> | |
1723 | <td>181.51</td> | |
1724 | <td>195.86</td> | |
1725 | <td>203.41</td> | |
1726 | <td>209.08</td> | |
1727 | <td>212.34</td> | |
1728 | <td>224.05</td> | |
1729 | <td>219.49</td> | |
1730 | <td>234.31</td> | |
1731 | <td>225.92</td> | |
1732 | <td>245.22</td> | |
1733 | <td>308</td> | |
1734 | </tr> | |
1735 | <tr><td>list-push-aos</td> | |
1736 | <td>158.81</td> | |
1737 | <td>164.67</td> | |
1738 | <td>162.93</td> | |
1739 | <td>163.05</td> | |
1740 | <td>165.22</td> | |
1741 | <td>164.31</td> | |
1742 | <td>164.66</td> | |
1743 | <td>160.78</td> | |
1744 | <td>164.07</td> | |
1745 | <td>165.19</td> | |
1746 | <td>164.06</td> | |
1747 | <td>177</td> | |
1748 | </tr> | |
1749 | <tr><td>list-push-soa</td> | |
1750 | <td>134.60</td> | |
1751 | <td>110.44</td> | |
1752 | <td>110.17</td> | |
1753 | <td>132.01</td> | |
1754 | <td>132.95</td> | |
1755 | <td>133.46</td> | |
1756 | <td>134.37</td> | |
1757 | <td>134.33</td> | |
1758 | <td>135.12</td> | |
1759 | <td>134.91</td> | |
1760 | <td>137.87</td> | |
1761 | <td>177</td> | |
1762 | </tr> | |
1763 | <tr><td>list-pull-aos</td> | |
1764 | <td>169.61</td> | |
1765 | <td>170.03</td> | |
1766 | <td>170.89</td> | |
1767 | <td>170.90</td> | |
1768 | <td>171.20</td> | |
1769 | <td>171.60</td> | |
1770 | <td>172.09</td> | |
1771 | <td>171.95</td> | |
1772 | <td>169.48</td> | |
1773 | <td>172.08</td> | |
1774 | <td>171.02</td> | |
1775 | <td>177</td> | |
1776 | </tr> | |
1777 | <tr><td>list-pull-soa</td> | |
1778 | <td>120.50</td> | |
1779 | <td>116.73</td> | |
1780 | <td>118.62</td> | |
1781 | <td>118.00</td> | |
1782 | <td>120.99</td> | |
1783 | <td>118.15</td> | |
1784 | <td>117.17</td> | |
1785 | <td>121.41</td> | |
1786 | <td>120.83</td> | |
1787 | <td>120.00</td> | |
1788 | <td>118.74</td> | |
1789 | <td>177</td> | |
1790 | </tr> | |
1791 | <tr><td>list-pull-split-nt-1s-soa</td> | |
1792 | <td>225.59</td> | |
1793 | <td>224.18</td> | |
1794 | <td>225.10</td> | |
1795 | <td>226.34</td> | |
1796 | <td>226.01</td> | |
1797 | <td>230.37</td> | |
1798 | <td>227.50</td> | |
1799 | <td>228.42</td> | |
1800 | <td>227.39</td> | |
1801 | <td>231.65</td> | |
1802 | <td>227.35</td> | |
1803 | <td>246</td> | |
1804 | </tr> | |
1805 | <tr><td>list-pull-split-nt-2s-soa</td> | |
1806 | <td>219.20</td> | |
1807 | <td>214.63</td> | |
1808 | <td>217.61</td> | |
1809 | <td>218.13</td> | |
1810 | <td>219.07</td> | |
1811 | <td>221.01</td> | |
1812 | <td>219.88</td> | |
1813 | <td>220.09</td> | |
1814 | <td>220.62</td> | |
1815 | <td>221.68</td> | |
1816 | <td>220.58</td> | |
1817 | <td>246</td> | |
1818 | </tr> | |
1819 | <tr><td>list-aa-aos</td> | |
1820 | <td>241.39</td> | |
1821 | <td>239.27</td> | |
1822 | <td>239.53</td> | |
1823 | <td>242.56</td> | |
1824 | <td>242.46</td> | |
1825 | <td>243.00</td> | |
1826 | <td>242.91</td> | |
1827 | <td>242.46</td> | |
1828 | <td>241.24</td> | |
1829 | <td>242.96</td> | |
1830 | <td>241.52</td> | |
1831 | <td>275</td> | |
1832 | </tr> | |
1833 | <tr><td>list-aa-soa</td> | |
1834 | <td>273.73</td> | |
1835 | <td>268.49</td> | |
1836 | <td>268.48</td> | |
1837 | <td>271.79</td> | |
1838 | <td>275.29</td> | |
1839 | <td>274.56</td> | |
1840 | <td>277.18</td> | |
1841 | <td>272.67</td> | |
1842 | <td>274.21</td> | |
1843 | <td>275.24</td> | |
1844 | <td>278.21</td> | |
1845 | <td>275</td> | |
1846 | </tr> | |
1847 | <tr><td>list-aa-ria-soa</td> | |
1848 | <td>288.42</td> | |
1849 | <td>261.89</td> | |
1850 | <td>273.26</td> | |
1851 | <td>284.84</td> | |
1852 | <td>283.88</td> | |
1853 | <td>288.29</td> | |
1854 | <td>290.72</td> | |
1855 | <td>289.81</td> | |
1856 | <td>293.36</td> | |
1857 | <td>290.75</td> | |
1858 | <td>292.93</td> | |
1859 | <td>308</td> | |
1860 | </tr> | |
1861 | <tr><td>list-aa-pv-soa</td> | |
1862 | <td>303.35</td> | |
1863 | <td>267.21</td> | |
1864 | <td>289.18</td> | |
1865 | <td>294.96</td> | |
1866 | <td>294.36</td> | |
1867 | <td>298.16</td> | |
1868 | <td>300.45</td> | |
1869 | <td>301.71</td> | |
1870 | <td>302.37</td> | |
1871 | <td>302.88</td> | |
1872 | <td>304.46</td> | |
1873 | <td>308</td> | |
1874 | </tr> | |
1875 | </tbody> | |
1876 | </table> | |
e3f82424 | 1877 | </div> |
e3f82424 MW |
1878 | </div> |
1879 | <div class="section" id="licence"> | |
0095f461 | 1880 | <h1><a class="toc-backref" href="#id26">7 Licence</a></h1> |
e3f82424 | 1881 | <p>The Lattice Boltzmann Benchmark Kernels are licensed under GPLv3.</p> |
ecf590ae MW |
1882 | </div> |
1883 | <div class="section" id="acknowledgements"> | |
0095f461 | 1884 | <h1><a class="toc-backref" href="#id27">8 Acknowledgements</a></h1> |
ecf590ae MW |
1885 | <p>This work was funded by BMBF, grant no. 01IH15003A (project SKAMPY).</p> |
1886 | <p>This work was funded by KONWIHR project OMI4PAPS.</p> | |
0095f461 MW |
1887 | </div> |
1888 | <div class="section" id="bibliography"> | |
1889 | <h1><a class="toc-backref" href="#id28">9 Bibliography</a></h1> | |
1890 | <table class="docutils citation" frame="void" id="ginzburg-2008" rules="none"> | |
1891 | <colgroup><col class="label" /><col /></colgroup> | |
1892 | <tbody valign="top"> | |
1893 | <tr><td class="label"><a class="fn-backref" href="#id1">[ginzburg-2008]</a></td><td>I. Ginzburg, F. Verhaeghe, and D. d'Humières. | |
1894 | Two-relaxation-time lattice Boltzmann scheme: About parametrization, velocity, pressure and mixed boundary conditions. | |
1895 | Commun. Comput. Phys., 3(2):427-478, 2008.</td></tr> | |
1896 | </tbody> | |
1897 | </table> | |
1898 | <table class="docutils citation" frame="void" id="williams-2008" rules="none"> | |
1899 | <colgroup><col class="label" /><col /></colgroup> | |
1900 | <tbody valign="top"> | |
1901 | <tr><td class="label"><a class="fn-backref" href="#id3">[williams-2008]</a></td><td>S. Williams, A. Waterman, and D. Patterson. | |
1902 | Roofline: an insightful visual performance model for multicore architectures. | |
1903 | Commun. ACM, 52(4):65-76, Apr 2009. doi:10.1145/1498765.1498785</td></tr> | |
1904 | </tbody> | |
1905 | </table> | |
1906 | <p>Document was generated at 2017-11-21 15:43.</p> | |
ecf590ae MW |
1907 | </div> |
1908 | </div> | |
1909 | </body> | |
1910 | </html> |