| 1 | <?xml version="1.0" encoding="utf-8" ?> |
| 2 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> |
| 3 | <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> |
| 4 | <head> |
| 5 | <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> |
| 6 | <meta name="generator" content="Docutils 0.12: http://docutils.sourceforge.net/" /> |
| 7 | <title>LBM Benchmark Kernels Documentation</title> |
| 8 | <style type="text/css"> |
| 9 | |
| 10 | /* |
| 11 | :Author: David Goodger (goodger@python.org) |
| 12 | :Id: $Id: html4css1.css 7614 2013-02-21 15:55:51Z milde $ |
| 13 | :Copyright: This stylesheet has been placed in the public domain. |
| 14 | |
| 15 | Default cascading style sheet for the HTML output of Docutils. |
| 16 | |
| 17 | See http://docutils.sf.net/docs/howto/html-stylesheets.html for how to |
| 18 | customize this style sheet. |
| 19 | */ |
| 20 | |
| 21 | /* used to remove borders from tables and images */ |
| 22 | .borderless, table.borderless td, table.borderless th { |
| 23 | border: 0 } |
| 24 | |
| 25 | table.borderless td, table.borderless th { |
| 26 | /* Override padding for "table.docutils td" with "! important". |
| 27 | The right padding separates the table cells. */ |
| 28 | padding: 0 0.5em 0 0 ! important } |
| 29 | |
| 30 | .first { |
| 31 | /* Override more specific margin styles with "! important". */ |
| 32 | margin-top: 0 ! important } |
| 33 | |
| 34 | .last, .with-subtitle { |
| 35 | margin-bottom: 0 ! important } |
| 36 | |
| 37 | .hidden { |
| 38 | display: none } |
| 39 | |
| 40 | a.toc-backref { |
| 41 | text-decoration: none ; |
| 42 | color: black } |
| 43 | |
| 44 | blockquote.epigraph { |
| 45 | margin: 2em 5em ; } |
| 46 | |
| 47 | dl.docutils dd { |
| 48 | margin-bottom: 0.5em } |
| 49 | |
| 50 | object[type="image/svg+xml"], object[type="application/x-shockwave-flash"] { |
| 51 | overflow: hidden; |
| 52 | } |
| 53 | |
| 54 | /* Uncomment (and remove this text!) to get bold-faced definition list terms |
| 55 | dl.docutils dt { |
| 56 | font-weight: bold } |
| 57 | */ |
| 58 | |
| 59 | div.abstract { |
| 60 | margin: 2em 5em } |
| 61 | |
| 62 | div.abstract p.topic-title { |
| 63 | font-weight: bold ; |
| 64 | text-align: center } |
| 65 | |
| 66 | div.admonition, div.attention, div.caution, div.danger, div.error, |
| 67 | div.hint, div.important, div.note, div.tip, div.warning { |
| 68 | margin: 2em ; |
| 69 | border: medium outset ; |
| 70 | padding: 1em } |
| 71 | |
| 72 | div.admonition p.admonition-title, div.hint p.admonition-title, |
| 73 | div.important p.admonition-title, div.note p.admonition-title, |
| 74 | div.tip p.admonition-title { |
| 75 | font-weight: bold ; |
| 76 | font-family: sans-serif } |
| 77 | |
| 78 | div.attention p.admonition-title, div.caution p.admonition-title, |
| 79 | div.danger p.admonition-title, div.error p.admonition-title, |
| 80 | div.warning p.admonition-title, .code .error { |
| 81 | color: red ; |
| 82 | font-weight: bold ; |
| 83 | font-family: sans-serif } |
| 84 | |
| 85 | /* Uncomment (and remove this text!) to get reduced vertical space in |
| 86 | compound paragraphs. |
| 87 | div.compound .compound-first, div.compound .compound-middle { |
| 88 | margin-bottom: 0.5em } |
| 89 | |
| 90 | div.compound .compound-last, div.compound .compound-middle { |
| 91 | margin-top: 0.5em } |
| 92 | */ |
| 93 | |
| 94 | div.dedication { |
| 95 | margin: 2em 5em ; |
| 96 | text-align: center ; |
| 97 | font-style: italic } |
| 98 | |
| 99 | div.dedication p.topic-title { |
| 100 | font-weight: bold ; |
| 101 | font-style: normal } |
| 102 | |
| 103 | div.figure { |
| 104 | margin-left: 2em ; |
| 105 | margin-right: 2em } |
| 106 | |
| 107 | div.footer, div.header { |
| 108 | clear: both; |
| 109 | font-size: smaller } |
| 110 | |
| 111 | div.line-block { |
| 112 | display: block ; |
| 113 | margin-top: 1em ; |
| 114 | margin-bottom: 1em } |
| 115 | |
| 116 | div.line-block div.line-block { |
| 117 | margin-top: 0 ; |
| 118 | margin-bottom: 0 ; |
| 119 | margin-left: 1.5em } |
| 120 | |
| 121 | div.sidebar { |
| 122 | margin: 0 0 0.5em 1em ; |
| 123 | border: medium outset ; |
| 124 | padding: 1em ; |
| 125 | background-color: #ffffee ; |
| 126 | width: 40% ; |
| 127 | float: right ; |
| 128 | clear: right } |
| 129 | |
| 130 | div.sidebar p.rubric { |
| 131 | font-family: sans-serif ; |
| 132 | font-size: medium } |
| 133 | |
| 134 | div.system-messages { |
| 135 | margin: 5em } |
| 136 | |
| 137 | div.system-messages h1 { |
| 138 | color: red } |
| 139 | |
| 140 | div.system-message { |
| 141 | border: medium outset ; |
| 142 | padding: 1em } |
| 143 | |
| 144 | div.system-message p.system-message-title { |
| 145 | color: red ; |
| 146 | font-weight: bold } |
| 147 | |
| 148 | div.topic { |
| 149 | margin: 2em } |
| 150 | |
| 151 | h1.section-subtitle, h2.section-subtitle, h3.section-subtitle, |
| 152 | h4.section-subtitle, h5.section-subtitle, h6.section-subtitle { |
| 153 | margin-top: 0.4em } |
| 154 | |
| 155 | h1.title { |
| 156 | text-align: center } |
| 157 | |
| 158 | h2.subtitle { |
| 159 | text-align: center } |
| 160 | |
| 161 | hr.docutils { |
| 162 | width: 75% } |
| 163 | |
| 164 | img.align-left, .figure.align-left, object.align-left { |
| 165 | clear: left ; |
| 166 | float: left ; |
| 167 | margin-right: 1em } |
| 168 | |
| 169 | img.align-right, .figure.align-right, object.align-right { |
| 170 | clear: right ; |
| 171 | float: right ; |
| 172 | margin-left: 1em } |
| 173 | |
| 174 | img.align-center, .figure.align-center, object.align-center { |
| 175 | display: block; |
| 176 | margin-left: auto; |
| 177 | margin-right: auto; |
| 178 | } |
| 179 | |
| 180 | .align-left { |
| 181 | text-align: left } |
| 182 | |
| 183 | .align-center { |
| 184 | clear: both ; |
| 185 | text-align: center } |
| 186 | |
| 187 | .align-right { |
| 188 | text-align: right } |
| 189 | |
| 190 | /* reset inner alignment in figures */ |
| 191 | div.align-right { |
| 192 | text-align: inherit } |
| 193 | |
| 194 | /* div.align-center * { */ |
| 195 | /* text-align: left } */ |
| 196 | |
| 197 | ol.simple, ul.simple { |
| 198 | margin-bottom: 1em } |
| 199 | |
| 200 | ol.arabic { |
| 201 | list-style: decimal } |
| 202 | |
| 203 | ol.loweralpha { |
| 204 | list-style: lower-alpha } |
| 205 | |
| 206 | ol.upperalpha { |
| 207 | list-style: upper-alpha } |
| 208 | |
| 209 | ol.lowerroman { |
| 210 | list-style: lower-roman } |
| 211 | |
| 212 | ol.upperroman { |
| 213 | list-style: upper-roman } |
| 214 | |
| 215 | p.attribution { |
| 216 | text-align: right ; |
| 217 | margin-left: 50% } |
| 218 | |
| 219 | p.caption { |
| 220 | font-style: italic } |
| 221 | |
| 222 | p.credits { |
| 223 | font-style: italic ; |
| 224 | font-size: smaller } |
| 225 | |
| 226 | p.label { |
| 227 | white-space: nowrap } |
| 228 | |
| 229 | p.rubric { |
| 230 | font-weight: bold ; |
| 231 | font-size: larger ; |
| 232 | color: maroon ; |
| 233 | text-align: center } |
| 234 | |
| 235 | p.sidebar-title { |
| 236 | font-family: sans-serif ; |
| 237 | font-weight: bold ; |
| 238 | font-size: larger } |
| 239 | |
| 240 | p.sidebar-subtitle { |
| 241 | font-family: sans-serif ; |
| 242 | font-weight: bold } |
| 243 | |
| 244 | p.topic-title { |
| 245 | font-weight: bold } |
| 246 | |
| 247 | pre.address { |
| 248 | margin-bottom: 0 ; |
| 249 | margin-top: 0 ; |
| 250 | font: inherit } |
| 251 | |
| 252 | pre.literal-block, pre.doctest-block, pre.math, pre.code { |
| 253 | margin-left: 2em ; |
| 254 | margin-right: 2em } |
| 255 | |
| 256 | pre.code .ln { color: grey; } /* line numbers */ |
| 257 | pre.code, code { background-color: #eeeeee } |
| 258 | pre.code .comment, code .comment { color: #5C6576 } |
| 259 | pre.code .keyword, code .keyword { color: #3B0D06; font-weight: bold } |
| 260 | pre.code .literal.string, code .literal.string { color: #0C5404 } |
| 261 | pre.code .name.builtin, code .name.builtin { color: #352B84 } |
| 262 | pre.code .deleted, code .deleted { background-color: #DEB0A1} |
| 263 | pre.code .inserted, code .inserted { background-color: #A3D289} |
| 264 | |
| 265 | span.classifier { |
| 266 | font-family: sans-serif ; |
| 267 | font-style: oblique } |
| 268 | |
| 269 | span.classifier-delimiter { |
| 270 | font-family: sans-serif ; |
| 271 | font-weight: bold } |
| 272 | |
| 273 | span.interpreted { |
| 274 | font-family: sans-serif } |
| 275 | |
| 276 | span.option { |
| 277 | white-space: nowrap } |
| 278 | |
| 279 | span.pre { |
| 280 | white-space: pre } |
| 281 | |
| 282 | span.problematic { |
| 283 | color: red } |
| 284 | |
| 285 | span.section-subtitle { |
| 286 | /* font-size relative to parent (h1..h6 element) */ |
| 287 | font-size: 80% } |
| 288 | |
| 289 | table.citation { |
| 290 | border-left: solid 1px gray; |
| 291 | margin-left: 1px } |
| 292 | |
| 293 | table.docinfo { |
| 294 | margin: 2em 4em } |
| 295 | |
| 296 | table.docutils { |
| 297 | margin-top: 0.5em ; |
| 298 | margin-bottom: 0.5em } |
| 299 | |
| 300 | table.footnote { |
| 301 | border-left: solid 1px black; |
| 302 | margin-left: 1px } |
| 303 | |
| 304 | table.docutils td, table.docutils th, |
| 305 | table.docinfo td, table.docinfo th { |
| 306 | padding-left: 0.5em ; |
| 307 | padding-right: 0.5em ; |
| 308 | vertical-align: top } |
| 309 | |
| 310 | table.docutils th.field-name, table.docinfo th.docinfo-name { |
| 311 | font-weight: bold ; |
| 312 | text-align: left ; |
| 313 | white-space: nowrap ; |
| 314 | padding-left: 0 } |
| 315 | |
| 316 | /* "booktabs" style (no vertical lines) */ |
| 317 | table.docutils.booktabs { |
| 318 | border: 0px; |
| 319 | border-top: 2px solid; |
| 320 | border-bottom: 2px solid; |
| 321 | border-collapse: collapse; |
| 322 | } |
| 323 | table.docutils.booktabs * { |
| 324 | border: 0px; |
| 325 | } |
| 326 | table.docutils.booktabs th { |
| 327 | border-bottom: thin solid; |
| 328 | text-align: left; |
| 329 | } |
| 330 | |
| 331 | h1 tt.docutils, h2 tt.docutils, h3 tt.docutils, |
| 332 | h4 tt.docutils, h5 tt.docutils, h6 tt.docutils { |
| 333 | font-size: 100% } |
| 334 | |
| 335 | ul.auto-toc { |
| 336 | list-style-type: none } |
| 337 | |
| 338 | </style> |
| 339 | <style type="text/css"> |
| 340 | |
| 341 | |
| 342 | h1, h2, h3, h4, h5, h6 { |
| 343 | font-family: sans-serif; |
| 344 | font-size: 100%; |
| 345 | background-color: #dcdcdc; |
| 346 | } |
| 347 | |
| 348 | h1.title { |
| 349 | background-color: gray; |
| 350 | color: white |
| 351 | } |
| 352 | |
| 353 | table.footnote { |
| 354 | padding-left: 0.5ex; |
| 355 | } |
| 356 | |
| 357 | table.citation { |
| 358 | padding-left: 0.5ex |
| 359 | } |
| 360 | |
| 361 | td.label { |
| 362 | width: 10%; |
| 363 | } |
| 364 | |
| 365 | table, table.docutils, td, th { |
| 366 | border: 0; |
| 367 | } |
| 368 | |
| 369 | table.citation, table.footnote { |
| 370 | width: 100%; |
| 371 | } |
| 372 | |
| 373 | th { |
| 374 | background-color: lavender ; |
| 375 | } |
| 376 | |
| 377 | tr:nth-child(even) { |
| 378 | /* background-color: aliceblue; */ |
| 379 | background-color: white; |
| 380 | } |
| 381 | tr:nth-child(odd) { |
| 382 | /* background-color: lavender; */ |
| 383 | background-color: whitesmoke; |
| 384 | } |
| 385 | |
| 386 | |
| 387 | |
| 388 | </style> |
| 389 | </head> |
| 390 | <body> |
| 391 | <div class="document" id="lbm-benchmark-kernels-documentation"> |
| 392 | <h1 class="title">LBM Benchmark Kernels Documentation</h1> |
| 393 | |
| 394 | <!-- # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - |
| 395 | # |
| 396 | # Copyright |
| 397 | # Markus Wittmann, 2016-2017 |
| 398 | # RRZE, University of Erlangen-Nuremberg, Germany |
| 399 | # markus.wittmann -at- fau.de or hpc -at- rrze.fau.de |
| 400 | # |
| 401 | # Viktor Haag, 2016 |
| 402 | # LSS, University of Erlangen-Nuremberg, Germany |
| 403 | # |
| 404 | # This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). |
| 405 | # |
| 406 | # LbmBenchKernels is free software: you can redistribute it and/or modify |
| 407 | # it under the terms of the GNU General Public License as published by |
| 408 | # the Free Software Foundation, either version 3 of the License, or |
| 409 | # (at your option) any later version. |
| 410 | # |
| 411 | # LbmBenchKernels is distributed in the hope that it will be useful, |
| 412 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 413 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 414 | # GNU General Public License for more details. |
| 415 | # |
| 416 | # You should have received a copy of the GNU General Public License |
| 417 | # along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>. |
| 418 | # |
| 419 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - --> |
| 420 | <div class="contents topic" id="contents"> |
| 421 | <p class="topic-title first">Contents</p> |
| 422 | <ul class="auto-toc simple"> |
| 423 | <li><a class="reference internal" href="#introduction" id="id5">1 Introduction</a></li> |
| 424 | <li><a class="reference internal" href="#compilation" id="id6">2 Compilation</a><ul class="auto-toc"> |
| 425 | <li><a class="reference internal" href="#debug-and-verification" id="id7">2.1 Debug and Verification</a></li> |
| 426 | <li><a class="reference internal" href="#release-and-verification" id="id8">2.2 Release and Verification</a></li> |
| 427 | <li><a class="reference internal" href="#benchmarking" id="id9">2.3 Benchmarking</a></li> |
| 428 | <li><a class="reference internal" href="#compilers" id="id10">2.4 Compilers</a></li> |
| 429 | <li><a class="reference internal" href="#cleaning" id="id11">2.5 Cleaning</a></li> |
| 430 | <li><a class="reference internal" href="#options-summary" id="id12">2.6 Options Summary</a></li> |
| 431 | </ul> |
| 432 | </li> |
| 433 | <li><a class="reference internal" href="#invocation" id="id13">3 Invocation</a><ul class="auto-toc"> |
| 434 | <li><a class="reference internal" href="#command-line-parameters" id="id14">3.1 Command Line Parameters</a></li> |
| 435 | <li><a class="reference internal" href="#kernels" id="id15">3.2 Kernels</a></li> |
| 436 | </ul> |
| 437 | </li> |
| 438 | <li><a class="reference internal" href="#id2" id="id16">4 Benchmarking</a><ul class="auto-toc"> |
| 439 | <li><a class="reference internal" href="#intel-compiler" id="id17">4.1 Intel Compiler</a></li> |
| 440 | <li><a class="reference internal" href="#pinning" id="id18">4.2 Pinning</a></li> |
| 441 | <li><a class="reference internal" href="#general-remarks" id="id19">4.3 General Remarks</a></li> |
| 442 | <li><a class="reference internal" href="#padding" id="id20">4.4 Padding</a></li> |
| 443 | </ul> |
| 444 | </li> |
| 445 | <li><a class="reference internal" href="#geometries" id="id21">5 Geometries</a></li> |
| 446 | <li><a class="reference internal" href="#performance-results" id="id22">6 Performance Results</a><ul class="auto-toc"> |
| 447 | <li><a class="reference internal" href="#haswell-intel-xeon-e5-2695-v3" id="id23">6.1 Haswell, Intel Xeon E5-2695 v3</a></li> |
| 448 | <li><a class="reference internal" href="#broadwell-intel-xeon-e5-2630-v4" id="id24">6.2 Broadwell, Intel Xeon E5-2630 v4</a></li> |
| 449 | <li><a class="reference internal" href="#skylake-intel-xeon-gold-6148" id="id25">6.3 Skylake, Intel Xeon Gold 6148</a></li> |
| 450 | </ul> |
| 451 | </li> |
| 452 | <li><a class="reference internal" href="#licence" id="id26">7 Licence</a></li> |
| 453 | <li><a class="reference internal" href="#acknowledgements" id="id27">8 Acknowledgements</a></li> |
| 454 | <li><a class="reference internal" href="#bibliography" id="id28">9 Bibliography</a></li> |
| 455 | </ul> |
| 456 | </div> |
| 457 | <div class="section" id="introduction"> |
| 458 | <h1><a class="toc-backref" href="#id5">1 Introduction</a></h1> |
| 459 | <p>The lattice Boltzmann (LBM) benchmark kernels are a collection of LBM kernel |
| 460 | implementations.</p> |
| 461 | <p><strong>AS SUCH THE LBM BENCHMARK KERNELS ARE NOT A FULLY EQUIPPED CFD SOLVER AND SOLELY |
| 462 | SERVE THE PURPOSE OF STUDYING POSSIBLE PERFORMANCE OPTIMIZATIONS AND/OR |
| 463 | EXPERIMENTS.</strong></p> |
| 464 | <p>Currently all kernels utilize a D3Q19 discretization and the |
| 465 | two-relaxation-time (TRT) collision operator <a class="citation-reference" href="#ginzburg-2008" id="id1">[ginzburg-2008]</a>. |
| 466 | All operations are carried out in double precision arithmetic.</p> |
| 467 | </div> |
| 468 | <div class="section" id="compilation"> |
| 469 | <h1><a class="toc-backref" href="#id6">2 Compilation</a></h1> |
| 470 | <p>The benchmark framework currently supports only Linux systems and the GCC and |
| 471 | Intel compilers. Every other configuration probably requires adjustment inside |
| 472 | the code and the makefiles. Furthermore some code might be platform or at least |
| 473 | POSIX specific.</p> |
| 474 | <p>The benchmark can be built via <tt class="docutils literal">make</tt> from the <tt class="docutils literal">src</tt> subdirectory. This will |
| 475 | generate one binary which hosts all implemented benchmark kernels.</p> |
| 476 | <p>Binaries are located under the <tt class="docutils literal">bin</tt> subdirectory and will have different names |
| 477 | depending on compiler and build configuration.</p> |
| 478 | <p>Compilation can target debug or release builds. Combined with both build types |
| 479 | verification can be enabled, which increases the runtime and hence is not |
| 480 | suited for benchmarking.</p> |
| 481 | <div class="section" id="debug-and-verification"> |
| 482 | <h2><a class="toc-backref" href="#id7">2.1 Debug and Verification</a></h2> |
| 483 | <pre class="literal-block"> |
| 484 | make BUILD=debug BENCHMARK=off |
| 485 | </pre> |
| 486 | <p>Running <tt class="docutils literal">make</tt> with <tt class="docutils literal">BUILD=debug</tt> builds the debug version of |
| 487 | the benchmark kernels, where no optimizations are performed, line numbers and |
| 488 | debug symbols are included, and <tt class="docutils literal">DEBUG</tt> is defined. The resulting |
| 489 | binary will be found in the <tt class="docutils literal">bin</tt> subdirectory and named |
| 490 | <tt class="docutils literal"><span class="pre">lbmbenchk-linux-&lt;compiler&gt;-debug</span></tt>.</p> |
| 491 | <p>Specifying <tt class="docutils literal">BENCHMARK=off</tt> turns on verification |
| 492 | (<tt class="docutils literal">VERIFICATION=on</tt>), statistics (<tt class="docutils literal">STATISTICS=on</tt>), and VTK output |
| 493 | (<tt class="docutils literal">VTK_OUTPUT=on</tt>).</p> |
| 494 | <p>Please note that the generated binary will therefore |
| 495 | exhibit poor performance.</p> |
| 496 | </div> |
| 497 | <div class="section" id="release-and-verification"> |
| 498 | <h2><a class="toc-backref" href="#id8">2.2 Release and Verification</a></h2> |
| 499 | <p>Verification with the debug builds can be extremely slow. Hence verification |
| 500 | capabilities can also be built with release builds:</p> |
| 501 | <pre class="literal-block"> |
| 502 | make BENCHMARK=off |
| 503 | </pre> |
| 504 | </div> |
| 505 | <div class="section" id="benchmarking"> |
| 506 | <h2><a class="toc-backref" href="#id9">2.3 Benchmarking</a></h2> |
| 507 | <p>To generate a binary for benchmarking run make with</p> |
| 508 | <pre class="literal-block"> |
| 509 | make |
| 510 | </pre> |
| 511 | <p>By default <tt class="docutils literal">BENCHMARK=on</tt> and <tt class="docutils literal">BUILD=release</tt> are set, where |
| 512 | <tt class="docutils literal">BUILD=release</tt> turns optimizations on and <tt class="docutils literal">BENCHMARK=on</tt> disables |
| 513 | verification, statistics, and VTK output.</p> |
| 514 | <p>See Options Summary below for further description of options which can be |
| 515 | applied, e.g. TARCH as well as the Benchmarking section.</p> |
| 516 | </div> |
| 517 | <div class="section" id="compilers"> |
| 518 | <h2><a class="toc-backref" href="#id10">2.4 Compilers</a></h2> |
| 519 | <p>Currently only the GCC and Intel compilers under Linux are supported. The |
| 520 | configuration can be chosen via <tt class="docutils literal"><span class="pre">CONFIG=linux-gcc</span></tt> or |
| 521 | <tt class="docutils literal"><span class="pre">CONFIG=linux-intel</span></tt>.</p> |
| 522 | </div> |
| 523 | <div class="section" id="cleaning"> |
| 524 | <h2><a class="toc-backref" href="#id11">2.5 Cleaning</a></h2> |
| 525 | <p>For each configuration and build (debug/release) a subdirectory under the |
| 526 | <tt class="docutils literal">src/obj</tt> directory is created where the dependency and object files are |
| 527 | stored. |
| 528 | With</p> |
| 529 | <pre class="literal-block"> |
| 530 | make CONFIG=... BUILD=... clean |
| 531 | </pre> |
| 532 | <p>a specific combination is selected and cleaned, whereas with</p> |
| 533 | <pre class="literal-block"> |
| 534 | make clean-all |
| 535 | </pre> |
| 536 | <p>all object and dependency files are deleted.</p> |
| 537 | </div> |
| 538 | <div class="section" id="options-summary"> |
| 539 | <h2><a class="toc-backref" href="#id12">2.6 Options Summary</a></h2> |
| 540 | <p>Options that can be specified when building the suite with make:</p> |
| 541 | <table border="1" class="docutils"> |
| 542 | <colgroup> |
| 543 | <col width="7%" /> |
| 544 | <col width="12%" /> |
| 545 | <col width="6%" /> |
| 546 | <col width="75%" /> |
| 547 | </colgroup> |
| 548 | <thead valign="bottom"> |
| 549 | <tr><th class="head">name</th> |
| 550 | <th class="head">values</th> |
| 551 | <th class="head">default</th> |
| 552 | <th class="head">description</th> |
| 553 | </tr> |
| 554 | </thead> |
| 555 | <tbody valign="top"> |
| 556 | <tr><td>BENCHMARK</td> |
| 557 | <td>on, off</td> |
| 558 | <td>on</td> |
| 559 | <td>If enabled, disables VERIFICATION, STATISTICS, VTK_OUTPUT. If disabled enables the three former options.</td> |
| 560 | </tr> |
| 561 | <tr><td>BUILD</td> |
| 562 | <td>debug, release</td> |
| 563 | <td>release</td> |
| 564 | <td>debug: no optimization, debug symbols, DEBUG defined. release: optimizations enabled.</td> |
| 565 | </tr> |
| 566 | <tr><td>CONFIG</td> |
| 567 | <td>linux-gcc, linux-intel</td> |
| 568 | <td>linux-intel</td> |
| 569 | <td>Select GCC or Intel compiler.</td> |
| 570 | </tr> |
| 571 | <tr><td>ISA</td> |
| 572 | <td>avx, sse</td> |
| 573 | <td>avx</td> |
| 574 | <td>Determines which ISA extension is used for macro definitions of the intrinsics. This is <em>not</em> the architecture the compiler generates code for.</td> |
| 575 | </tr> |
| 576 | <tr><td>OPENMP</td> |
| 577 | <td>on, off</td> |
| 578 | <td>on</td> |
| 579 | <td>OpenMP, i.e. threading support.</td> |
| 580 | </tr> |
| 581 | <tr><td>STATISTICS</td> |
| 582 | <td>on, off</td> |
| 583 | <td>off</td> |
| 584 | <td>View statistics, like density etc, during simulation.</td> |
| 585 | </tr> |
| 586 | <tr><td>TARCH</td> |
| 587 | <td>--</td> |
| 588 | <td>--</td> |
| 589 | <td>Via TARCH the architecture the compiler generates code for can be overridden. The value depends on the chosen compiler.</td> |
| 590 | </tr> |
| 591 | <tr><td>VERIFICATION</td> |
| 592 | <td>on, off</td> |
| 593 | <td>off</td> |
| 594 | <td>Turn verification on/off.</td> |
| 595 | </tr> |
| 596 | <tr><td>VTK_OUTPUT</td> |
| 597 | <td>on, off</td> |
| 598 | <td>off</td> |
| 599 | <td>Enable/Disable VTK file output.</td> |
| 600 | </tr> |
| 601 | </tbody> |
| 602 | </table> |
| 603 | </div> |
| 604 | </div> |
| 605 | <div class="section" id="invocation"> |
| 606 | <h1><a class="toc-backref" href="#id13">3 Invocation</a></h1> |
| 607 | <p>Running the binary will print, along with the GPL licence header, a line like the following:</p> |
| 608 | <pre class="literal-block"> |
| 609 | LBM Benchmark Kernels 0.1, compiled Jul 5 2017 21:59:22, type: verification |
| 610 | </pre> |
| 611 | <p>if verification was enabled during compilation or</p> |
| 612 | <pre class="literal-block"> |
| 613 | LBM Benchmark Kernels 0.1, compiled Jul 5 2017 21:59:22, type: benchmark |
| 614 | </pre> |
| 615 | <p>if verification was disabled during compilation.</p> |
| 616 | <div class="section" id="command-line-parameters"> |
| 617 | <h2><a class="toc-backref" href="#id14">3.1 Command Line Parameters</a></h2> |
| 618 | <p>Running the binary with <tt class="docutils literal"><span class="pre">-h</span></tt> lists all available parameters:</p> |
| 619 | <pre class="literal-block"> |
| 620 | Usage: |
| 621 | ./lbmbenchk -list |
| 622 | ./lbmbenchk |
| 623 | [-dims XxYyZ] [-geometry box|channel|pipe|blocks[-<block size>]] [-iterations <iterations>] [-lattice-dump-ascii] |
| 624 | [-rho-in <density>] [-rho-out <density] [-omega <omega>] [-kernel <kernel>] |
| 625 | [-periodic-x] |
| 626 | [-t <number of threads>] |
| 627 | [-pin core{,core}*] |
| 628 | [-verify] |
| 629 | -- <kernel specific parameters> |
| 630 | |
| 631 | -list List available kernels. |
| 632 | |
| 633 | -dims XxYxZ Specify geometry dimensions. |
| 634 | |
| 635 | -geometry blocks-<block size> |
| 636 | Geometetry with blocks of size <block size> regularily layout out. |
| 637 | </pre> |
| 638 | <p>If an option is specified multiple times the last one overrides previous ones. |
| 639 | This holds also true for <tt class="docutils literal"><span class="pre">-verify</span></tt> which sets geometry dimensions, |
| 640 | iterations, etc., which can afterwards be overridden, e.g.:</p> |
| 641 | <pre class="literal-block"> |
| 642 | $ bin/lbmbenchk-linux-intel-release -verify -dims 32x32x32 |
| 643 | </pre> |
| 644 | <p>Kernel specific parameters can be obtained via selecting the specific kernel |
| 645 | and passing <tt class="docutils literal"><span class="pre">-h</span></tt> as parameter:</p> |
| 646 | <pre class="literal-block"> |
| 647 | $ bin/lbmbenchk-linux-intel-release -kernel kernel-name -- -h |
| 648 | ... |
| 649 | Kernel parameters: |
| 650 | [-blk <n>] [-blk-[xyz] <n>] |
| 651 | </pre> |
| 652 | <p>A list of all available kernels can be obtained via <tt class="docutils literal"><span class="pre">-list</span></tt>:</p> |
| 653 | <pre class="literal-block"> |
| 654 | $ ../bin/lbmbenchk-linux-gcc-debug -list |
| 655 | Lattice Boltzmann Benchmark Kernels (LbmBenchKernels) Copyright (C) 2016, 2017 LSS, RRZE |
| 656 | This program comes with ABSOLUTELY NO WARRANTY; for details see LICENSE. |
| 657 | This is free software, and you are welcome to redistribute it under certain conditions. |
| 658 | |
| 659 | LBM Benchmark Kernels 0.1, compiled Jul 5 2017 21:59:22, type: verification |
| 660 | Available kernels to benchmark: |
| 661 | list-aa-pv-soa |
| 662 | list-aa-ria-soa |
| 663 | list-aa-soa |
| 664 | list-aa-aos |
| 665 | list-pull-split-nt-1s-soa |
| 666 | list-pull-split-nt-2s-soa |
| 667 | list-push-soa |
| 668 | list-push-aos |
| 669 | list-pull-soa |
| 670 | list-pull-aos |
| 671 | push-soa |
| 672 | push-aos |
| 673 | pull-soa |
| 674 | pull-aos |
| 675 | blk-push-soa |
| 676 | blk-push-aos |
| 677 | blk-pull-soa |
| 678 | blk-pull-aos |
| 679 | </pre> |
| 680 | </div> |
| 681 | <div class="section" id="kernels"> |
| 682 | <h2><a class="toc-backref" href="#id15">3.2 Kernels</a></h2> |
| 683 | <p>The following list briefly describes the available kernels:</p> |
| 684 | <ul class="simple"> |
| 685 | <li>push-soa/push-aos/pull-soa/pull-aos: |
| 686 | Unoptimized kernels (but stream/collide are already fused) using two grids as |
| 687 | source and destination. Implement push/pull semantics as well as structure of |
| 688 | arrays (soa) or array of structures (aos) layout.</li> |
| 689 | <li>blk-push-soa/blk-push-aos/blk-pull-soa/blk-pull-aos: |
| 690 | The same as the unoptimized kernels without the blk prefix, except that they support |
| 691 | spatial blocking, i.e. loop blocking of the three loops used to iterate over |
| 692 | the lattice. Here manual work sharing for OpenMP is used.</li> |
| 693 | <li>list-push-soa/list-push-aos/list-pull-soa/list-pull-aos: |
| 694 | The same as the unoptimized kernels without the list prefix, but for indirect addressing. |
| 695 | Here only a 1D vector is used to store the fluid nodes, omitting the |
| 696 | obstacles. An adjacency list is used to recover the neighborhood associations.</li> |
| 697 | <li>list-pull-split-nt-1s-soa/list-pull-split-nt-2s-soa: |
| 698 | Optimized variant of list-pull-soa. Chunks of the lattice are processed at |
| 699 | once. Postcollision values are written back via nontemporal stores in 18 (1s) |
| 700 | or 9 (2s) loops.</li> |
| 701 | <li>list-aa-aos/list-aa-soa: |
| 702 | Unoptimized implementation of the AA pattern for the 1D vector with adjacency |
| 703 | list. Both array of structures (aos) and structure of arrays (soa) |
| 704 | data layouts are supported.</li> |
| 705 | <li>list-aa-ria-soa: |
| 706 | Implementation of AA pattern with intrinsics for the 1D vector with adjacency |
| 707 | list. Furthermore it contains a vectorized even time step and run length |
| 708 | coding to reduce the loop balance of the odd time step.</li> |
| 709 | <li>list-aa-pv-soa: |
| 710 | All optimizations of list-aa-ria-soa. Additionally with partial vectorization |
| 711 | of the odd time step.</li> |
| 712 | </ul> |
| 713 | <p>Note that all array of structures (aos) kernels might require blocking |
| 714 | (depending on the domain size) to reach the performance of their structure of |
| 715 | arrays (soa) counterparts.</p> |
| 716 | <p>The following table summarizes the properties of the kernels. Here <strong>D</strong> means |
| 717 | direct addressing, i.e. full array, <strong>I</strong> means indirect addressing, i.e. 1D |
| 718 | vector with adjacency list, <strong>x</strong> means supported, whereas <strong>--</strong> means unsupported. |
| 719 | The loop balance B_l is computed for D3Q19 model with double precision floating |
| 720 | point for PDFs (8 byte) and 4 byte integers for the index (adjacency list). |
| 721 | As list-aa-ria-soa and list-aa-pv-soa support run length coding their effective |
| 722 | loop balance depends on the geometry. The effective loop balance is printed |
| 723 | during each run.</p> |
| 724 | <table border="1" class="docutils"> |
| 725 | <colgroup> |
| 726 | <col width="29%" /> |
| 727 | <col width="14%" /> |
| 728 | <col width="14%" /> |
| 729 | <col width="6%" /> |
| 730 | <col width="10%" /> |
| 731 | <col width="10%" /> |
| 732 | <col width="16%" /> |
| 733 | </colgroup> |
| 734 | <thead valign="bottom"> |
| 735 | <tr><th class="head">kernel name</th> |
| 736 | <th class="head">prop. step</th> |
| 737 | <th class="head">data layout</th> |
| 738 | <th class="head">addr.</th> |
| 739 | <th class="head">parallel</th> |
| 740 | <th class="head">blocking</th> |
| 741 | <th class="head">B_l [B/FLUP]</th> |
| 742 | </tr> |
| 743 | </thead> |
| 744 | <tbody valign="top"> |
| 745 | <tr><td>push-soa</td> |
| 746 | <td>OS</td> |
| 747 | <td>SoA</td> |
| 748 | <td>D</td> |
| 749 | <td>x</td> |
| 750 | <td>--</td> |
| 751 | <td>456</td> |
| 752 | </tr> |
| 753 | <tr><td>push-aos</td> |
| 754 | <td>OS</td> |
| 755 | <td>AoS</td> |
| 756 | <td>D</td> |
| 757 | <td>x</td> |
| 758 | <td>--</td> |
| 759 | <td>456</td> |
| 760 | </tr> |
| 761 | <tr><td>pull-soa</td> |
| 762 | <td>OS</td> |
| 763 | <td>SoA</td> |
| 764 | <td>D</td> |
| 765 | <td>x</td> |
| 766 | <td>--</td> |
| 767 | <td>456</td> |
| 768 | </tr> |
| 769 | <tr><td>pull-aos</td> |
| 770 | <td>OS</td> |
| 771 | <td>AoS</td> |
| 772 | <td>D</td> |
| 773 | <td>x</td> |
| 774 | <td>--</td> |
| 775 | <td>456</td> |
| 776 | </tr> |
| 777 | <tr><td>blk-push-soa</td> |
| 778 | <td>OS</td> |
| 779 | <td>SoA</td> |
| 780 | <td>D</td> |
| 781 | <td>x</td> |
| 782 | <td>x</td> |
| 783 | <td>456</td> |
| 784 | </tr> |
| 785 | <tr><td>blk-push-aos</td> |
| 786 | <td>OS</td> |
| 787 | <td>AoS</td> |
| 788 | <td>D</td> |
| 789 | <td>x</td> |
| 790 | <td>x</td> |
| 791 | <td>456</td> |
| 792 | </tr> |
| 793 | <tr><td>blk-pull-soa</td> |
| 794 | <td>OS</td> |
| 795 | <td>SoA</td> |
| 796 | <td>D</td> |
| 797 | <td>x</td> |
| 798 | <td>x</td> |
| 799 | <td>456</td> |
| 800 | </tr> |
| 801 | <tr><td>blk-pull-aos</td> |
| 802 | <td>OS</td> |
| 803 | <td>AoS</td> |
| 804 | <td>D</td> |
| 805 | <td>x</td> |
| 806 | <td>x</td> |
| 807 | <td>456</td> |
| 808 | </tr> |
| 809 | <tr><td>list-push-soa</td> |
| 810 | <td>OS</td> |
| 811 | <td>SoA</td> |
| 812 | <td>I</td> |
| 813 | <td>x</td> |
| 814 | <td>x</td> |
| 815 | <td>528</td> |
| 816 | </tr> |
| 817 | <tr><td>list-push-aos</td> |
| 818 | <td>OS</td> |
| 819 | <td>AoS</td> |
| 820 | <td>I</td> |
| 821 | <td>x</td> |
| 822 | <td>x</td> |
| 823 | <td>528</td> |
| 824 | </tr> |
| 825 | <tr><td>list-pull-soa</td> |
| 826 | <td>OS</td> |
| 827 | <td>SoA</td> |
| 828 | <td>I</td> |
| 829 | <td>x</td> |
| 830 | <td>x</td> |
| 831 | <td>528</td> |
| 832 | </tr> |
| 833 | <tr><td>list-pull-aos</td> |
| 834 | <td>OS</td> |
| 835 | <td>AoS</td> |
| 836 | <td>I</td> |
| 837 | <td>x</td> |
| 838 | <td>x</td> |
| 839 | <td>528</td> |
| 840 | </tr> |
| 841 | <tr><td>list-pull-split-nt-1s</td> |
| 842 | <td>OS</td> |
| 843 | <td>SoA</td> |
| 844 | <td>I</td> |
| 845 | <td>x</td> |
| 846 | <td>x</td> |
| 847 | <td>376</td> |
| 848 | </tr> |
| 849 | <tr><td>list-pull-split-nt-2s</td> |
| 850 | <td>OS</td> |
| 851 | <td>SoA</td> |
| 852 | <td>I</td> |
| 853 | <td>x</td> |
| 854 | <td>x</td> |
| 855 | <td>376</td> |
| 856 | </tr> |
| 857 | <tr><td>list-aa-soa</td> |
| 858 | <td>AA</td> |
| 859 | <td>SoA</td> |
| 860 | <td>I</td> |
| 861 | <td>x</td> |
| 862 | <td>x</td> |
| 863 | <td>340</td> |
| 864 | </tr> |
| 865 | <tr><td>list-aa-aos</td> |
| 866 | <td>AA</td> |
| 867 | <td>AoS</td> |
| 868 | <td>I</td> |
| 869 | <td>x</td> |
| 870 | <td>x</td> |
| 871 | <td>340</td> |
| 872 | </tr> |
| 873 | <tr><td>list-aa-ria-soa</td> |
| 874 | <td>AA</td> |
| 875 | <td>SoA</td> |
| 876 | <td>I</td> |
| 877 | <td>x</td> |
| 878 | <td>x</td> |
| 879 | <td>304-342</td> |
| 880 | </tr> |
| 881 | <tr><td>list-aa-pv-soa</td> |
| 882 | <td>AA</td> |
| 883 | <td>SoA</td> |
| 884 | <td>I</td> |
| 885 | <td>x</td> |
| 886 | <td>x</td> |
| 887 | <td>304-342</td> |
| 888 | </tr> |
| 889 | </tbody> |
| 890 | </table> |
| 891 | </div> |
| 892 | </div> |
| 893 | <div class="section" id="id2"> |
| 894 | <h1><a class="toc-backref" href="#id16">4 Benchmarking</a></h1> |
| 895 | <p>Correct benchmarking is a nontrivial task. Whenever benchmark results should be |
| 896 | created make sure the binary was compiled with:</p> |
| 897 | <ul class="simple"> |
| 898 | <li><tt class="docutils literal">BENCHMARK=on</tt> (default if not overridden) and</li> |
| 899 | <li><tt class="docutils literal">BUILD=release</tt> (default if not overridden) and</li> |
| 900 | <li>the correct ISA for macros is used, selected via <tt class="docutils literal">ISA</tt> and</li> |
| 901 | <li>use <tt class="docutils literal">TARCH</tt> to specify the architecture the compiler generates code for.</li> |
| 902 | </ul> |
| 903 | <div class="section" id="intel-compiler"> |
| 904 | <h2><a class="toc-backref" href="#id17">4.1 Intel Compiler</a></h2> |
| 905 | <p>For the Intel compiler one can specify depending on the target ISA extension:</p> |
| 906 | <ul class="simple"> |
| 907 | <li>AVX: <tt class="docutils literal"><span class="pre">TARCH=-xAVX</span></tt></li> |
| 908 | <li>AVX2 and FMA: <tt class="docutils literal"><span class="pre">TARCH=-xCORE-AVX2,-fma</span></tt></li> |
| 909 | <li>AVX512: <tt class="docutils literal"><span class="pre">TARCH=-xCORE-AVX512</span></tt></li> |
| 910 | <li>KNL: <tt class="docutils literal"><span class="pre">TARCH=-xMIC-AVX512</span></tt></li> |
| 911 | </ul> |
| 912 | <p>Compiling for an architecture supporting AVX (Sandy Bridge, Ivy Bridge):</p> |
| 913 | <pre class="literal-block"> |
| 914 | make ISA=avx TARCH=-xAVX |
| 915 | </pre> |
| 916 | <p>Compiling for an architecture supporting AVX2 (Haswell, Broadwell):</p> |
| 917 | <pre class="literal-block"> |
| 918 | make ISA=avx TARCH=-xCORE-AVX2,-fma |
| 919 | </pre> |
| 920 | <p>WARNING: ISA is here still set to <tt class="docutils literal">avx</tt> as the FMA intrinsics are currently not |
| 921 | implemented. This might change in the future.</p> |
| 922 | <p>Compiling for an architecture supporting AVX-512 (Skylake):</p> |
| 923 | <pre class="literal-block"> |
| 924 | make ISA=avx TARCH=-xCORE-AVX512 |
| 925 | </pre> |
| 926 | <p>WARNING: ISA is here still set to <tt class="docutils literal">avx</tt> as currently we have no implementation for the |
| 927 | AVX512 intrinsics. This might change in the future.</p> |
| 928 | </div> |
| 929 | <div class="section" id="pinning"> |
| 930 | <h2><a class="toc-backref" href="#id18">4.2 Pinning</a></h2> |
| 931 | <p>During benchmarking pinning should be used via the <tt class="docutils literal"><span class="pre">-pin</span></tt> parameter. Running |
| 932 | a benchmark with 10 threads and pinning them to the first 10 cores works like</p> |
| 933 | <pre class="literal-block"> |
| 934 | $ bin/lbmbenchk-linux-intel-release ... -t 10 -pin $(seq -s , 0 9) |
| 935 | </pre> |
| 936 | </div> |
| 937 | <div class="section" id="general-remarks"> |
| 938 | <h2><a class="toc-backref" href="#id19">4.3 General Remarks</a></h2> |
| 939 | <p>Things the binary does not check or control:</p> |
| 940 | <ul class="simple"> |
| 941 | <li>transparent huge pages: when allocating memory small 4 KiB pages might be |
| 942 | replaced with larger ones. This is in general a good thing, but if this is |
| 943 | really the case, depends on the system settings (check e.g. the status of |
| 944 | <tt class="docutils literal">/sys/kernel/mm/transparent_hugepage/enabled</tt>). |
| 945 | Currently <tt class="docutils literal">madvise(MADV_HUGEPAGE)</tt> is used for allocations which are aligned to |
| 946 | a 4 KiB page, which should be the case for the lattices. |
| 947 | This should result in huge pages unless THP is disabled on the machine. |
| 948 | (NOTE: madvise() is used if <tt class="docutils literal">HAVE_HUGE_PAGES</tt> is defined, which is currently |
| 949 | hard-coded in <tt class="docutils literal">Memory.c</tt>).</li> |
| 950 | <li>CPU/core frequency: For reproducible results the frequency of all cores |
| 951 | should be fixed.</li> |
| 952 | <li>NUMA placement policy: The benchmark assumes a first touch policy, which |
| 953 | means the memory will be placed at the NUMA domain the touching core is |
| 954 | associated with. If a different policy is in place or the NUMA domain to be |
| 955 | used is already full memory might be allocated in a remote domain. Accesses |
| 956 | to remote domains typically have a higher latency and lower bandwidth.</li> |
| 957 | <li>System load: interference with other applications, especially on desktop |
| 958 | systems should be avoided.</li> |
| 959 | <li>Padding: For SoA based kernels the number of (fluid) nodes is automatically |
| 960 | adjusted so that no cache or TLB thrashing should occur. The parameters are |
| 961 | optimized for current Intel based systems. For more details look into the |
| 962 | padding section.</li> |
| 963 | <li>CPU dispatcher function: the compiler might add different versions of a |
| 964 | function for different ISA extensions. Make sure the code you might think is |
| 965 | executed is actually the code which is executed.</li> |
| 966 | </ul> |
| 967 | </div> |
| 968 | <div class="section" id="padding"> |
| 969 | <h2><a class="toc-backref" href="#id20">4.4 Padding</a></h2> |
| 970 | <p>With correct padding cache and TLB thrashing can be avoided. Therefore the |
| 971 | number of (fluid) nodes used in the data layout is artificially increased.</p> |
| 972 | <p>Currently automatic padding is active for kernels which support it. It can be |
| 973 | controlled via the kernel parameter (i.e. parameter after the <tt class="docutils literal"><span class="pre">--</span></tt>) |
| 974 | <tt class="docutils literal"><span class="pre">-pad</span></tt>. Supported values are <tt class="docutils literal">auto</tt> (default), <tt class="docutils literal">no</tt> (to disable padding), |
| 975 | or a manual padding.</p> |
| 976 | <p>Automatic padding tries to avoid cache and TLB thrashing and pads for a 32 |
| 977 | entry (huge pages) TLB with 8 sets and a 512 set (L2) cache. This reflects the |
| 978 | parameters of current Intel based processors.</p> |
| 979 | <p>Manual padding is done via a padding string and has the format |
| 980 | <tt class="docutils literal"><span class="pre">mod_1+offset_1(,mod_n+offset_n)</span></tt>, which specifies numbers of bytes. |
| 981 | SoA data layouts can exhibit TLB thrashing. Therefore we want to distribute the |
| 982 | 19 pages with one lattice (36 with two lattices) we are concurrently accessing |
| 983 | over as many sets in the TLB as possible. |
| 984 | This is controlled by the distance between the accessed pages, which is the |
| 985 | number of (fluid) nodes in between them and can be adjusted by adding further |
| 986 | (fluid) nodes. |
| 987 | We want the distance d (in bytes) between two accessed pages to be e.g. |
| 988 | <strong>d % (PAGE_SIZE * TLB_SETS) = PAGE_SIZE</strong>. |
| 989 | This would distribute the pages evenly over the sets. Hereby <strong>PAGE_SIZE * TLB_SETS</strong> |
| 990 | would be our <tt class="docutils literal">mod_1</tt> and <strong>PAGE_SIZE</strong> (after the =) our <tt class="docutils literal">offset_1</tt>. |
| 991 | Measurements show that with only a quarter or half of a page size as offset |
| 992 | higher performance is achieved, which is done by automatic padding. |
| 993 | On top of this padding more paddings can be added. They are just added to the |
| 994 | padding string and are separated by commas.</p> |
| 995 | <p>A zero modulus in the padding string has a special meaning. Here the |
| 996 | corresponding offset is just added to the number of nodes. A padding string |
| 997 | like <tt class="docutils literal"><span class="pre">-pad</span> 0+16</tt> would add a static padding of two nodes (one node = 8 bytes).</p> |
| 998 | </div> |
| 999 | </div> |
| 1000 | <div class="section" id="geometries"> |
| 1001 | <h1><a class="toc-backref" href="#id21">5 Geometries</a></h1> |
| 1002 | <p>TODO: supported geometries: channel, pipe, blocks, fluid</p> |
| 1003 | </div> |
| 1004 | <div class="section" id="performance-results"> |
| 1005 | <h1><a class="toc-backref" href="#id22">6 Performance Results</a></h1> |
| 1006 | <p>This section lists performance values measured on several machines for |
| 1007 | different kernels and geometries. |
| 1008 | The <strong>RFM</strong> column denotes the expected performance as predicted by the |
| 1009 | Roofline performance model <a class="citation-reference" href="#williams-2008" id="id3">[williams-2008]</a>. |
| 1010 | For performance prediction of each kernel a memory bandwidth benchmark is used |
| 1011 | which mimics the kernel's memory access pattern and the kernel's loop balance |
| 1012 | (see <a class="citation-reference" href="#kernels" id="id4">[kernels]</a> for details).</p> |
| 1013 | <div class="section" id="haswell-intel-xeon-e5-2695-v3"> |
| 1014 | <h2><a class="toc-backref" href="#id23">6.1 Haswell, Intel Xeon E5-2695 v3</a></h2> |
| 1015 | <ul class="simple"> |
| 1016 | <li>Haswell architecture, AVX2, FMA</li> |
| 1017 | <li>14 cores, 2.3 GHz</li> |
| 1018 | <li>2 x 7 cores in cluster-on-die (CoD) mode enabled</li> |
| 1019 | <li>SMT enabled</li> |
| 1020 | </ul> |
| 1021 | <p>memory bandwidth:</p> |
| 1022 | <ul class="simple"> |
| 1023 | <li>copy-19 47.3 GB/s</li> |
| 1024 | <li>copy-19-nt-sl 47.1 GB/s</li> |
| 1025 | <li>update-19 44.0 GB/s</li> |
| 1026 | </ul> |
| 1027 | <p>geometry dimensions: 500x100x100</p> |
| 1028 | <table border="1" class="docutils"> |
| 1029 | <colgroup> |
| 1030 | <col width="19%" /> |
| 1031 | <col width="7%" /> |
| 1032 | <col width="7%" /> |
| 1033 | <col width="7%" /> |
| 1034 | <col width="7%" /> |
| 1035 | <col width="7%" /> |
| 1036 | <col width="7%" /> |
| 1037 | <col width="7%" /> |
| 1038 | <col width="7%" /> |
| 1039 | <col width="7%" /> |
| 1040 | <col width="7%" /> |
| 1041 | <col width="7%" /> |
| 1042 | <col width="4%" /> |
| 1043 | </colgroup> |
| 1044 | <thead valign="bottom"> |
| 1045 | <tr><th class="head">kernel</th> |
| 1046 | <th class="head">pipe</th> |
| 1047 | <th class="head">blocks-2</th> |
| 1048 | <th class="head">blocks-4</th> |
| 1049 | <th class="head">blocks-6</th> |
| 1050 | <th class="head">blocks-8</th> |
| 1051 | <th class="head">blocks-10</th> |
| 1052 | <th class="head">blocks-15</th> |
| 1053 | <th class="head">blocks-16</th> |
| 1054 | <th class="head">blocks-20</th> |
| 1055 | <th class="head">blocks-25</th> |
| 1056 | <th class="head">blocks-32</th> |
| 1057 | <th class="head">RFM</th> |
| 1058 | </tr> |
| 1059 | </thead> |
| 1060 | <tbody valign="top"> |
| 1061 | <tr><td>blk-push-aos</td> |
| 1062 | <td>58.82</td> |
| 1063 | <td>49.85</td> |
| 1064 | <td>57.34</td> |
| 1065 | <td>59.90</td> |
| 1066 | <td>61.37</td> |
| 1067 | <td>62.17</td> |
| 1068 | <td>65.30</td> |
| 1069 | <td>64.00</td> |
| 1070 | <td>67.54</td> |
| 1071 | <td>64.46</td> |
| 1072 | <td>69.69</td> |
| 1073 | <td>104</td> |
| 1074 | </tr> |
| 1075 | <tr><td>blk-push-soa</td> |
| 1076 | <td>32.32</td> |
| 1077 | <td>33.46</td> |
| 1078 | <td>34.02</td> |
| 1079 | <td>34.64</td> |
| 1080 | <td>35.06</td> |
| 1081 | <td>35.04</td> |
| 1082 | <td>36.31</td> |
| 1083 | <td>35.44</td> |
| 1084 | <td>37.20</td> |
| 1085 | <td>35.14</td> |
| 1086 | <td>37.95</td> |
| 1087 | <td>104</td> |
| 1088 | </tr> |
| 1089 | <tr><td>blk-pull-aos</td> |
| 1090 | <td>56.97</td> |
| 1091 | <td>51.41</td> |
| 1092 | <td>56.09</td> |
| 1093 | <td>57.92</td> |
| 1094 | <td>59.98</td> |
| 1095 | <td>59.83</td> |
| 1096 | <td>63.37</td> |
| 1097 | <td>61.55</td> |
| 1098 | <td>65.50</td> |
| 1099 | <td>63.11</td> |
| 1100 | <td>67.02</td> |
| 1101 | <td>104</td> |
| 1102 | </tr> |
| 1103 | <tr><td>blk-pull-soa</td> |
| 1104 | <td>49.29</td> |
| 1105 | <td>46.23</td> |
| 1106 | <td>47.50</td> |
| 1107 | <td>51.97</td> |
| 1108 | <td>51.27</td> |
| 1109 | <td>49.52</td> |
| 1110 | <td>55.23</td> |
| 1111 | <td>53.13</td> |
| 1112 | <td>54.50</td> |
| 1113 | <td>49.79</td> |
| 1114 | <td>57.90</td> |
| 1115 | <td>104</td> |
| 1116 | </tr> |
| 1117 | <tr><td>aa-aos</td> |
| 1118 | <td>91.35</td> |
| 1119 | <td>66.14</td> |
| 1120 | <td>76.80</td> |
| 1121 | <td>84.76</td> |
| 1122 | <td>83.63</td> |
| 1123 | <td>91.36</td> |
| 1124 | <td>93.46</td> |
| 1125 | <td>92.62</td> |
| 1126 | <td>93.91</td> |
| 1127 | <td>92.25</td> |
| 1128 | <td>92.93</td> |
| 1129 | <td>145</td> |
| 1130 | </tr> |
| 1131 | <tr><td>aa-soa</td> |
| 1132 | <td>75.51</td> |
| 1133 | <td>65.68</td> |
| 1134 | <td>70.94</td> |
| 1135 | <td>71.36</td> |
| 1136 | <td>73.83</td> |
| 1137 | <td>75.46</td> |
| 1138 | <td>74.84</td> |
| 1139 | <td>79.48</td> |
| 1140 | <td>83.28</td> |
| 1141 | <td>77.70</td> |
| 1142 | <td>82.72</td> |
| 1143 | <td>145</td> |
| 1144 | </tr> |
| 1145 | <tr><td>aa-vec-soa</td> |
| 1146 | <td>93.85</td> |
| 1147 | <td>83.44</td> |
| 1148 | <td>91.58</td> |
| 1149 | <td>93.96</td> |
| 1150 | <td>94.35</td> |
| 1151 | <td>96.62</td> |
| 1152 | <td>101.76</td> |
| 1153 | <td>96.72</td> |
| 1154 | <td>106.37</td> |
| 1155 | <td>102.60</td> |
| 1156 | <td>110.28</td> |
| 1157 | <td>145</td> |
| 1158 | </tr> |
| 1159 | <tr><td>list-push-aos</td> |
| 1160 | <td>80.29</td> |
| 1161 | <td>80.97</td> |
| 1162 | <td>80.95</td> |
| 1163 | <td>81.10</td> |
| 1164 | <td>81.37</td> |
| 1165 | <td>82.44</td> |
| 1166 | <td>81.77</td> |
| 1167 | <td>81.49</td> |
| 1168 | <td>80.72</td> |
| 1169 | <td>81.93</td> |
| 1170 | <td>80.93</td> |
| 1171 | <td>83</td> |
| 1172 | </tr> |
| 1173 | <tr><td>list-push-soa</td> |
| 1174 | <td>47.52</td> |
| 1175 | <td>42.65</td> |
| 1176 | <td>45.28</td> |
| 1177 | <td>46.64</td> |
| 1178 | <td>43.46</td> |
| 1179 | <td>40.59</td> |
| 1180 | <td>44.94</td> |
| 1181 | <td>46.55</td> |
| 1182 | <td>41.53</td> |
| 1183 | <td>45.98</td> |
| 1184 | <td>44.86</td> |
| 1185 | <td>83</td> |
| 1186 | </tr> |
| 1187 | <tr><td>list-pull-aos</td> |
| 1188 | <td>85.30</td> |
| 1189 | <td>82.97</td> |
| 1190 | <td>86.43</td> |
| 1191 | <td>83.42</td> |
| 1192 | <td>86.33</td> |
| 1193 | <td>83.70</td> |
| 1194 | <td>86.43</td> |
| 1195 | <td>83.77</td> |
| 1196 | <td>83.10</td> |
| 1197 | <td>85.89</td> |
| 1198 | <td>84.44</td> |
| 1199 | <td>83</td> |
| 1200 | </tr> |
| 1201 | <tr><td>list-pull-soa</td> |
| 1202 | <td>62.12</td> |
| 1203 | <td>63.61</td> |
| 1204 | <td>63.28</td> |
| 1205 | <td>61.32</td> |
| 1206 | <td>66.72</td> |
| 1207 | <td>62.65</td> |
| 1208 | <td>64.82</td> |
| 1209 | <td>60.49</td> |
| 1210 | <td>58.01</td> |
| 1211 | <td>64.46</td> |
| 1212 | <td>62.52</td> |
| 1213 | <td>83</td> |
| 1214 | </tr> |
| 1215 | <tr><td>list-pull-split-nt-1s-soa</td> |
| 1216 | <td>121.35</td> |
| 1217 | <td>113.77</td> |
| 1218 | <td>115.29</td> |
| 1219 | <td>113.54</td> |
| 1220 | <td>117.00</td> |
| 1221 | <td>116.46</td> |
| 1222 | <td>114.78</td> |
| 1223 | <td>114.54</td> |
| 1224 | <td>110.83</td> |
| 1225 | <td>112.67</td> |
| 1226 | <td>117.85</td> |
| 1227 | <td>125</td> |
| 1228 | </tr> |
| 1229 | <tr><td>list-pull-split-nt-2s-soa</td> |
| 1230 | <td>118.09</td> |
| 1231 | <td>110.48</td> |
| 1232 | <td>112.55</td> |
| 1233 | <td>113.18</td> |
| 1234 | <td>113.44</td> |
| 1235 | <td>111.85</td> |
| 1236 | <td>109.27</td> |
| 1237 | <td>114.41</td> |
| 1238 | <td>110.28</td> |
| 1239 | <td>111.78</td> |
| 1240 | <td>113.74</td> |
| 1241 | <td>125</td> |
| 1242 | </tr> |
| 1243 | <tr><td>list-aa-aos</td> |
| 1244 | <td>121.28</td> |
| 1245 | <td>118.63</td> |
| 1246 | <td>119.00</td> |
| 1247 | <td>118.50</td> |
| 1248 | <td>121.99</td> |
| 1249 | <td>119.11</td> |
| 1250 | <td>118.83</td> |
| 1251 | <td>121.47</td> |
| 1252 | <td>121.62</td> |
| 1253 | <td>126.18</td> |
| 1254 | <td>120.12</td> |
| 1255 | <td>129</td> |
| 1256 | </tr> |
| 1257 | <tr><td>list-aa-soa</td> |
| 1258 | <td>126.34</td> |
| 1259 | <td>116.90</td> |
| 1260 | <td>129.45</td> |
| 1261 | <td>127.12</td> |
| 1262 | <td>129.41</td> |
| 1263 | <td>121.42</td> |
| 1264 | <td>126.19</td> |
| 1265 | <td>126.76</td> |
| 1266 | <td>126.70</td> |
| 1267 | <td>124.40</td> |
| 1268 | <td>125.22</td> |
| 1269 | <td>129</td> |
| 1270 | </tr> |
| 1271 | <tr><td>list-aa-ria-soa</td> |
| 1272 | <td>133.68</td> |
| 1273 | <td>121.82</td> |
| 1274 | <td>126.04</td> |
| 1275 | <td>128.46</td> |
| 1276 | <td>131.15</td> |
| 1277 | <td>132.25</td> |
| 1278 | <td>128.78</td> |
| 1279 | <td>133.50</td> |
| 1280 | <td>126.69</td> |
| 1281 | <td>124.40</td> |
| 1282 | <td>130.37</td> |
| 1283 | <td>145</td> |
| 1284 | </tr> |
| 1285 | <tr><td>list-aa-pv-soa</td> |
| 1286 | <td>146.22</td> |
| 1287 | <td>124.39</td> |
| 1288 | <td>130.73</td> |
| 1289 | <td>136.29</td> |
| 1290 | <td>137.61</td> |
| 1291 | <td>131.21</td> |
| 1292 | <td>138.65</td> |
| 1293 | <td>138.78</td> |
| 1294 | <td>127.02</td> |
| 1295 | <td>132.40</td> |
| 1296 | <td>138.37</td> |
| 1297 | <td>145</td> |
| 1298 | </tr> |
| 1299 | </tbody> |
| 1300 | </table> |
| 1301 | </div> |
| 1302 | <div class="section" id="broadwell-intel-xeon-e5-2630-v4"> |
| 1303 | <h2><a class="toc-backref" href="#id24">6.2 Broadwell, Intel Xeon E5-2630 v4</a></h2> |
| 1304 | <ul class="simple"> |
| 1305 | <li>Broadwell architecture, AVX2, FMA</li> |
| 1306 | <li>10 cores, 2.2 GHz</li> |
| 1307 | <li>SMT disabled</li> |
| 1308 | </ul> |
| 1309 | <p>memory bandwidth:</p> |
| 1310 | <ul class="simple"> |
| 1311 | <li>copy-19 48.0 GB/s</li> |
| 1312 | <li>copy-19-nt-sl 48.2 GB/s</li> |
| 1313 | <li>update-19 51.1 GB/s</li> |
| 1314 | </ul> |
| 1315 | <p>geometry dimensions: 500x100x100</p> |
| 1316 | <table border="1" class="docutils"> |
| 1317 | <colgroup> |
| 1318 | <col width="19%" /> |
| 1319 | <col width="7%" /> |
| 1320 | <col width="7%" /> |
| 1321 | <col width="7%" /> |
| 1322 | <col width="7%" /> |
| 1323 | <col width="7%" /> |
| 1324 | <col width="7%" /> |
| 1325 | <col width="7%" /> |
| 1326 | <col width="7%" /> |
| 1327 | <col width="7%" /> |
| 1328 | <col width="7%" /> |
| 1329 | <col width="7%" /> |
| 1330 | <col width="5%" /> |
| 1331 | </colgroup> |
| 1332 | <thead valign="bottom"> |
| 1333 | <tr><th class="head">kernel</th> |
| 1334 | <th class="head">pipe</th> |
| 1335 | <th class="head">blocks-2</th> |
| 1336 | <th class="head">blocks-4</th> |
| 1337 | <th class="head">blocks-6</th> |
| 1338 | <th class="head">blocks-8</th> |
| 1339 | <th class="head">blocks-10</th> |
| 1340 | <th class="head">blocks-15</th> |
| 1341 | <th class="head">blocks-16</th> |
| 1342 | <th class="head">blocks-20</th> |
| 1343 | <th class="head">blocks-25</th> |
| 1344 | <th class="head">blocks-32</th> |
| 1345 | <th class="head">RFM</th> |
| 1346 | </tr> |
| 1347 | </thead> |
| 1348 | <tbody valign="top"> |
| 1349 | <tr><td>blk-push-aos</td> |
| 1350 | <td>55.75</td> |
| 1351 | <td>47.62</td> |
| 1352 | <td>54.57</td> |
| 1353 | <td>57.10</td> |
| 1354 | <td>58.49</td> |
| 1355 | <td>59.00</td> |
| 1356 | <td>61.72</td> |
| 1357 | <td>60.56</td> |
| 1358 | <td>64.05</td> |
| 1359 | <td>61.10</td> |
| 1360 | <td>66.03</td> |
| 1361 | <td>105</td> |
| 1362 | </tr> |
| 1363 | <tr><td>blk-push-soa</td> |
| 1364 | <td>30.06</td> |
| 1365 | <td>31.09</td> |
| 1366 | <td>32.13</td> |
| 1367 | <td>32.54</td> |
| 1368 | <td>32.74</td> |
| 1369 | <td>32.72</td> |
| 1370 | <td>33.81</td> |
| 1371 | <td>33.19</td> |
| 1372 | <td>34.90</td> |
| 1373 | <td>33.21</td> |
| 1374 | <td>35.75</td> |
| 1375 | <td>105</td> |
| 1376 | </tr> |
| 1377 | <tr><td>blk-pull-aos</td> |
| 1378 | <td>53.80</td> |
| 1379 | <td>48.61</td> |
| 1380 | <td>53.08</td> |
| 1381 | <td>54.99</td> |
| 1382 | <td>56.08</td> |
| 1383 | <td>56.68</td> |
| 1384 | <td>59.20</td> |
| 1385 | <td>58.12</td> |
| 1386 | <td>61.49</td> |
| 1387 | <td>58.71</td> |
| 1388 | <td>63.45</td> |
| 1389 | <td>105</td> |
| 1390 | </tr> |
| 1391 | <tr><td>blk-pull-soa</td> |
| 1392 | <td>46.96</td> |
| 1393 | <td>46.61</td> |
| 1394 | <td>48.84</td> |
| 1395 | <td>49.70</td> |
| 1396 | <td>50.33</td> |
| 1397 | <td>50.46</td> |
| 1398 | <td>52.36</td> |
| 1399 | <td>51.39</td> |
| 1400 | <td>54.20</td> |
| 1401 | <td>51.61</td> |
| 1402 | <td>55.71</td> |
| 1403 | <td>105</td> |
| 1404 | </tr> |
| 1405 | <tr><td>aa-aos</td> |
| 1406 | <td>91.40</td> |
| 1407 | <td>66.99</td> |
| 1408 | <td>78.47</td> |
| 1409 | <td>83.38</td> |
| 1410 | <td>86.62</td> |
| 1411 | <td>88.62</td> |
| 1412 | <td>92.98</td> |
| 1413 | <td>91.54</td> |
| 1414 | <td>97.08</td> |
| 1415 | <td>94.93</td> |
| 1416 | <td>98.90</td> |
| 1417 | <td>168</td> |
| 1418 | </tr> |
| 1419 | <tr><td>aa-soa</td> |
| 1420 | <td>83.01</td> |
| 1421 | <td>69.96</td> |
| 1422 | <td>75.85</td> |
| 1423 | <td>77.72</td> |
| 1424 | <td>79.01</td> |
| 1425 | <td>79.29</td> |
| 1426 | <td>82.38</td> |
| 1427 | <td>80.11</td> |
| 1428 | <td>85.70</td> |
| 1429 | <td>83.91</td> |
| 1430 | <td>87.69</td> |
| 1431 | <td>168</td> |
| 1432 | </tr> |
| 1433 | <tr><td>aa-vec-soa</td> |
| 1434 | <td>112.03</td> |
| 1435 | <td>96.52</td> |
| 1436 | <td>105.32</td> |
| 1437 | <td>109.76</td> |
| 1438 | <td>112.55</td> |
| 1439 | <td>113.82</td> |
| 1440 | <td>120.55</td> |
| 1441 | <td>118.37</td> |
| 1442 | <td>126.30</td> |
| 1443 | <td>121.37</td> |
| 1444 | <td>131.94</td> |
| 1445 | <td>168</td> |
| 1446 | </tr> |
| 1447 | <tr><td>list-push-aos</td> |
| 1448 | <td>75.13</td> |
| 1449 | <td>74.18</td> |
| 1450 | <td>75.20</td> |
| 1451 | <td>75.42</td> |
| 1452 | <td>75.24</td> |
| 1453 | <td>75.99</td> |
| 1454 | <td>75.80</td> |
| 1455 | <td>75.80</td> |
| 1456 | <td>75.54</td> |
| 1457 | <td>76.22</td> |
| 1458 | <td>76.21</td> |
| 1459 | <td>97</td> |
| 1460 | </tr> |
| 1461 | <tr><td>list-push-soa</td> |
| 1462 | <td>40.99</td> |
| 1463 | <td>38.14</td> |
| 1464 | <td>39.00</td> |
| 1465 | <td>38.89</td> |
| 1466 | <td>38.89</td> |
| 1467 | <td>39.67</td> |
| 1468 | <td>39.87</td> |
| 1469 | <td>39.28</td> |
| 1470 | <td>39.35</td> |
| 1471 | <td>40.08</td> |
| 1472 | <td>40.13</td> |
| 1473 | <td>97</td> |
| 1474 | </tr> |
| 1475 | <tr><td>list-pull-aos</td> |
| 1476 | <td>82.07</td> |
| 1477 | <td>82.88</td> |
| 1478 | <td>83.29</td> |
| 1479 | <td>83.09</td> |
| 1480 | <td>83.32</td> |
| 1481 | <td>83.49</td> |
| 1482 | <td>82.82</td> |
| 1483 | <td>82.88</td> |
| 1484 | <td>83.32</td> |
| 1485 | <td>82.60</td> |
| 1486 | <td>82.93</td> |
| 1487 | <td>97</td> |
| 1488 | </tr> |
| 1489 | <tr><td>list-pull-soa</td> |
| 1490 | <td>62.07</td> |
| 1491 | <td>60.40</td> |
| 1492 | <td>61.89</td> |
| 1493 | <td>61.39</td> |
| 1494 | <td>62.43</td> |
| 1495 | <td>60.90</td> |
| 1496 | <td>60.48</td> |
| 1497 | <td>62.80</td> |
| 1498 | <td>62.50</td> |
| 1499 | <td>61.10</td> |
| 1500 | <td>60.38</td> |
| 1501 | <td>97</td> |
| 1502 | </tr> |
| 1503 | <tr><td>list-pull-split-nt-1s-soa</td> |
| 1504 | <td>125.81</td> |
| 1505 | <td>120.60</td> |
| 1506 | <td>121.96</td> |
| 1507 | <td>122.34</td> |
| 1508 | <td>122.86</td> |
| 1509 | <td>123.53</td> |
| 1510 | <td>123.64</td> |
| 1511 | <td>123.67</td> |
| 1512 | <td>125.94</td> |
| 1513 | <td>124.09</td> |
| 1514 | <td>123.69</td> |
| 1515 | <td>128</td> |
| 1516 | </tr> |
| 1517 | <tr><td>list-pull-split-nt-2s-soa</td> |
| 1518 | <td>122.79</td> |
| 1519 | <td>117.16</td> |
| 1520 | <td>118.86</td> |
| 1521 | <td>119.16</td> |
| 1522 | <td>119.56</td> |
| 1523 | <td>119.99</td> |
| 1524 | <td>120.01</td> |
| 1525 | <td>120.03</td> |
| 1526 | <td>122.64</td> |
| 1527 | <td>120.57</td> |
| 1528 | <td>120.39</td> |
| 1529 | <td>128</td> |
| 1530 | </tr> |
| 1531 | <tr><td>list-aa-aos</td> |
| 1532 | <td>128.13</td> |
| 1533 | <td>127.41</td> |
| 1534 | <td>129.31</td> |
| 1535 | <td>129.07</td> |
| 1536 | <td>129.79</td> |
| 1537 | <td>129.63</td> |
| 1538 | <td>129.67</td> |
| 1539 | <td>129.94</td> |
| 1540 | <td>129.12</td> |
| 1541 | <td>128.41</td> |
| 1542 | <td>129.72</td> |
| 1543 | <td>150</td> |
| 1544 | </tr> |
| 1545 | <tr><td>list-aa-soa</td> |
| 1546 | <td>141.60</td> |
| 1547 | <td>139.78</td> |
| 1548 | <td>141.58</td> |
| 1549 | <td>142.16</td> |
| 1550 | <td>141.94</td> |
| 1551 | <td>141.31</td> |
| 1552 | <td>142.37</td> |
| 1553 | <td>142.25</td> |
| 1554 | <td>142.43</td> |
| 1555 | <td>141.40</td> |
| 1556 | <td>142.26</td> |
| 1557 | <td>150</td> |
| 1558 | </tr> |
| 1559 | <tr><td>list-aa-ria-soa</td> |
| 1560 | <td>141.82</td> |
| 1561 | <td>134.88</td> |
| 1562 | <td>140.15</td> |
| 1563 | <td>140.72</td> |
| 1564 | <td>141.67</td> |
| 1565 | <td>140.51</td> |
| 1566 | <td>141.18</td> |
| 1567 | <td>141.29</td> |
| 1568 | <td>142.97</td> |
| 1569 | <td>141.94</td> |
| 1570 | <td>143.25</td> |
| 1571 | <td>168</td> |
| 1572 | </tr> |
| 1573 | <tr><td>list-aa-pv-soa</td> |
| 1574 | <td>164.79</td> |
| 1575 | <td>140.95</td> |
| 1576 | <td>159.24</td> |
| 1577 | <td>161.78</td> |
| 1578 | <td>162.40</td> |
| 1579 | <td>163.04</td> |
| 1580 | <td>164.69</td> |
| 1581 | <td>164.38</td> |
| 1582 | <td>165.11</td> |
| 1583 | <td>165.75</td> |
| 1584 | <td>166.09</td> |
| 1585 | <td>168</td> |
| 1586 | </tr> |
| 1587 | </tbody> |
| 1588 | </table> |
| 1589 | </div> |
| 1590 | <div class="section" id="skylake-intel-xeon-gold-6148"> |
| 1591 | <h2><a class="toc-backref" href="#id25">6.3 Skylake, Intel Xeon Gold 6148</a></h2> |
| 1592 | <ul class="simple"> |
| 1593 | <li>Skylake architecture, AVX2, FMA, AVX512</li> |
| 1594 | <li>20 cores, 2.4 GHz</li> |
| 1595 | <li>SMT enabled</li> |
| 1596 | </ul> |
| 1597 | <p>memory bandwidth:</p> |
| 1598 | <ul class="simple"> |
| 1599 | <li>copy-19 89.7 GB/s</li> |
| 1600 | <li>copy-19-nt-sl 92.4 GB/s</li> |
| 1601 | <li>update-19 93.6 GB/s</li> |
| 1602 | </ul> |
| 1603 | <p>geometry dimensions: 500x100x100</p> |
| 1604 | <table border="1" class="docutils"> |
| 1605 | <colgroup> |
| 1606 | <col width="20%" /> |
| 1607 | <col width="7%" /> |
| 1608 | <col width="7%" /> |
| 1609 | <col width="7%" /> |
| 1610 | <col width="7%" /> |
| 1611 | <col width="7%" /> |
| 1612 | <col width="7%" /> |
| 1613 | <col width="7%" /> |
| 1614 | <col width="7%" /> |
| 1615 | <col width="7%" /> |
| 1616 | <col width="7%" /> |
| 1617 | <col width="7%" /> |
| 1618 | <col width="2%" /> |
| 1619 | </colgroup> |
| 1620 | <thead valign="bottom"> |
| 1621 | <tr><th class="head">kernel</th> |
| 1622 | <th class="head">pipe</th> |
| 1623 | <th class="head">blocks-2</th> |
| 1624 | <th class="head">blocks-4</th> |
| 1625 | <th class="head">blocks-6</th> |
| 1626 | <th class="head">blocks-8</th> |
| 1627 | <th class="head">blocks-10</th> |
| 1628 | <th class="head">blocks-15</th> |
| 1629 | <th class="head">blocks-16</th> |
| 1630 | <th class="head">blocks-20</th> |
| 1631 | <th class="head">blocks-25</th> |
| 1632 | <th class="head">blocks-32</th> |
| 1633 | <th class="head">RFM</th> |
| 1634 | </tr> |
| 1635 | </thead> |
| 1636 | <tbody valign="top"> |
| 1637 | <tr><td>blk-push-aos</td> |
| 1638 | <td>113.01</td> |
| 1639 | <td>93.99</td> |
| 1640 | <td>108.98</td> |
| 1641 | <td>114.65</td> |
| 1642 | <td>117.87</td> |
| 1643 | <td>119.47</td> |
| 1644 | <td>124.95</td> |
| 1645 | <td>122.46</td> |
| 1646 | <td>129.29</td> |
| 1647 | <td>123.87</td> |
| 1648 | <td>133.01</td> |
| 1649 | <td>197</td> |
| 1650 | </tr> |
| 1651 | <tr><td>blk-push-soa</td> |
| 1652 | <td>100.21</td> |
| 1653 | <td>98.87</td> |
| 1654 | <td>103.63</td> |
| 1655 | <td>105.56</td> |
| 1656 | <td>107.02</td> |
| 1657 | <td>107.27</td> |
| 1658 | <td>111.61</td> |
| 1659 | <td>109.83</td> |
| 1660 | <td>116.16</td> |
| 1661 | <td>110.51</td> |
| 1662 | <td>110.29</td> |
| 1663 | <td>197</td> |
| 1664 | </tr> |
| 1665 | <tr><td>blk-pull-aos</td> |
| 1666 | <td>118.45</td> |
| 1667 | <td>102.54</td> |
| 1668 | <td>114.12</td> |
| 1669 | <td>117.82</td> |
| 1670 | <td>122.69</td> |
| 1671 | <td>124.31</td> |
| 1672 | <td>130.58</td> |
| 1673 | <td>127.85</td> |
| 1674 | <td>135.72</td> |
| 1675 | <td>129.65</td> |
| 1676 | <td>139.94</td> |
| 1677 | <td>197</td> |
| 1678 | </tr> |
| 1679 | <tr><td>blk-pull-soa</td> |
| 1680 | <td>82.60</td> |
| 1681 | <td>83.36</td> |
| 1682 | <td>87.13</td> |
| 1683 | <td>88.39</td> |
| 1684 | <td>88.84</td> |
| 1685 | <td>88.96</td> |
| 1686 | <td>92.48</td> |
| 1687 | <td>90.93</td> |
| 1688 | <td>95.79</td> |
| 1689 | <td>91.92</td> |
| 1690 | <td>98.64</td> |
| 1691 | <td>197</td> |
| 1692 | </tr> |
| 1693 | <tr><td>aa-aos</td> |
| 1694 | <td>171.32</td> |
| 1695 | <td>125.43</td> |
| 1696 | <td>147.73</td> |
| 1697 | <td>157.70</td> |
| 1698 | <td>163.35</td> |
| 1699 | <td>167.25</td> |
| 1700 | <td>175.39</td> |
| 1701 | <td>174.20</td> |
| 1702 | <td>182.54</td> |
| 1703 | <td>173.67</td> |
| 1704 | <td>187.76</td> |
| 1705 | <td>308</td> |
| 1706 | </tr> |
| 1707 | <tr><td>aa-soa</td> |
| 1708 | <td>180.85</td> |
| 1709 | <td>152.39</td> |
| 1710 | <td>165.84</td> |
| 1711 | <td>152.59</td> |
| 1712 | <td>171.90</td> |
| 1713 | <td>175.76</td> |
| 1714 | <td>184.94</td> |
| 1715 | <td>182.34</td> |
| 1716 | <td>189.43</td> |
| 1717 | <td>180.30</td> |
| 1718 | <td>193.54</td> |
| 1719 | <td>308</td> |
| 1720 | </tr> |
| 1721 | <tr><td>aa-vec-soa</td> |
| 1722 | <td>208.03</td> |
| 1723 | <td>181.51</td> |
| 1724 | <td>195.86</td> |
| 1725 | <td>203.41</td> |
| 1726 | <td>209.08</td> |
| 1727 | <td>212.34</td> |
| 1728 | <td>224.05</td> |
| 1729 | <td>219.49</td> |
| 1730 | <td>234.31</td> |
| 1731 | <td>225.92</td> |
| 1732 | <td>245.22</td> |
| 1733 | <td>308</td> |
| 1734 | </tr> |
| 1735 | <tr><td>list-push-aos</td> |
| 1736 | <td>158.81</td> |
| 1737 | <td>164.67</td> |
| 1738 | <td>162.93</td> |
| 1739 | <td>163.05</td> |
| 1740 | <td>165.22</td> |
| 1741 | <td>164.31</td> |
| 1742 | <td>164.66</td> |
| 1743 | <td>160.78</td> |
| 1744 | <td>164.07</td> |
| 1745 | <td>165.19</td> |
| 1746 | <td>164.06</td> |
| 1747 | <td>177</td> |
| 1748 | </tr> |
| 1749 | <tr><td>list-push-soa</td> |
| 1750 | <td>134.60</td> |
| 1751 | <td>110.44</td> |
| 1752 | <td>110.17</td> |
| 1753 | <td>132.01</td> |
| 1754 | <td>132.95</td> |
| 1755 | <td>133.46</td> |
| 1756 | <td>134.37</td> |
| 1757 | <td>134.33</td> |
| 1758 | <td>135.12</td> |
| 1759 | <td>134.91</td> |
| 1760 | <td>137.87</td> |
| 1761 | <td>177</td> |
| 1762 | </tr> |
| 1763 | <tr><td>list-pull-aos</td> |
| 1764 | <td>169.61</td> |
| 1765 | <td>170.03</td> |
| 1766 | <td>170.89</td> |
| 1767 | <td>170.90</td> |
| 1768 | <td>171.20</td> |
| 1769 | <td>171.60</td> |
| 1770 | <td>172.09</td> |
| 1771 | <td>171.95</td> |
| 1772 | <td>169.48</td> |
| 1773 | <td>172.08</td> |
| 1774 | <td>171.02</td> |
| 1775 | <td>177</td> |
| 1776 | </tr> |
| 1777 | <tr><td>list-pull-soa</td> |
| 1778 | <td>120.50</td> |
| 1779 | <td>116.73</td> |
| 1780 | <td>118.62</td> |
| 1781 | <td>118.00</td> |
| 1782 | <td>120.99</td> |
| 1783 | <td>118.15</td> |
| 1784 | <td>117.17</td> |
| 1785 | <td>121.41</td> |
| 1786 | <td>120.83</td> |
| 1787 | <td>120.00</td> |
| 1788 | <td>118.74</td> |
| 1789 | <td>177</td> |
| 1790 | </tr> |
| 1791 | <tr><td>list-pull-split-nt-1s-soa</td> |
| 1792 | <td>225.59</td> |
| 1793 | <td>224.18</td> |
| 1794 | <td>225.10</td> |
| 1795 | <td>226.34</td> |
| 1796 | <td>226.01</td> |
| 1797 | <td>230.37</td> |
| 1798 | <td>227.50</td> |
| 1799 | <td>228.42</td> |
| 1800 | <td>227.39</td> |
| 1801 | <td>231.65</td> |
| 1802 | <td>227.35</td> |
| 1803 | <td>246</td> |
| 1804 | </tr> |
| 1805 | <tr><td>list-pull-split-nt-2s-soa</td> |
| 1806 | <td>219.20</td> |
| 1807 | <td>214.63</td> |
| 1808 | <td>217.61</td> |
| 1809 | <td>218.13</td> |
| 1810 | <td>219.07</td> |
| 1811 | <td>221.01</td> |
| 1812 | <td>219.88</td> |
| 1813 | <td>220.09</td> |
| 1814 | <td>220.62</td> |
| 1815 | <td>221.68</td> |
| 1816 | <td>220.58</td> |
| 1817 | <td>246</td> |
| 1818 | </tr> |
| 1819 | <tr><td>list-aa-aos</td> |
| 1820 | <td>241.39</td> |
| 1821 | <td>239.27</td> |
| 1822 | <td>239.53</td> |
| 1823 | <td>242.56</td> |
| 1824 | <td>242.46</td> |
| 1825 | <td>243.00</td> |
| 1826 | <td>242.91</td> |
| 1827 | <td>242.46</td> |
| 1828 | <td>241.24</td> |
| 1829 | <td>242.96</td> |
| 1830 | <td>241.52</td> |
| 1831 | <td>275</td> |
| 1832 | </tr> |
| 1833 | <tr><td>list-aa-soa</td> |
| 1834 | <td>273.73</td> |
| 1835 | <td>268.49</td> |
| 1836 | <td>268.48</td> |
| 1837 | <td>271.79</td> |
| 1838 | <td>275.29</td> |
| 1839 | <td>274.56</td> |
| 1840 | <td>277.18</td> |
| 1841 | <td>272.67</td> |
| 1842 | <td>274.21</td> |
| 1843 | <td>275.24</td> |
| 1844 | <td>278.21</td> |
| 1845 | <td>275</td> |
| 1846 | </tr> |
| 1847 | <tr><td>list-aa-ria-soa</td> |
| 1848 | <td>288.42</td> |
| 1849 | <td>261.89</td> |
| 1850 | <td>273.26</td> |
| 1851 | <td>284.84</td> |
| 1852 | <td>283.88</td> |
| 1853 | <td>288.29</td> |
| 1854 | <td>290.72</td> |
| 1855 | <td>289.81</td> |
| 1856 | <td>293.36</td> |
| 1857 | <td>290.75</td> |
| 1858 | <td>292.93</td> |
| 1859 | <td>308</td> |
| 1860 | </tr> |
| 1861 | <tr><td>list-aa-pv-soa</td> |
| 1862 | <td>303.35</td> |
| 1863 | <td>267.21</td> |
| 1864 | <td>289.18</td> |
| 1865 | <td>294.96</td> |
| 1866 | <td>294.36</td> |
| 1867 | <td>298.16</td> |
| 1868 | <td>300.45</td> |
| 1869 | <td>301.71</td> |
| 1870 | <td>302.37</td> |
| 1871 | <td>302.88</td> |
| 1872 | <td>304.46</td> |
| 1873 | <td>308</td> |
| 1874 | </tr> |
| 1875 | </tbody> |
| 1876 | </table> |
| 1877 | </div> |
| 1878 | </div> |
| 1879 | <div class="section" id="licence"> |
| 1880 | <h1><a class="toc-backref" href="#id26">7 Licence</a></h1> |
| 1881 | <p>The Lattice Boltzmann Benchmark Kernels are licensed under GPLv3.</p> |
| 1882 | </div> |
| 1883 | <div class="section" id="acknowledgements"> |
| 1884 | <h1><a class="toc-backref" href="#id27">8 Acknowledgements</a></h1> |
| 1885 | <p>This work was funded by BMBF, grant no. 01IH15003A (project SKAMPY).</p> |
| 1886 | <p>This work was funded by KONWIHR project OMI4PAPS.</p> |
| 1887 | </div> |
| 1888 | <div class="section" id="bibliography"> |
| 1889 | <h1><a class="toc-backref" href="#id28">9 Bibliography</a></h1> |
| 1890 | <table class="docutils citation" frame="void" id="ginzburg-2008" rules="none"> |
| 1891 | <colgroup><col class="label" /><col /></colgroup> |
| 1892 | <tbody valign="top"> |
| 1893 | <tr><td class="label"><a class="fn-backref" href="#id1">[ginzburg-2008]</a></td><td>I. Ginzburg, F. Verhaeghe, and D. d'Humières. |
| 1894 | Two-relaxation-time lattice Boltzmann scheme: About parametrization, velocity, pressure and mixed boundary conditions. |
| 1895 | Commun. Comput. Phys., 3(2):427-478, 2008.</td></tr> |
| 1896 | </tbody> |
| 1897 | </table> |
| 1898 | <table class="docutils citation" frame="void" id="williams-2008" rules="none"> |
| 1899 | <colgroup><col class="label" /><col /></colgroup> |
| 1900 | <tbody valign="top"> |
| 1901 | <tr><td class="label"><a class="fn-backref" href="#id3">[williams-2008]</a></td><td>S. Williams, A. Waterman, and D. Patterson. |
| 1902 | Roofline: an insightful visual performance model for multicore architectures. |
| 1903 | Commun. ACM, 52(4):65-76, Apr 2009. doi:10.1145/1498765.1498785</td></tr> |
| 1904 | </tbody> |
| 1905 | </table> |
| 1906 | <p>This document was generated on 2017-11-21 at 15:43.</p> |
| 1907 | </div> |
| 1908 | </div> |
| 1909 | </body> |
| 1910 | </html> |