From e3f82424829ebb623343ce0092238f83b4a1b8c2 Mon Sep 17 00:00:00 2001 From: Markus Wittmann Date: Thu, 2 Nov 2017 15:54:11 +0100 Subject: [PATCH] bulk commit - add AA pattern full array kernels - add padding for list kernels - transposed loops --- doc/Makefile | 2 +- doc/html/main.html | 411 ++++++++++-- doc/main.css | 46 ++ doc/main.rst | 194 +++++- src/Base.h | 14 +- src/BenchKernelD3Q19.c | 66 +- src/BenchKernelD3Q19Aa.c | 529 ++++++++++++++++ src/BenchKernelD3Q19Aa.h | 41 ++ src/BenchKernelD3Q19AaCommon.c | 651 +++++++++++++++++++ src/BenchKernelD3Q19AaCommon.h | 89 +++ src/BenchKernelD3Q19AaVec.c | 610 ++++++++++++++++++ src/BenchKernelD3Q19AaVec.h | 38 ++ src/BenchKernelD3Q19AaVecCommon.c | 656 ++++++++++++++++++++ src/BenchKernelD3Q19AaVecCommon.h | 93 +++ src/BenchKernelD3Q19Common.c | 167 +++-- src/BenchKernelD3Q19Common.h | 4 +- src/BenchKernelD3Q19List.c | 4 + src/BenchKernelD3Q19ListAaCommon.c | 68 +- src/BenchKernelD3Q19ListAaPv.c | 40 +- src/BenchKernelD3Q19ListAaPvCommon.c | 65 +- src/BenchKernelD3Q19ListAaRiaCommon.c | 64 +- src/BenchKernelD3Q19ListCommon.c | 68 +- src/BenchKernelD3Q19ListPullSplitNt.c | 16 + src/BenchKernelD3Q19ListPullSplitNtCommon.c | 59 +- src/Geometry.c | 49 +- src/Geometry.h | 3 +- src/Kernel.h | 2 +- src/KernelFunctions.h | 18 + src/Lattice.c | 59 ++ src/Lattice.h | 1 + src/Main.c | 6 +- src/Makefile | 45 +- src/Memory.c | 26 + src/Padding.c | 300 +++++++++ src/Padding.h | 19 + src/Pinning.c | 67 +- src/Pinning.h | 9 +- src/test.sh | 2 +- 38 files changed, 4228 insertions(+), 373 deletions(-) create mode 100644 doc/main.css create mode 100644 src/BenchKernelD3Q19Aa.c create mode 100644 src/BenchKernelD3Q19Aa.h create mode 100644 src/BenchKernelD3Q19AaCommon.c create mode 100644 src/BenchKernelD3Q19AaCommon.h create mode 100644 src/BenchKernelD3Q19AaVec.c create mode 100644 src/BenchKernelD3Q19AaVec.h create mode 100644 src/BenchKernelD3Q19AaVecCommon.c create mode 100644 src/BenchKernelD3Q19AaVecCommon.h create mode 100644 src/Lattice.c create mode 100644 src/Padding.c create mode 100644 src/Padding.h diff --git a/doc/Makefile b/doc/Makefile index 690e4eb..6417a6e 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -34,5 +34,5 @@ main: main.rst #main.css [ -d html ] || mkdir -p html # rst2html --stylesheet=html4css1.css,main.css $< html/$@.html - rst2html --stylesheet=html4css1.css $< html/$@.html + rst2html --stylesheet=html4css1.css,main.css $< html/$@.html diff --git a/doc/html/main.html b/doc/html/main.html index 36579cc..511f6b2 100644 --- a/doc/html/main.html +++ b/doc/html/main.html @@ -335,6 +335,56 @@ h4 tt.docutils, h5 tt.docutils, h6 tt.docutils { ul.auto-toc { list-style-type: none } + + @@ -375,15 +425,23 @@ ul.auto-toc {

1.5 Options Summary

1.5 Cleaning

1.6 Options Summary

2 Invocation

2.1 Command Line Parameters
2 Invocation
- 2.1 Command Line Parameters
- 2.2 Kernels
3 Benchmarking
4 Acknowledgements
3 Benchmarking
- 3.1 Padding
+
4 Geometries
5 Results
6 Licence
7 Acknowledgements

@@ -399,15 +457,15 @@ depending on compiler and build configuration.

1.1 Debug and Verification

-make
+make BUILD=debug BENCHMARK=off

Running make without any arguments builds the debug version (BUILD=debug) of +

Running make with BUILD=debug builds the debug version of the benchmark kernels, where no optimizations are performed, line numbers and debug symbols are included as well as DEBUG will be defined. The resulting binary will be found in the bin subdirectory and named lbmbenchk-linux-<compiler>-debug.

Without any further specification the binary includes verification -(VERIFICATION=on), statistics (STATISTICS), and VTK output +

Specifying BENCHMARK=off turns on verification +(VERIFICATION=on), statistics (STATISTICS=on), and VTK output (VTK_OUTPUT=on) enabled.

Please note that the generated binary will therefore exhibit a poor performance.

@@ -416,9 +474,10 @@ exhibit a poor performance.

1.2 Benchmarking

To generate a binary for benchmarking run make with

-make BENCHMARK=on BUILD=release
+make

Here BUILD=release turns optimizations on and BENCHMARK=on disables +

As default BENCHMARK=on and BUILD=release is set, where +BUILD=release turns optimizations on and BENCHMARK=on disables verfification, statistics, and VTK output.

@@ -426,7 +485,7 @@ verfification, statistics, and VTK output.

Verification with the debug builds can be extremely slow. Hence verification capabilities can be build with release builds:

-make BUILD=release
+make BENCHMARK=off

@@ -435,8 +494,23 @@ make BUILD=release both configuration can be chosen via CONFIG=linux-gcc or CONFIG=linux-intel.

1.5 Cleaning

For each configuration and build (debug/release) a subdirectory under the +src/obj directory is created where the dependency and object files are +stored. +With

+make CONFIG=... BUILD=... clean
+

a specific combination is select and cleaned, whereas with

+make clean-all
+

all object and dependency files are deleted.

1.5 Options Summary

1.6 Options Summary

Options that can be specified when building the framework with make:

@@ -451,19 +525,14 @@ both configuration can be chosen via -2 Invocation +2 Invocation Running the binary will print among the GPL licence header a line like the following: - -LBM Benchmark Kernels 0.1, compiled Jul 5 2017 21:59:22, type: verification ++LBM Benchmark Kernels 0.1, compiled Jul 5 2017 21:59:22, type: verification + if verfication was enabled during compilation or - -LBM Benchmark Kernels 0.1, compiled Jul 5 2017 21:59:22, type: benchmark ++LBM Benchmark Kernels 0.1, compiled Jul 5 2017 21:59:22, type: benchmark + if verfication was disabled during compilation. -2.1 Command Line Parameters +2.1 Command Line Parameters Running the binary with -h list all available parameters: Usage: @@ -540,7 +616,7 @@ $ bin/lbmbenchk-linux-intel-release -verfiy -dims 32x32x32 Kernel specific parameters can be opatained via selecting the specific kernel and passing -h as parameter: -$ bin/lbmbenchk-linux-intel-release -kernel -- -h +$ bin/lbmbenchk-linux-intel-release -kernel kernel-name -- -h ... Kernel parameters: [-blk <n>] [-blk-[xyz] <n>] @@ -574,14 +650,225 @@ Available kernels to benchmark: blk-pull-aos + +2.2 Kernels +The following list shortly describes available kernels: + +push-soa/push-aos/pull-soa/pull-aos: +Unoptimized kernels (but stream/collide are already fused) using two grids as +source and destination. Implement push/pull semantics as well structure of +arrays (soa) or array of structures (aos) layout. +blk-push-soa/blk-push-aos/blk-pull-soa/blk-pull-aos: +The same as the unoptimized kernels without the blk prefix, except that they support +spatial blocking, i.e. loop blocking of the three loops used to iterate over +the lattice. Here manual work sharing for OpenMP is used. +list-push-soa/list-push-aos/list-pull-soa/list-pull-aos: +The same as the unoptimized kernels without the list prefix, but for indirect addressing. +Here only a 1D vector of is used to store the fluid nodes, omitting the +obstacles. An adjacency list is used to recover the neighborhood associations. +list-pull-split-nt-1s-soa/list-pull-split-nt-2s-soa: +Optimized variant of list-pull-soa. Chunks of the lattice are processed as +once. Postcollision values are written back via nontemporal stores in 18 (1s) +or 9 (2s) loops. +list-aa-aos/list-aa-soa: +Unoptimized implementation of the AA pattern for the 1D vector with adjacency +list. Supported are array of structures (aos) and structure of arrays (soa) +data layout is supported. +list-aa-ria-soa: +Implementation of AA pattern with intrinsics for the 1D vector with adjacency +list. Furthermore it contains a vectorized even time step and run length +coding to reduce the loop balance of the odd time step. +list-aa-pv-soa: +All optimizations of list-aa-ria-soa. Additional with partial vectorization +of the odd time step. + +Note that all array of structures (aos) kernels might require blocking +(depending on the domain size) to reach the performance of their structure of +arrays (soa) counter parts. +The following table summarizes the properties of the kernels. Here D means +direct addressing, i.e. full array, I means indirect addressing, i.e. 1D +vector with adjacency list, x means supported, whereas -- means unsupported. +The loop balance B_l is computed for D3Q19 model with double precision floating +point for PDFs (8 byte) and 4 byte integers for the index (adjacency list). +As list-aa-ria-soa and list-aa-pv-soa support run length coding their effective +loop balance depends on the geometry. The effective loop balance is printed +during each run. +

+++++++++ +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

kernel name	prop. step	data layout	addr.	parallel	blocking	B_l [B/FLUP]
push-soa	OS	SoA	D	x	--	456
push-aos	OS	AoS	D	x	--	456
pull-soa	OS	SoA	D	x	--	456
pull-aos	OS	AoS	D	x	--	456
blk-push-soa	OS	SoA	D	x	x	456
blk-push-aos	OS	AoS	D	x	x	456
blk-pull-soa	OS	SoA	D	x	x	456
blk-pull-aos	OS	AoS	D	x	x	456
list-push-soa	OS	SoA	I	x	x	528
list-push-aos	OS	AoS	I	x	x	528
list-pull-soa	OS	SoA	I	x	x	528
list-pull-aos	OS	AoS	I	x	x	528
list-pull-split-nt-1s	OS	SoA	I	x	x	376
list-pull-split-nt-2s	OS	SoA	I	x	x	376
list-aa-soa	AA	SoA	I	x	x	340
list-aa-aos	AA	AoS	I	x	x	340
list-aa-ria-soa	AA	SoA	I	x	x	304-342
list-aa-pv-soa	AA	SoA	I	x	x	304-342

+

-3 Benchmarking +3 Benchmarking Correct benchmarking is a nontrivial task. Whenever benchmark results should be created make sure the binary was compiled with: -BENCHMARK=on and -BUILD=release and +BENCHMARK=on (default if not overriden) and +BUILD=release (default if not overriden) and the correct ISA for macros is used, selected via ISA and use TARCH to specify the architecture the compiler generates code for. @@ -594,7 +881,13 @@ $ bin/lbmbenchk-linux-intel-release ... -t 10 -pin $(seq -s , 0 9) transparent huge pages: when allocating memory small 4 KiB pages might be replaced with larger ones. This is in general a good thing, but if this is -really the case, depends on the system settings. +really the case, depends on the system settings (check e.g. the status of +/sys/kernel/mm/transparent_hugepage/enabled). +Currently madvise(MADV_HUGEPAGE) is used for allocations which are aligned to +a 4 KiB page, which should be the case for the lattices. +This should result in huge pages except THP is disabled on the machine. +(NOTE: madvise() is used if HAVE_HUGE_PAGES is defined, which is currently +hard coded defined in Memory.c). CPU/core frequency: For reproducible results the frequency of all cores should be fixed. NUMA placement policy: The benchmark assumes a first touch policy, which @@ -604,19 +897,63 @@ used is already full memory might be allocated in a remote domain. Accesses to remote domains typically have a higher latency and lower bandwidth. System load: interference with other application, espcially on desktop systems should be avoided. -Padding: most kernels do not care about padding against cache or TLB -thrashing. Even if the number of (fluid) nodes suggest everything is fine, -through parallelization still problems might occur. +Padding: For SoA based kernels the number of (fluid) nodes is automatically +adjusted so that no cache or TLB thrashing should occur. The parameters are +optimized for current Intel based systems. For more details look into the +padding section. CPU dispatcher function: the compiler might add different versions of a function for different ISA extensions. Make sure the code you might think is executed is actually the code which is executed. + +3.1 Padding +With correct padding cache and TLB thrashing can be avoided. Therefore the +number of (fluid) nodes used in the data layout is artificially increased. +Currently automatic padding is active for kernels which support it. It can be +controlled via the kernel parameter (i.e. parameter after the --) +-pad. Supported values are auto (default), no (to disable padding), +or a manual padding. +Automatic padding tries to avoid cache and TLB thrashing and pads for a 32 +entry (huge pages) TLB with 8 sets and a 512 set (L2) cache. This reflects the +parameters of current Intel based processors. +Manual padding is done via a padding string and has the format +mod_1+offset_1(,mod_n+offset_n), which specifies numbers of bytes. +SoA data layouts can exhibit TLB thrashing. Therefore we want to distribute the +19 pages with one lattice (36 with two lattices) we are concurrently accessing +over as much sets in the TLB as possible. +This is controlled by the distance between the accessed pages, which is the +number of (fluid) nodes in between them and can be adjusted by adding further +(fluid) nodes. +We want the distance d (in bytes) between two accessed pages to be e.g. +d % (PAGE_SIZE * TLB_SETS) = PAGE_SIZE. +This would distribute the pages evenly over the sets. Hereby PAGE_SIZE * TLB_SETS +would be our mod_1 and PAGE_SIZE (after the =) our offset_1. +Measurements show that with only a quarter of half of a page size as offset +higher performance is achieved, which is done by automatic padding. +On top of this padding more paddings can be added. They are just added to the +padding string and are separated by commas. +A zero modulus in the padding string has a special meaning. Here the +corresponding offset is just added to the number of nodes. A padding string +like -pad 0+16 would at a static padding of two nodes (one node = 8 b). + + + +4 Geometries +TODO: supported geometries: channel, pipe, blocks + + +5 Results +TODO + + +6 Licence +The Lattice Boltzmann Benchmark Kernels are licensed under GPLv3. -4 Acknowledgements +7 Acknowledgements This work was funded by BMBF, grant no. 01IH15003A (project SKAMPY). This work was funded by KONWHIR project OMI4PAPS. -Document was generated at 2017-10-26 09:43. +Document was generated at 2017-11-02 15:33. diff --git a/doc/main.css b/doc/main.css new file mode 100644 index 0000000..27d3f26 --- /dev/null +++ b/doc/main.css @@ -0,0 +1,46 @@ + +h1, h2, h3, h4, h5, h6 { + font-family: sans-serif; + font-size: 100%; + background-color: #dcdcdc; +} + +h1.title { + background-color: gray; + color: white +} + +table.footnote { + padding-left: 0.5ex; +} + +table.citation { + padding-left: 0.5ex +} + +td.label { + width: 10%; +} + +table, table.docutils, td, th { + border: 0; +} + +table.citation, table.footnote { + width: 100%; +} + +th { + background-color: lavender ; +} + +tr:nth-child(even) { + xxbackground-color: aliceblue; + background-color: white; +} +tr:nth-child(odd) { + xxbackground-color: lavender; + background-color: whitesmoke; +} + + diff --git a/doc/main.rst b/doc/main.rst index 9ad56ba..528b339 100644 --- a/doc/main.rst +++ b/doc/main.rst @@ -54,16 +54,16 @@ Debug and Verification :: - make + make BUILD=debug BENCHMARK=off -Running ``make`` without any arguments builds the debug version (BUILD=debug) of +Running ``make`` with ``BUILD=debug`` builds the debug version of the benchmark kernels, where no optimizations are performed, line numbers and debug symbols are included as well as ``DEBUG`` will be defined. The resulting binary will be found in the ``bin`` subdirectory and named ``lbmbenchk-linux--debug``. -Without any further specification the binary includes verification -(``VERIFICATION=on``), statistics (``STATISTICS``), and VTK output +Specifying ``BENCHMARK=off`` turns on verification +(``VERIFICATION=on``), statistics (``STATISTICS=on``), and VTK output (``VTK_OUTPUT=on``) enabled. Please note that the generated binary will therefore @@ -74,9 +74,10 @@ Benchmarking To generate a binary for benchmarking run make with :: - make BENCHMARK=on BUILD=release + make -Here BUILD=release turns optimizations on and BENCHMARK=on disables +As default ``BENCHMARK=on`` and ``BUILD=release`` is set, where +BUILD=release turns optimizations on and ``BENCHMARK=on`` disables verfification, statistics, and VTK output. Release and Verification @@ -85,7 +86,7 @@ Release and Verification Verification with the debug builds can be extremely slow. Hence verification capabilities can be build with release builds: :: - make BUILD=release + make BENCHMARK=off Compilers --------- @@ -94,6 +95,24 @@ Currently only the GCC and Intel compiler under Linux are supported. Between both configuration can be chosen via ``CONFIG=linux-gcc`` or ``CONFIG=linux-intel``. + +Cleaning +-------- + +For each configuration and build (debug/release) a subdirectory under the +``src/obj`` directory is created where the dependency and object files are +stored. +With :: + + make CONFIG=... BUILD=... clean + +a specific combination is select and cleaned, whereas with :: + + make clean-all + +all object and dependency files are deleted. + + Options Summary --------------- @@ -102,13 +121,13 @@ Options that can be specified when building the framework with make: ============= ======================= ============ ========================================================== name values default description ------------- ----------------------- ------------ ---------------------------------------------------------- -TARCH -- -- Via TARCH the architecture the compiler generates code for can be overriden. The value depends on the chose compiler. -BENCHMARK on, off off If enabled, disables VERIFICATION, STATISTICS, VTK_OUTPUT. -BUILD debug, release debug No optimization, debug symbols, DEBUG defined. +BENCHMARK on, off on If enabled, disables VERIFICATION, STATISTICS, VTK_OUTPUT. If disabled enables the three former options. +BUILD debug, release release No optimization, debug symbols, DEBUG defined. CONFIG linux-gcc, linux-intel linux-intel Select GCC or Intel compiler. ISA avx, sse avx Determines which ISA extension is used for macro definitions. This is *not* the architecture the compiler generates code for. OPENMP on, off on OpenMP, i.\,e.\. threading support. STATISTICS on, off off View statistics, like density etc, during simulation. +TARCH -- -- Via TARCH the architecture the compiler generates code for can be overridden. The value depends on the chosen compiler. VERIFICATION on, off off Turn verification on/off. VTK_OUTPUT on, off off Enable/Disable VTK file output. ============= ======================= ============ ========================================================== @@ -116,11 +135,11 @@ VTK_OUTPUT on, off off Enable/Disable VTK file outpu Invocation ========== -Running the binary will print among the GPL licence header a line like the following: +Running the binary will print among the GPL licence header a line like the following: :: LBM Benchmark Kernels 0.1, compiled Jul 5 2017 21:59:22, type: verification -if verfication was enabled during compilation or +if verfication was enabled during compilation or :: LBM Benchmark Kernels 0.1, compiled Jul 5 2017 21:59:22, type: benchmark @@ -159,7 +178,7 @@ iterations, etc, which can afterward be override, e.g.: :: Kernel specific parameters can be opatained via selecting the specific kernel and passing ``-h`` as parameter: :: - $ bin/lbmbenchk-linux-intel-release -kernel -- -h + $ bin/lbmbenchk-linux-intel-release -kernel kernel-name -- -h ... Kernel parameters: [-blk ] [-blk-[xyz] ] @@ -193,6 +212,82 @@ A list of all available kernels can be obtained via ``-list``: :: blk-pull-soa blk-pull-aos +Kernels +------- + +The following list shortly describes available kernels: + +- push-soa/push-aos/pull-soa/pull-aos: + Unoptimized kernels (but stream/collide are already fused) using two grids as + source and destination. Implement push/pull semantics as well structure of + arrays (soa) or array of structures (aos) layout. + +- blk-push-soa/blk-push-aos/blk-pull-soa/blk-pull-aos: + The same as the unoptimized kernels without the blk prefix, except that they support + spatial blocking, i.e. loop blocking of the three loops used to iterate over + the lattice. Here manual work sharing for OpenMP is used. + +- list-push-soa/list-push-aos/list-pull-soa/list-pull-aos: + The same as the unoptimized kernels without the list prefix, but for indirect addressing. + Here only a 1D vector of is used to store the fluid nodes, omitting the + obstacles. An adjacency list is used to recover the neighborhood associations. + +- list-pull-split-nt-1s-soa/list-pull-split-nt-2s-soa: + Optimized variant of list-pull-soa. Chunks of the lattice are processed as + once. Postcollision values are written back via nontemporal stores in 18 (1s) + or 9 (2s) loops. + +- list-aa-aos/list-aa-soa: + Unoptimized implementation of the AA pattern for the 1D vector with adjacency + list. Supported are array of structures (aos) and structure of arrays (soa) + data layout is supported. + +- list-aa-ria-soa: + Implementation of AA pattern with intrinsics for the 1D vector with adjacency + list. Furthermore it contains a vectorized even time step and run length + coding to reduce the loop balance of the odd time step. + +- list-aa-pv-soa: + All optimizations of list-aa-ria-soa. Additional with partial vectorization + of the odd time step. + + +Note that all array of structures (aos) kernels might require blocking +(depending on the domain size) to reach the performance of their structure of +arrays (soa) counter parts. + +The following table summarizes the properties of the kernels. Here **D** means +direct addressing, i.e. full array, **I** means indirect addressing, i.e. 1D +vector with adjacency list, **x** means supported, whereas **--** means unsupported. +The loop balance B_l is computed for D3Q19 model with double precision floating +point for PDFs (8 byte) and 4 byte integers for the index (adjacency list). +As list-aa-ria-soa and list-aa-pv-soa support run length coding their effective +loop balance depends on the geometry. The effective loop balance is printed +during each run. + + +====================== =========== =========== ===== ======== ======== ============ +kernel name prop. step data layout addr. parallel blocking B_l [B/FLUP] +====================== =========== =========== ===== ======== ======== ============ +push-soa OS SoA D x -- 456 +push-aos OS AoS D x -- 456 +pull-soa OS SoA D x -- 456 +pull-aos OS AoS D x -- 456 +blk-push-soa OS SoA D x x 456 +blk-push-aos OS AoS D x x 456 +blk-pull-soa OS SoA D x x 456 +blk-pull-aos OS AoS D x x 456 +list-push-soa OS SoA I x x 528 +list-push-aos OS AoS I x x 528 +list-pull-soa OS SoA I x x 528 +list-pull-aos OS AoS I x x 528 +list-pull-split-nt-1s OS SoA I x x 376 +list-pull-split-nt-2s OS SoA I x x 376 +list-aa-soa AA SoA I x x 340 +list-aa-aos AA AoS I x x 340 +list-aa-ria-soa AA SoA I x x 304-342 +list-aa-pv-soa AA SoA I x x 304-342 +====================== =========== =========== ===== ======== ======== ============ Benchmarking ============ @@ -200,8 +295,8 @@ Benchmarking Correct benchmarking is a nontrivial task. Whenever benchmark results should be created make sure the binary was compiled with: -- ``BENCHMARK=on`` and -- ``BUILD=release`` and +- ``BENCHMARK=on`` (default if not overriden) and +- ``BUILD=release`` (default if not overriden) and - the correct ISA for macros is used, selected via ``ISA`` and - use ``TARCH`` to specify the architecture the compiler generates code for. @@ -214,7 +309,13 @@ Things the binary does nor check or controll: - transparent huge pages: when allocating memory small 4 KiB pages might be replaced with larger ones. This is in general a good thing, but if this is - really the case, depends on the system settings. + really the case, depends on the system settings (check e.g. the status of + ``/sys/kernel/mm/transparent_hugepage/enabled``). + Currently ``madvise(MADV_HUGEPAGE)`` is used for allocations which are aligned to + a 4 KiB page, which should be the case for the lattices. + This should result in huge pages except THP is disabled on the machine. + (NOTE: madvise() is used if ``HAVE_HUGE_PAGES`` is defined, which is currently + hard coded defined in ``Memory.c``). - CPU/core frequency: For reproducible results the frequency of all cores should be fixed. @@ -228,14 +329,69 @@ Things the binary does nor check or controll: - System load: interference with other application, espcially on desktop systems should be avoided. -- Padding: most kernels do not care about padding against cache or TLB - thrashing. Even if the number of (fluid) nodes suggest everything is fine, - through parallelization still problems might occur. +- Padding: For SoA based kernels the number of (fluid) nodes is automatically + adjusted so that no cache or TLB thrashing should occur. The parameters are + optimized for current Intel based systems. For more details look into the + padding section. - CPU dispatcher function: the compiler might add different versions of a function for different ISA extensions. Make sure the code you might think is executed is actually the code which is executed. +Padding +------- + +With correct padding cache and TLB thrashing can be avoided. Therefore the +number of (fluid) nodes used in the data layout is artificially increased. + +Currently automatic padding is active for kernels which support it. It can be +controlled via the kernel parameter (i.e. parameter after the ``--``) +``-pad``. Supported values are ``auto`` (default), ``no`` (to disable padding), +or a manual padding. + +Automatic padding tries to avoid cache and TLB thrashing and pads for a 32 +entry (huge pages) TLB with 8 sets and a 512 set (L2) cache. This reflects the +parameters of current Intel based processors. + +Manual padding is done via a padding string and has the format +``mod_1+offset_1(,mod_n+offset_n)``, which specifies numbers of bytes. +SoA data layouts can exhibit TLB thrashing. Therefore we want to distribute the +19 pages with one lattice (36 with two lattices) we are concurrently accessing +over as much sets in the TLB as possible. +This is controlled by the distance between the accessed pages, which is the +number of (fluid) nodes in between them and can be adjusted by adding further +(fluid) nodes. +We want the distance d (in bytes) between two accessed pages to be e.g. +**d % (PAGE_SIZE * TLB_SETS) = PAGE_SIZE**. +This would distribute the pages evenly over the sets. Hereby **PAGE_SIZE * TLB_SETS** +would be our ``mod_1`` and **PAGE_SIZE** (after the =) our ``offset_1``. +Measurements show that with only a quarter of half of a page size as offset +higher performance is achieved, which is done by automatic padding. +On top of this padding more paddings can be added. They are just added to the +padding string and are separated by commas. + +A zero modulus in the padding string has a special meaning. Here the +corresponding offset is just added to the number of nodes. A padding string +like ``-pad 0+16`` would at a static padding of two nodes (one node = 8 b). + + +Geometries +========== + +TODO: supported geometries: channel, pipe, blocks + + +Results +======= + +TODO + + +Licence +======= + +The Lattice Boltzmann Benchmark Kernels are licensed under GPLv3. + Acknowledgements ================ diff --git a/src/Base.h b/src/Base.h index 27f9116..ae61082 100644 --- a/src/Base.h +++ b/src/Base.h @@ -44,8 +44,6 @@ static inline double Time() } -#define TOOL_NAME "lbmbenchk" - #define STRINGIFYX(x) #x #define STRINGIFY(x) STRINGIFYX(x) @@ -117,17 +115,17 @@ static inline double Time() } while (0) #define Print(formatString, ...) \ - fprintf(stdout, SHC_MAGENTA "[" TOOL_NAME "] " SHC_NC formatString, ##__VA_ARGS__) + fprintf(stdout, SHC_MAGENTA formatString SHC_NC, ##__VA_ARGS__) #define Warning(formatString, ...) \ - fprintf(stdout, "[" TOOL_NAME "] WARNING: " formatString, ##__VA_ARGS__) + fprintf(stdout, SHC_BROWN "WARNING: " SHC_NC formatString, ##__VA_ARGS__) #define Error(formatString, ...) \ - fprintf(stderr, SHC_RED "[" TOOL_NAME "] ERROR: " formatString SHC_NC , ##__VA_ARGS__) - + fprintf(stderr, SHC_RED "ERROR: " formatString SHC_NC , ##__VA_ARGS__) +/* #define DebugPrint(formatString, ...) \ - fprintf(stderr, "[" TOOL_NAME "] DEBUG: " formatString, ##__VA_ARGS__) - + fprintf(stderr, "DEBUG: " formatString, ##__VA_ARGS__) +*/ #ifndef NO_SHELL_COLORS // or "\e" diff --git a/src/BenchKernelD3Q19.c b/src/BenchKernelD3Q19.c index 8203d2a..bd6d43c 100644 --- a/src/BenchKernelD3Q19.c +++ b/src/BenchKernelD3Q19.c @@ -28,6 +28,7 @@ #include "Memory.h" #include "Vtk.h" +#include "LikwidIf.h" #include #include @@ -98,6 +99,8 @@ void FNAME(D3Q19Kernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd for (int iter = 0; iter < maxIterations; ++iter) { + X_LIKWID_START("os"); + #ifdef _OPENMP #pragma omp parallel for collapse(3) default(none) \ shared(gDims,src, dst, w_0, w_1, w_2, omegaEven, omegaOdd, \ @@ -112,9 +115,9 @@ void FNAME(D3Q19Kernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd evenPart, oddPart, w_1_indep, w_2_indep) #endif - for (int z = oZ; z < nZ + oZ; ++z) { + for (int x = oX; x < nX + oX; ++x) { for (int y = oY; y < nY + oY; ++y) { - for (int x = oX; x < nX + oX; ++x) { + for (int z = oZ; z < nZ + oZ; ++z) { #define I(x, y, z, dir) P_INDEX_5(gDims, (x), (y), (z), (dir)) #ifdef PROP_MODEL_PUSH @@ -306,6 +309,9 @@ void FNAME(D3Q19Kernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd } } // z, y, x (from inner to outer) + // Stop counters before bounce back. Else computing loop balance will be incorrect. + X_LIKWID_STOP("os"); + // Fixup bounce back PDFs. #ifdef _OPENMP #pragma omp parallel for default(none) \ @@ -429,21 +435,8 @@ void FNAME(D3Q19BlkKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * for (int iter = 0; iter < maxIterations; ++iter) { - // #ifdef _OPENMP --> add line continuation - // #pragma omp parallel for collapse(3) default(none) - // shared(gDims,src, dst, w_0, w_1, w_2, omegaEven, omegaOdd, - // w_1_x3, w_2_x3, w_1_nine_half, w_2_nine_half, cd, - // oX, oY, oZ, nX, nY, nZ, blk) - // private(ux, uy, uz, ui, dens, dir_indep_trm, - // pdf_C, - // pdf_N, pdf_E, pdf_S, pdf_W, - // pdf_NE, pdf_SE, pdf_SW, pdf_NW, - // pdf_T, pdf_TN, pdf_TE, pdf_TS, pdf_TW, - // pdf_B, pdf_BN, pdf_BE, pdf_BS, pdf_BW, - // evenPart, oddPart, w_1_indep, w_2_indep) - // #endif #ifdef _OPENMP - #pragma omp parallel for default(none) \ + #pragma omp parallel default(none) \ shared(gDims,src, dst, w_0, w_1, w_2, omegaEven, omegaOdd, \ w_1_x3, w_2_x3, w_1_nine_half, w_2_nine_half, cd, \ oX, oY, oZ, nX, nY, nZ, blk, nThreads) \ @@ -455,16 +448,18 @@ void FNAME(D3Q19BlkKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * pdf_B, pdf_BN, pdf_BE, pdf_BS, pdf_BW, \ evenPart, oddPart, w_1_indep, w_2_indep) #endif + { + X_LIKWID_START("blk-os"); - for (int i = 0; i < nThreads; ++i) { + int threadId = omp_get_thread_num(); - int threadStartX = nX / nThreads * i; - int threadEndX = nX / nThreads * (i + 1); + int threadStartX = nX / nThreads * threadId; + int threadEndX = nX / nThreads * (threadId + 1); if (nX % nThreads > 0) { - if (nX % nThreads > i) { - threadStartX += i; - threadEndX += i + 1; + if (nX % nThreads > threadId) { + threadStartX += threadId; + threadEndX += threadId + 1; } else { threadStartX += nX % nThreads; @@ -472,24 +467,18 @@ void FNAME(D3Q19BlkKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * } } - // for (int z = oZ; z < nZ + oZ; ++z) { - // for (int y = oY; y < nY + oY; ++y) { - // for (int x = oX; x < nX + oX; ++x) { - for (int bZ = oZ; bZ < nZ + oZ; bZ += blk[2]) { + for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) { for (int bY = oY; bY < nY + oY; bY += blk[1]) { - for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) { - // for (int bX = oX; bX < nX + oX; bX += blk[0]) { + for (int bZ = oZ; bZ < nZ + oZ; bZ += blk[2]) { // Must do everything here, else it would break collapse. int eZ = MIN(bZ + blk[2], nZ + oZ); int eY = MIN(bY + blk[1], nY + oY); int eX = MIN(bX + blk[0], threadEndX + oX); -// printf("%d: %d-%d %d-%d %d-%d %d - %d\n", omp_get_thread_num(), bZ, eZ, bY, eY, bX, eX, threadStartX, threadEndX); - - for (int z = bZ; z < eZ; ++z) { + for (int x = bX; x < eX; ++x) { for (int y = bY; y < eY; ++y) { - for (int x = bX; x < eX; ++x) { + for (int z = bZ; z < eZ; ++z) { #define I(x, y, z, dir) P_INDEX_5(gDims, (x), (y), (z), (dir)) @@ -497,7 +486,6 @@ void FNAME(D3Q19BlkKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * // Load PDFs of local cell: pdf_N = src[I(x, y, z, D3Q19_N)]; ... #define X(name, idx, idxinv, _x, _y, _z) JOIN(pdf_,name) = src[I(x, y, z, idx)]; - //if (isnan(JOIN(pdf_,name))) { printf("iter: %d %d %d %d %d %s nan\n", iter, x-oX, y-oY, z-oZ, idx, D3Q19_NAMES[idx]); exit(1);} D3Q19_LIST #undef X @@ -505,7 +493,6 @@ void FNAME(D3Q19BlkKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * // Load PDFs of local cell: pdf_N = src[I(x, y, z, D3Q19_N)]; ... #define X(name, idx, idxinv, _x, _y, _z) JOIN(pdf_,name) = src[I(x - _x, y - _y, z - _z, idx)]; - //if (isnan(JOIN(pdf_,name))) { printf("iter: %d %d %d %d %d %s nan\n", iter, x-oX, y-oY, z-oZ, idx, D3Q19_NAMES[idx]); exit(1);} D3Q19_LIST #undef X @@ -525,11 +512,11 @@ void FNAME(D3Q19BlkKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * } else { #endif ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE - - pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW; + pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW; uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN - - pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS; + pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS; uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS - - pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS; + pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS; #ifdef LID_DRIVEN_CAVITY } @@ -685,7 +672,10 @@ void FNAME(D3Q19BlkKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * } } // z, y, x (from inner to outer) - } // loop over threads + X_LIKWID_STOP("blk-os"); + } // parallel region + + // Stop counters before bounce back. Else computing loop balance will be incorrect. // Fixup bounce back PDFs. #ifdef _OPENMP diff --git a/src/BenchKernelD3Q19Aa.c b/src/BenchKernelD3Q19Aa.c new file mode 100644 index 0000000..87a9e33 --- /dev/null +++ b/src/BenchKernelD3Q19Aa.c @@ -0,0 +1,529 @@ +// -------------------------------------------------------------------------- +// +// Copyright +// Markus Wittmann, 2016-2017 +// RRZE, University of Erlangen-Nuremberg, Germany +// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de +// +// Viktor Haag, 2016 +// LSS, University of Erlangen-Nuremberg, Germany +// +// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). +// +// LbmBenchKernels is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// LbmBenchKernels is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with LbmBenchKernels. If not, see . +// +// -------------------------------------------------------------------------- +#include "BenchKernelD3Q19AaCommon.h" + +#include "Memory.h" +#include "Vtk.h" +#include "LikwidIf.h" + +#include +#include + +#ifdef _OPENMP + #include +#endif + +void FNAME(D3Q19AaKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd) +{ + Assert(ld != NULL); + Assert(kernelData != NULL); + Assert(cd != NULL); + + Assert(cd->Omega > 0.0); + Assert(cd->Omega < 2.0); + + KernelData * kd = (KernelData *)kernelData; + + + int nX = ld->Dims[0]; + int nY = ld->Dims[1]; + int nZ = ld->Dims[2]; + + int * gDims = kd->GlobalDims; + + int oX = kd->Offsets[0]; + int oY = kd->Offsets[1]; + int oZ = kd->Offsets[2]; + + KernelDataAa * kda = KDA(kd); + + int blk[3]; + blk[0] = kda->Blk[0]; + blk[1] = kda->Blk[1]; + blk[2] = kda->Blk[2]; + + PdfT omega = cd->Omega; + PdfT omegaEven = omega; + PdfT magicParam = 1.0 / 12.0; + // 1/4: best stability; + // 1/12: removes third-order advection error (best advection); + // 1/6: removes fourth-order diffusion error (best diffusion); + // 3/16: exact location of bounce back for poiseuille flow + + PdfT omegaOdd = 1.0/( 0.5 + magicParam/(1.0/omega - 0.5) ); + + PdfT evenPart = 0.0; + PdfT oddPart = 0.0; + PdfT dir_indep_trm = 0.0; + + PdfT w_0 = 1.0 / 3.0; + PdfT w_1 = 1.0 / 18.0; + PdfT w_2 = 1.0 / 36.0; + + PdfT w_1_x3 = w_1 * 3.0; PdfT w_1_nine_half = w_1 * 9.0/2.0; PdfT w_1_indep = 0.0; + PdfT w_2_x3 = w_2 * 3.0; PdfT w_2_nine_half = w_2 * 9.0/2.0; PdfT w_2_indep = 0.0; + + PdfT ux, uy, uz, ui; + PdfT dens; + + // Declare pdf_N, pdf_E, pdf_S, pdf_W, ... + #define X(name, idx, idxinv, x, y, z) PdfT JOIN(pdf_,name); + D3Q19_LIST + #undef X + + PdfT * src = kd->Pdfs[0]; + + int maxIterations = cd->MaxIterations; + + #ifdef VTK_OUTPUT + if (cd->VtkOutput) { + kd->PdfsActive = src; + VtkWrite(ld, kd, cd, -1); + } + #endif + + #ifdef STATISTICS + kd->PdfsActive = src; + KernelStatistics(kd, ld, cd, 0); + #endif + + + int nThreads = 1; + + #ifdef _OPENMP + nThreads = omp_get_max_threads(); + #endif + + for (int iter = 0; iter < maxIterations; iter += 2) { + + // -------------------------------------------------------------------- + // even time step + + X_LIKWID_START("aa-even"); + + // {{{ + + #ifdef _OPENMP + #pragma omp parallel for default(none) \ + shared(gDims,src, w_0, w_1, w_2, omegaEven, omegaOdd, \ + w_1_x3, w_2_x3, w_1_nine_half, w_2_nine_half, cd, \ + oX, oY, oZ, nX, nY, nZ, blk, nThreads, ld) \ + private(ux, uy, uz, ui, dens, dir_indep_trm, \ + pdf_C, \ + pdf_N, pdf_E, pdf_S, pdf_W, \ + pdf_NE, pdf_SE, pdf_SW, pdf_NW, \ + pdf_T, pdf_TN, pdf_TE, pdf_TS, pdf_TW, \ + pdf_B, pdf_BN, pdf_BE, pdf_BS, pdf_BW, \ + evenPart, oddPart, w_1_indep, w_2_indep) + #endif + + for (int i = 0; i < nThreads; ++i) { + + int threadStartX = nX / nThreads * i; + int threadEndX = nX / nThreads * (i + 1); + + if (nX % nThreads > 0) { + if (nX % nThreads > i) { + threadStartX += i; + threadEndX += i + 1; + } + else { + threadStartX += nX % nThreads; + threadEndX += nX % nThreads; + } + } + + for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) { + for (int bY = oY; bY < nY + oY; bY += blk[1]) { + for (int bZ = oZ; bZ < nZ + oZ; bZ += blk[2]) { + + int eX = MIN(bX + blk[0], threadEndX + oX); + int eY = MIN(bY + blk[1], nY + oY); + int eZ = MIN(bZ + blk[2], nZ + oZ); + + // printf("%d: %d-%d %d-%d %d-%d %d - %d\n", omp_get_thread_num(), bZ, eZ, bY, eY, bX, eX, threadStartX, threadEndX); + + for (int x = bX; x < eX; ++x) { + for (int y = bY; y < eY; ++y) { + for (int z = bZ; z < eZ; ++z) { + + + if (ld->Lattice[L_INDEX_4(ld->Dims, x - oX, y - oY, z - oZ)] == LAT_CELL_OBSTACLE) { + continue; + } + + #define I(x, y, z, dir) P_INDEX_5(gDims, (x), (y), (z), (dir)) + + + // Load PDFs of local cell: pdf_N = src[I(x, y, z, D3Q19_N)]; ... + #define X(name, idx, idxinv, _x, _y, _z) JOIN(pdf_,name) = src[I(x, y, z, idx)]; + D3Q19_LIST + #undef X + +// #define LID_DRIVEN_CAVITY + +#ifdef LID_DRIVEN_CAVITY + + if (z == nZ - 4 + oZ && x > 3 + oX && x < (nX - 4 + oX) && y > 3 + oY && y < (nY - 4 + oY)) { + ux = 0.1 * 0.577; + uy = 0.0; + uz = 0.0; + + } else { + #endif + ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE - + pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW; + uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN - + pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS; + uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS - + pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS; + #ifdef LID_DRIVEN_CAVITY + } + #endif + + dens = pdf_C + + pdf_N + pdf_E + pdf_S + pdf_W + + pdf_NE + pdf_SE + pdf_SW + pdf_NW + + pdf_T + pdf_TN + pdf_TE + pdf_TS + pdf_TW + + pdf_B + pdf_BN + pdf_BE + pdf_BS + pdf_BW; + + dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz)*3.0/2.0; + + // direction: w_0 + src[I(x, y, z, D3Q19_C)] = pdf_C - omegaEven*(pdf_C - w_0*dir_indep_trm); + + // direction: w_1 + w_1_indep = w_1*dir_indep_trm; + + ui = uy; + evenPart = omegaEven*( 0.5*(pdf_N + pdf_S) - ui*ui*w_1_nine_half - w_1_indep ); + oddPart = omegaOdd*(0.5*(pdf_N - pdf_S) - ui*w_1_x3 ); + src[I(x, y, z, D3Q19_S)] = pdf_N - evenPart - oddPart; + src[I(x, y, z, D3Q19_N)] = pdf_S - evenPart + oddPart; + + ui = ux; + evenPart = omegaEven*( 0.5*(pdf_E + pdf_W) - ui*ui*w_1_nine_half - w_1_indep ); + oddPart = omegaOdd*(0.5*(pdf_E - pdf_W) - ui*w_1_x3 ); + src[I(x, y, z, D3Q19_W)] = pdf_E - evenPart - oddPart; + src[I(x, y, z, D3Q19_E)] = pdf_W - evenPart + oddPart; + + ui = uz; + evenPart = omegaEven*( 0.5*(pdf_T + pdf_B) - ui*ui*w_1_nine_half - w_1_indep ); + oddPart = omegaOdd*(0.5*(pdf_T - pdf_B) - ui*w_1_x3 ); + src[I(x, y, z, D3Q19_B)] = pdf_T - evenPart - oddPart; + src[I(x, y, z, D3Q19_T)] = pdf_B - evenPart + oddPart; + + // direction: w_2 + w_2_indep = w_2*dir_indep_trm; + + ui = -ux + uy; + evenPart = omegaEven*( 0.5*(pdf_NW + pdf_SE) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_NW - pdf_SE) - ui*w_2_x3 ); + src[I(x, y, z, D3Q19_SE)] = pdf_NW - evenPart - oddPart; + src[I(x, y, z, D3Q19_NW)] = pdf_SE - evenPart + oddPart; + + ui = ux + uy; + evenPart = omegaEven*( 0.5*(pdf_NE + pdf_SW) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_NE - pdf_SW) - ui*w_2_x3 ); + src[I(x, y, z, D3Q19_SW)] = pdf_NE - evenPart - oddPart; + src[I(x, y, z, D3Q19_NE)] = pdf_SW - evenPart + oddPart; + + ui = -ux + uz; + evenPart = omegaEven*( 0.5*(pdf_TW + pdf_BE) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TW - pdf_BE) - ui*w_2_x3 ); + src[I(x, y, z, D3Q19_BE)] = pdf_TW - evenPart - oddPart; + src[I(x, y, z, D3Q19_TW)] = pdf_BE - evenPart + oddPart; + + ui = ux + uz; + evenPart = omegaEven*( 0.5*(pdf_TE + pdf_BW) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TE - pdf_BW) - ui*w_2_x3 ); + src[I(x, y, z, D3Q19_BW)] = pdf_TE - evenPart - oddPart; + src[I(x, y, z, D3Q19_TE)] = pdf_BW - evenPart + oddPart; + + ui = -uy + uz; + evenPart = omegaEven*( 0.5*(pdf_TS + pdf_BN) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TS - pdf_BN) - ui*w_2_x3 ); + src[I(x, y, z, D3Q19_BN)] = pdf_TS - evenPart - oddPart; + src[I(x, y, z, D3Q19_TS)] = pdf_BN - evenPart + oddPart; + + ui = uy + uz; + evenPart = omegaEven*( 0.5*(pdf_TN + pdf_BS) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TN - pdf_BS) - ui*w_2_x3 ); + src[I(x, y, z, D3Q19_BS)] = pdf_TN - evenPart - oddPart; + src[I(x, y, z, D3Q19_TN)] = pdf_BS - evenPart + oddPart; + + #undef I + } } } // z, y, x (from inner to outer) + } } } // z, y, x (from inner to outer) + + } // loop over threads + + // }}} + + X_LIKWID_STOP("aa-even"); + + #ifdef STATISTICS + kd->PdfsActive = src; + KernelStatistics(kd, ld, cd, iter); + #endif + + // Fixup bounce back PDFs. + #ifdef _OPENMP + #pragma omp parallel for default(none) \ + shared(kd, src) + #endif + for (int i = 0; i < kd->nBounceBackPdfs; ++i) { + src[kd->BounceBackPdfsSrc[i]] = src[kd->BounceBackPdfsDst[i]]; + } + + // save current iteration + kda->Iteration = iter; + + #ifdef VERIFICATION + kd->PdfsActive = src; + KernelAddBodyForce(kd, ld, cd); + #endif + + #ifdef VTK_OUTPUT + if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) { + kd->PdfsActive = src; + VtkWrite(ld, kd, cd, iter); + } + #endif + + #ifdef STATISTICS + kd->PdfsActive = src; + KernelStatistics(kd, ld, cd, iter); + #endif + + // -------------------------------------------------------------------- + // odd time step + + + X_LIKWID_START("aa-odd"); + + // {{{ + + #ifdef _OPENMP + #pragma omp parallel for default(none) \ + shared(gDims,src, w_0, w_1, w_2, omegaEven, omegaOdd, \ + w_1_x3, w_2_x3, w_1_nine_half, w_2_nine_half, cd, \ + oX, oY, oZ, nX, nY, nZ, blk, nThreads) \ + private(ux, uy, uz, ui, dens, dir_indep_trm, \ + pdf_C, \ + pdf_N, pdf_E, pdf_S, pdf_W, \ + pdf_NE, pdf_SE, pdf_SW, pdf_NW, \ + pdf_T, pdf_TN, pdf_TE, pdf_TS, pdf_TW, \ + pdf_B, pdf_BN, pdf_BE, pdf_BS, pdf_BW, \ + evenPart, oddPart, w_1_indep, w_2_indep) + #endif + + for (int i = 0; i < nThreads; ++i) { + + int threadStartX = nX / nThreads * i; + int threadEndX = nX / nThreads * (i + 1); + + if (nX % nThreads > 0) { + if (nX % nThreads > i) { + threadStartX += i; + threadEndX += i + 1; + } + else { + threadStartX += nX % nThreads; + threadEndX += nX % nThreads; + } + } + + for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) { + for (int bY = oY; bY < nY + oY; bY += blk[1]) { + for (int bZ = oZ; bZ < nZ + oZ; bZ += blk[2]) { + + // Must do everything here, else it would break collapse. + int eZ = MIN(bZ + blk[2], nZ + oZ); + int eY = MIN(bY + blk[1], nY + oY); + int eX = MIN(bX + blk[0], threadEndX + oX); + + for (int x = bX; x < eX; ++x) { + for (int y = bY; y < eY; ++y) { + for (int z = bZ; z < eZ; ++z) { + + #define I(x, y, z, dir) P_INDEX_5(gDims, (x), (y), (z), (dir)) + + // Load PDFs of local cell: pdf_N = src[I(x, y, z, D3Q19_N)]; ... + #define X(name, idx, idxinv, _x, _y, _z) JOIN(pdf_,name) = src[I(x - _x, y - _y, z - _z, idxinv)]; + D3Q19_LIST + #undef X + + +// #define LID_DRIVEN_CAVITY + +#ifdef LID_DRIVEN_CAVITY + + if (z == nZ - 4 + oZ && x > 3 + oX && x < (nX - 4 + oX) && y > 3 + oY && y < (nY - 4 + oY)) { + ux = 0.1 * 0.577; + uy = 0.0; + uz = 0.0; + + } else { +#endif + ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE - + pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW; + uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN - + pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS; + uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS - + pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS; +#ifdef LID_DRIVEN_CAVITY + } +#endif + + dens = pdf_C + + pdf_N + pdf_E + pdf_S + pdf_W + + pdf_NE + pdf_SE + pdf_SW + pdf_NW + + pdf_T + pdf_TN + pdf_TE + pdf_TS + pdf_TW + + pdf_B + pdf_BN + pdf_BE + pdf_BS + pdf_BW; + + dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz)*3.0/2.0; + + // direction: w_0 + src[I(x, y, z, D3Q19_C)] = pdf_C - omegaEven*(pdf_C - w_0*dir_indep_trm); + + // direction: w_1 + w_1_indep = w_1*dir_indep_trm; + + ui = uy; + evenPart = omegaEven*( 0.5*(pdf_N + pdf_S) - ui*ui*w_1_nine_half - w_1_indep ); + oddPart = omegaOdd*(0.5*(pdf_N - pdf_S) - ui*w_1_x3 ); + src[I(x, y + 1, z, D3Q19_N)] = pdf_N - evenPart - oddPart; + src[I(x, y - 1, z, D3Q19_S)] = pdf_S - evenPart + oddPart; + + ui = ux; + evenPart = omegaEven*( 0.5*(pdf_E + pdf_W) - ui*ui*w_1_nine_half - w_1_indep ); + oddPart = omegaOdd*(0.5*(pdf_E - pdf_W) - ui*w_1_x3 ); + src[I(x + 1, y, z, D3Q19_E)] = pdf_E - evenPart - oddPart; + src[I(x - 1, y, z, D3Q19_W)] = pdf_W - evenPart + oddPart; + + ui = uz; + evenPart = omegaEven*( 0.5*(pdf_T + pdf_B) - ui*ui*w_1_nine_half - w_1_indep ); + oddPart = omegaOdd*(0.5*(pdf_T - pdf_B) - ui*w_1_x3 ); + src[I(x, y, z + 1, D3Q19_T)] = pdf_T - evenPart - oddPart; + src[I(x, y, z - 1, D3Q19_B)] = pdf_B - evenPart + oddPart; + + // direction: w_2 + w_2_indep = w_2*dir_indep_trm; + + ui = -ux + uy; + evenPart = omegaEven*( 0.5*(pdf_NW + pdf_SE) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_NW - pdf_SE) - ui*w_2_x3 ); + src[I(x - 1, y + 1, z, D3Q19_NW)] = pdf_NW - evenPart - oddPart; + src[I(x + 1, y - 1, z, D3Q19_SE)] = pdf_SE - evenPart + oddPart; + + ui = ux + uy; + evenPart = omegaEven*( 0.5*(pdf_NE + pdf_SW) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_NE - pdf_SW) - ui*w_2_x3 ); + src[I(x + 1, y + 1, z, D3Q19_NE)] = pdf_NE - evenPart - oddPart; + src[I(x - 1, y - 1, z, D3Q19_SW)] = pdf_SW - evenPart + oddPart; + + ui = -ux + uz; + evenPart = omegaEven*( 0.5*(pdf_TW + pdf_BE) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TW - pdf_BE) - ui*w_2_x3 ); + src[I(x - 1, y, z + 1, D3Q19_TW)] = pdf_TW - evenPart - oddPart; + src[I(x + 1, y, z - 1, D3Q19_BE)] = pdf_BE - evenPart + oddPart; + + ui = ux + uz; + evenPart = omegaEven*( 0.5*(pdf_TE + pdf_BW) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TE - pdf_BW) - ui*w_2_x3 ); + src[I(x + 1, y, z + 1, D3Q19_TE)] = pdf_TE - evenPart - oddPart; + src[I(x - 1, y, z - 1, D3Q19_BW)] = pdf_BW - evenPart + oddPart; + + ui = -uy + uz; + evenPart = omegaEven*( 0.5*(pdf_TS + pdf_BN) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TS - pdf_BN) - ui*w_2_x3 ); + src[I(x, y - 1, z + 1, D3Q19_TS)] = pdf_TS - evenPart - oddPart; + src[I(x, y + 1, z - 1, D3Q19_BN)] = pdf_BN - evenPart + oddPart; + + ui = uy + uz; + evenPart = omegaEven*( 0.5*(pdf_TN + pdf_BS) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TN - pdf_BS) - ui*w_2_x3 ); + src[I(x, y + 1, z + 1, D3Q19_TN)] = pdf_TN - evenPart - oddPart; + src[I(x, y - 1, z - 1, D3Q19_BS)] = pdf_BS - evenPart + oddPart; + + + #undef I + } } } // z, y, x (from inner to outer) + } } } // z, y, x (from inner to outer) + } // loop over threads + + // }}} + + // Stop counters before bounce back. Else computing loop balance will be incorrect. + + X_LIKWID_STOP("aa-odd"); + + // Fixup bounce back PDFs. + #ifdef _OPENMP + #pragma omp parallel for default(none) \ + shared(kd, src) + #endif + for (int i = 0; i < kd->nBounceBackPdfs; ++i) { + src[kd->BounceBackPdfsDst[i]] = src[kd->BounceBackPdfsSrc[i]]; + } + + // save current iteration + kda->Iteration = iter + 1; + + #ifdef VERIFICATION + kd->PdfsActive = src; + KernelAddBodyForce(kd, ld, cd); + #endif + + #ifdef VTK_OUTPUT + if (cd->VtkOutput && (iter + 1 % cd->VtkModulus) == 0) { + kd->PdfsActive = src; + VtkWrite(ld, kd, cd, iter + 1); + } + #endif + + #ifdef STATISTICS + kd->PdfsActive = src; + KernelStatistics(kd, ld, cd, iter + 1); + #endif // }}} + + + } // for (int iter = 0; ... + + #ifdef VTK_OUTPUT + + if (cd->VtkOutput) { + kd->PdfsActive = src; + VtkWrite(ld, kd, cd, maxIterations); + } + + #endif + + return; +} + diff --git a/src/BenchKernelD3Q19Aa.h b/src/BenchKernelD3Q19Aa.h new file mode 100644 index 0000000..d0605d1 --- /dev/null +++ b/src/BenchKernelD3Q19Aa.h @@ -0,0 +1,41 @@ +// -------------------------------------------------------------------------- +// +// Copyright +// Markus Wittmann, 2016-2017 +// RRZE, University of Erlangen-Nuremberg, Germany +// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de +// +// Viktor Haag, 2016 +// LSS, University of Erlangen-Nuremberg, Germany +// +// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). +// +// LbmBenchKernels is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// LbmBenchKernels is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with LbmBenchKernels. If not, see . +// +// -------------------------------------------------------------------------- +#ifndef __BENCH_KERNEL_D3Q19_AA__ +#define __BENCH_KERNEL_D3Q19_AA__ + +#include "Kernel.h" + + +void D3Q19AaInit_AaSoA(LatticeDesc * ld, KernelData ** kernelData, Parameters * params); +void D3Q19AaInit_AaAoS(LatticeDesc * ld, KernelData ** kernelData, Parameters * params); + +void D3Q19AaDeinit_AaSoA(LatticeDesc * ld, KernelData ** kernelData); +void D3Q19AaDeinit_AaAoS(LatticeDesc * ld, KernelData ** kernelData); + + + +#endif // __BENCH_KERNEL_D3Q19__ diff --git a/src/BenchKernelD3Q19AaCommon.c b/src/BenchKernelD3Q19AaCommon.c new file mode 100644 index 0000000..fbdcbc1 --- /dev/null +++ b/src/BenchKernelD3Q19AaCommon.c @@ -0,0 +1,651 @@ +// -------------------------------------------------------------------------- +// +// Copyright +// Markus Wittmann, 2016-2017 +// RRZE, University of Erlangen-Nuremberg, Germany +// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de +// +// Viktor Haag, 2016 +// LSS, University of Erlangen-Nuremberg, Germany +// +// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). +// +// LbmBenchKernels is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// LbmBenchKernels is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with LbmBenchKernels. If not, see . +// +// -------------------------------------------------------------------------- +#include "BenchKernelD3Q19AaCommon.h" + +#include "Memory.h" +#include "Vtk.h" + +#include +#include + +#ifdef _OPENMP + #include +#endif + +// Forward definition. +void FNAME(D3Q19AaKernel)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd); + +void FNAME(D3Q19AaBlkKernel)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd); + + + +static void FNAME(BcGetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT * pdf) +{ + Assert(kd != NULL); + Assert(kd->PdfsActive != NULL); + Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]); + Assert(pdf != NULL); + + Assert(x >= 0); Assert(y >= 0); Assert(z >= 0); + Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]); + Assert(dir >= 0); Assert(dir < N_D3Q19); + + KernelDataAa * kda = KDA(kd); + + int oX = kd->Offsets[0]; + int oY = kd->Offsets[1]; + int oZ = kd->Offsets[2]; + + if (kda->Iteration % 2 == 0) { + // Pdfs are stored inverse, local PDFs are located in remote nodes + int nx = x - D3Q19_X[dir]; + int ny = y - D3Q19_Y[dir]; + int nz = z - D3Q19_Z[dir]; + + #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir)) + *pdf = kd->PdfsActive[I(nx + oX, ny + oY, nz + oZ, D3Q19_INV[dir])]; + #undef I + } + else { + int nx = x; + int ny = y; + int nz = z; + + #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir)) + *pdf = kd->PdfsActive[I(nx + oX, ny + oY, nz + oZ, dir)]; + #undef I + } + + + return; +} + +static void FNAME(BcSetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT pdf) +{ + Assert(kd != NULL); + Assert(kd->PdfsActive != NULL); + Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]); + + Assert(x >= 0); Assert(y >= 0); Assert(z >= 0); + Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]); + Assert(dir >= 0); Assert(dir < N_D3Q19); + + KernelDataAa * kda = KDA(kd); + + int oX = kd->Offsets[0]; + int oY = kd->Offsets[1]; + int oZ = kd->Offsets[2]; + + if (kda->Iteration % 2 == 0) { + // Pdfs are stored inverse, local PDFs are located in remote nodes + int nx = x - D3Q19_X[dir]; + int ny = y - D3Q19_Y[dir]; + int nz = z - D3Q19_Z[dir]; + + #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir)) + pdf = kd->PdfsActive[I(nx + oX, ny + oY, nz + oZ, D3Q19_INV[dir])] = pdf; + #undef I + } + else { + int nx = x; + int ny = y; + int nz = z; + + #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir)) + kd->PdfsActive[I(nx + oX, ny + oY, nz + oZ, dir)] = pdf; + #undef I + } + + return; +} + + +static void FNAME(GetNode)(KernelData * kd, int x, int y, int z, PdfT * pdfs) +{ + Assert(kd != NULL); + Assert(kd->PdfsActive != NULL); + Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]); + Assert(pdfs != NULL); + + Assert(x >= 0); Assert(y >= 0); Assert(z >= 0); + Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]); + + KernelDataAa * kda = KDA(kd); + + int oX = kd->Offsets[0]; + int oY = kd->Offsets[1]; + int oZ = kd->Offsets[2]; + + + if (kda->Iteration % 2 == 0) { + // Pdfs are stored inverse, local PDFs are located in remote nodes + + #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir)) + #define X(name, idx, idxinv, _x, _y, _z) pdfs[idx] = kd->PdfsActive[I(x + oX - _x, y + oY - _y, z + oZ - _z, D3Q19_INV[idx])]; + D3Q19_LIST + #undef X + #undef I + } + else { + #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir)) + #define X(name, idx, idxinv, _x, _y, _z) pdfs[idx] = kd->PdfsActive[I(x + oX, y + oY, z + oZ, idx)]; + D3Q19_LIST + #undef X + #undef I + + } + +#if 0 // DETECT NANs + + for (int d = 0; d < 19; ++d) { + if (isnan(pdfs[d])) { + printf("%d %d %d %d nan! get node\n", x, y, z, d); + + for (int d2 = 0; d2 < 19; ++d2) { + printf("%d: %e\n", d2, pdfs[d2]); + } + + exit(1); + } + } + +#endif + + return; +} + + +static void FNAME(SetNode)(KernelData * kd, int x, int y, int z, PdfT * pdfs) +{ + Assert(kd != NULL); + Assert(kd->PdfsActive != NULL); + Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]); + Assert(pdfs != NULL); + + Assert(x >= 0); Assert(y >= 0); Assert(z >= 0); + Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]); + + KernelDataAa * kda = KDA(kd); + + int oX = kd->Offsets[0]; + int oY = kd->Offsets[1]; + int oZ = kd->Offsets[2]; + + if (kda->Iteration % 2 == 0) { + // Pdfs are stored inverse, local PDFs are located in remote nodes + + #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir)) + #define X(name, idx, idxinv, _x, _y, _z) kd->PdfsActive[I(x + oX - _x, y + oY - _y, z + oZ - _z, D3Q19_INV[idx])] = pdfs[idx]; + D3Q19_LIST + #undef X + #undef I + } + else { + #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir)) + #define X(name, idx, idxinv, _x, _y, _z) kd->PdfsActive[I(x + oX, y + oY, z + oZ, idx)] = pdfs[idx]; + D3Q19_LIST + #undef X + #undef I + } + return; +} + + +static void ParameterUsage() +{ + printf("Kernel parameters:\n"); + printf(" [-blk ] [-blk-[xyz] ]\n"); + + return; +} + +static void ParseParameters(Parameters * params, int * blk) +{ + Assert(blk != NULL); + + blk[0] = 0; blk[1] = 0; blk[2] = 0; + + #define ARG_IS(param) (!strcmp(params->KernelArgs[i], param)) + #define NEXT_ARG_PRESENT() \ + do { \ + if (i + 1 >= params->nKernelArgs) { \ + printf("ERROR: argument %s requires a parameter.\n", params->KernelArgs[i]); \ + exit(1); \ + } \ + } while (0) + + + for (int i = 0; i < params->nKernelArgs; ++i) { + if (ARG_IS("-blk") || ARG_IS("--blk")) { + NEXT_ARG_PRESENT(); + + int tmp = strtol(params->KernelArgs[++i], NULL, 0); + + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); + exit(1); + } + + blk[0] = blk[1] = blk[2] = tmp; + } + else if (ARG_IS("-blk-x") || ARG_IS("--blk-x")) { + NEXT_ARG_PRESENT(); + + int tmp = strtol(params->KernelArgs[++i], NULL, 0); + + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); + exit(1); + } + + blk[0] = tmp; + } + else if (ARG_IS("-blk-y") || ARG_IS("--blk-y")) { + NEXT_ARG_PRESENT(); + + int tmp = strtol(params->KernelArgs[++i], NULL, 0); + + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); + exit(1); + } + + blk[1] = tmp; + } + else if (ARG_IS("-blk-z") || ARG_IS("--blk-z")) { + NEXT_ARG_PRESENT(); + + int tmp = strtol(params->KernelArgs[++i], NULL, 0); + + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); + exit(1); + } + + blk[2] = tmp; + } + else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) { + ParameterUsage(); + exit(1); + } + else { + printf("ERROR: unknown kernel parameter.\n"); + ParameterUsage(); + exit(1); + } + } + + #undef ARG_IS + #undef NEXT_ARG_PRESENT + + return; +} + + +void FNAME(D3Q19AaInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params) +{ + KernelDataAa * kda = NULL; + MemAlloc((void **)&kda, sizeof(KernelDataAa)); + + kda->Blk[0] = 0; kda->Blk[1] = 0; kda->Blk[2] = 0; + kda->Iteration = -1; + + KernelData * kd = &kda->kd; + *kernelData = kd; + + kd->nObstIndices = ld->nObst; + + // Ajust the dimensions according to padding, if used. + kd->Dims[0] = ld->Dims[0]; + kd->Dims[1] = ld->Dims[1]; + kd->Dims[2] = ld->Dims[2]; + + + int * lDims = ld->Dims; + int * gDims = kd->GlobalDims; + + gDims[0] = lDims[0] + 2; + gDims[1] = lDims[1] + 2; + gDims[2] = lDims[2] + 2; + + kd->Offsets[0] = 1; + kd->Offsets[1] = 1; + kd->Offsets[2] = 1; + + int lX = lDims[0]; + int lY = lDims[1]; + int lZ = lDims[2]; + + int gX = gDims[0]; + int gY = gDims[1]; + int gZ = gDims[2]; + + int oX = kd->Offsets[0]; + int oY = kd->Offsets[1]; + int oZ = kd->Offsets[2]; + + int blk[3] = { 0 }; + + int nCells = gX * gY * gZ; + + PdfT * pdfs[2] = { NULL, NULL }; + + ParseParameters(params, blk); + + if (blk[0] == 0) blk[0] = gX; + if (blk[1] == 0) blk[1] = gY; + if (blk[2] == 0) blk[2] = gZ; + + printf("# blocking x: %3d y: %3d z: %3d\n", blk[0], blk[1], blk[2]); + + + kda->Blk[0] = blk[0]; kda->Blk[1] = blk[1]; kda->Blk[2] = blk[2]; + + + printf("# allocating data for %d LB nodes with padding (%lu bytes = %f MiB for the single lattice)\n", + nCells, + sizeof(PdfT) * nCells * N_D3Q19, + sizeof(PdfT) * nCells * N_D3Q19 / 1024.0 / 1024.0); + +#define PAGE_4K 4096 + + MemAllocAligned((void **)&pdfs[0], sizeof(PdfT) * nCells * N_D3Q19, PAGE_4K); + + kd->Pdfs[0] = pdfs[0]; + kd->Pdfs[1] = NULL; + + + // Initialize PDFs with some (arbitrary) data for correct NUMA placement. + // This depends on the chosen data layout. + // The structure of the loop should resemble the same "execution layout" + // as in the kernel! + + int nThreads; +#ifdef _OPENMP + nThreads = omp_get_max_threads(); +#endif + +#ifdef _OPENMP + #pragma omp parallel for \ + shared(gDims, pdfs, \ + oX, oY, oZ, lX, lY, lZ, blk, nThreads, ld) +#endif + for (int i = 0; i < nThreads; ++i) { + + int threadStartX = lX / nThreads * i; + int threadEndX = lX / nThreads * (i + 1); + + if (lX % nThreads > 0) { + if (lX % nThreads > i) { + threadStartX += i; + threadEndX += i + 1; + } + else { + threadStartX += lX % nThreads; + threadEndX += lX % nThreads; + } + } + + for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) { + for (int bY = oY; bY < lY + oY; bY += blk[1]) { + for (int bZ = oZ; bZ < lZ + oZ; bZ += blk[2]) { + + int eX = MIN(bX + blk[0], threadEndX + oX); + int eY = MIN(bY + blk[1], lY + oY); + int eZ = MIN(bZ + blk[2], lZ + oZ); + + // printf("%d: %d-%d %d-%d %d-%d %d - %d\n", omp_get_thread_num(), bZ, eZ, bY, eY, bX, eX, threadStartX, threadEndX); + + for (int x = bX; x < eX; ++x) { + for (int y = bY; y < eY; ++y) { + for (int z = bZ; z < eZ; ++z) { + + for (int d = 0; d < N_D3Q19; ++d) { + pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = -1.0; + } + + } } } // x, y, z + } } } // bx, by, bz + } + + + // Initialize all PDFs to some standard value. + for (int x = oX; x < lX + oX; ++x) { + for (int y = oY; y < lY + oY; ++y) { + for (int z = oZ; z < lZ + oZ; ++z) { + for (int d = 0; d < N_D3Q19; ++d) { + pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = 0.0; + } + } } } // x, y, z + + + // Count how many *PDFs* need bounce back treatment. + + uint64_t nPdfs = ((uint64_t)19) * gX * gY * gZ; + + if (nPdfs > ((2LU << 31) - 1)) { + printf("ERROR: number of PDFs exceed 2^31.\n"); + exit(1); + } + + // Compiler bug? Incorrect computation of nBounceBackPdfs when using icc 15.0.2. + // Works when declaring nBounceBackPdfs as int64_t or using volatile. + volatile int nBounceBackPdfs = 0; + // int64_t nBounceBackPdfs = 0; + int nx, ny, nz, px, py, pz; + + + for (int x = 0; x < lX; ++x) { + for (int y = 0; y < lY; ++y) { + for (int z = 0; z < lZ; ++z) { + + if (ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) { + for (int d = 0; d < N_D3Q19; ++d) { + nx = x - D3Q19_X[d]; + ny = y - D3Q19_Y[d]; + nz = z - D3Q19_Z[d]; + + // Check if neighbor is inside the lattice. + // if(nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) { + // continue; + // } + if ((nx < 0 || nx >= lX) && ld->PeriodicX) { + ++nBounceBackPdfs; // Compiler bug --> see above + } + else if ((ny < 0 || ny >= lY) && ld->PeriodicY) { + ++nBounceBackPdfs; // Compiler bug --> see above + } + else if ((nz < 0 || nz >= lZ) && ld->PeriodicZ) { + ++nBounceBackPdfs; // Compiler bug --> see above + } + else if (nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) { + continue; + } + else if (ld->Lattice[L_INDEX_4(lDims, nx, ny, nz)] == LAT_CELL_OBSTACLE) { + ++nBounceBackPdfs; // Compiler bug --> see above + } + } + } + } + } + } + + printf("# allocating %d indices for bounce back pdfs (%s for source and destination array)\n", nBounceBackPdfs, ByteToHuman(sizeof(int) * nBounceBackPdfs * 2)); + + MemAlloc((void **) & (kd->BounceBackPdfsSrc), sizeof(int) * nBounceBackPdfs + 100); + MemAlloc((void **) & (kd->BounceBackPdfsDst), sizeof(int) * nBounceBackPdfs + 100); + + kd->nBounceBackPdfs = nBounceBackPdfs; + nBounceBackPdfs = 0; + + int srcIndex; + int dstIndex; + + // TODO: currently this is not NUMA-aware + // - maybe use the same blocking as for lattice initialization? + // - do place the bounce back index vector parallel? + + for (int x = 0; x < lX; ++x) { + for (int y = 0; y < lY; ++y) { + for (int z = 0; z < lZ; ++z) { + + if (ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) { + for (int d = 0; d < N_D3Q19; ++d) { + nx = x + D3Q19_X[d]; + ny = y + D3Q19_Y[d]; + nz = z + D3Q19_Z[d]; + + if ( ((nx < 0 || nx >= lX) && ld->PeriodicX) || + ((ny < 0 || ny >= lY) && ld->PeriodicY) || + ((nz < 0 || nz >= lZ) && ld->PeriodicZ) + ){ + // For periodicity: + + // We assume we have finished odd time step (accessing neighbor PDFs) and are + // before executing the even time step (accessing local PDFs only). + + // Assuming we are at the most east position of the lattice. Through the odd + // time step propagation has put a PDF in the east slot of the ghost cell east + // of us, i.e. nx, ny, nz. We copy it to the east slot of the most west node. + + // In case of transition from even to odd time step , src and dest must be + // exchanged. + + + // x periodic + if (nx < 0) { + px = lX - 1; + } + else if (nx >= lX) { + px = 0; + } else { + px = nx; + } + + // y periodic + if (ny < 0) { + py = lY - 1; + } + else if (ny >= lY) { + py = 0; + } else { + py = ny; + } + + // z periodic + if (nz < 0) { + pz = lZ - 1; + } + else if (nz >= lZ) { + pz = 0; + } else { + pz = nz; + } + + if (ld->Lattice[L_INDEX_4(lDims, px, py, pz)] == LAT_CELL_OBSTACLE) { + // See description of bounce back handling below. + srcIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, d); + dstIndex = P_INDEX_5(gDims, x + oX, y + oY, z + oZ, D3Q19_INV[d]); + } + else { + + srcIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, d); + // Put it on the other side back into the domain. + dstIndex = P_INDEX_5(gDims, px + oX, py + oY, pz + oZ, d); + + VerifyMsg(nBounceBackPdfs < kd->nBounceBackPdfs, "nBBPdfs %d < kd->nBBPdfs %d xyz: %d %d %d d: %d\n", nBounceBackPdfs, kd->nBounceBackPdfs, x, y, z, d); + + } + + kd->BounceBackPdfsSrc[nBounceBackPdfs] = srcIndex; + kd->BounceBackPdfsDst[nBounceBackPdfs] = dstIndex; + + ++nBounceBackPdfs; + + } + else if (nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) { + continue; + } + else if (ld->Lattice[L_INDEX_4(lDims, nx, ny, nz)] == LAT_CELL_OBSTACLE) { + // Depending on the time step we are in we have to exchange src and dst index. + + // We build the list for the case, when we have finished odd time step + // (accessing neighbor PDFs) and before we start with the even time step + // (accessing local PDFs only). + + // Assume our neighbor east of us, i.e. nx, ny, nz, is an obstacle cell. + // Then we have to move the east PDF from the obstacle to our west position, + // i.e. the inverse of east. + + // In case of transition from even to odd time step src and dest just + // have to be exchanged. + + srcIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, d); + dstIndex = P_INDEX_5(gDims, x + oX, y + oY, z + oZ, D3Q19_INV[d]); + + VerifyMsg(nBounceBackPdfs < kd->nBounceBackPdfs, "nBBPdfs %d < kd->nBBPdfs %d xyz: %d %d %d d: %d\n", nBounceBackPdfs, kd->nBounceBackPdfs, x, y, z, d); + + kd->BounceBackPdfsSrc[nBounceBackPdfs] = srcIndex; + kd->BounceBackPdfsDst[nBounceBackPdfs] = dstIndex; + + ++nBounceBackPdfs; + } + } + } + } + } + } + + + // Fill remaining KernelData structures + kd->GetNode = FNAME(GetNode); + kd->SetNode = FNAME(SetNode); + + kd->BoundaryConditionsGetPdf = FNAME(BcGetPdf); + kd->BoundaryConditionsSetPdf = FNAME(BcSetPdf); + + kd->Kernel = FNAME(D3Q19AaKernel); + + kd->DstPdfs = NULL; + kd->PdfsActive = kd->Pdfs[0]; + + return; +} + +void FNAME(D3Q19AaDeinit)(LatticeDesc * ld, KernelData ** kernelData) +{ + MemFree((void **) & ((*kernelData)->Pdfs[0])); + // MemFree((void **) & ((*kernelData)->Pdfs[1])); + + MemFree((void **) & ((*kernelData)->BounceBackPdfsSrc)); + MemFree((void **) & ((*kernelData)->BounceBackPdfsDst)); + + MemFree((void **)kernelData); + + return; +} + diff --git a/src/BenchKernelD3Q19AaCommon.h b/src/BenchKernelD3Q19AaCommon.h new file mode 100644 index 0000000..151e5a0 --- /dev/null +++ b/src/BenchKernelD3Q19AaCommon.h @@ -0,0 +1,89 @@ +// -------------------------------------------------------------------------- +// +// Copyright +// Markus Wittmann, 2016-2017 +// RRZE, University of Erlangen-Nuremberg, Germany +// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de +// +// Viktor Haag, 2016 +// LSS, University of Erlangen-Nuremberg, Germany +// +// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). +// +// LbmBenchKernels is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// LbmBenchKernels is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with LbmBenchKernels. If not, see . +// +// -------------------------------------------------------------------------- +#ifndef __BENCH_KERNEL_D3Q19_AA_COMMON_H__ +#define __BENCH_KERNEL_D3Q19_AA_COMMON_H__ + +#include "Kernel.h" + +typedef struct KernelDataAa_ +{ + KernelData kd; + int Blk[3]; // Blocking in X, Y, and Z direction, value of 0 disables blocking. + int Iteration; // Current iteration number. +} KernelDataAa; + +// Macro for casting KernelData * to KernelDataAa *. +#define KDA(_x_) ((KernelDataAa *)(_x_)) + +// Build a function name extended by the propagation model name and the data layout. +// FNANEM(test) will be expanded to test_PushSoA if DATA_LAYOUT_NAME is defined +// as SoA and PROP_MODEL is defined as Push. +#define FNAME(functionName) JOIN(JOIN(functionName,_),JOIN(PROP_MODEL_NAME,DATA_LAYOUT_NAME)) + +#ifndef DATA_LAYOUT_NAME + #error DATA_LAYOUT_NAME must be defined +#endif + +#ifndef PROP_MODEL_NAME + #error PROP_MODEL_NAME must be defined +#endif + +// ----------------------------------------------------------------------- +// Index function for accesssing PDF array for different data layouts. + +#define P_INDEX_5 FNAME(PINDEX5) + +static inline int FNAME(PINDEX5)(int dims[3], int x, int y, int z, int d) +{ + Assert(dims[0] > 0); + Assert(dims[1] > 0); + Assert(dims[2] > 0); + + Assert(x >= 0); + Assert(x < dims[0]); + Assert(y >= 0); + Assert(y < dims[1]); + Assert(z >= 0); + Assert(z < dims[2]); + Assert(d >= 0); +#ifdef D3Q19 + Assert(d < N_D3Q19); +#else +#error Not implemented for this discretization. +#endif + +#ifdef DATA_LAYOUT_SOA + return d * dims[0] * dims[1] * dims[2] + x * dims[1] * dims[2] + y * dims[2] + z; +#elif DATA_LAYOUT_AOS + return x * dims[1] * dims[2] * N_D3Q19 + y * dims[2] * N_D3Q19 + z * N_D3Q19 + d; +#else +#error P_INDEX_5 function no implemented for chosen data layout. +#endif +} + +#endif // __BENCH_KERNEL_D3Q19_AA_COMMON_H__ + diff --git a/src/BenchKernelD3Q19AaVec.c b/src/BenchKernelD3Q19AaVec.c new file mode 100644 index 0000000..2642c1c --- /dev/null +++ b/src/BenchKernelD3Q19AaVec.c @@ -0,0 +1,610 @@ +// -------------------------------------------------------------------------- +// +// Copyright +// Markus Wittmann, 2016-2017 +// RRZE, University of Erlangen-Nuremberg, Germany +// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de +// +// Viktor Haag, 2016 +// LSS, University of Erlangen-Nuremberg, Germany +// +// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). +// +// LbmBenchKernels is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// LbmBenchKernels is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with LbmBenchKernels. If not, see . +// +// -------------------------------------------------------------------------- +#include "BenchKernelD3Q19AaVecCommon.h" + +#include "Memory.h" +#include "Vtk.h" +#include "LikwidIf.h" +#include "Vector.h" +#include "Vector.h" + +#include +#include + +#ifdef _OPENMP + #include +#endif + +static void KernelEven(LatticeDesc * ld, KernelData * kd, CaseData * cd); +static void KernelOdd( LatticeDesc * ld, KernelData * kd, CaseData * cd); + +#if 0 // {{{ +void DumpPdfs(LatticeDesc * ld, KernelData * kd, int zStart, int zStop, int iter, const char * prefix) +{ + return; + + int * lDims = ld->Dims; + int * gDims = kd->GlobalDims; + + int nX = gDims[0]; + int nY = gDims[1]; + int nZ = gDims[2]; + + PdfT pdfs[N_D3Q19]; + + int localZStart = zStart; + int localZStop = zStop; + + if (localZStart == -1) localZStart = 0; + if (localZStop == -1) localZStop = gDims[2] - 1; + + printf("D iter: %d\n", iter); + + for (int dir = 0; dir < 19; ++dir) { + for (int z = localZStop; z >= localZStart; --z) { + printf("D [%2d][%2d][%s] plane % 2d\n", iter, dir, prefix, z); + + for(int y = 0; y < nY; ++y) { + printf("D [%2d][%2d][%s] %2d ", iter, dir, prefix, y); + + for(int x = 0; x < nX; ++x) { + + if (1) { // ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) { + + #define I(x, y, z, dir) P_INDEX_5(gDims, (x), (y), (z), (dir)) + pdfs[dir] = kd->PdfsActive[I(x, y, z, dir)]; + #undef I +// kd->GetNode(kd, x, y, z, pdfs); + } + else { + pdfs[dir] = -1.0; + } + + printf("%.16e ", pdfs[dir]); + } + + printf("\n"); + } + } + } +} +#endif // }}} + +void FNAME(D3Q19AaVecKernel)(LatticeDesc * ld, KernelData * kd, CaseData * cd) +{ + Assert(ld != NULL); + Assert(kd != NULL); + Assert(cd != NULL); + + Assert(cd->Omega > 0.0); + Assert(cd->Omega < 2.0); + + KernelDataAa * kda = KDA(kd); + + PdfT * src = kd->PdfsActive; + + int maxIterations = cd->MaxIterations; + + #ifdef VTK_OUTPUT + if (cd->VtkOutput) { + kd->PdfsActive = src; + VtkWrite(ld, kd, cd, -1); + } + #endif + + #ifdef STATISTICS + kd->PdfsActive = src; + KernelStatistics(kd, ld, cd, 0); + #endif + + Assert((maxIterations % 2) == 0); + + for (int iter = 0; iter < maxIterations; iter += 2) { + + // -------------------------------------------------------------------- + // even time step + // -------------------------------------------------------------------- + + X_LIKWID_START("aa-vec-even"); + + #pragma omp parallel + { + KernelEven(ld, kd, cd); + } + + X_LIKWID_STOP("aa-vec-even"); + + // Fixup bounce back PDFs. + #ifdef _OPENMP + #pragma omp parallel for default(none) \ + shared(kd, src) + #endif + for (int i = 0; i < kd->nBounceBackPdfs; ++i) { + src[kd->BounceBackPdfsSrc[i]] = src[kd->BounceBackPdfsDst[i]]; + } + + // save current iteration + kda->Iteration = iter; + + #ifdef VERIFICATION + kd->PdfsActive = src; + KernelAddBodyForce(kd, ld, cd); + #endif + + #ifdef VTK_OUTPUT + if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) { + kd->PdfsActive = src; + VtkWrite(ld, kd, cd, iter); + } + #endif + + #ifdef STATISTICS + kd->PdfsActive = src; + KernelStatistics(kd, ld, cd, iter); + #endif + + // -------------------------------------------------------------------- + // odd time step + // -------------------------------------------------------------------- + + X_LIKWID_START("aa-vec-odd"); + + #pragma omp parallel + { + KernelOdd(ld, kd, cd); + } + + // Stop counters before bounce back. Else computing loop balance will + // be incorrect. + + X_LIKWID_STOP("aa-vec-odd"); + + // Fixup bounce back PDFs. + #ifdef _OPENMP + #pragma omp parallel for default(none) \ + shared(kd, src) + #endif + for (int i = 0; i < kd->nBounceBackPdfs; ++i) { + src[kd->BounceBackPdfsDst[i]] = src[kd->BounceBackPdfsSrc[i]]; + } + + // save current iteration + kda->Iteration = iter + 1; + + #ifdef VERIFICATION + kd->PdfsActive = src; + KernelAddBodyForce(kd, ld, cd); + #endif + + #ifdef VTK_OUTPUT + if (cd->VtkOutput && ((iter + 1) % cd->VtkModulus) == 0) { + kd->PdfsActive = src; + VtkWrite(ld, kd, cd, iter + 1); + } + #endif + + #ifdef STATISTICS + kd->PdfsActive = src; + KernelStatistics(kd, ld, cd, iter + 1); + #endif // }}} + + + } // for (int iter = 0; ... + + #ifdef VTK_OUTPUT + + if (cd->VtkOutput) { + kd->PdfsActive = src; + VtkWrite(ld, kd, cd, maxIterations); + } + + #endif + + return; +} + +static void KernelEven(LatticeDesc * ld, KernelData * kd, CaseData * cd) // {{{ +{ + Assert(ld != NULL); + Assert(kd != NULL); + Assert(cd != NULL); + + Assert(cd->Omega > 0.0); + Assert(cd->Omega < 2.0); + + KernelDataAa * kda = KDA(kd); + + int nX = ld->Dims[0]; + int nY = ld->Dims[1]; + int nZ = ld->Dims[2]; + + int * gDims = kd->GlobalDims; + + int oX = kd->Offsets[0]; + int oY = kd->Offsets[1]; + int oZ = kd->Offsets[2]; + + int blk[3]; + blk[0] = kda->Blk[0]; + blk[1] = kda->Blk[1]; + blk[2] = kda->Blk[2]; + + PdfT omega = cd->Omega; + PdfT omegaEven = omega; + + PdfT magicParam = 1.0 / 12.0; + PdfT omegaOdd = 1.0 / (0.5 + magicParam / (1.0 / omega - 0.5)); + + const PdfT w_0 = 1.0 / 3.0; + const PdfT w_1 = 1.0 / 18.0; + const PdfT w_2 = 1.0 / 36.0; + + const PdfT w_1_x3 = w_1 * 3.0; const PdfT w_1_nine_half = w_1 * 9.0 / 2.0; + const PdfT w_2_x3 = w_2 * 3.0; const PdfT w_2_nine_half = w_2 * 9.0 / 2.0; + + + VPDFT VONE_HALF = VSET(0.5); + VPDFT VTHREE_HALF = VSET(3.0 / 2.0); + + VPDFT vw_1_indep, vw_2_indep; + VPDFT vw_0 = VSET(w_0); + VPDFT vw_1 = VSET(w_1); + VPDFT vw_2 = VSET(w_2); + + VPDFT vw_1_x3 = VSET(w_1_x3); + VPDFT vw_2_x3 = VSET(w_2_x3); + VPDFT vw_1_nine_half = VSET(w_1_nine_half); + VPDFT vw_2_nine_half = VSET(w_2_nine_half); + + VPDFT vui, vux, vuy, vuz, vdens; + + VPDFT vevenPart, voddPart, vdir_indep_trm; + + VPDFT vomegaEven = VSET(omegaEven); + VPDFT vomegaOdd = VSET(omegaOdd); + + // Declare pdf_N, pdf_E, pdf_S, pdf_W, ... + #define X(name, idx, idxinv, x, y, z) VPDFT JOIN(vpdf_,name); + D3Q19_LIST + #undef X + + PdfT * src = kd->Pdfs[0]; + + int nThreads = 1; + int threadId = 0; + + #ifdef _OPENMP + nThreads = omp_get_max_threads(); + threadId = omp_get_thread_num(); + #endif + + // TODO: Currently only a 1-D decomposition is applied. For achritectures + // with a lot of cores we want at least 2-D. + + int threadStartX = nX / nThreads * threadId; + int threadEndX = nX / nThreads * (threadId + 1); + + if (nX % nThreads > 0) { + if (nX % nThreads > threadId) { + threadStartX += threadId; + threadEndX += threadId + 1; + } + else { + threadStartX += nX % nThreads; + threadEndX += nX % nThreads; + } + } + + AssertMsg((blk[2] % VSIZE == 0) || blk[2] >= nZ, "Blocking in z direction must be a multiple of VSIZE = %d or larger than z dimension.", VSIZE); + + for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) { + for (int bY = oY; bY < nY + oY; bY += blk[1]) { + for (int bZ = oZ; bZ < nZ + oZ; bZ += blk[2]) { + + int eX = MIN(bX + blk[0], threadEndX + oX); + int eY = MIN(bY + blk[1], nY + oY); + int eZ = MIN(bZ + blk[2], nZ + oZ); + + for (int x = bX; x < eX; x += 1) { + for (int y = bY; y < eY; y += 1) { + for (int z = bZ; z < eZ; z += VSIZE) { + + #define I(x, y, z, dir) P_INDEX_5(gDims, (x), (y), (z), (dir)) + + // Load PDFs of local cell: pdf_N = src[I(x, y, z, D3Q19_N)]; ... + #define X(name, idx, idxinv, _x, _y, _z) JOIN(vpdf_,name) = VLDU(&src[I(x, y, z, idx)]); + D3Q19_LIST + #undef X + + + vux = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_E,VADD(vpdf_NE,vpdf_SE)),VADD(vpdf_TE,vpdf_BE)),vpdf_W),vpdf_NW),vpdf_SW),vpdf_TW),vpdf_BW); + vuy = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_N,VADD(vpdf_NE,vpdf_NW)),VADD(vpdf_TN,vpdf_BN)),vpdf_S),vpdf_SE),vpdf_SW),vpdf_TS),vpdf_BS); + vuz = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_T,VADD(vpdf_TE,vpdf_TW)),VADD(vpdf_TN,vpdf_TS)),vpdf_B),vpdf_BE),vpdf_BW),vpdf_BN),vpdf_BS); + + vdens = VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(vpdf_C,VADD(vpdf_N,vpdf_E)),VADD(vpdf_S,vpdf_W)),VADD(vpdf_NE,vpdf_SE)), + VADD(vpdf_SW,vpdf_NW)),VADD(vpdf_T,vpdf_TN)),VADD(vpdf_TE,vpdf_TS)),VADD(vpdf_TW,vpdf_B)), + VADD(vpdf_BN,vpdf_BE)),VADD(vpdf_BS,vpdf_BW)); + + vdir_indep_trm = VSUB(vdens,VMUL(VADD(VADD(VMUL(vux,vux),VMUL(vuy,vuy)),VMUL(vuz,vuz)),VTHREE_HALF)); + + VSTU(&src[I(x, y, z, D3Q19_C)],VSUB(vpdf_C,VMUL(vomegaEven,VSUB(vpdf_C,VMUL(vw_0,vdir_indep_trm))))); + + vw_1_indep = VMUL(vw_1,vdir_indep_trm); + + vui = vuy; + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_N,vpdf_S)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep)); + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_N,vpdf_S)),VMUL(vui,vw_1_x3))); + VSTU(&src[I(x, y, z, D3Q19_S)],VSUB(VSUB(vpdf_N,vevenPart),voddPart)); + VSTU(&src[I(x, y, z, D3Q19_N)],VADD(VSUB(vpdf_S,vevenPart),voddPart)); + + vui = vux; + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_E,vpdf_W)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep)); + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_E,vpdf_W)),VMUL(vui,vw_1_x3))); + VSTU(&src[I(x, y, z, D3Q19_W)],VSUB(VSUB(vpdf_E,vevenPart),voddPart)); + VSTU(&src[I(x, y, z, D3Q19_E)],VADD(VSUB(vpdf_W,vevenPart),voddPart)); + + vui = vuz; + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_T,vpdf_B)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep)); + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_T,vpdf_B)),VMUL(vui,vw_1_x3))); + VSTU(&src[I(x, y, z, D3Q19_B)],VSUB(VSUB(vpdf_T,vevenPart),voddPart)); + VSTU(&src[I(x, y, z, D3Q19_T)],VADD(VSUB(vpdf_B,vevenPart),voddPart)); + + vw_2_indep = VMUL(vw_2,vdir_indep_trm); + + vui = VSUB(vuy,vux); + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_NW,vpdf_SE)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_NW,vpdf_SE)),VMUL(vui,vw_2_x3))); + VSTU(&src[I(x, y, z, D3Q19_SE)],VSUB(VSUB(vpdf_NW,vevenPart),voddPart)); + VSTU(&src[I(x, y, z, D3Q19_NW)],VADD(VSUB(vpdf_SE,vevenPart),voddPart)); + + vui = VADD(vux,vuy); + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_NE,vpdf_SW)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_NE,vpdf_SW)),VMUL(vui,vw_2_x3))); + VSTU(&src[I(x, y, z, D3Q19_SW)],VSUB(VSUB(vpdf_NE,vevenPart),voddPart)); + VSTU(&src[I(x, y, z, D3Q19_NE)],VADD(VSUB(vpdf_SW,vevenPart),voddPart)); + + vui = VSUB(vuz,vux); + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TW,vpdf_BE)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TW,vpdf_BE)),VMUL(vui,vw_2_x3))); + VSTU(&src[I(x, y, z, D3Q19_BE)],VSUB(VSUB(vpdf_TW,vevenPart),voddPart)); + VSTU(&src[I(x, y, z, D3Q19_TW)],VADD(VSUB(vpdf_BE,vevenPart),voddPart)); + + vui = VADD(vux,vuz); + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TE,vpdf_BW)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TE,vpdf_BW)),VMUL(vui,vw_2_x3))); + VSTU(&src[I(x, y, z, D3Q19_BW)],VSUB(VSUB(vpdf_TE,vevenPart),voddPart)); + VSTU(&src[I(x, y, z, D3Q19_TE)],VADD(VSUB(vpdf_BW,vevenPart),voddPart)); + + vui = VSUB(vuz,vuy); + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TS,vpdf_BN)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TS,vpdf_BN)),VMUL(vui,vw_2_x3))); + VSTU(&src[I(x, y, z, D3Q19_BN)],VSUB(VSUB(vpdf_TS,vevenPart),voddPart)); + VSTU(&src[I(x, y, z, D3Q19_TS)],VADD(VSUB(vpdf_BN,vevenPart),voddPart)); + + vui = VADD(vuy,vuz); + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TN,vpdf_BS)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TN,vpdf_BS)),VMUL(vui,vw_2_x3))); + VSTU(&src[I(x, y, z, D3Q19_BS)],VSUB(VSUB(vpdf_TN,vevenPart),voddPart)); + VSTU(&src[I(x, y, z, D3Q19_TN)],VADD(VSUB(vpdf_BS,vevenPart),voddPart)); + + #undef I + } } } // x, y, z + } } } // blocked x, y, z + + + + return; +} // }}} + + +static void KernelOdd(LatticeDesc * ld, KernelData * kd, CaseData * cd) // {{{ +{ + Assert(ld != NULL); + Assert(kd != NULL); + Assert(cd != NULL); + + Assert(cd->Omega > 0.0); + Assert(cd->Omega < 2.0); + + KernelDataAa * kda = KDA(kd); + + int nX = ld->Dims[0]; + int nY = ld->Dims[1]; + int nZ = ld->Dims[2]; + + int * gDims = kd->GlobalDims; + + int oX = kd->Offsets[0]; + int oY = kd->Offsets[1]; + int oZ = kd->Offsets[2]; + + int blk[3]; + blk[0] = kda->Blk[0]; + blk[1] = kda->Blk[1]; + blk[2] = kda->Blk[2]; + + PdfT omega = cd->Omega; + PdfT omegaEven = omega; + + PdfT magicParam = 1.0 / 12.0; + PdfT omegaOdd = 1.0 / (0.5 + magicParam / (1.0 / omega - 0.5)); + + const PdfT w_0 = 1.0 / 3.0; + const PdfT w_1 = 1.0 / 18.0; + const PdfT w_2 = 1.0 / 36.0; + + const PdfT w_1_x3 = w_1 * 3.0; const PdfT w_1_nine_half = w_1 * 9.0 / 2.0; + const PdfT w_2_x3 = w_2 * 3.0; const PdfT w_2_nine_half = w_2 * 9.0 / 2.0; + + VPDFT VONE_HALF = VSET(0.5); + VPDFT VTHREE_HALF = VSET(3.0 / 2.0); + + VPDFT vw_1_indep, vw_2_indep; + VPDFT vw_0 = VSET(w_0); + VPDFT vw_1 = VSET(w_1); + VPDFT vw_2 = VSET(w_2); + + VPDFT vw_1_x3 = VSET(w_1_x3); + VPDFT vw_2_x3 = VSET(w_2_x3); + VPDFT vw_1_nine_half = VSET(w_1_nine_half); + VPDFT vw_2_nine_half = VSET(w_2_nine_half); + + VPDFT vui, vux, vuy, vuz, vdens; + + VPDFT vevenPart, voddPart, vdir_indep_trm; + + VPDFT vomegaEven = VSET(omegaEven); + VPDFT vomegaOdd = VSET(omegaOdd); + + // Declare pdf_N, pdf_E, pdf_S, pdf_W, ... + #define X(name, idx, idxinv, x, y, z) VPDFT JOIN(vpdf_,name); + D3Q19_LIST + #undef X + + PdfT * src = kd->Pdfs[0]; + + int threadId = 0; + int nThreads = 1; + + #ifdef _OPENMP + threadId = omp_get_thread_num(); + nThreads = omp_get_max_threads(); + #endif + + // TODO: Currently only a 1-D decomposition is applied. For achritectures + // with a lot of cores we want at least 2-D. + int threadStartX = nX / nThreads * threadId; + int threadEndX = nX / nThreads * (threadId + 1); + + if (nX % nThreads > 0) { + if (nX % nThreads > threadId) { + threadStartX += threadId; + threadEndX += threadId + 1; + } + else { + threadStartX += nX % nThreads; + threadEndX += nX % nThreads; + } + } + + AssertMsg((blk[2] % VSIZE == 0) || blk[2] >= nZ, "Blocking in z direction must be a multiple of VSIZE = %d or larger than z dimension.", VSIZE); + + for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) { + for (int bY = oY; bY < nY + oY; bY += blk[1]) { + for (int bZ = oZ; bZ < nZ + oZ; bZ += blk[2]) { + + int eX = MIN(bX + blk[0], threadEndX + oX); + int eY = MIN(bY + blk[1], nY + oY); + int eZ = MIN(bZ + blk[2], nZ + oZ); + + for (int x = bX; x < eX; ++x) { + for (int y = bY; y < eY; ++y) { + for (int z = bZ; z < eZ; z += VSIZE) { + + #define I(x, y, z, dir) P_INDEX_5(gDims, (x), (y), (z), (dir)) + + + #define X(name, idx, idxinv, _x, _y, _z) JOIN(vpdf_,name) = VLDU(&src[I(x - _x, y - _y, z - _z, idxinv)]); + D3Q19_LIST + #undef X + + vux = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_E,VADD(vpdf_NE,vpdf_SE)),VADD(vpdf_TE,vpdf_BE)),vpdf_W),vpdf_NW),vpdf_SW),vpdf_TW),vpdf_BW); + vuy = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_N,VADD(vpdf_NE,vpdf_NW)),VADD(vpdf_TN,vpdf_BN)),vpdf_S),vpdf_SE),vpdf_SW),vpdf_TS),vpdf_BS); + vuz = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_T,VADD(vpdf_TE,vpdf_TW)),VADD(vpdf_TN,vpdf_TS)),vpdf_B),vpdf_BE),vpdf_BW),vpdf_BN),vpdf_BS); + + vdens = VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(vpdf_C,VADD(vpdf_N,vpdf_E)),VADD(vpdf_S,vpdf_W)),VADD(vpdf_NE,vpdf_SE)), + VADD(vpdf_SW,vpdf_NW)),VADD(vpdf_T,vpdf_TN)),VADD(vpdf_TE,vpdf_TS)),VADD(vpdf_TW,vpdf_B)),VADD(vpdf_BN,vpdf_BE)),VADD(vpdf_BS,vpdf_BW)); + + vdir_indep_trm = VSUB(vdens,VMUL(VADD(VADD(VMUL(vux,vux),VMUL(vuy,vuy)),VMUL(vuz,vuz)),VTHREE_HALF)); + + VSTU(&src[I(x, y, z, D3Q19_C)],VSUB(vpdf_C,VMUL(vomegaEven,VSUB(vpdf_C,VMUL(vw_0,vdir_indep_trm))))); + + vw_1_indep = VMUL(vw_1,vdir_indep_trm); + + vui = vuy; + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_N,vpdf_S)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep)); + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_N,vpdf_S)),VMUL(vui,vw_1_x3))); + VSTU(&src[I(x, y + 1, z, D3Q19_N)], VSUB(VSUB(vpdf_N,vevenPart),voddPart)); + VSTU(&src[I(x, y - 1, z, D3Q19_S)], VADD(VSUB(vpdf_S,vevenPart),voddPart)); + + vui = vux; + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_E,vpdf_W)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep)); + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_E,vpdf_W)),VMUL(vui,vw_1_x3))); + VSTU(&src[I(x + 1, y, z, D3Q19_E)], VSUB(VSUB(vpdf_E,vevenPart),voddPart)); + VSTU(&src[I(x - 1, y, z, D3Q19_W)], VADD(VSUB(vpdf_W,vevenPart),voddPart)); + + vui = vuz; + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_T,vpdf_B)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep)); + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_T,vpdf_B)),VMUL(vui,vw_1_x3))); + VSTU(&src[I(x, y, z + 1, D3Q19_T)], VSUB(VSUB(vpdf_T,vevenPart),voddPart)); + VSTU(&src[I(x, y, z - 1, D3Q19_B)], VADD(VSUB(vpdf_B,vevenPart),voddPart)); + + vw_2_indep = VMUL(vw_2,vdir_indep_trm); + + vui = VSUB(vuy,vux); + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_NW,vpdf_SE)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_NW,vpdf_SE)),VMUL(vui,vw_2_x3))); + VSTU(&src[I(x - 1, y + 1, z, D3Q19_NW)], VSUB(VSUB(vpdf_NW,vevenPart),voddPart)); + VSTU(&src[I(x + 1, y - 1, z, D3Q19_SE)], VADD(VSUB(vpdf_SE,vevenPart),voddPart)); + + vui = VADD(vux,vuy); + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_NE,vpdf_SW)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_NE,vpdf_SW)),VMUL(vui,vw_2_x3))); + VSTU(&src[I(x + 1, y + 1, z, D3Q19_NE)], VSUB(VSUB(vpdf_NE,vevenPart),voddPart)); + VSTU(&src[I(x - 1, y - 1, z, D3Q19_SW)], VADD(VSUB(vpdf_SW,vevenPart),voddPart)); + + vui = VSUB(vuz,vux); + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TW,vpdf_BE)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TW,vpdf_BE)),VMUL(vui,vw_2_x3))); + VSTU(&src[I(x - 1, y, z + 1, D3Q19_TW)], VSUB(VSUB(vpdf_TW,vevenPart),voddPart)); + VSTU(&src[I(x + 1, y, z - 1, D3Q19_BE)], VADD(VSUB(vpdf_BE,vevenPart),voddPart)); + + vui = VADD(vux,vuz); + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TE,vpdf_BW)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TE,vpdf_BW)),VMUL(vui,vw_2_x3))); + VSTU(&src[I(x + 1, y, z + 1, D3Q19_TE)], VSUB(VSUB(vpdf_TE,vevenPart),voddPart)); + VSTU(&src[I(x - 1, y, z - 1, D3Q19_BW)], VADD(VSUB(vpdf_BW,vevenPart),voddPart)); + + vui = VSUB(vuz,vuy); + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TS,vpdf_BN)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TS,vpdf_BN)),VMUL(vui,vw_2_x3))); + VSTU(&src[I(x, y - 1, z + 1, D3Q19_TS)], VSUB(VSUB(vpdf_TS,vevenPart),voddPart)); + VSTU(&src[I(x, y + 1, z - 1, D3Q19_BN)], VADD(VSUB(vpdf_BN,vevenPart),voddPart)); + + vui = VADD(vuy,vuz); + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TN,vpdf_BS)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TN,vpdf_BS)),VMUL(vui,vw_2_x3))); + VSTU(&src[I(x, y + 1, z + 1, D3Q19_TN)], VSUB(VSUB(vpdf_TN,vevenPart),voddPart)); + VSTU(&src[I(x, y - 1, z - 1, D3Q19_BS)], VADD(VSUB(vpdf_BS,vevenPart),voddPart)); + + #undef I + } } } // x, y, z + } } } // blocked x, y, z + + return; + +} // }}} diff --git a/src/BenchKernelD3Q19AaVec.h b/src/BenchKernelD3Q19AaVec.h new file mode 100644 index 0000000..a125b04 --- /dev/null +++ b/src/BenchKernelD3Q19AaVec.h @@ -0,0 +1,38 @@ +// -------------------------------------------------------------------------- +// +// Copyright +// Markus Wittmann, 2016-2017 +// RRZE, University of Erlangen-Nuremberg, Germany +// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de +// +// Viktor Haag, 2016 +// LSS, University of Erlangen-Nuremberg, Germany +// +// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). +// +// LbmBenchKernels is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// LbmBenchKernels is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with LbmBenchKernels. If not, see . +// +// -------------------------------------------------------------------------- +#ifndef __BENCH_KERNEL_D3Q19_AA_VEC__ +#define __BENCH_KERNEL_D3Q19_AA_VEC__ + +#include "Kernel.h" + + +void D3Q19AaVecInit_AaSoA(LatticeDesc * ld, KernelData ** kernelData, Parameters * params); +void D3Q19AaVecDeinit_AaSoA(LatticeDesc * ld, KernelData ** kernelData); + + + +#endif // __BENCH_KERNEL_D3Q19_AA_VEC__ diff --git a/src/BenchKernelD3Q19AaVecCommon.c b/src/BenchKernelD3Q19AaVecCommon.c new file mode 100644 index 0000000..57a9fae --- /dev/null +++ b/src/BenchKernelD3Q19AaVecCommon.c @@ -0,0 +1,656 @@ +// -------------------------------------------------------------------------- +// +// Copyright +// Markus Wittmann, 2016-2017 +// RRZE, University of Erlangen-Nuremberg, Germany +// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de +// +// Viktor Haag, 2016 +// LSS, University of Erlangen-Nuremberg, Germany +// +// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). +// +// LbmBenchKernels is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// LbmBenchKernels is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with LbmBenchKernels. If not, see . +// +// -------------------------------------------------------------------------- +#include "BenchKernelD3Q19AaVecCommon.h" + +#include "Memory.h" +#include "Vtk.h" +#include "Vector.h" + +#include +#include + +#ifdef _OPENMP + #include +#endif + +// Forward definition. +void FNAME(D3Q19AaVecKernel)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd); + + +static void FNAME(BcGetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT * pdf) +{ + Assert(kd != NULL); + Assert(kd->PdfsActive != NULL); + Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]); + Assert(pdf != NULL); + + Assert(x >= 0); Assert(y >= 0); Assert(z >= 0); + Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]); + Assert(dir >= 0); Assert(dir < N_D3Q19); + + KernelDataAa * kda = KDA(kd); + + int oX = kd->Offsets[0]; + int oY = kd->Offsets[1]; + int oZ = kd->Offsets[2]; + + if (kda->Iteration % 2 == 0) { + // Pdfs are stored inverse, local PDFs are located in remote nodes + int nx = x - D3Q19_X[dir]; + int ny = y - D3Q19_Y[dir]; + int nz = z - D3Q19_Z[dir]; + + #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir)) + *pdf = kd->PdfsActive[I(nx + oX, ny + oY, nz + oZ, D3Q19_INV[dir])]; + #undef I + } + else { + int nx = x; + int ny = y; + int nz = z; + + #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir)) + *pdf = kd->PdfsActive[I(nx + oX, ny + oY, nz + oZ, dir)]; + #undef I + } + + + return; +} + +static void FNAME(BcSetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT pdf) +{ + Assert(kd != NULL); + Assert(kd->PdfsActive != NULL); + Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]); + + Assert(x >= 0); Assert(y >= 0); Assert(z >= 0); + Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]); + Assert(dir >= 0); Assert(dir < N_D3Q19); + + KernelDataAa * kda = KDA(kd); + + int oX = kd->Offsets[0]; + int oY = kd->Offsets[1]; + int oZ = kd->Offsets[2]; + + if (kda->Iteration % 2 == 0) { + // Pdfs are stored inverse, local PDFs are located in remote nodes + int nx = x - D3Q19_X[dir]; + int ny = y - D3Q19_Y[dir]; + int nz = z - D3Q19_Z[dir]; + + #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir)) + pdf = kd->PdfsActive[I(nx + oX, ny + oY, nz + oZ, D3Q19_INV[dir])] = pdf; + #undef I + } + else { + int nx = x; + int ny = y; + int nz = z; + + #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir)) + kd->PdfsActive[I(nx + oX, ny + oY, nz + oZ, dir)] = pdf; + #undef I + } + + return; +} + + +static void FNAME(GetNode)(KernelData * kd, int x, int y, int z, PdfT * pdfs) +{ + Assert(kd != NULL); + Assert(kd->PdfsActive != NULL); + Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]); + Assert(pdfs != NULL); + + Assert(x >= 0); Assert(y >= 0); Assert(z >= 0); + Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]); + + KernelDataAa * kda = KDA(kd); + + int oX = kd->Offsets[0]; + int oY = kd->Offsets[1]; + int oZ = kd->Offsets[2]; + + + if (kda->Iteration % 2 == 0) { + // Pdfs are stored inverse, local PDFs are located in remote nodes + + #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir)) + #define X(name, idx, idxinv, _x, _y, _z) pdfs[idx] = kd->PdfsActive[I(x + oX - _x, y + oY - _y, z + oZ - _z, D3Q19_INV[idx])]; + D3Q19_LIST + #undef X + #undef I + } + else { + #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir)) + #define X(name, idx, idxinv, _x, _y, _z) pdfs[idx] = kd->PdfsActive[I(x + oX, y + oY, z + oZ, idx)]; + D3Q19_LIST + #undef X + #undef I + + } + +#if 0 // DETECT NANs + + for (int d = 0; d < 19; ++d) { + if (isnan(pdfs[d])) { + printf("%d %d %d %d nan! get node\n", x, y, z, d); + + for (int d2 = 0; d2 < 19; ++d2) { + printf("%d: %e\n", d2, pdfs[d2]); + } + + exit(1); + } + } + +#endif + + return; +} + + +static void FNAME(SetNode)(KernelData * kd, int x, int y, int z, PdfT * pdfs) +{ + Assert(kd != NULL); + Assert(kd->PdfsActive != NULL); + Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]); + Assert(pdfs != NULL); + + Assert(x >= 0); Assert(y >= 0); Assert(z >= 0); + Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]); + + KernelDataAa * kda = KDA(kd); + + int oX = kd->Offsets[0]; + int oY = kd->Offsets[1]; + int oZ = kd->Offsets[2]; + + if (kda->Iteration % 2 == 0) { + // Pdfs are stored inverse, local PDFs are located in remote nodes + + #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir)) + #define X(name, idx, idxinv, _x, _y, _z) kd->PdfsActive[I(x + oX - _x, y + oY - _y, z + oZ - _z, D3Q19_INV[idx])] = pdfs[idx]; + D3Q19_LIST + #undef X + #undef I + } + else { + #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir)) + #define X(name, idx, idxinv, _x, _y, _z) kd->PdfsActive[I(x + oX, y + oY, z + oZ, idx)] = pdfs[idx]; + D3Q19_LIST + #undef X + #undef I + } + return; +} + + +static void ParameterUsage() +{ + printf("Kernel parameters:\n"); + printf(" [-blk ] [-blk-[xyz] ]\n"); + + return; +} + +static void ParseParameters(Parameters * params, int * blk) +{ + Assert(blk != NULL); + + blk[0] = 0; blk[1] = 0; blk[2] = 0; + + #define ARG_IS(param) (!strcmp(params->KernelArgs[i], param)) + #define NEXT_ARG_PRESENT() \ + do { \ + if (i + 1 >= params->nKernelArgs) { \ + printf("ERROR: argument %s requires a parameter.\n", params->KernelArgs[i]); \ + exit(1); \ + } \ + } while (0) + + + for (int i = 0; i < params->nKernelArgs; ++i) { + if (ARG_IS("-blk") || ARG_IS("--blk")) { + NEXT_ARG_PRESENT(); + + int tmp = strtol(params->KernelArgs[++i], NULL, 0); + + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); + exit(1); + } + + blk[0] = blk[1] = blk[2] = tmp; + } + else if (ARG_IS("-blk-x") || ARG_IS("--blk-x")) { + NEXT_ARG_PRESENT(); + + int tmp = strtol(params->KernelArgs[++i], NULL, 0); + + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); + exit(1); + } + + blk[0] = tmp; + } + else if (ARG_IS("-blk-y") || ARG_IS("--blk-y")) { + NEXT_ARG_PRESENT(); + + int tmp = strtol(params->KernelArgs[++i], NULL, 0); + + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); + exit(1); + } + + blk[1] = tmp; + } + else if (ARG_IS("-blk-z") || ARG_IS("--blk-z")) { + NEXT_ARG_PRESENT(); + + int tmp = strtol(params->KernelArgs[++i], NULL, 0); + + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); + exit(1); + } + + blk[2] = tmp; + } + else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) { + ParameterUsage(); + exit(1); + } + else { + printf("ERROR: unknown kernel parameter.\n"); + ParameterUsage(); + exit(1); + } + } + + #undef ARG_IS + #undef NEXT_ARG_PRESENT + + return; +} + + +void FNAME(D3Q19AaVecInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params) +{ + KernelDataAa * kda = NULL; + MemAlloc((void **)&kda, sizeof(KernelDataAa)); + + kda->Blk[0] = 0; kda->Blk[1] = 0; kda->Blk[2] = 0; + kda->Iteration = -1; + + KernelData * kd = &kda->kd; + *kernelData = kd; + + kd->nObstIndices = ld->nObst; + + // Ajust the dimensions according to padding, if used. + kd->Dims[0] = ld->Dims[0]; + kd->Dims[1] = ld->Dims[1]; + kd->Dims[2] = ld->Dims[2]; + + + int * lDims = ld->Dims; + int * gDims = kd->GlobalDims; + + Assert(VSIZE <= 4); + + // TODO: only add enough ghost cells so we can compute everything vectorized. + gDims[0] = lDims[0] + 2; + gDims[1] = lDims[1] + 2; + // TODO: fix this for aa-vec2-soa + gDims[2] = lDims[2] + 4; // one ghost cell in front, one in the back, plus at most two at the back for VSIZE = 4 + + kd->Offsets[0] = 1; + kd->Offsets[1] = 1; + kd->Offsets[2] = 1; + + int lX = lDims[0]; + int lY = lDims[1]; + int lZ = lDims[2]; + + int gX = gDims[0]; + int gY = gDims[1]; + int gZ = gDims[2]; + + int oX = kd->Offsets[0]; + int oY = kd->Offsets[1]; + int oZ = kd->Offsets[2]; + + int blk[3] = { 0 }; + + int nCells = gX * gY * gZ; + + PdfT * pdfs[2] = { NULL, NULL }; + + ParseParameters(params, blk); + + if (blk[2] % VSIZE != 0) { + blk[2] -= blk[2] % VSIZE; + printf("WARNING: blocking factor for z direction must be a multiple of VSIZE = %d, adjusting it to %d.\n", VSIZE, blk[2]); + } + + if (blk[0] == 0) blk[0] = gX; + if (blk[1] == 0) blk[1] = gY; + if (blk[2] == 0) blk[2] = gZ; + + printf("# blocking x: %3d y: %3d z: %3d\n", blk[0], blk[1], blk[2]); + + kda->Blk[0] = blk[0]; kda->Blk[1] = blk[1]; kda->Blk[2] = blk[2]; + + + printf("# allocating data for %d LB nodes with padding (%lu bytes = %f MiB for the single lattice)\n", + nCells, + sizeof(PdfT) * nCells * N_D3Q19, + sizeof(PdfT) * nCells * N_D3Q19 / 1024.0 / 1024.0); + +#define PAGE_4K 4096 + + MemAllocAligned((void **)&pdfs[0], sizeof(PdfT) * nCells * N_D3Q19, PAGE_4K); + + kd->Pdfs[0] = pdfs[0]; + kd->Pdfs[1] = NULL; + + + // Initialize PDFs with some (arbitrary) data for correct NUMA placement. + // This depends on the chosen data layout. + // The structure of the loop should resemble the same "execution layout" + // as in the kernel! + + int nThreads; +#ifdef _OPENMP + nThreads = omp_get_max_threads(); +#endif + +#ifdef _OPENMP + #pragma omp parallel for \ + shared(gDims, pdfs, \ + oX, oY, oZ, lX, lY, lZ, blk, nThreads, ld) +#endif + for (int i = 0; i < nThreads; ++i) { + + int threadStartX = lX / nThreads * i; + int threadEndX = lX / nThreads * (i + 1); + + if (lX % nThreads > 0) { + if (lX % nThreads > i) { + threadStartX += i; + threadEndX += i + 1; + } + else { + threadStartX += lX % nThreads; + threadEndX += lX % nThreads; + } + } + + for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) { + for (int bY = oY; bY < lY + oY; bY += blk[1]) { + for (int bZ = oZ; bZ < lZ + oZ; bZ += blk[2]) { + + int eX = MIN(bX + blk[0], threadEndX + oX); + int eY = MIN(bY + blk[1], lY + oY); + int eZ = MIN(bZ + blk[2], lZ + oZ); + + // printf("%d: %d-%d %d-%d %d-%d %d - %d\n", omp_get_thread_num(), bZ, eZ, bY, eY, bX, eX, threadStartX, threadEndX); + + for (int x = bX; x < eX; ++x) { + for (int y = bY; y < eY; ++y) { + for (int z = bZ; z < eZ; ++z) { + + for (int d = 0; d < N_D3Q19; ++d) { + pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = -50.0; + } + + } } } // x, y, z + } } } // bx, by, bz + } + + + // Initialize all PDFs to some standard value. + for (int x = oX; x < lX + oX; ++x) { + for (int y = oY; y < lY + oY; ++y) { + for (int z = oZ; z < lZ + oZ; ++z) { + for (int d = 0; d < N_D3Q19; ++d) { + pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = 0.0; + } + } } } // x, y, z + + + // Count how many *PDFs* need bounce back treatment. + + uint64_t nPdfs = ((uint64_t)19) * gX * gY * gZ; + + if (nPdfs > ((2LU << 31) - 1)) { + printf("ERROR: number of PDFs exceed 2^31.\n"); + exit(1); + } + + // Compiler bug? Incorrect computation of nBounceBackPdfs when using icc 15.0.2. + // Works when declaring nBounceBackPdfs as int64_t or using volatile. + volatile int nBounceBackPdfs = 0; + // int64_t nBounceBackPdfs = 0; + int nx, ny, nz, px, py, pz; + + + for (int x = 0; x < lX; ++x) { + for (int y = 0; y < lY; ++y) { + for (int z = 0; z < lZ; ++z) { + + if (ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) { + for (int d = 0; d < N_D3Q19; ++d) { + nx = x - D3Q19_X[d]; + ny = y - D3Q19_Y[d]; + nz = z - D3Q19_Z[d]; + + // Check if neighbor is inside the lattice. + // if(nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) { + // continue; + // } + if ((nx < 0 || nx >= lX) && ld->PeriodicX) { + ++nBounceBackPdfs; // Compiler bug --> see above + } + else if ((ny < 0 || ny >= lY) && ld->PeriodicY) { + ++nBounceBackPdfs; // Compiler bug --> see above + } + else if ((nz < 0 || nz >= lZ) && ld->PeriodicZ) { + ++nBounceBackPdfs; // Compiler bug --> see above + } + else if (nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) { + continue; + } + else if (ld->Lattice[L_INDEX_4(lDims, nx, ny, nz)] == LAT_CELL_OBSTACLE) { + ++nBounceBackPdfs; // Compiler bug --> see above + } + } + } + } + } + } + + printf("# allocating %d indices for bounce back pdfs (%s for source and destination array)\n", nBounceBackPdfs, ByteToHuman(sizeof(int) * nBounceBackPdfs * 2)); + + MemAlloc((void **) & (kd->BounceBackPdfsSrc), sizeof(int) * nBounceBackPdfs + 100); + MemAlloc((void **) & (kd->BounceBackPdfsDst), sizeof(int) * nBounceBackPdfs + 100); + + kd->nBounceBackPdfs = nBounceBackPdfs; + nBounceBackPdfs = 0; + + int srcIndex; + int dstIndex; + + // TODO: currently this is not NUMA-aware + // - maybe use the same blocking as for lattice initialization? + // - do place the bounce back index vector parallel? + + for (int x = 0; x < lX; ++x) { + for (int y = 0; y < lY; ++y) { + for (int z = 0; z < lZ; ++z) { + + if (ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) { + for (int d = 0; d < N_D3Q19; ++d) { + nx = x + D3Q19_X[d]; + ny = y + D3Q19_Y[d]; + nz = z + D3Q19_Z[d]; + + if ( ((nx < 0 || nx >= lX) && ld->PeriodicX) || + ((ny < 0 || ny >= lY) && ld->PeriodicY) || + ((nz < 0 || nz >= lZ) && ld->PeriodicZ) + ){ + // For periodicity: + + // We assume we have finished odd time step (accessing neighbor PDFs) and are + // before executing the even time step (accessing local PDFs only). + + // Assuming we are at the most east position of the lattice. Through the odd + // time step propagation has put a PDF in the east slot of the ghost cell east + // of us, i.e. nx, ny, nz. We copy it to the east slot of the most west node. + + // In case of transition from even to odd time step , src and dest must be + // exchanged. + + + // x periodic + if (nx < 0) { + px = lX - 1; + } + else if (nx >= lX) { + px = 0; + } else { + px = nx; + } + + // y periodic + if (ny < 0) { + py = lY - 1; + } + else if (ny >= lY) { + py = 0; + } else { + py = ny; + } + + // z periodic + if (nz < 0) { + pz = lZ - 1; + } + else if (nz >= lZ) { + pz = 0; + } else { + pz = nz; + } + + if (ld->Lattice[L_INDEX_4(lDims, px, py, pz)] == LAT_CELL_OBSTACLE) { + // See description of bounce back handling below. + srcIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, d); + dstIndex = P_INDEX_5(gDims, x + oX, y + oY, z + oZ, D3Q19_INV[d]); + } + else { + + srcIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, d); + // Put it on the other side back into the domain. + dstIndex = P_INDEX_5(gDims, px + oX, py + oY, pz + oZ, d); + + VerifyMsg(nBounceBackPdfs < kd->nBounceBackPdfs, "nBBPdfs %d < kd->nBBPdfs %d xyz: %d %d %d d: %d\n", nBounceBackPdfs, kd->nBounceBackPdfs, x, y, z, d); + + } + + kd->BounceBackPdfsSrc[nBounceBackPdfs] = srcIndex; + kd->BounceBackPdfsDst[nBounceBackPdfs] = dstIndex; + + ++nBounceBackPdfs; + + } + else if (nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) { + continue; + } + else if (ld->Lattice[L_INDEX_4(lDims, nx, ny, nz)] == LAT_CELL_OBSTACLE) { + // Depending on the time step we are in we have to exchange src and dst index. + + // We build the list for the case, when we have finished odd time step + // (accessing neighbor PDFs) and before we start with the even time step + // (accessing local PDFs only). + + // Assume our neighbor east of us, i.e. nx, ny, nz, is an obstacle cell. + // Then we have to move the east PDF from the obstacle to our west position, + // i.e. the inverse of east. + + // In case of transition from even to odd time step src and dest just + // have to be exchanged. + + srcIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, d); + dstIndex = P_INDEX_5(gDims, x + oX, y + oY, z + oZ, D3Q19_INV[d]); + + VerifyMsg(nBounceBackPdfs < kd->nBounceBackPdfs, "nBBPdfs %d < kd->nBBPdfs %d xyz: %d %d %d d: %d\n", nBounceBackPdfs, kd->nBounceBackPdfs, x, y, z, d); + + kd->BounceBackPdfsSrc[nBounceBackPdfs] = srcIndex; + kd->BounceBackPdfsDst[nBounceBackPdfs] = dstIndex; + + ++nBounceBackPdfs; + } + } + } + } + } + } + + + // Fill remaining KernelData structures + kd->GetNode = FNAME(GetNode); + kd->SetNode = FNAME(SetNode); + + kd->BoundaryConditionsGetPdf = FNAME(BcGetPdf); + kd->BoundaryConditionsSetPdf = FNAME(BcSetPdf); + + kd->Kernel = FNAME(D3Q19AaVecKernel); + + kd->DstPdfs = NULL; + kd->PdfsActive = kd->Pdfs[0]; + + return; +} + +void FNAME(D3Q19AaVecDeinit)(LatticeDesc * ld, KernelData ** kernelData) +{ + MemFree((void **) & ((*kernelData)->Pdfs[0])); + + MemFree((void **) & ((*kernelData)->BounceBackPdfsSrc)); + MemFree((void **) & ((*kernelData)->BounceBackPdfsDst)); + + MemFree((void **)kernelData); + + return; +} + diff --git a/src/BenchKernelD3Q19AaVecCommon.h b/src/BenchKernelD3Q19AaVecCommon.h new file mode 100644 index 0000000..d416d07 --- /dev/null +++ b/src/BenchKernelD3Q19AaVecCommon.h @@ -0,0 +1,93 @@ +// -------------------------------------------------------------------------- +// +// Copyright +// Markus Wittmann, 2016-2017 +// RRZE, University of Erlangen-Nuremberg, Germany +// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de +// +// Viktor Haag, 2016 +// LSS, University of Erlangen-Nuremberg, Germany +// +// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). +// +// LbmBenchKernels is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// LbmBenchKernels is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with LbmBenchKernels. If not, see . +// +// -------------------------------------------------------------------------- +#ifndef __BENCH_KERNEL_D3Q19_AA_VEC_COMMON_H__ +#define __BENCH_KERNEL_D3Q19_AA_VEC_COMMON_H__ + +// #include "BenchKernelD3Q19AaCommon.h" + +#include "Kernel.h" + +typedef struct KernelDataAa_ +{ + KernelData kd; + int Blk[3]; // Blocking in X, Y, and Z direction, value of 0 disables blocking. + int Iteration; // Current iteration number. +} KernelDataAa; + +// Macro for casting KernelData * to KernelDataAa *. +#define KDA(_x_) ((KernelDataAa *)(_x_)) + +// Build a function name extended by the propagation model name and the data layout. +// FNANEM(test) will be expanded to test_PushSoA if DATA_LAYOUT_NAME is defined +// as SoA and PROP_MODEL is defined as Push. +#define FNAME(functionName) JOIN(JOIN(functionName,_),JOIN(PROP_MODEL_NAME,DATA_LAYOUT_NAME)) + +#ifndef DATA_LAYOUT_SOA + #error Only DATA_LAYOUT_SOA is supported. +#endif + +#ifndef PROP_MODEL_AA + #error Only PROP_MODEL_AA is supported. +#endif + +// ----------------------------------------------------------------------- +// Index function for accesssing PDF array for different data layouts. + +#define P_INDEX_5 FNAME(PINDEX5) + +static inline int FNAME(PINDEX5)(int dims[3], int x, int y, int z, int d) +{ + Assert(dims[0] > 0); + Assert(dims[1] > 0); + Assert(dims[2] > 0); + + Assert(x >= 0); + Assert(x < dims[0]); + Assert(y >= 0); + Assert(y < dims[1]); + Assert(z >= 0); + Assert(z < dims[2]); + Assert(d >= 0); +#ifdef D3Q19 + Assert(d < N_D3Q19); +#else + #error Not implemented for this discretization. +#endif + +#ifdef DATA_LAYOUT_SOA + return d * dims[0] * dims[1] * dims[2] + x * dims[1] * dims[2] + y * dims[2] + z; +// #elif DATA_LAYOUT_AOS +// return x * dims[1] * dims[2] * N_D3Q19 + y * dims[2] * N_D3Q19 + z * N_D3Q19 + d; +#else + #error P_INDEX_5 function no implemented for chosen data layout. +#endif +} + + + +#endif // __BENCH_KERNEL_D3Q19_AA_VEC_COMMON_H__ + diff --git a/src/BenchKernelD3Q19Common.c b/src/BenchKernelD3Q19Common.c index e697bc8..8f75649 100644 --- a/src/BenchKernelD3Q19Common.c +++ b/src/BenchKernelD3Q19Common.c @@ -32,6 +32,9 @@ #include #include +#ifdef _OPENMP + #include +#endif // Forward definition. void FNAME(D3Q19Kernel)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd); @@ -224,8 +227,8 @@ static void ParseParameters(Parameters * params, int * blk) int tmp = strtol(params->KernelArgs[++i], NULL, 0); - if (tmp <= 0) { - printf("ERROR: blocking parameter must be > 0.\n"); + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); exit(1); } @@ -236,8 +239,8 @@ static void ParseParameters(Parameters * params, int * blk) int tmp = strtol(params->KernelArgs[++i], NULL, 0); - if (tmp <= 0) { - printf("ERROR: blocking parameter must be > 0.\n"); + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); exit(1); } @@ -248,8 +251,8 @@ static void ParseParameters(Parameters * params, int * blk) int tmp = strtol(params->KernelArgs[++i], NULL, 0); - if (tmp <= 0) { - printf("ERROR: blocking parameter must be > 0.\n"); + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); exit(1); } @@ -260,8 +263,8 @@ static void ParseParameters(Parameters * params, int * blk) int tmp = strtol(params->KernelArgs[++i], NULL, 0); - if (tmp <= 0) { - printf("ERROR: blocking parameter must be > 0.\n"); + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); exit(1); } @@ -285,7 +288,7 @@ static void ParseParameters(Parameters * params, int * blk) } -void FNAME(D3Q19BlkInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params) +static void D3Q19InternalInit(int blocking, LatticeDesc * ld, KernelData ** kernelData, Parameters * params) { KernelDataEx * kdex = NULL; MemAlloc((void **)&kdex, sizeof(KernelDataEx)); @@ -332,7 +335,17 @@ void FNAME(D3Q19BlkInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters PdfT * pdfs[2]; - ParseParameters(params, blk); + if (blocking) { + ParseParameters(params, blk); + } + else { + if (params->nKernelArgs != 0) { + printf("ERROR: unknown kernel parameter.\n"); + printf("This kernels accepts no parameters.\n"); + exit(1); + } + } + if (blk[0] == 0) blk[0] = gX; if (blk[1] == 0) blk[1] = gY; @@ -348,8 +361,13 @@ void FNAME(D3Q19BlkInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters nCells, 2 * sizeof(PdfT) * nCells * N_D3Q19, 2 * sizeof(PdfT) * nCells * N_D3Q19 / 1024.0 / 1024.0); - MemAlloc((void **)&pdfs[0], sizeof(PdfT) * nCells * N_D3Q19); - MemAlloc((void **)&pdfs[1], sizeof(PdfT) * nCells * N_D3Q19); + // MemAlloc((void **)&pdfs[0], sizeof(PdfT) * nCells * N_D3Q19); + // MemAlloc((void **)&pdfs[1], sizeof(PdfT) * nCells * N_D3Q19); + MemAllocAligned((void **)&pdfs[0], sizeof(PdfT) * nCells * N_D3Q19, 2 * 1024 * 1024); + MemAllocAligned((void **)&pdfs[1], sizeof(PdfT) * nCells * N_D3Q19, 2 * 1024 * 1024); + + printf("# pdfs[0] = %p\n", pdfs[0]); + printf("# pdfs[1] = %p\n", pdfs[1]); kd->Pdfs[0] = pdfs[0]; kd->Pdfs[1] = pdfs[1]; @@ -358,39 +376,82 @@ void FNAME(D3Q19BlkInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters // This depends on the chosen data layout. // The structure of the loop should resemble the same "execution layout" // as in the kernel! -#ifdef _OPENMP - #pragma omp parallel for collapse(3) -#endif - for (int bZ = 0; bZ < gZ; bZ += blk[2]) { - for (int bY = 0; bY < gY; bY += blk[1]) { - for (int bX = 0; bX < gX; bX += blk[0]) { + if (blocking) { + int nX = ld->Dims[0]; + int nY = ld->Dims[1]; + int nZ = ld->Dims[2]; - // Must do everything here, else it would break collapse. - int eZ = MIN(bZ + blk[2], gZ); - int eY = MIN(bY + blk[1], gY); - int eX = MIN(bX + blk[0], gX); + int nThreads = 1; - for (int z = bZ; z < eZ; ++z) { - for (int y = bY; y < eY; ++y) { - for (int x = bX; x < eX; ++x) { + #ifdef _OPENMP + nThreads = omp_get_max_threads(); + #endif - for (int d = 0; d < N_D3Q19; ++d) { - pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = 1.0; - pdfs[1][P_INDEX_5(gDims, x, y, z, d)] = 1.0; - } + #ifdef _OPENMP + #pragma omp parallel for default(none) \ + shared(pdfs, gDims, nX, nY, nZ, oX, oY, oZ, blk, nThreads) + #endif + for (int i = 0; i < nThreads; ++i) { + + int threadStartX = nX / nThreads * i; + int threadEndX = nX / nThreads * (i + 1); + if (nX % nThreads > 0) { + if (nX % nThreads > i) { + threadStartX += i; + threadEndX += i + 1; + } + else { + threadStartX += nX % nThreads; + threadEndX += nX % nThreads; + } + } + + for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) { + for (int bY = oY; bY < nY + oY; bY += blk[1]) { + for (int bZ = oZ; bZ < nZ + oZ; bZ += blk[2]) { + + int eZ = MIN(bZ + blk[2], nZ + oZ); + int eY = MIN(bY + blk[1], nY + oY); + int eX = MIN(bX + blk[0], threadEndX + oX); + + for (int x = bX; x < eX; ++x) { + for (int y = bY; y < eY; ++y) { + for (int z = bZ; z < eZ; ++z) { + for (int d = 0; d < N_D3Q19; ++d) { + pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = 1.0; + pdfs[1][P_INDEX_5(gDims, x, y, z, d)] = 1.0; + } + } + } } } } } } + + } + else { + #ifdef _OPENMP + #pragma omp parallel for collapse(3) + #endif + for (int x = 0; x < gX; ++x) { + for (int y = 0; y < gY; ++y) { + for (int z = 0; z < gZ; ++z) { + for (int d = 0; d < N_D3Q19; ++d) { + pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = 1.0; + pdfs[1][P_INDEX_5(gDims, x, y, z, d)] = 1.0; + } + } + } + } } // Initialize all PDFs to some standard value. - for (int z = 0; z < gZ; ++z) { + for (int x = 0; x < gX; ++x) { for (int y = 0; y < gY; ++y) { - for (int x = 0; x < gX; ++x) { + for (int z = 0; z < gZ; ++z) { for (int d = 0; d < N_D3Q19; ++d) { pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = 0.0; pdfs[1][P_INDEX_5(gDims, x, y, z, d)] = 0.0; @@ -417,9 +478,9 @@ void FNAME(D3Q19BlkInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters // TODO: apply blocking? - for (int z = 0; z < lZ; ++z) { + for (int x = 0; x < lX; ++x) { for (int y = 0; y < lY; ++y) { - for (int x = 0; x < lX; ++x) { + for (int z = 0; z < lZ; ++z) { if (ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) { for (int d = 0; d < N_D3Q19; ++d) { @@ -471,9 +532,9 @@ void FNAME(D3Q19BlkInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters int srcIndex; int dstIndex; - for (int z = 0; z < lZ; ++z) { + for (int x = 0; x < lX; ++x) { for (int y = 0; y < lY; ++y) { - for (int x = 0; x < lX; ++x) { + for (int z = 0; z < lZ; ++z) { if (ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) { for (int d = 0; d < N_D3Q19; ++d) { @@ -598,7 +659,12 @@ void FNAME(D3Q19BlkInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters kd->BoundaryConditionsGetPdf = FNAME(BcGetPdf); kd->BoundaryConditionsSetPdf = FNAME(BcSetPdf); - kd->Kernel = FNAME(D3Q19BlkKernel); + if (blocking) { + kd->Kernel = FNAME(D3Q19BlkKernel); + } + else { + kd->Kernel = FNAME(D3Q19Kernel); + } kd->DstPdfs = NULL; kd->PdfsActive = kd->Pdfs[0]; @@ -606,6 +672,13 @@ void FNAME(D3Q19BlkInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters return; } + +void FNAME(D3Q19BlkInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params) +{ + D3Q19InternalInit(1, ld, kernelData, params); +} + + void FNAME(D3Q19BlkDeinit)(LatticeDesc * ld, KernelData ** kernelData) { MemFree((void **) & ((*kernelData)->Pdfs[0])); @@ -624,27 +697,7 @@ void FNAME(D3Q19BlkDeinit)(LatticeDesc * ld, KernelData ** kernelData) void FNAME(D3Q19Init)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params) { - Parameters p; - - if (params->nKernelArgs != 0) { - printf("ERROR: unknown kernel parameter.\n"); - printf("This kernels accepts no parameters.\n"); - exit(1); - } - - // Setup an empty parameters structure. - p.nArgs = params->nArgs; - p.Args = params->Args; - p.nKernelArgs = 0; - p.KernelArgs = NULL; - - // Call init routine for blocking kernel and override the - // kernel function to be called later on. - FNAME(D3Q19BlkInit)(ld, kernelData, &p); - - (*kernelData)->Kernel = FNAME(D3Q19Kernel); - - return; + D3Q19InternalInit(0, ld, kernelData, params); } diff --git a/src/BenchKernelD3Q19Common.h b/src/BenchKernelD3Q19Common.h index cceddca..268bfed 100644 --- a/src/BenchKernelD3Q19Common.h +++ b/src/BenchKernelD3Q19Common.h @@ -74,9 +74,9 @@ static inline int FNAME(PINDEX5)(int dims[3], int x, int y, int z, int d) #endif #ifdef DATA_LAYOUT_SOA - return d * dims[0] * dims[1] * dims[2] + z * dims[0] * dims[1] + y * dims[0] + x; + return d * dims[0] * dims[1] * dims[2] + x * dims[1] * dims[2] + y * dims[2] + z; #elif DATA_LAYOUT_AOS - return z * dims[0] * dims[1] * N_D3Q19 + y * dims[0] * N_D3Q19 + x * N_D3Q19 + d; + return x * dims[1] * dims[2] * N_D3Q19 + y * dims[2] * N_D3Q19 + z * N_D3Q19 + d; #else #error P_INDEX_5 function no implemented for chosen data layout. #endif diff --git a/src/BenchKernelD3Q19List.c b/src/BenchKernelD3Q19List.c index 7922b7a..e01853a 100644 --- a/src/BenchKernelD3Q19List.c +++ b/src/BenchKernelD3Q19List.c @@ -28,6 +28,7 @@ #include "Memory.h" #include "Vtk.h" +#include "LikwidIf.h" #include #include @@ -97,6 +98,7 @@ void FNAME(D3Q19ListKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData for(int iter = 0; iter < maxIterations; ++iter) { + X_LIKWID_START("list-os"); #ifdef _OPENMP #pragma omp parallel for default(none) \ @@ -307,6 +309,8 @@ void FNAME(D3Q19ListKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData #undef I } // loop over fluid nodes + X_LIKWID_STOP("list-os"); + #ifdef VERIFICATION kd->PdfsActive = dst; KernelAddBodyForce(kd, ld, cd); diff --git a/src/BenchKernelD3Q19ListAaCommon.c b/src/BenchKernelD3Q19ListAaCommon.c index d2bcb52..c1e48a5 100644 --- a/src/BenchKernelD3Q19ListAaCommon.c +++ b/src/BenchKernelD3Q19ListAaCommon.c @@ -28,6 +28,7 @@ #include "Memory.h" #include "Vtk.h" +#include "Padding.h" #include @@ -222,15 +223,18 @@ static void ParameterUsage() { printf("Kernel parameters:\n"); printf(" [-blk ] [-blk-[xyz] ]\n"); - +#ifdef DATA_LAYOUT_SOA + printf(" [-pad auto|modulus_1+offset_1(,modulus_n+offset_n)*]\n"); +#endif return; } -static void ParseParameters(Parameters * params, int * blk) +static void ParseParameters(Parameters * params, int * blk, PadInfo ** padInfo) { Assert(blk != NULL); blk[0] = 0; blk[1] = 0; blk[2] = 0; + *padInfo = NULL; #define ARG_IS(param) (!strcmp(params->KernelArgs[i], param)) #define NEXT_ARG_PRESENT() \ @@ -248,8 +252,8 @@ static void ParseParameters(Parameters * params, int * blk) int tmp = strtol(params->KernelArgs[++i], NULL, 0); - if (tmp <= 0) { - printf("ERROR: blocking parameter must be > 0.\n"); + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); exit(1); } @@ -260,8 +264,8 @@ static void ParseParameters(Parameters * params, int * blk) int tmp = strtol(params->KernelArgs[++i], NULL, 0); - if (tmp <= 0) { - printf("ERROR: blocking parameter must be > 0.\n"); + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); exit(1); } @@ -272,8 +276,8 @@ static void ParseParameters(Parameters * params, int * blk) int tmp = strtol(params->KernelArgs[++i], NULL, 0); - if (tmp <= 0) { - printf("ERROR: blocking parameter must be > 0.\n"); + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); exit(1); } @@ -284,13 +288,20 @@ static void ParseParameters(Parameters * params, int * blk) int tmp = strtol(params->KernelArgs[++i], NULL, 0); - if (tmp <= 0) { - printf("ERROR: blocking parameter must be > 0.\n"); + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); exit(1); } blk[2] = tmp; } +#ifdef DATA_LAYOUT_SOA + else if (ARG_IS("-pad") || ARG_IS("--pad")) { + NEXT_ARG_PRESENT(); + + *padInfo = PadInfoFromStr(params->KernelArgs[++i]); + } +#endif else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) { ParameterUsage(); exit(1); @@ -346,6 +357,11 @@ void FNAME(D3Q19ListAaInit)(LatticeDesc * ld, KernelData ** kernelData, Paramete kdl->nFluid = -1; #endif + int blk[3] = { 0 }; + PadInfo * padInfo = NULL; + + ParseParameters(params, blk, &padInfo); + // Ajust the dimensions according to padding, if used. kd->Dims[0] = kd->GlobalDims[0] = ld->Dims[0]; kd->Dims[1] = kd->GlobalDims[1] = ld->Dims[1]; @@ -358,18 +374,21 @@ void FNAME(D3Q19ListAaInit)(LatticeDesc * ld, KernelData ** kernelData, Paramete int lZ = lDims[2]; int nTotalCells = lX * lY * lZ; - int nCells = ld->nFluid; // TODO: + padding + int nCells = ld->nFluid; int nFluid = ld->nFluid; +#ifdef DATA_LAYOUT_SOA + { + nCells = PadCellsAndReport(nCells, sizeof(PdfT), &padInfo); + PadInfoFree(padInfo); padInfo = NULL; + } +#endif + kdl->nCells = nCells; kdl->nFluid = nFluid; PdfT * pdfs[2]; - int blk[3] = { 0 }; - - ParseParameters(params, blk); - if (blk[0] == 0) blk[0] = lX; if (blk[1] == 0) blk[1] = lY; if (blk[2] == 0) blk[2] = lZ; @@ -443,18 +462,17 @@ void FNAME(D3Q19ListAaInit)(LatticeDesc * ld, KernelData ** kernelData, Paramete // Blocking is implemented via setup of the adjacency list. The kernel later will // walk through the lattice blocked automatically. - for (int bZ = 0; bZ < lZ; bZ += blk[2]) { - for (int bY = 0; bY < lY; bY += blk[1]) { for (int bX = 0; bX < lX; bX += blk[0]) { + for (int bY = 0; bY < lY; bY += blk[1]) { + for (int bZ = 0; bZ < lZ; bZ += blk[2]) { int eX = MIN(bX + blk[0], lX); int eY = MIN(bY + blk[1], lY); int eZ = MIN(bZ + blk[2], lZ); - - for (int z = bZ; z < eZ; ++z) { - for (int y = bY; y < eY; ++y) { for (int x = bX; x < eX; ++x) { + for (int y = bY; y < eY; ++y) { + for (int z = bZ; z < eZ; ++z) { latticeIndex = L_INDEX_4(lDims, x, y, z); @@ -492,7 +510,15 @@ void FNAME(D3Q19ListAaInit)(LatticeDesc * ld, KernelData ** kernelData, Paramete // Loop over all fluid nodes and compute the indices to the neighboring // PDFs for configure data layout (AoS/SoA). - // TODO: Parallelized loop to ensure correct NUMA placement. + #ifdef _OPENMP + #pragma omp parallel for + #endif + for (int index = 0; index < nFluid; ++index) { + for (int d = 0; d < N_D3Q19_IDX; ++d) { + adjList[index * N_D3Q19_IDX + d] = -1; + } + } + // #ifdef _OPENMP --> add line continuation // #pragma omp parallel for default(none) // shared(nFluid, nCells, coords, D3Q19_INV, D3Q19_X, D3Q19_Y, D3Q19_Z, diff --git a/src/BenchKernelD3Q19ListAaPv.c b/src/BenchKernelD3Q19ListAaPv.c index c522252..b1dee16 100644 --- a/src/BenchKernelD3Q19ListAaPv.c +++ b/src/BenchKernelD3Q19ListAaPv.c @@ -728,9 +728,9 @@ static void KernelOdd(LatticeDesc * ld, KernelData * kernelData, CaseData * cd) //voddPart = vomegaOdd * (VONE_HALF * (vpdf_TS - vpdf_BN) - vui * vw_2_x3); voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TS,vpdf_BN)),VMUL(vui,vw_2_x3))); //src[ADJ_LIST(D3Q19_TS)] =[UA] vpdf_TS - vevenPart - voddPart; - VSTU(ppdf_TN, VSUB(VSUB(vpdf_TS,vevenPart),voddPart)); + VSTU(ppdf_BN, VSUB(VSUB(vpdf_TS,vevenPart),voddPart)); //src[ADJ_LIST(D3Q19_BN)] =[UA] vpdf_BN - vevenPart + voddPart; - VSTU(ppdf_BN, VADD(VSUB(vpdf_BN,vevenPart),voddPart)); + VSTU(ppdf_TS, VADD(VSUB(vpdf_BN,vevenPart),voddPart)); //vui = vuy + vuz; vui = VADD(vuy,vuz); @@ -783,20 +783,20 @@ static void KernelOdd(LatticeDesc * ld, KernelData * kernelData, CaseData * cd) ui = uy; evenPart = omegaEven * (0.5 * (pdf_N + pdf_S) - ui * ui * w_1_nine_half - w_1_indep); oddPart = omegaOdd * (0.5 * (pdf_N - pdf_S) - ui * w_1_x3); - src[ADJ_LIST(D3Q19_N)] = pdf_N - evenPart - oddPart; - src[ADJ_LIST(D3Q19_S)] = pdf_S - evenPart + oddPart; + *ppdf_S = pdf_N - evenPart - oddPart; + *ppdf_N = pdf_S - evenPart + oddPart; ui = ux; evenPart = omegaEven * (0.5 * (pdf_E + pdf_W) - ui * ui * w_1_nine_half - w_1_indep); oddPart = omegaOdd * (0.5 * (pdf_E - pdf_W) - ui * w_1_x3); - src[ADJ_LIST(D3Q19_E)] = pdf_E - evenPart - oddPart; - src[ADJ_LIST(D3Q19_W)] = pdf_W - evenPart + oddPart; + *ppdf_W = pdf_E - evenPart - oddPart; + *ppdf_E = pdf_W - evenPart + oddPart; ui = uz; evenPart = omegaEven * (0.5 * (pdf_T + pdf_B) - ui * ui * w_1_nine_half - w_1_indep); oddPart = omegaOdd * (0.5 * (pdf_T - pdf_B) - ui * w_1_x3); - src[ADJ_LIST(D3Q19_T)] = pdf_T - evenPart - oddPart; - src[ADJ_LIST(D3Q19_B)] = pdf_B - evenPart + oddPart; + *ppdf_B = pdf_T - evenPart - oddPart; + *ppdf_T = pdf_B - evenPart + oddPart; // direction: w_2 w_2_indep = w_2 * dir_indep_trm; @@ -804,38 +804,38 @@ static void KernelOdd(LatticeDesc * ld, KernelData * kernelData, CaseData * cd) ui = -ux + uy; evenPart = omegaEven * (0.5 * (pdf_NW + pdf_SE) - ui * ui * w_2_nine_half - w_2_indep); oddPart = omegaOdd * (0.5 * (pdf_NW - pdf_SE) - ui * w_2_x3); - src[ADJ_LIST(D3Q19_NW)] = pdf_NW - evenPart - oddPart; - src[ADJ_LIST(D3Q19_SE)] = pdf_SE - evenPart + oddPart; + *ppdf_SE = pdf_NW - evenPart - oddPart; + *ppdf_NW = pdf_SE - evenPart + oddPart; ui = ux + uy; evenPart = omegaEven * (0.5 * (pdf_NE + pdf_SW) - ui * ui * w_2_nine_half - w_2_indep); oddPart = omegaOdd * (0.5 * (pdf_NE - pdf_SW) - ui * w_2_x3); - src[ADJ_LIST(D3Q19_NE)] = pdf_NE - evenPart - oddPart; - src[ADJ_LIST(D3Q19_SW)] = pdf_SW - evenPart + oddPart; + *ppdf_SW = pdf_NE - evenPart - oddPart; + *ppdf_NE = pdf_SW - evenPart + oddPart; ui = -ux + uz; evenPart = omegaEven * (0.5 * (pdf_TW + pdf_BE) - ui * ui * w_2_nine_half - w_2_indep); oddPart = omegaOdd * (0.5 * (pdf_TW - pdf_BE) - ui * w_2_x3); - src[ADJ_LIST(D3Q19_TW)] = pdf_TW - evenPart - oddPart; - src[ADJ_LIST(D3Q19_BE)] = pdf_BE - evenPart + oddPart; + *ppdf_BE = pdf_TW - evenPart - oddPart; + *ppdf_TW = pdf_BE - evenPart + oddPart; ui = ux + uz; evenPart = omegaEven * (0.5 * (pdf_TE + pdf_BW) - ui * ui * w_2_nine_half - w_2_indep); oddPart = omegaOdd * (0.5 * (pdf_TE - pdf_BW) - ui * w_2_x3); - src[ADJ_LIST(D3Q19_TE)] = pdf_TE - evenPart - oddPart; - src[ADJ_LIST(D3Q19_BW)] = pdf_BW - evenPart + oddPart; + *ppdf_BW = pdf_TE - evenPart - oddPart; + *ppdf_TE = pdf_BW - evenPart + oddPart; ui = -uy + uz; evenPart = omegaEven * (0.5 * (pdf_TS + pdf_BN) - ui * ui * w_2_nine_half - w_2_indep); oddPart = omegaOdd * (0.5 * (pdf_TS - pdf_BN) - ui * w_2_x3); - src[ADJ_LIST(D3Q19_TS)] = pdf_TS - evenPart - oddPart; - src[ADJ_LIST(D3Q19_BN)] = pdf_BN - evenPart + oddPart; + *ppdf_BN = pdf_TS - evenPart - oddPart; + *ppdf_TS = pdf_BN - evenPart + oddPart; ui = uy + uz; evenPart = omegaEven * (0.5 * (pdf_TN + pdf_BS) - ui * ui * w_2_nine_half - w_2_indep); oddPart = omegaOdd * (0.5 * (pdf_TN - pdf_BS) - ui * w_2_x3); - src[ADJ_LIST(D3Q19_TN)] = pdf_TN - evenPart - oddPart; - src[ADJ_LIST(D3Q19_BS)] = pdf_BS - evenPart + oddPart; + *ppdf_BS = pdf_TN - evenPart - oddPart; + *ppdf_TN = pdf_BS - evenPart + oddPart; pointerOffset = 1; } diff --git a/src/BenchKernelD3Q19ListAaPvCommon.c b/src/BenchKernelD3Q19ListAaPvCommon.c index 9cdf0e1..7b479cf 100644 --- a/src/BenchKernelD3Q19ListAaPvCommon.c +++ b/src/BenchKernelD3Q19ListAaPvCommon.c @@ -28,6 +28,7 @@ #include "Memory.h" #include "Vtk.h" +#include "Padding.h" #include @@ -225,15 +226,19 @@ static void ParameterUsage() { printf("Kernel parameters:\n"); printf(" [-blk ] [-blk-[xyz] ]\n"); +#ifdef DATA_LAYOUT_SOA + printf(" [-pad auto|modulus_1+offset_1(,modulus_n+offset_n)*]\n"); +#endif return; } -static void ParseParameters(Parameters * params, int * blk) +static void ParseParameters(Parameters * params, int * blk, PadInfo ** padInfo) { Assert(blk != NULL); blk[0] = 0; blk[1] = 0; blk[2] = 0; + *padInfo = NULL; #define ARG_IS(param) (!strcmp(params->KernelArgs[i], param)) #define NEXT_ARG_PRESENT() \ @@ -251,8 +256,8 @@ static void ParseParameters(Parameters * params, int * blk) int tmp = strtol(params->KernelArgs[++i], NULL, 0); - if (tmp <= 0) { - printf("ERROR: blocking parameter must be > 0.\n"); + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); exit(1); } @@ -263,8 +268,8 @@ static void ParseParameters(Parameters * params, int * blk) int tmp = strtol(params->KernelArgs[++i], NULL, 0); - if (tmp <= 0) { - printf("ERROR: blocking parameter must be > 0.\n"); + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); exit(1); } @@ -275,8 +280,8 @@ static void ParseParameters(Parameters * params, int * blk) int tmp = strtol(params->KernelArgs[++i], NULL, 0); - if (tmp <= 0) { - printf("ERROR: blocking parameter must be > 0.\n"); + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); exit(1); } @@ -287,13 +292,20 @@ static void ParseParameters(Parameters * params, int * blk) int tmp = strtol(params->KernelArgs[++i], NULL, 0); - if (tmp <= 0) { - printf("ERROR: blocking parameter must be > 0.\n"); + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); exit(1); } blk[2] = tmp; } +#ifdef DATA_LAYOUT_SOA + else if (ARG_IS("-pad") || ARG_IS("--pad")) { + NEXT_ARG_PRESENT(); + + *padInfo = PadInfoFromStr(params->KernelArgs[++i]); + } +#endif else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) { ParameterUsage(); exit(1); @@ -502,6 +514,11 @@ void FNAME(D3Q19ListAaPvInit)(LatticeDesc * ld, KernelData ** kernelData, Parame kdlr->nConsecThreadIndices = 0; #endif + int blk[3] = { 0 }; + PadInfo * padInfo = NULL; + + ParseParameters(params, blk, &padInfo); + // Ajust the dimensions according to padding, if used. kd->Dims[0] = kd->GlobalDims[0] = ld->Dims[0]; kd->Dims[1] = kd->GlobalDims[1] = ld->Dims[1]; @@ -514,17 +531,21 @@ void FNAME(D3Q19ListAaPvInit)(LatticeDesc * ld, KernelData ** kernelData, Parame int lZ = lDims[2]; int nTotalCells = lX * lY * lZ; - int nCells = ld->nFluid; // TODO: + padding + int nCells = ld->nFluid; int nFluid = ld->nFluid; +#ifdef DATA_LAYOUT_SOA + { + nCells = PadCellsAndReport(nCells, sizeof(PdfT), &padInfo); + PadInfoFree(padInfo); padInfo = NULL; + } +#endif + kdl->nCells = nCells; kdl->nFluid = nFluid; PdfT * pdfs[2]; - int blk[3] = { 0 }; - - ParseParameters(params, blk); if (blk[0] == 0) blk[0] = lX; if (blk[1] == 0) blk[1] = lY; @@ -607,18 +628,18 @@ void FNAME(D3Q19ListAaPvInit)(LatticeDesc * ld, KernelData ** kernelData, Parame // Blocking is implemented via setup of the adjacency list. The kernel later will // walk through the lattice blocked automatically. - for (int bZ = 0; bZ < lZ; bZ += blk[2]) { - for (int bY = 0; bY < lY; bY += blk[1]) { for (int bX = 0; bX < lX; bX += blk[0]) { + for (int bY = 0; bY < lY; bY += blk[1]) { + for (int bZ = 0; bZ < lZ; bZ += blk[2]) { int eX = MIN(bX + blk[0], lX); int eY = MIN(bY + blk[1], lY); int eZ = MIN(bZ + blk[2], lZ); - for (int z = bZ; z < eZ; ++z) { - for (int y = bY; y < eY; ++y) { for (int x = bX; x < eX; ++x) { + for (int y = bY; y < eY; ++y) { + for (int z = bZ; z < eZ; ++z) { latticeIndex = L_INDEX_4(lDims, x, y, z); @@ -660,7 +681,15 @@ void FNAME(D3Q19ListAaPvInit)(LatticeDesc * ld, KernelData ** kernelData, Parame // Loop over all fluid nodes and compute the indices to the neighboring // PDFs for configured data layout (AoS/SoA). - // TODO: Parallelized loop to ensure correct NUMA placement. + #ifdef _OPENMP + #pragma omp parallel for + #endif + for (int index = 0; index < nFluid; ++index) { + for (int d = 0; d < N_D3Q19_IDX; ++d) { + adjList[index * N_D3Q19_IDX + d] = -1; + } + } + // #ifdef _OPENMP --> add line continuation // #pragma omp parallel for default(none) // shared(nFluid, nCells, coords, D3Q19_INV, D3Q19_X, D3Q19_Y, D3Q19_Z, diff --git a/src/BenchKernelD3Q19ListAaRiaCommon.c b/src/BenchKernelD3Q19ListAaRiaCommon.c index 7faf37b..c6ba13a 100644 --- a/src/BenchKernelD3Q19ListAaRiaCommon.c +++ b/src/BenchKernelD3Q19ListAaRiaCommon.c @@ -28,6 +28,7 @@ #include "Memory.h" #include "Vtk.h" +#include "Padding.h" #include @@ -225,15 +226,19 @@ static void ParameterUsage() { printf("Kernel parameters:\n"); printf(" [-blk ] [-blk-[xyz] ]\n"); +#ifdef DATA_LAYOUT_SOA + printf(" [-pad auto|modulus_1+offset_1(,modulus_n+offset_n)*]\n"); +#endif return; } -static void ParseParameters(Parameters * params, int * blk) +static void ParseParameters(Parameters * params, int * blk, PadInfo ** padInfo) { Assert(blk != NULL); blk[0] = 0; blk[1] = 0; blk[2] = 0; + *padInfo = NULL; #define ARG_IS(param) (!strcmp(params->KernelArgs[i], param)) #define NEXT_ARG_PRESENT() \ @@ -251,8 +256,8 @@ static void ParseParameters(Parameters * params, int * blk) int tmp = strtol(params->KernelArgs[++i], NULL, 0); - if (tmp <= 0) { - printf("ERROR: blocking parameter must be > 0.\n"); + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); exit(1); } @@ -263,8 +268,8 @@ static void ParseParameters(Parameters * params, int * blk) int tmp = strtol(params->KernelArgs[++i], NULL, 0); - if (tmp <= 0) { - printf("ERROR: blocking parameter must be > 0.\n"); + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); exit(1); } @@ -275,8 +280,8 @@ static void ParseParameters(Parameters * params, int * blk) int tmp = strtol(params->KernelArgs[++i], NULL, 0); - if (tmp <= 0) { - printf("ERROR: blocking parameter must be > 0.\n"); + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); exit(1); } @@ -287,13 +292,20 @@ static void ParseParameters(Parameters * params, int * blk) int tmp = strtol(params->KernelArgs[++i], NULL, 0); - if (tmp <= 0) { - printf("ERROR: blocking parameter must be > 0.\n"); + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); exit(1); } blk[2] = tmp; } +#ifdef DATA_LAYOUT_SOA + else if (ARG_IS("-pad") || ARG_IS("--pad")) { + NEXT_ARG_PRESENT(); + + *padInfo = PadInfoFromStr(params->KernelArgs[++i]); + } +#endif else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) { ParameterUsage(); exit(1); @@ -485,6 +497,11 @@ void FNAME(D3Q19ListAaRiaInit)(LatticeDesc * ld, KernelData ** kernelData, Param kdlr->nConsecThreadIndices = 0; #endif + int blk[3] = { 0 }; + PadInfo * padInfo = NULL; + + ParseParameters(params, blk, &padInfo); + // Ajust the dimensions according to padding, if used. kd->Dims[0] = kd->GlobalDims[0] = ld->Dims[0]; kd->Dims[1] = kd->GlobalDims[1] = ld->Dims[1]; @@ -500,14 +517,18 @@ void FNAME(D3Q19ListAaRiaInit)(LatticeDesc * ld, KernelData ** kernelData, Param int nCells = ld->nFluid; // TODO: + padding int nFluid = ld->nFluid; +#ifdef DATA_LAYOUT_SOA + { + nCells = PadCellsAndReport(nCells, sizeof(PdfT), &padInfo); + PadInfoFree(padInfo); padInfo = NULL; + } +#endif + kdl->nCells = nCells; kdl->nFluid = nFluid; PdfT * pdfs[2]; - int blk[3] = { 0 }; - - ParseParameters(params, blk); if (blk[0] == 0) blk[0] = lX; if (blk[1] == 0) blk[1] = lY; @@ -590,18 +611,17 @@ void FNAME(D3Q19ListAaRiaInit)(LatticeDesc * ld, KernelData ** kernelData, Param // Blocking is implemented via setup of the adjacency list. The kernel later will // walk through the lattice blocked automatically. - for (int bZ = 0; bZ < lZ; bZ += blk[2]) { - for (int bY = 0; bY < lY; bY += blk[1]) { for (int bX = 0; bX < lX; bX += blk[0]) { + for (int bY = 0; bY < lY; bY += blk[1]) { + for (int bZ = 0; bZ < lZ; bZ += blk[2]) { int eX = MIN(bX + blk[0], lX); int eY = MIN(bY + blk[1], lY); int eZ = MIN(bZ + blk[2], lZ); - - for (int z = bZ; z < eZ; ++z) { - for (int y = bY; y < eY; ++y) { for (int x = bX; x < eX; ++x) { + for (int y = bY; y < eY; ++y) { + for (int z = bZ; z < eZ; ++z) { latticeIndex = L_INDEX_4(lDims, x, y, z); @@ -643,7 +663,15 @@ void FNAME(D3Q19ListAaRiaInit)(LatticeDesc * ld, KernelData ** kernelData, Param // Loop over all fluid nodes and compute the indices to the neighboring // PDFs for configured data layout (AoS/SoA). - // TODO: Parallelized loop to ensure correct NUMA placement. + #ifdef _OPENMP + #pragma omp parallel for + #endif + for (int index = 0; index < nFluid; ++index) { + for (int d = 0; d < N_D3Q19_IDX; ++d) { + adjList[index * N_D3Q19_IDX + d] = -1; + } + } + // #ifdef _OPENMP --> add line continuation // #pragma omp parallel for default(none) // shared(nFluid, nCells, coords, D3Q19_INV, D3Q19_X, D3Q19_Y, D3Q19_Z, diff --git a/src/BenchKernelD3Q19ListCommon.c b/src/BenchKernelD3Q19ListCommon.c index 4f97bae..f66e379 100644 --- a/src/BenchKernelD3Q19ListCommon.c +++ b/src/BenchKernelD3Q19ListCommon.c @@ -28,6 +28,7 @@ #include "Memory.h" #include "Vtk.h" +#include "Padding.h" #include @@ -200,15 +201,19 @@ static void ParameterUsage() { printf("Kernel parameters:\n"); printf(" [-blk ] [-blk-[xyz] ]\n"); +#ifdef DATA_LAYOUT_SOA + printf(" [-pad auto|modulus_1+offset_1(,modulus_n+offset_n)*]\n"); +#endif return; } -static void ParseParameters(Parameters * params, int * blk) +static void ParseParameters(Parameters * params, int * blk, PadInfo ** padInfo) { Assert(blk != NULL); blk[0] = 0; blk[1] = 0; blk[2] = 0; + *padInfo = NULL; #define ARG_IS(param) (!strcmp(params->KernelArgs[i], param)) #define NEXT_ARG_PRESENT() \ @@ -226,8 +231,8 @@ static void ParseParameters(Parameters * params, int * blk) int tmp = strtol(params->KernelArgs[++i], NULL, 0); - if (tmp <= 0) { - printf("ERROR: blocking parameter must be > 0.\n"); + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); exit(1); } @@ -238,8 +243,8 @@ static void ParseParameters(Parameters * params, int * blk) int tmp = strtol(params->KernelArgs[++i], NULL, 0); - if (tmp <= 0) { - printf("ERROR: blocking parameter must be > 0.\n"); + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); exit(1); } @@ -250,8 +255,8 @@ static void ParseParameters(Parameters * params, int * blk) int tmp = strtol(params->KernelArgs[++i], NULL, 0); - if (tmp <= 0) { - printf("ERROR: blocking parameter must be > 0.\n"); + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); exit(1); } @@ -262,8 +267,8 @@ static void ParseParameters(Parameters * params, int * blk) int tmp = strtol(params->KernelArgs[++i], NULL, 0); - if (tmp <= 0) { - printf("ERROR: blocking parameter must be > 0.\n"); + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); exit(1); } @@ -273,6 +278,13 @@ static void ParseParameters(Parameters * params, int * blk) ParameterUsage(); exit(1); } +#ifdef DATA_LAYOUT_SOA + else if (ARG_IS("-pad") || ARG_IS("--pad")) { + NEXT_ARG_PRESENT(); + + *padInfo = PadInfoFromStr(params->KernelArgs[++i]); + } +#endif else { printf("ERROR: unknown kernel parameter.\n"); ParameterUsage(); @@ -324,6 +336,12 @@ void FNAME(D3Q19ListInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters kdl->nFluid = -1; #endif + int blk[3] = { 0 }; + PadInfo * padInfo = NULL; + + ParseParameters(params, blk, &padInfo); + + // Ajust the dimensions according to padding, if used. kd->Dims[0] = kd->GlobalDims[0] = ld->Dims[0]; kd->Dims[1] = kd->GlobalDims[1] = ld->Dims[1]; @@ -336,18 +354,21 @@ void FNAME(D3Q19ListInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters int lZ = lDims[2]; int nTotalCells = lX * lY * lZ; - int nCells = ld->nFluid; // TODO: + padding + int nCells = ld->nFluid; int nFluid = ld->nFluid; +#ifdef DATA_LAYOUT_SOA + { + nCells = PadCellsAndReport(nCells, sizeof(PdfT), &padInfo); + PadInfoFree(padInfo); padInfo = NULL; + } +#endif + kdl->nCells = nCells; kdl->nFluid = nFluid; PdfT * pdfs[2]; - int blk[3] = { 0 }; - - ParseParameters(params, blk); - if (blk[0] == 0) blk[0] = lX; if (blk[1] == 0) blk[1] = lY; if (blk[2] == 0) blk[2] = lZ; @@ -425,18 +446,17 @@ void FNAME(D3Q19ListInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters // Blocking is implemented via setup of the adjacency list. The kernel later will // walk through the lattice blocked automatically. - for (int bZ = 0; bZ < lZ; bZ += blk[2]) { - for (int bY = 0; bY < lY; bY += blk[1]) { for (int bX = 0; bX < lX; bX += blk[0]) { + for (int bY = 0; bY < lY; bY += blk[1]) { + for (int bZ = 0; bZ < lZ; bZ += blk[2]) { int eX = MIN(bX + blk[0], lX); int eY = MIN(bY + blk[1], lY); int eZ = MIN(bZ + blk[2], lZ); - - for (int z = bZ; z < eZ; ++z) { - for (int y = bY; y < eY; ++y) { for (int x = bX; x < eX; ++x) { + for (int y = bY; y < eY; ++y) { + for (int z = bZ; z < eZ; ++z) { latticeIndex = L_INDEX_4(lDims, x, y, z); @@ -474,7 +494,15 @@ void FNAME(D3Q19ListInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters // Loop over all fluid nodes and compute the indices to the neighboring // PDFs for configured data layout (AoS/SoA). - // TODO: Parallelized loop to ensure correct NUMA placement. + #ifdef _OPENMP + #pragma omp parallel for + #endif + for (int index = 0; index < nFluid; ++index) { + for (int d = 0; d < N_D3Q19_IDX; ++d) { + adjList[index * N_D3Q19_IDX + d] = -1; + } + } + // #ifdef _OPENMP --> add line continuation // #pragma omp parallel for default(none) // shared(nFluid, nCells, coords, D3Q19_INV, D3Q19_X, D3Q19_Y, D3Q19_Z, diff --git a/src/BenchKernelD3Q19ListPullSplitNt.c b/src/BenchKernelD3Q19ListPullSplitNt.c index dfab54a..0132dc9 100644 --- a/src/BenchKernelD3Q19ListPullSplitNt.c +++ b/src/BenchKernelD3Q19ListPullSplitNt.c @@ -29,6 +29,7 @@ #include "Memory.h" #include "Vtk.h" #include "Vector.h" +#include "LikwidIf.h" #include #include @@ -118,6 +119,8 @@ void FNAME(KernelPullSplitNt1S)(LatticeDesc * ld, KernelData * kernelData, CaseD KernelStatistics(kd, ld, cd, 0); #endif + + X_LIKWID_START("list-pull-split-nt-1s"); #ifdef _OPENMP #pragma omp parallel default(none) \ shared(nFluid, nCells, kd, kdl, adjList, src, dst, \ @@ -184,6 +187,7 @@ void FNAME(KernelPullSplitNt1S)(LatticeDesc * ld, KernelData * kernelData, CaseD for(int iter = 0; iter < maxIterations; ++iter) { + #if 1 #define INDEX_START blIndexStart #define INDEX_STOP blIndexVec @@ -201,6 +205,8 @@ void FNAME(KernelPullSplitNt1S)(LatticeDesc * ld, KernelData * kernelData, CaseD #define INDEX_STOP blIndexStop #include "BenchKernelD3Q19ListPullSplitNt1SScalar.h" #endif + + #pragma omp barrier #pragma omp single @@ -235,6 +241,9 @@ void FNAME(KernelPullSplitNt1S)(LatticeDesc * ld, KernelData * kernelData, CaseD MemFree((void **)&tmpArray); } + + X_LIKWID_STOP("list-pull-split-nt-1s"); + #ifdef VTK_OUTPUT if (cd->VtkOutput) { kd->PdfsActive = src; @@ -321,6 +330,10 @@ void FNAME(KernelPullSplitNt2S)(LatticeDesc * ld, KernelData * kernelData, CaseD KernelStatistics(kd, ld, cd, 0); #endif + + X_LIKWID_START("list-pull-split-nt-2s"); + + #ifdef _OPENMP #pragma omp parallel default(none) \ shared(nFluid, nCells, kd, kdl, adjList, src, dst, \ @@ -406,6 +419,7 @@ void FNAME(KernelPullSplitNt2S)(LatticeDesc * ld, KernelData * kernelData, CaseD #endif #pragma omp barrier + #pragma omp single { #ifdef VERIFICATION @@ -438,6 +452,8 @@ void FNAME(KernelPullSplitNt2S)(LatticeDesc * ld, KernelData * kernelData, CaseD MemFree((void **)&tmpArray); } + X_LIKWID_STOP("list-pull-split-nt-2s"); + #ifdef VTK_OUTPUT if (cd->VtkOutput) { kd->PdfsActive = src; diff --git a/src/BenchKernelD3Q19ListPullSplitNtCommon.c b/src/BenchKernelD3Q19ListPullSplitNtCommon.c index b5df14b..8c8da88 100644 --- a/src/BenchKernelD3Q19ListPullSplitNtCommon.c +++ b/src/BenchKernelD3Q19ListPullSplitNtCommon.c @@ -29,6 +29,8 @@ #include "Memory.h" #include "Vtk.h" #include "Vector.h" +#include "Padding.h" + #include @@ -183,16 +185,18 @@ static void ParameterUsage() { printf("Kernel parameters:\n"); printf(" [-blk ] [-blk-[xyz] ] [-n-tmp-array ]\n"); + printf(" [-pad auto|modulus_1+offset_1(,modulus_n+offset_n)*]\n"); return; } -static void ParseParameters(Parameters * params, int * blk, int * nTmpArray) +static void ParseParameters(Parameters * params, int * blk, int * nTmpArray, PadInfo ** padInfo) { Assert(blk != NULL); blk[0] = 0; blk[1] = 0; blk[2] = 0; *nTmpArray = 152; + *padInfo = NULL; #define ARG_IS(param) (!strcmp(params->KernelArgs[i], param)) #define NEXT_ARG_PRESENT() \ @@ -210,8 +214,8 @@ static void ParseParameters(Parameters * params, int * blk, int * nTmpArray) int tmp = strtol(params->KernelArgs[++i], NULL, 0); - if (tmp <= 0) { - printf("ERROR: blocking parameter must be > 0.\n"); + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); exit(1); } @@ -222,8 +226,8 @@ static void ParseParameters(Parameters * params, int * blk, int * nTmpArray) int tmp = strtol(params->KernelArgs[++i], NULL, 0); - if (tmp <= 0) { - printf("ERROR: blocking parameter must be > 0.\n"); + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); exit(1); } @@ -234,8 +238,8 @@ static void ParseParameters(Parameters * params, int * blk, int * nTmpArray) int tmp = strtol(params->KernelArgs[++i], NULL, 0); - if (tmp <= 0) { - printf("ERROR: blocking parameter must be > 0.\n"); + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); exit(1); } @@ -246,8 +250,8 @@ static void ParseParameters(Parameters * params, int * blk, int * nTmpArray) int tmp = strtol(params->KernelArgs[++i], NULL, 0); - if (tmp <= 0) { - printf("ERROR: blocking parameter must be > 0.\n"); + if (tmp < 0) { + printf("ERROR: blocking parameter must be >= 0.\n"); exit(1); } @@ -270,6 +274,11 @@ static void ParseParameters(Parameters * params, int * blk, int * nTmpArray) *nTmpArray = tmp; } + else if (ARG_IS("-pad") || ARG_IS("--pad")) { + NEXT_ARG_PRESENT(); + + *padInfo = PadInfoFromStr(params->KernelArgs[++i]); + } else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) { ParameterUsage(); exit(1); @@ -456,6 +465,11 @@ static void FNAME(Init)(LatticeDesc * ld, KernelData ** kernelData, Parameters * kdlr->nConsecThreadIndices = 0; #endif + int blk[3] = { 0 }; + PadInfo * padInfo = NULL; + + ParseParameters(params, blk, &kdlr->nTmpArray, &padInfo); + // Ajust the dimensions according to padding, if used. kd->Dims[0] = kd->GlobalDims[0] = ld->Dims[0]; kd->Dims[1] = kd->GlobalDims[1] = ld->Dims[1]; @@ -471,8 +485,12 @@ static void FNAME(Init)(LatticeDesc * ld, KernelData ** kernelData, Parameters * int nCells = ld->nFluid; int nFluid = ld->nFluid; + { + nCells = PadCellsAndReport(nCells, sizeof(PdfT), &padInfo); + PadInfoFree(padInfo); padInfo = NULL; + } + // We padd each stream of a PDF array for a complete cache line. - // TODO: padding for L1/L2 and TLB. nCells = nCells + (8 - nCells % 8); Assert(nCells % VSIZE == 0); @@ -482,10 +500,6 @@ static void FNAME(Init)(LatticeDesc * ld, KernelData ** kernelData, Parameters * PdfT * pdfs[2]; - int blk[3] = { 0 }; - - ParseParameters(params, blk, &kdlr->nTmpArray); - if (blk[0] == 0) blk[0] = lX; if (blk[1] == 0) blk[1] = lY; if (blk[2] == 0) blk[2] = lZ; @@ -572,18 +586,18 @@ static void FNAME(Init)(LatticeDesc * ld, KernelData ** kernelData, Parameters * // Blocking is implemented via setup of the adjacency list. The kernel later will // walk through the lattice blocked automatically. - for (int bZ = 0; bZ < lZ; bZ += blk[2]) { - for (int bY = 0; bY < lY; bY += blk[1]) { for (int bX = 0; bX < lX; bX += blk[0]) { + for (int bY = 0; bY < lY; bY += blk[1]) { + for (int bZ = 0; bZ < lZ; bZ += blk[2]) { int eX = MIN(bX + blk[0], lX); int eY = MIN(bY + blk[1], lY); int eZ = MIN(bZ + blk[2], lZ); - for (int z = bZ; z < eZ; ++z) { - for (int y = bY; y < eY; ++y) { for (int x = bX; x < eX; ++x) { + for (int y = bY; y < eY; ++y) { + for (int z = bZ; z < eZ; ++z) { latticeIndex = L_INDEX_4(lDims, x, y, z); @@ -627,6 +641,15 @@ static void FNAME(Init)(LatticeDesc * ld, KernelData ** kernelData, Parameters * // Loop over all fluid nodes and compute the indices to the neighboring // PDFs for configured data layout (AoS/SoA). // Parallelized loop to ensure correct NUMA placement. + #ifdef _OPENMP + #pragma omp parallel for + #endif + for (int index = 0; index < nFluid; ++index) { + for (int d = 0; d < N_D3Q19_IDX; ++d) { + adjList[index * N_D3Q19_IDX + d] = -1; + } + } + // #ifdef _OPENMP --> add line continuation // #pragma omp parallel for default(none) // shared(nFluid, nCells, coords, D3Q19_INV, D3Q19_X, D3Q19_Y, D3Q19_Z, diff --git a/src/Geometry.c b/src/Geometry.c index 31c985a..0450c56 100644 --- a/src/Geometry.c +++ b/src/Geometry.c @@ -72,6 +72,9 @@ void GeoCreateByStr(const char * geometryType, int dims[3], int periodic[3], Lat typeDetails = &tmp; } + else if (strncasecmp("fluid", geometryType, 5) == 0) { + type = GEO_TYPE_FLUID; + } else { printf("ERROR: unknown geometry specified.\n"); Verify(0); @@ -99,7 +102,7 @@ void GeoCreateByType(GEO_TYPES type, void * typeDetails, int dims[3], int period Assert(type >= GEO_TYPE_MIN); Assert(type <= GEO_TYPE_MAX); - const char * geoTypeStr[] = { "box", "channel", "pipe", "blocks" }; + const char * geoTypeStr[] = { "box", "channel", "pipe", "blocks", "fluid" }; printf("# geometry: %d x %d x %d nodes, type %d %s\n", dims[0], dims[1], dims[2], type, geoTypeStr[type]); @@ -127,6 +130,10 @@ void GeoCreateByType(GEO_TYPES type, void * typeDetails, int dims[3], int period if (type == GEO_TYPE_CHANNEL || type == GEO_TYPE_BLOCKS || type == GEO_TYPE_PIPE) { periodic[0] = 1; } + else if (type == GEO_TYPE_FLUID) { + // Actually this geometry is non-periodic, but we need fluid at the boundary. + periodic[0] = 1; periodic[1] = 1; periodic[2] = 1; + } // Walls or periodic on first and last x plane. for (int z = 0; z < dims[2]; ++z) { @@ -168,7 +175,7 @@ void GeoCreateByType(GEO_TYPES type, void * typeDetails, int dims[3], int period } if (type == GEO_TYPE_CHANNEL) { - periodic[0] = 1; + // Nothing todo here. } else if (type == GEO_TYPE_PIPE) { #define SQR(a) ((a)*(a)) @@ -192,34 +199,38 @@ void GeoCreateByType(GEO_TYPES type, void * typeDetails, int dims[3], int period int blockSize = *((int *)typeDetails); +#if 0 if (blockSize == 0) { blockSize = 8; } +#endif + if (blockSize > 0) { - int dimMin = dims[0]; + int dimMin = dims[0]; - if (dims[1] < dimMin) dimMin = dims[1]; - if (dims[2] < dimMin) dimMin = dims[2]; + if (dims[1] < dimMin) dimMin = dims[1]; + if (dims[2] < dimMin) dimMin = dims[2]; - if (blockSize < 0 || blockSize > dimMin / 2) { - printf("ERROR: block size for geometry must be > 0 and smaller than half of the smalest dimension.\n"); - // TODO: find a better solution for handling errors in here. - Verify(0); - } + if (blockSize < 0 || blockSize > dimMin / 2) { + printf("ERROR: block size for geometry must be > 0 and smaller than half of the smalest dimension.\n"); + // TODO: find a better solution for handling errors in here. + Verify(0); + } - // Number of blocks in x, y, and z direction. - int nbx = blockSize, nby = blockSize, nbz = blockSize; + // Number of blocks in x, y, and z direction. + int nbx = blockSize, nby = blockSize, nbz = blockSize; - for (int z = 0; z < dims[2]; ++z) { - if ((z % (2 * nbz)) < nbz) continue; + for (int z = 0; z < dims[2]; ++z) { + if ((z % (2 * nbz)) < nbz) continue; - for (int y = 0; y < dims[1]; ++y) { - if ((y % (2 * nby)) < nby) continue; + for (int y = 0; y < dims[1]; ++y) { + if ((y % (2 * nby)) < nby) continue; - for (int x = 0; x < dims[0]; ++x) { + for (int x = 0; x < dims[0]; ++x) { - if ((x % (2 * nbx)) >= nbx) { - lattice[L_INDEX_4(dims, x, y, z)] = LAT_CELL_OBSTACLE; + if ((x % (2 * nbx)) >= nbx) { + lattice[L_INDEX_4(dims, x, y, z)] = LAT_CELL_OBSTACLE; + } } } } diff --git a/src/Geometry.h b/src/Geometry.h index 11d72ff..a9bf260 100644 --- a/src/Geometry.h +++ b/src/Geometry.h @@ -37,7 +37,8 @@ typedef enum GEO_TYPES_ { GEO_TYPE_PIPE = 2, GEO_TYPE_BLOCKS = 3, // Expects a pointer to an integer, holding the // value of the block size as type detail. - GEO_TYPE_MAX = 3 + GEO_TYPE_FLUID = 4, + GEO_TYPE_MAX = 4 } GEO_TYPES; diff --git a/src/Kernel.h b/src/Kernel.h index eddae64..99e126b 100644 --- a/src/Kernel.h +++ b/src/Kernel.h @@ -56,7 +56,7 @@ #endif #ifdef PROP_MODEL_AA - #define PROP_MODEL_NAME AA + #define PROP_MODEL_NAME Aa #endif diff --git a/src/KernelFunctions.h b/src/KernelFunctions.h index 2ee063a..557c653 100644 --- a/src/KernelFunctions.h +++ b/src/KernelFunctions.h @@ -28,6 +28,8 @@ #define __KERNEL_FUNCTIONS_H__ #include "BenchKernelD3Q19.h" +#include "BenchKernelD3Q19Aa.h" +#include "BenchKernelD3Q19AaVec.h" #include "BenchKernelD3Q19List.h" #include "BenchKernelD3Q19ListAa.h" #include "BenchKernelD3Q19ListAaRia.h" @@ -133,6 +135,22 @@ KernelFunctions g_kernels[] = .Init = D3Q19BlkInit_PullAoS, .Deinit = D3Q19BlkDeinit_PullAoS }, + { + .Name = "aa-aos", + .Init = D3Q19AaInit_AaAoS, + .Deinit = D3Q19AaDeinit_AaAoS + }, + { + .Name = "aa-soa", + .Init = D3Q19AaInit_AaSoA, + .Deinit = D3Q19AaDeinit_AaSoA + }, + { + .Name = "aa-vec-soa", + .Init = D3Q19AaVecInit_AaSoA, + .Deinit = D3Q19AaVecDeinit_AaSoA + } + }; #endif // __KERNEL_FUNCTIONS_H__ diff --git a/src/Lattice.c b/src/Lattice.c new file mode 100644 index 0000000..bc218eb --- /dev/null +++ b/src/Lattice.c @@ -0,0 +1,59 @@ +// -------------------------------------------------------------------------- +// +// Copyright +// Markus Wittmann, 2016-2017 +// RRZE, University of Erlangen-Nuremberg, Germany +// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de +// +// Viktor Haag, 2016 +// LSS, University of Erlangen-Nuremberg, Germany +// +// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). +// +// LbmBenchKernels is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// LbmBenchKernels is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with LbmBenchKernels. If not, see . +// +// -------------------------------------------------------------------------- +#include "Lattice.h" + +// Dumps the layers [zStart, zStop] of lattice as ASCII. +// Specify zStart = -1 and zStop = -1 as begin/end of lattice. + +void LatDumpAscii(LatticeDesc * ld, int zStart, int zStop) +{ + Assert(ld != NULL); + + const char strLatCellType[] = "X.IxO"; // X = Obstacle, . = Fluid, I = inlet, O = outlet + + int localZStart = zStart; + int localZStop = zStop; + + int * dims = ld->Dims; + LatticeT * lattice = ld->Lattice; + + if (localZStart == -1) localZStart = 0; + if (localZStop == -1) localZStop = dims[2] - 1; + + for (int z = localZStop; z >= localZStart; --z) { + printf("plane % 2d\n", z); + + for (int y = dims[1] - 1; y >= 0; --y) { + printf(" %2d ", y); + for (int x = 0; x < dims[0]; ++x) { + printf("%c", strLatCellType[lattice[L_INDEX_4(dims, x, y, z)]]); + } + printf("\n"); + } + } +} + diff --git a/src/Lattice.h b/src/Lattice.h index 99b3fc3..c23b70f 100644 --- a/src/Lattice.h +++ b/src/Lattice.h @@ -74,5 +74,6 @@ static inline int L_INDEX_4(int dims[3], int x, int y, int z) return z * dims[0] * dims[1] + y * dims[0] + x; } +void LatDumpAscii(LatticeDesc * ld, int zStart, int zStop); #endif // __LATTICE_H__ diff --git a/src/Main.c b/src/Main.c index 1525ad5..65a4035 100644 --- a/src/Main.c +++ b/src/Main.c @@ -468,7 +468,7 @@ int main(int argc, char * argv[]) int threadId = omp_get_thread_num(); int err; - err = PinCurrentThreadByCpuList(pinString, 0, 0, threadId); + err = PinCurrentThreadByCpuList(pinString, threadId); if (err) { printf("ERROR [thread %d]: pinning failed.\n", threadId); @@ -557,9 +557,9 @@ int main(int argc, char * argv[]) perf, nThreads, duration, cd.MaxIterations, ld.nFluid / 1e6, geometryType, kernelToUse, #ifdef VERIFICATION - "# VERIFICATION" + "VERIFICATION" #else - "# benchmark" + "B" #endif ); diff --git a/src/Makefile b/src/Makefile index a9889a5..99ca902 100644 --- a/src/Makefile +++ b/src/Makefile @@ -29,7 +29,9 @@ # CONFIG ?= linux-gcc CONFIG ?= linux-intel -BUILD ?= debug +BUILD ?= release + +BENCHMARK ?= on # If we compile for BENCHMARK all verifcation and statistics are disabled, # if not specified otherwise. @@ -37,13 +39,12 @@ ifeq (on,$(BENCHMARK)) VERIFICATION ?= off VTK_OUTPUT ?= off STATISTICS ?= off +else + VERIFICATION ?= on + STATISTICS ?= on + VTK_OUTPUT ?= on endif -VERIFICATION ?= on - -STATISTICS ?= on - -VTK_OUTPUT ?= on COLOR ?= on @@ -95,7 +96,7 @@ DEP_DIR = obj/$(CONFIG)-$(BUILD)$(TAG)-dep # Sources to consider. SOURCES_C = Main.c Memory.c Geometry.c Kernel.c \ - Vtk.c Pinning.c + Vtk.c Pinning.c Padding.c Lattice.c # ------------------------------------------------------------------------ # NO CHANGE BELOW SHOULD BE NEEDED @@ -127,7 +128,13 @@ OBJ_C = $(foreach SOURCE,$(SOURCES_C),$(OBJECT_DIR)/$(SOURCE:%.c=%.o)) \ $(OBJECT_DIR)/BenchKernelD3Q19ListAaPv_PushSoA.o \ $(OBJECT_DIR)/BenchKernelD3Q19ListAaPvCommon_PushSoA.o \ $(OBJECT_DIR)/BenchKernelD3Q19ListPullSplitNt_PullSoA.o \ - $(OBJECT_DIR)/BenchKernelD3Q19ListPullSplitNtCommon_PullSoA.o + $(OBJECT_DIR)/BenchKernelD3Q19ListPullSplitNtCommon_PullSoA.o \ + $(OBJECT_DIR)/BenchKernelD3Q19Aa_AaAoS.o \ + $(OBJECT_DIR)/BenchKernelD3Q19AaCommon_AaAoS.o \ + $(OBJECT_DIR)/BenchKernelD3Q19Aa_AaSoA.o \ + $(OBJECT_DIR)/BenchKernelD3Q19AaCommon_AaSoA.o \ + $(OBJECT_DIR)/BenchKernelD3Q19AaVec_AaSoA.o \ + $(OBJECT_DIR)/BenchKernelD3Q19AaVecCommon_AaSoA.o OBJ = $(OBJ_C) @@ -203,9 +210,22 @@ ifeq (on,$(LIKWID)) LD_LIBS += $(LIKWID_LIB) -llikwid endif +# ARCH can only be assigned a string without a space. The space is escaped as +# a comma which we have to replace here. + +ifdef TARCH + ARCH_W_COMMA := $(TARCH) + + COMMA_ := , + override TARCH := $(subst $(COMMA_), ,$(TARCH)) + + $(info $(shell echo -e "\n$(COLOR_GREEN)INFO: Automatically expanding comma in TARCH variable: TARCH=$(TARCH).$(COLOR_NO)\n")) + +endif + .phony: all clean clean-all -$(info $(shell $(ECHO_E) "# Configuration: CONFIG=$(COLOR_CYAN)$(CONFIG)$(COLOR_NO) BUILD=$(COLOR_CYAN)$(BUILD)$(COLOR_NO) VERIFICATION=$(COLOR_CYAN)$(VERIFICATION)$(COLOR_NO) STATISTICS=$(COLOR_CYAN)$(STATISTICS)$(COLOR_NO) VTK_OUTPUT=$(COLOR_CYAN)$(VTK_OUTPUT)$(COLOR_NO) OPENMP=$(COLOR_CYAN)$(OPENMP)$(COLOR_NO) ISA=$(COLOR_CYAN)$(ISA)$(COLOR_NO) LIKWID=$(COLOR_CYAN)$(LIKWID)$(COLOR_NO) building $(.DEFAULT_GOAL)...")) +$(info $(shell $(ECHO_E) "# Configuration: CONFIG=$(COLOR_CYAN)$(CONFIG)$(COLOR_NO) BUILD=$(COLOR_CYAN)$(BUILD)$(COLOR_NO) VERIFICATION=$(COLOR_CYAN)$(VERIFICATION)$(COLOR_NO) STATISTICS=$(COLOR_CYAN)$(STATISTICS)$(COLOR_NO) VTK_OUTPUT=$(COLOR_CYAN)$(VTK_OUTPUT)$(COLOR_NO) OPENMP=$(COLOR_CYAN)$(OPENMP)$(COLOR_NO) ISA=$(COLOR_CYAN)$(ISA)$(COLOR_NO) LIKWID=$(COLOR_CYAN)$(LIKWID)$(COLOR_NO) TARCH=$(COLOR_CYAN)$(TARCH)$(COLOR_NO) building $(.DEFAULT_GOAL)...")) $(info # Object dir: $(OBJECT_DIR)) @@ -265,6 +285,13 @@ $(OBJECT_DIR)/%_AoS.o: %.c $(REBUILD_DEPS) @$(ECHO_E) "compiling: $(COLOR_CYAN)$@$(COLOR_NO) $(COLOR_MAGENTA)DATA_LAYOUT_AOS$(COLOR_NO)" $(CC) $(strip $(C_FLAGS)) $(strip $(PP_FLAGS)) $(D)DATA_LAYOUT_AOS -c $< -o $@ +$(OBJECT_DIR)/%_AaAoS.o: %.c $(REBUILD_DEPS) + @$(ECHO_E) "compiling: $(COLOR_CYAN)$@$(COLOR_NO) $(COLOR_MAGENTA)DATA_LAYOUT_AOS$(COLOR_NO) $(COLOR_MAGENTA)PROP_MODEL_AA$(COLOR_NO)" + $(CC) $(strip $(C_FLAGS)) $(strip $(PP_FLAGS)) $(D)DATA_LAYOUT_AOS $(D)PROP_MODEL_AA -c $< -o $@ + +$(OBJECT_DIR)/%_AaSoA.o: %.c $(REBUILD_DEPS) + @$(ECHO_E) "compiling: $(COLOR_CYAN)$@$(COLOR_NO) $(COLOR_MAGENTA)DATA_LAYOUT_SOA$(COLOR_NO) $(COLOR_MAGENTA)PROP_MODEL_AA$(COLOR_NO)" + $(CC) $(strip $(C_FLAGS)) $(strip $(PP_FLAGS)) $(D)DATA_LAYOUT_SOA $(D)PROP_MODEL_AA -c $< -o $@ $(OBJECT_DIR)/%.o: %.c $(REBUILD_DEPS) @$(ECHO_E) "compiling: $(COLOR_CYAN)$@$(COLOR_NO)" diff --git a/src/Memory.c b/src/Memory.c index d9d1f94..e7dda39 100644 --- a/src/Memory.c +++ b/src/Memory.c @@ -24,11 +24,24 @@ // along with LbmBenchKernels. If not, see . // // -------------------------------------------------------------------------- +// TODO: make configurable +#define HAVE_HUGE_PAGES + + +#ifdef HAVE_HUGE_PAGES +#define _BSD_SOURCE +#endif + #include #include #include // strerror #include + +#ifdef HAVE_HUGE_PAGES +#include // madvise +#endif + #include "Base.h" #include "Memory.h" @@ -60,6 +73,19 @@ int MemAllocAligned(void ** ptr, size_t bytesToAlloc, size_t alignmentBytes) exit(1); } +#ifdef HAVE_HUGE_PAGES + + if (alignmentBytes % 4096 == 0) { + ret = madvise(*ptr, bytesToAlloc, MADV_HUGEPAGE); + + if (ret != 0) { + DEBUG_BREAK_POINT(); + Error("madvise(%p, %lu, MADV_HUGEPAGE) failed: %d - %s.\n", *ptr, bytesToAlloc, errno, strerror(errno)); + exit(1); + } + } +#endif + return 0; } diff --git a/src/Padding.c b/src/Padding.c new file mode 100644 index 0000000..97a98ea --- /dev/null +++ b/src/Padding.c @@ -0,0 +1,300 @@ +#include "Base.h" +#include "Kernel.h" +#include "Padding.h" + +#include +#include + +// Generates a PadInfo struct from padStr. If padStr is NULL then a NULL +// pointer is returned. +// +// Padding string is either "auto" or in the format of +// +(,+)*. +// +// If modulus is 0 then the remainder will just be added and no modulo will be +// applied. +// +// Returned PadInfo structs must be freed with PadInfoFree. + +PadInfo * PadInfoFromStr(const char * padStr) +{ + if (padStr == NULL) { + return NULL; + } + else if (!strncmp("auto", padStr, 4)) { + + PadInfo * padInfo = (PadInfo *)malloc(sizeof(PadInfo) + 4 * sizeof(int)); + Assert(padInfo != NULL); + + padInfo->nEntries = 2; + padInfo->Modulus = (int *)(padInfo + 1); + padInfo->Offset = ((int *)(padInfo + 1)) + 2; + + // Intel TLB 2 MiB pages + padInfo->Modulus[0] = 2 * 1024 * 1024 * 8; + padInfo->Offset[0] = 2 * 1024 * 1024 + 512; + + // Intel L2, 256 sets + padInfo->Modulus[1] = 256 * 64; + padInfo->Offset[1] = 128; + + return padInfo; + } + else if (!strncmp("no", padStr, 2) || !strncmp("off", padStr, 3)) { + PadInfo * padInfo = (PadInfo *)malloc(sizeof(PadInfo)); + Assert(padInfo != NULL); + + padInfo->nEntries = 0; + padInfo->Modulus = NULL; + padInfo->Offset = NULL; + + return padInfo; + } + + // Count number of commas. Number of commas + 1 gives us the number of + // padding entries. + const char * tmp = padStr; + int nCommas = 0; + + while (*tmp != 0x00) { + if (*tmp == ',') ++nCommas; + ++tmp; + } + + // Number of padding entries = nCommas + 1. + size_t padInfoSize = sizeof(PadInfo) + (nCommas + 1) * 2 * sizeof(int); + PadInfo * padInfo = (PadInfo *)malloc(padInfoSize); + Assert(padInfo != NULL); + + memset(padInfo, 0x00, padInfoSize); + + padInfo->Modulus = (int *)(padInfo + 1); + padInfo->Offset = ((int *)(padInfo + 1)) + nCommas + 1; + + tmp = padStr; + + int padEntryIndex = 0; + int entriesSeen = 0; + + int modulusSeen = 0; + int offsetSeen = 0; + + // TODO: parsing is currently a mess. We assume a fixed format, which + // should be easier to parse now. + while (*tmp != 0x00) { + + if (*tmp == ',') { + // Check if modulus was seen in this section, then move to the next. + + if (!modulusSeen) { + fprintf(stderr, "ERROR: modulus missing before next section in padding string.\n"); + exit(1); + } + + modulusSeen = 0; + offsetSeen = 0; + ++padEntryIndex; + ++tmp; + } + else if (*tmp == '+') { + // Check if modulus was seen in this section and no offset yet. + + if (!modulusSeen) { + fprintf(stderr, "ERROR: modulus missing before offset in padding string.\n"); + exit(1); + } + + if (offsetSeen) { + fprintf(stderr, "ERROR: offset is only allowed to be specified once per section in padding string.\n"); + exit(1); + } + + ++tmp; + } + else if (*tmp >= '0' && *tmp <= '9') { + // Parse number and check for all the errors. + + char * endPtr; + + errno = 0; + + int value = strtol(tmp, &endPtr, 10); + + if ((value == LONG_MIN || value == LONG_MAX) && errno != 0) { + fprintf(stderr, "ERROR: over- or underflow in modulus/offset of padding string.\n"); + exit(1); + } + else if (value == 0 && errno != 0) { + fprintf(stderr, "ERROR: error parsing modulus/offset of padding string: %d - %s\n", errno, strerror(errno)); + exit(1); + } + else if (tmp == endPtr) { + // No digits found: empty string or invalid pad string. + fprintf(stderr, "ERROR: modulus or offset missing in padding string.\n"); + exit(1); + } + + if (value < 0) { + fprintf(stderr, "ERROR: modulus and offset must be >= 0 in padding string.\n"); + exit(1); + } + + if (!modulusSeen) { + if (offsetSeen) { + fprintf(stderr, "ERROR: offset already seen, but not modulus in padding string.\n"); + exit(1); + } + + Verify(padEntryIndex < (nCommas + 1)); + + padInfo->Modulus[padEntryIndex] = value; + + modulusSeen = 1; + ++entriesSeen; + } + else { + if (offsetSeen) { + fprintf(stderr, "ERROR: offset is only allowed to be specified once per section in padding string.\n"); + exit(1); + } + + Verify(padEntryIndex < (nCommas + 1)); + + padInfo->Offset[padEntryIndex] = value; + + offsetSeen = 1; + } + // No increment of tmp needed, as endPtr points already to the first + // character which is not part of the number. + tmp = endPtr; + } + else { + fprintf(stderr, "ERROR: padding string contains invalid characters.\n"); + exit(1); + } + + } + + if (entriesSeen != nCommas + 1) { + fprintf(stderr, "ERROR: did not find all padding entries.\n"); + exit(1); + } + + for (int i = 0; i < nCommas + 1; ++i) { + if (padInfo->Offset[i] >= padInfo->Modulus[i]) { + fprintf(stderr, "ERROR: offset in padding entry %d is equal or larger than modulus.\n", i); + exit(1); + } + } + + padInfo->nEntries = entriesSeen; + + return padInfo; +} + +int PadCells(int nCells, int cellSizeBytes, PadInfo ** padInfoPtr) +{ + Assert(padInfoPtr != NULL); + + // If the padInfo is NULL determine if we use the "auto" configuration. + const int defaultAutoPadding = 1; + + + if (*padInfoPtr == NULL) { + if (!defaultAutoPadding) { + return nCells; + } + + *padInfoPtr = PadInfoFromStr("auto"); + } + + PadInfo * padInfo = *padInfoPtr; + + Assert(padInfo->nEntries >= 0); + + for (int i = 0; i < padInfo->nEntries; ++i) { + Assert(padInfo->Modulus[i] >= 0); + Assert(padInfo->Offset[i] >= 0); + + int nPadCells = 0; + + if (padInfo->Modulus[i] == 0) { + // When the modulus is just zero then we only add the offset. + nPadCells = padInfo->Offset[i] * cellSizeBytes; + } + else { + int nModCells = padInfo->Modulus[i] / cellSizeBytes; + + if (nModCells == 0) { + fprintf(stderr, "ERROR: modulus of %d byte in padding entry %d becomes zero for PDF size %d byte.\n", + padInfo->Modulus[i], i, cellSizeBytes); + exit(1); + } + + int nOffsetCells = padInfo->Offset[i] / cellSizeBytes; + int nRemainder = nCells % nModCells; + + nPadCells = (nOffsetCells + nModCells - nRemainder) % nModCells; + } + + nCells += nPadCells; + } + + return nCells; +} + +void PadInfoPrint(PadInfo * padInfo, FILE * f, const char * prefix) +{ + Assert(padInfo != NULL); + Assert(padInfo->nEntries >= 0); + Assert(f != NULL); + Assert(prefix != NULL); + + for (int i = 0; i < padInfo->nEntries; ++i) { + fprintf(f, "%sm: %10d b o: %10d b m: %f KiB o: %f KiB m: %f MiB o: %f MiB\n", + prefix, + padInfo->Modulus[i], padInfo->Offset[i], + padInfo->Modulus[i] / 1024.0, padInfo->Offset[i] / 1024.0, + padInfo->Modulus[i] / 1024.0 / 1024.0, padInfo->Offset[i] / 1024.0 / 1024.0); + } + + return; +} + +void PadInfoFree(PadInfo * padInfo) +{ + if (padInfo != NULL) { + free(padInfo); + } + + return; +} + +// Returns the new padded cell size and reports if padding was performed and how. + +int PadCellsAndReport(int nCells, int cellSizeBytes, PadInfo ** padInfoPtr) +{ + // Apply padding. + int nNewCells = PadCells(nCells, sizeof(PdfT), padInfoPtr); + + if (nCells != nNewCells) { + printf("# padding info:\n"); + PadInfoPrint(*padInfoPtr, stdout, "# "); + // int nPaddedCells = nNewCells - nCells; + // printf("# padding per dir.: %10d nodes (%f MiB)\n", nPaddedCells, nPaddedCells / 1024.0 / 1024.0 * sizeof(PdfT)); + // printf("# padding total: %10d nodes (%f MiB)\n", 19 * nPaddedCells, 19 * nPaddedCells / 1024.0 / 1024.0 * sizeof(PdfT)); + + int nPadCells = nNewCells - nCells; + + printf("#\n# padding %d nodes with %d nodes, %f MiB per direction, %f MiB in total\n", + nCells, + nPadCells, + nPadCells * sizeof(PdfT) / 1024.0 / 1024.0, + nPadCells * sizeof(PdfT) / 1024.0 / 1024.0 * 19); + } + else { + printf("# padding info: no padding used\n"); + } + + return nNewCells; +} diff --git a/src/Padding.h b/src/Padding.h new file mode 100644 index 0000000..55fa33e --- /dev/null +++ b/src/Padding.h @@ -0,0 +1,19 @@ +#ifndef __PADDING_H__ +#define __PADDING_H__ + +typedef struct PadInfo_ +{ + int nEntries; + int * Modulus; + int * Offset; +} PadInfo; + +PadInfo * PadInfoFromStr(const char * padStr); + +int PadCells(int nCells, int cellSizeBytes, PadInfo ** padInfo); +int PadCellsAndReport(int nCells, int cellSizeBytes, PadInfo ** padInfoPtr); + +void PadInfoPrint(PadInfo * padInfo, FILE * f, const char * prefix); +void PadInfoFree(PadInfo * padInfo); + +#endif // __PADDING_H__ diff --git a/src/Pinning.c b/src/Pinning.c index 0cf70ea..de9be3f 100644 --- a/src/Pinning.c +++ b/src/Pinning.c @@ -34,9 +34,6 @@ #include "Base.h" #include "Pinning.h" - - - // ----------------------------------------------------------------------- // // Binds the calling thread to specified core. @@ -73,8 +70,7 @@ int PinCurrentThreadToCore(int coreNumber) // // ----------------------------------------------------------------------- -int PinCurrentThreadByEnvVar(const char * envVarName, - int mpiRank, int nodeRank, int threadNumber) +int PinCurrentThreadByEnvVar(const char * envVarName, int threadNumber) { const char * envVarValue; int core; @@ -82,14 +78,12 @@ int PinCurrentThreadByEnvVar(const char * envVarName, envVarValue = getenv(envVarName); if (envVarValue == NULL) { - if (mpiRank == 0) { - Print("skip pinning: env var %s not set\n", envVarName); - } + printf("WARNING: skip pinning: env var %s not set\n", envVarName); return 0; } - core = PinParseCpuList(envVarValue, mpiRank, nodeRank, threadNumber); + core = PinParseCpuList(envVarValue, threadNumber); if (core < 0) { return core; @@ -107,20 +101,17 @@ int PinCurrentThreadByEnvVar(const char * envVarName, // // ----------------------------------------------------------------------- -int PinCurrentThreadByCpuList(const char * cpuList, - int mpiRank, int nodeRank, int threadNumber) +int PinCurrentThreadByCpuList(const char * cpuList, int threadNumber) { int core; if (cpuList == NULL) { - if (mpiRank == 0) { - printf("ERROR: cpu list is NULL.\n"); - } + printf("ERROR: cpu list is NULL.\n"); exit(1); } - core = PinParseCpuList(cpuList, mpiRank, nodeRank, threadNumber); + core = PinParseCpuList(cpuList, threadNumber); if (core < 0) { return core; @@ -133,31 +124,13 @@ int PinCurrentThreadByCpuList(const char * cpuList, // ----------------------------------------------------------------------- // // Parses the provided cpu list and returns the core number for the -// specified MPI rank, local rank, and thread. -// -// The cpu list has for example a format of: 0,1,2 or 0,1,2_3,4,5 -// -// Blocks (0,1,2 or 3,4,5) separated by "_" specify pinning inside a -// node rank. The first block maps to node rank 1, the second to node -// rank 2, etc. -// -// Inside a block the core numbers specify where the threads should -// be pinned to. They are separated by "," and the first number maps -// to the first core, the second number to the second core, etc. -// -// For example: 0,2,4_6,8,10 +// specified thread. // -// Node rank 0 thread 0 pinned to core 0 -// 0 1 2 -// 0 2 4 -// 1 0 6 -// 1 1 8 -// 1 2 10 +// The cpu list has for example a format of: 0,1,2 // // ----------------------------------------------------------------------- -int PinParseCpuList(const char * cpuList, - int mpiRank, int nodeRank, int threadNumber) +int PinParseCpuList(const char * cpuList, int threadNumber) { int cpu = -1; @@ -168,8 +141,8 @@ int PinParseCpuList(const char * cpuList, const char * c = cpuList; // Ensure only valid characters are in the cpu list. - // Cpu list is in the format of "0,1,2_3,4,5". - while (((*c >= '0' && *c <= '9') || *c == ',' || *c == '_')) { + // Cpu list is in the format of "0,1,2,3,4,5". + while (((*c >= '0' && *c <= '9') || *c == ',')) { ++c; } @@ -180,19 +153,6 @@ int PinParseCpuList(const char * cpuList, c = cpuList; - int i = 0; - - // Move variable c after the "nodeRank"th "_" in the cpu list. - while (i < nodeRank && *c != 0x00) { - if (*c == '_') ++i; - ++c; - } - - if (i != nodeRank || *c < '0' || *c > '9') { - // Cpu list for this node rank not found. - return -3; - } - // Now find the core for the specified thread. int t = 0; @@ -201,11 +161,6 @@ int PinParseCpuList(const char * cpuList, if (*c == ',') { ++t; } - else if (*c == '_') { - // Unexpected character at this position. - break; - } - ++c; } diff --git a/src/Pinning.h b/src/Pinning.h index 40225c1..877048d 100644 --- a/src/Pinning.h +++ b/src/Pinning.h @@ -30,14 +30,11 @@ int PinCurrentThreadToCore(int coreNumber); -int PinParseCpuList(const char * cpuList, - int mpiRank, int nodeRank, int threadNumber); +int PinParseCpuList(const char * cpuList, int threadNumber); -int PinCurrentThreadByEnvVar(const char * envVarName, - int mpiRank, int nodeRank, int threadNumber); +int PinCurrentThreadByEnvVar(const char * envVarName, int threadNumber); -int PinCurrentThreadByCpuList(const char * cpuList, - int mpiRank, int nodeRank, int threadNumber); +int PinCurrentThreadByCpuList(const char * cpuList, int threadNumber); int PinCurrentCore(); diff --git a/src/test.sh b/src/test.sh index 7a76f51..a0e8ad6 100755 --- a/src/test.sh +++ b/src/test.sh @@ -54,7 +54,7 @@ Config="$1" make clean-all make -j CONFIG=$Config TAG=$XTag-debug -make -j CONFIG=$Config BUILD=$Build TAG=$XTag-v +make -j CONFIG=$Config BUILD=$Build TAG=$XTag-v VERIFICATION=on make -j CONFIG=$Config BUILD=$Build TAG=$XTag-b BENCHMARK=on BinaryV="../bin/lbmbenchk-$Config-$Build$XTag-v" -- 2.25.1