version 0.1
author Markus Wittmann <markus.wittmann@fau.de>
Mon, 10 Jul 2017 09:38:45 +0000 (11:38 +0200)
committer Markus Wittmann <markus.wittmann@fau.de>
Mon, 10 Jul 2017 09:38:45 +0000 (11:38 +0200)
54 files changed:
COPYING [new file with mode: 0644]
LICENSE [new file with mode: 0644]
doc/Makefile [new file with mode: 0644]
doc/main.rst [new file with mode: 0644]
src/Base.h [new file with mode: 0644]
src/BenchKernelD3Q19.c [new file with mode: 0644]
src/BenchKernelD3Q19.h [new file with mode: 0644]
src/BenchKernelD3Q19Common.c [new file with mode: 0644]
src/BenchKernelD3Q19Common.h [new file with mode: 0644]
src/BenchKernelD3Q19List.c [new file with mode: 0644]
src/BenchKernelD3Q19List.h [new file with mode: 0644]
src/BenchKernelD3Q19ListAa.c [new file with mode: 0644]
src/BenchKernelD3Q19ListAa.h [new file with mode: 0644]
src/BenchKernelD3Q19ListAaCommon.c [new file with mode: 0644]
src/BenchKernelD3Q19ListAaCommon.h [new file with mode: 0644]
src/BenchKernelD3Q19ListAaPv.c [new file with mode: 0644]
src/BenchKernelD3Q19ListAaPv.h [new file with mode: 0644]
src/BenchKernelD3Q19ListAaPvCommon.c [new file with mode: 0644]
src/BenchKernelD3Q19ListAaPvCommon.h [new file with mode: 0644]
src/BenchKernelD3Q19ListAaRia.c [new file with mode: 0644]
src/BenchKernelD3Q19ListAaRia.h [new file with mode: 0644]
src/BenchKernelD3Q19ListAaRiaCommon.c [new file with mode: 0644]
src/BenchKernelD3Q19ListAaRiaCommon.h [new file with mode: 0644]
src/BenchKernelD3Q19ListCommon.c [new file with mode: 0644]
src/BenchKernelD3Q19ListCommon.h [new file with mode: 0644]
src/BenchKernelD3Q19ListPullSplitNt.c [new file with mode: 0644]
src/BenchKernelD3Q19ListPullSplitNt.h [new file with mode: 0644]
src/BenchKernelD3Q19ListPullSplitNt1SIntrinsics.h [new file with mode: 0644]
src/BenchKernelD3Q19ListPullSplitNt1SScalar.h [new file with mode: 0644]
src/BenchKernelD3Q19ListPullSplitNt2SIntrinsics.h [new file with mode: 0644]
src/BenchKernelD3Q19ListPullSplitNt2SScalar.h [new file with mode: 0644]
src/BenchKernelD3Q19ListPullSplitNtCommon.c [new file with mode: 0644]
src/BenchKernelD3Q19ListPullSplitNtCommon.h [new file with mode: 0644]
src/BoostJoin.h [new file with mode: 0644]
src/Geometry.c [new file with mode: 0644]
src/Geometry.h [new file with mode: 0644]
src/Kernel.c [new file with mode: 0644]
src/Kernel.h [new file with mode: 0644]
src/KernelFunctions.h [new file with mode: 0644]
src/Lattice.h [new file with mode: 0644]
src/LikwidIf.h [new file with mode: 0644]
src/Main.c [new file with mode: 0644]
src/Makefile [new file with mode: 0644]
src/Memory.c [new file with mode: 0644]
src/Memory.h [new file with mode: 0644]
src/Pinning.c [new file with mode: 0644]
src/Pinning.h [new file with mode: 0644]
src/Vector.h [new file with mode: 0644]
src/Vtk.c [new file with mode: 0644]
src/Vtk.h [new file with mode: 0644]
src/config.linux-gcc.mk [new file with mode: 0644]
src/config.linux-intel.mk [new file with mode: 0644]
src/test-verification.sh [new file with mode: 0755]
src/test.sh [new file with mode: 0755]

diff --git a/COPYING b/COPYING
new file mode 100644 (file)
index 0000000..10926e8
--- /dev/null
+++ b/COPYING
@@ -0,0 +1,675 @@
+                    GNU GENERAL PUBLIC LICENSE
+                       Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+                            Preamble
+
+  The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+  The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works.  By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users.  We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors.  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+  To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights.  Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received.  You must make sure that they, too, receive
+or can get the source code.  And you must show them these terms so they
+know their rights.
+
+  Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+
+  For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software.  For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+
+  Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so.  This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software.  The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable.  Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products.  If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+
+  Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary.  To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+                       TERMS AND CONDITIONS
+
+  0. Definitions.
+
+  "This License" refers to version 3 of the GNU General Public License.
+
+  "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+  "The Program" refers to any copyrightable work licensed under this
+License.  Each licensee is addressed as "you".  "Licensees" and
+"recipients" may be individuals or organizations.
+
+  To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy.  The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+  A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+  To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy.  Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+  To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies.  Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+  An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License.  If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+  1. Source Code.
+
+  The "source code" for a work means the preferred form of the work
+for making modifications to it.  "Object code" means any non-source
+form of a work.
+
+  A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+  The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form.  A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+  The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities.  However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work.  For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+  The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+  The Corresponding Source for a work in source code form is that
+same work.
+
+  2. Basic Permissions.
+
+  All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met.  This License explicitly affirms your unlimited
+permission to run the unmodified Program.  The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work.  This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+  You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force.  You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright.  Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+  Conveying under any other circumstances is permitted solely under
+the conditions stated below.  Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+  3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+  No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+  When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+  4. Conveying Verbatim Copies.
+
+  You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+  You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+  5. Conveying Modified Source Versions.
+
+  You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+    a) The work must carry prominent notices stating that you modified
+    it, and giving a relevant date.
+
+    b) The work must carry prominent notices stating that it is
+    released under this License and any conditions added under section
+    7.  This requirement modifies the requirement in section 4 to
+    "keep intact all notices".
+
+    c) You must license the entire work, as a whole, under this
+    License to anyone who comes into possession of a copy.  This
+    License will therefore apply, along with any applicable section 7
+    additional terms, to the whole of the work, and all its parts,
+    regardless of how they are packaged.  This License gives no
+    permission to license the work in any other way, but it does not
+    invalidate such permission if you have separately received it.
+
+    d) If the work has interactive user interfaces, each must display
+    Appropriate Legal Notices; however, if the Program has interactive
+    interfaces that do not display Appropriate Legal Notices, your
+    work need not make them do so.
+
+  A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit.  Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+  6. Conveying Non-Source Forms.
+
+  You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+    a) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by the
+    Corresponding Source fixed on a durable physical medium
+    customarily used for software interchange.
+
+    b) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by a
+    written offer, valid for at least three years and valid for as
+    long as you offer spare parts or customer support for that product
+    model, to give anyone who possesses the object code either (1) a
+    copy of the Corresponding Source for all the software in the
+    product that is covered by this License, on a durable physical
+    medium customarily used for software interchange, for a price no
+    more than your reasonable cost of physically performing this
+    conveying of source, or (2) access to copy the
+    Corresponding Source from a network server at no charge.
+
+    c) Convey individual copies of the object code with a copy of the
+    written offer to provide the Corresponding Source.  This
+    alternative is allowed only occasionally and noncommercially, and
+    only if you received the object code with such an offer, in accord
+    with subsection 6b.
+
+    d) Convey the object code by offering access from a designated
+    place (gratis or for a charge), and offer equivalent access to the
+    Corresponding Source in the same way through the same place at no
+    further charge.  You need not require recipients to copy the
+    Corresponding Source along with the object code.  If the place to
+    copy the object code is a network server, the Corresponding Source
+    may be on a different server (operated by you or a third party)
+    that supports equivalent copying facilities, provided you maintain
+    clear directions next to the object code saying where to find the
+    Corresponding Source.  Regardless of what server hosts the
+    Corresponding Source, you remain obligated to ensure that it is
+    available for as long as needed to satisfy these requirements.
+
+    e) Convey the object code using peer-to-peer transmission, provided
+    you inform other peers where the object code and Corresponding
+    Source of the work are being offered to the general public at no
+    charge under subsection 6d.
+
+  A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+  A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling.  In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage.  For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product.  A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+  "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source.  The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+  If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information.  But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+  The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed.  Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+  Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+  7. Additional Terms.
+
+  "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law.  If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+  When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it.  (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.)  You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+  Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+    a) Disclaiming warranty or limiting liability differently from the
+    terms of sections 15 and 16 of this License; or
+
+    b) Requiring preservation of specified reasonable legal notices or
+    author attributions in that material or in the Appropriate Legal
+    Notices displayed by works containing it; or
+
+    c) Prohibiting misrepresentation of the origin of that material, or
+    requiring that modified versions of such material be marked in
+    reasonable ways as different from the original version; or
+
+    d) Limiting the use for publicity purposes of names of licensors or
+    authors of the material; or
+
+    e) Declining to grant rights under trademark law for use of some
+    trade names, trademarks, or service marks; or
+
+    f) Requiring indemnification of licensors and authors of that
+    material by anyone who conveys the material (or modified versions of
+    it) with contractual assumptions of liability to the recipient, for
+    any liability that these contractual assumptions directly impose on
+    those licensors and authors.
+
+  All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10.  If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term.  If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+  If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+  Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+  8. Termination.
+
+  You may not propagate or modify a covered work except as expressly
+provided under this License.  Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+  However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+  Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+  Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License.  If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+  9. Acceptance Not Required for Having Copies.
+
+  You are not required to accept this License in order to receive or
+run a copy of the Program.  Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance.  However,
+nothing other than this License grants you permission to propagate or
+modify any covered work.  These actions infringe copyright if you do
+not accept this License.  Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+  10. Automatic Licensing of Downstream Recipients.
+
+  Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License.  You are not responsible
+for enforcing compliance by third parties with this License.
+
+  An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations.  If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+  You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License.  For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+  11. Patents.
+
+  A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based.  The
+work thus licensed is called the contributor's "contributor version".
+
+  A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version.  For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+  Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+  In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement).  To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+  If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients.  "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+  If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+  A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License.  You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+  Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+  12. No Surrender of Others' Freedom.
+
+  If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all.  For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+  13. Use with the GNU Affero General Public License.
+
+  Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU Affero General Public License into a single
+combined work, and to convey the resulting work.  The terms of this
+License will continue to apply to the part which is the covered work,
+but the special requirements of the GNU Affero General Public License,
+section 13, concerning interaction through a network will apply to the
+combination as such.
+
+  14. Revised Versions of this License.
+
+  The Free Software Foundation may publish revised and/or new versions of
+the GNU General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+  Each version is given a distinguishing version number.  If the
+Program specifies that a certain numbered version of the GNU General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation.  If the Program does not specify a version number of the
+GNU General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+  If the Program specifies that a proxy can decide which future
+versions of the GNU General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+  Later license versions may give you additional or different
+permissions.  However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+  15. Disclaimer of Warranty.
+
+  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+  16. Limitation of Liability.
+
+  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+  17. Interpretation of Sections 15 and 16.
+
+  If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+                     END OF TERMS AND CONDITIONS
+
+            How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+  If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+    <program>  Copyright (C) <year>  <name of author>
+    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+  You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<http://www.gnu.org/licenses/>.
+
+  The GNU General Public License does not permit incorporating your program
+into proprietary programs.  If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.  But first, please read
+<http://www.gnu.org/philosophy/why-not-lgpl.html>.
+
diff --git a/LICENSE b/LICENSE
new file mode 100644 (file)
index 0000000..10926e8
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,675 @@
+                    GNU GENERAL PUBLIC LICENSE
+                       Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+                            Preamble
+
+  The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+  The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works.  By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users.  We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors.  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+  To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights.  Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received.  You must make sure that they, too, receive
+or can get the source code.  And you must show them these terms so they
+know their rights.
+
+  Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+
+  For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software.  For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+
+  Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so.  This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software.  The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable.  Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products.  If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+
+  Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary.  To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+                       TERMS AND CONDITIONS
+
+  0. Definitions.
+
+  "This License" refers to version 3 of the GNU General Public License.
+
+  "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+  "The Program" refers to any copyrightable work licensed under this
+License.  Each licensee is addressed as "you".  "Licensees" and
+"recipients" may be individuals or organizations.
+
+  To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy.  The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+  A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+  To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy.  Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+  To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies.  Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+  An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License.  If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+  1. Source Code.
+
+  The "source code" for a work means the preferred form of the work
+for making modifications to it.  "Object code" means any non-source
+form of a work.
+
+  A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+  The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form.  A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+  The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities.  However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work.  For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+  The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+  The Corresponding Source for a work in source code form is that
+same work.
+
+  2. Basic Permissions.
+
+  All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met.  This License explicitly affirms your unlimited
+permission to run the unmodified Program.  The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work.  This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+  You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force.  You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright.  Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+  Conveying under any other circumstances is permitted solely under
+the conditions stated below.  Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+  3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+  No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+  When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+  4. Conveying Verbatim Copies.
+
+  You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+  You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+  5. Conveying Modified Source Versions.
+
+  You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+    a) The work must carry prominent notices stating that you modified
+    it, and giving a relevant date.
+
+    b) The work must carry prominent notices stating that it is
+    released under this License and any conditions added under section
+    7.  This requirement modifies the requirement in section 4 to
+    "keep intact all notices".
+
+    c) You must license the entire work, as a whole, under this
+    License to anyone who comes into possession of a copy.  This
+    License will therefore apply, along with any applicable section 7
+    additional terms, to the whole of the work, and all its parts,
+    regardless of how they are packaged.  This License gives no
+    permission to license the work in any other way, but it does not
+    invalidate such permission if you have separately received it.
+
+    d) If the work has interactive user interfaces, each must display
+    Appropriate Legal Notices; however, if the Program has interactive
+    interfaces that do not display Appropriate Legal Notices, your
+    work need not make them do so.
+
+  A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit.  Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+  6. Conveying Non-Source Forms.
+
+  You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+    a) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by the
+    Corresponding Source fixed on a durable physical medium
+    customarily used for software interchange.
+
+    b) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by a
+    written offer, valid for at least three years and valid for as
+    long as you offer spare parts or customer support for that product
+    model, to give anyone who possesses the object code either (1) a
+    copy of the Corresponding Source for all the software in the
+    product that is covered by this License, on a durable physical
+    medium customarily used for software interchange, for a price no
+    more than your reasonable cost of physically performing this
+    conveying of source, or (2) access to copy the
+    Corresponding Source from a network server at no charge.
+
+    c) Convey individual copies of the object code with a copy of the
+    written offer to provide the Corresponding Source.  This
+    alternative is allowed only occasionally and noncommercially, and
+    only if you received the object code with such an offer, in accord
+    with subsection 6b.
+
+    d) Convey the object code by offering access from a designated
+    place (gratis or for a charge), and offer equivalent access to the
+    Corresponding Source in the same way through the same place at no
+    further charge.  You need not require recipients to copy the
+    Corresponding Source along with the object code.  If the place to
+    copy the object code is a network server, the Corresponding Source
+    may be on a different server (operated by you or a third party)
+    that supports equivalent copying facilities, provided you maintain
+    clear directions next to the object code saying where to find the
+    Corresponding Source.  Regardless of what server hosts the
+    Corresponding Source, you remain obligated to ensure that it is
+    available for as long as needed to satisfy these requirements.
+
+    e) Convey the object code using peer-to-peer transmission, provided
+    you inform other peers where the object code and Corresponding
+    Source of the work are being offered to the general public at no
+    charge under subsection 6d.
+
+  A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+  A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling.  In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage.  For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product.  A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+  "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source.  The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+  If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information.  But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+  The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed.  Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+  Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+  7. Additional Terms.
+
+  "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law.  If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+  When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it.  (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.)  You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+  Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+    a) Disclaiming warranty or limiting liability differently from the
+    terms of sections 15 and 16 of this License; or
+
+    b) Requiring preservation of specified reasonable legal notices or
+    author attributions in that material or in the Appropriate Legal
+    Notices displayed by works containing it; or
+
+    c) Prohibiting misrepresentation of the origin of that material, or
+    requiring that modified versions of such material be marked in
+    reasonable ways as different from the original version; or
+
+    d) Limiting the use for publicity purposes of names of licensors or
+    authors of the material; or
+
+    e) Declining to grant rights under trademark law for use of some
+    trade names, trademarks, or service marks; or
+
+    f) Requiring indemnification of licensors and authors of that
+    material by anyone who conveys the material (or modified versions of
+    it) with contractual assumptions of liability to the recipient, for
+    any liability that these contractual assumptions directly impose on
+    those licensors and authors.
+
+  All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10.  If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term.  If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+  If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+  Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+  8. Termination.
+
+  You may not propagate or modify a covered work except as expressly
+provided under this License.  Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+  However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+  Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+  Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License.  If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+  9. Acceptance Not Required for Having Copies.
+
+  You are not required to accept this License in order to receive or
+run a copy of the Program.  Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance.  However,
+nothing other than this License grants you permission to propagate or
+modify any covered work.  These actions infringe copyright if you do
+not accept this License.  Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+  10. Automatic Licensing of Downstream Recipients.
+
+  Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License.  You are not responsible
+for enforcing compliance by third parties with this License.
+
+  An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations.  If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+  You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License.  For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+  11. Patents.
+
+  A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based.  The
+work thus licensed is called the contributor's "contributor version".
+
+  A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version.  For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+  Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+  In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement).  To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+  If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients.  "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+  If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+  A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License.  You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+  Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+  12. No Surrender of Others' Freedom.
+
+  If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all.  For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+  13. Use with the GNU Affero General Public License.
+
+  Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU Affero General Public License into a single
+combined work, and to convey the resulting work.  The terms of this
+License will continue to apply to the part which is the covered work,
+but the special requirements of the GNU Affero General Public License,
+section 13, concerning interaction through a network will apply to the
+combination as such.
+
+  14. Revised Versions of this License.
+
+  The Free Software Foundation may publish revised and/or new versions of
+the GNU General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+  Each version is given a distinguishing version number.  If the
+Program specifies that a certain numbered version of the GNU General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation.  If the Program does not specify a version number of the
+GNU General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+  If the Program specifies that a proxy can decide which future
+versions of the GNU General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+  Later license versions may give you additional or different
+permissions.  However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+  15. Disclaimer of Warranty.
+
+  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+  16. Limitation of Liability.
+
+  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+  17. Interpretation of Sections 15 and 16.
+
+  If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+                     END OF TERMS AND CONDITIONS
+
+            How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+  If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+    <program>  Copyright (C) <year>  <name of author>
+    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+  You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<http://www.gnu.org/licenses/>.
+
+  The GNU General Public License does not permit incorporating your program
+into proprietary programs.  If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.  But first, please read
+<http://www.gnu.org/philosophy/why-not-lgpl.html>.
+
diff --git a/doc/Makefile b/doc/Makefile
new file mode 100644 (file)
index 0000000..690e4eb
--- /dev/null
@@ -0,0 +1,38 @@
+# --------------------------------------------------------------------------
+#
+# Copyright
+#   Markus Wittmann, 2016-2017
+#   RRZE, University of Erlangen-Nuremberg, Germany
+#   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+#
+#   Viktor Haag, 2016
+#   LSS, University of Erlangen-Nuremberg, Germany
+#
+#  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+#
+#  LbmBenchKernels is free software: you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation, either version 3 of the License, or
+#  (at your option) any later version.
+#
+#  LbmBenchKernels is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+#
+# --------------------------------------------------------------------------
+
+
+# Neither "all" nor "main" names a file that is produced by its recipe, so
+# both are declared phony and are rebuilt on every invocation.
+.PHONY: all main
+
+# Default target: build the documentation.
+all: main
+
+# Renders main.rst to html/main.html via rst2html. The html output directory
+# is created first if it does not exist yet. The lines commented out below
+# are a disabled variant that additionally uses a custom main.css stylesheet.
+main: main.rst
+#main.css
+       [ -d html ] || mkdir -p html
+#    rst2html --stylesheet=html4css1.css,main.css $< html/$@.html
+       rst2html --stylesheet=html4css1.css $< html/$@.html
+
diff --git a/doc/main.rst b/doc/main.rst
new file mode 100644 (file)
index 0000000..921ea84
--- /dev/null
@@ -0,0 +1,242 @@
+.. # --------------------------------------------------------------------------
+   #
+   # Copyright
+   #   Markus Wittmann, 2016-2017
+   #   RRZE, University of Erlangen-Nuremberg, Germany
+   #   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+   #
+   #   Viktor Haag, 2016
+   #   LSS, University of Erlangen-Nuremberg, Germany
+   #
+   #  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+   #
+   #  LbmBenchKernels is free software: you can redistribute it and/or modify
+   #  it under the terms of the GNU General Public License as published by
+   #  the Free Software Foundation, either version 3 of the License, or
+   #  (at your option) any later version.
+   #
+   #  LbmBenchKernels is distributed in the hope that it will be useful,
+   #  but WITHOUT ANY WARRANTY; without even the implied warranty of
+   #  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   #  GNU General Public License for more details.
+   #
+   #  You should have received a copy of the GNU General Public License
+   #  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+   #
+   # --------------------------------------------------------------------------
+
+.. title:: LBM Benchmark Kernels Documentation 
+
+
+===================================
+LBM Benchmark Kernels Documentation
+===================================
+
+.. sectnum::
+.. contents::
+
+Compilation
+===========
+
+The benchmark framework currently supports only Linux systems and the GCC and 
+Intel compilers. Every other configuration probably requires adjustment inside
+the code and the makefiles. Further some code might be platform or at least
+POSIX specific.
+
+The benchmark can be built via ``make`` from the ``src`` subdirectory. This will
+generate one binary which hosts all implemented benchmark kernels. 
+
+Binaries are located under the ``bin`` subdirectory and will have different names
+depending on compiler and build configuration.
+
+Debug and Verification
+----------------------
+
+:: 
+
+  make
+
+Running ``make`` without any arguments builds the debug version (BUILD=debug) of
+the benchmark kernels, where no optimizations are performed, line numbers and
+debug symbols are included, and ``DEBUG`` is defined.  The resulting
+binary will be found in the ``bin`` subdirectory and named
+``lbmbenchk-linux-<compiler>-debug``.
+Without any further specification the binary includes verification
+(``VERIFICATION=on``), statistics (``STATISTICS``), and VTK output
+(``VTK_OUTPUT=on``) enabled. 
+
+Please note that the generated binary will therefore
+exhibit a poor performance.
+
+Benchmarking
+------------
+
+To generate a binary for benchmarking run make with ::
+
+  make BENCHMARK=on BUILD=release
+
+Here BUILD=release turns optimizations on and BENCHMARK=on disables
+verification, statistics, and VTK output.
+
+Release and Verification
+------------------------
+
+Verification with the debug builds can be extremely slow. Hence verification
+capabilities can be built with release builds: ::
+
+  make BUILD=release
+
+Compilers
+---------
+
+Currently only the GCC and Intel compilers under Linux are supported. The
+configuration can be chosen via ``CONFIG=linux-gcc`` or
+``CONFIG=linux-intel``.
+
+Options Summary
+---------------
+
+Options that can be specified when building the framework with make:
+
+============= ======================= ============ ==========================================================
+name          values                  default      description
+------------- ----------------------- ------------ ----------------------------------------------------------
+TARCH         --                      --           Via TARCH the architecture the compiler generates code for can be overridden. The value depends on the chosen compiler.
+BENCHMARK     on, off                 off          If enabled, disables VERIFICATION, STATISTICS, VTK_OUTPUT.
+BUILD         debug, release          debug        No optimization, debug symbols, DEBUG defined.
+CONFIG        linux-gcc, linux-intel  linux-intel  Select GCC or Intel compiler. 
+ISA           avx, sse                avx          Determines which ISA extension is used for macro definitions. This is *not* the architecture the compiler generates code for.
+OPENMP        on, off                 on           OpenMP, i.e. threading support.
+STATISTICS    on, off                 off          View statistics, like density etc, during simulation. 
+VERIFICATION  on, off                 off          Turn verification on/off.
+VTK_OUTPUT    on, off                 off          Enable/Disable VTK file output.
+============= ======================= ============ ==========================================================
+
+Invocation
+==========
+
+Running the binary will print, along with the GPL licence header, a line like the following:
+
+  LBM Benchmark Kernels 0.1, compiled Jul  5 2017 21:59:22, type: verification
+
+if verification was enabled during compilation or
+
+  LBM Benchmark Kernels 0.1, compiled Jul  5 2017 21:59:22, type: benchmark
+
+if verification was disabled during compilation.
+
+Command Line Parameters
+-----------------------
+
+Running the binary with ``-h`` lists all available parameters: ::
+
+  Usage:
+  ./lbmbenchk -list
+  ./lbmbenchk
+      [-dims XxYyZ] [-geometry box|channel|pipe|blocks[-<block size>]] [-iterations <iterations>] [-lattice-dump-ascii]
+      [-rho-in <density>] [-rho-out <density] [-omega <omega>] [-kernel <kernel>]
+      [-periodic-x]
+      [-t <number of threads>]
+      [-pin core{,core}*]
+      [-verify]
+      -- <kernel specific parameters>
+
+  -list           List available kernels.
+
+  -dims XxYxZ     Specify geometry dimensions.
+
+  -geometry blocks-<block size>
+                  Geometetry with blocks of size <block size> regularily layout out.
+
+
+If an option is specified multiple times the last one overrides previous ones.
+This also holds true for ``-verify``, which sets geometry dimensions,
+iterations, etc., which can afterwards be overridden, e.g.: ::
+
+  $ bin/lbmbenchk-linux-intel-release -verify -dims 32x32x32
+
+Kernel specific parameters can be obtained via selecting the specific kernel
+and passing ``-h`` as parameter: ::
+
+  $ bin/lbmbenchk-linux-intel-release -kernel <kernel> -- -h
+  ...
+  Kernel parameters:
+  [-blk <n>] [-blk-[xyz] <n>]
+
+  
+A list of all available kernels can be obtained via ``-list``: ::
+
+  $ ../bin/lbmbenchk-linux-gcc-debug -list
+  Lattice Boltzmann Benchmark Kernels (LbmBenchKernels) Copyright (C) 2016, 2017 LSS, RRZE
+  This program comes with ABSOLUTELY NO WARRANTY; for details see LICENSE.
+  This is free software, and you are welcome to redistribute it under certain conditions.
+
+  LBM Benchmark Kernels 0.1, compiled Jul  5 2017 21:59:22, type: verification
+  Available kernels to benchmark:
+     list-aa-pv-soa
+     list-aa-ria-soa
+     list-aa-soa
+     list-aa-aos
+     list-pull-split-nt-1s-soa
+     list-pull-split-nt-2s-soa
+     list-push-soa
+     list-push-aos
+     list-pull-soa
+     list-pull-aos
+     push-soa
+     push-aos
+     pull-soa
+     pull-aos
+     blk-push-soa
+     blk-push-aos
+     blk-pull-soa
+     blk-pull-aos
+
+
+Benchmarking
+============
+
+Correct benchmarking is a nontrivial task. Whenever benchmark results should be
+created make sure the binary was compiled with:  
+
+- ``BENCHMARK=on`` and
+- ``BUILD=release`` and 
+- the correct ISA for macros is used, selected via ``ISA`` and
+- use ``TARCH`` to specify the architecture the compiler generates code for.
+
+During benchmarking pinning should be used via the ``-pin`` parameter. Running
+a benchmark with 10 threads and pinning them to the first 10 cores works like ::
+
+  $ bin/lbmbenchk-linux-intel-release ... -t 10 -pin $(seq -s , 0 9)
+
+Things the binary does not check or control:
+
+- transparent huge pages: when allocating memory small 4 KiB pages might be
+  replaced with larger ones. This is in general a good thing, but if this is
+  really the case, depends on the system settings.
+
+- CPU/core frequency: For reproducible results the frequency of all cores
+  should be fixed.
+
+- NUMA placement policy: The benchmark assumes a first touch policy, which
+  means the memory will be placed at the NUMA domain the touching core is
+  associated with. If a different policy is in place or the NUMA domain to be
+  used is already full memory might be allocated in a remote domain. Accesses
+  to remote domains typically have a higher latency and lower bandwidth.
+
+- System load: interference with other applications, especially on desktop
+  systems, should be avoided.
+
+- Padding: most kernels do not care about padding against cache or TLB
+  thrashing. Even if the number of (fluid) nodes suggests everything is fine,
+  problems might still occur due to parallelization.
+
+- CPU dispatcher function: the compiler might add different versions of a
+  function for different ISA extensions. Make sure the code you might think is
+  executed is actually the code which is executed.
+
+.. |datetime| date:: %Y-%m-%d %H:%M
+
+Document was generated at |datetime|.
+
diff --git a/src/Base.h b/src/Base.h
new file mode 100644 (file)
index 0000000..27f9116
--- /dev/null
@@ -0,0 +1,204 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#ifndef __BASE_H__
+#define __BASE_H__
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <sys/time.h>
+
+
+// Returns the current time as reported by gettimeofday() in seconds,
+// with the microsecond part added as the fractional component.
+static inline double Time()
+{
+       struct timeval now;
+
+       gettimeofday(&now, NULL);
+
+       const double wholeSeconds = now.tv_sec;
+       const double microSeconds = now.tv_usec;
+
+       return wholeSeconds + 1e-6 * microSeconds;
+}
+
+
+// Name of the tool; used as "[lbmbenchk]" prefix by the Print, Warning, Error
+// and DebugPrint macros of this header.
+#define TOOL_NAME "lbmbenchk"
+
+// STRINGIFY(x) turns x into a string literal. The indirection via STRINGIFYX
+// ensures that x is macro-expanded first, i.e. if x is itself a macro its
+// value and not its name is stringified.
+#define STRINGIFYX(x)   #x
+#define STRINGIFY(x)    STRINGIFYX(x)
+
+// See top of BoostJoin.h for Boost Licence.
+#include "BoostJoin.h"
+// JOIN(X, Y) forwards to BOOST_DO_JOIN from BoostJoin.h
+// (presumably pastes X and Y into one token after macro expansion -- see
+// BoostJoin.h to confirm).
+#define JOIN(X, Y)      BOOST_DO_JOIN(X, Y)
+
+
+// Some macro fu to remove the first comma.
+// "x" is an empty macro argument in EXPAND2
+// before the first comma which is skipped.
+// EXPAND2 drops its first argument and expands to the remaining ones.
+// Note: ", ## __VA_ARGS__" is the GNU extension which removes the preceding
+// comma if no variadic arguments are passed.
+#ifndef EXPAND
+       #define EXPAND2(x, ...)       __VA_ARGS__
+       #define EXPAND(x, ...)        EXPAND2(x, ## __VA_ARGS__)
+#endif
+
+#ifdef DEBUG
+
+    // Assert(expression): debug-only check. If "expression" evaluates to
+    // false an error message containing file, line and the expression text
+    // is printed via Error, a breakpoint trap (x86 "int $3") is raised and
+    // the process exits with -1.
+    //
+    // "expression" is evaluated exactly once. Inside the failure branch its
+    // value is known to compare equal to zero, hence the literal 0 is
+    // reported as "code". Evaluating the expression a second time for the
+    // message would repeat its side effects and would pass non-int values
+    // (pointers, doubles) to "%d".
+    #define Assert(expression) \
+        do { \
+            if (!(expression)) { \
+                Error("%s:%d assertion \"%s\" failed with code %d\n", \
+                        __FILE__, __LINE__, \
+                        #expression, 0); \
+                __asm__ ("int $3\n"); \
+                exit(-1); \
+            } \
+        } while (0)
+
+    // AssertMsg(expression, formatString, ...): like Assert, but prints an
+    // additional printf-style message via Error before trapping/exiting.
+    #define AssertMsg(expression, formatString, ...) \
+        do { \
+            if (!(expression)) { \
+                Error("%s:%d assertion \"%s\" failed with code %d\n", \
+                        __FILE__, __LINE__, \
+                        #expression, 0); \
+                Error(formatString, ##__VA_ARGS__); \
+                __asm__ ("int $3\n"); \
+                exit(-1); \
+            } \
+        } while (0)
+#else
+
+       // Without DEBUG both macros expand to nothing, i.e. "expression" is
+       // not evaluated at all. Never rely on side effects inside assertions.
+       #define Assert(expression)
+       #define AssertMsg(expression, formatString, ...)
+
+#endif
+
+    // Verify(expression): like Assert, but active in all build types. If
+    // "expression" evaluates to false an error message containing file, line
+    // and the expression text is printed via Error, a breakpoint trap (x86
+    // "int $3") is raised and the process exits with -1.
+    //
+    // "expression" is evaluated exactly once. Inside the failure branch its
+    // value is known to compare equal to zero, hence the literal 0 is
+    // reported as "code". Evaluating the expression a second time for the
+    // message would repeat its side effects (e.g. a verified function call
+    // would be executed twice) and would pass non-int values to "%d".
+    #define Verify(expression) \
+        do { \
+            if (!(expression)) { \
+                Error("%s:%d verify \"%s\" failed with code %d\n", \
+                        __FILE__, __LINE__, \
+                        #expression, 0); \
+                __asm__ ("int $3\n"); \
+                exit(-1); \
+            } \
+        } while (0)
+
+    // VerifyMsg(expression, formatString, ...): like Verify, but prints an
+    // additional printf-style message via Error before trapping/exiting.
+    #define VerifyMsg(expression, formatString, ...) \
+        do { \
+            if (!(expression)) { \
+                Error("%s:%d verify \"%s\" failed with code %d\n", \
+                        __FILE__, __LINE__, \
+                        #expression, 0); \
+                Error(formatString, ##__VA_ARGS__); \
+                __asm__ ("int $3\n"); \
+                exit(-1); \
+            } \
+        } while (0)
+
+    // printf-style output helpers. All of them prepend "[" TOOL_NAME "] " to
+    // the message; formatString must be a string literal as it is
+    // concatenated with the prefix at compile time.
+
+    // Print: regular output to stdout, prefix colored with SHC_MAGENTA.
+    #define Print(formatString, ...) \
+        fprintf(stdout, SHC_MAGENTA "[" TOOL_NAME "] " SHC_NC formatString, ##__VA_ARGS__)
+
+    // Warning: written to stdout (not stderr) with "WARNING: " prefix, no color.
+    #define Warning(formatString, ...) \
+        fprintf(stdout, "[" TOOL_NAME "] WARNING: " formatString, ##__VA_ARGS__)
+
+    // Error: written to stderr with "ERROR: " prefix; the whole message is
+    // wrapped in SHC_RED ... SHC_NC, i.e. the color reset is emitted after
+    // everything contained in formatString.
+    #define Error(formatString, ...) \
+        fprintf(stderr, SHC_RED "[" TOOL_NAME "] ERROR: " formatString SHC_NC , ##__VA_ARGS__)
+
+    // DebugPrint: written to stderr with "DEBUG: " prefix, no color.
+    #define DebugPrint(formatString, ...) \
+        fprintf(stderr, "[" TOOL_NAME "] DEBUG: " formatString, ##__VA_ARGS__)
+
+    // Shell color (SHC_*) macros: string literals containing terminal escape
+    // sequences. Defining NO_SHELL_COLORS replaces all of them by empty
+    // strings, so that messages are printed without any escape sequences.
+    #ifndef NO_SHELL_COLORS
+
+        // or "\e"
+        #define ESC             "\x1b"
+
+        // No Color
+        #define SHC_NC          ESC "[0m"
+
+        // NOTE(review): the *_DARK_* variants use attribute 1 ("[1;..m"),
+        // the others attribute 0 -- confirm the naming matches the intent.
+        #define SHC_BLACK       ESC "[0;30m"
+        #define SHC_MAGENTA     ESC "[0;35m"
+        #define SHC_RED         ESC "[0;31m"
+        #define SHC_DARK_RED    ESC "[1;31m"
+        #define SHC_CYAN        ESC "[0;36m"
+        #define SHC_BROWN       ESC "[0;33m"
+        #define SHC_DARK_GREEN  ESC "[1;32m"
+
+    #else  // NO_SHELL_COLORS
+
+        // No Color
+        #define SHC_NC          ""
+
+        #define SHC_BLACK       ""
+        #define SHC_MAGENTA     ""
+        #define SHC_RED         ""
+        #define SHC_DARK_RED    ""
+        #define SHC_CYAN        ""
+        #define SHC_BROWN       ""
+        #define SHC_DARK_GREEN  ""
+
+    #endif  // NO_SHELL_COLORS
+
+
+    // Number of elements of the array x. Only valid for real arrays, not for
+    // pointers (for a pointer sizeof(x) is the size of the pointer itself).
+    #define N_ELEMS(x)  (sizeof(x) / sizeof((x)[0]))
+
+
+       // Minimum of a and b. The selected argument is evaluated twice, so
+       // do not pass expressions with side effects.
+       #define MIN(a, b)               ((a) <= (b) ? (a) : (b))
+
+// Returns the smaller one of the two integers a and b.
+static inline int MinI(int a, int b)
+{
+       if (b < a) {
+               return b;
+       }
+
+       return a;
+}
+
+// Raises a breakpoint if a debugger is attached, else SIGTRAP is raised.
+// Implemented via the x86 "int $3" instruction, i.e. x86 specific.
+#define DEBUG_BREAK_POINT()            __asm__ ("int $3\n")
+
+// Marks a variable as intentionally unused by casting it to void.
+#define UNUSED(variable)                    (void)(variable)
+
+
+// Formats a byte count as human readable string.
+//
+// Values below 1024 are printed as plain integer followed by " b". Larger
+// values are divided by 1024 until they are below 1024 or GiB is reached
+// and are printed in "%9.2e" notation followed by KiB, MiB, or GiB.
+//
+// Returns a pointer to a static buffer, which is overwritten by the next
+// call. The result must not be freed; two calls cannot be used within the
+// same printf argument list.
+static inline char * ByteToHuman(size_t bytes)
+{
+       static const char * const units[] = { "KiB", "MiB", "GiB" };
+       static char buffer[256] = { 0 };
+
+       if (bytes < 1024) {
+               // size_t requires the "z" length modifier; "%lu" is only
+               // correct where size_t happens to be unsigned long.
+               snprintf(buffer, sizeof(buffer), "%zu b", bytes);
+               return buffer;
+       }
+
+       const size_t lastUnit = sizeof(units) / sizeof(units[0]) - 1;
+
+       double value = bytes / 1024.0;
+       size_t unit  = 0;
+
+       // Step up to the next unit as long as the value does not fit and a
+       // larger unit is available; everything beyond is reported in GiB.
+       while (value >= 1024.0 && unit < lastUnit) {
+               value /= 1024.0;
+               ++unit;
+       }
+
+       snprintf(buffer, sizeof(buffer), "%9.2e %s", value, units[unit]);
+       return buffer;
+}
+
+
+#endif // __BASE_H__
diff --git a/src/BenchKernelD3Q19.c b/src/BenchKernelD3Q19.c
new file mode 100644 (file)
index 0000000..8203d2a
--- /dev/null
@@ -0,0 +1,736 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#include "BenchKernelD3Q19Common.h"
+
+#include "Memory.h"
+#include "Vtk.h"
+
+#include <inttypes.h>
+#include <math.h>
+
+#ifdef _OPENMP
+       #include <omp.h>
+#endif
+
+void FNAME(D3Q19Kernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd)
+{      // Runs cd->MaxIterations collide-and-propagate sweeps, alternating between kd->Pdfs[0] and kd->Pdfs[1].
+       Assert(ld != NULL);
+       Assert(kernelData != NULL);
+       Assert(cd != NULL);
+       // The relaxation rate omega has to lie strictly between 0 and 2.
+       Assert(cd->Omega > 0.0);
+       Assert(cd->Omega < 2.0);
+
+       KernelData * kd = (KernelData *)kernelData;
+       // nX/nY/nZ (ld->Dims) give the extent that is swept; gDims (kd->GlobalDims) is what P_INDEX_5 indexes with.
+       // oX/oY/oZ (kd->Offsets) shift the sweep start; presumably they skip a ghost layer - TODO confirm in the kernel setup.
+       int nX = ld->Dims[0];
+       int nY = ld->Dims[1];
+       int nZ = ld->Dims[2];
+
+       int * gDims = kd->GlobalDims;
+
+       int oX = kd->Offsets[0];
+       int oY = kd->Offsets[1];
+       int oZ = kd->Offsets[2];
+       // Two relaxation rates: omegaEven is cd->Omega itself, omegaOdd = 1/(0.5 + magicParam/(1/omega - 0.5)).
+       PdfT omega = cd->Omega;
+       PdfT omegaEven = omega;
+//     PdfT omegaOdd = 8.0*((2.0-omegaEven)/(8.0-omegaEven)); //"standard" trt odd relaxation parameter
+       PdfT magicParam = 1.0/12.0;     // 1/4: best stability; 1/12: removes third-order advection error (best advection);     1/6: removes fourth-order diffusion error (best diffusion);     3/16: exact location of bounce back for poiseuille flow
+       PdfT omegaOdd = 1.0/( 0.5 + magicParam/(1.0/omega - 0.5) );
+
+       PdfT evenPart = 0.0;
+       PdfT oddPart = 0.0;
+       PdfT dir_indep_trm = 0.0;
+       // Weights: w_0 is applied to the centre PDF, w_1 to the six axis directions, w_2 to the twelve diagonal directions.
+       PdfT w_0 = 1.0 /  3.0;
+       PdfT w_1 = 1.0 / 18.0;
+       PdfT w_2 = 1.0 / 36.0;
+       // Pre-multiplied weights used by the collision below; w_1_indep and w_2_indep are recomputed for every cell.
+       PdfT w_1_x3 = w_1 * 3.0;        PdfT w_1_nine_half = w_1 * 9.0/2.0;     PdfT w_1_indep = 0.0;
+       PdfT w_2_x3 = w_2 * 3.0;        PdfT w_2_nine_half = w_2 * 9.0/2.0;     PdfT w_2_indep = 0.0;
+
+       PdfT ux, uy, uz, ui;
+       PdfT dens;
+
+       // Declare pdf_N, pdf_E, pdf_S, pdf_W, ...
+       #define X(name, idx, idxinv, x, y, z)   PdfT JOIN(pdf_,name);
+       D3Q19_LIST
+       #undef X
+       // src is read and dst is written during a sweep; the two pointers are swapped after every iteration.
+       PdfT * src = kd->Pdfs[0];
+       PdfT * dst = kd->Pdfs[1];
+       PdfT * tmp;
+
+       int maxIterations = cd->MaxIterations;
+       // Optionally write the initial state (only compiled in when VTK_OUTPUT is defined).
+       #ifdef VTK_OUTPUT
+       if (cd->VtkOutput) {
+               kd->PdfsActive = src;
+               VtkWrite(ld, kd, cd, 0);
+       }
+       #endif
+
+       for (int iter = 0; iter < maxIterations; ++iter) {
+               // Visit every cell with oX <= x < nX + oX (likewise for y and z); with OpenMP the three loops are collapsed.
+               #ifdef _OPENMP
+               #pragma omp parallel for collapse(3) default(none) \
+                               shared(gDims,src, dst, w_0, w_1, w_2, omegaEven, omegaOdd, \
+                               w_1_x3, w_2_x3, w_1_nine_half, w_2_nine_half, cd, \
+                               oX, oY, oZ, nX, nY, nZ) \
+                               private(ux, uy, uz, ui, dens, dir_indep_trm, \
+                                       pdf_C, \
+                                       pdf_N, pdf_E, pdf_S, pdf_W, \
+                                       pdf_NE, pdf_SE, pdf_SW, pdf_NW, \
+                                       pdf_T, pdf_TN, pdf_TE, pdf_TS, pdf_TW, \
+                                       pdf_B, pdf_BN, pdf_BE, pdf_BS, pdf_BW, \
+                                       evenPart, oddPart, w_1_indep, w_2_indep)
+               #endif
+
+               for (int z = oZ; z < nZ + oZ; ++z) {
+                       for (int y = oY; y < nY + oY; ++y) {
+                               for (int x = oX; x < nX + oX; ++x) {
+                                       #define I(x, y, z, dir) P_INDEX_5(gDims, (x), (y), (z), (dir))
+                                       // I() turns (x, y, z, direction) into a flat index into src/dst via P_INDEX_5 and gDims.
+#ifdef PROP_MODEL_PUSH
+
+                                       // Load PDFs of local cell: pdf_N = src[I(x, y, z, D3Q19_N)]; ...
+                                       #define X(name, idx, idxinv, _x, _y, _z)        JOIN(pdf_,name) = src[I(x, y, z, idx)];
+                                       //if (isnan(JOIN(pdf_,name))) { printf("iter: %d %d %d %d %d %s nan\n", iter, x-oX, y-oY, z-oZ, idx, D3Q19_NAMES[idx]); exit(1);}
+                                       D3Q19_LIST
+                                       #undef X
+
+#elif PROP_MODEL_PULL
+
+                                       // Load PDFs from neighboring cells: pdf_N = src[I(x - _x, y - _y, z - _z, D3Q19_N)]; ... with (_x, _y, _z) taken from D3Q19_LIST.
+                                       #define X(name, idx, idxinv, _x, _y, _z)        JOIN(pdf_,name) = src[I(x - _x, y - _y, z - _z, idx)];
+                                       //if (isnan(JOIN(pdf_,name))) { printf("iter: %d %d %d %d %d %s nan\n", iter, x-oX, y-oY, z-oZ, idx, D3Q19_NAMES[idx]); exit(1);}
+                                       D3Q19_LIST
+                                       #undef X
+
+#else
+       #error No implementation for PROP_MODEL_NAME.
+#endif
+
+                                       // #define LID_DRIVEN_CAVITY
+                                       // With LID_DRIVEN_CAVITY defined, a fixed velocity replaces the computed one on an interior patch of the plane z == nZ - 4 + oZ.
+                                       #ifdef LID_DRIVEN_CAVITY
+
+                                       if (z == nZ - 4 + oZ && x > 3 + oX && x < (nX - 4 + oX) && y > 3 + oY  && y < (nY - 4 + oY)) {
+                                               ux = 0.1 * 0.577;
+                                               uy = 0.0;
+                                               uz = 0.0;
+
+                                       } else {
+                                       #endif
+                                               ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE -
+                                                       pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW;
+                                               uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN -
+                                                       pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS;
+                                               uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS -
+                                                       pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS;
+                                       #ifdef LID_DRIVEN_CAVITY
+                                       }
+
+                                       #endif
+                                       // ux, uy, uz above are plain signed sums of the PDFs (not divided by dens); dens is the sum of all 19 PDFs.
+                                       dens = pdf_C +
+                                                       pdf_N  + pdf_E  + pdf_S  + pdf_W  +
+                                                       pdf_NE + pdf_SE + pdf_SW + pdf_NW +
+                                                       pdf_T  + pdf_TN + pdf_TE + pdf_TS + pdf_TW +
+                                                       pdf_B  + pdf_BN + pdf_BE + pdf_BS + pdf_BW;
+                                       // Term shared by all directions; it is scaled by w_0, w_1 or w_2 below.
+                                       dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz)*3.0/2.0;
+
+#ifdef PROP_MODEL_PUSH
+                                       // Push: each post-collision value is stored in the neighboring cell that lies in its direction.
+                                       // direction: w_0
+                                       dst[I(x,   y,   z, D3Q19_C)]  = pdf_C - omegaEven*(pdf_C - w_0*dir_indep_trm);
+
+                                       // direction: w_1
+                                       w_1_indep = w_1*dir_indep_trm;
+                                       // Opposite directions are relaxed as a pair: half their sum with omegaEven, half their difference with omegaOdd.
+                                       ui = uy;
+                                       evenPart = omegaEven*( 0.5*(pdf_N + pdf_S) - ui*ui*w_1_nine_half - w_1_indep );
+                                       oddPart = omegaOdd*(0.5*(pdf_N - pdf_S) - ui*w_1_x3 );
+                                       dst[I(x, y + 1,   z, D3Q19_N)]  = pdf_N - evenPart - oddPart;
+                                       dst[I(x, y - 1,   z, D3Q19_S)]  = pdf_S - evenPart + oddPart;
+
+                                       ui = ux;
+                                       evenPart = omegaEven*( 0.5*(pdf_E + pdf_W) - ui*ui*w_1_nine_half - w_1_indep );
+                                       oddPart = omegaOdd*(0.5*(pdf_E - pdf_W) - ui*w_1_x3 );
+                                       dst[I(x + 1,   y,   z, D3Q19_E)]  = pdf_E - evenPart - oddPart;
+                                       dst[I(x - 1,   y,   z, D3Q19_W)]  = pdf_W - evenPart + oddPart;
+
+                                       ui = uz;
+                                       evenPart = omegaEven*( 0.5*(pdf_T + pdf_B) - ui*ui*w_1_nine_half - w_1_indep );
+                                       oddPart = omegaOdd*(0.5*(pdf_T - pdf_B) - ui*w_1_x3 );
+                                       dst[I(x,   y, z + 1, D3Q19_T)]  = pdf_T - evenPart - oddPart;
+                                       dst[I(x,   y, z - 1, D3Q19_B)]  = pdf_B - evenPart + oddPart;
+
+                                       // direction: w_2
+                                       w_2_indep = w_2*dir_indep_trm;
+
+                                       ui = -ux + uy;
+                                       evenPart = omegaEven*( 0.5*(pdf_NW + pdf_SE) - ui*ui*w_2_nine_half - w_2_indep );
+                                       oddPart = omegaOdd*(0.5*(pdf_NW - pdf_SE) - ui*w_2_x3 );
+                                       dst[I(x - 1, y + 1,   z, D3Q19_NW)] = pdf_NW - evenPart - oddPart;
+                                       dst[I(x + 1, y - 1,   z, D3Q19_SE)] = pdf_SE - evenPart + oddPart;
+
+                                       ui = ux + uy;
+                                       evenPart = omegaEven*( 0.5*(pdf_NE + pdf_SW) - ui*ui*w_2_nine_half - w_2_indep );
+                                       oddPart = omegaOdd*(0.5*(pdf_NE - pdf_SW) - ui*w_2_x3 );
+                                       dst[I(x + 1, y + 1,   z, D3Q19_NE)] = pdf_NE - evenPart - oddPart;
+                                       dst[I(x - 1, y - 1,   z, D3Q19_SW)] = pdf_SW - evenPart + oddPart;
+
+                                       ui = -ux + uz;
+                                       evenPart = omegaEven*( 0.5*(pdf_TW + pdf_BE) - ui*ui*w_2_nine_half - w_2_indep );
+                                       oddPart = omegaOdd*(0.5*(pdf_TW - pdf_BE) - ui*w_2_x3 );
+                                       dst[I(x - 1,   y, z + 1, D3Q19_TW)] = pdf_TW - evenPart - oddPart;
+                                       dst[I(x + 1,   y, z - 1, D3Q19_BE)] = pdf_BE - evenPart + oddPart;
+
+                                       ui = ux + uz;
+                                       evenPart = omegaEven*( 0.5*(pdf_TE + pdf_BW) - ui*ui*w_2_nine_half - w_2_indep );
+                                       oddPart = omegaOdd*(0.5*(pdf_TE - pdf_BW) - ui*w_2_x3 );
+                                       dst[I(x + 1,   y, z + 1, D3Q19_TE)] = pdf_TE - evenPart - oddPart;
+                                       dst[I(x - 1,   y, z - 1, D3Q19_BW)] = pdf_BW - evenPart + oddPart;
+
+                                       ui = -uy + uz;
+                                       evenPart = omegaEven*( 0.5*(pdf_TS + pdf_BN) - ui*ui*w_2_nine_half - w_2_indep );
+                                       oddPart = omegaOdd*(0.5*(pdf_TS - pdf_BN) - ui*w_2_x3 );
+                                       dst[I(x, y - 1, z + 1, D3Q19_TS)] = pdf_TS - evenPart - oddPart;
+                                       dst[I(x, y + 1, z - 1, D3Q19_BN)] = pdf_BN - evenPart + oddPart;
+
+                                       ui = uy + uz;
+                                       evenPart = omegaEven*( 0.5*(pdf_TN + pdf_BS) - ui*ui*w_2_nine_half - w_2_indep );
+                                       oddPart = omegaOdd*(0.5*(pdf_TN - pdf_BS) - ui*w_2_x3 );
+                                       dst[I(x, y + 1, z + 1, D3Q19_TN)] = pdf_TN - evenPart - oddPart;
+                                       dst[I(x, y - 1, z - 1, D3Q19_BS)] = pdf_BS - evenPart + oddPart;
+
+#elif PROP_MODEL_PULL
+                                       // Pull: the shift was applied while loading, so every result is stored in the local cell (x, y, z).
+                                       // direction: w_0
+                                       dst[I(x,   y,   z, D3Q19_C)]  = pdf_C - omegaEven*(pdf_C - w_0*dir_indep_trm);
+
+                                       // direction: w_1
+                                       w_1_indep = w_1*dir_indep_trm;
+
+                                       ui = uy;
+                                       evenPart = omegaEven*( 0.5*(pdf_N + pdf_S) - ui*ui*w_1_nine_half - w_1_indep );
+                                       oddPart = omegaOdd*(0.5*(pdf_N - pdf_S) - ui*w_1_x3 );
+                                       dst[I(x, y, z, D3Q19_N)]  = pdf_N - evenPart - oddPart;
+                                       dst[I(x, y, z, D3Q19_S)]  = pdf_S - evenPart + oddPart;
+
+                                       ui = ux;
+                                       evenPart = omegaEven*( 0.5*(pdf_E + pdf_W) - ui*ui*w_1_nine_half - w_1_indep );
+                                       oddPart = omegaOdd*(0.5*(pdf_E - pdf_W) - ui*w_1_x3 );
+                                       dst[I(x, y, z, D3Q19_E)]  = pdf_E - evenPart - oddPart;
+                                       dst[I(x, y, z, D3Q19_W)]  = pdf_W - evenPart + oddPart;
+
+                                       ui = uz;
+                                       evenPart = omegaEven*( 0.5*(pdf_T + pdf_B) - ui*ui*w_1_nine_half - w_1_indep );
+                                       oddPart = omegaOdd*(0.5*(pdf_T - pdf_B) - ui*w_1_x3 );
+                                       dst[I(x, y, z, D3Q19_T)]  = pdf_T - evenPart - oddPart;
+                                       dst[I(x, y, z, D3Q19_B)]  = pdf_B - evenPart + oddPart;
+
+                                       // direction: w_2
+                                       w_2_indep = w_2*dir_indep_trm;
+
+                                       ui = -ux + uy;
+                                       evenPart = omegaEven*( 0.5*(pdf_NW + pdf_SE) - ui*ui*w_2_nine_half - w_2_indep );
+                                       oddPart = omegaOdd*(0.5*(pdf_NW - pdf_SE) - ui*w_2_x3 );
+                                       dst[I(x, y, z, D3Q19_NW)] = pdf_NW - evenPart - oddPart;
+                                       dst[I(x, y, z, D3Q19_SE)] = pdf_SE - evenPart + oddPart;
+
+                                       ui = ux + uy;
+                                       evenPart = omegaEven*( 0.5*(pdf_NE + pdf_SW) - ui*ui*w_2_nine_half - w_2_indep );
+                                       oddPart = omegaOdd*(0.5*(pdf_NE - pdf_SW) - ui*w_2_x3 );
+                                       dst[I(x, y, z, D3Q19_NE)] = pdf_NE - evenPart - oddPart;
+                                       dst[I(x, y, z, D3Q19_SW)] = pdf_SW - evenPart + oddPart;
+
+                                       ui = -ux + uz;
+                                       evenPart = omegaEven*( 0.5*(pdf_TW + pdf_BE) - ui*ui*w_2_nine_half - w_2_indep );
+                                       oddPart = omegaOdd*(0.5*(pdf_TW - pdf_BE) - ui*w_2_x3 );
+                                       dst[I(x, y, z, D3Q19_TW)] = pdf_TW - evenPart - oddPart;
+                                       dst[I(x, y, z, D3Q19_BE)] = pdf_BE - evenPart + oddPart;
+
+                                       ui = ux + uz;
+                                       evenPart = omegaEven*( 0.5*(pdf_TE + pdf_BW) - ui*ui*w_2_nine_half - w_2_indep );
+                                       oddPart = omegaOdd*(0.5*(pdf_TE - pdf_BW) - ui*w_2_x3 );
+                                       dst[I(x, y, z, D3Q19_TE)] = pdf_TE - evenPart - oddPart;
+                                       dst[I(x, y, z, D3Q19_BW)] = pdf_BW - evenPart + oddPart;
+
+                                       ui = -uy + uz;
+                                       evenPart = omegaEven*( 0.5*(pdf_TS + pdf_BN) - ui*ui*w_2_nine_half - w_2_indep );
+                                       oddPart = omegaOdd*(0.5*(pdf_TS - pdf_BN) - ui*w_2_x3 );
+                                       dst[I(x, y, z, D3Q19_TS)] = pdf_TS - evenPart - oddPart;
+                                       dst[I(x, y, z, D3Q19_BN)] = pdf_BN - evenPart + oddPart;
+
+                                       ui = uy + uz;
+                                       evenPart = omegaEven*( 0.5*(pdf_TN + pdf_BS) - ui*ui*w_2_nine_half - w_2_indep );
+                                       oddPart = omegaOdd*(0.5*(pdf_TN - pdf_BS) - ui*w_2_x3 );
+                                       dst[I(x, y, z, D3Q19_TN)] = pdf_TN - evenPart - oddPart;
+                                       dst[I(x, y, z, D3Q19_BS)] = pdf_BS - evenPart + oddPart;
+
+#else
+       #error No implementation for PROP_MODEL_NAME.
+#endif
+
+                                       #undef I
+                               }
+                       }
+               } // end of the z loop (nesting is z, y, x from outer to inner)
+
+               // Fixup bounce back PDFs: copy dst[BounceBackPdfsSrc[i]] to dst[BounceBackPdfsDst[i]] for each of the nBounceBackPdfs entries.
+               #ifdef _OPENMP
+               #pragma omp parallel for default(none) \
+                               shared(kd, dst)
+               #endif
+               for (int i = 0; i < kd->nBounceBackPdfs; ++i) {
+                       dst[kd->BounceBackPdfsDst[i]] = dst[kd->BounceBackPdfsSrc[i]];
+               }
+               // Each optional hook below is pointed at the grid written in this iteration via kd->PdfsActive.
+               #ifdef VERIFICATION
+               kd->PdfsActive = dst;
+               KernelAddBodyForce(kd, ld, cd);
+               #endif
+
+               #ifdef VTK_OUTPUT
+
+               if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) {
+                       kd->PdfsActive = dst;
+                       VtkWrite(ld, kd, cd, iter);
+               }
+
+               #endif
+
+               #ifdef STATISTICS
+               kd->PdfsActive = dst;
+               KernelStatistics(kd, ld, cd, iter);
+               #endif
+
+               // swap grids: the grid written in this iteration becomes the input of the next one
+               tmp = src;
+               src = dst;
+               dst = tmp;
+
+       } // for (int iter = 0; ...
+       // Because of the swap at the end of each iteration, src refers to the most recent state here.
+       #ifdef VTK_OUTPUT
+
+       if (cd->VtkOutput) {
+               kd->PdfsActive = src;
+               VtkWrite(ld, kd, cd, maxIterations);
+       }
+
+       #endif
+
+       return;
+}
+
+
+void FNAME(D3Q19BlkKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd)
+{
+       Assert(ld != NULL);
+       Assert(kernelData != NULL);
+       Assert(cd != NULL);
+
+       Assert(cd->Omega > 0.0);
+       Assert(cd->Omega < 2.0);
+
+       KernelData * kd = (KernelData *)kernelData;
+
+
+       int nX = ld->Dims[0];
+       int nY = ld->Dims[1];
+       int nZ = ld->Dims[2];
+
+       int * gDims = kd->GlobalDims;
+
+       int oX = kd->Offsets[0];
+       int oY = kd->Offsets[1];
+       int oZ = kd->Offsets[2];
+
+       KernelDataEx * kdex = (KernelDataEx *)kd;
+
+       int blk[3];
+       blk[0] = kdex->Blk[0];
+       blk[1] = kdex->Blk[1];
+       blk[2] = kdex->Blk[2];
+
+       PdfT omega = cd->Omega;
+       PdfT omegaEven = omega;
+//     PdfT omegaOdd = 8.0*((2.0-omegaEven)/(8.0-omegaEven)); //"standard" trt odd relaxation parameter
+       PdfT magicParam = 1.0/12.0;     // 1/4: best stability; 1/12: removes third-order advection error (best advection);     1/6: removes fourth-order diffusion error (best diffusion);     3/16: exact location of bounce back for poiseuille flow
+       PdfT omegaOdd = 1.0/( 0.5 + magicParam/(1.0/omega - 0.5) );
+
+       PdfT evenPart = 0.0;
+       PdfT oddPart = 0.0;
+       PdfT dir_indep_trm = 0.0;
+
+       PdfT w_0 = 1.0 /  3.0;
+       PdfT w_1 = 1.0 / 18.0;
+       PdfT w_2 = 1.0 / 36.0;
+
+       PdfT w_1_x3 = w_1 * 3.0;        PdfT w_1_nine_half = w_1 * 9.0/2.0;     PdfT w_1_indep = 0.0;
+       PdfT w_2_x3 = w_2 * 3.0;        PdfT w_2_nine_half = w_2 * 9.0/2.0;     PdfT w_2_indep = 0.0;
+
+       PdfT ux, uy, uz, ui;
+       PdfT dens;
+
+       // Declare pdf_N, pdf_E, pdf_S, pdf_W, ...
+       #define X(name, idx, idxinv, x, y, z)   PdfT JOIN(pdf_,name);
+       D3Q19_LIST
+       #undef X
+
+       PdfT * src = kd->Pdfs[0];
+       PdfT * dst = kd->Pdfs[1];
+       PdfT * tmp;
+
+       int maxIterations = cd->MaxIterations;
+
+       #ifdef VTK_OUTPUT
+       if (cd->VtkOutput) {
+               kd->PdfsActive = src;
+               VtkWrite(ld, kd, cd, 0);
+       }
+       #endif
+
+       int nThreads = 1;
+
+       #ifdef _OPENMP
+       nThreads = omp_get_max_threads();
+       #endif
+
+       for (int iter = 0; iter < maxIterations; ++iter) {
+
+               // #ifdef _OPENMP --> add line continuation
+               // #pragma omp parallel for collapse(3) default(none)
+               //              shared(gDims,src, dst, w_0, w_1, w_2, omegaEven, omegaOdd,
+               //              w_1_x3, w_2_x3, w_1_nine_half, w_2_nine_half, cd,
+               //              oX, oY, oZ, nX, nY, nZ, blk)
+               //              private(ux, uy, uz, ui, dens, dir_indep_trm,
+               //                      pdf_C,
+               //                      pdf_N, pdf_E, pdf_S, pdf_W,
+               //                      pdf_NE, pdf_SE, pdf_SW, pdf_NW,
+               //                      pdf_T, pdf_TN, pdf_TE, pdf_TS, pdf_TW,
+               //                      pdf_B, pdf_BN, pdf_BE, pdf_BS, pdf_BW,
+               //                      evenPart, oddPart, w_1_indep, w_2_indep)
+               // #endif
+               #ifdef _OPENMP
+               #pragma omp parallel for default(none) \
+                               shared(gDims,src, dst, w_0, w_1, w_2, omegaEven, omegaOdd, \
+                               w_1_x3, w_2_x3, w_1_nine_half, w_2_nine_half, cd, \
+                               oX, oY, oZ, nX, nY, nZ, blk, nThreads) \
+                               private(ux, uy, uz, ui, dens, dir_indep_trm, \
+                                       pdf_C, \
+                                       pdf_N, pdf_E, pdf_S, pdf_W, \
+                                       pdf_NE, pdf_SE, pdf_SW, pdf_NW, \
+                                       pdf_T, pdf_TN, pdf_TE, pdf_TS, pdf_TW, \
+                                       pdf_B, pdf_BN, pdf_BE, pdf_BS, pdf_BW, \
+                                       evenPart, oddPart, w_1_indep, w_2_indep)
+               #endif
+
+               for (int i = 0; i < nThreads; ++i) {
+
+                       int threadStartX = nX / nThreads * i;
+                       int threadEndX   = nX / nThreads * (i + 1);
+
+                       if (nX % nThreads > 0) {
+                               if (nX % nThreads > i) {
+                                       threadStartX += i;
+                                       threadEndX   += i + 1;
+                               }
+                               else {
+                                       threadStartX += nX % nThreads;
+                                       threadEndX   += nX % nThreads;
+                               }
+                       }
+
+               // for (int z = oZ; z < nZ + oZ; ++z) {
+               //      for (int y = oY; y < nY + oY; ++y) {
+               //              for (int x = oX; x < nX + oX; ++x) {
+               for (int bZ = oZ; bZ < nZ + oZ; bZ += blk[2]) {
+                       for (int bY = oY; bY < nY + oY; bY += blk[1]) {
+                               for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) {
+                               // for (int bX = oX; bX < nX + oX; bX += blk[0]) {
+
+                                       // Must do everything here, else it would break collapse.
+                                       int eZ = MIN(bZ + blk[2], nZ + oZ);
+                                       int eY = MIN(bY + blk[1], nY + oY);
+                                       int eX = MIN(bX + blk[0], threadEndX + oX);
+
+//                                     printf("%d: %d-%d  %d-%d  %d-%d  %d - %d\n", omp_get_thread_num(), bZ, eZ, bY, eY, bX, eX, threadStartX, threadEndX);
+
+                                       for (int z = bZ; z < eZ; ++z) {
+                                               for (int y = bY; y < eY; ++y) {
+                                                       for (int x = bX; x < eX; ++x) {
+
+                                       #define I(x, y, z, dir) P_INDEX_5(gDims, (x), (y), (z), (dir))
+
+#ifdef PROP_MODEL_PUSH
+
+                                       // Load PDFs of local cell: pdf_N = src[I(x, y, z, D3Q19_N)]; ...
+                                       #define X(name, idx, idxinv, _x, _y, _z)        JOIN(pdf_,name) = src[I(x, y, z, idx)];
+                                       //if (isnan(JOIN(pdf_,name))) { printf("iter: %d %d %d %d %d %s nan\n", iter, x-oX, y-oY, z-oZ, idx, D3Q19_NAMES[idx]); exit(1);}
+                                       D3Q19_LIST
+                                       #undef X
+
+#elif PROP_MODEL_PULL
+
+                                       // Load PDFs of local cell: pdf_N = src[I(x, y, z, D3Q19_N)]; ...
+                                       #define X(name, idx, idxinv, _x, _y, _z)        JOIN(pdf_,name) = src[I(x - _x, y - _y, z - _z, idx)];
+                                       //if (isnan(JOIN(pdf_,name))) { printf("iter: %d %d %d %d %d %s nan\n", iter, x-oX, y-oY, z-oZ, idx, D3Q19_NAMES[idx]); exit(1);}
+                                       D3Q19_LIST
+                                       #undef X
+
+#else
+       #error No implementation for PROP_MODEL_NAME.
+#endif
+
+                                       // #define LID_DRIVEN_CAVITY
+
+                                       #ifdef LID_DRIVEN_CAVITY
+
+                                       if (z == nZ - 4 + oZ && x > 3 + oX && x < (nX - 4 + oX) && y > 3 + oY  && y < (nY - 4 + oY)) {
+                                               ux = 0.1 * 0.577;
+                                               uy = 0.0;
+                                               uz = 0.0;
+
+                                       } else {
+                                       #endif
+                                               ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE -
+                                                       pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW;
+                                               uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN -
+                                                       pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS;
+                                               uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS -
+                                                       pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS;
+                                       #ifdef LID_DRIVEN_CAVITY
+                                       }
+
+                                       #endif
+
+                                       dens = pdf_C +
+                                                       pdf_N  + pdf_E  + pdf_S  + pdf_W  +
+                                                       pdf_NE + pdf_SE + pdf_SW + pdf_NW +
+                                                       pdf_T  + pdf_TN + pdf_TE + pdf_TS + pdf_TW +
+                                                       pdf_B  + pdf_BN + pdf_BE + pdf_BS + pdf_BW;
+
+                                       dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz)*3.0/2.0;
+
+#ifdef PROP_MODEL_PUSH
+
+                                       // direction: w_0
+                                       dst[I(x,   y,   z, D3Q19_C)]  = pdf_C - omegaEven*(pdf_C - w_0*dir_indep_trm);
+
+                                       // direction: w_1
+                                       w_1_indep = w_1*dir_indep_trm;
+
+                                       ui = uy;
+                                       evenPart = omegaEven*( 0.5*(pdf_N + pdf_S) - ui*ui*w_1_nine_half - w_1_indep );
+                                       oddPart = omegaOdd*(0.5*(pdf_N - pdf_S) - ui*w_1_x3 );
+                                       dst[I(x, y + 1,   z, D3Q19_N)]  = pdf_N - evenPart - oddPart;
+                                       dst[I(x, y - 1,   z, D3Q19_S)]  = pdf_S - evenPart + oddPart;
+
+                                       ui = ux;
+                                       evenPart = omegaEven*( 0.5*(pdf_E + pdf_W) - ui*ui*w_1_nine_half - w_1_indep );
+                                       oddPart = omegaOdd*(0.5*(pdf_E - pdf_W) - ui*w_1_x3 );
+                                       dst[I(x + 1,   y,   z, D3Q19_E)]  = pdf_E - evenPart - oddPart;
+                                       dst[I(x - 1,   y,   z, D3Q19_W)]  = pdf_W - evenPart + oddPart;
+
+                                       ui = uz;
+                                       evenPart = omegaEven*( 0.5*(pdf_T + pdf_B) - ui*ui*w_1_nine_half - w_1_indep );
+                                       oddPart = omegaOdd*(0.5*(pdf_T - pdf_B) - ui*w_1_x3 );
+                                       dst[I(x,   y, z + 1, D3Q19_T)]  = pdf_T - evenPart - oddPart;
+                                       dst[I(x,   y, z - 1, D3Q19_B)]  = pdf_B - evenPart + oddPart;
+
+                                       // direction: w_2
+                                       w_2_indep = w_2*dir_indep_trm;
+
+                                       ui = -ux + uy;
+                                       evenPart = omegaEven*( 0.5*(pdf_NW + pdf_SE) - ui*ui*w_2_nine_half - w_2_indep );
+                                       oddPart = omegaOdd*(0.5*(pdf_NW - pdf_SE) - ui*w_2_x3 );
+                                       dst[I(x - 1, y + 1,   z, D3Q19_NW)] = pdf_NW - evenPart - oddPart;
+                                       dst[I(x + 1, y - 1,   z, D3Q19_SE)] = pdf_SE - evenPart + oddPart;
+
+                                       ui = ux + uy;
+                                       evenPart = omegaEven*( 0.5*(pdf_NE + pdf_SW) - ui*ui*w_2_nine_half - w_2_indep );
+                                       oddPart = omegaOdd*(0.5*(pdf_NE - pdf_SW) - ui*w_2_x3 );
+                                       dst[I(x + 1, y + 1,   z, D3Q19_NE)] = pdf_NE - evenPart - oddPart;
+                                       dst[I(x - 1, y - 1,   z, D3Q19_SW)] = pdf_SW - evenPart + oddPart;
+
+                                       ui = -ux + uz;
+                                       evenPart = omegaEven*( 0.5*(pdf_TW + pdf_BE) - ui*ui*w_2_nine_half - w_2_indep );
+                                       oddPart = omegaOdd*(0.5*(pdf_TW - pdf_BE) - ui*w_2_x3 );
+                                       dst[I(x - 1,   y, z + 1, D3Q19_TW)] = pdf_TW - evenPart - oddPart;
+                                       dst[I(x + 1,   y, z - 1, D3Q19_BE)] = pdf_BE - evenPart + oddPart;
+
+                                       ui = ux + uz;
+                                       evenPart = omegaEven*( 0.5*(pdf_TE + pdf_BW) - ui*ui*w_2_nine_half - w_2_indep );
+                                       oddPart = omegaOdd*(0.5*(pdf_TE - pdf_BW) - ui*w_2_x3 );
+                                       dst[I(x + 1,   y, z + 1, D3Q19_TE)] = pdf_TE - evenPart - oddPart;
+                                       dst[I(x - 1,   y, z - 1, D3Q19_BW)] = pdf_BW - evenPart + oddPart;
+
+                                       ui = -uy + uz;
+                                       evenPart = omegaEven*( 0.5*(pdf_TS + pdf_BN) - ui*ui*w_2_nine_half - w_2_indep );
+                                       oddPart = omegaOdd*(0.5*(pdf_TS - pdf_BN) - ui*w_2_x3 );
+                                       dst[I(x, y - 1, z + 1, D3Q19_TS)] = pdf_TS - evenPart - oddPart;
+                                       dst[I(x, y + 1, z - 1, D3Q19_BN)] = pdf_BN - evenPart + oddPart;
+
+                                       ui = uy + uz;
+                                       evenPart = omegaEven*( 0.5*(pdf_TN + pdf_BS) - ui*ui*w_2_nine_half - w_2_indep );
+                                       oddPart = omegaOdd*(0.5*(pdf_TN - pdf_BS) - ui*w_2_x3 );
+                                       dst[I(x, y + 1, z + 1, D3Q19_TN)] = pdf_TN - evenPart - oddPart;
+                                       dst[I(x, y - 1, z - 1, D3Q19_BS)] = pdf_BS - evenPart + oddPart;
+
+#elif PROP_MODEL_PULL
+
+                                       // direction: w_0
+                                       dst[I(x,   y,   z, D3Q19_C)]  = pdf_C - omegaEven*(pdf_C - w_0*dir_indep_trm);
+
+                                       // direction: w_1
+                                       w_1_indep = w_1*dir_indep_trm;
+
+                                       ui = uy;
+                                       evenPart = omegaEven*( 0.5*(pdf_N + pdf_S) - ui*ui*w_1_nine_half - w_1_indep );
+                                       oddPart = omegaOdd*(0.5*(pdf_N - pdf_S) - ui*w_1_x3 );
+                                       dst[I(x, y, z, D3Q19_N)]  = pdf_N - evenPart - oddPart;
+                                       dst[I(x, y, z, D3Q19_S)]  = pdf_S - evenPart + oddPart;
+
+                                       ui = ux;
+                                       evenPart = omegaEven*( 0.5*(pdf_E + pdf_W) - ui*ui*w_1_nine_half - w_1_indep );
+                                       oddPart = omegaOdd*(0.5*(pdf_E - pdf_W) - ui*w_1_x3 );
+                                       dst[I(x, y, z, D3Q19_E)]  = pdf_E - evenPart - oddPart;
+                                       dst[I(x, y, z, D3Q19_W)]  = pdf_W - evenPart + oddPart;
+
+                                       ui = uz;
+                                       evenPart = omegaEven*( 0.5*(pdf_T + pdf_B) - ui*ui*w_1_nine_half - w_1_indep );
+                                       oddPart = omegaOdd*(0.5*(pdf_T - pdf_B) - ui*w_1_x3 );
+                                       dst[I(x, y, z, D3Q19_T)]  = pdf_T - evenPart - oddPart;
+                                       dst[I(x, y, z, D3Q19_B)]  = pdf_B - evenPart + oddPart;
+
+                                       // direction: w_2
+                                       w_2_indep = w_2*dir_indep_trm;
+
+                                       ui = -ux + uy;
+                                       evenPart = omegaEven*( 0.5*(pdf_NW + pdf_SE) - ui*ui*w_2_nine_half - w_2_indep );
+                                       oddPart = omegaOdd*(0.5*(pdf_NW - pdf_SE) - ui*w_2_x3 );
+                                       dst[I(x, y, z, D3Q19_NW)] = pdf_NW - evenPart - oddPart;
+                                       dst[I(x, y, z, D3Q19_SE)] = pdf_SE - evenPart + oddPart;
+
+                                       ui = ux + uy;
+                                       evenPart = omegaEven*( 0.5*(pdf_NE + pdf_SW) - ui*ui*w_2_nine_half - w_2_indep );
+                                       oddPart = omegaOdd*(0.5*(pdf_NE - pdf_SW) - ui*w_2_x3 );
+                                       dst[I(x, y, z, D3Q19_NE)] = pdf_NE - evenPart - oddPart;
+                                       dst[I(x, y, z, D3Q19_SW)] = pdf_SW - evenPart + oddPart;
+
+                                       ui = -ux + uz;
+                                       evenPart = omegaEven*( 0.5*(pdf_TW + pdf_BE) - ui*ui*w_2_nine_half - w_2_indep );
+                                       oddPart = omegaOdd*(0.5*(pdf_TW - pdf_BE) - ui*w_2_x3 );
+                                       dst[I(x, y, z, D3Q19_TW)] = pdf_TW - evenPart - oddPart;
+                                       dst[I(x, y, z, D3Q19_BE)] = pdf_BE - evenPart + oddPart;
+
+                                       ui = ux + uz;
+                                       evenPart = omegaEven*( 0.5*(pdf_TE + pdf_BW) - ui*ui*w_2_nine_half - w_2_indep );
+                                       oddPart = omegaOdd*(0.5*(pdf_TE - pdf_BW) - ui*w_2_x3 );
+                                       dst[I(x, y, z, D3Q19_TE)] = pdf_TE - evenPart - oddPart;
+                                       dst[I(x, y, z, D3Q19_BW)] = pdf_BW - evenPart + oddPart;
+
+                                       ui = -uy + uz;
+                                       evenPart = omegaEven*( 0.5*(pdf_TS + pdf_BN) - ui*ui*w_2_nine_half - w_2_indep );
+                                       oddPart = omegaOdd*(0.5*(pdf_TS - pdf_BN) - ui*w_2_x3 );
+                                       dst[I(x, y, z, D3Q19_TS)] = pdf_TS - evenPart - oddPart;
+                                       dst[I(x, y, z, D3Q19_BN)] = pdf_BN - evenPart + oddPart;
+
+                                       ui = uy + uz;
+                                       evenPart = omegaEven*( 0.5*(pdf_TN + pdf_BS) - ui*ui*w_2_nine_half - w_2_indep );
+                                       oddPart = omegaOdd*(0.5*(pdf_TN - pdf_BS) - ui*w_2_x3 );
+                                       dst[I(x, y, z, D3Q19_TN)] = pdf_TN - evenPart - oddPart;
+                                       dst[I(x, y, z, D3Q19_BS)] = pdf_BS - evenPart + oddPart;
+
+#else
+       #error No implementation for PROP_MODEL_NAME.
+#endif
+
+                                       #undef I
+                               }
+                       }
+               } // z, y, x (from inner to outer)
+                               }
+                       }
+               } // z, y, x (from inner to outer)
+
+               } // loop over threads
+
+               // Fixup bounce back PDFs.
+               #ifdef _OPENMP
+               #pragma omp parallel for default(none) \
+                               shared(kd, dst)
+               #endif
+               for (int i = 0; i < kd->nBounceBackPdfs; ++i) {
+                       dst[kd->BounceBackPdfsDst[i]] = dst[kd->BounceBackPdfsSrc[i]];
+               }
+
+               #ifdef VERIFICATION
+               kd->PdfsActive = dst;
+               KernelAddBodyForce(kd, ld, cd);
+               #endif
+
+               #ifdef VTK_OUTPUT
+
+               if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) {
+                       kd->PdfsActive = dst;
+                       VtkWrite(ld, kd, cd, iter);
+               }
+
+               #endif
+
+               #ifdef STATISTICS
+               kd->PdfsActive = dst;
+               KernelStatistics(kd, ld, cd, iter);
+               #endif
+
+               // swap grids
+               tmp = src;
+               src = dst;
+               dst = tmp;
+
+       } // for (int iter = 0; ...
+
+       #ifdef VTK_OUTPUT
+
+       if (cd->VtkOutput) {
+               kd->PdfsActive = src;
+               VtkWrite(ld, kd, cd, maxIterations);
+       }
+
+       #endif
+
+       return;
+}
+
diff --git a/src/BenchKernelD3Q19.h b/src/BenchKernelD3Q19.h
new file mode 100644 (file)
index 0000000..8d7c127
--- /dev/null
@@ -0,0 +1,57 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#ifndef __BENCH_KERNEL_D3Q19__
+#define __BENCH_KERNEL_D3Q19__
+
+#include "Kernel.h"
+
+
+// Setup and teardown entry points of the D3Q19 kernels. One set of functions
+// exists for each combination of propagation model (Push, Pull) and data
+// layout (SoA, AoS); the suffix of every name encodes that combination.
+//
+// The Init functions allocate the kernel data and hand it out through
+// kernelData, the Deinit functions release it again.
+
+// Push propagation, kernel without blocking (accepts no kernel parameters).
+void D3Q19Init_PushSoA(LatticeDesc * ld, KernelData ** kernelData, Parameters * params);
+void D3Q19Init_PushAoS(LatticeDesc * ld, KernelData ** kernelData, Parameters * params);
+
+void D3Q19Deinit_PushSoA(LatticeDesc * ld, KernelData ** kernelData);
+void D3Q19Deinit_PushAoS(LatticeDesc * ld, KernelData ** kernelData);
+
+// Pull propagation, kernel without blocking (accepts no kernel parameters).
+void D3Q19Init_PullSoA(LatticeDesc * ld, KernelData ** kernelData, Parameters * params);
+void D3Q19Init_PullAoS(LatticeDesc * ld, KernelData ** kernelData, Parameters * params);
+
+void D3Q19Deinit_PullSoA(LatticeDesc * ld, KernelData ** kernelData);
+void D3Q19Deinit_PullAoS(LatticeDesc * ld, KernelData ** kernelData);
+
+// Push propagation, blocked kernel (evaluates the -blk* kernel parameters).
+void D3Q19BlkInit_PushSoA(LatticeDesc * ld, KernelData ** kernelData, Parameters * params);
+void D3Q19BlkInit_PushAoS(LatticeDesc * ld, KernelData ** kernelData, Parameters * params);
+
+void D3Q19BlkDeinit_PushSoA(LatticeDesc * ld, KernelData ** kernelData);
+void D3Q19BlkDeinit_PushAoS(LatticeDesc * ld, KernelData ** kernelData);
+
+// Pull propagation, blocked kernel (evaluates the -blk* kernel parameters).
+void D3Q19BlkInit_PullSoA(LatticeDesc * ld, KernelData ** kernelData, Parameters * params);
+void D3Q19BlkInit_PullAoS(LatticeDesc * ld, KernelData ** kernelData, Parameters * params);
+
+void D3Q19BlkDeinit_PullSoA(LatticeDesc * ld, KernelData ** kernelData);
+void D3Q19BlkDeinit_PullAoS(LatticeDesc * ld, KernelData ** kernelData);
+
+#endif // __BENCH_KERNEL_D3Q19__
diff --git a/src/BenchKernelD3Q19Common.c b/src/BenchKernelD3Q19Common.c
new file mode 100644 (file)
index 0000000..e697bc8
--- /dev/null
@@ -0,0 +1,655 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#include "BenchKernelD3Q19Common.h"
+
+#include "Memory.h"
+#include "Vtk.h"
+
+#include <inttypes.h>
+#include <math.h>
+
+
+// Forward declarations; the kernels themselves are defined outside of this file.
+void FNAME(D3Q19Kernel)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd);
+
+void FNAME(D3Q19BlkKernel)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd);
+
+
+
+static void FNAME(BcGetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT * pdf)
+{
+       Assert(kd != NULL);
+       Assert(kd->PdfsActive != NULL);
+       Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+       Assert(pdf != NULL);
+
+       Assert(x >= 0);
+       Assert(y >= 0);
+       Assert(z >= 0);
+       Assert(x < kd->Dims[0]);
+       Assert(y < kd->Dims[1]);
+       Assert(z < kd->Dims[2]);
+       Assert(dir >= 0);
+       Assert(dir < N_D3Q19);
+
+       int oX = kd->Offsets[0];
+       int oY = kd->Offsets[1];
+       int oZ = kd->Offsets[2];
+
+#ifdef PROP_MODEL_PUSH
+       int nx = x;
+       int ny = y;
+       int nz = z;
+#elif PROP_MODEL_PULL
+       int nx = x - D3Q19_X[dir];
+       int ny = y - D3Q19_Y[dir];
+       int nz = z - D3Q19_Z[dir];
+#endif
+
+       #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
+       *pdf = kd->PdfsActive[I(nx + oX, ny + oY, nz + oZ, dir)];
+       #undef I
+
+       return;
+}
+
+static void FNAME(BcSetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT pdf)
+{
+       Assert(kd != NULL);
+       Assert(kd->PdfsActive != NULL);
+       Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+       Assert(x >= 0);
+       Assert(y >= 0);
+       Assert(z >= 0);
+       Assert(x < kd->Dims[0]);
+       Assert(y < kd->Dims[1]);
+       Assert(z < kd->Dims[2]);
+       Assert(dir >= 0);
+       Assert(dir < N_D3Q19);
+
+       int oX = kd->Offsets[0];
+       int oY = kd->Offsets[1];
+       int oZ = kd->Offsets[2];
+
+#ifdef PROP_MODEL_PUSH
+       int nx = x;
+       int ny = y;
+       int nz = z;
+#elif PROP_MODEL_PULL
+       int nx = x - D3Q19_X[dir];
+       int ny = y - D3Q19_Y[dir];
+       int nz = z - D3Q19_Z[dir];
+#endif
+
+       #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
+       kd->PdfsActive[I(nx + oX, ny + oY, nz + oZ, dir)] = pdf;
+       #undef I
+
+
+       return;
+}
+
+
+static void FNAME(GetNode)(KernelData * kd, int x, int y, int z, PdfT * pdfs)
+{
+       Assert(kd != NULL);
+       Assert(kd->PdfsActive != NULL);
+       Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+       Assert(pdfs != NULL);
+       Assert(x >= 0);
+       Assert(y >= 0);
+       Assert(z >= 0);
+       Assert(x < kd->Dims[0]);
+       Assert(y < kd->Dims[1]);
+       Assert(z < kd->Dims[2]);
+
+       int oX = kd->Offsets[0];
+       int oY = kd->Offsets[1];
+       int oZ = kd->Offsets[2];
+
+
+       #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
+#ifdef PROP_MODEL_PUSH
+       #define X(name, idx, idxinv, _x, _y, _z)        pdfs[idx] = kd->PdfsActive[I(x + oX, y + oY, z + oZ, idx)];
+#elif PROP_MODEL_PULL
+       #define X(name, idx, idxinv, _x, _y, _z)        pdfs[idx] = kd->PdfsActive[I(x + oX - (_x), y + oY - (_y), z + oZ - (_z), idx)];
+#endif
+       D3Q19_LIST
+       #undef X
+       #undef I
+
+#if 0          // DETECT NANs
+
+       for (int d = 0; d < 19; ++d) {
+               if (isnan(pdfs[d])) {
+                       printf("%d %d %d %d nan! get node\n", x, y, z, d);
+
+                       for (int d2 = 0; d2 < 19; ++d2) {
+                               printf("%d: %e\n", d2, pdfs[d2]);
+                       }
+
+                       exit(1);
+               }
+       }
+
+#endif
+
+       return;
+}
+
+
+static void FNAME(SetNode)(KernelData * kd, int x, int y, int z, PdfT * pdfs)
+{
+       Assert(kd != NULL);
+       Assert(kd->PdfsActive != NULL);
+       Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+       Assert(pdfs != NULL);
+
+       Assert(x >= 0);
+       Assert(y >= 0);
+       Assert(z >= 0);
+       Assert(x < kd->Dims[0]);
+       Assert(y < kd->Dims[1]);
+       Assert(z < kd->Dims[2]);
+
+       int oX = kd->Offsets[0];
+       int oY = kd->Offsets[1];
+       int oZ = kd->Offsets[2];
+
+       #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir))
+#ifdef PROP_MODEL_PUSH
+       #define X(name, idx, idxinv, _x, _y, _z)        kd->PdfsActive[I(x + oX, y + oY, z + oZ, idx)] = pdfs[idx];
+#elif PROP_MODEL_PULL
+       #define X(name, idx, idxinv, _x, _y, _z)        kd->PdfsActive[I(x + oX - (_x), y + oY - (_y), z + oZ - (_z), idx)] = pdfs[idx];
+#endif
+       D3Q19_LIST
+       #undef X
+       #undef I
+
+       return;
+}
+
+
+// Prints the kernel specific command line options to stdout.
+static void ParameterUsage()
+{
+       fputs("Kernel parameters:\n", stdout);
+       fputs("  [-blk <n>] [-blk-[xyz] <n>]\n", stdout);
+}
+
+// Converts the value given for a blocking option into a positive int.
+//
+// option is the name of the option (used in error messages only) and text
+// the string to convert. The program is terminated with an error message if
+// text is not a complete integer literal, is not greater than zero, or does
+// not fit into an int.
+static int ParseBlkValue(const char * option, const char * text)
+{
+       char * end = NULL;
+       long value = strtol(text, &end, 0);
+
+       // Reject empty input and trailing garbage such as "12abc".
+       if (end == text || *end != '\0') {
+               printf("ERROR: argument %s requires an integer parameter, got '%s'.\n", option, text);
+               exit(1);
+       }
+
+       if (value <= 0) {
+               printf("ERROR: blocking parameter must be > 0.\n");
+               exit(1);
+       }
+
+       // The blocking factors are stored as int, avoid silent truncation.
+       if (value > INT32_MAX) {
+               printf("ERROR: blocking parameter %s is too large.\n", text);
+               exit(1);
+       }
+
+       return (int)value;
+}
+
+// Evaluates the kernel parameters and extracts the blocking factors.
+//
+// params  parameters holding the kernel arguments (KernelArgs, nKernelArgs)
+// blk     receives the blocking factors for X, Y, and Z; a factor for which
+//         no option was given is set to 0
+//
+// Recognized options are -blk <n> (all three directions) and -blk-x,
+// -blk-y, -blk-z <n> (single direction), each also with a leading "--".
+// On -h/-help/--help, on an unknown option, or on an invalid value a message
+// is printed and the program exits with status 1.
+static void ParseParameters(Parameters * params, int * blk)
+{
+       Assert(params != NULL);
+       Assert(blk != NULL);
+
+       blk[0] = 0; blk[1] = 0; blk[2] = 0;
+
+       #define ARG_IS(param)                   (!strcmp(params->KernelArgs[i], param))
+       #define NEXT_ARG_PRESENT() \
+               do { \
+                       if (i + 1 >= params->nKernelArgs) { \
+                               printf("ERROR: argument %s requires a parameter.\n", params->KernelArgs[i]); \
+                               exit(1); \
+                       } \
+               } while (0)
+
+
+       for (int i = 0; i < params->nKernelArgs; ++i) {
+               if (ARG_IS("-blk") || ARG_IS("--blk")) {
+                       NEXT_ARG_PRESENT();
+
+                       // Fetch the option name before advancing to its value.
+                       const char * option = params->KernelArgs[i];
+                       int value = ParseBlkValue(option, params->KernelArgs[++i]);
+
+                       blk[0] = blk[1] = blk[2] = value;
+               }
+               else if (ARG_IS("-blk-x") || ARG_IS("--blk-x")) {
+                       NEXT_ARG_PRESENT();
+
+                       const char * option = params->KernelArgs[i];
+                       blk[0] = ParseBlkValue(option, params->KernelArgs[++i]);
+               }
+               else if (ARG_IS("-blk-y") || ARG_IS("--blk-y")) {
+                       NEXT_ARG_PRESENT();
+
+                       const char * option = params->KernelArgs[i];
+                       blk[1] = ParseBlkValue(option, params->KernelArgs[++i]);
+               }
+               else if (ARG_IS("-blk-z") || ARG_IS("--blk-z")) {
+                       NEXT_ARG_PRESENT();
+
+                       const char * option = params->KernelArgs[i];
+                       blk[2] = ParseBlkValue(option, params->KernelArgs[++i]);
+               }
+               else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) {
+                       ParameterUsage();
+                       exit(1);
+               }
+               else {
+                       printf("ERROR: unknown kernel parameter '%s'.\n", params->KernelArgs[i]);
+                       ParameterUsage();
+                       exit(1);
+               }
+       }
+
+       #undef ARG_IS
+       #undef NEXT_ARG_PRESENT
+
+       return;
+}
+
+
+void FNAME(D3Q19BlkInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
+{
+       KernelDataEx * kdex = NULL;
+       MemAlloc((void **)&kdex, sizeof(KernelDataEx));
+
+       kdex->Blk[0] = 0; kdex->Blk[1] = 0; kdex->Blk[2] = 0;
+
+       KernelData * kd = &kdex->kd;
+       *kernelData = kd;
+
+       kd->nObstIndices = ld->nObst;
+
+       // Ajust the dimensions according to padding, if used.
+       kd->Dims[0] = ld->Dims[0];
+       kd->Dims[1] = ld->Dims[1];
+       kd->Dims[2] = ld->Dims[2];
+
+
+       int * lDims = ld->Dims;
+       int * gDims = kd->GlobalDims;
+
+       gDims[0] = lDims[0] + 2;
+       gDims[1] = lDims[1] + 2;
+       gDims[2] = lDims[2] + 2;
+
+       kd->Offsets[0] = 1;
+       kd->Offsets[1] = 1;
+       kd->Offsets[2] = 1;
+
+       int lX = lDims[0];
+       int lY = lDims[1];
+       int lZ = lDims[2];
+
+       int gX = gDims[0];
+       int gY = gDims[1];
+       int gZ = gDims[2];
+
+       int oX = kd->Offsets[0];
+       int oY = kd->Offsets[1];
+       int oZ = kd->Offsets[2];
+
+       int blk[3] = { 0 };
+
+       int nCells = gX * gY * gZ;
+
+       PdfT * pdfs[2];
+
+       ParseParameters(params, blk);
+
+       if (blk[0] == 0) blk[0] = gX;
+       if (blk[1] == 0) blk[1] = gY;
+       if (blk[2] == 0) blk[2] = gZ;
+
+       printf("# blocking x: %3d y: %3d z: %3d\n", blk[0], blk[1], blk[2]);
+
+
+       kdex->Blk[0] = blk[0]; kdex->Blk[1] = blk[1]; kdex->Blk[2] = blk[2];
+
+
+       printf("# allocating data for %d LB nodes with padding (%lu bytes = %f MiB for both lattices)\n",
+              nCells, 2 * sizeof(PdfT) * nCells * N_D3Q19,
+              2 * sizeof(PdfT) * nCells * N_D3Q19 / 1024.0 / 1024.0);
+
+       MemAlloc((void **)&pdfs[0], sizeof(PdfT) * nCells * N_D3Q19);
+       MemAlloc((void **)&pdfs[1], sizeof(PdfT) * nCells * N_D3Q19);
+
+       kd->Pdfs[0] = pdfs[0];
+       kd->Pdfs[1] = pdfs[1];
+
+       // Initialize PDFs with some (arbitrary) data for correct NUMA placement.
+       // This depends on the chosen data layout.
+       // The structure of the loop should resemble the same "execution layout"
+       // as in the kernel!
+#ifdef _OPENMP
+       #pragma omp parallel for collapse(3)
+#endif
+
+       for (int bZ = 0; bZ < gZ; bZ += blk[2]) {
+               for (int bY = 0; bY < gY; bY += blk[1]) {
+                       for (int bX = 0; bX < gX; bX += blk[0]) {
+
+                               // Must do everything here, else it would break collapse.
+                               int eZ = MIN(bZ + blk[2], gZ);
+                               int eY = MIN(bY + blk[1], gY);
+                               int eX = MIN(bX + blk[0], gX);
+
+                               for (int z = bZ; z < eZ; ++z) {
+                                       for (int y = bY; y < eY; ++y) {
+                                               for (int x = bX; x < eX; ++x) {
+
+                                                       for (int d = 0; d < N_D3Q19; ++d) {
+                                                               pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = 1.0;
+                                                               pdfs[1][P_INDEX_5(gDims, x, y, z, d)] = 1.0;
+                                                       }
+
+                                               }
+                                       }
+                               }
+                       }
+               }
+       }
+
+       // Initialize all PDFs to some standard value.
+       for (int z = 0; z < gZ; ++z) {
+               for (int y = 0; y < gY; ++y) {
+                       for (int x = 0; x < gX; ++x) {
+                               for (int d = 0; d < N_D3Q19; ++d) {
+                                       pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = 0.0;
+                                       pdfs[1][P_INDEX_5(gDims, x, y, z, d)] = 0.0;
+                               }
+                       }
+               }
+       }
+
+
+       // Count how many *PDFs* need bounce back treatment.
+
+       uint64_t nPdfs = ((uint64_t)19) * gX * gY * gZ;
+
+       if (nPdfs > ((2LU << 31) - 1)) {
+               printf("ERROR: number of PDFs exceed 2^31.\n");
+               exit(1);
+       }
+
+       // Compiler bug? Incorrect computation of nBounceBackPdfs when using icc 15.0.2.
+       // Works when declaring nBounceBackPdfs as int64_t or using volatile.
+       volatile int nBounceBackPdfs = 0;
+       // int64_t nBounceBackPdfs = 0;
+       int nx, ny, nz, px, py, pz;
+
+       // TODO: apply blocking?
+
+       for (int z = 0; z < lZ; ++z) {
+               for (int y = 0; y < lY; ++y) {
+                       for (int x = 0; x < lX; ++x) {
+
+                               if (ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) {
+                                       for (int d = 0; d < N_D3Q19; ++d) {
+#ifdef PROP_MODEL_PUSH
+                                               nx = x + D3Q19_X[d];
+                                               ny = y + D3Q19_Y[d];
+                                               nz = z + D3Q19_Z[d];
+#elif PROP_MODEL_PULL
+                                               nx = x - D3Q19_X[d];
+                                               ny = y - D3Q19_Y[d];
+                                               nz = z - D3Q19_Z[d];
+#else
+       #error PROP_MODEL_NAME unknown.
+#endif
+                                               // Check if neighbor is inside the lattice.
+                                               // if(nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) {
+                                               //      continue;
+                                               // }
+                                               if ((nx < 0 || nx >= lX) && ld->PeriodicX) {
+                                                       ++nBounceBackPdfs; // Compiler bug --> see above
+                                               }
+                                               else if ((ny < 0 || ny >= lY) && ld->PeriodicY) {
+                                                       ++nBounceBackPdfs; // Compiler bug --> see above
+                                               }
+                                               else if ((nz < 0 || nz >= lZ) && ld->PeriodicZ) {
+                                                       ++nBounceBackPdfs; // Compiler bug --> see above
+                                               }
+                                               else if (nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) {
+                                                       continue;
+                                               }
+                                               else if (ld->Lattice[L_INDEX_4(lDims, nx, ny, nz)] == LAT_CELL_OBSTACLE) {
+                                                       ++nBounceBackPdfs; // Compiler bug --> see above
+                                               }
+                                       }
+                               }
+                       }
+               }
+       }
+
+
+       printf("# allocating %d indices for bounce back pdfs (%s for source and destination array)\n", nBounceBackPdfs, ByteToHuman(sizeof(int) * nBounceBackPdfs * 2));
+
+       MemAlloc((void **) & (kd->BounceBackPdfsSrc), sizeof(int) * nBounceBackPdfs + 100);
+       MemAlloc((void **) & (kd->BounceBackPdfsDst), sizeof(int) * nBounceBackPdfs + 100);
+
+       kd->nBounceBackPdfs = nBounceBackPdfs;
+       nBounceBackPdfs = 0;
+
+       int srcIndex;
+       int dstIndex;
+
+       for (int z = 0; z < lZ; ++z) {
+               for (int y = 0; y < lY; ++y) {
+                       for (int x = 0; x < lX; ++x) {
+
+                               if (ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) {
+                                       for (int d = 0; d < N_D3Q19; ++d) {
+#ifdef PROP_MODEL_PUSH
+                                               nx = x + D3Q19_X[d];
+                                               ny = y + D3Q19_Y[d];
+                                               nz = z + D3Q19_Z[d];
+#elif PROP_MODEL_PULL
+                                               nx = x - D3Q19_X[d];
+                                               ny = y - D3Q19_Y[d];
+                                               nz = z - D3Q19_Z[d];
+#else
+       #error PROP_MODEL_NAME unknown.
+#endif
+
+                                               if (    ((nx < 0 || nx >= lX) && ld->PeriodicX) ||
+                                                               ((ny < 0 || ny >= lY) && ld->PeriodicY) ||
+                                                               ((nz < 0 || nz >= lZ) && ld->PeriodicZ)
+                                               ){
+                                                       // Implement periodic boundary in X direction.
+
+                                                       // If the target node reached through propagation is outside the lattice
+                                                       // the kernel stores it in some buffer around the domain.
+                                                       // From this position the PDF must be transported to the other side of the
+                                                       // geometry.
+
+                                                       // Take PDF from outside the domain.
+
+                                                       // x periodic
+                                                       if (nx < 0) {
+                                                               px = lX - 1;
+                                                       }
+                                                       else if (nx >= lX) {
+                                                               px = 0;
+                                                       } else {
+                                                               px = nx;
+                                                       }
+
+                                                       // y periodic
+                                                       if (ny < 0) {
+                                                               py = lY - 1;
+                                                       }
+                                                       else if (ny >= lY) {
+                                                               py = 0;
+                                                       } else {
+                                                               py = ny;
+                                                       }
+
+                                                       // z periodic
+                                                       if (nz < 0) {
+                                                               pz = lZ - 1;
+                                                       }
+                                                       else if (nz >= lZ) {
+                                                               pz = 0;
+                                                       } else {
+                                                               pz = nz;
+                                                       }
+
+                                                       if (ld->Lattice[L_INDEX_4(lDims, px, py, pz)] == LAT_CELL_OBSTACLE) {
+#ifdef PROP_MODEL_PUSH
+                                                               srcIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, d);
+                                                               dstIndex = P_INDEX_5(gDims,  x + oX,  y + oY,  z + oZ, D3Q19_INV[d]);
+#elif PROP_MODEL_PULL
+                                                               srcIndex = P_INDEX_5(gDims,  x + oX,  y + oY,  z + oZ, D3Q19_INV[d]);
+                                                               dstIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, d);
+#endif
+                                                       }
+                                                       else {
+
+#ifdef PROP_MODEL_PUSH
+                                                               srcIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, d);
+                                                               // Put it on the other side back into the domain.
+                                                               dstIndex = P_INDEX_5(gDims, px + oX, py + oY, pz + oZ, d);
+#elif PROP_MODEL_PULL
+                                                               srcIndex = P_INDEX_5(gDims, px + oX, py + oY, pz + oZ, d);
+                                                               // Put it on the other side back into the ghost layer.
+                                                               dstIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, d);
+#endif
+
+                                                               VerifyMsg(nBounceBackPdfs < kd->nBounceBackPdfs, "nBBPdfs %d < kd->nBBPdfs %d  xyz: %d %d %d d: %d\n", nBounceBackPdfs, kd->nBounceBackPdfs, x, y, z, d);
+
+                                                       }
+
+                                                       kd->BounceBackPdfsSrc[nBounceBackPdfs] = srcIndex;
+                                                       kd->BounceBackPdfsDst[nBounceBackPdfs] = dstIndex;
+
+                                                       ++nBounceBackPdfs;
+
+                                               }
+                                               else if (nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) {
+                                                       continue;
+                                               }
+                                               else if (ld->Lattice[L_INDEX_4(lDims, nx, ny, nz)] == LAT_CELL_OBSTACLE) {
+#ifdef PROP_MODEL_PUSH
+                                                       srcIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, d);
+                                                       dstIndex = P_INDEX_5(gDims,  x + oX,  y + oY,  z + oZ, D3Q19_INV[d]);
+#elif PROP_MODEL_PULL
+                                                       srcIndex = P_INDEX_5(gDims,  x + oX,  y + oY,  z + oZ, D3Q19_INV[d]);
+                                                       dstIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, d);
+                                                       // srcIndex = P_INDEX_5(gDims,  x + oX,  y + oY,  z + oZ, d);
+                                                       // dstIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, D3Q19_INV[d]);
+#endif
+
+                                                       VerifyMsg(nBounceBackPdfs < kd->nBounceBackPdfs, "nBBPdfs %d < kd->nBBPdfs %d  xyz: %d %d %d d: %d\n", nBounceBackPdfs, kd->nBounceBackPdfs, x, y, z, d);
+
+                                                       kd->BounceBackPdfsSrc[nBounceBackPdfs] = srcIndex;
+                                                       kd->BounceBackPdfsDst[nBounceBackPdfs] = dstIndex;
+
+                                                       ++nBounceBackPdfs;
+                                               }
+                                       }
+                               }
+                       }
+               }
+       }
+
+
+       // Fill remaining KernelData structures
+       kd->GetNode = FNAME(GetNode);
+       kd->SetNode = FNAME(SetNode);
+
+       kd->BoundaryConditionsGetPdf = FNAME(BcGetPdf);
+       kd->BoundaryConditionsSetPdf = FNAME(BcSetPdf);
+
+       kd->Kernel = FNAME(D3Q19BlkKernel);
+
+       kd->DstPdfs = NULL;
+       kd->PdfsActive = kd->Pdfs[0];
+
+       return;
+}
+
+// Releases everything allocated by the matching init routine: both PDF
+// arrays, the two bounce back index lists, and finally the kernel data
+// referenced by *kernelData itself.
+void FNAME(D3Q19BlkDeinit)(LatticeDesc * ld, KernelData ** kernelData)
+{
+       KernelData * kd = *kernelData;
+
+       // PDF arrays of both lattices.
+       for (int i = 0; i < 2; ++i) {
+               MemFree((void **) & (kd->Pdfs[i]));
+       }
+
+       // Index lists used for the fixup after each sweep.
+       MemFree((void **) & (kd->BounceBackPdfsSrc));
+       MemFree((void **) & (kd->BounceBackPdfsDst));
+
+       // The kernel data structure goes last, kd is invalid afterwards.
+       MemFree((void **)kernelData);
+}
+
+// Kernels without blocking perform the same initialization/deinitialization as with
+// blocking, except that a different kernel is called. Hence, no arguments are allowed.
+
+// Init routine of the kernel without blocking.
+//
+// Any kernel parameter is rejected with an error message and exit status 1.
+// Otherwise the init routine of the blocked kernel does the actual work and
+// only the kernel function registered in the kernel data is replaced.
+void FNAME(D3Q19Init)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
+{
+       if (params->nKernelArgs != 0) {
+               printf("ERROR: unknown kernel parameter.\n");
+               printf("This kernels accepts no parameters.\n");
+               exit(1);
+       }
+
+       // Same general arguments as the caller provided, but no kernel arguments.
+       Parameters noKernelArgs;
+
+       noKernelArgs.KernelArgs  = NULL;
+       noKernelArgs.nKernelArgs = 0;
+       noKernelArgs.Args        = params->Args;
+       noKernelArgs.nArgs       = params->nArgs;
+
+       // Reuse the setup of the blocked kernel ...
+       FNAME(D3Q19BlkInit)(ld, kernelData, &noKernelArgs);
+
+       // ... and select the kernel without blocking instead.
+       (*kernelData)->Kernel = FNAME(D3Q19Kernel);
+}
+
+// Deinit routine of the kernel without blocking; the teardown is delegated
+// to the deinit routine of the blocked kernel.
+void FNAME(D3Q19Deinit)(LatticeDesc * ld, KernelData ** kernelData)
+{
+       FNAME(D3Q19BlkDeinit)(ld, kernelData);
+}
diff --git a/src/BenchKernelD3Q19Common.h b/src/BenchKernelD3Q19Common.h
new file mode 100644 (file)
index 0000000..cceddca
--- /dev/null
@@ -0,0 +1,86 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#ifndef __BENCH_KERNEL_D3Q19_COMMON_H__
+#define __BENCH_KERNEL_D3Q19_COMMON_H__
+
+#include "Kernel.h"
+// Extends the common KernelData with per-dimension blocking factors.
+typedef struct KernelDataEx_
+{
+       KernelData kd;                  // NOTE(review): presumably has to stay the first member so that a KernelDataEx * can be handed around as KernelData * - confirm against the casts in the kernels.
+       int Blk[3];                     // Blocking in X, Y, and Z direction, value of 0 disables blocking.
+} KernelDataEx;
+
+
+// Build a function name extended by the propagation model name and the data layout.
+// FNAME(test) will be expanded to test_PushSoA if PROP_MODEL_NAME is defined
+// as Push and DATA_LAYOUT_NAME is defined as SoA.
+#define FNAME(functionName)    JOIN(JOIN(functionName,_),JOIN(PROP_MODEL_NAME,DATA_LAYOUT_NAME))
+// Both macros are consumed by FNAME; stop with a clear message if one is missing.
+#ifndef DATA_LAYOUT_NAME
+       #error DATA_LAYOUT_NAME must be defined
+#endif
+
+#ifndef PROP_MODEL_NAME
+       #error PROP_MODEL_NAME must be defined
+#endif
+
+// -----------------------------------------------------------------------
+// Index function for accesssing PDF array for different data layouts.
+
+#define P_INDEX_5              FNAME(PINDEX5)
+
+static inline int FNAME(PINDEX5)(int dims[3], int x, int y, int z, int d)
+{
+       Assert(dims[0] > 0);
+       Assert(dims[1] > 0);
+       Assert(dims[2] > 0);
+
+       Assert(x >= 0);
+       Assert(x < dims[0]);
+       Assert(y >= 0);
+       Assert(y < dims[1]);
+       Assert(z >= 0);
+       Assert(z < dims[2]);
+       Assert(d >= 0);
+#ifdef D3Q19
+       Assert(d < N_D3Q19);
+#else
+#error Not implemented for this discretization.
+#endif
+
+#ifdef DATA_LAYOUT_SOA
+       return d * dims[0] * dims[1] * dims[2] + z * dims[0] * dims[1] + y * dims[0] + x;
+#elif DATA_LAYOUT_AOS
+       return z * dims[0] * dims[1] * N_D3Q19 + y * dims[0] * N_D3Q19 + x * N_D3Q19 + d;
+#else
+#error P_INDEX_5 function no implemented for chosen data layout.
+#endif
+}
+
+#endif // __BENCH_KERNEL_D3Q19_COMMON_H__
+
diff --git a/src/BenchKernelD3Q19List.c b/src/BenchKernelD3Q19List.c
new file mode 100644 (file)
index 0000000..7922b7a
--- /dev/null
@@ -0,0 +1,347 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#include "BenchKernelD3Q19ListCommon.h"
+
+#include "Memory.h"
+#include "Vtk.h"
+
+#include <inttypes.h>
+#include <math.h>
+
+// Runs cd->MaxIterations time steps over the fluid node list: per node load 19 PDFs from src, relax even/odd parts, store to dst, then swap src and dst.
+void FNAME(D3Q19ListKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd)
+{
+       Assert(ld != NULL);
+       Assert(kernelData != NULL);
+       Assert(cd != NULL);
+
+       Assert(cd->Omega > 0.0);
+       Assert(cd->Omega < 2.0);
+       // Two views of the same pointer: kd for the common members, kdl for nFluid, nCells and AdjList.
+       KernelData * kd = (KernelData *)kernelData;
+       KernelDataList * kdl = (KernelDataList *)kernelData;
+
+       PdfT omega = cd->Omega;
+       PdfT omegaEven = omega;
+//     PdfT omegaOdd = 8.0*((2.0-omegaEven)/(8.0-omegaEven)); //"standard" trt odd relaxation parameter
+       PdfT magicParam = 1.0/12.0;     // 1/4: best stability; 1/12: removes third-order advection error (best advection); 1/6: removes fourth-order diffusion error (best diffusion); 3/16: exact location of bounce back for Poiseuille flow
+       PdfT omegaOdd = 1.0/( 0.5 + magicParam/(1.0/omega - 0.5) );
+
+       PdfT evenPart = 0.0;
+       PdfT oddPart = 0.0;
+       PdfT dir_indep_trm = 0.0;
+
+       PdfT w_0 = 1.0 /  3.0;
+       PdfT w_1 = 1.0 / 18.0;
+       PdfT w_2 = 1.0 / 36.0;
+
+       PdfT w_1_x3 = w_1 * 3.0;        PdfT w_1_nine_half = w_1 * 9.0/2.0;     PdfT w_1_indep = 0.0;
+       PdfT w_2_x3 = w_2 * 3.0;        PdfT w_2_nine_half = w_2 * 9.0/2.0;     PdfT w_2_indep = 0.0;
+
+       PdfT ux, uy, uz, ui;
+       PdfT dens;
+
+       // Declare pdf_N, pdf_E, pdf_S, pdf_W, ...
+       #define X(name, idx, idxinv, x, y, z)   PdfT JOIN(pdf_,name);
+       D3Q19_LIST
+       #undef X
+
+       PdfT * src = kd->Pdfs[0];
+       PdfT * dst = kd->Pdfs[1];
+       PdfT * tmp;
+
+       int maxIterations = cd->MaxIterations;
+       int nFluid = kdl->nFluid;
+       int nCells = kdl->nCells;
+
+       uint32_t adjListIndex;
+       uint32_t * adjList = kdl->AdjList;
+
+       #ifdef VTK_OUTPUT
+               if (cd->VtkOutput) {
+                       kd->PdfsActive = src;
+                       VtkWrite(ld, kd, cd, 0);
+               }
+       #endif
+
+       #ifdef STATISTICS
+               kd->PdfsActive = src;
+               KernelStatistics(kd, ld, cd, 0);
+       #endif
+
+       // TODO: outer openmp parallel
+       for(int iter = 0; iter < maxIterations; ++iter) {
+
+               // One time step: every fluid node reads its PDFs from src and writes the post-collision values to dst.
+
+       #ifdef _OPENMP
+               #pragma omp parallel for default(none) \
+                               shared(nFluid, nCells, kd, kdl, adjList, src, dst, w_0, w_1, w_2, omegaEven, omegaOdd, \
+                               w_1_x3, w_2_x3, w_1_nine_half, w_2_nine_half, cd) \
+                               private(ux, uy, uz, ui, dens, dir_indep_trm, adjListIndex, \
+                                       pdf_C, \
+                                       pdf_N, pdf_E, pdf_S, pdf_W, \
+                                       pdf_NE, pdf_SE, pdf_SW, pdf_NW, \
+                                       pdf_T, pdf_TN, pdf_TE, pdf_TS, pdf_TW, \
+                                       pdf_B, pdf_BN, pdf_BE, pdf_BS, pdf_BW, \
+                                       evenPart, oddPart, w_1_indep, w_2_indep)
+       #endif
+       for (int index = 0; index < nFluid; ++index) {
+
+                       #define I(index, dir)   P_INDEX_3((nCells), (index), (dir))
+
+#ifdef PROP_MODEL_PUSH
+
+                       // Load PDFs of local cell: pdf_N = src[I(index, D3Q19_N)]; ...
+                       #define X(name, idx, idxinv, _x, _y, _z)        JOIN(pdf_,name) = src[I(index, idx)];
+                       D3Q19_LIST
+                       #undef X
+
+#elif PROP_MODEL_PULL
+
+                       adjListIndex = index * N_D3Q19_IDX;
+
+                       pdf_C = src[P_INDEX_3(nCells, index, D3Q19_C)];
+
+                       // Load PDFs through the adjacency list: pdf_N = src[adjList[adjListIndex + D3Q19_N]]; ...
+                       #define X(name, idx, idxinv, _x, _y, _z)        JOIN(pdf_,name) = src[adjList[adjListIndex + idx]];
+                       D3Q19_LIST_WO_C
+                       #undef X
+
+#else
+       #error No implementation for PROP_MODEL_NAME.
+#endif
+// Enabling the define below replaces the computed velocity by a fixed one for nodes selected by their coordinates.
+// #define LID_DRIVEN_CAVITY
+
+#ifdef LID_DRIVEN_CAVITY
+                       int nX = kd->Dims[0];
+                       int nY = kd->Dims[1];
+                       int nZ = kd->Dims[2];
+
+                       int x = kdl->Coords[C_INDEX_X(index)];
+                       int y = kdl->Coords[C_INDEX_Y(index)];
+                       int z = kdl->Coords[C_INDEX_Z(index)];
+
+                       if (z == nZ - 4 && x > 3 && x < (nX - 4) && y > 3 && y < (nY - 4)) {
+                               ux = 0.1 * 0.577;
+                               uy = 0.0;
+                               uz = 0.0;
+                       } else {
+#endif
+                               ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE -
+                                        pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW;
+                               uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN -
+                                        pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS;
+                               uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS -
+                                        pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS;
+#ifdef LID_DRIVEN_CAVITY
+                       }
+#endif
+                       // Density is the sum of all 19 PDFs; ux, uy, uz above are plain PDF sums and are not divided by it.
+                       dens = pdf_C +
+                                  pdf_N  + pdf_E  + pdf_S  + pdf_W  +
+                                  pdf_NE + pdf_SE + pdf_SW + pdf_NW +
+                                  pdf_T  + pdf_TN + pdf_TE + pdf_TS + pdf_TW +
+                                  pdf_B  + pdf_BN + pdf_BE + pdf_BS + pdf_BW;
+                       // Direction independent term, later scaled by the weight w_0, w_1 or w_2 of the direction.
+                       dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz)*3.0/2.0;
+
+#ifdef PROP_MODEL_PUSH
+                       // Push: C is stored in the local cell, every other direction goes to the slot given by the adjacency list.
+                       adjListIndex = index * N_D3Q19_IDX;
+
+                       // direction: w_0
+                       dst[I(index, D3Q19_C)             ]  = pdf_C - omegaEven*(pdf_C - w_0*dir_indep_trm);
+
+                       // direction: w_1
+                       w_1_indep = w_1*dir_indep_trm;
+
+                       ui = uy;
+                       evenPart = omegaEven*( 0.5*(pdf_N + pdf_S) - ui*ui*w_1_nine_half - w_1_indep );
+                       oddPart = omegaOdd*(0.5*(pdf_N - pdf_S) - ui*w_1_x3 );
+                       dst[adjList[adjListIndex + D3Q19_N]]  = pdf_N - evenPart - oddPart;
+                       dst[adjList[adjListIndex + D3Q19_S]]  = pdf_S - evenPart + oddPart;
+
+                       ui = ux;
+                       evenPart = omegaEven*( 0.5*(pdf_E + pdf_W) - ui*ui*w_1_nine_half - w_1_indep );
+                       oddPart = omegaOdd*(0.5*(pdf_E - pdf_W) - ui*w_1_x3 );
+                       dst[adjList[adjListIndex + D3Q19_E]]  = pdf_E - evenPart - oddPart;
+                       dst[adjList[adjListIndex + D3Q19_W]]  = pdf_W - evenPart + oddPart;
+
+                       ui = uz;
+                       evenPart = omegaEven*( 0.5*(pdf_T + pdf_B) - ui*ui*w_1_nine_half - w_1_indep );
+                       oddPart = omegaOdd*(0.5*(pdf_T - pdf_B) - ui*w_1_x3 );
+                       dst[adjList[adjListIndex + D3Q19_T]]  = pdf_T - evenPart - oddPart;
+                       dst[adjList[adjListIndex + D3Q19_B]]  = pdf_B - evenPart + oddPart;
+
+                       // direction: w_2
+                       w_2_indep = w_2*dir_indep_trm;
+
+                       ui = -ux + uy;
+                       evenPart = omegaEven*( 0.5*(pdf_NW + pdf_SE) - ui*ui*w_2_nine_half - w_2_indep );
+                       oddPart = omegaOdd*(0.5*(pdf_NW - pdf_SE) - ui*w_2_x3 );
+                       dst[adjList[adjListIndex + D3Q19_NW]] = pdf_NW - evenPart - oddPart;
+                       dst[adjList[adjListIndex + D3Q19_SE]] = pdf_SE - evenPart + oddPart;
+
+                       ui = ux + uy;
+                       evenPart = omegaEven*( 0.5*(pdf_NE + pdf_SW) - ui*ui*w_2_nine_half - w_2_indep );
+                       oddPart = omegaOdd*(0.5*(pdf_NE - pdf_SW) - ui*w_2_x3 );
+                       dst[adjList[adjListIndex + D3Q19_NE]] = pdf_NE - evenPart - oddPart;
+                       dst[adjList[adjListIndex + D3Q19_SW]] = pdf_SW - evenPart + oddPart;
+
+                       ui = -ux + uz;
+                       evenPart = omegaEven*( 0.5*(pdf_TW + pdf_BE) - ui*ui*w_2_nine_half - w_2_indep );
+                       oddPart = omegaOdd*(0.5*(pdf_TW - pdf_BE) - ui*w_2_x3 );
+                       dst[adjList[adjListIndex + D3Q19_TW]] = pdf_TW - evenPart - oddPart;
+                       dst[adjList[adjListIndex + D3Q19_BE]] = pdf_BE - evenPart + oddPart;
+
+                       ui = ux + uz;
+                       evenPart = omegaEven*( 0.5*(pdf_TE + pdf_BW) - ui*ui*w_2_nine_half - w_2_indep );
+                       oddPart = omegaOdd*(0.5*(pdf_TE - pdf_BW) - ui*w_2_x3 );
+                       dst[adjList[adjListIndex + D3Q19_TE]] = pdf_TE - evenPart - oddPart;
+                       dst[adjList[adjListIndex + D3Q19_BW]] = pdf_BW - evenPart + oddPart;
+
+                       ui = -uy + uz;
+                       evenPart = omegaEven*( 0.5*(pdf_TS + pdf_BN) - ui*ui*w_2_nine_half - w_2_indep );
+                       oddPart = omegaOdd*(0.5*(pdf_TS - pdf_BN) - ui*w_2_x3 );
+                       dst[adjList[adjListIndex + D3Q19_TS]] = pdf_TS - evenPart - oddPart;
+                       dst[adjList[adjListIndex + D3Q19_BN]] = pdf_BN - evenPart + oddPart;
+
+                       ui = uy + uz;
+                       evenPart = omegaEven*( 0.5*(pdf_TN + pdf_BS) - ui*ui*w_2_nine_half - w_2_indep );
+                       oddPart = omegaOdd*(0.5*(pdf_TN - pdf_BS) - ui*w_2_x3 );
+                       dst[adjList[adjListIndex + D3Q19_TN]] = pdf_TN - evenPart - oddPart;
+                       dst[adjList[adjListIndex + D3Q19_BS]] = pdf_BS - evenPart + oddPart;
+
+#elif PROP_MODEL_PULL
+                       // Pull: the adjacency list was already applied while loading, so all results are stored in the local cell.
+                       // direction: w_0
+                       dst[I(index, D3Q19_C )]  = pdf_C - omegaEven*(pdf_C - w_0*dir_indep_trm);
+
+                       // direction: w_1
+                       w_1_indep = w_1*dir_indep_trm;
+
+                       ui = uy;
+                       evenPart = omegaEven*( 0.5*(pdf_N + pdf_S) - ui*ui*w_1_nine_half - w_1_indep );
+                       oddPart = omegaOdd*(0.5*(pdf_N - pdf_S) - ui*w_1_x3 );
+                       dst[I(index, D3Q19_N )]  = pdf_N - evenPart - oddPart;
+                       dst[I(index, D3Q19_S )]  = pdf_S - evenPart + oddPart;
+
+                       ui = ux;
+                       evenPart = omegaEven*( 0.5*(pdf_E + pdf_W) - ui*ui*w_1_nine_half - w_1_indep );
+                       oddPart = omegaOdd*(0.5*(pdf_E - pdf_W) - ui*w_1_x3 );
+                       dst[I(index, D3Q19_E )]  = pdf_E - evenPart - oddPart;
+                       dst[I(index, D3Q19_W )]  = pdf_W - evenPart + oddPart;
+
+                       ui = uz;
+                       evenPart = omegaEven*( 0.5*(pdf_T + pdf_B) - ui*ui*w_1_nine_half - w_1_indep );
+                       oddPart = omegaOdd*(0.5*(pdf_T - pdf_B) - ui*w_1_x3 );
+                       dst[I(index, D3Q19_T )]  = pdf_T - evenPart - oddPart;
+                       dst[I(index, D3Q19_B )]  = pdf_B - evenPart + oddPart;
+
+                       // direction: w_2
+                       w_2_indep = w_2*dir_indep_trm;
+
+                       ui = -ux + uy;
+                       evenPart = omegaEven*( 0.5*(pdf_NW + pdf_SE) - ui*ui*w_2_nine_half - w_2_indep );
+                       oddPart = omegaOdd*(0.5*(pdf_NW - pdf_SE) - ui*w_2_x3 );
+                       dst[I(index, D3Q19_NW)] = pdf_NW - evenPart - oddPart;
+                       dst[I(index, D3Q19_SE)] = pdf_SE - evenPart + oddPart;
+
+                       ui = ux + uy;
+                       evenPart = omegaEven*( 0.5*(pdf_NE + pdf_SW) - ui*ui*w_2_nine_half - w_2_indep );
+                       oddPart = omegaOdd*(0.5*(pdf_NE - pdf_SW) - ui*w_2_x3 );
+                       dst[I(index, D3Q19_NE)] = pdf_NE - evenPart - oddPart;
+                       dst[I(index, D3Q19_SW)] = pdf_SW - evenPart + oddPart;
+
+                       ui = -ux + uz;
+                       evenPart = omegaEven*( 0.5*(pdf_TW + pdf_BE) - ui*ui*w_2_nine_half - w_2_indep );
+                       oddPart = omegaOdd*(0.5*(pdf_TW - pdf_BE) - ui*w_2_x3 );
+                       dst[I(index, D3Q19_TW)] = pdf_TW - evenPart - oddPart;
+                       dst[I(index, D3Q19_BE)] = pdf_BE - evenPart + oddPart;
+
+                       ui = ux + uz;
+                       evenPart = omegaEven*( 0.5*(pdf_TE + pdf_BW) - ui*ui*w_2_nine_half - w_2_indep );
+                       oddPart = omegaOdd*(0.5*(pdf_TE - pdf_BW) - ui*w_2_x3 );
+                       dst[I(index, D3Q19_TE)] = pdf_TE - evenPart - oddPart;
+                       dst[I(index, D3Q19_BW)] = pdf_BW - evenPart + oddPart;
+
+                       ui = -uy + uz;
+                       evenPart = omegaEven*( 0.5*(pdf_TS + pdf_BN) - ui*ui*w_2_nine_half - w_2_indep );
+                       oddPart = omegaOdd*(0.5*(pdf_TS - pdf_BN) - ui*w_2_x3 );
+                       dst[I(index, D3Q19_TS)] = pdf_TS - evenPart - oddPart;
+                       dst[I(index, D3Q19_BN)] = pdf_BN - evenPart + oddPart;
+
+                       ui = uy + uz;
+                       evenPart = omegaEven*( 0.5*(pdf_TN + pdf_BS) - ui*ui*w_2_nine_half - w_2_indep );
+                       oddPart = omegaOdd*(0.5*(pdf_TN - pdf_BS) - ui*w_2_x3 );
+                       dst[I(index, D3Q19_TN)] = pdf_TN - evenPart - oddPart;
+                       dst[I(index, D3Q19_BS)] = pdf_BS - evenPart + oddPart;
+
+#endif
+                       #undef I
+               } // loop over fluid nodes
+
+               #ifdef VERIFICATION
+                       kd->PdfsActive = dst;
+                       KernelAddBodyForce(kd, ld, cd);
+               #endif
+               // NOTE(review): for iter == 0 the calls below pass the same iteration number (0) as the initial output before the loop; cd->VtkModulus == 0 would divide by zero - confirm both.
+               #ifdef VTK_OUTPUT
+                       if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) {
+                               kd->PdfsActive = dst;
+                               VtkWrite(ld, kd, cd, iter);
+                       }
+               #endif
+
+               #ifdef STATISTICS
+                       kd->PdfsActive = dst;
+                       KernelStatistics(kd, ld, cd, iter);
+               #endif
+
+               // swap grids: the array just written becomes the source of the next time step
+               tmp = src;
+               src = dst;
+               dst = tmp;
+
+       } // for (int iter = 0; ...
+       // src now refers to the most recent state. NOTE(review): kd->PdfsActive is only updated inside the optional blocks; without them it keeps its previous value.
+#ifdef VTK_OUTPUT
+       if (cd->VtkOutput) {
+               kd->PdfsActive = src;
+               VtkWrite(ld, kd, cd, maxIterations);
+       }
+#endif
+
+#ifdef STATISTICS
+       kd->PdfsActive = src;
+       KernelStatistics(kd, ld, cd, maxIterations);
+#endif
+
+       return;
+}
diff --git a/src/BenchKernelD3Q19List.h b/src/BenchKernelD3Q19List.h
new file mode 100644 (file)
index 0000000..a30e709
--- /dev/null
@@ -0,0 +1,44 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#ifndef __BENCH_KERNEL_D3Q19_LIST__
+#define __BENCH_KERNEL_D3Q19_LIST__
+
+#include "Kernel.h"
+// Init/deinit entry points of the list-based D3Q19 kernels, one pair per propagation model (Push, Pull) and data layout (SoA, AoS).
+void D3Q19ListInit_PushSoA(LatticeDesc * ld, KernelData ** kernelData, Parameters * params);
+void D3Q19ListInit_PushAoS(LatticeDesc * ld, KernelData ** kernelData, Parameters * params);
+
+void D3Q19ListDeinit_PushSoA(LatticeDesc * ld, KernelData ** kernelData);
+void D3Q19ListDeinit_PushAoS(LatticeDesc * ld, KernelData ** kernelData);
+
+void D3Q19ListInit_PullSoA(LatticeDesc * ld, KernelData ** kernelData, Parameters * params);
+void D3Q19ListInit_PullAoS(LatticeDesc * ld, KernelData ** kernelData, Parameters * params);
+
+void D3Q19ListDeinit_PullSoA(LatticeDesc * ld, KernelData ** kernelData);
+void D3Q19ListDeinit_PullAoS(LatticeDesc * ld, KernelData ** kernelData);
+
+#endif // __BENCH_KERNEL_D3Q19_LIST__
diff --git a/src/BenchKernelD3Q19ListAa.c b/src/BenchKernelD3Q19ListAa.c
new file mode 100644 (file)
index 0000000..2c3572c
--- /dev/null
@@ -0,0 +1,406 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#include "BenchKernelD3Q19ListAaCommon.h"
+
+#include "Memory.h"
+#include "Vtk.h"
+#include "LikwidIf.h"
+
+#include <inttypes.h>
+#include <math.h>
+
+void FNAME(D3Q19ListAaKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd)
+{
+       Assert(ld != NULL);
+       Assert(kernelData != NULL);
+       Assert(cd != NULL);
+
+       Assert(cd->Omega > 0.0);
+       Assert(cd->Omega < 2.0);
+
+       KernelData * kd = (KernelData *)kernelData;
+       KernelDataList * kdl = (KernelDataList *)kernelData;
+
+       PdfT omega = cd->Omega;
+       PdfT omegaEven = omega;
+       //  1/4: best stability;
+       // 1/12: removes third-order advection error (best advection);
+       //  1/6: removes fourth-order diffusion error (best diffusion);
+       // 3/16: exact location of bounce back for poiseuille flow
+       PdfT magicParam = 1.0/12.0;
+       PdfT omegaOdd = 1.0/( 0.5 + magicParam/(1.0/omega - 0.5) );
+
+       PdfT evenPart = 0.0;
+       PdfT oddPart = 0.0;
+       PdfT dir_indep_trm = 0.0;
+
+       const PdfT w_0 = 1.0 /  3.0;
+       const PdfT w_1 = 1.0 / 18.0;
+       const PdfT w_2 = 1.0 / 36.0;
+
+       const PdfT w_1_x3 = w_1 * 3.0;  const PdfT w_1_nine_half = w_1 * 9.0/2.0;       PdfT w_1_indep = 0.0;
+       const PdfT w_2_x3 = w_2 * 3.0;  const PdfT w_2_nine_half = w_2 * 9.0/2.0;       PdfT w_2_indep = 0.0;
+
+       PdfT ui;
+
+       PdfT ux, uy, uz;
+       PdfT dens;
+
+       // Declare pdf_N, pdf_E, pdf_S, pdf_W, ...
+       #define X(name, idx, idxinv, x, y, z)   PdfT JOIN(pdf_,name);
+       D3Q19_LIST
+       #undef X
+
+       PdfT * src = kd->Pdfs[0];
+
+       int maxIterations = cd->MaxIterations;
+       int nFluid = kdl->nFluid;
+       int nCells = kdl->nCells;
+
+       uint32_t adjListIndex;
+       uint32_t * adjList = kdl->AdjList;
+
+
+
+       #ifdef VTK_OUTPUT
+               if (cd->VtkOutput) {
+                       kd->PdfsActive = src;
+                       VtkWrite(ld, kd, cd, -1);
+               }
+       #endif
+
+       #ifdef STATISTICS
+               kd->PdfsActive = src;
+               KernelStatistics(kd, ld, cd, 0);
+       #endif
+
+       // TODO: outer openmp parallel
+       for(int iter = 0; iter < maxIterations; iter += 2) {
+
+
+               // --------------------------------------------------------------------
+               // even time step
+
+               X_LIKWID_START("list-aa-even");
+
+       #ifdef _OPENMP
+               #pragma omp parallel for default(none) \
+                               shared(nFluid, nCells, kd, kdl, adjList, omegaOdd, omegaEven, src) \
+                               private(ux, uy, uz, dens, adjListIndex, evenPart, oddPart, dir_indep_trm, w_1_indep, w_2_indep, ui,\
+                                       pdf_C, \
+                                       pdf_N, pdf_E, pdf_S, pdf_W, \
+                                       pdf_NE, pdf_SE, pdf_SW, pdf_NW, \
+                                       pdf_T, pdf_TN, pdf_TE, pdf_TS, pdf_TW, \
+                                       pdf_B, pdf_BN, pdf_BE, pdf_BS, pdf_BW)
+       #endif
+               for (int index = 0; index < nFluid; ++index) {
+
+
+                       #define I(index, dir)   P_INDEX_3((nCells), (index), (dir))
+
+                       // Load PDFs of local cell: pdf_N = src[I(x, y, z, D3Q19_N)]; ...
+                       #define X(name, idx, idxinv, _x, _y, _z)        JOIN(pdf_,name) = src[I(index, idx)];
+                       D3Q19_LIST
+                       #undef X
+
+// #define LID_DRIVEN_CAVITY
+
+#ifdef LID_DRIVEN_CAVITY
+                       int nX = kd->Dims[0];
+                       int nY = kd->Dims[1];
+                       int nZ = kd->Dims[2];
+
+                       int x = kdl->Coords[C_INDEX_X(index)];
+                       int y = kdl->Coords[C_INDEX_Y(index)];
+                       int z = kdl->Coords[C_INDEX_Z(index)];
+
+                       if (z == nZ - 4 && x > 3 && x < (nX - 4) && y > 3 && y < (nY - 4)) {
+                               ux = 0.1 * 0.577;
+                               uy = 0.0;
+                               uz = 0.0;
+                       } else {
+#endif
+                               ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE -
+                                        pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW;
+                               uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN -
+                                        pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS;
+                               uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS -
+                                        pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS;
+#ifdef LID_DRIVEN_CAVITY
+                       }
+#endif
+
+                       dens = pdf_C +
+                                  pdf_N  + pdf_E  + pdf_S  + pdf_W  +
+                                  pdf_NE + pdf_SE + pdf_SW + pdf_NW +
+                                  pdf_T  + pdf_TN + pdf_TE + pdf_TS + pdf_TW +
+                                  pdf_B  + pdf_BN + pdf_BE + pdf_BS + pdf_BW;
+
+                       dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz)*3.0/2.0;
+
+                       // direction: w_0
+                       src[I(index, D3Q19_C)             ]  = pdf_C - omegaEven*(pdf_C - w_0*dir_indep_trm);
+
+                       // direction: w_1
+                       w_1_indep = w_1*dir_indep_trm;
+
+                       ui = uy;
+                       evenPart = omegaEven*( 0.5*(pdf_N + pdf_S) - ui*ui*w_1_nine_half - w_1_indep );
+                       oddPart = omegaOdd*(0.5*(pdf_N - pdf_S) - ui*w_1_x3 );
+                       src[I(index, D3Q19_S)]  = pdf_N - evenPart - oddPart;
+                       src[I(index, D3Q19_N)]  = pdf_S - evenPart + oddPart;
+
+                       ui = ux;
+                       evenPart = omegaEven*( 0.5*(pdf_E + pdf_W) - ui*ui*w_1_nine_half - w_1_indep );
+                       oddPart = omegaOdd*(0.5*(pdf_E - pdf_W) - ui*w_1_x3 );
+                       src[I(index, D3Q19_W)]  = pdf_E - evenPart - oddPart;
+                       src[I(index, D3Q19_E)]  = pdf_W - evenPart + oddPart;
+
+                       ui = uz;
+                       evenPart = omegaEven*( 0.5*(pdf_T + pdf_B) - ui*ui*w_1_nine_half - w_1_indep );
+                       oddPart = omegaOdd*(0.5*(pdf_T - pdf_B) - ui*w_1_x3 );
+                       src[I(index, D3Q19_B)]  = pdf_T - evenPart - oddPart;
+                       src[I(index, D3Q19_T)]  = pdf_B - evenPart + oddPart;
+
+                       // direction: w_2
+                       w_2_indep = w_2*dir_indep_trm;
+
+                       ui = -ux + uy;
+                       evenPart = omegaEven*( 0.5*(pdf_NW + pdf_SE) - ui*ui*w_2_nine_half - w_2_indep );
+                       oddPart = omegaOdd*(0.5*(pdf_NW - pdf_SE) - ui*w_2_x3 );
+                       src[I(index, D3Q19_SE)] = pdf_NW - evenPart - oddPart;
+                       src[I(index, D3Q19_NW)] = pdf_SE - evenPart + oddPart;
+
+                       ui = ux + uy;
+                       evenPart = omegaEven*( 0.5*(pdf_NE + pdf_SW) - ui*ui*w_2_nine_half - w_2_indep );
+                       oddPart = omegaOdd*(0.5*(pdf_NE - pdf_SW) - ui*w_2_x3 );
+                       src[I(index, D3Q19_SW)] = pdf_NE - evenPart - oddPart;
+                       src[I(index, D3Q19_NE)] = pdf_SW - evenPart + oddPart;
+
+                       ui = -ux + uz;
+                       evenPart = omegaEven*( 0.5*(pdf_TW + pdf_BE) - ui*ui*w_2_nine_half - w_2_indep );
+                       oddPart = omegaOdd*(0.5*(pdf_TW - pdf_BE) - ui*w_2_x3 );
+                       src[I(index, D3Q19_BE)] = pdf_TW - evenPart - oddPart;
+                       src[I(index, D3Q19_TW)] = pdf_BE - evenPart + oddPart;
+
+                       ui = ux + uz;
+                       evenPart = omegaEven*( 0.5*(pdf_TE + pdf_BW) - ui*ui*w_2_nine_half - w_2_indep );
+                       oddPart = omegaOdd*(0.5*(pdf_TE - pdf_BW) - ui*w_2_x3 );
+                       src[I(index, D3Q19_BW)] = pdf_TE - evenPart - oddPart;
+                       src[I(index, D3Q19_TE)] = pdf_BW - evenPart + oddPart;
+
+                       ui = -uy + uz;
+                       evenPart = omegaEven*( 0.5*(pdf_TS + pdf_BN) - ui*ui*w_2_nine_half - w_2_indep );
+                       oddPart = omegaOdd*(0.5*(pdf_TS - pdf_BN) - ui*w_2_x3 );
+                       src[I(index, D3Q19_BN)] = pdf_TS - evenPart - oddPart;
+                       src[I(index, D3Q19_TS)] = pdf_BN - evenPart + oddPart;
+
+                       ui = uy + uz;
+                       evenPart = omegaEven*( 0.5*(pdf_TN + pdf_BS) - ui*ui*w_2_nine_half - w_2_indep );
+                       oddPart = omegaOdd*(0.5*(pdf_TN - pdf_BS) - ui*w_2_x3 );
+                       src[I(index, D3Q19_BS)] = pdf_TN - evenPart - oddPart;
+                       src[I(index, D3Q19_TN)] = pdf_BS - evenPart + oddPart;
+
+               } // loop over fluid nodes
+
+               X_LIKWID_STOP("list-aa-even");
+
+               // save current iteration
+               kdl->Iteration = iter;
+               #ifdef VERIFICATION
+                       kd->PdfsActive = src;
+                       KernelAddBodyForce(kd, ld, cd);
+               #endif
+
+               // --------------------------------------------------------------------
+               // odd time step
+
+               X_LIKWID_START("list-aa-odd");
+
+#ifdef _OPENMP
+               #pragma omp parallel for default(none) \
+                               shared(nFluid, nCells, kd, kdl, adjList, omegaOdd, omegaEven, src) \
+                               private(ux, uy, uz, dens, adjListIndex, evenPart, oddPart, dir_indep_trm, w_1_indep, w_2_indep, ui,\
+                                       pdf_C, \
+                                       pdf_N, pdf_E, pdf_S, pdf_W, \
+                                       pdf_NE, pdf_SE, pdf_SW, pdf_NW, \
+                                       pdf_T, pdf_TN, pdf_TE, pdf_TS, pdf_TW, \
+                                       pdf_B, pdf_BN, pdf_BE, pdf_BS, pdf_BW)
+#endif
+               for (int index = 0; index < nFluid; ++index) {
+
+
+                       adjListIndex = index * N_D3Q19_IDX;
+
+                       // Load PDFs of local cell: pdf_N = src[adjList[adjListIndex + D3Q19_S]]; ...
+                       pdf_C = src[P_INDEX_3(nCells, index, D3Q19_C)];
+
+                       #define X(name, idx, idxinv, _x, _y, _z)        JOIN(pdf_,name) = src[adjList[adjListIndex + idxinv]];
+                       D3Q19_LIST_WO_C
+                       #undef X
+
+#ifdef LID_DRIVEN_CAVITY
+                       int nX = kd->Dims[0];
+                       int nY = kd->Dims[1];
+                       int nZ = kd->Dims[2];
+
+                       int x = kdl->Coords[C_INDEX_X(index)];
+                       int y = kdl->Coords[C_INDEX_Y(index)];
+                       int z = kdl->Coords[C_INDEX_Z(index)];
+
+                       if (z == nZ - 4 && x > 3 && x < (nX - 4) && y > 3 && y < (nY - 4)) {
+                               ux = 0.1 * 0.577;
+                               uy = 0.0;
+                               uz = 0.0;
+                       } else {
+#endif
+                               ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE -
+                                        pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW;
+                               uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN -
+                                        pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS;
+                               uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS -
+                                        pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS;
+#ifdef LID_DRIVEN_CAVITY
+                       }
+#endif
+
+                       dens = pdf_C +
+                                  pdf_N  + pdf_E  + pdf_S  + pdf_W  +
+                                  pdf_NE + pdf_SE + pdf_SW + pdf_NW +
+                                  pdf_T  + pdf_TN + pdf_TE + pdf_TS + pdf_TW +
+                                  pdf_B  + pdf_BN + pdf_BE + pdf_BS + pdf_BW;
+
+                       dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz)*3.0/2.0;
+
+                       adjListIndex = index * N_D3Q19_IDX;
+
+                       // direction: w_0
+                       src[I(index, D3Q19_C)             ]  = pdf_C - omegaEven*(pdf_C - w_0*dir_indep_trm);
+
+                       // direction: w_1
+                       w_1_indep = w_1*dir_indep_trm;
+
+                       ui = uy;
+                       evenPart = omegaEven*( 0.5*(pdf_N + pdf_S) - ui*ui*w_1_nine_half - w_1_indep );
+                       oddPart = omegaOdd*(0.5*(pdf_N - pdf_S) - ui*w_1_x3 );
+                       src[adjList[adjListIndex + D3Q19_N]]  = pdf_N - evenPart - oddPart;
+                       src[adjList[adjListIndex + D3Q19_S]]  = pdf_S - evenPart + oddPart;
+
+                       ui = ux;
+                       evenPart = omegaEven*( 0.5*(pdf_E + pdf_W) - ui*ui*w_1_nine_half - w_1_indep );
+                       oddPart = omegaOdd*(0.5*(pdf_E - pdf_W) - ui*w_1_x3 );
+                       src[adjList[adjListIndex + D3Q19_E]]  = pdf_E - evenPart - oddPart;
+                       src[adjList[adjListIndex + D3Q19_W]]  = pdf_W - evenPart + oddPart;
+
+                       ui = uz;
+                       evenPart = omegaEven*( 0.5*(pdf_T + pdf_B) - ui*ui*w_1_nine_half - w_1_indep );
+                       oddPart = omegaOdd*(0.5*(pdf_T - pdf_B) - ui*w_1_x3 );
+                       src[adjList[adjListIndex + D3Q19_T]]  = pdf_T - evenPart - oddPart;
+                       src[adjList[adjListIndex + D3Q19_B]]  = pdf_B - evenPart + oddPart;
+
+                       // direction: w_2
+                       w_2_indep = w_2*dir_indep_trm;
+
+                       ui = -ux + uy;
+                       evenPart = omegaEven*( 0.5*(pdf_NW + pdf_SE) - ui*ui*w_2_nine_half - w_2_indep );
+                       oddPart = omegaOdd*(0.5*(pdf_NW - pdf_SE) - ui*w_2_x3 );
+                       src[adjList[adjListIndex + D3Q19_NW]] = pdf_NW - evenPart - oddPart;
+                       src[adjList[adjListIndex + D3Q19_SE]] = pdf_SE - evenPart + oddPart;
+
+                       ui = ux + uy;
+                       evenPart = omegaEven*( 0.5*(pdf_NE + pdf_SW) - ui*ui*w_2_nine_half - w_2_indep );
+                       oddPart = omegaOdd*(0.5*(pdf_NE - pdf_SW) - ui*w_2_x3 );
+                       src[adjList[adjListIndex + D3Q19_NE]] = pdf_NE - evenPart - oddPart;
+                       src[adjList[adjListIndex + D3Q19_SW]] = pdf_SW - evenPart + oddPart;
+
+                       ui = -ux + uz;
+                       evenPart = omegaEven*( 0.5*(pdf_TW + pdf_BE) - ui*ui*w_2_nine_half - w_2_indep );
+                       oddPart = omegaOdd*(0.5*(pdf_TW - pdf_BE) - ui*w_2_x3 );
+                       src[adjList[adjListIndex + D3Q19_TW]] = pdf_TW - evenPart - oddPart;
+                       src[adjList[adjListIndex + D3Q19_BE]] = pdf_BE - evenPart + oddPart;
+
+                       ui = ux + uz;
+                       evenPart = omegaEven*( 0.5*(pdf_TE + pdf_BW) - ui*ui*w_2_nine_half - w_2_indep );
+                       oddPart = omegaOdd*(0.5*(pdf_TE - pdf_BW) - ui*w_2_x3 );
+                       src[adjList[adjListIndex + D3Q19_TE]] = pdf_TE - evenPart - oddPart;
+                       src[adjList[adjListIndex + D3Q19_BW]] = pdf_BW - evenPart + oddPart;
+
+                       ui = -uy + uz;
+                       evenPart = omegaEven*( 0.5*(pdf_TS + pdf_BN) - ui*ui*w_2_nine_half - w_2_indep );
+                       oddPart = omegaOdd*(0.5*(pdf_TS - pdf_BN) - ui*w_2_x3 );
+                       src[adjList[adjListIndex + D3Q19_TS]] = pdf_TS - evenPart - oddPart;
+                       src[adjList[adjListIndex + D3Q19_BN]] = pdf_BN - evenPart + oddPart;
+
+                       ui = uy + uz;
+                       evenPart = omegaEven*( 0.5*(pdf_TN + pdf_BS) - ui*ui*w_2_nine_half - w_2_indep );
+                       oddPart = omegaOdd*(0.5*(pdf_TN - pdf_BS) - ui*w_2_x3 );
+                       src[adjList[adjListIndex + D3Q19_TN]] = pdf_TN - evenPart - oddPart;
+                       src[adjList[adjListIndex + D3Q19_BS]] = pdf_BS - evenPart + oddPart;
+
+                       #undef I
+               } // loop over fluid nodes
+
+               X_LIKWID_STOP("list-aa-odd");
+
+               // save current iteration
+               kdl->Iteration = iter + 1;
+
+               #ifdef VERIFICATION
+                       kd->PdfsActive = src;
+                       KernelAddBodyForce(kd, ld, cd);
+               #endif
+
+               #ifdef VTK_OUTPUT
+                       if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) {
+                               kd->PdfsActive = src;
+                               VtkWrite(ld, kd, cd, iter);
+                       }
+               #endif
+
+               #ifdef STATISTICS
+                       kd->PdfsActive = src;
+                       KernelStatistics(kd, ld, cd, iter);
+               #endif
+
+
+       } // for (int iter = 0; ...
+
+
+#ifdef VTK_OUTPUT
+       if (cd->VtkOutput) {
+               kd->PdfsActive = src;
+               VtkWrite(ld, kd, cd, maxIterations);
+       }
+#endif
+
+#ifdef STATISTICS
+       kd->PdfsActive = src;
+       KernelStatistics(kd, ld, cd, maxIterations);
+#endif
+
+       return;
+}
diff --git a/src/BenchKernelD3Q19ListAa.h b/src/BenchKernelD3Q19ListAa.h
new file mode 100644 (file)
index 0000000..d0ac199
--- /dev/null
@@ -0,0 +1,40 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#ifndef __BENCH_KERNEL_D3Q19_LIST_AA__
+#define __BENCH_KERNEL_D3Q19_LIST_AA__
+
+#include "Kernel.h"
+
+// Initialization entry points of the D3Q19 list kernel with AA propagation
+// pattern. On return *kernelData refers to the kernel data which was set up.
+// NOTE(review): the _PushSoA/_PushAoS variants are presumably generated from
+// FNAME(D3Q19ListAaInit) in BenchKernelD3Q19ListAaCommon.c, once per data
+// layout - confirm against the FNAME definition.
+void D3Q19ListAaInit_PushSoA(LatticeDesc * ld, KernelData ** kernelData, Parameters * params);
+void D3Q19ListAaInit_PushAoS(LatticeDesc * ld, KernelData ** kernelData, Parameters * params);
+
+// Matching cleanup entry points; they take the same kernelData handle that
+// was passed to the corresponding init function.
+void D3Q19ListAaDeinit_PushSoA(LatticeDesc * ld, KernelData ** kernelData);
+void D3Q19ListAaDeinit_PushAoS(LatticeDesc * ld, KernelData ** kernelData);
+
+
+
+#endif // __BENCH_KERNEL_D3Q19_LIST_AA__
diff --git a/src/BenchKernelD3Q19ListAaCommon.c b/src/BenchKernelD3Q19ListAaCommon.c
new file mode 100644 (file)
index 0000000..d2bcb52
--- /dev/null
@@ -0,0 +1,628 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#include "BenchKernelD3Q19ListAaCommon.h"
+
+#include "Memory.h"
+#include "Vtk.h"
+
+#include <limits.h>
+#include <math.h>
+
+
+// Forward declaration of the kernel function.
+void FNAME(D3Q19ListAaKernel)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd);
+
+
+
+
+// -----------------------------------------------------------------------
+// Functions which are used as callback by the kernel to read or write
+// PDFs and nodes.
+
+// Reads the PDF of direction dir at node (x, y, z) from the active lattice
+// and stores it in *pdf.
+//
+// Where the value is located depends on the parity of kdl->Iteration:
+//   even:      PDFs are stored inverse, i.e. the value is fetched through
+//              the adjacency list entry of the inverse direction; only the
+//              center PDF stays in the slot of the node itself.
+//   otherwise: the value is fetched from the node's own slot for dir.
+static void FNAME(BCGetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT * pdf)
+{
+       // Kernel state and output location must be usable.
+       Assert(kd != NULL);
+       Assert(kd->PdfsActive != NULL);
+       Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+       Assert(pdf != NULL);
+
+       // Node coordinates and direction must be in range.
+       Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
+       Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
+       Assert(dir >= 0); Assert(dir < N_D3Q19);
+
+       KernelDataList * kdl = (KernelDataList *)kd;
+
+       if (kdl->Iteration % 2 != 0) {
+               // PDFs are located in the node itself.
+               *pdf = kd->PdfsActive[P_INDEX_5(KDL(kd), x, y, z, dir)];
+               return;
+       }
+
+       // PDFs are stored inverse, local PDFs are located in remote nodes.
+       uint32_t cell = KDL(kd)->Grid[L_INDEX_4(kd->Dims, x, y, z)];
+
+       if (dir == D3Q19_C) {
+               // The center PDF has no adjacency list entry.
+               *pdf = kd->PdfsActive[P_INDEX_3(KDL(kd)->nCells, cell, dir)];
+               return;
+       }
+
+       *pdf = kd->PdfsActive[KDL(kd)->AdjList[cell * N_D3Q19_IDX + D3Q19_INV[dir]]];
+
+       return;
+}
+
+// Writes the value pdf as PDF of direction dir at node (x, y, z) into the
+// active lattice. Terminates the program if pdf is NaN.
+//
+// Where the value is stored depends on the parity of kdl->Iteration:
+//   even:      PDFs are stored inverse, i.e. the value is written through
+//              the adjacency list entry of the inverse direction; only the
+//              center PDF stays in the slot of the node itself.
+//   otherwise: the value is written to the node's own slot for dir.
+static void FNAME(BCSetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT pdf)
+{
+       // Kernel state must be usable.
+       Assert(kd != NULL);
+       Assert(kd->PdfsActive != NULL);
+       Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+
+       // Node coordinates and direction must be in range.
+       Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
+       Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
+       Assert(dir >= 0); Assert(dir < N_D3Q19);
+
+       // Refuse to store NaNs, they would spread over the whole lattice.
+       if (isnan(pdf)) {
+               printf("ERROR: setting nan %d %d %d %d %s\n", x, y, z, dir, D3Q19_NAMES[dir]);
+               DEBUG_BREAK_POINT();
+               exit(1);
+       }
+
+       KernelDataList * kdl = (KernelDataList *)kd;
+
+       if (kdl->Iteration % 2 != 0) {
+               // PDFs are located in the node itself.
+               kd->PdfsActive[P_INDEX_5(KDL(kd), x, y, z, dir)] = pdf;
+               return;
+       }
+
+       // PDFs are stored inverse, local PDFs are located in remote nodes.
+       uint32_t cell = KDL(kd)->Grid[L_INDEX_4(kd->Dims, x, y, z)];
+
+       if (dir == D3Q19_C) {
+               // The center PDF has no adjacency list entry.
+               kd->PdfsActive[P_INDEX_3(KDL(kd)->nCells, cell, dir)] = pdf;
+               return;
+       }
+
+       kd->PdfsActive[KDL(kd)->AdjList[cell * N_D3Q19_IDX + D3Q19_INV[dir]]] = pdf;
+
+       return;
+}
+
+
+// Gathers the PDFs of all directions of node (x, y, z) from the active
+// lattice into pdfs, which is indexed by direction.
+//
+// If kdl->Iteration is even the PDFs are fetched through the adjacency list
+// entries of the inverse directions (the center PDF from the node itself),
+// otherwise all PDFs are fetched from the slots of the node itself.
+static void GetNode(KernelData * kd, int x, int y, int z, PdfT * pdfs)
+{
+       // Kernel state and destination buffer must be usable.
+       Assert(kd != NULL);
+       Assert(kd->PdfsActive != NULL);
+       Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+       Assert(pdfs != NULL);
+
+       // Node coordinates must be in range.
+       Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
+       Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
+
+       KernelDataList * kdl = (KernelDataList *)kd;
+
+       if (kdl->Iteration % 2 != 0) {
+
+               // All PDFs are located in the node itself.
+               #define X(name, idx, idxinv, _x, _y, _z)        pdfs[idx] = kd->PdfsActive[P_INDEX_5(KDL(kd), x, y, z, idx)];
+               D3Q19_LIST
+               #undef X
+
+       } else {
+
+               uint32_t cell = kdl->Grid[L_INDEX_4(kdl->kd.Dims, x, y, z)];
+               uint32_t adjBase = cell * N_D3Q19_IDX;
+
+               // The center PDF is always found in the node itself ...
+               pdfs[D3Q19_C] = kd->PdfsActive[P_INDEX_3(kdl->nCells, cell, D3Q19_C)];
+
+               // ... all others via the adjacency list entry of the inverse
+               // direction: pdfs[D3Q19_N] = PdfsActive[AdjList[adjBase + D3Q19_S]]; ...
+               #define X(name, idx, idxinv, _x, _y, _z)        pdfs[idx] = kd->PdfsActive[kdl->AdjList[adjBase + idxinv]];
+               D3Q19_LIST_WO_C
+               #undef X
+
+       }
+
+#if 0
+       // Detect NaNs
+       for (int d = 0; d < 19; ++d) {
+               if(isnan(pdfs[d]) || isinf(pdfs[d])) {
+                       printf("%d %d %d %d nan! get node\n", x, y, z, d);
+                       for (int d2 = 0; d2 < 19; ++d2) {
+                               printf("%d: %e\n", d2, pdfs[d2]);
+                       }
+                       exit(1);
+               }
+       }
+#endif
+
+       return;
+}
+
+
+// Scatters the PDFs of all directions given in pdfs (indexed by direction)
+// to node (x, y, z) of the active lattice.
+//
+// If kdl->Iteration is even the PDFs are stored through the adjacency list
+// entries of the inverse directions (the center PDF in the node itself),
+// otherwise all PDFs are stored in the slots of the node itself.
+static void SetNode(KernelData * kd, int x, int y, int z, PdfT * pdfs)
+{
+       // Kernel state and source buffer must be usable.
+       Assert(kd != NULL);
+       Assert(kd->PdfsActive != NULL);
+       Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+       Assert(pdfs != NULL);
+
+       // Node coordinates must be in range.
+       Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
+       Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
+
+#if 0
+       // Detect NaNs
+       for (int d = 0; d < 19; ++d) {
+               if(isnan(pdfs[d])) {
+                       printf("%d %d %d %d nan! get node\n", x, y, z, d);
+                       for (int d2 = 0; d2 < 19; ++d2) {
+                               printf("%d: %e\n", d2, pdfs[d2]);
+                       }
+                       exit(1);
+               }
+       }
+#endif
+
+       KernelDataList * kdl = (KernelDataList *)kd;
+
+       if (kdl->Iteration % 2 != 0) {
+
+               // All PDFs are located in the node itself.
+               #define X(name, idx, idxinv, _x, _y, _z)        kd->PdfsActive[P_INDEX_5(KDL(kd), x, y, z, idx)] = pdfs[idx];
+               D3Q19_LIST
+               #undef X
+
+       } else {
+
+               uint32_t cell = kdl->Grid[L_INDEX_4(kdl->kd.Dims, x, y, z)];
+               uint32_t adjBase = cell * N_D3Q19_IDX;
+
+               // The center PDF is always stored in the node itself ...
+               kd->PdfsActive[P_INDEX_3(kdl->nCells, cell, D3Q19_C)] = pdfs[D3Q19_C];
+
+               // ... all others via the adjacency list entry of the inverse
+               // direction: PdfsActive[AdjList[adjBase + D3Q19_S]] = pdfs[D3Q19_N]; ...
+               #define X(name, idx, idxinv, _x, _y, _z)        kd->PdfsActive[kdl->AdjList[adjBase + idxinv]] = pdfs[idx];
+               D3Q19_LIST_WO_C
+               #undef X
+
+       }
+
+       return;
+}
+
+// Prints the kernel specific command line options to stdout.
+// Declared with (void) so that the function has a proper prototype; an empty
+// parameter list declares a function with unspecified parameters in C.
+static void ParameterUsage(void)
+{
+       printf("Kernel parameters:\n");
+       printf("  [-blk <n>] [-blk-[xyz] <n>]\n");
+
+       return;
+}
+
+// Converts the value of a blocking option into a strictly positive int.
+//
+// str - the argument following a -blk* option.
+//
+// Returns the parsed value. Terminates the program with an error message if
+// str is not a complete integer (e.g. "8x" or ""), does not fit into an int,
+// or is not > 0. Base detection (decimal/octal/hex) is left to strtol.
+static int ParseBlockingValue(const char * str)
+{
+       char * end = NULL;
+       long value = strtol(str, &end, 0);
+
+       // Reject input without digits, with trailing characters, or with a
+       // value that would be truncated when stored in an int.
+       if (end == str || *end != '\0' || value > INT_MAX) {
+               printf("ERROR: blocking parameter '%s' is not a valid integer.\n", str);
+               exit(1);
+       }
+
+       if (value <= 0) {
+               printf("ERROR: blocking parameter must be > 0.\n");
+               exit(1);
+       }
+
+       return (int)value;
+}
+
+// Parses the kernel specific arguments in params->KernelArgs.
+//
+// params - provides KernelArgs and nKernelArgs.
+// blk    - receives the blocking factors for x, y, and z; an entry stays 0
+//          if no blocking was requested for the respective direction.
+//
+// Recognized options (with one or two leading dashes):
+//   -blk <n>                      blocking factor for all three directions
+//   -blk-x <n>, -blk-y <n>, -blk-z <n>   blocking factor for one direction
+//   -h, -help                     print the usage and exit
+// Terminates the program on unknown options, missing or invalid values.
+static void ParseParameters(Parameters * params, int * blk)
+{
+       Assert(blk != NULL);
+
+       blk[0] = 0; blk[1] = 0; blk[2] = 0;
+
+       #define ARG_IS(param)                   (!strcmp(params->KernelArgs[i], param))
+       #define NEXT_ARG_PRESENT() \
+               do { \
+                       if (i + 1 >= params->nKernelArgs) { \
+                               printf("ERROR: argument %s requires a parameter.\n", params->KernelArgs[i]); \
+                               exit(1); \
+                       } \
+               } while (0)
+
+
+       for (int i = 0; i < params->nKernelArgs; ++i) {
+               if (ARG_IS("-blk") || ARG_IS("--blk")) {
+                       NEXT_ARG_PRESENT();
+
+                       blk[0] = blk[1] = blk[2] = ParseBlockingValue(params->KernelArgs[++i]);
+               }
+               else if (ARG_IS("-blk-x") || ARG_IS("--blk-x")) {
+                       NEXT_ARG_PRESENT();
+
+                       blk[0] = ParseBlockingValue(params->KernelArgs[++i]);
+               }
+               else if (ARG_IS("-blk-y") || ARG_IS("--blk-y")) {
+                       NEXT_ARG_PRESENT();
+
+                       blk[1] = ParseBlockingValue(params->KernelArgs[++i]);
+               }
+               else if (ARG_IS("-blk-z") || ARG_IS("--blk-z")) {
+                       NEXT_ARG_PRESENT();
+
+                       blk[2] = ParseBlockingValue(params->KernelArgs[++i]);
+               }
+               else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) {
+                       ParameterUsage();
+                       exit(1);
+               }
+               else {
+                       printf("ERROR: unknown kernel parameter.\n");
+                       ParameterUsage();
+                       exit(1);
+               }
+       }
+
+       #undef ARG_IS
+       #undef NEXT_ARG_PRESENT
+
+       return;
+}
+
+// Allocates and sets up all data structures of the D3Q19 list kernel with
+// AA propagation pattern:
+//   - the KernelDataList object itself, returned via *kernelData,
+//   - one PDF array for the fluid nodes (kd->Pdfs[0]),
+//   - Grid, which maps lattice coordinates to fluid node indices,
+//   - Coords, which maps fluid node indices back to coordinates,
+//   - AdjList, which holds N_D3Q19_IDX PDF indices per fluid node.
+// Fluid nodes are numbered block by block; the block sizes are taken from
+// the kernel arguments and default to the lattice dimensions.
+//
+// ld         - lattice description (dimensions, node types, periodicity).
+// kernelData - receives the newly allocated kernel data.
+// params     - provides the kernel arguments to parse.
+//
+// Terminates the program if an allocation fails or a kernel argument is
+// invalid.
+void FNAME(D3Q19ListAaInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
+{
+       KernelData * kd;
+       KernelDataList * kdl;
+
+       if (MemAlloc((void **)&kdl, sizeof(KernelDataList))) {
+               printf("ERROR: allocating kernel data failed: %zu bytes.\n", sizeof(KernelDataList));
+               exit(1);
+       }
+
+       kd = (KernelData *)kdl;
+       *kernelData = kd;
+
+#ifdef DEBUG
+       kd->Pdfs[0] = NULL;
+       kd->Pdfs[1] = NULL;
+       kd->PdfsActive = NULL;
+       kd->DstPdfs = NULL;
+       kd->SrcPdfs = NULL;
+       kd->Dims[0] = -1;
+       kd->Dims[1] = -1;
+       kd->Dims[2] = -1;
+       kd->GlobalDims[0] = -1;
+       kd->GlobalDims[1] = -1;
+       kd->GlobalDims[2] = -1;
+       kd->Offsets[0] = -1;
+       kd->Offsets[1] = -1;
+       kd->Offsets[2] = -1;
+
+       kd->ObstIndices = NULL;
+       kd->nObstIndices = -1;
+       kd->BounceBackPdfsSrc = NULL;
+       kd->BounceBackPdfsDst = NULL;
+       kd->nBounceBackPdfs = -1;
+
+       kdl->AdjList = NULL;
+       kdl->Coords = NULL;
+       kdl->Grid = NULL;
+       kdl->nCells = -1;
+       kdl->nFluid = -1;
+#endif
+
+       // The node/PDF accessors select the storage scheme via the parity of
+       // Iteration and are used before the kernel has run. An odd value selects
+       // the scheme where all PDFs are located in the node itself, which is what
+       // the first (even) time step of the kernel reads.
+       kdl->Iteration = -1;
+
+       // Adjust the dimensions according to padding, if used.
+       kd->Dims[0] = kd->GlobalDims[0] = ld->Dims[0];
+       kd->Dims[1] = kd->GlobalDims[1] = ld->Dims[1];
+       kd->Dims[2] = kd->GlobalDims[2] = ld->Dims[2];
+
+       int * lDims = ld->Dims;
+
+       int lX = lDims[0];
+       int lY = lDims[1];
+       int lZ = lDims[2];
+
+       int nTotalCells = lX * lY * lZ;
+       int nCells = ld->nFluid; // TODO: + padding
+       int nFluid = ld->nFluid;
+
+       kdl->nCells = nCells;
+       kdl->nFluid = nFluid;
+
+       PdfT * pdfs[2];
+
+       int blk[3] = { 0 };
+
+       ParseParameters(params, blk);
+
+       // A blocking factor of 0 means no blocking in this direction.
+       if (blk[0] == 0) blk[0] = lX;
+       if (blk[1] == 0) blk[1] = lY;
+       if (blk[2] == 0) blk[2] = lZ;
+
+       printf("# blocking               x: %3d y: %3d z: %3d\n", blk[0], blk[1], blk[2]);
+
+       // NOTE(review): the message reports the size of two lattices, but only
+       // one PDF array is allocated below - confirm which one is intended.
+       printf("# allocating data for %d fluid LB nodes with padding (%zu bytes = %f MiB for both lattices)\n",
+               nCells, 2 * sizeof(PdfT) * nCells * N_D3Q19,
+               2 * sizeof(PdfT) * nCells * N_D3Q19 / 1024.0 / 1024.0);
+
+       if (MemAlloc((void **)&pdfs[0], sizeof(PdfT) * nCells * N_D3Q19)) {
+               printf("ERROR: allocating PDF array failed: %zu bytes.\n", sizeof(PdfT) * nCells * N_D3Q19);
+               exit(1);
+       }
+
+       kd->Pdfs[0] = pdfs[0];
+       // Only one PDF array is allocated here; keep the second slot defined as
+       // the accessors compare PdfsActive against both.
+       kd->Pdfs[1] = NULL;
+
+       // Initialize PDFs with some (arbitrary) data for correct NUMA placement.
+       // Here we touch only the fluid nodes as this loop is OpenMP parallel and
+       // we want the same scheduling as in the kernel.
+       #ifdef _OPENMP
+               #pragma omp parallel for
+       #endif
+       for (int i = 0; i < nFluid; ++i) { for(int d = 0; d < N_D3Q19; ++d) {
+               pdfs[0][P_INDEX_3(nCells, i, d)] = 1.0;
+       } }
+
+       // Initialize all PDFs to some standard value.
+       for (int i = 0; i < nFluid; ++i) { for(int d = 0; d < N_D3Q19; ++d) {
+               pdfs[0][P_INDEX_3(nCells, i, d)] = 0.0;
+       } }
+
+       // ----------------------------------------------------------------------
+       // create grid which will hold the index numbers of the fluid nodes
+
+       uint32_t * grid;
+
+       if (MemAlloc((void **)&grid, nTotalCells * sizeof(uint32_t))) {
+               printf("ERROR: allocating grid for numbering failed: %zu bytes.\n", nTotalCells * sizeof(uint32_t));
+               exit(1);
+       }
+       kdl->Grid = grid;
+
+       int latticeIndex;
+
+#ifdef DEBUG
+       // Mark all nodes as "has no index" to detect accesses to obstacle nodes.
+       for(int z = 0; z < lZ; ++z) {
+               for(int y = 0; y < lY; ++y) {
+                       for(int x = 0; x < lX; ++x) {
+
+                               latticeIndex = L_INDEX_4(ld->Dims, x, y, z);
+
+                               grid[latticeIndex] = ~0;
+                       }
+               }
+       }
+#endif
+
+       // ----------------------------------------------------------------------
+       // generate numbering over grid
+
+       uint32_t * coords;
+
+       if (MemAlloc((void **)&coords, nFluid * sizeof(uint32_t) * 3)) {
+               printf("ERROR: allocating coords array failed: %zu bytes.\n", nFluid * sizeof(uint32_t) * 3);
+               exit(1);
+       }
+
+       kdl->Coords = coords;
+
+       // Index for the PDF nodes can start at 0 as we distinguish solid and fluid nodes
+       // through the ld->Lattice array.
+       int counter = 0;
+
+       // Blocking is implemented via setup of the adjacency list. The kernel later will
+       // walk through the lattice blocked automatically.
+       for (int bZ = 0; bZ < lZ; bZ += blk[2]) {
+       for (int bY = 0; bY < lY; bY += blk[1]) {
+       for (int bX = 0; bX < lX; bX += blk[0]) {
+
+               // End of the current block, clipped at the lattice boundary.
+               int eX = MIN(bX + blk[0], lX);
+               int eY = MIN(bY + blk[1], lY);
+               int eZ = MIN(bZ + blk[2], lZ);
+
+
+               for (int z = bZ; z < eZ; ++z) {
+               for (int y = bY; y < eY; ++y) {
+               for (int x = bX; x < eX; ++x) {
+
+                       latticeIndex = L_INDEX_4(lDims, x, y, z);
+
+                       // Every node which is not an obstacle gets the next index.
+                       if (ld->Lattice[latticeIndex] != LAT_CELL_OBSTACLE) {
+                               grid[latticeIndex] = counter;
+
+                               coords[C_INDEX_X(counter)] = x;
+                               coords[C_INDEX_Y(counter)] = y;
+                               coords[C_INDEX_Z(counter)] = z;
+
+                               ++counter;
+                       }
+               } } }
+       } } }
+
+       Verify(counter == nFluid);
+
+       uint32_t * adjList;
+
+       // AdjList only requires 18 instead of 19 entries per node, as
+       // the center PDF needs no addressing.
+       if (MemAlloc((void **)&adjList, nFluid * sizeof(uint32_t) * N_D3Q19_IDX)) {
+               printf("ERROR: allocating adjList array failed: %zu bytes.\n", nFluid * sizeof(uint32_t) * N_D3Q19_IDX);
+               exit(1);
+       }
+
+       kdl->AdjList = adjList;
+
+       int x, y, z;
+
+       uint32_t neighborIndex;
+       uint32_t dstIndex;
+
+       int nx, ny, nz, px, py, pz;
+
+       // Loop over all fluid nodes and compute the indices to the neighboring
+       // PDFs for the configured data layout (AoS/SoA).
+       // TODO: Parallelized loop to ensure correct NUMA placement.
+       // #ifdef _OPENMP --> add line continuation
+       //      #pragma omp parallel for default(none)
+       //              shared(nFluid, nCells, coords, D3Q19_INV, D3Q19_X, D3Q19_Y, D3Q19_Z,
+       //                              stderr,
+       //                              lDims, grid, ld, lX, lY, lZ, adjList)
+       //              private(x, y, z, nx, ny, nz, neighborIndex, dstIndex)
+       // #endif
+       for (int index = 0; index < nFluid; ++index) {
+               x = coords[C_INDEX_X(index)];
+               y = coords[C_INDEX_Y(index)];
+               z = coords[C_INDEX_Z(index)];
+
+               Assert(x >= 0 && x < lX);
+               Assert(y >= 0 && y < lY);
+               Assert(z >= 0 && z < lZ);
+
+               Assert(ld->Lattice[L_INDEX_4(lDims, x, y, z)] != LAT_CELL_OBSTACLE);
+
+               // Loop over all directions except the center one.
+               for(int d = 0; d < N_D3Q19 - 1; ++d) {
+                       Assert(d != D3Q19_C);
+
+#ifdef PROP_MODEL_PUSH
+                       nx = x + D3Q19_X[d];
+                       ny = y + D3Q19_Y[d];
+                       nz = z + D3Q19_Z[d];
+
+#elif PROP_MODEL_PULL
+                       nx = x - D3Q19_X[d];
+                       ny = y - D3Q19_Y[d];
+                       nz = z - D3Q19_Z[d];
+#else
+                       #error No implementation for this PROP_MODEL_NAME.
+#endif
+                       // If the neighbor is outside the lattice in a direction with a
+                       // periodic boundary then we need to wrap around.
+                       // NOTE(review): once one periodic direction is left, the code
+                       // below wraps all three coordinates, also those of directions
+                       // which are not periodic - confirm this is intended.
+                       if (    ((nx < 0 || nx >= lX) && ld->PeriodicX) ||
+                                       ((ny < 0 || ny >= lY) && ld->PeriodicY) ||
+                                       ((nz < 0 || nz >= lZ) && ld->PeriodicZ)
+                                                                                                                               ){
+                               // x periodic
+
+                               if (nx < 0) {
+                                       px = lX - 1;
+                               }
+                               else if (nx >= lX) {
+                                       px = 0;
+                               } else {
+                                       px = nx;
+                               }
+                               // y periodic
+                               if (ny < 0) {
+                                       py = lY - 1;
+                               }
+                               else if (ny >= lY) {
+                                       py = 0;
+                               } else {
+                                       py = ny;
+                               }
+
+                               // z periodic
+                               if (nz < 0) {
+                                       pz = lZ - 1;
+                               }
+                               else if (nz >= lZ) {
+                                       pz = 0;
+                               } else {
+                                       pz = nz;
+                               }
+
+                               if (ld->Lattice[L_INDEX_4(lDims, px, py, pz)] == LAT_CELL_OBSTACLE) {
+                                       // Bounce back: the PDF ends up in the inverse direction
+                                       // of the node itself.
+                                       dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
+                               }
+                               else {
+                                       neighborIndex = grid[L_INDEX_4(lDims, px, py, pz)];
+
+                                       AssertMsg(neighborIndex != ~0, "Neighbor has no Index. (%d %d %d) direction %s (%d)\n", px, py, pz, D3Q19_NAMES[d], d);
+
+                                       dstIndex = P_INDEX_3(nCells, neighborIndex, d);
+                               }
+                       }
+                       else if (nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) {
+                               // Neighbor is outside of the lattice: bounce back.
+                               dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
+                       }
+                       else if (ld->Lattice[L_INDEX_4(lDims, nx, ny, nz)] == LAT_CELL_OBSTACLE) {
+                               // Neighbor is an obstacle: bounce back.
+                               dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
+                       }
+                       else {
+                               neighborIndex = grid[L_INDEX_4(lDims, nx, ny, nz)];
+
+                               Assert(neighborIndex != ~0);
+
+                               dstIndex = P_INDEX_3(nCells, neighborIndex, d);
+                       }
+
+                       Assert(dstIndex >= 0);
+                       Assert(dstIndex < nCells * N_D3Q19);
+
+                       adjList[index * N_D3Q19_IDX + d] = dstIndex;
+               }
+       }
+
+
+       // Fill remaining KernelData structures
+       kd->GetNode = GetNode;
+       kd->SetNode = SetNode;
+
+       kd->BoundaryConditionsGetPdf = FNAME(BCGetPdf);
+       kd->BoundaryConditionsSetPdf = FNAME(BCSetPdf);
+
+       kd->Kernel = FNAME(D3Q19ListAaKernel);
+
+       kd->DstPdfs = NULL;
+       kd->SrcPdfs = NULL;
+       kd->PdfsActive = kd->Pdfs[0];
+
+       return;
+}
+
+// Releases everything referenced by *kernelData: the PDF array, the
+// adjacency list, the coordinate and grid arrays, and finally the kernel
+// data object itself. ld is not used.
+void FNAME(D3Q19ListAaDeinit)(LatticeDesc * ld, KernelData ** kernelData)
+{
+       KernelData * kd = *kernelData;
+       KernelDataList * kdl = (KernelDataList *)kd;
+
+       // PDF array.
+       MemFree((void **)&kd->Pdfs[0]);
+
+       // List specific arrays.
+       MemFree((void **)&kdl->AdjList);
+       MemFree((void **)&kdl->Coords);
+       MemFree((void **)&kdl->Grid);
+
+       // The object itself goes last as the pointers above live inside it.
+       MemFree((void **)kernelData);
+
+       return;
+}
+
diff --git a/src/BenchKernelD3Q19ListAaCommon.h b/src/BenchKernelD3Q19ListAaCommon.h
new file mode 100644 (file)
index 0000000..11ec5ff
--- /dev/null
@@ -0,0 +1,137 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#ifndef __BENCH_KERNEL_D3Q19_LIST_AA_COMMON_H__
+#define __BENCH_KERNEL_D3Q19_LIST_AA_COMMON_H__
+
+
+#include "Kernel.h"
+
+#include <inttypes.h>
+
+#define N_D3Q19_IDX            18
+
+typedef struct KernelDataList_
+{
+       KernelData kd;          // Kept as first member: KDL() casts a KernelData * to a KernelDataList *.
+       uint32_t * AdjList;     // Stores PDF indices, which are the destinations for propagation.
+                                               // The entry for node index n and direction d is located at:
+                                               // (n * N_D3Q19_IDX) + d
+       uint32_t * Grid;        // Stores the node indices; use the L_INDEX_4 macro for access.
+       uint32_t * Coords;      // Maps node indices to coordinates; use the C_INDEX_* macros for access.
+       int nFluid;                     // Number of fluid nodes; AdjList presumably holds nFluid * N_D3Q19_IDX entries (TODO confirm at allocation).
+       int nCells;                     // Total number of nodes allocated, including nodes for padding!
+       int Iteration;          // Current iteration, starts at 0.
+} KernelDataList;
+
+
+// Macro for casting KernelData * to KernelDataList *.
+#define KDL(_x_)       ((KernelDataList *)(_x_))
+
+
+
+
+// Build a function name extended by the propagation model name and the data layout.
+// FNAME(test) will be expanded to test_PushSoA if DATA_LAYOUT_NAME is defined
+// as SoA and PROP_MODEL_NAME is defined as Push.
+#define FNAME(functionName)    JOIN(JOIN(functionName,_),JOIN(PROP_MODEL_NAME,DATA_LAYOUT_NAME))
+
+#ifndef DATA_LAYOUT_NAME
+       #error DATA_LAYOUT_NAME must be defined
+#endif
+
+#ifndef PROP_MODEL_NAME
+       #error PROP_MODEL_NAME must be defined
+#endif
+
+// -----------------------------------------------------------------------
+// Index function for accessing the PDF array for different data layouts.
+
+#define P_INDEX_3              FNAME(PINDEX3)
+// Returns the position of direction d of cell cellIndex inside a PDF array holding nCells cells.
+static inline int FNAME(PINDEX3)(int nCells, int cellIndex, int d)
+{
+       Assert(nCells > 0);
+       Assert(cellIndex >= 0);
+       Assert(cellIndex < nCells);
+       // Only the D3Q19 discretization is supported; d must be one of its direction indices.
+       Assert(d >= 0);
+       #ifdef D3Q19
+               Assert(d < N_D3Q19);
+       #else
+               #error Not implemented for this discretization.
+       #endif
+       // SoA: direction-major (d * nCells + cell). AoS: cell-major. Both layouts are selected by definedness.
+#ifdef DATA_LAYOUT_SOA
+       return d * nCells + cellIndex;
+#elif defined(DATA_LAYOUT_AOS)
+       return cellIndex * N_D3Q19 + d;
+#else
+       #error P_INDEX_3 function not implemented for chosen data layout.
+#endif
+}
+
+#define P_INDEX_5              FNAME(PINDEX5)
+
+static inline int FNAME(PINDEX5)(KernelDataList * kdl, int x, int y, int z, int d)
+{
+       // Resolve (x, y, z) to its node index via kdl->Grid, then defer to P_INDEX_3.
+       Assert(kdl != NULL);
+#ifdef DEBUG
+       int * dims = kdl->kd.Dims;
+       uint32_t * grid = kdl->Grid;
+       Assert(grid != NULL);
+       Assert(dims != NULL);
+       Assert(dims[0] > 0);
+       Assert(dims[1] > 0);
+       Assert(dims[2] > 0);
+       Assert(x >= 0 && x < dims[0]);
+       Assert(y >= 0 && y < dims[1]);
+       Assert(z >= 0 && z < dims[2]);
+       Assert(d >= 0 && d < N_D3Q19);
+#endif
+       const uint32_t nodeIndex = kdl->Grid[L_INDEX_4(kdl->kd.Dims, x, y, z)];
+       return P_INDEX_3(kdl->nCells, nodeIndex, d);
+}
+
+// -----------------------------------------------------------------------
+// Macros for accessing coord array
+
+#define C_INDEX_X(cellIndex)   C_INDEX(cellIndex, 0)
+#define C_INDEX_Y(cellIndex)   C_INDEX(cellIndex, 1)
+#define C_INDEX_Z(cellIndex)   C_INDEX(cellIndex, 2)
+
+static inline int C_INDEX(int cellIndex, int xyz)
+{
+       // Three consecutive entries per cell; xyz (0, 1 or 2) selects one of them.
+       Assert(cellIndex >= 0);
+       Assert(xyz >= 0);
+       Assert(xyz < 3);
+       return xyz + 3 * cellIndex;
+}
+
+
+#endif // __BENCH_KERNEL_D3Q19_LIST_AA_COMMON_H__
diff --git a/src/BenchKernelD3Q19ListAaPv.c b/src/BenchKernelD3Q19ListAaPv.c
new file mode 100644 (file)
index 0000000..c522252
--- /dev/null
@@ -0,0 +1,847 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#include "BenchKernelD3Q19ListAaPvCommon.h"
+
+#include "Memory.h"
+#include "Vtk.h"
+#include "Vector.h"
+#include "LikwidIf.h"
+
+#include <inttypes.h>
+#include <math.h>
+
+#ifdef _OPENMP
+       #include <omp.h>
+#endif
+
+
+static void KernelEven(LatticeDesc * ld, KernelData * kernelData, CaseData * cd);
+static void KernelOdd( LatticeDesc * ld, KernelData * kernelData, CaseData * cd);
+
+void FNAME(D3Q19ListAaPvKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd)
+{
+       // Driver of the time loop: every loop pass calls KernelEven and then KernelOdd.
+       Assert(ld != NULL);
+       Assert(kernelData != NULL);
+       Assert(cd != NULL);
+
+       Assert(cd->Omega > 0.0);
+       Assert(cd->Omega < 2.0);
+       // kd/kdl are only needed by the optional output, statistics and verification code.
+#if defined(VTK_OUTPUT) || defined(STATISTICS) || defined(VERIFICATION)
+       KernelData     * kd  = (KernelData *)kernelData;
+       KernelDataList * kdl = KDL(kernelData);
+#endif
+
+       int maxIterations = cd->MaxIterations;
+       // NOTE(review): nThreads is assigned but not read by any statement visible in this function.
+       int nThreads = 1;
+#ifdef _OPENMP
+       nThreads = omp_get_max_threads();
+#endif
+       // Optional output of the state before the first step; VtkWrite gets -1 as iteration.
+       #ifdef VTK_OUTPUT
+               if (cd->VtkOutput) {
+                       kd->PdfsActive = kd->Pdfs[0];
+                       VtkWrite(ld, kd, cd, -1);
+               }
+       #endif
+
+       #ifdef STATISTICS
+               kd->PdfsActive = kd->Pdfs[0];
+               KernelStatistics(kd, ld, cd, 0);
+       #endif
+
+       // TODO: outer openmp parallel
+       // iter advances by 2 and each pass runs both steps, so an odd maxIterations executes maxIterations + 1 steps.
+       for(int iter = 0; iter < maxIterations; iter += 2) {
+
+               // ---------------------------------------------------
+               // even time step
+               // ---------------------------------------------------
+
+               X_LIKWID_START("list-aa-pv-even");
+               // With OpenMP every thread of the parallel region calls KernelEven itself.
+               #ifdef _OPENMP
+                       #pragma omp parallel default(none) shared(ld, kernelData, cd)
+               #endif
+               {
+                       KernelEven(ld, kernelData, cd);
+               }
+
+               X_LIKWID_STOP("list-aa-pv-even");
+               // Verification builds call KernelAddBodyForce after each step, with Iteration and PdfsActive updated first.
+               #ifdef VERIFICATION
+                       kdl->Iteration = iter;
+                       kd->PdfsActive = kd->Pdfs[0];
+                       KernelAddBodyForce(kd, ld, cd);
+               #endif
+
+               // ---------------------------------------------------
+               // odd time step
+               // ---------------------------------------------------
+
+               X_LIKWID_START("list-aa-pv-odd");
+               // With OpenMP every thread of the parallel region calls KernelOdd itself.
+               #ifdef _OPENMP
+                       #pragma omp parallel default(none) shared(ld, kernelData, cd)
+               #endif
+               {
+                       KernelOdd(ld, kernelData, cd);
+               }
+
+               X_LIKWID_STOP("list-aa-pv-odd");
+
+
+               #ifdef VERIFICATION
+                       kdl->Iteration = iter + 1;
+                       kd->PdfsActive = kd->Pdfs[0];
+                       KernelAddBodyForce(kd, ld, cd);
+               #endif
+               // The modulus test uses iter, which only takes even values here.
+               #ifdef VTK_OUTPUT
+                       if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) {
+                               kdl->Iteration = iter + 1;
+                               kd->PdfsActive = kd->Pdfs[0];
+                               VtkWrite(ld, kd, cd, iter);
+                       }
+               #endif
+
+               #ifdef STATISTICS
+                       kdl->Iteration = iter + 1;
+                       kd->PdfsActive = kd->Pdfs[0];
+                       KernelStatistics(kd, ld, cd, iter);
+               #endif
+
+       } // for (int iter = 0; ...
+       // Optional output of the final state, tagged with maxIterations.
+#ifdef VTK_OUTPUT
+       if (cd->VtkOutput) {
+               kd->PdfsActive = kd->Pdfs[0];
+               VtkWrite(ld, kd, cd, maxIterations);
+       }
+#endif
+
+#ifdef STATISTICS
+       kd->PdfsActive = kd->Pdfs[0];
+       KernelStatistics(kd, ld, cd, maxIterations);
+#endif
+
+       return;
+}
+
+static void KernelEven(LatticeDesc * ld, KernelData * kernelData, CaseData * cd)
+{
+       Assert(ld != NULL);
+       Assert(kernelData != NULL);
+       Assert(cd != NULL);
+       // Even step: each node of this thread is read from kd->Pdfs[0], relaxed, and written back in place with every direction pair swapped.
+       Assert(cd->Omega > 0.0);
+       Assert(cd->Omega < 2.0);
+
+       KernelData        * kd   = (KernelData *)kernelData;
+       KernelDataList    * kdl  = KDL(kernelData);
+       KernelDataListRia * kdlr = KDLR(kernelData);
+       // Two relaxation rates: omegaEven (= omega) scales the sum of a direction pair, omegaOdd (derived with magicParam) its difference.
+       PdfT omega = cd->Omega;
+       PdfT omegaEven = omega;
+
+       PdfT magicParam = 1.0 / 12.0;
+       PdfT omegaOdd = 1.0 / (0.5 + magicParam / (1.0 / omega - 0.5));
+
+       PdfT evenPart = 0.0;
+       PdfT oddPart = 0.0;
+       PdfT dir_indep_trm = 0.0;
+       // Weights as used below: w_0 for C, w_1 for the pairs N/S, E/W, T/B, w_2 for the remaining six pairs.
+       const PdfT w_0 = 1.0 /  3.0;
+       const PdfT w_1 = 1.0 / 18.0;
+       const PdfT w_2 = 1.0 / 36.0;
+
+       const PdfT w_1_x3 = w_1 * 3.0;  const PdfT w_1_nine_half = w_1 * 9.0 / 2.0;     PdfT w_1_indep = 0.0;
+       const PdfT w_2_x3 = w_2 * 3.0;  const PdfT w_2_nine_half = w_2 * 9.0 / 2.0;     PdfT w_2_indep = 0.0;
+
+       PdfT ui;
+
+       PdfT ux, uy, uz;
+       PdfT dens;
+
+       // Vector counterparts of the scalar constants (VSET presumably broadcasts one value to all lanes — confirm in Vector.h).
+       VPDFT VONE_HALF = VSET(0.5);
+       VPDFT VTHREE_HALF = VSET(3.0 / 2.0);
+
+       VPDFT vw_1_indep, vw_2_indep;
+       VPDFT vw_0 = VSET(w_0);
+       VPDFT vw_1 = VSET(w_1);
+       VPDFT vw_2 = VSET(w_2);
+
+       VPDFT vw_1_x3 = VSET(w_1_x3);
+       VPDFT vw_2_x3 = VSET(w_2_x3);
+       VPDFT vw_1_nine_half = VSET(w_1_nine_half);
+       VPDFT vw_2_nine_half = VSET(w_2_nine_half);
+
+       VPDFT vui, vux, vuy, vuz, vdens;
+
+       VPDFT vevenPart, voddPart, vdir_indep_trm;
+
+       VPDFT vomegaEven = VSET(omegaEven);
+       VPDFT vomegaOdd  = VSET(omegaOdd);
+
+       // Declare pdf_N, pdf_E, pdf_S, pdf_W, ...
+       #define X(name, idx, idxinv, x, y, z)   PdfT JOIN(pdf_,name); VPDFT JOIN(vpdf_,name);
+       D3Q19_LIST
+       #undef X
+
+       PdfT * src = kd->Pdfs[0];
+
+       int nCells = kdl->nCells;
+       // This thread handles node indices [threadIndices[threadId], threadIndices[threadId + 1]).
+       int threadId = 0;
+#ifdef _OPENMP
+       threadId =  omp_get_thread_num();
+#endif
+
+       int * threadIndices = kdlr->FluidNodeThreadIndices;
+
+       int nFluidThread = threadIndices[threadId + 1] - threadIndices[threadId];
+       int nFluidVec    = nFluidThread - (nFluidThread % VSIZE);
+       // The first nFluidVec nodes (a multiple of VSIZE) go through the vector loop, the remainder through the scalar loop.
+       int indexStartVec = threadIndices[threadId];
+       int indexStopVec  = threadIndices[threadId] + nFluidVec;
+       int indexStop     = threadIndices[threadId] + nFluidThread;
+
+       #define I(index, dir)   P_INDEX_3((nCells), (index), (dir))
+
+       for (int index = indexStartVec; index < indexStopVec; index += VSIZE) {
+
+               // Vector loads start at node `index`; assumes VSIZE consecutive nodes are contiguous per direction (SoA) — TODO confirm.
+               #define X(name, idx, idxinv, _x, _y, _z)        JOIN(vpdf_,name) = VLDU(&src[I(index, idx)]);
+               D3Q19_LIST
+               #undef X
+
+               // vux, vuy, vuz and vdens are plain sums/differences of the loaded PDFs; the formulas are given in the comments.
+               //vux = vpdf_E + vpdf_NE + vpdf_SE + vpdf_TE + vpdf_BE -
+               //           vpdf_W - vpdf_NW - vpdf_SW - vpdf_TW - vpdf_BW;
+               vux = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_E,VADD(vpdf_NE,vpdf_SE)),VADD(vpdf_TE,vpdf_BE)),vpdf_W),vpdf_NW),vpdf_SW),vpdf_TW),vpdf_BW);
+               //vuy = vpdf_N + vpdf_NE + vpdf_NW + vpdf_TN + vpdf_BN -
+               //           vpdf_S - vpdf_SE - vpdf_SW - vpdf_TS - vpdf_BS;
+               vuy = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_N,VADD(vpdf_NE,vpdf_NW)),VADD(vpdf_TN,vpdf_BN)),vpdf_S),vpdf_SE),vpdf_SW),vpdf_TS),vpdf_BS);
+               //vuz = vpdf_T + vpdf_TE + vpdf_TW + vpdf_TN + vpdf_TS -
+               //           vpdf_B - vpdf_BE - vpdf_BW - vpdf_BN - vpdf_BS;
+               vuz = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_T,VADD(vpdf_TE,vpdf_TW)),VADD(vpdf_TN,vpdf_TS)),vpdf_B),vpdf_BE),vpdf_BW),vpdf_BN),vpdf_BS);
+
+               //vdens = vpdf_C +
+               //          vpdf_N  + vpdf_E  + vpdf_S  + vpdf_W  +
+               //          vpdf_NE + vpdf_SE + vpdf_SW + vpdf_NW +
+               //          vpdf_T  + vpdf_TN + vpdf_TE + vpdf_TS + vpdf_TW +
+               //          vpdf_B  + vpdf_BN + vpdf_BE + vpdf_BS + vpdf_BW;
+               vdens = VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(vpdf_C,VADD(vpdf_N,vpdf_E)),VADD(vpdf_S,vpdf_W)),VADD(vpdf_NE,vpdf_SE)),VADD(vpdf_SW,vpdf_NW)),VADD(vpdf_T,vpdf_TN)),VADD(vpdf_TE,vpdf_TS)),VADD(vpdf_TW,vpdf_B)),VADD(vpdf_BN,vpdf_BE)),VADD(vpdf_BS,vpdf_BW));
+
+               //vdir_indep_trm = vdens - (vux * vux + vuy * vuy + vuz * vuz) * VTHREE_HALF;
+               vdir_indep_trm = VSUB(vdens,VMUL(VADD(VADD(VMUL(vux,vux),VMUL(vuy,vuy)),VMUL(vuz,vuz)),VTHREE_HALF));
+               // Center direction: relaxed with vomegaEven only and stored back to its own slot.
+               //src[I(index, D3Q19_C)             ]  =[UA] vpdf_C - vomegaEven * (vpdf_C - vw_0 * vdir_indep_trm);
+               VSTU(&src[I(index, D3Q19_C)],VSUB(vpdf_C,VMUL(vomegaEven,VSUB(vpdf_C,VMUL(vw_0,vdir_indep_trm)))));
+
+               //vw_1_indep = vw_1 * vdir_indep_trm;
+               vw_1_indep = VMUL(vw_1,vdir_indep_trm);
+               // Each pair below: compute even/odd parts, then store the result of one direction into the slot of its opposite.
+               //vui = vuy;
+               vui = vuy;
+               //vevenPart = vomegaEven * (VONE_HALF * (vpdf_N + vpdf_S) - vui * vui * vw_1_nine_half - vw_1_indep);
+               vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_N,vpdf_S)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep));
+               //voddPart  = vomegaOdd  * (VONE_HALF * (vpdf_N - vpdf_S) - vui * vw_1_x3);
+               voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_N,vpdf_S)),VMUL(vui,vw_1_x3)));
+               //src[I(index, D3Q19_S)]  =[UA] vpdf_N - vevenPart - voddPart;
+               VSTU(&src[I(index, D3Q19_S)],VSUB(VSUB(vpdf_N,vevenPart),voddPart));
+               //src[I(index, D3Q19_N)]  =[UA] vpdf_S - vevenPart + voddPart;
+               VSTU(&src[I(index, D3Q19_N)],VADD(VSUB(vpdf_S,vevenPart),voddPart));
+
+               //vui = vux;
+               vui = vux;
+               //vevenPart = vomegaEven * (VONE_HALF * (vpdf_E + vpdf_W) - vui * vui * vw_1_nine_half - vw_1_indep);
+               vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_E,vpdf_W)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep));
+               //voddPart =  vomegaOdd  * (VONE_HALF * (vpdf_E - vpdf_W) - vui * vw_1_x3 );
+               voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_E,vpdf_W)),VMUL(vui,vw_1_x3)));
+               //src[I(index, D3Q19_W)]  =[UA] vpdf_E - vevenPart - voddPart;
+               VSTU(&src[I(index, D3Q19_W)],VSUB(VSUB(vpdf_E,vevenPart),voddPart));
+               //src[I(index, D3Q19_E)]  =[UA] vpdf_W - vevenPart + voddPart;
+               VSTU(&src[I(index, D3Q19_E)],VADD(VSUB(vpdf_W,vevenPart),voddPart));
+
+               //vui = vuz;
+               vui = vuz;
+               //vevenPart = vomegaEven * (VONE_HALF * (vpdf_T + vpdf_B) - vui * vui * vw_1_nine_half - vw_1_indep);
+               vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_T,vpdf_B)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep));
+               //voddPart  = vomegaOdd  * (VONE_HALF * (vpdf_T - vpdf_B) - vui * vw_1_x3);
+               voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_T,vpdf_B)),VMUL(vui,vw_1_x3)));
+               //src[I(index, D3Q19_B)]  =[UA] vpdf_T - vevenPart - voddPart;
+               VSTU(&src[I(index, D3Q19_B)],VSUB(VSUB(vpdf_T,vevenPart),voddPart));
+               //src[I(index, D3Q19_T)]  =[UA] vpdf_B - vevenPart + voddPart;
+               VSTU(&src[I(index, D3Q19_T)],VADD(VSUB(vpdf_B,vevenPart),voddPart));
+
+               //vw_2_indep = vw_2 * vdir_indep_trm;
+               vw_2_indep = VMUL(vw_2,vdir_indep_trm);
+
+               //vui = vuy - vux;
+               vui = VSUB(vuy,vux);
+               //vevenPart = vomegaEven * (VONE_HALF * (vpdf_NW + vpdf_SE) - vui * vui * vw_2_nine_half - vw_2_indep);
+               vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_NW,vpdf_SE)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
+               //voddPart  = vomegaOdd  * (VONE_HALF * (vpdf_NW - vpdf_SE) - vui * vw_2_x3);
+               voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_NW,vpdf_SE)),VMUL(vui,vw_2_x3)));
+               //src[I(index, D3Q19_SE)] =[UA] vpdf_NW - vevenPart - voddPart;
+               VSTU(&src[I(index, D3Q19_SE)],VSUB(VSUB(vpdf_NW,vevenPart),voddPart));
+               //src[I(index, D3Q19_NW)] =[UA] vpdf_SE - vevenPart + voddPart;
+               VSTU(&src[I(index, D3Q19_NW)],VADD(VSUB(vpdf_SE,vevenPart),voddPart));
+
+               //vui = vux + vuy;
+               vui = VADD(vux,vuy);
+               //vevenPart = vomegaEven * (VONE_HALF * (vpdf_NE + vpdf_SW) - vui * vui * vw_2_nine_half - vw_2_indep);
+               vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_NE,vpdf_SW)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
+               //voddPart  = vomegaOdd  * (VONE_HALF * (vpdf_NE - vpdf_SW) - vui * vw_2_x3);
+               voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_NE,vpdf_SW)),VMUL(vui,vw_2_x3)));
+               //src[I(index, D3Q19_SW)] =[UA] vpdf_NE - vevenPart - voddPart;
+               VSTU(&src[I(index, D3Q19_SW)],VSUB(VSUB(vpdf_NE,vevenPart),voddPart));
+               //src[I(index, D3Q19_NE)] =[UA] vpdf_SW - vevenPart + voddPart;
+               VSTU(&src[I(index, D3Q19_NE)],VADD(VSUB(vpdf_SW,vevenPart),voddPart));
+
+               //vui = vuz - vux;
+               vui = VSUB(vuz,vux);
+               //vevenPart = vomegaEven * (VONE_HALF * (vpdf_TW + vpdf_BE) - vui * vui * vw_2_nine_half - vw_2_indep);
+               vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TW,vpdf_BE)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
+               //voddPart  = vomegaOdd  * (VONE_HALF * (vpdf_TW - vpdf_BE) - vui * vw_2_x3);
+               voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TW,vpdf_BE)),VMUL(vui,vw_2_x3)));
+               //src[I(index, D3Q19_BE)] =[UA] vpdf_TW - vevenPart - voddPart;
+               VSTU(&src[I(index, D3Q19_BE)],VSUB(VSUB(vpdf_TW,vevenPart),voddPart));
+               //src[I(index, D3Q19_TW)] =[UA] vpdf_BE - vevenPart + voddPart;
+               VSTU(&src[I(index, D3Q19_TW)],VADD(VSUB(vpdf_BE,vevenPart),voddPart));
+
+               //vui = vux + vuz;
+               vui = VADD(vux,vuz);
+               //vevenPart = vomegaEven * (VONE_HALF * (vpdf_TE + vpdf_BW) - vui * vui * vw_2_nine_half - vw_2_indep);
+               vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TE,vpdf_BW)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
+               //voddPart  = vomegaOdd  * (VONE_HALF * (vpdf_TE - vpdf_BW) - vui * vw_2_x3);
+               voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TE,vpdf_BW)),VMUL(vui,vw_2_x3)));
+               //src[I(index, D3Q19_BW)] =[UA] vpdf_TE - vevenPart - voddPart;
+               VSTU(&src[I(index, D3Q19_BW)],VSUB(VSUB(vpdf_TE,vevenPart),voddPart));
+               //src[I(index, D3Q19_TE)] =[UA] vpdf_BW - vevenPart + voddPart;
+               VSTU(&src[I(index, D3Q19_TE)],VADD(VSUB(vpdf_BW,vevenPart),voddPart));
+
+               //vui = vuz - vuy;
+               vui = VSUB(vuz,vuy);
+               //vevenPart = vomegaEven * (VONE_HALF * (vpdf_TS + vpdf_BN) - vui * vui * vw_2_nine_half - vw_2_indep);
+               vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TS,vpdf_BN)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
+               //voddPart  = vomegaOdd  * (VONE_HALF * (vpdf_TS - vpdf_BN) - vui * vw_2_x3);
+               voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TS,vpdf_BN)),VMUL(vui,vw_2_x3)));
+               //src[I(index, D3Q19_BN)] =[UA] vpdf_TS - vevenPart - voddPart;
+               VSTU(&src[I(index, D3Q19_BN)],VSUB(VSUB(vpdf_TS,vevenPart),voddPart));
+               //src[I(index, D3Q19_TS)] =[UA] vpdf_BN - vevenPart + voddPart;
+               VSTU(&src[I(index, D3Q19_TS)],VADD(VSUB(vpdf_BN,vevenPart),voddPart));
+
+               //vui = vuy + vuz;
+               vui = VADD(vuy,vuz);
+               //vevenPart = vomegaEven * (VONE_HALF * (vpdf_TN + vpdf_BS) - vui * vui * vw_2_nine_half - vw_2_indep);
+               vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TN,vpdf_BS)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
+               //voddPart  = vomegaOdd  * (VONE_HALF * (vpdf_TN - vpdf_BS) - vui * vw_2_x3);
+               voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TN,vpdf_BS)),VMUL(vui,vw_2_x3)));
+               //src[I(index, D3Q19_BS)] =[UA] vpdf_TN - vevenPart - voddPart;
+               VSTU(&src[I(index, D3Q19_BS)],VSUB(VSUB(vpdf_TN,vevenPart),voddPart));
+               //src[I(index, D3Q19_TN)] =[UA] vpdf_BS - vevenPart + voddPart;
+               VSTU(&src[I(index, D3Q19_TN)],VADD(VSUB(vpdf_BS,vevenPart),voddPart));
+
+       } // loop over fluid nodes
+       // Scalar remainder: the same update as the vector loop, one node per pass.
+       for (int index = indexStopVec; index < indexStop; ++index) {
+
+               #define X(name, idx, idxinv, _x, _y, _z)        JOIN(pdf_,name) = src[I(index, idx)];
+               D3Q19_LIST
+               #undef X
+
+
+               ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE -
+                        pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW;
+               uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN -
+                        pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS;
+               uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS -
+                        pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS;
+
+               dens = pdf_C +
+                          pdf_N  + pdf_E  + pdf_S  + pdf_W  +
+                          pdf_NE + pdf_SE + pdf_SW + pdf_NW +
+                          pdf_T  + pdf_TN + pdf_TE + pdf_TS + pdf_TW +
+                          pdf_B  + pdf_BN + pdf_BE + pdf_BS + pdf_BW;
+
+               dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz)*3.0/2.0;
+
+               // direction: w_0
+               src[I(index, D3Q19_C)             ]  = pdf_C - omegaEven*(pdf_C - w_0*dir_indep_trm);
+
+               // direction: w_1
+               w_1_indep = w_1*dir_indep_trm;
+
+               ui = uy;
+               evenPart = omegaEven*( 0.5*(pdf_N + pdf_S) - ui*ui*w_1_nine_half - w_1_indep );
+               oddPart = omegaOdd*(0.5*(pdf_N - pdf_S) - ui*w_1_x3 );
+               src[I(index, D3Q19_S)]  = pdf_N - evenPart - oddPart;
+               src[I(index, D3Q19_N)]  = pdf_S - evenPart + oddPart;
+
+               ui = ux;
+               evenPart = omegaEven*( 0.5*(pdf_E + pdf_W) - ui*ui*w_1_nine_half - w_1_indep );
+               oddPart = omegaOdd*(0.5*(pdf_E - pdf_W) - ui*w_1_x3 );
+               src[I(index, D3Q19_W)]  = pdf_E - evenPart - oddPart;
+               src[I(index, D3Q19_E)]  = pdf_W - evenPart + oddPart;
+
+               ui = uz;
+               evenPart = omegaEven*( 0.5*(pdf_T + pdf_B) - ui*ui*w_1_nine_half - w_1_indep );
+               oddPart = omegaOdd*(0.5*(pdf_T - pdf_B) - ui*w_1_x3 );
+               src[I(index, D3Q19_B)]  = pdf_T - evenPart - oddPart;
+               src[I(index, D3Q19_T)]  = pdf_B - evenPart + oddPart;
+
+               // direction: w_2
+               w_2_indep = w_2*dir_indep_trm;
+
+               ui = -ux + uy;
+               evenPart = omegaEven*( 0.5*(pdf_NW + pdf_SE) - ui*ui*w_2_nine_half - w_2_indep );
+               oddPart = omegaOdd*(0.5*(pdf_NW - pdf_SE) - ui*w_2_x3 );
+               src[I(index, D3Q19_SE)] = pdf_NW - evenPart - oddPart;
+               src[I(index, D3Q19_NW)] = pdf_SE - evenPart + oddPart;
+
+               ui = ux + uy;
+               evenPart = omegaEven*( 0.5*(pdf_NE + pdf_SW) - ui*ui*w_2_nine_half - w_2_indep );
+               oddPart = omegaOdd*(0.5*(pdf_NE - pdf_SW) - ui*w_2_x3 );
+               src[I(index, D3Q19_SW)] = pdf_NE - evenPart - oddPart;
+               src[I(index, D3Q19_NE)] = pdf_SW - evenPart + oddPart;
+
+               ui = -ux + uz;
+               evenPart = omegaEven*( 0.5*(pdf_TW + pdf_BE) - ui*ui*w_2_nine_half - w_2_indep );
+               oddPart = omegaOdd*(0.5*(pdf_TW - pdf_BE) - ui*w_2_x3 );
+               src[I(index, D3Q19_BE)] = pdf_TW - evenPart - oddPart;
+               src[I(index, D3Q19_TW)] = pdf_BE - evenPart + oddPart;
+
+               ui = ux + uz;
+               evenPart = omegaEven*( 0.5*(pdf_TE + pdf_BW) - ui*ui*w_2_nine_half - w_2_indep );
+               oddPart = omegaOdd*(0.5*(pdf_TE - pdf_BW) - ui*w_2_x3 );
+               src[I(index, D3Q19_BW)] = pdf_TE - evenPart - oddPart;
+               src[I(index, D3Q19_TE)] = pdf_BW - evenPart + oddPart;
+
+               ui = -uy + uz;
+               evenPart = omegaEven*( 0.5*(pdf_TS + pdf_BN) - ui*ui*w_2_nine_half - w_2_indep );
+               oddPart = omegaOdd*(0.5*(pdf_TS - pdf_BN) - ui*w_2_x3 );
+               src[I(index, D3Q19_BN)] = pdf_TS - evenPart - oddPart;
+               src[I(index, D3Q19_TS)] = pdf_BN - evenPart + oddPart;
+
+               ui = uy + uz;
+               evenPart = omegaEven*( 0.5*(pdf_TN + pdf_BS) - ui*ui*w_2_nine_half - w_2_indep );
+               oddPart = omegaOdd*(0.5*(pdf_TN - pdf_BS) - ui*w_2_x3 );
+               src[I(index, D3Q19_BS)] = pdf_TN - evenPart - oddPart;
+               src[I(index, D3Q19_TN)] = pdf_BS - evenPart + oddPart;
+
+       } // loop over fluid nodes
+
+       #undef I
+
+       return;
+}
+
+static void KernelOdd(LatticeDesc * ld, KernelData * kernelData, CaseData * cd)
+{
+
+       Assert(ld != NULL);
+       Assert(kernelData != NULL);
+       Assert(cd != NULL);
+
+       Assert(cd->Omega > 0.0);
+       Assert(cd->Omega < 2.0);
+
+       KernelData        * kd   = (KernelData *)kernelData;
+       KernelDataList    * kdl  = KDL(kernelData);
+       KernelDataListRia * kdlr = KDLR(kernelData);
+
+       PdfT omega = cd->Omega;
+       PdfT omegaEven = omega;
+
+       PdfT magicParam = 1.0 / 12.0;
+       PdfT omegaOdd = 1.0 / (0.5 + magicParam / (1.0 / omega - 0.5));
+
+       PdfT evenPart = 0.0;
+       PdfT oddPart = 0.0;
+       PdfT dir_indep_trm = 0.0;
+
+       const PdfT w_0 = 1.0 /  3.0;
+       const PdfT w_1 = 1.0 / 18.0;
+       const PdfT w_2 = 1.0 / 36.0;
+
+       const PdfT w_1_x3 = w_1 * 3.0;  const PdfT w_1_nine_half = w_1 * 9.0 / 2.0;     PdfT w_1_indep = 0.0;
+       const PdfT w_2_x3 = w_2 * 3.0;  const PdfT w_2_nine_half = w_2 * 9.0 / 2.0;     PdfT w_2_indep = 0.0;
+
+       PdfT ui;
+
+       PdfT ux, uy, uz;
+       PdfT dens;
+
+
+       VPDFT VONE_HALF = VSET(0.5);
+       VPDFT VTHREE_HALF = VSET(3.0 / 2.0);
+
+       VPDFT vw_1_indep, vw_2_indep;
+       VPDFT vw_0 = VSET(w_0);
+       VPDFT vw_1 = VSET(w_1);
+       VPDFT vw_2 = VSET(w_2);
+
+       VPDFT vw_1_x3 = VSET(w_1_x3);
+       VPDFT vw_2_x3 = VSET(w_2_x3);
+       VPDFT vw_1_nine_half = VSET(w_1_nine_half);
+       VPDFT vw_2_nine_half = VSET(w_2_nine_half);
+
+       VPDFT vui, vux, vuy, vuz, vdens;
+
+       VPDFT vevenPart, voddPart, vdir_indep_trm;
+
+       VPDFT vomegaEven = VSET(omegaEven);
+       VPDFT vomegaOdd  = VSET(omegaOdd);
+
+
+       // Declare pdf_N, pdf_E, pdf_S, pdf_W, ...
+       #define X(name, idx, idxinv, x, y, z)   PdfT JOIN(pdf_,name); VPDFT JOIN(vpdf_,name);
+       D3Q19_LIST
+       #undef X
+
+       // Declare pointers to pdfs ppdf_N, ppdf_E, ppdf_S, ppdf_W, ...
+       #define X(name, idx, idxinv, x, y, z)   PdfT * JOIN(ppdf_,name) = NULL;
+       D3Q19_LIST
+       #undef X
+
+       uint32_t nConsecNodes = kdlr->nConsecNodes;
+       uint32_t * consecNodes = kdlr->ConsecNodes;
+       uint32_t consecIndex = 0;
+       uint32_t consecValue = 0;
+
+#ifndef DEBUG
+       UNUSED(nConsecNodes);
+#endif
+
+       PdfT * src = kd->Pdfs[0];
+
+       int nCells = kdl->nCells;
+
+       uint32_t adjListIndex;
+       uint32_t * adjList = kdl->AdjList;
+
+       int threadId = 0;
+
+       #ifdef _OPENMP
+               threadId = omp_get_thread_num();
+       #endif
+
+       consecIndex = kdlr->ConsecThreadIndices[threadId];
+       consecValue = 0;
+
+       int * threadIndices = kdlr->FluidNodeThreadIndices;
+
+       int nFluidThread = threadIndices[threadId + 1] - threadIndices[threadId];
+
+       int indexStart = threadIndices[threadId];
+       int indexStop  = threadIndices[threadId] + nFluidThread;
+
+       #define I(index, dir)   P_INDEX_3((nCells), (index), (dir))
+
+       #define ADJ_LIST(dir) adjList[adjListIndex + (dir)]
+
+       int pointerOffset = 1;
+
+       for (int index = indexStart; index < indexStop; index += 1) {
+
+               if (consecValue > 0) {
+                       --consecValue;
+                       // Increment all pdf pointers by an offset. If the previous iteration was
+                       // scalar, increment only by one. If the previous iteration was vectorized,
+                       // increment by the vector width. These offsets are set in the corresponding
+                       // if branches.
+                       #define X(name, idx, idxinv, _x, _y, _z)        JOIN(ppdf_,name) += pointerOffset;
+                       D3Q19_LIST
+                       #undef X
+               }
+               else {
+                       Assert(consecIndex < nConsecNodes);
+
+                       consecValue = consecNodes[consecIndex] - 1;
+                       // Load new pointers to PDFs of local cell:
+
+                       adjListIndex = index * N_D3Q19_IDX;
+
+                       #define X(name, idx, idxinv, _x, _y, _z)        JOIN(ppdf_,name) = &(src[adjList[adjListIndex + idxinv]]);
+                       D3Q19_LIST_WO_C
+                       #undef X
+
+                       ppdf_C = &(src[P_INDEX_3(nCells, index, D3Q19_C)]);
+                       ++consecIndex;
+               }
+
+               #define X(name, idx, idxinv, _x, _y, _z)        JOIN(pdf_,name) = *JOIN(ppdf_,name);
+               D3Q19_LIST
+               #undef X
+
+               if (consecValue >= (VSIZE - 1)) {
+                       // Vectorized part.
+
+                       #define X(name, idx, idxinv, _x, _y, _z)        JOIN(vpdf_,name) = VLDU(JOIN(ppdf_,name));
+                       D3Q19_LIST_WO_C
+                       #undef X
+
+                       vpdf_C = VLDU(ppdf_C);
+
+                       //vux = vpdf_E + vpdf_NE + vpdf_SE + vpdf_TE + vpdf_BE -
+                       //      vpdf_W - vpdf_NW - vpdf_SW - vpdf_TW - vpdf_BW;
+                       vux = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_E,VADD(vpdf_NE,vpdf_SE)),VADD(vpdf_TE,vpdf_BE)),vpdf_W),vpdf_NW),vpdf_SW),vpdf_TW),vpdf_BW);
+                       //vuy = vpdf_N + vpdf_NE + vpdf_NW + vpdf_TN + vpdf_BN -
+                       //      vpdf_S - vpdf_SE - vpdf_SW - vpdf_TS - vpdf_BS;
+                       vuy = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_N,VADD(vpdf_NE,vpdf_NW)),VADD(vpdf_TN,vpdf_BN)),vpdf_S),vpdf_SE),vpdf_SW),vpdf_TS),vpdf_BS);
+                       //vuz = vpdf_T + vpdf_TE + vpdf_TW + vpdf_TN + vpdf_TS -
+                       //      vpdf_B - vpdf_BE - vpdf_BW - vpdf_BN - vpdf_BS;
+                       vuz = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_T,VADD(vpdf_TE,vpdf_TW)),VADD(vpdf_TN,vpdf_TS)),vpdf_B),vpdf_BE),vpdf_BW),vpdf_BN),vpdf_BS);
+
+                       //vdens = vpdf_C +
+                       //        vpdf_N  + vpdf_E  + vpdf_S  + vpdf_W  +
+                       //        vpdf_NE + vpdf_SE + vpdf_SW + vpdf_NW +
+                       //        vpdf_T  + vpdf_TN + vpdf_TE + vpdf_TS + vpdf_TW +
+                       //        vpdf_B  + vpdf_BN + vpdf_BE + vpdf_BS + vpdf_BW;
+                       vdens = VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(vpdf_C,VADD(vpdf_N,vpdf_E)),VADD(vpdf_S,vpdf_W)),VADD(vpdf_NE,vpdf_SE)),
+                                                VADD(vpdf_SW,vpdf_NW)),VADD(vpdf_T,vpdf_TN)),VADD(vpdf_TE,vpdf_TS)),VADD(vpdf_TW,vpdf_B)),VADD(vpdf_BN,vpdf_BE)),VADD(vpdf_BS,vpdf_BW));
+
+                       //vdir_indep_trm = vdens - (vux * vux + vuy * vuy + vuz * vuz) * VTHREE_HALF;
+                       vdir_indep_trm = VSUB(vdens,VMUL(VADD(VADD(VMUL(vux,vux),VMUL(vuy,vuy)),VMUL(vuz,vuz)),VTHREE_HALF));
+
+                       adjListIndex = index * N_D3Q19_IDX;
+
+                       //src[I(index, D3Q19_C)]  =[UA] vpdf_C - vomegaEven * (vpdf_C - vw_0 * vdir_indep_trm);
+                       VSTU(&src[I(index, D3Q19_C)],VSUB(vpdf_C,VMUL(vomegaEven,VSUB(vpdf_C,VMUL(vw_0,vdir_indep_trm)))));
+
+                       //vw_1_indep = vw_1 * vdir_indep_trm;
+                       vw_1_indep = VMUL(vw_1,vdir_indep_trm);
+
+                       //vui = vuy;
+                       vui = vuy;
+                       //vevenPart = vomegaEven * (VONE_HALF * (vpdf_N + vpdf_S) - vui * vui * vw_1_nine_half - vw_1_indep);
+                       vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_N,vpdf_S)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep));
+                       //voddPart  = vomegaOdd  * (VONE_HALF * (vpdf_N - vpdf_S) - vui * vw_1_x3);
+                       voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_N,vpdf_S)),VMUL(vui,vw_1_x3)));
+                       //src[ADJ_LIST(D3Q19_N)]  =[UA] vpdf_N - vevenPart - voddPart;
+                       VSTU(ppdf_S, VSUB(VSUB(vpdf_N,vevenPart),voddPart));
+                       //src[ADJ_LIST(D3Q19_S)]  =[UA] vpdf_S - vevenPart + voddPart;
+                       VSTU(ppdf_N, VADD(VSUB(vpdf_S,vevenPart),voddPart));
+
+                       //vui = vux;
+                       vui = vux;
+                       //vevenPart = vomegaEven * (VONE_HALF * (vpdf_E + vpdf_W) - vui * vui * vw_1_nine_half - vw_1_indep);
+                       vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_E,vpdf_W)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep));
+                       //voddPart  = vomegaOdd  * (VONE_HALF * (vpdf_E - vpdf_W) - vui * vw_1_x3);
+                       voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_E,vpdf_W)),VMUL(vui,vw_1_x3)));
+                       //src[ADJ_LIST(D3Q19_E)]  =[UA] vpdf_E - vevenPart - voddPart;
+                       VSTU(ppdf_W, VSUB(VSUB(vpdf_E,vevenPart),voddPart));
+                       //src[ADJ_LIST(D3Q19_W)]  =[UA] vpdf_W - vevenPart + voddPart;
+                       VSTU(ppdf_E, VADD(VSUB(vpdf_W,vevenPart),voddPart));
+
+                       //vui = vuz;
+                       vui = vuz;
+                       //vevenPart = vomegaEven * (VONE_HALF * (vpdf_T + vpdf_B) - vui * vui * vw_1_nine_half - vw_1_indep);
+                       vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_T,vpdf_B)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep));
+                       //voddPart  = vomegaOdd  * (VONE_HALF * (vpdf_T - vpdf_B) - vui * vw_1_x3);
+                       voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_T,vpdf_B)),VMUL(vui,vw_1_x3)));
+                       //src[ADJ_LIST(D3Q19_T)]  =[UA] vpdf_T - vevenPart - voddPart;
+                       VSTU(ppdf_B, VSUB(VSUB(vpdf_T,vevenPart),voddPart));
+                       //src[ADJ_LIST(D3Q19_B)]  =[UA] vpdf_B - vevenPart + voddPart;
+                       VSTU(ppdf_T, VADD(VSUB(vpdf_B,vevenPart),voddPart));
+
+                       //vw_2_indep = vw_2 * vdir_indep_trm;
+                       vw_2_indep = VMUL(vw_2,vdir_indep_trm);
+
+                       //vui = vuy - vux;
+                       vui = VSUB(vuy,vux);
+                       //vevenPart = vomegaEven * (VONE_HALF * (vpdf_NW + vpdf_SE) - vui * vui * vw_2_nine_half - vw_2_indep);
+                       vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_NW,vpdf_SE)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
+                       //voddPart  = vomegaOdd  * (VONE_HALF * (vpdf_NW - vpdf_SE) - vui * vw_2_x3);
+                       voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_NW,vpdf_SE)),VMUL(vui,vw_2_x3)));
+                       //src[ADJ_LIST(D3Q19_NW)] =[UA] vpdf_NW - vevenPart - voddPart;
+                       VSTU(ppdf_SE, VSUB(VSUB(vpdf_NW,vevenPart),voddPart));
+                       //src[ADJ_LIST(D3Q19_SE)] =[UA] vpdf_SE - vevenPart + voddPart;
+                       VSTU(ppdf_NW, VADD(VSUB(vpdf_SE,vevenPart),voddPart));
+
+                       //vui = vux + vuy;
+                       vui = VADD(vux,vuy);
+                       //vevenPart = vomegaEven * (VONE_HALF * (vpdf_NE + vpdf_SW) - vui * vui * vw_2_nine_half - vw_2_indep);
+                       vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_NE,vpdf_SW)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
+                       //voddPart  = vomegaOdd  * (VONE_HALF * (vpdf_NE - vpdf_SW) - vui * vw_2_x3);
+                       voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_NE,vpdf_SW)),VMUL(vui,vw_2_x3)));
+                       //src[ADJ_LIST(D3Q19_NE)] =[UA] vpdf_NE - vevenPart - voddPart;
+                       VSTU(ppdf_SW, VSUB(VSUB(vpdf_NE,vevenPart),voddPart));
+                       //src[ADJ_LIST(D3Q19_SW)] =[UA] vpdf_SW - vevenPart + voddPart;
+                       VSTU(ppdf_NE, VADD(VSUB(vpdf_SW,vevenPart),voddPart));
+
+                       //vui = vuz - vux;
+                       vui = VSUB(vuz,vux);
+                       //vevenPart = vomegaEven * (VONE_HALF * (vpdf_TW + vpdf_BE) - vui * vui * vw_2_nine_half - vw_2_indep);
+                       vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TW,vpdf_BE)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
+                       //voddPart  = vomegaOdd  * (VONE_HALF * (vpdf_TW - vpdf_BE) - vui * vw_2_x3);
+                       voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TW,vpdf_BE)),VMUL(vui,vw_2_x3)));
+                       //src[ADJ_LIST(D3Q19_TW)] =[UA] vpdf_TW - vevenPart - voddPart;
+                       VSTU(ppdf_BE, VSUB(VSUB(vpdf_TW,vevenPart),voddPart));
+                       //src[ADJ_LIST(D3Q19_BE)] =[UA] vpdf_BE - vevenPart + voddPart;
+                       VSTU(ppdf_TW, VADD(VSUB(vpdf_BE,vevenPart),voddPart));
+
+                       //vui = vux + vuz;
+                       vui = VADD(vux,vuz);
+                       //vevenPart = vomegaEven * (VONE_HALF * (vpdf_TE + vpdf_BW) - vui * vui * vw_2_nine_half - vw_2_indep);
+                       vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TE,vpdf_BW)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
+                       //voddPart  = vomegaOdd  * (VONE_HALF * (vpdf_TE - vpdf_BW) - vui * vw_2_x3);
+                       voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TE,vpdf_BW)),VMUL(vui,vw_2_x3)));
+                       //src[ADJ_LIST(D3Q19_TE)] =[UA] vpdf_TE - vevenPart - voddPart;
+                       VSTU(ppdf_BW, VSUB(VSUB(vpdf_TE,vevenPart),voddPart));
+                       //src[ADJ_LIST(D3Q19_BW)] =[UA] vpdf_BW - vevenPart + voddPart;
+                       VSTU(ppdf_TE, VADD(VSUB(vpdf_BW,vevenPart),voddPart));
+
+                       //vui = vuz - vuy;
+                       vui = VSUB(vuz,vuy);
+                       //vevenPart = vomegaEven * (VONE_HALF * (vpdf_TS + vpdf_BN) - vui * vui * vw_2_nine_half - vw_2_indep);
+                       vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TS,vpdf_BN)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
+                       //voddPart  = vomegaOdd  * (VONE_HALF * (vpdf_TS - vpdf_BN) - vui * vw_2_x3);
+                       voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TS,vpdf_BN)),VMUL(vui,vw_2_x3)));
+                       //src[ADJ_LIST(D3Q19_TS)] =[UA] vpdf_TS - vevenPart - voddPart;
+                       VSTU(ppdf_TN, VSUB(VSUB(vpdf_TS,vevenPart),voddPart));
+                       //src[ADJ_LIST(D3Q19_BN)] =[UA] vpdf_BN - vevenPart + voddPart;
+                       VSTU(ppdf_BN, VADD(VSUB(vpdf_BN,vevenPart),voddPart));
+
+                       //vui = vuy + vuz;
+                       vui = VADD(vuy,vuz);
+                       //vevenPart = vomegaEven * (VONE_HALF * (vpdf_TN + vpdf_BS) - vui * vui * vw_2_nine_half - vw_2_indep);
+                       vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TN,vpdf_BS)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep));
+                       //voddPart  = vomegaOdd  * (VONE_HALF * (vpdf_TN - vpdf_BS) - vui * vw_2_x3);
+                       voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TN,vpdf_BS)),VMUL(vui,vw_2_x3)));
+                       //src[ADJ_LIST(D3Q19_TN)] =[UA] vpdf_TN - vevenPart - voddPart;
+                       VSTU(ppdf_BS, VSUB(VSUB(vpdf_TN,vevenPart),voddPart));
+                       //src[ADJ_LIST(D3Q19_BS)] =[UA] vpdf_BS - vevenPart + voddPart;
+                       VSTU(ppdf_TN, VADD(VSUB(vpdf_BS,vevenPart),voddPart));
+
+                       consecValue   -= (VSIZE - 1);
+                       index         += (VSIZE - 1);
+                       pointerOffset  = VSIZE;
+
+               }
+               else {
+                       // Scalar part.
+
+                       #define X(name, idx, idxinv, _x, _y, _z)        JOIN(pdf_,name) = *(JOIN(ppdf_,name));
+                       D3Q19_LIST_WO_C
+                       #undef X
+
+                       pdf_C = *ppdf_C;
+
+                       ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE -
+                                pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW;
+                       uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN -
+                                pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS;
+                       uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS -
+                                pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS;
+
+                       dens = pdf_C +
+                                  pdf_N  + pdf_E  + pdf_S  + pdf_W  +
+                                  pdf_NE + pdf_SE + pdf_SW + pdf_NW +
+                                  pdf_T  + pdf_TN + pdf_TE + pdf_TS + pdf_TW +
+                                  pdf_B  + pdf_BN + pdf_BE + pdf_BS + pdf_BW;
+
+                       dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz)*3.0/2.0;
+
+                       adjListIndex = index * N_D3Q19_IDX;
+
+                       // direction: w_0
+                       src[I(index, D3Q19_C)             ]  = pdf_C - omegaEven * (pdf_C - w_0 * dir_indep_trm);
+
+                       // direction: w_1
+                       w_1_indep = w_1 * dir_indep_trm;
+
+                       ui = uy;
+                       evenPart = omegaEven * (0.5 * (pdf_N + pdf_S) - ui * ui * w_1_nine_half - w_1_indep);
+                       oddPart  = omegaOdd  * (0.5 * (pdf_N - pdf_S) - ui * w_1_x3);
+                       src[ADJ_LIST(D3Q19_N)]  = pdf_N - evenPart - oddPart;
+                       src[ADJ_LIST(D3Q19_S)]  = pdf_S - evenPart + oddPart;
+
+                       ui = ux;
+                       evenPart = omegaEven * (0.5 * (pdf_E + pdf_W) - ui * ui * w_1_nine_half - w_1_indep);
+                       oddPart  = omegaOdd  * (0.5 * (pdf_E - pdf_W) - ui * w_1_x3);
+                       src[ADJ_LIST(D3Q19_E)]  = pdf_E - evenPart - oddPart;
+                       src[ADJ_LIST(D3Q19_W)]  = pdf_W - evenPart + oddPart;
+
+                       ui = uz;
+                       evenPart = omegaEven * (0.5 * (pdf_T + pdf_B) - ui * ui * w_1_nine_half - w_1_indep);
+                       oddPart  = omegaOdd  * (0.5 * (pdf_T - pdf_B) - ui * w_1_x3);
+                       src[ADJ_LIST(D3Q19_T)]  = pdf_T - evenPart - oddPart;
+                       src[ADJ_LIST(D3Q19_B)]  = pdf_B - evenPart + oddPart;
+
+                       // direction: w_2
+                       w_2_indep = w_2 * dir_indep_trm;
+
+                       ui = -ux + uy;
+                       evenPart = omegaEven * (0.5 * (pdf_NW + pdf_SE) - ui * ui * w_2_nine_half - w_2_indep);
+                       oddPart  = omegaOdd  * (0.5 * (pdf_NW - pdf_SE) - ui * w_2_x3);
+                       src[ADJ_LIST(D3Q19_NW)] = pdf_NW - evenPart - oddPart;
+                       src[ADJ_LIST(D3Q19_SE)] = pdf_SE - evenPart + oddPart;
+
+                       ui = ux + uy;
+                       evenPart = omegaEven * (0.5 * (pdf_NE + pdf_SW) - ui * ui * w_2_nine_half - w_2_indep);
+                       oddPart  = omegaOdd  * (0.5 * (pdf_NE - pdf_SW) - ui * w_2_x3);
+                       src[ADJ_LIST(D3Q19_NE)] = pdf_NE - evenPart - oddPart;
+                       src[ADJ_LIST(D3Q19_SW)] = pdf_SW - evenPart + oddPart;
+
+                       ui = -ux + uz;
+                       evenPart = omegaEven * (0.5 * (pdf_TW + pdf_BE) - ui * ui * w_2_nine_half - w_2_indep);
+                       oddPart  = omegaOdd  * (0.5 * (pdf_TW - pdf_BE) - ui * w_2_x3);
+                       src[ADJ_LIST(D3Q19_TW)] = pdf_TW - evenPart - oddPart;
+                       src[ADJ_LIST(D3Q19_BE)] = pdf_BE - evenPart + oddPart;
+
+                       ui = ux + uz;
+                       evenPart = omegaEven * (0.5 * (pdf_TE + pdf_BW) - ui * ui * w_2_nine_half - w_2_indep);
+                       oddPart  = omegaOdd  * (0.5 * (pdf_TE - pdf_BW) - ui * w_2_x3);
+                       src[ADJ_LIST(D3Q19_TE)] = pdf_TE - evenPart - oddPart;
+                       src[ADJ_LIST(D3Q19_BW)] = pdf_BW - evenPart + oddPart;
+
+                       ui = -uy + uz;
+                       evenPart = omegaEven * (0.5 * (pdf_TS + pdf_BN) - ui * ui * w_2_nine_half - w_2_indep);
+                       oddPart  = omegaOdd  * (0.5 * (pdf_TS - pdf_BN) - ui * w_2_x3);
+                       src[ADJ_LIST(D3Q19_TS)] = pdf_TS - evenPart - oddPart;
+                       src[ADJ_LIST(D3Q19_BN)] = pdf_BN - evenPart + oddPart;
+
+                       ui = uy + uz;
+                       evenPart = omegaEven * (0.5 * (pdf_TN + pdf_BS) - ui * ui * w_2_nine_half - w_2_indep);
+                       oddPart  = omegaOdd  * (0.5 * (pdf_TN - pdf_BS) - ui * w_2_x3);
+                       src[ADJ_LIST(D3Q19_TN)] = pdf_TN - evenPart - oddPart;
+                       src[ADJ_LIST(D3Q19_BS)] = pdf_BS - evenPart + oddPart;
+
+                       pointerOffset = 1;
+               }
+
+       } // loop over fluid nodes
+
+       #undef ADJ_LIST
+       #undef I
+}
diff --git a/src/BenchKernelD3Q19ListAaPv.h b/src/BenchKernelD3Q19ListAaPv.h
new file mode 100644 (file)
index 0000000..84dfc05
--- /dev/null
@@ -0,0 +1,38 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#ifndef __BENCH_KERNEL_D3Q19_LIST_AA_PV__
+#define __BENCH_KERNEL_D3Q19_LIST_AA_PV__
+// Interface of the D3Q19 list "AaPv" kernel, PushSoA flavor: only an init and a deinit routine are declared here.
+#include "Kernel.h"
+// Init entry point. NOTE(review): presumably creates the kernel data and returns it via *kernelData - confirm against the definition.
+void D3Q19ListAaPvInit_PushSoA(LatticeDesc * ld, KernelData ** kernelData, Parameters * params);
+// Deinit entry point. NOTE(review): presumably releases what the init routine set up - confirm against the definition.
+void D3Q19ListAaPvDeinit_PushSoA(LatticeDesc * ld, KernelData ** kernelData);
+
+
+
+#endif // __BENCH_KERNEL_D3Q19_LIST_AA_PV__
diff --git a/src/BenchKernelD3Q19ListAaPvCommon.c b/src/BenchKernelD3Q19ListAaPvCommon.c
new file mode 100644 (file)
index 0000000..9cdf0e1
--- /dev/null
@@ -0,0 +1,821 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#include "BenchKernelD3Q19ListAaPvCommon.h"
+
+#include "Memory.h"
+#include "Vtk.h"
+
+#include <errno.h>
+#include <limits.h>
+#include <math.h>
+
+#ifdef _OPENMP
+       #include <omp.h>
+#endif
+
+// Forward declaration.
+void FNAME(D3Q19ListAaPvKernel)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd);
+
+
+
+
+// -----------------------------------------------------------------------
+// Functions which are used as callbacks by the kernel to read or write
+// single PDFs and whole nodes.
+
+// Callback: reads the PDF of direction dir of node (x, y, z) from the active
+// PDF array and stores it in *pdf.
+//
+// Where the value lives depends on the parity of the iteration counter:
+//  - even: PDFs are stored inverse, i.e. the local PDF is found in a remote
+//    node through the adjacency list entry of the inverse direction (the
+//    center PDF is addressed directly),
+//  - odd:  the PDF is addressed directly via P_INDEX_5.
+static void FNAME(BCGetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT * pdf)
+{
+       Assert(kd != NULL);
+       Assert(kd->PdfsActive != NULL);
+       Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+       Assert(pdf != NULL);
+
+       Assert(x >= 0);
+       Assert(y >= 0);
+       Assert(z >= 0);
+       Assert(x < kd->Dims[0]);
+       Assert(y < kd->Dims[1]);
+       Assert(z < kd->Dims[2]);
+       Assert(dir >= 0);
+       Assert(dir < N_D3Q19);
+
+       KernelDataList * kdl = (KernelDataList *)kd;
+
+       if (kdl->Iteration % 2 != 0) {
+               // Odd iteration: direct addressing.
+               *pdf = kd->PdfsActive[P_INDEX_5(KDL(kd), x, y, z, dir)];
+               return;
+       }
+
+       // Even iteration: Pdfs are stored inverse, local PDFs are located in
+       // remote nodes.
+       uint32_t cell = kdl->Grid[L_INDEX_4(kd->Dims, x, y, z)];
+
+       if (dir == D3Q19_C) {
+               // The center PDF never leaves its own cell.
+               *pdf = kd->PdfsActive[P_INDEX_3(kdl->nCells, cell, dir)];
+       }
+       else {
+               uint32_t adjBase = cell * N_D3Q19_IDX;
+
+               *pdf = kd->PdfsActive[kdl->AdjList[adjBase + D3Q19_INV[dir]]];
+       }
+
+       return;
+}
+
+static void FNAME(BCSetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT pdf)
+{
+       Assert(kd != NULL);
+       Assert(kd->PdfsActive != NULL);
+       Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+       Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
+       Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
+       Assert(dir >= 0); Assert(dir < N_D3Q19);
+
+#if 0
+       if (isnan(pdf)) {
+               printf("ERROR: setting nan %d %d %d %d %s\n", x, y, z, dir, D3Q19_NAMES[dir]);
+               DEBUG_BREAK_POINT();
+               exit(1);
+       }
+#endif
+
+       KernelDataList * kdl = (KernelDataList *)kd;
+
+       if (kdl->Iteration % 2 == 0) {
+               // Pdfs are stored inverse, local PDFs are located in remote nodes
+
+               uint32_t nodeIndex = KDL(kd)->Grid[L_INDEX_4(kd->Dims, x, y, z)];
+
+               if (dir != D3Q19_C) {
+                       uint32_t adjListIndex = nodeIndex * N_D3Q19_IDX;
+
+                       kd->PdfsActive[KDL(kd)->AdjList[adjListIndex + D3Q19_INV[dir]]] = pdf;
+               }
+               else {
+                       kd->PdfsActive[P_INDEX_3(KDL(kd)->nCells, nodeIndex, dir)] = pdf;
+               }
+
+       }
+       else {
+               kd->PdfsActive[P_INDEX_5(KDL(kd), x, y, z, dir)] = pdf;
+       }
+
+       return;
+}
+
+
+static void GetNode(KernelData * kd, int x, int y, int z, PdfT * pdfs)
+{
+       Assert(kd != NULL);
+       Assert(kd->PdfsActive != NULL);
+       Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+       Assert(pdfs != NULL);
+       Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
+       Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
+
+       KernelDataList * kdl = (KernelDataList *)kd;
+
+       if(kdl->Iteration % 2 == 0){
+
+               uint32_t nodeIndex = kdl->Grid[L_INDEX_4(kdl->kd.Dims, x, y, z)];
+               uint32_t adjListIndex = nodeIndex * N_D3Q19_IDX;
+
+               // Load PDFs of local cell: pdf_N = src[adjList[adjListIndex + D3Q19_S]]; ...
+               pdfs[D3Q19_C] = kd->PdfsActive[P_INDEX_3(kdl->nCells, nodeIndex, D3Q19_C)];
+
+               #define X(name, idx, idxinv, _x, _y, _z)        pdfs[idx] = kd->PdfsActive[kdl->AdjList[adjListIndex + idxinv]];
+               D3Q19_LIST_WO_C
+               #undef X
+
+       } else {
+
+               #define I(x, y, z, dir) P_INDEX_5(KDL(kd), (x), (y), (z), (dir))
+               #define X(name, idx, idxinv, _x, _y, _z)        pdfs[idx] = kd->PdfsActive[I(x, y, z, idx)];
+               D3Q19_LIST
+               #undef X
+               #undef I
+
+       }
+
+#if 0
+       for (int d = 0; d < 19; ++d) {
+               if(isnan(pdfs[d]) || isinf(pdfs[d])) {
+                       printf("%d %d %d %d nan! get node\n", x, y, z, d);
+                                               for (int d2 = 0; d2 < 19; ++d2) {
+                                                       printf("%d: %e\n", d2, pdfs[d2]);
+                                               }
+                       exit(1);
+               }
+       }
+#endif
+
+       return;
+}
+
+
+static void SetNode(KernelData * kd, int x, int y, int z, PdfT * pdfs)
+{
+       Assert(kd != NULL);
+       Assert(kd->PdfsActive != NULL);
+       Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+       Assert(pdfs != NULL);
+
+       Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
+       Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
+
+#if 0
+       for (int d = 0; d < 19; ++d) {
+               if(isnan(pdfs[d])) {
+                       printf("%d %d %d %d nan! get node\n", x, y, z, d);
+                                               for (int d2 = 0; d2 < 19; ++d2) {
+                                                       printf("%d: %e\n", d2, pdfs[d2]);
+                                               }
+                       exit(1);
+               }
+       }
+#endif
+
+       KernelDataList * kdl = (KernelDataList *)kd;
+
+       if(kdl->Iteration % 2 == 0){
+
+               uint32_t nodeIndex = kdl->Grid[L_INDEX_4(kdl->kd.Dims, x, y, z)];
+               uint32_t adjListIndex = nodeIndex * N_D3Q19_IDX;
+
+               // Load PDFs of local cell: pdf_N = src[adjList[adjListIndex + D3Q19_S]]; ...
+               kd->PdfsActive[P_INDEX_3(kdl->nCells, nodeIndex, D3Q19_C)] = pdfs[D3Q19_C];
+
+               #define X(name, idx, idxinv, _x, _y, _z)        kd->PdfsActive[kdl->AdjList[adjListIndex + idxinv]] = pdfs[idx];
+               D3Q19_LIST_WO_C
+               #undef X
+
+       } else {
+
+               #define I(x, y, z, dir) P_INDEX_5(KDL(kd), (x), (y), (z), (dir))
+               #define X(name, idx, idxinv, _x, _y, _z)        kd->PdfsActive[I(x, y, z, idx)] = pdfs[idx];
+               D3Q19_LIST
+               #undef X
+               #undef I
+
+       }
+
+       return;
+}
+
+// Prints the kernel specific command line parameters to stdout.
+//
+// Declared with an explicit (void) parameter list: in C (before C23) an empty
+// list "()" leaves the parameters unspecified and disables argument checking.
+static void ParameterUsage(void)
+{
+       printf("Kernel parameters:\n");
+       printf("  [-blk <n>] [-blk-[xyz] <n>]\n");
+
+       return;
+}
+
+// Converts the textual value of a blocking option into a positive int.
+//
+// str is the raw argument text; the base is auto-detected by strtol (base 0).
+// Terminates the program with exit(1) if str is not a complete integer
+// literal, does not fit into an int, or is not > 0.
+static int ParseBlockingValue(const char * str)
+{
+       char * end = NULL;
+
+       errno = 0;
+       long value = strtol(str, &end, 0);
+
+       // end == str: no digits at all; *end != '\0': trailing garbage ("12abc");
+       // ERANGE / > INT_MAX: value would silently wrap when stored in an int.
+       if (end == str || *end != '\0' || errno == ERANGE || value > INT_MAX) {
+               printf("ERROR: blocking parameter '%s' is not a valid integer.\n", str);
+               exit(1);
+       }
+
+       if (value <= 0) {
+               printf("ERROR: blocking parameter must be > 0.\n");
+               exit(1);
+       }
+
+       return (int)value;
+}
+
+// Parses the kernel specific arguments in params->KernelArgs.
+//
+// blk must point to three ints which receive the blocking factors for x, y
+// and z. All three are initialized to 0 and stay 0 if the respective option
+// is not given.
+//
+// Recognized options (single or double leading dash):
+//   -blk <n>      sets all three blocking factors to n
+//   -blk-x <n>    sets blk[0]
+//   -blk-y <n>    sets blk[1]
+//   -blk-z <n>    sets blk[2]
+//   -h, -help     prints the usage and exits with status 1
+//
+// Any unknown option, a missing value, or an invalid value prints an error
+// message and terminates the program with exit(1).
+static void ParseParameters(Parameters * params, int * blk)
+{
+       Assert(params != NULL);
+       Assert(blk != NULL);
+
+       blk[0] = 0; blk[1] = 0; blk[2] = 0;
+
+       #define ARG_IS(param)                   (!strcmp(params->KernelArgs[i], param))
+       #define NEXT_ARG_PRESENT() \
+               do { \
+                       if (i + 1 >= params->nKernelArgs) { \
+                               printf("ERROR: argument %s requires a parameter.\n", params->KernelArgs[i]); \
+                               exit(1); \
+                       } \
+               } while (0)
+
+
+       for (int i = 0; i < params->nKernelArgs; ++i) {
+               if (ARG_IS("-blk") || ARG_IS("--blk")) {
+                       NEXT_ARG_PRESENT();
+
+                       // ++i consumes the option's value.
+                       blk[0] = blk[1] = blk[2] = ParseBlockingValue(params->KernelArgs[++i]);
+               }
+               else if (ARG_IS("-blk-x") || ARG_IS("--blk-x")) {
+                       NEXT_ARG_PRESENT();
+
+                       blk[0] = ParseBlockingValue(params->KernelArgs[++i]);
+               }
+               else if (ARG_IS("-blk-y") || ARG_IS("--blk-y")) {
+                       NEXT_ARG_PRESENT();
+
+                       blk[1] = ParseBlockingValue(params->KernelArgs[++i]);
+               }
+               else if (ARG_IS("-blk-z") || ARG_IS("--blk-z")) {
+                       NEXT_ARG_PRESENT();
+
+                       blk[2] = ParseBlockingValue(params->KernelArgs[++i]);
+               }
+               else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) {
+                       ParameterUsage();
+                       exit(1);
+               }
+               else {
+                       printf("ERROR: unknown kernel parameter.\n");
+                       ParameterUsage();
+                       exit(1);
+               }
+       }
+
+       #undef ARG_IS
+       #undef NEXT_ARG_PRESENT
+
+       return;
+}
+
+// Returns 1 if fluid node index does not continue the access pattern of fluid
+// node index - 1, i.e. if for at least one of the first N_D3Q19 - 1 directions
+// its adjacency list entry is not the predecessor's entry plus one. Returns 0
+// otherwise. Requires index >= 1.
+static int AccessPatternDiffers(const uint32_t * adjList, int index)
+{
+       // Loop over all directions except the center one.
+       for (int d = 0; d < N_D3Q19 - 1; ++d) {
+               Assert(d != D3Q19_C);
+
+               if (adjList[index * N_D3Q19_IDX + d] != adjList[(index - 1) * N_D3Q19_IDX + d] + 1) {
+                       // Different access pattern.
+                       return 1;
+               }
+       }
+
+       return 0;
+}
+
+// Partitions the fluid nodes into runs of consecutive nodes which share the
+// same access pattern and records the result in kdlr.
+//
+// A run ends where the adjacency list entries of two successive fluid nodes
+// are not all "predecessor + 1", and additionally at every thread boundary, so
+// that no run spans the fluid nodes of two threads.
+//
+// Results stored in kdlr:
+//   ConsecNodes / nConsecNodes   length of every run / number of runs
+//   ConsecThreadIndices          per thread the index of its first run; the
+//                                array has nThreads + 1 entries, the last one
+//                                is the sentinel (uint32_t)-1
+//   FluidNodeThreadIndices       per thread the index of its first fluid node;
+//                                entry nThreads holds nFluid
+//   nConsecThreadIndices, nFluidNodeThreadIndices are both set to nThreads.
+//
+// The arrays are allocated here and handed over to kdlr.
+// NOTE(review): presumably released by the deinit routine - confirm.
+//
+// Finally prints statistics about how many fluid node updates could be
+// vectorized for several vector widths. Terminates the program with exit(1)
+// if the thread index arrays cannot be allocated.
+static void SetupConsecNodes(LatticeDesc * ld, KernelDataListRia * kdlr, int nThreads)
+{
+       Assert(ld != NULL);
+       Assert(kdlr != NULL);
+       Assert(nThreads > 0);
+
+       uint32_t * adjList = kdlr->kdl.AdjList;
+
+       uint32_t nConsecNodes = 0;
+       uint32_t consecIndex = 0;
+
+       int nFluid = kdlr->kdl.nFluid;
+
+       uint32_t * consecThreadIndices = (uint32_t *)malloc(sizeof(uint32_t) * (nThreads + 1));
+       int * fluidNodeThreadIndices = (int *)malloc(sizeof(int) * (nThreads + 1));
+
+       if (consecThreadIndices == NULL || fluidNodeThreadIndices == NULL) {
+               printf("ERROR: allocating memory for thread indices failed.\n");
+               exit(1);
+       }
+
+       int nNodesPerThread = nFluid / nThreads;
+
+       // Distribute the fluid nodes evenly over the threads; the first
+       // nFluid % nThreads threads get one node more.
+       for (int i = 0; i < nThreads; ++i) {
+               consecThreadIndices[i]    = i * nNodesPerThread + MinI(i, nFluid % nThreads);
+               fluidNodeThreadIndices[i] = consecThreadIndices[i];
+       }
+       // Sentinel: never equals a fluid node index, so no boundary is detected
+       // beyond the last thread.
+       consecThreadIndices[nThreads]    = (uint32_t)-1;
+       fluidNodeThreadIndices[nThreads] = nFluid;
+
+       int indexThread = 1;
+
+       // We execute following code two times.
+       // - The first time to get the count of how many entries we need for the
+       //   consecNodes array.
+       // - The second time to fill the array.
+
+       // Loop over adjacency list of all nodes.
+       // Compare if adjacent nodes share the same access pattern.
+       for (int index = 1; index < nFluid; ++index) {
+
+               int different = AccessPatternDiffers(adjList, index);
+
+               if (consecThreadIndices[indexThread] == (uint32_t)index) {
+                       // We are at a thread boundary. Starting from this index the fluids
+                       // belong to another thread. Force a break, if nodes are consecutive.
+                       ++indexThread;
+                       different = 1;
+               }
+
+               if (different) {
+                       ++consecIndex;
+               }
+       }
+
+       if (nFluid > 0) {
+               nConsecNodes = consecIndex + 1;
+       }
+
+       uint32_t * consecNodes;
+       MemAlloc((void **)&consecNodes, sizeof(uint32_t) * nConsecNodes);
+
+       consecIndex = 0;
+
+       if (nFluid > 0) {
+               // The first fluid node always opens the first run.
+               consecNodes[consecIndex] = 1;
+       }
+
+       indexThread = 1;
+       consecThreadIndices[0] = 0;
+
+       // Second pass: same traversal, but now record the run lengths and turn
+       // the thread boundaries from fluid node indices into run indices.
+       for (int index = 1; index < nFluid; ++index) {
+
+               int different = AccessPatternDiffers(adjList, index);
+
+               if (consecThreadIndices[indexThread] == (uint32_t)index) {
+                       // We are at a thread boundary. Starting from this index the fluids
+                       // belong to another thread. Force a break, if nodes are consecutive.
+                       consecThreadIndices[indexThread] = consecIndex + 1;
+                       ++indexThread;
+                       different = 1;
+               }
+
+               if (different) {
+                       ++consecIndex;
+                       Assert(consecIndex < nConsecNodes);
+                       consecNodes[consecIndex] = 1;
+               }
+               else {
+                       Assert(consecIndex < nConsecNodes);
+                       consecNodes[consecIndex] += 1;
+               }
+       }
+
+
+       kdlr->ConsecNodes = consecNodes;
+       kdlr->nConsecNodes = nConsecNodes;
+
+       kdlr->ConsecThreadIndices  = consecThreadIndices;
+       kdlr->nConsecThreadIndices = nThreads;
+
+       kdlr->FluidNodeThreadIndices = fluidNodeThreadIndices;
+       kdlr->nFluidNodeThreadIndices = nThreads;
+
+       printf("# total fluid nodes: %d   consecutive blocks: %u\n", nFluid, (unsigned int)nConsecNodes);
+
+       uint32_t vwidth[]       = {2, 4, 8, 16, 32};
+       uint32_t vectorizable[] = {0, 0, 0,  0,  0};
+
+       // A run of length n yields n / width full vector updates.
+       for (uint32_t i = 0; i < nConsecNodes; ++i) {
+               for (int k = 0; k < (int)N_ELEMS(vwidth); ++k) {
+                       vectorizable[k] += consecNodes[i] / vwidth[k];
+               }
+       }
+
+       printf("# vectorizable fraction of fluid node updates:\n");
+       for (int i = 0; i < (int)N_ELEMS(vwidth); ++i) {
+
+               // Avoid 0/0 (NaN) for a lattice without fluid nodes.
+               double percent = 0.0;
+
+               if (nFluid > 0) {
+                       percent = (double)vectorizable[i] * vwidth[i] / nFluid * 100.0;
+               }
+
+               printf("#  vector width: %2u     %6.2f %% (%u/%d fluid nodes)\n",
+                       (unsigned int)vwidth[i], percent,
+                       (unsigned int)(vectorizable[i] * vwidth[i]), nFluid);
+       }
+
+       return;
+}
+
+void FNAME(D3Q19ListAaPvInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
+{
+       KernelData * kd;
+       KernelDataList * kdl;
+       KernelDataListRia * kdlr;
+       MemAlloc((void **)&kdlr, sizeof(KernelDataListRia));
+
+       kd = (KernelData *)kdlr;
+       kdl = KDL(kdlr);
+
+       *kernelData = kd;
+
+#ifdef DEBUG
+       kd->Pdfs[0] = NULL;
+       kd->Pdfs[1] = NULL;
+       kd->PdfsActive = NULL;
+       kd->DstPdfs = NULL;
+       kd->SrcPdfs = NULL;
+       kd->Dims[0] = -1;
+       kd->Dims[1] = -1;
+       kd->Dims[2] = -1;
+       kd->GlobalDims[0] = -1;
+       kd->GlobalDims[1] = -1;
+       kd->GlobalDims[2] = -1;
+       kd->Offsets[0] = -1;
+       kd->Offsets[1] = -1;
+       kd->Offsets[2] = -1;
+
+       kd->ObstIndices = NULL;
+       kd->nObstIndices = -1;
+       kd->BounceBackPdfsSrc = NULL;
+       kd->BounceBackPdfsDst = NULL;
+       kd->nBounceBackPdfs = -1;
+
+       kdl->AdjList = NULL;
+       kdl->Coords = NULL;
+       kdl->Grid = NULL;
+       kdl->nCells = -1;
+       kdl->nFluid = -1;
+
+       kdlr->ConsecNodes = NULL;
+       kdlr->nConsecNodes = 0;
+       kdlr->ConsecThreadIndices = NULL;
+       kdlr->nConsecThreadIndices = 0;
+#endif
+
+       // Ajust the dimensions according to padding, if used.
+       kd->Dims[0] = kd->GlobalDims[0] = ld->Dims[0];
+       kd->Dims[1] = kd->GlobalDims[1] = ld->Dims[1];
+       kd->Dims[2] = kd->GlobalDims[2] = ld->Dims[2];
+
+       int * lDims = ld->Dims;
+
+       int lX = lDims[0];
+       int lY = lDims[1];
+       int lZ = lDims[2];
+
+       int nTotalCells = lX * lY * lZ;
+       int nCells = ld->nFluid; // TODO: + padding
+       int nFluid = ld->nFluid;
+
+       kdl->nCells = nCells;
+       kdl->nFluid = nFluid;
+
+       PdfT * pdfs[2];
+
+       int blk[3] = { 0 };
+
+       ParseParameters(params, blk);
+
+       if (blk[0] == 0) blk[0] = lX;
+       if (blk[1] == 0) blk[1] = lY;
+       if (blk[2] == 0) blk[2] = lZ;
+
+       printf("# blocking               x: %3d y: %3d z: %3d\n", blk[0], blk[1], blk[2]);
+
+       double latMiB      = nCells * sizeof(PdfT) * N_D3Q19 / 1024.0 / 1024.0;
+       double latFluidMib = nFluid * sizeof(PdfT) * N_D3Q19 / 1024.0 / 1024.0;
+       double latPadMib   = (nCells - nFluid) * sizeof(PdfT) * N_D3Q19 / 1024.0 / 1024.0;
+
+       printf("# lattice size:          %e MiB\n", latMiB);
+       printf("# fluid lattice size:    %e MiB\n", latFluidMib);
+       printf("# lattice padding:       %e MiB\n", latPadMib);
+
+#define PAGE_4K                4096
+
+       printf("# aligning lattices to:  %d b\n", PAGE_4K);
+
+       MemAllocAligned((void **)&pdfs[0], sizeof(PdfT) * nCells * N_D3Q19, PAGE_4K);
+
+       kd->Pdfs[0] = pdfs[0];
+
+       // Initialize PDFs with some (arbitrary) data for correct NUMA placement.
+       // Here we touch only the fluid nodes as this loop is OpenMP parallel and
+       // we want the same scheduling as in the kernel.
+       #ifdef _OPENMP
+               #pragma omp parallel for
+       #endif
+       for (int i = 0; i < nFluid; ++i) { for(int d = 0; d < N_D3Q19; ++d) {
+               pdfs[0][P_INDEX_3(nCells, i, d)] = 1.0;
+       } }
+
+       // Initialize all PDFs to some standard value.
+       for (int i = 0; i < nFluid; ++i) { for(int d = 0; d < N_D3Q19; ++d) {
+               pdfs[0][P_INDEX_3(nCells, i, d)] = 0.0;
+       } }
+
+       // ----------------------------------------------------------------------
+       // create grid which will hold the index numbers of the fluid nodes
+
+       uint32_t * grid;
+
+       if (MemAlloc((void **)&grid, nTotalCells * sizeof(uint32_t))) {
+               printf("ERROR: allocating grid for numbering failed: %lu bytes.\n", nTotalCells * sizeof(uint32_t));
+               exit(1);
+       }
+       kdl->Grid = grid;
+
+       int latticeIndex;
+
+#ifdef DEBUG
+       for(int z = 0; z < lZ; ++z) {
+               for(int y = 0; y < lY; ++y) {
+                       for(int x = 0; x < lX; ++x) {
+
+                               latticeIndex = L_INDEX_4(ld->Dims, x, y, z);
+
+                               grid[latticeIndex] = ~0;
+                       }
+               }
+       }
+#endif
+
+       // ----------------------------------------------------------------------
+       // generate numbering over grid
+
+       uint32_t * coords;
+
+       if (MemAlloc((void **)&coords, nFluid * sizeof(uint32_t) * 3)) {
+               printf("ERROR: allocating coords array failed: %lu bytes.\n", nFluid * sizeof(uint32_t) * 3);
+               exit(1);
+       }
+
+       kdl->Coords = coords;
+
+       // Index for the PDF nodes can start at 0 as we distinguish solid and fluid nodes
+       // through the ld->Lattice array.
+       int counter = 0;
+
+       // Blocking is implemented via setup of the adjacency list. The kernel later will
+       // walk through the lattice blocked automatically.
+       for (int bZ = 0; bZ < lZ; bZ += blk[2]) {
+       for (int bY = 0; bY < lY; bY += blk[1]) {
+       for (int bX = 0; bX < lX; bX += blk[0]) {
+
+               int eX = MIN(bX + blk[0], lX);
+               int eY = MIN(bY + blk[1], lY);
+               int eZ = MIN(bZ + blk[2], lZ);
+
+
+               for (int z = bZ; z < eZ; ++z) {
+               for (int y = bY; y < eY; ++y) {
+               for (int x = bX; x < eX; ++x) {
+
+                       latticeIndex = L_INDEX_4(lDims, x, y, z);
+
+                       if (ld->Lattice[latticeIndex] != LAT_CELL_OBSTACLE) {
+                               grid[latticeIndex] = counter;
+
+                               coords[C_INDEX_X(counter)] = x;
+                               coords[C_INDEX_Y(counter)] = y;
+                               coords[C_INDEX_Z(counter)] = z;
+
+                               ++counter;
+                       }
+               } } }
+       } } }
+
+       Verify(counter == nFluid);
+
+       uint32_t * adjList;
+
+       double indexMib = nFluid * sizeof(uint32_t) * N_D3Q19_IDX / 1024.0 / 1024.0;
+
+       printf("# index size:            %e MiB\n", indexMib);
+
+       // AdjList only requires 18 instead of 19 entries per node, as
+       // the center PDF needs no addressing.
+       if (MemAlloc((void **)&adjList, nFluid * sizeof(uint32_t) * N_D3Q19_IDX)) {
+               printf("ERROR: allocating adjList array failed: %lu bytes.\n", nFluid * sizeof(uint32_t) * N_D3Q19_IDX);
+               exit(1);
+       }
+
+       kdl->AdjList = adjList;
+
+       int x, y, z;
+
+       uint32_t neighborIndex;
+       uint32_t dstIndex;
+
+       int nx, ny, nz, px, py, pz;
+
+       // Loop over all fluid nodes and compute the indices to the neighboring
+       // PDFs for configured data layout (AoS/SoA).
+       // TODO: Parallelized loop to ensure correct NUMA placement.
+       // #ifdef _OPENMP  --> add line continuation
+       //      #pragma omp parallel for default(none)
+       //              shared(nFluid, nCells, coords, D3Q19_INV, D3Q19_X, D3Q19_Y, D3Q19_Z,
+       //                              stderr,
+       //                              lDims, grid, ld, lX, lY, lZ, adjList)
+       //              private(x, y, z, nx, ny, nz, neighborIndex, dstIndex)
+       // #endif
+       for (int index = 0; index < nFluid; ++index) {
+               x = coords[C_INDEX_X(index)];
+               y = coords[C_INDEX_Y(index)];
+               z = coords[C_INDEX_Z(index)];
+
+               Assert(x >= 0 && x < lX);
+               Assert(y >= 0 && y < lY);
+               Assert(z >= 0 && z < lZ);
+
+               Assert(ld->Lattice[L_INDEX_4(lDims, x, y, z)] != LAT_CELL_OBSTACLE);
+
+               // Loop over all directions except the center one.
+               for(int d = 0; d < N_D3Q19 - 1; ++d) {
+                       Assert(d != D3Q19_C);
+
+#ifdef PROP_MODEL_PUSH
+                       nx = x + D3Q19_X[d];
+                       ny = y + D3Q19_Y[d];
+                       nz = z + D3Q19_Z[d];
+
+#elif PROP_MODEL_PULL
+                       nx = x - D3Q19_X[d];
+                       ny = y - D3Q19_Y[d];
+                       nz = z - D3Q19_Z[d];
+#else
+                       #error No implementation for this PROP_MODEL_NAME.
+#endif
+                       // If the neighbor is outside the latcie in X direction and we have a
+                       // periodic boundary then we need to wrap around.
+                       if (    ((nx < 0 || nx >= lX) && ld->PeriodicX) ||
+                                       ((ny < 0 || ny >= lY) && ld->PeriodicY) ||
+                                       ((nz < 0 || nz >= lZ) && ld->PeriodicZ)
+                                                                                                                               ){
+                               // x periodic
+
+                               if (nx < 0) {
+                                       px = lX - 1;
+                               }
+                               else if (nx >= lX) {
+                                       px = 0;
+                               } else {
+                                       px = nx;
+                               }
+                               // y periodic
+                               if (ny < 0) {
+                                       py = lY - 1;
+                               }
+                               else if (ny >= lY) {
+                                       py = 0;
+                               } else {
+                                       py = ny;
+                               }
+
+                               // z periodic
+                               if (nz < 0) {
+                                       pz = lZ - 1;
+                               }
+                               else if (nz >= lZ) {
+                                       pz = 0;
+                               } else {
+                                       pz = nz;
+                               }
+
+                               if (ld->Lattice[L_INDEX_4(lDims, px, py, pz)] == LAT_CELL_OBSTACLE) {
+                                       dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
+                               }
+                               else {
+                                       neighborIndex = grid[L_INDEX_4(lDims, px, py, pz)];
+
+                                       AssertMsg(neighborIndex != ~0, "Neighbor has no Index. (%d %d %d) direction %s (%d)\n", px, py, pz, D3Q19_NAMES[d], d);
+
+                                       dstIndex = P_INDEX_3(nCells, neighborIndex, d);
+                               }
+                       }
+                       else if (nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) {
+                               dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
+                       }
+                       else if (ld->Lattice[L_INDEX_4(lDims, nx, ny, nz)] == LAT_CELL_OBSTACLE) {
+                               dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
+                       }
+                       else {
+                               neighborIndex = grid[L_INDEX_4(lDims, nx, ny, nz)];
+
+                               Assert(neighborIndex != ~0);
+
+                               dstIndex = P_INDEX_3(nCells, neighborIndex, d);
+                       }
+
+                       Assert(dstIndex >= 0);
+                       Assert(dstIndex < nCells * N_D3Q19);
+
+                       adjList[index * N_D3Q19_IDX + d] = dstIndex;
+               }
+       }
+
+       int nThreads = 1;
+
+#ifdef _OPENMP
+       nThreads = omp_get_max_threads();
+#endif
+
+       SetupConsecNodes(ld, KDLR(kd), nThreads);
+
+       double loopBalanceEven = 2.0 * 19 * sizeof(PdfT);
+       double loopBalanceOdd  = 2.0 * 19 * sizeof(PdfT) + (double)kdlr->nConsecNodes / nFluid * (18 * 4.0 + 4.0);
+       double loopBalance     = (loopBalanceEven + loopBalanceOdd) / 2.0;
+
+       printf("# loop balance:          %.2f B/FLUP  even: %.2f B/FLUP  odd %.2f B/FLUP\n",
+                       loopBalance, loopBalanceEven, loopBalanceOdd);
+
+       // Fill remaining KernelData structures
+       kd->GetNode = GetNode;
+       kd->SetNode = SetNode;
+
+       kd->BoundaryConditionsGetPdf = FNAME(BCGetPdf);
+       kd->BoundaryConditionsSetPdf = FNAME(BCSetPdf);
+
+       kd->Kernel = FNAME(D3Q19ListAaPvKernel);
+
+       kd->DstPdfs = NULL;
+       kd->PdfsActive = kd->Pdfs[0];
+
+       return;
+}
+
+// Releases all memory which was allocated by FNAME(D3Q19ListAaPvInit).
+//
+// Parameters:
+//   ld         - lattice description; not accessed by this function.
+//   kernelData - address of the kernel data pointer set up by the init
+//                function. The arrays referenced by the kernel data are freed
+//                first, the kernel data object itself is freed last.
+void FNAME(D3Q19ListAaPvDeinit)(LatticeDesc * ld, KernelData ** kernelData)
+{
+       Assert(kernelData != NULL);
+       Assert(*kernelData != NULL);
+
+       // All three views refer to the same object allocated by the init function.
+       KernelData * kd = *kernelData;
+       KernelDataList * kdl = KDL(kd);
+       KernelDataListRia * kdlr = KDLR(kd);
+
+       // Arrays created by SetupConsecNodes. All of them are guarded the same
+       // way, so none of them is handed to MemFree as a NULL pointer.
+       if (kdlr->ConsecNodes != NULL) {
+               MemFree((void **)&(kdlr->ConsecNodes));
+       }
+
+       if (kdlr->ConsecThreadIndices != NULL) {
+               MemFree((void **)&(kdlr->ConsecThreadIndices));
+       }
+
+       if (kdlr->FluidNodeThreadIndices != NULL) {
+               MemFree((void **)&(kdlr->FluidNodeThreadIndices));
+       }
+
+       // Arrays allocated directly by the init function.
+       MemFree((void **)&(kdl->AdjList));
+       MemFree((void **)&(kdl->Coords));
+       MemFree((void **)&(kdl->Grid));
+
+       MemFree((void **)&(kd->Pdfs[0]));
+
+       // Finally release the kernel data object itself through the caller's pointer.
+       MemFree((void **)kernelData);
+       return;
+}
+
diff --git a/src/BenchKernelD3Q19ListAaPvCommon.h b/src/BenchKernelD3Q19ListAaPvCommon.h
new file mode 100644 (file)
index 0000000..2ce7fd3
--- /dev/null
@@ -0,0 +1,61 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#ifndef __BENCH_KERNEL_D3Q19_LIST_AA_PV_COMMON_H__
+#define __BENCH_KERNEL_D3Q19_LIST_AA_PV_COMMON_H__
+
+#if !defined(DATA_LAYOUT_SOA)
+       #error List AA Pv works only with DATA_LAYOUT_SOA
+#endif
+
+#include "BenchKernelD3Q19ListAaCommon.h"
+
+typedef struct KernelDataListRia_ {
+       KernelDataList kdl; // First member; the init function accesses the same object via casts to the other kernel data types.
+
+       // Run lengths: each entry holds the number of adjacent fluid nodes which share the same access pattern.
+       uint32_t * ConsecNodes;
+       uint32_t nConsecNodes;  // Number of entries in ConsecNodes array.
+
+       // For each thread: index into ConsecNodes where the entries of this thread start.
+       uint32_t * ConsecThreadIndices;
+       // Number of entries in ConsecThreadIndices (set to the number of threads by SetupConsecNodes).
+       uint32_t nConsecThreadIndices;
+
+       // For each thread: index into the vector of fluid nodes at which the
+       // thread starts processing.
+       int * FluidNodeThreadIndices;
+       // Number of entries in FluidNodeThreadIndices (set to the number of threads by SetupConsecNodes).
+       int nFluidNodeThreadIndices;
+
+} KernelDataListRia;
+
+// Macro for casting KernelData * to KernelDataListRia *.
+#define KDLR(_x_)      ((KernelDataListRia *)(_x_))
+
+
+#endif // __BENCH_KERNEL_D3Q19_LIST_AA_PV_COMMON_H__
+
diff --git a/src/BenchKernelD3Q19ListAaRia.c b/src/BenchKernelD3Q19ListAaRia.c
new file mode 100644 (file)
index 0000000..87addcc
--- /dev/null
@@ -0,0 +1,479 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#include "BenchKernelD3Q19ListAaRiaCommon.h"
+
+#include "Memory.h"
+#include "Vtk.h"
+#include "LikwidIf.h"
+
+#include <inttypes.h>
+#include <math.h>
+
+#ifdef _OPENMP
+       #include <omp.h>
+#endif
+
+void FNAME(D3Q19ListAaRiaKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd)
+{
+       Assert(ld != NULL);
+       Assert(kernelData != NULL);
+       Assert(cd != NULL);
+
+       Assert(cd->Omega > 0.0);
+       Assert(cd->Omega < 2.0);
+
+       KernelData * kd = (KernelData *)kernelData;
+       KernelDataList * kdl = KDL(kernelData);
+       KernelDataListRia * kdlr = KDLR(kernelData);
+       PdfT omega = cd->Omega;
+       PdfT omegaEven = omega;
+       // 1/ 4: best stability;
+       // 1/12: removes third-order advection error (best advection);
+       // 1/ 6: removes fourth-order diffusion error (best diffusion);
+       // 3/16: exact location of bounce back for poiseuille flow
+       PdfT magicParam = 1.0 / 12.0;
+       PdfT omegaOdd = 1.0 / (0.5 + magicParam / (1.0 / omega - 0.5));
+
+       PdfT evenPart = 0.0;
+       PdfT oddPart = 0.0;
+       PdfT dir_indep_trm = 0.0;
+
+       const PdfT w_0 = 1.0 /  3.0;
+       const PdfT w_1 = 1.0 / 18.0;
+       const PdfT w_2 = 1.0 / 36.0;
+
+       const PdfT w_1_x3 = w_1 * 3.0;  const PdfT w_1_nine_half = w_1 * 9.0 / 2.0;     PdfT w_1_indep = 0.0;
+       const PdfT w_2_x3 = w_2 * 3.0;  const PdfT w_2_nine_half = w_2 * 9.0 / 2.0;     PdfT w_2_indep = 0.0;
+
+       PdfT ui;
+
+       PdfT ux, uy, uz;
+       PdfT dens;
+
+       // Declare pdf_N, pdf_E, pdf_S, pdf_W, ...
+       #define X(name, idx, idxinv, x, y, z)   PdfT JOIN(pdf_,name);
+       D3Q19_LIST
+       #undef X
+
+       // Declare pointers to pdfs ppdf_N, ppdf_E, ppdf_S, ppdf_W, ...
+       #define X(name, idx, idxinv, x, y, z)   PdfT * JOIN(ppdf_,name) = NULL;
+       D3Q19_LIST
+       #undef X
+
+       uint32_t nConsecNodes = kdlr->nConsecNodes;
+       uint32_t * consecNodes = kdlr->ConsecNodes;
+       uint32_t consecIndex = 0;
+       uint32_t consecValue = 0;
+
+#ifndef DEBUG
+       UNUSED(nConsecNodes);
+#endif
+
+       PdfT * src = kd->Pdfs[0];
+
+       int maxIterations = cd->MaxIterations;
+       int nFluid = kdl->nFluid;
+       int nCells = kdl->nCells;
+
+       uint32_t adjListIndex;
+       uint32_t * adjList = kdl->AdjList;
+
+
+
+       #ifdef VTK_OUTPUT
+               if (cd->VtkOutput) {
+                       kd->PdfsActive = src;
+                       VtkWrite(ld, kd, cd, -1);
+               }
+       #endif
+
+       #ifdef STATISTICS
+               kd->PdfsActive = src;
+               KernelStatistics(kd, ld, cd, 0);
+       #endif
+
+       // TODO: outer openmp parallel
+       for(int iter = 0; iter < maxIterations; iter += 2) {
+
+               X_LIKWID_START("list-aa-ria-even");
+
+               // --------------------------------------------------------------------
+               // even time step
+               // --------------------------------------------------------------------
+               #ifdef _OPENMP
+                       #pragma omp parallel for default(none) \
+                                       shared(stderr, nFluid, nCells, kd, kdl, adjList, omegaOdd, omegaEven, src) \
+                                       private(ux, uy, uz, dens, adjListIndex, evenPart, oddPart, dir_indep_trm, w_1_indep, w_2_indep, ui,\
+                                               pdf_C, \
+                                               pdf_N, pdf_E, pdf_S, pdf_W, \
+                                               pdf_NE, pdf_SE, pdf_SW, pdf_NW, \
+                                               pdf_T, pdf_TN, pdf_TE, pdf_TS, pdf_TW, \
+                                               pdf_B, pdf_BN, pdf_BE, pdf_BS, pdf_BW)
+               #endif
+               for (int index = 0; index < nFluid; ++index) {
+
+                       #define I(index, dir)   P_INDEX_3((nCells), (index), (dir))
+
+                       #define X(name, idx, idxinv, _x, _y, _z)        JOIN(pdf_,name) = src[I(index, idx)];
+                       D3Q19_LIST
+                       #undef X
+
+// #define LID_DRIVEN_CAVITY
+
+#ifdef LID_DRIVEN_CAVITY
+                       int nX = kd->Dims[0];
+                       int nY = kd->Dims[1];
+                       int nZ = kd->Dims[2];
+
+                       int x = kdl->Coords[C_INDEX_X(index)];
+                       int y = kdl->Coords[C_INDEX_Y(index)];
+                       int z = kdl->Coords[C_INDEX_Z(index)];
+
+                       if (z == nZ - 4 && x > 3 && x < (nX - 4) && y > 3 && y < (nY - 4)) {
+                               ux = 0.1 * 0.577;
+                               uy = 0.0;
+                               uz = 0.0;
+                       } else {
+#endif
+                               ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE -
+                                        pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW;
+                               uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN -
+                                        pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS;
+                               uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS -
+                                        pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS;
+#ifdef LID_DRIVEN_CAVITY
+                       }
+#endif
+
+                       dens = pdf_C +
+                                  pdf_N  + pdf_E  + pdf_S  + pdf_W  +
+                                  pdf_NE + pdf_SE + pdf_SW + pdf_NW +
+                                  pdf_T  + pdf_TN + pdf_TE + pdf_TS + pdf_TW +
+                                  pdf_B  + pdf_BN + pdf_BE + pdf_BS + pdf_BW;
+
+                       dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz)*3.0/2.0;
+
+                       // direction: w_0
+                       src[I(index, D3Q19_C)             ]  = pdf_C - omegaEven*(pdf_C - w_0*dir_indep_trm);
+
+                       // direction: w_1
+                       w_1_indep = w_1*dir_indep_trm;
+
+                       ui = uy;
+                       evenPart = omegaEven*( 0.5*(pdf_N + pdf_S) - ui*ui*w_1_nine_half - w_1_indep );
+                       oddPart = omegaOdd*(0.5*(pdf_N - pdf_S) - ui*w_1_x3 );
+                       src[I(index, D3Q19_S)]  = pdf_N - evenPart - oddPart;
+                       src[I(index, D3Q19_N)]  = pdf_S - evenPart + oddPart;
+
+                       ui = ux;
+                       evenPart = omegaEven*( 0.5*(pdf_E + pdf_W) - ui*ui*w_1_nine_half - w_1_indep );
+                       oddPart = omegaOdd*(0.5*(pdf_E - pdf_W) - ui*w_1_x3 );
+                       src[I(index, D3Q19_W)]  = pdf_E - evenPart - oddPart;
+                       src[I(index, D3Q19_E)]  = pdf_W - evenPart + oddPart;
+
+                       ui = uz;
+                       evenPart = omegaEven*( 0.5*(pdf_T + pdf_B) - ui*ui*w_1_nine_half - w_1_indep );
+                       oddPart = omegaOdd*(0.5*(pdf_T - pdf_B) - ui*w_1_x3 );
+                       src[I(index, D3Q19_B)]  = pdf_T - evenPart - oddPart;
+                       src[I(index, D3Q19_T)]  = pdf_B - evenPart + oddPart;
+
+                       // direction: w_2
+                       w_2_indep = w_2*dir_indep_trm;
+
+                       ui = -ux + uy;
+                       evenPart = omegaEven*( 0.5*(pdf_NW + pdf_SE) - ui*ui*w_2_nine_half - w_2_indep );
+                       oddPart = omegaOdd*(0.5*(pdf_NW - pdf_SE) - ui*w_2_x3 );
+                       src[I(index, D3Q19_SE)] = pdf_NW - evenPart - oddPart;
+                       src[I(index, D3Q19_NW)] = pdf_SE - evenPart + oddPart;
+
+                       ui = ux + uy;
+                       evenPart = omegaEven*( 0.5*(pdf_NE + pdf_SW) - ui*ui*w_2_nine_half - w_2_indep );
+                       oddPart = omegaOdd*(0.5*(pdf_NE - pdf_SW) - ui*w_2_x3 );
+                       src[I(index, D3Q19_SW)] = pdf_NE - evenPart - oddPart;
+                       src[I(index, D3Q19_NE)] = pdf_SW - evenPart + oddPart;
+
+                       ui = -ux + uz;
+                       evenPart = omegaEven*( 0.5*(pdf_TW + pdf_BE) - ui*ui*w_2_nine_half - w_2_indep );
+                       oddPart = omegaOdd*(0.5*(pdf_TW - pdf_BE) - ui*w_2_x3 );
+                       src[I(index, D3Q19_BE)] = pdf_TW - evenPart - oddPart;
+                       src[I(index, D3Q19_TW)] = pdf_BE - evenPart + oddPart;
+
+                       ui = ux + uz;
+                       evenPart = omegaEven*( 0.5*(pdf_TE + pdf_BW) - ui*ui*w_2_nine_half - w_2_indep );
+                       oddPart = omegaOdd*(0.5*(pdf_TE - pdf_BW) - ui*w_2_x3 );
+                       src[I(index, D3Q19_BW)] = pdf_TE - evenPart - oddPart;
+                       src[I(index, D3Q19_TE)] = pdf_BW - evenPart + oddPart;
+
+                       ui = -uy + uz;
+                       evenPart = omegaEven*( 0.5*(pdf_TS + pdf_BN) - ui*ui*w_2_nine_half - w_2_indep );
+                       oddPart = omegaOdd*(0.5*(pdf_TS - pdf_BN) - ui*w_2_x3 );
+                       src[I(index, D3Q19_BN)] = pdf_TS - evenPart - oddPart;
+                       src[I(index, D3Q19_TS)] = pdf_BN - evenPart + oddPart;
+
+                       ui = uy + uz;
+                       evenPart = omegaEven*( 0.5*(pdf_TN + pdf_BS) - ui*ui*w_2_nine_half - w_2_indep );
+                       oddPart = omegaOdd*(0.5*(pdf_TN - pdf_BS) - ui*w_2_x3 );
+                       src[I(index, D3Q19_BS)] = pdf_TN - evenPart - oddPart;
+                       src[I(index, D3Q19_TN)] = pdf_BS - evenPart + oddPart;
+
+               } // (parallel) loop over fluid nodes
+
+               X_LIKWID_STOP("list-aa-ria-even");
+
+               // save current iteration
+               kdl->Iteration = iter;
+               #ifdef VERIFICATION
+                       kd->PdfsActive = src;
+                       KernelAddBodyForce(kd, ld, cd);
+               #endif
+
+               // --------------------------------------------------------------------
+               // odd time step
+               // --------------------------------------------------------------------
+
+               X_LIKWID_START("list-aa-ria-odd");
+
+               #ifdef _OPENMP
+                       #pragma omp parallel default(none) \
+                                       shared(stderr, nFluid, nCells, kd, kdl, kdlr, adjList, omegaOdd, omegaEven, src, consecNodes, nConsecNodes) \
+                                       private(ux, uy, uz, dens, adjListIndex, evenPart, oddPart, dir_indep_trm, w_1_indep, w_2_indep, ui,\
+                                               pdf_C, \
+                                               pdf_N, pdf_E, pdf_S, pdf_W, \
+                                               pdf_NE, pdf_SE, pdf_SW, pdf_NW, \
+                                               pdf_T, pdf_TN, pdf_TE, pdf_TS, pdf_TW, \
+                                               pdf_B, pdf_BN, pdf_BE, pdf_BS, pdf_BW, \
+                                               ppdf_C, \
+                                               ppdf_N, ppdf_E, ppdf_S, ppdf_W, \
+                                               ppdf_NE, ppdf_SE, ppdf_SW, ppdf_NW, \
+                                               ppdf_T, ppdf_TN, ppdf_TE, ppdf_TS, ppdf_TW, \
+                                               ppdf_B, ppdf_BN, ppdf_BE, ppdf_BS, ppdf_BW, \
+                                               consecValue, consecIndex)
+               #endif
+               {
+                       int threadId = 0;
+
+                       #ifdef _OPENMP
+                               threadId = omp_get_thread_num();
+                       #endif
+
+                       consecIndex = kdlr->ConsecThreadIndices[threadId];
+                       consecValue = 0;
+
+                       int * threadIndices = kdlr->FluidNodeThreadIndices;
+
+                       int nFluidThread = threadIndices[threadId + 1] - threadIndices[threadId];
+
+                       int indexStart = threadIndices[threadId];
+                       int indexStop  = threadIndices[threadId] + nFluidThread;
+
+                       for (int index = indexStart; index < indexStop; ++index) {
+
+                               #define I(index, dir)   P_INDEX_3((nCells), (index), (dir))
+
+#if 1
+                               if (consecValue > 0) {
+                                       --consecValue;
+                                       // Increment all pdf pointers.
+                                       #define X(name, idx, idxinv, _x, _y, _z)        ++JOIN(ppdf_,name);
+                                       D3Q19_LIST
+                                       #undef X
+                               }
+                               else {
+                                       Assert(consecIndex < nConsecNodes);
+
+                                       consecValue = consecNodes[consecIndex] - 1;
+                                       // Load new pointers to PDFs of local cell:
+
+                                       adjListIndex = index * N_D3Q19_IDX;
+
+                                       #define X(name, idx, idxinv, _x, _y, _z)        JOIN(ppdf_,name) = &(src[adjList[adjListIndex + idxinv]]);
+                                       D3Q19_LIST_WO_C
+                                       #undef X
+
+                                       ppdf_C = &(src[P_INDEX_3(nCells, index, D3Q19_C)]);
+                                       ++consecIndex;
+                               }
+
+                               #define X(name, idx, idxinv, _x, _y, _z)        JOIN(pdf_,name) = *JOIN(ppdf_,name);
+                               D3Q19_LIST
+                               #undef X
+#else
+                               adjListIndex = index * N_D3Q19_IDX;
+
+                               // Load PDFs of local cell: pdf_N = src[adjList[adjListIndex + D3Q19_S]]; ...
+                               pdf_C = src[P_INDEX_3(nCells, index, D3Q19_C)];
+
+                               #define X(name, idx, idxinv, _x, _y, _z)        JOIN(ppdf_,name) = &(src[adjList[adjListIndex + idxinv]]);
+                               D3Q19_LIST_WO_C
+                               #undef X
+
+                               #define X(name, idx, idxinv, _x, _y, _z)        JOIN(pdf_,name) = src[adjList[adjListIndex + idxinv]];
+                               D3Q19_LIST_WO_C
+                               #undef X
+#endif
+
+#ifdef LID_DRIVEN_CAVITY
+                               int nX = kd->Dims[0];
+                               int nY = kd->Dims[1];
+                               int nZ = kd->Dims[2];
+
+                               int x = kdl->Coords[C_INDEX_X(index)];
+                               int y = kdl->Coords[C_INDEX_Y(index)];
+                               int z = kdl->Coords[C_INDEX_Z(index)];
+
+                               if (z == nZ - 4 && x > 3 && x < (nX - 4) && y > 3 && y < (nY - 4)) {
+                                       ux = 0.1 * 0.577;
+                                       uy = 0.0;
+                                       uz = 0.0;
+                               } else {
+#endif
+                                       ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE -
+                                                pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW;
+                                       uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN -
+                                                pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS;
+                                       uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS -
+                                                pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS;
+#ifdef LID_DRIVEN_CAVITY
+                               }
+#endif
+
+                               dens = pdf_C +
+                                          pdf_N  + pdf_E  + pdf_S  + pdf_W  +
+                                          pdf_NE + pdf_SE + pdf_SW + pdf_NW +
+                                          pdf_T  + pdf_TN + pdf_TE + pdf_TS + pdf_TW +
+                                          pdf_B  + pdf_BN + pdf_BE + pdf_BS + pdf_BW;
+
+                               dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz)*3.0/2.0;
+
+                               adjListIndex = index * N_D3Q19_IDX;
+
+                               // direction: w_0
+                               src[I(index, D3Q19_C)             ]  = pdf_C - omegaEven*(pdf_C - w_0*dir_indep_trm);
+
+                               // direction: w_1
+                               w_1_indep = w_1*dir_indep_trm;
+
+                               ui = uy;
+                               evenPart = omegaEven*( 0.5*(pdf_N + pdf_S) - ui*ui*w_1_nine_half - w_1_indep );
+                               oddPart = omegaOdd*(0.5*(pdf_N - pdf_S) - ui*w_1_x3 );
+                               *ppdf_S  = pdf_N - evenPart - oddPart;
+                               *ppdf_N  = pdf_S - evenPart + oddPart;
+
+                               ui = ux;
+                               evenPart = omegaEven*( 0.5*(pdf_E + pdf_W) - ui*ui*w_1_nine_half - w_1_indep );
+                               oddPart = omegaOdd*(0.5*(pdf_E - pdf_W) - ui*w_1_x3 );
+                               *ppdf_W  = pdf_E - evenPart - oddPart;
+                               *ppdf_E  = pdf_W - evenPart + oddPart;
+
+                               ui = uz;
+                               evenPart = omegaEven*( 0.5*(pdf_T + pdf_B) - ui*ui*w_1_nine_half - w_1_indep );
+                               oddPart = omegaOdd*(0.5*(pdf_T - pdf_B) - ui*w_1_x3 );
+                               *ppdf_B  = pdf_T - evenPart - oddPart;
+                               *ppdf_T  = pdf_B - evenPart + oddPart;
+
+                               // direction: w_2
+                               w_2_indep = w_2*dir_indep_trm;
+
+                               ui = -ux + uy;
+                               evenPart = omegaEven*( 0.5*(pdf_NW + pdf_SE) - ui*ui*w_2_nine_half - w_2_indep );
+                               oddPart = omegaOdd*(0.5*(pdf_NW - pdf_SE) - ui*w_2_x3 );
+                               *ppdf_SE = pdf_NW - evenPart - oddPart;
+                               *ppdf_NW = pdf_SE - evenPart + oddPart;
+
+                               ui = ux + uy;
+                               evenPart = omegaEven*( 0.5*(pdf_NE + pdf_SW) - ui*ui*w_2_nine_half - w_2_indep );
+                               oddPart = omegaOdd*(0.5*(pdf_NE - pdf_SW) - ui*w_2_x3 );
+                               *ppdf_SW = pdf_NE - evenPart - oddPart;
+                               *ppdf_NE = pdf_SW - evenPart + oddPart;
+
+                               ui = -ux + uz;
+                               evenPart = omegaEven*( 0.5*(pdf_TW + pdf_BE) - ui*ui*w_2_nine_half - w_2_indep );
+                               oddPart = omegaOdd*(0.5*(pdf_TW - pdf_BE) - ui*w_2_x3 );
+                               *ppdf_BE = pdf_TW - evenPart - oddPart;
+                               *ppdf_TW = pdf_BE - evenPart + oddPart;
+
+                               ui = ux + uz;
+                               evenPart = omegaEven*( 0.5*(pdf_TE + pdf_BW) - ui*ui*w_2_nine_half - w_2_indep );
+                               oddPart = omegaOdd*(0.5*(pdf_TE - pdf_BW) - ui*w_2_x3 );
+                               *ppdf_BW = pdf_TE - evenPart - oddPart;
+                               *ppdf_TE = pdf_BW - evenPart + oddPart;
+
+                               ui = -uy + uz;
+                               evenPart = omegaEven*( 0.5*(pdf_TS + pdf_BN) - ui*ui*w_2_nine_half - w_2_indep );
+                               oddPart = omegaOdd*(0.5*(pdf_TS - pdf_BN) - ui*w_2_x3 );
+                               *ppdf_BN = pdf_TS - evenPart - oddPart;
+                               *ppdf_TS = pdf_BN - evenPart + oddPart;
+
+                               ui = uy + uz;
+                               evenPart = omegaEven*( 0.5*(pdf_TN + pdf_BS) - ui*ui*w_2_nine_half - w_2_indep );
+                               oddPart = omegaOdd*(0.5*(pdf_TN - pdf_BS) - ui*w_2_x3 );
+                               *ppdf_BS = pdf_TN - evenPart - oddPart;
+                               *ppdf_TN = pdf_BS - evenPart + oddPart;
+
+                               #undef I
+                       } // loop over fluid nodes
+               } // end pragma omp parallel
+
+               X_LIKWID_STOP("list-aa-ria-odd");
+
+               // save current iteration
+               kdl->Iteration = iter + 1;
+
+               #ifdef VERIFICATION
+                       kd->PdfsActive = src;
+                       KernelAddBodyForce(kd, ld, cd);
+               #endif
+
+               #ifdef VTK_OUTPUT
+                       if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) {
+                               kd->PdfsActive = src;
+                               VtkWrite(ld, kd, cd, iter);
+                       }
+               #endif
+
+               #ifdef STATISTICS
+                       kd->PdfsActive = src;
+                       KernelStatistics(kd, ld, cd, iter);
+               #endif
+
+
+       } // for (int iter = 0; ...
+
+#ifdef VTK_OUTPUT
+       if (cd->VtkOutput) {
+               kd->PdfsActive = src;
+               VtkWrite(ld, kd, cd, maxIterations);
+       }
+#endif
+
+#ifdef STATISTICS
+       kd->PdfsActive = src;
+       KernelStatistics(kd, ld, cd, maxIterations);
+#endif
+
+       return;
+}
diff --git a/src/BenchKernelD3Q19ListAaRia.h b/src/BenchKernelD3Q19ListAaRia.h
new file mode 100644 (file)
index 0000000..f7fa6dd
--- /dev/null
@@ -0,0 +1,38 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#ifndef __BENCH_KERNEL_D3Q19_LIST_AA_RIA__
+#define __BENCH_KERNEL_D3Q19_LIST_AA_RIA__
+
+#include "Kernel.h"
+
+// Initializes the list based AA kernel with run length coding (RIA) for the
+// lattice described by ld. The kernel's data structure is handed back to the
+// caller through *kernelData; params carries the kernel specific command line
+// arguments.
+// NOTE(review): the definition is presumably generated via
+// FNAME(D3Q19ListAaRiaInit) in BenchKernelD3Q19ListAaRiaCommon.c -- confirm
+// that FNAME appends the _PushSoA suffix.
+void D3Q19ListAaRiaInit_PushSoA(LatticeDesc * ld, KernelData ** kernelData, Parameters * params);
+
+// Counterpart of D3Q19ListAaRiaInit_PushSoA.
+// NOTE(review): presumably releases everything the init function allocated;
+// the implementation is not visible from this header -- confirm.
+void D3Q19ListAaRiaDeinit_PushSoA(LatticeDesc * ld, KernelData ** kernelData);
+
+
+
+#endif // __BENCH_KERNEL_D3Q19_LIST_AA_RIA__
diff --git a/src/BenchKernelD3Q19ListAaRiaCommon.c b/src/BenchKernelD3Q19ListAaRiaCommon.c
new file mode 100644 (file)
index 0000000..7faf37b
--- /dev/null
@@ -0,0 +1,804 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#include "BenchKernelD3Q19ListAaRiaCommon.h"
+
+#include "Memory.h"
+#include "Vtk.h"
+
+#include <errno.h>
+#include <limits.h>
+#include <math.h>
+
+#ifdef _OPENMP
+       #include <omp.h>
+#endif
+
+// Forward declaration.
+void FNAME(D3Q19ListAaRiaKernel)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd);
+
+
+
+
+// -----------------------------------------------------------------------
+// Functions which are used as callback by the kernel to read or write
+// PDFs and nodes.
+
+// Callback: fetches the PDF of direction dir belonging to the node at
+// (x, y, z) from the active lattice (kd->PdfsActive) and stores it in *pdf.
+//
+// Where the value is located depends on the parity of kdl->Iteration:
+//   - odd:  the PDF is addressed directly through P_INDEX_5.
+//   - even: PDFs are stored inverse, i.e. the non-center PDFs of the node are
+//           located in remote nodes and are addressed through the adjacency
+//           list entry of the inverse direction; the center PDF is addressed
+//           directly through P_INDEX_3.
+static void FNAME(BCGetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT * pdf)
+{
+       Assert(kd != NULL);
+       Assert(kd->PdfsActive != NULL);
+       Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+       Assert(pdf != NULL);
+
+       Assert(x >= 0);
+       Assert(y >= 0);
+       Assert(z >= 0);
+       Assert(x < kd->Dims[0]);
+       Assert(y < kd->Dims[1]);
+       Assert(z < kd->Dims[2]);
+       Assert(dir >= 0);
+       Assert(dir < N_D3Q19);
+
+       KernelDataList * list = (KernelDataList *)kd;
+
+       if (list->Iteration % 2 != 0) {
+               // Odd iteration: the PDF resides in the slot of the node itself.
+               *pdf = kd->PdfsActive[P_INDEX_5(list, x, y, z, dir)];
+               return;
+       }
+
+       // Even iteration: Pdfs are stored inverse, local PDFs are located in
+       // remote nodes.
+       uint32_t node = list->Grid[L_INDEX_4(kd->Dims, x, y, z)];
+
+       if (dir == D3Q19_C) {
+               // The center PDF never leaves its node.
+               *pdf = kd->PdfsActive[P_INDEX_3(list->nCells, node, dir)];
+       }
+       else {
+               uint32_t adjBase = node * N_D3Q19_IDX;
+
+               *pdf = kd->PdfsActive[list->AdjList[adjBase + D3Q19_INV[dir]]];
+       }
+
+       return;
+}
+
+static void FNAME(BCSetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT pdf)
+{
+       Assert(kd != NULL);
+       Assert(kd->PdfsActive != NULL);
+       Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+       Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
+       Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
+       Assert(dir >= 0); Assert(dir < N_D3Q19);
+
+#if 0
+       if (isnan(pdf)) {
+               printf("ERROR: setting nan %d %d %d %d %s\n", x, y, z, dir, D3Q19_NAMES[dir]);
+               DEBUG_BREAK_POINT();
+               exit(1);
+       }
+#endif
+
+       KernelDataList * kdl = (KernelDataList *)kd;
+
+       if (kdl->Iteration % 2 == 0) {
+               // Pdfs are stored inverse, local PDFs are located in remote nodes
+
+               uint32_t nodeIndex = KDL(kd)->Grid[L_INDEX_4(kd->Dims, x, y, z)];
+
+               if (dir != D3Q19_C) {
+                       uint32_t adjListIndex = nodeIndex * N_D3Q19_IDX;
+
+                       kd->PdfsActive[KDL(kd)->AdjList[adjListIndex + D3Q19_INV[dir]]] = pdf;
+               }
+               else {
+                       kd->PdfsActive[P_INDEX_3(KDL(kd)->nCells, nodeIndex, dir)] = pdf;
+               }
+
+       }
+       else {
+               kd->PdfsActive[P_INDEX_5(KDL(kd), x, y, z, dir)] = pdf;
+       }
+
+       return;
+}
+
+
+// Reads the PDFs of the node at (x, y, z) from the active lattice
+// (kd->PdfsActive) into pdfs, where pdfs is indexed by direction.
+//
+// Which lattice entries are read depends on the parity of kdl->Iteration:
+//   - even: the center PDF is read via P_INDEX_3 from the node itself; every
+//           other direction idx is read through the adjacency list entry of
+//           its inverse direction idxinv.
+//   - odd:  every direction is read via P_INDEX_5(kdl, x, y, z, idx).
+//
+// NOTE(review): D3Q19_LIST and D3Q19_LIST_WO_C are defined elsewhere;
+// presumably they enumerate all 19 directions, respectively all directions
+// without the center one, so pdfs must provide room for 19 values -- confirm.
+static void GetNode(KernelData * kd, int x, int y, int z, PdfT * pdfs)
+{
+       Assert(kd != NULL);
+       Assert(kd->PdfsActive != NULL);
+       Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+       Assert(pdfs != NULL);
+       Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
+       Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
+
+       KernelDataList * kdl = (KernelDataList *)kd;
+
+       if(kdl->Iteration % 2 == 0){
+
+               // Translate the coordinates into the index of the fluid node and the
+               // start of its entries in the adjacency list.
+               uint32_t nodeIndex = kdl->Grid[L_INDEX_4(kdl->kd.Dims, x, y, z)];
+               uint32_t adjListIndex = nodeIndex * N_D3Q19_IDX;
+
+               // Load PDFs of local cell: pdf_N = src[adjList[adjListIndex + D3Q19_S]]; ...
+               pdfs[D3Q19_C] = kd->PdfsActive[P_INDEX_3(kdl->nCells, nodeIndex, D3Q19_C)];
+
+               // X is expanded once per entry of D3Q19_LIST_WO_C.
+               #define X(name, idx, idxinv, _x, _y, _z)        pdfs[idx] = kd->PdfsActive[kdl->AdjList[adjListIndex + idxinv]];
+               D3Q19_LIST_WO_C
+               #undef X
+
+       } else {
+
+               // X is expanded once per entry of D3Q19_LIST; x, y, z inside the
+               // expansion refer to the function arguments.
+               #define I(x, y, z, dir) P_INDEX_5(KDL(kd), (x), (y), (z), (dir))
+               #define X(name, idx, idxinv, _x, _y, _z)        pdfs[idx] = kd->PdfsActive[I(x, y, z, idx)];
+               D3Q19_LIST
+               #undef X
+               #undef I
+
+       }
+
+// Disabled debugging aid: aborts if one of the PDFs read is NaN or infinite.
+#if 0
+       for (int d = 0; d < 19; ++d) {
+               if(isnan(pdfs[d]) || isinf(pdfs[d])) {
+                       printf("%d %d %d %d nan! get node\n", x, y, z, d);
+                                               for (int d2 = 0; d2 < 19; ++d2) {
+                                                       printf("%d: %e\n", d2, pdfs[d2]);
+                                               }
+                       exit(1);
+               }
+       }
+#endif
+
+       return;
+}
+
+
+// Writes the PDFs given in pdfs (indexed by direction) to the node at
+// (x, y, z) of the active lattice (kd->PdfsActive).
+//
+// Which lattice entries are written depends on the parity of kdl->Iteration:
+//   - even: the center PDF is written via P_INDEX_3 to the node itself; every
+//           other direction idx is written through the adjacency list entry
+//           of its inverse direction idxinv.
+//   - odd:  every direction is written via P_INDEX_5(kdl, x, y, z, idx).
+//
+// NOTE(review): D3Q19_LIST and D3Q19_LIST_WO_C are defined elsewhere;
+// presumably they enumerate all 19 directions, respectively all directions
+// without the center one, so pdfs must hold 19 values -- confirm.
+static void SetNode(KernelData * kd, int x, int y, int z, PdfT * pdfs)
+{
+       Assert(kd != NULL);
+       Assert(kd->PdfsActive != NULL);
+       Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+       Assert(pdfs != NULL);
+
+       Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
+       Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
+
+// Disabled debugging aid: aborts if one of the PDFs to be written is NaN.
+#if 0
+       for (int d = 0; d < 19; ++d) {
+               if(isnan(pdfs[d])) {
+                       printf("%d %d %d %d nan! get node\n", x, y, z, d);
+                                               for (int d2 = 0; d2 < 19; ++d2) {
+                                                       printf("%d: %e\n", d2, pdfs[d2]);
+                                               }
+                       exit(1);
+               }
+       }
+#endif
+
+       KernelDataList * kdl = (KernelDataList *)kd;
+
+       if(kdl->Iteration % 2 == 0){
+
+               // Translate the coordinates into the index of the fluid node and the
+               // start of its entries in the adjacency list.
+               uint32_t nodeIndex = kdl->Grid[L_INDEX_4(kdl->kd.Dims, x, y, z)];
+               uint32_t adjListIndex = nodeIndex * N_D3Q19_IDX;
+
+               // Store PDFs of local cell: src[adjList[adjListIndex + D3Q19_S]] = pdf_N; ...
+               kd->PdfsActive[P_INDEX_3(kdl->nCells, nodeIndex, D3Q19_C)] = pdfs[D3Q19_C];
+
+               // X is expanded once per entry of D3Q19_LIST_WO_C.
+               #define X(name, idx, idxinv, _x, _y, _z)        kd->PdfsActive[kdl->AdjList[adjListIndex + idxinv]] = pdfs[idx];
+               D3Q19_LIST_WO_C
+               #undef X
+
+       } else {
+
+               // X is expanded once per entry of D3Q19_LIST; x, y, z inside the
+               // expansion refer to the function arguments.
+               #define I(x, y, z, dir) P_INDEX_5(KDL(kd), (x), (y), (z), (dir))
+               #define X(name, idx, idxinv, _x, _y, _z)        kd->PdfsActive[I(x, y, z, idx)] = pdfs[idx];
+               D3Q19_LIST
+               #undef X
+               #undef I
+
+       }
+
+       return;
+}
+
+// Prints the kernel specific command line parameters to stdout.
+//
+// Declared with an explicit (void) parameter list: in C an empty pair of
+// parentheses declares a function without a prototype, which disables
+// argument checking at the call sites.
+static void ParameterUsage(void)
+{
+       printf("Kernel parameters:\n");
+       printf("  [-blk <n>] [-blk-[xyz] <n>]\n");
+
+       return;
+}
+
+// Parses the numeric value following the blocking option located at
+// params->KernelArgs[*argIndex] and advances *argIndex onto that value, so
+// the caller's loop continues behind it.
+//
+// Returns the parsed value, which is guaranteed to be > 0. The program is
+// terminated with exit code 1 if
+//   - no argument follows the option,
+//   - the argument is not a complete integer literal (strtol with base 0,
+//     i.e. decimal, octal, or hexadecimal notation),
+//   - the value is not > 0, or
+//   - the value does not fit into an int.
+static int ParseBlockingValue(Parameters * params, int * argIndex)
+{
+       int i = *argIndex;
+
+       if (i + 1 >= params->nKernelArgs) {
+               printf("ERROR: argument %s requires a parameter.\n", params->KernelArgs[i]);
+               exit(1);
+       }
+
+       const char * text = params->KernelArgs[i + 1];
+       char * end = NULL;
+
+       errno = 0;
+       long value = strtol(text, &end, 0);
+
+       if (end == text || *end != '\0') {
+               // Nothing was converted or garbage follows the number, e.g. "12abc".
+               printf("ERROR: blocking parameter '%s' is not a valid integer.\n", text);
+               exit(1);
+       }
+
+       if (value <= 0) {
+               printf("ERROR: blocking parameter must be > 0.\n");
+               exit(1);
+       }
+
+       if (errno == ERANGE || value > INT_MAX) {
+               // Without this check the conversion to int below would silently
+               // truncate the value.
+               printf("ERROR: blocking parameter '%s' is too large.\n", text);
+               exit(1);
+       }
+
+       *argIndex = i + 1;
+
+       return (int)value;
+}
+
+// Parses the kernel specific command line arguments stored in params.
+//
+// params  kernel arguments (KernelArgs) and their count (nKernelArgs).
+// blk     array of three ints receiving the blocking factors for x, y, and z.
+//         Entries for which no option was specified are set to 0.
+//
+// Recognized options (each with one or two leading dashes):
+//   -blk <n>                      blocking factor for all three dimensions
+//   -blk-x / -blk-y / -blk-z <n>  blocking factor for a single dimension
+//   -h, -help, --help             print the usage and exit
+//
+// Printing the usage, an unknown option, or an invalid value terminates the
+// program with exit code 1.
+static void ParseParameters(Parameters * params, int * blk)
+{
+       Assert(params != NULL);
+       Assert(blk != NULL);
+
+       blk[0] = 0; blk[1] = 0; blk[2] = 0;
+
+       #define ARG_IS(param)                   (!strcmp(params->KernelArgs[i], param))
+
+       for (int i = 0; i < params->nKernelArgs; ++i) {
+               if (ARG_IS("-blk") || ARG_IS("--blk")) {
+                       blk[0] = blk[1] = blk[2] = ParseBlockingValue(params, &i);
+               }
+               else if (ARG_IS("-blk-x") || ARG_IS("--blk-x")) {
+                       blk[0] = ParseBlockingValue(params, &i);
+               }
+               else if (ARG_IS("-blk-y") || ARG_IS("--blk-y")) {
+                       blk[1] = ParseBlockingValue(params, &i);
+               }
+               else if (ARG_IS("-blk-z") || ARG_IS("--blk-z")) {
+                       blk[2] = ParseBlockingValue(params, &i);
+               }
+               else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) {
+                       ParameterUsage();
+                       exit(1);
+               }
+               else {
+                       printf("ERROR: unknown kernel parameter.\n");
+                       ParameterUsage();
+                       exit(1);
+               }
+       }
+
+       #undef ARG_IS
+
+       return;
+}
+
+// Returns 1 if fluid node index does not continue the access pattern of node
+// index - 1, i.e. if for at least one non-center direction its adjacency list
+// entry is not the entry of the preceding node plus one. Returns 0 otherwise.
+// index must be >= 1.
+static int ConsecPatternBreaks(const uint32_t * adjList, int index)
+{
+       // Loop over all directions except the center one.
+       for (int d = 0; d < N_D3Q19 - 1; ++d) {
+               Assert(d != D3Q19_C);
+
+               if (adjList[index * N_D3Q19_IDX + d] != adjList[(index - 1) * N_D3Q19_IDX + d] + 1) {
+                       // Different access pattern.
+                       return 1;
+               }
+       }
+
+       return 0;
+}
+
+// Builds the run length coding of the adjacency list and the partitioning of
+// the fluid nodes among nThreads threads.
+//
+// Successive fluid nodes whose adjacency list entries all differ by exactly
+// one form a block of consecutive nodes. A block never crosses a thread
+// boundary.
+//
+// Results stored in kdlr:
+//   ConsecNodes / nConsecNodes   length of each block of consecutive nodes
+//                                and the number of blocks.
+//   ConsecThreadIndices          for every thread the index of its first
+//                                block in ConsecNodes (nThreads + 1 entries,
+//                                the last one is a sentinel).
+//   FluidNodeThreadIndices       for every thread the index of its first
+//                                fluid node; entry nThreads holds nFluid.
+//
+// The program is terminated with exit code 1 if an allocation fails.
+static void SetupConsecNodes(LatticeDesc * ld, KernelDataListRia * kdlr, int nThreads)
+{
+       Assert(ld != NULL);
+       Assert(kdlr != NULL);
+       Assert(nThreads > 0);
+
+       uint32_t * adjList = kdlr->kdl.AdjList;
+
+       uint32_t nConsecNodes = 0;
+       uint32_t consecIndex = 0;
+
+       int nFluid = kdlr->kdl.nFluid;
+
+       // NOTE(review): plain malloc is kept here on purpose, as the matching
+       // deallocation is not visible from this function -- confirm before
+       // switching to MemAlloc.
+       uint32_t * consecThreadIndices = (uint32_t *)malloc(sizeof(uint32_t) * (nThreads + 1));
+       int * fluidNodeThreadIndices = (int *)malloc(sizeof(int) * (nThreads + 1));
+
+       if (consecThreadIndices == NULL || fluidNodeThreadIndices == NULL) {
+               printf("ERROR: allocating thread index arrays failed.\n");
+               exit(1);
+       }
+
+       int nNodesPerThread = nFluid / nThreads;
+
+       // Distribute the fluid nodes evenly among the threads; the first
+       // (nFluid % nThreads) threads get one node more. At this point
+       // consecThreadIndices still holds fluid node indices, they are replaced
+       // by block indices in the second pass below.
+       for (int i = 0; i < nThreads; ++i) {
+               consecThreadIndices[i]    = i * nNodesPerThread + MinI(i, nFluid % nThreads);
+               fluidNodeThreadIndices[i] = consecThreadIndices[i];
+       }
+       // Sentinel which never matches a fluid node index.
+       consecThreadIndices[nThreads]    = (uint32_t)-1;
+       fluidNodeThreadIndices[nThreads] = nFluid;
+
+       int indexThread = 1;
+
+       // We execute following code two times.
+       // - The first time to get the count of how many entries we need for the
+       //   consecNodes array.
+       // - The second time to fill the array.
+
+       // Loop over adjacency list of all nodes.
+       // Compare if adjacent nodes share the same access pattern.
+       for (int index = 1; index < nFluid; ++index) {
+
+               int different = ConsecPatternBreaks(adjList, index);
+
+               if (consecThreadIndices[indexThread] == (uint32_t)index) {
+                       // We are at a thread boundary. Starting from this index the fluids
+                       // belong to another thread. Force a break, if nodes are consecutive.
+                       ++indexThread;
+                       different = 1;
+               }
+
+               if (different) {
+                       ++consecIndex;
+               }
+       }
+
+       if (nFluid > 0) {
+               nConsecNodes = consecIndex + 1;
+       }
+
+       uint32_t * consecNodes = NULL;
+
+       // MemAlloc signals a failure through a non-zero return value. A request
+       // of zero bytes (no fluid nodes) is not treated as an error.
+       if (MemAlloc((void **)&consecNodes, sizeof(uint32_t) * nConsecNodes) && nConsecNodes > 0) {
+               printf("ERROR: allocating consecNodes array failed: %lu bytes.\n",
+                       (unsigned long)(sizeof(uint32_t) * nConsecNodes));
+               exit(1);
+       }
+
+       consecIndex = 0;
+
+       if (nFluid > 0) {
+               consecNodes[consecIndex] = 1;
+       }
+
+       indexThread = 1;
+       consecThreadIndices[0] = 0;
+
+       // Loop over adjacency list of all nodes.
+       // Compare if adjacent nodes share the same access pattern.
+       for (int index = 1; index < nFluid; ++index) {
+
+               int different = ConsecPatternBreaks(adjList, index);
+
+               if (consecThreadIndices[indexThread] == (uint32_t)index) {
+                       // We are at a thread boundary. Starting from this index the fluids
+                       // belong to another thread. Force a break, if nodes are consecutive.
+                       // The entry is converted from a fluid node index into the index of
+                       // the block which starts here.
+                       consecThreadIndices[indexThread] = consecIndex + 1;
+                       ++indexThread;
+                       different = 1;
+               }
+
+               if (different) {
+                       // Start a new block.
+                       ++consecIndex;
+                       Assert(consecIndex < nConsecNodes);
+                       consecNodes[consecIndex] = 1;
+               }
+               else {
+                       // Extend the current block.
+                       Assert(consecIndex < nConsecNodes);
+                       consecNodes[consecIndex] += 1;
+               }
+       }
+
+
+       kdlr->ConsecNodes = consecNodes;
+       kdlr->nConsecNodes = nConsecNodes;
+
+       kdlr->ConsecThreadIndices  = consecThreadIndices;
+       kdlr->nConsecThreadIndices = nThreads;
+
+       kdlr->FluidNodeThreadIndices = fluidNodeThreadIndices;
+       kdlr->nFluidNodeThreadIndices = nThreads;
+
+       // nConsecNodes is unsigned, hence %u instead of %d.
+       printf("# total fluid nodes: %d   consecutive blocks: %u\n", nFluid, (unsigned)nConsecNodes);
+
+       return;
+}
+
+void FNAME(D3Q19ListAaRiaInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
+{
+       KernelData * kd;
+       KernelDataList * kdl;
+       KernelDataListRia * kdlr;
+       MemAlloc((void **)&kdlr, sizeof(KernelDataListRia));
+
+       kd = (KernelData *)kdlr;
+       kdl = KDL(kdlr);
+
+       *kernelData = kd;
+
+#ifdef DEBUG
+       kd->Pdfs[0] = NULL;
+       kd->Pdfs[1] = NULL;
+       kd->PdfsActive = NULL;
+       kd->DstPdfs = NULL;
+       kd->SrcPdfs = NULL;
+       kd->Dims[0] = -1;
+       kd->Dims[1] = -1;
+       kd->Dims[2] = -1;
+       kd->GlobalDims[0] = -1;
+       kd->GlobalDims[1] = -1;
+       kd->GlobalDims[2] = -1;
+       kd->Offsets[0] = -1;
+       kd->Offsets[1] = -1;
+       kd->Offsets[2] = -1;
+
+       kd->ObstIndices = NULL;
+       kd->nObstIndices = -1;
+       kd->BounceBackPdfsSrc = NULL;
+       kd->BounceBackPdfsDst = NULL;
+       kd->nBounceBackPdfs = -1;
+
+       kdl->AdjList = NULL;
+       kdl->Coords = NULL;
+       kdl->Grid = NULL;
+       kdl->nCells = -1;
+       kdl->nFluid = -1;
+
+       kdlr->ConsecNodes = NULL;
+       kdlr->nConsecNodes = 0;
+       kdlr->ConsecThreadIndices = NULL;
+       kdlr->nConsecThreadIndices = 0;
+#endif
+
+       // Adjust the dimensions according to padding, if used.
+       kd->Dims[0] = kd->GlobalDims[0] = ld->Dims[0];
+       kd->Dims[1] = kd->GlobalDims[1] = ld->Dims[1];
+       kd->Dims[2] = kd->GlobalDims[2] = ld->Dims[2];
+
+       int * lDims = ld->Dims;
+
+       int lX = lDims[0];
+       int lY = lDims[1];
+       int lZ = lDims[2];
+
+       int nTotalCells = lX * lY * lZ;
+       int nCells = ld->nFluid; // TODO: + padding
+       int nFluid = ld->nFluid;
+
+       kdl->nCells = nCells;
+       kdl->nFluid = nFluid;
+
+       PdfT * pdfs[2];
+
+       int blk[3] = { 0 };
+
+       ParseParameters(params, blk);
+
+       if (blk[0] == 0) blk[0] = lX;
+       if (blk[1] == 0) blk[1] = lY;
+       if (blk[2] == 0) blk[2] = lZ;
+
+       printf("# blocking               x: %3d y: %3d z: %3d\n", blk[0], blk[1], blk[2]);
+
+       double latMiB      = nCells * sizeof(PdfT) * N_D3Q19 / 1024.0 / 1024.0;
+       double latFluidMib = nFluid * sizeof(PdfT) * N_D3Q19 / 1024.0 / 1024.0;
+       double latPadMib   = (nCells - nFluid) * sizeof(PdfT) * N_D3Q19 / 1024.0 / 1024.0;
+
+       printf("# lattice size:          %e MiB\n", latMiB);
+       printf("# fluid lattice size:    %e MiB\n", latFluidMib);
+       printf("# lattice padding:       %e MiB\n", latPadMib);
+
+#define PAGE_4K                4096
+
+       printf("# aligning lattices to:  %d b\n", PAGE_4K);
+
+       MemAllocAligned((void **)&pdfs[0], sizeof(PdfT) * nCells * N_D3Q19, PAGE_4K);
+
+       kd->Pdfs[0] = pdfs[0];
+
+       // Initialize PDFs with some (arbitrary) data for correct NUMA placement.
+       // Here we touch only the fluid nodes as this loop is OpenMP parallel and
+       // we want the same scheduling as in the kernel.
+       #ifdef _OPENMP
+               #pragma omp parallel for
+       #endif
+       for (int i = 0; i < nFluid; ++i) { for(int d = 0; d < N_D3Q19; ++d) {
+               pdfs[0][P_INDEX_3(nCells, i, d)] = 1.0;
+       } }
+
+       // Initialize all PDFs to some standard value.
+       for (int i = 0; i < nFluid; ++i) { for(int d = 0; d < N_D3Q19; ++d) {
+               pdfs[0][P_INDEX_3(nCells, i, d)] = 0.0;
+       } }
+
+       // ----------------------------------------------------------------------
+       // create grid which will hold the index numbers of the fluid nodes
+
+       uint32_t * grid;
+
+       if (MemAlloc((void **)&grid, nTotalCells * sizeof(uint32_t))) {
+               printf("ERROR: allocating grid for numbering failed: %lu bytes.\n", nTotalCells * sizeof(uint32_t));
+               exit(1);
+       }
+       kdl->Grid = grid;
+
+       int latticeIndex;
+
+#ifdef DEBUG
+       for(int z = 0; z < lZ; ++z) {
+               for(int y = 0; y < lY; ++y) {
+                       for(int x = 0; x < lX; ++x) {
+
+                               latticeIndex = L_INDEX_4(ld->Dims, x, y, z);
+
+                               grid[latticeIndex] = ~0;
+                       }
+               }
+       }
+#endif
+
+       // ----------------------------------------------------------------------
+       // generate numbering over grid
+
+       uint32_t * coords;
+
+       if (MemAlloc((void **)&coords, nFluid * sizeof(uint32_t) * 3)) {
+               printf("ERROR: allocating coords array failed: %lu bytes.\n", nFluid * sizeof(uint32_t) * 3);
+               exit(1);
+       }
+
+       kdl->Coords = coords;
+
+       // Index for the PDF nodes can start at 0 as we distinguish solid and fluid nodes
+       // through the ld->Lattice array.
+       int counter = 0;
+
+       // Blocking is implemented via setup of the adjacency list. The kernel later will
+       // walk through the lattice blocked automatically.
+       for (int bZ = 0; bZ < lZ; bZ += blk[2]) {
+       for (int bY = 0; bY < lY; bY += blk[1]) {
+       for (int bX = 0; bX < lX; bX += blk[0]) {
+
+               int eX = MIN(bX + blk[0], lX);
+               int eY = MIN(bY + blk[1], lY);
+               int eZ = MIN(bZ + blk[2], lZ);
+
+
+               for (int z = bZ; z < eZ; ++z) {
+               for (int y = bY; y < eY; ++y) {
+               for (int x = bX; x < eX; ++x) {
+
+                       latticeIndex = L_INDEX_4(lDims, x, y, z);
+
+                       if (ld->Lattice[latticeIndex] != LAT_CELL_OBSTACLE) {
+                               grid[latticeIndex] = counter;
+
+                               coords[C_INDEX_X(counter)] = x;
+                               coords[C_INDEX_Y(counter)] = y;
+                               coords[C_INDEX_Z(counter)] = z;
+
+                               ++counter;
+                       }
+               } } }
+       } } }
+
+       Verify(counter == nFluid);
+
+       uint32_t * adjList;
+
+       double indexMib = nFluid * sizeof(uint32_t) * N_D3Q19_IDX / 1024.0 / 1024.0;
+
+       printf("# index size:            %e MiB\n", indexMib);
+
+       // AdjList only requires 18 instead of 19 entries per node, as
+       // the center PDF needs no addressing.
+       if (MemAlloc((void **)&adjList, nFluid * sizeof(uint32_t) * N_D3Q19_IDX)) {
+               printf("ERROR: allocating adjList array failed: %lu bytes.\n", nFluid * sizeof(uint32_t) * N_D3Q19_IDX);
+               exit(1);
+       }
+
+       kdl->AdjList = adjList;
+
+       int x, y, z;
+
+       uint32_t neighborIndex;
+       uint32_t dstIndex;
+
+       int nx, ny, nz, px, py, pz;
+
+       // Loop over all fluid nodes and compute the indices to the neighboring
+       // PDFs for configured data layout (AoS/SoA).
+       // TODO: Parallelized loop to ensure correct NUMA placement.
+       // #ifdef _OPENMP  --> add line continuation
+       //      #pragma omp parallel for default(none)
+       //              shared(nFluid, nCells, coords, D3Q19_INV, D3Q19_X, D3Q19_Y, D3Q19_Z,
+       //                              stderr,
+       //                              lDims, grid, ld, lX, lY, lZ, adjList)
+       //              private(x, y, z, nx, ny, nz, neighborIndex, dstIndex)
+       // #endif
+       for (int index = 0; index < nFluid; ++index) {
+               x = coords[C_INDEX_X(index)];
+               y = coords[C_INDEX_Y(index)];
+               z = coords[C_INDEX_Z(index)];
+
+               Assert(x >= 0 && x < lX);
+               Assert(y >= 0 && y < lY);
+               Assert(z >= 0 && z < lZ);
+
+               Assert(ld->Lattice[L_INDEX_4(lDims, x, y, z)] != LAT_CELL_OBSTACLE);
+
+               // Loop over all directions except the center one.
+               for(int d = 0; d < N_D3Q19 - 1; ++d) {
+                       Assert(d != D3Q19_C);
+
+#ifdef PROP_MODEL_PUSH
+                       nx = x + D3Q19_X[d];
+                       ny = y + D3Q19_Y[d];
+                       nz = z + D3Q19_Z[d];
+
+#elif PROP_MODEL_PULL
+                       nx = x - D3Q19_X[d];
+                       ny = y - D3Q19_Y[d];
+                       nz = z - D3Q19_Z[d];
+#else
+                       #error No implementation for this PROP_MODEL_NAME.
+#endif
+                       // If the neighbor is outside the lattice in a direction that has a
+                       // periodic boundary then we need to wrap around.
+                       if (    ((nx < 0 || nx >= lX) && ld->PeriodicX) ||
+                                       ((ny < 0 || ny >= lY) && ld->PeriodicY) ||
+                                       ((nz < 0 || nz >= lZ) && ld->PeriodicZ)
+                                                                                                                               ){
+                               // x periodic
+
+                               if (nx < 0) {
+                                       px = lX - 1;
+                               }
+                               else if (nx >= lX) {
+                                       px = 0;
+                               } else {
+                                       px = nx;
+                               }
+                               // y periodic
+                               if (ny < 0) {
+                                       py = lY - 1;
+                               }
+                               else if (ny >= lY) {
+                                       py = 0;
+                               } else {
+                                       py = ny;
+                               }
+
+                               // z periodic
+                               if (nz < 0) {
+                                       pz = lZ - 1;
+                               }
+                               else if (nz >= lZ) {
+                                       pz = 0;
+                               } else {
+                                       pz = nz;
+                               }
+
+                               if (ld->Lattice[L_INDEX_4(lDims, px, py, pz)] == LAT_CELL_OBSTACLE) {
+                                       dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
+                               }
+                               else {
+                                       neighborIndex = grid[L_INDEX_4(lDims, px, py, pz)];
+
+                                       AssertMsg(neighborIndex != ~0, "Neighbor has no Index. (%d %d %d) direction %s (%d)\n", px, py, pz, D3Q19_NAMES[d], d);
+
+                                       dstIndex = P_INDEX_3(nCells, neighborIndex, d);
+                               }
+                       }
+                       else if (nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) {
+                               dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
+                       }
+                       else if (ld->Lattice[L_INDEX_4(lDims, nx, ny, nz)] == LAT_CELL_OBSTACLE) {
+                               dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
+                       }
+                       else {
+                               neighborIndex = grid[L_INDEX_4(lDims, nx, ny, nz)];
+
+                               Assert(neighborIndex != ~0);
+
+                               dstIndex = P_INDEX_3(nCells, neighborIndex, d);
+                       }
+
+                       Assert(dstIndex >= 0);
+                       Assert(dstIndex < nCells * N_D3Q19);
+
+                       adjList[index * N_D3Q19_IDX + d] = dstIndex;
+               }
+       }
+
+       int nThreads = 1;
+
+#ifdef _OPENMP
+       nThreads = omp_get_max_threads();
+#endif
+
+       SetupConsecNodes(ld, KDLR(kd), nThreads);
+
+       double loopBalanceEven = 2.0 * 19 * sizeof(PdfT);
+       double loopBalanceOdd  = 2.0 * 19 * sizeof(PdfT) + (double)kdlr->nConsecNodes / nFluid * (18 * 4.0 + 4.0);
+       double loopBalance     = (loopBalanceEven + loopBalanceOdd) / 2.0;
+
+       printf("# loop balance:          %.2f B/FLUP  even: %.2f B/FLUP  odd %.2f B/FLUP\n",
+                       loopBalance, loopBalanceEven, loopBalanceOdd);
+
+       // Fill remaining KernelData structures
+       kd->GetNode = GetNode;
+       kd->SetNode = SetNode;
+
+       kd->BoundaryConditionsGetPdf = FNAME(BCGetPdf);
+       kd->BoundaryConditionsSetPdf = FNAME(BCSetPdf);
+
+       kd->Kernel = FNAME(D3Q19ListAaRiaKernel);
+
+       kd->DstPdfs = NULL;
+       kd->PdfsActive = kd->Pdfs[0];
+
+       return;
+}
+
+// Tears down a kernel instance of the list-based AA "Ria" kernel: releases the
+// Ria bookkeeping arrays, the list arrays (AdjList, Coords, Grid), the PDF
+// array Pdfs[0] and finally the kernel data structure itself.
+//
+// ld          - lattice description; not used by this routine.
+// kernelData  - address of the kernel data pointer; the pointer itself is
+//               handed to MemFree as the last step.
+void FNAME(D3Q19ListAaRiaDeinit)(LatticeDesc * ld, KernelData ** kernelData)
+{
+       // One allocation, viewed through its nested struct types:
+       // KernelDataListRia begins with a KernelDataList member.
+       KernelDataListRia * ria  = (KernelDataListRia *)(*kernelData);
+       KernelDataList    * list = (KernelDataList *)(*kernelData);
+
+       MemFree((void **)&ria->ConsecNodes);
+
+       // The per-thread index arrays are only released if they have been set.
+       if (ria->ConsecThreadIndices != NULL) {
+               MemFree((void **)&ria->ConsecThreadIndices);
+       }
+       if (ria->FluidNodeThreadIndices != NULL) {
+               MemFree((void **)&ria->FluidNodeThreadIndices);
+       }
+
+       MemFree((void **)&list->AdjList);
+       MemFree((void **)&list->Coords);
+       MemFree((void **)&list->Grid);
+
+       // NOTE(review): only Pdfs[0] is released here; presumably this kernel
+       // allocates a single lattice - confirm against the matching Init routine.
+       MemFree((void **)&(*kernelData)->Pdfs[0]);
+
+       MemFree((void **)kernelData);
+}
+
diff --git a/src/BenchKernelD3Q19ListAaRiaCommon.h b/src/BenchKernelD3Q19ListAaRiaCommon.h
new file mode 100644 (file)
index 0000000..e127e59
--- /dev/null
@@ -0,0 +1,61 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#ifndef __BENCH_KERNEL_D3Q19_LIST_AA_RIA_COMMON_H__
+#define __BENCH_KERNEL_D3Q19_LIST_AA_RIA_COMMON_H__
+
+#if !defined(DATA_LAYOUT_SOA)
+       #error List AA Ria works only with DATA_LAYOUT_SOA
+#endif
+
+#include "BenchKernelD3Q19ListAaCommon.h"
+
+// Kernel data of the list-based AA "Ria" kernel variant.
+//
+// The struct extends KernelDataList by embedding it as the FIRST member; code
+// in the corresponding .c file casts pointers between KernelData,
+// KernelDataList and KernelDataListRia, so kdl must stay at offset 0.
+typedef struct KernelDataListRia_ {
+       KernelDataList kdl;
+
+       // Array contains information of how many adjacent nodes share the same access pattern.
+       uint32_t * ConsecNodes;
+       uint32_t nConsecNodes;  // Number of entries in ConsecNodes array.
+
+       // Array contains (for each thread) an index into ConsecNodes.
+       uint32_t * ConsecThreadIndices;
+       // Number of entries in ConsecThreadIndices.
+       uint32_t nConsecThreadIndices;
+
+       // Array contains fluid node indices for each thread where to start in the
+       // vector of fluid nodes.
+       // NOTE(review): unlike the Consec* members this array and its length are
+       // declared as (signed) int.
+       int * FluidNodeThreadIndices;
+       // Number of entries in FluidNodeThreadIndices.
+       int nFluidNodeThreadIndices;
+
+} KernelDataListRia;
+
+// Macro for casting KernelData * to KernelDataListRia *.
+#define KDLR(_x_)      ((KernelDataListRia *)(_x_))
+
+
+#endif // __BENCH_KERNEL_D3Q19_LIST_AA_RIA_COMMON_H__
+
diff --git a/src/BenchKernelD3Q19ListCommon.c b/src/BenchKernelD3Q19ListCommon.c
new file mode 100644 (file)
index 0000000..4f97bae
--- /dev/null
@@ -0,0 +1,609 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#include "BenchKernelD3Q19ListCommon.h"
+
+#include "Memory.h"
+#include "Vtk.h"
+
+#include <limits.h>
+#include <math.h>
+
+
+// Forward declaration of the kernel function; it is registered as kd->Kernel
+// by the Init routine further down in this file.
+void FNAME(D3Q19ListKernel)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd);
+
+
+
+
+// -----------------------------------------------------------------------
+// Functions which are used as callback by the kernel to read or write
+// PDFs and nodes.
+
+// Boundary condition callback: fetches the PDF of direction dir that belongs
+// to the node at lattice position (x, y, z) and stores it in *pdf.
+//
+// Push model: the value is read at the index P_INDEX_5 yields for the node.
+// Pull model: the value that gets streamed into the node during the next
+// propagation is read, i.e. the entry the adjacency list refers to. The center
+// direction has no adjacency list entry and is addressed via P_INDEX_3.
+static void FNAME(BCGetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT * pdf)
+{
+       Assert(kd != NULL);
+       Assert(kd->PdfsActive != NULL);
+       Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+       Assert(pdf != NULL);
+
+       Assert(x >= 0);
+       Assert(y >= 0);
+       Assert(z >= 0);
+       Assert(x < kd->Dims[0]);
+       Assert(y < kd->Dims[1]);
+       Assert(z < kd->Dims[2]);
+       Assert(dir >= 0);
+       Assert(dir < N_D3Q19);
+
+#ifdef PROP_MODEL_PUSH
+       *pdf = kd->PdfsActive[P_INDEX_5(KDL(kd), x, y, z, dir)];
+#elif PROP_MODEL_PULL
+       // Number of the node as stored in the Grid array.
+       uint32_t node = KDL(kd)->Grid[L_INDEX_4(kd->Dims, x, y, z)];
+
+       if (dir == D3Q19_C) {
+               // Center PDF: not part of the adjacency list.
+               *pdf = kd->PdfsActive[P_INDEX_3(KDL(kd)->nCells, node, dir)];
+       }
+       else {
+               // Return the *remote* PDF which will be pulled in later.
+               *pdf = kd->PdfsActive[KDL(kd)->AdjList[node * N_D3Q19_IDX + dir]];
+       }
+#endif
+}
+
+// Boundary condition callback: stores pdf as the PDF of direction dir that
+// belongs to the node at lattice position (x, y, z).
+//
+// Push model: the value is written at the index P_INDEX_5 yields for the node.
+// Pull model: the value is written to the location that gets streamed into
+// the node during the next propagation, i.e. the entry the adjacency list
+// refers to. The center direction has no adjacency list entry and is
+// addressed via P_INDEX_3.
+static void FNAME(BCSetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT pdf)
+{
+       Assert(kd != NULL);
+       Assert(kd->PdfsActive != NULL);
+       Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+
+       Assert(x >= 0);
+       Assert(y >= 0);
+       Assert(z >= 0);
+       Assert(x < kd->Dims[0]);
+       Assert(y < kd->Dims[1]);
+       Assert(z < kd->Dims[2]);
+       Assert(dir >= 0);
+       Assert(dir < N_D3Q19);
+
+       // Disabled debugging aid: trap NaNs before they enter the lattice.
+#if 0
+       if (isnan(pdf)) {
+               printf("ERROR: setting nan %d %d %d %d %s\n", x, y, z, dir, D3Q19_NAMES[dir]);
+               DEBUG_BREAK_POINT();
+               exit(1);
+       }
+#endif
+
+#ifdef PROP_MODEL_PUSH
+       kd->PdfsActive[P_INDEX_5(KDL(kd), x, y, z, dir)] = pdf;
+#elif PROP_MODEL_PULL
+       // Number of the node as stored in the Grid array.
+       uint32_t node = KDL(kd)->Grid[L_INDEX_4(kd->Dims, x, y, z)];
+
+       if (dir == D3Q19_C) {
+               // Center PDF: not part of the adjacency list.
+               kd->PdfsActive[P_INDEX_3(KDL(kd)->nCells, node, dir)] = pdf;
+       }
+       else {
+               // Set the *remote* PDF which will be pulled in later.
+               kd->PdfsActive[KDL(kd)->AdjList[node * N_D3Q19_IDX + dir]] = pdf;
+       }
+#endif
+}
+
+
+// Copies the PDFs of the node at lattice position (x, y, z) from the active
+// PDF array into pdfs[]. One entry is written for every direction listed in
+// D3Q19_LIST, at the index D3Q19_LIST assigns to that direction.
+//
+// kd    - kernel data; PdfsActive must point to one of the two PDF arrays.
+// x,y,z - node position, must lie inside kd->Dims.
+// pdfs  - destination array (presumably N_D3Q19 entries - verify against
+//         the callers).
+static void GetNode(KernelData * kd, int x, int y, int z, PdfT * pdfs)
+{
+       Assert(kd != NULL);
+       Assert(kd->PdfsActive != NULL);
+       Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+       Assert(pdfs != NULL);
+       Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
+       Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
+
+       // TODO: pull scheme?
+       // The macro parameters of X are named _x/_y/_z on purpose, so that x, y, z
+       // inside the expansion refer to the function arguments.
+       #define I(x, y, z, dir) P_INDEX_5(KDL(kd), (x), (y), (z), (dir))
+       #define X(name, idx, idxinv, _x, _y, _z)        pdfs[idx] = kd->PdfsActive[I(x, y, z, idx)];
+       D3Q19_LIST
+       #undef X
+       #undef I
+
+       // Disabled debugging aids. The density sum is only computed in here, so
+       // the regular code path does not accumulate a value nobody reads.
+#if 0
+       PdfT sum = 0.0;
+
+       for (int d = 0; d < 19; ++d) {
+               sum += pdfs[d];
+       }
+
+       if (sum < 0.0) {
+               printf("%d %d %d negative density \n", x, y, z);
+               exit(1);
+       }
+
+       for (int d = 0; d < 19; ++d) {
+               if(isnan(pdfs[d]) || isinf(pdfs[d])) {
+                       printf("%d %d %d %d nan! get node\n", x, y, z, d);
+                       for (int d2 = 0; d2 < 19; ++d2) {
+                               printf("%d: %e\n", d2, pdfs[d2]);
+                       }
+                       exit(1);
+               }
+       }
+#endif
+       return;
+}
+
+
+// Writes the PDFs given in pdfs[] to the node at lattice position (x, y, z) of
+// the active PDF array. One entry is read for every direction listed in
+// D3Q19_LIST, from the index D3Q19_LIST assigns to that direction.
+static void SetNode(KernelData * kd, int x, int y, int z, PdfT * pdfs)
+{
+       Assert(kd != NULL);
+       Assert(kd->PdfsActive != NULL);
+       Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+       Assert(pdfs != NULL);
+
+       Assert(x >= 0);
+       Assert(y >= 0);
+       Assert(z >= 0);
+       Assert(x < kd->Dims[0]);
+       Assert(y < kd->Dims[1]);
+       Assert(z < kd->Dims[2]);
+
+       // Disabled debugging aid: dump the node and stop if a NaN is passed in.
+#if 0
+       for (int d = 0; d < 19; ++d) {
+               if(isnan(pdfs[d])) {
+                       printf("%d %d %d %d nan! get node\n", x, y, z, d);
+                       for (int d2 = 0; d2 < 19; ++d2) {
+                               printf("%d: %e\n", d2, pdfs[d2]);
+                       }
+                       exit(1);
+               }
+       }
+#endif
+
+       // TODO: pull scheme?
+       // x, y, z below are the function arguments; the unused macro parameters
+       // carry a leading underscore to avoid clashing with them.
+       #define X(name, idx, idxinv, _x, _y, _z) \
+               kd->PdfsActive[P_INDEX_5(KDL(kd), (x), (y), (z), (idx))] = pdfs[idx];
+       D3Q19_LIST
+       #undef X
+}
+
+// Prints the kernel specific command line options to stdout.
+static void ParameterUsage()
+{
+       printf("Kernel parameters:\n"
+              "  [-blk <n>] [-blk-[xyz] <n>]\n");
+}
+
+// Reads the value of the blocking option located at params->KernelArgs[*argIdx].
+//
+// The value is expected in the following argument. On success *argIdx is
+// advanced to that argument and the value is returned. A missing argument or
+// a value outside [1, INT_MAX] terminates the program with an error message.
+static int ParseBlockingValue(Parameters * params, int * argIdx)
+{
+       int i = *argIdx;
+
+       if (i + 1 >= params->nKernelArgs) {
+               printf("ERROR: argument %s requires a parameter.\n", params->KernelArgs[i]);
+               exit(1);
+       }
+
+       // Validate as long BEFORE narrowing to int, otherwise large values wrap
+       // around and pass or fail the check below arbitrarily.
+       long value = strtol(params->KernelArgs[i + 1], NULL, 0);
+
+       if (value <= 0) {
+               printf("ERROR: blocking parameter must be > 0.\n");
+               exit(1);
+       }
+
+       if (value > INT_MAX) {
+               printf("ERROR: blocking parameter must be <= %d.\n", INT_MAX);
+               exit(1);
+       }
+
+       *argIdx = i + 1;
+
+       return (int)value;
+}
+
+// Evaluates the kernel specific command line arguments.
+//
+// params - provides the arguments via KernelArgs / nKernelArgs.
+// blk    - receives the blocking factors for x, y and z (3 entries); an entry
+//          stays 0 if no blocking was requested for that direction.
+//
+// Recognized options (with one or two leading dashes):
+//   -blk <n>      blocking factor for all three directions
+//   -blk-x <n>    blocking factor for x (likewise -blk-y, -blk-z)
+//   -h, -help     print usage and exit
+// Unknown options and invalid values terminate the program with exit(1).
+static void ParseParameters(Parameters * params, int * blk)
+{
+       Assert(blk != NULL);
+
+       blk[0] = 0; blk[1] = 0; blk[2] = 0;
+
+       #define ARG_IS(param)                   (!strcmp(params->KernelArgs[i], param))
+
+       for (int i = 0; i < params->nKernelArgs; ++i) {
+               if (ARG_IS("-blk") || ARG_IS("--blk")) {
+                       blk[0] = blk[1] = blk[2] = ParseBlockingValue(params, &i);
+               }
+               else if (ARG_IS("-blk-x") || ARG_IS("--blk-x")) {
+                       blk[0] = ParseBlockingValue(params, &i);
+               }
+               else if (ARG_IS("-blk-y") || ARG_IS("--blk-y")) {
+                       blk[1] = ParseBlockingValue(params, &i);
+               }
+               else if (ARG_IS("-blk-z") || ARG_IS("--blk-z")) {
+                       blk[2] = ParseBlockingValue(params, &i);
+               }
+               else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) {
+                       ParameterUsage();
+                       exit(1);
+               }
+               else {
+                       printf("ERROR: unknown kernel parameter.\n");
+                       ParameterUsage();
+                       exit(1);
+               }
+       }
+
+       #undef ARG_IS
+
+       return;
+}
+
+// Sets up all data structures of the list-based D3Q19 kernel for the lattice
+// ld and returns them through *kernelData.
+//
+// The routine
+//   - allocates the KernelDataList structure and both PDF arrays,
+//   - numbers all non-obstacle nodes in a (optionally blocked, see the -blk
+//     kernel arguments) traversal order and records the numbering in the
+//     Grid and Coords arrays,
+//   - builds the adjacency list which holds, for every fluid node and every
+//     non-center direction, the index of the PDF involved in propagation;
+//     obstacles and non-periodic domain boundaries are encoded as bounce back,
+//   - registers the node/PDF access callbacks and the kernel function.
+//
+// ld          - lattice description (dimensions, node types, periodicity).
+// kernelData  - receives the newly allocated kernel data.
+// params      - parameters; passed on to the kernel argument parser only.
+//
+// Every allocation failure terminates the program via exit(1).
+void FNAME(D3Q19ListInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
+{
+       KernelData * kd;
+       KernelDataList * kdl;
+
+       // kdl is dereferenced right below, so a failed allocation must stop here.
+       if (MemAlloc((void **)&kdl, sizeof(KernelDataList))) {
+               printf("ERROR: allocating kernel data failed: %zu bytes.\n", sizeof(KernelDataList));
+               exit(1);
+       }
+
+       kd = (KernelData *)kdl;
+       *kernelData = kd;
+
+#ifdef DEBUG
+       kd->Pdfs[0] = NULL;
+       kd->Pdfs[1] = NULL;
+       kd->PdfsActive = NULL;
+       kd->DstPdfs = NULL;
+       kd->SrcPdfs = NULL;
+       kd->Dims[0] = -1;
+       kd->Dims[1] = -1;
+       kd->Dims[2] = -1;
+       kd->GlobalDims[0] = -1;
+       kd->GlobalDims[1] = -1;
+       kd->GlobalDims[2] = -1;
+       kd->Offsets[0] = -1;
+       kd->Offsets[1] = -1;
+       kd->Offsets[2] = -1;
+
+       kd->ObstIndices = NULL;
+       kd->nObstIndices = -1;
+       kd->BounceBackPdfsSrc = NULL;
+       kd->BounceBackPdfsDst = NULL;
+       kd->nBounceBackPdfs = -1;
+
+       kdl->AdjList = NULL;
+       kdl->Coords = NULL;
+       kdl->Grid = NULL;
+       kdl->nCells = -1;
+       kdl->nFluid = -1;
+#endif
+
+       // Adjust the dimensions according to padding, if used.
+       kd->Dims[0] = kd->GlobalDims[0] = ld->Dims[0];
+       kd->Dims[1] = kd->GlobalDims[1] = ld->Dims[1];
+       kd->Dims[2] = kd->GlobalDims[2] = ld->Dims[2];
+
+       int * lDims = ld->Dims;
+
+       int lX = lDims[0];
+       int lY = lDims[1];
+       int lZ = lDims[2];
+
+       int nTotalCells = lX * lY * lZ;
+       int nCells = ld->nFluid; // TODO: + padding
+       int nFluid = ld->nFluid;
+
+       kdl->nCells = nCells;
+       kdl->nFluid = nFluid;
+
+       PdfT * pdfs[2];
+
+       int blk[3] = { 0 };
+
+       ParseParameters(params, blk);
+
+       // A blocking factor of 0 means "not specified": use the full extent.
+       if (blk[0] == 0) blk[0] = lX;
+       if (blk[1] == 0) blk[1] = lY;
+       if (blk[2] == 0) blk[2] = lZ;
+
+       printf("# blocking               x: %3d y: %3d z: %3d\n", blk[0], blk[1], blk[2]);
+
+       // Size of ONE PDF array (lattice) in bytes.
+       size_t pdfBytes = sizeof(PdfT) * nCells * N_D3Q19;
+
+       printf("# allocating data for %d fluid LB nodes with padding (%zu bytes = %f MiB for both lattices)\n",
+               nCells, 2 * pdfBytes, 2 * pdfBytes / 1024.0 / 1024.0);
+
+       if (MemAlloc((void **)&pdfs[0], pdfBytes) || MemAlloc((void **)&pdfs[1], pdfBytes)) {
+               printf("ERROR: allocating PDF arrays failed: %zu bytes each.\n", pdfBytes);
+               exit(1);
+       }
+
+       kd->Pdfs[0] = pdfs[0];
+       kd->Pdfs[1] = pdfs[1];
+
+       // Initialize PDFs with some (arbitrary) data for correct NUMA placement.
+       // Here we touch only the fluid nodes as this loop is OpenMP parallel and
+       // we want the same scheduling as in the kernel.
+       #ifdef _OPENMP
+               #pragma omp parallel for
+       #endif
+       for (int i = 0; i < nFluid; ++i) { for(int d = 0; d < N_D3Q19; ++d) {
+               pdfs[0][P_INDEX_3(nCells, i, d)] = 1.0;
+               pdfs[1][P_INDEX_3(nCells, i, d)] = 1.0;
+       } }
+
+       // Initialize all PDFs to some standard value.
+       for (int i = 0; i < nFluid; ++i) { for(int d = 0; d < N_D3Q19; ++d) {
+               pdfs[0][P_INDEX_3(nCells, i, d)] = 0.0;
+               pdfs[1][P_INDEX_3(nCells, i, d)] = 0.0;
+       } }
+
+       // ----------------------------------------------------------------------
+       // create grid which will hold the index numbers of the fluid nodes
+
+       uint32_t * grid;
+
+       if (MemAlloc((void **)&grid, nTotalCells * sizeof(uint32_t))) {
+               printf("ERROR: allocating grid for numbering failed: %zu bytes.\n", nTotalCells * sizeof(uint32_t));
+               exit(1);
+       }
+       kdl->Grid = grid;
+
+       int latticeIndex;
+
+#ifdef DEBUG
+       // Mark every cell as "has no node index", so the asserts during adjacency
+       // list construction can detect missing entries.
+       for(int z = 0; z < lZ; ++z) {
+               for(int y = 0; y < lY; ++y) {
+                       for(int x = 0; x < lX; ++x) {
+
+                               latticeIndex = L_INDEX_4(ld->Dims, x, y, z);
+
+                               grid[latticeIndex] = ~0;
+                       }
+               }
+       }
+#endif
+
+       // ----------------------------------------------------------------------
+       // generate numbering over grid
+
+       uint32_t * coords;
+
+       if (MemAlloc((void **)&coords, nFluid * sizeof(uint32_t) * 3)) {
+               printf("ERROR: allocating coords array failed: %zu bytes.\n", nFluid * sizeof(uint32_t) * 3);
+               exit(1);
+       }
+
+       kdl->Coords = coords;
+
+       // Index for the PDF nodes can start at 0 as we distinguish solid and fluid nodes
+       // through the ld->Lattice array.
+       int counter = 0;
+
+       // Blocking is implemented via setup of the adjacency list. The kernel later will
+       // walk through the lattice blocked automatically.
+       for (int bZ = 0; bZ < lZ; bZ += blk[2]) {
+       for (int bY = 0; bY < lY; bY += blk[1]) {
+       for (int bX = 0; bX < lX; bX += blk[0]) {
+
+               int eX = MIN(bX + blk[0], lX);
+               int eY = MIN(bY + blk[1], lY);
+               int eZ = MIN(bZ + blk[2], lZ);
+
+               for (int z = bZ; z < eZ; ++z) {
+               for (int y = bY; y < eY; ++y) {
+               for (int x = bX; x < eX; ++x) {
+
+                       latticeIndex = L_INDEX_4(lDims, x, y, z);
+
+                       if (ld->Lattice[latticeIndex] != LAT_CELL_OBSTACLE) {
+                               // coords holds exactly nFluid entries; stop before
+                               // writing past its end if ld->nFluid is too small.
+                               Verify(counter < nFluid);
+
+                               grid[latticeIndex] = counter;
+
+                               coords[C_INDEX_X(counter)] = x;
+                               coords[C_INDEX_Y(counter)] = y;
+                               coords[C_INDEX_Z(counter)] = z;
+
+                               ++counter;
+                       }
+               } } }
+       } } }
+
+       Verify(counter == nFluid);
+
+       uint32_t * adjList;
+
+       // AdjList only requires 18 instead of 19 entries per node, as
+       // the center PDF needs no addressing.
+       if (MemAlloc((void **)&adjList, nFluid * sizeof(uint32_t) * N_D3Q19_IDX)) {
+               printf("ERROR: allocating adjList array failed: %zu bytes.\n", nFluid * sizeof(uint32_t) * N_D3Q19_IDX);
+               exit(1);
+       }
+
+       kdl->AdjList = adjList;
+
+       // Loop over all fluid nodes and compute the indices to the neighboring
+       // PDFs for configured data layout (AoS/SoA).
+       // TODO: Parallelize this loop (OpenMP) to ensure correct NUMA placement of
+       //       adjList. All per-iteration variables are declared inside the loop
+       //       and are therefore private to a thread.
+       for (int index = 0; index < nFluid; ++index) {
+               int x = coords[C_INDEX_X(index)];
+               int y = coords[C_INDEX_Y(index)];
+               int z = coords[C_INDEX_Z(index)];
+
+               Assert(x >= 0 && x < lX);
+               Assert(y >= 0 && y < lY);
+               Assert(z >= 0 && z < lZ);
+
+               Assert(ld->Lattice[L_INDEX_4(lDims, x, y, z)] != LAT_CELL_OBSTACLE);
+
+               // Loop over all directions except the center one.
+               for(int d = 0; d < N_D3Q19 - 1; ++d) {
+                       Assert(d != D3Q19_C);
+
+                       int nx, ny, nz;
+
+#ifdef PROP_MODEL_PUSH
+                       nx = x + D3Q19_X[d];
+                       ny = y + D3Q19_Y[d];
+                       nz = z + D3Q19_Z[d];
+#elif PROP_MODEL_PULL
+                       nx = x - D3Q19_X[d];
+                       ny = y - D3Q19_Y[d];
+                       nz = z - D3Q19_Z[d];
+#else
+                       #error No implementation for this PROP_MODEL_NAME.
+#endif
+                       // Resolve the neighbor position per axis: if the neighbor is
+                       // outside the lattice along an axis, it is wrapped around when
+                       // that axis is periodic. Leaving the lattice through a
+                       // NON-periodic axis always results in bounce back, even if
+                       // another axis of the same (diagonal) direction is periodic.
+                       int px = nx;
+                       int py = ny;
+                       int pz = nz;
+                       int outside = 0;
+
+                       if (nx < 0 || nx >= lX) {
+                               if (ld->PeriodicX) { px = (nx < 0) ? lX - 1 : 0; }
+                               else               { outside = 1; }
+                       }
+
+                       if (ny < 0 || ny >= lY) {
+                               if (ld->PeriodicY) { py = (ny < 0) ? lY - 1 : 0; }
+                               else               { outside = 1; }
+                       }
+
+                       if (nz < 0 || nz >= lZ) {
+                               if (ld->PeriodicZ) { pz = (nz < 0) ? lZ - 1 : 0; }
+                               else               { outside = 1; }
+                       }
+
+                       uint32_t dstIndex;
+
+                       // 'outside' is tested first, so the lattice is never indexed
+                       // with an out-of-range position.
+                       if (outside || ld->Lattice[L_INDEX_4(lDims, px, py, pz)] == LAT_CELL_OBSTACLE) {
+                               // Bounce back: refer to the inverse direction of this node.
+                               dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
+                       }
+                       else {
+                               uint32_t neighborIndex = grid[L_INDEX_4(lDims, px, py, pz)];
+
+                               AssertMsg(neighborIndex != ~0, "Neighbor has no Index. (%d %d %d) direction %s (%d)\n", px, py, pz, D3Q19_NAMES[d], d);
+
+                               dstIndex = P_INDEX_3(nCells, neighborIndex, d);
+                       }
+
+                       Assert(dstIndex < nCells * N_D3Q19);
+
+                       adjList[index * N_D3Q19_IDX + d] = dstIndex;
+               }
+       }
+
+
+       // Fill remaining KernelData structures
+       kd->GetNode = GetNode;
+       kd->SetNode = SetNode;
+
+       kd->BoundaryConditionsGetPdf = FNAME(BCGetPdf);
+       kd->BoundaryConditionsSetPdf = FNAME(BCSetPdf);
+
+       kd->Kernel = FNAME(D3Q19ListKernel);
+
+       kd->DstPdfs = NULL;
+       kd->PdfsActive = kd->Pdfs[0];
+
+       return;
+}
+
+// Tears down a kernel instance created by the list kernel's Init routine:
+// releases both PDF arrays, the list arrays (AdjList, Coords, Grid) and
+// finally the kernel data structure itself.
+//
+// ld          - lattice description; not used by this routine.
+// kernelData  - address of the kernel data pointer; the pointer itself is
+//               handed to MemFree as the last step.
+void FNAME(D3Q19ListDeinit)(LatticeDesc * ld, KernelData ** kernelData)
+{
+       // Same allocation, viewed as base type and as list specific type.
+       KernelData     * base = *kernelData;
+       KernelDataList * list = (KernelDataList *)(*kernelData);
+
+       MemFree((void **)&base->Pdfs[0]);
+       MemFree((void **)&base->Pdfs[1]);
+
+       MemFree((void **)&list->AdjList);
+       MemFree((void **)&list->Coords);
+       MemFree((void **)&list->Grid);
+
+       MemFree((void **)kernelData);
+}
+
diff --git a/src/BenchKernelD3Q19ListCommon.h b/src/BenchKernelD3Q19ListCommon.h
new file mode 100644 (file)
index 0000000..c62b0d0
--- /dev/null
@@ -0,0 +1,136 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#ifndef __BENCH_KERNEL_D3Q19_LIST_COMMON_H__
+#define __BENCH_KERNEL_D3Q19_LIST_COMMON_H__
+
+
+#include "Kernel.h"
+
+#include <inttypes.h>
+
+// Number of adjacency list entries stored per node; the entry for node index n
+// and direction d is located at (n * N_D3Q19_IDX) + d.
+#define N_D3Q19_IDX            18
+
+// Kernel data used by the list based kernels. The generic KernelData is the
+// first member, so that a KernelData * can be cast to a KernelDataList *
+// (see the KDL macro).
+typedef struct KernelDataList_
+{
+       KernelData kd;
+       uint32_t * AdjList;     // Stores PDF indices, which are the destination for propagation.
+                                               // Determine the destination for node index n and direction d via:
+                                               // (n * N_D3Q19_IDX) + d
+       uint32_t * Grid;        // Stores the node indices; use L_INDEX_4 macro for access.
+       uint32_t * Coords;      // Maps node indices to coordinates; use C_INDEX_* macro for access.
+       int nFluid;                     // Number of fluid nodes; AdjList is indexed with N_D3Q19_IDX entries
+                                               // per fluid node (presumably its length is nFluid * N_D3Q19_IDX;
+                                               // confirm against the allocation).
+       int nCells;                     // Total number of nodes allocated, including nodes for padding!
+} KernelDataList;
+
+
+// Macro for casting KernelData * to KernelDataList *.
+#define KDL(_x_)       ((KernelDataList *)(_x_))
+
+
+
+
+// Build a function name extended by the propagation model name and the data layout.
+// FNAME(test) will be expanded to test_PushSoA if DATA_LAYOUT_NAME is defined
+// as SoA and PROP_MODEL_NAME is defined as Push.
+#define FNAME(functionName)    JOIN(JOIN(functionName,_),JOIN(PROP_MODEL_NAME,DATA_LAYOUT_NAME))
+
+// Both name fragments must be supplied by the including source file (or the
+// build), otherwise FNAME cannot produce a meaningful identifier.
+#ifndef DATA_LAYOUT_NAME
+       #error DATA_LAYOUT_NAME must be defined
+#endif
+
+#ifndef PROP_MODEL_NAME
+       #error PROP_MODEL_NAME must be defined
+#endif
+
+// -----------------------------------------------------------------------
+// Index function for accessing the PDF array for different data layouts.
+
+#define P_INDEX_3              FNAME(PINDEX3)
+
+static inline int FNAME(PINDEX3)(int nCells, int cellIndex, int d)
+{
+       Assert(nCells > 0);
+       Assert(cellIndex >= 0);
+       Assert(cellIndex < nCells);
+
+       Assert(d >= 0);
+       #ifdef D3Q19
+               Assert(d < N_D3Q19);
+       #else
+               #error Not implemented for this discretization.
+       #endif
+
+#ifdef DATA_LAYOUT_SOA
+       return d * nCells + cellIndex;
+#elif  DATA_LAYOUT_AOS
+       return cellIndex * N_D3Q19 + d;
+#else
+       #error P_INDEX_3 function not implemented for chosen data layout.
+#endif
+}
+
+#define P_INDEX_5              FNAME(PINDEX5)
+
+// Returns the PDF array index for the node at lattice position (x, y, z) and
+// direction d: the position is translated into a node index via the Grid
+// array and then handed to P_INDEX_3 together with kdl->nCells.
+static inline int FNAME(PINDEX5)(KernelDataList * kdl, int x, int y, int z, int d)
+{
+       Assert(kdl != NULL);
+#ifdef DEBUG
+       // Range checks of all arguments, only compiled into debug builds.
+       uint32_t * grid = kdl->Grid;
+       int * dims = kdl->kd.Dims;
+
+       Assert(grid != NULL);
+       Assert(dims != NULL);
+       Assert(dims[0] > 0);
+       Assert(dims[1] > 0);
+       Assert(dims[2] > 0);
+       Assert(x >= 0 && x < dims[0]);
+       Assert(y >= 0 && y < dims[1]);
+       Assert(z >= 0 && z < dims[2]);
+       Assert(d >= 0 && d < N_D3Q19);
+#endif
+
+       // Node index stored for this lattice position.
+       const uint32_t nodeIndex = kdl->Grid[L_INDEX_4(kdl->kd.Dims, x, y, z)];
+
+       return P_INDEX_3(kdl->nCells, nodeIndex, d);
+}
+
+// -----------------------------------------------------------------------
+// Macros for accessing coord array
+
+#define C_INDEX_X(cellIndex)   C_INDEX(cellIndex, 0)
+#define C_INDEX_Y(cellIndex)   C_INDEX(cellIndex, 1)
+#define C_INDEX_Z(cellIndex)   C_INDEX(cellIndex, 2)
+
+// Index into the coordinate array for node cellIndex and component xyz
+// (0, 1 or 2, as used by C_INDEX_X, C_INDEX_Y and C_INDEX_Z): each node
+// occupies three consecutive entries.
+static inline int C_INDEX(int cellIndex, int xyz)
+{
+       Assert(cellIndex >= 0);
+       Assert(xyz >= 0);
+       Assert(xyz < 3);
+
+       const int tripleStart = 3 * cellIndex;
+
+       return tripleStart + xyz;
+}
+
+
+#endif // __BENCH_KERNEL_D3Q19_LIST_COMMON_H__
diff --git a/src/BenchKernelD3Q19ListPullSplitNt.c b/src/BenchKernelD3Q19ListPullSplitNt.c
new file mode 100644 (file)
index 0000000..dfab54a
--- /dev/null
@@ -0,0 +1,455 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#include "BenchKernelD3Q19ListPullSplitNtCommon.h"
+
+#include "Memory.h"
+#include "Vtk.h"
+#include "Vector.h"
+
+#include <inttypes.h>
+#include <math.h>
+
+#ifdef _OPENMP
+       #include <omp.h>
+#endif
+
+// Rows of the per-thread temporary array used by the kernels below. Rows
+// 0..17 are addressed with the direction indices of the PDFs gathered from
+// the adjacency list; the rows defined here hold the velocity components and
+// the two direction independent terms of a node.
+#define TMP_UX 18
+#define TMP_UY 19
+#define TMP_UZ 20
+#define TMP_W1 21
+#define TMP_W2 22
+
+// Number of rows of the temporary array.
+#define N_TMP 23
+
+// Position of entry tmp_index in row tmp_dir; every row has nTmpArray
+// entries. nTmpArray must be in scope where the macro is used. The expansion
+// is fully parenthesized so it can safely be used inside larger expressions.
+#define TMP_INDEX(tmp_index, tmp_dir)  (nTmpArray * (tmp_dir) + (tmp_index))
+
+void FNAME(KernelPullSplitNt1S)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd)
+{
+
+       Assert(ld != NULL);
+       Assert(kernelData != NULL);
+       Assert(cd != NULL);
+
+       Assert(cd->Omega > 0.0);
+       Assert(cd->Omega < 2.0);
+
+       KernelData        * kd   = (KernelData *)kernelData;
+       KernelDataList    * kdl  = KDL(kernelData);
+       KernelDataListRia * kdlr = KDLR(kernelData);
+
+       PdfT omega = cd->Omega;
+       const PdfT omegaEven = omega;
+
+       PdfT magicParam = 1.0 / 12.0;
+       const PdfT omegaOdd = 1.0 / (0.5 + magicParam / (1.0 / omega - 0.5));
+
+
+       const PdfT w_0 = 1.0 /  3.0;
+       const PdfT w_1 = 1.0 / 18.0;
+       const PdfT w_2 = 1.0 / 36.0;
+
+       const PdfT w_1_x3 = w_1 * 3.0;  const PdfT w_1_nine_half = w_1 * 9.0 / 2.0;
+       const PdfT w_2_x3 = w_2 * 3.0;  const PdfT w_2_nine_half = w_2 * 9.0 / 2.0;
+
+       const VPDFT vw_1_x3 = VSET(w_1_x3);
+       const VPDFT vw_2_x3 = VSET(w_2_x3);
+
+       const VPDFT vw_1_nine_half = VSET(w_1_nine_half);
+       const VPDFT vw_2_nine_half = VSET(w_2_nine_half);
+
+       const VPDFT vomegaEven = VSET(omegaEven);
+       const VPDFT vomegaOdd  = VSET(omegaOdd);
+
+       const VPDFT voneHalf = VSET(0.5);
+
+       // uint32_t nConsecNodes = kdlr->nConsecNodes;
+       // uint32_t * consecNodes = kdlr->ConsecNodes;
+       // uint32_t consecIndex = 0;
+       // uint32_t consecValue = 0;
+
+       PdfT * src = kd->Pdfs[0];
+       PdfT * dst = kd->Pdfs[1];
+       PdfT * tmp;
+
+       int maxIterations  = cd->MaxIterations;
+
+       int nFluid         = kdl->nFluid;
+       int nCells         = kdl->nCells;
+
+       int nTmpArray      = kdlr->nTmpArray;
+
+       Assert(nTmpArray % VSIZE == 0);
+
+       uint32_t * adjList = kdl->AdjList;
+
+       #ifdef VTK_OUTPUT
+               if (cd->VtkOutput) {
+                       kd->PdfsActive = src;
+                       VtkWrite(ld, kd, cd, -1);
+               }
+       #endif
+
+       #ifdef STATISTICS
+               kd->PdfsActive = src;
+               KernelStatistics(kd, ld, cd, 0);
+       #endif
+
+       #ifdef _OPENMP
+               #pragma omp parallel default(none) \
+                       shared(nFluid, nCells, kd, kdl, adjList, src, dst, \
+                       cd, maxIterations, ld, tmp, nTmpArray, \
+                       stderr )
+       #endif
+       {
+               uint32_t adjListIndex;
+
+               PdfT ux, uy, uz, ui;
+               VPDFT vux, vuy, vuz, vui;
+
+               #define X(name, idx, idxinv, x, y, z)   PdfT JOIN(pdf_,name);
+               D3Q19_LIST
+               #undef X
+               VPDFT vpdf_a, vpdf_b;
+
+               PdfT evenPart, oddPart, dir_indep_trm, dens;
+               PdfT w_1_indep, w_2_indep;
+               VPDFT vevenPart, voddPart;
+               VPDFT vw_1_indep, vw_2_indep;
+
+               int indexMax;
+
+               PdfT * tmpArray;
+               MemAllocAligned((void **)&tmpArray, sizeof(PdfT) * nTmpArray * N_TMP, VSIZE * sizeof(PdfT));
+
+               int nThreads = 1;
+               int threadId = 0;
+
+#ifdef _OPENMP
+               nThreads = omp_get_max_threads();
+               threadId = omp_get_thread_num();
+#endif
+
+               int nCellsThread = nFluid / nThreads;
+               int blIndexStart = threadId * nCellsThread;
+
+               if (threadId < nFluid % nThreads) {
+                       blIndexStart += threadId;
+                       nCellsThread += 1;
+               }
+               else {
+                       blIndexStart += nFluid % nThreads;
+               }
+
+               int blIndexStop = blIndexStart + nCellsThread;
+
+               // We have three loops:
+               // 1. Peeling to ensure alignment for non-temporal stores in loop 2 is correct.
+               // 2. Vectorized handling of nodes.
+               // 3. Remaining nodes, less than vector size.
+
+               unsigned long addrStart = (unsigned long)&(src[P_INDEX_3(nCells, blIndexStart, 0)]);
+               int nCellsUnaligned = (VSIZE - (int)((addrStart / sizeof(PdfT)) % VSIZE)) % VSIZE;
+
+               int nCellsVectorized = nCellsThread - nCellsUnaligned;
+               nCellsVectorized = nCellsVectorized - (nCellsVectorized % VSIZE);
+
+               int blIndexVec       = blIndexStart + nCellsUnaligned;
+               int blIndexRemaining = blIndexStart + nCellsUnaligned + nCellsVectorized;
+
+               // printf("%d [%d, %d, %d, %d[\n", threadId, blIndexStart, blIndexVec, blIndexRemaining, blIndexStop);
+
+               for(int iter = 0; iter < maxIterations; ++iter) {
+
+#if 1
+                       #define INDEX_START     blIndexStart
+                       #define INDEX_STOP  blIndexVec
+                       #include "BenchKernelD3Q19ListPullSplitNt1SScalar.h"
+
+                       #define INDEX_START blIndexVec
+                       #define INDEX_STOP  blIndexRemaining
+                       #include "BenchKernelD3Q19ListPullSplitNt1SIntrinsics.h"
+
+                       #define INDEX_START blIndexRemaining
+                       #define INDEX_STOP      blIndexStop
+                       #include "BenchKernelD3Q19ListPullSplitNt1SScalar.h"
+#else
+                       #define INDEX_START blIndexStart
+                       #define INDEX_STOP      blIndexStop
+                       #include "BenchKernelD3Q19ListPullSplitNt1SScalar.h"
+#endif
+                       #pragma omp barrier
+
+                       #pragma omp single
+                       {
+                               #ifdef VERIFICATION
+                                       kd->PdfsActive = dst;
+                                       KernelAddBodyForce(kd, ld, cd);
+                               #endif
+
+                               #ifdef VTK_OUTPUT
+                                       if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) {
+                                               kd->PdfsActive = dst;
+                                               VtkWrite(ld, kd, cd, iter);
+                                       }
+                               #endif
+
+                               #ifdef STATISTICS
+                                       kd->PdfsActive = dst;
+                                       KernelStatistics(kd, ld, cd, iter);
+                               #endif
+
+                               // swap grids
+                               tmp = src;
+                               src = dst;
+                               dst = tmp;
+                       }
+
+                       #pragma omp barrier
+
+               } // for (int iter = 0; ...
+
+               MemFree((void **)&tmpArray);
+       }
+
+#ifdef VTK_OUTPUT
+       if (cd->VtkOutput) {
+               kd->PdfsActive = src;
+               VtkWrite(ld, kd, cd, maxIterations);
+       }
+#endif
+
+#ifdef STATISTICS
+       kd->PdfsActive = src;
+       KernelStatistics(kd, ld, cd, maxIterations);
+#endif
+
+       return;
+}
+
+void FNAME(KernelPullSplitNt2S)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd)
+{
+
+       Assert(ld != NULL);
+       Assert(kernelData != NULL);
+       Assert(cd != NULL);
+
+       Assert(cd->Omega > 0.0);
+       Assert(cd->Omega < 2.0);
+
+       KernelData        * kd   = (KernelData *)kernelData;
+       KernelDataList    * kdl  = KDL(kernelData);
+       KernelDataListRia * kdlr = KDLR(kernelData);
+
+       PdfT omega = cd->Omega;
+       const PdfT omegaEven = omega;
+
+       PdfT magicParam = 1.0 / 12.0;
+       const PdfT omegaOdd = 1.0 / (0.5 + magicParam / (1.0 / omega - 0.5));
+
+
+       const PdfT w_0 = 1.0 /  3.0;
+       const PdfT w_1 = 1.0 / 18.0;
+       const PdfT w_2 = 1.0 / 36.0;
+
+       const PdfT w_1_x3 = w_1 * 3.0;  const PdfT w_1_nine_half = w_1 * 9.0 / 2.0;
+       const PdfT w_2_x3 = w_2 * 3.0;  const PdfT w_2_nine_half = w_2 * 9.0 / 2.0;
+
+       const VPDFT vw_1_x3 = VSET(w_1_x3);
+       const VPDFT vw_2_x3 = VSET(w_2_x3);
+
+       const VPDFT vw_1_nine_half = VSET(w_1_nine_half);
+       const VPDFT vw_2_nine_half = VSET(w_2_nine_half);
+
+       const VPDFT vomegaEven = VSET(omegaEven);
+       const VPDFT vomegaOdd  = VSET(omegaOdd);
+
+       const VPDFT voneHalf = VSET(0.5);
+
+       // uint32_t nConsecNodes = kdlr->nConsecNodes;
+       // uint32_t * consecNodes = kdlr->ConsecNodes;
+       // uint32_t consecIndex = 0;
+       // uint32_t consecValue = 0;
+
+       PdfT * src = kd->Pdfs[0];
+       PdfT * dst = kd->Pdfs[1];
+       PdfT * tmp;
+
+       int maxIterations  = cd->MaxIterations;
+
+       int nFluid         = kdl->nFluid;
+       int nCells         = kdl->nCells;
+
+       int nTmpArray      = kdlr->nTmpArray;
+
+       Assert(nTmpArray % VSIZE == 0);
+
+       uint32_t * adjList = kdl->AdjList;
+
+       #ifdef VTK_OUTPUT
+               if (cd->VtkOutput) {
+                       kd->PdfsActive = src;
+                       VtkWrite(ld, kd, cd, -1);
+               }
+       #endif
+
+       #ifdef STATISTICS
+               kd->PdfsActive = src;
+               KernelStatistics(kd, ld, cd, 0);
+       #endif
+
+       #ifdef _OPENMP
+               #pragma omp parallel default(none) \
+                       shared(nFluid, nCells, kd, kdl, adjList, src, dst, \
+                       cd, maxIterations, ld, tmp, nTmpArray, \
+                       stderr )
+       #endif
+       {
+               uint32_t adjListIndex;
+
+               PdfT ux, uy, uz, ui;
+               VPDFT vux, vuy, vuz, vui;
+
+               #define X(name, idx, idxinv, x, y, z)   PdfT JOIN(pdf_,name);
+               D3Q19_LIST
+               #undef X
+               VPDFT vpdf_a, vpdf_b;
+
+               PdfT evenPart, oddPart, dir_indep_trm, dens;
+               PdfT w_1_indep, w_2_indep;
+               VPDFT vevenPart, voddPart;
+               VPDFT vw_1_indep, vw_2_indep;
+
+               int indexMax;
+
+               PdfT * tmpArray;
+               MemAlloc((void **)&tmpArray, sizeof(PdfT) * nTmpArray * N_TMP);
+
+               int nThreads = 1;
+               int threadId = 0;
+
+#ifdef _OPENMP
+               nThreads = omp_get_max_threads();
+               threadId = omp_get_thread_num();
+#endif
+
+               int nCellsThread = nFluid / nThreads;
+               int blIndexStart = threadId * nCellsThread;
+
+               if (threadId < nFluid % nThreads) {
+                       blIndexStart += threadId;
+                       nCellsThread += 1;
+               }
+               else {
+                       blIndexStart += nFluid % nThreads;
+               }
+
+               int blIndexStop = blIndexStart + nCellsThread;
+
+               // We have three loops:
+               // 1. Peeling to ensure alignment for non-temporal stores in loop 2 is correct.
+               // 2. Vectorized handling of nodes.
+               // 3. Remaining nodes, less than vector size.
+
+               unsigned long addrStart = (unsigned long)&(src[P_INDEX_3(nCells, blIndexStart, 0)]);
+               int nCellsUnaligned = (VSIZE - (int)((addrStart / sizeof(PdfT)) % VSIZE)) % VSIZE;
+
+               int nCellsVectorized = nCellsThread - nCellsUnaligned;
+               nCellsVectorized = nCellsVectorized - (nCellsVectorized % VSIZE);
+
+               int blIndexVec       = blIndexStart + nCellsUnaligned;
+               int blIndexRemaining = blIndexStart + nCellsUnaligned + nCellsVectorized;
+
+               // printf("%d [%d, %d, %d, %d[\n", threadId, blIndexStart, blIndexVec, blIndexRemaining, blIndexStop);
+
+               for(int iter = 0; iter < maxIterations; ++iter) {
+
+#if 1
+                       #define INDEX_START     blIndexStart
+                       #define INDEX_STOP  blIndexVec
+                       #include "BenchKernelD3Q19ListPullSplitNt2SScalar.h"
+
+                       #define INDEX_START blIndexVec
+                       #define INDEX_STOP  blIndexRemaining
+                       #include "BenchKernelD3Q19ListPullSplitNt2SIntrinsics.h"
+
+                       #define INDEX_START blIndexRemaining
+                       #define INDEX_STOP      blIndexStop
+                       #include "BenchKernelD3Q19ListPullSplitNt2SScalar.h"
+#else
+                       #define INDEX_START blIndexStart
+                       #define INDEX_STOP      blIndexStop
+                       #include "BenchKernelD3Q19ListPullSplitNt2SScalar.h"
+#endif
+                       #pragma omp barrier
+
+                       #pragma omp single
+                       {
+                               #ifdef VERIFICATION
+                                       kd->PdfsActive = dst;
+                                       KernelAddBodyForce(kd, ld, cd);
+                               #endif
+
+                               #ifdef VTK_OUTPUT
+                                       if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) {
+                                               kd->PdfsActive = dst;
+                                               VtkWrite(ld, kd, cd, iter);
+                                       }
+                               #endif
+
+                               #ifdef STATISTICS
+                                       kd->PdfsActive = dst;
+                                       KernelStatistics(kd, ld, cd, iter);
+                               #endif
+
+                               // swap grids
+                               tmp = src;
+                               src = dst;
+                               dst = tmp;
+                       }
+
+                       #pragma omp barrier
+
+               } // for (int iter = 0; ...
+
+               MemFree((void **)&tmpArray);
+       }
+
+#ifdef VTK_OUTPUT
+       if (cd->VtkOutput) {
+               kd->PdfsActive = src;
+               VtkWrite(ld, kd, cd, maxIterations);
+       }
+#endif
+
+#ifdef STATISTICS
+       kd->PdfsActive = src;
+       KernelStatistics(kd, ld, cd, maxIterations);
+#endif
+
+       return;
+}
+
diff --git a/src/BenchKernelD3Q19ListPullSplitNt.h b/src/BenchKernelD3Q19ListPullSplitNt.h
new file mode 100644 (file)
index 0000000..bdfa628
--- /dev/null
@@ -0,0 +1,41 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+// Public interface of the list based pull kernels with split loops and
+// non-temporal stores: one init function per kernel variant (1S, 2S) and a
+// common deinit function.
+#ifndef __BENCH_KERNEL_D3Q19_LIST_PULL_SPLIT_NT__
+#define __BENCH_KERNEL_D3Q19_LIST_PULL_SPLIT_NT__
+
+#include "Kernel.h"
+
+// The names carry the _PullSoA suffix written out; presumably this is what
+// FNAME() produces for the Pull propagation model and the SoA data layout -
+// confirm against the defining source file.
+void D3Q19ListPullSplitNt1SInit_PullSoA(LatticeDesc * ld, KernelData ** kernelData, Parameters * params);
+void D3Q19ListPullSplitNt2SInit_PullSoA(LatticeDesc * ld, KernelData ** kernelData, Parameters * params);
+// void D3Q19ListPullSplitNtRia1SInit_PullSoA(LatticeDesc * ld, KernelData ** kernelData, Parameters * params);
+// void D3Q19ListPullSplitNtRia2SInit_PullSoA(LatticeDesc * ld, KernelData ** kernelData, Parameters * params);
+
+// Counterpart to both init functions above.
+void D3Q19ListPullSplitNtDeinit_PullSoA(LatticeDesc * ld, KernelData ** kernelData);
+
+
+
+#endif // __BENCH_KERNEL_D3Q19_LIST_PULL_SPLIT_NT__
diff --git a/src/BenchKernelD3Q19ListPullSplitNt1SIntrinsics.h b/src/BenchKernelD3Q19ListPullSplitNt1SIntrinsics.h
new file mode 100644 (file)
index 0000000..a3e586b
--- /dev/null
@@ -0,0 +1,142 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+
+#ifndef INDEX_START
+       #error INDEX_START must be defined
+#endif
+
+#ifndef INDEX_STOP
+       #error INDEX_STOP must be defined
+#endif
+
+       #define I(index, dir)   P_INDEX_3((nCells), (index), (dir))
+
+       for (int blockedIndex = (INDEX_START); blockedIndex < (INDEX_STOP); blockedIndex += nTmpArray) {
+
+               indexMax = MinI(nTmpArray, (INDEX_STOP) - blockedIndex);
+#ifdef DEBUG
+               memset(tmpArray, -1, sizeof(PdfT) * nTmpArray * N_TMP);
+#endif
+               for (int index = 0; index < indexMax; ++index) {
+
+
+                       adjListIndex = (index + blockedIndex) * N_D3Q19_IDX;
+
+                       pdf_C = src[I(index + blockedIndex, D3Q19_C)];
+
+                       #define X(name, idx, idxinv, _x, _y, _z)        JOIN(pdf_,name) = src[adjList[adjListIndex + idx]]; tmpArray[TMP_INDEX(index, idx)] = JOIN(pdf_,name);
+                       D3Q19_LIST_WO_C
+                       #undef X
+
+                       ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE -
+                                pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW;
+                       uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN -
+                                pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS;
+                       uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS -
+                                pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS;
+
+                       tmpArray[TMP_INDEX(index, TMP_UX)] = ux;
+                       tmpArray[TMP_INDEX(index, TMP_UY)] = uy;
+                       tmpArray[TMP_INDEX(index, TMP_UZ)] = uz;
+
+                       dens = pdf_C +
+                                  pdf_N  + pdf_E  + pdf_S  + pdf_W  +
+                                  pdf_NE + pdf_SE + pdf_SW + pdf_NW +
+                                  pdf_T  + pdf_TN + pdf_TE + pdf_TS + pdf_TW +
+                                  pdf_B  + pdf_BN + pdf_BE + pdf_BS + pdf_BW;
+
+                       dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz) * 3.0 / 2.0;
+
+                       w_1_indep = w_1 * dir_indep_trm;
+                       w_2_indep = w_2 * dir_indep_trm;
+
+                       tmpArray[TMP_INDEX(index, TMP_W1)] = w_1_indep;
+                       tmpArray[TMP_INDEX(index, TMP_W2)] = w_2_indep;
+
+                       dst[I(index + blockedIndex, D3Q19_C )]  = pdf_C - omegaEven * (pdf_C - w_0 * dir_indep_trm);
+               }
+
+               #define LOOP_1(_dir1, _dir2, _vel, _vel_tmp) \
+                       for (int index = 0; index < indexMax; index += VSIZE) { \
+                               Assert((unsigned long)&(dst[I(index + blockedIndex, JOIN(D3Q19_,_dir1) )]) % VSIZE == 0); \
+                               vui         = VLDU(&tmpArray[TMP_INDEX(index, JOIN(TMP_,_vel_tmp))]); \
+                               vpdf_a      = VLDU(&tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir1))]); \
+                               vpdf_b      = VLDU(&tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir2))]); \
+                               vw_1_indep  = VLDU(&tmpArray[TMP_INDEX(index, TMP_W1)]); \
+                               \
+                               vevenPart = VMUL(vomegaEven, VSUB(VSUB(VMUL(voneHalf, VADD(vpdf_a, vpdf_b)), VMUL(vui, VMUL(vui, vw_1_nine_half))), vw_1_indep)); \
+                               voddPart  = VMUL(vomegaOdd,  VSUB(     VMUL(voneHalf, VSUB(vpdf_a, vpdf_b)), VMUL(vui, vw_1_x3))); \
+                               VSTNT(&dst[I(index + blockedIndex, JOIN(D3Q19_,_dir1))], VSUB(VSUB(vpdf_a, vevenPart), voddPart)); \
+                               VST(&tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir2))], VADD(VSUB(vpdf_b, vevenPart), voddPart)); \
+                       } \
+                       for (int index = 0; index < indexMax; index += VSIZE) { \
+                               Assert((unsigned long)&(dst[I(index + blockedIndex, JOIN(D3Q19_,_dir2) )]) % VSIZE == 0); \
+                               vpdf_b      = VLDU(&tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir2))]); \
+                               VSTNT(&dst[I(index + blockedIndex, JOIN(D3Q19_,_dir2))], vpdf_b); \
+                       }
+
+               #define LOOP_2(_dir1, _dir2, _v1, _v2, _v1_tmp, _v2_tmp, _expr) \
+                       for (int index = 0; index < indexMax; index += VSIZE) { \
+                               _v1         = VLDU(&tmpArray[TMP_INDEX(index, JOIN(TMP_,_v1_tmp))]); \
+                               _v2         = VLDU(&tmpArray[TMP_INDEX(index, JOIN(TMP_,_v2_tmp))]); \
+                               vpdf_a          = VLDU(&tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir1))]); \
+                               vpdf_b          = VLDU(&tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir2))]); \
+                               vw_2_indep  = VLDU(&tmpArray[TMP_INDEX(index, TMP_W2)]); \
+                               \
+                               vui = _expr; \
+                               vevenPart = VMUL(vomegaEven, VSUB(VSUB(VMUL(voneHalf, VADD(vpdf_a, vpdf_b)), VMUL(vui, VMUL(vui, vw_2_nine_half))), vw_2_indep)); \
+                               voddPart  = VMUL(vomegaOdd,  VSUB(     VMUL(voneHalf, VSUB(vpdf_a, vpdf_b)), VMUL(vui, vw_2_x3))); \
+                               VSTNT(&dst[I(index + blockedIndex, JOIN(D3Q19_,_dir1))], VSUB(VSUB(vpdf_a, vevenPart), voddPart)); \
+                               VST(&tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir2))], VADD(VSUB(vpdf_b, vevenPart), voddPart)); \
+                       } \
+                       for (int index = 0; index < indexMax; index += VSIZE) { \
+                               Assert((unsigned long)&(dst[I(index + blockedIndex, JOIN(D3Q19_,_dir2) )]) % VSIZE == 0); \
+                               vpdf_b      = VLDU(&tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir2))]); \
+                               VSTNT(&dst[I(index + blockedIndex, JOIN(D3Q19_,_dir2))], vpdf_b); \
+                       }
+
+               LOOP_1(N, S, vuy, UY);
+               LOOP_1(E, W, vux, UX);
+               LOOP_1(T, B, vuz, UZ);
+
+               LOOP_2(NW, SE, vuy, vux, UY, UX, VSUB(vuy, vux));
+               LOOP_2(NE, SW, vuy, vux, UY, UX, VADD(vuy, vux));
+               LOOP_2(TW, BE, vux, vuz, UX, UZ, VSUB(vuz, vux));
+               LOOP_2(TE, BW, vux, vuz, UX, UZ, VADD(vuz, vux));
+               LOOP_2(TS, BN, vuy, vuz, UY, UZ, VSUB(vuz, vuy));
+               LOOP_2(TN, BS, vuy, vuz, UY, UZ, VADD(vuz, vuy));
+
+               #undef LOOP_1
+               #undef LOOP_2
+
+       } // loop over fluid nodes
+
+       #undef I
+
+       #undef INDEX_START
+       #undef INDEX_STOP
+
diff --git a/src/BenchKernelD3Q19ListPullSplitNt1SScalar.h b/src/BenchKernelD3Q19ListPullSplitNt1SScalar.h
new file mode 100644 (file)
index 0000000..9b833ae
--- /dev/null
@@ -0,0 +1,139 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+
+// This fragment declares no function of its own: it relies on variables such
+// as src, dst, adjList, tmpArray, nTmpArray and nCells being provided by the
+// code that includes it. The including code must also define INDEX_START and
+// INDEX_STOP, the half-open range [INDEX_START, INDEX_STOP) of node indices
+// to process; both macros are #undef'd again at the end of the fragment.
+#ifndef INDEX_START
+       #error INDEX_START must be defined
+#endif
+
+#ifndef INDEX_STOP
+       #error INDEX_STOP must be defined
+#endif
+
+       // Position of direction `dir` of node `index` inside the PDF arrays.
+       #define I(index, dir)   P_INDEX_3((nCells), (index), (dir))
+
+       // Walk over the nodes in chunks of at most nTmpArray nodes; the
+       // intermediate values of one chunk are kept in tmpArray.
+       for (int blockedIndex = (INDEX_START); blockedIndex < (INDEX_STOP); blockedIndex += nTmpArray) {
+
+               // Number of nodes in this chunk (the last chunk may be shorter).
+               indexMax = MinI(nTmpArray, (INDEX_STOP) - blockedIndex);
+#ifdef DEBUG
+               // Sets every byte of tmpArray to 0xFF - presumably so that entries
+               // which are never written show up as NaN (assumes PdfT is an IEEE
+               // floating-point type; confirm).
+               memset(tmpArray, -1, sizeof(PdfT) * nTmpArray * N_TMP);
+#endif
+               // Pass 1: gather the PDFs of each node of the chunk, compute the
+               // per-node sums and update the center PDF.
+               for (int index = 0; index < indexMax; ++index) {
+
+
+                       adjListIndex = (index + blockedIndex) * N_D3Q19_IDX;
+
+                       pdf_C = src[I(index + blockedIndex, D3Q19_C)];
+
+                       // Load every non-center PDF from src through the adjacency list
+                       // and keep a copy in tmpArray for the per-direction passes below.
+                       #define X(name, idx, idxinv, _x, _y, _z)        JOIN(pdf_,name) = src[adjList[adjListIndex + idx]]; tmpArray[TMP_INDEX(index, idx)] = JOIN(pdf_,name);
+                       D3Q19_LIST_WO_C
+                       #undef X
+
+                       // ux/uy/uz: PDFs with an E/N/T component minus the PDFs with a
+                       // W/S/B component.
+                       ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE -
+                                pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW;
+                       uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN -
+                                pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS;
+                       uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS -
+                                pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS;
+
+                       tmpArray[TMP_INDEX(index, TMP_UX)] = ux;
+                       tmpArray[TMP_INDEX(index, TMP_UY)] = uy;
+                       tmpArray[TMP_INDEX(index, TMP_UZ)] = uz;
+
+                       // Sum of all 19 PDFs of the node.
+                       dens = pdf_C +
+                                  pdf_N  + pdf_E  + pdf_S  + pdf_W  +
+                                  pdf_NE + pdf_SE + pdf_SW + pdf_NW +
+                                  pdf_T  + pdf_TN + pdf_TE + pdf_TS + pdf_TW +
+                                  pdf_B  + pdf_BN + pdf_BE + pdf_BS + pdf_BW;
+
+                       // Term that does not depend on the direction; it is scaled with
+                       // the weights w_0, w_1 and w_2 below.
+                       dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz) * 3.0 / 2.0;
+
+                       w_1_indep = w_1 * dir_indep_trm;
+                       w_2_indep = w_2 * dir_indep_trm;
+
+                       tmpArray[TMP_INDEX(index, TMP_W1)] = w_1_indep;
+                       tmpArray[TMP_INDEX(index, TMP_W2)] = w_2_indep;
+
+                       // The center PDF is finished here and written straight to dst.
+                       dst[I(index + blockedIndex, D3Q19_C )]  = pdf_C - omegaEven * (pdf_C - w_0 * dir_indep_trm);
+               }
+
+
+               // LOOP_1 updates one pair of directions (_dir1, _dir2) from the
+               // values buffered in tmpArray, using the stored component _vel_tmp
+               // and the w_1 constants. The result for _dir1 goes directly to dst,
+               // the result for _dir2 is first written back to tmpArray and copied
+               // to dst by a separate second loop.
+               // NOTE(review): presumably this is what "1S" in the file name stands
+               // for (one store stream into dst per loop) - confirm.
+               #define LOOP_1(_dir1, _dir2, _vel, _vel_tmp) \
+                       for (int index = 0; index < indexMax; ++index) { \
+                               _vel               = tmpArray[TMP_INDEX(index, JOIN(TMP_,_vel_tmp))]; \
+                               JOIN(pdf_,_dir1)   = tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir1))]; \
+                               JOIN(pdf_,_dir2)   = tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir2))]; \
+                               w_1_indep          = tmpArray[TMP_INDEX(index, TMP_W1)]; \
+                               \
+                               ui = _vel; \
+                               evenPart = omegaEven * (0.5 * (JOIN(pdf_,_dir1) + JOIN(pdf_,_dir2)) - ui * ui * w_1_nine_half - w_1_indep); \
+                               oddPart  = omegaOdd  * (0.5 * (JOIN(pdf_,_dir1) - JOIN(pdf_,_dir2)) - ui * w_1_x3); \
+                               dst[I(index + blockedIndex, JOIN(D3Q19_,_dir1) )]  = JOIN(pdf_,_dir1) - evenPart - oddPart; \
+                               tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir2))]     = JOIN(pdf_,_dir2) - evenPart + oddPart; \
+                       } \
+                       for (int index = 0; index < indexMax; ++index) { \
+                               dst[I(index + blockedIndex, JOIN(D3Q19_,_dir2) )] = tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir2))]; \
+                       }
+
+               // LOOP_2 does the same for a pair of directions whose velocity term
+               // _expr combines two stored components (_v1_tmp, _v2_tmp); it uses
+               // the w_2 constants instead of the w_1 ones.
+               #define LOOP_2(_dir1, _dir2, _v1, _v2, _v1_tmp, _v2_tmp, _expr) \
+                       for (int index = 0; index < indexMax; ++index) { \
+                               _v1 = tmpArray[TMP_INDEX(index, JOIN(TMP_,_v1_tmp))]; \
+                               _v2 = tmpArray[TMP_INDEX(index, JOIN(TMP_,_v2_tmp))]; \
+                               JOIN(pdf_,_dir1) = tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir1))]; \
+                               JOIN(pdf_,_dir2) = tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir2))]; \
+                               w_2_indep = tmpArray[TMP_INDEX(index, TMP_W2)]; \
+                               \
+                               ui = _expr; \
+                               evenPart = omegaEven * (0.5 * (JOIN(pdf_,_dir1) + JOIN(pdf_,_dir2)) - ui * ui * w_2_nine_half - w_2_indep); \
+                               oddPart  = omegaOdd  * (0.5 * (JOIN(pdf_,_dir1) - JOIN(pdf_,_dir2)) - ui * w_2_x3); \
+                               dst[I(index + blockedIndex, JOIN(D3Q19_,_dir1))] = JOIN(pdf_,_dir1) - evenPart - oddPart; \
+                               tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir2))]     = JOIN(pdf_,_dir2) - evenPart + oddPart; \
+                       } \
+                       for (int index = 0; index < indexMax; ++index) { \
+                               dst[I(index + blockedIndex, JOIN(D3Q19_,_dir2) )] = tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir2))]; \
+                       }
+
+               // Pairs that use a single velocity component.
+               LOOP_1(N, S, uy, UY);
+               LOOP_1(E, W, ux, UX);
+               LOOP_1(T, B, uz, UZ);
+
+               // Pairs that use the sum or difference of two velocity components.
+               LOOP_2(NW, SE, uy, ux, UY, UX, uy - ux);
+               LOOP_2(NE, SW, uy, ux, UY, UX, uy + ux);
+               LOOP_2(TW, BE, ux, uz, UX, UZ, uz - ux);
+               LOOP_2(TE, BW, ux, uz, UX, UZ, uz + ux);
+               LOOP_2(TS, BN, uy, uz, UY, UZ, uz - uy);
+               LOOP_2(TN, BS, uy, uz, UY, UZ, uz + uy);
+
+               #undef LOOP_1
+               #undef LOOP_2
+
+
+       }
+
+       // Remove the helper macro and the caller-provided range macros again.
+       #undef I
+
+       #undef INDEX_START
+       #undef INDEX_STOP
diff --git a/src/BenchKernelD3Q19ListPullSplitNt2SIntrinsics.h b/src/BenchKernelD3Q19ListPullSplitNt2SIntrinsics.h
new file mode 100644 (file)
index 0000000..399fa5f
--- /dev/null
@@ -0,0 +1,130 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+
+// This fragment declares no function of its own: it relies on variables such
+// as src, dst, adjList, tmpArray, nTmpArray, nCells and the v* vector
+// variables being provided by the code that includes it. The including code
+// must also define INDEX_START and INDEX_STOP, the half-open range
+// [INDEX_START, INDEX_STOP) of node indices to process; both macros are
+// #undef'd again at the end of the fragment.
+#ifndef INDEX_START
+       #error INDEX_START must be defined
+#endif
+
+#ifndef INDEX_STOP
+       #error INDEX_STOP must be defined
+#endif
+
+       // Position of direction `dir` of node `index` inside the PDF arrays.
+       #define I(index, dir)   P_INDEX_3((nCells), (index), (dir))
+
+       // Walk over the nodes in chunks of at most nTmpArray nodes; the
+       // intermediate values of one chunk are kept in tmpArray.
+       for (int blockedIndex = (INDEX_START); blockedIndex < (INDEX_STOP); blockedIndex += nTmpArray) {
+
+               // Number of nodes in this chunk (the last chunk may be shorter).
+               indexMax = MinI(nTmpArray, (INDEX_STOP) - blockedIndex);
+#ifdef DEBUG
+               // Sets every byte of tmpArray to 0xFF - presumably so that entries
+               // which are never written show up as NaN (assumes PdfT is an IEEE
+               // floating-point type; confirm).
+               memset(tmpArray, -1, sizeof(PdfT) * nTmpArray * N_TMP);
+#endif
+               // Pass 1 (scalar): gather the PDFs of each node of the chunk, compute
+               // the per-node sums and update the center PDF.
+               for (int index = 0; index < indexMax; ++index) {
+
+
+                       adjListIndex = (index + blockedIndex) * N_D3Q19_IDX;
+
+                       pdf_C = src[I(index + blockedIndex, D3Q19_C)];
+
+                       // Load every non-center PDF from src through the adjacency list
+                       // and keep a copy in tmpArray for the per-direction passes below.
+                       #define X(name, idx, idxinv, _x, _y, _z)        JOIN(pdf_,name) = src[adjList[adjListIndex + idx]]; tmpArray[TMP_INDEX(index, idx)] = JOIN(pdf_,name);
+                       D3Q19_LIST_WO_C
+                       #undef X
+
+                       // ux/uy/uz: PDFs with an E/N/T component minus the PDFs with a
+                       // W/S/B component.
+                       ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE -
+                                pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW;
+                       uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN -
+                                pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS;
+                       uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS -
+                                pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS;
+
+                       tmpArray[TMP_INDEX(index, TMP_UX)] = ux;
+                       tmpArray[TMP_INDEX(index, TMP_UY)] = uy;
+                       tmpArray[TMP_INDEX(index, TMP_UZ)] = uz;
+
+                       // Sum of all 19 PDFs of the node.
+                       dens = pdf_C +
+                                  pdf_N  + pdf_E  + pdf_S  + pdf_W  +
+                                  pdf_NE + pdf_SE + pdf_SW + pdf_NW +
+                                  pdf_T  + pdf_TN + pdf_TE + pdf_TS + pdf_TW +
+                                  pdf_B  + pdf_BN + pdf_BE + pdf_BS + pdf_BW;
+
+                       // Term that does not depend on the direction; it is scaled with
+                       // the weights w_0, w_1 and w_2 below.
+                       dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz) * 3.0 / 2.0;
+
+                       w_1_indep = w_1 * dir_indep_trm;
+                       w_2_indep = w_2 * dir_indep_trm;
+
+                       tmpArray[TMP_INDEX(index, TMP_W1)] = w_1_indep;
+                       tmpArray[TMP_INDEX(index, TMP_W2)] = w_2_indep;
+
+                       // The center PDF is finished here and written straight to dst.
+                       dst[I(index + blockedIndex, D3Q19_C )]  = pdf_C - omegaEven * (pdf_C - w_0 * dir_indep_trm);
+               }
+
+               // LOOP_1 updates one pair of directions (_dir1, _dir2), VSIZE nodes
+               // per iteration, from the values buffered in tmpArray, using the
+               // stored component _vel_tmp and the w_1 vector constants. Both
+               // results are written to dst with VSTNT inside the same loop. The
+               // _vel parameter is not used by this variant.
+               // NOTE(review): VLDU / VSTNT are presumably an unaligned vector load
+               // and a non-temporal vector store - confirm against their definition.
+               // NOTE(review): the loop advances by VSIZE but is bounded by
+               // indexMax; if indexMax is not a multiple of VSIZE the last
+               // iteration touches elements beyond indexMax - verify that callers
+               // guarantee suitable chunk sizes or padding.
+               #define LOOP_1(_dir1, _dir2, _vel, _vel_tmp) \
+                       for (int index = 0; index < indexMax; index += VSIZE) { \
+                               vui = VLDU(&tmpArray[TMP_INDEX(index, JOIN(TMP_,_vel_tmp))]); \
+                               vpdf_a      = VLDU(&tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir1))]); \
+                               vpdf_b      = VLDU(&tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir2))]); \
+                               vw_1_indep  = VLDU(&tmpArray[TMP_INDEX(index, TMP_W1)]); \
+                               \
+                               vevenPart = VMUL(vomegaEven, VSUB(VSUB(VMUL(voneHalf, VADD(vpdf_a, vpdf_b)), VMUL(vui, VMUL(vui, vw_1_nine_half))), vw_1_indep)); \
+                               voddPart  = VMUL(vomegaOdd,  VSUB(     VMUL(voneHalf, VSUB(vpdf_a, vpdf_b)), VMUL(vui, vw_1_x3))); \
+                               VSTNT(&dst[I(index + blockedIndex, JOIN(D3Q19_,_dir1))], VSUB(VSUB(vpdf_a, vevenPart), voddPart)); \
+                               VSTNT(&dst[I(index + blockedIndex, JOIN(D3Q19_,_dir2))], VADD(VSUB(vpdf_b, vevenPart), voddPart)); \
+                       }
+
+               // LOOP_2 does the same for a pair of directions whose velocity term
+               // _expr combines two stored components (_v1_tmp, _v2_tmp); it uses
+               // the w_2 vector constants instead of the w_1 ones.
+               #define LOOP_2(_dir1, _dir2, _v1, _v2, _v1_tmp, _v2_tmp, _expr) \
+                       for (int index = 0; index < indexMax; index += VSIZE) { \
+                               _v1         = VLDU(&tmpArray[TMP_INDEX(index, JOIN(TMP_,_v1_tmp))]); \
+                               _v2         = VLDU(&tmpArray[TMP_INDEX(index, JOIN(TMP_,_v2_tmp))]); \
+                               vpdf_a          = VLDU(&tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir1))]); \
+                               vpdf_b          = VLDU(&tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir2))]); \
+                               vw_2_indep  = VLDU(&tmpArray[TMP_INDEX(index, TMP_W2)]); \
+                               \
+                               vui = _expr; \
+                               vevenPart = VMUL(vomegaEven, VSUB(VSUB(VMUL(voneHalf, VADD(vpdf_a, vpdf_b)), VMUL(vui, VMUL(vui, vw_2_nine_half))), vw_2_indep)); \
+                               voddPart  = VMUL(vomegaOdd,  VSUB(     VMUL(voneHalf, VSUB(vpdf_a, vpdf_b)), VMUL(vui, vw_2_x3))); \
+                               VSTNT(&dst[I(index + blockedIndex, JOIN(D3Q19_,_dir1))], VSUB(VSUB(vpdf_a, vevenPart), voddPart)); \
+                               VSTNT(&dst[I(index + blockedIndex, JOIN(D3Q19_,_dir2))], VADD(VSUB(vpdf_b, vevenPart), voddPart)); \
+                       }
+
+               // First the pairs that use a single velocity component, then the
+               // pairs that use the sum or difference of two components.
+               LOOP_1(N, S, vuy, UY);
+               LOOP_1(E, W, vux, UX);
+               LOOP_1(T, B, vuz, UZ);
+               LOOP_2(NW, SE, vuy, vux, UY, UX, VSUB(vuy, vux));
+               LOOP_2(NE, SW, vuy, vux, UY, UX, VADD(vuy, vux));
+               LOOP_2(TW, BE, vux, vuz, UX, UZ, VSUB(vuz, vux));
+               LOOP_2(TE, BW, vux, vuz, UX, UZ, VADD(vuz, vux));
+               LOOP_2(TS, BN, vuy, vuz, UY, UZ, VSUB(vuz, vuy));
+               LOOP_2(TN, BS, vuy, vuz, UY, UZ, VADD(vuz, vuy));
+
+               #undef LOOP_1
+               #undef LOOP_2
+
+       } // loop over fluid nodes
+
+       // Remove the helper macro and the caller-provided range macros again.
+       #undef I
+
+       #undef INDEX_START
+       #undef INDEX_STOP
+
diff --git a/src/BenchKernelD3Q19ListPullSplitNt2SScalar.h b/src/BenchKernelD3Q19ListPullSplitNt2SScalar.h
new file mode 100644 (file)
index 0000000..ca1f3dd
--- /dev/null
@@ -0,0 +1,133 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+
+// This fragment declares no function of its own: it relies on variables such
+// as src, dst, adjList, tmpArray, nTmpArray and nCells being provided by the
+// code that includes it. The including code must also define INDEX_START and
+// INDEX_STOP, the half-open range [INDEX_START, INDEX_STOP) of node indices
+// to process; both macros are #undef'd again at the end of the fragment.
+#ifndef INDEX_START
+       #error INDEX_START must be defined
+#endif
+
+#ifndef INDEX_STOP
+       #error INDEX_STOP must be defined
+#endif
+
+       // Position of direction `dir` of node `index` inside the PDF arrays.
+       #define I(index, dir)   P_INDEX_3((nCells), (index), (dir))
+
+       // Walk over the nodes in chunks of at most nTmpArray nodes; the
+       // intermediate values of one chunk are kept in tmpArray.
+       for (int blockedIndex = (INDEX_START); blockedIndex < (INDEX_STOP); blockedIndex += nTmpArray) {
+
+               // Number of nodes in this chunk (the last chunk may be shorter).
+               indexMax = MinI(nTmpArray, (INDEX_STOP) - blockedIndex);
+#ifdef DEBUG
+               // Sets every byte of tmpArray to 0xFF - presumably so that entries
+               // which are never written show up as NaN (assumes PdfT is an IEEE
+               // floating-point type; confirm).
+               memset(tmpArray, -1, sizeof(PdfT) * nTmpArray * N_TMP);
+#endif
+               // Pass 1: gather the PDFs of each node of the chunk, compute the
+               // per-node sums and update the center PDF.
+               for (int index = 0; index < indexMax; ++index) {
+
+
+                       adjListIndex = (index + blockedIndex) * N_D3Q19_IDX;
+
+                       pdf_C = src[I(index + blockedIndex, D3Q19_C)];
+
+                       // Load every non-center PDF from src through the adjacency list
+                       // and keep a copy in tmpArray for the per-direction passes below.
+                       #define X(name, idx, idxinv, _x, _y, _z)        JOIN(pdf_,name) = src[adjList[adjListIndex + idx]]; tmpArray[TMP_INDEX(index, idx)] = JOIN(pdf_,name);
+                       D3Q19_LIST_WO_C
+                       #undef X
+
+                       // ux/uy/uz: PDFs with an E/N/T component minus the PDFs with a
+                       // W/S/B component.
+                       ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE -
+                                pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW;
+                       uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN -
+                                pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS;
+                       uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS -
+                                pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS;
+
+                       tmpArray[TMP_INDEX(index, TMP_UX)] = ux;
+                       tmpArray[TMP_INDEX(index, TMP_UY)] = uy;
+                       tmpArray[TMP_INDEX(index, TMP_UZ)] = uz;
+
+                       // Sum of all 19 PDFs of the node.
+                       dens = pdf_C +
+                                  pdf_N  + pdf_E  + pdf_S  + pdf_W  +
+                                  pdf_NE + pdf_SE + pdf_SW + pdf_NW +
+                                  pdf_T  + pdf_TN + pdf_TE + pdf_TS + pdf_TW +
+                                  pdf_B  + pdf_BN + pdf_BE + pdf_BS + pdf_BW;
+
+                       // Term that does not depend on the direction; it is scaled with
+                       // the weights w_0, w_1 and w_2 below.
+                       dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz) * 3.0 / 2.0;
+
+                       w_1_indep = w_1 * dir_indep_trm;
+                       w_2_indep = w_2 * dir_indep_trm;
+
+                       tmpArray[TMP_INDEX(index, TMP_W1)] = w_1_indep;
+                       tmpArray[TMP_INDEX(index, TMP_W2)] = w_2_indep;
+
+                       // The center PDF is finished here and written straight to dst.
+                       dst[I(index + blockedIndex, D3Q19_C )]  = pdf_C - omegaEven * (pdf_C - w_0 * dir_indep_trm);
+               }
+
+
+               // LOOP_1 updates one pair of directions (_dir1, _dir2) from the
+               // values buffered in tmpArray, using the stored component _vel_tmp
+               // and the w_1 constants. Both results are written to dst inside the
+               // same loop.
+               // NOTE(review): presumably this is what "2S" in the file name stands
+               // for (two store streams into dst per loop) - confirm.
+               #define LOOP_1(_dir1, _dir2, _vel, _vel_tmp) \
+                       for (int index = 0; index < indexMax; ++index) { \
+                               _vel               = tmpArray[TMP_INDEX(index, JOIN(TMP_,_vel_tmp))]; \
+                               JOIN(pdf_,_dir1)   = tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir1))]; \
+                               JOIN(pdf_,_dir2)   = tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir2))]; \
+                               w_1_indep          = tmpArray[TMP_INDEX(index, TMP_W1)]; \
+                               \
+                               ui = _vel; \
+                               evenPart = omegaEven * (0.5 * (JOIN(pdf_,_dir1) + JOIN(pdf_,_dir2)) - ui * ui * w_1_nine_half - w_1_indep); \
+                               oddPart  = omegaOdd  * (0.5 * (JOIN(pdf_,_dir1) - JOIN(pdf_,_dir2)) - ui * w_1_x3); \
+                               dst[I(index + blockedIndex, JOIN(D3Q19_,_dir1) )]  = JOIN(pdf_,_dir1) - evenPart - oddPart; \
+                               dst[I(index + blockedIndex, JOIN(D3Q19_,_dir2) )]  = JOIN(pdf_,_dir2) - evenPart + oddPart; \
+                       }
+
+               // LOOP_2 does the same for a pair of directions whose velocity term
+               // _expr combines two stored components (_v1_tmp, _v2_tmp); it uses
+               // the w_2 constants instead of the w_1 ones.
+               #define LOOP_2(_dir1, _dir2, _v1, _v2, _v1_tmp, _v2_tmp, _expr) \
+                       for (int index = 0; index < indexMax; ++index) { \
+                               _v1 = tmpArray[TMP_INDEX(index, JOIN(TMP_,_v1_tmp))]; \
+                               _v2 = tmpArray[TMP_INDEX(index, JOIN(TMP_,_v2_tmp))]; \
+                               JOIN(pdf_,_dir1) = tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir1))]; \
+                               JOIN(pdf_,_dir2) = tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir2))]; \
+                               w_2_indep = tmpArray[TMP_INDEX(index, TMP_W2)]; \
+                               \
+                               ui = _expr; \
+                               evenPart = omegaEven * (0.5 * (JOIN(pdf_,_dir1) + JOIN(pdf_,_dir2)) - ui * ui * w_2_nine_half - w_2_indep); \
+                               oddPart  = omegaOdd  * (0.5 * (JOIN(pdf_,_dir1) - JOIN(pdf_,_dir2)) - ui * w_2_x3); \
+                               dst[I(index + blockedIndex, JOIN(D3Q19_,_dir1))] = JOIN(pdf_,_dir1) - evenPart - oddPart; \
+                               dst[I(index + blockedIndex, JOIN(D3Q19_,_dir2))] = JOIN(pdf_,_dir2) - evenPart + oddPart; \
+                       }
+
+               // Pairs that use a single velocity component.
+               LOOP_1(N, S, uy, UY);
+               LOOP_1(E, W, ux, UX);
+               LOOP_1(T, B, uz, UZ);
+
+               // Pairs that use the sum or difference of two velocity components.
+               LOOP_2(NW, SE, uy, ux, UY, UX, uy - ux);
+               LOOP_2(NE, SW, uy, ux, UY, UX, uy + ux);
+               LOOP_2(TW, BE, ux, uz, UX, UZ, uz - ux);
+               LOOP_2(TE, BW, ux, uz, UX, UZ, uz + ux);
+               LOOP_2(TS, BN, uy, uz, UY, UZ, uz - uy);
+               LOOP_2(TN, BS, uy, uz, UY, UZ, uz + uy);
+
+               #undef LOOP_1
+               #undef LOOP_2
+
+
+       }
+
+       // Remove the helper macro and the caller-provided range macros again.
+       #undef I
+
+       #undef INDEX_START
+       #undef INDEX_STOP
diff --git a/src/BenchKernelD3Q19ListPullSplitNtCommon.c b/src/BenchKernelD3Q19ListPullSplitNtCommon.c
new file mode 100644 (file)
index 0000000..b5df14b
--- /dev/null
@@ -0,0 +1,797 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#include "BenchKernelD3Q19ListPullSplitNtCommon.h"
+
+#include "Memory.h"
+#include "Vtk.h"
+#include "Vector.h"
+
+#include <math.h>
+
+#ifdef _OPENMP
+       #include <omp.h>
+#endif
+
+// Forward declarations.
+void FNAME(KernelPullSplitNt1S)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd);
+void FNAME(KernelPullSplitNt2S)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd);
+
+void FNAME(KernelPullSplitNtRia1S)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd);
+void FNAME(KernelPullSplitNtRia2S)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd);
+
+
+
+
+// -----------------------------------------------------------------------
+// Functions which are used as callback by the kernel to read or write
+// PDFs and nodes.
+
+// Callback: reads the PDF of direction `dir` for the node at lattice
+// position (x, y, z) from the active PDF array and stores it in *pdf.
+//
+// For every direction except the center one the value is looked up through
+// the adjacency list, i.e. the *remote* PDF which would get streamed in
+// during propagation is returned. The center PDF is addressed directly at
+// the node itself.
+static void FNAME(BCGetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT * pdf)
+{
+       // Kernel data and output pointer must be usable.
+       Assert(kd != NULL);
+       Assert(kd->PdfsActive != NULL);
+       Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+       Assert(pdf != NULL);
+
+       // Position must lie inside the lattice, direction must be valid.
+       Assert(x >= 0); Assert(x < kd->Dims[0]);
+       Assert(y >= 0); Assert(y < kd->Dims[1]);
+       Assert(z >= 0); Assert(z < kd->Dims[2]);
+       Assert(dir >= 0); Assert(dir < N_D3Q19);
+
+       // Translate the lattice position into the index of the node.
+       uint32_t node = KDL(kd)->Grid[L_INDEX_4(kd->Dims, x, y, z)];
+
+       const PdfT * slot;
+
+       if (dir == D3Q19_C) {
+               // Center PDF: located at the node itself.
+               slot = &kd->PdfsActive[P_INDEX_3(KDL(kd)->nCells, node, dir)];
+       }
+       else {
+               // Any other direction: follow the adjacency list to the remote PDF.
+               uint32_t adjBase = node * N_D3Q19_IDX;
+
+               slot = &kd->PdfsActive[KDL(kd)->AdjList[adjBase + dir]];
+       }
+
+       *pdf = *slot;
+}
+
+// Callback: writes `pdf` as the PDF of direction `dir` for the node at
+// lattice position (x, y, z) into the active PDF array.
+//
+// For every direction except the center one the value is stored through the
+// adjacency list, i.e. the *remote* PDF which would get streamed in during
+// propagation is set. The center PDF is addressed directly at the node
+// itself. Trying to store a NaN prints an error and terminates the program.
+static void FNAME(BCSetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT pdf)
+{
+       // Kernel data must be usable.
+       Assert(kd != NULL);
+       Assert(kd->PdfsActive != NULL);
+       Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+
+       // Position must lie inside the lattice, direction must be valid.
+       Assert(x >= 0); Assert(x < kd->Dims[0]);
+       Assert(y >= 0); Assert(y < kd->Dims[1]);
+       Assert(z >= 0); Assert(z < kd->Dims[2]);
+       Assert(dir >= 0); Assert(dir < N_D3Q19);
+
+       // Refuse to store NaNs: report where it happened and stop.
+       if (isnan(pdf)) {
+               printf("ERROR: setting nan %d %d %d %d %s\n", x, y, z, dir, D3Q19_NAMES[dir]);
+               DEBUG_BREAK_POINT();
+               exit(1);
+       }
+
+       // Translate the lattice position into the index of the node.
+       uint32_t node = KDL(kd)->Grid[L_INDEX_4(kd->Dims, x, y, z)];
+
+       PdfT * slot;
+
+       if (dir == D3Q19_C) {
+               // Center PDF: located at the node itself.
+               slot = &kd->PdfsActive[P_INDEX_3(KDL(kd)->nCells, node, dir)];
+       }
+       else {
+               // Any other direction: follow the adjacency list to the remote PDF.
+               uint32_t adjBase = node * N_D3Q19_IDX;
+
+               slot = &kd->PdfsActive[KDL(kd)->AdjList[adjBase + dir]];
+       }
+
+       *slot = pdf;
+}
+
+
+static void GetNode(KernelData * kd, int x, int y, int z, PdfT * pdfs)
+{
+       Assert(kd != NULL);
+       Assert(kd->PdfsActive != NULL);
+       Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+       Assert(pdfs != NULL);
+       Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
+       Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
+
+       PdfT sum = 0.0;
+
+       // TODO: pull scheme?
+
+       #define I(x, y, z, dir) P_INDEX_5(KDL(kd), (x), (y), (z), (dir))
+       #define X(name, idx, idxinv, _x, _y, _z)        pdfs[idx] = kd->PdfsActive[I(x, y, z, idx)]; sum += pdfs[idx];
+       D3Q19_LIST
+       #undef X
+       #undef I
+
+#ifdef DETECT_NANS
+       for (int d = 0; d < 19; ++d) {
+               if(isnan(pdfs[d]) || isinf(pdfs[d])) {
+                       printf("%d %d %d %d nan! get node\n", x, y, z, d);
+                                               for (int d2 = 0; d2 < 19; ++d2) {
+                                                       printf("%d: %e\n", d2, pdfs[d2]);
+                                               }
+                       exit(1);
+               }
+       }
+#endif
+       return;
+}
+
+
+static void SetNode(KernelData * kd, int x, int y, int z, PdfT * pdfs)
+{
+       Assert(kd != NULL);
+       Assert(kd->PdfsActive != NULL);
+       Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+       Assert(pdfs != NULL);
+
+       Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
+       Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
+
+#ifdef DETECT_NANS
+       for (int d = 0; d < 19; ++d) {
+               if(isnan(pdfs[d])) {
+                       printf("%d %d %d %d nan! get node\n", x, y, z, d);
+                                               for (int d2 = 0; d2 < 19; ++d2) {
+                                                       printf("%d: %e\n", d2, pdfs[d2]);
+                                               }
+                       exit(1);
+               }
+       }
+#endif
+
+       // TODO: pull scheme?
+       #define I(x, y, z, dir) P_INDEX_5(KDL(kd), (x), (y), (z), (dir))
+       #define X(name, idx, idxinv, _x, _y, _z)        kd->PdfsActive[I(x, y, z, idx)] = pdfs[idx];
+       D3Q19_LIST
+       #undef X
+       #undef I
+
+       return;
+}
+
+// Prints the command line parameters understood by this kernel to stdout.
+static void ParameterUsage()
+{
+       // One call with two adjacent string literals; the output is the same
+       // as printing the two lines one after the other.
+       printf("Kernel parameters:\n"
+              "  [-blk <n>] [-blk-[xyz] <n>] [-n-tmp-array <n>]\n");
+}
+
+static void ParseParameters(Parameters * params, int * blk, int * nTmpArray)
+{
+       Assert(blk != NULL);
+
+       blk[0] = 0; blk[1] = 0; blk[2] = 0;
+       *nTmpArray = 152;
+
+       #define ARG_IS(param)                   (!strcmp(params->KernelArgs[i], param))
+       #define NEXT_ARG_PRESENT() \
+               do { \
+                       if (i + 1 >= params->nKernelArgs) { \
+                               printf("ERROR: argument %s requires a parameter.\n", params->KernelArgs[i]); \
+                               exit(1); \
+                       } \
+               } while (0)
+
+
+       for (int i = 0; i < params->nKernelArgs; ++i) {
+               if (ARG_IS("-blk") || ARG_IS("--blk")) {
+                       NEXT_ARG_PRESENT();
+
+                       int tmp = strtol(params->KernelArgs[++i], NULL, 0);
+
+                       if (tmp <= 0) {
+                               printf("ERROR: blocking parameter must be > 0.\n");
+                               exit(1);
+                       }
+
+                       blk[0] = blk[1] = blk[2] = tmp;
+               }
+               else if (ARG_IS("-blk-x") || ARG_IS("--blk-x")) {
+                       NEXT_ARG_PRESENT();
+
+                       int tmp = strtol(params->KernelArgs[++i], NULL, 0);
+
+                       if (tmp <= 0) {
+                               printf("ERROR: blocking parameter must be > 0.\n");
+                               exit(1);
+                       }
+
+                       blk[0] = tmp;
+               }
+               else if (ARG_IS("-blk-y") || ARG_IS("--blk-y")) {
+                       NEXT_ARG_PRESENT();
+
+                       int tmp = strtol(params->KernelArgs[++i], NULL, 0);
+
+                       if (tmp <= 0) {
+                               printf("ERROR: blocking parameter must be > 0.\n");
+                               exit(1);
+                       }
+
+                       blk[1] = tmp;
+               }
+               else if (ARG_IS("-blk-z") || ARG_IS("--blk-z")) {
+                       NEXT_ARG_PRESENT();
+
+                       int tmp = strtol(params->KernelArgs[++i], NULL, 0);
+
+                       if (tmp <= 0) {
+                               printf("ERROR: blocking parameter must be > 0.\n");
+                               exit(1);
+                       }
+
+                       blk[2] = tmp;
+               }
+               else if (ARG_IS("-n-tmp-array") || ARG_IS("--n-tmp-array")) {
+                       NEXT_ARG_PRESENT();
+
+                       int tmp = strtol(params->KernelArgs[++i], NULL, 0);
+
+                       if (tmp <= 0) {
+                               printf("ERROR: -n-tmp-array parameter must be > 0.\n");
+                               exit(1);
+                       }
+
+                       if (tmp % VSIZE != 0) {
+                               printf("ERROR: value for -n-tmp-array must be a multiple of %d.\n", VSIZE);
+                               exit(1);
+                       }
+
+                       *nTmpArray = tmp;
+               }
+               else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) {
+                       ParameterUsage();
+                       exit(1);
+               }
+               else {
+                       printf("ERROR: unknown kernel parameter.\n");
+                       ParameterUsage();
+                       exit(1);
+               }
+       }
+
+       #undef ARG_IS
+       #undef NEXT_ARG_PRESENT
+
+       return;
+}
+
+// Builds the run-length description of the adjacency list which is stored in
+// kdlr->ConsecNodes and the per-thread entry points into it which are stored
+// in kdlr->ConsecThreadIndices.
+//
+// Two fluid nodes (index - 1) and (index) count as consecutive if, for each of
+// the N_D3Q19 - 1 non-center directions, the adjacency list entry of node
+// (index) equals the entry of node (index - 1) plus one. Every entry of
+// ConsecNodes holds the length of one such run of nodes. A run is additionally
+// terminated at each thread boundary, so no run spans the nodes of two threads.
+//
+// Parameters:
+//   ld        Lattice descriptor; only checked for being non-NULL.
+//   kdlr      Kernel data. kdl.AdjList and kdl.nFluid are read; ConsecNodes,
+//             nConsecNodes, ConsecThreadIndices and nConsecThreadIndices are
+//             written.
+//   nThreads  Number of threads the fluid nodes are distributed over (> 0).
+//
+// Both arrays are allocated here and handed over to kdlr.
+static void SetupConsecNodes(LatticeDesc * ld, KernelDataListRia * kdlr, int nThreads)
+{
+       Assert(ld != NULL);
+       Assert(kdlr != NULL);
+       Assert(nThreads > 0);
+
+       uint32_t * adjList = kdlr->kdl.AdjList;
+
+       uint32_t nConsecNodes = 0;
+       uint32_t consecIndex = 0;
+
+       int nFluid = kdlr->kdl.nFluid;
+
+       // One entry per thread plus one sentinel entry at the end.
+       // The array is allocated via MemAlloc (and checked) instead of a bare
+       // malloc: the result was dereferenced unchecked before and the array is
+       // released with MemFree later on.
+       uint32_t * consecThreadIndices;
+
+       if (MemAlloc((void **)&consecThreadIndices, sizeof(uint32_t) * (nThreads + 1))) {
+               printf("ERROR: allocating consecThreadIndices array failed: %zu bytes.\n", sizeof(uint32_t) * (nThreads + 1));
+               exit(1);
+       }
+
+       int nNodesPerThread = nFluid / nThreads;
+
+       // First fluid node index of each thread. The remainder of the division
+       // is spread over the first (nFluid % nThreads) threads.
+       for (int i = 0; i < nThreads; ++i) {
+               consecThreadIndices[i] = i * nNodesPerThread + MinI(i, nFluid % nThreads);
+       }
+       // Sentinel (wraps to the largest uint32_t value): it never matches a node
+       // index, so indexThread may safely advance to nThreads in the loops below.
+       consecThreadIndices[nThreads] = -1;
+
+       int indexThread = 1;
+
+       // We execute following code two times.
+       // - The first time to get the count of how many entries we need for the
+       //   consecNodes array.
+       // - The second time to fill the array.
+
+       // Loop over adjacency list of all nodes.
+       // Compare if adjacent nodes share the same access pattern.
+       for (int index = 1; index < nFluid; ++index) {
+
+               int different = 0;
+
+               // Loop over all directions except the center one.
+               for(int d = 0; d < N_D3Q19 - 1; ++d) {
+                       Assert(d != D3Q19_C);
+
+                       if (adjList[index * N_D3Q19_IDX + d] != adjList[(index - 1) * N_D3Q19_IDX + d] + 1) {
+                               // Different access pattern.
+                               different = 1;
+                               break;
+                       }
+               }
+
+               if (consecThreadIndices[indexThread] == index) {
+                       // We are at a thread boundary. Starting from this index the fluids
+                       // belong to another thread. Force a break, if nodes are consecutive.
+                       ++indexThread;
+                       different = 1;
+               }
+
+               if (different) {
+                       ++consecIndex;
+               }
+       }
+
+       // consecIndex counted the breaks between runs, i.e. one less than the
+       // number of runs as long as there is at least one fluid node.
+       if (nFluid > 0) {
+               nConsecNodes = consecIndex + 1;
+       }
+
+       uint32_t * consecNodes;
+       MemAlloc((void **)&consecNodes, sizeof(uint32_t) * nConsecNodes);
+
+       consecIndex = 0;
+
+       // The first run starts with node 0 and thus has at least length 1.
+       if (nFluid > 0) {
+               consecNodes[consecIndex] = 1;
+       }
+
+       indexThread = 1;
+       consecThreadIndices[0] = 0;
+
+       // Second pass: same traversal as above, but now the run lengths are
+       // recorded and the thread entries are converted from fluid node indices
+       // into indices of the consecNodes array.
+       for (int index = 1; index < nFluid; ++index) {
+
+               int different = 0;
+
+               // Loop over all directions except the center one.
+               for(int d = 0; d < N_D3Q19 - 1; ++d) {
+                       Assert(d != D3Q19_C);
+
+                       if (adjList[index * N_D3Q19_IDX + d] != adjList[(index - 1) * N_D3Q19_IDX + d] + 1) {
+                               // Different access pattern.
+                               different = 1;
+                               break;
+                       }
+               }
+
+               if (consecThreadIndices[indexThread] == index) {
+                       // We are at a thread boundary. Starting from this index the fluids
+                       // belong to another thread. Force a break, if nodes are consecutive.
+                       // The run starting here becomes the first run of that thread.
+                       consecThreadIndices[indexThread] = consecIndex + 1;
+                       ++indexThread;
+                       different = 1;
+               }
+
+               if (different) {
+                       // Open a new run containing the current node.
+                       ++consecIndex;
+                       Assert(consecIndex < nConsecNodes);
+                       consecNodes[consecIndex] = 1;
+               }
+               else {
+                       // Current node extends the current run.
+                       Assert(consecIndex < nConsecNodes);
+                       consecNodes[consecIndex] += 1;
+               }
+       }
+
+
+       kdlr->ConsecNodes = consecNodes;
+       kdlr->nConsecNodes = nConsecNodes;
+
+       // Note: the array itself holds nThreads + 1 entries (sentinel included).
+       kdlr->ConsecThreadIndices  = consecThreadIndices;
+       kdlr->nConsecThreadIndices = nThreads;
+
+       // printf("# total fluid nodes: %d   consecutive blocks: %d\n", nFluid, nConsecNodes);
+
+       return;
+}
+
+
+static void FNAME(Init)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
+{
+       KernelData * kd;
+       KernelDataList * kdl;
+       KernelDataListRia * kdlr;
+       MemAlloc((void **)&kdlr, sizeof(KernelDataListRia));
+
+       kd = (KernelData *)kdlr;
+       kdl = KDL(kdlr);
+
+       *kernelData = kd;
+
+#ifdef DEBUG
+       kd->Pdfs[0] = NULL;
+       kd->Pdfs[1] = NULL;
+       kd->PdfsActive = NULL;
+       kd->DstPdfs = NULL;
+       kd->SrcPdfs = NULL;
+       kd->Dims[0] = -1;
+       kd->Dims[1] = -1;
+       kd->Dims[2] = -1;
+       kd->GlobalDims[0] = -1;
+       kd->GlobalDims[1] = -1;
+       kd->GlobalDims[2] = -1;
+       kd->Offsets[0] = -1;
+       kd->Offsets[1] = -1;
+       kd->Offsets[2] = -1;
+
+       kd->ObstIndices = NULL;
+       kd->nObstIndices = -1;
+       kd->BounceBackPdfsSrc = NULL;
+       kd->BounceBackPdfsDst = NULL;
+       kd->nBounceBackPdfs = -1;
+
+       kdl->AdjList = NULL;
+       kdl->Coords = NULL;
+       kdl->Grid = NULL;
+       kdl->nCells = -1;
+       kdl->nFluid = -1;
+
+       kdlr->ConsecNodes = NULL;
+       kdlr->nConsecNodes = 0;
+       kdlr->ConsecThreadIndices = NULL;
+       kdlr->nConsecThreadIndices = 0;
+#endif
+
+       // Ajust the dimensions according to padding, if used.
+       kd->Dims[0] = kd->GlobalDims[0] = ld->Dims[0];
+       kd->Dims[1] = kd->GlobalDims[1] = ld->Dims[1];
+       kd->Dims[2] = kd->GlobalDims[2] = ld->Dims[2];
+
+       int * lDims = ld->Dims;
+
+       int lX = lDims[0];
+       int lY = lDims[1];
+       int lZ = lDims[2];
+
+       int nTotalCells = lX * lY * lZ;
+       int nCells = ld->nFluid;
+       int nFluid = ld->nFluid;
+
+       // We padd each stream of a PDF array for a complete cache line.
+       // TODO: padding for L1/L2 and TLB.
+       nCells = nCells + (8 - nCells % 8);
+
+       Assert(nCells % VSIZE == 0);
+
+       kdl->nCells = nCells;
+       kdl->nFluid = nFluid;
+
+       PdfT * pdfs[2];
+
+       int blk[3] = { 0 };
+
+       ParseParameters(params, blk, &kdlr->nTmpArray);
+
+       if (blk[0] == 0) blk[0] = lX;
+       if (blk[1] == 0) blk[1] = lY;
+       if (blk[2] == 0) blk[2] = lZ;
+
+       printf("# blocking               x: %3d y: %3d z: %3d\n", blk[0], blk[1], blk[2]);
+       printf("# temporary array size:  %d PDFs, %lu b\n", kdlr->nTmpArray, kdlr->nTmpArray * sizeof(PdfT) * 23);
+
+       double latMiB      = nCells * sizeof(PdfT) * N_D3Q19 / 1024.0 / 1024.0;
+       double latFluidMib = nFluid * sizeof(PdfT) * N_D3Q19 / 1024.0 / 1024.0;
+       double latPadMib   = (nCells - nFluid) * sizeof(PdfT) * N_D3Q19 / 1024.0 / 1024.0;
+
+       printf("# lattice size:          %e MiB total: %e MiB\n", latMiB,      latMiB * 2);
+       printf("# fluid lattice size:    %e MiB total: %e MiB\n", latFluidMib, latFluidMib * 2);
+       printf("# lattice padding:       %e MiB total: %e MiB\n", latPadMib,   latPadMib * 2);
+
+#define PAGE_4K                4096
+
+       printf("# aligning lattices to:  %d b\n", PAGE_4K);
+
+       MemAllocAligned((void **)&pdfs[0], sizeof(PdfT) * nCells * N_D3Q19, PAGE_4K);
+       MemAllocAligned((void **)&pdfs[1], sizeof(PdfT) * nCells * N_D3Q19, PAGE_4K);
+
+       kd->Pdfs[0] = pdfs[0];
+       kd->Pdfs[1] = pdfs[1];
+
+       // Initialize PDFs with some (arbitrary) data for correct NUMA placement.
+       // Here we touch only the fluid nodes as this loop is OpenMP parallel and
+       // we want the same scheduling as in the kernel.
+       #ifdef _OPENMP
+               #pragma omp parallel for
+       #endif
+       for (int i = 0; i < nFluid; ++i) { for(int d = 0; d < N_D3Q19; ++d) {
+               pdfs[0][P_INDEX_3(nCells, i, d)] = 1.0;
+               pdfs[1][P_INDEX_3(nCells, i, d)] = 1.0;
+       } }
+
+       // Initialize all PDFs to some standard value.
+       for (int i = 0; i < nFluid; ++i) { for(int d = 0; d < N_D3Q19; ++d) {
+               pdfs[0][P_INDEX_3(nCells, i, d)] = 0.0;
+               pdfs[1][P_INDEX_3(nCells, i, d)] = 0.0;
+       } }
+
+       // ----------------------------------------------------------------------
+       // create grid which will hold the index numbers of the fluid nodes
+
+       uint32_t * grid;
+
+       if (MemAlloc((void **)&grid, nTotalCells * sizeof(uint32_t))) {
+               printf("ERROR: allocating grid for numbering failed: %lu bytes.\n", nTotalCells * sizeof(uint32_t));
+               exit(1);
+       }
+       kdl->Grid = grid;
+
+       int latticeIndex;
+
+#ifdef DEBUG
+       for(int z = 0; z < lZ; ++z) {
+               for(int y = 0; y < lY; ++y) {
+                       for(int x = 0; x < lX; ++x) {
+
+                               latticeIndex = L_INDEX_4(ld->Dims, x, y, z);
+
+                               grid[latticeIndex] = ~0;
+                       }
+               }
+       }
+#endif
+
+       // ----------------------------------------------------------------------
+       // generate numbering over grid
+
+       uint32_t * coords;
+
+       if (MemAlloc((void **)&coords, nFluid * sizeof(uint32_t) * 3)) {
+               printf("ERROR: allocating coords array failed: %lu bytes.\n", nFluid * sizeof(uint32_t) * 3);
+               exit(1);
+       }
+
+       kdl->Coords = coords;
+
+       // Index for the PDF nodes can start at 0 as we distinguish solid and fluid nodes
+       // through the ld->Lattice array.
+       int counter = 0;
+
+       // Blocking is implemented via setup of the adjacency list. The kernel later will
+       // walk through the lattice blocked automatically.
+       for (int bZ = 0; bZ < lZ; bZ += blk[2]) {
+       for (int bY = 0; bY < lY; bY += blk[1]) {
+       for (int bX = 0; bX < lX; bX += blk[0]) {
+
+               int eX = MIN(bX + blk[0], lX);
+               int eY = MIN(bY + blk[1], lY);
+               int eZ = MIN(bZ + blk[2], lZ);
+
+
+               for (int z = bZ; z < eZ; ++z) {
+               for (int y = bY; y < eY; ++y) {
+               for (int x = bX; x < eX; ++x) {
+
+                       latticeIndex = L_INDEX_4(lDims, x, y, z);
+
+                       if (ld->Lattice[latticeIndex] != LAT_CELL_OBSTACLE) {
+                               grid[latticeIndex] = counter;
+
+                               coords[C_INDEX_X(counter)] = x;
+                               coords[C_INDEX_Y(counter)] = y;
+                               coords[C_INDEX_Z(counter)] = z;
+
+                               ++counter;
+                       }
+               } } }
+       } } }
+
+       Verify(counter == nFluid);
+
+       uint32_t * adjList;
+
+       double indexMib = nFluid * sizeof(uint32_t) * N_D3Q19_IDX / 1024.0 / 1024.0;
+
+       printf("# index size:            %e MiB\n", indexMib);
+
+
+       // AdjList only requires 18 instead of 19 entries per node, as
+       // the center PDF needs no addressing.
+       if (MemAlloc((void **)&adjList, nFluid * sizeof(uint32_t) * N_D3Q19_IDX)) {
+               printf("ERROR: allocating adjList array failed: %lu bytes.\n", nFluid * sizeof(uint32_t) * N_D3Q19_IDX);
+               exit(1);
+       }
+
+       kdl->AdjList = adjList;
+
+       int x, y, z;
+
+       uint32_t neighborIndex;
+       uint32_t dstIndex;
+
+       int nx, ny, nz, px, py, pz;
+
+       // Loop over all fluid nodes and compute the indices to the neighboring
+       // PDFs for configured data layout (AoS/SoA).
+       // Parallelized loop to ensure correct NUMA placement.
+       // #ifdef _OPENMP  --> add line continuation
+       //      #pragma omp parallel for default(none)
+       //              shared(nFluid, nCells, coords, D3Q19_INV, D3Q19_X, D3Q19_Y, D3Q19_Z,
+       //                              stderr,
+       //                              lDims, grid, ld, lX, lY, lZ, adjList)
+       //              private(x, y, z, nx, ny, nz, neighborIndex, dstIndex)
+       // #endif
+       for (int index = 0; index < nFluid; ++index) {
+               x = coords[C_INDEX_X(index)];
+               y = coords[C_INDEX_Y(index)];
+               z = coords[C_INDEX_Z(index)];
+
+               Assert(x >= 0 && x < lX);
+               Assert(y >= 0 && y < lY);
+               Assert(z >= 0 && z < lZ);
+
+               Assert(ld->Lattice[L_INDEX_4(lDims, x, y, z)] != LAT_CELL_OBSTACLE);
+
+               // Loop over all directions except the center one.
+               for(int d = 0; d < N_D3Q19 - 1; ++d) {
+                       Assert(d != D3Q19_C);
+
+#ifdef PROP_MODEL_PUSH
+                       nx = x + D3Q19_X[d];
+                       ny = y + D3Q19_Y[d];
+                       nz = z + D3Q19_Z[d];
+
+#elif PROP_MODEL_PULL
+                       nx = x - D3Q19_X[d];
+                       ny = y - D3Q19_Y[d];
+                       nz = z - D3Q19_Z[d];
+#else
+                       #error No implementation for this PROP_MODEL_NAME.
+#endif
+                       // If the neighbor is outside the latcie in X direction and we have a
+                       // periodic boundary then we need to wrap around.
+                       if (    ((nx < 0 || nx >= lX) && ld->PeriodicX) ||
+                                       ((ny < 0 || ny >= lY) && ld->PeriodicY) ||
+                                       ((nz < 0 || nz >= lZ) && ld->PeriodicZ)
+                                                                                                                               ){
+                               // x periodic
+
+                               if (nx < 0) {
+                                       px = lX - 1;
+                               }
+                               else if (nx >= lX) {
+                                       px = 0;
+                               } else {
+                                       px = nx;
+                               }
+                               // y periodic
+                               if (ny < 0) {
+                                       py = lY - 1;
+                               }
+                               else if (ny >= lY) {
+                                       py = 0;
+                               } else {
+                                       py = ny;
+                               }
+
+                               // z periodic
+                               if (nz < 0) {
+                                       pz = lZ - 1;
+                               }
+                               else if (nz >= lZ) {
+                                       pz = 0;
+                               } else {
+                                       pz = nz;
+                               }
+
+                               if (ld->Lattice[L_INDEX_4(lDims, px, py, pz)] == LAT_CELL_OBSTACLE) {
+                                       dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
+                               }
+                               else {
+                                       neighborIndex = grid[L_INDEX_4(lDims, px, py, pz)];
+
+                                       AssertMsg(neighborIndex != ~0, "Neighbor has no Index. (%d %d %d) direction %s (%d)\n", px, py, pz, D3Q19_NAMES[d], d);
+
+                                       dstIndex = P_INDEX_3(nCells, neighborIndex, d);
+                               }
+                       }
+                       else if (nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) {
+                               dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
+                       }
+                       else if (ld->Lattice[L_INDEX_4(lDims, nx, ny, nz)] == LAT_CELL_OBSTACLE) {
+                               dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
+                       }
+                       else {
+                               neighborIndex = grid[L_INDEX_4(lDims, nx, ny, nz)];
+
+                               Assert(neighborIndex != ~0);
+
+                               dstIndex = P_INDEX_3(nCells, neighborIndex, d);
+                       }
+
+                       Assert(dstIndex >= 0);
+                       Assert(dstIndex < nCells * N_D3Q19);
+
+                       adjList[index * N_D3Q19_IDX + d] = dstIndex;
+               }
+       }
+
+       int nThreads = 1;
+
+#ifdef _OPENMP
+       nThreads = omp_get_max_threads();
+#endif
+
+       SetupConsecNodes(ld, KDLR(kd), nThreads);
+
+
+       // Fill remaining KernelData structures
+       kd->GetNode = GetNode;
+       kd->SetNode = SetNode;
+
+       kd->BoundaryConditionsGetPdf = FNAME(BCGetPdf);
+       kd->BoundaryConditionsSetPdf = FNAME(BCSetPdf);
+
+       kd->Kernel = NULL; // FNAME(KernelPullSplitNt2S);
+
+       kd->DstPdfs = NULL;
+       kd->PdfsActive = kd->Pdfs[0];
+
+       return;
+}
+
+// Public init function of the 1S kernel variant: performs the common
+// initialization via FNAME(Init), installs FNAME(KernelPullSplitNt1S) as kernel
+// and reports the loop balance.
+void FNAME(D3Q19ListPullSplitNt1SInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
+{
+       FNAME(Init)(ld, kernelData, params);
+
+       KernelData * kd = *kernelData;
+       kd->Kernel = FNAME(KernelPullSplitNt1S);
+
+       // Bytes per fluid lattice update: 19 PDFs counted twice, plus 18 values
+       // of 4 bytes each (presumably the adjacency list entries of a node).
+       const double pdfBytes   = 2.0 * 19 * sizeof(PdfT);
+       const double indexBytes = 18 * 4.0;
+
+       printf("# loop balance:          %.2f B/FLUP\n", pdfBytes + indexBytes);
+}
+
+// Public init function of the 2S kernel variant: performs the common
+// initialization via FNAME(Init), installs FNAME(KernelPullSplitNt2S) as kernel
+// and reports the loop balance.
+void FNAME(D3Q19ListPullSplitNt2SInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
+{
+       FNAME(Init)(ld, kernelData, params);
+
+       KernelData * kd = *kernelData;
+       kd->Kernel = FNAME(KernelPullSplitNt2S);
+
+       // Bytes per fluid lattice update: 19 PDFs counted twice, plus 18 values
+       // of 4 bytes each (presumably the adjacency list entries of a node).
+       const double pdfBytes   = 2.0 * 19 * sizeof(PdfT);
+       const double indexBytes = 18 * 4.0;
+
+       printf("# loop balance:          %.2f B/FLUP\n", pdfBytes + indexBytes);
+}
+
+
+// Releases all arrays attached to the kernel data and finally the kernel data
+// object itself, which is passed through kernelData. ld is not used.
+void FNAME(D3Q19ListPullSplitNtDeinit)(LatticeDesc * ld, KernelData ** kernelData)
+{
+       // Three views onto the very same object.
+       KernelData        * kd   = *kernelData;
+       KernelDataList    * kdl  = (KernelDataList *)kd;
+       KernelDataListRia * kdlr = (KernelDataListRia *)kd;
+
+       // Consecutive node information.
+       MemFree((void **)&kdlr->ConsecNodes);
+
+       if (kdlr->ConsecThreadIndices != NULL) {
+               MemFree((void **)&kdlr->ConsecThreadIndices);
+       }
+
+       // Arrays of the list representation.
+       MemFree((void **)&kdl->AdjList);
+       MemFree((void **)&kdl->Coords);
+       MemFree((void **)&kdl->Grid);
+
+       // Both PDF arrays.
+       MemFree((void **)&kd->Pdfs[0]);
+       MemFree((void **)&kd->Pdfs[1]);
+
+       // The kernel data object itself, via the pointer owned by the caller.
+       MemFree((void **)kernelData);
+       return;
+}
+
diff --git a/src/BenchKernelD3Q19ListPullSplitNtCommon.h b/src/BenchKernelD3Q19ListPullSplitNtCommon.h
new file mode 100644 (file)
index 0000000..720c0d4
--- /dev/null
@@ -0,0 +1,57 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#ifndef __BENCH_KERNEL_D3Q19_LIST_PULL_SPLIT_NT_COMMON_H__
+#define __BENCH_KERNEL_D3Q19_LIST_PULL_SPLIT_NT_COMMON_H__
+
+#if !defined(DATA_LAYOUT_SOA)
+       #error List AA Ria works only with DATA_LAYOUT_SOA
+#endif
+
+#include "BenchKernelD3Q19ListCommon.h"
+
+// Kernel data of the list based kernels which make use of runs of consecutive
+// nodes. The KernelDataList object is embedded as first member; the code casts
+// between pointers to KernelData, KernelDataList and KernelDataListRia.
+typedef struct KernelDataListRia_ {
+       KernelDataList kdl;
+
+       // Array contains information of how many adjacent nodes share the same access pattern.
+       // Each entry is the length of one run of such nodes.
+       uint32_t * ConsecNodes;
+       uint32_t nConsecNodes;  // Number of entries in ConsecNodes array.
+
+       // Array contains (for each thread) an index into ConsecNodes.
+       uint32_t * ConsecThreadIndices;
+       // Number of entries in ConsecThreadIndices.
+       // NOTE(review): SetupConsecNodes stores the number of threads here, while
+       // the array it allocates has one additional (sentinel) entry at the end.
+       uint32_t nConsecThreadIndices;
+
+       // Size of temporary array in units of PdfTs.
+       int nTmpArray;
+} KernelDataListRia;
+
+// Macro for casting KernelData * to KernelDataListRia *.
+#define KDLR(_x_)      ((KernelDataListRia *)(_x_))
+
+
+#endif // __BENCH_KERNEL_D3Q19_LIST_PULL_SPLIT_NT_COMMON_H__
+
diff --git a/src/BoostJoin.h b/src/BoostJoin.h
new file mode 100644 (file)
index 0000000..e9167ab
--- /dev/null
@@ -0,0 +1,38 @@
+//  Boost config.hpp configuration header file  ------------------------------//
+//  boostinspect:ndprecated_macros -- tell the inspect tool to ignore this file
+
+//  Copyright (c) 2001-2003 John Maddock
+//  Copyright (c) 2001 Darin Adler
+//  Copyright (c) 2001 Peter Dimov
+//  Copyright (c) 2002 Bill Kempf
+//  Copyright (c) 2002 Jens Maurer
+//  Copyright (c) 2002-2003 David Abrahams
+//  Copyright (c) 2003 Gennaro Prota
+//  Copyright (c) 2003 Eric Friedman
+//  Copyright (c) 2010 Eric Jourdanneau, Joel Falcou
+// Distributed under the Boost Software License, Version 1.0. (See
+// accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt)
+
+//  See http://www.boost.org/ for most recent version.
+
+//  Boost config.hpp policy and rationale documentation has been moved to
+//  http://www.boost.org/libs/config/
+//
+//  This file is intended to be stable, and relatively unchanging.
+//  It should contain boilerplate code only - no compiler specific
+//  code unless it is unavoidable - no changes unless unavoidable.
+
+// from boost/config/suffix.hpp
+
+//
+// Helper macro BOOST_JOIN:
+// The following piece of macro magic joins the two
+// arguments together, even when one of the arguments is
+// itself a macro (see 16.3.1 in C++ standard).  The key
+// is that macro expansion of macro arguments does not
+// occur in BOOST_DO_JOIN2 but does in BOOST_DO_JOIN.
+//
+// Entry point: forwards to BOOST_DO_JOIN, so that X and Y get macro-expanded
+// before they are pasted together.
+#define BOOST_JOIN( X, Y ) BOOST_DO_JOIN( X, Y )
+// Intermediate level: arguments are expanded here, then handed on.
+#define BOOST_DO_JOIN( X, Y ) BOOST_DO_JOIN2(X,Y)
+// Performs the actual token pasting (##); operands of ## are not expanded.
+#define BOOST_DO_JOIN2( X, Y ) X##Y
diff --git a/src/Geometry.c b/src/Geometry.c
new file mode 100644 (file)
index 0000000..31c985a
--- /dev/null
@@ -0,0 +1,267 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#include "Geometry.h"
+#include "Memory.h"
+
+#include <strings.h>
+#include <math.h>
+
+#include <errno.h>
+
+// Creates the geometry which is selected by its name.
+//
+// geometryType is compared case-insensitively against the prefixes "channel",
+// "box", "pipe", and "blocks". For "blocks" an optional block size is read
+// with atoi beginning at character index 7 of the string, i.e. behind
+// "blocks" and one further character; without it a block size of 8 is used.
+// An unknown name or an invalid block size prints an error and triggers
+// Verify(0).
+//
+// dims, periodic, and ld are handed over unchanged to GeoCreateByType, which
+// creates the lattice.
+void GeoCreateByStr(const char * geometryType, int dims[3], int periodic[3], LatticeDesc * ld)
+{
+       int geoType = -1;
+       void * geoDetails = NULL;
+       int blockEdge;  // Only used for "blocks"; geoDetails points to it then.
+
+       if (strncasecmp("channel", geometryType, 7) == 0) {
+               geoType = GEO_TYPE_CHANNEL;
+       }
+       else if (strncasecmp("box", geometryType, 3) == 0) {
+               geoType = GEO_TYPE_BOX;
+       }
+       else if (strncasecmp("pipe", geometryType, 4) == 0) {
+               geoType = GEO_TYPE_PIPE;
+       }
+       else if (strncasecmp("blocks", geometryType, 6) == 0) {
+               geoType = GEO_TYPE_BLOCKS;
+
+               // Default block size
+               blockEdge = 8;
+
+               if (strlen(geometryType) > 7) {
+                       int requested = atoi(&geometryType[7]);
+
+                       // Smallest of the three lattice dimensions.
+                       int smallestDim = (dims[1] < dims[0]) ? dims[1] : dims[0];
+                       smallestDim = (dims[2] < smallestDim) ? dims[2] : smallestDim;
+
+                       if (!(requested >= 0 && requested <= smallestDim / 2)) {
+                               printf("ERROR: block size for geometry must be > 0 and smaller than half of the smalest dimension.\n");
+                               // TODO: find a better solution for handling errors in here.
+                               Verify(0);
+                       }
+
+                       blockEdge = requested;
+               }
+
+               geoDetails = &blockEdge;
+       }
+       else {
+               printf("ERROR: unknown geometry specified.\n");
+               Verify(0);
+       }
+
+       GeoCreateByType(geoType, geoDetails, dims, periodic, ld);
+
+       return;
+}
+
+// Creates the lattice for the geometry given by type and fills in ld.
+//
+// Steps, in this order (later steps overwrite cells set by earlier ones):
+//   1. dims and periodic are copied to ld, the lattice is allocated and all
+//      cells are marked as fluid.
+//   2. For channel, blocks, and pipe periodic[0] is set to 1. This modifies the
+//      array of the caller; ld->PeriodicX keeps the value which was passed in.
+//   3. First and last plane in x, y, and z become fluid if the respective
+//      direction is periodic and obstacle otherwise.
+//   4. Obstacles specific to the type are added (pipe, blocks).
+//   5. Obstacle, fluid, inlet, and outlet cells are counted into ld.
+//
+// typeDetails is only read for GEO_TYPE_BLOCKS, where it must point to an int
+// which holds the block size (0 selects the default of 8).
+void GeoCreateByType(GEO_TYPES type, void * typeDetails, int dims[3], int periodic[3], LatticeDesc * ld)
+{
+       Assert(dims != NULL);
+       Assert(dims[0] > 0);
+       Assert(dims[1] > 0);
+       Assert(dims[2] > 0);
+
+       Assert(periodic != NULL);
+       Assert(periodic[0] >= 0);
+       Assert(periodic[1] >= 0);
+       Assert(periodic[2] >= 0);
+
+       Assert(ld != NULL);
+
+       Assert(type >= GEO_TYPE_MIN);
+       Assert(type <= GEO_TYPE_MAX);
+
+       // Names in the order of the values of GEO_TYPES.
+       const char * geoTypeStr[] = { "box", "channel", "pipe", "blocks" };
+
+       const int nX = dims[0];
+       const int nY = dims[1];
+       const int nZ = dims[2];
+
+       printf("# geometry: %d x %d x %d nodes, type %d %s\n", nX, nY, nZ, type, geoTypeStr[type]);
+
+       ld->Dims[0] = nX;
+       ld->Dims[1] = nY;
+       ld->Dims[2] = nZ;
+       ld->nCells = nX * nY * nZ;
+       ld->PeriodicX = periodic[0];
+       ld->PeriodicY = periodic[1];
+       ld->PeriodicZ = periodic[2];
+
+       LatticeT * lattice;
+       MemAlloc((void **)&lattice, sizeof(LatticeT) * nX * nY * nZ);
+
+       ld->Lattice = lattice;
+
+       // Start with a lattice consisting of fluid only.
+       for (int z = 0; z < nZ; ++z) {
+               for (int y = 0; y < nY; ++y) {
+                       for (int x = 0; x < nX; ++x) {
+                               lattice[L_INDEX_4(dims, x, y, z)] = LAT_CELL_FLUID;
+                       }
+               }
+       }
+
+       // These types never get walls on the first and last x plane.
+       if (type == GEO_TYPE_CHANNEL || type == GEO_TYPE_BLOCKS || type == GEO_TYPE_PIPE) {
+               periodic[0] = 1;
+       }
+
+       // Walls or periodic on first and last x plane.
+       const LatticeT xFace = periodic[0] ? LAT_CELL_FLUID : LAT_CELL_OBSTACLE;
+
+       for (int z = 0; z < nZ; ++z) {
+               for (int y = 0; y < nY; ++y) {
+                       lattice[L_INDEX_4(dims, 0, y, z)]      = xFace;
+                       lattice[L_INDEX_4(dims, nX - 1, y, z)] = xFace;
+               }
+       }
+
+       // Walls or periodic on first and last y plane.
+       const LatticeT yFace = periodic[1] ? LAT_CELL_FLUID : LAT_CELL_OBSTACLE;
+
+       for (int z = 0; z < nZ; ++z) {
+               for (int x = 0; x < nX; ++x) {
+                       lattice[L_INDEX_4(dims, x, 0, z)]      = yFace;
+                       lattice[L_INDEX_4(dims, x, nY - 1, z)] = yFace;
+               }
+       }
+
+       // Walls or periodic on first and last z plane.
+       const LatticeT zFace = periodic[2] ? LAT_CELL_FLUID : LAT_CELL_OBSTACLE;
+
+       for (int y = 0; y < nY; ++y) {
+               for (int x = 0; x < nX; ++x) {
+                       lattice[L_INDEX_4(dims, x, y, 0)]      = zFace;
+                       lattice[L_INDEX_4(dims, x, y, nZ - 1)] = zFace;
+               }
+       }
+
+       // Type specific part.
+       switch (type) {
+               case GEO_TYPE_CHANNEL:
+                       periodic[0] = 1;
+                       break;
+
+               case GEO_TYPE_PIPE: {
+                       // Every cell whose squared distance to the center of the y-z
+                       // plane is at least minRadiusSquared becomes an obstacle for
+                       // all x.
+                       const double centerZ = nZ / 2.0 - 0.5;
+                       const double centerY = nY / 2.0 - 0.5;
+                       const double minDiameter = MIN(dims[1], dims[2]);
+                       const double minRadius = minDiameter / 2 - 1;
+                       const double minRadiusSquared = minRadius * minRadius;
+
+                       for (int z = 0; z < nZ; ++z) {
+                               const double dz = z - centerZ;
+
+                               for (int y = 0; y < nY; ++y) {
+                                       const double dy = y - centerY;
+
+                                       if ((dz * dz + dy * dy) >= minRadiusSquared) {
+                                               for (int x = 0; x < nX; ++x) {
+                                                       lattice[L_INDEX_4(dims, x, y, z)] = LAT_CELL_OBSTACLE;
+                                               }
+                                       }
+                               }
+                       }
+                       break;
+               }
+
+               case GEO_TYPE_BLOCKS: {
+                       int blockSize = *((int *)typeDetails);
+
+                       if (blockSize == 0) {
+                               blockSize = 8;
+                       }
+
+                       // Smallest of the three lattice dimensions.
+                       int dimMin = (dims[1] < dims[0]) ? dims[1] : dims[0];
+                       dimMin = (dims[2] < dimMin) ? dims[2] : dimMin;
+
+                       if (!(blockSize >= 0 && blockSize <= dimMin / 2)) {
+                               printf("ERROR: block size for geometry must be > 0 and smaller than half of the smalest dimension.\n");
+                               // TODO: find a better solution for handling errors in here.
+                               Verify(0);
+                       }
+
+                       // Within every period of 2 * blockSize cells per direction
+                       // the second half belongs to a block. A cell is an obstacle
+                       // if this holds for x, y, and z at once.
+                       const int period = 2 * blockSize;
+
+                       for (int z = 0; z < nZ; ++z) {
+                               if ((z % period) < blockSize) continue;
+
+                               for (int y = 0; y < nY; ++y) {
+                                       if ((y % period) < blockSize) continue;
+
+                                       for (int x = 0; x < nX; ++x) {
+                                               if ((x % period) >= blockSize) {
+                                                       lattice[L_INDEX_4(dims, x, y, z)] = LAT_CELL_OBSTACLE;
+                                               }
+                                       }
+                               }
+                       }
+                       break;
+               }
+
+               default:
+                       // Box: nothing beyond the walls set up above.
+                       break;
+       }
+
+//     if (latticeDumpAscii) {
+//             const char strLatCellType[] = "X.IxO"; // X = Obstacle, . = Fluid, I = inlet, O = outlet
+//             for (int z = dims[2] - 1; z >= 0; --z) {
+//                     printf("plane % 2d\n", z);
+//
+//                     for (int y = dims[1] - 1; y >= 0; --y) {
+//                             printf(" %2d  ", y);
+//                             for (int x = 0; x < dims[0]; ++x) {
+//                                     printf("%c", strLatCellType[lattice[L_INDEX_4(dims, x, y, z)]]);
+//                             }
+//                             printf("\n");
+//                     }
+//             }
+//     }
+
+       // Count the cells per type. Inlet and outlet cells count as fluid, too.
+       ld->nObst = 0;
+       ld->nFluid = 0;
+       ld->nInlet = 0;
+       ld->nOutlet = 0;
+
+       for (int z = 0; z < nZ; ++z) {
+               for (int y = 0; y < nY; ++y) {
+                       for (int x = 0; x < nX; ++x) {
+                               switch (lattice[L_INDEX_4(dims, x, y, z)]) {
+                                       case LAT_CELL_OBSTACLE:
+                                               ld->nObst++;
+                                               break;
+                                       case LAT_CELL_FLUID:
+                                               ld->nFluid++;
+                                               break;
+                                       case LAT_CELL_INLET:
+                                               ld->nInlet++;
+                                               ld->nFluid++;
+                                               break;
+                                       case LAT_CELL_OUTLET:
+                                               ld->nOutlet++;
+                                               ld->nFluid++;
+                                               break;
+                                       default:
+                                               Verify(0);
+                               }
+                       }
+               }
+       }
+
+       return;
+}
diff --git a/src/Geometry.h b/src/Geometry.h
new file mode 100644 (file)
index 0000000..11d72ff
--- /dev/null
@@ -0,0 +1,48 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#ifndef __GEOMETRY_H__
+#define __GEOMETRY_H__
+
+#include "Lattice.h"
+
+
+typedef enum GEO_TYPES_ {       // Geometry selectors passed to GeoCreateByType.
+       GEO_TYPE_MIN = 0,       // Same value as the smallest selector (GEO_TYPE_BOX).
+       GEO_TYPE_BOX = 0,
+       GEO_TYPE_CHANNEL = 1,
+       GEO_TYPE_PIPE = 2,
+       GEO_TYPE_BLOCKS = 3,    // Expects a pointer to an integer, holding the
+                                                   // value of the block size as type detail.
+       GEO_TYPE_MAX = 3        // Same value as the largest selector (GEO_TYPE_BLOCKS).
+} GEO_TYPES;
+
+// Selects a geometry by enum value; typeDetails carries per-type extra data (see GEO_TYPE_BLOCKS). NOTE(review): presumably fills *ld -- confirm in Geometry.c.
+void GeoCreateByType(GEO_TYPES type, void * typeDetails, int dims[3], int periodic[3], LatticeDesc * ld);
+// Selects a geometry by name. NOTE(review): the accepted strings are not visible in this header -- confirm in Geometry.c.
+void GeoCreateByStr(const char * geometryType, int dims[3], int periodic[3], LatticeDesc * ld);
+
+#endif // __GEOMETRY_H__
diff --git a/src/Kernel.c b/src/Kernel.c
new file mode 100644 (file)
index 0000000..88018b4
--- /dev/null
@@ -0,0 +1,801 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#include "Kernel.h"
+#include "Lattice.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+#define X(name, idx, idx_inv, x, y, z) , x
+int D3Q19_X[] = {              // x entry of every D3Q19_LIST item, in list order
+       EXPAND(D3Q19_LIST)
+};
+#undef X
+
+#define X(name, idx, idx_inv, x, y, z) , y
+int D3Q19_Y[] = {              // y entry of every D3Q19_LIST item, in list order
+       EXPAND(D3Q19_LIST)
+};
+#undef X
+
+#define X(name, idx, idx_inv, x, y, z) , z
+int D3Q19_Z[] = {              // z entry of every D3Q19_LIST item, in list order
+       EXPAND(D3Q19_LIST)
+};
+#undef X
+
+#define X(name, idx, idxinv, x, y, z)  , idxinv
+int D3Q19_INV[] = {            // idxinv entry of every item; presumably the index of the opposite direction -- confirm in D3Q19_LIST
+       EXPAND(D3Q19_LIST)
+};
+#undef X
+
+// NOTE(review): every X expansion starts with a comma; EXPAND presumably drops the leading one -- confirm its definition.
+#define X(name, idx, idxinv, x, y, z)  , STRINGIFY(name)
+const char * D3Q19_NAMES[N_D3Q19] = {  // stringified name of every D3Q19_LIST item
+       EXPAND(D3Q19_LIST)
+};
+#undef X
+// Rebuilds five PDFs on inlet cells (x = 0) and on outlet cells (x = nX - 1) from the prescribed densities cd->RhoIn / cd->RhoOut.
+void KernelComputeBoundaryConditions(KernelData * kd, LatticeDesc * ld, CaseData * cd)
+{
+       Assert(kd != NULL);
+       Assert(ld != NULL);
+       Assert(cd != NULL);
+
+       Assert(cd->RhoIn  > 0.0);
+       Assert(cd->RhoOut > 0.0);
+
+       PdfT rho_in         = cd->RhoIn;
+       PdfT rho_out        = cd->RhoOut;
+       PdfT rho_in_inv     = 1.0 / rho_in;
+       PdfT rho_out_inv    = 1.0 / rho_out;
+       PdfT indep_ux       = 0.0;
+
+       PdfT dens;
+       PdfT ux;
+
+       const PdfT one_third  = 1.0 / 3.0;
+       const PdfT one_fourth = 1.0 / 4.0;
+       const PdfT one_sixth  = 1.0 / 6.0;
+
+       PdfT pdfs[N_D3Q19];
+
+       int nX = kd->Dims[0];
+       int nY = kd->Dims[1];
+       int nZ = kd->Dims[2];
+
+       int x;
+       int x_in  = 0;
+       int x_out = nX - 1;
+       // Sums of all PDFs handled below; only the commented-out debug print at the end refers to them.
+       double density_in = 0.0;
+       double density_out = 0.0;
+
+       // update inlet / outlet boundary conditions
+       for (int z = 1; z < nZ - 1; ++z) {
+               for (int y = 1; y < nY - 1; ++y) {
+                       // Only y/z indices 1 .. n-2 are visited; the outermost layer is skipped.
+
+                       // -----------------------------------------------------------------------------
+                       // update inlet conditions
+
+                       if (ld->Lattice[L_INDEX_4(ld->Dims, x_in, y, z)] == LAT_CELL_INLET) {
+
+                               x = x_in;
+                               // Fetch the 14 PDFs that are not rebuilt below (all except E, NE, SE, TE, BE).
+                               kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_C , pdfs + D3Q19_C);
+                               kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_T , pdfs + D3Q19_T);
+                               kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_B , pdfs + D3Q19_B);
+                               kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_S , pdfs + D3Q19_S);
+                               kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_N , pdfs + D3Q19_N);
+                               kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_TS, pdfs + D3Q19_TS);
+                               kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_BS, pdfs + D3Q19_BS);
+                               kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_TN, pdfs + D3Q19_TN);
+                               kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_BN, pdfs + D3Q19_BN);
+                               kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_SW, pdfs + D3Q19_SW);
+                               kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_TW, pdfs + D3Q19_TW);
+                               kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_W , pdfs + D3Q19_W);
+                               kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_BW, pdfs + D3Q19_BW);
+                               kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_NW, pdfs + D3Q19_NW);
+                               // The density is prescribed; ux is derived from it and the fetched PDFs.
+                               dens = rho_in;
+
+                               ux = 1 - (pdfs[D3Q19_C]  +
+                                               (pdfs[D3Q19_T]  + pdfs[D3Q19_B]  + pdfs[D3Q19_S]  + pdfs[D3Q19_N]) +
+                                               (pdfs[D3Q19_TS] + pdfs[D3Q19_BS] + pdfs[D3Q19_TN] + pdfs[D3Q19_BN]) +
+                                               2 * (pdfs[D3Q19_SW] + pdfs[D3Q19_TW] + pdfs[D3Q19_W] + pdfs[D3Q19_BW] + pdfs[D3Q19_NW])) * rho_in_inv;
+
+                               indep_ux = one_sixth * dens * ux;
+                               // Rebuild E, NE, SE, TE, BE from the PDFs of the opposite directions.
+                               pdfs[D3Q19_E ] = pdfs[D3Q19_W]  + one_third  * dens * ux;
+                               pdfs[D3Q19_NE] = pdfs[D3Q19_SW] - one_fourth * (pdfs[D3Q19_N] - pdfs[D3Q19_S]) + indep_ux;
+                               pdfs[D3Q19_SE] = pdfs[D3Q19_NW] + one_fourth * (pdfs[D3Q19_N] - pdfs[D3Q19_S]) + indep_ux;
+                               pdfs[D3Q19_TE] = pdfs[D3Q19_BW] - one_fourth * (pdfs[D3Q19_T] - pdfs[D3Q19_B]) + indep_ux;
+                               pdfs[D3Q19_BE] = pdfs[D3Q19_TW] + one_fourth * (pdfs[D3Q19_T] - pdfs[D3Q19_B]) + indep_ux;
+
+                               // Write the five rebuilt PDFs back through the kernel's setter.
+                               kd->BoundaryConditionsSetPdf(kd, x, y, z, D3Q19_E , pdfs[D3Q19_E ]);
+                               kd->BoundaryConditionsSetPdf(kd, x, y, z, D3Q19_NE, pdfs[D3Q19_NE]);
+                               kd->BoundaryConditionsSetPdf(kd, x, y, z, D3Q19_SE, pdfs[D3Q19_SE]);
+                               kd->BoundaryConditionsSetPdf(kd, x, y, z, D3Q19_TE, pdfs[D3Q19_TE]);
+                               kd->BoundaryConditionsSetPdf(kd, x, y, z, D3Q19_BE, pdfs[D3Q19_BE]);
+
+                               for(int d = 0; d < N_D3Q19; ++d) {
+                                       density_in += pdfs[d];
+                               }
+                       }
+
+                       // -----------------------------------------------------------------------------
+                       // update outlet conditions
+
+                       if (ld->Lattice[L_INDEX_4(ld->Dims, x_out, y, z)] == LAT_CELL_OUTLET) {
+                               // update outlet conditions
+
+                               x = x_out;
+                               // Fetch the 14 PDFs that are not rebuilt below (all except W, NW, SW, TW, BW).
+                               kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_C , pdfs + D3Q19_C );
+                               kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_T , pdfs + D3Q19_T );
+                               kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_B , pdfs + D3Q19_B );
+                               kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_S , pdfs + D3Q19_S );
+                               kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_N , pdfs + D3Q19_N );
+                               kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_TS, pdfs + D3Q19_TS);
+                               kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_BS, pdfs + D3Q19_BS);
+                               kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_TN, pdfs + D3Q19_TN);
+                               kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_BN, pdfs + D3Q19_BN);
+                               kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_NE, pdfs + D3Q19_NE);
+                               kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_BE, pdfs + D3Q19_BE);
+                               kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_E , pdfs + D3Q19_E );
+                               kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_TE, pdfs + D3Q19_TE);
+                               kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_SE, pdfs + D3Q19_SE);
+                               // The density is prescribed; ux is derived from it and the fetched PDFs.
+                               dens = rho_out;
+
+                               ux = -1 + (pdfs[D3Q19_C] +
+                                               (pdfs[D3Q19_T]  + pdfs[D3Q19_B]  + pdfs[D3Q19_S]  + pdfs[D3Q19_N]) +
+                                               (pdfs[D3Q19_TS] + pdfs[D3Q19_BS] + pdfs[D3Q19_TN] + pdfs[D3Q19_BN]) +
+                                               2 * (pdfs[D3Q19_NE] + pdfs[D3Q19_BE] + pdfs[D3Q19_E] + pdfs[D3Q19_TE] + pdfs[D3Q19_SE])) * rho_out_inv;
+                               indep_ux = one_sixth * dens * ux;
+                               // Rebuild W, SW, NW, BW, TW from the PDFs of the opposite directions.
+                               pdfs[D3Q19_W ] = pdfs[D3Q19_E] - one_third * dens * ux;
+                               pdfs[D3Q19_SW] = pdfs[D3Q19_NE] + one_fourth * (pdfs[D3Q19_N] - pdfs[D3Q19_S]) - indep_ux;
+                               pdfs[D3Q19_NW] = pdfs[D3Q19_SE] - one_fourth * (pdfs[D3Q19_N] - pdfs[D3Q19_S]) - indep_ux;
+                               pdfs[D3Q19_BW] = pdfs[D3Q19_TE] + one_fourth * (pdfs[D3Q19_T] - pdfs[D3Q19_B]) - indep_ux;
+                               pdfs[D3Q19_TW] = pdfs[D3Q19_BE] - one_fourth * (pdfs[D3Q19_T] - pdfs[D3Q19_B]) - indep_ux;
+                               // Write the five rebuilt PDFs back through the kernel's setter.
+                               kd->BoundaryConditionsSetPdf(kd, x, y, z, D3Q19_W , pdfs[D3Q19_W ]);
+                               kd->BoundaryConditionsSetPdf(kd, x, y, z, D3Q19_NW, pdfs[D3Q19_NW]);
+                               kd->BoundaryConditionsSetPdf(kd, x, y, z, D3Q19_SW, pdfs[D3Q19_SW]);
+                               kd->BoundaryConditionsSetPdf(kd, x, y, z, D3Q19_TW, pdfs[D3Q19_TW]);
+                               kd->BoundaryConditionsSetPdf(kd, x, y, z, D3Q19_BW, pdfs[D3Q19_BW]);
+
+                               for(int d = 0; d < N_D3Q19; ++d) {
+                                       density_out += pdfs[d];
+                               }
+                       }
+               }
+       }
+
+       // DEBUG: printf("# density inlet: %e  density outlet: %e\n", density_in, density_out);
+
+}
+
+// Returns the average density: the sum of all PDFs of all non-obstacle cells divided by ld->nFluid.
+PdfT KernelDensity(KernelData * kd, LatticeDesc * ld)
+{
+       // Both handles and the lattice they describe must be present.
+       Assert(kd != NULL);
+       Assert(ld != NULL);
+       Assert(ld->Lattice != NULL);
+       Assert(ld->Dims    != NULL);
+
+       // Every extent has to be positive.
+       Assert(ld->Dims[0] > 0);
+       Assert(ld->Dims[1] > 0);
+       Assert(ld->Dims[2] > 0);
+
+       int * dims = ld->Dims;
+
+       const int sizeX = dims[0];
+       const int sizeY = dims[1];
+       const int sizeZ = dims[2];
+
+       // Scratch buffer for one node; the first entry starts at -1.0, the others at zero.
+       PdfT nodePdfs[N_D3Q19] = { -1.0 };
+       PdfT pdfSum = 0.0;
+
+       for (int z = 0; z < sizeZ; ++z) {
+               for (int y = 0; y < sizeY; ++y) {
+                       for (int x = 0; x < sizeX; ++x) {
+                               // Obstacle cells do not contribute to the sum.
+                               if (ld->Lattice[L_INDEX_4(dims, x, y, z)] == LAT_CELL_OBSTACLE) {
+                                       continue;
+                               }
+
+                               kd->GetNode(kd, x, y, z, nodePdfs);
+
+                               for (int d = 0; d < N_D3Q19; ++d) {
+                                       pdfSum += nodePdfs[d];
+                               }
+                       }
+               }
+       }
+
+       // Normalise by the cell count stored in the lattice descriptor.
+       return pdfSum / ld->nFluid;
+}
+
+
+// Initializes all non-obstacle nodes: zero velocity, density interpolated linearly in x from cd->RhoIn to cd->RhoOut.
+void KernelSetInitialDensity(LatticeDesc * ld, KernelData * kd, CaseData * cd)
+{
+       int * lDims = ld->Dims;
+
+       PdfT rho_in = cd->RhoIn;
+       PdfT rho_out = cd->RhoOut;
+
+       PdfT ux = 0.0;
+       PdfT uy = 0.0;
+       PdfT uz = 0.0;
+       PdfT dens = 1.0;
+
+       PdfT omega = cd->Omega;
+
+       PdfT w_0 = 1.0 /  3.0;
+       PdfT w_1 = 1.0 / 18.0;
+       PdfT w_2 = 1.0 / 36.0;
+
+       PdfT dir_indep_trm;
+       PdfT omega_w0  = 3.0 * w_0 * omega;
+       PdfT omega_w1  = 3.0 * w_1 * omega;
+       PdfT omega_w2  = 3.0 * w_2 * omega;
+       PdfT one_third = 1.0 / 3.0;
+
+       int nX = lDims[0];
+       int nY = lDims[1];
+       int nZ = lDims[2];
+
+       PdfT pdfs[N_D3Q19];
+       // dens, dir_indep_trm and pdfs are written in every iteration, so each thread needs its own copy.
+       #ifdef _OPENMP
+               #pragma omp parallel for collapse(3) private(dens, dir_indep_trm, pdfs)
+       #endif
+       for(int z = 0; z < nZ; ++z) { for(int y = 0; y < nY; ++y) { for(int x = 0; x < nX; ++x) {
+
+               if (ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) {
+       // TODO: fix later.
+       //              if((caseData->geoType == GEO_TYPE_CHANNEL) || (caseData->geoType == GEO_TYPE_RCHANNEL))
+                               dens = rho_in + (rho_out - rho_in)*(x)/(nX-1.0);
+                       // NOTE(review): nX == 1 makes the divisor above zero -- confirm callers never pass such a lattice.
+                       #define SQR(a) ((a)*(a))
+                       // Term shared by the PDFs of all directions.
+                       dir_indep_trm = one_third * dens - 0.5 * (ux * ux + uy * uy + uz * uz);
+
+                       pdfs[D3Q19_C]  = omega_w0 * (dir_indep_trm);
+
+                       pdfs[D3Q19_NW] = omega_w2 * (dir_indep_trm - (ux - uy) + 1.5 * SQR(ux - uy));
+                       pdfs[D3Q19_SE] = omega_w2 * (dir_indep_trm + (ux - uy) + 1.5 * SQR(ux - uy));
+
+                       pdfs[D3Q19_NE] = omega_w2 * (dir_indep_trm + (ux + uy) + 1.5 * SQR(ux + uy));
+                       pdfs[D3Q19_SW] = omega_w2 * (dir_indep_trm - (ux + uy) + 1.5 * SQR(ux + uy));
+
+
+                       pdfs[D3Q19_TW] = omega_w2 * (dir_indep_trm - (ux - uz) + 1.5 * SQR(ux - uz));
+                       pdfs[D3Q19_BE] = omega_w2 * (dir_indep_trm + (ux - uz) + 1.5 * SQR(ux - uz));
+
+                       pdfs[D3Q19_TE] = omega_w2 * (dir_indep_trm + (ux + uz) + 1.5 * SQR(ux + uz));
+                       pdfs[D3Q19_BW] = omega_w2 * (dir_indep_trm - (ux + uz) + 1.5 * SQR(ux + uz));
+
+
+                       pdfs[D3Q19_TS] = omega_w2 * (dir_indep_trm - (uy - uz) + 1.5 * SQR(uy - uz));
+                       pdfs[D3Q19_BN] = omega_w2 * (dir_indep_trm + (uy - uz) + 1.5 * SQR(uy - uz));
+
+                       pdfs[D3Q19_TN] = omega_w2 * (dir_indep_trm + (uy + uz) + 1.5 * SQR(uy + uz));
+                       pdfs[D3Q19_BS] = omega_w2 * (dir_indep_trm - (uy + uz) + 1.5 * SQR(uy + uz));
+
+
+                       pdfs[D3Q19_N]  = omega_w1 * (dir_indep_trm + uy + 1.5 * SQR(uy));
+                       pdfs[D3Q19_S]  = omega_w1 * (dir_indep_trm - uy + 1.5 * SQR(uy));
+
+                       pdfs[D3Q19_E]  = omega_w1 * (dir_indep_trm + ux + 1.5 * SQR(ux));
+                       pdfs[D3Q19_W]  = omega_w1 * (dir_indep_trm - ux + 1.5 * SQR(ux));
+
+                       pdfs[D3Q19_T]  = omega_w1 * (dir_indep_trm + uz + 1.5 * SQR(uz));
+                       pdfs[D3Q19_B]  = omega_w1 * (dir_indep_trm - uz + 1.5 * SQR(uz));
+
+                       // All 19 entries are set above; store the complete node.
+                       kd->SetNode(kd, x, y, z, pdfs);
+
+                       #undef SQR
+               }
+       } } }
+}
+
+
+// Re-initializes every LAT_CELL_FLUID node with PDFs for density 1.0 and the velocity (ux, uy, uz) set inside the loop.
+void KernelSetInitialVelocity(LatticeDesc * ld, KernelData * kd, CaseData * cd)
+{
+
+       int * lDims = ld->Dims;
+
+       // TODO: ux is overridden below...
+       PdfT ux = 0.09; // caseData->initUx;
+       PdfT uy = 0.0; // caseData->initUy;
+       PdfT uz = 0.0; // caseData->initUz;
+       PdfT dens = 1.0;
+
+       PdfT omega = cd->Omega;
+
+       PdfT w_0 = 1.0 /  3.0;
+       PdfT w_1 = 1.0 / 18.0;
+       PdfT w_2 = 1.0 / 36.0;
+
+       PdfT dir_indep_trm;
+       PdfT omega_w0  = 3.0 * w_0 * omega;
+       PdfT omega_w1  = 3.0 * w_1 * omega;
+       PdfT omega_w2  = 3.0 * w_2 * omega;
+       PdfT one_third = 1.0 / 3.0;
+
+       int nX = lDims[0];
+       int nY = lDims[1];
+       int nZ = lDims[2];
+
+       PdfT pdfs[N_D3Q19];
+
+       PdfT density;
+       // Everything written inside the loop must be private per thread; dens is only read and stays shared.
+       #ifdef _OPENMP
+               #pragma omp parallel for collapse(3) private(ux, uy, uz, density, dir_indep_trm, pdfs)
+       #endif
+       for(int z = 0; z < nZ; ++z) { for(int y = 0; y < nY; ++y) { for(int x = 0; x < nX; ++x) {
+
+               if (ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] == LAT_CELL_FLUID) {
+                       // The velocity is reset to zero here, replacing the values from the declarations above.
+                       ux = 0.0;
+                       uy = 0.0;
+                       uz = 0.0;
+
+                       kd->GetNode(kd, x, y, z, pdfs);
+
+                       // Sum of the node's current PDFs; the value is not used by the code below.
+                       density = 0.0;
+
+                       #define X(name, idx, idxinv, _x, _y, _z)        density += pdfs[idx];
+                               D3Q19_LIST
+                       #undef X
+
+                       // Term shared by the PDFs of all directions.
+                       #define SQR(a) ((a)*(a))
+                       dir_indep_trm = one_third * dens - 0.5 * (ux * ux + uy * uy + uz * uz);
+
+                       pdfs[D3Q19_C]  = omega_w0 * (dir_indep_trm);
+
+                       pdfs[D3Q19_NW] = omega_w2 * (dir_indep_trm - (ux - uy) + 1.5 * SQR(ux - uy));
+                       pdfs[D3Q19_SE] = omega_w2 * (dir_indep_trm + (ux - uy) + 1.5 * SQR(ux - uy));
+
+                       pdfs[D3Q19_NE] = omega_w2 * (dir_indep_trm + (ux + uy) + 1.5 * SQR(ux + uy));
+                       pdfs[D3Q19_SW] = omega_w2 * (dir_indep_trm - (ux + uy) + 1.5 * SQR(ux + uy));
+
+
+                       pdfs[D3Q19_TW] = omega_w2 * (dir_indep_trm - (ux - uz) + 1.5 * SQR(ux - uz));
+                       pdfs[D3Q19_BE] = omega_w2 * (dir_indep_trm + (ux - uz) + 1.5 * SQR(ux - uz));
+
+                       pdfs[D3Q19_TE] = omega_w2 * (dir_indep_trm + (ux + uz) + 1.5 * SQR(ux + uz));
+                       pdfs[D3Q19_BW] = omega_w2 * (dir_indep_trm - (ux + uz) + 1.5 * SQR(ux + uz));
+
+
+                       pdfs[D3Q19_TS] = omega_w2 * (dir_indep_trm - (uy - uz) + 1.5 * SQR(uy - uz));
+                       pdfs[D3Q19_BN] = omega_w2 * (dir_indep_trm + (uy - uz) + 1.5 * SQR(uy - uz));
+
+                       pdfs[D3Q19_TN] = omega_w2 * (dir_indep_trm + (uy + uz) + 1.5 * SQR(uy + uz));
+                       pdfs[D3Q19_BS] = omega_w2 * (dir_indep_trm - (uy + uz) + 1.5 * SQR(uy + uz));
+
+
+                       pdfs[D3Q19_N]  = omega_w1 * (dir_indep_trm + uy + 1.5 * SQR(uy));
+                       pdfs[D3Q19_S]  = omega_w1 * (dir_indep_trm - uy + 1.5 * SQR(uy));
+
+                       pdfs[D3Q19_E]  = omega_w1 * (dir_indep_trm + ux + 1.5 * SQR(ux));
+                       pdfs[D3Q19_W]  = omega_w1 * (dir_indep_trm - ux + 1.5 * SQR(ux));
+
+                       pdfs[D3Q19_T]  = omega_w1 * (dir_indep_trm + uz + 1.5 * SQR(uz));
+                       pdfs[D3Q19_B]  = omega_w1 * (dir_indep_trm - uz + 1.5 * SQR(uz));
+
+                       #undef SQR
+
+                       // All 19 entries are set above; store the complete node.
+                       kd->SetNode(kd, x, y, z, pdfs);
+               }
+       } } }
+
+}
+
+// Analytical x velocity of the quadratic flow profile the simulation is validated against.
+//
+// Formula 7 from Kutay et al. "Laboratory validation of lattice Boltzmann method for modeling
+// pore-scale flow in granular materials", doi:10.1016/j.compgeo.2006.08.002,
+// also formula 10 from Pan et al. "An evaluation of lattice Boltzmann equation methods for
+// simulating flow through porous media", doi:10.1016/S0167-5648(04)80040-6.
+//
+// Returns xForce * (maxRadiusSquared - curRadiusSquared) / (2 * viscosity).
+static PdfT CalcXVelForPipeProfile(PdfT maxRadiusSquared, PdfT curRadiusSquared, PdfT xForce, PdfT viscosity)
+{
+       const PdfT radiusTerm  = maxRadiusSquared - curRadiusSquared;
+       const PdfT numerator   = xForce * radiusTerm;
+       const PdfT denominator = 2.0 * viscosity;
+       return numerator / denominator;
+}
+// Stores ux of every node of the y/z plane at x = xPos in outputArray[y * nZ + z]; obstacle nodes get 0.0.
+static void KernelGetXSlice(LatticeDesc * ld, KernelData * kd, CaseData * cd, PdfT * outputArray, int xPos)
+{
+       Assert(ld != NULL);
+       Assert(kd != NULL);
+
+       const int sizeY = ld->Dims[1];
+       const int sizeZ = ld->Dims[2];
+
+       Assert(xPos >= 0);
+       Assert(xPos < ld->Dims[0]);
+
+       // One local per direction: pdf_N, pdf_E, pdf_S, pdf_W, ...
+       #define X(name, idx, idxinv, x, y, z)   PdfT JOIN(pdf_,name);
+       D3Q19_LIST
+       #undef X
+       PdfT nodePdfs[N_D3Q19];
+
+       for (int z = 0; z < sizeZ; ++z) {
+               for (int y = 0; y < sizeY; ++y) {
+                       PdfT * slot = &outputArray[y * sizeZ + z];
+
+                       // Obstacle nodes are reported with zero velocity.
+                       if (ld->Lattice[L_INDEX_4(ld->Dims, xPos, y, z)] == LAT_CELL_OBSTACLE) {
+                               *slot = 0.0;
+                               continue;
+                       }
+
+                       kd->GetNode(kd, xPos, y, z, nodePdfs);
+
+                       #define X(name, idx, idxinv, _x, _y, _z)        JOIN(pdf_,name) = nodePdfs[idx];
+                       D3Q19_LIST
+                       #undef X
+                       UNUSED(pdf_C); UNUSED(pdf_S); UNUSED(pdf_N); UNUSED(pdf_T); UNUSED(pdf_B);
+                       UNUSED(pdf_TN); UNUSED(pdf_BN); UNUSED(pdf_TS); UNUSED(pdf_BS);
+
+                       // PDFs of the E-type directions minus the PDFs of the W-type directions.
+                       PdfT ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE;
+                       ux = ux - pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW;
+
+                       #ifdef VERIFICATION
+                       ux += 0.5 * cd->XForce;
+                       #endif
+
+                       *slot = ux;
+               }
+       }
+
+}
+
+// Verification of channel profile with analytical solution: writes both profiles to
+// flow-profile.dat and returns the L2 norm of the relative error in *errorNorm.
+// Taken from Kutay et al. "Laboratory validation of lattice Boltzmann method for modeling
+// pore-scale flow in granular materials", doi:10.1016/j.compgeo.2006.08.002. and
+// Pan et al. "An evaluation of lattice Boltzmann equation methods for simulating flow
+// through porous media", doi:10.1016/S0167-5648(04)80040-6
+void KernelVerifiy(LatticeDesc * ld, KernelData * kd, CaseData * cd, PdfT * errorNorm)
+{
+       Assert(ld != NULL);
+       Assert(kd != NULL);
+       Assert(cd != NULL);
+       Assert(errorNorm != NULL);
+
+       int nX = ld->Dims[0];
+       int nY = ld->Dims[1];
+       int nZ = ld->Dims[2];
+
+       PdfT omega   = cd->Omega;
+       PdfT viscosity        = (1.0 / omega - 0.5) / 3.0;
+
+       // ux of one y/z cross section, indexed as [y * nZ + z]
+       PdfT * outputArray = (PdfT *)malloc(nZ * nY * sizeof(PdfT));
+       Verify(outputArray != NULL);
+       // Pre-fill with a byte pattern; KernelGetXSlice below overwrites every entry.
+       memset(outputArray, -10, nZ*nY*sizeof(PdfT));
+
+       // uncomment this to get values averaged along the x-axis
+       //AveragePipeCrossSections(ld, kd, outputArray);
+       KernelGetXSlice(ld, kd, cd, outputArray, (int)(nX/2));
+
+
+       FILE * fh;
+       char fileName[1024];
+       PdfT tmpAvgUx = 0.0;
+       PdfT tmpAnalyUx = 0.0;
+       int flagEvenNy = 0;
+       int y = 0;
+       // For even nY the two middle rows y and y+1 are averaged, otherwise the single middle row is used.
+       if (nY % 2 == 0)
+               flagEvenNy = 1;
+
+       y = (nY-flagEvenNy-1)/2;
+
+       snprintf(fileName, sizeof(fileName), "flow-profile.dat");
+
+       printf("# Kernel validation: writing profile to %s\n", fileName);
+
+       fh = fopen(fileName, "w");
+
+       if(fh == NULL) {
+               printf("ERROR: opening file %s failed.\n", fileName);
+               exit(1);
+       }
+       // The slice was taken at x = nX / 2, so the header reports the x position and the x extent.
+       fprintf(fh, "# Flow profile in Z direction. Taken at the middle of the X length (= %d) of total length %d.\n", nX / 2, nX);
+       // fprintf(fh, "# Snapshot taken at iteration %d.\n", iteration);
+       fprintf(fh, "# Plot on terminal: gnuplot -e \"set terminal dumb; plot \\\"%s\\\" u 1:3 t \\\"analytical\\\", \\\"\\\" u 1:4 t \\\"simulation\\\";\"\n", fileName);
+       fprintf(fh, "# Plot graphically: gnuplot -e \"plot \\\"%s\\\" u 1:3 w linesp t \\\"analytical\\\", \\\"\\\" u 1:4 w linesp t \\\"simulation\\\"; pause -1;\"\n", fileName);
+       fprintf(fh, "# z coord., radius, analytic, simulation, diff abs, diff rel, undim_analytic, undim_sim\n");
+       // NOTE(review): center and radius are derived from nY but applied along z -- this assumes nY == nZ, confirm.
+       double deviation = 0.0;
+       double curRadiusSquared;
+       double center = nY / 2.0;
+       double minDiameter = nY;
+       #define SQR(a) ((a)*(a))
+       double minRadiusSquared = SQR(minDiameter / 2.0 - 1.0);
+       #undef SQR
+       double u_max = cd->XForce*minRadiusSquared/(2.0*viscosity);
+
+       for(int z = 0; z < nZ; ++z) {
+
+               fprintf(fh, "%d\t", z);
+
+               #define SQR(a) ((a)*(a))
+               curRadiusSquared = SQR(z-center+0.5);
+
+
+               // dimensionless radius
+               fprintf(fh, "%e\t", (z-center+0.5)/center);
+
+               // analytic profile
+               if(curRadiusSquared >= minRadiusSquared)
+                       tmpAnalyUx = 0.0;
+               else
+                       tmpAnalyUx = CalcXVelForPipeProfile(minRadiusSquared, curRadiusSquared, cd->XForce, viscosity);
+
+               //averaged profile
+               if(flagEvenNy == 1)
+                       tmpAvgUx = (outputArray[y*nZ + z] + outputArray[(y+1)*nZ + z])/2.0;
+               else
+                       tmpAvgUx = outputArray[y*nZ + z];
+
+               fprintf(fh, "%e\t", tmpAnalyUx);
+               fprintf(fh, "%e\t", tmpAvgUx);
+               // Absolute difference, then relative difference (only where the analytic value is non-zero).
+               fprintf(fh, "%e\t", fabs(tmpAnalyUx-tmpAvgUx));
+               if (tmpAnalyUx != 0.0) {
+                       fprintf(fh, "%e\t", fabs(tmpAnalyUx - tmpAvgUx) / tmpAnalyUx);
+                       deviation += SQR(fabs(tmpAnalyUx - tmpAvgUx) / tmpAnalyUx);
+               }
+               else {
+                       fprintf(fh, "0.0\t");
+               }
+
+               fprintf(fh, "%e\t", tmpAnalyUx / u_max);
+               fprintf(fh, "%e\t", tmpAvgUx / u_max);
+               fprintf(fh, "\n");
+
+               #undef SQR
+       }
+
+       *errorNorm = sqrt(deviation);
+
+       printf("# Kernel validation: L2 error norm of relative error: %e\n", *errorNorm);
+
+
+       fclose(fh);
+       free(outputArray);
+
+
+}
+
+// Same as KernelStatisticsAdv with forceOutput = 0.
+void KernelStatistics(KernelData * kd, LatticeDesc * ld, CaseData * cd, int iteration)
+{
+       KernelStatisticsAdv(kd, ld, cd, iteration, 0);
+}
+// Prints the average density and, every 10th iteration or when forceOutput is set, updates ux-progress.dat and density-ux.dat.
+void KernelStatisticsAdv(KernelData * kd, LatticeDesc * ld, CaseData * cd, int iteration, int forceOutput)
+{
+       if (iteration % cd->StatisticsModulus == 0 || forceOutput) {
+               printf("# iter: %4d   avg density: %e\n", iteration, KernelDensity(kd, ld));
+       }
+       // NOTE(review): cd->StatisticsModulus == 0 would divide by zero above -- confirm callers never pass it.
+       if (iteration % 10 != 0 && !forceOutput) {
+               return;
+       }
+
+       int nX = ld->Dims[0];
+       int nY = ld->Dims[1];
+       int nZ = ld->Dims[2];
+       // Cross section used for the progress file: the middle of the x extent.
+       int x = nX / 2;
+
+       PdfT pdfs[N_D3Q19];
+
+       // ----------------------------------------------------------------------
+       // velocity in x-direction in cross section appended for each iteration
+
+       double density;
+       double densitySum;
+       double ux;
+       double uxSum = 0.0;
+       int nFluidNodes = 0;
+       // Sum ux over all non-obstacle nodes of the cross section and count them.
+       for (int y = 0; y < nY; ++y) {
+               for (int z = 0; z < nZ; ++z) {
+
+                       if (ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) {
+                               kd->GetNode(kd, x, y, z, pdfs);
+
+                               ux = pdfs[D3Q19_E] + pdfs[D3Q19_NE] + pdfs[D3Q19_SE] + pdfs[D3Q19_TE] + pdfs[D3Q19_BE] -
+                                        pdfs[D3Q19_W] - pdfs[D3Q19_NW] - pdfs[D3Q19_SW] - pdfs[D3Q19_TW] - pdfs[D3Q19_BW];
+
+                               uxSum += ux;
+                               ++nFluidNodes;
+                       }
+               }
+       }
+       // Iteration 0 starts a new file, later iterations append to it.
+       const char * mode = "w";
+
+       if (iteration > 0) {
+               mode = "a";
+       }
+
+       const char * fileName = "ux-progress.dat";
+       FILE * fh;
+
+       fh = fopen(fileName, mode);
+
+       if(fh == NULL) {
+               printf("ERROR: opening file %s failed.\n", fileName);
+               exit(1);
+       }
+
+       if (iteration == 0) {
+               fprintf(fh, "# Average velocity in x direction of cross section in the middle (x = %d) of the geometry (NX = %d).\n", x, nX);
+               fprintf(fh, "# Plot on terminal: gnuplot -e \"set terminal dumb; plot \\\"%s\\\";\"\n", fileName);
+               fprintf(fh, "# iteration, avg ux\n");
+       }
+
+       fprintf(fh, "%d %e\n", iteration, uxSum / nFluidNodes);
+
+       fclose(fh);
+
+       // ----------------------------------------------------------------------
+       // average velocity/density for each in cross section in x direction
+
+       fileName = "density-ux.dat";
+       // This file is opened with "w", i.e. it is overwritten on every call that gets here.
+       fh = fopen(fileName, "w");
+
+       if(fh == NULL) {
+               printf("ERROR: opening file %s failed.\n", fileName);
+               exit(1);
+       }
+
+       fprintf(fh, "# Average density and average x velocity over each cross section in x direction. Snapshot taken at iteration %d.\n", iteration);
+       fprintf(fh, "# Plot on terminal: gnuplot -e \"set terminal dumb; plot \\\"%s\\\" u 1:2; plot \\\"%s\\\" u 1:3;\"\n", fileName, fileName);
+//     fprintf(fh, "# Plot graphically: gnuplot -e \"plot \\\"%s\\\" u 1:3 w linesp t \\\"l\\\", \\\"\\\" u 1:4 w linesp t \\\"simulation\\\"; pause -1;"
+       fprintf(fh, "# x, avg density, avg ux\n");
+       // One output line per x position; the sums are reset for every cross section.
+       for (x = 0; x < nX; ++x) {
+
+               uxSum = 0.0;
+               densitySum = 0.0;
+               nFluidNodes = 0;
+
+               for (int y = 0; y < nY; ++y) {
+                       for (int z = 0; z < nZ; ++z) {
+
+                               if (ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] == LAT_CELL_OBSTACLE) {
+                                       continue;
+                               }
+
+                               kd->GetNode(kd, x, y, z, pdfs);
+
+                               density =
+                                       pdfs[D3Q19_C] +
+                                       pdfs[D3Q19_N]  + pdfs[D3Q19_E]  + pdfs[D3Q19_S]  + pdfs[D3Q19_W]  +
+                                       pdfs[D3Q19_NE] + pdfs[D3Q19_SE] + pdfs[D3Q19_SW] + pdfs[D3Q19_NW] +
+                                       pdfs[D3Q19_T]  + pdfs[D3Q19_TN] + pdfs[D3Q19_TE] + pdfs[D3Q19_TS] + pdfs[D3Q19_TW] +
+                                       pdfs[D3Q19_B]  + pdfs[D3Q19_BN] + pdfs[D3Q19_BE] + pdfs[D3Q19_BS] + pdfs[D3Q19_BW];
+
+                               densitySum += density;
+
+                               ux =
+                                       pdfs[D3Q19_E] + pdfs[D3Q19_NE] + pdfs[D3Q19_SE] + pdfs[D3Q19_TE] + pdfs[D3Q19_BE] -
+                                       pdfs[D3Q19_W] - pdfs[D3Q19_NW] - pdfs[D3Q19_SW] - pdfs[D3Q19_TW] - pdfs[D3Q19_BW];
+
+                               uxSum += ux;
+
+                               ++nFluidNodes;
+                       }
+               }
+
+               fprintf(fh, "%d  %e  %e\n", x, densitySum / nFluidNodes, uxSum / nFluidNodes);
+       }
+
+       fclose(fh);
+}
+
+
+// Adds the body force cd->XForce in x direction to the PDFs of every non-obstacle node.
+void KernelAddBodyForce(KernelData * kd, LatticeDesc * ld, CaseData * cd)
+{
+       Assert(kd != NULL);
+       Assert(ld != NULL);
+       Assert(cd != NULL);
+
+       int nX = kd->Dims[0], nY = kd->Dims[1], nZ = kd->Dims[2];
+       PdfT xForce = cd->XForce;
+
+       // Lattice weights: wC for C, wAxis for N,S,E,W,T,B and wDiag for the 12 diagonal directions.
+       PdfT wC    = 1.0 /  3.0;
+       PdfT wAxis = 1.0 / 18.0;
+       PdfT wDiag = 1.0 / 36.0;
+
+       // Weight of every direction, indexed by the direction index d used in the loop below.
+       PdfT w[] = {
+               wAxis, wAxis, wAxis, wAxis, wDiag, wDiag, wDiag, wDiag,
+               wAxis, wDiag, wDiag, wDiag, wDiag,
+               wAxis, wDiag, wDiag, wDiag, wDiag,
+               wC
+       };
+       PdfT pdfs[N_D3Q19];
+
+       #ifdef _OPENMP
+       #pragma omp parallel for collapse(3) default(none) \
+                       shared(nX,nY,nZ,ld,kd,w,xForce,D3Q19_X,cd) \
+                       private(pdfs)
+       #endif
+       for (int z = 0; z < nZ; ++z) {
+               for (int y = 0; y < nY; ++y) {
+                       for (int x = 0; x < nX; ++x) {
+                               if (ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) {
+                                       // Fetch the node, shift every PDF and store the node again.
+                                       kd->GetNode(kd, x, y, z, pdfs);
+
+                                       // Body force in x direction (method by Luo).
+                                       for (int d = 0; d < N_D3Q19; ++d) {
+                                               pdfs[d] += 3.0*w[d]*D3Q19_X[d]*xForce;
+                                       }
+                                       kd->SetNode(kd, x, y, z, pdfs);
+                               }
+                       }
+               }
+       }
+}
diff --git a/src/Kernel.h b/src/Kernel.h
new file mode 100644 (file)
index 0000000..eddae64
--- /dev/null
@@ -0,0 +1,217 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#ifndef __KERNEL_H__
+#define __KERNEL_H__
+
+#include "Base.h"
+#include "Lattice.h"
+
+// DATA_LAYOUT_NAME and PROP_MODEL_NAME are derived below from the
+// DATA_LAYOUT_* / PROP_MODEL_* selection macros, so they must not be
+// predefined by the including file or on the command line.
+#ifdef DATA_LAYOUT_NAME
+       #error DATA_LAYOUT_NAME must not be defined here.
+#endif
+
+#ifdef PROP_MODEL_NAME
+       #error PROP_MODEL_NAME must not be defined here.
+#endif
+
+
+// Data layout selection: DATA_LAYOUT_SOA -> SoA, DATA_LAYOUT_AOS -> AoS.
+// This header does not check that exactly one of them is defined; with none
+// defined DATA_LAYOUT_NAME stays undefined.
+#ifdef DATA_LAYOUT_SOA
+       #define DATA_LAYOUT_NAME SoA
+#endif
+
+#ifdef DATA_LAYOUT_AOS
+       #define DATA_LAYOUT_NAME AoS
+#endif
+
+// Propagation model selection: PROP_MODEL_PUSH -> Push, PROP_MODEL_PULL -> Pull,
+// PROP_MODEL_AA -> AA. As above, no exclusivity check is performed here.
+#ifdef PROP_MODEL_PUSH
+       #define PROP_MODEL_NAME Push
+#endif
+
+#ifdef PROP_MODEL_PULL
+       #define PROP_MODEL_NAME Pull
+#endif
+
+#ifdef PROP_MODEL_AA
+       #define PROP_MODEL_NAME AA
+#endif
+
+
+
+// Floating point type used for all PDF values and case parameters.
+typedef double         PdfT;
+
+
+
+// Marker macro for the D3Q19 model (defined without a value).
+#define D3Q19
+
+
+// Number of directions (PDFs per node), including the center.
+#define N_D3Q19                19
+
+// Direction indices 0 ... 18. These values index per-direction arrays such as
+// D3Q19_X/Y/Z, D3Q19_INV and D3Q19_NAMES declared further below.
+#define D3Q19_N                0
+#define D3Q19_S                1
+#define D3Q19_E                2
+#define D3Q19_W                3
+
+#define D3Q19_NE       4
+#define D3Q19_SE       5
+#define D3Q19_NW       6
+#define D3Q19_SW       7
+
+#define D3Q19_T                8
+#define D3Q19_TN       9
+#define D3Q19_TE       10
+#define D3Q19_TW       11
+#define D3Q19_TS       12
+
+#define D3Q19_B                13
+#define D3Q19_BS       14
+#define D3Q19_BN       15
+#define D3Q19_BW       16
+#define D3Q19_BE       17
+
+#define D3Q19_C                18              // IMPORTANT: Center particle must be the last one.
+
+// ---
+
+// D3Q19_LIST is an X-macro table: the user defines X(...) before expanding the
+// list, hence X must not be defined at this point.
+#ifdef X
+       #error X is not allowed to be defined here!
+#endif
+
+// The following list must be sorted ascending according
+// to the index of the direction, i.e. D3Q19_N, D3Q19_S, ...
+//
+// Columns of each entry:
+//   X(name, index, index of the opposite direction, x, y, z)
+// where x, y, z are the integer components of the direction
+// (E = +x, N = +y, T = +z).
+// NOTE(review): presumably expanded to fill D3Q19_X/Y/Z, D3Q19_INV and
+// D3Q19_NAMES - the expansion sites are not part of this header; confirm.
+#define D3Q19_LIST \
+       X(N,  D3Q19_N,  D3Q19_S,          0,  1,  0) \
+       X(S,  D3Q19_S,  D3Q19_N,          0, -1,  0) \
+       X(E,  D3Q19_E,  D3Q19_W,          1,  0,  0) \
+       X(W,  D3Q19_W,  D3Q19_E,         -1,  0,  0) \
+       X(NE, D3Q19_NE, D3Q19_SW,         1,  1,  0) \
+       X(SE, D3Q19_SE, D3Q19_NW,         1, -1,  0) \
+       X(NW, D3Q19_NW, D3Q19_SE,        -1,  1,  0) \
+       X(SW, D3Q19_SW, D3Q19_NE,        -1, -1,  0) \
+       X(T,  D3Q19_T,  D3Q19_B,          0,  0,  1) \
+       X(TN, D3Q19_TN, D3Q19_BS,         0,  1,  1) \
+       X(TE, D3Q19_TE, D3Q19_BW,     1,  0,  1) \
+       X(TW, D3Q19_TW, D3Q19_BE,    -1,  0,  1) \
+       X(TS, D3Q19_TS, D3Q19_BN,     0, -1,  1) \
+       X(B,  D3Q19_B,  D3Q19_T,          0,  0, -1) \
+       X(BS, D3Q19_BS, D3Q19_TN,     0, -1, -1) \
+       X(BN, D3Q19_BN, D3Q19_TS,     0,  1, -1) \
+       X(BW, D3Q19_BW, D3Q19_TE,    -1,  0, -1) \
+       X(BE, D3Q19_BE, D3Q19_TW,     1,  0, -1) \
+       X(C,  D3Q19_C,  D3Q19_C,          0,  0,  0)
+
+// Same table as D3Q19_LIST but without the center direction (C): the 18
+// directions N ... BE with identical columns
+//   X(name, index, index of the opposite direction, x, y, z).
+// Both lists are maintained by hand and have to be kept in sync.
+#define D3Q19_LIST_WO_C \
+       X(N,  D3Q19_N,  D3Q19_S,          0,  1,  0) \
+       X(S,  D3Q19_S,  D3Q19_N,          0, -1,  0) \
+       X(E,  D3Q19_E,  D3Q19_W,          1,  0,  0) \
+       X(W,  D3Q19_W,  D3Q19_E,         -1,  0,  0) \
+       X(NE, D3Q19_NE, D3Q19_SW,         1,  1,  0) \
+       X(SE, D3Q19_SE, D3Q19_NW,         1, -1,  0) \
+       X(NW, D3Q19_NW, D3Q19_SE,        -1,  1,  0) \
+       X(SW, D3Q19_SW, D3Q19_NE,        -1, -1,  0) \
+       X(T,  D3Q19_T,  D3Q19_B,          0,  0,  1) \
+       X(TN, D3Q19_TN, D3Q19_BS,         0,  1,  1) \
+       X(TE, D3Q19_TE, D3Q19_BW,     1,  0,  1) \
+       X(TW, D3Q19_TW, D3Q19_BE,    -1,  0,  1) \
+       X(TS, D3Q19_TS, D3Q19_BN,     0, -1,  1) \
+       X(B,  D3Q19_B,  D3Q19_T,          0,  0, -1) \
+       X(BS, D3Q19_BS, D3Q19_TN,     0, -1, -1) \
+       X(BN, D3Q19_BN, D3Q19_TS,     0,  1, -1) \
+       X(BW, D3Q19_BW, D3Q19_TE,    -1,  0, -1) \
+       X(BE, D3Q19_BE, D3Q19_TW,     1,  0, -1)
+
+
+// Per-direction lookup tables with N_D3Q19 entries each, indexed by the
+// D3Q19_* direction indices. They are only declared here; the definitions
+// live in another translation unit.
+// NOTE(review): presumably X/Y/Z hold the direction components and INV the
+// index of the opposite direction, as listed in D3Q19_LIST - confirm against
+// the definitions.
+extern int D3Q19_X[N_D3Q19];
+extern int D3Q19_Y[N_D3Q19];
+extern int D3Q19_Z[N_D3Q19];
+extern int D3Q19_INV[N_D3Q19];
+
+// NOTE(review): presumably the printable name of each direction - confirm.
+extern const char * D3Q19_NAMES[N_D3Q19];
+
+
+
+// Parameters of the simulated case. Main.c fills this structure with
+// defaults and overrides them from the command line.
+typedef struct CaseData_ {
+       PdfT Omega;                     // Set via -omega. Presumably the relaxation parameter - confirm.
+       PdfT RhoIn;                     // Density at the inlet (-rho-in).
+       PdfT RhoOut;            // Density at the outlet (-rho-out).
+       PdfT Ux;                        // NOTE(review): presumably velocity components; usage not
+       PdfT Uy;                        // visible in this header - confirm.
+       PdfT Uz;
+       PdfT XForce;            // Body force in x direction (-x-force), applied by KernelAddBodyForce.
+       int MaxIterations;      // Number of iterations to run (-iterations).
+       int VtkOutput;          // Non-zero when VTK output was requested (-vtk).
+       int VtkModulus;         // VTK output is written every VtkModulus iterations.
+       int StatisticsModulus;  // Statistics are gathered every StatisticsModulus iterations.
+} CaseData;
+
+
+// State shared between the generic driver code and a concrete kernel
+// implementation, together with the callbacks a kernel has to provide.
+// NOTE(review): the code that fills this structure is not part of this header;
+// field descriptions marked "presumably" are inferred from the names only.
+typedef struct KernelData_ {
+       PdfT * Pdfs[2];                 // Presumably the two PDF arrays (source/destination) - confirm.
+       PdfT * SrcPdfs;                 // Presumably the array read in the current iteration - confirm.
+       PdfT * DstPdfs;                 // Presumably the array written in the current iteration - confirm.
+       PdfT * PdfsActive;              // Presumably the array holding the current state - confirm.
+       int Dims[3];                    // Extents in x, y, z; used as loop bounds by KernelAddBodyForce.
+       int GlobalDims[3];              // Presumably the extents including padding/ghost layers - confirm.
+       int Offsets[3];                 // Presumably the offsets of Dims inside GlobalDims - confirm.
+       int * ObstIndices;              // Presumably indices of obstacle nodes, nObstIndices entries - confirm.
+       int nObstIndices;
+       int * BounceBackPdfsSrc;        // Presumably source/destination PDF indices for bounce back,
+       int * BounceBackPdfsDst;        // nBounceBackPdfs entries each - confirm.
+       int nBounceBackPdfs;
+
+       // Read (via *pdf) or write a single PDF of direction dir at node (x, y, z).
+       // NOTE(review): intended for the boundary condition code according to the names - confirm.
+       void (* BoundaryConditionsGetPdf)(struct KernelData_ * kd, int x, int y, int z, int dir, PdfT * pdf);
+       void (* BoundaryConditionsSetPdf)(struct KernelData_ * kd, int x, int y, int z, int dir, PdfT pdf);
+
+       // Copy the PDFs of node (x, y, z) into pdfs (GetNode) or from pdfs back to
+       // the node (SetNode). KernelAddBodyForce passes an array of N_D3Q19 entries.
+       void (* GetNode)(struct KernelData_ * kd, int x, int y, int z, PdfT * pdfs);
+       void (* SetNode)(struct KernelData_ * kd, int x, int y, int z, PdfT * pdfs);
+
+       // Entry point of the kernel itself.
+       void (* Kernel)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd);
+
+} KernelData;
+
+// Command line arguments handed to the Init function of a kernel.
+typedef struct Parameters_ {
+       int nArgs;                      // argc of the program.
+       char ** Args;           // argv of the program.
+       int nKernelArgs;        // Number of arguments following "--" on the command line (0 if none).
+       char ** KernelArgs;     // First argument following "--", or NULL if there is none.
+} Parameters;
+
+// Generic helper routines operating on a kernel's data via the callbacks in
+// KernelData. Only KernelAddBodyForce is described from its implementation;
+// for the others see the definitions.
+// NOTE(review): the parameter order differs between the functions
+// (kd, ld, cd vs. ld, kd, cd).
+void KernelComputeBoundaryConditions(KernelData * kd, LatticeDesc * ld, CaseData * cd);
+
+PdfT KernelDensity(KernelData * kd, LatticeDesc * ld);
+
+void KernelStatistics(KernelData * kd, LatticeDesc * ld, CaseData * cd, int iteration);
+void KernelStatisticsAdv(KernelData * kd, LatticeDesc * ld, CaseData * cd, int iteration, int forceOutput);
+
+
+void KernelSetInitialDensity (LatticeDesc * ld, KernelData * kd, CaseData * cd);
+void KernelSetInitialVelocity(LatticeDesc * ld, KernelData * kd, CaseData * cd);
+
+// The name is spelled "Verifiy" (sic); it is part of the public interface.
+void KernelVerifiy(LatticeDesc * ld, KernelData * kd, CaseData * cd, PdfT * errorNorm);
+
+// Adds cd->XForce as body force in x direction to the PDFs of all nodes that
+// are not obstacles.
+void KernelAddBodyForce(KernelData * kd, LatticeDesc * ld, CaseData * cd);
+
+#endif // __KERNEL_H__
diff --git a/src/KernelFunctions.h b/src/KernelFunctions.h
new file mode 100644 (file)
index 0000000..2ee063a
--- /dev/null
@@ -0,0 +1,138 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#ifndef __KERNEL_FUNCTIONS_H__
+#define __KERNEL_FUNCTIONS_H__
+
+#include "BenchKernelD3Q19.h"
+#include "BenchKernelD3Q19List.h"
+#include "BenchKernelD3Q19ListAa.h"
+#include "BenchKernelD3Q19ListAaRia.h"
+#include "BenchKernelD3Q19ListAaPv.h"
+#include "BenchKernelD3Q19ListPullSplitNt.h"
+
+// Describes one selectable kernel: its name and the functions which set up
+// and tear down its KernelData.
+typedef struct KernelFunctions_
+{
+       char Name[128];         // Name of the kernel as printed by -list.
+       // Creates the kernel data for the lattice ld and returns it via kernelData.
+       void (* Init)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params);
+       // Counterpart of Init; receives the kernel data created by Init.
+       void (* Deinit)(LatticeDesc * ld, KernelData ** kernelData);
+} KernelFunctions;
+
+// Table of all kernels that can be selected. Main.c prints the Name of each
+// entry for -list; presumably the -kernel argument is matched against Name as
+// well (the lookup is not visible here - confirm).
+// Both "list-pull-split-nt-*" entries share the same Deinit function.
+//
+// NOTE(review): this is a definition (not only a declaration) without static
+// inside a header. Including this header from more than one translation unit
+// results in duplicate definitions of g_kernels at link time.
+KernelFunctions g_kernels[] =
+{
+       {
+               .Name   = "list-aa-pv-soa",
+               .Init   = D3Q19ListAaPvInit_PushSoA,
+               .Deinit = D3Q19ListAaPvDeinit_PushSoA
+       },
+       {
+               .Name   = "list-aa-ria-soa",
+               .Init   = D3Q19ListAaRiaInit_PushSoA,
+               .Deinit = D3Q19ListAaRiaDeinit_PushSoA
+       },
+       {
+               .Name   = "list-aa-soa",
+               .Init   = D3Q19ListAaInit_PushSoA,
+               .Deinit = D3Q19ListAaDeinit_PushSoA
+       },
+       {
+               .Name   = "list-aa-aos",
+               .Init   = D3Q19ListAaInit_PushAoS,
+               .Deinit = D3Q19ListAaDeinit_PushAoS
+       },
+       {
+               .Name   = "list-pull-split-nt-1s-soa",
+               .Init   = D3Q19ListPullSplitNt1SInit_PullSoA,
+               .Deinit = D3Q19ListPullSplitNtDeinit_PullSoA
+       },
+       {
+               .Name   = "list-pull-split-nt-2s-soa",
+               .Init   = D3Q19ListPullSplitNt2SInit_PullSoA,
+               .Deinit = D3Q19ListPullSplitNtDeinit_PullSoA
+       },
+       {
+               .Name   = "list-push-soa",
+               .Init   = D3Q19ListInit_PushSoA,
+               .Deinit = D3Q19ListDeinit_PushSoA
+       },
+       {
+               .Name   = "list-push-aos",
+               .Init   = D3Q19ListInit_PushAoS,
+               .Deinit = D3Q19ListDeinit_PushAoS
+       },
+       {
+               .Name   = "list-pull-soa",
+               .Init   = D3Q19ListInit_PullSoA,
+               .Deinit = D3Q19ListDeinit_PullSoA
+       },
+       {
+               .Name = "list-pull-aos",
+               .Init = D3Q19ListInit_PullAoS,
+               .Deinit = D3Q19ListDeinit_PullAoS
+       },
+       {
+               .Name   = "push-soa",
+               .Init   = D3Q19Init_PushSoA,
+               .Deinit = D3Q19Deinit_PushSoA
+       },
+       {
+               .Name   = "push-aos",
+               .Init   = D3Q19Init_PushAoS,
+               .Deinit = D3Q19Deinit_PushAoS
+       },
+       {
+               .Name   = "pull-soa",
+               .Init   = D3Q19Init_PullSoA,
+               .Deinit = D3Q19Deinit_PullSoA
+       },
+       {
+               .Name   = "pull-aos",
+               .Init   = D3Q19Init_PullAoS,
+               .Deinit = D3Q19Deinit_PullAoS
+       },
+       {
+               .Name   = "blk-push-soa",
+               .Init   = D3Q19BlkInit_PushSoA,
+               .Deinit = D3Q19BlkDeinit_PushSoA
+       },
+       {
+               .Name   = "blk-push-aos",
+               .Init   = D3Q19BlkInit_PushAoS,
+               .Deinit = D3Q19BlkDeinit_PushAoS
+       },
+       {
+               .Name   = "blk-pull-soa",
+               .Init   = D3Q19BlkInit_PullSoA,
+               .Deinit = D3Q19BlkDeinit_PullSoA
+       },
+       {
+               .Name   = "blk-pull-aos",
+               .Init   = D3Q19BlkInit_PullAoS,
+               .Deinit = D3Q19BlkDeinit_PullAoS
+       },
+};
+
+#endif // __KERNEL_FUNCTIONS_H__
diff --git a/src/Lattice.h b/src/Lattice.h
new file mode 100644 (file)
index 0000000..99b3fc3
--- /dev/null
@@ -0,0 +1,78 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#ifndef __LATTICE_H__
+#define __LATTICE_H__
+
+#include "Base.h"
+
+
+// Type of one entry of LatticeDesc::Lattice; holds a LAT_CELL_* value.
+typedef int            LatticeT;
+
+// Cell types stored in the lattice. Obstacle is 0; the remaining types use
+// the distinct values 1, 2 and 4.
+typedef enum LAT_CELL_TYPES_ {
+       LAT_CELL_OBSTACLE       = 0,
+       LAT_CELL_FLUID          = 1,
+       LAT_CELL_INLET          = 2,
+       LAT_CELL_OUTLET         = 4
+} LAT_CELL_TYPES;
+
+
+// Description of the lattice (geometry): its extents, the cell type of every
+// node, node counts per type, and the periodicity flags.
+typedef struct LatticeDesc_ {
+       int Dims[3];            // Extents in x, y, z; used with L_INDEX_4 to index Lattice.
+       LatticeT * Lattice;     // Cell type (LAT_CELL_*) of every node, addressed via L_INDEX_4.
+       int nCells;                     // Total number of cells (Obstacles + Fluids).
+       int nFluid;                     // Total number of fluid cells. Fluid cells are fluid, inlet, and outlet.
+       int nObst;                      // Total number of obstacles in the lattice.
+       int nInlet;                     // Total number of inlets in the lattice.
+       int nOutlet;            // Total number of outlets.
+       int PeriodicX;          // Periodic in X direction.
+       int PeriodicY;          // Periodic in Y direction.
+       int PeriodicZ;          // Periodic in Z direction.
+} LatticeDesc;
+
+
+// Former macro variant, kept for reference:
+// #define L_INDEX_4(dims, x, y, z)    ((z) * (dims[0]) * (dims[1]) + (y) * (dims[0]) + (x))
+
+// Maps the node coordinate (x, y, z) to its linear index inside a lattice
+// with the extents dims[0] x dims[1] x dims[2]. x is the fastest running
+// coordinate, z the slowest. Extents and coordinate are checked via Assert.
+static inline int L_INDEX_4(int dims[3], int x, int y, int z)
+{
+       Assert(dims != NULL);
+
+       // All extents must be positive.
+       Assert(dims[0] > 0);
+       Assert(dims[1] > 0);
+       Assert(dims[2] > 0);
+
+       // The coordinate must lie inside the lattice.
+       Assert(x >= 0);
+       Assert(x < dims[0]);
+       Assert(y >= 0);
+       Assert(y < dims[1]);
+       Assert(z >= 0);
+       Assert(z < dims[2]);
+
+       // Offset of the x-y plane z and of the row y within that plane.
+       const int planeOffset = z * dims[0] * dims[1];
+       const int rowOffset   = y * dims[0];
+
+       return planeOffset + rowOffset + x;
+}
+
+
+#endif // __LATTICE_H__
diff --git a/src/LikwidIf.h b/src/LikwidIf.h
new file mode 100644 (file)
index 0000000..e3c24da
--- /dev/null
@@ -0,0 +1,46 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#ifndef __LIKWID_IF_H__
+#define __LIKWID_IF_H__
+
+// Thin wrappers around the LIKWID marker API. With HAVE_LIKWID defined they
+// forward to the likwid_marker* functions, otherwise they expand to nothing,
+// so call sites do not need their own #ifdef.
+#ifdef HAVE_LIKWID
+       #include <likwid.h>
+
+
+       #define X_LIKWID_INIT()                                         likwid_markerInit()
+       #define X_LIKWID_DEINIT()                                       likwid_markerClose()
+    #define X_LIKWID_START(region_name)                        likwid_markerStartRegion(region_name)
+    #define X_LIKWID_STOP(region_name)                 likwid_markerStopRegion(region_name)
+#else
+
+       // LIKWID support disabled: all markers are no-ops.
+       #define X_LIKWID_INIT()
+       #define X_LIKWID_DEINIT()
+    #define X_LIKWID_START(region_name)
+    #define X_LIKWID_STOP(region_name)
+#endif
+
+#endif // __LIKWID_IF_H__
diff --git a/src/Main.c b/src/Main.c
new file mode 100644 (file)
index 0000000..1525ad5
--- /dev/null
@@ -0,0 +1,589 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#include <errno.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>  // strcasecmp
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#include "Base.h"
+#include "Kernel.h"
+#include "Memory.h"
+
+#include "Lattice.h"
+#include "Geometry.h"
+#include "Pinning.h"
+#include "LikwidIf.h"
+
+#include "KernelFunctions.h"
+
+#ifdef __x86_64__
+       #include <xmmintrin.h>
+
+       // Bit positions inside the MXCSR register that are queried below:
+       // denormals-are-zero (DAZ) and flush-to-zero (FTZ).
+       #define MXCSR_DAZ                                       6
+       #define MXCSR_FTZ                                       15
+
+       // Returns 1 if all bits of mask are set in the current MXCSR value and
+       // 0 otherwise.
+       int FpIsMxCsrMaskSet(unsigned int mask)
+       {
+               const unsigned int selected = _mm_getcsr() & mask;
+
+               return selected == mask;
+       }
+
+       // Returns 1 if the FTZ bit of MXCSR is set, 0 otherwise.
+       int FpGetFtz()
+       {
+               const unsigned int ftzBit = 1 << MXCSR_FTZ;
+
+               return FpIsMxCsrMaskSet(ftzBit);
+       }
+
+       // Returns 1 if the DAZ bit of MXCSR is set, 0 otherwise.
+       int FpGetDaz()
+       {
+               const unsigned int dazBit = 1 << MXCSR_DAZ;
+
+               return FpIsMxCsrMaskSet(dazBit);
+       }
+#endif
+
+
+// Parses the leading decimal number of str as the extent of one dimension.
+//
+// str    Text starting with the number (leading white space is skipped).
+// axis   Axis label ("X", "Y" or "Z") used in the error messages.
+// value  Receives the parsed extent; only written on success.
+//
+// Returns 1 on success. Returns 0 after printing an error message if no
+// positive number is found or if the number does not fit into an int.
+static int ParseDimensionValue(const char * str, const char * axis, int * value)
+{
+       char * end = NULL;
+       long parsed;
+
+       errno = 0;
+       parsed = strtol(str, &end, 10);
+
+       // Text without leading digits counts as 0 (as it did with atoi()) and is
+       // rejected together with all other non-positive values.
+       if (end == str || parsed <= 0) {
+               printf("ERROR: parameter for %s dimension must be > 0.\n", axis);
+               return 0;
+       }
+
+       // atoi() has undefined behavior for values outside the range of int and
+       // could silently wrap them around. strtol() reports the overflow, so such
+       // values are rejected explicitly.
+       if (errno == ERANGE || parsed > INT_MAX) {
+               printf("ERROR: parameter for %s dimension is too large.\n", axis);
+               return 0;
+       }
+
+       *value = (int)parsed;
+
+       return 1;
+}
+
+// Parses a dimension specification of the form "<X>x<Y>x<Z>", e.g. "20x30x40".
+//
+// parameter   The text to parse. Each number is read from the start of the
+//             text or from directly behind an 'x'; characters between a
+//             number and the next 'x' are ignored.
+// nX, nY, nZ  Receive the extents. An output is only written if its number
+//             was parsed successfully.
+//
+// Returns 1 on success and 0 on failure. On failure an error message naming
+// the offending dimension is printed to stdout.
+int ParseDimensions(const char * parameter, int * nX, int * nY, int * nZ)
+{
+       const char * separator;
+
+       if (!ParseDimensionValue(parameter, "X", nX)) {
+               return 0;
+       }
+
+       separator = strchr(parameter, 'x');
+
+       if (separator == NULL) {
+               printf("ERROR: parameter for Y dimension is missing.\n");
+               return 0;
+       }
+
+       if (!ParseDimensionValue(separator + 1, "Y", nY)) {
+               return 0;
+       }
+
+       separator = strchr(separator + 1, 'x');
+
+       if (separator == NULL) {
+               printf("ERROR: parameter for Z dimension is missing.\n");
+               return 0;
+       }
+
+       if (!ParseDimensionValue(separator + 1, "Z", nZ)) {
+               return 0;
+       }
+
+       return 1;
+}
+
+int main(int argc, char * argv[])
+{
+       int dims[3] = { 20, 20, 20 };           // Dimensions in x, y, and z direction
+       const char * geometryType = "channel";
+       // int latticeDumpAscii = 0;
+       int verify = 0; UNUSED(verify);
+       char * kernelToUse = NULL;
+       int nThreads = 1;
+       const char * pinString = NULL;
+       int periodic[3] = { 0 };
+
+       CaseData cd;
+
+       cd.MaxIterations                = 1000;
+       cd.RhoIn                                = 1.0;
+       cd.RhoOut                               = 1.0;
+       cd.Omega                                = 1.0;
+       cd.VtkOutput                    = 0;
+       cd.VtkModulus                   = 100;
+       cd.StatisticsModulus    = 100;
+       cd.XForce                               = 0.00001;
+       kernelToUse                             = "push-soa";
+
+       Parameters p;
+       p.nArgs        = argc;
+       p.Args         = argv;
+       p.nKernelArgs  = 0;
+       p.KernelArgs   = NULL;
+
+#define LBM_BENCH_KERNELS_VERSION_MAJOR 0
+#define LBM_BENCH_KERNELS_VERSION_MINOR 1
+
+    printf("Lattice Boltzmann Benchmark Kernels (LbmBenchKernels) Copyright (C) 2016, 2017 LSS, RRZE\n");
+    printf("This program comes with ABSOLUTELY NO WARRANTY; for details see LICENSE.\n");
+       printf("This is free software, and you are welcome to redistribute it under certain conditions.\n");
+       printf("\n");
+       printf("LBM Benchmark Kernels %d.%d, compiled %s %s, type: %s\n",
+               LBM_BENCH_KERNELS_VERSION_MAJOR, LBM_BENCH_KERNELS_VERSION_MINOR, __DATE__, __TIME__,
+#ifdef VERIFICATION
+               "verification"
+#else
+               "benchmark"
+#endif
+       );
+
+       // ----------------------------------------------------------------------
+       // Parse command line arguments
+
+       #define ARG_IS(param)                   (!strcmp(argv[i], param))
+       #define NEXT_ARG_PRESENT() \
+               do { \
+                       if (i + 1 >= argc) { \
+                               printf("ERROR: argument %s requires a parameter.\n", argv[i]); \
+                               return 1; \
+                       } \
+               } while (0)
+
+    for (int i = 1; i < argc; ++i) {
+
+        if (ARG_IS("-dims") || ARG_IS("--dims")) {
+            NEXT_ARG_PRESENT();
+
+
+            if (!ParseDimensions(argv[++i], &dims[0], &dims[1], &dims[2])) {
+                return 1;
+            }
+        }
+               // else if (ARG_IS("-lattice-dump-ascii") || ARG_IS("--lattice-dump-ascii")) {
+               //      latticeDumpAscii = 1;
+               // }
+               else if (ARG_IS("-geometry") || ARG_IS("--geometry")) {
+                       NEXT_ARG_PRESENT();
+
+                       geometryType = argv[++i];
+               }
+               else if (ARG_IS("-iterations") ||ARG_IS("--iterations")) {
+                       NEXT_ARG_PRESENT();
+
+                       cd.MaxIterations = strtol(argv[++i], NULL, 0);
+
+                       if (cd.MaxIterations <= 0) {
+                               printf("ERROR: number of iterations must be > 0.\n");
+                               return 1;
+                       }
+               }
+               else if (ARG_IS("-rho-in") ||ARG_IS("--rho-in")) {
+                       NEXT_ARG_PRESENT();
+
+                       cd.RhoIn = strtod(argv[++i], NULL);
+               }
+               else if (ARG_IS("-rho-out") ||ARG_IS("--rho-out")) {
+                       NEXT_ARG_PRESENT();
+
+                       cd.RhoOut = strtod(argv[++i], NULL);
+               }
+               else if (ARG_IS("-omega") ||ARG_IS("--omega")) {
+                       NEXT_ARG_PRESENT();
+
+                       cd.Omega = strtod(argv[++i], NULL);
+               }
+               else if (ARG_IS("-x-force") ||ARG_IS("--x-force")) {
+                       NEXT_ARG_PRESENT();
+
+                       cd.XForce = strtod(argv[++i], NULL);
+               }
+               else if (ARG_IS("-verify") || ARG_IS("--verify")) {
+#ifdef VERIFICATION
+
+                       // Choose this preset for verification. As geometry type "box" is
+                       // used but x and y direction are made pridoc.
+                       // Everything else can be altered, but enough iterations should be
+                       // performed in order to receive a fully developed flow field.
+                       verify = 1;
+
+                       cd.Omega  = 1.0;
+                       cd.RhoIn  = 1.0;
+                       cd.RhoOut = 1.0;
+                       geometryType = "box";
+                       dims[0] = 16;
+                       dims[1] = 16;
+                       dims[2] = 16;
+                       cd.XForce = 0.00001;
+                       cd.MaxIterations = 1000;
+                       periodic[0] = 1;
+                       periodic[1] = 1;
+                       periodic[2] = 0;
+
+                       printf("#\n");
+                       printf("# VERIFICATION: verifying flow profile of channel flow.\n");
+                       printf("#\n");
+
+                       // TODO: this is not a good idea as we ignore all other options...
+
+#else
+                       printf("ERROR: in order to use -verify VERIFICATION must be defined during compilation.\n");
+                       printf("       Recompile with VERIFICATION=on.\n");
+                       return 1;
+#endif
+               }
+               else if (ARG_IS("-vtk") || ARG_IS("--vtk")) {
+#ifdef VTK_OUTPUT
+
+                       cd.VtkOutput = 1;
+
+                       // If the next parameter is a number it is used as the itartion count,
+                       // if not it is probably another parameter.
+                       if (i + 1 < argc) {
+
+                               int vtkModulus = atoi(argv[i+1]);
+
+                               if (vtkModulus > 0) {
+                                       cd.VtkModulus = vtkModulus;
+                                       ++i;
+                               }
+                       }
+#else
+                       printf("ERROR: in order to use -vtk VTK_OUTPUT must be defined during compilation.\n");
+                       printf("       Recompile with VTK_OUTPUT=on.\n");
+                       return 1;
+#endif
+               }
+               else if (ARG_IS("-statistics") || ARG_IS("--statistics")) {
+#ifdef STATISTICS
+                       NEXT_ARG_PRESENT();
+
+                       cd.StatisticsModulus = atoi(argv[++i]);
+
+                       if (cd.StatisticsModulus <= 0) {
+                               printf("ERROR: the iteration count for -statistics must be > 0.\n");
+                               return 1;
+                       }
+#else
+                       printf("ERROR: in order to use -statistics STATISTICS must be defined during compilation.\n");
+                       printf("       Recompile with STATISTICS=on.\n");
+                       return 1;
+#endif
+               }
+               else if (ARG_IS("-kernel") || ARG_IS("--kernel")) {
+                       NEXT_ARG_PRESENT();
+
+                       kernelToUse = argv[++i];
+               }
+               else if (ARG_IS("-list") || ARG_IS("--list")) {
+                       printf("Available kernels to benchmark:\n");
+
+                       for (int j = 0; j < N_ELEMS(g_kernels); ++j) {
+                               printf("   %s\n", g_kernels[j].Name);
+                       }
+
+                       return 0;
+               }
+               else if (ARG_IS("-pin") || ARG_IS("--pin")) {
+                       NEXT_ARG_PRESENT();
+
+                       pinString = argv[++i];
+               }
+               else if (ARG_IS("-t") || ARG_IS("-threads") || ARG_IS("--threads")) {
+#ifdef _OPENMP
+                       NEXT_ARG_PRESENT();
+
+                       nThreads = atoi(argv[++i]);
+
+                       if (nThreads <= 0) {
+                               printf("ERROR: number of threads must be > 0.\n");
+                               return 1;
+                       }
+#else
+                       printf("ERROR: specifying number of threads is only available when compiled with OpenMP support.\n");
+                       return 1;
+#endif
+               }
+               else if (ARG_IS("-periodic-x") || ARG_IS("--periodic-x")) {
+                       periodic[0] = 1;
+               }
+               else if (ARG_IS("-periodic-y") || ARG_IS("--periodic-y")) {
+                       periodic[1] = 1;
+               }
+               else if (ARG_IS("-periodic-z") || ARG_IS("--periodic-z")) {
+                       periodic[2] = 1;
+               }
+               else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) {
+                       printf("ERROR: unknown argument: %s\n", argv[i]);
+                       printf("\n");
+                       printf("Usage:\n");
+                       printf("./lbmbenchk -list\n");
+                       printf("./lbmbenchk \n");
+                       printf("      [-dims XxYyZ] [-geometry box|channel|pipe|porosity[-value]] [-iterations <iterations>] [-lattice-dump-ascii]\n");
+                       printf("      [-rho-in <density>] [-rho-out <density] [-omega <omega>] [-kernel <kernel>]\n");
+                       printf("      [-periodic-x]\n");
+#ifdef STATISTICS
+                       printf("      [-statistics <every-n-iteration>]\n");
+#endif
+#ifdef VTK_OUTPUT
+                       printf("      [-vtk [<every-n-iteration>]]\n");
+#endif
+#ifdef _OPENMP
+                       printf("      [-t <number of threads>]\n");
+#endif
+                       printf("      [-pin core{,core}*]\n");
+#ifdef VERIFICATION
+                       printf("      [-verify]\n");
+#endif
+                       printf("      -- <kernel specific parameters>\n");
+                       printf("\n");
+                       printf("-list           List available kernels.\n");
+                       printf("\n");
+                       printf("-dims XxYxZ             Specify geometry dimensions.\n");
+                       printf("\n");
+                       printf("-geometry porosity-<value>\n");
+                       printf("                Geometetry with blocks of size <value> regularily layout out.\n");
+                       printf("\n");
+                       return 1;
+               }
+               else if (ARG_IS("--")) {
+                       // printf("# kernel args start with %s  these are %d args.\n", argv[i + 1], argc - i - 1);
+                       p.KernelArgs = &argv[++i];
+                       p.nKernelArgs = argc - i;
+                       break;
+               }
+               else {
+                       printf("ERROR: unknown parameter: %s.\n", argv[i]);
+                       exit(1);
+               }
+       }
+
+       #undef ARG_IS
+       #undef NEXT_ARG_PRESENT
+
+
+       // ----------------------------------------------------------------------
+       // Check if we exceed our index addressing PDFs.
+
+       {
+               uint64_t nPdfs = ((uint64_t)19) * dims[0] * dims[1] * dims[2];
+
+               if (nPdfs > ((2LU << 31) - 1)) {
+                       printf("ERROR: number of PDFs exceed 2^31.\n");
+                       exit(1);
+               }
+       }
+
+       // ----------------------------------------------------------------------
+
+#ifdef _OPENMP
+       omp_set_num_threads(nThreads);
+#endif
+
+       LatticeDesc ld;
+
+       GeoCreateByStr(geometryType, dims, periodic, &ld);
+
+       const char * defines[] = {
+#ifdef VTK_OUTPUT
+       "VTK_OUTPUT",
+#endif
+#ifdef STATISTICS
+       "STATISTICS",
+#endif
+#ifdef VERIFICATION
+       "VERIFICATION",
+#endif
+#ifdef _OPENMP
+       "_OPENMP",
+#endif
+#ifdef HAVE_LIKWID
+       "HAVE_LIKWID",
+#endif
+       };
+
+       printf("#  defines:           ");
+       for (int j = 0; j < N_ELEMS(defines); ++j) {
+               printf("%s ", defines[j]);
+       }
+       printf("\n");
+
+       printf("#  nodes total:       % 10d\n", ld.nObst + ld.nFluid);
+       printf("#  nodes fluid:       % 10d (including inlet & outlet)\n", ld.nFluid);
+       printf("#  nodes obstacles:   % 10d\n", ld.nObst);
+       printf("#  nodes inlet:       % 10d\n", ld.nInlet);
+       printf("#  nodes outlet:      % 10d\n", ld.nOutlet);
+       printf("#  periodicity:       x: %d y: %d z: %d\n", ld.PeriodicX, ld.PeriodicY, ld.PeriodicZ);
+
+#ifdef VTK_OUTPUT
+       printf("#  VTK output:         %d (every %d iteration)\n", cd.VtkOutput, cd.VtkModulus);
+#endif
+#ifdef STATISTICS
+       printf("#  statistics:         every %d iteration\n", cd.StatisticsModulus);
+#endif
+
+       printf("#  omega:             %f\n", cd.Omega);
+       printf("#  initial density at inlet/outlet:\n");
+       printf("#    rho in:          %e\n", cd.RhoIn);
+    printf("#    rho out:         %e\n", cd.RhoOut);
+       printf("# iterations:         %d\n", cd.MaxIterations);
+
+#ifdef __x86_64__
+       printf("# fp status:          DAZ: %d  FTZ: %d\n", FpGetDaz(), FpGetFtz());
+#endif
+
+#ifdef _OPENMP
+       printf("# OpenMP threads:     %d\n", omp_get_max_threads());
+
+       if (pinString != NULL) {
+               #pragma omp parallel
+               {
+                       int threadId = omp_get_thread_num();
+                       int err;
+
+                       err = PinCurrentThreadByCpuList(pinString, 0, 0, threadId);
+
+                       if (err) {
+                               printf("ERROR [thread %d]: pinning failed.\n", threadId);
+                               exit(1);
+                       }
+
+                       const char * cpuList = PinCpuListAsString();
+                       Assert(cpuList != NULL);
+
+                       // Not so nice hack to print the thread ids ordered.
+                       #pragma omp for ordered
+                       for (int i = 0; i < omp_get_num_threads(); ++i) {
+                               #pragma omp ordered
+                               printf("# thread %2d  pinned to core(s):  %s\n", threadId, cpuList);
+                       }
+
+                       free((void *)cpuList);
+               }
+       }
+#endif
+
+       KernelData * kd;
+
+       KernelFunctions * kf = NULL;
+
+       if (kernelToUse == NULL) {
+               kf = &g_kernels[0];
+       }
+       else {
+               for (int j = 0; j < N_ELEMS(g_kernels); ++j) {
+
+                       if (!strcasecmp(kernelToUse, g_kernels[j].Name)) {
+                               kf = &g_kernels[j];
+                               break;
+                       }
+               }
+       }
+
+       if (kf == NULL) {
+               printf("ERROR: requested kernel \"%s\" not found.\n", kernelToUse);
+               exit(1);
+       }
+
+       printf("#\n");
+       printf("# kernel:           %s\n", kf->Name);
+       printf("#\n");
+
+       // Initialize kernel by calling its own initialization function
+       kf->Init(&ld, &kd, &p);
+
+#ifdef VERIFICATION
+       if (verify) {
+               KernelSetInitialDensity( &ld, kd, &cd);
+               KernelSetInitialVelocity(&ld, kd, &cd);
+       }
+#endif
+
+       printf("# starting kernel...\n");
+
+       X_LIKWID_INIT();
+
+       double timeStart = Time();
+
+       // Call the LBM kernel
+       kd->Kernel(&ld, kd, &cd);
+
+       double duration = Time() - timeStart;
+
+       X_LIKWID_DEINIT();
+
+       // Print some statistics...
+       KernelStatisticsAdv(kd, &ld, &cd, cd.MaxIterations, 1 /* force output */);
+
+#ifdef VERIFICATION
+       PdfT errorNorm = -1.0;
+       KernelVerifiy(&ld, kd, &cd, &errorNorm);
+#endif
+
+       // Deinitialize kernel by calling its own deinitialization function
+       kf->Deinit(&ld, &kd);
+
+
+       double perf = (double)ld.nFluid * (double)cd.MaxIterations / duration / 1.e6;
+
+       printf("P:   %f MFLUP/s  t: %d  d: %f s  iter: %d  fnodes: %f x1e6  geo: %s  kernel: %s  %s\n",
+               perf, nThreads, duration, cd.MaxIterations, ld.nFluid / 1e6,
+               geometryType, kernelToUse,
+#ifdef VERIFICATION
+               "# VERIFICATION"
+#else
+               "# benchmark"
+#endif
+       );
+
+       int exitCode = 0;
+
+#ifdef VERIFICATION
+
+       if (verify) {
+               printf("# VERIFICATION: deviation from analytical solution: %e\n", errorNorm);
+
+               if (errorNorm > 0.1) {
+                       printf("# VERIFICATION FAILED.\n");
+                       exitCode = 1;
+               }
+               else {
+                       printf("# VERIFICATION SUCCEEDED.\n");
+               }
+       }
+#else
+//     printf("# VERIFICATION: deviation from analytical solution: %e\n", errorNorm);
+//     printf("# VERIFICATION: this is only valid for pipe geometry with enough iterations performed.\n");
+#endif
+
+       MemFree((void **)&ld.Lattice);
+
+       return exitCode;
+}
diff --git a/src/Makefile b/src/Makefile
new file mode 100644 (file)
index 0000000..a9889a5
--- /dev/null
@@ -0,0 +1,298 @@
+# --------------------------------------------------------------------------
+#
+# Copyright
+#   Markus Wittmann, 2016-2017
+#   RRZE, University of Erlangen-Nuremberg, Germany
+#   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+#
+#   Viktor Haag, 2016
+#   LSS, University of Erlangen-Nuremberg, Germany
+#
+#  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+#
+#  LbmBenchKernels is free software: you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation, either version 3 of the License, or
+#  (at your option) any later version.
+#
+#  LbmBenchKernels is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+#
+# --------------------------------------------------------------------------
+
+# Default configuration to choose. The value selects the settings file
+# config.$(CONFIG).mk which is included further below.
+# CONFIG ?= linux-gcc
+CONFIG ?= linux-intel
+
+# Build type; it becomes part of the object directory and the binary name.
+BUILD ?= debug
+
+# If we compile for BENCHMARK all verification, VTK output, and statistics
+# are disabled, if not specified otherwise.
+ifeq (on,$(BENCHMARK))
+  VERIFICATION ?= off
+  VTK_OUTPUT   ?= off
+  STATISTICS   ?= off
+endif
+
+# The following three switches add the preprocessor define of the same name
+# to PP_FLAGS when set to "on" (see below).
+VERIFICATION ?= on
+
+STATISTICS ?= on
+
+VTK_OUTPUT ?= on
+
+# "on" colorizes the messages printed by this Makefile via escape sequences.
+COLOR ?= on
+
+# NOTE(review): OPENMP is only printed by this Makefile; presumably it is
+# evaluated inside config.$(CONFIG).mk -- confirm.
+OPENMP ?= on
+
+# "on" adds the define VTK_OUTPUT_ASCII to PP_FLAGS.
+VTK_OUTPUT_ASCII ?= off
+
+# "on" adds the define LID_DRIVEN_CAVITY to PP_FLAGS.
+LID_DRIVEN_CAVITY ?= off
+
+# Vector instruction set: "avx" defines VECTOR_AVX, "sse" defines VECTOR_SSE.
+ISA ?= avx
+
+# "on" defines HAVE_LIKWID and adds -llikwid to the libraries to link.
+LIKWID ?= off
+
+# Global settings for the Makefile
+SHELL      = sh
+
+# Q prefixes recipe lines: it is empty if V is defined (commands are echoed)
+# and "@" otherwise (commands are not echoed).
+ifdef V
+  Q        =
+else
+  Q        = @
+endif
+
+# Names of the external tools, kept in variables so they can be overridden.
+AR         = ar
+CMP        = cmp
+CHMOD      = chmod
+CPP        = cpp
+DIFF       = diff
+ECHO       = echo
+ECHO_E     = echo -e
+ECHO_N     = echo -n
+GIT        = git
+GREP       = grep
+MKDIR      = mkdir
+MKDIR_P    = mkdir -p
+NM         = nm
+RM         = rm
+RM_F       = rm -f
+RM_RF      = rm -rf
+TAR        = tar
+TAR_CZF    = tar czf
+TOUCH      = touch
+SED        = sed
+
+
+
+# Where to store objects and dependency files.
+# Both directories are specific to the chosen CONFIG, BUILD, and optional TAG.
+OBJECT_DIR      = obj/$(CONFIG)-$(BUILD)$(TAG)
+DEP_DIR         = obj/$(CONFIG)-$(BUILD)$(TAG)-dep
+
+# Sources to consider.
+# Each of these files is compiled exactly once by the generic %.o rule.
+SOURCES_C = Main.c Memory.c Geometry.c Kernel.c \
+            Vtk.c Pinning.c
+
+# ------------------------------------------------------------------------
+# NO CHANGE BELOW SHOULD BE NEEDED
+# ------------------------------------------------------------------------
+
+# Objects to build. Besides the objects of SOURCES_C the kernel sources are
+# listed several times with a suffix (_PullSoA, _PullAoS, _PushSoA, _PushAoS).
+# The suffix selects the pattern rule below which compiles the same .c file
+# with the matching DATA_LAYOUT_* and PROP_MODEL_* defines.
+OBJ_C = $(foreach SOURCE,$(SOURCES_C),$(OBJECT_DIR)/$(SOURCE:%.c=%.o)) \
+               $(OBJECT_DIR)/BenchKernelD3Q19Common_PullSoA.o \
+               $(OBJECT_DIR)/BenchKernelD3Q19Common_PullAoS.o \
+               $(OBJECT_DIR)/BenchKernelD3Q19Common_PushSoA.o \
+               $(OBJECT_DIR)/BenchKernelD3Q19Common_PushAoS.o \
+               $(OBJECT_DIR)/BenchKernelD3Q19_PullSoA.o \
+               $(OBJECT_DIR)/BenchKernelD3Q19_PullAoS.o \
+               $(OBJECT_DIR)/BenchKernelD3Q19_PushSoA.o \
+               $(OBJECT_DIR)/BenchKernelD3Q19_PushAoS.o \
+               $(OBJECT_DIR)/BenchKernelD3Q19List_PushSoA.o \
+               $(OBJECT_DIR)/BenchKernelD3Q19ListCommon_PushSoA.o \
+               $(OBJECT_DIR)/BenchKernelD3Q19List_PushAoS.o \
+               $(OBJECT_DIR)/BenchKernelD3Q19ListCommon_PushAoS.o \
+               $(OBJECT_DIR)/BenchKernelD3Q19List_PullSoA.o \
+               $(OBJECT_DIR)/BenchKernelD3Q19ListCommon_PullSoA.o \
+               $(OBJECT_DIR)/BenchKernelD3Q19List_PullAoS.o \
+               $(OBJECT_DIR)/BenchKernelD3Q19ListCommon_PullAoS.o \
+               $(OBJECT_DIR)/BenchKernelD3Q19ListAa_PushSoA.o \
+               $(OBJECT_DIR)/BenchKernelD3Q19ListAaCommon_PushSoA.o \
+               $(OBJECT_DIR)/BenchKernelD3Q19ListAa_PushAoS.o \
+               $(OBJECT_DIR)/BenchKernelD3Q19ListAaCommon_PushAoS.o \
+               $(OBJECT_DIR)/BenchKernelD3Q19ListAaRia_PushSoA.o \
+               $(OBJECT_DIR)/BenchKernelD3Q19ListAaRiaCommon_PushSoA.o \
+               $(OBJECT_DIR)/BenchKernelD3Q19ListAaPv_PushSoA.o \
+               $(OBJECT_DIR)/BenchKernelD3Q19ListAaPvCommon_PushSoA.o \
+               $(OBJECT_DIR)/BenchKernelD3Q19ListPullSplitNt_PullSoA.o \
+               $(OBJECT_DIR)/BenchKernelD3Q19ListPullSplitNtCommon_PullSoA.o
+
+OBJ = $(OBJ_C)
+
+# Pull in the settings of the chosen configuration.
+# NOTE(review): presumably this file defines CC, LD, C_FLAGS, LD_FLAGS and D
+# which are used below but not assigned in this Makefile -- confirm.
+include ./config.$(CONFIG).mk
+
+
+# Colorize command lines.
+# With COLOR=on the variables hold escape sequences which are interpreted by
+# "echo -e" (ECHO_E); otherwise they are empty and expand to nothing.
+# NOTE(review): COLOR_DARK_GREEEN is spelled with three "E" in both branches.
+ifeq (on,$(COLOR))
+  COLOR_GRAY        =\033[1;30m
+  COLOR_LIGHT_GRAY  =\033[0;37m
+  COLOR_CYAN        =\033[0;36m
+  COLOR_LIGHT_CYAN  =\033[1;36m
+  COLOR_MAGENTA     =\033[0;35m
+  COLOR_DARK_MAGENTA=\033[1;35m
+  COLOR_NO          =\033[0m
+  COLOR_RED         =\033[0;31m
+  COLOR_DARK_RED    =\033[1;31m
+  COLOR_GREEN       =\033[0;32m
+  COLOR_DARK_GREEEN =\033[1;32m
+else
+  COLOR_GRAY        =
+  COLOR_LIGHT_GRAY  =
+  COLOR_CYAN        =
+  COLOR_LIGHT_CYAN  =
+  COLOR_MAGENTA     =
+  COLOR_DARK_MAGENTA=
+  COLOR_NO          =
+  COLOR_RED         =
+  COLOR_DARK_RED    =
+  COLOR_GREEN       =
+  COLOR_DARK_GREEEN =
+endif
+
+# Dependencies which are not source files, that require
+# rebuilding the application.
+# These are all Makefiles read so far (without included .d files) and the
+# two .target stamp files of the object and dependency directory.
+REBUILD_DEPS = $(filter-out %.d,$(MAKEFILE_LIST)) $(OBJECT_DIR)/.target $(DEP_DIR)/.target
+
+# Optional suffix which becomes part of the binary name.
+BUILD_CONFIG=
+
+# The blocks below translate the on/off switches into preprocessor defines.
+# NOTE(review): $(D) is presumably the compiler's define option (e.g. -D)
+# set in config.$(CONFIG).mk -- confirm.
+
+# Verify computed flow field
+ifeq (on,$(VERIFICATION))
+  PP_FLAGS     += $(D)VERIFICATION
+endif
+
+ifeq (on,$(VTK_OUTPUT))
+  PP_FLAGS += $(D)VTK_OUTPUT
+endif
+
+ifeq (on,$(STATISTICS))
+  PP_FLAGS += $(D)STATISTICS
+endif
+
+ifeq (on,$(VTK_OUTPUT_ASCII))
+  PP_FLAGS += $(D)VTK_OUTPUT_ASCII
+endif
+
+ifeq (on,$(LID_DRIVEN_CAVITY))
+  PP_FLAGS += $(D)LID_DRIVEN_CAVITY
+endif
+
+# Select the vector instruction set; any other value of ISA adds no define.
+ifeq (avx,$(ISA))
+  PP_FLAGS += $(D)VECTOR_AVX
+endif
+ifeq (sse,$(ISA))
+  PP_FLAGS += $(D)VECTOR_SSE
+endif
+
+# Required for posix_memalign
+PP_FLAGS += $(D)_POSIX_C_SOURCE=200112L
+
+# LIKWID support: add the include path, the define, and the library.
+# LIKWID_INC and LIKWID_LIB are not assigned in this Makefile.
+ifeq (on,$(LIKWID))
+  PP_FLAGS += $(LIKWID_INC) $(D)HAVE_LIKWID
+  LD_LIBS  += $(LIKWID_LIB) -llikwid
+endif
+
+# Targets which do not correspond to files. The name of this special target
+# is case sensitive: written in lower case it is just an ordinary target and
+# a file named "all" or "clean" would prevent the targets from being remade.
+.PHONY: all clean clean-all
+
+# Print the effective configuration while the Makefile is parsed.
+$(info $(shell $(ECHO_E) "# Configuration:   CONFIG=$(COLOR_CYAN)$(CONFIG)$(COLOR_NO)   BUILD=$(COLOR_CYAN)$(BUILD)$(COLOR_NO)   VERIFICATION=$(COLOR_CYAN)$(VERIFICATION)$(COLOR_NO)   STATISTICS=$(COLOR_CYAN)$(STATISTICS)$(COLOR_NO)   VTK_OUTPUT=$(COLOR_CYAN)$(VTK_OUTPUT)$(COLOR_NO)  OPENMP=$(COLOR_CYAN)$(OPENMP)$(COLOR_NO)   ISA=$(COLOR_CYAN)$(ISA)$(COLOR_NO)   LIKWID=$(COLOR_CYAN)$(LIKWID)$(COLOR_NO)   building $(.DEFAULT_GOAL)..."))
+
+
+$(info # Object dir:      $(OBJECT_DIR))
+$(info # Dependency dir:  $(DEP_DIR))
+# $(info # objects:        $(OBJ))
+
+# Directory the linked binary is placed in.
+BIN_DIR=../bin
+
+# Default target: build the binary of the current configuration.
+all: $(BIN_DIR)/lbmbenchk-$(CONFIG)-$(BUILD)$(BUILD_CONFIG)$(TAG)
+
+
+# ------------------------------------------------------------------------
+# TODO: Include dependencies, if they exist and no clean target is specified.
+#
+# Currently the dependency files are always included. For every object in
+# OBJ the file $(DEP_DIR)/<name>.d is read if it exists; the leading "-"
+# makes make ignore files which are missing.
+# NOTE(review): no rule in this Makefile visibly creates the .d files;
+# presumably the compiler flags in config.$(CONFIG).mk generate them --
+# confirm.
+
+#ifeq (,$(SKIP_DEP_INCLUDE))
+#  ifneq (,$(OBJ))
+#    TMP_OBJ__ = $(patsubst $(OBJECT_DIR)/%,$(DEP_DIR)/%,$(OBJ))
+#    $(warning TMP_OBJ__: $(TMP_OBJ__))
+#    $(warning OBJ: $(OBJ))
+#    $(warning DEPS:      $(patsubst %.o,%.d,$(TMP_OBJ__)))
+#    $(warning $(patsubst $(OBJECT_DIR)/%.o,$(DEP_DIR)/%.d,$(OBJ)))
+    -include $(patsubst $(OBJECT_DIR)/%.o,$(DEP_DIR)/%.d,$(OBJ))
+#  endif
+#endif
+
+
+# Create the directory for the binary if it does not exist yet.
+$(BIN_DIR):
+       [ -d "$@" ] || mkdir -p "$@"
+
+# Link all objects into the final binary. REBUILD_DEPS are prerequisites
+# only to trigger relinking; they are filtered out of the linker command
+# line. $(BIN_DIR) is an order-only prerequisite, i.e. it must exist but its
+# time stamp does not cause relinking.
+$(BIN_DIR)/lbmbenchk-$(CONFIG)-$(BUILD)$(BUILD_CONFIG)$(TAG): $(OBJ) $(REBUILD_DEPS) $(DEP_DIR)/.target | $(BIN_DIR)
+       @$(ECHO_E) "linking: $(COLOR_CYAN)$@$(COLOR_NO)"
+       $(LD) $(LD_FLAGS) -o $@ $(filter-out $(REBUILD_DEPS),$^) $(LD_LIBS)
+       @$(ECHO_E) "# Builded binary:      $(COLOR_CYAN)$@$(COLOR_NO)"
+       @$(ECHO_E) "# Configuration was:   CONFIG=$(COLOR_CYAN)$(CONFIG)$(COLOR_NO)   BUILD=$(COLOR_CYAN)$(BUILD)$(COLOR_NO)   VERIFICATION=$(COLOR_CYAN)$(VERIFICATION)$(COLOR_NO)   STATISTICS=$(COLOR_CYAN)$(STATISTICS)$(COLOR_NO)   VTK_OUTPUT=$(COLOR_CYAN)$(VTK_OUTPUT)$(COLOR_NO)  OPENMP=$(COLOR_CYAN)$(OPENMP)$(COLOR_NO)   ISA=$(COLOR_CYAN)$(ISA)$(COLOR_NO)   LIKWID=$(COLOR_CYAN)$(LIKWID)$(COLOR_NO)   target=$(.DEFAULT_GOAL)"
+
+# Compile rules. The suffix of the object name selects the rule and thereby
+# the defines the source file <stem>.c is compiled with:
+#
+#   <stem>_SoA.o       DATA_LAYOUT_SOA
+#   <stem>_AoS.o       DATA_LAYOUT_AOS
+#   <stem>_PushSoA.o   DATA_LAYOUT_SOA  PROP_MODEL_PUSH
+#   <stem>_PushAoS.o   DATA_LAYOUT_AOS  PROP_MODEL_PUSH
+#   <stem>_PullSoA.o   DATA_LAYOUT_SOA  PROP_MODEL_PULL
+#   <stem>_PullAoS.o   DATA_LAYOUT_AOS  PROP_MODEL_PULL
+#   <stem>.o           no additional define
+#
+# All objects additionally depend on REBUILD_DEPS, so a change of a Makefile
+# or of the stored configuration recompiles everything.
+$(OBJECT_DIR)/%_SoA.o: %.c $(REBUILD_DEPS)
+       @$(ECHO_E) "compiling: $(COLOR_CYAN)$@$(COLOR_NO)    $(COLOR_MAGENTA)DATA_LAYOUT_SOA$(COLOR_NO)"
+       $(CC) $(strip $(C_FLAGS)) $(strip $(PP_FLAGS)) $(D)DATA_LAYOUT_SOA -c $< -o $@
+
+$(OBJECT_DIR)/%_PushSoA.o: %.c $(REBUILD_DEPS)
+       @$(ECHO_E) "compiling: $(COLOR_CYAN)$@$(COLOR_NO)    $(COLOR_MAGENTA)DATA_LAYOUT_SOA$(COLOR_NO) $(COLOR_MAGENTA)PROP_MODEL_PUSH$(COLOR_NO)"
+       $(CC) $(strip $(C_FLAGS)) $(strip $(PP_FLAGS)) $(D)DATA_LAYOUT_SOA $(D)PROP_MODEL_PUSH -c $< -o $@
+
+$(OBJECT_DIR)/%_PushAoS.o: %.c $(REBUILD_DEPS)
+       @$(ECHO_E) "compiling: $(COLOR_CYAN)$@$(COLOR_NO)    $(COLOR_MAGENTA)DATA_LAYOUT_AOS$(COLOR_NO) $(COLOR_MAGENTA)PROP_MODEL_PUSH$(COLOR_NO)"
+       $(CC) $(strip $(C_FLAGS)) $(strip $(PP_FLAGS)) $(D)DATA_LAYOUT_AOS $(D)PROP_MODEL_PUSH -c $< -o $@
+
+$(OBJECT_DIR)/%_PullSoA.o: %.c $(REBUILD_DEPS)
+       @$(ECHO_E) "compiling: $(COLOR_CYAN)$@$(COLOR_NO)    $(COLOR_MAGENTA)DATA_LAYOUT_SOA$(COLOR_NO) $(COLOR_MAGENTA)PROP_MODEL_PULL$(COLOR_NO)"
+       $(CC) $(strip $(C_FLAGS)) $(strip $(PP_FLAGS)) $(D)DATA_LAYOUT_SOA $(D)PROP_MODEL_PULL -c $< -o $@
+
+$(OBJECT_DIR)/%_PullAoS.o: %.c $(REBUILD_DEPS)
+       @$(ECHO_E) "compiling: $(COLOR_CYAN)$@$(COLOR_NO)    $(COLOR_MAGENTA)DATA_LAYOUT_AOS$(COLOR_NO) $(COLOR_MAGENTA)PROP_MODEL_PULL$(COLOR_NO)"
+       $(CC) $(strip $(C_FLAGS)) $(strip $(PP_FLAGS)) $(D)DATA_LAYOUT_AOS $(D)PROP_MODEL_PULL -c $< -o $@
+
+$(OBJECT_DIR)/%_AoS.o: %.c $(REBUILD_DEPS)
+       @$(ECHO_E) "compiling: $(COLOR_CYAN)$@$(COLOR_NO)    $(COLOR_MAGENTA)DATA_LAYOUT_AOS$(COLOR_NO)"
+       $(CC) $(strip $(C_FLAGS)) $(strip $(PP_FLAGS)) $(D)DATA_LAYOUT_AOS -c $< -o $@
+
+
+$(OBJECT_DIR)/%.o: %.c $(REBUILD_DEPS)
+       @$(ECHO_E) "compiling: $(COLOR_CYAN)$@$(COLOR_NO)"
+       $(CC) $(strip $(C_FLAGS)) $(strip $(PP_FLAGS)) -c $< -o $@
+
+
+# ------------------------------------------------------------------------
+# The following targets in principle only make sure that the directories
+# are created if they do not exist already.
+
+# Stamp file of the dependency directory.
+$(DEP_DIR)/.target:
+       $(Q)$(MKDIR_P) $(DEP_DIR)
+       $(Q)$(TOUCH) $@
+
+# ------------------------------------------------------------------------
+# Current configuration.
+# This text is stored in $(OBJECT_DIR)/.target and compared on every run.
+# NOTE(review): SYSTEM is not assigned in the visible part of this Makefile
+# (CONFIG is used everywhere else) and OPENMP is not part of the text, so
+# changing OPENMP alone does not trigger a rebuild -- confirm whether this
+# is intended.
+MAKE_CFG = SYSTEM=$(SYSTEM) // BUILD=$(BUILD) // MAKEOVERRIDES=\"$(strip $(MAKEOVERRIDES))\" // VERIFICATION=$(VERIFICATION) // STATISTICS=$(STATISTICS) // VTK_OUTPUT=$(VTK_OUTPUT) // VTK_OUTPUT_ASCII=$(VTK_OUTPUT_ASCII) // LID_DRIVEN_CAVITY=$(LID_DRIVEN_CAVITY) // ISA=$(ISA) // LIKWID=$(LIKWID)
+
+# Compare the current configuration to the last one so we know when to
+# rebuild this system/target even though the sources have not changed.
+# The double-colon rule without prerequisites is executed on every run; the
+# stamp file is rewritten only if its content differs from MAKE_CFG, so its
+# time stamp changes only when the configuration changes.
+$(OBJECT_DIR)/.target::
+       $(Q)$(MKDIR_P) $(OBJECT_DIR)
+       @ ( [ -f $(OBJECT_DIR)/.target ] && $(ECHO) $(MAKE_CFG) | $(CMP) -s $(OBJECT_DIR)/.target - )           \
+       || ( $(ECHO) $(MAKE_CFG) > $(OBJECT_DIR)/.target; $(ECHO) "# configuration variables changed, rebuilding ..." )
+
+
+# Remove the objects and dependency files of the current configuration.
+# "rm -f" stays silent and succeeds if there is nothing to remove, so no
+# "No such file" and "Error (ignored)" messages show up on a clean tree.
+clean:
+       $(RM_F) $(OBJECT_DIR)/*.o $(DEP_DIR)/*.d
+
+# Remove the object and dependency directories of all configurations.
+clean-all:
+       $(RM_RF) obj
diff --git a/src/Memory.c b/src/Memory.c
new file mode 100644 (file)
index 0000000..d9d1f94
--- /dev/null
@@ -0,0 +1,86 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>  // strerror
+#include <errno.h>
+
+#include "Base.h"
+#include "Memory.h"
+
+
+// Allocates bytesToAlloc bytes with malloc and stores the address in *ptr.
+//
+// If the allocation fails an error message is printed and the process is
+// terminated with exit code 1. A NULL result for a request of zero bytes
+// is treated as a failure, too.
+//
+// Return value: always 0.
+int MemAlloc(void ** ptr, size_t bytesToAlloc)
+{
+       void * tmpPtr;
+
+       tmpPtr = malloc(bytesToAlloc);
+
+       if (tmpPtr == NULL) { //  && bytesToAlloc != 0) {
+               // Save errno once so code and message refer to the same error.
+               const int errorCode = errno;
+
+               // %zu is the conversion for size_t; %lu only fits where
+               // size_t happens to be unsigned long.
+               Error("allocation of %zu bytes failed: %d - %s\n", bytesToAlloc, errorCode, strerror(errorCode));
+               exit(1);
+       }
+
+       *ptr = tmpPtr;
+
+       return 0;
+}
+
+// Allocates bytesToAlloc bytes aligned to alignmentBytes with
+// posix_memalign and stores the address in *ptr.
+//
+// If the allocation fails an error message is printed and the process is
+// terminated with exit code 1.
+//
+// Return value: always 0.
+int MemAllocAligned(void ** ptr, size_t bytesToAlloc, size_t alignmentBytes)
+{
+       int ret;
+
+       ret = posix_memalign(ptr, alignmentBytes, bytesToAlloc);
+
+       if (ret) {
+               // posix_memalign reports the error as its return value and
+               // does not set errno, so the message must be built from ret.
+               Error("allocation of %zu bytes aligned to %zu bytes failed: %d - %s\n", bytesToAlloc, alignmentBytes, ret, strerror(ret));
+               exit(1);
+       }
+
+       return 0;
+}
+
+
+// Releases the memory *ptr refers to and resets *ptr to NULL.
+// *ptr must not be NULL. Return value: always 0.
+int MemFree(void ** ptr)
+{
+       Assert(*ptr != NULL);
+
+       // Detach the block from the caller's pointer first, then release it.
+       void * const block = *ptr;
+       *ptr = NULL;
+
+       free(block);
+
+       return 0;
+}
+
+// Fills the first bytesToZero bytes of the memory at ptr with zeros.
+// ptr must not be NULL and bytesToZero must be larger than zero.
+// Return value: always 0.
+int MemZero(void * ptr, size_t bytesToZero)
+{
+       // Validate the arguments before touching any memory.
+       Assert(ptr != NULL);
+       Assert(bytesToZero > 0);
+
+       (void)memset(ptr, 0x00, bytesToZero);
+       return 0;
+}
diff --git a/src/Memory.h b/src/Memory.h
new file mode 100644 (file)
index 0000000..cfe1dfc
--- /dev/null
@@ -0,0 +1,38 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#ifndef __MEMORY_H__
+#define __MEMORY_H__
+
+#include <stddef.h> // size_t
+
+// Allocates bytesToAlloc bytes and stores the address in *ptr.
+// On failure an error is printed and the process exits. Returns 0.
+int MemAlloc(void ** ptr, size_t bytesToAlloc);
+
+// Like MemAlloc, but the memory is aligned to alignmentBytes bytes
+// (allocated via posix_memalign).
+int MemAllocAligned(void ** ptr, size_t bytesToAlloc, size_t alignmentBytes);
+
+// Frees the memory *ptr refers to and sets *ptr to NULL. Returns 0.
+int MemFree(void ** ptr);
+
+// Sets bytesToZero bytes starting at ptr to zero. Returns 0.
+int MemZero(void * ptr, size_t bytesToZero);
+
+#endif // __MEMORY_H__
diff --git a/src/Pinning.c b/src/Pinning.c
new file mode 100644 (file)
index 0000000..0cf70ea
--- /dev/null
@@ -0,0 +1,393 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#ifndef _GNU_SOURCE
+       #define _GNU_SOURCE
+#endif
+#include <sched.h>
+#include <errno.h>
+
+
+#include "Base.h"
+#include "Pinning.h"
+
+
+
+
+// -----------------------------------------------------------------------
+//
+// Binds the calling thread to the specified core.
+//
+// coreNumber: number of the core the calling thread is bound to.
+//
+// Return value: 0 = success, else error (the return value of
+// sched_setaffinity, i.e. -1). On error a message with the error code
+// taken from errno is printed.
+//
+// -----------------------------------------------------------------------
+
+int PinCurrentThreadToCore(int coreNumber)
+{
+       int error = 0;
+
+       cpu_set_t cpu_set;
+       CPU_ZERO(&cpu_set);
+       CPU_SET(coreNumber, &cpu_set);
+
+       // A pid of 0 addresses the calling thread.
+       error = sched_setaffinity((pid_t)0, sizeof(cpu_set_t), &cpu_set);
+
+       if (error != 0) {
+               // sched_setaffinity only returns -1 on failure; the reason
+               // is stored in errno, so this is what has to be reported.
+               const int errorCode = errno;
+
+               Error("pinning thread to core %d failed (%d): %s\n",
+                               coreNumber, errorCode, strerror(errorCode));
+       }
+
+       return error;
+}
+
+
+// -----------------------------------------------------------------------
+//
+// Pins the calling thread according to a cpu list which is read from the
+// environment variable envVarName.
+//
+// If the variable is not set nothing is pinned, a note is printed by MPI
+// rank 0, and 0 is returned.
+//
+// Return value: 0 = success, else error. Negative values are the error
+// codes of PinParseCpuList.
+//
+// -----------------------------------------------------------------------
+
+int PinCurrentThreadByEnvVar(const char * envVarName,
+       int mpiRank, int nodeRank, int threadNumber)
+{
+       const char * const cpuList = getenv(envVarName);
+
+       if (cpuList != NULL) {
+               const int core = PinParseCpuList(cpuList, mpiRank, nodeRank, threadNumber);
+
+               // Negative values are parse errors and are passed on unchanged.
+               return (core < 0) ? core : PinCurrentThreadToCore(core);
+       }
+
+       // Without the variable there is nothing to do.
+       if (mpiRank == 0) {
+               Print("skip pinning: env var %s not set\n", envVarName);
+       }
+
+       return 0;
+}
+
+
+// -----------------------------------------------------------------------
+//
+// Pins the calling thread to the core the given cpu list assigns to the
+// combination of node rank and thread number.
+//
+// A cpu list of NULL terminates the process with exit code 1; the error
+// message is printed by MPI rank 0 only.
+//
+// Return value: 0 = success, else error. Negative values are the error
+// codes of PinParseCpuList.
+//
+// -----------------------------------------------------------------------
+
+int PinCurrentThreadByCpuList(const char * cpuList,
+       int mpiRank, int nodeRank, int threadNumber)
+{
+       if (cpuList == NULL) {
+               if (mpiRank == 0) {
+                       printf("ERROR: cpu list is NULL.\n");
+               }
+
+               exit(1);
+       }
+
+       const int core = PinParseCpuList(cpuList, mpiRank, nodeRank, threadNumber);
+
+       if (core >= 0) {
+               return PinCurrentThreadToCore(core);
+       }
+
+       // Parse error: hand the error code to the caller.
+       return core;
+}
+
+
+// -----------------------------------------------------------------------
+//
+// Parses the provided cpu list and returns the core number for the
+// specified node rank and thread. The MPI rank is not evaluated.
+//
+// The cpu list has for example a format of: 0,1,2 or 0,1,2_3,4,5
+//
+// Blocks (0,1,2 or 3,4,5) separated by "_" specify the pinning inside a
+// node rank. The first block maps to node rank 0, the second to node
+// rank 1, etc.
+//
+// Inside a block the core numbers specify where the threads should
+// be pinned to. They are separated by "," and the first number maps
+// to the first thread, the second number to the second thread, etc.
+//
+// For example: 0,2,4_6,8,10
+//
+// Node rank 0 thread 0 pinned to core  0
+//           0        1                 2
+//           0        2                 4
+//           1        0                 6
+//           1        1                 8
+//           1        2                10
+//
+// Return value: the core number (>= 0) or
+//   -1  cpu list is NULL,
+//   -2  cpu list contains a character other than 0-9, "," and "_",
+//   -3  no block found for the node rank,
+//   -4  no core found for the thread number.
+//
+// -----------------------------------------------------------------------
+
+int PinParseCpuList(const char * cpuList,
+               int mpiRank, int nodeRank, int threadNumber)
+{
+       if (cpuList == NULL) {
+               return -1;
+       }
+
+       // Reject lists which contain anything but digits, "," and "_".
+       for (const char * p = cpuList; *p != 0x00; ++p) {
+               const int isDigit = (*p >= '0' && *p <= '9');
+
+               if (!isDigit && *p != ',' && *p != '_') {
+                       return -2;
+               }
+       }
+
+       const char * pos = cpuList;
+
+       // Skip one block per node rank, i.e. advance behind the
+       // "nodeRank"th "_".
+       int blocksSkipped = 0;
+
+       for (; blocksSkipped < nodeRank && *pos != 0x00; ++pos) {
+               if (*pos == '_') {
+                       ++blocksSkipped;
+               }
+       }
+
+       if (blocksSkipped != nodeRank || !(*pos >= '0' && *pos <= '9')) {
+               // Cpu list for this node rank not found.
+               return -3;
+       }
+
+       // Inside the block skip one entry per thread, but never leave
+       // the block.
+       int entriesSkipped = 0;
+
+       for (; entriesSkipped < threadNumber && *pos != 0x00; ++pos) {
+               if (*pos == '_') {
+                       // End of the block reached too early.
+                       break;
+               }
+
+               if (*pos == ',') {
+                       ++entriesSkipped;
+               }
+       }
+
+       if (entriesSkipped != threadNumber || !(*pos >= '0' && *pos <= '9')) {
+               // Cpu for this threadNumber not found.
+               return -4;
+       }
+
+       return atoi(pos);
+}
+
+
+
+// -----------------------------------------------------------------------
+//
+// Returns the first (lowest numbered) core from the calling thread's
+// affinity set.
+//
+// On error a value < 0 is returned: -1 if the affinity set could not be
+// queried (a message with the error code from errno is printed) or if
+// the set contains no core.
+//
+// -----------------------------------------------------------------------
+
+int PinCurrentCore()
+{
+       cpu_set_t cpu_set;
+       CPU_ZERO(&cpu_set);
+
+       // Check for failure before the set is inspected. sched_getaffinity
+       // only returns -1; the reason is stored in errno.
+       if (sched_getaffinity((pid_t)0, sizeof(cpu_set_t), &cpu_set) != 0) {
+               const int errorCode = errno;
+
+               Error("getting thread affinty failed (%d): %s\n", errorCode, strerror(errorCode));
+               return -1;
+       }
+
+       // constant CPU_SETSIZE is one larger than the maximum CPU
+       // number that can be stored in a CPU set
+       for (int i = 0; i < CPU_SETSIZE; ++i) {
+               if (CPU_ISSET(i, &cpu_set)) {
+                       return i;
+               }
+       }
+
+       return -1;
+}
+
+
+
+// -----------------------------------------------------------------------
+//
+// Returns all cores from the calling thread's affinity set.
+//
+// On error a message with the error code from errno is printed and an
+// empty set is returned.
+//
+// -----------------------------------------------------------------------
+
+typedef cpu_set_t CpuSet;
+
+
+static CpuSet PinCurrentCores()
+{
+       CpuSet cpuSet;
+       CPU_ZERO(&cpuSet);
+
+       // A pid of 0 addresses the calling thread.
+       if (sched_getaffinity((pid_t)0, sizeof(CpuSet), &cpuSet) != 0) {
+               // sched_getaffinity only returns -1; the reason is in errno.
+               const int errorCode = errno;
+
+               Error("getting thread affinty failed (%d): %s\n", errorCode, strerror(errorCode));
+
+               // Make sure the caller really receives an empty set.
+               CPU_ZERO(&cpuSet);
+       }
+
+       return cpuSet;
+}
+
+// Appends the core range [first, last] to the string in buffer, preceded
+// by a "," if the string is not empty. A range of a single core is written
+// as one number, everything else as "first-last".
+//
+// buffer: destination of size bytes holding a string of used characters.
+//
+// Return value: the new length of the string.
+static size_t CpuSetAppendRange(char * buffer, size_t size, size_t used, int first, int last)
+{
+       const char * separator = (used > 0) ? "," : "";
+       const size_t avail = size - used;
+       int written;
+
+       if (first < last) {
+               written = snprintf(buffer + used, avail, "%s%d-%d", separator, first, last);
+       }
+       else {
+               written = snprintf(buffer + used, avail, "%s%d", separator, last);
+       }
+
+       // The buffer is sized for the worst case, so this must never trigger.
+       Assert(written >= 0 && (size_t)written < avail);
+
+       if (written < 0 || (size_t)written >= avail) {
+               // Drop the incomplete entry instead of returning a cut off one.
+               buffer[used] = 0x00;
+               return used;
+       }
+
+       return used + (size_t)written;
+}
+
+// Converts a cpu set into a string of comma separated cores and core
+// ranges, e.g. "0-3,8,10-11". An empty set results in an empty string.
+//
+// Return value: string allocated with malloc, the caller has to free it.
+static char * CpuSetToString(cpu_set_t * cpu_set)
+{
+       // Every set core contributes at most one number (up to 10 digits for
+       // an int) and one separator ("," or "-") to the string.
+       enum { MAX_CHARS_PER_CORE = 12 };
+
+       const size_t size = (size_t)CPU_SETSIZE * MAX_CHARS_PER_CORE + 1;
+       size_t used = 0;
+
+       // A value < 0 means no range has been started yet.
+       int rangeBeginCore = -1;
+       int previousSetCore = -1;
+
+       char * buffer = (char *)malloc(size);
+       Assert(buffer != NULL);
+
+       buffer[0] = 0x00;
+
+       for (int i = 0; i < CPU_SETSIZE; ++i) {
+               if (!CPU_ISSET(i, cpu_set)) {
+                       continue;
+               }
+
+               if (rangeBeginCore >= 0 && i == previousSetCore + 1) {
+                       // Core directly follows the previous one: extend the range.
+                       previousSetCore = i;
+                       continue;
+               }
+
+               // Now we reached the end of a range.
+               // The range can also consist of only one core.
+               // Be aware, that core i is not part of the range.
+               if (rangeBeginCore >= 0) {
+                       used = CpuSetAppendRange(buffer, size, used, rangeBeginCore, previousSetCore);
+               }
+
+               // With this core a new range begins.
+               rangeBeginCore = i;
+               previousSetCore = i;
+       }
+
+       // Write the last range which is still open.
+       if (rangeBeginCore >= 0) {
+               used = CpuSetAppendRange(buffer, size, used, rangeBeginCore, previousSetCore);
+       }
+
+       return buffer;
+}
+
+// Returns the cores of the calling thread's affinity set as a string.
+// The string is the result of CpuSetToString and must be freed by the caller.
+char * PinCpuListAsString()
+{
+       CpuSet affinity = PinCurrentCores();
+       char * text = CpuSetToString(&affinity);
+
+       return text;
+}
+
+#ifdef TEST
+
+// Small test driver: prints the cores the process is currently pinned to.
+int main(int argc, char * argv[])
+{
+       char * pinnedCores = PinCpuListAsString();
+
+       printf("pinned to cores: %s\n", pinnedCores);
+
+       // The string is owned by the caller.
+       free(pinnedCores);
+       pinnedCores = NULL;
+
+       return 0;
+}
+
+#endif // TEST
+
diff --git a/src/Pinning.h b/src/Pinning.h
new file mode 100644 (file)
index 0000000..40225c1
--- /dev/null
@@ -0,0 +1,47 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#ifndef __PINNING_H__
+#define __PINNING_H__
+
+
+int PinCurrentThreadToCore(int coreNumber);
+
+int PinParseCpuList(const char * cpuList,
+                                       int mpiRank, int nodeRank, int threadNumber);
+
+int PinCurrentThreadByEnvVar(const char * envVarName,
+       int mpiRank, int nodeRank, int threadNumber);
+
+int PinCurrentThreadByCpuList(const char * cpuList,
+       int mpiRank, int nodeRank, int threadNumber);
+
+int PinCurrentCore();
+
+char * PinCpuListAsString();
+
+
+#endif // __PINNING_H__
diff --git a/src/Vector.h b/src/Vector.h
new file mode 100644 (file)
index 0000000..41b9a79
--- /dev/null
@@ -0,0 +1,83 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#ifndef __VECTOR_H__
+#define __VECTOR_H__
+// Select the vector ISA: fall back to AVX (with a warning) if neither VECTOR_AVX nor VECTOR_SSE is defined.
+#if !defined(VECTOR_AVX) && !defined(VECTOR_SSE)
+       #warning Defining VECTOR_AVX as no ISA extension was selected.
+       #define VECTOR_AVX
+#endif
+// Defining both ISA macros at the same time is rejected at compile time.
+#if defined(VECTOR_AVX) && defined(VECTOR_SSE)
+       #error Only VECTOR_AVX or VECTOR_SSE can be defined at the same time.
+#endif
+
+#ifdef VECTOR_AVX
+       // AVX variant: every macro maps to a 256-bit double-precision intrinsic.
+       #include <immintrin.h>
+       // Vector size in double-precision floating-point numbers.
+       #define VSIZE   4
+       // Vector data type.
+       #define VPDFT                           __m256d
+       // Broadcasts one scalar to all elements of a vector.
+       #define VSET(scalar)            _mm256_set1_pd(scalar)
+       // Loads: VLD requires a 32-byte aligned address, VLDU accepts unaligned addresses.
+       #define VLD(expr)                       _mm256_load_pd(expr)
+       #define VLDU(expr)                      _mm256_loadu_pd(expr)
+       // Stores: VST (aligned), VSTU (unaligned), VSTNT (non-temporal, 32-byte aligned).
+       #define VST(dst, src)           _mm256_store_pd(dst, src)
+       #define VSTU(dst, src)          _mm256_storeu_pd(dst, src)
+       #define VSTNT(dst, src)         _mm256_stream_pd(dst, src)
+       // Element-wise arithmetic.
+       #define VMUL(a, b)                      _mm256_mul_pd(a, b)
+       #define VADD(a, b)                      _mm256_add_pd(a, b)
+       #define VSUB(a, b)                      _mm256_sub_pd(a, b)
+#endif
+
+#ifdef VECTOR_SSE
+       #include <emmintrin.h>
+       // Vector size in double-precision floating-point numbers.
+       #define VSIZE 2
+       // Vector data type (SSE variant: 128-bit double-precision intrinsics).
+       #define VPDFT                           __m128d
+       // Broadcasts one scalar to all elements of a vector.
+       #define VSET(scalar)            _mm_set1_pd(scalar)
+       // Loads: VLD requires a 16-byte aligned address, VLDU accepts unaligned addresses.
+       #define VLD(expr)                       _mm_load_pd(expr)
+       #define VLDU(expr)                      _mm_loadu_pd(expr)
+       // Stores: VST (aligned), VSTU (unaligned), VSTNT (non-temporal, 16-byte aligned).
+       #define VST(dst, src)           _mm_store_pd(dst, src)
+       #define VSTU(dst, src)          _mm_storeu_pd(dst, src)
+       #define VSTNT(dst, src)         _mm_stream_pd(dst, src)
+       // Element-wise arithmetic.
+       #define VMUL(a, b)                      _mm_mul_pd(a, b)
+       #define VADD(a, b)                      _mm_add_pd(a, b)
+       #define VSUB(a, b)                      _mm_sub_pd(a, b)
+#endif
+
+
+#endif // __VECTOR_H__
diff --git a/src/Vtk.c b/src/Vtk.c
new file mode 100644 (file)
index 0000000..e650722
--- /dev/null
+++ b/src/Vtk.c
@@ -0,0 +1,245 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#include "Vtk.h"
+
+#include <math.h>
+
+// TODO: make this portable
+
+// needed for stat & mkdir
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h> // strerror
+
+// TODO: make byteswap portable
+
+#include <inttypes.h>
+// glibc
+#include <byteswap.h>
+
+// macros for portability
+// #define BS32(a)             bswap_32(*((uint32_t *)(&a)))
+
+// Returns the 8 bytes stored at p as a byte-swapped 64-bit integer.
+// The bytes are copied with memcpy instead of dereferencing a casted
+// pointer, as reading a double through a uint64_t pointer violates the
+// strict aliasing rules and may be miscompiled with optimization enabled.
+static inline uint64_t VtkLoadSwapped64(const void * p)
+{
+       uint64_t bits;
+
+       memcpy(&bits, p, sizeof(bits));
+
+       return bswap_64(bits);
+}
+
+// Byte-swapped bit pattern of the 64-bit object a; a must be an lvalue
+// (e.g. a double variable), exactly as before.
+#define BS64(a)                VtkLoadSwapped64(&(a))
+// Writes node types, density, and velocity of all lattice nodes to the legacy VTK file vtk/file-<iteration>.vtk.
+// ld: lattice dimensions and node types; kd: provides GetNode() to fetch the PDFs; cd: only read if VERIFICATION is defined.
+void VtkWrite(LatticeDesc * ld, KernelData * kd, CaseData * cd, int iteration)
+{
+       Assert(kd != NULL);
+       Assert(ld != NULL);
+       Assert(ld->Dims[0] > 0);
+       Assert(ld->Dims[1] > 0);
+       Assert(ld->Dims[2] > 0);
+       // NOTE(review): cd is not checked here, although cd->XForce is read below when VERIFICATION is defined.
+       // TODO: this should be made portable...
+       // Check if subdirectory vtk exists, if not, create it.
+       {
+               int err;
+               struct stat fileStatus;
+
+               err = stat("vtk", &fileStatus);
+               // Every failure of stat is treated as "vtk does not exist yet".
+               if (err) {
+                       // printf("ERROR: stat %d - %s\n", errno, strerror(errno));
+
+                       // Request mode 0700; mkdir additionally applies the umask of the process.
+                       err = mkdir("vtk", 0700);
+
+                       if (err) {
+                               printf("ERROR: cannot create directory vtk - %d: %s\n", errno, strerror(errno));
+                               exit(1);
+                       }
+
+                       printf("# created directory vtk.\n");
+               }
+               else {
+                       // The name already exists: it must be a directory to be usable.
+                       if (!S_ISDIR(fileStatus.st_mode)) {
+                               printf("ERROR: cannot create subdirectory vtk as already a file with the same name exists.\n");
+                               exit(1);
+                       }
+
+               }
+       }
+
+
+       char fileName[1024];
+
+       snprintf(fileName, sizeof(fileName), "vtk/file-%04d.vtk", iteration);
+
+       printf("# VTK: writing file %s\n", fileName);
+
+       FILE * fh;
+       // NOTE(review): text mode "w" is also used for the BINARY variant; this relies on text and binary streams behaving identically (as on POSIX systems).
+       fh = fopen(fileName, "w");
+
+       if(fh == NULL) {
+               printf("ERROR: opening file %s failed.\n", fileName);
+               exit(1);
+       }
+
+       // File format description: http://www.vtk.org/pdf/file-formats.pdf
+       int nX = ld->Dims[0];
+       int nY = ld->Dims[1];
+       int nZ = ld->Dims[2];
+       int * lDims = ld->Dims;
+
+       // Temporaries holding the byte-swapped values written in BINARY mode.
+       uint64_t uDensity, uUx, uUy, uUz;
+       // PDFs of a single node as delivered by kd->GetNode().
+       PdfT pdfs[N_D3Q19];
+
+       fprintf(fh, "# vtk DataFile Version 1.0\n");
+       fprintf(fh, "Comment: lid driven cavity, iteration % 4d\n", iteration);
+#ifdef VTK_OUTPUT_ASCII
+       fprintf(fh, "ASCII\n");
+#else
+       fprintf(fh, "BINARY\n");
+#endif
+       fprintf(fh, "DATASET STRUCTURED_POINTS\n");
+       fprintf(fh, "DIMENSIONS %d %d %d\n", nX, nY, nZ);
+       fprintf(fh, "ORIGIN 0 0 0 \n");
+       fprintf(fh, "SPACING 1 1 1\n");
+       fprintf(fh, "POINT_DATA %d\n", nX * nY * nZ);
+
+       // ----------------------------------------------------------------------
+       // Flag field: obstacle = 0, fluid = 1, inlet = 2, outlet = 4
+
+       fprintf(fh, "SCALARS NodesTypes unsigned_char 1\n");
+       fprintf(fh, "LOOKUP_TABLE default\n");
+
+       unsigned char c;
+       // All fields are written with x as fastest and z as slowest running index.
+       for(int z = 0; z < nZ; ++z) {
+               for(int y = 0; y < nY; ++y) {
+                       for(int x = 0; x < nX; ++x) {
+#ifdef VTK_OUTPUT_ASCII
+                               fprintf(fh, "%d\n", ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)]);
+#else
+                               c = (unsigned char)ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)];
+                               fwrite(&c, sizeof(unsigned char), 1, fh);
+#endif
+                       }
+               }
+       }
+
+       // ----------------------------------------------------------------------
+       // Density field: sum over all PDFs of a node, 0.0 for obstacle nodes.
+
+       fprintf(fh, "SCALARS Density double\n");
+       fprintf(fh, "LOOKUP_TABLE default\n");
+
+       double density;
+
+       for(int z = 0; z < nZ; ++z) {
+               for(int y = 0; y < nY; ++y) {
+                       for(int x = 0; x < nX; ++x) {
+
+                               density = 0.0;
+                               if (ld->Lattice[L_INDEX_4(lDims, x, y, z)] != LAT_CELL_OBSTACLE) {
+                                       kd->GetNode(kd, x, y, z, pdfs);
+
+                                       for (int d = 0; d < N_D3Q19; ++d) {
+                                               density += pdfs[d];
+                                       }
+                               }
+                               // BINARY mode writes the byte-swapped (BS64) representation of the double.
+#ifdef VTK_OUTPUT_ASCII
+                               fprintf(fh, "%e\n", density);
+#else
+                               uDensity = BS64(density);
+                               fwrite(&uDensity, sizeof(double), 1, fh);
+#endif
+                       }
+               }
+       }
+
+       // ----------------------------------------------------------------------
+       // Velocity vectors: velocity in x, y, and z direction
+       // Computed as differences of PDF sums without division by the density; (0, 0, 0) for obstacle nodes.
+       fprintf(fh, "VECTORS VelocityVectors double\n");
+
+       // Declare pdf_N, pdf_E, pdf_S, pdf_W, ...
+       #define X(name, idx, idxinv, x, y, z)   PdfT JOIN(pdf_,name);
+       D3Q19_LIST
+       #undef X
+
+       double ux, uy, uz;
+
+       for(int z = 0; z < nZ; ++z) {
+               for(int y = 0; y < nY; ++y) {
+                       for(int x = 0; x < nX; ++x) {
+
+                               if (ld->Lattice[L_INDEX_4(lDims, x, y, z)] != LAT_CELL_OBSTACLE) {
+                                       kd->GetNode(kd, x, y, z, pdfs);
+
+// DETECT NANS
+//                                     for (int d = 0; d < 19; ++d) {
+//                                             if(isnan(pdfs[d])) {
+//                                                     printf("%d %d %d %d nan!\n", x, y, z, d);
+//                                                     for (int d2 = 0; d2 < 19; ++d2) {
+//                                                             printf("%d: %e\n", d2, pdfs[d2]);
+//                                                     }
+//                                                     exit(1);
+//                                             }
+//                                     }
+                                       #define X(name, idx, idxinv, _x, _y, _z)        JOIN(pdf_,name) = pdfs[idx];
+                                       D3Q19_LIST
+                                       #undef X
+                                       UNUSED(pdf_C);
+                                       // pdf_C is assigned by the X-macro above, but does not enter the sums below.
+
+                                       ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE -
+                                                pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW;
+                                       uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN -
+                                                pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS;
+                                       uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS -
+                                                pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS;
+                                       #ifdef VERIFICATION
+                                       ux += 0.5 * cd->XForce;
+                                       #endif
+                               }
+                               else {
+                                       ux = 0.0; uy = 0.0; uz = 0.0;
+                               }
+
+#ifdef VTK_OUTPUT_ASCII
+                               fprintf(fh, "%f %f %f\n", ux, uy, uz);
+#else
+                               uUx = BS64(ux); uUy = BS64(uy); uUz = BS64(uz);
+                               fwrite(&uUx, sizeof(double), 1, fh);
+                               fwrite(&uUy, sizeof(double), 1, fh);
+                               fwrite(&uUz, sizeof(double), 1, fh);
+#endif
+                       }
+               }
+       }
+
+       fclose(fh);
+}
+
diff --git a/src/Vtk.h b/src/Vtk.h
new file mode 100644 (file)
index 0000000..b36395d
--- /dev/null
+++ b/src/Vtk.h
@@ -0,0 +1,34 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#ifndef __VTK_H__
+#define __VTK_H__
+
+#include "Kernel.h"
+
+void VtkWrite(LatticeDesc * ld, KernelData * kd, CaseData *cd, int iteration);
+
+#endif // __VTK_H__
diff --git a/src/config.linux-gcc.mk b/src/config.linux-gcc.mk
new file mode 100644 (file)
index 0000000..41dea5d
--- /dev/null
@@ -0,0 +1,97 @@
+# --------------------------------------------------------------------------
+#
+# Copyright
+#   Markus Wittmann, 2016-2017
+#   RRZE, University of Erlangen-Nuremberg, Germany
+#   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+#
+#   Viktor Haag, 2016
+#   LSS, University of Erlangen-Nuremberg, Germany
+#
+#  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+#
+#  LbmBenchKernels is free software: you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation, either version 3 of the License, or
+#  (at your option) any later version.
+#
+#  LbmBenchKernels is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+#
+# --------------------------------------------------------------------------
+
+
+# ------------------------------------------------------------------------
+# C compiler/linker to use.
+# Flags are specified at the end of the file.
+# ------------------------------------------------------------------------
+CC              = gcc
+LD              = gcc
+
+# ------------------------------------------------------------------------
+
+# Preprocessing variables.
+D          = -D
+I          = -I
+PP_FLAGS  +=
+
+# Architecture to optimize for.
+TARCH     ?= -mavx
+
+# Generates dependencies, can be left empty.
+MAKE_DEPEND = $(CC) -MM -MQ'$(OBJECT_DIR)/$(<:%.c=%.o)' -MF'$(DEP_DIR)/$(<:%.c=%.d)' $(PP_FLAGS) $< > /dev/null
+
+# Generates dependencies, can be left empty.
+# $(call make_depend,<source-file>,<source-file-to-generate-dependency-for>)
+define make_depend
+       $(CC) -MM -MQ'$(OBJECT_DIR)/$(2:%.c=%.o)' -MF'$(DEP_DIR)/$(2:%.c=%.d)' $(PP_FLAGS) $1 > /dev/null
+endef
+
+
+ifeq (on,$(OPENMP))
+  OPENMP_C_FLAGS  += -fopenmp
+  OPENMP_LD_FLAGS += -fopenmp
+endif
+
+
+ifeq (release,$(BUILD))
+
+  C_FLAGS      += -O3 $(TARCH)
+  LD_FLAGS     += -O3 $(TARCH)
+
+  PP_FLAGS     +=
+
+else
+ifeq (debug,$(BUILD))
+
+  C_FLAGS      += -O0 $(TARCH) -g -ggdb
+  LD_FLAGS     += -O0 $(TARCH) -g -ggdb
+
+  PP_FLAGS     += $(D)DEBUG
+
+else
+  $(error unknown BUILD=$(BUILD), specify release or debug)
+endif
+endif
+
+ifeq (on,$(ADDRESS_SANITIZER))
+  # see https://github.com/google/sanitizers/wiki/AddressSanitizerFlags for details
+  C_FLAGS      += -fsanitize=address
+  LD_FLAGS     += -fsanitize=address
+endif
+
+ifeq (on,$(DEBUG_SYMBOLS))
+  C_FLAGS      += -g -ggdb
+  LD_FLAGS     += -g -ggdb
+endif
+
+C_FLAGS        += -Wall -Wuninitialized -Wunused-variable -Wshadow -fargument-noalias -fargument-noalias-anything -std=c99 \
+ -MT $@ -MF $(patsubst $(OBJECT_DIR)/%.o,$(DEP_DIR)/%.d,$@) -MMD $(OPENMP_C_FLAGS)
+LD_FLAGS       += -Wall $(OPENMP_LD_FLAGS)
+LD_LIBS        += -lm
+
diff --git a/src/config.linux-intel.mk b/src/config.linux-intel.mk
new file mode 100644 (file)
index 0000000..00d1ca7
--- /dev/null
@@ -0,0 +1,100 @@
+# --------------------------------------------------------------------------
+#
+# Copyright
+#   Markus Wittmann, 2016, 2017
+#   RRZE, University of Erlangen-Nuremberg, Germany
+#   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+#
+#   Viktor Haag, 2016
+#   LSS, University of Erlangen-Nuremberg, Germany
+#
+#  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+#
+#  LbmBenchKernels is free software: you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation, either version 3 of the License, or
+#  (at your option) any later version.
+#
+#  LbmBenchKernels is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+#
+# --------------------------------------------------------------------------
+
+
+# ------------------------------------------------------------------------
+# C compiler/linker to use.
+# Flags are specified at the end of the file.
+# ------------------------------------------------------------------------
+CC              = icc
+LD              = icc
+
+# ------------------------------------------------------------------------
+
+# Preprocessing variables.
+D          = -D
+I          = -I
+PP_FLAGS  +=
+
+# Architecture to optimize for.
+TARCH     ?= -xAVX
+
+# Generates dependencies, can be left empty.
+MAKE_DEPEND = $(CC) -MM -MQ'$(OBJECT_DIR)/$(<:%.c=%.o)' -MF'$(DEP_DIR)/$(<:%.c=%.d)' $(PP_FLAGS) $< > /dev/null
+
+# Generates dependencies, can be left empty.
+# $(call make_depend,<source-file>,<source-file-to-generate-dependency-for>)
+define make_depend
+       $(CC) -MM -MQ'$(OBJECT_DIR)/$(2:%.c=%.o)' -MF'$(DEP_DIR)/$(2:%.c=%.d)' $(PP_FLAGS) $1 > /dev/null
+endef
+
+ifeq (on,$(OPENMP))
+  OPENMP_C_FLAGS  += -qopenmp
+  OPENMP_LD_FLAGS += -qopenmp
+endif
+
+ifeq (release,$(BUILD))
+
+  C_FLAGS      += -O3 $(TARCH)
+  LD_FLAGS     += -O3 $(TARCH)
+
+  PP_FLAGS     +=
+
+else
+ifeq (debug,$(BUILD))
+
+  C_FLAGS      += -O0 $(TARCH) -g -fstack-protector -debug inline-debug-info -debug extended -debug variable-locations
+  LD_FLAGS     += -O0 $(TARCH) -g -fstack-protector -debug inline-debug-info -debug extended -debug variable-locations
+
+  # Intel compiler intrinsic reference:
+  # debugging: compile with -D__INTEL_COMPILER_USE_INTRINSIC_PROTOTYPES for
+  #            improved compile-time checking
+  # release:   remove this option as it significantly increases compile time
+
+  PP_FLAGS     += $(D)DEBUG $(D)__INTEL_COMPILER_USE_INTRINSIC_PROTOTYPES
+
+else
+  $(error unknown BUILD=$(BUILD), specify release or debug)
+endif
+endif
+
+ifeq (on,$(DEBUG_SYMBOLS))
+  C_FLAGS      += -g
+  LD_FLAGS     += -g
+endif
+
+
+# ------------------------------------------------------------------------
+# C compiler/linker flags to use.
+# ------------------------------------------------------------------------
+C_FLAGS        += -Wall -Wcheck -Wabi -Wdeprecated -Wextra-tokens -Wformat -Wformat-security -Wshadow -Wuninitialized -Wunused-variable \
+ -fno-alias -fargument-noalias -fno-fnalias -std=c99 \
+ -MT $@ -MF $(patsubst $(OBJECT_DIR)/%.o,$(DEP_DIR)/%.d,$@) -MMD -diag-disable 10010 $(OPENMP_C_FLAGS)
+LD_FLAGS       += -Wall -Wcheck $(OPENMP_LD_FLAGS)
+LD_LIBS        +=
+
+
diff --git a/src/test-verification.sh b/src/test-verification.sh
new file mode 100755 (executable)
index 0000000..1dc8906
--- /dev/null
@@ -0,0 +1,113 @@
+#!/bin/bash -l
+# --------------------------------------------------------------------------
+#
+# Copyright
+#   Markus Wittmann, 2016-2017
+#   RRZE, University of Erlangen-Nuremberg, Germany
+#   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+#
+#   Viktor Haag, 2016
+#   LSS, University of Erlangen-Nuremberg, Germany
+#
+#  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+#
+#  LbmBenchKernels is free software: you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation, either version 3 of the License, or
+#  (at your option) any later version.
+#
+#  LbmBenchKernels is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+#
+# --------------------------------------------------------------------------
+set -u
+
+Tmp="delme.test.sh.$(hostname).$$.tmp"
+Binary="../bin/lbmbenchk-linux-intel-release"
+NThreads="5"
+
+TestsTotal="0"
+TestsFailed="0"
+TestsSucceeded="0"
+
+if [ "$#" -ge "1" ]; then
+  Binary="$1"
+fi
+
+
+function on_exit
+{
+  if [ -e "$Tmp" ]; then
+    rm -f "$Tmp" 2>&1 || true
+  fi
+}
+
+trap "on_exit" EXIT
+
+# Runs one verification of a kernel with T threads, pinned to the cores
+# 0..T-1, and updates the global counters TestsTotal, TestsFailed, and
+# TestsSucceeded.
+#
+#   $1  binary to run
+#   $2  kernel name
+#   $3  number of threads
+#   $4  optional kernel arguments, passed after "--" and split at whitespace
+#
+# The output of the run is stored in $Tmp and only printed if the binary
+# returns a non-zero exit code.
+function run_kernel
+{
+  local Binary="$1"
+  local K="$2"      # Kernel name
+  local T="$3"      # Number of threads
+  local KernelArgs="${4:-""}"
+  local PinList
+
+  PinList="$(seq -s , 0 $((T-1)))"
+
+  # Binary, kernel name, and thread count are passed as single words, so
+  # that e.g. a path to the binary containing spaces works.
+  local -a Cmd=("$Binary" -verify -kernel "$K" -t "$T" -pin "$PinList")
+
+  if [ "$KernelArgs" != "" ]; then
+    # Intentionally unquoted: KernelArgs can hold several arguments, e.g. "-blk 7".
+    Cmd+=(-- $KernelArgs)
+  fi
+
+  ((++TestsTotal))
+
+  echo -n "$Binary -verify -kernel $K -t $T -pin $PinList ${KernelArgs:+"-- "}$KernelArgs "
+
+  "${Cmd[@]}" > "$Tmp" 2>&1
+
+  local ExitCode="$?"
+
+  if [ "$ExitCode" != "0" ]; then
+    echo ""
+    cat "$Tmp"
+    echo "$Binary -verify  -kernel $K"
+    echo "Verification failed. Exit code = $ExitCode."
+    ((++TestsFailed))
+  else
+    echo "OK"
+    ((++TestsSucceeded))
+  fi
+
+}
+
+for K in $("$Binary" -list | tail -n +7); do
+
+  for T in $(seq 1 $NThreads); do
+
+    run_kernel "$Binary" "$K" "$T"
+
+    # Check in the usage string, if the kernel accepts parameters for blocking.
+
+    $Binary -kernel $K -- -h > "$Tmp" 2>&1
+
+    LineParameterStart="$(grep -n "^Kernel parameters:" "$Tmp" | sed -e 's/:.*//')"
+
+    if [ "$LineParameterStart" == "" ]; then
+      continue
+    fi
+
+    tail -n +$LineParameterStart "$Tmp" | grep -q -- "-blk"
+    ExitCode="$?"
+
+    if [ "$ExitCode" == "0" ]; then
+      # Kernel supports blocking
+      run_kernel "$Binary" "$K" "$T" "-blk 7"
+    fi
+
+  done
+
+done
+
+
+echo "# Tests toal: $TestsTotal  succeeded: $TestsSucceeded  failed: $TestsFailed"
diff --git a/src/test.sh b/src/test.sh
new file mode 100755 (executable)
index 0000000..ea64791
--- /dev/null
@@ -0,0 +1,69 @@
+#!/bin/bash -l
+# --------------------------------------------------------------------------
+#
+# Copyright
+#   Markus Wittmann, 2016-2017
+#   RRZE, University of Erlangen-Nuremberg, Germany
+#   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+#
+#   Viktor Haag, 2016
+#   LSS, University of Erlangen-Nuremberg, Germany
+#
+#  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+#
+#  LbmBenchKernels is free software: you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation, either version 3 of the License, or
+#  (at your option) any later version.
+#
+#  LbmBenchKernels is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+#
+# --------------------------------------------------------------------------
+set -u
+set -e
+
+XTag="-test"
+
+Build=release
+
+
+
+Config=linux-intel
+
+module purge
+module add intel64
+
+make clean-all
+make -j CONFIG=$Config TAG=$XTag-debug
+make -j CONFIG=$Config BUILD=$Build TAG=$XTag-v
+make -j CONFIG=$Config BUILD=$Build TAG=$XTag-b BENCHMARK=on
+
+BinaryV="../bin/lbmbenchk-$Config-$Build$XTag-v"
+BinaryB="../bin/lbmbenchk-$Config-$Build$XTag-b"
+
+./test-verification.sh "$BinaryV"
+
+
+
+Config=linux-gcc
+
+module purge
+module add gcc
+
+make clean-all
+
+make -j CONFIG=$Config TAG=$XTag-debug
+make -j CONFIG=$Config BUILD=$Build TAG=$XTag-v
+make -j CONFIG=$Config BUILD=$Build TAG=$XTag-b BENCHMARK=on
+
+BinaryV="../bin/lbmbenchk-$Config-$Build$XTag-v"
+BinaryB="../bin/lbmbenchk-$Config-$Build$XTag-b"
+
+./test-verification.sh "$BinaryV"
+
This page took 0.898319 seconds and 5 git commands to generate.