From 109880839321408644c94a34eb31208460b9f46d Mon Sep 17 00:00:00 2001 From: Markus Wittmann Date: Mon, 10 Jul 2017 11:38:45 +0200 Subject: [PATCH] version 0.1 --- COPYING | 675 ++++++++++++++ LICENSE | 675 ++++++++++++++ doc/Makefile | 38 + doc/main.rst | 242 +++++ src/Base.h | 204 +++++ src/BenchKernelD3Q19.c | 736 +++++++++++++++ src/BenchKernelD3Q19.h | 57 ++ src/BenchKernelD3Q19Common.c | 655 ++++++++++++++ src/BenchKernelD3Q19Common.h | 86 ++ src/BenchKernelD3Q19List.c | 347 +++++++ src/BenchKernelD3Q19List.h | 44 + src/BenchKernelD3Q19ListAa.c | 406 +++++++++ src/BenchKernelD3Q19ListAa.h | 40 + src/BenchKernelD3Q19ListAaCommon.c | 628 +++++++++++++ src/BenchKernelD3Q19ListAaCommon.h | 137 +++ src/BenchKernelD3Q19ListAaPv.c | 847 ++++++++++++++++++ src/BenchKernelD3Q19ListAaPv.h | 38 + src/BenchKernelD3Q19ListAaPvCommon.c | 821 +++++++++++++++++ src/BenchKernelD3Q19ListAaPvCommon.h | 61 ++ src/BenchKernelD3Q19ListAaRia.c | 479 ++++++++++ src/BenchKernelD3Q19ListAaRia.h | 38 + src/BenchKernelD3Q19ListAaRiaCommon.c | 804 +++++++++++++++++ src/BenchKernelD3Q19ListAaRiaCommon.h | 61 ++ src/BenchKernelD3Q19ListCommon.c | 609 +++++++++++++ src/BenchKernelD3Q19ListCommon.h | 136 +++ src/BenchKernelD3Q19ListPullSplitNt.c | 455 ++++++++++ src/BenchKernelD3Q19ListPullSplitNt.h | 41 + ...chKernelD3Q19ListPullSplitNt1SIntrinsics.h | 142 +++ src/BenchKernelD3Q19ListPullSplitNt1SScalar.h | 139 +++ ...chKernelD3Q19ListPullSplitNt2SIntrinsics.h | 130 +++ src/BenchKernelD3Q19ListPullSplitNt2SScalar.h | 133 +++ src/BenchKernelD3Q19ListPullSplitNtCommon.c | 797 ++++++++++++++++ src/BenchKernelD3Q19ListPullSplitNtCommon.h | 57 ++ src/BoostJoin.h | 38 + src/Geometry.c | 267 ++++++ src/Geometry.h | 48 + src/Kernel.c | 801 +++++++++++++++++ src/Kernel.h | 217 +++++ src/KernelFunctions.h | 138 +++ src/Lattice.h | 78 ++ src/LikwidIf.h | 46 + src/Main.c | 589 ++++++++++++ src/Makefile | 298 ++++++ src/Memory.c | 86 ++ src/Memory.h | 38 + src/Pinning.c | 393 ++++++++ src/Pinning.h | 47 
+ src/Vector.h | 83 ++ src/Vtk.c | 245 +++++ src/Vtk.h | 34 + src/config.linux-gcc.mk | 97 ++ src/config.linux-intel.mk | 100 +++ src/test-verification.sh | 113 +++ src/test.sh | 69 ++ 54 files changed, 14583 insertions(+) create mode 100644 COPYING create mode 100644 LICENSE create mode 100644 doc/Makefile create mode 100644 doc/main.rst create mode 100644 src/Base.h create mode 100644 src/BenchKernelD3Q19.c create mode 100644 src/BenchKernelD3Q19.h create mode 100644 src/BenchKernelD3Q19Common.c create mode 100644 src/BenchKernelD3Q19Common.h create mode 100644 src/BenchKernelD3Q19List.c create mode 100644 src/BenchKernelD3Q19List.h create mode 100644 src/BenchKernelD3Q19ListAa.c create mode 100644 src/BenchKernelD3Q19ListAa.h create mode 100644 src/BenchKernelD3Q19ListAaCommon.c create mode 100644 src/BenchKernelD3Q19ListAaCommon.h create mode 100644 src/BenchKernelD3Q19ListAaPv.c create mode 100644 src/BenchKernelD3Q19ListAaPv.h create mode 100644 src/BenchKernelD3Q19ListAaPvCommon.c create mode 100644 src/BenchKernelD3Q19ListAaPvCommon.h create mode 100644 src/BenchKernelD3Q19ListAaRia.c create mode 100644 src/BenchKernelD3Q19ListAaRia.h create mode 100644 src/BenchKernelD3Q19ListAaRiaCommon.c create mode 100644 src/BenchKernelD3Q19ListAaRiaCommon.h create mode 100644 src/BenchKernelD3Q19ListCommon.c create mode 100644 src/BenchKernelD3Q19ListCommon.h create mode 100644 src/BenchKernelD3Q19ListPullSplitNt.c create mode 100644 src/BenchKernelD3Q19ListPullSplitNt.h create mode 100644 src/BenchKernelD3Q19ListPullSplitNt1SIntrinsics.h create mode 100644 src/BenchKernelD3Q19ListPullSplitNt1SScalar.h create mode 100644 src/BenchKernelD3Q19ListPullSplitNt2SIntrinsics.h create mode 100644 src/BenchKernelD3Q19ListPullSplitNt2SScalar.h create mode 100644 src/BenchKernelD3Q19ListPullSplitNtCommon.c create mode 100644 src/BenchKernelD3Q19ListPullSplitNtCommon.h create mode 100644 src/BoostJoin.h create mode 100644 src/Geometry.c create mode 100644 src/Geometry.h create 
mode 100644 src/Kernel.c create mode 100644 src/Kernel.h create mode 100644 src/KernelFunctions.h create mode 100644 src/Lattice.h create mode 100644 src/LikwidIf.h create mode 100644 src/Main.c create mode 100644 src/Makefile create mode 100644 src/Memory.c create mode 100644 src/Memory.h create mode 100644 src/Pinning.c create mode 100644 src/Pinning.h create mode 100644 src/Vector.h create mode 100644 src/Vtk.c create mode 100644 src/Vtk.h create mode 100644 src/config.linux-gcc.mk create mode 100644 src/config.linux-intel.mk create mode 100755 src/test-verification.sh create mode 100755 src/test.sh diff --git a/COPYING b/COPYING new file mode 100644 index 0000000..10926e8 --- /dev/null +++ b/COPYING @@ -0,0 +1,675 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. 
Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. 
If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. 
Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. 
+ + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. 
Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. 
+ + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. 
+ + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. 
If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. 
+ + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. 
+ + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the 
material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. 
+ + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. 
If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. 
+ + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. 
The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + <program> Copyright (C) <year> <name of author> + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +<http://www.gnu.org/licenses/>. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +<http://www.gnu.org/philosophy/why-not-lgpl.html>.
+ diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..10926e8 --- /dev/null +++ b/LICENSE @@ -0,0 +1,675 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/> + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights.
+ + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. 
+ + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. 
+ + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. 
Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. 
+ + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. 
This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. 
+ + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. 
+ + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. 
+ + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. 
If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. 
If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. 
+ + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. 
For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. 
+ + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. 
The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + + Copyright (C) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + Copyright (C) + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +. 
+ diff --git a/doc/Makefile b/doc/Makefile new file mode 100644 index 0000000..690e4eb --- /dev/null +++ b/doc/Makefile @@ -0,0 +1,38 @@ +# -------------------------------------------------------------------------- +# +# Copyright +# Markus Wittmann, 2016-2017 +# RRZE, University of Erlangen-Nuremberg, Germany +# markus.wittmann -at- fau.de or hpc -at- rrze.fau.de +# +# Viktor Haag, 2016 +# LSS, University of Erlangen-Nuremberg, Germany +# +# This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). +# +# LbmBenchKernels is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# LbmBenchKernels is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with LbmBenchKernels. If not, see . +# +# -------------------------------------------------------------------------- + + +.PHONY: all main + +all: main + +main: main.rst +#main.css + [ -d html ] || mkdir -p html +# rst2html --stylesheet=html4css1.css,main.css $< html/$@.html + rst2html --stylesheet=html4css1.css $< html/$@.html + diff --git a/doc/main.rst b/doc/main.rst new file mode 100644 index 0000000..921ea84 --- /dev/null +++ b/doc/main.rst @@ -0,0 +1,242 @@ +.. # -------------------------------------------------------------------------- + # + # Copyright + # Markus Wittmann, 2016-2017 + # RRZE, University of Erlangen-Nuremberg, Germany + # markus.wittmann -at- fau.de or hpc -at- rrze.fau.de + # + # Viktor Haag, 2016 + # LSS, University of Erlangen-Nuremberg, Germany + # + # This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). 
+ # + # LbmBenchKernels is free software: you can redistribute it and/or modify + # it under the terms of the GNU General Public License as published by + # the Free Software Foundation, either version 3 of the License, or + # (at your option) any later version. + # + # LbmBenchKernels is distributed in the hope that it will be useful, + # but WITHOUT ANY WARRANTY; without even the implied warranty of + # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + # GNU General Public License for more details. + # + # You should have received a copy of the GNU General Public License + # along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>. + # + # -------------------------------------------------------------------------- + +.. title:: LBM Benchmark Kernels Documentation + + +=================================== +LBM Benchmark Kernels Documentation +=================================== + +.. sectnum:: +.. contents:: + +Compilation +=========== + +The benchmark framework currently supports only Linux systems and the GCC and +Intel compilers. Every other configuration probably requires adjustment inside +the code and the makefiles. Further, some code might be platform or at least +POSIX specific. + +The benchmark can be built via ``make`` from the ``src`` subdirectory. This will +generate one binary which hosts all implemented benchmark kernels. + +Binaries are located under the ``bin`` subdirectory and will have different names +depending on compiler and build configuration. + +Debug and Verification +---------------------- + +:: + + make + +Running ``make`` without any arguments builds the debug version (BUILD=debug) of +the benchmark kernels, where no optimizations are performed, line numbers and +debug symbols are included as well as ``DEBUG`` will be defined. The resulting +binary will be found in the ``bin`` subdirectory and named +``lbmbenchk-linux-<compiler>-debug``. 
+ +Without any further specification the binary includes verification +(``VERIFICATION=on``), statistics (``STATISTICS``), and VTK output +(``VTK_OUTPUT=on``) enabled. + +Please note that the generated binary will therefore +exhibit poor performance. + +Benchmarking +------------ + +To generate a binary for benchmarking run make with :: + + make BENCHMARK=on BUILD=release + +Here BUILD=release turns optimizations on and BENCHMARK=on disables +verification, statistics, and VTK output. + +Release and Verification +------------------------ + +Verification with the debug builds can be extremely slow. Hence verification +capabilities can be built with release builds: :: + + make BUILD=release + +Compilers +--------- + +Currently only the GCC and Intel compilers under Linux are supported. Either +configuration can be chosen via ``CONFIG=linux-gcc`` or +``CONFIG=linux-intel``. + +Options Summary +--------------- + +Options that can be specified when building the framework with make: + +============= ======================= ============ ========================================================== +name values default description +------------- ----------------------- ------------ ---------------------------------------------------------- +TARCH -- -- Via TARCH the architecture the compiler generates code for can be overridden. The value depends on the chosen compiler. +BENCHMARK on, off off If enabled, disables VERIFICATION, STATISTICS, VTK_OUTPUT. +BUILD debug, release debug No optimization, debug symbols, DEBUG defined. +CONFIG linux-gcc, linux-intel linux-intel Select GCC or Intel compiler. +ISA avx, sse avx Determines which ISA extension is used for macro definitions. This is *not* the architecture the compiler generates code for. +OPENMP on, off on OpenMP, i.e. threading support. +STATISTICS on, off off View statistics, like density etc., during simulation. +VERIFICATION on, off off Turn verification on/off. 
+VTK_OUTPUT on, off off Enable/Disable VTK file output. +============= ======================= ============ ========================================================== + +Invocation +========== + +Running the binary will print, along with the GPL licence header, a line like the following: + + LBM Benchmark Kernels 0.1, compiled Jul 5 2017 21:59:22, type: verification + +if verification was enabled during compilation or + + LBM Benchmark Kernels 0.1, compiled Jul 5 2017 21:59:22, type: benchmark + +if verification was disabled during compilation. + +Command Line Parameters +----------------------- + +Running the binary with ``-h`` lists all available parameters: :: + + Usage: + ./lbmbenchk -list + ./lbmbenchk + [-dims XxYyZ] [-geometry box|channel|pipe|blocks[-]] [-iterations ] [-lattice-dump-ascii] + [-rho-in ] [-rho-out ] [-kernel ] + [-periodic-x] + [-t ] + [-pin core{,core}*] + [-verify] + -- + + -list List available kernels. + + -dims XxYxZ Specify geometry dimensions. + + -geometry blocks- + Geometetry with blocks of size regularily layout out. + + +If an option is specified multiple times the last one overrides previous ones. +This also holds true for ``-verify`` which sets geometry dimensions, +iterations, etc., which can afterwards be overridden, e.g.: :: + + $ bin/lbmbenchk-linux-intel-release -verify -dims 32x32x32 + +Kernel-specific parameters can be obtained via selecting the specific kernel +and passing ``-h`` as parameter: :: + + $ bin/lbmbenchk-linux-intel-release -kernel -- -h + ... + Kernel parameters: + [-blk ] [-blk-[xyz] ] + + +A list of all available kernels can be obtained via ``-list``: :: + + $ ../bin/lbmbenchk-linux-gcc-debug -list + Lattice Boltzmann Benchmark Kernels (LbmBenchKernels) Copyright (C) 2016, 2017 LSS, RRZE + This program comes with ABSOLUTELY NO WARRANTY; for details see LICENSE. + This is free software, and you are welcome to redistribute it under certain conditions. 
+ + LBM Benchmark Kernels 0.1, compiled Jul 5 2017 21:59:22, type: verification + Available kernels to benchmark: + list-aa-pv-soa + list-aa-ria-soa + list-aa-soa + list-aa-aos + list-pull-split-nt-1s-soa + list-pull-split-nt-2s-soa + list-push-soa + list-push-aos + list-pull-soa + list-pull-aos + push-soa + push-aos + pull-soa + pull-aos + blk-push-soa + blk-push-aos + blk-pull-soa + blk-pull-aos + + +Benchmarking +============ + +Correct benchmarking is a nontrivial task. Whenever benchmark results should be +created make sure the binary was compiled with: + +- ``BENCHMARK=on`` and +- ``BUILD=release`` and +- the correct ISA for macros is used, selected via ``ISA`` and +- use ``TARCH`` to specify the architecture the compiler generates code for. + +During benchmarking pinning should be used via the ``-pin`` parameter. Running +a benchmark with 10 threads and pinning them to the first 10 cores works like :: + + $ bin/lbmbenchk-linux-intel-release ... -t 10 -pin $(seq -s , 0 9) + +Things the binary does not check or control: + +- transparent huge pages: when allocating memory small 4 KiB pages might be + replaced with larger ones. This is in general a good thing, but whether this is + really the case depends on the system settings. + +- CPU/core frequency: For reproducible results the frequency of all cores + should be fixed. + +- NUMA placement policy: The benchmark assumes a first touch policy, which + means the memory will be placed at the NUMA domain the touching core is + associated with. If a different policy is in place or the NUMA domain to be + used is already full, memory might be allocated in a remote domain. Accesses + to remote domains typically have a higher latency and lower bandwidth. + +- System load: interference with other applications, especially on desktop + systems, should be avoided. + +- Padding: most kernels do not care about padding against cache or TLB + thrashing. 
Even if the number of (fluid) nodes suggest everything is fine, + through parallelization still problems might occur. + +- CPU dispatcher function: the compiler might add different versions of a + function for different ISA extensions. Make sure the code you might think is + executed is actually the code which is executed. + +.. |datetime| date:: %Y-%m-%d %H:%M + +Document was generated at |datetime|. + diff --git a/src/Base.h b/src/Base.h new file mode 100644 index 0000000..27f9116 --- /dev/null +++ b/src/Base.h @@ -0,0 +1,204 @@ +// -------------------------------------------------------------------------- +// +// Copyright +// Markus Wittmann, 2016-2017 +// RRZE, University of Erlangen-Nuremberg, Germany +// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de +// +// Viktor Haag, 2016 +// LSS, University of Erlangen-Nuremberg, Germany +// +// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). +// +// LbmBenchKernels is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// LbmBenchKernels is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with LbmBenchKernels. If not, see . 
+// +// -------------------------------------------------------------------------- +#ifndef __BASE_H__ +#define __BASE_H__ + +#include +#include +#include + +#include + + +static inline double Time() +{ + struct timeval time; + + gettimeofday(&time, NULL); + + return (time.tv_sec + 1e-6 * time.tv_usec); +} + + +#define TOOL_NAME "lbmbenchk" + +#define STRINGIFYX(x) #x +#define STRINGIFY(x) STRINGIFYX(x) + +// See top of BoostJoin.h for Boost Licence. +#include "BoostJoin.h" +#define JOIN(X, Y) BOOST_DO_JOIN(X, Y) + + +// Some macro fu to remove the first comma. +// "x" is an empty macro agrument in EXPAND2 +// before the first comma which is skipped +#ifndef EXPAND + #define EXPAND2(x, ...) __VA_ARGS__ + #define EXPAND(x, ...) EXPAND2(x, ## __VA_ARGS__) +#endif + +#ifdef DEBUG + + #define Assert(expression) \ + do { \ + if (!(expression)) { \ + Error("%s:%d assertion \"%s\" failed with code %d\n", \ + __FILE__, __LINE__, \ + #expression, expression); \ + __asm__ ("int $3\n"); \ + exit(-1); \ + } \ + } while (0) + + #define AssertMsg(expression, formatString, ...) \ + do { \ + if (!(expression)) { \ + Error("%s:%d assertion \"%s\" failed with code %d\n", \ + __FILE__, __LINE__, \ + #expression, expression); \ + Error(formatString, ##__VA_ARGS__); \ + __asm__ ("int $3\n"); \ + exit(-1); \ + } \ + } while (0) +#else + + #define Assert(expression) + #define AssertMsg(expression, formatString, ...) + +#endif + + #define Verify(expression) \ + do { \ + if (!(expression)) { \ + Error("%s:%d verify \"%s\" failed with code %d\n", \ + __FILE__, __LINE__, \ + #expression, expression); \ + __asm__ ("int $3\n"); \ + exit(-1); \ + } \ + } while (0) + + #define VerifyMsg(expression, formatString, ...) \ + do { \ + if (!(expression)) { \ + Error("%s:%d verify \"%s\" failed with code %d\n", \ + __FILE__, __LINE__, \ + #expression, expression); \ + Error(formatString, ##__VA_ARGS__); \ + __asm__ ("int $3\n"); \ + exit(-1); \ + } \ + } while (0) + + #define Print(formatString, ...) 
\ + fprintf(stdout, SHC_MAGENTA "[" TOOL_NAME "] " SHC_NC formatString, ##__VA_ARGS__) + + #define Warning(formatString, ...) \ + fprintf(stdout, "[" TOOL_NAME "] WARNING: " formatString, ##__VA_ARGS__) + + #define Error(formatString, ...) \ + fprintf(stderr, SHC_RED "[" TOOL_NAME "] ERROR: " formatString SHC_NC , ##__VA_ARGS__) + + #define DebugPrint(formatString, ...) \ + fprintf(stderr, "[" TOOL_NAME "] DEBUG: " formatString, ##__VA_ARGS__) + + #ifndef NO_SHELL_COLORS + + // ESC control character (0x1b) that starts each color sequence below; some compilers also accept "\e" + #define ESC "\x1b" + + // No Color: sequence that switches the terminal back to its default attributes + #define SHC_NC ESC "[0m" + + #define SHC_BLACK ESC "[0;30m" + #define SHC_MAGENTA ESC "[0;35m" + #define SHC_RED ESC "[0;31m" + #define SHC_DARK_RED ESC "[1;31m" + #define SHC_CYAN ESC "[0;36m" + #define SHC_BROWN ESC "[0;33m" + #define SHC_DARK_GREEN ESC "[1;32m" + + #else // NO_SHELL_COLORS + + // Colors disabled: every SHC_* macro expands to an empty string, so the print macros emit plain text + #define SHC_NC "" + + #define SHC_BLACK "" + #define SHC_MAGENTA "" + #define SHC_RED "" + #define SHC_DARK_RED "" + #define SHC_CYAN "" + #define SHC_BROWN "" + #define SHC_DARK_GREEN "" + + #endif // NO_SHELL_COLORS + + // Number of elements of an array object; gives a wrong result when x is a pointer. + #define N_ELEMS(x) (sizeof(x) / sizeof((x)[0])) + + // Smaller of two values; as a macro it evaluates one of its arguments twice. + #define MIN(a, b) ((a) <= (b) ? (a) : (b)) + // int-only counterpart of MIN that evaluates each argument exactly once. +static inline int MinI(int a, int b) { return a <= b ? a : b; } + +// Raises a breakpoint if a debugger is attached, else SIGTRAP is raised. 
+#define DEBUG_BREAK_POINT() __asm__ ("int $3\n") + +#define UNUSED(variable) (void)(variable) + +// Formats a byte count for display: plain bytes below 1024, otherwise KiB, MiB or GiB in %9.2e notation. Returns a pointer to a static buffer that is overwritten by the next call. +static inline char * ByteToHuman(size_t bytes) +{ + static char buffer[256] = { 0 }; + + if (bytes < 1024) { + snprintf(buffer, sizeof(buffer), "%zu b", bytes); + return buffer; + } + + double KiB = bytes / 1024.0; + + if (KiB < 1024.0) { + snprintf(buffer, sizeof(buffer), "%9.2e KiB", KiB); + return buffer; + } + + double MiB = KiB / 1024.0; + if (MiB < 1024.0) { + snprintf(buffer, sizeof(buffer), "%9.2e MiB", MiB); + return buffer; + } + + double GiB = MiB / 1024.0; + snprintf(buffer, sizeof(buffer), "%9.2e GiB", GiB); + return buffer; +} + + +#endif // __BASE_H__ diff --git a/src/BenchKernelD3Q19.c b/src/BenchKernelD3Q19.c new file mode 100644 index 0000000..8203d2a --- /dev/null +++ b/src/BenchKernelD3Q19.c @@ -0,0 +1,736 @@ +// -------------------------------------------------------------------------- +// +// Copyright +// Markus Wittmann, 2016-2017 +// RRZE, University of Erlangen-Nuremberg, Germany +// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de +// +// Viktor Haag, 2016 +// LSS, University of Erlangen-Nuremberg, Germany +// +// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). +// +// LbmBenchKernels is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// LbmBenchKernels is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. 
+// +// You should have received a copy of the GNU General Public License +// along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>. 
+// +// -------------------------------------------------------------------------- +#include "BenchKernelD3Q19Common.h" + +#include "Memory.h" +#include "Vtk.h" + +#include +#include + +#ifdef _OPENMP + #include +#endif + +void FNAME(D3Q19Kernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd) +{ + Assert(ld != NULL); + Assert(kernelData != NULL); + Assert(cd != NULL); + + Assert(cd->Omega > 0.0); + Assert(cd->Omega < 2.0); + + KernelData * kd = (KernelData *)kernelData; + + + int nX = ld->Dims[0]; + int nY = ld->Dims[1]; + int nZ = ld->Dims[2]; + + int * gDims = kd->GlobalDims; + + int oX = kd->Offsets[0]; + int oY = kd->Offsets[1]; + int oZ = kd->Offsets[2]; + + PdfT omega = cd->Omega; + PdfT omegaEven = omega; +// PdfT omegaOdd = 8.0*((2.0-omegaEven)/(8.0-omegaEven)); //"standard" trt odd relaxation parameter + PdfT magicParam = 1.0/12.0; // 1/4: best stability; 1/12: removes third-order advection error (best advection); 1/6: removes fourth-order diffusion error (best diffusion); 3/16: exact location of bounce back for poiseuille flow + PdfT omegaOdd = 1.0/( 0.5 + magicParam/(1.0/omega - 0.5) ); + + PdfT evenPart = 0.0; + PdfT oddPart = 0.0; + PdfT dir_indep_trm = 0.0; + + PdfT w_0 = 1.0 / 3.0; + PdfT w_1 = 1.0 / 18.0; + PdfT w_2 = 1.0 / 36.0; + + PdfT w_1_x3 = w_1 * 3.0; PdfT w_1_nine_half = w_1 * 9.0/2.0; PdfT w_1_indep = 0.0; + PdfT w_2_x3 = w_2 * 3.0; PdfT w_2_nine_half = w_2 * 9.0/2.0; PdfT w_2_indep = 0.0; + + PdfT ux, uy, uz, ui; + PdfT dens; + + // Declare pdf_N, pdf_E, pdf_S, pdf_W, ... 
+ #define X(name, idx, idxinv, x, y, z) PdfT JOIN(pdf_,name); + D3Q19_LIST + #undef X + + PdfT * src = kd->Pdfs[0]; + PdfT * dst = kd->Pdfs[1]; + PdfT * tmp; + + int maxIterations = cd->MaxIterations; + + #ifdef VTK_OUTPUT + if (cd->VtkOutput) { + kd->PdfsActive = src; + VtkWrite(ld, kd, cd, 0); + } + #endif + + for (int iter = 0; iter < maxIterations; ++iter) { + + #ifdef _OPENMP + #pragma omp parallel for collapse(3) default(none) \ + shared(gDims,src, dst, w_0, w_1, w_2, omegaEven, omegaOdd, \ + w_1_x3, w_2_x3, w_1_nine_half, w_2_nine_half, cd, \ + oX, oY, oZ, nX, nY, nZ) \ + private(ux, uy, uz, ui, dens, dir_indep_trm, \ + pdf_C, \ + pdf_N, pdf_E, pdf_S, pdf_W, \ + pdf_NE, pdf_SE, pdf_SW, pdf_NW, \ + pdf_T, pdf_TN, pdf_TE, pdf_TS, pdf_TW, \ + pdf_B, pdf_BN, pdf_BE, pdf_BS, pdf_BW, \ + evenPart, oddPart, w_1_indep, w_2_indep) + #endif + + for (int z = oZ; z < nZ + oZ; ++z) { + for (int y = oY; y < nY + oY; ++y) { + for (int x = oX; x < nX + oX; ++x) { + #define I(x, y, z, dir) P_INDEX_5(gDims, (x), (y), (z), (dir)) + +#ifdef PROP_MODEL_PUSH + + // Load PDFs of local cell: pdf_N = src[I(x, y, z, D3Q19_N)]; ... + #define X(name, idx, idxinv, _x, _y, _z) JOIN(pdf_,name) = src[I(x, y, z, idx)]; + //if (isnan(JOIN(pdf_,name))) { printf("iter: %d %d %d %d %d %s nan\n", iter, x-oX, y-oY, z-oZ, idx, D3Q19_NAMES[idx]); exit(1);} + D3Q19_LIST + #undef X + +#elif PROP_MODEL_PULL + + // Load PDFs of local cell: pdf_N = src[I(x, y, z, D3Q19_N)]; ... + #define X(name, idx, idxinv, _x, _y, _z) JOIN(pdf_,name) = src[I(x - _x, y - _y, z - _z, idx)]; + //if (isnan(JOIN(pdf_,name))) { printf("iter: %d %d %d %d %d %s nan\n", iter, x-oX, y-oY, z-oZ, idx, D3Q19_NAMES[idx]); exit(1);} + D3Q19_LIST + #undef X + +#else + #error No implementation for PROP_MODEL_NAME. 
+#endif + + // #define LID_DRIVEN_CAVITY + + #ifdef LID_DRIVEN_CAVITY + + if (z == nZ - 4 + oZ && x > 3 + oX && x < (nX - 4 + oX) && y > 3 + oY && y < (nY - 4 + oY)) { + ux = 0.1 * 0.577; + uy = 0.0; + uz = 0.0; + + } else { + #endif + ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE - + pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW; + uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN - + pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS; + uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS - + pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS; + #ifdef LID_DRIVEN_CAVITY + } + + #endif + + dens = pdf_C + + pdf_N + pdf_E + pdf_S + pdf_W + + pdf_NE + pdf_SE + pdf_SW + pdf_NW + + pdf_T + pdf_TN + pdf_TE + pdf_TS + pdf_TW + + pdf_B + pdf_BN + pdf_BE + pdf_BS + pdf_BW; + + dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz)*3.0/2.0; + +#ifdef PROP_MODEL_PUSH + + // direction: w_0 + dst[I(x, y, z, D3Q19_C)] = pdf_C - omegaEven*(pdf_C - w_0*dir_indep_trm); + + // direction: w_1 + w_1_indep = w_1*dir_indep_trm; + + ui = uy; + evenPart = omegaEven*( 0.5*(pdf_N + pdf_S) - ui*ui*w_1_nine_half - w_1_indep ); + oddPart = omegaOdd*(0.5*(pdf_N - pdf_S) - ui*w_1_x3 ); + dst[I(x, y + 1, z, D3Q19_N)] = pdf_N - evenPart - oddPart; + dst[I(x, y - 1, z, D3Q19_S)] = pdf_S - evenPart + oddPart; + + ui = ux; + evenPart = omegaEven*( 0.5*(pdf_E + pdf_W) - ui*ui*w_1_nine_half - w_1_indep ); + oddPart = omegaOdd*(0.5*(pdf_E - pdf_W) - ui*w_1_x3 ); + dst[I(x + 1, y, z, D3Q19_E)] = pdf_E - evenPart - oddPart; + dst[I(x - 1, y, z, D3Q19_W)] = pdf_W - evenPart + oddPart; + + ui = uz; + evenPart = omegaEven*( 0.5*(pdf_T + pdf_B) - ui*ui*w_1_nine_half - w_1_indep ); + oddPart = omegaOdd*(0.5*(pdf_T - pdf_B) - ui*w_1_x3 ); + dst[I(x, y, z + 1, D3Q19_T)] = pdf_T - evenPart - oddPart; + dst[I(x, y, z - 1, D3Q19_B)] = pdf_B - evenPart + oddPart; + + // direction: w_2 + w_2_indep = w_2*dir_indep_trm; + + ui = -ux + uy; + evenPart = omegaEven*( 0.5*(pdf_NW + pdf_SE) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = 
omegaOdd*(0.5*(pdf_NW - pdf_SE) - ui*w_2_x3 ); + dst[I(x - 1, y + 1, z, D3Q19_NW)] = pdf_NW - evenPart - oddPart; + dst[I(x + 1, y - 1, z, D3Q19_SE)] = pdf_SE - evenPart + oddPart; + + ui = ux + uy; + evenPart = omegaEven*( 0.5*(pdf_NE + pdf_SW) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_NE - pdf_SW) - ui*w_2_x3 ); + dst[I(x + 1, y + 1, z, D3Q19_NE)] = pdf_NE - evenPart - oddPart; + dst[I(x - 1, y - 1, z, D3Q19_SW)] = pdf_SW - evenPart + oddPart; + + ui = -ux + uz; + evenPart = omegaEven*( 0.5*(pdf_TW + pdf_BE) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TW - pdf_BE) - ui*w_2_x3 ); + dst[I(x - 1, y, z + 1, D3Q19_TW)] = pdf_TW - evenPart - oddPart; + dst[I(x + 1, y, z - 1, D3Q19_BE)] = pdf_BE - evenPart + oddPart; + + ui = ux + uz; + evenPart = omegaEven*( 0.5*(pdf_TE + pdf_BW) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TE - pdf_BW) - ui*w_2_x3 ); + dst[I(x + 1, y, z + 1, D3Q19_TE)] = pdf_TE - evenPart - oddPart; + dst[I(x - 1, y, z - 1, D3Q19_BW)] = pdf_BW - evenPart + oddPart; + + ui = -uy + uz; + evenPart = omegaEven*( 0.5*(pdf_TS + pdf_BN) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TS - pdf_BN) - ui*w_2_x3 ); + dst[I(x, y - 1, z + 1, D3Q19_TS)] = pdf_TS - evenPart - oddPart; + dst[I(x, y + 1, z - 1, D3Q19_BN)] = pdf_BN - evenPart + oddPart; + + ui = uy + uz; + evenPart = omegaEven*( 0.5*(pdf_TN + pdf_BS) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TN - pdf_BS) - ui*w_2_x3 ); + dst[I(x, y + 1, z + 1, D3Q19_TN)] = pdf_TN - evenPart - oddPart; + dst[I(x, y - 1, z - 1, D3Q19_BS)] = pdf_BS - evenPart + oddPart; + +#elif PROP_MODEL_PULL + + // direction: w_0 + dst[I(x, y, z, D3Q19_C)] = pdf_C - omegaEven*(pdf_C - w_0*dir_indep_trm); + + // direction: w_1 + w_1_indep = w_1*dir_indep_trm; + + ui = uy; + evenPart = omegaEven*( 0.5*(pdf_N + pdf_S) - ui*ui*w_1_nine_half - w_1_indep ); + oddPart = omegaOdd*(0.5*(pdf_N - pdf_S) - ui*w_1_x3 ); + dst[I(x, 
y, z, D3Q19_N)] = pdf_N - evenPart - oddPart; + dst[I(x, y, z, D3Q19_S)] = pdf_S - evenPart + oddPart; + + ui = ux; + evenPart = omegaEven*( 0.5*(pdf_E + pdf_W) - ui*ui*w_1_nine_half - w_1_indep ); + oddPart = omegaOdd*(0.5*(pdf_E - pdf_W) - ui*w_1_x3 ); + dst[I(x, y, z, D3Q19_E)] = pdf_E - evenPart - oddPart; + dst[I(x, y, z, D3Q19_W)] = pdf_W - evenPart + oddPart; + + ui = uz; + evenPart = omegaEven*( 0.5*(pdf_T + pdf_B) - ui*ui*w_1_nine_half - w_1_indep ); + oddPart = omegaOdd*(0.5*(pdf_T - pdf_B) - ui*w_1_x3 ); + dst[I(x, y, z, D3Q19_T)] = pdf_T - evenPart - oddPart; + dst[I(x, y, z, D3Q19_B)] = pdf_B - evenPart + oddPart; + + // direction: w_2 + w_2_indep = w_2*dir_indep_trm; + + ui = -ux + uy; + evenPart = omegaEven*( 0.5*(pdf_NW + pdf_SE) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_NW - pdf_SE) - ui*w_2_x3 ); + dst[I(x, y, z, D3Q19_NW)] = pdf_NW - evenPart - oddPart; + dst[I(x, y, z, D3Q19_SE)] = pdf_SE - evenPart + oddPart; + + ui = ux + uy; + evenPart = omegaEven*( 0.5*(pdf_NE + pdf_SW) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_NE - pdf_SW) - ui*w_2_x3 ); + dst[I(x, y, z, D3Q19_NE)] = pdf_NE - evenPart - oddPart; + dst[I(x, y, z, D3Q19_SW)] = pdf_SW - evenPart + oddPart; + + ui = -ux + uz; + evenPart = omegaEven*( 0.5*(pdf_TW + pdf_BE) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TW - pdf_BE) - ui*w_2_x3 ); + dst[I(x, y, z, D3Q19_TW)] = pdf_TW - evenPart - oddPart; + dst[I(x, y, z, D3Q19_BE)] = pdf_BE - evenPart + oddPart; + + ui = ux + uz; + evenPart = omegaEven*( 0.5*(pdf_TE + pdf_BW) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TE - pdf_BW) - ui*w_2_x3 ); + dst[I(x, y, z, D3Q19_TE)] = pdf_TE - evenPart - oddPart; + dst[I(x, y, z, D3Q19_BW)] = pdf_BW - evenPart + oddPart; + + ui = -uy + uz; + evenPart = omegaEven*( 0.5*(pdf_TS + pdf_BN) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TS - pdf_BN) - ui*w_2_x3 ); + dst[I(x, y, z, D3Q19_TS)] = 
pdf_TS - evenPart - oddPart; + dst[I(x, y, z, D3Q19_BN)] = pdf_BN - evenPart + oddPart; + + ui = uy + uz; + evenPart = omegaEven*( 0.5*(pdf_TN + pdf_BS) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TN - pdf_BS) - ui*w_2_x3 ); + dst[I(x, y, z, D3Q19_TN)] = pdf_TN - evenPart - oddPart; + dst[I(x, y, z, D3Q19_BS)] = pdf_BS - evenPart + oddPart; + +#else + #error No implementation for PROP_MODEL_NAME. +#endif + + #undef I + } + } + } // z, y, x (from inner to outer) + + // Fixup bounce back PDFs. + #ifdef _OPENMP + #pragma omp parallel for default(none) \ + shared(kd, dst) + #endif + for (int i = 0; i < kd->nBounceBackPdfs; ++i) { + dst[kd->BounceBackPdfsDst[i]] = dst[kd->BounceBackPdfsSrc[i]]; + } + + #ifdef VERIFICATION + kd->PdfsActive = dst; + KernelAddBodyForce(kd, ld, cd); + #endif + + #ifdef VTK_OUTPUT + + if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) { + kd->PdfsActive = dst; + VtkWrite(ld, kd, cd, iter); + } + + #endif + + #ifdef STATISTICS + kd->PdfsActive = dst; + KernelStatistics(kd, ld, cd, iter); + #endif + + // swap grids + tmp = src; + src = dst; + dst = tmp; + + } // for (int iter = 0; ... 
+ + #ifdef VTK_OUTPUT + + if (cd->VtkOutput) { + kd->PdfsActive = src; + VtkWrite(ld, kd, cd, maxIterations); + } + + #endif + + return; +} + + +void FNAME(D3Q19BlkKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd) +{ + Assert(ld != NULL); + Assert(kernelData != NULL); + Assert(cd != NULL); + + Assert(cd->Omega > 0.0); + Assert(cd->Omega < 2.0); + + KernelData * kd = (KernelData *)kernelData; + + + int nX = ld->Dims[0]; + int nY = ld->Dims[1]; + int nZ = ld->Dims[2]; + + int * gDims = kd->GlobalDims; + + int oX = kd->Offsets[0]; + int oY = kd->Offsets[1]; + int oZ = kd->Offsets[2]; + + KernelDataEx * kdex = (KernelDataEx *)kd; + + int blk[3]; + blk[0] = kdex->Blk[0]; + blk[1] = kdex->Blk[1]; + blk[2] = kdex->Blk[2]; + + PdfT omega = cd->Omega; + PdfT omegaEven = omega; +// PdfT omegaOdd = 8.0*((2.0-omegaEven)/(8.0-omegaEven)); //"standard" trt odd relaxation parameter + PdfT magicParam = 1.0/12.0; // 1/4: best stability; 1/12: removes third-order advection error (best advection); 1/6: removes fourth-order diffusion error (best diffusion); 3/16: exact location of bounce back for poiseuille flow + PdfT omegaOdd = 1.0/( 0.5 + magicParam/(1.0/omega - 0.5) ); + + PdfT evenPart = 0.0; + PdfT oddPart = 0.0; + PdfT dir_indep_trm = 0.0; + + PdfT w_0 = 1.0 / 3.0; + PdfT w_1 = 1.0 / 18.0; + PdfT w_2 = 1.0 / 36.0; + + PdfT w_1_x3 = w_1 * 3.0; PdfT w_1_nine_half = w_1 * 9.0/2.0; PdfT w_1_indep = 0.0; + PdfT w_2_x3 = w_2 * 3.0; PdfT w_2_nine_half = w_2 * 9.0/2.0; PdfT w_2_indep = 0.0; + + PdfT ux, uy, uz, ui; + PdfT dens; + + // Declare pdf_N, pdf_E, pdf_S, pdf_W, ... 
+ #define X(name, idx, idxinv, x, y, z) PdfT JOIN(pdf_,name); + D3Q19_LIST + #undef X + + PdfT * src = kd->Pdfs[0]; + PdfT * dst = kd->Pdfs[1]; + PdfT * tmp; + + int maxIterations = cd->MaxIterations; + + #ifdef VTK_OUTPUT + if (cd->VtkOutput) { + kd->PdfsActive = src; + VtkWrite(ld, kd, cd, 0); + } + #endif + + int nThreads = 1; + + #ifdef _OPENMP + nThreads = omp_get_max_threads(); + #endif + + for (int iter = 0; iter < maxIterations; ++iter) { + + // #ifdef _OPENMP --> add line continuation + // #pragma omp parallel for collapse(3) default(none) + // shared(gDims,src, dst, w_0, w_1, w_2, omegaEven, omegaOdd, + // w_1_x3, w_2_x3, w_1_nine_half, w_2_nine_half, cd, + // oX, oY, oZ, nX, nY, nZ, blk) + // private(ux, uy, uz, ui, dens, dir_indep_trm, + // pdf_C, + // pdf_N, pdf_E, pdf_S, pdf_W, + // pdf_NE, pdf_SE, pdf_SW, pdf_NW, + // pdf_T, pdf_TN, pdf_TE, pdf_TS, pdf_TW, + // pdf_B, pdf_BN, pdf_BE, pdf_BS, pdf_BW, + // evenPart, oddPart, w_1_indep, w_2_indep) + // #endif + #ifdef _OPENMP + #pragma omp parallel for default(none) \ + shared(gDims,src, dst, w_0, w_1, w_2, omegaEven, omegaOdd, \ + w_1_x3, w_2_x3, w_1_nine_half, w_2_nine_half, cd, \ + oX, oY, oZ, nX, nY, nZ, blk, nThreads) \ + private(ux, uy, uz, ui, dens, dir_indep_trm, \ + pdf_C, \ + pdf_N, pdf_E, pdf_S, pdf_W, \ + pdf_NE, pdf_SE, pdf_SW, pdf_NW, \ + pdf_T, pdf_TN, pdf_TE, pdf_TS, pdf_TW, \ + pdf_B, pdf_BN, pdf_BE, pdf_BS, pdf_BW, \ + evenPart, oddPart, w_1_indep, w_2_indep) + #endif + + for (int i = 0; i < nThreads; ++i) { + + int threadStartX = nX / nThreads * i; + int threadEndX = nX / nThreads * (i + 1); + + if (nX % nThreads > 0) { + if (nX % nThreads > i) { + threadStartX += i; + threadEndX += i + 1; + } + else { + threadStartX += nX % nThreads; + threadEndX += nX % nThreads; + } + } + + // for (int z = oZ; z < nZ + oZ; ++z) { + // for (int y = oY; y < nY + oY; ++y) { + // for (int x = oX; x < nX + oX; ++x) { + for (int bZ = oZ; bZ < nZ + oZ; bZ += blk[2]) { + for (int bY = oY; bY < nY + oY; bY 
+= blk[1]) { + for (int bX = oX + threadStartX; bX < threadEndX + oX; bX += blk[0]) { + // for (int bX = oX; bX < nX + oX; bX += blk[0]) { + + // Must do everything here, else it would break collapse. + int eZ = MIN(bZ + blk[2], nZ + oZ); + int eY = MIN(bY + blk[1], nY + oY); + int eX = MIN(bX + blk[0], threadEndX + oX); + +// printf("%d: %d-%d %d-%d %d-%d %d - %d\n", omp_get_thread_num(), bZ, eZ, bY, eY, bX, eX, threadStartX, threadEndX); + + for (int z = bZ; z < eZ; ++z) { + for (int y = bY; y < eY; ++y) { + for (int x = bX; x < eX; ++x) { + + #define I(x, y, z, dir) P_INDEX_5(gDims, (x), (y), (z), (dir)) + +#ifdef PROP_MODEL_PUSH + + // Load PDFs of local cell: pdf_N = src[I(x, y, z, D3Q19_N)]; ... + #define X(name, idx, idxinv, _x, _y, _z) JOIN(pdf_,name) = src[I(x, y, z, idx)]; + //if (isnan(JOIN(pdf_,name))) { printf("iter: %d %d %d %d %d %s nan\n", iter, x-oX, y-oY, z-oZ, idx, D3Q19_NAMES[idx]); exit(1);} + D3Q19_LIST + #undef X + +#elif PROP_MODEL_PULL + + // Load PDFs of local cell: pdf_N = src[I(x, y, z, D3Q19_N)]; ... + #define X(name, idx, idxinv, _x, _y, _z) JOIN(pdf_,name) = src[I(x - _x, y - _y, z - _z, idx)]; + //if (isnan(JOIN(pdf_,name))) { printf("iter: %d %d %d %d %d %s nan\n", iter, x-oX, y-oY, z-oZ, idx, D3Q19_NAMES[idx]); exit(1);} + D3Q19_LIST + #undef X + +#else + #error No implementation for PROP_MODEL_NAME. 
+#endif + + // #define LID_DRIVEN_CAVITY + + #ifdef LID_DRIVEN_CAVITY + + if (z == nZ - 4 + oZ && x > 3 + oX && x < (nX - 4 + oX) && y > 3 + oY && y < (nY - 4 + oY)) { + ux = 0.1 * 0.577; + uy = 0.0; + uz = 0.0; + + } else { + #endif + ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE - + pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW; + uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN - + pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS; + uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS - + pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS; + #ifdef LID_DRIVEN_CAVITY + } + + #endif + + dens = pdf_C + + pdf_N + pdf_E + pdf_S + pdf_W + + pdf_NE + pdf_SE + pdf_SW + pdf_NW + + pdf_T + pdf_TN + pdf_TE + pdf_TS + pdf_TW + + pdf_B + pdf_BN + pdf_BE + pdf_BS + pdf_BW; + + dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz)*3.0/2.0; + +#ifdef PROP_MODEL_PUSH + + // direction: w_0 + dst[I(x, y, z, D3Q19_C)] = pdf_C - omegaEven*(pdf_C - w_0*dir_indep_trm); + + // direction: w_1 + w_1_indep = w_1*dir_indep_trm; + + ui = uy; + evenPart = omegaEven*( 0.5*(pdf_N + pdf_S) - ui*ui*w_1_nine_half - w_1_indep ); + oddPart = omegaOdd*(0.5*(pdf_N - pdf_S) - ui*w_1_x3 ); + dst[I(x, y + 1, z, D3Q19_N)] = pdf_N - evenPart - oddPart; + dst[I(x, y - 1, z, D3Q19_S)] = pdf_S - evenPart + oddPart; + + ui = ux; + evenPart = omegaEven*( 0.5*(pdf_E + pdf_W) - ui*ui*w_1_nine_half - w_1_indep ); + oddPart = omegaOdd*(0.5*(pdf_E - pdf_W) - ui*w_1_x3 ); + dst[I(x + 1, y, z, D3Q19_E)] = pdf_E - evenPart - oddPart; + dst[I(x - 1, y, z, D3Q19_W)] = pdf_W - evenPart + oddPart; + + ui = uz; + evenPart = omegaEven*( 0.5*(pdf_T + pdf_B) - ui*ui*w_1_nine_half - w_1_indep ); + oddPart = omegaOdd*(0.5*(pdf_T - pdf_B) - ui*w_1_x3 ); + dst[I(x, y, z + 1, D3Q19_T)] = pdf_T - evenPart - oddPart; + dst[I(x, y, z - 1, D3Q19_B)] = pdf_B - evenPart + oddPart; + + // direction: w_2 + w_2_indep = w_2*dir_indep_trm; + + ui = -ux + uy; + evenPart = omegaEven*( 0.5*(pdf_NW + pdf_SE) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = 
omegaOdd*(0.5*(pdf_NW - pdf_SE) - ui*w_2_x3 ); + dst[I(x - 1, y + 1, z, D3Q19_NW)] = pdf_NW - evenPart - oddPart; + dst[I(x + 1, y - 1, z, D3Q19_SE)] = pdf_SE - evenPart + oddPart; + + ui = ux + uy; + evenPart = omegaEven*( 0.5*(pdf_NE + pdf_SW) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_NE - pdf_SW) - ui*w_2_x3 ); + dst[I(x + 1, y + 1, z, D3Q19_NE)] = pdf_NE - evenPart - oddPart; + dst[I(x - 1, y - 1, z, D3Q19_SW)] = pdf_SW - evenPart + oddPart; + + ui = -ux + uz; + evenPart = omegaEven*( 0.5*(pdf_TW + pdf_BE) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TW - pdf_BE) - ui*w_2_x3 ); + dst[I(x - 1, y, z + 1, D3Q19_TW)] = pdf_TW - evenPart - oddPart; + dst[I(x + 1, y, z - 1, D3Q19_BE)] = pdf_BE - evenPart + oddPart; + + ui = ux + uz; + evenPart = omegaEven*( 0.5*(pdf_TE + pdf_BW) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TE - pdf_BW) - ui*w_2_x3 ); + dst[I(x + 1, y, z + 1, D3Q19_TE)] = pdf_TE - evenPart - oddPart; + dst[I(x - 1, y, z - 1, D3Q19_BW)] = pdf_BW - evenPart + oddPart; + + ui = -uy + uz; + evenPart = omegaEven*( 0.5*(pdf_TS + pdf_BN) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TS - pdf_BN) - ui*w_2_x3 ); + dst[I(x, y - 1, z + 1, D3Q19_TS)] = pdf_TS - evenPart - oddPart; + dst[I(x, y + 1, z - 1, D3Q19_BN)] = pdf_BN - evenPart + oddPart; + + ui = uy + uz; + evenPart = omegaEven*( 0.5*(pdf_TN + pdf_BS) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TN - pdf_BS) - ui*w_2_x3 ); + dst[I(x, y + 1, z + 1, D3Q19_TN)] = pdf_TN - evenPart - oddPart; + dst[I(x, y - 1, z - 1, D3Q19_BS)] = pdf_BS - evenPart + oddPart; + +#elif PROP_MODEL_PULL + + // direction: w_0 + dst[I(x, y, z, D3Q19_C)] = pdf_C - omegaEven*(pdf_C - w_0*dir_indep_trm); + + // direction: w_1 + w_1_indep = w_1*dir_indep_trm; + + ui = uy; + evenPart = omegaEven*( 0.5*(pdf_N + pdf_S) - ui*ui*w_1_nine_half - w_1_indep ); + oddPart = omegaOdd*(0.5*(pdf_N - pdf_S) - ui*w_1_x3 ); + dst[I(x, 
y, z, D3Q19_N)] = pdf_N - evenPart - oddPart; + dst[I(x, y, z, D3Q19_S)] = pdf_S - evenPart + oddPart; + + ui = ux; + evenPart = omegaEven*( 0.5*(pdf_E + pdf_W) - ui*ui*w_1_nine_half - w_1_indep ); + oddPart = omegaOdd*(0.5*(pdf_E - pdf_W) - ui*w_1_x3 ); + dst[I(x, y, z, D3Q19_E)] = pdf_E - evenPart - oddPart; + dst[I(x, y, z, D3Q19_W)] = pdf_W - evenPart + oddPart; + + ui = uz; + evenPart = omegaEven*( 0.5*(pdf_T + pdf_B) - ui*ui*w_1_nine_half - w_1_indep ); + oddPart = omegaOdd*(0.5*(pdf_T - pdf_B) - ui*w_1_x3 ); + dst[I(x, y, z, D3Q19_T)] = pdf_T - evenPart - oddPart; + dst[I(x, y, z, D3Q19_B)] = pdf_B - evenPart + oddPart; + + // direction: w_2 + w_2_indep = w_2*dir_indep_trm; + + ui = -ux + uy; + evenPart = omegaEven*( 0.5*(pdf_NW + pdf_SE) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_NW - pdf_SE) - ui*w_2_x3 ); + dst[I(x, y, z, D3Q19_NW)] = pdf_NW - evenPart - oddPart; + dst[I(x, y, z, D3Q19_SE)] = pdf_SE - evenPart + oddPart; + + ui = ux + uy; + evenPart = omegaEven*( 0.5*(pdf_NE + pdf_SW) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_NE - pdf_SW) - ui*w_2_x3 ); + dst[I(x, y, z, D3Q19_NE)] = pdf_NE - evenPart - oddPart; + dst[I(x, y, z, D3Q19_SW)] = pdf_SW - evenPart + oddPart; + + ui = -ux + uz; + evenPart = omegaEven*( 0.5*(pdf_TW + pdf_BE) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TW - pdf_BE) - ui*w_2_x3 ); + dst[I(x, y, z, D3Q19_TW)] = pdf_TW - evenPart - oddPart; + dst[I(x, y, z, D3Q19_BE)] = pdf_BE - evenPart + oddPart; + + ui = ux + uz; + evenPart = omegaEven*( 0.5*(pdf_TE + pdf_BW) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TE - pdf_BW) - ui*w_2_x3 ); + dst[I(x, y, z, D3Q19_TE)] = pdf_TE - evenPart - oddPart; + dst[I(x, y, z, D3Q19_BW)] = pdf_BW - evenPart + oddPart; + + ui = -uy + uz; + evenPart = omegaEven*( 0.5*(pdf_TS + pdf_BN) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TS - pdf_BN) - ui*w_2_x3 ); + dst[I(x, y, z, D3Q19_TS)] = 
pdf_TS - evenPart - oddPart; + dst[I(x, y, z, D3Q19_BN)] = pdf_BN - evenPart + oddPart; + + ui = uy + uz; + evenPart = omegaEven*( 0.5*(pdf_TN + pdf_BS) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TN - pdf_BS) - ui*w_2_x3 ); + dst[I(x, y, z, D3Q19_TN)] = pdf_TN - evenPart - oddPart; + dst[I(x, y, z, D3Q19_BS)] = pdf_BS - evenPart + oddPart; + +#else + #error No implementation for PROP_MODEL_NAME. +#endif + + #undef I + } + } + } // z, y, x (from inner to outer) + } + } + } // z, y, x (from inner to outer) + + } // loop over threads + + // Fixup bounce back PDFs. + #ifdef _OPENMP + #pragma omp parallel for default(none) \ + shared(kd, dst) + #endif + for (int i = 0; i < kd->nBounceBackPdfs; ++i) { + dst[kd->BounceBackPdfsDst[i]] = dst[kd->BounceBackPdfsSrc[i]]; + } + + #ifdef VERIFICATION + kd->PdfsActive = dst; + KernelAddBodyForce(kd, ld, cd); + #endif + + #ifdef VTK_OUTPUT + + if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) { + kd->PdfsActive = dst; + VtkWrite(ld, kd, cd, iter); + } + + #endif + + #ifdef STATISTICS + kd->PdfsActive = dst; + KernelStatistics(kd, ld, cd, iter); + #endif + + // swap grids + tmp = src; + src = dst; + dst = tmp; + + } // for (int iter = 0; ... + + #ifdef VTK_OUTPUT + + if (cd->VtkOutput) { + kd->PdfsActive = src; + VtkWrite(ld, kd, cd, maxIterations); + } + + #endif + + return; +} + diff --git a/src/BenchKernelD3Q19.h b/src/BenchKernelD3Q19.h new file mode 100644 index 0000000..8d7c127 --- /dev/null +++ b/src/BenchKernelD3Q19.h @@ -0,0 +1,57 @@ +// -------------------------------------------------------------------------- +// +// Copyright +// Markus Wittmann, 2016-2017 +// RRZE, University of Erlangen-Nuremberg, Germany +// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de +// +// Viktor Haag, 2016 +// LSS, University of Erlangen-Nuremberg, Germany +// +// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). 
+// +// LbmBenchKernels is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// LbmBenchKernels is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with LbmBenchKernels. If not, see . +// +// -------------------------------------------------------------------------- +#ifndef __BENCH_KERNEL_D3Q19__ +#define __BENCH_KERNEL_D3Q19__ + +#include "Kernel.h" + + +void D3Q19Init_PushSoA(LatticeDesc * ld, KernelData ** kernelData, Parameters * params); +void D3Q19Init_PushAoS(LatticeDesc * ld, KernelData ** kernelData, Parameters * params); + +void D3Q19Deinit_PushSoA(LatticeDesc * ld, KernelData ** kernelData); +void D3Q19Deinit_PushAoS(LatticeDesc * ld, KernelData ** kernelData); + +void D3Q19Init_PullSoA(LatticeDesc * ld, KernelData ** kernelData, Parameters * params); +void D3Q19Init_PullAoS(LatticeDesc * ld, KernelData ** kernelData, Parameters * params); + +void D3Q19Deinit_PullSoA(LatticeDesc * ld, KernelData ** kernelData); +void D3Q19Deinit_PullAoS(LatticeDesc * ld, KernelData ** kernelData); + +void D3Q19BlkInit_PushSoA(LatticeDesc * ld, KernelData ** kernelData, Parameters * params); +void D3Q19BlkInit_PushAoS(LatticeDesc * ld, KernelData ** kernelData, Parameters * params); + +void D3Q19BlkDeinit_PushSoA(LatticeDesc * ld, KernelData ** kernelData); +void D3Q19BlkDeinit_PushAoS(LatticeDesc * ld, KernelData ** kernelData); + +void D3Q19BlkInit_PullSoA(LatticeDesc * ld, KernelData ** kernelData, Parameters * params); +void D3Q19BlkInit_PullAoS(LatticeDesc * ld, KernelData ** kernelData, Parameters * params); + +void 
D3Q19BlkDeinit_PullSoA(LatticeDesc * ld, KernelData ** kernelData); +void D3Q19BlkDeinit_PullAoS(LatticeDesc * ld, KernelData ** kernelData); + +#endif // __BENCH_KERNEL_D3Q19__ diff --git a/src/BenchKernelD3Q19Common.c b/src/BenchKernelD3Q19Common.c new file mode 100644 index 0000000..e697bc8 --- /dev/null +++ b/src/BenchKernelD3Q19Common.c @@ -0,0 +1,655 @@ +// -------------------------------------------------------------------------- +// +// Copyright +// Markus Wittmann, 2016-2017 +// RRZE, University of Erlangen-Nuremberg, Germany +// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de +// +// Viktor Haag, 2016 +// LSS, University of Erlangen-Nuremberg, Germany +// +// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). +// +// LbmBenchKernels is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// LbmBenchKernels is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with LbmBenchKernels. If not, see . +// +// -------------------------------------------------------------------------- +#include "BenchKernelD3Q19Common.h" + +#include "Memory.h" +#include "Vtk.h" + +#include +#include + + +// Forward definition. 
+void FNAME(D3Q19Kernel)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd); + +void FNAME(D3Q19BlkKernel)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd); + + + +static void FNAME(BcGetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT * pdf) +{ + Assert(kd != NULL); + Assert(kd->PdfsActive != NULL); + Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]); + Assert(pdf != NULL); + + Assert(x >= 0); + Assert(y >= 0); + Assert(z >= 0); + Assert(x < kd->Dims[0]); + Assert(y < kd->Dims[1]); + Assert(z < kd->Dims[2]); + Assert(dir >= 0); + Assert(dir < N_D3Q19); + + int oX = kd->Offsets[0]; + int oY = kd->Offsets[1]; + int oZ = kd->Offsets[2]; + +#ifdef PROP_MODEL_PUSH + int nx = x; + int ny = y; + int nz = z; +#elif PROP_MODEL_PULL + int nx = x - D3Q19_X[dir]; + int ny = y - D3Q19_Y[dir]; + int nz = z - D3Q19_Z[dir]; +#endif + + #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir)) + *pdf = kd->PdfsActive[I(nx + oX, ny + oY, nz + oZ, dir)]; + #undef I + + return; +} + +static void FNAME(BcSetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT pdf) +{ + Assert(kd != NULL); + Assert(kd->PdfsActive != NULL); + Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]); + Assert(x >= 0); + Assert(y >= 0); + Assert(z >= 0); + Assert(x < kd->Dims[0]); + Assert(y < kd->Dims[1]); + Assert(z < kd->Dims[2]); + Assert(dir >= 0); + Assert(dir < N_D3Q19); + + int oX = kd->Offsets[0]; + int oY = kd->Offsets[1]; + int oZ = kd->Offsets[2]; + +#ifdef PROP_MODEL_PUSH + int nx = x; + int ny = y; + int nz = z; +#elif PROP_MODEL_PULL + int nx = x - D3Q19_X[dir]; + int ny = y - D3Q19_Y[dir]; + int nz = z - D3Q19_Z[dir]; +#endif + + #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir)) + kd->PdfsActive[I(nx + oX, ny + oY, nz + oZ, dir)] = pdf; + #undef I + + + return; +} + + +static void FNAME(GetNode)(KernelData * kd, int x, int y, int z, PdfT * pdfs) +{ + Assert(kd != NULL); + 
Assert(kd->PdfsActive != NULL); + Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]); + Assert(pdfs != NULL); + Assert(x >= 0); + Assert(y >= 0); + Assert(z >= 0); + Assert(x < kd->Dims[0]); + Assert(y < kd->Dims[1]); + Assert(z < kd->Dims[2]); + + int oX = kd->Offsets[0]; + int oY = kd->Offsets[1]; + int oZ = kd->Offsets[2]; + + + #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir)) +#ifdef PROP_MODEL_PUSH + #define X(name, idx, idxinv, _x, _y, _z) pdfs[idx] = kd->PdfsActive[I(x + oX, y + oY, z + oZ, idx)]; +#elif PROP_MODEL_PULL + #define X(name, idx, idxinv, _x, _y, _z) pdfs[idx] = kd->PdfsActive[I(x + oX - (_x), y + oY - (_y), z + oZ - (_z), idx)]; +#endif + D3Q19_LIST + #undef X + #undef I + +#if 0 // DETECT NANs + + for (int d = 0; d < 19; ++d) { + if (isnan(pdfs[d])) { + printf("%d %d %d %d nan! get node\n", x, y, z, d); + + for (int d2 = 0; d2 < 19; ++d2) { + printf("%d: %e\n", d2, pdfs[d2]); + } + + exit(1); + } + } + +#endif + + return; +} + + +static void FNAME(SetNode)(KernelData * kd, int x, int y, int z, PdfT * pdfs) +{ + Assert(kd != NULL); + Assert(kd->PdfsActive != NULL); + Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]); + Assert(pdfs != NULL); + + Assert(x >= 0); + Assert(y >= 0); + Assert(z >= 0); + Assert(x < kd->Dims[0]); + Assert(y < kd->Dims[1]); + Assert(z < kd->Dims[2]); + + int oX = kd->Offsets[0]; + int oY = kd->Offsets[1]; + int oZ = kd->Offsets[2]; + + #define I(x, y, z, dir) P_INDEX_5(kd->GlobalDims, (x), (y), (z), (dir)) +#ifdef PROP_MODEL_PUSH + #define X(name, idx, idxinv, _x, _y, _z) kd->PdfsActive[I(x + oX, y + oY, z + oZ, idx)] = pdfs[idx]; +#elif PROP_MODEL_PULL + #define X(name, idx, idxinv, _x, _y, _z) kd->PdfsActive[I(x + oX - (_x), y + oY - (_y), z + oZ - (_z), idx)] = pdfs[idx]; +#endif + D3Q19_LIST + #undef X + #undef I + + return; +} + + +static void ParameterUsage() +{ + printf("Kernel parameters:\n"); + printf(" [-blk ] [-blk-[xyz] ]\n"); + + return; +} + 
+static void ParseParameters(Parameters * params, int * blk) +{ + Assert(blk != NULL); + + blk[0] = 0; blk[1] = 0; blk[2] = 0; + + #define ARG_IS(param) (!strcmp(params->KernelArgs[i], param)) + #define NEXT_ARG_PRESENT() \ + do { \ + if (i + 1 >= params->nKernelArgs) { \ + printf("ERROR: argument %s requires a parameter.\n", params->KernelArgs[i]); \ + exit(1); \ + } \ + } while (0) + + + for (int i = 0; i < params->nKernelArgs; ++i) { + if (ARG_IS("-blk") || ARG_IS("--blk")) { + NEXT_ARG_PRESENT(); + + int tmp = strtol(params->KernelArgs[++i], NULL, 0); + + if (tmp <= 0) { + printf("ERROR: blocking parameter must be > 0.\n"); + exit(1); + } + + blk[0] = blk[1] = blk[2] = tmp; + } + else if (ARG_IS("-blk-x") || ARG_IS("--blk-x")) { + NEXT_ARG_PRESENT(); + + int tmp = strtol(params->KernelArgs[++i], NULL, 0); + + if (tmp <= 0) { + printf("ERROR: blocking parameter must be > 0.\n"); + exit(1); + } + + blk[0] = tmp; + } + else if (ARG_IS("-blk-y") || ARG_IS("--blk-y")) { + NEXT_ARG_PRESENT(); + + int tmp = strtol(params->KernelArgs[++i], NULL, 0); + + if (tmp <= 0) { + printf("ERROR: blocking parameter must be > 0.\n"); + exit(1); + } + + blk[1] = tmp; + } + else if (ARG_IS("-blk-z") || ARG_IS("--blk-z")) { + NEXT_ARG_PRESENT(); + + int tmp = strtol(params->KernelArgs[++i], NULL, 0); + + if (tmp <= 0) { + printf("ERROR: blocking parameter must be > 0.\n"); + exit(1); + } + + blk[2] = tmp; + } + else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) { + ParameterUsage(); + exit(1); + } + else { + printf("ERROR: unknown kernel parameter.\n"); + ParameterUsage(); + exit(1); + } + } + + #undef ARG_IS + #undef NEXT_ARG_PRESENT + + return; +} + + +void FNAME(D3Q19BlkInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params) +{ + KernelDataEx * kdex = NULL; + MemAlloc((void **)&kdex, sizeof(KernelDataEx)); + + kdex->Blk[0] = 0; kdex->Blk[1] = 0; kdex->Blk[2] = 0; + + KernelData * kd = &kdex->kd; + *kernelData = kd; + + kd->nObstIndices = ld->nObst; + + // 
Ajust the dimensions according to padding, if used. + kd->Dims[0] = ld->Dims[0]; + kd->Dims[1] = ld->Dims[1]; + kd->Dims[2] = ld->Dims[2]; + + + int * lDims = ld->Dims; + int * gDims = kd->GlobalDims; + + gDims[0] = lDims[0] + 2; + gDims[1] = lDims[1] + 2; + gDims[2] = lDims[2] + 2; + + kd->Offsets[0] = 1; + kd->Offsets[1] = 1; + kd->Offsets[2] = 1; + + int lX = lDims[0]; + int lY = lDims[1]; + int lZ = lDims[2]; + + int gX = gDims[0]; + int gY = gDims[1]; + int gZ = gDims[2]; + + int oX = kd->Offsets[0]; + int oY = kd->Offsets[1]; + int oZ = kd->Offsets[2]; + + int blk[3] = { 0 }; + + int nCells = gX * gY * gZ; + + PdfT * pdfs[2]; + + ParseParameters(params, blk); + + if (blk[0] == 0) blk[0] = gX; + if (blk[1] == 0) blk[1] = gY; + if (blk[2] == 0) blk[2] = gZ; + + printf("# blocking x: %3d y: %3d z: %3d\n", blk[0], blk[1], blk[2]); + + + kdex->Blk[0] = blk[0]; kdex->Blk[1] = blk[1]; kdex->Blk[2] = blk[2]; + + + printf("# allocating data for %d LB nodes with padding (%lu bytes = %f MiB for both lattices)\n", + nCells, 2 * sizeof(PdfT) * nCells * N_D3Q19, + 2 * sizeof(PdfT) * nCells * N_D3Q19 / 1024.0 / 1024.0); + + MemAlloc((void **)&pdfs[0], sizeof(PdfT) * nCells * N_D3Q19); + MemAlloc((void **)&pdfs[1], sizeof(PdfT) * nCells * N_D3Q19); + + kd->Pdfs[0] = pdfs[0]; + kd->Pdfs[1] = pdfs[1]; + + // Initialize PDFs with some (arbitrary) data for correct NUMA placement. + // This depends on the chosen data layout. + // The structure of the loop should resemble the same "execution layout" + // as in the kernel! +#ifdef _OPENMP + #pragma omp parallel for collapse(3) +#endif + + for (int bZ = 0; bZ < gZ; bZ += blk[2]) { + for (int bY = 0; bY < gY; bY += blk[1]) { + for (int bX = 0; bX < gX; bX += blk[0]) { + + // Must do everything here, else it would break collapse. 
+ int eZ = MIN(bZ + blk[2], gZ); + int eY = MIN(bY + blk[1], gY); + int eX = MIN(bX + blk[0], gX); + + for (int z = bZ; z < eZ; ++z) { + for (int y = bY; y < eY; ++y) { + for (int x = bX; x < eX; ++x) { + + for (int d = 0; d < N_D3Q19; ++d) { + pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = 1.0; + pdfs[1][P_INDEX_5(gDims, x, y, z, d)] = 1.0; + } + + } + } + } + } + } + } + + // Initialize all PDFs to some standard value. + for (int z = 0; z < gZ; ++z) { + for (int y = 0; y < gY; ++y) { + for (int x = 0; x < gX; ++x) { + for (int d = 0; d < N_D3Q19; ++d) { + pdfs[0][P_INDEX_5(gDims, x, y, z, d)] = 0.0; + pdfs[1][P_INDEX_5(gDims, x, y, z, d)] = 0.0; + } + } + } + } + + + // Count how many *PDFs* need bounce back treatment. + + uint64_t nPdfs = ((uint64_t)19) * gX * gY * gZ; + + if (nPdfs > ((2LU << 31) - 1)) { + printf("ERROR: number of PDFs exceed 2^31.\n"); + exit(1); + } + + // Compiler bug? Incorrect computation of nBounceBackPdfs when using icc 15.0.2. + // Works when declaring nBounceBackPdfs as int64_t or using volatile. + volatile int nBounceBackPdfs = 0; + // int64_t nBounceBackPdfs = 0; + int nx, ny, nz, px, py, pz; + + // TODO: apply blocking? + + for (int z = 0; z < lZ; ++z) { + for (int y = 0; y < lY; ++y) { + for (int x = 0; x < lX; ++x) { + + if (ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) { + for (int d = 0; d < N_D3Q19; ++d) { +#ifdef PROP_MODEL_PUSH + nx = x + D3Q19_X[d]; + ny = y + D3Q19_Y[d]; + nz = z + D3Q19_Z[d]; +#elif PROP_MODEL_PULL + nx = x - D3Q19_X[d]; + ny = y - D3Q19_Y[d]; + nz = z - D3Q19_Z[d]; +#else + #error PROP_MODEL_NAME unknown. +#endif + // Check if neighbor is inside the lattice. 
+ // if(nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) { + // continue; + // } + if ((nx < 0 || nx >= lX) && ld->PeriodicX) { + ++nBounceBackPdfs; // Compiler bug --> see above + } + else if ((ny < 0 || ny >= lY) && ld->PeriodicY) { + ++nBounceBackPdfs; // Compiler bug --> see above + } + else if ((nz < 0 || nz >= lZ) && ld->PeriodicZ) { + ++nBounceBackPdfs; // Compiler bug --> see above + } + else if (nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) { + continue; + } + else if (ld->Lattice[L_INDEX_4(lDims, nx, ny, nz)] == LAT_CELL_OBSTACLE) { + ++nBounceBackPdfs; // Compiler bug --> see above + } + } + } + } + } + } + + + printf("# allocating %d indices for bounce back pdfs (%s for source and destination array)\n", nBounceBackPdfs, ByteToHuman(sizeof(int) * nBounceBackPdfs * 2)); + + MemAlloc((void **) & (kd->BounceBackPdfsSrc), sizeof(int) * nBounceBackPdfs + 100); + MemAlloc((void **) & (kd->BounceBackPdfsDst), sizeof(int) * nBounceBackPdfs + 100); + + kd->nBounceBackPdfs = nBounceBackPdfs; + nBounceBackPdfs = 0; + + int srcIndex; + int dstIndex; + + for (int z = 0; z < lZ; ++z) { + for (int y = 0; y < lY; ++y) { + for (int x = 0; x < lX; ++x) { + + if (ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) { + for (int d = 0; d < N_D3Q19; ++d) { +#ifdef PROP_MODEL_PUSH + nx = x + D3Q19_X[d]; + ny = y + D3Q19_Y[d]; + nz = z + D3Q19_Z[d]; +#elif PROP_MODEL_PULL + nx = x - D3Q19_X[d]; + ny = y - D3Q19_Y[d]; + nz = z - D3Q19_Z[d]; +#else + #error PROP_MODEL_NAME unknown. +#endif + + if ( ((nx < 0 || nx >= lX) && ld->PeriodicX) || + ((ny < 0 || ny >= lY) && ld->PeriodicY) || + ((nz < 0 || nz >= lZ) && ld->PeriodicZ) + ){ + // Implement periodic boundary in X direction. + + // If the target node reached through propagation is outside the lattice + // the kernel stores it in some buffer around the domain. + // From this position the PDF must be transported to the other side of the + // geometry. 
+ + // Take PDF from outside the domain. + + // x periodic + if (nx < 0) { + px = lX - 1; + } + else if (nx >= lX) { + px = 0; + } else { + px = nx; + } + + // y periodic + if (ny < 0) { + py = lY - 1; + } + else if (ny >= lY) { + py = 0; + } else { + py = ny; + } + + // z periodic + if (nz < 0) { + pz = lZ - 1; + } + else if (nz >= lZ) { + pz = 0; + } else { + pz = nz; + } + + if (ld->Lattice[L_INDEX_4(lDims, px, py, pz)] == LAT_CELL_OBSTACLE) { +#ifdef PROP_MODEL_PUSH + srcIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, d); + dstIndex = P_INDEX_5(gDims, x + oX, y + oY, z + oZ, D3Q19_INV[d]); +#elif PROP_MODEL_PULL + srcIndex = P_INDEX_5(gDims, x + oX, y + oY, z + oZ, D3Q19_INV[d]); + dstIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, d); +#endif + } + else { + +#ifdef PROP_MODEL_PUSH + srcIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, d); + // Put it on the other side back into the domain. + dstIndex = P_INDEX_5(gDims, px + oX, py + oY, pz + oZ, d); +#elif PROP_MODEL_PULL + srcIndex = P_INDEX_5(gDims, px + oX, py + oY, pz + oZ, d); + // Put it on the other side back into the ghost layer. 
+ dstIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, d); +#endif + + VerifyMsg(nBounceBackPdfs < kd->nBounceBackPdfs, "nBBPdfs %d < kd->nBBPdfs %d xyz: %d %d %d d: %d\n", nBounceBackPdfs, kd->nBounceBackPdfs, x, y, z, d); + + } + + kd->BounceBackPdfsSrc[nBounceBackPdfs] = srcIndex; + kd->BounceBackPdfsDst[nBounceBackPdfs] = dstIndex; + + ++nBounceBackPdfs; + + } + else if (nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) { + continue; + } + else if (ld->Lattice[L_INDEX_4(lDims, nx, ny, nz)] == LAT_CELL_OBSTACLE) { +#ifdef PROP_MODEL_PUSH + srcIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, d); + dstIndex = P_INDEX_5(gDims, x + oX, y + oY, z + oZ, D3Q19_INV[d]); +#elif PROP_MODEL_PULL + srcIndex = P_INDEX_5(gDims, x + oX, y + oY, z + oZ, D3Q19_INV[d]); + dstIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, d); + // srcIndex = P_INDEX_5(gDims, x + oX, y + oY, z + oZ, d); + // dstIndex = P_INDEX_5(gDims, nx + oX, ny + oY, nz + oZ, D3Q19_INV[d]); +#endif + + VerifyMsg(nBounceBackPdfs < kd->nBounceBackPdfs, "nBBPdfs %d < kd->nBBPdfs %d xyz: %d %d %d d: %d\n", nBounceBackPdfs, kd->nBounceBackPdfs, x, y, z, d); + + kd->BounceBackPdfsSrc[nBounceBackPdfs] = srcIndex; + kd->BounceBackPdfsDst[nBounceBackPdfs] = dstIndex; + + ++nBounceBackPdfs; + } + } + } + } + } + } + + + // Fill remaining KernelData structures + kd->GetNode = FNAME(GetNode); + kd->SetNode = FNAME(SetNode); + + kd->BoundaryConditionsGetPdf = FNAME(BcGetPdf); + kd->BoundaryConditionsSetPdf = FNAME(BcSetPdf); + + kd->Kernel = FNAME(D3Q19BlkKernel); + + kd->DstPdfs = NULL; + kd->PdfsActive = kd->Pdfs[0]; + + return; +} + +void FNAME(D3Q19BlkDeinit)(LatticeDesc * ld, KernelData ** kernelData) +{ + MemFree((void **) & ((*kernelData)->Pdfs[0])); + MemFree((void **) & ((*kernelData)->Pdfs[1])); + + MemFree((void **) & ((*kernelData)->BounceBackPdfsSrc)); + MemFree((void **) & ((*kernelData)->BounceBackPdfsDst)); + + MemFree((void **)kernelData); + + return; +} + +// Kernels without blocking 
perform the same initialization/deinitialization as with +// blocking, except that a different kernel is called. Hence, no arguments are allowed. + +void FNAME(D3Q19Init)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params) +{ + Parameters p; + // Any kernel argument is a usage error: report it and terminate the program via exit(1). + if (params->nKernelArgs != 0) { + printf("ERROR: unknown kernel parameter.\n"); + printf("This kernels accepts no parameters.\n"); + exit(1); + } + + // Setup an empty parameters structure. + p.nArgs = params->nArgs; + p.Args = params->Args; + p.nKernelArgs = 0; + p.KernelArgs = NULL; + + // Call init routine for blocking kernel and override the + // kernel function to be called later on. + FNAME(D3Q19BlkInit)(ld, kernelData, &p); + + (*kernelData)->Kernel = FNAME(D3Q19Kernel); + + return; + +} +// Counterpart of D3Q19Init: the data was set up by the blocking init routine, so releasing it is forwarded to the blocking deinit routine. +void FNAME(D3Q19Deinit)(LatticeDesc * ld, KernelData ** kernelData) +{ + FNAME(D3Q19BlkDeinit)(ld, kernelData); + return; +} diff --git a/src/BenchKernelD3Q19Common.h b/src/BenchKernelD3Q19Common.h new file mode 100644 index 0000000..cceddca --- /dev/null +++ b/src/BenchKernelD3Q19Common.h @@ -0,0 +1,86 @@ +// -------------------------------------------------------------------------- +// +// Copyright +// Markus Wittmann, 2016-2017 +// RRZE, University of Erlangen-Nuremberg, Germany +// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de +// +// Viktor Haag, 2016 +// LSS, University of Erlangen-Nuremberg, Germany +// +// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). +// +// LbmBenchKernels is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// LbmBenchKernels is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details.
+// +// You should have received a copy of the GNU General Public License +// along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>. +// +// -------------------------------------------------------------------------- +#ifndef __BENCH_KERNEL_D3Q19_COMMON_H__ +#define __BENCH_KERNEL_D3Q19_COMMON_H__ + +#include "Kernel.h" + +typedef struct KernelDataEx_ +{ + KernelData kd; + int Blk[3]; // Blocking in X, Y, and Z direction, value of 0 disables blocking. +} KernelDataEx; + + +// Build a function name extended by the propagation model name and the data layout. +// FNAME(test) will be expanded to test_PushSoA if DATA_LAYOUT_NAME is defined +// as SoA and PROP_MODEL_NAME is defined as Push. +#define FNAME(functionName) JOIN(JOIN(functionName,_),JOIN(PROP_MODEL_NAME,DATA_LAYOUT_NAME)) + +#ifndef DATA_LAYOUT_NAME + #error DATA_LAYOUT_NAME must be defined +#endif + +#ifndef PROP_MODEL_NAME + #error PROP_MODEL_NAME must be defined +#endif + +// ----------------------------------------------------------------------- +// Index function for accessing PDF array for different data layouts. +// The index is computed in int arithmetic, so N_D3Q19 * dims[0] * dims[1] * dims[2] must not exceed INT_MAX. +#define P_INDEX_5 FNAME(PINDEX5) + +static inline int FNAME(PINDEX5)(int dims[3], int x, int y, int z, int d) +{ + Assert(dims[0] > 0); + Assert(dims[1] > 0); + Assert(dims[2] > 0); + + Assert(x >= 0); + Assert(x < dims[0]); + Assert(y >= 0); + Assert(y < dims[1]); + Assert(z >= 0); + Assert(z < dims[2]); + Assert(d >= 0); +#ifdef D3Q19 + Assert(d < N_D3Q19); +#else +#error Not implemented for this discretization. +#endif + +#ifdef DATA_LAYOUT_SOA + return d * dims[0] * dims[1] * dims[2] + z * dims[0] * dims[1] + y * dims[0] + x; +#elif DATA_LAYOUT_AOS + return z * dims[0] * dims[1] * N_D3Q19 + y * dims[0] * N_D3Q19 + x * N_D3Q19 + d; +#else +#error P_INDEX_5 function no implemented for chosen data layout.
+#endif +} + +#endif // __BENCH_KERNEL_D3Q19_COMMON_H__ + diff --git a/src/BenchKernelD3Q19List.c b/src/BenchKernelD3Q19List.c new file mode 100644 index 0000000..7922b7a --- /dev/null +++ b/src/BenchKernelD3Q19List.c @@ -0,0 +1,347 @@ +// -------------------------------------------------------------------------- +// +// Copyright +// Markus Wittmann, 2016-2017 +// RRZE, University of Erlangen-Nuremberg, Germany +// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de +// +// Viktor Haag, 2016 +// LSS, University of Erlangen-Nuremberg, Germany +// +// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). +// +// LbmBenchKernels is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// LbmBenchKernels is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>.
+// +// -------------------------------------------------------------------------- +#include "BenchKernelD3Q19ListCommon.h" + +#include "Memory.h" +#include "Vtk.h" + +#include +#include + +// Runs cd->MaxIterations time steps over the kdl->nFluid fluid nodes: collision with separate even/odd relaxation rates (omegaEven, omegaOdd), propagation through the adjacency list (push: on store, pull: on load), then src and dst are swapped. +void FNAME(D3Q19ListKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd) +{ + Assert(ld != NULL); + Assert(kernelData != NULL); + Assert(cd != NULL); + + Assert(cd->Omega > 0.0); + Assert(cd->Omega < 2.0); + + KernelData * kd = (KernelData *)kernelData; + KernelDataList * kdl = (KernelDataList *)kernelData; + + PdfT omega = cd->Omega; + PdfT omegaEven = omega; +// PdfT omegaOdd = 8.0*((2.0-omegaEven)/(8.0-omegaEven)); //"standard" trt odd relaxation parameter + PdfT magicParam = 1.0/12.0; // 1/4: best stability; 1/12: removes third-order advection error (best advection); 1/6: removes fourth-order diffusion error (best diffusion); 3/16: exact location of bounce back for poiseuille flow + PdfT omegaOdd = 1.0/( 0.5 + magicParam/(1.0/omega - 0.5) ); + + PdfT evenPart = 0.0; + PdfT oddPart = 0.0; + PdfT dir_indep_trm = 0.0; + + PdfT w_0 = 1.0 / 3.0; + PdfT w_1 = 1.0 / 18.0; + PdfT w_2 = 1.0 / 36.0; + + PdfT w_1_x3 = w_1 * 3.0; PdfT w_1_nine_half = w_1 * 9.0/2.0; PdfT w_1_indep = 0.0; + PdfT w_2_x3 = w_2 * 3.0; PdfT w_2_nine_half = w_2 * 9.0/2.0; PdfT w_2_indep = 0.0; + + PdfT ux, uy, uz, ui; + PdfT dens; + + // Declare pdf_N, pdf_E, pdf_S, pdf_W, ...
+ #define X(name, idx, idxinv, x, y, z) PdfT JOIN(pdf_,name); + D3Q19_LIST + #undef X + + PdfT * src = kd->Pdfs[0]; + PdfT * dst = kd->Pdfs[1]; + PdfT * tmp; + + int maxIterations = cd->MaxIterations; + int nFluid = kdl->nFluid; + int nCells = kdl->nCells; + + uint32_t adjListIndex; + uint32_t * adjList = kdl->AdjList; + + #ifdef VTK_OUTPUT + if (cd->VtkOutput) { + kd->PdfsActive = src; + VtkWrite(ld, kd, cd, 0); + } + #endif + + #ifdef STATISTICS + kd->PdfsActive = src; + KernelStatistics(kd, ld, cd, 0); + #endif + + // TODO: outer openmp parallel + for(int iter = 0; iter < maxIterations; ++iter) { + + + + #ifdef _OPENMP + #pragma omp parallel for default(none) \ + shared(nFluid, nCells, kd, kdl, adjList, src, dst, w_0, w_1, w_2, omegaEven, omegaOdd, \ + w_1_x3, w_2_x3, w_1_nine_half, w_2_nine_half, cd) \ + private(ux, uy, uz, ui, dens, dir_indep_trm, adjListIndex, \ + pdf_C, \ + pdf_N, pdf_E, pdf_S, pdf_W, \ + pdf_NE, pdf_SE, pdf_SW, pdf_NW, \ + pdf_T, pdf_TN, pdf_TE, pdf_TS, pdf_TW, \ + pdf_B, pdf_BN, pdf_BE, pdf_BS, pdf_BW, \ + evenPart, oddPart, w_1_indep, w_2_indep) + #endif + for (int index = 0; index < nFluid; ++index) { + + #define I(index, dir) P_INDEX_3((nCells), (index), (dir)) + +#ifdef PROP_MODEL_PUSH + + // Load PDFs of local cell: pdf_N = src[I(index, D3Q19_N)]; ... + #define X(name, idx, idxinv, _x, _y, _z) JOIN(pdf_,name) = src[I(index, idx)]; + D3Q19_LIST + #undef X + +#elif PROP_MODEL_PULL + + adjListIndex = index * N_D3Q19_IDX; + + pdf_C = src[P_INDEX_3(nCells, index, D3Q19_C)]; + + // Load PDFs of local cell: pdf_N = src[adjList[adjListIndex + D3Q19_N]]; ... + #define X(name, idx, idxinv, _x, _y, _z) JOIN(pdf_,name) = src[adjList[adjListIndex + idx]]; + D3Q19_LIST_WO_C + #undef X + +#else + #error No implementation for PROP_MODEL_NAME.
+#endif + +// #define LID_DRIVEN_CAVITY + +#ifdef LID_DRIVEN_CAVITY + int nX = kd->Dims[0]; + int nY = kd->Dims[1]; + int nZ = kd->Dims[2]; + + int x = kdl->Coords[C_INDEX_X(index)]; + int y = kdl->Coords[C_INDEX_Y(index)]; + int z = kdl->Coords[C_INDEX_Z(index)]; + + if (z == nZ - 4 && x > 3 && x < (nX - 4) && y > 3 && y < (nY - 4)) { + ux = 0.1 * 0.577; + uy = 0.0; + uz = 0.0; + } else { +#endif + ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE - + pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW; + uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN - + pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS; + uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS - + pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS; +#ifdef LID_DRIVEN_CAVITY + } +#endif + + dens = pdf_C + + pdf_N + pdf_E + pdf_S + pdf_W + + pdf_NE + pdf_SE + pdf_SW + pdf_NW + + pdf_T + pdf_TN + pdf_TE + pdf_TS + pdf_TW + + pdf_B + pdf_BN + pdf_BE + pdf_BS + pdf_BW; + + dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz)*3.0/2.0; + +#ifdef PROP_MODEL_PUSH + + adjListIndex = index * N_D3Q19_IDX; + + // direction: w_0 + dst[I(index, D3Q19_C) ] = pdf_C - omegaEven*(pdf_C - w_0*dir_indep_trm); + + // direction: w_1 + w_1_indep = w_1*dir_indep_trm; + + ui = uy; + evenPart = omegaEven*( 0.5*(pdf_N + pdf_S) - ui*ui*w_1_nine_half - w_1_indep ); + oddPart = omegaOdd*(0.5*(pdf_N - pdf_S) - ui*w_1_x3 ); + dst[adjList[adjListIndex + D3Q19_N]] = pdf_N - evenPart - oddPart; + dst[adjList[adjListIndex + D3Q19_S]] = pdf_S - evenPart + oddPart; + + ui = ux; + evenPart = omegaEven*( 0.5*(pdf_E + pdf_W) - ui*ui*w_1_nine_half - w_1_indep ); + oddPart = omegaOdd*(0.5*(pdf_E - pdf_W) - ui*w_1_x3 ); + dst[adjList[adjListIndex + D3Q19_E]] = pdf_E - evenPart - oddPart; + dst[adjList[adjListIndex + D3Q19_W]] = pdf_W - evenPart + oddPart; + + ui = uz; + evenPart = omegaEven*( 0.5*(pdf_T + pdf_B) - ui*ui*w_1_nine_half - w_1_indep ); + oddPart = omegaOdd*(0.5*(pdf_T - pdf_B) - ui*w_1_x3 ); + dst[adjList[adjListIndex + D3Q19_T]] = pdf_T - evenPart - oddPart; +
dst[adjList[adjListIndex + D3Q19_B]] = pdf_B - evenPart + oddPart; + + // direction: w_2 + w_2_indep = w_2*dir_indep_trm; + + ui = -ux + uy; + evenPart = omegaEven*( 0.5*(pdf_NW + pdf_SE) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_NW - pdf_SE) - ui*w_2_x3 ); + dst[adjList[adjListIndex + D3Q19_NW]] = pdf_NW - evenPart - oddPart; + dst[adjList[adjListIndex + D3Q19_SE]] = pdf_SE - evenPart + oddPart; + + ui = ux + uy; + evenPart = omegaEven*( 0.5*(pdf_NE + pdf_SW) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_NE - pdf_SW) - ui*w_2_x3 ); + dst[adjList[adjListIndex + D3Q19_NE]] = pdf_NE - evenPart - oddPart; + dst[adjList[adjListIndex + D3Q19_SW]] = pdf_SW - evenPart + oddPart; + + ui = -ux + uz; + evenPart = omegaEven*( 0.5*(pdf_TW + pdf_BE) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TW - pdf_BE) - ui*w_2_x3 ); + dst[adjList[adjListIndex + D3Q19_TW]] = pdf_TW - evenPart - oddPart; + dst[adjList[adjListIndex + D3Q19_BE]] = pdf_BE - evenPart + oddPart; + + ui = ux + uz; + evenPart = omegaEven*( 0.5*(pdf_TE + pdf_BW) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TE - pdf_BW) - ui*w_2_x3 ); + dst[adjList[adjListIndex + D3Q19_TE]] = pdf_TE - evenPart - oddPart; + dst[adjList[adjListIndex + D3Q19_BW]] = pdf_BW - evenPart + oddPart; + + ui = -uy + uz; + evenPart = omegaEven*( 0.5*(pdf_TS + pdf_BN) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TS - pdf_BN) - ui*w_2_x3 ); + dst[adjList[adjListIndex + D3Q19_TS]] = pdf_TS - evenPart - oddPart; + dst[adjList[adjListIndex + D3Q19_BN]] = pdf_BN - evenPart + oddPart; + + ui = uy + uz; + evenPart = omegaEven*( 0.5*(pdf_TN + pdf_BS) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TN - pdf_BS) - ui*w_2_x3 ); + dst[adjList[adjListIndex + D3Q19_TN]] = pdf_TN - evenPart - oddPart; + dst[adjList[adjListIndex + D3Q19_BS]] = pdf_BS - evenPart + oddPart; + +#elif PROP_MODEL_PULL + + // direction: w_0 +
dst[I(index, D3Q19_C )] = pdf_C - omegaEven*(pdf_C - w_0*dir_indep_trm); + + // direction: w_1 + w_1_indep = w_1*dir_indep_trm; + + ui = uy; + evenPart = omegaEven*( 0.5*(pdf_N + pdf_S) - ui*ui*w_1_nine_half - w_1_indep ); + oddPart = omegaOdd*(0.5*(pdf_N - pdf_S) - ui*w_1_x3 ); + dst[I(index, D3Q19_N )] = pdf_N - evenPart - oddPart; + dst[I(index, D3Q19_S )] = pdf_S - evenPart + oddPart; + + ui = ux; + evenPart = omegaEven*( 0.5*(pdf_E + pdf_W) - ui*ui*w_1_nine_half - w_1_indep ); + oddPart = omegaOdd*(0.5*(pdf_E - pdf_W) - ui*w_1_x3 ); + dst[I(index, D3Q19_E )] = pdf_E - evenPart - oddPart; + dst[I(index, D3Q19_W )] = pdf_W - evenPart + oddPart; + + ui = uz; + evenPart = omegaEven*( 0.5*(pdf_T + pdf_B) - ui*ui*w_1_nine_half - w_1_indep ); + oddPart = omegaOdd*(0.5*(pdf_T - pdf_B) - ui*w_1_x3 ); + dst[I(index, D3Q19_T )] = pdf_T - evenPart - oddPart; + dst[I(index, D3Q19_B )] = pdf_B - evenPart + oddPart; + + // direction: w_2 + w_2_indep = w_2*dir_indep_trm; + + ui = -ux + uy; + evenPart = omegaEven*( 0.5*(pdf_NW + pdf_SE) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_NW - pdf_SE) - ui*w_2_x3 ); + dst[I(index, D3Q19_NW)] = pdf_NW - evenPart - oddPart; + dst[I(index, D3Q19_SE)] = pdf_SE - evenPart + oddPart; + + ui = ux + uy; + evenPart = omegaEven*( 0.5*(pdf_NE + pdf_SW) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_NE - pdf_SW) - ui*w_2_x3 ); + dst[I(index, D3Q19_NE)] = pdf_NE - evenPart - oddPart; + dst[I(index, D3Q19_SW)] = pdf_SW - evenPart + oddPart; + + ui = -ux + uz; + evenPart = omegaEven*( 0.5*(pdf_TW + pdf_BE) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TW - pdf_BE) - ui*w_2_x3 ); + dst[I(index, D3Q19_TW)] = pdf_TW - evenPart - oddPart; + dst[I(index, D3Q19_BE)] = pdf_BE - evenPart + oddPart; + + ui = ux + uz; + evenPart = omegaEven*( 0.5*(pdf_TE + pdf_BW) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TE - pdf_BW) - ui*w_2_x3 ); + dst[I(index, D3Q19_TE)] = pdf_TE -
evenPart - oddPart; + dst[I(index, D3Q19_BW)] = pdf_BW - evenPart + oddPart; + + ui = -uy + uz; + evenPart = omegaEven*( 0.5*(pdf_TS + pdf_BN) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TS - pdf_BN) - ui*w_2_x3 ); + dst[I(index, D3Q19_TS)] = pdf_TS - evenPart - oddPart; + dst[I(index, D3Q19_BN)] = pdf_BN - evenPart + oddPart; + + ui = uy + uz; + evenPart = omegaEven*( 0.5*(pdf_TN + pdf_BS) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TN - pdf_BS) - ui*w_2_x3 ); + dst[I(index, D3Q19_TN)] = pdf_TN - evenPart - oddPart; + dst[I(index, D3Q19_BS)] = pdf_BS - evenPart + oddPart; + +#endif + #undef I + } // loop over fluid nodes + + #ifdef VERIFICATION + kd->PdfsActive = dst; + KernelAddBodyForce(kd, ld, cd); + #endif + + #ifdef VTK_OUTPUT + if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) { + kd->PdfsActive = dst; + VtkWrite(ld, kd, cd, iter); + } + #endif + + #ifdef STATISTICS + kd->PdfsActive = dst; + KernelStatistics(kd, ld, cd, iter); + #endif + + // swap grids + tmp = src; + src = dst; + dst = tmp; + + } // for (int iter = 0; ... + +#ifdef VTK_OUTPUT + if (cd->VtkOutput) { + kd->PdfsActive = src; + VtkWrite(ld, kd, cd, maxIterations); + } +#endif + +#ifdef STATISTICS + kd->PdfsActive = src; + KernelStatistics(kd, ld, cd, maxIterations); +#endif + + return; +} diff --git a/src/BenchKernelD3Q19List.h b/src/BenchKernelD3Q19List.h new file mode 100644 index 0000000..a30e709 --- /dev/null +++ b/src/BenchKernelD3Q19List.h @@ -0,0 +1,44 @@ +// -------------------------------------------------------------------------- +// +// Copyright +// Markus Wittmann, 2016-2017 +// RRZE, University of Erlangen-Nuremberg, Germany +// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de +// +// Viktor Haag, 2016 +// LSS, University of Erlangen-Nuremberg, Germany +// +// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+// +// LbmBenchKernels is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// LbmBenchKernels is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with LbmBenchKernels. If not, see . +// +// -------------------------------------------------------------------------- +#ifndef __BENCH_KERNEL_D3Q19_LIST__ +#define __BENCH_KERNEL_D3Q19_LIST__ + +#include "Kernel.h" + +void D3Q19ListInit_PushSoA(LatticeDesc * ld, KernelData ** kernelData, Parameters * params); +void D3Q19ListInit_PushAoS(LatticeDesc * ld, KernelData ** kernelData, Parameters * params); + +void D3Q19ListDeinit_PushSoA(LatticeDesc * ld, KernelData ** kernelData); +void D3Q19ListDeinit_PushAoS(LatticeDesc * ld, KernelData ** kernelData); + +void D3Q19ListInit_PullSoA(LatticeDesc * ld, KernelData ** kernelData, Parameters * params); +void D3Q19ListInit_PullAoS(LatticeDesc * ld, KernelData ** kernelData, Parameters * params); + +void D3Q19ListDeinit_PullSoA(LatticeDesc * ld, KernelData ** kernelData); +void D3Q19ListDeinit_PullAoS(LatticeDesc * ld, KernelData ** kernelData); + +#endif // __BENCH_KERNEL_D3Q19_LIST__ diff --git a/src/BenchKernelD3Q19ListAa.c b/src/BenchKernelD3Q19ListAa.c new file mode 100644 index 0000000..2c3572c --- /dev/null +++ b/src/BenchKernelD3Q19ListAa.c @@ -0,0 +1,406 @@ +// -------------------------------------------------------------------------- +// +// Copyright +// Markus Wittmann, 2016-2017 +// RRZE, University of Erlangen-Nuremberg, Germany +// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de +// +// Viktor Haag, 2016 +// LSS, 
University of Erlangen-Nuremberg, Germany +// +// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). +// +// LbmBenchKernels is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// LbmBenchKernels is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with LbmBenchKernels. If not, see . +// +// -------------------------------------------------------------------------- +#include "BenchKernelD3Q19ListAaCommon.h" + +#include "Memory.h" +#include "Vtk.h" +#include "LikwidIf.h" + +#include +#include + +void FNAME(D3Q19ListAaKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd) +{ + Assert(ld != NULL); + Assert(kernelData != NULL); + Assert(cd != NULL); + + Assert(cd->Omega > 0.0); + Assert(cd->Omega < 2.0); + + KernelData * kd = (KernelData *)kernelData; + KernelDataList * kdl = (KernelDataList *)kernelData; + + PdfT omega = cd->Omega; + PdfT omegaEven = omega; + // 1/4: best stability; + // 1/12: removes third-order advection error (best advection); + // 1/6: removes fourth-order diffusion error (best diffusion); + // 3/16: exact location of bounce back for poiseuille flow + PdfT magicParam = 1.0/12.0; + PdfT omegaOdd = 1.0/( 0.5 + magicParam/(1.0/omega - 0.5) ); + + PdfT evenPart = 0.0; + PdfT oddPart = 0.0; + PdfT dir_indep_trm = 0.0; + + const PdfT w_0 = 1.0 / 3.0; + const PdfT w_1 = 1.0 / 18.0; + const PdfT w_2 = 1.0 / 36.0; + + const PdfT w_1_x3 = w_1 * 3.0; const PdfT w_1_nine_half = w_1 * 9.0/2.0; PdfT w_1_indep = 0.0; + const PdfT w_2_x3 = w_2 * 3.0; const PdfT w_2_nine_half = 
w_2 * 9.0/2.0; PdfT w_2_indep = 0.0; + + PdfT ui; + + PdfT ux, uy, uz; + PdfT dens; + + // Declare pdf_N, pdf_E, pdf_S, pdf_W, ... + #define X(name, idx, idxinv, x, y, z) PdfT JOIN(pdf_,name); + D3Q19_LIST + #undef X + + PdfT * src = kd->Pdfs[0]; + + int maxIterations = cd->MaxIterations; + int nFluid = kdl->nFluid; + int nCells = kdl->nCells; + + uint32_t adjListIndex; + uint32_t * adjList = kdl->AdjList; + + + + #ifdef VTK_OUTPUT + if (cd->VtkOutput) { + kd->PdfsActive = src; + VtkWrite(ld, kd, cd, -1); + } + #endif + + #ifdef STATISTICS + kd->PdfsActive = src; + KernelStatistics(kd, ld, cd, 0); + #endif + + // TODO: outer openmp parallel + for(int iter = 0; iter < maxIterations; iter += 2) { + + + // -------------------------------------------------------------------- + // even time step + + X_LIKWID_START("list-aa-even"); + + #ifdef _OPENMP + #pragma omp parallel for default(none) \ + shared(nFluid, nCells, kd, kdl, adjList, omegaOdd, omegaEven, src) \ + private(ux, uy, uz, dens, adjListIndex, evenPart, oddPart, dir_indep_trm, w_1_indep, w_2_indep, ui,\ + pdf_C, \ + pdf_N, pdf_E, pdf_S, pdf_W, \ + pdf_NE, pdf_SE, pdf_SW, pdf_NW, \ + pdf_T, pdf_TN, pdf_TE, pdf_TS, pdf_TW, \ + pdf_B, pdf_BN, pdf_BE, pdf_BS, pdf_BW) + #endif + for (int index = 0; index < nFluid; ++index) { + + + #define I(index, dir) P_INDEX_3((nCells), (index), (dir)) + + // Load PDFs of local cell: pdf_N = src[I(x, y, z, D3Q19_N)]; ... 
+ #define X(name, idx, idxinv, _x, _y, _z) JOIN(pdf_,name) = src[I(index, idx)]; + D3Q19_LIST + #undef X + +// #define LID_DRIVEN_CAVITY + +#ifdef LID_DRIVEN_CAVITY + int nX = kd->Dims[0]; + int nY = kd->Dims[1]; + int nZ = kd->Dims[2]; + + int x = kdl->Coords[C_INDEX_X(index)]; + int y = kdl->Coords[C_INDEX_Y(index)]; + int z = kdl->Coords[C_INDEX_Z(index)]; + + if (z == nZ - 4 && x > 3 && x < (nX - 4) && y > 3 && y < (nY - 4)) { + ux = 0.1 * 0.577; + uy = 0.0; + uz = 0.0; + } else { +#endif + ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE - + pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW; + uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN - + pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS; + uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS - + pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS; +#ifdef LID_DRIVEN_CAVITY + } +#endif + + dens = pdf_C + + pdf_N + pdf_E + pdf_S + pdf_W + + pdf_NE + pdf_SE + pdf_SW + pdf_NW + + pdf_T + pdf_TN + pdf_TE + pdf_TS + pdf_TW + + pdf_B + pdf_BN + pdf_BE + pdf_BS + pdf_BW; + + dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz)*3.0/2.0; + + // direction: w_0 + src[I(index, D3Q19_C) ] = pdf_C - omegaEven*(pdf_C - w_0*dir_indep_trm); + + // direction: w_1 + w_1_indep = w_1*dir_indep_trm; + + ui = uy; + evenPart = omegaEven*( 0.5*(pdf_N + pdf_S) - ui*ui*w_1_nine_half - w_1_indep ); + oddPart = omegaOdd*(0.5*(pdf_N - pdf_S) - ui*w_1_x3 ); + src[I(index, D3Q19_S)] = pdf_N - evenPart - oddPart; + src[I(index, D3Q19_N)] = pdf_S - evenPart + oddPart; + + ui = ux; + evenPart = omegaEven*( 0.5*(pdf_E + pdf_W) - ui*ui*w_1_nine_half - w_1_indep ); + oddPart = omegaOdd*(0.5*(pdf_E - pdf_W) - ui*w_1_x3 ); + src[I(index, D3Q19_W)] = pdf_E - evenPart - oddPart; + src[I(index, D3Q19_E)] = pdf_W - evenPart + oddPart; + + ui = uz; + evenPart = omegaEven*( 0.5*(pdf_T + pdf_B) - ui*ui*w_1_nine_half - w_1_indep ); + oddPart = omegaOdd*(0.5*(pdf_T - pdf_B) - ui*w_1_x3 ); + src[I(index, D3Q19_B)] = pdf_T - evenPart - oddPart; + src[I(index, D3Q19_T)] = pdf_B - evenPart 
+ oddPart; + + // direction: w_2 + w_2_indep = w_2*dir_indep_trm; + + ui = -ux + uy; + evenPart = omegaEven*( 0.5*(pdf_NW + pdf_SE) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_NW - pdf_SE) - ui*w_2_x3 ); + src[I(index, D3Q19_SE)] = pdf_NW - evenPart - oddPart; + src[I(index, D3Q19_NW)] = pdf_SE - evenPart + oddPart; + + ui = ux + uy; + evenPart = omegaEven*( 0.5*(pdf_NE + pdf_SW) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_NE - pdf_SW) - ui*w_2_x3 ); + src[I(index, D3Q19_SW)] = pdf_NE - evenPart - oddPart; + src[I(index, D3Q19_NE)] = pdf_SW - evenPart + oddPart; + + ui = -ux + uz; + evenPart = omegaEven*( 0.5*(pdf_TW + pdf_BE) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TW - pdf_BE) - ui*w_2_x3 ); + src[I(index, D3Q19_BE)] = pdf_TW - evenPart - oddPart; + src[I(index, D3Q19_TW)] = pdf_BE - evenPart + oddPart; + + ui = ux + uz; + evenPart = omegaEven*( 0.5*(pdf_TE + pdf_BW) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TE - pdf_BW) - ui*w_2_x3 ); + src[I(index, D3Q19_BW)] = pdf_TE - evenPart - oddPart; + src[I(index, D3Q19_TE)] = pdf_BW - evenPart + oddPart; + + ui = -uy + uz; + evenPart = omegaEven*( 0.5*(pdf_TS + pdf_BN) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TS - pdf_BN) - ui*w_2_x3 ); + src[I(index, D3Q19_BN)] = pdf_TS - evenPart - oddPart; + src[I(index, D3Q19_TS)] = pdf_BN - evenPart + oddPart; + + ui = uy + uz; + evenPart = omegaEven*( 0.5*(pdf_TN + pdf_BS) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TN - pdf_BS) - ui*w_2_x3 ); + src[I(index, D3Q19_BS)] = pdf_TN - evenPart - oddPart; + src[I(index, D3Q19_TN)] = pdf_BS - evenPart + oddPart; + + } // loop over fluid nodes + + X_LIKWID_STOP("list-aa-even"); + + // save current iteration + kdl->Iteration = iter; + #ifdef VERIFICATION + kd->PdfsActive = src; + KernelAddBodyForce(kd, ld, cd); + #endif + + // 
-------------------------------------------------------------------- + // odd time step + + X_LIKWID_START("list-aa-odd"); + +#ifdef _OPENMP + #pragma omp parallel for default(none) \ + shared(nFluid, nCells, kd, kdl, adjList, omegaOdd, omegaEven, src) \ + private(ux, uy, uz, dens, adjListIndex, evenPart, oddPart, dir_indep_trm, w_1_indep, w_2_indep, ui,\ + pdf_C, \ + pdf_N, pdf_E, pdf_S, pdf_W, \ + pdf_NE, pdf_SE, pdf_SW, pdf_NW, \ + pdf_T, pdf_TN, pdf_TE, pdf_TS, pdf_TW, \ + pdf_B, pdf_BN, pdf_BE, pdf_BS, pdf_BW) +#endif + for (int index = 0; index < nFluid; ++index) { + + + adjListIndex = index * N_D3Q19_IDX; + + // Load PDFs of local cell: pdf_N = src[adjList[adjListIndex + D3Q19_S]]; ... + pdf_C = src[P_INDEX_3(nCells, index, D3Q19_C)]; + + #define X(name, idx, idxinv, _x, _y, _z) JOIN(pdf_,name) = src[adjList[adjListIndex + idxinv]]; + D3Q19_LIST_WO_C + #undef X + +#ifdef LID_DRIVEN_CAVITY + int nX = kd->Dims[0]; + int nY = kd->Dims[1]; + int nZ = kd->Dims[2]; + + int x = kdl->Coords[C_INDEX_X(index)]; + int y = kdl->Coords[C_INDEX_Y(index)]; + int z = kdl->Coords[C_INDEX_Z(index)]; + + if (z == nZ - 4 && x > 3 && x < (nX - 4) && y > 3 && y < (nY - 4)) { + ux = 0.1 * 0.577; + uy = 0.0; + uz = 0.0; + } else { +#endif + ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE - + pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW; + uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN - + pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS; + uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS - + pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS; +#ifdef LID_DRIVEN_CAVITY + } +#endif + + dens = pdf_C + + pdf_N + pdf_E + pdf_S + pdf_W + + pdf_NE + pdf_SE + pdf_SW + pdf_NW + + pdf_T + pdf_TN + pdf_TE + pdf_TS + pdf_TW + + pdf_B + pdf_BN + pdf_BE + pdf_BS + pdf_BW; + + dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz)*3.0/2.0; + + adjListIndex = index * N_D3Q19_IDX; + + // direction: w_0 + src[I(index, D3Q19_C) ] = pdf_C - omegaEven*(pdf_C - w_0*dir_indep_trm); + + // direction: w_1 + w_1_indep = 
w_1*dir_indep_trm; + + ui = uy; + evenPart = omegaEven*( 0.5*(pdf_N + pdf_S) - ui*ui*w_1_nine_half - w_1_indep ); + oddPart = omegaOdd*(0.5*(pdf_N - pdf_S) - ui*w_1_x3 ); + src[adjList[adjListIndex + D3Q19_N]] = pdf_N - evenPart - oddPart; + src[adjList[adjListIndex + D3Q19_S]] = pdf_S - evenPart + oddPart; + + ui = ux; + evenPart = omegaEven*( 0.5*(pdf_E + pdf_W) - ui*ui*w_1_nine_half - w_1_indep ); + oddPart = omegaOdd*(0.5*(pdf_E - pdf_W) - ui*w_1_x3 ); + src[adjList[adjListIndex + D3Q19_E]] = pdf_E - evenPart - oddPart; + src[adjList[adjListIndex + D3Q19_W]] = pdf_W - evenPart + oddPart; + + ui = uz; + evenPart = omegaEven*( 0.5*(pdf_T + pdf_B) - ui*ui*w_1_nine_half - w_1_indep ); + oddPart = omegaOdd*(0.5*(pdf_T - pdf_B) - ui*w_1_x3 ); + src[adjList[adjListIndex + D3Q19_T]] = pdf_T - evenPart - oddPart; + src[adjList[adjListIndex + D3Q19_B]] = pdf_B - evenPart + oddPart; + + // direction: w_2 + w_2_indep = w_2*dir_indep_trm; + + ui = -ux + uy; + evenPart = omegaEven*( 0.5*(pdf_NW + pdf_SE) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_NW - pdf_SE) - ui*w_2_x3 ); + src[adjList[adjListIndex + D3Q19_NW]] = pdf_NW - evenPart - oddPart; + src[adjList[adjListIndex + D3Q19_SE]] = pdf_SE - evenPart + oddPart; + + ui = ux + uy; + evenPart = omegaEven*( 0.5*(pdf_NE + pdf_SW) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_NE - pdf_SW) - ui*w_2_x3 ); + src[adjList[adjListIndex + D3Q19_NE]] = pdf_NE - evenPart - oddPart; + src[adjList[adjListIndex + D3Q19_SW]] = pdf_SW - evenPart + oddPart; + + ui = -ux + uz; + evenPart = omegaEven*( 0.5*(pdf_TW + pdf_BE) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TW - pdf_BE) - ui*w_2_x3 ); + src[adjList[adjListIndex + D3Q19_TW]] = pdf_TW - evenPart - oddPart; + src[adjList[adjListIndex + D3Q19_BE]] = pdf_BE - evenPart + oddPart; + + ui = ux + uz; + evenPart = omegaEven*( 0.5*(pdf_TE + pdf_BW) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TE - 
pdf_BW) - ui*w_2_x3 ); + src[adjList[adjListIndex + D3Q19_TE]] = pdf_TE - evenPart - oddPart; + src[adjList[adjListIndex + D3Q19_BW]] = pdf_BW - evenPart + oddPart; + + ui = -uy + uz; + evenPart = omegaEven*( 0.5*(pdf_TS + pdf_BN) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TS - pdf_BN) - ui*w_2_x3 ); + src[adjList[adjListIndex + D3Q19_TS]] = pdf_TS - evenPart - oddPart; + src[adjList[adjListIndex + D3Q19_BN]] = pdf_BN - evenPart + oddPart; + + ui = uy + uz; + evenPart = omegaEven*( 0.5*(pdf_TN + pdf_BS) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TN - pdf_BS) - ui*w_2_x3 ); + src[adjList[adjListIndex + D3Q19_TN]] = pdf_TN - evenPart - oddPart; + src[adjList[adjListIndex + D3Q19_BS]] = pdf_BS - evenPart + oddPart; + + #undef I + } // loop over fluid nodes + + X_LIKWID_STOP("list-aa-odd"); + + // save current iteration + kdl->Iteration = iter + 1; + + #ifdef VERIFICATION + kd->PdfsActive = src; + KernelAddBodyForce(kd, ld, cd); + #endif + + #ifdef VTK_OUTPUT + if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) { + kd->PdfsActive = src; + VtkWrite(ld, kd, cd, iter); + } + #endif + + #ifdef STATISTICS + kd->PdfsActive = src; + KernelStatistics(kd, ld, cd, iter); + #endif + + + } // for (int iter = 0; ... 
+ + +#ifdef VTK_OUTPUT + if (cd->VtkOutput) { + kd->PdfsActive = src; + VtkWrite(ld, kd, cd, maxIterations); + } +#endif + +#ifdef STATISTICS + kd->PdfsActive = src; + KernelStatistics(kd, ld, cd, maxIterations); +#endif + + return; +} diff --git a/src/BenchKernelD3Q19ListAa.h b/src/BenchKernelD3Q19ListAa.h new file mode 100644 index 0000000..d0ac199 --- /dev/null +++ b/src/BenchKernelD3Q19ListAa.h @@ -0,0 +1,40 @@ +// -------------------------------------------------------------------------- +// +// Copyright +// Markus Wittmann, 2016-2017 +// RRZE, University of Erlangen-Nuremberg, Germany +// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de +// +// Viktor Haag, 2016 +// LSS, University of Erlangen-Nuremberg, Germany +// +// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). +// +// LbmBenchKernels is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// LbmBenchKernels is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with LbmBenchKernels. If not, see . 
+//
+// --------------------------------------------------------------------------
+#ifndef __BENCH_KERNEL_D3Q19_LIST_AA__
+#define __BENCH_KERNEL_D3Q19_LIST_AA__
+
+#include "Kernel.h"
+// Setup entry points of the list-based AA kernel, one per data layout (SoA / AoS); *kernelData receives the kernel data.
+void D3Q19ListAaInit_PushSoA(LatticeDesc * ld, KernelData ** kernelData, Parameters * params);
+void D3Q19ListAaInit_PushAoS(LatticeDesc * ld, KernelData ** kernelData, Parameters * params);
+// Matching teardown entry points; presumably they release what the Init counterpart allocated -- confirm in the .c file.
+void D3Q19ListAaDeinit_PushSoA(LatticeDesc * ld, KernelData ** kernelData);
+void D3Q19ListAaDeinit_PushAoS(LatticeDesc * ld, KernelData ** kernelData);
+
+
+
+#endif // __BENCH_KERNEL_D3Q19_LIST_AA__
diff --git a/src/BenchKernelD3Q19ListAaCommon.c b/src/BenchKernelD3Q19ListAaCommon.c
new file mode 100644
index 0000000..d2bcb52
--- /dev/null
+++ b/src/BenchKernelD3Q19ListAaCommon.c
@@ -0,0 +1,628 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+// Markus Wittmann, 2016-2017
+// RRZE, University of Erlangen-Nuremberg, Germany
+// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+// Viktor Haag, 2016
+// LSS, University of Erlangen-Nuremberg, Germany
+//
+// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+// LbmBenchKernels is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// LbmBenchKernels is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#include "BenchKernelD3Q19ListAaCommon.h"
+
+#include "Memory.h"
+#include "Vtk.h"
+
+#include <errno.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+
+// Forward definition.
+void FNAME(D3Q19ListAaKernel)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd);
+
+
+// -----------------------------------------------------------------------
+// Functions which are used as callback by the kernel to read or write
+// PDFs and nodes.
+//
+// All of them select the storage location by the parity of kdl->Iteration:
+//   even:      PDFs are stored inverse, i.e. the PDF of direction d of a node
+//              lives in the slot AdjList[node * N_D3Q19_IDX + inverse(d)];
+//              the center PDF always stays in the node's own slot.
+//   otherwise: every PDF lives in the node's own slot (P_INDEX_5).
+
+// Reads the PDF of direction dir of node (x, y, z) into *pdf.
+static void FNAME(BCGetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT * pdf)
+{
+    Assert(kd != NULL);
+    Assert(kd->PdfsActive != NULL);
+    Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+    Assert(pdf != NULL);
+
+    Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
+    Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
+    Assert(dir >= 0); Assert(dir < N_D3Q19);
+
+    KernelDataList * kdl = (KernelDataList *)kd;
+
+    if (kdl->Iteration % 2 == 0) {
+        // Pdfs are stored inverse, local PDFs are located in remote nodes
+        uint32_t nodeIndex = kdl->Grid[L_INDEX_4(kd->Dims, x, y, z)];
+
+        if (dir != D3Q19_C) {
+            uint32_t adjListIndex = nodeIndex * N_D3Q19_IDX;
+
+            *pdf = kd->PdfsActive[kdl->AdjList[adjListIndex + D3Q19_INV[dir]]];
+        }
+        else {
+            *pdf = kd->PdfsActive[P_INDEX_3(kdl->nCells, nodeIndex, dir)];
+        }
+    }
+    else {
+        *pdf = kd->PdfsActive[P_INDEX_5(kdl, x, y, z, dir)];
+    }
+
+    return;
+}
+
+// Writes pdf as the PDF of direction dir of node (x, y, z).
+// Terminates the program if pdf is NaN.
+static void FNAME(BCSetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT pdf)
+{
+    Assert(kd != NULL);
+    Assert(kd->PdfsActive != NULL);
+    Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+    Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
+    Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
+    Assert(dir >= 0); Assert(dir < N_D3Q19);
+
+    if (isnan(pdf)) {
+        printf("ERROR: setting nan %d %d %d %d %s\n", x, y, z, dir, D3Q19_NAMES[dir]);
+        DEBUG_BREAK_POINT();
+        exit(1);
+    }
+
+    KernelDataList * kdl = (KernelDataList *)kd;
+
+    if (kdl->Iteration % 2 == 0) {
+        // Pdfs are stored inverse, local PDFs are located in remote nodes
+        uint32_t nodeIndex = kdl->Grid[L_INDEX_4(kd->Dims, x, y, z)];
+
+        if (dir != D3Q19_C) {
+            uint32_t adjListIndex = nodeIndex * N_D3Q19_IDX;
+
+            kd->PdfsActive[kdl->AdjList[adjListIndex + D3Q19_INV[dir]]] = pdf;
+        }
+        else {
+            kd->PdfsActive[P_INDEX_3(kdl->nCells, nodeIndex, dir)] = pdf;
+        }
+    }
+    else {
+        kd->PdfsActive[P_INDEX_5(kdl, x, y, z, dir)] = pdf;
+    }
+
+    return;
+}
+
+
+// Copies all N_D3Q19 PDFs of node (x, y, z) into pdfs[], indexed by direction.
+static void GetNode(KernelData * kd, int x, int y, int z, PdfT * pdfs)
+{
+    Assert(kd != NULL);
+    Assert(kd->PdfsActive != NULL);
+    Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+    Assert(pdfs != NULL);
+    Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
+    Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
+
+    KernelDataList * kdl = (KernelDataList *)kd;
+
+    if(kdl->Iteration % 2 == 0){
+
+        uint32_t nodeIndex = kdl->Grid[L_INDEX_4(kdl->kd.Dims, x, y, z)];
+        uint32_t adjListIndex = nodeIndex * N_D3Q19_IDX;
+
+        // Load PDFs of local cell: pdf_N = src[adjList[adjListIndex + D3Q19_S]]; ...
+        pdfs[D3Q19_C] = kd->PdfsActive[P_INDEX_3(kdl->nCells, nodeIndex, D3Q19_C)];
+
+        #define X(name, idx, idxinv, _x, _y, _z) pdfs[idx] = kd->PdfsActive[kdl->AdjList[adjListIndex + idxinv]];
+        D3Q19_LIST_WO_C
+        #undef X
+
+    } else {
+
+        #define I(x, y, z, dir) P_INDEX_5(KDL(kd), (x), (y), (z), (dir))
+        #define X(name, idx, idxinv, _x, _y, _z) pdfs[idx] = kd->PdfsActive[I(x, y, z, idx)];
+        D3Q19_LIST
+        #undef X
+        #undef I
+
+    }
+
+#if 0
+    // Detect NaNs
+    for (int d = 0; d < 19; ++d) {
+        if(isnan(pdfs[d]) || isinf(pdfs[d])) {
+            printf("%d %d %d %d nan! get node\n", x, y, z, d);
+            for (int d2 = 0; d2 < 19; ++d2) {
+                printf("%d: %e\n", d2, pdfs[d2]);
+            }
+            exit(1);
+        }
+    }
+#endif
+
+    return;
+}
+
+
+// Stores pdfs[] (indexed by direction) as the N_D3Q19 PDFs of node (x, y, z).
+static void SetNode(KernelData * kd, int x, int y, int z, PdfT * pdfs)
+{
+    Assert(kd != NULL);
+    Assert(kd->PdfsActive != NULL);
+    Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+    Assert(pdfs != NULL);
+
+    Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
+    Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
+
+#if 0
+    // Detect NaNs
+    for (int d = 0; d < 19; ++d) {
+        if(isnan(pdfs[d])) {
+            printf("%d %d %d %d nan! get node\n", x, y, z, d);
+            for (int d2 = 0; d2 < 19; ++d2) {
+                printf("%d: %e\n", d2, pdfs[d2]);
+            }
+            exit(1);
+        }
+    }
+#endif
+
+    KernelDataList * kdl = (KernelDataList *)kd;
+
+    if(kdl->Iteration % 2 == 0){
+
+        uint32_t nodeIndex = kdl->Grid[L_INDEX_4(kdl->kd.Dims, x, y, z)];
+        uint32_t adjListIndex = nodeIndex * N_D3Q19_IDX;
+
+        // Store PDFs of local cell: src[adjList[adjListIndex + D3Q19_S]] = pdf_N; ...
+        kd->PdfsActive[P_INDEX_3(kdl->nCells, nodeIndex, D3Q19_C)] = pdfs[D3Q19_C];
+
+        #define X(name, idx, idxinv, _x, _y, _z) kd->PdfsActive[kdl->AdjList[adjListIndex + idxinv]] = pdfs[idx];
+        D3Q19_LIST_WO_C
+        #undef X
+
+    } else {
+
+        #define I(x, y, z, dir) P_INDEX_5(KDL(kd), (x), (y), (z), (dir))
+        #define X(name, idx, idxinv, _x, _y, _z) kd->PdfsActive[I(x, y, z, idx)] = pdfs[idx];
+        D3Q19_LIST
+        #undef X
+        #undef I
+
+    }
+
+    return;
+}
+
+// Prints the kernel specific command line parameters.
+static void ParameterUsage(void)
+{
+    printf("Kernel parameters:\n");
+    printf(" [-blk <n>] [-blk-[xyz] <n>]\n");
+
+    return;
+}
+
+// Converts the value of a blocking argument to a positive int.
+// Terminates the program if the text is not a number, has trailing
+// characters, is not > 0, or does not fit into an int.
+static int ParseBlockSize(const char * arg)
+{
+    char * end = NULL;
+
+    errno = 0;
+    long value = strtol(arg, &end, 0);
+
+    if (end == arg || *end != '\0' || errno == ERANGE || value <= 0 || value > INT_MAX) {
+        printf("ERROR: blocking parameter must be > 0.\n");
+        exit(1);
+    }
+
+    return (int)value;
+}
+
+// Parses the kernel arguments in params and stores the requested blocking
+// factors for x, y, z in blk[0..2]. A value of 0 means "not specified".
+// Terminates the program on -h/-help/--help, on unknown arguments and on
+// invalid values.
+static void ParseParameters(Parameters * params, int * blk)
+{
+    Assert(blk != NULL);
+
+    blk[0] = 0; blk[1] = 0; blk[2] = 0;
+
+    #define ARG_IS(param) (!strcmp(params->KernelArgs[i], param))
+    #define NEXT_ARG_PRESENT() \
+        do { \
+            if (i + 1 >= params->nKernelArgs) { \
+                printf("ERROR: argument %s requires a parameter.\n", params->KernelArgs[i]); \
+                exit(1); \
+            } \
+        } while (0)
+
+
+    for (int i = 0; i < params->nKernelArgs; ++i) {
+        if (ARG_IS("-blk") || ARG_IS("--blk")) {
+            NEXT_ARG_PRESENT();
+
+            blk[0] = blk[1] = blk[2] = ParseBlockSize(params->KernelArgs[++i]);
+        }
+        else if (ARG_IS("-blk-x") || ARG_IS("--blk-x")) {
+            NEXT_ARG_PRESENT();
+
+            blk[0] = ParseBlockSize(params->KernelArgs[++i]);
+        }
+        else if (ARG_IS("-blk-y") || ARG_IS("--blk-y")) {
+            NEXT_ARG_PRESENT();
+
+            blk[1] = ParseBlockSize(params->KernelArgs[++i]);
+        }
+        else if (ARG_IS("-blk-z") || ARG_IS("--blk-z")) {
+            NEXT_ARG_PRESENT();
+
+            blk[2] = ParseBlockSize(params->KernelArgs[++i]);
+        }
+        else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) {
+            ParameterUsage();
+            exit(1);
+        }
+        else {
+            printf("ERROR: unknown kernel parameter: %s\n", params->KernelArgs[i]);
+            ParameterUsage();
+            exit(1);
+        }
+    }
+
+    #undef ARG_IS
+    #undef NEXT_ARG_PRESENT
+
+    return;
+}
+
+// Allocates and sets up the kernel data for the list based AA kernel:
+//   - one PDF array with N_D3Q19 entries per fluid node,
+//   - Grid:    lattice coordinate -> fluid node index,
+//   - Coords:  fluid node index -> lattice coordinates,
+//   - AdjList: per fluid node and non-center direction the index of the PDF
+//              slot which is the propagation partner in that direction.
+// The fluid nodes are numbered block by block (see -blk parameters), so the
+// kernel traverses the lattice blocked by simply walking the node list.
+// The new object is returned through *kernelData; the program is terminated
+// if an allocation fails.
+void FNAME(D3Q19ListAaInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
+{
+    KernelData * kd;
+    KernelDataList * kdl;
+
+    if (MemAlloc((void **)&kdl, sizeof(KernelDataList))) {
+        printf("ERROR: allocating kernel data failed: %zu bytes.\n", sizeof(KernelDataList));
+        exit(1);
+    }
+
+    kd = (KernelData *)kdl;
+    *kernelData = kd;
+
+#ifdef DEBUG
+    kd->Pdfs[0] = NULL;
+    kd->Pdfs[1] = NULL;
+    kd->PdfsActive = NULL;
+    kd->DstPdfs = NULL;
+    kd->SrcPdfs = NULL;
+    kd->Dims[0] = -1;
+    kd->Dims[1] = -1;
+    kd->Dims[2] = -1;
+    kd->GlobalDims[0] = -1;
+    kd->GlobalDims[1] = -1;
+    kd->GlobalDims[2] = -1;
+    kd->Offsets[0] = -1;
+    kd->Offsets[1] = -1;
+    kd->Offsets[2] = -1;
+
+    kd->ObstIndices = NULL;
+    kd->nObstIndices = -1;
+    kd->BounceBackPdfsSrc = NULL;
+    kd->BounceBackPdfsDst = NULL;
+    kd->nBounceBackPdfs = -1;
+
+    kdl->AdjList = NULL;
+    kdl->Coords = NULL;
+    kdl->Grid = NULL;
+    kdl->nCells = -1;
+    kdl->nFluid = -1;
+#endif
+
+    // No time step has been performed yet. The access callbacks above read
+    // Iteration, so it must not stay indeterminate: an odd value makes them
+    // address every PDF in the node's own slot.
+    kdl->Iteration = -1;
+
+    // Adjust the dimensions according to padding, if used.
+    kd->Dims[0] = kd->GlobalDims[0] = ld->Dims[0];
+    kd->Dims[1] = kd->GlobalDims[1] = ld->Dims[1];
+    kd->Dims[2] = kd->GlobalDims[2] = ld->Dims[2];
+
+    int * lDims = ld->Dims;
+
+    int lX = lDims[0];
+    int lY = lDims[1];
+    int lZ = lDims[2];
+
+    // Computed in size_t, the product of three ints can exceed INT_MAX.
+    size_t nTotalCells = (size_t)lX * (size_t)lY * (size_t)lZ;
+    int nCells = ld->nFluid; // TODO: + padding
+    int nFluid = ld->nFluid;
+
+    kdl->nCells = nCells;
+    kdl->nFluid = nFluid;
+
+    PdfT * pdfs = NULL;
+
+    int blk[3] = { 0 };
+
+    ParseParameters(params, blk);
+
+    if (blk[0] == 0) blk[0] = lX;
+    if (blk[1] == 0) blk[1] = lY;
+    if (blk[2] == 0) blk[2] = lZ;
+
+    printf("# blocking x: %3d y: %3d z: %3d\n", blk[0], blk[1], blk[2]);
+
+    // Only a single lattice is allocated here, so report exactly that amount.
+    size_t nPdfBytes = sizeof(PdfT) * (size_t)nCells * N_D3Q19;
+
+    printf("# allocating data for %d fluid LB nodes with padding (%zu bytes = %f MiB for the single lattice)\n",
+        nCells, nPdfBytes, nPdfBytes / 1024.0 / 1024.0);
+
+    if (MemAlloc((void **)&pdfs, nPdfBytes)) {
+        printf("ERROR: allocating PDF array failed: %zu bytes.\n", nPdfBytes);
+        exit(1);
+    }
+
+    kd->Pdfs[0] = pdfs;
+    // There is no second lattice; do not leave the pointer indeterminate.
+    kd->Pdfs[1] = NULL;
+
+    // Initialize PDFs with some (arbitrary) data for correct NUMA placement.
+    // Here we touch only the fluid nodes as this loop is OpenMP parallel and
+    // we want the same scheduling as in the kernel.
+    #ifdef _OPENMP
+        #pragma omp parallel for
+    #endif
+    for (int i = 0; i < nFluid; ++i) { for(int d = 0; d < N_D3Q19; ++d) {
+        pdfs[P_INDEX_3(nCells, i, d)] = 1.0;
+    } }
+
+    // Initialize all PDFs to some standard value.
+    for (int i = 0; i < nFluid; ++i) { for(int d = 0; d < N_D3Q19; ++d) {
+        pdfs[P_INDEX_3(nCells, i, d)] = 0.0;
+    } }
+
+    // ----------------------------------------------------------------------
+    // create grid which will hold the index numbers of the fluid nodes
+
+    uint32_t * grid;
+
+    if (MemAlloc((void **)&grid, nTotalCells * sizeof(uint32_t))) {
+        printf("ERROR: allocating grid for numbering failed: %zu bytes.\n", nTotalCells * sizeof(uint32_t));
+        exit(1);
+    }
+    kdl->Grid = grid;
+
+    int latticeIndex;
+
+#ifdef DEBUG
+    for(int z = 0; z < lZ; ++z) {
+        for(int y = 0; y < lY; ++y) {
+            for(int x = 0; x < lX; ++x) {
+
+                latticeIndex = L_INDEX_4(ld->Dims, x, y, z);
+
+                grid[latticeIndex] = ~0;
+            }
+        }
+    }
+#endif
+
+    // ----------------------------------------------------------------------
+    // generate numbering over grid
+
+    uint32_t * coords;
+
+    if (MemAlloc((void **)&coords, nFluid * sizeof(uint32_t) * 3)) {
+        printf("ERROR: allocating coords array failed: %zu bytes.\n", nFluid * sizeof(uint32_t) * 3);
+        exit(1);
+    }
+
+    kdl->Coords = coords;
+
+    // Index for the PDF nodes can start at 0 as we distinguish solid and fluid nodes
+    // through the ld->Lattice array.
+    int counter = 0;
+
+    // Blocking is implemented via setup of the adjacency list. The kernel later will
+    // walk through the lattice blocked automatically.
+    for (int bZ = 0; bZ < lZ; bZ += blk[2]) {
+    for (int bY = 0; bY < lY; bY += blk[1]) {
+    for (int bX = 0; bX < lX; bX += blk[0]) {
+
+        int eX = MIN(bX + blk[0], lX);
+        int eY = MIN(bY + blk[1], lY);
+        int eZ = MIN(bZ + blk[2], lZ);
+
+
+        for (int z = bZ; z < eZ; ++z) {
+        for (int y = bY; y < eY; ++y) {
+        for (int x = bX; x < eX; ++x) {
+
+            latticeIndex = L_INDEX_4(lDims, x, y, z);
+
+            if (ld->Lattice[latticeIndex] != LAT_CELL_OBSTACLE) {
+                grid[latticeIndex] = counter;
+
+                coords[C_INDEX_X(counter)] = x;
+                coords[C_INDEX_Y(counter)] = y;
+                coords[C_INDEX_Z(counter)] = z;
+
+                ++counter;
+            }
+        } } }
+    } } }
+
+    Verify(counter == nFluid);
+
+    uint32_t * adjList;
+
+    // AdjList only requires 18 instead of 19 entries per node, as
+    // the center PDF needs no addressing.
+    if (MemAlloc((void **)&adjList, nFluid * sizeof(uint32_t) * N_D3Q19_IDX)) {
+        printf("ERROR: allocating adjList array failed: %zu bytes.\n", nFluid * sizeof(uint32_t) * N_D3Q19_IDX);
+        exit(1);
+    }
+
+    kdl->AdjList = adjList;
+
+    int x, y, z;
+
+    uint32_t neighborIndex;
+    uint32_t dstIndex;
+
+    int nx, ny, nz, px, py, pz;
+
+    // Loop over all fluid nodes and compute the indices to the neighboring
+    // PDFs for configure data layout (AoS/SoA).
+    // TODO: Parallelized loop to ensure correct NUMA placement.
+    // #ifdef _OPENMP  --> add line continuation
+    // #pragma omp parallel for default(none)
+    //      shared(nFluid, nCells, coords, D3Q19_INV, D3Q19_X, D3Q19_Y, D3Q19_Z,
+    //             stderr,
+    //             lDims, grid, ld, lX, lY, lZ, adjList)
+    //      private(x, y, z, nx, ny, nz, px, py, pz, neighborIndex, dstIndex)
+    // #endif
+    for (int index = 0; index < nFluid; ++index) {
+        x = coords[C_INDEX_X(index)];
+        y = coords[C_INDEX_Y(index)];
+        z = coords[C_INDEX_Z(index)];
+
+        Assert(x >= 0 && x < lX);
+        Assert(y >= 0 && y < lY);
+        Assert(z >= 0 && z < lZ);
+
+        Assert(ld->Lattice[L_INDEX_4(lDims, x, y, z)] != LAT_CELL_OBSTACLE);
+
+        // Loop over all directions except the center one.
+        for(int d = 0; d < N_D3Q19 - 1; ++d) {
+            Assert(d != D3Q19_C);
+
+#ifdef PROP_MODEL_PUSH
+            nx = x + D3Q19_X[d];
+            ny = y + D3Q19_Y[d];
+            nz = z + D3Q19_Z[d];
+
+#elif PROP_MODEL_PULL
+            nx = x - D3Q19_X[d];
+            ny = y - D3Q19_Y[d];
+            nz = z - D3Q19_Z[d];
+#else
+    #error No implementation for this PROP_MODEL_NAME.
+#endif
+            // If the neighbor is outside the lattice in a direction with a
+            // periodic boundary we wrap around. The wrap is applied per
+            // dimension: a coordinate leaving the lattice through a
+            // non-periodic boundary stays out of range and is handled as
+            // bounce back below.
+            px = nx;
+            py = ny;
+            pz = nz;
+
+            if (ld->PeriodicX) {
+                if (nx < 0) {
+                    px = lX - 1;
+                }
+                else if (nx >= lX) {
+                    px = 0;
+                }
+            }
+
+            if (ld->PeriodicY) {
+                if (ny < 0) {
+                    py = lY - 1;
+                }
+                else if (ny >= lY) {
+                    py = 0;
+                }
+            }
+
+            if (ld->PeriodicZ) {
+                if (nz < 0) {
+                    pz = lZ - 1;
+                }
+                else if (nz >= lZ) {
+                    pz = 0;
+                }
+            }
+
+            if (px < 0 || py < 0 || pz < 0 || px >= lX || py >= lY || pz >= lZ) {
+                // Neighbor lies outside of the lattice: bounce back into
+                // the inverse direction of the node itself.
+                dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
+            }
+            else if (ld->Lattice[L_INDEX_4(lDims, px, py, pz)] == LAT_CELL_OBSTACLE) {
+                // Neighbor is an obstacle: bounce back as well.
+                dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
+            }
+            else {
+                neighborIndex = grid[L_INDEX_4(lDims, px, py, pz)];
+
+                AssertMsg(neighborIndex != ~0, "Neighbor has no Index. (%d %d %d) direction %s (%d)\n", px, py, pz, D3Q19_NAMES[d], d);
+
+                dstIndex = P_INDEX_3(nCells, neighborIndex, d);
+            }
+
+            Assert(dstIndex < nCells * N_D3Q19);
+
+            adjList[index * N_D3Q19_IDX + d] = dstIndex;
+        }
+    }
+
+
+    // Fill remaining KernelData structures
+    kd->GetNode = GetNode;
+    kd->SetNode = SetNode;
+
+    kd->BoundaryConditionsGetPdf = FNAME(BCGetPdf);
+    kd->BoundaryConditionsSetPdf = FNAME(BCSetPdf);
+
+    kd->Kernel = FNAME(D3Q19ListAaKernel);
+
+    kd->DstPdfs = NULL;
+    kd->SrcPdfs = NULL;
+    kd->PdfsActive = kd->Pdfs[0];
+
+    return;
+}
+
+// Releases everything allocated by the Init function above: the PDF array,
+// the adjacency list, the coordinate list, the grid and finally the kernel
+// data object itself, which is handed to MemFree via kernelData.
+void FNAME(D3Q19ListAaDeinit)(LatticeDesc * ld, KernelData ** kernelData)
+{
+    KernelDataList ** kdl = (KernelDataList **)kernelData;
+
+    MemFree((void **)&((*kernelData)->Pdfs[0]));
+
+    MemFree((void **)&((*kdl)->AdjList));
+    MemFree((void **)&((*kdl)->Coords));
+    MemFree((void **)&((*kdl)->Grid));
+
+    MemFree((void **)kernelData);
+
+    return;
+}
+
diff --git a/src/BenchKernelD3Q19ListAaCommon.h b/src/BenchKernelD3Q19ListAaCommon.h
new file mode 100644
index 0000000..11ec5ff
--- /dev/null
+++ b/src/BenchKernelD3Q19ListAaCommon.h
@@ -0,0 +1,137 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+// Markus Wittmann, 2016-2017
+// RRZE, University of Erlangen-Nuremberg, Germany
+// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+// Viktor Haag, 2016
+// LSS, University of Erlangen-Nuremberg, Germany
+//
+// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+// LbmBenchKernels is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// LbmBenchKernels is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#ifndef __BENCH_KERNEL_D3Q19_LIST_AA_COMMON_H__
+#define __BENCH_KERNEL_D3Q19_LIST_AA_COMMON_H__
+
+
+#include "Kernel.h"
+
+#include <inttypes.h>
+// Number of adjacency list entries per node (used as per-node stride of AdjList).
+#define N_D3Q19_IDX 18
+// Kernel data of the list based kernels; kd must stay the first member as KernelData * is cast to KernelDataList *.
+typedef struct KernelDataList_
+{
+    KernelData kd;
+    uint32_t * AdjList;  // Stores PDF indices, which are the destination for propagation.
+                         // Determine the destination for node index n and direction d via:
+                         // (n * N_D3Q19_IDX) + d
+    uint32_t * Grid;     // Stores the node indices; use L_INDEX_4 macro for access.
+    uint32_t * Coords;   // Map node indices to coordinates; use C_INDEX_* macro for access.
+    int nFluid;          // Number of fluid nodes; the length of AdjList is nFluid * N_D3Q19_IDX.
+    int nCells;          // Total number of nodes allocated, including nodes for padding!
+    int Iteration;       // Current iteration, starts at 0.
+} KernelDataList;
+
+
+// Macro for casting KernelData * to KernelDataList *.
+#define KDL(_x_) ((KernelDataList *)(_x_))
+
+
+
+
+// Build a function name extended by the propagation model name and the data layout.
+// FNAME(test) will be expanded to test_PushSoA if DATA_LAYOUT_NAME is defined
+// as SoA and PROP_MODEL_NAME is defined as Push.
+#define FNAME(functionName) JOIN(JOIN(functionName,_),JOIN(PROP_MODEL_NAME,DATA_LAYOUT_NAME))
+
+#ifndef DATA_LAYOUT_NAME
+    #error DATA_LAYOUT_NAME must be defined
+#endif
+
+#ifndef PROP_MODEL_NAME
+    #error PROP_MODEL_NAME must be defined
+#endif
+
+// -----------------------------------------------------------------------
+// Index function for accessing PDF array for different data layouts.
+// The index is computed in int, so nCells * N_D3Q19 must not exceed INT_MAX.
+#define P_INDEX_3 FNAME(PINDEX3)
+// Returns the PDF array index of direction d of cell cellIndex: SoA d * nCells + cellIndex, AoS cellIndex * N_D3Q19 + d.
+static inline int FNAME(PINDEX3)(int nCells, int cellIndex, int d)
+{
+    Assert(nCells > 0);
+    Assert(cellIndex >= 0);
+    Assert(cellIndex < nCells);
+
+    Assert(d >= 0);
+    #ifdef D3Q19
+    Assert(d < N_D3Q19);
+    #else
+        #error Not implemented for this discretization.
+    #endif
+
+#ifdef DATA_LAYOUT_SOA
+    return d * nCells + cellIndex;
+#elif DATA_LAYOUT_AOS
+    return cellIndex * N_D3Q19 + d;
+#else
+    #error P_INDEX_3 function not implemented for chosen data layout.
+#endif
+}
+
+#define P_INDEX_5 FNAME(PINDEX5)
+// Returns the PDF array index of direction d at lattice position (x, y, z): looks up the node index in Grid, then uses P_INDEX_3.
+static inline int FNAME(PINDEX5)(KernelDataList * kdl, int x, int y, int z, int d)
+{
+    Assert(kdl != NULL);
+#ifdef DEBUG
+    uint32_t * grid = kdl->Grid;
+    int * dims = kdl->kd.Dims;
+
+    Assert(grid != NULL);
+    Assert(dims != NULL);
+    Assert(dims[0] > 0);
+    Assert(dims[1] > 0);
+    Assert(dims[2] > 0);
+    Assert(x >= 0 && x < dims[0]);
+    Assert(y >= 0 && y < dims[1]);
+    Assert(z >= 0 && z < dims[2]);
+    Assert(d >= 0 && d < N_D3Q19);
+#endif
+
+    return P_INDEX_3(kdl->nCells, kdl->Grid[L_INDEX_4(kdl->kd.Dims, x, y, z)], d);
+}
+
+// -----------------------------------------------------------------------
+// Macros for accessing coord array
+
+#define C_INDEX_X(cellIndex) C_INDEX(cellIndex, 0)
+#define C_INDEX_Y(cellIndex) C_INDEX(cellIndex, 1)
+#define C_INDEX_Z(cellIndex) C_INDEX(cellIndex, 2)
+// Returns the position of component xyz (0 = x, 1 = y, 2 = z) of node cellIndex inside the Coords array.
+static inline int C_INDEX(int cellIndex, int xyz)
+{
+    Assert(cellIndex >= 0);
+    Assert(xyz >= 0);
+    Assert(xyz < 3);
+
+    return cellIndex * 3 + xyz;
+}
+
+
+#endif // __BENCH_KERNEL_D3Q19_LIST_AA_COMMON_H__
diff --git a/src/BenchKernelD3Q19ListAaPv.c b/src/BenchKernelD3Q19ListAaPv.c
new file mode 100644
index 0000000..c522252
--- /dev/null
+++ b/src/BenchKernelD3Q19ListAaPv.c
@@ -0,0 +1,847 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+// Markus Wittmann, 2016-2017
+// RRZE, University of Erlangen-Nuremberg, Germany
+// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+// Viktor Haag, 2016
+// LSS, University of Erlangen-Nuremberg, Germany
+//
+// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+// LbmBenchKernels is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// LbmBenchKernels is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>.
+// +// -------------------------------------------------------------------------- +#include "BenchKernelD3Q19ListAaPvCommon.h" + +#include "Memory.h" +#include "Vtk.h" +#include "Vector.h" +#include "LikwidIf.h" + +#include +#include + +#ifdef _OPENMP + #include +#endif + + +static void KernelEven(LatticeDesc * ld, KernelData * kernelData, CaseData * cd); +static void KernelOdd( LatticeDesc * ld, KernelData * kernelData, CaseData * cd); + +void FNAME(D3Q19ListAaPvKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd) +{ + + Assert(ld != NULL); + Assert(kernelData != NULL); + Assert(cd != NULL); + + Assert(cd->Omega > 0.0); + Assert(cd->Omega < 2.0); + +#if defined(VTK_OUTPUT) || defined(STATISTICS) || defined(VERIFICATION) + KernelData * kd = (KernelData *)kernelData; + KernelDataList * kdl = KDL(kernelData); +#endif + + int maxIterations = cd->MaxIterations; + + int nThreads = 1; +#ifdef _OPENMP + nThreads = omp_get_max_threads(); +#endif + + #ifdef VTK_OUTPUT + if (cd->VtkOutput) { + kd->PdfsActive = kd->Pdfs[0]; + VtkWrite(ld, kd, cd, -1); + } + #endif + + #ifdef STATISTICS + kd->PdfsActive = kd->Pdfs[0]; + KernelStatistics(kd, ld, cd, 0); + #endif + + // TODO: outer openmp parallel + + for(int iter = 0; iter < maxIterations; iter += 2) { + + // --------------------------------------------------- + // even time step + // --------------------------------------------------- + + X_LIKWID_START("list-aa-pv-even"); + + #ifdef _OPENMP + #pragma omp parallel default(none) shared(ld, kernelData, cd) + #endif + { + KernelEven(ld, kernelData, cd); + } + + X_LIKWID_STOP("list-aa-pv-even"); + + #ifdef VERIFICATION + kdl->Iteration = iter; + kd->PdfsActive = kd->Pdfs[0]; + KernelAddBodyForce(kd, ld, cd); + #endif + + // --------------------------------------------------- + // odd time step + // --------------------------------------------------- + + X_LIKWID_START("list-aa-pv-odd"); + + #ifdef _OPENMP + #pragma omp parallel default(none) shared(ld, kernelData, 
cd) + #endif + { + KernelOdd(ld, kernelData, cd); + } + + X_LIKWID_STOP("list-aa-pv-odd"); + + + #ifdef VERIFICATION + kdl->Iteration = iter + 1; + kd->PdfsActive = kd->Pdfs[0]; + KernelAddBodyForce(kd, ld, cd); + #endif + + #ifdef VTK_OUTPUT + if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) { + kdl->Iteration = iter + 1; + kd->PdfsActive = kd->Pdfs[0]; + VtkWrite(ld, kd, cd, iter); + } + #endif + + #ifdef STATISTICS + kdl->Iteration = iter + 1; + kd->PdfsActive = kd->Pdfs[0]; + KernelStatistics(kd, ld, cd, iter); + #endif + + } // for (int iter = 0; ... + +#ifdef VTK_OUTPUT + if (cd->VtkOutput) { + kd->PdfsActive = kd->Pdfs[0]; + VtkWrite(ld, kd, cd, maxIterations); + } +#endif + +#ifdef STATISTICS + kd->PdfsActive = kd->Pdfs[0]; + KernelStatistics(kd, ld, cd, maxIterations); +#endif + + return; +} + +static void KernelEven(LatticeDesc * ld, KernelData * kernelData, CaseData * cd) +{ + Assert(ld != NULL); + Assert(kernelData != NULL); + Assert(cd != NULL); + + Assert(cd->Omega > 0.0); + Assert(cd->Omega < 2.0); + + KernelData * kd = (KernelData *)kernelData; + KernelDataList * kdl = KDL(kernelData); + KernelDataListRia * kdlr = KDLR(kernelData); + + PdfT omega = cd->Omega; + PdfT omegaEven = omega; + + PdfT magicParam = 1.0 / 12.0; + PdfT omegaOdd = 1.0 / (0.5 + magicParam / (1.0 / omega - 0.5)); + + PdfT evenPart = 0.0; + PdfT oddPart = 0.0; + PdfT dir_indep_trm = 0.0; + + const PdfT w_0 = 1.0 / 3.0; + const PdfT w_1 = 1.0 / 18.0; + const PdfT w_2 = 1.0 / 36.0; + + const PdfT w_1_x3 = w_1 * 3.0; const PdfT w_1_nine_half = w_1 * 9.0 / 2.0; PdfT w_1_indep = 0.0; + const PdfT w_2_x3 = w_2 * 3.0; const PdfT w_2_nine_half = w_2 * 9.0 / 2.0; PdfT w_2_indep = 0.0; + + PdfT ui; + + PdfT ux, uy, uz; + PdfT dens; + + + VPDFT VONE_HALF = VSET(0.5); + VPDFT VTHREE_HALF = VSET(3.0 / 2.0); + + VPDFT vw_1_indep, vw_2_indep; + VPDFT vw_0 = VSET(w_0); + VPDFT vw_1 = VSET(w_1); + VPDFT vw_2 = VSET(w_2); + + VPDFT vw_1_x3 = VSET(w_1_x3); + VPDFT vw_2_x3 = VSET(w_2_x3); + VPDFT 
vw_1_nine_half = VSET(w_1_nine_half); + VPDFT vw_2_nine_half = VSET(w_2_nine_half); + + VPDFT vui, vux, vuy, vuz, vdens; + + VPDFT vevenPart, voddPart, vdir_indep_trm; + + VPDFT vomegaEven = VSET(omegaEven); + VPDFT vomegaOdd = VSET(omegaOdd); + + // Declare pdf_N, pdf_E, pdf_S, pdf_W, ... + #define X(name, idx, idxinv, x, y, z) PdfT JOIN(pdf_,name); VPDFT JOIN(vpdf_,name); + D3Q19_LIST + #undef X + + PdfT * src = kd->Pdfs[0]; + + int nCells = kdl->nCells; + + int threadId = 0; +#ifdef _OPENMP + threadId = omp_get_thread_num(); +#endif + + int * threadIndices = kdlr->FluidNodeThreadIndices; + + int nFluidThread = threadIndices[threadId + 1] - threadIndices[threadId]; + int nFluidVec = nFluidThread - (nFluidThread % VSIZE); + + int indexStartVec = threadIndices[threadId]; + int indexStopVec = threadIndices[threadId] + nFluidVec; + int indexStop = threadIndices[threadId] + nFluidThread; + + #define I(index, dir) P_INDEX_3((nCells), (index), (dir)) + + for (int index = indexStartVec; index < indexStopVec; index += VSIZE) { + + + #define X(name, idx, idxinv, _x, _y, _z) JOIN(vpdf_,name) = VLDU(&src[I(index, idx)]); + D3Q19_LIST + #undef X + + + //vux = vpdf_E + vpdf_NE + vpdf_SE + vpdf_TE + vpdf_BE - + // vpdf_W - vpdf_NW - vpdf_SW - vpdf_TW - vpdf_BW; + vux = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_E,VADD(vpdf_NE,vpdf_SE)),VADD(vpdf_TE,vpdf_BE)),vpdf_W),vpdf_NW),vpdf_SW),vpdf_TW),vpdf_BW); + //vuy = vpdf_N + vpdf_NE + vpdf_NW + vpdf_TN + vpdf_BN - + // vpdf_S - vpdf_SE - vpdf_SW - vpdf_TS - vpdf_BS; + vuy = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_N,VADD(vpdf_NE,vpdf_NW)),VADD(vpdf_TN,vpdf_BN)),vpdf_S),vpdf_SE),vpdf_SW),vpdf_TS),vpdf_BS); + //vuz = vpdf_T + vpdf_TE + vpdf_TW + vpdf_TN + vpdf_TS - + // vpdf_B - vpdf_BE - vpdf_BW - vpdf_BN - vpdf_BS; + vuz = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_T,VADD(vpdf_TE,vpdf_TW)),VADD(vpdf_TN,vpdf_TS)),vpdf_B),vpdf_BE),vpdf_BW),vpdf_BN),vpdf_BS); + + //vdens = vpdf_C + + // vpdf_N + vpdf_E + vpdf_S + vpdf_W + + // vpdf_NE + 
vpdf_SE + vpdf_SW + vpdf_NW + + // vpdf_T + vpdf_TN + vpdf_TE + vpdf_TS + vpdf_TW + + // vpdf_B + vpdf_BN + vpdf_BE + vpdf_BS + vpdf_BW; + vdens = VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(vpdf_C,VADD(vpdf_N,vpdf_E)),VADD(vpdf_S,vpdf_W)),VADD(vpdf_NE,vpdf_SE)),VADD(vpdf_SW,vpdf_NW)),VADD(vpdf_T,vpdf_TN)),VADD(vpdf_TE,vpdf_TS)),VADD(vpdf_TW,vpdf_B)),VADD(vpdf_BN,vpdf_BE)),VADD(vpdf_BS,vpdf_BW)); + + //vdir_indep_trm = vdens - (vux * vux + vuy * vuy + vuz * vuz) * VTHREE_HALF; + vdir_indep_trm = VSUB(vdens,VMUL(VADD(VADD(VMUL(vux,vux),VMUL(vuy,vuy)),VMUL(vuz,vuz)),VTHREE_HALF)); + + //src[I(index, D3Q19_C) ] =[UA] vpdf_C - vomegaEven * (vpdf_C - vw_0 * vdir_indep_trm); + VSTU(&src[I(index, D3Q19_C)],VSUB(vpdf_C,VMUL(vomegaEven,VSUB(vpdf_C,VMUL(vw_0,vdir_indep_trm))))); + + //vw_1_indep = vw_1 * vdir_indep_trm; + vw_1_indep = VMUL(vw_1,vdir_indep_trm); + + //vui = vuy; + vui = vuy; + //vevenPart = vomegaEven * (VONE_HALF * (vpdf_N + vpdf_S) - vui * vui * vw_1_nine_half - vw_1_indep); + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_N,vpdf_S)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep)); + //voddPart = vomegaOdd * (VONE_HALF * (vpdf_N - vpdf_S) - vui * vw_1_x3); + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_N,vpdf_S)),VMUL(vui,vw_1_x3))); + //src[I(index, D3Q19_S)] =[UA] vpdf_N - vevenPart - voddPart; + VSTU(&src[I(index, D3Q19_S)],VSUB(VSUB(vpdf_N,vevenPart),voddPart)); + //src[I(index, D3Q19_N)] =[UA] vpdf_S - vevenPart + voddPart; + VSTU(&src[I(index, D3Q19_N)],VADD(VSUB(vpdf_S,vevenPart),voddPart)); + + //vui = vux; + vui = vux; + //vevenPart = vomegaEven * (VONE_HALF * (vpdf_E + vpdf_W) - vui * vui * vw_1_nine_half - vw_1_indep); + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_E,vpdf_W)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep)); + //voddPart = vomegaOdd * (VONE_HALF * (vpdf_E - vpdf_W) - vui * vw_1_x3 ); + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_E,vpdf_W)),VMUL(vui,vw_1_x3))); + 
//src[I(index, D3Q19_W)] =[UA] vpdf_E - vevenPart - voddPart; + VSTU(&src[I(index, D3Q19_W)],VSUB(VSUB(vpdf_E,vevenPart),voddPart)); + //src[I(index, D3Q19_E)] =[UA] vpdf_W - vevenPart + voddPart; + VSTU(&src[I(index, D3Q19_E)],VADD(VSUB(vpdf_W,vevenPart),voddPart)); + + //vui = vuz; + vui = vuz; + //vevenPart = vomegaEven * (VONE_HALF * (vpdf_T + vpdf_B) - vui * vui * vw_1_nine_half - vw_1_indep); + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_T,vpdf_B)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep)); + //voddPart = vomegaOdd * (VONE_HALF * (vpdf_T - vpdf_B) - vui * vw_1_x3); + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_T,vpdf_B)),VMUL(vui,vw_1_x3))); + //src[I(index, D3Q19_B)] =[UA] vpdf_T - vevenPart - voddPart; + VSTU(&src[I(index, D3Q19_B)],VSUB(VSUB(vpdf_T,vevenPart),voddPart)); + //src[I(index, D3Q19_T)] =[UA] vpdf_B - vevenPart + voddPart; + VSTU(&src[I(index, D3Q19_T)],VADD(VSUB(vpdf_B,vevenPart),voddPart)); + + //vw_2_indep = vw_2 * vdir_indep_trm; + vw_2_indep = VMUL(vw_2,vdir_indep_trm); + + //vui = vuy - vux; + vui = VSUB(vuy,vux); + //vevenPart = vomegaEven * (VONE_HALF * (vpdf_NW + vpdf_SE) - vui * vui * vw_2_nine_half - vw_2_indep); + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_NW,vpdf_SE)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); + //voddPart = vomegaOdd * (VONE_HALF * (vpdf_NW - vpdf_SE) - vui * vw_2_x3); + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_NW,vpdf_SE)),VMUL(vui,vw_2_x3))); + //src[I(index, D3Q19_SE)] =[UA] vpdf_NW - vevenPart - voddPart; + VSTU(&src[I(index, D3Q19_SE)],VSUB(VSUB(vpdf_NW,vevenPart),voddPart)); + //src[I(index, D3Q19_NW)] =[UA] vpdf_SE - vevenPart + voddPart; + VSTU(&src[I(index, D3Q19_NW)],VADD(VSUB(vpdf_SE,vevenPart),voddPart)); + + //vui = vux + vuy; + vui = VADD(vux,vuy); + //vevenPart = vomegaEven * (VONE_HALF * (vpdf_NE + vpdf_SW) - vui * vui * vw_2_nine_half - vw_2_indep); + vevenPart = 
VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_NE,vpdf_SW)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); + //voddPart = vomegaOdd * (VONE_HALF * (vpdf_NE - vpdf_SW) - vui * vw_2_x3); + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_NE,vpdf_SW)),VMUL(vui,vw_2_x3))); + //src[I(index, D3Q19_SW)] =[UA] vpdf_NE - vevenPart - voddPart; + VSTU(&src[I(index, D3Q19_SW)],VSUB(VSUB(vpdf_NE,vevenPart),voddPart)); + //src[I(index, D3Q19_NE)] =[UA] vpdf_SW - vevenPart + voddPart; + VSTU(&src[I(index, D3Q19_NE)],VADD(VSUB(vpdf_SW,vevenPart),voddPart)); + + //vui = vuz - vux; + vui = VSUB(vuz,vux); + //vevenPart = vomegaEven * (VONE_HALF * (vpdf_TW + vpdf_BE) - vui * vui * vw_2_nine_half - vw_2_indep); + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TW,vpdf_BE)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); + //voddPart = vomegaOdd * (VONE_HALF * (vpdf_TW - vpdf_BE) - vui * vw_2_x3); + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TW,vpdf_BE)),VMUL(vui,vw_2_x3))); + //src[I(index, D3Q19_BE)] =[UA] vpdf_TW - vevenPart - voddPart; + VSTU(&src[I(index, D3Q19_BE)],VSUB(VSUB(vpdf_TW,vevenPart),voddPart)); + //src[I(index, D3Q19_TW)] =[UA] vpdf_BE - vevenPart + voddPart; + VSTU(&src[I(index, D3Q19_TW)],VADD(VSUB(vpdf_BE,vevenPart),voddPart)); + + //vui = vux + vuz; + vui = VADD(vux,vuz); + //vevenPart = vomegaEven * (VONE_HALF * (vpdf_TE + vpdf_BW) - vui * vui * vw_2_nine_half - vw_2_indep); + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TE,vpdf_BW)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); + //voddPart = vomegaOdd * (VONE_HALF * (vpdf_TE - vpdf_BW) - vui * vw_2_x3); + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TE,vpdf_BW)),VMUL(vui,vw_2_x3))); + //src[I(index, D3Q19_BW)] =[UA] vpdf_TE - vevenPart - voddPart; + VSTU(&src[I(index, D3Q19_BW)],VSUB(VSUB(vpdf_TE,vevenPart),voddPart)); + //src[I(index, D3Q19_TE)] =[UA] vpdf_BW - vevenPart + voddPart; + VSTU(&src[I(index, 
D3Q19_TE)],VADD(VSUB(vpdf_BW,vevenPart),voddPart)); + + //vui = vuz - vuy; + vui = VSUB(vuz,vuy); + //vevenPart = vomegaEven * (VONE_HALF * (vpdf_TS + vpdf_BN) - vui * vui * vw_2_nine_half - vw_2_indep); + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TS,vpdf_BN)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); + //voddPart = vomegaOdd * (VONE_HALF * (vpdf_TS - vpdf_BN) - vui * vw_2_x3); + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TS,vpdf_BN)),VMUL(vui,vw_2_x3))); + //src[I(index, D3Q19_BN)] =[UA] vpdf_TS - vevenPart - voddPart; + VSTU(&src[I(index, D3Q19_BN)],VSUB(VSUB(vpdf_TS,vevenPart),voddPart)); + //src[I(index, D3Q19_TS)] =[UA] vpdf_BN - vevenPart + voddPart; + VSTU(&src[I(index, D3Q19_TS)],VADD(VSUB(vpdf_BN,vevenPart),voddPart)); + + //vui = vuy + vuz; + vui = VADD(vuy,vuz); + //vevenPart = vomegaEven * (VONE_HALF * (vpdf_TN + vpdf_BS) - vui * vui * vw_2_nine_half - vw_2_indep); + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TN,vpdf_BS)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); + //voddPart = vomegaOdd * (VONE_HALF * (vpdf_TN - vpdf_BS) - vui * vw_2_x3); + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TN,vpdf_BS)),VMUL(vui,vw_2_x3))); + //src[I(index, D3Q19_BS)] =[UA] vpdf_TN - vevenPart - voddPart; + VSTU(&src[I(index, D3Q19_BS)],VSUB(VSUB(vpdf_TN,vevenPart),voddPart)); + //src[I(index, D3Q19_TN)] =[UA] vpdf_BS - vevenPart + voddPart; + VSTU(&src[I(index, D3Q19_TN)],VADD(VSUB(vpdf_BS,vevenPart),voddPart)); + + } // loop over fluid nodes + + for (int index = indexStopVec; index < indexStop; ++index) { + + #define X(name, idx, idxinv, _x, _y, _z) JOIN(pdf_,name) = src[I(index, idx)]; + D3Q19_LIST + #undef X + + + ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE - + pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW; + uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN - + pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS; + uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS - + pdf_B - pdf_BE - pdf_BW - pdf_BN - 
pdf_BS; + + dens = pdf_C + + pdf_N + pdf_E + pdf_S + pdf_W + + pdf_NE + pdf_SE + pdf_SW + pdf_NW + + pdf_T + pdf_TN + pdf_TE + pdf_TS + pdf_TW + + pdf_B + pdf_BN + pdf_BE + pdf_BS + pdf_BW; + + dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz)*3.0/2.0; + + // direction: w_0 + src[I(index, D3Q19_C) ] = pdf_C - omegaEven*(pdf_C - w_0*dir_indep_trm); + + // direction: w_1 + w_1_indep = w_1*dir_indep_trm; + + ui = uy; + evenPart = omegaEven*( 0.5*(pdf_N + pdf_S) - ui*ui*w_1_nine_half - w_1_indep ); + oddPart = omegaOdd*(0.5*(pdf_N - pdf_S) - ui*w_1_x3 ); + src[I(index, D3Q19_S)] = pdf_N - evenPart - oddPart; + src[I(index, D3Q19_N)] = pdf_S - evenPart + oddPart; + + ui = ux; + evenPart = omegaEven*( 0.5*(pdf_E + pdf_W) - ui*ui*w_1_nine_half - w_1_indep ); + oddPart = omegaOdd*(0.5*(pdf_E - pdf_W) - ui*w_1_x3 ); + src[I(index, D3Q19_W)] = pdf_E - evenPart - oddPart; + src[I(index, D3Q19_E)] = pdf_W - evenPart + oddPart; + + ui = uz; + evenPart = omegaEven*( 0.5*(pdf_T + pdf_B) - ui*ui*w_1_nine_half - w_1_indep ); + oddPart = omegaOdd*(0.5*(pdf_T - pdf_B) - ui*w_1_x3 ); + src[I(index, D3Q19_B)] = pdf_T - evenPart - oddPart; + src[I(index, D3Q19_T)] = pdf_B - evenPart + oddPart; + + // direction: w_2 + w_2_indep = w_2*dir_indep_trm; + + ui = -ux + uy; + evenPart = omegaEven*( 0.5*(pdf_NW + pdf_SE) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_NW - pdf_SE) - ui*w_2_x3 ); + src[I(index, D3Q19_SE)] = pdf_NW - evenPart - oddPart; + src[I(index, D3Q19_NW)] = pdf_SE - evenPart + oddPart; + + ui = ux + uy; + evenPart = omegaEven*( 0.5*(pdf_NE + pdf_SW) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_NE - pdf_SW) - ui*w_2_x3 ); + src[I(index, D3Q19_SW)] = pdf_NE - evenPart - oddPart; + src[I(index, D3Q19_NE)] = pdf_SW - evenPart + oddPart; + + ui = -ux + uz; + evenPart = omegaEven*( 0.5*(pdf_TW + pdf_BE) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TW - pdf_BE) - ui*w_2_x3 ); + src[I(index, D3Q19_BE)] = pdf_TW - 
evenPart - oddPart; + src[I(index, D3Q19_TW)] = pdf_BE - evenPart + oddPart; + + ui = ux + uz; + evenPart = omegaEven*( 0.5*(pdf_TE + pdf_BW) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TE - pdf_BW) - ui*w_2_x3 ); + src[I(index, D3Q19_BW)] = pdf_TE - evenPart - oddPart; + src[I(index, D3Q19_TE)] = pdf_BW - evenPart + oddPart; + + ui = -uy + uz; + evenPart = omegaEven*( 0.5*(pdf_TS + pdf_BN) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TS - pdf_BN) - ui*w_2_x3 ); + src[I(index, D3Q19_BN)] = pdf_TS - evenPart - oddPart; + src[I(index, D3Q19_TS)] = pdf_BN - evenPart + oddPart; + + ui = uy + uz; + evenPart = omegaEven*( 0.5*(pdf_TN + pdf_BS) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TN - pdf_BS) - ui*w_2_x3 ); + src[I(index, D3Q19_BS)] = pdf_TN - evenPart - oddPart; + src[I(index, D3Q19_TN)] = pdf_BS - evenPart + oddPart; + + } // loop over fluid nodes + + #undef I + + return; +} + +static void KernelOdd(LatticeDesc * ld, KernelData * kernelData, CaseData * cd) +{ + + Assert(ld != NULL); + Assert(kernelData != NULL); + Assert(cd != NULL); + + Assert(cd->Omega > 0.0); + Assert(cd->Omega < 2.0); + + KernelData * kd = (KernelData *)kernelData; + KernelDataList * kdl = KDL(kernelData); + KernelDataListRia * kdlr = KDLR(kernelData); + + PdfT omega = cd->Omega; + PdfT omegaEven = omega; + + PdfT magicParam = 1.0 / 12.0; + PdfT omegaOdd = 1.0 / (0.5 + magicParam / (1.0 / omega - 0.5)); + + PdfT evenPart = 0.0; + PdfT oddPart = 0.0; + PdfT dir_indep_trm = 0.0; + + const PdfT w_0 = 1.0 / 3.0; + const PdfT w_1 = 1.0 / 18.0; + const PdfT w_2 = 1.0 / 36.0; + + const PdfT w_1_x3 = w_1 * 3.0; const PdfT w_1_nine_half = w_1 * 9.0 / 2.0; PdfT w_1_indep = 0.0; + const PdfT w_2_x3 = w_2 * 3.0; const PdfT w_2_nine_half = w_2 * 9.0 / 2.0; PdfT w_2_indep = 0.0; + + PdfT ui; + + PdfT ux, uy, uz; + PdfT dens; + + + VPDFT VONE_HALF = VSET(0.5); + VPDFT VTHREE_HALF = VSET(3.0 / 2.0); + + VPDFT vw_1_indep, vw_2_indep; + 
VPDFT vw_0 = VSET(w_0); + VPDFT vw_1 = VSET(w_1); + VPDFT vw_2 = VSET(w_2); + + VPDFT vw_1_x3 = VSET(w_1_x3); + VPDFT vw_2_x3 = VSET(w_2_x3); + VPDFT vw_1_nine_half = VSET(w_1_nine_half); + VPDFT vw_2_nine_half = VSET(w_2_nine_half); + + VPDFT vui, vux, vuy, vuz, vdens; + + VPDFT vevenPart, voddPart, vdir_indep_trm; + + VPDFT vomegaEven = VSET(omegaEven); + VPDFT vomegaOdd = VSET(omegaOdd); + + + // Declare pdf_N, pdf_E, pdf_S, pdf_W, ... + #define X(name, idx, idxinv, x, y, z) PdfT JOIN(pdf_,name); VPDFT JOIN(vpdf_,name); + D3Q19_LIST + #undef X + + // Declare pointers to pdfs ppdf_N, ppdf_E, ppdf_S, ppdf_W, ... + #define X(name, idx, idxinv, x, y, z) PdfT * JOIN(ppdf_,name) = NULL; + D3Q19_LIST + #undef X + + uint32_t nConsecNodes = kdlr->nConsecNodes; + uint32_t * consecNodes = kdlr->ConsecNodes; + uint32_t consecIndex = 0; + uint32_t consecValue = 0; + +#ifndef DEBUG + UNUSED(nConsecNodes); +#endif + + PdfT * src = kd->Pdfs[0]; + + int nCells = kdl->nCells; + + uint32_t adjListIndex; + uint32_t * adjList = kdl->AdjList; + + int threadId = 0; + + #ifdef _OPENMP + threadId = omp_get_thread_num(); + #endif + + consecIndex = kdlr->ConsecThreadIndices[threadId]; + consecValue = 0; + + int * threadIndices = kdlr->FluidNodeThreadIndices; + + int nFluidThread = threadIndices[threadId + 1] - threadIndices[threadId]; + + int indexStart = threadIndices[threadId]; + int indexStop = threadIndices[threadId] + nFluidThread; + + #define I(index, dir) P_INDEX_3((nCells), (index), (dir)) + + #define ADJ_LIST(dir) adjList[adjListIndex + (dir)] + + int pointerOffset = 1; + + for (int index = indexStart; index < indexStop; index += 1) { + + if (consecValue > 0) { + --consecValue; + // Increment all pdf pointers by an offset. If the previous iteration was + // scalar, increment only by one. If the previous iteration was vectorized, + // increment by the vector width. These offsets are set in the corresponding + // if branches. 
+ #define X(name, idx, idxinv, _x, _y, _z) JOIN(ppdf_,name) += pointerOffset; + D3Q19_LIST + #undef X + } + else { + Assert(consecIndex < nConsecNodes); + + consecValue = consecNodes[consecIndex] - 1; + // Load new pointers to PDFs of local cell: + + adjListIndex = index * N_D3Q19_IDX; + + #define X(name, idx, idxinv, _x, _y, _z) JOIN(ppdf_,name) = &(src[adjList[adjListIndex + idxinv]]); + D3Q19_LIST_WO_C + #undef X + + ppdf_C = &(src[P_INDEX_3(nCells, index, D3Q19_C)]); + ++consecIndex; + } + + #define X(name, idx, idxinv, _x, _y, _z) JOIN(pdf_,name) = *JOIN(ppdf_,name); + D3Q19_LIST + #undef X + + if (consecValue >= (VSIZE - 1)) { + // Vectorized part. + + #define X(name, idx, idxinv, _x, _y, _z) JOIN(vpdf_,name) = VLDU(JOIN(ppdf_,name)); + D3Q19_LIST_WO_C + #undef X + + vpdf_C = VLDU(ppdf_C); + + //vux = vpdf_E + vpdf_NE + vpdf_SE + vpdf_TE + vpdf_BE - + // vpdf_W - vpdf_NW - vpdf_SW - vpdf_TW - vpdf_BW; + vux = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_E,VADD(vpdf_NE,vpdf_SE)),VADD(vpdf_TE,vpdf_BE)),vpdf_W),vpdf_NW),vpdf_SW),vpdf_TW),vpdf_BW); + //vuy = vpdf_N + vpdf_NE + vpdf_NW + vpdf_TN + vpdf_BN - + // vpdf_S - vpdf_SE - vpdf_SW - vpdf_TS - vpdf_BS; + vuy = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_N,VADD(vpdf_NE,vpdf_NW)),VADD(vpdf_TN,vpdf_BN)),vpdf_S),vpdf_SE),vpdf_SW),vpdf_TS),vpdf_BS); + //vuz = vpdf_T + vpdf_TE + vpdf_TW + vpdf_TN + vpdf_TS - + // vpdf_B - vpdf_BE - vpdf_BW - vpdf_BN - vpdf_BS; + vuz = VSUB(VSUB(VSUB(VSUB(VSUB(VADD(VADD(vpdf_T,VADD(vpdf_TE,vpdf_TW)),VADD(vpdf_TN,vpdf_TS)),vpdf_B),vpdf_BE),vpdf_BW),vpdf_BN),vpdf_BS); + + //vdens = vpdf_C + + // vpdf_N + vpdf_E + vpdf_S + vpdf_W + + // vpdf_NE + vpdf_SE + vpdf_SW + vpdf_NW + + // vpdf_T + vpdf_TN + vpdf_TE + vpdf_TS + vpdf_TW + + // vpdf_B + vpdf_BN + vpdf_BE + vpdf_BS + vpdf_BW; + vdens = VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(VADD(vpdf_C,VADD(vpdf_N,vpdf_E)),VADD(vpdf_S,vpdf_W)),VADD(vpdf_NE,vpdf_SE)), + 
VADD(vpdf_SW,vpdf_NW)),VADD(vpdf_T,vpdf_TN)),VADD(vpdf_TE,vpdf_TS)),VADD(vpdf_TW,vpdf_B)),VADD(vpdf_BN,vpdf_BE)),VADD(vpdf_BS,vpdf_BW)); + + //vdir_indep_trm = vdens - (vux * vux + vuy * vuy + vuz * vuz) * VTHREE_HALF; + vdir_indep_trm = VSUB(vdens,VMUL(VADD(VADD(VMUL(vux,vux),VMUL(vuy,vuy)),VMUL(vuz,vuz)),VTHREE_HALF)); + + adjListIndex = index * N_D3Q19_IDX; + + //src[I(index, D3Q19_C)] =[UA] vpdf_C - vomegaEven * (vpdf_C - vw_0 * vdir_indep_trm); + VSTU(&src[I(index, D3Q19_C)],VSUB(vpdf_C,VMUL(vomegaEven,VSUB(vpdf_C,VMUL(vw_0,vdir_indep_trm))))); + + //vw_1_indep = vw_1 * vdir_indep_trm; + vw_1_indep = VMUL(vw_1,vdir_indep_trm); + + //vui = vuy; + vui = vuy; + //vevenPart = vomegaEven * (VONE_HALF * (vpdf_N + vpdf_S) - vui * vui * vw_1_nine_half - vw_1_indep); + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_N,vpdf_S)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep)); + //voddPart = vomegaOdd * (VONE_HALF * (vpdf_N - vpdf_S) - vui * vw_1_x3); + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_N,vpdf_S)),VMUL(vui,vw_1_x3))); + //src[ADJ_LIST(D3Q19_N)] =[UA] vpdf_N - vevenPart - voddPart; + VSTU(ppdf_S, VSUB(VSUB(vpdf_N,vevenPart),voddPart)); + //src[ADJ_LIST(D3Q19_S)] =[UA] vpdf_S - vevenPart + voddPart; + VSTU(ppdf_N, VADD(VSUB(vpdf_S,vevenPart),voddPart)); + + //vui = vux; + vui = vux; + //vevenPart = vomegaEven * (VONE_HALF * (vpdf_E + vpdf_W) - vui * vui * vw_1_nine_half - vw_1_indep); + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_E,vpdf_W)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep)); + //voddPart = vomegaOdd * (VONE_HALF * (vpdf_E - vpdf_W) - vui * vw_1_x3); + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_E,vpdf_W)),VMUL(vui,vw_1_x3))); + //src[ADJ_LIST(D3Q19_E)] =[UA] vpdf_E - vevenPart - voddPart; + VSTU(ppdf_W, VSUB(VSUB(vpdf_E,vevenPart),voddPart)); + //src[ADJ_LIST(D3Q19_W)] =[UA] vpdf_W - vevenPart + voddPart; + VSTU(ppdf_E, VADD(VSUB(vpdf_W,vevenPart),voddPart)); + + //vui = vuz; + vui = vuz; + 
//vevenPart = vomegaEven * (VONE_HALF * (vpdf_T + vpdf_B) - vui * vui * vw_1_nine_half - vw_1_indep); + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_T,vpdf_B)),VMUL(vui,VMUL(vui,vw_1_nine_half))),vw_1_indep)); + //voddPart = vomegaOdd * (VONE_HALF * (vpdf_T - vpdf_B) - vui * vw_1_x3); + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_T,vpdf_B)),VMUL(vui,vw_1_x3))); + //src[ADJ_LIST(D3Q19_T)] =[UA] vpdf_T - vevenPart - voddPart; + VSTU(ppdf_B, VSUB(VSUB(vpdf_T,vevenPart),voddPart)); + //src[ADJ_LIST(D3Q19_B)] =[UA] vpdf_B - vevenPart + voddPart; + VSTU(ppdf_T, VADD(VSUB(vpdf_B,vevenPart),voddPart)); + + //vw_2_indep = vw_2 * vdir_indep_trm; + vw_2_indep = VMUL(vw_2,vdir_indep_trm); + + //vui = vuy - vux; + vui = VSUB(vuy,vux); + //vevenPart = vomegaEven * (VONE_HALF * (vpdf_NW + vpdf_SE) - vui * vui * vw_2_nine_half - vw_2_indep); + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_NW,vpdf_SE)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); + //voddPart = vomegaOdd * (VONE_HALF * (vpdf_NW - vpdf_SE) - vui * vw_2_x3); + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_NW,vpdf_SE)),VMUL(vui,vw_2_x3))); + //src[ADJ_LIST(D3Q19_NW)] =[UA] vpdf_NW - vevenPart - voddPart; + VSTU(ppdf_SE, VSUB(VSUB(vpdf_NW,vevenPart),voddPart)); + //src[ADJ_LIST(D3Q19_SE)] =[UA] vpdf_SE - vevenPart + voddPart; + VSTU(ppdf_NW, VADD(VSUB(vpdf_SE,vevenPart),voddPart)); + + //vui = vux + vuy; + vui = VADD(vux,vuy); + //vevenPart = vomegaEven * (VONE_HALF * (vpdf_NE + vpdf_SW) - vui * vui * vw_2_nine_half - vw_2_indep); + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_NE,vpdf_SW)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); + //voddPart = vomegaOdd * (VONE_HALF * (vpdf_NE - vpdf_SW) - vui * vw_2_x3); + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_NE,vpdf_SW)),VMUL(vui,vw_2_x3))); + //src[ADJ_LIST(D3Q19_NE)] =[UA] vpdf_NE - vevenPart - voddPart; + VSTU(ppdf_SW, VSUB(VSUB(vpdf_NE,vevenPart),voddPart)); + 
//src[ADJ_LIST(D3Q19_SW)] =[UA] vpdf_SW - vevenPart + voddPart; + VSTU(ppdf_NE, VADD(VSUB(vpdf_SW,vevenPart),voddPart)); + + //vui = vuz - vux; + vui = VSUB(vuz,vux); + //vevenPart = vomegaEven * (VONE_HALF * (vpdf_TW + vpdf_BE) - vui * vui * vw_2_nine_half - vw_2_indep); + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TW,vpdf_BE)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); + //voddPart = vomegaOdd * (VONE_HALF * (vpdf_TW - vpdf_BE) - vui * vw_2_x3); + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TW,vpdf_BE)),VMUL(vui,vw_2_x3))); + //src[ADJ_LIST(D3Q19_TW)] =[UA] vpdf_TW - vevenPart - voddPart; + VSTU(ppdf_BE, VSUB(VSUB(vpdf_TW,vevenPart),voddPart)); + //src[ADJ_LIST(D3Q19_BE)] =[UA] vpdf_BE - vevenPart + voddPart; + VSTU(ppdf_TW, VADD(VSUB(vpdf_BE,vevenPart),voddPart)); + + //vui = vux + vuz; + vui = VADD(vux,vuz); + //vevenPart = vomegaEven * (VONE_HALF * (vpdf_TE + vpdf_BW) - vui * vui * vw_2_nine_half - vw_2_indep); + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TE,vpdf_BW)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); + //voddPart = vomegaOdd * (VONE_HALF * (vpdf_TE - vpdf_BW) - vui * vw_2_x3); + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TE,vpdf_BW)),VMUL(vui,vw_2_x3))); + //src[ADJ_LIST(D3Q19_TE)] =[UA] vpdf_TE - vevenPart - voddPart; + VSTU(ppdf_BW, VSUB(VSUB(vpdf_TE,vevenPart),voddPart)); + //src[ADJ_LIST(D3Q19_BW)] =[UA] vpdf_BW - vevenPart + voddPart; + VSTU(ppdf_TE, VADD(VSUB(vpdf_BW,vevenPart),voddPart)); + + //vui = vuz - vuy; + vui = VSUB(vuz,vuy); + //vevenPart = vomegaEven * (VONE_HALF * (vpdf_TS + vpdf_BN) - vui * vui * vw_2_nine_half - vw_2_indep); + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TS,vpdf_BN)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); + //voddPart = vomegaOdd * (VONE_HALF * (vpdf_TS - vpdf_BN) - vui * vw_2_x3); + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TS,vpdf_BN)),VMUL(vui,vw_2_x3))); + //src[ADJ_LIST(D3Q19_TS)] =[UA] 
vpdf_TS - vevenPart - voddPart; + VSTU(ppdf_BN, VSUB(VSUB(vpdf_TS,vevenPart),voddPart)); + //src[ADJ_LIST(D3Q19_BN)] =[UA] vpdf_BN - vevenPart + voddPart; (as for every other pair, each result is stored through the inverse direction's pointer: TS -> ppdf_BN, BN -> ppdf_TS) + VSTU(ppdf_TS, VADD(VSUB(vpdf_BN,vevenPart),voddPart)); + + //vui = vuy + vuz; + vui = VADD(vuy,vuz); + //vevenPart = vomegaEven * (VONE_HALF * (vpdf_TN + vpdf_BS) - vui * vui * vw_2_nine_half - vw_2_indep); + vevenPart = VMUL(vomegaEven,VSUB(VSUB(VMUL(VONE_HALF,VADD(vpdf_TN,vpdf_BS)),VMUL(vui,VMUL(vui,vw_2_nine_half))),vw_2_indep)); + //voddPart = vomegaOdd * (VONE_HALF * (vpdf_TN - vpdf_BS) - vui * vw_2_x3); + voddPart = VMUL(vomegaOdd,VSUB(VMUL(VONE_HALF,VSUB(vpdf_TN,vpdf_BS)),VMUL(vui,vw_2_x3))); + //src[ADJ_LIST(D3Q19_TN)] =[UA] vpdf_TN - vevenPart - voddPart; + VSTU(ppdf_BS, VSUB(VSUB(vpdf_TN,vevenPart),voddPart)); + //src[ADJ_LIST(D3Q19_BS)] =[UA] vpdf_BS - vevenPart + voddPart; + VSTU(ppdf_TN, VADD(VSUB(vpdf_BS,vevenPart),voddPart)); + + consecValue -= (VSIZE - 1); + index += (VSIZE - 1); + pointerOffset = VSIZE; + + } + else { + // Scalar part. 
+ + #define X(name, idx, idxinv, _x, _y, _z) JOIN(pdf_,name) = *(JOIN(ppdf_,name)); + D3Q19_LIST_WO_C + #undef X + + pdf_C = *ppdf_C; + + ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE - + pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW; + uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN - + pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS; + uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS - + pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS; + + dens = pdf_C + + pdf_N + pdf_E + pdf_S + pdf_W + + pdf_NE + pdf_SE + pdf_SW + pdf_NW + + pdf_T + pdf_TN + pdf_TE + pdf_TS + pdf_TW + + pdf_B + pdf_BN + pdf_BE + pdf_BS + pdf_BW; + + dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz)*3.0/2.0; + + adjListIndex = index * N_D3Q19_IDX; + + // direction: w_0 + src[I(index, D3Q19_C) ] = pdf_C - omegaEven * (pdf_C - w_0 * dir_indep_trm); + + // direction: w_1 + w_1_indep = w_1 * dir_indep_trm; + + ui = uy; + evenPart = omegaEven * (0.5 * (pdf_N + pdf_S) - ui * ui * w_1_nine_half - w_1_indep); + oddPart = omegaOdd * (0.5 * (pdf_N - pdf_S) - ui * w_1_x3); + src[ADJ_LIST(D3Q19_N)] = pdf_N - evenPart - oddPart; + src[ADJ_LIST(D3Q19_S)] = pdf_S - evenPart + oddPart; + + ui = ux; + evenPart = omegaEven * (0.5 * (pdf_E + pdf_W) - ui * ui * w_1_nine_half - w_1_indep); + oddPart = omegaOdd * (0.5 * (pdf_E - pdf_W) - ui * w_1_x3); + src[ADJ_LIST(D3Q19_E)] = pdf_E - evenPart - oddPart; + src[ADJ_LIST(D3Q19_W)] = pdf_W - evenPart + oddPart; + + ui = uz; + evenPart = omegaEven * (0.5 * (pdf_T + pdf_B) - ui * ui * w_1_nine_half - w_1_indep); + oddPart = omegaOdd * (0.5 * (pdf_T - pdf_B) - ui * w_1_x3); + src[ADJ_LIST(D3Q19_T)] = pdf_T - evenPart - oddPart; + src[ADJ_LIST(D3Q19_B)] = pdf_B - evenPart + oddPart; + + // direction: w_2 + w_2_indep = w_2 * dir_indep_trm; + + ui = -ux + uy; + evenPart = omegaEven * (0.5 * (pdf_NW + pdf_SE) - ui * ui * w_2_nine_half - w_2_indep); + oddPart = omegaOdd * (0.5 * (pdf_NW - pdf_SE) - ui * w_2_x3); + src[ADJ_LIST(D3Q19_NW)] = pdf_NW - evenPart - oddPart; + 
src[ADJ_LIST(D3Q19_SE)] = pdf_SE - evenPart + oddPart; + + ui = ux + uy; + evenPart = omegaEven * (0.5 * (pdf_NE + pdf_SW) - ui * ui * w_2_nine_half - w_2_indep); + oddPart = omegaOdd * (0.5 * (pdf_NE - pdf_SW) - ui * w_2_x3); + src[ADJ_LIST(D3Q19_NE)] = pdf_NE - evenPart - oddPart; + src[ADJ_LIST(D3Q19_SW)] = pdf_SW - evenPart + oddPart; + + ui = -ux + uz; + evenPart = omegaEven * (0.5 * (pdf_TW + pdf_BE) - ui * ui * w_2_nine_half - w_2_indep); + oddPart = omegaOdd * (0.5 * (pdf_TW - pdf_BE) - ui * w_2_x3); + src[ADJ_LIST(D3Q19_TW)] = pdf_TW - evenPart - oddPart; + src[ADJ_LIST(D3Q19_BE)] = pdf_BE - evenPart + oddPart; + + ui = ux + uz; + evenPart = omegaEven * (0.5 * (pdf_TE + pdf_BW) - ui * ui * w_2_nine_half - w_2_indep); + oddPart = omegaOdd * (0.5 * (pdf_TE - pdf_BW) - ui * w_2_x3); + src[ADJ_LIST(D3Q19_TE)] = pdf_TE - evenPart - oddPart; + src[ADJ_LIST(D3Q19_BW)] = pdf_BW - evenPart + oddPart; + + ui = -uy + uz; + evenPart = omegaEven * (0.5 * (pdf_TS + pdf_BN) - ui * ui * w_2_nine_half - w_2_indep); + oddPart = omegaOdd * (0.5 * (pdf_TS - pdf_BN) - ui * w_2_x3); + src[ADJ_LIST(D3Q19_TS)] = pdf_TS - evenPart - oddPart; + src[ADJ_LIST(D3Q19_BN)] = pdf_BN - evenPart + oddPart; + + ui = uy + uz; + evenPart = omegaEven * (0.5 * (pdf_TN + pdf_BS) - ui * ui * w_2_nine_half - w_2_indep); + oddPart = omegaOdd * (0.5 * (pdf_TN - pdf_BS) - ui * w_2_x3); + src[ADJ_LIST(D3Q19_TN)] = pdf_TN - evenPart - oddPart; + src[ADJ_LIST(D3Q19_BS)] = pdf_BS - evenPart + oddPart; + + pointerOffset = 1; + } + + } // loop over fluid nodes + + #undef ADJ_LIST + #undef I +} diff --git a/src/BenchKernelD3Q19ListAaPv.h b/src/BenchKernelD3Q19ListAaPv.h new file mode 100644 index 0000000..84dfc05 --- /dev/null +++ b/src/BenchKernelD3Q19ListAaPv.h @@ -0,0 +1,38 @@ +// -------------------------------------------------------------------------- +// +// Copyright +// Markus Wittmann, 2016-2017 +// RRZE, University of Erlangen-Nuremberg, Germany +// markus.wittmann -at- fau.de or hpc -at- 
rrze.fau.de +// +// Viktor Haag, 2016 +// LSS, University of Erlangen-Nuremberg, Germany +// +// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). +// +// LbmBenchKernels is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// LbmBenchKernels is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>. +// +// -------------------------------------------------------------------------- +#ifndef __BENCH_KERNEL_D3Q19_LIST_AA_PV__ +#define __BENCH_KERNEL_D3Q19_LIST_AA_PV__ + +#include "Kernel.h" + +void D3Q19ListAaPvInit_PushSoA(LatticeDesc * ld, KernelData ** kernelData, Parameters * params); + +void D3Q19ListAaPvDeinit_PushSoA(LatticeDesc * ld, KernelData ** kernelData); + + + +#endif // __BENCH_KERNEL_D3Q19_LIST_AA_PV__ diff --git a/src/BenchKernelD3Q19ListAaPvCommon.c b/src/BenchKernelD3Q19ListAaPvCommon.c new file mode 100644 index 0000000..9cdf0e1 --- /dev/null +++ b/src/BenchKernelD3Q19ListAaPvCommon.c @@ -0,0 +1,821 @@ +// -------------------------------------------------------------------------- +// +// Copyright +// Markus Wittmann, 2016-2017 +// RRZE, University of Erlangen-Nuremberg, Germany +// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de +// +// Viktor Haag, 2016 +// LSS, University of Erlangen-Nuremberg, Germany +// +// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). 
+// +// LbmBenchKernels is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// LbmBenchKernels is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>. +// +// -------------------------------------------------------------------------- +#include "BenchKernelD3Q19ListAaPvCommon.h" + +#include "Memory.h" +#include "Vtk.h" + +#include + +#ifdef _OPENMP + #include +#endif + +// Forward declaration. +void FNAME(D3Q19ListAaPvKernel)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd); + + + + +// ----------------------------------------------------------------------- +// Functions which are used as callback by the kernel to read or write +// PDFs and nodes. 
+ +static void FNAME(BCGetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT * pdf) +{ + Assert(kd != NULL); + Assert(kd->PdfsActive != NULL); + Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]); + Assert(pdf != NULL); + + Assert(x >= 0); Assert(y >= 0); Assert(z >= 0); + Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]); + Assert(dir >= 0); Assert(dir < N_D3Q19); + + KernelDataList * kdl = (KernelDataList *)kd; + + if (kdl->Iteration % 2 == 0) { + // Pdfs are stored inverse, local PDFs are located in remote nodes + + uint32_t nodeIndex = KDL(kd)->Grid[L_INDEX_4(kd->Dims, x, y, z)]; + + if (dir != D3Q19_C) { + uint32_t adjListIndex = nodeIndex * N_D3Q19_IDX; + + *pdf = kd->PdfsActive[KDL(kd)->AdjList[adjListIndex + D3Q19_INV[dir]]]; + } + else { + *pdf = kd->PdfsActive[P_INDEX_3(KDL(kd)->nCells, nodeIndex, dir)]; + } + + } + else { + *pdf = kd->PdfsActive[P_INDEX_5(KDL(kd), x, y, z, dir)]; + } + + + return; +} + +static void FNAME(BCSetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT pdf) +{ + Assert(kd != NULL); + Assert(kd->PdfsActive != NULL); + Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]); + Assert(x >= 0); Assert(y >= 0); Assert(z >= 0); + Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]); + Assert(dir >= 0); Assert(dir < N_D3Q19); + +#if 0 + if (isnan(pdf)) { + printf("ERROR: setting nan %d %d %d %d %s\n", x, y, z, dir, D3Q19_NAMES[dir]); + DEBUG_BREAK_POINT(); + exit(1); + } +#endif + + KernelDataList * kdl = (KernelDataList *)kd; + + if (kdl->Iteration % 2 == 0) { + // Pdfs are stored inverse, local PDFs are located in remote nodes + + uint32_t nodeIndex = KDL(kd)->Grid[L_INDEX_4(kd->Dims, x, y, z)]; + + if (dir != D3Q19_C) { + uint32_t adjListIndex = nodeIndex * N_D3Q19_IDX; + + kd->PdfsActive[KDL(kd)->AdjList[adjListIndex + D3Q19_INV[dir]]] = pdf; + } + else { + kd->PdfsActive[P_INDEX_3(KDL(kd)->nCells, nodeIndex, dir)] = pdf; + } + + } + else { + 
kd->PdfsActive[P_INDEX_5(KDL(kd), x, y, z, dir)] = pdf; + } + + return; +} + + +static void GetNode(KernelData * kd, int x, int y, int z, PdfT * pdfs) +{ + Assert(kd != NULL); + Assert(kd->PdfsActive != NULL); + Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]); + Assert(pdfs != NULL); + Assert(x >= 0); Assert(y >= 0); Assert(z >= 0); + Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]); + + KernelDataList * kdl = (KernelDataList *)kd; + + if(kdl->Iteration % 2 == 0){ + + uint32_t nodeIndex = kdl->Grid[L_INDEX_4(kdl->kd.Dims, x, y, z)]; + uint32_t adjListIndex = nodeIndex * N_D3Q19_IDX; + + // Load PDFs of local cell: pdf_N = src[adjList[adjListIndex + D3Q19_S]]; ... + pdfs[D3Q19_C] = kd->PdfsActive[P_INDEX_3(kdl->nCells, nodeIndex, D3Q19_C)]; + + #define X(name, idx, idxinv, _x, _y, _z) pdfs[idx] = kd->PdfsActive[kdl->AdjList[adjListIndex + idxinv]]; + D3Q19_LIST_WO_C + #undef X + + } else { + + #define I(x, y, z, dir) P_INDEX_5(KDL(kd), (x), (y), (z), (dir)) + #define X(name, idx, idxinv, _x, _y, _z) pdfs[idx] = kd->PdfsActive[I(x, y, z, idx)]; + D3Q19_LIST + #undef X + #undef I + + } + +#if 0 + for (int d = 0; d < 19; ++d) { + if(isnan(pdfs[d]) || isinf(pdfs[d])) { + printf("%d %d %d %d nan! get node\n", x, y, z, d); + for (int d2 = 0; d2 < 19; ++d2) { + printf("%d: %e\n", d2, pdfs[d2]); + } + exit(1); + } + } +#endif + + return; +} + + +static void SetNode(KernelData * kd, int x, int y, int z, PdfT * pdfs) +{ + Assert(kd != NULL); + Assert(kd->PdfsActive != NULL); + Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]); + Assert(pdfs != NULL); + + Assert(x >= 0); Assert(y >= 0); Assert(z >= 0); + Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]); + +#if 0 + for (int d = 0; d < 19; ++d) { + if(isnan(pdfs[d])) { + printf("%d %d %d %d nan! 
get node\n", x, y, z, d); + for (int d2 = 0; d2 < 19; ++d2) { + printf("%d: %e\n", d2, pdfs[d2]); + } + exit(1); + } + } +#endif + + KernelDataList * kdl = (KernelDataList *)kd; + + if(kdl->Iteration % 2 == 0){ + + uint32_t nodeIndex = kdl->Grid[L_INDEX_4(kdl->kd.Dims, x, y, z)]; + uint32_t adjListIndex = nodeIndex * N_D3Q19_IDX; + + // Load PDFs of local cell: pdf_N = src[adjList[adjListIndex + D3Q19_S]]; ... + kd->PdfsActive[P_INDEX_3(kdl->nCells, nodeIndex, D3Q19_C)] = pdfs[D3Q19_C]; + + #define X(name, idx, idxinv, _x, _y, _z) kd->PdfsActive[kdl->AdjList[adjListIndex + idxinv]] = pdfs[idx]; + D3Q19_LIST_WO_C + #undef X + + } else { + + #define I(x, y, z, dir) P_INDEX_5(KDL(kd), (x), (y), (z), (dir)) + #define X(name, idx, idxinv, _x, _y, _z) kd->PdfsActive[I(x, y, z, idx)] = pdfs[idx]; + D3Q19_LIST + #undef X + #undef I + + } + + return; +} + +static void ParameterUsage() +{ + printf("Kernel parameters:\n"); + printf(" [-blk ] [-blk-[xyz] ]\n"); + + return; +} + +static void ParseParameters(Parameters * params, int * blk) +{ + Assert(blk != NULL); + + blk[0] = 0; blk[1] = 0; blk[2] = 0; + + #define ARG_IS(param) (!strcmp(params->KernelArgs[i], param)) + #define NEXT_ARG_PRESENT() \ + do { \ + if (i + 1 >= params->nKernelArgs) { \ + printf("ERROR: argument %s requires a parameter.\n", params->KernelArgs[i]); \ + exit(1); \ + } \ + } while (0) + + + for (int i = 0; i < params->nKernelArgs; ++i) { + if (ARG_IS("-blk") || ARG_IS("--blk")) { + NEXT_ARG_PRESENT(); + + int tmp = strtol(params->KernelArgs[++i], NULL, 0); + + if (tmp <= 0) { + printf("ERROR: blocking parameter must be > 0.\n"); + exit(1); + } + + blk[0] = blk[1] = blk[2] = tmp; + } + else if (ARG_IS("-blk-x") || ARG_IS("--blk-x")) { + NEXT_ARG_PRESENT(); + + int tmp = strtol(params->KernelArgs[++i], NULL, 0); + + if (tmp <= 0) { + printf("ERROR: blocking parameter must be > 0.\n"); + exit(1); + } + + blk[0] = tmp; + } + else if (ARG_IS("-blk-y") || ARG_IS("--blk-y")) { + NEXT_ARG_PRESENT(); + + int tmp 
= strtol(params->KernelArgs[++i], NULL, 0); + + if (tmp <= 0) { + printf("ERROR: blocking parameter must be > 0.\n"); + exit(1); + } + + blk[1] = tmp; + } + else if (ARG_IS("-blk-z") || ARG_IS("--blk-z")) { + NEXT_ARG_PRESENT(); + + int tmp = strtol(params->KernelArgs[++i], NULL, 0); + + if (tmp <= 0) { + printf("ERROR: blocking parameter must be > 0.\n"); + exit(1); + } + + blk[2] = tmp; + } + else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) { + ParameterUsage(); + exit(1); + } + else { + printf("ERROR: unknown kernel parameter.\n"); + ParameterUsage(); + exit(1); + } + } + + #undef ARG_IS + #undef NEXT_ARG_PRESENT + + return; +} + +static void SetupConsecNodes(LatticeDesc * ld, KernelDataListRia * kdlr, int nThreads) +{ + Assert(ld != NULL); + Assert(kdlr != NULL); + Assert(nThreads > 0); + + uint32_t * adjList = kdlr->kdl.AdjList; + + uint32_t nConsecNodes = 0; + uint32_t consecIndex = 0; + + int nFluid = kdlr->kdl.nFluid; + + uint32_t * consecThreadIndices = (uint32_t *)malloc(sizeof(uint32_t) * (nThreads + 1)); + int * fluidNodeThreadIndices = (int *)malloc(sizeof(int) * (nThreads + 1)); + + int nNodesPerThread = nFluid / nThreads; + + for (int i = 0; i < nThreads; ++i) { + consecThreadIndices[i] = i * nNodesPerThread + MinI(i, nFluid % nThreads); + fluidNodeThreadIndices[i] = consecThreadIndices[i]; + } + consecThreadIndices[nThreads] = -1; + fluidNodeThreadIndices[nThreads] = nFluid; + + int indexThread = 1; + + // We execute following code two times. + // - The first time to get the count of how many entries we need for the + // consecNodes array. + // - The second time to fill the array. + + // Loop over adjacency list of all nodes. + // Compare if adjacent nodes share the same access pattern. + for (int index = 1; index < nFluid; ++index) { + + int different = 0; + + // Loop over all directions except the center one. 
+ for(int d = 0; d < N_D3Q19 - 1; ++d) { + Assert(d != D3Q19_C); + + if (adjList[index * N_D3Q19_IDX + d] != adjList[(index - 1) * N_D3Q19_IDX + d] + 1) { + // Different access pattern. + different = 1; + break; + } + } + + if (consecThreadIndices[indexThread] == index) { + // We are at a thread boundary. Starting from this index the fluids + // belong to another thread. Force a break, if nodes are consecutive. + ++indexThread; + different = 1; + } + + if (different) { + ++consecIndex; + } + } + + if (nFluid > 0) { + nConsecNodes = consecIndex + 1; + } + + uint32_t * consecNodes; + MemAlloc((void **)&consecNodes, sizeof(uint32_t) * nConsecNodes); + + consecIndex = 0; + + if (nFluid > 0) { + consecNodes[consecIndex] = 1; + } + + indexThread = 1; + consecThreadIndices[0] = 0; + + // Loop over adjacency list of all nodes. + // Compare if adjacent nodes share the same access pattern. + for (int index = 1; index < nFluid; ++index) { + + int different = 0; + + // Loop over all directions except the center one. + for(int d = 0; d < N_D3Q19 - 1; ++d) { + Assert(d != D3Q19_C); + + if (adjList[index * N_D3Q19_IDX + d] != adjList[(index - 1) * N_D3Q19_IDX + d] + 1) { + // Different access pattern. + different = 1; + break; + } + } + + if (consecThreadIndices[indexThread] == index) { + // We are at a thread boundary. Starting from this index the fluids + // belong to another thread. Force a break, if nodes are consecutive. 
+ consecThreadIndices[indexThread] = consecIndex + 1; + ++indexThread; + different = 1; + } + + if (different) { + ++consecIndex; + Assert(consecIndex < nConsecNodes); + consecNodes[consecIndex] = 1; + } + else { + Assert(consecIndex < nConsecNodes); + consecNodes[consecIndex] += 1; + } + } + + + kdlr->ConsecNodes = consecNodes; + kdlr->nConsecNodes = nConsecNodes; + + kdlr->ConsecThreadIndices = consecThreadIndices; + kdlr->nConsecThreadIndices = nThreads; + + kdlr->FluidNodeThreadIndices = fluidNodeThreadIndices; + kdlr->nFluidNodeThreadIndices = nThreads; + + printf("# total fluid nodes: %d consecutive blocks: %d\n", nFluid, nConsecNodes); + + uint32_t vwidth[] = {2, 4, 8, 16, 32}; + uint32_t vectorizable[] = {0, 0, 0, 0, 0}; + + for (int i = 0; i < nConsecNodes; ++i) { + for (int k = 0; k < N_ELEMS(vwidth); ++k) { + vectorizable[k] += consecNodes[i] / vwidth[k]; + } + } + + printf("# vectorizable fraction of fluid node updates:\n"); + for (int i = 0; i < N_ELEMS(vwidth); ++i) { + + printf("# vector width: %2d %6.2f %% (%u/%u fluid nodes)\n", + vwidth[i], (double)vectorizable[i] * vwidth[i] / nFluid * 100.0, + vectorizable[i] * vwidth[i], nFluid); + } + + return; +} + +void FNAME(D3Q19ListAaPvInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params) +{ + KernelData * kd; + KernelDataList * kdl; + KernelDataListRia * kdlr; + MemAlloc((void **)&kdlr, sizeof(KernelDataListRia)); + + kd = (KernelData *)kdlr; + kdl = KDL(kdlr); + + *kernelData = kd; + +#ifdef DEBUG + kd->Pdfs[0] = NULL; + kd->Pdfs[1] = NULL; + kd->PdfsActive = NULL; + kd->DstPdfs = NULL; + kd->SrcPdfs = NULL; + kd->Dims[0] = -1; + kd->Dims[1] = -1; + kd->Dims[2] = -1; + kd->GlobalDims[0] = -1; + kd->GlobalDims[1] = -1; + kd->GlobalDims[2] = -1; + kd->Offsets[0] = -1; + kd->Offsets[1] = -1; + kd->Offsets[2] = -1; + + kd->ObstIndices = NULL; + kd->nObstIndices = -1; + kd->BounceBackPdfsSrc = NULL; + kd->BounceBackPdfsDst = NULL; + kd->nBounceBackPdfs = -1; + + kdl->AdjList = NULL; + 
kdl->Coords = NULL; + kdl->Grid = NULL; + kdl->nCells = -1; + kdl->nFluid = -1; + + kdlr->ConsecNodes = NULL; + kdlr->nConsecNodes = 0; + kdlr->ConsecThreadIndices = NULL; + kdlr->nConsecThreadIndices = 0; +#endif + + // Ajust the dimensions according to padding, if used. + kd->Dims[0] = kd->GlobalDims[0] = ld->Dims[0]; + kd->Dims[1] = kd->GlobalDims[1] = ld->Dims[1]; + kd->Dims[2] = kd->GlobalDims[2] = ld->Dims[2]; + + int * lDims = ld->Dims; + + int lX = lDims[0]; + int lY = lDims[1]; + int lZ = lDims[2]; + + int nTotalCells = lX * lY * lZ; + int nCells = ld->nFluid; // TODO: + padding + int nFluid = ld->nFluid; + + kdl->nCells = nCells; + kdl->nFluid = nFluid; + + PdfT * pdfs[2]; + + int blk[3] = { 0 }; + + ParseParameters(params, blk); + + if (blk[0] == 0) blk[0] = lX; + if (blk[1] == 0) blk[1] = lY; + if (blk[2] == 0) blk[2] = lZ; + + printf("# blocking x: %3d y: %3d z: %3d\n", blk[0], blk[1], blk[2]); + + double latMiB = nCells * sizeof(PdfT) * N_D3Q19 / 1024.0 / 1024.0; + double latFluidMib = nFluid * sizeof(PdfT) * N_D3Q19 / 1024.0 / 1024.0; + double latPadMib = (nCells - nFluid) * sizeof(PdfT) * N_D3Q19 / 1024.0 / 1024.0; + + printf("# lattice size: %e MiB\n", latMiB); + printf("# fluid lattice size: %e MiB\n", latFluidMib); + printf("# lattice padding: %e MiB\n", latPadMib); + +#define PAGE_4K 4096 + + printf("# aligning lattices to: %d b\n", PAGE_4K); + + MemAllocAligned((void **)&pdfs[0], sizeof(PdfT) * nCells * N_D3Q19, PAGE_4K); + + kd->Pdfs[0] = pdfs[0]; + + // Initialize PDFs with some (arbitrary) data for correct NUMA placement. + // Here we touch only the fluid nodes as this loop is OpenMP parallel and + // we want the same scheduling as in the kernel. + #ifdef _OPENMP + #pragma omp parallel for + #endif + for (int i = 0; i < nFluid; ++i) { for(int d = 0; d < N_D3Q19; ++d) { + pdfs[0][P_INDEX_3(nCells, i, d)] = 1.0; + } } + + // Initialize all PDFs to some standard value. 
+ for (int i = 0; i < nFluid; ++i) { for(int d = 0; d < N_D3Q19; ++d) { + pdfs[0][P_INDEX_3(nCells, i, d)] = 0.0; + } } + + // ---------------------------------------------------------------------- + // create grid which will hold the index numbers of the fluid nodes + + uint32_t * grid; + + if (MemAlloc((void **)&grid, nTotalCells * sizeof(uint32_t))) { + printf("ERROR: allocating grid for numbering failed: %lu bytes.\n", nTotalCells * sizeof(uint32_t)); + exit(1); + } + kdl->Grid = grid; + + int latticeIndex; + +#ifdef DEBUG + for(int z = 0; z < lZ; ++z) { + for(int y = 0; y < lY; ++y) { + for(int x = 0; x < lX; ++x) { + + latticeIndex = L_INDEX_4(ld->Dims, x, y, z); + + grid[latticeIndex] = ~0; + } + } + } +#endif + + // ---------------------------------------------------------------------- + // generate numbering over grid + + uint32_t * coords; + + if (MemAlloc((void **)&coords, nFluid * sizeof(uint32_t) * 3)) { + printf("ERROR: allocating coords array failed: %lu bytes.\n", nFluid * sizeof(uint32_t) * 3); + exit(1); + } + + kdl->Coords = coords; + + // Index for the PDF nodes can start at 0 as we distinguish solid and fluid nodes + // through the ld->Lattice array. + int counter = 0; + + // Blocking is implemented via setup of the adjacency list. The kernel later will + // walk through the lattice blocked automatically. 
+ for (int bZ = 0; bZ < lZ; bZ += blk[2]) { + for (int bY = 0; bY < lY; bY += blk[1]) { + for (int bX = 0; bX < lX; bX += blk[0]) { + + int eX = MIN(bX + blk[0], lX); + int eY = MIN(bY + blk[1], lY); + int eZ = MIN(bZ + blk[2], lZ); + + + for (int z = bZ; z < eZ; ++z) { + for (int y = bY; y < eY; ++y) { + for (int x = bX; x < eX; ++x) { + + latticeIndex = L_INDEX_4(lDims, x, y, z); + + if (ld->Lattice[latticeIndex] != LAT_CELL_OBSTACLE) { + grid[latticeIndex] = counter; + + coords[C_INDEX_X(counter)] = x; + coords[C_INDEX_Y(counter)] = y; + coords[C_INDEX_Z(counter)] = z; + + ++counter; + } + } } } + } } } + + Verify(counter == nFluid); + + uint32_t * adjList; + + double indexMib = nFluid * sizeof(uint32_t) * N_D3Q19_IDX / 1024.0 / 1024.0; + + printf("# index size: %e MiB\n", indexMib); + + // AdjList only requires 18 instead of 19 entries per node, as + // the center PDF needs no addressing. + if (MemAlloc((void **)&adjList, nFluid * sizeof(uint32_t) * N_D3Q19_IDX)) { + printf("ERROR: allocating adjList array failed: %lu bytes.\n", nFluid * sizeof(uint32_t) * N_D3Q19_IDX); + exit(1); + } + + kdl->AdjList = adjList; + + int x, y, z; + + uint32_t neighborIndex; + uint32_t dstIndex; + + int nx, ny, nz, px, py, pz; + + // Loop over all fluid nodes and compute the indices to the neighboring + // PDFs for configured data layout (AoS/SoA). + // TODO: Parallelized loop to ensure correct NUMA placement. 
+ // #ifdef _OPENMP --> add line continuation + // #pragma omp parallel for default(none) + // shared(nFluid, nCells, coords, D3Q19_INV, D3Q19_X, D3Q19_Y, D3Q19_Z, + // stderr, + // lDims, grid, ld, lX, lY, lZ, adjList) + // private(x, y, z, nx, ny, nz, neighborIndex, dstIndex) + // #endif + for (int index = 0; index < nFluid; ++index) { + x = coords[C_INDEX_X(index)]; + y = coords[C_INDEX_Y(index)]; + z = coords[C_INDEX_Z(index)]; + + Assert(x >= 0 && x < lX); + Assert(y >= 0 && y < lY); + Assert(z >= 0 && z < lZ); + + Assert(ld->Lattice[L_INDEX_4(lDims, x, y, z)] != LAT_CELL_OBSTACLE); + + // Loop over all directions except the center one. + for(int d = 0; d < N_D3Q19 - 1; ++d) { + Assert(d != D3Q19_C); + +#ifdef PROP_MODEL_PUSH + nx = x + D3Q19_X[d]; + ny = y + D3Q19_Y[d]; + nz = z + D3Q19_Z[d]; + +#elif PROP_MODEL_PULL + nx = x - D3Q19_X[d]; + ny = y - D3Q19_Y[d]; + nz = z - D3Q19_Z[d]; +#else + #error No implementation for this PROP_MODEL_NAME. +#endif + // If the neighbor is outside the latcie in X direction and we have a + // periodic boundary then we need to wrap around. + if ( ((nx < 0 || nx >= lX) && ld->PeriodicX) || + ((ny < 0 || ny >= lY) && ld->PeriodicY) || + ((nz < 0 || nz >= lZ) && ld->PeriodicZ) + ){ + // x periodic + + if (nx < 0) { + px = lX - 1; + } + else if (nx >= lX) { + px = 0; + } else { + px = nx; + } + // y periodic + if (ny < 0) { + py = lY - 1; + } + else if (ny >= lY) { + py = 0; + } else { + py = ny; + } + + // z periodic + if (nz < 0) { + pz = lZ - 1; + } + else if (nz >= lZ) { + pz = 0; + } else { + pz = nz; + } + + if (ld->Lattice[L_INDEX_4(lDims, px, py, pz)] == LAT_CELL_OBSTACLE) { + dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]); + } + else { + neighborIndex = grid[L_INDEX_4(lDims, px, py, pz)]; + + AssertMsg(neighborIndex != ~0, "Neighbor has no Index. 
(%d %d %d) direction %s (%d)\n", px, py, pz, D3Q19_NAMES[d], d); + + dstIndex = P_INDEX_3(nCells, neighborIndex, d); + } + } + else if (nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) { + dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]); + } + else if (ld->Lattice[L_INDEX_4(lDims, nx, ny, nz)] == LAT_CELL_OBSTACLE) { + dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]); + } + else { + neighborIndex = grid[L_INDEX_4(lDims, nx, ny, nz)]; + + Assert(neighborIndex != ~0); + + dstIndex = P_INDEX_3(nCells, neighborIndex, d); + } + + Assert(dstIndex >= 0); + Assert(dstIndex < nCells * N_D3Q19); + + adjList[index * N_D3Q19_IDX + d] = dstIndex; + } + } + + int nThreads = 1; + +#ifdef _OPENMP + nThreads = omp_get_max_threads(); +#endif + + SetupConsecNodes(ld, KDLR(kd), nThreads); + + double loopBalanceEven = 2.0 * 19 * sizeof(PdfT); + double loopBalanceOdd = 2.0 * 19 * sizeof(PdfT) + (double)kdlr->nConsecNodes / nFluid * (18 * 4.0 + 4.0); + double loopBalance = (loopBalanceEven + loopBalanceOdd) / 2.0; + + printf("# loop balance: %.2f B/FLUP even: %.2f B/FLUP odd %.2f B/FLUP\n", + loopBalance, loopBalanceEven, loopBalanceOdd); + + // Fill remaining KernelData structures + kd->GetNode = GetNode; + kd->SetNode = SetNode; + + kd->BoundaryConditionsGetPdf = FNAME(BCGetPdf); + kd->BoundaryConditionsSetPdf = FNAME(BCSetPdf); + + kd->Kernel = FNAME(D3Q19ListAaPvKernel); + + kd->DstPdfs = NULL; + kd->PdfsActive = kd->Pdfs[0]; + + return; +} + +void FNAME(D3Q19ListAaPvDeinit)(LatticeDesc * ld, KernelData ** kernelData) +{ + KernelDataListRia ** kdlr = (KernelDataListRia **)kernelData; + + MemFree((void **)&((*kdlr)->ConsecNodes)); + + if ((*kdlr)->ConsecThreadIndices != NULL) { + MemFree((void **)&((*kdlr)->ConsecThreadIndices)); + } + + if ((*kdlr)->FluidNodeThreadIndices != NULL) { + MemFree((void **)&((*kdlr)->FluidNodeThreadIndices)); + } + + KernelDataList ** kdl = (KernelDataList **)kernelData; + + MemFree((void **)&((*kdl)->AdjList)); + MemFree((void 
**)&((*kdl)->Coords)); + MemFree((void **)&((*kdl)->Grid)); + + MemFree((void **)&((*kernelData)->Pdfs[0])); + + MemFree((void **)kernelData); + return; +} + diff --git a/src/BenchKernelD3Q19ListAaPvCommon.h b/src/BenchKernelD3Q19ListAaPvCommon.h new file mode 100644 index 0000000..2ce7fd3 --- /dev/null +++ b/src/BenchKernelD3Q19ListAaPvCommon.h @@ -0,0 +1,61 @@ +// -------------------------------------------------------------------------- +// +// Copyright +// Markus Wittmann, 2016-2017 +// RRZE, University of Erlangen-Nuremberg, Germany +// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de +// +// Viktor Haag, 2016 +// LSS, University of Erlangen-Nuremberg, Germany +// +// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). +// +// LbmBenchKernels is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// LbmBenchKernels is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with LbmBenchKernels. If not, see . +// +// -------------------------------------------------------------------------- +#ifndef __BENCH_KERNEL_D3Q19_LIST_AA_PV_COMMON_H__ +#define __BENCH_KERNEL_D3Q19_LIST_AA_PV_COMMON_H__ + +#if !defined(DATA_LAYOUT_SOA) + #error List AA Pv works only with DATA_LAYOUT_SOA +#endif + +#include "BenchKernelD3Q19ListAaCommon.h" + +typedef struct KernelDataListRia_ { + KernelDataList kdl; + + // Array contains information of how many adjacent nodes share the same access pattern. + uint32_t * ConsecNodes; + uint32_t nConsecNodes; // Number of entries in ConsecNodes array. 
+ + // Array contains (for each thread) an index into ConsecNodes. + uint32_t * ConsecThreadIndices; + // Number of entries in ConsecThreadIndices. + uint32_t nConsecThreadIndices; + + // Array contains fluid node indices for each thread where to start in the + // vector of fluid nodes. + int * FluidNodeThreadIndices; + // Number of entries in FluidNodeThreadIndices. + int nFluidNodeThreadIndices; + +} KernelDataListRia; + +// Macro for casting KernelData * to KernelDataList *. +#define KDLR(_x_) ((KernelDataListRia *)(_x_)) + + +#endif // __BENCH_KERNEL_D3Q19_LIST_AA_PV_COMMON_H__ + diff --git a/src/BenchKernelD3Q19ListAaRia.c b/src/BenchKernelD3Q19ListAaRia.c new file mode 100644 index 0000000..87addcc --- /dev/null +++ b/src/BenchKernelD3Q19ListAaRia.c @@ -0,0 +1,479 @@ +// -------------------------------------------------------------------------- +// +// Copyright +// Markus Wittmann, 2016-2017 +// RRZE, University of Erlangen-Nuremberg, Germany +// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de +// +// Viktor Haag, 2016 +// LSS, University of Erlangen-Nuremberg, Germany +// +// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). +// +// LbmBenchKernels is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// LbmBenchKernels is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with LbmBenchKernels. If not, see . 
+// +// -------------------------------------------------------------------------- +#include "BenchKernelD3Q19ListAaRiaCommon.h" + +#include "Memory.h" +#include "Vtk.h" +#include "LikwidIf.h" + +#include +#include + +#ifdef _OPENMP + #include +#endif + +void FNAME(D3Q19ListAaRiaKernel)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd) +{ + Assert(ld != NULL); + Assert(kernelData != NULL); + Assert(cd != NULL); + + Assert(cd->Omega > 0.0); + Assert(cd->Omega < 2.0); + + KernelData * kd = (KernelData *)kernelData; + KernelDataList * kdl = KDL(kernelData); + KernelDataListRia * kdlr = KDLR(kernelData); + PdfT omega = cd->Omega; + PdfT omegaEven = omega; + // 1/ 4: best stability; + // 1/12: removes third-order advection error (best advection); + // 1/ 6: removes fourth-order diffusion error (best diffusion); + // 3/16: exact location of bounce back for poiseuille flow + PdfT magicParam = 1.0 / 12.0; + PdfT omegaOdd = 1.0 / (0.5 + magicParam / (1.0 / omega - 0.5)); + + PdfT evenPart = 0.0; + PdfT oddPart = 0.0; + PdfT dir_indep_trm = 0.0; + + const PdfT w_0 = 1.0 / 3.0; + const PdfT w_1 = 1.0 / 18.0; + const PdfT w_2 = 1.0 / 36.0; + + const PdfT w_1_x3 = w_1 * 3.0; const PdfT w_1_nine_half = w_1 * 9.0 / 2.0; PdfT w_1_indep = 0.0; + const PdfT w_2_x3 = w_2 * 3.0; const PdfT w_2_nine_half = w_2 * 9.0 / 2.0; PdfT w_2_indep = 0.0; + + PdfT ui; + + PdfT ux, uy, uz; + PdfT dens; + + // Declare pdf_N, pdf_E, pdf_S, pdf_W, ... + #define X(name, idx, idxinv, x, y, z) PdfT JOIN(pdf_,name); + D3Q19_LIST + #undef X + + // Declare pointers to pdfs ppdf_N, ppdf_E, ppdf_S, ppdf_W, ... 
+ #define X(name, idx, idxinv, x, y, z) PdfT * JOIN(ppdf_,name) = NULL; + D3Q19_LIST + #undef X + + uint32_t nConsecNodes = kdlr->nConsecNodes; + uint32_t * consecNodes = kdlr->ConsecNodes; + uint32_t consecIndex = 0; + uint32_t consecValue = 0; + +#ifndef DEBUG + UNUSED(nConsecNodes); +#endif + + PdfT * src = kd->Pdfs[0]; + + int maxIterations = cd->MaxIterations; + int nFluid = kdl->nFluid; + int nCells = kdl->nCells; + + uint32_t adjListIndex; + uint32_t * adjList = kdl->AdjList; + + + + #ifdef VTK_OUTPUT + if (cd->VtkOutput) { + kd->PdfsActive = src; + VtkWrite(ld, kd, cd, -1); + } + #endif + + #ifdef STATISTICS + kd->PdfsActive = src; + KernelStatistics(kd, ld, cd, 0); + #endif + + // TODO: outer openmp parallel + for(int iter = 0; iter < maxIterations; iter += 2) { + + X_LIKWID_START("list-aa-ria-even"); + + // -------------------------------------------------------------------- + // even time step + // -------------------------------------------------------------------- + #ifdef _OPENMP + #pragma omp parallel for default(none) \ + shared(stderr, nFluid, nCells, kd, kdl, adjList, omegaOdd, omegaEven, src) \ + private(ux, uy, uz, dens, adjListIndex, evenPart, oddPart, dir_indep_trm, w_1_indep, w_2_indep, ui,\ + pdf_C, \ + pdf_N, pdf_E, pdf_S, pdf_W, \ + pdf_NE, pdf_SE, pdf_SW, pdf_NW, \ + pdf_T, pdf_TN, pdf_TE, pdf_TS, pdf_TW, \ + pdf_B, pdf_BN, pdf_BE, pdf_BS, pdf_BW) + #endif + for (int index = 0; index < nFluid; ++index) { + + #define I(index, dir) P_INDEX_3((nCells), (index), (dir)) + + #define X(name, idx, idxinv, _x, _y, _z) JOIN(pdf_,name) = src[I(index, idx)]; + D3Q19_LIST + #undef X + +// #define LID_DRIVEN_CAVITY + +#ifdef LID_DRIVEN_CAVITY + int nX = kd->Dims[0]; + int nY = kd->Dims[1]; + int nZ = kd->Dims[2]; + + int x = kdl->Coords[C_INDEX_X(index)]; + int y = kdl->Coords[C_INDEX_Y(index)]; + int z = kdl->Coords[C_INDEX_Z(index)]; + + if (z == nZ - 4 && x > 3 && x < (nX - 4) && y > 3 && y < (nY - 4)) { + ux = 0.1 * 0.577; + uy = 0.0; + uz = 0.0; + 
} else { +#endif + ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE - + pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW; + uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN - + pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS; + uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS - + pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS; +#ifdef LID_DRIVEN_CAVITY + } +#endif + + dens = pdf_C + + pdf_N + pdf_E + pdf_S + pdf_W + + pdf_NE + pdf_SE + pdf_SW + pdf_NW + + pdf_T + pdf_TN + pdf_TE + pdf_TS + pdf_TW + + pdf_B + pdf_BN + pdf_BE + pdf_BS + pdf_BW; + + dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz)*3.0/2.0; + + // direction: w_0 + src[I(index, D3Q19_C) ] = pdf_C - omegaEven*(pdf_C - w_0*dir_indep_trm); + + // direction: w_1 + w_1_indep = w_1*dir_indep_trm; + + ui = uy; + evenPart = omegaEven*( 0.5*(pdf_N + pdf_S) - ui*ui*w_1_nine_half - w_1_indep ); + oddPart = omegaOdd*(0.5*(pdf_N - pdf_S) - ui*w_1_x3 ); + src[I(index, D3Q19_S)] = pdf_N - evenPart - oddPart; + src[I(index, D3Q19_N)] = pdf_S - evenPart + oddPart; + + ui = ux; + evenPart = omegaEven*( 0.5*(pdf_E + pdf_W) - ui*ui*w_1_nine_half - w_1_indep ); + oddPart = omegaOdd*(0.5*(pdf_E - pdf_W) - ui*w_1_x3 ); + src[I(index, D3Q19_W)] = pdf_E - evenPart - oddPart; + src[I(index, D3Q19_E)] = pdf_W - evenPart + oddPart; + + ui = uz; + evenPart = omegaEven*( 0.5*(pdf_T + pdf_B) - ui*ui*w_1_nine_half - w_1_indep ); + oddPart = omegaOdd*(0.5*(pdf_T - pdf_B) - ui*w_1_x3 ); + src[I(index, D3Q19_B)] = pdf_T - evenPart - oddPart; + src[I(index, D3Q19_T)] = pdf_B - evenPart + oddPart; + + // direction: w_2 + w_2_indep = w_2*dir_indep_trm; + + ui = -ux + uy; + evenPart = omegaEven*( 0.5*(pdf_NW + pdf_SE) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_NW - pdf_SE) - ui*w_2_x3 ); + src[I(index, D3Q19_SE)] = pdf_NW - evenPart - oddPart; + src[I(index, D3Q19_NW)] = pdf_SE - evenPart + oddPart; + + ui = ux + uy; + evenPart = omegaEven*( 0.5*(pdf_NE + pdf_SW) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_NE - 
pdf_SW) - ui*w_2_x3 ); + src[I(index, D3Q19_SW)] = pdf_NE - evenPart - oddPart; + src[I(index, D3Q19_NE)] = pdf_SW - evenPart + oddPart; + + ui = -ux + uz; + evenPart = omegaEven*( 0.5*(pdf_TW + pdf_BE) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TW - pdf_BE) - ui*w_2_x3 ); + src[I(index, D3Q19_BE)] = pdf_TW - evenPart - oddPart; + src[I(index, D3Q19_TW)] = pdf_BE - evenPart + oddPart; + + ui = ux + uz; + evenPart = omegaEven*( 0.5*(pdf_TE + pdf_BW) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TE - pdf_BW) - ui*w_2_x3 ); + src[I(index, D3Q19_BW)] = pdf_TE - evenPart - oddPart; + src[I(index, D3Q19_TE)] = pdf_BW - evenPart + oddPart; + + ui = -uy + uz; + evenPart = omegaEven*( 0.5*(pdf_TS + pdf_BN) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TS - pdf_BN) - ui*w_2_x3 ); + src[I(index, D3Q19_BN)] = pdf_TS - evenPart - oddPart; + src[I(index, D3Q19_TS)] = pdf_BN - evenPart + oddPart; + + ui = uy + uz; + evenPart = omegaEven*( 0.5*(pdf_TN + pdf_BS) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TN - pdf_BS) - ui*w_2_x3 ); + src[I(index, D3Q19_BS)] = pdf_TN - evenPart - oddPart; + src[I(index, D3Q19_TN)] = pdf_BS - evenPart + oddPart; + + } // (parallel) loop over fluid nodes + + X_LIKWID_STOP("list-aa-ria-even"); + + // save current iteration + kdl->Iteration = iter; + #ifdef VERIFICATION + kd->PdfsActive = src; + KernelAddBodyForce(kd, ld, cd); + #endif + + // -------------------------------------------------------------------- + // odd time step + // -------------------------------------------------------------------- + + X_LIKWID_START("list-aa-ria-odd"); + + #ifdef _OPENMP + #pragma omp parallel default(none) \ + shared(stderr, nFluid, nCells, kd, kdl, kdlr, adjList, omegaOdd, omegaEven, src, consecNodes, nConsecNodes) \ + private(ux, uy, uz, dens, adjListIndex, evenPart, oddPart, dir_indep_trm, w_1_indep, w_2_indep, ui,\ + pdf_C, \ + pdf_N, pdf_E, pdf_S, pdf_W, \ + pdf_NE, 
pdf_SE, pdf_SW, pdf_NW, \ + pdf_T, pdf_TN, pdf_TE, pdf_TS, pdf_TW, \ + pdf_B, pdf_BN, pdf_BE, pdf_BS, pdf_BW, \ + ppdf_C, \ + ppdf_N, ppdf_E, ppdf_S, ppdf_W, \ + ppdf_NE, ppdf_SE, ppdf_SW, ppdf_NW, \ + ppdf_T, ppdf_TN, ppdf_TE, ppdf_TS, ppdf_TW, \ + ppdf_B, ppdf_BN, ppdf_BE, ppdf_BS, ppdf_BW, \ + consecValue, consecIndex) + #endif + { + int threadId = 0; + + #ifdef _OPENMP + threadId = omp_get_thread_num(); + #endif + + consecIndex = kdlr->ConsecThreadIndices[threadId]; + consecValue = 0; + + int * threadIndices = kdlr->FluidNodeThreadIndices; + + int nFluidThread = threadIndices[threadId + 1] - threadIndices[threadId]; + + int indexStart = threadIndices[threadId]; + int indexStop = threadIndices[threadId] + nFluidThread; + + for (int index = indexStart; index < indexStop; ++index) { + + #define I(index, dir) P_INDEX_3((nCells), (index), (dir)) + +#if 1 + if (consecValue > 0) { + --consecValue; + // Increment all pdf pointers. + #define X(name, idx, idxinv, _x, _y, _z) ++JOIN(ppdf_,name); + D3Q19_LIST + #undef X + } + else { + Assert(consecIndex < nConsecNodes); + + consecValue = consecNodes[consecIndex] - 1; + // Load new pointers to PDFs of local cell: + + adjListIndex = index * N_D3Q19_IDX; + + #define X(name, idx, idxinv, _x, _y, _z) JOIN(ppdf_,name) = &(src[adjList[adjListIndex + idxinv]]); + D3Q19_LIST_WO_C + #undef X + + ppdf_C = &(src[P_INDEX_3(nCells, index, D3Q19_C)]); + ++consecIndex; + } + + #define X(name, idx, idxinv, _x, _y, _z) JOIN(pdf_,name) = *JOIN(ppdf_,name); + D3Q19_LIST + #undef X +#else + adjListIndex = index * N_D3Q19_IDX; + + // Load PDFs of local cell: pdf_N = src[adjList[adjListIndex + D3Q19_S]]; ... 
+ pdf_C = src[P_INDEX_3(nCells, index, D3Q19_C)]; + + #define X(name, idx, idxinv, _x, _y, _z) JOIN(ppdf_,name) = &(src[adjList[adjListIndex + idxinv]]); + D3Q19_LIST_WO_C + #undef X + + #define X(name, idx, idxinv, _x, _y, _z) JOIN(pdf_,name) = src[adjList[adjListIndex + idxinv]]; + D3Q19_LIST_WO_C + #undef X +#endif + +#ifdef LID_DRIVEN_CAVITY + int nX = kd->Dims[0]; + int nY = kd->Dims[1]; + int nZ = kd->Dims[2]; + + int x = kdl->Coords[C_INDEX_X(index)]; + int y = kdl->Coords[C_INDEX_Y(index)]; + int z = kdl->Coords[C_INDEX_Z(index)]; + + if (z == nZ - 4 && x > 3 && x < (nX - 4) && y > 3 && y < (nY - 4)) { + ux = 0.1 * 0.577; + uy = 0.0; + uz = 0.0; + } else { +#endif + ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE - + pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW; + uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN - + pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS; + uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS - + pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS; +#ifdef LID_DRIVEN_CAVITY + } +#endif + + dens = pdf_C + + pdf_N + pdf_E + pdf_S + pdf_W + + pdf_NE + pdf_SE + pdf_SW + pdf_NW + + pdf_T + pdf_TN + pdf_TE + pdf_TS + pdf_TW + + pdf_B + pdf_BN + pdf_BE + pdf_BS + pdf_BW; + + dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz)*3.0/2.0; + + adjListIndex = index * N_D3Q19_IDX; + + // direction: w_0 + src[I(index, D3Q19_C) ] = pdf_C - omegaEven*(pdf_C - w_0*dir_indep_trm); + + // direction: w_1 + w_1_indep = w_1*dir_indep_trm; + + ui = uy; + evenPart = omegaEven*( 0.5*(pdf_N + pdf_S) - ui*ui*w_1_nine_half - w_1_indep ); + oddPart = omegaOdd*(0.5*(pdf_N - pdf_S) - ui*w_1_x3 ); + *ppdf_S = pdf_N - evenPart - oddPart; + *ppdf_N = pdf_S - evenPart + oddPart; + + ui = ux; + evenPart = omegaEven*( 0.5*(pdf_E + pdf_W) - ui*ui*w_1_nine_half - w_1_indep ); + oddPart = omegaOdd*(0.5*(pdf_E - pdf_W) - ui*w_1_x3 ); + *ppdf_W = pdf_E - evenPart - oddPart; + *ppdf_E = pdf_W - evenPart + oddPart; + + ui = uz; + evenPart = omegaEven*( 0.5*(pdf_T + pdf_B) - ui*ui*w_1_nine_half - 
w_1_indep ); + oddPart = omegaOdd*(0.5*(pdf_T - pdf_B) - ui*w_1_x3 ); + *ppdf_B = pdf_T - evenPart - oddPart; + *ppdf_T = pdf_B - evenPart + oddPart; + + // direction: w_2 + w_2_indep = w_2*dir_indep_trm; + + ui = -ux + uy; + evenPart = omegaEven*( 0.5*(pdf_NW + pdf_SE) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_NW - pdf_SE) - ui*w_2_x3 ); + *ppdf_SE = pdf_NW - evenPart - oddPart; + *ppdf_NW = pdf_SE - evenPart + oddPart; + + ui = ux + uy; + evenPart = omegaEven*( 0.5*(pdf_NE + pdf_SW) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_NE - pdf_SW) - ui*w_2_x3 ); + *ppdf_SW = pdf_NE - evenPart - oddPart; + *ppdf_NE = pdf_SW - evenPart + oddPart; + + ui = -ux + uz; + evenPart = omegaEven*( 0.5*(pdf_TW + pdf_BE) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TW - pdf_BE) - ui*w_2_x3 ); + *ppdf_BE = pdf_TW - evenPart - oddPart; + *ppdf_TW = pdf_BE - evenPart + oddPart; + + ui = ux + uz; + evenPart = omegaEven*( 0.5*(pdf_TE + pdf_BW) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TE - pdf_BW) - ui*w_2_x3 ); + *ppdf_BW = pdf_TE - evenPart - oddPart; + *ppdf_TE = pdf_BW - evenPart + oddPart; + + ui = -uy + uz; + evenPart = omegaEven*( 0.5*(pdf_TS + pdf_BN) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TS - pdf_BN) - ui*w_2_x3 ); + *ppdf_BN = pdf_TS - evenPart - oddPart; + *ppdf_TS = pdf_BN - evenPart + oddPart; + + ui = uy + uz; + evenPart = omegaEven*( 0.5*(pdf_TN + pdf_BS) - ui*ui*w_2_nine_half - w_2_indep ); + oddPart = omegaOdd*(0.5*(pdf_TN - pdf_BS) - ui*w_2_x3 ); + *ppdf_BS = pdf_TN - evenPart - oddPart; + *ppdf_TN = pdf_BS - evenPart + oddPart; + + #undef I + } // loop over fluid nodes + } // end pragma omp parallel + + X_LIKWID_STOP("list-aa-ria-odd"); + + // save current iteration + kdl->Iteration = iter + 1; + + #ifdef VERIFICATION + kd->PdfsActive = src; + KernelAddBodyForce(kd, ld, cd); + #endif + + #ifdef VTK_OUTPUT + if (cd->VtkOutput && (iter % 
cd->VtkModulus) == 0) { + kd->PdfsActive = src; + VtkWrite(ld, kd, cd, iter); + } + #endif + + #ifdef STATISTICS + kd->PdfsActive = src; + KernelStatistics(kd, ld, cd, iter); + #endif + + + } // for (int iter = 0; ... + +#ifdef VTK_OUTPUT + if (cd->VtkOutput) { + kd->PdfsActive = src; + VtkWrite(ld, kd, cd, maxIterations); + } +#endif + +#ifdef STATISTICS + kd->PdfsActive = src; + KernelStatistics(kd, ld, cd, maxIterations); +#endif + + return; +} diff --git a/src/BenchKernelD3Q19ListAaRia.h b/src/BenchKernelD3Q19ListAaRia.h new file mode 100644 index 0000000..f7fa6dd --- /dev/null +++ b/src/BenchKernelD3Q19ListAaRia.h @@ -0,0 +1,38 @@ +// -------------------------------------------------------------------------- +// +// Copyright +// Markus Wittmann, 2016-2017 +// RRZE, University of Erlangen-Nuremberg, Germany +// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de +// +// Viktor Haag, 2016 +// LSS, University of Erlangen-Nuremberg, Germany +// +// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). +// +// LbmBenchKernels is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// LbmBenchKernels is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with LbmBenchKernels. If not, see . 
+//
+// --------------------------------------------------------------------------
+#ifndef __BENCH_KERNEL_D3Q19_LIST_AA_RIA__
+#define __BENCH_KERNEL_D3Q19_LIST_AA_RIA__
+
+#include "Kernel.h"
+
+void D3Q19ListAaRiaInit_PushSoA(LatticeDesc * ld, KernelData ** kernelData, Parameters * params);
+
+void D3Q19ListAaRiaDeinit_PushSoA(LatticeDesc * ld, KernelData ** kernelData);
+
+
+
+#endif // __BENCH_KERNEL_D3Q19_LIST_AA_RIA__
diff --git a/src/BenchKernelD3Q19ListAaRiaCommon.c b/src/BenchKernelD3Q19ListAaRiaCommon.c
new file mode 100644
index 0000000..7faf37b
--- /dev/null
+++ b/src/BenchKernelD3Q19ListAaRiaCommon.c
@@ -0,0 +1,804 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#include "BenchKernelD3Q19ListAaRiaCommon.h"
+
+#include "Memory.h"
+#include "Vtk.h"
+
+#include <math.h>
+
+#ifdef _OPENMP
+	#include <omp.h>
+#endif
+
+// Forward definition.
+void FNAME(D3Q19ListAaRiaKernel)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd);
+
+
+
+
+// -----------------------------------------------------------------------
+// Functions which are used as callback by the kernel to read or write
+// PDFs and nodes.
+
+// Reads the PDF of direction dir at node (x, y, z) from the active lattice
+// and stores it in *pdf.
+//
+// In even iterations the PDFs are stored inverse: a non-center PDF is
+// fetched through the adjacency list entry of the inverse direction, the
+// center PDF from the node's own slot. In odd iterations every PDF is read
+// from the node's own slot.
+static void FNAME(BCGetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT * pdf)
+{
+	Assert(kd != NULL);
+	Assert(kd->PdfsActive != NULL);
+	Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+	Assert(pdf != NULL);
+
+	Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
+	Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
+	Assert(dir >= 0); Assert(dir < N_D3Q19);
+
+	KernelDataList * kdl = (KernelDataList *)kd;
+
+	if (kdl->Iteration % 2 == 0) {
+		// Pdfs are stored inverse, local PDFs are located in remote nodes
+
+		uint32_t nodeIndex = KDL(kd)->Grid[L_INDEX_4(kd->Dims, x, y, z)];
+
+		if (dir != D3Q19_C) {
+			uint32_t adjListIndex = nodeIndex * N_D3Q19_IDX;
+
+			*pdf = kd->PdfsActive[KDL(kd)->AdjList[adjListIndex + D3Q19_INV[dir]]];
+		}
+		else {
+			*pdf = kd->PdfsActive[P_INDEX_3(KDL(kd)->nCells, nodeIndex, dir)];
+		}
+
+	}
+	else {
+		*pdf = kd->PdfsActive[P_INDEX_5(KDL(kd), x, y, z, dir)];
+	}
+
+
+	return;
+}
+
+// Writes pdf as the PDF of direction dir at node (x, y, z) into the active
+// lattice. Uses the same iteration dependent addressing as BCGetPdf.
+static void FNAME(BCSetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT pdf)
+{
+	Assert(kd != NULL);
+	Assert(kd->PdfsActive != NULL);
+	Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+	Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
+	Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
+	Assert(dir >= 0); Assert(dir < N_D3Q19);
+
+#if 0
+	if (isnan(pdf)) {
+		printf("ERROR: setting nan %d %d %d %d %s\n", x, y, z, dir, D3Q19_NAMES[dir]);
+		DEBUG_BREAK_POINT();
+		exit(1);
+	}
+#endif
+
+	KernelDataList * kdl = (KernelDataList *)kd;
+
+	if (kdl->Iteration % 2 == 0) {
+		// Pdfs are stored inverse, local PDFs are located in remote nodes
+
+		uint32_t nodeIndex = KDL(kd)->Grid[L_INDEX_4(kd->Dims, x, y, z)];
+
+		if (dir != D3Q19_C) {
+			uint32_t adjListIndex = nodeIndex * N_D3Q19_IDX;
+
+			kd->PdfsActive[KDL(kd)->AdjList[adjListIndex + D3Q19_INV[dir]]] = pdf;
+		}
+		else {
+			kd->PdfsActive[P_INDEX_3(KDL(kd)->nCells, nodeIndex, dir)] = pdf;
+		}
+
+	}
+	else {
+		kd->PdfsActive[P_INDEX_5(KDL(kd), x, y, z, dir)] = pdf;
+	}
+
+	return;
+}
+
+
+// Copies all 19 PDFs of node (x, y, z) from the active lattice into pdfs,
+// indexed by direction. Addressing depends on the iteration parity in the
+// same way as in BCGetPdf.
+static void GetNode(KernelData * kd, int x, int y, int z, PdfT * pdfs)
+{
+	Assert(kd != NULL);
+	Assert(kd->PdfsActive != NULL);
+	Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+	Assert(pdfs != NULL);
+	Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
+	Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
+
+	KernelDataList * kdl = (KernelDataList *)kd;
+
+	if(kdl->Iteration % 2 == 0){
+
+		uint32_t nodeIndex = kdl->Grid[L_INDEX_4(kdl->kd.Dims, x, y, z)];
+		uint32_t adjListIndex = nodeIndex * N_D3Q19_IDX;
+
+		// Load PDFs of local cell: pdf_N = src[adjList[adjListIndex + D3Q19_S]]; ...
+		pdfs[D3Q19_C] = kd->PdfsActive[P_INDEX_3(kdl->nCells, nodeIndex, D3Q19_C)];
+
+		#define X(name, idx, idxinv, _x, _y, _z) pdfs[idx] = kd->PdfsActive[kdl->AdjList[adjListIndex + idxinv]];
+		D3Q19_LIST_WO_C
+		#undef X
+
+	} else {
+
+		#define I(x, y, z, dir) P_INDEX_5(KDL(kd), (x), (y), (z), (dir))
+		#define X(name, idx, idxinv, _x, _y, _z) pdfs[idx] = kd->PdfsActive[I(x, y, z, idx)];
+		D3Q19_LIST
+		#undef X
+		#undef I
+
+	}
+
+#if 0
+	for (int d = 0; d < 19; ++d) {
+		if(isnan(pdfs[d]) || isinf(pdfs[d])) {
+			printf("%d %d %d %d nan! get node\n", x, y, z, d);
+			for (int d2 = 0; d2 < 19; ++d2) {
+				printf("%d: %e\n", d2, pdfs[d2]);
+			}
+			exit(1);
+		}
+	}
+#endif
+
+	return;
+}
+
+
+// Stores the 19 PDFs given in pdfs (indexed by direction) as node (x, y, z)
+// of the active lattice. Counterpart of GetNode with identical addressing.
+static void SetNode(KernelData * kd, int x, int y, int z, PdfT * pdfs)
+{
+	Assert(kd != NULL);
+	Assert(kd->PdfsActive != NULL);
+	Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+	Assert(pdfs != NULL);
+
+	Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
+	Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
+
+#if 0
+	for (int d = 0; d < 19; ++d) {
+		if(isnan(pdfs[d])) {
+			printf("%d %d %d %d nan! set node\n", x, y, z, d);
+			for (int d2 = 0; d2 < 19; ++d2) {
+				printf("%d: %e\n", d2, pdfs[d2]);
+			}
+			exit(1);
+		}
+	}
+#endif
+
+	KernelDataList * kdl = (KernelDataList *)kd;
+
+	if(kdl->Iteration % 2 == 0){
+
+		uint32_t nodeIndex = kdl->Grid[L_INDEX_4(kdl->kd.Dims, x, y, z)];
+		uint32_t adjListIndex = nodeIndex * N_D3Q19_IDX;
+
+		// Store PDFs of local cell: dst[adjList[adjListIndex + D3Q19_S]] = pdf_N; ...
+		kd->PdfsActive[P_INDEX_3(kdl->nCells, nodeIndex, D3Q19_C)] = pdfs[D3Q19_C];
+
+		#define X(name, idx, idxinv, _x, _y, _z) kd->PdfsActive[kdl->AdjList[adjListIndex + idxinv]] = pdfs[idx];
+		D3Q19_LIST_WO_C
+		#undef X
+
+	} else {
+
+		#define I(x, y, z, dir) P_INDEX_5(KDL(kd), (x), (y), (z), (dir))
+		#define X(name, idx, idxinv, _x, _y, _z) kd->PdfsActive[I(x, y, z, idx)] = pdfs[idx];
+		D3Q19_LIST
+		#undef X
+		#undef I
+
+	}
+
+	return;
+}
+
+// Prints the kernel specific command line parameters.
+static void ParameterUsage(void)
+{
+	printf("Kernel parameters:\n");
+	printf("  [-blk <n>] [-blk-[xyz] <n>]\n");
+
+	return;
+}
+
+// Parses the argument following option params->KernelArgs[*i] as a blocking
+// factor and advances *i to that argument.
+//
+// Terminates the program if the argument is missing, is not a complete
+// integer literal (trailing characters are rejected), is <= 0, or does not
+// fit into an int.
+static int ParseBlockSize(Parameters * params, int * i)
+{
+	if (*i + 1 >= params->nKernelArgs) {
+		printf("ERROR: argument %s requires a parameter.\n", params->KernelArgs[*i]);
+		exit(1);
+	}
+
+	const char * arg = params->KernelArgs[++(*i)];
+	char * end = NULL;
+	long value = strtol(arg, &end, 0);
+
+	if (end == arg || *end != '\0') {
+		printf("ERROR: blocking parameter '%s' is not a valid integer.\n", arg);
+		exit(1);
+	}
+
+	// The round trip through int rejects values which would be truncated.
+	if (value <= 0 || value != (long)(int)value) {
+		printf("ERROR: blocking parameter must be > 0.\n");
+		exit(1);
+	}
+
+	return (int)value;
+}
+
+// Parses the kernel arguments in params and stores the blocking factors for
+// x, y, and z in blk[0..2]. A factor which is not specified is set to 0.
+// Unknown arguments and -h/-help/--help print the usage and terminate the
+// program.
+static void ParseParameters(Parameters * params, int * blk)
+{
+	Assert(blk != NULL);
+
+	blk[0] = 0; blk[1] = 0; blk[2] = 0;
+
+	#define ARG_IS(param)                   (!strcmp(params->KernelArgs[i], param))
+
+	for (int i = 0; i < params->nKernelArgs; ++i) {
+		if (ARG_IS("-blk") || ARG_IS("--blk")) {
+			blk[0] = blk[1] = blk[2] = ParseBlockSize(params, &i);
+		}
+		else if (ARG_IS("-blk-x") || ARG_IS("--blk-x")) {
+			blk[0] = ParseBlockSize(params, &i);
+		}
+		else if (ARG_IS("-blk-y") || ARG_IS("--blk-y")) {
+			blk[1] = ParseBlockSize(params, &i);
+		}
+		else if (ARG_IS("-blk-z") || ARG_IS("--blk-z")) {
+			blk[2] = ParseBlockSize(params, &i);
+		}
+		else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) {
+			ParameterUsage();
+			exit(1);
+		}
+		else {
+			printf("ERROR: unknown kernel parameter.\n");
+			ParameterUsage();
+			exit(1);
+		}
+	}
+
+	#undef ARG_IS
+
+	return;
+}
+
+// Returns 1 if node index has the same access pattern as node index - 1,
+// i.e. every adjacency list entry of node index is exactly one larger than
+// the corresponding entry of its predecessor. Returns 0 otherwise.
+static int SharesAccessPattern(const uint32_t * adjList, int index)
+{
+	// Loop over all directions except the center one.
+	for(int d = 0; d < N_D3Q19 - 1; ++d) {
+		Assert(d != D3Q19_C);
+
+		if (adjList[index * N_D3Q19_IDX + d] != adjList[(index - 1) * N_D3Q19_IDX + d] + 1) {
+			// Different access pattern.
+			return 0;
+		}
+	}
+
+	return 1;
+}
+
+// Builds the run length information used by the kernel:
+// - ConsecNodes[i] holds the number of fluid nodes in the i-th run of nodes
+//   sharing the same access pattern. A run never crosses a thread boundary.
+// - ConsecThreadIndices[t] holds the index into ConsecNodes of the first run
+//   of thread t.
+// - FluidNodeThreadIndices[t] holds the index of the first fluid node of
+//   thread t, entry nThreads holds nFluid.
+// All three arrays are allocated here and released in the Deinit function.
+static void SetupConsecNodes(LatticeDesc * ld, KernelDataListRia * kdlr, int nThreads)
+{
+	Assert(ld != NULL);
+	Assert(kdlr != NULL);
+	Assert(nThreads > 0);
+
+	uint32_t * adjList = kdlr->kdl.AdjList;
+
+	uint32_t nConsecNodes = 0;
+	uint32_t consecIndex = 0;
+
+	int nFluid = kdlr->kdl.nFluid;
+
+	uint32_t * consecThreadIndices;
+	int * fluidNodeThreadIndices;
+
+	// Allocated via MemAlloc as both arrays are released via MemFree.
+	if (MemAlloc((void **)&consecThreadIndices, sizeof(uint32_t) * (nThreads + 1))) {
+		printf("ERROR: allocating consecThreadIndices array failed: %zu bytes.\n", sizeof(uint32_t) * (nThreads + 1));
+		exit(1);
+	}
+
+	if (MemAlloc((void **)&fluidNodeThreadIndices, sizeof(int) * (nThreads + 1))) {
+		printf("ERROR: allocating fluidNodeThreadIndices array failed: %zu bytes.\n", sizeof(int) * (nThreads + 1));
+		exit(1);
+	}
+
+	int nNodesPerThread = nFluid / nThreads;
+
+	// Distribute the fluid nodes evenly, the first (nFluid % nThreads) threads
+	// get one node more.
+	for (int i = 0; i < nThreads; ++i) {
+		consecThreadIndices[i] = i * nNodesPerThread + MinI(i, nFluid % nThreads);
+		fluidNodeThreadIndices[i] = consecThreadIndices[i];
+	}
+	// Sentinel which never matches a fluid node index.
+	consecThreadIndices[nThreads] = UINT32_MAX;
+	fluidNodeThreadIndices[nThreads] = nFluid;
+
+	int indexThread = 1;
+
+	// We execute following code two times.
+	// - The first time to get the count of how many entries we need for the
+	//   consecNodes array.
+	// - The second time to fill the array.
+
+	// Loop over adjacency list of all nodes.
+	// Compare if adjacent nodes share the same access pattern.
+	for (int index = 1; index < nFluid; ++index) {
+
+		int different = !SharesAccessPattern(adjList, index);
+
+		if (consecThreadIndices[indexThread] == (uint32_t)index) {
+			// We are at a thread boundary. Starting from this index the fluids
+			// belong to another thread. Force a break, if nodes are consecutive.
+			++indexThread;
+			different = 1;
+		}
+
+		if (different) {
+			++consecIndex;
+		}
+	}
+
+	if (nFluid > 0) {
+		nConsecNodes = consecIndex + 1;
+	}
+
+	uint32_t * consecNodes;
+
+	if (MemAlloc((void **)&consecNodes, sizeof(uint32_t) * nConsecNodes)) {
+		printf("ERROR: allocating consecNodes array failed: %zu bytes.\n", sizeof(uint32_t) * nConsecNodes);
+		exit(1);
+	}
+
+	consecIndex = 0;
+
+	if (nFluid > 0) {
+		consecNodes[consecIndex] = 1;
+	}
+
+	indexThread = 1;
+	consecThreadIndices[0] = 0;
+
+	// Loop over adjacency list of all nodes.
+	// Compare if adjacent nodes share the same access pattern.
+	for (int index = 1; index < nFluid; ++index) {
+
+		int different = !SharesAccessPattern(adjList, index);
+
+		if (consecThreadIndices[indexThread] == (uint32_t)index) {
+			// We are at a thread boundary. Starting from this index the fluids
+			// belong to another thread. Force a break, if nodes are consecutive.
+			// The fluid node index is replaced by the index of the run which
+			// starts here.
+			consecThreadIndices[indexThread] = consecIndex + 1;
+			++indexThread;
+			different = 1;
+		}
+
+		if (different) {
+			++consecIndex;
+			Assert(consecIndex < nConsecNodes);
+			consecNodes[consecIndex] = 1;
+		}
+		else {
+			Assert(consecIndex < nConsecNodes);
+			consecNodes[consecIndex] += 1;
+		}
+	}
+
+
+	kdlr->ConsecNodes = consecNodes;
+	kdlr->nConsecNodes = nConsecNodes;
+
+	kdlr->ConsecThreadIndices  = consecThreadIndices;
+	kdlr->nConsecThreadIndices = nThreads;
+
+	kdlr->FluidNodeThreadIndices  = fluidNodeThreadIndices;
+	kdlr->nFluidNodeThreadIndices = nThreads;
+
+	printf("# total fluid nodes: %d consecutive blocks: %u\n", nFluid, nConsecNodes);
+
+	return;
+}
+
+// Maps coordinate n, which may lie one cell outside of [0, size), back into
+// the lattice for a periodic boundary: -1 wraps to size - 1, size wraps to 0.
+static int WrapPeriodic(int n, int size)
+{
+	if (n < 0) {
+		return size - 1;
+	}
+	else if (n >= size) {
+		return 0;
+	}
+
+	return n;
+}
+
+// Allocates and initializes the kernel data for the list based AA pattern
+// kernel with run length coding:
+// - parses the blocking parameters,
+// - allocates one lattice (the AA pattern works in place) and touches it in
+//   parallel for NUMA placement,
+// - numbers the fluid nodes block wise (Grid, Coords),
+// - builds the adjacency list and the run length information,
+// - registers the node/PDF access callbacks and the kernel function.
+// The created structure is returned via *kernelData and must be released
+// with the matching Deinit function.
+//
+// NOTE(review): kdl->Iteration is read by the access callbacks but is not
+// set here - confirm it is initialized before the callbacks are used.
+void FNAME(D3Q19ListAaRiaInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
+{
+	KernelData * kd;
+	KernelDataList * kdl;
+	KernelDataListRia * kdlr;
+
+	if (MemAlloc((void **)&kdlr, sizeof(KernelDataListRia))) {
+		printf("ERROR: allocating kernel data failed: %zu bytes.\n", sizeof(KernelDataListRia));
+		exit(1);
+	}
+
+	kd = (KernelData *)kdlr;
+	kdl = KDL(kdlr);
+
+	*kernelData = kd;
+
+#ifdef DEBUG
+	kd->Pdfs[0] = NULL;
+	kd->Pdfs[1] = NULL;
+	kd->PdfsActive = NULL;
+	kd->DstPdfs = NULL;
+	kd->SrcPdfs = NULL;
+	kd->Dims[0] = -1;
+	kd->Dims[1] = -1;
+	kd->Dims[2] = -1;
+	kd->GlobalDims[0] = -1;
+	kd->GlobalDims[1] = -1;
+	kd->GlobalDims[2] = -1;
+	kd->Offsets[0] = -1;
+	kd->Offsets[1] = -1;
+	kd->Offsets[2] = -1;
+
+	kd->ObstIndices = NULL;
+	kd->nObstIndices = -1;
+	kd->BounceBackPdfsSrc = NULL;
+	kd->BounceBackPdfsDst = NULL;
+	kd->nBounceBackPdfs = -1;
+
+	kdl->AdjList = NULL;
+	kdl->Coords = NULL;
+	kdl->Grid = NULL;
+	kdl->nCells = -1;
+	kdl->nFluid = -1;
+
+	kdlr->ConsecNodes = NULL;
+	kdlr->nConsecNodes = 0;
+	kdlr->ConsecThreadIndices = NULL;
+	kdlr->nConsecThreadIndices = 0;
+	kdlr->FluidNodeThreadIndices = NULL;
+	kdlr->nFluidNodeThreadIndices = 0;
+#endif
+
+	// Adjust the dimensions according to padding, if used.
+	kd->Dims[0] = kd->GlobalDims[0] = ld->Dims[0];
+	kd->Dims[1] = kd->GlobalDims[1] = ld->Dims[1];
+	kd->Dims[2] = kd->GlobalDims[2] = ld->Dims[2];
+
+	int * lDims = ld->Dims;
+
+	int lX = lDims[0];
+	int lY = lDims[1];
+	int lZ = lDims[2];
+
+	// Computed in size_t as the product can exceed the range of int.
+	size_t nTotalCells = (size_t)lX * lY * lZ;
+	int nCells = ld->nFluid; // TODO: + padding
+	int nFluid = ld->nFluid;
+
+	kdl->nCells = nCells;
+	kdl->nFluid = nFluid;
+
+	PdfT * pdfs[2];
+
+	int blk[3] = { 0 };
+
+	ParseParameters(params, blk);
+
+	if (blk[0] == 0) blk[0] = lX;
+	if (blk[1] == 0) blk[1] = lY;
+	if (blk[2] == 0) blk[2] = lZ;
+
+	printf("# blocking x: %3d y: %3d z: %3d\n", blk[0], blk[1], blk[2]);
+
+	double latMiB      = nCells * sizeof(PdfT) * N_D3Q19 / 1024.0 / 1024.0;
+	double latFluidMib = nFluid * sizeof(PdfT) * N_D3Q19 / 1024.0 / 1024.0;
+	double latPadMib   = (nCells - nFluid) * sizeof(PdfT) * N_D3Q19 / 1024.0 / 1024.0;
+
+	printf("# lattice size: %e MiB\n", latMiB);
+	printf("# fluid lattice size: %e MiB\n", latFluidMib);
+	printf("# lattice padding: %e MiB\n", latPadMib);
+
+#define PAGE_4K		4096
+
+	printf("# aligning lattices to: %d b\n", PAGE_4K);
+
+	// NOTE(review): result is not checked, presumably MemAllocAligned reports
+	// failures like MemAlloc - confirm and add error handling.
+	MemAllocAligned((void **)&pdfs[0], sizeof(PdfT) * nCells * N_D3Q19, PAGE_4K);
+
+	// Only one lattice is allocated. The second slot and the source pointer
+	// are set to NULL so they never hold indeterminate values.
+	kd->Pdfs[0] = pdfs[0];
+	kd->Pdfs[1] = NULL;
+	kd->SrcPdfs = NULL;
+
+	// Initialize PDFs with some (arbitrary) data for correct NUMA placement.
+	// Here we touch only the fluid nodes as this loop is OpenMP parallel and
+	// we want the same scheduling as in the kernel.
+	#ifdef _OPENMP
+		#pragma omp parallel for
+	#endif
+	for (int i = 0; i < nFluid; ++i) { for(int d = 0; d < N_D3Q19; ++d) {
+		pdfs[0][P_INDEX_3(nCells, i, d)] = 1.0;
+	} }
+
+	// Initialize all PDFs to some standard value.
+	for (int i = 0; i < nFluid; ++i) { for(int d = 0; d < N_D3Q19; ++d) {
+		pdfs[0][P_INDEX_3(nCells, i, d)] = 0.0;
+	} }
+
+	// ----------------------------------------------------------------------
+	// create grid which will hold the index numbers of the fluid nodes
+
+	uint32_t * grid;
+
+	if (MemAlloc((void **)&grid, nTotalCells * sizeof(uint32_t))) {
+		printf("ERROR: allocating grid for numbering failed: %zu bytes.\n", nTotalCells * sizeof(uint32_t));
+		exit(1);
+	}
+	kdl->Grid = grid;
+
+	int latticeIndex;
+
+#ifdef DEBUG
+	for(int z = 0; z < lZ; ++z) {
+		for(int y = 0; y < lY; ++y) {
+			for(int x = 0; x < lX; ++x) {
+
+				latticeIndex = L_INDEX_4(ld->Dims, x, y, z);
+
+				grid[latticeIndex] = ~0;
+			}
+		}
+	}
+#endif
+
+	// ----------------------------------------------------------------------
+	// generate numbering over grid
+
+	uint32_t * coords;
+
+	if (MemAlloc((void **)&coords, nFluid * sizeof(uint32_t) * 3)) {
+		printf("ERROR: allocating coords array failed: %zu bytes.\n", nFluid * sizeof(uint32_t) * 3);
+		exit(1);
+	}
+
+	kdl->Coords = coords;
+
+	// Index for the PDF nodes can start at 0 as we distinguish solid and fluid nodes
+	// through the ld->Lattice array.
+	int counter = 0;
+
+	// Blocking is implemented via setup of the adjacency list. The kernel later will
+	// walk through the lattice blocked automatically.
+	for (int bZ = 0; bZ < lZ; bZ += blk[2]) {
+	for (int bY = 0; bY < lY; bY += blk[1]) {
+	for (int bX = 0; bX < lX; bX += blk[0]) {
+
+		int eX = MIN(bX + blk[0], lX);
+		int eY = MIN(bY + blk[1], lY);
+		int eZ = MIN(bZ + blk[2], lZ);
+
+
+		for (int z = bZ; z < eZ; ++z) {
+		for (int y = bY; y < eY; ++y) {
+		for (int x = bX; x < eX; ++x) {
+
+			latticeIndex = L_INDEX_4(lDims, x, y, z);
+
+			if (ld->Lattice[latticeIndex] != LAT_CELL_OBSTACLE) {
+				grid[latticeIndex] = counter;
+
+				coords[C_INDEX_X(counter)] = x;
+				coords[C_INDEX_Y(counter)] = y;
+				coords[C_INDEX_Z(counter)] = z;
+
+				++counter;
+			}
+		} } }
+	} } }
+
+	Verify(counter == nFluid);
+
+	uint32_t * adjList;
+
+	double indexMib = nFluid * sizeof(uint32_t) * N_D3Q19_IDX / 1024.0 / 1024.0;
+
+	printf("# index size: %e MiB\n", indexMib);
+
+	// AdjList only requires 18 instead of 19 entries per node, as
+	// the center PDF needs no addressing.
+	if (MemAlloc((void **)&adjList, nFluid * sizeof(uint32_t) * N_D3Q19_IDX)) {
+		printf("ERROR: allocating adjList array failed: %zu bytes.\n", nFluid * sizeof(uint32_t) * N_D3Q19_IDX);
+		exit(1);
+	}
+
+	kdl->AdjList = adjList;
+
+	int x, y, z;
+
+	uint32_t neighborIndex;
+	uint32_t dstIndex;
+
+	int nx, ny, nz, px, py, pz;
+
+	// Loop over all fluid nodes and compute the indices to the neighboring
+	// PDFs for configured data layout (AoS/SoA).
+	// TODO: Parallelized loop to ensure correct NUMA placement.
+	// #ifdef _OPENMP  --> add line continuation
+	// #pragma omp parallel for default(none)
+	// 	shared(nFluid, nCells, coords, D3Q19_INV, D3Q19_X, D3Q19_Y, D3Q19_Z,
+	// 			stderr,
+	// 			lDims, grid, ld, lX, lY, lZ, adjList)
+	// 	private(x, y, z, nx, ny, nz, neighborIndex, dstIndex)
+	// #endif
+	for (int index = 0; index < nFluid; ++index) {
+		x = coords[C_INDEX_X(index)];
+		y = coords[C_INDEX_Y(index)];
+		z = coords[C_INDEX_Z(index)];
+
+		Assert(x >= 0 && x < lX);
+		Assert(y >= 0 && y < lY);
+		Assert(z >= 0 && z < lZ);
+
+		Assert(ld->Lattice[L_INDEX_4(lDims, x, y, z)] != LAT_CELL_OBSTACLE);
+
+		// Loop over all directions except the center one.
+		for(int d = 0; d < N_D3Q19 - 1; ++d) {
+			Assert(d != D3Q19_C);
+
+#if defined(PROP_MODEL_PUSH)
+			nx = x + D3Q19_X[d];
+			ny = y + D3Q19_Y[d];
+			nz = z + D3Q19_Z[d];
+
+#elif defined(PROP_MODEL_PULL)
+			nx = x - D3Q19_X[d];
+			ny = y - D3Q19_Y[d];
+			nz = z - D3Q19_Z[d];
+#else
+	#error No implementation for this PROP_MODEL_NAME.
+#endif
+			// If the neighbor is outside the lattice in a direction with a
+			// periodic boundary then we need to wrap around.
+			// NOTE(review): once one periodic direction is crossed all three
+			// coordinates are wrapped, also the ones of non-periodic
+			// directions - confirm this is intended.
+			if ( 	((nx < 0 || nx >= lX) && ld->PeriodicX) ||
+					((ny < 0 || ny >= lY) && ld->PeriodicY) ||
+					((nz < 0 || nz >= lZ) && ld->PeriodicZ)
+			){
+				px = WrapPeriodic(nx, lX);
+				py = WrapPeriodic(ny, lY);
+				pz = WrapPeriodic(nz, lZ);
+
+				if (ld->Lattice[L_INDEX_4(lDims, px, py, pz)] == LAT_CELL_OBSTACLE) {
+					// Bounce back: address the inverse PDF of the node itself.
+					dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
+				}
+				else {
+					neighborIndex = grid[L_INDEX_4(lDims, px, py, pz)];
+
+					AssertMsg(neighborIndex != ~0, "Neighbor has no Index. (%d %d %d) direction %s (%d)\n", px, py, pz, D3Q19_NAMES[d], d);
+
+					dstIndex = P_INDEX_3(nCells, neighborIndex, d);
+				}
+			}
+			else if (nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) {
+				// Neighbor is outside of a non-periodic lattice.
+				dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
+			}
+			else if (ld->Lattice[L_INDEX_4(lDims, nx, ny, nz)] == LAT_CELL_OBSTACLE) {
+				dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
+			}
+			else {
+				neighborIndex = grid[L_INDEX_4(lDims, nx, ny, nz)];
+
+				Assert(neighborIndex != ~0);
+
+				dstIndex = P_INDEX_3(nCells, neighborIndex, d);
+			}
+
+			// dstIndex is unsigned, only the upper bound needs a check.
+			Assert(dstIndex < nCells * N_D3Q19);
+
+			adjList[index * N_D3Q19_IDX + d] = dstIndex;
+		}
+	}
+
+	int nThreads = 1;
+
+#ifdef _OPENMP
+	nThreads = omp_get_max_threads();
+#endif
+
+	SetupConsecNodes(ld, KDLR(kd), nThreads);
+
+	double loopBalanceEven = 2.0 * 19 * sizeof(PdfT);
+	double loopBalanceOdd  = 2.0 * 19 * sizeof(PdfT) + (double)kdlr->nConsecNodes / nFluid * (18 * 4.0 + 4.0);
+	double loopBalance     = (loopBalanceEven + loopBalanceOdd) / 2.0;
+
+	printf("# loop balance: %.2f B/FLUP even: %.2f B/FLUP odd %.2f B/FLUP\n",
+		loopBalance, loopBalanceEven, loopBalanceOdd);
+
+	// Fill remaining KernelData structures
+	kd->GetNode = GetNode;
+	kd->SetNode = SetNode;
+
+	kd->BoundaryConditionsGetPdf = FNAME(BCGetPdf);
+	kd->BoundaryConditionsSetPdf = FNAME(BCSetPdf);
+
+	kd->Kernel = FNAME(D3Q19ListAaRiaKernel);
+
+	kd->DstPdfs = NULL;
+	kd->PdfsActive = kd->Pdfs[0];
+
+	return;
+}
+
+// Releases everything allocated by the Init function: the run length
+// arrays, adjacency list, coordinates, grid, the lattice, and finally the
+// kernel data structure itself.
+void FNAME(D3Q19ListAaRiaDeinit)(LatticeDesc * ld, KernelData ** kernelData)
+{
+	KernelDataListRia ** kdlr = (KernelDataListRia **)kernelData;
+
+	if ((*kdlr)->ConsecNodes != NULL) {
+		MemFree((void **)&((*kdlr)->ConsecNodes));
+	}
+
+	if ((*kdlr)->ConsecThreadIndices != NULL) {
+		MemFree((void **)&((*kdlr)->ConsecThreadIndices));
+	}
+
+	if ((*kdlr)->FluidNodeThreadIndices != NULL) {
+		MemFree((void **)&((*kdlr)->FluidNodeThreadIndices));
+	}
+
+	KernelDataList ** kdl = (KernelDataList **)kernelData;
+
+	MemFree((void **)&((*kdl)->AdjList));
+	MemFree((void **)&((*kdl)->Coords));
+	MemFree((void **)&((*kdl)->Grid));
+
+	MemFree((void **)&((*kernelData)->Pdfs[0]));
+
+	MemFree((void **)kernelData);
+	return;
+}
+
diff --git a/src/BenchKernelD3Q19ListAaRiaCommon.h b/src/BenchKernelD3Q19ListAaRiaCommon.h
new file mode 100644
index 0000000..e127e59
--- /dev/null
+++ b/src/BenchKernelD3Q19ListAaRiaCommon.h
@@ -0,0 +1,61 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#ifndef __BENCH_KERNEL_D3Q19_LIST_AA_RIA_COMMON_H__
+#define __BENCH_KERNEL_D3Q19_LIST_AA_RIA_COMMON_H__
+
+#if !defined(DATA_LAYOUT_SOA)
+	#error List AA Ria works only with DATA_LAYOUT_SOA
+#endif
+
+#include "BenchKernelD3Q19ListAaCommon.h"
+
+typedef struct KernelDataListRia_ {
+	KernelDataList kdl;
+
+	// Array contains information of how many adjacent nodes share the same access pattern.
+	uint32_t * ConsecNodes;
+	uint32_t nConsecNodes;  // Number of entries in ConsecNodes array.
+
+	// Array contains (for each thread) an index into ConsecNodes.
+	uint32_t * ConsecThreadIndices;
+	// Number of entries in ConsecThreadIndices.
+	uint32_t nConsecThreadIndices;
+
+	// Array contains fluid node indices for each thread where to start in the
+	// vector of fluid nodes.
+	int * FluidNodeThreadIndices;
+	// Number of entries in FluidNodeThreadIndices.
+	int nFluidNodeThreadIndices;
+
+} KernelDataListRia;
+
+// Macro for casting KernelData * to KernelDataListRia *.
+#define KDLR(_x_)	((KernelDataListRia *)(_x_))
+
+
+#endif // __BENCH_KERNEL_D3Q19_LIST_AA_RIA_COMMON_H__
+
diff --git a/src/BenchKernelD3Q19ListCommon.c b/src/BenchKernelD3Q19ListCommon.c
new file mode 100644
index 0000000..4f97bae
--- /dev/null
+++ b/src/BenchKernelD3Q19ListCommon.c
@@ -0,0 +1,609 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#include "BenchKernelD3Q19ListCommon.h"
+
+#include "Memory.h"
+#include "Vtk.h"
+
+#include <math.h>
+
+
+// Forward definition.
+void FNAME(D3Q19ListKernel)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd);
+
+
+
+
+// -----------------------------------------------------------------------
+// Functions which are used as callback by the kernel to read or write
+// PDFs and nodes.
+
+// Reads the PDF of direction dir relevant for node (x, y, z) from the active
+// lattice and stores it in *pdf.
+//
+// With the push model this is the PDF stored at the node itself. With the
+// pull model it is the remote PDF addressed through the adjacency list,
+// except for the center PDF which is always local.
+static void FNAME(BCGetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT * pdf)
+{
+	Assert(kd != NULL);
+	Assert(kd->PdfsActive != NULL);
+	Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+	Assert(pdf != NULL);
+
+	Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
+	Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
+	Assert(dir >= 0); Assert(dir < N_D3Q19);
+
+#if defined(PROP_MODEL_PUSH)
+	*pdf = kd->PdfsActive[P_INDEX_5(KDL(kd), x, y, z, dir)];
+#elif defined(PROP_MODEL_PULL)
+
+	// The relevant PDFs here are the ones, which will get streamed in later
+	// during propagation. So we must return the *remote* PDFs.
+	uint32_t nodeIndex = KDL(kd)->Grid[L_INDEX_4(kd->Dims, x, y, z)];
+
+	if (dir != D3Q19_C) {
+
+		uint32_t adjListIndex = nodeIndex * N_D3Q19_IDX;
+
+		*pdf = kd->PdfsActive[KDL(kd)->AdjList[adjListIndex + dir]];
+	}
+	else {
+		*pdf = kd->PdfsActive[P_INDEX_3(KDL(kd)->nCells, nodeIndex, dir)];
+
+	}
+#else
+	// Without this the function would silently leave *pdf untouched.
+	#error No implementation for this PROP_MODEL_NAME.
+#endif
+
+	return;
+}
+
+// Writes pdf as the PDF of direction dir relevant for node (x, y, z) into
+// the active lattice. Uses the same addressing as BCGetPdf.
+static void FNAME(BCSetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT pdf)
+{
+	Assert(kd != NULL);
+	Assert(kd->PdfsActive != NULL);
+	Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+	Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
+	Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
+	Assert(dir >= 0); Assert(dir < N_D3Q19);
+
+#if 0
+	if (isnan(pdf)) {
+		printf("ERROR: setting nan %d %d %d %d %s\n", x, y, z, dir, D3Q19_NAMES[dir]);
+		DEBUG_BREAK_POINT();
+		exit(1);
+	}
+#endif
+
+#if defined(PROP_MODEL_PUSH)
+	kd->PdfsActive[P_INDEX_5(KDL(kd), x, y, z, dir)] = pdf;
+#elif defined(PROP_MODEL_PULL)
+
+	// The relevant PDFs here are the ones, which will get streamed in later
+	// during propagation. So we must set this *remote* PDFs.
+	uint32_t nodeIndex = KDL(kd)->Grid[L_INDEX_4(kd->Dims, x, y, z)];
+
+	if (dir != D3Q19_C) {
+
+		uint32_t adjListIndex = nodeIndex * N_D3Q19_IDX;
+
+		kd->PdfsActive[KDL(kd)->AdjList[adjListIndex + dir]] = pdf;
+	}
+	else {
+		kd->PdfsActive[P_INDEX_3(KDL(kd)->nCells, nodeIndex, dir)] = pdf;
+
+	}
+#else
+	// Without this the function would silently drop the value.
+	#error No implementation for this PROP_MODEL_NAME.
+#endif
+
+	return;
+}
+
+
+// Copies all 19 PDFs stored at node (x, y, z) of the active lattice into
+// pdfs, indexed by direction.
+static void GetNode(KernelData * kd, int x, int y, int z, PdfT * pdfs)
+{
+	Assert(kd != NULL);
+	Assert(kd->PdfsActive != NULL);
+	Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+	Assert(pdfs != NULL);
+	Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
+	Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
+
+	// TODO: pull scheme?
+	#define I(x, y, z, dir)   P_INDEX_5(KDL(kd), (x), (y), (z), (dir))
+	#define X(name, idx, idxinv, _x, _y, _z)	pdfs[idx] = kd->PdfsActive[I(x, y, z, idx)];
+	D3Q19_LIST
+	#undef X
+	#undef I
+
+#if 0
+	for (int d = 0; d < 19; ++d) {
+		if(isnan(pdfs[d]) || isinf(pdfs[d])) {
+			printf("%d %d %d %d nan! get node\n", x, y, z, d);
+			for (int d2 = 0; d2 < 19; ++d2) {
+				printf("%d: %e\n", d2, pdfs[d2]);
+			}
+			exit(1);
+		}
+	}
+#endif
+	return;
+}
+
+
+// Stores the 19 PDFs given in pdfs (indexed by direction) at node (x, y, z)
+// of the active lattice.
+static void SetNode(KernelData * kd, int x, int y, int z, PdfT * pdfs)
+{
+	Assert(kd != NULL);
+	Assert(kd->PdfsActive != NULL);
+	Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]);
+	Assert(pdfs != NULL);
+
+	Assert(x >= 0); Assert(y >= 0); Assert(z >= 0);
+	Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]);
+
+#if 0
+	for (int d = 0; d < 19; ++d) {
+		if(isnan(pdfs[d])) {
+			printf("%d %d %d %d nan! set node\n", x, y, z, d);
+			for (int d2 = 0; d2 < 19; ++d2) {
+				printf("%d: %e\n", d2, pdfs[d2]);
+			}
+			exit(1);
+		}
+	}
+#endif
+
+	// TODO: pull scheme?
+	#define I(x, y, z, dir)   P_INDEX_5(KDL(kd), (x), (y), (z), (dir))
+	#define X(name, idx, idxinv, _x, _y, _z)	kd->PdfsActive[I(x, y, z, idx)] = pdfs[idx];
+	D3Q19_LIST
+	#undef X
+	#undef I
+
+	return;
+}
+
+// Prints the kernel specific command line parameters.
+static void ParameterUsage(void)
+{
+	printf("Kernel parameters:\n");
+	printf("  [-blk <n>] [-blk-[xyz] <n>]\n");
+
+	return;
+}
+
+// Parses the argument following option params->KernelArgs[*i] as a blocking
+// factor and advances *i to that argument.
+//
+// Terminates the program if the argument is missing, is not a complete
+// integer literal (trailing characters are rejected), is <= 0, or does not
+// fit into an int.
+static int ParseBlockSize(Parameters * params, int * i)
+{
+	if (*i + 1 >= params->nKernelArgs) {
+		printf("ERROR: argument %s requires a parameter.\n", params->KernelArgs[*i]);
+		exit(1);
+	}
+
+	const char * arg = params->KernelArgs[++(*i)];
+	char * end = NULL;
+	long value = strtol(arg, &end, 0);
+
+	if (end == arg || *end != '\0') {
+		printf("ERROR: blocking parameter '%s' is not a valid integer.\n", arg);
+		exit(1);
+	}
+
+	// The round trip through int rejects values which would be truncated.
+	if (value <= 0 || value != (long)(int)value) {
+		printf("ERROR: blocking parameter must be > 0.\n");
+		exit(1);
+	}
+
+	return (int)value;
+}
+
+// Parses the kernel arguments in params and stores the blocking factors for
+// x, y, and z in blk[0..2]. A factor which is not specified is set to 0.
+// Unknown arguments and -h/-help/--help print the usage and terminate the
+// program.
+static void ParseParameters(Parameters * params, int * blk)
+{
+	Assert(blk != NULL);
+
+	blk[0] = 0; blk[1] = 0; blk[2] = 0;
+
+	#define ARG_IS(param)                   (!strcmp(params->KernelArgs[i], param))
+
+	for (int i = 0; i < params->nKernelArgs; ++i) {
+		if (ARG_IS("-blk") || ARG_IS("--blk")) {
+			blk[0] = blk[1] = blk[2] = ParseBlockSize(params, &i);
+		}
+		else if (ARG_IS("-blk-x") || ARG_IS("--blk-x")) {
+			blk[0] = ParseBlockSize(params, &i);
+		}
+		else if (ARG_IS("-blk-y") || ARG_IS("--blk-y")) {
+			blk[1] = ParseBlockSize(params, &i);
+		}
+		else if (ARG_IS("-blk-z") || ARG_IS("--blk-z")) {
+			blk[2] = ParseBlockSize(params, &i);
+		}
+		else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) {
+			ParameterUsage();
+			exit(1);
+		}
+		else {
+			printf("ERROR: unknown kernel parameter.\n");
+			ParameterUsage();
+			exit(1);
+		}
+	}
+
+	#undef ARG_IS
+
+	return;
+}
+
+// Allocates and initializes the kernel data for the list based D3Q19 kernels.
+//
+//   ld          lattice description (dimensions, cell types, periodicity).
+//   kernelData  receives the newly allocated KernelDataList (as KernelData *).
+//   params      kernel parameters, forwarded to ParseParameters to obtain the
+//               blocking factors.
+//
+// The function
+//   1. allocates both PDF arrays and touches them in parallel,
+//   2. numbers all non-obstacle cells block by block (Grid/Coords),
+//   3. builds the adjacency list with N_D3Q19_IDX PDF indices per fluid node.
+// Any failed allocation terminates the program via exit(1).
+void FNAME(D3Q19ListInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params)
+{
+    KernelData * kd;
+    KernelDataList * kdl;
+
+    // Abort on failure like all later allocations in this function do.
+    if (MemAlloc((void **)&kdl, sizeof(KernelDataList))) {
+        printf("ERROR: allocating kernel data failed: %zu bytes.\n", sizeof(KernelDataList));
+        exit(1);
+    }
+
+    kd = (KernelData *)kdl;
+    *kernelData = kd;
+
+#ifdef DEBUG
+    kd->Pdfs[0] = NULL;
+    kd->Pdfs[1] = NULL;
+    kd->PdfsActive = NULL;
+    kd->DstPdfs = NULL;
+    kd->SrcPdfs = NULL;
+    kd->Dims[0] = -1;
+    kd->Dims[1] = -1;
+    kd->Dims[2] = -1;
+    kd->GlobalDims[0] = -1;
+    kd->GlobalDims[1] = -1;
+    kd->GlobalDims[2] = -1;
+    kd->Offsets[0] = -1;
+    kd->Offsets[1] = -1;
+    kd->Offsets[2] = -1;
+
+    kd->ObstIndices = NULL;
+    kd->nObstIndices = -1;
+    kd->BounceBackPdfsSrc = NULL;
+    kd->BounceBackPdfsDst = NULL;
+    kd->nBounceBackPdfs = -1;
+
+    kdl->AdjList = NULL;
+    kdl->Coords = NULL;
+    kdl->Grid = NULL;
+    kdl->nCells = -1;
+    kdl->nFluid = -1;
+#endif
+
+    // Adjust the dimensions according to padding, if used.
+    kd->Dims[0] = kd->GlobalDims[0] = ld->Dims[0];
+    kd->Dims[1] = kd->GlobalDims[1] = ld->Dims[1];
+    kd->Dims[2] = kd->GlobalDims[2] = ld->Dims[2];
+
+    int * lDims = ld->Dims;
+
+    int lX = lDims[0];
+    int lY = lDims[1];
+    int lZ = lDims[2];
+
+    // Computed in size_t so that large lattices do not overflow int.
+    size_t nTotalCells = (size_t)lX * (size_t)lY * (size_t)lZ;
+    int nCells = ld->nFluid; // TODO: + padding
+    int nFluid = ld->nFluid;
+
+    kdl->nCells = nCells;
+    kdl->nFluid = nFluid;
+
+    PdfT * pdfs[2];
+
+    int blk[3] = { 0 };
+
+    ParseParameters(params, blk);
+
+    // A block size of 0 means "no blocking in this direction". Negative values
+    // would keep the blocking loops below from ever advancing, so they are
+    // treated the same way.
+    if (blk[0] <= 0) blk[0] = lX;
+    if (blk[1] <= 0) blk[1] = lY;
+    if (blk[2] <= 0) blk[2] = lZ;
+
+    printf("# blocking x: %3d y: %3d z: %3d\n", blk[0], blk[1], blk[2]);
+
+    size_t pdfBytes = sizeof(PdfT) * nCells * N_D3Q19;
+
+    printf("# allocating data for %d fluid LB nodes with padding (%zu bytes = %f MiB for both lattices)\n",
+           nCells, 2 * pdfBytes, 2 * pdfBytes / 1024.0 / 1024.0);
+
+    if (MemAlloc((void **)&pdfs[0], pdfBytes) || MemAlloc((void **)&pdfs[1], pdfBytes)) {
+        printf("ERROR: allocating PDF arrays failed: %zu bytes.\n", 2 * pdfBytes);
+        exit(1);
+    }
+
+    kd->Pdfs[0] = pdfs[0];
+    kd->Pdfs[1] = pdfs[1];
+
+    // Initialize PDFs with some (arbitrary) data for correct NUMA placement.
+    // Here we touch only the fluid nodes as this loop is OpenMP parallel and
+    // we want the same scheduling as in the kernel.
+    #ifdef _OPENMP
+    #pragma omp parallel for
+    #endif
+    for (int i = 0; i < nFluid; ++i) {
+        for (int d = 0; d < N_D3Q19; ++d) {
+            pdfs[0][P_INDEX_3(nCells, i, d)] = 1.0;
+            pdfs[1][P_INDEX_3(nCells, i, d)] = 1.0;
+        }
+    }
+
+    // Initialize all PDFs to some standard value.
+    for (int i = 0; i < nFluid; ++i) {
+        for (int d = 0; d < N_D3Q19; ++d) {
+            pdfs[0][P_INDEX_3(nCells, i, d)] = 0.0;
+            pdfs[1][P_INDEX_3(nCells, i, d)] = 0.0;
+        }
+    }
+
+    // ----------------------------------------------------------------------
+    // create grid which will hold the index numbers of the fluid nodes
+
+    uint32_t * grid;
+
+    if (MemAlloc((void **)&grid, nTotalCells * sizeof(uint32_t))) {
+        printf("ERROR: allocating grid for numbering failed: %zu bytes.\n", nTotalCells * sizeof(uint32_t));
+        exit(1);
+    }
+    kdl->Grid = grid;
+
+    int latticeIndex;
+
+#ifdef DEBUG
+    // Mark every cell as "has no node index" so that missing entries are
+    // detected by the assertions in the adjacency list setup.
+    for (int z = 0; z < lZ; ++z) {
+        for (int y = 0; y < lY; ++y) {
+            for (int x = 0; x < lX; ++x) {
+                latticeIndex = L_INDEX_4(ld->Dims, x, y, z);
+                grid[latticeIndex] = ~0;
+            }
+        }
+    }
+#endif
+
+    // ----------------------------------------------------------------------
+    // generate numbering over grid
+
+    uint32_t * coords;
+
+    if (MemAlloc((void **)&coords, nFluid * sizeof(uint32_t) * 3)) {
+        printf("ERROR: allocating coords array failed: %zu bytes.\n", nFluid * sizeof(uint32_t) * 3);
+        exit(1);
+    }
+
+    kdl->Coords = coords;
+
+    // Index for the PDF nodes can start at 0 as we distinguish solid and fluid nodes
+    // through the ld->Lattice array.
+    int counter = 0;
+
+    // Blocking is implemented via setup of the adjacency list. The kernel later will
+    // walk through the lattice blocked automatically.
+    for (int bZ = 0; bZ < lZ; bZ += blk[2]) {
+        for (int bY = 0; bY < lY; bY += blk[1]) {
+            for (int bX = 0; bX < lX; bX += blk[0]) {
+
+                int eX = MIN(bX + blk[0], lX);
+                int eY = MIN(bY + blk[1], lY);
+                int eZ = MIN(bZ + blk[2], lZ);
+
+                for (int z = bZ; z < eZ; ++z) {
+                    for (int y = bY; y < eY; ++y) {
+                        for (int x = bX; x < eX; ++x) {
+
+                            latticeIndex = L_INDEX_4(lDims, x, y, z);
+
+                            if (ld->Lattice[latticeIndex] != LAT_CELL_OBSTACLE) {
+                                grid[latticeIndex] = counter;
+
+                                coords[C_INDEX_X(counter)] = x;
+                                coords[C_INDEX_Y(counter)] = y;
+                                coords[C_INDEX_Z(counter)] = z;
+
+                                ++counter;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    Verify(counter == nFluid);
+
+    uint32_t * adjList;
+
+    // AdjList only requires 18 instead of 19 entries per node, as
+    // the center PDF needs no addressing.
+    if (MemAlloc((void **)&adjList, nFluid * sizeof(uint32_t) * N_D3Q19_IDX)) {
+        printf("ERROR: allocating adjList array failed: %zu bytes.\n", nFluid * sizeof(uint32_t) * N_D3Q19_IDX);
+        exit(1);
+    }
+
+    kdl->AdjList = adjList;
+
+    // Loop over all fluid nodes and compute the indices to the neighboring
+    // PDFs for configured data layout (AoS/SoA).
+    // TODO: Parallelized loop to ensure correct NUMA placement.
+    // #ifdef _OPENMP  --> add line continuation
+    // #pragma omp parallel for default(none)
+    //     shared(nFluid, nCells, coords, D3Q19_INV, D3Q19_X, D3Q19_Y, D3Q19_Z,
+    //            stderr,
+    //            lDims, grid, ld, lX, lY, lZ, adjList)
+    // #endif
+    for (int index = 0; index < nFluid; ++index) {
+        int x = coords[C_INDEX_X(index)];
+        int y = coords[C_INDEX_Y(index)];
+        int z = coords[C_INDEX_Z(index)];
+
+        Assert(x >= 0 && x < lX);
+        Assert(y >= 0 && y < lY);
+        Assert(z >= 0 && z < lZ);
+
+        Assert(ld->Lattice[L_INDEX_4(lDims, x, y, z)] != LAT_CELL_OBSTACLE);
+
+        // Loop over all directions except the center one.
+        for (int d = 0; d < N_D3Q19 - 1; ++d) {
+            Assert(d != D3Q19_C);
+
+            int nx, ny, nz;
+#ifdef PROP_MODEL_PUSH
+            nx = x + D3Q19_X[d];
+            ny = y + D3Q19_Y[d];
+            nz = z + D3Q19_Z[d];
+#elif PROP_MODEL_PULL
+            nx = x - D3Q19_X[d];
+            ny = y - D3Q19_Y[d];
+            nz = z - D3Q19_Z[d];
+#else
+    #error No implementation for this PROP_MODEL_NAME.
+#endif
+            int outX = (nx < 0 || nx >= lX);
+            int outY = (ny < 0 || ny >= lY);
+            int outZ = (nz < 0 || nz >= lZ);
+
+            uint32_t dstIndex;
+
+            if ((outX && !ld->PeriodicX) ||
+                (outY && !ld->PeriodicY) ||
+                (outZ && !ld->PeriodicZ)) {
+                // The link leaves the lattice through at least one non-periodic
+                // face: bounce back. Previously a link that crossed a periodic
+                // face was wrapped in ALL out-of-range directions, including
+                // non-periodic ones.
+                dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
+            }
+            else {
+                // Every out-of-range coordinate belongs to a periodic direction
+                // and is wrapped to the opposite face; in-range coordinates are
+                // used unchanged.
+                int px = outX ? (nx < 0 ? lX - 1 : 0) : nx;
+                int py = outY ? (ny < 0 ? lY - 1 : 0) : ny;
+                int pz = outZ ? (nz < 0 ? lZ - 1 : 0) : nz;
+
+                if (ld->Lattice[L_INDEX_4(lDims, px, py, pz)] == LAT_CELL_OBSTACLE) {
+                    // Neighbor is solid: bounce back into the inverse direction
+                    // of the node itself.
+                    dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]);
+                }
+                else {
+                    uint32_t neighborIndex = grid[L_INDEX_4(lDims, px, py, pz)];
+
+                    AssertMsg(neighborIndex != ~0, "Neighbor has no Index. (%d %d %d) direction %s (%d)\n", px, py, pz, D3Q19_NAMES[d], d);
+
+                    dstIndex = P_INDEX_3(nCells, neighborIndex, d);
+                }
+            }
+
+            // dstIndex is unsigned, only the upper bound needs checking.
+            Assert(dstIndex < (uint32_t)nCells * N_D3Q19);
+
+            adjList[index * N_D3Q19_IDX + d] = dstIndex;
+        }
+    }
+
+    // Fill remaining KernelData structures
+    kd->GetNode = GetNode;
+    kd->SetNode = SetNode;
+
+    kd->BoundaryConditionsGetPdf = FNAME(BCGetPdf);
+    kd->BoundaryConditionsSetPdf = FNAME(BCSetPdf);
+
+    kd->Kernel = FNAME(D3Q19ListKernel);
+
+    kd->DstPdfs = NULL;
+    kd->PdfsActive = kd->Pdfs[0];
+
+    return;
+}
+
+// Releases everything allocated by D3Q19ListInit: both PDF arrays, the
+// adjacency list, the coordinate array, the grid and finally the kernel data
+// structure itself. ld is not used.
+void FNAME(D3Q19ListDeinit)(LatticeDesc * ld, KernelData ** kernelData)
+{
+    KernelDataList ** kdl = (KernelDataList **)kernelData;
+
+    MemFree((void **)&((*kernelData)->Pdfs[0]));
+    MemFree((void **)&((*kernelData)->Pdfs[1]));
+
+    MemFree((void **)&((*kdl)->AdjList));
+    MemFree((void **)&((*kdl)->Coords));
+    MemFree((void **)&((*kdl)->Grid));
+
+    MemFree((void **)kernelData);
+
+    return;
+}
+
diff --git a/src/BenchKernelD3Q19ListCommon.h b/src/BenchKernelD3Q19ListCommon.h
new file mode 100644
index 0000000..c62b0d0
--- /dev/null
+++ b/src/BenchKernelD3Q19ListCommon.h
@@ -0,0 +1,136 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#ifndef __BENCH_KERNEL_D3Q19_LIST_COMMON_H__
+#define __BENCH_KERNEL_D3Q19_LIST_COMMON_H__
+
+
+#include "Kernel.h"
+
+// NOTE(review): the header name of this include was lost; <inttypes.h> is a
+// reconstruction that provides the uint32_t used below. Confirm against the
+// original file.
+#include <inttypes.h>
+
+// Number of adjacency list entries per node: all D3Q19 directions except the
+// center one.
+#define N_D3Q19_IDX 18
+
+// Kernel data of the list based kernels. KernelData is the first member, so
+// a KernelDataList * can be used wherever a KernelData * is expected.
+typedef struct KernelDataList_
+{
+    KernelData kd;
+    uint32_t * AdjList;  // Stores PDF indices, which are the destination for propagation.
+                         // Determine the destination for node index n and direction d via:
+                         //   (n * N_D3Q19_IDX) + d
+    uint32_t * Grid;     // Stores the node indices; use L_INDEX_4 macro for access.
+    uint32_t * Coords;   // Map node indices to coordinates; use C_INDEX_* macro for access.
+    int nFluid;          // Number of fluid nodes; AdjList holds nFluid * N_D3Q19_IDX entries.
+    int nCells;          // Total number of nodes allocated, including nodes for padding!
+} KernelDataList;
+
+
+// Macro for casting KernelData * to KernelDataList *.
+#define KDL(_x_) ((KernelDataList *)(_x_))
+
+
+
+
+// Build a function name extended by the propagation model name and the data layout.
+// FNAME(test) will be expanded to test_PushSoA if DATA_LAYOUT_NAME is defined
+// as SoA and PROP_MODEL_NAME is defined as Push.
+#define FNAME(functionName) JOIN(JOIN(functionName,_),JOIN(PROP_MODEL_NAME,DATA_LAYOUT_NAME))
+
+#ifndef DATA_LAYOUT_NAME
+    #error DATA_LAYOUT_NAME must be defined
+#endif
+
+#ifndef PROP_MODEL_NAME
+    #error PROP_MODEL_NAME must be defined
+#endif
+
+// -----------------------------------------------------------------------
+// Index function for accessing PDF array for different data layouts.
+
+#define P_INDEX_3 FNAME(PINDEX3)
+
+// Returns the position of the PDF of direction d of node cellIndex inside a
+// PDF array which holds nCells nodes:
+//   SoA: d * nCells + cellIndex
+//   AoS: cellIndex * N_D3Q19 + d
+static inline int FNAME(PINDEX3)(int nCells, int cellIndex, int d)
+{
+    Assert(nCells > 0);
+    Assert(cellIndex >= 0);
+    Assert(cellIndex < nCells);
+
+    Assert(d >= 0);
+    #ifdef D3Q19
+    Assert(d < N_D3Q19);
+    #else
+        #error Not implemented for this discretization.
+    #endif
+
+#ifdef DATA_LAYOUT_SOA
+    return d * nCells + cellIndex;
+#elif DATA_LAYOUT_AOS
+    return cellIndex * N_D3Q19 + d;
+#else
+    #error P_INDEX_3 function not implemented for chosen data layout.
+#endif
+}
+
+#define P_INDEX_5 FNAME(PINDEX5)
+
+// Returns the position of the PDF of direction d of the node located at the
+// lattice coordinates (x, y, z). The node index is looked up in kdl->Grid,
+// so (x, y, z) must address a cell that received a node index.
+static inline int FNAME(PINDEX5)(KernelDataList * kdl, int x, int y, int z, int d)
+{
+    Assert(kdl != NULL);
+#ifdef DEBUG
+    uint32_t * grid = kdl->Grid;
+    int * dims = kdl->kd.Dims;
+
+    Assert(grid != NULL);
+    Assert(dims != NULL);
+    Assert(dims[0] > 0);
+    Assert(dims[1] > 0);
+    Assert(dims[2] > 0);
+    Assert(x >= 0 && x < dims[0]);
+    Assert(y >= 0 && y < dims[1]);
+    Assert(z >= 0 && z < dims[2]);
+    Assert(d >= 0 && d < N_D3Q19);
+#endif
+
+    return P_INDEX_3(kdl->nCells, kdl->Grid[L_INDEX_4(kdl->kd.Dims, x, y, z)], d);
+}
+
+// -----------------------------------------------------------------------
+// Macros for accessing coord array
+
+#define C_INDEX_X(cellIndex) C_INDEX(cellIndex, 0)
+#define C_INDEX_Y(cellIndex) C_INDEX(cellIndex, 1)
+#define C_INDEX_Z(cellIndex) C_INDEX(cellIndex, 2)
+
+// Returns the position of coordinate xyz (0 = x, 1 = y, 2 = z) of node
+// cellIndex inside the Coords array; coordinates are stored as consecutive
+// triples.
+static inline int C_INDEX(int cellIndex, int xyz)
+{
+    Assert(cellIndex >= 0);
+    Assert(xyz >= 0);
+    Assert(xyz < 3);
+
+    return cellIndex * 3 + xyz;
+}
+
+
+#endif // __BENCH_KERNEL_D3Q19_LIST_COMMON_H__
diff --git
a/src/BenchKernelD3Q19ListPullSplitNt.c b/src/BenchKernelD3Q19ListPullSplitNt.c
new file mode 100644
index 0000000..dfab54a
--- /dev/null
+++ b/src/BenchKernelD3Q19ListPullSplitNt.c
@@ -0,0 +1,455 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#include "BenchKernelD3Q19ListPullSplitNtCommon.h"
+
+#include "Memory.h"
+#include "Vtk.h"
+#include "Vector.h"
+
+// NOTE(review): the names of the following two system headers were lost;
+// <inttypes.h> and <math.h> are reconstructions. Confirm against the
+// original file.
+#include <inttypes.h>
+#include <math.h>
+
+#ifdef _OPENMP
+    #include <omp.h>
+#endif
+
+// Rows of the per-thread temporary array. Rows 0..17 hold the PDFs of the
+// non-center directions, the remaining rows hold per-node intermediates.
+#define TMP_UX 18
+#define TMP_UY 19
+#define TMP_UZ 20
+#define TMP_W1 21
+#define TMP_W2 22
+
+#define N_TMP 23
+
+// Position of entry tmp_index in row tmp_dir of the temporary array. Relies
+// on a variable nTmpArray (row length) being in scope at the point of use.
+#define TMP_INDEX(tmp_index, tmp_dir) (nTmpArray * (tmp_dir) + (tmp_index))
+
+// Runs cd->MaxIterations time steps of the "1S" pull/split kernel.
+//
+// The fluid nodes are divided into one contiguous range per thread. Each
+// range is processed in three parts: a scalar peel loop up to the first
+// vector-aligned node, a vectorized loop, and a scalar loop for the nodes
+// that remain. After every time step a single thread performs the optional
+// verification/VTK/statistics work and swaps the source and destination PDF
+// arrays.
+//
+//   ld          lattice description, forwarded to output/statistics routines.
+//   kernelData  kernel data created by the matching init routine.
+//   cd          case data; Omega must be in (0, 2).
+void FNAME(KernelPullSplitNt1S)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd)
+{
+
+    Assert(ld != NULL);
+    Assert(kernelData != NULL);
+    Assert(cd != NULL);
+
+    Assert(cd->Omega > 0.0);
+    Assert(cd->Omega < 2.0);
+
+    KernelData * kd = (KernelData *)kernelData;
+    KernelDataList * kdl = KDL(kernelData);
+    KernelDataListRia * kdlr = KDLR(kernelData);
+
+    PdfT omega = cd->Omega;
+    const PdfT omegaEven = omega;
+
+    // Relaxation rate of the odd part, derived from omega and magicParam.
+    PdfT magicParam = 1.0 / 12.0;
+    const PdfT omegaOdd = 1.0 / (0.5 + magicParam / (1.0 / omega - 0.5));
+
+
+    const PdfT w_0 = 1.0 / 3.0;
+    const PdfT w_1 = 1.0 / 18.0;
+    const PdfT w_2 = 1.0 / 36.0;
+
+    const PdfT w_1_x3 = w_1 * 3.0; const PdfT w_1_nine_half = w_1 * 9.0 / 2.0;
+    const PdfT w_2_x3 = w_2 * 3.0; const PdfT w_2_nine_half = w_2 * 9.0 / 2.0;
+
+    const VPDFT vw_1_x3 = VSET(w_1_x3);
+    const VPDFT vw_2_x3 = VSET(w_2_x3);
+
+    const VPDFT vw_1_nine_half = VSET(w_1_nine_half);
+    const VPDFT vw_2_nine_half = VSET(w_2_nine_half);
+
+    const VPDFT vomegaEven = VSET(omegaEven);
+    const VPDFT vomegaOdd = VSET(omegaOdd);
+
+    const VPDFT voneHalf = VSET(0.5);
+
+    PdfT * src = kd->Pdfs[0];
+    PdfT * dst = kd->Pdfs[1];
+    PdfT * tmp;
+
+    int maxIterations = cd->MaxIterations;
+
+    int nFluid = kdl->nFluid;
+    int nCells = kdl->nCells;
+
+    int nTmpArray = kdlr->nTmpArray;
+
+    Assert(nTmpArray % VSIZE == 0);
+
+    uint32_t * adjList = kdl->AdjList;
+
+    #ifdef VTK_OUTPUT
+    if (cd->VtkOutput) {
+        kd->PdfsActive = src;
+        VtkWrite(ld, kd, cd, -1);
+    }
+    #endif
+
+    #ifdef STATISTICS
+    kd->PdfsActive = src;
+    KernelStatistics(kd, ld, cd, 0);
+    #endif
+
+    #ifdef _OPENMP
+    #pragma omp parallel default(none) \
+        shared(nFluid, nCells, kd, kdl, adjList, src, dst, \
+               cd, maxIterations, ld, tmp, nTmpArray, \
+               stderr )
+    #endif
+    {
+        uint32_t adjListIndex;
+
+        PdfT ux, uy, uz, ui;
+        VPDFT vux, vuy, vuz, vui;
+
+        #define X(name, idx, idxinv, x, y, z) PdfT JOIN(pdf_,name);
+        D3Q19_LIST
+        #undef X
+        VPDFT vpdf_a, vpdf_b;
+
+        PdfT evenPart, oddPart, dir_indep_trm, dens;
+        PdfT w_1_indep, w_2_indep;
+        VPDFT vevenPart, voddPart;
+        VPDFT vw_1_indep, vw_2_indep;
+
+        int indexMax;
+
+        // Thread private scratch array, aligned to the vector width.
+        PdfT * tmpArray;
+        MemAllocAligned((void **)&tmpArray, sizeof(PdfT) * nTmpArray * N_TMP, VSIZE * sizeof(PdfT));
+
+        int nThreads = 1;
+        int threadId = 0;
+
+#ifdef _OPENMP
+        // Use the size of the team that is actually executing this region.
+        // omp_get_max_threads() only reports an upper bound; with a smaller
+        // team a part of the nodes would never be updated.
+        nThreads = omp_get_num_threads();
+        threadId = omp_get_thread_num();
+#endif
+
+        // Contiguous range of nodes for this thread; the first
+        // (nFluid % nThreads) threads get one node more.
+        int nCellsThread = nFluid / nThreads;
+        int blIndexStart = threadId * nCellsThread;
+
+        if (threadId < nFluid % nThreads) {
+            blIndexStart += threadId;
+            nCellsThread += 1;
+        }
+        else {
+            blIndexStart += nFluid % nThreads;
+        }
+
+        int blIndexStop = blIndexStart + nCellsThread;
+
+        // We have three loops:
+        // 1. Peeling to ensure alignment for non-temporal stores in loop 2 is correct.
+        // 2. Vectorized handling of nodes.
+        // 3. Remaining nodes, less than vector size.
+
+        unsigned long addrStart = (unsigned long)&(src[P_INDEX_3(nCells, blIndexStart, 0)]);
+        int nCellsUnaligned = (VSIZE - (int)((addrStart / sizeof(PdfT)) % VSIZE)) % VSIZE;
+
+        int nCellsVectorized = nCellsThread - nCellsUnaligned;
+        nCellsVectorized = nCellsVectorized - (nCellsVectorized % VSIZE);
+
+        int blIndexVec = blIndexStart + nCellsUnaligned;
+        int blIndexRemaining = blIndexStart + nCellsUnaligned + nCellsVectorized;
+
+        for (int iter = 0; iter < maxIterations; ++iter) {
+
+#if 1
+            #define INDEX_START blIndexStart
+            #define INDEX_STOP  blIndexVec
+            #include "BenchKernelD3Q19ListPullSplitNt1SScalar.h"
+
+            #define INDEX_START blIndexVec
+            #define INDEX_STOP  blIndexRemaining
+            #include "BenchKernelD3Q19ListPullSplitNt1SIntrinsics.h"
+
+            #define INDEX_START blIndexRemaining
+            #define INDEX_STOP  blIndexStop
+            #include "BenchKernelD3Q19ListPullSplitNt1SScalar.h"
+#else
+            #define INDEX_START blIndexStart
+            #define INDEX_STOP  blIndexStop
+            #include "BenchKernelD3Q19ListPullSplitNt1SScalar.h"
+#endif
+            #pragma omp barrier
+
+            #pragma omp single
+            {
+                #ifdef VERIFICATION
+                kd->PdfsActive = dst;
+                KernelAddBodyForce(kd, ld, cd);
+                #endif
+
+                #ifdef VTK_OUTPUT
+                if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) {
+                    kd->PdfsActive = dst;
+                    VtkWrite(ld, kd, cd, iter);
+                }
+                #endif
+
+                #ifdef STATISTICS
+                kd->PdfsActive = dst;
+                KernelStatistics(kd, ld, cd, iter);
+                #endif
+
+                // swap grids
+                tmp = src;
+                src = dst;
+                dst = tmp;
+            }
+
+            #pragma omp barrier
+
+        } // for (int iter = 0; ...
+
+        MemFree((void **)&tmpArray);
+    }
+
+#ifdef VTK_OUTPUT
+    if (cd->VtkOutput) {
+        kd->PdfsActive = src;
+        VtkWrite(ld, kd, cd, maxIterations);
+    }
+#endif
+
+#ifdef STATISTICS
+    kd->PdfsActive = src;
+    KernelStatistics(kd, ld, cd, maxIterations);
+#endif
+
+    return;
+}
+
+// Runs cd->MaxIterations time steps of the "2S" pull/split kernel.
+//
+// Setup, work distribution and the per-iteration bookkeeping are the same as
+// in KernelPullSplitNt1S; only the included loop bodies (the ...Nt2S...
+// headers) differ.
+//
+//   ld          lattice description, forwarded to output/statistics routines.
+//   kernelData  kernel data created by the matching init routine.
+//   cd          case data; Omega must be in (0, 2).
+void FNAME(KernelPullSplitNt2S)(LatticeDesc * ld, KernelData * kernelData, CaseData * cd)
+{
+
+    Assert(ld != NULL);
+    Assert(kernelData != NULL);
+    Assert(cd != NULL);
+
+    Assert(cd->Omega > 0.0);
+    Assert(cd->Omega < 2.0);
+
+    KernelData * kd = (KernelData *)kernelData;
+    KernelDataList * kdl = KDL(kernelData);
+    KernelDataListRia * kdlr = KDLR(kernelData);
+
+    PdfT omega = cd->Omega;
+    const PdfT omegaEven = omega;
+
+    // Relaxation rate of the odd part, derived from omega and magicParam.
+    PdfT magicParam = 1.0 / 12.0;
+    const PdfT omegaOdd = 1.0 / (0.5 + magicParam / (1.0 / omega - 0.5));
+
+
+    const PdfT w_0 = 1.0 / 3.0;
+    const PdfT w_1 = 1.0 / 18.0;
+    const PdfT w_2 = 1.0 / 36.0;
+
+    const PdfT w_1_x3 = w_1 * 3.0; const PdfT w_1_nine_half = w_1 * 9.0 / 2.0;
+    const PdfT w_2_x3 = w_2 * 3.0; const PdfT w_2_nine_half = w_2 * 9.0 / 2.0;
+
+    const VPDFT vw_1_x3 = VSET(w_1_x3);
+    const VPDFT vw_2_x3 = VSET(w_2_x3);
+
+    const VPDFT vw_1_nine_half = VSET(w_1_nine_half);
+    const VPDFT vw_2_nine_half = VSET(w_2_nine_half);
+
+    const VPDFT vomegaEven = VSET(omegaEven);
+    const VPDFT vomegaOdd = VSET(omegaOdd);
+
+    const VPDFT voneHalf = VSET(0.5);
+
+    PdfT * src = kd->Pdfs[0];
+    PdfT * dst = kd->Pdfs[1];
+    PdfT * tmp;
+
+    int maxIterations = cd->MaxIterations;
+
+    int nFluid = kdl->nFluid;
+    int nCells = kdl->nCells;
+
+    int nTmpArray = kdlr->nTmpArray;
+
+    Assert(nTmpArray % VSIZE == 0);
+
+    uint32_t * adjList = kdl->AdjList;
+
+    #ifdef VTK_OUTPUT
+    if (cd->VtkOutput) {
+        kd->PdfsActive = src;
+        VtkWrite(ld, kd, cd, -1);
+    }
+    #endif
+
+    #ifdef STATISTICS
+    kd->PdfsActive = src;
+    KernelStatistics(kd, ld, cd, 0);
+    #endif
+
+    #ifdef _OPENMP
+    #pragma omp parallel default(none) \
+        shared(nFluid, nCells, kd, kdl, adjList, src, dst, \
+               cd, maxIterations, ld, tmp, nTmpArray, \
+               stderr )
+    #endif
+    {
+        uint32_t adjListIndex;
+
+        PdfT ux, uy, uz, ui;
+        VPDFT vux, vuy, vuz, vui;
+
+        #define X(name, idx, idxinv, x, y, z) PdfT JOIN(pdf_,name);
+        D3Q19_LIST
+        #undef X
+        VPDFT vpdf_a, vpdf_b;
+
+        PdfT evenPart, oddPart, dir_indep_trm, dens;
+        PdfT w_1_indep, w_2_indep;
+        VPDFT vevenPart, voddPart;
+        VPDFT vw_1_indep, vw_2_indep;
+
+        int indexMax;
+
+        // Thread private scratch array. Allocated with the same vector
+        // alignment as in KernelPullSplitNt1S, so vector accesses to it are
+        // safe in both variants.
+        PdfT * tmpArray;
+        MemAllocAligned((void **)&tmpArray, sizeof(PdfT) * nTmpArray * N_TMP, VSIZE * sizeof(PdfT));
+
+        int nThreads = 1;
+        int threadId = 0;
+
+#ifdef _OPENMP
+        // Use the size of the team that is actually executing this region.
+        // omp_get_max_threads() only reports an upper bound; with a smaller
+        // team a part of the nodes would never be updated.
+        nThreads = omp_get_num_threads();
+        threadId = omp_get_thread_num();
+#endif
+
+        // Contiguous range of nodes for this thread; the first
+        // (nFluid % nThreads) threads get one node more.
+        int nCellsThread = nFluid / nThreads;
+        int blIndexStart = threadId * nCellsThread;
+
+        if (threadId < nFluid % nThreads) {
+            blIndexStart += threadId;
+            nCellsThread += 1;
+        }
+        else {
+            blIndexStart += nFluid % nThreads;
+        }
+
+        int blIndexStop = blIndexStart + nCellsThread;
+
+        // We have three loops:
+        // 1. Peeling to ensure alignment for non-temporal stores in loop 2 is correct.
+        // 2. Vectorized handling of nodes.
+        // 3. Remaining nodes, less than vector size.
+
+        unsigned long addrStart = (unsigned long)&(src[P_INDEX_3(nCells, blIndexStart, 0)]);
+        int nCellsUnaligned = (VSIZE - (int)((addrStart / sizeof(PdfT)) % VSIZE)) % VSIZE;
+
+        int nCellsVectorized = nCellsThread - nCellsUnaligned;
+        nCellsVectorized = nCellsVectorized - (nCellsVectorized % VSIZE);
+
+        int blIndexVec = blIndexStart + nCellsUnaligned;
+        int blIndexRemaining = blIndexStart + nCellsUnaligned + nCellsVectorized;
+
+        for (int iter = 0; iter < maxIterations; ++iter) {
+
+#if 1
+            #define INDEX_START blIndexStart
+            #define INDEX_STOP  blIndexVec
+            #include "BenchKernelD3Q19ListPullSplitNt2SScalar.h"
+
+            #define INDEX_START blIndexVec
+            #define INDEX_STOP  blIndexRemaining
+            #include "BenchKernelD3Q19ListPullSplitNt2SIntrinsics.h"
+
+            #define INDEX_START blIndexRemaining
+            #define INDEX_STOP  blIndexStop
+            #include "BenchKernelD3Q19ListPullSplitNt2SScalar.h"
+#else
+            #define INDEX_START blIndexStart
+            #define INDEX_STOP  blIndexStop
+            #include "BenchKernelD3Q19ListPullSplitNt2SScalar.h"
+#endif
+            #pragma omp barrier
+
+            #pragma omp single
+            {
+                #ifdef VERIFICATION
+                kd->PdfsActive = dst;
+                KernelAddBodyForce(kd, ld, cd);
+                #endif
+
+                #ifdef VTK_OUTPUT
+                if (cd->VtkOutput && (iter % cd->VtkModulus) == 0) {
+                    kd->PdfsActive = dst;
+                    VtkWrite(ld, kd, cd, iter);
+                }
+                #endif
+
+                #ifdef STATISTICS
+                kd->PdfsActive = dst;
+                KernelStatistics(kd, ld, cd, iter);
+                #endif
+
+                // swap grids
+                tmp = src;
+                src = dst;
+                dst = tmp;
+            }
+
+            #pragma omp barrier
+
+        } // for (int iter = 0; ...
+
+        MemFree((void **)&tmpArray);
+    }
+
+#ifdef VTK_OUTPUT
+    if (cd->VtkOutput) {
+        kd->PdfsActive = src;
+        VtkWrite(ld, kd, cd, maxIterations);
+    }
+#endif
+
+#ifdef STATISTICS
+    kd->PdfsActive = src;
+    KernelStatistics(kd, ld, cd, maxIterations);
+#endif
+
+    return;
+}
+
diff --git a/src/BenchKernelD3Q19ListPullSplitNt.h b/src/BenchKernelD3Q19ListPullSplitNt.h
new file mode 100644
index 0000000..bdfa628
--- /dev/null
+++ b/src/BenchKernelD3Q19ListPullSplitNt.h
@@ -0,0 +1,41 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#ifndef __BENCH_KERNEL_D3Q19_LIST_PULL_SPLIT_NT__
+#define __BENCH_KERNEL_D3Q19_LIST_PULL_SPLIT_NT__
+
+#include "Kernel.h"
+
+// Public entry points of the list based pull/split kernels for the
+// Pull + SoA configuration. The definitions are not part of this header.
+// NOTE(review): presumably the two init routines set up the kernel data for
+// the 1S and the 2S kernel variant respectively -- confirm against the
+// implementation.
+void D3Q19ListPullSplitNt1SInit_PullSoA(LatticeDesc * ld, KernelData ** kernelData, Parameters * params);
+void D3Q19ListPullSplitNt2SInit_PullSoA(LatticeDesc * ld, KernelData ** kernelData, Parameters * params);
+// void D3Q19ListPullSplitNtRia1SInit_PullSoA(LatticeDesc * ld, KernelData ** kernelData, Parameters * params);
+// void D3Q19ListPullSplitNtRia2SInit_PullSoA(LatticeDesc * ld, KernelData ** kernelData, Parameters * params);
+
+// Counterpart to the init routines above; only one deinit routine is declared
+// for both variants.
+void D3Q19ListPullSplitNtDeinit_PullSoA(LatticeDesc * ld, KernelData ** kernelData);
+
+
+
+#endif // __BENCH_KERNEL_D3Q19_LIST_PULL_SPLIT_NT__
diff --git a/src/BenchKernelD3Q19ListPullSplitNt1SIntrinsics.h b/src/BenchKernelD3Q19ListPullSplitNt1SIntrinsics.h
new file mode 100644
index 0000000..a3e586b
--- /dev/null
+++ b/src/BenchKernelD3Q19ListPullSplitNt1SIntrinsics.h
@@ -0,0 +1,142 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+
+// Vectorized loop body of the 1S pull/split kernel. This file is no
+// stand-alone header: it is included inside the kernel function with
+// INDEX_START/INDEX_STOP defined to the node range to process and expects the
+// kernel's local variables (src, dst, adjList, tmpArray, nTmpArray, nCells,
+// the pdf_*/v* temporaries, ...) to be in scope. INDEX_START/INDEX_STOP are
+// undefined again at the end.
+
+#ifndef INDEX_START
+    #error INDEX_START must be defined
+#endif
+
+#ifndef INDEX_STOP
+    #error INDEX_STOP must be defined
+#endif
+
+    #define I(index, dir) P_INDEX_3((nCells), (index), (dir))
+
+    // The range is processed in chunks of at most nTmpArray nodes, which is
+    // the row length of tmpArray.
+    for (int blockedIndex = (INDEX_START); blockedIndex < (INDEX_STOP); blockedIndex += nTmpArray) {
+
+        indexMax = MinI(nTmpArray, (INDEX_STOP) - blockedIndex);
+#ifdef DEBUG
+        memset(tmpArray, -1, sizeof(PdfT) * nTmpArray * N_TMP);
+#endif
+        // Step 1 (scalar): gather the PDFs of each node via the adjacency
+        // list into tmpArray, compute the node's intermediates and update the
+        // center PDF.
+        for (int index = 0; index < indexMax; ++index) {
+
+
+            adjListIndex = (index + blockedIndex) * N_D3Q19_IDX;
+
+            pdf_C = src[I(index + blockedIndex, D3Q19_C)];
+
+            #define X(name, idx, idxinv, _x, _y, _z) JOIN(pdf_,name) = src[adjList[adjListIndex + idx]]; tmpArray[TMP_INDEX(index, idx)] = JOIN(pdf_,name);
+            D3Q19_LIST_WO_C
+            #undef X
+
+            ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE -
+                 pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW;
+            uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN -
+                 pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS;
+            uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS -
+                 pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS;
+
+            tmpArray[TMP_INDEX(index, TMP_UX)] = ux;
+            tmpArray[TMP_INDEX(index, TMP_UY)] = uy;
+            tmpArray[TMP_INDEX(index, TMP_UZ)] = uz;
+
+            dens = pdf_C +
+                   pdf_N + pdf_E + pdf_S + pdf_W +
+                   pdf_NE + pdf_SE + pdf_SW + pdf_NW +
+                   pdf_T + pdf_TN + pdf_TE + pdf_TS + pdf_TW +
+                   pdf_B + pdf_BN + pdf_BE + pdf_BS + pdf_BW;
+
+            dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz) * 3.0 / 2.0;
+
+            w_1_indep = w_1 * dir_indep_trm;
+            w_2_indep = w_2 * dir_indep_trm;
+
+            tmpArray[TMP_INDEX(index, TMP_W1)] = w_1_indep;
+            tmpArray[TMP_INDEX(index, TMP_W2)] = w_2_indep;
+
+            dst[I(index + blockedIndex, D3Q19_C )] = pdf_C - omegaEven * (pdf_C - w_0 * dir_indep_trm);
+        }
+
+        // Step 2 (vectorized): one pair of opposite directions at a time.
+        // The first loop stores the result for _dir1 directly to dst and
+        // parks the result for _dir2 in tmpArray; the second loop copies the
+        // parked values to dst.
+        //
+        // The stores to dst are non-temporal vector stores, so the target
+        // address must be aligned to the vector size in BYTES. The assertions
+        // therefore check against VSIZE * sizeof(PdfT); checking against
+        // VSIZE alone (a number of elements) accepts misaligned addresses.
+        //
+        // LOOP_1: pairs along one axis; the velocity row _vel_tmp is used as is.
+        #define LOOP_1(_dir1, _dir2, _vel, _vel_tmp) \
+            for (int index = 0; index < indexMax; index += VSIZE) { \
+                Assert((unsigned long)&(dst[I(index + blockedIndex, JOIN(D3Q19_,_dir1) )]) % (VSIZE * sizeof(PdfT)) == 0); \
+                vui = VLDU(&tmpArray[TMP_INDEX(index, JOIN(TMP_,_vel_tmp))]); \
+                vpdf_a = VLDU(&tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir1))]); \
+                vpdf_b = VLDU(&tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir2))]); \
+                vw_1_indep = VLDU(&tmpArray[TMP_INDEX(index, TMP_W1)]); \
+                \
+                vevenPart = VMUL(vomegaEven, VSUB(VSUB(VMUL(voneHalf, VADD(vpdf_a, vpdf_b)), VMUL(vui, VMUL(vui, vw_1_nine_half))), vw_1_indep)); \
+                voddPart = VMUL(vomegaOdd, VSUB( VMUL(voneHalf, VSUB(vpdf_a, vpdf_b)), VMUL(vui, vw_1_x3))); \
+                VSTNT(&dst[I(index + blockedIndex, JOIN(D3Q19_,_dir1))], VSUB(VSUB(vpdf_a, vevenPart), voddPart)); \
+                VST(&tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir2))], VADD(VSUB(vpdf_b, vevenPart), voddPart)); \
+            } \
+            for (int index = 0; index < indexMax; index += VSIZE) { \
+                Assert((unsigned long)&(dst[I(index + blockedIndex, JOIN(D3Q19_,_dir2) )]) % (VSIZE * sizeof(PdfT)) == 0); \
+                vpdf_b = VLDU(&tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir2))]); \
+                VSTNT(&dst[I(index + blockedIndex, JOIN(D3Q19_,_dir2))], vpdf_b); \
+            }
+
+        // LOOP_2: diagonal pairs; _expr combines the two loaded velocity
+        // rows _v1 and _v2 into the velocity along the direction.
+        #define LOOP_2(_dir1, _dir2, _v1, _v2, _v1_tmp, _v2_tmp, _expr) \
+            for (int index = 0; index < indexMax; index += VSIZE) { \
+                Assert((unsigned long)&(dst[I(index + blockedIndex, JOIN(D3Q19_,_dir1) )]) % (VSIZE * sizeof(PdfT)) == 0); \
+                _v1 = VLDU(&tmpArray[TMP_INDEX(index, JOIN(TMP_,_v1_tmp))]); \
+                _v2 = VLDU(&tmpArray[TMP_INDEX(index, JOIN(TMP_,_v2_tmp))]); \
+                vpdf_a = VLDU(&tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir1))]); \
+                vpdf_b = VLDU(&tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir2))]); \
+                vw_2_indep = VLDU(&tmpArray[TMP_INDEX(index, TMP_W2)]); \
+                \
+                vui = _expr; \
+                vevenPart = VMUL(vomegaEven, VSUB(VSUB(VMUL(voneHalf, VADD(vpdf_a, vpdf_b)), VMUL(vui, VMUL(vui, vw_2_nine_half))), vw_2_indep)); \
+                voddPart = VMUL(vomegaOdd, VSUB( VMUL(voneHalf, VSUB(vpdf_a, vpdf_b)), VMUL(vui, vw_2_x3))); \
+                VSTNT(&dst[I(index + blockedIndex, JOIN(D3Q19_,_dir1))], VSUB(VSUB(vpdf_a, vevenPart), voddPart)); \
+                VST(&tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir2))], VADD(VSUB(vpdf_b, vevenPart), voddPart)); \
+            } \
+            for (int index = 0; index < indexMax; index += VSIZE) { \
+                Assert((unsigned long)&(dst[I(index + blockedIndex, JOIN(D3Q19_,_dir2) )]) % (VSIZE * sizeof(PdfT)) == 0); \
+                vpdf_b = VLDU(&tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir2))]); \
+                VSTNT(&dst[I(index + blockedIndex, JOIN(D3Q19_,_dir2))], vpdf_b); \
+            }
+
+        LOOP_1(N, S, vuy, UY);
+        LOOP_1(E, W, vux, UX);
+        LOOP_1(T, B, vuz, UZ);
+
+        LOOP_2(NW, SE, vuy, vux, UY, UX, VSUB(vuy, vux));
+        LOOP_2(NE, SW, vuy, vux, UY, UX, VADD(vuy, vux));
+        LOOP_2(TW, BE, vux, vuz, UX, UZ, VSUB(vuz, vux));
+        LOOP_2(TE, BW, vux, vuz, UX, UZ, VADD(vuz, vux));
+        LOOP_2(TS, BN, vuy, vuz, UY, UZ, VSUB(vuz, vuy));
+        LOOP_2(TN, BS, vuy, vuz, UY, UZ, VADD(vuz, vuy));
+
+        #undef LOOP_1
+        #undef LOOP_2
+
+    } // loop over fluid nodes
+
+    #undef I
+
+    #undef INDEX_START
+    #undef INDEX_STOP
+
diff --git a/src/BenchKernelD3Q19ListPullSplitNt1SScalar.h b/src/BenchKernelD3Q19ListPullSplitNt1SScalar.h
new file mode 100644
index 0000000..9b833ae
--- /dev/null
+++ b/src/BenchKernelD3Q19ListPullSplitNt1SScalar.h
@@ -0,0 +1,139 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+
+// Include fragment: scalar collide-and-stream loop nest over the fluid nodes
+// [INDEX_START, INDEX_STOP). The including code must define both macros
+// (they are #undef'd again at the end of this fragment) and must provide
+// every variable used below (src, dst, adjList, tmpArray, nTmpArray, nCells,
+// indexMax, the pdf_* / ux / uy / uz scalars, omegaEven, omegaOdd, w_0, ...);
+// none of them is declared here.
+
+#ifndef INDEX_START
+	#error INDEX_START must be defined
+#endif
+
+#ifndef INDEX_STOP
+	#error INDEX_STOP must be defined
+#endif
+
+	// Position of PDF `dir` of node `index` in the src/dst arrays.
+	#define I(index, dir)	P_INDEX_3((nCells), (index), (dir))
+
+	// The node range is processed in chunks of at most nTmpArray nodes;
+	// blockedIndex is the first node of the current chunk.
+	for (int blockedIndex = (INDEX_START); blockedIndex < (INDEX_STOP); blockedIndex += nTmpArray) {
+
+		// Number of nodes in this chunk (the last chunk may be shorter).
+		indexMax = MinI(nTmpArray, (INDEX_STOP) - blockedIndex);
+#ifdef DEBUG
+		// Fill the scratch buffer with a recognizable byte pattern.
+		memset(tmpArray, -1, sizeof(PdfT) * nTmpArray * N_TMP);
+#endif
+		// Phase 1: per node, gather the PDFs through the adjacency list,
+		// compute velocity and density, stash everything the later phases
+		// need in tmpArray and update the center PDF in dst.
+		for (int index = 0; index < indexMax; ++index) {
+
+
+			adjListIndex = (index + blockedIndex) * N_D3Q19_IDX;
+
+			// The center PDF is read directly, it has no adjacency entry.
+			pdf_C = src[I(index + blockedIndex, D3Q19_C)];
+
+			// For every direction except C: load the PDF via adjList and
+			// keep a copy in tmpArray.
+			#define X(name, idx, idxinv, _x, _y, _z)	JOIN(pdf_,name) = src[adjList[adjListIndex + idx]]; tmpArray[TMP_INDEX(index, idx)] = JOIN(pdf_,name);
+			D3Q19_LIST_WO_C
+			#undef X
+
+			ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE -
+				 pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW;
+			uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN -
+				 pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS;
+			uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS -
+				 pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS;
+
+			tmpArray[TMP_INDEX(index, TMP_UX)] = ux;
+			tmpArray[TMP_INDEX(index, TMP_UY)] = uy;
+			tmpArray[TMP_INDEX(index, TMP_UZ)] = uz;
+
+			dens = pdf_C +
+				   pdf_N + pdf_E + pdf_S + pdf_W +
+				   pdf_NE + pdf_SE + pdf_SW + pdf_NW +
+				   pdf_T + pdf_TN + pdf_TE + pdf_TS + pdf_TW +
+				   pdf_B + pdf_BN + pdf_BE + pdf_BS + pdf_BW;
+
+			// Term shared by all directions; only its weight differs.
+			dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz) * 3.0 / 2.0;
+
+			w_1_indep = w_1 * dir_indep_trm;
+			w_2_indep = w_2 * dir_indep_trm;
+
+			tmpArray[TMP_INDEX(index, TMP_W1)] = w_1_indep;
+			tmpArray[TMP_INDEX(index, TMP_W2)] = w_2_indep;
+
+			dst[I(index + blockedIndex, D3Q19_C )] = pdf_C - omegaEven * (pdf_C - w_0 * dir_indep_trm);
+		}
+
+
+	// Phase 2, axis-aligned pair (_dir1, _dir2): the first loop relaxes
+	// both PDFs, writes _dir1 to dst and parks the _dir2 result in
+	// tmpArray; the second loop copies the parked _dir2 values to dst.
+	// Each loop therefore writes to a single dst stream (presumably what
+	// "1S" in the file name refers to -- confirm).
+	// Do not put // comments inside the macro bodies: the line
+	// continuations would extend them over the following code.
+	#define LOOP_1(_dir1, _dir2, _vel, _vel_tmp) \
+		for (int index = 0; index < indexMax; ++index) { \
+			_vel = tmpArray[TMP_INDEX(index, JOIN(TMP_,_vel_tmp))]; \
+			JOIN(pdf_,_dir1) = tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir1))]; \
+			JOIN(pdf_,_dir2) = tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir2))]; \
+			w_1_indep = tmpArray[TMP_INDEX(index, TMP_W1)]; \
+			\
+			ui = _vel; \
+			evenPart = omegaEven * (0.5 * (JOIN(pdf_,_dir1) + JOIN(pdf_,_dir2)) - ui * ui * w_1_nine_half - w_1_indep); \
+			oddPart  = omegaOdd  * (0.5 * (JOIN(pdf_,_dir1) - JOIN(pdf_,_dir2)) - ui * w_1_x3); \
+			dst[I(index + blockedIndex, JOIN(D3Q19_,_dir1)  )] = JOIN(pdf_,_dir1) - evenPart - oddPart; \
+			tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir2))] = JOIN(pdf_,_dir2) - evenPart + oddPart; \
+		} \
+		for (int index = 0; index < indexMax; ++index) { \
+			dst[I(index + blockedIndex, JOIN(D3Q19_,_dir2)  )] = tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir2))]; \
+		}
+
+	// Phase 2, diagonal pair: same scheme as LOOP_1, but the velocity
+	// component ui is the expression _expr built from the two reloaded
+	// components _v1 and _v2, and the w_2 weights are used.
+	#define LOOP_2(_dir1, _dir2, _v1, _v2, _v1_tmp, _v2_tmp, _expr) \
+		for (int index = 0; index < indexMax; ++index) { \
+			_v1 = tmpArray[TMP_INDEX(index, JOIN(TMP_,_v1_tmp))]; \
+			_v2 = tmpArray[TMP_INDEX(index, JOIN(TMP_,_v2_tmp))]; \
+			JOIN(pdf_,_dir1) = tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir1))]; \
+			JOIN(pdf_,_dir2) = tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir2))]; \
+			w_2_indep = tmpArray[TMP_INDEX(index, TMP_W2)]; \
+			\
+			ui = _expr; \
+			evenPart = omegaEven * (0.5 * (JOIN(pdf_,_dir1) + JOIN(pdf_,_dir2)) - ui * ui * w_2_nine_half - w_2_indep); \
+			oddPart  = omegaOdd  * (0.5 * (JOIN(pdf_,_dir1) - JOIN(pdf_,_dir2)) - ui * w_2_x3); \
+			dst[I(index + blockedIndex, JOIN(D3Q19_,_dir1))] = JOIN(pdf_,_dir1) - evenPart - oddPart; \
+			tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir2))] = JOIN(pdf_,_dir2) - evenPart + oddPart; \
+		} \
+		for (int index = 0; index < indexMax; ++index) { \
+			dst[I(index + blockedIndex, JOIN(D3Q19_,_dir2)  )] = tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir2))]; \
+		}
+
+		LOOP_1(N, S, uy, UY);
+		LOOP_1(E, W, ux, UX);
+		LOOP_1(T, B, uz, UZ);
+
+		LOOP_2(NW, SE, uy, ux, UY, UX, uy - ux);
+		LOOP_2(NE, SW, uy, ux, UY, UX, uy + ux);
+		LOOP_2(TW, BE, ux, uz, UX, UZ, uz - ux);
+		LOOP_2(TE, BW, ux, uz, UX, UZ, uz + ux);
+		LOOP_2(TS, BN, uy, uz, UY, UZ, uz - uy);
+		LOOP_2(TN, BS, uy, uz, UY, UZ, uz + uy);
+
+	#undef LOOP_1
+	#undef LOOP_2
+
+
+	}
+
+	#undef I
+
+	#undef INDEX_START
+	#undef INDEX_STOP
diff --git a/src/BenchKernelD3Q19ListPullSplitNt2SIntrinsics.h b/src/BenchKernelD3Q19ListPullSplitNt2SIntrinsics.h
new file mode 100644
index 0000000..399fa5f
--- /dev/null
+++ b/src/BenchKernelD3Q19ListPullSplitNt2SIntrinsics.h
@@ -0,0 +1,130 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+
+// Include fragment: collide-and-stream loop nest over the fluid nodes
+// [INDEX_START, INDEX_STOP) with a scalar gather phase and a vectorized
+// update phase. The including code must define both macros (they are
+// #undef'd again at the end of this fragment) and must provide every
+// variable used below (src, dst, adjList, tmpArray, nTmpArray, nCells,
+// indexMax, the scalar pdf_* / ux / uy / uz values, the vector values vui,
+// vpdf_a, vpdf_b, vomegaEven, ...); none of them is declared here.
+// VLDU / VSTNT / VADD / VSUB / VMUL and VSIZE are defined elsewhere;
+// presumably VLDU is an unaligned vector load and VSTNT a non-temporal
+// vector store -- confirm against their definitions.
+
+#ifndef INDEX_START
+	#error INDEX_START must be defined
+#endif
+
+#ifndef INDEX_STOP
+	#error INDEX_STOP must be defined
+#endif
+
+	// Position of PDF `dir` of node `index` in the src/dst arrays.
+	#define I(index, dir)	P_INDEX_3((nCells), (index), (dir))
+
+	// The node range is processed in chunks of at most nTmpArray nodes;
+	// blockedIndex is the first node of the current chunk.
+	for (int blockedIndex = (INDEX_START); blockedIndex < (INDEX_STOP); blockedIndex += nTmpArray) {
+
+		// Number of nodes in this chunk (the last chunk may be shorter).
+		indexMax = MinI(nTmpArray, (INDEX_STOP) - blockedIndex);
+#ifdef DEBUG
+		// Fill the scratch buffer with a recognizable byte pattern.
+		memset(tmpArray, -1, sizeof(PdfT) * nTmpArray * N_TMP);
+#endif
+		// Phase 1 (scalar): per node, gather the PDFs through the adjacency
+		// list, compute velocity and density, stash everything the later
+		// phases need in tmpArray and update the center PDF in dst.
+		for (int index = 0; index < indexMax; ++index) {
+
+
+			adjListIndex = (index + blockedIndex) * N_D3Q19_IDX;
+
+			// The center PDF is read directly, it has no adjacency entry.
+			pdf_C = src[I(index + blockedIndex, D3Q19_C)];
+
+			// For every direction except C: load the PDF via adjList and
+			// keep a copy in tmpArray.
+			#define X(name, idx, idxinv, _x, _y, _z)	JOIN(pdf_,name) = src[adjList[adjListIndex + idx]]; tmpArray[TMP_INDEX(index, idx)] = JOIN(pdf_,name);
+			D3Q19_LIST_WO_C
+			#undef X
+
+			ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE -
+				 pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW;
+			uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN -
+				 pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS;
+			uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS -
+				 pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS;
+
+			tmpArray[TMP_INDEX(index, TMP_UX)] = ux;
+			tmpArray[TMP_INDEX(index, TMP_UY)] = uy;
+			tmpArray[TMP_INDEX(index, TMP_UZ)] = uz;
+
+			dens = pdf_C +
+				   pdf_N + pdf_E + pdf_S + pdf_W +
+				   pdf_NE + pdf_SE + pdf_SW + pdf_NW +
+				   pdf_T + pdf_TN + pdf_TE + pdf_TS + pdf_TW +
+				   pdf_B + pdf_BN + pdf_BE + pdf_BS + pdf_BW;
+
+			// Term shared by all directions; only its weight differs.
+			dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz) * 3.0 / 2.0;
+
+			w_1_indep = w_1 * dir_indep_trm;
+			w_2_indep = w_2 * dir_indep_trm;
+
+			tmpArray[TMP_INDEX(index, TMP_W1)] = w_1_indep;
+			tmpArray[TMP_INDEX(index, TMP_W2)] = w_2_indep;
+
+			dst[I(index + blockedIndex, D3Q19_C )] = pdf_C - omegaEven * (pdf_C - w_0 * dir_indep_trm);
+		}
+
+	// Phase 2 (vector), axis-aligned pair (_dir1, _dir2): VSIZE nodes per
+	// iteration are reloaded from tmpArray, relaxed, and both results are
+	// stored to dst in the same loop (two dst streams per loop).
+	// The _vel argument is not used by this variant.
+	// NOTE(review): the loop steps by VSIZE up to indexMax; if indexMax is
+	// not a multiple of VSIZE the last iteration also processes tmpArray
+	// slots / dst entries past indexMax. Confirm that callers guarantee
+	// suitable chunk sizes or padding.
+	// Do not put // comments inside the macro bodies: the line
+	// continuations would extend them over the following code.
+	#define LOOP_1(_dir1, _dir2, _vel, _vel_tmp) \
+		for (int index = 0; index < indexMax; index += VSIZE) { \
+			vui = VLDU(&tmpArray[TMP_INDEX(index, JOIN(TMP_,_vel_tmp))]); \
+			vpdf_a = VLDU(&tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir1))]); \
+			vpdf_b = VLDU(&tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir2))]); \
+			vw_1_indep = VLDU(&tmpArray[TMP_INDEX(index, TMP_W1)]); \
+			\
+			vevenPart = VMUL(vomegaEven, VSUB(VSUB(VMUL(voneHalf, VADD(vpdf_a, vpdf_b)), VMUL(vui, VMUL(vui, vw_1_nine_half))), vw_1_indep)); \
+			voddPart = VMUL(vomegaOdd, VSUB( VMUL(voneHalf, VSUB(vpdf_a, vpdf_b)), VMUL(vui, vw_1_x3))); \
+			VSTNT(&dst[I(index + blockedIndex, JOIN(D3Q19_,_dir1))], VSUB(VSUB(vpdf_a, vevenPart), voddPart)); \
+			VSTNT(&dst[I(index + blockedIndex, JOIN(D3Q19_,_dir2))], VADD(VSUB(vpdf_b, vevenPart), voddPart)); \
+		}
+
+	// Phase 2 (vector), diagonal pair: same scheme as LOOP_1, but vui is
+	// the vector expression _expr built from the two reloaded velocity
+	// components _v1 and _v2, and the w_2 weights are used.
+	#define LOOP_2(_dir1, _dir2, _v1, _v2, _v1_tmp, _v2_tmp, _expr) \
+		for (int index = 0; index < indexMax; index += VSIZE) { \
+			_v1 = VLDU(&tmpArray[TMP_INDEX(index, JOIN(TMP_,_v1_tmp))]); \
+			_v2 = VLDU(&tmpArray[TMP_INDEX(index, JOIN(TMP_,_v2_tmp))]); \
+			vpdf_a = VLDU(&tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir1))]); \
+			vpdf_b = VLDU(&tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir2))]); \
+			vw_2_indep = VLDU(&tmpArray[TMP_INDEX(index, TMP_W2)]); \
+			\
+			vui = _expr; \
+			vevenPart = VMUL(vomegaEven, VSUB(VSUB(VMUL(voneHalf, VADD(vpdf_a, vpdf_b)), VMUL(vui, VMUL(vui, vw_2_nine_half))), vw_2_indep)); \
+			voddPart = VMUL(vomegaOdd, VSUB( VMUL(voneHalf, VSUB(vpdf_a, vpdf_b)), VMUL(vui, vw_2_x3))); \
+			VSTNT(&dst[I(index + blockedIndex, JOIN(D3Q19_,_dir1))], VSUB(VSUB(vpdf_a, vevenPart), voddPart)); \
+			VSTNT(&dst[I(index + blockedIndex, JOIN(D3Q19_,_dir2))], VADD(VSUB(vpdf_b, vevenPart), voddPart)); \
+		}
+
+		LOOP_1(N, S, vuy, UY);
+		LOOP_1(E, W, vux, UX);
+		LOOP_1(T, B, vuz, UZ);
+		LOOP_2(NW, SE, vuy, vux, UY, UX, VSUB(vuy, vux));
+		LOOP_2(NE, SW, vuy, vux, UY, UX, VADD(vuy, vux));
+		LOOP_2(TW, BE, vux, vuz, UX, UZ, VSUB(vuz, vux));
+		LOOP_2(TE, BW, vux, vuz, UX, UZ, VADD(vuz, vux));
+		LOOP_2(TS, BN, vuy, vuz, UY, UZ, VSUB(vuz, vuy));
+		LOOP_2(TN, BS, vuy, vuz, UY, UZ, VADD(vuz, vuy));
+
+	#undef LOOP_1
+	#undef LOOP_2
+
+	} // loop over fluid nodes
+
+	#undef I
+
+	#undef INDEX_START
+	#undef INDEX_STOP
+
diff --git a/src/BenchKernelD3Q19ListPullSplitNt2SScalar.h b/src/BenchKernelD3Q19ListPullSplitNt2SScalar.h
new file mode 100644
index 0000000..ca1f3dd
--- /dev/null
+++ b/src/BenchKernelD3Q19ListPullSplitNt2SScalar.h
@@ -0,0 +1,133 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+
+// Include fragment: scalar collide-and-stream loop nest over the fluid nodes
+// [INDEX_START, INDEX_STOP). The including code must define both macros
+// (they are #undef'd again at the end of this fragment) and must provide
+// every variable used below (src, dst, adjList, tmpArray, nTmpArray, nCells,
+// indexMax, the pdf_* / ux / uy / uz scalars, omegaEven, omegaOdd, w_0, ...);
+// none of them is declared here.
+
+#ifndef INDEX_START
+	#error INDEX_START must be defined
+#endif
+
+#ifndef INDEX_STOP
+	#error INDEX_STOP must be defined
+#endif
+
+	// Position of PDF `dir` of node `index` in the src/dst arrays.
+	#define I(index, dir)	P_INDEX_3((nCells), (index), (dir))
+
+	// The node range is processed in chunks of at most nTmpArray nodes;
+	// blockedIndex is the first node of the current chunk.
+	for (int blockedIndex = (INDEX_START); blockedIndex < (INDEX_STOP); blockedIndex += nTmpArray) {
+
+		// Number of nodes in this chunk (the last chunk may be shorter).
+		indexMax = MinI(nTmpArray, (INDEX_STOP) - blockedIndex);
+#ifdef DEBUG
+		// Fill the scratch buffer with a recognizable byte pattern.
+		memset(tmpArray, -1, sizeof(PdfT) * nTmpArray * N_TMP);
+#endif
+		// Phase 1: per node, gather the PDFs through the adjacency list,
+		// compute velocity and density, stash everything the later phases
+		// need in tmpArray and update the center PDF in dst.
+		for (int index = 0; index < indexMax; ++index) {
+
+
+			adjListIndex = (index + blockedIndex) * N_D3Q19_IDX;
+
+			// The center PDF is read directly, it has no adjacency entry.
+			pdf_C = src[I(index + blockedIndex, D3Q19_C)];
+
+			// For every direction except C: load the PDF via adjList and
+			// keep a copy in tmpArray.
+			#define X(name, idx, idxinv, _x, _y, _z)	JOIN(pdf_,name) = src[adjList[adjListIndex + idx]]; tmpArray[TMP_INDEX(index, idx)] = JOIN(pdf_,name);
+			D3Q19_LIST_WO_C
+			#undef X
+
+			ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE -
+				 pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW;
+			uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN -
+				 pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS;
+			uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS -
+				 pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS;
+
+			tmpArray[TMP_INDEX(index, TMP_UX)] = ux;
+			tmpArray[TMP_INDEX(index, TMP_UY)] = uy;
+			tmpArray[TMP_INDEX(index, TMP_UZ)] = uz;
+
+			dens = pdf_C +
+				   pdf_N + pdf_E + pdf_S + pdf_W +
+				   pdf_NE + pdf_SE + pdf_SW + pdf_NW +
+				   pdf_T + pdf_TN + pdf_TE + pdf_TS + pdf_TW +
+				   pdf_B + pdf_BN + pdf_BE + pdf_BS + pdf_BW;
+
+			// Term shared by all directions; only its weight differs.
+			dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz) * 3.0 / 2.0;
+
+			w_1_indep = w_1 * dir_indep_trm;
+			w_2_indep = w_2 * dir_indep_trm;
+
+			tmpArray[TMP_INDEX(index, TMP_W1)] = w_1_indep;
+			tmpArray[TMP_INDEX(index, TMP_W2)] = w_2_indep;
+
+			dst[I(index + blockedIndex, D3Q19_C )] = pdf_C - omegaEven * (pdf_C - w_0 * dir_indep_trm);
+		}
+
+
+	// Phase 2, axis-aligned pair (_dir1, _dir2): both PDFs are reloaded
+	// from tmpArray, relaxed, and both results are written to dst in the
+	// same loop, i.e. each loop writes two dst streams (presumably what
+	// "2S" in the file name refers to -- confirm).
+	// Do not put // comments inside the macro bodies: the line
+	// continuations would extend them over the following code.
+	#define LOOP_1(_dir1, _dir2, _vel, _vel_tmp) \
+		for (int index = 0; index < indexMax; ++index) { \
+			_vel = tmpArray[TMP_INDEX(index, JOIN(TMP_,_vel_tmp))]; \
+			JOIN(pdf_,_dir1) = tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir1))]; \
+			JOIN(pdf_,_dir2) = tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir2))]; \
+			w_1_indep = tmpArray[TMP_INDEX(index, TMP_W1)]; \
+			\
+			ui = _vel; \
+			evenPart = omegaEven * (0.5 * (JOIN(pdf_,_dir1) + JOIN(pdf_,_dir2)) - ui * ui * w_1_nine_half - w_1_indep); \
+			oddPart  = omegaOdd  * (0.5 * (JOIN(pdf_,_dir1) - JOIN(pdf_,_dir2)) - ui * w_1_x3); \
+			dst[I(index + blockedIndex, JOIN(D3Q19_,_dir1)  )] = JOIN(pdf_,_dir1) - evenPart - oddPart; \
+			dst[I(index + blockedIndex, JOIN(D3Q19_,_dir2)  )] = JOIN(pdf_,_dir2) - evenPart + oddPart; \
+		}
+
+	// Phase 2, diagonal pair: same scheme as LOOP_1, but the velocity
+	// component ui is the expression _expr built from the two reloaded
+	// components _v1 and _v2, and the w_2 weights are used.
+	#define LOOP_2(_dir1, _dir2, _v1, _v2, _v1_tmp, _v2_tmp, _expr) \
+		for (int index = 0; index < indexMax; ++index) { \
+			_v1 = tmpArray[TMP_INDEX(index, JOIN(TMP_,_v1_tmp))]; \
+			_v2 = tmpArray[TMP_INDEX(index, JOIN(TMP_,_v2_tmp))]; \
+			JOIN(pdf_,_dir1) = tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir1))]; \
+			JOIN(pdf_,_dir2) = tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir2))]; \
+			w_2_indep = tmpArray[TMP_INDEX(index, TMP_W2)]; \
+			\
+			ui = _expr; \
+			evenPart = omegaEven * (0.5 * (JOIN(pdf_,_dir1) + JOIN(pdf_,_dir2)) - ui * ui * w_2_nine_half - w_2_indep); \
+			oddPart  = omegaOdd  * (0.5 * (JOIN(pdf_,_dir1) - JOIN(pdf_,_dir2)) - ui * w_2_x3); \
+			dst[I(index + blockedIndex, JOIN(D3Q19_,_dir1))] = JOIN(pdf_,_dir1) - evenPart - oddPart; \
+			dst[I(index + blockedIndex, JOIN(D3Q19_,_dir2))] = JOIN(pdf_,_dir2) - evenPart + oddPart; \
+		}
+
+		LOOP_1(N, S, uy, UY);
+		LOOP_1(E, W, ux, UX);
+		LOOP_1(T, B, uz, UZ);
+
+		LOOP_2(NW, SE, uy, ux, UY, UX, uy - ux);
+		LOOP_2(NE, SW, uy, ux, UY, UX, uy + ux);
+		LOOP_2(TW, BE, ux, uz, UX, UZ, uz - ux);
+		LOOP_2(TE, BW, ux, uz, UX, UZ, uz + ux);
+		LOOP_2(TS, BN, uy, uz, UY, UZ, uz - uy);
+		LOOP_2(TN, BS, uy, uz, UY, UZ, uz + uy);
+
+	#undef LOOP_1
+	#undef LOOP_2
+
+
+	}
+
+	#undef I
+
+	#undef INDEX_START
+	#undef INDEX_STOP
diff --git a/src/BenchKernelD3Q19ListPullSplitNtCommon.c b/src/BenchKernelD3Q19ListPullSplitNtCommon.c
new file mode 100644
index
0000000..b5df14b --- /dev/null +++ b/src/BenchKernelD3Q19ListPullSplitNtCommon.c @@ -0,0 +1,797 @@ +// -------------------------------------------------------------------------- +// +// Copyright +// Markus Wittmann, 2016-2017 +// RRZE, University of Erlangen-Nuremberg, Germany +// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de +// +// Viktor Haag, 2016 +// LSS, University of Erlangen-Nuremberg, Germany +// +// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). +// +// LbmBenchKernels is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// LbmBenchKernels is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with LbmBenchKernels. If not, see . +// +// -------------------------------------------------------------------------- +#include "BenchKernelD3Q19ListPullSplitNtCommon.h" + +#include "Memory.h" +#include "Vtk.h" +#include "Vector.h" + +#include + +#ifdef _OPENMP + #include +#endif + +// Forward definition. +void FNAME(KernelPullSplitNt1S)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd); +void FNAME(KernelPullSplitNt2S)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd); + +void FNAME(KernelPullSplitNtRia1S)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd); +void FNAME(KernelPullSplitNtRia2S)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd); + + + + +// ----------------------------------------------------------------------- +// Functions which are used as callback by the kernel to read or write +// PDFs and nodes. 
+ +static void FNAME(BCGetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT * pdf) +{ + Assert(kd != NULL); + Assert(kd->PdfsActive != NULL); + Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]); + Assert(pdf != NULL); + + Assert(x >= 0); Assert(y >= 0); Assert(z >= 0); + Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]); + Assert(dir >= 0); Assert(dir < N_D3Q19); + + // The relevant PDFs here are the ones, which will get streamed in later + // during propagation. So we must return the *remote* PDFs. + uint32_t nodeIndex = KDL(kd)->Grid[L_INDEX_4(kd->Dims, x, y, z)]; + + if (dir != D3Q19_C) { + + uint32_t adjListIndex = nodeIndex * N_D3Q19_IDX; + + *pdf = kd->PdfsActive[KDL(kd)->AdjList[adjListIndex + dir]]; + } + else { + *pdf = kd->PdfsActive[P_INDEX_3(KDL(kd)->nCells, nodeIndex, dir)]; + + } + + return; +} + +static void FNAME(BCSetPdf)(KernelData * kd, int x, int y, int z, int dir, PdfT pdf) +{ + Assert(kd != NULL); + Assert(kd->PdfsActive != NULL); + Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]); + Assert(x >= 0); Assert(y >= 0); Assert(z >= 0); + Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]); + Assert(dir >= 0); Assert(dir < N_D3Q19); + + if (isnan(pdf)) { + printf("ERROR: setting nan %d %d %d %d %s\n", x, y, z, dir, D3Q19_NAMES[dir]); + DEBUG_BREAK_POINT(); + exit(1); + } + + // The relevant PDFs here are the ones, which will get streamed in later + // during propagation. So we must set this *remote* PDFs. 
+ uint32_t nodeIndex = KDL(kd)->Grid[L_INDEX_4(kd->Dims, x, y, z)]; + + if (dir != D3Q19_C) { + + uint32_t adjListIndex = nodeIndex * N_D3Q19_IDX; + + kd->PdfsActive[KDL(kd)->AdjList[adjListIndex + dir]] = pdf; + } + else { + kd->PdfsActive[P_INDEX_3(KDL(kd)->nCells, nodeIndex, dir)] = pdf; + + } + + return; +} + + +static void GetNode(KernelData * kd, int x, int y, int z, PdfT * pdfs) +{ + Assert(kd != NULL); + Assert(kd->PdfsActive != NULL); + Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]); + Assert(pdfs != NULL); + Assert(x >= 0); Assert(y >= 0); Assert(z >= 0); + Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]); + + PdfT sum = 0.0; + + // TODO: pull scheme? + + #define I(x, y, z, dir) P_INDEX_5(KDL(kd), (x), (y), (z), (dir)) + #define X(name, idx, idxinv, _x, _y, _z) pdfs[idx] = kd->PdfsActive[I(x, y, z, idx)]; sum += pdfs[idx]; + D3Q19_LIST + #undef X + #undef I + +#ifdef DETECT_NANS + for (int d = 0; d < 19; ++d) { + if(isnan(pdfs[d]) || isinf(pdfs[d])) { + printf("%d %d %d %d nan! get node\n", x, y, z, d); + for (int d2 = 0; d2 < 19; ++d2) { + printf("%d: %e\n", d2, pdfs[d2]); + } + exit(1); + } + } +#endif + return; +} + + +static void SetNode(KernelData * kd, int x, int y, int z, PdfT * pdfs) +{ + Assert(kd != NULL); + Assert(kd->PdfsActive != NULL); + Assert(kd->PdfsActive == kd->Pdfs[0] || kd->PdfsActive == kd->Pdfs[1]); + Assert(pdfs != NULL); + + Assert(x >= 0); Assert(y >= 0); Assert(z >= 0); + Assert(x < kd->Dims[0]); Assert(y < kd->Dims[1]); Assert(z < kd->Dims[2]); + +#ifdef DETECT_NANS + for (int d = 0; d < 19; ++d) { + if(isnan(pdfs[d])) { + printf("%d %d %d %d nan! get node\n", x, y, z, d); + for (int d2 = 0; d2 < 19; ++d2) { + printf("%d: %e\n", d2, pdfs[d2]); + } + exit(1); + } + } +#endif + + // TODO: pull scheme? 
+ #define I(x, y, z, dir) P_INDEX_5(KDL(kd), (x), (y), (z), (dir)) + #define X(name, idx, idxinv, _x, _y, _z) kd->PdfsActive[I(x, y, z, idx)] = pdfs[idx]; + D3Q19_LIST + #undef X + #undef I + + return; +} + +static void ParameterUsage() +{ + printf("Kernel parameters:\n"); + printf(" [-blk ] [-blk-[xyz] ] [-n-tmp-array ]\n"); + + return; +} + +static void ParseParameters(Parameters * params, int * blk, int * nTmpArray) +{ + Assert(blk != NULL); + + blk[0] = 0; blk[1] = 0; blk[2] = 0; + *nTmpArray = 152; + + #define ARG_IS(param) (!strcmp(params->KernelArgs[i], param)) + #define NEXT_ARG_PRESENT() \ + do { \ + if (i + 1 >= params->nKernelArgs) { \ + printf("ERROR: argument %s requires a parameter.\n", params->KernelArgs[i]); \ + exit(1); \ + } \ + } while (0) + + + for (int i = 0; i < params->nKernelArgs; ++i) { + if (ARG_IS("-blk") || ARG_IS("--blk")) { + NEXT_ARG_PRESENT(); + + int tmp = strtol(params->KernelArgs[++i], NULL, 0); + + if (tmp <= 0) { + printf("ERROR: blocking parameter must be > 0.\n"); + exit(1); + } + + blk[0] = blk[1] = blk[2] = tmp; + } + else if (ARG_IS("-blk-x") || ARG_IS("--blk-x")) { + NEXT_ARG_PRESENT(); + + int tmp = strtol(params->KernelArgs[++i], NULL, 0); + + if (tmp <= 0) { + printf("ERROR: blocking parameter must be > 0.\n"); + exit(1); + } + + blk[0] = tmp; + } + else if (ARG_IS("-blk-y") || ARG_IS("--blk-y")) { + NEXT_ARG_PRESENT(); + + int tmp = strtol(params->KernelArgs[++i], NULL, 0); + + if (tmp <= 0) { + printf("ERROR: blocking parameter must be > 0.\n"); + exit(1); + } + + blk[1] = tmp; + } + else if (ARG_IS("-blk-z") || ARG_IS("--blk-z")) { + NEXT_ARG_PRESENT(); + + int tmp = strtol(params->KernelArgs[++i], NULL, 0); + + if (tmp <= 0) { + printf("ERROR: blocking parameter must be > 0.\n"); + exit(1); + } + + blk[2] = tmp; + } + else if (ARG_IS("-n-tmp-array") || ARG_IS("--n-tmp-array")) { + NEXT_ARG_PRESENT(); + + int tmp = strtol(params->KernelArgs[++i], NULL, 0); + + if (tmp <= 0) { + printf("ERROR: -n-tmp-array parameter 
must be > 0.\n"); + exit(1); + } + + if (tmp % VSIZE != 0) { + printf("ERROR: value for -n-tmp-array must be a multiple of %d.\n", VSIZE); + exit(1); + } + + *nTmpArray = tmp; + } + else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) { + ParameterUsage(); + exit(1); + } + else { + printf("ERROR: unknown kernel parameter.\n"); + ParameterUsage(); + exit(1); + } + } + + #undef ARG_IS + #undef NEXT_ARG_PRESENT + + return; +} + +static void SetupConsecNodes(LatticeDesc * ld, KernelDataListRia * kdlr, int nThreads) +{ + Assert(ld != NULL); + Assert(kdlr != NULL); + Assert(nThreads > 0); + + uint32_t * adjList = kdlr->kdl.AdjList; + + uint32_t nConsecNodes = 0; + uint32_t consecIndex = 0; + + int nFluid = kdlr->kdl.nFluid; + + uint32_t * consecThreadIndices = (uint32_t *)malloc(sizeof(uint32_t) * (nThreads + 1)); + + int nNodesPerThread = nFluid / nThreads; + + for (int i = 0; i < nThreads; ++i) { + consecThreadIndices[i] = i * nNodesPerThread + MinI(i, nFluid % nThreads); + } + consecThreadIndices[nThreads] = -1; + + int indexThread = 1; + + // We execute following code two times. + // - The first time to get the count of how many entries we need for the + // consecNodes array. + // - The second time to fill the array. + + // Loop over adjacency list of all nodes. + // Compare if adjacent nodes share the same access pattern. + for (int index = 1; index < nFluid; ++index) { + + int different = 0; + + // Loop over all directions except the center one. + for(int d = 0; d < N_D3Q19 - 1; ++d) { + Assert(d != D3Q19_C); + + if (adjList[index * N_D3Q19_IDX + d] != adjList[(index - 1) * N_D3Q19_IDX + d] + 1) { + // Different access pattern. + different = 1; + break; + } + } + + if (consecThreadIndices[indexThread] == index) { + // We are at a thread boundary. Starting from this index the fluids + // belong to another thread. Force a break, if nodes are consecutive. 
+ ++indexThread; + different = 1; + } + + if (different) { + ++consecIndex; + } + } + + if (nFluid > 0) { + nConsecNodes = consecIndex + 1; + } + + uint32_t * consecNodes; + MemAlloc((void **)&consecNodes, sizeof(uint32_t) * nConsecNodes); + + consecIndex = 0; + + if (nFluid > 0) { + consecNodes[consecIndex] = 1; + } + + indexThread = 1; + consecThreadIndices[0] = 0; + + // Loop over adjacency list of all nodes. + // Compare if adjacent nodes share the same access pattern. + for (int index = 1; index < nFluid; ++index) { + + int different = 0; + + // Loop over all directions except the center one. + for(int d = 0; d < N_D3Q19 - 1; ++d) { + Assert(d != D3Q19_C); + + if (adjList[index * N_D3Q19_IDX + d] != adjList[(index - 1) * N_D3Q19_IDX + d] + 1) { + // Different access pattern. + different = 1; + break; + } + } + + if (consecThreadIndices[indexThread] == index) { + // We are at a thread boundary. Starting from this index the fluids + // belong to another thread. Force a break, if nodes are consecutive. 
+ consecThreadIndices[indexThread] = consecIndex + 1; + ++indexThread; + different = 1; + } + + if (different) { + ++consecIndex; + Assert(consecIndex < nConsecNodes); + consecNodes[consecIndex] = 1; + } + else { + Assert(consecIndex < nConsecNodes); + consecNodes[consecIndex] += 1; + } + } + + + kdlr->ConsecNodes = consecNodes; + kdlr->nConsecNodes = nConsecNodes; + + kdlr->ConsecThreadIndices = consecThreadIndices; + kdlr->nConsecThreadIndices = nThreads; + + // printf("# total fluid nodes: %d consecutive blocks: %d\n", nFluid, nConsecNodes); + + return; +} + + +static void FNAME(Init)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params) +{ + KernelData * kd; + KernelDataList * kdl; + KernelDataListRia * kdlr; + MemAlloc((void **)&kdlr, sizeof(KernelDataListRia)); + + kd = (KernelData *)kdlr; + kdl = KDL(kdlr); + + *kernelData = kd; + +#ifdef DEBUG + kd->Pdfs[0] = NULL; + kd->Pdfs[1] = NULL; + kd->PdfsActive = NULL; + kd->DstPdfs = NULL; + kd->SrcPdfs = NULL; + kd->Dims[0] = -1; + kd->Dims[1] = -1; + kd->Dims[2] = -1; + kd->GlobalDims[0] = -1; + kd->GlobalDims[1] = -1; + kd->GlobalDims[2] = -1; + kd->Offsets[0] = -1; + kd->Offsets[1] = -1; + kd->Offsets[2] = -1; + + kd->ObstIndices = NULL; + kd->nObstIndices = -1; + kd->BounceBackPdfsSrc = NULL; + kd->BounceBackPdfsDst = NULL; + kd->nBounceBackPdfs = -1; + + kdl->AdjList = NULL; + kdl->Coords = NULL; + kdl->Grid = NULL; + kdl->nCells = -1; + kdl->nFluid = -1; + + kdlr->ConsecNodes = NULL; + kdlr->nConsecNodes = 0; + kdlr->ConsecThreadIndices = NULL; + kdlr->nConsecThreadIndices = 0; +#endif + + // Ajust the dimensions according to padding, if used. 
+ kd->Dims[0] = kd->GlobalDims[0] = ld->Dims[0]; + kd->Dims[1] = kd->GlobalDims[1] = ld->Dims[1]; + kd->Dims[2] = kd->GlobalDims[2] = ld->Dims[2]; + + int * lDims = ld->Dims; + + int lX = lDims[0]; + int lY = lDims[1]; + int lZ = lDims[2]; + + int nTotalCells = lX * lY * lZ; + int nCells = ld->nFluid; + int nFluid = ld->nFluid; + + // We padd each stream of a PDF array for a complete cache line. + // TODO: padding for L1/L2 and TLB. + nCells = nCells + (8 - nCells % 8); + + Assert(nCells % VSIZE == 0); + + kdl->nCells = nCells; + kdl->nFluid = nFluid; + + PdfT * pdfs[2]; + + int blk[3] = { 0 }; + + ParseParameters(params, blk, &kdlr->nTmpArray); + + if (blk[0] == 0) blk[0] = lX; + if (blk[1] == 0) blk[1] = lY; + if (blk[2] == 0) blk[2] = lZ; + + printf("# blocking x: %3d y: %3d z: %3d\n", blk[0], blk[1], blk[2]); + printf("# temporary array size: %d PDFs, %lu b\n", kdlr->nTmpArray, kdlr->nTmpArray * sizeof(PdfT) * 23); + + double latMiB = nCells * sizeof(PdfT) * N_D3Q19 / 1024.0 / 1024.0; + double latFluidMib = nFluid * sizeof(PdfT) * N_D3Q19 / 1024.0 / 1024.0; + double latPadMib = (nCells - nFluid) * sizeof(PdfT) * N_D3Q19 / 1024.0 / 1024.0; + + printf("# lattice size: %e MiB total: %e MiB\n", latMiB, latMiB * 2); + printf("# fluid lattice size: %e MiB total: %e MiB\n", latFluidMib, latFluidMib * 2); + printf("# lattice padding: %e MiB total: %e MiB\n", latPadMib, latPadMib * 2); + +#define PAGE_4K 4096 + + printf("# aligning lattices to: %d b\n", PAGE_4K); + + MemAllocAligned((void **)&pdfs[0], sizeof(PdfT) * nCells * N_D3Q19, PAGE_4K); + MemAllocAligned((void **)&pdfs[1], sizeof(PdfT) * nCells * N_D3Q19, PAGE_4K); + + kd->Pdfs[0] = pdfs[0]; + kd->Pdfs[1] = pdfs[1]; + + // Initialize PDFs with some (arbitrary) data for correct NUMA placement. + // Here we touch only the fluid nodes as this loop is OpenMP parallel and + // we want the same scheduling as in the kernel. 
+ #ifdef _OPENMP + #pragma omp parallel for + #endif + for (int i = 0; i < nFluid; ++i) { for(int d = 0; d < N_D3Q19; ++d) { + pdfs[0][P_INDEX_3(nCells, i, d)] = 1.0; + pdfs[1][P_INDEX_3(nCells, i, d)] = 1.0; + } } + + // Initialize all PDFs to some standard value. + for (int i = 0; i < nFluid; ++i) { for(int d = 0; d < N_D3Q19; ++d) { + pdfs[0][P_INDEX_3(nCells, i, d)] = 0.0; + pdfs[1][P_INDEX_3(nCells, i, d)] = 0.0; + } } + + // ---------------------------------------------------------------------- + // create grid which will hold the index numbers of the fluid nodes + + uint32_t * grid; + + if (MemAlloc((void **)&grid, nTotalCells * sizeof(uint32_t))) { + printf("ERROR: allocating grid for numbering failed: %lu bytes.\n", nTotalCells * sizeof(uint32_t)); + exit(1); + } + kdl->Grid = grid; + + int latticeIndex; + +#ifdef DEBUG + for(int z = 0; z < lZ; ++z) { + for(int y = 0; y < lY; ++y) { + for(int x = 0; x < lX; ++x) { + + latticeIndex = L_INDEX_4(ld->Dims, x, y, z); + + grid[latticeIndex] = ~0; + } + } + } +#endif + + // ---------------------------------------------------------------------- + // generate numbering over grid + + uint32_t * coords; + + if (MemAlloc((void **)&coords, nFluid * sizeof(uint32_t) * 3)) { + printf("ERROR: allocating coords array failed: %lu bytes.\n", nFluid * sizeof(uint32_t) * 3); + exit(1); + } + + kdl->Coords = coords; + + // Index for the PDF nodes can start at 0 as we distinguish solid and fluid nodes + // through the ld->Lattice array. + int counter = 0; + + // Blocking is implemented via setup of the adjacency list. The kernel later will + // walk through the lattice blocked automatically. 
+ for (int bZ = 0; bZ < lZ; bZ += blk[2]) { + for (int bY = 0; bY < lY; bY += blk[1]) { + for (int bX = 0; bX < lX; bX += blk[0]) { + + int eX = MIN(bX + blk[0], lX); + int eY = MIN(bY + blk[1], lY); + int eZ = MIN(bZ + blk[2], lZ); + + + for (int z = bZ; z < eZ; ++z) { + for (int y = bY; y < eY; ++y) { + for (int x = bX; x < eX; ++x) { + + latticeIndex = L_INDEX_4(lDims, x, y, z); + + if (ld->Lattice[latticeIndex] != LAT_CELL_OBSTACLE) { + grid[latticeIndex] = counter; + + coords[C_INDEX_X(counter)] = x; + coords[C_INDEX_Y(counter)] = y; + coords[C_INDEX_Z(counter)] = z; + + ++counter; + } + } } } + } } } + + Verify(counter == nFluid); + + uint32_t * adjList; + + double indexMib = nFluid * sizeof(uint32_t) * N_D3Q19_IDX / 1024.0 / 1024.0; + + printf("# index size: %e MiB\n", indexMib); + + + // AdjList only requires 18 instead of 19 entries per node, as + // the center PDF needs no addressing. + if (MemAlloc((void **)&adjList, nFluid * sizeof(uint32_t) * N_D3Q19_IDX)) { + printf("ERROR: allocating adjList array failed: %lu bytes.\n", nFluid * sizeof(uint32_t) * N_D3Q19_IDX); + exit(1); + } + + kdl->AdjList = adjList; + + int x, y, z; + + uint32_t neighborIndex; + uint32_t dstIndex; + + int nx, ny, nz, px, py, pz; + + // Loop over all fluid nodes and compute the indices to the neighboring + // PDFs for configured data layout (AoS/SoA). + // Parallelized loop to ensure correct NUMA placement. 
+ // #ifdef _OPENMP --> add line continuation + // #pragma omp parallel for default(none) + // shared(nFluid, nCells, coords, D3Q19_INV, D3Q19_X, D3Q19_Y, D3Q19_Z, + // stderr, + // lDims, grid, ld, lX, lY, lZ, adjList) + // private(x, y, z, nx, ny, nz, neighborIndex, dstIndex) + // #endif + for (int index = 0; index < nFluid; ++index) { + x = coords[C_INDEX_X(index)]; + y = coords[C_INDEX_Y(index)]; + z = coords[C_INDEX_Z(index)]; + + Assert(x >= 0 && x < lX); + Assert(y >= 0 && y < lY); + Assert(z >= 0 && z < lZ); + + Assert(ld->Lattice[L_INDEX_4(lDims, x, y, z)] != LAT_CELL_OBSTACLE); + + // Loop over all directions except the center one. + for(int d = 0; d < N_D3Q19 - 1; ++d) { + Assert(d != D3Q19_C); + +#ifdef PROP_MODEL_PUSH + nx = x + D3Q19_X[d]; + ny = y + D3Q19_Y[d]; + nz = z + D3Q19_Z[d]; + +#elif PROP_MODEL_PULL + nx = x - D3Q19_X[d]; + ny = y - D3Q19_Y[d]; + nz = z - D3Q19_Z[d]; +#else + #error No implementation for this PROP_MODEL_NAME. +#endif + // If the neighbor is outside the latcie in X direction and we have a + // periodic boundary then we need to wrap around. + if ( ((nx < 0 || nx >= lX) && ld->PeriodicX) || + ((ny < 0 || ny >= lY) && ld->PeriodicY) || + ((nz < 0 || nz >= lZ) && ld->PeriodicZ) + ){ + // x periodic + + if (nx < 0) { + px = lX - 1; + } + else if (nx >= lX) { + px = 0; + } else { + px = nx; + } + // y periodic + if (ny < 0) { + py = lY - 1; + } + else if (ny >= lY) { + py = 0; + } else { + py = ny; + } + + // z periodic + if (nz < 0) { + pz = lZ - 1; + } + else if (nz >= lZ) { + pz = 0; + } else { + pz = nz; + } + + if (ld->Lattice[L_INDEX_4(lDims, px, py, pz)] == LAT_CELL_OBSTACLE) { + dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]); + } + else { + neighborIndex = grid[L_INDEX_4(lDims, px, py, pz)]; + + AssertMsg(neighborIndex != ~0, "Neighbor has no Index. 
(%d %d %d) direction %s (%d)\n", px, py, pz, D3Q19_NAMES[d], d); + + dstIndex = P_INDEX_3(nCells, neighborIndex, d); + } + } + else if (nx < 0 || ny < 0 || nz < 0 || nx >= lX || ny >= lY || nz >= lZ) { + dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]); + } + else if (ld->Lattice[L_INDEX_4(lDims, nx, ny, nz)] == LAT_CELL_OBSTACLE) { + dstIndex = P_INDEX_3(nCells, index, D3Q19_INV[d]); + } + else { + neighborIndex = grid[L_INDEX_4(lDims, nx, ny, nz)]; + + Assert(neighborIndex != ~0); + + dstIndex = P_INDEX_3(nCells, neighborIndex, d); + } + + Assert(dstIndex >= 0); + Assert(dstIndex < nCells * N_D3Q19); + + adjList[index * N_D3Q19_IDX + d] = dstIndex; + } + } + + int nThreads = 1; + +#ifdef _OPENMP + nThreads = omp_get_max_threads(); +#endif + + SetupConsecNodes(ld, KDLR(kd), nThreads); + + + // Fill remaining KernelData structures + kd->GetNode = GetNode; + kd->SetNode = SetNode; + + kd->BoundaryConditionsGetPdf = FNAME(BCGetPdf); + kd->BoundaryConditionsSetPdf = FNAME(BCSetPdf); + + kd->Kernel = NULL; // FNAME(KernelPullSplitNt2S); + + kd->DstPdfs = NULL; + kd->PdfsActive = kd->Pdfs[0]; + + return; +} + +void FNAME(D3Q19ListPullSplitNt1SInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params) +{ + FNAME(Init)(ld, kernelData, params); + (*kernelData)->Kernel = FNAME(KernelPullSplitNt1S); + + double loopBalance = 2.0 * 19 * sizeof(PdfT) + (18 * 4.0); + printf("# loop balance: %.2f B/FLUP\n", loopBalance); +} + +void FNAME(D3Q19ListPullSplitNt2SInit)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params) +{ + FNAME(Init)(ld, kernelData, params); + (*kernelData)->Kernel = FNAME(KernelPullSplitNt2S); + + double loopBalance = 2.0 * 19 * sizeof(PdfT) + (18 * 4.0); + printf("# loop balance: %.2f B/FLUP\n", loopBalance); +} + + +void FNAME(D3Q19ListPullSplitNtDeinit)(LatticeDesc * ld, KernelData ** kernelData) +{ + KernelDataListRia ** kdlr = (KernelDataListRia **)kernelData; + + MemFree((void **)&((*kdlr)->ConsecNodes)); + + if 
((*kdlr)->ConsecThreadIndices != NULL) { + MemFree((void **)&((*kdlr)->ConsecThreadIndices)); + } + + KernelDataList ** kdl = (KernelDataList **)kernelData; + + MemFree((void **)&((*kdl)->AdjList)); + MemFree((void **)&((*kdl)->Coords)); + MemFree((void **)&((*kdl)->Grid)); + + MemFree((void **)&((*kernelData)->Pdfs[0])); + MemFree((void **)&((*kernelData)->Pdfs[1])); + + MemFree((void **)kernelData); + return; +} + diff --git a/src/BenchKernelD3Q19ListPullSplitNtCommon.h b/src/BenchKernelD3Q19ListPullSplitNtCommon.h new file mode 100644 index 0000000..720c0d4 --- /dev/null +++ b/src/BenchKernelD3Q19ListPullSplitNtCommon.h @@ -0,0 +1,57 @@ +// -------------------------------------------------------------------------- +// +// Copyright +// Markus Wittmann, 2016-2017 +// RRZE, University of Erlangen-Nuremberg, Germany +// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de +// +// Viktor Haag, 2016 +// LSS, University of Erlangen-Nuremberg, Germany +// +// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). +// +// LbmBenchKernels is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// LbmBenchKernels is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with LbmBenchKernels. If not, see . 
+//
+// --------------------------------------------------------------------------
+#ifndef __BENCH_KERNEL_D3Q19_LIST_PULL_SPLIT_NT_COMMON_H__
+#define __BENCH_KERNEL_D3Q19_LIST_PULL_SPLIT_NT_COMMON_H__
+
+// Refuse to build with any data layout other than SoA.
+#if !defined(DATA_LAYOUT_SOA)
+	#error D3Q19 list pull split NT kernels work only with DATA_LAYOUT_SOA
+#endif
+
+#include "BenchKernelD3Q19ListCommon.h"
+
+// Kernel data of the list based pull/split/NT kernels.
+//
+// The KernelDataList member comes first, so a pointer to this structure can
+// also be used as a KernelDataList pointer (the Deinit function of this kernel
+// family casts the same pointer both ways).
+// NOTE(review): the type keeps the "Ria" name although this header belongs to
+// the PullSplitNt kernels; renaming would change the interface used elsewhere.
+typedef struct KernelDataListRia_ {
+	KernelDataList kdl;
+
+	// Array contains information of how many adjacent nodes share the same access pattern.
+	uint32_t * ConsecNodes;
+	uint32_t nConsecNodes;	// Number of entries in ConsecNodes array.
+
+	// Array contains (for each thread) an index into ConsecNodes.
+	uint32_t * ConsecThreadIndices;
+	// Number of entries in ConsecThreadIndices.
+	uint32_t nConsecThreadIndices;
+
+	// Size of temporary array in units of PdfTs.
+	int nTmpArray;
+} KernelDataListRia;
+
+// Macro for casting KernelData * to KernelDataListRia *.
+#define KDLR(_x_) ((KernelDataListRia *)(_x_))
+
+
+#endif // __BENCH_KERNEL_D3Q19_LIST_PULL_SPLIT_NT_COMMON_H__
+
diff --git a/src/BoostJoin.h b/src/BoostJoin.h
new file mode 100644
index 0000000..e9167ab
--- /dev/null
+++ b/src/BoostJoin.h
@@ -0,0 +1,38 @@
+// Boost config.hpp configuration header file ------------------------------//
+// boostinspect:ndprecated_macros -- tell the inspect tool to ignore this file
+
+// Copyright (c) 2001-2003 John Maddock
+// Copyright (c) 2001 Darin Adler
+// Copyright (c) 2001 Peter Dimov
+// Copyright (c) 2002 Bill Kempf
+// Copyright (c) 2002 Jens Maurer
+// Copyright (c) 2002-2003 David Abrahams
+// Copyright (c) 2003 Gennaro Prota
+// Copyright (c) 2003 Eric Friedman
+// Copyright (c) 2010 Eric Jourdanneau, Joel Falcou
+// Distributed under the Boost Software License, Version 1.0. (See
+// accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt)
+
+// See http://www.boost.org/ for most recent version.
+
+// Boost config.hpp policy and rationale documentation has been moved to
+// http://www.boost.org/libs/config/
+//
+// This file is intended to be stable, and relatively unchanging.
+// It should contain boilerplate code only - no compiler specific
+// code unless it is unavoidable - no changes unless unavoidable.
+
+// from boost/config/suffix.hpp
+
+//
+// Helper macro BOOST_JOIN:
+// The following piece of macro magic joins the two
+// arguments together, even when one of the arguments is
+// itself a macro (see 16.3.1 in C++ standard). The key
+// is that macro expansion of macro arguments does not
+// occur in BOOST_DO_JOIN2 but does in BOOST_DO_JOIN.
+//
+// Example: with "#define SUFFIX _soa", BOOST_JOIN(Kernel, SUFFIX) yields
+// the single token Kernel_soa, whereas pasting directly with ## would
+// yield KernelSUFFIX.
+//
+#define BOOST_JOIN( X, Y ) BOOST_DO_JOIN( X, Y )
+#define BOOST_DO_JOIN( X, Y ) BOOST_DO_JOIN2(X,Y)
+#define BOOST_DO_JOIN2( X, Y ) X##Y
diff --git a/src/Geometry.c b/src/Geometry.c
new file mode 100644
index 0000000..31c985a
--- /dev/null
+++ b/src/Geometry.c
@@ -0,0 +1,267 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+// Markus Wittmann, 2016-2017
+// RRZE, University of Erlangen-Nuremberg, Germany
+// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+// Viktor Haag, 2016
+// LSS, University of Erlangen-Nuremberg, Germany
+//
+// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+// LbmBenchKernels is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// LbmBenchKernels is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#include "Geometry.h"
+#include "Memory.h"
+
+// The names of the system headers were lost from this copy of the patch; the
+// list below is what the calls in this file require (errno/strtol, INT_MAX,
+// printf, strlen, strncasecmp).
+#include <errno.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+
+// Creates the lattice described by a geometry string.
+//
+// geometryType: one of "channel", "box", "pipe" or "blocks", compared case
+//               insensitively and by prefix only. For "blocks" the block size
+//               may follow after one separator character, i.e. it is read
+//               starting at index 7 of the string (e.g. "blocks-16"). Without
+//               it a block size of 8 is used.
+// dims, periodic, ld: passed on unchanged to GeoCreateByType.
+//
+// Invalid input is reported on stdout followed by Verify(0).
+void GeoCreateByStr(const char * geometryType, int dims[3], int periodic[3], LatticeDesc * ld)
+{
+	int type = -1;
+	void * typeDetails = NULL;
+	int tmp;
+
+	if (strncasecmp("channel", geometryType, 7) == 0) {
+		type = GEO_TYPE_CHANNEL;
+	}
+	else if (strncasecmp("box", geometryType, 3) == 0) {
+		type = GEO_TYPE_BOX;
+	}
+	else if (strncasecmp("pipe", geometryType, 4) == 0) {
+		type = GEO_TYPE_PIPE;
+	}
+	else if (strncasecmp("blocks", geometryType, 6) == 0) {
+		type = GEO_TYPE_BLOCKS;
+
+		// Default block size
+		tmp = 8;
+
+		if (strlen(geometryType) > 7) {
+			// strtol instead of atoi: atoi has undefined behavior when the
+			// number does not fit into an int. Text without leading digits
+			// still yields 0, which GeoCreateByType maps to the default size.
+			errno = 0;
+			long parsed = strtol(&geometryType[7], NULL, 10);
+
+			if (errno == ERANGE || parsed > INT_MAX || parsed < INT_MIN) {
+				// Force the range check below to reject the value.
+				parsed = -1;
+			}
+
+			int blockSize = (int)parsed;
+
+			int dimMin = dims[0];
+
+			if (dims[1] < dimMin) dimMin = dims[1];
+			if (dims[2] < dimMin) dimMin = dims[2];
+
+			if (blockSize < 0 || blockSize > dimMin / 2) {
+				printf("ERROR: block size for geometry must be > 0 and smaller than half of the smalest dimension.\n");
+				// TODO: find a better solution for handling errors in here.
+				Verify(0);
+			}
+
+			tmp = blockSize;
+		}
+
+		// tmp lives until the end of this function, i.e. longer than the
+		// GeoCreateByType call below which reads it.
+		typeDetails = &tmp;
+	}
+	else {
+		printf("ERROR: unknown geometry specified.\n");
+		Verify(0);
+	}
+
+	GeoCreateByType(type, typeDetails, dims, periodic, ld);
+
+	return;
+}
+
+// Allocates ld->Lattice and fills it with the requested geometry.
+//
+// type:        geometry to create (GEO_TYPE_MIN .. GEO_TYPE_MAX).
+// typeDetails: only read for GEO_TYPE_BLOCKS, where it points to an int
+//              holding the block size; NULL or 0 selects the default of 8.
+// dims:        lattice size in x, y, z; all entries must be > 0.
+// periodic:    periodicity flags for x, y, z. periodic[0] is set to 1 for the
+//              channel, pipe and blocks geometries, i.e. the caller's array
+//              is modified.
+// ld:          receives Dims, nCells, Periodic{X,Y,Z}, Lattice and the cell
+//              counters nObst, nFluid, nInlet, nOutlet.
+void GeoCreateByType(GEO_TYPES type, void * typeDetails, int dims[3], int periodic[3], LatticeDesc * ld)
+{
+	Assert(dims != NULL);
+	Assert(dims[0] > 0);
+	Assert(dims[1] > 0);
+	Assert(dims[2] > 0);
+
+	Assert(periodic != NULL);
+	Assert(periodic[0] >= 0);
+	Assert(periodic[1] >= 0);
+	Assert(periodic[2] >= 0);
+
+	Assert(ld != NULL);
+
+	Assert(type >= GEO_TYPE_MIN);
+	Assert(type <= GEO_TYPE_MAX);
+
+	// Indexed by GEO_TYPES, hence the order must match the enum values.
+	const char * geoTypeStr[] = { "box", "channel", "pipe", "blocks" };
+
+	printf("# geometry: %d x %d x %d nodes, type %d %s\n", dims[0], dims[1], dims[2], type, geoTypeStr[type]);
+
+	ld->Dims[0] = dims[0];
+	ld->Dims[1] = dims[1];
+	ld->Dims[2] = dims[2];
+	ld->nCells = dims[0] * dims[1] * dims[2];
+	// NOTE(review): the periodicity is stored before periodic[0] is forced to
+	// 1 further down, so ld->PeriodicX keeps the value the caller passed in.
+	// Confirm that this is intended or that callers update ld afterwards.
+	ld->PeriodicX = periodic[0];
+	ld->PeriodicY = periodic[1];
+	ld->PeriodicZ = periodic[2];
+
+	LatticeT * lattice;
+	MemAlloc((void **)&lattice, sizeof(LatticeT) * dims[0] * dims[1] * dims[2]);
+
+	ld->Lattice = lattice;
+
+	// Start with fluid everywhere, walls and obstacles are added afterwards.
+	for (int z = 0; z < dims[2]; ++z) {
+		for (int y = 0; y < dims[1]; ++y) {
+			for (int x = 0; x < dims[0]; ++x) {
+				lattice[L_INDEX_4(dims, x, y, z)] = LAT_CELL_FLUID;
+			}
+		}
+	}
+
+	if (type == GEO_TYPE_CHANNEL || type == GEO_TYPE_BLOCKS || type == GEO_TYPE_PIPE) {
+		periodic[0] = 1;
+	}
+
+	// NOTE(review): the three plane loops run one after another and the
+	// periodic case writes LAT_CELL_FLUID, so a periodic direction turns wall
+	// cells set by an earlier loop back into fluid where the planes meet.
+	// Confirm whether this is wanted.
+
+	// Walls or periodic on first and last x plane.
+	for (int z = 0; z < dims[2]; ++z) {
+		for (int y = 0; y < dims[1]; ++y) {
+			if(periodic[0]){
+				lattice[L_INDEX_4(dims, 0, y, z)] = LAT_CELL_FLUID;
+				lattice[L_INDEX_4(dims, dims[0] - 1, y, z)] = LAT_CELL_FLUID;
+			} else {
+				lattice[L_INDEX_4(dims, 0, y, z)] = LAT_CELL_OBSTACLE;
+				lattice[L_INDEX_4(dims, dims[0] - 1, y, z)] = LAT_CELL_OBSTACLE;
+			}
+		}
+	}
+
+	// Walls or periodic on first and last y plane.
+	for (int z = 0; z < dims[2]; ++z) {
+		for (int x = 0; x < dims[0]; ++x) {
+			if(periodic[1]){
+				lattice[L_INDEX_4(dims, x, 0, z)] = LAT_CELL_FLUID;
+				lattice[L_INDEX_4(dims, x, dims[1] - 1, z)] = LAT_CELL_FLUID;
+			} else {
+				lattice[L_INDEX_4(dims, x, 0, z)] = LAT_CELL_OBSTACLE;
+				lattice[L_INDEX_4(dims, x, dims[1] - 1, z)] = LAT_CELL_OBSTACLE;
+			}
+		}
+	}
+
+	// Walls or periodic on first and last z plane.
+	for (int y = 0; y < dims[1]; ++y) {
+		for (int x = 0; x < dims[0]; ++x) {
+			if(periodic[2]){
+				lattice[L_INDEX_4(dims, x, y, 0)] = LAT_CELL_FLUID;
+				lattice[L_INDEX_4(dims, x, y, dims[2] - 1)] = LAT_CELL_FLUID;
+			} else {
+				lattice[L_INDEX_4(dims, x, y, 0)] = LAT_CELL_OBSTACLE;
+				lattice[L_INDEX_4(dims, x, y, dims[2] - 1)] = LAT_CELL_OBSTACLE;
+			}
+		}
+	}
+
+	// The channel needs nothing beyond the walls above; periodic[0] was
+	// already forced to 1 for it before the plane loops.
+	if (type == GEO_TYPE_PIPE) {
+		#define SQR(a) ((a)*(a))
+		// Pipe axis runs along x through the center of the y/z cross section.
+		double centerZ = dims[2] / 2.0 - 0.5;
+		double centerY = dims[1] / 2.0 - 0.5;
+		double minDiameter = MIN(dims[1], dims[2]);
+		double minRadiusSquared = SQR(minDiameter / 2 - 1);
+
+		// Everything at or beyond the radius becomes an obstacle column.
+		for (int z = 0; z < dims[2]; ++z) {
+			for (int y = 0; y < dims[1]; ++y) {
+				if((SQR(z - centerZ) + SQR(y - centerY)) >= minRadiusSquared) {
+					for (int x = 0; x < dims[0]; ++x) {
+						lattice[L_INDEX_4(dims, x, y, z)] = LAT_CELL_OBSTACLE;
+					}
+				}
+			}
+		}
+		#undef SQR
+	}
+	else if (type == GEO_TYPE_BLOCKS) {
+
+		// A missing detail pointer is treated like a block size of 0, i.e.
+		// the default is used instead of dereferencing NULL.
+		int blockSize = (typeDetails != NULL) ? *((int *)typeDetails) : 0;
+
+		if (blockSize == 0) {
+			blockSize = 8;
+		}
+
+		int dimMin = dims[0];
+
+		if (dims[1] < dimMin) dimMin = dims[1];
+		if (dims[2] < dimMin) dimMin = dims[2];
+
+		if (blockSize < 0 || blockSize > dimMin / 2) {
+			printf("ERROR: block size for geometry must be > 0 and smaller than half of the smalest dimension.\n");
+			// TODO: find a better solution for handling errors in here.
+			Verify(0);
+		}
+
+		// Edge length of a block in x, y, and z direction.
+		int nbx = blockSize, nby = blockSize, nbz = blockSize;
+
+		// A cell becomes an obstacle if it lies in the upper half of a
+		// 2 * blockSize period in all three directions.
+		for (int z = 0; z < dims[2]; ++z) {
+			if ((z % (2 * nbz)) < nbz) continue;
+
+			for (int y = 0; y < dims[1]; ++y) {
+				if ((y % (2 * nby)) < nby) continue;
+
+				for (int x = 0; x < dims[0]; ++x) {
+
+					if ((x % (2 * nbx)) >= nbx) {
+						lattice[L_INDEX_4(dims, x, y, z)] = LAT_CELL_OBSTACLE;
+					}
+				}
+			}
+		}
+	}
+
+//	if (latticeDumpAscii) {
+//		const char strLatCellType[] = "X.IxO"; // X = Obstacle, . = Fluid, I = inlet, O = outlet
+//		for (int z = dims[2] - 1; z >= 0; --z) {
+//			printf("plane % 2d\n", z);
+//
+//			for (int y = dims[1] - 1; y >= 0; --y) {
+//				printf(" %2d ", y);
+//				for (int x = 0; x < dims[0]; ++x) {
+//					printf("%c", strLatCellType[lattice[L_INDEX_4(dims, x, y, z)]]);
+//				}
+//				printf("\n");
+//			}
+//		}
+//	}
+
+// Lattice Helper Function
+
+	// Count the cells of each type; inlet and outlet cells also count as fluid.
+	ld->nObst = 0;
+	ld->nFluid = 0;
+	ld->nInlet = 0;
+	ld->nOutlet = 0;
+
+	for (int z = 0; z < dims[2]; ++z) {
+		for (int y = 0; y < dims[1]; ++y) {
+			for (int x = 0; x < dims[0]; ++x) {
+				switch (lattice[L_INDEX_4(dims, x, y, z)]) {
+					case LAT_CELL_OBSTACLE: ld->nObst++; break;
+					case LAT_CELL_FLUID: ld->nFluid++; break;
+					case LAT_CELL_INLET: ld->nInlet++; ld->nFluid++; break;
+					case LAT_CELL_OUTLET: ld->nOutlet++; ld->nFluid++; break;
+					default:
+						Verify(0);
+				}
+			}
+		}
+	}
+
+	return;
+}
diff --git a/src/Geometry.h b/src/Geometry.h
new file mode 100644
index 0000000..11d72ff
--- /dev/null
+++ b/src/Geometry.h
@@ -0,0 +1,48 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+// Markus Wittmann, 2016-2017
+// RRZE, University of Erlangen-Nuremberg, Germany
+// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+// Viktor Haag, 2016
+// LSS, University of Erlangen-Nuremberg, Germany
+//
+// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+// LbmBenchKernels is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// LbmBenchKernels is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#ifndef __GEOMETRY_H__
+#define __GEOMETRY_H__
+
+#include "Lattice.h"
+
+
+// Geometries which can be generated. GEO_TYPE_MIN and GEO_TYPE_MAX alias the
+// first and last valid value and are used for range checks.
+typedef enum GEO_TYPES_ {
+	GEO_TYPE_MIN = 0,
+	GEO_TYPE_BOX = 0,
+	GEO_TYPE_CHANNEL = 1,
+	GEO_TYPE_PIPE = 2,
+	GEO_TYPE_BLOCKS = 3, // Expects a pointer to an integer, holding the
+	                     // value of the block size as type detail.
+	GEO_TYPE_MAX = 3
+} GEO_TYPES;
+
+
+// Creates the lattice for the geometry "type" with the sizes given in dims
+// and stores it in ld. typeDetails carries geometry specific settings (see
+// GEO_TYPE_BLOCKS). Note: periodic[0] is set to 1 by the implementation for
+// the channel, pipe and blocks geometries.
+void GeoCreateByType(GEO_TYPES type, void * typeDetails, int dims[3], int periodic[3], LatticeDesc * ld);
+
+// Same as GeoCreateByType, but the geometry is selected by a string beginning
+// with "box", "channel", "pipe" or "blocks" (case insensitive).
+void GeoCreateByStr(const char * geometryType, int dims[3], int periodic[3], LatticeDesc * ld);
+
+#endif // __GEOMETRY_H__
diff --git a/src/Kernel.c b/src/Kernel.c
new file mode 100644
index 0000000..88018b4
--- /dev/null
+++ b/src/Kernel.c
@@ -0,0 +1,801 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+// Markus Wittmann, 2016-2017
+// RRZE, University of Erlangen-Nuremberg, Germany
+// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+// Viktor Haag, 2016
+// LSS, University of Erlangen-Nuremberg, Germany
+//
+// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+// LbmBenchKernels is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// LbmBenchKernels is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>.
+//
+// --------------------------------------------------------------------------
+#include "Kernel.h"
+#include "Lattice.h"
+
+// The names of the system headers were lost from this copy of the patch; the
+// list below is what the calls in this file require (fabs/sqrt, printf/fopen,
+// malloc/exit, memset).
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+// The tables below are generated from D3Q19_LIST: every list entry is turned
+// into ", <value>" by the local X macro.
+// NOTE(review): EXPAND is presumably what removes the leading comma of the
+// first entry - confirm in the header defining it.
+
+// x component of each direction.
+#define X(name, idx, idx_inv, x, y, z) , x
+int D3Q19_X[] = {
+	EXPAND(D3Q19_LIST)
+};
+#undef X
+
+// y component of each direction.
+#define X(name, idx, idx_inv, x, y, z) , y
+int D3Q19_Y[] = {
+	EXPAND(D3Q19_LIST)
+};
+#undef X
+
+// z component of each direction.
+#define X(name, idx, idx_inv, x, y, z) , z
+int D3Q19_Z[] = {
+	EXPAND(D3Q19_LIST)
+};
+#undef X
+
+// Index of the inverse direction for each direction.
+#define X(name, idx, idxinv, x, y, z) , idxinv
+int D3Q19_INV[] = {
+	EXPAND(D3Q19_LIST)
+};
+#undef X
+
+
+// Name of each direction as string.
+#define X(name, idx, idxinv, x, y, z) , STRINGIFY(name)
+const char * D3Q19_NAMES[N_D3Q19] = {
+	EXPAND(D3Q19_LIST)
+};
+#undef X
+
+// Updates the PDFs of inlet and outlet cells.
+//
+// Looks at the planes x = 0 and x = nX - 1 for all y in [1, nY - 2] and z in
+// [1, nZ - 2]. For every cell flagged LAT_CELL_INLET (first plane) or
+// LAT_CELL_OUTLET (last plane) 14 PDFs are read through
+// kd->BoundaryConditionsGetPdf, the x velocity is computed from them with the
+// density fixed to cd->RhoIn or cd->RhoOut, and the remaining five PDFs
+// (E, NE, SE, TE, BE at the inlet; W, NW, SW, TW, BW at the outlet) are
+// written back through kd->BoundaryConditionsSetPdf.
+//
+// kd: kernel data, provides the dimensions and the PDF accessors.
+// ld: lattice, provides the cell types.
+// cd: case data, RhoIn and RhoOut must be > 0.
+void KernelComputeBoundaryConditions(KernelData * kd, LatticeDesc * ld, CaseData * cd)
+{
+	Assert(kd != NULL);
+	Assert(ld != NULL);
+	Assert(cd != NULL);
+
+	Assert(cd->RhoIn > 0.0);
+	Assert(cd->RhoOut > 0.0);
+
+	PdfT rho_in = cd->RhoIn;
+	PdfT rho_out = cd->RhoOut;
+	PdfT rho_in_inv = 1.0 / rho_in;
+	PdfT rho_out_inv = 1.0 / rho_out;
+	PdfT indep_ux = 0.0;
+
+	PdfT dens;
+	PdfT ux;
+
+	const PdfT one_third = 1.0 / 3.0;
+	const PdfT one_fourth = 1.0 / 4.0;
+	const PdfT one_sixth = 1.0 / 6.0;
+
+	PdfT pdfs[N_D3Q19];
+
+	int nX = kd->Dims[0];
+	int nY = kd->Dims[1];
+	int nZ = kd->Dims[2];
+
+	int x;
+	int x_in = 0;
+	int x_out = nX - 1;
+
+	// Only accumulated for the (commented out) debug output at the end.
+	double density_in = 0.0;
+	double density_out = 0.0;
+
+	// update inlet / outlet boundary conditions
+	for (int z = 1; z < nZ - 1; ++z) {
+		for (int y = 1; y < nY - 1; ++y) {
+
+
+			// -----------------------------------------------------------------------------
+			// update inlet conditions
+
+			if (ld->Lattice[L_INDEX_4(ld->Dims, x_in, y, z)] == LAT_CELL_INLET) {
+
+				x = x_in;
+
+				kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_C , pdfs + D3Q19_C);
+				kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_T , pdfs + D3Q19_T);
+				kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_B , pdfs + D3Q19_B);
+				kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_S , pdfs + D3Q19_S);
+				kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_N , pdfs + D3Q19_N);
+				kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_TS, pdfs + D3Q19_TS);
+				kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_BS, pdfs + D3Q19_BS);
+				kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_TN, pdfs + D3Q19_TN);
+				kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_BN, pdfs + D3Q19_BN);
+				kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_SW, pdfs + D3Q19_SW);
+				kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_TW, pdfs + D3Q19_TW);
+				kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_W , pdfs + D3Q19_W);
+				kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_BW, pdfs + D3Q19_BW);
+				kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_NW, pdfs + D3Q19_NW);
+
+				dens = rho_in;
+
+				ux = 1 - (pdfs[D3Q19_C] +
+						(pdfs[D3Q19_T] + pdfs[D3Q19_B] + pdfs[D3Q19_S] + pdfs[D3Q19_N]) +
+						(pdfs[D3Q19_TS] + pdfs[D3Q19_BS] + pdfs[D3Q19_TN] + pdfs[D3Q19_BN]) +
+						2 * (pdfs[D3Q19_SW] + pdfs[D3Q19_TW] + pdfs[D3Q19_W] + pdfs[D3Q19_BW] + pdfs[D3Q19_NW])) * rho_in_inv;
+
+				indep_ux = one_sixth * dens * ux;
+
+				pdfs[D3Q19_E ] = pdfs[D3Q19_W] + one_third * dens * ux;
+				pdfs[D3Q19_NE] = pdfs[D3Q19_SW] - one_fourth * (pdfs[D3Q19_N] - pdfs[D3Q19_S]) + indep_ux;
+				pdfs[D3Q19_SE] = pdfs[D3Q19_NW] + one_fourth * (pdfs[D3Q19_N] - pdfs[D3Q19_S]) + indep_ux;
+				pdfs[D3Q19_TE] = pdfs[D3Q19_BW] - one_fourth * (pdfs[D3Q19_T] - pdfs[D3Q19_B]) + indep_ux;
+				pdfs[D3Q19_BE] = pdfs[D3Q19_TW] + one_fourth * (pdfs[D3Q19_T] - pdfs[D3Q19_B]) + indep_ux;
+
+
+				kd->BoundaryConditionsSetPdf(kd, x, y, z, D3Q19_E , pdfs[D3Q19_E ]);
+				kd->BoundaryConditionsSetPdf(kd, x, y, z, D3Q19_NE, pdfs[D3Q19_NE]);
+				kd->BoundaryConditionsSetPdf(kd, x, y, z, D3Q19_SE, pdfs[D3Q19_SE]);
+				kd->BoundaryConditionsSetPdf(kd, x, y, z, D3Q19_TE, pdfs[D3Q19_TE]);
+				kd->BoundaryConditionsSetPdf(kd, x, y, z, D3Q19_BE, pdfs[D3Q19_BE]);
+
+				for(int d = 0; d < N_D3Q19; ++d) {
+					density_in += pdfs[d];
+				}
+			}
+
+			// -----------------------------------------------------------------------------
+			// update outlet conditions
+
+			if (ld->Lattice[L_INDEX_4(ld->Dims, x_out, y, z)] == LAT_CELL_OUTLET) {
+				// update outlet conditions
+
+				x = x_out;
+
+				kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_C , pdfs + D3Q19_C );
+				kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_T , pdfs + D3Q19_T );
+				kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_B , pdfs + D3Q19_B );
+				kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_S , pdfs + D3Q19_S );
+				kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_N , pdfs + D3Q19_N );
+				kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_TS, pdfs + D3Q19_TS);
+				kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_BS, pdfs + D3Q19_BS);
+				kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_TN, pdfs + D3Q19_TN);
+				kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_BN, pdfs + D3Q19_BN);
+				kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_NE, pdfs + D3Q19_NE);
+				kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_BE, pdfs + D3Q19_BE);
+				kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_E , pdfs + D3Q19_E );
+				kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_TE, pdfs + D3Q19_TE);
+				kd->BoundaryConditionsGetPdf(kd, x, y, z, D3Q19_SE, pdfs + D3Q19_SE);
+
+				dens = rho_out;
+
+				ux = -1 + (pdfs[D3Q19_C] +
+						(pdfs[D3Q19_T] + pdfs[D3Q19_B] + pdfs[D3Q19_S] + pdfs[D3Q19_N]) +
+						(pdfs[D3Q19_TS] + pdfs[D3Q19_BS] + pdfs[D3Q19_TN] + pdfs[D3Q19_BN]) +
+						2 * (pdfs[D3Q19_NE] + pdfs[D3Q19_BE] + pdfs[D3Q19_E] + pdfs[D3Q19_TE] + pdfs[D3Q19_SE])) * rho_out_inv;
+				indep_ux = one_sixth * dens * ux;
+
+				pdfs[D3Q19_W ] = pdfs[D3Q19_E] - one_third * dens * ux;
+				pdfs[D3Q19_SW] = pdfs[D3Q19_NE] + one_fourth * (pdfs[D3Q19_N] - pdfs[D3Q19_S]) - indep_ux;
+				pdfs[D3Q19_NW] = pdfs[D3Q19_SE] - one_fourth * (pdfs[D3Q19_N] - pdfs[D3Q19_S]) - indep_ux;
+				pdfs[D3Q19_BW] = pdfs[D3Q19_TE] + one_fourth * (pdfs[D3Q19_T] - pdfs[D3Q19_B]) - indep_ux;
+				pdfs[D3Q19_TW] = pdfs[D3Q19_BE] - one_fourth * (pdfs[D3Q19_T] - pdfs[D3Q19_B]) - indep_ux;
+
+				kd->BoundaryConditionsSetPdf(kd, x, y, z, D3Q19_W , pdfs[D3Q19_W ]);
+				kd->BoundaryConditionsSetPdf(kd, x, y, z, D3Q19_NW, pdfs[D3Q19_NW]);
+				kd->BoundaryConditionsSetPdf(kd, x, y, z, D3Q19_SW, pdfs[D3Q19_SW]);
+				kd->BoundaryConditionsSetPdf(kd, x, y, z, D3Q19_TW, pdfs[D3Q19_TW]);
+				kd->BoundaryConditionsSetPdf(kd, x, y, z, D3Q19_BW, pdfs[D3Q19_BW]);
+
+				for(int d = 0; d < N_D3Q19; ++d) {
+					density_out += pdfs[d];
+				}
+			}
+		}
+	}
+
+	// DEBUG: printf("# density inlet: %e density outlet: %e\n", density_in, density_out);
+
+}
+
+
+// Returns the average density per fluid cell: the sum of all PDFs of all
+// cells which are not obstacles divided by ld->nFluid. The PDFs are read via
+// kd->GetNode.
+PdfT KernelDensity(KernelData * kd, LatticeDesc * ld)
+{
+	Assert(kd != NULL);
+	Assert(ld != NULL);
+
+	Assert(ld->Lattice != NULL);
+	Assert(ld->Dims != NULL);
+
+	Assert(ld->Dims[0] > 0);
+	Assert(ld->Dims[1] > 0);
+	Assert(ld->Dims[2] > 0);
+
+	int * lDims = ld->Dims;
+	int nX = lDims[0];
+	int nY = lDims[1];
+	int nZ = lDims[2];
+
+	PdfT pdfs[N_D3Q19] = { -1.0 };
+	PdfT density = 0.0;
+
+	for(int z = 0; z < nZ; ++z) {
+		for(int y = 0; y < nY; ++y) {
+			for(int x = 0; x < nX; ++x) {
+
+				if(ld->Lattice[L_INDEX_4(lDims, x, y, z)] != LAT_CELL_OBSTACLE) {
+
+					kd->GetNode(kd, x, y, z, pdfs);
+
+					for(int d = 0; d < N_D3Q19; ++d) {
+//						if (pdfs[d] < 0.0) {
+//							printf("# %d %d %d %d < 0 %e %s\n", x, y, z, d, pdfs[d], D3Q19_NAMES[d]);
+//							exit(1);
+//						}
+						density += pdfs[d];
+					}
+				}
+
+			}
+		}
+	}
+
+	return density / ld->nFluid;
+}
+
+
+// prescribes a given
+// density
+
+// Fills pdfs[0 .. N_D3Q19 - 1] for one node from the given density and
+// velocity. Shared by KernelSetInitialDensity and KernelSetInitialVelocity,
+// which used to carry two identical copies of these formulas.
+static void FillInitialPdfs(PdfT * pdfs, PdfT omega, PdfT dens, PdfT ux, PdfT uy, PdfT uz)
+{
+	#define SQR(a) ((a)*(a))
+
+	const PdfT w_0 = 1.0 / 3.0;
+	const PdfT w_1 = 1.0 / 18.0;
+	const PdfT w_2 = 1.0 / 36.0;
+
+	const PdfT omega_w0 = 3.0 * w_0 * omega;
+	const PdfT omega_w1 = 3.0 * w_1 * omega;
+	const PdfT omega_w2 = 3.0 * w_2 * omega;
+	const PdfT one_third = 1.0 / 3.0;
+
+	// Part which is the same for all directions.
+	const PdfT dir_indep_trm = one_third * dens - 0.5 * (ux * ux + uy * uy + uz * uz);
+
+	pdfs[D3Q19_C] = omega_w0 * (dir_indep_trm);
+
+	pdfs[D3Q19_NW] = omega_w2 * (dir_indep_trm - (ux - uy) + 1.5 * SQR(ux - uy));
+	pdfs[D3Q19_SE] = omega_w2 * (dir_indep_trm + (ux - uy) + 1.5 * SQR(ux - uy));
+
+	pdfs[D3Q19_NE] = omega_w2 * (dir_indep_trm + (ux + uy) + 1.5 * SQR(ux + uy));
+	pdfs[D3Q19_SW] = omega_w2 * (dir_indep_trm - (ux + uy) + 1.5 * SQR(ux + uy));
+
+
+	pdfs[D3Q19_TW] = omega_w2 * (dir_indep_trm - (ux - uz) + 1.5 * SQR(ux - uz));
+	pdfs[D3Q19_BE] = omega_w2 * (dir_indep_trm + (ux - uz) + 1.5 * SQR(ux - uz));
+
+	pdfs[D3Q19_TE] = omega_w2 * (dir_indep_trm + (ux + uz) + 1.5 * SQR(ux + uz));
+	pdfs[D3Q19_BW] = omega_w2 * (dir_indep_trm - (ux + uz) + 1.5 * SQR(ux + uz));
+
+
+	pdfs[D3Q19_TS] = omega_w2 * (dir_indep_trm - (uy - uz) + 1.5 * SQR(uy - uz));
+	pdfs[D3Q19_BN] = omega_w2 * (dir_indep_trm + (uy - uz) + 1.5 * SQR(uy - uz));
+
+	pdfs[D3Q19_TN] = omega_w2 * (dir_indep_trm + (uy + uz) + 1.5 * SQR(uy + uz));
+	pdfs[D3Q19_BS] = omega_w2 * (dir_indep_trm - (uy + uz) + 1.5 * SQR(uy + uz));
+
+
+	pdfs[D3Q19_N] = omega_w1 * (dir_indep_trm + uy + 1.5 * SQR(uy));
+	pdfs[D3Q19_S] = omega_w1 * (dir_indep_trm - uy + 1.5 * SQR(uy));
+
+	pdfs[D3Q19_E] = omega_w1 * (dir_indep_trm + ux + 1.5 * SQR(ux));
+	pdfs[D3Q19_W] = omega_w1 * (dir_indep_trm - ux + 1.5 * SQR(ux));
+
+	pdfs[D3Q19_T] = omega_w1 * (dir_indep_trm + uz + 1.5 * SQR(uz));
+	pdfs[D3Q19_B] = omega_w1 * (dir_indep_trm - uz + 1.5 * SQR(uz));
+
+	#undef SQR
+}
+
+// Prescribes a given density.
+//
+// Every cell which is not an obstacle is initialized with zero velocity and a
+// density which changes linearly in x from cd->RhoIn (x = 0) to cd->RhoOut
+// (x = nX - 1). The PDFs are written via kd->SetNode.
+void KernelSetInitialDensity(LatticeDesc * ld, KernelData * kd, CaseData * cd)
+{
+	int * lDims = ld->Dims;
+
+	PdfT rho_in = cd->RhoIn;
+	PdfT rho_out = cd->RhoOut;
+
+	PdfT omega = cd->Omega;
+
+	int nX = lDims[0];
+	int nY = lDims[1];
+	int nZ = lDims[2];
+
+	#ifdef _OPENMP
+		#pragma omp parallel for collapse(3)
+	#endif
+	for(int z = 0; z < nZ; ++z) { for(int y = 0; y < nY; ++y) { for(int x = 0; x < nX; ++x) {
+
+		if (ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) {
+			// Everything written per node is declared here, inside the
+			// parallel loop, so that each thread works on its own copy.
+			// Declared in front of the loop these variables were shared
+			// between the threads, which is a data race.
+			PdfT pdfs[N_D3Q19];
+
+			// TODO: fix later.
+			// if((caseData->geoType == GEO_TYPE_CHANNEL) || (caseData->geoType == GEO_TYPE_RCHANNEL))
+			PdfT dens = rho_in + (rho_out - rho_in)*(x)/(nX-1.0);
+
+			FillInitialPdfs(pdfs, omega, dens, 0.0, 0.0, 0.0);
+
+			kd->SetNode(kd, x, y, z, pdfs);
+		}
+	} } }
+}
+
+
+// Prescribes a given velocity.
+//
+// Every cell of type LAT_CELL_FLUID is set to density 1.0 and the velocity
+// (0, 0, 0). The PDFs are written via kd->SetNode.
+//
+// TODO: the velocity was declared as ux = 0.09 (caseData->initUx) in front of
+// the loop, but always overwritten with 0.0 inside it. The effective value
+// 0.0 is kept here.
+void KernelSetInitialVelocity(LatticeDesc * ld, KernelData * kd, CaseData * cd)
+{
+
+	int * lDims = ld->Dims;
+
+	PdfT omega = cd->Omega;
+
+	int nX = lDims[0];
+	int nY = lDims[1];
+	int nZ = lDims[2];
+
+	#ifdef _OPENMP
+		#pragma omp parallel for collapse(3)
+	#endif
+	for(int z = 0; z < nZ; ++z) { for(int y = 0; y < nY; ++y) { for(int x = 0; x < nX; ++x) {
+
+		if (ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] == LAT_CELL_FLUID) {
+			// Per node data is local to the loop body and hence private to
+			// each thread (it was shared between the threads before).
+			PdfT pdfs[N_D3Q19];
+
+			const PdfT ux = 0.0;
+			const PdfT uy = 0.0;
+			const PdfT uz = 0.0;
+			const PdfT dens = 1.0;
+
+			// The node is still read although all 19 PDFs are overwritten
+			// below. The sum over the read PDFs which followed here was never
+			// used and has been removed.
+			// NOTE(review): the call is kept as its side effects cannot be
+			// seen from here - drop it if GetNode is a pure read.
+			kd->GetNode(kd, x, y, z, pdfs);
+
+			FillInitialPdfs(pdfs, omega, dens, ux, uy, uz);
+
+			kd->SetNode(kd, x, y, z, pdfs);
+		}
+	} } }
+
+}
+
+// Compute analytical x velocity for channel flow.
+//
+// Formula 7 from Kutay et al. "Laboratory validation of lattice Boltzmann method for modeling
+// pore-scale flow in granular materials", doi:10.1016/j.compgeo.2006.08.002.
+//
+// also formula 10 from
+// Pan et al. "An evaluation of lattice Boltzmann equation methods for simulating flow
+// through porous media", doi:10.1016/S0167-5648(04)80040-6.
+//
+// calculate velocity in a pipe for a given radius
+//
+// Returns xForce * (maxRadiusSquared - curRadiusSquared) / (2 * viscosity).
+static PdfT CalcXVelForPipeProfile(PdfT maxRadiusSquared, PdfT curRadiusSquared, PdfT xForce, PdfT viscosity)
+{
+	const PdfT radiusTerm = maxRadiusSquared - curRadiusSquared;
+
+	return xForce * radiusTerm / (2.0*viscosity);
+}
+
+// Extracts the x velocity of the y/z plane at x = xPos.
+//
+// outputArray must provide nY * nZ entries; the value of node (xPos, y, z) is
+// stored at index y * nZ + z. Obstacle nodes get the value 0.0. If
+// VERIFICATION is defined, half of cd->XForce is added to each velocity.
+static void KernelGetXSlice(LatticeDesc * ld, KernelData * kd, CaseData * cd, PdfT * outputArray, int xPos)
+{
+	Assert(ld != NULL);
+	Assert(kd != NULL);
+
+	int nY = ld->Dims[1];
+	int nZ = ld->Dims[2];
+
+	Assert(xPos >= 0);
+	Assert(xPos < ld->Dims[0]);
+
+	// Declare pdf_N, pdf_E, pdf_S, pdf_W, ...
+	#define X(name, idx, idxinv, x, y, z) PdfT JOIN(pdf_,name);
+	D3Q19_LIST
+	#undef X
+	PdfT nodePdfs[N_D3Q19];
+
+	for(int z = 0; z < nZ; ++z) {
+		for(int y = 0; y < nY; ++y) {
+
+			const int outIndex = y * nZ + z;
+
+			// Obstacles carry no velocity.
+			if (ld->Lattice[L_INDEX_4(ld->Dims, xPos, y, z)] == LAT_CELL_OBSTACLE) {
+				outputArray[outIndex] = 0.0;
+				continue;
+			}
+
+			kd->GetNode(kd, xPos, y, z, nodePdfs);
+
+			#define X(name, idx, idxinv, _x, _y, _z) JOIN(pdf_,name) = nodePdfs[idx];
+			D3Q19_LIST
+			#undef X
+			// Only the PDFs with an x component are needed below.
+			UNUSED(pdf_C); UNUSED(pdf_S); UNUSED(pdf_N); UNUSED(pdf_T); UNUSED(pdf_B);
+			UNUSED(pdf_TN); UNUSED(pdf_BN); UNUSED(pdf_TS); UNUSED(pdf_BS);
+
+			PdfT velX = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE -
+						pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW;
+
+			#ifdef VERIFICATION
+			velX += 0.5 * cd->XForce;
+			#endif
+
+			outputArray[outIndex] = velX;
+		}
+	}
+
+}
+
+// Verification of channel profile with analytical solution.
+// Taken from Kutay et al. "Laboratory validation of lattice Boltzmann method for modeling
+// pore-scale flow in granular materials", doi:10.1016/j.compgeo.2006.08.002. and
+// Pan et al.
"An evaluation of lattice Boltzmann equation methods for simulating flow +// through porous media", doi:10.1016/S0167-5648(04)80040-6 +// +void KernelVerifiy(LatticeDesc * ld, KernelData * kd, CaseData * cd, PdfT * errorNorm) +{ + Assert(ld != NULL); + Assert(kd != NULL); + Assert(cd != NULL); + Assert(errorNorm != NULL); + + int nX = ld->Dims[0]; + int nY = ld->Dims[1]; + int nZ = ld->Dims[2]; + + PdfT omega = cd->Omega; + PdfT viscosity = (1.0 / omega - 0.5) / 3.0; + + // ux averaged across cross sections in x direction + PdfT * outputArray = (PdfT *)malloc(nZ * nY * sizeof(PdfT)); + Verify(outputArray != NULL); + + memset(outputArray, -10, nZ*nY*sizeof(PdfT)); + + // uncomment this to get values averaged along the x-axis + //AveragePipeCrossSections(ld, kd, outputArray); + KernelGetXSlice(ld, kd, cd, outputArray, (int)(nX/2)); + + + FILE * fh; + char fileName[1024]; + PdfT tmpAvgUx = 0.0; + PdfT tmpAnalyUx = 0.0; + int flagEvenNy = 0; + int y = 0; + + if (nY % 2 == 0) + flagEvenNy = 1; + + y = (nY-flagEvenNy-1)/2; + + snprintf(fileName, sizeof(fileName), "flow-profile.dat"); + + printf("# Kernel validation: writing profile to %s\n", fileName); + + fh = fopen(fileName, "w"); + + if(fh == NULL) { + printf("ERROR: opening file %s failed.\n", fileName); + exit(1); + } + + fprintf(fh, "# Flow profile in Z direction. 
Taken at the middle of the X length (= %d) of total length %d.\n", nZ / 2, nZ); + // fprintf(fh, "# Snapshot taken at iteration %d.\n", iteration); + fprintf(fh, "# Plot on terminal: gnuplot -e \"set terminal dumb; plot \\\"%s\\\" u 1:3 t \\\"analytical\\\", \\\"\\\" u 1:4 t \\\"simulation\\\";\"\n", fileName); + fprintf(fh, "# Plot graphically: gnuplot -e \"plot \\\"%s\\\" u 1:3 w linesp t \\\"analytical\\\", \\\"\\\" u 1:4 w linesp t \\\"simulation\\\"; pause -1;\"\n", fileName); + fprintf(fh, "# z coord., radius, analytic, simulation, diff abs, diff rel, undim_analytic, undim_sim\n"); + + double deviation = 0.0; + double curRadiusSquared; + double center = nY / 2.0; + double minDiameter = nY; + #define SQR(a) ((a)*(a)) + double minRadiusSquared = SQR(minDiameter / 2.0 - 1.0); + #undef SQR + double u_max = cd->XForce*minRadiusSquared/(2.0*viscosity); + + for(int z = 0; z < nZ; ++z) { + + fprintf(fh, "%d\t", z); + + #define SQR(a) ((a)*(a)) + curRadiusSquared = SQR(z-center+0.5); + + + // dimensionless radius + fprintf(fh, "%e\t", (z-center+0.5)/center); + + // analytic profile + if(curRadiusSquared >= minRadiusSquared) + tmpAnalyUx = 0.0; + else + tmpAnalyUx = CalcXVelForPipeProfile(minRadiusSquared, curRadiusSquared, cd->XForce, viscosity); + + //averaged profile + if(flagEvenNy == 1) + tmpAvgUx = (outputArray[y*nZ + z] + outputArray[(y+1)*nZ + z])/2.0; + else + tmpAvgUx = outputArray[y*nZ + z]; + + fprintf(fh, "%e\t", tmpAnalyUx); + fprintf(fh, "%e\t", tmpAvgUx); + + fprintf(fh, "%e\t", fabs(tmpAnalyUx-tmpAvgUx)); + if (tmpAnalyUx != 0.0) { + fprintf(fh, "%e\t", fabs(tmpAnalyUx - tmpAvgUx) / tmpAnalyUx); + deviation += SQR(fabs(tmpAnalyUx - tmpAvgUx) / tmpAnalyUx); + } + else { + fprintf(fh, "0.0\t"); + } + + fprintf(fh, "%e\t", tmpAnalyUx / u_max); + fprintf(fh, "%e\t", tmpAvgUx / u_max); + fprintf(fh, "\n"); + + #undef SQR + } + + *errorNorm = sqrt(deviation); + + printf("# Kernel validation: L2 error norm of relative error: %e\n", *errorNorm); + + + 
fclose(fh); + free(outputArray); + + +} + + +void KernelStatistics(KernelData * kd, LatticeDesc * ld, CaseData * cd, int iteration) +{ + KernelStatisticsAdv(kd, ld, cd, iteration, 0); +} + +void KernelStatisticsAdv(KernelData * kd, LatticeDesc * ld, CaseData * cd, int iteration, int forceOutput) +{ + if (iteration % cd->StatisticsModulus == 0 || forceOutput) { + printf("# iter: %4d avg density: %e\n", iteration, KernelDensity(kd, ld)); + } + + if (iteration % 10 != 0 && !forceOutput) { + return; + } + + int nX = ld->Dims[0]; + int nY = ld->Dims[1]; + int nZ = ld->Dims[2]; + + int x = nX / 2; + + PdfT pdfs[N_D3Q19]; + + // ---------------------------------------------------------------------- + // velocity in x-direction in cross section appended for each iteration + + double density; + double densitySum; + double ux; + double uxSum = 0.0; + int nFluidNodes = 0; + + for (int y = 0; y < nY; ++y) { + for (int z = 0; z < nZ; ++z) { + + if (ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] != LAT_CELL_OBSTACLE) { + kd->GetNode(kd, x, y, z, pdfs); + + ux = pdfs[D3Q19_E] + pdfs[D3Q19_NE] + pdfs[D3Q19_SE] + pdfs[D3Q19_TE] + pdfs[D3Q19_BE] - + pdfs[D3Q19_W] - pdfs[D3Q19_NW] - pdfs[D3Q19_SW] - pdfs[D3Q19_TW] - pdfs[D3Q19_BW]; + + uxSum += ux; + ++nFluidNodes; + } + } + } + + const char * mode = "w"; + + if (iteration > 0) { + mode = "a"; + } + + const char * fileName = "ux-progress.dat"; + FILE * fh; + + fh = fopen(fileName, mode); + + if(fh == NULL) { + printf("ERROR: opening file %s failed.\n", fileName); + exit(1); + } + + if (iteration == 0) { + fprintf(fh, "# Average velocity in x direction of cross section in the middle (x = %d) of the geometry (NX = %d).\n", x, nX); + fprintf(fh, "# Plot on terminal: gnuplot -e \"set terminal dumb; plot \\\"%s\\\";\"\n", fileName); + fprintf(fh, "# iteration, avg ux\n"); + } + + fprintf(fh, "%d %e\n", iteration, uxSum / nFluidNodes); + + fclose(fh); + + // ---------------------------------------------------------------------- + // average 
velocity/density for each in cross section in x direction + + fileName = "density-ux.dat"; + + fh = fopen(fileName, "w"); + + if(fh == NULL) { + printf("ERROR: opening file %s failed.\n", fileName); + exit(1); + } + + fprintf(fh, "# Average density and average x velocity over each cross section in x direction. Snapshot taken at iteration %d.\n", iteration); + fprintf(fh, "# Plot on terminal: gnuplot -e \"set terminal dumb; plot \\\"%s\\\" u 1:2; plot \\\"%s\\\" u 1:3;\"\n", fileName, fileName); +// fprintf(fh, "# Plot graphically: gnuplot -e \"plot \\\"%s\\\" u 1:3 w linesp t \\\"l\\\", \\\"\\\" u 1:4 w linesp t \\\"simulation\\\"; pause -1;" + fprintf(fh, "# x, avg density, avg ux\n"); + + for (x = 0; x < nX; ++x) { + + uxSum = 0.0; + densitySum = 0.0; + nFluidNodes = 0; + + for (int y = 0; y < nY; ++y) { + for (int z = 0; z < nZ; ++z) { + + if (ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] == LAT_CELL_OBSTACLE) { + continue; + } + + kd->GetNode(kd, x, y, z, pdfs); + + density = + pdfs[D3Q19_C] + + pdfs[D3Q19_N] + pdfs[D3Q19_E] + pdfs[D3Q19_S] + pdfs[D3Q19_W] + + pdfs[D3Q19_NE] + pdfs[D3Q19_SE] + pdfs[D3Q19_SW] + pdfs[D3Q19_NW] + + pdfs[D3Q19_T] + pdfs[D3Q19_TN] + pdfs[D3Q19_TE] + pdfs[D3Q19_TS] + pdfs[D3Q19_TW] + + pdfs[D3Q19_B] + pdfs[D3Q19_BN] + pdfs[D3Q19_BE] + pdfs[D3Q19_BS] + pdfs[D3Q19_BW]; + + densitySum += density; + + ux = + pdfs[D3Q19_E] + pdfs[D3Q19_NE] + pdfs[D3Q19_SE] + pdfs[D3Q19_TE] + pdfs[D3Q19_BE] - + pdfs[D3Q19_W] - pdfs[D3Q19_NW] - pdfs[D3Q19_SW] - pdfs[D3Q19_TW] - pdfs[D3Q19_BW]; + + uxSum += ux; + + ++nFluidNodes; + } + } + + fprintf(fh, "%d %e %e\n", x, densitySum / nFluidNodes, uxSum / nFluidNodes); + } + + fclose(fh); +} + + + +void KernelAddBodyForce(KernelData * kd, LatticeDesc * ld, CaseData * cd) +{ + Assert(kd != NULL); + Assert(ld != NULL); + Assert(cd != NULL); + + int nX = kd->Dims[0]; + int nY = kd->Dims[1]; + int nZ = kd->Dims[2]; + + PdfT w_0 = 1.0 / 3.0; // C + PdfT w_1 = 1.0 / 18.0; // N,S,E,W,T,B + PdfT w_2 = 1.0 / 36.0; // 
NE,NW,SE,SW,TE,TW,BE,BW,TN,TS,BN,BS + PdfT w[] = {w_1,w_1,w_1,w_1,w_2,w_2,w_2,w_2,w_1,w_2,w_2,w_2,w_2,w_1,w_2,w_2,w_2,w_2,w_0}; + + PdfT xForce = cd->XForce; + + PdfT pdfs[N_D3Q19]; + + + #ifdef _OPENMP + #pragma omp parallel for collapse(3) default(none) \ + shared(nX,nY,nZ,ld,kd,w,xForce,D3Q19_X,cd) \ + private(pdfs) + #endif + for(int z = 0; z < nZ; ++z) { + for(int y = 0; y < nY; ++y) { + for(int x = 0; x < nX; ++x) { + if(ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)] == LAT_CELL_OBSTACLE) + continue; + + // load pdfs into temp array + kd->GetNode(kd, x, y, z, pdfs); + + // add body force in x direction ( method by Luo) + for (int d = 0; d < N_D3Q19; ++d) { + pdfs[d] = pdfs[d] + 3.0*w[d]*D3Q19_X[d]*xForce; + } + + kd->SetNode(kd, x, y, z, pdfs); + + } + } + } +} diff --git a/src/Kernel.h b/src/Kernel.h new file mode 100644 index 0000000..eddae64 --- /dev/null +++ b/src/Kernel.h @@ -0,0 +1,217 @@ +// -------------------------------------------------------------------------- +// +// Copyright +// Markus Wittmann, 2016-2017 +// RRZE, University of Erlangen-Nuremberg, Germany +// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de +// +// Viktor Haag, 2016 +// LSS, University of Erlangen-Nuremberg, Germany +// +// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). +// +// LbmBenchKernels is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// LbmBenchKernels is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with LbmBenchKernels. If not, see . 
//
// --------------------------------------------------------------------------
#ifndef __KERNEL_H__
#define __KERNEL_H__

#include "Base.h"
#include "Lattice.h"

// DATA_LAYOUT_NAME and PROP_MODEL_NAME are derived below from the
// DATA_LAYOUT_* / PROP_MODEL_* switches and must not be predefined.
#ifdef DATA_LAYOUT_NAME
    #error DATA_LAYOUT_NAME must not be defined here.
#endif

#ifdef PROP_MODEL_NAME
    #error PROP_MODEL_NAME must not be defined here.
#endif


// Data layout token (SoA or AoS).
// NOTE(review): presumably pasted into kernel symbol names such as
// ..._PushSoA; confirm against the kernel sources.
#ifdef DATA_LAYOUT_SOA
    #define DATA_LAYOUT_NAME SoA
#endif

#ifdef DATA_LAYOUT_AOS
    #define DATA_LAYOUT_NAME AoS
#endif

// Propagation model token (Push, Pull or AA).
#ifdef PROP_MODEL_PUSH
    #define PROP_MODEL_NAME Push
#endif

#ifdef PROP_MODEL_PULL
    #define PROP_MODEL_NAME Pull
#endif

#ifdef PROP_MODEL_AA
    #define PROP_MODEL_NAME AA
#endif



// Floating point type used for all particle distribution functions (PDFs).
typedef double PdfT;



#define D3Q19


// Number of discrete directions (PDFs per node).
#define N_D3Q19     19

// Direction indices into a node's PDF array.
#define D3Q19_N      0
#define D3Q19_S      1
#define D3Q19_E      2
#define D3Q19_W      3

#define D3Q19_NE     4
#define D3Q19_SE     5
#define D3Q19_NW     6
#define D3Q19_SW     7

#define D3Q19_T      8
#define D3Q19_TN     9
#define D3Q19_TE    10
#define D3Q19_TW    11
#define D3Q19_TS    12

#define D3Q19_B     13
#define D3Q19_BS    14
#define D3Q19_BN    15
#define D3Q19_BW    16
#define D3Q19_BE    17

#define D3Q19_C     18      // IMPORTANT: Center particle must be the last one.

// ---

// X is used as the X-macro callback name by the direction lists that follow.
#ifdef X
    #error X is not allowed to be defined here!
#endif

// The following list must be sorted ascending according
// to the index of the direction, i.e. D3Q19_N, D3Q19_S, ...
// X-macro table of all 19 directions. Each entry is
//   X(name, index, index of the opposite direction, x, y, z)
// where (x, y, z) is presumably the lattice offset of the direction
// (E = (1,0,0), N = (0,1,0), T = (0,0,1)). The user defines X before
// expanding the list.
#define D3Q19_LIST \
    X(N,  D3Q19_N,  D3Q19_S,   0,  1,  0) \
    X(S,  D3Q19_S,  D3Q19_N,   0, -1,  0) \
    X(E,  D3Q19_E,  D3Q19_W,   1,  0,  0) \
    X(W,  D3Q19_W,  D3Q19_E,  -1,  0,  0) \
    X(NE, D3Q19_NE, D3Q19_SW,  1,  1,  0) \
    X(SE, D3Q19_SE, D3Q19_NW,  1, -1,  0) \
    X(NW, D3Q19_NW, D3Q19_SE, -1,  1,  0) \
    X(SW, D3Q19_SW, D3Q19_NE, -1, -1,  0) \
    X(T,  D3Q19_T,  D3Q19_B,   0,  0,  1) \
    X(TN, D3Q19_TN, D3Q19_BS,  0,  1,  1) \
    X(TE, D3Q19_TE, D3Q19_BW,  1,  0,  1) \
    X(TW, D3Q19_TW, D3Q19_BE, -1,  0,  1) \
    X(TS, D3Q19_TS, D3Q19_BN,  0, -1,  1) \
    X(B,  D3Q19_B,  D3Q19_T,   0,  0, -1) \
    X(BS, D3Q19_BS, D3Q19_TN,  0, -1, -1) \
    X(BN, D3Q19_BN, D3Q19_TS,  0,  1, -1) \
    X(BW, D3Q19_BW, D3Q19_TE, -1,  0, -1) \
    X(BE, D3Q19_BE, D3Q19_TW,  1,  0, -1) \
    X(C,  D3Q19_C,  D3Q19_C,   0,  0,  0)

// Same table as D3Q19_LIST without the center direction C.
#define D3Q19_LIST_WO_C \
    X(N,  D3Q19_N,  D3Q19_S,   0,  1,  0) \
    X(S,  D3Q19_S,  D3Q19_N,   0, -1,  0) \
    X(E,  D3Q19_E,  D3Q19_W,   1,  0,  0) \
    X(W,  D3Q19_W,  D3Q19_E,  -1,  0,  0) \
    X(NE, D3Q19_NE, D3Q19_SW,  1,  1,  0) \
    X(SE, D3Q19_SE, D3Q19_NW,  1, -1,  0) \
    X(NW, D3Q19_NW, D3Q19_SE, -1,  1,  0) \
    X(SW, D3Q19_SW, D3Q19_NE, -1, -1,  0) \
    X(T,  D3Q19_T,  D3Q19_B,   0,  0,  1) \
    X(TN, D3Q19_TN, D3Q19_BS,  0,  1,  1) \
    X(TE, D3Q19_TE, D3Q19_BW,  1,  0,  1) \
    X(TW, D3Q19_TW, D3Q19_BE, -1,  0,  1) \
    X(TS, D3Q19_TS, D3Q19_BN,  0, -1,  1) \
    X(B,  D3Q19_B,  D3Q19_T,   0,  0, -1) \
    X(BS, D3Q19_BS, D3Q19_TN,  0, -1, -1) \
    X(BN, D3Q19_BN, D3Q19_TS,  0,  1, -1) \
    X(BW, D3Q19_BW, D3Q19_TE, -1,  0, -1) \
    X(BE, D3Q19_BE, D3Q19_TW,  1,  0, -1)


// Per-direction lookup tables, indexed by the D3Q19_* direction indices.
// Only declared here; the definitions live in another translation unit.
extern int D3Q19_X[N_D3Q19];
extern int D3Q19_Y[N_D3Q19];
extern int D3Q19_Z[N_D3Q19];
extern int D3Q19_INV[N_D3Q19];

extern const char * D3Q19_NAMES[N_D3Q19];



// Parameters of one simulation case; filled in by main() from the command line.
typedef struct CaseData_ {
    PdfT Omega;             // Relaxation parameter; viscosity is derived as (1/Omega - 0.5)/3.
    PdfT RhoIn;             // Density at the inlet.
    PdfT RhoOut;            // Density at the outlet.
    PdfT Ux;                // NOTE(review): presumably the initial velocity components - confirm.
    PdfT Uy;
    PdfT Uz;
    PdfT XForce;            // Body force in x direction (see KernelAddBodyForce).
    int MaxIterations;      // Number of iterations to perform.
    int VtkOutput;          // Non-zero: VTK output requested.
    int VtkModulus;         // VTK output every VtkModulus iterations.
    int StatisticsModulus;  // Statistics output every StatisticsModulus iterations.
} CaseData;


// State shared by all kernels plus the function pointers each kernel installs.
// NOTE(review): the meaning of the data members is not visible in this header;
// the remarks below are inferred from the names and should be confirmed.
typedef struct KernelData_ {
    PdfT * Pdfs[2];             // presumably the two PDF lattices
    PdfT * SrcPdfs;             // presumably the current source lattice
    PdfT * DstPdfs;             // presumably the current destination lattice
    PdfT * PdfsActive;
    int Dims[3];                // Kernel's lattice dimensions (used as loop bounds in KernelAddBodyForce).
    int GlobalDims[3];
    int Offsets[3];
    int * ObstIndices;
    int nObstIndices;
    int * BounceBackPdfsSrc;
    int * BounceBackPdfsDst;
    int nBounceBackPdfs;

    // Access to a single PDF (direction dir) of node (x, y, z) for boundary handling.
    void (* BoundaryConditionsGetPdf)(struct KernelData_ * kd, int x, int y, int z, int dir, PdfT * pdf);
    void (* BoundaryConditionsSetPdf)(struct KernelData_ * kd, int x, int y, int z, int dir, PdfT pdf);

    // Read/write all N_D3Q19 PDFs of node (x, y, z) from/to the array pdfs.
    void (* GetNode)(struct KernelData_ * kd, int x, int y, int z, PdfT * pdfs);
    void (* SetNode)(struct KernelData_ * kd, int x, int y, int z, PdfT * pdfs);

    // The LBM kernel itself; main() calls it once for the whole run.
    void (* Kernel)(LatticeDesc * ld, struct KernelData_ * kd, CaseData * cd);

} KernelData;

// Command line arguments handed to a kernel's Init function.
typedef struct Parameters_ {
    int nArgs;              // argc of the program.
    char ** Args;           // argv of the program.
    int nKernelArgs;        // Number of arguments following "--".
    char ** KernelArgs;     // Arguments following "--" (NULL if there are none).
} Parameters;

void KernelComputeBoundaryConditions(KernelData * kd, LatticeDesc * ld, CaseData * cd);

PdfT KernelDensity(KernelData * kd, LatticeDesc * ld);

void KernelStatistics(KernelData * kd, LatticeDesc * ld, CaseData * cd, int iteration);
void KernelStatisticsAdv(KernelData * kd, LatticeDesc * ld, CaseData * cd, int iteration, int forceOutput);


void KernelSetInitialDensity (LatticeDesc * ld, KernelData * kd, CaseData * cd);
void KernelSetInitialVelocity(LatticeDesc * ld, KernelData * kd, CaseData * cd);

void KernelVerifiy(LatticeDesc * ld, KernelData * kd, CaseData * cd, PdfT * errorNorm);

void KernelAddBodyForce(KernelData * kd, LatticeDesc * ld, CaseData * cd);

#endif // __KERNEL_H__
/* Patch metadata, kept verbatim:
diff --git a/src/KernelFunctions.h b/src/KernelFunctions.h
new file mode 100644
index 0000000..2ee063a
--- /dev/null
+++ b/src/KernelFunctions.h
@@ -0,0 +1,138 @@
*/
// --------------------------------------------------------------------------
//
// Copyright
//   Markus Wittmann, 2016-2017
//   RRZE, University of Erlangen-Nuremberg, Germany
//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
//
//   Viktor Haag, 2016
//   LSS, University of Erlangen-Nuremberg, Germany
//
//  This file is part of the Lattice Boltzmann Benchmark Kernels
//  (LbmBenchKernels).
//
//  LbmBenchKernels is free software: you can redistribute it and/or modify
//  it under the terms of the GNU General Public License as published by
//  the Free Software Foundation, either version 3 of the License, or
//  (at your option) any later version.
//
//  LbmBenchKernels is distributed in the hope that it will be useful,
//  but WITHOUT ANY WARRANTY; without even the implied warranty of
//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//  GNU General Public License for more details.
//
//  You should have received a copy of the GNU General Public License
//  along with LbmBenchKernels.  If not, see .
//  NOTE(review): the license URL after "see" appears to have been stripped
//  from this copy of the file.
//
// --------------------------------------------------------------------------
#ifndef __KERNEL_FUNCTIONS_H__
#define __KERNEL_FUNCTIONS_H__

#include "BenchKernelD3Q19.h"
#include "BenchKernelD3Q19List.h"
#include "BenchKernelD3Q19ListAa.h"
#include "BenchKernelD3Q19ListAaRia.h"
#include "BenchKernelD3Q19ListAaPv.h"
#include "BenchKernelD3Q19ListPullSplitNt.h"

// One selectable benchmark kernel: its command line name and the functions
// that set up and tear down its KernelData.
typedef struct KernelFunctions_
{
    char Name[128];
    void (* Init)(LatticeDesc * ld, KernelData ** kernelData, Parameters * params);
    void (* Deinit)(LatticeDesc * ld, KernelData ** kernelData);
} KernelFunctions;

// Table of all available kernels. main() lists the names for -list, matches
// -kernel against Name case-insensitively and falls back to entry 0 when no
// kernel name is set.
// NOTE(review): this is a non-static definition inside a header; including
// this header from more than one translation unit would define g_kernels
// multiple times.
KernelFunctions g_kernels[] =
{
    {
        .Name   = "list-aa-pv-soa",
        .Init   = D3Q19ListAaPvInit_PushSoA,
        .Deinit = D3Q19ListAaPvDeinit_PushSoA
    },
    {
        .Name   = "list-aa-ria-soa",
        .Init   = D3Q19ListAaRiaInit_PushSoA,
        .Deinit = D3Q19ListAaRiaDeinit_PushSoA
    },
    {
        .Name   = "list-aa-soa",
        .Init   = D3Q19ListAaInit_PushSoA,
        .Deinit = D3Q19ListAaDeinit_PushSoA
    },
    {
        .Name   = "list-aa-aos",
        .Init   = D3Q19ListAaInit_PushAoS,
        .Deinit = D3Q19ListAaDeinit_PushAoS
    },
    {
        .Name   = "list-pull-split-nt-1s-soa",
        .Init   = D3Q19ListPullSplitNt1SInit_PullSoA,
        .Deinit = D3Q19ListPullSplitNtDeinit_PullSoA
    },
    {
        .Name   = "list-pull-split-nt-2s-soa",
        .Init   = D3Q19ListPullSplitNt2SInit_PullSoA,
        .Deinit = D3Q19ListPullSplitNtDeinit_PullSoA
    },
    {
        .Name   = "list-push-soa",
        .Init   = D3Q19ListInit_PushSoA,
        .Deinit = D3Q19ListDeinit_PushSoA
    },
    {
        .Name   = "list-push-aos",
        .Init   = D3Q19ListInit_PushAoS,
        .Deinit = D3Q19ListDeinit_PushAoS
    },
    {
        .Name   = "list-pull-soa",
        .Init   = D3Q19ListInit_PullSoA,
        .Deinit = D3Q19ListDeinit_PullSoA
    },
    {
        .Name   = "list-pull-aos",
        .Init   = D3Q19ListInit_PullAoS,
        .Deinit = D3Q19ListDeinit_PullAoS
    },
    {
        .Name   = "push-soa",
        .Init   = D3Q19Init_PushSoA,
        .Deinit = D3Q19Deinit_PushSoA
    },
    {
        .Name   = "push-aos",
        .Init   = D3Q19Init_PushAoS,
        .Deinit = D3Q19Deinit_PushAoS
    },
    {
        .Name   = "pull-soa",
        .Init   = D3Q19Init_PullSoA,
        .Deinit = D3Q19Deinit_PullSoA
    },
    {
        .Name   = "pull-aos",
        .Init   = D3Q19Init_PullAoS,
        .Deinit = D3Q19Deinit_PullAoS
    },
    {
        .Name   = "blk-push-soa",
        .Init   = D3Q19BlkInit_PushSoA,
        .Deinit = D3Q19BlkDeinit_PushSoA
    },
    {
        .Name   = "blk-push-aos",
        .Init   = D3Q19BlkInit_PushAoS,
        .Deinit = D3Q19BlkDeinit_PushAoS
    },
    {
        .Name   = "blk-pull-soa",
        .Init   = D3Q19BlkInit_PullSoA,
        .Deinit = D3Q19BlkDeinit_PullSoA
    },
    {
        .Name   = "blk-pull-aos",
        .Init   = D3Q19BlkInit_PullAoS,
        .Deinit = D3Q19BlkDeinit_PullAoS
    },
};

#endif // __KERNEL_FUNCTIONS_H__
/* Patch metadata, kept verbatim:
diff --git a/src/Lattice.h b/src/Lattice.h
new file mode 100644
index 0000000..99b3fc3
--- /dev/null
+++ b/src/Lattice.h
@@ -0,0 +1,78 @@
*/
// --------------------------------------------------------------------------
//
// Copyright
//   Markus Wittmann, 2016-2017
//   RRZE, University of Erlangen-Nuremberg, Germany
//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
//
//   Viktor Haag, 2016
//   LSS, University of Erlangen-Nuremberg, Germany
//
//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
//
//  LbmBenchKernels is free software: you can redistribute it and/or modify
//  it under the terms of the GNU General Public License as published by
//  the Free Software Foundation, either version 3 of the License, or
//  (at your option) any later version.
//
//  LbmBenchKernels is distributed in the hope that it will be useful,
//  but WITHOUT ANY WARRANTY; without even the implied warranty of
//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//  GNU General Public License for more details.
//
//  You should have received a copy of the GNU General Public License
//  along with LbmBenchKernels.  If not, see .
//  NOTE(review): the license URL after "see" appears to have been stripped
//  from this copy of the file.
//
// --------------------------------------------------------------------------
#ifndef __LATTICE_H__
#define __LATTICE_H__

#include "Base.h"


// Type of one lattice cell entry; holds a LAT_CELL_TYPES value.
typedef int LatticeT;

// Cell types stored in LatticeDesc.Lattice.
typedef enum LAT_CELL_TYPES_ {
    LAT_CELL_OBSTACLE = 0,
    LAT_CELL_FLUID    = 1,
    LAT_CELL_INLET    = 2,
    LAT_CELL_OUTLET   = 4
} LAT_CELL_TYPES;


// Description of the simulation lattice (geometry).
typedef struct LatticeDesc_ {
    int Dims[3];            // Lattice extent in x, y and z direction.
    LatticeT * Lattice;     // Cell types, addressed via L_INDEX_4(Dims, x, y, z).
    int nCells;             // Total number of cells (Obstacles + Fluids).
    int nFluid;             // Total number of fluid cells. Fluid cells are fluid, inlet, and outlet.
    int nObst;              // Total number of obstacles in the lattice.
    int nInlet;             // Total number of inlets in the lattice.
    int nOutlet;            // Total number of outlets.
    int PeriodicX;          // Periodic in X direction.
    int PeriodicY;          // Periodic in Y direction.
    int PeriodicZ;          // Periodic in Z direction.
} LatticeDesc;


// #define L_INDEX_4(dims, x, y, z) ((z) * (dims[0]) * (dims[1]) + (y) * (dims[0]) + (x))

// Returns the linear index of cell (x, y, z) in a lattice of extent dims,
// with x running fastest and z slowest. The Assert() calls check that dims
// is non-NULL and positive and that the coordinates lie inside the lattice.
static inline int L_INDEX_4(int dims[3], int x, int y, int z)
{
    Assert(dims != NULL);

    Assert(dims[0] > 0);
    Assert(dims[1] > 0);
    Assert(dims[2] > 0);

    Assert(x >= 0);
    Assert(x < dims[0]);
    Assert(y >= 0);
    Assert(y < dims[1]);
    Assert(z >= 0);
    Assert(z < dims[2]);

    return z * dims[0] * dims[1] + y * dims[0] + x;
}


#endif // __LATTICE_H__
/* Patch metadata, kept verbatim:
diff --git a/src/LikwidIf.h b/src/LikwidIf.h
new file mode 100644
index 0000000..e3c24da
--- /dev/null
+++ b/src/LikwidIf.h
@@ -0,0 +1,46 @@
*/
// --------------------------------------------------------------------------
//
// Copyright
//   Markus Wittmann, 2016-2017
//   RRZE, University of Erlangen-Nuremberg, Germany
//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
//
//   Viktor Haag, 2016
//   LSS, University of Erlangen-Nuremberg, Germany
//
//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
//
//  LbmBenchKernels is free software: you can redistribute it and/or modify
//  it under the terms of the GNU General Public License as published by
//  the Free Software Foundation, either version 3 of the License, or
//  (at your option) any later version.
//
//  LbmBenchKernels is distributed in the hope that it will be useful,
//  but WITHOUT ANY WARRANTY; without even the implied warranty of
//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//  GNU General Public License for more details.
//
//  You should have received a copy of the GNU General Public License
//  along with LbmBenchKernels.  If not, see .
//  NOTE(review): the license URL after "see" appears to have been stripped
//  from this copy of the file.
//
// --------------------------------------------------------------------------
#ifndef __LIKWID_IF_H__
#define __LIKWID_IF_H__

// Thin wrapper around the LIKWID marker API. With HAVE_LIKWID defined the
// X_LIKWID_* macros forward to the likwid_marker* functions; otherwise they
// expand to nothing, so call sites need no #ifdef of their own.
#ifdef HAVE_LIKWID
    // The header name was missing from the #include line in this copy of the
    // file; the likwid_marker* functions used below are declared in likwid.h.
    #include <likwid.h>


    #define X_LIKWID_INIT()                 likwid_markerInit()
    #define X_LIKWID_DEINIT()               likwid_markerClose()
    #define X_LIKWID_START(region_name)     likwid_markerStartRegion(region_name)
    #define X_LIKWID_STOP(region_name)      likwid_markerStopRegion(region_name)
#else

    #define X_LIKWID_INIT()
    #define X_LIKWID_DEINIT()
    #define X_LIKWID_START(region_name)
    #define X_LIKWID_STOP(region_name)
#endif

#endif // __LIKWID_IF_H__
/* Patch metadata, kept verbatim:
diff --git a/src/Main.c b/src/Main.c
new file mode 100644
index 0000000..1525ad5
--- /dev/null
+++ b/src/Main.c
@@ -0,0 +1,589 @@
*/
// --------------------------------------------------------------------------
//
// Copyright
//   Markus Wittmann, 2016-2017
//   RRZE, University of Erlangen-Nuremberg, Germany
//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
//
//   Viktor Haag, 2016
//   LSS, University of Erlangen-Nuremberg, Germany
//
//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
//
//  LbmBenchKernels is free software: you can redistribute it and/or modify
//  it under the terms of the GNU General Public License as published by
//  the Free Software Foundation, either version 3 of the License, or
//  (at your option) any later version.
//
//  LbmBenchKernels is distributed in the hope that it will be useful,
//  but WITHOUT ANY WARRANTY; without even the implied warranty of
//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//  GNU General Public License for more details.
//
//  You should have received a copy of the GNU General Public License
//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+// +// -------------------------------------------------------------------------- +#include +#include +#include +#include // strcasecmp + +#include + +#ifdef _OPENMP +#include +#endif + +#include "Base.h" +#include "Kernel.h" +#include "Memory.h" + +#include "Lattice.h" +#include "Geometry.h" +#include "Pinning.h" +#include "LikwidIf.h" + +#include "KernelFunctions.h" + +#ifdef __x86_64__ + #include + + + #define MXCSR_DAZ 6 + #define MXCSR_FTZ 15 + + + int FpIsMxCsrMaskSet(unsigned int mask) + { + unsigned int mxcsr; + unsigned int mxcsrNew; + + mxcsr = _mm_getcsr(); + + mxcsrNew = mxcsr & mask; + + return (mxcsrNew == mask); + } + + int FpGetFtz() + { + return FpIsMxCsrMaskSet(1 << MXCSR_FTZ); + } + + int FpGetDaz() + { + return FpIsMxCsrMaskSet(1 << MXCSR_DAZ); + } +#endif + + +int ParseDimensions(const char * parameter, int * nX, int * nY, int * nZ) +{ + char * tmp; + + *nX = atoi(parameter); + + if (*nX <= 0) { + printf("ERROR: parameter for X dimension must be > 0.\n"); + return 0; + } + + tmp = strchr(parameter, 'x'); + + if (tmp == NULL) { + printf("ERROR: parameter for Y dimension is missing.\n"); + return 0; + } + + *nY = atoi(tmp + 1); + + if (*nY <= 0) { + printf("ERROR: parameter for Y dimension must be > 0.\n"); + return 0; + } + + tmp = strchr(tmp + 1, 'x'); + + if (tmp == NULL) { + printf("ERROR: parameter for Z dimension is missing.\n"); + return 0; + } + + *nZ = atoi(tmp + 1); + + if (*nZ <= 0) { + printf("ERROR: parameter for Z dimension must be > 0.\n"); + return 0; + } + + return 1; +} + +int main(int argc, char * argv[]) +{ + int dims[3] = { 20, 20, 20 }; // Dimensions in x, y, and z direction + const char * geometryType = "channel"; + // int latticeDumpAscii = 0; + int verify = 0; UNUSED(verify); + char * kernelToUse = NULL; + int nThreads = 1; + const char * pinString = NULL; + int periodic[3] = { 0 }; + + CaseData cd; + + cd.MaxIterations = 1000; + cd.RhoIn = 1.0; + cd.RhoOut = 1.0; + cd.Omega = 1.0; + cd.VtkOutput = 0; + cd.VtkModulus = 
100; + cd.StatisticsModulus = 100; + cd.XForce = 0.00001; + kernelToUse = "push-soa"; + + Parameters p; + p.nArgs = argc; + p.Args = argv; + p.nKernelArgs = 0; + p.KernelArgs = NULL; + +#define LBM_BENCH_KERNELS_VERSION_MAJOR 0 +#define LBM_BENCH_KERNELS_VERSION_MINOR 1 + + printf("Lattice Boltzmann Benchmark Kernels (LbmBenchKernels) Copyright (C) 2016, 2017 LSS, RRZE\n"); + printf("This program comes with ABSOLUTELY NO WARRANTY; for details see LICENSE.\n"); + printf("This is free software, and you are welcome to redistribute it under certain conditions.\n"); + printf("\n"); + printf("LBM Benchmark Kernels %d.%d, compiled %s %s, type: %s\n", + LBM_BENCH_KERNELS_VERSION_MAJOR, LBM_BENCH_KERNELS_VERSION_MINOR, __DATE__, __TIME__, +#ifdef VERIFICATION + "verification" +#else + "benchmark" +#endif + ); + + // ---------------------------------------------------------------------- + // Parse command line arguments + + #define ARG_IS(param) (!strcmp(argv[i], param)) + #define NEXT_ARG_PRESENT() \ + do { \ + if (i + 1 >= argc) { \ + printf("ERROR: argument %s requires a parameter.\n", argv[i]); \ + return 1; \ + } \ + } while (0) + + for (int i = 1; i < argc; ++i) { + + if (ARG_IS("-dims") || ARG_IS("--dims")) { + NEXT_ARG_PRESENT(); + + + if (!ParseDimensions(argv[++i], &dims[0], &dims[1], &dims[2])) { + return 1; + } + } + // else if (ARG_IS("-lattice-dump-ascii") || ARG_IS("--lattice-dump-ascii")) { + // latticeDumpAscii = 1; + // } + else if (ARG_IS("-geometry") || ARG_IS("--geometry")) { + NEXT_ARG_PRESENT(); + + geometryType = argv[++i]; + } + else if (ARG_IS("-iterations") ||ARG_IS("--iterations")) { + NEXT_ARG_PRESENT(); + + cd.MaxIterations = strtol(argv[++i], NULL, 0); + + if (cd.MaxIterations <= 0) { + printf("ERROR: number of iterations must be > 0.\n"); + return 1; + } + } + else if (ARG_IS("-rho-in") ||ARG_IS("--rho-in")) { + NEXT_ARG_PRESENT(); + + cd.RhoIn = strtod(argv[++i], NULL); + } + else if (ARG_IS("-rho-out") ||ARG_IS("--rho-out")) { + 
NEXT_ARG_PRESENT(); + + cd.RhoOut = strtod(argv[++i], NULL); + } + else if (ARG_IS("-omega") ||ARG_IS("--omega")) { + NEXT_ARG_PRESENT(); + + cd.Omega = strtod(argv[++i], NULL); + } + else if (ARG_IS("-x-force") ||ARG_IS("--x-force")) { + NEXT_ARG_PRESENT(); + + cd.XForce = strtod(argv[++i], NULL); + } + else if (ARG_IS("-verify") || ARG_IS("--verify")) { +#ifdef VERIFICATION + + // Choose this preset for verification. As geometry type "box" is + // used but x and y direction are made pridoc. + // Everything else can be altered, but enough iterations should be + // performed in order to receive a fully developed flow field. + verify = 1; + + cd.Omega = 1.0; + cd.RhoIn = 1.0; + cd.RhoOut = 1.0; + geometryType = "box"; + dims[0] = 16; + dims[1] = 16; + dims[2] = 16; + cd.XForce = 0.00001; + cd.MaxIterations = 1000; + periodic[0] = 1; + periodic[1] = 1; + periodic[2] = 0; + + printf("#\n"); + printf("# VERIFICATION: verifying flow profile of channel flow.\n"); + printf("#\n"); + + // TODO: this is not a good idea as we ignore all other options... + +#else + printf("ERROR: in order to use -verify VERIFICATION must be defined during compilation.\n"); + printf(" Recompile with VERIFICATION=on.\n"); + return 1; +#endif + } + else if (ARG_IS("-vtk") || ARG_IS("--vtk")) { +#ifdef VTK_OUTPUT + + cd.VtkOutput = 1; + + // If the next parameter is a number it is used as the itartion count, + // if not it is probably another parameter. 
+ if (i + 1 < argc) { + + int vtkModulus = atoi(argv[i+1]); + + if (vtkModulus > 0) { + cd.VtkModulus = vtkModulus; + ++i; + } + } +#else + printf("ERROR: in order to use -vtk VTK_OUTPUT must be defined during compilation.\n"); + printf(" Recompile with VTK_OUTPUT=on.\n"); + return 1; +#endif + } + else if (ARG_IS("-statistics") || ARG_IS("--statistics")) { +#ifdef STATISTICS + NEXT_ARG_PRESENT(); + + cd.StatisticsModulus = atoi(argv[++i]); + + if (cd.StatisticsModulus <= 0) { + printf("ERROR: the iteration count for -statistics must be > 0.\n"); + return 1; + } +#else + printf("ERROR: in order to use -statistics STATISTICS must be defined during compilation.\n"); + printf(" Recompile with STATISTICS=on.\n"); + return 1; +#endif + } + else if (ARG_IS("-kernel") || ARG_IS("--kernel")) { + NEXT_ARG_PRESENT(); + + kernelToUse = argv[++i]; + } + else if (ARG_IS("-list") || ARG_IS("--list")) { + printf("Available kernels to benchmark:\n"); + + for (int j = 0; j < N_ELEMS(g_kernels); ++j) { + printf(" %s\n", g_kernels[j].Name); + } + + return 0; + } + else if (ARG_IS("-pin") || ARG_IS("--pin")) { + NEXT_ARG_PRESENT(); + + pinString = argv[++i]; + } + else if (ARG_IS("-t") || ARG_IS("-threads") || ARG_IS("--threads")) { +#ifdef _OPENMP + NEXT_ARG_PRESENT(); + + nThreads = atoi(argv[++i]); + + if (nThreads <= 0) { + printf("ERROR: number of threads must be > 0.\n"); + return 1; + } +#else + printf("ERROR: specifying number of threads is only available when compiled with OpenMP support.\n"); + return 1; +#endif + } + else if (ARG_IS("-periodic-x") || ARG_IS("--periodic-x")) { + periodic[0] = 1; + } + else if (ARG_IS("-periodic-y") || ARG_IS("--periodic-y")) { + periodic[1] = 1; + } + else if (ARG_IS("-periodic-z") || ARG_IS("--periodic-z")) { + periodic[2] = 1; + } + else if (ARG_IS("-h") || ARG_IS("-help") || ARG_IS("--help")) { + printf("ERROR: unknown argument: %s\n", argv[i]); + printf("\n"); + printf("Usage:\n"); + printf("./lbmbenchk -list\n"); + printf("./lbmbenchk 
\n"); + printf(" [-dims XxYyZ] [-geometry box|channel|pipe|porosity[-value]] [-iterations ] [-lattice-dump-ascii]\n"); + printf(" [-rho-in ] [-rho-out ] [-kernel ]\n"); + printf(" [-periodic-x]\n"); +#ifdef STATISTICS + printf(" [-statistics ]\n"); +#endif +#ifdef VTK_OUTPUT + printf(" [-vtk []]\n"); +#endif +#ifdef _OPENMP + printf(" [-t ]\n"); +#endif + printf(" [-pin core{,core}*]\n"); +#ifdef VERIFICATION + printf(" [-verify]\n"); +#endif + printf(" -- \n"); + printf("\n"); + printf("-list List available kernels.\n"); + printf("\n"); + printf("-dims XxYxZ Specify geometry dimensions.\n"); + printf("\n"); + printf("-geometry porosity-\n"); + printf(" Geometetry with blocks of size regularily layout out.\n"); + printf("\n"); + return 1; + } + else if (ARG_IS("--")) { + // printf("# kernel args start with %s these are %d args.\n", argv[i + 1], argc - i - 1); + p.KernelArgs = &argv[++i]; + p.nKernelArgs = argc - i; + break; + } + else { + printf("ERROR: unknown parameter: %s.\n", argv[i]); + exit(1); + } + } + + #undef ARG_IS + #undef NEXT_ARG_PRESENT + + + // ---------------------------------------------------------------------- + // Check if we exceed our index addressing PDFs. 
+ + { + uint64_t nPdfs = ((uint64_t)19) * dims[0] * dims[1] * dims[2]; + + if (nPdfs > ((2LU << 31) - 1)) { + printf("ERROR: number of PDFs exceed 2^31.\n"); + exit(1); + } + } + + // ---------------------------------------------------------------------- + +#ifdef _OPENMP + omp_set_num_threads(nThreads); +#endif + + LatticeDesc ld; + + GeoCreateByStr(geometryType, dims, periodic, &ld); + + const char * defines[] = { +#ifdef VTK_OUTPUT + "VTK_OUTPUT", +#endif +#ifdef STATISTICS + "STATISTICS", +#endif +#ifdef VERIFICATION + "VERIFICATION", +#endif +#ifdef _OPENMP + "_OPENMP", +#endif +#ifdef HAVE_LIKWID + "HAVE_LIKWID", +#endif + }; + + printf("# defines: "); + for (int j = 0; j < N_ELEMS(defines); ++j) { + printf("%s ", defines[j]); + } + printf("\n"); + + printf("# nodes total: % 10d\n", ld.nObst + ld.nFluid); + printf("# nodes fluid: % 10d (including inlet & outlet)\n", ld.nFluid); + printf("# nodes obstacles: % 10d\n", ld.nObst); + printf("# nodes inlet: % 10d\n", ld.nInlet); + printf("# nodes outlet: % 10d\n", ld.nOutlet); + printf("# periodicity: x: %d y: %d z: %d\n", ld.PeriodicX, ld.PeriodicY, ld.PeriodicZ); + +#ifdef VTK_OUTPUT + printf("# VTK output: %d (every %d iteration)\n", cd.VtkOutput, cd.VtkModulus); +#endif +#ifdef STATISTICS + printf("# statistics: every %d iteration\n", cd.StatisticsModulus); +#endif + + printf("# omega: %f\n", cd.Omega); + printf("# initial density at inlet/outlet:\n"); + printf("# rho in: %e\n", cd.RhoIn); + printf("# rho out: %e\n", cd.RhoOut); + printf("# iterations: %d\n", cd.MaxIterations); + +#ifdef __x86_64__ + printf("# fp status: DAZ: %d FTZ: %d\n", FpGetDaz(), FpGetFtz()); +#endif + +#ifdef _OPENMP + printf("# OpenMP threads: %d\n", omp_get_max_threads()); + + if (pinString != NULL) { + #pragma omp parallel + { + int threadId = omp_get_thread_num(); + int err; + + err = PinCurrentThreadByCpuList(pinString, 0, 0, threadId); + + if (err) { + printf("ERROR [thread %d]: pinning failed.\n", threadId); + exit(1); + } + + 
const char * cpuList = PinCpuListAsString(); + Assert(cpuList != NULL); + + // Not so nice hack to print the thread ids ordered. + #pragma omp for ordered + for (int i = 0; i < omp_get_num_threads(); ++i) { + #pragma omp ordered + printf("# thread %2d pinned to core(s): %s\n", threadId, cpuList); + } + + free((void *)cpuList); + } + } +#endif + + KernelData * kd; + + KernelFunctions * kf = NULL; + + if (kernelToUse == NULL) { + kf = &g_kernels[0]; + } + else { + for (int j = 0; j < N_ELEMS(g_kernels); ++j) { + + if (!strcasecmp(kernelToUse, g_kernels[j].Name)) { + kf = &g_kernels[j]; + break; + } + } + } + + if (kf == NULL) { + printf("ERROR: requested kernel \"%s\" not found.\n", kernelToUse); + exit(1); + } + + printf("#\n"); + printf("# kernel: %s\n", kf->Name); + printf("#\n"); + + // Initialize kernel by calling its own initialization function + kf->Init(&ld, &kd, &p); + +#ifdef VERIFICATION + if (verify) { + KernelSetInitialDensity( &ld, kd, &cd); + KernelSetInitialVelocity(&ld, kd, &cd); + } +#endif + + printf("# starting kernel...\n"); + + X_LIKWID_INIT(); + + double timeStart = Time(); + + // Call the LBM kernel + kd->Kernel(&ld, kd, &cd); + + double duration = Time() - timeStart; + + X_LIKWID_DEINIT(); + + // Print some statistics... 
+ KernelStatisticsAdv(kd, &ld, &cd, cd.MaxIterations, 1 /* force output */); + +#ifdef VERIFICATION + PdfT errorNorm = -1.0; + KernelVerifiy(&ld, kd, &cd, &errorNorm); +#endif + + // Deinitialize kernel by calling its own deinitialization function + kf->Deinit(&ld, &kd); + + + double perf = (double)ld.nFluid * (double)cd.MaxIterations / duration / 1.e6; + + printf("P: %f MFLUP/s t: %d d: %f s iter: %d fnodes: %f x1e6 geo: %s kernel: %s %s\n", + perf, nThreads, duration, cd.MaxIterations, ld.nFluid / 1e6, + geometryType, kernelToUse, +#ifdef VERIFICATION + "# VERIFICATION" +#else + "# benchmark" +#endif + ); + + int exitCode = 0; + +#ifdef VERIFICATION + + if (verify) { + printf("# VERIFICATION: deviation from analytical solution: %e\n", errorNorm); + + if (errorNorm > 0.1) { + printf("# VERIFICATION FAILED.\n"); + exitCode = 1; + } + else { + printf("# VERIFICATION SUCCEEDED.\n"); + } + } +#else +// printf("# VERIFICATION: deviation from analytical solution: %e\n", errorNorm); +// printf("# VERIFICATION: this is only valid for pipe geometry with enough iterations performed.\n"); +#endif + + MemFree((void **)&ld.Lattice); + + return exitCode; +} diff --git a/src/Makefile b/src/Makefile new file mode 100644 index 0000000..a9889a5 --- /dev/null +++ b/src/Makefile @@ -0,0 +1,298 @@ +# -------------------------------------------------------------------------- +# +# Copyright +# Markus Wittmann, 2016-2017 +# RRZE, University of Erlangen-Nuremberg, Germany +# markus.wittmann -at- fau.de or hpc -at- rrze.fau.de +# +# Viktor Haag, 2016 +# LSS, University of Erlangen-Nuremberg, Germany +# +# This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). +# +# LbmBenchKernels is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+#
+#  LbmBenchKernels is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+#
+# --------------------------------------------------------------------------
+
+# Default configuration to choose. The value selects config.$(CONFIG).mk.
+# CONFIG ?= linux-gcc
+CONFIG ?= linux-intel
+
+BUILD  ?= debug
+
+# If we compile for BENCHMARK all verification and statistics are disabled,
+# if not specified otherwise.
+ifeq (on,$(BENCHMARK))
+  VERIFICATION ?= off
+  VTK_OUTPUT   ?= off
+  STATISTICS   ?= off
+endif
+
+VERIFICATION ?= on
+
+STATISTICS ?= on
+
+VTK_OUTPUT ?= on
+
+COLOR ?= on
+
+OPENMP ?= on
+
+VTK_OUTPUT_ASCII ?= off
+
+LID_DRIVEN_CAVITY ?= off
+
+ISA ?= avx
+
+LIKWID ?= off
+
+# Global settings for the Makefile
+SHELL = sh
+
+# Recipes prefixed with $(Q) are silent unless V is defined (e.g. make V=1).
+ifdef V
+  Q =
+else
+  Q = @
+endif
+
+AR       = ar
+CMP      = cmp
+CHMOD    = chmod
+CPP      = cpp
+DIFF     = diff
+ECHO     = echo
+ECHO_E   = echo -e
+ECHO_N   = echo -n
+GIT      = git
+GREP     = grep
+MKDIR    = mkdir
+MKDIR_P  = mkdir -p
+NM       = nm
+RM       = rm
+RM_F     = rm -f
+RM_RF    = rm -rf
+TAR      = tar
+TAR_CZF  = tar czf
+TOUCH    = touch
+SED      = sed
+
+
+
+# Where to store objects and dependency files.
+OBJECT_DIR = obj/$(CONFIG)-$(BUILD)$(TAG)
+DEP_DIR    = obj/$(CONFIG)-$(BUILD)$(TAG)-dep
+
+# Sources to consider.
+SOURCES_C = Main.c Memory.c Geometry.c Kernel.c \
+            Vtk.c Pinning.c
+
+# ------------------------------------------------------------------------
+# NO CHANGE BELOW SHOULD BE NEEDED
+# ------------------------------------------------------------------------
+
+# Kernel sources are compiled several times with different data layout /
+# propagation model defines; the object name suffix (_PushSoA, _PullAoS, ...)
+# selects the matching pattern rule further below.
+OBJ_C = $(foreach SOURCE,$(SOURCES_C),$(OBJECT_DIR)/$(SOURCE:%.c=%.o)) \
+        $(OBJECT_DIR)/BenchKernelD3Q19Common_PullSoA.o \
+        $(OBJECT_DIR)/BenchKernelD3Q19Common_PullAoS.o \
+        $(OBJECT_DIR)/BenchKernelD3Q19Common_PushSoA.o \
+        $(OBJECT_DIR)/BenchKernelD3Q19Common_PushAoS.o \
+        $(OBJECT_DIR)/BenchKernelD3Q19_PullSoA.o \
+        $(OBJECT_DIR)/BenchKernelD3Q19_PullAoS.o \
+        $(OBJECT_DIR)/BenchKernelD3Q19_PushSoA.o \
+        $(OBJECT_DIR)/BenchKernelD3Q19_PushAoS.o \
+        $(OBJECT_DIR)/BenchKernelD3Q19List_PushSoA.o \
+        $(OBJECT_DIR)/BenchKernelD3Q19ListCommon_PushSoA.o \
+        $(OBJECT_DIR)/BenchKernelD3Q19List_PushAoS.o \
+        $(OBJECT_DIR)/BenchKernelD3Q19ListCommon_PushAoS.o \
+        $(OBJECT_DIR)/BenchKernelD3Q19List_PullSoA.o \
+        $(OBJECT_DIR)/BenchKernelD3Q19ListCommon_PullSoA.o \
+        $(OBJECT_DIR)/BenchKernelD3Q19List_PullAoS.o \
+        $(OBJECT_DIR)/BenchKernelD3Q19ListCommon_PullAoS.o \
+        $(OBJECT_DIR)/BenchKernelD3Q19ListAa_PushSoA.o \
+        $(OBJECT_DIR)/BenchKernelD3Q19ListAaCommon_PushSoA.o \
+        $(OBJECT_DIR)/BenchKernelD3Q19ListAa_PushAoS.o \
+        $(OBJECT_DIR)/BenchKernelD3Q19ListAaCommon_PushAoS.o \
+        $(OBJECT_DIR)/BenchKernelD3Q19ListAaRia_PushSoA.o \
+        $(OBJECT_DIR)/BenchKernelD3Q19ListAaRiaCommon_PushSoA.o \
+        $(OBJECT_DIR)/BenchKernelD3Q19ListAaPv_PushSoA.o \
+        $(OBJECT_DIR)/BenchKernelD3Q19ListAaPvCommon_PushSoA.o \
+        $(OBJECT_DIR)/BenchKernelD3Q19ListPullSplitNt_PullSoA.o \
+        $(OBJECT_DIR)/BenchKernelD3Q19ListPullSplitNtCommon_PullSoA.o
+
+OBJ = $(OBJ_C)
+
+include ./config.$(CONFIG).mk
+
+
+# Colorize command lines.
+ifeq (on,$(COLOR))
+  COLOR_GRAY        =\033[1;30m
+  COLOR_LIGHT_GRAY  =\033[0;37m
+  COLOR_CYAN        =\033[0;36m
+  COLOR_LIGHT_CYAN  =\033[1;36m
+  COLOR_MAGENTA     =\033[0;35m
+  COLOR_DARK_MAGENTA=\033[1;35m
+  COLOR_NO          =\033[0m
+  COLOR_RED         =\033[0;31m
+  COLOR_DARK_RED    =\033[1;31m
+  COLOR_GREEN       =\033[0;32m
+  COLOR_DARK_GREEEN =\033[1;32m
+else
+  COLOR_GRAY        =
+  COLOR_LIGHT_GRAY  =
+  COLOR_CYAN        =
+  COLOR_LIGHT_CYAN  =
+  COLOR_MAGENTA     =
+  COLOR_DARK_MAGENTA=
+  COLOR_NO          =
+  COLOR_RED         =
+  COLOR_DARK_RED    =
+  COLOR_GREEN       =
+  COLOR_DARK_GREEEN =
+endif
+
+# Dependencies which are not source files, that require
+# rebuilding the application.
+REBUILD_DEPS = $(filter-out %.d,$(MAKEFILE_LIST)) $(OBJECT_DIR)/.target $(DEP_DIR)/.target
+
+BUILD_CONFIG=
+
+# Verify computed flow field
+ifeq (on,$(VERIFICATION))
+  PP_FLAGS += $(D)VERIFICATION
+endif
+
+ifeq (on,$(VTK_OUTPUT))
+  PP_FLAGS += $(D)VTK_OUTPUT
+endif
+
+ifeq (on,$(STATISTICS))
+  PP_FLAGS += $(D)STATISTICS
+endif
+
+ifeq (on,$(VTK_OUTPUT_ASCII))
+  PP_FLAGS += $(D)VTK_OUTPUT_ASCII
+endif
+
+ifeq (on,$(LID_DRIVEN_CAVITY))
+  PP_FLAGS += $(D)LID_DRIVEN_CAVITY
+endif
+
+ifeq (avx,$(ISA))
+  PP_FLAGS += $(D)VECTOR_AVX
+endif
+ifeq (sse,$(ISA))
+  PP_FLAGS += $(D)VECTOR_SSE
+endif
+
+# Required for posix_memalign
+PP_FLAGS += $(D)_POSIX_C_SOURCE=200112L
+
+ifeq (on,$(LIKWID))
+  PP_FLAGS += $(LIKWID_INC) $(D)HAVE_LIKWID
+  LD_LIBS  += $(LIKWID_LIB) -llikwid
+endif
+
+# The special target must be spelled in upper case; a lower-case ".phony"
+# is just an ordinary target and leaves all/clean/clean-all non-phony.
+.PHONY: all clean clean-all
+
+$(info $(shell $(ECHO_E) "# Configuration: CONFIG=$(COLOR_CYAN)$(CONFIG)$(COLOR_NO) BUILD=$(COLOR_CYAN)$(BUILD)$(COLOR_NO) VERIFICATION=$(COLOR_CYAN)$(VERIFICATION)$(COLOR_NO) STATISTICS=$(COLOR_CYAN)$(STATISTICS)$(COLOR_NO) VTK_OUTPUT=$(COLOR_CYAN)$(VTK_OUTPUT)$(COLOR_NO) OPENMP=$(COLOR_CYAN)$(OPENMP)$(COLOR_NO) ISA=$(COLOR_CYAN)$(ISA)$(COLOR_NO) LIKWID=$(COLOR_CYAN)$(LIKWID)$(COLOR_NO) building $(.DEFAULT_GOAL)..."))
+
+
+$(info # Object dir:     $(OBJECT_DIR))
+$(info # Dependency dir: $(DEP_DIR))
+# $(info # objects: $(OBJ))
+
+BIN_DIR=../bin
+
+all: $(BIN_DIR)/lbmbenchk-$(CONFIG)-$(BUILD)$(BUILD_CONFIG)$(TAG)
+
+
+# ------------------------------------------------------------------------
+# TODO: Include dependencies, if they exist and no clean target is specified.
+
+#ifeq (,$(SKIP_DEP_INCLUDE))
+#  ifneq (,$(OBJ))
+#    TMP_OBJ__ = $(patsubst $(OBJECT_DIR)/%,$(DEP_DIR)/%,$(OBJ))
+#    $(warning TMP_OBJ__: $(TMP_OBJ__))
+#    $(warning OBJ: $(OBJ))
+#    $(warning DEPS: $(patsubst %.o,%.d,$(TMP_OBJ__)))
+#    $(warning $(patsubst $(OBJECT_DIR)/%.o,$(DEP_DIR)/%.d,$(OBJ)))
+    -include $(patsubst $(OBJECT_DIR)/%.o,$(DEP_DIR)/%.d,$(OBJ))
+#  endif
+#endif
+
+
+$(BIN_DIR):
+	[ -d "$@" ] || mkdir -p "$@"
+
+# REBUILD_DEPS are prerequisites only to trigger relinking; they are filtered
+# out of the linker command line.
+$(BIN_DIR)/lbmbenchk-$(CONFIG)-$(BUILD)$(BUILD_CONFIG)$(TAG): $(OBJ) $(REBUILD_DEPS) $(DEP_DIR)/.target | $(BIN_DIR)
+	@$(ECHO_E) "linking: $(COLOR_CYAN)$@$(COLOR_NO)"
+	$(LD) $(LD_FLAGS) -o $@ $(filter-out $(REBUILD_DEPS),$^) $(LD_LIBS)
+	@$(ECHO_E) "# Builded binary: $(COLOR_CYAN)$@$(COLOR_NO)"
+	@$(ECHO_E) "# Configuration was: CONFIG=$(COLOR_CYAN)$(CONFIG)$(COLOR_NO) BUILD=$(COLOR_CYAN)$(BUILD)$(COLOR_NO) VERIFICATION=$(COLOR_CYAN)$(VERIFICATION)$(COLOR_NO) STATISTICS=$(COLOR_CYAN)$(STATISTICS)$(COLOR_NO) VTK_OUTPUT=$(COLOR_CYAN)$(VTK_OUTPUT)$(COLOR_NO) OPENMP=$(COLOR_CYAN)$(OPENMP)$(COLOR_NO) ISA=$(COLOR_CYAN)$(ISA)$(COLOR_NO) LIKWID=$(COLOR_CYAN)$(LIKWID)$(COLOR_NO) target=$(.DEFAULT_GOAL)"
+
+$(OBJECT_DIR)/%_SoA.o: %.c $(REBUILD_DEPS)
+	@$(ECHO_E) "compiling: $(COLOR_CYAN)$@$(COLOR_NO) $(COLOR_MAGENTA)DATA_LAYOUT_SOA$(COLOR_NO)"
+	$(CC) $(strip $(C_FLAGS)) $(strip $(PP_FLAGS)) $(D)DATA_LAYOUT_SOA -c $< -o $@
+
+$(OBJECT_DIR)/%_PushSoA.o: %.c $(REBUILD_DEPS)
+	@$(ECHO_E) "compiling: $(COLOR_CYAN)$@$(COLOR_NO) $(COLOR_MAGENTA)DATA_LAYOUT_SOA$(COLOR_NO) $(COLOR_MAGENTA)PROP_MODEL_PUSH$(COLOR_NO)"
+	$(CC) $(strip $(C_FLAGS)) $(strip $(PP_FLAGS)) $(D)DATA_LAYOUT_SOA $(D)PROP_MODEL_PUSH -c $< -o $@
+
+$(OBJECT_DIR)/%_PushAoS.o: %.c $(REBUILD_DEPS)
+	@$(ECHO_E) "compiling: $(COLOR_CYAN)$@$(COLOR_NO) $(COLOR_MAGENTA)DATA_LAYOUT_AOS$(COLOR_NO) $(COLOR_MAGENTA)PROP_MODEL_PUSH$(COLOR_NO)"
+	$(CC) $(strip $(C_FLAGS)) $(strip $(PP_FLAGS)) $(D)DATA_LAYOUT_AOS $(D)PROP_MODEL_PUSH -c $< -o $@
+
+$(OBJECT_DIR)/%_PullSoA.o: %.c $(REBUILD_DEPS)
+	@$(ECHO_E) "compiling: $(COLOR_CYAN)$@$(COLOR_NO) $(COLOR_MAGENTA)DATA_LAYOUT_SOA$(COLOR_NO) $(COLOR_MAGENTA)PROP_MODEL_PULL$(COLOR_NO)"
+	$(CC) $(strip $(C_FLAGS)) $(strip $(PP_FLAGS)) $(D)DATA_LAYOUT_SOA $(D)PROP_MODEL_PULL -c $< -o $@
+
+$(OBJECT_DIR)/%_PullAoS.o: %.c $(REBUILD_DEPS)
+	@$(ECHO_E) "compiling: $(COLOR_CYAN)$@$(COLOR_NO) $(COLOR_MAGENTA)DATA_LAYOUT_AOS$(COLOR_NO) $(COLOR_MAGENTA)PROP_MODEL_PULL$(COLOR_NO)"
+	$(CC) $(strip $(C_FLAGS)) $(strip $(PP_FLAGS)) $(D)DATA_LAYOUT_AOS $(D)PROP_MODEL_PULL -c $< -o $@
+
+$(OBJECT_DIR)/%_AoS.o: %.c $(REBUILD_DEPS)
+	@$(ECHO_E) "compiling: $(COLOR_CYAN)$@$(COLOR_NO) $(COLOR_MAGENTA)DATA_LAYOUT_AOS$(COLOR_NO)"
+	$(CC) $(strip $(C_FLAGS)) $(strip $(PP_FLAGS)) $(D)DATA_LAYOUT_AOS -c $< -o $@
+
+
+$(OBJECT_DIR)/%.o: %.c $(REBUILD_DEPS)
+	@$(ECHO_E) "compiling: $(COLOR_CYAN)$@$(COLOR_NO)"
+	$(CC) $(strip $(C_FLAGS)) $(strip $(PP_FLAGS)) -c $< -o $@
+
+
+# ------------------------------------------------------------------------
+# Following targets in principle only make sure that the directories
+# are created if they do not exist already.
+
+$(DEP_DIR)/.target:
+	$(Q)$(MKDIR_P) $(DEP_DIR)
+	$(Q)$(TOUCH) $@
+
+# ------------------------------------------------------------------------
+# Current configuration. Every variable that influences the generated code
+# must be listed here, otherwise changing it does not trigger a rebuild.
+MAKE_CFG = SYSTEM=$(SYSTEM) // BUILD=$(BUILD) // MAKEOVERRIDES=\"$(strip $(MAKEOVERRIDES))\" // VERIFICATION=$(VERIFICATION) // STATISTICS=$(STATISTICS) // VTK_OUTPUT=$(VTK_OUTPUT) // VTK_OUTPUT_ASCII=$(VTK_OUTPUT_ASCII) // LID_DRIVEN_CAVITY=$(LID_DRIVEN_CAVITY) // ISA=$(ISA) // LIKWID=$(LIKWID) // OPENMP=$(OPENMP)
+
+# Compare current configuration to the last one so we know when to
+# rebuild this system/target despite when sources have not changed.
+# The double-colon rule without prerequisites runs on every invocation; the
+# file is only rewritten (and thereby made newer) when the configuration
+# string differs from the stored one.
+$(OBJECT_DIR)/.target::
+	$(Q)$(MKDIR_P) $(OBJECT_DIR)
+	@ ( [ -f $(OBJECT_DIR)/.target ] && $(ECHO) $(MAKE_CFG) | $(CMP) -s $(OBJECT_DIR)/.target - ) \
+	|| ( $(ECHO) $(MAKE_CFG) > $(OBJECT_DIR)/.target; $(ECHO) "# configuration variables changed, rebuilding ..." )
+
+
+clean:
+	-rm $(OBJECT_DIR)/*.o $(DEP_DIR)/*.d
+
+clean-all:
+	-[ -d obj ] && rm -r obj
diff --git a/src/Memory.c b/src/Memory.c
new file mode 100644
index 0000000..d9d1f94
--- /dev/null
+++ b/src/Memory.c
@@ -0,0 +1,86 @@
+// --------------------------------------------------------------------------
+//
+// Copyright
+//   Markus Wittmann, 2016-2017
+//   RRZE, University of Erlangen-Nuremberg, Germany
+//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
+//
+//   Viktor Haag, 2016
+//   LSS, University of Erlangen-Nuremberg, Germany
+//
+//  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
+//
+//  LbmBenchKernels is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  LbmBenchKernels is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
+// +// -------------------------------------------------------------------------- +#include +#include +#include // strerror +#include + +#include "Base.h" +#include "Memory.h" + + +int MemAlloc(void ** ptr, size_t bytesToAlloc) +{ + void * tmpPtr; + + tmpPtr = malloc(bytesToAlloc); + + if (tmpPtr == NULL) { // && bytesToAlloc != 0) { + Error("allocation of %lu bytes failed: %d - %s\n", bytesToAlloc, errno, strerror(errno)); + exit(1); + } + + *ptr = tmpPtr; + + return 0; +} + +int MemAllocAligned(void ** ptr, size_t bytesToAlloc, size_t alignmentBytes) +{ + int ret; + + ret = posix_memalign(ptr, alignmentBytes, bytesToAlloc); + + if (ret) { + Error("allocation of %lu bytes aligned to %lu bytes failed: %d - %s\n", bytesToAlloc, alignmentBytes, errno, strerror(errno)); + exit(1); + } + + return 0; +} + + +int MemFree(void ** ptr) +{ + Assert(*ptr != NULL); + + free(*ptr); + + *ptr = NULL; + + return 0; +} + +int MemZero(void * ptr, size_t bytesToZero) +{ + Assert(ptr != NULL); + Assert(bytesToZero > 0); + + memset(ptr, 0, bytesToZero); + + return 0; +} diff --git a/src/Memory.h b/src/Memory.h new file mode 100644 index 0000000..cfe1dfc --- /dev/null +++ b/src/Memory.h @@ -0,0 +1,38 @@ +// -------------------------------------------------------------------------- +// +// Copyright +// Markus Wittmann, 2016-2017 +// RRZE, University of Erlangen-Nuremberg, Germany +// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de +// +// Viktor Haag, 2016 +// LSS, University of Erlangen-Nuremberg, Germany +// +// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). +// +// LbmBenchKernels is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. 
+// +// LbmBenchKernels is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with LbmBenchKernels. If not, see . +// +// -------------------------------------------------------------------------- +#ifndef __MEMORY_H__ +#define __MEMORY_H__ + +#include // size_t + +int MemAlloc(void ** ptr, size_t bytesToAlloc); +int MemAllocAligned(void ** ptr, size_t bytesToAlloc, size_t alignmentBytes); +int MemFree(void ** ptr); + +int MemZero(void * ptr, size_t bytesToZero); + +#endif // __MEMORY_H__ diff --git a/src/Pinning.c b/src/Pinning.c new file mode 100644 index 0000000..0cf70ea --- /dev/null +++ b/src/Pinning.c @@ -0,0 +1,393 @@ +// -------------------------------------------------------------------------- +// +// Copyright +// Markus Wittmann, 2016-2017 +// RRZE, University of Erlangen-Nuremberg, Germany +// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de +// +// Viktor Haag, 2016 +// LSS, University of Erlangen-Nuremberg, Germany +// +// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). +// +// LbmBenchKernels is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// LbmBenchKernels is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with LbmBenchKernels. If not, see . 
+// +// -------------------------------------------------------------------------- +#ifndef _GNU_SOURCE + #define _GNU_SOURCE +#endif +#include +#include + + +#include "Base.h" +#include "Pinning.h" + + + + +// ----------------------------------------------------------------------- +// +// Binds the calling thread to specified core. +// +// Return value: 0 = success, else error. +// +// ----------------------------------------------------------------------- + +int PinCurrentThreadToCore(int coreNumber) +{ + int error = 0; + + cpu_set_t cpu_set; + CPU_ZERO(&cpu_set); + CPU_SET(coreNumber, &cpu_set); + + error = sched_setaffinity((pid_t)0, sizeof(cpu_set_t), &cpu_set); + + if (error != 0) { + Error("pinning thread to core %d failed (%d): %s\n", + coreNumber, error, strerror(error)); + } + + return error; +} + + +// ----------------------------------------------------------------------- +// +// Binds the calling thread to specified core by a cpu list specified +// in the given environment variable. +// +// Return value: 0 = success, else error. +// +// ----------------------------------------------------------------------- + +int PinCurrentThreadByEnvVar(const char * envVarName, + int mpiRank, int nodeRank, int threadNumber) +{ + const char * envVarValue; + int core; + + envVarValue = getenv(envVarName); + + if (envVarValue == NULL) { + if (mpiRank == 0) { + Print("skip pinning: env var %s not set\n", envVarName); + } + + return 0; + } + + core = PinParseCpuList(envVarValue, mpiRank, nodeRank, threadNumber); + + if (core < 0) { + return core; + } + + return PinCurrentThreadToCore(core); +} + + +// ----------------------------------------------------------------------- +// +// Binds the calling thread to a core specified in the CPU list. +// +// Return value: 0 = success, else error. 
+// +// ----------------------------------------------------------------------- + +int PinCurrentThreadByCpuList(const char * cpuList, + int mpiRank, int nodeRank, int threadNumber) +{ + int core; + + if (cpuList == NULL) { + if (mpiRank == 0) { + printf("ERROR: cpu list is NULL.\n"); + } + + exit(1); + } + + core = PinParseCpuList(cpuList, mpiRank, nodeRank, threadNumber); + + if (core < 0) { + return core; + } + + return PinCurrentThreadToCore(core); +} + + +// ----------------------------------------------------------------------- +// +// Parses the provided cpu list and returns the core number for the +// specified MPI rank, local rank, and thread. +// +// The cpu list has for example a format of: 0,1,2 or 0,1,2_3,4,5 +// +// Blocks (0,1,2 or 3,4,5) separated by "_" specify pinning inside a +// node rank. The first block maps to node rank 1, the second to node +// rank 2, etc. +// +// Inside a block the core numbers specify where the threads should +// be pinned to. They are separated by "," and the first number maps +// to the first core, the second number to the second core, etc. +// +// For example: 0,2,4_6,8,10 +// +// Node rank 0 thread 0 pinned to core 0 +// 0 1 2 +// 0 2 4 +// 1 0 6 +// 1 1 8 +// 1 2 10 +// +// ----------------------------------------------------------------------- + +int PinParseCpuList(const char * cpuList, + int mpiRank, int nodeRank, int threadNumber) +{ + int cpu = -1; + + if (cpuList == NULL) { + return -1; + } + + const char * c = cpuList; + + // Ensure only valid characters are in the cpu list. + // Cpu list is in the format of "0,1,2_3,4,5". + while (((*c >= '0' && *c <= '9') || *c == ',' || *c == '_')) { + ++c; + } + + if (*c != 0x00) { + // Invalid character detected. + return -2; + } + + c = cpuList; + + int i = 0; + + // Move variable c after the "nodeRank"th "_" in the cpu list. 
+ while (i < nodeRank && *c != 0x00) { + if (*c == '_') ++i; + ++c; + } + + if (i != nodeRank || *c < '0' || *c > '9') { + // Cpu list for this node rank not found. + return -3; + } + + // Now find the core for the specified thread. + + int t = 0; + + while (t < threadNumber && *c != 0x00) { + if (*c == ',') { + ++t; + } + else if (*c == '_') { + // Unexpected character at this position. + break; + } + + ++c; + } + + if (t != threadNumber || *c < '0' || *c > '9') { + // Cpu for this threadNumber not found. + return -4; + } + + cpu = atoi(c); + + return cpu; +} + + + +// ----------------------------------------------------------------------- +// +// Returns the first core from the calling thread's affinity set. +// +// On error a value < 0 is returned. +// +// ----------------------------------------------------------------------- + +int PinCurrentCore() +{ + int core = -1; + int err; + + cpu_set_t cpu_set; + CPU_ZERO(&cpu_set); + + err = sched_getaffinity((pid_t)0, sizeof(cpu_set_t), &cpu_set); + + // constant CPU_SETSIZE is one larger than the maximum CPU + // number that can be stored in a CPU set + for (int i = 0; i < CPU_SETSIZE; ++i) { + if (CPU_ISSET(i, &cpu_set)) { + core = i; + break; + } + } + + if (err != 0) { + Error("getting thread affinty failed (%d): %s\n", err, strerror(err)); + return -1; + } + + return core; +} + + + +// ----------------------------------------------------------------------- +// +// Returns the all cores from the calling thread's affinity set. +// +// On error a value < 0 is returned. 
+// +// ----------------------------------------------------------------------- + +typedef cpu_set_t CpuSet; + + +static CpuSet PinCurrentCores() +{ + CpuSet cpuSet; + int err; + + cpu_set_t cpu_set; + CPU_ZERO(&cpu_set); + + err = sched_getaffinity((pid_t)0, sizeof(cpu_set_t), &cpu_set); + + cpuSet = cpu_set; + + if (err != 0) { + Error("getting thread affinty failed (%d): %s\n", err, strerror(err)); + return cpuSet; + } + + return cpuSet; +} + +static char * CpuSetToString(cpu_set_t * cpu_set) +{ + int previousSetCore = -2; + int rangeBeginCore = -2; + + char * buffer1 = (char *)malloc(1024); + Assert(buffer1 != NULL); + char * buffer2 = (char *)malloc(1024); + Assert(buffer2 != NULL); + + buffer1[0] = 0x00; + buffer2[0] = 0x00; + + char * buffer = buffer1; + char * bufferOld = buffer2; + + const char * empty = ""; + const char * realComma = ","; + const char * comma = empty; + + // TODO: use snprintf + // TODO: increase allocated buffer if necessary + + for (int i = 0; i < CPU_SETSIZE; ++i) { + if (!CPU_ISSET(i, cpu_set)) { + continue; + } + + if (i == previousSetCore + 1) { + previousSetCore = i; + continue; + } + + // Now we reached the end of a range. + // The range can also consist of only one core. + // Be aware, that this core is not part of the range. + + // TODO: this code is repeated below -> use it only once + if (rangeBeginCore >= 0 && previousSetCore >= 0) { + char * tmp; + + tmp = buffer; + buffer = bufferOld; + bufferOld = tmp; + + if (rangeBeginCore < previousSetCore) { + sprintf(buffer, "%s%s%d-%d", bufferOld, comma, rangeBeginCore, previousSetCore); + } + else { + sprintf(buffer, "%s%s%d", bufferOld, comma, previousSetCore); + } + + comma = realComma; + } + + // With this core a new range begins. 
+ rangeBeginCore = i; + previousSetCore = i; + } + + if (rangeBeginCore >= 0 && previousSetCore >= 0) { + char * tmp; + + tmp = buffer; + buffer = bufferOld; + bufferOld = tmp; + + if (rangeBeginCore < previousSetCore) { + sprintf(buffer, "%s%s%d-%d", bufferOld, comma, rangeBeginCore, previousSetCore); + } + else { + sprintf(buffer, "%s%s%d", bufferOld, comma, previousSetCore); + } + } + + free(bufferOld); bufferOld = NULL; + + return buffer; +} + +char * PinCpuListAsString() +{ + CpuSet cpuSet = PinCurrentCores(); + + return CpuSetToString(&cpuSet); +} + +#ifdef TEST + +int main(int argc, char * argv[]) +{ + char * cpuList = PinCpuListAsString(); + + printf("pinned to cores: %s\n", cpuList); + + free(cpuList); cpuList = NULL; + + return 0; +} + +#endif // TEST + diff --git a/src/Pinning.h b/src/Pinning.h new file mode 100644 index 0000000..40225c1 --- /dev/null +++ b/src/Pinning.h @@ -0,0 +1,47 @@ +// -------------------------------------------------------------------------- +// +// Copyright +// Markus Wittmann, 2016-2017 +// RRZE, University of Erlangen-Nuremberg, Germany +// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de +// +// Viktor Haag, 2016 +// LSS, University of Erlangen-Nuremberg, Germany +// +// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). +// +// LbmBenchKernels is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// LbmBenchKernels is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with LbmBenchKernels. If not, see . 
+// +// -------------------------------------------------------------------------- +#ifndef __PINNING_H__ +#define __PINNING_H__ + + +int PinCurrentThreadToCore(int coreNumber); + +int PinParseCpuList(const char * cpuList, + int mpiRank, int nodeRank, int threadNumber); + +int PinCurrentThreadByEnvVar(const char * envVarName, + int mpiRank, int nodeRank, int threadNumber); + +int PinCurrentThreadByCpuList(const char * cpuList, + int mpiRank, int nodeRank, int threadNumber); + +int PinCurrentCore(); + +char * PinCpuListAsString(); + + +#endif // __PINNING_H__ diff --git a/src/Vector.h b/src/Vector.h new file mode 100644 index 0000000..41b9a79 --- /dev/null +++ b/src/Vector.h @@ -0,0 +1,83 @@ +// -------------------------------------------------------------------------- +// +// Copyright +// Markus Wittmann, 2016-2017 +// RRZE, University of Erlangen-Nuremberg, Germany +// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de +// +// Viktor Haag, 2016 +// LSS, University of Erlangen-Nuremberg, Germany +// +// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). +// +// LbmBenchKernels is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// LbmBenchKernels is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with LbmBenchKernels. If not, see . +// +// -------------------------------------------------------------------------- +#ifndef __VECTOR_H__ +#define __VECTOR_H__ + +#if !defined(VECTOR_AVX) && !defined(VECTOR_SSE) + #warning Defining VECTOR_AVX as no ISA extension was selected. 
+ #define VECTOR_AVX +#endif + +#if defined(VECTOR_AVX) && defined(VECTOR_SSE) + #error Only VECTOR_AVX or VECTOR_SSE can be defined at the same time. +#endif + +#ifdef VECTOR_AVX + + #include + // Vector size in double-precision floatin-point numbers. + #define VSIZE 4 + + #define VPDFT __m256d + + #define VSET(scalar) _mm256_set1_pd(scalar) + + #define VLD(expr) _mm256_load_pd(expr) + #define VLDU(expr) _mm256_loadu_pd(expr) + + #define VST(dst, src) _mm256_store_pd(dst, src) + #define VSTU(dst, src) _mm256_storeu_pd(dst, src) + #define VSTNT(dst, src) _mm256_stream_pd(dst, src) + + #define VMUL(a, b) _mm256_mul_pd(a, b) + #define VADD(a, b) _mm256_add_pd(a, b) + #define VSUB(a, b) _mm256_sub_pd(a, b) +#endif + +#ifdef VECTOR_SSE + #include + // Vector size in double-precision floatin-point numbers. + #define VSIZE 2 + + #define VPDFT __m128d + + #define VSET(scalar) _mm_set1_pd(scalar) + + #define VLD(expr) _mm_load_pd(expr) + #define VLDU(expr) _mm_loadu_pd(expr) + + #define VST(dst, src) _mm_store_pd(dst, src) + #define VSTU(dst, src) _mm_storeu_pd(dst, src) + #define VSTNT(dst, src) _mm_stream_pd(dst, src) + + #define VMUL(a, b) _mm_mul_pd(a, b) + #define VADD(a, b) _mm_add_pd(a, b) + #define VSUB(a, b) _mm_sub_pd(a, b) +#endif + + +#endif // __VECTOR_H__ diff --git a/src/Vtk.c b/src/Vtk.c new file mode 100644 index 0000000..e650722 --- /dev/null +++ b/src/Vtk.c @@ -0,0 +1,245 @@ +// -------------------------------------------------------------------------- +// +// Copyright +// Markus Wittmann, 2016-2017 +// RRZE, University of Erlangen-Nuremberg, Germany +// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de +// +// Viktor Haag, 2016 +// LSS, University of Erlangen-Nuremberg, Germany +// +// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). 
+// +// LbmBenchKernels is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// LbmBenchKernels is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with LbmBenchKernels. If not, see . +// +// -------------------------------------------------------------------------- +#include "Vtk.h" + +#include + +// TODO: make this portable + +// needed for stat & mkdir +#include +#include +#include +#include +#include // strerror + +// TODO: make byteswap portable + +#include +// glibc +#include + +// macros for portability +// #define BS32(a) bswap_32(*((uint32_t *)(&a))) +#define BS64(a) bswap_64(*((uint64_t *)(&a))) + + +void VtkWrite(LatticeDesc * ld, KernelData * kd, CaseData * cd, int iteration) +{ + Assert(kd != NULL); + Assert(ld != NULL); + Assert(ld->Dims[0] > 0); + Assert(ld->Dims[1] > 0); + Assert(ld->Dims[2] > 0); + + // TODO: this should be made portable... + // Check if subdirectory vtk exists, if not, create it. + { + int err; + struct stat fileStatus; + + err = stat("vtk", &fileStatus); + + if (err) { + // printf("ERROR: stat %d - %s\n", errno, strerror(errno)); + + // Set default mask and hope mkdir applies umask... 
+ err = mkdir("vtk", 0700); + + if (err) { + printf("ERROR: cannot create directory vtk - %d: %s\n", errno, strerror(errno)); + exit(1); + } + + printf("# created directory vtk.\n"); + } + else { + + if (!S_ISDIR(fileStatus.st_mode)) { + printf("ERROR: cannot create subdirectory vtk as already a file with the same name exists.\n"); + exit(1); + } + + } + } + + + char fileName[1024]; + + snprintf(fileName, sizeof(fileName), "vtk/file-%04d.vtk", iteration); + + printf("# VTK: writing file %s\n", fileName); + + FILE * fh; + + fh = fopen(fileName, "w"); + + if(fh == NULL) { + printf("ERROR: opening file %s failed.\n", fileName); + exit(1); + } + + // http://www.vtk.org/pdf/file-formats.pdf + int nX = ld->Dims[0]; + int nY = ld->Dims[1]; + int nZ = ld->Dims[2]; + int * lDims = ld->Dims; + + // Temporaries for endian conversion. + uint64_t uDensity, uUx, uUy, uUz; + + PdfT pdfs[N_D3Q19]; + + fprintf(fh, "# vtk DataFile Version 1.0\n"); + fprintf(fh, "Comment: lid driven cavity, iteration % 4d\n", iteration); +#ifdef VTK_OUTPUT_ASCII + fprintf(fh, "ASCII\n"); +#else + fprintf(fh, "BINARY\n"); +#endif + fprintf(fh, "DATASET STRUCTURED_POINTS\n"); + fprintf(fh, "DIMENSIONS %d %d %d\n", nX, nY, nZ); + fprintf(fh, "ORIGIN 0 0 0 \n"); + fprintf(fh, "SPACING 1 1 1\n"); + fprintf(fh, "POINT_DATA %d\n", nX * nY * nZ); + + // ---------------------------------------------------------------------- + // Flag field: obstacle = 0, fluid = 1, inlet = 2, outlet = 4 + + fprintf(fh, "SCALARS NodesTypes unsigned_char 1\n"); + fprintf(fh, "LOOKUP_TABLE default\n"); + + unsigned char c; + + for(int z = 0; z < nZ; ++z) { + for(int y = 0; y < nY; ++y) { + for(int x = 0; x < nX; ++x) { +#ifdef VTK_OUTPUT_ASCII + fprintf(fh, "%d\n", ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)]); +#else + c = (unsigned char)ld->Lattice[L_INDEX_4(ld->Dims, x, y, z)]; + fwrite(&c, sizeof(unsigned char), 1, fh); +#endif + } + } + } + + // ---------------------------------------------------------------------- + // 
Density field + + fprintf(fh, "SCALARS Density double\n"); + fprintf(fh, "LOOKUP_TABLE default\n"); + + double density; + + for(int z = 0; z < nZ; ++z) { + for(int y = 0; y < nY; ++y) { + for(int x = 0; x < nX; ++x) { + + density = 0.0; + if (ld->Lattice[L_INDEX_4(lDims, x, y, z)] != LAT_CELL_OBSTACLE) { + kd->GetNode(kd, x, y, z, pdfs); + + for (int d = 0; d < N_D3Q19; ++d) { + density += pdfs[d]; + } + } + +#ifdef VTK_OUTPUT_ASCII + fprintf(fh, "%e\n", density); +#else + uDensity = BS64(density); + fwrite(&uDensity, sizeof(double), 1, fh); +#endif + } + } + } + + // ---------------------------------------------------------------------- + // Velocity vectors: velocity in x, y, and z direction + + fprintf(fh, "VECTORS VelocityVectors double\n"); + + // Declare pdf_N, pdf_E, pdf_S, pdf_W, ... + #define X(name, idx, idxinv, x, y, z) PdfT JOIN(pdf_,name); + D3Q19_LIST + #undef X + + double ux, uy, uz; + + for(int z = 0; z < nZ; ++z) { + for(int y = 0; y < nY; ++y) { + for(int x = 0; x < nX; ++x) { + + if (ld->Lattice[L_INDEX_4(lDims, x, y, z)] != LAT_CELL_OBSTACLE) { + kd->GetNode(kd, x, y, z, pdfs); + +// DETECT NANS +// for (int d = 0; d < 19; ++d) { +// if(isnan(pdfs[d])) { +// printf("%d %d %d %d nan!\n", x, y, z, d); +// for (int d2 = 0; d2 < 19; ++d2) { +// printf("%d: %e\n", d2, pdfs[d2]); +// } +// exit(1); +// } +// } + #define X(name, idx, idxinv, _x, _y, _z) JOIN(pdf_,name) = pdfs[idx]; + D3Q19_LIST + #undef X + UNUSED(pdf_C); + + + ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE - + pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW; + uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN - + pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS; + uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS - + pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS; + #ifdef VERIFICATION + ux += 0.5 * cd->XForce; + #endif + } + else { + ux = 0.0; uy = 0.0; uz = 0.0; + } + +#ifdef VTK_OUTPUT_ASCII + fprintf(fh, "%f %f %f\n", ux, uy, uz); +#else + uUx = BS64(ux); uUy = BS64(uy); uUz = BS64(uz); + fwrite(&uUx, 
sizeof(double), 1, fh); + fwrite(&uUy, sizeof(double), 1, fh); + fwrite(&uUz, sizeof(double), 1, fh); +#endif + } + } + } + + fclose(fh); +} + diff --git a/src/Vtk.h b/src/Vtk.h new file mode 100644 index 0000000..b36395d --- /dev/null +++ b/src/Vtk.h @@ -0,0 +1,34 @@ +// -------------------------------------------------------------------------- +// +// Copyright +// Markus Wittmann, 2016-2017 +// RRZE, University of Erlangen-Nuremberg, Germany +// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de +// +// Viktor Haag, 2016 +// LSS, University of Erlangen-Nuremberg, Germany +// +// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). +// +// LbmBenchKernels is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// LbmBenchKernels is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with LbmBenchKernels. If not, see . 
+// +// -------------------------------------------------------------------------- +#ifndef __VTK_H__ +#define __VTK_H__ + +#include "Kernel.h" + +void VtkWrite(LatticeDesc * ld, KernelData * kd, CaseData *cd, int iteration); + +#endif // __VTK_H__ diff --git a/src/config.linux-gcc.mk b/src/config.linux-gcc.mk new file mode 100644 index 0000000..41dea5d --- /dev/null +++ b/src/config.linux-gcc.mk @@ -0,0 +1,97 @@ +# -------------------------------------------------------------------------- +# +# Copyright +# Markus Wittmann, 2016-2017 +# RRZE, University of Erlangen-Nuremberg, Germany +# markus.wittmann -at- fau.de or hpc -at- rrze.fau.de +# +# Viktor Haag, 2016 +# LSS, University of Erlangen-Nuremberg, Germany +# +# This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). +# +# LbmBenchKernels is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# LbmBenchKernels is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with LbmBenchKernels. If not, see . +# +# -------------------------------------------------------------------------- + + +# ------------------------------------------------------------------------ +# C compiler/linker to use. +# Flags are specified at the end of the file. +# ------------------------------------------------------------------------ +CC = gcc +LD = gcc + +# ------------------------------------------------------------------------ + +# Preprocessing variables. +D = -D +I = -I +PP_FLAGS += + +# Architecture to optimize for.
+TARCH ?= -mavx + +# Generated dependencies, can be left empty. +MAKE_DEPEND = $(CC) -MM -MQ'$(OBJECT_DIR)/$(<:%.c=%.o)' -MF'$(DEP_DIR)/$(<:%.c=%.d)' $(PP_FLAGS) $< > /dev/null + +# Generates dependencies, can be left empty. +# $(call make_depend,, /dev/null +endef + + +ifeq (on,$(OPENMP)) + OPENMP_C_FLAGS += -fopenmp + OPENMP_LD_FLAGS += -fopenmp +endif + + +ifeq (release,$(BUILD)) + + C_FLAGS += -O3 $(TARCH) + LD_FLAGS += -O3 $(TARCH) + + PP_FLAGS += + +else +ifeq (debug,$(BUILD)) + + C_FLAGS += -O0 $(TARCH) -g -ggdb + LD_FLAGS += -O0 $(TARCH) -g -ggdb + + PP_FLAGS += $(D)DEBUG + +else + $(error unknown BUILD=$(BUILD), specify release or debug) +endif +endif + +ifeq (on,$(ADDRESS_SANITIZER)) + # see https://github.com/google/sanitizers/wiki/AddressSanitizerFlags for details + C_FLAGS += -fsanitize=address + LD_FLAGS += -fsanitize=address +endif + +ifeq (on,$(DEBUG_SYMBOLS)) + C_FLAGS += -g -ggdb + LD_FLAGS += -g -ggdb +endif + +C_FLAGS += -Wall -Wuninitialized -Wunused-variable -Wshadow -fargument-noalias -fargument-noalias-anything -std=c99 \ + -MT $@ -MF $(patsubst $(OBJECT_DIR)/%.o,$(DEP_DIR)/%.d,$@) -MMD $(OPENMP_C_FLAGS) +LD_FLAGS += -Wall $(OPENMP_LD_FLAGS) +LD_LIBS += -lm + diff --git a/src/config.linux-intel.mk b/src/config.linux-intel.mk new file mode 100644 index 0000000..00d1ca7 --- /dev/null +++ b/src/config.linux-intel.mk @@ -0,0 +1,100 @@ +# -------------------------------------------------------------------------- +# +# Copyright +# Markus Wittmann, 2016, 2017 +# RRZE, University of Erlangen-Nuremberg, Germany +# markus.wittmann -at- fau.de or hpc -at- rrze.fau.de +# +# Viktor Haag, 2016 +# LSS, University of Erlangen-Nuremberg, Germany +# +# This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). 
+# +# LbmBenchKernels is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# LbmBenchKernels is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with LbmBenchKernels. If not, see . +# +# -------------------------------------------------------------------------- + + +# ------------------------------------------------------------------------ +# C compiler/linker to use. +# Flags are specified at the end of the file. +# ------------------------------------------------------------------------ +CC = icc +LD = icc + +# ------------------------------------------------------------------------ + +# Preprocessing variables. +D = -D +I = -I +PP_FLAGS += + +# Architecture to optimize for. +TARCH ?= -xAVX + +# Generated dependencies, can be left empty. +MAKE_DEPEND = $(CC) -MM -MQ'$(OBJECT_DIR)/$(<:%.c=%.o)' -MF'$(DEP_DIR)/$(<:%.c=%.d)' $(PP_FLAGS) $< > /dev/null + +# Generates dependencies, can be left empty. 
+# $(call make_depend,, /dev/null +endef + +ifeq (on,$(OPENMP)) + OPENMP_C_FLAGS += -qopenmp + OPENMP_LD_FLAGS += -qopenmp +endif + +ifeq (release,$(BUILD)) + + C_FLAGS += -O3 $(TARCH) + LD_FLAGS += -O3 $(TARCH) + + PP_FLAGS += + +else +ifeq (debug,$(BUILD)) + + C_FLAGS += -O0 $(TARCH) -g -fstack-protector -debug inline-debug-info -debug extended -debug variable-locations + LD_FLAGS += -O0 $(TARCH) -g -fstack-protector -debug inline-debug-info -debug extended -debug variable-locations + + # Intel compiler intrinsic reference: + # debugging: compile with -D__INTEL_COMPILER_USE_INTRINSIC_PROTOTYPES for + # improved compile-time checking + # release: remove this option as it significantly increases compile time + + PP_FLAGS += $(D)DEBUG $(D)__INTEL_COMPILER_USE_INTRINSIC_PROTOTYPES + +else + $(error unknown BUILD=$(BUILD), specify release or debug) +endif +endif + +ifeq (on,$(DEBUG_SYMBOLS)) + C_FLAGS += -g + LD_FLAGS += -g +endif + + +# ------------------------------------------------------------------------ +# C compiler/linker flags to use. 
+# ------------------------------------------------------------------------ +C_FLAGS += -Wall -Wcheck -Wabi -Wdeprecated -Wextra-tokens -Wformat -Wformat-security -Wshadow -Wuninitialized -Wunused-variable \ + -fno-alias -fargument-noalias -fno-fnalias -std=c99 \ + -MT $@ -MF $(patsubst $(OBJECT_DIR)/%.o,$(DEP_DIR)/%.d,$@) -MMD -diag-disable 10010 $(OPENMP_C_FLAGS) +LD_FLAGS += -Wall -Wcheck $(OPENMP_LD_FLAGS) +LD_LIBS += + + diff --git a/src/test-verification.sh b/src/test-verification.sh new file mode 100755 index 0000000..1dc8906 --- /dev/null +++ b/src/test-verification.sh @@ -0,0 +1,113 @@ +#!/bin/bash -l +# -------------------------------------------------------------------------- +# +# Copyright +# Markus Wittmann, 2016-2017 +# RRZE, University of Erlangen-Nuremberg, Germany +# markus.wittmann -at- fau.de or hpc -at- rrze.fau.de +# +# Viktor Haag, 2016 +# LSS, University of Erlangen-Nuremberg, Germany +# +# This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). +# +# LbmBenchKernels is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# LbmBenchKernels is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with LbmBenchKernels. If not, see . 
+# +# -------------------------------------------------------------------------- +set -u + +Tmp="delme.test.sh.$(hostname).$$.tmp" +Binary="../bin/lbmbenchk-linux-intel-release" +NThreads="5" + +TestsTotal="0" +TestsFailed="0" +TestsSucceeded="0" + +if [ "$#" -ge "1" ]; then + Binary="$1" +fi + + +function on_exit +{ + if [ -e "$Tmp" ]; then + rm -f "$Tmp" 2>&1 || true + fi +} + +trap "on_exit" EXIT + +function run_kernel +{ + local Binary="$1" + local K="$2" # Kernel name + local T="$3" # Number of threads + local KernelArgs="${4:-""}" + + ((++TestsTotal)) + + echo -n "$Binary -verify -kernel $K -t $T -pin $(seq -s , 0 $((T-1))) ${KernelArgs:+"-- "}$KernelArgs " + + if [ "$KernelArgs" == "" ]; then + $Binary -verify -kernel $K -t $T -pin $(seq -s , 0 $((T-1))) > "$Tmp" 2>&1 + else + $Binary -verify -kernel $K -t $T -pin $(seq -s , 0 $((T-1))) -- $KernelArgs > "$Tmp" 2>&1 + fi + + local ExitCode="$?" + + if [ "$ExitCode" != "0" ]; then + echo "" + cat "$Tmp" + echo "$Binary -verify -kernel $K" + echo "Verification failed. Exit code = $ExitCode." + ((++TestsFailed)) + else + echo "OK" + ((++TestsSucceeded)) + fi + +} + +for K in $("$Binary" -list | tail -n +7); do + + for T in $(seq 1 $NThreads); do + + run_kernel "$Binary" "$K" "$T" + + # Check in the usage string, if the kernel accepts parameters for blocking. + + $Binary -kernel $K -- -h > "$Tmp" 2>&1 + + LineParameterStart="$(grep -n "^Kernel parameters:" "$Tmp" | sed -e 's/:.*//')" + + if [ "$LineParameterStart" == "" ]; then + continue + fi + + tail -n +$LineParameterStart "$Tmp" | grep -q -- "-blk" + ExitCode="$?" 
+ + if [ "$ExitCode" == "0" ]; then + # Kernel supports blocking + run_kernel "$Binary" "$K" "$T" "-blk 7" + fi + + done + +done + + +echo "# Tests toal: $TestsTotal succeeded: $TestsSucceeded failed: $TestsFailed" diff --git a/src/test.sh b/src/test.sh new file mode 100755 index 0000000..ea64791 --- /dev/null +++ b/src/test.sh @@ -0,0 +1,69 @@ +#!/bin/bash -l +# -------------------------------------------------------------------------- +# +# Copyright +# Markus Wittmann, 2016-2017 +# RRZE, University of Erlangen-Nuremberg, Germany +# markus.wittmann -at- fau.de or hpc -at- rrze.fau.de +# +# Viktor Haag, 2016 +# LSS, University of Erlangen-Nuremberg, Germany +# +# This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). +# +# LbmBenchKernels is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# LbmBenchKernels is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with LbmBenchKernels. If not, see . 
+# +# --------------------------------------------------------------------------
# Builds the debug, verification, and benchmark variants with the Intel and
# the GCC configuration and runs the verification tests on the verification
# binary of each. Any failing command aborts the script (set -e).
set -u
set -e

XTag="-test"

Build=release

# build_and_verify <config> <module>
#
# Loads <module> into an otherwise empty module environment, rebuilds all
# three variants for <config> from scratch, and verifies the result.
function build_and_verify
{
  Config="$1"

  module purge
  module add $2

  make clean-all
  make -j CONFIG=$Config TAG=$XTag-debug
  make -j CONFIG=$Config BUILD=$Build TAG=$XTag-v
  make -j CONFIG=$Config BUILD=$Build TAG=$XTag-b BENCHMARK=on

  BinaryV="../bin/lbmbenchk-$Config-$Build$XTag-v"
  BinaryB="../bin/lbmbenchk-$Config-$Build$XTag-b"

  ./test-verification.sh "$BinaryV"
}

build_and_verify linux-intel intel64
build_and_verify linux-gcc gcc

-- 2.25.1