Commit | Line | Data |
---|---|---|
10988083 MW |
1 | // -------------------------------------------------------------------------- |
2 | // | |
3 | // Copyright | |
4 | // Markus Wittmann, 2016-2017 | |
5 | // RRZE, University of Erlangen-Nuremberg, Germany | |
6 | // markus.wittmann -at- fau.de or hpc -at- rrze.fau.de | |
7 | // | |
8 | // Viktor Haag, 2016 | |
9 | // LSS, University of Erlangen-Nuremberg, Germany | |
10 | // | |
11 | // This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels). | |
12 | // | |
13 | // LbmBenchKernels is free software: you can redistribute it and/or modify | |
14 | // it under the terms of the GNU General Public License as published by | |
15 | // the Free Software Foundation, either version 3 of the License, or | |
16 | // (at your option) any later version. | |
17 | // | |
18 | // LbmBenchKernels is distributed in the hope that it will be useful, | |
19 | // but WITHOUT ANY WARRANTY; without even the implied warranty of | |
20 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
21 | // GNU General Public License for more details. | |
22 | // | |
23 | // You should have received a copy of the GNU General Public License | |
24 | // along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>. | |
25 | // | |
26 | // -------------------------------------------------------------------------- | |
27 | ||
28 | #ifndef INDEX_START | |
29 | #error INDEX_START must be defined | |
30 | #endif | |
31 | ||
32 | #ifndef INDEX_STOP | |
33 | #error INDEX_STOP must be defined | |
34 | #endif | |
35 | ||
36 | #define I(index, dir) P_INDEX_3((nCells), (index), (dir)) | |
37 | ||
38 | for (int blockedIndex = (INDEX_START); blockedIndex < (INDEX_STOP); blockedIndex += nTmpArray) { | |
39 | ||
40 | indexMax = MinI(nTmpArray, (INDEX_STOP) - blockedIndex); | |
41 | #ifdef DEBUG | |
42 | memset(tmpArray, -1, sizeof(PdfT) * nTmpArray * N_TMP); | |
43 | #endif | |
44 | for (int index = 0; index < indexMax; ++index) { | |
45 | ||
46 | ||
47 | adjListIndex = (index + blockedIndex) * N_D3Q19_IDX; | |
48 | ||
49 | pdf_C = src[I(index + blockedIndex, D3Q19_C)]; | |
50 | ||
51 | #define X(name, idx, idxinv, _x, _y, _z) JOIN(pdf_,name) = src[adjList[adjListIndex + idx]]; tmpArray[TMP_INDEX(index, idx)] = JOIN(pdf_,name); | |
52 | D3Q19_LIST_WO_C | |
53 | #undef X | |
54 | ||
55 | ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE - | |
56 | pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW; | |
57 | uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN - | |
58 | pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS; | |
59 | uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS - | |
60 | pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS; | |
61 | ||
62 | tmpArray[TMP_INDEX(index, TMP_UX)] = ux; | |
63 | tmpArray[TMP_INDEX(index, TMP_UY)] = uy; | |
64 | tmpArray[TMP_INDEX(index, TMP_UZ)] = uz; | |
65 | ||
66 | dens = pdf_C + | |
67 | pdf_N + pdf_E + pdf_S + pdf_W + | |
68 | pdf_NE + pdf_SE + pdf_SW + pdf_NW + | |
69 | pdf_T + pdf_TN + pdf_TE + pdf_TS + pdf_TW + | |
70 | pdf_B + pdf_BN + pdf_BE + pdf_BS + pdf_BW; | |
71 | ||
72 | dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz) * 3.0 / 2.0; | |
73 | ||
74 | w_1_indep = w_1 * dir_indep_trm; | |
75 | w_2_indep = w_2 * dir_indep_trm; | |
76 | ||
77 | tmpArray[TMP_INDEX(index, TMP_W1)] = w_1_indep; | |
78 | tmpArray[TMP_INDEX(index, TMP_W2)] = w_2_indep; | |
79 | ||
80 | dst[I(index + blockedIndex, D3Q19_C )] = pdf_C - omegaEven * (pdf_C - w_0 * dir_indep_trm); | |
81 | } | |
82 | ||
83 | #define LOOP_1(_dir1, _dir2, _vel, _vel_tmp) \ | |
84 | for (int index = 0; index < indexMax; index += VSIZE) { \ | |
85 | vui = VLDU(&tmpArray[TMP_INDEX(index, JOIN(TMP_,_vel_tmp))]); \ | |
86 | vpdf_a = VLDU(&tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir1))]); \ | |
87 | vpdf_b = VLDU(&tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir2))]); \ | |
88 | vw_1_indep = VLDU(&tmpArray[TMP_INDEX(index, TMP_W1)]); \ | |
89 | \ | |
90 | vevenPart = VMUL(vomegaEven, VSUB(VSUB(VMUL(voneHalf, VADD(vpdf_a, vpdf_b)), VMUL(vui, VMUL(vui, vw_1_nine_half))), vw_1_indep)); \ | |
91 | voddPart = VMUL(vomegaOdd, VSUB( VMUL(voneHalf, VSUB(vpdf_a, vpdf_b)), VMUL(vui, vw_1_x3))); \ | |
92 | VSTNT(&dst[I(index + blockedIndex, JOIN(D3Q19_,_dir1))], VSUB(VSUB(vpdf_a, vevenPart), voddPart)); \ | |
93 | VSTNT(&dst[I(index + blockedIndex, JOIN(D3Q19_,_dir2))], VADD(VSUB(vpdf_b, vevenPart), voddPart)); \ | |
94 | } | |
95 | ||
96 | #define LOOP_2(_dir1, _dir2, _v1, _v2, _v1_tmp, _v2_tmp, _expr) \ | |
97 | for (int index = 0; index < indexMax; index += VSIZE) { \ | |
98 | _v1 = VLDU(&tmpArray[TMP_INDEX(index, JOIN(TMP_,_v1_tmp))]); \ | |
99 | _v2 = VLDU(&tmpArray[TMP_INDEX(index, JOIN(TMP_,_v2_tmp))]); \ | |
100 | vpdf_a = VLDU(&tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir1))]); \ | |
101 | vpdf_b = VLDU(&tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir2))]); \ | |
102 | vw_2_indep = VLDU(&tmpArray[TMP_INDEX(index, TMP_W2)]); \ | |
103 | \ | |
104 | vui = _expr; \ | |
105 | vevenPart = VMUL(vomegaEven, VSUB(VSUB(VMUL(voneHalf, VADD(vpdf_a, vpdf_b)), VMUL(vui, VMUL(vui, vw_2_nine_half))), vw_2_indep)); \ | |
106 | voddPart = VMUL(vomegaOdd, VSUB( VMUL(voneHalf, VSUB(vpdf_a, vpdf_b)), VMUL(vui, vw_2_x3))); \ | |
107 | VSTNT(&dst[I(index + blockedIndex, JOIN(D3Q19_,_dir1))], VSUB(VSUB(vpdf_a, vevenPart), voddPart)); \ | |
108 | VSTNT(&dst[I(index + blockedIndex, JOIN(D3Q19_,_dir2))], VADD(VSUB(vpdf_b, vevenPart), voddPart)); \ | |
109 | } | |
110 | ||
111 | LOOP_1(N, S, vuy, UY); | |
112 | LOOP_1(E, W, vux, UX); | |
113 | LOOP_1(T, B, vuz, UZ); | |
114 | LOOP_2(NW, SE, vuy, vux, UY, UX, VSUB(vuy, vux)); | |
115 | LOOP_2(NE, SW, vuy, vux, UY, UX, VADD(vuy, vux)); | |
116 | LOOP_2(TW, BE, vux, vuz, UX, UZ, VSUB(vuz, vux)); | |
117 | LOOP_2(TE, BW, vux, vuz, UX, UZ, VADD(vuz, vux)); | |
118 | LOOP_2(TS, BN, vuy, vuz, UY, UZ, VSUB(vuz, vuy)); | |
119 | LOOP_2(TN, BS, vuy, vuz, UY, UZ, VADD(vuz, vuy)); | |
120 | ||
121 | #undef LOOP_1 | |
122 | #undef LOOP_2 | |
123 | ||
124 | } // loop over fluid nodes | |
125 | ||
126 | #undef I | |
127 | ||
128 | #undef INDEX_START | |
129 | #undef INDEX_STOP | |
130 |