// --------------------------------------------------------------------------
//
// Copyright
//   Markus Wittmann, 2016-2017
//   RRZE, University of Erlangen-Nuremberg, Germany
//   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
//
//   Viktor Haag, 2016
//   LSS, University of Erlangen-Nuremberg, Germany
//
// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
//
// LbmBenchKernels is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// LbmBenchKernels is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>.
//
// --------------------------------------------------------------------------
27 | ||
28 | #ifndef INDEX_START | |
29 | #error INDEX_START must be defined | |
30 | #endif | |
31 | ||
32 | #ifndef INDEX_STOP | |
33 | #error INDEX_STOP must be defined | |
34 | #endif | |
35 | ||
// Index into src/dst for direction `dir` of node `node`; forwards to
// P_INDEX_3 with nCells as its first argument.
#define I(node, dir)	P_INDEX_3((nCells), (node), (dir))
37 | ||
38 | for (int blockedIndex = (INDEX_START); blockedIndex < (INDEX_STOP); blockedIndex += nTmpArray) { | |
39 | ||
40 | indexMax = MinI(nTmpArray, (INDEX_STOP) - blockedIndex); | |
41 | #ifdef DEBUG | |
42 | memset(tmpArray, -1, sizeof(PdfT) * nTmpArray * N_TMP); | |
43 | #endif | |
0fde6e45 MW |
44 | #ifdef INTEL_OPT_DIRECTIVES |
45 | #pragma ivdep | |
46 | #endif | |
10988083 MW |
47 | for (int index = 0; index < indexMax; ++index) { |
48 | ||
49 | ||
50 | adjListIndex = (index + blockedIndex) * N_D3Q19_IDX; | |
51 | ||
52 | pdf_C = src[I(index + blockedIndex, D3Q19_C)]; | |
53 | ||
54 | #define X(name, idx, idxinv, _x, _y, _z) JOIN(pdf_,name) = src[adjList[adjListIndex + idx]]; tmpArray[TMP_INDEX(index, idx)] = JOIN(pdf_,name); | |
55 | D3Q19_LIST_WO_C | |
56 | #undef X | |
57 | ||
58 | ux = pdf_E + pdf_NE + pdf_SE + pdf_TE + pdf_BE - | |
59 | pdf_W - pdf_NW - pdf_SW - pdf_TW - pdf_BW; | |
60 | uy = pdf_N + pdf_NE + pdf_NW + pdf_TN + pdf_BN - | |
61 | pdf_S - pdf_SE - pdf_SW - pdf_TS - pdf_BS; | |
62 | uz = pdf_T + pdf_TE + pdf_TW + pdf_TN + pdf_TS - | |
63 | pdf_B - pdf_BE - pdf_BW - pdf_BN - pdf_BS; | |
64 | ||
65 | tmpArray[TMP_INDEX(index, TMP_UX)] = ux; | |
66 | tmpArray[TMP_INDEX(index, TMP_UY)] = uy; | |
67 | tmpArray[TMP_INDEX(index, TMP_UZ)] = uz; | |
68 | ||
69 | dens = pdf_C + | |
70 | pdf_N + pdf_E + pdf_S + pdf_W + | |
71 | pdf_NE + pdf_SE + pdf_SW + pdf_NW + | |
72 | pdf_T + pdf_TN + pdf_TE + pdf_TS + pdf_TW + | |
73 | pdf_B + pdf_BN + pdf_BE + pdf_BS + pdf_BW; | |
74 | ||
0fde6e45 | 75 | dir_indep_trm = dens - (ux * ux + uy * uy + uz * uz) * F(3.0) / F(2.0); |
10988083 MW |
76 | |
77 | w_1_indep = w_1 * dir_indep_trm; | |
78 | w_2_indep = w_2 * dir_indep_trm; | |
79 | ||
80 | tmpArray[TMP_INDEX(index, TMP_W1)] = w_1_indep; | |
81 | tmpArray[TMP_INDEX(index, TMP_W2)] = w_2_indep; | |
82 | ||
83 | dst[I(index + blockedIndex, D3Q19_C )] = pdf_C - omegaEven * (pdf_C - w_0 * dir_indep_trm); | |
84 | } | |
85 | ||
86 | #define LOOP_1(_dir1, _dir2, _vel, _vel_tmp) \ | |
87 | for (int index = 0; index < indexMax; index += VSIZE) { \ | |
88 | vui = VLDU(&tmpArray[TMP_INDEX(index, JOIN(TMP_,_vel_tmp))]); \ | |
89 | vpdf_a = VLDU(&tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir1))]); \ | |
90 | vpdf_b = VLDU(&tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir2))]); \ | |
91 | vw_1_indep = VLDU(&tmpArray[TMP_INDEX(index, TMP_W1)]); \ | |
92 | \ | |
93 | vevenPart = VMUL(vomegaEven, VSUB(VSUB(VMUL(voneHalf, VADD(vpdf_a, vpdf_b)), VMUL(vui, VMUL(vui, vw_1_nine_half))), vw_1_indep)); \ | |
94 | voddPart = VMUL(vomegaOdd, VSUB( VMUL(voneHalf, VSUB(vpdf_a, vpdf_b)), VMUL(vui, vw_1_x3))); \ | |
95 | VSTNT(&dst[I(index + blockedIndex, JOIN(D3Q19_,_dir1))], VSUB(VSUB(vpdf_a, vevenPart), voddPart)); \ | |
96 | VSTNT(&dst[I(index + blockedIndex, JOIN(D3Q19_,_dir2))], VADD(VSUB(vpdf_b, vevenPart), voddPart)); \ | |
97 | } | |
98 | ||
99 | #define LOOP_2(_dir1, _dir2, _v1, _v2, _v1_tmp, _v2_tmp, _expr) \ | |
100 | for (int index = 0; index < indexMax; index += VSIZE) { \ | |
101 | _v1 = VLDU(&tmpArray[TMP_INDEX(index, JOIN(TMP_,_v1_tmp))]); \ | |
102 | _v2 = VLDU(&tmpArray[TMP_INDEX(index, JOIN(TMP_,_v2_tmp))]); \ | |
103 | vpdf_a = VLDU(&tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir1))]); \ | |
104 | vpdf_b = VLDU(&tmpArray[TMP_INDEX(index, JOIN(D3Q19_,_dir2))]); \ | |
105 | vw_2_indep = VLDU(&tmpArray[TMP_INDEX(index, TMP_W2)]); \ | |
106 | \ | |
107 | vui = _expr; \ | |
108 | vevenPart = VMUL(vomegaEven, VSUB(VSUB(VMUL(voneHalf, VADD(vpdf_a, vpdf_b)), VMUL(vui, VMUL(vui, vw_2_nine_half))), vw_2_indep)); \ | |
109 | voddPart = VMUL(vomegaOdd, VSUB( VMUL(voneHalf, VSUB(vpdf_a, vpdf_b)), VMUL(vui, vw_2_x3))); \ | |
110 | VSTNT(&dst[I(index + blockedIndex, JOIN(D3Q19_,_dir1))], VSUB(VSUB(vpdf_a, vevenPart), voddPart)); \ | |
111 | VSTNT(&dst[I(index + blockedIndex, JOIN(D3Q19_,_dir2))], VADD(VSUB(vpdf_b, vevenPart), voddPart)); \ | |
112 | } | |
113 | ||
114 | LOOP_1(N, S, vuy, UY); | |
115 | LOOP_1(E, W, vux, UX); | |
116 | LOOP_1(T, B, vuz, UZ); | |
117 | LOOP_2(NW, SE, vuy, vux, UY, UX, VSUB(vuy, vux)); | |
118 | LOOP_2(NE, SW, vuy, vux, UY, UX, VADD(vuy, vux)); | |
119 | LOOP_2(TW, BE, vux, vuz, UX, UZ, VSUB(vuz, vux)); | |
120 | LOOP_2(TE, BW, vux, vuz, UX, UZ, VADD(vuz, vux)); | |
121 | LOOP_2(TS, BN, vuy, vuz, UY, UZ, VSUB(vuz, vuy)); | |
122 | LOOP_2(TN, BS, vuy, vuz, UY, UZ, VADD(vuz, vuy)); | |
123 | ||
124 | #undef LOOP_1 | |
125 | #undef LOOP_2 | |
126 | ||
127 | } // loop over fluid nodes | |
128 | ||
// Remove the range macros consumed above and the local index helper, so the
// including code can define a fresh range before using this fragment again.
#undef INDEX_STOP
#undef INDEX_START

#undef I
133 |