1 // --------------------------------------------------------------------------
4 // Markus Wittmann, 2016-2018
5 // RRZE, University of Erlangen-Nuremberg, Germany
6 // markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
9 // LSS, University of Erlangen-Nuremberg, Germany
11 // Michael Hussnaetter, 2017-2018
12 // University of Erlangen-Nuremberg, Germany
13 // michael.hussnaetter -at- fau.de
15 // This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
17 // LbmBenchKernels is free software: you can redistribute it and/or modify
18 // it under the terms of the GNU General Public License as published by
19 // the Free Software Foundation, either version 3 of the License, or
20 // (at your option) any later version.
22 // LbmBenchKernels is distributed in the hope that it will be useful,
23 // but WITHOUT ANY WARRANTY; without even the implied warranty of
24 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25 // GNU General Public License for more details.
27 // You should have received a copy of the GNU General Public License
28 // along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>.
30 // --------------------------------------------------------------------------
// No ISA macro selected: emit a warning (its text names VECTOR_AVX as the fallback).
34 #if !defined(VECTOR_AVX512) && !defined(VECTOR_AVX) && !defined(VECTOR_SSE)
35 #warning Defining VECTOR_AVX as no ISA extension was selected.
// Reject configurations that select more than one ISA macro. All three
// pairings are tested against the VECTOR_SSE spelling used by the check above.
39 #if (defined(VECTOR_AVX512) && defined(VECTOR_AVX)) || \
40 (defined(VECTOR_AVX512) && defined(VECTOR_SSE)) || \
41 (defined(VECTOR_AVX) && defined(VECTOR_SSE))
42 #error Only VECTOR_AVX512 or VECTOR_AVX or VECTOR_SSE can be defined at the same time.
// A precision macro is mandatory: fail when neither is defined.
45 #if !defined(PRECISION_DP) && !defined(PRECISION_SP)
46 #error PRECISION_DP or PRECISION_SP must be defined.
// The two precision macros are mutually exclusive: fail when both are defined.
49 #if defined(PRECISION_DP) && defined(PRECISION_SP)
50 #error Only PRECISION_DP or PRECISION_SP can be defined at the same time.
57 #include <immintrin.h>
58 // Vector size in double-precision floating-point numbers.
// Double-precision wrappers over 512-bit (_mm512_*_pd) intrinsics. The 32-bit
// integer helpers (VSETI32, VLIU, VADDI32, VMULI32) use 256-bit (_mm256_*) intrinsics.

// Broadcast a scalar into every lane.
63 #define VSET(scalar) _mm512_set1_pd(scalar)
64 #define VSETI32(scalar) _mm256_set1_epi32(scalar)
// Loads: VLD is the aligned form, VLDU the unaligned form.
66 #define VLD(expr) _mm512_load_pd(expr)
67 #define VLDU(expr) _mm512_loadu_pd(expr)
// Unaligned 256-bit integer load. The argument is parenthesized so the cast
// applies to the whole pointer expression, e.g. VLIU(p + 1), not just to `p`.
68 #define VLIU(expr) _mm256_loadu_si256((__m256i const *)(expr))
69 #define VLI64(expr) _mm512_load_epi64(expr)
// Stores: aligned (VST), unaligned (VSTU) and streaming/non-temporal (VSTNT).
71 #define VST(dst, src) _mm512_store_pd(dst, src)
72 #define VSTU(dst, src) _mm512_storeu_pd(dst, src)
73 #define VSTNT(dst, src) _mm512_stream_pd(dst, src)
// Gathers of doubles using 32-bit (VG32) or 64-bit (VG64) offsets.
75 #define VG32(offsets, base, scale) _mm512_i32gather_pd(offsets, base, scale)
76 #define VG64(offsets, base, scale) _mm512_i64gather_pd(offsets, base, scale)
// Prefetch for a subsequent 32-bit-offset gather.
78 #define VPG32(offsets, base, scale, hint) _mm512_prefetch_i32gather_pd(offsets, base, scale, hint)
// Scatters of doubles using 32-bit (VS32) or 64-bit (VS64) offsets.
80 #define VS32(dst_base, dst_offsets, src, scale) _mm512_i32scatter_pd(dst_base, dst_offsets, src, scale)
81 #define VS64(dst_base, dst_offsets, src, scale) _mm512_i64scatter_pd(dst_base, dst_offsets, src, scale)
// Prefetch for a subsequent 32-bit-offset scatter.
83 #define VPS32(dst_base, dst_offsets, scale, hint) _mm512_prefetch_i32scatter_pd(dst_base, dst_offsets, scale, hint)
// Arithmetic.
85 #define VMUL(a, b) _mm512_mul_pd(a, b)
86 #define VADD(a, b) _mm512_add_pd(a, b)
87 #define VADDI32(a,b) _mm256_add_epi32(a,b)
// NOTE(review): _mm256_mul_epi32 multiplies only the low 32 bits of each 64-bit
// element and produces 64-bit products. If a lane-wise 32-bit multiply is
// intended (as the name suggests), _mm256_mullo_epi32 is the matching
// intrinsic -- confirm against callers before changing.
88 #define VMULI32(a,b) _mm256_mul_epi32(a,b)
89 #define VSUB(a, b) _mm512_sub_pd(a, b)
94 #include <immintrin.h>
95 // Vector width, counted in double-precision floating-point numbers.
// Double-precision vector primitives mapped onto 256-bit (_mm256_*_pd) intrinsics.

// Broadcast one value into every lane.
100 #define VSET(value) _mm256_set1_pd(value)
// Loads: aligned (VLD) and unaligned (VLDU).
102 #define VLD(addr) _mm256_load_pd(addr)
103 #define VLDU(addr) _mm256_loadu_pd(addr)
// Stores: aligned (VST), unaligned (VSTU) and streaming/non-temporal (VSTNT).
105 #define VST(addr, vec) _mm256_store_pd(addr, vec)
106 #define VSTU(addr, vec) _mm256_storeu_pd(addr, vec)
107 #define VSTNT(addr, vec) _mm256_stream_pd(addr, vec)
// Lane-wise arithmetic.
109 #define VMUL(x, y) _mm256_mul_pd(x, y)
110 #define VADD(x, y) _mm256_add_pd(x, y)
111 #define VSUB(x, y) _mm256_sub_pd(x, y)
115 #include <emmintrin.h>
116 // Vector width, counted in double-precision floating-point numbers.
// Double-precision vector primitives mapped onto 128-bit (_mm_*_pd) intrinsics.

// Vector register type used by this variant.
119 #define VPDFT __m128d
// Broadcast one value into every lane.
121 #define VSET(value) _mm_set1_pd(value)
// Loads: aligned (VLD) and unaligned (VLDU).
123 #define VLD(addr) _mm_load_pd(addr)
124 #define VLDU(addr) _mm_loadu_pd(addr)
// Stores: aligned (VST), unaligned (VSTU) and streaming/non-temporal (VSTNT).
126 #define VST(addr, vec) _mm_store_pd(addr, vec)
127 #define VSTU(addr, vec) _mm_storeu_pd(addr, vec)
128 #define VSTNT(addr, vec) _mm_stream_pd(addr, vec)
// Lane-wise arithmetic.
130 #define VMUL(x, y) _mm_mul_pd(x, y)
131 #define VADD(x, y) _mm_add_pd(x, y)
132 #define VSUB(x, y) _mm_sub_pd(x, y)
135 #elif defined(PRECISION_SP)
139 #include <immintrin.h>
140 // Vector size in single-precision floating-point numbers.
// Single-precision wrappers over 512-bit (_mm512_*_ps) intrinsics.
// The commented-out macros are disabled in this variant; the gather/scatter
// ones still name double-precision (_pd) intrinsics.

// Broadcast a scalar into every lane.
145 #define VSET(scalar) _mm512_set1_ps(scalar)
146 // #define VSETI32(scalar) _mm256_set1_epi32(scalar)
// Loads: VLD is the aligned form, VLDU the unaligned form.
148 #define VLD(expr) _mm512_load_ps(expr)
149 #define VLDU(expr) _mm512_loadu_ps(expr)
// Unaligned 256-bit integer load. The argument is parenthesized so the cast
// applies to the whole pointer expression, e.g. VLIU(p + 1), not just to `p`.
150 #define VLIU(expr) _mm256_loadu_si256((__m256i const *)(expr))
151 // #define VLI64(expr) _mm512_load_epi64(expr)
// Stores: aligned (VST), unaligned (VSTU) and streaming/non-temporal (VSTNT).
153 #define VST(dst, src) _mm512_store_ps(dst, src)
154 #define VSTU(dst, src) _mm512_storeu_ps(dst, src)
155 #define VSTNT(dst, src) _mm512_stream_ps(dst, src)
157 // #define VG32(offsets, base, scale) _mm512_i32gather_pd(offsets, base, scale)
158 // #define VG64(offsets, base, scale) _mm512_i64gather_pd(offsets, base, scale)
160 // #define VPG32(offsets, base, scale, hint) _mm512_prefetch_i32gather_pd(offsets, base, scale, hint)
162 // #define VS32(dst_base, dst_offsets, src, scale) _mm512_i32scatter_pd(dst_base, dst_offsets, src, scale)
163 // #define VS64(dst_base, dst_offsets, src, scale) _mm512_i64scatter_pd(dst_base, dst_offsets, src, scale)
165 // #define VPS32(dst_base, dst_offsets, scale, hint) _mm512_prefetch_i32scatter_pd(dst_base, dst_offsets, scale, hint)
// Lane-wise arithmetic.
167 #define VMUL(a, b) _mm512_mul_ps(a, b)
168 #define VADD(a, b) _mm512_add_ps(a, b)
169 // #define VADDI32(a,b) _mm256_add_epi32(a,b)
170 // #define VMULI32(a,b) _mm256_mul_epi32(a,b)
171 #define VSUB(a, b) _mm512_sub_ps(a, b)
176 #include <immintrin.h>
177 // Vector width, counted in single-precision floating-point numbers.
// Single-precision vector primitives mapped onto 256-bit (_mm256_*_ps) intrinsics.

// Broadcast one value into every lane.
182 #define VSET(value) _mm256_set1_ps(value)
// Loads: aligned (VLD) and unaligned (VLDU).
184 #define VLD(addr) _mm256_load_ps(addr)
185 #define VLDU(addr) _mm256_loadu_ps(addr)
// Stores: aligned (VST), unaligned (VSTU) and streaming/non-temporal (VSTNT).
187 #define VST(addr, vec) _mm256_store_ps(addr, vec)
188 #define VSTU(addr, vec) _mm256_storeu_ps(addr, vec)
189 #define VSTNT(addr, vec) _mm256_stream_ps(addr, vec)
// Lane-wise arithmetic.
191 #define VMUL(x, y) _mm256_mul_ps(x, y)
192 #define VADD(x, y) _mm256_add_ps(x, y)
193 #define VSUB(x, y) _mm256_sub_ps(x, y)
197 #include <emmintrin.h>
198 // Vector width, counted in single-precision floating-point numbers.
// Single-precision vector primitives mapped onto 128-bit (_mm_*_ps) intrinsics.

// Broadcast one value into every lane.
203 #define VSET(value) _mm_set1_ps(value)
// Loads: aligned (VLD) and unaligned (VLDU).
205 #define VLD(addr) _mm_load_ps(addr)
206 #define VLDU(addr) _mm_loadu_ps(addr)
// Stores: aligned (VST), unaligned (VSTU) and streaming/non-temporal (VSTNT).
208 #define VST(addr, vec) _mm_store_ps(addr, vec)
209 #define VSTU(addr, vec) _mm_storeu_ps(addr, vec)
210 #define VSTNT(addr, vec) _mm_stream_ps(addr, vec)
// Lane-wise arithmetic.
212 #define VMUL(x, y) _mm_mul_ps(x, y)
213 #define VADD(x, y) _mm_add_ps(x, y)
214 #define VSUB(x, y) _mm_sub_ps(x, y)
219 #endif // __VECTOR_H__