merge with kernels from MH's master thesis
[LbmBenchmarkKernelsPublic.git] / src / Vector.h
1 // --------------------------------------------------------------------------
2 //
3 // Copyright
4 //   Markus Wittmann, 2016-2018
5 //   RRZE, University of Erlangen-Nuremberg, Germany
6 //   markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
7 //
8 //   Viktor Haag, 2016
9 //   LSS, University of Erlangen-Nuremberg, Germany
10 //
11 //   Michael Hussnaetter, 2017-2018
12 //   University of Erlangen-Nuremberg, Germany
13 //   michael.hussnaetter -at- fau.de
14 //
15 //  This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
16 //
17 //  LbmBenchKernels is free software: you can redistribute it and/or modify
18 //  it under the terms of the GNU General Public License as published by
19 //  the Free Software Foundation, either version 3 of the License, or
20 //  (at your option) any later version.
21 //
22 //  LbmBenchKernels is distributed in the hope that it will be useful,
23 //  but WITHOUT ANY WARRANTY; without even the implied warranty of
24 //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
25 //  GNU General Public License for more details.
26 //
27 //  You should have received a copy of the GNU General Public License
28 //  along with LbmBenchKernels.  If not, see <http://www.gnu.org/licenses/>.
29 //
30 // --------------------------------------------------------------------------
31 #ifndef __VECTOR_H__
32 #define __VECTOR_H__
33
// Fall back to AVX (with a diagnostic) when the build selected none of the
// supported ISA extensions.
#if !(defined(VECTOR_AVX512) || defined(VECTOR_AVX) || defined(VECTOR_SSE))
        #warning Defining VECTOR_AVX as no ISA extension was selected.
        #define VECTOR_AVX
#endif
38
// Exactly one ISA extension may be selected: reject every pairwise
// combination of VECTOR_AVX512, VECTOR_AVX and VECTOR_SSE.
#if (defined(VECTOR_AVX512) && defined(VECTOR_AVX)) || \
        (defined(VECTOR_AVX512) && defined(VECTOR_SSE)) || \
        (defined(VECTOR_AVX) && defined(VECTOR_SSE))
        #error Only VECTOR_AVX512 or VECTOR_AVX or VECTOR_SSE can be defined at the same time.
#endif
44
45 #if !defined(PRECISION_DP) && !defined(PRECISION_SP)
46         #error PRECISION_DP or PRECISION_SP must be defined.
47 #endif
48
// Only one floating-point precision may be active; stop if both
// PRECISION_DP and PRECISION_SP are defined.
#ifdef PRECISION_DP
        #ifdef PRECISION_SP
                #error Only PRECISION_DP or PRECISION_SP can be defined at the same time.
        #endif
#endif
52
// Vector abstraction layer: maps one macro vocabulary onto the intrinsics of
// the selected ISA extension (VECTOR_AVX512 / VECTOR_AVX / VECTOR_SSE) for the
// selected precision (PRECISION_DP / PRECISION_SP).
//
//   VSIZE                number of floating-point elements per vector
//   VPDFT                vector type holding VSIZE elements
//   VSET(scalar)         broadcast a scalar to all elements
//   VLD / VLDU           load from an aligned / unaligned address
//   VST / VSTU / VSTNT   aligned / unaligned / streaming (non-temporal) store
//   VMUL / VADD / VSUB   element-wise multiply / add / subtract
//
// The AVX512 double-precision branch additionally defines 32-bit integer
// index helpers (VSETI32, VLIU, VLI64, VADDI32, VMULI32) and gather/scatter
// wrappers with their prefetch variants (VG32, VG64, VPG32, VS32, VS64, VPS32).
#ifdef PRECISION_DP

        #ifdef VECTOR_AVX512

                #include <immintrin.h>
                // Vector size in double-precision floating-point numbers.
                #define VSIZE   8

                #define VPDFT                   __m512d

                #define VSET(scalar)            _mm512_set1_pd(scalar)
                // 256-bit integer vector: eight 32-bit lanes, one per element of a VPDFT.
                #define VSETI32(scalar)         _mm256_set1_epi32(scalar)

                #define VLD(expr)               _mm512_load_pd(expr)
                #define VLDU(expr)              _mm512_loadu_pd(expr)
                // The argument is parenthesized so the cast applies to the whole
                // expression, e.g. VLIU(ptr + i), and not only to its first operand.
                #define VLIU(expr)              _mm256_loadu_si256((__m256i const *)(expr))
                #define VLI64(expr)             _mm512_load_epi64(expr)

                #define VST(dst, src)           _mm512_store_pd(dst, src)
                #define VSTU(dst, src)          _mm512_storeu_pd(dst, src)
                #define VSTNT(dst, src)         _mm512_stream_pd(dst, src)

                // Gather doubles from base using 32-bit / 64-bit offsets.
                #define VG32(offsets, base, scale)      _mm512_i32gather_pd(offsets, base, scale)
                #define VG64(offsets, base, scale)      _mm512_i64gather_pd(offsets, base, scale)

                // Prefetch variant of VG32.
                #define VPG32(offsets, base, scale, hint) _mm512_prefetch_i32gather_pd(offsets, base, scale, hint)

                // Scatter doubles to dst_base using 32-bit / 64-bit offsets.
                #define VS32(dst_base, dst_offsets, src, scale) _mm512_i32scatter_pd(dst_base, dst_offsets, src, scale)
                #define VS64(dst_base, dst_offsets, src, scale) _mm512_i64scatter_pd(dst_base, dst_offsets, src, scale)

                // Prefetch variant of VS32.
                #define VPS32(dst_base, dst_offsets, scale, hint) _mm512_prefetch_i32scatter_pd(dst_base, dst_offsets, scale, hint)

                #define VMUL(a, b)              _mm512_mul_pd(a, b)
                #define VADD(a, b)              _mm512_add_pd(a, b)
                #define VADDI32(a,b)            _mm256_add_epi32(a,b)
                // NOTE(review): _mm256_mul_epi32 multiplies only the low 32 bits of
                // each 64-bit lane and yields 64-bit products. If an element-wise
                // 32-bit product is intended (as the name and VADDI32 suggest),
                // _mm256_mullo_epi32 would be the matching intrinsic - confirm
                // against the callers before changing.
                #define VMULI32(a,b)            _mm256_mul_epi32(a,b)
                #define VSUB(a, b)              _mm512_sub_pd(a, b)
        #endif

        #ifdef VECTOR_AVX

                #include <immintrin.h>
                // Vector size in double-precision floating-point numbers.
                #define VSIZE   4

                #define VPDFT                   __m256d

                #define VSET(scalar)            _mm256_set1_pd(scalar)

                #define VLD(expr)               _mm256_load_pd(expr)
                #define VLDU(expr)              _mm256_loadu_pd(expr)

                #define VST(dst, src)           _mm256_store_pd(dst, src)
                #define VSTU(dst, src)          _mm256_storeu_pd(dst, src)
                #define VSTNT(dst, src)         _mm256_stream_pd(dst, src)

                #define VMUL(a, b)              _mm256_mul_pd(a, b)
                #define VADD(a, b)              _mm256_add_pd(a, b)
                #define VSUB(a, b)              _mm256_sub_pd(a, b)
        #endif

        #ifdef VECTOR_SSE
                #include <emmintrin.h>
                // Vector size in double-precision floating-point numbers.
                #define VSIZE 2

                #define VPDFT                   __m128d

                #define VSET(scalar)            _mm_set1_pd(scalar)

                #define VLD(expr)               _mm_load_pd(expr)
                #define VLDU(expr)              _mm_loadu_pd(expr)

                #define VST(dst, src)           _mm_store_pd(dst, src)
                #define VSTU(dst, src)          _mm_storeu_pd(dst, src)
                #define VSTNT(dst, src)         _mm_stream_pd(dst, src)

                #define VMUL(a, b)              _mm_mul_pd(a, b)
                #define VADD(a, b)              _mm_add_pd(a, b)
                #define VSUB(a, b)              _mm_sub_pd(a, b)
        #endif

#elif defined(PRECISION_SP)

        #ifdef VECTOR_AVX512
                #error Single precision intrinsic kernels for AVX512 are currently not implemented.
        #endif

        #ifdef VECTOR_AVX

                #include <immintrin.h>
                // Vector size in single-precision floating-point numbers.
                #define VSIZE   8

                #define VPDFT                   __m256

                #define VSET(scalar)            _mm256_set1_ps(scalar)

                #define VLD(expr)               _mm256_load_ps(expr)
                #define VLDU(expr)              _mm256_loadu_ps(expr)

                #define VST(dst, src)           _mm256_store_ps(dst, src)
                #define VSTU(dst, src)          _mm256_storeu_ps(dst, src)
                #define VSTNT(dst, src)         _mm256_stream_ps(dst, src)

                #define VMUL(a, b)              _mm256_mul_ps(a, b)
                #define VADD(a, b)              _mm256_add_ps(a, b)
                #define VSUB(a, b)              _mm256_sub_ps(a, b)
        #endif

        #ifdef VECTOR_SSE
                #include <emmintrin.h>
                // Vector size in single-precision floating-point numbers.
                #define VSIZE   4

                #define VPDFT                   __m128

                #define VSET(scalar)            _mm_set1_ps(scalar)

                #define VLD(expr)               _mm_load_ps(expr)
                #define VLDU(expr)              _mm_loadu_ps(expr)

                #define VST(dst, src)           _mm_store_ps(dst, src)
                #define VSTU(dst, src)          _mm_storeu_ps(dst, src)
                #define VSTNT(dst, src)         _mm_stream_ps(dst, src)

                #define VMUL(a, b)              _mm_mul_ps(a, b)
                #define VADD(a, b)              _mm_add_ps(a, b)
                #define VSUB(a, b)              _mm_sub_ps(a, b)
        #endif

#endif  // PRECISION
185
186 #endif // __VECTOR_H__
This page took 0.047549 seconds and 4 git commands to generate.