merge with kernels from MH's master thesis
[LbmBenchmarkKernelsPublic.git] / src / Vector.h
CommitLineData
10988083
MW
1// --------------------------------------------------------------------------
2//
3// Copyright
8cafd9ea 4// Markus Wittmann, 2016-2018
10988083
MW
5// RRZE, University of Erlangen-Nuremberg, Germany
6// markus.wittmann -at- fau.de or hpc -at- rrze.fau.de
7//
8// Viktor Haag, 2016
9// LSS, University of Erlangen-Nuremberg, Germany
10//
8cafd9ea
MW
11// Michael Hussnaetter, 2017-2018
12// University of Erlangen-Nuremberg, Germany
13// michael.hussnaetter -at- fau.de
14//
10988083
MW
15// This file is part of the Lattice Boltzmann Benchmark Kernels (LbmBenchKernels).
16//
17// LbmBenchKernels is free software: you can redistribute it and/or modify
18// it under the terms of the GNU General Public License as published by
19// the Free Software Foundation, either version 3 of the License, or
20// (at your option) any later version.
21//
22// LbmBenchKernels is distributed in the hope that it will be useful,
23// but WITHOUT ANY WARRANTY; without even the implied warranty of
24// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25// GNU General Public License for more details.
26//
27// You should have received a copy of the GNU General Public License
28// along with LbmBenchKernels. If not, see <http://www.gnu.org/licenses/>.
29//
30// --------------------------------------------------------------------------
31#ifndef __VECTOR_H__
32#define __VECTOR_H__
33
8cafd9ea 34#if !defined(VECTOR_AVX512) && !defined(VECTOR_AVX) && !defined(VECTOR_SSE)
10988083
MW
35 #warning Defining VECTOR_AVX as no ISA extension was selected.
36 #define VECTOR_AVX
37#endif
38
8cafd9ea
MW
39#if (defined(VECTOR_AVX512) && defined(VECTOR_AVX)) || \
40 (defined(VECTOR_AVX512) && defined(VECTOR_SSE)) || \
41 (defined(VECTOR_AVX) && defined(VECTOR_SSE)) // True when more than one of the three ISA selection macros is defined.
42 #error Only VECTOR_AVX512 or VECTOR_AVX or VECTOR_SSE can be defined at the same time.
10988083
MW
43#endif
44
0fde6e45
MW
45#if !defined(PRECISION_DP) && !defined(PRECISION_SP)
46 #error PRECISION_DP or PRECISION_SP must be defined.
47#endif
10988083 48
0fde6e45
MW
49#if defined(PRECISION_DP) && defined(PRECISION_SP)
50 #error Only PRECISION_DP or PRECISION_SP can be defined at the same time.
51#endif
10988083 52
0fde6e45 53#ifdef PRECISION_DP
10988083 54
8cafd9ea
MW
55 #ifdef VECTOR_AVX512
56
57 #include <immintrin.h>
58 // Vector size in double-precision floating-point numbers.
59 #define VSIZE 8
60
61 #define VPDFT __m512d
62
63 #define VSET(scalar) _mm512_set1_pd(scalar)
64 #define VSETI32(scalar) _mm256_set1_epi32(scalar)
65
66 #define VLD(expr) _mm512_load_pd(expr)
67 #define VLDU(expr) _mm512_loadu_pd(expr)
68 #define VLIU(expr) _mm256_loadu_si256((__m256i const *)(expr)) // Unaligned 256-bit integer load; expr is parenthesized so the cast applies to the whole argument expression.
69 #define VLI64(expr) _mm512_load_epi64(expr)
70
71 #define VST(dst, src) _mm512_store_pd(dst, src)
72 #define VSTU(dst, src) _mm512_storeu_pd(dst, src)
73 #define VSTNT(dst, src) _mm512_stream_pd(dst, src)
74
75 #define VG32(offsets, base, scale) _mm512_i32gather_pd(offsets, base, scale)
76 #define VG64(offsets, base, scale) _mm512_i64gather_pd(offsets, base, scale)
77
78 #define VPG32(offsets, base, scale, hint) _mm512_prefetch_i32gather_pd(offsets, base, scale, hint)
79
80 #define VS32(dst_base, dst_offsets, src, scale) _mm512_i32scatter_pd(dst_base, dst_offsets, src, scale)
81 #define VS64(dst_base, dst_offsets, src, scale) _mm512_i64scatter_pd(dst_base, dst_offsets, src, scale)
82
83 #define VPS32(dst_base, dst_offsets, scale, hint) _mm512_prefetch_i32scatter_pd(dst_base, dst_offsets, scale, hint)
84
85 #define VMUL(a, b) _mm512_mul_pd(a, b)
86 #define VADD(a, b) _mm512_add_pd(a, b)
87 #define VADDI32(a,b) _mm256_add_epi32(a,b)
88 #define VMULI32(a,b) _mm256_mul_epi32(a,b) // NOTE(review): _mm256_mul_epi32 multiplies only the low 32-bit integer of each 64-bit element and yields 64-bit products; if an element-wise 32-bit multiply is intended, _mm256_mullo_epi32 is probably wanted - confirm against callers.
89 #define VSUB(a, b) _mm512_sub_pd(a, b)
90 #endif
91
0fde6e45 92 #ifdef VECTOR_AVX
10988083 93
0fde6e45
MW
94 #include <immintrin.h>
95 // Vector size in double-precision floating-point numbers.
96 #define VSIZE 4
10988083 97
0fde6e45 98 #define VPDFT __m256d
10988083 99
0fde6e45 100 #define VSET(scalar) _mm256_set1_pd(scalar)
10988083 101
0fde6e45
MW
102 #define VLD(expr) _mm256_load_pd(expr)
103 #define VLDU(expr) _mm256_loadu_pd(expr)
10988083 104
0fde6e45
MW
105 #define VST(dst, src) _mm256_store_pd(dst, src)
106 #define VSTU(dst, src) _mm256_storeu_pd(dst, src)
107 #define VSTNT(dst, src) _mm256_stream_pd(dst, src)
10988083 108
0fde6e45
MW
109 #define VMUL(a, b) _mm256_mul_pd(a, b)
110 #define VADD(a, b) _mm256_add_pd(a, b)
111 #define VSUB(a, b) _mm256_sub_pd(a, b)
112 #endif
10988083 113
0fde6e45
MW
114 #ifdef VECTOR_SSE
115 #include <emmintrin.h>
116 // Vector size in double-precision floating-point numbers.
117 #define VSIZE 2
10988083 118
0fde6e45 119 #define VPDFT __m128d
10988083 120
0fde6e45
MW
121 #define VSET(scalar) _mm_set1_pd(scalar)
122
123 #define VLD(expr) _mm_load_pd(expr)
124 #define VLDU(expr) _mm_loadu_pd(expr)
125
126 #define VST(dst, src) _mm_store_pd(dst, src)
127 #define VSTU(dst, src) _mm_storeu_pd(dst, src)
128 #define VSTNT(dst, src) _mm_stream_pd(dst, src)
129
130 #define VMUL(a, b) _mm_mul_pd(a, b)
131 #define VADD(a, b) _mm_add_pd(a, b)
132 #define VSUB(a, b) _mm_sub_pd(a, b)
133 #endif
134
135#elif defined(PRECISION_SP)
136
8cafd9ea
MW
137 #ifdef VECTOR_AVX512
138 #error Single precision intrinsic kernels for AVX512 are currently not implemented.
139 #endif
140
0fde6e45
MW
141 #ifdef VECTOR_AVX
142
143 #include <immintrin.h>
144 // Vector size in single-precision floating-point numbers.
145 #define VSIZE 8
146
147 #define VPDFT __m256
148
149 #define VSET(scalar) _mm256_set1_ps(scalar)
150
151 #define VLD(expr) _mm256_load_ps(expr)
152 #define VLDU(expr) _mm256_loadu_ps(expr)
153
154 #define VST(dst, src) _mm256_store_ps(dst, src)
155 #define VSTU(dst, src) _mm256_storeu_ps(dst, src)
156 #define VSTNT(dst, src) _mm256_stream_ps(dst, src)
157
158 #define VMUL(a, b) _mm256_mul_ps(a, b)
159 #define VADD(a, b) _mm256_add_ps(a, b)
160 #define VSUB(a, b) _mm256_sub_ps(a, b)
161 #endif
162
163 #ifdef VECTOR_SSE
164 #include <emmintrin.h>
165 // Vector size in single-precision floating-point numbers.
166 #define VSIZE 4
167
168 #define VPDFT __m128
169
170 #define VSET(scalar) _mm_set1_ps(scalar)
171
172 #define VLD(expr) _mm_load_ps(expr)
173 #define VLDU(expr) _mm_loadu_ps(expr)
174
175 #define VST(dst, src) _mm_store_ps(dst, src)
176 #define VSTU(dst, src) _mm_storeu_ps(dst, src)
177 #define VSTNT(dst, src) _mm_stream_ps(dst, src)
178
179 #define VMUL(a, b) _mm_mul_ps(a, b)
180 #define VADD(a, b) _mm_add_ps(a, b)
181 #define VSUB(a, b) _mm_sub_ps(a, b)
182 #endif
10988083 183
0fde6e45 184#endif // PRECISION
10988083
MW
185
186#endif // __VECTOR_H__
This page took 0.067978 seconds and 5 git commands to generate.