<tr><td>OPENMP</td>
<td>on, off</td>
<td>on</td>
-<td>OpenMP, i.,e.. threading support.</td>
+<td>OpenMP, i.e. threading support.</td>
</tr>
<tr><td>PRECISION</td>
<td>dp, sp</td>
</tr>
</thead>
<tbody valign="top">
+<tr><td>ADJ_LIST_MEM_TYPE</td>
+<td>HBM</td>
+<td>-</td>
+<td>Determines memory location of adjacency list array, DRAM or HBM.</td>
+</tr>
+<tr><td>PDF_MEM_TYPE</td>
+<td>HBM</td>
+<td>-</td>
+<td>Determines memory location of PDF array, DRAM or HBM.</td>
+</tr>
<tr><td>SOFTWARE_PREFETCH_LOOKAHEAD_L1</td>
<td>int >= 0</td>
<td>0</td>
</li>
</ul>
<p><strong>Skylake, Intel Xeon Gold 6148</strong></p>
-<p>NOTE: currently we only use AVX2 intrinsics.</p>
<ul class="simple">
<li>Skylake server architecture, AVX2, AVX512, 2 FMA units</li>
<li>20 cores, 2.4 GHz</li>
</tr>
<tr><td><img alt="perf_meggie_sp" src="images/benchmark-meggie-sp.png" style="width: 1000.0px; height: 250.0px;" /></td>
</tr>
-<tr><td>Skylake, Intel Xeon Gold 6148, Double Precision, <strong>NOTE: currently we only use AVX2 intrinsics.</strong></td>
+<tr><td>Skylake, Intel Xeon Gold 6148, Double Precision</td>
</tr>
<tr><td><img alt="perf_skylakesp2_dp" src="images/benchmark-skylakesp2-dp.png" style="width: 1000.0px; height: 250.0px;" /></td>
</tr>
-<tr><td>Skylake, Intel Xeon Gold 6148, Single Precision, <strong>NOTE: currently we only use AVX2 intrinsics.</strong></td>
+<tr><td>Skylake, Intel Xeon Gold 6148, Single Precision</td>
</tr>
<tr><td><img alt="perf_skylakesp2_sp" src="images/benchmark-skylakesp2-sp.png" style="width: 1000.0px; height: 250.0px;" /></td>
</tr>
Commun. ACM, 52(4):65-76, Apr 2009. doi:10.1145/1498765.1498785</td></tr>
</tbody>
</table>
-<p>Document was generated at 2018-05-10 14:10.</p>
+<p>Document was generated at 2018-05-22 10:11.</p>
</div>
</div>
</body>
**Skylake, Intel Xeon Gold 6148**
-NOTE: currently we only use AVX2 intrinsics.
-
- Skylake server architecture, AVX2, AVX512, 2 FMA units
- 20 cores, 2.4 GHz
- SMT enabled
* - |perf_meggie_dp|
* - Broadwell, Intel Xeon E5-2630 v4, Single Precision
* - |perf_meggie_sp|
- * - Skylake, Intel Xeon Gold 6148, Double Precision, **NOTE: currently we only use AVX2 intrinsics.**
+ * - Skylake, Intel Xeon Gold 6148, Double Precision
* - |perf_skylakesp2_dp|
- * - Skylake, Intel Xeon Gold 6148, Single Precision, **NOTE: currently we only use AVX2 intrinsics.**
+ * - Skylake, Intel Xeon Gold 6148, Single Precision
* - |perf_skylakesp2_sp|
* - Zen, AMD Ryzen 7 1700X, Double Precision
* - |perf_summitridge1_dp|
Assert(blk != NULL);
blk[0] = 0; blk[1] = 0; blk[2] = 0;
- *nTmpArray = 152;
+ *nTmpArray = 152 - (152 % VSIZE);
*padInfo = NULL;
#define ARG_IS(param) (!strcmp(params->KernelArgs[i], param))
}
// We pad each stream of a PDF array to a complete cache line.
- nCells = nCells + (8 - nCells % 8);
+ int nElementsPerCl = 64 / sizeof(PdfT);
+
+ nCells = nCells + (nElementsPerCl - nCells % nElementsPerCl);
Assert(nCells % VSIZE == 0);
#include "BenchKernelD3Q19ListAa.h"
#include "BenchKernelD3Q19ListAaRia.h"
#include "BenchKernelD3Q19ListAaPv.h"
-#ifdef VECTOR_AVX512
+#ifdef VECTOR_AVX512_GATHER
#include "BenchKernelD3Q19ListAaPvGatherAoSoA.h"
#include "BenchKernelD3Q19ListAaPvGather.h"
#include "BenchKernelD3Q19ListAaPvGatherHybrid.h"
.Init = D3Q19ListAaInit_PushAoS,
.Deinit = D3Q19ListAaDeinit_PushAoS
},
-#ifdef VECTOR_AVX512
+#ifdef VECTOR_AVX512_GATHER
{
.Name = "list-aa-pv-gather-aosoa",
.Init = D3Q19ListAaPvGatherAoSoAInit_PushAoSoA,
printf("# - intrinsics: UNKNOWN\n");
#endif
+#if defined(VECTOR_AVX512_GATHER)
+ printf("# - intrinsics: AVX512 gather (VECTOR_AVX512_GATHER defined)\n");
+#endif
+
printf("# - defines: ");
for (int j = 0; j < N_ELEMS(defines); ++j) {
printf("%s ", defines[j]);
ifeq (avx512,$(ISA))
ifeq (dp,$(PRECISION))
OBJ_C += $(OBJ_C_ISA_AVX_512)
+ PP_FLAGS += $(D)VECTOR_AVX512_GATHER
endif
endif
#elif defined(PRECISION_SP)
#ifdef VECTOR_AVX512
- #error Single precision intrinsic kernels for AVX512 are currently not implemented.
+
+ #include <immintrin.h>
+    // Vector size in single-precision floating-point numbers.
+ #define VSIZE 16
+
+ #define VPDFT __m512
+
+ #define VSET(scalar) _mm512_set1_ps(scalar)
+ // #define VSETI32(scalar) _mm256_set1_epi32(scalar)
+
+ #define VLD(expr) _mm512_load_ps(expr)
+ #define VLDU(expr) _mm512_loadu_ps(expr)
+ #define VLIU(expr) _mm256_loadu_si256((__m256i const *)expr)
+ // #define VLI64(expr) _mm512_load_epi64(expr)
+
+ #define VST(dst, src) _mm512_store_ps(dst, src)
+ #define VSTU(dst, src) _mm512_storeu_ps(dst, src)
+ #define VSTNT(dst, src) _mm512_stream_ps(dst, src)
+
+ // #define VG32(offsets, base, scale) _mm512_i32gather_pd(offsets, base, scale)
+ // #define VG64(offsets, base, scale) _mm512_i64gather_pd(offsets, base, scale)
+
+ // #define VPG32(offsets, base, scale, hint) _mm512_prefetch_i32gather_pd(offsets, base, scale, hint)
+
+ // #define VS32(dst_base, dst_offsets, src, scale) _mm512_i32scatter_pd(dst_base, dst_offsets, src, scale)
+ // #define VS64(dst_base, dst_offsets, src, scale) _mm512_i64scatter_pd(dst_base, dst_offsets, src, scale)
+
+ // #define VPS32(dst_base, dst_offsets, scale, hint) _mm512_prefetch_i32scatter_pd(dst_base, dst_offsets, scale, hint)
+
+ #define VMUL(a, b) _mm512_mul_ps(a, b)
+ #define VADD(a, b) _mm512_add_ps(a, b)
+ // #define VADDI32(a,b) _mm256_add_epi32(a,b)
+ // #define VMULI32(a,b) _mm256_mul_epi32(a,b)
+ #define VSUB(a, b) _mm512_sub_ps(a, b)
#endif
#ifdef VECTOR_AVX
Config="$1"
-# make clean-all
-#
-# make -j $NProc PRECISION=dp CONFIG=$Config TAG=$XTag-debug
-# make -j $NProc PRECISION=dp CONFIG=$Config BUILD=$Build TAG=$XTag-v VERIFICATION=on
-# make -j $NProc PRECISION=dp CONFIG=$Config BUILD=$Build TAG=$XTag-b BENCHMARK=on
-#
-# BinaryVDp="../bin/lbmbenchk-$Config-$Build-dp$XTag-v"
-# BinaryBDp="../bin/lbmbenchk-$Config-$Build-dp$XTag-b"
+make clean-all
+
+make -j $NProc PRECISION=dp CONFIG=$Config TAG=$XTag-debug
+make -j $NProc PRECISION=dp CONFIG=$Config BUILD=$Build TAG=$XTag-v VERIFICATION=on
+make -j $NProc PRECISION=dp CONFIG=$Config BUILD=$Build TAG=$XTag-b BENCHMARK=on
+
+BinaryVDp="../bin/lbmbenchk-$Config-$Build-dp$XTag-v"
+BinaryBDp="../bin/lbmbenchk-$Config-$Build-dp$XTag-b"
make -j $NProc PRECISION=sp CONFIG=$Config TAG=$XTag-debug