squashed commits from internal repository
author Markus Wittmann <markus.wittmann@fau.de>
Tue, 22 May 2018 08:24:08 +0000 (10:24 +0200)
committer Markus Wittmann <markus.wittmann@fau.de>
Tue, 22 May 2018 08:24:08 +0000 (10:24 +0200)
- doc: skylakesp2 results now with AVX512 intrinsics

- add AVX512 support for single precision intrinsics without
  gather/scatter

  The AVX512 intrinsics are divided into "pure" AVX512
  (load/store/compute) and AVX512-GATHER, which includes gather/scatter.

  This enables us to support at least AVX512 single precision intrinsics
  for all kernels which do not require gather/scatter support.

- fix test.sh: reenabled dp tests

- fix pull-split-nt: adjusted tmp array size to different vector
  lengths

doc/images/benchmark-skylakesp2-dp.png
doc/images/benchmark-skylakesp2-sp.png
doc/main.html
doc/main.rst
src/BenchKernelD3Q19ListPullSplitNtCommon.c
src/KernelFunctions.h
src/Main.c
src/Makefile
src/Vector.h
src/test.sh

index 974cbcba3ac25267b59500ad32440222642d154d..5488063275334d9316e30024f14c217467d30c32 100644 (file)
Binary files a/doc/images/benchmark-skylakesp2-dp.png and b/doc/images/benchmark-skylakesp2-dp.png differ
index 583b053b9079f9e9e548eee55de691988d37d6ee..d369e63f2b779cb6388c17ceb79a8ca8c382f187 100644 (file)
Binary files a/doc/images/benchmark-skylakesp2-sp.png and b/doc/images/benchmark-skylakesp2-sp.png differ
index dfd45ecc93a1b4bea8ec14e1fd029069ba807551..89f4676419cf6af9d3fb9a59d4e0cf4e98ee694d 100644 (file)
@@ -592,7 +592,7 @@ make clean-all
 <tr><td>OPENMP</td>
 <td>on, off</td>
 <td>on</td>
-<td>OpenMP, i.,e.. threading support.</td>
+<td>OpenMP, i.e. threading support.</td>
 </tr>
 <tr><td>PRECISION</td>
 <td>dp, sp</td>
@@ -637,6 +637,22 @@ make clean-all
 </tr>
 </thead>
 <tbody valign="top">
+<tr><td>ADJ_LIST_MEM_TYPE</td>
+<td>HBM</td>
+<td><ul class="first last simple">
+<li></li>
+</ul>
+</td>
+<td>Determines memory location of adjacency list array, DRAM or HBM.</td>
+</tr>
+<tr><td>PDF_MEM_TYPE</td>
+<td>HBM</td>
+<td><ul class="first last simple">
+<li></li>
+</ul>
+</td>
+<td>Determines memory location of PDF array, DRAM or HBM.</td>
+</tr>
 <tr><td>SOFTWARE_PREFETCH_LOOKAHEAD_L1</td>
 <td>int &gt;= 0</td>
 <td>0</td>
@@ -1153,7 +1169,6 @@ which mimics the kernels memory access pattern and the kernel's loop balance
 </li>
 </ul>
 <p><strong>Skylake, Intel Xeon Gold 6148</strong></p>
-<p>NOTE: currently we only use AVX2 intrinsics.</p>
 <ul class="simple">
 <li>Skylake server architecture, AVX2, AVX512, 2 FMA units</li>
 <li>20 cores, 2.4 GHz</li>
@@ -1225,11 +1240,11 @@ which mimics the kernels memory access pattern and the kernel's loop balance
 </tr>
 <tr><td><img alt="perf_meggie_sp" src="images/benchmark-meggie-sp.png" style="width: 1000.0px; height: 250.0px;" /></td>
 </tr>
-<tr><td>Skylake, Intel Xeon Gold 6148, Double Precision, <strong>NOTE: currently we only use AVX2 intrinsics.</strong></td>
+<tr><td>Skylake, Intel Xeon Gold 6148, Double Precision</td>
 </tr>
 <tr><td><img alt="perf_skylakesp2_dp" src="images/benchmark-skylakesp2-dp.png" style="width: 1000.0px; height: 250.0px;" /></td>
 </tr>
-<tr><td>Skylake, Intel Xeon Gold 6148, Single Precision, <strong>NOTE: currently we only use AVX2 intrinsics.</strong></td>
+<tr><td>Skylake, Intel Xeon Gold 6148, Single Precision</td>
 </tr>
 <tr><td><img alt="perf_skylakesp2_sp" src="images/benchmark-skylakesp2-sp.png" style="width: 1000.0px; height: 250.0px;" /></td>
 </tr>
@@ -1280,7 +1295,7 @@ Roofline: an insightful visual performance model for multicore architectures.
 Commun. ACM, 52(4):65-76, Apr 2009. doi:10.1145/1498765.1498785</td></tr>
 </tbody>
 </table>
-<p>Document was generated at 2018-05-10 14:10.</p>
+<p>Document was generated at 2018-05-22 10:11.</p>
 </div>
 </div>
 </body>
index 3a5c75ca9d9b64542fb9036aae4a6df8c31917f5..4e13caa970d8fb8d750713fa03b7bbb44ef69b1c 100644 (file)
@@ -550,8 +550,6 @@ Machine Specifications
 
 **Skylake, Intel Xeon Gold 6148**
 
-NOTE: currently we only use AVX2 intrinsics.
-
 - Skylake server architecture, AVX2, AVX512, 2 FMA units
 - 20 cores, 2.4 GHz
 - SMT enabled
@@ -628,9 +626,9 @@ Single Socket Results
   * - |perf_meggie_dp|
   * - Broadwell, Intel Xeon E5-2630 v4, Single Precision
   * - |perf_meggie_sp|
-  * - Skylake, Intel Xeon Gold 6148, Double Precision, **NOTE: currently we only use AVX2 intrinsics.**
+  * - Skylake, Intel Xeon Gold 6148, Double Precision
   * - |perf_skylakesp2_dp|
-  * - Skylake, Intel Xeon Gold 6148, Single Precision, **NOTE: currently we only use AVX2 intrinsics.**
+  * - Skylake, Intel Xeon Gold 6148, Single Precision
   * - |perf_skylakesp2_sp|
   * - Zen, AMD Ryzen 7 1700X, Double Precision
   * - |perf_summitridge1_dp|
index 8c8da885578c21b5b6883619536464e993f7f8c9..e06d7fac1ec4c0379d0ed90fb3081dfd51bea0b6 100644 (file)
@@ -195,7 +195,7 @@ static void ParseParameters(Parameters * params, int * blk, int * nTmpArray, Pad
        Assert(blk != NULL);
 
        blk[0] = 0; blk[1] = 0; blk[2] = 0;
-       *nTmpArray = 152;
+       *nTmpArray = 152 - (152 % VSIZE);
        *padInfo = NULL;
 
        #define ARG_IS(param)                   (!strcmp(params->KernelArgs[i], param))
@@ -491,7 +491,9 @@ static void FNAME(Init)(LatticeDesc * ld, KernelData ** kernelData, Parameters *
        }
 
        // We padd each stream of a PDF array for a complete cache line.
-       nCells = nCells + (8 - nCells % 8);
+       int nElementsPerCl = 64 / sizeof(PdfT);
+
+       nCells = nCells + (nElementsPerCl - nCells % nElementsPerCl);
 
        Assert(nCells % VSIZE == 0);
 
index 9c6b53e3ddc64c8a0b38d339b09b816e4cf6e377..1f3d16cfa1e42f771a95b31b08aa5a1b40002c1b 100644 (file)
@@ -39,7 +39,7 @@
 #include "BenchKernelD3Q19ListAa.h"
 #include "BenchKernelD3Q19ListAaRia.h"
 #include "BenchKernelD3Q19ListAaPv.h"
-#ifdef VECTOR_AVX512
+#ifdef VECTOR_AVX512_GATHER
 #include "BenchKernelD3Q19ListAaPvGatherAoSoA.h"
 #include "BenchKernelD3Q19ListAaPvGather.h"
 #include "BenchKernelD3Q19ListAaPvGatherHybrid.h"
@@ -75,7 +75,7 @@ KernelFunctions g_kernels[] =
                .Init   = D3Q19ListAaInit_PushAoS,
                .Deinit = D3Q19ListAaDeinit_PushAoS
        },
-#ifdef VECTOR_AVX512
+#ifdef VECTOR_AVX512_GATHER
        {
                .Name   = "list-aa-pv-gather-aosoa",
                .Init   = D3Q19ListAaPvGatherAoSoAInit_PushAoSoA,
index c9bb590cf91868058c02f6af39f5ea51847d66c5..bb2d526b44bd519f661f3d6edc34ab03fc89e87c 100644 (file)
@@ -455,6 +455,10 @@ int main(int argc, char * argv[])
        printf("# - intrinsics:        UNKNOWN\n");
 #endif
 
+#if defined(VECTOR_AVX512_GATHER)
+       printf("# - intrinsics:        AVX512 gather (VECTOR_AVX512_GATHER defined)\n");
+#endif
+
        printf("# - defines:           ");
        for (int j = 0; j < N_ELEMS(defines); ++j) {
                printf("%s ", defines[j]);
index 8eb58d44150461f9866dcb806b2a6d52a00b4396..16277cddddccf440d897d003bf6a8eef6ecc295f 100644 (file)
@@ -277,6 +277,7 @@ endif
 ifeq (avx512,$(ISA))
   ifeq (dp,$(PRECISION))
     OBJ_C += $(OBJ_C_ISA_AVX_512)
+    PP_FLAGS += $(D)VECTOR_AVX512_GATHER
   endif
 endif
 
index cc9b7d7a3533a6fee1989b23f581e478a576d173..490fbd1f9c46a5b94c2973ca2e5cca6626959918 100644 (file)
 #elif defined(PRECISION_SP)
 
        #ifdef VECTOR_AVX512
-               #error Single precision intrinsic kernels for AVX512 are currently not implemented.
+
+               #include <immintrin.h>
+               // Vector size in single-precision floating-point numbers.
+               #define VSIZE   16
+
+               #define VPDFT                           __m512
+
+               #define VSET(scalar)            _mm512_set1_ps(scalar)
+               // #define VSETI32(scalar)     _mm256_set1_epi32(scalar)
+
+               #define VLD(expr)                       _mm512_load_ps(expr)
+               #define VLDU(expr)                      _mm512_loadu_ps(expr)
+               #define VLIU(expr)                      _mm256_loadu_si256((__m256i const *)expr)
+               // #define VLI64(expr)                  _mm512_load_epi64(expr)
+
+               #define VST(dst, src)           _mm512_store_ps(dst, src)
+               #define VSTU(dst, src)          _mm512_storeu_ps(dst, src)
+               #define VSTNT(dst, src)         _mm512_stream_ps(dst, src)
+
+               // #define VG32(offsets, base, scale)   _mm512_i32gather_pd(offsets, base, scale)
+               // #define VG64(offsets, base, scale)   _mm512_i64gather_pd(offsets, base, scale)
+
+               // #define VPG32(offsets, base, scale, hint) _mm512_prefetch_i32gather_pd(offsets, base, scale, hint)
+
+               // #define VS32(dst_base, dst_offsets, src, scale)      _mm512_i32scatter_pd(dst_base, dst_offsets, src, scale)
+               // #define VS64(dst_base, dst_offsets, src, scale)      _mm512_i64scatter_pd(dst_base, dst_offsets, src, scale)
+
+               // #define VPS32(dst_base, dst_offsets, scale, hint) _mm512_prefetch_i32scatter_pd(dst_base, dst_offsets, scale, hint)
+
+               #define VMUL(a, b)                      _mm512_mul_ps(a, b)
+               #define VADD(a, b)                      _mm512_add_ps(a, b)
+               // #define VADDI32(a,b)         _mm256_add_epi32(a,b)
+               // #define VMULI32(a,b)         _mm256_mul_epi32(a,b)
+               #define VSUB(a, b)                      _mm512_sub_ps(a, b)
        #endif
 
        #ifdef VECTOR_AVX
index aafefc8115a806be1312ec8b9ab5695e3f07c0b5..121de498ec3041a185dbace304f24b6a752b2b52 100755 (executable)
@@ -55,14 +55,14 @@ fi
 
 Config="$1"
 
-make clean-all
-#
-make -j $NProc PRECISION=dp CONFIG=$Config TAG=$XTag-debug
-make -j $NProc PRECISION=dp CONFIG=$Config BUILD=$Build TAG=$XTag-v VERIFICATION=on
-make -j $NProc PRECISION=dp CONFIG=$Config BUILD=$Build TAG=$XTag-b BENCHMARK=on
-#
-BinaryVDp="../bin/lbmbenchk-$Config-$Build-dp$XTag-v"
-BinaryBDp="../bin/lbmbenchk-$Config-$Build-dp$XTag-b"
+make clean-all
+
+make -j $NProc PRECISION=dp CONFIG=$Config TAG=$XTag-debug
+make -j $NProc PRECISION=dp CONFIG=$Config BUILD=$Build TAG=$XTag-v VERIFICATION=on
+make -j $NProc PRECISION=dp CONFIG=$Config BUILD=$Build TAG=$XTag-b BENCHMARK=on
+
+BinaryVDp="../bin/lbmbenchk-$Config-$Build-dp$XTag-v"
+BinaryBDp="../bin/lbmbenchk-$Config-$Build-dp$XTag-b"
 
 
 make -j $NProc PRECISION=sp CONFIG=$Config TAG=$XTag-debug
This page took 0.285744 seconds and 5 git commands to generate.