cpu: x64: gemm: change decomposition for tp on avx2

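Giving each thread more data to operate on seems to give better performance. With the threadpool runtime on AVX2-only CPUs, the thread count for f32 problems is now capped at the problem's memory footprint divided by the per-core L2 cache size (rounded up).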
oneapi-src · Jun 28, 2022 · 2be0060 · 2be0060
1 parent efbf9b5
commit 2be0060
Show file tree

Hide file tree

Showing 2 changed files with 26 additions and 6 deletions.
diff --git a/src/cpu/x64/gemm/gemm_driver.cpp b/src/cpu/x64/gemm/gemm_driver.cpp
@@ -1591,11 +1591,21 @@ static inline void adjust_thread_count(dim_t m, dim_t n, dim_t k, int *nthrs) {
     gemm_cycles *= is_f32 ? 2.0 : 8.0;
 
 #if DNNL_CPU_RUNTIME == DNNL_RUNTIME_THREADPOOL
-    if (is_avx512 && is_f32) {
-        auto l2_cache_per_thread = platform::get_per_core_cache_size(2);
-        int n_cores_per_socket = static_cast<int>(platform::get_num_cores());
+    if (is_f32) {
+        static auto l2_cache_per_thread = platform::get_per_core_cache_size(2);
+        static int n_cores_per_socket
+                = static_cast<int>(platform::get_num_cores());
         auto l2_cache_socket = l2_cache_per_thread * n_cores_per_socket;
         auto problem_memory_footprint = (m * n + m * k + n * k) * sizeof(float);
+
+        if (is_only_avx2) {
+            // Somehow it seems beneficial to split the job into bigger pieces.
+            // Use L2 per-core cache size as a deal-breaker.
+            int use_n_threads = utils::div_up(
+                    problem_memory_footprint, l2_cache_per_thread);
+            *nthrs = nstl::min(*nthrs, use_n_threads);
+            return;
+        }
         if (l2_cache_socket > problem_memory_footprint) {
             *nthrs = nstl::min(*nthrs, n_cores_per_socket);
             return;

diff --git a/src/cpu/x64/gemm/gemv_driver.cpp b/src/cpu/x64/gemm/gemv_driver.cpp
@@ -265,12 +265,22 @@ static inline int thread_checker(
             }
         }
 #if DNNL_CPU_RUNTIME == DNNL_RUNTIME_THREADPOOL
-        if (mayiuse(avx512_core) && is_f32) {
-            auto l2_cache_per_thread = platform::get_per_core_cache_size(2);
-            int n_cores_per_socket
+        if (is_f32) {
+            static const bool is_avx2 = mayiuse(avx2) && !mayiuse(avx512_core);
+            static auto l2_cache_per_thread
+                    = platform::get_per_core_cache_size(2);
+            static int n_cores_per_socket
                     = static_cast<int>(platform::get_num_cores());
             auto l2_cache_socket = l2_cache_per_thread * n_cores_per_socket;
             auto problem_memory_footprint = m * n * sizeof(float);
+
+            if (is_avx2) {
+                // Somehow it seems beneficial to split the job into bigger
+                // pieces. Use L2 per-core cache size as a deal-breaker.
+                int use_n_threads = utils::div_up(
+                        problem_memory_footprint, l2_cache_per_thread);
+                return nstl::min(nthr, use_n_threads);
+            }
             if (l2_cache_socket > problem_memory_footprint) {
                 return nstl::min(nthr, n_cores_per_socket);
             }