Skip to content

Commit

Permalink
cpu: x64: gemm: change decomposition for tp on avx2
Browse files Browse the repository at this point in the history
Giving each thread more data to operate on seems to give better
performance.
  • Loading branch information
dzarukin committed Jun 28, 2022
1 parent efbf9b5 commit 2be0060
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 6 deletions.
16 changes: 13 additions & 3 deletions src/cpu/x64/gemm/gemm_driver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1591,11 +1591,21 @@ static inline void adjust_thread_count(dim_t m, dim_t n, dim_t k, int *nthrs) {
gemm_cycles *= is_f32 ? 2.0 : 8.0;

#if DNNL_CPU_RUNTIME == DNNL_RUNTIME_THREADPOOL
if (is_avx512 && is_f32) {
auto l2_cache_per_thread = platform::get_per_core_cache_size(2);
int n_cores_per_socket = static_cast<int>(platform::get_num_cores());
if (is_f32) {
static auto l2_cache_per_thread = platform::get_per_core_cache_size(2);
static int n_cores_per_socket
= static_cast<int>(platform::get_num_cores());
auto l2_cache_socket = l2_cache_per_thread * n_cores_per_socket;
auto problem_memory_footprint = (m * n + m * k + n * k) * sizeof(float);

if (is_only_avx2) {
// Somehow it seems beneficial to split the job into bigger pieces.
// Use L2 per-core cache size as a deal-breaker.
int use_n_threads = utils::div_up(
problem_memory_footprint, l2_cache_per_thread);
*nthrs = nstl::min(*nthrs, use_n_threads);
return;
}
if (l2_cache_socket > problem_memory_footprint) {
*nthrs = nstl::min(*nthrs, n_cores_per_socket);
return;
Expand Down
16 changes: 13 additions & 3 deletions src/cpu/x64/gemm/gemv_driver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -265,12 +265,22 @@ static inline int thread_checker(
}
}
#if DNNL_CPU_RUNTIME == DNNL_RUNTIME_THREADPOOL
if (mayiuse(avx512_core) && is_f32) {
auto l2_cache_per_thread = platform::get_per_core_cache_size(2);
int n_cores_per_socket
if (is_f32) {
static const bool is_avx2 = mayiuse(avx2) && !mayiuse(avx512_core);
static auto l2_cache_per_thread
= platform::get_per_core_cache_size(2);
static int n_cores_per_socket
= static_cast<int>(platform::get_num_cores());
auto l2_cache_socket = l2_cache_per_thread * n_cores_per_socket;
auto problem_memory_footprint = m * n * sizeof(float);

if (is_avx2) {
// Somehow it seems beneficial to split the job into bigger
// pieces. Use L2 per-core cache size as a deal-breaker.
int use_n_threads = utils::div_up(
problem_memory_footprint, l2_cache_per_thread);
return nstl::min(nthr, use_n_threads);
}
if (l2_cache_socket > problem_memory_footprint) {
return nstl::min(nthr, n_cores_per_socket);
}
Expand Down

0 comments on commit 2be0060

Please sign in to comment.