Compute Library v24.08.1

ARM-software · Aug 22, 2024 · de7288c · de7288c
1 parent f1929dc
commit de7288c
Showing 24 changed files with 75 additions and 67 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -28,7 +28,7 @@ cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
 list(APPEND CMAKE_MESSAGE_CONTEXT ArmCompute)
 project(
   ArmCompute
-  VERSION 40.0.0
+  VERSION 41.0.0
   DESCRIPTION
     "The Arm Compute Library is a collection of low-level machine learning functions optimized for Arm® Cortex®-A CPU and Arm® Mali™ GPU architectures"
   LANGUAGES C CXX ASM)

diff --git a/README.md b/README.md
@@ -9,7 +9,7 @@
  <img src="https://raw.githubusercontent.com/ARM-software/ComputeLibrary/gh-pages/ACL_logo.png"/><br><br>
 </div>
 
-# Compute Library ![](https://img.shields.io/badge/latest_release-24.08-green)
+# Compute Library ![](https://img.shields.io/badge/latest_release-24.08.1-green)
 
 
 The Compute Library is a collection of low-level machine learning functions optimized for Arm® Cortex®-A, Arm® Neoverse® and Arm® Mali™ GPUs architectures.<br>
@@ -37,7 +37,7 @@ Key Features:
 <br>
 
 ## Documentation
-[![Documentation](https://img.shields.io/badge/documentation-24.08-green)](https://artificial-intelligence.sites.arm.com/computelibrary/v24.08/index.xhtml)
+[![Documentation](https://img.shields.io/badge/documentation-24.08.1-green)](https://artificial-intelligence.sites.arm.com/computelibrary/v24.08.1/index.xhtml)
 
 > Note: The documentation includes the reference API, changelogs, build guide, contribution guide, errata, etc.
 
@@ -50,22 +50,22 @@ All the binaries can be downloaded from [here](https://github.com/ARM-software/C
 
 | Platform       | Operating System | Release archive (Download) |
 | -------------- | ---------------- | -------------------------- |
-| Raspberry Pi 4 | Linux® 32bit      | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08/arm_compute-v24.08-linux-armv7a-cpu-bin.tar.gz) |
-| Raspberry Pi 4 | Linux® 64bit      | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08/arm_compute-v24.08-linux-aarch64-cpu-bin.tar.gz) |
-| Odroid N2      | Linux® 64bit      | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08/arm_compute-v24.08-linux-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08/arm_compute-v24.08-linux-aarch64-cpu-gpu-bin.tar.gz) |
-| HiKey960       | Linux® 64bit      | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08/arm_compute-v24.08-linux-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08/arm_compute-v24.08-linux-aarch64-cpu-gpu-bin.tar.gz) |
+| Raspberry Pi 4 | Linux® 32bit      | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08.1/arm_compute-v24.08.1-linux-armv7a-cpu-bin.tar.gz) |
+| Raspberry Pi 4 | Linux® 64bit      | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08.1/arm_compute-v24.08.1-linux-aarch64-cpu-bin.tar.gz) |
+| Odroid N2      | Linux® 64bit      | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08.1/arm_compute-v24.08.1-linux-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08.1/arm_compute-v24.08.1-linux-aarch64-cpu-gpu-bin.tar.gz) |
+| HiKey960       | Linux® 64bit      | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08.1/arm_compute-v24.08.1-linux-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08.1/arm_compute-v24.08.1-linux-aarch64-cpu-gpu-bin.tar.gz) |
 
 <br>
 
 | Architecture | Operating System | Release archive (Download) |
 | ------------ | ---------------- | -------------------------- |
-| armv7        | Linux®            | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08/arm_compute-v24.08-linux-armv7a-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08/arm_compute-v24.08-linux-armv7a-cpu-gpu-bin.tar.gz) |
-| arm64-v8a    | Android™          | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08/arm_compute-v24.08-android-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08/arm_compute-v24.08-android-aarch64-cpu-gpu-bin.tar.gz) |
-| arm64-v8a    | Linux®            | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08/arm_compute-v24.08-linux-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08/arm_compute-v24.08-linux-aarch64-cpu-gpu-bin.tar.gz) |
+| armv7        | Linux®            | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08.1/arm_compute-v24.08.1-linux-armv7a-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08.1/arm_compute-v24.08.1-linux-armv7a-cpu-gpu-bin.tar.gz) |
+| arm64-v8a    | Android™          | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08.1/arm_compute-v24.08.1-android-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08.1/arm_compute-v24.08.1-android-aarch64-cpu-gpu-bin.tar.gz) |
+| arm64-v8a    | Linux®            | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08.1/arm_compute-v24.08.1-linux-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08.1/arm_compute-v24.08.1-linux-aarch64-cpu-gpu-bin.tar.gz) |
 
 <br>
 
-Please refer to the following link for more pre-built binaries: [![](https://img.shields.io/badge/v24.08-bins-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/tag/v24.08)
+Please refer to the following link for more pre-built binaries: [![](https://img.shields.io/badge/v24.08.1-bins-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/tag/v24.08.1)
 
 Pre-build binaries are generated with the following security / good coding practices related flags:
 > -Wall, -Wextra, -Wformat=2, -Winit-self, -Wstrict-overflow=2, -Wswitch-default, -Woverloaded-virtual, -Wformat-security, -Wctor-dtor-privacy, -Wsign-promo, -Weffc++, -pedantic, -fstack-protector-strong
@@ -107,13 +107,13 @@ Pre-build binaries are generated with the following security / good coding pract
 
 ## Experimental builds
 
-**⚠ Important** Bazel and CMake builds are experimental CPU only builds, please see the [documentation](https://artificial-intelligence.sites.arm.com/computelibrary/v24.08/how_to_build.xhtml) for more details.
+**⚠ Important** Bazel and CMake builds are experimental CPU only builds, please see the [documentation](https://artificial-intelligence.sites.arm.com/computelibrary/v24.08.1/how_to_build.xhtml) for more details.
 
 <br>
 
 ## How to contribute
 
-Contributions to the Compute Library are more than welcome. If you are interested on contributing, please have a look at our [how to contribute guidelines](https://artificial-intelligence.sites.arm.com/computelibrary/v24.08/contribution_guidelines.xhtml).
+Contributions to the Compute Library are more than welcome. If you are interested in contributing, please have a look at our [how to contribute guidelines](https://artificial-intelligence.sites.arm.com/computelibrary/v24.08.1/contribution_guidelines.xhtml).
 
 ### Developer Certificate of Origin (DCO)
 Before the Compute Library accepts your contribution, you need to certify its origin and give us your permission. To manage this process we use the Developer Certificate of Origin (DCO) V1.1 (https://developercertificate.org/)

diff --git a/SConscript b/SConscript
@@ -33,8 +33,8 @@ import codecs
 import platform
 import SCons
 
-VERSION = "v24.08"
-LIBRARY_VERSION_MAJOR = 40
+VERSION = "v24.08.1"
+LIBRARY_VERSION_MAJOR = 41
 LIBRARY_VERSION_MINOR = 0
 LIBRARY_VERSION_PATCH = 0
 SONAME_VERSION = str(LIBRARY_VERSION_MAJOR) + "." + str(LIBRARY_VERSION_MINOR) + "." + str(LIBRARY_VERSION_PATCH)

diff --git a/arm_compute/runtime/experimental/operators/CpuActivation.h b/arm_compute/runtime/experimental/operators/CpuActivation.h
@@ -38,7 +38,7 @@ namespace op
 /** Wrapper class for CpuActivation. For information on the functions,
  * see "src/cpu/operators/CpuActivation.h"
 */
-class CpuActivation : INEOperator
+class CpuActivation : public INEOperator
 {
 public:
     /** Constructor **/

diff --git a/arm_compute/runtime/experimental/operators/CpuAdd.h b/arm_compute/runtime/experimental/operators/CpuAdd.h
@@ -39,7 +39,7 @@ namespace op
 /** Wrapper class for CpuAdd. For information on the functions,
  * see "src/cpu/operators/CpuAdd.h"
 */
-class CpuAdd : INEOperator
+class CpuAdd : public INEOperator
 {
 public:
     /** Constructor */

diff --git a/arm_compute/runtime/experimental/operators/CpuDepthwiseConv2d.h b/arm_compute/runtime/experimental/operators/CpuDepthwiseConv2d.h
@@ -41,7 +41,7 @@ namespace op
  * Any new features should be added to arm_compute::cpu::CpuDepthwiseConv2d and
  * arm_compute::experimental::op::CpuDepthwiseConv2d should remain a shallow wrapper.
 */
-class CpuDepthwiseConv2d : IOperator
+class CpuDepthwiseConv2d : public IOperator
 {
 public:
     /** Constructor **/
@@ -55,7 +55,7 @@ class CpuDepthwiseConv2d : IOperator
     /** Default move assignment */
     CpuDepthwiseConv2d &operator=(CpuDepthwiseConv2d &&) = default;
     /** Default destructor */
-    ~CpuDepthwiseConv2d();
+    ~CpuDepthwiseConv2d() override;
 
     /** Initialize the function's source, destination, weights and convolution information.
      *

diff --git a/arm_compute/runtime/experimental/operators/CpuElementwise.h b/arm_compute/runtime/experimental/operators/CpuElementwise.h
@@ -41,7 +41,7 @@ namespace op
 /** Wrapper class for CpuElementwiseDivision. For information on the functions,
  * see "src/cpu/operators/CpuElementwise.h"
 */
-class CpuElementwiseDivision : INEOperator
+class CpuElementwiseDivision : public INEOperator
 {
 public:
     /** Constructor */
@@ -81,7 +81,7 @@ class CpuElementwiseDivision : INEOperator
 /** Wrapper class for CpuElementwiseMax. For information on the functions,
  * see "src/cpu/operators/CpuElementwise.h"
 */
-class CpuElementwiseMax : INEOperator
+class CpuElementwiseMax : public INEOperator
 {
 public:
     /** Constructor */
@@ -121,7 +121,7 @@ class CpuElementwiseMax : INEOperator
 /** Wrapper class for CpuElementwiseMin. For information on the functions,
  * see "src/cpu/operators/CpuElementwise.h"
 */
-class CpuElementwiseMin : INEOperator
+class CpuElementwiseMin : public INEOperator
 {
 public:
     /** Constructor */

diff --git a/arm_compute/runtime/experimental/operators/CpuGemm.h b/arm_compute/runtime/experimental/operators/CpuGemm.h
@@ -45,7 +45,7 @@ namespace op
 /** Wrapper class for CpuGemm. For information on the operators,
  * see "src/cpu/operators/CpuGemm.h"
 */
-class CpuGemm : IOperator
+class CpuGemm : public IOperator
 {
 public:
     /** Constructor **/

diff --git a/arm_compute/runtime/experimental/operators/CpuGemmConv2d.h b/arm_compute/runtime/experimental/operators/CpuGemmConv2d.h
@@ -42,7 +42,7 @@ namespace op
  * Any new features should be added to arm_compute::cpu::CpuGemmConv2d and
  * arm_compute::experimental::op::CpuGemmConv2d should remain a shallow wrapper.
 */
-class CpuGemmConv2d : IOperator
+class CpuGemmConv2d : public IOperator
 {
 public:
     /** Constructor */
@@ -135,7 +135,7 @@ class CpuGemmConv2d : IOperator
                                const WeightsInfo         &weights_info     = WeightsInfo(),
                                const Size2D              &dilation         = Size2D(1U, 1U),
                                const ActivationLayerInfo &act_info         = ActivationLayerInfo(),
-                               const bool                 enable_fast_math = false);
+                               bool                       enable_fast_math = false);
 
     void                             run(ITensorPack &tensors) override;
     void                             prepare(ITensorPack &tensors) override;

diff --git a/arm_compute/runtime/experimental/operators/CpuGemmDirectConv2d.h b/arm_compute/runtime/experimental/operators/CpuGemmDirectConv2d.h
@@ -41,7 +41,7 @@ namespace op
  * Any new features should be added to arm_compute::cpu::CpuGemmDirectConv2d and
  * arm_compute::experimental::op::CpuGemmDirectConv2d should remain a shallow wrapper.
 */
-class CpuGemmDirectConv2d : IOperator
+class CpuGemmDirectConv2d : public IOperator
 {
 public:
     /** Constructor **/

diff --git a/arm_compute/runtime/experimental/operators/CpuMul.h b/arm_compute/runtime/experimental/operators/CpuMul.h
@@ -39,7 +39,7 @@ namespace op
 /** Wrapper class for CpuMul. For information on the functions,
  * see "src/cpu/operators/CpuMul.h"
 */
-class CpuMul : INEOperator
+class CpuMul : public INEOperator
 {
 public:
     /** Constructor */

diff --git a/arm_compute/runtime/experimental/operators/CpuSub.h b/arm_compute/runtime/experimental/operators/CpuSub.h
@@ -41,7 +41,7 @@ namespace op
 /** Wrapper class for CpuSub. For information on the functions,
  * see "src/cpu/operators/CpuSub.h"
 */
-class CpuSub : INEOperator
+class CpuSub : public INEOperator
 {
 public:
     /** Constructor */

diff --git a/arm_compute/runtime/experimental/operators/CpuTranspose.h b/arm_compute/runtime/experimental/operators/CpuTranspose.h
@@ -38,7 +38,7 @@ namespace op
 /** Wrapper class for CpuTranspose. For information on the functions,
  * see "src/cpu/operators/CpuTranspose.h"
 */
-class CpuTranspose : INEOperator
+class CpuTranspose : public INEOperator
 {
 public:
     /** Constructor **/

diff --git a/arm_compute/runtime/experimental/operators/CpuWinogradConv2d.h b/arm_compute/runtime/experimental/operators/CpuWinogradConv2d.h
@@ -40,7 +40,7 @@ namespace op
  * Any new features should be added to arm_compute::cpu::CpuWinogradConv2d and
  * arm_compute::experimental::op::CpuWinogradConv2d should remain a shallow wrapper.
 */
-class CpuWinogradConv2d : IOperator
+class CpuWinogradConv2d : public IOperator
 {
 public:
     /** Constructors */
@@ -55,7 +55,7 @@ class CpuWinogradConv2d : IOperator
     CpuWinogradConv2d &operator=(CpuWinogradConv2d &&) = default;
 
     /** Destructor */
-    ~CpuWinogradConv2d();
+    ~CpuWinogradConv2d() override;
 
     /** Set the input and output tensors.
      *

diff --git a/docs/Doxyfile b/docs/Doxyfile
@@ -60,7 +60,7 @@ PROJECT_NAME           = "Compute Library"
 # could be handy for archiving the generated documentation or if some version
 # control system is used.
 
-PROJECT_NUMBER         = 24.08
+PROJECT_NUMBER         = 24.08.1
 
 # Using the PROJECT_BRIEF tag one can provide an optional one line description
 # for a project that appears at the top of each page and should give viewer a

diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp
@@ -292,7 +292,7 @@ class GemmHybridIndirect : public GemmCommon<To, Tw, Tr> {
     // Array of pointers to output rows
 //    Tr * const *        _output_ptrs;
 
-    const NDRange<4> _window_range;
+    NDRange<4> _window_range;
 
     unsigned int get_col_sum_size() const {
         if (std::is_same<OutputStage, Requantize32>::value) {
@@ -850,6 +850,18 @@ class GemmHybridIndirect : public GemmCommon<To, Tw, Tr> {
             qp->minval = re.minval;
             qp->maxval = re.maxval;
             _n_block = compute_n_block(_args, _os);
+
+            // Also update the window range, because the recomputed n_block may differ depending on B's offset
+            NDRange<4> window_range(iceildiv(_args._Msize, strategy::out_height()), _args._nbatches,
+                              iceildiv(_args._Nsize, _n_block), _args._nmulti);
+
+            // The updated window range should be propagated to kernel execution window
+            // after this method has been called. Otherwise, the window set up at configure time
+            // of the associated kernel will remain.
+            //
+            // See Fallback::update_quantization_parameters() in src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
+            // for how this is done.
+            _window_range = window_range;
         }
     }
 };

diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized_inline.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized_inline.hpp
@@ -281,7 +281,6 @@ class GemmHybridQuantizedInline : public GemmCommon<To, Tr> {
             qp->per_channel_muls = re.per_channel_muls;
             qp->minval = re.minval;
             qp->maxval = re.maxval;
-            _n_block = compute_n_block(_args, _os);
         }
     }
 };

diff --git a/src/core/NEON/kernels/convolution/winograd/input_transforms/a64_fp16_6x6.cpp b/src/core/NEON/kernels/convolution/winograd/input_transforms/a64_fp16_6x6.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,14 +30,9 @@ namespace arm_conv {
 namespace winograd {
 namespace input_transform {
 
-void a64_fp16_6x6(
-    const unsigned int n_channels,
-    const __fp16* const input_base,
-    const size_t input_row_stride,
-    const size_t input_col_stride,
-    __fp16* outptr,
-    const size_t matrix_stride
-)
+void a64_fp16_6x6(unsigned int n_channels, const __fp16 * input_base,
+        size_t input_row_stride, size_t input_col_stride,
+        __fp16 * outptr, size_t matrix_stride)
 {
     constexpr int inner_tile_rows = 6;
     constexpr int inner_tile_cols = 6;

diff --git a/src/core/NEON/kernels/convolution/winograd/output_transforms/a64_fp16_4x4_3x3.cpp b/src/core/NEON/kernels/convolution/winograd/output_transforms/a64_fp16_4x4_3x3.cpp
@@ -31,17 +31,9 @@ namespace arm_conv {
 namespace winograd {
 namespace output_transform {
 
-void a64_fp16_4x4_3x3(
-    unsigned int n_channels,
-    const __fp16* inptr,
-    const size_t matrix_stride,
-    const __fp16* bptr,
-    __fp16* const output,
-    const size_t output_row_stride,
-    const size_t output_col_stride,
-    const __fp16 output_min,
-    const __fp16 output_max
-)
+void a64_fp16_4x4_3x3(unsigned int n_channels,
+        const __fp16 * inptr, size_t matrix_stride, const __fp16 * bptr, __fp16 *output,
+        size_t output_row_stride, size_t output_col_stride, __fp16 output_min, __fp16 output_max)
 {
     constexpr int output_tile_rows = 4, output_tile_cols = 4;
 

diff --git a/src/core/NEON/kernels/convolution/winograd/weight_transforms/a64_fp16_4x4_3x3.cpp b/src/core/NEON/kernels/convolution/winograd/weight_transforms/a64_fp16_4x4_3x3.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,22 +22,16 @@
  * SOFTWARE.
  */
 #if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-
 #include <cstddef>
 #include <arm_neon.h>
 
 namespace arm_conv {
 namespace winograd {
 namespace weight_transform {
 
-void a64_fp16_4x4_3x3(
-    unsigned int n_channels,
-    const __fp16* inptr,  // NOTE: Data in HWIO order
-    const size_t ld_weight_row,
-    const size_t ld_weight_col,
-    __fp16* outptr,
-    const size_t matrix_stride
-)
+void a64_fp16_4x4_3x3(unsigned int n_channels, const __fp16 * inptr,
+                      size_t ld_weight_row, size_t ld_weight_col, __fp16 * outptr,
+                      size_t matrix_stride)
 {
 #ifdef __aarch64__
     for (; n_channels >= 8; n_channels -= 8)

diff --git a/src/core/NEON/kernels/convolution/winograd/weight_transforms_fp16.cpp b/src/core/NEON/kernels/convolution/winograd/weight_transforms_fp16.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -31,7 +31,7 @@ namespace arm_conv {
 namespace winograd {
 namespace weight_transform {
 
-void *a64_fp16_4x4_3x3(unsigned int, const __fp16 *, size_t, size_t, __fp16 *, size_t);
+void a64_fp16_4x4_3x3(unsigned int, const __fp16 *, size_t, size_t, __fp16 *, size_t);
 
 #define IMPL(KERN_ROWS, KERN_COLS, TRANS_ROWS, TRANS_COLS, KERN) \
   new Transform<__fp16>(#KERN, KERN_ROWS, KERN_COLS, TRANS_ROWS, TRANS_COLS, KERN)

diff --git a/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h b/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h
@@ -96,6 +96,15 @@ class CpuGemmAssemblyWrapperKernel final : public INEKernel
         _kernel->execute(ndc_win, ndc_tlc, info.thread_id);
     }
 
+    /** Configure the execution window of the kernel by forwarding it to INEKernel::configure()
+     *
+     * @param[in] win Region on which to execute the kernel
+     */
+    void configure_window(const Window &win)
+    {
+        INEKernel::configure(win);
+    }
+
     /** Initialise the kernel's input and output.
      *
      * @param[in] kernel          Pointer to an assembly kernel implementation.

diff --git a/src/cpu/operators/CpuSoftmax.cpp b/src/cpu/operators/CpuSoftmax.cpp
@@ -49,7 +49,7 @@ void CpuSoftmaxGeneric::configure(const ITensorInfo *src, ITensorInfo *dst, floa
 {
     // Perform validation step
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
-    ARM_COMPUTE_ERROR_THROW_ON(CpuSoftmaxGeneric::validate(src, dst, beta, axis));
+    ARM_COMPUTE_ERROR_THROW_ON(CpuSoftmaxGeneric::validate(src, dst, beta, axis, is_log));
     ARM_COMPUTE_LOG_PARAMS(src, dst, beta, axis);
 
     const unsigned int actual_axis =