Skip to content

Commit

Permalink
[ACL] revert ACL stateless feature
Browse files Browse the repository at this point in the history
  • Loading branch information
alvoron committed Dec 20, 2024
1 parent 6d362b1 commit a4d931d
Show file tree
Hide file tree
Showing 22 changed files with 1,082 additions and 1,091 deletions.
19 changes: 0 additions & 19 deletions src/common/memory_tracking.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -200,9 +200,6 @@ enum {
key_conv_gemm_zp_src_comp,
key_conv_int_dat_in_acc_dt,
key_conv_padded_bias,
key_conv_permuted_inputs,
key_conv_permuted_outputs,
key_conv_permuted_weights,
key_conv_rtus_space,
key_conv_store_wsp,
key_conv_tails,
Expand All @@ -225,20 +222,10 @@ enum {
key_eltwise_src,
key_fusion_forward_scratchpad,
key_fusion_inout_buffer,
key_gemm_asm_tmp_buffer,
key_gemm_tmp_buffer,
key_gemm_blocked_a,
key_gemm_blocked_b,
key_gemm_accumulator,
key_gemm_interleaved_lhs,
key_gemm_mm_result_s32,
key_gemm_mm_signed_a,
key_gemm_mm_signed_output,
key_gemm_output,
key_gemm_pretranspose,
key_gemm_pretranspose_b,
key_gemm_pretransposed_rhs,
key_gemm_transposed_1xwrhs,
key_generic_acc,
key_gnorm_cvt,
key_gnorm_reduction,
Expand Down Expand Up @@ -311,15 +298,9 @@ enum {
key_softmax_interim_store,
key_sum_reduction,
key_sum_srcs_cvt,
key_wino_transformed_weights,
key_wino_U,
key_wino_V,
key_wino_M,
key_wino_workspace,
key_decompression_scales,
key_decompression_zero_points,
key_src_quantized,
key_src_dequantized_scales,
// These two keys should always be the last ones,
// even though they are not in alphabetical order
key_nested,
Expand Down
2 changes: 2 additions & 0 deletions src/cpu/acl/acl_batch_normalization.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,8 @@ struct acl_batch_normalization_fwd_t : public primitive_t {
CHECK(r->configure(pd()->abp, pd()));
mapper.add(this, std::move(r));

CHECK(pd()->post_ops.create_resource(engine, mapper));

return status::success;
}

Expand Down
204 changes: 15 additions & 189 deletions src/cpu/acl/acl_binary.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*******************************************************************************
* Copyright 2022, 2024 Arm Ltd. and affiliates
* Copyright 2022 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -16,198 +16,32 @@

#include "acl_binary.hpp"

#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/experimental/operators/CpuAdd.h"
#include "arm_compute/runtime/experimental/operators/CpuElementwise.h"
#include "arm_compute/runtime/experimental/operators/CpuMul.h"
#include "arm_compute/runtime/experimental/operators/CpuSub.h"

namespace dnnl {
namespace impl {
namespace cpu {
namespace acl {

status_t acl_binary_t::pd_t::init(engine_t *engine) {
    using namespace acl_utils;

    // Destination data type gates support: only f16/f32/s32 are handled.
    const data_type_t ddt = dst_md(0)->data_type;
    if (!utils::one_of(ddt, data_type::f16, data_type::f32, data_type::s32))
        return status::unimplemented;

    // Mixed-precision binary is not supported: both sources must match dst.
    if (src_md(0)->data_type != ddt) return status::unimplemented;
    if (src_md(1)->data_type != ddt) return status::unimplemented;

    // Resolve any "format_kind::any" on dst to src_md(0)'s blocking desc.
    CHECK(set_default_params());

    // No post-ops / scales / zero-points etc. on this implementation.
    if (!attr()->has_default_values()) return status::unimplemented;

    asp_.alg = desc()->alg_kind;

    // The full set of algorithms this implementation covers.
    const bool alg_ok = utils::one_of(asp_.alg, alg_kind::binary_add,
            alg_kind::binary_sub, alg_kind::binary_mul, alg_kind::binary_div,
            alg_kind::binary_max, alg_kind::binary_min);
    if (!alg_ok) return status::unimplemented;

    // ACL's s32 division rounds differently from what oneDNN expects.
    if (ddt == data_type::s32 && asp_.alg == alg_kind::binary_div)
        return status::unimplemented;

    // ACL pointwise arithmetic assumes dense innermost dimensions for
    // src0, src1 and dst. Reordering logical dimensions by stride
    // satisfies this (when at least one dim is reordered) and keeps the
    // ACL memory accesses contiguous without physically moving data.
    memory_desc_t permuted_src0_md, permuted_src1_md, permuted_dst_md;
    const int n_reordered = reorder_dimensions_by_stride(
            {&permuted_src0_md, &permuted_src1_md, &permuted_dst_md},
            {src_md(0), src_md(1), dst_md()});
    if (n_reordered < 1) return status::unimplemented;

    // Build ACL tensor infos from the permuted descriptors.
    CHECK(tensor_info(asp_.src0_info, permuted_src0_md));
    CHECK(tensor_info(asp_.src1_info, permuted_src1_md));
    CHECK(tensor_info(asp_.dst_info, permuted_dst_md));

    // ACL tries to treat src0/src1 as flat 1D arrays here but fails when
    // the shapes match while the byte strides do not.
    // TODO: remove once fixed in ACL.
    if (asp_.alg == alg_kind::binary_add) {
        const bool shapes_match
                = asp_.src0_info.tensor_shape() == asp_.src1_info.tensor_shape();
        const bool strides_match = asp_.src0_info.strides_in_bytes()
                == asp_.src1_info.strides_in_bytes();
        if (shapes_match && !strides_match) return status::unimplemented;
    }

    // Temporary workaround (TODO: remove in future versions): inserting a
    // singleton y-axis stops ACL from parallelising small workloads.
    if (memory_desc_wrapper(dst_md()).nelems() < 40000) {
        const size_t acl_y_axis = 1;
        CHECK(insert_singleton_dimension(asp_.src0_info, acl_y_axis));
        CHECK(insert_singleton_dimension(asp_.src1_info, acl_y_axis));
        CHECK(insert_singleton_dimension(asp_.dst_info, acl_y_axis));
    }

    // Finally ask the operator-specific validate() whether ACL supports
    // this exact configuration.
    ACL_CHECK_VALID(validate(asp_));

    return status::success;
}

arm_compute::Status acl_binary_t::pd_t::validate(const acl_binary_conf_t &asp) {
    // Dispatch to the matching ACL experimental operator's validate().
    namespace aop = arm_compute::experimental::op;
    const arm_compute::TensorInfo *s0 = &asp.src0_info;
    const arm_compute::TensorInfo *s1 = &asp.src1_info;
    const arm_compute::TensorInfo *d = &asp.dst_info;

    switch (asp.alg) {
        case alg_kind::binary_add:
            return aop::CpuAdd::validate(
                    s0, s1, d, arm_compute::ConvertPolicy::SATURATE);
        case alg_kind::binary_sub:
            return aop::CpuSub::validate(
                    s0, s1, d, arm_compute::ConvertPolicy::SATURATE);
        case alg_kind::binary_div:
            return aop::CpuElementwiseDivision::validate(s0, s1, d);
        case alg_kind::binary_mul:
            // Scale 1.0f; TO_ZERO rounding matches the configure() call.
            return aop::CpuMul::validate(s0, s1, d, 1.0f,
                    arm_compute::ConvertPolicy::SATURATE,
                    arm_compute::RoundingPolicy::TO_ZERO);
        case alg_kind::binary_min:
            return aop::CpuElementwiseMin::validate(s0, s1, d);
        case alg_kind::binary_max:
            return aop::CpuElementwiseMax::validate(s0, s1, d);
        default:
            return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR,
                    "unsupported alg_kind");
    }
}

status_t acl_binary_t::init(engine_t *engine) {
    // Instantiate and configure the ACL operator matching the algorithm
    // selected at pd creation time; ownership moves into binary_op_.
    namespace aop = arm_compute::experimental::op;
    auto asp = pd()->asp_;
    arm_compute::TensorInfo *s0 = &asp.src0_info;
    arm_compute::TensorInfo *s1 = &asp.src1_info;
    arm_compute::TensorInfo *d = &asp.dst_info;

    switch (asp.alg) {
        case alg_kind::binary_add: {
            auto op = std::make_unique<aop::CpuAdd>();
            op->configure(s0, s1, d, arm_compute::ConvertPolicy::SATURATE);
            binary_op_ = std::move(op);
            break;
        }
        case alg_kind::binary_sub: {
            auto op = std::make_unique<aop::CpuSub>();
            op->configure(s0, s1, d, arm_compute::ConvertPolicy::SATURATE);
            binary_op_ = std::move(op);
            break;
        }
        case alg_kind::binary_div: {
            auto op = std::make_unique<aop::CpuElementwiseDivision>();
            op->configure(s0, s1, d);
            binary_op_ = std::move(op);
            break;
        }
        case alg_kind::binary_mul: {
            // Scale 1.0f with TO_ZERO rounding, mirroring validate().
            auto op = std::make_unique<aop::CpuMul>();
            op->configure(s0, s1, d, 1.0f,
                    arm_compute::ConvertPolicy::SATURATE,
                    arm_compute::RoundingPolicy::TO_ZERO);
            binary_op_ = std::move(op);
            break;
        }
        case alg_kind::binary_min: {
            auto op = std::make_unique<aop::CpuElementwiseMin>();
            op->configure(s0, s1, d);
            binary_op_ = std::move(op);
            break;
        }
        case alg_kind::binary_max: {
            auto op = std::make_unique<aop::CpuElementwiseMax>();
            op->configure(s0, s1, d);
            binary_op_ = std::move(op);
            break;
        }
        // pd_t::init() already rejected every other alg_kind.
        default: return status::runtime_error;
    }

    return status::success;
}

// Runs the configured ACL binary operator on raw src0/src1/dst pointers.
// NOTE(review): this span interleaves TWO variants of the function — the
// stateless one (local Tensors + ITensorPack + binary_op_) and the
// resource-mapper one (acl_obj.*_tensor + acl_obj.binary_op->run()). This
// looks like a diff-extraction artifact of the revert commit; confirm
// against the repository which variant is actually in the file.
status_t acl_binary_t::execute_forward(const exec_ctx_t &ctx, const void *src0,
        const void *src1, void *dst) const {

    auto asp = pd()->asp_;
    // Lock here is needed because resource_mapper does not support
    // concurrent multithreaded access.
    std::lock_guard<std::mutex> _lock {this->mtx};

    arm_compute::Tensor src0_tensor;
    arm_compute::Tensor src1_tensor;
    arm_compute::Tensor dst_tensor;
    // Retrieve primitive resource and configured Compute Library objects
    acl_binary_obj_t &acl_obj = ctx.get_resource_mapper()
                                        ->get<acl_binary_resource_t>(this)
                                        ->get_acl_obj();

    // Stateless variant: bind user memory to the local tensors
    // (import_memory does not copy; const_cast is required by the ACL API).
    src0_tensor.allocator()->init(asp.src0_info);
    src0_tensor.allocator()->import_memory(const_cast<void *>(src0));
    src1_tensor.allocator()->init(asp.src1_info);
    src1_tensor.allocator()->import_memory(const_cast<void *>(src1));
    dst_tensor.allocator()->init(asp.dst_info);
    dst_tensor.allocator()->import_memory(dst);
    // Resource-mapper variant: bind the same memory to the shared tensors.
    acl_obj.src0_tensor.allocator()->import_memory(const_cast<void *>(src0));
    acl_obj.src1_tensor.allocator()->import_memory(const_cast<void *>(src1));
    acl_obj.dst_tensor.allocator()->import_memory(dst);

    arm_compute::ITensorPack run_pack {
            {arm_compute::TensorType::ACL_SRC_0, &src0_tensor},
            {arm_compute::TensorType::ACL_SRC_1, &src1_tensor},
            {arm_compute::TensorType::ACL_DST, &dst_tensor}};
    acl_obj.binary_op->run();

    binary_op_->run(run_pack);
    // free() only detaches imported memory here; the caller owns it.
    acl_obj.src0_tensor.allocator()->free();
    acl_obj.src1_tensor.allocator()->free();
    acl_obj.dst_tensor.allocator()->free();

    return status::success;
}
Expand All @@ -221,14 +55,6 @@ status_t acl_binary_t::execute_forward(const exec_ctx_t &ctx) const {
return execute_forward(ctx, src0, src1, dst);
}

status_t acl_binary_t::execute(const exec_ctx_t &ctx) const {
    // Forward-only primitive: delegate straight to execute_forward().
    const status_t result = execute_forward(ctx);
    return result;
}

const acl_binary_t::pd_t *acl_binary_t::pd() const {
    // Narrow the base-class primitive descriptor to this impl's pd_t.
    const auto &base_pd = primitive_t::pd();
    return static_cast<const pd_t *>(base_pd.get());
}

} // namespace acl
} // namespace cpu
} // namespace impl
Expand Down
Loading

0 comments on commit a4d931d

Please sign in to comment.