Skip to content

Commit

Permalink
[ACL] revert ACL stateless feature
Browse files Browse the repository at this point in the history
  • Loading branch information
alvoron committed Dec 20, 2024
1 parent 6d362b1 commit a4d931d
Show file tree
Hide file tree
Showing 22 changed files with 1,082 additions and 1,091 deletions.
19 changes: 0 additions & 19 deletions src/common/memory_tracking.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -200,9 +200,6 @@ enum {
key_conv_gemm_zp_src_comp,
key_conv_int_dat_in_acc_dt,
key_conv_padded_bias,
key_conv_permuted_inputs,
key_conv_permuted_outputs,
key_conv_permuted_weights,
key_conv_rtus_space,
key_conv_store_wsp,
key_conv_tails,
Expand All @@ -225,20 +222,10 @@ enum {
key_eltwise_src,
key_fusion_forward_scratchpad,
key_fusion_inout_buffer,
key_gemm_asm_tmp_buffer,
key_gemm_tmp_buffer,
key_gemm_blocked_a,
key_gemm_blocked_b,
key_gemm_accumulator,
key_gemm_interleaved_lhs,
key_gemm_mm_result_s32,
key_gemm_mm_signed_a,
key_gemm_mm_signed_output,
key_gemm_output,
key_gemm_pretranspose,
key_gemm_pretranspose_b,
key_gemm_pretransposed_rhs,
key_gemm_transposed_1xwrhs,
key_generic_acc,
key_gnorm_cvt,
key_gnorm_reduction,
Expand Down Expand Up @@ -311,15 +298,9 @@ enum {
key_softmax_interim_store,
key_sum_reduction,
key_sum_srcs_cvt,
key_wino_transformed_weights,
key_wino_U,
key_wino_V,
key_wino_M,
key_wino_workspace,
key_decompression_scales,
key_decompression_zero_points,
key_src_quantized,
key_src_dequantized_scales,
// These two keys should always be the last ones,
// even though they are not in alphabetical order
key_nested,
Expand Down
2 changes: 2 additions & 0 deletions src/cpu/acl/acl_batch_normalization.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,8 @@ struct acl_batch_normalization_fwd_t : public primitive_t {
CHECK(r->configure(pd()->abp, pd()));
mapper.add(this, std::move(r));

CHECK(pd()->post_ops.create_resource(engine, mapper));

return status::success;
}

Expand Down
204 changes: 15 additions & 189 deletions src/cpu/acl/acl_binary.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*******************************************************************************
* Copyright 2022, 2024 Arm Ltd. and affiliates
* Copyright 2022 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -16,198 +16,32 @@

#include "acl_binary.hpp"

#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/experimental/operators/CpuAdd.h"
#include "arm_compute/runtime/experimental/operators/CpuElementwise.h"
#include "arm_compute/runtime/experimental/operators/CpuMul.h"
#include "arm_compute/runtime/experimental/operators/CpuSub.h"

namespace dnnl {
namespace impl {
namespace cpu {
namespace acl {

status_t acl_binary_t::pd_t::init(engine_t *engine) {
    using namespace acl_utils;

    // Destination data type gates support: only f16/f32/s32 are handled.
    const data_type_t ddt = dst_md(0)->data_type;
    if (!utils::one_of(ddt, data_type::f16, data_type::f32, data_type::s32))
        return status::unimplemented;

    // Mixed-precision binary is not supported: both sources must match dst.
    if (src_md(0)->data_type != ddt) return status::unimplemented;
    if (src_md(1)->data_type != ddt) return status::unimplemented;

    // Resolve any "format_kind::any" on dst to src_md(0)'s blocking desc.
    CHECK(set_default_params());

    // No post-ops / scales / zero-points etc. on this implementation.
    if (!attr()->has_default_values()) return status::unimplemented;

    asp_.alg = desc()->alg_kind;

    // The full set of algorithms this implementation covers.
    const bool alg_ok = utils::one_of(asp_.alg, alg_kind::binary_add,
            alg_kind::binary_sub, alg_kind::binary_mul, alg_kind::binary_div,
            alg_kind::binary_max, alg_kind::binary_min);
    if (!alg_ok) return status::unimplemented;

    // ACL's s32 division rounds differently from what oneDNN expects.
    if (ddt == data_type::s32 && asp_.alg == alg_kind::binary_div)
        return status::unimplemented;

    // ACL pointwise arithmetic assumes dense innermost dimensions for
    // src0, src1 and dst. Reordering logical dimensions by stride
    // satisfies this (when at least one dim is reordered) and keeps the
    // ACL memory accesses contiguous without physically moving data.
    memory_desc_t permuted_src0_md, permuted_src1_md, permuted_dst_md;
    const int n_reordered = reorder_dimensions_by_stride(
            {&permuted_src0_md, &permuted_src1_md, &permuted_dst_md},
            {src_md(0), src_md(1), dst_md()});
    if (n_reordered < 1) return status::unimplemented;

    // Build ACL tensor infos from the permuted descriptors.
    CHECK(tensor_info(asp_.src0_info, permuted_src0_md));
    CHECK(tensor_info(asp_.src1_info, permuted_src1_md));
    CHECK(tensor_info(asp_.dst_info, permuted_dst_md));

    // ACL tries to treat src0/src1 as flat 1D arrays here but fails when
    // the shapes match while the byte strides do not.
    // TODO: remove once fixed in ACL.
    if (asp_.alg == alg_kind::binary_add) {
        const bool shapes_match
                = asp_.src0_info.tensor_shape() == asp_.src1_info.tensor_shape();
        const bool strides_match = asp_.src0_info.strides_in_bytes()
                == asp_.src1_info.strides_in_bytes();
        if (shapes_match && !strides_match) return status::unimplemented;
    }

    // Temporary workaround (TODO: remove in future versions): inserting a
    // singleton y-axis stops ACL from parallelising small workloads.
    if (memory_desc_wrapper(dst_md()).nelems() < 40000) {
        const size_t acl_y_axis = 1;
        CHECK(insert_singleton_dimension(asp_.src0_info, acl_y_axis));
        CHECK(insert_singleton_dimension(asp_.src1_info, acl_y_axis));
        CHECK(insert_singleton_dimension(asp_.dst_info, acl_y_axis));
    }

    // Finally ask the operator-specific validate() whether ACL supports
    // this exact configuration.
    ACL_CHECK_VALID(validate(asp_));

    return status::success;
}

arm_compute::Status acl_binary_t::pd_t::validate(const acl_binary_conf_t &asp) {
    // Dispatch to the matching ACL experimental operator's validate().
    namespace aop = arm_compute::experimental::op;
    const arm_compute::TensorInfo *s0 = &asp.src0_info;
    const arm_compute::TensorInfo *s1 = &asp.src1_info;
    const arm_compute::TensorInfo *d = &asp.dst_info;

    switch (asp.alg) {
        case alg_kind::binary_add:
            return aop::CpuAdd::validate(
                    s0, s1, d, arm_compute::ConvertPolicy::SATURATE);
        case alg_kind::binary_sub:
            return aop::CpuSub::validate(
                    s0, s1, d, arm_compute::ConvertPolicy::SATURATE);
        case alg_kind::binary_div:
            return aop::CpuElementwiseDivision::validate(s0, s1, d);
        case alg_kind::binary_mul:
            // Scale 1.0f; TO_ZERO rounding matches the configure() call.
            return aop::CpuMul::validate(s0, s1, d, 1.0f,
                    arm_compute::ConvertPolicy::SATURATE,
                    arm_compute::RoundingPolicy::TO_ZERO);
        case alg_kind::binary_min:
            return aop::CpuElementwiseMin::validate(s0, s1, d);
        case alg_kind::binary_max:
            return aop::CpuElementwiseMax::validate(s0, s1, d);
        default:
            return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR,
                    "unsupported alg_kind");
    }
}

status_t acl_binary_t::init(engine_t *engine) {
    // Instantiate and configure the ACL operator matching the algorithm
    // selected at pd creation time; ownership moves into binary_op_.
    namespace aop = arm_compute::experimental::op;
    auto asp = pd()->asp_;
    arm_compute::TensorInfo *s0 = &asp.src0_info;
    arm_compute::TensorInfo *s1 = &asp.src1_info;
    arm_compute::TensorInfo *d = &asp.dst_info;

    switch (asp.alg) {
        case alg_kind::binary_add: {
            auto op = std::make_unique<aop::CpuAdd>();
            op->configure(s0, s1, d, arm_compute::ConvertPolicy::SATURATE);
            binary_op_ = std::move(op);
            break;
        }
        case alg_kind::binary_sub: {
            auto op = std::make_unique<aop::CpuSub>();
            op->configure(s0, s1, d, arm_compute::ConvertPolicy::SATURATE);
            binary_op_ = std::move(op);
            break;
        }
        case alg_kind::binary_div: {
            auto op = std::make_unique<aop::CpuElementwiseDivision>();
            op->configure(s0, s1, d);
            binary_op_ = std::move(op);
            break;
        }
        case alg_kind::binary_mul: {
            // Scale 1.0f with TO_ZERO rounding, mirroring validate().
            auto op = std::make_unique<aop::CpuMul>();
            op->configure(s0, s1, d, 1.0f,
                    arm_compute::ConvertPolicy::SATURATE,
                    arm_compute::RoundingPolicy::TO_ZERO);
            binary_op_ = std::move(op);
            break;
        }
        case alg_kind::binary_min: {
            auto op = std::make_unique<aop::CpuElementwiseMin>();
            op->configure(s0, s1, d);
            binary_op_ = std::move(op);
            break;
        }
        case alg_kind::binary_max: {
            auto op = std::make_unique<aop::CpuElementwiseMax>();
            op->configure(s0, s1, d);
            binary_op_ = std::move(op);
            break;
        }
        // pd_t::init() already rejected every other alg_kind.
        default: return status::runtime_error;
    }

    return status::success;
}

// Runs the configured ACL binary operator on raw src0/src1/dst pointers.
// NOTE(review): this span interleaves TWO variants of the function — the
// stateless one (local Tensors + ITensorPack + binary_op_) and the
// resource-mapper one (acl_obj.*_tensor + acl_obj.binary_op->run()). This
// looks like a diff-extraction artifact of the revert commit; confirm
// against the repository which variant is actually in the file.
status_t acl_binary_t::execute_forward(const exec_ctx_t &ctx, const void *src0,
        const void *src1, void *dst) const {

    auto asp = pd()->asp_;
    // Lock here is needed because resource_mapper does not support
    // concurrent multithreaded access.
    std::lock_guard<std::mutex> _lock {this->mtx};

    arm_compute::Tensor src0_tensor;
    arm_compute::Tensor src1_tensor;
    arm_compute::Tensor dst_tensor;
    // Retrieve primitive resource and configured Compute Library objects
    acl_binary_obj_t &acl_obj = ctx.get_resource_mapper()
                                        ->get<acl_binary_resource_t>(this)
                                        ->get_acl_obj();

    // Stateless variant: bind user memory to the local tensors
    // (import_memory does not copy; const_cast is required by the ACL API).
    src0_tensor.allocator()->init(asp.src0_info);
    src0_tensor.allocator()->import_memory(const_cast<void *>(src0));
    src1_tensor.allocator()->init(asp.src1_info);
    src1_tensor.allocator()->import_memory(const_cast<void *>(src1));
    dst_tensor.allocator()->init(asp.dst_info);
    dst_tensor.allocator()->import_memory(dst);
    // Resource-mapper variant: bind the same memory to the shared tensors.
    acl_obj.src0_tensor.allocator()->import_memory(const_cast<void *>(src0));
    acl_obj.src1_tensor.allocator()->import_memory(const_cast<void *>(src1));
    acl_obj.dst_tensor.allocator()->import_memory(dst);

    arm_compute::ITensorPack run_pack {
            {arm_compute::TensorType::ACL_SRC_0, &src0_tensor},
            {arm_compute::TensorType::ACL_SRC_1, &src1_tensor},
            {arm_compute::TensorType::ACL_DST, &dst_tensor}};
    acl_obj.binary_op->run();

    binary_op_->run(run_pack);
    // free() only detaches imported memory here; the caller owns it.
    acl_obj.src0_tensor.allocator()->free();
    acl_obj.src1_tensor.allocator()->free();
    acl_obj.dst_tensor.allocator()->free();

    return status::success;
}
Expand All @@ -221,14 +55,6 @@ status_t acl_binary_t::execute_forward(const exec_ctx_t &ctx) const {
return execute_forward(ctx, src0, src1, dst);
}

status_t acl_binary_t::execute(const exec_ctx_t &ctx) const {
    // Forward-only primitive: delegate straight to execute_forward().
    const status_t result = execute_forward(ctx);
    return result;
}

const acl_binary_t::pd_t *acl_binary_t::pd() const {
    // Narrow the base-class primitive descriptor to this impl's pd_t.
    const auto &base_pd = primitive_t::pd();
    return static_cast<const pd_t *>(base_pd.get());
}

} // namespace acl
} // namespace cpu
} // namespace impl
Expand Down
Loading

0 comments on commit a4d931d

Please sign in to comment.