Revert "src: cpu: aarch64: Enable convolution static quantisation."
This reverts commit d6f82b3.
renato-arantes authored and theComputeKid committed Dec 20, 2024
1 parent 715b4e9 commit ab668ec
Showing 9 changed files with 29 additions and 145 deletions.
1 change: 0 additions & 1 deletion src/common/memory_tracking.hpp
@@ -179,7 +179,6 @@ enum {
key_conv_amx_wsp_buffer,
key_conv_bia_reduction,
key_conv_bias_bf16_convert_wsp,
key_conv_bias_s32_convert,
key_conv_cudnn,
key_conv_cudnn_algo,
key_conv_cudnn_filter,
29 changes: 6 additions & 23 deletions src/cpu/aarch64/acl_convolution_utils.cpp
@@ -65,13 +65,8 @@ status_t acl_init_conf(acl_conv_conf_t &acp, memory_desc_t &src_md,
everyone_is(data_type::f16, src_d.data_type(),
wei_d.data_type(), dst_d.data_type()),
everyone_is(data_type::bf16, src_d.data_type(),
wei_d.data_type(), dst_d.data_type()),
everyone_is(data_type::s8, src_d.data_type(),
wei_d.data_type(), dst_d.data_type()),
(everyone_is(data_type::u8, src_d.data_type(),
dst_d.data_type())
&& wei_d.data_type() == data_type::s8)),
" src, dst and wei must be s8, u8, bf16, fp16 or fp32");
wei_d.data_type(), dst_d.data_type())),
" src, dst and wei must be fp16, bf16 or fp32");
// batch size
const int mb = src_d.dims()[0];

@@ -170,8 +165,7 @@ status_t acl_init_conf(acl_conv_conf_t &acp, memory_desc_t &src_md,
: arm_compute::DataLayout::NCHW;

// all have the same datatype
auto acl_data_type
= acl_utils::get_acl_data_t(src_d.data_type(), acp.is_quantized);
auto acl_data_type = acl_utils::get_acl_data_t(src_d.data_type());

// clang-format off
acp.src_tensor_info = arm_compute::TensorInfo(
@@ -185,9 +179,8 @@ status_t acl_init_conf(acl_conv_conf_t &acp, memory_desc_t &src_md,
is_nhwc ? arm_compute::TensorShape(ic, kw, kh, oc) :
arm_compute::TensorShape(kw, kh, ic, oc),
1,
acl_utils::get_acl_data_t(wei_d.data_type(), acp.is_quantized),
acl_data_type,
acl_layout);

if(is_depthwise) {
// We need to set that values are not constant so that we
// we can update them in-place in ACL
@@ -205,20 +198,10 @@ status_t acl_init_conf(acl_conv_conf_t &acp, memory_desc_t &src_md,
acp.with_bias ? arm_compute::TensorShape(oc)
: arm_compute::TensorShape(),
1,
acp.is_quantized ? acl_utils::get_acl_data_t(data_type::s32) : acl_data_type,
acl_data_type,
acl_layout);
// clang-format on

if (acp.is_quantized) {
// ACL rejects the operation if quantization information is empty during configuration.
// Since the correct parameters are not available at this stage, we provide placeholder values.
// These values are then updated with the correct ones during the run stage.
arm_compute::QuantizationInfo qi {1.0, 0, true};
acp.src_tensor_info.set_quantization_info(qi);
acp.wei_tensor_info.set_quantization_info(qi);
acp.dst_tensor_info.set_quantization_info(qi);
}

// ACL Winograd is not prepared for fixed format kernels
if (acp.alg_winograd) {
const bool is_1d = ndims == 3;
@@ -233,7 +216,7 @@ status_t acl_init_conf(acl_conv_conf_t &acp, memory_desc_t &src_md,
// Are we allowed to cast down to bf16 or not?
acp.fast_math
= one_of(attr.fpmath_.mode_, fpmath_mode::bf16, fpmath_mode::any);
if (is_depthwise || acp.is_quantized) {
if (is_depthwise) {
// There is no support for fixed format kernels for depthwise convolution
// in ACL so we are going to use weight format that we set up earlier
return status::success;
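For context on the block removed above: ACL rejects configuration when a quantized tensor's quantization info is empty, but oneDNN only receives the real scales and zero points at execution time, so the quantized path configured with placeholder values and replaced them on every run. Below is a minimal sketch of that two-step pattern, assuming Compute Library headers are available; the tensor shape, data type, and helper names are illustrative, not taken from the commit.

```cpp
#include <cstdint>

#include "arm_compute/core/QuantizationInfo.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"

// Configure time: the real quantization parameters are unknown, so install
// a placeholder. The third constructor argument (also used by the reverted
// code) marks the info as dynamic, i.e. replaceable after configuration.
arm_compute::TensorInfo make_placeholder_info(
        const arm_compute::TensorShape &shape) {
    arm_compute::TensorInfo info(
            shape, 1, arm_compute::DataType::QASYMM8_SIGNED);
    info.set_quantization_info(arm_compute::QuantizationInfo(1.0f, 0, true));
    return info;
}

// Run time: overwrite the placeholder with the scale and zero point taken
// from the execution context before running the operator.
void install_runtime_quant_params(
        arm_compute::TensorInfo &info, float scale, int32_t zero_point) {
    info.set_quantization_info(
            arm_compute::QuantizationInfo(scale, zero_point, true));
}
```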
98 changes: 13 additions & 85 deletions src/cpu/aarch64/acl_convolution_utils.hpp
@@ -20,11 +20,9 @@
#include <map>
#include "acl_post_ops.hpp"
#include "acl_utils.hpp"
#include "arm_compute/runtime/experimental/operators/CpuDepthwiseConv2d.h"
#include "cpu/cpu_convolution_pd.hpp"
#include "cpu/cpu_primitive.hpp"

#include "arm_compute/runtime/experimental/operators/CpuGemmConv2d.h"

#include <type_traits>
namespace dnnl {
namespace impl {
namespace cpu {
@@ -46,8 +44,6 @@ struct acl_conv_conf_t {
// algorithm can be set to algorithm::convolution_auto and later on we need to
// skip fixed-format protocol as ACL Winograd does not support it.
bool alg_winograd;
// currently, only CpuGemmConv2d has the static quantization update interface.
bool is_quantized;
arm_compute::TensorInfo src_tensor_info;
arm_compute::TensorInfo wei_tensor_info;
arm_compute::TensorInfo bia_tensor_info;
@@ -74,13 +70,11 @@ status_t acl_init_conf(acl_conv_conf_t &acp, memory_desc_t &src_md,
using conv_key_t = decltype(memory_tracking::names::key_gemm_tmp_buffer);

template <typename op_t, typename post_ops_t>
status_t init_scratchpad(const op_t &conv,
memory_tracking::registrar_t &scratchpad,
status_t init_scratchpad(op_t &conv, memory_tracking::registrar_t &scratchpad,
const std::map<int, conv_key_t> &conv_keys, engine_t *engine,
post_ops_t &post_ops, dnnl::impl::post_ops_t &attr_post_ops,
arm_compute::ActivationLayerInfo &act_info, bool &use_dst_acc_for_sum,
const dnnl::impl::memory_desc_t &dst_md,
const dnnl::impl::memory_desc_t &bias_md, const bool is_quantized) {
const dnnl::impl::memory_desc_t &dst_md) {

// Book temp mem.
const auto aux_mem_req = conv.workspace();
@@ -101,20 +95,14 @@ status_t init_scratchpad(const op_t &conv,
dst_d.data_type_size());
}

if (is_quantized && bias_md.format_kind != format_kind::undef) {
const memory_desc_wrapper bias_d(&bias_md);
scratchpad.book(memory_tracking::names::key_conv_bias_s32_convert,
bias_d.nelems(), bias_d.data_type_size());
}

return status::success;
}

template <typename conv_obj_t, typename conv_pd_t, typename src_data_t,
typename wei_data_t = src_data_t, typename dst_data_t = src_data_t,
typename bia_data_t = src_data_t>
status_t execute_forward_conv_acl(const exec_ctx_t &ctx,
conv_obj_t *acl_conv_obj, const conv_pd_t *pd_,
conv_obj_t *acl_conv_obj, const conv_pd_t *pd,
const std::map<int, conv_key_t> &conv_keys) {

auto src_base = CTX_IN_MEM(const src_data_t *, DNNL_ARG_SRC);
@@ -127,49 +115,16 @@ status_t execute_forward_conv_acl(const exec_ctx_t &ctx,
arm_compute::Tensor bia_tensor = nullptr;
arm_compute::Tensor dst_tensor;

auto const acp = pd_->acp_;
auto const acp = pd->acp_;
src_tensor.allocator()->init(acp.src_tensor_info);
wei_tensor.allocator()->init(acp.wei_tensor_info);
dst_tensor.allocator()->init(acp.dst_tensor_info);

const auto scratchpad = ctx.get_scratchpad_grantor();

if (acp.is_quantized) {
// DEFINE_(ARG|ZERO)... demands 'pd' as a function
auto pd = [pd_] { return pd_; };

DEFINE_ARG_SCALES_BUFFER(src_scale, DNNL_ARG_SRC);
DEFINE_ZERO_POINT_VALUE(src_zero_point, DNNL_ARG_SRC);
DEFINE_ARG_SCALES_BUFFER(wei_scale, DNNL_ARG_WEIGHTS);
DEFINE_ZERO_POINT_VALUE(wei_zero_point, DNNL_ARG_WEIGHTS);
DEFINE_ARG_SCALES_BUFFER(dst_scale, DNNL_ARG_DST);
DEFINE_ZERO_POINT_VALUE(dst_zero_point, DNNL_ARG_DST);

// s8s8s8 uses D = Sx*Sy*(XY + X*zy + Y*zx + zx*zy) and u8s8u8 uses D = Sx*Sy*(XW - X*zw - W*zx + zx*zw)
if (dst_tensor.info()->data_type() == arm_compute::DataType::QASYMM8) {
src_tensor.info()->set_quantization_info(
arm_compute::QuantizationInfo(
*src_scale, -src_zero_point, true));
wei_tensor.info()->set_quantization_info(
arm_compute::QuantizationInfo(
*wei_scale, -wei_zero_point, true));
} else {
src_tensor.info()->set_quantization_info(
arm_compute::QuantizationInfo(
*src_scale, src_zero_point, true));
wei_tensor.info()->set_quantization_info(
arm_compute::QuantizationInfo(
*wei_scale, wei_zero_point, true));
}

// for efficiency reasons, oneDNN saves the inverse of the destination
dst_tensor.info()->set_quantization_info(arm_compute::QuantizationInfo(
1.0 / (*dst_scale), dst_zero_point, true));
}

src_tensor.allocator()->import_memory(const_cast<src_data_t *>(src_base));
wei_tensor.allocator()->import_memory(const_cast<wei_data_t *>(wei_base));

const auto scratchpad = ctx.get_scratchpad_grantor();

// If we have an unfused sum post op, put the result in a scratchpad tensor.
// Result will be summed to the dst during acl_post_ops.execute
auto dst_base = acp.use_dst_acc_for_sum
@@ -178,30 +133,10 @@ status_t execute_forward_conv_acl(const exec_ctx_t &ctx,
dst_tensor.allocator()->import_memory(dst_base);

if (acp.with_bias) {
if (acp.is_quantized) {
auto bia_s32_base = scratchpad.get<uint32_t>(
memory_tracking::names::key_conv_bias_s32_convert);
auto bia_f32_base = CTX_IN_MEM(const float32_t *, DNNL_ARG_BIAS);
auto src_scale
= src_tensor.info()->quantization_info().uniform().scale;
auto wei_scale
= wei_tensor.info()->quantization_info().uniform().scale;
const float bias_scale = 1 / (src_scale * wei_scale);
const int num_elements
= acp.bia_tensor_info.total_size() / sizeof(float32_t);
parallel_nd(num_elements, [&](dim_t e) {
const auto b
= int32_t(std::round(bia_f32_base[e] * bias_scale));
bia_s32_base[e] = b;
});
bia_tensor.allocator()->init(acp.bia_tensor_info);
bia_tensor.allocator()->import_memory(bia_s32_base);
} else {
auto bia_base = CTX_IN_MEM(const bia_data_t *, DNNL_ARG_BIAS);
bia_tensor.allocator()->init(acp.bia_tensor_info);
bia_tensor.allocator()->import_memory(
const_cast<bia_data_t *>(bia_base));
}
auto bia_base = CTX_IN_MEM(const bia_data_t *, DNNL_ARG_BIAS);
bia_tensor.allocator()->init(acp.bia_tensor_info);
bia_tensor.allocator()->import_memory(
const_cast<bia_data_t *>(bia_base));
}

// Constness of the weight tensor matters for depthwise conv in ACL.
@@ -232,17 +167,10 @@ status_t execute_forward_conv_acl(const exec_ctx_t &ctx,
}
}

if (acp.is_quantized) {
arm_compute::experimental::op::CpuGemmConv2d *conv
= dynamic_cast<arm_compute::experimental::op::CpuGemmConv2d *>(
&acl_conv_obj->conv);
if (conv) conv->update_quantization_parameters(pack);
}

acl_conv_obj->conv.run(pack);

void *dst = dst_tensor.buffer();
pd_->post_ops.execute(ctx, dst);
pd->post_ops.execute(ctx, dst);

return status::success;
}
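The one-line comment in the removed execute path compresses two dequantization identities. Written out, with capital letters the integer values, \(S\) the scales, and \(z\) the zero points, each is just the product of two affinely dequantized operands:

```latex
\begin{aligned}
\text{s8s8s8:}\quad D &= S_x S_y\,(XY + X z_y + Y z_x + z_x z_y)
                      = S_x (X + z_x) \cdot S_y (Y + z_y),\\
\text{u8s8u8:}\quad D &= S_x S_y\,(XW - X z_w - W z_x + z_x z_w)
                      = S_x (X - z_x) \cdot S_y (W - z_w).
\end{aligned}
```

The flipped sign on the zero points between the two cases is why the removed code negated them for QASYMM8 destinations, and the destination tensor was given \(1/S_{dst}\) rather than \(S_{dst}\) because, as its comment notes, oneDNN stores the inverse of the destination scale for efficiency.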
2 changes: 1 addition & 1 deletion src/cpu/aarch64/acl_depthwise_convolution.cpp
@@ -74,7 +74,7 @@ status_t acl_depthwise_convolution_fwd_t::pd_t::init(engine_t *engine) {
auto scratchpad = scratchpad_registry().registrar();
return init_scratchpad(conv, scratchpad, depthwise_conv_keys, engine,
post_ops, attr_.post_ops_, acp_.act_info, acp_.use_dst_acc_for_sum,
dst_md_, bias_md_, false);
dst_md_);
}

status_t acl_depthwise_convolution_fwd_t::init(engine_t *engine) {
34 changes: 6 additions & 28 deletions src/cpu/aarch64/acl_gemm_convolution.cpp
@@ -52,20 +52,17 @@ template <data_type_t src_t, data_type_t wei_t, data_type_t dst_t,
status_t acl_gemm_convolution_fwd_t<src_t, wei_t, dst_t, bia_t>::pd_t::init(
engine_t *engine) {
using namespace data_type;
using smask_t = primitive_attr_t::skip_mask_t;

bool ok = is_fwd() && set_default_alg_kind(alg_kind::convolution_direct)
&& expect_data_types(src_t, wei_t, bia_t, dst_t, undef)
&& !has_zero_dim_memory() && output_scales_mask_ok()
&& zero_points_ok();

&& !has_zero_dim_memory()
&& attr()->has_default_values(
smask_t::post_ops | smask_t::fpmath_mode, dst_t);
if (!ok) return status::unimplemented;

if (weights_md_.ndims != 4) return status::unimplemented;

// currently, only CpuGemmConv2d has the static quantization update interface.
acp_.is_quantized
= utils::one_of(dst_md_.data_type, data_type::s8, data_type::u8);

// General Compute Library checks, memory tags are also set there
CHECK(acl_convolution_utils::acl_init_conf(
acp_, src_md_, weights_md_, dst_md_, bias_md_, *desc(), *attr()));
@@ -85,25 +82,7 @@ status_t acl_gemm_convolution_fwd_t<src_t, wei_t, dst_t, bia_t>::pd_t::init(
auto scratchpad = scratchpad_registry().registrar();
const auto mem_req = conv.workspace();
return init_scratchpad(conv, scratchpad, gemm_conv_keys, engine, post_ops,
attr_.post_ops_, acp_.act_info, acp_.use_dst_acc_for_sum, dst_md_,
bias_md_, acp_.is_quantized);
}

template <data_type_t src_t, data_type_t wei_t, data_type_t dst_t,
data_type_t bia_t>
bool acl_gemm_convolution_fwd_t<src_t, wei_t, dst_t,
bia_t>::pd_t::output_scales_mask_ok() const {
int mask_src = attr()->scales_.get(DNNL_ARG_SRC).mask_;
int mask_wei = attr()->scales_.get(DNNL_ARG_WEIGHTS).mask_;
int mask_dst = attr()->scales_.get(DNNL_ARG_DST).mask_;
return mask_src == 0 && mask_wei == 0 && mask_dst == 0;
}

template <data_type_t src_t, data_type_t wei_t, data_type_t dst_t,
data_type_t bia_t>
bool acl_gemm_convolution_fwd_t<src_t, wei_t, dst_t,
bia_t>::pd_t::zero_points_ok() const {
return attr()->zero_points_.common();
attr_.post_ops_, acp_.act_info, acp_.use_dst_acc_for_sum, dst_md_);
}

template <data_type_t src_t, data_type_t wei_t, data_type_t dst_t,
@@ -153,8 +132,7 @@ acl_gemm_convolution_fwd_t<src_t, wei_t, dst_t, bia_t>::execute_forward(
using namespace data_type;
template struct acl_gemm_convolution_fwd_t<f32>;
template struct acl_gemm_convolution_fwd_t<f16>;
template struct acl_gemm_convolution_fwd_t<s8, s8, s8, f32>;
template struct acl_gemm_convolution_fwd_t<u8, s8, u8, f32>;
template struct acl_gemm_convolution_fwd_t<s8, s8, s8, s32>;

} // namespace aarch64
} // namespace cpu
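A note on the quantized bias path deleted from acl_convolution_utils.hpp above: an int8 convolution accumulates in int32 at the effective scale S_src * S_wei, so the f32 bias had to be requantized into that domain before ACL could add it. That is why the reverted instantiations took an f32 bias (acl_gemm_convolution_fwd_t<s8, s8, s8, f32>), while the restored s8 kernel takes an s32 bias directly. A standalone sketch of the arithmetic — the real code wrote into the key_conv_bias_s32_convert scratchpad and parallelized the loop with parallel_nd; this sequential version is illustrative only:

```cpp
#include <cmath>
#include <cstdint>
#include <vector>

// Requantize an f32 bias into the s32 accumulator domain of an int8
// convolution: acc = sum(X * W) carries scale S_src * S_wei, so the
// integer bias is round(b / (S_src * S_wei)) -- exactly the
// bias_scale = 1 / (src_scale * wei_scale) factor in the removed code.
std::vector<int32_t> requantize_bias(const std::vector<float> &bias_f32,
        float src_scale, float wei_scale) {
    const float bias_scale = 1.0f / (src_scale * wei_scale);
    std::vector<int32_t> bias_s32(bias_f32.size());
    for (std::size_t i = 0; i < bias_f32.size(); ++i)
        bias_s32[i] = static_cast<int32_t>(
                std::round(bias_f32[i] * bias_scale));
    return bias_s32;
}
```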
2 changes: 0 additions & 2 deletions src/cpu/aarch64/acl_gemm_convolution.hpp
@@ -42,8 +42,6 @@ struct acl_gemm_convolution_fwd_t : public primitive_t {
"gemm:acl", acl_gemm_convolution_fwd_t, USE_GLOBAL_SCRATCHPAD);

status_t init(engine_t *engine);
bool output_scales_mask_ok() const;
bool zero_points_ok() const;

acl_conv_conf_t acp_ = utils::zero<decltype(acp_)>();
acl_post_ops_t post_ops;
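The two predicates removed from this header encoded what the quantized implementation accepted: scale masks of 0 on src, weights, and dst, plus "common" zero points — that is, strictly per-tensor quantization with no per-channel weight scales. In user code that corresponds to attributes along these lines (a sketch against the oneDNN v3 C++ API, not taken from the commit):

```cpp
#include "oneapi/dnnl/dnnl.hpp"

// Build attributes for the only quantization layout the reverted int8
// convolution admitted: mask == 0, i.e. one scale (or zero point) shared
// by the whole tensor.
dnnl::primitive_attr make_per_tensor_int8_attr() {
    dnnl::primitive_attr attr;
    attr.set_scales_mask(DNNL_ARG_SRC, 0);
    attr.set_scales_mask(DNNL_ARG_WEIGHTS, 0); // no per-channel scales
    attr.set_scales_mask(DNNL_ARG_DST, 0);
    attr.set_zero_points_mask(DNNL_ARG_SRC, 0); // single "common" zero point
    attr.set_zero_points_mask(DNNL_ARG_DST, 0);
    return attr;
}
```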
2 changes: 1 addition & 1 deletion src/cpu/aarch64/acl_indirect_gemm_convolution.cpp
@@ -119,7 +119,7 @@ status_t acl_indirect_gemm_convolution_fwd_t::pd_t::init(engine_t *engine) {
auto scratchpad = scratchpad_registry().registrar();
return init_scratchpad(conv, scratchpad, indirect_conv_keys, engine,
post_ops, attr_.post_ops_, acp_.act_info, acp_.use_dst_acc_for_sum,
dst_md_, bias_md_, false);
dst_md_);
}

} // namespace aarch64
3 changes: 1 addition & 2 deletions src/cpu/aarch64/acl_winograd_convolution.cpp
@@ -75,8 +75,7 @@ status_t acl_wino_convolution_fwd_t::pd_t::init(engine_t *engine) {
auto scratchpad = scratchpad_registry().registrar();
const auto aux_mem = conv.workspace();
return init_scratchpad(conv, scratchpad, wino_conv_keys, engine, post_ops,
attr_.post_ops_, acp_.act_info, acp_.use_dst_acc_for_sum, dst_md_,
bias_md_, acp_.is_quantized);
attr_.post_ops_, acp_.act_info, acp_.use_dst_acc_for_sum, dst_md_);
}

status_t acl_wino_convolution_fwd_t::init(engine_t *engine) {
3 changes: 1 addition & 2 deletions src/cpu/cpu_convolution_list.cpp
@@ -490,7 +490,6 @@ const std::map<pk_dt_impl_key_t, std::vector<impl_list_item_t>> &impl_list_map()
}},
{{forward, s8, s8, s8}, {
CPU_INSTANCE_AVX512(brdgmm_dw_convolution_fwd_t)
CPU_INSTANCE_AARCH64_ACL(acl_gemm_convolution_fwd_t<s8, s8, s8, f32>)
CPU_INSTANCE_X64(ip_convolution_fwd_t)
CPU_INSTANCE_AMX(brgemm_1x1_convolution_fwd_t<avx512_core_amx>)
CPU_INSTANCE_AMX(brgemm_convolution_fwd_t<avx512_core_amx>)
Expand All @@ -511,6 +510,7 @@ const std::map<pk_dt_impl_key_t, std::vector<impl_list_item_t>> &impl_list_map()
CPU_INSTANCE_SSE41(jit_uni_x8s8s32x_1x1_convolution_fwd_t<sse41>)
CPU_INSTANCE_SSE41(jit_uni_x8s8s32x_convolution_fwd_t<sse41>)
CPU_INSTANCE_AARCH64(jit_sve_512_x8s8s32x_convolution_fwd_t<s8, s8>)
CPU_INSTANCE_AARCH64_ACL(acl_gemm_convolution_fwd_t<s8, s8, s8, s32>)
CPU_INSTANCE(gemm_x8s8s32x_convolution_fwd_t)
CPU_INSTANCE(ref_convolution_int8_fwd_t)
CPU_INSTANCE(ref_fused_convolution_fwd_t)
@@ -642,7 +642,6 @@ const std::map<pk_dt_impl_key_t, std::vector<impl_list_item_t>> &impl_list_map()
nullptr,
}},
{{forward, u8, s8, u8}, {
CPU_INSTANCE_AARCH64_ACL(acl_gemm_convolution_fwd_t<u8, s8, u8, f32>)
CPU_INSTANCE_AVX512(brdgmm_dw_convolution_fwd_t)
CPU_INSTANCE_X64(ip_convolution_fwd_t)
CPU_INSTANCE_AMX(brgemm_1x1_convolution_fwd_t<avx512_core_amx>)
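A reading note on this last file: entries in impl_list_map are tried top to bottom, and the first implementation whose initialization succeeds handles the primitive, so list position is dispatch priority. The revert therefore does two things at once: it retypes the ACL int8 kernel (bias f32 back to s32, registered only in the s8s8s8 list) and drops it below the SVE JIT kernel in that list. Schematically, with hypothetical types rather than oneDNN's own:

```cpp
#include <functional>
#include <optional>
#include <string>
#include <vector>

// First-match dispatch over an ordered candidate list: each candidate's
// init() reports whether it supports the requested case, and the first
// success wins -- so moving an entry down the list lowers its priority.
struct impl_candidate_t {
    std::string name;
    std::function<bool()> init;
};

std::optional<std::string> pick_impl(
        const std::vector<impl_candidate_t> &list) {
    for (const auto &candidate : list)
        if (candidate.init()) return candidate.name;
    return std::nullopt; // no implementation supports this case
}
```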
