Skip to content

Commit

Permalink
api: extend persistent cache API to OCL engine
Browse files Browse the repository at this point in the history
  • Loading branch information
densamoilov committed Oct 28, 2022
1 parent de2db04 commit 068071b
Show file tree
Hide file tree
Showing 11 changed files with 430 additions and 13 deletions.
59 changes: 58 additions & 1 deletion include/oneapi/dnnl/dnnl_ocl.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*******************************************************************************
* Copyright 2020-2021 Intel Corporation
* Copyright 2020-2022 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -106,6 +106,63 @@ dnnl_status_t DNNL_API dnnl_ocl_interop_memory_get_mem_object(
dnnl_status_t DNNL_API dnnl_ocl_interop_memory_set_mem_object(
dnnl_memory_t memory, cl_mem mem_object);

/// Retrieves a cache blob ID for the OpenCL device.
///
/// @warning
/// This API is intended to be used with
/// #dnnl_ocl_interop_engine_get_cache_blob() and
/// #dnnl_ocl_interop_engine_create_from_cache_blob(). The returned cache
/// blob ID can only be used as an ID of the cache blob returned by
/// #dnnl_ocl_interop_engine_get_cache_blob().
///
/// @note The cache blob ID can be empty (@p size will be 0 and
/// @p cache_blob_id will be nullptr) if oneDNN doesn't have anything to
/// put in the cache blob. (#dnnl_ocl_interop_engine_get_cache_blob will
/// return an empty cache blob).
///
/// @param device An OpenCL device.
/// @param size Size of the cache blob ID in bytes.
/// @param cache_blob_id Cache blob id of size @p size. If
/// the @p cache_blob_id is nullptr then the size of the cache blob ID is
/// returned in @p size.
/// @returns #dnnl_success on success and a status describing the error
/// otherwise.
dnnl_status_t DNNL_API dnnl_ocl_interop_engine_get_cache_blob_id(
cl_device_id device, size_t *size, uint8_t *cache_blob_id);

/// Retrieves a cache blob associated with the given engine.
///
/// @note The cache blob can be empty (@p size will be 0 and @p cache_blob
/// will be nullptr) if oneDNN doesn't have anything to put in the cache
/// blob. It's the user's responsibility to check whether it's empty
/// prior to passing it to
/// #dnnl_ocl_interop_engine_create_from_cache_blob().
///
/// @param engine Engine to query for the cache blob.
/// @param size Size of the cache blob in bytes.
/// @param cache_blob Cache blob of size @p size. If the @p cache_blob is
/// nullptr then the size of the cache blob is returned in @p size.
/// @returns #dnnl_success on success and a status describing the error
/// otherwise.
dnnl_status_t DNNL_API dnnl_ocl_interop_engine_get_cache_blob(
dnnl_engine_t engine, size_t *size, uint8_t *cache_blob);

/// Creates an engine from the given cache blob.
///
/// @param engine Output engine.
/// @param device The OpenCL device that this engine will encapsulate.
/// @param context The OpenCL context (containing the device) that this
///     engine will use for all operations.
/// @param size Size of the cache blob in bytes.
/// @param cache_blob Cache blob of size @p size.
/// @returns #dnnl_success on success and a status describing the error
///     otherwise.
dnnl_status_t DNNL_API dnnl_ocl_interop_engine_create_from_cache_blob(
        dnnl_engine_t *engine, cl_device_id device, cl_context context,
        size_t size, const uint8_t *cache_blob);

/// Creates an engine associated with an OpenCL device and an OpenCL context.
///
/// @param engine Output engine.
Expand Down
70 changes: 69 additions & 1 deletion include/oneapi/dnnl/dnnl_ocl.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*******************************************************************************
* Copyright 2020-2021 Intel Corporation
* Copyright 2020-2022 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -67,6 +67,74 @@ inline dnnl_ocl_interop_memory_kind_t convert_to_c(memory_kind akind) {
return static_cast<dnnl_ocl_interop_memory_kind_t>(akind);
}

/// Returns the cache blob ID of the OpenCL device.
///
/// @warning
/// This API is intended to be used with
/// #dnnl::ocl_interop::get_engine_cache_blob() and
/// #dnnl::ocl_interop::make_engine(cl_device_id, cl_context, const std::vector<uint8_t> &).
/// The returned cache blob ID can only be used as an ID of the cache blob
/// returned by #dnnl::ocl_interop::get_engine_cache_blob().
///
/// @note The cache blob ID can be empty (@p size will be 0 and
/// @p cache_blob_id will be nullptr) if oneDNN doesn't have anything to
/// put in the cache blob. (#dnnl_ocl_interop_engine_get_cache_blob will
/// return an empty cache blob).
///
/// @param device An OpenCL device.
/// @returns A vector containing the cache blob ID.
inline std::vector<uint8_t> get_engine_cache_blob_id(cl_device_id device) {
    // Two-step query: first obtain the size of the ID, then fetch its bytes.
    size_t id_size = 0;
    error::wrap_c_api(
            dnnl_ocl_interop_engine_get_cache_blob_id(device, &id_size, nullptr),
            "could not get an engine cache blob id size");

    std::vector<uint8_t> id_bytes(id_size);
    error::wrap_c_api(dnnl_ocl_interop_engine_get_cache_blob_id(
                              device, &id_size, id_bytes.data()),
            "could not get an engine cache blob id");
    return id_bytes;
}

/// Returns a cache blob for the engine.
///
/// @note The cache blob vector can be empty if oneDNN doesn't have anything
/// to put in the cache blob. It's the user's responsibility to check
/// whether it's empty prior to passing it to
/// #dnnl::ocl_interop::make_engine(cl_device_id, cl_context, const std::vector<uint8_t> &)
///
/// @param aengine Engine to query for the cache blob.
/// @returns Vector containing the cache blob.
inline std::vector<uint8_t> get_engine_cache_blob(const engine &aengine) {
size_t size = 0;
error::wrap_c_api(dnnl_ocl_interop_engine_get_cache_blob(
aengine.get(), &size, nullptr),
"could not get an engine cache blob size");

std::vector<uint8_t> cache_blob(size);
error::wrap_c_api(dnnl_ocl_interop_engine_get_cache_blob(
aengine.get(), &size, cache_blob.data()),
"could not get an engine cache blob");
return cache_blob;
}

/// Constructs an engine from the given cache blob.
///
/// @param device The OpenCL device that this engine will encapsulate.
/// @param context The OpenCL context (containing the device) that this
/// engine will use for all operations.
/// @param cache_blob Cache blob.
/// @returns An engine.
inline engine make_engine(cl_device_id device, cl_context context,
        const std::vector<uint8_t> &cache_blob) {
    // Create the underlying C engine handle from the blob, then wrap it.
    dnnl_engine_t engine_handle;
    const dnnl_status_t status = dnnl_ocl_interop_engine_create_from_cache_blob(
            &engine_handle, device, context, cache_blob.size(),
            cache_blob.data());
    error::wrap_c_api(status, "could not create an engine from cache blob");
    return engine(engine_handle);
}

/// Constructs an engine from OpenCL device and context objects.
///
/// @param device The OpenCL device that this engine will encapsulate.
Expand Down
13 changes: 10 additions & 3 deletions src/gpu/compute/compute_engine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,10 +66,17 @@ void device_info_cache_set(
}

status_t compute_engine_t::init() {
    // Reuse already-computed device info for this device when available.
    if (device_info_cache_get(&device_info_, this)) return status::success;

    // NOTE(review): init_device_info() is also invoked by init({}) below when
    // the cache blob is empty, which looks like a redundant double
    // initialization -- confirm against the init(cache_blob) overload.
    CHECK(init_device_info());
    return init({});
}

status_t compute_engine_t::init(const std::vector<uint8_t> &cache_blob) {
if (device_info_cache_get(&device_info_, this)) return status::success;
// Since init_device_info that takes a cache blob is only defined for
// OpenCL we need to do manual dispatching here.
if (cache_blob.empty())
CHECK(init_device_info());
else
CHECK(init_device_info(cache_blob));
device_info_cache_set(this, device_info_);

return status::success;
Expand Down
5 changes: 5 additions & 0 deletions src/gpu/compute/compute_engine.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ class compute_engine_t : public engine_t {
: engine_t(kind, runtime_kind, index) {}

virtual status_t init();
status_t init(const std::vector<uint8_t> &cache_blob);

const device_info_t *device_info() const { return device_info_.get(); }

Expand Down Expand Up @@ -173,6 +174,10 @@ class compute_engine_t : public engine_t {

protected:
virtual status_t init_device_info() = 0;
    // Fallback for engines that do not support initializing device info from
    // a cache blob (per compute_engine.cpp, only the OpenCL engine defines the
    // cache-blob overload). Reaching this default indicates a dispatch bug.
    virtual status_t init_device_info(const std::vector<uint8_t> &cache_blob) {
        assert(!"unexpected");
        return status::runtime_error;
    }

#ifdef DNNL_USE_RT_OBJECTS_IN_PRIMITIVE_CACHE
~compute_engine_t() override = default;
Expand Down
74 changes: 74 additions & 0 deletions src/gpu/compute/device_info.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

#include <cstring>
#include <mutex>
#include <thread>
#include <type_traits>

#include "gpu/compute/device_info.hpp"

Expand Down Expand Up @@ -200,6 +201,79 @@ status_t device_info_t::init_attributes_common(engine_t *engine) {
return status::success;
}

// Populates serialized_device_info_ with the bytes that serve as this
// engine's cache blob. When @p cache_blob is non-empty it is stored verbatim;
// otherwise the already-initialized device fields are serialized.
// The write order below is a wire format: it must match the read order in
// init_from_cache_blob() exactly.
status_t device_info_t::init_serialized_device_info(
        const std::vector<uint8_t> &cache_blob) {
    if (!cache_blob.empty()) {
        // A user-provided blob is kept as-is so it can be returned back via
        // the get_cache_blob() queries.
        serialized_device_info_.write(cache_blob.data(), cache_blob.size());
        return status::success;
    }

    serialized_device_info_.write(&gpu_arch_);
    serialized_device_info_.write(&stepping_id_);
    serialized_device_info_.write(&runtime_version_.major);
    serialized_device_info_.write(&runtime_version_.minor);
    serialized_device_info_.write(&runtime_version_.build);
    serialized_device_info_.write(hw_threads_, 2);
    serialized_device_info_.write(&eu_count_);
    serialized_device_info_.write(&max_eus_per_wg_);
    serialized_device_info_.write(&max_subgroup_size_);
    serialized_device_info_.write(&max_wg_size_);
    serialized_device_info_.write(&llc_cache_size_);
    serialized_device_info_.write(&extensions_);
    serialized_device_info_.write(&mayiuse_ngen_kernels_);
    serialized_device_info_.write(&checked_ngen_kernels_);
    serialized_device_info_.write(&mayiuse_non_uniform_work_groups_);

    // name_ is not trivially copyable: store its length followed by the raw
    // character data.
    const size_t name_size = name_.size();
    serialized_device_info_.write(&name_size);
    serialized_device_info_.write(name_.data(), name_size);

    return status::success;
}

// Restores all device_info_t fields from a cache blob produced by
// init_serialized_device_info(). The read order below must match the write
// order there exactly.
//
// @param cache_blob Serialized device info bytes.
// @returns status::success on success, status::invalid_arguments for an
//     empty or truncated/corrupted blob.
status_t device_info_t::init_from_cache_blob(
        const std::vector<uint8_t> &cache_blob) {
    if (cache_blob.empty()) return status::invalid_arguments;

    size_t pos = 0;
// Deserializes one trivially copyable field. std::memcpy is used instead of
// dereferencing a reinterpret_cast'ed pointer because blob offsets are not
// guaranteed to be suitably aligned for the target type (an unaligned load is
// UB). The bounds check rejects truncated blobs instead of reading past the
// end of the buffer -- the blob may come from the user via the public API.
#define DESERIALIZE(val, expected_type) \
    do { \
        static_assert( \
                std::is_same<std::remove_reference<decltype(val)>::type, \
                        expected_type>::value, \
                #val " has incorrect type"); \
        if (cache_blob.size() - pos < sizeof(expected_type)) \
            return status::invalid_arguments; \
        expected_type tmp_value_; \
        std::memcpy(&tmp_value_, cache_blob.data() + pos, \
                sizeof(expected_type)); \
        (val) = tmp_value_; \
        pos += sizeof(expected_type); \
    } while (0)

    DESERIALIZE(gpu_arch_, compute::gpu_arch_t);
    DESERIALIZE(stepping_id_, int);
    DESERIALIZE(runtime_version_.major, int);
    DESERIALIZE(runtime_version_.minor, int);
    DESERIALIZE(runtime_version_.build, int);
    DESERIALIZE(hw_threads_[0], int32_t);
    DESERIALIZE(hw_threads_[1], int32_t);
    DESERIALIZE(eu_count_, int32_t);
    DESERIALIZE(max_eus_per_wg_, int32_t);
    DESERIALIZE(max_subgroup_size_, int32_t);
    DESERIALIZE(max_wg_size_, size_t);
    DESERIALIZE(llc_cache_size_, size_t);
    DESERIALIZE(extensions_, uint64_t);
    DESERIALIZE(mayiuse_ngen_kernels_, bool);
    DESERIALIZE(checked_ngen_kernels_, bool);
    DESERIALIZE(mayiuse_non_uniform_work_groups_, bool);
#undef DESERIALIZE

    // name_ is not a trivially copyable type: it is serialized as its length
    // followed by the raw character data.
    if (cache_blob.size() - pos < sizeof(size_t))
        return status::invalid_arguments;
    size_t name_size = 0;
    std::memcpy(&name_size, cache_blob.data() + pos, sizeof(size_t));
    pos += sizeof(size_t);
    if (cache_blob.size() - pos < name_size) return status::invalid_arguments;
    name_ = std::string(
            reinterpret_cast<const char *>(cache_blob.data() + pos), name_size);
    pos += name_size;
    assert(pos == cache_blob.size());

    return status::success;
}

} // namespace compute
} // namespace gpu
} // namespace impl
Expand Down
36 changes: 35 additions & 1 deletion src/gpu/compute/device_info.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,10 @@
#include <string.h>

#include "common/c_types_map.hpp"
#include "common/serialization_stream.hpp"
#include "common/utils.hpp"
#include "common/z_magic.hpp"

#include "cpu/platform.hpp"
#include "oneapi/dnnl/dnnl_config.h"

Expand Down Expand Up @@ -190,14 +192,25 @@ struct device_info_t {
public:
virtual ~device_info_t() = default;

status_t init(engine_t *engine) {
// Initializes the device info either from a cache blob (fast path that skips
// all runtime queries) or by querying the engine's device directly.
// @param engine Engine whose device is queried (unused on the blob path).
// @param cache_blob Optional serialized device info; empty means "query".
status_t init(
        engine_t *engine, const std::vector<uint8_t> &cache_blob = {}) {
    if (!cache_blob.empty()) {
        // Restore all fields from the blob and keep a serialized copy so the
        // engine can later be re-queried for its cache blob.
        CHECK(init_from_cache_blob(cache_blob));
        return init_serialized_device_info(cache_blob);
    }

    CHECK(init_device_name(engine));
    CHECK(init_arch(engine));
    CHECK(init_runtime_version(engine));
    CHECK(init_extensions(engine));
    CHECK(init_attributes(engine));

    // Depends on the fields initialized above.
    CHECK(init_attributes_common(engine));

    // The persistent cache blob API is only exposed for the OpenCL runtime,
    // so the serialized form is only built there.
    if (dnnl_version()->gpu_runtime == DNNL_RUNTIME_OCL) {
        CHECK(init_serialized_device_info());
    }

    return status::success;
}

Expand Down Expand Up @@ -234,6 +247,22 @@ struct device_info_t {

bool mayiuse_sub_group(int size) const;

// Returns the serialized device info bytes used as this engine's cache blob.
const std::vector<uint8_t> &get_cache_blob() const {
    return serialized_device_info_.get_data();
}

// Writes the size (in bytes) of the serialized cache blob into @p size.
// @returns status::success, or status::invalid_arguments when @p size is
//     null (instead of dereferencing it).
status_t get_cache_blob_size(size_t *size) const {
    if (!size) return status::invalid_arguments;
    (*size) = serialized_device_info_.get_data().size();
    return status::success;
}

// Copies the serialized cache blob into @p cache_blob. The caller must pass
// the exact size previously obtained from get_cache_blob_size().
// @returns status::success, or status::invalid_arguments when @p size does
//     not match the stored blob size or @p cache_blob is null for a
//     non-empty blob (the original unconditionally called std::memcpy, which
//     is UB with a null destination even when size is 0).
status_t get_cache_blob(size_t size, uint8_t *cache_blob) const {
    const auto &cb = serialized_device_info_.get_data();
    if (size != cb.size()) return status::invalid_arguments;
    if (size > 0) {
        if (!cache_blob) return status::invalid_arguments;
        std::memcpy(cache_blob, cb.data(), size);
    }
    return status::success;
}

protected:
virtual status_t init_device_name(engine_t *engine) = 0;
virtual status_t init_arch(engine_t *engine) = 0;
Expand Down Expand Up @@ -262,11 +291,16 @@ struct device_info_t {

private:
status_t init_attributes_common(engine_t *engine);
status_t init_serialized_device_info(
const std::vector<uint8_t> &cache_blob = {});
status_t init_from_cache_blob(const std::vector<uint8_t> &cache_blob);

bool mayiuse_ngen_kernels_ = false;
bool checked_ngen_kernels_ = false;

bool mayiuse_non_uniform_work_groups_ = false;

serialization_stream_t serialized_device_info_;
};

} // namespace compute
Expand Down
Loading

0 comments on commit 068071b

Please sign in to comment.