Skip to content

Commit

Permalink
api: extend persistent cache API to OCL engine
Browse files Browse the repository at this point in the history
  • Loading branch information
densamoilov committed Oct 28, 2022
1 parent de2db04 commit 068071b
Show file tree
Hide file tree
Showing 11 changed files with 430 additions and 13 deletions.
59 changes: 58 additions & 1 deletion include/oneapi/dnnl/dnnl_ocl.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*******************************************************************************
* Copyright 2020-2021 Intel Corporation
* Copyright 2020-2022 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -106,6 +106,63 @@ dnnl_status_t DNNL_API dnnl_ocl_interop_memory_get_mem_object(
dnnl_status_t DNNL_API dnnl_ocl_interop_memory_set_mem_object(
dnnl_memory_t memory, cl_mem mem_object);

/// Retrieves a cache blob ID for the OpenCL device.
///
/// @warning
/// This API is intended to be used with
/// #dnnl_ocl_interop_engine_get_cache_blob() and
/// #dnnl_ocl_interop_engine_create_from_cache_blob(). The returned cache
/// blob ID can only be used as an ID of the cache blob returned by
/// #dnnl_ocl_interop_engine_get_cache_blob().
///
/// @note The cache blob ID can be empty (@p size will be 0 and
/// @p cache_blob_id will be nullptr) if oneDNN doesn't have anything to
/// put in the cache blob. (#dnnl_ocl_interop_engine_get_cache_blob will
/// return an empty cache blob).
///
/// @param device An OpenCL device.
/// @param size Size of the cache blob ID in bytes.
/// @param cache_blob_id Cache blob id of size @p size. If
/// the @p cache_blob_id is nullptr then the size of the cache blob ID is
/// returned in @p size.
/// @returns #dnnl_success on success and a status describing the error
/// otherwise.
dnnl_status_t DNNL_API dnnl_ocl_interop_engine_get_cache_blob_id(
cl_device_id device, size_t *size, uint8_t *cache_blob_id);

/// Retrieves a cache blob associated with the given engine.
///
/// @note The cache blob can be empty (@p size will be 0 and @p cache_blob
/// will be nullptr) if oneDNN doesn't have anything to put in the cache
/// blob. It's the user's responsibility to check whether it's empty
/// prior to passing it to
/// #dnnl_ocl_interop_engine_create_from_cache_blob().
///
/// @param engine Engine to query for the cache blob.
/// @param size Size of the cache blob in bytes.
/// @param cache_blob Cache blob of size @p size. If the @p cache_blob is
/// nullptr then the size of the cache blob is returned in @p size.
/// @returns #dnnl_success on success and a status describing the error
/// otherwise.
dnnl_status_t DNNL_API dnnl_ocl_interop_engine_get_cache_blob(
dnnl_engine_t engine, size_t *size, uint8_t *cache_blob);

/// Creates an engine from the given cache blob.
///
/// @param engine Output engine.
/// @param device The OpenCL device that this engine will encapsulate.
/// @param context The OpenCL context (containing the device) that this
///     engine will use for all operations.
/// @param size Size of the cache blob in bytes.
/// @param cache_blob Cache blob of size @p size.
/// @returns #dnnl_success on success and a status describing the error
///     otherwise.
dnnl_status_t DNNL_API dnnl_ocl_interop_engine_create_from_cache_blob(
        dnnl_engine_t *engine, cl_device_id device, cl_context context,
        size_t size, const uint8_t *cache_blob);

/// Creates an engine associated with an OpenCL device and an OpenCL context.
///
/// @param engine Output engine.
Expand Down
70 changes: 69 additions & 1 deletion include/oneapi/dnnl/dnnl_ocl.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*******************************************************************************
* Copyright 2020-2021 Intel Corporation
* Copyright 2020-2022 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -67,6 +67,74 @@ inline dnnl_ocl_interop_memory_kind_t convert_to_c(memory_kind akind) {
return static_cast<dnnl_ocl_interop_memory_kind_t>(akind);
}

/// Returns the cache blob ID of the OpenCL device.
///
/// @warning
/// This API is intended to be used with
/// #dnnl::ocl_interop::get_engine_cache_blob() and
/// #dnnl::ocl_interop::make_engine(cl_device_id, cl_context, const std::vector<uint8_t> &).
/// The returned cache blob ID can only be used as an ID of the cache blob
/// returned by #dnnl::ocl_interop::get_engine_cache_blob().
///
/// @note The cache blob ID can be empty (@p size will be 0 and
/// @p cache_blob_id will be nullptr) if oneDNN doesn't have anything to
/// put in the cache blob. (#dnnl_ocl_interop_engine_get_cache_blob will
/// return an empty cache blob).
///
/// @param device An OpenCL device.
/// @returns A vector containing the cache blob ID.
inline std::vector<uint8_t> get_engine_cache_blob_id(cl_device_id device) {
    // Two-step query: first obtain the size of the ID, then fetch its bytes.
    size_t id_size = 0;
    error::wrap_c_api(
            dnnl_ocl_interop_engine_get_cache_blob_id(device, &id_size, nullptr),
            "could not get an engine cache blob id size");

    std::vector<uint8_t> id_bytes(id_size);
    error::wrap_c_api(dnnl_ocl_interop_engine_get_cache_blob_id(
                              device, &id_size, id_bytes.data()),
            "could not get an engine cache blob id");
    return id_bytes;
}

/// Returns a cache blob for the engine.
///
/// @note The cache blob vector can be empty if oneDNN doesn't have anything
/// to put in the cache blob. It's the user's responsibility to check
/// whether it's empty prior to passing it to
/// #dnnl::ocl_interop::make_engine(cl_device_id, cl_context, const std::vector<uint8_t> &)
///
/// @param aengine Engine to query for the cache blob.
/// @returns Vector containing the cache blob.
inline std::vector<uint8_t> get_engine_cache_blob(const engine &aengine) {
size_t size = 0;
error::wrap_c_api(dnnl_ocl_interop_engine_get_cache_blob(
aengine.get(), &size, nullptr),
"could not get an engine cache blob size");

std::vector<uint8_t> cache_blob(size);
error::wrap_c_api(dnnl_ocl_interop_engine_get_cache_blob(
aengine.get(), &size, cache_blob.data()),
"could not get an engine cache blob");
return cache_blob;
}

/// Constructs an engine from the given cache blob.
///
/// @param device The OpenCL device that this engine will encapsulate.
/// @param context The OpenCL context (containing the device) that this
/// engine will use for all operations.
/// @param cache_blob Cache blob.
/// @returns An engine.
inline engine make_engine(cl_device_id device, cl_context context,
        const std::vector<uint8_t> &cache_blob) {
    // Create the underlying C engine handle from the blob, then wrap it.
    dnnl_engine_t engine_handle;
    const dnnl_status_t status = dnnl_ocl_interop_engine_create_from_cache_blob(
            &engine_handle, device, context, cache_blob.size(),
            cache_blob.data());
    error::wrap_c_api(status, "could not create an engine from cache blob");
    return engine(engine_handle);
}

/// Constructs an engine from OpenCL device and context objects.
///
/// @param device The OpenCL device that this engine will encapsulate.
Expand Down
13 changes: 10 additions & 3 deletions src/gpu/compute/compute_engine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,10 +66,17 @@ void device_info_cache_set(
}

status_t compute_engine_t::init() {
    // Reuse already-computed device info for this device when available.
    if (device_info_cache_get(&device_info_, this)) return status::success;

    // NOTE(review): init_device_info() is also invoked by init({}) below when
    // the cache blob is empty, which looks like a redundant double
    // initialization -- confirm against the init(cache_blob) overload.
    CHECK(init_device_info());
    return init({});
}

status_t compute_engine_t::init(const std::vector<uint8_t> &cache_blob) {
if (device_info_cache_get(&device_info_, this)) return status::success;
// Since init_device_info that takes a cache blob is only defined for
// OpenCL we need to do manual dispatching here.
if (cache_blob.empty())
CHECK(init_device_info());
else
CHECK(init_device_info(cache_blob));
device_info_cache_set(this, device_info_);

return status::success;
Expand Down
5 changes: 5 additions & 0 deletions src/gpu/compute/compute_engine.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ class compute_engine_t : public engine_t {
: engine_t(kind, runtime_kind, index) {}

virtual status_t init();
status_t init(const std::vector<uint8_t> &cache_blob);

const device_info_t *device_info() const { return device_info_.get(); }

Expand Down Expand Up @@ -173,6 +174,10 @@ class compute_engine_t : public engine_t {

protected:
virtual status_t init_device_info() = 0;
    // Fallback for engines that do not support initializing device info from
    // a cache blob (per compute_engine.cpp, only the OpenCL engine defines the
    // cache-blob overload). Reaching this default indicates a dispatch bug.
    virtual status_t init_device_info(const std::vector<uint8_t> &cache_blob) {
        assert(!"unexpected");
        return status::runtime_error;
    }

#ifdef DNNL_USE_RT_OBJECTS_IN_PRIMITIVE_CACHE
~compute_engine_t() override = default;
Expand Down
74 changes: 74 additions & 0 deletions src/gpu/compute/device_info.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

#include <cstring>
#include <mutex>
#include <thread>
#include <type_traits>

#include "gpu/compute/device_info.hpp"

Expand Down Expand Up @@ -200,6 +201,79 @@ status_t device_info_t::init_attributes_common(engine_t *engine) {
return status::success;
}

// Populates serialized_device_info_ with the bytes that serve as this
// engine's cache blob. When @p cache_blob is non-empty it is stored verbatim;
// otherwise the already-initialized device fields are serialized.
// The write order below is a wire format: it must match the read order in
// init_from_cache_blob() exactly.
status_t device_info_t::init_serialized_device_info(
        const std::vector<uint8_t> &cache_blob) {
    if (!cache_blob.empty()) {
        // A user-provided blob is kept as-is so it can be returned back via
        // the get_cache_blob() queries.
        serialized_device_info_.write(cache_blob.data(), cache_blob.size());
        return status::success;
    }

    serialized_device_info_.write(&gpu_arch_);
    serialized_device_info_.write(&stepping_id_);
    serialized_device_info_.write(&runtime_version_.major);
    serialized_device_info_.write(&runtime_version_.minor);
    serialized_device_info_.write(&runtime_version_.build);
    serialized_device_info_.write(hw_threads_, 2);
    serialized_device_info_.write(&eu_count_);
    serialized_device_info_.write(&max_eus_per_wg_);
    serialized_device_info_.write(&max_subgroup_size_);
    serialized_device_info_.write(&max_wg_size_);
    serialized_device_info_.write(&llc_cache_size_);
    serialized_device_info_.write(&extensions_);
    serialized_device_info_.write(&mayiuse_ngen_kernels_);
    serialized_device_info_.write(&checked_ngen_kernels_);
    serialized_device_info_.write(&mayiuse_non_uniform_work_groups_);

    // name_ is not trivially copyable: store its length followed by the raw
    // character data.
    const size_t name_size = name_.size();
    serialized_device_info_.write(&name_size);
    serialized_device_info_.write(name_.data(), name_size);

    return status::success;
}

// Restores all device_info_t fields from a cache blob produced by
// init_serialized_device_info(). The read order below must match the write
// order there exactly.
//
// @param cache_blob Serialized device info bytes.
// @returns status::success on success, status::invalid_arguments for an
//     empty or truncated/corrupted blob.
status_t device_info_t::init_from_cache_blob(
        const std::vector<uint8_t> &cache_blob) {
    if (cache_blob.empty()) return status::invalid_arguments;

    size_t pos = 0;
// Deserializes one trivially copyable field. std::memcpy is used instead of
// dereferencing a reinterpret_cast'ed pointer because blob offsets are not
// guaranteed to be suitably aligned for the target type (an unaligned load is
// UB). The bounds check rejects truncated blobs instead of reading past the
// end of the buffer -- the blob may come from the user via the public API.
#define DESERIALIZE(val, expected_type) \
    do { \
        static_assert( \
                std::is_same<std::remove_reference<decltype(val)>::type, \
                        expected_type>::value, \
                #val " has incorrect type"); \
        if (cache_blob.size() - pos < sizeof(expected_type)) \
            return status::invalid_arguments; \
        expected_type tmp_value_; \
        std::memcpy(&tmp_value_, cache_blob.data() + pos, \
                sizeof(expected_type)); \
        (val) = tmp_value_; \
        pos += sizeof(expected_type); \
    } while (0)

    DESERIALIZE(gpu_arch_, compute::gpu_arch_t);
    DESERIALIZE(stepping_id_, int);
    DESERIALIZE(runtime_version_.major, int);
    DESERIALIZE(runtime_version_.minor, int);
    DESERIALIZE(runtime_version_.build, int);
    DESERIALIZE(hw_threads_[0], int32_t);
    DESERIALIZE(hw_threads_[1], int32_t);
    DESERIALIZE(eu_count_, int32_t);
    DESERIALIZE(max_eus_per_wg_, int32_t);
    DESERIALIZE(max_subgroup_size_, int32_t);
    DESERIALIZE(max_wg_size_, size_t);
    DESERIALIZE(llc_cache_size_, size_t);
    DESERIALIZE(extensions_, uint64_t);
    DESERIALIZE(mayiuse_ngen_kernels_, bool);
    DESERIALIZE(checked_ngen_kernels_, bool);
    DESERIALIZE(mayiuse_non_uniform_work_groups_, bool);
#undef DESERIALIZE

    // name_ is not a trivially copyable type: it is serialized as its length
    // followed by the raw character data.
    if (cache_blob.size() - pos < sizeof(size_t))
        return status::invalid_arguments;
    size_t name_size = 0;
    std::memcpy(&name_size, cache_blob.data() + pos, sizeof(size_t));
    pos += sizeof(size_t);
    if (cache_blob.size() - pos < name_size) return status::invalid_arguments;
    name_ = std::string(
            reinterpret_cast<const char *>(cache_blob.data() + pos), name_size);
    pos += name_size;
    assert(pos == cache_blob.size());

    return status::success;
}

} // namespace compute
} // namespace gpu
} // namespace impl
Expand Down
36 changes: 35 additions & 1 deletion src/gpu/compute/device_info.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,10 @@
#include <string.h>

#include "common/c_types_map.hpp"
#include "common/serialization_stream.hpp"
#include "common/utils.hpp"
#include "common/z_magic.hpp"

#include "cpu/platform.hpp"
#include "oneapi/dnnl/dnnl_config.h"

Expand Down Expand Up @@ -190,14 +192,25 @@ struct device_info_t {
public:
virtual ~device_info_t() = default;

status_t init(engine_t *engine) {
// Initializes the device info either from a cache blob (fast path that skips
// all runtime queries) or by querying the engine's device directly.
// @param engine Engine whose device is queried (unused on the blob path).
// @param cache_blob Optional serialized device info; empty means "query".
status_t init(
        engine_t *engine, const std::vector<uint8_t> &cache_blob = {}) {
    if (!cache_blob.empty()) {
        // Restore all fields from the blob and keep a serialized copy so the
        // engine can later be re-queried for its cache blob.
        CHECK(init_from_cache_blob(cache_blob));
        return init_serialized_device_info(cache_blob);
    }

    CHECK(init_device_name(engine));
    CHECK(init_arch(engine));
    CHECK(init_runtime_version(engine));
    CHECK(init_extensions(engine));
    CHECK(init_attributes(engine));

    // Depends on the fields initialized above.
    CHECK(init_attributes_common(engine));

    // The persistent cache blob API is only exposed for the OpenCL runtime,
    // so the serialized form is only built there.
    if (dnnl_version()->gpu_runtime == DNNL_RUNTIME_OCL) {
        CHECK(init_serialized_device_info());
    }

    return status::success;
}

Expand Down Expand Up @@ -234,6 +247,22 @@ struct device_info_t {

bool mayiuse_sub_group(int size) const;

// Returns the serialized device info bytes used as this engine's cache blob.
const std::vector<uint8_t> &get_cache_blob() const {
    return serialized_device_info_.get_data();
}

// Writes the size (in bytes) of the serialized cache blob into @p size.
// @returns status::success, or status::invalid_arguments when @p size is
//     null (instead of dereferencing it).
status_t get_cache_blob_size(size_t *size) const {
    if (!size) return status::invalid_arguments;
    (*size) = serialized_device_info_.get_data().size();
    return status::success;
}

// Copies the serialized cache blob into @p cache_blob. The caller must pass
// the exact size previously obtained from get_cache_blob_size().
// @returns status::success, or status::invalid_arguments when @p size does
//     not match the stored blob size or @p cache_blob is null for a
//     non-empty blob (the original unconditionally called std::memcpy, which
//     is UB with a null destination even when size is 0).
status_t get_cache_blob(size_t size, uint8_t *cache_blob) const {
    const auto &cb = serialized_device_info_.get_data();
    if (size != cb.size()) return status::invalid_arguments;
    if (size > 0) {
        if (!cache_blob) return status::invalid_arguments;
        std::memcpy(cache_blob, cb.data(), size);
    }
    return status::success;
}

protected:
virtual status_t init_device_name(engine_t *engine) = 0;
virtual status_t init_arch(engine_t *engine) = 0;
Expand Down Expand Up @@ -262,11 +291,16 @@ struct device_info_t {

private:
status_t init_attributes_common(engine_t *engine);
status_t init_serialized_device_info(
const std::vector<uint8_t> &cache_blob = {});
status_t init_from_cache_blob(const std::vector<uint8_t> &cache_blob);

bool mayiuse_ngen_kernels_ = false;
bool checked_ngen_kernels_ = false;

bool mayiuse_non_uniform_work_groups_ = false;

serialization_stream_t serialized_device_info_;
};

} // namespace compute
Expand Down
Loading

0 comments on commit 068071b

Please sign in to comment.