From 25b1240da6e004000cb26b213d1bd42c720e3345 Mon Sep 17 00:00:00 2001
From: Daniel Taller <taller1@llnl.gov>
Date: Tue, 8 Sep 2020 09:24:10 -0700
Subject: [PATCH 01/15] surround gpu functions by CHAI_GPUCC

---
 src/chai/ArrayManager.hpp | 4 ++++
 1 file changed, 4 insertions(+)
diff --git a/src/chai/ArrayManager.hpp b/src/chai/ArrayManager.hpp
index 294492aa..af57b023 100644
--- a/src/chai/ArrayManager.hpp
+++ b/src/chai/ArrayManager.hpp
@@ -66,6 +66,8 @@ inline void gpuErrorCheck(hipError_t code, const char *file, int line, bool abor
 
 #endif
 
+#if defined(CHAI_GPUCC)
+
 // wrapper for hip/cuda synchronize
 inline void synchronize() {
 #if defined (CHAI_ENABLE_HIP) &&!defined(__HIP_DEVICE_COMPILE__)
@@ -111,6 +113,8 @@ CHAI_HOST inline void  gpuMemcpy(void* dst, const void* src, size_t count, gpuMe
 #endif
 }
 
+#endif //#if defined(CHAI_GPUCC)
+
 /*!
  * \brief Singleton that manages caching and movement of ManagedArray objects.
  *

From 2d1b393b844570c34d5acc114ea5096a56ec327c Mon Sep 17 00:00:00 2001
From: Daniel Taller <taller1@llnl.gov>
Date: Tue, 8 Sep 2020 09:35:33 -0700
Subject: [PATCH 02/15] oops, macro in wrong place

---
 src/chai/ArrayManager.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/chai/ArrayManager.hpp b/src/chai/ArrayManager.hpp
index af57b023..41237001 100644
--- a/src/chai/ArrayManager.hpp
+++ b/src/chai/ArrayManager.hpp
@@ -66,8 +66,6 @@ inline void gpuErrorCheck(hipError_t code, const char *file, int line, bool abor
 
 #endif
 
-#if defined(CHAI_GPUCC)
-
 // wrapper for hip/cuda synchronize
 inline void synchronize() {
 #if defined (CHAI_ENABLE_HIP) &&!defined(__HIP_DEVICE_COMPILE__)
@@ -77,6 +75,8 @@ inline void synchronize() {
 #endif
 }
 
+#if defined(CHAI_GPUCC)
+
 // wrapper for hip/cuda free
 CHAI_HOST inline void gpuFree(void* buffer) {
 #if defined (CHAI_ENABLE_HIP)

From afd9c0764b618b09ecb8859dc21c679386c640a8 Mon Sep 17 00:00:00 2001
From: Alan Dayton <dayton8@llnl.gov>
Date: Tue, 8 Sep 2020 17:14:35 -0700
Subject: [PATCH 03/15] Add cdata method

---
 src/chai/ManagedArray.hpp      | 12 +++++++++++-
 src/chai/ManagedArray.inl      | 27 +++++++++++++++++++++++++--
 src/chai/ManagedArray_thin.inl |  8 +++++++-
 3 files changed, 43 insertions(+), 4 deletions(-)

diff --git a/src/chai/ManagedArray.hpp b/src/chai/ManagedArray.hpp
index fe313219..81503b40 100644
--- a/src/chai/ManagedArray.hpp
+++ b/src/chai/ManagedArray.hpp
@@ -173,7 +173,8 @@ class ManagedArray : public CHAICopyable
    */
   CHAI_HOST void registerTouch(ExecutionSpace space);
 
-  CHAI_HOST void move(ExecutionSpace space=NONE) const;
+  CHAI_HOST void move(ExecutionSpace space=NONE,
+                      bool registerTouch=!std::is_const<T>::value) const;
 
   CHAI_HOST_DEVICE ManagedArray<T> slice(size_t begin, size_t elems=(size_t)-1) const;
 
@@ -207,6 +208,15 @@ class ManagedArray : public CHAICopyable
    */
   CHAI_HOST_DEVICE T* data() const;
 
+  /*!
+   * \brief Move data to the current execution space (actually determined
+   *        by where the code is executing) and return a raw pointer. Do
+   *        not mark data as touched since a pointer to const is returned.
+   *
+   * \return Raw pointer to data in the current execution space
+   */
+  CHAI_HOST_DEVICE const T* cdata() const;
+
   /*!
    * \brief Return the raw pointer to the data in the given execution
    *        space. Optionally move the data to that execution space.
diff --git a/src/chai/ManagedArray.inl b/src/chai/ManagedArray.inl
index b922ca59..4eebf458 100644
--- a/src/chai/ManagedArray.inl
+++ b/src/chai/ManagedArray.inl
@@ -386,7 +386,7 @@ CHAI_HOST_DEVICE void ManagedArray<T>::decr(size_t i) const {
 template <typename T>
 CHAI_INLINE
 CHAI_HOST
-void ManagedArray<T>::move(ExecutionSpace space) const
+void ManagedArray<T>::move(ExecutionSpace space, bool registerTouch) const
 {
   if (m_pointer_record != &ArrayManager::s_null_record) {
      ExecutionSpace prev_space = m_pointer_record->m_last_space;
@@ -408,7 +408,7 @@ void ManagedArray<T>::move(ExecutionSpace space) const
     if (m_pointer_record->m_last_space == PINNED) {
     } else 
 #endif
-     if (!std::is_const<T>::value) {
+     if (registerTouch) {
        CHAI_LOG(Debug, "T is non-const, registering touch of pointer" << m_active_pointer);
        m_resource_manager->registerTouch(m_pointer_record, space);
      }
@@ -499,6 +499,29 @@ T* ManagedArray<T>::data() const {
 #endif
 }
 
+template<typename T>
+CHAI_INLINE
+CHAI_HOST_DEVICE
+const T* ManagedArray<T>::cdata() const {
+#if !defined(CHAI_DEVICE_COMPILE)
+  if (m_active_pointer) {
+     if (m_pointer_record == nullptr || m_pointer_record == &ArrayManager::s_null_record) {
+        CHAI_LOG(Warning, "nullptr pointer_record associated with non-nullptr active_pointer")
+     }
+
+     move(CPU, false);
+  }
+
+  if (m_elems == 0 && !m_is_slice) {
+     return nullptr;
+  }
+
+  return m_active_pointer;
+#else
+  return m_active_pointer;
+#endif
+}
+
 template<typename T>
 T* ManagedArray<T>::data(ExecutionSpace space, bool do_move) const {
    if (m_pointer_record == nullptr || m_pointer_record == &ArrayManager::s_null_record) {
diff --git a/src/chai/ManagedArray_thin.inl b/src/chai/ManagedArray_thin.inl
index d0a63fd4..25e0c5e2 100644
--- a/src/chai/ManagedArray_thin.inl
+++ b/src/chai/ManagedArray_thin.inl
@@ -118,6 +118,12 @@ CHAI_HOST_DEVICE T* ManagedArray<T>::data() const
    return m_active_pointer;
 }
 
+template <typename T>
+CHAI_HOST_DEVICE T* ManagedArray<T>::cdata() const
+{
+   return m_active_pointer;
+}
+
 template <typename T>
 T* ManagedArray<T>::data(ExecutionSpace /*space*/, bool /*do_move*/) const
 {
@@ -265,7 +271,7 @@ CHAI_INLINE CHAI_HOST void ManagedArray<T>::registerTouch(ExecutionSpace)
 }
 
 template <typename T>
-CHAI_INLINE CHAI_HOST void ManagedArray<T>::move(ExecutionSpace) const
+CHAI_INLINE CHAI_HOST void ManagedArray<T>::move(ExecutionSpace, bool) const
 {
 }
 

From 7d3338cb06486a70b42506c6ffe896a6f0fa8974 Mon Sep 17 00:00:00 2001
From: Alan Dayton <dayton8@llnl.gov>
Date: Wed, 9 Sep 2020 17:41:30 -0700
Subject: [PATCH 04/15] Adds explicit unpack functionality to make_managed

---
 src/chai/managed_ptr.hpp | 630 +++++++++++++++++++++------------------
 1 file changed, 348 insertions(+), 282 deletions(-)

diff --git a/src/chai/managed_ptr.hpp b/src/chai/managed_ptr.hpp
index 0285d824..64784959 100644
--- a/src/chai/managed_ptr.hpp
+++ b/src/chai/managed_ptr.hpp
@@ -59,8 +59,8 @@ namespace chai {
    ///    used in both contexts with a single API.
    /// The make_managed and make_managed_from_factory functions call new on both the
    ///    host and device so that polymorphism is valid in both contexts. Simply copying
-   ///    an object to the device will not copy the vtable, so new must be called on
-   ///    the device.
+   ///    the bits of an object to the device will not copy the vtable, so new must be
+   ///    called on the device.
    ///
    /// Usage Requirements:
    ///    Methods that can be called on the host and/or device must be declared
@@ -71,27 +71,30 @@ namespace chai {
    ///       is updated and vice versa. If you wish to keep both instances in sync,
    ///       you must explicitly modify the object in both the host context and the
    ///       device context.
-   ///    Raw array members of T need to be initialized correctly with a host or
-   ///       device array. If a ManagedArray is passed to the make_managed or
-   ///       make_managed_from_factory methods in place of a raw array, it will be
-   ///       cast to the appropriate host or device pointer when passed to T's
-   ///       constructor on the host and on the device. If it is desired that these
-   ///       host and device pointers be kept in sync, define a callback that maintains
-   ///       a copy of the ManagedArray and upon the ACTION_MOVE event calls the copy
-   ///       constructor of that ManagedArray.
-   ///    If a raw array is passed to make_managed, accessing that member will be
+   ///    C-style array members of T need to be initialized correctly with a host or
+   ///       device C-style array. If a ManagedArray is passed to the make_managed or
+   ///       make_managed_from_factory methods in place of a C-style array, wrap it in
+   ///       a call to chai::unpack to extract the C-style arrays contained within the
+   ///       ManagedArray. This will pass the extracted host C-style array to the host
+   ///       constructor and the extracted device C-style array to the device
+   ///       constructor. If it is desired that these host and device C-style arrays be
+   ///       kept in sync like the normal behavior of ManagedArray, define a callback
+   ///       that maintains a copy of the ManagedArray and upon the ACTION_MOVE event
+   ///       calls the copy constructor of that ManagedArray.
+   ///    If a C-style array is passed to make_managed, accessing that member will be
    ///       valid only in the correct context. To prevent the accidental use of that
    ///       member in the wrong context, any methods that access it should be __host__
-   ///       only or __device__ only. Special care should be taken when passing raw
+   ///       only or __device__ only. Special care should be taken when passing C-style
    ///       arrays as arguments to member functions.
-   ///    The same restrictions for raw array members also apply to raw pointer members.
-   ///       A managed_ptr can be passed to the make_managed or make_managed_from_factory
-   ///       methods in place of a raw pointer, and the host constructor of T will
-   ///       be given the extracted host pointer, and likewise the device constructor
-   ///       of T will be given the extracted device pointer. If it is desired that these
-   ///       host and device pointers be kept in sync, define a callback that maintains
-   ///       a copy of the managed_ptr and upon the ACTION_MOVE event calls the copy
-   ///       constructor of that managed_ptr.
+   ///    The same restrictions for C-style array members also apply to raw pointer
+   ///       members. If a managed_ptr is passed to the make_managed or
+   ///       make_managed_from_factory methods in place of a raw pointer, wrap it in
+   ///       a call to chai::unpack to extract the raw pointers contained within the
+   ///       managed_ptr. This will pass the extracted host pointer to the host
+   ///       constructor and the extracted device pointer to the device constructor.
+   ///       If it is desired that these host and device pointers be kept in sync,
+   ///       define a callback that maintains a copy of the managed_ptr and upon the
+   ///       ACTION_MOVE event call the copy constructor of that managed_ptr.
    ///    Again, if a raw pointer is passed to make_managed, accessing that member will
    ///       only be valid in the correct context. Take care when passing raw pointers
    ///       as arguments to member functions.
@@ -101,10 +104,10 @@ namespace chai {
    ///       every kernel, call ArrayManager::getInstance()->enableDeviceSynchronize().
    ///       Alternatively, call cudaDeviceSynchronize() after any call to make_managed,
    ///       make_managed_from_factory, or managed_ptr::free, and check the return code
-   ///       for errors. If your code crashes in the constructor/destructor of T, then it
-   ///       is recommended to turn on this synchronization. For example, the constructor
-   ///       of T might run out of per-thread stack space on the GPU. If that happens,
-   ///       you can increase the device limit of per-thread stack space.
+   ///       for errors. If your code crashes in the constructor/destructor of T, then
+   ///       it is recommended to turn on this synchronization. For example, the
+   ///       constructor of T might run out of per-thread stack space on the GPU. If
+   ///       that happens, you can increase the device limit of per-thread stack space.
    ///
    template <typename T>
    class managed_ptr {
@@ -491,12 +494,14 @@ namespace chai {
                         switch (execSpace) {
                            case CPU:
                               delete pointer;
+                              m_cpu_pointer = nullptr;
                               break;
 #if defined(CHAI_GPUCC)
                            case GPU:
                            {
                               if (pointer) {
                                  detail::destroy_on_device<<<1, 1>>>(temp);
+                                 m_gpu_pointer = nullptr;
 
 #ifndef CHAI_DISABLE_RM
                                  if (ArrayManager::getInstance()->deviceSynchronize()) {
@@ -523,12 +528,14 @@ namespace chai {
                      switch (execSpace) {
                         case CPU:
                            delete pointer;
+                           m_cpu_pointer = nullptr;
                            break;
 #if defined(CHAI_GPUCC)
                         case GPU:
                         {
                            if (pointer) {
                               detail::destroy_on_device<<<1, 1>>>(pointer);
+                              m_gpu_pointer = nullptr;
 
 #ifndef CHAI_DISABLE_RM
                               if (ArrayManager::getInstance()->deviceSynchronize()) {
@@ -547,6 +554,7 @@ namespace chai {
                }
 
                delete m_pointer_record;
+               m_pointer_record = nullptr;
             }
          }
 
@@ -601,7 +609,80 @@ namespace chai {
          }
    };
 
+   ///
+   /// @author Alan Dayton
+   ///
+   /// A wrapper used for unpacking the internal pointers in the correct space
+   ///
+   template <typename T>
+   class ManagedArrayUnpacker {
+      public:
+         CHAI_HOST ManagedArrayUnpacker() = delete;
+
+         ///
+         /// @author Alan Dayton
+         ///
+         /// Constructor
+         ///
+         /// @param[in] arg The ManagedArray to unpack
+         ///
+         /// @return a new instance of ManagedArrayUnpacker
+         ///
+         explicit CHAI_HOST ManagedArrayUnpacker(const ManagedArray<T>& arg)
+            : m_array{arg}
+         {}
+
+         ///
+         /// @author Alan Dayton
+         ///
+         /// Unpacks the data
+         ///
+         /// @return the unpacked data
+         ///
+         CHAI_HOST_DEVICE T* data() const { return m_array.data(); }
+
+      private:
+         ManagedArray<T> m_array = nullptr; //!< The ManagedArray to unpack
+   };
+
+   ///
+   /// @author Alan Dayton
+   ///
+   /// A wrapper used for unpacking the internal pointers in the correct space
+   ///
+   template <typename T>
+   class managed_ptr_unpacker {
+      public:
+         CHAI_HOST managed_ptr_unpacker() = delete;
+
+         ///
+         /// @author Alan Dayton
+         ///
+         /// Constructor
+         ///
+         /// @param[in] arg The managed_ptr to unpack
+         ///
+         /// @return a new instance of managed_ptr_unpacker
+         ///
+         explicit CHAI_HOST managed_ptr_unpacker(const managed_ptr<T>& arg)
+            : m_managed_ptr{arg}
+         {}
+
+         ///
+         /// @author Alan Dayton
+         ///
+         /// Unpacks the data
+         ///
+         /// @return the unpacked data
+         ///
+         CHAI_HOST_DEVICE T* get() const { return m_managed_ptr.get(); }
+
+      private:
+         managed_ptr<T> m_managed_ptr = nullptr; //!< The managed_ptr to unpack
+   };
+
    namespace detail {
+
       ///
       /// @author Alan Dayton
       ///
@@ -612,7 +693,7 @@ namespace chai {
       /// @return arg
       ///
       template <typename T>
-      CHAI_HOST_DEVICE T getRawPointers(T arg) {
+      CHAI_HOST_DEVICE T processArguments(const T& arg) {
          return arg;
       }
 
@@ -626,7 +707,7 @@ namespace chai {
       /// @return arg cast to a raw pointer
       ///
       template <typename T>
-      CHAI_HOST_DEVICE T* getRawPointers(ManagedArray<T> arg) {
+      CHAI_HOST_DEVICE T* processArguments(const ManagedArrayUnpacker<T>& arg) {
          return arg.data();
       }
 
@@ -643,156 +724,11 @@ namespace chai {
       /// @return a raw pointer acquired from arg
       ///
       template <typename T>
-      CHAI_HOST_DEVICE T* getRawPointers(managed_ptr<T> arg) {
+      CHAI_HOST_DEVICE T* processArguments(const managed_ptr_unpacker<T>& arg) {
          return arg.get();
       }
 
-      ///
-      /// @author Alan Dayton
-      ///
-      /// Creates a new object on the host and returns a pointer to it.
-      /// This implementation of new_on_host is called when no arguments need to be
-      ///    converted to raw pointers.
-      ///
-      /// @param[in] args The arguments to T's constructor
-      ///
-      /// @return a pointer to the new object on the host
-      ///
-      template <typename T,
-                typename... Args,
-                typename std::enable_if<std::is_constructible<T, Args...>::value, int>::type = 0>
-      CHAI_HOST T* new_on_host(Args&&... args) {
-         return new T(args...);
-      }
-
-      ///
-      /// @author Alan Dayton
-      ///
-      /// Creates a new object on the host and returns a pointer to it.
-      /// This implementation of new_on_host is called when arguments do need to be
-      ///    converted to raw pointers.
-      ///
-      /// @param[in] args The arguments to T's constructor
-      ///
-      /// @return a pointer to the new object on the host
-      ///
-      template <typename T,
-                typename... Args,
-                typename std::enable_if<!std::is_constructible<T, Args...>::value, int>::type = 0>
-      CHAI_HOST T* new_on_host(Args&&... args) {
-         return new T(getRawPointers(args)...);
-      }
-
-      ///
-      /// @author Alan Dayton
-      ///
-      /// Creates a new T on the host.
-      /// Sets the execution space to the CPU so that ManagedArrays and managed_ptrs
-      ///    are moved to the host as necessary.
-      ///
-      /// @param[in]  args The arguments to T's constructor
-      ///
-      /// @return The host pointer to the new T
-      ///
-      template <typename T,
-                typename... Args>
-      CHAI_HOST T* make_on_host(Args&&... args) {
-#ifndef CHAI_DISABLE_RM
-         // Get the ArrayManager and save the current execution space
-         chai::ArrayManager* arrayManager = chai::ArrayManager::getInstance();
-         ExecutionSpace currentSpace = arrayManager->getExecutionSpace();
-
-         // Set the execution space so that ManagedArrays and managed_ptrs
-         // are handled properly
-         arrayManager->setExecutionSpace(CPU);
-#endif
-
-         // Create on the host
-         T* cpuPointer = detail::new_on_host<T>(args...);
-
-#ifndef CHAI_DISABLE_RM
-         // Set the execution space back to the previous value
-         arrayManager->setExecutionSpace(currentSpace);
-#endif
-
-         // Return the CPU pointer
-         return cpuPointer;
-      }
-
-      ///
-      /// @author Alan Dayton
-      ///
-      /// Calls a factory method to create a new object on the host.
-      /// Sets the execution space to the CPU so that ManagedArrays and managed_ptrs
-      ///    are moved to the host as necessary.
-      ///
-      /// @param[in]  f    The factory method
-      /// @param[in]  args The arguments to the factory method
-      ///
-      /// @return The host pointer to the new object
-      ///
-      template <typename T,
-                typename F,
-                typename... Args>
-      CHAI_HOST T* make_on_host_from_factory(F f, Args&&... args) {
-#ifndef CHAI_DISABLE_RM
-         // Get the ArrayManager and save the current execution space
-         chai::ArrayManager* arrayManager = chai::ArrayManager::getInstance();
-         ExecutionSpace currentSpace = arrayManager->getExecutionSpace();
-
-         // Set the execution space so that ManagedArrays and managed_ptrs
-         // are handled properly
-         arrayManager->setExecutionSpace(CPU);
-#endif
-
-         // Create the object on the device
-         T* cpuPointer = f(args...);
-
-#ifndef CHAI_DISABLE_RM
-         // Set the execution space back to the previous value
-         arrayManager->setExecutionSpace(currentSpace);
-#endif
-
-         // Return the GPU pointer
-         return cpuPointer;
-      }
-
 #if defined(CHAI_GPUCC)
-      ///
-      /// @author Alan Dayton
-      ///
-      /// Creates a new object on the device and returns a pointer to it.
-      /// This implementation of new_on_device is called when no arguments need to be
-      ///    converted to raw pointers.
-      ///
-      /// @param[in] args The arguments to T's constructor
-      ///
-      /// @return a pointer to the new object on the device
-      ///
-      template <typename T,
-                typename... Args,
-                typename std::enable_if<std::is_constructible<T, Args...>::value, int>::type = 0>
-      CHAI_DEVICE void new_on_device(T** gpuPointer, Args&&... args) {
-         *gpuPointer = new T(args...);
-      }
-
-      ///
-      /// @author Alan Dayton
-      ///
-      /// Creates a new object on the device and returns a pointer to it.
-      /// This implementation of new_on_device is called when arguments do need to be
-      ///    converted to raw pointers.
-      ///
-      /// @param[in] args The arguments to T's constructor
-      ///
-      /// @return a pointer to the new object on the device
-      ///
-      template <typename T,
-                typename... Args,
-                typename std::enable_if<!std::is_constructible<T, Args...>::value, int>::type = 0>
-      CHAI_DEVICE void new_on_device(T** gpuPointer, Args&&... args) {
-         *gpuPointer = new T(getRawPointers(args)...);
-      }
 
       ///
       /// @author Alan Dayton
@@ -809,7 +745,7 @@ namespace chai {
                 typename... Args>
       __global__ void make_on_device(T** gpuPointer, Args... args)
       {
-         new_on_device(gpuPointer, args...);
+         *gpuPointer = new T(processArguments(args)...);
       }
 
       ///
@@ -830,7 +766,7 @@ namespace chai {
                 typename... Args>
       __global__ void make_on_device_from_factory(T** gpuPointer, F f, Args... args)
       {
-         *gpuPointer = f(args...);
+         *gpuPointer = f(processArguments(args)...);
       }
 
       ///
@@ -843,149 +779,279 @@ namespace chai {
       template <typename T>
       __global__ void destroy_on_device(T* gpuPointer)
       {
-         if (gpuPointer) {
-            delete gpuPointer;
-         }
+         delete gpuPointer;
       }
 
-      ///
-      /// @author Alan Dayton
-      ///
-      /// Creates a new T on the device.
-      ///
-      /// @param[in]  args The arguments to T's constructor
-      ///
-      /// @return The device pointer to the new T
-      ///
-      template <typename T,
-                typename... Args>
-      CHAI_HOST T* make_on_device(Args... args) {
+#endif
+
+      // Adapted from "The C++ Programming Language," Fourth Edition,
+      // by Bjarne Stroustrup, pp. 814-816
+      // Used to determine if a functor is callable with the given arguments
+      struct substitution_failure {};
+
+      template <typename T>
+      struct substitution_succeeded : std::true_type {};
+
+      template<>
+      struct substitution_succeeded<substitution_failure> : std::false_type {};
+
+      template <typename F, typename... Args>
+      struct is_invocable_impl {
+         private:
+            template <typename X, typename... Ts>
+            static auto check(X const& x, Ts&&... ts) -> decltype(x(ts...));
+            static substitution_failure check(...);
+         public:
+            using type = decltype(check(std::declval<F>(), std::declval<Args>()...));
+      };
+
+      template <typename F, typename... Args>
+      struct is_invocable : substitution_succeeded<typename is_invocable_impl<F, Args...>::type> {};
+   } // namespace detail
+
+   ///
+   /// @author Alan Dayton
+   ///
+   /// Unpacks the pointers contained in the ManagedArray and passes them to the
+   /// corresponding spaces.
+   ///
+   /// @param[in] arg The ManagedArray to unpack
+   ///
+   /// @return A wrapper used for unpacking the internal pointers in the correct space
+   ///
+   template <typename T>
+   CHAI_HOST ManagedArrayUnpacker<T> unpack(const ManagedArray<T>& arg) {
+      return ManagedArrayUnpacker<T>(arg);
+   }
+
+   ///
+   /// @author Alan Dayton
+   ///
+   /// Unpacks the pointers contained in the managed_ptr and passes them to the
+   /// corresponding spaces.
+   ///
+   /// @param[in] arg The managed_ptr to unpack
+   ///
+   /// @return A wrapper used for unpacking the internal pointers in the correct space
+   ///
+   template <typename T>
+   CHAI_HOST managed_ptr_unpacker<T> unpack(const managed_ptr<T>& arg) {
+      return managed_ptr_unpacker<T>(arg);
+   }
+
+   ///
+   /// @author Alan Dayton
+   ///
+   /// Creates a new T on the host.
+   /// Sets the execution space to the CPU so that ManagedArrays and managed_ptrs
+   ///    are moved to the host as necessary.
+   ///
+   /// @param[in]  args The arguments to T's constructor
+   ///
+   /// @return The host pointer to the new T
+   ///
+   template <typename T,
+             typename... Args>
+   CHAI_HOST T* make_on_host(Args&&... args) {
 #ifndef CHAI_DISABLE_RM
-         // Get the ArrayManager and save the current execution space
-         chai::ArrayManager* arrayManager = chai::ArrayManager::getInstance();
-         ExecutionSpace currentSpace = arrayManager->getExecutionSpace();
+      // Get the ArrayManager and save the current execution space
+      chai::ArrayManager* arrayManager = chai::ArrayManager::getInstance();
+      ExecutionSpace currentSpace = arrayManager->getExecutionSpace();
 
-         // Set the execution space so that ManagedArrays and managed_ptrs
-         // are handled properly
-         arrayManager->setExecutionSpace(GPU);
+      // Set the execution space so that ManagedArrays and managed_ptrs
+      // are handled properly
+      arrayManager->setExecutionSpace(CPU);
 #endif
 
-         // Allocate space on the GPU to hold the pointer to the new object
-         T** gpuBuffer;
-         gpuMalloc((void**)(&gpuBuffer), sizeof(T*));
-
-         // Create the object on the device
-         make_on_device<<<1, 1>>>(gpuBuffer, args...);
+      // Create on the host
+      T* cpuPointer = new T(detail::processArguments(args)...);
 
 #ifndef CHAI_DISABLE_RM
-         if (ArrayManager::getInstance()->deviceSynchronize()) {
-            synchronize();
-         }
+      // Set the execution space back to the previous value
+      arrayManager->setExecutionSpace(currentSpace);
 #endif
 
-         // Allocate space on the CPU for the pointer and copy the pointer to the CPU
-         T** cpuBuffer = (T**) malloc(sizeof(T*));
-         gpuMemcpy(cpuBuffer, gpuBuffer, sizeof(T*), gpuMemcpyDeviceToHost);
+      // Return the CPU pointer
+      return cpuPointer;
+   }
 
-         // Get the GPU pointer
-         T* gpuPointer = cpuBuffer[0];
+   ///
+   /// @author Alan Dayton
+   ///
+   /// Calls a factory method to create a new object on the host.
+   /// Sets the execution space to the CPU so that ManagedArrays and managed_ptrs
+   ///    are moved to the host as necessary.
+   ///
+   /// @param[in]  f    The factory method
+   /// @param[in]  args The arguments to the factory method
+   ///
+   /// @return The host pointer to the new object
+   ///
+   template <typename T,
+             typename F,
+             typename... Args>
+   CHAI_HOST T* make_on_host_from_factory(F f, Args&&... args) {
+#ifndef CHAI_DISABLE_RM
+      // Get the ArrayManager and save the current execution space
+      chai::ArrayManager* arrayManager = chai::ArrayManager::getInstance();
+      ExecutionSpace currentSpace = arrayManager->getExecutionSpace();
 
-         // Free the host and device buffers
-         free(cpuBuffer);
-         gpuFree(gpuBuffer);
+      // Set the execution space so that ManagedArrays and managed_ptrs
+      // are handled properly
+      arrayManager->setExecutionSpace(CPU);
+#endif
+
+      // Create the object on the device
+      T* cpuPointer = f(args...);
 
 #ifndef CHAI_DISABLE_RM
-         // Set the execution space back to the previous value
-         arrayManager->setExecutionSpace(currentSpace);
+      // Set the execution space back to the previous value
+      arrayManager->setExecutionSpace(currentSpace);
 #endif
 
-         // Return the GPU pointer
-         return gpuPointer;
-      }
+      // Return the GPU pointer
+      return cpuPointer;
+   }
 
-      ///
-      /// @author Alan Dayton
-      ///
-      /// Calls a factory method to create a new object on the device.
-      ///
-      /// @param[in]  f    The factory method
-      /// @param[in]  args The arguments to the factory method
-      ///
-      /// @return The device pointer to the new object
-      ///
-      template <typename T,
-                typename F,
-                typename... Args>
-      CHAI_HOST T* make_on_device_from_factory(F f, Args&&... args) {
+   ///
+   /// @author Alan Dayton
+   ///
+   /// Destroys the host pointer.
+   ///
+   /// @param[out] cpuPointer The host pointer to clean up
+   ///
+   template <typename T>
+   CHAI_HOST void destroy_on_host(T* cpuPointer) {
+      delete cpuPointer;
+   }
+
+#if defined(CHAI_GPUCC)
+
+   ///
+   /// @author Alan Dayton
+   ///
+   /// Creates a new T on the device.
+   ///
+   /// @param[in]  args The arguments to T's constructor
+   ///
+   /// @return The device pointer to the new T
+   ///
+   template <typename T,
+             typename... Args>
+   CHAI_HOST T* make_on_device(Args... args) {
 #ifndef CHAI_DISABLE_RM
-         // Get the ArrayManager and save the current execution space
-         chai::ArrayManager* arrayManager = chai::ArrayManager::getInstance();
-         ExecutionSpace currentSpace = arrayManager->getExecutionSpace();
+      // Get the ArrayManager and save the current execution space
+      chai::ArrayManager* arrayManager = chai::ArrayManager::getInstance();
+      ExecutionSpace currentSpace = arrayManager->getExecutionSpace();
 
-         // Set the execution space so that chai::ManagedArrays and
-         // chai::managed_ptrs are handled properly
-         arrayManager->setExecutionSpace(GPU);
+      // Set the execution space so that ManagedArrays and managed_ptrs
+      // are handled properly
+      arrayManager->setExecutionSpace(GPU);
 #endif
 
-         // Allocate space on the GPU to hold the pointer to the new object
-         T** gpuBuffer;
-         gpuMalloc((void**)(&gpuBuffer), sizeof(T*));
+      // Allocate space on the GPU to hold the pointer to the new object
+      T** gpuBuffer;
+      gpuMalloc((void**)(&gpuBuffer), sizeof(T*));
 
-         // Create the object on the device
-         make_on_device_from_factory<T><<<1, 1>>>(gpuBuffer, f, args...);
+      // Create the object on the device
+      make_on_device<<<1, 1>>>(gpuBuffer, args...);
 
 #ifndef CHAI_DISABLE_RM
-         if (ArrayManager::getInstance()->deviceSynchronize()) {
-            synchronize();
-         }
+      if (ArrayManager::getInstance()->deviceSynchronize()) {
+         synchronize();
+      }
 #endif
 
-         // Allocate space on the CPU for the pointer and copy the pointer to the CPU
-         T** cpuBuffer = (T**) malloc(sizeof(T*));
-         gpuMemcpy(cpuBuffer, gpuBuffer, sizeof(T*), gpuMemcpyDeviceToHost);
+      // Allocate space on the CPU for the pointer and copy the pointer to the CPU
+      T** cpuBuffer = (T**) malloc(sizeof(T*));
+      gpuMemcpy(cpuBuffer, gpuBuffer, sizeof(T*), gpuMemcpyDeviceToHost);
 
-         // Get the GPU pointer
-         T* gpuPointer = cpuBuffer[0];
+      // Get the GPU pointer
+      T* gpuPointer = cpuBuffer[0];
 
-         // Free the host and device buffers
-         free(cpuBuffer);
-         gpuFree(gpuBuffer);
+      // Free the host and device buffers
+      free(cpuBuffer);
+      gpuFree(gpuBuffer);
 
 #ifndef CHAI_DISABLE_RM
-         // Set the execution space back to the previous value
-         arrayManager->setExecutionSpace(currentSpace);
+      // Set the execution space back to the previous value
+      arrayManager->setExecutionSpace(currentSpace);
 #endif
 
-         // Return the GPU pointer
-         return gpuPointer;
-      }
+      // Return the GPU pointer
+      return gpuPointer;
+   }
+
+   ///
+   /// @author Alan Dayton
+   ///
+   /// Calls a factory method to create a new object on the device.
+   ///
+   /// @param[in]  f    The factory method
+   /// @param[in]  args The arguments to the factory method
+   ///
+   /// @return The device pointer to the new object
+   ///
+   template <typename T,
+             typename F,
+             typename... Args>
+   CHAI_HOST T* make_on_device_from_factory(F f, Args&&... args) {
+#ifndef CHAI_DISABLE_RM
+      // Get the ArrayManager and save the current execution space
+      chai::ArrayManager* arrayManager = chai::ArrayManager::getInstance();
+      ExecutionSpace currentSpace = arrayManager->getExecutionSpace();
 
+      // Set the execution space so that chai::ManagedArrays and
+      // chai::managed_ptrs are handled properly
+      arrayManager->setExecutionSpace(GPU);
 #endif
 
-      // Adapted from "The C++ Programming Language," Fourth Edition,
-      // by Bjarne Stroustrup, pp. 814-816
-      // Used to determine if a functor is callable with the given arguments
-      struct substitution_failure {};
+      // Allocate space on the GPU to hold the pointer to the new object
+      T** gpuBuffer;
+      gpuMalloc((void**)(&gpuBuffer), sizeof(T*));
 
-      template <typename T>
-      struct substitution_succeeded : std::true_type {};
+      // Create the object on the device
+      make_on_device_from_factory<T><<<1, 1>>>(gpuBuffer, f, args...);
 
-      template<>
-      struct substitution_succeeded<substitution_failure> : std::false_type {};
+#ifndef CHAI_DISABLE_RM
+      if (ArrayManager::getInstance()->deviceSynchronize()) {
+         synchronize();
+      }
+#endif
 
-      template <typename F, typename... Args>
-      struct is_invocable_impl {
-         private:
-            template <typename X, typename... Ts>
-            static auto check(X const& x, Ts&&... ts) -> decltype(x(ts...));
-            static substitution_failure check(...);
-         public:
-            using type = decltype(check(std::declval<F>(), std::declval<Args>()...));
-      };
+      // Allocate space on the CPU for the pointer and copy the pointer to the CPU
+      T** cpuBuffer = (T**) malloc(sizeof(T*));
+      gpuMemcpy(cpuBuffer, gpuBuffer, sizeof(T*), gpuMemcpyDeviceToHost);
 
-      template <typename F, typename... Args>
-      struct is_invocable : substitution_succeeded<typename is_invocable_impl<F, Args...>::type> {};
-   } // namespace detail
+      // Get the GPU pointer
+      T* gpuPointer = cpuBuffer[0];
+
+      // Free the host and device buffers
+      free(cpuBuffer);
+      gpuFree(gpuBuffer);
+
+#ifndef CHAI_DISABLE_RM
+      // Set the execution space back to the previous value
+      arrayManager->setExecutionSpace(currentSpace);
+#endif
+
+      // Return the GPU pointer
+      return gpuPointer;
+   }
+
+   ///
+   /// @author Alan Dayton
+   ///
+   /// Destroys the device pointer.
+   ///
+   /// @param[out] gpuPointer The device pointer to clean up
+   ///
+   template <typename T>
+   CHAI_HOST void destroy_on_device(T* gpuPointer) {
+      detail::destroy_on_device<<<1, 1>>>(gpuPointer);
+   }
+
+#endif
 
    ///
    /// @author Alan Dayton
@@ -1000,11 +1066,11 @@ namespace chai {
    CHAI_HOST managed_ptr<T> make_managed(Args... args) {
 #if defined(CHAI_GPUCC)
       // Construct on the GPU first to take advantage of asynchrony
-      T* gpuPointer = detail::make_on_device<T>(args...);
+      T* gpuPointer = make_on_device<T>(args...);
 #endif
 
       // Construct on the CPU
-      T* cpuPointer = detail::make_on_host<T>(args...);
+      T* cpuPointer = make_on_host<T>(args...);
 
       // Construct and return the managed_ptr
 #if defined(CHAI_GPUCC)
@@ -1040,11 +1106,11 @@ namespace chai {
 
 #if defined(CHAI_GPUCC)
       // Construct on the GPU first to take advantage of asynchrony
-      T* gpuPointer = detail::make_on_device_from_factory<R>(f, args...);
+      T* gpuPointer = make_on_device_from_factory<R>(f, args...);
 #endif
 
       // Construct on the CPU
-      T* cpuPointer = detail::make_on_host_from_factory<R>(f, args...);
+      T* cpuPointer = make_on_host_from_factory<R>(f, args...);
 
       // Construct and return the managed_ptr
 #if defined(CHAI_GPUCC)

From 7f12ee1ac96f245578e65440cb19687b128b1490 Mon Sep 17 00:00:00 2001
From: Alan Dayton <dayton8@llnl.gov>
Date: Wed, 9 Sep 2020 17:55:23 -0700
Subject: [PATCH 05/15] Fix GPU build issue

---
 src/chai/managed_ptr.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/chai/managed_ptr.hpp b/src/chai/managed_ptr.hpp
index 64784959..902701ab 100644
--- a/src/chai/managed_ptr.hpp
+++ b/src/chai/managed_ptr.hpp
@@ -954,7 +954,7 @@ namespace chai {
       gpuMalloc((void**)(&gpuBuffer), sizeof(T*));
 
       // Create the object on the device
-      make_on_device<<<1, 1>>>(gpuBuffer, args...);
+      detail::make_on_device<<<1, 1>>>(gpuBuffer, args...);
 
 #ifndef CHAI_DISABLE_RM
       if (ArrayManager::getInstance()->deviceSynchronize()) {
@@ -1011,7 +1011,7 @@ namespace chai {
       gpuMalloc((void**)(&gpuBuffer), sizeof(T*));
 
       // Create the object on the device
-      make_on_device_from_factory<T><<<1, 1>>>(gpuBuffer, f, args...);
+      detail::make_on_device_from_factory<T><<<1, 1>>>(gpuBuffer, f, args...);
 
 #ifndef CHAI_DISABLE_RM
       if (ArrayManager::getInstance()->deviceSynchronize()) {

From d5456333094f2f0682430bd78f439acbe086a703 Mon Sep 17 00:00:00 2001
From: Alan Dayton <dayton8@llnl.gov>
Date: Wed, 9 Sep 2020 18:01:21 -0700
Subject: [PATCH 06/15] Update managed_ptr tests

---
 tests/integration/managed_ptr_tests.cpp | 22 +++++++++++++---------
 tests/unit/managed_ptr_unit_tests.cpp   |  6 +++---
 2 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/tests/integration/managed_ptr_tests.cpp b/tests/integration/managed_ptr_tests.cpp
index 50e40a17..441bc638 100644
--- a/tests/integration/managed_ptr_tests.cpp
+++ b/tests/integration/managed_ptr_tests.cpp
@@ -160,7 +160,7 @@ TEST(managed_ptr, class_with_raw_array)
     array[i] = expectedValue;
   });
 
-  auto rawArrayClass = chai::make_managed<RawArrayClass>(array);
+  auto rawArrayClass = chai::make_managed<RawArrayClass>(chai::unpack(array));
 
   ASSERT_EQ(rawArrayClass->getValue(0), expectedValue);
 
@@ -181,7 +181,9 @@ TEST(managed_ptr, class_with_multiple_raw_arrays)
      array2[i] = expectedValue2;
   });
 
-  auto multipleRawArrayClass = chai::make_managed<MultipleRawArrayClass>(array1, array2);
+  auto multipleRawArrayClass = chai::make_managed<MultipleRawArrayClass>(
+                                 chai::unpack(array1),
+                                 chai::unpack(array2));
 
   ASSERT_EQ(multipleRawArrayClass->getValue(0, 0), expectedValue1);
   ASSERT_EQ(multipleRawArrayClass->getValue(1, 0), expectedValue2);
@@ -219,8 +221,9 @@ TEST(managed_ptr, class_with_raw_ptr)
      array[i] = expectedValue;
   });
 
-  auto rawArrayClass = chai::make_managed<RawArrayClass>(array);
-  auto rawPointerClass = chai::make_managed<RawPointerClass>(rawArrayClass);
+  auto rawArrayClass = chai::make_managed<RawArrayClass>(chai::unpack(array));
+  auto rawPointerClass = chai::make_managed<RawPointerClass>(
+                           chai::unpack(rawArrayClass));
 
   ASSERT_EQ((*rawPointerClass).getValue(0), expectedValue);
 
@@ -452,7 +455,7 @@ GPU_TEST(managed_ptr, gpu_class_with_raw_array)
      array[i] = expectedValue;
   });
 
-  auto rawArrayClass = chai::make_managed<RawArrayClass>(array);
+  auto rawArrayClass = chai::make_managed<RawArrayClass>(chai::unpack(array));
   chai::ManagedArray<int> results(1, chai::GPU);
 
   forall(gpu(), 0, 1, [=] __device__ (int i) {
@@ -482,7 +485,7 @@ GPU_TEST(managed_ptr, gpu_class_with_raw_array_and_callback)
 #else
   auto cpuPointer = new RawArrayClass(array.data());
 #endif
-  auto gpuPointer = chai::detail::make_on_device<RawArrayClass>(array);
+  auto gpuPointer = chai::make_on_device<RawArrayClass>(chai::unpack(array));
 
   auto callback = [=] (chai::Action action, chai::ExecutionSpace space, void*) mutable -> bool {
      switch (action) {
@@ -553,8 +556,9 @@ GPU_TEST(managed_ptr, gpu_class_with_raw_ptr)
      array[0] = expectedValue;
   });
 
-  auto rawArrayClass = chai::make_managed<RawArrayClass>(array);
-  auto rawPointerClass = chai::make_managed<RawPointerClass>(rawArrayClass);
+  auto rawArrayClass = chai::make_managed<RawArrayClass>(chai::unpack(array));
+  auto rawPointerClass = chai::make_managed<RawPointerClass>(
+                           chai::unpack(rawArrayClass));
 
   chai::ManagedArray<int> results(1, chai::GPU);
 
@@ -793,7 +797,7 @@ TEST(managed_ptr, class_with_raw_array_of_pointers)
   chai::ManagedArray<int> array(1, chai::CPU);
   array[0] = expectedValue;
 
-  auto rawArrayClass = chai::make_managed<RawArrayClass>(array);
+  auto rawArrayClass = chai::make_managed<RawArrayClass>(chai::unpack(array));
   chai::managed_ptr<RawArrayClass> arrayOfPointers[1] = {rawArrayClass};
 
   auto rawArrayOfPointersClass = chai::make_managed<RawArrayOfPointersClass>(arrayOfPointers);
diff --git a/tests/unit/managed_ptr_unit_tests.cpp b/tests/unit/managed_ptr_unit_tests.cpp
index 927e6373..f6743251 100644
--- a/tests/unit/managed_ptr_unit_tests.cpp
+++ b/tests/unit/managed_ptr_unit_tests.cpp
@@ -552,7 +552,7 @@ GPU_TEST(managed_ptr, gpu_nullptr_constructor)
 
 GPU_TEST(managed_ptr, gpu_gpu_pointer_constructor)
 {
-  TestDerived* gpuPointer = chai::detail::make_on_device<TestDerived>(3);
+  TestDerived* gpuPointer = chai::make_on_device<TestDerived>(3);
   chai::managed_ptr<TestDerived> derived({chai::GPU}, {gpuPointer});
 
   EXPECT_EQ(derived.get(), nullptr);
@@ -656,7 +656,7 @@ GPU_TEST(managed_ptr, gpu_new_and_delete_on_device_2)
 
 GPU_TEST(managed_ptr, simple_gpu_cpu_and_gpu_pointer_constructor)
 {
-  Simple* gpuPointer = chai::detail::make_on_device<Simple>(3);
+  Simple* gpuPointer = chai::make_on_device<Simple>(3);
   Simple* cpuPointer = new Simple(4);
 
   chai::managed_ptr<Simple> simple({chai::GPU, chai::CPU}, {gpuPointer, cpuPointer});
@@ -680,7 +680,7 @@ GPU_TEST(managed_ptr, simple_gpu_cpu_and_gpu_pointer_constructor)
 
 GPU_TEST(managed_ptr, gpu_cpu_and_gpu_pointer_constructor)
 {
-  TestDerived* gpuPointer = chai::detail::make_on_device<TestDerived>(3);
+  TestDerived* gpuPointer = chai::make_on_device<TestDerived>(3);
   TestDerived* cpuPointer = new TestDerived(4);
 
   chai::managed_ptr<TestDerived> derived({chai::GPU, chai::CPU}, {gpuPointer, cpuPointer});

From e24bd98c68434cf47a6015b0ddbfa035ba2e10bb Mon Sep 17 00:00:00 2001
From: Alan Dayton <dayton8@llnl.gov>
Date: Wed, 16 Sep 2020 14:57:55 -0700
Subject: [PATCH 07/15] Add better documentation for unpack and unpack classes

---
 src/chai/managed_ptr.hpp | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/chai/managed_ptr.hpp b/src/chai/managed_ptr.hpp
index 902701ab..1afcf9c2 100644
--- a/src/chai/managed_ptr.hpp
+++ b/src/chai/managed_ptr.hpp
@@ -612,7 +612,9 @@ namespace chai {
    ///
    /// @author Alan Dayton
    ///
-   /// A wrapper used for unpacking the internal pointers in the correct space
+   /// A wrapper used by the make_managed family of functions to indicate when
+   /// the internal pointers contained by a ManagedArray should be extracted.
+   /// It is not intended to be used directly, but rather created by unpack.
    ///
    template <typename T>
    class ManagedArrayUnpacker {
@@ -648,7 +650,9 @@ namespace chai {
    ///
    /// @author Alan Dayton
    ///
-   /// A wrapper used for unpacking the internal pointers in the correct space
+   /// A wrapper used by the make_managed family of functions to indicate when
+   /// the internal pointers contained by a managed_ptr should be extracted.
+   /// It is not intended to be used directly, but rather created by unpack.
    ///
    template <typename T>
    class managed_ptr_unpacker {
@@ -817,7 +821,8 @@ namespace chai {
    ///
    /// @param[in] arg The ManagedArray to unpack
    ///
-   /// @return A wrapper used for unpacking the internal pointers in the correct space
+   /// @return A wrapper used by make_managed for unpacking the internal pointers
+   ///         in the correct space
    ///
    template <typename T>
    CHAI_HOST ManagedArrayUnpacker<T> unpack(const ManagedArray<T>& arg) {
@@ -832,7 +837,8 @@ namespace chai {
    ///
    /// @param[in] arg The managed_ptr to unpack
    ///
-   /// @return A wrapper used for unpacking the internal pointers in the correct space
+   /// @return A wrapper used by make_managed for unpacking the internal pointers
+   ///         in the correct space
    ///
    template <typename T>
    CHAI_HOST managed_ptr_unpacker<T> unpack(const managed_ptr<T>& arg) {

From 35c1c50ec04fe7cade4dd2b478f5cf948d455d8e Mon Sep 17 00:00:00 2001
From: Alan Dayton <dayton8@llnl.gov>
Date: Wed, 16 Sep 2020 16:01:11 -0700
Subject: [PATCH 08/15] Add some initial sphinx documentation for managed_ptr

---
 docs/sphinx/index.rst      |   1 +
 docs/sphinx/user_guide.rst | 189 +++++++++++++++++++++++++++++++++++++
 2 files changed, 190 insertions(+)
 create mode 100644 docs/sphinx/user_guide.rst

diff --git a/docs/sphinx/index.rst b/docs/sphinx/index.rst
index e35e93a2..b391a334 100644
--- a/docs/sphinx/index.rst
+++ b/docs/sphinx/index.rst
@@ -63,6 +63,7 @@ Any questions? Contact chai-dev@llnl.gov
 
   getting_started
   tutorial
+  user_guide
 
 .. toctree::
   :maxdepth: 2
diff --git a/docs/sphinx/user_guide.rst b/docs/sphinx/user_guide.rst
new file mode 100644
index 00000000..d53db0f3
--- /dev/null
+++ b/docs/sphinx/user_guide.rst
@@ -0,0 +1,189 @@
+.. Copyright (c) 2016, Lawrence Livermore National Security, LLC. All
+ rights reserved.
+ 
+ Produced at the Lawrence Livermore National Laboratory
+ 
+ This file is part of CHAI.
+ 
+ LLNL-CODE-705877
+ 
+ For details, see https:://github.com/LLNL/CHAI
+ Please also see the NOTICE and LICENSE files.
+ 
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ 
+ - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+ 
+ - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the
+   distribution.
+ 
+ - Neither the name of the LLNS/LLNL nor the names of its contributors
+   may be used to endorse or promote products derived from this
+   software without specific prior written permission.
+ 
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+ WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
+
+.. _user_guide:
+
+**********
+User Guide
+**********
+
+-----------------------------------
+A Portable Pattern for Polymorphism
+-----------------------------------
+
+CHAI provides a data structure to help handle cases where it is desirable to call virtual functions on the device. If you only call virtual functions on the host, this pattern is unnecessary. But for those who do want to use virtual functions on the device without a painstaking amount of refactoring, we begin with a short, albeit admittedly contrived example.
+
+.. code-block:: cpp
+
+   class MyBaseClass {
+      public:
+         MyBaseClass() {}
+         virtual ~MyBaseClass() {}
+         virtual int getValue() const = 0;
+   };
+
+   class MyDerivedClass : public MyBaseClass {
+      public:
+         MyDerivedClass(const int value) : MyBaseClass(), m_value(value) {}
+         virtual ~MyDerivedClass() {}
+         virtual int getValue() const { return m_value; }
+
+      private:
+         int m_value;
+   };
+
+   int main(int argc, char** argv) {
+      MyBaseClass* myBaseClass = new MyDerivedClass(0);
+      myBaseClass->getValue();
+      return 0;
+   }
+
+It is perfectly fine to call `myBaseClass->getValue()` in host code, since myBaseClass was created on the host. However, what if you want to call this virtual function on the device?
+
+.. code-block:: cpp
+
+   __global__ void callVirtualFunction(MyBaseClass* myBaseClass) {
+      myBaseClass->getValue();
+   }
+
+   int main(int argc, char** argv) {
+      MyBaseClass* myBaseClass = new MyDerivedClass(0);
+      callVirtualFunction<<<1, 1>>>(myBaseClass);
+      return 0;
+   }
+
+At best, calling this code will result in a crash. At worst, it will access garbage and happily continue while giving incorrect results. It is illegal to access host pointers on the device and produces undefined behavior. So what is our next attempt? Why not pass the argument by value rather than by a pointer?
+
+.. code-block:: cpp
+
+   __global__ void callVirtualFunction(MyBaseClass myBaseClass) {
+      myBaseClass.getValue();
+   }
+
+   int main(int argc, char** argv) {
+      MyBaseClass* myBaseClass = new MyDerivedClass(0);
+      callVirtualFunction<<<1, 1>>>(*myBaseClass); // This will not compile
+      return 0;
+   }
+
+At first glance, this may seem like it would work, but there is a flaw - copy constructors are not virtual. You could cast to MyDerivedClass and then pass that by value, but if there are tons of classes in this heirarchy, how do you know which one to cast it to? You could try dynamic_cast dozens of times, but that is not performant or sustainable. You could also write a virtual clone method, but that is also not sustainable. You could refactor to use the curiously recurring template pattern, but that would likely require a large development effort. Also, there is a limitation on the size of the arguments passed to a global kernel, so if you have a very large class, this is simply impossible. So we make another attempt.
+
+.. code-block:: cpp
+
+   __global__ void callVirtualFunction(MyBaseClass* myBaseClass) {
+      myBaseClass->getValue();
+   }
+
+   int main(int argc, char** argv) {
+      MyBaseClass* myBaseClass = new MyDerivedClass(0);
+      MyBaseClass* d_myBaseClass;
+      cudaMalloc(&d_myBaseClass, sizeof(MyBaseClass));
+      cudaMemcpy(d_myBaseClass, myBaseClass, sizeof(MyBaseClass), cudaMemcpyHostToDevice);
+      callVirtualFunction<<<1, 1>>>(d_myBaseClass);
+      return 0;
+   }
+
+We are getting nearer, but there is still a flaw. The bits of myBaseClass contain the virtual function table that allows virtual function lookups on the host, but that virtual function table is not valid for lookups on the device since it contains pointers to host functions. It will not work any better to cast to MyDerivedClass and copy the bits. The only option is to call the constructor on the device and keep that device pointer around.
+
+.. code-block:: cpp
+
+   __global__ void make_on_device(MyBaseClass** myBaseClass, int argument) {
+      *myBaseClass = new MyDerivedClass(argument);
+   }
+
+   __global__ void callVirtualFunction(MyBaseClass* myBaseClass) {
+      myBaseClass->getValue();
+   }
+
+   int main(int argc, char** argv) {
+      MyBaseClass** d_temp;
+      cudaMalloc(&d_temp, sizeof(MyBaseClass*));
+      make_on_device<<<1, 1>>>(d_temp, 0);
+
+      MyBaseClass** temp = (MyBaseClass**) malloc(sizeof(MyBaseClass*));
+      cudaMemcpy(temp, d_temp, sizeof(MyBaseClass*), cudaMemcpyDeviceToHost);
+      MyBaseClass d_myBaseClass = *temp;
+
+      callVirtualFunction<<<1, 1>>>(d_myBaseClass);
+
+      free(temp);
+      cudaFree(d_temp);
+
+      // Still need to call delete on the device
+      return 0;
+   }
+
+OK, this is finally correct, but super tedious. So we took care of the details for you.
+
+.. code-block:: cpp
+
+   __global__ void callVirtualFunction(chai::managed_ptr<MyBaseClass> myBaseClass) {
+      myBaseClass->getValue();
+   }
+
+   int main(int argc, char** argv) {
+      chai::managed_ptr<MyBaseClass> myBaseClass = chai::make_managed<MyDerivedClass>(0);
+      callVirtualFunction<<<1, 1>>>(myBaseClass);
+      myBaseClass.free();
+
+      return 0;
+   }
+
+OK, so we didn't do all the work for you, but we definitely gave you a leg up. What's left for you to do? You just need to make sure the functions accessed on the device have the __device__ specifier (including constructors and destructors). You also need to make sure the destructors are virtual so the object gets cleaned up properly on the device.
+
+.. code-block:: cpp
+
+   class MyBaseClass {
+      public:
+         CARE_HOST_DEVICE MyBaseClass() {}
+         CARE_HOST_DEVICE virtual ~MyBaseClass() {}
+         CARE_HOST_DEVICE virtual int getValue() const = 0;
+   };
+
+   class MyDerivedClass : public MyBaseClass {
+      public:
+         CARE_HOST_DEVICE MyDerivedClass(const int value) : MyBaseClass(), m_value(value) {}
+         CARE_HOST_DEVICE virtual ~MyDerivedClass() {}
+         CARE_HOST_DEVICE virtual int getValue() const { return m_value; }
+
+      private:
+         int m_value;
+   };

From 562fc9138a7699ec7748a8aca3a81504d6549944 Mon Sep 17 00:00:00 2001
From: Alan Dayton <dayton8@llnl.gov>
Date: Wed, 16 Sep 2020 17:09:17 -0700
Subject: [PATCH 09/15] Add more detailed information to user guide about
 managed_ptr

---
 docs/sphinx/user_guide.rst | 108 ++++++++++++++++++++++++++++++++-----
 1 file changed, 94 insertions(+), 14 deletions(-)

diff --git a/docs/sphinx/user_guide.rst b/docs/sphinx/user_guide.rst
index d53db0f3..f28e7917 100644
--- a/docs/sphinx/user_guide.rst
+++ b/docs/sphinx/user_guide.rst
@@ -62,9 +62,9 @@ CHAI provides a data structure to help handle cases where it is desirable to cal
 
    class MyDerivedClass : public MyBaseClass {
       public:
-         MyDerivedClass(const int value) : MyBaseClass(), m_value(value) {}
-         virtual ~MyDerivedClass() {}
-         virtual int getValue() const { return m_value; }
+         MyDerivedClass(int value) : MyBaseClass(), m_value(value) {}
+         ~MyDerivedClass() {}
+         int getValue() const { return m_value; }
 
       private:
          int m_value;
@@ -73,10 +73,11 @@ CHAI provides a data structure to help handle cases where it is desirable to cal
    int main(int argc, char** argv) {
       MyBaseClass* myBaseClass = new MyDerivedClass(0);
       myBaseClass->getValue();
+      delete myBaseClass;
       return 0;
    }
 
-It is perfectly fine to call `myBaseClass->getValue()` in host code, since myBaseClass was created on the host. However, what if you want to call this virtual function on the device?
+It is perfectly fine to call `myBaseClass->getValue()` in host code, since `myBaseClass` was created on the host. However, what if you want to call this virtual function on the device?
 
 .. code-block:: cpp
 
@@ -87,6 +88,7 @@ It is perfectly fine to call `myBaseClass->getValue()` in host code, since myBas
    int main(int argc, char** argv) {
       MyBaseClass* myBaseClass = new MyDerivedClass(0);
       callVirtualFunction<<<1, 1>>>(myBaseClass);
+      delete myBaseClass;
       return 0;
    }
 
@@ -101,10 +103,11 @@ At best, calling this code will result in a crash. At worst, it will access garb
    int main(int argc, char** argv) {
       MyBaseClass* myBaseClass = new MyDerivedClass(0);
       callVirtualFunction<<<1, 1>>>(*myBaseClass); // This will not compile
+      delete myBaseClass;
       return 0;
    }
 
-At first glance, this may seem like it would work, but there is a flaw - copy constructors are not virtual. You could cast to MyDerivedClass and then pass that by value, but if there are tons of classes in this heirarchy, how do you know which one to cast it to? You could try dynamic_cast dozens of times, but that is not performant or sustainable. You could also write a virtual clone method, but that is also not sustainable. You could refactor to use the curiously recurring template pattern, but that would likely require a large development effort. Also, there is a limitation on the size of the arguments passed to a global kernel, so if you have a very large class, this is simply impossible. So we make another attempt.
+At first glance, this may seem like it would work, but this is not supported by nvidia: "It is not allowed to pass as an argument to a `__global__` function an object of a class with virtual functions" (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#virtual-functions). Also: "It is not allowed to pass as an argument to a `__global__` function an object of a class derived from virtual base classes" (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#virtual-base-classes). You could refactor to use the curiously recurring template pattern, but that would likely require a large development effort and also limits the programming patterns you can use. Also, there is a limitation on the size of the arguments passed to a global kernel, so if you have a very large class this is simply impossible. So we make another attempt.
 
 .. code-block:: cpp
 
@@ -117,11 +120,16 @@ At first glance, this may seem like it would work, but there is a flaw - copy co
       MyBaseClass* d_myBaseClass;
       cudaMalloc(&d_myBaseClass, sizeof(MyBaseClass));
       cudaMemcpy(d_myBaseClass, myBaseClass, sizeof(MyBaseClass), cudaMemcpyHostToDevice);
+
       callVirtualFunction<<<1, 1>>>(d_myBaseClass);
+
+      cudaFree(d_myBaseClass);
+      delete myBaseClass;
+
       return 0;
    }
 
-We are getting nearer, but there is still a flaw. The bits of myBaseClass contain the virtual function table that allows virtual function lookups on the host, but that virtual function table is not valid for lookups on the device since it contains pointers to host functions. It will not work any better to cast to MyDerivedClass and copy the bits. The only option is to call the constructor on the device and keep that device pointer around.
+We are getting nearer, but there is still a flaw. The bits of `myBaseClass` contain the virtual function table that allows virtual function lookups on the host, but that virtual function table is not valid for lookups on the device since it contains pointers to host functions. It will not work any better to cast to `MyDerivedClass` and copy the bits. The only option is to call the constructor on the device and keep that device pointer around.
 
 .. code-block:: cpp
 
@@ -129,6 +137,10 @@ We are getting nearer, but there is still a flaw. The bits of myBaseClass contai
       *myBaseClass = new MyDerivedClass(argument);
    }
 
+   __global__ void destroy_on_device(MyBaseClass* myBaseClass) {
+      delete myBaseClass;
+   }
+
    __global__ void callVirtualFunction(MyBaseClass* myBaseClass) {
       myBaseClass->getValue();
    }
@@ -145,13 +157,13 @@ We are getting nearer, but there is still a flaw. The bits of myBaseClass contai
       callVirtualFunction<<<1, 1>>>(d_myBaseClass);
 
       free(temp);
+      destroy_on_device<<<1, 1>>>(d_myBaseClass);
       cudaFree(d_temp);
 
-      // Still need to call delete on the device
       return 0;
    }
 
-OK, this is finally correct, but super tedious. So we took care of the details for you.
+OK, this is finally correct, but super tedious. So we took care of all the boilerplate and underlying details for you. The final result is at least recognizable when compared to the original code. The added benefit is that you can use a `chai::managed_ptr` on the host AND the device.
 
 .. code-block:: cpp
 
@@ -161,13 +173,13 @@ OK, this is finally correct, but super tedious. So we took care of the details f
 
    int main(int argc, char** argv) {
       chai::managed_ptr<MyBaseClass> myBaseClass = chai::make_managed<MyDerivedClass>(0);
-      callVirtualFunction<<<1, 1>>>(myBaseClass);
+      myBaseClass->getValue(); // Accessible on the host
+      callVirtualFunction<<<1, 1>>>(myBaseClass); // Accessible on the device
       myBaseClass.free();
-
       return 0;
    }
 
-OK, so we didn't do all the work for you, but we definitely gave you a leg up. What's left for you to do? You just need to make sure the functions accessed on the device have the __device__ specifier (including constructors and destructors). You also need to make sure the destructors are virtual so the object gets cleaned up properly on the device.
+OK, so we didn't do all the work for you, but we definitely gave you a leg up. What's left for you to do? You just need to make sure the functions accessed on the device have the `__device__` specifier (including constructors and destructors). We use the `CHAI_HOST_DEVICE` macro in this example, which actually annotates the functions as `__host__ __device__` so we can call the virtual method on both the host and the device. You also need to make sure the destructors of all base classes are virtual so the object gets cleaned up properly on the device.
 
 .. code-block:: cpp
 
@@ -180,10 +192,78 @@ OK, so we didn't do all the work for you, but we definitely gave you a leg up. W
 
    class MyDerivedClass : public MyBaseClass {
       public:
-         CARE_HOST_DEVICE MyDerivedClass(const int value) : MyBaseClass(), m_value(value) {}
-         CARE_HOST_DEVICE virtual ~MyDerivedClass() {}
-         CARE_HOST_DEVICE virtual int getValue() const { return m_value; }
+         CARE_HOST_DEVICE MyDerivedClass(int value) : MyBaseClass(), m_value(value) {}
+         CARE_HOST_DEVICE ~MyDerivedClass() {}
+         CARE_HOST_DEVICE int getValue() const { return m_value; }
 
       private:
          int m_value;
    };
+
+Now you may rightfully ask, what happens when this class contains raw pointers? There is a convenient solution for this case and we demonstrate with a more interesting example.
+
+.. code-block:: cpp
+
+   class MyBaseClass {
+      public:
+         CARE_HOST_DEVICE MyBaseClass() {}
+         CARE_HOST_DEVICE virtual ~MyBaseClass() {}
+         CARE_HOST_DEVICE virtual int getScalarValue() const = 0;
+         CARE_HOST_DEVICE virtual int getArrayValue(int index) const = 0;
+   };
+
+   class MyDerivedClass : public MyBaseClass {
+      public:
+         CARE_HOST_DEVICE MyDerivedClass(int scalarValue, int* arrayValue)
+            : MyBaseClass(), m_scalarValue(scalarValue), m_arrayValue(arrayValue) {}
+         CARE_HOST_DEVICE ~MyDerivedClass() {}
+         CARE_HOST_DEVICE int getScalarValue() const { return m_scalarValue; }
+         CARE_HOST_DEVICE int getArrayValue() const { return m_arrayValue; }
+
+      private:
+         int m_scalarValue;
+         int* m_arrayValue;
+   };
+
+   __global__ void callVirtualFunction(chai::managed_ptr<MyBaseClass> myBaseClass) {
+      int i = blockIdx.x*blockDim.x + threadIdx.x;
+      myBaseClass->getScalarValue();
+      myBaseClass->getArrayValue(i);
+   }
+
+   int main(int argc, char** argv) {
+      chai::ManagedArray<int> arrayValue(10);
+      chai::managed_ptr<MyBaseClass> myBaseClass
+         = chai::make_managed<MyDerivedClass>(0, chai::unpack(arrayValue));
+      callVirtualFunction<<<1, 10>>>(myBaseClass);
+      myBaseClass.free();
+      arrayValue.free();
+      return 0;
+   }
+
+The respective host and device pointers contained in the `chai::ManagedArray` can be extracted and passed to the host and device instance of `MyDerivedClass` using `chai::unpack`. Of course, if you never dereference `m_arrayValue` on the device, you could simply pass a raw pointer to `chai::make_managed`. If the class contains a `chai::ManagedArray`, a `chai::ManagedArray` can simply be passed to the constructor. The same rules apply for passing a `chai::managed_ptr`, calling `chai::unpack` on a `chai::managed_ptr`, or passing a raw pointer and not accessing it on the device.
+
+More complicated rules apply for keeping the data in sync between the host and device instances of an object, but it is possible to do so to a limited extent. It is also possible to control the lifetimes of objects passed to `chai::make_managed`.
+
+.. code-block:: cpp
+   int main(int argc, char** argv) {
+      chai::ManagedArray<int> arrayValue(10);
+
+      chai::managed_ptr<MyBaseClass> myBaseClass
+         = chai::make_managed<MyDerivedClass>(0, chai::unpack(arrayValue));
+      myBaseClass.set_callback([=] (chai::Action action, chai::ExecutionSpace space, void*) mutable {
+         if (action == chai::ACTION_MOVE) {
+            (void) chai::ManagedArray<int> temp(arrayValue); // Copy constructor triggers movement
+         }
+         else if (action == chai::ACTION_FREE && space == chai::NONE) {
+            temp.free();
+         }
+
+         return false;
+      });
+
+      callVirtualFunction<<<1, 10>>>(myBaseClass);
+      myBaseClass.free();
+      // arrayValue.free(); // Not needed anymore
+      return 0;
+   }

From 78a5b3a8fb0e8006b1739dc9d74a3e1dc6bf8c27 Mon Sep 17 00:00:00 2001
From: Alan Dayton <dayton8@llnl.gov>
Date: Thu, 17 Sep 2020 08:49:13 -0700
Subject: [PATCH 10/15] Fix cdata return type in thin version

---
 src/chai/ManagedArray_thin.inl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/chai/ManagedArray_thin.inl b/src/chai/ManagedArray_thin.inl
index 25e0c5e2..5e3afffc 100644
--- a/src/chai/ManagedArray_thin.inl
+++ b/src/chai/ManagedArray_thin.inl
@@ -119,7 +119,7 @@ CHAI_HOST_DEVICE T* ManagedArray<T>::data() const
 }
 
 template <typename T>
-CHAI_HOST_DEVICE T* ManagedArray<T>::cdata() const
+CHAI_HOST_DEVICE const T* ManagedArray<T>::cdata() const
 {
    return m_active_pointer;
 }

From 8ef6866d75ca8459469fc647c473a9fdcb6826c2 Mon Sep 17 00:00:00 2001
From: Alan Dayton <dayton8@llnl.gov>
Date: Fri, 18 Sep 2020 09:25:55 -0700
Subject: [PATCH 11/15] Add tests for data and cdata

---
 tests/integration/managed_array_tests.cpp | 162 ++++++++++++++++++++++
 1 file changed, 162 insertions(+)

diff --git a/tests/integration/managed_array_tests.cpp b/tests/integration/managed_array_tests.cpp
index afc836d7..f2cea85c 100644
--- a/tests/integration/managed_array_tests.cpp
+++ b/tests/integration/managed_array_tests.cpp
@@ -850,6 +850,168 @@ GPU_TEST(ManagedArray, ExternalUnownedMoveToGPU)
 #endif
 #endif
 
+TEST(ManagedArray, data)
+{
+  int length = 10;
+  chai::ManagedArray<int> array(length);
+
+  forall(sequential(), 0, length, [=] (int i) {
+    array[i] = i;
+  });
+
+  int* data = array.data();
+
+  for (int i = 0; i < length; ++i) {
+    EXPECT_EQ(data[i], i);
+    data[i] = length - 1 - i;
+  }
+
+  forall(sequential(), 0, length, [=] (int i) {
+    EXPECT_EQ(array[i], length - 1 - i);
+  });
+
+  array.free();
+  assert_empty_map(true);
+}
+
+#if defined(CHAI_ENABLE_CUDA) || defined(CHAI_ENABLE_HIP)
+#ifndef CHAI_DISABLE_RM
+GPU_TEST(ManagedArray, dataGPU)
+{
+  int transfersH2D = 0;
+  int transfersD2H = 0;
+
+  int length = 10;
+  chai::ManagedArray<int> array;
+  array.allocate(length,
+                 chai::GPU,
+                 [&] (const chai::PointerRecord* record, chai::Action act, chai::ExecutionSpace s) {
+                   if (act == chai::ACTION_MOVE) {
+                     if (s == chai::CPU) {
+                       ++transfersD2H;
+                     }
+                     else if (s == chai::GPU) {
+                       ++transfersH2D;
+                     }
+                   }
+                 });
+
+  forall(gpu(), 0, length, [=] __device__ (int i) {
+    int* d_data = array.data();
+    d_data[i] = i;
+  });
+
+  int* data = array.data();
+
+  EXPECT_EQ(transfersD2H, 1);
+
+  for (int i = 0; i < length; ++i) {
+    EXPECT_EQ(data[i], i);
+    data[i] = length - 1 - i;
+  }
+
+  forall(gpu(), 0, length, [=] __device__ (int i) {
+    int* d_data = array.data();
+    array[i] += 1;
+  });
+
+  EXPECT_EQ(transfersD2H, 1);
+  EXPECT_EQ(transfersH2D, 1);
+
+  forall(sequential(), 0, length, [=] (int i) {
+    EXPECT_EQ(array[i], length - i);
+  });
+
+  EXPECT_EQ(transfersD2H, 2);
+  EXPECT_EQ(transfersH2D, 1);
+
+  array.free();
+  assert_empty_map(true);
+}
+#endif
+#endif
+
+TEST(ManagedArray, cdata)
+{
+  int length = 10;
+  chai::ManagedArray<int> array(length);
+
+  forall(sequential(), 0, length, [=] (int i) {
+    array[i] = i;
+  });
+
+  const int* data = array.cdata();
+
+  for (int i = 0; i < length; ++i) {
+    EXPECT_EQ(data[i], i);
+  }
+
+  array.free();
+  assert_empty_map(true);
+}
+
+#if defined(CHAI_ENABLE_CUDA) || defined(CHAI_ENABLE_HIP)
+#ifndef CHAI_DISABLE_RM
+GPU_TEST(ManagedArray, cdataGPU)
+{
+  int transfersH2D = 0;
+  int transfersD2H = 0;
+
+  int length = 10;
+  chai::ManagedArray<int> array;
+  array.allocate(length,
+                 chai::GPU,
+                 [&] (const chai::PointerRecord* record, chai::Action act, chai::ExecutionSpace s) {
+                   if (act == chai::ACTION_MOVE) {
+                     if (s == chai::CPU) {
+                       ++transfersD2H;
+                     }
+                     else if (s == chai::GPU) {
+                       ++transfersH2D;
+                     }
+                   }
+                 });
+
+  forall(gpu(), 0, length, [=] __device__ (int i) {
+    const int* d_data = array.cdata();
+
+    if (d_data[i] == array[i]) {
+      array[i] = i;
+    }
+  });
+
+  const int* data = array.cdata();
+
+  EXPECT_EQ(transfersD2H, 1);
+
+  for (int i = 0; i < length; ++i) {
+    EXPECT_EQ(data[i], i);
+  }
+
+  forall(gpu(), 0, length, [=] __device__ (int i) {
+    const int* d_data = array.cdata();
+
+    if (d_data[i] == array[i]) {
+       array[i] += 1;
+    }
+  });
+
+  EXPECT_EQ(transfersD2H, 1);
+  EXPECT_EQ(transfersH2D, 0);
+
+  forall(sequential(), 0, length, [=] (int i) {
+    EXPECT_EQ(array[i], i + 1);
+  });
+
+  EXPECT_EQ(transfersD2H, 2);
+  EXPECT_EQ(transfersH2D, 0);
+
+  array.free();
+  assert_empty_map(true);
+}
+#endif
+#endif
+
 TEST(ManagedArray, Reset)
 {
   chai::ManagedArray<float> array(20);

From 67888eb535ba56012bad9e0d358521f799fd43ed Mon Sep 17 00:00:00 2001
From: Alan Dayton <dayton8@llnl.gov>
Date: Fri, 18 Sep 2020 09:41:28 -0700
Subject: [PATCH 12/15] Add comments and ManagedArray<const T> to test

---
 tests/integration/managed_array_tests.cpp | 28 ++++++++++++++++++++---
 1 file changed, 25 insertions(+), 3 deletions(-)

diff --git a/tests/integration/managed_array_tests.cpp b/tests/integration/managed_array_tests.cpp
index f2cea85c..a2afc0c5 100644
--- a/tests/integration/managed_array_tests.cpp
+++ b/tests/integration/managed_array_tests.cpp
@@ -878,6 +878,7 @@ TEST(ManagedArray, data)
 #ifndef CHAI_DISABLE_RM
 GPU_TEST(ManagedArray, dataGPU)
 {
+  // Initialize
   int transfersH2D = 0;
   int transfersD2H = 0;
 
@@ -896,11 +897,13 @@ GPU_TEST(ManagedArray, dataGPU)
                    }
                  });
 
+  // Touch on device
   forall(gpu(), 0, length, [=] __device__ (int i) {
     int* d_data = array.data();
     d_data[i] = i;
   });
 
+  // Move data to host with touch
   int* data = array.data();
 
   EXPECT_EQ(transfersD2H, 1);
@@ -910,19 +913,38 @@ GPU_TEST(ManagedArray, dataGPU)
     data[i] = length - 1 - i;
   }
 
+  // Move data to device with touch
   forall(gpu(), 0, length, [=] __device__ (int i) {
     int* d_data = array.data();
     array[i] += 1;
   });
 
-  EXPECT_EQ(transfersD2H, 1);
   EXPECT_EQ(transfersH2D, 1);
 
+  // Move data to host without touch
+  chai::ManagedArray<const int> array2 = array;
+  const int* data2 = array2.data();
+
+  EXPECT_EQ(transfersD2H, 2);
+
+  for (int i = 0; i < length; ++i) {
+    EXPECT_EQ(data2[i], length - i);
+  }
+
+  // Access on device with touch (should not be moved)
+  forall(gpu(), 0, length, [=] __device__ (int i) {
+    int* d_data = array.data();
+    array[i] += i;
+  });
+
+  EXPECT_EQ(transfersH2D, 1);
+
+  // Move data to host
   forall(sequential(), 0, length, [=] (int i) {
-    EXPECT_EQ(array[i], length - i);
+    EXPECT_EQ(array[i], length);
   });
 
-  EXPECT_EQ(transfersD2H, 2);
+  EXPECT_EQ(transfersD2H, 3);
   EXPECT_EQ(transfersH2D, 1);
 
   array.free();

From 7c53018c22872ec7685e3f9175c5a4087530abb3 Mon Sep 17 00:00:00 2001
From: Alan Dayton <dayton8@llnl.gov>
Date: Fri, 18 Sep 2020 09:45:40 -0700
Subject: [PATCH 13/15] Add comments and ManagedArray<const T> to cdata test

---
 tests/integration/managed_array_tests.cpp | 31 ++++++++++++++++++++---
 1 file changed, 27 insertions(+), 4 deletions(-)

diff --git a/tests/integration/managed_array_tests.cpp b/tests/integration/managed_array_tests.cpp
index a2afc0c5..c61320f3 100644
--- a/tests/integration/managed_array_tests.cpp
+++ b/tests/integration/managed_array_tests.cpp
@@ -897,7 +897,6 @@ GPU_TEST(ManagedArray, dataGPU)
                    }
                  });
 
-  // Touch on device
   forall(gpu(), 0, length, [=] __device__ (int i) {
     int* d_data = array.data();
     d_data[i] = i;
@@ -976,6 +975,7 @@ TEST(ManagedArray, cdata)
 #ifndef CHAI_DISABLE_RM
 GPU_TEST(ManagedArray, cdataGPU)
 {
+  // Initialize
   int transfersH2D = 0;
   int transfersD2H = 0;
 
@@ -1002,6 +1002,7 @@ GPU_TEST(ManagedArray, cdataGPU)
     }
   });
 
+  // Move data to host without touch
   const int* data = array.cdata();
 
   EXPECT_EQ(transfersD2H, 1);
@@ -1010,6 +1011,28 @@ GPU_TEST(ManagedArray, cdataGPU)
     EXPECT_EQ(data[i], i);
   }
 
+  // Access on device with touch (should not be moved)
+  forall(gpu(), 0, length, [=] __device__ (int i) {
+    const int* d_data = array.cdata();
+
+    if (d_data[i] == array[i]) {
+       array[i] += 1;
+    }
+  });
+
+  EXPECT_EQ(transfersH2D, 0);
+
+  // Move data to host without touch
+  chai::ManagedArray<const int> array2 = array;
+  const int* data2 = array2.cdata();
+
+  EXPECT_EQ(transfersD2H, 2);
+
+  for (int i = 0; i < length; ++i) {
+    EXPECT_EQ(data2[i], i + 1);
+  }
+
+  // Access on device with touch (should not be moved)
   forall(gpu(), 0, length, [=] __device__ (int i) {
     const int* d_data = array.cdata();
 
@@ -1018,14 +1041,14 @@ GPU_TEST(ManagedArray, cdataGPU)
     }
   });
 
-  EXPECT_EQ(transfersD2H, 1);
   EXPECT_EQ(transfersH2D, 0);
 
+  // Move data to host with touch
   forall(sequential(), 0, length, [=] (int i) {
-    EXPECT_EQ(array[i], i + 1);
+    EXPECT_EQ(array[i], i + 2);
   });
 
-  EXPECT_EQ(transfersD2H, 2);
+  EXPECT_EQ(transfersD2H, 3);
   EXPECT_EQ(transfersH2D, 0);
 
   array.free();

From b502d82156e424379a37de5ca49013eed0743043 Mon Sep 17 00:00:00 2001
From: David Beckingsale <davidbeckingsale@gmail.com>
Date: Wed, 7 Oct 2020 10:14:06 -0700
Subject: [PATCH 14/15] Update Umpire to 4.1.2 and fix bug in deepCopyRecord

---
 src/chai/ArrayManager.cpp | 5 ++++-
 src/tpl/umpire            | 2 +-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/chai/ArrayManager.cpp b/src/chai/ArrayManager.cpp
index 5f364840..0d95afae 100644
--- a/src/chai/ArrayManager.cpp
+++ b/src/chai/ArrayManager.cpp
@@ -445,8 +445,11 @@ PointerRecord* ArrayManager::deepCopyRecord(PointerRecord const* record)
   copy->m_user_callback = [] (const PointerRecord*, Action, ExecutionSpace) {};
 
   const ExecutionSpace last_space = record->m_last_space;
-
   copy->m_last_space = last_space;
+  for (int space = CPU; space < NUM_EXECUTION_SPACES; ++space) {
+    copy->m_allocators[space] = record->m_allocators[space];
+  }
+
   allocate(copy, last_space);
 
   for (int space = CPU; space < NUM_EXECUTION_SPACES; ++space) {
diff --git a/src/tpl/umpire b/src/tpl/umpire
index bdd59851..447f4640 160000
--- a/src/tpl/umpire
+++ b/src/tpl/umpire
@@ -1 +1 @@
-Subproject commit bdd598512516bdc4238502f180c8a7e145c6e68f
+Subproject commit 447f4640eff7b8f39d3c59404f3b03629b90c021

From c2b23e92220130b040fc48bb1cb064f5f2d1832c Mon Sep 17 00:00:00 2001
From: David Beckingsale <davidbeckingsale@gmail.com>
Date: Mon, 12 Oct 2020 12:19:58 -0700
Subject: [PATCH 15/15] Update release number

---
 CMakeLists.txt                  | 2 +-
 docs/sphinx/conf.py             | 2 +-
 docs/sphinx/conf.py.in          | 2 +-
 scripts/make_release_tarball.sh | 4 ++--
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 221ee451..7fc21c58 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7,7 +7,7 @@
 cmake_policy(SET CMP0057 NEW)
 cmake_policy(SET CMP0048 NEW)
 
-project(Chai LANGUAGES CXX VERSION 2.2.1)
+project(Chai LANGUAGES CXX VERSION 2.2.2)
 
 set(ENABLE_CUDA Off CACHE BOOL "Enable CUDA")
 set(ENABLE_HIP Off CACHE BOOL "Enable HIP")
diff --git a/docs/sphinx/conf.py b/docs/sphinx/conf.py
index 0b672421..2955a46a 100644
--- a/docs/sphinx/conf.py
+++ b/docs/sphinx/conf.py
@@ -63,7 +63,7 @@
 # The short X.Y version.
 version = u'2.2'
 # The full version, including alpha/beta/rc tags.
-release = u'2.2.1'
+release = u'2.2.2'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
diff --git a/docs/sphinx/conf.py.in b/docs/sphinx/conf.py.in
index c314b21a..3302b3cd 100644
--- a/docs/sphinx/conf.py.in
+++ b/docs/sphinx/conf.py.in
@@ -62,7 +62,7 @@ author = u''
 # The short X.Y version.
 version = u'2.2'
 # The full version, including alpha/beta/rc tags.
-release = u'2.2.1'
+release = u'2.2.2'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
diff --git a/scripts/make_release_tarball.sh b/scripts/make_release_tarball.sh
index 32699465..1d0e01bc 100755
--- a/scripts/make_release_tarball.sh
+++ b/scripts/make_release_tarball.sh
@@ -7,13 +7,13 @@
 ##############################################################################
 
 TAR_CMD=gtar
-VERSION=2.2.1
+VERSION=2.2.2
 
 git archive --prefix=chai-${VERSION}/ -o chai-${VERSION}.tar HEAD 2> /dev/null
 
 echo "Running git archive submodules..."
 
-p=`pwd` && (echo .; git submodule foreach) | while read entering path; do
+p=`pwd` && (echo .; git submodule foreach --recursive) | while read entering path; do
     temp="${path%\'}";
     temp="${temp#\'}";
     path=$temp;