add linear

apache · Jun 12, 2018 · e6f3f56 · e6f3f56
1 parent 542f382
commit e6f3f56
Show file tree

Hide file tree

Showing 25 changed files with 117 additions and 100 deletions.
diff --git a/src/storage/pooled_storage_manager.h b/src/storage/pooled_storage_manager.h
@@ -28,6 +28,7 @@
 #if MXNET_USE_CUDA
   #include <cuda_runtime.h>
 #endif  // MXNET_USE_CUDA
+
 #include <mxnet/base.h>
 #include <mxnet/storage.h>
 #include <unordered_map>
@@ -44,7 +45,8 @@ namespace storage {
 
 #if MXNET_USE_CUDA
 /*!
- * \brief Storage manager with a memory pool on gpu.
+ * \brief Storage manager with a memory pool on gpu. Memory chunks are reused based on exact size
+ * match.
  */
 class GPUPooledStorageManager final : public StorageManager {
  public:
@@ -53,10 +55,10 @@ class GPUPooledStorageManager final : public StorageManager {
    */
   GPUPooledStorageManager() {
     reserve_ = dmlc::GetEnv("MXNET_GPU_MEM_POOL_RESERVE", 5);
-    min_chunk_ = dmlc::GetEnv("MXNET_GPU_MEM_POOL_MIN_CHUNK", 4096);
-    if (min_chunk_ < NDEV) {
-      LOG(FATAL) << "MXNET_GPU_MEM_POOL_MIN_CHUNK cannot be set to a value smaller than " << NDEV \
-                 << ". Got " << min_chunk_ << ".";
+    page_size_ = dmlc::GetEnv("MXNET_GPU_MEM_POOL_PAGE_SIZE", 4096);
+    if (page_size_ < NDEV) {
+      LOG(FATAL) << "MXNET_GPU_MEM_POOL_PAGE_SIZE cannot be set to a value smaller than " << NDEV \
+                 << ". Got " << page_size_ << ".";
     }
   }
   /*!
@@ -77,7 +79,7 @@ class GPUPooledStorageManager final : public StorageManager {
  private:
   void DirectFreeNoLock(Storage::Handle handle) {
     cudaError_t err = cudaFree(handle.dptr);
-    size_t size = std::max(handle.size, min_chunk_);
+    size_t size = std::max(handle.size, page_size_);
     // ignore unloading error, as memory has already been recycled
     if (err != cudaSuccess && err != cudaErrorCudartUnloading) {
       LOG(FATAL) << "CUDA: " << cudaGetErrorString(err);
@@ -88,7 +90,9 @@ class GPUPooledStorageManager final : public StorageManager {
  private:
   void ReleaseAll();
   // used memory
-  size_t used_memory_ = 0, min_chunk_;
+  size_t used_memory_ = 0;
+  // page size
+  size_t page_size_;
   // percentage of reserved memory
   int reserve_;
   // number of devices
@@ -100,7 +104,7 @@ class GPUPooledStorageManager final : public StorageManager {
 
 void GPUPooledStorageManager::Alloc(Storage::Handle* handle) {
   std::lock_guard<std::mutex> lock(Storage::Get()->GetMutex(Context::kGPU));
-  size_t size = std::max(handle->size, min_chunk_);
+  size_t size = std::max(handle->size, page_size_);
   auto&& reuse_it = memory_pool_.find(size);
   if (reuse_it == memory_pool_.end() || reuse_it->second.size() == 0) {
     size_t free, total;
@@ -125,15 +129,15 @@ void GPUPooledStorageManager::Alloc(Storage::Handle* handle) {
 
 void GPUPooledStorageManager::Free(Storage::Handle handle) {
   std::lock_guard<std::mutex> lock(Storage::Get()->GetMutex(Context::kGPU));
-  size_t size = std::max(handle.size, min_chunk_);
+  size_t size = std::max(handle.size, page_size_);
   auto&& reuse_pool = memory_pool_[size];
   reuse_pool.push_back(handle.dptr);
 }
 
 void GPUPooledStorageManager::ReleaseAll() {
-  Storage::Handle handle;
   for (auto&& i : memory_pool_) {
     for (auto&& j : i.second) {
+      Storage::Handle handle;
       handle.dptr = j;
       handle.size = i.first;
       DirectFreeNoLock(handle);
@@ -144,6 +148,17 @@ void GPUPooledStorageManager::ReleaseAll() {
 
 /*!
  * \brief Storage manager with a memory pool, with rounded size, on gpu.
+ *
+ * This GPU mem pool uses a mixture of nearest pow2 (exponential) rounding and
+ * nearest multiple (linear) rounding to help alleviate the memory allocation stress
+ * in which the default naive exact-size-match pool falls short, such as in variable-length
+ * input/output cases like RNN workloads.
+ *
+ * \param cutoff the cutoff at which rounding is switched from exponential to linear. It's set
+ * through MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF environment variable. Must be between 20 (1 MB)
+ * and 34 (16 GB).
+ * Suppose the cutoff is X, the memory size buckets look like this:
+ * exp2(0), exp2(1), ..., exp2(X), 2*exp2(X), 3*exp2(X), ...
  */
 class GPUPooledRoundedStorageManager final : public StorageManager {
  public:
@@ -152,11 +167,27 @@ class GPUPooledRoundedStorageManager final : public StorageManager {
    */
   GPUPooledRoundedStorageManager() {
     reserve_ = dmlc::GetEnv("MXNET_GPU_MEM_POOL_RESERVE", 5);
-    min_chunk_ = dmlc::GetEnv("MXNET_GPU_MEM_POOL_LOG2_MIN_CHUNK", 5);
-    if (min_chunk_ < 5) {
-      LOG(FATAL) << "MXNET_GPU_MEM_POOL_LOG2_MIN_CHUNK cannot be set to a value smaller than 5. " \
-                 << "Got " << min_chunk_ << ".";
+    page_size_ = dmlc::GetEnv("MXNET_GPU_MEM_POOL_PAGE_SIZE", 4096);
+    cut_off_ = dmlc::GetEnv("MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF", 24);
+    if (page_size_ < 32) {
+      LOG(FATAL) << "MXNET_GPU_MEM_POOL_PAGE_SIZE cannot be set to a value smaller than 32. " \
+                 << "Got: " << page_size_ << ".";
+    }
+    if (page_size_ != 1ul << log2_round_up(page_size_)) {
+      LOG(FATAL) << "MXNET_GPU_MEM_POOL_PAGE_SIZE must be a power of 2. Got: " << page_size_ << ".";
+    }
+    page_size_ = log2_round_up(page_size_);
+    if (cut_off_ < 20 || cut_off_ > LOG2_MAX_MEM) {
+      LOG(FATAL) << "MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF cannot be set to a value " \
+                 << "smaller than 20 or greater than " << LOG2_MAX_MEM << ". Got: " \
+                 << cut_off_ << ".";
     }
+    if (cut_off_ < page_size_) {
+      LOG(FATAL) << "MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF cannot be set to a value " \
+                 << "smaller than log2 of MXNET_GPU_MEM_POOL_PAGE_SIZE. Got: " \
+                 << cut_off_ << " vs " << page_size_ << ".";
+    }
+    memory_pool_ = std::vector<std::vector<void*>>((1ul << (LOG2_MAX_MEM - cut_off_)) + cut_off_);
   }
   /*!
    * \brief Default destructor.
@@ -169,94 +200,71 @@ class GPUPooledRoundedStorageManager final : public StorageManager {
   void Free(Storage::Handle handle) override;
 
   void DirectFree(Storage::Handle handle) override {
-    handle.size = 1ul << log2_round_up(handle.size);
     std::lock_guard<std::mutex> lock(Storage::Get()->GetMutex(Context::kGPU));
     DirectFreeNoLock(handle);
   }
 
  private:
-#if __SIZEOF_SIZE_T__ == __SIZEOF_LONG__
-
-#if defined(__clang__) || defined(__GNUC__)
-#define clz(x) __builtin_clzl(x)
-#define ctz(x) __builtin_ctzl(x)
-
-#elif defined(__WINDOWS__)
-#define clz(x) __lzcnt64(x)
-  uint64_t __inline ctz(uint64_t value) {
-    QWORD trailing_zero = 0;
-    _BitScanForward64(&trailing_zero, value)
-    return trailing_zero;
+  inline int log2_round_up(size_t s) {
+    return static_cast<int>(std::ceil(std::log2(s)));
   }
-  uint64_t __inline clz(uint64_t value) {
-    QWORD leading_zero = 0;
-    _BitScanReverse64(&leading_zero, value)
-    return 63 - leading_zero;
+  inline int div_pow2_round_up(size_t s, int divisor_log2) {
+    // (1025, 10) -> 2
+    // (2048, 10) -> 2
+    // (2049, 10) -> 3
+    size_t result = s >> divisor_log2;
+    return static_cast<int>(result + (s > (result << divisor_log2) ? 1 : 0));
   }
-
-#endif  // defined(__clang__) || defined(__GNUC__)
-
-#elif __SIZEOF_SIZE_T__ == __SIZEOF_INT__
-
-#if defined(__clang__) || defined(__GNUC__) || defined(__WINDOWS__)
-#define clz(x) __builtin_clz(x)
-#define ctz(x) __builtin_ctz(x)
-
-#elif defined(__WINDOWS__)
-  uint32_t __inline clz(uint32_t value) {
-    DWORD leading_zero = 0;
-    _BitScanReverse(&leading_zero, value)
-    return 31 - leading_zero;
+  inline int get_bucket(size_t s) {
+    int log_size = log2_round_up(s);
+    if (log_size > static_cast<int>(cut_off_))
+      return div_pow2_round_up(s, cut_off_) - 1 + cut_off_;
+    else
+      return std::max(log_size, static_cast<int>(page_size_));
   }
-  uint32_t __inline ctz(uint32_t value) {
-    DWORD trailing_zero = 0;
-    _BitScanForward(&trailing_zero, value)
-    return trailing_zero;
+  inline size_t get_size(int bucket) {
+    if (bucket <= static_cast<int>(cut_off_))
+      return 1ul << bucket;
+    else
+      return (bucket - cut_off_ + 1) * (1ul << cut_off_);
   }
 
-#endif  // defined(__clang__) || defined(__GNUC__)
-#endif  // __SIZEOF_SIZE_T__
-
-#if defined(__clang__) || defined(__GNUC__) || defined(__WINDOWS__)
-  inline int log2_round_up(size_t s) {
-    int fls = clz(s);  // find last set
-    // must be bigger than min_chunk_ (which is at least 32 for nccl scatter)
-    return std::max(static_cast<int>(min_chunk_), (addr_width-fls) + ((ctz(s) < fls - 1)?1:0));
-  }
-#else
-  inline int log2_round_up(size_t s) {
-    return std::max(static_cast<int>(min_chunk_),
-                    static_cast<int>(std::ceil(std::log2(s))));
-  }
-#endif  // defined(__clang__) || defined(__GNUC__) || defined(__WINDOWS__)
   void DirectFreeNoLock(Storage::Handle handle) {
     cudaError_t err = cudaFree(handle.dptr);
+    size_t size = get_size(get_bucket(handle.size));
     // ignore unloading error, as memory has already been recycled
     if (err != cudaSuccess && err != cudaErrorCudartUnloading) {
       LOG(FATAL) << "CUDA: " << cudaGetErrorString(err);
     }
-    used_memory_ -= handle.size;
+    used_memory_ -= size;
   }
 
  private:
   void ReleaseAll();
   // number of devices
   const int NDEV = 32;
+  // log2 of maximum page size. 16GB
+  const size_t LOG2_MAX_MEM = 34;
+  // address width in bits
   static const int addr_width = sizeof(size_t) * 8;
   // used memory
-  size_t used_memory_ = 0, min_chunk_;
+  size_t used_memory_ = 0;
+  // page size
+  size_t page_size_;
+  // log2 of memory size before switching to exponential mode to linear mode
+  size_t cut_off_;
   // percentage of reserved memory
   int reserve_;
   // memory pool
-  std::array<std::vector<void*>, addr_width> memory_pool_;
+  std::vector<std::vector<void*>> memory_pool_;
   DISALLOW_COPY_AND_ASSIGN(GPUPooledRoundedStorageManager);
 };  // class GPUPooledRoundedStorageManager
 
 void GPUPooledRoundedStorageManager::Alloc(Storage::Handle* handle) {
-  int log2_size = log2_round_up(handle->size);
-  size_t size = 1ul << log2_size;
-  auto&& reuse_pool = memory_pool_[log2_size];
   std::lock_guard<std::mutex> lock(Storage::Get()->GetMutex(Context::kGPU));
+  int bucket = get_bucket(handle->size);
+  size_t size = get_size(bucket);
+  auto&& reuse_pool = memory_pool_[bucket];
   if (reuse_pool.size() == 0) {
     size_t free, total;
     cudaMemGetInfo(&free, &total);
@@ -278,17 +286,18 @@ void GPUPooledRoundedStorageManager::Alloc(Storage::Handle* handle) {
 }
 
 void GPUPooledRoundedStorageManager::Free(Storage::Handle handle) {
-  int log2_size = log2_round_up(handle.size);
-  auto&& reuse_pool = memory_pool_[log2_size];
   std::lock_guard<std::mutex> lock(Storage::Get()->GetMutex(Context::kGPU));
+  int bucket = get_bucket(handle.size);
+  auto&& reuse_pool = memory_pool_[bucket];
   reuse_pool.push_back(handle.dptr);
 }
 
 void GPUPooledRoundedStorageManager::ReleaseAll() {
-  Storage::Handle handle;
   for (size_t i = 0; i < memory_pool_.size(); i++) {
-    handle.size = 1ul << i;
+    int size = get_size(i);
     for (auto& j : memory_pool_[i]) {
+      Storage::Handle handle;
+      handle.size = size;
       handle.dptr = j;
       DirectFreeNoLock(handle);
     }

diff --git a/tests/python/gpu/test_forward.py b/tests/python/gpu/test_forward.py
@@ -22,7 +22,7 @@
 from mxnet.test_utils import *
 curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
 sys.path.insert(0, os.path.join(curr_path, '../unittest'))
-from common import setup_module, with_seed
+from common import setup_module, with_seed, teardown
 from mxnet.gluon import utils
 
 def _get_model():

diff --git a/tests/python/gpu/test_gluon_model_zoo_gpu.py b/tests/python/gpu/test_gluon_model_zoo_gpu.py
@@ -27,7 +27,7 @@
 import unittest
 curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
 sys.path.insert(0, os.path.join(curr_path, '../unittest'))
-from common import setup_module, with_seed
+from common import setup_module, with_seed, teardown
 
 def eprint(*args, **kwargs):
     print(*args, file=sys.stderr, **kwargs)

diff --git a/tests/python/gpu/test_kvstore_gpu.py b/tests/python/gpu/test_kvstore_gpu.py
@@ -24,7 +24,7 @@
 from mxnet.test_utils import assert_almost_equal, default_context
 curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
 sys.path.insert(0, os.path.join(curr_path, '../unittest'))
-from common import setup_module, with_seed
+from common import setup_module, with_seed, teardown
 
 shape = (4, 4)
 keys = [5, 7, 11]
@@ -83,7 +83,7 @@ def check_rsp_pull(kv, count, ctxs, is_same_rowid=False, use_slice=False):
         check_rsp_pull(kv, 4, [mx.gpu(i//2) for i in range(4)], is_same_rowid=True)
         check_rsp_pull(kv, 4, [mx.cpu(i) for i in range(4)])
         check_rsp_pull(kv, 4, [mx.cpu(i) for i in range(4)], is_same_rowid=True)
-        check_rsp_pull(kv, 4, [mx.gpu(i//2) for i in range(4)], use_slice=True) 
+        check_rsp_pull(kv, 4, [mx.gpu(i//2) for i in range(4)], use_slice=True)
         check_rsp_pull(kv, 4, [mx.cpu(i) for i in range(4)], use_slice=True)
 
     # test fails intermittently. temporarily disabled till it gets fixed. tracked at https://github.com/apache/incubator-mxnet/issues/9384

diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py
@@ -32,7 +32,7 @@
 
 curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
 sys.path.insert(0, os.path.join(curr_path, '../unittest'))
-from common import setup_module, with_seed
+from common import setup_module, with_seed, teardown
 from test_operator import *
 from test_optimizer import *
 from test_random import *

diff --git a/tests/python/unittest/common.py b/tests/python/unittest/common.py
@@ -241,3 +241,11 @@ def __enter__(self):
 
         def __exit__(self, exc_type, exc_value, traceback):
             shutil.rmtree(self._dirname)
+
+def teardown():
+    """
+    A function with a 'magic name' executed automatically after each nosetests test module.
+
+    It waits for all operations in one file to finish before carrying on the next.
+    """
+    mx.nd.waitall()
diff --git a/tests/python/unittest/test_autograd.py b/tests/python/unittest/test_autograd.py
@@ -20,7 +20,7 @@
 from mxnet.ndarray import zeros_like
 from mxnet.autograd import *
 from mxnet.test_utils import *
-from common import setup_module, with_seed
+from common import setup_module, with_seed, teardown
 
 
 def grad_and_loss(func, argnum=None):

diff --git a/tests/python/unittest/test_contrib_autograd.py b/tests/python/unittest/test_contrib_autograd.py
@@ -18,7 +18,7 @@
 import mxnet.ndarray as nd
 from mxnet.contrib.autograd import *
 from mxnet.test_utils import *
-from common import setup_module, with_seed
+from common import setup_module, with_seed, teardown
 
 def autograd_assert(*args, **kwargs):
     func   = kwargs["func"]

diff --git a/tests/python/unittest/test_exc_handling.py b/tests/python/unittest/test_exc_handling.py
@@ -18,7 +18,7 @@
 import mxnet as mx
 import numpy as np
 from mxnet import gluon
-from common import setup_module, with_seed
+from common import setup_module, with_seed, teardown
 from mxnet.gluon import nn
 from mxnet.base import MXNetError
 from mxnet.test_utils import assert_exception, default_context, set_default_context

diff --git a/tests/python/unittest/test_executor.py b/tests/python/unittest/test_executor.py
@@ -17,7 +17,7 @@
 
 import numpy as np
 import mxnet as mx
-from common import setup_module, with_seed
+from common import setup_module, with_seed, teardown
 
 
 def reldiff(a, b):

diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py
@@ -20,7 +20,7 @@
 from mxnet.gluon import nn
 from mxnet.test_utils import assert_almost_equal
 from mxnet.ndarray.ndarray import _STORAGE_TYPE_STR_TO_ID
-from common import setup_module, with_seed, assertRaises
+from common import setup_module, with_seed, assertRaises, teardown
 import numpy as np
 from numpy.testing import assert_array_equal
 from nose.tools import raises, assert_raises

diff --git a/tests/python/unittest/test_gluon_contrib.py b/tests/python/unittest/test_gluon_contrib.py
@@ -21,7 +21,7 @@
 from mxnet.gluon import nn
 from mxnet.gluon.contrib.nn import Concurrent, HybridConcurrent, Identity, SparseEmbedding
 from mxnet.test_utils import almost_equal
-from common import setup_module, with_seed
+from common import setup_module, with_seed, teardown
 import numpy as np
 from numpy.testing import assert_allclose