NVIDIA · borisfom · Dec 12, 2015 · Dec 14, 2015 · Dec 15, 2015 · Dec 15, 2015
diff --git a/Makefile b/Makefile
@@ -31,6 +31,7 @@ PREFIX ?= /usr/local
 VERBOSE ?= 0
 
 CUDACODE := -gencode=arch=compute_35,code=sm_35 \
+            -gencode=arch=compute_50,code=sm_50 \
             -gencode=arch=compute_52,code=sm_52
 
 BUILDDIR := build
@@ -55,7 +56,7 @@ MPIFLAGS   := -I$(MPI_HOME)/include -L$(MPI_HOME)/lib -lmpi
 INCEXPORTS  := nccl.h
 LIBSRCFILES := libwrap.cu core.cu all_gather.cu all_reduce.cu broadcast.cu reduce.cu reduce_scatter.cu
 LIBNAME     := libnccl.so
-APIVER      := 0
+APIVER      := 1
 TESTS       := all_gather_test all_reduce_test broadcast_test reduce_test reduce_scatter_test
 MPITESTS    := mpi_test
 
@@ -106,8 +107,8 @@ test : lib $(TESTBINS)
 $(TSTDIR)/% : src/%.cu lib
 	@printf "Building  %-25s > %-24s\n" $< $@
 	@mkdir -p $(TSTDIR)
-	@$(NVCC) -Ibuild/include $(CPPFLAGS) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" -o $@ $< -Lbuild/lib $(LIBLINK) $(LDFLAGS) -lcuda -lcurand -lnvToolsExt -lnvidia-ml
-	@$(NVCC) -M -Ibuild/include $(CPPFLAGS) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< -Lbuild/lib $(LIBLINK) $(LDFLAGS) -lcuda -lcurand -lnvToolsExt -lnvidia-ml > $(@:%=%.d.tmp)
+	@$(NVCC) -Ibuild/include $(CPPFLAGS) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" -o $@ $< -Lbuild/lib $(LIBLINK) $(LDFLAGS) -lcuda -lcurand -lnvToolsExt
+	@$(NVCC) -M -Ibuild/include $(CPPFLAGS) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< -Lbuild/lib $(LIBLINK) $(LDFLAGS) -lcuda -lcurand -lnvToolsExt > $(@:%=%.d.tmp)
 	@sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%=%.d.tmp) > $(@:%=%.d)
 	@sed -e 's/.*://' -e 's/\\$$//' < $(@:%=%.d.tmp) | fmt -1 | \
                 sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%=%.d)
@@ -128,6 +129,5 @@ $(MPITSTDIR)/% : src/%.cu lib
 install : lib
 	@mkdir -p $(PREFIX)/lib
 	@mkdir -p $(PREFIX)/include
-	@cp -P -v build/lib/* $(PREFIX)/lib/
-	@cp -v build/include/* $(PREFIX)/include/
-
+	cp -P -v build/lib/* $(PREFIX)/lib/
+	cp -v build/include/* $(PREFIX)/include/
diff --git a/debian/changelog b/debian/changelog
@@ -0,0 +1,11 @@
+nccl (1.0.2) trusty; urgency=medium
+
+  * Merged latest upstream changes.
+
+ -- Boris Fomitchev <[email protected]>  Tue, 15 Dec 2015 14:52:01 -0800
+
+nccl (1.0.1) trusty; urgency=medium
+
+  * Initial release.
+
+ -- Boris Fomitchev <[email protected]>  Mon, 14 Dec 2015 09:52:01 -0800
diff --git a/debian/compat b/debian/compat
@@ -0,0 +1 @@
+9
diff --git a/debian/control b/debian/control
@@ -0,0 +1,26 @@
+Source: nccl
+Section: universe/science
+Maintainer: Boris Fomitchev <[email protected]>
+Priority: optional
+Build-depends: debhelper(>=9), nvidia-352-dev, cuda-toolkit-7-0,
+               cuda-curand-dev-7-0, cuda-cublas-dev-7-0, cuda-cudart-dev-7-0, cuda-ld-conf-7-0
+Standards-Version: 3.9.5
+
+Package: nccl
+Section: universe/tools
+Architecture: amd64
+Depends: ${misc:Depends}, ${shlibs:Depends},  nvidia-352,
+               cuda-curand-7-0, cuda-cublas-7-0, cuda-cudart-7-0, cuda-ld-conf-7-0
+Description: Optimized primitives for collective multi-GPU communication
+ NCCL (pronounced "Nickel") is a stand-alone library of standard collective communication routines,
+ such as all-gather, reduce, broadcast, etc., that have been optimized to achieve high bandwidth over PCIe.
+ NCCL supports up to eight GPUs and can be used in either single- or multi-process (e.g., MPI) applications.
+
+Package: nccl-dev
+Section: universe/tools
+Architecture: amd64
+Depends: ${misc:Depends}, ${shlibs:Depends},  nccl (= ${binary:Version})
+Description: Development files for the NCCL library
+ NCCL (pronounced "Nickel") is a stand-alone library of standard collective communication routines,
+ such as all-gather, reduce, broadcast, etc., that have been optimized to achieve high bandwidth over PCIe.
+ NCCL supports up to eight GPUs and can be used in either single- or multi-process (e.g., MPI) applications.
diff --git a/debian/copyright b/debian/copyright
@@ -0,0 +1,26 @@
+
+ Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+  * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+  * Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in the
+    documentation and/or other materials provided with the distribution.
+  * Neither the name of NVIDIA CORPORATION nor the names of its
+    contributors may be used to endorse or promote products derived
+    from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/debian/gbp.conf b/debian/gbp.conf
@@ -0,0 +1,15 @@
+[DEFAULT]
+debian-branch   = master
+upstream-branch = master
+verbose = True
+
+ignore-new = True
+
+[git-buildpackage]
+verbose = True
+no-purge = True
+
+[git-import-orig]
+upstream-tag = v%(version)s
+upstream-tree = BRANCH
+dch = False
diff --git a/debian/nccl-dev.install b/debian/nccl-dev.install
@@ -0,0 +1,4 @@
+usr/lib
+usr/lib/libnccl.so
+usr/include
+usr/include/nccl.h
diff --git a/debian/nccl.install b/debian/nccl.install
@@ -0,0 +1,2 @@
+usr/lib
+usr/lib/libnccl.so.1
diff --git a/debian/nccl.lintian-overrides b/debian/nccl.lintian-overrides
@@ -0,0 +1 @@
+file-in-usr-local
diff --git a/debian/nccl.postinst b/debian/nccl.postinst
@@ -0,0 +1,5 @@
+#!/bin/sh
+
+set -e
+
+ldconfig
diff --git a/debian/rules b/debian/rules
@@ -0,0 +1,14 @@
+#!/usr/bin/make -f
+
+
+
+
+# Enabling a single GPU for tests only
+# test.sh fails for non-identical multi-GPU
+# fix pending : https://github.com/torch/cutorch/issues/239
+export CUDA_VISIBLE_DEVICES=0
+%:
+	PREFIX="$(CURDIR)/usr" dh  $@ --build-system=make --parallel
+
+override_dh_auto_test:
+	echo "Skipping test: fix me later ..."
diff --git a/debian/shlibs.local b/debian/shlibs.local
@@ -0,0 +1 @@
+libcudart 7.0 cuda-cudart-7-0
diff --git a/debian/source/format b/debian/source/format
@@ -0,0 +1 @@
+3.0 (native)
diff --git a/export/nccl-dev_1.0.2_amd64.deb b/export/nccl-dev_1.0.2_amd64.deb
diff --git a/export/nccl_1.0.2_amd64.deb b/export/nccl_1.0.2_amd64.deb
diff --git a/src/all_gather.cu b/src/all_gather.cu
@@ -477,6 +477,12 @@ public:
     case ncclDouble:
       return ncclAllGatherWithType<double>(sendbuff, recvbuff, count, comm,
           numUnroll, stream);
+    case ncclInt64:
+      return ncclAllGatherWithType<long long>(sendbuff, recvbuff, count, comm,
+          numUnroll, stream);
+    case ncclUint64:
+      return ncclAllGatherWithType<unsigned long long>(sendbuff, recvbuff, count, comm,
+          numUnroll, stream);
     }
     return ncclInvalidType;
   }

diff --git a/src/all_gather_test.cu b/src/all_gather_test.cu
@@ -224,6 +224,8 @@ int main(int argc, char* argv[]) {
 #endif
   RunTests<float>(N / sizeof(float), ncclFloat, comms, dList);
   RunTests<double>(N / sizeof(double), ncclDouble, comms, dList);
+  RunTests<long long>(N / sizeof(long long), ncclInt64, comms, dList);
+  RunTests<unsigned long long>(N / sizeof(unsigned long long), ncclUint64, comms, dList);
 
   printf("\n");
 

diff --git a/src/all_reduce.cu b/src/all_reduce.cu
@@ -489,6 +489,12 @@ public:
     case ncclDouble:
       return ncclAllReduceWithType<double>(sendbuff, recvbuff, count, op,
           comm, stream);
+    case ncclInt64:
+      return ncclAllReduceWithType<long long>(sendbuff, recvbuff, count, op,
+          comm, stream);
+    case ncclUint64:
+      return ncclAllReduceWithType<unsigned long long int>(sendbuff, recvbuff, count, op,
+          comm, stream);
     }
 
     return ncclInvalidType;

diff --git a/src/all_reduce_test.cu b/src/all_reduce_test.cu
@@ -287,6 +287,8 @@ int main(int argc, char* argv[]) {
 #endif
   RunTests<float>(N / sizeof(float), ncclFloat, comms, dList);
   RunTests<double>(N / sizeof(double), ncclDouble, comms, dList);
+  RunTests<long long>(N / sizeof(long long), ncclInt64, comms, dList);
+  RunTests<unsigned long long>(N / sizeof(unsigned long long), ncclUint64, comms, dList);
 
   printf("\n");
 

diff --git a/src/broadcast.cu b/src/broadcast.cu
@@ -396,6 +396,10 @@ public:
       return ncclBcastWithType<float>(buff, count, root, comm, numUnroll, stream);
     case ncclDouble:
       return ncclBcastWithType<double>(buff, count, root, comm, numUnroll, stream);
+    case ncclInt64:
+      return ncclBcastWithType<long long>(buff, count, root, comm, numUnroll, stream);
+    case ncclUint64:
+      return ncclBcastWithType<unsigned long long>(buff, count, root, comm, numUnroll, stream);
     }
     return ncclInvalidType;
   }

diff --git a/src/broadcast_test.cu b/src/broadcast_test.cu
@@ -224,6 +224,8 @@ int main(int argc, char* argv[]) {
 #endif
   RunTests<float>(N / sizeof(float), ncclFloat, comms, dList);
   RunTests<double>(N / sizeof(double), ncclDouble, comms, dList);
+  RunTests<long long>(N / sizeof(long long), ncclInt64, comms, dList);
+  RunTests<unsigned long long>(N / sizeof(unsigned long long), ncclUint64, comms, dList);
 
   printf("\n");
 

diff --git a/src/common_kernel.h b/src/common_kernel.h
@@ -174,6 +174,26 @@ struct MULTI<FUNC, double> {
   }
 };
 
+template<class FUNC>
+struct MULTI<FUNC, unsigned long long> {
+  static_assert(sizeof(PackType) == sizeof(unsigned long long),
+      "PackType must be the same size as unsigned long long.");
+  __device__ PackType operator()(const PackType x, const PackType y) const {
+    unsigned long long rv = FUNC()(x, y);
+    return rv;
+  }
+};
+
+template<class FUNC>
+struct MULTI<FUNC, long long> {
+  static_assert(sizeof(PackType) == sizeof(long long),
+      "PackType must be the same size as long long.");
+  __device__ PackType operator()(const PackType x, const PackType y) const {
+    long long rv = FUNC()((long long)x, (long long)y);
+    return rv;
+  }
+};
+
 template<typename T, bool FETCHTWO>
 __device__ inline void FetchOneOrTwo64b(PackType& s0,
     const volatile T * __restrict__ const src0, PackType& s1,

diff --git a/src/core.cu b/src/core.cu
@@ -459,9 +459,6 @@ static ncclResult_t commBuildMaps(ncclComm_t comm, ncclUniqueId* commId, int ran
         commClearMaps(comm);
         return ncclUnhandledCudaError;
       }
-      if (shmUnlink(rankname) != ncclSuccess) {
-        INFO("rank %d failed to unlink sysmem beffer of rank %d, %s", rank, iRank, rankname);
-      }
       if (cudaHostGetDevicePointer(comm->remote+i, comm->cleanup[i].handle, 0) != cudaSuccess) {
         WARN("rank %d failed to obtain dev ptr for rank %d", rank, iRank);
         commClearMaps(comm);

diff --git a/src/nccl.h b/src/nccl.h
@@ -117,7 +117,9 @@ typedef enum { ncclChar       = 0,
 #endif
                ncclFloat      = 3,
                ncclDouble     = 4,
-               nccl_NUM_TYPES = 5 } ncclDataType_t;
+               ncclInt64      = 5,
+               ncclUint64     = 6,
+               nccl_NUM_TYPES = 7 } ncclDataType_t;
 
 /* Reduces data arrays of length count in sendbuff into recvbuf using op operation.
  * recvbuf may be NULL on all calls except for root device.

diff --git a/src/reduce.cu b/src/reduce.cu
@@ -393,6 +393,10 @@ public:
       return ncclReduceWithType<float>(sendbuff, recvbuff, count, op, root, comm, stream);
     case ncclDouble:
       return ncclReduceWithType<double>(sendbuff, recvbuff, count, op, root, comm, stream);
+    case ncclInt64:
+      return ncclReduceWithType<long long>(sendbuff, recvbuff, count, op, root, comm, stream);
+    case ncclUint64:
+      return ncclReduceWithType<unsigned long long>(sendbuff, recvbuff, count, op, root, comm, stream);
     }
     return ncclInvalidType;
   }

diff --git a/src/reduce_scatter.cu b/src/reduce_scatter.cu
@@ -474,6 +474,12 @@ public:
     case ncclDouble:
       return ncclReduceScatterWithType<double>(sendbuff, recvbuff, recvcount,
           op, comm, stream);
+    case ncclInt64:
+      return ncclReduceScatterWithType<long long>(sendbuff, recvbuff, recvcount,
+          op, comm, stream);
+    case ncclUint64:
+      return ncclReduceScatterWithType<unsigned long long>(sendbuff, recvbuff, recvcount,
+          op, comm, stream);
     }
     return ncclInvalidType;
   }

diff --git a/src/reduce_scatter_test.cu b/src/reduce_scatter_test.cu
@@ -271,6 +271,8 @@ int main(int argc, char* argv[]) {
 #endif
   RunTests<float>(N / sizeof(float), ncclFloat, comms, dList);
   RunTests<double>(N / sizeof(double), ncclDouble, comms, dList);
+  RunTests<long long>(N / sizeof(long long), ncclInt64, comms, dList);
+  RunTests<unsigned long long>(N / sizeof(unsigned long long), ncclUint64, comms, dList);
 
   printf("\n");
 

diff --git a/src/reduce_test.cu b/src/reduce_test.cu
@@ -285,6 +285,8 @@ int main(int argc, char* argv[]) {
 #endif
   RunTests<float>(N / sizeof(float), ncclFloat, comms, dList);
   RunTests<double>(N / sizeof(double), ncclDouble, comms, dList);
+  RunTests<long long>(N / sizeof(long long), ncclInt64, comms, dList);
+  RunTests<unsigned long long>(N / sizeof(unsigned long long), ncclUint64, comms, dList);
 
   printf("\n");
 

diff --git a/src/test_utilities.h b/src/test_utilities.h
@@ -89,6 +89,12 @@ void GenerateRandom<double>(curandGenerator_t generator, double * const dest,
   CURAND_CHK(curandGenerateUniformDouble(generator, dest, N));
 }
 
+template<>
+void GenerateRandom<unsigned long long>(curandGenerator_t generator, unsigned long long * const dest,
+    const int N) {
+  CURAND_CHK(curandGenerateLongLong(generator, dest, N));
+}
+
 
 template<typename T>
 void Randomize(T* const dest, const int N, const int randomSeed) {
@@ -100,6 +106,24 @@ void Randomize(T* const dest, const int N, const int randomSeed) {
   CUDACHECK(cudaDeviceSynchronize());
 }
 
+template<>
+void Randomize(unsigned long long* const dest, const int N, const int randomSeed) {
+  curandGenerator_t gen;
+  CURAND_CHK(curandCreateGenerator(&gen, CURAND_RNG_QUASI_SOBOL64));
+  GenerateRandom<unsigned long long>(gen, dest, N);
+  CURAND_CHK(curandDestroyGenerator(gen));
+  CUDACHECK(cudaDeviceSynchronize());
+}
+
+template<>
+void Randomize(long long* const dest, const int N, const int randomSeed) {
+  curandGenerator_t gen;
+  CURAND_CHK(curandCreateGenerator(&gen, CURAND_RNG_QUASI_SOBOL64));
+  GenerateRandom<unsigned long long>(gen, (unsigned long long *)dest, N);
+  CURAND_CHK(curandDestroyGenerator(gen));
+  CUDACHECK(cudaDeviceSynchronize());
+}
+
 #ifdef CUDA_HAS_HALF
 __global__ void halve(const float * src, half* dest, int N) {
   for(int tid = threadIdx.x + blockIdx.x*blockDim.x;
@@ -268,6 +292,8 @@ std::string TypeName(const ncclDataType_t type) {
 #endif
     case ncclFloat:  return "float";
     case ncclDouble: return "double";
+    case ncclInt64:  return "int64";
+    case ncclUint64: return "uint64";
     default:         return "unknown";
   }
 }