Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding .deb installer #2

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ PREFIX ?= /usr/local
VERBOSE ?= 0

CUDACODE := -gencode=arch=compute_35,code=sm_35 \
-gencode=arch=compute_50,code=sm_50 \
-gencode=arch=compute_52,code=sm_52

BUILDDIR := build
Expand All @@ -55,7 +56,7 @@ MPIFLAGS := -I$(MPI_HOME)/include -L$(MPI_HOME)/lib -lmpi
INCEXPORTS := nccl.h
LIBSRCFILES := libwrap.cu core.cu all_gather.cu all_reduce.cu broadcast.cu reduce.cu reduce_scatter.cu
LIBNAME := libnccl.so
APIVER := 0
APIVER := 1
TESTS := all_gather_test all_reduce_test broadcast_test reduce_test reduce_scatter_test
MPITESTS := mpi_test

Expand Down Expand Up @@ -106,8 +107,8 @@ test : lib $(TESTBINS)
$(TSTDIR)/% : src/%.cu lib
@printf "Building %-25s > %-24s\n" $< $@
@mkdir -p $(TSTDIR)
@$(NVCC) -Ibuild/include $(CPPFLAGS) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" -o $@ $< -Lbuild/lib $(LIBLINK) $(LDFLAGS) -lcuda -lcurand -lnvToolsExt -lnvidia-ml
@$(NVCC) -M -Ibuild/include $(CPPFLAGS) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< -Lbuild/lib $(LIBLINK) $(LDFLAGS) -lcuda -lcurand -lnvToolsExt -lnvidia-ml > $(@:%=%.d.tmp)
@$(NVCC) -Ibuild/include $(CPPFLAGS) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" -o $@ $< -Lbuild/lib $(LIBLINK) $(LDFLAGS) -lcuda -lcurand -lnvToolsExt
@$(NVCC) -M -Ibuild/include $(CPPFLAGS) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< -Lbuild/lib $(LIBLINK) $(LDFLAGS) -lcuda -lcurand -lnvToolsExt > $(@:%=%.d.tmp)
@sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%=%.d.tmp) > $(@:%=%.d)
@sed -e 's/.*://' -e 's/\\$$//' < $(@:%=%.d.tmp) | fmt -1 | \
sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%=%.d)
Expand All @@ -128,6 +129,5 @@ $(MPITSTDIR)/% : src/%.cu lib
install : lib
@mkdir -p $(PREFIX)/lib
@mkdir -p $(PREFIX)/include
@cp -P -v build/lib/* $(PREFIX)/lib/
@cp -v build/include/* $(PREFIX)/include/

cp -P -v build/lib/* $(PREFIX)/lib/
cp -v build/include/* $(PREFIX)/include/
11 changes: 11 additions & 0 deletions debian/changelog
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
nccl (1.0.2) trusty; urgency=medium

* Merged latest upstream changes.

-- Boris Fomitchev <[email protected]> Tue, 15 Dec 2015 14:52:01 -0800

nccl (1.0.1) trusty; urgency=medium

* Initial release.

-- Boris Fomitchev <[email protected]> Mon, 14 Dec 2015 09:52:01 -0800
1 change: 1 addition & 0 deletions debian/compat
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
9
26 changes: 26 additions & 0 deletions debian/control
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
Source: nccl
Section: universe/science
Maintainer: Boris Fomitchev <[email protected]>
Priority: optional
Build-depends: debhelper(>=9), nvidia-352-dev, cuda-toolkit-7-0,
cuda-curand-dev-7-0, cuda-cublas-dev-7-0, cuda-cudart-dev-7-0, cuda-ld-conf-7-0
Standards-Version: 3.9.5

Package: nccl
Section: universe/tools
Architecture: amd64
Depends: ${misc:Depends}, ${shlibs:Depends}, nvidia-352,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it necessary to depend on a specific driver version? What if they installed with a RUN file?

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can confirm that I use the RUN file and know plenty of others do. A range of drivers should ideally be supported.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One option would be to move nvidia-352 to the Suggests field - i.e. not installed by default.

cuda-curand-7-0, cuda-cublas-7-0, cuda-cudart-7-0, cuda-ld-conf-7-0
Description: Optimized primitives for collective multi-GPU communication
NCCL (pronounced "Nickel") is a stand-alone library of standard collective communication routines,
such as all-gather, reduce, broadcast, etc., that have been optimized to achieve high bandwidth over PCIe.
NCCL supports up to eight GPUs and can be used in either single- or multi-process (e.g., MPI) applications.

Package: nccl-dev
Section: universe/tools
Architecture: amd64
Depends: ${misc:Depends}, ${shlibs:Depends}, nccl (= ${binary:Version})
Description: Dev package for NCCL library
NCCL (pronounced "Nickel") is a stand-alone library of standard collective communication routines,
such as all-gather, reduce, broadcast, etc., that have been optimized to achieve high bandwidth over PCIe.
NCCL supports up to eight GPUs and can be used in either single- or multi-process (e.g., MPI) applications.
26 changes: 26 additions & 0 deletions debian/copyright
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@

Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of NVIDIA CORPORATION nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
15 changes: 15 additions & 0 deletions debian/gbp.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
[DEFAULT]
debian-branch = master
upstream-branch = master
verbose = True

ignore-new = True

[git-buildpackage]
verbose = True
no-purge = True

[git-import-orig]
upstream-tag = v%(version)s
upstream-tree = BRANCH
dch = False
4 changes: 4 additions & 0 deletions debian/nccl-dev.install
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
usr/lib
usr/lib/libnccl.so
usr/include
usr/include/nccl.h
2 changes: 2 additions & 0 deletions debian/nccl.install
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
usr/lib
usr/lib/libnccl.so.1
1 change: 1 addition & 0 deletions debian/nccl.lintian-overrides
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
file-in-usr-local
5 changes: 5 additions & 0 deletions debian/nccl.postinst
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/bin/sh

set -e

ldconfig
14 changes: 14 additions & 0 deletions debian/rules
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/usr/bin/make -f
# Debian packaging rules for nccl. Every target is delegated to debhelper's
# `dh` sequencer; only the test step is overridden (see below).




# Enabling a single GPU for tests only
# test.sh fails for non-identical multi-GPU
# fix pending : https://github.com/torch/cutorch/issues/239
# NOTE(review): the test step is currently skipped by override_dh_auto_test,
# so this export has no visible effect on tests in this file -- confirm
# whether it is still needed.
export CUDA_VISIBLE_DEVICES=0
# Catch-all target: run the requested dh sequence using the plain `make`
# build system with parallel builds enabled. PREFIX is set in dh's
# environment to $(CURDIR)/usr -- presumably so the upstream Makefile's
# install step writes below the package build directory; TODO confirm.
%:
PREFIX="$(CURDIR)/usr" dh $@ --build-system=make --parallel

# Replaces dh_auto_test: no tests are executed, only a message is printed.
override_dh_auto_test:
echo "Skipping test: fix me later ..."
1 change: 1 addition & 0 deletions debian/shlibs.local
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
libcudart 7.0 cuda-cudart-7-0
1 change: 1 addition & 0 deletions debian/source/format
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
3.0 (native)
Binary file added export/nccl-dev_1.0.2_amd64.deb
Binary file not shown.
Binary file added export/nccl_1.0.2_amd64.deb
Binary file not shown.
6 changes: 6 additions & 0 deletions src/all_gather.cu
Original file line number Diff line number Diff line change
Expand Up @@ -477,6 +477,12 @@ public:
case ncclDouble:
return ncclAllGatherWithType<double>(sendbuff, recvbuff, count, comm,
numUnroll, stream);
case ncclInt64:
return ncclAllGatherWithType<long long>(sendbuff, recvbuff, count, comm,
numUnroll, stream);
case ncclUint64:
return ncclAllGatherWithType<unsigned long long>(sendbuff, recvbuff, count, comm,
numUnroll, stream);
}
return ncclInvalidType;
}
Expand Down
2 changes: 2 additions & 0 deletions src/all_gather_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,8 @@ int main(int argc, char* argv[]) {
#endif
RunTests<float>(N / sizeof(float), ncclFloat, comms, dList);
RunTests<double>(N / sizeof(double), ncclDouble, comms, dList);
RunTests<long long>(N / sizeof(long long), ncclInt64, comms, dList);
RunTests<unsigned long long>(N / sizeof(unsigned long long), ncclUint64, comms, dList);

printf("\n");

Expand Down
6 changes: 6 additions & 0 deletions src/all_reduce.cu
Original file line number Diff line number Diff line change
Expand Up @@ -489,6 +489,12 @@ public:
case ncclDouble:
return ncclAllReduceWithType<double>(sendbuff, recvbuff, count, op,
comm, stream);
case ncclInt64:
return ncclAllReduceWithType<long long>(sendbuff, recvbuff, count, op,
comm, stream);
case ncclUint64:
return ncclAllReduceWithType<unsigned long long int>(sendbuff, recvbuff, count, op,
comm, stream);
}

return ncclInvalidType;
Expand Down
2 changes: 2 additions & 0 deletions src/all_reduce_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,8 @@ int main(int argc, char* argv[]) {
#endif
RunTests<float>(N / sizeof(float), ncclFloat, comms, dList);
RunTests<double>(N / sizeof(double), ncclDouble, comms, dList);
RunTests<long long>(N / sizeof(long long), ncclInt64, comms, dList);
RunTests<unsigned long long>(N / sizeof(unsigned long long), ncclUint64, comms, dList);

printf("\n");

Expand Down
4 changes: 4 additions & 0 deletions src/broadcast.cu
Original file line number Diff line number Diff line change
Expand Up @@ -396,6 +396,10 @@ public:
return ncclBcastWithType<float>(buff, count, root, comm, numUnroll, stream);
case ncclDouble:
return ncclBcastWithType<double>(buff, count, root, comm, numUnroll, stream);
case ncclInt64:
return ncclBcastWithType<long long>(buff, count, root, comm, numUnroll, stream);
case ncclUint64:
return ncclBcastWithType<unsigned long long>(buff, count, root, comm, numUnroll, stream);
}
return ncclInvalidType;
}
Expand Down
2 changes: 2 additions & 0 deletions src/broadcast_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,8 @@ int main(int argc, char* argv[]) {
#endif
RunTests<float>(N / sizeof(float), ncclFloat, comms, dList);
RunTests<double>(N / sizeof(double), ncclDouble, comms, dList);
RunTests<long long>(N / sizeof(long long), ncclInt64, comms, dList);
RunTests<unsigned long long>(N / sizeof(unsigned long long), ncclUint64, comms, dList);

printf("\n");

Expand Down
20 changes: 20 additions & 0 deletions src/common_kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,26 @@ struct MULTI<FUNC, double> {
}
};

// MULTI specialization for unsigned 64-bit elements.
//
// The static_assert pins sizeof(PackType) to sizeof(unsigned long long), so a
// pack holds exactly one element: the two packs are handed straight to FUNC
// and its result is returned as the new pack.
template<class FUNC>
struct MULTI<FUNC, unsigned long long> {
  static_assert(sizeof(PackType) == sizeof(unsigned long long),
      "PackType must be the same size as unsigned long long.");

  __device__ PackType operator()(const PackType x, const PackType y) const {
    // Materialise the result as unsigned long long first, then let it
    // convert to PackType on return.
    const unsigned long long combined = FUNC()(x, y);
    return combined;
  }
};

// MULTI specialization for signed 64-bit elements.
//
// The static_assert pins sizeof(PackType) to sizeof(long long), so a pack
// holds exactly one element. Both packs are converted to long long before
// FUNC is applied, so the operation is evaluated on signed values; the signed
// result is then converted back to PackType on return.
template<class FUNC>
struct MULTI<FUNC, long long> {
  static_assert(sizeof(PackType) == sizeof(long long),
      "PackType must be the same size as long long.");

  __device__ PackType operator()(const PackType x, const PackType y) const {
    const long long combined =
        FUNC()(static_cast<long long>(x), static_cast<long long>(y));
    return combined;
  }
};

template<typename T, bool FETCHTWO>
__device__ inline void FetchOneOrTwo64b(PackType& s0,
const volatile T * __restrict__ const src0, PackType& s1,
Expand Down
3 changes: 0 additions & 3 deletions src/core.cu
Original file line number Diff line number Diff line change
Expand Up @@ -459,9 +459,6 @@ static ncclResult_t commBuildMaps(ncclComm_t comm, ncclUniqueId* commId, int ran
commClearMaps(comm);
return ncclUnhandledCudaError;
}
if (shmUnlink(rankname) != ncclSuccess) {
INFO("rank %d failed to unlink sysmem beffer of rank %d, %s", rank, iRank, rankname);
}
if (cudaHostGetDevicePointer(comm->remote+i, comm->cleanup[i].handle, 0) != cudaSuccess) {
WARN("rank %d failed to obtain dev ptr for rank %d", rank, iRank);
commClearMaps(comm);
Expand Down
4 changes: 3 additions & 1 deletion src/nccl.h
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,9 @@ typedef enum { ncclChar = 0,
#endif
ncclFloat = 3,
ncclDouble = 4,
nccl_NUM_TYPES = 5 } ncclDataType_t;
ncclInt64 = 5,
ncclUint64 = 6,
nccl_NUM_TYPES = 7 } ncclDataType_t;

/* Reduces data arrays of length count in sendbuff into recvbuf using op operation.
* recvbuf may be NULL on all calls except for root device.
Expand Down
4 changes: 4 additions & 0 deletions src/reduce.cu
Original file line number Diff line number Diff line change
Expand Up @@ -393,6 +393,10 @@ public:
return ncclReduceWithType<float>(sendbuff, recvbuff, count, op, root, comm, stream);
case ncclDouble:
return ncclReduceWithType<double>(sendbuff, recvbuff, count, op, root, comm, stream);
case ncclInt64:
return ncclReduceWithType<long long>(sendbuff, recvbuff, count, op, root, comm, stream);
case ncclUint64:
return ncclReduceWithType<unsigned long long>(sendbuff, recvbuff, count, op, root, comm, stream);
}
return ncclInvalidType;
}
Expand Down
6 changes: 6 additions & 0 deletions src/reduce_scatter.cu
Original file line number Diff line number Diff line change
Expand Up @@ -474,6 +474,12 @@ public:
case ncclDouble:
return ncclReduceScatterWithType<double>(sendbuff, recvbuff, recvcount,
op, comm, stream);
case ncclInt64:
return ncclReduceScatterWithType<long long>(sendbuff, recvbuff, recvcount,
op, comm, stream);
case ncclUint64:
return ncclReduceScatterWithType<unsigned long long>(sendbuff, recvbuff, recvcount,
op, comm, stream);
}
return ncclInvalidType;
}
Expand Down
2 changes: 2 additions & 0 deletions src/reduce_scatter_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,8 @@ int main(int argc, char* argv[]) {
#endif
RunTests<float>(N / sizeof(float), ncclFloat, comms, dList);
RunTests<double>(N / sizeof(double), ncclDouble, comms, dList);
RunTests<long long>(N / sizeof(long long), ncclInt64, comms, dList);
RunTests<unsigned long long>(N / sizeof(unsigned long long), ncclUint64, comms, dList);

printf("\n");

Expand Down
2 changes: 2 additions & 0 deletions src/reduce_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,8 @@ int main(int argc, char* argv[]) {
#endif
RunTests<float>(N / sizeof(float), ncclFloat, comms, dList);
RunTests<double>(N / sizeof(double), ncclDouble, comms, dList);
RunTests<long long>(N / sizeof(long long), ncclInt64, comms, dList);
RunTests<unsigned long long>(N / sizeof(unsigned long long), ncclUint64, comms, dList);

printf("\n");

Expand Down
26 changes: 26 additions & 0 deletions src/test_utilities.h
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,12 @@ void GenerateRandom<double>(curandGenerator_t generator, double * const dest,
CURAND_CHK(curandGenerateUniformDouble(generator, dest, N));
}

// GenerateRandom specialization for unsigned 64-bit output.
//
// Asks `generator` to write N values into `dest` through
// curandGenerateLongLong; the returned status is checked by CURAND_CHK.
//
// NOTE(review): per the cuRAND docs, curandGenerateLongLong is meant for
// 64-bit quasirandom generators -- confirm every caller passes one.
// NOTE(review): `dest` is presumably device memory -- confirm against callers.
template<>
void GenerateRandom<unsigned long long>(curandGenerator_t generator, unsigned long long * const dest,
const int N) {
CURAND_CHK(curandGenerateLongLong(generator, dest, N));
}


template<typename T>
void Randomize(T* const dest, const int N, const int randomSeed) {
Expand All @@ -100,6 +106,24 @@ void Randomize(T* const dest, const int N, const int randomSeed) {
CUDACHECK(cudaDeviceSynchronize());
}

// Randomize specialization for unsigned 64-bit buffers.
//
// Creates a CURAND_RNG_QUASI_SOBOL64 generator, fills `dest` with N values
// via GenerateRandom<unsigned long long>, destroys the generator, and then
// waits on cudaDeviceSynchronize() before returning.
//
// NOTE(review): `randomSeed` is never read in this specialization, so the
// seed argument does not influence the generated data here -- confirm that
// callers do not rely on distinct seeds producing distinct buffers.
template<>
void Randomize(unsigned long long* const dest, const int N, const int randomSeed) {
curandGenerator_t gen;
CURAND_CHK(curandCreateGenerator(&gen, CURAND_RNG_QUASI_SOBOL64));
GenerateRandom<unsigned long long>(gen, dest, N);
CURAND_CHK(curandDestroyGenerator(gen));
// Block until the device is idle before handing the buffer back.
CUDACHECK(cudaDeviceSynchronize());
}

// Randomize specialization for signed 64-bit buffers.
//
// Same sequence as the unsigned variant: create a CURAND_RNG_QUASI_SOBOL64
// generator, fill N values, destroy the generator, synchronize the device.
// The buffer is passed to GenerateRandom<unsigned long long> through a
// pointer cast, so it is filled with unsigned 64-bit values that are later
// read back through the long long pointer.
//
// NOTE(review): `randomSeed` is never read in this specialization, so the
// seed argument does not influence the generated data here -- confirm that
// callers do not rely on distinct seeds producing distinct buffers.
template<>
void Randomize(long long* const dest, const int N, const int randomSeed) {
curandGenerator_t gen;
CURAND_CHK(curandCreateGenerator(&gen, CURAND_RNG_QUASI_SOBOL64));
// Reuse the unsigned generator path; only the pointer type is changed.
GenerateRandom<unsigned long long>(gen, (unsigned long long *)dest, N);
CURAND_CHK(curandDestroyGenerator(gen));
// Block until the device is idle before handing the buffer back.
CUDACHECK(cudaDeviceSynchronize());
}

#ifdef CUDA_HAS_HALF
__global__ void halve(const float * src, half* dest, int N) {
for(int tid = threadIdx.x + blockIdx.x*blockDim.x;
Expand Down Expand Up @@ -268,6 +292,8 @@ std::string TypeName(const ncclDataType_t type) {
#endif
case ncclFloat: return "float";
case ncclDouble: return "double";
case ncclInt64: return "int64";
case ncclUint64: return "uint64";
default: return "unknown";
}
}
Expand Down