diff --git a/docker/build_matrix.yaml b/docker/build_matrix.yaml index 8fb23a2b80..856ae598e1 100644 --- a/docker/build_matrix.yaml +++ b/docker/build_matrix.yaml @@ -16,7 +16,7 @@ - ghcr.io/databricks-mosaic/pytorch:latest TARGET: pytorch_stage TORCHVISION_VERSION: 0.19.0 -- AWS_OFI_NCCL_VERSION: v1.9.1-aws +- AWS_OFI_NCCL_VERSION: v1.11.0-aws BASE_IMAGE: nvidia/cuda:12.4.1-cudnn-devel-ubuntu20.04 CUDA_VERSION: 12.4.1 IMAGE_NAME: torch-2-4-0-cu124-aws @@ -78,7 +78,7 @@ - ghcr.io/databricks-mosaic/pytorch:2.3.1_cu121-python3.11-ubuntu20.04 TARGET: pytorch_stage TORCHVISION_VERSION: 0.18.1 -- AWS_OFI_NCCL_VERSION: v1.9.1-aws +- AWS_OFI_NCCL_VERSION: v1.11.0-aws BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04 CUDA_VERSION: 12.1.1 IMAGE_NAME: torch-2-3-1-cu121-aws @@ -149,7 +149,7 @@ - ghcr.io/databricks-mosaic/pytorch:2.2.2_cu121-python3.11-ubuntu20.04 TARGET: pytorch_stage TORCHVISION_VERSION: 0.17.2 -- AWS_OFI_NCCL_VERSION: v1.9.1-aws +- AWS_OFI_NCCL_VERSION: v1.11.0-aws BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04 CUDA_VERSION: 12.1.1 IMAGE_NAME: torch-2-2-2-cu121-aws diff --git a/docker/generate_build_matrix.py b/docker/generate_build_matrix.py index 773a20f6db..d2261a4ea3 100644 --- a/docker/generate_build_matrix.py +++ b/docker/generate_build_matrix.py @@ -237,7 +237,7 @@ def _main(): if interconnect != 'EFA': entry['AWS_OFI_NCCL_VERSION'] = '' else: - entry['AWS_OFI_NCCL_VERSION'] = 'v1.9.1-aws' + entry['AWS_OFI_NCCL_VERSION'] = 'v1.11.0-aws' pytorch_entries.append(entry)