Skip to content

Commit

Permalink
Add retry logic to ibv_modify_qp (NVIDIA#7)
Browse files Browse the repository at this point in the history
The control-plane call ibv_modify_qp has no retry logic, so it can fail during a link flap even if the link comes back very quickly. This change adds retry logic so that ibv_modify_qp has a better chance of succeeding across link flaps.
  • Loading branch information
Yinglin Sun authored and GitHub Enterprise committed Oct 22, 2024
1 parent 62188fd commit 4d893f8
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 3 deletions.
15 changes: 15 additions & 0 deletions src/include/checks.h
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,21 @@
} \
} while (0);

/*
 * NCCLCHECKRETRY(call, retry, timeout)
 *
 * Evaluates `call` and, while it yields anything other than ncclSuccess or
 * ncclInProgress, re-evaluates it up to `retry` more times. Before the k-th
 * retry (k = 1, 2, ...) the calling thread sleeps for `timeout * k` seconds
 * (linear back-off). If the last attempt still fails, the enclosing function
 * returns the failing ncclResult_t.
 *
 * Notes:
 *  - `call` is deliberately evaluated once per attempt; it must be safe to
 *    repeat.
 *  - `timeout` is evaluated once per retry and `retry` once per loop check.
 *  - Internal locals carry a `ncclRetry..._` prefix/suffix so they cannot
 *    capture identifiers (e.g. `res`, `attempts`) used inside `call`.
 *  - A non-positive delay skips the sleep instead of being converted to a
 *    huge unsigned value by sleep().
 *  - The trailing ';' after `while (0)` matches the neighbouring check
 *    macros in this header.
 */
#define NCCLCHECKRETRY(call, retry, timeout) do { \
  ncclResult_t ncclRetryRes_ = (call); \
  int ncclRetryAttempt_ = 0; \
  while (ncclRetryRes_ != ncclSuccess && ncclRetryRes_ != ncclInProgress && ncclRetryAttempt_ < (retry)) { \
    const long ncclRetryDelay_ = (long)(timeout) * (ncclRetryAttempt_ + 1); \
    if (ncclDebugNoWarn == 0) { \
      INFO(NCCL_ALL, "%s:%d -> %d sleep %ld sec before retry", __FILE__, __LINE__, ncclRetryRes_, ncclRetryDelay_); \
    } \
    if (ncclRetryDelay_ > 0) { \
      sleep((unsigned int)ncclRetryDelay_); \
    } \
    ncclRetryRes_ = (call); \
    ncclRetryAttempt_++; \
  } \
  if (ncclRetryRes_ != ncclSuccess && ncclRetryRes_ != ncclInProgress) { \
    return ncclRetryRes_; \
  } \
} while (0);


#define NCCLCHECKGOTO(call, RES, label) do { \
RES = call; \
if (RES != ncclSuccess && RES != ncclInProgress) { \
Expand Down
11 changes: 8 additions & 3 deletions src/transport/net_ib.cc
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,8 @@ NCCL_PARAM(IbTc, "IB_TC", 0);
NCCL_PARAM(IbArThreshold, "IB_AR_THRESHOLD", 8192);
NCCL_PARAM(IbPciRelaxedOrdering, "IB_PCI_RELAXED_ORDERING", 2);
NCCL_PARAM(IbAdaptiveRouting, "IB_ADAPTIVE_ROUTING", -2);
// Retry policy passed to NCCLCHECKRETRY around wrap_ibv_modify_qp() in this file:
// ncclParamIbMqpRetryCnt() is used as the maximum number of retries and
// ncclParamIbMqpTimeout() as the base delay in seconds, so the sleep before
// retry k is IbMqpTimeout * k.
// NOTE(review): the last NCCL_PARAM argument (6 / 5) is presumably the default
// value — confirm against the NCCL_PARAM definition.
NCCL_PARAM(IbMqpRetryCnt, "IB_MQP_RETRY_CNT", 6);
NCCL_PARAM(IbMqpTimeout, "IB_MQP_TIMEOUT", 5);

pthread_t ncclIbAsyncThread;

Expand Down Expand Up @@ -929,7 +931,8 @@ ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbNetCommDevBase* base,
qpAttr.pkey_index = ncclParamIbPkey();
qpAttr.port_num = ib_port;
qpAttr.qp_access_flags = access_flags;
NCCLCHECK(wrap_ibv_modify_qp(qp->qp, &qpAttr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS));
NCCLCHECKRETRY(wrap_ibv_modify_qp(qp->qp, &qpAttr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS),
ncclParamIbMqpRetryCnt(), ncclParamIbMqpTimeout());
return ncclSuccess;
}

Expand Down Expand Up @@ -957,7 +960,8 @@ ncclResult_t ncclIbRtrQp(struct ibv_qp* qp, uint8_t sGidIndex, uint32_t dest_qp_
qpAttr.ah_attr.sl = ncclParamIbSl();
qpAttr.ah_attr.src_path_bits = 0;
qpAttr.ah_attr.port_num = info->ib_port;
NCCLCHECK(wrap_ibv_modify_qp(qp, &qpAttr, IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER));
NCCLCHECKRETRY(wrap_ibv_modify_qp(qp, &qpAttr, IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER),
ncclParamIbMqpRetryCnt(), ncclParamIbMqpTimeout());
return ncclSuccess;
}

Expand All @@ -970,7 +974,8 @@ ncclResult_t ncclIbRtsQp(struct ibv_qp* qp) {
qpAttr.rnr_retry = 7;
qpAttr.sq_psn = 0;
qpAttr.max_rd_atomic = 1;
NCCLCHECK(wrap_ibv_modify_qp(qp, &qpAttr, IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | IBV_QP_MAX_QP_RD_ATOMIC));
NCCLCHECKRETRY(wrap_ibv_modify_qp(qp, &qpAttr, IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | IBV_QP_MAX_QP_RD_ATOMIC),
ncclParamIbMqpRetryCnt(), ncclParamIbMqpTimeout());
return ncclSuccess;
}

Expand Down

0 comments on commit 4d893f8

Please sign in to comment.