Skip to content

Commit

Permalink
move ntrace patch to third-party (exclude ext_tag) (NVIDIA#4)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: facebookresearch#4

Introduce ntrace into NCCL to trace important IB events for each collective communication. Enabling it in a Makefile-based build requires the separate ntrace_rt module. For now, ENABLE_NTRACE is always off when building from the Makefile.

Co-author: Xianghuai Zhang <[email protected]>

Reviewed By: kingchc

Differential Revision: D38637718

fbshipit-source-id: 94f8dd163df18cd6fe4e4abc811570cb159b876f
  • Loading branch information
minsii authored and facebook-github-bot committed Sep 22, 2022
1 parent 6e07cc8 commit 38e738a
Show file tree
Hide file tree
Showing 4 changed files with 52 additions and 0 deletions.
7 changes: 7 additions & 0 deletions src/include/ibvwrap.h
Original file line number Diff line number Diff line change
Expand Up @@ -1049,6 +1049,10 @@ typedef enum ibv_return_enum
IBV_SUCCESS = 0, //!< The operation was successful
} ibv_return_t;

// Include the ntrace profiler explicitly after the verbs type definitions
// so that those types are visible to ntrace_rt.h.
#include "ntrace_profiler.h"

ncclResult_t wrap_ibv_symbols(void);
ncclResult_t wrap_ibv_fork_init(void);
ncclResult_t wrap_ibv_get_device_list(struct ibv_device ***ret, int *num_devices);
Expand Down Expand Up @@ -1077,6 +1081,7 @@ ncclResult_t wrap_ibv_create_cq(struct ibv_cq **ret, struct ibv_context *context
ncclResult_t wrap_ibv_destroy_cq(struct ibv_cq *cq);
static inline ncclResult_t wrap_ibv_poll_cq(struct ibv_cq *cq, int num_entries, struct ibv_wc *wc, int* num_done) {
int done = cq->context->ops.poll_cq(cq, num_entries, wc); /*returns the number of wcs or 0 on success, a negative number otherwise*/
NTRACE_PROFILING_RECORD(ibv_poll_cq, cq, num_entries, wc, done);
if (done < 0) {
WARN("Call to ibv_poll_cq() returned %d", done);
return ncclSystemError;
Expand All @@ -1092,6 +1097,7 @@ static inline int ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struc
}

static inline ncclResult_t wrap_ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr) {
NTRACE_PROFILING_RECORD(ibv_post_send, qp, wr, bad_wr, NULL);
int ret = qp->context->ops.post_send(qp, wr, bad_wr); /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
if (ret != IBV_SUCCESS) {
WARN("ibv_post_send() failed with error %s, Bad WR %p, First WR %p", strerror(ret), wr, *bad_wr);
Expand All @@ -1101,6 +1107,7 @@ static inline ncclResult_t wrap_ibv_post_send(struct ibv_qp *qp, struct ibv_send
}

static inline ncclResult_t wrap_ibv_post_recv(struct ibv_qp *qp, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr) {
NTRACE_PROFILING_RECORD(ibv_post_recv, qp, wr, bad_wr, NULL);
int ret = qp->context->ops.post_recv(qp, wr, bad_wr); /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
if (ret != IBV_SUCCESS) {
WARN("ibv_post_recv() failed with error %s", strerror(ret));
Expand Down
26 changes: 26 additions & 0 deletions src/include/ntrace_profiler.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
/* (c) Facebook, Inc. and its affiliates. Confidential and proprietary. */

#ifndef NCCL_NTRACE_PROFILER_H_
#define NCCL_NTRACE_PROFILER_H_

#ifdef ENABLE_NTRACE
#include "ntrace_rt.h"

/*
 * NTRACE_PROFILING_RECORD(event, args...)
 *
 * Tracing enabled: forwards the arguments to the event-specific logger
 * ntrace_log_<event>(args...), which is expected to come from ntrace_rt.h.
 * The do/while(0) wrapper makes the expansion behave as a single statement.
 */
#define NTRACE_PROFILING_RECORD(profile_state, ...) \
  do { \
    ntrace_log_##profile_state(__VA_ARGS__); \
  } while (0)

/* Tracing enabled: delegates to ntrace_dump(). */
static inline void ntraceProfilingDump(void) {
  ntrace_dump();
}

#else
/*
 * Tracing disabled: the macro expands to an empty statement. The arguments
 * are discarded by the preprocessor, so they are never evaluated.
 */
#define NTRACE_PROFILING_RECORD(profile_state, ...) \
  do { /* no op */ \
  } while (0)

/*
 * Tracing disabled: nothing to dump. No trailing ';' after the body, since
 * that would be an empty declaration at file scope.
 */
static inline void ntraceProfilingDump(void) { /* no op */ }
#endif

#endif /* end of NCCL_NTRACE_PROFILER_H_ */
7 changes: 7 additions & 0 deletions src/misc/ibvwrap.cc
Original file line number Diff line number Diff line change
Expand Up @@ -235,11 +235,13 @@ const char *wrap_ibv_get_device_name(struct ibv_device *device) {
// Wrapper around ibv_internal_open_device(device).
// The check macro is handed *ret as the destination for the call result and
// `res` as the status to fill in; `res` has no other assignment here and is
// returned as-is.
ncclResult_t wrap_ibv_open_device(struct ibv_context **ret, struct ibv_device *device) { /* NULL is the error value handed to the check macro for this call */
ncclResult_t res;
IBV_PTR_CHECK_NO_RETURN(ibv_internal_open_device, ibv_internal_open_device(device), *ret, NULL, "ibv_open_device", res);
// Traced after the call so the new context handle (*ret) is part of the record.
// NOTE(review): this runs regardless of `res`; confirm the tracer tolerates a failed open.
NTRACE_PROFILING_RECORD(ibv_open_device, *ret, device);
return res;
}

// Wrapper around ibv_internal_close_device(context).
// `res` is filled in by the check macro (it has no other assignment here)
// and returned unchanged.
ncclResult_t wrap_ibv_close_device(struct ibv_context *context) { /*returns 0 on success, -1 on failure*/
ncclResult_t res;
// Traced before the close call — presumably so the context is still open
// when the tracer sees it; verify against ntrace_rt.
NTRACE_PROFILING_RECORD(ibv_close_device, context);
IBV_INT_CHECK_NO_RETURN(ibv_internal_close_device, ibv_internal_close_device(context), -1, "ibv_close_device", res);
return res;
}
Expand All @@ -265,12 +267,14 @@ ncclResult_t wrap_ibv_query_device(struct ibv_context *context, struct ibv_devic
// Wrapper around ibv_internal_query_port(context, port_num, port_attr).
// The underlying call returns 0 on success, or the value of errno on failure
// (which indicates the failure reason); 0 is the success value handed to the
// check macro. `res` is filled in by that macro and returned unchanged.
//
// The port attributes are traced only when the query succeeded: the trace
// takes *port_attr by value, and after a failed query that structure may
// never have been written.
ncclResult_t wrap_ibv_query_port(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr) {
  ncclResult_t res;
  IBV_INT_CHECK_RET_ERRNO_NO_RETURN(ibv_internal_query_port, ibv_internal_query_port(context, port_num, port_attr), 0, "ibv_query_port", res);
  if (res == ncclSuccess) {
    NTRACE_PROFILING_RECORD(ibv_query_port, context, port_num, *port_attr);
  }
  return res;
}

// Wrapper around ibv_internal_query_gid(context, port_num, index, gid).
// 0 is the success value handed to the check macro; `res` is filled in by
// that macro and returned unchanged.
//
// The GID is traced only when the query succeeded: the trace takes *gid by
// value, and after a failed query that union may never have been written.
ncclResult_t wrap_ibv_query_gid(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid) {
  ncclResult_t res;
  IBV_INT_CHECK_RET_ERRNO_NO_RETURN(ibv_internal_query_gid, ibv_internal_query_gid(context, port_num, index, gid), 0, "ibv_query_gid", res);
  if (res == ncclSuccess) {
    NTRACE_PROFILING_RECORD(ibv_query_gid, context, port_num, index, *gid);
  }
  return res;
}

Expand Down Expand Up @@ -350,18 +354,21 @@ ncclResult_t wrap_ibv_destroy_cq(struct ibv_cq *cq) {

// Wrapper around ibv_internal_destroy_qp(qp).
// 0 is the success value handed to the check macro; `res` is filled in by
// that macro (it has no other assignment here) and returned unchanged.
ncclResult_t wrap_ibv_destroy_qp(struct ibv_qp *qp) {
ncclResult_t res;
// Traced before the destroy call — presumably so the QP is still valid
// when the tracer sees it; verify against ntrace_rt.
NTRACE_PROFILING_RECORD(ibv_destroy_qp, qp);
IBV_INT_CHECK_RET_ERRNO_NO_RETURN(ibv_internal_destroy_qp, ibv_internal_destroy_qp(qp), 0, "ibv_destroy_qp", res);
return res;
}

// Wrapper around ibv_internal_create_qp(pd, qp_init_attr).
// The check macro is handed *ret as the destination for the call result,
// NULL as the error value, and `res` as the status to fill in; `res` is
// returned unchanged.
ncclResult_t wrap_ibv_create_qp(struct ibv_qp **ret, struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr) {
ncclResult_t res;
IBV_PTR_CHECK_NO_RETURN(ibv_internal_create_qp, ibv_internal_create_qp(pd, qp_init_attr), *ret, NULL, "ibv_create_qp", res);
// Traced after the call so the new QP handle (*ret) is part of the record.
// NOTE(review): this runs regardless of `res`; confirm the tracer tolerates a failed create.
NTRACE_PROFILING_RECORD(ibv_create_qp, *ret, qp_init_attr);
return res;
}

// Wrapper around ibv_internal_modify_qp(qp, attr, attr_mask).
// 0 is the success value handed to the check macro; `res` is filled in by
// that macro (it has no other assignment here) and returned unchanged.
ncclResult_t wrap_ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
ncclResult_t res;
// The requested modification is traced before the call is issued, so it is
// recorded whether or not the modify succeeds.
NTRACE_PROFILING_RECORD(ibv_modify_qp, qp, attr, attr_mask);
IBV_INT_CHECK_RET_ERRNO_NO_RETURN(ibv_internal_modify_qp, ibv_internal_modify_qp(qp, attr, attr_mask), 0, "ibv_modify_qp", res);
return res;
}
Expand Down
12 changes: 12 additions & 0 deletions src/proxy.cc
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,17 @@
#include "socket.h"
#include "shm.h"
#include "profiler.h"

// When building NCCL with ntrace_rt.h, we ensure that ntrace_rt.h is always
// included via ibvwrap.h, after the verbs type definitions, so that it
// obtains those type definitions while avoiding cross-reference issues.
#ifdef ENABLE_NTRACE
#include "ibvwrap.h"
#else
// for reference to no-op ntraceProfilingDump
#include "ntrace_profiler.h"
#endif

#define ENABLE_TIMER 0
#include "timer.h"

Expand Down Expand Up @@ -733,6 +744,7 @@ ncclResult_t ncclProxyProgressDestroy(struct ncclComm* comm) {
}

ncclProfilingDump();
ntraceProfilingDump();
TIME_PRINT("Proxy");
return ncclSuccess;
}
Expand Down

0 comments on commit 38e738a

Please sign in to comment.