Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

jgm/DAOS-16481: mercury: Added fixes and warning log entries for customer testing #121

Draft
wants to merge 21 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
63be52b
DAOS-16448 mercury: Add patch to check ep for null in UCX key resolve.
jgmoore-or Aug 29, 2024
e5d8513
DAOS-16271 mercury: Add patch to check ep for null in UCX key resolve.
jgmoore-or Aug 29, 2024
3fa5324
DAOS-16271 mercury: Add patch to check ep for null in UCX key resolve.
jgmoore-or Aug 29, 2024
fd9ae31
DAOS-16271 mercury: Add patch to check ep for null in UCX key resolve.
jgmoore-or Aug 30, 2024
e176560
DAOS-16481 mercury: Fixes and debug logging for customer testing.
jgmoore-or Sep 3, 2024
d8ac0aa
DAOS-16481 mercury: Enhanced logging for customer testing.
jgmoore-or Sep 4, 2024
b7bd193
DAOS-16481 mercury: Update patch to check ep in key_resolve.
jgmoore-or Sep 4, 2024
91d41e0
DAOS-16481 mercury: Add logging to UCX interface code.
jgmoore-or Sep 5, 2024
ebfea50
DAOS-16481 mercury: add patch for hg_first benchmark
soumagne Sep 5, 2024
887a462
DAOS-16481 mercury: Update logging and change reconnect handling.
jgmoore-or Sep 5, 2024
b1f9629
Merge branch 'master' into jgm/DAOS-16481
jgmoore-or Sep 5, 2024
6ce6b4b
Merge branch 'jgm/DAOS-16481' of github.com:daos-stack/mercury into j…
jgmoore-or Sep 5, 2024
bfe5b3a
DAOS-16481 mercury: Update logging.
jgmoore-or Sep 5, 2024
2307932
DAOS-16481 mercury: Add logging and change reconnect handling.
jgmoore-or Sep 5, 2024
24b8f80
DAOS-16481 mercury: Add logging for debugging.
jgmoore-or Sep 6, 2024
336e0f8
DAOS-16481 mercury: Add logging for address tracking.
jgmoore-or Sep 6, 2024
30d3ca1
DAOS-16481 mercury: Add logging for debug at customer site.
jgmoore-or Sep 9, 2024
7dae4f0
DAOS-16481 mercury: Add log entries and check for ep on disconnect.
jgmoore-or Sep 9, 2024
b4aeab9
Merge branch 'master' into jgm/DAOS-16481
jgmoore-or Sep 10, 2024
a4828f4
DAOS-16481 mercury: Add potential fix for IOR hang issue
jgmoore-or Sep 16, 2024
abd169a
DAOS-16481: mercury: Change ep close for UCX.
jgmoore-or Sep 20, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions debian/changelog
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
mercury (2.4.0~rc5-2) unstable; urgency=medium
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same here -- needs to be bumped to rc5-3; rc5-2 is already present in master

[ Joseph Moore ]
* Add patch to na_ucx.c to check ep in key_resolve

-- Joseph Moore <[email protected]> Thu, 29 Aug 2024 09:00:00 -0600

mercury (2.4.0~rc5-1) unstable; urgency=medium
[ Jerome Soumagne ]
* Update to 2.4.0rc5
Expand Down
6 changes: 5 additions & 1 deletion mercury.spec
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Name: mercury
Version: 2.4.0~rc5
Release: 1%{?dist}
Release: 2%{?dist}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this needs to be bumped to 3, as its 2 already in master.


# dl_version is version with ~ removed
%{lua:
Expand All @@ -12,6 +12,7 @@ License: BSD
Group: Development/Libraries
URL: http://mercury-hpc.github.io/
Source0: https://github.com/mercury-hpc/%{name}/releases/download/v%{dl_version}/%{name}-%{dl_version}.tar.bz2
Patch0: na_ucx_keyres_epchk.patch

BuildRequires: libfabric-devel >= 1.15.0
BuildRequires: cmake
Expand Down Expand Up @@ -117,6 +118,9 @@ Mercury plugin to support the UCX transport.
%{_libdir}/cmake/

%changelog
* Thu Aug 29 2024 Joseph Moore <[email protected]> - 2.4.0~rc5-2
- Add patch to na_ucx.c to check ep in key_resolve.

* Mon Aug 26 2024 Jerome Soumagne <[email protected]> - 2.4.0~rc5-1
- Update to 2.4.0rc5

Expand Down
103 changes: 103 additions & 0 deletions na_ucx_keyres_epchk.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
diff --git a/src/na/na_ucx.c b/src/na/na_ucx.c
index 84eb8b0..78e63d6 100644
--- a/src/na/na_ucx.c
+++ b/src/na/na_ucx.c
@@ -1666,6 +1666,9 @@ na_ucp_listener_destroy(ucp_listener_h listener)
static void
na_ucp_listener_conn_cb(ucp_conn_request_h conn_request, void *arg)
{
+
+ char host_string[NI_MAXHOST];
+ char serv_string[NI_MAXSERV];
struct na_ucx_class *na_ucx_class = (struct na_ucx_class *) arg;
ucp_conn_request_attr_t conn_request_attrs = {
.field_mask = UCP_CONN_REQUEST_ATTR_FIELD_CLIENT_ADDR};
@@ -1683,13 +1686,24 @@ na_ucp_listener_conn_cb(ucp_conn_request_h conn_request, void *arg)
UCP_CONN_REQUEST_ATTR_FIELD_CLIENT_ADDR) == 0,
error, "conn attributes contain no client addr");

+ int rc = getnameinfo((const struct sockaddr *) &conn_request_attrs.client_address,
+ sizeof(struct sockaddr), host_string,
+ sizeof(host_string), serv_string,
+ sizeof(serv_string), NI_NUMERICHOST | NI_NUMERICSERV);
+
/* Lookup address from table */
addr_key = (ucs_sock_addr_t){
.addr = (const struct sockaddr *) &conn_request_attrs.client_address,
.addrlen = sizeof(conn_request_attrs.client_address)};
na_ucx_addr = na_ucx_addr_map_lookup(&na_ucx_class->addr_map, &addr_key);
- NA_CHECK_SUBSYS_ERROR_NORET(addr, na_ucx_addr != NULL, error,
- "An entry is already present for this address");
+
+
+ if (na_ucx_addr != NULL) {
+ NA_LOG_SUBSYS_WARNING(addr,
+ "An entry is already present for this address: %s:%s, refcnt: %d",
+ host_string, serv_string, na_ucx_addr->refcount);
+ na_ucx_addr_map_remove(&na_ucx_class->addr_map, &addr_key);
+ }

/* Insert new entry and create new address */
na_ret = na_ucx_addr_map_insert(na_ucx_class, &na_ucx_class->addr_map,
@@ -1919,12 +1933,20 @@ error:
/*---------------------------------------------------------------------------*/
static void
na_ucp_ep_error_cb(
- void *arg, ucp_ep_h NA_UNUSED ep, ucs_status_t NA_DEBUG_LOG_USED status)
+ void *arg, ucp_ep_h NA_UNUSED ep, ucs_status_t status)
{
+ char host_string[NI_MAXHOST];
+ char serv_string[NI_MAXSERV];
struct na_ucx_addr *na_ucx_addr = (struct na_ucx_addr *) arg;

- NA_LOG_SUBSYS_DEBUG(addr, "ep_err_handler() returned (%s) for address (%p)",
- ucs_status_string(status), (void *) na_ucx_addr);
+ int rc = getnameinfo((const struct sockaddr *) &na_ucx_addr->ss_addr,
+ sizeof(struct sockaddr), host_string,
+ sizeof(host_string), serv_string,
+ sizeof(serv_string), NI_NUMERICHOST | NI_NUMERICSERV);
+
+ NA_LOG_SUBSYS_WARNING(addr,
+ "ep_err_handler() returned (%s) for address (%s:%s), ep (%p) with refcnt: %d",
+ ucs_status_string(status), host_string, serv_string, ep, na_ucx_addr->refcount);

/* Mark addr as no longer resolved to force reconnection */
hg_atomic_and32(&na_ucx_addr->status, ~NA_UCX_ADDR_RESOLVED);
@@ -2993,7 +3015,7 @@ na_ucx_unexpected_info_free(

/*---------------------------------------------------------------------------*/
static na_return_t
-na_ucx_rma(struct na_ucx_class NA_UNUSED *na_ucx_class, na_context_t *context,
+na_ucx_rma(struct na_ucx_class *na_ucx_class, na_context_t *context,
na_cb_type_t cb_type, na_cb_t callback, void *arg,
struct na_ucx_mem_handle *local_mem_handle, na_offset_t local_offset,
struct na_ucx_mem_handle *remote_mem_handle, na_offset_t remote_offset,
@@ -3023,6 +3045,18 @@ na_ucx_rma(struct na_ucx_class NA_UNUSED *na_ucx_class, na_context_t *context,

/* There is no need to have a fully resolved address to start an RMA.
* This is only necessary for two-sided communication. */
+ /* The above assumption is now in question, so the following will resolve
+ * the address if required. */
+
+ /* Check addr to ensure the EP for that addr is still valid */
+ if (!(hg_atomic_get32(&na_ucx_addr->status) & NA_UCX_ADDR_RESOLVED)) {
+ ret = na_ucx_addr_map_update(
+ na_ucx_class, &na_ucx_class->addr_map, na_ucx_addr);
+ NA_CHECK_SUBSYS_NA_ERROR(
+ addr, error, ret, "Could not update NA UCX address");
+ }
+ NA_CHECK_SUBSYS_ERROR(msg, na_ucx_addr->ucp_ep == NULL, error, ret,
+ NA_ADDRNOTAVAIL, "UCP endpoint is NULL for that address");

/* TODO UCX requires the remote key to be bound to the origin, do we need a
* new API? */
@@ -3061,6 +3095,9 @@ na_ucx_rma_key_resolve(ucp_ep_h ep, struct na_ucx_mem_handle *na_ucx_mem_handle,

hg_thread_mutex_lock(&na_ucx_mem_handle->rkey_unpack_lock);

+ NA_CHECK_SUBSYS_ERROR(
+ mem, ep == NULL, error, ret, NA_INVALID_ARG, "Invalid endpoint (%p)", ep);
+
switch (hg_atomic_get32(&na_ucx_mem_handle->type)) {
case NA_UCX_MEM_HANDLE_REMOTE_PACKED: {
ucs_status_t status = ucp_ep_rkey_unpack(ep,