[lustre-devel] [PATCH 08/49] lnet: Lookup lpni after discovery
James Simmons
jsimmons at infradead.org
Wed Apr 14 21:02:00 PDT 2021
From: Chris Horn <chris.horn at hpe.com>
The lpni for a nid can change as part of the discovery process (see
lnet_peer_add_nid()). As such, callers of lnet_discover_peer_locked()
need to look up the lpni again after discovery completes to make sure
they get the correct peer.
An exception is lnet_check_routers(), which doesn't do anything with
the peer or peer NI after the call to lnet_discover_peer_locked().
If the router list is changed, then lnet_check_routers() will already
repeat discovery.
HPE-bug-id: LUS-9167
WC-bug-id: https://jira.whamcloud.com/browse/LU-13883
Lustre-commit: 584d9e46053234d0 ("LU-13883 lnet: Lookup lpni after discovery")
Signed-off-by: Chris Horn <chris.horn at hpe.com>
Reviewed-on: https://review.whamcloud.com/39747
Reviewed-by: Serguei Smirnov <ssmirnov at whamcloud.com>
Reviewed-by: James Simmons <jsimmons at infradead.org>
Reviewed-by: Oleg Drokin <green at whamcloud.com>
Signed-off-by: James Simmons <jsimmons at infradead.org>
---
include/linux/lnet/lib-lnet.h | 1 +
net/lnet/lnet/api-ni.c | 12 ++++++++++++
net/lnet/lnet/lib-move.c | 30 ++++++++++++++++++++++++------
net/lnet/lnet/peer.c | 30 ++++++++++++++++++++++++++++++
4 files changed, 67 insertions(+), 6 deletions(-)
diff --git a/include/linux/lnet/lib-lnet.h b/include/linux/lnet/lib-lnet.h
index 2741c6f..1954614 100644
--- a/include/linux/lnet/lib-lnet.h
+++ b/include/linux/lnet/lib-lnet.h
@@ -817,6 +817,7 @@ struct lnet_peer_ni *lnet_peer_get_ni_locked(struct lnet_peer *lp,
void lnet_peer_net_added(struct lnet_net *net);
lnet_nid_t lnet_peer_primary_nid_locked(lnet_nid_t nid);
int lnet_discover_peer_locked(struct lnet_peer_ni *lpni, int cpt, bool block);
+void lnet_peer_queue_message(struct lnet_peer *lp, struct lnet_msg *msg);
int lnet_peer_discovery_start(void);
void lnet_peer_discovery_stop(void);
void lnet_push_update_to_peers(int force);
diff --git a/net/lnet/lnet/api-ni.c b/net/lnet/lnet/api-ni.c
index 542cc2e..0c0b304 100644
--- a/net/lnet/lnet/api-ni.c
+++ b/net/lnet/lnet/api-ni.c
@@ -4540,6 +4540,18 @@ static int lnet_ping(struct lnet_process_id id, signed long timeout,
if (rc)
goto out_decref;
+ /* The lpni (or lp) for this NID may have changed and our ref is
+ * the only thing keeping the old one around. Release the ref
+ * and lookup the lpni again
+ */
+ lnet_peer_ni_decref_locked(lpni);
+ lpni = lnet_find_peer_ni_locked(id.nid);
+ if (!lpni) {
+ rc = -ENOENT;
+ goto out;
+ }
+ lp = lpni->lpni_peer_net->lpn_peer;
+
i = 0;
p = NULL;
while ((p = lnet_get_next_peer_ni_locked(lp, NULL, p)) != NULL) {
diff --git a/net/lnet/lnet/lib-move.c b/net/lnet/lnet/lib-move.c
index de17de4b..25e0fd2 100644
--- a/net/lnet/lnet/lib-move.c
+++ b/net/lnet/lnet/lib-move.c
@@ -1834,6 +1834,7 @@ struct lnet_ni *
int cpt)
{
struct lnet_peer *peer;
+ struct lnet_peer_ni *new_lpni;
int rc;
lnet_peer_ni_addref_locked(lpni);
@@ -1855,21 +1856,38 @@ struct lnet_ni *
lnet_peer_ni_decref_locked(lpni);
return rc;
}
- /* The peer may have changed. */
- peer = lpni->lpni_peer_net->lpn_peer;
+
+ new_lpni = lnet_find_peer_ni_locked(lpni->lpni_nid);
+ if (!new_lpni) {
+ lnet_peer_ni_decref_locked(lpni);
+ return -ENOENT;
+ }
+
+ peer = new_lpni->lpni_peer_net->lpn_peer;
spin_lock(&peer->lp_lock);
- if (lnet_peer_is_uptodate_locked(peer)) {
+ if (lpni == new_lpni && lnet_peer_is_uptodate_locked(peer)) {
+ /* The peer NI did not change and the peer is up to date.
+ * Nothing more to do.
+ */
spin_unlock(&peer->lp_lock);
lnet_peer_ni_decref_locked(lpni);
+ lnet_peer_ni_decref_locked(new_lpni);
return 0;
}
- /* queue message and return */
+ spin_unlock(&peer->lp_lock);
+
+ /* Either the peer NI changed during discovery, or the peer isn't up
+ * to date. In both cases we want to queue the message on the
+ * (possibly new) peer's pending queue and queue the peer for discovery
+ */
msg->msg_sending = 0;
msg->msg_txpeer = NULL;
- list_add_tail(&msg->msg_list, &peer->lp_dc_pendq);
- spin_unlock(&peer->lp_lock);
+ lnet_net_unlock(cpt);
+ lnet_peer_queue_message(peer, msg);
+ lnet_net_lock(cpt);
lnet_peer_ni_decref_locked(lpni);
+ lnet_peer_ni_decref_locked(new_lpni);
CDEBUG(D_NET, "msg %p delayed. %s pending discovery\n",
msg, libcfs_nid2str(peer->lp_primary_nid));
diff --git a/net/lnet/lnet/peer.c b/net/lnet/lnet/peer.c
index 1b240f1..ba41d86 100644
--- a/net/lnet/lnet/peer.c
+++ b/net/lnet/lnet/peer.c
@@ -1346,6 +1346,16 @@ struct lnet_peer_ni *
rc = lnet_discover_peer_locked(lpni, cpt, true);
if (rc)
goto out_decref;
+ /* The lpni (or lp) for this NID may have changed and our ref is
+ * the only thing keeping the old one around. Release the ref
+ * and lookup the lpni again
+ */
+ lnet_peer_ni_decref_locked(lpni);
+ lpni = lnet_find_peer_ni_locked(nid);
+ if (!lpni) {
+ rc = -ENOENT;
+ goto out_unlock;
+ }
lp = lpni->lpni_peer_net->lpn_peer;
/* Only try once if discovery is disabled */
@@ -2054,6 +2064,26 @@ struct lnet_peer_ni *
return rc;
}
+/* Add the message to the peer's lp_dc_pendq and queue the peer for discovery */
+void
+lnet_peer_queue_message(struct lnet_peer *lp, struct lnet_msg *msg)
+{
+ /* The discovery thread holds net_lock/EX and lp_lock when it splices
+ * the lp_dc_pendq onto a local list for resending. Thus, we do the same
+ * when adding to the list and queuing the peer to ensure that we do not
+ * strand any messages on the lp_dc_pendq. This scheme ensures the
+ * message will be resent even if the peer is already being discovered.
+ * Therefore we needn't check the return value of
+ * lnet_peer_queue_for_discovery(lp).
+ */
+ lnet_net_lock(LNET_LOCK_EX);
+ spin_lock(&lp->lp_lock);
+ list_add_tail(&msg->msg_list, &lp->lp_dc_pendq);
+ spin_unlock(&lp->lp_lock);
+ lnet_peer_queue_for_discovery(lp);
+ lnet_net_unlock(LNET_LOCK_EX);
+}
+
/*
* Queue a peer for the attention of the discovery thread. Call with
* lnet_net_lock/EX held. Returns 0 if the peer was queued, and
--
1.8.3.1
More information about the lustre-devel
mailing list