[lustre-devel] [PATCH 145/622] lnet: unlink md if fail to send recovery
James Simmons
jsimmons at infradead.org
Thu Feb 27 13:10:13 PST 2020
From: Amir Shehata <ashehata at whamcloud.com>
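The MD for a recovery ping should be unlinked if we fail to send the GET.
When the recovery event reports a non-zero status, mark the local NI or
peer NI as RECOVERY_FAILED. The recovery queue processing then
force-unlinks the MD, clears the flag, and keeps the NI on the recovery
queue so that recovery can be retried.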
WC-bug-id: https://jira.whamcloud.com/browse/LU-11474
Lustre-commit: e0132e16df15 ("LU-11474 lnet: unlink md if fail to send recovery")
Signed-off-by: Amir Shehata <ashehata at whamcloud.com>
Reviewed-on: https://review.whamcloud.com/33306
Reviewed-by: Sonia Sharma <sharmaso at whamcloud.com>
Reviewed-by: Doug Oucharek <dougso at me.com>
Reviewed-by: Olaf Weber <olaf.weber at hpe.com>
Reviewed-by: Oleg Drokin <green at whamcloud.com>
Signed-off-by: James Simmons <jsimmons at infradead.org>
---
include/linux/lnet/lib-types.h | 7 ++++--
net/lnet/lnet/lib-move.c | 48 +++++++++++++++++++++++++++++++++---------
2 files changed, 43 insertions(+), 12 deletions(-)
diff --git a/include/linux/lnet/lib-types.h b/include/linux/lnet/lib-types.h
index f82ebb6..b2159b0 100644
--- a/include/linux/lnet/lib-types.h
+++ b/include/linux/lnet/lib-types.h
@@ -317,7 +317,8 @@ struct lnet_tx_queue {
#define LNET_NI_STATE_ACTIVE (1 << 1)
#define LNET_NI_STATE_FAILED (1 << 2)
#define LNET_NI_STATE_RECOVERY_PENDING (1 << 3)
-#define LNET_NI_STATE_DELETING (1 << 4)
+#define LNET_NI_STATE_RECOVERY_FAILED BIT(4)
+#define LNET_NI_STATE_DELETING BIT(5)
enum lnet_stats_type {
LNET_STATS_TYPE_SEND = 0,
@@ -606,8 +607,10 @@ struct lnet_peer_ni {
#define LNET_PEER_NI_NON_MR_PREF BIT(0)
/* peer is being recovered. */
#define LNET_PEER_NI_RECOVERY_PENDING BIT(1)
+/* recovery ping failed */
+#define LNET_PEER_NI_RECOVERY_FAILED BIT(2)
/* peer is being deleted */
-#define LNET_PEER_NI_DELETING BIT(2)
+#define LNET_PEER_NI_DELETING BIT(3)
struct lnet_peer {
/* chain on pt_peer_list */
diff --git a/net/lnet/lnet/lib-move.c b/net/lnet/lnet/lib-move.c
index 38ee970..b54fbab 100644
--- a/net/lnet/lnet/lib-move.c
+++ b/net/lnet/lnet/lib-move.c
@@ -2615,13 +2615,13 @@ struct lnet_mt_event_info {
/* called with cpt and ni_lock held */
static void
-lnet_unlink_ni_recovery_mdh_locked(struct lnet_ni *ni, int cpt)
+lnet_unlink_ni_recovery_mdh_locked(struct lnet_ni *ni, int cpt, bool force)
{
struct lnet_handle_md recovery_mdh;
LNetInvalidateMDHandle(&recovery_mdh);
- if (ni->ni_state & LNET_NI_STATE_RECOVERY_PENDING) {
+ if (ni->ni_state & LNET_NI_STATE_RECOVERY_PENDING || force) {
recovery_mdh = ni->ni_ping_mdh;
LNetInvalidateMDHandle(&ni->ni_ping_mdh);
}
@@ -2675,12 +2675,22 @@ struct lnet_mt_event_info {
if (!(ni->ni_state & LNET_NI_STATE_ACTIVE) ||
healthv == LNET_MAX_HEALTH_VALUE) {
list_del_init(&ni->ni_recovery);
- lnet_unlink_ni_recovery_mdh_locked(ni, 0);
+ lnet_unlink_ni_recovery_mdh_locked(ni, 0, false);
lnet_ni_unlock(ni);
lnet_ni_decref_locked(ni, 0);
lnet_net_unlock(0);
continue;
}
+
+ /* if the local NI failed recovery we must unlink the md.
+ * But we want to keep the local_ni on the recovery queue
+ * so we can continue the attempts to recover it.
+ */
+ if (ni->ni_state & LNET_NI_STATE_RECOVERY_FAILED) {
+ lnet_unlink_ni_recovery_mdh_locked(ni, 0, true);
+ ni->ni_state &= ~LNET_NI_STATE_RECOVERY_FAILED;
+ }
+
lnet_ni_unlock(ni);
lnet_net_unlock(0);
@@ -2829,7 +2839,7 @@ struct lnet_mt_event_info {
struct lnet_ni, ni_recovery);
list_del_init(&ni->ni_recovery);
lnet_ni_lock(ni);
- lnet_unlink_ni_recovery_mdh_locked(ni, 0);
+ lnet_unlink_ni_recovery_mdh_locked(ni, 0, true);
lnet_ni_unlock(ni);
lnet_ni_decref_locked(ni, 0);
}
@@ -2838,13 +2848,14 @@ struct lnet_mt_event_info {
}
static void
-lnet_unlink_lpni_recovery_mdh_locked(struct lnet_peer_ni *lpni, int cpt)
+lnet_unlink_lpni_recovery_mdh_locked(struct lnet_peer_ni *lpni, int cpt,
+ bool force)
{
struct lnet_handle_md recovery_mdh;
LNetInvalidateMDHandle(&recovery_mdh);
- if (lpni->lpni_state & LNET_PEER_NI_RECOVERY_PENDING) {
+ if (lpni->lpni_state & LNET_PEER_NI_RECOVERY_PENDING || force) {
recovery_mdh = lpni->lpni_recovery_ping_mdh;
LNetInvalidateMDHandle(&lpni->lpni_recovery_ping_mdh);
}
@@ -2867,7 +2878,7 @@ struct lnet_mt_event_info {
lpni_recovery) {
list_del_init(&lpni->lpni_recovery);
spin_lock(&lpni->lpni_lock);
- lnet_unlink_lpni_recovery_mdh_locked(lpni, LNET_LOCK_EX);
+ lnet_unlink_lpni_recovery_mdh_locked(lpni, LNET_LOCK_EX, true);
spin_unlock(&lpni->lpni_lock);
lnet_peer_ni_decref_locked(lpni);
}
@@ -2933,12 +2944,22 @@ struct lnet_mt_event_info {
if (lpni->lpni_state & LNET_PEER_NI_DELETING ||
healthv == LNET_MAX_HEALTH_VALUE) {
list_del_init(&lpni->lpni_recovery);
- lnet_unlink_lpni_recovery_mdh_locked(lpni, 0);
+ lnet_unlink_lpni_recovery_mdh_locked(lpni, 0, false);
spin_unlock(&lpni->lpni_lock);
lnet_peer_ni_decref_locked(lpni);
lnet_net_unlock(0);
continue;
}
+
+ /* If the peer NI has failed recovery we must unlink the
+ * md. But we want to keep the peer ni on the recovery
+ * queue so we can try to continue recovering it
+ */
+ if (lpni->lpni_state & LNET_PEER_NI_RECOVERY_FAILED) {
+ lnet_unlink_lpni_recovery_mdh_locked(lpni, 0, true);
+ lpni->lpni_state &= ~LNET_PEER_NI_RECOVERY_FAILED;
+ }
+
spin_unlock(&lpni->lpni_lock);
lnet_net_unlock(0);
@@ -3152,11 +3173,14 @@ struct lnet_mt_event_info {
}
lnet_ni_lock(ni);
ni->ni_state &= ~LNET_NI_STATE_RECOVERY_PENDING;
+ if (status)
+ ni->ni_state |= LNET_NI_STATE_RECOVERY_FAILED;
lnet_ni_unlock(ni);
lnet_net_unlock(0);
if (status != 0) {
- CERROR("local NI recovery failed with %d\n", status);
+ CERROR("local NI (%s) recovery failed with %d\n",
+ libcfs_nid2str(nid), status);
return;
}
/* need to increment healthv for the ni here, because in
@@ -3178,12 +3202,15 @@ struct lnet_mt_event_info {
}
spin_lock(&lpni->lpni_lock);
lpni->lpni_state &= ~LNET_PEER_NI_RECOVERY_PENDING;
+ if (status)
+ lpni->lpni_state |= LNET_PEER_NI_RECOVERY_FAILED;
spin_unlock(&lpni->lpni_lock);
lnet_peer_ni_decref_locked(lpni);
lnet_net_unlock(cpt);
if (status != 0)
- CERROR("peer NI recovery failed with %d\n", status);
+ CERROR("peer NI (%s) recovery failed with %d\n",
+ libcfs_nid2str(nid), status);
}
}
@@ -3214,6 +3241,7 @@ struct lnet_mt_event_info {
libcfs_nid2str(ev_info->mt_nid),
(event->status) ? "unsuccessfully" :
"successfully", event->status);
+ lnet_handle_recovery_reply(ev_info, event->status);
break;
default:
CERROR("Unexpected event: %d\n", event->type);
--
1.8.3.1
More information about the lustre-devel mailing list