[lustre-devel] [PATCH 16/24] lnet: Use fatal NI if none other available

James Simmons jsimmons at infradead.org
Mon Sep 5 18:55:29 PDT 2022


From: Serguei Smirnov <ssmirnov at whamcloud.com>

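Allow an NI in fatal state to be selected for sending when no NI in a
non-fatal state is available. A non-fatal NI is always preferred over a
fatal one; this check is made before health and the remaining selection
criteria are compared.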

HPE-bug-id: LUS-11019
WC-bug-id: https://jira.whamcloud.com/browse/LU-14955
Lustre-commit: ff3322fd0c77a8042 ("LU-14955 lnet: Use fatal NI if none other available")
Signed-off-by: Serguei Smirnov <ssmirnov at whamcloud.com>
Signed-off-by: Chris Horn <chris.horn at hpe.com>
Reviewed-on: https://review.whamcloud.com/44746
Reviewed-by: Cyril Bordage <cbordage at whamcloud.com>
Reviewed-by: Frank Sehr <fsehr at whamcloud.com>
Reviewed-by: Oleg Drokin <green at whamcloud.com>
Signed-off-by: James Simmons <jsimmons at infradead.org>
---
 net/lnet/lnet/lib-move.c | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/net/lnet/lnet/lib-move.c b/net/lnet/lnet/lib-move.c
index 6ad0963..3b20a1b7 100644
--- a/net/lnet/lnet/lib-move.c
+++ b/net/lnet/lnet/lib-move.c
@@ -1449,6 +1449,7 @@ void lnet_usr_translate_stats(struct lnet_ioctl_element_msg_stats *msg_stats,
 	int best_healthv;
 	u32 best_sel_prio;
 	unsigned int best_dev_prio;
+	int best_ni_fatal;
 	unsigned int dev_idx = UINT_MAX;
 	bool gpu = md ? (md->md_flags & LNET_MD_FLAG_GPU) : false;
 
@@ -1470,6 +1471,7 @@ void lnet_usr_translate_stats(struct lnet_ioctl_element_msg_stats *msg_stats,
 		best_dev_prio = UINT_MAX;
 		best_credits = INT_MIN;
 		best_healthv = 0;
+		best_ni_fatal = true;
 	} else {
 		best_dev_prio = lnet_dev_prio_of_md(best_ni, dev_idx);
 		shortest_distance = cfs_cpt_distance(lnet_cpt_table(), md_cpt,
@@ -1477,6 +1479,7 @@ void lnet_usr_translate_stats(struct lnet_ioctl_element_msg_stats *msg_stats,
 		best_credits = atomic_read(&best_ni->ni_tx_credits);
 		best_healthv = atomic_read(&best_ni->ni_healthv);
 		best_sel_prio = best_ni->ni_sel_priority;
+		best_ni_fatal = atomic_read(&best_ni->ni_fatal_error_on);
 	}
 
 	while ((ni = lnet_get_next_ni_locked(local_net, ni))) {
@@ -1510,7 +1513,7 @@ void lnet_usr_translate_stats(struct lnet_ioctl_element_msg_stats *msg_stats,
 		if (!gpu && distance < lnet_numa_range)
 			distance = lnet_numa_range;
 
-		/* * Select on health, selection policy, direct dma prio,
+		/** Select on health, selection policy, direct dma prio,
 		 * shorter distance, available credits, then round-robin.
 		 */
 		if (ni_fatal)
@@ -1518,16 +1521,24 @@ void lnet_usr_translate_stats(struct lnet_ioctl_element_msg_stats *msg_stats,
 
 		if (best_ni)
 			CDEBUG(D_NET,
-			       "compare ni %s [c:%d, d:%d, s:%d, p:%u, g:%u, h:%d] with best_ni %s [c:%d, d:%d, s:%d, p:%u, g:%u, h:%d]\n",
-			       libcfs_nidstr(&ni->ni_nid), ni_credits, distance,
+			       "compare ni %s [f:%s, c:%d, d:%d, s:%d, p:%u, g:%u, h:%d] with best_ni %s [f:%s, c:%d, d:%d, s:%d, p:%u, g:%u, h:%d]\n",
+			       libcfs_nidstr(&ni->ni_nid),
+			       ni_fatal ? "y" : "n", ni_credits, distance,
 			       ni->ni_seq, ni_sel_prio, ni_dev_prio, ni_healthv,
-			       (best_ni) ? libcfs_nidstr(&best_ni->ni_nid)
-			       : "not selected", best_credits, shortest_distance,
+			       (best_ni) ? libcfs_nidstr(&best_ni->ni_nid) :
+			       "not selected",
+			       best_ni_fatal ? "y" : "n", best_credits,
+			       shortest_distance,
 			       (best_ni) ? best_ni->ni_seq : 0,
 			       best_sel_prio, best_dev_prio, best_healthv);
 		else
 			goto select_ni;
 
+		if (ni_fatal && !best_ni_fatal)
+			continue;
+		else if (!ni_fatal && best_ni_fatal)
+			goto select_ni;
+
 		if (ni_healthv < best_healthv)
 			continue;
 		else if (ni_healthv > best_healthv)
@@ -1563,6 +1574,7 @@ void lnet_usr_translate_stats(struct lnet_ioctl_element_msg_stats *msg_stats,
 		best_healthv = ni_healthv;
 		best_ni = ni;
 		best_credits = ni_credits;
+		best_ni_fatal = ni_fatal;
 	}
 
 	CDEBUG(D_NET, "selected best_ni %s\n",
-- 
1.8.3.1



More information about the lustre-devel mailing list