[lustre-devel] [PATCH 16/24] lnet: Use fatal NI if none other available
James Simmons
jsimmons at infradead.org
Mon Sep 5 18:55:29 PDT 2022
From: Serguei Smirnov <ssmirnov at whamcloud.com>
Allow an NI in the fatal state to be selected for sending if there are
no NIs in a non-fatal state available.
HPE-bug-id: LUS-11019
WC-bug-id: https://jira.whamcloud.com/browse/LU-14955
Lustre-commit: ff3322fd0c77a8042 ("LU-14955 lnet: Use fatal NI if none other available")
Signed-off-by: Serguei Smirnov <ssmirnov at whamcloud.com>
Signed-off-by: Chris Horn <chris.horn at hpe.com>
Reviewed-on: https://review.whamcloud.com/44746
Reviewed-by: Cyril Bordage <cbordage at whamcloud.com>
Reviewed-by: Frank Sehr <fsehr at whamcloud.com>
Reviewed-by: Oleg Drokin <green at whamcloud.com>
Signed-off-by: James Simmons <jsimmons at infradead.org>
---
net/lnet/lnet/lib-move.c | 22 +++++++++++++++++-----
1 file changed, 17 insertions(+), 5 deletions(-)
diff --git a/net/lnet/lnet/lib-move.c b/net/lnet/lnet/lib-move.c
index 6ad0963..3b20a1b7 100644
--- a/net/lnet/lnet/lib-move.c
+++ b/net/lnet/lnet/lib-move.c
@@ -1449,6 +1449,7 @@ void lnet_usr_translate_stats(struct lnet_ioctl_element_msg_stats *msg_stats,
int best_healthv;
u32 best_sel_prio;
unsigned int best_dev_prio;
+ int best_ni_fatal;
unsigned int dev_idx = UINT_MAX;
bool gpu = md ? (md->md_flags & LNET_MD_FLAG_GPU) : false;
@@ -1470,6 +1471,7 @@ void lnet_usr_translate_stats(struct lnet_ioctl_element_msg_stats *msg_stats,
best_dev_prio = UINT_MAX;
best_credits = INT_MIN;
best_healthv = 0;
+ best_ni_fatal = true;
} else {
best_dev_prio = lnet_dev_prio_of_md(best_ni, dev_idx);
shortest_distance = cfs_cpt_distance(lnet_cpt_table(), md_cpt,
@@ -1477,6 +1479,7 @@ void lnet_usr_translate_stats(struct lnet_ioctl_element_msg_stats *msg_stats,
best_credits = atomic_read(&best_ni->ni_tx_credits);
best_healthv = atomic_read(&best_ni->ni_healthv);
best_sel_prio = best_ni->ni_sel_priority;
+ best_ni_fatal = atomic_read(&best_ni->ni_fatal_error_on);
}
while ((ni = lnet_get_next_ni_locked(local_net, ni))) {
@@ -1510,7 +1513,7 @@ void lnet_usr_translate_stats(struct lnet_ioctl_element_msg_stats *msg_stats,
if (!gpu && distance < lnet_numa_range)
distance = lnet_numa_range;
- /* * Select on health, selection policy, direct dma prio,
+ /** Select on health, selection policy, direct dma prio,
* shorter distance, available credits, then round-robin.
*/
if (ni_fatal)
@@ -1518,16 +1521,24 @@ void lnet_usr_translate_stats(struct lnet_ioctl_element_msg_stats *msg_stats,
if (best_ni)
CDEBUG(D_NET,
- "compare ni %s [c:%d, d:%d, s:%d, p:%u, g:%u, h:%d] with best_ni %s [c:%d, d:%d, s:%d, p:%u, g:%u, h:%d]\n",
- libcfs_nidstr(&ni->ni_nid), ni_credits, distance,
+ "compare ni %s [f:%s, c:%d, d:%d, s:%d, p:%u, g:%u, h:%d] with best_ni %s [f:%s, c:%d, d:%d, s:%d, p:%u, g:%u, h:%d]\n",
+ libcfs_nidstr(&ni->ni_nid),
+ ni_fatal ? "y" : "n", ni_credits, distance,
ni->ni_seq, ni_sel_prio, ni_dev_prio, ni_healthv,
- (best_ni) ? libcfs_nidstr(&best_ni->ni_nid)
- : "not selected", best_credits, shortest_distance,
+ (best_ni) ? libcfs_nidstr(&best_ni->ni_nid) :
+ "not selected",
+ best_ni_fatal ? "y" : "n", best_credits,
+ shortest_distance,
(best_ni) ? best_ni->ni_seq : 0,
best_sel_prio, best_dev_prio, best_healthv);
else
goto select_ni;
+ if (ni_fatal && !best_ni_fatal)
+ continue;
+ else if (!ni_fatal && best_ni_fatal)
+ goto select_ni;
+
if (ni_healthv < best_healthv)
continue;
else if (ni_healthv > best_healthv)
@@ -1563,6 +1574,7 @@ void lnet_usr_translate_stats(struct lnet_ioctl_element_msg_stats *msg_stats,
best_healthv = ni_healthv;
best_ni = ni;
best_credits = ni_credits;
+ best_ni_fatal = ni_fatal;
}
CDEBUG(D_NET, "selected best_ni %s\n",
--
1.8.3.1
More information about the lustre-devel mailing list