[lustre-devel] [PATCH 25/26] lnet: o2iblnd: clear fatal error on successful failover

James Simmons jsimmons at infradead.org
Mon Aug 2 12:50:50 PDT 2021


From: Serguei Smirnov <ssmirnov at whamcloud.com>

In an IB bonding configuration, a link down event causes the fatal
error flag to be set on the bonded interface so that it is not
selected by LNet for tx, e.g. when just one of the two cables is
pulled. This change allows the interface status to be restored on
successful failover.

WC-bug-id: https://jira.whamcloud.com/browse/LU-14806
Lustre-commit: 4668283cd13079dd ("LU-14806 o2iblnd: clear fatal error on successful failover")
Signed-off-by: Serguei Smirnov <ssmirnov at whamcloud.com>
Reviewed-on: https://review.whamcloud.com/44139
Reviewed-by: Cyril Bordage <cbordage at whamcloud.com>
Reviewed-by: Amir Shehata <ashehata at whamcloud.com>
Reviewed-by: Oleg Drokin <green at whamcloud.com>
Signed-off-by: James Simmons <jsimmons at infradead.org>
---
 net/lnet/klnds/o2iblnd/o2iblnd.c | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/net/lnet/klnds/o2iblnd/o2iblnd.c b/net/lnet/klnds/o2iblnd/o2iblnd.c
index 3141953..686581a 100644
--- a/net/lnet/klnds/o2iblnd/o2iblnd.c
+++ b/net/lnet/klnds/o2iblnd/o2iblnd.c
@@ -1487,6 +1487,21 @@ static void kiblnd_fini_fmr_poolset(struct kib_fmr_poolset *fps)
 	}
 }
 
+static int kiblnd_get_link_status(struct net_device *dev)
+{
+	int ret = -1;	/* returned as-is when no branch below sets it */
+
+	LASSERT(dev);
+
+	if (!netif_running(dev))
+		ret = 0;	/* interface is not running */
+	/* get_link is optional; without it the status stays at -1 */
+	else if (dev->ethtool_ops->get_link)
+		ret = dev->ethtool_ops->get_link(dev);
+
+	return ret;
+}
+
 static int
 kiblnd_init_fmr_poolset(struct kib_fmr_poolset *fps, int cpt, int ncpts,
 			struct kib_net *net,
@@ -2347,6 +2362,7 @@ int kiblnd_dev_failover(struct kib_dev *dev, struct net *ns)
 	struct ib_pd *pd;
 	struct kib_net *net;
 	struct sockaddr_in addr;
+	struct net_device *netdev;
 	unsigned long flags;
 	int rc = 0;
 	int i;
@@ -2467,11 +2483,18 @@ int kiblnd_dev_failover(struct kib_dev *dev, struct net *ns)
 	if (hdev)
 		kiblnd_hdev_decref(hdev);
 
-	if (rc)
+	if (rc) {
 		dev->ibd_failed_failover++;
-	else
+	} else {
 		dev->ibd_failed_failover = 0;
 
+		rcu_read_lock();
+		netdev = dev_get_by_name_rcu(ns, dev->ibd_ifname);
+		if (netdev && (kiblnd_get_link_status(netdev) == 1))
+			kiblnd_set_ni_fatal_on(dev->ibd_hdev, 0);
+		rcu_read_unlock();
+	}
+
 	return rc;
 }
 
-- 
1.8.3.1



More information about the lustre-devel mailing list