[lustre-devel] [PATCH 05/25] lustre: lnet: fix race in lnet shutdown path

James Simmons jsimmons at infradead.org
Tue Sep 25 19:47:57 PDT 2018


From: Olaf Weber <olaf.weber at hpe.com>

The locking changes for the lnet_net_lock made for Multi-Rail
introduce a race in the LNet shutdown path. The code keeps two
states in the_lnet.ln_shutdown: 0 means LNet is either up and
running or shut down, while 1 means lnet is shutting down. In
lnet_select_pathway() if we need to restart and drop and relock
the lnet_net_lock we can find that LNet went from running to
stopped, and not be able to tell the difference.

Replace ln_shutdown with a three-state ln_state patterned on
ln_rc_state: states are LNET_STATE_SHUTDOWN, LNET_STATE_RUNNING,
and LNET_STATE_STOPPING. Most checks against ln_shutdown now test
ln_state against LNET_STATE_RUNNING. LNet moves to RUNNING state
in lnet_startup_lndnets().

Signed-off-by: Olaf Weber <olaf.weber at hpe.com>
WC-bug-id: https://jira.whamcloud.com/browse/LU-9119
Reviewed-on: https://review.whamcloud.com/26690
Reviewed-by: Doug Oucharek <dougso at me.com>
Reviewed-by: Oleg Drokin <green at whamcloud.com>
Signed-off-by: James Simmons <jsimmons at infradead.org>
---
 drivers/staging/lustre/include/linux/lnet/lib-types.h |  9 +++++++--
 drivers/staging/lustre/lnet/lnet/api-ni.c             | 15 ++++++++++++---
 drivers/staging/lustre/lnet/lnet/lib-move.c           |  2 +-
 drivers/staging/lustre/lnet/lnet/lib-ptl.c            |  4 ++--
 drivers/staging/lustre/lnet/lnet/peer.c               | 10 +++++-----
 drivers/staging/lustre/lnet/lnet/router.c             |  6 +++---
 6 files changed, 30 insertions(+), 16 deletions(-)

diff --git a/drivers/staging/lustre/include/linux/lnet/lib-types.h b/drivers/staging/lustre/include/linux/lnet/lib-types.h
index 18e2665..6abac19 100644
--- a/drivers/staging/lustre/include/linux/lnet/lib-types.h
+++ b/drivers/staging/lustre/include/linux/lnet/lib-types.h
@@ -726,6 +726,11 @@ struct lnet_msg_container {
 #define LNET_RC_STATE_RUNNING		1	/* started up OK */
 #define LNET_RC_STATE_STOPPING		2	/* telling thread to stop */
 
+/* LNet states */
+#define LNET_STATE_SHUTDOWN		0	/* not started */
+#define LNET_STATE_RUNNING		1	/* started up OK */
+#define LNET_STATE_STOPPING		2	/* telling thread to stop */
+
 struct lnet {
 	/* CPU partition table of LNet */
 	struct cfs_cpt_table		 *ln_cpt_table;
@@ -805,8 +810,8 @@ struct lnet {
 	int				  ln_niinit_self;
 	/* LNetNIInit/LNetNIFini counter */
 	int				  ln_refcount;
-	/* shutdown in progress */
-	int				  ln_shutdown;
+	/* SHUTDOWN/RUNNING/STOPPING */
+	int				  ln_state;
 
 	int				  ln_routing;	/* am I a router? */
 	lnet_pid_t			  ln_pid;	/* requested pid */
diff --git a/drivers/staging/lustre/lnet/lnet/api-ni.c b/drivers/staging/lustre/lnet/lnet/api-ni.c
index 2d430d0..7c907a3 100644
--- a/drivers/staging/lustre/lnet/lnet/api-ni.c
+++ b/drivers/staging/lustre/lnet/lnet/api-ni.c
@@ -1277,11 +1277,11 @@ struct lnet_ni *
 	/* NB called holding the global mutex */
 
 	/* All quiet on the API front */
-	LASSERT(!the_lnet.ln_shutdown);
+	LASSERT(the_lnet.ln_state == LNET_STATE_RUNNING);
 	LASSERT(!the_lnet.ln_refcount);
 
 	lnet_net_lock(LNET_LOCK_EX);
-	the_lnet.ln_shutdown = 1;	/* flag shutdown */
+	the_lnet.ln_state = LNET_STATE_STOPPING;
 
 	while (!list_empty(&the_lnet.ln_nets)) {
 		/*
@@ -1309,7 +1309,7 @@ struct lnet_ni *
 	}
 
 	lnet_net_lock(LNET_LOCK_EX);
-	the_lnet.ln_shutdown = 0;
+	the_lnet.ln_state = LNET_STATE_SHUTDOWN;
 	lnet_net_unlock(LNET_LOCK_EX);
 }
 
@@ -1583,6 +1583,15 @@ struct lnet_ni *
 	int rc;
 	int ni_count = 0;
 
+	/*
+	 * Change to running state before bringing up the LNDs. This
+	 * allows lnet_shutdown_lndnets() to assert that we've passed
+	 * through here.
+	 */
+	lnet_net_lock(LNET_LOCK_EX);
+	the_lnet.ln_state = LNET_STATE_RUNNING;
+	lnet_net_unlock(LNET_LOCK_EX);
+
 	while (!list_empty(netlist)) {
 		net = list_entry(netlist->next, struct lnet_net, net_list);
 		list_del_init(&net->net_list);
diff --git a/drivers/staging/lustre/lnet/lnet/lib-move.c b/drivers/staging/lustre/lnet/lnet/lib-move.c
index a213387..ea7e2c3 100644
--- a/drivers/staging/lustre/lnet/lnet/lib-move.c
+++ b/drivers/staging/lustre/lnet/lnet/lib-move.c
@@ -1242,7 +1242,7 @@
 
 	seq = lnet_get_dlc_seq_locked();
 
-	if (the_lnet.ln_shutdown) {
+	if (the_lnet.ln_state != LNET_STATE_RUNNING) {
 		lnet_net_unlock(cpt);
 		return -ESHUTDOWN;
 	}
diff --git a/drivers/staging/lustre/lnet/lnet/lib-ptl.c b/drivers/staging/lustre/lnet/lnet/lib-ptl.c
index d403353..6fa5bbf 100644
--- a/drivers/staging/lustre/lnet/lnet/lib-ptl.c
+++ b/drivers/staging/lustre/lnet/lnet/lib-ptl.c
@@ -589,7 +589,7 @@ struct list_head *
 	mtable = lnet_mt_of_match(info, msg);
 	lnet_res_lock(mtable->mt_cpt);
 
-	if (the_lnet.ln_shutdown) {
+	if (the_lnet.ln_state != LNET_STATE_RUNNING) {
 		rc = LNET_MATCHMD_DROP;
 		goto out1;
 	}
@@ -951,7 +951,7 @@ struct list_head *
 				list_move(&msg->msg_list, &zombies);
 		}
 	} else {
-		if (the_lnet.ln_shutdown)
+		if (the_lnet.ln_state != LNET_STATE_RUNNING)
 			CWARN("Active lazy portal %d on exit\n", portal);
 		else
 			CDEBUG(D_NET, "clearing portal %d lazy\n", portal);
diff --git a/drivers/staging/lustre/lnet/lnet/peer.c b/drivers/staging/lustre/lnet/lnet/peer.c
index ae3ffca..2fbf93a 100644
--- a/drivers/staging/lustre/lnet/lnet/peer.c
+++ b/drivers/staging/lustre/lnet/lnet/peer.c
@@ -428,7 +428,7 @@ void lnet_peer_uninit(void)
 	struct lnet_peer_table *ptable;
 	int i;
 
-	LASSERT(the_lnet.ln_shutdown || net);
+	LASSERT(the_lnet.ln_state != LNET_STATE_SHUTDOWN || net);
 	/*
 	 * If just deleting the peers for a NI, get rid of any routes these
 	 * peers are gateways for.
@@ -458,7 +458,7 @@ void lnet_peer_uninit(void)
 	struct list_head	*peers;
 	struct lnet_peer_ni	*lp;
 
-	LASSERT(!the_lnet.ln_shutdown);
+	LASSERT(the_lnet.ln_state == LNET_STATE_RUNNING);
 
 	peers = &ptable->pt_hash[lnet_nid2peerhash(nid)];
 	list_for_each_entry(lp, peers, lpni_hashlist) {
@@ -1000,7 +1000,7 @@ struct lnet_peer_ni *
 	struct lnet_peer_ni *lpni = NULL;
 	int rc;
 
-	if (the_lnet.ln_shutdown) /* it's shutting down */
+	if (the_lnet.ln_state != LNET_STATE_RUNNING)
 		return ERR_PTR(-ESHUTDOWN);
 
 	/*
@@ -1034,7 +1034,7 @@ struct lnet_peer_ni *
 	struct lnet_peer_ni *lpni = NULL;
 	int rc;
 
-	if (the_lnet.ln_shutdown) /* it's shutting down */
+	if (the_lnet.ln_state != LNET_STATE_RUNNING)
 		return ERR_PTR(-ESHUTDOWN);
 
 	/*
@@ -1063,7 +1063,7 @@ struct lnet_peer_ni *
 	 * Shutdown is only set under the ln_api_lock, so a single
 	 * check here is sufficent.
 	 */
-	if (the_lnet.ln_shutdown) {
+	if (the_lnet.ln_state != LNET_STATE_RUNNING) {
 		lpni = ERR_PTR(-ESHUTDOWN);
 		goto out_mutex_unlock;
 	}
diff --git a/drivers/staging/lustre/lnet/lnet/router.c b/drivers/staging/lustre/lnet/lnet/router.c
index a0483f9..d3aef8b 100644
--- a/drivers/staging/lustre/lnet/lnet/router.c
+++ b/drivers/staging/lustre/lnet/lnet/router.c
@@ -252,7 +252,7 @@ struct lnet_remotenet *
 	struct lnet_remotenet *rnet;
 	struct list_head *rn_list;
 
-	LASSERT(!the_lnet.ln_shutdown);
+	LASSERT(the_lnet.ln_state == LNET_STATE_RUNNING);
 
 	rn_list = lnet_net2rnethash(net);
 	list_for_each_entry(rnet, rn_list, lrn_list) {
@@ -374,7 +374,7 @@ static void lnet_shuffle_seed(void)
 		return rc;
 	}
 	route->lr_gateway = lpni;
-	LASSERT(!the_lnet.ln_shutdown);
+	LASSERT(the_lnet.ln_state == LNET_STATE_RUNNING);
 
 	rnet2 = lnet_find_rnet_locked(net);
 	if (!rnet2) {
@@ -1775,7 +1775,7 @@ int lnet_get_rtr_pool_cfg(int idx, struct lnet_ioctl_pool_cfg *pool_cfg)
 
 	lnet_net_lock(cpt);
 
-	if (the_lnet.ln_shutdown) {
+	if (the_lnet.ln_state != LNET_STATE_RUNNING) {
 		lnet_net_unlock(cpt);
 		return -ESHUTDOWN;
 	}
-- 
1.8.3.1



More information about the lustre-devel mailing list