[lustre-devel] [PATCH 15/34] lnet: extend zombie handling to nets and nis
Doug Oucharek
doucharek at cray.com
Tue Sep 11 20:53:47 PDT 2018
Which refcount line are you referring to? The call to lnet_ni_unlink_locked()?
Reviewed-by: Doug Oucharek <dougso at me.com>k
Doug
On 9/6/18, 5:53 PM, "NeilBrown" <neilb at suse.com> wrote:
A zombie lnet_ni is now attached to the lnet_net rather than the
global the_lnet. The zombie lnet_net are attached to the_lnet.
For some reason, we don't drop the refcount on the lnd before shutting
it down now.
This is part of
8cbb8cd3e771e7f7e0f99cafc19fad32770dc015
LU-7734 lnet: Multi-Rail local NI split
Signed-off-by: NeilBrown <neilb at suse.com>
---
.../staging/lustre/include/linux/lnet/lib-types.h | 9 ++-
drivers/staging/lustre/lnet/lnet/api-ni.c | 65 ++++++++++----------
drivers/staging/lustre/lnet/lnet/config.c | 3 +
3 files changed, 42 insertions(+), 35 deletions(-)
diff --git a/drivers/staging/lustre/include/linux/lnet/lib-types.h b/drivers/staging/lustre/include/linux/lnet/lib-types.h
index 22957d142cc0..1d372672e2de 100644
--- a/drivers/staging/lustre/include/linux/lnet/lib-types.h
+++ b/drivers/staging/lustre/include/linux/lnet/lib-types.h
@@ -284,6 +284,9 @@ struct lnet_net {
struct lnet_lnd *net_lnd;
/* list of NIs on this net */
struct list_head net_ni_list;
+
+ /* dying LND instances */
+ struct list_head net_ni_zombie;
};
struct lnet_ni {
@@ -653,11 +656,11 @@ struct lnet {
/* LND instances */
struct list_head ln_nets;
/* NIs bond on specific CPT(s) */
- struct list_head ln_nis_cpt;
- /* dying LND instances */
- struct list_head ln_nis_zombie;
+ struct list_head ln_nis_cpt;
/* the loopback NI */
struct lnet_ni *ln_loni;
+ /* network zombie list */
+ struct list_head ln_net_zombie;
/* remote networks with routes to them */
struct list_head *ln_remote_nets_hash;
diff --git a/drivers/staging/lustre/lnet/lnet/api-ni.c b/drivers/staging/lustre/lnet/lnet/api-ni.c
index c3c568e63342..18d111cb826b 100644
--- a/drivers/staging/lustre/lnet/lnet/api-ni.c
+++ b/drivers/staging/lustre/lnet/lnet/api-ni.c
@@ -539,7 +539,6 @@ lnet_prepare(lnet_pid_t requested_pid)
INIT_LIST_HEAD(&the_lnet.ln_test_peers);
INIT_LIST_HEAD(&the_lnet.ln_nets);
INIT_LIST_HEAD(&the_lnet.ln_nis_cpt);
- INIT_LIST_HEAD(&the_lnet.ln_nis_zombie);
INIT_LIST_HEAD(&the_lnet.ln_routers);
INIT_LIST_HEAD(&the_lnet.ln_drop_rules);
INIT_LIST_HEAD(&the_lnet.ln_delay_rules);
@@ -618,7 +617,6 @@ lnet_unprepare(void)
LASSERT(list_empty(&the_lnet.ln_test_peers));
LASSERT(list_empty(&the_lnet.ln_nets));
LASSERT(list_empty(&the_lnet.ln_nis_cpt));
- LASSERT(list_empty(&the_lnet.ln_nis_zombie));
lnet_portals_destroy();
@@ -1095,34 +1093,35 @@ lnet_ni_unlink_locked(struct lnet_ni *ni)
/* move it to zombie list and nobody can find it anymore */
LASSERT(!list_empty(&ni->ni_netlist));
- list_move(&ni->ni_netlist, &the_lnet.ln_nis_zombie);
+ list_move(&ni->ni_netlist, &ni->ni_net->net_ni_zombie);
lnet_ni_decref_locked(ni, 0);
}
static void
-lnet_clear_zombies_nis_locked(void)
+lnet_clear_zombies_nis_locked(struct lnet_net *net)
{
int i;
int islo;
struct lnet_ni *ni;
+ struct list_head *zombie_list = &net->net_ni_zombie;
/*
- * Now wait for the NI's I just nuked to show up on ln_zombie_nis
- * and shut them down in guaranteed thread context
+ * Now wait for the NIs I just nuked to show up on the zombie
+ * list and shut them down in guaranteed thread context
*/
i = 2;
- while (!list_empty(&the_lnet.ln_nis_zombie)) {
+ while (!list_empty(zombie_list)) {
int *ref;
int j;
- ni = list_entry(the_lnet.ln_nis_zombie.next,
+ ni = list_entry(zombie_list->next,
struct lnet_ni, ni_netlist);
list_del_init(&ni->ni_netlist);
cfs_percpt_for_each(ref, j, ni->ni_refs) {
if (!*ref)
continue;
/* still busy, add it back to zombie list */
- list_add(&ni->ni_netlist, &the_lnet.ln_nis_zombie);
+ list_add(&ni->ni_netlist, zombie_list);
break;
}
@@ -1138,18 +1137,13 @@ lnet_clear_zombies_nis_locked(void)
continue;
}
- ni->ni_net->net_lnd->lnd_refcount--;
lnet_net_unlock(LNET_LOCK_EX);
islo = ni->ni_net->net_lnd->lnd_type == LOLND;
LASSERT(!in_interrupt());
- ni->ni_net->net_lnd->lnd_shutdown(ni);
+ net->net_lnd->lnd_shutdown(ni);
- /*
- * can't deref lnd anymore now; it might have unregistered
- * itself...
- */
if (!islo)
CDEBUG(D_LNI, "Removed LNI %s\n",
libcfs_nid2str(ni->ni_nid));
@@ -1162,9 +1156,11 @@ lnet_clear_zombies_nis_locked(void)
}
static void
-lnet_shutdown_lndnis(void)
+lnet_shutdown_lndnet(struct lnet_net *net);
+
+static void
+lnet_shutdown_lndnets(void)
{
- struct lnet_ni *ni;
int i;
struct lnet_net *net;
@@ -1173,30 +1169,35 @@ lnet_shutdown_lndnis(void)
/* All quiet on the API front */
LASSERT(!the_lnet.ln_shutdown);
LASSERT(!the_lnet.ln_refcount);
- LASSERT(list_empty(&the_lnet.ln_nis_zombie));
lnet_net_lock(LNET_LOCK_EX);
the_lnet.ln_shutdown = 1; /* flag shutdown */
- /* Unlink NIs from the global table */
while (!list_empty(&the_lnet.ln_nets)) {
+ /*
+ * move the nets to the zombie list to avoid them being
+ * picked up for new work. LONET is also included in the
+ * Nets that will be moved to the zombie list
+ */
net = list_entry(the_lnet.ln_nets.next,
struct lnet_net, net_list);
- while (!list_empty(&net->net_ni_list)) {
- ni = list_entry(net->net_ni_list.next,
- struct lnet_ni, ni_netlist);
- lnet_ni_unlink_locked(ni);
- }
+ list_move(&net->net_list, &the_lnet.ln_net_zombie);
}
- /* Drop the cached loopback NI. */
+ /* Drop the cached loopback Net. */
if (the_lnet.ln_loni) {
lnet_ni_decref_locked(the_lnet.ln_loni, 0);
the_lnet.ln_loni = NULL;
}
-
lnet_net_unlock(LNET_LOCK_EX);
+ /* iterate through the net zombie list and delete each net */
+ while (!list_empty(&the_lnet.ln_net_zombie)) {
+ net = list_entry(the_lnet.ln_net_zombie.next,
+ struct lnet_net, net_list);
+ lnet_shutdown_lndnet(net);
+ }
+
/*
* Clear lazy portals and drop delayed messages which hold refs
* on their lnet_msg::msg_rxpeer
@@ -1211,8 +1212,6 @@ lnet_shutdown_lndnis(void)
lnet_peer_tables_cleanup(NULL);
lnet_net_lock(LNET_LOCK_EX);
-
- lnet_clear_zombies_nis_locked();
the_lnet.ln_shutdown = 0;
lnet_net_unlock(LNET_LOCK_EX);
}
@@ -1222,6 +1221,7 @@ static void
lnet_shutdown_lndni(struct lnet_ni *ni)
{
int i;
+ struct lnet_net *net = ni->ni_net;
lnet_net_lock(LNET_LOCK_EX);
lnet_ni_unlink_locked(ni);
@@ -1235,7 +1235,7 @@ lnet_shutdown_lndni(struct lnet_ni *ni)
lnet_peer_tables_cleanup(ni);
lnet_net_lock(LNET_LOCK_EX);
- lnet_clear_zombies_nis_locked();
+ lnet_clear_zombies_nis_locked(net);
lnet_net_unlock(LNET_LOCK_EX);
}
@@ -1445,7 +1445,7 @@ lnet_startup_lndnets(struct list_head *netlist)
return ni_count;
failed:
- lnet_shutdown_lndnis();
+ lnet_shutdown_lndnets();
return rc;
}
@@ -1492,6 +1492,7 @@ int lnet_lib_init(void)
the_lnet.ln_refcount = 0;
LNetInvalidateEQHandle(&the_lnet.ln_rc_eqh);
INIT_LIST_HEAD(&the_lnet.ln_lnds);
+ INIT_LIST_HEAD(&the_lnet.ln_net_zombie);
INIT_LIST_HEAD(&the_lnet.ln_rcd_zombie);
INIT_LIST_HEAD(&the_lnet.ln_rcd_deathrow);
@@ -1656,7 +1657,7 @@ LNetNIInit(lnet_pid_t requested_pid)
if (!the_lnet.ln_nis_from_mod_params)
lnet_destroy_routes();
err_shutdown_lndnis:
- lnet_shutdown_lndnis();
+ lnet_shutdown_lndnets();
err_empty_list:
lnet_unprepare();
LASSERT(rc < 0);
@@ -1703,7 +1704,7 @@ LNetNIFini(void)
lnet_acceptor_stop();
lnet_destroy_routes();
- lnet_shutdown_lndnis();
+ lnet_shutdown_lndnets();
lnet_unprepare();
}
diff --git a/drivers/staging/lustre/lnet/lnet/config.c b/drivers/staging/lustre/lnet/lnet/config.c
index 380a3fb1caba..2588d67fea1b 100644
--- a/drivers/staging/lustre/lnet/lnet/config.c
+++ b/drivers/staging/lustre/lnet/lnet/config.c
@@ -279,6 +279,8 @@ lnet_net_free(struct lnet_net *net)
struct list_head *tmp, *tmp2;
struct lnet_ni *ni;
+ LASSERT(list_empty(&net->net_ni_zombie));
+
/* delete any nis which have been started. */
list_for_each_safe(tmp, tmp2, &net->net_ni_list) {
ni = list_entry(tmp, struct lnet_ni, ni_netlist);
@@ -312,6 +314,7 @@ lnet_net_alloc(__u32 net_id, struct list_head *net_list)
INIT_LIST_HEAD(&net->net_list);
INIT_LIST_HEAD(&net->net_ni_list);
+ INIT_LIST_HEAD(&net->net_ni_zombie);
net->net_id = net_id;
More information about the lustre-devel
mailing list