[lustre-devel] [PATCH 08/34] LU-7734 lnet: Multi-Rail peer split

NeilBrown neilb at suse.com
Mon Sep 24 18:07:15 PDT 2018


From: Amir Shehata <amir.shehata at intel.com>

[[Note, the preceding few patches are part of this
  in the upstream lustre code - they were split
  for easier merging into linux.
  ]]

Split the peer structure into peer/peer_net/peer_ni, as
described in the Multi-Rail HLD.

Removed deathrow list in peers; instead peers are immediately
deleted. deathrow complicates memory management for peers for
little gain.

Moved to LNET_LOCK_EX for any operations which will modify the
peer tables. And CPT locks for any operations which read the peer
tables. Therefore there is no need to use lnet_cpt_of_nid() to
calculate the CPT of the peer NID, instead we use lnet_nid_cpt_hash()
to distribute peers across multiple CPTs.

It is no longer true that peers and NIs would exist on
the same CPT. In the new design peers and NIs don't have a 1-1
relationship. You can send to the same peer from several NIs, which
can exist on separate CPTs.

Signed-off-by: Amir Shehata <amir.shehata at intel.com>
Change-Id: Ida41d830d38d0ab2bb551476e4a8866d52a25fe2
Reviewed-on: http://review.whamcloud.com/18293
Reviewed-by: Olaf Weber <olaf at sgi.com>
Reviewed-by: Doug Oucharek <doug.s.oucharek at intel.com>
Signed-off-by: NeilBrown <neilb at suse.com>
---
 .../staging/lustre/include/linux/lnet/lib-lnet.h   |    2 
 .../staging/lustre/include/linux/lnet/lib-types.h  |   29 ++
 drivers/staging/lustre/lnet/lnet/api-ni.c          |    1 
 drivers/staging/lustre/lnet/lnet/peer.c            |  260 ++++++++++++--------
 drivers/staging/lustre/lnet/lnet/router_proc.c     |    3 
 5 files changed, 191 insertions(+), 104 deletions(-)

diff --git a/drivers/staging/lustre/include/linux/lnet/lib-lnet.h b/drivers/staging/lustre/include/linux/lnet/lib-lnet.h
index 656177b64336..bf076298de71 100644
--- a/drivers/staging/lustre/include/linux/lnet/lib-lnet.h
+++ b/drivers/staging/lustre/include/linux/lnet/lib-lnet.h
@@ -637,8 +637,6 @@ bool lnet_net_unique(__u32 net_id, struct list_head *nilist,
 bool lnet_ni_unique_net(struct list_head *nilist, char *iface);
 
 int lnet_nid2peerni_locked(struct lnet_peer_ni **lpp, lnet_nid_t nid, int cpt);
-struct lnet_peer_ni *lnet_find_peer_locked(struct lnet_peer_table *ptable,
-					   lnet_nid_t nid);
 struct lnet_peer_ni *lnet_find_peer_ni_locked(lnet_nid_t nid, int cpt);
 void lnet_peer_tables_cleanup(struct lnet_ni *ni);
 void lnet_peer_tables_destroy(void);
diff --git a/drivers/staging/lustre/include/linux/lnet/lib-types.h b/drivers/staging/lustre/include/linux/lnet/lib-types.h
index 9a2cf319dba9..9f70c094cc4c 100644
--- a/drivers/staging/lustre/include/linux/lnet/lib-types.h
+++ b/drivers/staging/lustre/include/linux/lnet/lib-types.h
@@ -384,6 +384,7 @@ struct lnet_rc_data {
 };
 
 struct lnet_peer_ni {
+	struct list_head	lpni_on_peer_net_list;
 	/* chain on peer hash */
 	struct list_head	 lpni_hashlist;
 	/* messages blocking for tx credits */
@@ -394,6 +395,7 @@ struct lnet_peer_ni {
 	struct list_head	 lpni_rtr_list;
 	/* # tx credits available */
 	int			 lpni_txcredits;
+	struct lnet_peer_net	*lpni_peer_net;
 	/* low water mark */
 	int			 lpni_mintxcredits;
 	/* # router credits */
@@ -442,6 +444,31 @@ struct lnet_peer_ni {
 	struct lnet_rc_data	*lpni_rcd;
 };
 
+struct lnet_peer {
+	/* chain on global peer list */
+	struct list_head	lp_on_lnet_peer_list;
+
+	/* list of peer nets */
+	struct list_head	lp_peer_nets;
+
+	/* primary NID of the peer */
+	lnet_nid_t		lp_primary_nid;
+};
+
+struct lnet_peer_net {
+	/* chain on peer block */
+	struct list_head	lpn_on_peer_list;
+
+	/* list of peer_nis on this network */
+	struct list_head	lpn_peer_nis;
+
+	/* pointer to the peer I'm part of */
+	struct lnet_peer	*lpn_peer;
+
+	/* Net ID */
+	__u32			lpn_net_id;
+};
+
 /* peer hash size */
 #define LNET_PEER_HASH_BITS	9
 #define LNET_PEER_HASH_SIZE	(1 << LNET_PEER_HASH_BITS)
@@ -686,6 +713,8 @@ struct lnet {
 	struct lnet_msg_container	**ln_msg_containers;
 	struct lnet_counters		**ln_counters;
 	struct lnet_peer_table		**ln_peer_tables;
+	/* list of configured or discovered peers */
+	struct list_head		ln_peers;
 	/* failure simulation */
 	struct list_head		  ln_test_peers;
 	struct list_head		  ln_drop_rules;
diff --git a/drivers/staging/lustre/lnet/lnet/api-ni.c b/drivers/staging/lustre/lnet/lnet/api-ni.c
index 20fa3fea04b9..821b030f9621 100644
--- a/drivers/staging/lustre/lnet/lnet/api-ni.c
+++ b/drivers/staging/lustre/lnet/lnet/api-ni.c
@@ -542,6 +542,7 @@ lnet_prepare(lnet_pid_t requested_pid)
 	the_lnet.ln_pid = requested_pid;
 
 	INIT_LIST_HEAD(&the_lnet.ln_test_peers);
+	INIT_LIST_HEAD(&the_lnet.ln_peers);
 	INIT_LIST_HEAD(&the_lnet.ln_nets);
 	INIT_LIST_HEAD(&the_lnet.ln_routers);
 	INIT_LIST_HEAD(&the_lnet.ln_drop_rules);
diff --git a/drivers/staging/lustre/lnet/lnet/peer.c b/drivers/staging/lustre/lnet/lnet/peer.c
index 376e3459fa92..97ee1f5cfd2f 100644
--- a/drivers/staging/lustre/lnet/lnet/peer.c
+++ b/drivers/staging/lustre/lnet/lnet/peer.c
@@ -54,8 +54,6 @@ lnet_peer_tables_create(void)
 	}
 
 	cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) {
-		INIT_LIST_HEAD(&ptable->pt_deathrow);
-
 		hash = kvmalloc_cpt(LNET_PEER_HASH_SIZE * sizeof(*hash),
 				    GFP_KERNEL, i);
 		if (!hash) {
@@ -88,8 +86,6 @@ lnet_peer_tables_destroy(void)
 		if (!hash) /* not initialized */
 			break;
 
-		LASSERT(list_empty(&ptable->pt_deathrow));
-
 		ptable->pt_hash = NULL;
 		for (j = 0; j < LNET_PEER_HASH_SIZE; j++)
 			LASSERT(list_empty(&hash[j]));
@@ -123,7 +119,7 @@ lnet_peer_table_cleanup_locked(struct lnet_ni *ni,
 }
 
 static void
-lnet_peer_table_deathrow_wait_locked(struct lnet_peer_table *ptable,
+lnet_peer_table_finalize_wait_locked(struct lnet_peer_table *ptable,
 				     int cpt_locked)
 {
 	int i;
@@ -173,12 +169,8 @@ void
 lnet_peer_tables_cleanup(struct lnet_ni *ni)
 {
 	struct lnet_peer_table *ptable;
-	struct list_head deathrow;
-	struct lnet_peer_ni *lp;
 	int i;
 
-	INIT_LIST_HEAD(&deathrow);
-
 	LASSERT(the_lnet.ln_shutdown || ni);
 	/*
 	 * If just deleting the peers for a NI, get rid of any routes these
@@ -191,8 +183,7 @@ lnet_peer_tables_cleanup(struct lnet_ni *ni)
 	}
 
 	/*
-	 * Start the process of moving the applicable peers to
-	 * deathrow.
+	 * Start the cleanup process
 	 */
 	cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) {
 		lnet_net_lock(LNET_LOCK_EX);
@@ -200,20 +191,12 @@ lnet_peer_tables_cleanup(struct lnet_ni *ni)
 		lnet_net_unlock(LNET_LOCK_EX);
 	}
 
-	/* Cleanup all entries on deathrow. */
+	/* Wait until all peers have been destroyed. */
 	cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) {
 		lnet_net_lock(LNET_LOCK_EX);
-		lnet_peer_table_deathrow_wait_locked(ptable, i);
-		list_splice_init(&ptable->pt_deathrow, &deathrow);
+		lnet_peer_table_finalize_wait_locked(ptable, i);
 		lnet_net_unlock(LNET_LOCK_EX);
 	}
-
-	while (!list_empty(&deathrow)) {
-		lp = list_entry(deathrow.next, struct lnet_peer_ni,
-				lpni_hashlist);
-		list_del(&lp->lpni_hashlist);
-		kfree(lp);
-	}
 }
 
 static struct lnet_peer_ni *
@@ -247,74 +230,143 @@ lnet_find_peer_ni_locked(lnet_nid_t nid, int cpt)
 	return lpni;
 }
 
-void
-lnet_destroy_peer_ni_locked(struct lnet_peer_ni *lp)
+static void
+lnet_try_destroy_peer_hierarchy_locked(struct lnet_peer_ni *lpni)
 {
-	struct lnet_peer_table *ptable;
+	struct lnet_peer_net *peer_net;
+	struct lnet_peer *peer;
 
-	LASSERT(atomic_read(&lp->lpni_refcount) == 0);
-	LASSERT(!lp->lpni_rtr_refcount);
-	LASSERT(list_empty(&lp->lpni_txq));
-	LASSERT(list_empty(&lp->lpni_hashlist));
-	LASSERT(!lp->lpni_txqnob);
+	/* TODO: could the below situation happen? accessing an already
+	 * destroyed peer?
+	 */
+	if (!lpni->lpni_peer_net ||
+	    !lpni->lpni_peer_net->lpn_peer)
+		return;
 
-	ptable = the_lnet.ln_peer_tables[lp->lpni_cpt];
-	LASSERT(ptable->pt_number > 0);
-	ptable->pt_number--;
+	peer_net = lpni->lpni_peer_net;
+	peer = lpni->lpni_peer_net->lpn_peer;
 
-	lp->lpni_net = NULL;
+	list_del_init(&lpni->lpni_on_peer_net_list);
+	lpni->lpni_peer_net = NULL;
 
-	list_add(&lp->lpni_hashlist, &ptable->pt_deathrow);
-	LASSERT(ptable->pt_zombies > 0);
-	ptable->pt_zombies--;
+	/* if peer_net is empty, then remove it from the peer */
+	if (list_empty(&peer_net->lpn_peer_nis)) {
+		list_del_init(&peer_net->lpn_on_peer_list);
+		peer_net->lpn_peer = NULL;
+		kfree(peer_net);
+
+		/* If the peer is empty then remove it from the
+		 * the_lnet.ln_peers
+		 */
+		if (list_empty(&peer->lp_peer_nets)) {
+			list_del_init(&peer->lp_on_lnet_peer_list);
+			kfree(peer);
+		}
+	}
 }
 
-struct lnet_peer_ni *
-lnet_find_peer_locked(struct lnet_peer_table *ptable, lnet_nid_t nid)
+static int
+lnet_build_peer_hierarchy(struct lnet_peer_ni *lpni)
 {
-	struct list_head *peers;
-	struct lnet_peer_ni *lp;
+	struct lnet_peer *peer;
+	struct lnet_peer_net *peer_net;
+	__u32 lpni_net = LNET_NIDNET(lpni->lpni_nid);
 
-	LASSERT(!the_lnet.ln_shutdown);
+	peer = NULL;
+	peer_net = NULL;
 
-	peers = &ptable->pt_hash[lnet_nid2peerhash(nid)];
-	list_for_each_entry(lp, peers, lpni_hashlist) {
-		if (lp->lpni_nid == nid) {
-			lnet_peer_ni_addref_locked(lp);
-			return lp;
-		}
+	peer = kzalloc(sizeof(*peer), GFP_KERNEL);
+	if (!peer)
+		return -ENOMEM;
+
+	peer_net = kzalloc(sizeof(*peer_net), GFP_KERNEL);
+	if (!peer_net) {
+		kfree(peer);
+		return -ENOMEM;
 	}
 
-	return NULL;
+	INIT_LIST_HEAD(&peer->lp_on_lnet_peer_list);
+	INIT_LIST_HEAD(&peer->lp_peer_nets);
+	INIT_LIST_HEAD(&peer_net->lpn_on_peer_list);
+	INIT_LIST_HEAD(&peer_net->lpn_peer_nis);
+
+	/* build the hierarchy */
+	peer_net->lpn_net_id = lpni_net;
+	peer_net->lpn_peer = peer;
+	lpni->lpni_peer_net = peer_net;
+	peer->lp_primary_nid = lpni->lpni_nid;
+	list_add_tail(&peer_net->lpn_on_peer_list, &peer->lp_peer_nets);
+	list_add_tail(&lpni->lpni_on_peer_net_list, &peer_net->lpn_peer_nis);
+	list_add_tail(&peer->lp_on_lnet_peer_list, &the_lnet.ln_peers);
+
+	return 0;
+}
+
+void
+lnet_destroy_peer_ni_locked(struct lnet_peer_ni *lpni)
+{
+	struct lnet_peer_table *ptable;
+
+	LASSERT(atomic_read(&lpni->lpni_refcount) == 0);
+	LASSERT(lpni->lpni_rtr_refcount == 0);
+	LASSERT(list_empty(&lpni->lpni_txq));
+	LASSERT(list_empty(&lpni->lpni_hashlist));
+	LASSERT(lpni->lpni_txqnob == 0);
+	LASSERT(lpni->lpni_peer_net);
+	LASSERT(lpni->lpni_peer_net->lpn_peer);
+
+	ptable = the_lnet.ln_peer_tables[lpni->lpni_cpt];
+	LASSERT(ptable->pt_number > 0);
+	ptable->pt_number--;
+
+	lpni->lpni_net = NULL;
+
+	lnet_try_destroy_peer_hierarchy_locked(lpni);
+
+	kfree(lpni);
+
+	LASSERT(ptable->pt_zombies > 0);
+	ptable->pt_zombies--;
 }
 
 int
-lnet_nid2peerni_locked(struct lnet_peer_ni **lpp, lnet_nid_t nid, int cpt)
+lnet_nid2peerni_locked(struct lnet_peer_ni **lpnip, lnet_nid_t nid, int cpt)
 {
 	struct lnet_peer_table *ptable;
-	struct lnet_peer_ni *lp = NULL;
-	struct lnet_peer_ni *lp2;
+	struct lnet_peer_ni *lpni = NULL;
+	struct lnet_peer_ni *lpni2;
 	int cpt2;
 	int rc = 0;
 
-	*lpp = NULL;
+	*lpnip = NULL;
 	if (the_lnet.ln_shutdown) /* it's shutting down */
 		return -ESHUTDOWN;
 
-	/* cpt can be LNET_LOCK_EX if it's called from router functions */
-	cpt2 = cpt != LNET_LOCK_EX ? cpt : lnet_cpt_of_nid_locked(nid, NULL);
+	/*
+	 * calculate cpt2 with the standard hash function
+	 * This cpt2 becomes the slot where we'll find or create the peer.
+	 */
+	cpt2 = lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER);
 
-	ptable = the_lnet.ln_peer_tables[cpt2];
-	lp = lnet_find_peer_locked(ptable, nid);
-	if (lp) {
-		*lpp = lp;
-		return 0;
+	/*
+	 * Any changes to the peer tables happen under exclusive write
+	 * lock. Any reads to the peer tables can be done via a standard
+	 * CPT read lock.
+	 */
+	if (cpt != LNET_LOCK_EX) {
+		lnet_net_unlock(cpt);
+		lnet_net_lock(LNET_LOCK_EX);
 	}
 
-	if (!list_empty(&ptable->pt_deathrow)) {
-		lp = list_entry(ptable->pt_deathrow.next,
-				struct lnet_peer_ni, lpni_hashlist);
-		list_del(&lp->lpni_hashlist);
+	ptable = the_lnet.ln_peer_tables[cpt2];
+	lpni = lnet_get_peer_ni_locked(ptable, nid);
+	if (lpni) {
+		*lpnip = lpni;
+		if (cpt != LNET_LOCK_EX) {
+			lnet_net_unlock(LNET_LOCK_EX);
+			lnet_net_lock(cpt);
+		}
+		return 0;
 	}
 
 	/*
@@ -322,68 +374,72 @@ lnet_nid2peerni_locked(struct lnet_peer_ni **lpp, lnet_nid_t nid, int cpt)
 	 * and destroyed locks and peer-table before I finish the allocation
 	 */
 	ptable->pt_number++;
-	lnet_net_unlock(cpt);
+	lnet_net_unlock(LNET_LOCK_EX);
 
-	if (lp)
-		memset(lp, 0, sizeof(*lp));
-	else
-		lp = kzalloc_cpt(sizeof(*lp), GFP_NOFS, cpt2);
-
-	if (!lp) {
+	lpni = kzalloc_cpt(sizeof(*lpni), GFP_KERNEL, cpt2);
+	if (!lpni) {
 		rc = -ENOMEM;
 		lnet_net_lock(cpt);
 		goto out;
 	}
 
-	INIT_LIST_HEAD(&lp->lpni_txq);
-	INIT_LIST_HEAD(&lp->lpni_rtrq);
-	INIT_LIST_HEAD(&lp->lpni_routes);
-
-	lp->lpni_notify = 0;
-	lp->lpni_notifylnd = 0;
-	lp->lpni_notifying = 0;
-	lp->lpni_alive_count = 0;
-	lp->lpni_timestamp = 0;
-	lp->lpni_alive = !lnet_peers_start_down(); /* 1 bit!! */
-	lp->lpni_last_alive = ktime_get_seconds(); /* assumes alive */
-	lp->lpni_last_query = 0; /* haven't asked NI yet */
-	lp->lpni_ping_timestamp = 0;
-	lp->lpni_ping_feats = LNET_PING_FEAT_INVAL;
-	lp->lpni_nid = nid;
-	lp->lpni_cpt = cpt2;
-	atomic_set(&lp->lpni_refcount, 2);	/* 1 for caller; 1 for hash */
-	lp->lpni_rtr_refcount = 0;
+	INIT_LIST_HEAD(&lpni->lpni_txq);
+	INIT_LIST_HEAD(&lpni->lpni_rtrq);
+	INIT_LIST_HEAD(&lpni->lpni_routes);
 
-	lnet_net_lock(cpt);
+	lpni->lpni_alive = !lnet_peers_start_down(); /* 1 bit!! */
+	lpni->lpni_last_alive = ktime_get_seconds(); /* assumes alive */
+	lpni->lpni_ping_feats = LNET_PING_FEAT_INVAL;
+	lpni->lpni_nid = nid;
+	lpni->lpni_cpt = cpt2;
+	atomic_set(&lpni->lpni_refcount, 2);	/* 1 for caller; 1 for hash */
+
+	rc = lnet_build_peer_hierarchy(lpni);
+	if (rc != 0)
+		goto out;
+
+	lnet_net_lock(LNET_LOCK_EX);
 
 	if (the_lnet.ln_shutdown) {
 		rc = -ESHUTDOWN;
 		goto out;
 	}
 
-	lp2 = lnet_find_peer_locked(ptable, nid);
-	if (lp2) {
-		*lpp = lp2;
+	lpni2 = lnet_get_peer_ni_locked(ptable, nid);
+	if (lpni2) {
+		*lpnip = lpni2;
 		goto out;
 	}
 
-	lp->lpni_net = lnet_get_net_locked(LNET_NIDNET(lp->lpni_nid));
-	lp->lpni_txcredits =
-		lp->lpni_mintxcredits =
-		lp->lpni_net->net_tunables.lct_peer_tx_credits;
-	lp->lpni_rtrcredits =
-		lp->lpni_minrtrcredits = lnet_peer_buffer_credits(lp->lpni_net);
+	lpni->lpni_net = lnet_get_net_locked(LNET_NIDNET(lpni->lpni_nid));
+	lpni->lpni_txcredits =
+		lpni->lpni_mintxcredits =
+		lpni->lpni_net->net_tunables.lct_peer_tx_credits;
+	lpni->lpni_rtrcredits =
+		lpni->lpni_minrtrcredits =
+		lnet_peer_buffer_credits(lpni->lpni_net);
 
-	list_add_tail(&lp->lpni_hashlist,
+	list_add_tail(&lpni->lpni_hashlist,
 		      &ptable->pt_hash[lnet_nid2peerhash(nid)]);
 	ptable->pt_version++;
-	*lpp = lp;
+	*lpnip = lpni;
+
+	if (cpt != LNET_LOCK_EX) {
+		lnet_net_unlock(LNET_LOCK_EX);
+		lnet_net_lock(cpt);
+	}
 
 	return 0;
 out:
-	if (lp)
-		list_add(&lp->lpni_hashlist, &ptable->pt_deathrow);
+	if (lpni) {
+		lnet_try_destroy_peer_hierarchy_locked(lpni);
+		kfree(lpni);
+	}
 	ptable->pt_number--;
+	if (cpt != LNET_LOCK_EX) {
+		lnet_net_unlock(LNET_LOCK_EX);
+		lnet_net_lock(cpt);
+	}
 	return rc;
 }
 
diff --git a/drivers/staging/lustre/lnet/lnet/router_proc.c b/drivers/staging/lustre/lnet/lnet/router_proc.c
index 12a4b1708d3c..977a937f261c 100644
--- a/drivers/staging/lustre/lnet/lnet/router_proc.c
+++ b/drivers/staging/lustre/lnet/lnet/router_proc.c
@@ -385,6 +385,9 @@ static int proc_lnet_routers(struct ctl_table *table, int write,
 	return rc;
 }
 
+/* TODO: there should be no direct access to ptable. We should add a set
+ * of APIs that give access to the ptable and its members
+ */
 static int proc_lnet_peers(struct ctl_table *table, int write,
 			   void __user *buffer, size_t *lenp, loff_t *ppos)
 {




More information about the lustre-devel mailing list