[lustre-devel] [PATCH 08/34] LU-7734 lnet: Multi-Rail peer split

James Simmons jsimmons at infradead.org
Sat Sep 29 16:01:38 PDT 2018


On Tue, 25 Sep 2018, NeilBrown wrote:

> From: Amir Shehata <amir.shehata at intel.com>
> 
> [[Note, the preceding few patches are part of this
>   in the upstream lustre code - they were split
>   for easier merging into linux.
>   ]]
> 
> Split the peer structure into peer/peer_net/peer_ni, as
> described in the Multi-Rail HLD.
> 
> Removed deathrow list in peers, instead peers are immediately
> deleted. deathrow complicates memory management for peers to
> little gain.
> 
> Moved to LNET_LOCK_EX for any operations which will modify the
> peer tables. And CPT locks for any operatios which read the peer
> tables. Therefore there is no need to use lnet_cpt_of_nid() to
> calculate the CPT of the peer NID, instead we use lnet_nid_cpt_hash()
> to distribute peers across multiple CPTs.
> 
> It is no longer true that peers and NIs would exist on
> the same CPT. In the new design peers and NIs don't have a 1-1
> relationship. You can send to the same peer from several NIs, which
> can exist on separate CPTs.
> 
> Signed-off-by: Amir Shehata <amir.shehata at intel.com>
> Change-Id: Ida41d830d38d0ab2bb551476e4a8866d52a25fe2
> Reviewed-on: http://review.whamcloud.com/18293
> Reviewed-by: Olaf Weber <olaf at sgi.com>
> Reviewed-by: Doug Oucharek <doug.s.oucharek at intel.com>
> Signed-off-by: NeilBrown <neilb at suse.com>

The patch is fine but the commit message needs to be properly formatted.
While patches to the OpenSFS tree are titled with "LU-7734 lnet:", for the
linux client we changed that to be more in line with what the kernel does.

LU-7734 lnet: Multi-Rail peer split  --->>> lustre: lnet: Multi-Rail peer split

To the outside reviewer LU-7734 doesn't mean much. Also please replace
the Change-Id: line with the URL link for the work, which in turn explains LU-7734.

WC-bug-id: https://jira.whamcloud.com/browse/LU-7734

Same for the other patches in this series. Seem reasonable?

> ---
>  .../staging/lustre/include/linux/lnet/lib-lnet.h   |    2 
>  .../staging/lustre/include/linux/lnet/lib-types.h  |   29 ++
>  drivers/staging/lustre/lnet/lnet/api-ni.c          |    1 
>  drivers/staging/lustre/lnet/lnet/peer.c            |  260 ++++++++++++--------
>  drivers/staging/lustre/lnet/lnet/router_proc.c     |    3 
>  5 files changed, 191 insertions(+), 104 deletions(-)
> 
> diff --git a/drivers/staging/lustre/include/linux/lnet/lib-lnet.h b/drivers/staging/lustre/include/linux/lnet/lib-lnet.h
> index 656177b64336..bf076298de71 100644
> --- a/drivers/staging/lustre/include/linux/lnet/lib-lnet.h
> +++ b/drivers/staging/lustre/include/linux/lnet/lib-lnet.h
> @@ -637,8 +637,6 @@ bool lnet_net_unique(__u32 net_id, struct list_head *nilist,
>  bool lnet_ni_unique_net(struct list_head *nilist, char *iface);
>  
>  int lnet_nid2peerni_locked(struct lnet_peer_ni **lpp, lnet_nid_t nid, int cpt);
> -struct lnet_peer_ni *lnet_find_peer_locked(struct lnet_peer_table *ptable,
> -					   lnet_nid_t nid);
>  struct lnet_peer_ni *lnet_find_peer_ni_locked(lnet_nid_t nid, int cpt);
>  void lnet_peer_tables_cleanup(struct lnet_ni *ni);
>  void lnet_peer_tables_destroy(void);
> diff --git a/drivers/staging/lustre/include/linux/lnet/lib-types.h b/drivers/staging/lustre/include/linux/lnet/lib-types.h
> index 9a2cf319dba9..9f70c094cc4c 100644
> --- a/drivers/staging/lustre/include/linux/lnet/lib-types.h
> +++ b/drivers/staging/lustre/include/linux/lnet/lib-types.h
> @@ -384,6 +384,7 @@ struct lnet_rc_data {
>  };
>  
>  struct lnet_peer_ni {
> +	struct list_head	lpni_on_peer_net_list;
>  	/* chain on peer hash */
>  	struct list_head	 lpni_hashlist;
>  	/* messages blocking for tx credits */
> @@ -394,6 +395,7 @@ struct lnet_peer_ni {
>  	struct list_head	 lpni_rtr_list;
>  	/* # tx credits available */
>  	int			 lpni_txcredits;
> +	struct lnet_peer_net	*lpni_peer_net;
>  	/* low water mark */
>  	int			 lpni_mintxcredits;
>  	/* # router credits */
> @@ -442,6 +444,31 @@ struct lnet_peer_ni {
>  	struct lnet_rc_data	*lpni_rcd;
>  };
>  
> +struct lnet_peer {
> +	/* chain on global peer list */
> +	struct list_head	lp_on_lnet_peer_list;
> +
> +	/* list of peer nets */
> +	struct list_head	lp_peer_nets;
> +
> +	/* primary NID of the peer */
> +	lnet_nid_t		lp_primary_nid;
> +};
> +
> +struct lnet_peer_net {
> +	/* chain on peer block */
> +	struct list_head	lpn_on_peer_list;
> +
> +	/* list of peer_nis on this network */
> +	struct list_head	lpn_peer_nis;
> +
> +	/* pointer to the peer I'm part of */
> +	struct lnet_peer	*lpn_peer;
> +
> +	/* Net ID */
> +	__u32			lpn_net_id;
> +};
> +
>  /* peer hash size */
>  #define LNET_PEER_HASH_BITS	9
>  #define LNET_PEER_HASH_SIZE	(1 << LNET_PEER_HASH_BITS)
> @@ -686,6 +713,8 @@ struct lnet {
>  	struct lnet_msg_container	**ln_msg_containers;
>  	struct lnet_counters		**ln_counters;
>  	struct lnet_peer_table		**ln_peer_tables;
> +	/* list of configured or discovered peers */
> +	struct list_head		ln_peers;
>  	/* failure simulation */
>  	struct list_head		  ln_test_peers;
>  	struct list_head		  ln_drop_rules;
> diff --git a/drivers/staging/lustre/lnet/lnet/api-ni.c b/drivers/staging/lustre/lnet/lnet/api-ni.c
> index 20fa3fea04b9..821b030f9621 100644
> --- a/drivers/staging/lustre/lnet/lnet/api-ni.c
> +++ b/drivers/staging/lustre/lnet/lnet/api-ni.c
> @@ -542,6 +542,7 @@ lnet_prepare(lnet_pid_t requested_pid)
>  	the_lnet.ln_pid = requested_pid;
>  
>  	INIT_LIST_HEAD(&the_lnet.ln_test_peers);
> +	INIT_LIST_HEAD(&the_lnet.ln_peers);
>  	INIT_LIST_HEAD(&the_lnet.ln_nets);
>  	INIT_LIST_HEAD(&the_lnet.ln_routers);
>  	INIT_LIST_HEAD(&the_lnet.ln_drop_rules);
> diff --git a/drivers/staging/lustre/lnet/lnet/peer.c b/drivers/staging/lustre/lnet/lnet/peer.c
> index 376e3459fa92..97ee1f5cfd2f 100644
> --- a/drivers/staging/lustre/lnet/lnet/peer.c
> +++ b/drivers/staging/lustre/lnet/lnet/peer.c
> @@ -54,8 +54,6 @@ lnet_peer_tables_create(void)
>  	}
>  
>  	cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) {
> -		INIT_LIST_HEAD(&ptable->pt_deathrow);
> -
>  		hash = kvmalloc_cpt(LNET_PEER_HASH_SIZE * sizeof(*hash),
>  				    GFP_KERNEL, i);
>  		if (!hash) {
> @@ -88,8 +86,6 @@ lnet_peer_tables_destroy(void)
>  		if (!hash) /* not initialized */
>  			break;
>  
> -		LASSERT(list_empty(&ptable->pt_deathrow));
> -
>  		ptable->pt_hash = NULL;
>  		for (j = 0; j < LNET_PEER_HASH_SIZE; j++)
>  			LASSERT(list_empty(&hash[j]));
> @@ -123,7 +119,7 @@ lnet_peer_table_cleanup_locked(struct lnet_ni *ni,
>  }
>  
>  static void
> -lnet_peer_table_deathrow_wait_locked(struct lnet_peer_table *ptable,
> +lnet_peer_table_finalize_wait_locked(struct lnet_peer_table *ptable,
>  				     int cpt_locked)
>  {
>  	int i;
> @@ -173,12 +169,8 @@ void
>  lnet_peer_tables_cleanup(struct lnet_ni *ni)
>  {
>  	struct lnet_peer_table *ptable;
> -	struct list_head deathrow;
> -	struct lnet_peer_ni *lp;
>  	int i;
>  
> -	INIT_LIST_HEAD(&deathrow);
> -
>  	LASSERT(the_lnet.ln_shutdown || ni);
>  	/*
>  	 * If just deleting the peers for a NI, get rid of any routes these
> @@ -191,8 +183,7 @@ lnet_peer_tables_cleanup(struct lnet_ni *ni)
>  	}
>  
>  	/*
> -	 * Start the process of moving the applicable peers to
> -	 * deathrow.
> +	 * Start the cleanup process
>  	 */
>  	cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) {
>  		lnet_net_lock(LNET_LOCK_EX);
> @@ -200,20 +191,12 @@ lnet_peer_tables_cleanup(struct lnet_ni *ni)
>  		lnet_net_unlock(LNET_LOCK_EX);
>  	}
>  
> -	/* Cleanup all entries on deathrow. */
> +	/* Wait until all peers have been destroyed. */
>  	cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) {
>  		lnet_net_lock(LNET_LOCK_EX);
> -		lnet_peer_table_deathrow_wait_locked(ptable, i);
> -		list_splice_init(&ptable->pt_deathrow, &deathrow);
> +		lnet_peer_table_finalize_wait_locked(ptable, i);
>  		lnet_net_unlock(LNET_LOCK_EX);
>  	}
> -
> -	while (!list_empty(&deathrow)) {
> -		lp = list_entry(deathrow.next, struct lnet_peer_ni,
> -				lpni_hashlist);
> -		list_del(&lp->lpni_hashlist);
> -		kfree(lp);
> -	}
>  }
>  
>  static struct lnet_peer_ni *
> @@ -247,74 +230,143 @@ lnet_find_peer_ni_locked(lnet_nid_t nid, int cpt)
>  	return lpni;
>  }
>  
> -void
> -lnet_destroy_peer_ni_locked(struct lnet_peer_ni *lp)
> +static void
> +lnet_try_destroy_peer_hierarchy_locked(struct lnet_peer_ni *lpni)
>  {
> -	struct lnet_peer_table *ptable;
> +	struct lnet_peer_net *peer_net;
> +	struct lnet_peer *peer;
>  
> -	LASSERT(atomic_read(&lp->lpni_refcount) == 0);
> -	LASSERT(!lp->lpni_rtr_refcount);
> -	LASSERT(list_empty(&lp->lpni_txq));
> -	LASSERT(list_empty(&lp->lpni_hashlist));
> -	LASSERT(!lp->lpni_txqnob);
> +	/* TODO: could the below situation happen? accessing an already
> +	 * destroyed peer?
> +	 */
> +	if (!lpni->lpni_peer_net ||
> +	    !lpni->lpni_peer_net->lpn_peer)
> +		return;
>  
> -	ptable = the_lnet.ln_peer_tables[lp->lpni_cpt];
> -	LASSERT(ptable->pt_number > 0);
> -	ptable->pt_number--;
> +	peer_net = lpni->lpni_peer_net;
> +	peer = lpni->lpni_peer_net->lpn_peer;
>  
> -	lp->lpni_net = NULL;
> +	list_del_init(&lpni->lpni_on_peer_net_list);
> +	lpni->lpni_peer_net = NULL;
>  
> -	list_add(&lp->lpni_hashlist, &ptable->pt_deathrow);
> -	LASSERT(ptable->pt_zombies > 0);
> -	ptable->pt_zombies--;
> +	/* if peer_net is empty, then remove it from the peer */
> +	if (list_empty(&peer_net->lpn_peer_nis)) {
> +		list_del_init(&peer_net->lpn_on_peer_list);
> +		peer_net->lpn_peer = NULL;
> +		kfree(peer_net);
> +
> +		/* If the peer is empty then remove it from the
> +		 * the_lnet.ln_peers
> +		 */
> +		if (list_empty(&peer->lp_peer_nets)) {
> +			list_del_init(&peer->lp_on_lnet_peer_list);
> +			kfree(peer);
> +		}
> +	}
>  }
>  
> -struct lnet_peer_ni *
> -lnet_find_peer_locked(struct lnet_peer_table *ptable, lnet_nid_t nid)
> +static int
> +lnet_build_peer_hierarchy(struct lnet_peer_ni *lpni)
>  {
> -	struct list_head *peers;
> -	struct lnet_peer_ni *lp;
> +	struct lnet_peer *peer;
> +	struct lnet_peer_net *peer_net;
> +	__u32 lpni_net = LNET_NIDNET(lpni->lpni_nid);
>  
> -	LASSERT(!the_lnet.ln_shutdown);
> +	peer = NULL;
> +	peer_net = NULL;
>  
> -	peers = &ptable->pt_hash[lnet_nid2peerhash(nid)];
> -	list_for_each_entry(lp, peers, lpni_hashlist) {
> -		if (lp->lpni_nid == nid) {
> -			lnet_peer_ni_addref_locked(lp);
> -			return lp;
> -		}
> +	peer = kzalloc(sizeof(*peer), GFP_KERNEL);
> +	if (!peer)
> +		return -ENOMEM;
> +
> +	peer_net = kzalloc(sizeof(*peer_net), GFP_KERNEL);
> +	if (!peer_net) {
> +		kfree(peer);
> +		return -ENOMEM;
>  	}
>  
> -	return NULL;
> +	INIT_LIST_HEAD(&peer->lp_on_lnet_peer_list);
> +	INIT_LIST_HEAD(&peer->lp_peer_nets);
> +	INIT_LIST_HEAD(&peer_net->lpn_on_peer_list);
> +	INIT_LIST_HEAD(&peer_net->lpn_peer_nis);
> +
> +	/* build the hierarchy */
> +	peer_net->lpn_net_id = lpni_net;
> +	peer_net->lpn_peer = peer;
> +	lpni->lpni_peer_net = peer_net;
> +	peer->lp_primary_nid = lpni->lpni_nid;
> +	list_add_tail(&peer_net->lpn_on_peer_list, &peer->lp_peer_nets);
> +	list_add_tail(&lpni->lpni_on_peer_net_list, &peer_net->lpn_peer_nis);
> +	list_add_tail(&peer->lp_on_lnet_peer_list, &the_lnet.ln_peers);
> +
> +	return 0;
> +}
> +
> +void
> +lnet_destroy_peer_ni_locked(struct lnet_peer_ni *lpni)
> +{
> +	struct lnet_peer_table *ptable;
> +
> +	LASSERT(atomic_read(&lpni->lpni_refcount) == 0);
> +	LASSERT(lpni->lpni_rtr_refcount == 0);
> +	LASSERT(list_empty(&lpni->lpni_txq));
> +	LASSERT(list_empty(&lpni->lpni_hashlist));
> +	LASSERT(lpni->lpni_txqnob == 0);
> +	LASSERT(lpni->lpni_peer_net);
> +	LASSERT(lpni->lpni_peer_net->lpn_peer);
> +
> +	ptable = the_lnet.ln_peer_tables[lpni->lpni_cpt];
> +	LASSERT(ptable->pt_number > 0);
> +	ptable->pt_number--;
> +
> +	lpni->lpni_net = NULL;
> +
> +	lnet_try_destroy_peer_hierarchy_locked(lpni);
> +
> +	kfree(lpni);
> +
> +	LASSERT(ptable->pt_zombies > 0);
> +	ptable->pt_zombies--;
>  }
>  
>  int
> -lnet_nid2peerni_locked(struct lnet_peer_ni **lpp, lnet_nid_t nid, int cpt)
> +lnet_nid2peerni_locked(struct lnet_peer_ni **lpnip, lnet_nid_t nid, int cpt)
>  {
>  	struct lnet_peer_table *ptable;
> -	struct lnet_peer_ni *lp = NULL;
> -	struct lnet_peer_ni *lp2;
> +	struct lnet_peer_ni *lpni = NULL;
> +	struct lnet_peer_ni *lpni2;
>  	int cpt2;
>  	int rc = 0;
>  
> -	*lpp = NULL;
> +	*lpnip = NULL;
>  	if (the_lnet.ln_shutdown) /* it's shutting down */
>  		return -ESHUTDOWN;
>  
> -	/* cpt can be LNET_LOCK_EX if it's called from router functions */
> -	cpt2 = cpt != LNET_LOCK_EX ? cpt : lnet_cpt_of_nid_locked(nid, NULL);
> +	/*
> +	 * calculate cpt2 with the standard hash function
> +	 * This cpt2 becomes the slot where we'll find or create the peer.
> +	 */
> +	cpt2 = lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER);
>  
> -	ptable = the_lnet.ln_peer_tables[cpt2];
> -	lp = lnet_find_peer_locked(ptable, nid);
> -	if (lp) {
> -		*lpp = lp;
> -		return 0;
> +	/*
> +	 * Any changes to the peer tables happen under exclusive write
> +	 * lock. Any reads to the peer tables can be done via a standard
> +	 * CPT read lock.
> +	 */
> +	if (cpt != LNET_LOCK_EX) {
> +		lnet_net_unlock(cpt);
> +		lnet_net_lock(LNET_LOCK_EX);
>  	}
>  
> -	if (!list_empty(&ptable->pt_deathrow)) {
> -		lp = list_entry(ptable->pt_deathrow.next,
> -				struct lnet_peer_ni, lpni_hashlist);
> -		list_del(&lp->lpni_hashlist);
> +	ptable = the_lnet.ln_peer_tables[cpt2];
> +	lpni = lnet_get_peer_ni_locked(ptable, nid);
> +	if (lpni) {
> +		*lpnip = lpni;
> +		if (cpt != LNET_LOCK_EX) {
> +			lnet_net_unlock(LNET_LOCK_EX);
> +			lnet_net_lock(cpt);
> +		}
> +		return 0;
>  	}
>  
>  	/*
> @@ -322,68 +374,72 @@ lnet_nid2peerni_locked(struct lnet_peer_ni **lpp, lnet_nid_t nid, int cpt)
>  	 * and destroyed locks and peer-table before I finish the allocation
>  	 */
>  	ptable->pt_number++;
> -	lnet_net_unlock(cpt);
> +	lnet_net_unlock(LNET_LOCK_EX);
>  
> -	if (lp)
> -		memset(lp, 0, sizeof(*lp));
> -	else
> -		lp = kzalloc_cpt(sizeof(*lp), GFP_NOFS, cpt2);
> -
> -	if (!lp) {
> +	lpni = kzalloc_cpt(sizeof(*lpni), GFP_KERNEL, cpt2);
> +	if (!lpni) {
>  		rc = -ENOMEM;
>  		lnet_net_lock(cpt);
>  		goto out;
>  	}
>  
> -	INIT_LIST_HEAD(&lp->lpni_txq);
> -	INIT_LIST_HEAD(&lp->lpni_rtrq);
> -	INIT_LIST_HEAD(&lp->lpni_routes);
> -
> -	lp->lpni_notify = 0;
> -	lp->lpni_notifylnd = 0;
> -	lp->lpni_notifying = 0;
> -	lp->lpni_alive_count = 0;
> -	lp->lpni_timestamp = 0;
> -	lp->lpni_alive = !lnet_peers_start_down(); /* 1 bit!! */
> -	lp->lpni_last_alive = ktime_get_seconds(); /* assumes alive */
> -	lp->lpni_last_query = 0; /* haven't asked NI yet */
> -	lp->lpni_ping_timestamp = 0;
> -	lp->lpni_ping_feats = LNET_PING_FEAT_INVAL;
> -	lp->lpni_nid = nid;
> -	lp->lpni_cpt = cpt2;
> -	atomic_set(&lp->lpni_refcount, 2);	/* 1 for caller; 1 for hash */
> -	lp->lpni_rtr_refcount = 0;
> +	INIT_LIST_HEAD(&lpni->lpni_txq);
> +	INIT_LIST_HEAD(&lpni->lpni_rtrq);
> +	INIT_LIST_HEAD(&lpni->lpni_routes);
>  
> -	lnet_net_lock(cpt);
> +	lpni->lpni_alive = !lnet_peers_start_down(); /* 1 bit!! */
> +	lpni->lpni_last_alive = ktime_get_seconds(); /* assumes alive */
> +	lpni->lpni_ping_feats = LNET_PING_FEAT_INVAL;
> +	lpni->lpni_nid = nid;
> +	lpni->lpni_cpt = cpt2;
> +	atomic_set(&lpni->lpni_refcount, 2);	/* 1 for caller; 1 for hash */
> +
> +	rc = lnet_build_peer_hierarchy(lpni);
> +	if (rc != 0)
> +		goto out;
> +
> +	lnet_net_lock(LNET_LOCK_EX);
>  
>  	if (the_lnet.ln_shutdown) {
>  		rc = -ESHUTDOWN;
>  		goto out;
>  	}
>  
> -	lp2 = lnet_find_peer_locked(ptable, nid);
> -	if (lp2) {
> -		*lpp = lp2;
> +	lpni2 = lnet_get_peer_ni_locked(ptable, nid);
> +	if (lpni2) {
> +		*lpnip = lpni2;
>  		goto out;
>  	}
>  
> -	lp->lpni_net = lnet_get_net_locked(LNET_NIDNET(lp->lpni_nid));
> -	lp->lpni_txcredits =
> -		lp->lpni_mintxcredits =
> -		lp->lpni_net->net_tunables.lct_peer_tx_credits;
> -	lp->lpni_rtrcredits =
> -		lp->lpni_minrtrcredits = lnet_peer_buffer_credits(lp->lpni_net);
> +	lpni->lpni_net = lnet_get_net_locked(LNET_NIDNET(lpni->lpni_nid));
> +	lpni->lpni_txcredits =
> +		lpni->lpni_mintxcredits =
> +		lpni->lpni_net->net_tunables.lct_peer_tx_credits;
> +	lpni->lpni_rtrcredits =
> +		lpni->lpni_minrtrcredits =
> +		lnet_peer_buffer_credits(lpni->lpni_net);
>  
> -	list_add_tail(&lp->lpni_hashlist,
> +	list_add_tail(&lpni->lpni_hashlist,
>  		      &ptable->pt_hash[lnet_nid2peerhash(nid)]);
>  	ptable->pt_version++;
> -	*lpp = lp;
> +	*lpnip = lpni;
> +
> +	if (cpt != LNET_LOCK_EX) {
> +		lnet_net_unlock(LNET_LOCK_EX);
> +		lnet_net_lock(cpt);
> +	}
>  
>  	return 0;
>  out:
> -	if (lp)
> -		list_add(&lp->lpni_hashlist, &ptable->pt_deathrow);
> +	if (lpni) {
> +		lnet_try_destroy_peer_hierarchy_locked(lpni);
> +		kfree(lpni);
> +	}
>  	ptable->pt_number--;
> +	if (cpt != LNET_LOCK_EX) {
> +		lnet_net_unlock(LNET_LOCK_EX);
> +		lnet_net_lock(cpt);
> +	}
>  	return rc;
>  }
>  
> diff --git a/drivers/staging/lustre/lnet/lnet/router_proc.c b/drivers/staging/lustre/lnet/lnet/router_proc.c
> index 12a4b1708d3c..977a937f261c 100644
> --- a/drivers/staging/lustre/lnet/lnet/router_proc.c
> +++ b/drivers/staging/lustre/lnet/lnet/router_proc.c
> @@ -385,6 +385,9 @@ static int proc_lnet_routers(struct ctl_table *table, int write,
>  	return rc;
>  }
>  
> +/* TODO: there should be no direct access to ptable. We should add a set
> + * of APIs that give access to the ptable and its members
> + */
>  static int proc_lnet_peers(struct ctl_table *table, int write,
>  			   void __user *buffer, size_t *lenp, loff_t *ppos)
>  {
> 
> 
> 


More information about the lustre-devel mailing list