[lustre-devel] [PATCH 12/34] LU-7734 lnet: NUMA support

James Simmons jsimmons at infradead.org
Sat Sep 29 18:49:59 PDT 2018


> From: Amir Shehata <amir.shehata at intel.com>
> 
> This patch adds NUMA node support. NUMA node information is stored
> in the CPT table. A NUMA node mask is maintained for the entire table
> as well as for each CPT to track the NUMA nodes related to each of
> the CPTs. Following key APIs added:
> 
> cfs_cpt_of_node(): returns the CPT of particular NUMA node
> cfs_cpt_distance(): calculates the distance between two CPTs
> 
> When the LND device is started it finds the NUMA node of the physical
> device and then from there it finds the CPT, which is subsequently
> stored in the NI structure.
> 
> When selecting the NI, the MD CPT is determined and the distance
> between the MD CPT and the device CPT is calculated. The NI
> with the shortest distance is preferred.
> 
> If the device or system is not NUMA aware then the CPT for the
> device will default to CFS_CPT_ANY and the distance calculated
> when CFS_CPT_ANY is used is the largest in the system. I.e.,
> non-NUMA-aware devices are least preferred.
> 
> A NUMA range value can be set. If the value is large enough
> it amounts to basically turning off NUMA criterion completely.
> 
> Signed-off-by: Amir Shehata <amir.shehata at intel.com>
> Change-Id: I2d7c63f8e8fc8e8a6a249b0d6bfdd08fd090a837
> Reviewed-on: http://review.whamcloud.com/18916
> Tested-by: Jenkins
> Tested-by: Maloo <hpdd-maloo at intel.com>

We can remove the Tested-by: as well.

> Reviewed-by: Olaf Weber <olaf at sgi.com>
> Reviewed-by: Doug Oucharek <doug.s.oucharek at intel.com>
> Signed-off-by: NeilBrown <neilb at suse.com>
> ---
>  .../staging/lustre/include/linux/lnet/lib-lnet.h   |    1 
>  .../staging/lustre/include/linux/lnet/lib-types.h  |    3 
>  .../lustre/include/uapi/linux/lnet/libcfs_ioctl.h  |    6 +
>  .../lustre/include/uapi/linux/lnet/lnet-dlc.h      |    6 +
>  .../staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c    |    4 +
>  .../staging/lustre/lnet/klnds/socklnd/socklnd.c    |   13 ++
>  drivers/staging/lustre/lnet/lnet/api-ni.c          |   27 +++
>  drivers/staging/lustre/lnet/lnet/lib-move.c        |  160 +++++++++++++++++---
>  8 files changed, 195 insertions(+), 25 deletions(-)
> 
> diff --git a/drivers/staging/lustre/include/linux/lnet/lib-lnet.h b/drivers/staging/lustre/include/linux/lnet/lib-lnet.h
> index a7cff6426ad8..c338e31b2cdd 100644
> --- a/drivers/staging/lustre/include/linux/lnet/lib-lnet.h
> +++ b/drivers/staging/lustre/include/linux/lnet/lib-lnet.h
> @@ -408,6 +408,7 @@ struct lnet_ni *lnet_net2ni_addref(__u32 net);
>  bool lnet_is_ni_healthy_locked(struct lnet_ni *ni);
>  struct lnet_net *lnet_get_net_locked(u32 net_id);
>  
> +extern unsigned int lnet_numa_range;
>  extern int portal_rotor;
>  
>  int lnet_lib_init(void);
> diff --git a/drivers/staging/lustre/include/linux/lnet/lib-types.h b/drivers/staging/lustre/include/linux/lnet/lib-types.h
> index 22b141cb6cff..5083b72ca20f 100644
> --- a/drivers/staging/lustre/include/linux/lnet/lib-types.h
> +++ b/drivers/staging/lustre/include/linux/lnet/lib-types.h
> @@ -346,6 +346,9 @@ struct lnet_ni {
>  	/* lnd tunables set explicitly */
>  	bool ni_lnd_tunables_set;
>  
> +	/* physical device CPT */
> +	int			dev_cpt;
> +
>  	/* sequence number used to round robin over nis within a net */
>  	u32			ni_seq;
>  
> diff --git a/drivers/staging/lustre/include/uapi/linux/lnet/libcfs_ioctl.h b/drivers/staging/lustre/include/uapi/linux/lnet/libcfs_ioctl.h
> index fa58aaf6ad9d..a231f6d89e95 100644
> --- a/drivers/staging/lustre/include/uapi/linux/lnet/libcfs_ioctl.h
> +++ b/drivers/staging/lustre/include/uapi/linux/lnet/libcfs_ioctl.h
> @@ -142,7 +142,9 @@ struct libcfs_debug_ioctl_data {
>  #define IOC_LIBCFS_ADD_LOCAL_NI		_IOWR(IOC_LIBCFS_TYPE, 95, IOCTL_CONFIG_SIZE)
>  #define IOC_LIBCFS_DEL_LOCAL_NI		_IOWR(IOC_LIBCFS_TYPE, 96, IOCTL_CONFIG_SIZE)
>  #define IOC_LIBCFS_GET_LOCAL_NI		_IOWR(IOC_LIBCFS_TYPE, 97, IOCTL_CONFIG_SIZE)
> -#define IOC_LIBCFS_DBG			_IOWR(IOC_LIBCFS_TYPE, 98, IOCTL_CONFIG_SIZE)
> -#define IOC_LIBCFS_MAX_NR		98
> +#define IOC_LIBCFS_SET_NUMA_RANGE	_IOWR(IOC_LIBCFS_TYPE, 98, IOCTL_CONFIG_SIZE)
> +#define IOC_LIBCFS_GET_NUMA_RANGE	_IOWR(IOC_LIBCFS_TYPE, 99, IOCTL_CONFIG_SIZE)
> +#define IOC_LIBCFS_DBG			_IOWR(IOC_LIBCFS_TYPE, 100, IOCTL_CONFIG_SIZE)
> +#define IOC_LIBCFS_MAX_NR		100
>  
>  #endif /* __LIBCFS_IOCTL_H__ */
> diff --git a/drivers/staging/lustre/include/uapi/linux/lnet/lnet-dlc.h b/drivers/staging/lustre/include/uapi/linux/lnet/lnet-dlc.h
> index bfd9fc6bc4df..5eaaf0eae470 100644
> --- a/drivers/staging/lustre/include/uapi/linux/lnet/lnet-dlc.h
> +++ b/drivers/staging/lustre/include/uapi/linux/lnet/lnet-dlc.h
> @@ -162,6 +162,7 @@ struct lnet_ioctl_config_ni {
>  	__u32			lic_status;
>  	__u32			lic_tcp_bonding;
>  	__u32			lic_idx;
> +	__s32			lic_dev_cpt;
>  	char			lic_bulk[0];
>  };
>  
> @@ -213,6 +214,11 @@ struct lnet_ioctl_peer_cfg {
>  	char prcfg_bulk[0];
>  };
>  
> +struct lnet_ioctl_numa_range {
> +	struct libcfs_ioctl_hdr nr_hdr;
> +	__u32 nr_range;
> +};
> +
>  struct lnet_ioctl_lnet_stats {
>  	struct libcfs_ioctl_hdr st_hdr;
>  	struct lnet_counters st_cntrs;
> diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c
> index 958ac9a99045..2e71abbf8a0c 100644
> --- a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c
> +++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c
> @@ -2829,6 +2829,7 @@ static int kiblnd_startup(struct lnet_ni *ni)
>  	unsigned long flags;
>  	int rc;
>  	int newdev;
> +	int node_id;
>  
>  	LASSERT(ni->ni_net->net_lnd == &the_o2iblnd);
>  
> @@ -2878,6 +2879,9 @@ static int kiblnd_startup(struct lnet_ni *ni)
>  	if (!ibdev)
>  		goto failed;
>  
> +	node_id = dev_to_node(ibdev->ibd_hdev->ibh_ibdev->dma_device);
> +	ni->dev_cpt = cfs_cpt_of_node(lnet_cpt_table(), node_id);
> +
>  	net->ibn_dev = ibdev;
>  	ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ibdev->ibd_ifip);
>  
> diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c
> index 9df66c6d160f..ba1ec35a017a 100644
> --- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c
> +++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c
> @@ -38,6 +38,7 @@
>   * Author: Eric Barton <eric at bartonsoftware.com>
>   */
>  
> +#include <linux/pci.h>
>  #include "socklnd.h"
>  #include <linux/inetdevice.h>
>  
> @@ -2726,6 +2727,8 @@ ksocknal_startup(struct lnet_ni *ni)
>  	struct ksock_net *net;
>  	int rc;
>  	int i;
> +	struct net_device *net_dev;
> +	int node_id;
>  
>  	LASSERT(ni->ni_net->net_lnd == &the_ksocklnd);
>  
> @@ -2773,6 +2776,16 @@ ksocknal_startup(struct lnet_ni *ni)
>  		}
>  	}
>  
> +	net_dev = dev_get_by_name(&init_net,
> +				  net->ksnn_interfaces[0].ksni_name);
> +	if (net_dev) {
> +		node_id = dev_to_node(&net_dev->dev);
> +		ni->dev_cpt = cfs_cpt_of_node(lnet_cpt_table(), node_id);
> +		dev_put(net_dev);
> +	} else {
> +		ni->dev_cpt = CFS_CPT_ANY;
> +	}
> +
>  	/* call it before add it to ksocknal_data.ksnd_nets */
>  	rc = ksocknal_net_start_threads(net, ni->ni_cpts, ni->ni_ncpts);
>  	if (rc)
> diff --git a/drivers/staging/lustre/lnet/lnet/api-ni.c b/drivers/staging/lustre/lnet/lnet/api-ni.c
> index 1ef9a39b517d..67a3301258d4 100644
> --- a/drivers/staging/lustre/lnet/lnet/api-ni.c
> +++ b/drivers/staging/lustre/lnet/lnet/api-ni.c
> @@ -64,6 +64,12 @@ module_param(use_tcp_bonding, int, 0444);
>  MODULE_PARM_DESC(use_tcp_bonding,
>  		 "Set to 1 to use socklnd bonding. 0 to use Multi-Rail");
>  
> +unsigned int lnet_numa_range;
> +EXPORT_SYMBOL(lnet_numa_range);
> +module_param(lnet_numa_range, uint, 0444);
> +MODULE_PARM_DESC(lnet_numa_range,
> +		 "NUMA range to consider during Multi-Rail selection");
> +
>  /*
>   * This sequence number keeps track of how many times DLC was used to
>   * update the configuration. It is incremented on any DLC update and
> @@ -1896,6 +1902,7 @@ lnet_fill_ni_info(struct lnet_ni *ni, struct lnet_ioctl_config_ni *cfg_ni,
>  	cfg_ni->lic_nid = ni->ni_nid;
>  	cfg_ni->lic_status = ni->ni_status->ns_status;
>  	cfg_ni->lic_tcp_bonding = use_tcp_bonding;
> +	cfg_ni->lic_dev_cpt = ni->dev_cpt;
>  
>  	memcpy(&tun->lt_cmn, &ni->ni_net->net_tunables, sizeof(tun->lt_cmn));
>  
> @@ -2642,6 +2649,26 @@ LNetCtl(unsigned int cmd, void *arg)
>  		mutex_unlock(&the_lnet.ln_api_mutex);
>  		return rc;
>  
> +	case IOC_LIBCFS_SET_NUMA_RANGE: {
> +		struct lnet_ioctl_numa_range *numa;
> +
> +		numa = arg;
> +		if (numa->nr_hdr.ioc_len != sizeof(*numa))
> +			return -EINVAL;
> +		lnet_numa_range = numa->nr_range;
> +		return 0;
> +	}
> +
> +	case IOC_LIBCFS_GET_NUMA_RANGE: {
> +		struct lnet_ioctl_numa_range *numa;
> +
> +		numa = arg;
> +		if (numa->nr_hdr.ioc_len != sizeof(*numa))
> +			return -EINVAL;
> +		numa->nr_range = lnet_numa_range;
> +		return 0;
> +	}
> +
>  	case IOC_LIBCFS_GET_BUF: {
>  		struct lnet_ioctl_pool_cfg *pool_cfg;
>  		size_t total = sizeof(*config) + sizeof(*pool_cfg);
> diff --git a/drivers/staging/lustre/lnet/lnet/lib-move.c b/drivers/staging/lustre/lnet/lnet/lib-move.c
> index fbf209610ff9..bf2256da6122 100644
> --- a/drivers/staging/lustre/lnet/lnet/lib-move.c
> +++ b/drivers/staging/lustre/lnet/lnet/lib-move.c
> @@ -1109,6 +1109,10 @@ lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid,
>  	int best_credits = 0;
>  	u32 seq, seq2;
>  	int best_lpni_credits = INT_MIN;
> +	int md_cpt = 0;
> +	unsigned int shortest_distance = UINT_MAX;
> +	unsigned int distance = 0;
> +	bool found_ir = false;
>  
>  again:
>  	/*
> @@ -1127,12 +1131,20 @@ lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid,
>  	routing = false;
>  	local_net = NULL;
>  	best_ni = NULL;
> +	shortest_distance = UINT_MAX;
> +	found_ir = false;
>  
>  	if (the_lnet.ln_shutdown) {
>  		lnet_net_unlock(cpt);
>  		return -ESHUTDOWN;
>  	}
>  
> +	if (msg->msg_md)
> +		/* get the cpt of the MD, used during NUMA based selection */
> +		md_cpt = lnet_cpt_of_cookie(msg->msg_md->md_lh.lh_cookie);
> +	else
> +		md_cpt = CFS_CPT_ANY;
> +
>  	/*
>  	 * initialize the variables which could be reused if we go to
>  	 * again
> @@ -1258,34 +1270,113 @@ lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid,
>  			continue;
>  
>  		/*
> -		 * Second jab at determining best_ni
> -		 * if we get here then the peer we're trying to send
> -		 * to is on a directly connected network, and we'll
> -		 * need to pick the local_ni on that network to send
> -		 * from
> +		 * Iterate through the NIs in this local Net and select
> +		 * the NI to send from. The selection is determined by
> +		 * these 3 criterion in the following priority:
> +		 *	1. NUMA
> +		 *	2. NI available credits
> +		 *	3. Round Robin
>  		 */
>  		while ((ni = lnet_get_next_ni_locked(local_net, ni))) {
>  			if (!lnet_is_ni_healthy_locked(ni))
>  				continue;
> -			/* TODO: compare NUMA distance */
> -			if (ni->ni_tx_queues[cpt]->tq_credits <=
> -			    best_credits) {
> +
> +			/*
> +			 * calculate the distance from the cpt on which
> +			 * the message memory is allocated to the CPT of
> +			 * the NI's physical device
> +			 */
> +			distance = cfs_cpt_distance(lnet_cpt_table(),
> +						    md_cpt,
> +						    ni->dev_cpt);
> +
> +			/*
> +			 * If we already have a closer NI within the NUMA
> +			 * range provided, then there is no need to
> +			 * consider the current NI. Move on to the next
> +			 * one.
> +			 */
> +			if (distance > shortest_distance &&
> +			    distance > lnet_numa_range)
> +				continue;
> +
> +			if (distance < shortest_distance &&
> +			    distance > lnet_numa_range) {
>  				/*
> -				 * all we want is to read tq_credits
> -				 * value as an approximation of how
> -				 * busy the NI is. No need to grab a lock
> +				 * The current NI is the closest one that we
> +				 * have found, even though it's not in the
> +				 * NUMA range specified. This occurs if
> +				 * the NUMA range is less than the least
> +				 * of the distances in the system.
> +				 * In effect NUMA range consideration is
> +				 * turned off.
>  				 */
> -				continue;
> -			} else if (best_ni) {
> -				if ((best_ni)->ni_seq - ni->ni_seq <= 0)
> +				shortest_distance = distance;
> +			} else if ((distance <= shortest_distance &&
> +				    distance < lnet_numa_range) ||
> +				   distance == shortest_distance) {
> +				/*
> +				 * This NI is either within range or it's
> +				 * equidistant. In both of these cases we
> +				 * would want to select the NI based on
> +				 * its available credits first, and then
> +				 * via Round Robin.
> +				 */
> +				if (distance <= shortest_distance &&
> +				    distance < lnet_numa_range) {
> +					/*
> +					 * If this is the first NI that's
> +					 * within range, then set the
> +					 * shortest distance to the range
> +					 * specified by the user. In
> +					 * effect we're saying that all
> +					 * NIs that fall within this NUMA
> +					 * range shall be dealt with as
> +					 * having equal NUMA weight. Which
> +					 * will mean that we should select
> +					 * through that set by their
> +					 * available credits first
> +					 * followed by Round Robin.
> +					 *
> +					 * And since this is the first NI
> +					 * in the range, let's just set it
> +					 * as our best_ni for now. The
> +					 * following NIs found in the
> +					 * range will be dealt with as
> +					 * mentioned previously.
> +					 */
> +					shortest_distance = lnet_numa_range;
> +					if (!found_ir) {
> +						found_ir = true;
> +						goto set_ni;
> +					}
> +				}
> +				/*
> +				 * This NI is NUMA equidistant let's
> +				 * select using credits followed by Round
> +				 * Robin.
> +				 */
> +				if (ni->ni_tx_queues[cpt]->tq_credits <
> +				    best_credits) {
>  					continue;
> -				(best_ni)->ni_seq = ni->ni_seq + 1;
> +				} else if (ni->ni_tx_queues[cpt]->tq_credits ==
> +					   best_credits) {
> +					if (best_ni &&
> +					    best_ni->ni_seq <= ni->ni_seq)
> +						continue;
> +				}
>  			}
> -
> +set_ni:
>  			best_ni = ni;
>  			best_credits = ni->ni_tx_queues[cpt]->tq_credits;
>  		}
>  	}
> +	/*
> +	 * Now that we selected the NI to use increment its sequence
> +	 * number so the Round Robin algorithm will detect that it has
> +	 * been used and pick the next NI.
> +	 */
> +	best_ni->ni_seq++;
>  
>  	if (!best_ni) {
>  		lnet_net_unlock(cpt);
> @@ -1372,29 +1463,52 @@ lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid,
>  	best_lpni = NULL;
>  	while ((lpni = lnet_get_next_peer_ni_locked(peer, peer_net, lpni))) {
>  		/*
> -		 * if this peer ni is not healty just skip it, no point in
> +		 * if this peer ni is not healthy just skip it, no point in
>  		 * examining it further
>  		 */
>  		if (!lnet_is_peer_ni_healthy_locked(lpni))
>  			continue;
>  		ni_is_pref = lnet_peer_is_ni_pref_locked(lpni, best_ni);
>  
> +		/* if this is a preferred peer use it */
>  		if (!preferred && ni_is_pref) {
>  			preferred = true;
>  		} else if (preferred && !ni_is_pref) {
> +			/*
> +			 * this is not the preferred peer so let's ignore
> +			 * it.
> +			 */
>  			continue;
> -		} else if (lpni->lpni_txcredits <= best_lpni_credits) {
> +		} else if (lpni->lpni_txcredits < best_lpni_credits) {
> +			/*
> +			 * We already have a peer that has more credits
> +			 * available than this one. No need to consider
> +			 * this peer further.
> +			 */
>  			continue;
> -		} else if (best_lpni) {
> -			if (best_lpni->lpni_seq - lpni->lpni_seq <= 0)
> -				continue;
> -			best_lpni->lpni_seq = lpni->lpni_seq + 1;
> +		} else if (lpni->lpni_txcredits == best_lpni_credits) {
> +			/*
> +			 * The best peer found so far and the current peer
> +			 * have the same number of available credits let's
> +			 * make sure to select between them using Round
> +			 * Robin
> +			 */
> +			if (best_lpni) {
> +				if (best_lpni->lpni_seq <= lpni->lpni_seq)
> +					continue;
> +			}
>  		}
>  
>  		best_lpni = lpni;
>  		best_lpni_credits = lpni->lpni_txcredits;
>  	}
>  
> +	/*
> +	 * Increment sequence number of the peer selected so that we can
> +	 * pick the next one in Round Robin.
> +	 */
> +	best_lpni->lpni_seq++;
> +
>  	/* if we still can't find a peer ni then we can't reach it */
>  	if (!best_lpni) {
>  		u32 net_id = peer_net ? peer_net->lpn_net_id :
> @@ -1403,7 +1517,7 @@ lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid,
>  		lnet_net_unlock(cpt);
>  		LCONSOLE_WARN("no peer_ni found on peer net %s\n",
>  			      libcfs_net2str(net_id));
> -		goto again;
> +		return -EHOSTUNREACH;
>  	}
>  
>  send:
> 
> 
> 


More information about the lustre-devel mailing list