[lustre-devel] [PATCH 22/24] lustre: lnet: add enhanced statistics

James Simmons jsimmons at infradead.org
Sun Oct 14 16:50:44 PDT 2018


> From: Amir Shehata <amir.shehata at intel.com>
> 
> Added statistics to track the different types of
> LNet messages that are sent, received, or dropped.

Reviewed-by: James Simmons <jsimmons at infradead.org>
 
> WC-bug-id: https://jira.whamcloud.com/browse/LU-9480
> Signed-off-by: Amir Shehata <amir.shehata at intel.com>
> Signed-off-by: Olaf Weber <olaf at sgi.com>
> Reviewed-on: https://review.whamcloud.com/25795
> Signed-off-by: NeilBrown <neilb at suse.com>
> ---
>  .../staging/lustre/include/linux/lnet/lib-lnet.h   |   12 ++
>  .../staging/lustre/include/linux/lnet/lib-types.h  |   20 +++
>  .../lustre/include/uapi/linux/lnet/libcfs_ioctl.h  |    3 -
>  drivers/staging/lustre/lnet/lnet/api-ni.c          |   45 +++++++-
>  drivers/staging/lustre/lnet/lnet/lib-move.c        |  116 +++++++++++++++++++-
>  drivers/staging/lustre/lnet/lnet/lib-msg.c         |   16 ++-
>  drivers/staging/lustre/lnet/lnet/net_fault.c       |    3 -
>  drivers/staging/lustre/lnet/lnet/peer.c            |   26 +++-
>  8 files changed, 217 insertions(+), 24 deletions(-)
> 
> diff --git a/drivers/staging/lustre/include/linux/lnet/lib-lnet.h b/drivers/staging/lustre/include/linux/lnet/lib-lnet.h
> index adb4d0551ef5..91980f60a50d 100644
> --- a/drivers/staging/lustre/include/linux/lnet/lib-lnet.h
> +++ b/drivers/staging/lustre/include/linux/lnet/lib-lnet.h
> @@ -575,7 +575,7 @@ void lnet_set_reply_msg_len(struct lnet_ni *ni, struct lnet_msg *msg,
>  void lnet_finalize(struct lnet_msg *msg, int rc);
>  
>  void lnet_drop_message(struct lnet_ni *ni, int cpt, void *private,
> -		       unsigned int nob);
> +		       unsigned int nob, __u32 msg_type);
>  void lnet_drop_delayed_msg_list(struct list_head *head, char *reason);
>  void lnet_recv_delayed_msg_list(struct list_head *head);
>  
> @@ -825,4 +825,14 @@ lnet_peer_needs_push(struct lnet_peer *lp)
>  	return false;
>  }
>  
> +void lnet_incr_stats(struct lnet_element_stats *stats,
> +		     enum lnet_msg_type msg_type,
> +		     enum lnet_stats_type stats_type);
> +
> +__u32 lnet_sum_stats(struct lnet_element_stats *stats,
> +		     enum lnet_stats_type stats_type);
> +
> +void lnet_usr_translate_stats(struct lnet_ioctl_element_msg_stats *msg_stats,
> +			      struct lnet_element_stats *stats);
> +
>  #endif
> diff --git a/drivers/staging/lustre/include/linux/lnet/lib-types.h b/drivers/staging/lustre/include/linux/lnet/lib-types.h
> index 8543a67420d7..19f7b11a1e44 100644
> --- a/drivers/staging/lustre/include/linux/lnet/lib-types.h
> +++ b/drivers/staging/lustre/include/linux/lnet/lib-types.h
> @@ -279,10 +279,24 @@ enum lnet_ni_state {
>  	LNET_NI_STATE_DELETING
>  };
>  
> +enum lnet_stats_type {
> +	LNET_STATS_TYPE_SEND = 0,
> +	LNET_STATS_TYPE_RECV,
> +	LNET_STATS_TYPE_DROP
> +};
> +
> +struct lnet_comm_count {
> +	atomic_t co_get_count;
> +	atomic_t co_put_count;
> +	atomic_t co_reply_count;
> +	atomic_t co_ack_count;
> +	atomic_t co_hello_count;
> +};
> +
>  struct lnet_element_stats {
> -	atomic_t	send_count;
> -	atomic_t	recv_count;
> -	atomic_t	drop_count;
> +	struct lnet_comm_count el_send_stats;
> +	struct lnet_comm_count el_recv_stats;
> +	struct lnet_comm_count el_drop_stats;
>  };
>  
>  struct lnet_net {
> diff --git a/drivers/staging/lustre/include/uapi/linux/lnet/libcfs_ioctl.h b/drivers/staging/lustre/include/uapi/linux/lnet/libcfs_ioctl.h
> index 60bc9713923e..4590f65c333f 100644
> --- a/drivers/staging/lustre/include/uapi/linux/lnet/libcfs_ioctl.h
> +++ b/drivers/staging/lustre/include/uapi/linux/lnet/libcfs_ioctl.h
> @@ -145,6 +145,7 @@ struct libcfs_debug_ioctl_data {
>  #define IOC_LIBCFS_SET_NUMA_RANGE	_IOWR(IOC_LIBCFS_TYPE, 98, IOCTL_CONFIG_SIZE)
>  #define IOC_LIBCFS_GET_NUMA_RANGE	_IOWR(IOC_LIBCFS_TYPE, 99, IOCTL_CONFIG_SIZE)
>  #define IOC_LIBCFS_GET_PEER_LIST	_IOWR(IOC_LIBCFS_TYPE, 100, IOCTL_CONFIG_SIZE)
> -#define IOC_LIBCFS_MAX_NR		100
> +#define IOC_LIBCFS_GET_LOCAL_NI_MSG_STATS  _IOWR(IOC_LIBCFS_TYPE, 101, IOCTL_CONFIG_SIZE)
> +#define IOC_LIBCFS_MAX_NR		101
>  
>  #endif /* __LIBCFS_IOCTL_H__ */
> diff --git a/drivers/staging/lustre/lnet/lnet/api-ni.c b/drivers/staging/lustre/lnet/lnet/api-ni.c
> index 0511c6acb9b1..0852118bf803 100644
> --- a/drivers/staging/lustre/lnet/lnet/api-ni.c
> +++ b/drivers/staging/lustre/lnet/lnet/api-ni.c
> @@ -2263,8 +2263,12 @@ lnet_fill_ni_info(struct lnet_ni *ni, struct lnet_ioctl_config_ni *cfg_ni,
>  	memcpy(&tun->lt_cmn, &ni->ni_net->net_tunables, sizeof(tun->lt_cmn));
>  
>  	if (stats) {
> -		stats->iel_send_count = atomic_read(&ni->ni_stats.send_count);
> -		stats->iel_recv_count = atomic_read(&ni->ni_stats.recv_count);
> +		stats->iel_send_count = lnet_sum_stats(&ni->ni_stats,
> +						       LNET_STATS_TYPE_SEND);
> +		stats->iel_recv_count = lnet_sum_stats(&ni->ni_stats,
> +						       LNET_STATS_TYPE_RECV);
> +		stats->iel_drop_count = lnet_sum_stats(&ni->ni_stats,
> +						       LNET_STATS_TYPE_DROP);
>  	}
>  
>  	/*
> @@ -2491,6 +2495,29 @@ lnet_get_ni_config(struct lnet_ioctl_config_ni *cfg_ni,
>  	return rc;
>  }
>  
> +int lnet_get_ni_stats(struct lnet_ioctl_element_msg_stats *msg_stats)
> +{
> +	struct lnet_ni *ni;
> +	int cpt;
> +	int rc = -ENOENT;
> +
> +	if (!msg_stats)
> +		return -EINVAL;
> +
> +	cpt = lnet_net_lock_current();
> +
> +	ni = lnet_get_ni_idx_locked(msg_stats->im_idx);
> +
> +	if (ni) {
> +		lnet_usr_translate_stats(msg_stats, &ni->ni_stats);
> +		rc = 0;
> +	}
> +
> +	lnet_net_unlock(cpt);
> +
> +	return rc;
> +}
> +
>  static int lnet_add_net_common(struct lnet_net *net,
>  			       struct lnet_ioctl_config_lnd_tunables *tun)
>  {
> @@ -2956,6 +2983,7 @@ LNetCtl(unsigned int cmd, void *arg)
>  		__u32 tun_size;
>  
>  		cfg_ni = arg;
> +
>  		/* get the tunables if they are available */
>  		if (cfg_ni->lic_cfg_hdr.ioc_len <
>  		    sizeof(*cfg_ni) + sizeof(*stats) + sizeof(*tun))
> @@ -2975,6 +3003,19 @@ LNetCtl(unsigned int cmd, void *arg)
>  		return rc;
>  	}
>  
> +	case IOC_LIBCFS_GET_LOCAL_NI_MSG_STATS: {
> +		struct lnet_ioctl_element_msg_stats *msg_stats = arg;
> +
> +		if (msg_stats->im_hdr.ioc_len != sizeof(*msg_stats))
> +			return -EINVAL;
> +
> +		mutex_lock(&the_lnet.ln_api_mutex);
> +		rc = lnet_get_ni_stats(msg_stats);
> +		mutex_unlock(&the_lnet.ln_api_mutex);
> +
> +		return rc;
> +	}
> +
>  	case IOC_LIBCFS_GET_NET: {
>  		size_t total = sizeof(*config) +
>  			       sizeof(struct lnet_ioctl_net_config);
> diff --git a/drivers/staging/lustre/lnet/lnet/lib-move.c b/drivers/staging/lustre/lnet/lnet/lib-move.c
> index 2ff329bf91ba..5694d85c713c 100644
> --- a/drivers/staging/lustre/lnet/lnet/lib-move.c
> +++ b/drivers/staging/lustre/lnet/lnet/lib-move.c
> @@ -45,6 +45,104 @@ static int local_nid_dist_zero = 1;
>  module_param(local_nid_dist_zero, int, 0444);
>  MODULE_PARM_DESC(local_nid_dist_zero, "Reserved");
>  
> +static inline struct lnet_comm_count *
> +get_stats_counts(struct lnet_element_stats *stats,
> +		 enum lnet_stats_type stats_type)
> +{
> +	switch (stats_type) {
> +	case LNET_STATS_TYPE_SEND:
> +		return &stats->el_send_stats;
> +	case LNET_STATS_TYPE_RECV:
> +		return &stats->el_recv_stats;
> +	case LNET_STATS_TYPE_DROP:
> +		return &stats->el_drop_stats;
> +	default:
> +		CERROR("Unknown stats type\n");
> +	}
> +
> +	return NULL;
> +}
> +
> +void lnet_incr_stats(struct lnet_element_stats *stats,
> +		     enum lnet_msg_type msg_type,
> +		     enum lnet_stats_type stats_type)
> +{
> +	struct lnet_comm_count *counts = get_stats_counts(stats, stats_type);
> +
> +	if (!counts)
> +		return;
> +
> +	switch (msg_type) {
> +	case LNET_MSG_ACK:
> +		atomic_inc(&counts->co_ack_count);
> +		break;
> +	case LNET_MSG_PUT:
> +		atomic_inc(&counts->co_put_count);
> +		break;
> +	case LNET_MSG_GET:
> +		atomic_inc(&counts->co_get_count);
> +		break;
> +	case LNET_MSG_REPLY:
> +		atomic_inc(&counts->co_reply_count);
> +		break;
> +	case LNET_MSG_HELLO:
> +		atomic_inc(&counts->co_hello_count);
> +		break;
> +	default:
> +		CERROR("There is a BUG in the code. Unknown message type\n");
> +		break;
> +	}
> +}
> +
> +__u32 lnet_sum_stats(struct lnet_element_stats *stats,
> +		     enum lnet_stats_type stats_type)
> +{
> +	struct lnet_comm_count *counts = get_stats_counts(stats, stats_type);
> +
> +	if (!counts)
> +		return 0;
> +
> +	return (atomic_read(&counts->co_ack_count) +
> +		atomic_read(&counts->co_put_count) +
> +		atomic_read(&counts->co_get_count) +
> +		atomic_read(&counts->co_reply_count) +
> +		atomic_read(&counts->co_hello_count));
> +}
> +
> +static inline void assign_stats(struct lnet_ioctl_comm_count *msg_stats,
> +				struct lnet_comm_count *counts)
> +{
> +	msg_stats->ico_get_count = atomic_read(&counts->co_get_count);
> +	msg_stats->ico_put_count = atomic_read(&counts->co_put_count);
> +	msg_stats->ico_reply_count = atomic_read(&counts->co_reply_count);
> +	msg_stats->ico_ack_count = atomic_read(&counts->co_ack_count);
> +	msg_stats->ico_hello_count = atomic_read(&counts->co_hello_count);
> +}
> +
> +void lnet_usr_translate_stats(struct lnet_ioctl_element_msg_stats *msg_stats,
> +			      struct lnet_element_stats *stats)
> +{
> +	struct lnet_comm_count *counts;
> +
> +	LASSERT(msg_stats);
> +	LASSERT(stats);
> +
> +	counts = get_stats_counts(stats, LNET_STATS_TYPE_SEND);
> +	if (!counts)
> +		return;
> +	assign_stats(&msg_stats->im_send_stats, counts);
> +
> +	counts = get_stats_counts(stats, LNET_STATS_TYPE_RECV);
> +	if (!counts)
> +		return;
> +	assign_stats(&msg_stats->im_recv_stats, counts);
> +
> +	counts = get_stats_counts(stats, LNET_STATS_TYPE_DROP);
> +	if (!counts)
> +		return;
> +	assign_stats(&msg_stats->im_drop_stats, counts);
> +}
> +
>  int
>  lnet_fail_nid(lnet_nid_t nid, unsigned int threshold)
>  {
> @@ -632,9 +730,13 @@ lnet_post_send_locked(struct lnet_msg *msg, int do_send)
>  		the_lnet.ln_counters[cpt]->drop_length += msg->msg_len;
>  		lnet_net_unlock(cpt);
>  		if (msg->msg_txpeer)
> -			atomic_inc(&msg->msg_txpeer->lpni_stats.drop_count);
> +			lnet_incr_stats(&msg->msg_txpeer->lpni_stats,
> +					msg->msg_type,
> +					LNET_STATS_TYPE_DROP);
>  		if (msg->msg_txni)
> -			atomic_inc(&msg->msg_txni->ni_stats.drop_count);
> +			lnet_incr_stats(&msg->msg_txni->ni_stats,
> +					msg->msg_type,
> +					LNET_STATS_TYPE_DROP);
>  
>  		CNETERR("Dropping message for %s: peer not alive\n",
>  			libcfs_id2str(msg->msg_target));
> @@ -1859,9 +1961,11 @@ lnet_send(lnet_nid_t src_nid, struct lnet_msg *msg, lnet_nid_t rtr_nid)
>  }
>  
>  void
> -lnet_drop_message(struct lnet_ni *ni, int cpt, void *private, unsigned int nob)
> +lnet_drop_message(struct lnet_ni *ni, int cpt, void *private, unsigned int nob,
> +		  __u32 msg_type)
>  {
>  	lnet_net_lock(cpt);
> +	lnet_incr_stats(&ni->ni_stats, msg_type, LNET_STATS_TYPE_DROP);
>  	the_lnet.ln_counters[cpt]->drop_count++;
>  	the_lnet.ln_counters[cpt]->drop_length += nob;
>  	lnet_net_unlock(cpt);
> @@ -2510,7 +2614,7 @@ lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid,
>  	lnet_finalize(msg, rc);
>  
>   drop:
> -	lnet_drop_message(ni, cpt, private, payload_length);
> +	lnet_drop_message(ni, cpt, private, payload_length, type);
>  	return 0;
>  }
>  EXPORT_SYMBOL(lnet_parse);
> @@ -2546,7 +2650,8 @@ lnet_drop_delayed_msg_list(struct list_head *head, char *reason)
>  		 * until that's done
>  		 */
>  		lnet_drop_message(msg->msg_rxni, msg->msg_rx_cpt,
> -				  msg->msg_private, msg->msg_len);
> +				  msg->msg_private, msg->msg_len,
> +				  msg->msg_type);
>  		/*
>  		 * NB: message will not generate event because w/o attached MD,
>  		 * but we still should give error code so lnet_msg_decommit()
> @@ -2786,6 +2891,7 @@ lnet_create_reply_msg(struct lnet_ni *ni, struct lnet_msg *getmsg)
>  	cpt = lnet_cpt_of_nid(peer_id.nid, ni);
>  
>  	lnet_net_lock(cpt);
> +	lnet_incr_stats(&ni->ni_stats, LNET_MSG_GET, LNET_STATS_TYPE_DROP);
>  	the_lnet.ln_counters[cpt]->drop_count++;
>  	the_lnet.ln_counters[cpt]->drop_length += getmd->md_length;
>  	lnet_net_unlock(cpt);
> diff --git a/drivers/staging/lustre/lnet/lnet/lib-msg.c b/drivers/staging/lustre/lnet/lnet/lib-msg.c
> index db13d01d366f..7f58cfe25bc2 100644
> --- a/drivers/staging/lustre/lnet/lnet/lib-msg.c
> +++ b/drivers/staging/lustre/lnet/lnet/lib-msg.c
> @@ -219,9 +219,13 @@ lnet_msg_decommit_tx(struct lnet_msg *msg, int status)
>  
>  incr_stats:
>  	if (msg->msg_txpeer)
> -		atomic_inc(&msg->msg_txpeer->lpni_stats.send_count);
> +		lnet_incr_stats(&msg->msg_txpeer->lpni_stats,
> +				msg->msg_type,
> +				LNET_STATS_TYPE_SEND);
>  	if (msg->msg_txni)
> -		atomic_inc(&msg->msg_txni->ni_stats.send_count);
> +		lnet_incr_stats(&msg->msg_txni->ni_stats,
> +				msg->msg_type,
> +				LNET_STATS_TYPE_SEND);
>   out:
>  	lnet_return_tx_credits_locked(msg);
>  	msg->msg_tx_committed = 0;
> @@ -280,9 +284,13 @@ lnet_msg_decommit_rx(struct lnet_msg *msg, int status)
>  
>  incr_stats:
>  	if (msg->msg_rxpeer)
> -		atomic_inc(&msg->msg_rxpeer->lpni_stats.recv_count);
> +		lnet_incr_stats(&msg->msg_rxpeer->lpni_stats,
> +				msg->msg_type,
> +				LNET_STATS_TYPE_RECV);
>  	if (msg->msg_rxni)
> -		atomic_inc(&msg->msg_rxni->ni_stats.recv_count);
> +		lnet_incr_stats(&msg->msg_rxni->ni_stats,
> +				msg->msg_type,
> +				LNET_STATS_TYPE_RECV);
>  	if (ev->type == LNET_EVENT_PUT || ev->type == LNET_EVENT_REPLY)
>  		counters->recv_length += msg->msg_wanted;
>  
> diff --git a/drivers/staging/lustre/lnet/lnet/net_fault.c b/drivers/staging/lustre/lnet/lnet/net_fault.c
> index 3841bac1aa0a..e2c746855da9 100644
> --- a/drivers/staging/lustre/lnet/lnet/net_fault.c
> +++ b/drivers/staging/lustre/lnet/lnet/net_fault.c
> @@ -632,7 +632,8 @@ delayed_msg_process(struct list_head *msg_list, bool drop)
>  			}
>  		}
>  
> -		lnet_drop_message(ni, cpt, msg->msg_private, msg->msg_len);
> +		lnet_drop_message(ni, cpt, msg->msg_private, msg->msg_len,
> +				  msg->msg_type);
>  		lnet_finalize(msg, rc);
>  	}
>  }
> diff --git a/drivers/staging/lustre/lnet/lnet/peer.c b/drivers/staging/lustre/lnet/lnet/peer.c
> index 95f72ae39a89..03c1c34517e4 100644
> --- a/drivers/staging/lustre/lnet/lnet/peer.c
> +++ b/drivers/staging/lustre/lnet/lnet/peer.c
> @@ -3301,6 +3301,7 @@ int lnet_get_peer_info(lnet_nid_t *primary_nid, lnet_nid_t *nidp,
>  		       void __user *bulk)
>  {
>  	struct lnet_ioctl_element_stats *lpni_stats;
> +	struct lnet_ioctl_element_msg_stats *lpni_msg_stats;
>  	struct lnet_peer_ni_credit_info *lpni_info;
>  	struct lnet_peer_ni *lpni;
>  	struct lnet_peer *lp;
> @@ -3315,7 +3316,8 @@ int lnet_get_peer_info(lnet_nid_t *primary_nid, lnet_nid_t *nidp,
>  		goto out;
>  	}
>  
> -	size = sizeof(nid) + sizeof(*lpni_info) + sizeof(*lpni_stats);
> +	size = sizeof(nid) + sizeof(*lpni_info) + sizeof(*lpni_stats)
> +		+ sizeof(*lpni_msg_stats);
>  	size *= lp->lp_nnis;
>  	if (size > *sizep) {
>  		*sizep = size;
> @@ -3337,13 +3339,17 @@ int lnet_get_peer_info(lnet_nid_t *primary_nid, lnet_nid_t *nidp,
>  	lpni_stats = kzalloc(sizeof(*lpni_stats), GFP_KERNEL);
>  	if (!lpni_stats)
>  		goto out_free_info;
> +	lpni_msg_stats = kzalloc(sizeof(*lpni_msg_stats), GFP_KERNEL);
> +	if (!lpni_msg_stats)
> +		goto out_free_stats;
> +
>  
>  	lpni = NULL;
>  	rc = -EFAULT;
>  	while ((lpni = lnet_get_next_peer_ni_locked(lp, NULL, lpni)) != NULL) {
>  		nid = lpni->lpni_nid;
>  		if (copy_to_user(bulk, &nid, sizeof(nid)))
> -			goto out_free_stats;
> +			goto out_free_msg_stats;
>  		bulk += sizeof(nid);
>  
>  		memset(lpni_info, 0, sizeof(*lpni_info));
> @@ -3362,22 +3368,28 @@ int lnet_get_peer_info(lnet_nid_t *primary_nid, lnet_nid_t *nidp,
>  		lpni_info->cr_peer_min_tx_credits = lpni->lpni_mintxcredits;
>  		lpni_info->cr_peer_tx_qnob = lpni->lpni_txqnob;
>  		if (copy_to_user(bulk, lpni_info, sizeof(*lpni_info)))
> -			goto out_free_stats;
> +			goto out_free_msg_stats;
>  		bulk += sizeof(*lpni_info);
>  
>  		memset(lpni_stats, 0, sizeof(*lpni_stats));
>  		lpni_stats->iel_send_count =
> -			atomic_read(&lpni->lpni_stats.send_count);
> +			lnet_sum_stats(&lpni->lpni_stats, LNET_STATS_TYPE_SEND);
>  		lpni_stats->iel_recv_count =
> -			atomic_read(&lpni->lpni_stats.recv_count);
> +			lnet_sum_stats(&lpni->lpni_stats, LNET_STATS_TYPE_RECV);
>  		lpni_stats->iel_drop_count =
> -			atomic_read(&lpni->lpni_stats.drop_count);
> +			lnet_sum_stats(&lpni->lpni_stats, LNET_STATS_TYPE_DROP);
>  		if (copy_to_user(bulk, lpni_stats, sizeof(*lpni_stats)))
> -			goto out_free_stats;
> +			goto out_free_msg_stats;
>  		bulk += sizeof(*lpni_stats);
> +		lnet_usr_translate_stats(lpni_msg_stats, &lpni->lpni_stats);
> +		if (copy_to_user(bulk, lpni_msg_stats, sizeof(*lpni_msg_stats)))
> +			goto out_free_msg_stats;
> +		bulk += sizeof(*lpni_msg_stats);
>  	}
>  	rc = 0;
>  
> +out_free_msg_stats:
> +	kfree(lpni_msg_stats);
>  out_free_stats:
>  	kfree(lpni_stats);
>  out_free_info:
> 
> 
> 


More information about the lustre-devel mailing list