[lustre-devel] [PATCH 22/24] lustre: lnet: add enhanced statistics
James Simmons
jsimmons at infradead.org
Sun Oct 14 16:50:44 PDT 2018
> From: Amir Shehata <amir.shehata at intel.com>
>
> Added statistics to track the different types of
> LNet messages that are sent, received, or dropped.
Reviewed-by: James Simmons <jsimmons at infradead.org>
> WC-bug-id: https://jira.whamcloud.com/browse/LU-9480
> Signed-off-by: Amir Shehata <amir.shehata at intel.com>
> Signed-off-by: Olaf Weber <olaf at sgi.com>
> Reviewed-on: https://review.whamcloud.com/25795
> Signed-off-by: NeilBrown <neilb at suse.com>
> ---
> .../staging/lustre/include/linux/lnet/lib-lnet.h | 12 ++
> .../staging/lustre/include/linux/lnet/lib-types.h | 20 +++
> .../lustre/include/uapi/linux/lnet/libcfs_ioctl.h | 3 -
> drivers/staging/lustre/lnet/lnet/api-ni.c | 45 +++++++-
> drivers/staging/lustre/lnet/lnet/lib-move.c | 116 +++++++++++++++++++-
> drivers/staging/lustre/lnet/lnet/lib-msg.c | 16 ++-
> drivers/staging/lustre/lnet/lnet/net_fault.c | 3 -
> drivers/staging/lustre/lnet/lnet/peer.c | 26 +++-
> 8 files changed, 217 insertions(+), 24 deletions(-)
>
> diff --git a/drivers/staging/lustre/include/linux/lnet/lib-lnet.h b/drivers/staging/lustre/include/linux/lnet/lib-lnet.h
> index adb4d0551ef5..91980f60a50d 100644
> --- a/drivers/staging/lustre/include/linux/lnet/lib-lnet.h
> +++ b/drivers/staging/lustre/include/linux/lnet/lib-lnet.h
> @@ -575,7 +575,7 @@ void lnet_set_reply_msg_len(struct lnet_ni *ni, struct lnet_msg *msg,
> void lnet_finalize(struct lnet_msg *msg, int rc);
>
> void lnet_drop_message(struct lnet_ni *ni, int cpt, void *private,
> - unsigned int nob);
> + unsigned int nob, __u32 msg_type);
> void lnet_drop_delayed_msg_list(struct list_head *head, char *reason);
> void lnet_recv_delayed_msg_list(struct list_head *head);
>
> @@ -825,4 +825,14 @@ lnet_peer_needs_push(struct lnet_peer *lp)
> return false;
> }
>
> +void lnet_incr_stats(struct lnet_element_stats *stats,
> + enum lnet_msg_type msg_type,
> + enum lnet_stats_type stats_type);
> +
> +__u32 lnet_sum_stats(struct lnet_element_stats *stats,
> + enum lnet_stats_type stats_type);
> +
> +void lnet_usr_translate_stats(struct lnet_ioctl_element_msg_stats *msg_stats,
> + struct lnet_element_stats *stats);
> +
> #endif
> diff --git a/drivers/staging/lustre/include/linux/lnet/lib-types.h b/drivers/staging/lustre/include/linux/lnet/lib-types.h
> index 8543a67420d7..19f7b11a1e44 100644
> --- a/drivers/staging/lustre/include/linux/lnet/lib-types.h
> +++ b/drivers/staging/lustre/include/linux/lnet/lib-types.h
> @@ -279,10 +279,24 @@ enum lnet_ni_state {
> LNET_NI_STATE_DELETING
> };
>
> +enum lnet_stats_type {
> + LNET_STATS_TYPE_SEND = 0,
> + LNET_STATS_TYPE_RECV,
> + LNET_STATS_TYPE_DROP
> +};
> +
> +struct lnet_comm_count {
> + atomic_t co_get_count;
> + atomic_t co_put_count;
> + atomic_t co_reply_count;
> + atomic_t co_ack_count;
> + atomic_t co_hello_count;
> +};
> +
> struct lnet_element_stats {
> - atomic_t send_count;
> - atomic_t recv_count;
> - atomic_t drop_count;
> + struct lnet_comm_count el_send_stats;
> + struct lnet_comm_count el_recv_stats;
> + struct lnet_comm_count el_drop_stats;
> };
>
> struct lnet_net {
> diff --git a/drivers/staging/lustre/include/uapi/linux/lnet/libcfs_ioctl.h b/drivers/staging/lustre/include/uapi/linux/lnet/libcfs_ioctl.h
> index 60bc9713923e..4590f65c333f 100644
> --- a/drivers/staging/lustre/include/uapi/linux/lnet/libcfs_ioctl.h
> +++ b/drivers/staging/lustre/include/uapi/linux/lnet/libcfs_ioctl.h
> @@ -145,6 +145,7 @@ struct libcfs_debug_ioctl_data {
> #define IOC_LIBCFS_SET_NUMA_RANGE _IOWR(IOC_LIBCFS_TYPE, 98, IOCTL_CONFIG_SIZE)
> #define IOC_LIBCFS_GET_NUMA_RANGE _IOWR(IOC_LIBCFS_TYPE, 99, IOCTL_CONFIG_SIZE)
> #define IOC_LIBCFS_GET_PEER_LIST _IOWR(IOC_LIBCFS_TYPE, 100, IOCTL_CONFIG_SIZE)
> -#define IOC_LIBCFS_MAX_NR 100
> +#define IOC_LIBCFS_GET_LOCAL_NI_MSG_STATS _IOWR(IOC_LIBCFS_TYPE, 101, IOCTL_CONFIG_SIZE)
> +#define IOC_LIBCFS_MAX_NR 101
>
> #endif /* __LIBCFS_IOCTL_H__ */
> diff --git a/drivers/staging/lustre/lnet/lnet/api-ni.c b/drivers/staging/lustre/lnet/lnet/api-ni.c
> index 0511c6acb9b1..0852118bf803 100644
> --- a/drivers/staging/lustre/lnet/lnet/api-ni.c
> +++ b/drivers/staging/lustre/lnet/lnet/api-ni.c
> @@ -2263,8 +2263,12 @@ lnet_fill_ni_info(struct lnet_ni *ni, struct lnet_ioctl_config_ni *cfg_ni,
> memcpy(&tun->lt_cmn, &ni->ni_net->net_tunables, sizeof(tun->lt_cmn));
>
> if (stats) {
> - stats->iel_send_count = atomic_read(&ni->ni_stats.send_count);
> - stats->iel_recv_count = atomic_read(&ni->ni_stats.recv_count);
> + stats->iel_send_count = lnet_sum_stats(&ni->ni_stats,
> + LNET_STATS_TYPE_SEND);
> + stats->iel_recv_count = lnet_sum_stats(&ni->ni_stats,
> + LNET_STATS_TYPE_RECV);
> + stats->iel_drop_count = lnet_sum_stats(&ni->ni_stats,
> + LNET_STATS_TYPE_DROP);
> }
>
> /*
> @@ -2491,6 +2495,29 @@ lnet_get_ni_config(struct lnet_ioctl_config_ni *cfg_ni,
> return rc;
> }
>
> +int lnet_get_ni_stats(struct lnet_ioctl_element_msg_stats *msg_stats)
> +{
> + struct lnet_ni *ni;
> + int cpt;
> + int rc = -ENOENT;
> +
> + if (!msg_stats)
> + return -EINVAL;
> +
> + cpt = lnet_net_lock_current();
> +
> + ni = lnet_get_ni_idx_locked(msg_stats->im_idx);
> +
> + if (ni) {
> + lnet_usr_translate_stats(msg_stats, &ni->ni_stats);
> + rc = 0;
> + }
> +
> + lnet_net_unlock(cpt);
> +
> + return rc;
> +}
> +
> static int lnet_add_net_common(struct lnet_net *net,
> struct lnet_ioctl_config_lnd_tunables *tun)
> {
> @@ -2956,6 +2983,7 @@ LNetCtl(unsigned int cmd, void *arg)
> __u32 tun_size;
>
> cfg_ni = arg;
> +
> /* get the tunables if they are available */
> if (cfg_ni->lic_cfg_hdr.ioc_len <
> sizeof(*cfg_ni) + sizeof(*stats) + sizeof(*tun))
> @@ -2975,6 +3003,19 @@ LNetCtl(unsigned int cmd, void *arg)
> return rc;
> }
>
> + case IOC_LIBCFS_GET_LOCAL_NI_MSG_STATS: {
> + struct lnet_ioctl_element_msg_stats *msg_stats = arg;
> +
> + if (msg_stats->im_hdr.ioc_len != sizeof(*msg_stats))
> + return -EINVAL;
> +
> + mutex_lock(&the_lnet.ln_api_mutex);
> + rc = lnet_get_ni_stats(msg_stats);
> + mutex_unlock(&the_lnet.ln_api_mutex);
> +
> + return rc;
> + }
> +
> case IOC_LIBCFS_GET_NET: {
> size_t total = sizeof(*config) +
> sizeof(struct lnet_ioctl_net_config);
> diff --git a/drivers/staging/lustre/lnet/lnet/lib-move.c b/drivers/staging/lustre/lnet/lnet/lib-move.c
> index 2ff329bf91ba..5694d85c713c 100644
> --- a/drivers/staging/lustre/lnet/lnet/lib-move.c
> +++ b/drivers/staging/lustre/lnet/lnet/lib-move.c
> @@ -45,6 +45,104 @@ static int local_nid_dist_zero = 1;
> module_param(local_nid_dist_zero, int, 0444);
> MODULE_PARM_DESC(local_nid_dist_zero, "Reserved");
>
> +static inline struct lnet_comm_count *
> +get_stats_counts(struct lnet_element_stats *stats,
> + enum lnet_stats_type stats_type)
> +{
> + switch (stats_type) {
> + case LNET_STATS_TYPE_SEND:
> + return &stats->el_send_stats;
> + case LNET_STATS_TYPE_RECV:
> + return &stats->el_recv_stats;
> + case LNET_STATS_TYPE_DROP:
> + return &stats->el_drop_stats;
> + default:
> + CERROR("Unknown stats type\n");
> + }
> +
> + return NULL;
> +}
> +
> +void lnet_incr_stats(struct lnet_element_stats *stats,
> + enum lnet_msg_type msg_type,
> + enum lnet_stats_type stats_type)
> +{
> + struct lnet_comm_count *counts = get_stats_counts(stats, stats_type);
> +
> + if (!counts)
> + return;
> +
> + switch (msg_type) {
> + case LNET_MSG_ACK:
> + atomic_inc(&counts->co_ack_count);
> + break;
> + case LNET_MSG_PUT:
> + atomic_inc(&counts->co_put_count);
> + break;
> + case LNET_MSG_GET:
> + atomic_inc(&counts->co_get_count);
> + break;
> + case LNET_MSG_REPLY:
> + atomic_inc(&counts->co_reply_count);
> + break;
> + case LNET_MSG_HELLO:
> + atomic_inc(&counts->co_hello_count);
> + break;
> + default:
> + CERROR("There is a BUG in the code. Unknown message type\n");
> + break;
> + }
> +}
> +
> +__u32 lnet_sum_stats(struct lnet_element_stats *stats,
> + enum lnet_stats_type stats_type)
> +{
> + struct lnet_comm_count *counts = get_stats_counts(stats, stats_type);
> +
> + if (!counts)
> + return 0;
> +
> + return (atomic_read(&counts->co_ack_count) +
> + atomic_read(&counts->co_put_count) +
> + atomic_read(&counts->co_get_count) +
> + atomic_read(&counts->co_reply_count) +
> + atomic_read(&counts->co_hello_count));
> +}
> +
> +static inline void assign_stats(struct lnet_ioctl_comm_count *msg_stats,
> + struct lnet_comm_count *counts)
> +{
> + msg_stats->ico_get_count = atomic_read(&counts->co_get_count);
> + msg_stats->ico_put_count = atomic_read(&counts->co_put_count);
> + msg_stats->ico_reply_count = atomic_read(&counts->co_reply_count);
> + msg_stats->ico_ack_count = atomic_read(&counts->co_ack_count);
> + msg_stats->ico_hello_count = atomic_read(&counts->co_hello_count);
> +}
> +
> +void lnet_usr_translate_stats(struct lnet_ioctl_element_msg_stats *msg_stats,
> + struct lnet_element_stats *stats)
> +{
> + struct lnet_comm_count *counts;
> +
> + LASSERT(msg_stats);
> + LASSERT(stats);
> +
> + counts = get_stats_counts(stats, LNET_STATS_TYPE_SEND);
> + if (!counts)
> + return;
> + assign_stats(&msg_stats->im_send_stats, counts);
> +
> + counts = get_stats_counts(stats, LNET_STATS_TYPE_RECV);
> + if (!counts)
> + return;
> + assign_stats(&msg_stats->im_recv_stats, counts);
> +
> + counts = get_stats_counts(stats, LNET_STATS_TYPE_DROP);
> + if (!counts)
> + return;
> + assign_stats(&msg_stats->im_drop_stats, counts);
> +}
> +
> int
> lnet_fail_nid(lnet_nid_t nid, unsigned int threshold)
> {
> @@ -632,9 +730,13 @@ lnet_post_send_locked(struct lnet_msg *msg, int do_send)
> the_lnet.ln_counters[cpt]->drop_length += msg->msg_len;
> lnet_net_unlock(cpt);
> if (msg->msg_txpeer)
> - atomic_inc(&msg->msg_txpeer->lpni_stats.drop_count);
> + lnet_incr_stats(&msg->msg_txpeer->lpni_stats,
> + msg->msg_type,
> + LNET_STATS_TYPE_DROP);
> if (msg->msg_txni)
> - atomic_inc(&msg->msg_txni->ni_stats.drop_count);
> + lnet_incr_stats(&msg->msg_txni->ni_stats,
> + msg->msg_type,
> + LNET_STATS_TYPE_DROP);
>
> CNETERR("Dropping message for %s: peer not alive\n",
> libcfs_id2str(msg->msg_target));
> @@ -1859,9 +1961,11 @@ lnet_send(lnet_nid_t src_nid, struct lnet_msg *msg, lnet_nid_t rtr_nid)
> }
>
> void
> -lnet_drop_message(struct lnet_ni *ni, int cpt, void *private, unsigned int nob)
> +lnet_drop_message(struct lnet_ni *ni, int cpt, void *private, unsigned int nob,
> + __u32 msg_type)
> {
> lnet_net_lock(cpt);
> + lnet_incr_stats(&ni->ni_stats, msg_type, LNET_STATS_TYPE_DROP);
> the_lnet.ln_counters[cpt]->drop_count++;
> the_lnet.ln_counters[cpt]->drop_length += nob;
> lnet_net_unlock(cpt);
> @@ -2510,7 +2614,7 @@ lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid,
> lnet_finalize(msg, rc);
>
> drop:
> - lnet_drop_message(ni, cpt, private, payload_length);
> + lnet_drop_message(ni, cpt, private, payload_length, type);
> return 0;
> }
> EXPORT_SYMBOL(lnet_parse);
> @@ -2546,7 +2650,8 @@ lnet_drop_delayed_msg_list(struct list_head *head, char *reason)
> * until that's done
> */
> lnet_drop_message(msg->msg_rxni, msg->msg_rx_cpt,
> - msg->msg_private, msg->msg_len);
> + msg->msg_private, msg->msg_len,
> + msg->msg_type);
> /*
> * NB: message will not generate event because w/o attached MD,
> * but we still should give error code so lnet_msg_decommit()
> @@ -2786,6 +2891,7 @@ lnet_create_reply_msg(struct lnet_ni *ni, struct lnet_msg *getmsg)
> cpt = lnet_cpt_of_nid(peer_id.nid, ni);
>
> lnet_net_lock(cpt);
> + lnet_incr_stats(&ni->ni_stats, LNET_MSG_GET, LNET_STATS_TYPE_DROP);
> the_lnet.ln_counters[cpt]->drop_count++;
> the_lnet.ln_counters[cpt]->drop_length += getmd->md_length;
> lnet_net_unlock(cpt);
> diff --git a/drivers/staging/lustre/lnet/lnet/lib-msg.c b/drivers/staging/lustre/lnet/lnet/lib-msg.c
> index db13d01d366f..7f58cfe25bc2 100644
> --- a/drivers/staging/lustre/lnet/lnet/lib-msg.c
> +++ b/drivers/staging/lustre/lnet/lnet/lib-msg.c
> @@ -219,9 +219,13 @@ lnet_msg_decommit_tx(struct lnet_msg *msg, int status)
>
> incr_stats:
> if (msg->msg_txpeer)
> - atomic_inc(&msg->msg_txpeer->lpni_stats.send_count);
> + lnet_incr_stats(&msg->msg_txpeer->lpni_stats,
> + msg->msg_type,
> + LNET_STATS_TYPE_SEND);
> if (msg->msg_txni)
> - atomic_inc(&msg->msg_txni->ni_stats.send_count);
> + lnet_incr_stats(&msg->msg_txni->ni_stats,
> + msg->msg_type,
> + LNET_STATS_TYPE_SEND);
> out:
> lnet_return_tx_credits_locked(msg);
> msg->msg_tx_committed = 0;
> @@ -280,9 +284,13 @@ lnet_msg_decommit_rx(struct lnet_msg *msg, int status)
>
> incr_stats:
> if (msg->msg_rxpeer)
> - atomic_inc(&msg->msg_rxpeer->lpni_stats.recv_count);
> + lnet_incr_stats(&msg->msg_rxpeer->lpni_stats,
> + msg->msg_type,
> + LNET_STATS_TYPE_RECV);
> if (msg->msg_rxni)
> - atomic_inc(&msg->msg_rxni->ni_stats.recv_count);
> + lnet_incr_stats(&msg->msg_rxni->ni_stats,
> + msg->msg_type,
> + LNET_STATS_TYPE_RECV);
> if (ev->type == LNET_EVENT_PUT || ev->type == LNET_EVENT_REPLY)
> counters->recv_length += msg->msg_wanted;
>
> diff --git a/drivers/staging/lustre/lnet/lnet/net_fault.c b/drivers/staging/lustre/lnet/lnet/net_fault.c
> index 3841bac1aa0a..e2c746855da9 100644
> --- a/drivers/staging/lustre/lnet/lnet/net_fault.c
> +++ b/drivers/staging/lustre/lnet/lnet/net_fault.c
> @@ -632,7 +632,8 @@ delayed_msg_process(struct list_head *msg_list, bool drop)
> }
> }
>
> - lnet_drop_message(ni, cpt, msg->msg_private, msg->msg_len);
> + lnet_drop_message(ni, cpt, msg->msg_private, msg->msg_len,
> + msg->msg_type);
> lnet_finalize(msg, rc);
> }
> }
> diff --git a/drivers/staging/lustre/lnet/lnet/peer.c b/drivers/staging/lustre/lnet/lnet/peer.c
> index 95f72ae39a89..03c1c34517e4 100644
> --- a/drivers/staging/lustre/lnet/lnet/peer.c
> +++ b/drivers/staging/lustre/lnet/lnet/peer.c
> @@ -3301,6 +3301,7 @@ int lnet_get_peer_info(lnet_nid_t *primary_nid, lnet_nid_t *nidp,
> void __user *bulk)
> {
> struct lnet_ioctl_element_stats *lpni_stats;
> + struct lnet_ioctl_element_msg_stats *lpni_msg_stats;
> struct lnet_peer_ni_credit_info *lpni_info;
> struct lnet_peer_ni *lpni;
> struct lnet_peer *lp;
> @@ -3315,7 +3316,8 @@ int lnet_get_peer_info(lnet_nid_t *primary_nid, lnet_nid_t *nidp,
> goto out;
> }
>
> - size = sizeof(nid) + sizeof(*lpni_info) + sizeof(*lpni_stats);
> + size = sizeof(nid) + sizeof(*lpni_info) + sizeof(*lpni_stats)
> + + sizeof(*lpni_msg_stats);
> size *= lp->lp_nnis;
> if (size > *sizep) {
> *sizep = size;
> @@ -3337,13 +3339,17 @@ int lnet_get_peer_info(lnet_nid_t *primary_nid, lnet_nid_t *nidp,
> lpni_stats = kzalloc(sizeof(*lpni_stats), GFP_KERNEL);
> if (!lpni_stats)
> goto out_free_info;
> + lpni_msg_stats = kzalloc(sizeof(*lpni_msg_stats), GFP_KERNEL);
> + if (!lpni_msg_stats)
> + goto out_free_stats;
> +
>
> lpni = NULL;
> rc = -EFAULT;
> while ((lpni = lnet_get_next_peer_ni_locked(lp, NULL, lpni)) != NULL) {
> nid = lpni->lpni_nid;
> if (copy_to_user(bulk, &nid, sizeof(nid)))
> - goto out_free_stats;
> + goto out_free_msg_stats;
> bulk += sizeof(nid);
>
> memset(lpni_info, 0, sizeof(*lpni_info));
> @@ -3362,22 +3368,28 @@ int lnet_get_peer_info(lnet_nid_t *primary_nid, lnet_nid_t *nidp,
> lpni_info->cr_peer_min_tx_credits = lpni->lpni_mintxcredits;
> lpni_info->cr_peer_tx_qnob = lpni->lpni_txqnob;
> if (copy_to_user(bulk, lpni_info, sizeof(*lpni_info)))
> - goto out_free_stats;
> + goto out_free_msg_stats;
> bulk += sizeof(*lpni_info);
>
> memset(lpni_stats, 0, sizeof(*lpni_stats));
> lpni_stats->iel_send_count =
> - atomic_read(&lpni->lpni_stats.send_count);
> + lnet_sum_stats(&lpni->lpni_stats, LNET_STATS_TYPE_SEND);
> lpni_stats->iel_recv_count =
> - atomic_read(&lpni->lpni_stats.recv_count);
> + lnet_sum_stats(&lpni->lpni_stats, LNET_STATS_TYPE_RECV);
> lpni_stats->iel_drop_count =
> - atomic_read(&lpni->lpni_stats.drop_count);
> + lnet_sum_stats(&lpni->lpni_stats, LNET_STATS_TYPE_DROP);
> if (copy_to_user(bulk, lpni_stats, sizeof(*lpni_stats)))
> - goto out_free_stats;
> + goto out_free_msg_stats;
> bulk += sizeof(*lpni_stats);
> + lnet_usr_translate_stats(lpni_msg_stats, &lpni->lpni_stats);
> + if (copy_to_user(bulk, lpni_msg_stats, sizeof(*lpni_msg_stats)))
> + goto out_free_msg_stats;
> + bulk += sizeof(*lpni_msg_stats);
> }
> rc = 0;
>
> +out_free_msg_stats:
> + kfree(lpni_msg_stats);
> out_free_stats:
> kfree(lpni_stats);
> out_free_info:
>
>
>
More information about the lustre-devel
mailing list