[lustre-devel] [PATCH 22/24] lustre: lnet: add enhanced statistics
NeilBrown
neilb at suse.com
Sun Oct 7 16:19:38 PDT 2018
From: Amir Shehata <amir.shehata at intel.com>
Add statistics to track the different types of
LNet messages that are sent, received, or dropped.
WC-bug-id: https://jira.whamcloud.com/browse/LU-9480
Signed-off-by: Amir Shehata <amir.shehata at intel.com>
Signed-off-by: Olaf Weber <olaf at sgi.com>
Reviewed-on: https://review.whamcloud.com/25795
Signed-off-by: NeilBrown <neilb at suse.com>
---
.../staging/lustre/include/linux/lnet/lib-lnet.h | 12 ++
.../staging/lustre/include/linux/lnet/lib-types.h | 20 +++
.../lustre/include/uapi/linux/lnet/libcfs_ioctl.h | 3 -
drivers/staging/lustre/lnet/lnet/api-ni.c | 45 +++++++-
drivers/staging/lustre/lnet/lnet/lib-move.c | 116 +++++++++++++++++++-
drivers/staging/lustre/lnet/lnet/lib-msg.c | 16 ++-
drivers/staging/lustre/lnet/lnet/net_fault.c | 3 -
drivers/staging/lustre/lnet/lnet/peer.c | 26 +++-
8 files changed, 217 insertions(+), 24 deletions(-)
diff --git a/drivers/staging/lustre/include/linux/lnet/lib-lnet.h b/drivers/staging/lustre/include/linux/lnet/lib-lnet.h
index adb4d0551ef5..91980f60a50d 100644
--- a/drivers/staging/lustre/include/linux/lnet/lib-lnet.h
+++ b/drivers/staging/lustre/include/linux/lnet/lib-lnet.h
@@ -575,7 +575,7 @@ void lnet_set_reply_msg_len(struct lnet_ni *ni, struct lnet_msg *msg,
void lnet_finalize(struct lnet_msg *msg, int rc);
void lnet_drop_message(struct lnet_ni *ni, int cpt, void *private,
- unsigned int nob);
+ unsigned int nob, __u32 msg_type);
void lnet_drop_delayed_msg_list(struct list_head *head, char *reason);
void lnet_recv_delayed_msg_list(struct list_head *head);
@@ -825,4 +825,14 @@ lnet_peer_needs_push(struct lnet_peer *lp)
return false;
}
+void lnet_incr_stats(struct lnet_element_stats *stats,
+ enum lnet_msg_type msg_type,
+ enum lnet_stats_type stats_type);
+
+__u32 lnet_sum_stats(struct lnet_element_stats *stats,
+ enum lnet_stats_type stats_type);
+
+void lnet_usr_translate_stats(struct lnet_ioctl_element_msg_stats *msg_stats,
+ struct lnet_element_stats *stats);
+
#endif
diff --git a/drivers/staging/lustre/include/linux/lnet/lib-types.h b/drivers/staging/lustre/include/linux/lnet/lib-types.h
index 8543a67420d7..19f7b11a1e44 100644
--- a/drivers/staging/lustre/include/linux/lnet/lib-types.h
+++ b/drivers/staging/lustre/include/linux/lnet/lib-types.h
@@ -279,10 +279,24 @@ enum lnet_ni_state {
LNET_NI_STATE_DELETING
};
+enum lnet_stats_type {
+ LNET_STATS_TYPE_SEND = 0,
+ LNET_STATS_TYPE_RECV,
+ LNET_STATS_TYPE_DROP
+};
+
+struct lnet_comm_count {
+ atomic_t co_get_count;
+ atomic_t co_put_count;
+ atomic_t co_reply_count;
+ atomic_t co_ack_count;
+ atomic_t co_hello_count;
+};
+
struct lnet_element_stats {
- atomic_t send_count;
- atomic_t recv_count;
- atomic_t drop_count;
+ struct lnet_comm_count el_send_stats;
+ struct lnet_comm_count el_recv_stats;
+ struct lnet_comm_count el_drop_stats;
};
struct lnet_net {
diff --git a/drivers/staging/lustre/include/uapi/linux/lnet/libcfs_ioctl.h b/drivers/staging/lustre/include/uapi/linux/lnet/libcfs_ioctl.h
index 60bc9713923e..4590f65c333f 100644
--- a/drivers/staging/lustre/include/uapi/linux/lnet/libcfs_ioctl.h
+++ b/drivers/staging/lustre/include/uapi/linux/lnet/libcfs_ioctl.h
@@ -145,6 +145,7 @@ struct libcfs_debug_ioctl_data {
#define IOC_LIBCFS_SET_NUMA_RANGE _IOWR(IOC_LIBCFS_TYPE, 98, IOCTL_CONFIG_SIZE)
#define IOC_LIBCFS_GET_NUMA_RANGE _IOWR(IOC_LIBCFS_TYPE, 99, IOCTL_CONFIG_SIZE)
#define IOC_LIBCFS_GET_PEER_LIST _IOWR(IOC_LIBCFS_TYPE, 100, IOCTL_CONFIG_SIZE)
-#define IOC_LIBCFS_MAX_NR 100
+#define IOC_LIBCFS_GET_LOCAL_NI_MSG_STATS _IOWR(IOC_LIBCFS_TYPE, 101, IOCTL_CONFIG_SIZE)
+#define IOC_LIBCFS_MAX_NR 101
#endif /* __LIBCFS_IOCTL_H__ */
diff --git a/drivers/staging/lustre/lnet/lnet/api-ni.c b/drivers/staging/lustre/lnet/lnet/api-ni.c
index 0511c6acb9b1..0852118bf803 100644
--- a/drivers/staging/lustre/lnet/lnet/api-ni.c
+++ b/drivers/staging/lustre/lnet/lnet/api-ni.c
@@ -2263,8 +2263,12 @@ lnet_fill_ni_info(struct lnet_ni *ni, struct lnet_ioctl_config_ni *cfg_ni,
memcpy(&tun->lt_cmn, &ni->ni_net->net_tunables, sizeof(tun->lt_cmn));
if (stats) {
- stats->iel_send_count = atomic_read(&ni->ni_stats.send_count);
- stats->iel_recv_count = atomic_read(&ni->ni_stats.recv_count);
+ stats->iel_send_count = lnet_sum_stats(&ni->ni_stats,
+ LNET_STATS_TYPE_SEND);
+ stats->iel_recv_count = lnet_sum_stats(&ni->ni_stats,
+ LNET_STATS_TYPE_RECV);
+ stats->iel_drop_count = lnet_sum_stats(&ni->ni_stats,
+ LNET_STATS_TYPE_DROP);
}
/*
@@ -2491,6 +2495,29 @@ lnet_get_ni_config(struct lnet_ioctl_config_ni *cfg_ni,
return rc;
}
+int lnet_get_ni_stats(struct lnet_ioctl_element_msg_stats *msg_stats)
+{
+ struct lnet_ni *ni;
+ int cpt;
+ int rc = -ENOENT;
+
+ if (!msg_stats)
+ return -EINVAL;
+
+ cpt = lnet_net_lock_current();
+
+ ni = lnet_get_ni_idx_locked(msg_stats->im_idx);
+
+ if (ni) {
+ lnet_usr_translate_stats(msg_stats, &ni->ni_stats);
+ rc = 0;
+ }
+
+ lnet_net_unlock(cpt);
+
+ return rc;
+}
+
static int lnet_add_net_common(struct lnet_net *net,
struct lnet_ioctl_config_lnd_tunables *tun)
{
@@ -2956,6 +2983,7 @@ LNetCtl(unsigned int cmd, void *arg)
__u32 tun_size;
cfg_ni = arg;
+
/* get the tunables if they are available */
if (cfg_ni->lic_cfg_hdr.ioc_len <
sizeof(*cfg_ni) + sizeof(*stats) + sizeof(*tun))
@@ -2975,6 +3003,19 @@ LNetCtl(unsigned int cmd, void *arg)
return rc;
}
+ case IOC_LIBCFS_GET_LOCAL_NI_MSG_STATS: {
+ struct lnet_ioctl_element_msg_stats *msg_stats = arg;
+
+ if (msg_stats->im_hdr.ioc_len != sizeof(*msg_stats))
+ return -EINVAL;
+
+ mutex_lock(&the_lnet.ln_api_mutex);
+ rc = lnet_get_ni_stats(msg_stats);
+ mutex_unlock(&the_lnet.ln_api_mutex);
+
+ return rc;
+ }
+
case IOC_LIBCFS_GET_NET: {
size_t total = sizeof(*config) +
sizeof(struct lnet_ioctl_net_config);
diff --git a/drivers/staging/lustre/lnet/lnet/lib-move.c b/drivers/staging/lustre/lnet/lnet/lib-move.c
index 2ff329bf91ba..5694d85c713c 100644
--- a/drivers/staging/lustre/lnet/lnet/lib-move.c
+++ b/drivers/staging/lustre/lnet/lnet/lib-move.c
@@ -45,6 +45,104 @@ static int local_nid_dist_zero = 1;
module_param(local_nid_dist_zero, int, 0444);
MODULE_PARM_DESC(local_nid_dist_zero, "Reserved");
+static inline struct lnet_comm_count *
+get_stats_counts(struct lnet_element_stats *stats,
+ enum lnet_stats_type stats_type)
+{
+ switch (stats_type) {
+ case LNET_STATS_TYPE_SEND:
+ return &stats->el_send_stats;
+ case LNET_STATS_TYPE_RECV:
+ return &stats->el_recv_stats;
+ case LNET_STATS_TYPE_DROP:
+ return &stats->el_drop_stats;
+ default:
+ CERROR("Unknown stats type\n");
+ }
+
+ return NULL;
+}
+
+void lnet_incr_stats(struct lnet_element_stats *stats,
+ enum lnet_msg_type msg_type,
+ enum lnet_stats_type stats_type)
+{
+ struct lnet_comm_count *counts = get_stats_counts(stats, stats_type);
+
+ if (!counts)
+ return;
+
+ switch (msg_type) {
+ case LNET_MSG_ACK:
+ atomic_inc(&counts->co_ack_count);
+ break;
+ case LNET_MSG_PUT:
+ atomic_inc(&counts->co_put_count);
+ break;
+ case LNET_MSG_GET:
+ atomic_inc(&counts->co_get_count);
+ break;
+ case LNET_MSG_REPLY:
+ atomic_inc(&counts->co_reply_count);
+ break;
+ case LNET_MSG_HELLO:
+ atomic_inc(&counts->co_hello_count);
+ break;
+ default:
+ CERROR("There is a BUG in the code. Unknown message type\n");
+ break;
+ }
+}
+
+__u32 lnet_sum_stats(struct lnet_element_stats *stats,
+ enum lnet_stats_type stats_type)
+{
+ struct lnet_comm_count *counts = get_stats_counts(stats, stats_type);
+
+ if (!counts)
+ return 0;
+
+ return (atomic_read(&counts->co_ack_count) +
+ atomic_read(&counts->co_put_count) +
+ atomic_read(&counts->co_get_count) +
+ atomic_read(&counts->co_reply_count) +
+ atomic_read(&counts->co_hello_count));
+}
+
+static inline void assign_stats(struct lnet_ioctl_comm_count *msg_stats,
+ struct lnet_comm_count *counts)
+{
+ msg_stats->ico_get_count = atomic_read(&counts->co_get_count);
+ msg_stats->ico_put_count = atomic_read(&counts->co_put_count);
+ msg_stats->ico_reply_count = atomic_read(&counts->co_reply_count);
+ msg_stats->ico_ack_count = atomic_read(&counts->co_ack_count);
+ msg_stats->ico_hello_count = atomic_read(&counts->co_hello_count);
+}
+
+void lnet_usr_translate_stats(struct lnet_ioctl_element_msg_stats *msg_stats,
+ struct lnet_element_stats *stats)
+{
+ struct lnet_comm_count *counts;
+
+ LASSERT(msg_stats);
+ LASSERT(stats);
+
+ counts = get_stats_counts(stats, LNET_STATS_TYPE_SEND);
+ if (!counts)
+ return;
+ assign_stats(&msg_stats->im_send_stats, counts);
+
+ counts = get_stats_counts(stats, LNET_STATS_TYPE_RECV);
+ if (!counts)
+ return;
+ assign_stats(&msg_stats->im_recv_stats, counts);
+
+ counts = get_stats_counts(stats, LNET_STATS_TYPE_DROP);
+ if (!counts)
+ return;
+ assign_stats(&msg_stats->im_drop_stats, counts);
+}
+
int
lnet_fail_nid(lnet_nid_t nid, unsigned int threshold)
{
@@ -632,9 +730,13 @@ lnet_post_send_locked(struct lnet_msg *msg, int do_send)
the_lnet.ln_counters[cpt]->drop_length += msg->msg_len;
lnet_net_unlock(cpt);
if (msg->msg_txpeer)
- atomic_inc(&msg->msg_txpeer->lpni_stats.drop_count);
+ lnet_incr_stats(&msg->msg_txpeer->lpni_stats,
+ msg->msg_type,
+ LNET_STATS_TYPE_DROP);
if (msg->msg_txni)
- atomic_inc(&msg->msg_txni->ni_stats.drop_count);
+ lnet_incr_stats(&msg->msg_txni->ni_stats,
+ msg->msg_type,
+ LNET_STATS_TYPE_DROP);
CNETERR("Dropping message for %s: peer not alive\n",
libcfs_id2str(msg->msg_target));
@@ -1859,9 +1961,11 @@ lnet_send(lnet_nid_t src_nid, struct lnet_msg *msg, lnet_nid_t rtr_nid)
}
void
-lnet_drop_message(struct lnet_ni *ni, int cpt, void *private, unsigned int nob)
+lnet_drop_message(struct lnet_ni *ni, int cpt, void *private, unsigned int nob,
+ __u32 msg_type)
{
lnet_net_lock(cpt);
+ lnet_incr_stats(&ni->ni_stats, msg_type, LNET_STATS_TYPE_DROP);
the_lnet.ln_counters[cpt]->drop_count++;
the_lnet.ln_counters[cpt]->drop_length += nob;
lnet_net_unlock(cpt);
@@ -2510,7 +2614,7 @@ lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid,
lnet_finalize(msg, rc);
drop:
- lnet_drop_message(ni, cpt, private, payload_length);
+ lnet_drop_message(ni, cpt, private, payload_length, type);
return 0;
}
EXPORT_SYMBOL(lnet_parse);
@@ -2546,7 +2650,8 @@ lnet_drop_delayed_msg_list(struct list_head *head, char *reason)
* until that's done
*/
lnet_drop_message(msg->msg_rxni, msg->msg_rx_cpt,
- msg->msg_private, msg->msg_len);
+ msg->msg_private, msg->msg_len,
+ msg->msg_type);
/*
* NB: message will not generate event because w/o attached MD,
* but we still should give error code so lnet_msg_decommit()
@@ -2786,6 +2891,7 @@ lnet_create_reply_msg(struct lnet_ni *ni, struct lnet_msg *getmsg)
cpt = lnet_cpt_of_nid(peer_id.nid, ni);
lnet_net_lock(cpt);
+ lnet_incr_stats(&ni->ni_stats, LNET_MSG_GET, LNET_STATS_TYPE_DROP);
the_lnet.ln_counters[cpt]->drop_count++;
the_lnet.ln_counters[cpt]->drop_length += getmd->md_length;
lnet_net_unlock(cpt);
diff --git a/drivers/staging/lustre/lnet/lnet/lib-msg.c b/drivers/staging/lustre/lnet/lnet/lib-msg.c
index db13d01d366f..7f58cfe25bc2 100644
--- a/drivers/staging/lustre/lnet/lnet/lib-msg.c
+++ b/drivers/staging/lustre/lnet/lnet/lib-msg.c
@@ -219,9 +219,13 @@ lnet_msg_decommit_tx(struct lnet_msg *msg, int status)
incr_stats:
if (msg->msg_txpeer)
- atomic_inc(&msg->msg_txpeer->lpni_stats.send_count);
+ lnet_incr_stats(&msg->msg_txpeer->lpni_stats,
+ msg->msg_type,
+ LNET_STATS_TYPE_SEND);
if (msg->msg_txni)
- atomic_inc(&msg->msg_txni->ni_stats.send_count);
+ lnet_incr_stats(&msg->msg_txni->ni_stats,
+ msg->msg_type,
+ LNET_STATS_TYPE_SEND);
out:
lnet_return_tx_credits_locked(msg);
msg->msg_tx_committed = 0;
@@ -280,9 +284,13 @@ lnet_msg_decommit_rx(struct lnet_msg *msg, int status)
incr_stats:
if (msg->msg_rxpeer)
- atomic_inc(&msg->msg_rxpeer->lpni_stats.recv_count);
+ lnet_incr_stats(&msg->msg_rxpeer->lpni_stats,
+ msg->msg_type,
+ LNET_STATS_TYPE_RECV);
if (msg->msg_rxni)
- atomic_inc(&msg->msg_rxni->ni_stats.recv_count);
+ lnet_incr_stats(&msg->msg_rxni->ni_stats,
+ msg->msg_type,
+ LNET_STATS_TYPE_RECV);
if (ev->type == LNET_EVENT_PUT || ev->type == LNET_EVENT_REPLY)
counters->recv_length += msg->msg_wanted;
diff --git a/drivers/staging/lustre/lnet/lnet/net_fault.c b/drivers/staging/lustre/lnet/lnet/net_fault.c
index 3841bac1aa0a..e2c746855da9 100644
--- a/drivers/staging/lustre/lnet/lnet/net_fault.c
+++ b/drivers/staging/lustre/lnet/lnet/net_fault.c
@@ -632,7 +632,8 @@ delayed_msg_process(struct list_head *msg_list, bool drop)
}
}
- lnet_drop_message(ni, cpt, msg->msg_private, msg->msg_len);
+ lnet_drop_message(ni, cpt, msg->msg_private, msg->msg_len,
+ msg->msg_type);
lnet_finalize(msg, rc);
}
}
diff --git a/drivers/staging/lustre/lnet/lnet/peer.c b/drivers/staging/lustre/lnet/lnet/peer.c
index 95f72ae39a89..03c1c34517e4 100644
--- a/drivers/staging/lustre/lnet/lnet/peer.c
+++ b/drivers/staging/lustre/lnet/lnet/peer.c
@@ -3301,6 +3301,7 @@ int lnet_get_peer_info(lnet_nid_t *primary_nid, lnet_nid_t *nidp,
void __user *bulk)
{
struct lnet_ioctl_element_stats *lpni_stats;
+ struct lnet_ioctl_element_msg_stats *lpni_msg_stats;
struct lnet_peer_ni_credit_info *lpni_info;
struct lnet_peer_ni *lpni;
struct lnet_peer *lp;
@@ -3315,7 +3316,8 @@ int lnet_get_peer_info(lnet_nid_t *primary_nid, lnet_nid_t *nidp,
goto out;
}
- size = sizeof(nid) + sizeof(*lpni_info) + sizeof(*lpni_stats);
+ size = sizeof(nid) + sizeof(*lpni_info) + sizeof(*lpni_stats)
+ + sizeof(*lpni_msg_stats);
size *= lp->lp_nnis;
if (size > *sizep) {
*sizep = size;
@@ -3337,13 +3339,17 @@ int lnet_get_peer_info(lnet_nid_t *primary_nid, lnet_nid_t *nidp,
lpni_stats = kzalloc(sizeof(*lpni_stats), GFP_KERNEL);
if (!lpni_stats)
goto out_free_info;
+ lpni_msg_stats = kzalloc(sizeof(*lpni_msg_stats), GFP_KERNEL);
+ if (!lpni_msg_stats)
+ goto out_free_stats;
+
lpni = NULL;
rc = -EFAULT;
while ((lpni = lnet_get_next_peer_ni_locked(lp, NULL, lpni)) != NULL) {
nid = lpni->lpni_nid;
if (copy_to_user(bulk, &nid, sizeof(nid)))
- goto out_free_stats;
+ goto out_free_msg_stats;
bulk += sizeof(nid);
memset(lpni_info, 0, sizeof(*lpni_info));
@@ -3362,22 +3368,28 @@ int lnet_get_peer_info(lnet_nid_t *primary_nid, lnet_nid_t *nidp,
lpni_info->cr_peer_min_tx_credits = lpni->lpni_mintxcredits;
lpni_info->cr_peer_tx_qnob = lpni->lpni_txqnob;
if (copy_to_user(bulk, lpni_info, sizeof(*lpni_info)))
- goto out_free_stats;
+ goto out_free_msg_stats;
bulk += sizeof(*lpni_info);
memset(lpni_stats, 0, sizeof(*lpni_stats));
lpni_stats->iel_send_count =
- atomic_read(&lpni->lpni_stats.send_count);
+ lnet_sum_stats(&lpni->lpni_stats, LNET_STATS_TYPE_SEND);
lpni_stats->iel_recv_count =
- atomic_read(&lpni->lpni_stats.recv_count);
+ lnet_sum_stats(&lpni->lpni_stats, LNET_STATS_TYPE_RECV);
lpni_stats->iel_drop_count =
- atomic_read(&lpni->lpni_stats.drop_count);
+ lnet_sum_stats(&lpni->lpni_stats, LNET_STATS_TYPE_DROP);
if (copy_to_user(bulk, lpni_stats, sizeof(*lpni_stats)))
- goto out_free_stats;
+ goto out_free_msg_stats;
bulk += sizeof(*lpni_stats);
+ lnet_usr_translate_stats(lpni_msg_stats, &lpni->lpni_stats);
+ if (copy_to_user(bulk, lpni_msg_stats, sizeof(*lpni_msg_stats)))
+ goto out_free_msg_stats;
+ bulk += sizeof(*lpni_msg_stats);
}
rc = 0;
+out_free_msg_stats:
+ kfree(lpni_msg_stats);
out_free_stats:
kfree(lpni_stats);
out_free_info:
More information about the lustre-devel
mailing list