[lustre-devel] [PATCH 090/622] lnet: add health statistics
James Simmons
jsimmons at infradead.org
Thu Feb 27 13:09:18 PST 2020
From: Amir Shehata <ashehata at whamcloud.com>
Add a health statistics block for each local and peer NI.
These statistics will be incremented when processing errors reported
by lnet_finalize().
WC-bug-id: https://jira.whamcloud.com/browse/LU-9120
Lustre-commit: 67908ab34371 ("LU-9120 lnet: add health statistics")
Signed-off-by: Amir Shehata <ashehata at whamcloud.com>
Reviewed-on: https://review.whamcloud.com/32775
Reviewed-by: Olaf Weber <olaf.weber at hpe.com>
Reviewed-by: Sonia Sharma <sharmaso at whamcloud.com>
Signed-off-by: James Simmons <jsimmons at infradead.org>
---
include/linux/lnet/lib-types.h | 18 +++++++++++++++
net/lnet/lnet/lib-msg.c | 52 ++++++++++++++++++++++++++++++++++++++++--
2 files changed, 68 insertions(+), 2 deletions(-)
diff --git a/include/linux/lnet/lib-types.h b/include/linux/lnet/lib-types.h
index 2b3e76a..e5d4128 100644
--- a/include/linux/lnet/lib-types.h
+++ b/include/linux/lnet/lib-types.h
@@ -338,6 +338,22 @@ struct lnet_element_stats {
struct lnet_comm_count el_drop_stats;
};
+struct lnet_health_local_stats { /* per local NI (ni_hstats) health counters */
+ atomic_t hlt_local_interrupt; /* LNET_MSG_STATUS_LOCAL_INTERRUPT */
+ atomic_t hlt_local_dropped; /* LNET_MSG_STATUS_LOCAL_DROPPED */
+ atomic_t hlt_local_aborted; /* LNET_MSG_STATUS_LOCAL_ABORTED */
+ atomic_t hlt_local_no_route; /* LNET_MSG_STATUS_LOCAL_NO_ROUTE */
+ atomic_t hlt_local_timeout; /* LNET_MSG_STATUS_LOCAL_TIMEOUT */
+ atomic_t hlt_local_error; /* LNET_MSG_STATUS_LOCAL_ERROR */
+};
+
+struct lnet_health_remote_stats { /* per peer NI (lpni_hstats) health counters */
+ atomic_t hlt_remote_dropped; /* LNET_MSG_STATUS_REMOTE_DROPPED */
+ atomic_t hlt_remote_timeout; /* LNET_MSG_STATUS_REMOTE_TIMEOUT */
+ atomic_t hlt_remote_error; /* LNET_MSG_STATUS_REMOTE_ERROR */
+ atomic_t hlt_network_timeout; /* LNET_MSG_STATUS_NETWORK_TIMEOUT */
+};
+
struct lnet_net {
/* chain on the ln_nets */
struct list_head net_list;
@@ -426,6 +442,7 @@ struct lnet_ni {
/* NI statistics */
struct lnet_element_stats ni_stats;
+ struct lnet_health_local_stats ni_hstats;
/* physical device CPT */
int ni_dev_cpt;
@@ -511,6 +528,7 @@ struct lnet_peer_ni {
struct list_head lpni_rtr_list;
/* statistics kept on each peer NI */
struct lnet_element_stats lpni_stats;
+ struct lnet_health_remote_stats lpni_hstats;
/* spin lock protecting credits and lpni_txq / lpni_rtrq */
spinlock_t lpni_lock;
/* # tx credits available */
diff --git a/net/lnet/lnet/lib-msg.c b/net/lnet/lnet/lib-msg.c
index 32d49e9..dc51a17 100644
--- a/net/lnet/lnet/lib-msg.c
+++ b/net/lnet/lnet/lib-msg.c
@@ -541,6 +541,54 @@
lnet_net_unlock(0);
}
+static void
+lnet_incr_hstats(struct lnet_msg *msg, enum lnet_msg_hstatus hstatus)
+{ /* Bump the health counter matching hstatus on msg's tx NI or tx peer NI. */
+ struct lnet_ni *ni = msg->msg_txni; /* used unchecked; the visible caller LASSERTs msg_txni first */
+ struct lnet_peer_ni *lpni = msg->msg_txpeer; /* may be NULL; tested before every use */
+
+ switch (hstatus) {
+ case LNET_MSG_STATUS_LOCAL_INTERRUPT: /* LOCAL_* statuses count against the local NI */
+ atomic_inc(&ni->ni_hstats.hlt_local_interrupt);
+ break;
+ case LNET_MSG_STATUS_LOCAL_DROPPED:
+ atomic_inc(&ni->ni_hstats.hlt_local_dropped);
+ break;
+ case LNET_MSG_STATUS_LOCAL_ABORTED:
+ atomic_inc(&ni->ni_hstats.hlt_local_aborted);
+ break;
+ case LNET_MSG_STATUS_LOCAL_NO_ROUTE:
+ atomic_inc(&ni->ni_hstats.hlt_local_no_route);
+ break;
+ case LNET_MSG_STATUS_LOCAL_TIMEOUT:
+ atomic_inc(&ni->ni_hstats.hlt_local_timeout);
+ break;
+ case LNET_MSG_STATUS_LOCAL_ERROR:
+ atomic_inc(&ni->ni_hstats.hlt_local_error);
+ break;
+ case LNET_MSG_STATUS_REMOTE_DROPPED: /* REMOTE_* and NETWORK_TIMEOUT count against the peer NI */
+ if (lpni) /* with no tx peer NI the event is simply not counted */
+ atomic_inc(&lpni->lpni_hstats.hlt_remote_dropped);
+ break;
+ case LNET_MSG_STATUS_REMOTE_ERROR:
+ if (lpni)
+ atomic_inc(&lpni->lpni_hstats.hlt_remote_error);
+ break;
+ case LNET_MSG_STATUS_REMOTE_TIMEOUT:
+ if (lpni)
+ atomic_inc(&lpni->lpni_hstats.hlt_remote_timeout);
+ break;
+ case LNET_MSG_STATUS_NETWORK_TIMEOUT:
+ if (lpni)
+ atomic_inc(&lpni->lpni_hstats.hlt_network_timeout);
+ break;
+ case LNET_MSG_STATUS_OK: /* no counter is touched for OK */
+ break;
+ default: /* any status value without a case above reaches LBUG() */
+ LBUG();
+ }
+}
+
/* Do a health check on the message:
* return -1 if we're not going to handle the error or
* if we've reached the maximum number of retries.
@@ -553,8 +601,6 @@
enum lnet_msg_hstatus hstatus = msg->msg_health_status;
bool lo = false;
- /* TODO: lnet_incr_hstats(hstatus); */
-
LASSERT(msg->msg_txni);
/* if we're sending to the LOLND then the msg_txpeer will not be
@@ -565,6 +611,8 @@
else
lo = true;
+ lnet_incr_hstats(msg, hstatus);
+
if (hstatus != LNET_MSG_STATUS_OK &&
ktime_compare(ktime_get(), msg->msg_deadline) >= 0)
return -1;
--
1.8.3.1
More information about the lustre-devel
mailing list