[lustre-devel] [PATCH 137/151] lnet: reduce discovery timeout
James Simmons
jsimmons at infradead.org
Mon Sep 30 11:56:36 PDT 2019
From: Amir Shehata <ashehata at whamcloud.com>
Discovery protocol sends a ping (GET) to the peer and expects a
REPLY back with the interface information. Discovery uses the
DEFAULT_PEER_TIMEOUT, which is 180s. This could lead to an extended
delay during mounting if the OSTs are down or if the ping fails for
any reason.
This patch adds a module parameter lnet_transaction_timeout which
defaults to 5 seconds. lnet_transaction_timeout is used for the
discovery timeout.
WC-bug-id: https://jira.whamcloud.com/browse/LU-10800
Lustre-commit: 1cf929df259a ("LU-10800 lnet: reduce discovery timeout")
Signed-off-by: Amir Shehata <ashehata at whamcloud.com>
Reviewed-on: https://review.whamcloud.com/31663
Reviewed-by: Andreas Dilger <adilger at whamcloud.com>
Reviewed-by: Sonia Sharma <sharmaso at whamcloud.com>
Reviewed-by: Dmitry Eremin <dmitry.eremin at intel.com>
Reviewed-by: Oleg Drokin <green at whamcloud.com>
Signed-off-by: James Simmons <jsimmons at infradead.org>
---
include/linux/lnet/lib-lnet.h | 1 +
net/lnet/lnet/api-ni.c | 44 +++++++++++++++++++++++++++++++++++++++++++
net/lnet/lnet/peer.c | 16 ++++++++--------
3 files changed, 53 insertions(+), 8 deletions(-)
diff --git a/include/linux/lnet/lib-lnet.h b/include/linux/lnet/lib-lnet.h
index 3d7867f..22c6152 100644
--- a/include/linux/lnet/lib-lnet.h
+++ b/include/linux/lnet/lib-lnet.h
@@ -477,6 +477,7 @@ struct lnet_ni *
bool lnet_is_ni_healthy_locked(struct lnet_ni *ni);
struct lnet_net *lnet_get_net_locked(u32 net_id);
+extern unsigned int lnet_transaction_timeout;
extern unsigned int lnet_numa_range;
extern unsigned int lnet_peer_discovery_disabled;
extern int portal_rotor;
diff --git a/net/lnet/lnet/api-ni.c b/net/lnet/lnet/api-ni.c
index fc4fe5d..8be3354 100644
--- a/net/lnet/lnet/api-ni.c
+++ b/net/lnet/lnet/api-ni.c
@@ -92,6 +92,13 @@ struct lnet the_lnet = {
MODULE_PARM_DESC(lnet_peer_discovery_disabled,
"Set to 1 to disable peer discovery on this node.");
+unsigned int lnet_transaction_timeout = 5;
+static int transaction_to_set(const char *val, const struct kernel_param *kp);
+module_param_call(lnet_transaction_timeout, transaction_to_set, param_get_int,
+ &lnet_transaction_timeout, 0444);
+MODULE_PARM_DESC(lnet_transaction_timeout,
+ "Time in seconds to wait for a REPLY or an ACK");
+
/*
* This sequence number keeps track of how many times DLC was used to
* update the local NIs. It is incremented when a NI is added or
@@ -158,6 +165,43 @@ static int lnet_discover(struct lnet_process_id id, u32 force,
}
static int
+transaction_to_set(const char *val, const struct kernel_param *kp)
+{
+ unsigned int *transaction_to = (unsigned int *)kp->arg;
+ unsigned long value;
+ int rc;
+
+ rc = kstrtoul(val, 0, &value);
+ if (rc) {
+ CERROR("Invalid module parameter value for 'lnet_transaction_timeout'\n");
+ return rc;
+ }
+
+ /* The purpose of locking the api_mutex here is to ensure that
+ * the correct value ends up stored properly.
+ */
+ mutex_lock(&the_lnet.ln_api_mutex);
+
+ if (value == 0) {
+ mutex_unlock(&the_lnet.ln_api_mutex);
+ CERROR("Invalid value for lnet_transaction_timeout (%lu).\n",
+ value);
+ return -EINVAL;
+ }
+
+ if (value == *transaction_to) {
+ mutex_unlock(&the_lnet.ln_api_mutex);
+ return 0;
+ }
+
+ *transaction_to = value;
+
+ mutex_unlock(&the_lnet.ln_api_mutex);
+
+ return 0;
+}
+
+static int
intf_max_set(const char *val, const struct kernel_param *kp)
{
int value, rc;
diff --git a/net/lnet/lnet/peer.c b/net/lnet/lnet/peer.c
index e2f8c28..1534ab2 100644
--- a/net/lnet/lnet/peer.c
+++ b/net/lnet/lnet/peer.c
@@ -2942,7 +2942,7 @@ static int lnet_peer_rediscover(struct lnet_peer *lp)
* obsessively re-check the clock. The oldest discovery request will
* be at the head of the queue.
*/
-static struct lnet_peer *lnet_peer_dc_timed_out(time64_t now)
+static struct lnet_peer *lnet_peer_get_dc_timed_out(time64_t now)
{
struct lnet_peer *lp;
@@ -2950,7 +2950,7 @@ static struct lnet_peer *lnet_peer_dc_timed_out(time64_t now)
return NULL;
lp = list_first_entry(&the_lnet.ln_dc_working,
struct lnet_peer, lp_dc_list);
- if (now < lp->lp_last_queued + DEFAULT_PEER_TIMEOUT)
+ if (now < lp->lp_last_queued + lnet_transaction_timeout)
return NULL;
return lp;
}
@@ -2961,7 +2961,7 @@ static struct lnet_peer *lnet_peer_dc_timed_out(time64_t now)
* lnet_discovery_event_handler() will proceed from here and complete
* the cleanup.
*/
-static void lnet_peer_discovery_timeout(struct lnet_peer *lp)
+static void lnet_peer_cancel_discovery(struct lnet_peer *lp)
{
struct lnet_handle_md ping_mdh;
struct lnet_handle_md push_mdh;
@@ -3010,7 +3010,7 @@ static int lnet_peer_discovery_wait_for_work(void)
break;
if (!list_empty(&the_lnet.ln_msg_resend))
break;
- if (lnet_peer_dc_timed_out(ktime_get_real_seconds()))
+ if (lnet_peer_get_dc_timed_out(ktime_get_real_seconds()))
break;
lnet_net_unlock(cpt);
@@ -3177,14 +3177,14 @@ static int lnet_peer_discovery(void *arg)
* taking too long. Move all that are found to the
* ln_dc_expired queue and time out any pending
* Ping or Push. We have to drop the lnet_net_lock
- * in the loop because lnet_peer_discovery_timeout()
+ * in the loop because lnet_peer_cancel_discovery()
* calls LNetMDUnlink().
*/
now = ktime_get_real_seconds();
- while ((lp = lnet_peer_dc_timed_out(now)) != NULL) {
+ while ((lp = lnet_peer_get_dc_timed_out(now)) != NULL) {
list_move(&lp->lp_dc_list, &the_lnet.ln_dc_expired);
lnet_net_unlock(LNET_LOCK_EX);
- lnet_peer_discovery_timeout(lp);
+ lnet_peer_cancel_discovery(lp);
lnet_net_lock(LNET_LOCK_EX);
}
@@ -3208,7 +3208,7 @@ static int lnet_peer_discovery(void *arg)
struct lnet_peer, lp_dc_list);
list_move(&lp->lp_dc_list, &the_lnet.ln_dc_expired);
lnet_net_unlock(LNET_LOCK_EX);
- lnet_peer_discovery_timeout(lp);
+ lnet_peer_cancel_discovery(lp);
lnet_net_lock(LNET_LOCK_EX);
}
lnet_net_unlock(LNET_LOCK_EX);
--
1.8.3.1
More information about the lustre-devel
mailing list