[lustre-devel] [PATCH 16/25] lustre: o2iblnd: kill timedout txs from ibp_tx_queue

James Simmons jsimmons at infradead.org
Tue Sep 25 19:48:08 PDT 2018


From: Sergey Cheremencev <c17829 at cray.com>

Sometimes a connection cannot be established for a long time
because of repeated rejections, which produces a cycle of
reconnection attempts. Unlike the connection, the peer is not
removed on each iteration, so until a connection is finally
established the txs stay queued in peer->ibp_tx_queue. This
patch adds tx_deadline checking for the txs on the peer's
ibp_tx_queue, so that timed-out txs are completed with
-ETIMEDOUT.

Signed-off-by: Sergey Cheremencev <c17829 at cray.com>
WC-bug-id: https://jira.whamcloud.com/browse/LU-9094
Seagate-bug-id: MRP-4056
Reviewed-on: https://review.whamcloud.com/25376
Reviewed-by: Doug Oucharek <dougso at me.com>
Reviewed-by: Amir Shehata <ashehata at whamcloud.com>
Reviewed-by: Oleg Drokin <green at whamcloud.com>
Signed-off-by: James Simmons <jsimmons at infradead.org>
---
 .../staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c    | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c
index dc71554..3218999 100644
--- a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c
+++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c
@@ -3159,8 +3159,10 @@ static int kiblnd_resolve_addr(struct rdma_cm_id *cmid,
 {
 	LIST_HEAD(closes);
 	LIST_HEAD(checksends);
+	LIST_HEAD(timedout_txs);
 	struct list_head *peers = &kiblnd_data.kib_peers[idx];
 	struct kib_peer_ni *peer_ni;
+	struct kib_tx *tx_tmp, *tx;
 	struct kib_conn *conn;
 	unsigned long flags;
 
@@ -3169,9 +3171,19 @@ static int kiblnd_resolve_addr(struct rdma_cm_id *cmid,
 	 * RDMAs to time out, so we just use a shared lock while we
 	 * take a look...
 	 */
-	read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+	write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
 
 	list_for_each_entry(peer_ni, peers, ibp_list) {
+		/* Check tx_deadline */
+		list_for_each_entry_safe(tx, tx_tmp, &peer_ni->ibp_tx_queue, tx_list) {
+			if (ktime_compare(ktime_get(), tx->tx_deadline) >= 0) {
+				CWARN("Timed out tx for %s: %lld seconds\n",
+				      libcfs_nid2str(peer_ni->ibp_nid),
+				      ktime_ms_delta(ktime_get(),
+						     tx->tx_deadline) / MSEC_PER_SEC);
+				list_move(&tx->tx_list, &timedout_txs);
+			}
+		}
 
 		list_for_each_entry(conn, &peer_ni->ibp_conns, ibc_list) {
 			int timedout;
@@ -3207,7 +3219,10 @@ static int kiblnd_resolve_addr(struct rdma_cm_id *cmid,
 		}
 	}
 
-	read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+	write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+	if (!list_empty(&timedout_txs))
+		kiblnd_txlist_done(&timedout_txs, -ETIMEDOUT);
 
 	/*
 	 * Handle timeout by closing the whole
-- 
1.8.3.1



More information about the lustre-devel mailing list