[lustre-devel] [PATCH 24/25] lustre: socklnd: propagate errors on send failure
James Simmons
jsimmons at infradead.org
Tue Sep 25 19:48:16 PDT 2018
From: Olaf Weber <olaf.weber at hpe.com>
When an attempt to send a message fails, for example because no
connection could be established with the remote address, socklnd
drops the message. For a PUT or REPLY message with non-zero
payload, ksocknal_tx_done() calls lnet_finalize() with -EIO
as the error code. But for an ACK or GET message there is no
payload, and lnet_finalize() is called with 0 (no error) as the
error code. This leaves upper layers to rely on other means to
determine that sending the message did actually fail, and that
(for example) no REPLY will ever answer a failed GET.
Add an error code parameter to ksocknal_tx_done().
In ksocknal_txlist_done() change the 0/1 'error' indicator to be
an actual error code that is passed on to ksocknal_tx_done().
Update the callers of ksocknal_txlist_done() to pass in the error
code if they have encountered an error.
Signed-off-by: Olaf Weber <olaf.weber at hpe.com>
WC-bug-id: https://jira.whamcloud.com/browse/LU-9119
Reviewed-on: https://review.whamcloud.com/26691
Reviewed-by: Doug Oucharek <dougso at me.com>
Reviewed-by: Oleg Drokin <green at whamcloud.com>
Signed-off-by: James Simmons <jsimmons at infradead.org>
---
drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c | 11 +++++++++--
drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h | 4 ++--
drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c | 15 +++++++--------
3 files changed, 18 insertions(+), 12 deletions(-)
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c
index 1a49f5e..b2f0148 100644
--- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c
+++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c
@@ -607,7 +607,7 @@ struct ksock_peer *
write_unlock_bh(&ksocknal_data.ksnd_global_lock);
- ksocknal_txlist_done(ni, &zombies, 1);
+ ksocknal_txlist_done(ni, &zombies, -ENETDOWN);
return rc;
}
@@ -1023,6 +1023,7 @@ struct ksock_peer *
int cpt;
struct ksock_tx *tx;
struct ksock_tx *txtmp;
+ int rc2;
int rc;
int active;
char *warn = NULL;
@@ -1406,7 +1407,13 @@ struct ksock_peer *
write_unlock_bh(global_lock);
}
- ksocknal_txlist_done(ni, &zombies, 1);
+ /*
+ * If we get here without an error code, just use -EALREADY.
+ * Depending on how we got here, the error may be positive
+ * or negative. Normalize the value for ksocknal_txlist_done().
+ */
+ rc2 = (rc == 0 ? -EALREADY : (rc > 0 ? -rc : rc));
+ ksocknal_txlist_done(ni, &zombies, rc2);
ksocknal_peer_decref(peer_ni);
failed_1:
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h
index 95ca2aa..82e3523 100644
--- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h
+++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h
@@ -582,14 +582,14 @@ struct ksock_proto {
}
void ksocknal_tx_prep(struct ksock_conn *, struct ksock_tx *tx);
-void ksocknal_tx_done(struct lnet_ni *ni, struct ksock_tx *tx);
+void ksocknal_tx_done(struct lnet_ni *ni, struct ksock_tx *tx, int error);
static inline void
ksocknal_tx_decref(struct ksock_tx *tx)
{
LASSERT(atomic_read(&tx->tx_refcount) > 0);
if (atomic_dec_and_test(&tx->tx_refcount))
- ksocknal_tx_done(NULL, tx);
+ ksocknal_tx_done(NULL, tx, 0);
}
static inline void
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c
index 73321a7..dc9a129 100644
--- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c
+++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c
@@ -328,19 +328,18 @@ struct ksock_tx *
}
void
-ksocknal_tx_done(struct lnet_ni *ni, struct ksock_tx *tx)
+ksocknal_tx_done(struct lnet_ni *ni, struct ksock_tx *tx, int rc)
{
struct lnet_msg *lnetmsg = tx->tx_lnetmsg;
- int rc = (!tx->tx_resid && !tx->tx_zc_aborted) ? 0 : -EIO;
LASSERT(ni || tx->tx_conn);
+ if (!rc && (tx->tx_resid != 0 || tx->tx_zc_aborted))
+ rc = -EIO;
+
if (tx->tx_conn)
ksocknal_conn_decref(tx->tx_conn);
- if (!ni && tx->tx_conn)
- ni = tx->tx_conn->ksnc_peer->ksnp_ni;
-
ksocknal_free_tx(tx);
if (lnetmsg) /* KSOCK_MSG_NOOP go without lnetmsg */
lnet_finalize(lnetmsg, rc);
@@ -367,7 +366,7 @@ struct ksock_tx *
list_del(&tx->tx_list);
LASSERT(atomic_read(&tx->tx_refcount) == 1);
- ksocknal_tx_done(ni, tx);
+ ksocknal_tx_done(ni, tx, error);
}
}
@@ -1923,7 +1922,7 @@ void ksocknal_write_callback(struct ksock_conn *conn)
write_unlock_bh(&ksocknal_data.ksnd_global_lock);
ksocknal_peer_failed(peer_ni);
- ksocknal_txlist_done(peer_ni->ksnp_ni, &zombies, 1);
+ ksocknal_txlist_done(peer_ni->ksnp_ni, &zombies, rc);
return 0;
}
@@ -2268,7 +2267,7 @@ void ksocknal_write_callback(struct ksock_conn *conn)
write_unlock_bh(&ksocknal_data.ksnd_global_lock);
- ksocknal_txlist_done(peer_ni->ksnp_ni, &stale_txs, 1);
+ ksocknal_txlist_done(peer_ni->ksnp_ni, &stale_txs, -ETIMEDOUT);
}
static int
--
1.8.3.1
More information about the lustre-devel
mailing list