Blame SOURCES/0044-multipathd-Don-t-keep-starting-TUR-threads-if-they-a.patch

aebebb
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
aebebb
From: Benjamin Marzinski <bmarzins@redhat.com>
aebebb
Date: Tue, 29 Mar 2022 22:22:10 -0500
aebebb
Subject: [PATCH] multipathd: Don't keep starting TUR threads, if they always
aebebb
 hang.
aebebb
aebebb
If a tur thread hangs, multipathd was simply creating a new thread, and
aebebb
assuming that the old thread would get cleaned up eventually. I have
aebebb
seen a case recently where there were 26000 multipathd threads on a
aebebb
system, all stuck trying to send TUR commands to path devices. The root
aebebb
cause of the issue was a scsi kernel issue, but it shows that the way
aebebb
multipathd currently deals with stuck threads could use some refinement.
aebebb
aebebb
Now, when one tur thread hangs, multipathd will act as it did before.
aebebb
If a second one in a row hangs, multipathd will instead wait for it to
aebebb
complete before starting another thread. Once the thread completes, the
aebebb
count is reset.
aebebb
aebebb
Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com>
aebebb
Reviewed-by: Martin Wilck 
aebebb
---
aebebb
 libmultipath/checkers/tur.c | 23 +++++++++++++++++++++--
aebebb
 1 file changed, 21 insertions(+), 2 deletions(-)
aebebb
aebebb
diff --git a/libmultipath/checkers/tur.c b/libmultipath/checkers/tur.c
aebebb
index a4b4a213..d82f7dbc 100644
aebebb
--- a/libmultipath/checkers/tur.c
aebebb
+++ b/libmultipath/checkers/tur.c
aebebb
@@ -27,6 +27,7 @@
aebebb
 
aebebb
 #define TUR_CMD_LEN 6
aebebb
 #define HEAVY_CHECK_COUNT       10
aebebb
+#define MAX_NR_TIMEOUTS 1
aebebb
 
aebebb
 enum {
aebebb
 	MSG_TUR_RUNNING = CHECKER_FIRST_MSGID,
aebebb
@@ -55,6 +56,7 @@ struct tur_checker_context {
aebebb
 	int holders; /* uatomic access only */
aebebb
 	int msgid;
aebebb
 	struct checker_context ctx;
aebebb
+	unsigned int nr_timeouts;
aebebb
 };
aebebb
 
aebebb
 int libcheck_init (struct checker * c)
aebebb
@@ -359,8 +361,23 @@ int libcheck_check(struct checker * c)
aebebb
 		}
aebebb
 	} else {
aebebb
 		if (uatomic_read(&ct->holders) > 1) {
aebebb
+			/* The thread has been cancelled but hasn't quit. */
aebebb
+			if (ct->nr_timeouts == MAX_NR_TIMEOUTS) {
aebebb
+				condlog(2, "%d:%d : waiting for stalled tur thread to finish",
aebebb
+					major(ct->devt), minor(ct->devt));
aebebb
+				ct->nr_timeouts++;
aebebb
+			}
aebebb
 			/*
aebebb
-			 * The thread has been cancelled but hasn't quit.
aebebb
+			 * Don't start new threads until the last one has
aebebb
+			 * finished.
aebebb
+			 */
aebebb
+			if (ct->nr_timeouts > MAX_NR_TIMEOUTS) {
aebebb
+				c->msgid = MSG_TUR_TIMEOUT;
aebebb
+				return PATH_TIMEOUT;
aebebb
+			}
aebebb
+			ct->nr_timeouts++;
aebebb
+			/*
aebebb
+			 * Start a new thread while the old one is stalled.
aebebb
 			 * We have to prevent it from interfering with the new
aebebb
 			 * thread. We create a new context and leave the old
aebebb
 			 * one with the stale thread, hoping it will clean up
aebebb
@@ -376,13 +393,15 @@ int libcheck_check(struct checker * c)
aebebb
 			 */
aebebb
 			if (libcheck_init(c) != 0)
aebebb
 				return PATH_UNCHECKED;
aebebb
+			((struct tur_checker_context *)c->context)->nr_timeouts = ct->nr_timeouts;
aebebb
 
aebebb
 			if (!uatomic_sub_return(&ct->holders, 1))
aebebb
 				/* It did terminate, eventually */
aebebb
 				cleanup_context(ct);
aebebb
 
aebebb
 			ct = c->context;
aebebb
-		}
aebebb
+		} else
aebebb
+			ct->nr_timeouts = 0;
aebebb
 		/* Start new TUR checker */
aebebb
 		pthread_mutex_lock(&ct->lock);
aebebb
 		tur_status = ct->state = PATH_PENDING;