|
 |
850067 |
From b9f5c290b7dedd0a677cdfc25db7dd111245a745 Mon Sep 17 00:00:00 2001
|
|
 |
850067 |
From: Christine Caulfield <ccaulfie@redhat.com>
|
|
 |
850067 |
Date: Thu, 18 Jun 2015 09:57:59 +0100
|
|
 |
850067 |
Subject: [PATCH] votequorum: Fix auto_tie_breaker behaviour in odd-sized clusters
|
|
 |
850067 |
|
|
 |
850067 |
auto_tie_breaker can behave incorrectly in the case of a cluster
|
|
 |
850067 |
with an odd number of nodes. It's possible for a partition to
|
|
 |
850067 |
have quorum while the other side has the ATB node, and both will
|
|
 |
850067 |
continue working. (Of course in a properly configured cluster one side
|
|
 |
850067 |
will be fenced, but that becomes an indeterminate race -- just what ATB
|
|
 |
850067 |
is supposed to avoid).
|
|
 |
850067 |
|
|
 |
850067 |
This patch prevents ATB from running in a partition if the 'other'
|
|
 |
850067 |
partition might have quorum, and also mandates the use of wait_for_all
|
|
 |
850067 |
in clusters with an odd number of nodes so that a quorate partition
|
|
 |
850067 |
cannot start services or fence an existing partition with the tie
|
|
 |
850067 |
breaker node.
|
|
 |
850067 |
|
|
 |
850067 |
Signed-off-by: Christine Caulfield <ccaulfie@redhat.com>
|
|
 |
850067 |
Reviewed-by: Jan Friesse <jfriesse@redhat.com>
|
|
 |
850067 |
---
|
|
 |
850067 |
exec/votequorum.c | 31 +++++++++++++++++++++++++++++++
|
|
 |
850067 |
1 files changed, 31 insertions(+), 0 deletions(-)
|
|
 |
850067 |
|
|
 |
850067 |
diff --git a/exec/votequorum.c b/exec/votequorum.c
|
|
 |
850067 |
index f6faa25..62c8cf3 100644
|
|
 |
850067 |
--- a/exec/votequorum.c
|
|
 |
850067 |
+++ b/exec/votequorum.c
|
|
 |
850067 |
@@ -1011,7 +1011,10 @@ static void are_we_quorate(unsigned int total_votes)
|
|
 |
850067 |
}
|
|
 |
850067 |
|
|
 |
850067 |
if ((auto_tie_breaker != ATB_NONE) &&
|
|
 |
850067 |
+ /* Must be a half (or half-1) split */
|
|
 |
850067 |
(total_votes == (us->expected_votes / 2)) &&
|
|
 |
850067 |
+ /* If the 'other' partition in a split might have quorum then we can't run ATB */
|
|
 |
850067 |
+ (previous_quorum_members_entries - quorum_members_entries < quorum) &&
|
|
 |
850067 |
(check_auto_tie_breaker() == 1)) {
|
|
 |
850067 |
quorate = 1;
|
|
 |
850067 |
}
|
|
 |
850067 |
@@ -1331,6 +1334,34 @@ static char *votequorum_readconfig(int runtime)
|
|
 |
850067 |
log_printf(LOGSYS_LEVEL_CRIT, "two_node has been disabled, please fix your corosync.conf");
|
|
 |
850067 |
two_node = 0;
|
|
 |
850067 |
}
|
|
 |
850067 |
+
|
|
 |
850067 |
+ /* If ATB is set and the cluster has an odd number of nodes then wait_for_all needs
|
|
 |
850067 |
+ * to be set so that an isolated half+1 without the tie breaker node
|
|
 |
850067 |
+ * does not have quorum on reboot.
|
|
 |
850067 |
+ */
|
|
 |
850067 |
+ if ((auto_tie_breaker != ATB_NONE) && (node_expected_votes % 2) &&
|
|
 |
850067 |
+ (!wait_for_all)) {
|
|
 |
850067 |
+ if (last_man_standing) {
|
|
 |
850067 |
+ /* if LMS is set too, it's a fatal configuration error. We can't dictate to the user what
|
|
 |
850067 |
+ * they might want so we'll just quit.
|
|
 |
850067 |
+ */
|
|
 |
850067 |
+ log_printf(LOGSYS_LEVEL_CRIT, "auto_tie_breaker is set, the cluster has an odd number of nodes\n");
|
|
 |
850067 |
+ log_printf(LOGSYS_LEVEL_CRIT, "and last_man_standing is also set. With this situation a better\n");
|
|
 |
850067 |
+ log_printf(LOGSYS_LEVEL_CRIT, "solution would be to disable LMS, leave ATB enabled, and also\n");
|
|
 |
850067 |
+ log_printf(LOGSYS_LEVEL_CRIT, "enable wait_for_all (mandatory for ATB in odd-numbered clusters).\n");
|
|
 |
850067 |
+ log_printf(LOGSYS_LEVEL_CRIT, "Due to this ambiguity, corosync will fail to start. Please fix your corosync.conf\n");
|
|
 |
850067 |
+ error = (char *)"configuration error: auto_tie_breaker & last_man_standing not available in odd sized cluster";
|
|
 |
850067 |
+ goto out;
|
|
 |
850067 |
+ }
|
|
 |
850067 |
+ else {
|
|
 |
850067 |
+ log_printf(LOGSYS_LEVEL_CRIT, "auto_tie_breaker is set and the cluster has an odd number of nodes.\n");
|
|
 |
850067 |
+ log_printf(LOGSYS_LEVEL_CRIT, "wait_for_all needs to be set for this configuration but it is missing\n");
|
|
 |
850067 |
+ log_printf(LOGSYS_LEVEL_CRIT, "Therefore auto_tie_breaker has been disabled. Please fix your corosync.conf\n");
|
|
 |
850067 |
+ auto_tie_breaker = ATB_NONE;
|
|
 |
850067 |
+ icmap_set_uint32("runtime.votequorum.atb_type", auto_tie_breaker);
|
|
 |
850067 |
+ }
|
|
 |
850067 |
+ }
|
|
 |
850067 |
+
|
|
 |
850067 |
/*
|
|
 |
850067 |
* quorum device is not compatible with last_man_standing and auto_tie_breaker
|
|
 |
850067 |
* neither lms or atb can be set at runtime, so there is no need to check for
|
|
 |
850067 |
--
|
|
 |
850067 |
1.7.1
|
|
 |
850067 |
|