From b9f5c290b7dedd0a677cdfc25db7dd111245a745 Mon Sep 17 00:00:00 2001
From: Christine Caulfield <ccaulfie@redhat.com>
Date: Thu, 18 Jun 2015 09:57:59 +0100
Subject: [PATCH] votequorum: Fix auto_tie_breaker behaviour in odd-sized clusters
auto_tie_breaker can behave incorrectly in the case of a cluster
with an odd number of nodes. It's possible for a partition to
have quorum while the other side has the ATB node, and both will
continue working. (Of course, in a properly configured cluster one side
will be fenced, but which side wins becomes an indeterminate race, which
is exactly what ATB is supposed to avoid.)
This patch prevents ATB from running in a partition if the 'other'
partition might have quorum. It also mandates the use of wait_for_all
in clusters with an odd number of nodes, so that a quorate partition
that lacks the tie breaker node cannot start services or fence an
existing partition that contains it.
Signed-Off-By: Christine Caulfield <ccaulfie@redhat.com>
Reviewed-by: Jan Friesse <jfriesse@redhat.com>
---
exec/votequorum.c | 31 +++++++++++++++++++++++++++++++
1 files changed, 31 insertions(+), 0 deletions(-)
diff --git a/exec/votequorum.c b/exec/votequorum.c
index f6faa25..62c8cf3 100644
--- a/exec/votequorum.c
+++ b/exec/votequorum.c
@@ -1011,7 +1011,10 @@ static void are_we_quorate(unsigned int total_votes)
}
if ((auto_tie_breaker != ATB_NONE) &&
+ /* Must be a half (or half-1) split */
(total_votes == (us->expected_votes / 2)) &&
+ /* If the 'other' partition in a split might have quorum then we can't run ATB */
+ (previous_quorum_members_entries - quorum_members_entries < quorum) &&
(check_auto_tie_breaker() == 1)) {
quorate = 1;
}
@@ -1331,6 +1334,34 @@ static char *votequorum_readconfig(int runtime)
log_printf(LOGSYS_LEVEL_CRIT, "two_node has been disabled, please fix your corosync.conf");
two_node = 0;
}
+
+ /* If ATB is set and the cluster has an odd number of nodes then wait_for_all needs
+ * to be set so that an isolated half+1 without the tie breaker node
+ * does not have quorum on reboot.
+ */
+ if ((auto_tie_breaker != ATB_NONE) && (node_expected_votes % 2) &&
+ (!wait_for_all)) {
+ if (last_man_standing) {
+ /* if LMS is set too, it's a fatal configuration error. We can't dictate to the user what
+ * they might want so we'll just quit.
+ */
+ log_printf(LOGSYS_LEVEL_CRIT, "auto_tie_breaker is set, the cluster has an odd number of nodes\n");
+ log_printf(LOGSYS_LEVEL_CRIT, "and last_man_standing is also set. With this situation a better\n");
+ log_printf(LOGSYS_LEVEL_CRIT, "solution would be to disable LMS, leave ATB enabled, and also\n");
+ log_printf(LOGSYS_LEVEL_CRIT, "enable wait_for_all (mandatory for ATB in odd-numbered clusters).\n");
+ log_printf(LOGSYS_LEVEL_CRIT, "Due to this ambiguity, corosync will fail to start. Please fix your corosync.conf\n");
+ error = (char *)"configuration error: auto_tie_breaker & last_man_standing not available in odd sized cluster";
+ goto out;
+ }
+ else {
+ log_printf(LOGSYS_LEVEL_CRIT, "auto_tie_breaker is set and the cluster has an odd number of nodes.\n");
+ log_printf(LOGSYS_LEVEL_CRIT, "wait_for_all needs to be set for this configuration but it is missing\n");
+ log_printf(LOGSYS_LEVEL_CRIT, "Therefore auto_tie_breaker has been disabled. Please fix your corosync.conf\n");
+ auto_tie_breaker = ATB_NONE;
+ icmap_set_uint32("runtime.votequorum.atb_type", auto_tie_breaker);
+ }
+ }
+
/*
* quorum device is not compatible with last_man_standing and auto_tie_breaker
* neither lms or atb can be set at runtime, so there is no need to check for
--
1.7.1