From ab8942f6260fde93824ed2a18e09e572b59ceb25 Mon Sep 17 00:00:00 2001
From: Christine Caulfield <ccaulfie@redhat.com>
Date: Fri, 12 Jun 2015 16:16:45 +0100
Subject: [PATCH] totemsrp: Improve logging of left/down nodes
This patch from Hideo Yamauchi improves the logging of
whether nodes leave the cluster cleanly or uncleanly,
making it easier to determine if a node was shut down
by the operator. There is also the possibility that a
LEAVE message could get missed (due to the node being
in flush state) so this can also make that clearer.
The modifications are as follows.
Change 1) Added a list to totemsrp that keeps track of LEAVE nodes.
Change 2) Added functions to register, look up and clear the LEAVE
nodes.
Change 3) Added the corresponding output to the log.
Change 4) Changed the output level of a log message.
Signed-off-by: Hideo Yamauchi <renayama19661014@ybb.ne.jp>
Reviewed-by: Christine Caulfield <ccaulfie@redhat.com>
Reviewed-by: Jan Friesse <jfriesse@redhat.com>
---
exec/totemsrp.c | 105 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
1 files changed, 104 insertions(+), 1 deletions(-)
diff --git a/exec/totemsrp.c b/exec/totemsrp.c
index 6357f5a..3aa61cc 100644
--- a/exec/totemsrp.c
+++ b/exec/totemsrp.c
@@ -316,6 +316,8 @@ struct totemsrp_instance {
struct srp_addr my_left_memb_list[PROCESSOR_COUNT_MAX];
+ unsigned int my_leave_memb_list[PROCESSOR_COUNT_MAX];
+
int my_proc_list_entries;
int my_failed_list_entries;
@@ -329,6 +331,8 @@ struct totemsrp_instance {
int my_deliver_memb_entries;
int my_left_memb_entries;
+
+ int my_leave_memb_entries;
struct memb_ring_id my_ring_id;
@@ -513,6 +517,8 @@ struct totemsrp_instance {
uint32_t threaded_mode_enabled;
uint32_t waiting_trans_ack;
+
+ int flushing;
void * token_recv_event_handle;
void * token_sent_event_handle;
@@ -1476,6 +1482,52 @@ static void memb_set_print (
}
}
#endif
+/*
+ * Forget every node recorded in the LEAVE list: reset the entry count
+ * and zero the whole list.
+ */
+static void my_leave_memb_clear(
+	struct totemsrp_instance *instance)
+{
+	instance->my_leave_memb_entries = 0;
+	memset(instance->my_leave_memb_list, 0,
+		sizeof(instance->my_leave_memb_list));
+}
+
+/*
+ * Look up nodeid in the LEAVE list.
+ *
+ * Returns nodeid when it is present in the list, 0 otherwise.
+ */
+static unsigned int my_leave_memb_match(
+	struct totemsrp_instance *instance,
+	unsigned int nodeid)
+{
+	int idx = 0;
+
+	while (idx < instance->my_leave_memb_entries) {
+		if (instance->my_leave_memb_list[idx] == nodeid) {
+			return nodeid;
+		}
+		idx++;
+	}
+	return 0;
+}
+
+/*
+ * Record nodeid in the LEAVE list.
+ *
+ * A nodeid that is already in the list is not added again. When the
+ * list is full the nodeid is dropped and a warning is logged.
+ */
+static void my_leave_memb_set(
+	struct totemsrp_instance *instance,
+	unsigned int nodeid)
+{
+	int i;
+
+	for (i = 0; i < instance->my_leave_memb_entries; i++) {
+		if (instance->my_leave_memb_list[i] == nodeid) {
+			/* Already recorded, keep the list free of duplicates */
+			return;
+		}
+	}
+
+	/* The list holds PROCESSOR_COUNT_MAX entries, so every slot is usable */
+	if (instance->my_leave_memb_entries < PROCESSOR_COUNT_MAX) {
+		instance->my_leave_memb_list[instance->my_leave_memb_entries] = nodeid;
+		instance->my_leave_memb_entries++;
+	} else {
+		/* nodeid is unsigned, so it is printed with %u */
+		log_printf (instance->totemsrp_log_level_warning,
+			"Cannot set LEAVE nodeid=%u", nodeid);
+	}
+}
+
static void *totemsrp_buffer_alloc (struct totemsrp_instance *instance)
{
@@ -1837,6 +1889,7 @@ static void memb_state_operational_enter (struct totemsrp_instance *instance)
unsigned int res;
char left_node_msg[1024];
char joined_node_msg[1024];
+ char failed_node_msg[1024];
instance->originated_orf_token = 0;
@@ -2008,15 +2061,30 @@ static void memb_state_operational_enter (struct totemsrp_instance *instance)
if (instance->my_left_memb_entries) {
int sptr = 0;
+ int sptr2 = 0;
sptr += snprintf(left_node_msg, sizeof(left_node_msg)-sptr, " left:");
for (i=0; i< instance->my_left_memb_entries; i++) {
sptr += snprintf(left_node_msg+sptr, sizeof(left_node_msg)-sptr, " %u", left_list[i]);
}
+ for (i=0; i< instance->my_left_memb_entries; i++) {
+ if (my_leave_memb_match(instance, left_list[i]) == 0) {
+ if (sptr2 == 0) {
+ sptr2 += snprintf(failed_node_msg, sizeof(failed_node_msg)-sptr2, " failed:");
+ }
+ sptr2 += snprintf(failed_node_msg+sptr2, sizeof(left_node_msg)-sptr2, " %u", left_list[i]);
+ }
+ }
+ if (sptr2 == 0) {
+ failed_node_msg[0] = '\0';
+ }
}
else {
left_node_msg[0] = '\0';
+ failed_node_msg[0] = '\0';
}
+ my_leave_memb_clear(instance);
+
log_printf (instance->totemsrp_log_level_debug,
"entering OPERATIONAL state.");
log_printf (instance->totemsrp_log_level_notice,
@@ -2025,6 +2093,13 @@ static void memb_state_operational_enter (struct totemsrp_instance *instance)
instance->my_ring_id.seq,
joined_node_msg,
left_node_msg);
+
+ if (strlen(failed_node_msg)) {
+ log_printf (instance->totemsrp_log_level_notice,
+ "Failed to receive the leave message.%s",
+ failed_node_msg);
+ }
+
instance->memb_state = MEMB_STATE_OPERATIONAL;
instance->stats.operational_entered++;
@@ -3597,8 +3672,9 @@ static int message_handler_orf_token (
return (0);
}
#endif
-
+ instance->flushing = 1;
totemrrp_recv_flush (instance->totemrrp_context);
+ instance->flushing = 0;
/*
* Determine if we should hold (in reality drop) the token
@@ -4130,6 +4206,32 @@ static void memb_join_process (
memb_set_print ("my_faillist", instance->my_failed_list, instance->my_failed_list_entries);
-*/
+ if (memb_join->header.type == MESSAGE_TYPE_MEMB_JOIN) {
+ if (instance->flushing) {
+ if (memb_join->header.nodeid == LEAVE_DUMMY_NODEID) {
+ log_printf (instance->totemsrp_log_level_warning,
+ "Discarding LEAVE message during flush, nodeid=%u",
+ memb_join->failed_list_entries > 0 ? failed_list[memb_join->failed_list_entries - 1 ].addr[0].nodeid : LEAVE_DUMMY_NODEID);
+ if (memb_join->failed_list_entries > 0) {
+ my_leave_memb_set(instance, failed_list[memb_join->failed_list_entries - 1 ].addr[0].nodeid);
+ }
+ } else {
+ log_printf (instance->totemsrp_log_level_warning,
+ "Discarding JOIN message during flush, nodeid=%d", memb_join->header.nodeid);
+ }
+ return;
+ } else {
+ if (memb_join->header.nodeid == LEAVE_DUMMY_NODEID) {
+ log_printf (instance->totemsrp_log_level_debug,
+ "Recieve LEAVE message from %u", memb_join->failed_list_entries > 0 ? failed_list[memb_join->failed_list_entries - 1 ].addr[0].nodeid : LEAVE_DUMMY_NODEID);
+ if (memb_join->failed_list_entries > 0) {
+ my_leave_memb_set(instance, failed_list[memb_join->failed_list_entries - 1 ].addr[0].nodeid);
+ }
+ }
+ }
+
+ }
+
if (memb_set_equal (proc_list,
memb_join->proc_list_entries,
instance->my_proc_list,
@@ -4573,6 +4675,7 @@ void main_deliver_fn (
return;
}
+
switch (message_header->type) {
case MESSAGE_TYPE_ORF_TOKEN:
instance->stats.orf_token_rx++;
--
1.7.1