Blame SOURCES/0053-Ticket-49463-After-cleanALLruv-there-is-a-flow-of-ke.patch

058656
From 0ac68e15a9a4048d3c1ad4519000996cd65fdefb Mon Sep 17 00:00:00 2001
058656
From: Thierry Bordaz <tbordaz@redhat.com>
058656
Date: Fri, 1 Dec 2017 16:23:11 +0100
058656
Subject: [PATCH] Ticket 49463 - After cleanALLruv, there is a flow of keep
058656
 alive DEL
058656
058656
Bug Description:
058656
	When cleanAllRuv is launched, it spawns cleanAllRuv on all replicas.
058656
	Each replica will clean its changelog and database RUV AND in addition
058656
	will DEL the keep alive entry of the target ReplicaID.
058656
	So for the same entry (keep alive) there will be as many DELs as there are replicas.
058656
058656
	This flow of DEL is useless as only one DEL is enough.
058656
	In addition because of https://pagure.io/389-ds-base/issue/49466, replication may
058656
	loop on each of those DELs.
058656
058656
Fix Description:
058656
	The fix is only to prevent the flow of DEL.
058656
	It adds a flag ('original_task') in the task payload.
058656
	The server receiving the task (replica_execute_cleanall_ruv_task) flags the
058656
	task as 'original_task'.
058656
	In contrast, the propagated cleanAllRuv (multimaster_extop_cleanruv) does
058656
	not flag the task as 'original_task'
058656
	Only the original task performs the DEL of the keep alive entry.
058656
	Note that the propagated payload (extop) is not changed. In a mixed version
058656
	environment, "old" servers will still DEL the keep alive entry, so the flow can still happen.
058656
058656
https://pagure.io/389-ds-base/issue/49466
058656
058656
Reviewed by: Ludwig Krispenz
058656
058656
Platforms tested: F23
058656
058656
Flag Day: no
058656
058656
Doc impact: no
058656
---
058656
 ldap/servers/plugins/replication/repl5.h           | 49 ++++++++++++----------
058656
 ldap/servers/plugins/replication/repl5_replica.c   | 21 ++++++++++
058656
 .../plugins/replication/repl5_replica_config.c     | 32 +++++++++++---
058656
 ldap/servers/plugins/replication/repl_extop.c      |  2 +
058656
 4 files changed, 76 insertions(+), 28 deletions(-)
058656
058656
diff --git a/ldap/servers/plugins/replication/repl5.h b/ldap/servers/plugins/replication/repl5.h
058656
index 4e206a0fc..e08fec752 100644
058656
--- a/ldap/servers/plugins/replication/repl5.h
058656
+++ b/ldap/servers/plugins/replication/repl5.h
058656
@@ -783,12 +783,37 @@ void multimaster_mtnode_construct_replicas(void);
058656
 
058656
 void multimaster_be_state_change(void *handle, char *be_name, int old_be_state, int new_be_state);
058656
 
058656
+#define CLEANRIDSIZ 64 /* maximum number for concurrent CLEANALLRUV tasks */
058656
+
058656
+typedef struct _cleanruv_data
058656
+{
058656
+    Object *repl_obj;
058656
+    Replica *replica;
058656
+    ReplicaId rid;
058656
+    Slapi_Task *task;
058656
+    struct berval *payload;
058656
+    CSN *maxcsn;
058656
+    char *repl_root;
058656
+    Slapi_DN *sdn;
058656
+    char *certify;
058656
+    char *force;
058656
+    PRBool original_task;
058656
+} cleanruv_data;
058656
+
058656
+typedef struct _cleanruv_purge_data
058656
+{
058656
+    int cleaned_rid;
058656
+    const Slapi_DN *suffix_sdn;
058656
+    char *replName;
058656
+    char *replGen;
058656
+} cleanruv_purge_data;
058656
+
058656
 /* In repl5_replica_config.c */
058656
 int replica_config_init(void);
058656
 void replica_config_destroy(void);
058656
 int get_replica_type(Replica *r);
058656
 int replica_execute_cleanruv_task_ext(Object *r, ReplicaId rid);
058656
-void add_cleaned_rid(ReplicaId rid, Replica *r, char *maxcsn, char *forcing);
058656
+void add_cleaned_rid(cleanruv_data *data, char *maxcsn);
058656
 int is_cleaned_rid(ReplicaId rid);
058656
 int replica_cleanall_ruv_abort(Slapi_PBlock *pb, Slapi_Entry *e, Slapi_Entry *eAfter, int *returncode, char *returntext, void *arg);
058656
 void replica_cleanallruv_thread_ext(void *arg);
058656
@@ -808,29 +833,7 @@ void set_cleaned_rid(ReplicaId rid);
058656
 void cleanruv_log(Slapi_Task *task, int rid, char *task_type, int sev_level, char *fmt, ...);
058656
 char *replica_cleanallruv_get_local_maxcsn(ReplicaId rid, char *base_dn);
058656
 
058656
-#define CLEANRIDSIZ 64 /* maximum number for concurrent CLEANALLRUV tasks */
058656
 
058656
-typedef struct _cleanruv_data
058656
-{
058656
-    Object *repl_obj;
058656
-    Replica *replica;
058656
-    ReplicaId rid;
058656
-    Slapi_Task *task;
058656
-    struct berval *payload;
058656
-    CSN *maxcsn;
058656
-    char *repl_root;
058656
-    Slapi_DN *sdn;
058656
-    char *certify;
058656
-    char *force;
058656
-} cleanruv_data;
058656
-
058656
-typedef struct _cleanruv_purge_data
058656
-{
058656
-    int cleaned_rid;
058656
-    const Slapi_DN *suffix_sdn;
058656
-    char *replName;
058656
-    char *replGen;
058656
-} cleanruv_purge_data;
058656
 
058656
 /* replutil.c */
058656
 LDAPControl *create_managedsait_control(void);
058656
diff --git a/ldap/servers/plugins/replication/repl5_replica.c b/ldap/servers/plugins/replication/repl5_replica.c
058656
index 77f4f18e4..e75807a62 100644
058656
--- a/ldap/servers/plugins/replication/repl5_replica.c
058656
+++ b/ldap/servers/plugins/replication/repl5_replica.c
058656
@@ -2120,6 +2120,7 @@ replica_check_for_tasks(Replica *r, Slapi_Entry *e)
058656
         char csnstr[CSN_STRSIZE];
058656
         char *token = NULL;
058656
         char *forcing;
058656
+        PRBool original_task;
058656
         char *csnpart;
058656
         char *ridstr;
058656
         char *iter = NULL;
058656
@@ -2151,8 +2152,15 @@ replica_check_for_tasks(Replica *r, Slapi_Entry *e)
058656
             csn_init_by_string(maxcsn, csnpart);
058656
             csn_as_string(maxcsn, PR_FALSE, csnstr);
058656
             forcing = ldap_utf8strtok_r(iter, ":", &iter);
058656
+            original_task = PR_TRUE;
058656
             if (forcing == NULL) {
058656
                 forcing = "no";
058656
+            } else if (!strcasecmp(forcing, "yes") || !strcasecmp(forcing, "no")) {
058656
+                /* forcing was correctly set, lets try to read the original task flag */
058656
+                token = ldap_utf8strtok_r(iter, ":", &iter);
058656
+                if (token && !atoi(token)) {
058656
+                    original_task = PR_FALSE;
058656
+                }
058656
             }
058656
 
058656
             slapi_log_err(SLAPI_LOG_NOTICE, repl_plugin_name, "CleanAllRUV Task - cleanAllRUV task found, "
058656
@@ -2190,6 +2198,13 @@ replica_check_for_tasks(Replica *r, Slapi_Entry *e)
058656
                 data->force = slapi_ch_strdup(forcing);
058656
                 data->repl_root = NULL;
058656
 
058656
+                /* This is a corner case, a cleanAllRuv task was interrupted by a shutdown or a crash
058656
+                 * We retrieved from type_replicaCleanRUV if the cleanAllRuv request
058656
+                 * was received from a direct task ADD or if it was received via
058656
+                 * the cleanAllRuv extop.
058656
+                 */
058656
+                data->original_task = original_task;
058656
+
058656
                 thread = PR_CreateThread(PR_USER_THREAD, replica_cleanallruv_thread_ext,
058656
                                          (void *)data, PR_PRIORITY_NORMAL, PR_GLOBAL_THREAD,
058656
                                          PR_UNJOINABLE_THREAD, SLAPD_DEFAULT_THREAD_STACKSIZE);
058656
@@ -2284,6 +2299,12 @@ replica_check_for_tasks(Replica *r, Slapi_Entry *e)
058656
                     data->sdn = slapi_sdn_dup(r->repl_root);
058656
                     data->certify = slapi_ch_strdup(certify);
058656
 
058656
+                    /* This is a corner case, a cleanAllRuv task was interrupted by a shutdown or a crash
058656
+                     * Let's assume this replica was the original receiver of the task.
058656
+                     * This flag has no impact on Abort cleanAllRuv
058656
+                     */
058656
+                    data->original_task = PR_TRUE;
058656
+
058656
                     thread = PR_CreateThread(PR_USER_THREAD, replica_abort_task_thread,
058656
                                              (void *)data, PR_PRIORITY_NORMAL, PR_GLOBAL_THREAD,
058656
                                              PR_UNJOINABLE_THREAD, SLAPD_DEFAULT_THREAD_STACKSIZE);
058656
diff --git a/ldap/servers/plugins/replication/repl5_replica_config.c b/ldap/servers/plugins/replication/repl5_replica_config.c
058656
index 005528a41..95b933bb8 100644
058656
--- a/ldap/servers/plugins/replication/repl5_replica_config.c
058656
+++ b/ldap/servers/plugins/replication/repl5_replica_config.c
058656
@@ -1573,6 +1573,11 @@ replica_execute_cleanall_ruv_task(Object *r, ReplicaId rid, Slapi_Task *task, co
058656
     data->repl_root = slapi_ch_strdup(basedn);
058656
     data->force = slapi_ch_strdup(force_cleaning);
058656
 
058656
+    /* It is either a consequence of a direct ADD cleanAllRuv task
058656
+     * or modify of the replica to add nsds5task: cleanAllRuv
058656
+     */
058656
+    data->original_task = PR_TRUE;
058656
+
058656
     thread = PR_CreateThread(PR_USER_THREAD, replica_cleanallruv_thread,
058656
                              (void *)data, PR_PRIORITY_NORMAL, PR_GLOBAL_THREAD,
058656
                              PR_UNJOINABLE_THREAD, SLAPD_DEFAULT_THREAD_STACKSIZE);
058656
@@ -1702,7 +1707,7 @@ replica_cleanallruv_thread(void *arg)
058656
     /*
058656
      *  Add the cleanallruv task to the repl config - so we can handle restarts
058656
      */
058656
-    add_cleaned_rid(data->rid, data->replica, csnstr, data->force); /* marks config that we started cleaning a rid */
058656
+    add_cleaned_rid(data, csnstr); /* marks config that we started cleaning a rid */
058656
     cleanruv_log(data->task, data->rid, CLEANALLRUV_ID, SLAPI_LOG_INFO, "Cleaning rid (%d)...", data->rid);
058656
     /*
058656
      *  First, wait for the maxcsn to be covered
058656
@@ -1878,7 +1883,13 @@ done:
058656
          */
058656
         delete_cleaned_rid_config(data);
058656
         check_replicas_are_done_cleaning(data);
058656
-        remove_keep_alive_entry(data->task, data->rid, data->repl_root);
058656
+        if (data->original_task) {
058656
+            cleanruv_log(data->task, data->rid, CLEANALLRUV_ID, SLAPI_LOG_INFO, "Original task deletes Keep alive entry (%d).", data->rid);
058656
+            remove_keep_alive_entry(data->task, data->rid, data->repl_root);
058656
+        } else {
058656
+            cleanruv_log(data->task, data->rid, CLEANALLRUV_ID, SLAPI_LOG_INFO, "Propagated task does not delete Keep alive entry (%d).", data->rid);
058656
+        }
058656
+
058656
         clean_agmts(data);
058656
         remove_cleaned_rid(data->rid);
058656
         cleanruv_log(data->task, data->rid, CLEANALLRUV_ID, SLAPI_LOG_INFO, "Successfully cleaned rid(%d).", data->rid);
058656
@@ -2029,7 +2040,7 @@ check_replicas_are_done_cleaning(cleanruv_data *data)
058656
                  "Waiting for all the replicas to finish cleaning...");
058656
 
058656
     csn_as_string(data->maxcsn, PR_FALSE, csnstr);
058656
-    filter = PR_smprintf("(%s=%d:%s:%s)", type_replicaCleanRUV, (int)data->rid, csnstr, data->force);
058656
+    filter = PR_smprintf("(%s=%d:%s:%s:%d)", type_replicaCleanRUV, (int)data->rid, csnstr, data->force, data->original_task ? 1 : 0);
058656
     while (not_all_cleaned && !is_task_aborted(data->rid) && !slapi_is_shutting_down()) {
058656
         agmt_obj = agmtlist_get_first_agreement_for_replica(data->replica);
058656
         if (agmt_obj == NULL) {
058656
@@ -2502,7 +2513,7 @@ set_cleaned_rid(ReplicaId rid)
058656
  *  Add the rid and maxcsn to the repl config (so we can resume after a server restart)
058656
  */
058656
 void
058656
-add_cleaned_rid(ReplicaId rid, Replica *r, char *maxcsn, char *forcing)
058656
+add_cleaned_rid(cleanruv_data *cleanruv_data, char *maxcsn)
058656
 {
058656
     Slapi_PBlock *pb;
058656
     struct berval *vals[2];
058656
@@ -2512,6 +2523,16 @@ add_cleaned_rid(ReplicaId rid, Replica *r, char *maxcsn, char *forcing)
058656
     char data[CSN_STRSIZE + 10];
058656
     char *dn;
058656
     int rc;
058656
+    ReplicaId rid;
058656
+    Replica *r;
058656
+    char *forcing;
058656
+
058656
+    if (data == NULL) {
058656
+        return;
058656
+    }
058656
+    rid = cleanruv_data->rid;
058656
+    r = cleanruv_data->replica;
058656
+    forcing = cleanruv_data->force;
058656
 
058656
     if (r == NULL || maxcsn == NULL) {
058656
         return;
058656
@@ -2519,7 +2540,7 @@ add_cleaned_rid(ReplicaId rid, Replica *r, char *maxcsn, char *forcing)
058656
     /*
058656
      *  Write the rid & maxcsn to the config entry
058656
      */
058656
-    val.bv_len = PR_snprintf(data, sizeof(data), "%d:%s:%s", rid, maxcsn, forcing);
058656
+    val.bv_len = PR_snprintf(data, sizeof(data), "%d:%s:%s:%d", rid, maxcsn, forcing, cleanruv_data->original_task ? 1 : 0);
058656
     dn = replica_get_dn(r);
058656
     pb = slapi_pblock_new();
058656
     mod.mod_op = LDAP_MOD_ADD | LDAP_MOD_BVALUES;
058656
@@ -2961,6 +2982,7 @@ replica_cleanall_ruv_abort(Slapi_PBlock *pb __attribute__((unused)),
058656
     data->repl_root = slapi_ch_strdup(base_dn);
058656
     data->sdn = NULL;
058656
     data->certify = slapi_ch_strdup(certify_all);
058656
+    data->original_task = PR_TRUE;
058656
 
058656
     thread = PR_CreateThread(PR_USER_THREAD, replica_abort_task_thread,
058656
                              (void *)data, PR_PRIORITY_NORMAL, PR_GLOBAL_THREAD,
058656
diff --git a/ldap/servers/plugins/replication/repl_extop.c b/ldap/servers/plugins/replication/repl_extop.c
058656
index c49c6bd8d..68e2544b4 100644
058656
--- a/ldap/servers/plugins/replication/repl_extop.c
058656
+++ b/ldap/servers/plugins/replication/repl_extop.c
058656
@@ -1412,6 +1412,7 @@ multimaster_extop_abort_cleanruv(Slapi_PBlock *pb)
058656
     data->rid = rid;
058656
     data->repl_root = slapi_ch_strdup(repl_root);
058656
     data->certify = slapi_ch_strdup(certify_all);
058656
+    data->original_task = PR_FALSE;
058656
     /*
058656
      *  Set the aborted rid and stop the cleaning
058656
      */
058656
@@ -1555,6 +1556,7 @@ multimaster_extop_cleanruv(Slapi_PBlock *pb)
058656
         data->payload = slapi_ch_bvdup(extop_payload);
058656
         data->force = slapi_ch_strdup(force);
058656
         data->repl_root = slapi_ch_strdup(repl_root);
058656
+        data->original_task = PR_FALSE;
058656
 
058656
         thread = PR_CreateThread(PR_USER_THREAD, replica_cleanallruv_thread_ext,
058656
                                  (void *)data, PR_PRIORITY_NORMAL, PR_GLOBAL_THREAD,
058656
-- 
058656
2.13.6
058656