andykimpe / rpms / 389-ds-base

Forked from rpms/389-ds-base 7 months ago
Clone

Blame SOURCES/0042-Issue-4492-Changelog-cache-can-upload-updates-from-a.patch

36233f
From 16ec195b12688bcbe0d113396eee782175102565 Mon Sep 17 00:00:00 2001
36233f
From: Thierry Bordaz <tbordaz@redhat.com>
36233f
Date: Mon, 14 Dec 2020 10:41:58 +0100
36233f
Subject: [PATCH] Issue 4492 - Changelog cache can upload updates from a wrong
36233f
 starting point (CSN)
36233f
36233f
Bug description:
36233f
          When a replication session starts, a starting point is computed
36233f
          according to supplier/consumer RUVs.
36233f
	  from the starting point the updates are bulk loaded from the CL.
36233f
          When a bulk set has been fully evaluated, the server needs to bulk load another set.
36233f
	  It iterates until there is no more updates to send.
36233f
          The bug is that during bulk load, it recomputes the CL cursor position
36233f
          and this computation can be wrong. For example, if a new update occurs on
36233f
          a rarely updated replica (or an unknown replica), the new position will
36233f
          be set before the initial starting point.
36233f
36233f
Fix description:
36233f
          Fixing the invalid computation is a bit risky (complex code resulting from
36233f
          years of corner-case handling) and a fix could fail to address other flavors
36233f
          with the same symptom
36233f
          The fix is only (sorry for that) a safety-checking fix that would end a replication session
36233f
          if the computed cursor position goes before the initial starting point.
36233f
	  In case of a large jump (more than 24h) behind the starting point, a warning is logged.
36233f
36233f
relates: https://github.com/389ds/389-ds-base/issues/4492
36233f
36233f
Reviewed by: Mark Reynolds, William Brown
36233f
36233f
Platforms tested: F31
36233f
---
36233f
 ldap/servers/plugins/replication/cl5_api.c    |  6 +-
36233f
 .../servers/plugins/replication/cl5_clcache.c | 60 ++++++++++++++++++-
36233f
 .../servers/plugins/replication/cl5_clcache.h |  4 +-
36233f
 3 files changed, 63 insertions(+), 7 deletions(-)
36233f
36233f
diff --git a/ldap/servers/plugins/replication/cl5_api.c b/ldap/servers/plugins/replication/cl5_api.c
36233f
index d7e47495a..403a6a666 100644
36233f
--- a/ldap/servers/plugins/replication/cl5_api.c
36233f
+++ b/ldap/servers/plugins/replication/cl5_api.c
36233f
@@ -143,6 +143,7 @@ struct cl5replayiterator
36233f
     ReplicaId consumerRID;  /* consumer's RID */
36233f
     const RUV *consumerRuv; /* consumer's update vector                    */
36233f
     Object *supplierRuvObj; /* supplier's update vector object          */
36233f
+    char starting_csn[CSN_STRSIZE];
36233f
 };
36233f
 
36233f
 typedef struct cl5iterator
36233f
@@ -1367,7 +1368,7 @@ cl5GetNextOperationToReplay(CL5ReplayIterator *iterator, CL5Entry *entry)
36233f
         return CL5_BAD_DATA;
36233f
     }
36233f
 
36233f
-    rc = clcache_get_next_change(iterator->clcache, (void **)&key, &keylen, (void **)&data, &datalen, &csn);
36233f
+    rc = clcache_get_next_change(iterator->clcache, (void **)&key, &keylen, (void **)&data, &datalen, &csn, iterator->starting_csn);
36233f
 
36233f
     if (rc == DB_NOTFOUND) {
36233f
         /*
36233f
@@ -4999,7 +5000,7 @@ _cl5PositionCursorForReplay(ReplicaId consumerRID, const RUV *consumerRuv, Repli
36233f
     if (rc != 0)
36233f
         goto done;
36233f
 
36233f
-    rc = clcache_load_buffer(clcache, &startCSN, continue_on_missing);
36233f
+    rc = clcache_load_buffer(clcache, &startCSN, continue_on_missing, NULL);
36233f
 
36233f
     if (rc == 0) {
36233f
         haveChanges = PR_TRUE;
36233f
@@ -5063,6 +5064,7 @@ _cl5PositionCursorForReplay(ReplicaId consumerRID, const RUV *consumerRuv, Repli
36233f
         (*iterator)->consumerRID = consumerRID;
36233f
         (*iterator)->consumerRuv = consumerRuv;
36233f
         (*iterator)->supplierRuvObj = supplierRuvObj;
36233f
+        csn_as_string(startCSN, PR_FALSE, (*iterator)->starting_csn);
36233f
     } else if (rc == CL5_SUCCESS) {
36233f
         /* we have no changes to send */
36233f
         rc = CL5_NOTFOUND;
36233f
diff --git a/ldap/servers/plugins/replication/cl5_clcache.c b/ldap/servers/plugins/replication/cl5_clcache.c
36233f
index 6b591fb8d..fcbca047a 100644
36233f
--- a/ldap/servers/plugins/replication/cl5_clcache.c
36233f
+++ b/ldap/servers/plugins/replication/cl5_clcache.c
36233f
@@ -15,6 +15,8 @@
36233f
 #include "db.h"    /* Berkeley DB */
36233f
 #include "cl5.h"   /* changelog5Config */
36233f
 #include "cl5_clcache.h"
36233f
+#include "slap.h"
36233f
+#include "proto-slap.h"
36233f
 
36233f
 /* newer bdb uses DB_BUFFER_SMALL instead of ENOMEM as the
36233f
    error return if the given buffer in which to load a
36233f
@@ -323,14 +325,21 @@ clcache_return_buffer(CLC_Buffer **buf)
36233f
  * anchorcsn - passed in for the first load of a replication session;
36233f
  * flag         - DB_SET to load in the key CSN record.
36233f
  *                DB_NEXT to load in the records greater than key CSN.
36233f
+ * initial_starting_csn
36233f
+ *              This is the starting_csn computed at the beginning of
36233f
+ *              the replication session. It never changes during a session
36233f
+ *              (aka iterator creation).
36233f
+ *              This is used for safety checking that the next CSN used
36233f
+ *              for bulk load is not before the initial csn
36233f
  * return    - DB error code instead of cl5 one because of the
36233f
  *               historic reason.
36233f
  */
36233f
 int
36233f
-clcache_load_buffer(CLC_Buffer *buf, CSN **anchorCSN, int *continue_on_miss)
36233f
+clcache_load_buffer(CLC_Buffer *buf, CSN **anchorCSN, int *continue_on_miss, char *initial_starting_csn)
36233f
 {
36233f
     int rc = 0;
36233f
     int flag = DB_NEXT;
36233f
+    CSN limit_csn = {0};
36233f
 
36233f
     if (anchorCSN)
36233f
         *anchorCSN = NULL;
36233f
@@ -343,6 +352,30 @@ clcache_load_buffer(CLC_Buffer *buf, CSN **anchorCSN, int *continue_on_miss)
36233f
         rc = clcache_adjust_anchorcsn(buf, &flag);
36233f
     }
36233f
 
36233f
+    /* safety checking, we do not want to (re)start replication before
36233f
+     * the initial computed starting point
36233f
+     */
36233f
+    if (initial_starting_csn) {
36233f
+        csn_init_by_string(&limit_csn, initial_starting_csn);
36233f
+        if (csn_compare(&limit_csn, buf->buf_current_csn) > 0) {
36233f
+            char curr[CSN_STRSIZE];
36233f
+            int loglevel = SLAPI_LOG_REPL;
36233f
+
36233f
+            if (csn_time_difference(&limit_csn, buf->buf_current_csn) > (24 * 60 * 60)) {
36233f
+                /* This is a big jump (more than a day) behind the
36233f
+                 * initial starting csn. Log a warning before ending
36233f
+                 * the session
36233f
+                 */
36233f
+                loglevel = SLAPI_LOG_WARNING;
36233f
+            }
36233f
+            csn_as_string(buf->buf_current_csn, 0, curr);
36233f
+            slapi_log_err(loglevel, buf->buf_agmt_name,
36233f
+                      "clcache_load_buffer - bulk load cursor (%s) is lower than starting csn %s. Ending session.\n", curr, initial_starting_csn);
36233f
+            /* it just ends the session with UPDATE_NO_MORE_UPDATES */
36233f
+            rc = CLC_STATE_DONE;
36233f
+        }
36233f
+    }
36233f
+
36233f
     if (rc == 0) {
36233f
 
36233f
         buf->buf_state = CLC_STATE_READY;
36233f
@@ -365,6 +398,27 @@ clcache_load_buffer(CLC_Buffer *buf, CSN **anchorCSN, int *continue_on_miss)
36233f
             }
36233f
             /* the use of alternative start csns can be limited, record its usage */
36233f
             (*continue_on_miss)--;
36233f
+
36233f
+            if (initial_starting_csn) {
36233f
+                if (csn_compare(&limit_csn, buf->buf_current_csn) > 0) {
36233f
+                    char curr[CSN_STRSIZE];
36233f
+                    int loglevel = SLAPI_LOG_REPL;
36233f
+
36233f
+                    if (csn_time_difference(&limit_csn, buf->buf_current_csn) > (24 * 60 * 60)) {
36233f
+                        /* This is a big jump (more than a day) behind the
36233f
+                         * initial starting csn. Log a warning before ending
36233f
+                         * the session
36233f
+                         */
36233f
+                        loglevel = SLAPI_LOG_WARNING;
36233f
+                    }
36233f
+                    csn_as_string(buf->buf_current_csn, 0, curr);
36233f
+                    slapi_log_err(loglevel, buf->buf_agmt_name,
36233f
+                            "clcache_load_buffer - (DB_SET_RANGE) bulk load cursor (%s) is lower than starting csn %s. Ending session.\n", curr, initial_starting_csn);
36233f
+                    rc = DB_NOTFOUND;
36233f
+
36233f
+                    return rc;
36233f
+                }
36233f
+            }
36233f
         }
36233f
         /* Reset some flag variables */
36233f
         if (rc == 0) {
36233f
@@ -492,7 +546,7 @@ retry:
36233f
  * *data: output - data of the next change, or NULL if no more change
36233f
  */
36233f
 int
36233f
-clcache_get_next_change(CLC_Buffer *buf, void **key, size_t *keylen, void **data, size_t *datalen, CSN **csn)
36233f
+clcache_get_next_change(CLC_Buffer *buf, void **key, size_t *keylen, void **data, size_t *datalen, CSN **csn, char *initial_starting_csn)
36233f
 {
36233f
     int skip = 1;
36233f
     int rc = 0;
36233f
@@ -510,7 +564,7 @@ clcache_get_next_change(CLC_Buffer *buf, void **key, size_t *keylen, void **data
36233f
          * We're done with the current buffer. Now load the next chunk.
36233f
          */
36233f
         if (NULL == *key && CLC_STATE_READY == buf->buf_state) {
36233f
-            rc = clcache_load_buffer(buf, NULL, NULL);
36233f
+            rc = clcache_load_buffer(buf, NULL, NULL, initial_starting_csn);
36233f
             if (0 == rc && buf->buf_record_ptr) {
36233f
                 DB_MULTIPLE_KEY_NEXT(buf->buf_record_ptr, &buf->buf_data,
36233f
                                      *key, *keylen, *data, *datalen);
36233f
diff --git a/ldap/servers/plugins/replication/cl5_clcache.h b/ldap/servers/plugins/replication/cl5_clcache.h
36233f
index 73eb41590..16d53d563 100644
36233f
--- a/ldap/servers/plugins/replication/cl5_clcache.h
36233f
+++ b/ldap/servers/plugins/replication/cl5_clcache.h
36233f
@@ -23,9 +23,9 @@ typedef struct clc_buffer CLC_Buffer;
36233f
 int clcache_init(DB_ENV **dbenv);
36233f
 void clcache_set_config(void);
36233f
 int clcache_get_buffer(CLC_Buffer **buf, DB *db, ReplicaId consumer_rid, const RUV *consumer_ruv, const RUV *local_ruv);
36233f
-int clcache_load_buffer(CLC_Buffer *buf, CSN **anchorCSN, int *continue_on_miss);
36233f
+int clcache_load_buffer(CLC_Buffer *buf, CSN **anchorCSN, int *continue_on_miss, char *initial_starting_csn);
36233f
 void clcache_return_buffer(CLC_Buffer **buf);
36233f
-int clcache_get_next_change(CLC_Buffer *buf, void **key, size_t *keylen, void **data, size_t *datalen, CSN **csn);
36233f
+int clcache_get_next_change(CLC_Buffer *buf, void **key, size_t *keylen, void **data, size_t *datalen, CSN **csn, char *initial_starting_csn);
36233f
 void clcache_destroy(void);
36233f
 
36233f
 #endif
36233f
-- 
36233f
2.31.1
36233f