andykimpe / rpms / 389-ds-base

Forked from rpms/389-ds-base 5 months ago
Clone

Blame SOURCES/0017-Issue-4492-Changelog-cache-can-upload-updates-from-a.patch

ed9856
From f22fe3da910889ab2530d84b647b5b36b6e7e95f Mon Sep 17 00:00:00 2001
ed9856
From: tbordaz <tbordaz@redhat.com>
ed9856
Date: Mon, 14 Dec 2020 10:02:24 +0100
ed9856
Subject: [PATCH] Issue 4492 - Changelog cache can upload updates from a wrong
ed9856
 starting point (CSN) (#4493)
ed9856
ed9856
Bug description:
ed9856
          When a replication session starts, a starting point is computed
ed9856
          according to supplier/consumer RUVs.
ed9856
	  From the starting point, the updates are bulk loaded from the CL.
ed9856
          When a bulk set has been fully evaluated, the server needs to bulk load another set.
ed9856
	  It iterates until there are no more updates to send.
ed9856
          The bug is that during bulk load, it recomputes the CL cursor position
ed9856
          and this computation can be wrong. For example, if a new update arrives on
ed9856
          a rarely updated replica (or an unknown replica), the new position will
ed9856
          be set before the initial starting point.
ed9856
ed9856
Fix description:
ed9856
          Fixing the invalid computation is a bit risky (complex code resulting from
ed9856
          years of corner-case handling) and a fix could fail to address other flavors
ed9856
          of the same symptom.
ed9856
          The fix is only (sorry for that) a safety-checking fix that ends a replication session
ed9856
          if the computed cursor position goes before the initial starting point.
ed9856
	  In case of a large jump (more than 24h) behind the starting point, a warning is logged.
ed9856
ed9856
relates: https://github.com/389ds/389-ds-base/issues/4492
ed9856
ed9856
Reviewed by: Mark Reynolds, William Brown
ed9856
ed9856
Platforms tested: F31
ed9856
---
ed9856
 ldap/servers/plugins/replication/cl5_api.c    |  6 +-
ed9856
 .../servers/plugins/replication/cl5_clcache.c | 60 ++++++++++++++++++-
ed9856
 .../servers/plugins/replication/cl5_clcache.h |  4 +-
ed9856
 3 files changed, 63 insertions(+), 7 deletions(-)
ed9856
ed9856
diff --git a/ldap/servers/plugins/replication/cl5_api.c b/ldap/servers/plugins/replication/cl5_api.c
ed9856
index 65801bc01..1d6e20b07 100644
ed9856
--- a/ldap/servers/plugins/replication/cl5_api.c
ed9856
+++ b/ldap/servers/plugins/replication/cl5_api.c
ed9856
@@ -143,6 +143,7 @@ struct cl5replayiterator
ed9856
     ReplicaId consumerRID;  /* consumer's RID */
ed9856
     const RUV *consumerRuv; /* consumer's update vector                    */
ed9856
     Object *supplierRuvObj; /* supplier's update vector object          */
ed9856
+    char starting_csn[CSN_STRSIZE];
ed9856
 };
ed9856
 
ed9856
 typedef struct cl5iterator
ed9856
@@ -1542,7 +1543,7 @@ cl5GetNextOperationToReplay(CL5ReplayIterator *iterator, CL5Entry *entry)
ed9856
         return CL5_BAD_DATA;
ed9856
     }
ed9856
 
ed9856
-    rc = clcache_get_next_change(iterator->clcache, (void **)&key, &keylen, (void **)&data, &datalen, &csn);
ed9856
+    rc = clcache_get_next_change(iterator->clcache, (void **)&key, &keylen, (void **)&data, &datalen, &csn, iterator->starting_csn);
ed9856
 
ed9856
     if (rc == DB_NOTFOUND) {
ed9856
         /*
ed9856
@@ -5256,7 +5257,7 @@ _cl5PositionCursorForReplay(ReplicaId consumerRID, const RUV *consumerRuv, Objec
ed9856
     if (rc != 0)
ed9856
         goto done;
ed9856
 
ed9856
-    rc = clcache_load_buffer(clcache, &startCSN, continue_on_missing);
ed9856
+    rc = clcache_load_buffer(clcache, &startCSN, continue_on_missing, NULL /* retrieving startCSN, no limit enforced on this call */);
ed9856
 
ed9856
     if (rc == 0) {
ed9856
         haveChanges = PR_TRUE;
ed9856
@@ -5320,6 +5321,7 @@ _cl5PositionCursorForReplay(ReplicaId consumerRID, const RUV *consumerRuv, Objec
ed9856
         (*iterator)->consumerRID = consumerRID;
ed9856
         (*iterator)->consumerRuv = consumerRuv;
ed9856
         (*iterator)->supplierRuvObj = supplierRuvObj;
ed9856
+        csn_as_string(startCSN, PR_FALSE, (*iterator)->starting_csn);
ed9856
     } else if (rc == CL5_SUCCESS) {
ed9856
         /* we have no changes to send */
ed9856
         rc = CL5_NOTFOUND;
ed9856
diff --git a/ldap/servers/plugins/replication/cl5_clcache.c b/ldap/servers/plugins/replication/cl5_clcache.c
ed9856
index a8477a83a..43b7c77d8 100644
ed9856
--- a/ldap/servers/plugins/replication/cl5_clcache.c
ed9856
+++ b/ldap/servers/plugins/replication/cl5_clcache.c
ed9856
@@ -15,6 +15,8 @@
ed9856
 #include "db.h"    /* Berkeley DB */
ed9856
 #include "cl5.h"   /* changelog5Config */
ed9856
 #include "cl5_clcache.h"
ed9856
+#include "slap.h"
ed9856
+#include "proto-slap.h"
ed9856
 
ed9856
 /* newer bdb uses DB_BUFFER_SMALL instead of ENOMEM as the
ed9856
    error return if the given buffer in which to load a
ed9856
@@ -323,14 +325,21 @@ clcache_return_buffer(CLC_Buffer **buf)
ed9856
  * anchorcsn - passed in for the first load of a replication session;
ed9856
  * flag         - DB_SET to load in the key CSN record.
ed9856
  *                DB_NEXT to load in the records greater than key CSN.
ed9856
+ * initial_starting_csn
ed9856
+ *              This is the starting_csn computed at the beginning of
ed9856
+ *              the replication session. It never change during a session
ed9856
+ *              (aka iterator creation).
ed9856
+ *              This is used for safety checking that the next CSN use
ed9856
+ *              for bulk load is not before the initial csn
ed9856
  * return    - DB error code instead of cl5 one because of the
ed9856
  *               historic reason.
ed9856
  */
ed9856
 int
ed9856
-clcache_load_buffer(CLC_Buffer *buf, CSN **anchorCSN, int *continue_on_miss)
ed9856
+clcache_load_buffer(CLC_Buffer *buf, CSN **anchorCSN, int *continue_on_miss, char *initial_starting_csn)
ed9856
 {
ed9856
     int rc = 0;
ed9856
     int flag = DB_NEXT;
ed9856
+    CSN limit_csn = {0};
ed9856
 
ed9856
     if (anchorCSN)
ed9856
         *anchorCSN = NULL;
ed9856
@@ -343,6 +352,30 @@ clcache_load_buffer(CLC_Buffer *buf, CSN **anchorCSN, int *continue_on_miss)
ed9856
         rc = clcache_adjust_anchorcsn(buf, &flag);
ed9856
     }
ed9856
 
ed9856
+    /* safety checking, we do not want to (re)start replication before
ed9856
+     * the inital computed starting point
ed9856
+     */
ed9856
+    if (initial_starting_csn) {
ed9856
+        csn_init_by_string(&limit_csn, initial_starting_csn);
ed9856
+        if (csn_compare(&limit_csn, buf->buf_current_csn) > 0) {
ed9856
+            char curr[CSN_STRSIZE];
ed9856
+            int loglevel = SLAPI_LOG_REPL;
ed9856
+
ed9856
+            if (csn_time_difference(&limit_csn, buf->buf_current_csn) > (24 * 60 * 60)) {
ed9856
+                /* This is a big jump (more than a day) behind the
ed9856
+                 * initial starting csn. Log a warning before ending
ed9856
+                 * the session
ed9856
+                 */
ed9856
+                loglevel = SLAPI_LOG_WARNING;
ed9856
+            }
ed9856
+            csn_as_string(buf->buf_current_csn, 0, curr);
ed9856
+            slapi_log_err(loglevel, buf->buf_agmt_name,
ed9856
+                      "clcache_load_buffer - bulk load cursor (%s) is lower than starting csn %s. Ending session.\n", curr, initial_starting_csn);
ed9856
+            /* it just end the session with UPDATE_NO_MORE_UPDATES */
ed9856
+            rc = CLC_STATE_DONE;
ed9856
+        }
ed9856
+    }
ed9856
+
ed9856
     if (rc == 0) {
ed9856
 
ed9856
         buf->buf_state = CLC_STATE_READY;
ed9856
@@ -365,6 +398,27 @@ clcache_load_buffer(CLC_Buffer *buf, CSN **anchorCSN, int *continue_on_miss)
ed9856
             }
ed9856
             /* the use of alternative start csns can be limited, record its usage */
ed9856
             (*continue_on_miss)--;
ed9856
+
ed9856
+            if (initial_starting_csn) {
ed9856
+                if (csn_compare(&limit_csn, buf->buf_current_csn) > 0) {
ed9856
+                    char curr[CSN_STRSIZE];
ed9856
+                    int loglevel = SLAPI_LOG_REPL;
ed9856
+
ed9856
+                    if (csn_time_difference(&limit_csn, buf->buf_current_csn) > (24 * 60 * 60)) {
ed9856
+                        /* This is a big jump (more than a day) behind the
ed9856
+                         * initial starting csn. Log a warning before ending
ed9856
+                         * the session
ed9856
+                         */
ed9856
+                        loglevel = SLAPI_LOG_WARNING;
ed9856
+                    }
ed9856
+                    csn_as_string(buf->buf_current_csn, 0, curr);
ed9856
+                    slapi_log_err(loglevel, buf->buf_agmt_name,
ed9856
+                            "clcache_load_buffer - (DB_SET_RANGE) bulk load cursor (%s) is lower than starting csn %s. Ending session.\n", curr, initial_starting_csn);
ed9856
+                    rc = DB_NOTFOUND;
ed9856
+
ed9856
+                    return rc;
ed9856
+                }
ed9856
+            }
ed9856
         }
ed9856
         /* Reset some flag variables */
ed9856
         if (rc == 0) {
ed9856
@@ -492,7 +546,7 @@ retry:
ed9856
  * *data: output - data of the next change, or NULL if no more change
ed9856
  */
ed9856
 int
ed9856
-clcache_get_next_change(CLC_Buffer *buf, void **key, size_t *keylen, void **data, size_t *datalen, CSN **csn)
ed9856
+clcache_get_next_change(CLC_Buffer *buf, void **key, size_t *keylen, void **data, size_t *datalen, CSN **csn, char *initial_starting_csn)
ed9856
 {
ed9856
     int skip = 1;
ed9856
     int rc = 0;
ed9856
@@ -510,7 +564,7 @@ clcache_get_next_change(CLC_Buffer *buf, void **key, size_t *keylen, void **data
ed9856
          * We're done with the current buffer. Now load the next chunk.
ed9856
          */
ed9856
         if (NULL == *key && CLC_STATE_READY == buf->buf_state) {
ed9856
-            rc = clcache_load_buffer(buf, NULL, NULL);
ed9856
+            rc = clcache_load_buffer(buf, NULL, NULL, initial_starting_csn);
ed9856
             if (0 == rc && buf->buf_record_ptr) {
ed9856
                 DB_MULTIPLE_KEY_NEXT(buf->buf_record_ptr, &buf->buf_data,
ed9856
                                      *key, *keylen, *data, *datalen);
ed9856
diff --git a/ldap/servers/plugins/replication/cl5_clcache.h b/ldap/servers/plugins/replication/cl5_clcache.h
ed9856
index 73eb41590..16d53d563 100644
ed9856
--- a/ldap/servers/plugins/replication/cl5_clcache.h
ed9856
+++ b/ldap/servers/plugins/replication/cl5_clcache.h
ed9856
@@ -23,9 +23,9 @@ typedef struct clc_buffer CLC_Buffer;
ed9856
 int clcache_init(DB_ENV **dbenv);
ed9856
 void clcache_set_config(void);
ed9856
 int clcache_get_buffer(CLC_Buffer **buf, DB *db, ReplicaId consumer_rid, const RUV *consumer_ruv, const RUV *local_ruv);
ed9856
-int clcache_load_buffer(CLC_Buffer *buf, CSN **anchorCSN, int *continue_on_miss);
ed9856
+int clcache_load_buffer(CLC_Buffer *buf, CSN **anchorCSN, int *continue_on_miss, char *initial_starting_csn);
ed9856
 void clcache_return_buffer(CLC_Buffer **buf);
ed9856
-int clcache_get_next_change(CLC_Buffer *buf, void **key, size_t *keylen, void **data, size_t *datalen, CSN **csn);
ed9856
+int clcache_get_next_change(CLC_Buffer *buf, void **key, size_t *keylen, void **data, size_t *datalen, CSN **csn, char *initial_starting_csn);
ed9856
 void clcache_destroy(void);
ed9856
 
ed9856
 #endif
ed9856
-- 
ed9856
2.26.2
ed9856