|
|
ed9856 |
From f22fe3da910889ab2530d84b647b5b36b6e7e95f Mon Sep 17 00:00:00 2001
|
|
|
ed9856 |
From: tbordaz <tbordaz@redhat.com>
|
|
|
ed9856 |
Date: Mon, 14 Dec 2020 10:02:24 +0100
|
|
|
ed9856 |
Subject: [PATCH] Issue 4492 - Changelog cache can upload updates from a wrong
|
|
|
ed9856 |
starting point (CSN) (#4493)
|
|
|
ed9856 |
|
|
|
ed9856 |
Bug description:
|
|
|
ed9856 |
When a replication session starts, a starting point is computed
|
|
|
ed9856 |
according to supplier/consumer RUVs.
|
|
|
ed9856 |
From the starting point, the updates are bulk loaded from the CL.
|
|
|
ed9856 |
When a bulk set has been fully evaluated, the server needs to bulk load another set.
|
|
|
ed9856 |
It iterates until there are no more updates to send.
|
|
|
ed9856 |
The bug is that during bulk load, it recomputes the CL cursor position
|
|
|
ed9856 |
and this computation can be wrong. For example, if a new update occurs on
|
|
|
ed9856 |
a rarely updated replica (or an unknown replica), the new position will
|
|
|
ed9856 |
be set before the initial starting point.
|
|
|
ed9856 |
|
|
|
ed9856 |
Fix description:
|
|
|
ed9856 |
Fixing the invalid computation is a bit risky (complex code resulting from
|
|
|
ed9856 |
years of corner-case handling) and a fix could fail to address other flavors
|
|
|
ed9856 |
with the same symptom
|
|
|
ed9856 |
The fix is only (sorry for that) a safety check that ends a replication session
|
|
|
ed9856 |
if the computed cursor position goes before the initial starting point.
|
|
|
ed9856 |
In case of a large jump (more than 24h) behind the starting point, a warning is logged.
|
|
|
ed9856 |
|
|
|
ed9856 |
relates: https://github.com/389ds/389-ds-base/issues/4492
|
|
|
ed9856 |
|
|
|
ed9856 |
Reviewed by: Mark Reynolds, William Brown
|
|
|
ed9856 |
|
|
|
ed9856 |
Platforms tested: F31
|
|
|
ed9856 |
---
|
|
|
ed9856 |
ldap/servers/plugins/replication/cl5_api.c | 6 +-
|
|
|
ed9856 |
.../servers/plugins/replication/cl5_clcache.c | 60 ++++++++++++++++++-
|
|
|
ed9856 |
.../servers/plugins/replication/cl5_clcache.h | 4 +-
|
|
|
ed9856 |
3 files changed, 63 insertions(+), 7 deletions(-)
|
|
|
ed9856 |
|
|
|
ed9856 |
diff --git a/ldap/servers/plugins/replication/cl5_api.c b/ldap/servers/plugins/replication/cl5_api.c
|
|
|
ed9856 |
index 65801bc01..1d6e20b07 100644
|
|
|
ed9856 |
--- a/ldap/servers/plugins/replication/cl5_api.c
|
|
|
ed9856 |
+++ b/ldap/servers/plugins/replication/cl5_api.c
|
|
|
ed9856 |
@@ -143,6 +143,7 @@ struct cl5replayiterator
|
|
|
ed9856 |
ReplicaId consumerRID; /* consumer's RID */
|
|
|
ed9856 |
const RUV *consumerRuv; /* consumer's update vector */
|
|
|
ed9856 |
Object *supplierRuvObj; /* supplier's update vector object */
|
|
|
ed9856 |
+ char starting_csn[CSN_STRSIZE];
|
|
|
ed9856 |
};
|
|
|
ed9856 |
|
|
|
ed9856 |
typedef struct cl5iterator
|
|
|
ed9856 |
@@ -1542,7 +1543,7 @@ cl5GetNextOperationToReplay(CL5ReplayIterator *iterator, CL5Entry *entry)
|
|
|
ed9856 |
return CL5_BAD_DATA;
|
|
|
ed9856 |
}
|
|
|
ed9856 |
|
|
|
ed9856 |
- rc = clcache_get_next_change(iterator->clcache, (void **)&key, &keylen, (void **)&data, &datalen, &csn);
|
|
|
ed9856 |
+ rc = clcache_get_next_change(iterator->clcache, (void **)&key, &keylen, (void **)&data, &datalen, &csn, iterator->starting_csn);
|
|
|
ed9856 |
|
|
|
ed9856 |
if (rc == DB_NOTFOUND) {
|
|
|
ed9856 |
/*
|
|
|
ed9856 |
@@ -5256,7 +5257,7 @@ _cl5PositionCursorForReplay(ReplicaId consumerRID, const RUV *consumerRuv, Objec
|
|
|
ed9856 |
if (rc != 0)
|
|
|
ed9856 |
goto done;
|
|
|
ed9856 |
|
|
|
ed9856 |
- rc = clcache_load_buffer(clcache, &startCSN, continue_on_missing);
|
|
|
ed9856 |
+ rc = clcache_load_buffer(clcache, &startCSN, continue_on_missing, NULL /* retrieving startCSN, no limit enforced on this call */);
|
|
|
ed9856 |
|
|
|
ed9856 |
if (rc == 0) {
|
|
|
ed9856 |
haveChanges = PR_TRUE;
|
|
|
ed9856 |
@@ -5320,6 +5321,7 @@ _cl5PositionCursorForReplay(ReplicaId consumerRID, const RUV *consumerRuv, Objec
|
|
|
ed9856 |
(*iterator)->consumerRID = consumerRID;
|
|
|
ed9856 |
(*iterator)->consumerRuv = consumerRuv;
|
|
|
ed9856 |
(*iterator)->supplierRuvObj = supplierRuvObj;
|
|
|
ed9856 |
+ csn_as_string(startCSN, PR_FALSE, (*iterator)->starting_csn);
|
|
|
ed9856 |
} else if (rc == CL5_SUCCESS) {
|
|
|
ed9856 |
/* we have no changes to send */
|
|
|
ed9856 |
rc = CL5_NOTFOUND;
|
|
|
ed9856 |
diff --git a/ldap/servers/plugins/replication/cl5_clcache.c b/ldap/servers/plugins/replication/cl5_clcache.c
|
|
|
ed9856 |
index a8477a83a..43b7c77d8 100644
|
|
|
ed9856 |
--- a/ldap/servers/plugins/replication/cl5_clcache.c
|
|
|
ed9856 |
+++ b/ldap/servers/plugins/replication/cl5_clcache.c
|
|
|
ed9856 |
@@ -15,6 +15,8 @@
|
|
|
ed9856 |
#include "db.h" /* Berkeley DB */
|
|
|
ed9856 |
#include "cl5.h" /* changelog5Config */
|
|
|
ed9856 |
#include "cl5_clcache.h"
|
|
|
ed9856 |
+#include "slap.h"
|
|
|
ed9856 |
+#include "proto-slap.h"
|
|
|
ed9856 |
|
|
|
ed9856 |
/* newer bdb uses DB_BUFFER_SMALL instead of ENOMEM as the
|
|
|
ed9856 |
error return if the given buffer in which to load a
|
|
|
ed9856 |
@@ -323,14 +325,21 @@ clcache_return_buffer(CLC_Buffer **buf)
|
|
|
ed9856 |
* anchorcsn - passed in for the first load of a replication session;
|
|
|
ed9856 |
* flag - DB_SET to load in the key CSN record.
|
|
|
ed9856 |
* DB_NEXT to load in the records greater than key CSN.
|
|
|
ed9856 |
+ * initial_starting_csn
|
|
|
ed9856 |
+ * This is the starting_csn computed at the beginning of
|
|
|
ed9856 |
+ * the replication session. It never changes during a session
|
|
|
ed9856 |
+ * (aka iterator creation).
|
|
|
ed9856 |
+ * This is used for safety checking that the next CSN used
|
|
|
ed9856 |
+ * for bulk load is not before the initial csn
|
|
|
ed9856 |
* return - DB error code instead of cl5 one because of the
|
|
|
ed9856 |
* historic reason.
|
|
|
ed9856 |
*/
|
|
|
ed9856 |
int
|
|
|
ed9856 |
-clcache_load_buffer(CLC_Buffer *buf, CSN **anchorCSN, int *continue_on_miss)
|
|
|
ed9856 |
+clcache_load_buffer(CLC_Buffer *buf, CSN **anchorCSN, int *continue_on_miss, char *initial_starting_csn)
|
|
|
ed9856 |
{
|
|
|
ed9856 |
int rc = 0;
|
|
|
ed9856 |
int flag = DB_NEXT;
|
|
|
ed9856 |
+ CSN limit_csn = {0};
|
|
|
ed9856 |
|
|
|
ed9856 |
if (anchorCSN)
|
|
|
ed9856 |
*anchorCSN = NULL;
|
|
|
ed9856 |
@@ -343,6 +352,30 @@ clcache_load_buffer(CLC_Buffer *buf, CSN **anchorCSN, int *continue_on_miss)
|
|
|
ed9856 |
rc = clcache_adjust_anchorcsn(buf, &flag);
|
|
|
ed9856 |
}
|
|
|
ed9856 |
|
|
|
ed9856 |
+ /* safety checking, we do not want to (re)start replication before
|
|
|
ed9856 |
+ * the initial computed starting point
|
|
|
ed9856 |
+ */
|
|
|
ed9856 |
+ if (initial_starting_csn) {
|
|
|
ed9856 |
+ csn_init_by_string(&limit_csn, initial_starting_csn);
|
|
|
ed9856 |
+ if (csn_compare(&limit_csn, buf->buf_current_csn) > 0) {
|
|
|
ed9856 |
+ char curr[CSN_STRSIZE];
|
|
|
ed9856 |
+ int loglevel = SLAPI_LOG_REPL;
|
|
|
ed9856 |
+
|
|
|
ed9856 |
+ if (csn_time_difference(&limit_csn, buf->buf_current_csn) > (24 * 60 * 60)) {
|
|
|
ed9856 |
+ /* This is a big jump (more than a day) behind the
|
|
|
ed9856 |
+ * initial starting csn. Log a warning before ending
|
|
|
ed9856 |
+ * the session
|
|
|
ed9856 |
+ */
|
|
|
ed9856 |
+ loglevel = SLAPI_LOG_WARNING;
|
|
|
ed9856 |
+ }
|
|
|
ed9856 |
+ csn_as_string(buf->buf_current_csn, 0, curr);
|
|
|
ed9856 |
+ slapi_log_err(loglevel, buf->buf_agmt_name,
|
|
|
ed9856 |
+ "clcache_load_buffer - bulk load cursor (%s) is lower than starting csn %s. Ending session.\n", curr, initial_starting_csn);
|
|
|
ed9856 |
+ /* it just ends the session with UPDATE_NO_MORE_UPDATES */
|
|
|
ed9856 |
+ rc = CLC_STATE_DONE;
|
|
|
ed9856 |
+ }
|
|
|
ed9856 |
+ }
|
|
|
ed9856 |
+
|
|
|
ed9856 |
if (rc == 0) {
|
|
|
ed9856 |
|
|
|
ed9856 |
buf->buf_state = CLC_STATE_READY;
|
|
|
ed9856 |
@@ -365,6 +398,27 @@ clcache_load_buffer(CLC_Buffer *buf, CSN **anchorCSN, int *continue_on_miss)
|
|
|
ed9856 |
}
|
|
|
ed9856 |
/* the use of alternative start csns can be limited, record its usage */
|
|
|
ed9856 |
(*continue_on_miss)--;
|
|
|
ed9856 |
+
|
|
|
ed9856 |
+ if (initial_starting_csn) {
|
|
|
ed9856 |
+ if (csn_compare(&limit_csn, buf->buf_current_csn) > 0) {
|
|
|
ed9856 |
+ char curr[CSN_STRSIZE];
|
|
|
ed9856 |
+ int loglevel = SLAPI_LOG_REPL;
|
|
|
ed9856 |
+
|
|
|
ed9856 |
+ if (csn_time_difference(&limit_csn, buf->buf_current_csn) > (24 * 60 * 60)) {
|
|
|
ed9856 |
+ /* This is a big jump (more than a day) behind the
|
|
|
ed9856 |
+ * initial starting csn. Log a warning before ending
|
|
|
ed9856 |
+ * the session
|
|
|
ed9856 |
+ */
|
|
|
ed9856 |
+ loglevel = SLAPI_LOG_WARNING;
|
|
|
ed9856 |
+ }
|
|
|
ed9856 |
+ csn_as_string(buf->buf_current_csn, 0, curr);
|
|
|
ed9856 |
+ slapi_log_err(loglevel, buf->buf_agmt_name,
|
|
|
ed9856 |
+ "clcache_load_buffer - (DB_SET_RANGE) bulk load cursor (%s) is lower than starting csn %s. Ending session.\n", curr, initial_starting_csn);
|
|
|
ed9856 |
+ rc = DB_NOTFOUND;
|
|
|
ed9856 |
+
|
|
|
ed9856 |
+ return rc;
|
|
|
ed9856 |
+ }
|
|
|
ed9856 |
+ }
|
|
|
ed9856 |
}
|
|
|
ed9856 |
/* Reset some flag variables */
|
|
|
ed9856 |
if (rc == 0) {
|
|
|
ed9856 |
@@ -492,7 +546,7 @@ retry:
|
|
|
ed9856 |
* *data: output - data of the next change, or NULL if no more change
|
|
|
ed9856 |
*/
|
|
|
ed9856 |
int
|
|
|
ed9856 |
-clcache_get_next_change(CLC_Buffer *buf, void **key, size_t *keylen, void **data, size_t *datalen, CSN **csn)
|
|
|
ed9856 |
+clcache_get_next_change(CLC_Buffer *buf, void **key, size_t *keylen, void **data, size_t *datalen, CSN **csn, char *initial_starting_csn)
|
|
|
ed9856 |
{
|
|
|
ed9856 |
int skip = 1;
|
|
|
ed9856 |
int rc = 0;
|
|
|
ed9856 |
@@ -510,7 +564,7 @@ clcache_get_next_change(CLC_Buffer *buf, void **key, size_t *keylen, void **data
|
|
|
ed9856 |
* We're done with the current buffer. Now load the next chunk.
|
|
|
ed9856 |
*/
|
|
|
ed9856 |
if (NULL == *key && CLC_STATE_READY == buf->buf_state) {
|
|
|
ed9856 |
- rc = clcache_load_buffer(buf, NULL, NULL);
|
|
|
ed9856 |
+ rc = clcache_load_buffer(buf, NULL, NULL, initial_starting_csn);
|
|
|
ed9856 |
if (0 == rc && buf->buf_record_ptr) {
|
|
|
ed9856 |
DB_MULTIPLE_KEY_NEXT(buf->buf_record_ptr, &buf->buf_data,
|
|
|
ed9856 |
*key, *keylen, *data, *datalen);
|
|
|
ed9856 |
diff --git a/ldap/servers/plugins/replication/cl5_clcache.h b/ldap/servers/plugins/replication/cl5_clcache.h
|
|
|
ed9856 |
index 73eb41590..16d53d563 100644
|
|
|
ed9856 |
--- a/ldap/servers/plugins/replication/cl5_clcache.h
|
|
|
ed9856 |
+++ b/ldap/servers/plugins/replication/cl5_clcache.h
|
|
|
ed9856 |
@@ -23,9 +23,9 @@ typedef struct clc_buffer CLC_Buffer;
|
|
|
ed9856 |
int clcache_init(DB_ENV **dbenv);
|
|
|
ed9856 |
void clcache_set_config(void);
|
|
|
ed9856 |
int clcache_get_buffer(CLC_Buffer **buf, DB *db, ReplicaId consumer_rid, const RUV *consumer_ruv, const RUV *local_ruv);
|
|
|
ed9856 |
-int clcache_load_buffer(CLC_Buffer *buf, CSN **anchorCSN, int *continue_on_miss);
|
|
|
ed9856 |
+int clcache_load_buffer(CLC_Buffer *buf, CSN **anchorCSN, int *continue_on_miss, char *initial_starting_csn);
|
|
|
ed9856 |
void clcache_return_buffer(CLC_Buffer **buf);
|
|
|
ed9856 |
-int clcache_get_next_change(CLC_Buffer *buf, void **key, size_t *keylen, void **data, size_t *datalen, CSN **csn);
|
|
|
ed9856 |
+int clcache_get_next_change(CLC_Buffer *buf, void **key, size_t *keylen, void **data, size_t *datalen, CSN **csn, char *initial_starting_csn);
|
|
|
ed9856 |
void clcache_destroy(void);
|
|
|
ed9856 |
|
|
|
ed9856 |
#endif
|
|
|
ed9856 |
--
|
|
|
ed9856 |
2.26.2
|
|
|
ed9856 |
|