From e202c62c3b4c92163d2de9f3da9a9f3efc81e4b8 Mon Sep 17 00:00:00 2001
From: progier389 <72748589+progier389@users.noreply.github.com>
Date: Thu, 12 Nov 2020 18:50:04 +0100
Subject: [PATCH 3/3] do not add referrals for masters with different data
generation #2054 (#4427)
Bug description:
The problem is that some operations that are mandatory in the usual cases are
also performed when replication cannot take place because the
data sets are different (i.e. the RUV generation ids are different).
One of the issues is that the csn generator state is updated when
starting a replication session (it is a problem when trying to
reset the time skew, as freshly reinstalled replicas get infected
by the old ones).
A second issue is that the RUV gets updated when ending a replication session
(which may add replicas that do not share the same data set;
update operations on the consumer then return referrals towards the wrong masters).
Fix description:
The fix checks the RUV generation ids before updating the csn generator
and before updating the RUV.
Reviewed by: mreynolds
firstyear
vashirov
Platforms tested: F32
---
.../suites/replication/regression_test.py | 290 ++++++++++++++++++
ldap/servers/plugins/replication/repl5.h | 1 +
.../plugins/replication/repl5_inc_protocol.c | 20 +-
.../plugins/replication/repl5_replica.c | 39 ++-
src/lib389/lib389/dseldif.py | 37 +++
5 files changed, 368 insertions(+), 19 deletions(-)
diff --git a/dirsrvtests/tests/suites/replication/regression_test.py b/dirsrvtests/tests/suites/replication/regression_test.py
index 14b9d6a44..a72af6b30 100644
--- a/dirsrvtests/tests/suites/replication/regression_test.py
+++ b/dirsrvtests/tests/suites/replication/regression_test.py
@@ -13,6 +13,7 @@ from lib389.idm.user import TEST_USER_PROPERTIES, UserAccounts
from lib389.pwpolicy import PwPolicyManager
from lib389.utils import *
from lib389.topologies import topology_m2 as topo_m2, TopologyMain, topology_m3 as topo_m3, create_topology, _remove_ssca_db, topology_i2 as topo_i2
+from lib389.topologies import topology_m2c2 as topo_m2c2
from lib389._constants import *
from lib389.idm.organizationalunit import OrganizationalUnits
from lib389.idm.user import UserAccount
@@ -22,6 +23,7 @@ from lib389.idm.directorymanager import DirectoryManager
from lib389.replica import Replicas, ReplicationManager, Changelog5, BootstrapReplicationManager
from lib389.agreement import Agreements
from lib389 import pid_from_file
+from lib389.dseldif import *
pytestmark = pytest.mark.tier1
@@ -1027,6 +1029,294 @@ def test_online_init_should_create_keepalive_entries(topo_m2):
verify_keepalive_entries(topo_m2, True);
+def get_agreement(agmts, consumer):
+    """Return the agreement of agmts whose host and port match consumer, or None."""
+    for agmt in agmts.list():
+        if (agmt.get_attr_val_utf8('nsDS5ReplicaPort') == str(consumer.port) and
+                agmt.get_attr_val_utf8('nsDS5ReplicaHost') == consumer.host):
+            return agmt
+    return None
+
+
+def test_ruv_url_not_added_if_different_uuid(topo_m2c2):
+    """Check that RUV url is not updated if RUV generation uuid are different
+
+    :id: 7cc30a4e-0ffd-4758-8f00-e500279af344
+    :setup: Two masters + two consumers replication setup
+    :steps:
+        1. Generate ldif without replication data
+        2. Init both masters from that ldif
+           (to clear the ruvs and generate different generation uuids)
+        3. Perform on line init from master1 to consumer1
+           and from master2 to consumer2
+        4. Perform update on both masters
+        5. Check that c1 RUV does not contain URL towards m2
+        6. Check that c2 RUV contains URL towards m2
+        7. Perform on line init from master1 to master2
+        8. Perform update on master2
+        9. Check that c1 RUV contains URL towards m2
+    :expectedresults:
+        1. No error while generating ldif
+        2. No error while importing the ldif file
+        3. No error and Initialization done.
+        4. No error
+        5. master2 replicaid should not be in the consumer1 RUV
+        6. master2 replicaid should be in the consumer2 RUV
+        7. No error and Initialization done.
+        8. No error
+        9. master2 replicaid should be in the consumer1 RUV
+
+    """
+
+    # Variables initialization
+    repl = ReplicationManager(DEFAULT_SUFFIX)
+
+    m1 = topo_m2c2.ms["master1"]
+    m2 = topo_m2c2.ms["master2"]
+    c1 = topo_m2c2.cs["consumer1"]
+    c2 = topo_m2c2.cs["consumer2"]
+
+    replica_m1 = Replicas(m1).get(DEFAULT_SUFFIX)
+    replica_m2 = Replicas(m2).get(DEFAULT_SUFFIX)
+    replica_c1 = Replicas(c1).get(DEFAULT_SUFFIX)
+    replica_c2 = Replicas(c2).get(DEFAULT_SUFFIX)
+
+    replicid_m2 = replica_m2.get_rid()  # read once, reused by all RUV checks below
+
+    agmts_m1 = Agreements(m1, replica_m1.dn)
+    agmts_m2 = Agreements(m2, replica_m2.dn)
+
+    m1_m2 = get_agreement(agmts_m1, m2)
+    m1_c1 = get_agreement(agmts_m1, c1)
+    m1_c2 = get_agreement(agmts_m1, c2)
+    m2_m1 = get_agreement(agmts_m2, m1)
+    m2_c1 = get_agreement(agmts_m2, c1)
+    m2_c2 = get_agreement(agmts_m2, c2)
+
+    # Step 1: Generate ldif without replication data
+    m1.stop()
+    m2.stop()
+    ldif_file = '%s/norepl.ldif' % m1.get_ldif_dir()
+    m1.db2ldif(bename=DEFAULT_BENAME, suffixes=[DEFAULT_SUFFIX],
+               excludeSuffixes=None, repl_data=False,
+               outputfile=ldif_file, encrypt=False)
+    # The export is done with repl_data=False; no further clean-up of
+    # replication metadata is performed on the generated ldif.
+
+    # Step 2: Init both masters from that ldif
+    m1.ldif2db(DEFAULT_BENAME, None, None, None, ldif_file)
+    m2.ldif2db(DEFAULT_BENAME, None, None, None, ldif_file)
+    m1.start()
+    m2.start()
+
+    # Step 3: Perform on line init from master1 to consumer1
+    #         and from master2 to consumer2
+    m1_c1.begin_reinit()
+    m2_c2.begin_reinit()
+    (done, error) = m1_c1.wait_reinit()
+    assert done is True
+    assert error is False
+    (done, error) = m2_c2.wait_reinit()
+    assert done is True
+    assert error is False
+
+    # Step 4: Perform update on both masters
+    repl.test_replication(m1, c1)
+    repl.test_replication(m2, c2)
+
+    # Step 5: Check that c1 RUV does not contain URL towards m2
+    ruv = replica_c1.get_ruv()
+    log.debug(f"c1 RUV: {ruv}")
+    url = ruv._rid_url.get(replicid_m2)
+    if url is None:
+        log.debug(f"No URL for RID {replicid_m2} in RUV")
+    else:
+        log.debug(f"URL for RID {replicid_m2} in RUV is {url}")
+        log.error(f"URL for RID {replicid_m2} found in RUV")
+    # Note: this assertion fails if issue 2054 is not fixed.
+    assert url is None
+
+    # Step 6: Check that c2 RUV contains URL towards m2
+    ruv = replica_c2.get_ruv()
+    log.debug(f"c2 RUV: {ruv} {ruv._rids} ")
+    url = ruv._rid_url.get(replicid_m2)
+    if url is None:
+        log.error(f"No URL for RID {replicid_m2} in RUV")
+    else:
+        log.debug(f"URL for RID {replicid_m2} in RUV is {url}")
+    assert url is not None
+
+
+    # Step 7: Perform on line init from master1 to master2
+    m1_m2.begin_reinit()
+    (done, error) = m1_m2.wait_reinit()
+    assert done is True
+    assert error is False
+
+    # Step 8: Perform update on master2
+    repl.test_replication(m2, c1)
+
+    # Step 9: Check that c1 RUV contains URL towards m2
+    ruv = replica_c1.get_ruv()
+    log.debug(f"c1 RUV: {ruv} {ruv._rids} ")
+    url = ruv._rid_url.get(replicid_m2)
+    if url is None:
+        log.error(f"No URL for RID {replicid_m2} in RUV")
+    else:
+        log.debug(f"URL for RID {replicid_m2} in RUV is {url}")
+    assert url is not None
+
+
+def test_csngen_state_not_updated_if_different_uuid(topo_m2c2):
+    """Check that csngen remote offset is not updated if RUV generation uuid are different
+
+    :id: 77694b8e-22ae-11eb-89b2-482ae39447e5
+    :setup: Two masters + two consumers replication setup
+    :steps:
+        1. Disable m1<->m2 agreement to avoid propagating timeSkew
+        2. Generate ldif without replication data
+        3. Increase time skew on master2
+        4. Init both masters from that ldif
+           (to clear the ruvs and generate different generation uuids)
+        5. Perform on line init from master1 to consumer1 and master2 to consumer2
+        6. Perform update on both masters
+        7. Check that c1 has no time skew
+        8. Check that c2 has time skew
+        9. Init master2 from master1
+        10. Perform update on master2
+        11. Check that c1 has time skew
+    :expectedresults:
+        1. No error
+        2. No error while generating ldif
+        3. No error
+        4. No error while importing the ldif file
+        5. No error and Initialization done.
+        6. No error
+        7. c1 time skew should be lesser than threshold
+        8. c2 time skew should be higher than threshold
+        9. No error and Initialization done.
+        10. No error
+        11. c1 time skew should be higher than threshold
+
+    """
+
+    # Variables initialization
+    repl = ReplicationManager(DEFAULT_SUFFIX)
+
+    m1 = topo_m2c2.ms["master1"]
+    m2 = topo_m2c2.ms["master2"]
+    c1 = topo_m2c2.cs["consumer1"]
+    c2 = topo_m2c2.cs["consumer2"]
+
+    replica_m1 = Replicas(m1).get(DEFAULT_SUFFIX)
+    replica_m2 = Replicas(m2).get(DEFAULT_SUFFIX)
+    replica_c1 = Replicas(c1).get(DEFAULT_SUFFIX)
+    replica_c2 = Replicas(c2).get(DEFAULT_SUFFIX)
+
+    replicid_m2 = replica_m2.get_rid()
+
+    agmts_m1 = Agreements(m1, replica_m1.dn)
+    agmts_m2 = Agreements(m2, replica_m2.dn)
+
+    m1_m2 = get_agreement(agmts_m1, m2)
+    m1_c1 = get_agreement(agmts_m1, c1)
+    m1_c2 = get_agreement(agmts_m1, c2)
+    m2_m1 = get_agreement(agmts_m2, m1)
+    m2_c1 = get_agreement(agmts_m2, c1)
+    m2_c2 = get_agreement(agmts_m2, c2)
+
+    # Step 1: Disable m1<->m2 agreement to avoid propagating timeSkew
+    m1_m2.pause()
+    m2_m1.pause()
+
+    # Step 2: Generate ldif without replication data
+    m1.stop()
+    m2.stop()
+    ldif_file = '%s/norepl.ldif' % m1.get_ldif_dir()
+    m1.db2ldif(bename=DEFAULT_BENAME, suffixes=[DEFAULT_SUFFIX],
+               excludeSuffixes=None, repl_data=False,
+               outputfile=ldif_file, encrypt=False)
+    # The export is done with repl_data=False; no further clean-up of
+    # replication metadata is performed on the generated ldif.
+
+    # Step 3: Increase time skew on master2
+    timeSkew = 6 * 3600
+    # We can modify master2 time skew
+    # But the time skew on the consumer may be smaller
+    # depending on when the csngen generation time is updated
+    # and when the first csn gets replicated.
+    # Since we use timeSkew as the threshold value to detect
+    # whether there is a time skew or not,
+    # let's add a significant margin (longer than the test duration)
+    # to avoid any risk of erroneous failure
+    timeSkewMargin = 300
+    DSEldif(m2)._increaseTimeSkew(DEFAULT_SUFFIX, timeSkew+timeSkewMargin)
+
+    # Step 4: Init both masters from that ldif
+    m1.ldif2db(DEFAULT_BENAME, None, None, None, ldif_file)
+    m2.ldif2db(DEFAULT_BENAME, None, None, None, ldif_file)
+    m1.start()
+    m2.start()
+
+    # Step 5: Perform on line init from master1 to consumer1
+    #         and from master2 to consumer2
+    m1_c1.begin_reinit()
+    m2_c2.begin_reinit()
+    (done, error) = m1_c1.wait_reinit()
+    assert done is True
+    assert error is False
+    (done, error) = m2_c2.wait_reinit()
+    assert done is True
+    assert error is False
+
+    # Step 6: Perform update on both masters
+    repl.test_replication(m1, c1)
+    repl.test_replication(m2, c2)
+
+    # Step 7: Check that c1 has no time skew
+    # Stop server to ensure that dse.ldif is up to date
+    c1.stop()
+    c1_nsState = DSEldif(c1).readNsState(DEFAULT_SUFFIX)[0]
+    c1.start()  # restart before asserting so that a failure does not leave c1 down
+    c1_timeSkew = int(c1_nsState['time_skew'])
+    log.debug(f"c1 time skew: {c1_timeSkew}")
+    if c1_timeSkew >= timeSkew:
+        log.error(f"c1 csngen state has unexpectedly been synchronized with m2: time skew {c1_timeSkew}")
+    assert c1_timeSkew < timeSkew
+
+    # Step 8: Check that c2 has time skew
+    # Stop server to ensure that dse.ldif is up to date
+    c2.stop()
+    c2_nsState = DSEldif(c2).readNsState(DEFAULT_SUFFIX)[0]
+    c2.start()  # restart before asserting so that a failure does not leave c2 down
+    c2_timeSkew = int(c2_nsState['time_skew'])
+    log.debug(f"c2 time skew: {c2_timeSkew}")
+    if c2_timeSkew < timeSkew:
+        log.error(f"c2 csngen state has not been synchronized with m2: time skew {c2_timeSkew}")
+    assert c2_timeSkew >= timeSkew
+
+    # Step 9: Perform on line init from master1 to master2
+    m1_c1.pause()
+    m1_m2.resume()
+    m1_m2.begin_reinit()
+    (done, error) = m1_m2.wait_reinit()
+    assert done is True
+    assert error is False
+
+    # Step 10: Perform update on master2
+    repl.test_replication(m2, c1)
+
+    # Step 11: Check that c1 has time skew (c1 is stopped so that dse.ldif is up to date)
+    c1.stop()
+    c1_nsState = DSEldif(c1).readNsState(DEFAULT_SUFFIX)[0]
+    c1.start()  # do not leave c1 stopped once the test is over
+    c1_timeSkew = int(c1_nsState['time_skew'])
+    log.debug(f"c1 time skew: {c1_timeSkew}")
+    if c1_timeSkew < timeSkew:
+        log.error(f"c1 csngen state has not been synchronized with m2: time skew {c1_timeSkew}")
+    assert c1_timeSkew >= timeSkew
+
+
if __name__ == '__main__':
# Run isolated
# -s for DEBUG mode
diff --git a/ldap/servers/plugins/replication/repl5.h b/ldap/servers/plugins/replication/repl5.h
index b35f724c2..f1c596a3f 100644
--- a/ldap/servers/plugins/replication/repl5.h
+++ b/ldap/servers/plugins/replication/repl5.h
@@ -708,6 +708,7 @@ void replica_dump(Replica *r);
void replica_set_enabled(Replica *r, PRBool enable);
Replica *replica_get_replica_from_dn(const Slapi_DN *dn);
Replica *replica_get_replica_from_root(const char *repl_root);
+int replica_check_generation(Replica *r, const RUV *remote_ruv);
int replica_update_ruv(Replica *replica, const CSN *csn, const char *replica_purl);
Replica *replica_get_replica_for_op(Slapi_PBlock *pb);
/* the functions below manipulate replica hash */
diff --git a/ldap/servers/plugins/replication/repl5_inc_protocol.c b/ldap/servers/plugins/replication/repl5_inc_protocol.c
index 29b1fb073..af5e5897c 100644
--- a/ldap/servers/plugins/replication/repl5_inc_protocol.c
+++ b/ldap/servers/plugins/replication/repl5_inc_protocol.c
@@ -2161,26 +2161,12 @@ examine_update_vector(Private_Repl_Protocol *prp, RUV *remote_ruv)
} else if (NULL == remote_ruv) {
return_value = EXAMINE_RUV_PRISTINE_REPLICA;
} else {
- char *local_gen = NULL;
- char *remote_gen = ruv_get_replica_generation(remote_ruv);
- Object *local_ruv_obj;
- RUV *local_ruv;
-
PR_ASSERT(NULL != prp->replica);
- local_ruv_obj = replica_get_ruv(prp->replica);
- if (NULL != local_ruv_obj) {
- local_ruv = (RUV *)object_get_data(local_ruv_obj);
- PR_ASSERT(local_ruv);
- local_gen = ruv_get_replica_generation(local_ruv);
- object_release(local_ruv_obj);
- }
- if (NULL == remote_gen || NULL == local_gen || strcmp(remote_gen, local_gen) != 0) {
- return_value = EXAMINE_RUV_GENERATION_MISMATCH;
- } else {
+ if (replica_check_generation(prp->replica, remote_ruv)) {
return_value = EXAMINE_RUV_OK;
+ } else {
+ return_value = EXAMINE_RUV_GENERATION_MISMATCH;
}
- slapi_ch_free((void **)&remote_gen);
- slapi_ch_free((void **)&local_gen);
}
return return_value;
}
diff --git a/ldap/servers/plugins/replication/repl5_replica.c b/ldap/servers/plugins/replication/repl5_replica.c
index f0ea0f8ef..7e56d6557 100644
--- a/ldap/servers/plugins/replication/repl5_replica.c
+++ b/ldap/servers/plugins/replication/repl5_replica.c
@@ -812,6 +812,36 @@ replica_set_ruv(Replica *r, RUV *ruv)
replica_unlock(r->repl_lock);
}
+/*
+ * Compare the replica generation of the local replica r with the one
+ * carried by remote_ruv.
+ * Returns PR_TRUE when both generations are available and identical,
+ * PR_FALSE otherwise (including when either one is missing).
+ */
+int
+replica_check_generation(Replica *r, const RUV *remote_ruv)
+{
+    char *theirs = ruv_get_replica_generation(remote_ruv);
+    char *ours = NULL;
+    Object *ruv_obj;
+    int same_generation;
+
+    PR_ASSERT(NULL != r);
+    ruv_obj = replica_get_ruv(r);
+    if (ruv_obj != NULL) {
+        RUV *ours_ruv = (RUV *)object_get_data(ruv_obj);
+
+        PR_ASSERT(ours_ruv);
+        ours = ruv_get_replica_generation(ours_ruv);
+        object_release(ruv_obj);
+    }
+    /* A generation missing on either side is treated as a mismatch. */
+    same_generation = (theirs != NULL && ours != NULL && strcmp(theirs, ours) == 0) ? PR_TRUE : PR_FALSE;
+    slapi_ch_free_string(&theirs);
+    slapi_ch_free_string(&ours);
+    return same_generation;
+}
+
/*
* Update one particular CSN in an RUV. This is meant to be called
* whenever (a) the server has processed a client operation and
@@ -1298,6 +1328,11 @@ replica_update_csngen_state_ext(Replica *r, const RUV *ruv, const CSN *extracsn)
PR_ASSERT(r && ruv);
+ if (!replica_check_generation(r, ruv)) /* ruv has wrong generation - we are done */
+ {
+ return 0;
+ }
+
rc = ruv_get_max_csn(ruv, &csn);
if (rc != RUV_SUCCESS) {
return -1;
@@ -3713,8 +3748,8 @@ replica_update_ruv_consumer(Replica *r, RUV *supplier_ruv)
replica_lock(r->repl_lock);
local_ruv = (RUV *)object_get_data(r->repl_ruv);
-
- if (is_cleaned_rid(supplier_id) || local_ruv == NULL) {
+ if (is_cleaned_rid(supplier_id) || local_ruv == NULL ||
+ !replica_check_generation(r, supplier_ruv)) {
replica_unlock(r->repl_lock);
return;
}
diff --git a/src/lib389/lib389/dseldif.py b/src/lib389/lib389/dseldif.py
index 10baba4d7..6850c9a8a 100644
--- a/src/lib389/lib389/dseldif.py
+++ b/src/lib389/lib389/dseldif.py
@@ -317,6 +317,43 @@ class DSEldif(DSLint):
return states
+    def _increaseTimeSkew(self, suffix, timeSkew):
+        """Increase the csngen state local_offset of the suffix replica by timeSkew."""
+        # Warning: instance must be stopped before calling this function
+        assert (timeSkew >= 0)
+        nsState = self.readNsState(suffix)[0]
+        self._instance.log.debug(f'_increaseTimeSkew nsState is {nsState}')
+        oldNsState = self.get(nsState['dn'], 'nsState', True)
+        self._instance.log.debug(f'oldNsState is {oldNsState}')
+
+        # Re-encode the new nsState using the native byte order of this host
+        if pack('<h', 1) == pack('=h', 1):
+            end = '<'
+        elif pack('>h', 1) == pack('=h', 1):
+            end = '>'
+        else:
+            raise ValueError("Unknown endian, unable to proceed")
+
+        # NOTE(review): length is taken on the value returned by get(); confirm it is the decoded size
+        thelen = len(oldNsState)
+        if thelen <= 20:
+            pad = 2  # pad bytes following each unsigned short (H)
+            timefmt = 'I'  # timevals are unsigned 32-bit int
+        else:
+            pad = 6  # pad bytes following each unsigned short (H)
+            timefmt = 'Q'  # timevals are unsigned 64-bit int
+        fmtstr = "%sH%dx3%sH%dx" % (end, pad, timefmt, pad)
+        newNsState = base64.b64encode(pack(fmtstr, int(nsState['rid']),
+            int(nsState['gen_time']), int(nsState['local_offset'])+timeSkew,
+            int(nsState['remote_offset']), int(nsState['seq_num'])))
+        newNsState = newNsState.decode('utf-8')
+        self._instance.log.debug(f'newNsState is {newNsState}')
+        # Replace the value (NOTE(review): no trailing newline is appended - confirm _update copes)
+        (entry_dn_i, attr_data) = self._find_attr(nsState['dn'], 'nsState')
+        attr_i = next(iter(attr_data))
+        self._contents[entry_dn_i + attr_i] = f"nsState:: {newNsState}"
+        self._update()
+
class FSChecks(DSLint):
"""This is for the healthcheck feature, check commonly used system config files the
--
2.26.2