From 01e941e3eadd7a208982d20c0ca9c104142f2b91 Mon Sep 17 00:00:00 2001
From: Mark Reynolds
Date: Wed, 10 Aug 2022 08:58:28 -0400
Subject: [PATCH 4/4] Issue 3903 - fix repl keep alive event interval

Description:  Previously we passed the keep alive update interval to the
              event queue as seconds, but the event queue expects
              milliseconds.

              Fixed a crash in replication logging when decoding an
              extended op payload (referrals).

              Also reworked a lot of the replication CI tests that were
              flaky.

relates: https://github.com/389ds/389-ds-base/issues/3903

Reviewed by: tbordaz & spichugi (Thanks!)
---
 .../suites/replication/acceptance_test.py     |  52 +-
 .../cleanallruv_abort_certify_test.py         | 136 ++++
 .../cleanallruv_abort_restart_test.py         | 146 ++++
 .../replication/cleanallruv_abort_test.py     | 123 +++
 .../replication/cleanallruv_force_test.py     | 187 +++++
 .../cleanallruv_multiple_force_test.py        | 214 +++++
 .../replication/cleanallruv_restart_test.py   | 161 ++++
 .../cleanallruv_shutdown_crash_test.py        | 123 +++
 .../replication/cleanallruv_stress_test.py    | 216 +++++
 .../suites/replication/cleanallruv_test.py    | 742 +-----------------
 .../suites/replication/regression_m2_test.py  |  13 +-
 .../replication/regression_m2c2_test.py       |   1 +
 .../plugins/replication/repl5_replica.c       |  12 +-
 ldap/servers/plugins/replication/repl_extop.c |   4 +-
 ldap/servers/slapd/task.c                     |   8 +-
 src/lib389/lib389/instance/remove.py          |   6 +
 16 files changed, 1385 insertions(+), 759 deletions(-)
 create mode 100644 dirsrvtests/tests/suites/replication/cleanallruv_abort_certify_test.py
 create mode 100644 dirsrvtests/tests/suites/replication/cleanallruv_abort_restart_test.py
 create mode 100644 dirsrvtests/tests/suites/replication/cleanallruv_abort_test.py
 create mode 100644 dirsrvtests/tests/suites/replication/cleanallruv_force_test.py
 create mode 100644 dirsrvtests/tests/suites/replication/cleanallruv_multiple_force_test.py
 create mode 100644 dirsrvtests/tests/suites/replication/cleanallruv_restart_test.py
 create mode 100644 dirsrvtests/tests/suites/replication/cleanallruv_shutdown_crash_test.py
 create mode 100644 dirsrvtests/tests/suites/replication/cleanallruv_stress_test.py

diff --git a/dirsrvtests/tests/suites/replication/acceptance_test.py b/dirsrvtests/tests/suites/replication/acceptance_test.py
index a5f0c4c6b..863ee2553 100644
--- a/dirsrvtests/tests/suites/replication/acceptance_test.py
+++ b/dirsrvtests/tests/suites/replication/acceptance_test.py
@@ -8,6 +8,7 @@
 #
 import pytest
 import logging
+import time
 from lib389.replica import Replicas
 from lib389.tasks import *
 from lib389.utils import *
@@ -124,12 +125,16 @@ def test_modify_entry(topo_m4, create_entry):
         8. Some time should pass
         9. 
The change should be present on all suppliers """ + if DEBUGGING: + sleep_time = 8 + else: + sleep_time = 2 log.info('Modifying entry {} - add operation'.format(TEST_ENTRY_DN)) test_user = UserAccount(topo_m4.ms["supplier1"], TEST_ENTRY_DN) test_user.add('mail', '{}@redhat.com'.format(TEST_ENTRY_NAME)) - time.sleep(1) + time.sleep(sleep_time) all_user = topo_m4.all_get_dsldapobject(TEST_ENTRY_DN, UserAccount) for u in all_user: @@ -137,7 +142,7 @@ def test_modify_entry(topo_m4, create_entry): log.info('Modifying entry {} - replace operation'.format(TEST_ENTRY_DN)) test_user.replace('mail', '{}@greenhat.com'.format(TEST_ENTRY_NAME)) - time.sleep(1) + time.sleep(sleep_time) all_user = topo_m4.all_get_dsldapobject(TEST_ENTRY_DN, UserAccount) for u in all_user: @@ -145,7 +150,7 @@ def test_modify_entry(topo_m4, create_entry): log.info('Modifying entry {} - delete operation'.format(TEST_ENTRY_DN)) test_user.remove('mail', '{}@greenhat.com'.format(TEST_ENTRY_NAME)) - time.sleep(1) + time.sleep(sleep_time) all_user = topo_m4.all_get_dsldapobject(TEST_ENTRY_DN, UserAccount) for u in all_user: @@ -167,7 +172,10 @@ def test_delete_entry(topo_m4, create_entry): log.info('Deleting entry {} during the test'.format(TEST_ENTRY_DN)) topo_m4.ms["supplier1"].delete_s(TEST_ENTRY_DN) - + if DEBUGGING: + time.sleep(8) + else: + time.sleep(1) entries = get_repl_entries(topo_m4, TEST_ENTRY_NAME, ["uid"]) assert not entries, "Entry deletion {} wasn't replicated successfully".format(TEST_ENTRY_DN) @@ -231,6 +239,11 @@ def test_modrdn_after_pause(topo_m4): 5. The change should be present on all suppliers """ + if DEBUGGING: + sleep_time = 8 + else: + sleep_time = 3 + newrdn_name = 'newrdn' newrdn_dn = 'uid={},{}'.format(newrdn_name, DEFAULT_SUFFIX) @@ -264,7 +277,7 @@ def test_modrdn_after_pause(topo_m4): topo_m4.resume_all_replicas() log.info('Wait for replication to happen') - time.sleep(3) + time.sleep(sleep_time) try: entries_new = get_repl_entries(topo_m4, newrdn_name, ["uid"]) @@ -354,6 +367,11 @@ def test_many_attrs(topo_m4, create_entry): for add_name in add_list: test_user.add('description', add_name) + if DEBUGGING: + time.sleep(10) + else: + time.sleep(1) + log.info('Check that everything was properly replicated after an add operation') entries = get_repl_entries(topo_m4, TEST_ENTRY_NAME, ["description"]) for entry in entries: @@ -363,6 +381,11 @@ def test_many_attrs(topo_m4, create_entry): for delete_name in delete_list: test_user.remove('description', delete_name) + if DEBUGGING: + time.sleep(10) + else: + time.sleep(1) + log.info('Check that everything was properly replicated after a delete operation') entries = get_repl_entries(topo_m4, TEST_ENTRY_NAME, ["description"]) for entry in entries: @@ -386,12 +409,22 @@ def test_double_delete(topo_m4, create_entry): log.info('Deleting entry {} from supplier1'.format(TEST_ENTRY_DN)) topo_m4.ms["supplier1"].delete_s(TEST_ENTRY_DN) + if DEBUGGING: + time.sleep(5) + else: + time.sleep(1) + log.info('Deleting entry {} from supplier2'.format(TEST_ENTRY_DN)) try: topo_m4.ms["supplier2"].delete_s(TEST_ENTRY_DN) except ldap.NO_SUCH_OBJECT: log.info("Entry {} wasn't found supplier2. 
It is expected.".format(TEST_ENTRY_DN)) + if DEBUGGING: + time.sleep(5) + else: + time.sleep(1) + log.info('Make searches to check if server is alive') entries = get_repl_entries(topo_m4, TEST_ENTRY_NAME, ["uid"]) assert not entries, "Entry deletion {} wasn't replicated successfully".format(TEST_ENTRY_DN) @@ -436,6 +469,11 @@ def test_password_repl_error(topo_m4, create_entry): m3_conn = test_user_m3.bind(TEST_ENTRY_NEW_PASS) m4_conn = test_user_m4.bind(TEST_ENTRY_NEW_PASS) + if DEBUGGING: + time.sleep(5) + else: + time.sleep(1) + log.info('Check the error log for the error with {}'.format(TEST_ENTRY_DN)) assert not m2.ds_error_log.match('.*can.t add a change for uid={}.*'.format(TEST_ENTRY_NAME)) @@ -552,7 +590,7 @@ def test_csnpurge_large_valueset(topo_m2): replica = replicas.list()[0] log.info('nsds5ReplicaPurgeDelay to 5') replica.set('nsds5ReplicaPurgeDelay', '5') - time.sleep(6) + time.sleep(10) # add some new values to the valueset containing entries that should be purged for i in range(21,25): @@ -612,7 +650,7 @@ def test_urp_trigger_substring_search(topo_m2): break else: log.info('Entry not yet replicated on M2, wait a bit') - time.sleep(2) + time.sleep(3) # check that M2 access logs does not "(&(objectclass=nstombstone)(nscpentrydn=uid=asterisk_*_in_value,dc=example,dc=com))" log.info('Check that on M2, URP as not triggered such internal search') diff --git a/dirsrvtests/tests/suites/replication/cleanallruv_abort_certify_test.py b/dirsrvtests/tests/suites/replication/cleanallruv_abort_certify_test.py new file mode 100644 index 000000000..603693b9e --- /dev/null +++ b/dirsrvtests/tests/suites/replication/cleanallruv_abort_certify_test.py @@ -0,0 +1,136 @@ +# --- BEGIN COPYRIGHT BLOCK --- +# Copyright (C) 2022 Red Hat, Inc. +# All rights reserved. +# +# License: GPL (version 3 or any later version). +# See LICENSE for details. +# --- END COPYRIGHT BLOCK --- +# +import logging +import pytest +import os +import time +from lib389._constants import DEFAULT_SUFFIX +from lib389.topologies import topology_m4 +from lib389.tasks import CleanAllRUVTask +from lib389.replica import ReplicationManager, Replicas + +log = logging.getLogger(__name__) + + +def remove_supplier4_agmts(msg, topology_m4): + """Remove all the repl agmts to supplier4. """ + + log.info('%s: remove all the agreements to supplier 4...' % msg) + repl = ReplicationManager(DEFAULT_SUFFIX) + # This will delete m4 from the topo *and* remove all incoming agreements + # to m4. + repl.remove_supplier(topology_m4.ms["supplier4"], + [topology_m4.ms["supplier1"], topology_m4.ms["supplier2"], topology_m4.ms["supplier3"]]) + +def task_done(topology_m4, task_dn, timeout=60): + """Check if the task is complete""" + + attrlist = ['nsTaskLog', 'nsTaskStatus', 'nsTaskExitCode', + 'nsTaskCurrentItem', 'nsTaskTotalItems'] + done = False + count = 0 + + while not done and count < timeout: + try: + entry = topology_m4.ms["supplier1"].getEntry(task_dn, attrlist=attrlist) + if entry is not None: + if entry.hasAttr('nsTaskExitCode'): + done = True + break + else: + done = True + break + except ldap.NO_SUCH_OBJECT: + done = True + break + except ldap.LDAPError: + break + time.sleep(1) + count += 1 + + return done + +@pytest.mark.flaky(max_runs=2, min_passes=1) +def test_abort_certify(topology_m4): + """Test the abort task with a replica-certify-all option + + :id: 78959966-d644-44a8-b98c-1fcf21b45eb0 + :setup: Replication setup with four suppliers + :steps: + 1. Disable replication on supplier 4 + 2. 
Remove agreements to supplier 4 from other suppliers + 3. Stop supplier 2 + 4. Run a cleanallruv task on supplier 1 + 5. Run a cleanallruv abort task on supplier 1 with a replica-certify-all option + :expectedresults: No hanging tasks left + 1. Replication on supplier 4 should be disabled + 2. Agreements to supplier 4 should be removed + 3. Supplier 2 should be stopped + 4. Operation should be successful + 5. Operation should be successful + """ + + log.info('Running test_abort_certify...') + + # Remove the agreements from the other suppliers that point to supplier 4 + repl = ReplicationManager(DEFAULT_SUFFIX) + m4rid = repl.get_rid(topology_m4.ms["supplier4"]) + remove_supplier4_agmts("test_abort_certify", topology_m4) + + # Stop supplier 2 + log.info('test_abort_certify: stop supplier 2 to freeze the cleanAllRUV task...') + topology_m4.ms["supplier2"].stop() + + # Run the task + log.info('test_abort_certify: add the cleanAllRUV task...') + cruv_task = CleanAllRUVTask(topology_m4.ms["supplier1"]) + cruv_task.create(properties={ + 'replica-id': m4rid, + 'replica-base-dn': DEFAULT_SUFFIX, + 'replica-force-cleaning': 'no', + 'replica-certify-all': 'yes' + }) + # Wait a bit + time.sleep(2) + + # Abort the task + log.info('test_abort_certify: abort the cleanAllRUV task...') + abort_task = cruv_task.abort(certify=True) + + # Wait a while and make sure the abort task is still running + log.info('test_abort_certify...') + + if task_done(topology_m4, abort_task.dn, 10): + log.fatal('test_abort_certify: abort task incorrectly finished') + assert False + + # Now start supplier 2 so it can be aborted + log.info('test_abort_certify: start supplier 2 to allow the abort task to finish...') + topology_m4.ms["supplier2"].start() + + # Wait for the abort task to stop + if not task_done(topology_m4, abort_task.dn, 90): + log.fatal('test_abort_certify: The abort CleanAllRUV task was not aborted') + assert False + + # Check supplier 1 does not have the clean task running + log.info('test_abort_certify: check supplier 1 no longer has a cleanAllRUV task...') + if not task_done(topology_m4, cruv_task.dn): + log.fatal('test_abort_certify: CleanAllRUV task was not aborted') + assert False + + log.info('test_abort_certify PASSED') + + +if __name__ == '__main__': + # Run isolated + # -s for DEBUG mode + CURRENT_FILE = os.path.realpath(__file__) + pytest.main(["-s", CURRENT_FILE]) + diff --git a/dirsrvtests/tests/suites/replication/cleanallruv_abort_restart_test.py b/dirsrvtests/tests/suites/replication/cleanallruv_abort_restart_test.py new file mode 100644 index 000000000..1406c6553 --- /dev/null +++ b/dirsrvtests/tests/suites/replication/cleanallruv_abort_restart_test.py @@ -0,0 +1,146 @@ +# --- BEGIN COPYRIGHT BLOCK --- +# Copyright (C) 2022 Red Hat, Inc. +# All rights reserved. +# +# License: GPL (version 3 or any later version). +# See LICENSE for details. +# --- END COPYRIGHT BLOCK --- +# +import logging +import pytest +import os +import time +from lib389._constants import DEFAULT_SUFFIX +from lib389.topologies import topology_m4 +from lib389.tasks import CleanAllRUVTask +from lib389.replica import ReplicationManager + +log = logging.getLogger(__name__) + + +def remove_supplier4_agmts(msg, topology_m4): + """Remove all the repl agmts to supplier4. """ + + log.info('%s: remove all the agreements to supplier 4...' % msg) + repl = ReplicationManager(DEFAULT_SUFFIX) + # This will delete m4 from the topo *and* remove all incoming agreements + # to m4. 
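+    # After this call supplier 4 is still running, but it has been deleted
+    # from the replication topology and every inbound agreement from
+    # suppliers 1-3 is gone. This is the "disable replication on supplier 4"
+    # plus "remove agreements" setup that the test docstrings describe.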
+ repl.remove_supplier(topology_m4.ms["supplier4"], + [topology_m4.ms["supplier1"], topology_m4.ms["supplier2"], topology_m4.ms["supplier3"]]) + +def task_done(topology_m4, task_dn, timeout=60): + """Check if the task is complete""" + + attrlist = ['nsTaskLog', 'nsTaskStatus', 'nsTaskExitCode', + 'nsTaskCurrentItem', 'nsTaskTotalItems'] + done = False + count = 0 + + while not done and count < timeout: + try: + entry = topology_m4.ms["supplier1"].getEntry(task_dn, attrlist=attrlist) + if entry is not None: + if entry.hasAttr('nsTaskExitCode'): + done = True + break + else: + done = True + break + except ldap.NO_SUCH_OBJECT: + done = True + break + except ldap.LDAPError: + break + time.sleep(1) + count += 1 + + return done + + +@pytest.mark.flaky(max_runs=2, min_passes=1) +def test_abort_restart(topology_m4): + """Test the abort task can handle a restart, and then resume + + :id: b66e33d4-fe85-4e1c-b882-75da80f70ab3 + :setup: Replication setup with four suppliers + :steps: + 1. Disable replication on supplier 4 + 2. Remove agreements to supplier 4 from other suppliers + 3. Stop supplier 3 + 4. Run a cleanallruv task on supplier 1 + 5. Run a cleanallruv abort task on supplier 1 + 6. Restart supplier 1 + 7. Make sure that no crash happened + 8. Start supplier 3 + 9. Check supplier 1 does not have the clean task running + 10. Check that errors log doesn't have 'Aborting abort task' message + :expectedresults: + 1. Replication on supplier 4 should be disabled + 2. Agreements to supplier 4 should be removed + 3. Supplier 3 should be stopped + 4. Operation should be successful + 5. Operation should be successful + 6. Supplier 1 should be restarted + 7. No crash should happened + 8. Supplier 3 should be started + 9. Check supplier 1 shouldn't have the clean task running + 10. Errors log shouldn't have 'Aborting abort task' message + """ + + log.info('Running test_abort_restart...') + # Remove the agreements from the other suppliers that point to supplier 4 + repl = ReplicationManager(DEFAULT_SUFFIX) + m4rid = repl.get_rid(topology_m4.ms["supplier4"]) + remove_supplier4_agmts("test_abort", topology_m4) + + # Stop supplier 3 + log.info('test_abort_restart: stop supplier 3 to freeze the cleanAllRUV task...') + topology_m4.ms["supplier3"].stop() + + # Run the task + log.info('test_abort_restart: add the cleanAllRUV task...') + cruv_task = CleanAllRUVTask(topology_m4.ms["supplier1"]) + cruv_task.create(properties={ + 'replica-id': m4rid, + 'replica-base-dn': DEFAULT_SUFFIX, + 'replica-force-cleaning': 'no', + 'replica-certify-all': 'yes' + }) + # Wait a bit + time.sleep(2) + + # Abort the task + cruv_task.abort(certify=True) + + # Check supplier 1 does not have the clean task running + log.info('test_abort_abort: check supplier 1 no longer has a cleanAllRUV task...') + if not task_done(topology_m4, cruv_task.dn): + log.fatal('test_abort_restart: CleanAllRUV task was not aborted') + assert False + + # Now restart supplier 1, and make sure the abort process completes + topology_m4.ms["supplier1"].restart() + if topology_m4.ms["supplier1"].detectDisorderlyShutdown(): + log.fatal('test_abort_restart: Supplier 1 previously crashed!') + assert False + + # Start supplier 3 + topology_m4.ms["supplier3"].start() + + # Need to wait 5 seconds before server processes any leftover tasks + time.sleep(6) + + # Check supplier 1 tried to run abort task. We expect the abort task to be aborted. 
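+    # The clean/abort task state is persisted with the replica, so after the
+    # restart above supplier 1 replays the leftover abort task; aborting it
+    # again is what produces the 'Aborting abort task' message. A
+    # hypothetical, more patient variant of the check below would poll the
+    # errors log instead of relying on the fixed sleep:
+    #     for _ in range(30):
+    #         if topology_m4.ms["supplier1"].searchErrorsLog('Aborting abort task'):
+    #             break
+    #         time.sleep(1)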
+ if not topology_m4.ms["supplier1"].searchErrorsLog('Aborting abort task'): + log.fatal('test_abort_restart: Abort task did not restart') + assert False + + log.info('test_abort_restart PASSED') + + +if __name__ == '__main__': + # Run isolated + # -s for DEBUG mode + CURRENT_FILE = os.path.realpath(__file__) + pytest.main(["-s", CURRENT_FILE]) + diff --git a/dirsrvtests/tests/suites/replication/cleanallruv_abort_test.py b/dirsrvtests/tests/suites/replication/cleanallruv_abort_test.py new file mode 100644 index 000000000..f89188165 --- /dev/null +++ b/dirsrvtests/tests/suites/replication/cleanallruv_abort_test.py @@ -0,0 +1,123 @@ +# --- BEGIN COPYRIGHT BLOCK --- +# Copyright (C) 2022 Red Hat, Inc. +# All rights reserved. +# +# License: GPL (version 3 or any later version). +# See LICENSE for details. +# --- END COPYRIGHT BLOCK --- +# +import logging +import pytest +import os +import time +from lib389._constants import DEFAULT_SUFFIX +from lib389.topologies import topology_m4 +from lib389.tasks import CleanAllRUVTask +from lib389.replica import ReplicationManager + +log = logging.getLogger(__name__) + + +def remove_supplier4_agmts(msg, topology_m4): + """Remove all the repl agmts to supplier4. """ + + log.info('%s: remove all the agreements to supplier 4...' % msg) + repl = ReplicationManager(DEFAULT_SUFFIX) + # This will delete m4 from the topo *and* remove all incoming agreements + # to m4. + repl.remove_supplier(topology_m4.ms["supplier4"], + [topology_m4.ms["supplier1"], topology_m4.ms["supplier2"], topology_m4.ms["supplier3"]]) + +def task_done(topology_m4, task_dn, timeout=60): + """Check if the task is complete""" + + attrlist = ['nsTaskLog', 'nsTaskStatus', 'nsTaskExitCode', + 'nsTaskCurrentItem', 'nsTaskTotalItems'] + done = False + count = 0 + + while not done and count < timeout: + try: + entry = topology_m4.ms["supplier1"].getEntry(task_dn, attrlist=attrlist) + if entry is not None: + if entry.hasAttr('nsTaskExitCode'): + done = True + break + else: + done = True + break + except ldap.NO_SUCH_OBJECT: + done = True + break + except ldap.LDAPError: + break + time.sleep(1) + count += 1 + + return done + + +@pytest.mark.flaky(max_runs=2, min_passes=1) +def test_abort(topology_m4): + """Test the abort task basic functionality + + :id: b09a6887-8de0-4fac-8e41-73ccbaaf7a08 + :setup: Replication setup with four suppliers + :steps: + 1. Disable replication on supplier 4 + 2. Remove agreements to supplier 4 from other suppliers + 3. Stop supplier 2 + 4. Run a cleanallruv task on supplier 1 + 5. Run a cleanallruv abort task on supplier 1 + :expectedresults: No hanging tasks left + 1. Replication on supplier 4 should be disabled + 2. Agreements to supplier 4 should be removed + 3. Supplier 2 should be stopped + 4. Operation should be successful + 5. 
Operation should be successful + """ + + log.info('Running test_abort...') + # Remove the agreements from the other suppliers that point to supplier 4 + repl = ReplicationManager(DEFAULT_SUFFIX) + m4rid = repl.get_rid(topology_m4.ms["supplier4"]) + remove_supplier4_agmts("test_abort", topology_m4) + + # Stop supplier 2 + log.info('test_abort: stop supplier 2 to freeze the cleanAllRUV task...') + topology_m4.ms["supplier2"].stop() + + # Run the task + log.info('test_abort: add the cleanAllRUV task...') + cruv_task = CleanAllRUVTask(topology_m4.ms["supplier1"]) + cruv_task.create(properties={ + 'replica-id': m4rid, + 'replica-base-dn': DEFAULT_SUFFIX, + 'replica-force-cleaning': 'no', + 'replica-certify-all': 'yes' + }) + # Wait a bit + time.sleep(2) + + # Abort the task + cruv_task.abort() + + # Check supplier 1 does not have the clean task running + log.info('test_abort: check supplier 1 no longer has a cleanAllRUV task...') + if not task_done(topology_m4, cruv_task.dn): + log.fatal('test_abort: CleanAllRUV task was not aborted') + assert False + + # Start supplier 2 + log.info('test_abort: start supplier 2 to begin the restore process...') + topology_m4.ms["supplier2"].start() + + log.info('test_abort PASSED') + + +if __name__ == '__main__': + # Run isolated + # -s for DEBUG mode + CURRENT_FILE = os.path.realpath(__file__) + pytest.main(["-s", CURRENT_FILE]) + diff --git a/dirsrvtests/tests/suites/replication/cleanallruv_force_test.py b/dirsrvtests/tests/suites/replication/cleanallruv_force_test.py new file mode 100644 index 000000000..d5b930584 --- /dev/null +++ b/dirsrvtests/tests/suites/replication/cleanallruv_force_test.py @@ -0,0 +1,187 @@ +# --- BEGIN COPYRIGHT BLOCK --- +# Copyright (C) 2022 Red Hat, Inc. +# All rights reserved. +# +# License: GPL (version 3 or any later version). +# See LICENSE for details. +# --- END COPYRIGHT BLOCK --- +# +import logging +import pytest +import os +import time +import random +import threading +from lib389._constants import DEFAULT_SUFFIX +from lib389.topologies import topology_m4 +from lib389.tasks import CleanAllRUVTask +from lib389.replica import Replicas, ReplicationManager +from lib389.idm.directorymanager import DirectoryManager +from lib389.idm.user import UserAccounts + +log = logging.getLogger(__name__) + + +class AddUsers(threading.Thread): + def __init__(self, inst, num_users): + threading.Thread.__init__(self) + self.daemon = True + self.inst = inst + self.num_users = num_users + + def run(self): + """Start adding users""" + + dm = DirectoryManager(self.inst) + conn = dm.bind() + + users = UserAccounts(conn, DEFAULT_SUFFIX) + + u_range = list(range(self.num_users)) + random.shuffle(u_range) + + for idx in u_range: + try: + users.create(properties={ + 'uid': 'testuser%s' % idx, + 'cn' : 'testuser%s' % idx, + 'sn' : 'user%s' % idx, + 'uidNumber' : '%s' % (1000 + idx), + 'gidNumber' : '%s' % (1000 + idx), + 'homeDirectory' : '/home/testuser%s' % idx + }) + # One of the suppliers was probably put into read only mode - just break out + except ldap.UNWILLING_TO_PERFORM: + break + except ldap.ALREADY_EXISTS: + pass + conn.close() + +def remove_some_supplier4_agmts(msg, topology_m4): + """Remove all the repl agmts to supplier4 except from supplier3. Used by + the force tests.""" + + log.info('%s: remove the agreements to supplier 4...' % msg) + repl = ReplicationManager(DEFAULT_SUFFIX) + # This will delete m4 from the topo *and* remove all incoming agreements + # to m4. 
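+    # Unlike remove_supplier4_agmts() in the other cleanallruv suites,
+    # supplier 3 is deliberately left out of this list. The force tests
+    # delete supplier 3's agreement to replica 4 by hand after restarting
+    # it, so supplier 3 ends up out of sync with the other suppliers with
+    # respect to the replica 4 RUV.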
+ repl.remove_supplier(topology_m4.ms["supplier4"], + [topology_m4.ms["supplier1"], topology_m4.ms["supplier2"]]) + +def task_done(topology_m4, task_dn, timeout=60): + """Check if the task is complete""" + + attrlist = ['nsTaskLog', 'nsTaskStatus', 'nsTaskExitCode', + 'nsTaskCurrentItem', 'nsTaskTotalItems'] + done = False + count = 0 + + while not done and count < timeout: + try: + entry = topology_m4.ms["supplier1"].getEntry(task_dn, attrlist=attrlist) + if entry is not None: + if entry.hasAttr('nsTaskExitCode'): + done = True + break + else: + done = True + break + except ldap.NO_SUCH_OBJECT: + done = True + break + except ldap.LDAPError: + break + time.sleep(1) + count += 1 + + return done + +def check_ruvs(msg, topology_m4, m4rid): + """Check suppliers 1-3 for supplier 4's rid.""" + for inst in (topology_m4.ms["supplier1"], topology_m4.ms["supplier2"], topology_m4.ms["supplier3"]): + clean = False + replicas = Replicas(inst) + replica = replicas.get(DEFAULT_SUFFIX) + log.info('check_ruvs for replica %s:%s (suffix:rid)' % (replica.get_suffix(), replica.get_rid())) + + count = 0 + while not clean and count < 20: + ruv = replica.get_ruv() + if m4rid in ruv._rids: + time.sleep(5) + count = count + 1 + else: + clean = True + if not clean: + raise Exception("Supplier %s was not cleaned in time." % inst.serverid) + return True + +def test_clean_force(topology_m4): + """Check that multiple tasks with a 'force' option work properly + + :id: f8810dfe-d2d2-4dd9-ba03-5fc14896fabe + :setup: Replication setup with four suppliers + :steps: + 1. Stop supplier 3 + 2. Add a bunch of updates to supplier 4 + 3. Disable replication on supplier 4 + 4. Start supplier 3 + 5. Remove agreements to supplier 4 from other suppliers + 6. Run a cleanallruv task on supplier 1 with a 'force' option 'on' + 7. Check that everything was cleaned + :expectedresults: + 1. Supplier 3 should be stopped + 2. Operation should be successful + 3. Replication on supplier 4 should be disabled + 4. Supplier 3 should be started + 5. Agreements to supplier 4 should be removed + 6. Operation should be successful + 7. Everything should be cleaned + """ + + log.info('Running test_clean_force...') + + # Stop supplier 3, while we update supplier 4, so that 3 is behind the other suppliers + topology_m4.ms["supplier3"].stop() + + # Add a bunch of updates to supplier 4 + m4_add_users = AddUsers(topology_m4.ms["supplier4"], 10) + m4_add_users.start() + m4_add_users.join() + + # Remove the agreements from the other suppliers that point to supplier 4 + repl = ReplicationManager(DEFAULT_SUFFIX) + m4rid = repl.get_rid(topology_m4.ms["supplier4"]) + remove_some_supplier4_agmts("test_clean_force", topology_m4) + + # Start supplier 3, it should be out of sync with the other replicas... 
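+    # Supplier 3 was down while AddUsers wrote to supplier 4, so its copy of
+    # the replica 4 RUV now lags the other suppliers. That lag is exactly
+    # the condition 'replica-force-cleaning: yes' is meant to handle.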
+ topology_m4.ms["supplier3"].start() + + # Remove the agreement to replica 4 + replica = Replicas(topology_m4.ms["supplier3"]).get(DEFAULT_SUFFIX) + replica.get_agreements().get("004").delete() + + # Run the task, use "force" because supplier 3 is not in sync with the other replicas + # in regards to the replica 4 RUV + log.info('test_clean: run the cleanAllRUV task...') + cruv_task = CleanAllRUVTask(topology_m4.ms["supplier1"]) + cruv_task.create(properties={ + 'replica-id': m4rid, + 'replica-base-dn': DEFAULT_SUFFIX, + 'replica-force-cleaning': 'yes' + }) + cruv_task.wait() + + # Check the other supplier's RUV for 'replica 4' + log.info('test_clean_force: check all the suppliers have been cleaned...') + clean = check_ruvs("test_clean_force", topology_m4, m4rid) + assert clean + + log.info('test_clean_force PASSED') + + +if __name__ == '__main__': + # Run isolated + # -s for DEBUG mode + CURRENT_FILE = os.path.realpath(__file__) + pytest.main(["-s", CURRENT_FILE]) diff --git a/dirsrvtests/tests/suites/replication/cleanallruv_multiple_force_test.py b/dirsrvtests/tests/suites/replication/cleanallruv_multiple_force_test.py new file mode 100644 index 000000000..0a0848bda --- /dev/null +++ b/dirsrvtests/tests/suites/replication/cleanallruv_multiple_force_test.py @@ -0,0 +1,214 @@ +# --- BEGIN COPYRIGHT BLOCK --- +# Copyright (C) 2022 Red Hat, Inc. +# All rights reserved. +# +# License: GPL (version 3 or any later version). +# See LICENSE for details. +# --- END COPYRIGHT BLOCK --- +# +import ldap +import logging +import os +import pytest +import random +import time +import threading +from lib389._constants import DEFAULT_SUFFIX +from lib389.topologies import topology_m4 +from lib389.tasks import CleanAllRUVTask +from lib389.idm.directorymanager import DirectoryManager +from lib389.idm.user import UserAccounts +from lib389.replica import ReplicationManager, Replicas + +log = logging.getLogger(__name__) + + +class AddUsers(threading.Thread): + def __init__(self, inst, num_users): + threading.Thread.__init__(self) + self.daemon = True + self.inst = inst + self.num_users = num_users + + def run(self): + """Start adding users""" + + dm = DirectoryManager(self.inst) + conn = dm.bind() + + users = UserAccounts(conn, DEFAULT_SUFFIX) + + u_range = list(range(self.num_users)) + random.shuffle(u_range) + + for idx in u_range: + try: + users.create(properties={ + 'uid': 'testuser%s' % idx, + 'cn' : 'testuser%s' % idx, + 'sn' : 'user%s' % idx, + 'uidNumber' : '%s' % (1000 + idx), + 'gidNumber' : '%s' % (1000 + idx), + 'homeDirectory' : '/home/testuser%s' % idx + }) + # One of the suppliers was probably put into read only mode - just break out + except ldap.UNWILLING_TO_PERFORM: + break + except ldap.ALREADY_EXISTS: + pass + conn.close() + +def remove_some_supplier4_agmts(msg, topology_m4): + """Remove all the repl agmts to supplier4 except from supplier3. Used by + the force tests.""" + + log.info('%s: remove the agreements to supplier 4...' % msg) + repl = ReplicationManager(DEFAULT_SUFFIX) + # This will delete m4 from the topo *and* remove all incoming agreements + # to m4. 
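+    # As in cleanallruv_force_test.py, supplier 3 is intentionally omitted
+    # here; the test removes supplier 3's agreement to replica 4 itself so
+    # that supplier 3 stays behind on the replica 4 RUV.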
+ repl.remove_supplier(topology_m4.ms["supplier4"], + [topology_m4.ms["supplier1"], topology_m4.ms["supplier2"]]) + +def task_done(topology_m4, task_dn, timeout=60): + """Check if the task is complete""" + + attrlist = ['nsTaskLog', 'nsTaskStatus', 'nsTaskExitCode', + 'nsTaskCurrentItem', 'nsTaskTotalItems'] + done = False + count = 0 + + while not done and count < timeout: + try: + entry = topology_m4.ms["supplier1"].getEntry(task_dn, attrlist=attrlist) + if entry is not None: + if entry.hasAttr('nsTaskExitCode'): + done = True + break + else: + done = True + break + except ldap.NO_SUCH_OBJECT: + done = True + break + except ldap.LDAPError: + break + time.sleep(1) + count += 1 + + return done + +def check_ruvs(msg, topology_m4, m4rid): + """Check suppliers 1-3 for supplier 4's rid.""" + for inst in (topology_m4.ms["supplier1"], topology_m4.ms["supplier2"], topology_m4.ms["supplier3"]): + clean = False + replicas = Replicas(inst) + replica = replicas.get(DEFAULT_SUFFIX) + log.info('check_ruvs for replica %s:%s (suffix:rid)' % (replica.get_suffix(), replica.get_rid())) + + count = 0 + while not clean and count < 20: + ruv = replica.get_ruv() + if m4rid in ruv._rids: + time.sleep(5) + count = count + 1 + else: + clean = True + if not clean: + raise Exception("Supplier %s was not cleaned in time." % inst.serverid) + return True + + +def test_multiple_tasks_with_force(topology_m4): + """Check that multiple tasks with a 'force' option work properly + + :id: eb76a93d-8d1c-405e-9f25-6e8d5a781098 + :setup: Replication setup with four suppliers + :steps: + 1. Stop supplier 3 + 2. Add a bunch of updates to supplier 4 + 3. Disable replication on supplier 4 + 4. Start supplier 3 + 5. Remove agreements to supplier 4 from other suppliers + 6. Run a cleanallruv task on supplier 1 with a 'force' option 'on' + 7. Run one more cleanallruv task on supplier 1 with a 'force' option 'off' + 8. Check that everything was cleaned + :expectedresults: + 1. Supplier 3 should be stopped + 2. Operation should be successful + 3. Replication on supplier 4 should be disabled + 4. Supplier 3 should be started + 5. Agreements to supplier 4 should be removed + 6. Operation should be successful + 7. Operation should be successful + 8. Everything should be cleaned + """ + + log.info('Running test_multiple_tasks_with_force...') + + # Stop supplier 3, while we update supplier 4, so that 3 is behind the other suppliers + topology_m4.ms["supplier3"].stop() + repl = ReplicationManager(DEFAULT_SUFFIX) + m4rid = repl.get_rid(topology_m4.ms["supplier4"]) + + # Add a bunch of updates to supplier 4 + m4_add_users = AddUsers(topology_m4.ms["supplier4"], 10) + m4_add_users.start() + m4_add_users.join() + + # Disable supplier 4 + # Remove the agreements from the other suppliers that point to supplier 4 + remove_some_supplier4_agmts("test_multiple_tasks_with_force", topology_m4) + + # Start supplier 3, it should be out of sync with the other replicas... 
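+    # Supplier 3 missed the updates supplier 4 received while it was down,
+    # so both cleanAllRUV tasks below run against a topology in which one
+    # replica is behind on the replica 4 RUV.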
+ topology_m4.ms["supplier3"].start() + + # Remove the agreement to replica 4 + replica = Replicas(topology_m4.ms["supplier3"]).get(DEFAULT_SUFFIX) + replica.get_agreements().get("004").delete() + + # Run the task, use "force" because supplier 3 is not in sync with the other replicas + # in regards to the replica 4 RUV + log.info('test_multiple_tasks_with_force: run the cleanAllRUV task with "force" on...') + cruv_task = CleanAllRUVTask(topology_m4.ms["supplier1"]) + cruv_task.create(properties={ + 'replica-id': m4rid, + 'replica-base-dn': DEFAULT_SUFFIX, + 'replica-force-cleaning': 'yes', + 'replica-certify-all': 'no' + }) + + log.info('test_multiple_tasks_with_force: run the cleanAllRUV task with "force" off...') + + # NOTE: This must be try not py.test raises, because the above may or may + # not have completed yet .... + try: + cruv_task_fail = CleanAllRUVTask(topology_m4.ms["supplier1"]) + cruv_task_fail.create(properties={ + 'replica-id': m4rid, + 'replica-base-dn': DEFAULT_SUFFIX, + 'replica-force-cleaning': 'no', + 'replica-certify-all': 'no' + }) + cruv_task_fail.wait() + except ldap.UNWILLING_TO_PERFORM: + pass + # Wait for the force task .... + cruv_task.wait() + + # Check the other supplier's RUV for 'replica 4' + log.info('test_multiple_tasks_with_force: check all the suppliers have been cleaned...') + clean = check_ruvs("test_clean_force", topology_m4, m4rid) + assert clean + # Check supplier 1 does not have the clean task running + log.info('test_abort: check supplier 1 no longer has a cleanAllRUV task...') + if not task_done(topology_m4, cruv_task.dn): + log.fatal('test_abort: CleanAllRUV task was not aborted') + assert False + + +if __name__ == '__main__': + # Run isolated + # -s for DEBUG mode + CURRENT_FILE = os.path.realpath(__file__) + pytest.main(["-s", CURRENT_FILE]) + diff --git a/dirsrvtests/tests/suites/replication/cleanallruv_restart_test.py b/dirsrvtests/tests/suites/replication/cleanallruv_restart_test.py new file mode 100644 index 000000000..2e8d7e4a6 --- /dev/null +++ b/dirsrvtests/tests/suites/replication/cleanallruv_restart_test.py @@ -0,0 +1,161 @@ +# --- BEGIN COPYRIGHT BLOCK --- +# Copyright (C) 2022 Red Hat, Inc. +# All rights reserved. +# +# License: GPL (version 3 or any later version). +# See LICENSE for details. +# --- END COPYRIGHT BLOCK --- +# +import logging +import pytest +import os +import time +from lib389._constants import DEFAULT_SUFFIX +from lib389.topologies import topology_m4 +from lib389.tasks import CleanAllRUVTask +from lib389.replica import ReplicationManager, Replicas + +log = logging.getLogger(__name__) + + +def remove_supplier4_agmts(msg, topology_m4): + """Remove all the repl agmts to supplier4. """ + + log.info('%s: remove all the agreements to supplier 4...' % msg) + repl = ReplicationManager(DEFAULT_SUFFIX) + # This will delete m4 from the topo *and* remove all incoming agreements + # to m4. 
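+    # Supplier 4 keeps running afterwards; only its place in the topology
+    # and the inbound agreements are removed, so rid 4 lingers in the RUVs
+    # of suppliers 1-3 until the cleanAllRUV task scrubs it out.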
+ repl.remove_supplier(topology_m4.ms["supplier4"], + [topology_m4.ms["supplier1"], topology_m4.ms["supplier2"], topology_m4.ms["supplier3"]]) + +def task_done(topology_m4, task_dn, timeout=60): + """Check if the task is complete""" + + attrlist = ['nsTaskLog', 'nsTaskStatus', 'nsTaskExitCode', + 'nsTaskCurrentItem', 'nsTaskTotalItems'] + done = False + count = 0 + + while not done and count < timeout: + try: + entry = topology_m4.ms["supplier1"].getEntry(task_dn, attrlist=attrlist) + if entry is not None: + if entry.hasAttr('nsTaskExitCode'): + done = True + break + else: + done = True + break + except ldap.NO_SUCH_OBJECT: + done = True + break + except ldap.LDAPError: + break + time.sleep(1) + count += 1 + + return done + + +def check_ruvs(msg, topology_m4, m4rid): + """Check suppliers 1-3 for supplier 4's rid.""" + for inst in (topology_m4.ms["supplier1"], topology_m4.ms["supplier2"], topology_m4.ms["supplier3"]): + clean = False + replicas = Replicas(inst) + replica = replicas.get(DEFAULT_SUFFIX) + log.info('check_ruvs for replica %s:%s (suffix:rid)' % (replica.get_suffix(), replica.get_rid())) + + count = 0 + while not clean and count < 20: + ruv = replica.get_ruv() + if m4rid in ruv._rids: + time.sleep(5) + count = count + 1 + else: + clean = True + if not clean: + raise Exception("Supplier %s was not cleaned in time." % inst.serverid) + return True + + +@pytest.mark.flaky(max_runs=2, min_passes=1) +def test_clean_restart(topology_m4): + """Check that cleanallruv task works properly after a restart + + :id: c6233bb3-092c-4919-9ac9-80dd02cc6e02 + :setup: Replication setup with four suppliers + :steps: + 1. Disable replication on supplier 4 + 2. Remove agreements to supplier 4 from other suppliers + 3. Stop supplier 3 + 4. Run a cleanallruv task on supplier 1 + 5. Stop supplier 1 + 6. Start supplier 3 + 7. Make sure that no crash happened + 8. Start supplier 1 + 9. Make sure that no crash happened + 10. Check that everything was cleaned + :expectedresults: + 1. Operation should be successful + 2. Agreements to supplier 4 should be removed + 3. Supplier 3 should be stopped + 4. Cleanallruv task should be successfully executed + 5. Supplier 1 should be stopped + 6. Supplier 3 should be started + 7. No crash should happened + 8. Supplier 1 should be started + 9. No crash should happened + 10. Everything should be cleaned + """ + log.info('Running test_clean_restart...') + + # Disable supplier 4 + log.info('test_clean: disable supplier 4...') + + # Remove the agreements from the other suppliers that point to supplier 4 + repl = ReplicationManager(DEFAULT_SUFFIX) + m4rid = repl.get_rid(topology_m4.ms["supplier4"]) + remove_supplier4_agmts("test_clean", topology_m4) + + # Stop supplier 3 to keep the task running, so we can stop supplier 1... 
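+    # The task below is created with 'replica-certify-all: yes', so it waits
+    # for every replica to certify the cleaning; keeping supplier 3 stopped
+    # therefore holds the task open long enough to stop supplier 1 mid-run.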
+ topology_m4.ms["supplier3"].stop() + + # Run the task + log.info('test_clean: run the cleanAllRUV task...') + cruv_task = CleanAllRUVTask(topology_m4.ms["supplier1"]) + cruv_task.create(properties={ + 'replica-id': m4rid, + 'replica-base-dn': DEFAULT_SUFFIX, + 'replica-force-cleaning': 'no', + 'replica-certify-all': 'yes' + }) + + # Sleep a bit, then stop supplier 1 + time.sleep(5) + topology_m4.ms["supplier1"].stop() + + # Now start supplier 3 & 1, and make sure we didn't crash + topology_m4.ms["supplier3"].start() + if topology_m4.ms["supplier3"].detectDisorderlyShutdown(): + log.fatal('test_clean_restart: Supplier 3 previously crashed!') + assert False + + topology_m4.ms["supplier1"].start(timeout=30) + if topology_m4.ms["supplier1"].detectDisorderlyShutdown(): + log.fatal('test_clean_restart: Supplier 1 previously crashed!') + assert False + + # Check the other supplier's RUV for 'replica 4' + log.info('test_clean_restart: check all the suppliers have been cleaned...') + clean = check_ruvs("test_clean_restart", topology_m4, m4rid) + assert clean + + log.info('test_clean_restart PASSED, restoring supplier 4...') + + +if __name__ == '__main__': + # Run isolated + # -s for DEBUG mode + CURRENT_FILE = os.path.realpath(__file__) + pytest.main(["-s", CURRENT_FILE]) + diff --git a/dirsrvtests/tests/suites/replication/cleanallruv_shutdown_crash_test.py b/dirsrvtests/tests/suites/replication/cleanallruv_shutdown_crash_test.py new file mode 100644 index 000000000..b4b74e339 --- /dev/null +++ b/dirsrvtests/tests/suites/replication/cleanallruv_shutdown_crash_test.py @@ -0,0 +1,123 @@ +# --- BEGIN COPYRIGHT BLOCK --- +# Copyright (C) 2022 Red Hat, Inc. +# All rights reserved. +# +# License: GPL (version 3 or any later version). +# See LICENSE for details. +# --- END COPYRIGHT BLOCK --- +# +import logging +import pytest +import os +import time +from lib389._constants import DEFAULT_SUFFIX +from lib389.topologies import topology_m4 +from lib389.tasks import CleanAllRUVTask +from lib389.replica import ReplicationManager, Replicas +from lib389.config import CertmapLegacy +from lib389.idm.services import ServiceAccounts + +log = logging.getLogger(__name__) + + +def test_clean_shutdown_crash(topology_m2): + """Check that server didn't crash after shutdown when running CleanAllRUV task + + :id: c34d0b40-3c3e-4f53-8656-5e4c2a310aaf + :setup: Replication setup with two suppliers + :steps: + 1. Enable TLS on both suppliers + 2. Reconfigure both agreements to use TLS Client auth + 3. Stop supplier2 + 4. Run the CleanAllRUV task + 5. Restart supplier1 + 6. Check if supplier1 didn't crash + 7. Restart supplier1 again + 8. Check if supplier1 didn't crash + + :expectedresults: + 1. Success + 2. Success + 3. Success + 4. Success + 5. Success + 6. Success + 7. Success + 8. 
Success + """ + + m1 = topology_m2.ms["supplier1"] + m2 = topology_m2.ms["supplier2"] + + repl = ReplicationManager(DEFAULT_SUFFIX) + + cm_m1 = CertmapLegacy(m1) + cm_m2 = CertmapLegacy(m2) + + certmaps = cm_m1.list() + certmaps['default']['DNComps'] = None + certmaps['default']['CmapLdapAttr'] = 'nsCertSubjectDN' + + cm_m1.set(certmaps) + cm_m2.set(certmaps) + + log.info('Enabling TLS') + [i.enable_tls() for i in topology_m2] + + log.info('Creating replication dns') + services = ServiceAccounts(m1, DEFAULT_SUFFIX) + repl_m1 = services.get('%s:%s' % (m1.host, m1.sslport)) + repl_m1.set('nsCertSubjectDN', m1.get_server_tls_subject()) + + repl_m2 = services.get('%s:%s' % (m2.host, m2.sslport)) + repl_m2.set('nsCertSubjectDN', m2.get_server_tls_subject()) + + log.info('Changing auth type') + replica_m1 = Replicas(m1).get(DEFAULT_SUFFIX) + agmt_m1 = replica_m1.get_agreements().list()[0] + agmt_m1.replace_many( + ('nsDS5ReplicaBindMethod', 'SSLCLIENTAUTH'), + ('nsDS5ReplicaTransportInfo', 'SSL'), + ('nsDS5ReplicaPort', '%s' % m2.sslport), + ) + + agmt_m1.remove_all('nsDS5ReplicaBindDN') + + replica_m2 = Replicas(m2).get(DEFAULT_SUFFIX) + agmt_m2 = replica_m2.get_agreements().list()[0] + + agmt_m2.replace_many( + ('nsDS5ReplicaBindMethod', 'SSLCLIENTAUTH'), + ('nsDS5ReplicaTransportInfo', 'SSL'), + ('nsDS5ReplicaPort', '%s' % m1.sslport), + ) + agmt_m2.remove_all('nsDS5ReplicaBindDN') + + log.info('Stopping supplier2') + m2.stop() + + log.info('Run the cleanAllRUV task') + cruv_task = CleanAllRUVTask(m1) + cruv_task.create(properties={ + 'replica-id': repl.get_rid(m1), + 'replica-base-dn': DEFAULT_SUFFIX, + 'replica-force-cleaning': 'no', + 'replica-certify-all': 'yes' + }) + + m1.restart() + + log.info('Check if supplier1 crashed') + assert not m1.detectDisorderlyShutdown() + + log.info('Repeat') + m1.restart() + assert not m1.detectDisorderlyShutdown() + + +if __name__ == '__main__': + # Run isolated + # -s for DEBUG mode + CURRENT_FILE = os.path.realpath(__file__) + pytest.main(["-s", CURRENT_FILE]) + diff --git a/dirsrvtests/tests/suites/replication/cleanallruv_stress_test.py b/dirsrvtests/tests/suites/replication/cleanallruv_stress_test.py new file mode 100644 index 000000000..0d43dd7d4 --- /dev/null +++ b/dirsrvtests/tests/suites/replication/cleanallruv_stress_test.py @@ -0,0 +1,216 @@ +# --- BEGIN COPYRIGHT BLOCK --- +# Copyright (C) 2022 Red Hat, Inc. +# All rights reserved. +# +# License: GPL (version 3 or any later version). +# See LICENSE for details. 
+# --- END COPYRIGHT BLOCK --- +# +import ldap +import logging +import pytest +import os +import random +import time +import threading +from lib389._constants import DEFAULT_SUFFIX +from lib389.topologies import topology_m4 +from lib389.tasks import CleanAllRUVTask +from lib389.idm.directorymanager import DirectoryManager +from lib389.idm.user import UserAccounts +from lib389.replica import ReplicationManager, Replicas +from lib389.config import LDBMConfig + +log = logging.getLogger(__name__) + + +class AddUsers(threading.Thread): + def __init__(self, inst, num_users): + threading.Thread.__init__(self) + self.daemon = True + self.inst = inst + self.num_users = num_users + + def run(self): + """Start adding users""" + + dm = DirectoryManager(self.inst) + conn = dm.bind() + + users = UserAccounts(conn, DEFAULT_SUFFIX) + + u_range = list(range(self.num_users)) + random.shuffle(u_range) + + for idx in u_range: + try: + users.create(properties={ + 'uid': 'testuser%s' % idx, + 'cn' : 'testuser%s' % idx, + 'sn' : 'user%s' % idx, + 'uidNumber' : '%s' % (1000 + idx), + 'gidNumber' : '%s' % (1000 + idx), + 'homeDirectory' : '/home/testuser%s' % idx + }) + # One of the suppliers was probably put into read only mode - just break out + except ldap.UNWILLING_TO_PERFORM: + break + except ldap.ALREADY_EXISTS: + pass + conn.close() + +def remove_supplier4_agmts(msg, topology_m4): + """Remove all the repl agmts to supplier4. """ + + log.info('%s: remove all the agreements to supplier 4...' % msg) + repl = ReplicationManager(DEFAULT_SUFFIX) + # This will delete m4 from the topo *and* remove all incoming agreements + # to m4. + repl.remove_supplier(topology_m4.ms["supplier4"], + [topology_m4.ms["supplier1"], topology_m4.ms["supplier2"], topology_m4.ms["supplier3"]]) + +def task_done(topology_m4, task_dn, timeout=60): + """Check if the task is complete""" + + attrlist = ['nsTaskLog', 'nsTaskStatus', 'nsTaskExitCode', + 'nsTaskCurrentItem', 'nsTaskTotalItems'] + done = False + count = 0 + + while not done and count < timeout: + try: + entry = topology_m4.ms["supplier1"].getEntry(task_dn, attrlist=attrlist) + if entry is not None: + if entry.hasAttr('nsTaskExitCode'): + done = True + break + else: + done = True + break + except ldap.NO_SUCH_OBJECT: + done = True + break + except ldap.LDAPError: + break + time.sleep(1) + count += 1 + + return done + +def check_ruvs(msg, topology_m4, m4rid): + """Check suppliers 1-3 for supplier 4's rid.""" + for inst in (topology_m4.ms["supplier1"], topology_m4.ms["supplier2"], topology_m4.ms["supplier3"]): + clean = False + replicas = Replicas(inst) + replica = replicas.get(DEFAULT_SUFFIX) + log.info('check_ruvs for replica %s:%s (suffix:rid)' % (replica.get_suffix(), replica.get_rid())) + + count = 0 + while not clean and count < 20: + ruv = replica.get_ruv() + if m4rid in ruv._rids: + time.sleep(5) + count = count + 1 + else: + clean = True + if not clean: + raise Exception("Supplier %s was not cleaned in time." % inst.serverid) + return True + + +@pytest.mark.flaky(max_runs=2, min_passes=1) +def test_stress_clean(topology_m4): + """Put each server(m1 - m4) under a stress, and perform the entire clean process + + :id: a8263cd6-f068-4357-86e0-e7c34504c8c5 + :setup: Replication setup with four suppliers + :steps: + 1. Add a bunch of updates to all suppliers + 2. Put supplier 4 to read-only mode + 3. Disable replication on supplier 4 + 4. Remove agreements to supplier 4 from other suppliers + 5. Run a cleanallruv task on supplier 1 + 6. 
Check that everything was cleaned + :expectedresults: + 1. Operation should be successful + 2. Supplier 4 should be put to read-only mode + 3. Replication on supplier 4 should be disabled + 4. Agreements to supplier 4 should be removed + 5. Operation should be successful + 6. Everything should be cleaned + """ + + log.info('Running test_stress_clean...') + log.info('test_stress_clean: put all the suppliers under load...') + + ldbm_config = LDBMConfig(topology_m4.ms["supplier4"]) + + # Put all the suppliers under load + # not too high load else it takes a long time to converge and + # the test result becomes instable + m1_add_users = AddUsers(topology_m4.ms["supplier1"], 200) + m1_add_users.start() + m2_add_users = AddUsers(topology_m4.ms["supplier2"], 200) + m2_add_users.start() + m3_add_users = AddUsers(topology_m4.ms["supplier3"], 200) + m3_add_users.start() + m4_add_users = AddUsers(topology_m4.ms["supplier4"], 200) + m4_add_users.start() + + # Allow sometime to get replication flowing in all directions + log.info('test_stress_clean: allow some time for replication to get flowing...') + time.sleep(5) + + # Put supplier 4 into read only mode + ldbm_config.set('nsslapd-readonly', 'on') + # We need to wait for supplier 4 to push its changes out + log.info('test_stress_clean: allow some time for supplier 4 to push changes out (60 seconds)...') + time.sleep(60) + + # Remove the agreements from the other suppliers that point to supplier 4 + repl = ReplicationManager(DEFAULT_SUFFIX) + m4rid = repl.get_rid(topology_m4.ms["supplier4"]) + remove_supplier4_agmts("test_stress_clean", topology_m4) + + # Run the task + cruv_task = CleanAllRUVTask(topology_m4.ms["supplier1"]) + cruv_task.create(properties={ + 'replica-id': m4rid, + 'replica-base-dn': DEFAULT_SUFFIX, + 'replica-force-cleaning': 'no' + }) + cruv_task.wait() + + # Wait for the update to finish + log.info('test_stress_clean: wait for all the updates to finish...') + m1_add_users.join() + m2_add_users.join() + m3_add_users.join() + m4_add_users.join() + + # Check the other supplier's RUV for 'replica 4' + log.info('test_stress_clean: check if all the replicas have been cleaned...') + clean = check_ruvs("test_stress_clean", topology_m4, m4rid) + assert clean + + log.info('test_stress_clean: PASSED, restoring supplier 4...') + + # Sleep for a bit to replication complete + log.info("Sleep for 120 seconds to allow replication to complete...") + repl = ReplicationManager(DEFAULT_SUFFIX) + repl.test_replication_topology([ + topology_m4.ms["supplier1"], + topology_m4.ms["supplier2"], + topology_m4.ms["supplier3"], + ], timeout=120) + + # Turn off readonly mode + ldbm_config.set('nsslapd-readonly', 'off') + + +if __name__ == '__main__': + # Run isolated + # -s for DEBUG mode + CURRENT_FILE = os.path.realpath(__file__) + pytest.main(["-s", CURRENT_FILE]) + diff --git a/dirsrvtests/tests/suites/replication/cleanallruv_test.py b/dirsrvtests/tests/suites/replication/cleanallruv_test.py index 1e9cd7c28..6d7141ada 100644 --- a/dirsrvtests/tests/suites/replication/cleanallruv_test.py +++ b/dirsrvtests/tests/suites/replication/cleanallruv_test.py @@ -1,27 +1,20 @@ # --- BEGIN COPYRIGHT BLOCK --- -# Copyright (C) 2019 Red Hat, Inc. +# Copyright (C) 2022 Red Hat, Inc. # All rights reserved. # # License: GPL (version 3 or any later version). # See LICENSE for details. 
# --- END COPYRIGHT BLOCK --- # -import threading import pytest -import random from lib389 import DirSrv from lib389.tasks import * from lib389.utils import * from lib389.topologies import topology_m4, topology_m2 -from lib389._constants import * - -from lib389.idm.directorymanager import DirectoryManager +from lib389._constants import DEFAULT_SUFFIX from lib389.replica import ReplicationManager, Replicas from lib389.tasks import CleanAllRUVTask -from lib389.idm.user import UserAccounts -from lib389.config import LDBMConfig -from lib389.config import CertmapLegacy -from lib389.idm.services import ServiceAccounts + pytestmark = pytest.mark.tier1 @@ -29,42 +22,6 @@ logging.getLogger(__name__).setLevel(logging.DEBUG) log = logging.getLogger(__name__) -class AddUsers(threading.Thread): - def __init__(self, inst, num_users): - threading.Thread.__init__(self) - self.daemon = True - self.inst = inst - self.num_users = num_users - - def run(self): - """Start adding users""" - - dm = DirectoryManager(self.inst) - conn = dm.bind() - - users = UserAccounts(conn, DEFAULT_SUFFIX) - - u_range = list(range(self.num_users)) - random.shuffle(u_range) - - for idx in u_range: - try: - users.create(properties={ - 'uid': 'testuser%s' % idx, - 'cn' : 'testuser%s' % idx, - 'sn' : 'user%s' % idx, - 'uidNumber' : '%s' % (1000 + idx), - 'gidNumber' : '%s' % (1000 + idx), - 'homeDirectory' : '/home/testuser%s' % idx - }) - # One of the suppliers was probably put into read only mode - just break out - except ldap.UNWILLING_TO_PERFORM: - break - except ldap.ALREADY_EXISTS: - pass - conn.close() - - def remove_supplier4_agmts(msg, topology_m4): """Remove all the repl agmts to supplier4. """ @@ -96,92 +53,7 @@ def check_ruvs(msg, topology_m4, m4rid): return True -def task_done(topology_m4, task_dn, timeout=60): - """Check if the task is complete""" - - attrlist = ['nsTaskLog', 'nsTaskStatus', 'nsTaskExitCode', - 'nsTaskCurrentItem', 'nsTaskTotalItems'] - done = False - count = 0 - - while not done and count < timeout: - try: - entry = topology_m4.ms["supplier1"].getEntry(task_dn, attrlist=attrlist) - if entry is not None: - if entry.hasAttr('nsTaskExitCode'): - done = True - break - else: - done = True - break - except ldap.NO_SUCH_OBJECT: - done = True - break - except ldap.LDAPError: - break - time.sleep(1) - count += 1 - - return done - - -def restore_supplier4(topology_m4): - """In our tests will always be removing supplier 4, so we need a common - way to restore it for another test - """ - - # Restart the remaining suppliers to allow rid 4 to be reused. - for inst in topology_m4.ms.values(): - inst.restart() - - repl = ReplicationManager(DEFAULT_SUFFIX) - repl.join_supplier(topology_m4.ms["supplier1"], topology_m4.ms["supplier4"]) - - # Add the 2,3 -> 4 agmt. - repl.ensure_agreement(topology_m4.ms["supplier2"], topology_m4.ms["supplier4"]) - repl.ensure_agreement(topology_m4.ms["supplier3"], topology_m4.ms["supplier4"]) - # And in reverse ... - repl.ensure_agreement(topology_m4.ms["supplier4"], topology_m4.ms["supplier2"]) - repl.ensure_agreement(topology_m4.ms["supplier4"], topology_m4.ms["supplier3"]) - - log.info('Supplier 4 has been successfully restored.') - - -@pytest.fixture() -def m4rid(request, topology_m4): - log.debug("Wait a bit before the reset - it is required for the slow machines") - time.sleep(5) - log.debug("-------------- BEGIN RESET of m4 -----------------") - repl = ReplicationManager(DEFAULT_SUFFIX) - repl.test_replication_topology(topology_m4.ms.values()) - # What is supplier4's rid? 
- m4rid = repl.get_rid(topology_m4.ms["supplier4"]) - - def fin(): - try: - # Restart the suppliers and rerun cleanallruv - for inst in topology_m4.ms.values(): - inst.restart() - - cruv_task = CleanAllRUVTask(topology_m4.ms["supplier1"]) - cruv_task.create(properties={ - 'replica-id': m4rid, - 'replica-base-dn': DEFAULT_SUFFIX, - 'replica-force-cleaning': 'no', - }) - cruv_task.wait() - except ldap.UNWILLING_TO_PERFORM: - # In some casse we already cleaned rid4, so if we fail, it's okay - pass - restore_supplier4(topology_m4) - # Make sure everything works. - repl.test_replication_topology(topology_m4.ms.values()) - request.addfinalizer(fin) - log.debug("-------------- FINISH RESET of m4 -----------------") - return m4rid - - -def test_clean(topology_m4, m4rid): +def test_clean(topology_m4): """Check that cleanallruv task works properly :id: e9b3ce5c-e17c-409e-aafc-e97d630f2878 @@ -204,6 +76,8 @@ def test_clean(topology_m4, m4rid): # Disable supplier 4 # Remove the agreements from the other suppliers that point to supplier 4 log.info('test_clean: disable supplier 4...') + repl = ReplicationManager(DEFAULT_SUFFIX) + m4rid = repl.get_rid(topology_m4.ms["supplier4"]) remove_supplier4_agmts("test_clean", topology_m4) # Run the task @@ -221,610 +95,6 @@ def test_clean(topology_m4, m4rid): clean = check_ruvs("test_clean", topology_m4, m4rid) assert clean - log.info('test_clean PASSED, restoring supplier 4...') - -@pytest.mark.flaky(max_runs=2, min_passes=1) -def test_clean_restart(topology_m4, m4rid): - """Check that cleanallruv task works properly after a restart - - :id: c6233bb3-092c-4919-9ac9-80dd02cc6e02 - :setup: Replication setup with four suppliers - :steps: - 1. Disable replication on supplier 4 - 2. Remove agreements to supplier 4 from other suppliers - 3. Stop supplier 3 - 4. Run a cleanallruv task on supplier 1 - 5. Stop supplier 1 - 6. Start supplier 3 - 7. Make sure that no crash happened - 8. Start supplier 1 - 9. Make sure that no crash happened - 10. Check that everything was cleaned - :expectedresults: - 1. Operation should be successful - 2. Agreements to supplier 4 should be removed - 3. Supplier 3 should be stopped - 4. Cleanallruv task should be successfully executed - 5. Supplier 1 should be stopped - 6. Supplier 3 should be started - 7. No crash should happened - 8. Supplier 1 should be started - 9. No crash should happened - 10. Everything should be cleaned - """ - log.info('Running test_clean_restart...') - - # Disable supplier 4 - log.info('test_clean: disable supplier 4...') - # Remove the agreements from the other suppliers that point to supplier 4 - remove_supplier4_agmts("test_clean", topology_m4) - - # Stop supplier 3 to keep the task running, so we can stop supplier 1... 
- topology_m4.ms["supplier3"].stop() - - # Run the task - log.info('test_clean: run the cleanAllRUV task...') - cruv_task = CleanAllRUVTask(topology_m4.ms["supplier1"]) - cruv_task.create(properties={ - 'replica-id': m4rid, - 'replica-base-dn': DEFAULT_SUFFIX, - 'replica-force-cleaning': 'no', - 'replica-certify-all': 'yes' - }) - - # Sleep a bit, then stop supplier 1 - time.sleep(5) - topology_m4.ms["supplier1"].stop() - - # Now start supplier 3 & 1, and make sure we didn't crash - topology_m4.ms["supplier3"].start() - if topology_m4.ms["supplier3"].detectDisorderlyShutdown(): - log.fatal('test_clean_restart: Supplier 3 previously crashed!') - assert False - - topology_m4.ms["supplier1"].start(timeout=30) - if topology_m4.ms["supplier1"].detectDisorderlyShutdown(): - log.fatal('test_clean_restart: Supplier 1 previously crashed!') - assert False - - # Check the other supplier's RUV for 'replica 4' - log.info('test_clean_restart: check all the suppliers have been cleaned...') - clean = check_ruvs("test_clean_restart", topology_m4, m4rid) - assert clean - - log.info('test_clean_restart PASSED, restoring supplier 4...') - - -@pytest.mark.flaky(max_runs=2, min_passes=1) -def test_clean_force(topology_m4, m4rid): - """Check that multiple tasks with a 'force' option work properly - - :id: f8810dfe-d2d2-4dd9-ba03-5fc14896fabe - :setup: Replication setup with four suppliers - :steps: - 1. Stop supplier 3 - 2. Add a bunch of updates to supplier 4 - 3. Disable replication on supplier 4 - 4. Start supplier 3 - 5. Remove agreements to supplier 4 from other suppliers - 6. Run a cleanallruv task on supplier 1 with a 'force' option 'on' - 7. Check that everything was cleaned - :expectedresults: - 1. Supplier 3 should be stopped - 2. Operation should be successful - 3. Replication on supplier 4 should be disabled - 4. Supplier 3 should be started - 5. Agreements to supplier 4 should be removed - 6. Operation should be successful - 7. Everything should be cleaned - """ - - log.info('Running test_clean_force...') - - # Stop supplier 3, while we update supplier 4, so that 3 is behind the other suppliers - topology_m4.ms["supplier3"].stop() - - # Add a bunch of updates to supplier 4 - m4_add_users = AddUsers(topology_m4.ms["supplier4"], 1500) - m4_add_users.start() - m4_add_users.join() - - # Start supplier 3, it should be out of sync with the other replicas... - topology_m4.ms["supplier3"].start() - - # Remove the agreements from the other suppliers that point to supplier 4 - remove_supplier4_agmts("test_clean_force", topology_m4) - - # Run the task, use "force" because supplier 3 is not in sync with the other replicas - # in regards to the replica 4 RUV - log.info('test_clean: run the cleanAllRUV task...') - cruv_task = CleanAllRUVTask(topology_m4.ms["supplier1"]) - cruv_task.create(properties={ - 'replica-id': m4rid, - 'replica-base-dn': DEFAULT_SUFFIX, - 'replica-force-cleaning': 'yes' - }) - cruv_task.wait() - - # Check the other supplier's RUV for 'replica 4' - log.info('test_clean_force: check all the suppliers have been cleaned...') - clean = check_ruvs("test_clean_force", topology_m4, m4rid) - assert clean - - log.info('test_clean_force PASSED, restoring supplier 4...') - - -@pytest.mark.flaky(max_runs=2, min_passes=1) -def test_abort(topology_m4, m4rid): - """Test the abort task basic functionality - - :id: b09a6887-8de0-4fac-8e41-73ccbaaf7a08 - :setup: Replication setup with four suppliers - :steps: - 1. Disable replication on supplier 4 - 2. 
- 2. Remove agreements to supplier 4 from other suppliers - 3. Stop supplier 2 - 4. Run a cleanallruv task on supplier 1 - 5. Run a cleanallruv abort task on supplier 1 - :expectedresults: No hanging tasks left - 1. Replication on supplier 4 should be disabled - 2. Agreements to supplier 4 should be removed - 3. Supplier 2 should be stopped - 4. Operation should be successful - 5. Operation should be successful - """ - - log.info('Running test_abort...') - # Remove the agreements from the other suppliers that point to supplier 4 - remove_supplier4_agmts("test_abort", topology_m4) - - # Stop supplier 2 - log.info('test_abort: stop supplier 2 to freeze the cleanAllRUV task...') - topology_m4.ms["supplier2"].stop() - - # Run the task - log.info('test_abort: add the cleanAllRUV task...') - cruv_task = CleanAllRUVTask(topology_m4.ms["supplier1"]) - cruv_task.create(properties={ - 'replica-id': m4rid, - 'replica-base-dn': DEFAULT_SUFFIX, - 'replica-force-cleaning': 'no', - 'replica-certify-all': 'yes' - }) - # Wait a bit - time.sleep(2) - - # Abort the task - cruv_task.abort() - - # Check supplier 1 does not have the clean task running - log.info('test_abort: check supplier 1 no longer has a cleanAllRUV task...') - if not task_done(topology_m4, cruv_task.dn): - log.fatal('test_abort: CleanAllRUV task was not aborted') - assert False - - # Start supplier 2 - log.info('test_abort: start supplier 2 to begin the restore process...') - topology_m4.ms["supplier2"].start() - - log.info('test_abort PASSED, restoring supplier 4...') - - -@pytest.mark.flaky(max_runs=2, min_passes=1) -def test_abort_restart(topology_m4, m4rid): - """Test the abort task can handle a restart, and then resume - - :id: b66e33d4-fe85-4e1c-b882-75da80f70ab3 - :setup: Replication setup with four suppliers - :steps: - 1. Disable replication on supplier 4 - 2. Remove agreements to supplier 4 from other suppliers - 3. Stop supplier 3 - 4. Run a cleanallruv task on supplier 1 - 5. Run a cleanallruv abort task on supplier 1 - 6. Restart supplier 1 - 7. Make sure that no crash happened - 8. Start supplier 3 - 9. Check supplier 1 does not have the clean task running - 10. Check that errors log doesn't have 'Aborting abort task' message - :expectedresults: - 1. Replication on supplier 4 should be disabled - 2. Agreements to supplier 4 should be removed - 3. Supplier 3 should be stopped - 4. Operation should be successful - 5. Operation should be successful - 6. Supplier 1 should be restarted - 7. No crash should happened - 8. Supplier 3 should be started - 9. Check supplier 1 shouldn't have the clean task running
- 10. Errors log shouldn't have 'Aborting abort task' message - """ - - log.info('Running test_abort_restart...') - # Remove the agreements from the other suppliers that point to supplier 4 - remove_supplier4_agmts("test_abort", topology_m4) - - # Stop supplier 3 - log.info('test_abort_restart: stop supplier 3 to freeze the cleanAllRUV task...') - topology_m4.ms["supplier3"].stop() - - # Run the task - log.info('test_abort_restart: add the cleanAllRUV task...') - cruv_task = CleanAllRUVTask(topology_m4.ms["supplier1"]) - cruv_task.create(properties={ - 'replica-id': m4rid, - 'replica-base-dn': DEFAULT_SUFFIX, - 'replica-force-cleaning': 'no', - 'replica-certify-all': 'yes' - }) - # Wait a bit - time.sleep(2) - - # Abort the task - cruv_task.abort(certify=True) - - # Check supplier 1 does not have the clean task running - log.info('test_abort_abort: check supplier 1 no longer has a cleanAllRUV task...') - if not task_done(topology_m4, cruv_task.dn): - log.fatal('test_abort_restart: CleanAllRUV task was not aborted') - assert False - - # Now restart supplier 1, and make sure the abort process completes - topology_m4.ms["supplier1"].restart() - if topology_m4.ms["supplier1"].detectDisorderlyShutdown(): - log.fatal('test_abort_restart: Supplier 1 previously crashed!') - assert False - - # Start supplier 3 - topology_m4.ms["supplier3"].start() - - # Need to wait 5 seconds before server processes any leftover tasks - time.sleep(6) - - # Check supplier 1 tried to run abort task. We expect the abort task to be aborted. - if not topology_m4.ms["supplier1"].searchErrorsLog('Aborting abort task'): - log.fatal('test_abort_restart: Abort task did not restart') - assert False - - log.info('test_abort_restart PASSED, restoring supplier 4...') - - -@pytest.mark.flaky(max_runs=2, min_passes=1) -def test_abort_certify(topology_m4, m4rid): - """Test the abort task with a replica-certify-all option - - :id: 78959966-d644-44a8-b98c-1fcf21b45eb0 - :setup: Replication setup with four suppliers - :steps: - 1. Disable replication on supplier 4 - 2. Remove agreements to supplier 4 from other suppliers - 3. Stop supplier 2 - 4. Run a cleanallruv task on supplier 1 - 5. Run a cleanallruv abort task on supplier 1 with a replica-certify-all option - :expectedresults: No hanging tasks left - 1. Replication on supplier 4 should be disabled - 2. Agreements to supplier 4 should be removed - 3. Supplier 2 should be stopped - 4. Operation should be successful
- 5. Operation should be successful - """ - - log.info('Running test_abort_certify...') - - # Remove the agreements from the other suppliers that point to supplier 4 - remove_supplier4_agmts("test_abort_certify", topology_m4) - - # Stop supplier 2 - log.info('test_abort_certify: stop supplier 2 to freeze the cleanAllRUV task...') - topology_m4.ms["supplier2"].stop() - - # Run the task - log.info('test_abort_certify: add the cleanAllRUV task...') - cruv_task = CleanAllRUVTask(topology_m4.ms["supplier1"]) - cruv_task.create(properties={ - 'replica-id': m4rid, - 'replica-base-dn': DEFAULT_SUFFIX, - 'replica-force-cleaning': 'no', - 'replica-certify-all': 'yes' - }) - # Wait a bit - time.sleep(2) - - # Abort the task - log.info('test_abort_certify: abort the cleanAllRUV task...') - abort_task = cruv_task.abort(certify=True) - - # Wait a while and make sure the abort task is still running - log.info('test_abort_certify...') - - if task_done(topology_m4, abort_task.dn, 10): - log.fatal('test_abort_certify: abort task incorrectly finished') - assert False - - # Now start supplier 2 so it can be aborted - log.info('test_abort_certify: start supplier 2 to allow the abort task to finish...') - topology_m4.ms["supplier2"].start() - - # Wait for the abort task to stop - if not task_done(topology_m4, abort_task.dn, 90): - log.fatal('test_abort_certify: The abort CleanAllRUV task was not aborted') - assert False - - # Check supplier 1 does not have the clean task running - log.info('test_abort_certify: check supplier 1 no longer has a cleanAllRUV task...') - if not task_done(topology_m4, cruv_task.dn): - log.fatal('test_abort_certify: CleanAllRUV task was not aborted') - assert False - - log.info('test_abort_certify PASSED, restoring supplier 4...') - - -@pytest.mark.flaky(max_runs=2, min_passes=1) -def test_stress_clean(topology_m4, m4rid): - """Put each server(m1 - m4) under a stress, and perform the entire clean process - - :id: a8263cd6-f068-4357-86e0-e7c34504c8c5 - :setup: Replication setup with four suppliers - :steps: - 1. Add a bunch of updates to all suppliers - 2. Put supplier 4 to read-only mode - 3. Disable replication on supplier 4 - 4. Remove agreements to supplier 4 from other suppliers - 5. Run a cleanallruv task on supplier 1 - 6. Check that everything was cleaned - :expectedresults: - 1. Operation should be successful - 2. Supplier 4 should be put to read-only mode - 3. Replication on supplier 4 should be disabled - 4. Agreements to supplier 4 should be removed - 5. Operation should be successful
- 6. Everything should be cleaned - """ - - log.info('Running test_stress_clean...') - log.info('test_stress_clean: put all the suppliers under load...') - - ldbm_config = LDBMConfig(topology_m4.ms["supplier4"]) - - # not too high load else it takes a long time to converge and - # the test result becomes instable - m1_add_users = AddUsers(topology_m4.ms["supplier1"], 500) - m1_add_users.start() - m2_add_users = AddUsers(topology_m4.ms["supplier2"], 500) - m2_add_users.start() - m3_add_users = AddUsers(topology_m4.ms["supplier3"], 500) - m3_add_users.start() - m4_add_users = AddUsers(topology_m4.ms["supplier4"], 500) - m4_add_users.start() - - # Allow sometime to get replication flowing in all directions - log.info('test_stress_clean: allow some time for replication to get flowing...') - time.sleep(5) - - # Put supplier 4 into read only mode - ldbm_config.set('nsslapd-readonly', 'on') - # We need to wait for supplier 4 to push its changes out - log.info('test_stress_clean: allow some time for supplier 4 to push changes out (60 seconds)...') - time.sleep(30) - - # Remove the agreements from the other suppliers that point to supplier 4 - remove_supplier4_agmts("test_stress_clean", topology_m4) - - # Run the task - cruv_task = CleanAllRUVTask(topology_m4.ms["supplier1"]) - cruv_task.create(properties={ - 'replica-id': m4rid, - 'replica-base-dn': DEFAULT_SUFFIX, - 'replica-force-cleaning': 'no' - }) - cruv_task.wait() - - # Wait for the update to finish - log.info('test_stress_clean: wait for all the updates to finish...') - m1_add_users.join() - m2_add_users.join() - m3_add_users.join() - m4_add_users.join() - - # Check the other supplier's RUV for 'replica 4' - log.info('test_stress_clean: check if all the replicas have been cleaned...') - clean = check_ruvs("test_stress_clean", topology_m4, m4rid) - assert clean - - log.info('test_stress_clean: PASSED, restoring supplier 4...') - - # Sleep for a bit to replication complete - log.info("Sleep for 120 seconds to allow replication to complete...") - repl = ReplicationManager(DEFAULT_SUFFIX) - repl.test_replication_topology([ - topology_m4.ms["supplier1"], - topology_m4.ms["supplier2"], - topology_m4.ms["supplier3"], - ], timeout=120) - - # Turn off readonly mode - ldbm_config.set('nsslapd-readonly', 'off') - - -@pytest.mark.flaky(max_runs=2, min_passes=1) -def test_multiple_tasks_with_force(topology_m4, m4rid): - """Check that multiple tasks with a 'force' option work properly - - :id: eb76a93d-8d1c-405e-9f25-6e8d5a781098 - :setup: Replication setup with four suppliers - :steps: - 1. Stop supplier 3 - 2. Add a bunch of updates to supplier 4 - 3. Disable replication on supplier 4 - 4. Start supplier 3 - 5. Remove agreements to supplier 4 from other suppliers - 6. Run a cleanallruv task on supplier 1 with a 'force' option 'on' - 7. Run one more cleanallruv task on supplier 1 with a 'force' option 'off' - 8. Check that everything was cleaned - :expectedresults: - 1. Supplier 3 should be stopped - 2. Operation should be successful - 3. Replication on supplier 4 should be disabled - 4. Supplier 3 should be started - 5. Agreements to supplier 4 should be removed - 6. Operation should be successful - 7. Operation should be successful
- 8. Everything should be cleaned - """ - - log.info('Running test_multiple_tasks_with_force...') - - # Stop supplier 3, while we update supplier 4, so that 3 is behind the other suppliers - topology_m4.ms["supplier3"].stop() - - # Add a bunch of updates to supplier 4 - m4_add_users = AddUsers(topology_m4.ms["supplier4"], 1500) - m4_add_users.start() - m4_add_users.join() - - # Start supplier 3, it should be out of sync with the other replicas... - topology_m4.ms["supplier3"].start() - - # Disable supplier 4 - # Remove the agreements from the other suppliers that point to supplier 4 - remove_supplier4_agmts("test_multiple_tasks_with_force", topology_m4) - - # Run the task, use "force" because supplier 3 is not in sync with the other replicas - # in regards to the replica 4 RUV - log.info('test_multiple_tasks_with_force: run the cleanAllRUV task with "force" on...') - cruv_task = CleanAllRUVTask(topology_m4.ms["supplier1"]) - cruv_task.create(properties={ - 'replica-id': m4rid, - 'replica-base-dn': DEFAULT_SUFFIX, - 'replica-force-cleaning': 'yes', - 'replica-certify-all': 'no' - }) - - log.info('test_multiple_tasks_with_force: run the cleanAllRUV task with "force" off...') - - # NOTE: This must be try not py.test raises, because the above may or may - # not have completed yet .... - try: - cruv_task_fail = CleanAllRUVTask(topology_m4.ms["supplier1"]) - cruv_task_fail.create(properties={ - 'replica-id': m4rid, - 'replica-base-dn': DEFAULT_SUFFIX, - 'replica-force-cleaning': 'no', - 'replica-certify-all': 'no' - }) - cruv_task_fail.wait() - except ldap.UNWILLING_TO_PERFORM: - pass - # Wait for the force task .... - cruv_task.wait() - - # Check the other supplier's RUV for 'replica 4' - log.info('test_multiple_tasks_with_force: check all the suppliers have been cleaned...') - clean = check_ruvs("test_clean_force", topology_m4, m4rid) - assert clean - # Check supplier 1 does not have the clean task running - log.info('test_abort: check supplier 1 no longer has a cleanAllRUV task...') - if not task_done(topology_m4, cruv_task.dn): - log.fatal('test_abort: CleanAllRUV task was not aborted') - assert False - - -@pytest.mark.bz1466441 -@pytest.mark.ds50370 -def test_clean_shutdown_crash(topology_m2): - """Check that server didn't crash after shutdown when running CleanAllRUV task - - :id: c34d0b40-3c3e-4f53-8656-5e4c2a310aaf - :setup: Replication setup with two suppliers - :steps: - 1. Enable TLS on both suppliers - 2. Reconfigure both agreements to use TLS Client auth - 3. Stop supplier2 - 4. Run the CleanAllRUV task - 5. Restart supplier1 - 6. Check if supplier1 didn't crash - 7. Restart supplier1 again - 8. Check if supplier1 didn't crash - - :expectedresults: - 1. Success - 2. Success - 3. Success - 4. Success - 5. Success - 6. Success - 7. Success
- 8. Success - """ - - m1 = topology_m2.ms["supplier1"] - m2 = topology_m2.ms["supplier2"] - - repl = ReplicationManager(DEFAULT_SUFFIX) - - cm_m1 = CertmapLegacy(m1) - cm_m2 = CertmapLegacy(m2) - - certmaps = cm_m1.list() - certmaps['default']['DNComps'] = None - certmaps['default']['CmapLdapAttr'] = 'nsCertSubjectDN' - - cm_m1.set(certmaps) - cm_m2.set(certmaps) - - log.info('Enabling TLS') - [i.enable_tls() for i in topology_m2] - - log.info('Creating replication dns') - services = ServiceAccounts(m1, DEFAULT_SUFFIX) - repl_m1 = services.get('%s:%s' % (m1.host, m1.sslport)) - repl_m1.set('nsCertSubjectDN', m1.get_server_tls_subject()) - - repl_m2 = services.get('%s:%s' % (m2.host, m2.sslport)) - repl_m2.set('nsCertSubjectDN', m2.get_server_tls_subject()) - - log.info('Changing auth type') - replica_m1 = Replicas(m1).get(DEFAULT_SUFFIX) - agmt_m1 = replica_m1.get_agreements().list()[0] - agmt_m1.replace_many( - ('nsDS5ReplicaBindMethod', 'SSLCLIENTAUTH'), - ('nsDS5ReplicaTransportInfo', 'SSL'), - ('nsDS5ReplicaPort', '%s' % m2.sslport), - ) - - agmt_m1.remove_all('nsDS5ReplicaBindDN') - - replica_m2 = Replicas(m2).get(DEFAULT_SUFFIX) - agmt_m2 = replica_m2.get_agreements().list()[0] - - agmt_m2.replace_many( - ('nsDS5ReplicaBindMethod', 'SSLCLIENTAUTH'), - ('nsDS5ReplicaTransportInfo', 'SSL'), - ('nsDS5ReplicaPort', '%s' % m1.sslport), - ) - agmt_m2.remove_all('nsDS5ReplicaBindDN') - - log.info('Stopping supplier2') - m2.stop() - - log.info('Run the cleanAllRUV task') - cruv_task = CleanAllRUVTask(m1) - cruv_task.create(properties={ - 'replica-id': repl.get_rid(m1), - 'replica-base-dn': DEFAULT_SUFFIX, - 'replica-force-cleaning': 'no', - 'replica-certify-all': 'yes' - }) - - m1.restart() - - log.info('Check if supplier1 crashed') - assert not m1.detectDisorderlyShutdown() - - log.info('Repeat') - m1.restart() - assert not m1.detectDisorderlyShutdown() - if __name__ == '__main__': # Run isolated diff --git a/dirsrvtests/tests/suites/replication/regression_m2_test.py b/dirsrvtests/tests/suites/replication/regression_m2_test.py index bbf9c8486..65c299a0c 100644 --- a/dirsrvtests/tests/suites/replication/regression_m2_test.py +++ b/dirsrvtests/tests/suites/replication/regression_m2_test.py @@ -240,8 +240,12 @@ def test_double_delete(topo_m2, create_entry): log.info('Deleting entry {} from supplier1'.format(create_entry.dn)) topo_m2.ms["supplier1"].delete_s(create_entry.dn) - log.info('Deleting entry {} from supplier2'.format(create_entry.dn)) - topo_m2.ms["supplier2"].delete_s(create_entry.dn) + try: + log.info('Deleting entry {} from supplier2'.format(create_entry.dn)) + topo_m2.ms["supplier2"].delete_s(create_entry.dn) + except ldap.NO_SUCH_OBJECT: + # replication was too fast (DEBUGGING is probably set) + pass repl.enable_to_supplier(m2, [m1]) repl.enable_to_supplier(m1, [m2]) @@ -813,8 +817,9 @@ def test_keepalive_entries(topo_m2): keep_alive_s1 = str(entries[0].data['keepalivetimestamp']) keep_alive_s2 = str(entries[1].data['keepalivetimestamp']) - # Wait for event interval (60 secs) to pass - time.sleep(61) + # Wait for event interval (60 secs) to pass, but first update doesn't + # start until 30 seconds after startup + time.sleep(91) # Check keep alives entries have been updated entries = verify_keepalive_entries(topo_m2, True);
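Why the wait above is 91 seconds: the keep-alive update is first scheduled 30 seconds after startup (START_UPDATE_DELAY) and then repeats every 60 seconds, so the test must allow the startup delay plus one full interval, with a little slack. A minimal sketch of that arithmetic in Python (the constant names are illustrative, not lib389 API):

    # Sketch only: how the 91 second wait in test_keepalive_entries is derived.
    START_UPDATE_DELAY = 30    # seconds before the first keep-alive update fires
    KEEPALIVE_INTERVAL = 60    # repeat interval of the keep-alive event, in seconds
    SAFETY_MARGIN = 1          # slack for event-queue scheduling jitter

    def keepalive_wait_seconds():
        """Shortest safe wait that guarantees at least one keep-alive refresh."""
        return START_UPDATE_DELAY + KEEPALIVE_INTERVAL + SAFETY_MARGIN  # == 91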
diff --git a/dirsrvtests/tests/suites/replication/regression_m2c2_test.py b/dirsrvtests/tests/suites/replication/regression_m2c2_test.py index 97b35c7ab..f9de7383c 100644 --- a/dirsrvtests/tests/suites/replication/regression_m2c2_test.py +++ b/dirsrvtests/tests/suites/replication/regression_m2c2_test.py @@ -289,6 +289,7 @@ def test_csngen_state_not_updated_if_different_uuid(topo_m2c2): log.error(f"c1 csngen state has unexpectedly been synchronized with m2: time skew {c1_timeSkew}") assert False c1.start() + time.sleep(5) # Step 8: Check that c2 has time skew # Stop server to insure that dse.ldif is uptodate diff --git a/ldap/servers/plugins/replication/repl5_replica.c b/ldap/servers/plugins/replication/repl5_replica.c index 5dab57de4..d67f1bc71 100644 --- a/ldap/servers/plugins/replication/repl5_replica.c +++ b/ldap/servers/plugins/replication/repl5_replica.c @@ -239,8 +239,8 @@ replica_new_from_entry(Slapi_Entry *e, char *errortext, PRBool is_add_operation, /* create supplier update event */ if (r->repl_eqcxt_ka_update == NULL && replica_get_type(r) == REPLICA_TYPE_UPDATABLE) { r->repl_eqcxt_ka_update = slapi_eq_repeat_rel(replica_subentry_update, r, - slapi_current_rel_time_t() + 30, - replica_get_keepalive_update_interval(r)); + slapi_current_rel_time_t() + 30, + 1000 * replica_get_keepalive_update_interval(r)); } if (r->tombstone_reap_interval > 0) { @@ -518,7 +518,7 @@ replica_subentry_update(time_t when __attribute__((unused)), void *arg) replica_subentry_check(repl_root, rid); slapi_timestamp_utc_hr(buf, SLAPI_TIMESTAMP_BUFSIZE); - slapi_log_err(SLAPI_LOG_REPL, repl_plugin_name, "replica_subentry_update called at %s\n", buf); + slapi_log_err(SLAPI_LOG_REPL, "NSMMReplicationPlugin", "replica_subentry_update called at %s\n", buf); val.bv_val = buf; val.bv_len = strlen(val.bv_val); vals[0] = &val; @@ -542,7 +542,7 @@ replica_subentry_update(time_t when __attribute__((unused)), void *arg) "Failure (%d) to update replication keep alive entry \"%s: %s\"\n", ldrc, KEEP_ALIVE_ATTR, buf); } else { - slapi_log_err(SLAPI_LOG_PLUGIN, repl_plugin_name, + slapi_log_err(SLAPI_LOG_REPL, "NSMMReplicationPlugin", "replica_subentry_update - " "Successful update of replication keep alive entry \"%s: %s\"\n", KEEP_ALIVE_ATTR, buf); @@ -1536,7 +1536,7 @@ replica_set_enabled(Replica *r, PRBool enable) if (r->repl_eqcxt_ka_update == NULL && replica_get_type(r) == REPLICA_TYPE_UPDATABLE) { r->repl_eqcxt_ka_update = slapi_eq_repeat_rel(replica_subentry_update, r, slapi_current_rel_time_t() + START_UPDATE_DELAY, - replica_get_keepalive_update_interval(r)); + 1000 * replica_get_keepalive_update_interval(r)); } } else /* disable */ { @@ -1546,7 +1546,7 @@ replica_set_enabled(Replica *r, PRBool enable) r->repl_eqcxt_rs = NULL; } /* Remove supplier update event */ - if (replica_get_type(r) == REPLICA_TYPE_PRIMARY) { + if (replica_get_type(r) == REPLICA_TYPE_UPDATABLE) { slapi_eq_cancel_rel(r->repl_eqcxt_ka_update); r->repl_eqcxt_ka_update = NULL; }
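The repl5_replica.c hunks above are the interval fix itself: the event queue behind slapi_eq_repeat_rel() takes its repeat interval in milliseconds, while replica_get_keepalive_update_interval() returns seconds, so the unconverted value made the keep-alive event fire far too often. A hedged Python illustration of the unit conversion (the 60 second value is assumed for the example; the real code is the C shown above):

    # Sketch only: the event queue expects milliseconds, the config is in seconds.
    keepalive_interval_s = 60                      # configured interval, seconds
    old_eq_interval = keepalive_interval_s         # bug: 60 was treated as 60 ms
    new_eq_interval = 1000 * keepalive_interval_s  # fix: 60000 ms == 60 s
    assert new_eq_interval == 60_000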
diff --git a/ldap/servers/plugins/replication/repl_extop.c b/ldap/servers/plugins/replication/repl_extop.c index 70c45ec50..b32d00941 100644 --- a/ldap/servers/plugins/replication/repl_extop.c +++ b/ldap/servers/plugins/replication/repl_extop.c @@ -493,7 +493,7 @@ free_and_return: slapi_log_err(SLAPI_LOG_REPL, repl_plugin_name, "decode_startrepl_extop - decoded csn: %s\n", *csnstr); ruv_dump_to_log(*supplier_ruv, "decode_startrepl_extop"); - for (size_t i = 0; *extra_referrals && *extra_referrals[i]; i++) { + for (size_t i = 0; *extra_referrals && extra_referrals[i]; i++) { slapi_log_err(SLAPI_LOG_REPL, repl_plugin_name, "decode_startrepl_extop - " "decoded referral: %s\n", *extra_referrals[i]); } @@ -1661,7 +1661,7 @@ multimaster_extop_cleanruv(Slapi_PBlock *pb) * Launch the cleanruv monitoring thread. Once all the replicas are cleaned it will release the rid */ - cleanruv_log(NULL, rid, CLEANALLRUV_ID, SLAPI_LOG_ERR, "Launching cleanAllRUV thread..."); + cleanruv_log(NULL, rid, CLEANALLRUV_ID, SLAPI_LOG_INFO, "Launching cleanAllRUV thread..."); data = (cleanruv_data *)slapi_ch_calloc(1, sizeof(cleanruv_data)); if (data == NULL) { slapi_log_err(SLAPI_LOG_ERR, repl_plugin_name, "multimaster_extop_cleanruv - CleanAllRUV Task - Failed to allocate " diff --git a/ldap/servers/slapd/task.c b/ldap/servers/slapd/task.c index 4c7262ab3..71d5a2fb5 100644 --- a/ldap/servers/slapd/task.c +++ b/ldap/servers/slapd/task.c @@ -742,7 +742,7 @@ get_internal_entry(Slapi_PBlock *pb, char *dn) slapi_pblock_get(pb, SLAPI_PLUGIN_INTOP_RESULT, &ret); if (ret != LDAP_SUCCESS) { slapi_log_err(SLAPI_LOG_WARNING, "get_internal_entry", - "Can't find task entry '%s'\n", dn); + "Failed to search for task entry '%s' error: %d\n", dn, ret); return NULL; } @@ -786,9 +786,9 @@ modify_internal_entry(char *dn, LDAPMod **mods) * entry -- try at least 3 times before giving up. */ tries++; - if (tries == 3) { - slapi_log_err(SLAPI_LOG_WARNING, "modify_internal_entry", "Can't modify task " - "entry '%s'; %s (%d)\n", + if (tries == 5) { + slapi_log_err(SLAPI_LOG_WARNING, "modify_internal_entry", + "Can't modify task entry '%s'; %s (%d)\n", dn, ldap_err2string(ret), ret); slapi_pblock_destroy(pb); return; diff --git a/src/lib389/lib389/instance/remove.py b/src/lib389/lib389/instance/remove.py index e96db3896..5668f375b 100644 --- a/src/lib389/lib389/instance/remove.py +++ b/src/lib389/lib389/instance/remove.py @@ -90,6 +90,12 @@ def remove_ds_instance(dirsrv, force=False): # Remove parent (/var/lib/dirsrv/slapd-INST) shutil.rmtree(remove_paths['db_dir'].replace('db', ''), ignore_errors=True) + # Remove /run/slapd-INST.socket + try: + os.remove(f'/run/slapd-{dirsrv.serverid}.socket') + except OSError as e: + _log.debug("Failed to remove socket file: " + str(e)) + # We can not assume we have systemd ... if dirsrv.ds_paths.with_systemd: # Remove the systemd symlink -- 2.37.1
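A note on the task.c hunk: modify_internal_entry() wraps the internal modify in a bounded retry loop, and the patch raises the cap from 3 to 5 attempts before logging a warning and giving up. The shape of that loop, sketched in Python for readability (names are illustrative; the real implementation is the C diff above):

    # Sketch only: bounded retry in the spirit of modify_internal_entry().
    MAX_TRIES = 5  # the patch raises the cap from 3 to 5

    def modify_with_retry(apply_mods, log):
        """apply_mods is a hypothetical callable: None on success, error info otherwise."""
        tries = 0
        while True:
            err = apply_mods()
            if err is None:
                return True
            tries += 1
            if tries == MAX_TRIES:  # give up after the final attempt
                log.warning("Can't modify task entry; giving up: %s", err)
                return False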