From f05f5f20a468efa82d13a99687ac5d3a5d80a3c9 Mon Sep 17 00:00:00 2001 From: tbordaz Date: Tue, 23 Feb 2021 13:42:31 +0100 Subject: [PATCH] Issue 4644 - Large updates can reset the CLcache to the beginning of the changelog (#4647) Bug description: The replication agreements are using bulk load to load updates. For bulk load it uses a cursor with DB_MULTIPLE_KEY and DB_NEXT. Before using the cursor, it must be initialized with DB_SET. If during the cursor/DB_SET the CSN refers to an update that is larger than the size of the provided buffer, then the cursor remains not initialized and c_get returns DB_BUFFER_SMALL. The consequence is that the next c_get(DB_MULTIPLE_KEY and DB_NEXT) will return the first record in the changelog DB. This break CLcache. Fix description: The fix is to harden cursor initialization so that if DB_SET fails because of DB_BUFFER_SMALL. It reallocates buf_data and retries a DB_SET. If DB_SET can not be initialized it logs a warning. The patch also changes the behaviour of the fix #4492. #4492 detected a massive (1day) jump prior the starting csn and ended the replication session. If the jump was systematic, for example if the CLcache got broken because of a too large updates, then replication was systematically stopped. This patch suppress the systematically stop, letting RA doing a big jump. From #4492 only remains the warning. relates: https://github.com/389ds/389-ds-base/issues/4644 Reviewed by: Pierre Rogier (Thanks !!!!) Platforms tested: F31 --- .../servers/plugins/replication/cl5_clcache.c | 68 +++++++++++++++---- 1 file changed, 53 insertions(+), 15 deletions(-) diff --git a/ldap/servers/plugins/replication/cl5_clcache.c b/ldap/servers/plugins/replication/cl5_clcache.c index fcbca047a..90dec4d54 100644 --- a/ldap/servers/plugins/replication/cl5_clcache.c +++ b/ldap/servers/plugins/replication/cl5_clcache.c @@ -370,9 +370,7 @@ clcache_load_buffer(CLC_Buffer *buf, CSN **anchorCSN, int *continue_on_miss, cha } csn_as_string(buf->buf_current_csn, 0, curr); slapi_log_err(loglevel, buf->buf_agmt_name, - "clcache_load_buffer - bulk load cursor (%s) is lower than starting csn %s. Ending session.\n", curr, initial_starting_csn); - /* it just end the session with UPDATE_NO_MORE_UPDATES */ - rc = CLC_STATE_DONE; + "clcache_load_buffer - bulk load cursor (%s) is lower than starting csn %s.\n", curr, initial_starting_csn); } } @@ -413,10 +411,7 @@ clcache_load_buffer(CLC_Buffer *buf, CSN **anchorCSN, int *continue_on_miss, cha } csn_as_string(buf->buf_current_csn, 0, curr); slapi_log_err(loglevel, buf->buf_agmt_name, - "clcache_load_buffer - (DB_SET_RANGE) bulk load cursor (%s) is lower than starting csn %s. Ending session.\n", curr, initial_starting_csn); - rc = DB_NOTFOUND; - - return rc; + "clcache_load_buffer - (DB_SET_RANGE) bulk load cursor (%s) is lower than starting csn %s.\n", curr, initial_starting_csn); } } } @@ -444,6 +439,42 @@ clcache_load_buffer(CLC_Buffer *buf, CSN **anchorCSN, int *continue_on_miss, cha return rc; } +/* Set a cursor to a specific key (buf->buf_key) + * In case buf_data is too small to receive the value, DB_SET fails + * (DB_BUFFER_SMALL). This let the cursor uninitialized that is + * problematic because further cursor DB_NEXT will reset the cursor + * to the beginning of the CL. + * If buf_data is too small, this function reallocates enough space + * + * It returns the return code of cursor->c_get + */ +static int +clcache_cursor_set(DBC *cursor, CLC_Buffer *buf) +{ + int rc; + uint32_t ulen; + uint32_t dlen; + uint32_t size; + + rc = cursor->c_get(cursor, &buf->buf_key, &buf->buf_data, DB_SET); + if (rc == DB_BUFFER_SMALL) { + uint32_t ulen; + + /* Fortunately, buf->buf_data.size has been set by + * c_get() to the actual data size needed. So we can + * reallocate the data buffer and try to set again. + */ + ulen = buf->buf_data.ulen; + buf->buf_data.ulen = (buf->buf_data.size / DEFAULT_CLC_BUFFER_PAGE_SIZE + 1) * DEFAULT_CLC_BUFFER_PAGE_SIZE; + buf->buf_data.data = slapi_ch_realloc(buf->buf_data.data, buf->buf_data.ulen); + slapi_log_err(SLAPI_LOG_REPL, buf->buf_agmt_name, + "clcache_cursor_set - buf data len reallocated %d -> %d bytes (DB_BUFFER_SMALL)\n", + ulen, buf->buf_data.ulen); + rc = cursor->c_get(cursor, &buf->buf_key, &buf->buf_data, DB_SET); + } + return rc; +} + static int clcache_load_buffer_bulk(CLC_Buffer *buf, int flag) { @@ -478,17 +509,24 @@ retry: if (use_flag == DB_NEXT) { /* For bulk read, position the cursor before read the next block */ - rc = cursor->c_get(cursor, - &buf->buf_key, - &buf->buf_data, - DB_SET); + rc = clcache_cursor_set(cursor, buf); } - /* - * Continue if the error is no-mem since we don't need to - * load in the key record anyway with DB_SET. - */ if (0 == rc || DB_BUFFER_SMALL == rc) { + /* + * It should not have failed with DB_BUFFER_SMALL as we tried + * to adjust buf_data in clcache_cursor_set. + * But if it failed with DB_BUFFER_SMALL, there is a risk in clcache_cursor_get + * that the cursor will be reset to the beginning of the changelog. + * Returning an error at this point will stop replication that is + * a risk. So just accept the risk of a reset to the beginning of the CL + * and log an alarming message. + */ + if (rc == DB_BUFFER_SMALL) { + slapi_log_err(SLAPI_LOG_WARNING, buf->buf_agmt_name, + "clcache_load_buffer_bulk - Fail to position on csn=%s from the changelog (too large update ?). Risk of full CL evaluation.\n", + (char *)buf->buf_key.data); + } rc = clcache_cursor_get(cursor, buf, use_flag); } } -- 2.31.1