From e98e41731051b7bf4a443b51a9d3563fc1853773 Mon Sep 17 00:00:00 2001
From: Rich Megginson <rmeggins@redhat.com>
Date: Wed, 6 Nov 2013 14:22:31 -0700
Subject: [PATCH 47/49] Ticket #47585 Replication Failures related to skipped
entries due to cleaned rids
https://fedorahosted.org/389/ticket/47585
Reviewed by: nhosoi (Thanks!)
Branch: 389-ds-base-1.3.1
Fix Description: If a change found in the changelog buffer was skipped
because it had an unknown replica ID (rid), the entire buffer was marked as
CLC_STATE_NEW_RID. When the buffer was exhausted and the iterator code went
to read in the next buffer, it would not do so, because it only loads a new
buffer if the current buffer state is CLC_STATE_READY. I don't know why the
entire buffer would be marked as CLC_STATE_NEW_RID and stop iteration. It
seems to me that just the update should be skipped, but new buffers should
be loaded in order to keep sending non-skipped updates to the consumer.
It is possible for a CSN with an unknown RID to get into the changelog if
the server with that RID had been removed by cleanruv/cleanallruv. In that
case, the CSN should be skipped. It is assumed that the change was already
sent - cleanallruv is supposed to wait until all known changes have been
seen before removing the RID from the RUV - so it is safe to skip it.
Added additional debugging, so that we can better tell why changelog entries
were skipped.
Platforms tested: RHEL6 x86_64
Flag Day: no
Doc impact: no
(cherry picked from commit cf08f1274404e4796966011a98a6a0acbbfd6070)
(cherry picked from commit 30bb98fb693ea1aac9774bdc43b923eacd72570a)
(cherry picked from commit fc70e4ac6accaa14d140e333829e98897f6ff164)
---
ldap/servers/plugins/replication/cl5_clcache.c | 48 ++++++++++++++++++++++----
1 file changed, 42 insertions(+), 6 deletions(-)
diff --git a/ldap/servers/plugins/replication/cl5_clcache.c b/ldap/servers/plugins/replication/cl5_clcache.c
index 7a6a446..8218312 100644
--- a/ldap/servers/plugins/replication/cl5_clcache.c
+++ b/ldap/servers/plugins/replication/cl5_clcache.c
@@ -120,6 +120,11 @@ struct clc_buffer {
int buf_load_cnt; /* number of loads for session */
int buf_record_cnt; /* number of changes for session */
int buf_record_skipped; /* number of changes skipped */
+ int buf_skipped_new_rid; /* number of changes skipped due to new_rid */
+ int buf_skipped_csn_gt_cons_maxcsn; /* number of changes skipped due to csn less than or equal to consumer maxcsn */
+ int buf_skipped_up_to_date; /* number of changes skipped due to consumer being up-to-date for the given rid */
+ int buf_skipped_csn_gt_ruv; /* number of changes skipped because their precedents are not covered by local RUV snapshot */
+ int buf_skipped_csn_covered; /* number of changes skipped due to CSNs already covered by consumer RUV */
/*
* fields that should be accessed via bl_lock or pl_lock
@@ -252,6 +257,11 @@ clcache_get_buffer ( CLC_Buffer **buf, DB *db, ReplicaId consumer_rid, const RUV
(*buf)->buf_record_skipped = 0;
(*buf)->buf_cursor = NULL;
(*buf)->buf_num_cscbs = 0;
+ (*buf)->buf_skipped_new_rid = 0;
+ (*buf)->buf_skipped_csn_gt_cons_maxcsn = 0;
+ (*buf)->buf_skipped_up_to_date = 0;
+ (*buf)->buf_skipped_csn_gt_ruv = 0;
+ (*buf)->buf_skipped_csn_covered = 0;
}
else {
*buf = clcache_new_buffer ( consumer_rid );
@@ -287,11 +297,16 @@ clcache_return_buffer ( CLC_Buffer **buf )
int i;
slapi_log_error ( SLAPI_LOG_REPL, (*buf)->buf_agmt_name,
- "session end: state=%d load=%d sent=%d skipped=%d\n",
- (*buf)->buf_state,
- (*buf)->buf_load_cnt,
- (*buf)->buf_record_cnt - (*buf)->buf_record_skipped,
- (*buf)->buf_record_skipped );
+ "session end: state=%d load=%d sent=%d skipped=%d skipped_new_rid=%d "
+ "skipped_csn_gt_cons_maxcsn=%d skipped_up_to_date=%d "
+ "skipped_csn_gt_ruv=%d skipped_csn_covered=%d\n",
+ (*buf)->buf_state,
+ (*buf)->buf_load_cnt,
+ (*buf)->buf_record_cnt - (*buf)->buf_record_skipped,
+ (*buf)->buf_record_skipped, (*buf)->buf_skipped_new_rid,
+ (*buf)->buf_skipped_csn_gt_cons_maxcsn,
+ (*buf)->buf_skipped_up_to_date, (*buf)->buf_skipped_csn_gt_ruv,
+ (*buf)->buf_skipped_csn_covered);
for ( i = 0; i < (*buf)->buf_num_cscbs; i++ ) {
clcache_free_cscb ( &(*buf)->buf_cscbs[i] );
@@ -676,6 +691,8 @@ clcache_skip_change ( CLC_Buffer *buf )
ReplicaId rid;
int skip = 1;
int i;
+ char buf_cur_csn_str[CSN_STRSIZE];
+ char oth_csn_str[CSN_STRSIZE];
do {
@@ -697,6 +714,14 @@ clcache_skip_change ( CLC_Buffer *buf )
* The consumer must have been "restored" and needs this newer update.
*/
skip = 0;
+ } else if (slapi_is_loglevel_set(SLAPI_LOG_REPL)) {
+ csn_as_string(buf->buf_current_csn, 0, buf_cur_csn_str);
+ csn_as_string(cons_maxcsn, 0, oth_csn_str);
+ slapi_log_error(SLAPI_LOG_REPL, buf->buf_agmt_name,
+ "Skipping update because the changelog buffer current csn [%s] is "
+ "less than or equal to the consumer max csn [%s]\n",
+ buf_cur_csn_str, oth_csn_str);
+ buf->buf_skipped_csn_gt_cons_maxcsn++;
}
csn_free(&cons_maxcsn);
break;
@@ -714,7 +739,14 @@ clcache_skip_change ( CLC_Buffer *buf )
/* Skip CSN whose RID is unknown to the local RUV snapshot */
if ( i >= buf->buf_num_cscbs ) {
- buf->buf_state = CLC_STATE_NEW_RID;
+ if (slapi_is_loglevel_set(SLAPI_LOG_REPL)) {
+ csn_as_string(buf->buf_current_csn, 0, buf_cur_csn_str);
+ slapi_log_error(SLAPI_LOG_REPL, buf->buf_agmt_name,
+ "Skipping update because the changelog buffer current csn [%s] rid "
+ "[%d] is not in the list of changelog csn buffers (length %d)\n",
+ buf_cur_csn_str, rid, buf->buf_num_cscbs);
+ }
+ buf->buf_skipped_new_rid++;
break;
}
@@ -722,17 +754,20 @@ clcache_skip_change ( CLC_Buffer *buf )
/* Skip if the consumer is already up-to-date for the RID */
if ( cscb->state == CLC_STATE_UP_TO_DATE ) {
+ buf->buf_skipped_up_to_date++;
break;
}
/* Skip CSN whose preceedents are not covered by local RUV snapshot */
if ( cscb->state == CLC_STATE_CSN_GT_RUV ) {
+ buf->buf_skipped_csn_gt_ruv++;
break;
}
/* Skip CSNs already covered by consumer RUV */
if ( cscb->consumer_maxcsn &&
csn_compare ( buf->buf_current_csn, cscb->consumer_maxcsn ) <= 0 ) {
+ buf->buf_skipped_csn_covered++;
break;
}
@@ -762,6 +797,7 @@ clcache_skip_change ( CLC_Buffer *buf )
/* Skip CSNs not covered by local RUV snapshot */
cscb->state = CLC_STATE_CSN_GT_RUV;
+ buf->buf_skipped_csn_gt_ruv++;
} while (0);
--
1.8.1.4