|
|
dc8c34 |
From 3712b35b7a61154be53649341885f49dce2820f4 Mon Sep 17 00:00:00 2001
|
|
|
dc8c34 |
From: Rich Megginson <rmeggins@redhat.com>
|
|
|
dc8c34 |
Date: Wed, 26 Jun 2013 13:35:39 -0600
|
|
|
dc8c34 |
Subject: [PATCH 75/99] Ticket #47410 - changelog db deadlocks with DNA and
|
|
|
dc8c34 |
replication
|
|
|
dc8c34 |
|
|
|
dc8c34 |
https://fedorahosted.org/389/ticket/47410
|
|
|
dc8c34 |
Reviewed by: mreynolds (Thanks!)
|
|
|
dc8c34 |
Branch: 389-ds-base-1.2.11
|
|
|
dc8c34 |
Fix Description: The deadlock is caused by having an outer and an inner
|
|
|
dc8c34 |
transaction in one thread, and a replication reader in another thread. The
|
|
|
dc8c34 |
outer transaction acquires a write lock on certain changelog db (cldb) pages
|
|
|
dc8c34 |
as a result of a previous nested transaction (e.g. a DNA shared config
|
|
|
dc8c34 |
area update). The changelog reader in the cursor positioning operation
|
|
|
dc8c34 |
acquires read locks on certain other pages. When another inner write
|
|
|
dc8c34 |
transaction occurs, it may attempt to acquire a write lock on a page held
|
|
|
dc8c34 |
by a read lock in the reader thread. This will eventually fail because
|
|
|
dc8c34 |
the reader will not release its lock on the page until the outer transaction
|
|
|
dc8c34 |
releases the write lock on the page.
|
|
|
dc8c34 |
The solution is to change the way the deadlock detection thread works, to
|
|
|
dc8c34 |
use a different deadlock rejection policy. When using DB_LOCK_MINWRITE
|
|
|
dc8c34 |
instead of the default DB_LOCK_YOUNGEST, the reader thread lock request is
|
|
|
dc8c34 |
rejected. This means the code that positions the changelog cursor has to be
|
|
|
dc8c34 |
able to handle a DB_LOCK_DEADLOCK return.
|
|
|
dc8c34 |
Changing the deadlock rejection policy globally to DB_LOCK_MINWRITE has the
|
|
|
dc8c34 |
potential to cause any search to get a DB_LOCK_DEADLOCK from a db or cursor
|
|
|
dc8c34 |
get(), so this will need to be tested a great deal to make sure we can handle
|
|
|
dc8c34 |
all such cases.
|
|
|
dc8c34 |
Platforms tested: RHEL6 x86_64
|
|
|
dc8c34 |
Flag Day: no
|
|
|
dc8c34 |
Doc impact: no
|
|
|
dc8c34 |
(cherry picked from commit b573d80d9c3acc6dba1bd60bdf7bf3fe4f4168df)
|
|
|
dc8c34 |
(cherry picked from commit 2e1d6331a294378650bef9af8a5f24bd338ae01e)
|
|
|
dc8c34 |
(cherry picked from commit 1987727066e6248cac978139779193d0f56ff9ce)
|
|
|
dc8c34 |
(cherry picked from commit 489de18b135c4019d93a3cf89a108b2c3c4cc3d9)
|
|
|
dc8c34 |
---
|
|
|
dc8c34 |
ldap/servers/plugins/replication/cl5.h | 2 ++
|
|
|
dc8c34 |
ldap/servers/plugins/replication/cl5_api.c | 1 -
|
|
|
dc8c34 |
ldap/servers/plugins/replication/cl5_clcache.c | 20 +++++++++++++++++++-
|
|
|
dc8c34 |
3 files changed, 21 insertions(+), 2 deletions(-)
|
|
|
dc8c34 |
|
|
|
dc8c34 |
diff --git a/ldap/servers/plugins/replication/cl5.h b/ldap/servers/plugins/replication/cl5.h
|
|
|
dc8c34 |
index 4c92ecd..33f8140 100644
|
|
|
dc8c34 |
--- a/ldap/servers/plugins/replication/cl5.h
|
|
|
dc8c34 |
+++ b/ldap/servers/plugins/replication/cl5.h
|
|
|
dc8c34 |
@@ -73,4 +73,6 @@ void changelog5_config_done (changelog5Config *config);
|
|
|
dc8c34 |
/* frees the content and the config structure */
|
|
|
dc8c34 |
void changelog5_config_free (changelog5Config **config);
|
|
|
dc8c34 |
|
|
|
dc8c34 |
+#define MAX_TRIALS 50 /* number of retries on db operations */
|
|
|
dc8c34 |
+
|
|
|
dc8c34 |
#endif
|
|
|
dc8c34 |
diff --git a/ldap/servers/plugins/replication/cl5_api.c b/ldap/servers/plugins/replication/cl5_api.c
|
|
|
dc8c34 |
index 175eb80..e24cead 100644
|
|
|
dc8c34 |
--- a/ldap/servers/plugins/replication/cl5_api.c
|
|
|
dc8c34 |
+++ b/ldap/servers/plugins/replication/cl5_api.c
|
|
|
dc8c34 |
@@ -67,7 +67,6 @@
|
|
|
dc8c34 |
|
|
|
dc8c34 |
#define GUARDIAN_FILE "guardian" /* name of the guardian file */
|
|
|
dc8c34 |
#define VERSION_FILE "DBVERSION" /* name of the version file */
|
|
|
dc8c34 |
-#define MAX_TRIALS 50 /* number of retries on db operations */
|
|
|
dc8c34 |
#define V_5 5 /* changelog entry version */
|
|
|
dc8c34 |
#define CHUNK_SIZE 64*1024
|
|
|
dc8c34 |
#define DBID_SIZE 64
|
|
|
dc8c34 |
diff --git a/ldap/servers/plugins/replication/cl5_clcache.c b/ldap/servers/plugins/replication/cl5_clcache.c
|
|
|
dc8c34 |
index 202cb64..5329b4b 100644
|
|
|
dc8c34 |
--- a/ldap/servers/plugins/replication/cl5_clcache.c
|
|
|
dc8c34 |
+++ b/ldap/servers/plugins/replication/cl5_clcache.c
|
|
|
dc8c34 |
@@ -380,6 +380,7 @@ clcache_load_buffer_bulk ( CLC_Buffer *buf, int flag )
|
|
|
dc8c34 |
DB_TXN *txn = NULL;
|
|
|
dc8c34 |
DBC *cursor = NULL;
|
|
|
dc8c34 |
int rc = 0;
|
|
|
dc8c34 |
+ int tries = 0;
|
|
|
dc8c34 |
|
|
|
dc8c34 |
#if 0 /* txn control seems not improving anything so turn it off */
|
|
|
dc8c34 |
if ( *(_pool->pl_dbenv) ) {
|
|
|
dc8c34 |
@@ -401,6 +402,7 @@ clcache_load_buffer_bulk ( CLC_Buffer *buf, int flag )
|
|
|
dc8c34 |
}
|
|
|
dc8c34 |
|
|
|
dc8c34 |
PR_Lock ( buf->buf_busy_list->bl_lock );
|
|
|
dc8c34 |
+retry:
|
|
|
dc8c34 |
if ( 0 == ( rc = clcache_open_cursor ( txn, buf, &cursor )) ) {
|
|
|
dc8c34 |
|
|
|
dc8c34 |
if ( flag == DB_NEXT ) {
|
|
|
dc8c34 |
@@ -422,10 +424,26 @@ clcache_load_buffer_bulk ( CLC_Buffer *buf, int flag )
|
|
|
dc8c34 |
|
|
|
dc8c34 |
/*
|
|
|
dc8c34 |
* Don't keep a cursor open across the whole replication session.
|
|
|
dc8c34 |
- * That had caused noticable DB resource contention.
|
|
|
dc8c34 |
+ * That had caused noticeable DB resource contention.
|
|
|
dc8c34 |
*/
|
|
|
dc8c34 |
if ( cursor ) {
|
|
|
dc8c34 |
cursor->c_close ( cursor );
|
|
|
dc8c34 |
+ cursor = NULL;
|
|
|
dc8c34 |
+ }
|
|
|
dc8c34 |
+ if ((rc == DB_LOCK_DEADLOCK) && (tries < MAX_TRIALS)) {
|
|
|
dc8c34 |
+ PRIntervalTime interval;
|
|
|
dc8c34 |
+
|
|
|
dc8c34 |
+ tries++;
|
|
|
dc8c34 |
+ slapi_log_error ( SLAPI_LOG_TRACE, "clcache_load_buffer_bulk",
|
|
|
dc8c34 |
+ "deadlock number [%d] - retrying\n", tries );
|
|
|
dc8c34 |
+ /* back off */
|
|
|
dc8c34 |
+ interval = PR_MillisecondsToInterval(slapi_rand() % 100);
|
|
|
dc8c34 |
+ DS_Sleep(interval);
|
|
|
dc8c34 |
+ goto retry;
|
|
|
dc8c34 |
+ }
|
|
|
dc8c34 |
+ if ((rc == DB_LOCK_DEADLOCK) && (tries >= MAX_TRIALS)) {
|
|
|
dc8c34 |
+ slapi_log_error ( SLAPI_LOG_REPL, "clcache_load_buffer_bulk",
|
|
|
dc8c34 |
+ "could not load buffer from changelog after %d tries\n", tries );
|
|
|
dc8c34 |
}
|
|
|
dc8c34 |
|
|
|
dc8c34 |
#if 0 /* txn control seems not improving anything so turn it off */
|
|
|
dc8c34 |
--
|
|
|
dc8c34 |
1.8.1.4
|
|
|
dc8c34 |
|