|
|
a2f18f |
From cb54fa78fdd5e94f890c3fa1c03481358e3c82ce Mon Sep 17 00:00:00 2001
|
|
|
a2f18f |
From: Mark Reynolds <mreynolds@redhat.com>
|
|
|
a2f18f |
Date: Thu, 9 Jul 2015 09:59:46 -0400
|
|
|
a2f18f |
Subject: [PATCH 13/20] Ticket 48217 - cleanAllRUV hangs shutdown if not all of
|
|
|
a2f18f |
the replicas are online
|
|
|
a2f18f |
|
|
|
a2f18f |
Bug Description: There are race conditions where we might not notify the
|
|
|
a2f18f |
clean task when a shutdown is occurring. This causes the
|
|
|
a2f18f |
task refcount to be not decremented, which hangs the
|
|
|
a2f18f |
destructor function.
|
|
|
a2f18f |
|
|
|
a2f18f |
Fix Description: Check that the server is not shutting down before going
|
|
|
a2f18f |
to sleep, and notify the clean/abort tasks to stop in
|
|
|
a2f18f |
the destructor functions (instead of in the mmr plugin stop
|
|
|
a2f18f |
function).
|
|
|
a2f18f |
|
|
|
a2f18f |
https://fedorahosted.org/389/ticket/48217
|
|
|
a2f18f |
|
|
|
a2f18f |
Reviewed by: lkrispen(Thanks!)
|
|
|
a2f18f |
|
|
|
a2f18f |
(cherry picked from commit d6269f2e6898a187d43e3368860b13cdbd39ec55)
|
|
|
a2f18f |
(cherry picked from commit 0bb881aea92d64e509cf7604e86559779e4f9b77)
|
|
|
a2f18f |
---
|
|
|
a2f18f |
.../plugins/replication/repl5_replica_config.c | 49 ++++++++++++++--------
|
|
|
a2f18f |
1 file changed, 31 insertions(+), 18 deletions(-)
|
|
|
a2f18f |
|
|
|
a2f18f |
diff --git a/ldap/servers/plugins/replication/repl5_replica_config.c b/ldap/servers/plugins/replication/repl5_replica_config.c
|
|
|
a2f18f |
index faa86b8..446da3f 100644
|
|
|
a2f18f |
--- a/ldap/servers/plugins/replication/repl5_replica_config.c
|
|
|
a2f18f |
+++ b/ldap/servers/plugins/replication/repl5_replica_config.c
|
|
|
a2f18f |
@@ -1738,7 +1738,9 @@ replica_cleanallruv_thread(void *arg)
|
|
|
a2f18f |
}
|
|
|
a2f18f |
if (data->task) {
|
|
|
a2f18f |
slapi_task_inc_refcount(data->task);
|
|
|
a2f18f |
- slapi_log_error(SLAPI_LOG_PLUGIN, repl_plugin_name, "replica_cleanallruv_thread --> refcount incremented.\n");
|
|
|
a2f18f |
+ slapi_log_error(SLAPI_LOG_PLUGIN, repl_plugin_name,
|
|
|
a2f18f |
+ "replica_cleanallruv_thread --> refcount incremented (%d).\n",
|
|
|
a2f18f |
+ data->task->task_refcount);
|
|
|
a2f18f |
}
|
|
|
a2f18f |
/*
|
|
|
a2f18f |
* Initialize our settings
|
|
|
a2f18f |
@@ -1871,10 +1873,11 @@ replica_cleanallruv_thread(void *arg)
|
|
|
a2f18f |
*/
|
|
|
a2f18f |
cleanruv_log(data->task, data->rid, CLEANALLRUV_ID, "Not all replicas have received the "
|
|
|
a2f18f |
"cleanallruv extended op, retrying in %d seconds",interval);
|
|
|
a2f18f |
- PR_Lock( notify_lock );
|
|
|
a2f18f |
- PR_WaitCondVar( notify_cvar, PR_SecondsToInterval(interval) );
|
|
|
a2f18f |
- PR_Unlock( notify_lock );
|
|
|
a2f18f |
-
|
|
|
a2f18f |
+ if(!slapi_is_shutting_down()){
|
|
|
a2f18f |
+ PR_Lock( notify_lock );
|
|
|
a2f18f |
+ PR_WaitCondVar( notify_cvar, PR_SecondsToInterval(interval) );
|
|
|
a2f18f |
+ PR_Unlock( notify_lock );
|
|
|
a2f18f |
+ }
|
|
|
a2f18f |
if(interval < 14400){ /* 4 hour max */
|
|
|
a2f18f |
interval = interval * 2;
|
|
|
a2f18f |
} else {
|
|
|
a2f18f |
@@ -1974,6 +1977,7 @@ done:
|
|
|
a2f18f |
if(data->repl_obj && free_obj){
|
|
|
a2f18f |
object_release(data->repl_obj);
|
|
|
a2f18f |
}
|
|
|
a2f18f |
+
|
|
|
a2f18f |
csn_free(&data->maxcsn);
|
|
|
a2f18f |
slapi_sdn_free(&data->sdn);
|
|
|
a2f18f |
slapi_ch_free_string(&data->repl_root);
|
|
|
a2f18f |
@@ -1987,6 +1991,7 @@ replica_cleanall_ruv_destructor(Slapi_Task *task)
|
|
|
a2f18f |
{
|
|
|
a2f18f |
slapi_log_error( SLAPI_LOG_PLUGIN, repl_plugin_name,
|
|
|
a2f18f |
"replica_cleanall_ruv_destructor -->\n" );
|
|
|
a2f18f |
+ stop_ruv_cleaning();
|
|
|
a2f18f |
if (task) {
|
|
|
a2f18f |
while (slapi_task_get_refcount(task) > 0) {
|
|
|
a2f18f |
/* Yield to wait for the fixup task finishes. */
|
|
|
a2f18f |
@@ -2002,6 +2007,7 @@ replica_cleanall_ruv_abort_destructor(Slapi_Task *task)
|
|
|
a2f18f |
{
|
|
|
a2f18f |
slapi_log_error( SLAPI_LOG_PLUGIN, repl_plugin_name,
|
|
|
a2f18f |
"replica_cleanall_ruv_abort_destructor -->\n" );
|
|
|
a2f18f |
+ stop_ruv_cleaning();
|
|
|
a2f18f |
if (task) {
|
|
|
a2f18f |
while (slapi_task_get_refcount(task) > 0) {
|
|
|
a2f18f |
/* Yield to wait for the fixup task finishes. */
|
|
|
a2f18f |
@@ -2055,9 +2061,11 @@ check_replicas_are_done_cleaning(cleanruv_data *data )
|
|
|
a2f18f |
break;
|
|
|
a2f18f |
}
|
|
|
a2f18f |
cleanruv_log(data->task, data->rid, CLEANALLRUV_ID, "Not all replicas finished cleaning, retrying in %d seconds",interval);
|
|
|
a2f18f |
- PR_Lock( notify_lock );
|
|
|
a2f18f |
- PR_WaitCondVar( notify_cvar, PR_SecondsToInterval(interval) );
|
|
|
a2f18f |
- PR_Unlock( notify_lock );
|
|
|
a2f18f |
+ if(!slapi_is_shutting_down()){
|
|
|
a2f18f |
+ PR_Lock( notify_lock );
|
|
|
a2f18f |
+ PR_WaitCondVar( notify_cvar, PR_SecondsToInterval(interval) );
|
|
|
a2f18f |
+ PR_Unlock( notify_lock );
|
|
|
a2f18f |
+ }
|
|
|
a2f18f |
if(interval < 14400){ /* 4 hour max */
|
|
|
a2f18f |
interval = interval * 2;
|
|
|
a2f18f |
} else {
|
|
|
a2f18f |
@@ -2158,9 +2166,11 @@ check_replicas_are_done_aborting(cleanruv_data *data )
|
|
|
a2f18f |
break;
|
|
|
a2f18f |
}
|
|
|
a2f18f |
cleanruv_log(data->task, data->rid, ABORT_CLEANALLRUV_ID, "Not all replicas finished aborting, retrying in %d seconds",interval);
|
|
|
a2f18f |
- PR_Lock( notify_lock );
|
|
|
a2f18f |
- PR_WaitCondVar( notify_cvar, PR_SecondsToInterval(interval) );
|
|
|
a2f18f |
- PR_Unlock( notify_lock );
|
|
|
a2f18f |
+ if(!slapi_is_shutting_down()){
|
|
|
a2f18f |
+ PR_Lock( notify_lock );
|
|
|
a2f18f |
+ PR_WaitCondVar( notify_cvar, PR_SecondsToInterval(interval) );
|
|
|
a2f18f |
+ PR_Unlock( notify_lock );
|
|
|
a2f18f |
+ }
|
|
|
a2f18f |
if(interval < 14400){ /* 4 hour max */
|
|
|
a2f18f |
interval = interval * 2;
|
|
|
a2f18f |
} else {
|
|
|
a2f18f |
@@ -2212,10 +2222,11 @@ check_agmts_are_caught_up(cleanruv_data *data, char *maxcsn)
|
|
|
a2f18f |
}
|
|
|
a2f18f |
cleanruv_log(data->task, data->rid, CLEANALLRUV_ID,
|
|
|
a2f18f |
"Not all replicas caught up, retrying in %d seconds",interval);
|
|
|
a2f18f |
- PR_Lock( notify_lock );
|
|
|
a2f18f |
- PR_WaitCondVar( notify_cvar, PR_SecondsToInterval(interval) );
|
|
|
a2f18f |
- PR_Unlock( notify_lock );
|
|
|
a2f18f |
-
|
|
|
a2f18f |
+ if(!slapi_is_shutting_down()){
|
|
|
a2f18f |
+ PR_Lock( notify_lock );
|
|
|
a2f18f |
+ PR_WaitCondVar( notify_cvar, PR_SecondsToInterval(interval) );
|
|
|
a2f18f |
+ PR_Unlock( notify_lock );
|
|
|
a2f18f |
+ }
|
|
|
a2f18f |
if(interval < 14400){ /* 4 hour max */
|
|
|
a2f18f |
interval = interval * 2;
|
|
|
a2f18f |
} else {
|
|
|
a2f18f |
@@ -2271,10 +2282,12 @@ check_agmts_are_alive(Replica *replica, ReplicaId rid, Slapi_Task *task)
|
|
|
a2f18f |
}
|
|
|
a2f18f |
cleanruv_log(task, rid, CLEANALLRUV_ID, "Not all replicas online, retrying in %d seconds...",
|
|
|
a2f18f |
interval);
|
|
|
a2f18f |
- PR_Lock( notify_lock );
|
|
|
a2f18f |
- PR_WaitCondVar( notify_cvar, PR_SecondsToInterval(interval) );
|
|
|
a2f18f |
- PR_Unlock( notify_lock );
|
|
|
a2f18f |
|
|
|
a2f18f |
+ if(!slapi_is_shutting_down()){
|
|
|
a2f18f |
+ PR_Lock( notify_lock );
|
|
|
a2f18f |
+ PR_WaitCondVar( notify_cvar, PR_SecondsToInterval(interval) );
|
|
|
a2f18f |
+ PR_Unlock( notify_lock );
|
|
|
a2f18f |
+ }
|
|
|
a2f18f |
if(interval < 14400){ /* 4 hour max */
|
|
|
a2f18f |
interval = interval * 2;
|
|
|
a2f18f |
} else {
|
|
|
a2f18f |
--
|
|
|
a2f18f |
1.9.3
|
|
|
a2f18f |
|