Blame SOURCES/0043-Ticket-49238-AddressSanitizer-heap-use-after-free-in.patch

6f51e1
From 4182dd8bbff22f9e0e45b763a4619c0bc8dcb153 Mon Sep 17 00:00:00 2001
6f51e1
From: Mark Reynolds <mreynolds@redhat.com>
6f51e1
Date: Tue, 9 May 2017 12:31:58 -0400
6f51e1
Subject: [PATCH] Ticket 49238 - AddressSanitizer: heap-use-after-free in
6f51e1
 libreplication
6f51e1
6f51e1
Bug Description:
6f51e1
        The bug is detected in csn pending list component, when
6f51e1
        accessing a csn that has already been freed.
6f51e1
6f51e1
        The bug is mostly detectable under ASAN because under normal run
6f51e1
        the read access to the csn would only crash if the csn was in
6f51e1
        an unmapped page (that is quite difficult to acheive).
6f51e1
6f51e1
        The bug was observed under the following conditions:
6f51e1
            - very slow machine
6f51e1
            - all instances running on the same machine
6f51e1
6f51e1
        The patch address 2 issues
6f51e1
6f51e1
        Issue - 1
6f51e1
        Under specfic circumstance (failure, like "db_deadlock" during changelog update),
6f51e1
        the csn was freed but still present in the pending list (fix-1).
6f51e1
6f51e1
        Issue - 2
6f51e1
        Further investigations, showed an other corner case where a
6f51e1
        replica could be updated by several suppliers in parallel.
6f51e1
        In such scenario, an update (on one thread-2) with a higher csn (let csn-2)
6f51e1
        may be applied before an update (on another thread-1) with a smaller
6f51e1
        csn (let csn-1).
6f51e1
        csn-2 is freed when thread-2 complete but the csn-2 will remain
6f51e1
        in the pending list until csn-1 is commited.
6f51e1
        so followup of pending list may access a csn that was freed
6f51e1
6f51e1
Fix Description:
6f51e1
        Issue - 1
6f51e1
        The fix in repl5_plugins.c, frees the csn (thread private area)
6f51e1
        at the condition pending list was roll up for that csn (ruv update).
6f51e1
6f51e1
        Issue - 2
6f51e1
        The fix is in two parts:
6f51e1
            If a supplier tries to acquire a replica while it is
6f51e1
        already owner of it, the replica is granted.
6f51e1
6f51e1
            If a supplier owns a replica and is asking again for it,
6f51e1
        but this time the replica is not granted, the replica is release and
6f51e1
        the supplier disconnected.
6f51e1
6f51e1
https://pagure.io/389-ds-base/issue/49238
6f51e1
6f51e1
Reviewed by: Mark Reynolds, Ludwig Krispenz, William Brown (thanks to you all !!)
6f51e1
6f51e1
Platforms tested: 7.4
6f51e1
6f51e1
Flag Day: no
6f51e1
6f51e1
Doc impact: no
6f51e1
---
6f51e1
 ldap/servers/plugins/replication/repl5.h         |  1 +
6f51e1
 ldap/servers/plugins/replication/repl5_plugins.c |  7 +++-
6f51e1
 ldap/servers/plugins/replication/repl5_replica.c | 49 +++++++++++++++++++-----
6f51e1
 ldap/servers/plugins/replication/repl_extop.c    | 42 ++++++++++++++++++--
6f51e1
 4 files changed, 86 insertions(+), 13 deletions(-)
6f51e1
6f51e1
diff --git a/ldap/servers/plugins/replication/repl5.h b/ldap/servers/plugins/replication/repl5.h
6f51e1
index c3bd10c..1d8989c 100644
6f51e1
--- a/ldap/servers/plugins/replication/repl5.h
6f51e1
+++ b/ldap/servers/plugins/replication/repl5.h
6f51e1
@@ -549,6 +549,7 @@ void replica_relinquish_exclusive_access(Replica *r, PRUint64 connid, int opid);
6f51e1
 PRBool replica_get_tombstone_reap_active(const Replica *r);
6f51e1
 const Slapi_DN *replica_get_root(const Replica *r);
6f51e1
 const char *replica_get_name(const Replica *r);
6f51e1
+uint64_t replica_get_locking_conn(const Replica *r);
6f51e1
 ReplicaId replica_get_rid (const Replica *r);
6f51e1
 void replica_set_rid (Replica *r, ReplicaId rid);
6f51e1
 PRBool replica_is_initialized (const Replica *r);
6f51e1
diff --git a/ldap/servers/plugins/replication/repl5_plugins.c b/ldap/servers/plugins/replication/repl5_plugins.c
6f51e1
index ebcc230..9ef06af 100644
6f51e1
--- a/ldap/servers/plugins/replication/repl5_plugins.c
6f51e1
+++ b/ldap/servers/plugins/replication/repl5_plugins.c
6f51e1
@@ -1224,7 +1224,12 @@ common_return:
6f51e1
 	opcsn = operation_get_csn(op);
6f51e1
 	prim_csn = get_thread_primary_csn();
6f51e1
 	if (csn_is_equal(opcsn, prim_csn)) {
6f51e1
-		set_thread_primary_csn(NULL);
6f51e1
+		if (return_value == 0) {
6f51e1
+			/* the primary csn was succesfully committed
6f51e1
+			 * unset it in the thread local data
6f51e1
+			 */
6f51e1
+			set_thread_primary_csn(NULL);
6f51e1
+		}
6f51e1
 	}
6f51e1
 	if (repl_obj) {
6f51e1
 		object_release (repl_obj);
6f51e1
diff --git a/ldap/servers/plugins/replication/repl5_replica.c b/ldap/servers/plugins/replication/repl5_replica.c
6f51e1
index a106f8b..1bdc138 100644
6f51e1
--- a/ldap/servers/plugins/replication/repl5_replica.c
6f51e1
+++ b/ldap/servers/plugins/replication/repl5_replica.c
6f51e1
@@ -64,6 +64,7 @@ struct replica {
6f51e1
 	PRBool state_update_inprogress; /* replica state is being updated */
6f51e1
 	PRLock *agmt_lock;          	/* protects agreement creation, start and stop */
6f51e1
 	char *locking_purl;				/* supplier who has exclusive access */
6f51e1
+	uint64_t locking_conn;         	/* The supplier's connection id */
6f51e1
 	Slapi_Counter *protocol_timeout;/* protocol shutdown timeout */
6f51e1
 	Slapi_Counter *backoff_min;		/* backoff retry minimum */
6f51e1
 	Slapi_Counter *backoff_max;		/* backoff retry maximum */
6f51e1
@@ -602,19 +603,32 @@ replica_get_exclusive_access(Replica *r, PRBool *isInc, PRUint64 connid, int opi
6f51e1
 				slapi_sdn_get_dn(r->repl_root),
6f51e1
 				r->locking_purl ? r->locking_purl : "unknown");
6f51e1
 		rval = PR_FALSE;
6f51e1
+		if (!(r->repl_state_flags & REPLICA_TOTAL_IN_PROGRESS)) {
6f51e1
+			/* inc update */
6f51e1
+			if (r->locking_purl && r->locking_conn == connid) {
6f51e1
+				/* This is the same supplier connection, reset the replica
6f51e1
+				 * purl, and return success */
6f51e1
+				slapi_log_err(SLAPI_LOG_REPL, repl_plugin_name,
6f51e1
+					"replica_get_exclusive_access - "
6f51e1
+					"This is a second acquire attempt from the same replica connection "
6f51e1
+					" - return success instead of busy\n");
6f51e1
+				slapi_ch_free_string(&r->locking_purl);
6f51e1
+				r->locking_purl = slapi_ch_strdup(locking_purl);
6f51e1
+				rval = PR_TRUE;
6f51e1
+				goto done;
6f51e1
+			}
6f51e1
+			if (replica_get_release_timeout(r)) {
6f51e1
+				/*
6f51e1
+				 * Abort the current session so other replicas can acquire
6f51e1
+				 * this server.
6f51e1
+				 */
6f51e1
+				r->abort_session = ABORT_SESSION;
6f51e1
+			}
6f51e1
+		}
6f51e1
 		if (current_purl)
6f51e1
 		{
6f51e1
 			*current_purl = slapi_ch_strdup(r->locking_purl);
6f51e1
 		}
6f51e1
-		if (!(r->repl_state_flags & REPLICA_TOTAL_IN_PROGRESS) &&
6f51e1
-		    replica_get_release_timeout(r))
6f51e1
-		{
6f51e1
-			/*
6f51e1
-			 * We are not doing a total update, so abort the current session
6f51e1
-			 * so other replicas can acquire this server.
6f51e1
-			 */
6f51e1
-			r->abort_session = ABORT_SESSION;
6f51e1
-		}
6f51e1
 	}
6f51e1
 	else
6f51e1
 	{
6f51e1
@@ -642,7 +656,9 @@ replica_get_exclusive_access(Replica *r, PRBool *isInc, PRUint64 connid, int opi
6f51e1
 		}
6f51e1
 		slapi_ch_free_string(&r->locking_purl);
6f51e1
 		r->locking_purl = slapi_ch_strdup(locking_purl);
6f51e1
+		r->locking_conn = connid;
6f51e1
 	}
6f51e1
+done:
6f51e1
 	replica_unlock(r->repl_lock);
6f51e1
 	return rval;
6f51e1
 }
6f51e1
@@ -720,6 +736,18 @@ replica_get_name(const Replica *r) /* ONREPL - should we return copy instead? */
6f51e1
     return(r->repl_name);
6f51e1
 }
6f51e1
 
6f51e1
+/*
6f51e1
+ * Returns locking_conn of this replica
6f51e1
+ */
6f51e1
+uint64_t
6f51e1
+replica_get_locking_conn(const Replica *r)
6f51e1
+{
6f51e1
+	uint64_t connid;
6f51e1
+	replica_lock(r->repl_lock);
6f51e1
+	connid = r->locking_conn;
6f51e1
+	replica_unlock(r->repl_lock);
6f51e1
+	return connid;
6f51e1
+}
6f51e1
 /* 
6f51e1
  * Returns replicaid of this replica 
6f51e1
  */
6f51e1
@@ -2251,6 +2279,9 @@ _replica_init_from_config (Replica *r, Slapi_Entry *e, char *errortext)
6f51e1
     }
6f51e1
 
6f51e1
     r->tombstone_reap_stop = r->tombstone_reap_active = PR_FALSE;
6f51e1
+    
6f51e1
+    /* No supplier holding the replica */
6f51e1
+    r->locking_conn = ULONG_MAX;
6f51e1
 
6f51e1
     return (_replica_check_validity (r));
6f51e1
 }
6f51e1
diff --git a/ldap/servers/plugins/replication/repl_extop.c b/ldap/servers/plugins/replication/repl_extop.c
6f51e1
index 412caec..a39d918 100644
6f51e1
--- a/ldap/servers/plugins/replication/repl_extop.c
6f51e1
+++ b/ldap/servers/plugins/replication/repl_extop.c
6f51e1
@@ -1138,9 +1138,45 @@ send_response:
6f51e1
 		 */
6f51e1
 		if (NULL != connext && NULL != connext->replica_acquired)
6f51e1
 		{
6f51e1
-            Object *r_obj = (Object*)connext->replica_acquired;
6f51e1
-			replica_relinquish_exclusive_access((Replica*)object_get_data (r_obj),
6f51e1
-												connid, opid);
6f51e1
+			Replica *r = (Replica*)object_get_data ((Object*)connext->replica_acquired);
6f51e1
+			uint64_t r_locking_conn;
6f51e1
+			
6f51e1
+			/* At this point the supplier runs a Replica Agreement for 
6f51e1
+			 * the specific replica connext->replica_acquired.
6f51e1
+			 * The RA does not know it holds the replica (because it is
6f51e1
+			 * sending this request).
6f51e1
+			 * The situation is confused
6f51e1
+			 */
6f51e1
+			slapi_log_err(SLAPI_LOG_REPL, repl_plugin_name, "multimaster_extop_StartNSDS50ReplicationRequest - "
6f51e1
+				"already acquired replica: replica not ready (%d) (replica=%s)\n", response, replica_get_name(r) ? replica_get_name(r) : "no name");
6f51e1
+			
6f51e1
+			/*
6f51e1
+			 * On consumer side, we release the exclusive access at the
6f51e1
+			 * condition this is this RA that holds the replica
6f51e1
+			 */
6f51e1
+			if (r) {
6f51e1
+				
6f51e1
+				r_locking_conn = replica_get_locking_conn(r);
6f51e1
+				slapi_log_err(SLAPI_LOG_REPL, repl_plugin_name, "multimaster_extop_StartNSDS50ReplicationRequest - "
6f51e1
+				"already acquired replica: locking_conn=%d, current connid=%d\n", (int) r_locking_conn, (int) connid);
6f51e1
+				
6f51e1
+				if ((r_locking_conn != ULONG_MAX) && (r_locking_conn == connid)) {
6f51e1
+					replica_relinquish_exclusive_access(r, connid, opid);
6f51e1
+					object_release((Object*) connext->replica_acquired);
6f51e1
+					connext->replica_acquired = NULL;
6f51e1
+				}
6f51e1
+			}
6f51e1
+			/*
6f51e1
+			 * On consumer side we should not keep a incoming connection
6f51e1
+			 * with replica_acquired set although the supplier is not aware of
6f51e1
+			 * 
6f51e1
+			 * On the supplier, we need to close the connection so
6f51e1
+			 * that the RA will restart a new session in a clear state 
6f51e1
+			 */
6f51e1
+			slapi_log_err(SLAPI_LOG_REPL, repl_plugin_name, "multimaster_extop_StartNSDS50ReplicationRequest - "
6f51e1
+				"already acquired replica: disconnect conn=%d\n", connid);
6f51e1
+			slapi_disconnect_server(conn);
6f51e1
+            
6f51e1
 		}
6f51e1
 		/* Remove any flags that would indicate repl session in progress */
6f51e1
 		if (NULL != connext)
6f51e1
-- 
6f51e1
2.9.4
6f51e1