c2d2c5
From 05bf4fa9056d85782dad528bba3d0fa90946d4d8 Mon Sep 17 00:00:00 2001
c2d2c5
From: Petr Mensik <pemensik@redhat.com>
c2d2c5
Date: Wed, 24 Apr 2019 20:09:07 +0200
c2d2c5
Subject: [PATCH] 5200. [security] tcp-clients settings could be exceeded in
c2d2c5
 some cases, which could lead to exhaustion of file descriptors.
c2d2c5
 (CVE-2018-5743) [GL #615]
c2d2c5
---
c2d2c5
 bin/named/client.c                     | 428 +++++++++++++++++++------
c2d2c5
 bin/named/include/named/client.h       |  13 +-
c2d2c5
 bin/named/include/named/interfacemgr.h |  13 +-
c2d2c5
 bin/named/interfacemgr.c               |   9 +-
c2d2c5
 lib/isc/include/isc/quota.h            |   7 +
c2d2c5
 lib/isc/quota.c                        |  33 +-
c2d2c5
 6 files changed, 389 insertions(+), 114 deletions(-)
c2d2c5
c2d2c5
diff --git a/bin/named/client.c b/bin/named/client.c
c2d2c5
index f014b61..0ba383b 100644
c2d2c5
--- a/bin/named/client.c
c2d2c5
+++ b/bin/named/client.c
c2d2c5
@@ -234,7 +234,9 @@ static void ns_client_dumpmessage(ns_client_t *client, const char *reason);
c2d2c5
 static isc_result_t get_client(ns_clientmgr_t *manager, ns_interface_t *ifp,
c2d2c5
 			       dns_dispatch_t *disp, isc_boolean_t tcp);
c2d2c5
 static isc_result_t get_worker(ns_clientmgr_t *manager, ns_interface_t *ifp,
c2d2c5
-			       isc_socket_t *socket);
c2d2c5
+			       isc_socket_t *sock, ns_client_t *oldclient);
c2d2c5
+static inline isc_boolean_t
c2d2c5
+allowed(isc_netaddr_t *addr, dns_name_t *signer, dns_acl_t *acl);
c2d2c5
 
c2d2c5
 void
c2d2c5
 ns_client_recursing(ns_client_t *client) {
c2d2c5
@@ -280,6 +282,119 @@ ns_client_settimeout(ns_client_t *client, unsigned int seconds) {
c2d2c5
 	}
c2d2c5
 }
c2d2c5
 
c2d2c5
+/*%
c2d2c5
+ * Allocate a reference-counted object that will maintain a single pointer to
c2d2c5
+ * the (also reference-counted) TCP client quota, shared between all the
c2d2c5
+ * clients processing queries on a single TCP connection, so that all
c2d2c5
+ * clients sharing the one socket will together consume only one slot in
c2d2c5
+ * the 'tcp-clients' quota.
c2d2c5
+ */
c2d2c5
+static isc_result_t
c2d2c5
+tcpconn_init(ns_client_t *client, isc_boolean_t force) {
c2d2c5
+	isc_result_t result;
c2d2c5
+	isc_quota_t *quota = NULL;
c2d2c5
+	ns_tcpconn_t *tconn = NULL;
c2d2c5
+
c2d2c5
+	REQUIRE(client->tcpconn == NULL);
c2d2c5
+
c2d2c5
+	/*
c2d2c5
+	 * Try to attach to the quota first, so we won't pointlessly
c2d2c5
+	 * allocate memory for a tcpconn object if we can't get one.
c2d2c5
+	 */
c2d2c5
+	if (force) {
c2d2c5
+		result = isc_quota_force(&ns_g_server->tcpquota, &quota);
c2d2c5
+	} else {
c2d2c5
+		result = isc_quota_attach(&ns_g_server->tcpquota, &quota);
c2d2c5
+	}
c2d2c5
+	if (result != ISC_R_SUCCESS) {
c2d2c5
+		return (result);
c2d2c5
+	}
c2d2c5
+
c2d2c5
+	/*
c2d2c5
+	 * A global memory context is used for the allocation as different
c2d2c5
+	 * client structures may have different memory contexts assigned and a
c2d2c5
+	 * reference counter allocated here might need to be freed by a
c2d2c5
+	 * different client.  The performance impact caused by memory context
c2d2c5
+	 * contention here is expected to be negligible, given that this code
c2d2c5
+	 * is only executed for TCP connections.
c2d2c5
+	 */
c2d2c5
+	tconn = isc_mem_allocate(ns_g_mctx, sizeof(*tconn));
c2d2c5
+
c2d2c5
+	isc_refcount_init(&tconn->refs, 1);
c2d2c5
+	tconn->tcpquota = quota;
c2d2c5
+	quota = NULL;
c2d2c5
+	tconn->pipelined = ISC_FALSE;
c2d2c5
+
c2d2c5
+	client->tcpconn = tconn;
c2d2c5
+
c2d2c5
+	return (ISC_R_SUCCESS);
c2d2c5
+}
c2d2c5
+
c2d2c5
+/*%
c2d2c5
+ * Increase the count of client structures sharing the TCP connection
c2d2c5
+ * that 'source' is associated with; add a pointer to the same tcpconn
c2d2c5
+ * to 'target', thus associating it with the same TCP connection.
c2d2c5
+ */
c2d2c5
+static void
c2d2c5
+tcpconn_attach(ns_client_t *source, ns_client_t *target) {
c2d2c5
+	int refs;
c2d2c5
+
c2d2c5
+	REQUIRE(source->tcpconn != NULL);
c2d2c5
+	REQUIRE(target->tcpconn == NULL);
c2d2c5
+	REQUIRE(source->tcpconn->pipelined);
c2d2c5
+
c2d2c5
+	isc_refcount_increment(&source->tcpconn->refs, &refs);
c2d2c5
+	INSIST(refs > 1);
c2d2c5
+	target->tcpconn = source->tcpconn;
c2d2c5
+}
c2d2c5
+
c2d2c5
+/*%
c2d2c5
+ * Decrease the count of client structures sharing the TCP connection that
c2d2c5
+ * 'client' is associated with.  If this is the last client using this TCP
c2d2c5
+ * connection, we detach from the TCP quota and free the tcpconn
c2d2c5
+ * object. Either way, client->tcpconn is set to NULL.
c2d2c5
+ */
c2d2c5
+static void
c2d2c5
+tcpconn_detach(ns_client_t *client) {
c2d2c5
+	ns_tcpconn_t *tconn = NULL;
c2d2c5
+	int refs;
c2d2c5
+
c2d2c5
+	REQUIRE(client->tcpconn != NULL);
c2d2c5
+
c2d2c5
+	tconn = client->tcpconn;
c2d2c5
+	client->tcpconn = NULL;
c2d2c5
+
c2d2c5
+	isc_refcount_decrement(&tconn->refs, &refs);
c2d2c5
+	if (refs == 0) {
c2d2c5
+		isc_quota_detach(&tconn->tcpquota);
c2d2c5
+		isc_mem_free(ns_g_mctx, tconn);
c2d2c5
+	}
c2d2c5
+}
c2d2c5
+
c2d2c5
+/*%
c2d2c5
+ * Mark a client as active and increment the interface's 'ntcpactive'
c2d2c5
+ * counter, as a signal that there is at least one client servicing
c2d2c5
+ * TCP queries for the interface. If we reach the TCP client quota at
c2d2c5
+ * some point, this will be used to determine whether a quota overrun
c2d2c5
+ * should be permitted.
c2d2c5
+ *
c2d2c5
+ * Marking the client active with the 'tcpactive' flag ensures proper
c2d2c5
+ * accounting, by preventing us from incrementing or decrementing
c2d2c5
+ * 'ntcpactive' more than once per client.
c2d2c5
+ */
c2d2c5
+static void
c2d2c5
+mark_tcp_active(ns_client_t *client, isc_boolean_t active) {
c2d2c5
+	if (active && !client->tcpactive) {
c2d2c5
+		isc_atomic_xadd(&client->interface->ntcpactive, 1);
c2d2c5
+		client->tcpactive = active;
c2d2c5
+	} else if (!active && client->tcpactive) {
c2d2c5
+		uint32_t old =
c2d2c5
+			isc_atomic_xadd(&client->interface->ntcpactive, -1);
c2d2c5
+		INSIST(old > 0);
c2d2c5
+		client->tcpactive = active;
c2d2c5
+	}
c2d2c5
+}
c2d2c5
+
c2d2c5
 /*%
c2d2c5
  * Check for a deactivation or shutdown request and take appropriate
c2d2c5
  * action.  Returns ISC_TRUE if either is in progress; in this case
c2d2c5
@@ -369,7 +484,8 @@ exit_check(ns_client_t *client) {
c2d2c5
 		INSIST(client->recursionquota == NULL);
c2d2c5
 
c2d2c5
 		if (NS_CLIENTSTATE_READING == client->newstate) {
c2d2c5
-			if (!client->pipelined) {
c2d2c5
+			INSIST(client->tcpconn != NULL);
c2d2c5
+			if (!client->tcpconn->pipelined) {
c2d2c5
 				client_read(client);
c2d2c5
 				client->newstate = NS_CLIENTSTATE_MAX;
c2d2c5
 				return (ISC_TRUE); /* We're done. */
c2d2c5
@@ -386,10 +502,13 @@ exit_check(ns_client_t *client) {
c2d2c5
 		 */
c2d2c5
 		INSIST(client->recursionquota == NULL);
c2d2c5
 		INSIST(client->newstate <= NS_CLIENTSTATE_READY);
c2d2c5
-		if (client->nreads > 0)
c2d2c5
+
c2d2c5
+		if (client->nreads > 0) {
c2d2c5
 			dns_tcpmsg_cancelread(&client->tcpmsg);
c2d2c5
-		if (! client->nreads == 0) {
c2d2c5
-			/* Still waiting for read cancel completion. */
c2d2c5
+		}
c2d2c5
+
c2d2c5
+		/* Still waiting for read cancel completion. */
c2d2c5
+		if (client->nreads > 0) {
c2d2c5
 			return (ISC_TRUE);
c2d2c5
 		}
c2d2c5
 
c2d2c5
@@ -397,14 +516,49 @@ exit_check(ns_client_t *client) {
c2d2c5
 			dns_tcpmsg_invalidate(&client->tcpmsg);
c2d2c5
 			client->tcpmsg_valid = ISC_FALSE;
c2d2c5
 		}
c2d2c5
+
c2d2c5
+		/*
c2d2c5
+		 * Soon the client will be ready to accept a new TCP
c2d2c5
+		 * connection or UDP request, but we may have enough
c2d2c5
+		 * clients doing that already.  Check whether this client
c2d2c5
+		 * needs to remain active and allow it to go inactive if
c2d2c5
+		 * not.
c2d2c5
+		 *
c2d2c5
+		 * UDP clients always go inactive at this point, but a TCP
c2d2c5
+		 * client may need to stay active and return to READY
c2d2c5
+		 * state if no other clients are available to listen
c2d2c5
+		 * for TCP requests on this interface.
c2d2c5
+		 *
c2d2c5
+		 * Regardless, if we're going to FREED state, that means
c2d2c5
+		 * the system is shutting down and we don't need to
c2d2c5
+		 * retain clients.
c2d2c5
+		 */
c2d2c5
+		if (client->mortal && TCP_CLIENT(client) &&
c2d2c5
+		    client->newstate != NS_CLIENTSTATE_FREED &&
c2d2c5
+		    !ns_g_clienttest &&
c2d2c5
+		    isc_atomic_xadd(&client->interface->ntcpaccepting, 0) == 0)
c2d2c5
+		{
c2d2c5
+			/* Nobody else is accepting */
c2d2c5
+			client->mortal = ISC_FALSE;
c2d2c5
+			client->newstate = NS_CLIENTSTATE_READY;
c2d2c5
+		}
c2d2c5
+
c2d2c5
+		/*
c2d2c5
+		 * Detach from TCP connection and TCP client quota,
c2d2c5
+		 * if appropriate. If this is the last reference to
c2d2c5
+		 * the TCP connection in our pipeline group, the
c2d2c5
+		 * TCP quota slot will be released.
c2d2c5
+		 */
c2d2c5
+		if (client->tcpconn) {
c2d2c5
+			tcpconn_detach(client);
c2d2c5
+		}
c2d2c5
+
c2d2c5
 		if (client->tcpsocket != NULL) {
c2d2c5
 			CTRACE("closetcp");
c2d2c5
 			isc_socket_detach(&client->tcpsocket);
c2d2c5
+			mark_tcp_active(client, ISC_FALSE);
c2d2c5
 		}
c2d2c5
 
c2d2c5
-		if (client->tcpquota != NULL)
c2d2c5
-			isc_quota_detach(&client->tcpquota);
c2d2c5
-
c2d2c5
 		if (client->timerset) {
c2d2c5
 			(void)isc_timer_reset(client->timer,
c2d2c5
 					      isc_timertype_inactive,
c2d2c5
@@ -412,45 +566,26 @@ exit_check(ns_client_t *client) {
c2d2c5
 			client->timerset = ISC_FALSE;
c2d2c5
 		}
c2d2c5
 
c2d2c5
-		client->pipelined = ISC_FALSE;
c2d2c5
-
c2d2c5
 		client->peeraddr_valid = ISC_FALSE;
c2d2c5
 
c2d2c5
 		client->state = NS_CLIENTSTATE_READY;
c2d2c5
-		INSIST(client->recursionquota == NULL);
c2d2c5
-
c2d2c5
-		/*
c2d2c5
-		 * Now the client is ready to accept a new TCP connection
c2d2c5
-		 * or UDP request, but we may have enough clients doing
c2d2c5
-		 * that already.  Check whether this client needs to remain
c2d2c5
-		 * active and force it to go inactive if not.
c2d2c5
-		 *
c2d2c5
-		 * UDP clients go inactive at this point, but TCP clients
c2d2c5
-		 * may remain active if we have fewer active TCP client
c2d2c5
-		 * objects than desired due to an earlier quota exhaustion.
c2d2c5
-		 */
c2d2c5
-		if (client->mortal && TCP_CLIENT(client) && !ns_g_clienttest) {
c2d2c5
-			LOCK(&client->interface->lock);
c2d2c5
-			if (client->interface->ntcpcurrent <
c2d2c5
-				    client->interface->ntcptarget)
c2d2c5
-				client->mortal = ISC_FALSE;
c2d2c5
-			UNLOCK(&client->interface->lock);
c2d2c5
-		}
c2d2c5
 
c2d2c5
 		/*
c2d2c5
 		 * We don't need the client; send it to the inactive
c2d2c5
 		 * queue for recycling.
c2d2c5
 		 */
c2d2c5
 		if (client->mortal) {
c2d2c5
-			if (client->newstate > NS_CLIENTSTATE_INACTIVE)
c2d2c5
+			if (client->newstate > NS_CLIENTSTATE_INACTIVE) {
c2d2c5
 				client->newstate = NS_CLIENTSTATE_INACTIVE;
c2d2c5
+			}
c2d2c5
 		}
c2d2c5
 
c2d2c5
 		if (NS_CLIENTSTATE_READY == client->newstate) {
c2d2c5
 			if (TCP_CLIENT(client)) {
c2d2c5
 				client_accept(client);
c2d2c5
-			} else
c2d2c5
+			} else {
c2d2c5
 				client_udprecv(client);
c2d2c5
+			}
c2d2c5
 			client->newstate = NS_CLIENTSTATE_MAX;
c2d2c5
 			return (ISC_TRUE);
c2d2c5
 		}
c2d2c5
@@ -462,41 +597,51 @@ exit_check(ns_client_t *client) {
c2d2c5
 		/*
c2d2c5
 		 * We are trying to enter the inactive state.
c2d2c5
 		 */
c2d2c5
-		if (client->naccepts > 0)
c2d2c5
+		if (client->naccepts > 0) {
c2d2c5
 			isc_socket_cancel(client->tcplistener, client->task,
c2d2c5
 					  ISC_SOCKCANCEL_ACCEPT);
c2d2c5
+		}
c2d2c5
 
c2d2c5
 		/* Still waiting for accept cancel completion. */
c2d2c5
-		if (! (client->naccepts == 0))
c2d2c5
+		if (client->naccepts > 0) {
c2d2c5
 			return (ISC_TRUE);
c2d2c5
+		}
c2d2c5
 
c2d2c5
 		/* Accept cancel is complete. */
c2d2c5
-		if (client->nrecvs > 0)
c2d2c5
+		if (client->nrecvs > 0) {
c2d2c5
 			isc_socket_cancel(client->udpsocket, client->task,
c2d2c5
 					  ISC_SOCKCANCEL_RECV);
c2d2c5
+		}
c2d2c5
 
c2d2c5
 		/* Still waiting for recv cancel completion. */
c2d2c5
-		if (! (client->nrecvs == 0))
c2d2c5
+		if (client->nrecvs > 0) {
c2d2c5
 			return (ISC_TRUE);
c2d2c5
+		}
c2d2c5
 
c2d2c5
 		/* Still waiting for control event to be delivered */
c2d2c5
-		if (client->nctls > 0)
c2d2c5
+		if (client->nctls > 0) {
c2d2c5
 			return (ISC_TRUE);
c2d2c5
-
c2d2c5
-		/* Deactivate the client. */
c2d2c5
-		if (client->interface)
c2d2c5
-			ns_interface_detach(&client->interface);
c2d2c5
+		}
c2d2c5
 
c2d2c5
 		INSIST(client->naccepts == 0);
c2d2c5
 		INSIST(client->recursionquota == NULL);
c2d2c5
-		if (client->tcplistener != NULL)
c2d2c5
+		if (client->tcplistener != NULL) {
c2d2c5
 			isc_socket_detach(&client->tcplistener);
c2d2c5
+			mark_tcp_active(client, ISC_FALSE);
c2d2c5
+		}
c2d2c5
 
c2d2c5
-		if (client->udpsocket != NULL)
c2d2c5
+		if (client->udpsocket != NULL) {
c2d2c5
 			isc_socket_detach(&client->udpsocket);
c2d2c5
+		}
c2d2c5
 
c2d2c5
-		if (client->dispatch != NULL)
c2d2c5
+		/* Deactivate the client. */
c2d2c5
+		if (client->interface != NULL) {
c2d2c5
+			ns_interface_detach(&client->interface);
c2d2c5
+		}
c2d2c5
+
c2d2c5
+		if (client->dispatch != NULL) {
c2d2c5
 			dns_dispatch_detach(&client->dispatch);
c2d2c5
+		}
c2d2c5
 
c2d2c5
 		client->attributes = 0;
c2d2c5
 		client->mortal = ISC_FALSE;
c2d2c5
@@ -515,10 +660,13 @@ exit_check(ns_client_t *client) {
c2d2c5
 			client->newstate = NS_CLIENTSTATE_MAX;
c2d2c5
 			if (!ns_g_clienttest && manager != NULL &&
c2d2c5
 			    !manager->exiting)
c2d2c5
+			{
c2d2c5
 				ISC_QUEUE_PUSH(manager->inactive, client,
c2d2c5
 					       ilink);
c2d2c5
-			if (client->needshutdown)
c2d2c5
+			}
c2d2c5
+			if (client->needshutdown) {
c2d2c5
 				isc_task_shutdown(client->task);
c2d2c5
+			}
c2d2c5
 			return (ISC_TRUE);
c2d2c5
 		}
c2d2c5
 	}
c2d2c5
@@ -613,7 +761,7 @@ client_start(isc_task_t *task, isc_event_t *event) {
c2d2c5
 		return;
c2d2c5
 
c2d2c5
 	if (TCP_CLIENT(client)) {
c2d2c5
-		if (client->pipelined) {
c2d2c5
+		if (client->tcpconn != NULL) {
c2d2c5
 			client_read(client);
c2d2c5
 		} else {
c2d2c5
 			client_accept(client);
c2d2c5
@@ -623,7 +771,6 @@ client_start(isc_task_t *task, isc_event_t *event) {
c2d2c5
 	}
c2d2c5
 }
c2d2c5
 
c2d2c5
-
c2d2c5
 /*%
c2d2c5
  * The client's task has received a shutdown event.
c2d2c5
  */
c2d2c5
@@ -1511,6 +1658,7 @@ client_request(isc_task_t *task, isc_event_t *event) {
c2d2c5
 		client->nrecvs--;
c2d2c5
 	} else {
c2d2c5
 		INSIST(TCP_CLIENT(client));
c2d2c5
+		INSIST(client->tcpconn != NULL);
c2d2c5
 		REQUIRE(event->ev_type == DNS_EVENT_TCPMSG);
c2d2c5
 		REQUIRE(event->ev_sender == &client->tcpmsg);
c2d2c5
 		buffer = &client->tcpmsg.buffer;
c2d2c5
@@ -1653,18 +1801,27 @@ client_request(isc_task_t *task, isc_event_t *event) {
c2d2c5
 	/*
c2d2c5
 	 * Pipeline TCP query processing.
c2d2c5
 	 */
c2d2c5
-	if (client->message->opcode != dns_opcode_query)
c2d2c5
-		client->pipelined = ISC_FALSE;
c2d2c5
-	if (TCP_CLIENT(client) && client->pipelined) {
c2d2c5
-		result = isc_quota_reserve(&ns_g_server->tcpquota);
c2d2c5
-		if (result == ISC_R_SUCCESS)
c2d2c5
-			result = ns_client_replace(client);
c2d2c5
+	if (TCP_CLIENT(client) &&
c2d2c5
+	    client->message->opcode != dns_opcode_query)
c2d2c5
+	{
c2d2c5
+		client->tcpconn->pipelined = ISC_FALSE;
c2d2c5
+	}
c2d2c5
+	if (TCP_CLIENT(client) && client->tcpconn->pipelined) {
c2d2c5
+		/*
c2d2c5
+		 * We're pipelining. Replace the client; the
c2d2c5
+		 * replacement can read the TCP socket looking
c2d2c5
+		 * for new messages and this one can process the
c2d2c5
+		 * current message asynchronously.
c2d2c5
+		 *
c2d2c5
+		 * There will now be at least three clients using this
c2d2c5
+		 * TCP socket - one accepting new connections,
c2d2c5
+		 * one reading an existing connection to get new
c2d2c5
+		 * messages, and one answering the message already
c2d2c5
+		 * received.
c2d2c5
+		 */
c2d2c5
+		result = ns_client_replace(client);
c2d2c5
 		if (result != ISC_R_SUCCESS) {
c2d2c5
-			ns_client_log(client, NS_LOGCATEGORY_CLIENT,
c2d2c5
-				      NS_LOGMODULE_CLIENT, ISC_LOG_WARNING,
c2d2c5
-				      "no more TCP clients(read): %s",
c2d2c5
-				      isc_result_totext(result));
c2d2c5
-			client->pipelined = ISC_FALSE;
c2d2c5
+			client->tcpconn->pipelined = ISC_FALSE;
c2d2c5
 		}
c2d2c5
 	}
c2d2c5
 
c2d2c5
@@ -2168,8 +2325,7 @@ client_create(ns_clientmgr_t *manager, ns_client_t **clientp) {
c2d2c5
 	client->signer = NULL;
c2d2c5
 	dns_name_init(&client->signername, NULL);
c2d2c5
 	client->mortal = ISC_FALSE;
c2d2c5
-	client->pipelined = ISC_FALSE;
c2d2c5
-	client->tcpquota = NULL;
c2d2c5
+	client->tcpconn = NULL;
c2d2c5
 	client->recursionquota = NULL;
c2d2c5
 	client->interface = NULL;
c2d2c5
 	client->peeraddr_valid = ISC_FALSE;
c2d2c5
@@ -2177,6 +2333,7 @@ client_create(ns_clientmgr_t *manager, ns_client_t **clientp) {
c2d2c5
 	client->filter_aaaa = dns_v4_aaaa_ok;
c2d2c5
 #endif
c2d2c5
 	client->needshutdown = ns_g_clienttest;
c2d2c5
+	client->tcpactive = ISC_FALSE;
c2d2c5
 
c2d2c5
 	ISC_EVENT_INIT(&client->ctlevent, sizeof(client->ctlevent), 0, NULL,
c2d2c5
 		       NS_EVENT_CLIENTCONTROL, client_start, client, client,
c2d2c5
@@ -2269,9 +2426,10 @@ client_read(ns_client_t *client) {
c2d2c5
 
c2d2c5
 static void
c2d2c5
 client_newconn(isc_task_t *task, isc_event_t *event) {
c2d2c5
+	isc_result_t result;
c2d2c5
 	ns_client_t *client = event->ev_arg;
c2d2c5
 	isc_socket_newconnev_t *nevent = (isc_socket_newconnev_t *)event;
c2d2c5
-	isc_result_t result;
c2d2c5
+	uint32_t old;
c2d2c5
 
c2d2c5
 	REQUIRE(event->ev_type == ISC_SOCKEVENT_NEWCONN);
c2d2c5
 	REQUIRE(NS_CLIENT_VALID(client));
c2d2c5
@@ -2281,13 +2439,18 @@ client_newconn(isc_task_t *task, isc_event_t *event) {
c2d2c5
 
c2d2c5
 	INSIST(client->state == NS_CLIENTSTATE_READY);
c2d2c5
 
c2d2c5
+	/*
c2d2c5
+	 * The accept() was successful and we're now establishing a new
c2d2c5
+	 * connection. We need to make note of it in the client and
c2d2c5
+	 * interface objects so client objects can do the right thing
c2d2c5
+	 * when going inactive in exit_check() (see comments in
c2d2c5
+	 * client_accept() for details).
c2d2c5
+	 */
c2d2c5
 	INSIST(client->naccepts == 1);
c2d2c5
 	client->naccepts--;
c2d2c5
 
c2d2c5
-	LOCK(&client->interface->lock);
c2d2c5
-	INSIST(client->interface->ntcpcurrent > 0);
c2d2c5
-	client->interface->ntcpcurrent--;
c2d2c5
-	UNLOCK(&client->interface->lock);
c2d2c5
+	old = isc_atomic_xadd(&client->interface->ntcpaccepting, -1);
c2d2c5
+	INSIST(old > 0);
c2d2c5
 
c2d2c5
 	/*
c2d2c5
 	 * We must take ownership of the new socket before the exit
c2d2c5
@@ -2320,6 +2483,7 @@ client_newconn(isc_task_t *task, isc_event_t *event) {
c2d2c5
 			      NS_LOGMODULE_CLIENT, ISC_LOG_DEBUG(3),
c2d2c5
 			      "accept failed: %s",
c2d2c5
 			      isc_result_totext(nevent->result));
c2d2c5
+		tcpconn_detach(client);
c2d2c5
 	}
c2d2c5
 
c2d2c5
 	if (exit_check(client))
c2d2c5
@@ -2357,20 +2521,13 @@ client_newconn(isc_task_t *task, isc_event_t *event) {
c2d2c5
 		 * telnetting to port 53 (once per CPU) will
c2d2c5
 		 * deny service to legitimate TCP clients.
c2d2c5
 		 */
c2d2c5
-		client->pipelined = ISC_FALSE;
c2d2c5
-		result = isc_quota_attach(&ns_g_server->tcpquota,
c2d2c5
-					  &client->tcpquota);
c2d2c5
-		if (result == ISC_R_SUCCESS)
c2d2c5
-			result = ns_client_replace(client);
c2d2c5
-		if (result != ISC_R_SUCCESS) {
c2d2c5
-			ns_client_log(client, NS_LOGCATEGORY_CLIENT,
c2d2c5
-				      NS_LOGMODULE_CLIENT, ISC_LOG_WARNING,
c2d2c5
-				      "no more TCP clients(accept): %s",
c2d2c5
-				      isc_result_totext(result));
c2d2c5
-		} else if (ns_g_server->keepresporder == NULL ||
c2d2c5
-			   !allowed(&netaddr, NULL, NULL, 0, NULL,
c2d2c5
-				    ns_g_server->keepresporder)) {
c2d2c5
-			client->pipelined = ISC_TRUE;
c2d2c5
+		result = ns_client_replace(client);
c2d2c5
+		if (result == ISC_R_SUCCESS &&
c2d2c5
+		    (ns_g_server->keepresporder == NULL ||
c2d2c5
+		     !allowed(&netaddr, NULL,
c2d2c5
+			      ns_g_server->keepresporder)))
c2d2c5
+		{
c2d2c5
+			client->tcpconn->pipelined = ISC_TRUE;
c2d2c5
 		}
c2d2c5
 
c2d2c5
 		client_read(client);
c2d2c5
@@ -2386,12 +2543,66 @@ client_accept(ns_client_t *client) {
c2d2c5
 
c2d2c5
 	CTRACE("accept");
c2d2c5
 
c2d2c5
+	/*
c2d2c5
+	 * Set up a new TCP connection. This means try to attach to the
c2d2c5
+	 * TCP client quota (tcp-clients), but fail if we're over quota.
c2d2c5
+	 */
c2d2c5
+	result = tcpconn_init(client, ISC_FALSE);
c2d2c5
+	if (result != ISC_R_SUCCESS) {
c2d2c5
+		isc_boolean_t exit;
c2d2c5
+
c2d2c5
+		ns_client_log(client, NS_LOGCATEGORY_CLIENT,
c2d2c5
+			      NS_LOGMODULE_CLIENT, ISC_LOG_WARNING,
c2d2c5
+			      "TCP client quota reached: %s",
c2d2c5
+			      isc_result_totext(result));
c2d2c5
+
c2d2c5
+		/*
c2d2c5
+		 * We have exceeded the system-wide TCP client quota.  But,
c2d2c5
+		 * we can't just block this accept in all cases, because if
c2d2c5
+		 * we did, a heavy TCP load on other interfaces might cause
c2d2c5
+		 * this interface to be starved, with no clients able to
c2d2c5
+		 * accept new connections.
c2d2c5
+		 *
c2d2c5
+		 * So, we check here to see if any other clients are
c2d2c5
+		 * already servicing TCP queries on this interface (whether
c2d2c5
+		 * accepting, reading, or processing). If we find that at
c2d2c5
+		 * least one client other than this one is active, then
c2d2c5
+		 * it's okay *not* to call accept - we can let this
c2d2c5
+		 * client go inactive and another will take over when it's
c2d2c5
+		 * done.
c2d2c5
+		 *
c2d2c5
+		 * If there aren't enough active clients on the interface,
c2d2c5
+		 * then we can be a little bit flexible about the quota.
c2d2c5
+		 * We'll allow *one* extra client through to ensure we're
c2d2c5
+		 * listening on every interface; we do this by setting the
c2d2c5
+		 * 'force' option to tcpconn_init().
c2d2c5
+		 *
c2d2c5
+		 * (Note: In practice this means that the real TCP client
c2d2c5
+		 * quota is tcp-clients plus the number of listening
c2d2c5
+		 * interfaces plus 1.)
c2d2c5
+		 */
c2d2c5
+		exit = (isc_atomic_xadd(&client->interface->ntcpactive, 0) >
c2d2c5
+			(client->tcpactive ? 1 : 0));
c2d2c5
+		if (exit) {
c2d2c5
+			client->newstate = NS_CLIENTSTATE_INACTIVE;
c2d2c5
+			(void)exit_check(client);
c2d2c5
+			return;
c2d2c5
+		}
c2d2c5
+
c2d2c5
+		result = tcpconn_init(client, ISC_TRUE);
c2d2c5
+		RUNTIME_CHECK(result == ISC_R_SUCCESS);
c2d2c5
+	}
c2d2c5
+
c2d2c5
+	/*
c2d2c5
+	 * If this client was set up using get_client() or get_worker(),
c2d2c5
+	 * then TCP is already marked active. However, if it was restarted
c2d2c5
+	 * from exit_check(), it might not be, so we take care of it now.
c2d2c5
+	 */
c2d2c5
+	mark_tcp_active(client, ISC_TRUE);
c2d2c5
+
c2d2c5
 	result = isc_socket_accept(client->tcplistener, client->task,
c2d2c5
 				   client_newconn, client);
c2d2c5
 	if (result != ISC_R_SUCCESS) {
c2d2c5
-		UNEXPECTED_ERROR(__FILE__, __LINE__,
c2d2c5
-				 "isc_socket_accept() failed: %s",
c2d2c5
-				 isc_result_totext(result));
c2d2c5
 		/*
c2d2c5
 		 * XXXRTH  What should we do?  We're trying to accept but
c2d2c5
 		 *	   it didn't work.  If we just give up, then TCP
c2d2c5
@@ -2399,13 +2610,37 @@ client_accept(ns_client_t *client) {
c2d2c5
 		 *
c2d2c5
 		 *	   For now, we just go idle.
c2d2c5
 		 */
c2d2c5
+		UNEXPECTED_ERROR(__FILE__, __LINE__,
c2d2c5
+				 "isc_socket_accept() failed: %s",
c2d2c5
+				 isc_result_totext(result));
c2d2c5
+
c2d2c5
+		tcpconn_detach(client);
c2d2c5
+		mark_tcp_active(client, ISC_FALSE);
c2d2c5
 		return;
c2d2c5
 	}
c2d2c5
+
c2d2c5
+	/*
c2d2c5
+	 * The client's 'naccepts' counter indicates that this client has
c2d2c5
+	 * called accept() and is waiting for a new connection. It should
c2d2c5
+	 * never exceed 1.
c2d2c5
+	 */
c2d2c5
 	INSIST(client->naccepts == 0);
c2d2c5
 	client->naccepts++;
c2d2c5
-	LOCK(&client->interface->lock);
c2d2c5
-	client->interface->ntcpcurrent++;
c2d2c5
-	UNLOCK(&client->interface->lock);
c2d2c5
+
c2d2c5
+	/*
c2d2c5
+	 * The interface's 'ntcpaccepting' counter is incremented when
c2d2c5
+	 * any client calls accept(), and decremented in client_newconn()
c2d2c5
+	 * once the connection is established.
c2d2c5
+	 *
c2d2c5
+	 * When the client object is shutting down after handling a TCP
c2d2c5
+	 * request (see exit_check()), if this value is at least one, that
c2d2c5
+	 * means another client has called accept() and is waiting to
c2d2c5
+	 * establish the next connection. That means the client may be
c2d2c5
+	 * free to become inactive; otherwise it may need to start
c2d2c5
+	 * listening for connections itself to prevent the interface
c2d2c5
+	 * going dead.
c2d2c5
+	 */
c2d2c5
+	isc_atomic_xadd(&client->interface->ntcpaccepting, 1);
c2d2c5
 }
c2d2c5
 
c2d2c5
 static void
c2d2c5
@@ -2476,15 +2711,17 @@ ns_client_replace(ns_client_t *client) {
c2d2c5
 	REQUIRE(client->manager != NULL);
c2d2c5
 
c2d2c5
 	tcp = TCP_CLIENT(client);
c2d2c5
-	if (tcp && client->pipelined) {
c2d2c5
+	if (tcp && client->tcpconn != NULL && client->tcpconn->pipelined) {
c2d2c5
 		result = get_worker(client->manager, client->interface,
c2d2c5
-				    client->tcpsocket);
c2d2c5
+				    client->tcpsocket, client);
c2d2c5
 	} else {
c2d2c5
 		result = get_client(client->manager, client->interface,
c2d2c5
 				    client->dispatch, tcp);
c2d2c5
+
c2d2c5
 	}
c2d2c5
-	if (result != ISC_R_SUCCESS)
c2d2c5
+	if (result != ISC_R_SUCCESS) {
c2d2c5
 		return (result);
c2d2c5
+	}
c2d2c5
 
c2d2c5
 	/*
c2d2c5
 	 * The responsibility for listening for new requests is hereby
c2d2c5
@@ -2668,9 +2905,12 @@ get_client(ns_clientmgr_t *manager, ns_interface_t *ifp,
c2d2c5
 	INSIST(client->recursionquota == NULL);
c2d2c5
 
c2d2c5
 	if (tcp) {
c2d2c5
+		mark_tcp_active(client, ISC_TRUE);
c2d2c5
+
c2d2c5
 		client->attributes |= NS_CLIENTATTR_TCP;
c2d2c5
 		isc_socket_attach(ifp->tcpsocket,
c2d2c5
 				  &client->tcplistener);
c2d2c5
+
c2d2c5
 	} else {
c2d2c5
 		isc_socket_t *sock;
c2d2c5
 
c2d2c5
@@ -2688,7 +2928,8 @@ get_client(ns_clientmgr_t *manager, ns_interface_t *ifp,
c2d2c5
 }
c2d2c5
 
c2d2c5
 static isc_result_t
c2d2c5
-get_worker(ns_clientmgr_t *manager, ns_interface_t *ifp, isc_socket_t *socket)
c2d2c5
+get_worker(ns_clientmgr_t *manager, ns_interface_t *ifp, isc_socket_t *sock,
c2d2c5
+	   ns_client_t *oldclient)
c2d2c5
 {
c2d2c5
 	isc_result_t result = ISC_R_SUCCESS;
c2d2c5
 	isc_event_t *ev;
c2d2c5
@@ -2696,6 +2937,7 @@ get_worker(ns_clientmgr_t *manager, ns_interface_t *ifp, isc_socket_t *socket)
c2d2c5
 	MTRACE("get worker");
c2d2c5
 
c2d2c5
 	REQUIRE(manager != NULL);
c2d2c5
+	REQUIRE(oldclient != NULL);
c2d2c5
 
c2d2c5
 	if (manager->exiting)
c2d2c5
 		return (ISC_R_SHUTTINGDOWN);
c2d2c5
@@ -2728,15 +2970,15 @@ get_worker(ns_clientmgr_t *manager, ns_interface_t *ifp, isc_socket_t *socket)
c2d2c5
 	ns_interface_attach(ifp, &client->interface);
c2d2c5
 	client->newstate = client->state = NS_CLIENTSTATE_WORKING;
c2d2c5
 	INSIST(client->recursionquota == NULL);
c2d2c5
-	client->tcpquota = &ns_g_server->tcpquota;
c2d2c5
-
c2d2c5
-	client->dscp = ifp->dscp;
c2d2c5
 
c2d2c5
 	client->attributes |= NS_CLIENTATTR_TCP;
c2d2c5
-	client->pipelined = ISC_TRUE;
c2d2c5
+	client->mortal = ISC_TRUE;
c2d2c5
+
c2d2c5
+	tcpconn_attach(oldclient, client);
c2d2c5
+	mark_tcp_active(client, ISC_TRUE);
c2d2c5
 
c2d2c5
 	isc_socket_attach(ifp->tcpsocket, &client->tcplistener);
c2d2c5
-	isc_socket_attach(socket, &client->tcpsocket);
c2d2c5
+	isc_socket_attach(sock, &client->tcpsocket);
c2d2c5
 	isc_socket_setname(client->tcpsocket, "worker-tcp", NULL);
c2d2c5
 	(void)isc_socket_getpeername(client->tcpsocket, &client->peeraddr);
c2d2c5
 	client->peeraddr_valid = ISC_TRUE;
c2d2c5
diff --git a/bin/named/include/named/client.h b/bin/named/include/named/client.h
c2d2c5
index bf4d201..ef55a7a 100644
c2d2c5
--- a/bin/named/include/named/client.h
c2d2c5
+++ b/bin/named/include/named/client.h
c2d2c5
@@ -15,8 +15,6 @@
c2d2c5
  * PERFORMANCE OF THIS SOFTWARE.
c2d2c5
  */
c2d2c5
 
c2d2c5
-/* $Id$ */
c2d2c5
-
c2d2c5
 #ifndef NAMED_CLIENT_H
c2d2c5
 #define NAMED_CLIENT_H 1
c2d2c5
 
c2d2c5
@@ -83,6 +81,13 @@
c2d2c5
  *** Types
c2d2c5
  ***/
c2d2c5
 
c2d2c5
+/*% reference-counted TCP connection object */
c2d2c5
+typedef struct ns_tcpconn {
c2d2c5
+	isc_refcount_t		refs;
c2d2c5
+	isc_quota_t		*tcpquota;
c2d2c5
+	isc_boolean_t		pipelined;
c2d2c5
+} ns_tcpconn_t;
c2d2c5
+
c2d2c5
 /*% nameserver client structure */
c2d2c5
 struct ns_client {
c2d2c5
 	unsigned int		magic;
c2d2c5
@@ -133,8 +138,8 @@ struct ns_client {
c2d2c5
 	dns_name_t		signername;   /*%< [T]SIG key name */
c2d2c5
 	dns_name_t *		signer;	      /*%< NULL if not valid sig */
c2d2c5
 	isc_boolean_t		mortal;	      /*%< Die after handling request */
c2d2c5
-	isc_boolean_t		pipelined;   /*%< TCP queries not in sequence */
c2d2c5
-	isc_quota_t		*tcpquota;
c2d2c5
+	isc_boolean_t		tcpactive;
c2d2c5
+	ns_tcpconn_t		*tcpconn;
c2d2c5
 	isc_quota_t		*recursionquota;
c2d2c5
 	ns_interface_t		*interface;
c2d2c5
 	isc_sockaddr_t		peeraddr;
c2d2c5
diff --git a/bin/named/include/named/interfacemgr.h b/bin/named/include/named/interfacemgr.h
c2d2c5
index 380dbed..a342866 100644
c2d2c5
--- a/bin/named/include/named/interfacemgr.h
c2d2c5
+++ b/bin/named/include/named/interfacemgr.h
c2d2c5
@@ -15,8 +15,6 @@
c2d2c5
  * PERFORMANCE OF THIS SOFTWARE.
c2d2c5
  */
c2d2c5
 
c2d2c5
-/* $Id: interfacemgr.h,v 1.35 2011/07/28 23:47:58 tbox Exp $ */
c2d2c5
-
c2d2c5
 #ifndef NAMED_INTERFACEMGR_H
c2d2c5
 #define NAMED_INTERFACEMGR_H 1
c2d2c5
 
c2d2c5
@@ -80,9 +78,14 @@ struct ns_interface {
c2d2c5
 	dns_dispatch_t *	udpdispatch[MAX_UDP_DISPATCH];
c2d2c5
 						/*%< UDP dispatchers. */
c2d2c5
 	isc_socket_t *		tcpsocket;	/*%< TCP socket. */
c2d2c5
-	int			ntcptarget;	/*%< Desired number of concurrent
c2d2c5
-						     TCP accepts */
c2d2c5
-	int			ntcpcurrent;	/*%< Current ditto, locked */
c2d2c5
+	int32_t			ntcpaccepting;	/*%< Number of clients
c2d2c5
+						     ready to accept new
c2d2c5
+						     TCP connections on this
c2d2c5
+						     interface */
c2d2c5
+	int32_t			ntcpactive;	/*%< Number of clients
c2d2c5
+						     servicing TCP queries
c2d2c5
+						     (whether accepting or
c2d2c5
+						     connected) */
c2d2c5
 	int			nudpdispatch;	/*%< Number of UDP dispatches */
c2d2c5
 	ns_clientmgr_t *	clientmgr;	/*%< Client manager. */
c2d2c5
 	ISC_LINK(ns_interface_t) link;
c2d2c5
diff --git a/bin/named/interfacemgr.c b/bin/named/interfacemgr.c
c2d2c5
index 4aee47a..ebec0c4 100644
c2d2c5
--- a/bin/named/interfacemgr.c
c2d2c5
+++ b/bin/named/interfacemgr.c
c2d2c5
@@ -380,8 +380,9 @@ ns_interface_create(ns_interfacemgr_t *mgr, isc_sockaddr_t *addr,
c2d2c5
 	 * connections will be handled in parallel even though there is
c2d2c5
 	 * only one client initially.
c2d2c5
 	 */
c2d2c5
-	ifp->ntcptarget = 1;
c2d2c5
-	ifp->ntcpcurrent = 0;
c2d2c5
+	ifp->ntcpaccepting = 0;
c2d2c5
+	ifp->ntcpactive = 0;
c2d2c5
+
c2d2c5
 	ifp->nudpdispatch = 0;
c2d2c5
 
c2d2c5
 	ISC_LINK_INIT(ifp, link);
c2d2c5
@@ -510,9 +511,7 @@ ns_interface_accepttcp(ns_interface_t *ifp) {
c2d2c5
 	 */
c2d2c5
 	(void)isc_socket_filter(ifp->tcpsocket, "dataready");
c2d2c5
 
c2d2c5
-	result = ns_clientmgr_createclients(ifp->clientmgr,
c2d2c5
-					    ifp->ntcptarget, ifp,
c2d2c5
-					    ISC_TRUE);
c2d2c5
+	result = ns_clientmgr_createclients(ifp->clientmgr, 1, ifp, ISC_TRUE);
c2d2c5
 	if (result != ISC_R_SUCCESS) {
c2d2c5
 		UNEXPECTED_ERROR(__FILE__, __LINE__,
c2d2c5
 				 "TCP ns_clientmgr_createclients(): %s",
c2d2c5
diff --git a/lib/isc/include/isc/quota.h b/lib/isc/include/isc/quota.h
c2d2c5
index 7b0d0d9..bb1a927 100644
c2d2c5
--- a/lib/isc/include/isc/quota.h
c2d2c5
+++ b/lib/isc/include/isc/quota.h
c2d2c5
@@ -107,6 +107,13 @@ isc_quota_attach(isc_quota_t *quota, isc_quota_t **p);
c2d2c5
  * quota if successful (ISC_R_SUCCESS or ISC_R_SOFTQUOTA).
c2d2c5
  */
c2d2c5
 
c2d2c5
+isc_result_t
c2d2c5
+isc_quota_force(isc_quota_t *quota, isc_quota_t **p);
c2d2c5
+/*%<
c2d2c5
+ * Like isc_quota_attach, but will attach '*p' to the quota
c2d2c5
+ * even if the hard quota has been exceeded.
c2d2c5
+ */
c2d2c5
+
c2d2c5
 void
c2d2c5
 isc_quota_detach(isc_quota_t **p);
c2d2c5
 /*%<
c2d2c5
diff --git a/lib/isc/quota.c b/lib/isc/quota.c
c2d2c5
index 5e5c50c..ca4c478 100644
c2d2c5
--- a/lib/isc/quota.c
c2d2c5
+++ b/lib/isc/quota.c
c2d2c5
@@ -81,20 +81,39 @@ isc_quota_release(isc_quota_t *quota) {
c2d2c5
 	UNLOCK(&quota->lock);
c2d2c5
 }
c2d2c5
 
c2d2c5
-isc_result_t
c2d2c5
-isc_quota_attach(isc_quota_t *quota, isc_quota_t **p)
c2d2c5
-{
c2d2c5
+static isc_result_t
c2d2c5
+doattach(isc_quota_t *quota, isc_quota_t **p, isc_boolean_t force) {
c2d2c5
 	isc_result_t result;
c2d2c5
-	INSIST(p != NULL && *p == NULL);
c2d2c5
+	REQUIRE(p != NULL && *p == NULL);
c2d2c5
+
c2d2c5
 	result = isc_quota_reserve(quota);
c2d2c5
-	if (result == ISC_R_SUCCESS || result == ISC_R_SOFTQUOTA)
c2d2c5
+	if (result == ISC_R_SUCCESS || result == ISC_R_SOFTQUOTA) {
c2d2c5
+		*p = quota;
c2d2c5
+	} else if (result == ISC_R_QUOTA && force) {
c2d2c5
+		/* attach anyway */
c2d2c5
+		LOCK(&quota->lock);
c2d2c5
+		quota->used++;
c2d2c5
+		UNLOCK(&quota->lock);
c2d2c5
+
c2d2c5
 		*p = quota;
c2d2c5
+		result = ISC_R_SUCCESS;
c2d2c5
+	}
c2d2c5
+
c2d2c5
 	return (result);
c2d2c5
 }
c2d2c5
 
c2d2c5
+isc_result_t
c2d2c5
+isc_quota_attach(isc_quota_t *quota, isc_quota_t **p) {
c2d2c5
+	return (doattach(quota, p, ISC_FALSE));
c2d2c5
+}
c2d2c5
+
c2d2c5
+isc_result_t
c2d2c5
+isc_quota_force(isc_quota_t *quota, isc_quota_t **p) {
c2d2c5
+	return (doattach(quota, p, ISC_TRUE));
c2d2c5
+}
c2d2c5
+
c2d2c5
 void
c2d2c5
-isc_quota_detach(isc_quota_t **p)
c2d2c5
-{
c2d2c5
+isc_quota_detach(isc_quota_t **p) {
c2d2c5
 	INSIST(p != NULL && *p != NULL);
c2d2c5
 	isc_quota_release(*p);
c2d2c5
 	*p = NULL;
c2d2c5
-- 
c2d2c5
2.20.1
c2d2c5