From 70e6c68196d381d05d35414c138894b54939d236 Mon Sep 17 00:00:00 2001
From: Thierry Bordaz <tbordaz@redhat.com>
Date: Thu, 7 Jun 2018 16:19:34 +0200
Subject: [PATCH] Ticket 49765 - Async operations can hang when the server is
 running nunc-stans

Bug Description:
The fix for https://pagure.io/389-ds-base/issue/48184 made it possible to
schedule several NS handlers, where each handler waits for the previous
handler to be dispatched before it is scheduled.

If the current handler is never called (connection inactivity), the waiting
handlers can wait indefinitely (until timeout or connection closure). While
they wait, they also delay the processing of the operation when the
scheduling is done from connection_threadmain.

As a result, some operations can appear to hang.
This scenario happens with async operations.

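To illustrate the hang, the pre-patch behavior (removed by the daemon.c hunk
below) boiled down to the following retry loop; this is a condensed sketch of
that removed code, not new logic:

    /* Old behavior: the calling thread keeps sleeping until the already
     * scheduled job is dispatched. If that job never runs (idle connection),
     * connection_threadmain sits here instead of moving on to the next
     * operation, so async operations appear to hang. */
    while (ns_connection_post_io_or_closing_try(conn)) {
        PR_ExitMonitor(conn->c_mutex);
        DS_Sleep(PR_MillisecondsToInterval(100));
        PR_EnterMonitor(conn->c_mutex);
    }
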
Fix Description:
Instead of waiting for the scheduled handler to complete, evaluate whether
the already scheduled handler must be cleared (ns_job_done) or whether the
handler that would otherwise wait should simply be dropped.

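The decision logic of the reworked ns_connection_post_io_or_closing() (see the
daemon.c hunk below) can be summarized by the following self-contained sketch.
The names used here (conn_state, pending_job, pick_action, ...) are
illustrative stand-ins, not the real daemon.c/nunc-stans symbols; only the
branch structure mirrors the patch:

    #include <stdio.h>

    typedef void (*handler_fn)(void);

    static void read_handler(void) {}    /* stands in for ns_handle_pr_read_ready */
    static void closure_handler(void) {} /* stands in for ns_handle_closure */

    struct conn_state {
        handler_fn pending_job; /* like conn->c_job: NULL when nothing is scheduled */
        int needs_closing;      /* like CONN_NEEDS_CLOSING(conn) */
        int is_free;            /* like connection_is_free(conn, 0) */
    };

    /* Same idea as the new ns_job_is_func(): which callback does the job run? */
    static int job_is(handler_fn pending, handler_fn fn)
    {
        return pending != NULL && pending == fn;
    }

    static const char *pick_action(struct conn_state *c)
    {
        if (c->pending_job != NULL) {
            if (c->is_free) {
                c->pending_job = NULL; /* ns_job_done(): the connection is already gone */
                return "cancel stale job, schedule nothing";
            }
            if (c->needs_closing) {
                if (job_is(c->pending_job, closure_handler)) {
                    return "closure already scheduled, nothing to do";
                }
                c->pending_job = NULL; /* ns_job_done(): wrong handler, replace it below */
            } else {
                /* connection still active: keep whatever is already scheduled */
                return "job already scheduled, nothing to do";
            }
        }
        /* nothing (left) scheduled: pick the handler matching the connection state */
        c->pending_job = c->needs_closing ? closure_handler : read_handler;
        return c->needs_closing ? "schedule closure handler" : "schedule read handler";
    }

    int main(void)
    {
        struct conn_state c = {read_handler, 1 /* needs closing */, 0 /* not free */};
        printf("%s\n", pick_action(&c)); /* prints: schedule closure handler */
        return 0;
    }

In the real code the role of job_is() is played by the new ns_job_is_func()
helper added to nunc-stans, which lets the daemon check which callback the
pending job carries without waiting for it to be dispatched.
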
https://pagure.io/389-ds-base/issue/49765

Reviewed by: Mark Reynolds (thanks Mark !)

Platforms tested: F26

Flag Day: no

Doc impact: no
---
 ldap/servers/slapd/daemon.c         | 142 +++++++++++++++-------------
 src/nunc-stans/include/nunc-stans.h |   3 +
 src/nunc-stans/ns/ns_thrpool.c      |   5 +
 3 files changed, 84 insertions(+), 66 deletions(-)

diff --git a/ldap/servers/slapd/daemon.c b/ldap/servers/slapd/daemon.c
index 35cfe7de0..0a723c4a8 100644
--- a/ldap/servers/slapd/daemon.c
+++ b/ldap/servers/slapd/daemon.c
@@ -152,12 +152,21 @@ accept_and_configure(int s __attribute__((unused)), PRFileDesc *pr_acceptfd, PRN
  */
 static int handle_new_connection(Connection_Table *ct, int tcps, PRFileDesc *pr_acceptfd, int secure, int local, Connection **newconn);
 static void ns_handle_new_connection(struct ns_job_t *job);
+static void ns_handle_closure(struct ns_job_t *job);
 static void handle_pr_read_ready(Connection_Table *ct, PRIntn num_poll);
 static int clear_signal(struct POLL_STRUCT *fds);
 static void unfurl_banners(Connection_Table *ct, daemon_ports_t *ports, PRFileDesc **n_tcps, PRFileDesc **s_tcps, PRFileDesc **i_unix);
 static int write_pid_file(void);
 static int init_shutdown_detect(void);
 
+#define NS_HANDLER_NEW_CONNECTION 0
+#define NS_HANDLER_READ_CONNECTION 1
+#define NS_HANDLER_CLOSE_CONNECTION 2
+static ns_job_func_t ns_handlers[] = {
+    ns_handle_new_connection,
+    ns_handle_pr_read_ready,
+    ns_handle_closure
+};
 /*
  * NSPR has different implementations for PRMonitor, depending
  * on the availble threading model
@@ -1058,7 +1067,7 @@ slapd_daemon(daemon_ports_t *ports, ns_thrpool_t *tp)
         for (size_t ii = 0; ii < listeners; ++ii) {
             listener_idxs[ii].ct = the_connection_table; /* to pass to handle_new_connection */
             ns_result_t result = ns_add_io_job(tp, listener_idxs[ii].listenfd, NS_JOB_ACCEPT | NS_JOB_PERSIST | NS_JOB_PRESERVE_FD,
-                                               ns_handle_new_connection, &listener_idxs[ii], &(listener_idxs[ii].ns_job));
+                                               ns_handlers[NS_HANDLER_NEW_CONNECTION], &listener_idxs[ii], &(listener_idxs[ii].ns_job));
             if (result != NS_SUCCESS) {
                 slapi_log_err(SLAPI_LOG_CRIT, "slapd_daemon", "ns_add_io_job failed to create add acceptor %d\n", result);
             }
@@ -1684,28 +1693,84 @@ ns_handle_closure(struct ns_job_t *job)
 /**
  * Schedule more I/O for this connection, or make sure that it
  * is closed in the event loop.
+ *
  * caller must hold c_mutex
- * It returns
- *  0 on success
- *  1 on need to retry
  */
-static int
-ns_connection_post_io_or_closing_try(Connection *conn)
+void
+ns_connection_post_io_or_closing(Connection *conn)
 {
     struct timeval tv;
 
     if (!enable_nunc_stans) {
-        return 0;
+        return;
     }
 
     /*
      * A job was already scheduled.
-     * Let it be dispatched first
+     * Check if it is the appropriate one
      */
     if (conn->c_job != NULL) {
-        return 1;
+        if (connection_is_free(conn, 0)) {
+            PRStatus shutdown_status;
+
+            /* The connection being freed,
+             * It means that ns_handle_closure already completed and the connection
+             * is no longer on the active list.
+             * The scheduled job is useless and scheduling a new one as well
+             */
+            shutdown_status = ns_job_done(conn->c_job);
+            if (shutdown_status != PR_SUCCESS) {
+                slapi_log_err(SLAPI_LOG_CRIT, "ns_connection_post_io_or_closing", "Failed cancel a job on a freed connection %d !\n", conn->c_sd);
+            }
+            conn->c_job = NULL;
+            return;
+        }
+        if (CONN_NEEDS_CLOSING(conn)) {
+            if (ns_job_is_func(conn->c_job, ns_handlers[NS_HANDLER_CLOSE_CONNECTION])) {
+                /* Due to the closing state we would schedule a ns_handle_closure
+                 * but one is already registered.
+                 * Just return;
+                 */
+                slapi_log_err(SLAPI_LOG_CONNS, "ns_connection_post_io_or_closing", "Already ns_handle_closure "
+                                                                                   "job in progress on conn %" PRIu64 " for fd=%d\n",
+                              conn->c_connid, conn->c_sd);
+                return;
+            } else {
+                /* Due to the closing state we would schedule a ns_handle_closure
+                 * but a different handler is registered. Stop it and schedule (below) ns_handle_closure
+                 */
+                ns_job_done(conn->c_job);
+                conn->c_job = NULL;
+            }
+        } else {
+            /* Here the connection is still active => ignore the call and return */
+            if (ns_job_is_func(conn->c_job, ns_handlers[NS_HANDLER_READ_CONNECTION])) {
+                /* Connection is still active and a read_ready is already scheduled
+                 * Likely a consequence of async operations
+                 * Just let the current read_ready do its job
+                 */
+                slapi_log_err(SLAPI_LOG_CONNS, "ns_connection_post_io_or_closing", "Already ns_handle_pr_read_ready "
+                                                                                   "job in progress on conn %" PRIu64 " for fd=%d\n",
+                              conn->c_connid, conn->c_sd);
+            } else {
+                /* Weird situation where the connection is not flagged closing but ns_handle_closure
+                 * is scheduled.
+                 * We should not try to read it anymore
+                 */
+                PR_ASSERT(ns_job_is_func(conn->c_job, ns_handlers[NS_HANDLER_CLOSE_CONNECTION]));
+            }
+            return;
+        }
     }
 
+    /* At this point conn->c_job is NULL
+     * Either it was null when the function was called
+     * Or we cleared it (+ns_job_done) if the wrong (according
+     * to the connection state) handler was scheduled
+     *
+     * Now we need to determine which handler to schedule
+     */
+
     if (CONN_NEEDS_CLOSING(conn)) {
         /* there should only ever be 0 or 1 active closure jobs */
         PR_ASSERT((conn->c_ns_close_jobs == 0) || (conn->c_ns_close_jobs == 1));
@@ -1718,7 +1783,7 @@ ns_connection_post_io_or_closing_try(Connection *conn)
         conn->c_ns_close_jobs++; /* now 1 active closure job */
         connection_acquire_nolock_ext(conn, 1 /* allow acquire even when closing */); /* event framework now has a reference */
         /* Close the job asynchronously. Why? */
-        ns_result_t job_result = ns_add_job(conn->c_tp, NS_JOB_TIMER, ns_handle_closure, conn, &(conn->c_job));
+        ns_result_t job_result = ns_add_job(conn->c_tp, NS_JOB_TIMER, ns_handlers[NS_HANDLER_CLOSE_CONNECTION], conn, &(conn->c_job));
         if (job_result != NS_SUCCESS) {
             if (job_result == NS_SHUTDOWN) {
                 slapi_log_err(SLAPI_LOG_INFO, "ns_connection_post_io_or_closing", "post closure job "
@@ -1762,7 +1827,7 @@ ns_connection_post_io_or_closing_try(Connection *conn)
 #endif
         ns_result_t job_result = ns_add_io_timeout_job(conn->c_tp, conn->c_prfd, &tv,
                                                        NS_JOB_READ | NS_JOB_PRESERVE_FD,
-                                                       ns_handle_pr_read_ready, conn, &(conn->c_job));
+                                                       ns_handlers[NS_HANDLER_READ_CONNECTION], conn, &(conn->c_job));
         if (job_result != NS_SUCCESS) {
             if (job_result == NS_SHUTDOWN) {
                 slapi_log_err(SLAPI_LOG_INFO, "ns_connection_post_io_or_closing", "post I/O job for "
@@ -1782,61 +1847,6 @@ ns_connection_post_io_or_closing_try(Connection *conn)
     return 0;
 }
 
-/*
- * Tries to schedule I/O for this connection
- * If the connection is already busy with a scheduled I/O
- * it can wait until scheduled I/O is dispatched
- *
- * caller must hold c_mutex
- */
-void
-ns_connection_post_io_or_closing(Connection *conn)
-{
-    while (ns_connection_post_io_or_closing_try(conn)) {
-        /* Here a job is currently scheduled (c->job is set) and not yet dispatched
-         * Job can be either:
-         *  - ns_handle_closure
-         *  - ns_handle_pr_read_ready
-         */
-
-        if (connection_is_free(conn, 0)) {
-            PRStatus shutdown_status;
-
-            /* The connection being freed,
-             * It means that ns_handle_closure already completed and the connection
-             * is no longer on the active list.
-             * The scheduled job is useless and scheduling a new one as well
-             */
-            shutdown_status = ns_job_done(conn->c_job);
-            if (shutdown_status != PR_SUCCESS) {
-                slapi_log_err(SLAPI_LOG_CRIT, "ns_connection_post_io_or_closing", "Failed cancel a job on a freed connection %d !\n", conn->c_sd);
-            }
-            conn->c_job = NULL;
-            return;
-        }
-        if (g_get_shutdown() && CONN_NEEDS_CLOSING(conn)) {
-            PRStatus shutdown_status;
-
-            /* This is shutting down cancel any scheduled job to register ns_handle_closure
-             */
-            shutdown_status = ns_job_done(conn->c_job);
-            if (shutdown_status != PR_SUCCESS) {
-                slapi_log_err(SLAPI_LOG_CRIT, "ns_connection_post_io_or_closing", "Failed to cancel a job during shutdown %d !\n", conn->c_sd);
-            }
-            conn->c_job = NULL;
-            continue;
-        }
-
-        /* We are not suppose to work immediately on the connection that is taken by
-         * another job
-         * release the lock and give some time
-         */
-        PR_ExitMonitor(conn->c_mutex);
-        DS_Sleep(PR_MillisecondsToInterval(100));
-        PR_EnterMonitor(conn->c_mutex);
-    }
-}
-
 /* This function must be called without the thread flag, in the
  * event loop. This function may free the connection. This can
  * only be done in the event loop thread.
diff --git a/src/nunc-stans/include/nunc-stans.h b/src/nunc-stans/include/nunc-stans.h
index 192e38ec3..a0ddbdb42 100644
--- a/src/nunc-stans/include/nunc-stans.h
+++ b/src/nunc-stans/include/nunc-stans.h
@@ -959,4 +959,7 @@ ns_result_t ns_thrpool_wait(struct ns_thrpool_t *tp);
  */
 ns_result_t ns_job_rearm(struct ns_job_t *job);
 
+int
+ns_job_is_func(struct ns_job_t *job, ns_job_func_t func);
+
 #endif /* NS_THRPOOL_H */
diff --git a/src/nunc-stans/ns/ns_thrpool.c b/src/nunc-stans/ns/ns_thrpool.c
index d95b0c38b..774607c88 100644
--- a/src/nunc-stans/ns/ns_thrpool.c
+++ b/src/nunc-stans/ns/ns_thrpool.c
@@ -1237,6 +1237,11 @@ ns_job_rearm(ns_job_t *job)
     /* Unreachable code .... */
     return NS_INVALID_REQUEST;
 }
+int
+ns_job_is_func(struct ns_job_t *job, ns_job_func_t func)
+{
+    return(job && job->func == func);
+}
 
 static void
 ns_thrpool_delete(ns_thrpool_t *tp)
-- 
2.17.1