dcavalca / rpms / systemd

Forked from rpms/systemd 5 months ago
Clone
923a60
From c67c643418b1df5b5705b3a72eba1e6755830dc5 Mon Sep 17 00:00:00 2001
923a60
From: Jan Synacek <jsynacek@redhat.com>
923a60
Date: Tue, 21 Nov 2017 12:46:28 +0100
923a60
Subject: [PATCH] journald: never block when sending messages on NOTIFY_SOCKET
923a60
 socket
923a60
923a60
Otherwise we might run into deadlocks, when journald blocks on the
923a60
notify socket on PID 1, and PID 1 blocks on IPC to dbus-daemon and
923a60
dbus-daemon blocks on logging to journald. Break this cycle by making
923a60
sure that journald never ever blocks on PID 1.
923a60
923a60
Note that this change disables support for event loop watchdog support,
923a60
as these messages are sent in blocking style by sd-event. That should
923a60
not be a big loss though, as people reported frequent problems with the
923a60
watchdog hitting journald on excessively slow IO.
923a60
923a60
Fixes: #1505.
923a60
(cherry-picked from commit e22aa3d3284709234f086ebebc13a905a295b7a7)
923a60
923a60
Resolves: #1511565
923a60
---
923a60
 src/journal/journald-server.c     | 130 +++++++++++++++++++++++++++++-
923a60
 src/journal/journald-server.h     |  13 ++-
923a60
 src/journal/journald-stream.c     |  68 ++++++++++++++--
923a60
 src/journal/journald-stream.h     |   3 +
923a60
 src/journal/journald.c            |   8 --
923a60
 units/systemd-journald.service.in |   1 -
923a60
 6 files changed, 201 insertions(+), 22 deletions(-)
923a60
923a60
diff --git a/src/journal/journald-server.c b/src/journal/journald-server.c
923a60
index daeecd5191..a810829b24 100644
923a60
--- a/src/journal/journald-server.c
923a60
+++ b/src/journal/journald-server.c
923a60
@@ -67,6 +67,8 @@
923a60
 
923a60
 #define RECHECK_AVAILABLE_SPACE_USEC (30*USEC_PER_SEC)
923a60
 
923a60
+#define NOTIFY_SNDBUF_SIZE (8*1024*1024)
923a60
+
923a60
 /* Pick a good default that is likely to fit into AF_UNIX and AF_INET SOCK_DGRAM datagrams, and even leaves some room
923a60
 + * for a bit of additional metadata. */
923a60
 #define DEFAULT_LINE_MAX (48*1024)
923a60
@@ -1556,6 +1558,126 @@ static int server_open_hostname(Server *s) {
923a60
         return 0;
923a60
 }
923a60
 
923a60
+static int dispatch_notify_event(sd_event_source *es, int fd, uint32_t revents, void *userdata) {
923a60
+        Server *s = userdata;
923a60
+        int r;
923a60
+
923a60
+        assert(s);
923a60
+        assert(s->notify_event_source == es);
923a60
+        assert(s->notify_fd == fd);
923a60
+
923a60
+        if (revents != EPOLLOUT) {
923a60
+                log_error("Invalid events on notify file descriptor.");
923a60
+                return -EINVAL;
923a60
+        }
923a60
+
923a60
+        /* The $NOTIFY_SOCKET is writable again, now send exactly one
923a60
+         * message on it. Either it's the initial READY=1 event or an
923a60
+         * stdout stream event. If there's nothing to write anymore,
923a60
+         * turn our event source off. The next time there's something
923a60
+         * to send it will be turned on again. */
923a60
+
923a60
+        if (!s->sent_notify_ready) {
923a60
+                static const char p[] =
923a60
+                        "READY=1\n"
923a60
+                        "STATUS=Processing requests...";
923a60
+                ssize_t l;
923a60
+
923a60
+                l = send(s->notify_fd, p, strlen(p), MSG_DONTWAIT);
923a60
+                if (l < 0) {
923a60
+                        if (errno == EAGAIN)
923a60
+                                return 0;
923a60
+
923a60
+                        return log_error_errno(errno, "Failed to send READY=1 notification message: %m");
923a60
+                }
923a60
+
923a60
+                s->sent_notify_ready = true;
923a60
+                log_debug("Sent READY=1 notification.");
923a60
+
923a60
+        } else if (s->stdout_streams_notify_queue)
923a60
+                /* Dispatch one stream notification event */
923a60
+                stdout_stream_send_notify(s->stdout_streams_notify_queue);
923a60
+
923a60
+        /* Leave us enabled if there's still more to to do. */
923a60
+        if (s->stdout_streams_notify_queue)
923a60
+                return 0;
923a60
+
923a60
+        /* There was nothing to do anymore, let's turn ourselves off. */
923a60
+        r = sd_event_source_set_enabled(es, SD_EVENT_OFF);
923a60
+        if (r < 0)
923a60
+                return log_error_errno(r, "Failed to turn off notify event source: %m");
923a60
+
923a60
+        return 0;
923a60
+}
923a60
+
923a60
+static int server_connect_notify(Server *s) {
923a60
+        union sockaddr_union sa = {
923a60
+                .un.sun_family = AF_UNIX,
923a60
+        };
923a60
+        const char *e;
923a60
+        int r;
923a60
+
923a60
+        assert(s);
923a60
+        assert(s->notify_fd < 0);
923a60
+        assert(!s->notify_event_source);
923a60
+
923a60
+        /*
923a60
+          So here's the problem: we'd like to send notification
923a60
+          messages to PID 1, but we cannot do that via sd_notify(),
923a60
+          since that's synchronous, and we might end up blocking on
923a60
+          it. Specifically: given that PID 1 might block on
923a60
+          dbus-daemon during IPC, and dbus-daemon is logging to us,
923a60
+          and might hence block on us, we might end up in a deadlock
923a60
+          if we block on sending PID 1 notification messages -- by
923a60
+          generating a full blocking circle. To avoid this, let's
923a60
+          create a non-blocking socket, and connect it to the
923a60
+          notification socket, and then wait for POLLOUT before we
923a60
+          send anything. This should efficiently avoid any deadlocks,
923a60
+          as we'll never block on PID 1, hence PID 1 can safely block
923a60
+          on dbus-daemon which can safely block on us again.
923a60
+
923a60
+          Don't think that this issue is real? It is, see:
923a60
+          https://github.com/systemd/systemd/issues/1505
923a60
+        */
923a60
+
923a60
+        e = getenv("NOTIFY_SOCKET");
923a60
+        if (!e)
923a60
+                return 0;
923a60
+
923a60
+        if ((e[0] != '@' && e[0] != '/') || e[1] == 0) {
923a60
+                log_error("NOTIFY_SOCKET set to an invalid value: %s", e);
923a60
+                return -EINVAL;
923a60
+        }
923a60
+
923a60
+        if (strlen(e) > sizeof(sa.un.sun_path)) {
923a60
+                log_error("NOTIFY_SOCKET path too long: %s", e);
923a60
+                return -EINVAL;
923a60
+        }
923a60
+
923a60
+        s->notify_fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
923a60
+        if (s->notify_fd < 0)
923a60
+                return log_error_errno(errno, "Failed to create notify socket: %m");
923a60
+
923a60
+        (void) fd_inc_sndbuf(s->notify_fd, NOTIFY_SNDBUF_SIZE);
923a60
+
923a60
+        strncpy(sa.un.sun_path, e, sizeof(sa.un.sun_path));
923a60
+        if (sa.un.sun_path[0] == '@')
923a60
+                sa.un.sun_path[0] = 0;
923a60
+
923a60
+        r = connect(s->notify_fd, &sa.sa, offsetof(struct sockaddr_un, sun_path) + strlen(e));
923a60
+        if (r < 0)
923a60
+                return log_error_errno(errno, "Failed to connect to notify socket: %m");
923a60
+
923a60
+        r = sd_event_add_io(s->event, &s->notify_event_source, s->notify_fd, EPOLLOUT, dispatch_notify_event, s);
923a60
+        if (r < 0)
923a60
+                return log_error_errno(r, "Failed to watch notification socket: %m");
923a60
+
923a60
+        /* This should fire pretty soon, which we'll use to send the
923a60
+         * READY=1 event. */
923a60
+
923a60
+        return 0;
923a60
+}
923a60
+
923a60
 int server_init(Server *s) {
923a60
         _cleanup_fdset_free_ FDSet *fds = NULL;
923a60
         int n, r, fd;
923a60
@@ -1563,7 +1685,7 @@ int server_init(Server *s) {
923a60
         assert(s);
923a60
 
923a60
         zero(*s);
923a60
-        s->syslog_fd = s->native_fd = s->stdout_fd = s->dev_kmsg_fd = s->audit_fd = s->hostname_fd = -1;
923a60
+        s->syslog_fd = s->native_fd = s->stdout_fd = s->dev_kmsg_fd = s->audit_fd = s->hostname_fd = s->notify_fd = -1;
923a60
         s->compress = true;
923a60
         s->seal = true;
923a60
 
923a60
@@ -1611,8 +1733,6 @@ int server_init(Server *s) {
923a60
         if (r < 0)
923a60
                 return log_error_errno(r, "Failed to create event loop: %m");
923a60
 
923a60
-        sd_event_set_watchdog(s->event, true);
923a60
-
923a60
         n = sd_listen_fds(true);
923a60
         if (n < 0)
923a60
                 return log_error_errno(n, "Failed to read listening file descriptors from environment: %m");
923a60
@@ -1718,6 +1838,8 @@ int server_init(Server *s) {
923a60
         server_cache_boot_id(s);
923a60
         server_cache_machine_id(s);
923a60
 
923a60
+        (void) server_connect_notify(s);
923a60
+
923a60
         r = system_journal_open(s, false);
923a60
         if (r < 0)
923a60
                 return r;
923a60
@@ -1770,6 +1892,7 @@ void server_done(Server *s) {
923a60
         sd_event_source_unref(s->sigterm_event_source);
923a60
         sd_event_source_unref(s->sigint_event_source);
923a60
         sd_event_source_unref(s->hostname_event_source);
923a60
+        sd_event_source_unref(s->notify_event_source);
923a60
         sd_event_unref(s->event);
923a60
 
923a60
         safe_close(s->syslog_fd);
923a60
@@ -1778,6 +1901,7 @@ void server_done(Server *s) {
923a60
         safe_close(s->dev_kmsg_fd);
923a60
         safe_close(s->audit_fd);
923a60
         safe_close(s->hostname_fd);
923a60
+        safe_close(s->notify_fd);
923a60
 
923a60
         if (s->rate_limit)
923a60
                 journal_rate_limit_free(s->rate_limit);
923a60
diff --git a/src/journal/journald-server.h b/src/journal/journald-server.h
923a60
index b294107788..e59ff35e22 100644
923a60
--- a/src/journal/journald-server.h
923a60
+++ b/src/journal/journald-server.h
923a60
@@ -27,12 +27,15 @@
923a60
 #include <sys/types.h>
923a60
 #include <sys/socket.h>
923a60
 
923a60
+typedef struct Server Server;
923a60
+
923a60
 #include "sd-event.h"
923a60
 #include "journal-file.h"
923a60
 #include "hashmap.h"
923a60
 #include "util.h"
923a60
 #include "audit.h"
923a60
 #include "journald-rate-limit.h"
923a60
+#include "journald-stream.h"
923a60
 #include "list.h"
923a60
 
923a60
 typedef enum Storage {
923a60
@@ -52,15 +55,14 @@ typedef enum SplitMode {
923a60
         _SPLIT_INVALID = -1
923a60
 } SplitMode;
923a60
 
923a60
-typedef struct StdoutStream StdoutStream;
923a60
-
923a60
-typedef struct Server {
923a60
+struct Server {
923a60
         int syslog_fd;
923a60
         int native_fd;
923a60
         int stdout_fd;
923a60
         int dev_kmsg_fd;
923a60
         int audit_fd;
923a60
         int hostname_fd;
923a60
+        int notify_fd;
923a60
 
923a60
         sd_event *event;
923a60
 
923a60
@@ -75,6 +77,7 @@ typedef struct Server {
923a60
         sd_event_source *sigterm_event_source;
923a60
         sd_event_source *sigint_event_source;
923a60
         sd_event_source *hostname_event_source;
923a60
+        sd_event_source *notify_event_source;
923a60
 
923a60
         JournalFile *runtime_journal;
923a60
         JournalFile *system_journal;
923a60
@@ -114,6 +117,7 @@ typedef struct Server {
923a60
         usec_t oldest_file_usec;
923a60
 
923a60
         LIST_HEAD(StdoutStream, stdout_streams);
923a60
+        LIST_HEAD(StdoutStream, stdout_streams_notify_queue);
923a60
         unsigned n_stdout_streams;
923a60
 
923a60
         char *tty_path;
923a60
@@ -135,6 +139,7 @@ typedef struct Server {
923a60
 
923a60
         struct udev *udev;
923a60
 
923a60
+        bool sent_notify_ready;
923a60
         bool sync_scheduled;
923a60
 
923a60
         char machine_id_field[sizeof("_MACHINE_ID=") + 32];
923a60
@@ -145,7 +150,7 @@ typedef struct Server {
923a60
         char *cgroup_root;
923a60
 
923a60
         size_t line_max;
923a60
-} Server;
923a60
+};
923a60
 
923a60
 #define N_IOVEC_META_FIELDS 20
923a60
 #define N_IOVEC_KERNEL_FIELDS 64
923a60
diff --git a/src/journal/journald-stream.c b/src/journal/journald-stream.c
923a60
index 4d6b7ad189..9118d1a318 100644
923a60
--- a/src/journal/journald-stream.c
923a60
+++ b/src/journal/journald-stream.c
923a60
@@ -80,6 +80,7 @@ struct StdoutStream {
923a60
         bool forward_to_console:1;
923a60
 
923a60
         bool fdstore:1;
923a60
+        bool in_notify_queue:1;
923a60
 
923a60
         char *buffer;
923a60
         size_t length;
923a60
@@ -90,6 +91,7 @@ struct StdoutStream {
923a60
         char *state_file;
923a60
 
923a60
         LIST_FIELDS(StdoutStream, stdout_stream);
923a60
+        LIST_FIELDS(StdoutStream, stdout_stream_notify_queue);
923a60
 
923a60
         char id_field[sizeof("_STREAM_ID=")-1 + SD_ID128_STRING_MAX];
923a60
 };
923a60
@@ -102,6 +104,9 @@ void stdout_stream_free(StdoutStream *s) {
923a60
                 assert(s->server->n_stdout_streams > 0);
923a60
                 s->server->n_stdout_streams --;
923a60
                 LIST_REMOVE(stdout_stream, s->server->stdout_streams, s);
923a60
+
923a60
+                if (s->in_notify_queue)
923a60
+                        LIST_REMOVE(stdout_stream_notify_queue, s->server->stdout_streams_notify_queue, s);
923a60
         }
923a60
 
923a60
         if (s->event_source) {
923a60
@@ -126,7 +131,7 @@ static void stdout_stream_destroy(StdoutStream *s) {
923a60
                 return;
923a60
 
923a60
         if (s->state_file)
923a60
-                unlink(s->state_file);
923a60
+                (void) unlink(s->state_file);
923a60
 
923a60
         stdout_stream_free(s);
923a60
 }
923a60
@@ -210,11 +215,15 @@ static int stdout_stream_save(StdoutStream *s) {
923a60
         free(temp_path);
923a60
         temp_path = NULL;
923a60
 
923a60
-        /* Store the connection fd in PID 1, so that we get it passed
923a60
-         * in again on next start */
923a60
-        if (!s->fdstore) {
923a60
-                sd_pid_notify_with_fds(0, false, "FDSTORE=1", &s->fd, 1);
923a60
-                s->fdstore = true;
923a60
+        if (!s->fdstore && !s->in_notify_queue) {
923a60
+                LIST_PREPEND(stdout_stream_notify_queue, s->server->stdout_streams_notify_queue, s);
923a60
+                s->in_notify_queue = true;
923a60
+
923a60
+                if (s->server->notify_event_source) {
923a60
+                        r = sd_event_source_set_enabled(s->server->notify_event_source, SD_EVENT_ON);
923a60
+                        if (r < 0)
923a60
+                                log_warning_errno(r, "Failed to enable notify event source: %m");
923a60
+                }
923a60
         }
923a60
 
923a60
 finish:
923a60
@@ -801,3 +810,50 @@ int server_open_stdout_socket(Server *s, FDSet *fds) {
923a60
 
923a60
         return 0;
923a60
 }
923a60
+
923a60
+void stdout_stream_send_notify(StdoutStream *s) {
923a60
+        struct iovec iovec = {
923a60
+                .iov_base = (char*) "FDSTORE=1",
923a60
+                .iov_len = strlen("FDSTORE=1"),
923a60
+        };
923a60
+        struct msghdr msghdr = {
923a60
+                .msg_iov = &iovec,
923a60
+                .msg_iovlen = 1,
923a60
+        };
923a60
+        struct cmsghdr *cmsg;
923a60
+        ssize_t l;
923a60
+
923a60
+        assert(s);
923a60
+        assert(!s->fdstore);
923a60
+        assert(s->in_notify_queue);
923a60
+        assert(s->server);
923a60
+        assert(s->server->notify_fd >= 0);
923a60
+
923a60
+        /* Store the connection fd in PID 1, so that we get it passed
923a60
+         * in again on next start */
923a60
+
923a60
+        msghdr.msg_controllen = CMSG_SPACE(sizeof(int));
923a60
+        msghdr.msg_control = alloca0(msghdr.msg_controllen);
923a60
+
923a60
+        cmsg = CMSG_FIRSTHDR(&msghdr);
923a60
+        cmsg->cmsg_level = SOL_SOCKET;
923a60
+        cmsg->cmsg_type = SCM_RIGHTS;
923a60
+        cmsg->cmsg_len = CMSG_LEN(sizeof(int));
923a60
+
923a60
+        memcpy(CMSG_DATA(cmsg), &s->fd, sizeof(int));
923a60
+
923a60
+        l = sendmsg(s->server->notify_fd, &msghdr, MSG_DONTWAIT|MSG_NOSIGNAL);
923a60
+        if (l < 0) {
923a60
+                if (errno == EAGAIN)
923a60
+                        return;
923a60
+
923a60
+                log_error_errno(errno, "Failed to send stream file descriptor to service manager: %m");
923a60
+        } else {
923a60
+                log_debug("Successfully sent stream file descriptor to service manager.");
923a60
+                s->fdstore = 1;
923a60
+        }
923a60
+
923a60
+        LIST_REMOVE(stdout_stream_notify_queue, s->server->stdout_streams_notify_queue, s);
923a60
+        s->in_notify_queue = false;
923a60
+
923a60
+}
923a60
diff --git a/src/journal/journald-stream.h b/src/journal/journald-stream.h
923a60
index 94bf955d78..fd8c94fb65 100644
923a60
--- a/src/journal/journald-stream.h
923a60
+++ b/src/journal/journald-stream.h
923a60
@@ -21,9 +21,12 @@
923a60
   along with systemd; If not, see <http://www.gnu.org/licenses/>.
923a60
 ***/
923a60
 
923a60
+typedef struct StdoutStream StdoutStream;
923a60
+
923a60
 #include "fdset.h"
923a60
 #include "journald-server.h"
923a60
 
923a60
 int server_open_stdout_socket(Server *s, FDSet *fds);
923a60
 
923a60
 void stdout_stream_free(StdoutStream *s);
923a60
+void stdout_stream_send_notify(StdoutStream *s);
923a60
diff --git a/src/journal/journald.c b/src/journal/journald.c
923a60
index 15bbcbe3de..b7ba2b6ec5 100644
923a60
--- a/src/journal/journald.c
923a60
+++ b/src/journal/journald.c
923a60
@@ -64,10 +64,6 @@ int main(int argc, char *argv[]) {
923a60
         log_debug("systemd-journald running as pid "PID_FMT, getpid());
923a60
         server_driver_message(&server, SD_MESSAGE_JOURNAL_START, "Journal started");
923a60
 
923a60
-        sd_notify(false,
923a60
-                  "READY=1\n"
923a60
-                  "STATUS=Processing requests...");
923a60
-
923a60
         for (;;) {
923a60
                 usec_t t = USEC_INFINITY, n;
923a60
 
923a60
@@ -120,10 +116,6 @@ int main(int argc, char *argv[]) {
923a60
         server_driver_message(&server, SD_MESSAGE_JOURNAL_STOP, "Journal stopped");
923a60
 
923a60
 finish:
923a60
-        sd_notify(false,
923a60
-                  "STOPPING=1\n"
923a60
-                  "STATUS=Shutting down...");
923a60
-
923a60
         server_done(&server);
923a60
 
923a60
         return r < 0 ? EXIT_FAILURE : EXIT_SUCCESS;
923a60
diff --git a/units/systemd-journald.service.in b/units/systemd-journald.service.in
923a60
index 0d1ea61fe8..c94c0bfba1 100644
923a60
--- a/units/systemd-journald.service.in
923a60
+++ b/units/systemd-journald.service.in
923a60
@@ -22,7 +22,6 @@ RestartSec=0
923a60
 StandardOutput=null
923a60
 FileDescriptorStoreMax=4224
923a60
 CapabilityBoundingSet=CAP_SYS_ADMIN CAP_DAC_OVERRIDE CAP_SYS_PTRACE CAP_SYSLOG CAP_AUDIT_CONTROL CAP_AUDIT_READ CAP_CHOWN CAP_DAC_READ_SEARCH CAP_FOWNER CAP_SETUID CAP_SETGID CAP_MAC_OVERRIDE
923a60
-WatchdogSec=3min
923a60
 
923a60
 # Increase the default a bit in order to allow many simultaneous
923a60
 # services being run since we keep one fd open per service. Also, when