a9339c
From 13bcf85ffab4b4e67039599246604a3f5b503975 Mon Sep 17 00:00:00 2001
a9339c
From: David Tardon <dtardon@redhat.com>
a9339c
Date: Tue, 24 Apr 2018 15:19:38 +0200
a9339c
Subject: [PATCH] fix race between daemon-reload and other commands
a9339c
a9339c
When "systemctl daemon-reload" is run at the same time as "systemctl
a9339c
start foo", the latter might hang. That's because commands like start
a9339c
wait for JobRemoved signal to know when the job is finished. But if the
a9339c
job is finished during reloading, the signal is never sent.
a9339c
a9339c
The hang can be easily reproduced by running
a9339c
a9339c
    # for ((N=1; N>0; N++)) ; do echo $N ; systemctl daemon-reload ; done
a9339c
    # for ((N=1; N>0; N++)) ; do echo $N ; systemctl start systemd-coredump.socket ; done
a9339c
a9339c
in two different terminals. The start command will hang after 1-2
a9339c
iterations.
a9339c
a9339c
This keeps track of jobs that were started before reload and finished
a9339c
during it and sends JobRemoved after the reload has finished.
a9339c
a9339c
(cherry picked from commit a7a7163df7fc8a9f794f6803b2f6c9c9b0745a1f)
a9339c
---
a9339c
 src/core/job.c     | 45 ++++++++++++++++++++++++++++++++++++++++-----
a9339c
 src/core/job.h     |  2 ++
a9339c
 src/core/manager.c | 15 +++++++++++++++
a9339c
 src/core/manager.h |  3 +++
a9339c
 4 files changed, 60 insertions(+), 5 deletions(-)
a9339c
a9339c
diff --git a/src/core/job.c b/src/core/job.c
a9339c
index 1861c8a63..275503169 100644
a9339c
--- a/src/core/job.c
a9339c
+++ b/src/core/job.c
a9339c
@@ -53,6 +53,7 @@ Job* job_new_raw(Unit *unit) {
a9339c
         j->manager = unit->manager;
a9339c
         j->unit = unit;
a9339c
         j->type = _JOB_TYPE_INVALID;
a9339c
+        j->reloaded = false;
a9339c
 
a9339c
         return j;
a9339c
 }
a9339c
@@ -74,7 +75,7 @@ Job* job_new(Unit *unit, JobType type) {
a9339c
         return j;
a9339c
 }
a9339c
 
a9339c
-void job_free(Job *j) {
a9339c
+void job_unlink(Job *j) {
a9339c
         assert(j);
a9339c
         assert(!j->installed);
a9339c
         assert(!j->transaction_prev);
a9339c
@@ -82,13 +83,28 @@ void job_free(Job *j) {
a9339c
         assert(!j->subject_list);
a9339c
         assert(!j->object_list);
a9339c
 
a9339c
-        if (j->in_run_queue)
a9339c
+        if (j->in_run_queue) {
a9339c
                 LIST_REMOVE(run_queue, j->manager->run_queue, j);
a9339c
+                j->in_run_queue = false;
a9339c
+        }
a9339c
 
a9339c
-        if (j->in_dbus_queue)
a9339c
+        if (j->in_dbus_queue) {
a9339c
                 LIST_REMOVE(dbus_queue, j->manager->dbus_job_queue, j);
a9339c
+                j->in_dbus_queue = false;
a9339c
+        }
a9339c
+
a9339c
+        j->timer_event_source = sd_event_source_unref(j->timer_event_source);
a9339c
+}
a9339c
+
a9339c
+void job_free(Job *j) {
a9339c
+        assert(j);
a9339c
+        assert(!j->installed);
a9339c
+        assert(!j->transaction_prev);
a9339c
+        assert(!j->transaction_next);
a9339c
+        assert(!j->subject_list);
a9339c
+        assert(!j->object_list);
a9339c
 
a9339c
-        sd_event_source_unref(j->timer_event_source);
a9339c
+        job_unlink(j);
a9339c
 
a9339c
         sd_bus_track_unref(j->clients);
a9339c
         strv_free(j->deserialized_clients);
a9339c
@@ -246,6 +262,7 @@ int job_install_deserialized(Job *j) {
a9339c
 
a9339c
         *pj = j;
a9339c
         j->installed = true;
a9339c
+        j->reloaded = true;
a9339c
 
a9339c
         if (j->state == JOB_RUNNING)
a9339c
                 j->unit->manager->n_running_jobs++;
a9339c
@@ -790,6 +807,19 @@ static void job_emit_status_message(Unit *u, JobType t, JobResult result) {
a9339c
                 job_print_status_message(u, t, result);
a9339c
 }
a9339c
 
a9339c
+static int job_save_pending_finished_job(Job *j) {
a9339c
+        int r;
a9339c
+
a9339c
+        assert(j);
a9339c
+
a9339c
+        r = set_ensure_allocated(&j->manager->pending_finished_jobs, NULL);
a9339c
+        if (r < 0)
a9339c
+                return r;
a9339c
+
a9339c
+        job_unlink(j);
a9339c
+        return set_put(j->manager->pending_finished_jobs, j);
a9339c
+}
a9339c
+
a9339c
 int job_finish_and_invalidate(Job *j, JobResult result, bool recursive, bool already) {
a9339c
         Unit *u;
a9339c
         Unit *other;
a9339c
@@ -829,7 +859,12 @@ int job_finish_and_invalidate(Job *j, JobResult result, bool recursive, bool alr
a9339c
                 j->manager->n_failed_jobs ++;
a9339c
 
a9339c
         job_uninstall(j);
a9339c
-        job_free(j);
a9339c
+        /* Remember jobs started before the reload */
a9339c
+        if (j->manager->n_reloading > 0 && j->reloaded) {
a9339c
+                if (job_save_pending_finished_job(j) < 0)
a9339c
+                        job_free(j);
a9339c
+        } else
a9339c
+                job_free(j);
a9339c
 
a9339c
         /* Fail depending jobs on failure */
a9339c
         if (result != JOB_DONE && recursive) {
a9339c
diff --git a/src/core/job.h b/src/core/job.h
a9339c
index 535052b48..4ae6f2802 100644
a9339c
--- a/src/core/job.h
a9339c
+++ b/src/core/job.h
a9339c
@@ -172,10 +172,12 @@ struct Job {
a9339c
         bool sent_dbus_new_signal:1;
a9339c
         bool ignore_order:1;
a9339c
         bool irreversible:1;
a9339c
+        bool reloaded:1;
a9339c
 };
a9339c
 
a9339c
 Job* job_new(Unit *unit, JobType type);
a9339c
 Job* job_new_raw(Unit *unit);
a9339c
+void job_unlink(Job *job);
a9339c
 void job_free(Job *job);
a9339c
 Job* job_install(Job *j);
a9339c
 int job_install_deserialized(Job *j);
a9339c
diff --git a/src/core/manager.c b/src/core/manager.c
a9339c
index 47b09e1e9..9c406bb5b 100644
a9339c
--- a/src/core/manager.c
a9339c
+++ b/src/core/manager.c
a9339c
@@ -2702,6 +2702,18 @@ finish:
a9339c
         return r;
a9339c
 }
a9339c
 
a9339c
+static void manager_flush_finished_jobs(Manager *m) {
a9339c
+        Job *j;
a9339c
+
a9339c
+        while ((j = set_steal_first(m->pending_finished_jobs))) {
a9339c
+                bus_job_send_removed_signal(j);
a9339c
+                job_free(j);
a9339c
+        }
a9339c
+
a9339c
+        set_free(m->pending_finished_jobs);
a9339c
+        m->pending_finished_jobs = NULL;
a9339c
+}
a9339c
+
a9339c
 int manager_reload(Manager *m) {
a9339c
         int r, q;
a9339c
         _cleanup_fclose_ FILE *f = NULL;
a9339c
@@ -2784,6 +2796,9 @@ int manager_reload(Manager *m) {
a9339c
         assert(m->n_reloading > 0);
a9339c
         m->n_reloading--;
a9339c
 
a9339c
+        if (m->n_reloading <= 0)
a9339c
+                manager_flush_finished_jobs(m);
a9339c
+
a9339c
         m->send_reloading_done = true;
a9339c
 
a9339c
         return r;
a9339c
diff --git a/src/core/manager.h b/src/core/manager.h
a9339c
index e91e7bd8b..90d2d982e 100644
a9339c
--- a/src/core/manager.h
a9339c
+++ b/src/core/manager.h
a9339c
@@ -270,6 +270,9 @@ struct Manager {
a9339c
 
a9339c
         /* non-zero if we are reloading or reexecuting, */
a9339c
         int n_reloading;
a9339c
+        /* A set which contains all jobs that started before reload and finished
a9339c
+         * during it */
a9339c
+        Set *pending_finished_jobs;
a9339c
 
a9339c
         unsigned n_installed_jobs;
a9339c
         unsigned n_failed_jobs;