65878a
From 1386240aee3f78a9101a118f11a7028571d33a71 Mon Sep 17 00:00:00 2001
65878a
From: Michal Sekletar <msekleta@redhat.com>
65878a
Date: Thu, 27 Feb 2014 18:16:19 +0100
65878a
Subject: [PATCH] core: watch SIGCHLD more closely to track processes of units
65878a
 with no reliable cgroup empty notifier
65878a
65878a
When a process dies that we can associate with a specific unit, start
65878a
watching all other processes of that unit, so that we can associate
65878a
those processes with the unit too.
65878a
65878a
Also, for service units start doing this as soon as we get the first
65878a
SIGCHLD for either control or main process, so that we can follow the
65878a
processes of the service from one to the other, as long as the processes that
65878a
remain are children of the ones we watched that died and got reassigned
65878a
to us as parent.
65878a
65878a
Similarly, for scope units start doing this as soon as the scope
65878a
controller abandons the unit, and thus management entirely reverts to
65878a
systemd. To abandon a unit introduce a new Abandon() scope unit method
65878a
call.
65878a
65878a
Based-on: a911bb9ab27ac0eb3bbf4e8b4109e5da9b88eee3
65878a
---
65878a
 src/core/dbus-scope.c |  36 +++++++++----
65878a
 src/core/manager.c    |   2 +-
65878a
 src/core/scope.c      |  87 ++++++++++++++++++++++---------
65878a
 src/core/scope.h      |   5 +-
65878a
 src/core/service.c    | 140 ++++++++++++++++++++++++++++++--------------------
65878a
 src/core/unit.c       | 112 +++++++++++++++++++++++++++++++++++++++-
65878a
 src/core/unit.h       |   9 ++++
65878a
 7 files changed, 298 insertions(+), 93 deletions(-)
65878a
65878a
diff --git a/src/core/dbus-scope.c b/src/core/dbus-scope.c
65878a
index b576f76..58dd9ff 100644
65878a
--- a/src/core/dbus-scope.c
65878a
+++ b/src/core/dbus-scope.c
65878a
@@ -30,6 +30,7 @@
65878a
 
65878a
 #define BUS_SCOPE_INTERFACE                                             \
65878a
         " <interface name=\"org.freedesktop.systemd1.Scope\">\n"        \
65878a
+        "  <method name=\"Abandon\"/>\n"                                \
65878a
         BUS_UNIT_CGROUP_INTERFACE                                       \
65878a
         "  <property name=\"Controller\" type=\"s\" access=\"read\"/>\n"\
65878a
         "  <property name=\"TimeoutStopUSec\" type=\"t\" access=\"read\"/>\n" \
65878a
@@ -66,19 +67,40 @@ static const BusProperty bus_scope_properties[] = {
65878a
 
65878a
 DBusHandlerResult bus_scope_message_handler(Unit *u, DBusConnection *c, DBusMessage *message) {
65878a
         Scope *s = SCOPE(u);
65878a
+        _cleanup_dbus_message_unref_ DBusMessage *reply = NULL;
65878a
 
65878a
-        const BusBoundProperties bps[] = {
65878a
+        SELINUX_UNIT_ACCESS_CHECK(u, c, message, "status");
65878a
+
65878a
+        if (dbus_message_is_method_call(message, "org.freedesktop.systemd1.Scope", "Abandon")) {
65878a
+                int r;
65878a
+
65878a
+                r = scope_abandon(s);
65878a
+                if (r < 0)
65878a
+                        log_error("Failed to mark scope %s as abandoned : %s", UNIT(s)->id, strerror(-r));
65878a
+
65878a
+                reply = dbus_message_new_method_return(message);
65878a
+                if (!reply)
65878a
+                        goto oom;
65878a
+        } else {
65878a
+                const BusBoundProperties bps[] = {
65878a
                 { "org.freedesktop.systemd1.Unit",  bus_unit_properties,           u },
65878a
                 { "org.freedesktop.systemd1.Scope", bus_unit_cgroup_properties,    u },
65878a
                 { "org.freedesktop.systemd1.Scope", bus_scope_properties,          s },
65878a
                 { "org.freedesktop.systemd1.Scope", bus_cgroup_context_properties, &s->cgroup_context },
65878a
                 { "org.freedesktop.systemd1.Scope", bus_kill_context_properties,   &s->kill_context   },
65878a
                 {}
65878a
-        };
65878a
+                };
65878a
 
65878a
-        SELINUX_UNIT_ACCESS_CHECK(u, c, message, "status");
65878a
+               return  bus_default_message_handler(c, message, INTROSPECTION, INTERFACES_LIST, bps);
65878a
+        }
65878a
+
65878a
+        if (reply)
65878a
+                if (!bus_maybe_send_reply(c, message, reply))
65878a
+                        goto oom;
65878a
 
65878a
-        return bus_default_message_handler(c, message, INTROSPECTION, INTERFACES_LIST, bps);
65878a
+        return DBUS_HANDLER_RESULT_HANDLED;
65878a
+oom:
65878a
+        return DBUS_HANDLER_RESULT_NEED_MEMORY;
65878a
 }
65878a
 
65878a
 static int bus_scope_set_transient_property(
65878a
@@ -102,10 +124,6 @@ static int bus_scope_set_transient_property(
65878a
                     dbus_message_iter_get_element_type(i) != DBUS_TYPE_UINT32)
65878a
                         return -EINVAL;
65878a
 
65878a
-                r = set_ensure_allocated(&s->pids, trivial_hash_func, trivial_compare_func);
65878a
-                if (r < 0)
65878a
-                        return r;
65878a
-
65878a
                 dbus_message_iter_recurse(i, &sub);
65878a
                 while (dbus_message_iter_get_arg_type(&sub) == DBUS_TYPE_UINT32) {
65878a
                         uint32_t pid;
65878a
@@ -116,7 +134,7 @@ static int bus_scope_set_transient_property(
65878a
                                 return -EINVAL;
65878a
 
65878a
                         if (mode != UNIT_CHECK) {
65878a
-                                r = set_put(s->pids, LONG_TO_PTR(pid));
65878a
+                                r = unit_watch_pid(UNIT(s), pid);
65878a
                                 if (r < 0 && r != -EEXIST)
65878a
                                         return r;
65878a
                         }
65878a
diff --git a/src/core/manager.c b/src/core/manager.c
65878a
index a34a3c6..db5094f 100644
65878a
--- a/src/core/manager.c
65878a
+++ b/src/core/manager.c
65878a
@@ -1389,7 +1389,7 @@ static int manager_dispatch_sigchld(Manager *m) {
65878a
                 log_debug_unit(u->id,
65878a
                                "Child %lu belongs to %s", (long unsigned) si.si_pid, u->id);
65878a
 
65878a
-                hashmap_remove(m->watch_pids, LONG_TO_PTR(si.si_pid));
65878a
+                unit_unwatch_pid(u, si.si_pid);
65878a
                 UNIT_VTABLE(u)->sigchld_event(u, si.si_pid, si.si_code, si.si_status);
65878a
         }
65878a
 
65878a
diff --git a/src/core/scope.c b/src/core/scope.c
65878a
index e75fc2b..22bdfb2 100644
65878a
--- a/src/core/scope.c
65878a
+++ b/src/core/scope.c
65878a
@@ -35,6 +35,7 @@
65878a
 static const UnitActiveState state_translation_table[_SCOPE_STATE_MAX] = {
65878a
         [SCOPE_DEAD] = UNIT_INACTIVE,
65878a
         [SCOPE_RUNNING] = UNIT_ACTIVE,
65878a
+        [SCOPE_ABANDONED] = UNIT_ACTIVE,
65878a
         [SCOPE_STOP_SIGTERM] = UNIT_DEACTIVATING,
65878a
         [SCOPE_STOP_SIGKILL] = UNIT_DEACTIVATING,
65878a
         [SCOPE_FAILED] = UNIT_FAILED
65878a
@@ -67,9 +68,6 @@ static void scope_done(Unit *u) {
65878a
         free(s->controller);
65878a
         s->controller = NULL;
65878a
 
65878a
-        set_free(s->pids);
65878a
-        s->pids = NULL;
65878a
-
65878a
         unit_unwatch_timer(u, &s->timer_watch);
65878a
 }
65878a
 
65878a
@@ -84,6 +82,9 @@ static void scope_set_state(Scope *s, ScopeState state) {
65878a
             state != SCOPE_STOP_SIGKILL)
65878a
                 unit_unwatch_timer(UNIT(s), &s->timer_watch);
65878a
 
65878a
+        if (state == SCOPE_DEAD || state == SCOPE_FAILED)
65878a
+                unit_unwatch_all_pids(UNIT(s));
65878a
+
65878a
         if (state != old_state)
65878a
                 log_debug("%s changed %s -> %s",
65878a
                           UNIT(s)->id,
65878a
@@ -115,7 +116,7 @@ static int scope_verify(Scope *s) {
65878a
         if (UNIT(s)->load_state != UNIT_LOADED)
65878a
                 return 0;
65878a
 
65878a
-        if (set_size(s->pids) <= 0 && UNIT(s)->manager->n_reloading <= 0) {
65878a
+        if (set_size(UNIT(s)->pids) <= 0 && UNIT(s)->manager->n_reloading <= 0) {
65878a
                 log_error_unit(UNIT(s)->id, "Scope %s has no PIDs. Refusing.", UNIT(s)->id);
65878a
                 return -EINVAL;
65878a
         }
65878a
@@ -169,6 +170,9 @@ static int scope_coldplug(Unit *u) {
65878a
                                 return r;
65878a
                 }
65878a
 
65878a
+                if (s->deserialized_state != SCOPE_DEAD && s->deserialized_state != SCOPE_FAILED)
65878a
+                        unit_watch_all_pids(UNIT(s));
65878a
+
65878a
                 scope_set_state(s, s->deserialized_state);
65878a
         }
65878a
 
65878a
@@ -209,6 +213,8 @@ static void scope_enter_signal(Scope *s, ScopeState state, ScopeResult f) {
65878a
         if (f != SCOPE_SUCCESS)
65878a
                 s->result = f;
65878a
 
65878a
+        unit_watch_all_pids(UNIT(s));
65878a
+
65878a
         /* If we have a controller set let's ask the controller nicely
65878a
          * to terminate the scope, instead of us going directly into
65878a
          * SIGTERM beserk mode */
65878a
@@ -271,13 +277,10 @@ static int scope_start(Unit *u) {
65878a
                 return r;
65878a
         }
65878a
 
65878a
-        r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, s->pids);
65878a
+        r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, UNIT(s)->pids);
65878a
         if (r < 0)
65878a
                 return r;
65878a
 
65878a
-        set_free(s->pids);
65878a
-        s->pids = NULL;
65878a
-
65878a
         s->result = SCOPE_SUCCESS;
65878a
 
65878a
         scope_set_state(s, SCOPE_RUNNING);
65878a
@@ -288,13 +291,13 @@ static int scope_stop(Unit *u) {
65878a
         Scope *s = SCOPE(u);
65878a
 
65878a
         assert(s);
65878a
-        assert(s->state == SCOPE_RUNNING);
65878a
 
65878a
         if (s->state == SCOPE_STOP_SIGTERM ||
65878a
             s->state == SCOPE_STOP_SIGKILL)
65878a
                 return 0;
65878a
 
65878a
-        assert(s->state == SCOPE_RUNNING);
65878a
+        assert(s->state == SCOPE_RUNNING ||
65878a
+               s->state == SCOPE_ABANDONED);
65878a
 
65878a
         scope_enter_signal(s, SCOPE_STOP_SIGTERM, SCOPE_SUCCESS);
65878a
         return 0;
65878a
@@ -358,7 +361,7 @@ static bool scope_check_gc(Unit *u) {
65878a
         /* Never clean up scopes that still have a process around,
65878a
          * even if the scope is formally dead. */
65878a
 
65878a
-        if (UNIT(s)->cgroup_path) {
65878a
+        if (u->cgroup_path) {
65878a
                 r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, UNIT(s)->cgroup_path, true);
65878a
                 if (r <= 0)
65878a
                         return true;
65878a
@@ -367,6 +370,33 @@ static bool scope_check_gc(Unit *u) {
65878a
         return false;
65878a
 }
65878a
 
65878a
+static void scope_notify_cgroup_empty_event(Unit *u) {
65878a
+        Scope *s = SCOPE(u);
65878a
+
65878a
+        assert(u);
65878a
+
65878a
+        log_debug_unit(u->id, "%s: cgroup is empty", u->id);
65878a
+        /* Cgroup emptied: finish the scope off, but only from a running, abandoned or stopping state */
65878a
+        if (s->state == SCOPE_RUNNING || s->state == SCOPE_ABANDONED ||
65878a
+            s->state == SCOPE_STOP_SIGTERM || s->state == SCOPE_STOP_SIGKILL)
65878a
+                scope_enter_dead(s, SCOPE_SUCCESS);
65878a
+}
65878a
+
65878a
+static void scope_sigchld_event(Unit *u, pid_t pid, int code, int status) {
65878a
+        /* If we get a SIGCHLD event for one of the processes we were
65878a
+           interested in, then we look for others to watch, under the
65878a
+           assumption that we'll sooner or later get a SIGCHLD for
65878a
+           them, as the original process we watched was probably the
65878a
+           parent of them, and they are hence now our children. */
65878a
+
65878a
+        unit_tidy_watch_pids(u, 0, 0);
65878a
+        unit_watch_all_pids(u);
65878a
+
65878a
+        /* If the PID set is empty now, then let's finish this off */
65878a
+        if (set_isempty(u->pids))
65878a
+                scope_notify_cgroup_empty_event(u);
65878a
+}
65878a
+
65878a
 static void scope_timer_event(Unit *u, uint64_t elapsed, Watch*w) {
65878a
         Scope *s = SCOPE(u);
65878a
 
65878a
@@ -397,24 +427,30 @@ static void scope_timer_event(Unit *u, uint64_t elapsed, Watch*w) {
65878a
         }
65878a
 }
65878a
 
65878a
-static void scope_notify_cgroup_empty_event(Unit *u) {
65878a
-        Scope *s = SCOPE(u);
65878a
-        assert(u);
65878a
+int scope_abandon(Scope *s) {
65878a
+        assert(s);
65878a
 
65878a
-        log_debug_unit(u->id, "%s: cgroup is empty", u->id);
65878a
+        if (s->state != SCOPE_RUNNING && s->state != SCOPE_ABANDONED)
65878a
+                return -ESTALE;
65878a
 
65878a
-        switch (s->state) {
65878a
+        free(s->controller);
65878a
+        s->controller = NULL;
65878a
 
65878a
-        case SCOPE_RUNNING:
65878a
-        case SCOPE_STOP_SIGTERM:
65878a
-        case SCOPE_STOP_SIGKILL:
65878a
-                scope_enter_dead(s, SCOPE_SUCCESS);
65878a
+        /* The client is no longer watching the remaining processes,
65878a
+         * so let's step in here, under the assumption that the
65878a
+         * remaining processes will be sooner or later reassigned to
65878a
+         * us as parent. */
65878a
 
65878a
-                break;
65878a
+        unit_tidy_watch_pids(UNIT(s), 0, 0);
65878a
+        unit_watch_all_pids(UNIT(s));
65878a
 
65878a
-        default:
65878a
-                ;
65878a
-        }
65878a
+        /* If the PID set is empty now, then let's finish this off */
65878a
+        if (set_isempty(UNIT(s)->pids))
65878a
+                scope_notify_cgroup_empty_event(UNIT(s));
65878a
+        else
65878a
+                scope_set_state(s, SCOPE_ABANDONED);
65878a
+
65878a
+        return 0;
65878a
 }
65878a
 
65878a
 _pure_ static UnitActiveState scope_active_state(Unit *u) {
65878a
@@ -432,6 +468,7 @@ _pure_ static const char *scope_sub_state_to_string(Unit *u) {
65878a
 static const char* const scope_state_table[_SCOPE_STATE_MAX] = {
65878a
         [SCOPE_DEAD] = "dead",
65878a
         [SCOPE_RUNNING] = "running",
65878a
+        [SCOPE_ABANDONED] = "abandoned",
65878a
         [SCOPE_STOP_SIGTERM] = "stop-sigterm",
65878a
         [SCOPE_STOP_SIGKILL] = "stop-sigkill",
65878a
         [SCOPE_FAILED] = "failed",
65878a
@@ -481,6 +518,8 @@ const UnitVTable scope_vtable = {
65878a
 
65878a
         .check_gc = scope_check_gc,
65878a
 
65878a
+        .sigchld_event = scope_sigchld_event,
65878a
+
65878a
         .timer_event = scope_timer_event,
65878a
 
65878a
         .reset_failed = scope_reset_failed,
65878a
diff --git a/src/core/scope.h b/src/core/scope.h
65878a
index b4bafa7..1e9f201 100644
65878a
--- a/src/core/scope.h
65878a
+++ b/src/core/scope.h
65878a
@@ -29,6 +29,7 @@ typedef struct Scope Scope;
65878a
 typedef enum ScopeState {
65878a
         SCOPE_DEAD,
65878a
         SCOPE_RUNNING,
65878a
+        SCOPE_ABANDONED,
65878a
         SCOPE_STOP_SIGTERM,
65878a
         SCOPE_STOP_SIGKILL,
65878a
         SCOPE_FAILED,
65878a
@@ -57,13 +58,13 @@ struct Scope {
65878a
 
65878a
         char *controller;
65878a
 
65878a
-        Set *pids;
65878a
-
65878a
         Watch timer_watch;
65878a
 };
65878a
 
65878a
 extern const UnitVTable scope_vtable;
65878a
 
65878a
+int scope_abandon(Scope *s);
65878a
+
65878a
 const char* scope_state_to_string(ScopeState i) _const_;
65878a
 ScopeState scope_state_from_string(const char *s) _pure_;
65878a
 
65878a
diff --git a/src/core/service.c b/src/core/service.c
65878a
index f0acda1..41e5cb5 100644
65878a
--- a/src/core/service.c
65878a
+++ b/src/core/service.c
65878a
@@ -1546,6 +1546,11 @@ static void service_set_state(Service *s, ServiceState state) {
65878a
                 s->control_command_id = _SERVICE_EXEC_COMMAND_INVALID;
65878a
         }
65878a
 
65878a
+        if (state == SERVICE_DEAD ||
65878a
+            state == SERVICE_FAILED ||
65878a
+            state == SERVICE_AUTO_RESTART)
65878a
+                unit_unwatch_all_pids(UNIT(s));
65878a
+
65878a
         if (state != SERVICE_START_PRE &&
65878a
             state != SERVICE_START &&
65878a
             state != SERVICE_START_POST &&
65878a
@@ -1661,8 +1666,14 @@ static int service_coldplug(Unit *u) {
65878a
                                         return r;
65878a
                         }
65878a
 
65878a
+                if (s->deserialized_state != SERVICE_DEAD &&
65878a
+                    s->deserialized_state != SERVICE_FAILED &&
65878a
+                    s->deserialized_state != SERVICE_AUTO_RESTART)
65878a
+                        unit_watch_all_pids(UNIT(s));
65878a
+
65878a
                 if (s->deserialized_state == SERVICE_START_POST ||
65878a
-                    s->deserialized_state == SERVICE_RUNNING)
65878a
+                    s->deserialized_state == SERVICE_RUNNING ||
65878a
+                    s->deserialized_state == SERVICE_RELOAD)
65878a
                         service_handle_watchdog(s);
65878a
 
65878a
                 service_set_state(s, s->deserialized_state);
65878a
@@ -1970,6 +1981,7 @@ static void service_enter_stop_post(Service *s, ServiceResult f) {
65878a
                 s->result = f;
65878a
 
65878a
         service_unwatch_control_pid(s);
65878a
+        unit_watch_all_pids(UNIT(s));
65878a
 
65878a
         s->control_command = s->exec_command[SERVICE_EXEC_STOP_POST];
65878a
         if (s->control_command) {
65878a
@@ -2010,6 +2022,8 @@ static void service_enter_signal(Service *s, ServiceState state, ServiceResult f
65878a
         if (f != SERVICE_SUCCESS)
65878a
                 s->result = f;
65878a
 
65878a
+        unit_watch_all_pids(UNIT(s));
65878a
+
65878a
         r = unit_kill_context(
65878a
                         UNIT(s),
65878a
                         &s->kill_context,
65878a
@@ -2055,6 +2069,7 @@ static void service_enter_stop(Service *s, ServiceResult f) {
65878a
                 s->result = f;
65878a
 
65878a
         service_unwatch_control_pid(s);
65878a
+        unit_watch_all_pids(UNIT(s));
65878a
 
65878a
         s->control_command = s->exec_command[SERVICE_EXEC_STOP];
65878a
         if (s->control_command) {
65878a
@@ -2961,6 +2976,62 @@ fail:
65878a
         service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_RESOURCES);
65878a
 }
65878a
 
65878a
+static void service_notify_cgroup_empty_event(Unit *u) {
65878a
+        Service *s = SERVICE(u);
65878a
+
65878a
+        assert(u);
65878a
+
65878a
+        log_debug_unit(u->id, "%s: cgroup is empty", u->id);
65878a
+
65878a
+        switch (s->state) {
65878a
+
65878a
+                /* Waiting for SIGCHLD is usually more interesting,
65878a
+                 * because it includes return codes/signals. Which is
65878a
+                 * why we ignore the cgroup events for most cases,
65878a
+                 * except when we don't know pid which to expect the
65878a
+                 * SIGCHLD for. */
65878a
+
65878a
+        case SERVICE_START:
65878a
+        case SERVICE_START_POST:
65878a
+                /* If we were hoping for the daemon to write its PID file,
65878a
+                 * we can give up now. */
65878a
+                if (s->pid_file_pathspec) {
65878a
+                        log_warning_unit(u->id,
65878a
+                                         "%s never wrote its PID file. Failing.", UNIT(s)->id);
65878a
+                        service_unwatch_pid_file(s);
65878a
+                        if (s->state == SERVICE_START)
65878a
+                                service_enter_signal(s, SERVICE_FINAL_SIGTERM, SERVICE_FAILURE_RESOURCES);
65878a
+                        else
65878a
+                                service_enter_stop(s, SERVICE_FAILURE_RESOURCES);
65878a
+                }
65878a
+                break;
65878a
+
65878a
+        case SERVICE_RUNNING:
65878a
+                /* service_enter_running() will figure out what to do */
65878a
+                service_enter_running(s, SERVICE_SUCCESS);
65878a
+                break;
65878a
+
65878a
+        case SERVICE_STOP_SIGTERM:
65878a
+        case SERVICE_STOP_SIGKILL:
65878a
+
65878a
+                if (main_pid_good(s) <= 0 && !control_pid_good(s))
65878a
+                        service_enter_stop_post(s, SERVICE_SUCCESS);
65878a
+
65878a
+                break;
65878a
+
65878a
+        case SERVICE_STOP_POST:
65878a
+        case SERVICE_FINAL_SIGTERM:
65878a
+        case SERVICE_FINAL_SIGKILL:
65878a
+                if (main_pid_good(s) <= 0 && !control_pid_good(s))
65878a
+                        service_enter_dead(s, SERVICE_SUCCESS, true);
65878a
+
65878a
+                break;
65878a
+
65878a
+        default:
65878a
+                ;
65878a
+        }
65878a
+}
65878a
+
65878a
 static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) {
65878a
         Service *s = SERVICE(u);
65878a
         ServiceResult f;
65878a
@@ -3229,6 +3300,18 @@ static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) {
65878a
 
65878a
         /* Notify clients about changed exit status */
65878a
         unit_add_to_dbus_queue(u);
65878a
+
65878a
+        /* We got one SIGCHLD for the service, let's watch all
65878a
+         * processes that are now running of the service, and watch
65878a
+         * that. Among the PIDs we then watch will be children
65878a
+         * reassigned to us, which hopefully allows us to identify
65878a
+         * when all children are gone */
65878a
+        unit_tidy_watch_pids(u, s->main_pid, s->control_pid);
65878a
+        unit_watch_all_pids(u);
65878a
+
65878a
+        /* If the PID set is empty now, then let's finish this off */
65878a
+        if (set_isempty(u->pids))
65878a
+                service_notify_cgroup_empty_event(u);
65878a
 }
65878a
 
65878a
 static void service_timer_event(Unit *u, uint64_t elapsed, Watch* w) {
65878a
@@ -3332,61 +3415,6 @@ static void service_timer_event(Unit *u, uint64_t elapsed, Watch* w) {
65878a
         }
65878a
 }
65878a
 
65878a
-static void service_notify_cgroup_empty_event(Unit *u) {
65878a
-        Service *s = SERVICE(u);
65878a
-
65878a
-        assert(u);
65878a
-
65878a
-        log_debug_unit(u->id, "%s: cgroup is empty", u->id);
65878a
-
65878a
-        switch (s->state) {
65878a
-
65878a
-                /* Waiting for SIGCHLD is usually more interesting,
65878a
-                 * because it includes return codes/signals. Which is
65878a
-                 * why we ignore the cgroup events for most cases,
65878a
-                 * except when we don't know pid which to expect the
65878a
-                 * SIGCHLD for. */
65878a
-
65878a
-        case SERVICE_START:
65878a
-        case SERVICE_START_POST:
65878a
-                /* If we were hoping for the daemon to write its PID file,
65878a
-                 * we can give up now. */
65878a
-                if (s->pid_file_pathspec) {
65878a
-                        log_warning_unit(u->id,
65878a
-                                         "%s never wrote its PID file. Failing.", UNIT(s)->id);
65878a
-                        service_unwatch_pid_file(s);
65878a
-                        if (s->state == SERVICE_START)
65878a
-                                service_enter_signal(s, SERVICE_FINAL_SIGTERM, SERVICE_FAILURE_RESOURCES);
65878a
-                        else
65878a
-                                service_enter_stop(s, SERVICE_FAILURE_RESOURCES);
65878a
-                }
65878a
-                break;
65878a
-
65878a
-        case SERVICE_RUNNING:
65878a
-                /* service_enter_running() will figure out what to do */
65878a
-                service_enter_running(s, SERVICE_SUCCESS);
65878a
-                break;
65878a
-
65878a
-        case SERVICE_STOP_SIGTERM:
65878a
-        case SERVICE_STOP_SIGKILL:
65878a
-
65878a
-                if (main_pid_good(s) <= 0 && !control_pid_good(s))
65878a
-                        service_enter_stop_post(s, SERVICE_SUCCESS);
65878a
-
65878a
-                break;
65878a
-
65878a
-        case SERVICE_FINAL_SIGTERM:
65878a
-        case SERVICE_FINAL_SIGKILL:
65878a
-                if (main_pid_good(s) <= 0 && !control_pid_good(s))
65878a
-                        service_enter_dead(s, SERVICE_SUCCESS, true);
65878a
-
65878a
-                break;
65878a
-
65878a
-        default:
65878a
-                ;
65878a
-        }
65878a
-}
65878a
-
65878a
 static void service_notify_message(Unit *u, pid_t pid, char **tags) {
65878a
         Service *s = SERVICE(u);
65878a
         const char *e;
65878a
diff --git a/src/core/unit.c b/src/core/unit.c
65878a
index 6c2c4a0..0332094 100644
65878a
--- a/src/core/unit.c
65878a
+++ b/src/core/unit.c
65878a
@@ -472,6 +472,8 @@ void unit_free(Unit *u) {
65878a
 
65878a
         set_free_free(u->names);
65878a
 
65878a
+        unit_unwatch_all_pids(u);
65878a
+
65878a
         condition_free_list(u->conditions);
65878a
 
65878a
         unit_ref_unset(&u->slice);
65878a
@@ -1658,13 +1660,25 @@ void unit_unwatch_fd(Unit *u, Watch *w) {
65878a
 }
65878a
 
65878a
 int unit_watch_pid(Unit *u, pid_t pid) {
65878a
+        int q, r;
65878a
+
65878a
         assert(u);
65878a
         assert(pid >= 1);
65878a
 
65878a
+        r = set_ensure_allocated(&u->pids, trivial_hash_func, trivial_compare_func);
65878a
+        if (r < 0)
65878a
+                return r;
65878a
+
65878a
         /* Watch a specific PID. We only support one unit watching
65878a
          * each PID for now. */
65878a
 
65878a
-        return hashmap_put(u->manager->watch_pids, LONG_TO_PTR(pid), u);
65878a
+        r = set_put(u->pids, LONG_TO_PTR(pid));
65878a
+
65878a
+        q = hashmap_put(u->manager->watch_pids, LONG_TO_PTR(pid), u);
65878a
+        if (q < 0)
65878a
+                return q;
65878a
+
65878a
+        return r;
65878a
 }
65878a
 
65878a
 void unit_unwatch_pid(Unit *u, pid_t pid) {
65878a
@@ -1672,6 +1686,102 @@ void unit_unwatch_pid(Unit *u, pid_t pid) {
65878a
         assert(pid >= 1);
65878a
 
65878a
         hashmap_remove_value(u->manager->watch_pids, LONG_TO_PTR(pid), u);
65878a
+        set_remove(u->pids, LONG_TO_PTR(pid));
65878a
+}
65878a
+
65878a
+static int watch_pids_in_path(Unit *u, const char *path) {
65878a
+        _cleanup_closedir_ DIR *d = NULL;
65878a
+        _cleanup_fclose_ FILE *f = NULL;
65878a
+        int ret = 0, r;
65878a
+
65878a
+        assert(u);
65878a
+        assert(path);
65878a
+
65878a
+        /* Adds all PIDs from a specific cgroup path to the set of PIDs we watch. */
65878a
+
65878a
+        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
65878a
+        if (r >= 0) {
65878a
+                pid_t pid;
65878a
+
65878a
+                while ((r = cg_read_pid(f, &pid)) > 0) {
65878a
+                        r = unit_watch_pid(u, pid);
65878a
+                        if (r < 0 && ret >= 0)
65878a
+                                ret = r;
65878a
+                }
65878a
+                if (r < 0 && ret >= 0)
65878a
+                        ret = r;
65878a
+
65878a
+        } else if (ret >= 0)
65878a
+                ret = r;
65878a
+
65878a
+        r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
65878a
+        if (r >= 0) {
65878a
+                char *fn;
65878a
+
65878a
+                while ((r = cg_read_subgroup(d, &fn)) > 0) {
65878a
+                        _cleanup_free_ char *p = NULL;
65878a
+
65878a
+                        p = strjoin(path, "/", fn, NULL);
65878a
+                        free(fn);
65878a
+
65878a
+                        if (!p)
65878a
+                                return -ENOMEM;
65878a
+
65878a
+                        r = watch_pids_in_path(u, p);
65878a
+                        if (r < 0 && ret >= 0)
65878a
+                                ret = r;
65878a
+                }
65878a
+                if (r < 0 && ret >= 0)
65878a
+                        ret = r;
65878a
+
65878a
+        } else if (ret >= 0)
65878a
+                ret = r;
65878a
+
65878a
+        return ret;
65878a
+}
65878a
+
65878a
+
65878a
+int unit_watch_all_pids(Unit *u) {
65878a
+        assert(u);
65878a
+
65878a
+        if (!u->cgroup_path)
65878a
+                return -ENOENT;
65878a
+
65878a
+        /* Adds all PIDs from our cgroup to the set of PIDs we watch */
65878a
+
65878a
+        return watch_pids_in_path(u, u->cgroup_path);
65878a
+}
65878a
+
65878a
+void unit_unwatch_all_pids(Unit *u) {
65878a
+        Iterator i;
65878a
+        void *e;
65878a
+
65878a
+        assert(u);
65878a
+
65878a
+        SET_FOREACH(e, u->pids, i)
65878a
+                hashmap_remove_value(u->manager->watch_pids, e, u);
65878a
+
65878a
+        set_free(u->pids);
65878a
+        u->pids = NULL;
65878a
+}
65878a
+
65878a
+void unit_tidy_watch_pids(Unit *u, pid_t except1, pid_t except2) {
65878a
+        Iterator i;
65878a
+        void *e;
65878a
+
65878a
+        assert(u);
65878a
+
65878a
+        /* Cleans dead PIDs (other than except1/except2) from our list and from the manager's watch table */
65878a
+
65878a
+        SET_FOREACH(e, u->pids, i) {
65878a
+                pid_t pid = PTR_TO_LONG(e);
65878a
+
65878a
+                if (pid == except1 || pid == except2)
65878a
+                        continue;
65878a
+                /* ESRCH means the process is gone; drop the manager mapping too, so it cannot go stale */
65878a
+                if (kill(pid, 0) < 0 && errno == ESRCH)
65878a
+                        unit_unwatch_pid(u, pid);
65878a
+        }
65878a
 }
65878a
 
65878a
 int unit_watch_timer(Unit *u, clockid_t clock_id, bool relative, usec_t usec, Watch *w) {
65878a
diff --git a/src/core/unit.h b/src/core/unit.h
65878a
index 6dd750f..6dff25e 100644
65878a
--- a/src/core/unit.h
65878a
+++ b/src/core/unit.h
65878a
@@ -198,6 +198,11 @@ struct Unit {
65878a
         /* CGroup realize members queue */
65878a
         LIST_FIELDS(Unit, cgroup_queue);
65878a
 
65878a
+        /* PIDs we keep an eye on. Note that a unit might have many
65878a
+         * more, but these are the ones we care enough about to
65878a
+         * process SIGCHLD for */
65878a
+        Set *pids;
65878a
+
65878a
         /* Used during GC sweeps */
65878a
         unsigned gc_marker;
65878a
 
65878a
@@ -531,6 +536,10 @@ void unit_unwatch_fd(Unit *u, Watch *w);
65878a
 
65878a
 int unit_watch_pid(Unit *u, pid_t pid);
65878a
 void unit_unwatch_pid(Unit *u, pid_t pid);
65878a
+int unit_watch_all_pids(Unit *u);
65878a
+void unit_unwatch_all_pids(Unit *u);
65878a
+
65878a
+void unit_tidy_watch_pids(Unit *u, pid_t except1, pid_t except2);
65878a
 
65878a
 int unit_watch_timer(Unit *u, clockid_t, bool relative, usec_t usec, Watch *w);
65878a
 void unit_unwatch_timer(Unit *u, Watch *w);