923a60
From db57bf73d3e5e650b261834a0c39c9d368f9eeea Mon Sep 17 00:00:00 2001
923a60
From: Kyle Walker <kwalker@redhat.com>
923a60
Date: Thu, 14 Dec 2017 11:46:03 -0500
923a60
Subject: [PATCH] core: Implement sync_with_progress()
923a60
923a60
In similar fashion to the previous change, sync() operations can stall
923a60
endlessly if cache is unable to be written out. In order to avoid an
923a60
unbounded hang, the sync takes place within a child process. Every 10
923a60
seconds (SYNC_TIMEOUT_USEC), the value of /proc/meminfo "Dirty" is checked
923a60
to verify it is smaller than the last iteration. If the sync is not making
923a60
progress for 3 successive iterations (SYNC_PROGRESS_ATTEMPTS), a SIGKILL is
923a60
sent to the sync process and the shutdown continues.
923a60
923a60
(cherry picked from commit 73ad712fcfea5d8ba475044698d31d2c15d4180d)
923a60
923a60
Related: #1571098
923a60
---
923a60
 src/core/shutdown.c | 116 ++++++++++++++++++++++++++++++++++++++++++--
923a60
 1 file changed, 111 insertions(+), 5 deletions(-)
923a60
923a60
diff --git a/src/core/shutdown.c b/src/core/shutdown.c
923a60
index 71f001ac13..0b0a54a7de 100644
923a60
--- a/src/core/shutdown.c
923a60
+++ b/src/core/shutdown.c
923a60
@@ -53,6 +53,9 @@
923a60
 
923a60
 #define FINALIZE_ATTEMPTS 50
923a60
 
923a60
+#define SYNC_PROGRESS_ATTEMPTS 3
923a60
+#define SYNC_TIMEOUT_USEC (10*USEC_PER_SEC)
923a60
+
923a60
 static char* arg_verb;
923a60
 
923a60
 static int parse_argv(int argc, char *argv[]) {
923a60
@@ -152,6 +155,102 @@ static int switch_root_initramfs(void) {
923a60
         return switch_root("/run/initramfs", "/oldroot", false, MS_BIND);
923a60
 }
923a60
 
923a60
+/* Read the following fields from /proc/meminfo:
923a60
+ *
923a60
+ *  NFS_Unstable
923a60
+ *  Writeback
923a60
+ *  Dirty
923a60
+ *
923a60
+ * Return true if the sum of these fields is greater than the previous
923a60
+ * value input. For all other issues, report the failure and indicate that
923a60
+ * the sync is not making progress.
923a60
+ */
923a60
+static bool sync_making_progress(unsigned long long *prev_dirty) {
923a60
+        _cleanup_fclose_ FILE *f = NULL;
923a60
+        char line[LINE_MAX];
923a60
+        bool r = false;
923a60
+        unsigned long long val = 0;
923a60
+
923a60
+        f = fopen("/proc/meminfo", "re");
923a60
+        if (!f)
923a60
+                return log_warning_errno(errno, "Failed to open /proc/meminfo: %m");
923a60
+
923a60
+        FOREACH_LINE(line, f, log_warning_errno(errno, "Failed to parse /proc/meminfo: %m")) {
923a60
+                unsigned long long ull = 0;
923a60
+
923a60
+                if (!first_word(line, "NFS_Unstable:") && !first_word(line, "Writeback:") && !first_word(line, "Dirty:"))
923a60
+                        continue;
923a60
+
923a60
+                errno = 0;
923a60
+                if (sscanf(line, "%*s %llu %*s", &ull) != 1) {
923a60
+                        if (errno != 0)
923a60
+                                log_warning_errno(errno, "Failed to parse /proc/meminfo: %m");
923a60
+                        else
923a60
+                                log_warning("Failed to parse /proc/meminfo");
923a60
+
923a60
+                        return false;
923a60
+                }
923a60
+
923a60
+                val += ull;
923a60
+        }
923a60
+
923a60
+        r = *prev_dirty > val;
923a60
+
923a60
+        *prev_dirty = val;
923a60
+
923a60
+        return r;
923a60
+}
923a60
+
923a60
+static void sync_with_progress(void) {
923a60
+        unsigned checks;
923a60
+        pid_t pid;
923a60
+        int r;
923a60
+        unsigned long long dirty = ULONG_LONG_MAX;
923a60
+
923a60
+        BLOCK_SIGNALS(SIGCHLD);
923a60
+
923a60
+        /* Due to the possiblity of the sync operation hanging, we fork
923a60
+         * a child process and monitor the progress. If the timeout
923a60
+         * lapses, the assumption is that that particular sync stalled. */
923a60
+        pid = fork();
923a60
+        if (pid < 0) {
923a60
+                log_error_errno(errno, "Failed to fork: %m");
923a60
+                return;
923a60
+        }
923a60
+
923a60
+        if (pid == 0) {
923a60
+                /* Start the sync operation here in the child */
923a60
+                sync();
923a60
+                _exit(EXIT_SUCCESS);
923a60
+        }
923a60
+
923a60
+        log_info("Syncing filesystems and block devices.");
923a60
+
923a60
+        /* Start monitoring the sync operation. If more than
923a60
+         * SYNC_PROGRESS_ATTEMPTS lapse without progress being made,
923a60
+         * we assume that the sync is stalled */
923a60
+        for (checks = 0; checks < SYNC_PROGRESS_ATTEMPTS; checks++) {
923a60
+                r = wait_for_terminate_with_timeout(pid, SYNC_TIMEOUT_USEC);
923a60
+                if (r == 0)
923a60
+                        /* Sync finished without error.
923a60
+                         * (The sync itself does not return an error code) */
923a60
+                        return;
923a60
+                else if (r == -ETIMEDOUT) {
923a60
+                        /* Reset the check counter if the "Dirty" value is
923a60
+                         * decreasing */
923a60
+                        if (sync_making_progress(&dirty))
923a60
+                                checks = 0;
923a60
+                } else {
923a60
+                        log_error_errno(r, "Failed to sync filesystems and block devices: %m");
923a60
+                        return;
923a60
+                }
923a60
+        }
923a60
+
923a60
+        /* Only reached in the event of a timeout. We should issue a kill
923a60
+         * to the stray process. */
923a60
+        log_error("Syncing filesystems and block devices - timed out, issuing SIGKILL to PID "PID_FMT".", pid);
923a60
+        (void) kill(pid, SIGKILL);
923a60
+}
923a60
 
923a60
 int main(int argc, char *argv[]) {
923a60
         bool need_umount, need_swapoff, need_loop_detach, need_dm_detach;
923a60
@@ -202,6 +301,13 @@ int main(int argc, char *argv[]) {
923a60
         /* lock us into memory */
923a60
         mlockall(MCL_CURRENT|MCL_FUTURE);
923a60
 
923a60
+        /* Synchronize everything that is not written to disk yet at this point already. This is a good idea so that
923a60
+         * slow IO is processed here already and the final process killing spree is not impacted by processes
923a60
+         * desperately trying to sync IO to disk within their timeout. Do not remove this sync, data corruption will
923a60
+         * result. */
923a60
+        if (!in_container)
923a60
+                sync_with_progress();
923a60
+
923a60
         log_info("Sending SIGTERM to remaining processes...");
923a60
         broadcast_signal(SIGTERM, true, true);
923a60
 
923a60
@@ -338,12 +444,12 @@ int main(int argc, char *argv[]) {
923a60
                           need_loop_detach ? " loop devices," : "",
923a60
                           need_dm_detach ? " DM devices," : "");
923a60
 
923a60
-        /* The kernel will automaticall flush ATA disks and suchlike
923a60
-         * on reboot(), but the file systems need to be synce'd
923a60
-         * explicitly in advance. So let's do this here, but not
923a60
-         * needlessly slow down containers. */
923a60
+        /* The kernel will automatically flush ATA disks and suchlike on reboot(), but the file systems need to be
923a60
+         * sync'ed explicitly in advance. So let's do this here, but not needlessly slow down containers. Note that we
923a60
+         * sync'ed things already once above, but we did some more work since then which might have caused IO, hence
923a60
+         * let's do it once more. Do not remove this sync, data corruption will result. */
923a60
         if (!in_container)
923a60
-                sync();
923a60
+                sync_with_progress();
923a60
 
923a60
         switch (cmd) {
923a60