c2dfb7
From 77a273e02c1c811485d13ddca0f844512aed2cff Mon Sep 17 00:00:00 2001
c2dfb7
From: Jan Synacek <jsynacek@redhat.com>
c2dfb7
Date: Wed, 12 Feb 2020 12:58:54 +0100
c2dfb7
Subject: [PATCH] pid1: make sure to restore correct default values for some
c2dfb7
 rlimits
c2dfb7
c2dfb7
Commit fb39af4ce42d7ef9af63009f271f404038703704 forgot to restore the default
c2dfb7
rlimit values (RLIMIT_NOFILE and RLIMIT_MEMLOCK) while PID1 is reloading.
c2dfb7
c2dfb7
This patch extracts the code in charge of initializing the default values for
c2dfb7
those rlimits in order to create dedicated functions, which take care of their
c2dfb7
initialization.
c2dfb7
c2dfb7
These functions are then called in parse_configuration() so we make sure that
c2dfb7
the default values for these rlimits get restored every time PID1 is reloading
c2dfb7
its configuration.
c2dfb7
c2dfb7
(cherry picked from commit a9fd4cd1206832a61aaf61fff583bb133e6cb965)
c2dfb7
Resolves: #1789930
c2dfb7
---
c2dfb7
 src/core/main.c | 135 +++++++++++++++++++++++++++++++++++++-----------
c2dfb7
 1 file changed, 106 insertions(+), 29 deletions(-)
c2dfb7
c2dfb7
diff --git a/src/core/main.c b/src/core/main.c
c2dfb7
index c83249a8dc..b8c1e567ad 100644
c2dfb7
--- a/src/core/main.c
c2dfb7
+++ b/src/core/main.c
c2dfb7
@@ -136,7 +136,8 @@ static EmergencyAction arg_cad_burst_action;
c2dfb7
 static CPUSet arg_cpu_affinity;
c2dfb7
 static NUMAPolicy arg_numa_policy;
c2dfb7
 
c2dfb7
-static int parse_configuration(void);
c2dfb7
+static int parse_configuration(const struct rlimit *saved_rlimit_nofile,
c2dfb7
+                               const struct rlimit *saved_rlimit_memlock);
c2dfb7
 
c2dfb7
 _noreturn_ static void freeze_or_reboot(void) {
c2dfb7
 
c2dfb7
@@ -1149,25 +1150,6 @@ static int prepare_reexecute(Manager *m, FILE **_f, FDSet **_fds, bool switching
c2dfb7
 static int bump_rlimit_nofile(struct rlimit *saved_rlimit) {
c2dfb7
         int r, nr;
c2dfb7
 
c2dfb7
-        assert(saved_rlimit);
c2dfb7
-
c2dfb7
-        /* Save the original RLIMIT_NOFILE so that we can reset it
c2dfb7
-         * later when transitioning from the initrd to the main
c2dfb7
-         * systemd or suchlike. */
c2dfb7
-        if (getrlimit(RLIMIT_NOFILE, saved_rlimit) < 0)
c2dfb7
-                return log_warning_errno(errno, "Reading RLIMIT_NOFILE failed, ignoring: %m");
c2dfb7
-
c2dfb7
-        /* Make sure forked processes get the default kernel setting */
c2dfb7
-        if (!arg_default_rlimit[RLIMIT_NOFILE]) {
c2dfb7
-                struct rlimit *rl;
c2dfb7
-
c2dfb7
-                rl = newdup(struct rlimit, saved_rlimit, 1);
c2dfb7
-                if (!rl)
c2dfb7
-                        return log_oom();
c2dfb7
-
c2dfb7
-                arg_default_rlimit[RLIMIT_NOFILE] = rl;
c2dfb7
-        }
c2dfb7
-
c2dfb7
         /* Bump up the resource limit for ourselves substantially, all the way to the maximum the kernel allows */
c2dfb7
         nr = read_nr_open();
c2dfb7
         r = setrlimit_closest(RLIMIT_NOFILE, &RLIMIT_MAKE_CONST(nr));
c2dfb7
@@ -1180,16 +1162,12 @@ static int bump_rlimit_nofile(struct rlimit *saved_rlimit) {
c2dfb7
 static int bump_rlimit_memlock(struct rlimit *saved_rlimit) {
c2dfb7
         int r;
c2dfb7
 
c2dfb7
-        assert(saved_rlimit);
c2dfb7
         assert(getuid() == 0);
c2dfb7
 
c2dfb7
         /* BPF_MAP_TYPE_LPM_TRIE bpf maps are charged against RLIMIT_MEMLOCK, even though we have CAP_IPC_LOCK which
c2dfb7
          * should normally disable such checks. We need them to implement IPAccessAllow= and IPAccessDeny=, hence let's
c2dfb7
          * bump the value high enough for the root user. */
c2dfb7
 
c2dfb7
-        if (getrlimit(RLIMIT_MEMLOCK, saved_rlimit) < 0)
c2dfb7
-                return log_warning_errno(errno, "Reading RLIMIT_MEMLOCK failed, ignoring: %m");
c2dfb7
-
c2dfb7
         r = setrlimit_closest(RLIMIT_MEMLOCK, &RLIMIT_MAKE_CONST(1024ULL*1024ULL*16ULL));
c2dfb7
         if (r < 0)
c2dfb7
                 return log_warning_errno(r, "Setting RLIMIT_MEMLOCK failed, ignoring: %m");
c2dfb7
@@ -1651,6 +1629,8 @@ static void do_reexecute(
c2dfb7
 
c2dfb7
 static int invoke_main_loop(
c2dfb7
                 Manager *m,
c2dfb7
+                const struct rlimit *saved_rlimit_nofile,
c2dfb7
+                const struct rlimit *saved_rlimit_memlock,
c2dfb7
                 bool *ret_reexecute,
c2dfb7
                 int *ret_retval,                   /* Return parameters relevant for shutting down */
c2dfb7
                 const char **ret_shutdown_verb,    /* … */
c2dfb7
@@ -1662,6 +1642,8 @@ static int invoke_main_loop(
c2dfb7
         int r;
c2dfb7
 
c2dfb7
         assert(m);
c2dfb7
+        assert(saved_rlimit_nofile);
c2dfb7
+        assert(saved_rlimit_memlock);
c2dfb7
         assert(ret_reexecute);
c2dfb7
         assert(ret_retval);
c2dfb7
         assert(ret_shutdown_verb);
c2dfb7
@@ -1691,7 +1673,7 @@ static int invoke_main_loop(
c2dfb7
                         saved_log_level = m->log_level_overridden ? log_get_max_level() : -1;
c2dfb7
                         saved_log_target = m->log_target_overridden ? log_get_target() : _LOG_TARGET_INVALID;
c2dfb7
 
c2dfb7
-                        (void) parse_configuration();
c2dfb7
+                        (void) parse_configuration(saved_rlimit_nofile, saved_rlimit_memlock);
c2dfb7
 
c2dfb7
                         set_manager_defaults(m);
c2dfb7
 
c2dfb7
@@ -1983,6 +1965,80 @@ static int do_queue_default_job(
c2dfb7
         return 0;
c2dfb7
 }
c2dfb7
 
c2dfb7
+static void save_rlimits(struct rlimit *saved_rlimit_nofile,
c2dfb7
+                         struct rlimit *saved_rlimit_memlock) {
c2dfb7
+
c2dfb7
+        assert(saved_rlimit_nofile);
c2dfb7
+        assert(saved_rlimit_memlock);
c2dfb7
+
c2dfb7
+        if (getrlimit(RLIMIT_NOFILE, saved_rlimit_nofile) < 0)
c2dfb7
+                log_warning_errno(errno, "Reading RLIMIT_NOFILE failed, ignoring: %m");
c2dfb7
+
c2dfb7
+        if (getrlimit(RLIMIT_MEMLOCK, saved_rlimit_memlock) < 0)
c2dfb7
+                log_warning_errno(errno, "Reading RLIMIT_MEMLOCK failed, ignoring: %m");
c2dfb7
+}
c2dfb7
+
c2dfb7
+static void fallback_rlimit_nofile(const struct rlimit *saved_rlimit_nofile) {
c2dfb7
+        struct rlimit *rl;
c2dfb7
+
c2dfb7
+        if (arg_default_rlimit[RLIMIT_NOFILE])
c2dfb7
+                return;
c2dfb7
+
c2dfb7
+        /* Make sure forked processes get limits based on the original kernel setting */
c2dfb7
+
c2dfb7
+        rl = newdup(struct rlimit, saved_rlimit_nofile, 1);
c2dfb7
+        if (!rl) {
c2dfb7
+                log_oom();
c2dfb7
+                return;
c2dfb7
+        }
c2dfb7
+
c2dfb7
+        /* Bump the hard limit for system services to a substantially higher value. The default
c2dfb7
+         * hard limit current kernels set is pretty low (4K), mostly for historical
c2dfb7
+         * reasons. According to kernel developers, the fd handling in recent kernels has been
c2dfb7
+         * optimized substantially enough, so that we can bump the limit now, without paying too
c2dfb7
+         * high a price in memory or performance. Note however that we only bump the hard limit,
c2dfb7
+         * not the soft limit. That's because select() works the way it works, and chokes on fds
c2dfb7
+         * >= 1024. If we'd bump the soft limit globally, it might accidentally happen to
c2dfb7
+         * unsuspecting programs that they get fds higher than what they can process using
c2dfb7
+         * select(). By only bumping the hard limit but leaving the soft limit as it is we avoid
c2dfb7
+         * this pitfall: programs that are written by folks with the select() problem in mind
c2dfb7
+         * (and thus use poll()/epoll instead of select(), the way everybody should) can
c2dfb7
+         * explicitly opt into high fds by bumping their soft limit beyond 1024, to the hard limit
c2dfb7
+         * we pass. */
c2dfb7
+        if (arg_system) {
c2dfb7
+                int nr;
c2dfb7
+
c2dfb7
+                /* Get the underlying absolute limit the kernel enforces */
c2dfb7
+                nr = read_nr_open();
c2dfb7
+
c2dfb7
+                rl->rlim_max = MIN((rlim_t) nr, MAX(rl->rlim_max, (rlim_t) HIGH_RLIMIT_NOFILE));
c2dfb7
+        }
c2dfb7
+
c2dfb7
+        /* If for some reason we were invoked with a soft limit above 1024 (which should never
c2dfb7
+         * happen, but who knows what we get passed in from pam_limits when invoked as --user
c2dfb7
+         * instance), then lower what we pass on to not confuse our children */
c2dfb7
+        rl->rlim_cur = MIN(rl->rlim_cur, (rlim_t) FD_SETSIZE);
c2dfb7
+
c2dfb7
+        arg_default_rlimit[RLIMIT_NOFILE] = rl;
c2dfb7
+}
c2dfb7
+
c2dfb7
+static void fallback_rlimit_memlock(const struct rlimit *saved_rlimit_memlock) {
c2dfb7
+        struct rlimit *rl;
c2dfb7
+
c2dfb7
+        /* Pass the original value down to invoked processes */
c2dfb7
+
c2dfb7
+        if (arg_default_rlimit[RLIMIT_MEMLOCK])
c2dfb7
+                return;
c2dfb7
+
c2dfb7
+        rl = newdup(struct rlimit, saved_rlimit_memlock, 1);
c2dfb7
+        if (!rl) {
c2dfb7
+                log_oom();
c2dfb7
+                return;
c2dfb7
+        }
c2dfb7
+
c2dfb7
+        arg_default_rlimit[RLIMIT_MEMLOCK] = rl;
c2dfb7
+}
c2dfb7
+
c2dfb7
 static void reset_arguments(void) {
c2dfb7
         /* Frees/resets arg_* variables, with a few exceptions commented below. */
c2dfb7
 
c2dfb7
@@ -2040,9 +2096,13 @@ static void reset_arguments(void) {
c2dfb7
         numa_policy_reset(&arg_numa_policy);
c2dfb7
 }
c2dfb7
 
c2dfb7
-static int parse_configuration(void) {
c2dfb7
+static int parse_configuration(const struct rlimit *saved_rlimit_nofile,
c2dfb7
+                               const struct rlimit *saved_rlimit_memlock) {
c2dfb7
         int r;
c2dfb7
 
c2dfb7
+        assert(saved_rlimit_nofile);
c2dfb7
+        assert(saved_rlimit_memlock);
c2dfb7
+
c2dfb7
         arg_default_tasks_max = system_tasks_max_scale(DEFAULT_TASKS_MAX_PERCENTAGE, 100U);
c2dfb7
 
c2dfb7
         /* Assign configuration defaults */
c2dfb7
@@ -2058,18 +2118,29 @@ static int parse_configuration(void) {
c2dfb7
                         log_warning_errno(r, "Failed to parse kernel command line, ignoring: %m");
c2dfb7
         }
c2dfb7
 
c2dfb7
+        /* Initialize some default rlimits for services if they haven't been configured */
c2dfb7
+        fallback_rlimit_nofile(saved_rlimit_nofile);
c2dfb7
+        fallback_rlimit_memlock(saved_rlimit_memlock);
c2dfb7
+
c2dfb7
         /* Note that this also parses bits from the kernel command line, including "debug". */
c2dfb7
         log_parse_environment();
c2dfb7
 
c2dfb7
         return 0;
c2dfb7
 }
c2dfb7
 
c2dfb7
-static int load_configuration(int argc, char **argv, const char **ret_error_message) {
c2dfb7
+static int load_configuration(
c2dfb7
+                int argc,
c2dfb7
+                char **argv,
c2dfb7
+                const struct rlimit *saved_rlimit_nofile,
c2dfb7
+                const struct rlimit *saved_rlimit_memlock,
c2dfb7
+                const char **ret_error_message) {
c2dfb7
         int r;
c2dfb7
 
c2dfb7
+        assert(saved_rlimit_nofile);
c2dfb7
+        assert(saved_rlimit_memlock);
c2dfb7
         assert(ret_error_message);
c2dfb7
 
c2dfb7
-        (void) parse_configuration();
c2dfb7
+        (void) parse_configuration(saved_rlimit_nofile, saved_rlimit_memlock);
c2dfb7
 
c2dfb7
         r = parse_argv(argc, argv);
c2dfb7
         if (r < 0) {
c2dfb7
@@ -2403,11 +2474,15 @@ int main(int argc, char *argv[]) {
c2dfb7
                 }
c2dfb7
         }
c2dfb7
 
c2dfb7
+        /* Save the original RLIMIT_NOFILE/RLIMIT_MEMLOCK so that we can reset them later when
c2dfb7
+         * transitioning from the initrd to the main systemd or suchlike. */
c2dfb7
+        save_rlimits(&saved_rlimit_nofile, &saved_rlimit_memlock);
c2dfb7
+
c2dfb7
         /* Reset all signal handlers. */
c2dfb7
         (void) reset_all_signal_handlers();
c2dfb7
         (void) ignore_signals(SIGNALS_IGNORE, -1);
c2dfb7
 
c2dfb7
-        r = load_configuration(argc, argv, &error_message);
c2dfb7
+        r = load_configuration(argc, argv, &saved_rlimit_nofile, &saved_rlimit_memlock, &error_message);
c2dfb7
         if (r < 0)
c2dfb7
                 goto finish;
c2dfb7
 
c2dfb7
@@ -2522,6 +2597,8 @@ int main(int argc, char *argv[]) {
c2dfb7
         }
c2dfb7
 
c2dfb7
         (void) invoke_main_loop(m,
c2dfb7
+                                &saved_rlimit_nofile,
c2dfb7
+                                &saved_rlimit_memlock,
c2dfb7
                                 &reexecute,
c2dfb7
                                 &retval,
c2dfb7
                                 &shutdown_verb,