dcavalca / rpms / systemd

Forked from rpms/systemd 5 months ago
Clone
Blob Blame History Raw
From c20aa7b17166b9f331da33ad9288f9ede75c72db Mon Sep 17 00:00:00 2001
From: Anita Zhang <the.anitazha@gmail.com>
Date: Sun, 24 Jan 2021 00:16:19 -0800
Subject: [PATCH 1/4] oom: make memory pressure duration configurable through
 oomd.conf

---
 man/oomd.conf.xml          | 12 +++++++++++-
 src/oom/oomd-manager.c     | 13 +++++++++----
 src/oom/oomd-manager.h     |  5 +++--
 src/oom/oomd-util.h        |  1 +
 src/oom/oomd.c             |  4 +++-
 src/oom/oomd.conf          |  1 +
 test/units/testsuite-56.sh |  3 +++
 7 files changed, 31 insertions(+), 8 deletions(-)

diff --git a/man/oomd.conf.xml b/man/oomd.conf.xml
index 35a0686bc50..bb5da87c548 100644
--- a/man/oomd.conf.xml
+++ b/man/oomd.conf.xml
@@ -65,13 +65,23 @@
         will take action. A unit can override this value with <varname>ManagedOOMMemoryPressureLimitPercent=</varname>.
         The memory pressure for this property represents the fraction of time in a 10 second window in which all tasks
         in the cgroup were delayed. For each monitored cgroup, if the memory pressure on that cgroup exceeds the
-        limit set for more than 30 seconds, <command>systemd-oomd</command> will act on eligible descendant cgroups,
+        limit set for longer than the duration set by <varname>DefaultMemoryPressureDurationSec=</varname>,
+        <command>systemd-oomd</command> will act on eligible descendant cgroups,
         starting from the ones with the most reclaim activity to the least reclaim activity. Which cgroups are
         monitored and what action gets taken depends on what the unit has configured for
         <varname>ManagedOOMMemoryPressure=</varname>. Takes a percentage value between 0% and 100%, inclusive.
         Defaults to 60%.</para></listitem>
       </varlistentry>
 
+      <varlistentry>
+        <term><varname>DefaultMemoryPressureDurationSec=</varname></term>
+
+        <listitem><para>Sets the amount of time a unit's cgroup needs to have exceeded memory pressure limits before
+        <command>systemd-oomd</command> will take action. Memory pressure limits are defined by
+        <varname>DefaultMemoryPressureLimitPercent=</varname> and <varname>ManagedOOMMemoryPressureLimitPercent=</varname>.
+        Defaults to 30 seconds when this property is unset or set to 0.</para></listitem>
+      </varlistentry>
+
     </variablelist>
   </refsect1>
 
diff --git a/src/oom/oomd-manager.c b/src/oom/oomd-manager.c
index fec96519e01..e8ed6a52739 100644
--- a/src/oom/oomd-manager.c
+++ b/src/oom/oomd-manager.c
@@ -306,7 +306,7 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, vo
                         m->post_action_delay_start = 0;
         }
 
-        r = oomd_pressure_above(m->monitored_mem_pressure_cgroup_contexts, PRESSURE_DURATION_USEC, &targets);
+        r = oomd_pressure_above(m->monitored_mem_pressure_cgroup_contexts, m->default_mem_pressure_duration_usec, &targets);
         if (r == -ENOMEM)
                 return log_error_errno(r, "Failed to check if memory pressure exceeded limits");
         else if (r == 1) {
@@ -325,7 +325,7 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, vo
 
                         SET_FOREACH(t, targets) {
                                 log_notice("Memory pressure for %s is greater than %lu for more than %"PRIu64" seconds and there was reclaim activity",
-                                        t->path, LOAD_INT(t->mem_pressure_limit), PRESSURE_DURATION_USEC / USEC_PER_SEC);
+                                        t->path, LOAD_INT(t->mem_pressure_limit), m->default_mem_pressure_duration_usec / USEC_PER_SEC);
 
                                 r = oomd_kill_by_pgscan(candidates, t->path, m->dry_run);
                                 if (r == -ENOMEM)
@@ -471,7 +471,7 @@ static int manager_connect_bus(Manager *m) {
         return 0;
 }
 
-int manager_start(Manager *m, bool dry_run, int swap_used_limit, int mem_pressure_limit) {
+int manager_start(Manager *m, bool dry_run, int swap_used_limit, int mem_pressure_limit, usec_t mem_pressure_usec) {
         unsigned long l;
         int r;
 
@@ -487,6 +487,8 @@ int manager_start(Manager *m, bool dry_run, int swap_used_limit, int mem_pressur
         if (r < 0)
                 return r;
 
+        m->default_mem_pressure_duration_usec = mem_pressure_usec ?: DEFAULT_MEM_PRESSURE_DURATION_USEC;
+
         r = manager_connect_bus(m);
         if (r < 0)
                 return r;
@@ -505,6 +507,7 @@ int manager_start(Manager *m, bool dry_run, int swap_used_limit, int mem_pressur
 int manager_get_dump_string(Manager *m, char **ret) {
         _cleanup_free_ char *dump = NULL;
         _cleanup_fclose_ FILE *f = NULL;
+        char buf[FORMAT_TIMESPAN_MAX];
         OomdCGroupContext *c;
         size_t size;
         char *key;
@@ -521,10 +524,12 @@ int manager_get_dump_string(Manager *m, char **ret) {
                 "Dry Run: %s\n"
                 "Swap Used Limit: %u%%\n"
                 "Default Memory Pressure Limit: %lu%%\n"
+                "Default Memory Pressure Duration: %s\n"
                 "System Context:\n",
                 yes_no(m->dry_run),
                 m->swap_used_limit,
-                LOAD_INT(m->default_mem_pressure_limit));
+                LOAD_INT(m->default_mem_pressure_limit),
+                format_timespan(buf, sizeof(buf), m->default_mem_pressure_duration_usec, USEC_PER_SEC));
         oomd_dump_system_context(&m->system_context, f, "\t");
 
         fprintf(f, "Swap Monitored CGroups:\n");
diff --git a/src/oom/oomd-manager.h b/src/oom/oomd-manager.h
index 3f3eb5aa4b6..ede9903e5a6 100644
--- a/src/oom/oomd-manager.h
+++ b/src/oom/oomd-manager.h
@@ -16,7 +16,7 @@
  * percentage of time all tasks were delayed (i.e. unproductive).
  * Generally 60 or higher might be acceptable for something like system.slice with no memory.high set; processes in
  * system.slice are assumed to be less latency sensitive. */
-#define PRESSURE_DURATION_USEC (30 * USEC_PER_SEC)
+#define DEFAULT_MEM_PRESSURE_DURATION_USEC (30 * USEC_PER_SEC)
 #define DEFAULT_MEM_PRESSURE_LIMIT 60
 #define DEFAULT_SWAP_USED_LIMIT 90
 
@@ -33,6 +33,7 @@ struct Manager {
         bool dry_run;
         unsigned swap_used_limit;
         loadavg_t default_mem_pressure_limit;
+        usec_t default_mem_pressure_duration_usec;
 
         /* k: cgroup paths -> v: OomdCGroupContext
          * Used to detect when to take action. */
@@ -53,7 +54,7 @@ DEFINE_TRIVIAL_CLEANUP_FUNC(Manager*, manager_free);
 
 int manager_new(Manager **ret);
 
-int manager_start(Manager *m, bool dry_run, int swap_used_limit, int mem_pressure_limit);
+int manager_start(Manager *m, bool dry_run, int swap_used_limit, int mem_pressure_limit, usec_t mem_pressure_usec);
 
 int manager_get_dump_string(Manager *m, char **ret);
 
diff --git a/src/oom/oomd-util.h b/src/oom/oomd-util.h
index 0834cbf09d7..d7a9890e7a2 100644
--- a/src/oom/oomd-util.h
+++ b/src/oom/oomd-util.h
@@ -31,6 +31,7 @@ struct OomdCGroupContext {
 
         /* These are only used by oomd_pressure_above for acting on high memory pressure. */
         loadavg_t mem_pressure_limit;
+        usec_t mem_pressure_duration_usec;
         usec_t last_hit_mem_pressure_limit;
 };
 
diff --git a/src/oom/oomd.c b/src/oom/oomd.c
index 8cf776ec0f5..1b0f8ff6c40 100644
--- a/src/oom/oomd.c
+++ b/src/oom/oomd.c
@@ -19,11 +19,13 @@
 static bool arg_dry_run = false;
 static int arg_swap_used_limit = -1;
 static int arg_mem_pressure_limit = -1;
+static usec_t arg_mem_pressure_usec = 0;
 
 static int parse_config(void) {
         static const ConfigTableItem items[] = {
                 { "OOM", "SwapUsedLimitPercent",              config_parse_percent, 0, &arg_swap_used_limit    },
                 { "OOM", "DefaultMemoryPressureLimitPercent", config_parse_percent, 0, &arg_mem_pressure_limit },
+                { "OOM", "DefaultMemoryPressureDurationSec",  config_parse_sec,     0, &arg_mem_pressure_usec  },
                 {}
         };
 
@@ -160,7 +162,7 @@ static int run(int argc, char *argv[]) {
         if (r < 0)
                 return log_error_errno(r, "Failed to create manager: %m");
 
-        r = manager_start(m, arg_dry_run, arg_swap_used_limit, arg_mem_pressure_limit);
+        r = manager_start(m, arg_dry_run, arg_swap_used_limit, arg_mem_pressure_limit, arg_mem_pressure_usec);
         if (r < 0)
                 return log_error_errno(r, "Failed to start up daemon: %m");
 
diff --git a/src/oom/oomd.conf b/src/oom/oomd.conf
index 8ac97169610..766cb1717f7 100644
--- a/src/oom/oomd.conf
+++ b/src/oom/oomd.conf
@@ -14,3 +14,4 @@
 [OOM]
 #SwapUsedLimitPercent=90%
 #DefaultMemoryPressureLimitPercent=60%
+#DefaultMemoryPressureDurationSec=30s
diff --git a/test/units/testsuite-56.sh b/test/units/testsuite-56.sh
index 1846248855b..6e7941a57fc 100755
--- a/test/units/testsuite-56.sh
+++ b/test/units/testsuite-56.sh
@@ -14,12 +14,15 @@ if [[ "$cgroup_type" != *"cgroup2"* ]] && [[ "$cgroup_type" != *"0x63677270"* ]]
 fi
 [[ -e /skipped ]] && exit 0 || true
 
+echo "DefaultMemoryPressureDurationSec=5s" >> /etc/systemd/oomd.conf
+
 systemctl start testsuite-56-testbloat.service
 systemctl start testsuite-56-testchill.service
 
 # Verify systemd-oomd is monitoring the expected units
 oomctl | grep "/testsuite-56-workload.slice"
 oomctl | grep "1%"
+oomctl | grep "Default Memory Pressure Duration: 5s"
 
 # systemd-oomd watches for elevated pressure for 30 seconds before acting.
 # It can take time to build up pressure so either wait 5 minutes or for the service to fail.

From 408a3bbd76326793ea5d1cf4e0a9444a4c252d86 Mon Sep 17 00:00:00 2001
From: Anita Zhang <the.anitazha@gmail.com>
Date: Sat, 23 Jan 2021 22:10:42 -0800
Subject: [PATCH 2/4] oom: make swap a soft requirement

---
 man/systemd-oomd.service.xml |  4 ++--
 src/oom/oomd-manager.c       |  8 ++++++--
 src/oom/oomd.c               |  6 ++----
 src/oom/test-oomd-util.c     | 11 +++++++++++
 4 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/man/systemd-oomd.service.xml b/man/systemd-oomd.service.xml
index 9cb9c6076a9..ebd2467ee23 100644
--- a/man/systemd-oomd.service.xml
+++ b/man/systemd-oomd.service.xml
@@ -56,8 +56,8 @@
 
     <para>You will need a kernel compiled with PSI support. This is available in Linux 4.20 and above.</para>
 
-    <para>The system must also have swap enabled for <command>systemd-oomd</command> to function correctly. With swap
-    enabled, the system spends enough time swapping pages to let <command>systemd-oomd</command> react.
+    <para>It is highly recommended for the system to have swap enabled for <command>systemd-oomd</command> to function
+    optimally. With swap enabled, the system spends enough time swapping pages to let <command>systemd-oomd</command> react.
     Without swap, the system enters a livelocked state much more quickly and may prevent <command>systemd-oomd</command>
     from responding in a reasonable amount of time. See
     <ulink url="https://chrisdown.name/2018/01/02/in-defence-of-swap.html">"In defence of swap: common misconceptions"</ulink>
diff --git a/src/oom/oomd-manager.c b/src/oom/oomd-manager.c
index e8ed6a52739..814fda51f31 100644
--- a/src/oom/oomd-manager.c
+++ b/src/oom/oomd-manager.c
@@ -6,6 +6,7 @@
 #include "cgroup-util.h"
 #include "fd-util.h"
 #include "fileio.h"
+#include "memory-util.h"
 #include "oomd-manager-bus.h"
 #include "oomd-manager.h"
 #include "path-util.h"
@@ -294,9 +295,12 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, vo
                 return log_error_errno(r, "Failed to update monitored memory pressure cgroup contexts");
 
         r = oomd_system_context_acquire("/proc/swaps", &m->system_context);
-        /* If there aren't units depending on swap actions, the only error we exit on is ENOMEM */
-        if (r == -ENOMEM || (r < 0 && !hashmap_isempty(m->monitored_swap_cgroup_contexts)))
+        /* If there aren't units depending on swap actions, the only error we exit on is ENOMEM.
+         * Allow ENOENT in the event that swap is disabled on the system. */
+        if (r == -ENOMEM || (r < 0 && r != -ENOENT && !hashmap_isempty(m->monitored_swap_cgroup_contexts)))
                 return log_error_errno(r, "Failed to acquire system context");
+        else if (r == -ENOENT)
+                zero(m->system_context);
 
         /* If we're still recovering from a kill, don't try to kill again yet */
         if (m->post_action_delay_start > 0) {
diff --git a/src/oom/oomd.c b/src/oom/oomd.c
index 1b0f8ff6c40..1fbcf41492d 100644
--- a/src/oom/oomd.c
+++ b/src/oom/oomd.c
@@ -142,10 +142,8 @@ static int run(int argc, char *argv[]) {
                 return log_error_errno(r, "Failed to get SwapTotal from /proc/meminfo: %m");
 
         r = safe_atollu(swap, &s);
-        if (r < 0)
-                return log_error_errno(r, "Failed to parse SwapTotal from /proc/meminfo: %s: %m", swap);
-        if (s == 0)
-                return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Requires swap to operate");
+        if (r < 0 || s == 0)
+                log_warning("Swap is currently not detected; memory pressure usage will be degraded");
 
         if (!is_pressure_supported())
                 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Pressure Stall Information (PSI) is not supported");
diff --git a/src/oom/test-oomd-util.c b/src/oom/test-oomd-util.c
index 8143408902b..54fe2a03d14 100644
--- a/src/oom/test-oomd-util.c
+++ b/src/oom/test-oomd-util.c
@@ -159,6 +159,11 @@ static void test_oomd_system_context_acquire(void) {
         assert_se(ctx.swap_total == 0);
         assert_se(ctx.swap_used == 0);
 
+        assert_se(write_string_file(path, "Filename                                Type            Size    Used    Priority", WRITE_STRING_FILE_CREATE) == 0);
+        assert_se(oomd_system_context_acquire(path, &ctx) == 0);
+        assert_se(ctx.swap_total == 0);
+        assert_se(ctx.swap_used == 0);
+
         assert_se(write_string_file(path, "Filename                                Type            Size    Used    Priority\n"
                                           "/swapvol/swapfile                       file            18971644        0       -3\n"
                                           "/dev/vda2                               partition       1999868 993780  -2", WRITE_STRING_FILE_CREATE) == 0);
@@ -268,6 +273,12 @@ static void test_oomd_swap_free_below(void) {
                 .swap_used = 3310136 * 1024U,
         };
         assert_se(oomd_swap_free_below(&ctx, 20) == false);
+
+        ctx = (OomdSystemContext) {
+                .swap_total = 0,
+                .swap_used = 0,
+        };
+        assert_se(oomd_swap_free_below(&ctx, 20) == false);
 }
 
 static void test_oomd_sort_cgroups(void) {

From 924c89e9fe95d47b6ad94544bfdd5f087646daea Mon Sep 17 00:00:00 2001
From: Anita Zhang <the.anitazha@gmail.com>
Date: Sun, 24 Jan 2021 01:22:51 -0800
Subject: [PATCH 3/4] oom: fix reclaim activity detection

The reclaim check should have been looking for any reclaim activity within a
larger interval of time rather than only within the past second. On systems
with swap this doesn't seem to have mattered too much, as reclaim would always
increase when memory pressure was elevated. But when testing the no-swap case,
using this larger interval made the difference between oomd killing or not.
---
 src/oom/oomd-manager.c | 7 +++++--
 src/oom/oomd-manager.h | 2 ++
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/oom/oomd-manager.c b/src/oom/oomd-manager.c
index 814fda51f31..3efa629002e 100644
--- a/src/oom/oomd-manager.c
+++ b/src/oom/oomd-manager.c
@@ -302,6 +302,9 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, vo
         else if (r == -ENOENT)
                 zero(m->system_context);
 
+        if (oomd_memory_reclaim(m->monitored_mem_pressure_cgroup_contexts))
+                m->last_reclaim_at = usec_now;
+
         /* If we're still recovering from a kill, don't try to kill again yet */
         if (m->post_action_delay_start > 0) {
                 if (m->post_action_delay_start + POST_ACTION_DELAY_USEC > usec_now)
@@ -314,12 +317,12 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, vo
         if (r == -ENOMEM)
                 return log_error_errno(r, "Failed to check if memory pressure exceeded limits");
         else if (r == 1) {
-                /* Check if there was reclaim activity in the last interval. The concern is the following case:
+                /* Check if there was reclaim activity in the given interval. The concern is the following case:
                  * Pressure climbed, a lot of high-frequency pages were reclaimed, and we killed the offending
                  * cgroup. Even after this, well-behaved processes will fault in recently resident pages and
                  * this will cause pressure to remain high. Thus if there isn't any reclaim pressure, no need
                  * to kill something (it won't help anyways). */
-                if (oomd_memory_reclaim(m->monitored_mem_pressure_cgroup_contexts)) {
+                if ((usec_now - m->last_reclaim_at) <= RECLAIM_DURATION_USEC) {
                         _cleanup_hashmap_free_ Hashmap *candidates = NULL;
                         OomdCGroupContext *t;
 
diff --git a/src/oom/oomd-manager.h b/src/oom/oomd-manager.h
index ede9903e5a6..ee17abced26 100644
--- a/src/oom/oomd-manager.h
+++ b/src/oom/oomd-manager.h
@@ -20,6 +20,7 @@
 #define DEFAULT_MEM_PRESSURE_LIMIT 60
 #define DEFAULT_SWAP_USED_LIMIT 90
 
+#define RECLAIM_DURATION_USEC (30 * USEC_PER_SEC)
 #define POST_ACTION_DELAY_USEC (15 * USEC_PER_SEC)
 
 typedef struct Manager Manager;
@@ -42,6 +43,7 @@ struct Manager {
 
         OomdSystemContext system_context;
 
+        usec_t last_reclaim_at;
         usec_t post_action_delay_start;
 
         sd_event_source *cgroup_context_event_source;

From 2e744a2cd89fc0ea67cf78cfba617b5105a26215 Mon Sep 17 00:00:00 2001
From: Anita Zhang <the.anitazha@gmail.com>
Date: Sun, 24 Jan 2021 01:34:23 -0800
Subject: [PATCH 4/4] oom: update extended test to remove swap gating

---
 test/units/testsuite-56.sh | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/test/units/testsuite-56.sh b/test/units/testsuite-56.sh
index 6e7941a57fc..4dc9d8c7a86 100755
--- a/test/units/testsuite-56.sh
+++ b/test/units/testsuite-56.sh
@@ -6,7 +6,6 @@ systemd-analyze log-level debug
 systemd-analyze log-target console
 
 # Loose checks to ensure the environment has the necessary features for systemd-oomd
-[[ "$( awk '/SwapTotal/ { print $2 }' /proc/meminfo )" != "0" ]] || echo "no swap" >> /skipped
 [[ -e /proc/pressure ]] || echo "no PSI" >> /skipped
 cgroup_type=$(stat -fc %T /sys/fs/cgroup/)
 if [[ "$cgroup_type" != *"cgroup2"* ]] && [[ "$cgroup_type" != *"0x63677270"* ]]; then
@@ -16,8 +15,8 @@ fi
 
 echo "DefaultMemoryPressureDurationSec=5s" >> /etc/systemd/oomd.conf
 
-systemctl start testsuite-56-testbloat.service
 systemctl start testsuite-56-testchill.service
+systemctl start testsuite-56-testbloat.service
 
 # Verify systemd-oomd is monitoring the expected units
 oomctl | grep "/testsuite-56-workload.slice"