dcavalca / rpms / systemd

Forked from rpms/systemd 4 months ago
Clone
Blob Blame History Raw
From 2ccd5198faa8ca65001f90c551924e86bf737a85 Mon Sep 17 00:00:00 2001
From: Anita Zhang <the.anitazha@gmail.com>
Date: Mon, 25 Jan 2021 23:56:23 -0800
Subject: [PATCH 1/7] oom: shorten xattr name

---
 src/core/cgroup.c        | 2 +-
 src/oom/oomd-util.c      | 4 ++--
 src/oom/test-oomd-util.c | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/core/cgroup.c b/src/core/cgroup.c
index c9cf7fb16c6..70282a7abda 100644
--- a/src/core/cgroup.c
+++ b/src/core/cgroup.c
@@ -2746,7 +2746,7 @@ int unit_check_oomd_kill(Unit *u) {
         else if (r == 0)
                 return 0;
 
-        r = cg_get_xattr_malloc(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "user.systemd_oomd_kill", &value);
+        r = cg_get_xattr_malloc(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "user.oomd_kill", &value);
         if (r < 0 && r != -ENODATA)
                 return r;
 
diff --git a/src/oom/oomd-util.c b/src/oom/oomd-util.c
index fcccddb92ea..80b9583440c 100644
--- a/src/oom/oomd-util.c
+++ b/src/oom/oomd-util.c
@@ -201,9 +201,9 @@ int oomd_cgroup_kill(const char *path, bool recurse, bool dry_run) {
         if (r < 0)
                 return r;
 
-        r = increment_oomd_xattr(path, "user.systemd_oomd_kill", set_size(pids_killed));
+        r = increment_oomd_xattr(path, "user.oomd_kill", set_size(pids_killed));
         if (r < 0)
-                log_debug_errno(r, "Failed to set user.systemd_oomd_kill on kill: %m");
+                log_debug_errno(r, "Failed to set user.oomd_kill on kill: %m");
 
         return set_size(pids_killed) != 0;
 }
diff --git a/src/oom/test-oomd-util.c b/src/oom/test-oomd-util.c
index 54fe2a03d14..3dec4f0ff06 100644
--- a/src/oom/test-oomd-util.c
+++ b/src/oom/test-oomd-util.c
@@ -79,7 +79,7 @@ static void test_oomd_cgroup_kill(void) {
                 sleep(2);
                 assert_se(cg_is_empty(SYSTEMD_CGROUP_CONTROLLER, cgroup) == true);
 
-                assert_se(cg_get_xattr_malloc(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.systemd_oomd_kill", &v) >= 0);
+                assert_se(cg_get_xattr_malloc(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.oomd_kill", &v) >= 0);
                 assert_se(memcmp(v, i == 0 ? "2" : "4", 2) == 0);
         }
 }

From d38916b398127e005d0cf131092a99317661ec3c Mon Sep 17 00:00:00 2001
From: Anita Zhang <the.anitazha@gmail.com>
Date: Fri, 5 Feb 2021 03:00:11 -0800
Subject: [PATCH 2/7] oom: wrap reply.path with empty_to_root

---
 src/oom/oomd-manager.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/oom/oomd-manager.c b/src/oom/oomd-manager.c
index 338935b3ec6..825fe38e189 100644
--- a/src/oom/oomd-manager.c
+++ b/src/oom/oomd-manager.c
@@ -93,7 +93,7 @@ static int process_managed_oom_reply(
                                 m->monitored_swap_cgroup_contexts : m->monitored_mem_pressure_cgroup_contexts;
 
                 if (reply.mode == MANAGED_OOM_AUTO) {
-                        (void) oomd_cgroup_context_free(hashmap_remove(monitor_hm, reply.path));
+                        (void) oomd_cgroup_context_free(hashmap_remove(monitor_hm, empty_to_root(reply.path)));
                         continue;
                 }
 
@@ -109,7 +109,7 @@ static int process_managed_oom_reply(
                         }
                 }
 
-                ret = oomd_insert_cgroup_context(NULL, monitor_hm, reply.path);
+                ret = oomd_insert_cgroup_context(NULL, monitor_hm, empty_to_root(reply.path));
                 if (ret == -ENOMEM) {
                         r = ret;
                         goto finish;
@@ -117,7 +117,7 @@ static int process_managed_oom_reply(
 
                 /* Always update the limit in case it was changed. For non-memory pressure detection the value is
                  * ignored so always updating it here is not a problem. */
-                ctx = hashmap_get(monitor_hm, reply.path);
+                ctx = hashmap_get(monitor_hm, empty_to_root(reply.path));
                 if (ctx)
                         ctx->mem_pressure_limit = limit;
         }

From a695da238e7a6bd6eb440facc784aa6fca6c3d90 Mon Sep 17 00:00:00 2001
From: Anita Zhang <the.anitazha@gmail.com>
Date: Wed, 27 Jan 2021 23:43:13 -0800
Subject: [PATCH 3/7] oom: sort by pgscan and memory usage

If 2 candidates have the same pgscan, prioritize the one with the larger
memory usage.
---
 src/oom/oomd-util.c      |  2 +-
 src/oom/oomd-util.h      |  5 ++++-
 src/oom/test-oomd-util.c | 24 ++++++++++++++----------
 3 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/src/oom/oomd-util.c b/src/oom/oomd-util.c
index 80b9583440c..8f138d64c6c 100644
--- a/src/oom/oomd-util.c
+++ b/src/oom/oomd-util.c
@@ -214,7 +214,7 @@ int oomd_kill_by_pgscan(Hashmap *h, const char *prefix, bool dry_run) {
 
         assert(h);
 
-        r = oomd_sort_cgroup_contexts(h, compare_pgscan, prefix, &sorted);
+        r = oomd_sort_cgroup_contexts(h, compare_pgscan_and_memory_usage, prefix, &sorted);
         if (r < 0)
                 return r;
 
diff --git a/src/oom/oomd-util.h b/src/oom/oomd-util.h
index d7a9890e7a2..f0648c5dcdd 100644
--- a/src/oom/oomd-util.h
+++ b/src/oom/oomd-util.h
@@ -61,10 +61,13 @@ bool oomd_memory_reclaim(Hashmap *h);
 /* Returns true if the amount of swap free is below the percentage of swap specified by `threshold_percent`. */
 bool oomd_swap_free_below(const OomdSystemContext *ctx, uint64_t threshold_percent);
 
-static inline int compare_pgscan(OomdCGroupContext * const *c1, OomdCGroupContext * const *c2) {
+static inline int compare_pgscan_and_memory_usage(OomdCGroupContext * const *c1, OomdCGroupContext * const *c2) {
         assert(c1);
         assert(c2);
 
+        if ((*c2)->pgscan == (*c1)->pgscan)
+                return CMP((*c2)->current_memory_usage, (*c1)->current_memory_usage);
+
         return CMP((*c2)->pgscan, (*c1)->pgscan);
 }
 
diff --git a/src/oom/test-oomd-util.c b/src/oom/test-oomd-util.c
index 3dec4f0ff06..a1fe78806a1 100644
--- a/src/oom/test-oomd-util.c
+++ b/src/oom/test-oomd-util.c
@@ -292,16 +292,20 @@ static void test_oomd_sort_cgroups(void) {
         OomdCGroupContext ctx[4] = {
                 { .path = paths[0],
                   .swap_usage = 20,
-                  .pgscan = 60 },
+                  .pgscan = 60,
+                  .current_memory_usage = 10 },
                 { .path = paths[1],
                   .swap_usage = 60,
-                  .pgscan = 40 },
+                  .pgscan = 40,
+                  .current_memory_usage = 20 },
                 { .path = paths[2],
                   .swap_usage = 40,
-                  .pgscan = 20 },
+                  .pgscan = 40,
+                  .current_memory_usage = 40 },
                 { .path = paths[3],
                   .swap_usage = 10,
-                  .pgscan = 80 },
+                  .pgscan = 80,
+                  .current_memory_usage = 10 },
         };
 
         assert_se(h = hashmap_new(&string_hash_ops));
@@ -318,16 +322,16 @@ static void test_oomd_sort_cgroups(void) {
         assert_se(sorted_cgroups[3] == &ctx[3]);
         sorted_cgroups = mfree(sorted_cgroups);
 
-        assert_se(oomd_sort_cgroup_contexts(h, compare_pgscan, NULL, &sorted_cgroups) == 4);
+        assert_se(oomd_sort_cgroup_contexts(h, compare_pgscan_and_memory_usage, NULL, &sorted_cgroups) == 4);
         assert_se(sorted_cgroups[0] == &ctx[3]);
         assert_se(sorted_cgroups[1] == &ctx[0]);
-        assert_se(sorted_cgroups[2] == &ctx[1]);
-        assert_se(sorted_cgroups[3] == &ctx[2]);
+        assert_se(sorted_cgroups[2] == &ctx[2]);
+        assert_se(sorted_cgroups[3] == &ctx[1]);
         sorted_cgroups = mfree(sorted_cgroups);
 
-        assert_se(oomd_sort_cgroup_contexts(h, compare_pgscan, "/herp.slice/derp.scope", &sorted_cgroups) == 2);
-        assert_se(sorted_cgroups[0] == &ctx[1]);
-        assert_se(sorted_cgroups[1] == &ctx[2]);
+        assert_se(oomd_sort_cgroup_contexts(h, compare_pgscan_and_memory_usage, "/herp.slice/derp.scope", &sorted_cgroups) == 2);
+        assert_se(sorted_cgroups[0] == &ctx[2]);
+        assert_se(sorted_cgroups[1] == &ctx[1]);
         assert_se(sorted_cgroups[2] == 0);
         assert_se(sorted_cgroups[3] == 0);
         sorted_cgroups = mfree(sorted_cgroups);

From c73a2c3a6788a2a28899f29579fdd68816f60d59 Mon Sep 17 00:00:00 2001
From: Anita Zhang <the.anitazha@gmail.com>
Date: Thu, 28 Jan 2021 15:47:26 -0800
Subject: [PATCH 4/7] oom: skip over cgroups with no memory usage

---
 src/oom/oomd-util.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/oom/oomd-util.c b/src/oom/oomd-util.c
index 8f138d64c6c..fa8b8b70b19 100644
--- a/src/oom/oomd-util.c
+++ b/src/oom/oomd-util.c
@@ -219,7 +219,8 @@ int oomd_kill_by_pgscan(Hashmap *h, const char *prefix, bool dry_run) {
                 return r;
 
         for (int i = 0; i < r; i++) {
-                if (sorted[i]->pgscan == 0)
+                /* Skip cgroups with no reclaim and memory usage; it won't alleviate pressure */
+                if (sorted[i]->pgscan == 0 && sorted[i]->current_memory_usage == 0)
                         break;
 
                 r = oomd_cgroup_kill(sorted[i]->path, true, dry_run);

From 63d6d9160523a2c1a71e96ff4125a1440d827b32 Mon Sep 17 00:00:00 2001
From: Anita Zhang <the.anitazha@gmail.com>
Date: Tue, 26 Jan 2021 00:57:36 -0800
Subject: [PATCH 5/7] oom: implement avoid/omit xattr support

There may be situations where a cgroup should be protected from killing
or deprioritized as a candidate. In FB oomd xattrs are used to bias oomd
away from supervisor cgroups and towards worker cgroups in container
tasks. On desktops this can be used to protect important units with
unpredictable resource consumption.

The patch allows systemd-oomd to understand 2 xattrs:
"user.oomd_avoid" and "user.oomd_omit". If systemd-oomd sees these
xattrs set to 1 on a candidate cgroup (i.e. while attempting to kill something)
AND the cgroup is owned by root:root, it will either deprioritize the cgroup as
a candidate (avoid) or remove it completely as a candidate (omit).

Usage is restricted to root:root cgroups so that an unprivileged user cannot
rank their own cgroups lower in the kill priority than another user's, or
exempt their own units from systemd-oomd killing altogether.
---
 src/basic/cgroup-util.c                   | 22 +++++++++
 src/basic/cgroup-util.h                   |  1 +
 src/oom/oomd-util.c                       | 35 ++++++++++++---
 src/oom/oomd-util.h                       | 11 +++++
 src/oom/test-oomd-util.c                  | 54 +++++++++++++++++++++--
 test/test-functions                       |  1 +
 test/units/testsuite-56-testmunch.service |  7 +++
 test/units/testsuite-56.sh                | 31 +++++++++++--
 8 files changed, 149 insertions(+), 13 deletions(-)
 create mode 100644 test/units/testsuite-56-testmunch.service

diff --git a/src/basic/cgroup-util.c b/src/basic/cgroup-util.c
index b567822b7ef..45dc1142048 100644
--- a/src/basic/cgroup-util.c
+++ b/src/basic/cgroup-util.c
@@ -1703,6 +1703,28 @@ int cg_get_attribute_as_bool(const char *controller, const char *path, const cha
         return 0;
 }
 
+
+int cg_get_owner(const char *controller, const char *path, uid_t *ret_uid, gid_t *ret_gid) {
+        _cleanup_free_ char *f = NULL;
+        struct stat stats;
+        int r;
+
+        assert(ret_uid);
+        assert(ret_gid);
+
+        r = cg_get_path(controller, path, NULL, &f);
+        if (r < 0)
+                return r;
+
+        r = stat(f, &stats);
+        if (r < 0)
+                return -errno;
+
+        *ret_uid = stats.st_uid;
+        *ret_gid = stats.st_gid;
+        return 0;
+}
+
 int cg_get_keyed_attribute_full(
                 const char *controller,
                 const char *path,
diff --git a/src/basic/cgroup-util.h b/src/basic/cgroup-util.h
index bdc0d0d086c..63bd25f703e 100644
--- a/src/basic/cgroup-util.h
+++ b/src/basic/cgroup-util.h
@@ -212,6 +212,7 @@ int cg_get_attribute_as_uint64(const char *controller, const char *path, const c
 int cg_get_attribute_as_bool(const char *controller, const char *path, const char *attribute, bool *ret);
 
 int cg_set_access(const char *controller, const char *path, uid_t uid, gid_t gid);
+int cg_get_owner(const char *controller, const char *path, uid_t *ret_uid, gid_t *ret_gid);
 
 int cg_set_xattr(const char *controller, const char *path, const char *name, const void *value, size_t size, int flags);
 int cg_get_xattr(const char *controller, const char *path, const char *name, void *value, size_t size);
diff --git a/src/oom/oomd-util.c b/src/oom/oomd-util.c
index fa8b8b70b19..db6383bf436 100644
--- a/src/oom/oomd-util.c
+++ b/src/oom/oomd-util.c
@@ -159,7 +159,8 @@ int oomd_sort_cgroup_contexts(Hashmap *h, oomd_compare_t compare_func, const cha
                 return -ENOMEM;
 
         HASHMAP_FOREACH(item, h) {
-                if (item->path && prefix && !path_startswith(item->path, prefix))
+                /* Skip over cgroups that are not valid candidates or are explicitly marked for omission */
+                if ((item->path && prefix && !path_startswith(item->path, prefix)) || item->omit)
                         continue;
 
                 sorted[k++] = item;
@@ -219,9 +220,10 @@ int oomd_kill_by_pgscan(Hashmap *h, const char *prefix, bool dry_run) {
                 return r;
 
         for (int i = 0; i < r; i++) {
-                /* Skip cgroups with no reclaim and memory usage; it won't alleviate pressure */
+                /* Skip cgroups with no reclaim and memory usage; it won't alleviate pressure. */
+                /* Don't break since there might be "avoid" cgroups at the end. */
                 if (sorted[i]->pgscan == 0 && sorted[i]->current_memory_usage == 0)
-                        break;
+                        continue;
 
                 r = oomd_cgroup_kill(sorted[i]->path, true, dry_run);
                 if (r > 0 || r == -ENOMEM)
@@ -244,8 +246,10 @@ int oomd_kill_by_swap_usage(Hashmap *h, bool dry_run) {
         /* Try to kill cgroups with non-zero swap usage until we either succeed in
          * killing or we get to a cgroup with no swap usage. */
         for (int i = 0; i < r; i++) {
+                /* Skip over cgroups with no resource usage. Don't break since there might be "avoid"
+                 * cgroups at the end. */
                 if (sorted[i]->swap_usage == 0)
-                        break;
+                        continue;
 
                 r = oomd_cgroup_kill(sorted[i]->path, true, dry_run);
                 if (r > 0 || r == -ENOMEM)
@@ -257,8 +261,10 @@ int oomd_kill_by_swap_usage(Hashmap *h, bool dry_run) {
 
 int oomd_cgroup_context_acquire(const char *path, OomdCGroupContext **ret) {
         _cleanup_(oomd_cgroup_context_freep) OomdCGroupContext *ctx = NULL;
-        _cleanup_free_ char *p = NULL, *val = NULL;
+        _cleanup_free_ char *p = NULL, *val = NULL, *avoid_val = NULL, *omit_val = NULL;
         bool is_root;
+        uid_t uid;
+        gid_t gid;
         int r;
 
         assert(path);
@@ -278,6 +284,25 @@ int oomd_cgroup_context_acquire(const char *path, OomdCGroupContext **ret) {
         if (r < 0)
                 return log_debug_errno(r, "Error parsing memory pressure from %s: %m", p);
 
+        r = cg_get_owner(SYSTEMD_CGROUP_CONTROLLER, path, &uid, &gid);
+        if (r < 0)
+                log_debug_errno(r, "Failed to get owner/group from %s: %m", path);
+        else if (uid == 0 && gid == 0) {
+                /* Ignore most errors when reading the xattr since it is usually unset and cgroup xattrs are only used
+                 * as an optional feature of systemd-oomd (and the system might not even support them). */
+                r = cg_get_xattr_malloc(SYSTEMD_CGROUP_CONTROLLER, path, "user.oomd_avoid", &avoid_val);
+                if (r >= 0 && streq(avoid_val, "1"))
+                        ctx->avoid = true;
+                else if (r == -ENOMEM)
+                        return r;
+
+                r = cg_get_xattr_malloc(SYSTEMD_CGROUP_CONTROLLER, path, "user.oomd_omit", &omit_val);
+                if (r >= 0 && streq(omit_val, "1"))
+                        ctx->omit = true;
+                else if (r == -ENOMEM)
+                        return r;
+        }
+
         if (is_root) {
                 r = procfs_memory_get_used(&ctx->current_memory_usage);
                 if (r < 0)
diff --git a/src/oom/oomd-util.h b/src/oom/oomd-util.h
index f0648c5dcdd..ab6a8da1ef6 100644
--- a/src/oom/oomd-util.h
+++ b/src/oom/oomd-util.h
@@ -29,6 +29,9 @@ struct OomdCGroupContext {
         uint64_t last_pgscan;
         uint64_t pgscan;
 
+        bool avoid;
+        bool omit;
+
         /* These are only used by oomd_pressure_above for acting on high memory pressure. */
         loadavg_t mem_pressure_limit;
         usec_t mem_pressure_duration_usec;
@@ -61,10 +64,15 @@ bool oomd_memory_reclaim(Hashmap *h);
 /* Returns true if the amount of swap free is below the percentage of swap specified by `threshold_percent`. */
 bool oomd_swap_free_below(const OomdSystemContext *ctx, uint64_t threshold_percent);
 
+/* The compare functions will sort from largest to smallest, putting all the contexts with "avoid" at the end
+ * (after the smallest values). */
 static inline int compare_pgscan_and_memory_usage(OomdCGroupContext * const *c1, OomdCGroupContext * const *c2) {
         assert(c1);
         assert(c2);
 
+        if ((*c1)->avoid != (*c2)->avoid)
+                return CMP((*c1)->avoid, (*c2)->avoid);
+
         if ((*c2)->pgscan == (*c1)->pgscan)
                 return CMP((*c2)->current_memory_usage, (*c1)->current_memory_usage);
 
@@ -75,6 +83,9 @@ static inline int compare_swap_usage(OomdCGroupContext * const *c1, OomdCGroupCo
         assert(c1);
         assert(c2);
 
+        if ((*c1)->avoid != (*c2)->avoid)
+                return CMP((*c1)->avoid, (*c2)->avoid);
+
         return CMP((*c2)->swap_usage, (*c1)->swap_usage);
 }
 
diff --git a/src/oom/test-oomd-util.c b/src/oom/test-oomd-util.c
index a1fe78806a1..193edee0eba 100644
--- a/src/oom/test-oomd-util.c
+++ b/src/oom/test-oomd-util.c
@@ -89,6 +89,8 @@ static void test_oomd_cgroup_context_acquire_and_insert(void) {
         _cleanup_(oomd_cgroup_context_freep) OomdCGroupContext *ctx = NULL;
         _cleanup_free_ char *cgroup = NULL;
         OomdCGroupContext *c1, *c2;
+        bool test_xattrs;
+        int r;
 
         if (geteuid() != 0)
                 return (void) log_tests_skipped("not root");
@@ -101,6 +103,16 @@ static void test_oomd_cgroup_context_acquire_and_insert(void) {
 
         assert_se(cg_pid_get_path(NULL, 0, &cgroup) >= 0);
 
+        /* If we don't have permissions to set xattrs we're likely in a userns or missing capabilities
+         * so skip the xattr portions of the test. */
+        r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.oomd_test", "1", 1, 0);
+        test_xattrs = !ERRNO_IS_PRIVILEGE(r) && !ERRNO_IS_NOT_SUPPORTED(r);
+
+        if (test_xattrs) {
+                assert_se(cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.oomd_omit", "1", 1, 0) >= 0);
+                assert_se(cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.oomd_avoid", "1", 1, 0) >= 0);
+        }
+
         assert_se(oomd_cgroup_context_acquire(cgroup, &ctx) == 0);
 
         assert_se(streq(ctx->path, cgroup));
@@ -110,12 +122,21 @@ static void test_oomd_cgroup_context_acquire_and_insert(void) {
         assert_se(ctx->swap_usage == 0);
         assert_se(ctx->last_pgscan == 0);
         assert_se(ctx->pgscan == 0);
+        if (test_xattrs) {
+                assert_se(ctx->omit == true);
+                assert_se(ctx->avoid == true);
+        } else {
+                assert_se(ctx->omit == false);
+                assert_se(ctx->avoid == false);
+        }
         ctx = oomd_cgroup_context_free(ctx);
 
         /* Test the root cgroup */
         assert_se(oomd_cgroup_context_acquire("", &ctx) == 0);
         assert_se(streq(ctx->path, "/"));
         assert_se(ctx->current_memory_usage > 0);
+        assert_se(ctx->omit == false);
+        assert_se(ctx->avoid == false);
 
         /* Test hashmap inserts */
         assert_se(h1 = hashmap_new(&oomd_cgroup_ctx_hash_ops));
@@ -137,6 +158,15 @@ static void test_oomd_cgroup_context_acquire_and_insert(void) {
         assert_se(c2->last_pgscan == 5555);
         assert_se(c2->mem_pressure_limit == 6789);
         assert_se(c2->last_hit_mem_pressure_limit == 42);
+
+        /* Assert that avoid/omit are not set if the cgroup is not owned by root */
+        if (test_xattrs) {
+                ctx = oomd_cgroup_context_free(ctx);
+                assert_se(cg_set_access(SYSTEMD_CGROUP_CONTROLLER, cgroup, 65534, 65534) >= 0);
+                assert_se(oomd_cgroup_context_acquire(cgroup, &ctx) == 0);
+                assert_se(ctx->omit == false);
+                assert_se(ctx->avoid == false);
+        }
 }
 
 static void test_oomd_system_context_acquire(void) {
@@ -287,9 +317,11 @@ static void test_oomd_sort_cgroups(void) {
         char **paths = STRV_MAKE("/herp.slice",
                                  "/herp.slice/derp.scope",
                                  "/herp.slice/derp.scope/sheep.service",
-                                 "/zupa.slice");
+                                 "/zupa.slice",
+                                 "/omitted.slice",
+                                 "/avoid.slice");
 
-        OomdCGroupContext ctx[4] = {
+        OomdCGroupContext ctx[6] = {
                 { .path = paths[0],
                   .swap_usage = 20,
                   .pgscan = 60,
@@ -306,6 +338,14 @@ static void test_oomd_sort_cgroups(void) {
                   .swap_usage = 10,
                   .pgscan = 80,
                   .current_memory_usage = 10 },
+                { .path = paths[4],
+                  .swap_usage = 90,
+                  .pgscan = 100,
+                  .omit = true },
+                { .path = paths[5],
+                  .swap_usage = 99,
+                  .pgscan = 200,
+                  .avoid = true },
         };
 
         assert_se(h = hashmap_new(&string_hash_ops));
@@ -314,19 +354,23 @@ static void test_oomd_sort_cgroups(void) {
         assert_se(hashmap_put(h, "/herp.slice/derp.scope", &ctx[1]) >= 0);
         assert_se(hashmap_put(h, "/herp.slice/derp.scope/sheep.service", &ctx[2]) >= 0);
         assert_se(hashmap_put(h, "/zupa.slice", &ctx[3]) >= 0);
+        assert_se(hashmap_put(h, "/omitted.slice", &ctx[4]) >= 0);
+        assert_se(hashmap_put(h, "/avoid.slice", &ctx[5]) >= 0);
 
-        assert_se(oomd_sort_cgroup_contexts(h, compare_swap_usage, NULL, &sorted_cgroups) == 4);
+        assert_se(oomd_sort_cgroup_contexts(h, compare_swap_usage, NULL, &sorted_cgroups) == 5);
         assert_se(sorted_cgroups[0] == &ctx[1]);
         assert_se(sorted_cgroups[1] == &ctx[2]);
         assert_se(sorted_cgroups[2] == &ctx[0]);
         assert_se(sorted_cgroups[3] == &ctx[3]);
+        assert_se(sorted_cgroups[4] == &ctx[5]);
         sorted_cgroups = mfree(sorted_cgroups);
 
-        assert_se(oomd_sort_cgroup_contexts(h, compare_pgscan_and_memory_usage, NULL, &sorted_cgroups) == 4);
+        assert_se(oomd_sort_cgroup_contexts(h, compare_pgscan_and_memory_usage, NULL, &sorted_cgroups) == 5);
         assert_se(sorted_cgroups[0] == &ctx[3]);
         assert_se(sorted_cgroups[1] == &ctx[0]);
         assert_se(sorted_cgroups[2] == &ctx[2]);
         assert_se(sorted_cgroups[3] == &ctx[1]);
+        assert_se(sorted_cgroups[4] == &ctx[5]);
         sorted_cgroups = mfree(sorted_cgroups);
 
         assert_se(oomd_sort_cgroup_contexts(h, compare_pgscan_and_memory_usage, "/herp.slice/derp.scope", &sorted_cgroups) == 2);
@@ -334,6 +378,8 @@ static void test_oomd_sort_cgroups(void) {
         assert_se(sorted_cgroups[1] == &ctx[1]);
         assert_se(sorted_cgroups[2] == 0);
         assert_se(sorted_cgroups[3] == 0);
+        assert_se(sorted_cgroups[4] == 0);
+        assert_se(sorted_cgroups[5] == 0);
         sorted_cgroups = mfree(sorted_cgroups);
 }
 
diff --git a/test/test-functions b/test/test-functions
index df6022982c2..6996cd74752 100644
--- a/test/test-functions
+++ b/test/test-functions
@@ -124,6 +124,7 @@ BASICTOOLS=(
     rmdir
     sed
     seq
+    setfattr
     setfont
     setsid
     sfdisk
diff --git a/test/units/testsuite-56-testmunch.service b/test/units/testsuite-56-testmunch.service
new file mode 100644
index 00000000000..b4b925a7af0
--- /dev/null
+++ b/test/units/testsuite-56-testmunch.service
@@ -0,0 +1,7 @@
+[Unit]
+Description=Create some memory pressure
+
+[Service]
+MemoryHigh=2M
+Slice=testsuite-56-workload.slice
+ExecStart=/usr/lib/systemd/tests/testdata/units/testsuite-56-slowgrowth.sh
diff --git a/test/units/testsuite-56.sh b/test/units/testsuite-56.sh
index 8b01fe37ed4..88c185b8869 100755
--- a/test/units/testsuite-56.sh
+++ b/test/units/testsuite-56.sh
@@ -23,20 +23,43 @@ oomctl | grep "/testsuite-56-workload.slice"
 oomctl | grep "1.00%"
 oomctl | grep "Default Memory Pressure Duration: 5s"
 
-# systemd-oomd watches for elevated pressure for 30 seconds before acting.
-# It can take time to build up pressure so either wait 5 minutes or for the service to fail.
-timeout=$(date -ud "5 minutes" +%s)
+# systemd-oomd watches for elevated pressure for 5 seconds before acting.
+# It can take time to build up pressure so either wait 2 minutes or for the service to fail.
+timeout=$(date -ud "2 minutes" +%s)
 while [[ $(date -u +%s) -le $timeout ]]; do
     if ! systemctl status testsuite-56-testbloat.service; then
         break
     fi
-    sleep 15
+    sleep 5
 done
 
 # testbloat should be killed and testchill should be fine
 if systemctl status testsuite-56-testbloat.service; then exit 42; fi
 if ! systemctl status testsuite-56-testchill.service; then exit 24; fi
 
+# only run this portion of the test if we can set xattrs
+if setfattr -n user.xattr_test -v 1 /sys/fs/cgroup/; then
+    sleep 120 # wait for systemd-oomd kill cool down and elevated memory pressure to come down
+
+    systemctl start testsuite-56-testchill.service
+    systemctl start testsuite-56-testmunch.service
+    systemctl start testsuite-56-testbloat.service
+    setfattr -n user.oomd_avoid -v 1 /sys/fs/cgroup/testsuite.slice/testsuite-56.slice/testsuite-56-workload.slice/testsuite-56-testbloat.service
+
+    timeout=$(date -ud "2 minutes" +%s)
+    while [[ $(date -u +%s) -le $timeout ]]; do
+        if ! systemctl status testsuite-56-testmunch.service; then
+            break
+        fi
+        sleep 5
+    done
+
+    # testmunch should be killed since testbloat had the avoid xattr on it
+    if ! systemctl status testsuite-56-testbloat.service; then exit 25; fi
+    if systemctl status testsuite-56-testmunch.service; then exit 43; fi
+    if ! systemctl status testsuite-56-testchill.service; then exit 24; fi
+fi
+
 systemd-analyze log-level info
 
 echo OK > /testok

From d87ecfecdb6fb77097f843888e2a05945b6b396b Mon Sep 17 00:00:00 2001
From: Anita Zhang <the.anitazha@gmail.com>
Date: Thu, 28 Jan 2021 02:31:44 -0800
Subject: [PATCH 6/7] oom: add unit file settings for oomd avoid/omit xattrs

---
 docs/TRANSIENT-SETTINGS.md                  |  1 +
 src/core/cgroup.c                           | 58 ++++++++++++++++++---
 src/core/cgroup.h                           | 15 ++++++
 src/core/dbus-cgroup.c                      | 22 ++++++++
 src/core/execute.c                          |  4 ++
 src/core/load-fragment-gperf.gperf.m4       |  1 +
 src/core/load-fragment.c                    |  1 +
 src/core/load-fragment.h                    |  1 +
 src/shared/bus-unit-util.c                  |  3 +-
 src/test/test-tables.c                      |  1 +
 test/fuzz/fuzz-unit-file/directives.service |  4 ++
 test/units/testsuite-56.sh                  |  8 ++-
 12 files changed, 109 insertions(+), 10 deletions(-)

diff --git a/docs/TRANSIENT-SETTINGS.md b/docs/TRANSIENT-SETTINGS.md
index 50370602543..9f69a3162a0 100644
--- a/docs/TRANSIENT-SETTINGS.md
+++ b/docs/TRANSIENT-SETTINGS.md
@@ -273,6 +273,7 @@ All cgroup/resource control settings are available for transient units
 ✓ ManagedOOMSwap=
 ✓ ManagedOOMMemoryPressure=
 ✓ ManagedOOMMemoryPressureLimit=
+✓ ManagedOOMPreference=
 ```
 
 ## Process Killing Settings
diff --git a/src/core/cgroup.c b/src/core/cgroup.c
index 70282a7abda..833b434b555 100644
--- a/src/core/cgroup.c
+++ b/src/core/cgroup.c
@@ -131,6 +131,7 @@ void cgroup_context_init(CGroupContext *c) {
 
                 .moom_swap = MANAGED_OOM_AUTO,
                 .moom_mem_pressure = MANAGED_OOM_AUTO,
+                .moom_preference = MANAGED_OOM_PREFERENCE_NONE,
         };
 }
 
@@ -417,7 +418,8 @@ void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) {
                 "%sDelegate: %s\n"
                 "%sManagedOOMSwap: %s\n"
                 "%sManagedOOMMemoryPressure: %s\n"
-                "%sManagedOOMMemoryPressureLimit: %" PRIu32 ".%02" PRIu32 "%%\n",
+                "%sManagedOOMMemoryPressureLimit: %" PRIu32 ".%02" PRIu32 "%%\n"
+                "%sManagedOOMPreference: %s\n",
                 prefix, yes_no(c->cpu_accounting),
                 prefix, yes_no(c->io_accounting),
                 prefix, yes_no(c->blockio_accounting),
@@ -450,7 +452,8 @@ void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) {
                 prefix, yes_no(c->delegate),
                 prefix, managed_oom_mode_to_string(c->moom_swap),
                 prefix, managed_oom_mode_to_string(c->moom_mem_pressure),
-                prefix, c->moom_mem_pressure_limit_permyriad / 100, c->moom_mem_pressure_limit_permyriad % 100);
+                prefix, c->moom_mem_pressure_limit_permyriad / 100, c->moom_mem_pressure_limit_permyriad % 100,
+                prefix, managed_oom_preference_to_string(c->moom_preference));
 
         if (c->delegate) {
                 _cleanup_free_ char *t = NULL;
@@ -600,6 +603,35 @@ int cgroup_add_device_allow(CGroupContext *c, const char *dev, const char *mode)
 UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(memory_low);
 UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(memory_min);
 
+void cgroup_oomd_xattr_apply(Unit *u, const char *cgroup_path) {
+        CGroupContext *c;
+        int r;
+
+        assert(u);
+
+        c = unit_get_cgroup_context(u);
+        if (!c)
+                return;
+        /* Drop both oomd flags, then set at most one according to moom_preference; failures are only logged. */
+        r = cg_remove_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup_path, "user.oomd_avoid");
+        if (r < 0 && r != -ENODATA) /* -ENODATA: the flag was not set to begin with */
+                log_unit_debug_errno(u, r, "Failed to remove oomd_avoid flag on control group %s, ignoring: %m", cgroup_path);
+
+        r = cg_remove_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup_path, "user.oomd_omit");
+        if (r < 0 && r != -ENODATA) /* -ENODATA: the flag was not set to begin with */
+                log_unit_debug_errno(u, r, "Failed to remove oomd_omit flag on control group %s, ignoring: %m", cgroup_path);
+
+        if (c->moom_preference == MANAGED_OOM_PREFERENCE_AVOID) {
+                r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup_path, "user.oomd_avoid", "1", 1, 0);
+                if (r < 0)
+                        log_unit_debug_errno(u, r, "Failed to set oomd_avoid flag on control group %s, ignoring: %m", cgroup_path);
+        } else if (c->moom_preference == MANAGED_OOM_PREFERENCE_OMIT) {
+                r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup_path, "user.oomd_omit", "1", 1, 0);
+                if (r < 0)
+                        log_unit_debug_errno(u, r, "Failed to set oomd_omit flag on control group %s, ignoring: %m", cgroup_path);
+        }
+}
+
 static void cgroup_xattr_apply(Unit *u) {
         char ids[SD_ID128_STRING_MAX];
         int r;
@@ -630,6 +662,8 @@ static void cgroup_xattr_apply(Unit *u) {
                 if (r != -ENODATA)
                         log_unit_debug_errno(u, r, "Failed to remove delegate flag on control group %s, ignoring: %m", u->cgroup_path);
         }
+
+        cgroup_oomd_xattr_apply(u, u->cgroup_path);
 }
 
 static int lookup_block_device(const char *p, dev_t *ret) {
@@ -3737,12 +3771,6 @@ int unit_cgroup_freezer_action(Unit *u, FreezerAction action) {
         return 1;
 }
 
-static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
-        [CGROUP_DEVICE_POLICY_AUTO]   = "auto",
-        [CGROUP_DEVICE_POLICY_CLOSED] = "closed",
-        [CGROUP_DEVICE_POLICY_STRICT] = "strict",
-};
-
 int unit_get_cpuset(Unit *u, CPUSet *cpus, const char *name) {
         _cleanup_free_ char *v = NULL;
         int r;
@@ -3771,6 +3799,12 @@ int unit_get_cpuset(Unit *u, CPUSet *cpus, const char *name) {
         return parse_cpu_set_full(v, cpus, false, NULL, NULL, 0, NULL);
 }
 
+static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
+        [CGROUP_DEVICE_POLICY_AUTO]   = "auto",
+        [CGROUP_DEVICE_POLICY_CLOSED] = "closed",
+        [CGROUP_DEVICE_POLICY_STRICT] = "strict",
+};
+
 DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);
 
 static const char* const freezer_action_table[_FREEZER_ACTION_MAX] = {
@@ -3779,3 +3813,11 @@ static const char* const freezer_action_table[_FREEZER_ACTION_MAX] = {
 };
 
 DEFINE_STRING_TABLE_LOOKUP(freezer_action, FreezerAction);
+
+static const char* const managed_oom_preference_table[_MANAGED_OOM_PREFERENCE_MAX] = {
+        [MANAGED_OOM_PREFERENCE_NONE] = "none",
+        [MANAGED_OOM_PREFERENCE_AVOID] = "avoid",
+        [MANAGED_OOM_PREFERENCE_OMIT] = "omit",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(managed_oom_preference, ManagedOOMPreference);
diff --git a/src/core/cgroup.h b/src/core/cgroup.h
index 9fbfabbb7e3..7d9ab4ae6b8 100644
--- a/src/core/cgroup.h
+++ b/src/core/cgroup.h
@@ -94,6 +94,15 @@ struct CGroupBlockIODeviceBandwidth {
         uint64_t wbps;
 };
 
+typedef enum ManagedOOMPreference {
+        MANAGED_OOM_PREFERENCE_NONE,  /* neither oomd xattr is set on the cgroup */
+        MANAGED_OOM_PREFERENCE_AVOID, /* cgroup gets the "user.oomd_avoid" xattr */
+        MANAGED_OOM_PREFERENCE_OMIT,  /* cgroup gets the "user.oomd_omit" xattr */
+
+        _MANAGED_OOM_PREFERENCE_MAX,  /* count of the values above; sizes the string table */
+        _MANAGED_OOM_PREFERENCE_INVALID = -1 /* negative sentinel; from_string() results are tested with < 0 */
+} ManagedOOMPreference;
+
 struct CGroupContext {
         bool cpu_accounting;
         bool io_accounting;
@@ -164,6 +173,7 @@ struct CGroupContext {
         ManagedOOMMode moom_swap;
         ManagedOOMMode moom_mem_pressure;
         uint32_t moom_mem_pressure_limit_permyriad;
+        ManagedOOMPreference moom_preference;
 };
 
 /* Used when querying IP accounting data */
@@ -204,6 +214,8 @@ void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockI
 
 int cgroup_add_device_allow(CGroupContext *c, const char *dev, const char *mode);
 
+void cgroup_oomd_xattr_apply(Unit *u, const char *cgroup_path);
+
 CGroupMask unit_get_own_mask(Unit *u);
 CGroupMask unit_get_delegate_mask(Unit *u);
 CGroupMask unit_get_members_mask(Unit *u);
@@ -294,3 +306,6 @@ int unit_cgroup_freezer_action(Unit *u, FreezerAction action);
 
 const char* freezer_action_to_string(FreezerAction a) _const_;
 FreezerAction freezer_action_from_string(const char *s) _pure_;
+
+const char* managed_oom_preference_to_string(ManagedOOMPreference a) _const_;
+ManagedOOMPreference managed_oom_preference_from_string(const char *s) _pure_;
diff --git a/src/core/dbus-cgroup.c b/src/core/dbus-cgroup.c
index 6f309feb236..0b2d945283e 100644
--- a/src/core/dbus-cgroup.c
+++ b/src/core/dbus-cgroup.c
@@ -21,6 +21,7 @@ BUS_DEFINE_PROPERTY_GET(bus_property_get_tasks_max, "t", TasksMax, tasks_max_res
 
 static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_cgroup_device_policy, cgroup_device_policy, CGroupDevicePolicy);
 static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_managed_oom_mode, managed_oom_mode, ManagedOOMMode);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_managed_oom_preference, managed_oom_preference, ManagedOOMPreference);
 
 static int property_get_cgroup_mask(
                 sd_bus *bus,
@@ -395,6 +396,7 @@ const sd_bus_vtable bus_cgroup_vtable[] = {
         SD_BUS_PROPERTY("ManagedOOMSwap", "s", property_get_managed_oom_mode, offsetof(CGroupContext, moom_swap), 0),
         SD_BUS_PROPERTY("ManagedOOMMemoryPressure", "s", property_get_managed_oom_mode, offsetof(CGroupContext, moom_mem_pressure), 0),
         SD_BUS_PROPERTY("ManagedOOMMemoryPressureLimitPermyriad", "u", NULL, offsetof(CGroupContext, moom_mem_pressure_limit_permyriad), 0),
+        SD_BUS_PROPERTY("ManagedOOMPreference", "s", property_get_managed_oom_preference, offsetof(CGroupContext, moom_preference), 0),
         SD_BUS_VTABLE_END
 };
 
@@ -1720,6 +1722,26 @@ int bus_cgroup_set_property(
                 return 1;
         }
 
+        if (streq(name, "ManagedOOMPreference")) {
+                ManagedOOMPreference p;
+                const char *pref;
+
+                r = sd_bus_message_read(message, "s", &pref);
+                if (r < 0)
+                        return r;
+
+                p = managed_oom_preference_from_string(pref);
+                if (p < 0)
+                        return -EINVAL;
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        c->moom_preference = p;
+                        unit_write_settingf(u, flags, name, "ManagedOOMPreference=%s", pref);
+                }
+
+                return 1;
+        }
+
         if (streq(name, "DisableControllers") || (u->transient && u->load_state == UNIT_STUB))
                 return bus_cgroup_set_transient_property(u, c, name, message, flags, error);
 
diff --git a/src/core/execute.c b/src/core/execute.c
index b7d78f2197e..0368582884c 100644
--- a/src/core/execute.c
+++ b/src/core/execute.c
@@ -4701,6 +4701,10 @@ int exec_spawn(Unit *unit,
                         r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
                         if (r < 0)
                                 return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);
+
+                        /* Normally we would not propagate the oomd xattrs to children but since we created this
+                         * sub-cgroup internally we should do it. */
+                        cgroup_oomd_xattr_apply(unit, subcgroup_path);
                 }
         }
 
diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4
index 81f4561a572..dbcbe645934 100644
--- a/src/core/load-fragment-gperf.gperf.m4
+++ b/src/core/load-fragment-gperf.gperf.m4
@@ -230,6 +230,7 @@ $1.IPEgressFilterPath,                   config_parse_ip_filter_bpf_progs,
 $1.ManagedOOMSwap,                       config_parse_managed_oom_mode,               0,                                  offsetof($1, cgroup_context.moom_swap)
 $1.ManagedOOMMemoryPressure,             config_parse_managed_oom_mode,               0,                                  offsetof($1, cgroup_context.moom_mem_pressure)
 $1.ManagedOOMMemoryPressureLimit,        config_parse_managed_oom_mem_pressure_limit, 0,                                  offsetof($1, cgroup_context.moom_mem_pressure_limit_permyriad)
+$1.ManagedOOMPreference,                 config_parse_managed_oom_preference,         0,                                  offsetof($1, cgroup_context.moom_preference)
 $1.NetClass,                             config_parse_warn_compat,                    DISABLED_LEGACY,                    0'
 )m4_dnl
 Unit.Description,                        config_parse_unit_string_printf,             0,                                  offsetof(Unit, description)
diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c
index 06b71aaf157..c6b017556f9 100644
--- a/src/core/load-fragment.c
+++ b/src/core/load-fragment.c
@@ -133,6 +133,7 @@ DEFINE_CONFIG_PARSE_ENUM(config_parse_service_restart, service_restart, ServiceR
 DEFINE_CONFIG_PARSE_ENUM(config_parse_service_timeout_failure_mode, service_timeout_failure_mode, ServiceTimeoutFailureMode, "Failed to parse timeout failure mode");
 DEFINE_CONFIG_PARSE_ENUM(config_parse_socket_bind, socket_address_bind_ipv6_only_or_bool, SocketAddressBindIPv6Only, "Failed to parse bind IPv6 only value");
 DEFINE_CONFIG_PARSE_ENUM(config_parse_oom_policy, oom_policy, OOMPolicy, "Failed to parse OOM policy");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_managed_oom_preference, managed_oom_preference, ManagedOOMPreference, "Failed to parse ManagedOOMPreference=");
 DEFINE_CONFIG_PARSE_ENUM_WITH_DEFAULT(config_parse_ip_tos, ip_tos, int, -1, "Failed to parse IP TOS value");
 DEFINE_CONFIG_PARSE_PTR(config_parse_blockio_weight, cg_blkio_weight_parse, uint64_t, "Invalid block IO weight");
 DEFINE_CONFIG_PARSE_PTR(config_parse_cg_weight, cg_weight_parse, uint64_t, "Invalid weight");
diff --git a/src/core/load-fragment.h b/src/core/load-fragment.h
index 6b2175cd2af..e4a5cb79869 100644
--- a/src/core/load-fragment.h
+++ b/src/core/load-fragment.h
@@ -78,6 +78,7 @@ CONFIG_PARSER_PROTOTYPE(config_parse_tasks_max);
 CONFIG_PARSER_PROTOTYPE(config_parse_delegate);
 CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_mode);
 CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_mem_pressure_limit);
+CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_preference);
 CONFIG_PARSER_PROTOTYPE(config_parse_device_policy);
 CONFIG_PARSER_PROTOTYPE(config_parse_device_allow);
 CONFIG_PARSER_PROTOTYPE(config_parse_io_device_latency);
diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c
index 84f57d94d23..5bbaa07dd1c 100644
--- a/src/shared/bus-unit-util.c
+++ b/src/shared/bus-unit-util.c
@@ -435,7 +435,8 @@ static int bus_append_cgroup_property(sd_bus_message *m, const char *field, cons
         if (STR_IN_SET(field, "DevicePolicy",
                               "Slice",
                               "ManagedOOMSwap",
-                              "ManagedOOMMemoryPressure"))
+                              "ManagedOOMMemoryPressure",
+                              "ManagedOOMPreference"))
                 return bus_append_string(m, field, eq);
 
         if (STR_IN_SET(field, "ManagedOOMMemoryPressureLimit")) {
diff --git a/src/test/test-tables.c b/src/test/test-tables.c
index 641cadec858..cc93bbbc749 100644
--- a/src/test/test-tables.c
+++ b/src/test/test-tables.c
@@ -73,6 +73,7 @@ int main(int argc, char **argv) {
         test_table(log_target, LOG_TARGET);
         test_table(mac_address_policy, MAC_ADDRESS_POLICY);
         test_table(managed_oom_mode, MANAGED_OOM_MODE);
+        test_table(managed_oom_preference, MANAGED_OOM_PREFERENCE);
         test_table(manager_state, MANAGER_STATE);
         test_table(manager_timestamp, MANAGER_TIMESTAMP);
         test_table(mount_exec_command, MOUNT_EXEC_COMMAND);
diff --git a/test/fuzz/fuzz-unit-file/directives.service b/test/fuzz/fuzz-unit-file/directives.service
index 15fa556dd64..0c7ded6786a 100644
--- a/test/fuzz/fuzz-unit-file/directives.service
+++ b/test/fuzz/fuzz-unit-file/directives.service
@@ -138,6 +138,10 @@ MakeDirectory=
 Mark=
 MaxConnections=
 MaxConnectionsPerSource=
+ManagedOOMSwap=
+ManagedOOMMemoryPressure=
+ManagedOOMMemoryPressureLimit=
+ManagedOOMPreference=
 MemoryAccounting=
 MemoryHigh=
 MemoryLimit=
diff --git a/test/units/testsuite-56.sh b/test/units/testsuite-56.sh
index 88c185b8869..1884f814689 100755
--- a/test/units/testsuite-56.sh
+++ b/test/units/testsuite-56.sh
@@ -13,6 +13,8 @@ if [[ "$cgroup_type" != *"cgroup2"* ]] && [[ "$cgroup_type" != *"0x63677270"* ]]
 fi
 [[ -e /skipped ]] && exit 0 || true
 
+rm -rf /etc/systemd/system/testsuite-56-testbloat.service.d
+
 echo "DefaultMemoryPressureDurationSec=5s" >> /etc/systemd/oomd.conf
 
 systemctl start testsuite-56-testchill.service
@@ -41,10 +43,14 @@ if ! systemctl status testsuite-56-testchill.service; then exit 24; fi
 if setfattr -n user.xattr_test -v 1 /sys/fs/cgroup/; then
     sleep 120 # wait for systemd-oomd kill cool down and elevated memory pressure to come down
 
+    mkdir -p /etc/systemd/system/testsuite-56-testbloat.service.d/
+    echo "[Service]" > /etc/systemd/system/testsuite-56-testbloat.service.d/override.conf
+    echo "ManagedOOMPreference=avoid" >> /etc/systemd/system/testsuite-56-testbloat.service.d/override.conf
+
+    systemctl daemon-reload
     systemctl start testsuite-56-testchill.service
     systemctl start testsuite-56-testmunch.service
     systemctl start testsuite-56-testbloat.service
-    setfattr -n user.oomd_avoid -v 1 /sys/fs/cgroup/testsuite.slice/testsuite-56.slice/testsuite-56-workload.slice/testsuite-56-testbloat.service
 
     timeout=$(date -ud "2 minutes" +%s)
     while [[ $(date -u +%s) -le $timeout ]]; do

From 32d695eccfeef00023992cdf20bf39f9d0288c67 Mon Sep 17 00:00:00 2001
From: Anita Zhang <the.anitazha@gmail.com>
Date: Thu, 28 Jan 2021 17:35:17 -0800
Subject: [PATCH 7/7] man: document ManagedOOMPreference=

---
 man/org.freedesktop.systemd1.xml | 36 ++++++++++++++++++++++++++++++++
 man/systemd.resource-control.xml | 32 ++++++++++++++++++++++++++++
 2 files changed, 68 insertions(+)

diff --git a/man/org.freedesktop.systemd1.xml b/man/org.freedesktop.systemd1.xml
index 7543a617b78..1d419ac495e 100644
--- a/man/org.freedesktop.systemd1.xml
+++ b/man/org.freedesktop.systemd1.xml
@@ -2450,6 +2450,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
       readonly s ManagedOOMMemoryPressure = '...';
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly u ManagedOOMMemoryPressureLimitPermyriad = ...;
+      @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+      readonly s ManagedOOMPreference = '...';
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly as Environment = ['...', ...];
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
@@ -2974,6 +2976,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
 
     <!--property ManagedOOMMemoryPressureLimitPermyriad is not documented!-->
 
+    <!--property ManagedOOMPreference is not documented!-->
+
     <!--property EnvironmentFiles is not documented!-->
 
     <!--property PassEnvironment is not documented!-->
@@ -3538,6 +3542,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
 
     <variablelist class="dbus-property" generated="True" extra-ref="ManagedOOMMemoryPressureLimitPermyriad"/>
 
+    <variablelist class="dbus-property" generated="True" extra-ref="ManagedOOMPreference"/>
+
     <variablelist class="dbus-property" generated="True" extra-ref="Environment"/>
 
     <variablelist class="dbus-property" generated="True" extra-ref="EnvironmentFiles"/>
@@ -4204,6 +4210,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
       readonly s ManagedOOMMemoryPressure = '...';
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly u ManagedOOMMemoryPressureLimitPermyriad = ...;
+      @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+      readonly s ManagedOOMPreference = '...';
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly as Environment = ['...', ...];
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
@@ -4756,6 +4764,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
 
     <!--property ManagedOOMMemoryPressureLimitPermyriad is not documented!-->
 
+    <!--property ManagedOOMPreference is not documented!-->
+
     <!--property EnvironmentFiles is not documented!-->
 
     <!--property PassEnvironment is not documented!-->
@@ -5318,6 +5328,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
 
     <variablelist class="dbus-property" generated="True" extra-ref="ManagedOOMMemoryPressureLimitPermyriad"/>
 
+    <variablelist class="dbus-property" generated="True" extra-ref="ManagedOOMPreference"/>
+
     <variablelist class="dbus-property" generated="True" extra-ref="Environment"/>
 
     <variablelist class="dbus-property" generated="True" extra-ref="EnvironmentFiles"/>
@@ -5897,6 +5909,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
       readonly s ManagedOOMMemoryPressure = '...';
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly u ManagedOOMMemoryPressureLimitPermyriad = ...;
+      @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+      readonly s ManagedOOMPreference = '...';
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly as Environment = ['...', ...];
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
@@ -6377,6 +6391,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
 
     <!--property ManagedOOMMemoryPressureLimitPermyriad is not documented!-->
 
+    <!--property ManagedOOMPreference is not documented!-->
+
     <!--property EnvironmentFiles is not documented!-->
 
     <!--property PassEnvironment is not documented!-->
@@ -6857,6 +6873,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
 
     <variablelist class="dbus-property" generated="True" extra-ref="ManagedOOMMemoryPressureLimitPermyriad"/>
 
+    <variablelist class="dbus-property" generated="True" extra-ref="ManagedOOMPreference"/>
+
     <variablelist class="dbus-property" generated="True" extra-ref="Environment"/>
 
     <variablelist class="dbus-property" generated="True" extra-ref="EnvironmentFiles"/>
@@ -7557,6 +7575,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
       readonly s ManagedOOMMemoryPressure = '...';
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly u ManagedOOMMemoryPressureLimitPermyriad = ...;
+      @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+      readonly s ManagedOOMPreference = '...';
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly as Environment = ['...', ...];
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
@@ -8023,6 +8043,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
 
     <!--property ManagedOOMMemoryPressureLimitPermyriad is not documented!-->
 
+    <!--property ManagedOOMPreference is not documented!-->
+
     <!--property EnvironmentFiles is not documented!-->
 
     <!--property PassEnvironment is not documented!-->
@@ -8489,6 +8511,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
 
     <variablelist class="dbus-property" generated="True" extra-ref="ManagedOOMMemoryPressureLimitPermyriad"/>
 
+    <variablelist class="dbus-property" generated="True" extra-ref="ManagedOOMPreference"/>
+
     <variablelist class="dbus-property" generated="True" extra-ref="Environment"/>
 
     <variablelist class="dbus-property" generated="True" extra-ref="EnvironmentFiles"/>
@@ -9042,6 +9066,8 @@ node /org/freedesktop/systemd1/unit/system_2eslice {
       readonly s ManagedOOMMemoryPressure = '...';
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly u ManagedOOMMemoryPressureLimitPermyriad = ...;
+      @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+      readonly s ManagedOOMPreference = '...';
   };
   interface org.freedesktop.DBus.Peer { ... };
   interface org.freedesktop.DBus.Introspectable { ... };
@@ -9178,6 +9204,8 @@ node /org/freedesktop/systemd1/unit/system_2eslice {
 
     <!--property ManagedOOMMemoryPressureLimitPermyriad is not documented!-->
 
+    <!--property ManagedOOMPreference is not documented!-->
+
     <!--Autogenerated cross-references for systemd.directives, do not edit-->
 
     <variablelist class="dbus-interface" generated="True" extra-ref="org.freedesktop.systemd1.Unit"/>
@@ -9318,6 +9346,8 @@ node /org/freedesktop/systemd1/unit/system_2eslice {
 
     <variablelist class="dbus-property" generated="True" extra-ref="ManagedOOMMemoryPressureLimitPermyriad"/>
 
+    <variablelist class="dbus-property" generated="True" extra-ref="ManagedOOMPreference"/>
+
     <!--End of Autogenerated section-->
 
     <refsect2>
@@ -9477,6 +9507,8 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope {
       readonly s ManagedOOMMemoryPressure = '...';
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly u ManagedOOMMemoryPressureLimitPermyriad = ...;
+      @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+      readonly s ManagedOOMPreference = '...';
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly s KillMode = '...';
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
@@ -9629,6 +9661,8 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope {
 
     <!--property ManagedOOMMemoryPressureLimitPermyriad is not documented!-->
 
+    <!--property ManagedOOMPreference is not documented!-->
+
     <!--property KillMode is not documented!-->
 
     <!--property KillSignal is not documented!-->
@@ -9795,6 +9829,8 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope {
 
     <variablelist class="dbus-property" generated="True" extra-ref="ManagedOOMMemoryPressureLimitPermyriad"/>
 
+    <variablelist class="dbus-property" generated="True" extra-ref="ManagedOOMPreference"/>
+
     <variablelist class="dbus-property" generated="True" extra-ref="KillMode"/>
 
     <variablelist class="dbus-property" generated="True" extra-ref="KillSignal"/>
diff --git a/man/systemd.resource-control.xml b/man/systemd.resource-control.xml
index be9c35057db..13ff7e9a740 100644
--- a/man/systemd.resource-control.xml
+++ b/man/systemd.resource-control.xml
@@ -913,6 +913,38 @@ DeviceAllow=/dev/loop-control
           </para>
         </listitem>
       </varlistentry>
+
+      <varlistentry>
+        <term><varname>ManagedOOMPreference=none|avoid|omit</varname></term>
+
+        <listitem>
+          <para>Allows deprioritizing or omitting this unit's cgroup as a candidate when <command>systemd-oomd</command>
+          needs to act. Requires support for extended attributes (see
+          <citerefentry project='man-pages'><refentrytitle>xattr</refentrytitle><manvolnum>7</manvolnum></citerefentry>)
+          in order to use <option>avoid</option> or <option>omit</option>. Additionally, <command>systemd-oomd</command>
+          will ignore these extended attributes if the unit's cgroup is not owned by the root user and group.</para>
+
+          <para>If this property is set to <option>avoid</option>, the service manager will set the
+          "user.oomd_avoid" extended attribute on the unit's cgroup to "1". If <command>systemd-oomd</command> sees
+          this extended attribute on a cgroup set to "1" when choosing between candidates, it will only select the
+          cgroup with "user.oomd_avoid" if there are no other viable candidates.</para>
+
+          <para>If this property is set to <option>omit</option>, the service manager will set the "user.oomd_omit"
+          extended attribute on the unit's cgroup to "1". If <command>systemd-oomd</command> sees this extended
+          attribute on the cgroup set to "1", it will ignore the cgroup as a candidate and will not perform any actions
+          on the cgroup.</para>
+
+          <para>It is recommended to use <option>avoid</option> and <option>omit</option> sparingly as it can adversely
+          affect <command>systemd-oomd</command>'s kill behavior. Also note that these extended attributes are not
+          applied recursively to cgroups under this unit's cgroup.</para>
+
+          <para>Defaults to <option>none</option> which means no extended attributes will be set and
+          <command>systemd-oomd</command> will sort this unit's cgroup as defined in
+          <citerefentry><refentrytitle>systemd-oomd.service</refentrytitle><manvolnum>8</manvolnum></citerefentry>
+          and <citerefentry><refentrytitle>oomd.conf</refentrytitle><manvolnum>5</manvolnum></citerefentry> (if this
+          unit's cgroup becomes a candidate).</para>
+        </listitem>
+      </varlistentry>
     </variablelist>
   </refsect1>