From eb7ef8053311d82d43912a5cc1e82d0266bb29de Mon Sep 17 00:00:00 2001
Message-Id: <eb7ef8053311d82d43912a5cc1e82d0266bb29de@dist-git>
From: Michal Privoznik <mprivozn@redhat.com>
Date: Thu, 18 Apr 2019 18:58:57 +0200
Subject: [PATCH] qemu: Rework setting process affinity
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

RHEL-7.7: https://bugzilla.redhat.com/show_bug.cgi?id=1695434
RHEL-8.0.1: https://bugzilla.redhat.com/show_bug.cgi?id=1503284

The way we currently start qemu, from the CPU affinity point of view,
is as follows:

  1) the child process has its affinity set to all online CPUs
  (unless some vcpu pinning was given in the domain XML)

  2) once qemu is running, the cpuset cgroup is configured, taking
  memory pinning into account

The problem is that we let qemu allocate its memory just anywhere in
1) and then rely on 2) to be able to move the memory to the
configured NUMA nodes. This might not always be possible (e.g. qemu
might lock some parts of its memory) and is very suboptimal (copying
large amounts of memory between NUMA nodes takes a significant amount
of time).

The solution is to set affinity to one of the following (in priority
order; see the sketch after the list):
  - The CPUs associated with the NUMA memory affinity mask
  - The CPUs associated with emulator pinning
  - All online host CPUs
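
As a rough illustration (not part of the patch), the pre-exec cascade
implemented by qemuProcessInitCpuAffinity() in the diff below boils
down to the following, with an opaque stand-in type instead of
libvirt's virBitmap:

    /* Illustration only; cpumask stands in for virBitmapPtr. */
    typedef struct cpumask cpumask;

    static const cpumask *
    pick_initial_affinity(const cpumask *strict_nodeset_cpus, /* strict <numatune> nodeset */
                          const cpumask *emulatorpin,         /* <emulatorpin cpuset='...'/> */
                          const cpumask *all_online_cpus)     /* virHostCPUGetOnlineBitmap() */
    {
        if (strict_nodeset_cpus)   /* CPUs of the NUMA memory affinity mask */
            return strict_nodeset_cpus;
        if (emulatorpin)           /* explicit emulator pinning from the XML */
            return emulatorpin;
        return all_online_cpus;    /* fall back to all online host CPUs */
    }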

Later (once QEMU has allocated its memory) we change this to one of
(again in priority order; also sketched below):
  - The CPUs associated with emulator pinning
  - The CPUs returned by numad
  - The CPUs associated with vCPU pinning
  - All online host CPUs
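
The post-start cascade applied by qemuProcessSetupPid() below, in the
same illustrative style (reusing the cpumask stand-in; the real code
consults vm->def->cputune.emulatorpin, priv->autoCpuset and
vm->def->cpumask):

    static const cpumask *
    pick_final_affinity(const cpumask *emulatorpin,     /* <emulatorpin> */
                        const cpumask *numad_cpuset,    /* priv->autoCpuset */
                        const cpumask *vcpu_cpumask,    /* vm->def->cpumask */
                        const cpumask *all_online_cpus)
    {
        if (emulatorpin)           /* emulator pinning wins */
            return emulatorpin;
        if (numad_cpuset)          /* advisory cpuset from numad */
            return numad_cpuset;
        if (vcpu_cpumask)          /* vCPU pinning */
            return vcpu_cpumask;
        return all_online_cpus;    /* all online host CPUs */
    }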

Signed-off-by: Michal Privoznik <mprivozn@redhat.com>
Reviewed-by: Daniel P. Berrangé <berrange@redhat.com>
(cherry picked from commit f136b83139c63f20de0df3285d9e82df2fb97bfc)

I had to explicitly free bitmaps, because there is no VIR_AUTOPTR
just yet.
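
For reference, VIR_AUTOPTR is built on the compiler's cleanup
attribute; a minimal self-contained illustration of that mechanism
(not libvirt code) is:

    #include <stdlib.h>

    /* Run a destructor when the variable leaves scope; this is what
     * VIR_AUTOPTR(virBitmap) arranges for virBitmapFree(). */
    static void autofree_str(char **p) { free(*p); }
    #define AUTOFREE_STR __attribute__((cleanup(autofree_str))) char *

    static int example(void)
    {
        AUTOFREE_STR buf = malloc(16);  /* freed on every return path */
        return buf ? 0 : -1;
    }

With it, the explicit virBitmapFree() calls in the cleanup: paths of
this backport would not be needed.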

Signed-off-by: Michal Privoznik <mprivozn@redhat.com>
Message-Id: <a6edd347c999f999a49d1a878c74c690eb2ab619.1555606711.git.mprivozn@redhat.com>
Reviewed-by: Jiri Denemark <jdenemar@redhat.com>
---
 src/qemu/qemu_process.c | 130 +++++++++++++++++++---------------------
 1 file changed, 63 insertions(+), 67 deletions(-)

diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
index 6945d76b18..7d04a6e226 100644
--- a/src/qemu/qemu_process.c
+++ b/src/qemu/qemu_process.c
@@ -2335,6 +2335,21 @@ qemuProcessDetectIOThreadPIDs(virQEMUDriverPtr driver,
 }
 
 
+static int
+qemuProcessGetAllCpuAffinity(virBitmapPtr *cpumapRet)
+{
+    *cpumapRet = NULL;
+
+    if (!virHostCPUHasBitmap())
+        return 0;
+
+    if (!(*cpumapRet = virHostCPUGetOnlineBitmap()))
+        return -1;
+
+    return 0;
+}
+
+
 /*
  * To be run between fork/exec of QEMU only
  */
@@ -2342,9 +2357,9 @@ static int
 qemuProcessInitCpuAffinity(virDomainObjPtr vm)
 {
     int ret = -1;
-    virBitmapPtr cpumap = NULL;
     virBitmapPtr cpumapToSet = NULL;
     virBitmapPtr hostcpumap = NULL;
+    virDomainNumatuneMemMode mem_mode;
     qemuDomainObjPrivatePtr priv = vm->privateData;
 
     if (!vm->pid) {
@@ -2353,58 +2368,39 @@ qemuProcessInitCpuAffinity(virDomainObjPtr vm)
         return -1;
     }
 
-    if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) {
-        VIR_DEBUG("Set CPU affinity with advisory nodeset from numad");
-        cpumapToSet = priv->autoCpuset;
+    /* Here is the deal, we can't set cpuset.mems before qemu is
+     * started as it clashes with KVM allocation. Therefore, we
+     * used to let qemu allocate its memory anywhere as we would
+     * then move the memory to desired NUMA node via CGroups.
+     * However, that might not be always possible because qemu
+     * might lock some parts of its memory (e.g. due to VFIO).
+     * Even if it possible, memory has to be copied between NUMA
+     * nodes which is suboptimal.
+     * Solution is to set affinity that matches the best what we
+     * would have set in CGroups and then fix it later, once qemu
+     * is already running. */
+    if (virDomainNumaGetNodeCount(vm->def->numa) <= 1 &&
+        virDomainNumatuneGetMode(vm->def->numa, -1, &mem_mode) == 0 &&
+        mem_mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT) {
+        if (virDomainNumatuneMaybeGetNodeset(vm->def->numa,
+                                             priv->autoNodeset,
+                                             &cpumapToSet,
+                                             -1) < 0)
+            goto cleanup;
+    } else if (vm->def->cputune.emulatorpin) {
+        cpumapToSet = vm->def->cputune.emulatorpin;
     } else {
-        VIR_DEBUG("Set CPU affinity with specified cpuset");
-        if (vm->def->cpumask) {
-            cpumapToSet = vm->def->cpumask;
-        } else {
-            /* You may think this is redundant, but we can't assume libvirtd
-             * itself is running on all pCPUs, so we need to explicitly set
-             * the spawned QEMU instance to all pCPUs if no map is given in
-             * its config file */
-            int hostcpus;
-
-            if (virHostCPUHasBitmap()) {
-                hostcpumap = virHostCPUGetOnlineBitmap();
-                cpumap = virProcessGetAffinity(vm->pid);
-            }
-
-            if (hostcpumap && cpumap && virBitmapEqual(hostcpumap, cpumap)) {
-                /* we're using all available CPUs, no reason to set
-                 * mask. If libvirtd is running without explicit
-                 * affinity, we can use hotplugged CPUs for this VM */
-                ret = 0;
-                goto cleanup;
-            } else {
-                /* setaffinity fails if you set bits for CPUs which
-                 * aren't present, so we have to limit ourselves */
-                if ((hostcpus = virHostCPUGetCount()) < 0)
-                    goto cleanup;
-
-                if (hostcpus > QEMUD_CPUMASK_LEN)
-                    hostcpus = QEMUD_CPUMASK_LEN;
-
-                virBitmapFree(cpumap);
-                if (!(cpumap = virBitmapNew(hostcpus)))
-                    goto cleanup;
-
-                virBitmapSetAll(cpumap);
-
-                cpumapToSet = cpumap;
-            }
-        }
+        if (qemuProcessGetAllCpuAffinity(&hostcpumap) < 0)
+            goto cleanup;
+        cpumapToSet = hostcpumap;
     }
 
-    if (virProcessSetAffinity(vm->pid, cpumapToSet) < 0)
+    if (cpumapToSet &&
+        virProcessSetAffinity(vm->pid, cpumapToSet) < 0)
        goto cleanup;
 
     ret = 0;
-
  cleanup:
-    virBitmapFree(cpumap);
     virBitmapFree(hostcpumap);
     return ret;
 }
@@ -2478,7 +2474,8 @@ qemuProcessSetupPid(virDomainObjPtr vm,
     qemuDomainObjPrivatePtr priv = vm->privateData;
     virDomainNumatuneMemMode mem_mode;
     virCgroupPtr cgroup = NULL;
-    virBitmapPtr use_cpumask;
+    virBitmapPtr use_cpumask = NULL;
+    virBitmapPtr hostcpumap = NULL;
     char *mem_mask = NULL;
     int ret = -1;
 
@@ -2490,12 +2487,21 @@ qemuProcessSetupPid(virDomainObjPtr vm,
     }
 
     /* Infer which cpumask shall be used. */
-    if (cpumask)
+    if (cpumask) {
         use_cpumask = cpumask;
-    else if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO)
+    } else if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) {
         use_cpumask = priv->autoCpuset;
-    else
+    } else if (vm->def->cpumask) {
         use_cpumask = vm->def->cpumask;
+    } else {
+        /* You may think this is redundant, but we can't assume libvirtd
+         * itself is running on all pCPUs, so we need to explicitly set
+         * the spawned QEMU instance to all pCPUs if no map is given in
+         * its config file */
+        if (qemuProcessGetAllCpuAffinity(&hostcpumap) < 0)
+            goto cleanup;
+        use_cpumask = hostcpumap;
+    }
 
     /*
      * If CPU cgroup controller is not initialized here, then we need
@@ -2520,13 +2526,7 @@ qemuProcessSetupPid(virDomainObjPtr vm,
                 qemuSetupCgroupCpusetCpus(cgroup, use_cpumask) < 0)
                 goto cleanup;
 
-            /*
-             * Don't setup cpuset.mems for the emulator, they need to
-             * be set up after initialization in order for kvm
-             * allocations to succeed.
-             */
-            if (nameval != VIR_CGROUP_THREAD_EMULATOR &&
-                mem_mask && virCgroupSetCpusetMems(cgroup, mem_mask) < 0)
+            if (mem_mask && virCgroupSetCpusetMems(cgroup, mem_mask) < 0)
                 goto cleanup;
 
         }
@@ -2553,6 +2553,7 @@ qemuProcessSetupPid(virDomainObjPtr vm,
     ret = 0;
  cleanup:
     VIR_FREE(mem_mask);
+    virBitmapFree(hostcpumap);
     if (cgroup) {
         if (ret < 0)
             virCgroupRemove(cgroup);
@@ -6428,12 +6429,7 @@ qemuProcessLaunch(virConnectPtr conn,
 
     /* This must be done after cgroup placement to avoid resetting CPU
      * affinity */
-    if (!vm->def->cputune.emulatorpin &&
-        qemuProcessInitCpuAffinity(vm) < 0)
-        goto cleanup;
-
-    VIR_DEBUG("Setting emulator tuning/settings");
-    if (qemuProcessSetupEmulator(vm) < 0)
+    if (qemuProcessInitCpuAffinity(vm) < 0)
         goto cleanup;
 
     VIR_DEBUG("Setting cgroup for external devices (if required)");
@@ -6502,10 +6498,6 @@ qemuProcessLaunch(virConnectPtr conn,
     if (qemuProcessUpdateAndVerifyCPU(driver, vm, asyncJob) < 0)
         goto cleanup;
 
-    VIR_DEBUG("Setting up post-init cgroup restrictions");
-    if (qemuSetupCpusetMems(vm) < 0)
-        goto cleanup;
-
     VIR_DEBUG("setting up hotpluggable cpus");
     if (qemuDomainHasHotpluggableStartupVcpus(vm->def)) {
         if (qemuDomainRefreshVcpuInfo(driver, vm, asyncJob, false) < 0)
@@ -6531,6 +6523,10 @@ qemuProcessLaunch(virConnectPtr conn,
     if (qemuProcessDetectIOThreadPIDs(driver, vm, asyncJob) < 0)
         goto cleanup;
 
+    VIR_DEBUG("Setting emulator tuning/settings");
+    if (qemuProcessSetupEmulator(vm) < 0)
+        goto cleanup;
+
     VIR_DEBUG("Setting global CPU cgroup (if required)");
     if (qemuSetupGlobalCpuCgroup(vm) < 0)
         goto cleanup;
-- 
2.21.0