|
|
99cbc7 |
From eb7ef8053311d82d43912a5cc1e82d0266bb29de Mon Sep 17 00:00:00 2001
|
|
|
99cbc7 |
Message-Id: <eb7ef8053311d82d43912a5cc1e82d0266bb29de@dist-git>
|
|
|
99cbc7 |
From: Michal Privoznik <mprivozn@redhat.com>
|
|
|
99cbc7 |
Date: Thu, 18 Apr 2019 18:58:57 +0200
|
|
|
99cbc7 |
Subject: [PATCH] qemu: Rework setting process affinity
|
|
|
99cbc7 |
MIME-Version: 1.0
|
|
|
99cbc7 |
Content-Type: text/plain; charset=UTF-8
|
|
|
99cbc7 |
Content-Transfer-Encoding: 8bit
|
|
|
99cbc7 |
|
|
|
99cbc7 |
RHEL-7.7: https://bugzilla.redhat.com/show_bug.cgi?id=1695434
|
|
|
99cbc7 |
RHEL-8.0.1: https://bugzilla.redhat.com/show_bug.cgi?id=1503284
|
|
|
99cbc7 |
|
|
|
99cbc7 |
The way we currently start qemu from CPU affinity POV is as
|
|
|
99cbc7 |
follows:
|
|
|
99cbc7 |
|
|
|
99cbc7 |
1) the child process is set affinity to all online CPUs (unless
|
|
|
99cbc7 |
some vcpu pinning was given in the domain XML)
|
|
|
99cbc7 |
|
|
|
99cbc7 |
2) Once qemu is running, cpuset cgroup is configured taking
|
|
|
99cbc7 |
memory pinning into account
|
|
|
99cbc7 |
|
|
|
99cbc7 |
Problem is that we let qemu allocate its memory just anywhere in
|
|
|
99cbc7 |
1) and then rely in 2) to be able to move the memory to
|
|
|
99cbc7 |
configured NUMA nodes. This might not be always possible (e.g.
|
|
|
99cbc7 |
qemu might lock some parts of its memory) and is very suboptimal
|
|
|
99cbc7 |
(copying large memory between NUMA nodes takes significant amount
|
|
|
99cbc7 |
of time).
|
|
|
99cbc7 |
|
|
|
99cbc7 |
The solution is to set affinity to one of (in priority order):
|
|
|
99cbc7 |
- The CPUs associated with NUMA memory affinity mask
|
|
|
99cbc7 |
- The CPUs associated with emulator pinning
|
|
|
99cbc7 |
- All online host CPUs
|
|
|
99cbc7 |
|
|
|
99cbc7 |
Later (once QEMU has allocated its memory) we then change this
|
|
|
99cbc7 |
again to (again in priority order):
|
|
|
99cbc7 |
- The CPUs associated with emulator pinning
|
|
|
99cbc7 |
- The CPUs returned by numad
|
|
|
99cbc7 |
- The CPUs associated with vCPU pinning
|
|
|
99cbc7 |
- All online host CPUs
|
|
|
99cbc7 |
|
|
|
99cbc7 |
Signed-off-by: Michal Privoznik <mprivozn@redhat.com>
|
|
|
99cbc7 |
Reviewed-by: Daniel P. Berrangé <berrange@redhat.com>
|
|
|
99cbc7 |
(cherry picked from commit f136b83139c63f20de0df3285d9e82df2fb97bfc)
|
|
|
99cbc7 |
|
|
|
99cbc7 |
I had to explicitly free bitmaps, because there is no VIR_AUTOPTR
|
|
|
99cbc7 |
just yet.
|
|
|
99cbc7 |
|
|
|
99cbc7 |
Signed-off-by: Michal Privoznik <mprivozn@redhat.com>
|
|
|
99cbc7 |
Message-Id: <a6edd347c999f999a49d1a878c74c690eb2ab619.1555606711.git.mprivozn@redhat.com>
|
|
|
99cbc7 |
Reviewed-by: Jiri Denemark <jdenemar@redhat.com>
|
|
|
99cbc7 |
---
|
|
|
99cbc7 |
src/qemu/qemu_process.c | 130 +++++++++++++++++++---------------------
|
|
|
99cbc7 |
1 file changed, 63 insertions(+), 67 deletions(-)
|
|
|
99cbc7 |
|
|
|
99cbc7 |
diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
|
|
|
99cbc7 |
index 6945d76b18..7d04a6e226 100644
|
|
|
99cbc7 |
--- a/src/qemu/qemu_process.c
|
|
|
99cbc7 |
+++ b/src/qemu/qemu_process.c
|
|
|
99cbc7 |
@@ -2335,6 +2335,21 @@ qemuProcessDetectIOThreadPIDs(virQEMUDriverPtr driver,
|
|
|
99cbc7 |
}
|
|
|
99cbc7 |
|
|
|
99cbc7 |
|
|
|
99cbc7 |
+static int
|
|
|
99cbc7 |
+qemuProcessGetAllCpuAffinity(virBitmapPtr *cpumapRet)
|
|
|
99cbc7 |
+{
|
|
|
99cbc7 |
+ *cpumapRet = NULL;
|
|
|
99cbc7 |
+
|
|
|
99cbc7 |
+ if (!virHostCPUHasBitmap())
|
|
|
99cbc7 |
+ return 0;
|
|
|
99cbc7 |
+
|
|
|
99cbc7 |
+ if (!(*cpumapRet = virHostCPUGetOnlineBitmap()))
|
|
|
99cbc7 |
+ return -1;
|
|
|
99cbc7 |
+
|
|
|
99cbc7 |
+ return 0;
|
|
|
99cbc7 |
+}
|
|
|
99cbc7 |
+
|
|
|
99cbc7 |
+
|
|
|
99cbc7 |
/*
|
|
|
99cbc7 |
* To be run between fork/exec of QEMU only
|
|
|
99cbc7 |
*/
|
|
|
99cbc7 |
@@ -2342,9 +2357,9 @@ static int
|
|
|
99cbc7 |
qemuProcessInitCpuAffinity(virDomainObjPtr vm)
|
|
|
99cbc7 |
{
|
|
|
99cbc7 |
int ret = -1;
|
|
|
99cbc7 |
- virBitmapPtr cpumap = NULL;
|
|
|
99cbc7 |
virBitmapPtr cpumapToSet = NULL;
|
|
|
99cbc7 |
virBitmapPtr hostcpumap = NULL;
|
|
|
99cbc7 |
+ virDomainNumatuneMemMode mem_mode;
|
|
|
99cbc7 |
qemuDomainObjPrivatePtr priv = vm->privateData;
|
|
|
99cbc7 |
|
|
|
99cbc7 |
if (!vm->pid) {
|
|
|
99cbc7 |
@@ -2353,58 +2368,39 @@ qemuProcessInitCpuAffinity(virDomainObjPtr vm)
|
|
|
99cbc7 |
return -1;
|
|
|
99cbc7 |
}
|
|
|
99cbc7 |
|
|
|
99cbc7 |
- if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) {
|
|
|
99cbc7 |
- VIR_DEBUG("Set CPU affinity with advisory nodeset from numad");
|
|
|
99cbc7 |
- cpumapToSet = priv->autoCpuset;
|
|
|
99cbc7 |
+ /* Here is the deal, we can't set cpuset.mems before qemu is
|
|
|
99cbc7 |
+ * started as it clashes with KVM allocation. Therefore, we
|
|
|
99cbc7 |
+ * used to let qemu allocate its memory anywhere as we would
|
|
|
99cbc7 |
+ * then move the memory to desired NUMA node via CGroups.
|
|
|
99cbc7 |
+ * However, that might not be always possible because qemu
|
|
|
99cbc7 |
+ * might lock some parts of its memory (e.g. due to VFIO).
|
|
|
99cbc7 |
+ * Even if it possible, memory has to be copied between NUMA
|
|
|
99cbc7 |
+ * nodes which is suboptimal.
|
|
|
99cbc7 |
+ * Solution is to set affinity that matches the best what we
|
|
|
99cbc7 |
+ * would have set in CGroups and then fix it later, once qemu
|
|
|
99cbc7 |
+ * is already running. */
|
|
|
99cbc7 |
+ if (virDomainNumaGetNodeCount(vm->def->numa) <= 1 &&
|
|
|
99cbc7 |
+ virDomainNumatuneGetMode(vm->def->numa, -1, &mem_mode) == 0 &&
|
|
|
99cbc7 |
+ mem_mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT) {
|
|
|
99cbc7 |
+ if (virDomainNumatuneMaybeGetNodeset(vm->def->numa,
|
|
|
99cbc7 |
+ priv->autoNodeset,
|
|
|
99cbc7 |
+ &cpumapToSet,
|
|
|
99cbc7 |
+ -1) < 0)
|
|
|
99cbc7 |
+ goto cleanup;
|
|
|
99cbc7 |
+ } else if (vm->def->cputune.emulatorpin) {
|
|
|
99cbc7 |
+ cpumapToSet = vm->def->cputune.emulatorpin;
|
|
|
99cbc7 |
} else {
|
|
|
99cbc7 |
- VIR_DEBUG("Set CPU affinity with specified cpuset");
|
|
|
99cbc7 |
- if (vm->def->cpumask) {
|
|
|
99cbc7 |
- cpumapToSet = vm->def->cpumask;
|
|
|
99cbc7 |
- } else {
|
|
|
99cbc7 |
- /* You may think this is redundant, but we can't assume libvirtd
|
|
|
99cbc7 |
- * itself is running on all pCPUs, so we need to explicitly set
|
|
|
99cbc7 |
- * the spawned QEMU instance to all pCPUs if no map is given in
|
|
|
99cbc7 |
- * its config file */
|
|
|
99cbc7 |
- int hostcpus;
|
|
|
99cbc7 |
-
|
|
|
99cbc7 |
- if (virHostCPUHasBitmap()) {
|
|
|
99cbc7 |
- hostcpumap = virHostCPUGetOnlineBitmap();
|
|
|
99cbc7 |
- cpumap = virProcessGetAffinity(vm->pid);
|
|
|
99cbc7 |
- }
|
|
|
99cbc7 |
-
|
|
|
99cbc7 |
- if (hostcpumap && cpumap && virBitmapEqual(hostcpumap, cpumap)) {
|
|
|
99cbc7 |
- /* we're using all available CPUs, no reason to set
|
|
|
99cbc7 |
- * mask. If libvirtd is running without explicit
|
|
|
99cbc7 |
- * affinity, we can use hotplugged CPUs for this VM */
|
|
|
99cbc7 |
- ret = 0;
|
|
|
99cbc7 |
- goto cleanup;
|
|
|
99cbc7 |
- } else {
|
|
|
99cbc7 |
- /* setaffinity fails if you set bits for CPUs which
|
|
|
99cbc7 |
- * aren't present, so we have to limit ourselves */
|
|
|
99cbc7 |
- if ((hostcpus = virHostCPUGetCount()) < 0)
|
|
|
99cbc7 |
- goto cleanup;
|
|
|
99cbc7 |
-
|
|
|
99cbc7 |
- if (hostcpus > QEMUD_CPUMASK_LEN)
|
|
|
99cbc7 |
- hostcpus = QEMUD_CPUMASK_LEN;
|
|
|
99cbc7 |
-
|
|
|
99cbc7 |
- virBitmapFree(cpumap);
|
|
|
99cbc7 |
- if (!(cpumap = virBitmapNew(hostcpus)))
|
|
|
99cbc7 |
- goto cleanup;
|
|
|
99cbc7 |
-
|
|
|
99cbc7 |
- virBitmapSetAll(cpumap);
|
|
|
99cbc7 |
-
|
|
|
99cbc7 |
- cpumapToSet = cpumap;
|
|
|
99cbc7 |
- }
|
|
|
99cbc7 |
- }
|
|
|
99cbc7 |
+ if (qemuProcessGetAllCpuAffinity(&hostcpumap) < 0)
|
|
|
99cbc7 |
+ goto cleanup;
|
|
|
99cbc7 |
+ cpumapToSet = hostcpumap;
|
|
|
99cbc7 |
}
|
|
|
99cbc7 |
|
|
|
99cbc7 |
- if (virProcessSetAffinity(vm->pid, cpumapToSet) < 0)
|
|
|
99cbc7 |
+ if (cpumapToSet &&
|
|
|
99cbc7 |
+ virProcessSetAffinity(vm->pid, cpumapToSet) < 0)
|
|
|
99cbc7 |
goto cleanup;
|
|
|
99cbc7 |
|
|
|
99cbc7 |
ret = 0;
|
|
|
99cbc7 |
-
|
|
|
99cbc7 |
cleanup:
|
|
|
99cbc7 |
- virBitmapFree(cpumap);
|
|
|
99cbc7 |
virBitmapFree(hostcpumap);
|
|
|
99cbc7 |
return ret;
|
|
|
99cbc7 |
}
|
|
|
99cbc7 |
@@ -2478,7 +2474,8 @@ qemuProcessSetupPid(virDomainObjPtr vm,
|
|
|
99cbc7 |
qemuDomainObjPrivatePtr priv = vm->privateData;
|
|
|
99cbc7 |
virDomainNumatuneMemMode mem_mode;
|
|
|
99cbc7 |
virCgroupPtr cgroup = NULL;
|
|
|
99cbc7 |
- virBitmapPtr use_cpumask;
|
|
|
99cbc7 |
+ virBitmapPtr use_cpumask = NULL;
|
|
|
99cbc7 |
+ virBitmapPtr hostcpumap = NULL;
|
|
|
99cbc7 |
char *mem_mask = NULL;
|
|
|
99cbc7 |
int ret = -1;
|
|
|
99cbc7 |
|
|
|
99cbc7 |
@@ -2490,12 +2487,21 @@ qemuProcessSetupPid(virDomainObjPtr vm,
|
|
|
99cbc7 |
}
|
|
|
99cbc7 |
|
|
|
99cbc7 |
/* Infer which cpumask shall be used. */
|
|
|
99cbc7 |
- if (cpumask)
|
|
|
99cbc7 |
+ if (cpumask) {
|
|
|
99cbc7 |
use_cpumask = cpumask;
|
|
|
99cbc7 |
- else if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO)
|
|
|
99cbc7 |
+ } else if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) {
|
|
|
99cbc7 |
use_cpumask = priv->autoCpuset;
|
|
|
99cbc7 |
- else
|
|
|
99cbc7 |
+ } else if (vm->def->cpumask) {
|
|
|
99cbc7 |
use_cpumask = vm->def->cpumask;
|
|
|
99cbc7 |
+ } else {
|
|
|
99cbc7 |
+ /* You may think this is redundant, but we can't assume libvirtd
|
|
|
99cbc7 |
+ * itself is running on all pCPUs, so we need to explicitly set
|
|
|
99cbc7 |
+ * the spawned QEMU instance to all pCPUs if no map is given in
|
|
|
99cbc7 |
+ * its config file */
|
|
|
99cbc7 |
+ if (qemuProcessGetAllCpuAffinity(&hostcpumap) < 0)
|
|
|
99cbc7 |
+ goto cleanup;
|
|
|
99cbc7 |
+ use_cpumask = hostcpumap;
|
|
|
99cbc7 |
+ }
|
|
|
99cbc7 |
|
|
|
99cbc7 |
/*
|
|
|
99cbc7 |
* If CPU cgroup controller is not initialized here, then we need
|
|
|
99cbc7 |
@@ -2520,13 +2526,7 @@ qemuProcessSetupPid(virDomainObjPtr vm,
|
|
|
99cbc7 |
qemuSetupCgroupCpusetCpus(cgroup, use_cpumask) < 0)
|
|
|
99cbc7 |
goto cleanup;
|
|
|
99cbc7 |
|
|
|
99cbc7 |
- /*
|
|
|
99cbc7 |
- * Don't setup cpuset.mems for the emulator, they need to
|
|
|
99cbc7 |
- * be set up after initialization in order for kvm
|
|
|
99cbc7 |
- * allocations to succeed.
|
|
|
99cbc7 |
- */
|
|
|
99cbc7 |
- if (nameval != VIR_CGROUP_THREAD_EMULATOR &&
|
|
|
99cbc7 |
- mem_mask && virCgroupSetCpusetMems(cgroup, mem_mask) < 0)
|
|
|
99cbc7 |
+ if (mem_mask && virCgroupSetCpusetMems(cgroup, mem_mask) < 0)
|
|
|
99cbc7 |
goto cleanup;
|
|
|
99cbc7 |
|
|
|
99cbc7 |
}
|
|
|
99cbc7 |
@@ -2553,6 +2553,7 @@ qemuProcessSetupPid(virDomainObjPtr vm,
|
|
|
99cbc7 |
ret = 0;
|
|
|
99cbc7 |
cleanup:
|
|
|
99cbc7 |
VIR_FREE(mem_mask);
|
|
|
99cbc7 |
+ virBitmapFree(hostcpumap);
|
|
|
99cbc7 |
if (cgroup) {
|
|
|
99cbc7 |
if (ret < 0)
|
|
|
99cbc7 |
virCgroupRemove(cgroup);
|
|
|
99cbc7 |
@@ -6428,12 +6429,7 @@ qemuProcessLaunch(virConnectPtr conn,
|
|
|
99cbc7 |
|
|
|
99cbc7 |
/* This must be done after cgroup placement to avoid resetting CPU
|
|
|
99cbc7 |
* affinity */
|
|
|
99cbc7 |
- if (!vm->def->cputune.emulatorpin &&
|
|
|
99cbc7 |
- qemuProcessInitCpuAffinity(vm) < 0)
|
|
|
99cbc7 |
- goto cleanup;
|
|
|
99cbc7 |
-
|
|
|
99cbc7 |
- VIR_DEBUG("Setting emulator tuning/settings");
|
|
|
99cbc7 |
- if (qemuProcessSetupEmulator(vm) < 0)
|
|
|
99cbc7 |
+ if (qemuProcessInitCpuAffinity(vm) < 0)
|
|
|
99cbc7 |
goto cleanup;
|
|
|
99cbc7 |
|
|
|
99cbc7 |
VIR_DEBUG("Setting cgroup for external devices (if required)");
|
|
|
99cbc7 |
@@ -6502,10 +6498,6 @@ qemuProcessLaunch(virConnectPtr conn,
|
|
|
99cbc7 |
if (qemuProcessUpdateAndVerifyCPU(driver, vm, asyncJob) < 0)
|
|
|
99cbc7 |
goto cleanup;
|
|
|
99cbc7 |
|
|
|
99cbc7 |
- VIR_DEBUG("Setting up post-init cgroup restrictions");
|
|
|
99cbc7 |
- if (qemuSetupCpusetMems(vm) < 0)
|
|
|
99cbc7 |
- goto cleanup;
|
|
|
99cbc7 |
-
|
|
|
99cbc7 |
VIR_DEBUG("setting up hotpluggable cpus");
|
|
|
99cbc7 |
if (qemuDomainHasHotpluggableStartupVcpus(vm->def)) {
|
|
|
99cbc7 |
if (qemuDomainRefreshVcpuInfo(driver, vm, asyncJob, false) < 0)
|
|
|
99cbc7 |
@@ -6531,6 +6523,10 @@ qemuProcessLaunch(virConnectPtr conn,
|
|
|
99cbc7 |
if (qemuProcessDetectIOThreadPIDs(driver, vm, asyncJob) < 0)
|
|
|
99cbc7 |
goto cleanup;
|
|
|
99cbc7 |
|
|
|
99cbc7 |
+ VIR_DEBUG("Setting emulator tuning/settings");
|
|
|
99cbc7 |
+ if (qemuProcessSetupEmulator(vm) < 0)
|
|
|
99cbc7 |
+ goto cleanup;
|
|
|
99cbc7 |
+
|
|
|
99cbc7 |
VIR_DEBUG("Setting global CPU cgroup (if required)");
|
|
|
99cbc7 |
if (qemuSetupGlobalCpuCgroup(vm) < 0)
|
|
|
99cbc7 |
goto cleanup;
|
|
|
99cbc7 |
--
|
|
|
99cbc7 |
2.21.0
|
|
|
99cbc7 |
|