|
|
7a3408 |
From f5b7d2d701f347b9b4e13d008de476e37c82c8cb Mon Sep 17 00:00:00 2001
|
|
|
7a3408 |
Message-Id: <f5b7d2d701f347b9b4e13d008de476e37c82c8cb@dist-git>
|
|
|
7a3408 |
From: Shivaprasad G Bhat <sbhat@linux.vnet.ibm.com>
|
|
|
7a3408 |
Date: Wed, 5 Aug 2015 18:18:35 +0200
|
|
|
7a3408 |
Subject: [PATCH] nodeinfo: Fix output on PPC64 KVM hosts
|
|
|
7a3408 |
|
|
|
7a3408 |
The nodeinfo is reporting incorrect number of cpus and incorrect host
|
|
|
7a3408 |
topology on PPC64 KVM hosts. The KVM hypervisor on PPC64 needs only
|
|
|
7a3408 |
the primary thread in a core to be online, and the secondaries offlined.
|
|
|
7a3408 |
While scheduling a guest in, the kvm scheduler wakes up the secondaries to
|
|
|
7a3408 |
run in guest context.
|
|
|
7a3408 |
|
|
|
7a3408 |
The host scheduling of the guests happens at the core level (as only primary
|
|
|
7a3408 |
thread is online). The kvm scheduler exploits as many threads of the core
|
|
|
7a3408 |
as needed by guest. Further, starting POWER8, the processor allows splitting
|
|
|
7a3408 |
a physical core into multiple subcores with 2 or 4 threads each. Again, only
|
|
|
7a3408 |
the primary thread in a subcore is online in the host. The KVM-PPC
|
|
|
7a3408 |
scheduler allows guests to exploit all the offline threads in the subcore,
|
|
|
7a3408 |
by bringing them online when needed.
|
|
|
7a3408 |
(Kernel patches on split-core http://www.spinics.net/lists/kvm-ppc/msg09121.html)
|
|
|
7a3408 |
|
|
|
7a3408 |
Recent dynamic micro-threading changes in ppc-kvm make sure
|
|
|
7a3408 |
to utilize all the offline cpus across guests, and across guests with
|
|
|
7a3408 |
different cpu topologies.
|
|
|
7a3408 |
(https://www.mail-archive.com/kvm@vger.kernel.org/msg115978.html)
|
|
|
7a3408 |
|
|
|
7a3408 |
Since the offline cpus are brought online in the guest context, it is safe
|
|
|
7a3408 |
to count them as online. Nodeinfo today discounts these offline cpus from
|
|
|
7a3408 |
cpu count/topology calculation, and the nodeinfo output is not of any help
|
|
|
7a3408 |
and the host appears overcommitted when it is actually not.
|
|
|
7a3408 |
|
|
|
7a3408 |
The patch carefully counts those offline threads whose primary threads are
|
|
|
7a3408 |
online. The host topology displayed by the nodeinfo is also fixed when the
|
|
|
7a3408 |
host is in valid kvm state.
|
|
|
7a3408 |
|
|
|
7a3408 |
Signed-off-by: Shivaprasad G Bhat <sbhat@linux.vnet.ibm.com>
|
|
|
7a3408 |
Signed-off-by: Andrea Bolognani <abologna@redhat.com>
|
|
|
7a3408 |
(cherry picked from commit 014208c4d028d2a632cdfe89d361fca8811899b6)
|
|
|
7a3408 |
|
|
|
7a3408 |
Bug: https://bugzilla.redhat.com/show_bug.cgi?id=1213713
|
|
|
7a3408 |
|
|
|
7a3408 |
Signed-off-by: Andrea Bolognani <abologna@redhat.com>
|
|
|
7a3408 |
Signed-off-by: Jiri Denemark <jdenemar@redhat.com>
|
|
|
7a3408 |
---
|
|
|
7a3408 |
src/libvirt_private.syms | 1 +
|
|
|
7a3408 |
src/nodeinfo.c | 153 +++++++++++++++++++++++++++++++++++++++++++++--
|
|
|
7a3408 |
src/nodeinfo.h | 1 +
|
|
|
7a3408 |
3 files changed, 151 insertions(+), 4 deletions(-)
|
|
|
7a3408 |
|
|
|
7a3408 |
diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms
|
|
|
7a3408 |
index 1363c92..a6b523a 100644
|
|
|
7a3408 |
--- a/src/libvirt_private.syms
|
|
|
7a3408 |
+++ b/src/libvirt_private.syms
|
|
|
7a3408 |
@@ -1008,6 +1008,7 @@ nodeGetMemoryParameters;
|
|
|
7a3408 |
nodeGetMemoryStats;
|
|
|
7a3408 |
nodeGetOnlineCPUBitmap;
|
|
|
7a3408 |
nodeGetPresentCPUBitmap;
|
|
|
7a3408 |
+nodeGetThreadsPerSubcore;
|
|
|
7a3408 |
nodeSetMemoryParameters;
|
|
|
7a3408 |
|
|
|
7a3408 |
|
|
|
7a3408 |
diff --git a/src/nodeinfo.c b/src/nodeinfo.c
|
|
|
7a3408 |
index 8b56376..ec78f65 100644
|
|
|
7a3408 |
--- a/src/nodeinfo.c
|
|
|
7a3408 |
+++ b/src/nodeinfo.c
|
|
|
7a3408 |
@@ -31,6 +31,12 @@
|
|
|
7a3408 |
#include <dirent.h>
|
|
|
7a3408 |
#include <sys/utsname.h>
|
|
|
7a3408 |
#include "conf/domain_conf.h"
|
|
|
7a3408 |
+#include <fcntl.h>
|
|
|
7a3408 |
+#include <sys/ioctl.h>
|
|
|
7a3408 |
+
|
|
|
7a3408 |
+#if HAVE_LINUX_KVM_H
|
|
|
7a3408 |
+# include <linux/kvm.h>
|
|
|
7a3408 |
+#endif
|
|
|
7a3408 |
|
|
|
7a3408 |
#if defined(__FreeBSD__) || defined(__APPLE__)
|
|
|
7a3408 |
# include <sys/time.h>
|
|
|
7a3408 |
@@ -391,13 +397,14 @@ virNodeParseSocket(const char *dir,
|
|
|
7a3408 |
* filling arguments */
|
|
|
7a3408 |
static int
|
|
|
7a3408 |
ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(3)
|
|
|
7a3408 |
-ATTRIBUTE_NONNULL(4) ATTRIBUTE_NONNULL(5)
|
|
|
7a3408 |
-ATTRIBUTE_NONNULL(6) ATTRIBUTE_NONNULL(7)
|
|
|
7a3408 |
-ATTRIBUTE_NONNULL(8)
|
|
|
7a3408 |
+ATTRIBUTE_NONNULL(4) ATTRIBUTE_NONNULL(6)
|
|
|
7a3408 |
+ATTRIBUTE_NONNULL(7) ATTRIBUTE_NONNULL(8)
|
|
|
7a3408 |
+ATTRIBUTE_NONNULL(9)
|
|
|
7a3408 |
virNodeParseNode(const char *node,
|
|
|
7a3408 |
virArch arch,
|
|
|
7a3408 |
virBitmapPtr present_cpus_map,
|
|
|
7a3408 |
virBitmapPtr online_cpus_map,
|
|
|
7a3408 |
+ int threads_per_subcore,
|
|
|
7a3408 |
int *sockets,
|
|
|
7a3408 |
int *cores,
|
|
|
7a3408 |
int *threads,
|
|
|
7a3408 |
@@ -491,7 +498,18 @@ virNodeParseNode(const char *node,
|
|
|
7a3408 |
continue;
|
|
|
7a3408 |
|
|
|
7a3408 |
if (!virBitmapIsBitSet(online_cpus_map, cpu)) {
|
|
|
7a3408 |
- (*offline)++;
|
|
|
7a3408 |
+ if (threads_per_subcore > 0 &&
|
|
|
7a3408 |
+ cpu % threads_per_subcore != 0 &&
|
|
|
7a3408 |
+ virBitmapIsBitSet(online_cpus_map,
|
|
|
7a3408 |
+ cpu - (cpu % threads_per_subcore))) {
|
|
|
7a3408 |
+ /* Secondary offline threads are counted as online when
|
|
|
7a3408 |
+ * subcores are in use and the corresponding primary
|
|
|
7a3408 |
+ * thread is online */
|
|
|
7a3408 |
+ processors++;
|
|
|
7a3408 |
+ } else {
|
|
|
7a3408 |
+ /* But they are counted as offline otherwise */
|
|
|
7a3408 |
+ (*offline)++;
|
|
|
7a3408 |
+ }
|
|
|
7a3408 |
continue;
|
|
|
7a3408 |
}
|
|
|
7a3408 |
|
|
|
7a3408 |
@@ -544,6 +562,12 @@ virNodeParseNode(const char *node,
|
|
|
7a3408 |
*cores = core;
|
|
|
7a3408 |
}
|
|
|
7a3408 |
|
|
|
7a3408 |
+ if (threads_per_subcore > 0) {
|
|
|
7a3408 |
+ /* The thread count ignores offline threads, which means that only
|
|
|
7a3408 |
+ * primary threads have been considered so far. If subcores
|
|
|
7a3408 |
+ * are in use, we need to also account for secondary threads */
|
|
|
7a3408 |
+ *threads *= threads_per_subcore;
|
|
|
7a3408 |
+ }
|
|
|
7a3408 |
ret = processors;
|
|
|
7a3408 |
|
|
|
7a3408 |
cleanup:
|
|
|
7a3408 |
@@ -562,6 +586,41 @@ virNodeParseNode(const char *node,
|
|
|
7a3408 |
return ret;
|
|
|
7a3408 |
}
|
|
|
7a3408 |
|
|
|
7a3408 |
+/* Check whether the host subcore configuration is valid.
|
|
|
7a3408 |
+ *
|
|
|
7a3408 |
+ * A valid configuration is one where no secondary thread is online;
|
|
|
7a3408 |
+ * the primary thread in a subcore is always the first one */
|
|
|
7a3408 |
+static bool
|
|
|
7a3408 |
+nodeHasValidSubcoreConfiguration(const char *sysfs_prefix,
|
|
|
7a3408 |
+ int threads_per_subcore)
|
|
|
7a3408 |
+{
|
|
|
7a3408 |
+ virBitmapPtr online_cpus = NULL;
|
|
|
7a3408 |
+ int cpu = -1;
|
|
|
7a3408 |
+ bool ret = false;
|
|
|
7a3408 |
+
|
|
|
7a3408 |
+ /* No point in checking if subcores are not in use */
|
|
|
7a3408 |
+ if (threads_per_subcore <= 0)
|
|
|
7a3408 |
+ goto cleanup;
|
|
|
7a3408 |
+
|
|
|
7a3408 |
+ if (!(online_cpus = nodeGetOnlineCPUBitmap(sysfs_prefix)))
|
|
|
7a3408 |
+ goto cleanup;
|
|
|
7a3408 |
+
|
|
|
7a3408 |
+ while ((cpu = virBitmapNextSetBit(online_cpus, cpu)) >= 0) {
|
|
|
7a3408 |
+
|
|
|
7a3408 |
+ /* A single online secondary thread is enough to
|
|
|
7a3408 |
+ * make the configuration invalid */
|
|
|
7a3408 |
+ if (cpu % threads_per_subcore != 0)
|
|
|
7a3408 |
+ goto cleanup;
|
|
|
7a3408 |
+ }
|
|
|
7a3408 |
+
|
|
|
7a3408 |
+ ret = true;
|
|
|
7a3408 |
+
|
|
|
7a3408 |
+ cleanup:
|
|
|
7a3408 |
+ virBitmapFree(online_cpus);
|
|
|
7a3408 |
+
|
|
|
7a3408 |
+ return ret;
|
|
|
7a3408 |
+}
|
|
|
7a3408 |
+
|
|
|
7a3408 |
int
|
|
|
7a3408 |
linuxNodeInfoCPUPopulate(const char *sysfs_prefix,
|
|
|
7a3408 |
FILE *cpuinfo,
|
|
|
7a3408 |
@@ -575,6 +634,7 @@ linuxNodeInfoCPUPopulate(const char *sysfs_prefix,
|
|
|
7a3408 |
DIR *nodedir = NULL;
|
|
|
7a3408 |
struct dirent *nodedirent = NULL;
|
|
|
7a3408 |
int cpus, cores, socks, threads, offline = 0;
|
|
|
7a3408 |
+ int threads_per_subcore = 0;
|
|
|
7a3408 |
unsigned int node;
|
|
|
7a3408 |
int ret = -1;
|
|
|
7a3408 |
char *sysfs_nodedir = NULL;
|
|
|
7a3408 |
@@ -682,6 +742,36 @@ linuxNodeInfoCPUPopulate(const char *sysfs_prefix,
|
|
|
7a3408 |
goto fallback;
|
|
|
7a3408 |
}
|
|
|
7a3408 |
|
|
|
7a3408 |
+ /* PPC-KVM needs the secondary threads of a core to be offline on the
|
|
|
7a3408 |
+ * host. The kvm scheduler brings the secondary threads online in the
|
|
|
7a3408 |
+ * guest context. Moreover, P8 processor has split-core capability
|
|
|
7a3408 |
+ * where, there can be 1,2 or 4 subcores per core. The primaries of the
|
|
|
7a3408 |
+ * subcores alone will be online on the host for a subcore in the
|
|
|
7a3408 |
+ * host. Even though the actual threads per core for P8 processor is 8,
|
|
|
7a3408 |
+ * depending on the subcores_per_core = 1, 2 or 4, the threads per
|
|
|
7a3408 |
+ * subcore will vary accordingly to 8, 4 and 2 respectively.
|
|
|
7a3408 |
+ * So, On host threads_per_core what is arrived at from sysfs in the
|
|
|
7a3408 |
+ * current logic is actually the subcores_per_core. Threads per subcore
|
|
|
7a3408 |
+ * can only be obtained from the kvm device. For example, on P8 with 1
|
|
|
7a3408 |
+ * core having 8 threads, sub_cores_percore=4, the threads 0,2,4 & 6
|
|
|
7a3408 |
+ * will be online. The sysfs reflects this and in the current logic
|
|
|
7a3408 |
+ * variable 'threads' will be 4 which is nothing but subcores_per_core.
|
|
|
7a3408 |
+ * If the user tampers the cpu online/offline states using chcpu or other
|
|
|
7a3408 |
+ * means, then it is an unsupported configuration for kvm.
|
|
|
7a3408 |
+ * The code below tries to keep in mind
|
|
|
7a3408 |
+ * - when the libvirtd is run inside a KVM guest or Phyp based guest.
|
|
|
7a3408 |
+ * - Or on the kvm host where user manually tampers the cpu states to
|
|
|
7a3408 |
+ * offline/online randomly.
|
|
|
7a3408 |
+ * On hosts other than POWER this will be 0, in which case a simpler
|
|
|
7a3408 |
+ * thread-counting logic will be used */
|
|
|
7a3408 |
+ if ((threads_per_subcore = nodeGetThreadsPerSubcore(arch)) < 0)
|
|
|
7a3408 |
+ goto cleanup;
|
|
|
7a3408 |
+
|
|
|
7a3408 |
+ /* If the subcore configuration is not valid, just pretend subcores
|
|
|
7a3408 |
+ * are not in use and count threads one by one */
|
|
|
7a3408 |
+ if (!nodeHasValidSubcoreConfiguration(sysfs_prefix, threads_per_subcore))
|
|
|
7a3408 |
+ threads_per_subcore = 0;
|
|
|
7a3408 |
+
|
|
|
7a3408 |
while ((direrr = virDirRead(nodedir, &nodedirent, sysfs_nodedir)) > 0) {
|
|
|
7a3408 |
if (sscanf(nodedirent->d_name, "node%u", &node) != 1)
|
|
|
7a3408 |
continue;
|
|
|
7a3408 |
@@ -695,6 +785,7 @@ linuxNodeInfoCPUPopulate(const char *sysfs_prefix,
|
|
|
7a3408 |
if ((cpus = virNodeParseNode(sysfs_cpudir, arch,
|
|
|
7a3408 |
present_cpus_map,
|
|
|
7a3408 |
online_cpus_map,
|
|
|
7a3408 |
+ threads_per_subcore,
|
|
|
7a3408 |
&socks, &cores,
|
|
|
7a3408 |
&threads, &offline)) < 0)
|
|
|
7a3408 |
goto cleanup;
|
|
|
7a3408 |
@@ -728,6 +819,7 @@ linuxNodeInfoCPUPopulate(const char *sysfs_prefix,
|
|
|
7a3408 |
if ((cpus = virNodeParseNode(sysfs_cpudir, arch,
|
|
|
7a3408 |
present_cpus_map,
|
|
|
7a3408 |
online_cpus_map,
|
|
|
7a3408 |
+ threads_per_subcore,
|
|
|
7a3408 |
&socks, &cores,
|
|
|
7a3408 |
&threads, &offline)) < 0)
|
|
|
7a3408 |
goto cleanup;
|
|
|
7a3408 |
@@ -2247,3 +2339,56 @@ nodeAllocPages(unsigned int npages,
|
|
|
7a3408 |
cleanup:
|
|
|
7a3408 |
return ret;
|
|
|
7a3408 |
}
|
|
|
7a3408 |
+
|
|
|
7a3408 |
+/* Get the number of threads per subcore.
|
|
|
7a3408 |
+ *
|
|
|
7a3408 |
+ * This will be 2, 4 or 8 on POWER hosts, depending on the current
|
|
|
7a3408 |
+ * micro-threading configuration, and 0 everywhere else.
|
|
|
7a3408 |
+ *
|
|
|
7a3408 |
+ * Returns the number of threads per subcore if subcores are in use, zero
|
|
|
7a3408 |
+ * if subcores are not in use, and a negative value on error */
|
|
|
7a3408 |
+int
|
|
|
7a3408 |
+nodeGetThreadsPerSubcore(virArch arch)
|
|
|
7a3408 |
+{
|
|
|
7a3408 |
+ int threads_per_subcore = 0;
|
|
|
7a3408 |
+
|
|
|
7a3408 |
+#if HAVE_LINUX_KVM_H && defined(KVM_CAP_PPC_SMT)
|
|
|
7a3408 |
+ const char *kvmpath = "/dev/kvm";
|
|
|
7a3408 |
+ int kvmfd;
|
|
|
7a3408 |
+
|
|
|
7a3408 |
+ if (ARCH_IS_PPC64(arch)) {
|
|
|
7a3408 |
+
|
|
|
7a3408 |
+ /* It's okay if /dev/kvm doesn't exist, because
|
|
|
7a3408 |
+ * a. we might be running in a guest
|
|
|
7a3408 |
+ * b. the kvm module might not be installed or enabled
|
|
|
7a3408 |
+ * In either case, falling back to the subcore-unaware thread
|
|
|
7a3408 |
+ * counting logic is the right thing to do */
|
|
|
7a3408 |
+ if (!virFileExists(kvmpath))
|
|
|
7a3408 |
+ goto out;
|
|
|
7a3408 |
+
|
|
|
7a3408 |
+ if ((kvmfd = open(kvmpath, O_RDONLY)) < 0) {
|
|
|
7a3408 |
+ /* This can happen when running as a regular user if
|
|
|
7a3408 |
+ * permissions are tight enough, in which case erroring out
|
|
|
7a3408 |
+ * is better than silently falling back and reporting
|
|
|
7a3408 |
+ * different nodeinfo depending on the user */
|
|
|
7a3408 |
+ virReportSystemError(errno,
|
|
|
7a3408 |
+ _("Failed to open '%s'"),
|
|
|
7a3408 |
+ kvmpath);
|
|
|
7a3408 |
+ threads_per_subcore = -1;
|
|
|
7a3408 |
+ goto out;
|
|
|
7a3408 |
+ }
|
|
|
7a3408 |
+
|
|
|
7a3408 |
+ /* For Phyp and KVM based guests the ioctl for KVM_CAP_PPC_SMT
|
|
|
7a3408 |
+ * returns zero and both primary and secondary threads will be
|
|
|
7a3408 |
+ * online */
|
|
|
7a3408 |
+ threads_per_subcore = ioctl(kvmfd,
|
|
|
7a3408 |
+ KVM_CHECK_EXTENSION,
|
|
|
7a3408 |
+ KVM_CAP_PPC_SMT);
|
|
|
7a3408 |
+
|
|
|
7a3408 |
+ VIR_FORCE_CLOSE(kvmfd);
|
|
|
7a3408 |
+ }
|
|
|
7a3408 |
+#endif /* HAVE_LINUX_KVM_H && defined(KVM_CAP_PPC_SMT) */
|
|
|
7a3408 |
+
|
|
|
7a3408 |
+ out:
|
|
|
7a3408 |
+ return threads_per_subcore;
|
|
|
7a3408 |
+}
|
|
|
7a3408 |
diff --git a/src/nodeinfo.h b/src/nodeinfo.h
|
|
|
7a3408 |
index 1810c1c..ac96dca 100644
|
|
|
7a3408 |
--- a/src/nodeinfo.h
|
|
|
7a3408 |
+++ b/src/nodeinfo.h
|
|
|
7a3408 |
@@ -47,6 +47,7 @@ int nodeGetMemory(unsigned long long *mem,
|
|
|
7a3408 |
virBitmapPtr nodeGetPresentCPUBitmap(const char *sysfs_prefix);
|
|
|
7a3408 |
virBitmapPtr nodeGetOnlineCPUBitmap(const char *sysfs_prefix);
|
|
|
7a3408 |
int nodeGetCPUCount(const char *sysfs_prefix);
|
|
|
7a3408 |
+int nodeGetThreadsPerSubcore(virArch arch);
|
|
|
7a3408 |
|
|
|
7a3408 |
int nodeGetMemoryParameters(virTypedParameterPtr params,
|
|
|
7a3408 |
int *nparams,
|
|
|
7a3408 |
--
|
|
|
7a3408 |
2.5.0
|
|
|
7a3408 |
|