render / rpms / libvirt

Forked from rpms/libvirt 9 months ago
Clone
7a3408
From f5b7d2d701f347b9b4e13d008de476e37c82c8cb Mon Sep 17 00:00:00 2001
7a3408
Message-Id: <f5b7d2d701f347b9b4e13d008de476e37c82c8cb@dist-git>
7a3408
From: Shivaprasad G Bhat <sbhat@linux.vnet.ibm.com>
7a3408
Date: Wed, 5 Aug 2015 18:18:35 +0200
7a3408
Subject: [PATCH] nodeinfo: Fix output on PPC64 KVM hosts
7a3408
7a3408
The nodeinfo is reporting incorrect number of cpus and incorrect host
7a3408
topology on PPC64 KVM hosts. The KVM hypervisor on PPC64 needs only
7a3408
the primary thread in a core to be online, and the secondaries offlined.
7a3408
While scheduling a guest in, the kvm scheduler wakes up the secondaries to
7a3408
run in guest context.
7a3408
7a3408
The host scheduling of the guests happens at the core level (as only primary
7a3408
thread is online). The kvm scheduler exploits as many threads of the core
7a3408
as needed by guest. Further, starting POWER8, the processor allows splitting
7a3408
a physical core into multiple subcores with 2 or 4 threads each. Again, only
7a3408
the primary thread in a subcore is online in the host. The KVM-PPC
7a3408
scheduler allows guests to exploit all the offline threads in the subcore,
7a3408
by bringing them online when needed.
7a3408
(Kernel patches on split-core http://www.spinics.net/lists/kvm-ppc/msg09121.html)
7a3408
7a3408
The recent dynamic micro-threading changes in ppc-kvm also make sure
7a3408
to utilize all the offline cpus across guests, and across guests with
7a3408
different cpu topologies.
7a3408
(https://www.mail-archive.com/kvm@vger.kernel.org/msg115978.html)
7a3408
7a3408
Since the offline cpus are brought online in the guest context, it is safe
7a3408
to count them as online. Nodeinfo today discounts these offline cpus from
7a3408
cpu count/topology calculation, and the nodeinfo output is not of any help
7a3408
and the host appears overcommitted when it is actually not.
7a3408
7a3408
The patch carefully counts those offline threads whose primary threads are
7a3408
online. The host topology displayed by the nodeinfo is also fixed when the
7a3408
host is in valid kvm state.
7a3408
7a3408
Signed-off-by: Shivaprasad G Bhat <sbhat@linux.vnet.ibm.com>
7a3408
Signed-off-by: Andrea Bolognani <abologna@redhat.com>
7a3408
(cherry picked from commit 014208c4d028d2a632cdfe89d361fca8811899b6)
7a3408
7a3408
Bug: https://bugzilla.redhat.com/show_bug.cgi?id=1213713
7a3408
7a3408
Signed-off-by: Andrea Bolognani <abologna@redhat.com>
7a3408
Signed-off-by: Jiri Denemark <jdenemar@redhat.com>
7a3408
---
7a3408
 src/libvirt_private.syms |   1 +
7a3408
 src/nodeinfo.c           | 153 +++++++++++++++++++++++++++++++++++++++++++++--
7a3408
 src/nodeinfo.h           |   1 +
7a3408
 3 files changed, 151 insertions(+), 4 deletions(-)
7a3408
7a3408
diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms
7a3408
index 1363c92..a6b523a 100644
7a3408
--- a/src/libvirt_private.syms
7a3408
+++ b/src/libvirt_private.syms
7a3408
@@ -1008,6 +1008,7 @@ nodeGetMemoryParameters;
7a3408
 nodeGetMemoryStats;
7a3408
 nodeGetOnlineCPUBitmap;
7a3408
 nodeGetPresentCPUBitmap;
7a3408
+nodeGetThreadsPerSubcore;
7a3408
 nodeSetMemoryParameters;
7a3408
 
7a3408
 
7a3408
diff --git a/src/nodeinfo.c b/src/nodeinfo.c
7a3408
index 8b56376..ec78f65 100644
7a3408
--- a/src/nodeinfo.c
7a3408
+++ b/src/nodeinfo.c
7a3408
@@ -31,6 +31,12 @@
7a3408
 #include <dirent.h>
7a3408
 #include <sys/utsname.h>
7a3408
 #include "conf/domain_conf.h"
7a3408
+#include <fcntl.h>
7a3408
+#include <sys/ioctl.h>
7a3408
+
7a3408
+#if HAVE_LINUX_KVM_H
7a3408
+# include <linux/kvm.h>
7a3408
+#endif
7a3408
 
7a3408
 #if defined(__FreeBSD__) || defined(__APPLE__)
7a3408
 # include <sys/time.h>
7a3408
@@ -391,13 +397,14 @@ virNodeParseSocket(const char *dir,
7a3408
  * filling arguments */
7a3408
 static int
7a3408
 ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(3)
7a3408
-ATTRIBUTE_NONNULL(4) ATTRIBUTE_NONNULL(5)
7a3408
-ATTRIBUTE_NONNULL(6) ATTRIBUTE_NONNULL(7)
7a3408
-ATTRIBUTE_NONNULL(8)
7a3408
+ATTRIBUTE_NONNULL(4) ATTRIBUTE_NONNULL(6)
7a3408
+ATTRIBUTE_NONNULL(7) ATTRIBUTE_NONNULL(8)
7a3408
+ATTRIBUTE_NONNULL(9)
7a3408
 virNodeParseNode(const char *node,
7a3408
                  virArch arch,
7a3408
                  virBitmapPtr present_cpus_map,
7a3408
                  virBitmapPtr online_cpus_map,
7a3408
+                 int threads_per_subcore,
7a3408
                  int *sockets,
7a3408
                  int *cores,
7a3408
                  int *threads,
7a3408
@@ -491,7 +498,18 @@ virNodeParseNode(const char *node,
7a3408
             continue;
7a3408
 
7a3408
         if (!virBitmapIsBitSet(online_cpus_map, cpu)) {
7a3408
-            (*offline)++;
7a3408
+            if (threads_per_subcore > 0 &&
7a3408
+                cpu % threads_per_subcore != 0 &&
7a3408
+                virBitmapIsBitSet(online_cpus_map,
7a3408
+                                  cpu - (cpu % threads_per_subcore))) {
7a3408
+                /* Secondary offline threads are counted as online when
7a3408
+                 * subcores are in use and the corresponding primary
7a3408
+                 * thread is online */
7a3408
+                processors++;
7a3408
+            } else {
7a3408
+                /* But they are counted as offline otherwise */
7a3408
+                (*offline)++;
7a3408
+            }
7a3408
             continue;
7a3408
         }
7a3408
 
7a3408
@@ -544,6 +562,12 @@ virNodeParseNode(const char *node,
7a3408
             *cores = core;
7a3408
     }
7a3408
 
7a3408
+    if (threads_per_subcore > 0) {
7a3408
+        /* The thread count ignores offline threads, which means that only
7a3408
+         * primary threads have been considered so far. If subcores
7a3408
+         * are in use, we need to also account for secondary threads */
7a3408
+        *threads *= threads_per_subcore;
7a3408
+    }
7a3408
     ret = processors;
7a3408
 
7a3408
  cleanup:
7a3408
@@ -562,6 +586,41 @@ virNodeParseNode(const char *node,
7a3408
     return ret;
7a3408
 }
7a3408
 
7a3408
+/* Check whether the host subcore configuration is valid.
7a3408
+ *
7a3408
+ * A valid configuration is one where no secondary thread is online;
7a3408
+ * the primary thread in a subcore is always the first one */
7a3408
+static bool
7a3408
+nodeHasValidSubcoreConfiguration(const char *sysfs_prefix,
7a3408
+                                 int threads_per_subcore)
7a3408
+{
7a3408
+    virBitmapPtr online_cpus = NULL;
7a3408
+    int cpu = -1;
7a3408
+    bool ret = false;
7a3408
+
7a3408
+    /* No point in checking if subcores are not in use */
7a3408
+    if (threads_per_subcore <= 0)
7a3408
+        goto cleanup;
7a3408
+
7a3408
+    if (!(online_cpus = nodeGetOnlineCPUBitmap(sysfs_prefix)))
7a3408
+        goto cleanup;
7a3408
+
7a3408
+    while ((cpu = virBitmapNextSetBit(online_cpus, cpu)) >= 0) {
7a3408
+
7a3408
+        /* A single online secondary thread is enough to
7a3408
+         * make the configuration invalid */
7a3408
+        if (cpu % threads_per_subcore != 0)
7a3408
+            goto cleanup;
7a3408
+    }
7a3408
+
7a3408
+    ret = true;
7a3408
+
7a3408
+ cleanup:
7a3408
+    virBitmapFree(online_cpus);
7a3408
+
7a3408
+    return ret;
7a3408
+}
7a3408
+
7a3408
 int
7a3408
 linuxNodeInfoCPUPopulate(const char *sysfs_prefix,
7a3408
                          FILE *cpuinfo,
7a3408
@@ -575,6 +634,7 @@ linuxNodeInfoCPUPopulate(const char *sysfs_prefix,
7a3408
     DIR *nodedir = NULL;
7a3408
     struct dirent *nodedirent = NULL;
7a3408
     int cpus, cores, socks, threads, offline = 0;
7a3408
+    int threads_per_subcore = 0;
7a3408
     unsigned int node;
7a3408
     int ret = -1;
7a3408
     char *sysfs_nodedir = NULL;
7a3408
@@ -682,6 +742,36 @@ linuxNodeInfoCPUPopulate(const char *sysfs_prefix,
7a3408
         goto fallback;
7a3408
     }
7a3408
 
7a3408
+    /* PPC-KVM needs the secondary threads of a core to be offline on the
7a3408
+     * host. The kvm scheduler brings the secondary threads online in the
7a3408
+     * guest context. Moreover, P8 processor has split-core capability
7a3408
+     * where there can be 1, 2 or 4 subcores per core. The primaries of the
7a3408
+     * subcores alone will be online on the host for a subcore in the
7a3408
+     * host. Even though the actual threads per core for P8 processor is 8,
7a3408
+     * depending on the subcores_per_core = 1, 2 or 4, the threads per
7a3408
+     * subcore will vary accordingly to 8, 4 and 2 respectively.
7a3408
+     * So, On host threads_per_core what is arrived at from sysfs in the
7a3408
+     * current logic is actually the subcores_per_core. Threads per subcore
7a3408
+     * can only be obtained from the kvm device. For example, on P8 with 1
7a3408
+     * core having 8 threads, sub_cores_percore=4, the threads 0,2,4 & 6
7a3408
+     * will be online. The sysfs reflects this and in the current logic
7a3408
+     * variable 'threads' will be 4 which is nothing but subcores_per_core.
7a3408
+     * If the user tampers the cpu online/offline states using chcpu or other
7a3408
+     * means, then it is an unsupported configuration for kvm.
7a3408
+     * The code below tries to keep in mind
7a3408
+     *  - when the libvirtd is run inside a KVM guest or Phyp based guest.
7a3408
+     *  - Or on the kvm host where user manually tampers the cpu states to
7a3408
+     *    offline/online randomly.
7a3408
+     * On hosts other than POWER this will be 0, in which case a simpler
7a3408
+     * thread-counting logic will be used  */
7a3408
+    if ((threads_per_subcore = nodeGetThreadsPerSubcore(arch)) < 0)
7a3408
+        goto cleanup;
7a3408
+
7a3408
+    /* If the subcore configuration is not valid, just pretend subcores
7a3408
+     * are not in use and count threads one by one */
7a3408
+    if (!nodeHasValidSubcoreConfiguration(sysfs_prefix, threads_per_subcore))
7a3408
+        threads_per_subcore = 0;
7a3408
+
7a3408
     while ((direrr = virDirRead(nodedir, &nodedirent, sysfs_nodedir)) > 0) {
7a3408
         if (sscanf(nodedirent->d_name, "node%u", &node) != 1)
7a3408
             continue;
7a3408
@@ -695,6 +785,7 @@ linuxNodeInfoCPUPopulate(const char *sysfs_prefix,
7a3408
         if ((cpus = virNodeParseNode(sysfs_cpudir, arch,
7a3408
                                      present_cpus_map,
7a3408
                                      online_cpus_map,
7a3408
+                                     threads_per_subcore,
7a3408
                                      &socks, &cores,
7a3408
                                      &threads, &offline)) < 0)
7a3408
             goto cleanup;
7a3408
@@ -728,6 +819,7 @@ linuxNodeInfoCPUPopulate(const char *sysfs_prefix,
7a3408
     if ((cpus = virNodeParseNode(sysfs_cpudir, arch,
7a3408
                                  present_cpus_map,
7a3408
                                  online_cpus_map,
7a3408
+                                 threads_per_subcore,
7a3408
                                  &socks, &cores,
7a3408
                                  &threads, &offline)) < 0)
7a3408
         goto cleanup;
7a3408
@@ -2247,3 +2339,56 @@ nodeAllocPages(unsigned int npages,
7a3408
  cleanup:
7a3408
     return ret;
7a3408
 }
7a3408
+
7a3408
+/* Get the number of threads per subcore.
7a3408
+ *
7a3408
+ * This will be 2, 4 or 8 on POWER hosts, depending on the current
7a3408
+ * micro-threading configuration, and 0 everywhere else.
7a3408
+ *
7a3408
+ * Returns the number of threads per subcore if subcores are in use, zero
7a3408
+ * if subcores are not in use, and a negative value on error */
7a3408
+int
7a3408
+nodeGetThreadsPerSubcore(virArch arch)
7a3408
+{
7a3408
+    int threads_per_subcore = 0;
7a3408
+
7a3408
+#if HAVE_LINUX_KVM_H && defined(KVM_CAP_PPC_SMT)
7a3408
+    const char *kvmpath = "/dev/kvm";
7a3408
+    int kvmfd;
7a3408
+
7a3408
+    if (ARCH_IS_PPC64(arch)) {
7a3408
+
7a3408
+        /* It's okay if /dev/kvm doesn't exist, because
7a3408
+         *   a. we might be running in a guest
7a3408
+         *   b. the kvm module might not be installed or enabled
7a3408
+         * In either case, falling back to the subcore-unaware thread
7a3408
+         * counting logic is the right thing to do */
7a3408
+        if (!virFileExists(kvmpath))
7a3408
+            goto out;
7a3408
+
7a3408
+        if ((kvmfd = open(kvmpath, O_RDONLY)) < 0) {
7a3408
+            /* This can happen when running as a regular user if
7a3408
+             * permissions are tight enough, in which case erroring out
7a3408
+             * is better than silently falling back and reporting
7a3408
+             * different nodeinfo depending on the user */
7a3408
+            virReportSystemError(errno,
7a3408
+                                 _("Failed to open '%s'"),
7a3408
+                                 kvmpath);
7a3408
+            threads_per_subcore = -1;
7a3408
+            goto out;
7a3408
+        }
7a3408
+
7a3408
+        /* For Phyp and KVM based guests the ioctl for KVM_CAP_PPC_SMT
7a3408
+         * returns zero and both primary and secondary threads will be
7a3408
+         * online */
7a3408
+        threads_per_subcore = ioctl(kvmfd,
7a3408
+                                    KVM_CHECK_EXTENSION,
7a3408
+                                    KVM_CAP_PPC_SMT);
7a3408
+
7a3408
+        VIR_FORCE_CLOSE(kvmfd);
7a3408
+    }
7a3408
+#endif /* HAVE_LINUX_KVM_H && defined(KVM_CAP_PPC_SMT) */
7a3408
+
7a3408
+ out:
7a3408
+    return threads_per_subcore;
7a3408
+}
7a3408
diff --git a/src/nodeinfo.h b/src/nodeinfo.h
7a3408
index 1810c1c..ac96dca 100644
7a3408
--- a/src/nodeinfo.h
7a3408
+++ b/src/nodeinfo.h
7a3408
@@ -47,6 +47,7 @@ int nodeGetMemory(unsigned long long *mem,
7a3408
 virBitmapPtr nodeGetPresentCPUBitmap(const char *sysfs_prefix);
7a3408
 virBitmapPtr nodeGetOnlineCPUBitmap(const char *sysfs_prefix);
7a3408
 int nodeGetCPUCount(const char *sysfs_prefix);
7a3408
+int nodeGetThreadsPerSubcore(virArch arch);
7a3408
 
7a3408
 int nodeGetMemoryParameters(virTypedParameterPtr params,
7a3408
                             int *nparams,
7a3408
-- 
7a3408
2.5.0
7a3408