Blob Blame History Raw
From c1bbe50778b3a114bccccb1d94fa9beefcc52ae5 Mon Sep 17 00:00:00 2001
Message-Id: <c1bbe50778b3a114bccccb1d94fa9beefcc52ae5@dist-git>
From: Andrea Bolognani <abologna@redhat.com>
Date: Fri, 20 Nov 2015 13:29:14 +0100
Subject: [PATCH] qemu: Add ppc64-specific math to
 qemuDomainGetMlockLimitBytes()

The amount of memory a ppc64 domain might need to lock is different
than that of a equally-sized x86 domain, so we need to check the
domain's architecture and act accordingly.

Resolves: https://bugzilla.redhat.com/show_bug.cgi?id=1273480

(cherry picked from commit d269ef165c178ad62b48e5179fc4f3b4fa5e590b)

Bug: https://bugzilla.redhat.com/show_bug.cgi?id=1283924

Signed-off-by: Andrea Bolognani <abologna@redhat.com>
Signed-off-by: Jiri Denemark <jdenemar@redhat.com>
---
 src/qemu/qemu_domain.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 79 insertions(+), 1 deletion(-)

diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
index be3dcf6..4615e3e 100644
--- a/src/qemu/qemu_domain.c
+++ b/src/qemu/qemu_domain.c
@@ -3430,7 +3430,7 @@ qemuDomainUpdateCurrentMemorySize(virQEMUDriverPtr driver,
  * @def: domain definition
  *
  * Returns the size of the memory in bytes that needs to be set as
- * RLIMIT_MEMLOCK for purpose of VFIO device passthrough.
+ * RLIMIT_MEMLOCK for the QEMU process.
  * If a mem.hard_limit is set, then that value is preferred; otherwise, the
  * value returned may depend upon the architecture or devices present.
  */
@@ -3445,6 +3445,84 @@ qemuDomainGetMlockLimitBytes(virDomainDefPtr def)
         goto done;
     }
 
+    if (ARCH_IS_PPC64(def->os.arch)) {
+        unsigned long long maxMemory;
+        unsigned long long memory;
+        unsigned long long baseLimit;
+        unsigned long long passthroughLimit;
+        size_t nPCIHostBridges;
+        size_t i;
+        bool usesVFIO = false;
+
+        /* TODO: Detect at runtime once we start using more than just
+         *       the default PCI Host Bridge */
+        nPCIHostBridges = 1;
+
+        for (i = 0; i < def->nhostdevs; i++) {
+            virDomainHostdevDefPtr dev = def->hostdevs[i];
+
+            if (dev->mode == VIR_DOMAIN_HOSTDEV_MODE_SUBSYS &&
+                dev->source.subsys.type == VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI &&
+                dev->source.subsys.u.pci.backend == VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO) {
+                usesVFIO = true;
+                break;
+            }
+        }
+
+        memory = virDomainDefGetMemoryActual(def);
+
+        if (def->mem.max_memory)
+            maxMemory = def->mem.max_memory;
+        else
+            maxMemory = memory;
+
+        /* baseLimit := maxMemory / 128                                  (a)
+         *              + 4 MiB * #PHBs + 8 MiB                          (b)
+         *
+         * (a) is the hash table
+         *
+         * (b) is accounting for the 32-bit DMA window - it could be either the
+         * KVM accelerated TCE tables for emulated devices, or the VFIO
+         * userspace view. The 4 MiB per-PHB (including the default one) covers
+         * a 2GiB DMA window: default is 1GiB, but it's possible it'll be
+         * increased to help performance. The 8 MiB extra should be plenty for
+         * the TCE table index for any reasonable number of PHBs and several
+         * spapr-vlan or spapr-vscsi devices (512kB + a tiny bit each) */
+        baseLimit = maxMemory / 128 +
+                    4096 * nPCIHostBridges +
+                    8192;
+
+        /* passthroughLimit := max( 2 GiB * #PHBs,                       (c)
+         *                          memory                               (d)
+         *                          + memory * 1/512 * #PHBs + 8 MiB )   (e)
+         *
+         * (c) is the pre-DDW VFIO DMA window accounting. We're allowing 2 GiB
+         * rather than 1 GiB
+         *
+         * (d) is the with-DDW (and memory pre-registration and related
+         * features) DMA window accounting - assuming that we only account RAM
+         * once, even if mapped to multiple PHBs
+         *
+         * (e) is the with-DDW userspace view and overhead for the 64-bit DMA
+         * window. This is based a bit on expected guest behaviour, but there
+         * really isn't a way to completely avoid that. We assume the guest
+         * requests a 64-bit DMA window (per PHB) just big enough to map all
+         * its RAM. 4 kiB page size gives the 1/512; it will be less with 64
+         * kiB pages, less still if the guest is mapped with hugepages (unlike
+         * the default 32-bit DMA window, DDW windows can use large IOMMU
+         * pages). 8 MiB is for second and further level overheads, like (b) */
+        passthroughLimit = MAX(2 * 1024 * 1024 * nPCIHostBridges,
+                               memory +
+                               memory / 512 * nPCIHostBridges + 8192);
+
+        if (usesVFIO)
+            memKB = baseLimit + passthroughLimit;
+        else
+            memKB = baseLimit;
+
+        goto done;
+    }
+
     /* For device passthrough using VFIO the guest memory and MMIO memory
      * regions need to be locked persistent in order to allow DMA.
      *
-- 
2.6.3