|
|
9ae3a8 |
From e9148733cef44bebb0d74a731a70b3304e720634 Mon Sep 17 00:00:00 2001
|
|
|
9ae3a8 |
From: Alex Williamson <alex.williamson@redhat.com>
|
|
|
9ae3a8 |
Date: Thu, 13 Dec 2018 21:55:26 +0100
|
|
|
9ae3a8 |
Subject: [PATCH 5/5] vfio: Inhibit ballooning based on group attachment to a
|
|
|
9ae3a8 |
container
|
|
|
9ae3a8 |
|
|
|
9ae3a8 |
RH-Author: Alex Williamson <alex.williamson@redhat.com>
|
|
|
9ae3a8 |
Message-id: <154473812659.22725.6814768117383324849.stgit@gimli.home>
|
|
|
9ae3a8 |
Patchwork-id: 83497
|
|
|
9ae3a8 |
O-Subject: [RHEL-7.7 qemu-kvm PATCH 5/5] vfio: Inhibit ballooning based on group attachment to a container
|
|
|
9ae3a8 |
Bugzilla: 1659229
|
|
|
9ae3a8 |
RH-Acked-by: Peter Xu <peterx@redhat.com>
|
|
|
9ae3a8 |
RH-Acked-by: Cornelia Huck <cohuck@redhat.com>
|
|
|
9ae3a8 |
RH-Acked-by: Auger Eric <eric.auger@redhat.com>
|
|
|
9ae3a8 |
|
|
|
9ae3a8 |
Bugzilla: 1659229
|
|
|
9ae3a8 |
Notes: Error path has more exit paths versus upstream
|
|
|
9ae3a8 |
|
|
|
9ae3a8 |
We use a VFIOContainer to associate an AddressSpace to one or more
|
|
|
9ae3a8 |
VFIOGroups. The VFIOContainer represents the DMA context for that
|
|
|
9ae3a8 |
AddressSpace for those VFIOGroups and is synchronized to changes in
|
|
|
9ae3a8 |
that AddressSpace via a MemoryListener. For IOMMU backed devices,
|
|
|
9ae3a8 |
maintaining the DMA context for a VFIOGroup generally involves
|
|
|
9ae3a8 |
pinning a host virtual address in order to create a stable host
|
|
|
9ae3a8 |
physical address and then mapping a translation from the associated
|
|
|
9ae3a8 |
guest physical address to that host physical address into the IOMMU.
|
|
|
9ae3a8 |
|
|
|
9ae3a8 |
While the above maintains the VFIOContainer synchronized to the QEMU
|
|
|
9ae3a8 |
memory API of the VM, memory ballooning occurs outside of that API.
|
|
|
9ae3a8 |
Inflating the memory balloon (i.e. cooperatively capturing pages from
|
|
|
9ae3a8 |
the guest for use by the host) simply uses MADV_DONTNEED to "zap"
|
|
|
9ae3a8 |
pages from QEMU's host virtual address space. The page pinning and
|
|
|
9ae3a8 |
IOMMU mapping above remains in place, negating the host's ability to
|
|
|
9ae3a8 |
reuse the page, but the host virtual to host physical mapping of the
|
|
|
9ae3a8 |
page is invalidated outside of QEMU's memory API.
|
|
|
9ae3a8 |
|
|
|
9ae3a8 |
When the balloon is later deflated, attempting to cooperatively
|
|
|
9ae3a8 |
return pages to the guest, the page is simply freed by the guest
|
|
|
9ae3a8 |
balloon driver, allowing it to be used in the guest and incurring a
|
|
|
9ae3a8 |
page fault when that occurs. The page fault maps a new host physical
|
|
|
9ae3a8 |
page backing the existing host virtual address, meanwhile the
|
|
|
9ae3a8 |
VFIOContainer still maintains the translation to the original host
|
|
|
9ae3a8 |
physical address. At this point the guest vCPU and any assigned
|
|
|
9ae3a8 |
devices will map different host physical addresses to the same guest
|
|
|
9ae3a8 |
physical address. Badness.
|
|
|
9ae3a8 |
|
|
|
9ae3a8 |
The IOMMU typically does not have page level granularity with which
|
|
|
9ae3a8 |
it can track this mapping without also incurring inefficiencies in
|
|
|
9ae3a8 |
using page size mappings throughout. MMU notifiers in the host
|
|
|
9ae3a8 |
kernel also provide indicators for invalidating the mapping on
|
|
|
9ae3a8 |
balloon inflation, not for updating the mapping when the balloon is
|
|
|
9ae3a8 |
deflated. For these reasons we assume a default behavior that the
|
|
|
9ae3a8 |
mapping of each VFIOGroup into the VFIOContainer is incompatible
|
|
|
9ae3a8 |
with memory ballooning and increment the balloon inhibitor to match
|
|
|
9ae3a8 |
the attached VFIOGroups.
|
|
|
9ae3a8 |
|
|
|
9ae3a8 |
Reviewed-by: Peter Xu <peterx@redhat.com>
|
|
|
9ae3a8 |
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
|
|
|
9ae3a8 |
(cherry picked from commit c65ee433153b5925e183a00ebf568e160077c694)
|
|
|
9ae3a8 |
Signed-off-by: Miroslav Rezanina <mrezanin@redhat.com>
|
|
|
9ae3a8 |
---
|
|
|
9ae3a8 |
hw/misc/vfio.c | 35 +++++++++++++++++++++++++++++++++++
|
|
|
9ae3a8 |
1 file changed, 35 insertions(+)
|
|
|
9ae3a8 |
|
|
|
9ae3a8 |
diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c
|
|
|
9ae3a8 |
index f91eecb..f7360bf 100644
|
|
|
9ae3a8 |
--- a/hw/misc/vfio.c
|
|
|
9ae3a8 |
+++ b/hw/misc/vfio.c
|
|
|
9ae3a8 |
@@ -37,6 +37,7 @@
|
|
|
9ae3a8 |
#include "qemu/event_notifier.h"
|
|
|
9ae3a8 |
#include "qemu/queue.h"
|
|
|
9ae3a8 |
#include "qemu/range.h"
|
|
|
9ae3a8 |
+#include "sysemu/balloon.h"
|
|
|
9ae3a8 |
#include "sysemu/kvm.h"
|
|
|
9ae3a8 |
#include "sysemu/sysemu.h"
|
|
|
9ae3a8 |
#include "trace.h"
|
|
|
9ae3a8 |
@@ -3667,6 +3668,33 @@ static int vfio_connect_container(VFIOGroup *group)
|
|
|
9ae3a8 |
return 0;
|
|
|
9ae3a8 |
}
|
|
|
9ae3a8 |
|
|
|
9ae3a8 |
+ /*
|
|
|
9ae3a8 |
+ * VFIO is currently incompatible with memory ballooning insofar as the
|
|
|
9ae3a8 |
+ * madvise to purge (zap) the page from QEMU's address space does not
|
|
|
9ae3a8 |
+ * interact with the memory API and therefore leaves stale virtual to
|
|
|
9ae3a8 |
+ * physical mappings in the IOMMU if the page was previously pinned. We
|
|
|
9ae3a8 |
+ * therefore add a balloon inhibit for each group added to a container,
|
|
|
9ae3a8 |
+ * whether the container is used individually or shared. This provides
|
|
|
9ae3a8 |
+ * us with options to allow devices within a group to opt-in and allow
|
|
|
9ae3a8 |
+ * ballooning, so long as it is done consistently for a group (for instance
|
|
|
9ae3a8 |
+ * if the device is an mdev device where it is known that the host vendor
|
|
|
9ae3a8 |
+ * driver will never pin pages outside of the working set of the guest
|
|
|
9ae3a8 |
+ * driver, which would thus not be ballooning candidates).
|
|
|
9ae3a8 |
+ *
|
|
|
9ae3a8 |
+ * The first opportunity to induce pinning occurs here where we attempt to
|
|
|
9ae3a8 |
+ * attach the group to existing containers within the AddressSpace. If any
|
|
|
9ae3a8 |
+ * pages are already zapped from the virtual address space, such as from a
|
|
|
9ae3a8 |
+ * previous ballooning opt-in, new pinning will cause valid mappings to be
|
|
|
9ae3a8 |
+ * re-established. Likewise, when the overall MemoryListener for a new
|
|
|
9ae3a8 |
+ * container is registered, a replay of mappings within the AddressSpace
|
|
|
9ae3a8 |
+ * will occur, re-establishing any previously zapped pages as well.
|
|
|
9ae3a8 |
+ *
|
|
|
9ae3a8 |
+ * NB. Balloon inhibiting does not currently block operation of the
|
|
|
9ae3a8 |
+ * balloon driver or revoke previously pinned pages, it only prevents
|
|
|
9ae3a8 |
+ * calling madvise to modify the virtual mapping of ballooned pages.
|
|
|
9ae3a8 |
+ */
|
|
|
9ae3a8 |
+ qemu_balloon_inhibit(true);
|
|
|
9ae3a8 |
+
|
|
|
9ae3a8 |
QLIST_FOREACH(container, &container_list, next) {
|
|
|
9ae3a8 |
if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
|
|
|
9ae3a8 |
group->container = container;
|
|
|
9ae3a8 |
@@ -3678,6 +3706,7 @@ static int vfio_connect_container(VFIOGroup *group)
|
|
|
9ae3a8 |
fd = qemu_open("/dev/vfio/vfio", O_RDWR);
|
|
|
9ae3a8 |
if (fd < 0) {
|
|
|
9ae3a8 |
error_report("vfio: failed to open /dev/vfio/vfio: %m");
|
|
|
9ae3a8 |
+ qemu_balloon_inhibit(false);
|
|
|
9ae3a8 |
return -errno;
|
|
|
9ae3a8 |
}
|
|
|
9ae3a8 |
|
|
|
9ae3a8 |
@@ -3686,6 +3715,7 @@ static int vfio_connect_container(VFIOGroup *group)
|
|
|
9ae3a8 |
error_report("vfio: supported vfio version: %d, "
|
|
|
9ae3a8 |
"reported version: %d", VFIO_API_VERSION, ret);
|
|
|
9ae3a8 |
close(fd);
|
|
|
9ae3a8 |
+ qemu_balloon_inhibit(false);
|
|
|
9ae3a8 |
return -EINVAL;
|
|
|
9ae3a8 |
}
|
|
|
9ae3a8 |
|
|
|
9ae3a8 |
@@ -3701,6 +3731,7 @@ static int vfio_connect_container(VFIOGroup *group)
|
|
|
9ae3a8 |
error_report("vfio: failed to set group container: %m");
|
|
|
9ae3a8 |
g_free(container);
|
|
|
9ae3a8 |
close(fd);
|
|
|
9ae3a8 |
+ qemu_balloon_inhibit(false);
|
|
|
9ae3a8 |
return -errno;
|
|
|
9ae3a8 |
}
|
|
|
9ae3a8 |
|
|
|
9ae3a8 |
@@ -3710,6 +3741,7 @@ static int vfio_connect_container(VFIOGroup *group)
|
|
|
9ae3a8 |
error_report("vfio: failed to set iommu for container: %m");
|
|
|
9ae3a8 |
g_free(container);
|
|
|
9ae3a8 |
close(fd);
|
|
|
9ae3a8 |
+ qemu_balloon_inhibit(false);
|
|
|
9ae3a8 |
return -errno;
|
|
|
9ae3a8 |
}
|
|
|
9ae3a8 |
|
|
|
9ae3a8 |
@@ -3724,6 +3756,7 @@ static int vfio_connect_container(VFIOGroup *group)
|
|
|
9ae3a8 |
vfio_listener_release(container);
|
|
|
9ae3a8 |
g_free(container);
|
|
|
9ae3a8 |
close(fd);
|
|
|
9ae3a8 |
+ qemu_balloon_inhibit(false);
|
|
|
9ae3a8 |
error_report("vfio: memory listener initialization failed for container\n");
|
|
|
9ae3a8 |
return ret;
|
|
|
9ae3a8 |
}
|
|
|
9ae3a8 |
@@ -3734,6 +3767,7 @@ static int vfio_connect_container(VFIOGroup *group)
|
|
|
9ae3a8 |
error_report("vfio: No available IOMMU models");
|
|
|
9ae3a8 |
g_free(container);
|
|
|
9ae3a8 |
close(fd);
|
|
|
9ae3a8 |
+ qemu_balloon_inhibit(false);
|
|
|
9ae3a8 |
return -EINVAL;
|
|
|
9ae3a8 |
}
|
|
|
9ae3a8 |
|
|
|
9ae3a8 |
@@ -3834,6 +3868,7 @@ static void vfio_put_group(VFIOGroup *group)
|
|
|
9ae3a8 |
return;
|
|
|
9ae3a8 |
}
|
|
|
9ae3a8 |
|
|
|
9ae3a8 |
+ qemu_balloon_inhibit(false);
|
|
|
9ae3a8 |
vfio_kvm_device_del_group(group);
|
|
|
9ae3a8 |
vfio_disconnect_container(group);
|
|
|
9ae3a8 |
QLIST_REMOVE(group, next);
|
|
|
9ae3a8 |
--
|
|
|
9ae3a8 |
1.8.3.1
|
|
|
9ae3a8 |
|