|
|
ae23c9 |
From f9416fd5d1232f47af1366c8099003a88dab4a21 Mon Sep 17 00:00:00 2001
|
|
|
ae23c9 |
From: Alex Williamson <alex.williamson@redhat.com>
|
|
|
ae23c9 |
Date: Mon, 3 Dec 2018 22:01:48 +0000
|
|
|
ae23c9 |
Subject: [PATCH 12/16] vfio: Inhibit ballooning based on group attachment to a
|
|
|
ae23c9 |
container
|
|
|
ae23c9 |
|
|
|
ae23c9 |
RH-Author: Alex Williamson <alex.williamson@redhat.com>
|
|
|
ae23c9 |
Message-id: <154387450879.27651.3509144221336190827.stgit@gimli.home>
|
|
|
ae23c9 |
Patchwork-id: 83238
|
|
|
ae23c9 |
O-Subject: [RHEL-8.0 qemu-kvm PATCH 3/7] vfio: Inhibit ballooning based on group attachment to a container
|
|
|
ae23c9 |
Bugzilla: 1650272
|
|
|
ae23c9 |
RH-Acked-by: Peter Xu <peterx@redhat.com>
|
|
|
ae23c9 |
RH-Acked-by: Auger Eric <eric.auger@redhat.com>
|
|
|
ae23c9 |
RH-Acked-by: Cornelia Huck <cohuck@redhat.com>
|
|
|
ae23c9 |
RH-Acked-by: David Hildenbrand <david@redhat.com>
|
|
|
ae23c9 |
|
|
|
ae23c9 |
Bugzilla: 1650272
|
|
|
ae23c9 |
|
|
|
ae23c9 |
We use a VFIOContainer to associate an AddressSpace to one or more
|
|
|
ae23c9 |
VFIOGroups. The VFIOContainer represents the DMA context for that
|
|
|
ae23c9 |
AddressSpace for those VFIOGroups and is synchronized to changes in
|
|
|
ae23c9 |
that AddressSpace via a MemoryListener. For IOMMU backed devices,
|
|
|
ae23c9 |
maintaining the DMA context for a VFIOGroup generally involves
|
|
|
ae23c9 |
pinning a host virtual address in order to create a stable host
|
|
|
ae23c9 |
physical address and then mapping a translation from the associated
|
|
|
ae23c9 |
guest physical address to that host physical address into the IOMMU.
|
|
|
ae23c9 |
|
|
|
ae23c9 |
While the above maintains the VFIOContainer synchronized to the QEMU
|
|
|
ae23c9 |
memory API of the VM, memory ballooning occurs outside of that API.
|
|
|
ae23c9 |
Inflating the memory balloon (i.e. cooperatively capturing pages from
|
|
|
ae23c9 |
the guest for use by the host) simply uses MADV_DONTNEED to "zap"
|
|
|
ae23c9 |
pages from QEMU's host virtual address space. The page pinning and
|
|
|
ae23c9 |
IOMMU mapping above remains in place, negating the host's ability to
|
|
|
ae23c9 |
reuse the page, but the host virtual to host physical mapping of the
|
|
|
ae23c9 |
page is invalidated outside of QEMU's memory API.
|
|
|
ae23c9 |
|
|
|
ae23c9 |
When the balloon is later deflated, attempting to cooperatively
|
|
|
ae23c9 |
return pages to the guest, the page is simply freed by the guest
|
|
|
ae23c9 |
balloon driver, allowing it to be used in the guest and incurring a
|
|
|
ae23c9 |
page fault when that occurs. The page fault maps a new host physical
|
|
|
ae23c9 |
page backing the existing host virtual address, meanwhile the
|
|
|
ae23c9 |
VFIOContainer still maintains the translation to the original host
|
|
|
ae23c9 |
physical address. At this point the guest vCPU and any assigned
|
|
|
ae23c9 |
devices will map different host physical addresses to the same guest
|
|
|
ae23c9 |
physical address. Badness.
|
|
|
ae23c9 |
|
|
|
ae23c9 |
The IOMMU typically does not have page-level granularity with which
|
|
|
ae23c9 |
it can track this mapping without also incurring inefficiencies in
|
|
|
ae23c9 |
using page size mappings throughout. MMU notifiers in the host
|
|
|
ae23c9 |
kernel also provide indicators for invalidating the mapping on
|
|
|
ae23c9 |
balloon inflation, not for updating the mapping when the balloon is
|
|
|
ae23c9 |
deflated. For these reasons we assume a default behavior that the
|
|
|
ae23c9 |
mapping of each VFIOGroup into the VFIOContainer is incompatible
|
|
|
ae23c9 |
with memory ballooning and increment the balloon inhibitor to match
|
|
|
ae23c9 |
the attached VFIOGroups.
|
|
|
ae23c9 |
|
|
|
ae23c9 |
Reviewed-by: Peter Xu <peterx@redhat.com>
|
|
|
ae23c9 |
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
|
|
|
ae23c9 |
(cherry picked from commit c65ee433153b5925e183a00ebf568e160077c694)
|
|
|
ae23c9 |
Signed-off-by: Danilo C. L. de Paula <ddepaula@redhat.com>
|
|
|
ae23c9 |
---
|
|
|
ae23c9 |
hw/vfio/common.c | 30 ++++++++++++++++++++++++++++++
|
|
|
ae23c9 |
1 file changed, 30 insertions(+)
|
|
|
ae23c9 |
|
|
|
ae23c9 |
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
|
|
|
ae23c9 |
index 07ffa0b..7e8f289 100644
|
|
|
ae23c9 |
--- a/hw/vfio/common.c
|
|
|
ae23c9 |
+++ b/hw/vfio/common.c
|
|
|
ae23c9 |
@@ -32,6 +32,7 @@
|
|
|
ae23c9 |
#include "hw/hw.h"
|
|
|
ae23c9 |
#include "qemu/error-report.h"
|
|
|
ae23c9 |
#include "qemu/range.h"
|
|
|
ae23c9 |
+#include "sysemu/balloon.h"
|
|
|
ae23c9 |
#include "sysemu/kvm.h"
|
|
|
ae23c9 |
#include "trace.h"
|
|
|
ae23c9 |
#include "qapi/error.h"
|
|
|
ae23c9 |
@@ -1039,6 +1040,33 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
|
|
|
ae23c9 |
|
|
|
ae23c9 |
space = vfio_get_address_space(as);
|
|
|
ae23c9 |
|
|
|
ae23c9 |
+ /*
|
|
|
ae23c9 |
+ * VFIO is currently incompatible with memory ballooning insofar as the
|
|
|
ae23c9 |
+ * madvise to purge (zap) the page from QEMU's address space does not
|
|
|
ae23c9 |
+ * interact with the memory API and therefore leaves stale virtual to
|
|
|
ae23c9 |
+ * physical mappings in the IOMMU if the page was previously pinned. We
|
|
|
ae23c9 |
+ * therefore add a balloon inhibit for each group added to a container,
|
|
|
ae23c9 |
+ * whether the container is used individually or shared. This provides
|
|
|
ae23c9 |
+ * us with options to allow devices within a group to opt-in and allow
|
|
|
ae23c9 |
+ * ballooning, so long as it is done consistently for a group (for instance
|
|
|
ae23c9 |
+ * if the device is an mdev device where it is known that the host vendor
|
|
|
ae23c9 |
+ * driver will never pin pages outside of the working set of the guest
|
|
|
ae23c9 |
+ * driver, which would thus not be ballooning candidates).
|
|
|
ae23c9 |
+ *
|
|
|
ae23c9 |
+ * The first opportunity to induce pinning occurs here where we attempt to
|
|
|
ae23c9 |
+ * attach the group to existing containers within the AddressSpace. If any
|
|
|
ae23c9 |
+ * pages are already zapped from the virtual address space, such as from a
|
|
|
ae23c9 |
+ * previous ballooning opt-in, new pinning will cause valid mappings to be
|
|
|
ae23c9 |
+ * re-established. Likewise, when the overall MemoryListener for a new
|
|
|
ae23c9 |
+ * container is registered, a replay of mappings within the AddressSpace
|
|
|
ae23c9 |
+ * will occur, re-establishing any previously zapped pages as well.
|
|
|
ae23c9 |
+ *
|
|
|
ae23c9 |
+ * NB. Balloon inhibiting does not currently block operation of the
|
|
|
ae23c9 |
+ * balloon driver or revoke previously pinned pages, it only prevents
|
|
|
ae23c9 |
+ * calling madvise to modify the virtual mapping of ballooned pages.
|
|
|
ae23c9 |
+ */
|
|
|
ae23c9 |
+ qemu_balloon_inhibit(true);
|
|
|
ae23c9 |
+
|
|
|
ae23c9 |
QLIST_FOREACH(container, &space->containers, next) {
|
|
|
ae23c9 |
if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
|
|
|
ae23c9 |
group->container = container;
|
|
|
ae23c9 |
@@ -1227,6 +1255,7 @@ close_fd_exit:
|
|
|
ae23c9 |
close(fd);
|
|
|
ae23c9 |
|
|
|
ae23c9 |
put_space_exit:
|
|
|
ae23c9 |
+ qemu_balloon_inhibit(false);
|
|
|
ae23c9 |
vfio_put_address_space(space);
|
|
|
ae23c9 |
|
|
|
ae23c9 |
return ret;
|
|
|
ae23c9 |
@@ -1347,6 +1376,7 @@ void vfio_put_group(VFIOGroup *group)
|
|
|
ae23c9 |
return;
|
|
|
ae23c9 |
}
|
|
|
ae23c9 |
|
|
|
ae23c9 |
+ qemu_balloon_inhibit(false);
|
|
|
ae23c9 |
vfio_kvm_device_del_group(group);
|
|
|
ae23c9 |
vfio_disconnect_container(group);
|
|
|
ae23c9 |
QLIST_REMOVE(group, next);
|
|
|
ae23c9 |
--
|
|
|
ae23c9 |
1.8.3.1
|
|
|
ae23c9 |
|