ed5979
From e13fdc97ff05cdee46c112c2dee70b6ef33e7fa7 Mon Sep 17 00:00:00 2001
ed5979
From: Emanuele Giuseppe Esposito <eesposit@redhat.com>
ed5979
Date: Mon, 16 Jan 2023 07:17:31 -0500
ed5979
Subject: [PATCH 31/31] kvm: Atomic memslot updates
ed5979
ed5979
RH-Author: Emanuele Giuseppe Esposito <eesposit@redhat.com>
ed5979
RH-MergeRequest: 138: accel: introduce accelerator blocker API
ed5979
RH-Bugzilla: 1979276
ed5979
RH-Acked-by: Paolo Bonzini <pbonzini@redhat.com>
ed5979
RH-Acked-by: David Hildenbrand <david@redhat.com>
ed5979
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
ed5979
RH-Commit: [3/3] 9f03181ebcad2474fbe859acbce7b9891caa216b (eesposit/qemu-kvm)
ed5979
ed5979
Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1979276
ed5979
ed5979
commit f39b7d2b96e3e73c01bb678cd096f7baf0b9ab39
ed5979
Author: David Hildenbrand <david@redhat.com>
ed5979
Date:   Fri Nov 11 10:47:58 2022 -0500
ed5979
ed5979
    kvm: Atomic memslot updates
ed5979
ed5979
    If we update an existing memslot (e.g., resize, split), we temporarily
ed5979
    remove the memslot to re-add it immediately afterwards. These updates
ed5979
    are not atomic, especially not for KVM VCPU threads, such that we can
ed5979
    get spurious faults.
ed5979
ed5979
    Let's inhibit most KVM ioctls while performing relevant updates, such
ed5979
    that we can perform the update just as if it would happen atomically
ed5979
    without additional kernel support.
ed5979
ed5979
    We capture the add/del changes and apply them in the notifier commit
ed5979
    stage instead. There, we can check for overlaps and perform the ioctl
ed5979
    inhibiting only if really required (-> overlap).
ed5979
ed5979
    To keep things simple we don't perform additional checks that wouldn't
ed5979
    actually result in an overlap -- such as !RAM memory regions in some
ed5979
    cases (see kvm_set_phys_mem()).
ed5979
ed5979
    To minimize cache-line bouncing, use a separate indicator
ed5979
    (in_ioctl_lock) per CPU.  Also, make sure to hold the kvm_slots_lock
ed5979
    while performing both actions (removing+re-adding).
ed5979
ed5979
    We have to wait until all IOCTLs were exited and block new ones from
ed5979
    getting executed.
ed5979
ed5979
    This approach cannot result in a deadlock as long as the inhibitor does
ed5979
    not hold any locks that might hinder an IOCTL from getting finished and
ed5979
    exited - something fairly unusual. The inhibitor will always hold the BQL.
ed5979
ed5979
    AFAIKs, one possible candidate would be userfaultfd. If a page cannot be
ed5979
    placed (e.g., during postcopy), because we're waiting for a lock, or if the
ed5979
    userfaultfd thread cannot process a fault, because it is waiting for a
ed5979
    lock, there could be a deadlock. However, the BQL is not applicable here,
ed5979
    because any other guest memory access while holding the BQL would already
ed5979
    result in a deadlock.
ed5979
ed5979
    Nothing else in the kernel should block forever and wait for userspace
ed5979
    intervention.
ed5979
ed5979
    Note: pause_all_vcpus()/resume_all_vcpus() or
ed5979
    start_exclusive()/end_exclusive() cannot be used, as they either drop
ed5979
    the BQL or require to be called without the BQL - something inhibitors
ed5979
    cannot handle. We need a low-level locking mechanism that is
ed5979
    deadlock-free even when not releasing the BQL.
ed5979
ed5979
    Signed-off-by: David Hildenbrand <david@redhat.com>
ed5979
    Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
ed5979
    Tested-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
ed5979
    Message-Id: <20221111154758.1372674-4-eesposit@redhat.com>
ed5979
    Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
ed5979
ed5979
Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
ed5979
---
ed5979
 accel/kvm/kvm-all.c      | 101 ++++++++++++++++++++++++++++++++++-----
ed5979
 include/sysemu/kvm_int.h |   8 ++++
ed5979
 2 files changed, 98 insertions(+), 11 deletions(-)
ed5979
ed5979
diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
ed5979
index ff660fd469..39ed30ab59 100644
ed5979
--- a/accel/kvm/kvm-all.c
ed5979
+++ b/accel/kvm/kvm-all.c
ed5979
@@ -31,6 +31,7 @@
ed5979
 #include "sysemu/kvm_int.h"
ed5979
 #include "sysemu/runstate.h"
ed5979
 #include "sysemu/cpus.h"
ed5979
+#include "sysemu/accel-blocker.h"
ed5979
 #include "qemu/bswap.h"
ed5979
 #include "exec/memory.h"
ed5979
 #include "exec/ram_addr.h"
ed5979
@@ -46,6 +47,7 @@
ed5979
 #include "sysemu/hw_accel.h"
ed5979
 #include "kvm-cpus.h"
ed5979
 #include "sysemu/dirtylimit.h"
ed5979
+#include "qemu/range.h"
ed5979
 
ed5979
 #include "hw/boards.h"
ed5979
 #include "monitor/stats.h"
ed5979
@@ -1292,6 +1294,7 @@ void kvm_set_max_memslot_size(hwaddr max_slot_size)
ed5979
     kvm_max_slot_size = max_slot_size;
ed5979
 }
ed5979
 
ed5979
+/* Called with KVMMemoryListener.slots_lock held */
ed5979
 static void kvm_set_phys_mem(KVMMemoryListener *kml,
ed5979
                              MemoryRegionSection *section, bool add)
ed5979
 {
ed5979
@@ -1326,14 +1329,12 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml,
ed5979
     ram = memory_region_get_ram_ptr(mr) + mr_offset;
ed5979
     ram_start_offset = memory_region_get_ram_addr(mr) + mr_offset;
ed5979
 
ed5979
-    kvm_slots_lock();
ed5979
-
ed5979
     if (!add) {
ed5979
         do {
ed5979
             slot_size = MIN(kvm_max_slot_size, size);
ed5979
             mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
ed5979
             if (!mem) {
ed5979
-                goto out;
ed5979
+                return;
ed5979
             }
ed5979
             if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
ed5979
                 /*
ed5979
@@ -1371,7 +1372,7 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml,
ed5979
             start_addr += slot_size;
ed5979
             size -= slot_size;
ed5979
         } while (size);
ed5979
-        goto out;
ed5979
+        return;
ed5979
     }
ed5979
 
ed5979
     /* register the new slot */
ed5979
@@ -1396,9 +1397,6 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml,
ed5979
         ram += slot_size;
ed5979
         size -= slot_size;
ed5979
     } while (size);
ed5979
-
ed5979
-out:
ed5979
-    kvm_slots_unlock();
ed5979
 }
ed5979
 
ed5979
 static void *kvm_dirty_ring_reaper_thread(void *data)
ed5979
@@ -1455,18 +1453,95 @@ static void kvm_region_add(MemoryListener *listener,
ed5979
                            MemoryRegionSection *section)
ed5979
 {
ed5979
     KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
ed5979
+    KVMMemoryUpdate *update;
ed5979
+
ed5979
+    update = g_new0(KVMMemoryUpdate, 1);
ed5979
+    update->section = *section;
ed5979
 
ed5979
-    memory_region_ref(section->mr);
ed5979
-    kvm_set_phys_mem(kml, section, true);
ed5979
+    QSIMPLEQ_INSERT_TAIL(&kml->transaction_add, update, next);
ed5979
 }
ed5979
 
ed5979
 static void kvm_region_del(MemoryListener *listener,
ed5979
                            MemoryRegionSection *section)
ed5979
 {
ed5979
     KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
ed5979
+    KVMMemoryUpdate *update;
ed5979
+
ed5979
+    update = g_new0(KVMMemoryUpdate, 1);
ed5979
+    update->section = *section;
ed5979
+
ed5979
+    QSIMPLEQ_INSERT_TAIL(&kml->transaction_del, update, next);
ed5979
+}
ed5979
+
ed5979
+static void kvm_region_commit(MemoryListener *listener)
ed5979
+{
ed5979
+    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener,
ed5979
+                                          listener);
ed5979
+    KVMMemoryUpdate *u1, *u2;
ed5979
+    bool need_inhibit = false;
ed5979
+
ed5979
+    if (QSIMPLEQ_EMPTY(&kml->transaction_add) &&
ed5979
+        QSIMPLEQ_EMPTY(&kml->transaction_del)) {
ed5979
+        return;
ed5979
+    }
ed5979
+
ed5979
+    /*
ed5979
+     * We have to be careful when regions to add overlap with ranges to remove.
ed5979
+     * We have to simulate atomic KVM memslot updates by making sure no ioctl()
ed5979
+     * is currently active.
ed5979
+     *
ed5979
+     * The lists are order by addresses, so it's easy to find overlaps.
ed5979
+     */
ed5979
+    u1 = QSIMPLEQ_FIRST(&kml->transaction_del);
ed5979
+    u2 = QSIMPLEQ_FIRST(&kml->transaction_add);
ed5979
+    while (u1 && u2) {
ed5979
+        Range r1, r2;
ed5979
+
ed5979
+        range_init_nofail(&r1, u1->section.offset_within_address_space,
ed5979
+                          int128_get64(u1->section.size));
ed5979
+        range_init_nofail(&r2, u2->section.offset_within_address_space,
ed5979
+                          int128_get64(u2->section.size));
ed5979
+
ed5979
+        if (range_overlaps_range(&r1, &r2)) {
ed5979
+            need_inhibit = true;
ed5979
+            break;
ed5979
+        }
ed5979
+        if (range_lob(&r1) < range_lob(&r2)) {
ed5979
+            u1 = QSIMPLEQ_NEXT(u1, next);
ed5979
+        } else {
ed5979
+            u2 = QSIMPLEQ_NEXT(u2, next);
ed5979
+        }
ed5979
+    }
ed5979
+
ed5979
+    kvm_slots_lock();
ed5979
+    if (need_inhibit) {
ed5979
+        accel_ioctl_inhibit_begin();
ed5979
+    }
ed5979
+
ed5979
+    /* Remove all memslots before adding the new ones. */
ed5979
+    while (!QSIMPLEQ_EMPTY(&kml->transaction_del)) {
ed5979
+        u1 = QSIMPLEQ_FIRST(&kml->transaction_del);
ed5979
+        QSIMPLEQ_REMOVE_HEAD(&kml->transaction_del, next);
ed5979
 
ed5979
-    kvm_set_phys_mem(kml, section, false);
ed5979
-    memory_region_unref(section->mr);
ed5979
+        kvm_set_phys_mem(kml, &u1->section, false);
ed5979
+        memory_region_unref(u1->section.mr);
ed5979
+
ed5979
+        g_free(u1);
ed5979
+    }
ed5979
+    while (!QSIMPLEQ_EMPTY(&kml->transaction_add)) {
ed5979
+        u1 = QSIMPLEQ_FIRST(&kml->transaction_add);
ed5979
+        QSIMPLEQ_REMOVE_HEAD(&kml->transaction_add, next);
ed5979
+
ed5979
+        memory_region_ref(u1->section.mr);
ed5979
+        kvm_set_phys_mem(kml, &u1->section, true);
ed5979
+
ed5979
+        g_free(u1);
ed5979
+    }
ed5979
+
ed5979
+    if (need_inhibit) {
ed5979
+        accel_ioctl_inhibit_end();
ed5979
+    }
ed5979
+    kvm_slots_unlock();
ed5979
 }
ed5979
 
ed5979
 static void kvm_log_sync(MemoryListener *listener,
ed5979
@@ -1610,8 +1685,12 @@ void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
ed5979
         kml->slots[i].slot = i;
ed5979
     }
ed5979
 
ed5979
+    QSIMPLEQ_INIT(&kml->transaction_add);
ed5979
+    QSIMPLEQ_INIT(&kml->transaction_del);
ed5979
+
ed5979
     kml->listener.region_add = kvm_region_add;
ed5979
     kml->listener.region_del = kvm_region_del;
ed5979
+    kml->listener.commit = kvm_region_commit;
ed5979
     kml->listener.log_start = kvm_log_start;
ed5979
     kml->listener.log_stop = kvm_log_stop;
ed5979
     kml->listener.priority = 10;
ed5979
diff --git a/include/sysemu/kvm_int.h b/include/sysemu/kvm_int.h
ed5979
index 3b4adcdc10..60b520a13e 100644
ed5979
--- a/include/sysemu/kvm_int.h
ed5979
+++ b/include/sysemu/kvm_int.h
ed5979
@@ -12,6 +12,7 @@
ed5979
 #include "exec/memory.h"
ed5979
 #include "qapi/qapi-types-common.h"
ed5979
 #include "qemu/accel.h"
ed5979
+#include "qemu/queue.h"
ed5979
 #include "sysemu/kvm.h"
ed5979
 
ed5979
 typedef struct KVMSlot
ed5979
@@ -31,10 +32,17 @@ typedef struct KVMSlot
ed5979
     ram_addr_t ram_start_offset;
ed5979
 } KVMSlot;
ed5979
 
ed5979
+typedef struct KVMMemoryUpdate {
ed5979
+    QSIMPLEQ_ENTRY(KVMMemoryUpdate) next;
ed5979
+    MemoryRegionSection section;
ed5979
+} KVMMemoryUpdate;
ed5979
+
ed5979
 typedef struct KVMMemoryListener {
ed5979
     MemoryListener listener;
ed5979
     KVMSlot *slots;
ed5979
     int as_id;
ed5979
+    QSIMPLEQ_HEAD(, KVMMemoryUpdate) transaction_add;
ed5979
+    QSIMPLEQ_HEAD(, KVMMemoryUpdate) transaction_del;
ed5979
 } KVMMemoryListener;
ed5979
 
ed5979
 #define KVM_MSI_HASHTAB_SIZE    256
ed5979
-- 
ed5979
2.31.1
ed5979