yeahuh / rpms / qemu-kvm

Forked from rpms/qemu-kvm 2 years ago
Clone

Blame SOURCES/kvm-block-Allow-graph-changes-in-bdrv_drain_all_begin-en.patch

ae23c9
From 7946b28c7e68c564b0734869ff6f4e36bd5e2e2d Mon Sep 17 00:00:00 2001
ae23c9
From: Kevin Wolf <kwolf@redhat.com>
ae23c9
Date: Wed, 10 Oct 2018 20:21:48 +0100
ae23c9
Subject: [PATCH 22/49] block: Allow graph changes in bdrv_drain_all_begin/end
ae23c9
 sections
ae23c9
ae23c9
RH-Author: Kevin Wolf <kwolf@redhat.com>
ae23c9
Message-id: <20181010202213.7372-10-kwolf@redhat.com>
ae23c9
Patchwork-id: 82597
ae23c9
O-Subject: [RHEL-8 qemu-kvm PATCH 19/44] block: Allow graph changes in bdrv_drain_all_begin/end sections
ae23c9
Bugzilla: 1637976
ae23c9
RH-Acked-by: Max Reitz <mreitz@redhat.com>
ae23c9
RH-Acked-by: John Snow <jsnow@redhat.com>
ae23c9
RH-Acked-by: Thomas Huth <thuth@redhat.com>
ae23c9
ae23c9
bdrv_drain_all_*() used bdrv_next() to iterate over all root nodes and
ae23c9
did a subtree drain for each of them. This works fine as long as the
ae23c9
graph is static, but sadly, reality looks different.
ae23c9
ae23c9
If the graph changes so that root nodes are added or removed, we would
ae23c9
have to compensate for this. bdrv_next() returns each root node only
ae23c9
once even if it's the root node for multiple BlockBackends or for a
ae23c9
monitor-owned block driver tree, which would only complicate things.
ae23c9
ae23c9
The much easier and more obviously correct way is to fundamentally
ae23c9
change the way the functions work: Iterate over all BlockDriverStates,
ae23c9
no matter who owns them, and drain them individually. Compensation is
ae23c9
only necessary when a new BDS is created inside a drain_all section.
ae23c9
Removal of a BDS doesn't require any action because it's gone afterwards
ae23c9
anyway.
ae23c9
ae23c9
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
ae23c9
(cherry picked from commit 0f12264e7a41458179ad10276a7c33c72024861a)
ae23c9
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
ae23c9
Signed-off-by: Danilo C. L. de Paula <ddepaula@redhat.com>
ae23c9
---
ae23c9
 block.c                   | 34 ++++++++++++++++++++++++---
ae23c9
 block/io.c                | 60 ++++++++++++++++++++++++++++++++++++-----------
ae23c9
 include/block/block.h     |  1 +
ae23c9
 include/block/block_int.h |  1 +
ae23c9
 4 files changed, 79 insertions(+), 17 deletions(-)
ae23c9
ae23c9
diff --git a/block.c b/block.c
ae23c9
index 519be6e..eea9c6f 100644
ae23c9
--- a/block.c
ae23c9
+++ b/block.c
ae23c9
@@ -333,6 +333,10 @@ BlockDriverState *bdrv_new(void)
ae23c9
 
ae23c9
     qemu_co_queue_init(&bs->flush_queue);
ae23c9
 
ae23c9
+    for (i = 0; i < bdrv_drain_all_count; i++) {
ae23c9
+        bdrv_drained_begin(bs);
ae23c9
+    }
ae23c9
+
ae23c9
     QTAILQ_INSERT_TAIL(&all_bdrv_states, bs, bs_list);
ae23c9
 
ae23c9
     return bs;
ae23c9
@@ -1170,7 +1174,7 @@ static int bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv,
ae23c9
                             int open_flags, Error **errp)
ae23c9
 {
ae23c9
     Error *local_err = NULL;
ae23c9
-    int ret;
ae23c9
+    int i, ret;
ae23c9
 
ae23c9
     bdrv_assign_node_name(bs, node_name, &local_err);
ae23c9
     if (local_err) {
ae23c9
@@ -1218,6 +1222,12 @@ static int bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv,
ae23c9
     assert(bdrv_min_mem_align(bs) != 0);
ae23c9
     assert(is_power_of_2(bs->bl.request_alignment));
ae23c9
 
ae23c9
+    for (i = 0; i < bs->quiesce_counter; i++) {
ae23c9
+        if (drv->bdrv_co_drain_begin) {
ae23c9
+            drv->bdrv_co_drain_begin(bs);
ae23c9
+        }
ae23c9
+    }
ae23c9
+
ae23c9
     return 0;
ae23c9
 open_failed:
ae23c9
     bs->drv = NULL;
ae23c9
@@ -2039,7 +2049,12 @@ static void bdrv_replace_child_noperm(BdrvChild *child,
ae23c9
             child->role->detach(child);
ae23c9
         }
ae23c9
         if (old_bs->quiesce_counter && child->role->drained_end) {
ae23c9
-            for (i = 0; i < old_bs->quiesce_counter; i++) {
ae23c9
+            int num = old_bs->quiesce_counter;
ae23c9
+            if (child->role->parent_is_bds) {
ae23c9
+                num -= bdrv_drain_all_count;
ae23c9
+            }
ae23c9
+            assert(num >= 0);
ae23c9
+            for (i = 0; i < num; i++) {
ae23c9
                 child->role->drained_end(child);
ae23c9
             }
ae23c9
         }
ae23c9
@@ -2051,7 +2066,12 @@ static void bdrv_replace_child_noperm(BdrvChild *child,
ae23c9
     if (new_bs) {
ae23c9
         QLIST_INSERT_HEAD(&new_bs->parents, child, next_parent);
ae23c9
         if (new_bs->quiesce_counter && child->role->drained_begin) {
ae23c9
-            for (i = 0; i < new_bs->quiesce_counter; i++) {
ae23c9
+            int num = new_bs->quiesce_counter;
ae23c9
+            if (child->role->parent_is_bds) {
ae23c9
+                num -= bdrv_drain_all_count;
ae23c9
+            }
ae23c9
+            assert(num >= 0);
ae23c9
+            for (i = 0; i < num; i++) {
ae23c9
                 child->role->drained_begin(child);
ae23c9
             }
ae23c9
         }
ae23c9
@@ -3993,6 +4013,14 @@ BlockDriverState *bdrv_next_node(BlockDriverState *bs)
ae23c9
     return QTAILQ_NEXT(bs, node_list);
ae23c9
 }
ae23c9
 
ae23c9
+BlockDriverState *bdrv_next_all_states(BlockDriverState *bs)
ae23c9
+{
ae23c9
+    if (!bs) {
ae23c9
+        return QTAILQ_FIRST(&all_bdrv_states);
ae23c9
+    }
ae23c9
+    return QTAILQ_NEXT(bs, bs_list);
ae23c9
+}
ae23c9
+
ae23c9
 const char *bdrv_get_node_name(const BlockDriverState *bs)
ae23c9
 {
ae23c9
     return bs->node_name;
ae23c9
diff --git a/block/io.c b/block/io.c
ae23c9
index 0021fefd..38ae299 100644
ae23c9
--- a/block/io.c
ae23c9
+++ b/block/io.c
ae23c9
@@ -38,6 +38,8 @@
ae23c9
 /* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */
ae23c9
 #define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)
ae23c9
 
ae23c9
+static AioWait drain_all_aio_wait;
ae23c9
+
ae23c9
 static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
ae23c9
     int64_t offset, int bytes, BdrvRequestFlags flags);
ae23c9
 
ae23c9
@@ -471,6 +473,29 @@ static void bdrv_drain_assert_idle(BlockDriverState *bs)
ae23c9
     }
ae23c9
 }
ae23c9
 
ae23c9
+unsigned int bdrv_drain_all_count = 0;
ae23c9
+
ae23c9
+static bool bdrv_drain_all_poll(void)
ae23c9
+{
ae23c9
+    BlockDriverState *bs = NULL;
ae23c9
+    bool result = false;
ae23c9
+
ae23c9
+    /* Execute pending BHs first (may modify the graph) and check everything
ae23c9
+     * else only after the BHs have executed. */
ae23c9
+    while (aio_poll(qemu_get_aio_context(), false));
ae23c9
+
ae23c9
+    /* bdrv_drain_poll() can't make changes to the graph and we are holding the
ae23c9
+     * main AioContext lock, so iterating bdrv_next_all_states() is safe. */
ae23c9
+    while ((bs = bdrv_next_all_states(bs))) {
ae23c9
+        AioContext *aio_context = bdrv_get_aio_context(bs);
ae23c9
+        aio_context_acquire(aio_context);
ae23c9
+        result |= bdrv_drain_poll(bs, false, NULL, true);
ae23c9
+        aio_context_release(aio_context);
ae23c9
+    }
ae23c9
+
ae23c9
+    return result;
ae23c9
+}
ae23c9
+
ae23c9
 /*
ae23c9
  * Wait for pending requests to complete across all BlockDriverStates
ae23c9
  *
ae23c9
@@ -485,45 +510,51 @@ static void bdrv_drain_assert_idle(BlockDriverState *bs)
ae23c9
  */
ae23c9
 void bdrv_drain_all_begin(void)
ae23c9
 {
ae23c9
-    BlockDriverState *bs;
ae23c9
-    BdrvNextIterator it;
ae23c9
+    BlockDriverState *bs = NULL;
ae23c9
 
ae23c9
     if (qemu_in_coroutine()) {
ae23c9
-        bdrv_co_yield_to_drain(NULL, true, false, NULL, false, true);
ae23c9
+        bdrv_co_yield_to_drain(NULL, true, false, NULL, true, true);
ae23c9
         return;
ae23c9
     }
ae23c9
 
ae23c9
-    /* BDRV_POLL_WHILE() for a node can only be called from its own I/O thread
ae23c9
-     * or the main loop AioContext. We potentially use BDRV_POLL_WHILE() on
ae23c9
-     * nodes in several different AioContexts, so make sure we're in the main
ae23c9
-     * context. */
ae23c9
+    /* AIO_WAIT_WHILE() with a NULL context can only be called from the main
ae23c9
+     * loop AioContext, so make sure we're in the main context. */
ae23c9
     assert(qemu_get_current_aio_context() == qemu_get_aio_context());
ae23c9
+    assert(bdrv_drain_all_count < INT_MAX);
ae23c9
+    bdrv_drain_all_count++;
ae23c9
 
ae23c9
-    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
ae23c9
+    /* Quiesce all nodes, without polling in-flight requests yet. The graph
ae23c9
+     * cannot change during this loop. */
ae23c9
+    while ((bs = bdrv_next_all_states(bs))) {
ae23c9
         AioContext *aio_context = bdrv_get_aio_context(bs);
ae23c9
 
ae23c9
         aio_context_acquire(aio_context);
ae23c9
-        bdrv_do_drained_begin(bs, true, NULL, false, true);
ae23c9
+        bdrv_do_drained_begin(bs, false, NULL, true, false);
ae23c9
         aio_context_release(aio_context);
ae23c9
     }
ae23c9
 
ae23c9
-    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
ae23c9
+    /* Now poll the in-flight requests */
ae23c9
+    AIO_WAIT_WHILE(&drain_all_aio_wait, NULL, bdrv_drain_all_poll());
ae23c9
+
ae23c9
+    while ((bs = bdrv_next_all_states(bs))) {
ae23c9
         bdrv_drain_assert_idle(bs);
ae23c9
     }
ae23c9
 }
ae23c9
 
ae23c9
 void bdrv_drain_all_end(void)
ae23c9
 {
ae23c9
-    BlockDriverState *bs;
ae23c9
-    BdrvNextIterator it;
ae23c9
+    BlockDriverState *bs = NULL;
ae23c9
 
ae23c9
-    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
ae23c9
+    while ((bs = bdrv_next_all_states(bs))) {
ae23c9
         AioContext *aio_context = bdrv_get_aio_context(bs);
ae23c9
 
ae23c9
         aio_context_acquire(aio_context);
ae23c9
-        bdrv_do_drained_end(bs, true, NULL, false);
ae23c9
+        bdrv_do_drained_end(bs, false, NULL, true);
ae23c9
         aio_context_release(aio_context);
ae23c9
     }
ae23c9
+
ae23c9
+    assert(bdrv_drain_all_count > 0);
ae23c9
+    bdrv_drain_all_count--;
ae23c9
 }
ae23c9
 
ae23c9
 void bdrv_drain_all(void)
ae23c9
@@ -658,6 +689,7 @@ void bdrv_inc_in_flight(BlockDriverState *bs)
ae23c9
 void bdrv_wakeup(BlockDriverState *bs)
ae23c9
 {
ae23c9
     aio_wait_kick(bdrv_get_aio_wait(bs));
ae23c9
+    aio_wait_kick(&drain_all_aio_wait);
ae23c9
 }
ae23c9
 
ae23c9
 void bdrv_dec_in_flight(BlockDriverState *bs)
ae23c9
diff --git a/include/block/block.h b/include/block/block.h
ae23c9
index 6e91803..f9079ac 100644
ae23c9
--- a/include/block/block.h
ae23c9
+++ b/include/block/block.h
ae23c9
@@ -449,6 +449,7 @@ BlockDriverState *bdrv_lookup_bs(const char *device,
ae23c9
                                  Error **errp);
ae23c9
 bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base);
ae23c9
 BlockDriverState *bdrv_next_node(BlockDriverState *bs);
ae23c9
+BlockDriverState *bdrv_next_all_states(BlockDriverState *bs);
ae23c9
 
ae23c9
 typedef struct BdrvNextIterator {
ae23c9
     enum {
ae23c9
diff --git a/include/block/block_int.h b/include/block/block_int.h
ae23c9
index 0ad8a76..9757d5e 100644
ae23c9
--- a/include/block/block_int.h
ae23c9
+++ b/include/block/block_int.h
ae23c9
@@ -845,6 +845,7 @@ int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
ae23c9
     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
ae23c9
     BdrvRequestFlags flags);
ae23c9
 
ae23c9
+extern unsigned int bdrv_drain_all_count;
ae23c9
 void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent);
ae23c9
 void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent);
ae23c9
 
ae23c9
-- 
ae23c9
1.8.3.1
ae23c9