1072c8
From 82a02aec3a8b3c2ac925d0b71ea4c35aa5d6463b Mon Sep 17 00:00:00 2001
1072c8
From: Andrew Jones <drjones@redhat.com>
1072c8
Date: Wed, 4 Aug 2021 03:27:24 -0400
1072c8
Subject: [PATCH 2/2] async: use explicit memory barriers
1072c8
1072c8
RH-Author: Andrew Jones <drjones@redhat.com>
1072c8
Message-id: <20210729134448.4995-3-drjones@redhat.com>
1072c8
Patchwork-id: 101937
1072c8
O-Subject: [RHEL-8.5.0 qemu-kvm PATCH v2 2/2] async: use explicit memory barriers
1072c8
Bugzilla: 1969848
1072c8
RH-Acked-by: Gavin Shan <gshan@redhat.com>
1072c8
RH-Acked-by: Auger Eric <eric.auger@redhat.com>
1072c8
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
1072c8
1072c8
From: Paolo Bonzini <pbonzini@redhat.com>
1072c8
1072c8
When using C11 atomics, non-seqcst reads and writes do not participate
1072c8
in the total order of seqcst operations.  In util/async.c and util/aio-posix.c,
1072c8
in particular, the pattern that we use
1072c8
1072c8
          write ctx->notify_me                 write bh->scheduled
1072c8
          read bh->scheduled                   read ctx->notify_me
1072c8
          if !bh->scheduled, sleep             if ctx->notify_me, notify
1072c8
1072c8
needs to use seqcst operations for both the write and the read.  In
1072c8
general this is something that we do not want, because there can be
1072c8
many sources that are polled in addition to bottom halves.  The
1072c8
alternative is to place a seqcst memory barrier between the write
1072c8
and the read.  This also comes with a disadvantage, in that the
1072c8
memory barrier is implicit on strongly-ordered architectures and
1072c8
it wastes a few dozen clock cycles.
1072c8
1072c8
Fortunately, ctx->notify_me is never written concurrently by two
1072c8
threads, so we can assert that and relax the writes to ctx->notify_me.
1072c8
The resulting solution works and performs well on both aarch64 and x86.
1072c8
1072c8
Note that the atomic_set/atomic_read combination is not an atomic
1072c8
read-modify-write, and therefore it is even weaker than C11 ATOMIC_RELAXED;
1072c8
on x86, ATOMIC_RELAXED compiles to a locked operation.
1072c8
1072c8
Analyzed-by: Ying Fang <fangying1@huawei.com>
1072c8
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
1072c8
Tested-by: Ying Fang <fangying1@huawei.com>
1072c8
Message-Id: <20200407140746.8041-6-pbonzini@redhat.com>
1072c8
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
1072c8
(cherry picked from commit 5710a3e09f9b85801e5ce70797a4a511e5fc9e2c)
1072c8
Signed-off-by: Andrew Jones <drjones@redhat.com>
1072c8
Signed-off-by: Miroslav Rezanina <mrezanin@redhat.com>
1072c8
---
1072c8
 util/aio-posix.c | 16 ++++++++++++++--
1072c8
 util/aio-win32.c | 17 ++++++++++++++---
1072c8
 util/async.c     | 16 ++++++++++++----
1072c8
 3 files changed, 40 insertions(+), 9 deletions(-)
1072c8
1072c8
diff --git a/util/aio-posix.c b/util/aio-posix.c
1072c8
index abc396d030..8cfb25650d 100644
1072c8
--- a/util/aio-posix.c
1072c8
+++ b/util/aio-posix.c
1072c8
@@ -624,6 +624,11 @@ bool aio_poll(AioContext *ctx, bool blocking)
1072c8
     int64_t timeout;
1072c8
     int64_t start = 0;
1072c8
 
1072c8
+    /*
1072c8
+     * There cannot be two concurrent aio_poll calls for the same AioContext (or
1072c8
+     * an aio_poll concurrent with a GSource prepare/check/dispatch callback).
1072c8
+     * We rely on this below to avoid slow locked accesses to ctx->notify_me.
1072c8
+     */
1072c8
     assert(in_aio_context_home_thread(ctx));
1072c8
 
1072c8
     /* aio_notify can avoid the expensive event_notifier_set if
1072c8
@@ -634,7 +639,13 @@ bool aio_poll(AioContext *ctx, bool blocking)
1072c8
      * so disable the optimization now.
1072c8
      */
1072c8
     if (blocking) {
1072c8
-        atomic_add(&ctx->notify_me, 2);
1072c8
+        atomic_set(&ctx->notify_me, atomic_read(&ctx->notify_me) + 2);
1072c8
+        /*
1072c8
+         * Write ctx->notify_me before computing the timeout
1072c8
+         * (reading bottom half flags, etc.).  Pairs with
1072c8
+         * smp_mb in aio_notify().
1072c8
+         */
1072c8
+        smp_mb();
1072c8
     }
1072c8
 
1072c8
     qemu_lockcnt_inc(&ctx->list_lock);
1072c8
@@ -679,7 +690,8 @@ bool aio_poll(AioContext *ctx, bool blocking)
1072c8
     }
1072c8
 
1072c8
     if (blocking) {
1072c8
-        atomic_sub(&ctx->notify_me, 2);
1072c8
+        /* Finish the poll before clearing the flag.  */
1072c8
+        atomic_store_release(&ctx->notify_me, atomic_read(&ctx->notify_me) - 2);
1072c8
         aio_notify_accept(ctx);
1072c8
     }
1072c8
 
1072c8
diff --git a/util/aio-win32.c b/util/aio-win32.c
1072c8
index a23b9c364d..729d533faf 100644
1072c8
--- a/util/aio-win32.c
1072c8
+++ b/util/aio-win32.c
1072c8
@@ -321,6 +321,12 @@ bool aio_poll(AioContext *ctx, bool blocking)
1072c8
     int count;
1072c8
     int timeout;
1072c8
 
1072c8
+    /*
1072c8
+     * There cannot be two concurrent aio_poll calls for the same AioContext (or
1072c8
+     * an aio_poll concurrent with a GSource prepare/check/dispatch callback).
1072c8
+     * We rely on this below to avoid slow locked accesses to ctx->notify_me.
1072c8
+     */
1072c8
+    assert(in_aio_context_home_thread(ctx));
1072c8
     progress = false;
1072c8
 
1072c8
     /* aio_notify can avoid the expensive event_notifier_set if
1072c8
@@ -331,7 +337,13 @@ bool aio_poll(AioContext *ctx, bool blocking)
1072c8
      * so disable the optimization now.
1072c8
      */
1072c8
     if (blocking) {
1072c8
-        atomic_add(&ctx->notify_me, 2);
1072c8
+        atomic_set(&ctx->notify_me, atomic_read(&ctx->notify_me) + 2);
1072c8
+        /*
1072c8
+         * Write ctx->notify_me before computing the timeout
1072c8
+         * (reading bottom half flags, etc.).  Pairs with
1072c8
+         * smp_mb in aio_notify().
1072c8
+         */
1072c8
+        smp_mb();
1072c8
     }
1072c8
 
1072c8
     qemu_lockcnt_inc(&ctx->list_lock);
1072c8
@@ -364,8 +376,7 @@ bool aio_poll(AioContext *ctx, bool blocking)
1072c8
         ret = WaitForMultipleObjects(count, events, FALSE, timeout);
1072c8
         if (blocking) {
1072c8
             assert(first);
1072c8
-            assert(in_aio_context_home_thread(ctx));
1072c8
-            atomic_sub(&ctx->notify_me, 2);
1072c8
+            atomic_store_release(&ctx->notify_me, atomic_read(&ctx->notify_me) - 2);
1072c8
             aio_notify_accept(ctx);
1072c8
         }
1072c8
 
1072c8
diff --git a/util/async.c b/util/async.c
1072c8
index b1fa5319e5..c65c58bbc9 100644
1072c8
--- a/util/async.c
1072c8
+++ b/util/async.c
1072c8
@@ -220,7 +220,14 @@ aio_ctx_prepare(GSource *source, gint    *timeout)
1072c8
 {
1072c8
     AioContext *ctx = (AioContext *) source;
1072c8
 
1072c8
-    atomic_or(&ctx->notify_me, 1);
1072c8
+    atomic_set(&ctx->notify_me, atomic_read(&ctx->notify_me) | 1);
1072c8
+
1072c8
+    /*
1072c8
+     * Write ctx->notify_me before computing the timeout
1072c8
+     * (reading bottom half flags, etc.).  Pairs with
1072c8
+     * smp_mb in aio_notify().
1072c8
+     */
1072c8
+    smp_mb();
1072c8
 
1072c8
     /* We assume there is no timeout already supplied */
1072c8
     *timeout = qemu_timeout_ns_to_ms(aio_compute_timeout(ctx));
1072c8
@@ -238,7 +245,8 @@ aio_ctx_check(GSource *source)
1072c8
     AioContext *ctx = (AioContext *) source;
1072c8
     QEMUBH *bh;
1072c8
 
1072c8
-    atomic_and(&ctx->notify_me, ~1);
1072c8
+    /* Finish computing the timeout before clearing the flag.  */
1072c8
+    atomic_store_release(&ctx->notify_me, atomic_read(&ctx->notify_me) & ~1);
1072c8
     aio_notify_accept(ctx);
1072c8
 
1072c8
     for (bh = ctx->first_bh; bh; bh = bh->next) {
1072c8
@@ -343,10 +351,10 @@ LinuxAioState *aio_get_linux_aio(AioContext *ctx)
1072c8
 void aio_notify(AioContext *ctx)
1072c8
 {
1072c8
     /* Write e.g. bh->scheduled before reading ctx->notify_me.  Pairs
1072c8
-     * with atomic_or in aio_ctx_prepare or atomic_add in aio_poll.
1072c8
+     * with smp_mb in aio_ctx_prepare or aio_poll.
1072c8
      */
1072c8
     smp_mb();
1072c8
-    if (ctx->notify_me) {
1072c8
+    if (atomic_read(&ctx->notify_me)) {
1072c8
         event_notifier_set(&ctx->notifier);
1072c8
         atomic_mb_set(&ctx->notified, true);
1072c8
     }
1072c8
-- 
1072c8
2.27.0
1072c8