yeahuh / rpms / qemu-kvm

Forked from rpms/qemu-kvm 2 years ago
Clone

Blame SOURCES/kvm-raw-posix-The-SEEK_HOLE-code-is-flawed-rewrite-it.patch

9ae3a8
From 03b3f6befef3ab33a422d4dad9c2b3892e49b686 Mon Sep 17 00:00:00 2001
9ae3a8
From: Max Reitz <mreitz@redhat.com>
9ae3a8
Date: Tue, 18 Nov 2014 15:30:20 +0100
9ae3a8
Subject: [PATCH 41/41] raw-posix: The SEEK_HOLE code is flawed, rewrite it
9ae3a8
9ae3a8
Message-id: <1416324620-16229-8-git-send-email-mreitz@redhat.com>
9ae3a8
Patchwork-id: 62442
9ae3a8
O-Subject: [RHEL-7.1/7.0.z qemu-kvm PATCH v3 7/7] raw-posix: The SEEK_HOLE code is flawed, rewrite it
9ae3a8
Bugzilla: 1160237
9ae3a8
RH-Acked-by: Paolo Bonzini <pbonzini@redhat.com>
9ae3a8
RH-Acked-by: Kevin Wolf <kwolf@redhat.com>
9ae3a8
RH-Acked-by: Markus Armbruster <armbru@redhat.com>
9ae3a8
9ae3a8
From: Markus Armbruster <armbru@redhat.com>
9ae3a8
9ae3a8
On systems where SEEK_HOLE in a trailing hole seeks to EOF (Solaris,
9ae3a8
but not Linux), try_seek_hole() reports trailing data instead.
9ae3a8
9ae3a8
Additionally, unlikely lseek() failures are treated badly:
9ae3a8
9ae3a8
* When SEEK_HOLE fails, try_seek_hole() reports trailing data.  For
9ae3a8
  -ENXIO, there's in fact a trailing hole.  Can happen only when
9ae3a8
  something truncated the file since we opened it.
9ae3a8
9ae3a8
* When SEEK_HOLE succeeds, SEEK_DATA fails, and SEEK_END succeeds,
9ae3a8
  then try_seek_hole() reports a trailing hole.  This is okay only
9ae3a8
  when SEEK_DATA failed with -ENXIO (which means the non-trailing hole
9ae3a8
  found by SEEK_HOLE has since become trailing somehow).  For other
9ae3a8
  failures (unlikely), it's wrong.
9ae3a8
9ae3a8
* When SEEK_HOLE succeeds, SEEK_DATA fails, SEEK_END fails (unlikely),
9ae3a8
  then try_seek_hole() reports bogus data [-1,start), which its caller
9ae3a8
  raw_co_get_block_status() turns into zero sectors of data.  Could
9ae3a8
  theoretically lead to infinite loops in code that attempts to scan
9ae3a8
  data vs. hole forward.
9ae3a8
9ae3a8
Rewrite from scratch, with very careful comments.
9ae3a8
9ae3a8
Signed-off-by: Markus Armbruster <armbru@redhat.com>
9ae3a8
Reviewed-by: Max Reitz <mreitz@redhat.com>
9ae3a8
Reviewed-by: Eric Blake <eblake@redhat.com>
9ae3a8
Signed-off-by: Max Reitz <mreitz@redhat.com>
9ae3a8
(cherry picked from commit d1f06fe665acdd7aa7a46a5ef88172c3d7d3028e)
9ae3a8
9ae3a8
Signed-off-by: Max Reitz <mreitz@redhat.com>
9ae3a8
Signed-off-by: Miroslav Rezanina <mrezanin@redhat.com>
9ae3a8
---
9ae3a8
 block/raw-posix.c | 111 +++++++++++++++++++++++++++++++++++++++++-------------
9ae3a8
 1 file changed, 85 insertions(+), 26 deletions(-)
9ae3a8
9ae3a8
diff --git a/block/raw-posix.c b/block/raw-posix.c
9ae3a8
index aeb8a97..6a50856 100644
9ae3a8
--- a/block/raw-posix.c
9ae3a8
+++ b/block/raw-posix.c
9ae3a8
@@ -1302,28 +1302,86 @@ out:
9ae3a8
     return result;
9ae3a8
 }
9ae3a8
 
9ae3a8
-static int try_seek_hole(BlockDriverState *bs, off_t start, off_t *data,
9ae3a8
-                         off_t *hole)
9ae3a8
+/*
9ae3a8
+ * Find allocation range in @bs around offset @start.
9ae3a8
+ * May change underlying file descriptor's file offset.
9ae3a8
+ * If @start is not in a hole, store @start in @data, and the
9ae3a8
+ * beginning of the next hole in @hole, and return 0.
9ae3a8
+ * If @start is in a non-trailing hole, store @start in @hole and the
9ae3a8
+ * beginning of the next non-hole in @data, and return 0.
9ae3a8
+ * If @start is in a trailing hole or beyond EOF, return -ENXIO.
9ae3a8
+ * If we can't find out, return a negative errno other than -ENXIO.
9ae3a8
+ */
9ae3a8
+static int find_allocation(BlockDriverState *bs, off_t start,
9ae3a8
+                           off_t *data, off_t *hole)
9ae3a8
 {
9ae3a8
 #if defined SEEK_HOLE && defined SEEK_DATA
9ae3a8
     BDRVRawState *s = bs->opaque;
9ae3a8
+    off_t offs;
9ae3a8
 
9ae3a8
-    *hole = lseek(s->fd, start, SEEK_HOLE);
9ae3a8
-    if (*hole == -1) {
9ae3a8
-        return -errno;
9ae3a8
+    /*
9ae3a8
+     * SEEK_DATA cases:
9ae3a8
+     * D1. offs == start: start is in data
9ae3a8
+     * D2. offs > start: start is in a hole, next data at offs
9ae3a8
+     * D3. offs < 0, errno = ENXIO: either start is in a trailing hole
9ae3a8
+     *                              or start is beyond EOF
9ae3a8
+     *     If the latter happens, the file has been truncated behind
9ae3a8
+     *     our back since we opened it.  All bets are off then.
9ae3a8
+     *     Treating like a trailing hole is simplest.
9ae3a8
+     * D4. offs < 0, errno != ENXIO: we learned nothing
9ae3a8
+     */
9ae3a8
+    offs = lseek(s->fd, start, SEEK_DATA);
9ae3a8
+    if (offs < 0) {
9ae3a8
+        return -errno;          /* D3 or D4 */
9ae3a8
+    }
9ae3a8
+    assert(offs >= start);
9ae3a8
+
9ae3a8
+    if (offs > start) {
9ae3a8
+        /* D2: in hole, next data at offs */
9ae3a8
+        *hole = start;
9ae3a8
+        *data = offs;
9ae3a8
+        return 0;
9ae3a8
     }
9ae3a8
 
9ae3a8
-    if (*hole > start) {
9ae3a8
+    /* D1: in data, end not yet known */
9ae3a8
+
9ae3a8
+    /*
9ae3a8
+     * SEEK_HOLE cases:
9ae3a8
+     * H1. offs == start: start is in a hole
9ae3a8
+     *     If this happens here, a hole has been dug behind our back
9ae3a8
+     *     since the previous lseek().
9ae3a8
+     * H2. offs > start: either start is in data, next hole at offs,
9ae3a8
+     *                   or start is in trailing hole, EOF at offs
9ae3a8
+     *     Linux treats trailing holes like any other hole: offs ==
9ae3a8
+     *     start.  Solaris seeks to EOF instead: offs > start (blech).
9ae3a8
+     *     If that happens here, a hole has been dug behind our back
9ae3a8
+     *     since the previous lseek().
9ae3a8
+     * H3. offs < 0, errno = ENXIO: start is beyond EOF
9ae3a8
+     *     If this happens, the file has been truncated behind our
9ae3a8
+     *     back since we opened it.  Treat it like a trailing hole.
9ae3a8
+     * H4. offs < 0, errno != ENXIO: we learned nothing
9ae3a8
+     *     Pretend we know nothing at all, i.e. "forget" about D1.
9ae3a8
+     */
9ae3a8
+    offs = lseek(s->fd, start, SEEK_HOLE);
9ae3a8
+    if (offs < 0) {
9ae3a8
+        return -errno;          /* D1 and (H3 or H4) */
9ae3a8
+    }
9ae3a8
+    assert(offs >= start);
9ae3a8
+
9ae3a8
+    if (offs > start) {
9ae3a8
+        /*
9ae3a8
+         * D1 and H2: either in data, next hole at offs, or it was in
9ae3a8
+         * data but is now in a trailing hole.  In the latter case,
9ae3a8
+         * all bets are off.  Treating it as if there was data all
9ae3a8
+         * the way to EOF is safe, so simply do that.
9ae3a8
+         */
9ae3a8
         *data = start;
9ae3a8
-    } else {
9ae3a8
-        /* On a hole.  We need another syscall to find its end.  */
9ae3a8
-        *data = lseek(s->fd, start, SEEK_DATA);
9ae3a8
-        if (*data == -1) {
9ae3a8
-            *data = lseek(s->fd, 0, SEEK_END);
9ae3a8
-        }
9ae3a8
+        *hole = offs;
9ae3a8
+        return 0;
9ae3a8
     }
9ae3a8
 
9ae3a8
-    return 0;
9ae3a8
+    /* D1 and H1 */
9ae3a8
+    return -EBUSY;
9ae3a8
 #else
9ae3a8
     return -ENOTSUP;
9ae3a8
 #endif
9ae3a8
@@ -1368,25 +1426,26 @@ static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
9ae3a8
         nb_sectors = DIV_ROUND_UP(total_size - start, BDRV_SECTOR_SIZE);
9ae3a8
     }
9ae3a8
 
9ae3a8
-    ret = try_seek_hole(bs, start, &data, &hole);
9ae3a8
-    if (ret < 0) {
9ae3a8
-        /* Assume everything is allocated. */
9ae3a8
-        data = 0;
9ae3a8
-        hole = start + nb_sectors * BDRV_SECTOR_SIZE;
9ae3a8
-        ret = 0;
9ae3a8
-    }
9ae3a8
-
9ae3a8
-    assert(ret >= 0);
9ae3a8
-
9ae3a8
-    if (data <= start) {
9ae3a8
+    ret = find_allocation(bs, start, &data, &hole);
9ae3a8
+    if (ret == -ENXIO) {
9ae3a8
+        /* Trailing hole */
9ae3a8
+        *pnum = nb_sectors;
9ae3a8
+        ret = BDRV_BLOCK_ZERO;
9ae3a8
+    } else if (ret < 0) {
9ae3a8
+        /* No info available, so pretend there are no holes */
9ae3a8
+        *pnum = nb_sectors;
9ae3a8
+        ret = BDRV_BLOCK_DATA;
9ae3a8
+    } else if (data == start) {
9ae3a8
         /* On a data extent, compute sectors to the end of the extent.  */
9ae3a8
         *pnum = MIN(nb_sectors, (hole - start) / BDRV_SECTOR_SIZE);
9ae3a8
-        return ret | BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start;
9ae3a8
+        ret = BDRV_BLOCK_DATA;
9ae3a8
     } else {
9ae3a8
         /* On a hole, compute sectors to the beginning of the next extent.  */
9ae3a8
+        assert(hole == start);
9ae3a8
         *pnum = MIN(nb_sectors, (data - start) / BDRV_SECTOR_SIZE);
9ae3a8
-        return ret | BDRV_BLOCK_ZERO | BDRV_BLOCK_OFFSET_VALID | start;
9ae3a8
+        ret = BDRV_BLOCK_ZERO;
9ae3a8
     }
9ae3a8
+    return ret | BDRV_BLOCK_OFFSET_VALID | start;
9ae3a8
 }
9ae3a8
 
9ae3a8
 static coroutine_fn BlockDriverAIOCB *raw_aio_discard(BlockDriverState *bs,
9ae3a8
-- 
9ae3a8
1.8.3.1
9ae3a8