|
|
09b1b5 |
From f3d3993ffd2caeef594351702f42e86218f2a2fb Mon Sep 17 00:00:00 2001
|
|
|
09b1b5 |
From: Max Reitz <mreitz@redhat.com>
|
|
|
09b1b5 |
Date: Tue, 18 Nov 2014 15:30:20 +0100
|
|
|
09b1b5 |
Subject: [PATCH 7/7] raw-posix: The SEEK_HOLE code is flawed, rewrite it
|
|
|
09b1b5 |
|
|
|
09b1b5 |
Message-id: <1416324620-16229-8-git-send-email-mreitz@redhat.com>
|
|
|
09b1b5 |
Patchwork-id: 62442
|
|
|
09b1b5 |
O-Subject: [RHEL-7.1/7.0.z qemu-kvm PATCH v3 7/7] raw-posix: The SEEK_HOLE code is flawed, rewrite it
|
|
|
09b1b5 |
Bugzilla: 1166605
|
|
|
09b1b5 |
RH-Acked-by: Paolo Bonzini <pbonzini@redhat.com>
|
|
|
09b1b5 |
RH-Acked-by: Kevin Wolf <kwolf@redhat.com>
|
|
|
09b1b5 |
RH-Acked-by: Markus Armbruster <armbru@redhat.com>
|
|
|
09b1b5 |
|
|
|
09b1b5 |
From: Markus Armbruster <armbru@redhat.com>
|
|
|
09b1b5 |
|
|
|
09b1b5 |
On systems where SEEK_HOLE in a trailing hole seeks to EOF (Solaris,
|
|
|
09b1b5 |
but not Linux), try_seek_hole() reports trailing data instead.
|
|
|
09b1b5 |
|
|
|
09b1b5 |
Additionally, unlikely lseek() failures are treated badly:
|
|
|
09b1b5 |
|
|
|
09b1b5 |
* When SEEK_HOLE fails, try_seek_hole() reports trailing data. For
|
|
|
09b1b5 |
-ENXIO, there's in fact a trailing hole. Can happen only when
|
|
|
09b1b5 |
something truncated the file since we opened it.
|
|
|
09b1b5 |
|
|
|
09b1b5 |
* When SEEK_HOLE succeeds, SEEK_DATA fails, and SEEK_END succeeds,
|
|
|
09b1b5 |
then try_seek_hole() reports a trailing hole. This is okay only
|
|
|
09b1b5 |
when SEEK_DATA failed with -ENXIO (which means the non-trailing hole
|
|
|
09b1b5 |
found by SEEK_HOLE has since become trailing somehow). For other
|
|
|
09b1b5 |
failures (unlikely), it's wrong.
|
|
|
09b1b5 |
|
|
|
09b1b5 |
* When SEEK_HOLE succeeds, SEEK_DATA fails, SEEK_END fails (unlikely),
|
|
|
09b1b5 |
then try_seek_hole() reports bogus data [-1,start), which its caller
|
|
|
09b1b5 |
raw_co_get_block_status() turns into zero sectors of data. Could
|
|
|
09b1b5 |
theoretically lead to infinite loops in code that attempts to scan
|
|
|
09b1b5 |
data vs. hole forward.
|
|
|
09b1b5 |
|
|
|
09b1b5 |
Rewrite from scratch, with very careful comments.
|
|
|
09b1b5 |
|
|
|
09b1b5 |
Signed-off-by: Markus Armbruster <armbru@redhat.com>
|
|
|
09b1b5 |
Reviewed-by: Max Reitz <mreitz@redhat.com>
|
|
|
09b1b5 |
Reviewed-by: Eric Blake <eblake@redhat.com>
|
|
|
09b1b5 |
Signed-off-by: Max Reitz <mreitz@redhat.com>
|
|
|
09b1b5 |
(cherry picked from commit d1f06fe665acdd7aa7a46a5ef88172c3d7d3028e)
|
|
|
09b1b5 |
|
|
|
09b1b5 |
Signed-off-by: Max Reitz <mreitz@redhat.com>
|
|
|
09b1b5 |
Signed-off-by: Miroslav Rezanina <mrezanin@redhat.com>
|
|
|
09b1b5 |
---
|
|
|
09b1b5 |
block/raw-posix.c | 111 +++++++++++++++++++++++++++++++++++++++++-------------
|
|
|
09b1b5 |
1 file changed, 85 insertions(+), 26 deletions(-)
|
|
|
09b1b5 |
|
|
|
09b1b5 |
diff --git a/block/raw-posix.c b/block/raw-posix.c
|
|
|
09b1b5 |
index 3ddcd2d..f2244a9 100644
|
|
|
09b1b5 |
--- a/block/raw-posix.c
|
|
|
09b1b5 |
+++ b/block/raw-posix.c
|
|
|
09b1b5 |
@@ -1254,28 +1254,86 @@ static int raw_create(const char *filename, QEMUOptionParameter *options,
|
|
|
09b1b5 |
return result;
|
|
|
09b1b5 |
}
|
|
|
09b1b5 |
|
|
|
09b1b5 |
-static int try_seek_hole(BlockDriverState *bs, off_t start, off_t *data,
|
|
|
09b1b5 |
- off_t *hole)
|
|
|
09b1b5 |
+/*
|
|
|
09b1b5 |
+ * Find allocation range in @bs around offset @start.
|
|
|
09b1b5 |
+ * May change underlying file descriptor's file offset.
|
|
|
09b1b5 |
+ * If @start is not in a hole, store @start in @data, and the
|
|
|
09b1b5 |
+ * beginning of the next hole in @hole, and return 0.
|
|
|
09b1b5 |
+ * If @start is in a non-trailing hole, store @start in @hole and the
|
|
|
09b1b5 |
+ * beginning of the next non-hole in @data, and return 0.
|
|
|
09b1b5 |
+ * If @start is in a trailing hole or beyond EOF, return -ENXIO.
|
|
|
09b1b5 |
+ * If we can't find out, return a negative errno other than -ENXIO.
|
|
|
09b1b5 |
+ */
|
|
|
09b1b5 |
+static int find_allocation(BlockDriverState *bs, off_t start,
|
|
|
09b1b5 |
+ off_t *data, off_t *hole)
|
|
|
09b1b5 |
{
|
|
|
09b1b5 |
#if defined SEEK_HOLE && defined SEEK_DATA
|
|
|
09b1b5 |
BDRVRawState *s = bs->opaque;
|
|
|
09b1b5 |
+ off_t offs;
|
|
|
09b1b5 |
|
|
|
09b1b5 |
- *hole = lseek(s->fd, start, SEEK_HOLE);
|
|
|
09b1b5 |
- if (*hole == -1) {
|
|
|
09b1b5 |
- return -errno;
|
|
|
09b1b5 |
+ /*
|
|
|
09b1b5 |
+ * SEEK_DATA cases:
|
|
|
09b1b5 |
+ * D1. offs == start: start is in data
|
|
|
09b1b5 |
+ * D2. offs > start: start is in a hole, next data at offs
|
|
|
09b1b5 |
+ * D3. offs < 0, errno = ENXIO: either start is in a trailing hole
|
|
|
09b1b5 |
+ * or start is beyond EOF
|
|
|
09b1b5 |
+ * If the latter happens, the file has been truncated behind
|
|
|
09b1b5 |
+ * our back since we opened it. All bets are off then.
|
|
|
09b1b5 |
+ * Treating like a trailing hole is simplest.
|
|
|
09b1b5 |
+ * D4. offs < 0, errno != ENXIO: we learned nothing
|
|
|
09b1b5 |
+ */
|
|
|
09b1b5 |
+ offs = lseek(s->fd, start, SEEK_DATA);
|
|
|
09b1b5 |
+ if (offs < 0) {
|
|
|
09b1b5 |
+ return -errno; /* D3 or D4 */
|
|
|
09b1b5 |
+ }
|
|
|
09b1b5 |
+ assert(offs >= start);
|
|
|
09b1b5 |
+
|
|
|
09b1b5 |
+ if (offs > start) {
|
|
|
09b1b5 |
+ /* D2: in hole, next data at offs */
|
|
|
09b1b5 |
+ *hole = start;
|
|
|
09b1b5 |
+ *data = offs;
|
|
|
09b1b5 |
+ return 0;
|
|
|
09b1b5 |
}
|
|
|
09b1b5 |
|
|
|
09b1b5 |
- if (*hole > start) {
|
|
|
09b1b5 |
+ /* D1: in data, end not yet known */
|
|
|
09b1b5 |
+
|
|
|
09b1b5 |
+ /*
|
|
|
09b1b5 |
+ * SEEK_HOLE cases:
|
|
|
09b1b5 |
+ * H1. offs == start: start is in a hole
|
|
|
09b1b5 |
+ * If this happens here, a hole has been dug behind our back
|
|
|
09b1b5 |
+ * since the previous lseek().
|
|
|
09b1b5 |
+ * H2. offs > start: either start is in data, next hole at offs,
|
|
|
09b1b5 |
+ * or start is in trailing hole, EOF at offs
|
|
|
09b1b5 |
+ * Linux treats trailing holes like any other hole: offs ==
|
|
|
09b1b5 |
+ * start. Solaris seeks to EOF instead: offs > start (blech).
|
|
|
09b1b5 |
+ * If that happens here, a hole has been dug behind our back
|
|
|
09b1b5 |
+ * since the previous lseek().
|
|
|
09b1b5 |
+ * H3. offs < 0, errno = ENXIO: start is beyond EOF
|
|
|
09b1b5 |
+ * If this happens, the file has been truncated behind our
|
|
|
09b1b5 |
+ * back since we opened it. Treat it like a trailing hole.
|
|
|
09b1b5 |
+ * H4. offs < 0, errno != ENXIO: we learned nothing
|
|
|
09b1b5 |
+ * Pretend we know nothing at all, i.e. "forget" about D1.
|
|
|
09b1b5 |
+ */
|
|
|
09b1b5 |
+ offs = lseek(s->fd, start, SEEK_HOLE);
|
|
|
09b1b5 |
+ if (offs < 0) {
|
|
|
09b1b5 |
+ return -errno; /* D1 and (H3 or H4) */
|
|
|
09b1b5 |
+ }
|
|
|
09b1b5 |
+ assert(offs >= start);
|
|
|
09b1b5 |
+
|
|
|
09b1b5 |
+ if (offs > start) {
|
|
|
09b1b5 |
+ /*
|
|
|
09b1b5 |
+ * D1 and H2: either in data, next hole at offs, or it was in
|
|
|
09b1b5 |
+ * data but is now in a trailing hole. In the latter case,
|
|
|
09b1b5 |
+ * all bets are off. Treating it as if it there was data all
|
|
|
09b1b5 |
+ * the way to EOF is safe, so simply do that.
|
|
|
09b1b5 |
+ */
|
|
|
09b1b5 |
*data = start;
|
|
|
09b1b5 |
- } else {
|
|
|
09b1b5 |
- /* On a hole. We need another syscall to find its end. */
|
|
|
09b1b5 |
- *data = lseek(s->fd, start, SEEK_DATA);
|
|
|
09b1b5 |
- if (*data == -1) {
|
|
|
09b1b5 |
- *data = lseek(s->fd, 0, SEEK_END);
|
|
|
09b1b5 |
- }
|
|
|
09b1b5 |
+ *hole = offs;
|
|
|
09b1b5 |
+ return 0;
|
|
|
09b1b5 |
}
|
|
|
09b1b5 |
|
|
|
09b1b5 |
- return 0;
|
|
|
09b1b5 |
+ /* D1 and H1 */
|
|
|
09b1b5 |
+ return -EBUSY;
|
|
|
09b1b5 |
#else
|
|
|
09b1b5 |
return -ENOTSUP;
|
|
|
09b1b5 |
#endif
|
|
|
09b1b5 |
@@ -1320,25 +1378,26 @@ static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
|
|
|
09b1b5 |
nb_sectors = DIV_ROUND_UP(total_size - start, BDRV_SECTOR_SIZE);
|
|
|
09b1b5 |
}
|
|
|
09b1b5 |
|
|
|
09b1b5 |
- ret = try_seek_hole(bs, start, &data, &hole);
|
|
|
09b1b5 |
- if (ret < 0) {
|
|
|
09b1b5 |
- /* Assume everything is allocated. */
|
|
|
09b1b5 |
- data = 0;
|
|
|
09b1b5 |
- hole = start + nb_sectors * BDRV_SECTOR_SIZE;
|
|
|
09b1b5 |
- ret = 0;
|
|
|
09b1b5 |
- }
|
|
|
09b1b5 |
-
|
|
|
09b1b5 |
- assert(ret >= 0);
|
|
|
09b1b5 |
-
|
|
|
09b1b5 |
- if (data <= start) {
|
|
|
09b1b5 |
+ ret = find_allocation(bs, start, &data, &hole);
|
|
|
09b1b5 |
+ if (ret == -ENXIO) {
|
|
|
09b1b5 |
+ /* Trailing hole */
|
|
|
09b1b5 |
+ *pnum = nb_sectors;
|
|
|
09b1b5 |
+ ret = BDRV_BLOCK_ZERO;
|
|
|
09b1b5 |
+ } else if (ret < 0) {
|
|
|
09b1b5 |
+ /* No info available, so pretend there are no holes */
|
|
|
09b1b5 |
+ *pnum = nb_sectors;
|
|
|
09b1b5 |
+ ret = BDRV_BLOCK_DATA;
|
|
|
09b1b5 |
+ } else if (data == start) {
|
|
|
09b1b5 |
/* On a data extent, compute sectors to the end of the extent. */
|
|
|
09b1b5 |
*pnum = MIN(nb_sectors, (hole - start) / BDRV_SECTOR_SIZE);
|
|
|
09b1b5 |
- return ret | BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start;
|
|
|
09b1b5 |
+ ret = BDRV_BLOCK_DATA;
|
|
|
09b1b5 |
} else {
|
|
|
09b1b5 |
/* On a hole, compute sectors to the beginning of the next extent. */
|
|
|
09b1b5 |
+ assert(hole == start);
|
|
|
09b1b5 |
*pnum = MIN(nb_sectors, (data - start) / BDRV_SECTOR_SIZE);
|
|
|
09b1b5 |
- return ret | BDRV_BLOCK_ZERO | BDRV_BLOCK_OFFSET_VALID | start;
|
|
|
09b1b5 |
+ ret = BDRV_BLOCK_ZERO;
|
|
|
09b1b5 |
}
|
|
|
09b1b5 |
+ return ret | BDRV_BLOCK_OFFSET_VALID | start;
|
|
|
09b1b5 |
}
|
|
|
09b1b5 |
|
|
|
09b1b5 |
static coroutine_fn BlockDriverAIOCB *raw_aio_discard(BlockDriverState *bs,
|
|
|
09b1b5 |
--
|
|
|
09b1b5 |
1.8.3.1
|
|
|
09b1b5 |
|