|
|
9ae3a8 |
From adf32529b8b95ed360356f13bd9a7f2a4b707412 Mon Sep 17 00:00:00 2001
|
|
|
9ae3a8 |
Message-Id: <adf32529b8b95ed360356f13bd9a7f2a4b707412.1389014116.git.minovotn@redhat.com>
|
|
|
9ae3a8 |
In-Reply-To: <c8cc35838d42aa286242772d97e3a9be7bb786ba.1389014116.git.minovotn@redhat.com>
|
|
|
9ae3a8 |
References: <c8cc35838d42aa286242772d97e3a9be7bb786ba.1389014116.git.minovotn@redhat.com>
|
|
|
9ae3a8 |
From: Paolo Bonzini <pbonzini@redhat.com>
|
|
|
9ae3a8 |
Date: Mon, 9 Dec 2013 14:09:29 +0100
|
|
|
9ae3a8 |
Subject: [PATCH 41/50] raw-posix: add support for write_zeroes on XFS and
|
|
|
9ae3a8 |
block devices
|
|
|
9ae3a8 |
|
|
|
9ae3a8 |
RH-Author: Paolo Bonzini <pbonzini@redhat.com>
|
|
|
9ae3a8 |
Message-id: <1386598178-11845-44-git-send-email-pbonzini@redhat.com>
|
|
|
9ae3a8 |
Patchwork-id: 56080
|
|
|
9ae3a8 |
O-Subject: [RHEL 7.0 qemu-kvm PATCH 43/52] raw-posix: add support for write_zeroes on XFS and block devices
|
|
|
9ae3a8 |
Bugzilla: 1007815
|
|
|
9ae3a8 |
RH-Acked-by: Jeffrey Cody <jcody@redhat.com>
|
|
|
9ae3a8 |
RH-Acked-by: Fam Zheng <famz@redhat.com>
|
|
|
9ae3a8 |
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
|
|
|
9ae3a8 |
|
|
|
9ae3a8 |
The code is similar to the implementation of discard and write_zeroes
|
|
|
9ae3a8 |
with UNMAP. However, failure must be propagated up to block.c.
|
|
|
9ae3a8 |
|
|
|
9ae3a8 |
The stale page cache problem (seen when not using O_DIRECT) can be reproduced as follows:
|
|
|
9ae3a8 |
|
|
|
9ae3a8 |
# modprobe scsi-debug lbpws=1 lbprz=1
|
|
|
9ae3a8 |
# ./qemu-io /dev/sdXX
|
|
|
9ae3a8 |
qemu-io> write -P 0xcc 0 2M
|
|
|
9ae3a8 |
qemu-io> write -z 0 1M
|
|
|
9ae3a8 |
qemu-io> read -P 0x00 0 512
|
|
|
9ae3a8 |
Pattern verification failed at offset 0, 512 bytes
|
|
|
9ae3a8 |
qemu-io> read -v 0 512
|
|
|
9ae3a8 |
00000000: cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc ................
|
|
|
9ae3a8 |
...
|
|
|
9ae3a8 |
|
|
|
9ae3a8 |
# ./qemu-io --cache=none /dev/sdXX
|
|
|
9ae3a8 |
qemu-io> write -P 0xcc 0 2M
|
|
|
9ae3a8 |
qemu-io> write -z 0 1M
|
|
|
9ae3a8 |
qemu-io> read -P 0x00 0 512
|
|
|
9ae3a8 |
qemu-io> read -v 0 512
|
|
|
9ae3a8 |
00000000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
|
|
|
9ae3a8 |
...
|
|
|
9ae3a8 |
|
|
|
9ae3a8 |
And similarly with discard instead of "write -z".
|
|
|
9ae3a8 |
|
|
|
9ae3a8 |
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
|
|
|
9ae3a8 |
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
|
|
9ae3a8 |
(cherry picked from commit 97a2ae34537882df34810d538ab1f51085499d2c)
|
|
|
9ae3a8 |
---
|
|
|
9ae3a8 |
block/raw-aio.h | 3 +-
|
|
|
9ae3a8 |
block/raw-posix.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++--------
|
|
|
9ae3a8 |
2 files changed, 74 insertions(+), 13 deletions(-)
|
|
|
9ae3a8 |
|
|
|
9ae3a8 |
Signed-off-by: Michal Novotny <minovotn@redhat.com>
|
|
|
9ae3a8 |
---
|
|
|
9ae3a8 |
block/raw-aio.h | 3 +-
|
|
|
9ae3a8 |
block/raw-posix.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++--------
|
|
|
9ae3a8 |
2 files changed, 74 insertions(+), 13 deletions(-)
|
|
|
9ae3a8 |
|
|
|
9ae3a8 |
diff --git a/block/raw-aio.h b/block/raw-aio.h
|
|
|
9ae3a8 |
index c61f159..7ad0a8a 100644
|
|
|
9ae3a8 |
--- a/block/raw-aio.h
|
|
|
9ae3a8 |
+++ b/block/raw-aio.h
|
|
|
9ae3a8 |
@@ -21,9 +21,10 @@
|
|
|
9ae3a8 |
#define QEMU_AIO_IOCTL 0x0004
|
|
|
9ae3a8 |
#define QEMU_AIO_FLUSH 0x0008
|
|
|
9ae3a8 |
#define QEMU_AIO_DISCARD 0x0010
|
|
|
9ae3a8 |
+#define QEMU_AIO_WRITE_ZEROES 0x0020
|
|
|
9ae3a8 |
#define QEMU_AIO_TYPE_MASK \
|
|
|
9ae3a8 |
(QEMU_AIO_READ|QEMU_AIO_WRITE|QEMU_AIO_IOCTL|QEMU_AIO_FLUSH| \
|
|
|
9ae3a8 |
- QEMU_AIO_DISCARD)
|
|
|
9ae3a8 |
+ QEMU_AIO_DISCARD|QEMU_AIO_WRITE_ZEROES)
|
|
|
9ae3a8 |
|
|
|
9ae3a8 |
/* AIO flags */
|
|
|
9ae3a8 |
#define QEMU_AIO_MISALIGNED 0x1000
|
|
|
9ae3a8 |
diff --git a/block/raw-posix.c b/block/raw-posix.c
|
|
|
9ae3a8 |
index 815a80b..f410668 100644
|
|
|
9ae3a8 |
--- a/block/raw-posix.c
|
|
|
9ae3a8 |
+++ b/block/raw-posix.c
|
|
|
9ae3a8 |
@@ -142,6 +142,7 @@ typedef struct BDRVRawState {
|
|
|
9ae3a8 |
bool is_xfs:1;
|
|
|
9ae3a8 |
#endif
|
|
|
9ae3a8 |
bool has_discard:1;
|
|
|
9ae3a8 |
+ bool has_write_zeroes:1;
|
|
|
9ae3a8 |
bool discard_zeroes:1;
|
|
|
9ae3a8 |
} BDRVRawState;
|
|
|
9ae3a8 |
|
|
|
9ae3a8 |
@@ -327,6 +328,7 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
|
|
|
9ae3a8 |
#endif
|
|
|
9ae3a8 |
|
|
|
9ae3a8 |
s->has_discard = true;
|
|
|
9ae3a8 |
+ s->has_write_zeroes = true;
|
|
|
9ae3a8 |
|
|
|
9ae3a8 |
if (fstat(s->fd, &st) < 0) {
|
|
|
9ae3a8 |
error_setg_errno(errp, errno, "Could not stat file");
|
|
|
9ae3a8 |
@@ -345,9 +347,11 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
|
|
|
9ae3a8 |
#ifdef __linux__
|
|
|
9ae3a8 |
/* On Linux 3.10, BLKDISCARD leaves stale data in the page cache. Do
|
|
|
9ae3a8 |
* not rely on the contents of discarded blocks unless using O_DIRECT.
|
|
|
9ae3a8 |
+ * Same for BLKZEROOUT.
|
|
|
9ae3a8 |
*/
|
|
|
9ae3a8 |
if (!(bs->open_flags & BDRV_O_NOCACHE)) {
|
|
|
9ae3a8 |
s->discard_zeroes = false;
|
|
|
9ae3a8 |
+ s->has_write_zeroes = false;
|
|
|
9ae3a8 |
}
|
|
|
9ae3a8 |
#endif
|
|
|
9ae3a8 |
}
|
|
|
9ae3a8 |
@@ -703,6 +707,23 @@ static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb)
|
|
|
9ae3a8 |
}
|
|
|
9ae3a8 |
|
|
|
9ae3a8 |
#ifdef CONFIG_XFS
|
|
|
9ae3a8 |
+static int xfs_write_zeroes(BDRVRawState *s, int64_t offset, uint64_t bytes)
|
|
|
9ae3a8 |
+{
|
|
|
9ae3a8 |
+ struct xfs_flock64 fl;
|
|
|
9ae3a8 |
+
|
|
|
9ae3a8 |
+ memset(&fl, 0, sizeof(fl));
|
|
|
9ae3a8 |
+ fl.l_whence = SEEK_SET;
|
|
|
9ae3a8 |
+ fl.l_start = offset;
|
|
|
9ae3a8 |
+ fl.l_len = bytes;
|
|
|
9ae3a8 |
+
|
|
|
9ae3a8 |
+ if (xfsctl(NULL, s->fd, XFS_IOC_ZERO_RANGE, &fl) < 0) {
|
|
|
9ae3a8 |
+ DEBUG_BLOCK_PRINT("cannot write zero range (%s)\n", strerror(errno));
|
|
|
9ae3a8 |
+ return -errno;
|
|
|
9ae3a8 |
+ }
|
|
|
9ae3a8 |
+
|
|
|
9ae3a8 |
+ return 0;
|
|
|
9ae3a8 |
+}
|
|
|
9ae3a8 |
+
|
|
|
9ae3a8 |
static int xfs_discard(BDRVRawState *s, int64_t offset, uint64_t bytes)
|
|
|
9ae3a8 |
{
|
|
|
9ae3a8 |
struct xfs_flock64 fl;
|
|
|
9ae3a8 |
@@ -721,6 +742,42 @@ static int xfs_discard(BDRVRawState *s, int64_t offset, uint64_t bytes)
|
|
|
9ae3a8 |
}
|
|
|
9ae3a8 |
#endif
|
|
|
9ae3a8 |
|
|
|
9ae3a8 |
+static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb)
|
|
|
9ae3a8 |
+{
|
|
|
9ae3a8 |
+ int ret = -EOPNOTSUPP;
|
|
|
9ae3a8 |
+ BDRVRawState *s = aiocb->bs->opaque;
|
|
|
9ae3a8 |
+
|
|
|
9ae3a8 |
+ if (s->has_write_zeroes == 0) {
|
|
|
9ae3a8 |
+ return -ENOTSUP;
|
|
|
9ae3a8 |
+ }
|
|
|
9ae3a8 |
+
|
|
|
9ae3a8 |
+ if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
|
|
|
9ae3a8 |
+#ifdef BLKZEROOUT
|
|
|
9ae3a8 |
+ do {
|
|
|
9ae3a8 |
+ uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
|
|
|
9ae3a8 |
+ if (ioctl(aiocb->aio_fildes, BLKZEROOUT, range) == 0) {
|
|
|
9ae3a8 |
+ return 0;
|
|
|
9ae3a8 |
+ }
|
|
|
9ae3a8 |
+ } while (errno == EINTR);
|
|
|
9ae3a8 |
+
|
|
|
9ae3a8 |
+ ret = -errno;
|
|
|
9ae3a8 |
+#endif
|
|
|
9ae3a8 |
+ } else {
|
|
|
9ae3a8 |
+#ifdef CONFIG_XFS
|
|
|
9ae3a8 |
+ if (s->is_xfs) {
|
|
|
9ae3a8 |
+ return xfs_write_zeroes(s, aiocb->aio_offset, aiocb->aio_nbytes);
|
|
|
9ae3a8 |
+ }
|
|
|
9ae3a8 |
+#endif
|
|
|
9ae3a8 |
+ }
|
|
|
9ae3a8 |
+
|
|
|
9ae3a8 |
+ if (ret == -ENODEV || ret == -ENOSYS || ret == -EOPNOTSUPP ||
|
|
|
9ae3a8 |
+ ret == -ENOTTY) {
|
|
|
9ae3a8 |
+ s->has_write_zeroes = false;
|
|
|
9ae3a8 |
+ ret = -ENOTSUP;
|
|
|
9ae3a8 |
+ }
|
|
|
9ae3a8 |
+ return ret;
|
|
|
9ae3a8 |
+}
|
|
|
9ae3a8 |
+
|
|
|
9ae3a8 |
static ssize_t handle_aiocb_discard(RawPosixAIOData *aiocb)
|
|
|
9ae3a8 |
{
|
|
|
9ae3a8 |
int ret = -EOPNOTSUPP;
|
|
|
9ae3a8 |
@@ -805,6 +862,9 @@ static int aio_worker(void *arg)
|
|
|
9ae3a8 |
case QEMU_AIO_DISCARD:
|
|
|
9ae3a8 |
ret = handle_aiocb_discard(aiocb);
|
|
|
9ae3a8 |
break;
|
|
|
9ae3a8 |
+ case QEMU_AIO_WRITE_ZEROES:
|
|
|
9ae3a8 |
+ ret = handle_aiocb_write_zeroes(aiocb);
|
|
|
9ae3a8 |
+ break;
|
|
|
9ae3a8 |
default:
|
|
|
9ae3a8 |
fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type);
|
|
|
9ae3a8 |
ret = -EINVAL;
|
|
|
9ae3a8 |
@@ -1257,13 +1317,13 @@ static int coroutine_fn raw_co_write_zeroes(
|
|
|
9ae3a8 |
BDRVRawState *s = bs->opaque;
|
|
|
9ae3a8 |
|
|
|
9ae3a8 |
if (!(flags & BDRV_REQ_MAY_UNMAP)) {
|
|
|
9ae3a8 |
- return -ENOTSUP;
|
|
|
9ae3a8 |
- }
|
|
|
9ae3a8 |
- if (!s->discard_zeroes) {
|
|
|
9ae3a8 |
- return -ENOTSUP;
|
|
|
9ae3a8 |
+ return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors,
|
|
|
9ae3a8 |
+ QEMU_AIO_WRITE_ZEROES);
|
|
|
9ae3a8 |
+ } else if (s->discard_zeroes) {
|
|
|
9ae3a8 |
+ return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors,
|
|
|
9ae3a8 |
+ QEMU_AIO_DISCARD);
|
|
|
9ae3a8 |
}
|
|
|
9ae3a8 |
- return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors,
|
|
|
9ae3a8 |
- QEMU_AIO_DISCARD);
|
|
|
9ae3a8 |
+ return -ENOTSUP;
|
|
|
9ae3a8 |
}
|
|
|
9ae3a8 |
|
|
|
9ae3a8 |
static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
|
|
|
9ae3a8 |
@@ -1613,13 +1673,13 @@ static coroutine_fn int hdev_co_write_zeroes(BlockDriverState *bs,
|
|
|
9ae3a8 |
return rc;
|
|
|
9ae3a8 |
}
|
|
|
9ae3a8 |
if (!(flags & BDRV_REQ_MAY_UNMAP)) {
|
|
|
9ae3a8 |
- return -ENOTSUP;
|
|
|
9ae3a8 |
- }
|
|
|
9ae3a8 |
- if (!s->discard_zeroes) {
|
|
|
9ae3a8 |
- return -ENOTSUP;
|
|
|
9ae3a8 |
+ return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors,
|
|
|
9ae3a8 |
+ QEMU_AIO_WRITE_ZEROES|QEMU_AIO_BLKDEV);
|
|
|
9ae3a8 |
+ } else if (s->discard_zeroes) {
|
|
|
9ae3a8 |
+ return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors,
|
|
|
9ae3a8 |
+ QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV);
|
|
|
9ae3a8 |
}
|
|
|
9ae3a8 |
- return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors,
|
|
|
9ae3a8 |
- QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV);
|
|
|
9ae3a8 |
+ return -ENOTSUP;
|
|
|
9ae3a8 |
}
|
|
|
9ae3a8 |
|
|
|
9ae3a8 |
static int hdev_create(const char *filename, QEMUOptionParameter *options,
|
|
|
9ae3a8 |
--
|
|
|
9ae3a8 |
1.7.11.7
|
|
|
9ae3a8 |
|