|
|
016a62 |
From 4438710f7aa42f55d189d1b6adb09b1c0471495e Mon Sep 17 00:00:00 2001
|
|
|
016a62 |
From: "plai@redhat.com" <plai@redhat.com>
|
|
|
016a62 |
Date: Tue, 20 Aug 2019 16:12:51 +0100
|
|
|
016a62 |
Subject: [PATCH 04/11] util/mmap-alloc: support MAP_SYNC in qemu_ram_mmap()
|
|
|
016a62 |
|
|
|
016a62 |
RH-Author: plai@redhat.com
|
|
|
016a62 |
Message-id: <1566317571-5697-5-git-send-email-plai@redhat.com>
|
|
|
016a62 |
Patchwork-id: 90085
|
|
|
016a62 |
O-Subject: [RHEL8.2 qemu-kvm PATCH 4/4] util/mmap-alloc: support MAP_SYNC in qemu_ram_mmap()
|
|
|
016a62 |
Bugzilla: 1539282
|
|
|
016a62 |
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
|
|
|
016a62 |
RH-Acked-by: Pankaj Gupta <pagupta@redhat.com>
|
|
|
016a62 |
RH-Acked-by: Eduardo Habkost <ehabkost@redhat.com>
|
|
|
016a62 |
|
|
|
016a62 |
From: Zhang Yi <yi.z.zhang@linux.intel.com>
|
|
|
016a62 |
|
|
|
016a62 |
When a file supporting DAX is used as vNVDIMM backend, mmap it with
|
|
|
016a62 |
MAP_SYNC flag in addition which can ensure file system metadata
|
|
|
016a62 |
is synced on each guest write to the backend file, without other QEMU
|
|
|
016a62 |
actions (e.g., periodic fsync() by QEMU).
|
|
|
016a62 |
|
|
|
016a62 |
Currently, we have the following possible use cases:
|
|
|
016a62 |
|
|
|
016a62 |
1. pmem=on is set, shared=on is set, MAP_SYNC supported:
|
|
|
016a62 |
a: backend is a dax supporting file.
|
|
|
016a62 |
- MAP_SYNC will be active.
|
|
|
016a62 |
b: backend is not a dax supporting file.
|
|
|
016a62 |
- mmap will trigger a warning, then the MAP_SYNC flag will be ignored.
|
|
|
016a62 |
|
|
|
016a62 |
2. The rest of cases:
|
|
|
016a62 |
- we will never pass the MAP_SYNC to mmap2
|
|
|
016a62 |
|
|
|
016a62 |
Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
|
|
|
016a62 |
Signed-off-by: Zhang Yi <yi.z.zhang@linux.intel.com>
|
|
|
016a62 |
[ehabkost: Rebased patch to latest code on master]
|
|
|
016a62 |
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
|
|
|
016a62 |
Signed-off-by: Wei Yang <richardw.yang@linux.intel.com>
|
|
|
016a62 |
Tested-by: Wei Yang <richardw.yang@linux.intel.com>
|
|
|
016a62 |
Message-Id: <20190422004849.26463-2-richardw.yang@linux.intel.com>
|
|
|
016a62 |
[ehabkost: squashed documentation patch]
|
|
|
016a62 |
Message-Id: <20190422004849.26463-3-richardw.yang@linux.intel.com>
|
|
|
016a62 |
[ehabkost: documentation fixup]
|
|
|
016a62 |
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
|
|
|
016a62 |
Reviewed-by: Pankaj Gupta <pagupta@redhat.com>
|
|
|
016a62 |
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
|
|
|
016a62 |
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
|
|
|
016a62 |
|
|
|
016a62 |
(cherry picked from commit 119906afa5ca610adb87c55ab0d8e53c9104bfc3)
|
|
|
016a62 |
Signed-off-by: Paul Lai <plai@redhat.com>
|
|
|
016a62 |
Signed-off-by: Danilo C. L. de Paula <ddepaula@redhat.com>
|
|
|
016a62 |
---
|
|
|
016a62 |
docs/nvdimm.txt | 22 +++++++++++++++++++---
|
|
|
016a62 |
qemu-options.hx | 5 +++++
|
|
|
016a62 |
util/mmap-alloc.c | 41 ++++++++++++++++++++++++++++++++++++++++-
|
|
|
016a62 |
3 files changed, 64 insertions(+), 4 deletions(-)
|
|
|
016a62 |
|
|
|
016a62 |
diff --git a/docs/nvdimm.txt b/docs/nvdimm.txt
|
|
|
016a62 |
index 5f158a6..33ce9aa 100644
|
|
|
016a62 |
--- a/docs/nvdimm.txt
|
|
|
016a62 |
+++ b/docs/nvdimm.txt
|
|
|
016a62 |
@@ -143,9 +143,25 @@ Guest Data Persistence
|
|
|
016a62 |
----------------------
|
|
|
016a62 |
|
|
|
016a62 |
Though QEMU supports multiple types of vNVDIMM backends on Linux,
|
|
|
016a62 |
-currently the only one that can guarantee the guest write persistence
|
|
|
016a62 |
-is the device DAX on the real NVDIMM device (e.g., /dev/dax0.0), to
|
|
|
016a62 |
-which all guest access do not involve any host-side kernel cache.
|
|
|
016a62 |
+the only backend that can guarantee the guest write persistence is:
|
|
|
016a62 |
+
|
|
|
016a62 |
+A. DAX device (e.g., /dev/dax0.0), or
|
|
|
016a62 |
+B. DAX file (mounted with dax option)
|
|
|
016a62 |
+
|
|
|
016a62 |
+When using B (a file supporting direct mapping of persistent memory)
|
|
|
016a62 |
+as a backend, write persistence is guaranteed if the host kernel has
|
|
|
016a62 |
+support for the MAP_SYNC flag in the mmap system call (available
|
|
|
016a62 |
+since Linux 4.15 and on certain distro kernels) and additionally
|
|
|
016a62 |
+both 'pmem' and 'share' flags are set to 'on' on the backend.
|
|
|
016a62 |
+
|
|
|
016a62 |
+If these conditions are not satisfied i.e. if either 'pmem' or 'share'
|
|
|
016a62 |
+is not set, if the backend file does not support DAX or if MAP_SYNC
|
|
|
016a62 |
+is not supported by the host kernel, write persistence is not
|
|
|
016a62 |
+guaranteed after a system crash. For compatibility reasons, these
|
|
|
016a62 |
+conditions are ignored if not satisfied. Currently, no way is
|
|
|
016a62 |
+provided to test for them.
|
|
|
016a62 |
+For more details, please refer to the mmap(2) man page:
|
|
|
016a62 |
+http://man7.org/linux/man-pages/man2/mmap.2.html.
|
|
|
016a62 |
|
|
|
016a62 |
When using other types of backends, it's suggested to set 'unarmed'
|
|
|
016a62 |
option of '-device nvdimm' to 'on', which sets the unarmed flag of the
|
|
|
016a62 |
diff --git a/qemu-options.hx b/qemu-options.hx
|
|
|
016a62 |
index 1b6786b..1243057 100644
|
|
|
016a62 |
--- a/qemu-options.hx
|
|
|
016a62 |
+++ b/qemu-options.hx
|
|
|
016a62 |
@@ -4057,6 +4057,11 @@ using the SNIA NVM programming model (e.g. Intel NVDIMM).
|
|
|
016a62 |
If @option{pmem} is set to 'on', QEMU will take necessary operations to
|
|
|
016a62 |
guarantee the persistence of its own writes to @option{mem-path}
|
|
|
016a62 |
(e.g. in vNVDIMM label emulation and live migration).
|
|
|
016a62 |
+Also, we will map the backend-file with MAP_SYNC flag, which ensures the
|
|
|
016a62 |
+file metadata is in sync for @option{mem-path} in case of host crash
|
|
|
016a62 |
+or a power failure. MAP_SYNC requires support from both the host kernel
|
|
|
016a62 |
+(since Linux kernel 4.15) and the filesystem of @option{mem-path} mounted
|
|
|
016a62 |
+with DAX option.
|
|
|
016a62 |
|
|
|
016a62 |
@item -object memory-backend-ram,id=@var{id},merge=@var{on|off},dump=@var{on|off},share=@var{on|off},prealloc=@var{on|off},size=@var{size},host-nodes=@var{host-nodes},policy=@var{default|preferred|bind|interleave}
|
|
|
016a62 |
|
|
|
016a62 |
diff --git a/util/mmap-alloc.c b/util/mmap-alloc.c
|
|
|
016a62 |
index bbd9077..4873984 100644
|
|
|
016a62 |
--- a/util/mmap-alloc.c
|
|
|
016a62 |
+++ b/util/mmap-alloc.c
|
|
|
016a62 |
@@ -10,6 +10,13 @@
|
|
|
016a62 |
* later. See the COPYING file in the top-level directory.
|
|
|
016a62 |
*/
|
|
|
016a62 |
|
|
|
016a62 |
+#ifdef CONFIG_LINUX
|
|
|
016a62 |
+#include <linux/mman.h>
|
|
|
016a62 |
+#else /* !CONFIG_LINUX */
|
|
|
016a62 |
+#define MAP_SYNC 0x0
|
|
|
016a62 |
+#define MAP_SHARED_VALIDATE 0x0
|
|
|
016a62 |
+#endif /* CONFIG_LINUX */
|
|
|
016a62 |
+
|
|
|
016a62 |
#include "qemu/osdep.h"
|
|
|
016a62 |
#include "qemu/mmap-alloc.h"
|
|
|
016a62 |
#include "qemu/host-utils.h"
|
|
|
016a62 |
@@ -80,6 +87,7 @@ void *qemu_ram_mmap(int fd,
|
|
|
016a62 |
bool is_pmem)
|
|
|
016a62 |
{
|
|
|
016a62 |
int flags;
|
|
|
016a62 |
+ int map_sync_flags = 0;
|
|
|
016a62 |
int guardfd;
|
|
|
016a62 |
size_t offset;
|
|
|
016a62 |
size_t pagesize;
|
|
|
016a62 |
@@ -130,9 +138,40 @@ void *qemu_ram_mmap(int fd,
|
|
|
016a62 |
flags = MAP_FIXED;
|
|
|
016a62 |
flags |= fd == -1 ? MAP_ANONYMOUS : 0;
|
|
|
016a62 |
flags |= shared ? MAP_SHARED : MAP_PRIVATE;
|
|
|
016a62 |
+ if (shared && is_pmem) {
|
|
|
016a62 |
+ map_sync_flags = MAP_SYNC | MAP_SHARED_VALIDATE;
|
|
|
016a62 |
+ }
|
|
|
016a62 |
+
|
|
|
016a62 |
offset = QEMU_ALIGN_UP((uintptr_t)guardptr, align) - (uintptr_t)guardptr;
|
|
|
016a62 |
|
|
|
016a62 |
- ptr = mmap(guardptr + offset, size, PROT_READ | PROT_WRITE, flags, fd, 0);
|
|
|
016a62 |
+ ptr = mmap(guardptr + offset, size, PROT_READ | PROT_WRITE,
|
|
|
016a62 |
+ flags | map_sync_flags, fd, 0);
|
|
|
016a62 |
+
|
|
|
016a62 |
+ if (ptr == MAP_FAILED && map_sync_flags) {
|
|
|
016a62 |
+ if (errno == ENOTSUP) {
|
|
|
016a62 |
+ char *proc_link, *file_name;
|
|
|
016a62 |
+ int len;
|
|
|
016a62 |
+ proc_link = g_strdup_printf("/proc/self/fd/%d", fd);
|
|
|
016a62 |
+ file_name = g_malloc0(PATH_MAX);
|
|
|
016a62 |
+ len = readlink(proc_link, file_name, PATH_MAX - 1);
|
|
|
016a62 |
+ if (len < 0) {
|
|
|
016a62 |
+ len = 0;
|
|
|
016a62 |
+ }
|
|
|
016a62 |
+ file_name[len] = '\0';
|
|
|
016a62 |
+ fprintf(stderr, "Warning: requesting persistence across crashes "
|
|
|
016a62 |
+ "for backend file %s failed. Proceeding without "
|
|
|
016a62 |
+ "persistence, data might become corrupted in case of host "
|
|
|
016a62 |
+ "crash.\n", file_name);
|
|
|
016a62 |
+ g_free(proc_link);
|
|
|
016a62 |
+ g_free(file_name);
|
|
|
016a62 |
+ }
|
|
|
016a62 |
+ /*
|
|
|
016a62 |
+ * if map failed with MAP_SHARED_VALIDATE | MAP_SYNC,
|
|
|
016a62 |
+ * we will remove these flags to handle compatibility.
|
|
|
016a62 |
+ */
|
|
|
016a62 |
+ ptr = mmap(guardptr + offset, size, PROT_READ | PROT_WRITE,
|
|
|
016a62 |
+ flags, fd, 0);
|
|
|
016a62 |
+ }
|
|
|
016a62 |
|
|
|
016a62 |
if (ptr == MAP_FAILED) {
|
|
|
016a62 |
munmap(guardptr, total);
|
|
|
016a62 |
--
|
|
|
016a62 |
1.8.3.1
|
|
|
016a62 |
|