From 1be5c7ff51f42ce741ecede4baea1218235d83f2 Mon Sep 17 00:00:00 2001 From: CentOS Sources Date: Sep 20 2022 23:07:15 +0000 Subject: import qemu-kvm-6.2.0-20.module+el8.7.0+16634+93bdaf0e.1 --- diff --git a/SOURCES/kvm-Add-dirty-sync-missed-zero-copy-migration-stat.patch b/SOURCES/kvm-Add-dirty-sync-missed-zero-copy-migration-stat.patch new file mode 100644 index 0000000..ad2b261 --- /dev/null +++ b/SOURCES/kvm-Add-dirty-sync-missed-zero-copy-migration-stat.patch @@ -0,0 +1,87 @@ +From cd49a32e9c9e33efc51652b68180a07683814b4d Mon Sep 17 00:00:00 2001 +From: Leonardo Bras +Date: Mon, 11 Jul 2022 18:11:12 -0300 +Subject: [PATCH 4/9] Add dirty-sync-missed-zero-copy migration stat +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Leonardo Brás +RH-MergeRequest: 201: Zero-copy-send fixes + improvements +RH-Commit: [4/8] 56cce61cf95aafc8dafae7531b43c166084abfec +RH-Bugzilla: 2110203 +RH-Acked-by: Dr. David Alan Gilbert +RH-Acked-by: Peter Xu +RH-Acked-by: Miroslav Rezanina + +Signed-off-by: Leonardo Bras +Acked-by: Markus Armbruster +Acked-by: Peter Xu +Reviewed-by: Daniel P. Berrangé +Message-Id: <20220711211112.18951-3-leobras@redhat.com> +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit cf20c897338067ab4b70a4596fdccaf90c7e29a1) +Signed-off-by: Leonardo Bras +--- + migration/migration.c | 2 ++ + monitor/hmp-cmds.c | 5 +++++ + qapi/migration.json | 7 ++++++- + 3 files changed, 13 insertions(+), 1 deletion(-) + +diff --git a/migration/migration.c b/migration/migration.c +index e100b30f00..952a26c5c2 100644 +--- a/migration/migration.c ++++ b/migration/migration.c +@@ -1012,6 +1012,8 @@ static void populate_ram_info(MigrationInfo *info, MigrationState *s) + info->ram->normal_bytes = ram_counters.normal * page_size; + info->ram->mbps = s->mbps; + info->ram->dirty_sync_count = ram_counters.dirty_sync_count; ++ info->ram->dirty_sync_missed_zero_copy = ++ ram_counters.dirty_sync_missed_zero_copy; + info->ram->postcopy_requests = ram_counters.postcopy_requests; + info->ram->page_size = page_size; + info->ram->multifd_bytes = ram_counters.multifd_bytes; +diff --git a/monitor/hmp-cmds.c b/monitor/hmp-cmds.c +index 8c384dc1b2..f7216ab5d0 100644 +--- a/monitor/hmp-cmds.c ++++ b/monitor/hmp-cmds.c +@@ -305,6 +305,11 @@ void hmp_info_migrate(Monitor *mon, const QDict *qdict) + monitor_printf(mon, "postcopy ram: %" PRIu64 " kbytes\n", + info->ram->postcopy_bytes >> 10); + } ++ if (info->ram->dirty_sync_missed_zero_copy) { ++ monitor_printf(mon, ++ "Zero-copy-send fallbacks happened: %" PRIu64 " times\n", ++ info->ram->dirty_sync_missed_zero_copy); ++ } + } + + if (info->has_disk) { +diff --git a/qapi/migration.json b/qapi/migration.json +index c8ec260ab0..94bc5c69db 100644 +--- a/qapi/migration.json ++++ b/qapi/migration.json +@@ -55,6 +55,10 @@ + # @postcopy-bytes: The number of bytes sent during the post-copy phase + # (since 7.0). + # ++# @dirty-sync-missed-zero-copy: Number of times dirty RAM synchronization could ++# not avoid copying dirty pages. This is between ++# 0 and @dirty-sync-count * @multifd-channels. ++# (since 7.1) + # Since: 0.14 + ## + { 'struct': 'MigrationStats', +@@ -65,7 +69,8 @@ + 'postcopy-requests' : 'int', 'page-size' : 'int', + 'multifd-bytes' : 'uint64', 'pages-per-second' : 'uint64', + 'precopy-bytes' : 'uint64', 'downtime-bytes' : 'uint64', +- 'postcopy-bytes' : 'uint64' } } ++ 'postcopy-bytes' : 'uint64', ++ 'dirty-sync-missed-zero-copy' : 'uint64' } } + + ## + # @XBZRLECacheStats: +-- +2.31.1 + diff --git a/SOURCES/kvm-KVM-x86-workaround-invalid-CPUID-0xD-9-info-on-some-.patch b/SOURCES/kvm-KVM-x86-workaround-invalid-CPUID-0xD-9-info-on-some-.patch new file mode 100644 index 0000000..1a0beb2 --- /dev/null +++ b/SOURCES/kvm-KVM-x86-workaround-invalid-CPUID-0xD-9-info-on-some-.patch @@ -0,0 +1,109 @@ +From ea5299b5dde7d0b6b2f93cb646e6a24c9f105466 Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Wed, 23 Mar 2022 12:33:25 +0100 +Subject: [PATCH 13/24] KVM: x86: workaround invalid CPUID[0xD,9] info on some + AMD processors +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Paul Lai +RH-MergeRequest: 176: Enable KVM AMX support +RH-Commit: [13/13] 38f147c911258e84e01336271ebd23a1c24371fc +RH-Bugzilla: 1916415 +RH-Acked-by: Cornelia Huck +RH-Acked-by: Igor Mammedov +RH-Acked-by: Paolo Bonzini + +Some AMD processors expose the PKRU extended save state even if they do not have +the related PKU feature in CPUID. Worse, when they do they report a size of +64, whereas the expected size of the PKRU extended save state is 8, therefore +the esa->size == eax assertion does not hold. + +The state is already ignored by KVM_GET_SUPPORTED_CPUID because it +was not enabled in the host XCR0. However, QEMU kvm_cpu_xsave_init() +runs before QEMU invokes arch_prctl() to enable dynamically-enabled +save states such as XTILEDATA, and KVM_GET_SUPPORTED_CPUID hides save +states that have yet to be enabled. Therefore, kvm_cpu_xsave_init() +needs to consult the host CPUID instead of KVM_GET_SUPPORTED_CPUID, +and dies with an assertion failure. + +When setting up the ExtSaveArea array to match the host, ignore features that +KVM does not report as supported. This will cause QEMU to skip the incorrect +CPUID leaf instead of tripping the assertion. + +Closes: https://gitlab.com/qemu-project/qemu/-/issues/916 +Reported-by: Daniel P. Berrangé +Analyzed-by: Yang Zhong +Reported-by: Peter Krempa +Tested-by: Daniel P. Berrangé +Signed-off-by: Paolo Bonzini +(cherry picked from commit 58f7db26f21c690cf9a669c314cfd7371506084a) +Signed-off-by: Paul Lai +--- + target/i386/cpu.c | 4 ++-- + target/i386/cpu.h | 2 ++ + target/i386/kvm/kvm-cpu.c | 19 ++++++++++++------- + 3 files changed, 16 insertions(+), 9 deletions(-) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index 09e08f7f38..0543b846ff 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -4980,8 +4980,8 @@ CpuDefinitionInfoList *qmp_query_cpu_definitions(Error **errp) + return cpu_list; + } + +-static uint64_t x86_cpu_get_supported_feature_word(FeatureWord w, +- bool migratable_only) ++uint64_t x86_cpu_get_supported_feature_word(FeatureWord w, ++ bool migratable_only) + { + FeatureWordInfo *wi = &feature_word_info[w]; + uint64_t r = 0; +diff --git a/target/i386/cpu.h b/target/i386/cpu.h +index 8ab2a4042a..006b735fe4 100644 +--- a/target/i386/cpu.h ++++ b/target/i386/cpu.h +@@ -604,6 +604,8 @@ typedef enum FeatureWord { + } FeatureWord; + + typedef uint64_t FeatureWordArray[FEATURE_WORDS]; ++uint64_t x86_cpu_get_supported_feature_word(FeatureWord w, ++ bool migratable_only); + + /* cpuid_features bits */ + #define CPUID_FP87 (1U << 0) +diff --git a/target/i386/kvm/kvm-cpu.c b/target/i386/kvm/kvm-cpu.c +index bdc967c484..74c1396a93 100644 +--- a/target/i386/kvm/kvm-cpu.c ++++ b/target/i386/kvm/kvm-cpu.c +@@ -99,13 +99,18 @@ static void kvm_cpu_xsave_init(void) + for (i = XSTATE_SSE_BIT + 1; i < XSAVE_STATE_AREA_COUNT; i++) { + ExtSaveArea *esa = &x86_ext_save_areas[i]; + +- if (esa->size) { +- host_cpuid(0xd, i, &eax, &ebx, &ecx, &edx); +- if (eax != 0) { +- assert(esa->size == eax); +- esa->offset = ebx; +- esa->ecx = ecx; +- } ++ if (!esa->size) { ++ continue; ++ } ++ if ((x86_cpu_get_supported_feature_word(esa->feature, false) & esa->bits) ++ != esa->bits) { ++ continue; ++ } ++ host_cpuid(0xd, i, &eax, &ebx, &ecx, &edx); ++ if (eax != 0) { ++ assert(esa->size == eax); ++ esa->offset = ebx; ++ esa->ecx = ecx; + } + } + } +-- +2.35.3 + diff --git a/SOURCES/kvm-QIOChannel-Add-flags-on-io_writev-and-introduce-io_f.patch b/SOURCES/kvm-QIOChannel-Add-flags-on-io_writev-and-introduce-io_f.patch new file mode 100644 index 0000000..81ae532 --- /dev/null +++ b/SOURCES/kvm-QIOChannel-Add-flags-on-io_writev-and-introduce-io_f.patch @@ -0,0 +1,420 @@ +From 7eeec7c008e947bc3e1fed682791092b408852c6 Mon Sep 17 00:00:00 2001 +From: Leonardo Bras +Date: Wed, 18 May 2022 02:52:24 -0300 +Subject: [PATCH 17/37] QIOChannel: Add flags on io_writev and introduce + io_flush callback +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Leonardo Brás +RH-MergeRequest: 191: MSG_ZEROCOPY + Multifd @ rhel8.7 +RH-Commit: [17/26] 7bde4e79fd3f76a6cc84d9cacf50420584ddd35c +RH-Bugzilla: 2072049 +RH-Acked-by: Peter Xu +RH-Acked-by: Daniel P. Berrangé +RH-Acked-by: Dr. David Alan Gilbert + +Add flags to io_writev and introduce io_flush as optional callback to +QIOChannelClass, allowing the implementation of zero copy writes by +subclasses. + +How to use them: +- Write data using qio_channel_writev*(...,QIO_CHANNEL_WRITE_FLAG_ZERO_COPY), +- Wait write completion with qio_channel_flush(). + +Notes: +As some zero copy write implementations work asynchronously, it's +recommended to keep the write buffer untouched until the return of +qio_channel_flush(), to avoid the risk of sending an updated buffer +instead of the buffer state during write. + +As io_flush callback is optional, if a subclass does not implement it, then: +- io_flush will return 0 without changing anything. + +Also, some functions like qio_channel_writev_full_all() were adapted to +receive a flag parameter. That allows shared code between zero copy and +non-zero copy writev, and also an easier implementation on new flags. + +Signed-off-by: Leonardo Bras +Reviewed-by: Daniel P. Berrangé +Reviewed-by: Peter Xu +Reviewed-by: Juan Quintela +Message-Id: <20220513062836.965425-3-leobras@redhat.com> +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit b88651cb4d4fa416fdbb6afaf5b26ec8c035eaad) +Signed-off-by: Leonardo Bras +--- + chardev/char-io.c | 2 +- + hw/remote/mpqemu-link.c | 2 +- + include/io/channel.h | 38 +++++++++++++++++++++- + io/channel-buffer.c | 1 + + io/channel-command.c | 1 + + io/channel-file.c | 1 + + io/channel-socket.c | 2 ++ + io/channel-tls.c | 1 + + io/channel-websock.c | 1 + + io/channel.c | 49 +++++++++++++++++++++++------ + migration/rdma.c | 1 + + scsi/pr-manager-helper.c | 2 +- + tests/unit/test-io-channel-socket.c | 1 + + 13 files changed, 88 insertions(+), 14 deletions(-) + +diff --git a/chardev/char-io.c b/chardev/char-io.c +index 8ced184160..4451128cba 100644 +--- a/chardev/char-io.c ++++ b/chardev/char-io.c +@@ -122,7 +122,7 @@ int io_channel_send_full(QIOChannel *ioc, + + ret = qio_channel_writev_full( + ioc, &iov, 1, +- fds, nfds, NULL); ++ fds, nfds, 0, NULL); + if (ret == QIO_CHANNEL_ERR_BLOCK) { + if (offset) { + return offset; +diff --git a/hw/remote/mpqemu-link.c b/hw/remote/mpqemu-link.c +index 7e841820e5..e8f556bd27 100644 +--- a/hw/remote/mpqemu-link.c ++++ b/hw/remote/mpqemu-link.c +@@ -69,7 +69,7 @@ bool mpqemu_msg_send(MPQemuMsg *msg, QIOChannel *ioc, Error **errp) + } + + if (!qio_channel_writev_full_all(ioc, send, G_N_ELEMENTS(send), +- fds, nfds, errp)) { ++ fds, nfds, 0, errp)) { + ret = true; + } else { + trace_mpqemu_send_io_error(msg->cmd, msg->size, nfds); +diff --git a/include/io/channel.h b/include/io/channel.h +index 88988979f8..c680ee7480 100644 +--- a/include/io/channel.h ++++ b/include/io/channel.h +@@ -32,12 +32,15 @@ OBJECT_DECLARE_TYPE(QIOChannel, QIOChannelClass, + + #define QIO_CHANNEL_ERR_BLOCK -2 + ++#define QIO_CHANNEL_WRITE_FLAG_ZERO_COPY 0x1 ++ + typedef enum QIOChannelFeature QIOChannelFeature; + + enum QIOChannelFeature { + QIO_CHANNEL_FEATURE_FD_PASS, + QIO_CHANNEL_FEATURE_SHUTDOWN, + QIO_CHANNEL_FEATURE_LISTEN, ++ QIO_CHANNEL_FEATURE_WRITE_ZERO_COPY, + }; + + +@@ -104,6 +107,7 @@ struct QIOChannelClass { + size_t niov, + int *fds, + size_t nfds, ++ int flags, + Error **errp); + ssize_t (*io_readv)(QIOChannel *ioc, + const struct iovec *iov, +@@ -136,6 +140,8 @@ struct QIOChannelClass { + IOHandler *io_read, + IOHandler *io_write, + void *opaque); ++ int (*io_flush)(QIOChannel *ioc, ++ Error **errp); + }; + + /* General I/O handling functions */ +@@ -228,6 +234,7 @@ ssize_t qio_channel_readv_full(QIOChannel *ioc, + * @niov: the length of the @iov array + * @fds: an array of file handles to send + * @nfds: number of file handles in @fds ++ * @flags: write flags (QIO_CHANNEL_WRITE_FLAG_*) + * @errp: pointer to a NULL-initialized error object + * + * Write data to the IO channel, reading it from the +@@ -260,6 +267,7 @@ ssize_t qio_channel_writev_full(QIOChannel *ioc, + size_t niov, + int *fds, + size_t nfds, ++ int flags, + Error **errp); + + /** +@@ -837,6 +845,7 @@ int qio_channel_readv_full_all(QIOChannel *ioc, + * @niov: the length of the @iov array + * @fds: an array of file handles to send + * @nfds: number of file handles in @fds ++ * @flags: write flags (QIO_CHANNEL_WRITE_FLAG_*) + * @errp: pointer to a NULL-initialized error object + * + * +@@ -846,6 +855,14 @@ int qio_channel_readv_full_all(QIOChannel *ioc, + * to be written, yielding from the current coroutine + * if required. + * ++ * If QIO_CHANNEL_WRITE_FLAG_ZERO_COPY is passed in flags, ++ * instead of waiting for all requested data to be written, ++ * this function will wait until it's all queued for writing. ++ * In this case, if the buffer gets changed between queueing and ++ * sending, the updated buffer will be sent. If this is not a ++ * desired behavior, it's suggested to call qio_channel_flush() ++ * before reusing the buffer. ++ * + * Returns: 0 if all bytes were written, or -1 on error + */ + +@@ -853,6 +870,25 @@ int qio_channel_writev_full_all(QIOChannel *ioc, + const struct iovec *iov, + size_t niov, + int *fds, size_t nfds, +- Error **errp); ++ int flags, Error **errp); ++ ++/** ++ * qio_channel_flush: ++ * @ioc: the channel object ++ * @errp: pointer to a NULL-initialized error object ++ * ++ * Will block until every packet queued with ++ * qio_channel_writev_full() + QIO_CHANNEL_WRITE_FLAG_ZERO_COPY ++ * is sent, or return in case of any error. ++ * ++ * If not implemented, acts as a no-op, and returns 0. ++ * ++ * Returns -1 if any error is found, ++ * 1 if every send failed to use zero copy. ++ * 0 otherwise. ++ */ ++ ++int qio_channel_flush(QIOChannel *ioc, ++ Error **errp); + + #endif /* QIO_CHANNEL_H */ +diff --git a/io/channel-buffer.c b/io/channel-buffer.c +index baa4e2b089..bf52011be2 100644 +--- a/io/channel-buffer.c ++++ b/io/channel-buffer.c +@@ -81,6 +81,7 @@ static ssize_t qio_channel_buffer_writev(QIOChannel *ioc, + size_t niov, + int *fds, + size_t nfds, ++ int flags, + Error **errp) + { + QIOChannelBuffer *bioc = QIO_CHANNEL_BUFFER(ioc); +diff --git a/io/channel-command.c b/io/channel-command.c +index b2a9e27138..5ff1691bad 100644 +--- a/io/channel-command.c ++++ b/io/channel-command.c +@@ -258,6 +258,7 @@ static ssize_t qio_channel_command_writev(QIOChannel *ioc, + size_t niov, + int *fds, + size_t nfds, ++ int flags, + Error **errp) + { + QIOChannelCommand *cioc = QIO_CHANNEL_COMMAND(ioc); +diff --git a/io/channel-file.c b/io/channel-file.c +index c4bf799a80..348a48545e 100644 +--- a/io/channel-file.c ++++ b/io/channel-file.c +@@ -114,6 +114,7 @@ static ssize_t qio_channel_file_writev(QIOChannel *ioc, + size_t niov, + int *fds, + size_t nfds, ++ int flags, + Error **errp) + { + QIOChannelFile *fioc = QIO_CHANNEL_FILE(ioc); +diff --git a/io/channel-socket.c b/io/channel-socket.c +index 606ec97cf7..bfbd64787e 100644 +--- a/io/channel-socket.c ++++ b/io/channel-socket.c +@@ -525,6 +525,7 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc, + size_t niov, + int *fds, + size_t nfds, ++ int flags, + Error **errp) + { + QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc); +@@ -620,6 +621,7 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc, + size_t niov, + int *fds, + size_t nfds, ++ int flags, + Error **errp) + { + QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc); +diff --git a/io/channel-tls.c b/io/channel-tls.c +index 2ae1b92fc0..4ce890a538 100644 +--- a/io/channel-tls.c ++++ b/io/channel-tls.c +@@ -301,6 +301,7 @@ static ssize_t qio_channel_tls_writev(QIOChannel *ioc, + size_t niov, + int *fds, + size_t nfds, ++ int flags, + Error **errp) + { + QIOChannelTLS *tioc = QIO_CHANNEL_TLS(ioc); +diff --git a/io/channel-websock.c b/io/channel-websock.c +index 70889bb54d..035dd6075b 100644 +--- a/io/channel-websock.c ++++ b/io/channel-websock.c +@@ -1127,6 +1127,7 @@ static ssize_t qio_channel_websock_writev(QIOChannel *ioc, + size_t niov, + int *fds, + size_t nfds, ++ int flags, + Error **errp) + { + QIOChannelWebsock *wioc = QIO_CHANNEL_WEBSOCK(ioc); +diff --git a/io/channel.c b/io/channel.c +index e8b019dc36..0640941ac5 100644 +--- a/io/channel.c ++++ b/io/channel.c +@@ -72,18 +72,32 @@ ssize_t qio_channel_writev_full(QIOChannel *ioc, + size_t niov, + int *fds, + size_t nfds, ++ int flags, + Error **errp) + { + QIOChannelClass *klass = QIO_CHANNEL_GET_CLASS(ioc); + +- if ((fds || nfds) && +- !qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_FD_PASS)) { ++ if (fds || nfds) { ++ if (!qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_FD_PASS)) { ++ error_setg_errno(errp, EINVAL, ++ "Channel does not support file descriptor passing"); ++ return -1; ++ } ++ if (flags & QIO_CHANNEL_WRITE_FLAG_ZERO_COPY) { ++ error_setg_errno(errp, EINVAL, ++ "Zero Copy does not support file descriptor passing"); ++ return -1; ++ } ++ } ++ ++ if ((flags & QIO_CHANNEL_WRITE_FLAG_ZERO_COPY) && ++ !qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_WRITE_ZERO_COPY)) { + error_setg_errno(errp, EINVAL, +- "Channel does not support file descriptor passing"); ++ "Requested Zero Copy feature is not available"); + return -1; + } + +- return klass->io_writev(ioc, iov, niov, fds, nfds, errp); ++ return klass->io_writev(ioc, iov, niov, fds, nfds, flags, errp); + } + + +@@ -217,14 +231,14 @@ int qio_channel_writev_all(QIOChannel *ioc, + size_t niov, + Error **errp) + { +- return qio_channel_writev_full_all(ioc, iov, niov, NULL, 0, errp); ++ return qio_channel_writev_full_all(ioc, iov, niov, NULL, 0, 0, errp); + } + + int qio_channel_writev_full_all(QIOChannel *ioc, + const struct iovec *iov, + size_t niov, + int *fds, size_t nfds, +- Error **errp) ++ int flags, Error **errp) + { + int ret = -1; + struct iovec *local_iov = g_new(struct iovec, niov); +@@ -237,8 +251,10 @@ int qio_channel_writev_full_all(QIOChannel *ioc, + + while (nlocal_iov > 0) { + ssize_t len; +- len = qio_channel_writev_full(ioc, local_iov, nlocal_iov, fds, nfds, +- errp); ++ ++ len = qio_channel_writev_full(ioc, local_iov, nlocal_iov, fds, ++ nfds, flags, errp); ++ + if (len == QIO_CHANNEL_ERR_BLOCK) { + if (qemu_in_coroutine()) { + qio_channel_yield(ioc, G_IO_OUT); +@@ -277,7 +293,7 @@ ssize_t qio_channel_writev(QIOChannel *ioc, + size_t niov, + Error **errp) + { +- return qio_channel_writev_full(ioc, iov, niov, NULL, 0, errp); ++ return qio_channel_writev_full(ioc, iov, niov, NULL, 0, 0, errp); + } + + +@@ -297,7 +313,7 @@ ssize_t qio_channel_write(QIOChannel *ioc, + Error **errp) + { + struct iovec iov = { .iov_base = (char *)buf, .iov_len = buflen }; +- return qio_channel_writev_full(ioc, &iov, 1, NULL, 0, errp); ++ return qio_channel_writev_full(ioc, &iov, 1, NULL, 0, 0, errp); + } + + +@@ -473,6 +489,19 @@ off_t qio_channel_io_seek(QIOChannel *ioc, + return klass->io_seek(ioc, offset, whence, errp); + } + ++int qio_channel_flush(QIOChannel *ioc, ++ Error **errp) ++{ ++ QIOChannelClass *klass = QIO_CHANNEL_GET_CLASS(ioc); ++ ++ if (!klass->io_flush || ++ !qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_WRITE_ZERO_COPY)) { ++ return 0; ++ } ++ ++ return klass->io_flush(ioc, errp); ++} ++ + + static void qio_channel_restart_read(void *opaque) + { +diff --git a/migration/rdma.c b/migration/rdma.c +index f5d3bbe7e9..54acd2000e 100644 +--- a/migration/rdma.c ++++ b/migration/rdma.c +@@ -2833,6 +2833,7 @@ static ssize_t qio_channel_rdma_writev(QIOChannel *ioc, + size_t niov, + int *fds, + size_t nfds, ++ int flags, + Error **errp) + { + QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc); +diff --git a/scsi/pr-manager-helper.c b/scsi/pr-manager-helper.c +index 451c7631b7..3be52a98d5 100644 +--- a/scsi/pr-manager-helper.c ++++ b/scsi/pr-manager-helper.c +@@ -77,7 +77,7 @@ static int pr_manager_helper_write(PRManagerHelper *pr_mgr, + iov.iov_base = (void *)buf; + iov.iov_len = sz; + n_written = qio_channel_writev_full(QIO_CHANNEL(pr_mgr->ioc), &iov, 1, +- nfds ? &fd : NULL, nfds, errp); ++ nfds ? &fd : NULL, nfds, 0, errp); + + if (n_written <= 0) { + assert(n_written != QIO_CHANNEL_ERR_BLOCK); +diff --git a/tests/unit/test-io-channel-socket.c b/tests/unit/test-io-channel-socket.c +index c49eec1f03..6713886d02 100644 +--- a/tests/unit/test-io-channel-socket.c ++++ b/tests/unit/test-io-channel-socket.c +@@ -444,6 +444,7 @@ static void test_io_channel_unix_fd_pass(void) + G_N_ELEMENTS(iosend), + fdsend, + G_N_ELEMENTS(fdsend), ++ 0, + &error_abort); + + qio_channel_readv_full(dst, +-- +2.35.3 + diff --git a/SOURCES/kvm-QIOChannelSocket-Add-support-for-MSG_ZEROCOPY-IPV6.patch b/SOURCES/kvm-QIOChannelSocket-Add-support-for-MSG_ZEROCOPY-IPV6.patch new file mode 100644 index 0000000..98f1ac4 --- /dev/null +++ b/SOURCES/kvm-QIOChannelSocket-Add-support-for-MSG_ZEROCOPY-IPV6.patch @@ -0,0 +1,56 @@ +From a6c4aed18a027ce8e107fdf9184e9ea43a86f843 Mon Sep 17 00:00:00 2001 +From: Leonardo Bras +Date: Thu, 4 Aug 2022 04:10:43 -0300 +Subject: [PATCH 8/9] QIOChannelSocket: Add support for MSG_ZEROCOPY + IPV6 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Leonardo Brás +RH-MergeRequest: 201: Zero-copy-send fixes + improvements +RH-Commit: [8/8] 6e26ee7c9ebaedb07623313cb0678816867751dd +RH-Bugzilla: 2110203 +RH-Acked-by: Dr. David Alan Gilbert +RH-Acked-by: Peter Xu +RH-Acked-by: Miroslav Rezanina + +For using MSG_ZEROCOPY, there are two steps: +1 - io_writev() the packet, which enqueues the packet for sending, and +2 - io_flush(), which gets confirmation that all packets got correctly sent + +Currently, if MSG_ZEROCOPY is used to send packets over IPV6, no error will +be reported in (1), but it will fail in the first time (2) happens. + +This happens because (2) currently checks for cmsg_level & cmsg_type +associated with IPV4 only, before reporting any error. + +Add checks for cmsg_level & cmsg_type associated with IPV6, and thus enable +support for MSG_ZEROCOPY + IPV6 + +Fixes: 2bc58ffc29 ("QIOChannelSocket: Implement io_writev zero copy flag & io_flush for CONFIG_LINUX") +Signed-off-by: Leonardo Bras +Signed-off-by: Daniel P. Berrangé +(cherry picked from commit 5258a7e2c0677d16e9e1d06845f60171adf0b290) +Signed-off-by: Leonardo Bras +--- + io/channel-socket.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/io/channel-socket.c b/io/channel-socket.c +index cf0d67c51b..6010ad7017 100644 +--- a/io/channel-socket.c ++++ b/io/channel-socket.c +@@ -747,8 +747,8 @@ static int qio_channel_socket_flush(QIOChannel *ioc, + } + + cm = CMSG_FIRSTHDR(&msg); +- if (cm->cmsg_level != SOL_IP && +- cm->cmsg_type != IP_RECVERR) { ++ if (cm->cmsg_level != SOL_IP && cm->cmsg_type != IP_RECVERR && ++ cm->cmsg_level != SOL_IPV6 && cm->cmsg_type != IPV6_RECVERR) { + error_setg_errno(errp, EPROTOTYPE, + "Wrong cmsg in errqueue"); + return -1; +-- +2.31.1 + diff --git a/SOURCES/kvm-QIOChannelSocket-Fix-zero-copy-flush-returning-code-.patch b/SOURCES/kvm-QIOChannelSocket-Fix-zero-copy-flush-returning-code-.patch new file mode 100644 index 0000000..5806062 --- /dev/null +++ b/SOURCES/kvm-QIOChannelSocket-Fix-zero-copy-flush-returning-code-.patch @@ -0,0 +1,65 @@ +From 905cc8032fc63619efb3f0a8c9754b7190bcc43a Mon Sep 17 00:00:00 2001 +From: Leonardo Bras +Date: Mon, 11 Jul 2022 18:11:11 -0300 +Subject: [PATCH 3/9] QIOChannelSocket: Fix zero-copy flush returning code 1 + when nothing sent +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Leonardo Brás +RH-MergeRequest: 201: Zero-copy-send fixes + improvements +RH-Commit: [3/8] 1ad707702fa26cd4d0fa1870c21f5f26ae93ff97 +RH-Bugzilla: 2110203 +RH-Acked-by: Dr. David Alan Gilbert +RH-Acked-by: Peter Xu +RH-Acked-by: Miroslav Rezanina + +If flush is called when no buffer was sent with MSG_ZEROCOPY, it currently +returns 1. This return code should be used only when Linux fails to use +MSG_ZEROCOPY on a lot of sendmsg(). + +Fix this by returning early from flush if no sendmsg(...,MSG_ZEROCOPY) +was attempted. + +Fixes: 2bc58ffc2926 ("QIOChannelSocket: Implement io_writev zero copy flag & io_flush for CONFIG_LINUX") +Signed-off-by: Leonardo Bras +Reviewed-by: Daniel P. Berrangé +Acked-by: Daniel P. Berrangé +Reviewed-by: Juan Quintela +Reviewed-by: Peter Xu +Message-Id: <20220711211112.18951-2-leobras@redhat.com> +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 927f93e099c4f9184e60a1bc61624ac2d04d0223) +Signed-off-by: Leonardo Bras +--- + io/channel-socket.c | 8 +++++++- + 1 file changed, 7 insertions(+), 1 deletion(-) + +diff --git a/io/channel-socket.c b/io/channel-socket.c +index df858da924..cf0d67c51b 100644 +--- a/io/channel-socket.c ++++ b/io/channel-socket.c +@@ -717,12 +717,18 @@ static int qio_channel_socket_flush(QIOChannel *ioc, + struct cmsghdr *cm; + char control[CMSG_SPACE(sizeof(*serr))]; + int received; +- int ret = 1; ++ int ret; ++ ++ if (sioc->zero_copy_queued == sioc->zero_copy_sent) { ++ return 0; ++ } + + msg.msg_control = control; + msg.msg_controllen = sizeof(control); + memset(control, 0, sizeof(control)); + ++ ret = 1; ++ + while (sioc->zero_copy_sent < sioc->zero_copy_queued) { + received = recvmsg(sioc->fd, &msg, MSG_ERRQUEUE); + if (received < 0) { +-- +2.31.1 + diff --git a/SOURCES/kvm-QIOChannelSocket-Fix-zero-copy-send-so-socket-flush-.patch b/SOURCES/kvm-QIOChannelSocket-Fix-zero-copy-send-so-socket-flush-.patch new file mode 100644 index 0000000..685478f --- /dev/null +++ b/SOURCES/kvm-QIOChannelSocket-Fix-zero-copy-send-so-socket-flush-.patch @@ -0,0 +1,58 @@ +From c1fd32d93ae42fcf3c1a25f4d56e669f251087d8 Mon Sep 17 00:00:00 2001 +From: Leonardo Bras +Date: Mon, 20 Jun 2022 02:39:43 -0300 +Subject: [PATCH 25/37] QIOChannelSocket: Fix zero-copy send so socket flush + works +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Leonardo Brás +RH-MergeRequest: 191: MSG_ZEROCOPY + Multifd @ rhel8.7 +RH-Commit: [25/26] 3ede94f3269e21c3ace073ed1a6f24696315bcbb +RH-Bugzilla: 2072049 +RH-Acked-by: Peter Xu +RH-Acked-by: Daniel P. Berrangé +RH-Acked-by: Dr. David Alan Gilbert + +Somewhere between v6 and v7 the of the zero-copy-send patchset a crucial +part of the flushing mechanism got missing: incrementing zero_copy_queued. + +Without that, the flushing interface becomes a no-op, and there is no +guarantee the buffer is really sent. + +This can go as bad as causing a corruption in RAM during migration. + +Fixes: 2bc58ffc2926 ("QIOChannelSocket: Implement io_writev zero copy flag & io_flush for CONFIG_LINUX") +Reported-by: 徐闯 +Signed-off-by: Leonardo Bras +Reviewed-by: Daniel P. Berrangé +Reviewed-by: Peter Xu +Reviewed-by: Juan Quintela +Signed-off-by: Juan Quintela +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 4f5a09714c983a3471fd12e3c7f3196e95c650c1) +Signed-off-by: Leonardo Bras +--- + io/channel-socket.c | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/io/channel-socket.c b/io/channel-socket.c +index 7d37b39de7..df858da924 100644 +--- a/io/channel-socket.c ++++ b/io/channel-socket.c +@@ -612,6 +612,11 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc, + "Unable to write to socket"); + return -1; + } ++ ++ if (flags & QIO_CHANNEL_WRITE_FLAG_ZERO_COPY) { ++ sioc->zero_copy_queued++; ++ } ++ + return ret; + } + #else /* WIN32 */ +-- +2.35.3 + diff --git a/SOURCES/kvm-QIOChannelSocket-Implement-io_writev-zero-copy-flag-.patch b/SOURCES/kvm-QIOChannelSocket-Implement-io_writev-zero-copy-flag-.patch new file mode 100644 index 0000000..4b272ee --- /dev/null +++ b/SOURCES/kvm-QIOChannelSocket-Implement-io_writev-zero-copy-flag-.patch @@ -0,0 +1,249 @@ +From 5fd7af93a06adaddbae719aabbaf912159f4fb28 Mon Sep 17 00:00:00 2001 +From: Leonardo Bras +Date: Wed, 18 May 2022 02:52:25 -0300 +Subject: [PATCH 18/37] QIOChannelSocket: Implement io_writev zero copy flag & + io_flush for CONFIG_LINUX +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Leonardo Brás +RH-MergeRequest: 191: MSG_ZEROCOPY + Multifd @ rhel8.7 +RH-Commit: [18/26] 6f65c8c879a5df57213b541d58285b65178f8547 +RH-Bugzilla: 2072049 +RH-Acked-by: Peter Xu +RH-Acked-by: Daniel P. Berrangé +RH-Acked-by: Dr. David Alan Gilbert + +For CONFIG_LINUX, implement the new zero copy flag and the optional callback +io_flush on QIOChannelSocket, but enables it only when MSG_ZEROCOPY +feature is available in the host kernel, which is checked on +qio_channel_socket_connect_sync() + +qio_channel_socket_flush() was implemented by counting how many times +sendmsg(...,MSG_ZEROCOPY) was successfully called, and then reading the +socket's error queue, in order to find how many of them finished sending. +Flush will loop until those counters are the same, or until some error occurs. + +Notes on using writev() with QIO_CHANNEL_WRITE_FLAG_ZERO_COPY: +1: Buffer +- As MSG_ZEROCOPY tells the kernel to use the same user buffer to avoid copying, +some caution is necessary to avoid overwriting any buffer before it's sent. +If something like this happen, a newer version of the buffer may be sent instead. +- If this is a problem, it's recommended to call qio_channel_flush() before freeing +or re-using the buffer. + +2: Locked memory +- When using MSG_ZERCOCOPY, the buffer memory will be locked after queued, and +unlocked after it's sent. +- Depending on the size of each buffer, and how often it's sent, it may require +a larger amount of locked memory than usually available to non-root user. +- If the required amount of locked memory is not available, writev_zero_copy +will return an error, which can abort an operation like migration, +- Because of this, when an user code wants to add zero copy as a feature, it +requires a mechanism to disable it, so it can still be accessible to less +privileged users. + +Signed-off-by: Leonardo Bras +Reviewed-by: Peter Xu +Reviewed-by: Daniel P. Berrangé +Reviewed-by: Juan Quintela +Message-Id: <20220513062836.965425-4-leobras@redhat.com> +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 2bc58ffc2926a4efdd03edfb5909861fefc68c3d) +Signed-off-by: Leonardo Bras +--- + include/io/channel-socket.h | 2 + + io/channel-socket.c | 116 ++++++++++++++++++++++++++++++++++-- + 2 files changed, 114 insertions(+), 4 deletions(-) + +diff --git a/include/io/channel-socket.h b/include/io/channel-socket.h +index e747e63514..513c428fe4 100644 +--- a/include/io/channel-socket.h ++++ b/include/io/channel-socket.h +@@ -47,6 +47,8 @@ struct QIOChannelSocket { + socklen_t localAddrLen; + struct sockaddr_storage remoteAddr; + socklen_t remoteAddrLen; ++ ssize_t zero_copy_queued; ++ ssize_t zero_copy_sent; + }; + + +diff --git a/io/channel-socket.c b/io/channel-socket.c +index bfbd64787e..38a46ba213 100644 +--- a/io/channel-socket.c ++++ b/io/channel-socket.c +@@ -26,6 +26,14 @@ + #include "io/channel-watch.h" + #include "trace.h" + #include "qapi/clone-visitor.h" ++#ifdef CONFIG_LINUX ++#include ++#include ++ ++#if (defined(MSG_ZEROCOPY) && defined(SO_ZEROCOPY)) ++#define QEMU_MSG_ZEROCOPY ++#endif ++#endif + + #define SOCKET_MAX_FDS 16 + +@@ -55,6 +63,8 @@ qio_channel_socket_new(void) + + sioc = QIO_CHANNEL_SOCKET(object_new(TYPE_QIO_CHANNEL_SOCKET)); + sioc->fd = -1; ++ sioc->zero_copy_queued = 0; ++ sioc->zero_copy_sent = 0; + + ioc = QIO_CHANNEL(sioc); + qio_channel_set_feature(ioc, QIO_CHANNEL_FEATURE_SHUTDOWN); +@@ -154,6 +164,16 @@ int qio_channel_socket_connect_sync(QIOChannelSocket *ioc, + return -1; + } + ++#ifdef QEMU_MSG_ZEROCOPY ++ int ret, v = 1; ++ ret = setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &v, sizeof(v)); ++ if (ret == 0) { ++ /* Zero copy available on host */ ++ qio_channel_set_feature(QIO_CHANNEL(ioc), ++ QIO_CHANNEL_FEATURE_WRITE_ZERO_COPY); ++ } ++#endif ++ + return 0; + } + +@@ -534,6 +554,7 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc, + char control[CMSG_SPACE(sizeof(int) * SOCKET_MAX_FDS)]; + size_t fdsize = sizeof(int) * nfds; + struct cmsghdr *cmsg; ++ int sflags = 0; + + memset(control, 0, CMSG_SPACE(sizeof(int) * SOCKET_MAX_FDS)); + +@@ -558,15 +579,31 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc, + memcpy(CMSG_DATA(cmsg), fds, fdsize); + } + ++#ifdef QEMU_MSG_ZEROCOPY ++ if (flags & QIO_CHANNEL_WRITE_FLAG_ZERO_COPY) { ++ sflags = MSG_ZEROCOPY; ++ } ++#endif ++ + retry: +- ret = sendmsg(sioc->fd, &msg, 0); ++ ret = sendmsg(sioc->fd, &msg, sflags); + if (ret <= 0) { +- if (errno == EAGAIN) { ++ switch (errno) { ++ case EAGAIN: + return QIO_CHANNEL_ERR_BLOCK; +- } +- if (errno == EINTR) { ++ case EINTR: + goto retry; ++#ifdef QEMU_MSG_ZEROCOPY ++ case ENOBUFS: ++ if (sflags & MSG_ZEROCOPY) { ++ error_setg_errno(errp, errno, ++ "Process can't lock enough memory for using MSG_ZEROCOPY"); ++ return -1; ++ } ++ break; ++#endif + } ++ + error_setg_errno(errp, errno, + "Unable to write to socket"); + return -1; +@@ -660,6 +697,74 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc, + } + #endif /* WIN32 */ + ++ ++#ifdef QEMU_MSG_ZEROCOPY ++static int qio_channel_socket_flush(QIOChannel *ioc, ++ Error **errp) ++{ ++ QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc); ++ struct msghdr msg = {}; ++ struct sock_extended_err *serr; ++ struct cmsghdr *cm; ++ char control[CMSG_SPACE(sizeof(*serr))]; ++ int received; ++ int ret = 1; ++ ++ msg.msg_control = control; ++ msg.msg_controllen = sizeof(control); ++ memset(control, 0, sizeof(control)); ++ ++ while (sioc->zero_copy_sent < sioc->zero_copy_queued) { ++ received = recvmsg(sioc->fd, &msg, MSG_ERRQUEUE); ++ if (received < 0) { ++ switch (errno) { ++ case EAGAIN: ++ /* Nothing on errqueue, wait until something is available */ ++ qio_channel_wait(ioc, G_IO_ERR); ++ continue; ++ case EINTR: ++ continue; ++ default: ++ error_setg_errno(errp, errno, ++ "Unable to read errqueue"); ++ return -1; ++ } ++ } ++ ++ cm = CMSG_FIRSTHDR(&msg); ++ if (cm->cmsg_level != SOL_IP && ++ cm->cmsg_type != IP_RECVERR) { ++ error_setg_errno(errp, EPROTOTYPE, ++ "Wrong cmsg in errqueue"); ++ return -1; ++ } ++ ++ serr = (void *) CMSG_DATA(cm); ++ if (serr->ee_errno != SO_EE_ORIGIN_NONE) { ++ error_setg_errno(errp, serr->ee_errno, ++ "Error on socket"); ++ return -1; ++ } ++ if (serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY) { ++ error_setg_errno(errp, serr->ee_origin, ++ "Error not from zero copy"); ++ return -1; ++ } ++ ++ /* No errors, count successfully finished sendmsg()*/ ++ sioc->zero_copy_sent += serr->ee_data - serr->ee_info + 1; ++ ++ /* If any sendmsg() succeeded using zero copy, return 0 at the end */ ++ if (serr->ee_code != SO_EE_CODE_ZEROCOPY_COPIED) { ++ ret = 0; ++ } ++ } ++ ++ return ret; ++} ++ ++#endif /* QEMU_MSG_ZEROCOPY */ ++ + static int + qio_channel_socket_set_blocking(QIOChannel *ioc, + bool enabled, +@@ -789,6 +894,9 @@ static void qio_channel_socket_class_init(ObjectClass *klass, + ioc_klass->io_set_delay = qio_channel_socket_set_delay; + ioc_klass->io_create_watch = qio_channel_socket_create_watch; + ioc_klass->io_set_aio_fd_handler = qio_channel_socket_set_aio_fd_handler; ++#ifdef QEMU_MSG_ZEROCOPY ++ ioc_klass->io_flush = qio_channel_socket_flush; ++#endif + } + + static const TypeInfo qio_channel_socket_info = { +-- +2.35.3 + diff --git a/SOURCES/kvm-QIOChannelSocket-Introduce-assert-and-reduce-ifdefs-.patch b/SOURCES/kvm-QIOChannelSocket-Introduce-assert-and-reduce-ifdefs-.patch new file mode 100644 index 0000000..2575f64 --- /dev/null +++ b/SOURCES/kvm-QIOChannelSocket-Introduce-assert-and-reduce-ifdefs-.patch @@ -0,0 +1,82 @@ +From cbfaf86331c2b2e01a2083303b7554672bf991b7 Mon Sep 17 00:00:00 2001 +From: Leonardo Bras +Date: Mon, 20 Jun 2022 02:39:42 -0300 +Subject: [PATCH 24/37] QIOChannelSocket: Introduce assert and reduce ifdefs to + improve readability +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Leonardo Brás +RH-MergeRequest: 191: MSG_ZEROCOPY + Multifd @ rhel8.7 +RH-Commit: [24/26] b50e2e65307149f247155a7f7a032dc99e57718d +RH-Bugzilla: 2072049 +RH-Acked-by: Peter Xu +RH-Acked-by: Daniel P. Berrangé +RH-Acked-by: Dr. David Alan Gilbert + +During implementation of MSG_ZEROCOPY feature, a lot of #ifdefs were +introduced, particularly at qio_channel_socket_writev(). + +Rewrite some of those changes so it's easier to read. + +Also, introduce an assert to help detect incorrect zero-copy usage is when +it's disabled on build. + +Signed-off-by: Leonardo Bras +Reviewed-by: Daniel P. Berrangé +Reviewed-by: Juan Quintela +Reviewed-by: Peter Xu +Signed-off-by: Juan Quintela +Signed-off-by: Dr. David Alan Gilbert + dgilbert: Fixed up thinko'd g_assert_unreachable->g_assert_not_reached +(cherry picked from commit 803ca43e4c7fcf32f9f68c118301ccd0c83ece3f) +Signed-off-by: Leonardo Bras +--- + io/channel-socket.c | 14 +++++++++----- + 1 file changed, 9 insertions(+), 5 deletions(-) + +diff --git a/io/channel-socket.c b/io/channel-socket.c +index 38a46ba213..7d37b39de7 100644 +--- a/io/channel-socket.c ++++ b/io/channel-socket.c +@@ -579,11 +579,17 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc, + memcpy(CMSG_DATA(cmsg), fds, fdsize); + } + +-#ifdef QEMU_MSG_ZEROCOPY + if (flags & QIO_CHANNEL_WRITE_FLAG_ZERO_COPY) { ++#ifdef QEMU_MSG_ZEROCOPY + sflags = MSG_ZEROCOPY; +- } ++#else ++ /* ++ * We expect QIOChannel class entry point to have ++ * blocked this code path already ++ */ ++ g_assert_not_reached(); + #endif ++ } + + retry: + ret = sendmsg(sioc->fd, &msg, sflags); +@@ -593,15 +599,13 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc, + return QIO_CHANNEL_ERR_BLOCK; + case EINTR: + goto retry; +-#ifdef QEMU_MSG_ZEROCOPY + case ENOBUFS: +- if (sflags & MSG_ZEROCOPY) { ++ if (flags & QIO_CHANNEL_WRITE_FLAG_ZERO_COPY) { + error_setg_errno(errp, errno, + "Process can't lock enough memory for using MSG_ZEROCOPY"); + return -1; + } + break; +-#endif + } + + error_setg_errno(errp, errno, +-- +2.35.3 + diff --git a/SOURCES/kvm-Revert-redhat-Add-some-devices-for-exporting-upstrea.patch b/SOURCES/kvm-Revert-redhat-Add-some-devices-for-exporting-upstrea.patch new file mode 100644 index 0000000..2aaef33 --- /dev/null +++ b/SOURCES/kvm-Revert-redhat-Add-some-devices-for-exporting-upstrea.patch @@ -0,0 +1,128 @@ +From 96edd15df257f1d1496397a6fac24b4316570d7e Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Thu, 14 Apr 2022 16:45:30 -0400 +Subject: [PATCH 1/3] Revert redhat: Add some devices for exporting upstream + machine types + +RH-Author: Jon Maloy +RH-MergeRequest: 156: Revert redhat: Add some devices for exporting upstream machine types +RH-Commit: [1/1] f25d0da3a181136917ead82f5a5c59efe3fa445a (jmaloy/qemu-kvm) +RH-Bugzilla: 2065043 +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: Thomas Huth +RH-Acked-by: Peter Xu + +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2065043 +Upstream: no + +Manual revert of commit 70d3924521c9bfd912bcf1a1fc76f49eb377de46, since +the directory structure looks different from rhel-av-8.4.0.z where +this commit is taken from. Besides, x86_64-softmmu.mak looks totally +different and should not be affected by this reversal. + +Signed-off-by: Jon Maloy +--- + configs/devices/x86_64-softmmu/x86_64-rh-devices.mak | 1 - + .../devices/x86_64-softmmu/x86_64-upstream-devices.mak | 4 ---- + hw/char/parallel.c | 9 --------- + hw/i386/pc_piix.c | 2 +- + hw/i386/pc_q35.c | 2 +- + hw/timer/hpet.c | 8 -------- + 6 files changed, 2 insertions(+), 24 deletions(-) + delete mode 100644 configs/devices/x86_64-softmmu/x86_64-upstream-devices.mak + +diff --git a/configs/devices/x86_64-softmmu/x86_64-rh-devices.mak b/configs/devices/x86_64-softmmu/x86_64-rh-devices.mak +index fdbbdf9742..31ce08edab 100644 +--- a/configs/devices/x86_64-softmmu/x86_64-rh-devices.mak ++++ b/configs/devices/x86_64-softmmu/x86_64-rh-devices.mak +@@ -1,5 +1,4 @@ + include ../rh-virtio.mak +-include x86_64-upstream-devices.mak + + CONFIG_AC97=y + CONFIG_ACPI=y +diff --git a/configs/devices/x86_64-softmmu/x86_64-upstream-devices.mak b/configs/devices/x86_64-softmmu/x86_64-upstream-devices.mak +deleted file mode 100644 +index 2cd20f54d2..0000000000 +--- a/configs/devices/x86_64-softmmu/x86_64-upstream-devices.mak ++++ /dev/null +@@ -1,4 +0,0 @@ +-# We need "isa-parallel" +-CONFIG_PARALLEL=y +-# We need "hpet" +-CONFIG_HPET=y +diff --git a/hw/char/parallel.c b/hw/char/parallel.c +index e5f108211b..b45e67bfbb 100644 +--- a/hw/char/parallel.c ++++ b/hw/char/parallel.c +@@ -29,7 +29,6 @@ + #include "chardev/char-parallel.h" + #include "chardev/char-fe.h" + #include "hw/acpi/aml-build.h" +-#include "hw/boards.h" + #include "hw/irq.h" + #include "hw/isa/isa.h" + #include "hw/qdev-properties.h" +@@ -535,14 +534,6 @@ static void parallel_isa_realizefn(DeviceState *dev, Error **errp) + int base; + uint8_t dummy; + +- /* Restricted for Red Hat Enterprise Linux */ +- MachineClass *mc = MACHINE_GET_CLASS(qdev_get_machine()); +- if (strstr(mc->name, "rhel")) { +- error_setg(errp, "Device %s is not supported with machine type %s", +- object_get_typename(OBJECT(dev)), mc->name); +- return; +- } +- + if (!qemu_chr_fe_backend_connected(&s->chr)) { + error_setg(errp, "Can't create parallel device, empty char device"); + return; +diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c +index ab6d03e07a..5f101c8748 100644 +--- a/hw/i386/pc_piix.c ++++ b/hw/i386/pc_piix.c +@@ -966,7 +966,7 @@ static void pc_machine_rhel7_options(MachineClass *m) + { + PCMachineClass *pcmc = PC_MACHINE_CLASS(m); + m->family = "pc_piix_Y"; +- m->default_machine_opts = "firmware=bios-256k.bin,hpet=off"; ++ m->default_machine_opts = "firmware=bios-256k.bin"; + pcmc->default_nic_model = "e1000"; + pcmc->pci_root_uid = 0; + m->default_display = "std"; +diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c +index 882fe7a68d..73b0d0d317 100644 +--- a/hw/i386/pc_q35.c ++++ b/hw/i386/pc_q35.c +@@ -633,7 +633,7 @@ static void pc_q35_machine_rhel_options(MachineClass *m) + pcmc->pci_root_uid = 0; + m->family = "pc_q35_Z"; + m->units_per_default_bus = 1; +- m->default_machine_opts = "firmware=bios-256k.bin,hpet=off"; ++ m->default_machine_opts = "firmware=bios-256k.bin"; + m->default_display = "std"; + m->no_floppy = 1; + m->no_parallel = 1; +diff --git a/hw/timer/hpet.c b/hw/timer/hpet.c +index 202e032524..9520471be2 100644 +--- a/hw/timer/hpet.c ++++ b/hw/timer/hpet.c +@@ -733,14 +733,6 @@ static void hpet_realize(DeviceState *dev, Error **errp) + int i; + HPETTimer *timer; + +- /* Restricted for Red Hat Enterprise Linux */ +- MachineClass *mc = MACHINE_GET_CLASS(qdev_get_machine()); +- if (strstr(mc->name, "rhel")) { +- error_setg(errp, "Device %s is not supported with machine type %s", +- object_get_typename(OBJECT(dev)), mc->name); +- return; +- } +- + if (!s->intcap) { + warn_report("Hpet's intcap not initialized"); + } +-- +2.35.1 + diff --git a/SOURCES/kvm-block-Make-bdrv_refresh_limits-non-recursive.patch b/SOURCES/kvm-block-Make-bdrv_refresh_limits-non-recursive.patch new file mode 100644 index 0000000..7ff8e7e --- /dev/null +++ b/SOURCES/kvm-block-Make-bdrv_refresh_limits-non-recursive.patch @@ -0,0 +1,78 @@ +From 6348063b91b2370cc27153fd58fd11a6681631f6 Mon Sep 17 00:00:00 2001 +From: Hanna Reitz +Date: Wed, 16 Feb 2022 11:53:53 +0100 +Subject: [PATCH 22/24] block: Make bdrv_refresh_limits() non-recursive +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Hanna Reitz +RH-MergeRequest: 189: block: Make bdrv_refresh_limits() non-recursive +RH-Commit: [1/3] 1a1fe37f8d8f0344dd8639d6cc9d884d1aff9096 +RH-Bugzilla: 2072932 +RH-Acked-by: Eric Blake +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Kevin Wolf + +bdrv_refresh_limits() recurses down to the node's children. That does +not seem necessary: When we refresh limits on some node, and then +recurse down and were to change one of its children's BlockLimits, then +that would mean we noticed the changed limits by pure chance. The fact +that we refresh the parent's limits has nothing to do with it, so the +reason for the change probably happened before this point in time, and +we should have refreshed the limits then. + +Consequently, we should actually propagate block limits changes upwards, +not downwards.  That is a separate and pre-existing issue, though, and +so will not be addressed in this patch. + +The problem with recursing is that bdrv_refresh_limits() is not atomic. +It begins with zeroing BDS.bl, and only then sets proper, valid limits. +If we do not drain all nodes whose limits are refreshed, then concurrent +I/O requests can encounter invalid request_alignment values and crash +qemu. Therefore, a recursing bdrv_refresh_limits() requires the whole +subtree to be drained, which is currently not ensured by most callers. + +A non-recursive bdrv_refresh_limits() only requires the node in question +to not receive I/O requests, and this is done by most callers in some +way or another: +- bdrv_open_driver() deals with a new node with no parents yet +- bdrv_set_file_or_backing_noperm() acts on a drained node +- bdrv_reopen_commit() acts only on drained nodes +- bdrv_append() should in theory require the node to be drained; in + practice most callers just lock the AioContext, which should at least + be enough to prevent concurrent I/O requests from accessing invalid + limits + +So we can resolve the bug by making bdrv_refresh_limits() non-recursive. + +Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=1879437 +Signed-off-by: Hanna Reitz +Reviewed-by: Eric Blake +Message-Id: <20220216105355.30729-2-hreitz@redhat.com> +Reviewed-by: Stefan Hajnoczi +Signed-off-by: Kevin Wolf +(cherry picked from commit 4d378bbd831bdd2f6e6adcd4ea5b77b6effaa627) +Signed-off-by: Hanna Reitz +--- + block/io.c | 4 ---- + 1 file changed, 4 deletions(-) + +diff --git a/block/io.c b/block/io.c +index 4e4cb556c5..c3e7301613 100644 +--- a/block/io.c ++++ b/block/io.c +@@ -189,10 +189,6 @@ void bdrv_refresh_limits(BlockDriverState *bs, Transaction *tran, Error **errp) + QLIST_FOREACH(c, &bs->children, next) { + if (c->role & (BDRV_CHILD_DATA | BDRV_CHILD_FILTERED | BDRV_CHILD_COW)) + { +- bdrv_refresh_limits(c->bs, tran, errp); +- if (*errp) { +- return; +- } + bdrv_merge_limits(&bs->bl, &c->bs->bl); + have_limits = true; + } +-- +2.35.3 + diff --git a/SOURCES/kvm-hw-block-fdc-Prevent-end-of-track-overrun-CVE-2021-3.patch b/SOURCES/kvm-hw-block-fdc-Prevent-end-of-track-overrun-CVE-2021-3.patch new file mode 100644 index 0000000..01e4097 --- /dev/null +++ b/SOURCES/kvm-hw-block-fdc-Prevent-end-of-track-overrun-CVE-2021-3.patch @@ -0,0 +1,97 @@ +From fe4abbda80eea7f65b6b5cc544a806fb6e064917 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= +Date: Thu, 18 Nov 2021 12:57:32 +0100 +Subject: [PATCH 2/3] hw/block/fdc: Prevent end-of-track overrun + (CVE-2021-3507) +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jon Maloy +RH-MergeRequest: 194: hw/block/fdc: Prevent end-of-track overrun (CVE-2021-3507) +RH-Commit: [1/2] 31fa0351382b4ca5bd989b09e4d811ae73040673 (jmaloy/qemu-kvm) +RH-Bugzilla: 1951521 +RH-Acked-by: Dr. David Alan Gilbert +RH-Acked-by: Thomas Huth +RH-Acked-by: Hanna Reitz + +Per the 82078 datasheet, if the end-of-track (EOT byte in +the FIFO) is more than the number of sectors per side, the +command is terminated unsuccessfully: + +* 5.2.5 DATA TRANSFER TERMINATION + + The 82078 supports terminal count explicitly through + the TC pin and implicitly through the underrun/over- + run and end-of-track (EOT) functions. For full sector + transfers, the EOT parameter can define the last + sector to be transferred in a single or multisector + transfer. If the last sector to be transferred is a par- + tial sector, the host can stop transferring the data in + mid-sector, and the 82078 will continue to complete + the sector as if a hardware TC was received. The + only difference between these implicit functions and + TC is that they return "abnormal termination" result + status. Such status indications can be ignored if they + were expected. + +* 6.1.3 READ TRACK + + This command terminates when the EOT specified + number of sectors have been read. If the 82078 + does not find an I D Address Mark on the diskette + after the second· occurrence of a pulse on the + INDX# pin, then it sets the IC code in Status Regis- + ter 0 to "01" (Abnormal termination), sets the MA bit + in Status Register 1 to "1", and terminates the com- + mand. + +* 6.1.6 VERIFY + + Refer to Table 6-6 and Table 6-7 for information + concerning the values of MT and EC versus SC and + EOT value. + +* Table 6·6. Result Phase Table + +* Table 6-7. Verify Command Result Phase Table + +Fix by aborting the transfer when EOT > # Sectors Per Side. + +Cc: qemu-stable@nongnu.org +Cc: Hervé Poussineau +Fixes: baca51faff0 ("floppy driver: disk geometry auto detect") +Reported-by: Alexander Bulekov +Resolves: https://gitlab.com/qemu-project/qemu/-/issues/339 +Signed-off-by: Philippe Mathieu-Daudé +Message-Id: <20211118115733.4038610-2-philmd@redhat.com> +Reviewed-by: Hanna Reitz +Signed-off-by: Kevin Wolf +(cherry picked from commit defac5e2fbddf8423a354ff0454283a2115e1367) +Signed-off-by: Jon Maloy +--- + hw/block/fdc.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +diff --git a/hw/block/fdc.c b/hw/block/fdc.c +index 97fa6de423..755a26c114 100644 +--- a/hw/block/fdc.c ++++ b/hw/block/fdc.c +@@ -1531,6 +1531,14 @@ static void fdctrl_start_transfer(FDCtrl *fdctrl, int direction) + int tmp; + fdctrl->data_len = 128 << (fdctrl->fifo[5] > 7 ? 7 : fdctrl->fifo[5]); + tmp = (fdctrl->fifo[6] - ks + 1); ++ if (tmp < 0) { ++ FLOPPY_DPRINTF("invalid EOT: %d\n", tmp); ++ fdctrl_stop_transfer(fdctrl, FD_SR0_ABNTERM, FD_SR1_MA, 0x00); ++ fdctrl->fifo[3] = kt; ++ fdctrl->fifo[4] = kh; ++ fdctrl->fifo[5] = ks; ++ return; ++ } + if (fdctrl->fifo[0] & 0x80) + tmp += fdctrl->fifo[6]; + fdctrl->data_len *= tmp; +-- +2.35.3 + diff --git a/SOURCES/kvm-i386-do-kvm_put_msr_feature_control-first-thing-when.patch b/SOURCES/kvm-i386-do-kvm_put_msr_feature_control-first-thing-when.patch new file mode 100644 index 0000000..1c2094e --- /dev/null +++ b/SOURCES/kvm-i386-do-kvm_put_msr_feature_control-first-thing-when.patch @@ -0,0 +1,68 @@ +From 1bd939d374ec2e994ff47c84e16fa3bc1323a0fd Mon Sep 17 00:00:00 2001 +From: Vitaly Kuznetsov +Date: Thu, 18 Aug 2022 17:01:13 +0200 +Subject: [PATCH 2/2] i386: do kvm_put_msr_feature_control() first thing when + vCPU is reset + +RH-Author: Vitaly Kuznetsov +RH-MergeRequest: 216: i386: fix 'system_reset' when the VM is in VMX root operation +RH-Bugzilla: 2116743 +RH-Acked-by: Emanuele Giuseppe Esposito +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: Peter Xu +RH-Commit: [2/2] f838a57f74487eb394794de00006d5d2b9e84344 + +kvm_put_sregs2() fails to reset 'locked' CR4/CR0 bits upon vCPU reset when +it is in VMX root operation. Do kvm_put_msr_feature_control() before +kvm_put_sregs2() to (possibly) kick vCPU out of VMX root operation. It also +seems logical to do kvm_put_msr_feature_control() before +kvm_put_nested_state() and not after it, especially when 'real' nested +state is set. + +Signed-off-by: Vitaly Kuznetsov +Message-Id: <20220818150113.479917-3-vkuznets@redhat.com> +Signed-off-by: Paolo Bonzini +(cherry picked from commit 45ed68a1a3a19754ade954d75a3c9d13ff560e5c) +Signed-off-by: Vitaly Kuznetsov +--- + target/i386/kvm/kvm.c | 17 ++++++++++++----- + 1 file changed, 12 insertions(+), 5 deletions(-) + +diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c +index 81d729dc40..a06221d3e5 100644 +--- a/target/i386/kvm/kvm.c ++++ b/target/i386/kvm/kvm.c +@@ -4255,6 +4255,18 @@ int kvm_arch_put_registers(CPUState *cpu, int level) + + assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu)); + ++ /* ++ * Put MSR_IA32_FEATURE_CONTROL first, this ensures the VM gets out of VMX ++ * root operation upon vCPU reset. kvm_put_msr_feature_control() should also ++ * preceed kvm_put_nested_state() when 'real' nested state is set. ++ */ ++ if (level >= KVM_PUT_RESET_STATE) { ++ ret = kvm_put_msr_feature_control(x86_cpu); ++ if (ret < 0) { ++ return ret; ++ } ++ } ++ + /* must be before kvm_put_nested_state so that EFER.SVME is set */ + ret = kvm_put_sregs(x86_cpu); + if (ret < 0) { +@@ -4266,11 +4278,6 @@ int kvm_arch_put_registers(CPUState *cpu, int level) + if (ret < 0) { + return ret; + } +- +- ret = kvm_put_msr_feature_control(x86_cpu); +- if (ret < 0) { +- return ret; +- } + } + + if (level == KVM_PUT_FULL_STATE) { +-- +2.31.1 + diff --git a/SOURCES/kvm-i386-reset-KVM-nested-state-upon-CPU-reset.patch b/SOURCES/kvm-i386-reset-KVM-nested-state-upon-CPU-reset.patch new file mode 100644 index 0000000..ac5b579 --- /dev/null +++ b/SOURCES/kvm-i386-reset-KVM-nested-state-upon-CPU-reset.patch @@ -0,0 +1,95 @@ +From 4ad00e318f8afbee0e455cfbb6bc693c808d87f3 Mon Sep 17 00:00:00 2001 +From: Vitaly Kuznetsov +Date: Thu, 18 Aug 2022 17:01:12 +0200 +Subject: [PATCH 1/2] i386: reset KVM nested state upon CPU reset + +RH-Author: Vitaly Kuznetsov +RH-MergeRequest: 216: i386: fix 'system_reset' when the VM is in VMX root operation +RH-Bugzilla: 2116743 +RH-Acked-by: Emanuele Giuseppe Esposito +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: Peter Xu +RH-Commit: [1/2] 20d2dabeda74b8cd5135228980a2414e66dc64f3 + +Make sure env->nested_state is cleaned up when a vCPU is reset, it may +be stale after an incoming migration, kvm_arch_put_registers() may +end up failing or putting vCPU in a weird state. + +Reviewed-by: Maxim Levitsky +Signed-off-by: Vitaly Kuznetsov +Message-Id: <20220818150113.479917-2-vkuznets@redhat.com> +Signed-off-by: Paolo Bonzini +(cherry picked from commit 3cafdb67504a34a0305260f0c86a73d5a3fb000b) +Signed-off-by: Vitaly Kuznetsov +--- + target/i386/kvm/kvm.c | 37 +++++++++++++++++++++++++++---------- + 1 file changed, 27 insertions(+), 10 deletions(-) + +diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c +index bd439e56ad..81d729dc40 100644 +--- a/target/i386/kvm/kvm.c ++++ b/target/i386/kvm/kvm.c +@@ -1615,6 +1615,30 @@ static void kvm_init_xsave(CPUX86State *env) + env->xsave_buf_len); + } + ++static void kvm_init_nested_state(CPUX86State *env) ++{ ++ struct kvm_vmx_nested_state_hdr *vmx_hdr; ++ uint32_t size; ++ ++ if (!env->nested_state) { ++ return; ++ } ++ ++ size = env->nested_state->size; ++ ++ memset(env->nested_state, 0, size); ++ env->nested_state->size = size; ++ ++ if (cpu_has_vmx(env)) { ++ env->nested_state->format = KVM_STATE_NESTED_FORMAT_VMX; ++ vmx_hdr = &env->nested_state->hdr.vmx; ++ vmx_hdr->vmxon_pa = -1ull; ++ vmx_hdr->vmcs12_pa = -1ull; ++ } else if (cpu_has_svm(env)) { ++ env->nested_state->format = KVM_STATE_NESTED_FORMAT_SVM; ++ } ++} ++ + int kvm_arch_init_vcpu(CPUState *cs) + { + struct { +@@ -2042,19 +2066,10 @@ int kvm_arch_init_vcpu(CPUState *cs) + assert(max_nested_state_len >= offsetof(struct kvm_nested_state, data)); + + if (cpu_has_vmx(env) || cpu_has_svm(env)) { +- struct kvm_vmx_nested_state_hdr *vmx_hdr; +- + env->nested_state = g_malloc0(max_nested_state_len); + env->nested_state->size = max_nested_state_len; + +- if (cpu_has_vmx(env)) { +- env->nested_state->format = KVM_STATE_NESTED_FORMAT_VMX; +- vmx_hdr = &env->nested_state->hdr.vmx; +- vmx_hdr->vmxon_pa = -1ull; +- vmx_hdr->vmcs12_pa = -1ull; +- } else { +- env->nested_state->format = KVM_STATE_NESTED_FORMAT_SVM; +- } ++ kvm_init_nested_state(env); + } + } + +@@ -2117,6 +2132,8 @@ void kvm_arch_reset_vcpu(X86CPU *cpu) + /* enabled by default */ + env->poll_control_msr = 1; + ++ kvm_init_nested_state(env); ++ + sev_es_set_reset_vector(CPU(cpu)); + } + +-- +2.31.1 + diff --git a/SOURCES/kvm-ide-Increment-BB-in-flight-counter-for-TRIM-BH.patch b/SOURCES/kvm-ide-Increment-BB-in-flight-counter-for-TRIM-BH.patch new file mode 100644 index 0000000..6af2a9f --- /dev/null +++ b/SOURCES/kvm-ide-Increment-BB-in-flight-counter-for-TRIM-BH.patch @@ -0,0 +1,92 @@ +From eaade87072e903cf550dfdb8ed1480dddc6bb0e3 Mon Sep 17 00:00:00 2001 +From: Hanna Reitz +Date: Thu, 20 Jan 2022 15:22:59 +0100 +Subject: [PATCH 21/24] ide: Increment BB in-flight counter for TRIM BH +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Hanna Reitz +RH-MergeRequest: 188: ide: Increment BB in-flight counter for TRIM BH +RH-Commit: [1/1] 1e702e735ff63f2b8b69c20cac1b309dd085cd62 +RH-Bugzilla: 2029980 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Kevin Wolf +RH-Acked-by: Paolo Bonzini + +When we still have an AIOCB registered for DMA operations, we try to +settle the respective operation by draining the BlockBackend associated +with the IDE device. + +However, this assumes that every DMA operation is associated with an +increment of the BlockBackend’s in-flight counter (e.g. through some +ongoing I/O operation), so that draining the BB until its in-flight +counter reaches 0 will settle all DMA operations. That is not the case: +For TRIM, the guest can issue a zero-length operation that will not +result in any I/O operation forwarded to the BlockBackend, and also not +increment the in-flight counter in any other way. In such a case, +blk_drain() will be a no-op if no other operations are in flight. + +It is clear that if blk_drain() is a no-op, the value of +s->bus->dma->aiocb will not change between checking it in the `if` +condition and asserting that it is NULL after blk_drain(). + +The particular problem is that ide_issue_trim() creates a BH +(ide_trim_bh_cb()) to settle the TRIM request: iocb->common.cb() is +ide_dma_cb(), which will either create a new request, or find the +transfer to be done and call ide_set_inactive(), which clears +s->bus->dma->aiocb. Therefore, the blk_drain() must wait for +ide_trim_bh_cb() to run, which currently it will not always do. + +To fix this issue, we increment the BlockBackend's in-flight counter +when the TRIM operation begins (in ide_issue_trim(), when the +ide_trim_bh_cb() BH is created) and decrement it when ide_trim_bh_cb() +is done. + +Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=2029980 +Suggested-by: Paolo Bonzini +Signed-off-by: Hanna Reitz +Message-Id: <20220120142259.120189-1-hreitz@redhat.com> +Reviewed-by: Paolo Bonzini +Reviewed-by: John Snow +Tested-by: John Snow +(cherry picked from commit 7e5cdb345f77d76cb4877fe6230c4e17a7d0d0ca) +Signed-off-by: Hanna Reitz +--- + hw/ide/core.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +diff --git a/hw/ide/core.c b/hw/ide/core.c +index e28f8aad61..15138225be 100644 +--- a/hw/ide/core.c ++++ b/hw/ide/core.c +@@ -433,12 +433,16 @@ static const AIOCBInfo trim_aiocb_info = { + static void ide_trim_bh_cb(void *opaque) + { + TrimAIOCB *iocb = opaque; ++ BlockBackend *blk = iocb->s->blk; + + iocb->common.cb(iocb->common.opaque, iocb->ret); + + qemu_bh_delete(iocb->bh); + iocb->bh = NULL; + qemu_aio_unref(iocb); ++ ++ /* Paired with an increment in ide_issue_trim() */ ++ blk_dec_in_flight(blk); + } + + static void ide_issue_trim_cb(void *opaque, int ret) +@@ -508,6 +512,9 @@ BlockAIOCB *ide_issue_trim( + IDEState *s = opaque; + TrimAIOCB *iocb; + ++ /* Paired with a decrement in ide_trim_bh_cb() */ ++ blk_inc_in_flight(s->blk); ++ + iocb = blk_aio_get(&trim_aiocb_info, s->blk, cb, cb_opaque); + iocb->s = s; + iocb->bh = qemu_bh_new(ide_trim_bh_cb, iocb); +-- +2.35.3 + diff --git a/SOURCES/kvm-iotests-108-Fix-when-missing-user_allow_other.patch b/SOURCES/kvm-iotests-108-Fix-when-missing-user_allow_other.patch new file mode 100644 index 0000000..0d652dd --- /dev/null +++ b/SOURCES/kvm-iotests-108-Fix-when-missing-user_allow_other.patch @@ -0,0 +1,52 @@ +From 676e19198916d7631ba1367646dd08dc72079f88 Mon Sep 17 00:00:00 2001 +From: Hanna Reitz +Date: Thu, 21 Apr 2022 16:24:35 +0200 +Subject: [PATCH 6/6] iotests/108: Fix when missing user_allow_other + +RH-Author: Hanna Reitz +RH-MergeRequest: 171: qcow2: Improve refcount structure rebuilding +RH-Commit: [4/4] 36b70b5378ae7c8084b9e847706f00003abe9c11 +RH-Bugzilla: 1519071 +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Eric Blake + +FUSE exports' allow-other option defaults to "auto", which means that it +will try passing allow_other as a mount option, and fall back to not +using it when an error occurs. We make no effort to hide fusermount's +error message (because it would be difficult, and because users might +want to know about the fallback occurring), and so when allow_other does +not work (primarily when /etc/fuse.conf does not contain +user_allow_other), this error message will appear and break the +reference output. + +We do not need allow_other here, though, so we can just pass +allow-other=off to fix that. + +Reported-by: Markus Armbruster +Signed-off-by: Hanna Reitz +Message-Id: <20220421142435.569600-1-hreitz@redhat.com> +Tested-by: Markus Armbruster +Tested-by: Eric Blake +(cherry picked from commit 348a0740afc5b313599533eb69bbb2b95d2f1bba) +Signed-off-by: Hanna Reitz +--- + tests/qemu-iotests/108 | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tests/qemu-iotests/108 b/tests/qemu-iotests/108 +index 23abbeaff0..775ff08eca 100755 +--- a/tests/qemu-iotests/108 ++++ b/tests/qemu-iotests/108 +@@ -326,7 +326,7 @@ else + + $QSD \ + --blockdev file,node-name=export-node,filename="$TEST_IMG" \ +- --export fuse,id=fuse-export,node-name=export-node,mountpoint="$export_mp",writable=on,growable=off \ ++ --export fuse,id=fuse-export,node-name=export-node,mountpoint="$export_mp",writable=on,growable=off,allow-other=off \ + --pidfile "$TEST_DIR/qsd.pid" \ + & + +-- +2.27.0 + diff --git a/SOURCES/kvm-iotests-108-Test-new-refcount-rebuild-algorithm.patch b/SOURCES/kvm-iotests-108-Test-new-refcount-rebuild-algorithm.patch new file mode 100644 index 0000000..cc67d7c --- /dev/null +++ b/SOURCES/kvm-iotests-108-Test-new-refcount-rebuild-algorithm.patch @@ -0,0 +1,445 @@ +From d638552d76db0db9e2b6ae90a35f0b451b0cbaf8 Mon Sep 17 00:00:00 2001 +From: Hanna Reitz +Date: Tue, 5 Apr 2022 15:46:51 +0200 +Subject: [PATCH 4/6] iotests/108: Test new refcount rebuild algorithm + +RH-Author: Hanna Reitz +RH-MergeRequest: 171: qcow2: Improve refcount structure rebuilding +RH-Commit: [2/4] 2aa8c383f0c88c414f10ade8bd2e8af07c35f35b +RH-Bugzilla: 1519071 +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Eric Blake + +One clear problem with how qcow2's refcount structure rebuild algorithm +used to be before "qcow2: Improve refcount structure rebuilding" was +that it is prone to failure for qcow2 images on block devices: There is +generally unused space after the actual image, and if that exceeds what +one refblock covers, the old algorithm would invariably write the +reftable past the block device's end, which cannot work. The new +algorithm does not have this problem. + +Test it with three tests: +(1) Create an image with more empty space at the end than what one + refblock covers, see whether rebuilding the refcount structures + results in a change in the image file length. (It should not.) + +(2) Leave precisely enough space somewhere at the beginning of the image + for the new reftable (and the refblock for that place), see whether + the new algorithm puts the reftable there. (It should.) + +(3) Test the original problem: Create (something like) a block device + with a fixed size, then create a qcow2 image in there, write some + data, and then have qemu-img check rebuild the refcount structures. + Before HEAD^, the reftable would have been written past the image + file end, i.e. outside of what the block device provides, which + cannot work. HEAD^ should have fixed that. + ("Something like a block device" means a loop device if we can use + one ("sudo -n losetup" works), or a FUSE block export with + growable=false otherwise.) + +Reviewed-by: Eric Blake +Signed-off-by: Hanna Reitz +Message-Id: <20220405134652.19278-3-hreitz@redhat.com> +(cherry picked from commit 9ffd6d646d1d5ee9087a8cbf0b7d2f96c5656162) + +Conflicts: +- 108: The downstream qemu-storage-daemon does not support --daemonize, + so this switch has been replaced by a loop waiting for the PID file to + appear + +Signed-off-by: Hanna Reitz +--- + tests/qemu-iotests/108 | 263 ++++++++++++++++++++++++++++++++++++- + tests/qemu-iotests/108.out | 81 ++++++++++++ + 2 files changed, 343 insertions(+), 1 deletion(-) + +diff --git a/tests/qemu-iotests/108 b/tests/qemu-iotests/108 +index 8eaef0b8bf..23abbeaff0 100755 +--- a/tests/qemu-iotests/108 ++++ b/tests/qemu-iotests/108 +@@ -30,13 +30,20 @@ status=1 # failure is the default! + + _cleanup() + { +- _cleanup_test_img ++ _cleanup_test_img ++ if [ -f "$TEST_DIR/qsd.pid" ]; then ++ qsd_pid=$(cat "$TEST_DIR/qsd.pid") ++ kill -KILL "$qsd_pid" ++ fusermount -u "$TEST_DIR/fuse-export" &>/dev/null ++ fi ++ rm -f "$TEST_DIR/fuse-export" + } + trap "_cleanup; exit \$status" 0 1 2 3 15 + + # get standard environment, filters and checks + . ./common.rc + . ./common.filter ++. ./common.qemu + + # This tests qcow2-specific low-level functionality + _supported_fmt qcow2 +@@ -47,6 +54,22 @@ _supported_os Linux + # files + _unsupported_imgopts 'refcount_bits=\([^1]\|.\([^6]\|$\)\)' data_file + ++# This test either needs sudo -n losetup or FUSE exports to work ++if sudo -n losetup &>/dev/null; then ++ loopdev=true ++else ++ loopdev=false ++ ++ # QSD --export fuse will either yield "Parameter 'id' is missing" ++ # or "Invalid parameter 'fuse'", depending on whether there is ++ # FUSE support or not. ++ error=$($QSD --export fuse 2>&1) ++ if [[ $error = *"'fuse'"* ]]; then ++ _notrun 'Passwordless sudo for losetup or FUSE support required, but' \ ++ 'neither is available' ++ fi ++fi ++ + echo + echo '=== Repairing an image without any refcount table ===' + echo +@@ -138,6 +161,244 @@ _make_test_img 64M + poke_file "$TEST_IMG" $((0x10008)) "\xff\xff\xff\xff\xff\xff\x00\x00" + _check_test_img -r all + ++echo ++echo '=== Check rebuilt reftable location ===' ++ ++# In an earlier version of the refcount rebuild algorithm, the ++# reftable was generally placed at the image end (unless something was ++# allocated in the area covered by the refblock right before the image ++# file end, then we would try to place the reftable in that refblock). ++# This was later changed so the reftable would be placed in the ++# earliest possible location. Test this. ++ ++echo ++echo '--- Does the image size increase? ---' ++echo ++ ++# First test: Just create some image, write some data to it, and ++# resize it so there is free space at the end of the image (enough ++# that it spans at least one full refblock, which for cluster_size=512 ++# images, spans 128k). With the old algorithm, the reftable would ++# have then been placed at the end of the image file, but with the new ++# one, it will be put in that free space. ++# We want to check whether the size of the image file increases due to ++# rebuilding the refcount structures (it should not). ++ ++_make_test_img -o 'cluster_size=512' 1M ++# Write something ++$QEMU_IO -c 'write 0 64k' "$TEST_IMG" | _filter_qemu_io ++ ++# Add free space ++file_len=$(stat -c '%s' "$TEST_IMG") ++truncate -s $((file_len + 256 * 1024)) "$TEST_IMG" ++ ++# Corrupt the image by saying the image header was not allocated ++rt_offset=$(peek_file_be "$TEST_IMG" 48 8) ++rb_offset=$(peek_file_be "$TEST_IMG" $rt_offset 8) ++poke_file "$TEST_IMG" $rb_offset "\x00\x00" ++ ++# Check whether rebuilding the refcount structures increases the image ++# file size ++file_len=$(stat -c '%s' "$TEST_IMG") ++echo ++# The only leaks there can be are the old refcount structures that are ++# leaked during rebuilding, no need to clutter the output with them ++_check_test_img -r all | grep -v '^Repairing cluster.*refcount=1 reference=0' ++echo ++post_repair_file_len=$(stat -c '%s' "$TEST_IMG") ++ ++if [[ $file_len -eq $post_repair_file_len ]]; then ++ echo 'OK: Image size did not change' ++else ++ echo 'ERROR: Image size differs' \ ++ "($file_len before, $post_repair_file_len after)" ++fi ++ ++echo ++echo '--- Will the reftable occupy a hole specifically left for it? ---' ++echo ++ ++# Note: With cluster_size=512, every refblock covers 128k. ++# The reftable covers 8M per reftable cluster. ++ ++# Create an image that requires two reftable clusters (just because ++# this is more interesting than a single-clustered reftable). ++_make_test_img -o 'cluster_size=512' 9M ++$QEMU_IO -c 'write 0 8M' "$TEST_IMG" | _filter_qemu_io ++ ++# Writing 8M will have resized the reftable. Unfortunately, doing so ++# will leave holes in the file, so we need to fill them up so we can ++# be sure the whole file is allocated. Do that by writing ++# consecutively smaller chunks starting from 8 MB, until the file ++# length increases even with a chunk size of 512. Then we must have ++# filled all holes. ++ofs=$((8 * 1024 * 1024)) ++block_len=$((16 * 1024)) ++while [[ $block_len -ge 512 ]]; do ++ file_len=$(stat -c '%s' "$TEST_IMG") ++ while [[ $(stat -c '%s' "$TEST_IMG") -eq $file_len ]]; do ++ # Do not include this in the reference output, it does not ++ # really matter which qemu-io calls we do here exactly ++ $QEMU_IO -c "write $ofs $block_len" "$TEST_IMG" >/dev/null ++ ofs=$((ofs + block_len)) ++ done ++ block_len=$((block_len / 2)) ++done ++ ++# Fill up to 9M (do not include this in the reference output either, ++# $ofs is random for all we know) ++$QEMU_IO -c "write $ofs $((9 * 1024 * 1024 - ofs))" "$TEST_IMG" >/dev/null ++ ++# Make space as follows: ++# - For the first refblock: Right at the beginning of the image (this ++# refblock is placed in the first place possible), ++# - For the reftable somewhere soon afterwards, still near the ++# beginning of the image (i.e. covered by the first refblock); the ++# reftable too is placed in the first place possible, but only after ++# all refblocks have been placed) ++# No space is needed for the other refblocks, because no refblock is ++# put before the space it covers. In this test case, we do not mind ++# if they are placed at the image file's end. ++ ++# Before we make that space, we have to find out the host offset of ++# the area that belonged to the two data clusters at guest offset 4k, ++# because we expect the reftable to be placed there, and we will have ++# to verify that it is. ++ ++l1_offset=$(peek_file_be "$TEST_IMG" 40 8) ++l2_offset=$(peek_file_be "$TEST_IMG" $l1_offset 8) ++l2_offset=$((l2_offset & 0x00fffffffffffe00)) ++data_4k_offset=$(peek_file_be "$TEST_IMG" \ ++ $((l2_offset + 4096 / 512 * 8)) 8) ++data_4k_offset=$((data_4k_offset & 0x00fffffffffffe00)) ++ ++$QEMU_IO -c "discard 0 512" -c "discard 4k 1k" "$TEST_IMG" | _filter_qemu_io ++ ++# Corrupt the image by saying the image header was not allocated ++rt_offset=$(peek_file_be "$TEST_IMG" 48 8) ++rb_offset=$(peek_file_be "$TEST_IMG" $rt_offset 8) ++poke_file "$TEST_IMG" $rb_offset "\x00\x00" ++ ++echo ++# The only leaks there can be are the old refcount structures that are ++# leaked during rebuilding, no need to clutter the output with them ++_check_test_img -r all | grep -v '^Repairing cluster.*refcount=1 reference=0' ++echo ++ ++# Check whether the reftable was put where we expected ++rt_offset=$(peek_file_be "$TEST_IMG" 48 8) ++if [[ $rt_offset -eq $data_4k_offset ]]; then ++ echo 'OK: Reftable is where we expect it' ++else ++ echo "ERROR: Reftable is at $rt_offset, but was expected at $data_4k_offset" ++fi ++ ++echo ++echo '--- Rebuilding refcount structures on block devices ---' ++echo ++ ++# A block device cannot really grow, at least not during qemu-img ++# check. As mentioned in the above cases, rebuilding the refcount ++# structure may lead to new refcount structures being written after ++# the end of the image, and in the past that happened even if there ++# was more than sufficient space in the image. Such post-EOF writes ++# will not work on block devices, so test that the new algorithm ++# avoids it. ++ ++# If we have passwordless sudo and losetup, we can use those to create ++# a block device. Otherwise, we can resort to qemu's FUSE export to ++# create a file that isn't growable, which effectively tests the same ++# thing. ++ ++_cleanup_test_img ++truncate -s $((64 * 1024 * 1024)) "$TEST_IMG" ++ ++if $loopdev; then ++ export_mp=$(sudo -n losetup --show -f "$TEST_IMG") ++ export_mp_driver=host_device ++ sudo -n chmod go+rw "$export_mp" ++else ++ # Create non-growable FUSE export that is a bit like an empty ++ # block device ++ export_mp="$TEST_DIR/fuse-export" ++ export_mp_driver=file ++ touch "$export_mp" ++ ++ $QSD \ ++ --blockdev file,node-name=export-node,filename="$TEST_IMG" \ ++ --export fuse,id=fuse-export,node-name=export-node,mountpoint="$export_mp",writable=on,growable=off \ ++ --pidfile "$TEST_DIR/qsd.pid" \ ++ & ++ ++ while [ ! -f "$TEST_DIR/qsd.pid" ]; do ++ sleep 0.1 ++ done ++fi ++ ++# Now create a qcow2 image on the device -- unfortunately, qemu-img ++# create force-creates the file, so we have to resort to the ++# blockdev-create job. ++_launch_qemu \ ++ --blockdev $export_mp_driver,node-name=file,filename="$export_mp" ++ ++_send_qemu_cmd \ ++ $QEMU_HANDLE \ ++ '{ "execute": "qmp_capabilities" }' \ ++ 'return' ++ ++# Small cluster size again, so the image needs multiple refblocks ++_send_qemu_cmd \ ++ $QEMU_HANDLE \ ++ '{ "execute": "blockdev-create", ++ "arguments": { ++ "job-id": "create", ++ "options": { ++ "driver": "qcow2", ++ "file": "file", ++ "size": '$((64 * 1024 * 1024))', ++ "cluster-size": 512 ++ } } }' \ ++ '"concluded"' ++ ++_send_qemu_cmd \ ++ $QEMU_HANDLE \ ++ '{ "execute": "job-dismiss", "arguments": { "id": "create" } }' \ ++ 'return' ++ ++_send_qemu_cmd \ ++ $QEMU_HANDLE \ ++ '{ "execute": "quit" }' \ ++ 'return' ++ ++wait=y _cleanup_qemu ++echo ++ ++# Write some data ++$QEMU_IO -c 'write 0 64k' "$export_mp" | _filter_qemu_io ++ ++# Corrupt the image by saying the image header was not allocated ++rt_offset=$(peek_file_be "$export_mp" 48 8) ++rb_offset=$(peek_file_be "$export_mp" $rt_offset 8) ++poke_file "$export_mp" $rb_offset "\x00\x00" ++ ++# Repairing such a simple case should just work ++# (We used to put the reftable at the end of the image file, which can ++# never work for non-growable devices.) ++echo ++TEST_IMG="$export_mp" _check_test_img -r all \ ++ | grep -v '^Repairing cluster.*refcount=1 reference=0' ++ ++if $loopdev; then ++ sudo -n losetup -d "$export_mp" ++else ++ qsd_pid=$(cat "$TEST_DIR/qsd.pid") ++ kill -TERM "$qsd_pid" ++ # Wait for process to exit (cannot `wait` because the QSD is daemonized) ++ while [ -f "$TEST_DIR/qsd.pid" ]; do ++ true ++ done ++fi ++ + # success, all done + echo '*** done' + rm -f $seq.full +diff --git a/tests/qemu-iotests/108.out b/tests/qemu-iotests/108.out +index 75bab8dc84..b5401d788d 100644 +--- a/tests/qemu-iotests/108.out ++++ b/tests/qemu-iotests/108.out +@@ -105,6 +105,87 @@ The following inconsistencies were found and repaired: + 0 leaked clusters + 1 corruptions + ++Double checking the fixed image now... ++No errors were found on the image. ++ ++=== Check rebuilt reftable location === ++ ++--- Does the image size increase? --- ++ ++Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 ++wrote 65536/65536 bytes at offset 0 ++64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++ERROR cluster 0 refcount=0 reference=1 ++Rebuilding refcount structure ++The following inconsistencies were found and repaired: ++ ++ 0 leaked clusters ++ 1 corruptions ++ ++Double checking the fixed image now... ++No errors were found on the image. ++ ++OK: Image size did not change ++ ++--- Will the reftable occupy a hole specifically left for it? --- ++ ++Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=9437184 ++wrote 8388608/8388608 bytes at offset 0 ++8 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++discard 512/512 bytes at offset 0 ++512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++discard 1024/1024 bytes at offset 4096 ++1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++ERROR cluster 0 refcount=0 reference=1 ++Rebuilding refcount structure ++The following inconsistencies were found and repaired: ++ ++ 0 leaked clusters ++ 1 corruptions ++ ++Double checking the fixed image now... ++No errors were found on the image. ++ ++OK: Reftable is where we expect it ++ ++--- Rebuilding refcount structures on block devices --- ++ ++{ "execute": "qmp_capabilities" } ++{"return": {}} ++{ "execute": "blockdev-create", ++ "arguments": { ++ "job-id": "create", ++ "options": { ++ "driver": "IMGFMT", ++ "file": "file", ++ "size": 67108864, ++ "cluster-size": 512 ++ } } } ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "created", "id": "create"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "running", "id": "create"}} ++{"return": {}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "waiting", "id": "create"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "pending", "id": "create"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "concluded", "id": "create"}} ++{ "execute": "job-dismiss", "arguments": { "id": "create" } } ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "null", "id": "create"}} ++{"return": {}} ++{ "execute": "quit" } ++{"return": {}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false, "reason": "host-qmp-quit"}} ++ ++wrote 65536/65536 bytes at offset 0 ++64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++ERROR cluster 0 refcount=0 reference=1 ++Rebuilding refcount structure ++The following inconsistencies were found and repaired: ++ ++ 0 leaked clusters ++ 1 corruptions ++ + Double checking the fixed image now... + No errors were found on the image. + *** done +-- +2.27.0 + diff --git a/SOURCES/kvm-iotests-Allow-using-QMP-with-the-QSD.patch b/SOURCES/kvm-iotests-Allow-using-QMP-with-the-QSD.patch new file mode 100644 index 0000000..5d45438 --- /dev/null +++ b/SOURCES/kvm-iotests-Allow-using-QMP-with-the-QSD.patch @@ -0,0 +1,99 @@ +From 12f596b66d577eb92f154fadf734d058dd0756d6 Mon Sep 17 00:00:00 2001 +From: Hanna Reitz +Date: Wed, 16 Feb 2022 11:53:54 +0100 +Subject: [PATCH 23/24] iotests: Allow using QMP with the QSD + +RH-Author: Hanna Reitz +RH-MergeRequest: 189: block: Make bdrv_refresh_limits() non-recursive +RH-Commit: [2/3] 55bee4690a2e02d3be9f2bd68f2d244d0a36743b +RH-Bugzilla: 2072932 +RH-Acked-by: Eric Blake +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Kevin Wolf + +Add a parameter to optionally open a QMP connection when creating a +QemuStorageDaemon instance. + +Signed-off-by: Hanna Reitz +Message-Id: <20220216105355.30729-3-hreitz@redhat.com> +Reviewed-by: Eric Blake +Reviewed-by: Stefan Hajnoczi +Signed-off-by: Kevin Wolf +(cherry picked from commit ec88eed8d14088b36a3495710368b8d1a3c33420) +Signed-off-by: Hanna Reitz +--- + tests/qemu-iotests/iotests.py | 32 +++++++++++++++++++++++++++++++- + 1 file changed, 31 insertions(+), 1 deletion(-) + +diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py +index a51b5ce8cd..2ef493755c 100644 +--- a/tests/qemu-iotests/iotests.py ++++ b/tests/qemu-iotests/iotests.py +@@ -38,6 +38,7 @@ + + from qemu.machine import qtest + from qemu.qmp import QMPMessage ++from qemu.aqmp.legacy import QEMUMonitorProtocol + + # Use this logger for logging messages directly from the iotests module + logger = logging.getLogger('qemu.iotests') +@@ -315,14 +316,30 @@ def cmd(self, cmd): + + + class QemuStorageDaemon: +- def __init__(self, *args: str, instance_id: str = 'a'): ++ _qmp: Optional[QEMUMonitorProtocol] = None ++ _qmpsock: Optional[str] = None ++ # Python < 3.8 would complain if this type were not a string literal ++ # (importing `annotations` from `__future__` would work; but not on <= 3.6) ++ _p: 'Optional[subprocess.Popen[bytes]]' = None ++ ++ def __init__(self, *args: str, instance_id: str = 'a', qmp: bool = False): + assert '--pidfile' not in args + self.pidfile = os.path.join(test_dir, f'qsd-{instance_id}-pid') + all_args = [qsd_prog] + list(args) + ['--pidfile', self.pidfile] + ++ if qmp: ++ self._qmpsock = os.path.join(sock_dir, f'qsd-{instance_id}.sock') ++ all_args += ['--chardev', ++ f'socket,id=qmp-sock,path={self._qmpsock}', ++ '--monitor', 'qmp-sock'] ++ ++ self._qmp = QEMUMonitorProtocol(self._qmpsock, server=True) ++ + # Cannot use with here, we want the subprocess to stay around + # pylint: disable=consider-using-with + self._p = subprocess.Popen(all_args) ++ if self._qmp is not None: ++ self._qmp.accept() + while not os.path.exists(self.pidfile): + if self._p.poll() is not None: + cmd = ' '.join(all_args) +@@ -337,11 +354,24 @@ def __init__(self, *args: str, instance_id: str = 'a'): + + assert self._pid == self._p.pid + ++ def qmp(self, cmd: str, args: Optional[Dict[str, object]] = None) \ ++ -> QMPMessage: ++ assert self._qmp is not None ++ return self._qmp.cmd(cmd, args) ++ + def stop(self, kill_signal=15): + self._p.send_signal(kill_signal) + self._p.wait() + self._p = None + ++ if self._qmp: ++ self._qmp.close() ++ ++ if self._qmpsock is not None: ++ try: ++ os.remove(self._qmpsock) ++ except OSError: ++ pass + try: + os.remove(self.pidfile) + except OSError: +-- +2.35.3 + diff --git a/SOURCES/kvm-iotests-graph-changes-while-io-New-test.patch b/SOURCES/kvm-iotests-graph-changes-while-io-New-test.patch new file mode 100644 index 0000000..d40e25e --- /dev/null +++ b/SOURCES/kvm-iotests-graph-changes-while-io-New-test.patch @@ -0,0 +1,153 @@ +From 27042ff7aca4366c50e8ed66b47487d46774d16a Mon Sep 17 00:00:00 2001 +From: Hanna Reitz +Date: Wed, 16 Feb 2022 11:53:55 +0100 +Subject: [PATCH 24/24] iotests/graph-changes-while-io: New test + +RH-Author: Hanna Reitz +RH-MergeRequest: 189: block: Make bdrv_refresh_limits() non-recursive +RH-Commit: [3/3] b9dffe09bef6cf9b2f0aad69b327ea1df92e847a +RH-Bugzilla: 2072932 +RH-Acked-by: Eric Blake +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Kevin Wolf + +Test the following scenario: +1. Some block node (null-co) attached to a user (here: NBD server) that + performs I/O and keeps the node in an I/O thread +2. Repeatedly run blockdev-add/blockdev-del to add/remove an overlay + to/from that node + +Each blockdev-add triggers bdrv_refresh_limits(), and because +blockdev-add runs in the main thread, it does not stop the I/O requests. +I/O can thus happen while the limits are refreshed, and when such a +request sees a temporarily invalid block limit (e.g. alignment is 0), +this may easily crash qemu (or the storage daemon in this case). + +The block layer needs to ensure that I/O requests to a node are paused +while that node's BlockLimits are refreshed. + +Signed-off-by: Hanna Reitz +Reviewed-by: Eric Blake +Message-Id: <20220216105355.30729-4-hreitz@redhat.com> +Reviewed-by: Stefan Hajnoczi +Signed-off-by: Kevin Wolf +(cherry picked from commit 971bea8089531af56b1bbd9ce62e756bdf006711) +Signed-off-by: Hanna Reitz +--- + .../qemu-iotests/tests/graph-changes-while-io | 91 +++++++++++++++++++ + .../tests/graph-changes-while-io.out | 5 + + 2 files changed, 96 insertions(+) + create mode 100755 tests/qemu-iotests/tests/graph-changes-while-io + create mode 100644 tests/qemu-iotests/tests/graph-changes-while-io.out + +diff --git a/tests/qemu-iotests/tests/graph-changes-while-io b/tests/qemu-iotests/tests/graph-changes-while-io +new file mode 100755 +index 0000000000..567e8cf21e +--- /dev/null ++++ b/tests/qemu-iotests/tests/graph-changes-while-io +@@ -0,0 +1,91 @@ ++#!/usr/bin/env python3 ++# group: rw ++# ++# Test graph changes while I/O is happening ++# ++# Copyright (C) 2022 Red Hat, Inc. ++# ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 2 of the License, or ++# (at your option) any later version. ++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with this program. If not, see . ++# ++ ++import os ++from threading import Thread ++import iotests ++from iotests import imgfmt, qemu_img, qemu_img_create, QMPTestCase, \ ++ QemuStorageDaemon ++ ++ ++top = os.path.join(iotests.test_dir, 'top.img') ++nbd_sock = os.path.join(iotests.sock_dir, 'nbd.sock') ++ ++ ++def do_qemu_img_bench() -> None: ++ """ ++ Do some I/O requests on `nbd_sock`. ++ """ ++ assert qemu_img('bench', '-f', 'raw', '-c', '2000000', ++ f'nbd+unix:///node0?socket={nbd_sock}') == 0 ++ ++ ++class TestGraphChangesWhileIO(QMPTestCase): ++ def setUp(self) -> None: ++ # Create an overlay that can be added at runtime on top of the ++ # null-co block node that will receive I/O ++ assert qemu_img_create('-f', imgfmt, '-F', 'raw', '-b', 'null-co://', ++ top) == 0 ++ ++ # QSD instance with a null-co block node in an I/O thread, ++ # exported over NBD (on `nbd_sock`, export name "node0") ++ self.qsd = QemuStorageDaemon( ++ '--object', 'iothread,id=iothread0', ++ '--blockdev', 'null-co,node-name=node0,read-zeroes=true', ++ '--nbd-server', f'addr.type=unix,addr.path={nbd_sock}', ++ '--export', 'nbd,id=exp0,node-name=node0,iothread=iothread0,' + ++ 'fixed-iothread=true,writable=true', ++ qmp=True ++ ) ++ ++ def tearDown(self) -> None: ++ self.qsd.stop() ++ ++ def test_blockdev_add_while_io(self) -> None: ++ # Run qemu-img bench in the background ++ bench_thr = Thread(target=do_qemu_img_bench) ++ bench_thr.start() ++ ++ # While qemu-img bench is running, repeatedly add and remove an ++ # overlay to/from node0 ++ while bench_thr.is_alive(): ++ result = self.qsd.qmp('blockdev-add', { ++ 'driver': imgfmt, ++ 'node-name': 'overlay', ++ 'backing': 'node0', ++ 'file': { ++ 'driver': 'file', ++ 'filename': top ++ } ++ }) ++ self.assert_qmp(result, 'return', {}) ++ ++ result = self.qsd.qmp('blockdev-del', { ++ 'node-name': 'overlay' ++ }) ++ self.assert_qmp(result, 'return', {}) ++ ++ bench_thr.join() ++ ++if __name__ == '__main__': ++ # Format must support raw backing files ++ iotests.main(supported_fmts=['qcow', 'qcow2', 'qed'], ++ supported_protocols=['file']) +diff --git a/tests/qemu-iotests/tests/graph-changes-while-io.out b/tests/qemu-iotests/tests/graph-changes-while-io.out +new file mode 100644 +index 0000000000..ae1213e6f8 +--- /dev/null ++++ b/tests/qemu-iotests/tests/graph-changes-while-io.out +@@ -0,0 +1,5 @@ ++. ++---------------------------------------------------------------------- ++Ran 1 tests ++ ++OK +-- +2.35.3 + diff --git a/SOURCES/kvm-linux-aio-explain-why-max-batch-is-checked-in-laio_i.patch b/SOURCES/kvm-linux-aio-explain-why-max-batch-is-checked-in-laio_i.patch new file mode 100644 index 0000000..7c1fcc4 --- /dev/null +++ b/SOURCES/kvm-linux-aio-explain-why-max-batch-is-checked-in-laio_i.patch @@ -0,0 +1,49 @@ +From 99d33621440fd30e0da2974dafb0cd372334305a Mon Sep 17 00:00:00 2001 +From: Stefan Hajnoczi +Date: Thu, 9 Jun 2022 17:47:12 +0100 +Subject: [PATCH 2/2] linux-aio: explain why max batch is checked in + laio_io_unplug() + +RH-Author: Stefan Hajnoczi +RH-MergeRequest: 199: linux-aio: fix unbalanced plugged counter in laio_io_unplug() +RH-Commit: [2/2] 8617870ed70e3a57269f06eeb242d0fab79a66fb +RH-Bugzilla: 2105410 +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: Hanna Reitz +RH-Acked-by: Stefano Garzarella + +It may not be obvious why laio_io_unplug() checks max batch. I discussed +this with Stefano and have added a comment summarizing the reason. + +Cc: Stefano Garzarella +Cc: Kevin Wolf +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Stefano Garzarella +Message-id: 20220609164712.1539045-3-stefanha@redhat.com +Signed-off-by: Stefan Hajnoczi +(cherry picked from commit 99b969fbe105117f5af6060d3afef40ca39cc9c1) +Signed-off-by: Stefan Hajnoczi +--- + block/linux-aio.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/block/linux-aio.c b/block/linux-aio.c +index 77f17ad596..85650c4222 100644 +--- a/block/linux-aio.c ++++ b/block/linux-aio.c +@@ -362,6 +362,12 @@ void laio_io_unplug(BlockDriverState *bs, LinuxAioState *s, + assert(s->io_q.plugged); + s->io_q.plugged--; + ++ /* ++ * Why max batch checking is performed here: ++ * Another BDS may have queued requests with a higher dev_max_batch and ++ * therefore in_queue could now exceed our dev_max_batch. Re-check the max ++ * batch so we can honor our device's dev_max_batch. ++ */ + if (s->io_q.in_queue >= laio_max_batch(s, dev_max_batch) || + (!s->io_q.plugged && + !s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending))) { +-- +2.35.3 + diff --git a/SOURCES/kvm-linux-aio-fix-unbalanced-plugged-counter-in-laio_io_.patch b/SOURCES/kvm-linux-aio-fix-unbalanced-plugged-counter-in-laio_io_.patch new file mode 100644 index 0000000..c89fc72 --- /dev/null +++ b/SOURCES/kvm-linux-aio-fix-unbalanced-plugged-counter-in-laio_io_.patch @@ -0,0 +1,56 @@ +From 0fbb0c87628bef2cb4d1b7748d67020dde50cdef Mon Sep 17 00:00:00 2001 +From: Stefan Hajnoczi +Date: Thu, 9 Jun 2022 17:47:11 +0100 +Subject: [PATCH 1/2] linux-aio: fix unbalanced plugged counter in + laio_io_unplug() + +RH-Author: Stefan Hajnoczi +RH-MergeRequest: 199: linux-aio: fix unbalanced plugged counter in laio_io_unplug() +RH-Commit: [1/2] f518df755090289905898a36922992288688e338 +RH-Bugzilla: 2105410 +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: Hanna Reitz +RH-Acked-by: Stefano Garzarella + +Every laio_io_plug() call has a matching laio_io_unplug() call. There is +a plugged counter that tracks the number of levels of plugging and +allows for nesting. + +The plugged counter must reflect the balance between laio_io_plug() and +laio_io_unplug() calls accurately. Otherwise I/O stalls occur since +io_submit(2) calls are skipped while plugged. + +Reported-by: Nikolay Tenev +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Stefano Garzarella +Message-id: 20220609164712.1539045-2-stefanha@redhat.com +Cc: Stefano Garzarella +Fixes: 68d7946648 ("linux-aio: add `dev_max_batch` parameter to laio_io_unplug()") +[Stefano Garzarella suggested adding a Fixes tag. +--Stefan] +Signed-off-by: Stefan Hajnoczi +(cherry picked from commit f387cac5af030a58ac5a0dacf64cab5e5a4fe5c7) +Signed-off-by: Stefan Hajnoczi +--- + block/linux-aio.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/block/linux-aio.c b/block/linux-aio.c +index f53ae72e21..77f17ad596 100644 +--- a/block/linux-aio.c ++++ b/block/linux-aio.c +@@ -360,8 +360,10 @@ void laio_io_unplug(BlockDriverState *bs, LinuxAioState *s, + uint64_t dev_max_batch) + { + assert(s->io_q.plugged); ++ s->io_q.plugged--; ++ + if (s->io_q.in_queue >= laio_max_batch(s, dev_max_batch) || +- (--s->io_q.plugged == 0 && ++ (!s->io_q.plugged && + !s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending))) { + ioq_submit(s); + } +-- +2.35.3 + diff --git a/SOURCES/kvm-linux-headers-Update-headers-to-v5.17-rc1.patch b/SOURCES/kvm-linux-headers-Update-headers-to-v5.17-rc1.patch new file mode 100644 index 0000000..90adb5c --- /dev/null +++ b/SOURCES/kvm-linux-headers-Update-headers-to-v5.17-rc1.patch @@ -0,0 +1,1227 @@ +From e9ecd7543fa8d3e9fe80f4144e4c0461f783fc37 Mon Sep 17 00:00:00 2001 +From: Vivek Goyal +Date: Tue, 8 Feb 2022 15:48:05 -0500 +Subject: [PATCH 03/24] linux-headers: Update headers to v5.17-rc1 + +RH-Author: Paul Lai +RH-MergeRequest: 176: Enable KVM AMX support +RH-Commit: [3/13] 63593c2431eabf02222f37467736b580022b94c8 +RH-Bugzilla: 1916415 +RH-Acked-by: Cornelia Huck +RH-Acked-by: Igor Mammedov +RH-Acked-by: Paolo Bonzini + +Update headers to 5.17-rc1. I need latest fuse changes. + +Reviewed-by: Dr. David Alan Gilbert +Signed-off-by: Vivek Goyal +Message-Id: <20220208204813.682906-3-vgoyal@redhat.com> +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit ef17dd6a8e6b6e3aeb29233996d44dfcb736d515) +Signed-off-by: Paul Lai +--- + include/standard-headers/asm-x86/kvm_para.h | 1 + + include/standard-headers/drm/drm_fourcc.h | 11 ++ + include/standard-headers/linux/ethtool.h | 1 + + include/standard-headers/linux/fuse.h | 60 +++++++- + include/standard-headers/linux/pci_regs.h | 142 +++++++++--------- + include/standard-headers/linux/virtio_gpio.h | 72 +++++++++ + include/standard-headers/linux/virtio_i2c.h | 47 ++++++ + include/standard-headers/linux/virtio_iommu.h | 8 +- + .../standard-headers/linux/virtio_pcidev.h | 65 ++++++++ + include/standard-headers/linux/virtio_scmi.h | 24 +++ + linux-headers/asm-generic/unistd.h | 5 +- + linux-headers/asm-mips/unistd_n32.h | 2 + + linux-headers/asm-mips/unistd_n64.h | 2 + + linux-headers/asm-mips/unistd_o32.h | 2 + + linux-headers/asm-powerpc/unistd_32.h | 2 + + linux-headers/asm-powerpc/unistd_64.h | 2 + + linux-headers/asm-riscv/bitsperlong.h | 14 ++ + linux-headers/asm-riscv/mman.h | 1 + + linux-headers/asm-riscv/unistd.h | 44 ++++++ + linux-headers/asm-s390/unistd_32.h | 2 + + linux-headers/asm-s390/unistd_64.h | 2 + + linux-headers/asm-x86/kvm.h | 16 +- + linux-headers/asm-x86/unistd_32.h | 1 + + linux-headers/asm-x86/unistd_64.h | 1 + + linux-headers/asm-x86/unistd_x32.h | 1 + + linux-headers/linux/kvm.h | 17 +++ + 26 files changed, 469 insertions(+), 76 deletions(-) + create mode 100644 include/standard-headers/linux/virtio_gpio.h + create mode 100644 include/standard-headers/linux/virtio_i2c.h + create mode 100644 include/standard-headers/linux/virtio_pcidev.h + create mode 100644 include/standard-headers/linux/virtio_scmi.h + create mode 100644 linux-headers/asm-riscv/bitsperlong.h + create mode 100644 linux-headers/asm-riscv/mman.h + create mode 100644 linux-headers/asm-riscv/unistd.h + +diff --git a/include/standard-headers/asm-x86/kvm_para.h b/include/standard-headers/asm-x86/kvm_para.h +index 204cfb8640..f0235e58a1 100644 +--- a/include/standard-headers/asm-x86/kvm_para.h ++++ b/include/standard-headers/asm-x86/kvm_para.h +@@ -8,6 +8,7 @@ + * should be used to determine that a VM is running under KVM. + */ + #define KVM_CPUID_SIGNATURE 0x40000000 ++#define KVM_SIGNATURE "KVMKVMKVM\0\0\0" + + /* This CPUID returns two feature bitmaps in eax, edx. Before enabling + * a particular paravirtualization, the appropriate feature bit should +diff --git a/include/standard-headers/drm/drm_fourcc.h b/include/standard-headers/drm/drm_fourcc.h +index 2c025cb4fe..4888f85f69 100644 +--- a/include/standard-headers/drm/drm_fourcc.h ++++ b/include/standard-headers/drm/drm_fourcc.h +@@ -313,6 +313,13 @@ extern "C" { + */ + #define DRM_FORMAT_P016 fourcc_code('P', '0', '1', '6') /* 2x2 subsampled Cr:Cb plane 16 bits per channel */ + ++/* 2 plane YCbCr420. ++ * 3 10 bit components and 2 padding bits packed into 4 bytes. ++ * index 0 = Y plane, [31:0] x:Y2:Y1:Y0 2:10:10:10 little endian ++ * index 1 = Cr:Cb plane, [63:0] x:Cr2:Cb2:Cr1:x:Cb1:Cr0:Cb0 [2:10:10:10:2:10:10:10] little endian ++ */ ++#define DRM_FORMAT_P030 fourcc_code('P', '0', '3', '0') /* 2x2 subsampled Cr:Cb plane 10 bits per channel packed */ ++ + /* 3 plane non-subsampled (444) YCbCr + * 16 bits per component, but only 10 bits are used and 6 bits are padded + * index 0: Y plane, [15:0] Y:x [10:6] little endian +@@ -853,6 +860,10 @@ drm_fourcc_canonicalize_nvidia_format_mod(uint64_t modifier) + * and UV. Some SAND-using hardware stores UV in a separate tiled + * image from Y to reduce the column height, which is not supported + * with these modifiers. ++ * ++ * The DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT modifier is also ++ * supported for DRM_FORMAT_P030 where the columns remain as 128 bytes ++ * wide, but as this is a 10 bpp format that translates to 96 pixels. + */ + + #define DRM_FORMAT_MOD_BROADCOM_SAND32_COL_HEIGHT(v) \ +diff --git a/include/standard-headers/linux/ethtool.h b/include/standard-headers/linux/ethtool.h +index 688eb8dc39..38d5a4cd6e 100644 +--- a/include/standard-headers/linux/ethtool.h ++++ b/include/standard-headers/linux/ethtool.h +@@ -231,6 +231,7 @@ enum tunable_id { + ETHTOOL_RX_COPYBREAK, + ETHTOOL_TX_COPYBREAK, + ETHTOOL_PFC_PREVENTION_TOUT, /* timeout in msecs */ ++ ETHTOOL_TX_COPYBREAK_BUF_SIZE, + /* + * Add your fresh new tunable attribute above and remember to update + * tunable_strings[] in net/ethtool/common.c +diff --git a/include/standard-headers/linux/fuse.h b/include/standard-headers/linux/fuse.h +index 23ea31708b..bda06258be 100644 +--- a/include/standard-headers/linux/fuse.h ++++ b/include/standard-headers/linux/fuse.h +@@ -184,6 +184,16 @@ + * + * 7.34 + * - add FUSE_SYNCFS ++ * ++ * 7.35 ++ * - add FOPEN_NOFLUSH ++ * ++ * 7.36 ++ * - extend fuse_init_in with reserved fields, add FUSE_INIT_EXT init flag ++ * - add flags2 to fuse_init_in and fuse_init_out ++ * - add FUSE_SECURITY_CTX init flag ++ * - add security context to create, mkdir, symlink, and mknod requests ++ * - add FUSE_HAS_INODE_DAX, FUSE_ATTR_DAX + */ + + #ifndef _LINUX_FUSE_H +@@ -215,7 +225,7 @@ + #define FUSE_KERNEL_VERSION 7 + + /** Minor version number of this interface */ +-#define FUSE_KERNEL_MINOR_VERSION 34 ++#define FUSE_KERNEL_MINOR_VERSION 36 + + /** The node ID of the root inode */ + #define FUSE_ROOT_ID 1 +@@ -286,12 +296,14 @@ struct fuse_file_lock { + * FOPEN_NONSEEKABLE: the file is not seekable + * FOPEN_CACHE_DIR: allow caching this directory + * FOPEN_STREAM: the file is stream-like (no file position at all) ++ * FOPEN_NOFLUSH: don't flush data cache on close (unless FUSE_WRITEBACK_CACHE) + */ + #define FOPEN_DIRECT_IO (1 << 0) + #define FOPEN_KEEP_CACHE (1 << 1) + #define FOPEN_NONSEEKABLE (1 << 2) + #define FOPEN_CACHE_DIR (1 << 3) + #define FOPEN_STREAM (1 << 4) ++#define FOPEN_NOFLUSH (1 << 5) + + /** + * INIT request/reply flags +@@ -332,6 +344,11 @@ struct fuse_file_lock { + * write/truncate sgid is killed only if file has group + * execute permission. (Same as Linux VFS behavior). + * FUSE_SETXATTR_EXT: Server supports extended struct fuse_setxattr_in ++ * FUSE_INIT_EXT: extended fuse_init_in request ++ * FUSE_INIT_RESERVED: reserved, do not use ++ * FUSE_SECURITY_CTX: add security context to create, mkdir, symlink, and ++ * mknod ++ * FUSE_HAS_INODE_DAX: use per inode DAX + */ + #define FUSE_ASYNC_READ (1 << 0) + #define FUSE_POSIX_LOCKS (1 << 1) +@@ -363,6 +380,11 @@ struct fuse_file_lock { + #define FUSE_SUBMOUNTS (1 << 27) + #define FUSE_HANDLE_KILLPRIV_V2 (1 << 28) + #define FUSE_SETXATTR_EXT (1 << 29) ++#define FUSE_INIT_EXT (1 << 30) ++#define FUSE_INIT_RESERVED (1 << 31) ++/* bits 32..63 get shifted down 32 bits into the flags2 field */ ++#define FUSE_SECURITY_CTX (1ULL << 32) ++#define FUSE_HAS_INODE_DAX (1ULL << 33) + + /** + * CUSE INIT request/reply flags +@@ -445,8 +467,10 @@ struct fuse_file_lock { + * fuse_attr flags + * + * FUSE_ATTR_SUBMOUNT: Object is a submount root ++ * FUSE_ATTR_DAX: Enable DAX for this file in per inode DAX mode + */ + #define FUSE_ATTR_SUBMOUNT (1 << 0) ++#define FUSE_ATTR_DAX (1 << 1) + + /** + * Open flags +@@ -732,6 +756,8 @@ struct fuse_init_in { + uint32_t minor; + uint32_t max_readahead; + uint32_t flags; ++ uint32_t flags2; ++ uint32_t unused[11]; + }; + + #define FUSE_COMPAT_INIT_OUT_SIZE 8 +@@ -748,7 +774,8 @@ struct fuse_init_out { + uint32_t time_gran; + uint16_t max_pages; + uint16_t map_alignment; +- uint32_t unused[8]; ++ uint32_t flags2; ++ uint32_t unused[7]; + }; + + #define CUSE_INIT_INFO_MAX 4096 +@@ -856,9 +883,12 @@ struct fuse_dirent { + char name[]; + }; + +-#define FUSE_NAME_OFFSET offsetof(struct fuse_dirent, name) +-#define FUSE_DIRENT_ALIGN(x) \ ++/* Align variable length records to 64bit boundary */ ++#define FUSE_REC_ALIGN(x) \ + (((x) + sizeof(uint64_t) - 1) & ~(sizeof(uint64_t) - 1)) ++ ++#define FUSE_NAME_OFFSET offsetof(struct fuse_dirent, name) ++#define FUSE_DIRENT_ALIGN(x) FUSE_REC_ALIGN(x) + #define FUSE_DIRENT_SIZE(d) \ + FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + (d)->namelen) + +@@ -975,4 +1005,26 @@ struct fuse_syncfs_in { + uint64_t padding; + }; + ++/* ++ * For each security context, send fuse_secctx with size of security context ++ * fuse_secctx will be followed by security context name and this in turn ++ * will be followed by actual context label. ++ * fuse_secctx, name, context ++ */ ++struct fuse_secctx { ++ uint32_t size; ++ uint32_t padding; ++}; ++ ++/* ++ * Contains the information about how many fuse_secctx structures are being ++ * sent and what's the total size of all security contexts (including ++ * size of fuse_secctx_header). ++ * ++ */ ++struct fuse_secctx_header { ++ uint32_t size; ++ uint32_t nr_secctx; ++}; ++ + #endif /* _LINUX_FUSE_H */ +diff --git a/include/standard-headers/linux/pci_regs.h b/include/standard-headers/linux/pci_regs.h +index ff6ccbc6ef..bee1a9ed6e 100644 +--- a/include/standard-headers/linux/pci_regs.h ++++ b/include/standard-headers/linux/pci_regs.h +@@ -301,23 +301,23 @@ + #define PCI_SID_ESR_FIC 0x20 /* First In Chassis Flag */ + #define PCI_SID_CHASSIS_NR 3 /* Chassis Number */ + +-/* Message Signalled Interrupt registers */ ++/* Message Signaled Interrupt registers */ + +-#define PCI_MSI_FLAGS 2 /* Message Control */ ++#define PCI_MSI_FLAGS 0x02 /* Message Control */ + #define PCI_MSI_FLAGS_ENABLE 0x0001 /* MSI feature enabled */ + #define PCI_MSI_FLAGS_QMASK 0x000e /* Maximum queue size available */ + #define PCI_MSI_FLAGS_QSIZE 0x0070 /* Message queue size configured */ + #define PCI_MSI_FLAGS_64BIT 0x0080 /* 64-bit addresses allowed */ + #define PCI_MSI_FLAGS_MASKBIT 0x0100 /* Per-vector masking capable */ + #define PCI_MSI_RFU 3 /* Rest of capability flags */ +-#define PCI_MSI_ADDRESS_LO 4 /* Lower 32 bits */ +-#define PCI_MSI_ADDRESS_HI 8 /* Upper 32 bits (if PCI_MSI_FLAGS_64BIT set) */ +-#define PCI_MSI_DATA_32 8 /* 16 bits of data for 32-bit devices */ +-#define PCI_MSI_MASK_32 12 /* Mask bits register for 32-bit devices */ +-#define PCI_MSI_PENDING_32 16 /* Pending intrs for 32-bit devices */ +-#define PCI_MSI_DATA_64 12 /* 16 bits of data for 64-bit devices */ +-#define PCI_MSI_MASK_64 16 /* Mask bits register for 64-bit devices */ +-#define PCI_MSI_PENDING_64 20 /* Pending intrs for 64-bit devices */ ++#define PCI_MSI_ADDRESS_LO 0x04 /* Lower 32 bits */ ++#define PCI_MSI_ADDRESS_HI 0x08 /* Upper 32 bits (if PCI_MSI_FLAGS_64BIT set) */ ++#define PCI_MSI_DATA_32 0x08 /* 16 bits of data for 32-bit devices */ ++#define PCI_MSI_MASK_32 0x0c /* Mask bits register for 32-bit devices */ ++#define PCI_MSI_PENDING_32 0x10 /* Pending intrs for 32-bit devices */ ++#define PCI_MSI_DATA_64 0x0c /* 16 bits of data for 64-bit devices */ ++#define PCI_MSI_MASK_64 0x10 /* Mask bits register for 64-bit devices */ ++#define PCI_MSI_PENDING_64 0x14 /* Pending intrs for 64-bit devices */ + + /* MSI-X registers (in MSI-X capability) */ + #define PCI_MSIX_FLAGS 2 /* Message Control */ +@@ -335,10 +335,10 @@ + + /* MSI-X Table entry format (in memory mapped by a BAR) */ + #define PCI_MSIX_ENTRY_SIZE 16 +-#define PCI_MSIX_ENTRY_LOWER_ADDR 0 /* Message Address */ +-#define PCI_MSIX_ENTRY_UPPER_ADDR 4 /* Message Upper Address */ +-#define PCI_MSIX_ENTRY_DATA 8 /* Message Data */ +-#define PCI_MSIX_ENTRY_VECTOR_CTRL 12 /* Vector Control */ ++#define PCI_MSIX_ENTRY_LOWER_ADDR 0x0 /* Message Address */ ++#define PCI_MSIX_ENTRY_UPPER_ADDR 0x4 /* Message Upper Address */ ++#define PCI_MSIX_ENTRY_DATA 0x8 /* Message Data */ ++#define PCI_MSIX_ENTRY_VECTOR_CTRL 0xc /* Vector Control */ + #define PCI_MSIX_ENTRY_CTRL_MASKBIT 0x00000001 + + /* CompactPCI Hotswap Register */ +@@ -470,7 +470,7 @@ + + /* PCI Express capability registers */ + +-#define PCI_EXP_FLAGS 2 /* Capabilities register */ ++#define PCI_EXP_FLAGS 0x02 /* Capabilities register */ + #define PCI_EXP_FLAGS_VERS 0x000f /* Capability version */ + #define PCI_EXP_FLAGS_TYPE 0x00f0 /* Device/Port type */ + #define PCI_EXP_TYPE_ENDPOINT 0x0 /* Express Endpoint */ +@@ -484,7 +484,7 @@ + #define PCI_EXP_TYPE_RC_EC 0xa /* Root Complex Event Collector */ + #define PCI_EXP_FLAGS_SLOT 0x0100 /* Slot implemented */ + #define PCI_EXP_FLAGS_IRQ 0x3e00 /* Interrupt message number */ +-#define PCI_EXP_DEVCAP 4 /* Device capabilities */ ++#define PCI_EXP_DEVCAP 0x04 /* Device capabilities */ + #define PCI_EXP_DEVCAP_PAYLOAD 0x00000007 /* Max_Payload_Size */ + #define PCI_EXP_DEVCAP_PHANTOM 0x00000018 /* Phantom functions */ + #define PCI_EXP_DEVCAP_EXT_TAG 0x00000020 /* Extended tags */ +@@ -497,7 +497,7 @@ + #define PCI_EXP_DEVCAP_PWR_VAL 0x03fc0000 /* Slot Power Limit Value */ + #define PCI_EXP_DEVCAP_PWR_SCL 0x0c000000 /* Slot Power Limit Scale */ + #define PCI_EXP_DEVCAP_FLR 0x10000000 /* Function Level Reset */ +-#define PCI_EXP_DEVCTL 8 /* Device Control */ ++#define PCI_EXP_DEVCTL 0x08 /* Device Control */ + #define PCI_EXP_DEVCTL_CERE 0x0001 /* Correctable Error Reporting En. */ + #define PCI_EXP_DEVCTL_NFERE 0x0002 /* Non-Fatal Error Reporting Enable */ + #define PCI_EXP_DEVCTL_FERE 0x0004 /* Fatal Error Reporting Enable */ +@@ -522,7 +522,7 @@ + #define PCI_EXP_DEVCTL_READRQ_2048B 0x4000 /* 2048 Bytes */ + #define PCI_EXP_DEVCTL_READRQ_4096B 0x5000 /* 4096 Bytes */ + #define PCI_EXP_DEVCTL_BCR_FLR 0x8000 /* Bridge Configuration Retry / FLR */ +-#define PCI_EXP_DEVSTA 10 /* Device Status */ ++#define PCI_EXP_DEVSTA 0x0a /* Device Status */ + #define PCI_EXP_DEVSTA_CED 0x0001 /* Correctable Error Detected */ + #define PCI_EXP_DEVSTA_NFED 0x0002 /* Non-Fatal Error Detected */ + #define PCI_EXP_DEVSTA_FED 0x0004 /* Fatal Error Detected */ +@@ -530,7 +530,7 @@ + #define PCI_EXP_DEVSTA_AUXPD 0x0010 /* AUX Power Detected */ + #define PCI_EXP_DEVSTA_TRPND 0x0020 /* Transactions Pending */ + #define PCI_CAP_EXP_RC_ENDPOINT_SIZEOF_V1 12 /* v1 endpoints without link end here */ +-#define PCI_EXP_LNKCAP 12 /* Link Capabilities */ ++#define PCI_EXP_LNKCAP 0x0c /* Link Capabilities */ + #define PCI_EXP_LNKCAP_SLS 0x0000000f /* Supported Link Speeds */ + #define PCI_EXP_LNKCAP_SLS_2_5GB 0x00000001 /* LNKCAP2 SLS Vector bit 0 */ + #define PCI_EXP_LNKCAP_SLS_5_0GB 0x00000002 /* LNKCAP2 SLS Vector bit 1 */ +@@ -549,7 +549,7 @@ + #define PCI_EXP_LNKCAP_DLLLARC 0x00100000 /* Data Link Layer Link Active Reporting Capable */ + #define PCI_EXP_LNKCAP_LBNC 0x00200000 /* Link Bandwidth Notification Capability */ + #define PCI_EXP_LNKCAP_PN 0xff000000 /* Port Number */ +-#define PCI_EXP_LNKCTL 16 /* Link Control */ ++#define PCI_EXP_LNKCTL 0x10 /* Link Control */ + #define PCI_EXP_LNKCTL_ASPMC 0x0003 /* ASPM Control */ + #define PCI_EXP_LNKCTL_ASPM_L0S 0x0001 /* L0s Enable */ + #define PCI_EXP_LNKCTL_ASPM_L1 0x0002 /* L1 Enable */ +@@ -562,7 +562,7 @@ + #define PCI_EXP_LNKCTL_HAWD 0x0200 /* Hardware Autonomous Width Disable */ + #define PCI_EXP_LNKCTL_LBMIE 0x0400 /* Link Bandwidth Management Interrupt Enable */ + #define PCI_EXP_LNKCTL_LABIE 0x0800 /* Link Autonomous Bandwidth Interrupt Enable */ +-#define PCI_EXP_LNKSTA 18 /* Link Status */ ++#define PCI_EXP_LNKSTA 0x12 /* Link Status */ + #define PCI_EXP_LNKSTA_CLS 0x000f /* Current Link Speed */ + #define PCI_EXP_LNKSTA_CLS_2_5GB 0x0001 /* Current Link Speed 2.5GT/s */ + #define PCI_EXP_LNKSTA_CLS_5_0GB 0x0002 /* Current Link Speed 5.0GT/s */ +@@ -582,7 +582,7 @@ + #define PCI_EXP_LNKSTA_LBMS 0x4000 /* Link Bandwidth Management Status */ + #define PCI_EXP_LNKSTA_LABS 0x8000 /* Link Autonomous Bandwidth Status */ + #define PCI_CAP_EXP_ENDPOINT_SIZEOF_V1 20 /* v1 endpoints with link end here */ +-#define PCI_EXP_SLTCAP 20 /* Slot Capabilities */ ++#define PCI_EXP_SLTCAP 0x14 /* Slot Capabilities */ + #define PCI_EXP_SLTCAP_ABP 0x00000001 /* Attention Button Present */ + #define PCI_EXP_SLTCAP_PCP 0x00000002 /* Power Controller Present */ + #define PCI_EXP_SLTCAP_MRLSP 0x00000004 /* MRL Sensor Present */ +@@ -595,7 +595,7 @@ + #define PCI_EXP_SLTCAP_EIP 0x00020000 /* Electromechanical Interlock Present */ + #define PCI_EXP_SLTCAP_NCCS 0x00040000 /* No Command Completed Support */ + #define PCI_EXP_SLTCAP_PSN 0xfff80000 /* Physical Slot Number */ +-#define PCI_EXP_SLTCTL 24 /* Slot Control */ ++#define PCI_EXP_SLTCTL 0x18 /* Slot Control */ + #define PCI_EXP_SLTCTL_ABPE 0x0001 /* Attention Button Pressed Enable */ + #define PCI_EXP_SLTCTL_PFDE 0x0002 /* Power Fault Detected Enable */ + #define PCI_EXP_SLTCTL_MRLSCE 0x0004 /* MRL Sensor Changed Enable */ +@@ -617,7 +617,7 @@ + #define PCI_EXP_SLTCTL_EIC 0x0800 /* Electromechanical Interlock Control */ + #define PCI_EXP_SLTCTL_DLLSCE 0x1000 /* Data Link Layer State Changed Enable */ + #define PCI_EXP_SLTCTL_IBPD_DISABLE 0x4000 /* In-band PD disable */ +-#define PCI_EXP_SLTSTA 26 /* Slot Status */ ++#define PCI_EXP_SLTSTA 0x1a /* Slot Status */ + #define PCI_EXP_SLTSTA_ABP 0x0001 /* Attention Button Pressed */ + #define PCI_EXP_SLTSTA_PFD 0x0002 /* Power Fault Detected */ + #define PCI_EXP_SLTSTA_MRLSC 0x0004 /* MRL Sensor Changed */ +@@ -627,15 +627,15 @@ + #define PCI_EXP_SLTSTA_PDS 0x0040 /* Presence Detect State */ + #define PCI_EXP_SLTSTA_EIS 0x0080 /* Electromechanical Interlock Status */ + #define PCI_EXP_SLTSTA_DLLSC 0x0100 /* Data Link Layer State Changed */ +-#define PCI_EXP_RTCTL 28 /* Root Control */ ++#define PCI_EXP_RTCTL 0x1c /* Root Control */ + #define PCI_EXP_RTCTL_SECEE 0x0001 /* System Error on Correctable Error */ + #define PCI_EXP_RTCTL_SENFEE 0x0002 /* System Error on Non-Fatal Error */ + #define PCI_EXP_RTCTL_SEFEE 0x0004 /* System Error on Fatal Error */ + #define PCI_EXP_RTCTL_PMEIE 0x0008 /* PME Interrupt Enable */ + #define PCI_EXP_RTCTL_CRSSVE 0x0010 /* CRS Software Visibility Enable */ +-#define PCI_EXP_RTCAP 30 /* Root Capabilities */ ++#define PCI_EXP_RTCAP 0x1e /* Root Capabilities */ + #define PCI_EXP_RTCAP_CRSVIS 0x0001 /* CRS Software Visibility capability */ +-#define PCI_EXP_RTSTA 32 /* Root Status */ ++#define PCI_EXP_RTSTA 0x20 /* Root Status */ + #define PCI_EXP_RTSTA_PME 0x00010000 /* PME status */ + #define PCI_EXP_RTSTA_PENDING 0x00020000 /* PME pending */ + /* +@@ -646,7 +646,7 @@ + * Use pcie_capability_read_word() and similar interfaces to use them + * safely. + */ +-#define PCI_EXP_DEVCAP2 36 /* Device Capabilities 2 */ ++#define PCI_EXP_DEVCAP2 0x24 /* Device Capabilities 2 */ + #define PCI_EXP_DEVCAP2_COMP_TMOUT_DIS 0x00000010 /* Completion Timeout Disable supported */ + #define PCI_EXP_DEVCAP2_ARI 0x00000020 /* Alternative Routing-ID */ + #define PCI_EXP_DEVCAP2_ATOMIC_ROUTE 0x00000040 /* Atomic Op routing */ +@@ -658,7 +658,7 @@ + #define PCI_EXP_DEVCAP2_OBFF_MSG 0x00040000 /* New message signaling */ + #define PCI_EXP_DEVCAP2_OBFF_WAKE 0x00080000 /* Re-use WAKE# for OBFF */ + #define PCI_EXP_DEVCAP2_EE_PREFIX 0x00200000 /* End-End TLP Prefix */ +-#define PCI_EXP_DEVCTL2 40 /* Device Control 2 */ ++#define PCI_EXP_DEVCTL2 0x28 /* Device Control 2 */ + #define PCI_EXP_DEVCTL2_COMP_TIMEOUT 0x000f /* Completion Timeout Value */ + #define PCI_EXP_DEVCTL2_COMP_TMOUT_DIS 0x0010 /* Completion Timeout Disable */ + #define PCI_EXP_DEVCTL2_ARI 0x0020 /* Alternative Routing-ID */ +@@ -670,9 +670,9 @@ + #define PCI_EXP_DEVCTL2_OBFF_MSGA_EN 0x2000 /* Enable OBFF Message type A */ + #define PCI_EXP_DEVCTL2_OBFF_MSGB_EN 0x4000 /* Enable OBFF Message type B */ + #define PCI_EXP_DEVCTL2_OBFF_WAKE_EN 0x6000 /* OBFF using WAKE# signaling */ +-#define PCI_EXP_DEVSTA2 42 /* Device Status 2 */ +-#define PCI_CAP_EXP_RC_ENDPOINT_SIZEOF_V2 44 /* v2 endpoints without link end here */ +-#define PCI_EXP_LNKCAP2 44 /* Link Capabilities 2 */ ++#define PCI_EXP_DEVSTA2 0x2a /* Device Status 2 */ ++#define PCI_CAP_EXP_RC_ENDPOINT_SIZEOF_V2 0x2c /* end of v2 EPs w/o link */ ++#define PCI_EXP_LNKCAP2 0x2c /* Link Capabilities 2 */ + #define PCI_EXP_LNKCAP2_SLS_2_5GB 0x00000002 /* Supported Speed 2.5GT/s */ + #define PCI_EXP_LNKCAP2_SLS_5_0GB 0x00000004 /* Supported Speed 5GT/s */ + #define PCI_EXP_LNKCAP2_SLS_8_0GB 0x00000008 /* Supported Speed 8GT/s */ +@@ -680,7 +680,7 @@ + #define PCI_EXP_LNKCAP2_SLS_32_0GB 0x00000020 /* Supported Speed 32GT/s */ + #define PCI_EXP_LNKCAP2_SLS_64_0GB 0x00000040 /* Supported Speed 64GT/s */ + #define PCI_EXP_LNKCAP2_CROSSLINK 0x00000100 /* Crosslink supported */ +-#define PCI_EXP_LNKCTL2 48 /* Link Control 2 */ ++#define PCI_EXP_LNKCTL2 0x30 /* Link Control 2 */ + #define PCI_EXP_LNKCTL2_TLS 0x000f + #define PCI_EXP_LNKCTL2_TLS_2_5GT 0x0001 /* Supported Speed 2.5GT/s */ + #define PCI_EXP_LNKCTL2_TLS_5_0GT 0x0002 /* Supported Speed 5GT/s */ +@@ -691,12 +691,12 @@ + #define PCI_EXP_LNKCTL2_ENTER_COMP 0x0010 /* Enter Compliance */ + #define PCI_EXP_LNKCTL2_TX_MARGIN 0x0380 /* Transmit Margin */ + #define PCI_EXP_LNKCTL2_HASD 0x0020 /* HW Autonomous Speed Disable */ +-#define PCI_EXP_LNKSTA2 50 /* Link Status 2 */ +-#define PCI_CAP_EXP_ENDPOINT_SIZEOF_V2 52 /* v2 endpoints with link end here */ +-#define PCI_EXP_SLTCAP2 52 /* Slot Capabilities 2 */ ++#define PCI_EXP_LNKSTA2 0x32 /* Link Status 2 */ ++#define PCI_CAP_EXP_ENDPOINT_SIZEOF_V2 0x32 /* end of v2 EPs w/ link */ ++#define PCI_EXP_SLTCAP2 0x34 /* Slot Capabilities 2 */ + #define PCI_EXP_SLTCAP2_IBPD 0x00000001 /* In-band PD Disable Supported */ +-#define PCI_EXP_SLTCTL2 56 /* Slot Control 2 */ +-#define PCI_EXP_SLTSTA2 58 /* Slot Status 2 */ ++#define PCI_EXP_SLTCTL2 0x38 /* Slot Control 2 */ ++#define PCI_EXP_SLTSTA2 0x3a /* Slot Status 2 */ + + /* Extended Capabilities (PCI-X 2.0 and Express) */ + #define PCI_EXT_CAP_ID(header) (header & 0x0000ffff) +@@ -742,7 +742,7 @@ + #define PCI_EXT_CAP_MCAST_ENDPOINT_SIZEOF 40 + + /* Advanced Error Reporting */ +-#define PCI_ERR_UNCOR_STATUS 4 /* Uncorrectable Error Status */ ++#define PCI_ERR_UNCOR_STATUS 0x04 /* Uncorrectable Error Status */ + #define PCI_ERR_UNC_UND 0x00000001 /* Undefined */ + #define PCI_ERR_UNC_DLP 0x00000010 /* Data Link Protocol */ + #define PCI_ERR_UNC_SURPDN 0x00000020 /* Surprise Down */ +@@ -760,11 +760,11 @@ + #define PCI_ERR_UNC_MCBTLP 0x00800000 /* MC blocked TLP */ + #define PCI_ERR_UNC_ATOMEG 0x01000000 /* Atomic egress blocked */ + #define PCI_ERR_UNC_TLPPRE 0x02000000 /* TLP prefix blocked */ +-#define PCI_ERR_UNCOR_MASK 8 /* Uncorrectable Error Mask */ ++#define PCI_ERR_UNCOR_MASK 0x08 /* Uncorrectable Error Mask */ + /* Same bits as above */ +-#define PCI_ERR_UNCOR_SEVER 12 /* Uncorrectable Error Severity */ ++#define PCI_ERR_UNCOR_SEVER 0x0c /* Uncorrectable Error Severity */ + /* Same bits as above */ +-#define PCI_ERR_COR_STATUS 16 /* Correctable Error Status */ ++#define PCI_ERR_COR_STATUS 0x10 /* Correctable Error Status */ + #define PCI_ERR_COR_RCVR 0x00000001 /* Receiver Error Status */ + #define PCI_ERR_COR_BAD_TLP 0x00000040 /* Bad TLP Status */ + #define PCI_ERR_COR_BAD_DLLP 0x00000080 /* Bad DLLP Status */ +@@ -773,20 +773,20 @@ + #define PCI_ERR_COR_ADV_NFAT 0x00002000 /* Advisory Non-Fatal */ + #define PCI_ERR_COR_INTERNAL 0x00004000 /* Corrected Internal */ + #define PCI_ERR_COR_LOG_OVER 0x00008000 /* Header Log Overflow */ +-#define PCI_ERR_COR_MASK 20 /* Correctable Error Mask */ ++#define PCI_ERR_COR_MASK 0x14 /* Correctable Error Mask */ + /* Same bits as above */ +-#define PCI_ERR_CAP 24 /* Advanced Error Capabilities */ +-#define PCI_ERR_CAP_FEP(x) ((x) & 31) /* First Error Pointer */ ++#define PCI_ERR_CAP 0x18 /* Advanced Error Capabilities & Ctrl*/ ++#define PCI_ERR_CAP_FEP(x) ((x) & 0x1f) /* First Error Pointer */ + #define PCI_ERR_CAP_ECRC_GENC 0x00000020 /* ECRC Generation Capable */ + #define PCI_ERR_CAP_ECRC_GENE 0x00000040 /* ECRC Generation Enable */ + #define PCI_ERR_CAP_ECRC_CHKC 0x00000080 /* ECRC Check Capable */ + #define PCI_ERR_CAP_ECRC_CHKE 0x00000100 /* ECRC Check Enable */ +-#define PCI_ERR_HEADER_LOG 28 /* Header Log Register (16 bytes) */ +-#define PCI_ERR_ROOT_COMMAND 44 /* Root Error Command */ ++#define PCI_ERR_HEADER_LOG 0x1c /* Header Log Register (16 bytes) */ ++#define PCI_ERR_ROOT_COMMAND 0x2c /* Root Error Command */ + #define PCI_ERR_ROOT_CMD_COR_EN 0x00000001 /* Correctable Err Reporting Enable */ + #define PCI_ERR_ROOT_CMD_NONFATAL_EN 0x00000002 /* Non-Fatal Err Reporting Enable */ + #define PCI_ERR_ROOT_CMD_FATAL_EN 0x00000004 /* Fatal Err Reporting Enable */ +-#define PCI_ERR_ROOT_STATUS 48 ++#define PCI_ERR_ROOT_STATUS 0x30 + #define PCI_ERR_ROOT_COR_RCV 0x00000001 /* ERR_COR Received */ + #define PCI_ERR_ROOT_MULTI_COR_RCV 0x00000002 /* Multiple ERR_COR */ + #define PCI_ERR_ROOT_UNCOR_RCV 0x00000004 /* ERR_FATAL/NONFATAL */ +@@ -795,52 +795,52 @@ + #define PCI_ERR_ROOT_NONFATAL_RCV 0x00000020 /* Non-Fatal Received */ + #define PCI_ERR_ROOT_FATAL_RCV 0x00000040 /* Fatal Received */ + #define PCI_ERR_ROOT_AER_IRQ 0xf8000000 /* Advanced Error Interrupt Message Number */ +-#define PCI_ERR_ROOT_ERR_SRC 52 /* Error Source Identification */ ++#define PCI_ERR_ROOT_ERR_SRC 0x34 /* Error Source Identification */ + + /* Virtual Channel */ +-#define PCI_VC_PORT_CAP1 4 ++#define PCI_VC_PORT_CAP1 0x04 + #define PCI_VC_CAP1_EVCC 0x00000007 /* extended VC count */ + #define PCI_VC_CAP1_LPEVCC 0x00000070 /* low prio extended VC count */ + #define PCI_VC_CAP1_ARB_SIZE 0x00000c00 +-#define PCI_VC_PORT_CAP2 8 ++#define PCI_VC_PORT_CAP2 0x08 + #define PCI_VC_CAP2_32_PHASE 0x00000002 + #define PCI_VC_CAP2_64_PHASE 0x00000004 + #define PCI_VC_CAP2_128_PHASE 0x00000008 + #define PCI_VC_CAP2_ARB_OFF 0xff000000 +-#define PCI_VC_PORT_CTRL 12 ++#define PCI_VC_PORT_CTRL 0x0c + #define PCI_VC_PORT_CTRL_LOAD_TABLE 0x00000001 +-#define PCI_VC_PORT_STATUS 14 ++#define PCI_VC_PORT_STATUS 0x0e + #define PCI_VC_PORT_STATUS_TABLE 0x00000001 +-#define PCI_VC_RES_CAP 16 ++#define PCI_VC_RES_CAP 0x10 + #define PCI_VC_RES_CAP_32_PHASE 0x00000002 + #define PCI_VC_RES_CAP_64_PHASE 0x00000004 + #define PCI_VC_RES_CAP_128_PHASE 0x00000008 + #define PCI_VC_RES_CAP_128_PHASE_TB 0x00000010 + #define PCI_VC_RES_CAP_256_PHASE 0x00000020 + #define PCI_VC_RES_CAP_ARB_OFF 0xff000000 +-#define PCI_VC_RES_CTRL 20 ++#define PCI_VC_RES_CTRL 0x14 + #define PCI_VC_RES_CTRL_LOAD_TABLE 0x00010000 + #define PCI_VC_RES_CTRL_ARB_SELECT 0x000e0000 + #define PCI_VC_RES_CTRL_ID 0x07000000 + #define PCI_VC_RES_CTRL_ENABLE 0x80000000 +-#define PCI_VC_RES_STATUS 26 ++#define PCI_VC_RES_STATUS 0x1a + #define PCI_VC_RES_STATUS_TABLE 0x00000001 + #define PCI_VC_RES_STATUS_NEGO 0x00000002 + #define PCI_CAP_VC_BASE_SIZEOF 0x10 +-#define PCI_CAP_VC_PER_VC_SIZEOF 0x0C ++#define PCI_CAP_VC_PER_VC_SIZEOF 0x0c + + /* Power Budgeting */ +-#define PCI_PWR_DSR 4 /* Data Select Register */ +-#define PCI_PWR_DATA 8 /* Data Register */ ++#define PCI_PWR_DSR 0x04 /* Data Select Register */ ++#define PCI_PWR_DATA 0x08 /* Data Register */ + #define PCI_PWR_DATA_BASE(x) ((x) & 0xff) /* Base Power */ + #define PCI_PWR_DATA_SCALE(x) (((x) >> 8) & 3) /* Data Scale */ + #define PCI_PWR_DATA_PM_SUB(x) (((x) >> 10) & 7) /* PM Sub State */ + #define PCI_PWR_DATA_PM_STATE(x) (((x) >> 13) & 3) /* PM State */ + #define PCI_PWR_DATA_TYPE(x) (((x) >> 15) & 7) /* Type */ + #define PCI_PWR_DATA_RAIL(x) (((x) >> 18) & 7) /* Power Rail */ +-#define PCI_PWR_CAP 12 /* Capability */ ++#define PCI_PWR_CAP 0x0c /* Capability */ + #define PCI_PWR_CAP_BUDGET(x) ((x) & 1) /* Included in system budget */ +-#define PCI_EXT_CAP_PWR_SIZEOF 16 ++#define PCI_EXT_CAP_PWR_SIZEOF 0x10 + + /* Root Complex Event Collector Endpoint Association */ + #define PCI_RCEC_RCIEP_BITMAP 4 /* Associated Bitmap for RCiEPs */ +@@ -964,7 +964,7 @@ + #define PCI_SRIOV_VFM_MI 0x1 /* Dormant.MigrateIn */ + #define PCI_SRIOV_VFM_MO 0x2 /* Active.MigrateOut */ + #define PCI_SRIOV_VFM_AV 0x3 /* Active.Available */ +-#define PCI_EXT_CAP_SRIOV_SIZEOF 64 ++#define PCI_EXT_CAP_SRIOV_SIZEOF 0x40 + + #define PCI_LTR_MAX_SNOOP_LAT 0x4 + #define PCI_LTR_MAX_NOSNOOP_LAT 0x6 +@@ -1017,12 +1017,12 @@ + #define PCI_TPH_LOC_NONE 0x000 /* no location */ + #define PCI_TPH_LOC_CAP 0x200 /* in capability */ + #define PCI_TPH_LOC_MSIX 0x400 /* in MSI-X */ +-#define PCI_TPH_CAP_ST_MASK 0x07FF0000 /* st table mask */ +-#define PCI_TPH_CAP_ST_SHIFT 16 /* st table shift */ +-#define PCI_TPH_BASE_SIZEOF 12 /* size with no st table */ ++#define PCI_TPH_CAP_ST_MASK 0x07FF0000 /* ST table mask */ ++#define PCI_TPH_CAP_ST_SHIFT 16 /* ST table shift */ ++#define PCI_TPH_BASE_SIZEOF 0xc /* size with no ST table */ + + /* Downstream Port Containment */ +-#define PCI_EXP_DPC_CAP 4 /* DPC Capability */ ++#define PCI_EXP_DPC_CAP 0x04 /* DPC Capability */ + #define PCI_EXP_DPC_IRQ 0x001F /* Interrupt Message Number */ + #define PCI_EXP_DPC_CAP_RP_EXT 0x0020 /* Root Port Extensions */ + #define PCI_EXP_DPC_CAP_POISONED_TLP 0x0040 /* Poisoned TLP Egress Blocking Supported */ +@@ -1030,19 +1030,19 @@ + #define PCI_EXP_DPC_RP_PIO_LOG_SIZE 0x0F00 /* RP PIO Log Size */ + #define PCI_EXP_DPC_CAP_DL_ACTIVE 0x1000 /* ERR_COR signal on DL_Active supported */ + +-#define PCI_EXP_DPC_CTL 6 /* DPC control */ ++#define PCI_EXP_DPC_CTL 0x06 /* DPC control */ + #define PCI_EXP_DPC_CTL_EN_FATAL 0x0001 /* Enable trigger on ERR_FATAL message */ + #define PCI_EXP_DPC_CTL_EN_NONFATAL 0x0002 /* Enable trigger on ERR_NONFATAL message */ + #define PCI_EXP_DPC_CTL_INT_EN 0x0008 /* DPC Interrupt Enable */ + +-#define PCI_EXP_DPC_STATUS 8 /* DPC Status */ ++#define PCI_EXP_DPC_STATUS 0x08 /* DPC Status */ + #define PCI_EXP_DPC_STATUS_TRIGGER 0x0001 /* Trigger Status */ + #define PCI_EXP_DPC_STATUS_TRIGGER_RSN 0x0006 /* Trigger Reason */ + #define PCI_EXP_DPC_STATUS_INTERRUPT 0x0008 /* Interrupt Status */ + #define PCI_EXP_DPC_RP_BUSY 0x0010 /* Root Port Busy */ + #define PCI_EXP_DPC_STATUS_TRIGGER_RSN_EXT 0x0060 /* Trig Reason Extension */ + +-#define PCI_EXP_DPC_SOURCE_ID 10 /* DPC Source Identifier */ ++#define PCI_EXP_DPC_SOURCE_ID 0x0A /* DPC Source Identifier */ + + #define PCI_EXP_DPC_RP_PIO_STATUS 0x0C /* RP PIO Status */ + #define PCI_EXP_DPC_RP_PIO_MASK 0x10 /* RP PIO Mask */ +@@ -1086,7 +1086,11 @@ + + /* Designated Vendor-Specific (DVSEC, PCI_EXT_CAP_ID_DVSEC) */ + #define PCI_DVSEC_HEADER1 0x4 /* Designated Vendor-Specific Header1 */ ++#define PCI_DVSEC_HEADER1_VID(x) ((x) & 0xffff) ++#define PCI_DVSEC_HEADER1_REV(x) (((x) >> 16) & 0xf) ++#define PCI_DVSEC_HEADER1_LEN(x) (((x) >> 20) & 0xfff) + #define PCI_DVSEC_HEADER2 0x8 /* Designated Vendor-Specific Header2 */ ++#define PCI_DVSEC_HEADER2_ID(x) ((x) & 0xffff) + + /* Data Link Feature */ + #define PCI_DLF_CAP 0x04 /* Capabilities Register */ +diff --git a/include/standard-headers/linux/virtio_gpio.h b/include/standard-headers/linux/virtio_gpio.h +new file mode 100644 +index 0000000000..2b5cf06349 +--- /dev/null ++++ b/include/standard-headers/linux/virtio_gpio.h +@@ -0,0 +1,72 @@ ++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ ++ ++#ifndef _LINUX_VIRTIO_GPIO_H ++#define _LINUX_VIRTIO_GPIO_H ++ ++#include "standard-headers/linux/types.h" ++ ++/* Virtio GPIO Feature bits */ ++#define VIRTIO_GPIO_F_IRQ 0 ++ ++/* Virtio GPIO request types */ ++#define VIRTIO_GPIO_MSG_GET_NAMES 0x0001 ++#define VIRTIO_GPIO_MSG_GET_DIRECTION 0x0002 ++#define VIRTIO_GPIO_MSG_SET_DIRECTION 0x0003 ++#define VIRTIO_GPIO_MSG_GET_VALUE 0x0004 ++#define VIRTIO_GPIO_MSG_SET_VALUE 0x0005 ++#define VIRTIO_GPIO_MSG_IRQ_TYPE 0x0006 ++ ++/* Possible values of the status field */ ++#define VIRTIO_GPIO_STATUS_OK 0x0 ++#define VIRTIO_GPIO_STATUS_ERR 0x1 ++ ++/* Direction types */ ++#define VIRTIO_GPIO_DIRECTION_NONE 0x00 ++#define VIRTIO_GPIO_DIRECTION_OUT 0x01 ++#define VIRTIO_GPIO_DIRECTION_IN 0x02 ++ ++/* Virtio GPIO IRQ types */ ++#define VIRTIO_GPIO_IRQ_TYPE_NONE 0x00 ++#define VIRTIO_GPIO_IRQ_TYPE_EDGE_RISING 0x01 ++#define VIRTIO_GPIO_IRQ_TYPE_EDGE_FALLING 0x02 ++#define VIRTIO_GPIO_IRQ_TYPE_EDGE_BOTH 0x03 ++#define VIRTIO_GPIO_IRQ_TYPE_LEVEL_HIGH 0x04 ++#define VIRTIO_GPIO_IRQ_TYPE_LEVEL_LOW 0x08 ++ ++struct virtio_gpio_config { ++ uint16_t ngpio; ++ uint8_t padding[2]; ++ uint32_t gpio_names_size; ++}; ++ ++/* Virtio GPIO Request / Response */ ++struct virtio_gpio_request { ++ uint16_t type; ++ uint16_t gpio; ++ uint32_t value; ++}; ++ ++struct virtio_gpio_response { ++ uint8_t status; ++ uint8_t value; ++}; ++ ++struct virtio_gpio_response_get_names { ++ uint8_t status; ++ uint8_t value[]; ++}; ++ ++/* Virtio GPIO IRQ Request / Response */ ++struct virtio_gpio_irq_request { ++ uint16_t gpio; ++}; ++ ++struct virtio_gpio_irq_response { ++ uint8_t status; ++}; ++ ++/* Possible values of the interrupt status field */ ++#define VIRTIO_GPIO_IRQ_STATUS_INVALID 0x0 ++#define VIRTIO_GPIO_IRQ_STATUS_VALID 0x1 ++ ++#endif /* _LINUX_VIRTIO_GPIO_H */ +diff --git a/include/standard-headers/linux/virtio_i2c.h b/include/standard-headers/linux/virtio_i2c.h +new file mode 100644 +index 0000000000..09fa907793 +--- /dev/null ++++ b/include/standard-headers/linux/virtio_i2c.h +@@ -0,0 +1,47 @@ ++/* SPDX-License-Identifier: GPL-2.0-or-later WITH Linux-syscall-note */ ++/* ++ * Definitions for virtio I2C Adpter ++ * ++ * Copyright (c) 2021 Intel Corporation. All rights reserved. ++ */ ++ ++#ifndef _LINUX_VIRTIO_I2C_H ++#define _LINUX_VIRTIO_I2C_H ++ ++#include "standard-headers/linux/const.h" ++#include "standard-headers/linux/types.h" ++ ++/* Virtio I2C Feature bits */ ++#define VIRTIO_I2C_F_ZERO_LENGTH_REQUEST 0 ++ ++/* The bit 0 of the @virtio_i2c_out_hdr.@flags, used to group the requests */ ++#define VIRTIO_I2C_FLAGS_FAIL_NEXT _BITUL(0) ++ ++/* The bit 1 of the @virtio_i2c_out_hdr.@flags, used to mark a buffer as read */ ++#define VIRTIO_I2C_FLAGS_M_RD _BITUL(1) ++ ++/** ++ * struct virtio_i2c_out_hdr - the virtio I2C message OUT header ++ * @addr: the controlled device address ++ * @padding: used to pad to full dword ++ * @flags: used for feature extensibility ++ */ ++struct virtio_i2c_out_hdr { ++ uint16_t addr; ++ uint16_t padding; ++ uint32_t flags; ++}; ++ ++/** ++ * struct virtio_i2c_in_hdr - the virtio I2C message IN header ++ * @status: the processing result from the backend ++ */ ++struct virtio_i2c_in_hdr { ++ uint8_t status; ++}; ++ ++/* The final status written by the device */ ++#define VIRTIO_I2C_MSG_OK 0 ++#define VIRTIO_I2C_MSG_ERR 1 ++ ++#endif /* _LINUX_VIRTIO_I2C_H */ +diff --git a/include/standard-headers/linux/virtio_iommu.h b/include/standard-headers/linux/virtio_iommu.h +index b9443b83a1..366379c2f0 100644 +--- a/include/standard-headers/linux/virtio_iommu.h ++++ b/include/standard-headers/linux/virtio_iommu.h +@@ -16,6 +16,7 @@ + #define VIRTIO_IOMMU_F_BYPASS 3 + #define VIRTIO_IOMMU_F_PROBE 4 + #define VIRTIO_IOMMU_F_MMIO 5 ++#define VIRTIO_IOMMU_F_BYPASS_CONFIG 6 + + struct virtio_iommu_range_64 { + uint64_t start; +@@ -36,6 +37,8 @@ struct virtio_iommu_config { + struct virtio_iommu_range_32 domain_range; + /* Probe buffer size */ + uint32_t probe_size; ++ uint8_t bypass; ++ uint8_t reserved[3]; + }; + + /* Request types */ +@@ -66,11 +69,14 @@ struct virtio_iommu_req_tail { + uint8_t reserved[3]; + }; + ++#define VIRTIO_IOMMU_ATTACH_F_BYPASS (1 << 0) ++ + struct virtio_iommu_req_attach { + struct virtio_iommu_req_head head; + uint32_t domain; + uint32_t endpoint; +- uint8_t reserved[8]; ++ uint32_t flags; ++ uint8_t reserved[4]; + struct virtio_iommu_req_tail tail; + }; + +diff --git a/include/standard-headers/linux/virtio_pcidev.h b/include/standard-headers/linux/virtio_pcidev.h +new file mode 100644 +index 0000000000..bdf1d062da +--- /dev/null ++++ b/include/standard-headers/linux/virtio_pcidev.h +@@ -0,0 +1,65 @@ ++/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ ++/* ++ * Copyright (C) 2021 Intel Corporation ++ * Author: Johannes Berg ++ */ ++#ifndef _LINUX_VIRTIO_PCIDEV_H ++#define _LINUX_VIRTIO_PCIDEV_H ++#include "standard-headers/linux/types.h" ++ ++/** ++ * enum virtio_pcidev_ops - virtual PCI device operations ++ * @VIRTIO_PCIDEV_OP_RESERVED: reserved to catch errors ++ * @VIRTIO_PCIDEV_OP_CFG_READ: read config space, size is 1, 2, 4 or 8; ++ * the @data field should be filled in by the device (in little endian). ++ * @VIRTIO_PCIDEV_OP_CFG_WRITE: write config space, size is 1, 2, 4 or 8; ++ * the @data field contains the data to write (in little endian). ++ * @VIRTIO_PCIDEV_OP_MMIO_READ: read BAR mem/pio, size can be variable; ++ * the @data field should be filled in by the device (in little endian). ++ * @VIRTIO_PCIDEV_OP_MMIO_WRITE: write BAR mem/pio, size can be variable; ++ * the @data field contains the data to write (in little endian). ++ * @VIRTIO_PCIDEV_OP_MMIO_MEMSET: memset MMIO, size is variable but ++ * the @data field only has one byte (unlike @VIRTIO_PCIDEV_OP_MMIO_WRITE) ++ * @VIRTIO_PCIDEV_OP_INT: legacy INTx# pin interrupt, the addr field is 1-4 for ++ * the number ++ * @VIRTIO_PCIDEV_OP_MSI: MSI(-X) interrupt, this message basically transports ++ * the 16- or 32-bit write that would otherwise be done into memory, ++ * analogous to the write messages (@VIRTIO_PCIDEV_OP_MMIO_WRITE) above ++ * @VIRTIO_PCIDEV_OP_PME: Dummy message whose content is ignored (and should be ++ * all zeroes) to signal the PME# pin. ++ */ ++enum virtio_pcidev_ops { ++ VIRTIO_PCIDEV_OP_RESERVED = 0, ++ VIRTIO_PCIDEV_OP_CFG_READ, ++ VIRTIO_PCIDEV_OP_CFG_WRITE, ++ VIRTIO_PCIDEV_OP_MMIO_READ, ++ VIRTIO_PCIDEV_OP_MMIO_WRITE, ++ VIRTIO_PCIDEV_OP_MMIO_MEMSET, ++ VIRTIO_PCIDEV_OP_INT, ++ VIRTIO_PCIDEV_OP_MSI, ++ VIRTIO_PCIDEV_OP_PME, ++}; ++ ++/** ++ * struct virtio_pcidev_msg - virtio PCI device operation ++ * @op: the operation to do ++ * @bar: the bar (only with BAR read/write messages) ++ * @reserved: reserved ++ * @size: the size of the read/write (in bytes) ++ * @addr: the address to read/write ++ * @data: the data, normally @size long, but just one byte for ++ * %VIRTIO_PCIDEV_OP_MMIO_MEMSET ++ * ++ * Note: the fields are all in native (CPU) endian, however, the ++ * @data values will often be in little endian (see the ops above.) ++ */ ++struct virtio_pcidev_msg { ++ uint8_t op; ++ uint8_t bar; ++ uint16_t reserved; ++ uint32_t size; ++ uint64_t addr; ++ uint8_t data[]; ++}; ++ ++#endif /* _LINUX_VIRTIO_PCIDEV_H */ +diff --git a/include/standard-headers/linux/virtio_scmi.h b/include/standard-headers/linux/virtio_scmi.h +new file mode 100644 +index 0000000000..8f2c305aea +--- /dev/null ++++ b/include/standard-headers/linux/virtio_scmi.h +@@ -0,0 +1,24 @@ ++/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ ++/* ++ * Copyright (C) 2020-2021 OpenSynergy GmbH ++ * Copyright (C) 2021 ARM Ltd. ++ */ ++ ++#ifndef _LINUX_VIRTIO_SCMI_H ++#define _LINUX_VIRTIO_SCMI_H ++ ++#include "standard-headers/linux/virtio_types.h" ++ ++/* Device implements some SCMI notifications, or delayed responses. */ ++#define VIRTIO_SCMI_F_P2A_CHANNELS 0 ++ ++/* Device implements any SCMI statistics shared memory region */ ++#define VIRTIO_SCMI_F_SHARED_MEMORY 1 ++ ++/* Virtqueues */ ++ ++#define VIRTIO_SCMI_VQ_TX 0 /* cmdq */ ++#define VIRTIO_SCMI_VQ_RX 1 /* eventq */ ++#define VIRTIO_SCMI_VQ_MAX_CNT 2 ++ ++#endif /* _LINUX_VIRTIO_SCMI_H */ +diff --git a/linux-headers/asm-generic/unistd.h b/linux-headers/asm-generic/unistd.h +index 4557a8b608..1c48b0ae3b 100644 +--- a/linux-headers/asm-generic/unistd.h ++++ b/linux-headers/asm-generic/unistd.h +@@ -883,8 +883,11 @@ __SYSCALL(__NR_process_mrelease, sys_process_mrelease) + #define __NR_futex_waitv 449 + __SYSCALL(__NR_futex_waitv, sys_futex_waitv) + ++#define __NR_set_mempolicy_home_node 450 ++__SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node) ++ + #undef __NR_syscalls +-#define __NR_syscalls 450 ++#define __NR_syscalls 451 + + /* + * 32 bit systems traditionally used different +diff --git a/linux-headers/asm-mips/unistd_n32.h b/linux-headers/asm-mips/unistd_n32.h +index 4b3e7ad1ec..1f14a6fad3 100644 +--- a/linux-headers/asm-mips/unistd_n32.h ++++ b/linux-headers/asm-mips/unistd_n32.h +@@ -377,5 +377,7 @@ + #define __NR_landlock_add_rule (__NR_Linux + 445) + #define __NR_landlock_restrict_self (__NR_Linux + 446) + #define __NR_process_mrelease (__NR_Linux + 448) ++#define __NR_futex_waitv (__NR_Linux + 449) ++#define __NR_set_mempolicy_home_node (__NR_Linux + 450) + + #endif /* _ASM_UNISTD_N32_H */ +diff --git a/linux-headers/asm-mips/unistd_n64.h b/linux-headers/asm-mips/unistd_n64.h +index 488d9298d9..e5a8ebec78 100644 +--- a/linux-headers/asm-mips/unistd_n64.h ++++ b/linux-headers/asm-mips/unistd_n64.h +@@ -353,5 +353,7 @@ + #define __NR_landlock_add_rule (__NR_Linux + 445) + #define __NR_landlock_restrict_self (__NR_Linux + 446) + #define __NR_process_mrelease (__NR_Linux + 448) ++#define __NR_futex_waitv (__NR_Linux + 449) ++#define __NR_set_mempolicy_home_node (__NR_Linux + 450) + + #endif /* _ASM_UNISTD_N64_H */ +diff --git a/linux-headers/asm-mips/unistd_o32.h b/linux-headers/asm-mips/unistd_o32.h +index f47399870a..871d57168f 100644 +--- a/linux-headers/asm-mips/unistd_o32.h ++++ b/linux-headers/asm-mips/unistd_o32.h +@@ -423,5 +423,7 @@ + #define __NR_landlock_add_rule (__NR_Linux + 445) + #define __NR_landlock_restrict_self (__NR_Linux + 446) + #define __NR_process_mrelease (__NR_Linux + 448) ++#define __NR_futex_waitv (__NR_Linux + 449) ++#define __NR_set_mempolicy_home_node (__NR_Linux + 450) + + #endif /* _ASM_UNISTD_O32_H */ +diff --git a/linux-headers/asm-powerpc/unistd_32.h b/linux-headers/asm-powerpc/unistd_32.h +index 11d54696dc..585c7fefbc 100644 +--- a/linux-headers/asm-powerpc/unistd_32.h ++++ b/linux-headers/asm-powerpc/unistd_32.h +@@ -430,6 +430,8 @@ + #define __NR_landlock_add_rule 445 + #define __NR_landlock_restrict_self 446 + #define __NR_process_mrelease 448 ++#define __NR_futex_waitv 449 ++#define __NR_set_mempolicy_home_node 450 + + + #endif /* _ASM_UNISTD_32_H */ +diff --git a/linux-headers/asm-powerpc/unistd_64.h b/linux-headers/asm-powerpc/unistd_64.h +index cf740bab13..350f7ec0ac 100644 +--- a/linux-headers/asm-powerpc/unistd_64.h ++++ b/linux-headers/asm-powerpc/unistd_64.h +@@ -402,6 +402,8 @@ + #define __NR_landlock_add_rule 445 + #define __NR_landlock_restrict_self 446 + #define __NR_process_mrelease 448 ++#define __NR_futex_waitv 449 ++#define __NR_set_mempolicy_home_node 450 + + + #endif /* _ASM_UNISTD_64_H */ +diff --git a/linux-headers/asm-riscv/bitsperlong.h b/linux-headers/asm-riscv/bitsperlong.h +new file mode 100644 +index 0000000000..cc5c45a9ce +--- /dev/null ++++ b/linux-headers/asm-riscv/bitsperlong.h +@@ -0,0 +1,14 @@ ++/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */ ++/* ++ * Copyright (C) 2012 ARM Ltd. ++ * Copyright (C) 2015 Regents of the University of California ++ */ ++ ++#ifndef _ASM_RISCV_BITSPERLONG_H ++#define _ASM_RISCV_BITSPERLONG_H ++ ++#define __BITS_PER_LONG (__SIZEOF_POINTER__ * 8) ++ ++#include ++ ++#endif /* _ASM_RISCV_BITSPERLONG_H */ +diff --git a/linux-headers/asm-riscv/mman.h b/linux-headers/asm-riscv/mman.h +new file mode 100644 +index 0000000000..8eebf89f5a +--- /dev/null ++++ b/linux-headers/asm-riscv/mman.h +@@ -0,0 +1 @@ ++#include +diff --git a/linux-headers/asm-riscv/unistd.h b/linux-headers/asm-riscv/unistd.h +new file mode 100644 +index 0000000000..8062996c2d +--- /dev/null ++++ b/linux-headers/asm-riscv/unistd.h +@@ -0,0 +1,44 @@ ++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ ++/* ++ * Copyright (C) 2018 David Abdurachmanov ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++#ifdef __LP64__ ++#define __ARCH_WANT_NEW_STAT ++#define __ARCH_WANT_SET_GET_RLIMIT ++#endif /* __LP64__ */ ++ ++#define __ARCH_WANT_SYS_CLONE3 ++ ++#include ++ ++/* ++ * Allows the instruction cache to be flushed from userspace. Despite RISC-V ++ * having a direct 'fence.i' instruction available to userspace (which we ++ * can't trap!), that's not actually viable when running on Linux because the ++ * kernel might schedule a process on another hart. There is no way for ++ * userspace to handle this without invoking the kernel (as it doesn't know the ++ * thread->hart mappings), so we've defined a RISC-V specific system call to ++ * flush the instruction cache. ++ * ++ * __NR_riscv_flush_icache is defined to flush the instruction cache over an ++ * address range, with the flush applying to either all threads or just the ++ * caller. We don't currently do anything with the address range, that's just ++ * in there for forwards compatibility. ++ */ ++#ifndef __NR_riscv_flush_icache ++#define __NR_riscv_flush_icache (__NR_arch_specific_syscall + 15) ++#endif ++__SYSCALL(__NR_riscv_flush_icache, sys_riscv_flush_icache) +diff --git a/linux-headers/asm-s390/unistd_32.h b/linux-headers/asm-s390/unistd_32.h +index 8f97d98128..8e644d65f5 100644 +--- a/linux-headers/asm-s390/unistd_32.h ++++ b/linux-headers/asm-s390/unistd_32.h +@@ -420,5 +420,7 @@ + #define __NR_landlock_add_rule 445 + #define __NR_landlock_restrict_self 446 + #define __NR_process_mrelease 448 ++#define __NR_futex_waitv 449 ++#define __NR_set_mempolicy_home_node 450 + + #endif /* _ASM_S390_UNISTD_32_H */ +diff --git a/linux-headers/asm-s390/unistd_64.h b/linux-headers/asm-s390/unistd_64.h +index 021ffc30e6..51da542fec 100644 +--- a/linux-headers/asm-s390/unistd_64.h ++++ b/linux-headers/asm-s390/unistd_64.h +@@ -368,5 +368,7 @@ + #define __NR_landlock_add_rule 445 + #define __NR_landlock_restrict_self 446 + #define __NR_process_mrelease 448 ++#define __NR_futex_waitv 449 ++#define __NR_set_mempolicy_home_node 450 + + #endif /* _ASM_S390_UNISTD_64_H */ +diff --git a/linux-headers/asm-x86/kvm.h b/linux-headers/asm-x86/kvm.h +index 5a776a08f7..2da3316bb5 100644 +--- a/linux-headers/asm-x86/kvm.h ++++ b/linux-headers/asm-x86/kvm.h +@@ -373,9 +373,23 @@ struct kvm_debugregs { + __u64 reserved[9]; + }; + +-/* for KVM_CAP_XSAVE */ ++/* for KVM_CAP_XSAVE and KVM_CAP_XSAVE2 */ + struct kvm_xsave { ++ /* ++ * KVM_GET_XSAVE2 and KVM_SET_XSAVE write and read as many bytes ++ * as are returned by KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2) ++ * respectively, when invoked on the vm file descriptor. ++ * ++ * The size value returned by KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2) ++ * will always be at least 4096. Currently, it is only greater ++ * than 4096 if a dynamic feature has been enabled with ++ * ``arch_prctl()``, but this may change in the future. ++ * ++ * The offsets of the state save areas in struct kvm_xsave follow ++ * the contents of CPUID leaf 0xD on the host. ++ */ + __u32 region[1024]; ++ __u32 extra[0]; + }; + + #define KVM_MAX_XCRS 16 +diff --git a/linux-headers/asm-x86/unistd_32.h b/linux-headers/asm-x86/unistd_32.h +index 9c9ffe312b..87e1e977af 100644 +--- a/linux-headers/asm-x86/unistd_32.h ++++ b/linux-headers/asm-x86/unistd_32.h +@@ -440,6 +440,7 @@ + #define __NR_memfd_secret 447 + #define __NR_process_mrelease 448 + #define __NR_futex_waitv 449 ++#define __NR_set_mempolicy_home_node 450 + + + #endif /* _ASM_UNISTD_32_H */ +diff --git a/linux-headers/asm-x86/unistd_64.h b/linux-headers/asm-x86/unistd_64.h +index 084f1eef9c..147a78d623 100644 +--- a/linux-headers/asm-x86/unistd_64.h ++++ b/linux-headers/asm-x86/unistd_64.h +@@ -362,6 +362,7 @@ + #define __NR_memfd_secret 447 + #define __NR_process_mrelease 448 + #define __NR_futex_waitv 449 ++#define __NR_set_mempolicy_home_node 450 + + + #endif /* _ASM_UNISTD_64_H */ +diff --git a/linux-headers/asm-x86/unistd_x32.h b/linux-headers/asm-x86/unistd_x32.h +index a2441affc2..27098db7fb 100644 +--- a/linux-headers/asm-x86/unistd_x32.h ++++ b/linux-headers/asm-x86/unistd_x32.h +@@ -315,6 +315,7 @@ + #define __NR_memfd_secret (__X32_SYSCALL_BIT + 447) + #define __NR_process_mrelease (__X32_SYSCALL_BIT + 448) + #define __NR_futex_waitv (__X32_SYSCALL_BIT + 449) ++#define __NR_set_mempolicy_home_node (__X32_SYSCALL_BIT + 450) + #define __NR_rt_sigaction (__X32_SYSCALL_BIT + 512) + #define __NR_rt_sigreturn (__X32_SYSCALL_BIT + 513) + #define __NR_ioctl (__X32_SYSCALL_BIT + 514) +diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h +index 02c5e7b7bb..00af3bc333 100644 +--- a/linux-headers/linux/kvm.h ++++ b/linux-headers/linux/kvm.h +@@ -1130,6 +1130,9 @@ struct kvm_ppc_resize_hpt { + #define KVM_CAP_BINARY_STATS_FD 203 + #define KVM_CAP_EXIT_ON_EMULATION_FAILURE 204 + #define KVM_CAP_ARM_MTE 205 ++#define KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM 206 ++#define KVM_CAP_VM_GPA_BITS 207 ++#define KVM_CAP_XSAVE2 208 + + #ifdef KVM_CAP_IRQ_ROUTING + +@@ -1161,11 +1164,20 @@ struct kvm_irq_routing_hv_sint { + __u32 sint; + }; + ++struct kvm_irq_routing_xen_evtchn { ++ __u32 port; ++ __u32 vcpu; ++ __u32 priority; ++}; ++ ++#define KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL ((__u32)(-1)) ++ + /* gsi routing entry types */ + #define KVM_IRQ_ROUTING_IRQCHIP 1 + #define KVM_IRQ_ROUTING_MSI 2 + #define KVM_IRQ_ROUTING_S390_ADAPTER 3 + #define KVM_IRQ_ROUTING_HV_SINT 4 ++#define KVM_IRQ_ROUTING_XEN_EVTCHN 5 + + struct kvm_irq_routing_entry { + __u32 gsi; +@@ -1177,6 +1189,7 @@ struct kvm_irq_routing_entry { + struct kvm_irq_routing_msi msi; + struct kvm_irq_routing_s390_adapter adapter; + struct kvm_irq_routing_hv_sint hv_sint; ++ struct kvm_irq_routing_xen_evtchn xen_evtchn; + __u32 pad[8]; + } u; + }; +@@ -1207,6 +1220,7 @@ struct kvm_x86_mce { + #define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL (1 << 1) + #define KVM_XEN_HVM_CONFIG_SHARED_INFO (1 << 2) + #define KVM_XEN_HVM_CONFIG_RUNSTATE (1 << 3) ++#define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 4) + + struct kvm_xen_hvm_config { + __u32 flags; +@@ -1609,6 +1623,9 @@ struct kvm_enc_region { + #define KVM_S390_NORMAL_RESET _IO(KVMIO, 0xc3) + #define KVM_S390_CLEAR_RESET _IO(KVMIO, 0xc4) + ++/* Available with KVM_CAP_XSAVE2 */ ++#define KVM_GET_XSAVE2 _IOR(KVMIO, 0xcf, struct kvm_xsave) ++ + struct kvm_s390_pv_sec_parm { + __u64 origin; + __u64 length; +-- +2.35.3 + diff --git a/SOURCES/kvm-linux-headers-include-missing-changes-from-5.17.patch b/SOURCES/kvm-linux-headers-include-missing-changes-from-5.17.patch new file mode 100644 index 0000000..1319926 --- /dev/null +++ b/SOURCES/kvm-linux-headers-include-missing-changes-from-5.17.patch @@ -0,0 +1,58 @@ +From aa6181d87e2b4ef1a70be002881908d2df5548a9 Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Tue, 22 Feb 2022 17:58:11 +0100 +Subject: [PATCH 04/24] linux-headers: include missing changes from 5.17 + +RH-Author: Paul Lai +RH-MergeRequest: 176: Enable KVM AMX support +RH-Commit: [4/13] 2ed7cbc07e63d85cda916ef44d1e82b1fba7fdf4 +RH-Bugzilla: 1916415 +RH-Acked-by: Cornelia Huck +RH-Acked-by: Igor Mammedov +RH-Acked-by: Paolo Bonzini + +Signed-off-by: Paolo Bonzini +(cherry picked from commit 1ea5208febcc068449b63282d72bb719ab67a466) +Signed-off-by: Paul Lai +--- + linux-headers/asm-x86/kvm.h | 3 +++ + linux-headers/linux/kvm.h | 4 ++++ + 2 files changed, 7 insertions(+) + +diff --git a/linux-headers/asm-x86/kvm.h b/linux-headers/asm-x86/kvm.h +index 2da3316bb5..bf6e96011d 100644 +--- a/linux-headers/asm-x86/kvm.h ++++ b/linux-headers/asm-x86/kvm.h +@@ -452,6 +452,9 @@ struct kvm_sync_regs { + + #define KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE 0x00000001 + ++/* attributes for system fd (group 0) */ ++#define KVM_X86_XCOMP_GUEST_SUPP 0 ++ + struct kvm_vmx_nested_state_data { + __u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; + __u8 shadow_vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; +diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h +index 00af3bc333..d232feaae9 100644 +--- a/linux-headers/linux/kvm.h ++++ b/linux-headers/linux/kvm.h +@@ -1133,6 +1133,7 @@ struct kvm_ppc_resize_hpt { + #define KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM 206 + #define KVM_CAP_VM_GPA_BITS 207 + #define KVM_CAP_XSAVE2 208 ++#define KVM_CAP_SYS_ATTRIBUTES 209 + + #ifdef KVM_CAP_IRQ_ROUTING + +@@ -2047,4 +2048,7 @@ struct kvm_stats_desc { + + #define KVM_GET_STATS_FD _IO(KVMIO, 0xce) + ++/* Available with KVM_CAP_XSAVE2 */ ++#define KVM_GET_XSAVE2 _IOR(KVMIO, 0xcf, struct kvm_xsave) ++ + #endif /* __LINUX_KVM_H */ +-- +2.35.3 + diff --git a/SOURCES/kvm-linux-headers-update-to-5.16-rc1.patch b/SOURCES/kvm-linux-headers-update-to-5.16-rc1.patch new file mode 100644 index 0000000..1ad047b --- /dev/null +++ b/SOURCES/kvm-linux-headers-update-to-5.16-rc1.patch @@ -0,0 +1,725 @@ +From 64808db4a14867ad774b5e7535972a886e20a156 Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Thu, 11 Nov 2021 12:06:01 +0100 +Subject: [PATCH 02/24] linux-headers: update to 5.16-rc1 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Paul Lai +RH-MergeRequest: 176: Enable KVM AMX support +RH-Commit: [2/13] 4af2f4942db029b81890e3862793fb54b62791cc +RH-Bugzilla: 1916415 +RH-Acked-by: Cornelia Huck +RH-Acked-by: Igor Mammedov +RH-Acked-by: Paolo Bonzini + +Signed-off-by: Paolo Bonzini +Acked-by: Cornelia Huck +Reviewed-by: Alex Bennée +Message-Id: <20211111110604.207376-3-pbonzini@redhat.com> +Signed-off-by: Paolo Bonzini +(cherry picked from commit 43709a0ca3b09e952bde3f38112f1d7fbf7c65b1) +Signed-off-by: Paul Lai +--- + include/standard-headers/drm/drm_fourcc.h | 121 +++++++++++++++++- + include/standard-headers/linux/ethtool.h | 31 +++++ + include/standard-headers/linux/fuse.h | 10 +- + include/standard-headers/linux/pci_regs.h | 6 + + include/standard-headers/linux/virtio_gpu.h | 18 ++- + include/standard-headers/linux/virtio_ids.h | 24 ++++ + include/standard-headers/linux/virtio_vsock.h | 3 +- + linux-headers/asm-arm64/unistd.h | 1 + + linux-headers/asm-generic/unistd.h | 22 +++- + linux-headers/asm-mips/unistd_n32.h | 1 + + linux-headers/asm-mips/unistd_n64.h | 1 + + linux-headers/asm-mips/unistd_o32.h | 1 + + linux-headers/asm-powerpc/unistd_32.h | 1 + + linux-headers/asm-powerpc/unistd_64.h | 1 + + linux-headers/asm-s390/unistd_32.h | 1 + + linux-headers/asm-s390/unistd_64.h | 1 + + linux-headers/asm-x86/kvm.h | 5 + + linux-headers/asm-x86/unistd_32.h | 3 + + linux-headers/asm-x86/unistd_64.h | 3 + + linux-headers/asm-x86/unistd_x32.h | 3 + + linux-headers/linux/kvm.h | 40 +++++- + 21 files changed, 276 insertions(+), 21 deletions(-) + +diff --git a/include/standard-headers/drm/drm_fourcc.h b/include/standard-headers/drm/drm_fourcc.h +index 352b51fd0a..2c025cb4fe 100644 +--- a/include/standard-headers/drm/drm_fourcc.h ++++ b/include/standard-headers/drm/drm_fourcc.h +@@ -103,6 +103,12 @@ extern "C" { + /* 8 bpp Red */ + #define DRM_FORMAT_R8 fourcc_code('R', '8', ' ', ' ') /* [7:0] R */ + ++/* 10 bpp Red */ ++#define DRM_FORMAT_R10 fourcc_code('R', '1', '0', ' ') /* [15:0] x:R 6:10 little endian */ ++ ++/* 12 bpp Red */ ++#define DRM_FORMAT_R12 fourcc_code('R', '1', '2', ' ') /* [15:0] x:R 4:12 little endian */ ++ + /* 16 bpp Red */ + #define DRM_FORMAT_R16 fourcc_code('R', '1', '6', ' ') /* [15:0] R little endian */ + +@@ -372,6 +378,12 @@ extern "C" { + + #define DRM_FORMAT_RESERVED ((1ULL << 56) - 1) + ++#define fourcc_mod_get_vendor(modifier) \ ++ (((modifier) >> 56) & 0xff) ++ ++#define fourcc_mod_is_vendor(modifier, vendor) \ ++ (fourcc_mod_get_vendor(modifier) == DRM_FORMAT_MOD_VENDOR_## vendor) ++ + #define fourcc_mod_code(vendor, val) \ + ((((uint64_t)DRM_FORMAT_MOD_VENDOR_## vendor) << 56) | ((val) & 0x00ffffffffffffffULL)) + +@@ -899,9 +911,9 @@ drm_fourcc_canonicalize_nvidia_format_mod(uint64_t modifier) + + /* + * The top 4 bits (out of the 56 bits alloted for specifying vendor specific +- * modifiers) denote the category for modifiers. Currently we have only two +- * categories of modifiers ie AFBC and MISC. We can have a maximum of sixteen +- * different categories. ++ * modifiers) denote the category for modifiers. Currently we have three ++ * categories of modifiers ie AFBC, MISC and AFRC. We can have a maximum of ++ * sixteen different categories. + */ + #define DRM_FORMAT_MOD_ARM_CODE(__type, __val) \ + fourcc_mod_code(ARM, ((uint64_t)(__type) << 52) | ((__val) & 0x000fffffffffffffULL)) +@@ -1016,6 +1028,109 @@ drm_fourcc_canonicalize_nvidia_format_mod(uint64_t modifier) + */ + #define AFBC_FORMAT_MOD_USM (1ULL << 12) + ++/* ++ * Arm Fixed-Rate Compression (AFRC) modifiers ++ * ++ * AFRC is a proprietary fixed rate image compression protocol and format, ++ * designed to provide guaranteed bandwidth and memory footprint ++ * reductions in graphics and media use-cases. ++ * ++ * AFRC buffers consist of one or more planes, with the same components ++ * and meaning as an uncompressed buffer using the same pixel format. ++ * ++ * Within each plane, the pixel/luma/chroma values are grouped into ++ * "coding unit" blocks which are individually compressed to a ++ * fixed size (in bytes). All coding units within a given plane of a buffer ++ * store the same number of values, and have the same compressed size. ++ * ++ * The coding unit size is configurable, allowing different rates of compression. ++ * ++ * The start of each AFRC buffer plane must be aligned to an alignment granule which ++ * depends on the coding unit size. ++ * ++ * Coding Unit Size Plane Alignment ++ * ---------------- --------------- ++ * 16 bytes 1024 bytes ++ * 24 bytes 512 bytes ++ * 32 bytes 2048 bytes ++ * ++ * Coding units are grouped into paging tiles. AFRC buffer dimensions must be aligned ++ * to a multiple of the paging tile dimensions. ++ * The dimensions of each paging tile depend on whether the buffer is optimised for ++ * scanline (SCAN layout) or rotated (ROT layout) access. ++ * ++ * Layout Paging Tile Width Paging Tile Height ++ * ------ ----------------- ------------------ ++ * SCAN 16 coding units 4 coding units ++ * ROT 8 coding units 8 coding units ++ * ++ * The dimensions of each coding unit depend on the number of components ++ * in the compressed plane and whether the buffer is optimised for ++ * scanline (SCAN layout) or rotated (ROT layout) access. ++ * ++ * Number of Components in Plane Layout Coding Unit Width Coding Unit Height ++ * ----------------------------- --------- ----------------- ------------------ ++ * 1 SCAN 16 samples 4 samples ++ * Example: 16x4 luma samples in a 'Y' plane ++ * 16x4 chroma 'V' values, in the 'V' plane of a fully-planar YUV buffer ++ * ----------------------------- --------- ----------------- ------------------ ++ * 1 ROT 8 samples 8 samples ++ * Example: 8x8 luma samples in a 'Y' plane ++ * 8x8 chroma 'V' values, in the 'V' plane of a fully-planar YUV buffer ++ * ----------------------------- --------- ----------------- ------------------ ++ * 2 DONT CARE 8 samples 4 samples ++ * Example: 8x4 chroma pairs in the 'UV' plane of a semi-planar YUV buffer ++ * ----------------------------- --------- ----------------- ------------------ ++ * 3 DONT CARE 4 samples 4 samples ++ * Example: 4x4 pixels in an RGB buffer without alpha ++ * ----------------------------- --------- ----------------- ------------------ ++ * 4 DONT CARE 4 samples 4 samples ++ * Example: 4x4 pixels in an RGB buffer with alpha ++ */ ++ ++#define DRM_FORMAT_MOD_ARM_TYPE_AFRC 0x02 ++ ++#define DRM_FORMAT_MOD_ARM_AFRC(__afrc_mode) \ ++ DRM_FORMAT_MOD_ARM_CODE(DRM_FORMAT_MOD_ARM_TYPE_AFRC, __afrc_mode) ++ ++/* ++ * AFRC coding unit size modifier. ++ * ++ * Indicates the number of bytes used to store each compressed coding unit for ++ * one or more planes in an AFRC encoded buffer. The coding unit size for chrominance ++ * is the same for both Cb and Cr, which may be stored in separate planes. ++ * ++ * AFRC_FORMAT_MOD_CU_SIZE_P0 indicates the number of bytes used to store ++ * each compressed coding unit in the first plane of the buffer. For RGBA buffers ++ * this is the only plane, while for semi-planar and fully-planar YUV buffers, ++ * this corresponds to the luma plane. ++ * ++ * AFRC_FORMAT_MOD_CU_SIZE_P12 indicates the number of bytes used to store ++ * each compressed coding unit in the second and third planes in the buffer. ++ * For semi-planar and fully-planar YUV buffers, this corresponds to the chroma plane(s). ++ * ++ * For single-plane buffers, AFRC_FORMAT_MOD_CU_SIZE_P0 must be specified ++ * and AFRC_FORMAT_MOD_CU_SIZE_P12 must be zero. ++ * For semi-planar and fully-planar buffers, both AFRC_FORMAT_MOD_CU_SIZE_P0 and ++ * AFRC_FORMAT_MOD_CU_SIZE_P12 must be specified. ++ */ ++#define AFRC_FORMAT_MOD_CU_SIZE_MASK 0xf ++#define AFRC_FORMAT_MOD_CU_SIZE_16 (1ULL) ++#define AFRC_FORMAT_MOD_CU_SIZE_24 (2ULL) ++#define AFRC_FORMAT_MOD_CU_SIZE_32 (3ULL) ++ ++#define AFRC_FORMAT_MOD_CU_SIZE_P0(__afrc_cu_size) (__afrc_cu_size) ++#define AFRC_FORMAT_MOD_CU_SIZE_P12(__afrc_cu_size) ((__afrc_cu_size) << 4) ++ ++/* ++ * AFRC scanline memory layout. ++ * ++ * Indicates if the buffer uses the scanline-optimised layout ++ * for an AFRC encoded buffer, otherwise, it uses the rotation-optimised layout. ++ * The memory layout is the same for all planes. ++ */ ++#define AFRC_FORMAT_MOD_LAYOUT_SCAN (1ULL << 8) ++ + /* + * Arm 16x16 Block U-Interleaved modifier + * +diff --git a/include/standard-headers/linux/ethtool.h b/include/standard-headers/linux/ethtool.h +index 053d3fafdf..688eb8dc39 100644 +--- a/include/standard-headers/linux/ethtool.h ++++ b/include/standard-headers/linux/ethtool.h +@@ -603,6 +603,7 @@ enum ethtool_link_ext_state { + ETHTOOL_LINK_EXT_STATE_CALIBRATION_FAILURE, + ETHTOOL_LINK_EXT_STATE_POWER_BUDGET_EXCEEDED, + ETHTOOL_LINK_EXT_STATE_OVERHEAT, ++ ETHTOOL_LINK_EXT_STATE_MODULE, + }; + + /* More information in addition to ETHTOOL_LINK_EXT_STATE_AUTONEG. */ +@@ -639,6 +640,8 @@ enum ethtool_link_ext_substate_link_logical_mismatch { + enum ethtool_link_ext_substate_bad_signal_integrity { + ETHTOOL_LINK_EXT_SUBSTATE_BSI_LARGE_NUMBER_OF_PHYSICAL_ERRORS = 1, + ETHTOOL_LINK_EXT_SUBSTATE_BSI_UNSUPPORTED_RATE, ++ ETHTOOL_LINK_EXT_SUBSTATE_BSI_SERDES_REFERENCE_CLOCK_LOST, ++ ETHTOOL_LINK_EXT_SUBSTATE_BSI_SERDES_ALOS, + }; + + /* More information in addition to ETHTOOL_LINK_EXT_STATE_CABLE_ISSUE. */ +@@ -647,6 +650,11 @@ enum ethtool_link_ext_substate_cable_issue { + ETHTOOL_LINK_EXT_SUBSTATE_CI_CABLE_TEST_FAILURE, + }; + ++/* More information in addition to ETHTOOL_LINK_EXT_STATE_MODULE. */ ++enum ethtool_link_ext_substate_module { ++ ETHTOOL_LINK_EXT_SUBSTATE_MODULE_CMIS_NOT_READY = 1, ++}; ++ + #define ETH_GSTRING_LEN 32 + + /** +@@ -704,6 +712,29 @@ enum ethtool_stringset { + ETH_SS_COUNT + }; + ++/** ++ * enum ethtool_module_power_mode_policy - plug-in module power mode policy ++ * @ETHTOOL_MODULE_POWER_MODE_POLICY_HIGH: Module is always in high power mode. ++ * @ETHTOOL_MODULE_POWER_MODE_POLICY_AUTO: Module is transitioned by the host ++ * to high power mode when the first port using it is put administratively ++ * up and to low power mode when the last port using it is put ++ * administratively down. ++ */ ++enum ethtool_module_power_mode_policy { ++ ETHTOOL_MODULE_POWER_MODE_POLICY_HIGH = 1, ++ ETHTOOL_MODULE_POWER_MODE_POLICY_AUTO, ++}; ++ ++/** ++ * enum ethtool_module_power_mode - plug-in module power mode ++ * @ETHTOOL_MODULE_POWER_MODE_LOW: Module is in low power mode. ++ * @ETHTOOL_MODULE_POWER_MODE_HIGH: Module is in high power mode. ++ */ ++enum ethtool_module_power_mode { ++ ETHTOOL_MODULE_POWER_MODE_LOW = 1, ++ ETHTOOL_MODULE_POWER_MODE_HIGH, ++}; ++ + /** + * struct ethtool_gstrings - string set for data tagging + * @cmd: Command number = %ETHTOOL_GSTRINGS +diff --git a/include/standard-headers/linux/fuse.h b/include/standard-headers/linux/fuse.h +index cce105bfba..23ea31708b 100644 +--- a/include/standard-headers/linux/fuse.h ++++ b/include/standard-headers/linux/fuse.h +@@ -181,6 +181,9 @@ + * - add FUSE_OPEN_KILL_SUIDGID + * - extend fuse_setxattr_in, add FUSE_SETXATTR_EXT + * - add FUSE_SETXATTR_ACL_KILL_SGID ++ * ++ * 7.34 ++ * - add FUSE_SYNCFS + */ + + #ifndef _LINUX_FUSE_H +@@ -212,7 +215,7 @@ + #define FUSE_KERNEL_VERSION 7 + + /** Minor version number of this interface */ +-#define FUSE_KERNEL_MINOR_VERSION 33 ++#define FUSE_KERNEL_MINOR_VERSION 34 + + /** The node ID of the root inode */ + #define FUSE_ROOT_ID 1 +@@ -505,6 +508,7 @@ enum fuse_opcode { + FUSE_COPY_FILE_RANGE = 47, + FUSE_SETUPMAPPING = 48, + FUSE_REMOVEMAPPING = 49, ++ FUSE_SYNCFS = 50, + + /* CUSE specific operations */ + CUSE_INIT = 4096, +@@ -967,4 +971,8 @@ struct fuse_removemapping_one { + #define FUSE_REMOVEMAPPING_MAX_ENTRY \ + (PAGE_SIZE / sizeof(struct fuse_removemapping_one)) + ++struct fuse_syncfs_in { ++ uint64_t padding; ++}; ++ + #endif /* _LINUX_FUSE_H */ +diff --git a/include/standard-headers/linux/pci_regs.h b/include/standard-headers/linux/pci_regs.h +index e709ae8235..ff6ccbc6ef 100644 +--- a/include/standard-headers/linux/pci_regs.h ++++ b/include/standard-headers/linux/pci_regs.h +@@ -504,6 +504,12 @@ + #define PCI_EXP_DEVCTL_URRE 0x0008 /* Unsupported Request Reporting En. */ + #define PCI_EXP_DEVCTL_RELAX_EN 0x0010 /* Enable relaxed ordering */ + #define PCI_EXP_DEVCTL_PAYLOAD 0x00e0 /* Max_Payload_Size */ ++#define PCI_EXP_DEVCTL_PAYLOAD_128B 0x0000 /* 128 Bytes */ ++#define PCI_EXP_DEVCTL_PAYLOAD_256B 0x0020 /* 256 Bytes */ ++#define PCI_EXP_DEVCTL_PAYLOAD_512B 0x0040 /* 512 Bytes */ ++#define PCI_EXP_DEVCTL_PAYLOAD_1024B 0x0060 /* 1024 Bytes */ ++#define PCI_EXP_DEVCTL_PAYLOAD_2048B 0x0080 /* 2048 Bytes */ ++#define PCI_EXP_DEVCTL_PAYLOAD_4096B 0x00a0 /* 4096 Bytes */ + #define PCI_EXP_DEVCTL_EXT_TAG 0x0100 /* Extended Tag Field Enable */ + #define PCI_EXP_DEVCTL_PHANTOM 0x0200 /* Phantom Functions Enable */ + #define PCI_EXP_DEVCTL_AUX_PME 0x0400 /* Auxiliary Power PM Enable */ +diff --git a/include/standard-headers/linux/virtio_gpu.h b/include/standard-headers/linux/virtio_gpu.h +index 1357e4774e..2da48d3d4c 100644 +--- a/include/standard-headers/linux/virtio_gpu.h ++++ b/include/standard-headers/linux/virtio_gpu.h +@@ -59,6 +59,11 @@ + * VIRTIO_GPU_CMD_RESOURCE_CREATE_BLOB + */ + #define VIRTIO_GPU_F_RESOURCE_BLOB 3 ++/* ++ * VIRTIO_GPU_CMD_CREATE_CONTEXT with ++ * context_init and multiple timelines ++ */ ++#define VIRTIO_GPU_F_CONTEXT_INIT 4 + + enum virtio_gpu_ctrl_type { + VIRTIO_GPU_UNDEFINED = 0, +@@ -122,14 +127,20 @@ enum virtio_gpu_shm_id { + VIRTIO_GPU_SHM_ID_HOST_VISIBLE = 1 + }; + +-#define VIRTIO_GPU_FLAG_FENCE (1 << 0) ++#define VIRTIO_GPU_FLAG_FENCE (1 << 0) ++/* ++ * If the following flag is set, then ring_idx contains the index ++ * of the command ring that needs to used when creating the fence ++ */ ++#define VIRTIO_GPU_FLAG_INFO_RING_IDX (1 << 1) + + struct virtio_gpu_ctrl_hdr { + uint32_t type; + uint32_t flags; + uint64_t fence_id; + uint32_t ctx_id; +- uint32_t padding; ++ uint8_t ring_idx; ++ uint8_t padding[3]; + }; + + /* data passed in the cursor vq */ +@@ -269,10 +280,11 @@ struct virtio_gpu_resource_create_3d { + }; + + /* VIRTIO_GPU_CMD_CTX_CREATE */ ++#define VIRTIO_GPU_CONTEXT_INIT_CAPSET_ID_MASK 0x000000ff + struct virtio_gpu_ctx_create { + struct virtio_gpu_ctrl_hdr hdr; + uint32_t nlen; +- uint32_t padding; ++ uint32_t context_init; + char debug_name[64]; + }; + +diff --git a/include/standard-headers/linux/virtio_ids.h b/include/standard-headers/linux/virtio_ids.h +index 4fe842c3a3..80d76b75bc 100644 +--- a/include/standard-headers/linux/virtio_ids.h ++++ b/include/standard-headers/linux/virtio_ids.h +@@ -54,7 +54,31 @@ + #define VIRTIO_ID_SOUND 25 /* virtio sound */ + #define VIRTIO_ID_FS 26 /* virtio filesystem */ + #define VIRTIO_ID_PMEM 27 /* virtio pmem */ ++#define VIRTIO_ID_RPMB 28 /* virtio rpmb */ + #define VIRTIO_ID_MAC80211_HWSIM 29 /* virtio mac80211-hwsim */ ++#define VIRTIO_ID_VIDEO_ENCODER 30 /* virtio video encoder */ ++#define VIRTIO_ID_VIDEO_DECODER 31 /* virtio video decoder */ ++#define VIRTIO_ID_SCMI 32 /* virtio SCMI */ ++#define VIRTIO_ID_NITRO_SEC_MOD 33 /* virtio nitro secure module*/ ++#define VIRTIO_ID_I2C_ADAPTER 34 /* virtio i2c adapter */ ++#define VIRTIO_ID_WATCHDOG 35 /* virtio watchdog */ ++#define VIRTIO_ID_CAN 36 /* virtio can */ ++#define VIRTIO_ID_DMABUF 37 /* virtio dmabuf */ ++#define VIRTIO_ID_PARAM_SERV 38 /* virtio parameter server */ ++#define VIRTIO_ID_AUDIO_POLICY 39 /* virtio audio policy */ + #define VIRTIO_ID_BT 40 /* virtio bluetooth */ ++#define VIRTIO_ID_GPIO 41 /* virtio gpio */ ++ ++/* ++ * Virtio Transitional IDs ++ */ ++ ++#define VIRTIO_TRANS_ID_NET 1000 /* transitional virtio net */ ++#define VIRTIO_TRANS_ID_BLOCK 1001 /* transitional virtio block */ ++#define VIRTIO_TRANS_ID_BALLOON 1002 /* transitional virtio balloon */ ++#define VIRTIO_TRANS_ID_CONSOLE 1003 /* transitional virtio console */ ++#define VIRTIO_TRANS_ID_SCSI 1004 /* transitional virtio SCSI */ ++#define VIRTIO_TRANS_ID_RNG 1005 /* transitional virtio rng */ ++#define VIRTIO_TRANS_ID_9P 1009 /* transitional virtio 9p console */ + + #endif /* _LINUX_VIRTIO_IDS_H */ +diff --git a/include/standard-headers/linux/virtio_vsock.h b/include/standard-headers/linux/virtio_vsock.h +index 3a23488e42..467e751b17 100644 +--- a/include/standard-headers/linux/virtio_vsock.h ++++ b/include/standard-headers/linux/virtio_vsock.h +@@ -97,7 +97,8 @@ enum virtio_vsock_shutdown { + + /* VIRTIO_VSOCK_OP_RW flags values */ + enum virtio_vsock_rw { +- VIRTIO_VSOCK_SEQ_EOR = 1, ++ VIRTIO_VSOCK_SEQ_EOM = 1, ++ VIRTIO_VSOCK_SEQ_EOR = 2, + }; + + #endif /* _LINUX_VIRTIO_VSOCK_H */ +diff --git a/linux-headers/asm-arm64/unistd.h b/linux-headers/asm-arm64/unistd.h +index f83a70e07d..ce2ee8f1e3 100644 +--- a/linux-headers/asm-arm64/unistd.h ++++ b/linux-headers/asm-arm64/unistd.h +@@ -20,5 +20,6 @@ + #define __ARCH_WANT_SET_GET_RLIMIT + #define __ARCH_WANT_TIME32_SYSCALLS + #define __ARCH_WANT_SYS_CLONE3 ++#define __ARCH_WANT_MEMFD_SECRET + + #include +diff --git a/linux-headers/asm-generic/unistd.h b/linux-headers/asm-generic/unistd.h +index f211961ce1..4557a8b608 100644 +--- a/linux-headers/asm-generic/unistd.h ++++ b/linux-headers/asm-generic/unistd.h +@@ -673,15 +673,15 @@ __SYSCALL(__NR_madvise, sys_madvise) + #define __NR_remap_file_pages 234 + __SYSCALL(__NR_remap_file_pages, sys_remap_file_pages) + #define __NR_mbind 235 +-__SC_COMP(__NR_mbind, sys_mbind, compat_sys_mbind) ++__SYSCALL(__NR_mbind, sys_mbind) + #define __NR_get_mempolicy 236 +-__SC_COMP(__NR_get_mempolicy, sys_get_mempolicy, compat_sys_get_mempolicy) ++__SYSCALL(__NR_get_mempolicy, sys_get_mempolicy) + #define __NR_set_mempolicy 237 +-__SC_COMP(__NR_set_mempolicy, sys_set_mempolicy, compat_sys_set_mempolicy) ++__SYSCALL(__NR_set_mempolicy, sys_set_mempolicy) + #define __NR_migrate_pages 238 +-__SC_COMP(__NR_migrate_pages, sys_migrate_pages, compat_sys_migrate_pages) ++__SYSCALL(__NR_migrate_pages, sys_migrate_pages) + #define __NR_move_pages 239 +-__SC_COMP(__NR_move_pages, sys_move_pages, compat_sys_move_pages) ++__SYSCALL(__NR_move_pages, sys_move_pages) + #endif + + #define __NR_rt_tgsigqueueinfo 240 +@@ -873,8 +873,18 @@ __SYSCALL(__NR_landlock_add_rule, sys_landlock_add_rule) + #define __NR_landlock_restrict_self 446 + __SYSCALL(__NR_landlock_restrict_self, sys_landlock_restrict_self) + ++#ifdef __ARCH_WANT_MEMFD_SECRET ++#define __NR_memfd_secret 447 ++__SYSCALL(__NR_memfd_secret, sys_memfd_secret) ++#endif ++#define __NR_process_mrelease 448 ++__SYSCALL(__NR_process_mrelease, sys_process_mrelease) ++ ++#define __NR_futex_waitv 449 ++__SYSCALL(__NR_futex_waitv, sys_futex_waitv) ++ + #undef __NR_syscalls +-#define __NR_syscalls 447 ++#define __NR_syscalls 450 + + /* + * 32 bit systems traditionally used different +diff --git a/linux-headers/asm-mips/unistd_n32.h b/linux-headers/asm-mips/unistd_n32.h +index 09cd297698..4b3e7ad1ec 100644 +--- a/linux-headers/asm-mips/unistd_n32.h ++++ b/linux-headers/asm-mips/unistd_n32.h +@@ -376,5 +376,6 @@ + #define __NR_landlock_create_ruleset (__NR_Linux + 444) + #define __NR_landlock_add_rule (__NR_Linux + 445) + #define __NR_landlock_restrict_self (__NR_Linux + 446) ++#define __NR_process_mrelease (__NR_Linux + 448) + + #endif /* _ASM_UNISTD_N32_H */ +diff --git a/linux-headers/asm-mips/unistd_n64.h b/linux-headers/asm-mips/unistd_n64.h +index 780e0cead6..488d9298d9 100644 +--- a/linux-headers/asm-mips/unistd_n64.h ++++ b/linux-headers/asm-mips/unistd_n64.h +@@ -352,5 +352,6 @@ + #define __NR_landlock_create_ruleset (__NR_Linux + 444) + #define __NR_landlock_add_rule (__NR_Linux + 445) + #define __NR_landlock_restrict_self (__NR_Linux + 446) ++#define __NR_process_mrelease (__NR_Linux + 448) + + #endif /* _ASM_UNISTD_N64_H */ +diff --git a/linux-headers/asm-mips/unistd_o32.h b/linux-headers/asm-mips/unistd_o32.h +index 06a2b3b55e..f47399870a 100644 +--- a/linux-headers/asm-mips/unistd_o32.h ++++ b/linux-headers/asm-mips/unistd_o32.h +@@ -422,5 +422,6 @@ + #define __NR_landlock_create_ruleset (__NR_Linux + 444) + #define __NR_landlock_add_rule (__NR_Linux + 445) + #define __NR_landlock_restrict_self (__NR_Linux + 446) ++#define __NR_process_mrelease (__NR_Linux + 448) + + #endif /* _ASM_UNISTD_O32_H */ +diff --git a/linux-headers/asm-powerpc/unistd_32.h b/linux-headers/asm-powerpc/unistd_32.h +index cd5a8a41b2..11d54696dc 100644 +--- a/linux-headers/asm-powerpc/unistd_32.h ++++ b/linux-headers/asm-powerpc/unistd_32.h +@@ -429,6 +429,7 @@ + #define __NR_landlock_create_ruleset 444 + #define __NR_landlock_add_rule 445 + #define __NR_landlock_restrict_self 446 ++#define __NR_process_mrelease 448 + + + #endif /* _ASM_UNISTD_32_H */ +diff --git a/linux-headers/asm-powerpc/unistd_64.h b/linux-headers/asm-powerpc/unistd_64.h +index 8458effa8d..cf740bab13 100644 +--- a/linux-headers/asm-powerpc/unistd_64.h ++++ b/linux-headers/asm-powerpc/unistd_64.h +@@ -401,6 +401,7 @@ + #define __NR_landlock_create_ruleset 444 + #define __NR_landlock_add_rule 445 + #define __NR_landlock_restrict_self 446 ++#define __NR_process_mrelease 448 + + + #endif /* _ASM_UNISTD_64_H */ +diff --git a/linux-headers/asm-s390/unistd_32.h b/linux-headers/asm-s390/unistd_32.h +index 0c3cd299e4..8f97d98128 100644 +--- a/linux-headers/asm-s390/unistd_32.h ++++ b/linux-headers/asm-s390/unistd_32.h +@@ -419,5 +419,6 @@ + #define __NR_landlock_create_ruleset 444 + #define __NR_landlock_add_rule 445 + #define __NR_landlock_restrict_self 446 ++#define __NR_process_mrelease 448 + + #endif /* _ASM_S390_UNISTD_32_H */ +diff --git a/linux-headers/asm-s390/unistd_64.h b/linux-headers/asm-s390/unistd_64.h +index 8dfc08b5e6..021ffc30e6 100644 +--- a/linux-headers/asm-s390/unistd_64.h ++++ b/linux-headers/asm-s390/unistd_64.h +@@ -367,5 +367,6 @@ + #define __NR_landlock_create_ruleset 444 + #define __NR_landlock_add_rule 445 + #define __NR_landlock_restrict_self 446 ++#define __NR_process_mrelease 448 + + #endif /* _ASM_S390_UNISTD_64_H */ +diff --git a/linux-headers/asm-x86/kvm.h b/linux-headers/asm-x86/kvm.h +index a6c327f8ad..5a776a08f7 100644 +--- a/linux-headers/asm-x86/kvm.h ++++ b/linux-headers/asm-x86/kvm.h +@@ -295,6 +295,7 @@ struct kvm_debug_exit_arch { + #define KVM_GUESTDBG_USE_HW_BP 0x00020000 + #define KVM_GUESTDBG_INJECT_DB 0x00040000 + #define KVM_GUESTDBG_INJECT_BP 0x00080000 ++#define KVM_GUESTDBG_BLOCKIRQ 0x00100000 + + /* for KVM_SET_GUEST_DEBUG */ + struct kvm_guest_debug_arch { +@@ -503,4 +504,8 @@ struct kvm_pmu_event_filter { + #define KVM_PMU_EVENT_ALLOW 0 + #define KVM_PMU_EVENT_DENY 1 + ++/* for KVM_{GET,SET,HAS}_DEVICE_ATTR */ ++#define KVM_VCPU_TSC_CTRL 0 /* control group for the timestamp counter (TSC) */ ++#define KVM_VCPU_TSC_OFFSET 0 /* attribute for the TSC offset */ ++ + #endif /* _ASM_X86_KVM_H */ +diff --git a/linux-headers/asm-x86/unistd_32.h b/linux-headers/asm-x86/unistd_32.h +index 66e96c0c68..9c9ffe312b 100644 +--- a/linux-headers/asm-x86/unistd_32.h ++++ b/linux-headers/asm-x86/unistd_32.h +@@ -437,6 +437,9 @@ + #define __NR_landlock_create_ruleset 444 + #define __NR_landlock_add_rule 445 + #define __NR_landlock_restrict_self 446 ++#define __NR_memfd_secret 447 ++#define __NR_process_mrelease 448 ++#define __NR_futex_waitv 449 + + + #endif /* _ASM_UNISTD_32_H */ +diff --git a/linux-headers/asm-x86/unistd_64.h b/linux-headers/asm-x86/unistd_64.h +index b8ff6f14ee..084f1eef9c 100644 +--- a/linux-headers/asm-x86/unistd_64.h ++++ b/linux-headers/asm-x86/unistd_64.h +@@ -359,6 +359,9 @@ + #define __NR_landlock_create_ruleset 444 + #define __NR_landlock_add_rule 445 + #define __NR_landlock_restrict_self 446 ++#define __NR_memfd_secret 447 ++#define __NR_process_mrelease 448 ++#define __NR_futex_waitv 449 + + + #endif /* _ASM_UNISTD_64_H */ +diff --git a/linux-headers/asm-x86/unistd_x32.h b/linux-headers/asm-x86/unistd_x32.h +index 06a1097c15..a2441affc2 100644 +--- a/linux-headers/asm-x86/unistd_x32.h ++++ b/linux-headers/asm-x86/unistd_x32.h +@@ -312,6 +312,9 @@ + #define __NR_landlock_create_ruleset (__X32_SYSCALL_BIT + 444) + #define __NR_landlock_add_rule (__X32_SYSCALL_BIT + 445) + #define __NR_landlock_restrict_self (__X32_SYSCALL_BIT + 446) ++#define __NR_memfd_secret (__X32_SYSCALL_BIT + 447) ++#define __NR_process_mrelease (__X32_SYSCALL_BIT + 448) ++#define __NR_futex_waitv (__X32_SYSCALL_BIT + 449) + #define __NR_rt_sigaction (__X32_SYSCALL_BIT + 512) + #define __NR_rt_sigreturn (__X32_SYSCALL_BIT + 513) + #define __NR_ioctl (__X32_SYSCALL_BIT + 514) +diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h +index bcaf66cc4d..02c5e7b7bb 100644 +--- a/linux-headers/linux/kvm.h ++++ b/linux-headers/linux/kvm.h +@@ -269,6 +269,7 @@ struct kvm_xen_exit { + #define KVM_EXIT_AP_RESET_HOLD 32 + #define KVM_EXIT_X86_BUS_LOCK 33 + #define KVM_EXIT_XEN 34 ++#define KVM_EXIT_RISCV_SBI 35 + + /* For KVM_EXIT_INTERNAL_ERROR */ + /* Emulate instruction failed. */ +@@ -397,13 +398,23 @@ struct kvm_run { + * "ndata" is correct, that new fields are enumerated in "flags", + * and that each flag enumerates fields that are 64-bit aligned + * and sized (so that ndata+internal.data[] is valid/accurate). ++ * ++ * Space beyond the defined fields may be used to store arbitrary ++ * debug information relating to the emulation failure. It is ++ * accounted for in "ndata" but the format is unspecified and is ++ * not represented in "flags". Any such information is *not* ABI! + */ + struct { + __u32 suberror; + __u32 ndata; + __u64 flags; +- __u8 insn_size; +- __u8 insn_bytes[15]; ++ union { ++ struct { ++ __u8 insn_size; ++ __u8 insn_bytes[15]; ++ }; ++ }; ++ /* Arbitrary debug data may follow. */ + } emulation_failure; + /* KVM_EXIT_OSI */ + struct { +@@ -469,6 +480,13 @@ struct kvm_run { + } msr; + /* KVM_EXIT_XEN */ + struct kvm_xen_exit xen; ++ /* KVM_EXIT_RISCV_SBI */ ++ struct { ++ unsigned long extension_id; ++ unsigned long function_id; ++ unsigned long args[6]; ++ unsigned long ret[2]; ++ } riscv_sbi; + /* Fix the size of the union. */ + char padding[256]; + }; +@@ -1223,11 +1241,16 @@ struct kvm_irqfd { + + /* Do not use 1, KVM_CHECK_EXTENSION returned it before we had flags. */ + #define KVM_CLOCK_TSC_STABLE 2 ++#define KVM_CLOCK_REALTIME (1 << 2) ++#define KVM_CLOCK_HOST_TSC (1 << 3) + + struct kvm_clock_data { + __u64 clock; + __u32 flags; +- __u32 pad[9]; ++ __u32 pad0; ++ __u64 realtime; ++ __u64 host_tsc; ++ __u32 pad[4]; + }; + + /* For KVM_CAP_SW_TLB */ +@@ -1965,7 +1988,9 @@ struct kvm_stats_header { + #define KVM_STATS_TYPE_CUMULATIVE (0x0 << KVM_STATS_TYPE_SHIFT) + #define KVM_STATS_TYPE_INSTANT (0x1 << KVM_STATS_TYPE_SHIFT) + #define KVM_STATS_TYPE_PEAK (0x2 << KVM_STATS_TYPE_SHIFT) +-#define KVM_STATS_TYPE_MAX KVM_STATS_TYPE_PEAK ++#define KVM_STATS_TYPE_LINEAR_HIST (0x3 << KVM_STATS_TYPE_SHIFT) ++#define KVM_STATS_TYPE_LOG_HIST (0x4 << KVM_STATS_TYPE_SHIFT) ++#define KVM_STATS_TYPE_MAX KVM_STATS_TYPE_LOG_HIST + + #define KVM_STATS_UNIT_SHIFT 4 + #define KVM_STATS_UNIT_MASK (0xF << KVM_STATS_UNIT_SHIFT) +@@ -1988,8 +2013,9 @@ struct kvm_stats_header { + * @size: The number of data items for this stats. + * Every data item is of type __u64. + * @offset: The offset of the stats to the start of stat structure in +- * struture kvm or kvm_vcpu. +- * @unused: Unused field for future usage. Always 0 for now. ++ * structure kvm or kvm_vcpu. ++ * @bucket_size: A parameter value used for histogram stats. It is only used ++ * for linear histogram stats, specifying the size of the bucket; + * @name: The name string for the stats. Its size is indicated by the + * &kvm_stats_header->name_size. + */ +@@ -1998,7 +2024,7 @@ struct kvm_stats_desc { + __s16 exponent; + __u16 size; + __u32 offset; +- __u32 unused; ++ __u32 bucket_size; + char name[]; + }; + +-- +2.35.3 + diff --git a/SOURCES/kvm-migration-Add-migrate_use_tls-helper.patch b/SOURCES/kvm-migration-Add-migrate_use_tls-helper.patch new file mode 100644 index 0000000..8fdfe68 --- /dev/null +++ b/SOURCES/kvm-migration-Add-migrate_use_tls-helper.patch @@ -0,0 +1,106 @@ +From a7c6bc008fe006f005d5c15d3f883572ad5defc5 Mon Sep 17 00:00:00 2001 +From: Leonardo Bras +Date: Wed, 18 May 2022 02:52:25 -0300 +Subject: [PATCH 20/37] migration: Add migrate_use_tls() helper +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Leonardo Brás +RH-MergeRequest: 191: MSG_ZEROCOPY + Multifd @ rhel8.7 +RH-Commit: [20/26] 02afc2e60f1abbf6db45d83e54a18b66dad52426 +RH-Bugzilla: 2072049 +RH-Acked-by: Peter Xu +RH-Acked-by: Daniel P. Berrangé +RH-Acked-by: Dr. David Alan Gilbert + +A lot of places check parameters.tls_creds in order to evaluate if TLS is +in use, and sometimes call migrate_get_current() just for that test. + +Add new helper function migrate_use_tls() in order to simplify testing +for TLS usage. + +Signed-off-by: Leonardo Bras +Reviewed-by: Juan Quintela +Reviewed-by: Peter Xu +Reviewed-by: Daniel P. Berrangé +Message-Id: <20220513062836.965425-6-leobras@redhat.com> +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit d2fafb6a6814a8998607d0baf691265032996a0f) +Signed-off-by: Leonardo Bras +--- + migration/channel.c | 3 +-- + migration/migration.c | 9 +++++++++ + migration/migration.h | 1 + + migration/multifd.c | 5 +---- + 4 files changed, 12 insertions(+), 6 deletions(-) + +diff --git a/migration/channel.c b/migration/channel.c +index c4fc000a1a..086b5c0d8b 100644 +--- a/migration/channel.c ++++ b/migration/channel.c +@@ -38,8 +38,7 @@ void migration_channel_process_incoming(QIOChannel *ioc) + trace_migration_set_incoming_channel( + ioc, object_get_typename(OBJECT(ioc))); + +- if (s->parameters.tls_creds && +- *s->parameters.tls_creds && ++ if (migrate_use_tls() && + !object_dynamic_cast(OBJECT(ioc), + TYPE_QIO_CHANNEL_TLS)) { + migration_tls_channel_process_incoming(s, ioc, &local_err); +diff --git a/migration/migration.c b/migration/migration.c +index b0fc3f68bd..8e28f2ee41 100644 +--- a/migration/migration.c ++++ b/migration/migration.c +@@ -2568,6 +2568,15 @@ bool migrate_use_zero_copy_send(void) + } + #endif + ++int migrate_use_tls(void) ++{ ++ MigrationState *s; ++ ++ s = migrate_get_current(); ++ ++ return s->parameters.tls_creds && *s->parameters.tls_creds; ++} ++ + int migrate_use_xbzrle(void) + { + MigrationState *s; +diff --git a/migration/migration.h b/migration/migration.h +index 908098939f..9396b7e90a 100644 +--- a/migration/migration.h ++++ b/migration/migration.h +@@ -344,6 +344,7 @@ bool migrate_use_zero_copy_send(void); + #else + #define migrate_use_zero_copy_send() (false) + #endif ++int migrate_use_tls(void); + int migrate_use_xbzrle(void); + uint64_t migrate_xbzrle_cache_size(void); + bool migrate_colo_enabled(void); +diff --git a/migration/multifd.c b/migration/multifd.c +index 3725226400..e53811f04a 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -789,14 +789,11 @@ static bool multifd_channel_connect(MultiFDSendParams *p, + QIOChannel *ioc, + Error *error) + { +- MigrationState *s = migrate_get_current(); +- + trace_multifd_set_outgoing_channel( + ioc, object_get_typename(OBJECT(ioc)), p->tls_hostname, error); + + if (!error) { +- if (s->parameters.tls_creds && +- *s->parameters.tls_creds && ++ if (migrate_use_tls() && + !object_dynamic_cast(OBJECT(ioc), + TYPE_QIO_CHANNEL_TLS)) { + multifd_tls_channel_connect(p, ioc, &error); +-- +2.35.3 + diff --git a/SOURCES/kvm-migration-Add-migration_incoming_transport_cleanup.patch b/SOURCES/kvm-migration-Add-migration_incoming_transport_cleanup.patch new file mode 100644 index 0000000..985bbe2 --- /dev/null +++ b/SOURCES/kvm-migration-Add-migration_incoming_transport_cleanup.patch @@ -0,0 +1,102 @@ +From 02eab793d82cd3c82d31f1e1f34d16fcc30caf0e Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Tue, 1 Mar 2022 16:39:14 +0800 +Subject: [PATCH 27/37] migration: Add migration_incoming_transport_cleanup() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Peter Xu +RH-MergeRequest: 195: migration: Allow migrate-recover to run multiple times +RH-Commit: [1/2] 57b2a9a165ee7cb2d01519bd54eb8dc4185815e0 +RH-Bugzilla: 2097652 +RH-Acked-by: Leonardo Brás +RH-Acked-by: Dr. David Alan Gilbert +RH-Acked-by: Hanna Reitz + +Add a helper to cleanup the transport listener. + +When do it, we should also null-ify the cleanup hook and the data, then it's +even safe to call it multiple times. + +Move the socket_address_list cleanup altogether, because that's a mirror of the +listener channels and only for the purpose of query-migrate. Hence when +someone wants to cleanup the listener transport, it should also want to cleanup +the socket list too, always. + +No functional change intended. + +Reviewed-by: Dr. David Alan Gilbert +Signed-off-by: Peter Xu +Message-Id: <20220301083925.33483-15-peterx@redhat.com> +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit e031149c78489413038e934eec9f54ac699cf322) +Signed-off-by: Peter Xu +--- + migration/migration.c | 22 ++++++++++++++-------- + migration/migration.h | 1 + + 2 files changed, 15 insertions(+), 8 deletions(-) + +diff --git a/migration/migration.c b/migration/migration.c +index c8aa55d2fe..b787a36789 100644 +--- a/migration/migration.c ++++ b/migration/migration.c +@@ -263,6 +263,19 @@ MigrationIncomingState *migration_incoming_get_current(void) + return current_incoming; + } + ++void migration_incoming_transport_cleanup(MigrationIncomingState *mis) ++{ ++ if (mis->socket_address_list) { ++ qapi_free_SocketAddressList(mis->socket_address_list); ++ mis->socket_address_list = NULL; ++ } ++ ++ if (mis->transport_cleanup) { ++ mis->transport_cleanup(mis->transport_data); ++ mis->transport_data = mis->transport_cleanup = NULL; ++ } ++} ++ + void migration_incoming_state_destroy(void) + { + struct MigrationIncomingState *mis = migration_incoming_get_current(); +@@ -283,10 +296,8 @@ void migration_incoming_state_destroy(void) + g_array_free(mis->postcopy_remote_fds, TRUE); + mis->postcopy_remote_fds = NULL; + } +- if (mis->transport_cleanup) { +- mis->transport_cleanup(mis->transport_data); +- } + ++ migration_incoming_transport_cleanup(mis); + qemu_event_reset(&mis->main_thread_load_event); + + if (mis->page_requested) { +@@ -294,11 +305,6 @@ void migration_incoming_state_destroy(void) + mis->page_requested = NULL; + } + +- if (mis->socket_address_list) { +- qapi_free_SocketAddressList(mis->socket_address_list); +- mis->socket_address_list = NULL; +- } +- + yank_unregister_instance(MIGRATION_YANK_INSTANCE); + } + +diff --git a/migration/migration.h b/migration/migration.h +index 9396b7e90a..243898e3be 100644 +--- a/migration/migration.h ++++ b/migration/migration.h +@@ -130,6 +130,7 @@ struct MigrationIncomingState { + + MigrationIncomingState *migration_incoming_get_current(void); + void migration_incoming_state_destroy(void); ++void migration_incoming_transport_cleanup(MigrationIncomingState *mis); + /* + * Functions to work with blocktime context + */ +-- +2.35.3 + diff --git a/SOURCES/kvm-migration-Add-zero-copy-send-parameter-for-QMP-HMP-f.patch b/SOURCES/kvm-migration-Add-zero-copy-send-parameter-for-QMP-HMP-f.patch new file mode 100644 index 0000000..63e67c6 --- /dev/null +++ b/SOURCES/kvm-migration-Add-zero-copy-send-parameter-for-QMP-HMP-f.patch @@ -0,0 +1,250 @@ +From 2a84bf822cae38f67458043cd379a22e0fd22485 Mon Sep 17 00:00:00 2001 +From: Leonardo Bras +Date: Wed, 18 May 2022 02:52:25 -0300 +Subject: [PATCH 19/37] migration: Add zero-copy-send parameter for QMP/HMP for + Linux +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Leonardo Brás +RH-MergeRequest: 191: MSG_ZEROCOPY + Multifd @ rhel8.7 +RH-Commit: [19/26] 44ec703088cad75fd6e504958527e81d3261c9df +RH-Bugzilla: 2072049 +RH-Acked-by: Peter Xu +RH-Acked-by: Daniel P. Berrangé +RH-Acked-by: Dr. David Alan Gilbert + +Add property that allows zero-copy migration of memory pages +on the sending side, and also includes a helper function +migrate_use_zero_copy_send() to check if it's enabled. + +No code is introduced to actually do the migration, but it allow +future implementations to enable/disable this feature. + +On non-Linux builds this parameter is compiled-out. + +Signed-off-by: Leonardo Bras +Reviewed-by: Peter Xu +Reviewed-by: Daniel P. Berrangé +Reviewed-by: Juan Quintela +Acked-by: Markus Armbruster +Message-Id: <20220513062836.965425-5-leobras@redhat.com> +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit abb6295b3ace5d17c3a65936913fc346616dbf14) +Signed-off-by: Leonardo Bras +--- + migration/migration.c | 32 ++++++++++++++++++++++++++++++++ + migration/migration.h | 5 +++++ + migration/socket.c | 11 +++++++++-- + monitor/hmp-cmds.c | 6 ++++++ + qapi/migration.json | 24 ++++++++++++++++++++++++ + 5 files changed, 76 insertions(+), 2 deletions(-) + +diff --git a/migration/migration.c b/migration/migration.c +index 8a13294da6..b0fc3f68bd 100644 +--- a/migration/migration.c ++++ b/migration/migration.c +@@ -888,6 +888,10 @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp) + params->multifd_zlib_level = s->parameters.multifd_zlib_level; + params->has_multifd_zstd_level = true; + params->multifd_zstd_level = s->parameters.multifd_zstd_level; ++#ifdef CONFIG_LINUX ++ params->has_zero_copy_send = true; ++ params->zero_copy_send = s->parameters.zero_copy_send; ++#endif + params->has_xbzrle_cache_size = true; + params->xbzrle_cache_size = s->parameters.xbzrle_cache_size; + params->has_max_postcopy_bandwidth = true; +@@ -1541,6 +1545,11 @@ static void migrate_params_test_apply(MigrateSetParameters *params, + if (params->has_multifd_compression) { + dest->multifd_compression = params->multifd_compression; + } ++#ifdef CONFIG_LINUX ++ if (params->has_zero_copy_send) { ++ dest->zero_copy_send = params->zero_copy_send; ++ } ++#endif + if (params->has_xbzrle_cache_size) { + dest->xbzrle_cache_size = params->xbzrle_cache_size; + } +@@ -1653,6 +1662,11 @@ static void migrate_params_apply(MigrateSetParameters *params, Error **errp) + if (params->has_multifd_compression) { + s->parameters.multifd_compression = params->multifd_compression; + } ++#ifdef CONFIG_LINUX ++ if (params->has_zero_copy_send) { ++ s->parameters.zero_copy_send = params->zero_copy_send; ++ } ++#endif + if (params->has_xbzrle_cache_size) { + s->parameters.xbzrle_cache_size = params->xbzrle_cache_size; + xbzrle_cache_resize(params->xbzrle_cache_size, errp); +@@ -2543,6 +2557,17 @@ int migrate_multifd_zstd_level(void) + return s->parameters.multifd_zstd_level; + } + ++#ifdef CONFIG_LINUX ++bool migrate_use_zero_copy_send(void) ++{ ++ MigrationState *s; ++ ++ s = migrate_get_current(); ++ ++ return s->parameters.zero_copy_send; ++} ++#endif ++ + int migrate_use_xbzrle(void) + { + MigrationState *s; +@@ -4193,6 +4218,10 @@ static Property migration_properties[] = { + DEFINE_PROP_UINT8("multifd-zstd-level", MigrationState, + parameters.multifd_zstd_level, + DEFAULT_MIGRATE_MULTIFD_ZSTD_LEVEL), ++#ifdef CONFIG_LINUX ++ DEFINE_PROP_BOOL("zero_copy_send", MigrationState, ++ parameters.zero_copy_send, false), ++#endif + DEFINE_PROP_SIZE("xbzrle-cache-size", MigrationState, + parameters.xbzrle_cache_size, + DEFAULT_MIGRATE_XBZRLE_CACHE_SIZE), +@@ -4290,6 +4319,9 @@ static void migration_instance_init(Object *obj) + params->has_multifd_compression = true; + params->has_multifd_zlib_level = true; + params->has_multifd_zstd_level = true; ++#ifdef CONFIG_LINUX ++ params->has_zero_copy_send = true; ++#endif + params->has_xbzrle_cache_size = true; + params->has_max_postcopy_bandwidth = true; + params->has_max_cpu_throttle = true; +diff --git a/migration/migration.h b/migration/migration.h +index d016cedd9d..908098939f 100644 +--- a/migration/migration.h ++++ b/migration/migration.h +@@ -339,6 +339,11 @@ MultiFDCompression migrate_multifd_compression(void); + int migrate_multifd_zlib_level(void); + int migrate_multifd_zstd_level(void); + ++#ifdef CONFIG_LINUX ++bool migrate_use_zero_copy_send(void); ++#else ++#define migrate_use_zero_copy_send() (false) ++#endif + int migrate_use_xbzrle(void); + uint64_t migrate_xbzrle_cache_size(void); + bool migrate_colo_enabled(void); +diff --git a/migration/socket.c b/migration/socket.c +index 05705a32d8..3754d8f72c 100644 +--- a/migration/socket.c ++++ b/migration/socket.c +@@ -74,9 +74,16 @@ static void socket_outgoing_migration(QIOTask *task, + + if (qio_task_propagate_error(task, &err)) { + trace_migration_socket_outgoing_error(error_get_pretty(err)); +- } else { +- trace_migration_socket_outgoing_connected(data->hostname); ++ goto out; + } ++ ++ trace_migration_socket_outgoing_connected(data->hostname); ++ ++ if (migrate_use_zero_copy_send()) { ++ error_setg(&err, "Zero copy send not available in migration"); ++ } ++ ++out: + migration_channel_connect(data->s, sioc, data->hostname, err); + object_unref(OBJECT(sioc)); + } +diff --git a/monitor/hmp-cmds.c b/monitor/hmp-cmds.c +index 2669156b28..e02da5008b 100644 +--- a/monitor/hmp-cmds.c ++++ b/monitor/hmp-cmds.c +@@ -1297,6 +1297,12 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) + p->has_multifd_zstd_level = true; + visit_type_uint8(v, param, &p->multifd_zstd_level, &err); + break; ++#ifdef CONFIG_LINUX ++ case MIGRATION_PARAMETER_ZERO_COPY_SEND: ++ p->has_zero_copy_send = true; ++ visit_type_bool(v, param, &p->zero_copy_send, &err); ++ break; ++#endif + case MIGRATION_PARAMETER_XBZRLE_CACHE_SIZE: + p->has_xbzrle_cache_size = true; + if (!visit_type_size(v, param, &cache_size, &err)) { +diff --git a/qapi/migration.json b/qapi/migration.json +index bbfd48cf0b..59b5c5780b 100644 +--- a/qapi/migration.json ++++ b/qapi/migration.json +@@ -730,6 +730,13 @@ + # will consume more CPU. + # Defaults to 1. (Since 5.0) + # ++# @zero-copy-send: Controls behavior on sending memory pages on migration. ++# When true, enables a zero-copy mechanism for sending ++# memory pages, if host supports it. ++# Requires that QEMU be permitted to use locked memory ++# for guest RAM pages. ++# Defaults to false. (Since 7.1) ++# + # @block-bitmap-mapping: Maps block nodes and bitmaps on them to + # aliases for the purpose of dirty bitmap migration. Such + # aliases may for example be the corresponding names on the +@@ -769,6 +776,7 @@ + 'xbzrle-cache-size', 'max-postcopy-bandwidth', + 'max-cpu-throttle', 'multifd-compression', + 'multifd-zlib-level' ,'multifd-zstd-level', ++ { 'name': 'zero-copy-send', 'if' : 'CONFIG_LINUX'}, + 'block-bitmap-mapping' ] } + + ## +@@ -895,6 +903,13 @@ + # will consume more CPU. + # Defaults to 1. (Since 5.0) + # ++# @zero-copy-send: Controls behavior on sending memory pages on migration. ++# When true, enables a zero-copy mechanism for sending ++# memory pages, if host supports it. ++# Requires that QEMU be permitted to use locked memory ++# for guest RAM pages. ++# Defaults to false. (Since 7.1) ++# + # @block-bitmap-mapping: Maps block nodes and bitmaps on them to + # aliases for the purpose of dirty bitmap migration. Such + # aliases may for example be the corresponding names on the +@@ -949,6 +964,7 @@ + '*multifd-compression': 'MultiFDCompression', + '*multifd-zlib-level': 'uint8', + '*multifd-zstd-level': 'uint8', ++ '*zero-copy-send': { 'type': 'bool', 'if': 'CONFIG_LINUX' }, + '*block-bitmap-mapping': [ 'BitmapMigrationNodeAlias' ] } } + + ## +@@ -1095,6 +1111,13 @@ + # will consume more CPU. + # Defaults to 1. (Since 5.0) + # ++# @zero-copy-send: Controls behavior on sending memory pages on migration. ++# When true, enables a zero-copy mechanism for sending ++# memory pages, if host supports it. ++# Requires that QEMU be permitted to use locked memory ++# for guest RAM pages. ++# Defaults to false. (Since 7.1) ++# + # @block-bitmap-mapping: Maps block nodes and bitmaps on them to + # aliases for the purpose of dirty bitmap migration. Such + # aliases may for example be the corresponding names on the +@@ -1147,6 +1170,7 @@ + '*multifd-compression': 'MultiFDCompression', + '*multifd-zlib-level': 'uint8', + '*multifd-zstd-level': 'uint8', ++ '*zero-copy-send': { 'type': 'bool', 'if': 'CONFIG_LINUX' }, + '*block-bitmap-mapping': [ 'BitmapMigrationNodeAlias' ] } } + + ## +-- +2.35.3 + diff --git a/SOURCES/kvm-migration-All-this-fields-are-unsigned.patch b/SOURCES/kvm-migration-All-this-fields-are-unsigned.patch new file mode 100644 index 0000000..245e2b4 --- /dev/null +++ b/SOURCES/kvm-migration-All-this-fields-are-unsigned.patch @@ -0,0 +1,329 @@ +From b21f18afceba8231c78d29e66f58516e12c28d22 Mon Sep 17 00:00:00 2001 +From: Juan Quintela +Date: Wed, 18 May 2022 02:52:23 -0300 +Subject: [PATCH 10/37] migration: All this fields are unsigned +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Leonardo Brás +RH-MergeRequest: 191: MSG_ZEROCOPY + Multifd @ rhel8.7 +RH-Commit: [10/26] 2c3ee27aae334db3b283ab7ef580f58e396e569d +RH-Bugzilla: 2072049 +RH-Acked-by: Peter Xu +RH-Acked-by: Daniel P. Berrangé +RH-Acked-by: Dr. David Alan Gilbert + +So printing it as %d is wrong. Notice that for the channel id, that +is an uint8_t, but I changed it anyways for consistency. + +Signed-off-by: Juan Quintela +Reviewed-by: Philippe Mathieu-Daudé +Reviewed-by: Peter Xu +(cherry picked from commit 04e114049406dbb69fc9043c795ddd28fdba31a6) +Signed-off-by: Leonardo Bras +--- + migration/multifd-zlib.c | 20 ++++++++++---------- + migration/multifd-zstd.c | 24 ++++++++++++------------ + migration/multifd.c | 16 ++++++++-------- + migration/trace-events | 26 +++++++++++++------------- + 4 files changed, 43 insertions(+), 43 deletions(-) + +diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c +index a1950a4588..a987e4a26c 100644 +--- a/migration/multifd-zlib.c ++++ b/migration/multifd-zlib.c +@@ -52,7 +52,7 @@ static int zlib_send_setup(MultiFDSendParams *p, Error **errp) + zs->opaque = Z_NULL; + if (deflateInit(zs, migrate_multifd_zlib_level()) != Z_OK) { + g_free(z); +- error_setg(errp, "multifd %d: deflate init failed", p->id); ++ error_setg(errp, "multifd %u: deflate init failed", p->id); + return -1; + } + /* We will never have more than page_count pages */ +@@ -62,7 +62,7 @@ static int zlib_send_setup(MultiFDSendParams *p, Error **errp) + if (!z->zbuff) { + deflateEnd(&z->zs); + g_free(z); +- error_setg(errp, "multifd %d: out of memory for zbuff", p->id); ++ error_setg(errp, "multifd %u: out of memory for zbuff", p->id); + return -1; + } + p->data = z; +@@ -134,12 +134,12 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp) + ret = deflate(zs, flush); + } while (ret == Z_OK && zs->avail_in && zs->avail_out); + if (ret == Z_OK && zs->avail_in) { +- error_setg(errp, "multifd %d: deflate failed to compress all input", ++ error_setg(errp, "multifd %u: deflate failed to compress all input", + p->id); + return -1; + } + if (ret != Z_OK) { +- error_setg(errp, "multifd %d: deflate returned %d instead of Z_OK", ++ error_setg(errp, "multifd %u: deflate returned %d instead of Z_OK", + p->id, ret); + return -1; + } +@@ -193,7 +193,7 @@ static int zlib_recv_setup(MultiFDRecvParams *p, Error **errp) + zs->avail_in = 0; + zs->next_in = Z_NULL; + if (inflateInit(zs) != Z_OK) { +- error_setg(errp, "multifd %d: inflate init failed", p->id); ++ error_setg(errp, "multifd %u: inflate init failed", p->id); + return -1; + } + /* We will never have more than page_count pages */ +@@ -203,7 +203,7 @@ static int zlib_recv_setup(MultiFDRecvParams *p, Error **errp) + z->zbuff = g_try_malloc(z->zbuff_len); + if (!z->zbuff) { + inflateEnd(zs); +- error_setg(errp, "multifd %d: out of memory for zbuff", p->id); ++ error_setg(errp, "multifd %u: out of memory for zbuff", p->id); + return -1; + } + return 0; +@@ -252,7 +252,7 @@ static int zlib_recv_pages(MultiFDRecvParams *p, Error **errp) + int i; + + if (flags != MULTIFD_FLAG_ZLIB) { +- error_setg(errp, "multifd %d: flags received %x flags expected %x", ++ error_setg(errp, "multifd %u: flags received %x flags expected %x", + p->id, flags, MULTIFD_FLAG_ZLIB); + return -1; + } +@@ -289,19 +289,19 @@ static int zlib_recv_pages(MultiFDRecvParams *p, Error **errp) + } while (ret == Z_OK && zs->avail_in + && (zs->total_out - start) < page_size); + if (ret == Z_OK && (zs->total_out - start) < page_size) { +- error_setg(errp, "multifd %d: inflate generated too few output", ++ error_setg(errp, "multifd %u: inflate generated too few output", + p->id); + return -1; + } + if (ret != Z_OK) { +- error_setg(errp, "multifd %d: inflate returned %d instead of Z_OK", ++ error_setg(errp, "multifd %u: inflate returned %d instead of Z_OK", + p->id, ret); + return -1; + } + } + out_size = zs->total_out - out_size; + if (out_size != expected_size) { +- error_setg(errp, "multifd %d: packet size received %d size expected %d", ++ error_setg(errp, "multifd %u: packet size received %u size expected %u", + p->id, out_size, expected_size); + return -1; + } +diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c +index d9ed42622b..2185a83eac 100644 +--- a/migration/multifd-zstd.c ++++ b/migration/multifd-zstd.c +@@ -56,7 +56,7 @@ static int zstd_send_setup(MultiFDSendParams *p, Error **errp) + z->zcs = ZSTD_createCStream(); + if (!z->zcs) { + g_free(z); +- error_setg(errp, "multifd %d: zstd createCStream failed", p->id); ++ error_setg(errp, "multifd %u: zstd createCStream failed", p->id); + return -1; + } + +@@ -64,7 +64,7 @@ static int zstd_send_setup(MultiFDSendParams *p, Error **errp) + if (ZSTD_isError(res)) { + ZSTD_freeCStream(z->zcs); + g_free(z); +- error_setg(errp, "multifd %d: initCStream failed with error %s", ++ error_setg(errp, "multifd %u: initCStream failed with error %s", + p->id, ZSTD_getErrorName(res)); + return -1; + } +@@ -75,7 +75,7 @@ static int zstd_send_setup(MultiFDSendParams *p, Error **errp) + if (!z->zbuff) { + ZSTD_freeCStream(z->zcs); + g_free(z); +- error_setg(errp, "multifd %d: out of memory for zbuff", p->id); ++ error_setg(errp, "multifd %u: out of memory for zbuff", p->id); + return -1; + } + return 0; +@@ -146,12 +146,12 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) + } while (ret > 0 && (z->in.size - z->in.pos > 0) + && (z->out.size - z->out.pos > 0)); + if (ret > 0 && (z->in.size - z->in.pos > 0)) { +- error_setg(errp, "multifd %d: compressStream buffer too small", ++ error_setg(errp, "multifd %u: compressStream buffer too small", + p->id); + return -1; + } + if (ZSTD_isError(ret)) { +- error_setg(errp, "multifd %d: compressStream error %s", ++ error_setg(errp, "multifd %u: compressStream error %s", + p->id, ZSTD_getErrorName(ret)); + return -1; + } +@@ -201,7 +201,7 @@ static int zstd_recv_setup(MultiFDRecvParams *p, Error **errp) + z->zds = ZSTD_createDStream(); + if (!z->zds) { + g_free(z); +- error_setg(errp, "multifd %d: zstd createDStream failed", p->id); ++ error_setg(errp, "multifd %u: zstd createDStream failed", p->id); + return -1; + } + +@@ -209,7 +209,7 @@ static int zstd_recv_setup(MultiFDRecvParams *p, Error **errp) + if (ZSTD_isError(ret)) { + ZSTD_freeDStream(z->zds); + g_free(z); +- error_setg(errp, "multifd %d: initDStream failed with error %s", ++ error_setg(errp, "multifd %u: initDStream failed with error %s", + p->id, ZSTD_getErrorName(ret)); + return -1; + } +@@ -222,7 +222,7 @@ static int zstd_recv_setup(MultiFDRecvParams *p, Error **errp) + if (!z->zbuff) { + ZSTD_freeDStream(z->zds); + g_free(z); +- error_setg(errp, "multifd %d: out of memory for zbuff", p->id); ++ error_setg(errp, "multifd %u: out of memory for zbuff", p->id); + return -1; + } + return 0; +@@ -270,7 +270,7 @@ static int zstd_recv_pages(MultiFDRecvParams *p, Error **errp) + int i; + + if (flags != MULTIFD_FLAG_ZSTD) { +- error_setg(errp, "multifd %d: flags received %x flags expected %x", ++ error_setg(errp, "multifd %u: flags received %x flags expected %x", + p->id, flags, MULTIFD_FLAG_ZSTD); + return -1; + } +@@ -302,19 +302,19 @@ static int zstd_recv_pages(MultiFDRecvParams *p, Error **errp) + } while (ret > 0 && (z->in.size - z->in.pos > 0) + && (z->out.pos < page_size)); + if (ret > 0 && (z->out.pos < page_size)) { +- error_setg(errp, "multifd %d: decompressStream buffer too small", ++ error_setg(errp, "multifd %u: decompressStream buffer too small", + p->id); + return -1; + } + if (ZSTD_isError(ret)) { +- error_setg(errp, "multifd %d: decompressStream returned %s", ++ error_setg(errp, "multifd %u: decompressStream returned %s", + p->id, ZSTD_getErrorName(ret)); + return ret; + } + out_size += z->out.pos; + } + if (out_size != expected_size) { +- error_setg(errp, "multifd %d: packet size received %d size expected %d", ++ error_setg(errp, "multifd %u: packet size received %u size expected %u", + p->id, out_size, expected_size); + return -1; + } +diff --git a/migration/multifd.c b/migration/multifd.c +index 0533da154a..d0d19470f9 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -148,7 +148,7 @@ static int nocomp_recv_pages(MultiFDRecvParams *p, Error **errp) + uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; + + if (flags != MULTIFD_FLAG_NOCOMP) { +- error_setg(errp, "multifd %d: flags received %x flags expected %x", ++ error_setg(errp, "multifd %u: flags received %x flags expected %x", + p->id, flags, MULTIFD_FLAG_NOCOMP); + return -1; + } +@@ -212,8 +212,8 @@ static int multifd_recv_initial_packet(QIOChannel *c, Error **errp) + } + + if (msg.version != MULTIFD_VERSION) { +- error_setg(errp, "multifd: received packet version %d " +- "expected %d", msg.version, MULTIFD_VERSION); ++ error_setg(errp, "multifd: received packet version %u " ++ "expected %u", msg.version, MULTIFD_VERSION); + return -1; + } + +@@ -229,8 +229,8 @@ static int multifd_recv_initial_packet(QIOChannel *c, Error **errp) + } + + if (msg.id > migrate_multifd_channels()) { +- error_setg(errp, "multifd: received channel version %d " +- "expected %d", msg.version, MULTIFD_VERSION); ++ error_setg(errp, "multifd: received channel version %u " ++ "expected %u", msg.version, MULTIFD_VERSION); + return -1; + } + +@@ -303,7 +303,7 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) + packet->version = be32_to_cpu(packet->version); + if (packet->version != MULTIFD_VERSION) { + error_setg(errp, "multifd: received packet " +- "version %d and expected version %d", ++ "version %u and expected version %u", + packet->version, MULTIFD_VERSION); + return -1; + } +@@ -317,7 +317,7 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) + */ + if (packet->pages_alloc > pages_max * 100) { + error_setg(errp, "multifd: received packet " +- "with size %d and expected a maximum size of %d", ++ "with size %u and expected a maximum size of %u", + packet->pages_alloc, pages_max * 100) ; + return -1; + } +@@ -333,7 +333,7 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) + p->pages->num = be32_to_cpu(packet->pages_used); + if (p->pages->num > packet->pages_alloc) { + error_setg(errp, "multifd: received packet " +- "with %d pages and expected maximum pages are %d", ++ "with %u pages and expected maximum pages are %u", + p->pages->num, packet->pages_alloc) ; + return -1; + } +diff --git a/migration/trace-events b/migration/trace-events +index b48d873b8a..5172cb3b3d 100644 +--- a/migration/trace-events ++++ b/migration/trace-events +@@ -115,23 +115,23 @@ ram_write_tracking_ramblock_start(const char *block_id, size_t page_size, void * + ram_write_tracking_ramblock_stop(const char *block_id, size_t page_size, void *addr, size_t length) "%s: page_size: %zu addr: %p length: %zu" + + # multifd.c +-multifd_new_send_channel_async(uint8_t id) "channel %d" +-multifd_recv(uint8_t id, uint64_t packet_num, uint32_t used, uint32_t flags, uint32_t next_packet_size) "channel %d packet_num %" PRIu64 " pages %d flags 0x%x next packet size %d" +-multifd_recv_new_channel(uint8_t id) "channel %d" ++multifd_new_send_channel_async(uint8_t id) "channel %u" ++multifd_recv(uint8_t id, uint64_t packet_num, uint32_t used, uint32_t flags, uint32_t next_packet_size) "channel %u packet_num %" PRIu64 " pages %u flags 0x%x next packet size %u" ++multifd_recv_new_channel(uint8_t id) "channel %u" + multifd_recv_sync_main(long packet_num) "packet num %ld" +-multifd_recv_sync_main_signal(uint8_t id) "channel %d" +-multifd_recv_sync_main_wait(uint8_t id) "channel %d" ++multifd_recv_sync_main_signal(uint8_t id) "channel %u" ++multifd_recv_sync_main_wait(uint8_t id) "channel %u" + multifd_recv_terminate_threads(bool error) "error %d" +-multifd_recv_thread_end(uint8_t id, uint64_t packets, uint64_t pages) "channel %d packets %" PRIu64 " pages %" PRIu64 +-multifd_recv_thread_start(uint8_t id) "%d" +-multifd_send(uint8_t id, uint64_t packet_num, uint32_t used, uint32_t flags, uint32_t next_packet_size) "channel %d packet_num %" PRIu64 " pages %d flags 0x%x next packet size %d" +-multifd_send_error(uint8_t id) "channel %d" ++multifd_recv_thread_end(uint8_t id, uint64_t packets, uint64_t pages) "channel %u packets %" PRIu64 " pages %" PRIu64 ++multifd_recv_thread_start(uint8_t id) "%u" ++multifd_send(uint8_t id, uint64_t packet_num, uint32_t used, uint32_t flags, uint32_t next_packet_size) "channel %u packet_num %" PRIu64 " pages %u flags 0x%x next packet size %u" ++multifd_send_error(uint8_t id) "channel %u" + multifd_send_sync_main(long packet_num) "packet num %ld" +-multifd_send_sync_main_signal(uint8_t id) "channel %d" +-multifd_send_sync_main_wait(uint8_t id) "channel %d" ++multifd_send_sync_main_signal(uint8_t id) "channel %u" ++multifd_send_sync_main_wait(uint8_t id) "channel %u" + multifd_send_terminate_threads(bool error) "error %d" +-multifd_send_thread_end(uint8_t id, uint64_t packets, uint64_t pages) "channel %d packets %" PRIu64 " pages %" PRIu64 +-multifd_send_thread_start(uint8_t id) "%d" ++multifd_send_thread_end(uint8_t id, uint64_t packets, uint64_t pages) "channel %u packets %" PRIu64 " pages %" PRIu64 ++multifd_send_thread_start(uint8_t id) "%u" + multifd_tls_outgoing_handshake_start(void *ioc, void *tioc, const char *hostname) "ioc=%p tioc=%p hostname=%s" + multifd_tls_outgoing_handshake_error(void *ioc, const char *err) "ioc=%p err=%s" + multifd_tls_outgoing_handshake_complete(void *ioc) "ioc=%p" +-- +2.35.3 + diff --git a/SOURCES/kvm-migration-Allow-migrate-recover-to-run-multiple-time.patch b/SOURCES/kvm-migration-Allow-migrate-recover-to-run-multiple-time.patch new file mode 100644 index 0000000..b4f1e68 --- /dev/null +++ b/SOURCES/kvm-migration-Allow-migrate-recover-to-run-multiple-time.patch @@ -0,0 +1,98 @@ +From f5be3d8a5944679c1239b974e0f910f1afe4f532 Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Thu, 31 Mar 2022 11:08:45 -0400 +Subject: [PATCH 28/37] migration: Allow migrate-recover to run multiple times +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Peter Xu +RH-MergeRequest: 195: migration: Allow migrate-recover to run multiple times +RH-Commit: [2/2] a2e6b02007a06c9c7f5237289095811c7d7ca1f1 +RH-Bugzilla: 2097652 +RH-Acked-by: Leonardo Brás +RH-Acked-by: Dr. David Alan Gilbert +RH-Acked-by: Hanna Reitz + +Previously migration didn't have an easy way to cleanup the listening +transport, migrate recovery only allows to execute once. That's done with a +trick flag in postcopy_recover_triggered. + +Now the facility is already there. + +Drop postcopy_recover_triggered and instead allows a new migrate-recover to +release the previous listener transport. + +Reviewed-by: Dr. David Alan Gilbert +Signed-off-by: Peter Xu +Message-Id: <20220331150857.74406-8-peterx@redhat.com> +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 08401c0426bc1a5ce4609afd1cda5dd39abbf9fa) +Signed-off-by: Peter Xu +--- + migration/migration.c | 13 ++----------- + migration/migration.h | 1 - + migration/savevm.c | 3 --- + 3 files changed, 2 insertions(+), 15 deletions(-) + +diff --git a/migration/migration.c b/migration/migration.c +index b787a36789..616c3ff32e 100644 +--- a/migration/migration.c ++++ b/migration/migration.c +@@ -2158,11 +2158,8 @@ void qmp_migrate_recover(const char *uri, Error **errp) + return; + } + +- if (qatomic_cmpxchg(&mis->postcopy_recover_triggered, +- false, true) == true) { +- error_setg(errp, "Migrate recovery is triggered already"); +- return; +- } ++ /* If there's an existing transport, release it */ ++ migration_incoming_transport_cleanup(mis); + + /* + * Note that this call will never start a real migration; it will +@@ -2170,12 +2167,6 @@ void qmp_migrate_recover(const char *uri, Error **errp) + * to continue using that newly established channel. + */ + qemu_start_incoming_migration(uri, errp); +- +- /* Safe to dereference with the assert above */ +- if (*errp) { +- /* Reset the flag so user could still retry */ +- qatomic_set(&mis->postcopy_recover_triggered, false); +- } + } + + void qmp_migrate_pause(Error **errp) +diff --git a/migration/migration.h b/migration/migration.h +index 243898e3be..0ae2133326 100644 +--- a/migration/migration.h ++++ b/migration/migration.h +@@ -103,7 +103,6 @@ struct MigrationIncomingState { + struct PostcopyBlocktimeContext *blocktime_ctx; + + /* notify PAUSED postcopy incoming migrations to try to continue */ +- bool postcopy_recover_triggered; + QemuSemaphore postcopy_pause_sem_dst; + QemuSemaphore postcopy_pause_sem_fault; + +diff --git a/migration/savevm.c b/migration/savevm.c +index 0bef031acb..b8382aaa64 100644 +--- a/migration/savevm.c ++++ b/migration/savevm.c +@@ -2568,9 +2568,6 @@ static bool postcopy_pause_incoming(MigrationIncomingState *mis) + + assert(migrate_postcopy_ram()); + +- /* Clear the triggered bit to allow one recovery */ +- mis->postcopy_recover_triggered = false; +- + /* + * Unregister yank with either from/to src would work, since ioc behind it + * is the same +-- +2.35.3 + diff --git a/SOURCES/kvm-migration-Avoid-false-positive-on-non-supported-scen.patch b/SOURCES/kvm-migration-Avoid-false-positive-on-non-supported-scen.patch new file mode 100644 index 0000000..f1a7d49 --- /dev/null +++ b/SOURCES/kvm-migration-Avoid-false-positive-on-non-supported-scen.patch @@ -0,0 +1,93 @@ +From 097f72427f4f5da4fdcdbeee52aea0c1f67d54dc Mon Sep 17 00:00:00 2001 +From: Leonardo Bras +Date: Tue, 19 Jul 2022 09:23:45 -0300 +Subject: [PATCH 6/9] migration: Avoid false-positive on non-supported + scenarios for zero-copy-send +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Leonardo Brás +RH-MergeRequest: 201: Zero-copy-send fixes + improvements +RH-Commit: [6/8] f23195f3ab4f6eba0463f38e5971ccaccdac2cfd +RH-Bugzilla: 2110203 +RH-Acked-by: Dr. David Alan Gilbert +RH-Acked-by: Peter Xu +RH-Acked-by: Miroslav Rezanina + +Migration with zero-copy-send currently has it's limitations, as it can't +be used with TLS nor any kind of compression. In such scenarios, it should +output errors during parameter / capability setting. + +But currently there are some ways of setting this not-supported scenarios +without printing the error message: + +!) For 'compression' capability, it works by enabling it together with +zero-copy-send. This happens because the validity test for zero-copy uses +the helper unction migrate_use_compression(), which check for compression +presence in s->enabled_capabilities[MIGRATION_CAPABILITY_COMPRESS]. + +The point here is: the validity test happens before the capability gets +enabled. If all of them get enabled together, this test will not return +error. + +In order to fix that, replace migrate_use_compression() by directly testing +the cap_list parameter migrate_caps_check(). + +2) For features enabled by parameters such as TLS & 'multifd_compression', +there was also a possibility of setting non-supported scenarios: setting +zero-copy-send first, then setting the unsupported parameter. + +In order to fix that, also add a check for parameters conflicting with +zero-copy-send on migrate_params_check(). + +3) XBZRLE is also a compression capability, so it makes sense to also add +it to the list of capabilities which are not supported with zero-copy-send. + +Fixes: 1abaec9a1b2c ("migration: Change zero_copy_send from migration parameter to migration capability") +Signed-off-by: Leonardo Bras +Message-Id: <20220719122345.253713-1-leobras@redhat.com> +Reviewed-by: Dr. David Alan Gilbert +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 90eb69e4f1a16b388d0483543bf6bfc69a9966e4) +Signed-off-by: Leonardo Bras +--- + migration/migration.c | 15 ++++++++++++++- + 1 file changed, 14 insertions(+), 1 deletion(-) + +diff --git a/migration/migration.c b/migration/migration.c +index 952a26c5c2..35b3197eff 100644 +--- a/migration/migration.c ++++ b/migration/migration.c +@@ -1260,7 +1260,9 @@ static bool migrate_caps_check(bool *cap_list, + #ifdef CONFIG_LINUX + if (cap_list[MIGRATION_CAPABILITY_ZERO_COPY_SEND] && + (!cap_list[MIGRATION_CAPABILITY_MULTIFD] || +- migrate_use_compression() || ++ cap_list[MIGRATION_CAPABILITY_COMPRESS] || ++ cap_list[MIGRATION_CAPABILITY_XBZRLE] || ++ migrate_multifd_compression() || + migrate_use_tls())) { + error_setg(errp, + "Zero copy only available for non-compressed non-TLS multifd migration"); +@@ -1497,6 +1499,17 @@ static bool migrate_params_check(MigrationParameters *params, Error **errp) + error_prepend(errp, "Invalid mapping given for block-bitmap-mapping: "); + return false; + } ++ ++#ifdef CONFIG_LINUX ++ if (migrate_use_zero_copy_send() && ++ ((params->has_multifd_compression && params->multifd_compression) || ++ (params->has_tls_creds && params->tls_creds && *params->tls_creds))) { ++ error_setg(errp, ++ "Zero copy only available for non-compressed non-TLS multifd migration"); ++ return false; ++ } ++#endif ++ + return true; + } + +-- +2.31.1 + diff --git a/SOURCES/kvm-migration-Change-zero_copy_send-from-migration-param.patch b/SOURCES/kvm-migration-Change-zero_copy_send-from-migration-param.patch new file mode 100644 index 0000000..b1f576d --- /dev/null +++ b/SOURCES/kvm-migration-Change-zero_copy_send-from-migration-param.patch @@ -0,0 +1,289 @@ +From 70108ff9ffe77062116e47670c0e0c2396529f88 Mon Sep 17 00:00:00 2001 +From: Leonardo Bras +Date: Mon, 20 Jun 2022 02:39:45 -0300 +Subject: [PATCH 26/37] migration: Change zero_copy_send from migration + parameter to migration capability +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Leonardo Brás +RH-MergeRequest: 191: MSG_ZEROCOPY + Multifd @ rhel8.7 +RH-Commit: [26/26] ea61e6cbdbe47611bd22d18988e1c4c4e8357cc3 +RH-Bugzilla: 2072049 +RH-Acked-by: Peter Xu +RH-Acked-by: Daniel P. Berrangé +RH-Acked-by: Dr. David Alan Gilbert + +When originally implemented, zero_copy_send was designed as a Migration +paramenter. + +But taking into account how is that supposed to work, and how +the difference between a capability and a parameter, it only makes sense +that zero-copy-send would work better as a capability. + +Taking into account how recently the change got merged, it was decided +that it's still time to make it right, and convert zero_copy_send into +a Migration capability. + +Signed-off-by: Leonardo Bras +Reviewed-by: Juan Quintela +Acked-by: Markus Armbruster +Acked-by: Peter Xu +Signed-off-by: Juan Quintela +Signed-off-by: Dr. David Alan Gilbert + dgilbert: always define the capability, even on non-Linux but error if +set; avoids build problems with the capability +(cherry picked from commit 1abaec9a1b2c23f7aa94709a422128d9e42c3e0b) +Signed-off-by: Leonardo Bras +--- + migration/migration.c | 58 +++++++++++++++++++------------------------ + monitor/hmp-cmds.c | 6 ----- + qapi/migration.json | 33 +++++++----------------- + 3 files changed, 34 insertions(+), 63 deletions(-) + +diff --git a/migration/migration.c b/migration/migration.c +index 5357efd348..c8aa55d2fe 100644 +--- a/migration/migration.c ++++ b/migration/migration.c +@@ -162,7 +162,8 @@ INITIALIZE_MIGRATE_CAPS_SET(check_caps_background_snapshot, + MIGRATION_CAPABILITY_COMPRESS, + MIGRATION_CAPABILITY_XBZRLE, + MIGRATION_CAPABILITY_X_COLO, +- MIGRATION_CAPABILITY_VALIDATE_UUID); ++ MIGRATION_CAPABILITY_VALIDATE_UUID, ++ MIGRATION_CAPABILITY_ZERO_COPY_SEND); + + bool migrate_pre_2_2; + +@@ -888,10 +889,6 @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp) + params->multifd_zlib_level = s->parameters.multifd_zlib_level; + params->has_multifd_zstd_level = true; + params->multifd_zstd_level = s->parameters.multifd_zstd_level; +-#ifdef CONFIG_LINUX +- params->has_zero_copy_send = true; +- params->zero_copy_send = s->parameters.zero_copy_send; +-#endif + params->has_xbzrle_cache_size = true; + params->xbzrle_cache_size = s->parameters.xbzrle_cache_size; + params->has_max_postcopy_bandwidth = true; +@@ -1249,6 +1246,24 @@ static bool migrate_caps_check(bool *cap_list, + } + } + ++#ifdef CONFIG_LINUX ++ if (cap_list[MIGRATION_CAPABILITY_ZERO_COPY_SEND] && ++ (!cap_list[MIGRATION_CAPABILITY_MULTIFD] || ++ migrate_use_compression() || ++ migrate_use_tls())) { ++ error_setg(errp, ++ "Zero copy only available for non-compressed non-TLS multifd migration"); ++ return false; ++ } ++#else ++ if (cap_list[MIGRATION_CAPABILITY_ZERO_COPY_SEND]) { ++ error_setg(errp, ++ "Zero copy currently only available on Linux"); ++ return false; ++ } ++#endif ++ ++ + /* incoming side only */ + if (runstate_check(RUN_STATE_INMIGRATE) && + !migrate_multifd_is_allowed() && +@@ -1471,16 +1486,6 @@ static bool migrate_params_check(MigrationParameters *params, Error **errp) + error_prepend(errp, "Invalid mapping given for block-bitmap-mapping: "); + return false; + } +-#ifdef CONFIG_LINUX +- if (params->zero_copy_send && +- (!migrate_use_multifd() || +- params->multifd_compression != MULTIFD_COMPRESSION_NONE || +- (params->tls_creds && *params->tls_creds))) { +- error_setg(errp, +- "Zero copy only available for non-compressed non-TLS multifd migration"); +- return false; +- } +-#endif + return true; + } + +@@ -1554,11 +1559,6 @@ static void migrate_params_test_apply(MigrateSetParameters *params, + if (params->has_multifd_compression) { + dest->multifd_compression = params->multifd_compression; + } +-#ifdef CONFIG_LINUX +- if (params->has_zero_copy_send) { +- dest->zero_copy_send = params->zero_copy_send; +- } +-#endif + if (params->has_xbzrle_cache_size) { + dest->xbzrle_cache_size = params->xbzrle_cache_size; + } +@@ -1671,11 +1671,6 @@ static void migrate_params_apply(MigrateSetParameters *params, Error **errp) + if (params->has_multifd_compression) { + s->parameters.multifd_compression = params->multifd_compression; + } +-#ifdef CONFIG_LINUX +- if (params->has_zero_copy_send) { +- s->parameters.zero_copy_send = params->zero_copy_send; +- } +-#endif + if (params->has_xbzrle_cache_size) { + s->parameters.xbzrle_cache_size = params->xbzrle_cache_size; + xbzrle_cache_resize(params->xbzrle_cache_size, errp); +@@ -2573,7 +2568,7 @@ bool migrate_use_zero_copy_send(void) + + s = migrate_get_current(); + +- return s->parameters.zero_copy_send; ++ return s->enabled_capabilities[MIGRATION_CAPABILITY_ZERO_COPY_SEND]; + } + #endif + +@@ -4236,10 +4231,6 @@ static Property migration_properties[] = { + DEFINE_PROP_UINT8("multifd-zstd-level", MigrationState, + parameters.multifd_zstd_level, + DEFAULT_MIGRATE_MULTIFD_ZSTD_LEVEL), +-#ifdef CONFIG_LINUX +- DEFINE_PROP_BOOL("zero_copy_send", MigrationState, +- parameters.zero_copy_send, false), +-#endif + DEFINE_PROP_SIZE("xbzrle-cache-size", MigrationState, + parameters.xbzrle_cache_size, + DEFAULT_MIGRATE_XBZRLE_CACHE_SIZE), +@@ -4277,6 +4268,10 @@ static Property migration_properties[] = { + DEFINE_PROP_MIG_CAP("x-multifd", MIGRATION_CAPABILITY_MULTIFD), + DEFINE_PROP_MIG_CAP("x-background-snapshot", + MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT), ++#ifdef CONFIG_LINUX ++ DEFINE_PROP_MIG_CAP("x-zero-copy-send", ++ MIGRATION_CAPABILITY_ZERO_COPY_SEND), ++#endif + + DEFINE_PROP_END_OF_LIST(), + }; +@@ -4337,9 +4332,6 @@ static void migration_instance_init(Object *obj) + params->has_multifd_compression = true; + params->has_multifd_zlib_level = true; + params->has_multifd_zstd_level = true; +-#ifdef CONFIG_LINUX +- params->has_zero_copy_send = true; +-#endif + params->has_xbzrle_cache_size = true; + params->has_max_postcopy_bandwidth = true; + params->has_max_cpu_throttle = true; +diff --git a/monitor/hmp-cmds.c b/monitor/hmp-cmds.c +index e02da5008b..2669156b28 100644 +--- a/monitor/hmp-cmds.c ++++ b/monitor/hmp-cmds.c +@@ -1297,12 +1297,6 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) + p->has_multifd_zstd_level = true; + visit_type_uint8(v, param, &p->multifd_zstd_level, &err); + break; +-#ifdef CONFIG_LINUX +- case MIGRATION_PARAMETER_ZERO_COPY_SEND: +- p->has_zero_copy_send = true; +- visit_type_bool(v, param, &p->zero_copy_send, &err); +- break; +-#endif + case MIGRATION_PARAMETER_XBZRLE_CACHE_SIZE: + p->has_xbzrle_cache_size = true; + if (!visit_type_size(v, param, &cache_size, &err)) { +diff --git a/qapi/migration.json b/qapi/migration.json +index 59b5c5780b..fe70a0c4b2 100644 +--- a/qapi/migration.json ++++ b/qapi/migration.json +@@ -452,6 +452,13 @@ + # procedure starts. The VM RAM is saved with running VM. + # (since 6.0) + # ++# @zero-copy-send: Controls behavior on sending memory pages on migration. ++# When true, enables a zero-copy mechanism for sending ++# memory pages, if host supports it. ++# Requires that QEMU be permitted to use locked memory ++# for guest RAM pages. ++# (since 7.1) ++# + # Features: + # @unstable: Members @x-colo and @x-ignore-shared are experimental. + # +@@ -465,7 +472,8 @@ + 'block', 'return-path', 'pause-before-switchover', 'multifd', + 'dirty-bitmaps', 'postcopy-blocktime', 'late-block-activate', + { 'name': 'x-ignore-shared', 'features': [ 'unstable' ] }, +- 'validate-uuid', 'background-snapshot'] } ++ 'validate-uuid', 'background-snapshot', ++ 'zero-copy-send'] } + + ## + # @MigrationCapabilityStatus: +@@ -730,12 +738,6 @@ + # will consume more CPU. + # Defaults to 1. (Since 5.0) + # +-# @zero-copy-send: Controls behavior on sending memory pages on migration. +-# When true, enables a zero-copy mechanism for sending +-# memory pages, if host supports it. +-# Requires that QEMU be permitted to use locked memory +-# for guest RAM pages. +-# Defaults to false. (Since 7.1) + # + # @block-bitmap-mapping: Maps block nodes and bitmaps on them to + # aliases for the purpose of dirty bitmap migration. Such +@@ -776,7 +778,6 @@ + 'xbzrle-cache-size', 'max-postcopy-bandwidth', + 'max-cpu-throttle', 'multifd-compression', + 'multifd-zlib-level' ,'multifd-zstd-level', +- { 'name': 'zero-copy-send', 'if' : 'CONFIG_LINUX'}, + 'block-bitmap-mapping' ] } + + ## +@@ -903,13 +904,6 @@ + # will consume more CPU. + # Defaults to 1. (Since 5.0) + # +-# @zero-copy-send: Controls behavior on sending memory pages on migration. +-# When true, enables a zero-copy mechanism for sending +-# memory pages, if host supports it. +-# Requires that QEMU be permitted to use locked memory +-# for guest RAM pages. +-# Defaults to false. (Since 7.1) +-# + # @block-bitmap-mapping: Maps block nodes and bitmaps on them to + # aliases for the purpose of dirty bitmap migration. Such + # aliases may for example be the corresponding names on the +@@ -964,7 +958,6 @@ + '*multifd-compression': 'MultiFDCompression', + '*multifd-zlib-level': 'uint8', + '*multifd-zstd-level': 'uint8', +- '*zero-copy-send': { 'type': 'bool', 'if': 'CONFIG_LINUX' }, + '*block-bitmap-mapping': [ 'BitmapMigrationNodeAlias' ] } } + + ## +@@ -1111,13 +1104,6 @@ + # will consume more CPU. + # Defaults to 1. (Since 5.0) + # +-# @zero-copy-send: Controls behavior on sending memory pages on migration. +-# When true, enables a zero-copy mechanism for sending +-# memory pages, if host supports it. +-# Requires that QEMU be permitted to use locked memory +-# for guest RAM pages. +-# Defaults to false. (Since 7.1) +-# + # @block-bitmap-mapping: Maps block nodes and bitmaps on them to + # aliases for the purpose of dirty bitmap migration. Such + # aliases may for example be the corresponding names on the +@@ -1170,7 +1156,6 @@ + '*multifd-compression': 'MultiFDCompression', + '*multifd-zlib-level': 'uint8', + '*multifd-zstd-level': 'uint8', +- '*zero-copy-send': { 'type': 'bool', 'if': 'CONFIG_LINUX' }, + '*block-bitmap-mapping': [ 'BitmapMigrationNodeAlias' ] } } + + ## +-- +2.35.3 + diff --git a/SOURCES/kvm-migration-Introduce-ram_transferred_add.patch b/SOURCES/kvm-migration-Introduce-ram_transferred_add.patch new file mode 100644 index 0000000..561e231 --- /dev/null +++ b/SOURCES/kvm-migration-Introduce-ram_transferred_add.patch @@ -0,0 +1,122 @@ +From 030b54f5a2b2c8976370c962e9847af4746ac2c2 Mon Sep 17 00:00:00 2001 +From: David Edmondson +Date: Tue, 21 Dec 2021 09:34:40 +0000 +Subject: [PATCH 1/9] migration: Introduce ram_transferred_add() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Leonardo Brás +RH-MergeRequest: 201: Zero-copy-send fixes + improvements +RH-Commit: [1/8] a6545760b0de13d533f6164be0545a6720bb42c7 +RH-Bugzilla: 2110203 +RH-Acked-by: Dr. David Alan Gilbert +RH-Acked-by: Peter Xu +RH-Acked-by: Miroslav Rezanina + +Replace direct manipulation of ram_counters.transferred with a +function. + +Signed-off-by: David Edmondson +Reviewed-by: Philippe Mathieu-Daudé +Reviewed-by: Juan Quintela +Signed-off-by: Juan Quintela +(cherry picked from commit 4c2d0f6dca24f3396ab0718ad3f9f53cc53004df) +Signed-off-by: Leonardo Bras +--- + migration/ram.c | 23 ++++++++++++++--------- + 1 file changed, 14 insertions(+), 9 deletions(-) + +diff --git a/migration/ram.c b/migration/ram.c +index 3e208efca7..3e82c4ff46 100644 +--- a/migration/ram.c ++++ b/migration/ram.c +@@ -391,6 +391,11 @@ uint64_t ram_bytes_remaining(void) + + MigrationStats ram_counters; + ++static void ram_transferred_add(uint64_t bytes) ++{ ++ ram_counters.transferred += bytes; ++} ++ + /* used by the search for pages to send */ + struct PageSearchStatus { + /* Current block being searched */ +@@ -772,7 +777,7 @@ static int save_xbzrle_page(RAMState *rs, uint8_t **current_data, + * RAM_SAVE_FLAG_CONTINUE. + */ + xbzrle_counters.bytes += bytes_xbzrle - 8; +- ram_counters.transferred += bytes_xbzrle; ++ ram_transferred_add(bytes_xbzrle); + + return 1; + } +@@ -1203,7 +1208,7 @@ static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset) + + if (len) { + ram_counters.duplicate++; +- ram_counters.transferred += len; ++ ram_transferred_add(len); + return 1; + } + return -1; +@@ -1239,7 +1244,7 @@ static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, + } + + if (bytes_xmit) { +- ram_counters.transferred += bytes_xmit; ++ ram_transferred_add(bytes_xmit); + *pages = 1; + } + +@@ -1270,8 +1275,8 @@ static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, + static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, + uint8_t *buf, bool async) + { +- ram_counters.transferred += save_page_header(rs, rs->f, block, +- offset | RAM_SAVE_FLAG_PAGE); ++ ram_transferred_add(save_page_header(rs, rs->f, block, ++ offset | RAM_SAVE_FLAG_PAGE)); + if (async) { + qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE, + migrate_release_ram() & +@@ -1279,7 +1284,7 @@ static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, + } else { + qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE); + } +- ram_counters.transferred += TARGET_PAGE_SIZE; ++ ram_transferred_add(TARGET_PAGE_SIZE); + ram_counters.normal++; + return 1; + } +@@ -1378,7 +1383,7 @@ exit: + static void + update_compress_thread_counts(const CompressParam *param, int bytes_xmit) + { +- ram_counters.transferred += bytes_xmit; ++ ram_transferred_add(bytes_xmit); + + if (param->zero_page) { + ram_counters.duplicate++; +@@ -2303,7 +2308,7 @@ void acct_update_position(QEMUFile *f, size_t size, bool zero) + ram_counters.duplicate += pages; + } else { + ram_counters.normal += pages; +- ram_counters.transferred += size; ++ ram_transferred_add(size); + qemu_update_position(f, size); + } + } +@@ -3147,7 +3152,7 @@ out: + + qemu_put_be64(f, RAM_SAVE_FLAG_EOS); + qemu_fflush(f); +- ram_counters.transferred += 8; ++ ram_transferred_add(8); + + ret = qemu_file_get_error(f); + } +-- +2.31.1 + diff --git a/SOURCES/kvm-migration-Never-call-twice-qemu_target_page_size.patch b/SOURCES/kvm-migration-Never-call-twice-qemu_target_page_size.patch new file mode 100644 index 0000000..d956712 --- /dev/null +++ b/SOURCES/kvm-migration-Never-call-twice-qemu_target_page_size.patch @@ -0,0 +1,116 @@ +From 6a9a5a2809cbbe2982df156722b88efeec998e3d Mon Sep 17 00:00:00 2001 +From: Juan Quintela +Date: Wed, 18 May 2022 02:52:22 -0300 +Subject: [PATCH 01/37] migration: Never call twice qemu_target_page_size() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Leonardo Brás +RH-MergeRequest: 191: MSG_ZEROCOPY + Multifd @ rhel8.7 +RH-Commit: [1/26] 809ca84dec80bafc1959df8c9e57f482ee752a97 +RH-Bugzilla: 2072049 +RH-Acked-by: Peter Xu +RH-Acked-by: Daniel P. Berrangé +RH-Acked-by: Dr. David Alan Gilbert + +Signed-off-by: Juan Quintela +Reviewed-by: Dr. David Alan Gilbert +(cherry picked from commit 144fa06b3431e806057ce1438338395b35a3e544) +Signed-off-by: Leonardo Bras +--- + migration/migration.c | 7 ++++--- + migration/multifd.c | 7 ++++--- + migration/savevm.c | 5 +++-- + 3 files changed, 11 insertions(+), 8 deletions(-) + +diff --git a/migration/migration.c b/migration/migration.c +index a87ff01b81..8a13294da6 100644 +--- a/migration/migration.c ++++ b/migration/migration.c +@@ -992,6 +992,8 @@ static void populate_time_info(MigrationInfo *info, MigrationState *s) + + static void populate_ram_info(MigrationInfo *info, MigrationState *s) + { ++ size_t page_size = qemu_target_page_size(); ++ + info->has_ram = true; + info->ram = g_malloc0(sizeof(*info->ram)); + info->ram->transferred = ram_counters.transferred; +@@ -1000,12 +1002,11 @@ static void populate_ram_info(MigrationInfo *info, MigrationState *s) + /* legacy value. It is not used anymore */ + info->ram->skipped = 0; + info->ram->normal = ram_counters.normal; +- info->ram->normal_bytes = ram_counters.normal * +- qemu_target_page_size(); ++ info->ram->normal_bytes = ram_counters.normal * page_size; + info->ram->mbps = s->mbps; + info->ram->dirty_sync_count = ram_counters.dirty_sync_count; + info->ram->postcopy_requests = ram_counters.postcopy_requests; +- info->ram->page_size = qemu_target_page_size(); ++ info->ram->page_size = page_size; + info->ram->multifd_bytes = ram_counters.multifd_bytes; + info->ram->pages_per_second = s->pages_per_second; + +diff --git a/migration/multifd.c b/migration/multifd.c +index 7c9deb1921..8125d0015c 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -289,7 +289,8 @@ static void multifd_send_fill_packet(MultiFDSendParams *p) + static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) + { + MultiFDPacket_t *packet = p->packet; +- uint32_t pages_max = MULTIFD_PACKET_SIZE / qemu_target_page_size(); ++ size_t page_size = qemu_target_page_size(); ++ uint32_t pages_max = MULTIFD_PACKET_SIZE / page_size; + RAMBlock *block; + int i; + +@@ -358,14 +359,14 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) + for (i = 0; i < p->pages->used; i++) { + uint64_t offset = be64_to_cpu(packet->offset[i]); + +- if (offset > (block->used_length - qemu_target_page_size())) { ++ if (offset > (block->used_length - page_size)) { + error_setg(errp, "multifd: offset too long %" PRIu64 + " (max " RAM_ADDR_FMT ")", + offset, block->used_length); + return -1; + } + p->pages->iov[i].iov_base = block->host + offset; +- p->pages->iov[i].iov_len = qemu_target_page_size(); ++ p->pages->iov[i].iov_len = page_size; + } + + return 0; +diff --git a/migration/savevm.c b/migration/savevm.c +index d59e976d50..0bef031acb 100644 +--- a/migration/savevm.c ++++ b/migration/savevm.c +@@ -1685,6 +1685,7 @@ static int loadvm_postcopy_handle_advise(MigrationIncomingState *mis, + { + PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_ADVISE); + uint64_t remote_pagesize_summary, local_pagesize_summary, remote_tps; ++ size_t page_size = qemu_target_page_size(); + Error *local_err = NULL; + + trace_loadvm_postcopy_handle_advise(); +@@ -1741,13 +1742,13 @@ static int loadvm_postcopy_handle_advise(MigrationIncomingState *mis, + } + + remote_tps = qemu_get_be64(mis->from_src_file); +- if (remote_tps != qemu_target_page_size()) { ++ if (remote_tps != page_size) { + /* + * Again, some differences could be dealt with, but for now keep it + * simple. + */ + error_report("Postcopy needs matching target page sizes (s=%d d=%zd)", +- (int)remote_tps, qemu_target_page_size()); ++ (int)remote_tps, page_size); + return -1; + } + +-- +2.35.3 + diff --git a/SOURCES/kvm-migration-Tally-pre-copy-downtime-and-post-copy-byte.patch b/SOURCES/kvm-migration-Tally-pre-copy-downtime-and-post-copy-byte.patch new file mode 100644 index 0000000..1cf4724 --- /dev/null +++ b/SOURCES/kvm-migration-Tally-pre-copy-downtime-and-post-copy-byte.patch @@ -0,0 +1,122 @@ +From 82637509cc9197ad9d1e1b286a608bf0da04b7b3 Mon Sep 17 00:00:00 2001 +From: David Edmondson +Date: Tue, 21 Dec 2021 09:34:41 +0000 +Subject: [PATCH 2/9] migration: Tally pre-copy, downtime and post-copy bytes + independently +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Leonardo Brás +RH-MergeRequest: 201: Zero-copy-send fixes + improvements +RH-Commit: [2/8] 7d1bf37a3d93da88da6525d70fc1fce1abb92b83 +RH-Bugzilla: 2110203 +RH-Acked-by: Dr. David Alan Gilbert +RH-Acked-by: Peter Xu +RH-Acked-by: Miroslav Rezanina + +Provide information on the number of bytes copied in the pre-copy, +downtime and post-copy phases of migration. + +Signed-off-by: David Edmondson +Reviewed-by: Philippe Mathieu-Daudé +Reviewed-by: Juan Quintela +Signed-off-by: Juan Quintela +(cherry picked from commit ae6806688016711bb9ec7541266d76ab511c5e3b) +Signed-off-by: Leonardo Bras +--- + migration/migration.c | 3 +++ + migration/ram.c | 7 +++++++ + monitor/hmp-cmds.c | 12 ++++++++++++ + qapi/migration.json | 13 ++++++++++++- + 4 files changed, 34 insertions(+), 1 deletion(-) + +diff --git a/migration/migration.c b/migration/migration.c +index 616c3ff32e..e100b30f00 100644 +--- a/migration/migration.c ++++ b/migration/migration.c +@@ -1016,6 +1016,9 @@ static void populate_ram_info(MigrationInfo *info, MigrationState *s) + info->ram->page_size = page_size; + info->ram->multifd_bytes = ram_counters.multifd_bytes; + info->ram->pages_per_second = s->pages_per_second; ++ info->ram->precopy_bytes = ram_counters.precopy_bytes; ++ info->ram->downtime_bytes = ram_counters.downtime_bytes; ++ info->ram->postcopy_bytes = ram_counters.postcopy_bytes; + + if (migrate_use_xbzrle()) { + info->has_xbzrle_cache = true; +diff --git a/migration/ram.c b/migration/ram.c +index 3e82c4ff46..e7173da217 100644 +--- a/migration/ram.c ++++ b/migration/ram.c +@@ -393,6 +393,13 @@ MigrationStats ram_counters; + + static void ram_transferred_add(uint64_t bytes) + { ++ if (runstate_is_running()) { ++ ram_counters.precopy_bytes += bytes; ++ } else if (migration_in_postcopy()) { ++ ram_counters.postcopy_bytes += bytes; ++ } else { ++ ram_counters.downtime_bytes += bytes; ++ } + ram_counters.transferred += bytes; + } + +diff --git a/monitor/hmp-cmds.c b/monitor/hmp-cmds.c +index 2669156b28..8c384dc1b2 100644 +--- a/monitor/hmp-cmds.c ++++ b/monitor/hmp-cmds.c +@@ -293,6 +293,18 @@ void hmp_info_migrate(Monitor *mon, const QDict *qdict) + monitor_printf(mon, "postcopy request count: %" PRIu64 "\n", + info->ram->postcopy_requests); + } ++ if (info->ram->precopy_bytes) { ++ monitor_printf(mon, "precopy ram: %" PRIu64 " kbytes\n", ++ info->ram->precopy_bytes >> 10); ++ } ++ if (info->ram->downtime_bytes) { ++ monitor_printf(mon, "downtime ram: %" PRIu64 " kbytes\n", ++ info->ram->downtime_bytes >> 10); ++ } ++ if (info->ram->postcopy_bytes) { ++ monitor_printf(mon, "postcopy ram: %" PRIu64 " kbytes\n", ++ info->ram->postcopy_bytes >> 10); ++ } + } + + if (info->has_disk) { +diff --git a/qapi/migration.json b/qapi/migration.json +index fe70a0c4b2..c8ec260ab0 100644 +--- a/qapi/migration.json ++++ b/qapi/migration.json +@@ -46,6 +46,15 @@ + # @pages-per-second: the number of memory pages transferred per second + # (Since 4.0) + # ++# @precopy-bytes: The number of bytes sent in the pre-copy phase ++# (since 7.0). ++# ++# @downtime-bytes: The number of bytes sent while the guest is paused ++# (since 7.0). ++# ++# @postcopy-bytes: The number of bytes sent during the post-copy phase ++# (since 7.0). ++# + # Since: 0.14 + ## + { 'struct': 'MigrationStats', +@@ -54,7 +63,9 @@ + 'normal-bytes': 'int', 'dirty-pages-rate' : 'int', + 'mbps' : 'number', 'dirty-sync-count' : 'int', + 'postcopy-requests' : 'int', 'page-size' : 'int', +- 'multifd-bytes' : 'uint64', 'pages-per-second' : 'uint64' } } ++ 'multifd-bytes' : 'uint64', 'pages-per-second' : 'uint64', ++ 'precopy-bytes' : 'uint64', 'downtime-bytes' : 'uint64', ++ 'postcopy-bytes' : 'uint64' } } + + ## + # @XBZRLECacheStats: +-- +2.31.1 + diff --git a/SOURCES/kvm-migration-add-remaining-params-has_-true-in-migratio.patch b/SOURCES/kvm-migration-add-remaining-params-has_-true-in-migratio.patch new file mode 100644 index 0000000..73011b3 --- /dev/null +++ b/SOURCES/kvm-migration-add-remaining-params-has_-true-in-migratio.patch @@ -0,0 +1,62 @@ +From 8aecb49fdd771c5819fccc9e750b2e9cd4e94b58 Mon Sep 17 00:00:00 2001 +From: Leonardo Bras +Date: Mon, 25 Jul 2022 22:02:35 -0300 +Subject: [PATCH 7/9] migration: add remaining params->has_* = true in + migration_instance_init() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Leonardo Brás +RH-MergeRequest: 201: Zero-copy-send fixes + improvements +RH-Commit: [7/8] fb622e5b88e14eb859d4903d9c088ba6ca63fc81 +RH-Bugzilla: 2110203 +RH-Acked-by: Dr. David Alan Gilbert +RH-Acked-by: Peter Xu +RH-Acked-by: Miroslav Rezanina + +Some of params->has_* = true are missing in migration_instance_init, this +causes migrate_params_check() to skip some tests, allowing some +unsupported scenarios. + +Fix this by adding all missing params->has_* = true in +migration_instance_init(). + +Fixes: 69ef1f36b0 ("migration: define 'tls-creds' and 'tls-hostname' migration parameters") +Fixes: 1d58872a91 ("migration: do not wait for free thread") +Fixes: d2f1d29b95 ("migration: add support for a "tls-authz" migration parameter") +Signed-off-by: Leonardo Bras +Message-Id: <20220726010235.342927-1-leobras@redhat.com> +Reviewed-by: Peter Xu +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit df67aa3e61e2c83459da7d815962d9706f1528fc) +Signed-off-by: Leonardo Bras +--- + migration/migration.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/migration/migration.c b/migration/migration.c +index 35b3197eff..51e6726dac 100644 +--- a/migration/migration.c ++++ b/migration/migration.c +@@ -4334,6 +4334,7 @@ static void migration_instance_init(Object *obj) + /* Set has_* up only for parameter checks */ + params->has_compress_level = true; + params->has_compress_threads = true; ++ params->has_compress_wait_thread = true; + params->has_decompress_threads = true; + params->has_throttle_trigger_threshold = true; + params->has_cpu_throttle_initial = true; +@@ -4354,6 +4355,9 @@ static void migration_instance_init(Object *obj) + params->has_announce_max = true; + params->has_announce_rounds = true; + params->has_announce_step = true; ++ params->has_tls_creds = true; ++ params->has_tls_hostname = true; ++ params->has_tls_authz = true; + + qemu_sem_init(&ms->postcopy_pause_sem, 0); + qemu_sem_init(&ms->postcopy_pause_rp_sem, 0); +-- +2.31.1 + diff --git a/SOURCES/kvm-migration-multifd-Report-to-user-when-zerocopy-not-w.patch b/SOURCES/kvm-migration-multifd-Report-to-user-when-zerocopy-not-w.patch new file mode 100644 index 0000000..5008e15 --- /dev/null +++ b/SOURCES/kvm-migration-multifd-Report-to-user-when-zerocopy-not-w.patch @@ -0,0 +1,83 @@ +From 2516a21205e67078cb735e9fd47ba50156c166b7 Mon Sep 17 00:00:00 2001 +From: Leonardo Bras +Date: Mon, 11 Jul 2022 18:11:13 -0300 +Subject: [PATCH 5/9] migration/multifd: Report to user when zerocopy not + working +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Leonardo Brás +RH-MergeRequest: 201: Zero-copy-send fixes + improvements +RH-Commit: [5/8] 0b2e23b7f8ae72936e11369cd44ba474ef3b9e8c +RH-Bugzilla: 2110203 +RH-Acked-by: Dr. David Alan Gilbert +RH-Acked-by: Peter Xu +RH-Acked-by: Miroslav Rezanina + +Some errors, like the lack of Scatter-Gather support by the network +interface(NETIF_F_SG) may cause sendmsg(...,MSG_ZEROCOPY) to fail on using +zero-copy, which causes it to fall back to the default copying mechanism. + +After each full dirty-bitmap scan there should be a zero-copy flush +happening, which checks for errors each of the previous calls to +sendmsg(...,MSG_ZEROCOPY). If all of them failed to use zero-copy, then +increment dirty_sync_missed_zero_copy migration stat to let the user know +about it. + +Signed-off-by: Leonardo Bras +Reviewed-by: Daniel P. Berrangé +Acked-by: Peter Xu +Message-Id: <20220711211112.18951-4-leobras@redhat.com> +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit d59c40cc483729f2e67c80e58df769ad19976fe9) +Signed-off-by: Leonardo Bras +--- + migration/multifd.c | 2 ++ + migration/ram.c | 5 +++++ + migration/ram.h | 2 ++ + 3 files changed, 9 insertions(+) + +diff --git a/migration/multifd.c b/migration/multifd.c +index 90ab4c4346..7c16523e6b 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -631,6 +631,8 @@ int multifd_send_sync_main(QEMUFile *f) + if (ret < 0) { + error_report_err(err); + return -1; ++ } else if (ret == 1) { ++ dirty_sync_missed_zero_copy(); + } + } + } +diff --git a/migration/ram.c b/migration/ram.c +index e7173da217..93cdb456ac 100644 +--- a/migration/ram.c ++++ b/migration/ram.c +@@ -403,6 +403,11 @@ static void ram_transferred_add(uint64_t bytes) + ram_counters.transferred += bytes; + } + ++void dirty_sync_missed_zero_copy(void) ++{ ++ ram_counters.dirty_sync_missed_zero_copy++; ++} ++ + /* used by the search for pages to send */ + struct PageSearchStatus { + /* Current block being searched */ +diff --git a/migration/ram.h b/migration/ram.h +index c515396a9a..69c3ccb26a 100644 +--- a/migration/ram.h ++++ b/migration/ram.h +@@ -88,4 +88,6 @@ void ram_write_tracking_prepare(void); + int ram_write_tracking_start(void); + void ram_write_tracking_stop(void); + ++void dirty_sync_missed_zero_copy(void); ++ + #endif +-- +2.31.1 + diff --git a/SOURCES/kvm-multifd-Add-missing-documentation.patch b/SOURCES/kvm-multifd-Add-missing-documentation.patch new file mode 100644 index 0000000..361f0c1 --- /dev/null +++ b/SOURCES/kvm-multifd-Add-missing-documentation.patch @@ -0,0 +1,82 @@ +From 3b567f762cbd8d4ffaf717b0baba9cf9fe9614c2 Mon Sep 17 00:00:00 2001 +From: Juan Quintela +Date: Wed, 18 May 2022 02:52:23 -0300 +Subject: [PATCH 03/37] multifd: Add missing documentation +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Leonardo Brás +RH-MergeRequest: 191: MSG_ZEROCOPY + Multifd @ rhel8.7 +RH-Commit: [3/26] 924fca4305ebd8669955d456fc1c515f509e6026 +RH-Bugzilla: 2072049 +RH-Acked-by: Peter Xu +RH-Acked-by: Daniel P. Berrangé +RH-Acked-by: Dr. David Alan Gilbert + +Signed-off-by: Juan Quintela +Reviewed-by: Dr. David Alan Gilbert +(cherry picked from commit 18ede636bc29fd8bda628fe3e5c593f8c1b734f4) +(fixed typo in commit message) +Signed-off-by: Leonardo Bras +--- + migration/multifd-zlib.c | 2 ++ + migration/multifd-zstd.c | 2 ++ + migration/multifd.c | 1 + + 3 files changed, 5 insertions(+) + +diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c +index ab4ba75d75..f403d2f031 100644 +--- a/migration/multifd-zlib.c ++++ b/migration/multifd-zlib.c +@@ -74,6 +74,7 @@ static int zlib_send_setup(MultiFDSendParams *p, Error **errp) + * Close the channel and return memory. + * + * @p: Params for the channel that we are using ++ * @errp: pointer to an error + */ + static void zlib_send_cleanup(MultiFDSendParams *p, Error **errp) + { +@@ -96,6 +97,7 @@ static void zlib_send_cleanup(MultiFDSendParams *p, Error **errp) + * + * @p: Params for the channel that we are using + * @used: number of pages used ++ * @errp: pointer to an error + */ + static int zlib_send_prepare(MultiFDSendParams *p, uint32_t used, Error **errp) + { +diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c +index 693bddf8c9..8d657f8860 100644 +--- a/migration/multifd-zstd.c ++++ b/migration/multifd-zstd.c +@@ -86,6 +86,7 @@ static int zstd_send_setup(MultiFDSendParams *p, Error **errp) + * Close the channel and return memory. + * + * @p: Params for the channel that we are using ++ * @errp: pointer to an error + */ + static void zstd_send_cleanup(MultiFDSendParams *p, Error **errp) + { +@@ -109,6 +110,7 @@ static void zstd_send_cleanup(MultiFDSendParams *p, Error **errp) + * + * @p: Params for the channel that we are using + * @used: number of pages used ++ * @errp: pointer to an error + */ + static int zstd_send_prepare(MultiFDSendParams *p, uint32_t used, Error **errp) + { +diff --git a/migration/multifd.c b/migration/multifd.c +index 8ea86d81dc..cdeffdc4c5 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -66,6 +66,7 @@ static int nocomp_send_setup(MultiFDSendParams *p, Error **errp) + * For no compression this function does nothing. + * + * @p: Params for the channel that we are using ++ * @errp: pointer to an error + */ + static void nocomp_send_cleanup(MultiFDSendParams *p, Error **errp) + { +-- +2.35.3 + diff --git a/SOURCES/kvm-multifd-Fill-offset-and-block-for-reception.patch b/SOURCES/kvm-multifd-Fill-offset-and-block-for-reception.patch new file mode 100644 index 0000000..7996f87 --- /dev/null +++ b/SOURCES/kvm-multifd-Fill-offset-and-block-for-reception.patch @@ -0,0 +1,50 @@ +From 8c1edb1889ff44506f35fa185d6569b0dd9d7260 Mon Sep 17 00:00:00 2001 +From: Juan Quintela +Date: Wed, 18 May 2022 02:52:23 -0300 +Subject: [PATCH 07/37] multifd: Fill offset and block for reception +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Leonardo Brás +RH-MergeRequest: 191: MSG_ZEROCOPY + Multifd @ rhel8.7 +RH-Commit: [7/26] 51a9e6b76af956d63fc735172211d9bf6f0f6f80 +RH-Bugzilla: 2072049 +RH-Acked-by: Peter Xu +RH-Acked-by: Daniel P. Berrangé +RH-Acked-by: Dr. David Alan Gilbert + +We were using the iov directly, but we will need this info on the +following patch. + +Signed-off-by: Juan Quintela +Reviewed-by: Dr. David Alan Gilbert +(cherry picked from commit 01102a2ef6c97acc5cc8a2c3bb62b7665a20f51f) +Signed-off-by: Leonardo Bras +--- + migration/multifd.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/migration/multifd.c b/migration/multifd.c +index 55d99a8232..0533da154a 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -354,6 +354,7 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) + return -1; + } + ++ p->pages->block = block; + for (i = 0; i < p->pages->num; i++) { + uint64_t offset = be64_to_cpu(packet->offset[i]); + +@@ -363,6 +364,7 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) + offset, block->used_length); + return -1; + } ++ p->pages->offset[i] = offset; + p->pages->iov[i].iov_base = block->host + offset; + p->pages->iov[i].iov_len = page_size; + } +-- +2.35.3 + diff --git a/SOURCES/kvm-multifd-Implement-zero-copy-write-in-multifd-migrati.patch b/SOURCES/kvm-multifd-Implement-zero-copy-write-in-multifd-migrati.patch new file mode 100644 index 0000000..dccdf1f --- /dev/null +++ b/SOURCES/kvm-multifd-Implement-zero-copy-write-in-multifd-migrati.patch @@ -0,0 +1,182 @@ +From 7a7e2191f1ac4114380248cbd3c6ab7425250747 Mon Sep 17 00:00:00 2001 +From: Leonardo Bras +Date: Wed, 18 May 2022 02:52:25 -0300 +Subject: [PATCH 23/37] multifd: Implement zero copy write in multifd migration + (multifd-zero-copy) +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Leonardo Brás +RH-MergeRequest: 191: MSG_ZEROCOPY + Multifd @ rhel8.7 +RH-Commit: [23/26] 904ce3909cfef62dd84cc7d3c6a3482e7e6f28e9 +RH-Bugzilla: 2072049 +RH-Acked-by: Peter Xu +RH-Acked-by: Daniel P. Berrangé +RH-Acked-by: Dr. David Alan Gilbert + +Implement zero copy send on nocomp_send_write(), by making use of QIOChannel +writev + flags & flush interface. + +Change multifd_send_sync_main() so flush_zero_copy() can be called +after each iteration in order to make sure all dirty pages are sent before +a new iteration is started. It will also flush at the beginning and at the +end of migration. + +Also make it return -1 if flush_zero_copy() fails, in order to cancel +the migration process, and avoid resuming the guest in the target host +without receiving all current RAM. + +This will work fine on RAM migration because the RAM pages are not usually freed, +and there is no problem on changing the pages content between writev_zero_copy() and +the actual sending of the buffer, because this change will dirty the page and +cause it to be re-sent on a next iteration anyway. + +A lot of locked memory may be needed in order to use multifd migration +with zero-copy enabled, so disabling the feature should be necessary for +low-privileged users trying to perform multifd migrations. + +Signed-off-by: Leonardo Bras +Reviewed-by: Peter Xu +Reviewed-by: Daniel P. Berrangé +Message-Id: <20220513062836.965425-9-leobras@redhat.com> +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 5b1d9bab2da4fca3a3caee97c430e5709cb32b7b) +Signed-off-by: Leonardo Bras +--- + migration/migration.c | 11 ++++++++++- + migration/multifd.c | 37 +++++++++++++++++++++++++++++++++++-- + migration/multifd.h | 2 ++ + migration/socket.c | 5 +++-- + 4 files changed, 50 insertions(+), 5 deletions(-) + +diff --git a/migration/migration.c b/migration/migration.c +index 8e28f2ee41..5357efd348 100644 +--- a/migration/migration.c ++++ b/migration/migration.c +@@ -1471,7 +1471,16 @@ static bool migrate_params_check(MigrationParameters *params, Error **errp) + error_prepend(errp, "Invalid mapping given for block-bitmap-mapping: "); + return false; + } +- ++#ifdef CONFIG_LINUX ++ if (params->zero_copy_send && ++ (!migrate_use_multifd() || ++ params->multifd_compression != MULTIFD_COMPRESSION_NONE || ++ (params->tls_creds && *params->tls_creds))) { ++ error_setg(errp, ++ "Zero copy only available for non-compressed non-TLS multifd migration"); ++ return false; ++ } ++#endif + return true; + } + +diff --git a/migration/multifd.c b/migration/multifd.c +index 193f70cdba..90ab4c4346 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -576,6 +576,7 @@ void multifd_save_cleanup(void) + int multifd_send_sync_main(QEMUFile *f) + { + int i; ++ bool flush_zero_copy; + + if (!migrate_use_multifd()) { + return 0; +@@ -586,6 +587,20 @@ int multifd_send_sync_main(QEMUFile *f) + return -1; + } + } ++ ++ /* ++ * When using zero-copy, it's necessary to flush the pages before any of ++ * the pages can be sent again, so we'll make sure the new version of the ++ * pages will always arrive _later_ than the old pages. ++ * ++ * Currently we achieve this by flushing the zero-page requested writes ++ * per ram iteration, but in the future we could potentially optimize it ++ * to be less frequent, e.g. only after we finished one whole scanning of ++ * all the dirty bitmaps. ++ */ ++ ++ flush_zero_copy = migrate_use_zero_copy_send(); ++ + for (i = 0; i < migrate_multifd_channels(); i++) { + MultiFDSendParams *p = &multifd_send_state->params[i]; + +@@ -607,6 +622,17 @@ int multifd_send_sync_main(QEMUFile *f) + ram_counters.transferred += p->packet_len; + qemu_mutex_unlock(&p->mutex); + qemu_sem_post(&p->sem); ++ ++ if (flush_zero_copy && p->c) { ++ int ret; ++ Error *err = NULL; ++ ++ ret = qio_channel_flush(p->c, &err); ++ if (ret < 0) { ++ error_report_err(err); ++ return -1; ++ } ++ } + } + for (i = 0; i < migrate_multifd_channels(); i++) { + MultiFDSendParams *p = &multifd_send_state->params[i]; +@@ -691,8 +717,8 @@ static void *multifd_send_thread(void *opaque) + p->iov[0].iov_base = p->packet; + } + +- ret = qio_channel_writev_all(p->c, p->iov, p->iovs_num, +- &local_err); ++ ret = qio_channel_writev_full_all(p->c, p->iov, p->iovs_num, NULL, ++ 0, p->write_flags, &local_err); + if (ret != 0) { + break; + } +@@ -933,6 +959,13 @@ int multifd_save_setup(Error **errp) + /* We need one extra place for the packet header */ + p->iov = g_new0(struct iovec, page_count + 1); + p->normal = g_new0(ram_addr_t, page_count); ++ ++ if (migrate_use_zero_copy_send()) { ++ p->write_flags = QIO_CHANNEL_WRITE_FLAG_ZERO_COPY; ++ } else { ++ p->write_flags = 0; ++ } ++ + socket_send_channel_create(multifd_new_send_channel_async, p); + } + +diff --git a/migration/multifd.h b/migration/multifd.h +index 92de878155..11d5e273e6 100644 +--- a/migration/multifd.h ++++ b/migration/multifd.h +@@ -95,6 +95,8 @@ typedef struct { + uint32_t packet_len; + /* pointer to the packet */ + MultiFDPacket_t *packet; ++ /* multifd flags for sending ram */ ++ int write_flags; + /* multifd flags for each packet */ + uint32_t flags; + /* size of the next packet that contains pages */ +diff --git a/migration/socket.c b/migration/socket.c +index 3754d8f72c..4fd5e85f50 100644 +--- a/migration/socket.c ++++ b/migration/socket.c +@@ -79,8 +79,9 @@ static void socket_outgoing_migration(QIOTask *task, + + trace_migration_socket_outgoing_connected(data->hostname); + +- if (migrate_use_zero_copy_send()) { +- error_setg(&err, "Zero copy send not available in migration"); ++ if (migrate_use_zero_copy_send() && ++ !qio_channel_has_feature(sioc, QIO_CHANNEL_FEATURE_WRITE_ZERO_COPY)) { ++ error_setg(&err, "Zero copy send feature not detected in host kernel"); + } + + out: +-- +2.35.3 + diff --git a/SOURCES/kvm-multifd-Make-zlib-compression-method-not-use-iovs.patch b/SOURCES/kvm-multifd-Make-zlib-compression-method-not-use-iovs.patch new file mode 100644 index 0000000..e23d35d --- /dev/null +++ b/SOURCES/kvm-multifd-Make-zlib-compression-method-not-use-iovs.patch @@ -0,0 +1,98 @@ +From 75cd92cb7cff055f46163e64d66ba3f685f9ac04 Mon Sep 17 00:00:00 2001 +From: Juan Quintela +Date: Wed, 18 May 2022 02:52:23 -0300 +Subject: [PATCH 09/37] multifd: Make zlib compression method not use iovs +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Leonardo Brás +RH-MergeRequest: 191: MSG_ZEROCOPY + Multifd @ rhel8.7 +RH-Commit: [9/26] d33dd62b833d50fee989a195aebcc8d5e7d43181 +RH-Bugzilla: 2072049 +RH-Acked-by: Peter Xu +RH-Acked-by: Daniel P. Berrangé +RH-Acked-by: Dr. David Alan Gilbert + +Signed-off-by: Juan Quintela +Reviewed-by: Dr. David Alan Gilbert +(cherry picked from commit a5ed22948873b50fcf1415d1ce15c71d61a9388d) +Signed-off-by: Leonardo Bras +--- + migration/multifd-zlib.c | 17 +++++++++-------- + 1 file changed, 9 insertions(+), 8 deletions(-) + +diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c +index 330fc021c5..a1950a4588 100644 +--- a/migration/multifd-zlib.c ++++ b/migration/multifd-zlib.c +@@ -13,6 +13,7 @@ + #include "qemu/osdep.h" + #include + #include "qemu/rcu.h" ++#include "exec/ramblock.h" + #include "exec/target_page.h" + #include "qapi/error.h" + #include "migration.h" +@@ -100,8 +101,8 @@ static void zlib_send_cleanup(MultiFDSendParams *p, Error **errp) + */ + static int zlib_send_prepare(MultiFDSendParams *p, Error **errp) + { +- struct iovec *iov = p->pages->iov; + struct zlib_data *z = p->data; ++ size_t page_size = qemu_target_page_size(); + z_stream *zs = &z->zs; + uint32_t out_size = 0; + int ret; +@@ -115,8 +116,8 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp) + flush = Z_SYNC_FLUSH; + } + +- zs->avail_in = iov[i].iov_len; +- zs->next_in = iov[i].iov_base; ++ zs->avail_in = page_size; ++ zs->next_in = p->pages->block->host + p->pages->offset[i]; + + zs->avail_out = available; + zs->next_out = z->zbuff + out_size; +@@ -240,6 +241,7 @@ static void zlib_recv_cleanup(MultiFDRecvParams *p) + static int zlib_recv_pages(MultiFDRecvParams *p, Error **errp) + { + struct zlib_data *z = p->data; ++ size_t page_size = qemu_target_page_size(); + z_stream *zs = &z->zs; + uint32_t in_size = p->next_packet_size; + /* we measure the change of total_out */ +@@ -264,7 +266,6 @@ static int zlib_recv_pages(MultiFDRecvParams *p, Error **errp) + zs->next_in = z->zbuff; + + for (i = 0; i < p->pages->num; i++) { +- struct iovec *iov = &p->pages->iov[i]; + int flush = Z_NO_FLUSH; + unsigned long start = zs->total_out; + +@@ -272,8 +273,8 @@ static int zlib_recv_pages(MultiFDRecvParams *p, Error **errp) + flush = Z_SYNC_FLUSH; + } + +- zs->avail_out = iov->iov_len; +- zs->next_out = iov->iov_base; ++ zs->avail_out = page_size; ++ zs->next_out = p->pages->block->host + p->pages->offset[i]; + + /* + * Welcome to inflate semantics +@@ -286,8 +287,8 @@ static int zlib_recv_pages(MultiFDRecvParams *p, Error **errp) + do { + ret = inflate(zs, flush); + } while (ret == Z_OK && zs->avail_in +- && (zs->total_out - start) < iov->iov_len); +- if (ret == Z_OK && (zs->total_out - start) < iov->iov_len) { ++ && (zs->total_out - start) < page_size); ++ if (ret == Z_OK && (zs->total_out - start) < page_size) { + error_setg(errp, "multifd %d: inflate generated too few output", + p->id); + return -1; +-- +2.35.3 + diff --git a/SOURCES/kvm-multifd-Make-zlib-use-iov-s.patch b/SOURCES/kvm-multifd-Make-zlib-use-iov-s.patch new file mode 100644 index 0000000..6310738 --- /dev/null +++ b/SOURCES/kvm-multifd-Make-zlib-use-iov-s.patch @@ -0,0 +1,53 @@ +From 1cdab9cadef1ed84ec34651a1edbffa36c1e67d0 Mon Sep 17 00:00:00 2001 +From: Juan Quintela +Date: Wed, 18 May 2022 02:52:24 -0300 +Subject: [PATCH 12/37] multifd: Make zlib use iov's +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Leonardo Brás +RH-MergeRequest: 191: MSG_ZEROCOPY + Multifd @ rhel8.7 +RH-Commit: [12/26] 58630452e14802e71a9eadb17cfe4964ebf8e091 +RH-Bugzilla: 2072049 +RH-Acked-by: Peter Xu +RH-Acked-by: Daniel P. Berrangé +RH-Acked-by: Dr. David Alan Gilbert + +Signed-off-by: Juan Quintela +Reviewed-by: Dr. David Alan Gilbert +(cherry picked from commit 48a4a44c1cde382c6b8e7792d01fe7d9b0a59c69) +Signed-off-by: Leonardo Bras +--- + migration/multifd-zlib.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c +index a987e4a26c..96475e096e 100644 +--- a/migration/multifd-zlib.c ++++ b/migration/multifd-zlib.c +@@ -145,6 +145,9 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp) + } + out_size += available - zs->avail_out; + } ++ p->iov[p->iovs_num].iov_base = z->zbuff; ++ p->iov[p->iovs_num].iov_len = out_size; ++ p->iovs_num++; + p->next_packet_size = out_size; + p->flags |= MULTIFD_FLAG_ZLIB; + +@@ -164,10 +167,7 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp) + */ + static int zlib_send_write(MultiFDSendParams *p, uint32_t used, Error **errp) + { +- struct zlib_data *z = p->data; +- +- return qio_channel_write_all(p->c, (void *)z->zbuff, p->next_packet_size, +- errp); ++ return qio_channel_writev_all(p->c, p->iov, p->iovs_num, errp); + } + + /** +-- +2.35.3 + diff --git a/SOURCES/kvm-multifd-Make-zstd-compression-method-not-use-iovs.patch b/SOURCES/kvm-multifd-Make-zstd-compression-method-not-use-iovs.patch new file mode 100644 index 0000000..3a10280 --- /dev/null +++ b/SOURCES/kvm-multifd-Make-zstd-compression-method-not-use-iovs.patch @@ -0,0 +1,94 @@ +From ab6262bd4829e3bd6437fe32737209df2af2d141 Mon Sep 17 00:00:00 2001 +From: Juan Quintela +Date: Wed, 18 May 2022 02:52:23 -0300 +Subject: [PATCH 08/37] multifd: Make zstd compression method not use iovs +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Leonardo Brás +RH-MergeRequest: 191: MSG_ZEROCOPY + Multifd @ rhel8.7 +RH-Commit: [8/26] 010579fa73b5a4c6fd631dc9fbaf6f974974bc99 +RH-Bugzilla: 2072049 +RH-Acked-by: Peter Xu +RH-Acked-by: Daniel P. Berrangé +RH-Acked-by: Dr. David Alan Gilbert + +Signed-off-by: Juan Quintela +Reviewed-by: Dr. David Alan Gilbert +(cherry picked from commit f5ff548774c22b34a0c0e2fef85f1be11160d774) +Signed-off-by: Leonardo Bras +--- + migration/multifd-zstd.c | 20 ++++++++++---------- + 1 file changed, 10 insertions(+), 10 deletions(-) + +diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c +index f0d1105792..d9ed42622b 100644 +--- a/migration/multifd-zstd.c ++++ b/migration/multifd-zstd.c +@@ -13,6 +13,7 @@ + #include "qemu/osdep.h" + #include + #include "qemu/rcu.h" ++#include "exec/ramblock.h" + #include "exec/target_page.h" + #include "qapi/error.h" + #include "migration.h" +@@ -113,8 +114,8 @@ static void zstd_send_cleanup(MultiFDSendParams *p, Error **errp) + */ + static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) + { +- struct iovec *iov = p->pages->iov; + struct zstd_data *z = p->data; ++ size_t page_size = qemu_target_page_size(); + int ret; + uint32_t i; + +@@ -128,8 +129,8 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) + if (i == p->pages->num - 1) { + flush = ZSTD_e_flush; + } +- z->in.src = iov[i].iov_base; +- z->in.size = iov[i].iov_len; ++ z->in.src = p->pages->block->host + p->pages->offset[i]; ++ z->in.size = page_size; + z->in.pos = 0; + + /* +@@ -261,7 +262,8 @@ static int zstd_recv_pages(MultiFDRecvParams *p, Error **errp) + { + uint32_t in_size = p->next_packet_size; + uint32_t out_size = 0; +- uint32_t expected_size = p->pages->num * qemu_target_page_size(); ++ size_t page_size = qemu_target_page_size(); ++ uint32_t expected_size = p->pages->num * page_size; + uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; + struct zstd_data *z = p->data; + int ret; +@@ -283,10 +285,8 @@ static int zstd_recv_pages(MultiFDRecvParams *p, Error **errp) + z->in.pos = 0; + + for (i = 0; i < p->pages->num; i++) { +- struct iovec *iov = &p->pages->iov[i]; +- +- z->out.dst = iov->iov_base; +- z->out.size = iov->iov_len; ++ z->out.dst = p->pages->block->host + p->pages->offset[i]; ++ z->out.size = page_size; + z->out.pos = 0; + + /* +@@ -300,8 +300,8 @@ static int zstd_recv_pages(MultiFDRecvParams *p, Error **errp) + do { + ret = ZSTD_decompressStream(z->zds, &z->out, &z->in); + } while (ret > 0 && (z->in.size - z->in.pos > 0) +- && (z->out.pos < iov->iov_len)); +- if (ret > 0 && (z->out.pos < iov->iov_len)) { ++ && (z->out.pos < page_size)); ++ if (ret > 0 && (z->out.pos < page_size)) { + error_setg(errp, "multifd %d: decompressStream buffer too small", + p->id); + return -1; +-- +2.35.3 + diff --git a/SOURCES/kvm-multifd-Make-zstd-use-iov-s.patch b/SOURCES/kvm-multifd-Make-zstd-use-iov-s.patch new file mode 100644 index 0000000..af3e7fb --- /dev/null +++ b/SOURCES/kvm-multifd-Make-zstd-use-iov-s.patch @@ -0,0 +1,53 @@ +From bac5ce0b4d3552d6056045f201b4e50dd6204b31 Mon Sep 17 00:00:00 2001 +From: Juan Quintela +Date: Wed, 18 May 2022 02:52:24 -0300 +Subject: [PATCH 13/37] multifd: Make zstd use iov's +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Leonardo Brás +RH-MergeRequest: 191: MSG_ZEROCOPY + Multifd @ rhel8.7 +RH-Commit: [13/26] 4d7036fb32efdf088d23737b9710e6ad1a4654aa +RH-Bugzilla: 2072049 +RH-Acked-by: Peter Xu +RH-Acked-by: Daniel P. Berrangé +RH-Acked-by: Dr. David Alan Gilbert + +Signed-off-by: Juan Quintela +Reviewed-by: Dr. David Alan Gilbert +(cherry picked from commit 0a818b89eb8eaf79ae651405907d8110a0935cfd) +Signed-off-by: Leonardo Bras +--- + migration/multifd-zstd.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c +index 2185a83eac..4e60cdbc54 100644 +--- a/migration/multifd-zstd.c ++++ b/migration/multifd-zstd.c +@@ -156,6 +156,9 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) + return -1; + } + } ++ p->iov[p->iovs_num].iov_base = z->zbuff; ++ p->iov[p->iovs_num].iov_len = z->out.pos; ++ p->iovs_num++; + p->next_packet_size = z->out.pos; + p->flags |= MULTIFD_FLAG_ZSTD; + +@@ -175,10 +178,7 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) + */ + static int zstd_send_write(MultiFDSendParams *p, uint32_t used, Error **errp) + { +- struct zstd_data *z = p->data; +- +- return qio_channel_write_all(p->c, (void *)z->zbuff, p->next_packet_size, +- errp); ++ return qio_channel_writev_all(p->c, p->iov, p->iovs_num, errp); + } + + /** +-- +2.35.3 + diff --git a/SOURCES/kvm-multifd-Move-iov-from-pages-to-params.patch b/SOURCES/kvm-multifd-Move-iov-from-pages-to-params.patch new file mode 100644 index 0000000..6a59707 --- /dev/null +++ b/SOURCES/kvm-multifd-Move-iov-from-pages-to-params.patch @@ -0,0 +1,190 @@ +From 1181a9cbcaf37a82aa7bf117ef209f554b8c4a71 Mon Sep 17 00:00:00 2001 +From: Juan Quintela +Date: Wed, 18 May 2022 02:52:24 -0300 +Subject: [PATCH 11/37] multifd: Move iov from pages to params +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Leonardo Brás +RH-MergeRequest: 191: MSG_ZEROCOPY + Multifd @ rhel8.7 +RH-Commit: [11/26] 24dff3ef68cf3327811242193502319ed3e3940a +RH-Bugzilla: 2072049 +RH-Acked-by: Peter Xu +RH-Acked-by: Daniel P. Berrangé +RH-Acked-by: Dr. David Alan Gilbert + +This will allow us to reduce the number of system calls on the next patch. + +Signed-off-by: Juan Quintela +(cherry picked from commit 226468ba3dea950ab4bb0b729878dde25812da1c) +Signed-off-by: Leonardo Bras +--- + migration/multifd.c | 34 ++++++++++++++++++++++++---------- + migration/multifd.h | 8 ++++++-- + 2 files changed, 30 insertions(+), 12 deletions(-) + +diff --git a/migration/multifd.c b/migration/multifd.c +index d0d19470f9..5004f394aa 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -86,7 +86,16 @@ static void nocomp_send_cleanup(MultiFDSendParams *p, Error **errp) + */ + static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp) + { +- p->next_packet_size = p->pages->num * qemu_target_page_size(); ++ MultiFDPages_t *pages = p->pages; ++ size_t page_size = qemu_target_page_size(); ++ ++ for (int i = 0; i < p->pages->num; i++) { ++ p->iov[p->iovs_num].iov_base = pages->block->host + pages->offset[i]; ++ p->iov[p->iovs_num].iov_len = page_size; ++ p->iovs_num++; ++ } ++ ++ p->next_packet_size = p->pages->num * page_size; + p->flags |= MULTIFD_FLAG_NOCOMP; + return 0; + } +@@ -104,7 +113,7 @@ static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp) + */ + static int nocomp_send_write(MultiFDSendParams *p, uint32_t used, Error **errp) + { +- return qio_channel_writev_all(p->c, p->pages->iov, used, errp); ++ return qio_channel_writev_all(p->c, p->iov, p->iovs_num, errp); + } + + /** +@@ -146,13 +155,18 @@ static void nocomp_recv_cleanup(MultiFDRecvParams *p) + static int nocomp_recv_pages(MultiFDRecvParams *p, Error **errp) + { + uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; ++ size_t page_size = qemu_target_page_size(); + + if (flags != MULTIFD_FLAG_NOCOMP) { + error_setg(errp, "multifd %u: flags received %x flags expected %x", + p->id, flags, MULTIFD_FLAG_NOCOMP); + return -1; + } +- return qio_channel_readv_all(p->c, p->pages->iov, p->pages->num, errp); ++ for (int i = 0; i < p->pages->num; i++) { ++ p->iov[i].iov_base = p->pages->block->host + p->pages->offset[i]; ++ p->iov[i].iov_len = page_size; ++ } ++ return qio_channel_readv_all(p->c, p->iov, p->pages->num, errp); + } + + static MultiFDMethods multifd_nocomp_ops = { +@@ -242,7 +256,6 @@ static MultiFDPages_t *multifd_pages_init(size_t size) + MultiFDPages_t *pages = g_new0(MultiFDPages_t, 1); + + pages->allocated = size; +- pages->iov = g_new0(struct iovec, size); + pages->offset = g_new0(ram_addr_t, size); + + return pages; +@@ -254,8 +267,6 @@ static void multifd_pages_clear(MultiFDPages_t *pages) + pages->allocated = 0; + pages->packet_num = 0; + pages->block = NULL; +- g_free(pages->iov); +- pages->iov = NULL; + g_free(pages->offset); + pages->offset = NULL; + g_free(pages); +@@ -365,8 +376,6 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) + return -1; + } + p->pages->offset[i] = offset; +- p->pages->iov[i].iov_base = block->host + offset; +- p->pages->iov[i].iov_len = page_size; + } + + return 0; +@@ -470,8 +479,6 @@ int multifd_queue_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset) + + if (pages->block == block) { + pages->offset[pages->num] = offset; +- pages->iov[pages->num].iov_base = block->host + offset; +- pages->iov[pages->num].iov_len = qemu_target_page_size(); + pages->num++; + + if (pages->num < pages->allocated) { +@@ -564,6 +571,8 @@ void multifd_save_cleanup(void) + p->packet_len = 0; + g_free(p->packet); + p->packet = NULL; ++ g_free(p->iov); ++ p->iov = NULL; + multifd_send_state->ops->send_cleanup(p, &local_err); + if (local_err) { + migrate_set_error(migrate_get_current(), local_err); +@@ -651,6 +660,7 @@ static void *multifd_send_thread(void *opaque) + uint32_t used = p->pages->num; + uint64_t packet_num = p->packet_num; + uint32_t flags = p->flags; ++ p->iovs_num = 0; + + if (used) { + ret = multifd_send_state->ops->send_prepare(p, &local_err); +@@ -919,6 +929,7 @@ int multifd_save_setup(Error **errp) + p->packet->version = cpu_to_be32(MULTIFD_VERSION); + p->name = g_strdup_printf("multifdsend_%d", i); + p->tls_hostname = g_strdup(s->hostname); ++ p->iov = g_new0(struct iovec, page_count); + socket_send_channel_create(multifd_new_send_channel_async, p); + } + +@@ -1018,6 +1029,8 @@ int multifd_load_cleanup(Error **errp) + p->packet_len = 0; + g_free(p->packet); + p->packet = NULL; ++ g_free(p->iov); ++ p->iov = NULL; + multifd_recv_state->ops->recv_cleanup(p); + } + qemu_sem_destroy(&multifd_recv_state->sem_sync); +@@ -1158,6 +1171,7 @@ int multifd_load_setup(Error **errp) + + sizeof(uint64_t) * page_count; + p->packet = g_malloc0(p->packet_len); + p->name = g_strdup_printf("multifdrecv_%d", i); ++ p->iov = g_new0(struct iovec, page_count); + } + + for (i = 0; i < thread_count; i++) { +diff --git a/migration/multifd.h b/migration/multifd.h +index e57adc783b..c3f18af364 100644 +--- a/migration/multifd.h ++++ b/migration/multifd.h +@@ -62,8 +62,6 @@ typedef struct { + uint64_t packet_num; + /* offset of each page */ + ram_addr_t *offset; +- /* pointer to each page */ +- struct iovec *iov; + RAMBlock *block; + } MultiFDPages_t; + +@@ -110,6 +108,10 @@ typedef struct { + uint64_t num_pages; + /* syncs main thread and channels */ + QemuSemaphore sem_sync; ++ /* buffers to send */ ++ struct iovec *iov; ++ /* number of iovs used */ ++ uint32_t iovs_num; + /* used for compression methods */ + void *data; + } MultiFDSendParams; +@@ -149,6 +151,8 @@ typedef struct { + uint64_t num_pages; + /* syncs main thread and channels */ + QemuSemaphore sem_sync; ++ /* buffers to recv */ ++ struct iovec *iov; + /* used for de-compression methods */ + void *data; + } MultiFDRecvParams; +-- +2.35.3 + diff --git a/SOURCES/kvm-multifd-Remove-send_write-method.patch b/SOURCES/kvm-multifd-Remove-send_write-method.patch new file mode 100644 index 0000000..79fc649 --- /dev/null +++ b/SOURCES/kvm-multifd-Remove-send_write-method.patch @@ -0,0 +1,160 @@ +From 2952487c7e5ed14796fbffae0b964a35790d6850 Mon Sep 17 00:00:00 2001 +From: Juan Quintela +Date: Wed, 18 May 2022 02:52:24 -0300 +Subject: [PATCH 14/37] multifd: Remove send_write() method +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Leonardo Brás +RH-MergeRequest: 191: MSG_ZEROCOPY + Multifd @ rhel8.7 +RH-Commit: [14/26] 5fa59ffa09099fbc6da84e9a192ca71af52cc98f +RH-Bugzilla: 2072049 +RH-Acked-by: Peter Xu +RH-Acked-by: Daniel P. Berrangé +RH-Acked-by: Dr. David Alan Gilbert + +Everything use now iov's. + +Signed-off-by: Juan Quintela +Reviewed-by: Dr. David Alan Gilbert +(cherry picked from commit 468fcb5dd0c965e1af0da9efab09b1462631da18) +Signed-off-by: Leonardo Bras +--- + migration/multifd-zlib.c | 17 ----------------- + migration/multifd-zstd.c | 17 ----------------- + migration/multifd.c | 20 ++------------------ + migration/multifd.h | 2 -- + 4 files changed, 2 insertions(+), 54 deletions(-) + +diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c +index 96475e096e..8ed29b9633 100644 +--- a/migration/multifd-zlib.c ++++ b/migration/multifd-zlib.c +@@ -154,22 +154,6 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp) + return 0; + } + +-/** +- * zlib_send_write: do the actual write of the data +- * +- * Do the actual write of the comprresed buffer. +- * +- * Returns 0 for success or -1 for error +- * +- * @p: Params for the channel that we are using +- * @used: number of pages used +- * @errp: pointer to an error +- */ +-static int zlib_send_write(MultiFDSendParams *p, uint32_t used, Error **errp) +-{ +- return qio_channel_writev_all(p->c, p->iov, p->iovs_num, errp); +-} +- + /** + * zlib_recv_setup: setup receive side + * +@@ -312,7 +296,6 @@ static MultiFDMethods multifd_zlib_ops = { + .send_setup = zlib_send_setup, + .send_cleanup = zlib_send_cleanup, + .send_prepare = zlib_send_prepare, +- .send_write = zlib_send_write, + .recv_setup = zlib_recv_setup, + .recv_cleanup = zlib_recv_cleanup, + .recv_pages = zlib_recv_pages +diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c +index 4e60cdbc54..25e1f517b5 100644 +--- a/migration/multifd-zstd.c ++++ b/migration/multifd-zstd.c +@@ -165,22 +165,6 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) + return 0; + } + +-/** +- * zstd_send_write: do the actual write of the data +- * +- * Do the actual write of the comprresed buffer. +- * +- * Returns 0 for success or -1 for error +- * +- * @p: Params for the channel that we are using +- * @used: number of pages used +- * @errp: pointer to an error +- */ +-static int zstd_send_write(MultiFDSendParams *p, uint32_t used, Error **errp) +-{ +- return qio_channel_writev_all(p->c, p->iov, p->iovs_num, errp); +-} +- + /** + * zstd_recv_setup: setup receive side + * +@@ -325,7 +309,6 @@ static MultiFDMethods multifd_zstd_ops = { + .send_setup = zstd_send_setup, + .send_cleanup = zstd_send_cleanup, + .send_prepare = zstd_send_prepare, +- .send_write = zstd_send_write, + .recv_setup = zstd_recv_setup, + .recv_cleanup = zstd_recv_cleanup, + .recv_pages = zstd_recv_pages +diff --git a/migration/multifd.c b/migration/multifd.c +index 5004f394aa..1e1551d78b 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -100,22 +100,6 @@ static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp) + return 0; + } + +-/** +- * nocomp_send_write: do the actual write of the data +- * +- * For no compression we just have to write the data. +- * +- * Returns 0 for success or -1 for error +- * +- * @p: Params for the channel that we are using +- * @used: number of pages used +- * @errp: pointer to an error +- */ +-static int nocomp_send_write(MultiFDSendParams *p, uint32_t used, Error **errp) +-{ +- return qio_channel_writev_all(p->c, p->iov, p->iovs_num, errp); +-} +- + /** + * nocomp_recv_setup: setup receive side + * +@@ -173,7 +157,6 @@ static MultiFDMethods multifd_nocomp_ops = { + .send_setup = nocomp_send_setup, + .send_cleanup = nocomp_send_cleanup, + .send_prepare = nocomp_send_prepare, +- .send_write = nocomp_send_write, + .recv_setup = nocomp_recv_setup, + .recv_cleanup = nocomp_recv_cleanup, + .recv_pages = nocomp_recv_pages +@@ -687,7 +670,8 @@ static void *multifd_send_thread(void *opaque) + } + + if (used) { +- ret = multifd_send_state->ops->send_write(p, used, &local_err); ++ ret = qio_channel_writev_all(p->c, p->iov, p->iovs_num, ++ &local_err); + if (ret != 0) { + break; + } +diff --git a/migration/multifd.h b/migration/multifd.h +index c3f18af364..7496f951a7 100644 +--- a/migration/multifd.h ++++ b/migration/multifd.h +@@ -164,8 +164,6 @@ typedef struct { + void (*send_cleanup)(MultiFDSendParams *p, Error **errp); + /* Prepare the send packet */ + int (*send_prepare)(MultiFDSendParams *p, Error **errp); +- /* Write the send packet */ +- int (*send_write)(MultiFDSendParams *p, uint32_t used, Error **errp); + /* Setup for receiving side */ + int (*recv_setup)(MultiFDRecvParams *p, Error **errp); + /* Cleanup for receiving side */ +-- +2.35.3 + diff --git a/SOURCES/kvm-multifd-Rename-used-field-to-num.patch b/SOURCES/kvm-multifd-Rename-used-field-to-num.patch new file mode 100644 index 0000000..24bdd8c --- /dev/null +++ b/SOURCES/kvm-multifd-Rename-used-field-to-num.patch @@ -0,0 +1,177 @@ +From 003ef20d11b33a7139fae6fbcf170188a07afc43 Mon Sep 17 00:00:00 2001 +From: Juan Quintela +Date: Wed, 18 May 2022 02:52:22 -0300 +Subject: [PATCH 02/37] multifd: Rename used field to num +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Leonardo Brás +RH-MergeRequest: 191: MSG_ZEROCOPY + Multifd @ rhel8.7 +RH-Commit: [2/26] 952283197ef89be4d61c7690bb6c3194e5c67217 +RH-Bugzilla: 2072049 +RH-Acked-by: Peter Xu +RH-Acked-by: Daniel P. Berrangé +RH-Acked-by: Dr. David Alan Gilbert + +We will need to split it later in zero_num (number of zero pages) and +normal_num (number of normal pages). This name is better. + +Signed-off-by: Juan Quintela +Reviewed-by: Dr. David Alan Gilbert +(cherry picked from commit 90a3d2f9d5f729147b2827c177932603ae6e2d55) +Signed-off-by: Leonardo Bras +--- + migration/multifd.c | 38 +++++++++++++++++++------------------- + migration/multifd.h | 2 +- + 2 files changed, 20 insertions(+), 20 deletions(-) + +diff --git a/migration/multifd.c b/migration/multifd.c +index 8125d0015c..8ea86d81dc 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -252,7 +252,7 @@ static MultiFDPages_t *multifd_pages_init(size_t size) + + static void multifd_pages_clear(MultiFDPages_t *pages) + { +- pages->used = 0; ++ pages->num = 0; + pages->allocated = 0; + pages->packet_num = 0; + pages->block = NULL; +@@ -270,7 +270,7 @@ static void multifd_send_fill_packet(MultiFDSendParams *p) + + packet->flags = cpu_to_be32(p->flags); + packet->pages_alloc = cpu_to_be32(p->pages->allocated); +- packet->pages_used = cpu_to_be32(p->pages->used); ++ packet->pages_used = cpu_to_be32(p->pages->num); + packet->next_packet_size = cpu_to_be32(p->next_packet_size); + packet->packet_num = cpu_to_be64(p->packet_num); + +@@ -278,7 +278,7 @@ static void multifd_send_fill_packet(MultiFDSendParams *p) + strncpy(packet->ramblock, p->pages->block->idstr, 256); + } + +- for (i = 0; i < p->pages->used; i++) { ++ for (i = 0; i < p->pages->num; i++) { + /* there are architectures where ram_addr_t is 32 bit */ + uint64_t temp = p->pages->offset[i]; + +@@ -332,18 +332,18 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) + p->pages = multifd_pages_init(packet->pages_alloc); + } + +- p->pages->used = be32_to_cpu(packet->pages_used); +- if (p->pages->used > packet->pages_alloc) { ++ p->pages->num = be32_to_cpu(packet->pages_used); ++ if (p->pages->num > packet->pages_alloc) { + error_setg(errp, "multifd: received packet " + "with %d pages and expected maximum pages are %d", +- p->pages->used, packet->pages_alloc) ; ++ p->pages->num, packet->pages_alloc) ; + return -1; + } + + p->next_packet_size = be32_to_cpu(packet->next_packet_size); + p->packet_num = be64_to_cpu(packet->packet_num); + +- if (p->pages->used == 0) { ++ if (p->pages->num == 0) { + return 0; + } + +@@ -356,7 +356,7 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) + return -1; + } + +- for (i = 0; i < p->pages->used; i++) { ++ for (i = 0; i < p->pages->num; i++) { + uint64_t offset = be64_to_cpu(packet->offset[i]); + + if (offset > (block->used_length - page_size)) { +@@ -443,13 +443,13 @@ static int multifd_send_pages(QEMUFile *f) + } + qemu_mutex_unlock(&p->mutex); + } +- assert(!p->pages->used); ++ assert(!p->pages->num); + assert(!p->pages->block); + + p->packet_num = multifd_send_state->packet_num++; + multifd_send_state->pages = p->pages; + p->pages = pages; +- transferred = ((uint64_t) pages->used) * qemu_target_page_size() ++ transferred = ((uint64_t) pages->num) * qemu_target_page_size() + + p->packet_len; + qemu_file_update_transfer(f, transferred); + ram_counters.multifd_bytes += transferred; +@@ -469,12 +469,12 @@ int multifd_queue_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset) + } + + if (pages->block == block) { +- pages->offset[pages->used] = offset; +- pages->iov[pages->used].iov_base = block->host + offset; +- pages->iov[pages->used].iov_len = qemu_target_page_size(); +- pages->used++; ++ pages->offset[pages->num] = offset; ++ pages->iov[pages->num].iov_base = block->host + offset; ++ pages->iov[pages->num].iov_len = qemu_target_page_size(); ++ pages->num++; + +- if (pages->used < pages->allocated) { ++ if (pages->num < pages->allocated) { + return 1; + } + } +@@ -586,7 +586,7 @@ void multifd_send_sync_main(QEMUFile *f) + if (!migrate_use_multifd()) { + return; + } +- if (multifd_send_state->pages->used) { ++ if (multifd_send_state->pages->num) { + if (multifd_send_pages(f) < 0) { + error_report("%s: multifd_send_pages fail", __func__); + return; +@@ -649,7 +649,7 @@ static void *multifd_send_thread(void *opaque) + qemu_mutex_lock(&p->mutex); + + if (p->pending_job) { +- uint32_t used = p->pages->used; ++ uint32_t used = p->pages->num; + uint64_t packet_num = p->packet_num; + flags = p->flags; + +@@ -665,7 +665,7 @@ static void *multifd_send_thread(void *opaque) + p->flags = 0; + p->num_packets++; + p->num_pages += used; +- p->pages->used = 0; ++ p->pages->num = 0; + p->pages->block = NULL; + qemu_mutex_unlock(&p->mutex); + +@@ -1091,7 +1091,7 @@ static void *multifd_recv_thread(void *opaque) + break; + } + +- used = p->pages->used; ++ used = p->pages->num; + flags = p->flags; + /* recv methods don't know how to handle the SYNC flag */ + p->flags &= ~MULTIFD_FLAG_SYNC; +diff --git a/migration/multifd.h b/migration/multifd.h +index 15c50ca0b2..86820dd028 100644 +--- a/migration/multifd.h ++++ b/migration/multifd.h +@@ -55,7 +55,7 @@ typedef struct { + + typedef struct { + /* number of used pages */ +- uint32_t used; ++ uint32_t num; + /* number of allocated pages */ + uint32_t allocated; + /* global number of generated multifd packets */ +-- +2.35.3 + diff --git a/SOURCES/kvm-multifd-Send-header-packet-without-flags-if-zero-cop.patch b/SOURCES/kvm-multifd-Send-header-packet-without-flags-if-zero-cop.patch new file mode 100644 index 0000000..d54cce8 --- /dev/null +++ b/SOURCES/kvm-multifd-Send-header-packet-without-flags-if-zero-cop.patch @@ -0,0 +1,102 @@ +From 33a38fef5e889b45571228bde519746fd90d8877 Mon Sep 17 00:00:00 2001 +From: Leonardo Bras +Date: Wed, 18 May 2022 02:52:25 -0300 +Subject: [PATCH 22/37] multifd: Send header packet without flags if + zero-copy-send is enabled +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Leonardo Brás +RH-MergeRequest: 191: MSG_ZEROCOPY + Multifd @ rhel8.7 +RH-Commit: [22/26] 9abfee42b72f11911cf128519826d09cbd2f5bc3 +RH-Bugzilla: 2072049 +RH-Acked-by: Peter Xu +RH-Acked-by: Daniel P. Berrangé +RH-Acked-by: Dr. David Alan Gilbert + +Since d48c3a0445 ("multifd: Use a single writev on the send side"), +sending the header packet and the memory pages happens in the same +writev, which can potentially make the migration faster. + +Using channel-socket as example, this works well with the default copying +mechanism of sendmsg(), but with zero-copy-send=true, it will cause +the migration to often break. + +This happens because the header packet buffer gets reused quite often, +and there is a high chance that by the time the MSG_ZEROCOPY mechanism get +to send the buffer, it has already changed, sending the wrong data and +causing the migration to abort. + +It means that, as it is, the buffer for the header packet is not suitable +for sending with MSG_ZEROCOPY. + +In order to enable zero copy for multifd, send the header packet on an +individual write(), without any flags, and the remanining pages with a +writev(), as it was happening before. This only changes how a migration +with zero-copy-send=true works, not changing any current behavior for +migrations with zero-copy-send=false. + +Signed-off-by: Leonardo Bras +Reviewed-by: Peter Xu +Reviewed-by: Daniel P. Berrangé +Message-Id: <20220513062836.965425-8-leobras@redhat.com> +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit b7dbdd8e76cd03453c234dbb9578d20969859d74) +Signed-off-by: Leonardo Bras +--- + migration/multifd.c | 22 +++++++++++++++++++--- + 1 file changed, 19 insertions(+), 3 deletions(-) + +diff --git a/migration/multifd.c b/migration/multifd.c +index 1e34e01ebc..193f70cdba 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -624,6 +624,7 @@ static void *multifd_send_thread(void *opaque) + MultiFDSendParams *p = opaque; + Error *local_err = NULL; + int ret = 0; ++ bool use_zero_copy_send = migrate_use_zero_copy_send(); + + trace_multifd_send_thread_start(p->id); + rcu_register_thread(); +@@ -646,9 +647,14 @@ static void *multifd_send_thread(void *opaque) + if (p->pending_job) { + uint64_t packet_num = p->packet_num; + uint32_t flags = p->flags; +- p->iovs_num = 1; + p->normal_num = 0; + ++ if (use_zero_copy_send) { ++ p->iovs_num = 0; ++ } else { ++ p->iovs_num = 1; ++ } ++ + for (int i = 0; i < p->pages->num; i++) { + p->normal[p->normal_num] = p->pages->offset[i]; + p->normal_num++; +@@ -672,8 +678,18 @@ static void *multifd_send_thread(void *opaque) + trace_multifd_send(p->id, packet_num, p->normal_num, flags, + p->next_packet_size); + +- p->iov[0].iov_len = p->packet_len; +- p->iov[0].iov_base = p->packet; ++ if (use_zero_copy_send) { ++ /* Send header first, without zerocopy */ ++ ret = qio_channel_write_all(p->c, (void *)p->packet, ++ p->packet_len, &local_err); ++ if (ret != 0) { ++ break; ++ } ++ } else { ++ /* Send header using the same writev call */ ++ p->iov[0].iov_len = p->packet_len; ++ p->iov[0].iov_base = p->packet; ++ } + + ret = qio_channel_writev_all(p->c, p->iov, p->iovs_num, + &local_err); +-- +2.35.3 + diff --git a/SOURCES/kvm-multifd-The-variable-is-only-used-inside-the-loop.patch b/SOURCES/kvm-multifd-The-variable-is-only-used-inside-the-loop.patch new file mode 100644 index 0000000..ef5e6d2 --- /dev/null +++ b/SOURCES/kvm-multifd-The-variable-is-only-used-inside-the-loop.patch @@ -0,0 +1,48 @@ +From 56cd14fc23c58707b9184da11f36d777bba6ce78 Mon Sep 17 00:00:00 2001 +From: Juan Quintela +Date: Wed, 18 May 2022 02:52:23 -0300 +Subject: [PATCH 04/37] multifd: The variable is only used inside the loop +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Leonardo Brás +RH-MergeRequest: 191: MSG_ZEROCOPY + Multifd @ rhel8.7 +RH-Commit: [4/26] 45d8bbde75ebbef6329c41ddb56db4526739f94f +RH-Bugzilla: 2072049 +RH-Acked-by: Peter Xu +RH-Acked-by: Daniel P. Berrangé +RH-Acked-by: Dr. David Alan Gilbert + +Signed-off-by: Juan Quintela +Reviewed-by: Dr. David Alan Gilbert +(cherry picked from commit 1943c11a62bd0741e5d9fbba78404fe47ebea820) +Signed-off-by: Leonardo Bras +--- + migration/multifd.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/migration/multifd.c b/migration/multifd.c +index cdeffdc4c5..ce7101cf9d 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -629,7 +629,6 @@ static void *multifd_send_thread(void *opaque) + MultiFDSendParams *p = opaque; + Error *local_err = NULL; + int ret = 0; +- uint32_t flags = 0; + + trace_multifd_send_thread_start(p->id); + rcu_register_thread(); +@@ -652,7 +651,7 @@ static void *multifd_send_thread(void *opaque) + if (p->pending_job) { + uint32_t used = p->pages->num; + uint64_t packet_num = p->packet_num; +- flags = p->flags; ++ uint32_t flags = p->flags; + + if (used) { + ret = multifd_send_state->ops->send_prepare(p, used, +-- +2.35.3 + diff --git a/SOURCES/kvm-multifd-Use-a-single-writev-on-the-send-side.patch b/SOURCES/kvm-multifd-Use-a-single-writev-on-the-send-side.patch new file mode 100644 index 0000000..b4f3036 --- /dev/null +++ b/SOURCES/kvm-multifd-Use-a-single-writev-on-the-send-side.patch @@ -0,0 +1,80 @@ +From 4051de396e02ea2c1911c842426318bcd97f93c7 Mon Sep 17 00:00:00 2001 +From: Juan Quintela +Date: Wed, 18 May 2022 02:52:24 -0300 +Subject: [PATCH 15/37] multifd: Use a single writev on the send side +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Leonardo Brás +RH-MergeRequest: 191: MSG_ZEROCOPY + Multifd @ rhel8.7 +RH-Commit: [15/26] c37063c813fc0ba695072117f272360e5c413803 +RH-Bugzilla: 2072049 +RH-Acked-by: Peter Xu +RH-Acked-by: Daniel P. Berrangé +RH-Acked-by: Dr. David Alan Gilbert + +Until now, we wrote the packet header with write(), and the rest of the +pages with writev(). Just increase the size of the iovec and do a +single writev(). + +Signed-off-by: Juan Quintela +Reviewed-by: Dr. David Alan Gilbert +(cherry picked from commit d48c3a044537689866fe44e65d24c7d39a68868a) +Signed-off-by: Leonardo Bras +--- + migration/multifd.c | 20 ++++++++------------ + 1 file changed, 8 insertions(+), 12 deletions(-) + +diff --git a/migration/multifd.c b/migration/multifd.c +index 1e1551d78b..d0f86542b1 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -643,7 +643,7 @@ static void *multifd_send_thread(void *opaque) + uint32_t used = p->pages->num; + uint64_t packet_num = p->packet_num; + uint32_t flags = p->flags; +- p->iovs_num = 0; ++ p->iovs_num = 1; + + if (used) { + ret = multifd_send_state->ops->send_prepare(p, &local_err); +@@ -663,20 +663,15 @@ static void *multifd_send_thread(void *opaque) + trace_multifd_send(p->id, packet_num, used, flags, + p->next_packet_size); + +- ret = qio_channel_write_all(p->c, (void *)p->packet, +- p->packet_len, &local_err); ++ p->iov[0].iov_len = p->packet_len; ++ p->iov[0].iov_base = p->packet; ++ ++ ret = qio_channel_writev_all(p->c, p->iov, p->iovs_num, ++ &local_err); + if (ret != 0) { + break; + } + +- if (used) { +- ret = qio_channel_writev_all(p->c, p->iov, p->iovs_num, +- &local_err); +- if (ret != 0) { +- break; +- } +- } +- + qemu_mutex_lock(&p->mutex); + p->pending_job--; + qemu_mutex_unlock(&p->mutex); +@@ -913,7 +908,8 @@ int multifd_save_setup(Error **errp) + p->packet->version = cpu_to_be32(MULTIFD_VERSION); + p->name = g_strdup_printf("multifdsend_%d", i); + p->tls_hostname = g_strdup(s->hostname); +- p->iov = g_new0(struct iovec, page_count); ++ /* We need one extra place for the packet header */ ++ p->iov = g_new0(struct iovec, page_count + 1); + socket_send_channel_create(multifd_new_send_channel_async, p); + } + +-- +2.35.3 + diff --git a/SOURCES/kvm-multifd-Use-normal-pages-array-on-the-send-side.patch b/SOURCES/kvm-multifd-Use-normal-pages-array-on-the-send-side.patch new file mode 100644 index 0000000..032dac2 --- /dev/null +++ b/SOURCES/kvm-multifd-Use-normal-pages-array-on-the-send-side.patch @@ -0,0 +1,261 @@ +From 3b57c876e1eaca34fb5bd9067553de945013d4be Mon Sep 17 00:00:00 2001 +From: Juan Quintela +Date: Wed, 18 May 2022 02:52:24 -0300 +Subject: [PATCH 16/37] multifd: Use normal pages array on the send side +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Leonardo Brás +RH-MergeRequest: 191: MSG_ZEROCOPY + Multifd @ rhel8.7 +RH-Commit: [16/26] 1c48806474daf48fe93920ac361311af95c6a6f3 +RH-Bugzilla: 2072049 +RH-Acked-by: Peter Xu +RH-Acked-by: Daniel P. Berrangé +RH-Acked-by: Dr. David Alan Gilbert + +We are only sending normal pages through multifd channels. +Later on this series, we are going to also send zero pages. +We are going to detect if a page is zero or non zero in the multifd +channel thread, not on the main thread. + +So we receive an array of pages page->offset[N] + +And we will end with: + +p->normal[N - zero_pages] +p->zero[zero_pages]. + +In this patch, we just copy all the pages in offset to normal. + +for (i = 0; i < pages->num; i++) { + p->narmal[p->normal_num] = pages->offset[i]; + p->normal_num++: +} + +Later in the series this becomes: + +for (i = 0; i < pages->num; i++) { + if (buffer_is_zero(page->offset[i])) { + p->zerol[p->zero_num] = pages->offset[i]; + p->zero_num++: + } else { + p->narmal[p->normal_num] = pages->offset[i]; + p->normal_num++: + } +} + +Signed-off-by: Juan Quintela +Reviewed-by: Dr. David Alan Gilbert + +--- + +Improving comment (dave) +Renaming num_normal_pages to total_normal_pages (peter) + +(cherry picked from commit 815956f03902980c771da64b17f7f791c1cb57b0) +Signed-off-by: Leonardo Bras +--- + migration/multifd-zlib.c | 6 +++--- + migration/multifd-zstd.c | 6 +++--- + migration/multifd.c | 30 +++++++++++++++++++----------- + migration/multifd.h | 8 ++++++-- + migration/trace-events | 4 ++-- + 5 files changed, 33 insertions(+), 21 deletions(-) + +diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c +index 8ed29b9633..8508f26adf 100644 +--- a/migration/multifd-zlib.c ++++ b/migration/multifd-zlib.c +@@ -108,16 +108,16 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp) + int ret; + uint32_t i; + +- for (i = 0; i < p->pages->num; i++) { ++ for (i = 0; i < p->normal_num; i++) { + uint32_t available = z->zbuff_len - out_size; + int flush = Z_NO_FLUSH; + +- if (i == p->pages->num - 1) { ++ if (i == p->normal_num - 1) { + flush = Z_SYNC_FLUSH; + } + + zs->avail_in = page_size; +- zs->next_in = p->pages->block->host + p->pages->offset[i]; ++ zs->next_in = p->pages->block->host + p->normal[i]; + + zs->avail_out = available; + zs->next_out = z->zbuff + out_size; +diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c +index 25e1f517b5..693af3a140 100644 +--- a/migration/multifd-zstd.c ++++ b/migration/multifd-zstd.c +@@ -123,13 +123,13 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) + z->out.size = z->zbuff_len; + z->out.pos = 0; + +- for (i = 0; i < p->pages->num; i++) { ++ for (i = 0; i < p->normal_num; i++) { + ZSTD_EndDirective flush = ZSTD_e_continue; + +- if (i == p->pages->num - 1) { ++ if (i == p->normal_num - 1) { + flush = ZSTD_e_flush; + } +- z->in.src = p->pages->block->host + p->pages->offset[i]; ++ z->in.src = p->pages->block->host + p->normal[i]; + z->in.size = page_size; + z->in.pos = 0; + +diff --git a/migration/multifd.c b/migration/multifd.c +index d0f86542b1..3725226400 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -89,13 +89,13 @@ static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp) + MultiFDPages_t *pages = p->pages; + size_t page_size = qemu_target_page_size(); + +- for (int i = 0; i < p->pages->num; i++) { +- p->iov[p->iovs_num].iov_base = pages->block->host + pages->offset[i]; ++ for (int i = 0; i < p->normal_num; i++) { ++ p->iov[p->iovs_num].iov_base = pages->block->host + p->normal[i]; + p->iov[p->iovs_num].iov_len = page_size; + p->iovs_num++; + } + +- p->next_packet_size = p->pages->num * page_size; ++ p->next_packet_size = p->normal_num * page_size; + p->flags |= MULTIFD_FLAG_NOCOMP; + return 0; + } +@@ -262,7 +262,7 @@ static void multifd_send_fill_packet(MultiFDSendParams *p) + + packet->flags = cpu_to_be32(p->flags); + packet->pages_alloc = cpu_to_be32(p->pages->allocated); +- packet->pages_used = cpu_to_be32(p->pages->num); ++ packet->pages_used = cpu_to_be32(p->normal_num); + packet->next_packet_size = cpu_to_be32(p->next_packet_size); + packet->packet_num = cpu_to_be64(p->packet_num); + +@@ -270,9 +270,9 @@ static void multifd_send_fill_packet(MultiFDSendParams *p) + strncpy(packet->ramblock, p->pages->block->idstr, 256); + } + +- for (i = 0; i < p->pages->num; i++) { ++ for (i = 0; i < p->normal_num; i++) { + /* there are architectures where ram_addr_t is 32 bit */ +- uint64_t temp = p->pages->offset[i]; ++ uint64_t temp = p->normal[i]; + + packet->offset[i] = cpu_to_be64(temp); + } +@@ -556,6 +556,8 @@ void multifd_save_cleanup(void) + p->packet = NULL; + g_free(p->iov); + p->iov = NULL; ++ g_free(p->normal); ++ p->normal = NULL; + multifd_send_state->ops->send_cleanup(p, &local_err); + if (local_err) { + migrate_set_error(migrate_get_current(), local_err); +@@ -640,12 +642,17 @@ static void *multifd_send_thread(void *opaque) + qemu_mutex_lock(&p->mutex); + + if (p->pending_job) { +- uint32_t used = p->pages->num; + uint64_t packet_num = p->packet_num; + uint32_t flags = p->flags; + p->iovs_num = 1; ++ p->normal_num = 0; ++ ++ for (int i = 0; i < p->pages->num; i++) { ++ p->normal[p->normal_num] = p->pages->offset[i]; ++ p->normal_num++; ++ } + +- if (used) { ++ if (p->normal_num) { + ret = multifd_send_state->ops->send_prepare(p, &local_err); + if (ret != 0) { + qemu_mutex_unlock(&p->mutex); +@@ -655,12 +662,12 @@ static void *multifd_send_thread(void *opaque) + multifd_send_fill_packet(p); + p->flags = 0; + p->num_packets++; +- p->num_pages += used; ++ p->total_normal_pages += p->normal_num; + p->pages->num = 0; + p->pages->block = NULL; + qemu_mutex_unlock(&p->mutex); + +- trace_multifd_send(p->id, packet_num, used, flags, ++ trace_multifd_send(p->id, packet_num, p->normal_num, flags, + p->next_packet_size); + + p->iov[0].iov_len = p->packet_len; +@@ -710,7 +717,7 @@ out: + qemu_mutex_unlock(&p->mutex); + + rcu_unregister_thread(); +- trace_multifd_send_thread_end(p->id, p->num_packets, p->num_pages); ++ trace_multifd_send_thread_end(p->id, p->num_packets, p->total_normal_pages); + + return NULL; + } +@@ -910,6 +917,7 @@ int multifd_save_setup(Error **errp) + p->tls_hostname = g_strdup(s->hostname); + /* We need one extra place for the packet header */ + p->iov = g_new0(struct iovec, page_count + 1); ++ p->normal = g_new0(ram_addr_t, page_count); + socket_send_channel_create(multifd_new_send_channel_async, p); + } + +diff --git a/migration/multifd.h b/migration/multifd.h +index 7496f951a7..7823199dbe 100644 +--- a/migration/multifd.h ++++ b/migration/multifd.h +@@ -104,14 +104,18 @@ typedef struct { + /* thread local variables */ + /* packets sent through this channel */ + uint64_t num_packets; +- /* pages sent through this channel */ +- uint64_t num_pages; ++ /* non zero pages sent through this channel */ ++ uint64_t total_normal_pages; + /* syncs main thread and channels */ + QemuSemaphore sem_sync; + /* buffers to send */ + struct iovec *iov; + /* number of iovs used */ + uint32_t iovs_num; ++ /* Pages that are not zero */ ++ ram_addr_t *normal; ++ /* num of non zero pages */ ++ uint32_t normal_num; + /* used for compression methods */ + void *data; + } MultiFDSendParams; +diff --git a/migration/trace-events b/migration/trace-events +index 5172cb3b3d..171a83a55d 100644 +--- a/migration/trace-events ++++ b/migration/trace-events +@@ -124,13 +124,13 @@ multifd_recv_sync_main_wait(uint8_t id) "channel %u" + multifd_recv_terminate_threads(bool error) "error %d" + multifd_recv_thread_end(uint8_t id, uint64_t packets, uint64_t pages) "channel %u packets %" PRIu64 " pages %" PRIu64 + multifd_recv_thread_start(uint8_t id) "%u" +-multifd_send(uint8_t id, uint64_t packet_num, uint32_t used, uint32_t flags, uint32_t next_packet_size) "channel %u packet_num %" PRIu64 " pages %u flags 0x%x next packet size %u" ++multifd_send(uint8_t id, uint64_t packet_num, uint32_t normal, uint32_t flags, uint32_t next_packet_size) "channel %u packet_num %" PRIu64 " normal pages %u flags 0x%x next packet size %u" + multifd_send_error(uint8_t id) "channel %u" + multifd_send_sync_main(long packet_num) "packet num %ld" + multifd_send_sync_main_signal(uint8_t id) "channel %u" + multifd_send_sync_main_wait(uint8_t id) "channel %u" + multifd_send_terminate_threads(bool error) "error %d" +-multifd_send_thread_end(uint8_t id, uint64_t packets, uint64_t pages) "channel %u packets %" PRIu64 " pages %" PRIu64 ++multifd_send_thread_end(uint8_t id, uint64_t packets, uint64_t normal_pages) "channel %u packets %" PRIu64 " normal pages %" PRIu64 + multifd_send_thread_start(uint8_t id) "%u" + multifd_tls_outgoing_handshake_start(void *ioc, void *tioc, const char *hostname) "ioc=%p tioc=%p hostname=%s" + multifd_tls_outgoing_handshake_error(void *ioc, const char *err) "ioc=%p err=%s" +-- +2.35.3 + diff --git a/SOURCES/kvm-multifd-multifd_send_sync_main-now-returns-negative-.patch b/SOURCES/kvm-multifd-multifd_send_sync_main-now-returns-negative-.patch new file mode 100644 index 0000000..7912266 --- /dev/null +++ b/SOURCES/kvm-multifd-multifd_send_sync_main-now-returns-negative-.patch @@ -0,0 +1,163 @@ +From fce933410a5068220a5f29011a6d1a647e357a62 Mon Sep 17 00:00:00 2001 +From: Leonardo Bras +Date: Wed, 18 May 2022 02:52:25 -0300 +Subject: [PATCH 21/37] multifd: multifd_send_sync_main now returns negative on + error +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Leonardo Brás +RH-MergeRequest: 191: MSG_ZEROCOPY + Multifd @ rhel8.7 +RH-Commit: [21/26] b4e4f3663576aa87f3b2f66f1d38bad4f50bd4ac +RH-Bugzilla: 2072049 +RH-Acked-by: Peter Xu +RH-Acked-by: Daniel P. Berrangé +RH-Acked-by: Dr. David Alan Gilbert + +Even though multifd_send_sync_main() currently emits error_reports, it's +callers don't really check it before continuing. + +Change multifd_send_sync_main() to return -1 on error and 0 on success. +Also change all it's callers to make use of this change and possibly fail +earlier. + +(This change is important to next patch on multifd zero copy +implementation, to make it sure an error in zero-copy flush does not go +unnoticed. + +Signed-off-by: Leonardo Bras +Reviewed-by: Daniel P. Berrangé +Reviewed-by: Peter Xu +Message-Id: <20220513062836.965425-7-leobras@redhat.com> +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 33d70973a3a6e8c6b62bcbc64d9e488961981007) +Signed-off-by: Leonardo Bras +--- + migration/multifd.c | 10 ++++++---- + migration/multifd.h | 2 +- + migration/ram.c | 29 ++++++++++++++++++++++------- + 3 files changed, 29 insertions(+), 12 deletions(-) + +diff --git a/migration/multifd.c b/migration/multifd.c +index e53811f04a..1e34e01ebc 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -573,17 +573,17 @@ void multifd_save_cleanup(void) + multifd_send_state = NULL; + } + +-void multifd_send_sync_main(QEMUFile *f) ++int multifd_send_sync_main(QEMUFile *f) + { + int i; + + if (!migrate_use_multifd()) { +- return; ++ return 0; + } + if (multifd_send_state->pages->num) { + if (multifd_send_pages(f) < 0) { + error_report("%s: multifd_send_pages fail", __func__); +- return; ++ return -1; + } + } + for (i = 0; i < migrate_multifd_channels(); i++) { +@@ -596,7 +596,7 @@ void multifd_send_sync_main(QEMUFile *f) + if (p->quit) { + error_report("%s: channel %d has already quit", __func__, i); + qemu_mutex_unlock(&p->mutex); +- return; ++ return -1; + } + + p->packet_num = multifd_send_state->packet_num++; +@@ -615,6 +615,8 @@ void multifd_send_sync_main(QEMUFile *f) + qemu_sem_wait(&p->sem_sync); + } + trace_multifd_send_sync_main(multifd_send_state->packet_num); ++ ++ return 0; + } + + static void *multifd_send_thread(void *opaque) +diff --git a/migration/multifd.h b/migration/multifd.h +index 7823199dbe..92de878155 100644 +--- a/migration/multifd.h ++++ b/migration/multifd.h +@@ -22,7 +22,7 @@ int multifd_load_cleanup(Error **errp); + bool multifd_recv_all_channels_created(void); + bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp); + void multifd_recv_sync_main(void); +-void multifd_send_sync_main(QEMUFile *f); ++int multifd_send_sync_main(QEMUFile *f); + int multifd_queue_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset); + + /* Multifd Compression flags */ +diff --git a/migration/ram.c b/migration/ram.c +index 863035d235..3e208efca7 100644 +--- a/migration/ram.c ++++ b/migration/ram.c +@@ -2992,6 +2992,7 @@ static int ram_save_setup(QEMUFile *f, void *opaque) + { + RAMState **rsp = opaque; + RAMBlock *block; ++ int ret; + + if (compress_threads_save_setup()) { + return -1; +@@ -3026,7 +3027,11 @@ static int ram_save_setup(QEMUFile *f, void *opaque) + ram_control_before_iterate(f, RAM_CONTROL_SETUP); + ram_control_after_iterate(f, RAM_CONTROL_SETUP); + +- multifd_send_sync_main(f); ++ ret = multifd_send_sync_main(f); ++ if (ret < 0) { ++ return ret; ++ } ++ + qemu_put_be64(f, RAM_SAVE_FLAG_EOS); + qemu_fflush(f); + +@@ -3135,7 +3140,11 @@ static int ram_save_iterate(QEMUFile *f, void *opaque) + out: + if (ret >= 0 + && migration_is_setup_or_active(migrate_get_current()->state)) { +- multifd_send_sync_main(rs->f); ++ ret = multifd_send_sync_main(rs->f); ++ if (ret < 0) { ++ return ret; ++ } ++ + qemu_put_be64(f, RAM_SAVE_FLAG_EOS); + qemu_fflush(f); + ram_counters.transferred += 8; +@@ -3193,13 +3202,19 @@ static int ram_save_complete(QEMUFile *f, void *opaque) + ram_control_after_iterate(f, RAM_CONTROL_FINISH); + } + +- if (ret >= 0) { +- multifd_send_sync_main(rs->f); +- qemu_put_be64(f, RAM_SAVE_FLAG_EOS); +- qemu_fflush(f); ++ if (ret < 0) { ++ return ret; + } + +- return ret; ++ ret = multifd_send_sync_main(rs->f); ++ if (ret < 0) { ++ return ret; ++ } ++ ++ qemu_put_be64(f, RAM_SAVE_FLAG_EOS); ++ qemu_fflush(f); ++ ++ return 0; + } + + static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size, +-- +2.35.3 + diff --git a/SOURCES/kvm-multifd-remove-used-parameter-from-send_prepare-meth.patch b/SOURCES/kvm-multifd-remove-used-parameter-from-send_prepare-meth.patch new file mode 100644 index 0000000..3f3b923 --- /dev/null +++ b/SOURCES/kvm-multifd-remove-used-parameter-from-send_prepare-meth.patch @@ -0,0 +1,135 @@ +From 5f53448092c944857a2b89138f22c5ab335d8250 Mon Sep 17 00:00:00 2001 +From: Juan Quintela +Date: Wed, 18 May 2022 02:52:23 -0300 +Subject: [PATCH 05/37] multifd: remove used parameter from send_prepare() + method +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Leonardo Brás +RH-MergeRequest: 191: MSG_ZEROCOPY + Multifd @ rhel8.7 +RH-Commit: [5/26] ad6360d19d65e8c332dcdc3d3234478639e03db8 +RH-Bugzilla: 2072049 +RH-Acked-by: Peter Xu +RH-Acked-by: Daniel P. Berrangé +RH-Acked-by: Dr. David Alan Gilbert + +It is already there as p->pages->num. + +Signed-off-by: Juan Quintela +Reviewed-by: Dr. David Alan Gilbert +(cherry picked from commit 02fb81043ecee338e4aeb8f5be09a46325dc5e43) +Signed-off-by: Leonardo Bras +--- + migration/multifd-zlib.c | 7 +++---- + migration/multifd-zstd.c | 7 +++---- + migration/multifd.c | 9 +++------ + migration/multifd.h | 2 +- + 4 files changed, 10 insertions(+), 15 deletions(-) + +diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c +index f403d2f031..0c70a2dc78 100644 +--- a/migration/multifd-zlib.c ++++ b/migration/multifd-zlib.c +@@ -96,10 +96,9 @@ static void zlib_send_cleanup(MultiFDSendParams *p, Error **errp) + * Returns 0 for success or -1 for error + * + * @p: Params for the channel that we are using +- * @used: number of pages used + * @errp: pointer to an error + */ +-static int zlib_send_prepare(MultiFDSendParams *p, uint32_t used, Error **errp) ++static int zlib_send_prepare(MultiFDSendParams *p, Error **errp) + { + struct iovec *iov = p->pages->iov; + struct zlib_data *z = p->data; +@@ -108,11 +107,11 @@ static int zlib_send_prepare(MultiFDSendParams *p, uint32_t used, Error **errp) + int ret; + uint32_t i; + +- for (i = 0; i < used; i++) { ++ for (i = 0; i < p->pages->num; i++) { + uint32_t available = z->zbuff_len - out_size; + int flush = Z_NO_FLUSH; + +- if (i == used - 1) { ++ if (i == p->pages->num - 1) { + flush = Z_SYNC_FLUSH; + } + +diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c +index 8d657f8860..466b370cad 100644 +--- a/migration/multifd-zstd.c ++++ b/migration/multifd-zstd.c +@@ -109,10 +109,9 @@ static void zstd_send_cleanup(MultiFDSendParams *p, Error **errp) + * Returns 0 for success or -1 for error + * + * @p: Params for the channel that we are using +- * @used: number of pages used + * @errp: pointer to an error + */ +-static int zstd_send_prepare(MultiFDSendParams *p, uint32_t used, Error **errp) ++static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) + { + struct iovec *iov = p->pages->iov; + struct zstd_data *z = p->data; +@@ -123,10 +122,10 @@ static int zstd_send_prepare(MultiFDSendParams *p, uint32_t used, Error **errp) + z->out.size = z->zbuff_len; + z->out.pos = 0; + +- for (i = 0; i < used; i++) { ++ for (i = 0; i < p->pages->num; i++) { + ZSTD_EndDirective flush = ZSTD_e_continue; + +- if (i == used - 1) { ++ if (i == p->pages->num - 1) { + flush = ZSTD_e_flush; + } + z->in.src = iov[i].iov_base; +diff --git a/migration/multifd.c b/migration/multifd.c +index ce7101cf9d..098ef8842c 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -82,13 +82,11 @@ static void nocomp_send_cleanup(MultiFDSendParams *p, Error **errp) + * Returns 0 for success or -1 for error + * + * @p: Params for the channel that we are using +- * @used: number of pages used + * @errp: pointer to an error + */ +-static int nocomp_send_prepare(MultiFDSendParams *p, uint32_t used, +- Error **errp) ++static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp) + { +- p->next_packet_size = used * qemu_target_page_size(); ++ p->next_packet_size = p->pages->num * qemu_target_page_size(); + p->flags |= MULTIFD_FLAG_NOCOMP; + return 0; + } +@@ -654,8 +652,7 @@ static void *multifd_send_thread(void *opaque) + uint32_t flags = p->flags; + + if (used) { +- ret = multifd_send_state->ops->send_prepare(p, used, +- &local_err); ++ ret = multifd_send_state->ops->send_prepare(p, &local_err); + if (ret != 0) { + qemu_mutex_unlock(&p->mutex); + break; +diff --git a/migration/multifd.h b/migration/multifd.h +index 86820dd028..7968cc5c20 100644 +--- a/migration/multifd.h ++++ b/migration/multifd.h +@@ -159,7 +159,7 @@ typedef struct { + /* Cleanup for sending side */ + void (*send_cleanup)(MultiFDSendParams *p, Error **errp); + /* Prepare the send packet */ +- int (*send_prepare)(MultiFDSendParams *p, uint32_t used, Error **errp); ++ int (*send_prepare)(MultiFDSendParams *p, Error **errp); + /* Write the send packet */ + int (*send_write)(MultiFDSendParams *p, uint32_t used, Error **errp); + /* Setup for receiving side */ +-- +2.35.3 + diff --git a/SOURCES/kvm-multifd-remove-used-parameter-from-send_recv_pages-m.patch b/SOURCES/kvm-multifd-remove-used-parameter-from-send_recv_pages-m.patch new file mode 100644 index 0000000..02c5918 --- /dev/null +++ b/SOURCES/kvm-multifd-remove-used-parameter-from-send_recv_pages-m.patch @@ -0,0 +1,149 @@ +From 8cdedf86dc193673ea24516e7b44f8b4da5dd713 Mon Sep 17 00:00:00 2001 +From: Juan Quintela +Date: Wed, 18 May 2022 02:52:23 -0300 +Subject: [PATCH 06/37] multifd: remove used parameter from send_recv_pages() + method +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Leonardo Brás +RH-MergeRequest: 191: MSG_ZEROCOPY + Multifd @ rhel8.7 +RH-Commit: [6/26] 5c1a506e4178501a0894ea4e7ac919e1d4d4cc32 +RH-Bugzilla: 2072049 +RH-Acked-by: Peter Xu +RH-Acked-by: Daniel P. Berrangé +RH-Acked-by: Dr. David Alan Gilbert + +It is already there as p->pages->num. + +Signed-off-by: Juan Quintela +Reviewed-by: Dr. David Alan Gilbert +(cherry picked from commit 40a4bfe9d3f8ad35a9c3ffb4cbf7367e2777054b) +Signed-off-by: Leonardo Bras +--- + migration/multifd-zlib.c | 9 ++++----- + migration/multifd-zstd.c | 7 +++---- + migration/multifd.c | 7 +++---- + migration/multifd.h | 2 +- + 4 files changed, 11 insertions(+), 14 deletions(-) + +diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c +index 0c70a2dc78..330fc021c5 100644 +--- a/migration/multifd-zlib.c ++++ b/migration/multifd-zlib.c +@@ -235,17 +235,16 @@ static void zlib_recv_cleanup(MultiFDRecvParams *p) + * Returns 0 for success or -1 for error + * + * @p: Params for the channel that we are using +- * @used: number of pages used + * @errp: pointer to an error + */ +-static int zlib_recv_pages(MultiFDRecvParams *p, uint32_t used, Error **errp) ++static int zlib_recv_pages(MultiFDRecvParams *p, Error **errp) + { + struct zlib_data *z = p->data; + z_stream *zs = &z->zs; + uint32_t in_size = p->next_packet_size; + /* we measure the change of total_out */ + uint32_t out_size = zs->total_out; +- uint32_t expected_size = used * qemu_target_page_size(); ++ uint32_t expected_size = p->pages->num * qemu_target_page_size(); + uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; + int ret; + int i; +@@ -264,12 +263,12 @@ static int zlib_recv_pages(MultiFDRecvParams *p, uint32_t used, Error **errp) + zs->avail_in = in_size; + zs->next_in = z->zbuff; + +- for (i = 0; i < used; i++) { ++ for (i = 0; i < p->pages->num; i++) { + struct iovec *iov = &p->pages->iov[i]; + int flush = Z_NO_FLUSH; + unsigned long start = zs->total_out; + +- if (i == used - 1) { ++ if (i == p->pages->num - 1) { + flush = Z_SYNC_FLUSH; + } + +diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c +index 466b370cad..f0d1105792 100644 +--- a/migration/multifd-zstd.c ++++ b/migration/multifd-zstd.c +@@ -255,14 +255,13 @@ static void zstd_recv_cleanup(MultiFDRecvParams *p) + * Returns 0 for success or -1 for error + * + * @p: Params for the channel that we are using +- * @used: number of pages used + * @errp: pointer to an error + */ +-static int zstd_recv_pages(MultiFDRecvParams *p, uint32_t used, Error **errp) ++static int zstd_recv_pages(MultiFDRecvParams *p, Error **errp) + { + uint32_t in_size = p->next_packet_size; + uint32_t out_size = 0; +- uint32_t expected_size = used * qemu_target_page_size(); ++ uint32_t expected_size = p->pages->num * qemu_target_page_size(); + uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; + struct zstd_data *z = p->data; + int ret; +@@ -283,7 +282,7 @@ static int zstd_recv_pages(MultiFDRecvParams *p, uint32_t used, Error **errp) + z->in.size = in_size; + z->in.pos = 0; + +- for (i = 0; i < used; i++) { ++ for (i = 0; i < p->pages->num; i++) { + struct iovec *iov = &p->pages->iov[i]; + + z->out.dst = iov->iov_base; +diff --git a/migration/multifd.c b/migration/multifd.c +index 098ef8842c..55d99a8232 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -141,10 +141,9 @@ static void nocomp_recv_cleanup(MultiFDRecvParams *p) + * Returns 0 for success or -1 for error + * + * @p: Params for the channel that we are using +- * @used: number of pages used + * @errp: pointer to an error + */ +-static int nocomp_recv_pages(MultiFDRecvParams *p, uint32_t used, Error **errp) ++static int nocomp_recv_pages(MultiFDRecvParams *p, Error **errp) + { + uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; + +@@ -153,7 +152,7 @@ static int nocomp_recv_pages(MultiFDRecvParams *p, uint32_t used, Error **errp) + p->id, flags, MULTIFD_FLAG_NOCOMP); + return -1; + } +- return qio_channel_readv_all(p->c, p->pages->iov, used, errp); ++ return qio_channel_readv_all(p->c, p->pages->iov, p->pages->num, errp); + } + + static MultiFDMethods multifd_nocomp_ops = { +@@ -1099,7 +1098,7 @@ static void *multifd_recv_thread(void *opaque) + qemu_mutex_unlock(&p->mutex); + + if (used) { +- ret = multifd_recv_state->ops->recv_pages(p, used, &local_err); ++ ret = multifd_recv_state->ops->recv_pages(p, &local_err); + if (ret != 0) { + break; + } +diff --git a/migration/multifd.h b/migration/multifd.h +index 7968cc5c20..e57adc783b 100644 +--- a/migration/multifd.h ++++ b/migration/multifd.h +@@ -167,7 +167,7 @@ typedef struct { + /* Cleanup for receiving side */ + void (*recv_cleanup)(MultiFDRecvParams *p); + /* Read all pages */ +- int (*recv_pages)(MultiFDRecvParams *p, uint32_t used, Error **errp); ++ int (*recv_pages)(MultiFDRecvParams *p, Error **errp); + } MultiFDMethods; + + void multifd_register_ops(int method, MultiFDMethods *ops); +-- +2.35.3 + diff --git a/SOURCES/kvm-pc-bios-s390-ccw-Fix-booting-with-logical-block-size.patch b/SOURCES/kvm-pc-bios-s390-ccw-Fix-booting-with-logical-block-size.patch new file mode 100644 index 0000000..83fe9af --- /dev/null +++ b/SOURCES/kvm-pc-bios-s390-ccw-Fix-booting-with-logical-block-size.patch @@ -0,0 +1,63 @@ +From 115507e5e8b97993b50ea7b39d6d4bb493973e46 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 5 Aug 2022 11:42:14 +0200 +Subject: [PATCH 9/9] pc-bios/s390-ccw: Fix booting with logical block size < + physical block size + +RH-Author: Thomas Huth +RH-MergeRequest: 207: pc-bios/s390-ccw: Fix booting with logical block size < physical block size +RH-Commit: [1/1] ab22832592e0a48277bf7aca1b941a1be79aeab6 +RH-Bugzilla: 2112296 +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand +RH-Acked-by: Claudio Imbrenda + +For accessing single blocks during boot, it's the logical block size that +matters. (Physical block sizes are rather interesting e.g. for creating +file systems with the correct alignment for speed reasons etc.). +So the s390-ccw bios has to use the logical block size for calculating +sector numbers during the boot phase, the "physical_block_exp" shift +value must not be taken into account. This change fixes the boot process +when the guest hast been installed on a disk where the logical block size +differs from the physical one, e.g. if the guest has been installed +like this: + + qemu-system-s390x -nographic -accel kvm -m 2G \ + -drive if=none,id=d1,file=fedora.iso,format=raw,media=cdrom \ + -device virtio-scsi -device scsi-cd,drive=d1 \ + -drive if=none,id=d2,file=test.qcow2,format=qcow2 + -device virtio-blk,drive=d2,physical_block_size=4096,logical_block_size=512 + +Linux correctly uses the logical block size of 512 for the installation, +but the s390-ccw bios tries to boot from a disk with 4096 block size so +far, as long as this patch has not been applied yet (well, it used to work +by accident in the past due to the virtio_assume_scsi() hack that used to +enforce 512 byte sectors on all virtio-block disks, but that hack has been +well removed in commit 5447de2619050a0a4d to fix other scenarios). + +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2112296 +Message-Id: <20220805094214.285223-1-thuth@redhat.com> +Reviewed-by: Cornelia Huck +Reviewed-by: Eric Farman +Signed-off-by: Thomas Huth +(cherry picked from commit 393296de19650e1400ca265914cfdeb313725363) +--- + pc-bios/s390-ccw/virtio-blkdev.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/pc-bios/s390-ccw/virtio-blkdev.c b/pc-bios/s390-ccw/virtio-blkdev.c +index 8271c47296..794f99b42c 100644 +--- a/pc-bios/s390-ccw/virtio-blkdev.c ++++ b/pc-bios/s390-ccw/virtio-blkdev.c +@@ -173,7 +173,7 @@ int virtio_get_block_size(void) + + switch (vdev->senseid.cu_model) { + case VIRTIO_ID_BLOCK: +- return vdev->config.blk.blk_size << vdev->config.blk.physical_block_exp; ++ return vdev->config.blk.blk_size; + case VIRTIO_ID_SCSI: + return vdev->scsi_block_size; + } +-- +2.31.1 + diff --git a/SOURCES/kvm-pc-bios-s390-ccw-Split-virtio-scsi-code-from-virtio_.patch b/SOURCES/kvm-pc-bios-s390-ccw-Split-virtio-scsi-code-from-virtio_.patch new file mode 100644 index 0000000..89d8a91 --- /dev/null +++ b/SOURCES/kvm-pc-bios-s390-ccw-Split-virtio-scsi-code-from-virtio_.patch @@ -0,0 +1,180 @@ +From 0e7b71a3f0b3a2e1dba54f02efc15b02f337e031 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 8 Jul 2022 12:29:50 +0200 +Subject: [PATCH 36/37] pc-bios/s390-ccw: Split virtio-scsi code from + virtio_blk_setup_device() + +RH-Author: Thomas Huth +RH-MergeRequest: 198: pc-bios/s390-ccw: Fix boot from disks with 4k sectors that do not have the typical DASD geometry +RH-Commit: [8/9] 8e24806a91c91b2e3603da88e5a22d96a91e8686 +RH-Bugzilla: 2098076 +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: David Hildenbrand +RH-Acked-by: Cornelia Huck + +Bugzilla: http://bugzilla.redhat.com/2098076 + +commit cf30b7c4a9b2c64518be8037c2e6670aacdb00b9 +Author: Thomas Huth +Date: Mon Jul 4 13:19:00 2022 +0200 + + pc-bios/s390-ccw: Split virtio-scsi code from virtio_blk_setup_device() + + The next patch is going to add more virtio-block specific code to + virtio_blk_setup_device(), and if the virtio-scsi code is also in + there, this is more cumbersome. And the calling function virtio_setup() + in main.c looks at the device type already anyway, so it's more + logical to separate the virtio-scsi stuff into a new function in + virtio-scsi.c instead. + + Message-Id: <20220704111903.62400-10-thuth@redhat.com> + Reviewed-by: Eric Farman + Signed-off-by: Thomas Huth + +Signed-off-by: Thomas Huth +--- + pc-bios/s390-ccw/main.c | 24 +++++++++++++++++------- + pc-bios/s390-ccw/virtio-blkdev.c | 20 ++------------------ + pc-bios/s390-ccw/virtio-scsi.c | 19 ++++++++++++++++++- + pc-bios/s390-ccw/virtio-scsi.h | 2 +- + 4 files changed, 38 insertions(+), 27 deletions(-) + +diff --git a/pc-bios/s390-ccw/main.c b/pc-bios/s390-ccw/main.c +index 5d2b7ba94d..13e1d8fdf7 100644 +--- a/pc-bios/s390-ccw/main.c ++++ b/pc-bios/s390-ccw/main.c +@@ -14,6 +14,7 @@ + #include "s390-ccw.h" + #include "cio.h" + #include "virtio.h" ++#include "virtio-scsi.h" + #include "dasd-ipl.h" + + char stack[PAGE_SIZE * 8] __attribute__((__aligned__(PAGE_SIZE))); +@@ -218,6 +219,7 @@ static int virtio_setup(void) + { + VDev *vdev = virtio_get_device(); + QemuIplParameters *early_qipl = (QemuIplParameters *)QIPL_ADDRESS; ++ int ret; + + memcpy(&qipl, early_qipl, sizeof(QemuIplParameters)); + +@@ -225,18 +227,26 @@ static int virtio_setup(void) + menu_setup(); + } + +- if (virtio_get_device_type() == VIRTIO_ID_NET) { ++ switch (vdev->senseid.cu_model) { ++ case VIRTIO_ID_NET: + sclp_print("Network boot device detected\n"); + vdev->netboot_start_addr = qipl.netboot_start_addr; +- } else { +- int ret = virtio_blk_setup_device(blk_schid); +- if (ret) { +- return ret; +- } ++ return 0; ++ case VIRTIO_ID_BLOCK: ++ ret = virtio_blk_setup_device(blk_schid); ++ break; ++ case VIRTIO_ID_SCSI: ++ ret = virtio_scsi_setup_device(blk_schid); ++ break; ++ default: ++ panic("\n! No IPL device available !\n"); ++ } ++ ++ if (!ret) { + IPL_assert(virtio_ipl_disk_is_valid(), "No valid IPL device detected"); + } + +- return 0; ++ return ret; + } + + static void ipl_boot_device(void) +diff --git a/pc-bios/s390-ccw/virtio-blkdev.c b/pc-bios/s390-ccw/virtio-blkdev.c +index db1f7f44aa..c175b66a47 100644 +--- a/pc-bios/s390-ccw/virtio-blkdev.c ++++ b/pc-bios/s390-ccw/virtio-blkdev.c +@@ -222,27 +222,11 @@ uint64_t virtio_get_blocks(void) + int virtio_blk_setup_device(SubChannelId schid) + { + VDev *vdev = virtio_get_device(); +- int ret = 0; + + vdev->schid = schid; + virtio_setup_ccw(vdev); + +- switch (vdev->senseid.cu_model) { +- case VIRTIO_ID_BLOCK: +- sclp_print("Using virtio-blk.\n"); +- break; +- case VIRTIO_ID_SCSI: +- IPL_assert(vdev->config.scsi.sense_size == VIRTIO_SCSI_SENSE_SIZE, +- "Config: sense size mismatch"); +- IPL_assert(vdev->config.scsi.cdb_size == VIRTIO_SCSI_CDB_SIZE, +- "Config: CDB size mismatch"); ++ sclp_print("Using virtio-blk.\n"); + +- sclp_print("Using virtio-scsi.\n"); +- ret = virtio_scsi_setup(vdev); +- break; +- default: +- panic("\n! No IPL device available !\n"); +- } +- +- return ret; ++ return 0; + } +diff --git a/pc-bios/s390-ccw/virtio-scsi.c b/pc-bios/s390-ccw/virtio-scsi.c +index 2c8d0f3097..3b7069270c 100644 +--- a/pc-bios/s390-ccw/virtio-scsi.c ++++ b/pc-bios/s390-ccw/virtio-scsi.c +@@ -329,7 +329,7 @@ static void scsi_parse_capacity_report(void *data, + } + } + +-int virtio_scsi_setup(VDev *vdev) ++static int virtio_scsi_setup(VDev *vdev) + { + int retry_test_unit_ready = 3; + uint8_t data[256]; +@@ -430,3 +430,20 @@ int virtio_scsi_setup(VDev *vdev) + + return 0; + } ++ ++int virtio_scsi_setup_device(SubChannelId schid) ++{ ++ VDev *vdev = virtio_get_device(); ++ ++ vdev->schid = schid; ++ virtio_setup_ccw(vdev); ++ ++ IPL_assert(vdev->config.scsi.sense_size == VIRTIO_SCSI_SENSE_SIZE, ++ "Config: sense size mismatch"); ++ IPL_assert(vdev->config.scsi.cdb_size == VIRTIO_SCSI_CDB_SIZE, ++ "Config: CDB size mismatch"); ++ ++ sclp_print("Using virtio-scsi.\n"); ++ ++ return virtio_scsi_setup(vdev); ++} +diff --git a/pc-bios/s390-ccw/virtio-scsi.h b/pc-bios/s390-ccw/virtio-scsi.h +index 4b14c2c2f9..e6b6cd4815 100644 +--- a/pc-bios/s390-ccw/virtio-scsi.h ++++ b/pc-bios/s390-ccw/virtio-scsi.h +@@ -67,8 +67,8 @@ static inline bool virtio_scsi_response_ok(const VirtioScsiCmdResp *r) + return r->response == VIRTIO_SCSI_S_OK && r->status == CDB_STATUS_GOOD; + } + +-int virtio_scsi_setup(VDev *vdev); + int virtio_scsi_read_many(VDev *vdev, + ulong sector, void *load_addr, int sec_num); ++int virtio_scsi_setup_device(SubChannelId schid); + + #endif /* VIRTIO_SCSI_H */ +-- +2.35.3 + diff --git a/SOURCES/kvm-pc-bios-s390-ccw-bootmap-Improve-the-guessing-logic-.patch b/SOURCES/kvm-pc-bios-s390-ccw-bootmap-Improve-the-guessing-logic-.patch new file mode 100644 index 0000000..fd34b3d --- /dev/null +++ b/SOURCES/kvm-pc-bios-s390-ccw-bootmap-Improve-the-guessing-logic-.patch @@ -0,0 +1,102 @@ +From 8433b2ba40d0618c7086da87685e1c51b6da3b11 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 8 Jul 2022 12:29:50 +0200 +Subject: [PATCH 30/37] pc-bios/s390-ccw/bootmap: Improve the guessing logic in + zipl_load_vblk() + +RH-Author: Thomas Huth +RH-MergeRequest: 198: pc-bios/s390-ccw: Fix boot from disks with 4k sectors that do not have the typical DASD geometry +RH-Commit: [2/9] db1d2e7929352bec0e1a5d4cf3fb385bbe02304b +RH-Bugzilla: 2098076 +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: David Hildenbrand +RH-Acked-by: Cornelia Huck + +Bugzilla: http://bugzilla.redhat.com/2098076 + +commit 422865f6672ee1482b98d18321b55c1ecfb06c82 +Author: Thomas Huth +Date: Mon Jul 4 13:18:54 2022 +0200 + + pc-bios/s390-ccw/bootmap: Improve the guessing logic in zipl_load_vblk() + + The logic of trying an final ISO or ECKD boot on virtio-block devices is + very weird: Since the geometry hardly ever matches in virtio_disk_is_scsi(), + virtio_blk_setup_device() always sets a "guessed" disk geometry via + virtio_assume_scsi() (which is certainly also wrong in a lot of cases). + + zipl_load_vblk() then sees that there's been a "virtio_guessed_disk_nature" + and tries to fix up the geometry again via virtio_assume_iso9660() before + always trying to do ipl_iso_el_torito(). That's a very brain-twisting + way of attempting to boot from ISO images, which won't work anymore after + the following patches that will clean up the virtio_assume_scsi() mess + (and thus get rid of the "virtio_guessed_disk_nature" here). + + Let's try a better approach instead: ISO files always have a magic + string "CD001" at offset 0x8001 (see e.g. the ECMA-119 specification) + which we can use to decide whether we should try to boot in ISO 9660 + mode (which we should also try if we see a sector size of 2048). + + And if we were not able to boot in ISO mode here, the final boot attempt + before panicking is to boot in ECKD mode. Since this is our last boot + attempt anyway, simply always assume the ECKD geometry here (if the sector + size was not 4096 yet), so that we also do not depend on the guessed disk + geometry from virtio_blk_setup_device() here anymore. + + Message-Id: <20220704111903.62400-4-thuth@redhat.com> + Signed-off-by: Thomas Huth + +Signed-off-by: Thomas Huth +--- + pc-bios/s390-ccw/bootmap.c | 27 +++++++++++++++++++++++---- + 1 file changed, 23 insertions(+), 4 deletions(-) + +diff --git a/pc-bios/s390-ccw/bootmap.c b/pc-bios/s390-ccw/bootmap.c +index 56411ab3b6..994e59c0b0 100644 +--- a/pc-bios/s390-ccw/bootmap.c ++++ b/pc-bios/s390-ccw/bootmap.c +@@ -780,18 +780,37 @@ static void ipl_iso_el_torito(void) + } + } + ++/** ++ * Detect whether we're trying to boot from an .ISO image. ++ * These always have a signature string "CD001" at offset 0x8001. ++ */ ++static bool has_iso_signature(void) ++{ ++ int blksize = virtio_get_block_size(); ++ ++ if (!blksize || virtio_read(0x8000 / blksize, sec)) { ++ return false; ++ } ++ ++ return !memcmp("CD001", &sec[1], 5); ++} ++ + /*********************************************************************** + * Bus specific IPL sequences + */ + + static void zipl_load_vblk(void) + { +- if (virtio_guessed_disk_nature()) { +- virtio_assume_iso9660(); ++ int blksize = virtio_get_block_size(); ++ ++ if (blksize == VIRTIO_ISO_BLOCK_SIZE || has_iso_signature()) { ++ if (blksize != VIRTIO_ISO_BLOCK_SIZE) { ++ virtio_assume_iso9660(); ++ } ++ ipl_iso_el_torito(); + } +- ipl_iso_el_torito(); + +- if (virtio_guessed_disk_nature()) { ++ if (blksize != VIRTIO_DASD_DEFAULT_BLOCK_SIZE) { + sclp_print("Using guessed DASD geometry.\n"); + virtio_assume_eckd(); + } +-- +2.35.3 + diff --git a/SOURCES/kvm-pc-bios-s390-ccw-virtio-Beautify-the-code-for-readin.patch b/SOURCES/kvm-pc-bios-s390-ccw-virtio-Beautify-the-code-for-readin.patch new file mode 100644 index 0000000..84bf0ce --- /dev/null +++ b/SOURCES/kvm-pc-bios-s390-ccw-virtio-Beautify-the-code-for-readin.patch @@ -0,0 +1,56 @@ +From 8b05a4aa32e5ae6cdbc16a5350f6df35d2d79efc Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 8 Jul 2022 12:29:50 +0200 +Subject: [PATCH 35/37] pc-bios/s390-ccw/virtio: Beautify the code for reading + virtqueue configuration + +RH-Author: Thomas Huth +RH-MergeRequest: 198: pc-bios/s390-ccw: Fix boot from disks with 4k sectors that do not have the typical DASD geometry +RH-Commit: [7/9] 52fb7fee7d7c46397f32e35bd5f92f82616dfb5c +RH-Bugzilla: 2098076 +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: David Hildenbrand +RH-Acked-by: Cornelia Huck + +Bugzilla: http://bugzilla.redhat.com/2098076 + +commit 070824885741f5d2a66626d3c4ecb2773c8e0552 +Author: Thomas Huth +Date: Mon Jul 4 13:18:59 2022 +0200 + + pc-bios/s390-ccw/virtio: Beautify the code for reading virtqueue configuration + + It looks nicer if we separate the run_ccw() from the IPL_assert() + statement, and the error message should talk about "virtio device" + instead of "block device", since this code is nowadays used for + non-block (i.e. network) devices, too. + + Message-Id: <20220704111903.62400-9-thuth@redhat.com> + Reviewed-by: Cornelia Huck + Reviewed-by: Eric Farman + Signed-off-by: Thomas Huth + +Signed-off-by: Thomas Huth +--- + pc-bios/s390-ccw/virtio.c | 5 ++--- + 1 file changed, 2 insertions(+), 3 deletions(-) + +diff --git a/pc-bios/s390-ccw/virtio.c b/pc-bios/s390-ccw/virtio.c +index d8c2b52710..f37510f312 100644 +--- a/pc-bios/s390-ccw/virtio.c ++++ b/pc-bios/s390-ccw/virtio.c +@@ -289,9 +289,8 @@ void virtio_setup_ccw(VDev *vdev) + .num = 0, + }; + +- IPL_assert( +- run_ccw(vdev, CCW_CMD_READ_VQ_CONF, &config, sizeof(config), false) == 0, +- "Could not get block device VQ configuration"); ++ rc = run_ccw(vdev, CCW_CMD_READ_VQ_CONF, &config, sizeof(config), false); ++ IPL_assert(rc == 0, "Could not get virtio device VQ configuration"); + info.num = config.num; + vring_init(&vdev->vrings[i], &info); + vdev->vrings[i].schid = vdev->schid; +-- +2.35.3 + diff --git a/SOURCES/kvm-pc-bios-s390-ccw-virtio-Introduce-a-macro-for-the-DA.patch b/SOURCES/kvm-pc-bios-s390-ccw-virtio-Introduce-a-macro-for-the-DA.patch new file mode 100644 index 0000000..9e9d8e6 --- /dev/null +++ b/SOURCES/kvm-pc-bios-s390-ccw-virtio-Introduce-a-macro-for-the-DA.patch @@ -0,0 +1,63 @@ +From 511d05f31824b375057ba8dea3f0343ce6e1c1e8 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 8 Jul 2022 12:29:50 +0200 +Subject: [PATCH 29/37] pc-bios/s390-ccw/virtio: Introduce a macro for the DASD + block size + +RH-Author: Thomas Huth +RH-MergeRequest: 198: pc-bios/s390-ccw: Fix boot from disks with 4k sectors that do not have the typical DASD geometry +RH-Commit: [1/9] 1053101fd5fb591131c567ff98c7d92b63a9dfa9 +RH-Bugzilla: 2098076 +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: David Hildenbrand +RH-Acked-by: Cornelia Huck + +Bugzilla: http://bugzilla.redhat.com/2098076 + +commit 1f2c2ee48e87ea743f8e23cc7569dd26c4cf9623 +Author: Thomas Huth +Date: Mon Jul 4 13:18:53 2022 +0200 + + pc-bios/s390-ccw/virtio: Introduce a macro for the DASD block size + + Use VIRTIO_DASD_DEFAULT_BLOCK_SIZE instead of the magic value 4096. + + Message-Id: <20220704111903.62400-3-thuth@redhat.com> + Reviewed-by: Eric Farman + Reviewed-by: Cornelia Huck + Signed-off-by: Thomas Huth + +Signed-off-by: Thomas Huth +--- + pc-bios/s390-ccw/virtio-blkdev.c | 2 +- + pc-bios/s390-ccw/virtio.h | 1 + + 2 files changed, 2 insertions(+), 1 deletion(-) + +diff --git a/pc-bios/s390-ccw/virtio-blkdev.c b/pc-bios/s390-ccw/virtio-blkdev.c +index 7d35050292..6483307630 100644 +--- a/pc-bios/s390-ccw/virtio-blkdev.c ++++ b/pc-bios/s390-ccw/virtio-blkdev.c +@@ -155,7 +155,7 @@ void virtio_assume_eckd(void) + vdev->config.blk.physical_block_exp = 0; + switch (vdev->senseid.cu_model) { + case VIRTIO_ID_BLOCK: +- vdev->config.blk.blk_size = 4096; ++ vdev->config.blk.blk_size = VIRTIO_DASD_DEFAULT_BLOCK_SIZE; + break; + case VIRTIO_ID_SCSI: + vdev->config.blk.blk_size = vdev->scsi_block_size; +diff --git a/pc-bios/s390-ccw/virtio.h b/pc-bios/s390-ccw/virtio.h +index 19fceb6495..9e410bde6f 100644 +--- a/pc-bios/s390-ccw/virtio.h ++++ b/pc-bios/s390-ccw/virtio.h +@@ -198,6 +198,7 @@ extern int virtio_read_many(ulong sector, void *load_addr, int sec_num); + #define VIRTIO_SECTOR_SIZE 512 + #define VIRTIO_ISO_BLOCK_SIZE 2048 + #define VIRTIO_SCSI_BLOCK_SIZE 512 ++#define VIRTIO_DASD_DEFAULT_BLOCK_SIZE 4096 + + static inline ulong virtio_sector_adjust(ulong sector) + { +-- +2.35.3 + diff --git a/SOURCES/kvm-pc-bios-s390-ccw-virtio-Read-device-config-after-fea.patch b/SOURCES/kvm-pc-bios-s390-ccw-virtio-Read-device-config-after-fea.patch new file mode 100644 index 0000000..53f125a --- /dev/null +++ b/SOURCES/kvm-pc-bios-s390-ccw-virtio-Read-device-config-after-fea.patch @@ -0,0 +1,67 @@ +From a60940fb7ef026f3aa968e77389efa51ea648ddf Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 8 Jul 2022 12:29:50 +0200 +Subject: [PATCH 34/37] pc-bios/s390-ccw/virtio: Read device config after + feature negotiation + +RH-Author: Thomas Huth +RH-MergeRequest: 198: pc-bios/s390-ccw: Fix boot from disks with 4k sectors that do not have the typical DASD geometry +RH-Commit: [6/9] 99ed8765d614207db19ded75d62c65171674d982 +RH-Bugzilla: 2098076 +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: David Hildenbrand +RH-Acked-by: Cornelia Huck + +Bugzilla: http://bugzilla.redhat.com/2098076 + +commit aa5c69ce99411c4886bcd051f288afc02b6d968d +Author: Thomas Huth +Date: Mon Jul 4 13:18:58 2022 +0200 + + pc-bios/s390-ccw/virtio: Read device config after feature negotiation + + Feature negotiation should be done first, since some fields in the + config area can depend on the negotiated features and thus should + rather be read afterwards. + + While we're at it, also adjust the error message here a little bit + (the code is nowadays used for non-block virtio devices, too). + + Message-Id: <20220704111903.62400-8-thuth@redhat.com> + Reviewed-by: Eric Farman + Reviewed-by: Cornelia Huck + Signed-off-by: Thomas Huth + +Signed-off-by: Thomas Huth +--- + pc-bios/s390-ccw/virtio.c | 7 +++---- + 1 file changed, 3 insertions(+), 4 deletions(-) + +diff --git a/pc-bios/s390-ccw/virtio.c b/pc-bios/s390-ccw/virtio.c +index 4e85a2eb82..d8c2b52710 100644 +--- a/pc-bios/s390-ccw/virtio.c ++++ b/pc-bios/s390-ccw/virtio.c +@@ -262,10 +262,6 @@ void virtio_setup_ccw(VDev *vdev) + rc = run_ccw(vdev, CCW_CMD_WRITE_STATUS, &status, sizeof(status), false); + IPL_assert(rc == 0, "Could not write DRIVER status to host"); + +- IPL_assert( +- run_ccw(vdev, CCW_CMD_READ_CONF, &vdev->config, cfg_size, false) == 0, +- "Could not get block device configuration"); +- + /* Feature negotiation */ + for (i = 0; i < ARRAY_SIZE(vdev->guest_features); i++) { + feats.features = 0; +@@ -278,6 +274,9 @@ void virtio_setup_ccw(VDev *vdev) + IPL_assert(rc == 0, "Could not set features bits"); + } + ++ rc = run_ccw(vdev, CCW_CMD_READ_CONF, &vdev->config, cfg_size, false); ++ IPL_assert(rc == 0, "Could not get virtio device configuration"); ++ + for (i = 0; i < vdev->nr_vqs; i++) { + VqInfo info = { + .queue = (unsigned long long) ring_area + (i * VIRTIO_RING_SIZE), +-- +2.35.3 + diff --git a/SOURCES/kvm-pc-bios-s390-ccw-virtio-Set-missing-status-bits-whil.patch b/SOURCES/kvm-pc-bios-s390-ccw-virtio-Set-missing-status-bits-whil.patch new file mode 100644 index 0000000..b25a352 --- /dev/null +++ b/SOURCES/kvm-pc-bios-s390-ccw-virtio-Set-missing-status-bits-whil.patch @@ -0,0 +1,93 @@ +From 5cf01cccb7501c801fa9f21a021bc9e7d1fc56e3 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 8 Jul 2022 12:29:50 +0200 +Subject: [PATCH 33/37] pc-bios/s390-ccw/virtio: Set missing status bits while + initializing + +RH-Author: Thomas Huth +RH-MergeRequest: 198: pc-bios/s390-ccw: Fix boot from disks with 4k sectors that do not have the typical DASD geometry +RH-Commit: [5/9] 6072245f49c229518246b4a0d1be360331305bfa +RH-Bugzilla: 2098076 +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: David Hildenbrand +RH-Acked-by: Cornelia Huck + +Bugzilla: http://bugzilla.redhat.com/2098076 + +commit 175aa06a152ef6b58ba9b2e47a1296b024dea70c +Author: Thomas Huth +Date: Mon Jul 4 13:18:57 2022 +0200 + + pc-bios/s390-ccw/virtio: Set missing status bits while initializing + + According chapter "3.1.1 Driver Requirements: Device Initialization" + of the Virtio specification (v1.1), a driver for a device has to set + the ACKNOWLEDGE and DRIVER bits in the status field after resetting + the device. The s390-ccw bios skipped these steps so far and seems + like QEMU never cared. Anyway, it's better to follow the spec, so + let's set these bits now in the right spots, too. + + Message-Id: <20220704111903.62400-7-thuth@redhat.com> + Acked-by: Christian Borntraeger + Reviewed-by: Cornelia Huck + Reviewed-by: Eric Farman + Signed-off-by: Thomas Huth + +Signed-off-by: Thomas Huth +--- + pc-bios/s390-ccw/virtio.c | 18 ++++++++++++++---- + 1 file changed, 14 insertions(+), 4 deletions(-) + +diff --git a/pc-bios/s390-ccw/virtio.c b/pc-bios/s390-ccw/virtio.c +index 5d2c6e3381..4e85a2eb82 100644 +--- a/pc-bios/s390-ccw/virtio.c ++++ b/pc-bios/s390-ccw/virtio.c +@@ -220,7 +220,7 @@ int virtio_run(VDev *vdev, int vqid, VirtioCmd *cmd) + void virtio_setup_ccw(VDev *vdev) + { + int i, rc, cfg_size = 0; +- unsigned char status = VIRTIO_CONFIG_S_DRIVER_OK; ++ uint8_t status; + struct VirtioFeatureDesc { + uint32_t features; + uint8_t index; +@@ -234,6 +234,10 @@ void virtio_setup_ccw(VDev *vdev) + + run_ccw(vdev, CCW_CMD_VDEV_RESET, NULL, 0, false); + ++ status = VIRTIO_CONFIG_S_ACKNOWLEDGE; ++ rc = run_ccw(vdev, CCW_CMD_WRITE_STATUS, &status, sizeof(status), false); ++ IPL_assert(rc == 0, "Could not write ACKNOWLEDGE status to host"); ++ + switch (vdev->senseid.cu_model) { + case VIRTIO_ID_NET: + vdev->nr_vqs = 2; +@@ -253,6 +257,11 @@ void virtio_setup_ccw(VDev *vdev) + default: + panic("Unsupported virtio device\n"); + } ++ ++ status |= VIRTIO_CONFIG_S_DRIVER; ++ rc = run_ccw(vdev, CCW_CMD_WRITE_STATUS, &status, sizeof(status), false); ++ IPL_assert(rc == 0, "Could not write DRIVER status to host"); ++ + IPL_assert( + run_ccw(vdev, CCW_CMD_READ_CONF, &vdev->config, cfg_size, false) == 0, + "Could not get block device configuration"); +@@ -291,9 +300,10 @@ void virtio_setup_ccw(VDev *vdev) + run_ccw(vdev, CCW_CMD_SET_VQ, &info, sizeof(info), false) == 0, + "Cannot set VQ info"); + } +- IPL_assert( +- run_ccw(vdev, CCW_CMD_WRITE_STATUS, &status, sizeof(status), false) == 0, +- "Could not write status to host"); ++ ++ status |= VIRTIO_CONFIG_S_DRIVER_OK; ++ rc = run_ccw(vdev, CCW_CMD_WRITE_STATUS, &status, sizeof(status), false); ++ IPL_assert(rc == 0, "Could not write DRIVER_OK status to host"); + } + + bool virtio_is_supported(SubChannelId schid) +-- +2.35.3 + diff --git a/SOURCES/kvm-pc-bios-s390-ccw-virtio-blkdev-Remove-virtio_assume_.patch b/SOURCES/kvm-pc-bios-s390-ccw-virtio-blkdev-Remove-virtio_assume_.patch new file mode 100644 index 0000000..ff8aab3 --- /dev/null +++ b/SOURCES/kvm-pc-bios-s390-ccw-virtio-blkdev-Remove-virtio_assume_.patch @@ -0,0 +1,101 @@ +From 5b3548c50e35729d724403b83e26579d31621367 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 8 Jul 2022 12:29:50 +0200 +Subject: [PATCH 32/37] pc-bios/s390-ccw/virtio-blkdev: Remove + virtio_assume_scsi() + +RH-Author: Thomas Huth +RH-MergeRequest: 198: pc-bios/s390-ccw: Fix boot from disks with 4k sectors that do not have the typical DASD geometry +RH-Commit: [4/9] 5256c4e6f4d5c5aedf1bad3fee30dd3ad230a3dd +RH-Bugzilla: 2098076 +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: David Hildenbrand +RH-Acked-by: Cornelia Huck + +Bugzilla: http://bugzilla.redhat.com/2098076 + +commit 5447de2619050a0a4dd480b97f88a9b58da360d1 +Author: Thomas Huth +Date: Mon Jul 4 13:18:56 2022 +0200 + + pc-bios/s390-ccw/virtio-blkdev: Remove virtio_assume_scsi() + + The virtio_assume_scsi() function is very questionable: First, it + is only called for virtio-blk, and not for virtio-scsi, so the naming + is already quite confusing. Second, it is called if we detected a + "invalid" IPL disk, trying to fix it by blindly setting a sector + size of 512. This of course won't work in most cases since disks + might have a different sector size for a reason. + + Thus let's remove this strange function now. The calling code can + also be removed completely, since there is another spot in main.c + that does "IPL_assert(virtio_ipl_disk_is_valid(), ...)" to make + sure that we do not try to IPL from an invalid device. + + Message-Id: <20220704111903.62400-6-thuth@redhat.com> + Reviewed-by: Eric Farman + Signed-off-by: Thomas Huth + +Signed-off-by: Thomas Huth +--- + pc-bios/s390-ccw/virtio-blkdev.c | 24 ------------------------ + pc-bios/s390-ccw/virtio.h | 1 - + 2 files changed, 25 deletions(-) + +diff --git a/pc-bios/s390-ccw/virtio-blkdev.c b/pc-bios/s390-ccw/virtio-blkdev.c +index 7e13155589..db1f7f44aa 100644 +--- a/pc-bios/s390-ccw/virtio-blkdev.c ++++ b/pc-bios/s390-ccw/virtio-blkdev.c +@@ -112,23 +112,6 @@ VirtioGDN virtio_guessed_disk_nature(void) + return virtio_get_device()->guessed_disk_nature; + } + +-void virtio_assume_scsi(void) +-{ +- VDev *vdev = virtio_get_device(); +- +- switch (vdev->senseid.cu_model) { +- case VIRTIO_ID_BLOCK: +- vdev->guessed_disk_nature = VIRTIO_GDN_SCSI; +- vdev->config.blk.blk_size = VIRTIO_SCSI_BLOCK_SIZE; +- vdev->config.blk.physical_block_exp = 0; +- vdev->blk_factor = 1; +- break; +- case VIRTIO_ID_SCSI: +- vdev->scsi_block_size = VIRTIO_SCSI_BLOCK_SIZE; +- break; +- } +-} +- + void virtio_assume_iso9660(void) + { + VDev *vdev = virtio_get_device(); +@@ -247,13 +230,6 @@ int virtio_blk_setup_device(SubChannelId schid) + switch (vdev->senseid.cu_model) { + case VIRTIO_ID_BLOCK: + sclp_print("Using virtio-blk.\n"); +- if (!virtio_ipl_disk_is_valid()) { +- /* make sure all getters but blocksize return 0 for +- * invalid IPL disk +- */ +- memset(&vdev->config.blk, 0, sizeof(vdev->config.blk)); +- virtio_assume_scsi(); +- } + break; + case VIRTIO_ID_SCSI: + IPL_assert(vdev->config.scsi.sense_size == VIRTIO_SCSI_SENSE_SIZE, +diff --git a/pc-bios/s390-ccw/virtio.h b/pc-bios/s390-ccw/virtio.h +index 241730effe..600ba5052b 100644 +--- a/pc-bios/s390-ccw/virtio.h ++++ b/pc-bios/s390-ccw/virtio.h +@@ -182,7 +182,6 @@ enum guessed_disk_nature_type { + typedef enum guessed_disk_nature_type VirtioGDN; + + VirtioGDN virtio_guessed_disk_nature(void); +-void virtio_assume_scsi(void); + void virtio_assume_eckd(void); + void virtio_assume_iso9660(void); + +-- +2.35.3 + diff --git a/SOURCES/kvm-pc-bios-s390-ccw-virtio-blkdev-Request-the-right-fea.patch b/SOURCES/kvm-pc-bios-s390-ccw-virtio-blkdev-Request-the-right-fea.patch new file mode 100644 index 0000000..ade5ff2 --- /dev/null +++ b/SOURCES/kvm-pc-bios-s390-ccw-virtio-blkdev-Request-the-right-fea.patch @@ -0,0 +1,63 @@ +From 042e966a70789bd3ed450fa4f57016129a34672e Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 8 Jul 2022 12:29:50 +0200 +Subject: [PATCH 37/37] pc-bios/s390-ccw/virtio-blkdev: Request the right + feature bits + +RH-Author: Thomas Huth +RH-MergeRequest: 198: pc-bios/s390-ccw: Fix boot from disks with 4k sectors that do not have the typical DASD geometry +RH-Commit: [9/9] f04835423d648b04f2187ef9890f2d1689e2b57e +RH-Bugzilla: 2098076 +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: David Hildenbrand +RH-Acked-by: Cornelia Huck + +Bugzilla: http://bugzilla.redhat.com/2098076 + +commit 9125a314cca4a1838b09305a87d8efb98f80ab67 +Author: Thomas Huth +Date: Mon Jul 4 13:19:01 2022 +0200 + + pc-bios/s390-ccw/virtio-blkdev: Request the right feature bits + + The virtio-blk code uses the block size and geometry fields in the + config area. According to the virtio-spec, these have to be negotiated + with the right feature bits during initialization, otherwise they + might not be available. QEMU is so far very forgiving and always + provides them, but we should not rely on this behavior, so let's + better request them properly via the VIRTIO_BLK_F_GEOMETRY and + VIRTIO_BLK_F_BLK_SIZE feature bits. + + Message-Id: <20220704111903.62400-11-thuth@redhat.com> + Signed-off-by: Thomas Huth + +Signed-off-by: Thomas Huth +--- + pc-bios/s390-ccw/virtio-blkdev.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/pc-bios/s390-ccw/virtio-blkdev.c b/pc-bios/s390-ccw/virtio-blkdev.c +index c175b66a47..8271c47296 100644 +--- a/pc-bios/s390-ccw/virtio-blkdev.c ++++ b/pc-bios/s390-ccw/virtio-blkdev.c +@@ -13,6 +13,9 @@ + #include "virtio.h" + #include "virtio-scsi.h" + ++#define VIRTIO_BLK_F_GEOMETRY (1 << 4) ++#define VIRTIO_BLK_F_BLK_SIZE (1 << 6) ++ + static int virtio_blk_read_many(VDev *vdev, ulong sector, void *load_addr, + int sec_num) + { +@@ -223,6 +226,7 @@ int virtio_blk_setup_device(SubChannelId schid) + { + VDev *vdev = virtio_get_device(); + ++ vdev->guest_features[0] = VIRTIO_BLK_F_GEOMETRY | VIRTIO_BLK_F_BLK_SIZE; + vdev->schid = schid; + virtio_setup_ccw(vdev); + +-- +2.35.3 + diff --git a/SOURCES/kvm-pc-bios-s390-ccw-virtio-blkdev-Simplify-fix-virtio_i.patch b/SOURCES/kvm-pc-bios-s390-ccw-virtio-blkdev-Simplify-fix-virtio_i.patch new file mode 100644 index 0000000..1730dd3 --- /dev/null +++ b/SOURCES/kvm-pc-bios-s390-ccw-virtio-blkdev-Simplify-fix-virtio_i.patch @@ -0,0 +1,124 @@ +From f09f2f12133073d6ccab3b2bd95717d435adc442 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 8 Jul 2022 12:29:50 +0200 +Subject: [PATCH 31/37] pc-bios/s390-ccw/virtio-blkdev: Simplify/fix + virtio_ipl_disk_is_valid() + +RH-Author: Thomas Huth +RH-MergeRequest: 198: pc-bios/s390-ccw: Fix boot from disks with 4k sectors that do not have the typical DASD geometry +RH-Commit: [3/9] ca0b836a417ce5bbd26e489551f573d6b2fc9e94 +RH-Bugzilla: 2098076 +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: David Hildenbrand +RH-Acked-by: Cornelia Huck + +Bugzilla: http://bugzilla.redhat.com/2098076 + +commit bbf615f7b707f009ef8e757d170902ad33b90644 +Author: Thomas Huth +Date: Mon Jul 4 13:18:55 2022 +0200 + + pc-bios/s390-ccw/virtio-blkdev: Simplify/fix virtio_ipl_disk_is_valid() + + The s390-ccw bios fails to boot if the boot disk is a virtio-blk + disk with a sector size of 4096. For example: + + dasdfmt -b 4096 -d cdl -y -p -M quick /dev/dasdX + fdasd -a /dev/dasdX + install a guest onto /dev/dasdX1 using virtio-blk + qemu-system-s390x -nographic -hda /dev/dasdX1 + + The bios then bails out with: + + ! Cannot read block 0 ! + + Looking at virtio_ipl_disk_is_valid() and especially the function + virtio_disk_is_scsi(), it does not really make sense that we expect + only such a limited disk geometry (like a block size of 512) for + our boot disks. Let's relax the check and allow everything that + remotely looks like a sane disk. + + Message-Id: <20220704111903.62400-5-thuth@redhat.com> + Reviewed-by: Eric Farman + Signed-off-by: Thomas Huth + +Signed-off-by: Thomas Huth +--- + pc-bios/s390-ccw/virtio-blkdev.c | 41 ++++++-------------------------- + pc-bios/s390-ccw/virtio.h | 2 -- + 2 files changed, 7 insertions(+), 36 deletions(-) + +diff --git a/pc-bios/s390-ccw/virtio-blkdev.c b/pc-bios/s390-ccw/virtio-blkdev.c +index 6483307630..7e13155589 100644 +--- a/pc-bios/s390-ccw/virtio-blkdev.c ++++ b/pc-bios/s390-ccw/virtio-blkdev.c +@@ -166,46 +166,19 @@ void virtio_assume_eckd(void) + virtio_eckd_sectors_for_block_size(vdev->config.blk.blk_size); + } + +-bool virtio_disk_is_scsi(void) +-{ +- VDev *vdev = virtio_get_device(); +- +- if (vdev->guessed_disk_nature == VIRTIO_GDN_SCSI) { +- return true; +- } +- switch (vdev->senseid.cu_model) { +- case VIRTIO_ID_BLOCK: +- return (vdev->config.blk.geometry.heads == 255) +- && (vdev->config.blk.geometry.sectors == 63) +- && (virtio_get_block_size() == VIRTIO_SCSI_BLOCK_SIZE); +- case VIRTIO_ID_SCSI: +- return true; +- } +- return false; +-} +- +-bool virtio_disk_is_eckd(void) ++bool virtio_ipl_disk_is_valid(void) + { ++ int blksize = virtio_get_block_size(); + VDev *vdev = virtio_get_device(); +- const int block_size = virtio_get_block_size(); + +- if (vdev->guessed_disk_nature == VIRTIO_GDN_DASD) { ++ if (vdev->guessed_disk_nature == VIRTIO_GDN_SCSI || ++ vdev->guessed_disk_nature == VIRTIO_GDN_DASD) { + return true; + } +- switch (vdev->senseid.cu_model) { +- case VIRTIO_ID_BLOCK: +- return (vdev->config.blk.geometry.heads == 15) +- && (vdev->config.blk.geometry.sectors == +- virtio_eckd_sectors_for_block_size(block_size)); +- case VIRTIO_ID_SCSI: +- return false; +- } +- return false; +-} + +-bool virtio_ipl_disk_is_valid(void) +-{ +- return virtio_disk_is_scsi() || virtio_disk_is_eckd(); ++ return (vdev->senseid.cu_model == VIRTIO_ID_BLOCK || ++ vdev->senseid.cu_model == VIRTIO_ID_SCSI) && ++ blksize >= 512 && blksize <= 4096; + } + + int virtio_get_block_size(void) +diff --git a/pc-bios/s390-ccw/virtio.h b/pc-bios/s390-ccw/virtio.h +index 9e410bde6f..241730effe 100644 +--- a/pc-bios/s390-ccw/virtio.h ++++ b/pc-bios/s390-ccw/virtio.h +@@ -186,8 +186,6 @@ void virtio_assume_scsi(void); + void virtio_assume_eckd(void); + void virtio_assume_iso9660(void); + +-extern bool virtio_disk_is_scsi(void); +-extern bool virtio_disk_is_eckd(void); + extern bool virtio_ipl_disk_is_valid(void); + extern int virtio_get_block_size(void); + extern uint8_t virtio_get_heads(void); +-- +2.35.3 + diff --git a/SOURCES/kvm-qcow2-Add-errp-to-rebuild_refcount_structure.patch b/SOURCES/kvm-qcow2-Add-errp-to-rebuild_refcount_structure.patch new file mode 100644 index 0000000..8ed0d2e --- /dev/null +++ b/SOURCES/kvm-qcow2-Add-errp-to-rebuild_refcount_structure.patch @@ -0,0 +1,162 @@ +From 552e7c8ae2c6e281a72791aefa1729be86f96642 Mon Sep 17 00:00:00 2001 +From: Hanna Reitz +Date: Tue, 5 Apr 2022 15:46:52 +0200 +Subject: [PATCH 5/6] qcow2: Add errp to rebuild_refcount_structure() + +RH-Author: Hanna Reitz +RH-MergeRequest: 171: qcow2: Improve refcount structure rebuilding +RH-Commit: [3/4] 9dddd1d21383c4cbd528e5a0d42b0c2a7d87c8f6 +RH-Bugzilla: 1519071 +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Eric Blake + +Instead of fprint()-ing error messages in rebuild_refcount_structure() +and its rebuild_refcounts_write_refblocks() helper, pass them through an +Error object to qcow2_check_refcounts() (which will then print it). + +Suggested-by: Eric Blake +Signed-off-by: Hanna Reitz +Message-Id: <20220405134652.19278-4-hreitz@redhat.com> +Reviewed-by: Eric Blake +(cherry picked from commit 0423f75351ab83b844a31349218b0eadd830e07a) +Signed-off-by: Hanna Reitz +--- + block/qcow2-refcount.c | 33 +++++++++++++++++++-------------- + 1 file changed, 19 insertions(+), 14 deletions(-) + +diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c +index 555d8ba5ac..09f8ef4927 100644 +--- a/block/qcow2-refcount.c ++++ b/block/qcow2-refcount.c +@@ -2462,7 +2462,8 @@ static int64_t alloc_clusters_imrt(BlockDriverState *bs, + static int rebuild_refcounts_write_refblocks( + BlockDriverState *bs, void **refcount_table, int64_t *nb_clusters, + int64_t first_cluster, int64_t end_cluster, +- uint64_t **on_disk_reftable_ptr, uint32_t *on_disk_reftable_entries_ptr ++ uint64_t **on_disk_reftable_ptr, uint32_t *on_disk_reftable_entries_ptr, ++ Error **errp + ) + { + BDRVQcow2State *s = bs->opaque; +@@ -2513,8 +2514,8 @@ static int rebuild_refcounts_write_refblocks( + nb_clusters, + &first_free_cluster); + if (refblock_offset < 0) { +- fprintf(stderr, "ERROR allocating refblock: %s\n", +- strerror(-refblock_offset)); ++ error_setg_errno(errp, -refblock_offset, ++ "ERROR allocating refblock"); + return refblock_offset; + } + +@@ -2536,6 +2537,7 @@ static int rebuild_refcounts_write_refblocks( + on_disk_reftable_entries * + REFTABLE_ENTRY_SIZE); + if (!on_disk_reftable) { ++ error_setg(errp, "ERROR allocating reftable memory"); + return -ENOMEM; + } + +@@ -2559,7 +2561,7 @@ static int rebuild_refcounts_write_refblocks( + ret = qcow2_pre_write_overlap_check(bs, 0, refblock_offset, + s->cluster_size, false); + if (ret < 0) { +- fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret)); ++ error_setg_errno(errp, -ret, "ERROR writing refblock"); + return ret; + } + +@@ -2575,7 +2577,7 @@ static int rebuild_refcounts_write_refblocks( + ret = bdrv_pwrite(bs->file, refblock_offset, on_disk_refblock, + s->cluster_size); + if (ret < 0) { +- fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret)); ++ error_setg_errno(errp, -ret, "ERROR writing refblock"); + return ret; + } + +@@ -2598,7 +2600,8 @@ static int rebuild_refcounts_write_refblocks( + static int rebuild_refcount_structure(BlockDriverState *bs, + BdrvCheckResult *res, + void **refcount_table, +- int64_t *nb_clusters) ++ int64_t *nb_clusters, ++ Error **errp) + { + BDRVQcow2State *s = bs->opaque; + int64_t reftable_offset = -1; +@@ -2649,7 +2652,7 @@ static int rebuild_refcount_structure(BlockDriverState *bs, + rebuild_refcounts_write_refblocks(bs, refcount_table, nb_clusters, + 0, *nb_clusters, + &on_disk_reftable, +- &on_disk_reftable_entries); ++ &on_disk_reftable_entries, errp); + if (reftable_size_changed < 0) { + res->check_errors++; + ret = reftable_size_changed; +@@ -2673,8 +2676,8 @@ static int rebuild_refcount_structure(BlockDriverState *bs, + refcount_table, nb_clusters, + &first_free_cluster); + if (reftable_offset < 0) { +- fprintf(stderr, "ERROR allocating reftable: %s\n", +- strerror(-reftable_offset)); ++ error_setg_errno(errp, -reftable_offset, ++ "ERROR allocating reftable"); + res->check_errors++; + ret = reftable_offset; + goto fail; +@@ -2692,7 +2695,7 @@ static int rebuild_refcount_structure(BlockDriverState *bs, + reftable_start_cluster, + reftable_end_cluster, + &on_disk_reftable, +- &on_disk_reftable_entries); ++ &on_disk_reftable_entries, errp); + if (reftable_size_changed < 0) { + res->check_errors++; + ret = reftable_size_changed; +@@ -2722,7 +2725,7 @@ static int rebuild_refcount_structure(BlockDriverState *bs, + ret = qcow2_pre_write_overlap_check(bs, 0, reftable_offset, reftable_length, + false); + if (ret < 0) { +- fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret)); ++ error_setg_errno(errp, -ret, "ERROR writing reftable"); + goto fail; + } + +@@ -2730,7 +2733,7 @@ static int rebuild_refcount_structure(BlockDriverState *bs, + ret = bdrv_pwrite(bs->file, reftable_offset, on_disk_reftable, + reftable_length); + if (ret < 0) { +- fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret)); ++ error_setg_errno(errp, -ret, "ERROR writing reftable"); + goto fail; + } + +@@ -2743,7 +2746,7 @@ static int rebuild_refcount_structure(BlockDriverState *bs, + &reftable_offset_and_clusters, + sizeof(reftable_offset_and_clusters)); + if (ret < 0) { +- fprintf(stderr, "ERROR setting reftable: %s\n", strerror(-ret)); ++ error_setg_errno(errp, -ret, "ERROR setting reftable"); + goto fail; + } + +@@ -2811,11 +2814,13 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, + if (rebuild && (fix & BDRV_FIX_ERRORS)) { + BdrvCheckResult old_res = *res; + int fresh_leaks = 0; ++ Error *local_err = NULL; + + fprintf(stderr, "Rebuilding refcount structure\n"); + ret = rebuild_refcount_structure(bs, res, &refcount_table, +- &nb_clusters); ++ &nb_clusters, &local_err); + if (ret < 0) { ++ error_report_err(local_err); + goto fail; + } + +-- +2.27.0 + diff --git a/SOURCES/kvm-qcow2-Improve-refcount-structure-rebuilding.patch b/SOURCES/kvm-qcow2-Improve-refcount-structure-rebuilding.patch new file mode 100644 index 0000000..efae75f --- /dev/null +++ b/SOURCES/kvm-qcow2-Improve-refcount-structure-rebuilding.patch @@ -0,0 +1,465 @@ +From be54c6206b0f0a19e0ffe6a058f4f97277027a17 Mon Sep 17 00:00:00 2001 +From: Hanna Reitz +Date: Tue, 5 Apr 2022 15:46:50 +0200 +Subject: [PATCH 3/6] qcow2: Improve refcount structure rebuilding + +RH-Author: Hanna Reitz +RH-MergeRequest: 171: qcow2: Improve refcount structure rebuilding +RH-Commit: [1/4] 0bb78f7735a0730204670ae5ec2e040ad1d23942 +RH-Bugzilla: 1519071 +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Eric Blake + +When rebuilding the refcount structures (when qemu-img check -r found +errors with refcount = 0, but reference count > 0), the new refcount +table defaults to being put at the image file end[1]. There is no good +reason for that except that it means we will not have to rewrite any +refblocks we already wrote to disk. + +Changing the code to rewrite those refblocks is not too difficult, +though, so let us do that. That is beneficial for images on block +devices, where we cannot really write beyond the end of the image file. + +Use this opportunity to add extensive comments to the code, and refactor +it a bit, getting rid of the backwards-jumping goto. + +[1] Unless there is something allocated in the area pointed to by the + last refblock, so we have to write that refblock. In that case, we + try to put the reftable in there. + +Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=1519071 +Closes: https://gitlab.com/qemu-project/qemu/-/issues/941 +Reviewed-by: Eric Blake +Signed-off-by: Hanna Reitz +Message-Id: <20220405134652.19278-2-hreitz@redhat.com> +(cherry picked from commit a8c07ec287554dcefd33733f0e5888a281ddc95e) +Signed-off-by: Hanna Reitz +--- + block/qcow2-refcount.c | 332 +++++++++++++++++++++++++++++------------ + 1 file changed, 235 insertions(+), 97 deletions(-) + +diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c +index 4614572252..555d8ba5ac 100644 +--- a/block/qcow2-refcount.c ++++ b/block/qcow2-refcount.c +@@ -2435,111 +2435,140 @@ static int64_t alloc_clusters_imrt(BlockDriverState *bs, + } + + /* +- * Creates a new refcount structure based solely on the in-memory information +- * given through *refcount_table. All necessary allocations will be reflected +- * in that array. ++ * Helper function for rebuild_refcount_structure(). + * +- * On success, the old refcount structure is leaked (it will be covered by the +- * new refcount structure). ++ * Scan the range of clusters [first_cluster, end_cluster) for allocated ++ * clusters and write all corresponding refblocks to disk. The refblock ++ * and allocation data is taken from the in-memory refcount table ++ * *refcount_table[] (of size *nb_clusters), which is basically one big ++ * (unlimited size) refblock for the whole image. ++ * ++ * For these refblocks, clusters are allocated using said in-memory ++ * refcount table. Care is taken that these allocations are reflected ++ * in the refblocks written to disk. ++ * ++ * The refblocks' offsets are written into a reftable, which is ++ * *on_disk_reftable_ptr[] (of size *on_disk_reftable_entries_ptr). If ++ * that reftable is of insufficient size, it will be resized to fit. ++ * This reftable is not written to disk. ++ * ++ * (If *on_disk_reftable_ptr is not NULL, the entries within are assumed ++ * to point to existing valid refblocks that do not need to be allocated ++ * again.) ++ * ++ * Return whether the on-disk reftable array was resized (true/false), ++ * or -errno on error. + */ +-static int rebuild_refcount_structure(BlockDriverState *bs, +- BdrvCheckResult *res, +- void **refcount_table, +- int64_t *nb_clusters) ++static int rebuild_refcounts_write_refblocks( ++ BlockDriverState *bs, void **refcount_table, int64_t *nb_clusters, ++ int64_t first_cluster, int64_t end_cluster, ++ uint64_t **on_disk_reftable_ptr, uint32_t *on_disk_reftable_entries_ptr ++ ) + { + BDRVQcow2State *s = bs->opaque; +- int64_t first_free_cluster = 0, reftable_offset = -1, cluster = 0; ++ int64_t cluster; + int64_t refblock_offset, refblock_start, refblock_index; +- uint32_t reftable_size = 0; +- uint64_t *on_disk_reftable = NULL; ++ int64_t first_free_cluster = 0; ++ uint64_t *on_disk_reftable = *on_disk_reftable_ptr; ++ uint32_t on_disk_reftable_entries = *on_disk_reftable_entries_ptr; + void *on_disk_refblock; +- int ret = 0; +- struct { +- uint64_t reftable_offset; +- uint32_t reftable_clusters; +- } QEMU_PACKED reftable_offset_and_clusters; +- +- qcow2_cache_empty(bs, s->refcount_block_cache); ++ bool reftable_grown = false; ++ int ret; + +-write_refblocks: +- for (; cluster < *nb_clusters; cluster++) { ++ for (cluster = first_cluster; cluster < end_cluster; cluster++) { ++ /* Check all clusters to find refblocks that contain non-zero entries */ + if (!s->get_refcount(*refcount_table, cluster)) { + continue; + } + ++ /* ++ * This cluster is allocated, so we need to create a refblock ++ * for it. The data we will write to disk is just the ++ * respective slice from *refcount_table, so it will contain ++ * accurate refcounts for all clusters belonging to this ++ * refblock. After we have written it, we will therefore skip ++ * all remaining clusters in this refblock. ++ */ ++ + refblock_index = cluster >> s->refcount_block_bits; + refblock_start = refblock_index << s->refcount_block_bits; + +- /* Don't allocate a cluster in a refblock already written to disk */ +- if (first_free_cluster < refblock_start) { +- first_free_cluster = refblock_start; +- } +- refblock_offset = alloc_clusters_imrt(bs, 1, refcount_table, +- nb_clusters, &first_free_cluster); +- if (refblock_offset < 0) { +- fprintf(stderr, "ERROR allocating refblock: %s\n", +- strerror(-refblock_offset)); +- res->check_errors++; +- ret = refblock_offset; +- goto fail; +- } ++ if (on_disk_reftable_entries > refblock_index && ++ on_disk_reftable[refblock_index]) ++ { ++ /* ++ * We can get here after a `goto write_refblocks`: We have a ++ * reftable from a previous run, and the refblock is already ++ * allocated. No need to allocate it again. ++ */ ++ refblock_offset = on_disk_reftable[refblock_index]; ++ } else { ++ int64_t refblock_cluster_index; + +- if (reftable_size <= refblock_index) { +- uint32_t old_reftable_size = reftable_size; +- uint64_t *new_on_disk_reftable; ++ /* Don't allocate a cluster in a refblock already written to disk */ ++ if (first_free_cluster < refblock_start) { ++ first_free_cluster = refblock_start; ++ } ++ refblock_offset = alloc_clusters_imrt(bs, 1, refcount_table, ++ nb_clusters, ++ &first_free_cluster); ++ if (refblock_offset < 0) { ++ fprintf(stderr, "ERROR allocating refblock: %s\n", ++ strerror(-refblock_offset)); ++ return refblock_offset; ++ } + +- reftable_size = ROUND_UP((refblock_index + 1) * REFTABLE_ENTRY_SIZE, +- s->cluster_size) / REFTABLE_ENTRY_SIZE; +- new_on_disk_reftable = g_try_realloc(on_disk_reftable, +- reftable_size * +- REFTABLE_ENTRY_SIZE); +- if (!new_on_disk_reftable) { +- res->check_errors++; +- ret = -ENOMEM; +- goto fail; ++ refblock_cluster_index = refblock_offset / s->cluster_size; ++ if (refblock_cluster_index >= end_cluster) { ++ /* ++ * We must write the refblock that holds this refblock's ++ * refcount ++ */ ++ end_cluster = refblock_cluster_index + 1; + } +- on_disk_reftable = new_on_disk_reftable; + +- memset(on_disk_reftable + old_reftable_size, 0, +- (reftable_size - old_reftable_size) * REFTABLE_ENTRY_SIZE); ++ if (on_disk_reftable_entries <= refblock_index) { ++ on_disk_reftable_entries = ++ ROUND_UP((refblock_index + 1) * REFTABLE_ENTRY_SIZE, ++ s->cluster_size) / REFTABLE_ENTRY_SIZE; ++ on_disk_reftable = ++ g_try_realloc(on_disk_reftable, ++ on_disk_reftable_entries * ++ REFTABLE_ENTRY_SIZE); ++ if (!on_disk_reftable) { ++ return -ENOMEM; ++ } + +- /* The offset we have for the reftable is now no longer valid; +- * this will leak that range, but we can easily fix that by running +- * a leak-fixing check after this rebuild operation */ +- reftable_offset = -1; +- } else { +- assert(on_disk_reftable); +- } +- on_disk_reftable[refblock_index] = refblock_offset; ++ memset(on_disk_reftable + *on_disk_reftable_entries_ptr, 0, ++ (on_disk_reftable_entries - ++ *on_disk_reftable_entries_ptr) * ++ REFTABLE_ENTRY_SIZE); + +- /* If this is apparently the last refblock (for now), try to squeeze the +- * reftable in */ +- if (refblock_index == (*nb_clusters - 1) >> s->refcount_block_bits && +- reftable_offset < 0) +- { +- uint64_t reftable_clusters = size_to_clusters(s, reftable_size * +- REFTABLE_ENTRY_SIZE); +- reftable_offset = alloc_clusters_imrt(bs, reftable_clusters, +- refcount_table, nb_clusters, +- &first_free_cluster); +- if (reftable_offset < 0) { +- fprintf(stderr, "ERROR allocating reftable: %s\n", +- strerror(-reftable_offset)); +- res->check_errors++; +- ret = reftable_offset; +- goto fail; ++ *on_disk_reftable_ptr = on_disk_reftable; ++ *on_disk_reftable_entries_ptr = on_disk_reftable_entries; ++ ++ reftable_grown = true; ++ } else { ++ assert(on_disk_reftable); + } ++ on_disk_reftable[refblock_index] = refblock_offset; + } + ++ /* Refblock is allocated, write it to disk */ ++ + ret = qcow2_pre_write_overlap_check(bs, 0, refblock_offset, + s->cluster_size, false); + if (ret < 0) { + fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret)); +- goto fail; ++ return ret; + } + +- /* The size of *refcount_table is always cluster-aligned, therefore the +- * write operation will not overflow */ ++ /* ++ * The refblock is simply a slice of *refcount_table. ++ * Note that the size of *refcount_table is always aligned to ++ * whole clusters, so the write operation will not result in ++ * out-of-bounds accesses. ++ */ + on_disk_refblock = (void *)((char *) *refcount_table + + refblock_index * s->cluster_size); + +@@ -2547,23 +2576,99 @@ write_refblocks: + s->cluster_size); + if (ret < 0) { + fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret)); +- goto fail; ++ return ret; + } + +- /* Go to the end of this refblock */ ++ /* This refblock is done, skip to its end */ + cluster = refblock_start + s->refcount_block_size - 1; + } + +- if (reftable_offset < 0) { +- uint64_t post_refblock_start, reftable_clusters; ++ return reftable_grown; ++} ++ ++/* ++ * Creates a new refcount structure based solely on the in-memory information ++ * given through *refcount_table (this in-memory information is basically just ++ * the concatenation of all refblocks). All necessary allocations will be ++ * reflected in that array. ++ * ++ * On success, the old refcount structure is leaked (it will be covered by the ++ * new refcount structure). ++ */ ++static int rebuild_refcount_structure(BlockDriverState *bs, ++ BdrvCheckResult *res, ++ void **refcount_table, ++ int64_t *nb_clusters) ++{ ++ BDRVQcow2State *s = bs->opaque; ++ int64_t reftable_offset = -1; ++ int64_t reftable_length = 0; ++ int64_t reftable_clusters; ++ int64_t refblock_index; ++ uint32_t on_disk_reftable_entries = 0; ++ uint64_t *on_disk_reftable = NULL; ++ int ret = 0; ++ int reftable_size_changed = 0; ++ struct { ++ uint64_t reftable_offset; ++ uint32_t reftable_clusters; ++ } QEMU_PACKED reftable_offset_and_clusters; ++ ++ qcow2_cache_empty(bs, s->refcount_block_cache); ++ ++ /* ++ * For each refblock containing entries, we try to allocate a ++ * cluster (in the in-memory refcount table) and write its offset ++ * into on_disk_reftable[]. We then write the whole refblock to ++ * disk (as a slice of the in-memory refcount table). ++ * This is done by rebuild_refcounts_write_refblocks(). ++ * ++ * Once we have scanned all clusters, we try to find space for the ++ * reftable. This will dirty the in-memory refcount table (i.e. ++ * make it differ from the refblocks we have already written), so we ++ * need to run rebuild_refcounts_write_refblocks() again for the ++ * range of clusters where the reftable has been allocated. ++ * ++ * This second run might make the reftable grow again, in which case ++ * we will need to allocate another space for it, which is why we ++ * repeat all this until the reftable stops growing. ++ * ++ * (This loop will terminate, because with every cluster the ++ * reftable grows, it can accomodate a multitude of more refcounts, ++ * so that at some point this must be able to cover the reftable ++ * and all refblocks describing it.) ++ * ++ * We then convert the reftable to big-endian and write it to disk. ++ * ++ * Note that we never free any reftable allocations. Doing so would ++ * needlessly complicate the algorithm: The eventual second check ++ * run we do will clean up all leaks we have caused. ++ */ ++ ++ reftable_size_changed = ++ rebuild_refcounts_write_refblocks(bs, refcount_table, nb_clusters, ++ 0, *nb_clusters, ++ &on_disk_reftable, ++ &on_disk_reftable_entries); ++ if (reftable_size_changed < 0) { ++ res->check_errors++; ++ ret = reftable_size_changed; ++ goto fail; ++ } ++ ++ /* ++ * There was no reftable before, so rebuild_refcounts_write_refblocks() ++ * must have increased its size (from 0 to something). ++ */ ++ assert(reftable_size_changed); ++ ++ do { ++ int64_t reftable_start_cluster, reftable_end_cluster; ++ int64_t first_free_cluster = 0; ++ ++ reftable_length = on_disk_reftable_entries * REFTABLE_ENTRY_SIZE; ++ reftable_clusters = size_to_clusters(s, reftable_length); + +- post_refblock_start = ROUND_UP(*nb_clusters, s->refcount_block_size); +- reftable_clusters = +- size_to_clusters(s, reftable_size * REFTABLE_ENTRY_SIZE); +- /* Not pretty but simple */ +- if (first_free_cluster < post_refblock_start) { +- first_free_cluster = post_refblock_start; +- } + reftable_offset = alloc_clusters_imrt(bs, reftable_clusters, + refcount_table, nb_clusters, + &first_free_cluster); +@@ -2575,24 +2680,55 @@ write_refblocks: + goto fail; + } + +- goto write_refblocks; +- } ++ /* ++ * We need to update the affected refblocks, so re-run the ++ * write_refblocks loop for the reftable's range of clusters. ++ */ ++ assert(offset_into_cluster(s, reftable_offset) == 0); ++ reftable_start_cluster = reftable_offset / s->cluster_size; ++ reftable_end_cluster = reftable_start_cluster + reftable_clusters; ++ reftable_size_changed = ++ rebuild_refcounts_write_refblocks(bs, refcount_table, nb_clusters, ++ reftable_start_cluster, ++ reftable_end_cluster, ++ &on_disk_reftable, ++ &on_disk_reftable_entries); ++ if (reftable_size_changed < 0) { ++ res->check_errors++; ++ ret = reftable_size_changed; ++ goto fail; ++ } ++ ++ /* ++ * If the reftable size has changed, we will need to find a new ++ * allocation, repeating the loop. ++ */ ++ } while (reftable_size_changed); + +- for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) { ++ /* The above loop must have run at least once */ ++ assert(reftable_offset >= 0); ++ ++ /* ++ * All allocations are done, all refblocks are written, convert the ++ * reftable to big-endian and write it to disk. ++ */ ++ ++ for (refblock_index = 0; refblock_index < on_disk_reftable_entries; ++ refblock_index++) ++ { + cpu_to_be64s(&on_disk_reftable[refblock_index]); + } + +- ret = qcow2_pre_write_overlap_check(bs, 0, reftable_offset, +- reftable_size * REFTABLE_ENTRY_SIZE, ++ ret = qcow2_pre_write_overlap_check(bs, 0, reftable_offset, reftable_length, + false); + if (ret < 0) { + fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret)); + goto fail; + } + +- assert(reftable_size < INT_MAX / REFTABLE_ENTRY_SIZE); ++ assert(reftable_length < INT_MAX); + ret = bdrv_pwrite(bs->file, reftable_offset, on_disk_reftable, +- reftable_size * REFTABLE_ENTRY_SIZE); ++ reftable_length); + if (ret < 0) { + fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret)); + goto fail; +@@ -2601,7 +2737,7 @@ write_refblocks: + /* Enter new reftable into the image header */ + reftable_offset_and_clusters.reftable_offset = cpu_to_be64(reftable_offset); + reftable_offset_and_clusters.reftable_clusters = +- cpu_to_be32(size_to_clusters(s, reftable_size * REFTABLE_ENTRY_SIZE)); ++ cpu_to_be32(reftable_clusters); + ret = bdrv_pwrite_sync(bs->file, + offsetof(QCowHeader, refcount_table_offset), + &reftable_offset_and_clusters, +@@ -2611,12 +2747,14 @@ write_refblocks: + goto fail; + } + +- for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) { ++ for (refblock_index = 0; refblock_index < on_disk_reftable_entries; ++ refblock_index++) ++ { + be64_to_cpus(&on_disk_reftable[refblock_index]); + } + s->refcount_table = on_disk_reftable; + s->refcount_table_offset = reftable_offset; +- s->refcount_table_size = reftable_size; ++ s->refcount_table_size = on_disk_reftable_entries; + update_max_refcount_table_index(s); + + return 0; +-- +2.27.0 + diff --git a/SOURCES/kvm-s390x-ipl-support-extended-kernel-command-line-size.patch b/SOURCES/kvm-s390x-ipl-support-extended-kernel-command-line-size.patch new file mode 100644 index 0000000..d62a45a --- /dev/null +++ b/SOURCES/kvm-s390x-ipl-support-extended-kernel-command-line-size.patch @@ -0,0 +1,97 @@ +From ddfee9d393af322938e4df466cd01b8f9570a1c9 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Tue, 5 Apr 2022 10:20:59 +0200 +Subject: [PATCH 1/6] s390x/ipl: support extended kernel command line size + +RH-Author: Thomas Huth +RH-MergeRequest: 144: s390x/ipl: support extended kernel command line size +RH-Commit: [1/1] be227e50af5dbe7802605f873db29ac5358aa196 +RH-Bugzilla: 2043830 +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +Bugzilla: http://bugzilla.redhat.com/2043830 + +commit b2173046a64beed76715f310f98538f159276af1 +Author: Marc Hartmayer +Date: Mon Nov 22 12:29:09 2021 +0100 + + s390x/ipl: support extended kernel command line size + + In the past s390 used a fixed command line length of 896 bytes. This has changed + with the Linux commit 5ecb2da660ab ("s390: support command lines longer than 896 + bytes"). There is now a parm area indicating the maximum command line size. This + parm area has always been initialized to zero, so with older kernels this field + would read zero and we must then assume that only 896 bytes are available. + + Signed-off-by: Marc Hartmayer + Reviewed-by: David Hildenbrand + Reviewed-by: Christian Borntraeger + Acked-by: Viktor Mihajlovski + Message-Id: <20211122112909.18138-1-mhartmay@linux.ibm.com> + [thuth: Cosmetic fixes, and use PRIu64 instead of %lu] + Signed-off-by: Thomas Huth + +Signed-off-by: Thomas Huth +--- + hw/s390x/ipl.c | 27 +++++++++++++++++++++++---- + 1 file changed, 23 insertions(+), 4 deletions(-) + +diff --git a/hw/s390x/ipl.c b/hw/s390x/ipl.c +index 7ddca0127f..eb7fc4c4ae 100644 +--- a/hw/s390x/ipl.c ++++ b/hw/s390x/ipl.c +@@ -37,8 +37,9 @@ + + #define KERN_IMAGE_START 0x010000UL + #define LINUX_MAGIC_ADDR 0x010008UL ++#define KERN_PARM_AREA_SIZE_ADDR 0x010430UL + #define KERN_PARM_AREA 0x010480UL +-#define KERN_PARM_AREA_SIZE 0x000380UL ++#define LEGACY_KERN_PARM_AREA_SIZE 0x000380UL + #define INITRD_START 0x800000UL + #define INITRD_PARM_START 0x010408UL + #define PARMFILE_START 0x001000UL +@@ -110,6 +111,21 @@ static uint64_t bios_translate_addr(void *opaque, uint64_t srcaddr) + return srcaddr + dstaddr; + } + ++static uint64_t get_max_kernel_cmdline_size(void) ++{ ++ uint64_t *size_ptr = rom_ptr(KERN_PARM_AREA_SIZE_ADDR, sizeof(*size_ptr)); ++ ++ if (size_ptr) { ++ uint64_t size; ++ ++ size = be64_to_cpu(*size_ptr); ++ if (size) { ++ return size; ++ } ++ } ++ return LEGACY_KERN_PARM_AREA_SIZE; ++} ++ + static void s390_ipl_realize(DeviceState *dev, Error **errp) + { + MachineState *ms = MACHINE(qdev_get_machine()); +@@ -197,10 +213,13 @@ static void s390_ipl_realize(DeviceState *dev, Error **errp) + ipl->start_addr = KERN_IMAGE_START; + /* Overwrite parameters in the kernel image, which are "rom" */ + if (parm_area) { +- if (cmdline_size > KERN_PARM_AREA_SIZE) { ++ uint64_t max_cmdline_size = get_max_kernel_cmdline_size(); ++ ++ if (cmdline_size > max_cmdline_size) { + error_setg(errp, +- "kernel command line exceeds maximum size: %zu > %lu", +- cmdline_size, KERN_PARM_AREA_SIZE); ++ "kernel command line exceeds maximum size:" ++ " %zu > %" PRIu64, ++ cmdline_size, max_cmdline_size); + return; + } + +-- +2.27.0 + diff --git a/SOURCES/kvm-scsi-generic-Fix-emulated-block-limits-VPD-page.patch b/SOURCES/kvm-scsi-generic-Fix-emulated-block-limits-VPD-page.patch new file mode 100644 index 0000000..0fc1a71 --- /dev/null +++ b/SOURCES/kvm-scsi-generic-Fix-emulated-block-limits-VPD-page.patch @@ -0,0 +1,97 @@ +From a9a4dfdd6312e192e9134d46edfac4c1b1bfa63d Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Mon, 22 Aug 2022 14:53:20 +0200 +Subject: [PATCH] scsi-generic: Fix emulated block limits VPD page + +RH-Author: Kevin Wolf +RH-MergeRequest: 212: scsi-generic: Fix emulated block limits VPD page +RH-Commit: [1/1] d3ba6b2e03039043716ddc6b7d4a424d92249081 +RH-Bugzilla: 2120279 +RH-Acked-by: Emanuele Giuseppe Esposito +RH-Acked-by: Hanna Reitz +RH-Acked-by: Paolo Bonzini +RH-Acked-by: Stefan Hajnoczi + +Commits 01ef8185b80 amd 24b36e9813e updated the way that the maximum +transfer length is calculated for patching block limits VPD page in an +INQUIRY response. + +The same updates also need to be made for the case where the host device +does not support the block limits VPD page at all and we emulate the +whole page. + +Without this fix, on host block devices a maximum transfer length of +(INT_MAX - sector_size) bytes is advertised to the guest, resulting in +I/O errors when a request that exceeds the host limits is made by the +guest. (Prior to commit 24b36e9813e, this code path would use the +max_transfer value from the host instead of INT_MAX, but still miss the +fix from 01ef8185b80 where max_transfer is also capped to max_iov +host pages, so it would be less wrong, but still wrong.) + +Cc: qemu-stable@nongnu.org +Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=2096251 +Fixes: 01ef8185b809af9d287e1a03a3f9d8ea8231118a +Fixes: 24b36e9813ec15da7db62e3b3621730710c5f020 +Signed-off-by: Kevin Wolf +Message-Id: <20220822125320.48257-1-kwolf@redhat.com> +Reviewed-by: Stefan Hajnoczi +Signed-off-by: Kevin Wolf +(cherry picked from commit 51e15194b0a091e5c40aab2eb234a1d36c5c58ee) + +Resolved conflict: qemu_real_host_page_size() is a getter function in +current upstream, but still just a public global variable downstream. + +Signed-off-by: Kevin Wolf +--- + hw/scsi/scsi-generic.c | 21 ++++++++++++++------- + 1 file changed, 14 insertions(+), 7 deletions(-) + +diff --git a/hw/scsi/scsi-generic.c b/hw/scsi/scsi-generic.c +index 0306ccc7b1..3742899839 100644 +--- a/hw/scsi/scsi-generic.c ++++ b/hw/scsi/scsi-generic.c +@@ -147,6 +147,18 @@ static int execute_command(BlockBackend *blk, + return 0; + } + ++static uint64_t calculate_max_transfer(SCSIDevice *s) ++{ ++ uint64_t max_transfer = blk_get_max_hw_transfer(s->conf.blk); ++ uint32_t max_iov = blk_get_max_hw_iov(s->conf.blk); ++ ++ assert(max_transfer); ++ max_transfer = MIN_NON_ZERO(max_transfer, ++ max_iov * qemu_real_host_page_size); ++ ++ return max_transfer / s->blocksize; ++} ++ + static int scsi_handle_inquiry_reply(SCSIGenericReq *r, SCSIDevice *s, int len) + { + uint8_t page, page_idx; +@@ -179,12 +191,7 @@ static int scsi_handle_inquiry_reply(SCSIGenericReq *r, SCSIDevice *s, int len) + (r->req.cmd.buf[1] & 0x01)) { + page = r->req.cmd.buf[2]; + if (page == 0xb0) { +- uint64_t max_transfer = blk_get_max_hw_transfer(s->conf.blk); +- uint32_t max_iov = blk_get_max_hw_iov(s->conf.blk); +- +- assert(max_transfer); +- max_transfer = MIN_NON_ZERO(max_transfer, max_iov * qemu_real_host_page_size) +- / s->blocksize; ++ uint64_t max_transfer = calculate_max_transfer(s); + stl_be_p(&r->buf[8], max_transfer); + /* Also take care of the opt xfer len. */ + stl_be_p(&r->buf[12], +@@ -230,7 +237,7 @@ static int scsi_generic_emulate_block_limits(SCSIGenericReq *r, SCSIDevice *s) + uint8_t buf[64]; + + SCSIBlockLimits bl = { +- .max_io_sectors = blk_get_max_transfer(s->conf.blk) / s->blocksize ++ .max_io_sectors = calculate_max_transfer(s), + }; + + memset(r->buf, 0, r->buflen); +-- +2.35.3 + diff --git a/SOURCES/kvm-target-i386-kvm-do-not-access-uninitialized-variable.patch b/SOURCES/kvm-target-i386-kvm-do-not-access-uninitialized-variable.patch new file mode 100644 index 0000000..04db85f --- /dev/null +++ b/SOURCES/kvm-target-i386-kvm-do-not-access-uninitialized-variable.patch @@ -0,0 +1,73 @@ +From 688c9f386635544dbc468171a32fbc84f0c9224e Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Fri, 18 Mar 2022 16:23:47 +0100 +Subject: [PATCH 12/24] target/i386: kvm: do not access uninitialized variable + on older kernels + +RH-Author: Paul Lai +RH-MergeRequest: 176: Enable KVM AMX support +RH-Commit: [12/13] 776fac1e7d1aa16ec5f4d99ddad3039eab8212af +RH-Bugzilla: 1916415 +RH-Acked-by: Cornelia Huck +RH-Acked-by: Igor Mammedov +RH-Acked-by: Paolo Bonzini + +KVM support for AMX includes a new system attribute, KVM_X86_XCOMP_GUEST_SUPP. +Commit 19db68ca68 ("x86: Grant AMX permission for guest", 2022-03-15) however +did not fully consider the behavior on older kernels. First, it warns +too aggressively. Second, it invokes the KVM_GET_DEVICE_ATTR ioctl +unconditionally and then uses the "bitmask" variable, which remains +uninitialized if the ioctl fails. Third, kvm_ioctl returns -errno rather +than -1 on errors. + +While at it, explain why the ioctl is needed and KVM_GET_SUPPORTED_CPUID +is not enough. + +Signed-off-by: Paolo Bonzini +(cherry picked from commit 3ec5ad40081b14af28496198b4d08dbe13386790) +Signed-off-by: Paul Lai +--- + target/i386/kvm/kvm.c | 17 +++++++++++++---- + 1 file changed, 13 insertions(+), 4 deletions(-) + +diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c +index b1128b0e07..bd439e56ad 100644 +--- a/target/i386/kvm/kvm.c ++++ b/target/i386/kvm/kvm.c +@@ -409,6 +409,12 @@ uint32_t kvm_arch_get_supported_cpuid(KVMState *s, uint32_t function, + } + } else if (function == 0xd && index == 0 && + (reg == R_EAX || reg == R_EDX)) { ++ /* ++ * The value returned by KVM_GET_SUPPORTED_CPUID does not include ++ * features that still have to be enabled with the arch_prctl ++ * system call. QEMU needs the full value, which is retrieved ++ * with KVM_GET_DEVICE_ATTR. ++ */ + struct kvm_device_attr attr = { + .group = 0, + .attr = KVM_X86_XCOMP_GUEST_SUPP, +@@ -417,13 +423,16 @@ uint32_t kvm_arch_get_supported_cpuid(KVMState *s, uint32_t function, + + bool sys_attr = kvm_check_extension(s, KVM_CAP_SYS_ATTRIBUTES); + if (!sys_attr) { +- warn_report("cannot get sys attribute capabilities %d", sys_attr); ++ return ret; + } + + int rc = kvm_ioctl(s, KVM_GET_DEVICE_ATTR, &attr); +- if (rc == -1 && (errno == ENXIO || errno == EINVAL)) { +- warn_report("KVM_GET_DEVICE_ATTR(0, KVM_X86_XCOMP_GUEST_SUPP) " +- "error: %d", rc); ++ if (rc < 0) { ++ if (rc != -ENXIO) { ++ warn_report("KVM_GET_DEVICE_ATTR(0, KVM_X86_XCOMP_GUEST_SUPP) " ++ "error: %d", rc); ++ } ++ return ret; + } + ret = (reg == R_EAX) ? bitmask : bitmask >> 32; + } else if (function == 0x80000001 && reg == R_ECX) { +-- +2.35.3 + diff --git a/SOURCES/kvm-target-i386-properly-reset-TSC-on-reset.patch b/SOURCES/kvm-target-i386-properly-reset-TSC-on-reset.patch new file mode 100644 index 0000000..47ce2af --- /dev/null +++ b/SOURCES/kvm-target-i386-properly-reset-TSC-on-reset.patch @@ -0,0 +1,83 @@ +From 416de21d11540a927cceb533bf54ce28ffa15ad6 Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Thu, 24 Mar 2022 09:21:41 +0100 +Subject: [PATCH 2/3] target/i386: properly reset TSC on reset + +RH-Author: Paolo Bonzini +RH-MergeRequest: 172: target/i386: properly reset TSC on reset +RH-Commit: [1/1] 7008bc5d02ad0a2d8b78259459d22d8f0986c989 +RH-Bugzilla: 2070417 +RH-Acked-by: Marcelo Tosatti +RH-Acked-by: Igor Mammedov +RH-Acked-by: Vitaly Kuznetsov + +Some versions of Windows hang on reboot if their TSC value is greater +than 2^54. The calibration of the Hyper-V reference time overflows +and fails; as a result the processors' clock sources are out of sync. + +The issue is that the TSC _should_ be reset to 0 on CPU reset and +QEMU tries to do that. However, KVM special cases writing 0 to the +TSC and thinks that QEMU is trying to hot-plug a CPU, which is +correct the first time through but not later. Thwart this valiant +effort and reset the TSC to 1 instead, but only if the CPU has been +run once. + +For this to work, env->tsc has to be moved to the part of CPUArchState +that is not zeroed at the beginning of x86_cpu_reset. + +Reported-by: Vadim Rozenfeld +Supersedes: <20220324082346.72180-1-pbonzini@redhat.com> +Signed-off-by: Paolo Bonzini +(cherry picked from commit 5286c3662294119dc2dd1e9296757337211451f6) +--- + target/i386/cpu.c | 13 +++++++++++++ + target/i386/cpu.h | 2 +- + 2 files changed, 14 insertions(+), 1 deletion(-) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index 6e25d13339..dd6935b1dd 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -5871,6 +5871,19 @@ static void x86_cpu_reset(DeviceState *dev) + env->xstate_bv = 0; + + env->pat = 0x0007040600070406ULL; ++ ++ if (kvm_enabled()) { ++ /* ++ * KVM handles TSC = 0 specially and thinks we are hot-plugging ++ * a new CPU, use 1 instead to force a reset. ++ */ ++ if (env->tsc != 0) { ++ env->tsc = 1; ++ } ++ } else { ++ env->tsc = 0; ++ } ++ + env->msr_ia32_misc_enable = MSR_IA32_MISC_ENABLE_DEFAULT; + if (env->features[FEAT_1_ECX] & CPUID_EXT_MONITOR) { + env->msr_ia32_misc_enable |= MSR_IA32_MISC_ENABLE_MWAIT; +diff --git a/target/i386/cpu.h b/target/i386/cpu.h +index 04f2b790c9..c6a6c871f1 100644 +--- a/target/i386/cpu.h ++++ b/target/i386/cpu.h +@@ -1510,7 +1510,6 @@ typedef struct CPUX86State { + target_ulong kernelgsbase; + #endif + +- uint64_t tsc; + uint64_t tsc_adjust; + uint64_t tsc_deadline; + uint64_t tsc_aux; +@@ -1660,6 +1659,7 @@ typedef struct CPUX86State { + int64_t tsc_khz; + int64_t user_tsc_khz; /* for sanity check only */ + uint64_t apic_bus_freq; ++ uint64_t tsc; + #if defined(CONFIG_KVM) || defined(CONFIG_HVF) + void *xsave_buf; + uint32_t xsave_buf_len; +-- +2.35.1 + diff --git a/SOURCES/kvm-tests-qtest-fdc-test-Add-a-regression-test-for-CVE-2.patch b/SOURCES/kvm-tests-qtest-fdc-test-Add-a-regression-test-for-CVE-2.patch new file mode 100644 index 0000000..4c04458 --- /dev/null +++ b/SOURCES/kvm-tests-qtest-fdc-test-Add-a-regression-test-for-CVE-2.patch @@ -0,0 +1,120 @@ +From 24af433728429578e586d179e27451b7d4a46cba Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= +Date: Thu, 18 Nov 2021 12:57:33 +0100 +Subject: [PATCH 3/3] tests/qtest/fdc-test: Add a regression test for + CVE-2021-3507 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jon Maloy +RH-MergeRequest: 194: hw/block/fdc: Prevent end-of-track overrun (CVE-2021-3507) +RH-Commit: [2/2] 31ec71276b521b06d4142fffa88a3fa4d1494d92 (jmaloy/qemu-kvm) +RH-Bugzilla: 1951521 +RH-Acked-by: Dr. David Alan Gilbert +RH-Acked-by: Thomas Huth +RH-Acked-by: Hanna Reitz + +Add the reproducer from https://gitlab.com/qemu-project/qemu/-/issues/339 + +Without the previous commit, when running 'make check-qtest-i386' +with QEMU configured with '--enable-sanitizers' we get: + + ==4028352==ERROR: AddressSanitizer: heap-buffer-overflow on address 0x619000062a00 at pc 0x5626d03c491a bp 0x7ffdb4199410 sp 0x7ffdb4198bc0 + READ of size 786432 at 0x619000062a00 thread T0 + #0 0x5626d03c4919 in __asan_memcpy (qemu-system-i386+0x1e65919) + #1 0x5626d1c023cc in flatview_write_continue softmmu/physmem.c:2787:13 + #2 0x5626d1bf0c0f in flatview_write softmmu/physmem.c:2822:14 + #3 0x5626d1bf0798 in address_space_write softmmu/physmem.c:2914:18 + #4 0x5626d1bf0f37 in address_space_rw softmmu/physmem.c:2924:16 + #5 0x5626d1bf14c8 in cpu_physical_memory_rw softmmu/physmem.c:2933:5 + #6 0x5626d0bd5649 in cpu_physical_memory_write include/exec/cpu-common.h:82:5 + #7 0x5626d0bd0a07 in i8257_dma_write_memory hw/dma/i8257.c:452:9 + #8 0x5626d09f825d in fdctrl_transfer_handler hw/block/fdc.c:1616:13 + #9 0x5626d0a048b4 in fdctrl_start_transfer hw/block/fdc.c:1539:13 + #10 0x5626d09f4c3e in fdctrl_write_data hw/block/fdc.c:2266:13 + #11 0x5626d09f22f7 in fdctrl_write hw/block/fdc.c:829:9 + #12 0x5626d1c20bc5 in portio_write softmmu/ioport.c:207:17 + + 0x619000062a00 is located 0 bytes to the right of 512-byte region [0x619000062800,0x619000062a00) + allocated by thread T0 here: + #0 0x5626d03c66ec in posix_memalign (qemu-system-i386+0x1e676ec) + #1 0x5626d2b988d4 in qemu_try_memalign util/oslib-posix.c:210:11 + #2 0x5626d2b98b0c in qemu_memalign util/oslib-posix.c:226:27 + #3 0x5626d09fbaf0 in fdctrl_realize_common hw/block/fdc.c:2341:20 + #4 0x5626d0a150ed in isabus_fdc_realize hw/block/fdc-isa.c:113:5 + #5 0x5626d2367935 in device_set_realized hw/core/qdev.c:531:13 + + SUMMARY: AddressSanitizer: heap-buffer-overflow (qemu-system-i386+0x1e65919) in __asan_memcpy + Shadow bytes around the buggy address: + 0x0c32800044f0: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa + 0x0c3280004500: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 + 0x0c3280004510: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 + 0x0c3280004520: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 + 0x0c3280004530: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 + =>0x0c3280004540:[fa]fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa + 0x0c3280004550: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa + 0x0c3280004560: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa + 0x0c3280004570: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa + 0x0c3280004580: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa + 0x0c3280004590: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd + Shadow byte legend (one shadow byte represents 8 application bytes): + Addressable: 00 + Heap left redzone: fa + Freed heap region: fd + ==4028352==ABORTING + +[ kwolf: Added snapshot=on to prevent write file lock failure ] + +Reported-by: Alexander Bulekov +Signed-off-by: Philippe Mathieu-Daudé +Reviewed-by: Alexander Bulekov +Signed-off-by: Kevin Wolf +(cherry picked from commit 46609b90d9e3a6304def11038a76b58ff43f77bc) +Signed-off-by: Jon Maloy +--- + tests/qtest/fdc-test.c | 21 +++++++++++++++++++++ + 1 file changed, 21 insertions(+) + +diff --git a/tests/qtest/fdc-test.c b/tests/qtest/fdc-test.c +index 8f6eee84a4..6f5850354f 100644 +--- a/tests/qtest/fdc-test.c ++++ b/tests/qtest/fdc-test.c +@@ -583,6 +583,26 @@ static void test_cve_2021_20196(void) + qtest_quit(s); + } + ++static void test_cve_2021_3507(void) ++{ ++ QTestState *s; ++ ++ s = qtest_initf("-nographic -m 32M -nodefaults " ++ "-drive file=%s,format=raw,if=floppy,snapshot=on", ++ test_image); ++ qtest_outl(s, 0x9, 0x0a0206); ++ qtest_outw(s, 0x3f4, 0x1600); ++ qtest_outw(s, 0x3f4, 0x0000); ++ qtest_outw(s, 0x3f4, 0x0000); ++ qtest_outw(s, 0x3f4, 0x0000); ++ qtest_outw(s, 0x3f4, 0x0200); ++ qtest_outw(s, 0x3f4, 0x0200); ++ qtest_outw(s, 0x3f4, 0x0000); ++ qtest_outw(s, 0x3f4, 0x0000); ++ qtest_outw(s, 0x3f4, 0x0000); ++ qtest_quit(s); ++} ++ + int main(int argc, char **argv) + { + int fd; +@@ -614,6 +634,7 @@ int main(int argc, char **argv) + qtest_add_func("/fdc/read_no_dma_19", test_read_no_dma_19); + qtest_add_func("/fdc/fuzz-registers", fuzz_registers); + qtest_add_func("/fdc/fuzz/cve_2021_20196", test_cve_2021_20196); ++ qtest_add_func("/fdc/fuzz/cve_2021_3507", test_cve_2021_3507); + + ret = g_test_run(); + +-- +2.35.3 + diff --git a/SOURCES/kvm-ui-cursor-fix-integer-overflow-in-cursor_alloc-CVE-2.patch b/SOURCES/kvm-ui-cursor-fix-integer-overflow-in-cursor_alloc-CVE-2.patch new file mode 100644 index 0000000..813d0ca --- /dev/null +++ b/SOURCES/kvm-ui-cursor-fix-integer-overflow-in-cursor_alloc-CVE-2.patch @@ -0,0 +1,105 @@ +From 87a318f0b8758f940a316831a77b6ebebca42b19 Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Wed, 4 May 2022 10:35:17 -0400 +Subject: [PATCH 3/3] ui/cursor: fix integer overflow in cursor_alloc + (CVE-2021-4206) +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jon Maloy +RH-MergeRequest: 180: ui/cursor: fix integer overflow in cursor_alloc (CVE-2021-4206) +RH-Commit: [1/1] 7ad711347bc6248dc5aefa45401ca74448dee5e5 (jmaloy/qemu-kvm) +RH-Bugzilla: 2040734 +RH-Acked-by: Dr. David Alan Gilbert +RH-Acked-by: Mauro Matteo Cascella +RH-Acked-by: Gerd Hoffmann + +BZ: https://bugzilla.redhat.com/show_bug.cgi?id=2040734 +Upstream: Merged +CVE: CVE-2021-4206 + +commit fa892e9abb728e76afcf27323ab29c57fb0fe7aa +Author: Mauro Matteo Cascella +Date: Thu Apr 7 10:17:12 2022 +0200 + + ui/cursor: fix integer overflow in cursor_alloc (CVE-2021-4206) + + Prevent potential integer overflow by limiting 'width' and 'height' to + 512x512. Also change 'datasize' type to size_t. Refer to security + advisory https://starlabs.sg/advisories/22-4206/ for more information. + + Fixes: CVE-2021-4206 + Signed-off-by: Mauro Matteo Cascella + Reviewed-by: Marc-André Lureau + Message-Id: <20220407081712.345609-1-mcascell@redhat.com> + Signed-off-by: Gerd Hoffmann + +(cherry picked from commit fa892e9abb728e76afcf27323ab29c57fb0fe7aa) +Signed-off-by: Jon Maloy +--- + hw/display/qxl-render.c | 7 +++++++ + hw/display/vmware_vga.c | 2 ++ + ui/cursor.c | 8 +++++++- + 3 files changed, 16 insertions(+), 1 deletion(-) + +diff --git a/hw/display/qxl-render.c b/hw/display/qxl-render.c +index 237ed293ba..ca217004bf 100644 +--- a/hw/display/qxl-render.c ++++ b/hw/display/qxl-render.c +@@ -247,6 +247,13 @@ static QEMUCursor *qxl_cursor(PCIQXLDevice *qxl, QXLCursor *cursor, + size_t size; + + c = cursor_alloc(cursor->header.width, cursor->header.height); ++ ++ if (!c) { ++ qxl_set_guest_bug(qxl, "%s: cursor %ux%u alloc error", __func__, ++ cursor->header.width, cursor->header.height); ++ goto fail; ++ } ++ + c->hot_x = cursor->header.hot_spot_x; + c->hot_y = cursor->header.hot_spot_y; + switch (cursor->header.type) { +diff --git a/hw/display/vmware_vga.c b/hw/display/vmware_vga.c +index e2969a6c81..2b81d6122f 100644 +--- a/hw/display/vmware_vga.c ++++ b/hw/display/vmware_vga.c +@@ -509,6 +509,8 @@ static inline void vmsvga_cursor_define(struct vmsvga_state_s *s, + int i, pixels; + + qc = cursor_alloc(c->width, c->height); ++ assert(qc != NULL); ++ + qc->hot_x = c->hot_x; + qc->hot_y = c->hot_y; + switch (c->bpp) { +diff --git a/ui/cursor.c b/ui/cursor.c +index 1d62ddd4d0..835f0802f9 100644 +--- a/ui/cursor.c ++++ b/ui/cursor.c +@@ -46,6 +46,8 @@ static QEMUCursor *cursor_parse_xpm(const char *xpm[]) + + /* parse pixel data */ + c = cursor_alloc(width, height); ++ assert(c != NULL); ++ + for (pixel = 0, y = 0; y < height; y++, line++) { + for (x = 0; x < height; x++, pixel++) { + idx = xpm[line][x]; +@@ -91,7 +93,11 @@ QEMUCursor *cursor_builtin_left_ptr(void) + QEMUCursor *cursor_alloc(int width, int height) + { + QEMUCursor *c; +- int datasize = width * height * sizeof(uint32_t); ++ size_t datasize = width * height * sizeof(uint32_t); ++ ++ if (width > 512 || height > 512) { ++ return NULL; ++ } + + c = g_malloc0(sizeof(QEMUCursor) + datasize); + c->width = width; +-- +2.35.1 + diff --git a/SOURCES/kvm-vhost-net-fix-improper-cleanup-in-vhost_net_start.patch b/SOURCES/kvm-vhost-net-fix-improper-cleanup-in-vhost_net_start.patch new file mode 100644 index 0000000..c3dbcf9 --- /dev/null +++ b/SOURCES/kvm-vhost-net-fix-improper-cleanup-in-vhost_net_start.patch @@ -0,0 +1,56 @@ +From 9a62319b973ec33f9ccbeeae7f2f3b4b31db0c26 Mon Sep 17 00:00:00 2001 +From: Si-Wei Liu +Date: Fri, 6 May 2022 19:28:15 -0700 +Subject: [PATCH 17/24] vhost-net: fix improper cleanup in vhost_net_start +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jason Wang +RH-MergeRequest: 187: Multiqueue fixes for vhost-vDPA +RH-Commit: [4/7] bebe7990a12e901fbb84e5e4b7a62744d75c9d9e +RH-Bugzilla: 2069946 +RH-Acked-by: Eugenio Pérez +RH-Acked-by: Cindy Lu +RH-Acked-by: Laurent Vivier + +vhost_net_start() missed a corresponding stop_one() upon error from +vhost_set_vring_enable(). While at it, make the error handling for +err_start more robust. No real issue was found due to this though. + +Signed-off-by: Si-Wei Liu +Acked-by: Jason Wang +Message-Id: <1651890498-24478-5-git-send-email-si-wei.liu@oracle.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +(cherry picked from commit 6f3910b5eee00b8cc959e94659c0d524c482a418) +Signed-off-by: Jason Wang +--- + hw/net/vhost_net.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c +index 30379d2ca4..d6d7c51f62 100644 +--- a/hw/net/vhost_net.c ++++ b/hw/net/vhost_net.c +@@ -381,6 +381,7 @@ int vhost_net_start(VirtIODevice *dev, NetClientState *ncs, + r = vhost_set_vring_enable(peer, peer->vring_enable); + + if (r < 0) { ++ vhost_net_stop_one(get_vhost_net(peer), dev); + goto err_start; + } + } +@@ -390,7 +391,8 @@ int vhost_net_start(VirtIODevice *dev, NetClientState *ncs, + + err_start: + while (--i >= 0) { +- peer = qemu_get_peer(ncs , i); ++ peer = qemu_get_peer(ncs, i < data_queue_pairs ? ++ i : n->max_queue_pairs); + vhost_net_stop_one(get_vhost_net(peer), dev); + } + e = k->set_guest_notifiers(qbus->parent, total_notifiers, false); +-- +2.35.3 + diff --git a/SOURCES/kvm-vhost-vdpa-backend-feature-should-set-only-once.patch b/SOURCES/kvm-vhost-vdpa-backend-feature-should-set-only-once.patch new file mode 100644 index 0000000..ef700fd --- /dev/null +++ b/SOURCES/kvm-vhost-vdpa-backend-feature-should-set-only-once.patch @@ -0,0 +1,58 @@ +From 01270bb66a4f7897a4fd06ba248eeeb41dc47571 Mon Sep 17 00:00:00 2001 +From: Si-Wei Liu +Date: Fri, 6 May 2022 19:28:16 -0700 +Subject: [PATCH 18/24] vhost-vdpa: backend feature should set only once +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jason Wang +RH-MergeRequest: 187: Multiqueue fixes for vhost-vDPA +RH-Commit: [5/7] 0ab13542cf25c129dc403db95c7db12cdb012744 +RH-Bugzilla: 2069946 +RH-Acked-by: Eugenio Pérez +RH-Acked-by: Cindy Lu +RH-Acked-by: Laurent Vivier + +The vhost_vdpa_one_time_request() branch in +vhost_vdpa_set_backend_cap() incorrectly sends down +ioctls on vhost_dev with non-zero index. This may +end up with multiple VHOST_SET_BACKEND_FEATURES +ioctl calls sent down on the vhost-vdpa fd that is +shared between all these vhost_dev's. + +To fix it, send down ioctl only once via the first +vhost_dev with index 0. Toggle the polarity of the +vhost_vdpa_one_time_request() test should do the +trick. + +Fixes: 4d191cfdc7de ("vhost-vdpa: classify one time request") +Signed-off-by: Si-Wei Liu +Reviewed-by: Stefano Garzarella +Acked-by: Jason Wang +Acked-by: Eugenio Pérez +Message-Id: <1651890498-24478-6-git-send-email-si-wei.liu@oracle.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +(cherry picked from commit 6aee7e4233f6467f69531fcd352adff028f3f5ea) +Signed-off-by: Jason Wang +--- + hw/virtio/vhost-vdpa.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c +index 78da48a333..a9be24776a 100644 +--- a/hw/virtio/vhost-vdpa.c ++++ b/hw/virtio/vhost-vdpa.c +@@ -525,7 +525,7 @@ static int vhost_vdpa_set_backend_cap(struct vhost_dev *dev) + + features &= f; + +- if (vhost_vdpa_one_time_request(dev)) { ++ if (!vhost_vdpa_one_time_request(dev)) { + r = vhost_vdpa_call(dev, VHOST_SET_BACKEND_FEATURES, &features); + if (r) { + return -EFAULT; +-- +2.35.3 + diff --git a/SOURCES/kvm-vhost-vdpa-change-name-and-polarity-for-vhost_vdpa_o.patch b/SOURCES/kvm-vhost-vdpa-change-name-and-polarity-for-vhost_vdpa_o.patch new file mode 100644 index 0000000..bbc1c85 --- /dev/null +++ b/SOURCES/kvm-vhost-vdpa-change-name-and-polarity-for-vhost_vdpa_o.patch @@ -0,0 +1,126 @@ +From c8cb46fa93a3ccad6f3e183045b270f28eed7b12 Mon Sep 17 00:00:00 2001 +From: Si-Wei Liu +Date: Fri, 6 May 2022 19:28:17 -0700 +Subject: [PATCH 19/24] vhost-vdpa: change name and polarity for + vhost_vdpa_one_time_request() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jason Wang +RH-MergeRequest: 187: Multiqueue fixes for vhost-vDPA +RH-Commit: [6/7] 727ab0bb813f073e8cd2f7e68a9acda60c2cb33d +RH-Bugzilla: 2069946 +RH-Acked-by: Eugenio Pérez +RH-Acked-by: Cindy Lu +RH-Acked-by: Laurent Vivier + +The name vhost_vdpa_one_time_request() was confusing. No +matter whatever it returns, its typical occurrence had +always been at requests that only need to be applied once. +And the name didn't suggest what it actually checks for. +Change it to vhost_vdpa_first_dev() with polarity flipped +for better readibility of code. That way it is able to +reflect what the check is really about. + +This call is applicable to request which performs operation +only once, before queues are set up, and usually at the beginning +of the caller function. Document the requirement for it in place. + +Conflicts: hw/virtio/vhost-vdpa.c since we don't have shadow virtqueue +suport. + +Signed-off-by: Si-Wei Liu +Message-Id: <1651890498-24478-7-git-send-email-si-wei.liu@oracle.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +Reviewed-by: Stefano Garzarella +Acked-by: Jason Wang +(cherry picked from commit d71b0609fc04217e28d17009f04d74b08be6f466) +Signed-off-by: Jason Wang +--- + hw/virtio/vhost-vdpa.c | 23 +++++++++++++++-------- + 1 file changed, 15 insertions(+), 8 deletions(-) + +diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c +index a9be24776a..38bbcb3c18 100644 +--- a/hw/virtio/vhost-vdpa.c ++++ b/hw/virtio/vhost-vdpa.c +@@ -319,11 +319,18 @@ static void vhost_vdpa_get_iova_range(struct vhost_vdpa *v) + v->iova_range.last); + } + +-static bool vhost_vdpa_one_time_request(struct vhost_dev *dev) ++/* ++ * The use of this function is for requests that only need to be ++ * applied once. Typically such request occurs at the beginning ++ * of operation, and before setting up queues. It should not be ++ * used for request that performs operation until all queues are ++ * set, which would need to check dev->vq_index_end instead. ++ */ ++static bool vhost_vdpa_first_dev(struct vhost_dev *dev) + { + struct vhost_vdpa *v = dev->opaque; + +- return v->index != 0; ++ return v->index == 0; + } + + static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp) +@@ -351,7 +358,7 @@ static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp) + + vhost_vdpa_get_iova_range(v); + +- if (vhost_vdpa_one_time_request(dev)) { ++ if (!vhost_vdpa_first_dev(dev)) { + return 0; + } + +@@ -468,7 +475,7 @@ static int vhost_vdpa_memslots_limit(struct vhost_dev *dev) + static int vhost_vdpa_set_mem_table(struct vhost_dev *dev, + struct vhost_memory *mem) + { +- if (vhost_vdpa_one_time_request(dev)) { ++ if (!vhost_vdpa_first_dev(dev)) { + return 0; + } + +@@ -496,7 +503,7 @@ static int vhost_vdpa_set_features(struct vhost_dev *dev, + { + int ret; + +- if (vhost_vdpa_one_time_request(dev)) { ++ if (!vhost_vdpa_first_dev(dev)) { + return 0; + } + +@@ -525,7 +532,7 @@ static int vhost_vdpa_set_backend_cap(struct vhost_dev *dev) + + features &= f; + +- if (!vhost_vdpa_one_time_request(dev)) { ++ if (vhost_vdpa_first_dev(dev)) { + r = vhost_vdpa_call(dev, VHOST_SET_BACKEND_FEATURES, &features); + if (r) { + return -EFAULT; +@@ -670,7 +677,7 @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started) + static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base, + struct vhost_log *log) + { +- if (vhost_vdpa_one_time_request(dev)) { ++ if (!vhost_vdpa_first_dev(dev)) { + return 0; + } + +@@ -739,7 +746,7 @@ static int vhost_vdpa_get_features(struct vhost_dev *dev, + + static int vhost_vdpa_set_owner(struct vhost_dev *dev) + { +- if (vhost_vdpa_one_time_request(dev)) { ++ if (!vhost_vdpa_first_dev(dev)) { + return 0; + } + +-- +2.35.3 + diff --git a/SOURCES/kvm-vhost-vdpa-fix-improper-cleanup-in-net_init_vhost_vd.patch b/SOURCES/kvm-vhost-vdpa-fix-improper-cleanup-in-net_init_vhost_vd.patch new file mode 100644 index 0000000..68c7d5f --- /dev/null +++ b/SOURCES/kvm-vhost-vdpa-fix-improper-cleanup-in-net_init_vhost_vd.patch @@ -0,0 +1,48 @@ +From c10ef6f79d4a4c8ccc5901b25234501c621e4e04 Mon Sep 17 00:00:00 2001 +From: Si-Wei Liu +Date: Fri, 6 May 2022 19:28:14 -0700 +Subject: [PATCH 16/24] vhost-vdpa: fix improper cleanup in net_init_vhost_vdpa +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jason Wang +RH-MergeRequest: 187: Multiqueue fixes for vhost-vDPA +RH-Commit: [3/7] b3b658dcb4695defe1fdb199570fb984291e8e21 +RH-Bugzilla: 2069946 +RH-Acked-by: Eugenio Pérez +RH-Acked-by: Cindy Lu +RH-Acked-by: Laurent Vivier + +... such that no memory leaks on dangling net clients in case of +error. + +Signed-off-by: Si-Wei Liu +Acked-by: Jason Wang +Message-Id: <1651890498-24478-4-git-send-email-si-wei.liu@oracle.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +(cherry picked from commit 9bd055073e375c8a0d7ebce925e05d914d69fc7f) +Signed-off-by: Jason Wang +--- + net/vhost-vdpa.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c +index 25dd6dd975..814f704687 100644 +--- a/net/vhost-vdpa.c ++++ b/net/vhost-vdpa.c +@@ -306,7 +306,9 @@ int net_init_vhost_vdpa(const Netdev *netdev, const char *name, + + err: + if (i) { +- qemu_del_net_client(ncs[0]); ++ for (i--; i >= 0; i--) { ++ qemu_del_net_client(ncs[i]); ++ } + } + qemu_close(vdpa_device_fd); + g_free(ncs); +-- +2.35.3 + diff --git a/SOURCES/kvm-virtio-gpu-do-not-byteswap-padding.patch b/SOURCES/kvm-virtio-gpu-do-not-byteswap-padding.patch new file mode 100644 index 0000000..dc723bd --- /dev/null +++ b/SOURCES/kvm-virtio-gpu-do-not-byteswap-padding.patch @@ -0,0 +1,48 @@ +From e118a451dc1ed68f1371a5d8e042120542be6d31 Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Thu, 11 Nov 2021 12:06:00 +0100 +Subject: [PATCH 01/24] virtio-gpu: do not byteswap padding +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Paul Lai +RH-MergeRequest: 176: Enable KVM AMX support +RH-Commit: [1/13] 12714f53820b7632e7fc0a8a3bf8eb4a64f41750 +RH-Bugzilla: 1916415 +RH-Acked-by: Cornelia Huck +RH-Acked-by: Igor Mammedov +RH-Acked-by: Paolo Bonzini + +In Linux 5.16, the padding of struct virtio_gpu_ctrl_hdr has become a +single-byte field followed by a uint8_t[3] array of padding bytes, +and virtio_gpu_ctrl_hdr_bswap does not compile anymore. + +Signed-off-by: Paolo Bonzini +Acked-by: Cornelia Huck +Reviewed-by: Alex Bennée +Reviewed-by: Michael S. Tsirkin +Reviewed-by: Philippe Mathieu-Daudé +Message-Id: <20211111110604.207376-2-pbonzini@redhat.com> +Signed-off-by: Paolo Bonzini +(cherry picked from commit a4663f1a5506626175fc64c86e52135587c36872) +Signed-off-by: Paul Lai +--- + include/hw/virtio/virtio-gpu-bswap.h | 1 - + 1 file changed, 1 deletion(-) + +diff --git a/include/hw/virtio/virtio-gpu-bswap.h b/include/hw/virtio/virtio-gpu-bswap.h +index e2bee8f595..5faac0d8d5 100644 +--- a/include/hw/virtio/virtio-gpu-bswap.h ++++ b/include/hw/virtio/virtio-gpu-bswap.h +@@ -24,7 +24,6 @@ virtio_gpu_ctrl_hdr_bswap(struct virtio_gpu_ctrl_hdr *hdr) + le32_to_cpus(&hdr->flags); + le64_to_cpus(&hdr->fence_id); + le32_to_cpus(&hdr->ctx_id); +- le32_to_cpus(&hdr->padding); + } + + static inline void +-- +2.35.3 + diff --git a/SOURCES/kvm-virtio-net-align-ctrl_vq-index-for-non-mq-guest-for-.patch b/SOURCES/kvm-virtio-net-align-ctrl_vq-index-for-non-mq-guest-for-.patch new file mode 100644 index 0000000..f23f38c --- /dev/null +++ b/SOURCES/kvm-virtio-net-align-ctrl_vq-index-for-non-mq-guest-for-.patch @@ -0,0 +1,143 @@ +From 39cdd781c885b0695f8830a33420caa9e9b0bd50 Mon Sep 17 00:00:00 2001 +From: Si-Wei Liu +Date: Fri, 6 May 2022 19:28:13 -0700 +Subject: [PATCH 15/24] virtio-net: align ctrl_vq index for non-mq guest for + vhost_vdpa +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jason Wang +RH-MergeRequest: 187: Multiqueue fixes for vhost-vDPA +RH-Commit: [2/7] 2647cf59f3dd1e3d8af2d12c01e06ae26fbc1dc2 +RH-Bugzilla: 2069946 +RH-Acked-by: Eugenio Pérez +RH-Acked-by: Cindy Lu +RH-Acked-by: Laurent Vivier + +With MQ enabled vdpa device and non-MQ supporting guest e.g. +booting vdpa with mq=on over OVMF of single vqp, below assert +failure is seen: + +../hw/virtio/vhost-vdpa.c:560: vhost_vdpa_get_vq_index: Assertion `idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs' failed. + +0 0x00007f8ce3ff3387 in raise () at /lib64/libc.so.6 +1 0x00007f8ce3ff4a78 in abort () at /lib64/libc.so.6 +2 0x00007f8ce3fec1a6 in __assert_fail_base () at /lib64/libc.so.6 +3 0x00007f8ce3fec252 in () at /lib64/libc.so.6 +4 0x0000558f52d79421 in vhost_vdpa_get_vq_index (dev=, idx=) at ../hw/virtio/vhost-vdpa.c:563 +5 0x0000558f52d79421 in vhost_vdpa_get_vq_index (dev=, idx=) at ../hw/virtio/vhost-vdpa.c:558 +6 0x0000558f52d7329a in vhost_virtqueue_mask (hdev=0x558f55c01800, vdev=0x558f568f91f0, n=2, mask=) at ../hw/virtio/vhost.c:1557 +7 0x0000558f52c6b89a in virtio_pci_set_guest_notifier (d=d@entry=0x558f568f0f60, n=n@entry=2, assign=assign@entry=true, with_irqfd=with_irqfd@entry=false) + at ../hw/virtio/virtio-pci.c:974 +8 0x0000558f52c6c0d8 in virtio_pci_set_guest_notifiers (d=0x558f568f0f60, nvqs=3, assign=true) at ../hw/virtio/virtio-pci.c:1019 +9 0x0000558f52bf091d in vhost_net_start (dev=dev@entry=0x558f568f91f0, ncs=0x558f56937cd0, data_queue_pairs=data_queue_pairs@entry=1, cvq=cvq@entry=1) + at ../hw/net/vhost_net.c:361 +10 0x0000558f52d4e5e7 in virtio_net_set_status (status=, n=0x558f568f91f0) at ../hw/net/virtio-net.c:289 +11 0x0000558f52d4e5e7 in virtio_net_set_status (vdev=0x558f568f91f0, status=15 '\017') at ../hw/net/virtio-net.c:370 +12 0x0000558f52d6c4b2 in virtio_set_status (vdev=vdev@entry=0x558f568f91f0, val=val@entry=15 '\017') at ../hw/virtio/virtio.c:1945 +13 0x0000558f52c69eff in virtio_pci_common_write (opaque=0x558f568f0f60, addr=, val=, size=) at ../hw/virtio/virtio-pci.c:1292 +14 0x0000558f52d15d6e in memory_region_write_accessor (mr=0x558f568f19d0, addr=20, value=, size=1, shift=, mask=, attrs=...) + at ../softmmu/memory.c:492 +15 0x0000558f52d127de in access_with_adjusted_size (addr=addr@entry=20, value=value@entry=0x7f8cdbffe748, size=size@entry=1, access_size_min=, access_size_max=, access_fn=0x558f52d15cf0 , mr=0x558f568f19d0, attrs=...) at ../softmmu/memory.c:554 +16 0x0000558f52d157ef in memory_region_dispatch_write (mr=mr@entry=0x558f568f19d0, addr=20, data=, op=, attrs=attrs@entry=...) + at ../softmmu/memory.c:1504 +17 0x0000558f52d078e7 in flatview_write_continue (fv=fv@entry=0x7f8accbc3b90, addr=addr@entry=103079215124, attrs=..., ptr=ptr@entry=0x7f8ce6300028, len=len@entry=1, addr1=, l=, mr=0x558f568f19d0) at /home/opc/qemu-upstream/include/qemu/host-utils.h:165 +18 0x0000558f52d07b06 in flatview_write (fv=0x7f8accbc3b90, addr=103079215124, attrs=..., buf=0x7f8ce6300028, len=1) at ../softmmu/physmem.c:2822 +19 0x0000558f52d0b36b in address_space_write (as=, addr=, attrs=..., buf=buf@entry=0x7f8ce6300028, len=) + at ../softmmu/physmem.c:2914 +20 0x0000558f52d0b3da in address_space_rw (as=, addr=, attrs=..., + attrs@entry=..., buf=buf@entry=0x7f8ce6300028, len=, is_write=) at ../softmmu/physmem.c:2924 +21 0x0000558f52dced09 in kvm_cpu_exec (cpu=cpu@entry=0x558f55c2da60) at ../accel/kvm/kvm-all.c:2903 +22 0x0000558f52dcfabd in kvm_vcpu_thread_fn (arg=arg@entry=0x558f55c2da60) at ../accel/kvm/kvm-accel-ops.c:49 +23 0x0000558f52f9f04a in qemu_thread_start (args=) at ../util/qemu-thread-posix.c:556 +24 0x00007f8ce4392ea5 in start_thread () at /lib64/libpthread.so.0 +25 0x00007f8ce40bb9fd in clone () at /lib64/libc.so.6 + +The cause for the assert failure is due to that the vhost_dev index +for the ctrl vq was not aligned with actual one in use by the guest. +Upon multiqueue feature negotiation in virtio_net_set_multiqueue(), +if guest doesn't support multiqueue, the guest vq layout would shrink +to a single queue pair, consisting of 3 vqs in total (rx, tx and ctrl). +This results in ctrl_vq taking a different vhost_dev group index than +the default. We can map vq to the correct vhost_dev group by checking +if MQ is supported by guest and successfully negotiated. Since the +MQ feature is only present along with CTRL_VQ, we ensure the index +2 is only meant for the control vq while MQ is not supported by guest. + +Fixes: 22288fe ("virtio-net: vhost control virtqueue support") +Suggested-by: Jason Wang +Signed-off-by: Si-Wei Liu +Acked-by: Jason Wang +Message-Id: <1651890498-24478-3-git-send-email-si-wei.liu@oracle.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +(cherry picked from commit 68b0a6395f36a8f48f56f46d05f30be2067598b0) +Signed-off-by: Jason Wang +--- + hw/net/virtio-net.c | 33 +++++++++++++++++++++++++++++++-- + 1 file changed, 31 insertions(+), 2 deletions(-) + +diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c +index ec045c3f41..f118379bb4 100644 +--- a/hw/net/virtio-net.c ++++ b/hw/net/virtio-net.c +@@ -14,6 +14,7 @@ + #include "qemu/osdep.h" + #include "qemu/atomic.h" + #include "qemu/iov.h" ++#include "qemu/log.h" + #include "qemu/main-loop.h" + #include "qemu/module.h" + #include "hw/virtio/virtio.h" +@@ -3163,8 +3164,22 @@ static NetClientInfo net_virtio_info = { + static bool virtio_net_guest_notifier_pending(VirtIODevice *vdev, int idx) + { + VirtIONet *n = VIRTIO_NET(vdev); +- NetClientState *nc = qemu_get_subqueue(n->nic, vq2q(idx)); ++ NetClientState *nc; + assert(n->vhost_started); ++ if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_MQ) && idx == 2) { ++ /* Must guard against invalid features and bogus queue index ++ * from being set by malicious guest, or penetrated through ++ * buggy migration stream. ++ */ ++ if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ)) { ++ qemu_log_mask(LOG_GUEST_ERROR, ++ "%s: bogus vq index ignored\n", __func__); ++ return false; ++ } ++ nc = qemu_get_subqueue(n->nic, n->max_queue_pairs); ++ } else { ++ nc = qemu_get_subqueue(n->nic, vq2q(idx)); ++ } + return vhost_net_virtqueue_pending(get_vhost_net(nc->peer), idx); + } + +@@ -3172,8 +3187,22 @@ static void virtio_net_guest_notifier_mask(VirtIODevice *vdev, int idx, + bool mask) + { + VirtIONet *n = VIRTIO_NET(vdev); +- NetClientState *nc = qemu_get_subqueue(n->nic, vq2q(idx)); ++ NetClientState *nc; + assert(n->vhost_started); ++ if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_MQ) && idx == 2) { ++ /* Must guard against invalid features and bogus queue index ++ * from being set by malicious guest, or penetrated through ++ * buggy migration stream. ++ */ ++ if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ)) { ++ qemu_log_mask(LOG_GUEST_ERROR, ++ "%s: bogus vq index ignored\n", __func__); ++ return; ++ } ++ nc = qemu_get_subqueue(n->nic, n->max_queue_pairs); ++ } else { ++ nc = qemu_get_subqueue(n->nic, vq2q(idx)); ++ } + vhost_net_virtqueue_mask(get_vhost_net(nc->peer), + vdev, idx, mask); + } +-- +2.35.3 + diff --git a/SOURCES/kvm-virtio-net-don-t-handle-mq-request-in-userspace-hand.patch b/SOURCES/kvm-virtio-net-don-t-handle-mq-request-in-userspace-hand.patch new file mode 100644 index 0000000..25c1aa9 --- /dev/null +++ b/SOURCES/kvm-virtio-net-don-t-handle-mq-request-in-userspace-hand.patch @@ -0,0 +1,109 @@ +From c9b51d54530c526f14ca0f3b9fc0bfa0b60d45ee Mon Sep 17 00:00:00 2001 +From: Si-Wei Liu +Date: Fri, 6 May 2022 19:28:18 -0700 +Subject: [PATCH 20/24] virtio-net: don't handle mq request in userspace + handler for vhost-vdpa +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jason Wang +RH-MergeRequest: 187: Multiqueue fixes for vhost-vDPA +RH-Commit: [7/7] 0e6684d12e42752deae8f5ebc56456fed174e0ed +RH-Bugzilla: 2069946 +RH-Acked-by: Eugenio Pérez +RH-Acked-by: Cindy Lu +RH-Acked-by: Laurent Vivier + +virtio_queue_host_notifier_read() tends to read pending event +left behind on ioeventfd in the vhost_net_stop() path, and +attempts to handle outstanding kicks from userspace vq handler. +However, in the ctrl_vq handler, virtio_net_handle_mq() has a +recursive call into virtio_net_set_status(), which may lead to +segmentation fault as shown in below stack trace: + +0 0x000055f800df1780 in qdev_get_parent_bus (dev=0x0) at ../hw/core/qdev.c:376 +1 0x000055f800c68ad8 in virtio_bus_device_iommu_enabled (vdev=vdev@entry=0x0) at ../hw/virtio/virtio-bus.c:331 +2 0x000055f800d70d7f in vhost_memory_unmap (dev=) at ../hw/virtio/vhost.c:318 +3 0x000055f800d70d7f in vhost_memory_unmap (dev=, buffer=0x7fc19bec5240, len=2052, is_write=1, access_len=2052) at ../hw/virtio/vhost.c:336 +4 0x000055f800d71867 in vhost_virtqueue_stop (dev=dev@entry=0x55f8037ccc30, vdev=vdev@entry=0x55f8044ec590, vq=0x55f8037cceb0, idx=0) at ../hw/virtio/vhost.c:1241 +5 0x000055f800d7406c in vhost_dev_stop (hdev=hdev@entry=0x55f8037ccc30, vdev=vdev@entry=0x55f8044ec590) at ../hw/virtio/vhost.c:1839 +6 0x000055f800bf00a7 in vhost_net_stop_one (net=0x55f8037ccc30, dev=0x55f8044ec590) at ../hw/net/vhost_net.c:315 +7 0x000055f800bf0678 in vhost_net_stop (dev=dev@entry=0x55f8044ec590, ncs=0x55f80452bae0, data_queue_pairs=data_queue_pairs@entry=7, cvq=cvq@entry=1) + at ../hw/net/vhost_net.c:423 +8 0x000055f800d4e628 in virtio_net_set_status (status=, n=0x55f8044ec590) at ../hw/net/virtio-net.c:296 +9 0x000055f800d4e628 in virtio_net_set_status (vdev=vdev@entry=0x55f8044ec590, status=15 '\017') at ../hw/net/virtio-net.c:370 +10 0x000055f800d534d8 in virtio_net_handle_ctrl (iov_cnt=, iov=, cmd=0 '\000', n=0x55f8044ec590) at ../hw/net/virtio-net.c:1408 +11 0x000055f800d534d8 in virtio_net_handle_ctrl (vdev=0x55f8044ec590, vq=0x7fc1a7e888d0) at ../hw/net/virtio-net.c:1452 +12 0x000055f800d69f37 in virtio_queue_host_notifier_read (vq=0x7fc1a7e888d0) at ../hw/virtio/virtio.c:2331 +13 0x000055f800d69f37 in virtio_queue_host_notifier_read (n=n@entry=0x7fc1a7e8894c) at ../hw/virtio/virtio.c:3575 +14 0x000055f800c688e6 in virtio_bus_cleanup_host_notifier (bus=, n=n@entry=14) at ../hw/virtio/virtio-bus.c:312 +15 0x000055f800d73106 in vhost_dev_disable_notifiers (hdev=hdev@entry=0x55f8035b51b0, vdev=vdev@entry=0x55f8044ec590) + at ../../../include/hw/virtio/virtio-bus.h:35 +16 0x000055f800bf00b2 in vhost_net_stop_one (net=0x55f8035b51b0, dev=0x55f8044ec590) at ../hw/net/vhost_net.c:316 +17 0x000055f800bf0678 in vhost_net_stop (dev=dev@entry=0x55f8044ec590, ncs=0x55f80452bae0, data_queue_pairs=data_queue_pairs@entry=7, cvq=cvq@entry=1) + at ../hw/net/vhost_net.c:423 +18 0x000055f800d4e628 in virtio_net_set_status (status=, n=0x55f8044ec590) at ../hw/net/virtio-net.c:296 +19 0x000055f800d4e628 in virtio_net_set_status (vdev=0x55f8044ec590, status=15 '\017') at ../hw/net/virtio-net.c:370 +20 0x000055f800d6c4b2 in virtio_set_status (vdev=0x55f8044ec590, val=) at ../hw/virtio/virtio.c:1945 +21 0x000055f800d11d9d in vm_state_notify (running=running@entry=false, state=state@entry=RUN_STATE_SHUTDOWN) at ../softmmu/runstate.c:333 +22 0x000055f800d04e7a in do_vm_stop (state=state@entry=RUN_STATE_SHUTDOWN, send_stop=send_stop@entry=false) at ../softmmu/cpus.c:262 +23 0x000055f800d04e99 in vm_shutdown () at ../softmmu/cpus.c:280 +24 0x000055f800d126af in qemu_cleanup () at ../softmmu/runstate.c:812 +25 0x000055f800ad5b13 in main (argc=, argv=, envp=) at ../softmmu/main.c:51 + +For now, temporarily disable handling MQ request from the ctrl_vq +userspace hanlder to avoid the recursive virtio_net_set_status() +call. Some rework is needed to allow changing the number of +queues without going through a full virtio_net_set_status cycle, +particularly for vhost-vdpa backend. + +This patch will need to be reverted as soon as future patches of +having the change of #queues handled in userspace is merged. + +Fixes: 402378407db ("vhost-vdpa: multiqueue support") +Signed-off-by: Si-Wei Liu +Acked-by: Jason Wang +Message-Id: <1651890498-24478-8-git-send-email-si-wei.liu@oracle.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +(cherry picked from commit 2a7888cc3aa31faee839fa5dddad354ff8941f4c) +Signed-off-by: Jason Wang +--- + hw/net/virtio-net.c | 13 +++++++++++++ + 1 file changed, 13 insertions(+) + +diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c +index f118379bb4..7e172ef829 100644 +--- a/hw/net/virtio-net.c ++++ b/hw/net/virtio-net.c +@@ -1373,6 +1373,7 @@ static int virtio_net_handle_mq(VirtIONet *n, uint8_t cmd, + { + VirtIODevice *vdev = VIRTIO_DEVICE(n); + uint16_t queue_pairs; ++ NetClientState *nc = qemu_get_queue(n->nic); + + virtio_net_disable_rss(n); + if (cmd == VIRTIO_NET_CTRL_MQ_HASH_CONFIG) { +@@ -1404,6 +1405,18 @@ static int virtio_net_handle_mq(VirtIONet *n, uint8_t cmd, + return VIRTIO_NET_ERR; + } + ++ /* Avoid changing the number of queue_pairs for vdpa device in ++ * userspace handler. A future fix is needed to handle the mq ++ * change in userspace handler with vhost-vdpa. Let's disable ++ * the mq handling from userspace for now and only allow get ++ * done through the kernel. Ripples may be seen when falling ++ * back to userspace, but without doing it qemu process would ++ * crash on a recursive entry to virtio_net_set_status(). ++ */ ++ if (nc->peer && nc->peer->info->type == NET_CLIENT_DRIVER_VHOST_VDPA) { ++ return VIRTIO_NET_ERR; ++ } ++ + n->curr_queue_pairs = queue_pairs; + /* stop the backend before changing the number of queue_pairs to avoid handling a + * disabled queue */ +-- +2.35.3 + diff --git a/SOURCES/kvm-virtio-net-fix-map-leaking-on-error-during-receive.patch b/SOURCES/kvm-virtio-net-fix-map-leaking-on-error-during-receive.patch new file mode 100644 index 0000000..4855e59 --- /dev/null +++ b/SOURCES/kvm-virtio-net-fix-map-leaking-on-error-during-receive.patch @@ -0,0 +1,60 @@ +From 10b3a7b56dc9b4c88e503c36c1b13d80bcb7b066 Mon Sep 17 00:00:00 2001 +From: Jason Wang +Date: Tue, 8 Mar 2022 10:42:51 +0800 +Subject: [PATCH 2/6] virtio-net: fix map leaking on error during receive + +RH-Author: Jon Maloy +RH-MergeRequest: 154: virtio-net: fix map leaking on error during receive +RH-Commit: [1/1] 7178b0cd5ce7c89fe476f2e199c9212c8b89327a (jmaloy/qemu-kvm) +RH-Bugzilla: 2063206 +RH-Acked-by: Jason Wang +RH-Acked-by: Kevin Wolf +RH-Acked-by: Laurent Vivier + +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2063206 +Upstream: Merged +CVE: CVE-2022-26353 + +commit abe300d9d894f7138e1af7c8e9c88c04bfe98b37 +Author: Jason Wang +Date: Tue Mar 8 10:42:51 2022 +0800 + + virtio-net: fix map leaking on error during receive + + Commit bedd7e93d0196 ("virtio-net: fix use after unmap/free for sg") + tries to fix the use after free of the sg by caching the virtqueue + elements in an array and unmap them at once after receiving the + packets, But it forgot to unmap the cached elements on error which + will lead to leaking of mapping and other unexpected results. + + Fixing this by detaching the cached elements on error. This addresses + CVE-2022-26353. + + Reported-by: Victor Tom + Cc: qemu-stable@nongnu.org + Fixes: CVE-2022-26353 + Fixes: bedd7e93d0196 ("virtio-net: fix use after unmap/free for sg") + Reviewed-by: Michael S. Tsirkin + Signed-off-by: Jason Wang + +(cherry picked from commit abe300d9d894f7138e1af7c8e9c88c04bfe98b37) +Signed-off-by: Jon Maloy +--- + hw/net/virtio-net.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c +index f2014d5ea0..e1f4748831 100644 +--- a/hw/net/virtio-net.c ++++ b/hw/net/virtio-net.c +@@ -1862,6 +1862,7 @@ static ssize_t virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf, + + err: + for (j = 0; j < i; j++) { ++ virtqueue_detach_element(q->rx_vq, elems[j], lens[j]); + g_free(elems[j]); + } + +-- +2.27.0 + diff --git a/SOURCES/kvm-virtio-net-setup-vhost_dev-and-notifiers-for-cvq-onl.patch b/SOURCES/kvm-virtio-net-setup-vhost_dev-and-notifiers-for-cvq-onl.patch new file mode 100644 index 0000000..2e46cff --- /dev/null +++ b/SOURCES/kvm-virtio-net-setup-vhost_dev-and-notifiers-for-cvq-onl.patch @@ -0,0 +1,52 @@ +From bc307149fe4e3fe2a3e0ac52534383c955051e7e Mon Sep 17 00:00:00 2001 +From: Si-Wei Liu +Date: Fri, 6 May 2022 19:28:12 -0700 +Subject: [PATCH 14/24] virtio-net: setup vhost_dev and notifiers for cvq only + when feature is negotiated +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jason Wang +RH-MergeRequest: 187: Multiqueue fixes for vhost-vDPA +RH-Commit: [1/7] 38bcfaa661f437b3dfa6b6f152dffd60073dc054 +RH-Bugzilla: 2069946 +RH-Acked-by: Eugenio Pérez +RH-Acked-by: Cindy Lu +RH-Acked-by: Laurent Vivier + +When the control virtqueue feature is absent or not negotiated, +vhost_net_start() still tries to set up vhost_dev and install +vhost notifiers for the control virtqueue, which results in +erroneous ioctl calls with incorrect queue index sending down +to driver. Do that only when needed. + +Fixes: 22288fe ("virtio-net: vhost control virtqueue support") +Signed-off-by: Si-Wei Liu +Acked-by: Jason Wang +Message-Id: <1651890498-24478-2-git-send-email-si-wei.liu@oracle.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +(cherry picked from commit aa8581945a13712ff3eed0ad3ba7a9664fc1604b) +Signed-off-by: Jason Wang +--- + hw/net/virtio-net.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c +index e1f4748831..ec045c3f41 100644 +--- a/hw/net/virtio-net.c ++++ b/hw/net/virtio-net.c +@@ -244,7 +244,8 @@ static void virtio_net_vhost_status(VirtIONet *n, uint8_t status) + VirtIODevice *vdev = VIRTIO_DEVICE(n); + NetClientState *nc = qemu_get_queue(n->nic); + int queue_pairs = n->multiqueue ? n->max_queue_pairs : 1; +- int cvq = n->max_ncs - n->max_queue_pairs; ++ int cvq = virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ) ? ++ n->max_ncs - n->max_queue_pairs : 0; + + if (!get_vhost_net(nc->peer)) { + return; +-- +2.35.3 + diff --git a/SOURCES/kvm-virtiofsd-Fix-breakage-due-to-fuse_init_in-size-chan.patch b/SOURCES/kvm-virtiofsd-Fix-breakage-due-to-fuse_init_in-size-chan.patch new file mode 100644 index 0000000..7ee71ae --- /dev/null +++ b/SOURCES/kvm-virtiofsd-Fix-breakage-due-to-fuse_init_in-size-chan.patch @@ -0,0 +1,63 @@ +From 1da951c4c3b4e403a6c1668a54e6264381c0003d Mon Sep 17 00:00:00 2001 +From: Vivek Goyal +Date: Tue, 8 Feb 2022 15:48:04 -0500 +Subject: [PATCH 1/3] virtiofsd: Fix breakage due to fuse_init_in size change + +RH-Author: Dr. David Alan Gilbert +RH-MergeRequest: 193: virtiofsd: Fix breakage due to fuse_init_in size change +RH-Commit: [1/1] 5809db034f9361fb462181d71e7cdde1324f8e54 +RH-Bugzilla: 2097209 +RH-Acked-by: German Maglione +RH-Acked-by: Laszlo Ersek +RH-Acked-by: Vivek Goyal +RH-Acked-by: Stefan Hajnoczi + +Kernel version 5.17 has increased the size of "struct fuse_init_in" struct. +Previously this struct was 16 bytes and now it has been extended to +64 bytes in size. + +Once qemu headers are updated to latest, it will expect to receive 64 byte +size struct (for protocol version major 7 and minor > 6). But if guest is +booting older kernel (older than 5.17), then it still sends older +fuse_init_in of size 16 bytes. And do_init() fails. It is expecting +64 byte struct. And this results in mount of virtiofs failing. + +Fix this by parsing 16 bytes only for now. Separate patches will be +posted which will parse rest of the bytes and enable new functionality. +Right now we don't support any of the new functionality, so we don't +lose anything by not parsing bytes beyond 16. + +Reviewed-by: Dr. David Alan Gilbert +Signed-off-by: Vivek Goyal +Message-Id: <20220208204813.682906-2-vgoyal@redhat.com> +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit a086d54c6ffa38f7e71f182b63a25315304a3392) +--- + tools/virtiofsd/fuse_lowlevel.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index e4679c73ab..5d431a7038 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -1880,6 +1880,8 @@ static void do_init(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) + { + size_t compat_size = offsetof(struct fuse_init_in, max_readahead); ++ size_t compat2_size = offsetof(struct fuse_init_in, flags) + ++ sizeof(uint32_t); + struct fuse_init_in *arg; + struct fuse_init_out outarg; + struct fuse_session *se = req->se; +@@ -1897,7 +1899,7 @@ static void do_init(fuse_req_t req, fuse_ino_t nodeid, + + /* ...and now consume the new fields. */ + if (arg->major == 7 && arg->minor >= 6) { +- if (!fuse_mbuf_iter_advance(iter, sizeof(*arg) - compat_size)) { ++ if (!fuse_mbuf_iter_advance(iter, compat2_size - compat_size)) { + fuse_reply_err(req, EINVAL); + return; + } +-- +2.35.3 + diff --git a/SOURCES/kvm-x86-Add-AMX-CPUIDs-enumeration.patch b/SOURCES/kvm-x86-Add-AMX-CPUIDs-enumeration.patch new file mode 100644 index 0000000..d61e4cf --- /dev/null +++ b/SOURCES/kvm-x86-Add-AMX-CPUIDs-enumeration.patch @@ -0,0 +1,135 @@ +From d0826a8c2c3c389eeeed1014d7e316f39f083971 Mon Sep 17 00:00:00 2001 +From: Jing Liu +Date: Wed, 16 Feb 2022 22:04:31 -0800 +Subject: [PATCH 09/24] x86: Add AMX CPUIDs enumeration + +RH-Author: Paul Lai +RH-MergeRequest: 176: Enable KVM AMX support +RH-Commit: [9/13] fab147992ad927c9538529f018f06e2f48546c5b +RH-Bugzilla: 1916415 +RH-Acked-by: Cornelia Huck +RH-Acked-by: Igor Mammedov +RH-Acked-by: Paolo Bonzini + +Add AMX primary feature bits XFD and AMX_TILE to +enumerate the CPU's AMX capability. Meanwhile, add +AMX TILE and TMUL CPUID leaf and subleaves which +exist when AMX TILE is present to provide the maximum +capability of TILE and TMUL. + +Signed-off-by: Jing Liu +Signed-off-by: Yang Zhong +Message-Id: <20220217060434.52460-6-yang.zhong@intel.com> +Signed-off-by: Paolo Bonzini +(cherry picked from commit f21a48171cf3fa39532fc8553fd82e81b88b6474) +Signed-off-by: Paul Lai +--- + target/i386/cpu.c | 55 ++++++++++++++++++++++++++++++++++++++++--- + target/i386/kvm/kvm.c | 4 +++- + 2 files changed, 55 insertions(+), 4 deletions(-) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index cd27c0eb81..09e08f7f38 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -574,6 +574,18 @@ static CPUCacheInfo legacy_l3_cache = { + #define INTEL_PT_CYCLE_BITMAP 0x1fff /* Support 0,2^(0~11) */ + #define INTEL_PT_PSB_BITMAP (0x003f << 16) /* Support 2K,4K,8K,16K,32K,64K */ + ++/* CPUID Leaf 0x1D constants: */ ++#define INTEL_AMX_TILE_MAX_SUBLEAF 0x1 ++#define INTEL_AMX_TOTAL_TILE_BYTES 0x2000 ++#define INTEL_AMX_BYTES_PER_TILE 0x400 ++#define INTEL_AMX_BYTES_PER_ROW 0x40 ++#define INTEL_AMX_TILE_MAX_NAMES 0x8 ++#define INTEL_AMX_TILE_MAX_ROWS 0x10 ++ ++/* CPUID Leaf 0x1E constants: */ ++#define INTEL_AMX_TMUL_MAX_K 0x10 ++#define INTEL_AMX_TMUL_MAX_N 0x40 ++ + void x86_cpu_vendor_words2str(char *dst, uint32_t vendor1, + uint32_t vendor2, uint32_t vendor3) + { +@@ -843,8 +855,8 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = { + "avx512-vp2intersect", NULL, "md-clear", NULL, + NULL, NULL, "serialize", NULL, + "tsx-ldtrk", NULL, NULL /* pconfig */, NULL, +- NULL, NULL, NULL, "avx512-fp16", +- NULL, NULL, "spec-ctrl", "stibp", ++ NULL, NULL, "amx-bf16", "avx512-fp16", ++ "amx-tile", "amx-int8", "spec-ctrl", "stibp", + NULL, "arch-capabilities", "core-capability", "ssbd", + }, + .cpuid = { +@@ -909,7 +921,7 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = { + .type = CPUID_FEATURE_WORD, + .feat_names = { + "xsaveopt", "xsavec", "xgetbv1", "xsaves", +- NULL, NULL, NULL, NULL, ++ "xfd", NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, +@@ -5593,6 +5605,43 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count, + } + break; + } ++ case 0x1D: { ++ /* AMX TILE */ ++ *eax = 0; ++ *ebx = 0; ++ *ecx = 0; ++ *edx = 0; ++ if (!(env->features[FEAT_7_0_EDX] & CPUID_7_0_EDX_AMX_TILE)) { ++ break; ++ } ++ ++ if (count == 0) { ++ /* Highest numbered palette subleaf */ ++ *eax = INTEL_AMX_TILE_MAX_SUBLEAF; ++ } else if (count == 1) { ++ *eax = INTEL_AMX_TOTAL_TILE_BYTES | ++ (INTEL_AMX_BYTES_PER_TILE << 16); ++ *ebx = INTEL_AMX_BYTES_PER_ROW | (INTEL_AMX_TILE_MAX_NAMES << 16); ++ *ecx = INTEL_AMX_TILE_MAX_ROWS; ++ } ++ break; ++ } ++ case 0x1E: { ++ /* AMX TMUL */ ++ *eax = 0; ++ *ebx = 0; ++ *ecx = 0; ++ *edx = 0; ++ if (!(env->features[FEAT_7_0_EDX] & CPUID_7_0_EDX_AMX_TILE)) { ++ break; ++ } ++ ++ if (count == 0) { ++ /* Highest numbered palette subleaf */ ++ *ebx = INTEL_AMX_TMUL_MAX_K | (INTEL_AMX_TMUL_MAX_N << 8); ++ } ++ break; ++ } + case 0x40000000: + /* + * CPUID code in kvm_arch_init_vcpu() ignores stuff +diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c +index b5d98c4361..a64a79d870 100644 +--- a/target/i386/kvm/kvm.c ++++ b/target/i386/kvm/kvm.c +@@ -1779,7 +1779,9 @@ int kvm_arch_init_vcpu(CPUState *cs) + c = &cpuid_data.entries[cpuid_i++]; + } + break; +- case 0x14: { ++ case 0x14: ++ case 0x1d: ++ case 0x1e: { + uint32_t times; + + c->function = i; +-- +2.35.3 + diff --git a/SOURCES/kvm-x86-Add-AMX-XTILECFG-and-XTILEDATA-components.patch b/SOURCES/kvm-x86-Add-AMX-XTILECFG-and-XTILEDATA-components.patch new file mode 100644 index 0000000..064b124 --- /dev/null +++ b/SOURCES/kvm-x86-Add-AMX-XTILECFG-and-XTILEDATA-components.patch @@ -0,0 +1,112 @@ +From 3ba6092159b6e3b25505af2a49c0f6ac99043db9 Mon Sep 17 00:00:00 2001 +From: Jing Liu +Date: Wed, 16 Feb 2022 22:04:28 -0800 +Subject: [PATCH 06/24] x86: Add AMX XTILECFG and XTILEDATA components + +RH-Author: Paul Lai +RH-MergeRequest: 176: Enable KVM AMX support +RH-Commit: [6/13] 95229f87b4494631d57232f374a174f7bc95843a +RH-Bugzilla: 1916415 +RH-Acked-by: Cornelia Huck +RH-Acked-by: Igor Mammedov +RH-Acked-by: Paolo Bonzini + +The AMX TILECFG register and the TMMx tile data registers are +saved/restored via XSAVE, respectively in state component 17 +(64 bytes) and state component 18 (8192 bytes). + +Add AMX feature bits to x86_ext_save_areas array to set +up AMX components. Add structs that define the layout of +AMX XSAVE areas and use QEMU_BUILD_BUG_ON to validate the +structs sizes. + +Signed-off-by: Jing Liu +Signed-off-by: Yang Zhong +Message-Id: <20220217060434.52460-3-yang.zhong@intel.com> +Signed-off-by: Paolo Bonzini +(cherry picked from commit 1f16764f7d4515bfd5e4ae0aae814fa280a7d0c8) +Signed-off-by: Paul Lai +--- + target/i386/cpu.c | 8 ++++++++ + target/i386/cpu.h | 18 +++++++++++++++++- + 2 files changed, 25 insertions(+), 1 deletion(-) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index f44fad3a2a..0453c27c9d 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -1401,6 +1401,14 @@ ExtSaveArea x86_ext_save_areas[XSAVE_STATE_AREA_COUNT] = { + [XSTATE_PKRU_BIT] = + { .feature = FEAT_7_0_ECX, .bits = CPUID_7_0_ECX_PKU, + .size = sizeof(XSavePKRU) }, ++ [XSTATE_XTILE_CFG_BIT] = { ++ .feature = FEAT_7_0_EDX, .bits = CPUID_7_0_EDX_AMX_TILE, ++ .size = sizeof(XSaveXTILECFG), ++ }, ++ [XSTATE_XTILE_DATA_BIT] = { ++ .feature = FEAT_7_0_EDX, .bits = CPUID_7_0_EDX_AMX_TILE, ++ .size = sizeof(XSaveXTILEDATA) ++ }, + }; + + static uint32_t xsave_area_size(uint64_t mask) +diff --git a/target/i386/cpu.h b/target/i386/cpu.h +index 5d9702a991..e1dd8b9555 100644 +--- a/target/i386/cpu.h ++++ b/target/i386/cpu.h +@@ -537,6 +537,8 @@ typedef enum X86Seg { + #define XSTATE_ZMM_Hi256_BIT 6 + #define XSTATE_Hi16_ZMM_BIT 7 + #define XSTATE_PKRU_BIT 9 ++#define XSTATE_XTILE_CFG_BIT 17 ++#define XSTATE_XTILE_DATA_BIT 18 + + #define XSTATE_FP_MASK (1ULL << XSTATE_FP_BIT) + #define XSTATE_SSE_MASK (1ULL << XSTATE_SSE_BIT) +@@ -845,6 +847,8 @@ typedef uint64_t FeatureWordArray[FEATURE_WORDS]; + #define CPUID_7_0_EDX_TSX_LDTRK (1U << 16) + /* AVX512_FP16 instruction */ + #define CPUID_7_0_EDX_AVX512_FP16 (1U << 23) ++/* AMX tile (two-dimensional register) */ ++#define CPUID_7_0_EDX_AMX_TILE (1U << 24) + /* Speculation Control */ + #define CPUID_7_0_EDX_SPEC_CTRL (1U << 26) + /* Single Thread Indirect Branch Predictors */ +@@ -1348,6 +1352,16 @@ typedef struct XSavePKRU { + uint32_t padding; + } XSavePKRU; + ++/* Ext. save area 17: AMX XTILECFG state */ ++typedef struct XSaveXTILECFG { ++ uint8_t xtilecfg[64]; ++} XSaveXTILECFG; ++ ++/* Ext. save area 18: AMX XTILEDATA state */ ++typedef struct XSaveXTILEDATA { ++ uint8_t xtiledata[8][1024]; ++} XSaveXTILEDATA; ++ + QEMU_BUILD_BUG_ON(sizeof(XSaveAVX) != 0x100); + QEMU_BUILD_BUG_ON(sizeof(XSaveBNDREG) != 0x40); + QEMU_BUILD_BUG_ON(sizeof(XSaveBNDCSR) != 0x40); +@@ -1355,6 +1369,8 @@ QEMU_BUILD_BUG_ON(sizeof(XSaveOpmask) != 0x40); + QEMU_BUILD_BUG_ON(sizeof(XSaveZMM_Hi256) != 0x200); + QEMU_BUILD_BUG_ON(sizeof(XSaveHi16_ZMM) != 0x400); + QEMU_BUILD_BUG_ON(sizeof(XSavePKRU) != 0x8); ++QEMU_BUILD_BUG_ON(sizeof(XSaveXTILECFG) != 0x40); ++QEMU_BUILD_BUG_ON(sizeof(XSaveXTILEDATA) != 0x2000); + + typedef struct ExtSaveArea { + uint32_t feature, bits; +@@ -1362,7 +1378,7 @@ typedef struct ExtSaveArea { + uint32_t ecx; + } ExtSaveArea; + +-#define XSAVE_STATE_AREA_COUNT (XSTATE_PKRU_BIT + 1) ++#define XSAVE_STATE_AREA_COUNT (XSTATE_XTILE_DATA_BIT + 1) + + extern ExtSaveArea x86_ext_save_areas[XSAVE_STATE_AREA_COUNT]; + +-- +2.35.3 + diff --git a/SOURCES/kvm-x86-Add-XFD-faulting-bit-for-state-components.patch b/SOURCES/kvm-x86-Add-XFD-faulting-bit-for-state-components.patch new file mode 100644 index 0000000..5c0fd0a --- /dev/null +++ b/SOURCES/kvm-x86-Add-XFD-faulting-bit-for-state-components.patch @@ -0,0 +1,62 @@ +From 098d6a965ada02f5897b73f0489413a050a176bb Mon Sep 17 00:00:00 2001 +From: Jing Liu +Date: Wed, 16 Feb 2022 22:04:30 -0800 +Subject: [PATCH 08/24] x86: Add XFD faulting bit for state components + +RH-Author: Paul Lai +RH-MergeRequest: 176: Enable KVM AMX support +RH-Commit: [8/13] 0b1b46c5d075655ab94bc79e042b187c5dc55551 +RH-Bugzilla: 1916415 +RH-Acked-by: Cornelia Huck +RH-Acked-by: Igor Mammedov +RH-Acked-by: Paolo Bonzini + +Intel introduces XFD faulting mechanism for extended +XSAVE features to dynamically enable the features in +runtime. If CPUID (EAX=0Dh, ECX=n, n>1).ECX[2] is set +as 1, it indicates support for XFD faulting of this +state component. + +Signed-off-by: Jing Liu +Signed-off-by: Yang Zhong +Message-Id: <20220217060434.52460-5-yang.zhong@intel.com> +Signed-off-by: Paolo Bonzini +(cherry picked from commit 0f17f6b30f3b051f0f96ccc98c9f7f395713699f) +Signed-off-by: Paul Lai +--- + target/i386/cpu.c | 3 ++- + target/i386/cpu.h | 2 ++ + 2 files changed, 4 insertions(+), 1 deletion(-) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index c19b51ea32..cd27c0eb81 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -5503,7 +5503,8 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count, + const ExtSaveArea *esa = &x86_ext_save_areas[count]; + *eax = esa->size; + *ebx = esa->offset; +- *ecx = esa->ecx & ESA_FEATURE_ALIGN64_MASK; ++ *ecx = esa->ecx & ++ (ESA_FEATURE_ALIGN64_MASK | ESA_FEATURE_XFD_MASK); + } + } + break; +diff --git a/target/i386/cpu.h b/target/i386/cpu.h +index 58676390e6..f2bdef9c26 100644 +--- a/target/i386/cpu.h ++++ b/target/i386/cpu.h +@@ -555,8 +555,10 @@ typedef enum X86Seg { + #define XSTATE_DYNAMIC_MASK (XSTATE_XTILE_DATA_MASK) + + #define ESA_FEATURE_ALIGN64_BIT 1 ++#define ESA_FEATURE_XFD_BIT 2 + + #define ESA_FEATURE_ALIGN64_MASK (1U << ESA_FEATURE_ALIGN64_BIT) ++#define ESA_FEATURE_XFD_MASK (1U << ESA_FEATURE_XFD_BIT) + + + /* CPUID feature words */ +-- +2.35.3 + diff --git a/SOURCES/kvm-x86-Fix-the-64-byte-boundary-enumeration-for-extende.patch b/SOURCES/kvm-x86-Fix-the-64-byte-boundary-enumeration-for-extende.patch new file mode 100644 index 0000000..2db4c60 --- /dev/null +++ b/SOURCES/kvm-x86-Fix-the-64-byte-boundary-enumeration-for-extende.patch @@ -0,0 +1,88 @@ +From 6eae12166341c236da023e5117b64b842ae72083 Mon Sep 17 00:00:00 2001 +From: Jing Liu +Date: Wed, 16 Feb 2022 22:04:27 -0800 +Subject: [PATCH 05/24] x86: Fix the 64-byte boundary enumeration for extended + state + +RH-Author: Paul Lai +RH-MergeRequest: 176: Enable KVM AMX support +RH-Commit: [5/13] 64fc93e3b0ad0fc56da9d71b33d9eefd3cbba1d7 +RH-Bugzilla: 1916415 +RH-Acked-by: Cornelia Huck +RH-Acked-by: Igor Mammedov +RH-Acked-by: Paolo Bonzini + +The extended state subleaves (EAX=0Dh, ECX=n, n>1).ECX[1] +indicate whether the extended state component locates +on the next 64-byte boundary following the preceding state +component when the compacted format of an XSAVE area is +used. + +Right now, they are all zero because no supported component +needed the bit to be set, but the upcoming AMX feature will +use it. Fix the subleaves value according to KVM's supported +cpuid. + +Signed-off-by: Jing Liu +Signed-off-by: Yang Zhong +Message-Id: <20220217060434.52460-2-yang.zhong@intel.com> +Signed-off-by: Paolo Bonzini +(cherry picked from commit 131266b7565bd437127bd231563572696bb27235) +Signed-off-by: Paul Lai +--- + target/i386/cpu.c | 1 + + target/i386/cpu.h | 6 ++++++ + target/i386/kvm/kvm-cpu.c | 1 + + 3 files changed, 8 insertions(+) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index dd6935b1dd..f44fad3a2a 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -5495,6 +5495,7 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count, + const ExtSaveArea *esa = &x86_ext_save_areas[count]; + *eax = esa->size; + *ebx = esa->offset; ++ *ecx = esa->ecx & ESA_FEATURE_ALIGN64_MASK; + } + } + break; +diff --git a/target/i386/cpu.h b/target/i386/cpu.h +index c6a6c871f1..5d9702a991 100644 +--- a/target/i386/cpu.h ++++ b/target/i386/cpu.h +@@ -548,6 +548,11 @@ typedef enum X86Seg { + #define XSTATE_Hi16_ZMM_MASK (1ULL << XSTATE_Hi16_ZMM_BIT) + #define XSTATE_PKRU_MASK (1ULL << XSTATE_PKRU_BIT) + ++#define ESA_FEATURE_ALIGN64_BIT 1 ++ ++#define ESA_FEATURE_ALIGN64_MASK (1U << ESA_FEATURE_ALIGN64_BIT) ++ ++ + /* CPUID feature words */ + typedef enum FeatureWord { + FEAT_1_EDX, /* CPUID[1].EDX */ +@@ -1354,6 +1359,7 @@ QEMU_BUILD_BUG_ON(sizeof(XSavePKRU) != 0x8); + typedef struct ExtSaveArea { + uint32_t feature, bits; + uint32_t offset, size; ++ uint32_t ecx; + } ExtSaveArea; + + #define XSAVE_STATE_AREA_COUNT (XSTATE_PKRU_BIT + 1) +diff --git a/target/i386/kvm/kvm-cpu.c b/target/i386/kvm/kvm-cpu.c +index 7b004065ae..86ef7b2712 100644 +--- a/target/i386/kvm/kvm-cpu.c ++++ b/target/i386/kvm/kvm-cpu.c +@@ -104,6 +104,7 @@ static void kvm_cpu_xsave_init(void) + if (sz != 0) { + assert(esa->size == sz); + esa->offset = kvm_arch_get_supported_cpuid(s, 0xd, i, R_EBX); ++ esa->ecx = kvm_arch_get_supported_cpuid(s, 0xd, i, R_ECX); + } + } + } +-- +2.35.3 + diff --git a/SOURCES/kvm-x86-Grant-AMX-permission-for-guest.patch b/SOURCES/kvm-x86-Grant-AMX-permission-for-guest.patch new file mode 100644 index 0000000..c2ab95d --- /dev/null +++ b/SOURCES/kvm-x86-Grant-AMX-permission-for-guest.patch @@ -0,0 +1,215 @@ +From 50840e01d05a466a1dfbc219e49233834e5d7ed0 Mon Sep 17 00:00:00 2001 +From: Yang Zhong +Date: Wed, 16 Feb 2022 22:04:29 -0800 +Subject: [PATCH 07/24] x86: Grant AMX permission for guest + +RH-Author: Paul Lai +RH-MergeRequest: 176: Enable KVM AMX support +RH-Commit: [7/13] 437578191f61139ca710cc7045ab38eb0d05eae2 +RH-Bugzilla: 1916415 +RH-Acked-by: Cornelia Huck +RH-Acked-by: Igor Mammedov +RH-Acked-by: Paolo Bonzini + +Kernel allocates 4K xstate buffer by default. For XSAVE features +which require large state component (e.g. AMX), Linux kernel +dynamically expands the xstate buffer only after the process has +acquired the necessary permissions. Those are called dynamically- +enabled XSAVE features (or dynamic xfeatures). + +There are separate permissions for native tasks and guests. + +Qemu should request the guest permissions for dynamic xfeatures +which will be exposed to the guest. This only needs to be done +once before the first vcpu is created. + +KVM implemented one new ARCH_GET_XCOMP_SUPP system attribute API to +get host side supported_xcr0 and Qemu can decide if it can request +dynamically enabled XSAVE features permission. +https://lore.kernel.org/all/20220126152210.3044876-1-pbonzini@redhat.com/ + +Suggested-by: Paolo Bonzini +Signed-off-by: Yang Zhong +Signed-off-by: Jing Liu +Message-Id: <20220217060434.52460-4-yang.zhong@intel.com> +Signed-off-by: Paolo Bonzini +(cherry picked from commit 19db68ca68a78fa033a21d419036b6e416554564) +Signed-off-by: Paul Lai +--- + target/i386/cpu.c | 7 +++++ + target/i386/cpu.h | 4 +++ + target/i386/kvm/kvm-cpu.c | 12 ++++---- + target/i386/kvm/kvm.c | 57 ++++++++++++++++++++++++++++++++++++++ + target/i386/kvm/kvm_i386.h | 1 + + 5 files changed, 75 insertions(+), 6 deletions(-) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index 0453c27c9d..c19b51ea32 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -6027,6 +6027,7 @@ static void x86_cpu_enable_xsave_components(X86CPU *cpu) + CPUX86State *env = &cpu->env; + int i; + uint64_t mask; ++ static bool request_perm; + + if (!(env->features[FEAT_1_ECX] & CPUID_EXT_XSAVE)) { + env->features[FEAT_XSAVE_COMP_LO] = 0; +@@ -6042,6 +6043,12 @@ static void x86_cpu_enable_xsave_components(X86CPU *cpu) + } + } + ++ /* Only request permission for first vcpu */ ++ if (kvm_enabled() && !request_perm) { ++ kvm_request_xsave_components(cpu, mask); ++ request_perm = true; ++ } ++ + env->features[FEAT_XSAVE_COMP_LO] = mask; + env->features[FEAT_XSAVE_COMP_HI] = mask >> 32; + } +diff --git a/target/i386/cpu.h b/target/i386/cpu.h +index e1dd8b9555..58676390e6 100644 +--- a/target/i386/cpu.h ++++ b/target/i386/cpu.h +@@ -549,6 +549,10 @@ typedef enum X86Seg { + #define XSTATE_ZMM_Hi256_MASK (1ULL << XSTATE_ZMM_Hi256_BIT) + #define XSTATE_Hi16_ZMM_MASK (1ULL << XSTATE_Hi16_ZMM_BIT) + #define XSTATE_PKRU_MASK (1ULL << XSTATE_PKRU_BIT) ++#define XSTATE_XTILE_CFG_MASK (1ULL << XSTATE_XTILE_CFG_BIT) ++#define XSTATE_XTILE_DATA_MASK (1ULL << XSTATE_XTILE_DATA_BIT) ++ ++#define XSTATE_DYNAMIC_MASK (XSTATE_XTILE_DATA_MASK) + + #define ESA_FEATURE_ALIGN64_BIT 1 + +diff --git a/target/i386/kvm/kvm-cpu.c b/target/i386/kvm/kvm-cpu.c +index 86ef7b2712..bdc967c484 100644 +--- a/target/i386/kvm/kvm-cpu.c ++++ b/target/i386/kvm/kvm-cpu.c +@@ -84,7 +84,7 @@ static void kvm_cpu_max_instance_init(X86CPU *cpu) + static void kvm_cpu_xsave_init(void) + { + static bool first = true; +- KVMState *s = kvm_state; ++ uint32_t eax, ebx, ecx, edx; + int i; + + if (!first) { +@@ -100,11 +100,11 @@ static void kvm_cpu_xsave_init(void) + ExtSaveArea *esa = &x86_ext_save_areas[i]; + + if (esa->size) { +- int sz = kvm_arch_get_supported_cpuid(s, 0xd, i, R_EAX); +- if (sz != 0) { +- assert(esa->size == sz); +- esa->offset = kvm_arch_get_supported_cpuid(s, 0xd, i, R_EBX); +- esa->ecx = kvm_arch_get_supported_cpuid(s, 0xd, i, R_ECX); ++ host_cpuid(0xd, i, &eax, &ebx, &ecx, &edx); ++ if (eax != 0) { ++ assert(esa->size == eax); ++ esa->offset = ebx; ++ esa->ecx = ecx; + } + } + } +diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c +index a668f521ac..b5d98c4361 100644 +--- a/target/i386/kvm/kvm.c ++++ b/target/i386/kvm/kvm.c +@@ -17,6 +17,7 @@ + #include "qapi/error.h" + #include + #include ++#include + + #include + #include "standard-headers/asm-x86/kvm_para.h" +@@ -347,6 +348,7 @@ uint32_t kvm_arch_get_supported_cpuid(KVMState *s, uint32_t function, + struct kvm_cpuid2 *cpuid; + uint32_t ret = 0; + uint32_t cpuid_1_edx; ++ uint64_t bitmask; + + cpuid = get_supported_cpuid(s); + +@@ -404,6 +406,25 @@ uint32_t kvm_arch_get_supported_cpuid(KVMState *s, uint32_t function, + if (!has_msr_arch_capabs) { + ret &= ~CPUID_7_0_EDX_ARCH_CAPABILITIES; + } ++ } else if (function == 0xd && index == 0 && ++ (reg == R_EAX || reg == R_EDX)) { ++ struct kvm_device_attr attr = { ++ .group = 0, ++ .attr = KVM_X86_XCOMP_GUEST_SUPP, ++ .addr = (unsigned long) &bitmask ++ }; ++ ++ bool sys_attr = kvm_check_extension(s, KVM_CAP_SYS_ATTRIBUTES); ++ if (!sys_attr) { ++ warn_report("cannot get sys attribute capabilities %d", sys_attr); ++ } ++ ++ int rc = kvm_ioctl(s, KVM_GET_DEVICE_ATTR, &attr); ++ if (rc == -1 && (errno == ENXIO || errno == EINVAL)) { ++ warn_report("KVM_GET_DEVICE_ATTR(0, KVM_X86_XCOMP_GUEST_SUPP) " ++ "error: %d", rc); ++ } ++ ret = (reg == R_EAX) ? bitmask : bitmask >> 32; + } else if (function == 0x80000001 && reg == R_ECX) { + /* + * It's safe to enable TOPOEXT even if it's not returned by +@@ -5054,3 +5075,39 @@ bool kvm_arch_cpu_check_are_resettable(void) + { + return !sev_es_enabled(); + } ++ ++#define ARCH_REQ_XCOMP_GUEST_PERM 0x1025 ++ ++void kvm_request_xsave_components(X86CPU *cpu, uint64_t mask) ++{ ++ KVMState *s = kvm_state; ++ uint64_t supported; ++ ++ mask &= XSTATE_DYNAMIC_MASK; ++ if (!mask) { ++ return; ++ } ++ /* ++ * Just ignore bits that are not in CPUID[EAX=0xD,ECX=0]. ++ * ARCH_REQ_XCOMP_GUEST_PERM would fail, and QEMU has warned ++ * about them already because they are not supported features. ++ */ ++ supported = kvm_arch_get_supported_cpuid(s, 0xd, 0, R_EAX); ++ supported |= (uint64_t)kvm_arch_get_supported_cpuid(s, 0xd, 0, R_EDX) << 32; ++ mask &= supported; ++ ++ while (mask) { ++ int bit = ctz64(mask); ++ int rc = syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM, bit); ++ if (rc) { ++ /* ++ * Older kernel version (<5.17) do not support ++ * ARCH_REQ_XCOMP_GUEST_PERM, but also do not return ++ * any dynamic feature from kvm_arch_get_supported_cpuid. ++ */ ++ warn_report("prctl(ARCH_REQ_XCOMP_GUEST_PERM) failure " ++ "for feature bit %d", bit); ++ } ++ mask &= ~BIT_ULL(bit); ++ } ++} +diff --git a/target/i386/kvm/kvm_i386.h b/target/i386/kvm/kvm_i386.h +index a978509d50..4124912c20 100644 +--- a/target/i386/kvm/kvm_i386.h ++++ b/target/i386/kvm/kvm_i386.h +@@ -52,5 +52,6 @@ bool kvm_hyperv_expand_features(X86CPU *cpu, Error **errp); + uint64_t kvm_swizzle_msi_ext_dest_id(uint64_t address); + + bool kvm_enable_sgx_provisioning(KVMState *s); ++void kvm_request_xsave_components(X86CPU *cpu, uint64_t mask); + + #endif +-- +2.35.3 + diff --git a/SOURCES/kvm-x86-Support-XFD-and-AMX-xsave-data-migration.patch b/SOURCES/kvm-x86-Support-XFD-and-AMX-xsave-data-migration.patch new file mode 100644 index 0000000..e4846b3 --- /dev/null +++ b/SOURCES/kvm-x86-Support-XFD-and-AMX-xsave-data-migration.patch @@ -0,0 +1,178 @@ +From 90a276ed72deab84f3fdd4b57e9ccfc6514934fb Mon Sep 17 00:00:00 2001 +From: Zeng Guang +Date: Wed, 16 Feb 2022 22:04:33 -0800 +Subject: [PATCH 11/24] x86: Support XFD and AMX xsave data migration + +RH-Author: Paul Lai +RH-MergeRequest: 176: Enable KVM AMX support +RH-Commit: [11/13] 4ff6e5544ffdac4e6d2f568f7f63b937502ca6c5 +RH-Bugzilla: 1916415 +RH-Acked-by: Cornelia Huck +RH-Acked-by: Igor Mammedov +RH-Acked-by: Paolo Bonzini + +XFD(eXtended Feature Disable) allows to enable a +feature on xsave state while preventing specific +user threads from using the feature. + +Support save and restore XFD MSRs if CPUID.D.1.EAX[4] +enumerate to be valid. Likewise migrate the MSRs and +related xsave state necessarily. + +Signed-off-by: Zeng Guang +Signed-off-by: Wei Wang +Signed-off-by: Yang Zhong +Message-Id: <20220217060434.52460-8-yang.zhong@intel.com> +Signed-off-by: Paolo Bonzini +(cherry picked from commit cdec2b753b487d9e8aab028231c35d87789ea083) +Signed-off-by: Paul Lai +--- + target/i386/cpu.h | 9 +++++++++ + target/i386/kvm/kvm.c | 18 +++++++++++++++++ + target/i386/machine.c | 46 +++++++++++++++++++++++++++++++++++++++++++ + 3 files changed, 73 insertions(+) + +diff --git a/target/i386/cpu.h b/target/i386/cpu.h +index 14a3501b87..8ab2a4042a 100644 +--- a/target/i386/cpu.h ++++ b/target/i386/cpu.h +@@ -505,6 +505,9 @@ typedef enum X86Seg { + + #define MSR_VM_HSAVE_PA 0xc0010117 + ++#define MSR_IA32_XFD 0x000001c4 ++#define MSR_IA32_XFD_ERR 0x000001c5 ++ + #define MSR_IA32_BNDCFGS 0x00000d90 + #define MSR_IA32_XSS 0x00000da0 + #define MSR_IA32_UMWAIT_CONTROL 0xe1 +@@ -870,6 +873,8 @@ typedef uint64_t FeatureWordArray[FEATURE_WORDS]; + #define CPUID_7_1_EAX_AVX_VNNI (1U << 4) + /* AVX512 BFloat16 Instruction */ + #define CPUID_7_1_EAX_AVX512_BF16 (1U << 5) ++/* XFD Extend Feature Disabled */ ++#define CPUID_D_1_EAX_XFD (1U << 4) + + /* Packets which contain IP payload have LIP values */ + #define CPUID_14_0_ECX_LIP (1U << 31) +@@ -1610,6 +1615,10 @@ typedef struct CPUX86State { + uint64_t msr_rtit_cr3_match; + uint64_t msr_rtit_addrs[MAX_RTIT_ADDRS]; + ++ /* Per-VCPU XFD MSRs */ ++ uint64_t msr_xfd; ++ uint64_t msr_xfd_err; ++ + /* exception/interrupt handling */ + int error_code; + int exception_is_int; +diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c +index d3d476df27..b1128b0e07 100644 +--- a/target/i386/kvm/kvm.c ++++ b/target/i386/kvm/kvm.c +@@ -3219,6 +3219,13 @@ static int kvm_put_msrs(X86CPU *cpu, int level) + env->msr_ia32_sgxlepubkeyhash[3]); + } + ++ if (env->features[FEAT_XSAVE] & CPUID_D_1_EAX_XFD) { ++ kvm_msr_entry_add(cpu, MSR_IA32_XFD, ++ env->msr_xfd); ++ kvm_msr_entry_add(cpu, MSR_IA32_XFD_ERR, ++ env->msr_xfd_err); ++ } ++ + /* Note: MSR_IA32_FEATURE_CONTROL is written separately, see + * kvm_put_msr_feature_control. */ + } +@@ -3571,6 +3578,11 @@ static int kvm_get_msrs(X86CPU *cpu) + kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH3, 0); + } + ++ if (env->features[FEAT_XSAVE] & CPUID_D_1_EAX_XFD) { ++ kvm_msr_entry_add(cpu, MSR_IA32_XFD, 0); ++ kvm_msr_entry_add(cpu, MSR_IA32_XFD_ERR, 0); ++ } ++ + ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_MSRS, cpu->kvm_msr_buf); + if (ret < 0) { + return ret; +@@ -3870,6 +3882,12 @@ static int kvm_get_msrs(X86CPU *cpu) + env->msr_ia32_sgxlepubkeyhash[index - MSR_IA32_SGXLEPUBKEYHASH0] = + msrs[i].data; + break; ++ case MSR_IA32_XFD: ++ env->msr_xfd = msrs[i].data; ++ break; ++ case MSR_IA32_XFD_ERR: ++ env->msr_xfd_err = msrs[i].data; ++ break; + } + } + +diff --git a/target/i386/machine.c b/target/i386/machine.c +index 83c2b91529..3977e9d8f8 100644 +--- a/target/i386/machine.c ++++ b/target/i386/machine.c +@@ -1455,6 +1455,48 @@ static const VMStateDescription vmstate_msr_intel_sgx = { + } + }; + ++static bool xfd_msrs_needed(void *opaque) ++{ ++ X86CPU *cpu = opaque; ++ CPUX86State *env = &cpu->env; ++ ++ return !!(env->features[FEAT_XSAVE] & CPUID_D_1_EAX_XFD); ++} ++ ++static const VMStateDescription vmstate_msr_xfd = { ++ .name = "cpu/msr_xfd", ++ .version_id = 1, ++ .minimum_version_id = 1, ++ .needed = xfd_msrs_needed, ++ .fields = (VMStateField[]) { ++ VMSTATE_UINT64(env.msr_xfd, X86CPU), ++ VMSTATE_UINT64(env.msr_xfd_err, X86CPU), ++ VMSTATE_END_OF_LIST() ++ } ++}; ++ ++#ifdef TARGET_X86_64 ++static bool amx_xtile_needed(void *opaque) ++{ ++ X86CPU *cpu = opaque; ++ CPUX86State *env = &cpu->env; ++ ++ return !!(env->features[FEAT_7_0_EDX] & CPUID_7_0_EDX_AMX_TILE); ++} ++ ++static const VMStateDescription vmstate_amx_xtile = { ++ .name = "cpu/intel_amx_xtile", ++ .version_id = 1, ++ .minimum_version_id = 1, ++ .needed = amx_xtile_needed, ++ .fields = (VMStateField[]) { ++ VMSTATE_UINT8_ARRAY(env.xtilecfg, X86CPU, 64), ++ VMSTATE_UINT8_ARRAY(env.xtiledata, X86CPU, 8192), ++ VMSTATE_END_OF_LIST() ++ } ++}; ++#endif ++ + const VMStateDescription vmstate_x86_cpu = { + .name = "cpu", + .version_id = 12, +@@ -1593,6 +1635,10 @@ const VMStateDescription vmstate_x86_cpu = { + #endif + &vmstate_msr_tsx_ctrl, + &vmstate_msr_intel_sgx, ++ &vmstate_msr_xfd, ++#ifdef TARGET_X86_64 ++ &vmstate_amx_xtile, ++#endif + NULL + } + }; +-- +2.35.3 + diff --git a/SOURCES/kvm-x86-add-support-for-KVM_CAP_XSAVE2-and-AMX-state-mig.patch b/SOURCES/kvm-x86-add-support-for-KVM_CAP_XSAVE2-and-AMX-state-mig.patch new file mode 100644 index 0000000..13566b1 --- /dev/null +++ b/SOURCES/kvm-x86-add-support-for-KVM_CAP_XSAVE2-and-AMX-state-mig.patch @@ -0,0 +1,182 @@ +From 28cf1b55f346a9f56e84fa57921f5a28a99cd59b Mon Sep 17 00:00:00 2001 +From: Jing Liu +Date: Wed, 16 Feb 2022 22:04:32 -0800 +Subject: [PATCH 10/24] x86: add support for KVM_CAP_XSAVE2 and AMX state + migration + +RH-Author: Paul Lai +RH-MergeRequest: 176: Enable KVM AMX support +RH-Commit: [10/13] d584f455ba1ecd8a4a87f3470e6aac24ba9a1f5a +RH-Bugzilla: 1916415 +RH-Acked-by: Cornelia Huck +RH-Acked-by: Igor Mammedov +RH-Acked-by: Paolo Bonzini + +When dynamic xfeatures (e.g. AMX) are used by the guest, the xsave +area would be larger than 4KB. KVM_GET_XSAVE2 and KVM_SET_XSAVE +under KVM_CAP_XSAVE2 works with a xsave buffer larger than 4KB. +Always use the new ioctls under KVM_CAP_XSAVE2 when KVM supports it. + +Signed-off-by: Jing Liu +Signed-off-by: Zeng Guang +Signed-off-by: Wei Wang +Signed-off-by: Yang Zhong +Message-Id: <20220217060434.52460-7-yang.zhong@intel.com> +Signed-off-by: Paolo Bonzini +(cherry picked from commit e56dd3c70abb31893c61ac834109fa7a38841330) +Signed-off-by: Paul Lai +--- + target/i386/cpu.h | 4 ++++ + target/i386/kvm/kvm.c | 42 ++++++++++++++++++++++++-------------- + target/i386/xsave_helper.c | 28 +++++++++++++++++++++++++ + 3 files changed, 59 insertions(+), 15 deletions(-) + +diff --git a/target/i386/cpu.h b/target/i386/cpu.h +index f2bdef9c26..14a3501b87 100644 +--- a/target/i386/cpu.h ++++ b/target/i386/cpu.h +@@ -1522,6 +1522,10 @@ typedef struct CPUX86State { + uint64_t opmask_regs[NB_OPMASK_REGS]; + YMMReg zmmh_regs[CPU_NB_REGS]; + ZMMReg hi16_zmm_regs[CPU_NB_REGS]; ++#ifdef TARGET_X86_64 ++ uint8_t xtilecfg[64]; ++ uint8_t xtiledata[8192]; ++#endif + + /* sysenter registers */ + uint32_t sysenter_cs; +diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c +index a64a79d870..d3d476df27 100644 +--- a/target/i386/kvm/kvm.c ++++ b/target/i386/kvm/kvm.c +@@ -123,6 +123,7 @@ static uint32_t num_architectural_pmu_gp_counters; + static uint32_t num_architectural_pmu_fixed_counters; + + static int has_xsave; ++static int has_xsave2; + static int has_xcrs; + static int has_pit_state2; + static int has_exception_payload; +@@ -1585,6 +1586,26 @@ static Error *invtsc_mig_blocker; + + #define KVM_MAX_CPUID_ENTRIES 100 + ++static void kvm_init_xsave(CPUX86State *env) ++{ ++ if (has_xsave2) { ++ env->xsave_buf_len = QEMU_ALIGN_UP(has_xsave2, 4096); ++ } else if (has_xsave) { ++ env->xsave_buf_len = sizeof(struct kvm_xsave); ++ } else { ++ return; ++ } ++ ++ env->xsave_buf = qemu_memalign(4096, env->xsave_buf_len); ++ memset(env->xsave_buf, 0, env->xsave_buf_len); ++ /* ++ * The allocated storage must be large enough for all of the ++ * possible XSAVE state components. ++ */ ++ assert(kvm_arch_get_supported_cpuid(kvm_state, 0xd, 0, R_ECX) <= ++ env->xsave_buf_len); ++} ++ + int kvm_arch_init_vcpu(CPUState *cs) + { + struct { +@@ -1614,6 +1635,8 @@ int kvm_arch_init_vcpu(CPUState *cs) + + cpuid_i = 0; + ++ has_xsave2 = kvm_check_extension(cs->kvm_state, KVM_CAP_XSAVE2); ++ + r = kvm_arch_set_tsc_khz(cs); + if (r < 0) { + return r; +@@ -2003,19 +2026,7 @@ int kvm_arch_init_vcpu(CPUState *cs) + if (r) { + goto fail; + } +- +- if (has_xsave) { +- env->xsave_buf_len = sizeof(struct kvm_xsave); +- env->xsave_buf = qemu_memalign(4096, env->xsave_buf_len); +- memset(env->xsave_buf, 0, env->xsave_buf_len); +- +- /* +- * The allocated storage must be large enough for all of the +- * possible XSAVE state components. +- */ +- assert(kvm_arch_get_supported_cpuid(kvm_state, 0xd, 0, R_ECX) +- <= env->xsave_buf_len); +- } ++ kvm_init_xsave(env); + + max_nested_state_len = kvm_max_nested_state_length(); + if (max_nested_state_len > 0) { +@@ -3263,13 +3274,14 @@ static int kvm_get_xsave(X86CPU *cpu) + { + CPUX86State *env = &cpu->env; + void *xsave = env->xsave_buf; +- int ret; ++ int type, ret; + + if (!has_xsave) { + return kvm_get_fpu(cpu); + } + +- ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_XSAVE, xsave); ++ type = has_xsave2 ? KVM_GET_XSAVE2 : KVM_GET_XSAVE; ++ ret = kvm_vcpu_ioctl(CPU(cpu), type, xsave); + if (ret < 0) { + return ret; + } +diff --git a/target/i386/xsave_helper.c b/target/i386/xsave_helper.c +index ac61a96344..996e9f3bfe 100644 +--- a/target/i386/xsave_helper.c ++++ b/target/i386/xsave_helper.c +@@ -126,6 +126,20 @@ void x86_cpu_xsave_all_areas(X86CPU *cpu, void *buf, uint32_t buflen) + + memcpy(pkru, &env->pkru, sizeof(env->pkru)); + } ++ ++ e = &x86_ext_save_areas[XSTATE_XTILE_CFG_BIT]; ++ if (e->size && e->offset) { ++ XSaveXTILECFG *tilecfg = buf + e->offset; ++ ++ memcpy(tilecfg, &env->xtilecfg, sizeof(env->xtilecfg)); ++ } ++ ++ e = &x86_ext_save_areas[XSTATE_XTILE_DATA_BIT]; ++ if (e->size && e->offset && buflen >= e->size + e->offset) { ++ XSaveXTILEDATA *tiledata = buf + e->offset; ++ ++ memcpy(tiledata, &env->xtiledata, sizeof(env->xtiledata)); ++ } + #endif + } + +@@ -247,5 +261,19 @@ void x86_cpu_xrstor_all_areas(X86CPU *cpu, const void *buf, uint32_t buflen) + pkru = buf + e->offset; + memcpy(&env->pkru, pkru, sizeof(env->pkru)); + } ++ ++ e = &x86_ext_save_areas[XSTATE_XTILE_CFG_BIT]; ++ if (e->size && e->offset) { ++ const XSaveXTILECFG *tilecfg = buf + e->offset; ++ ++ memcpy(&env->xtilecfg, tilecfg, sizeof(env->xtilecfg)); ++ } ++ ++ e = &x86_ext_save_areas[XSTATE_XTILE_DATA_BIT]; ++ if (e->size && e->offset && buflen >= e->size + e->offset) { ++ const XSaveXTILEDATA *tiledata = buf + e->offset; ++ ++ memcpy(&env->xtiledata, tiledata, sizeof(env->xtiledata)); ++ } + #endif + } +-- +2.35.3 + diff --git a/SPECS/qemu-kvm.spec b/SPECS/qemu-kvm.spec index 66f14a2..918cd68 100644 --- a/SPECS/qemu-kvm.spec +++ b/SPECS/qemu-kvm.spec @@ -83,7 +83,7 @@ Obsoletes: %1-rhev <= %{epoch}:%{version}-%{release} Summary: QEMU is a machine emulator and virtualizer Name: qemu-kvm Version: 6.2.0 -Release: 12%{?rcrel}%{?dist} +Release: 20%{?rcrel}%{?dist}.1 # Epoch because we pushed a qemu-1.0 package. AIUI this can't ever be dropped Epoch: 15 License: GPLv2 and GPLv2+ and CC-BY @@ -257,6 +257,180 @@ Patch78: kvm-softmmu-physmem-Introduce-MemTxAttrs-memory-field-an.patch Patch79: kvm-display-qxl-render-fix-race-condition-in-qxl_cursor-.patch # For bz#2063262 - CVE-2022-26354 virt:rhel/qemu-kvm: QEMU: vhost-vsock: missing virtqueue detach on error can lead to memory leak [rhel-8] Patch80: kvm-vhost-vsock-detach-the-virqueue-element-in-case-of-e.patch +# For bz#2043830 - [IBM 8.7 FEAT] KVM: Allow long kernel command lines for QEMU +Patch81: kvm-s390x-ipl-support-extended-kernel-command-line-size.patch +# For bz#2063206 - CVE-2022-26353 virt:rhel/qemu-kvm: QEMU: virtio-net: map leaking on error during receive [rhel-8] +Patch82: kvm-virtio-net-fix-map-leaking-on-error-during-receive.patch +# For bz#1519071 - Fail to rebuild the reference count tables of qcow2 image on host block devices (e.g. LVs) +Patch83: kvm-qcow2-Improve-refcount-structure-rebuilding.patch +# For bz#1519071 - Fail to rebuild the reference count tables of qcow2 image on host block devices (e.g. LVs) +Patch84: kvm-iotests-108-Test-new-refcount-rebuild-algorithm.patch +# For bz#1519071 - Fail to rebuild the reference count tables of qcow2 image on host block devices (e.g. LVs) +Patch85: kvm-qcow2-Add-errp-to-rebuild_refcount_structure.patch +# For bz#1519071 - Fail to rebuild the reference count tables of qcow2 image on host block devices (e.g. LVs) +Patch86: kvm-iotests-108-Fix-when-missing-user_allow_other.patch +# For bz#2065043 - Remove upstream-only devices from the qemu-kvm binary +Patch87: kvm-Revert-redhat-Add-some-devices-for-exporting-upstrea.patch +# For bz#2070417 - Windows guest hangs after updating and restarting from the guest OS [rhel-8.7.0] +Patch88: kvm-target-i386-properly-reset-TSC-on-reset.patch +# For bz#2040734 - CVE-2021-4206 virt:rhel/qemu-kvm: QEMU: QXL: integer overflow in cursor_alloc() can lead to heap buffer overflow [rhel-8.7] +Patch89: kvm-ui-cursor-fix-integer-overflow-in-cursor_alloc-CVE-2.patch +# For bz#1916415 - [Intel 8.7 FEAT] qemu-kvm Sapphire Rapids (SPR) AMX Instructions +Patch90: kvm-virtio-gpu-do-not-byteswap-padding.patch +# For bz#1916415 - [Intel 8.7 FEAT] qemu-kvm Sapphire Rapids (SPR) AMX Instructions +Patch91: kvm-linux-headers-update-to-5.16-rc1.patch +# For bz#1916415 - [Intel 8.7 FEAT] qemu-kvm Sapphire Rapids (SPR) AMX Instructions +Patch92: kvm-linux-headers-Update-headers-to-v5.17-rc1.patch +# For bz#1916415 - [Intel 8.7 FEAT] qemu-kvm Sapphire Rapids (SPR) AMX Instructions +Patch93: kvm-linux-headers-include-missing-changes-from-5.17.patch +# For bz#1916415 - [Intel 8.7 FEAT] qemu-kvm Sapphire Rapids (SPR) AMX Instructions +Patch94: kvm-x86-Fix-the-64-byte-boundary-enumeration-for-extende.patch +# For bz#1916415 - [Intel 8.7 FEAT] qemu-kvm Sapphire Rapids (SPR) AMX Instructions +Patch95: kvm-x86-Add-AMX-XTILECFG-and-XTILEDATA-components.patch +# For bz#1916415 - [Intel 8.7 FEAT] qemu-kvm Sapphire Rapids (SPR) AMX Instructions +Patch96: kvm-x86-Grant-AMX-permission-for-guest.patch +# For bz#1916415 - [Intel 8.7 FEAT] qemu-kvm Sapphire Rapids (SPR) AMX Instructions +Patch97: kvm-x86-Add-XFD-faulting-bit-for-state-components.patch +# For bz#1916415 - [Intel 8.7 FEAT] qemu-kvm Sapphire Rapids (SPR) AMX Instructions +Patch98: kvm-x86-Add-AMX-CPUIDs-enumeration.patch +# For bz#1916415 - [Intel 8.7 FEAT] qemu-kvm Sapphire Rapids (SPR) AMX Instructions +Patch99: kvm-x86-add-support-for-KVM_CAP_XSAVE2-and-AMX-state-mig.patch +# For bz#1916415 - [Intel 8.7 FEAT] qemu-kvm Sapphire Rapids (SPR) AMX Instructions +Patch100: kvm-x86-Support-XFD-and-AMX-xsave-data-migration.patch +# For bz#1916415 - [Intel 8.7 FEAT] qemu-kvm Sapphire Rapids (SPR) AMX Instructions +Patch101: kvm-target-i386-kvm-do-not-access-uninitialized-variable.patch +# For bz#1916415 - [Intel 8.7 FEAT] qemu-kvm Sapphire Rapids (SPR) AMX Instructions +Patch102: kvm-KVM-x86-workaround-invalid-CPUID-0xD-9-info-on-some-.patch +# For bz#2069946 - PXE boot crash qemu when using multiqueue vDPA +Patch103: kvm-virtio-net-setup-vhost_dev-and-notifiers-for-cvq-onl.patch +# For bz#2069946 - PXE boot crash qemu when using multiqueue vDPA +Patch104: kvm-virtio-net-align-ctrl_vq-index-for-non-mq-guest-for-.patch +# For bz#2069946 - PXE boot crash qemu when using multiqueue vDPA +Patch105: kvm-vhost-vdpa-fix-improper-cleanup-in-net_init_vhost_vd.patch +# For bz#2069946 - PXE boot crash qemu when using multiqueue vDPA +Patch106: kvm-vhost-net-fix-improper-cleanup-in-vhost_net_start.patch +# For bz#2069946 - PXE boot crash qemu when using multiqueue vDPA +Patch107: kvm-vhost-vdpa-backend-feature-should-set-only-once.patch +# For bz#2069946 - PXE boot crash qemu when using multiqueue vDPA +Patch108: kvm-vhost-vdpa-change-name-and-polarity-for-vhost_vdpa_o.patch +# For bz#2069946 - PXE boot crash qemu when using multiqueue vDPA +Patch109: kvm-virtio-net-don-t-handle-mq-request-in-userspace-hand.patch +# For bz#2029980 - Failed assertion in IDE emulation with Ceph backend +Patch110: kvm-ide-Increment-BB-in-flight-counter-for-TRIM-BH.patch +# For bz#2072932 - Qemu coredump when refreshing block limits on an actively used iothread block device [rhel.8.7] +Patch111: kvm-block-Make-bdrv_refresh_limits-non-recursive.patch +# For bz#2072932 - Qemu coredump when refreshing block limits on an actively used iothread block device [rhel.8.7] +Patch112: kvm-iotests-Allow-using-QMP-with-the-QSD.patch +# For bz#2072932 - Qemu coredump when refreshing block limits on an actively used iothread block device [rhel.8.7] +Patch113: kvm-iotests-graph-changes-while-io-New-test.patch +# For bz#2097209 - [virtiofs] mount virtiofs failed: SELinux: (dev virtiofs, type virtiofs) getxattr errno 111 +Patch114: kvm-virtiofsd-Fix-breakage-due-to-fuse_init_in-size-chan.patch +# For bz#1951521 - CVE-2021-3507 virt:rhel/qemu-kvm: QEMU: fdc: heap buffer overflow in DMA read data transfers [rhel-8] +Patch115: kvm-hw-block-fdc-Prevent-end-of-track-overrun-CVE-2021-3.patch +# For bz#1951521 - CVE-2021-3507 virt:rhel/qemu-kvm: QEMU: fdc: heap buffer overflow in DMA read data transfers [rhel-8] +Patch116: kvm-tests-qtest-fdc-test-Add-a-regression-test-for-CVE-2.patch +# For bz#2072049 - Pull MSG_ZEROCOPY on QEMU Live Migration Patches into RHEL 8 +Patch117: kvm-migration-Never-call-twice-qemu_target_page_size.patch +# For bz#2072049 - Pull MSG_ZEROCOPY on QEMU Live Migration Patches into RHEL 8 +Patch118: kvm-multifd-Rename-used-field-to-num.patch +# For bz#2072049 - Pull MSG_ZEROCOPY on QEMU Live Migration Patches into RHEL 8 +Patch119: kvm-multifd-Add-missing-documentation.patch +# For bz#2072049 - Pull MSG_ZEROCOPY on QEMU Live Migration Patches into RHEL 8 +Patch120: kvm-multifd-The-variable-is-only-used-inside-the-loop.patch +# For bz#2072049 - Pull MSG_ZEROCOPY on QEMU Live Migration Patches into RHEL 8 +Patch121: kvm-multifd-remove-used-parameter-from-send_prepare-meth.patch +# For bz#2072049 - Pull MSG_ZEROCOPY on QEMU Live Migration Patches into RHEL 8 +Patch122: kvm-multifd-remove-used-parameter-from-send_recv_pages-m.patch +# For bz#2072049 - Pull MSG_ZEROCOPY on QEMU Live Migration Patches into RHEL 8 +Patch123: kvm-multifd-Fill-offset-and-block-for-reception.patch +# For bz#2072049 - Pull MSG_ZEROCOPY on QEMU Live Migration Patches into RHEL 8 +Patch124: kvm-multifd-Make-zstd-compression-method-not-use-iovs.patch +# For bz#2072049 - Pull MSG_ZEROCOPY on QEMU Live Migration Patches into RHEL 8 +Patch125: kvm-multifd-Make-zlib-compression-method-not-use-iovs.patch +# For bz#2072049 - Pull MSG_ZEROCOPY on QEMU Live Migration Patches into RHEL 8 +Patch126: kvm-migration-All-this-fields-are-unsigned.patch +# For bz#2072049 - Pull MSG_ZEROCOPY on QEMU Live Migration Patches into RHEL 8 +Patch127: kvm-multifd-Move-iov-from-pages-to-params.patch +# For bz#2072049 - Pull MSG_ZEROCOPY on QEMU Live Migration Patches into RHEL 8 +Patch128: kvm-multifd-Make-zlib-use-iov-s.patch +# For bz#2072049 - Pull MSG_ZEROCOPY on QEMU Live Migration Patches into RHEL 8 +Patch129: kvm-multifd-Make-zstd-use-iov-s.patch +# For bz#2072049 - Pull MSG_ZEROCOPY on QEMU Live Migration Patches into RHEL 8 +Patch130: kvm-multifd-Remove-send_write-method.patch +# For bz#2072049 - Pull MSG_ZEROCOPY on QEMU Live Migration Patches into RHEL 8 +Patch131: kvm-multifd-Use-a-single-writev-on-the-send-side.patch +# For bz#2072049 - Pull MSG_ZEROCOPY on QEMU Live Migration Patches into RHEL 8 +Patch132: kvm-multifd-Use-normal-pages-array-on-the-send-side.patch +# For bz#2072049 - Pull MSG_ZEROCOPY on QEMU Live Migration Patches into RHEL 8 +Patch133: kvm-QIOChannel-Add-flags-on-io_writev-and-introduce-io_f.patch +# For bz#2072049 - Pull MSG_ZEROCOPY on QEMU Live Migration Patches into RHEL 8 +Patch134: kvm-QIOChannelSocket-Implement-io_writev-zero-copy-flag-.patch +# For bz#2072049 - Pull MSG_ZEROCOPY on QEMU Live Migration Patches into RHEL 8 +Patch135: kvm-migration-Add-zero-copy-send-parameter-for-QMP-HMP-f.patch +# For bz#2072049 - Pull MSG_ZEROCOPY on QEMU Live Migration Patches into RHEL 8 +Patch136: kvm-migration-Add-migrate_use_tls-helper.patch +# For bz#2072049 - Pull MSG_ZEROCOPY on QEMU Live Migration Patches into RHEL 8 +Patch137: kvm-multifd-multifd_send_sync_main-now-returns-negative-.patch +# For bz#2072049 - Pull MSG_ZEROCOPY on QEMU Live Migration Patches into RHEL 8 +Patch138: kvm-multifd-Send-header-packet-without-flags-if-zero-cop.patch +# For bz#2072049 - Pull MSG_ZEROCOPY on QEMU Live Migration Patches into RHEL 8 +Patch139: kvm-multifd-Implement-zero-copy-write-in-multifd-migrati.patch +# For bz#2072049 - Pull MSG_ZEROCOPY on QEMU Live Migration Patches into RHEL 8 +Patch140: kvm-QIOChannelSocket-Introduce-assert-and-reduce-ifdefs-.patch +# For bz#2072049 - Pull MSG_ZEROCOPY on QEMU Live Migration Patches into RHEL 8 +Patch141: kvm-QIOChannelSocket-Fix-zero-copy-send-so-socket-flush-.patch +# For bz#2072049 - Pull MSG_ZEROCOPY on QEMU Live Migration Patches into RHEL 8 +Patch142: kvm-migration-Change-zero_copy_send-from-migration-param.patch +# For bz#2097652 - The migration port is not released if use it again for recovering postcopy migration +Patch143: kvm-migration-Add-migration_incoming_transport_cleanup.patch +# For bz#2097652 - The migration port is not released if use it again for recovering postcopy migration +Patch144: kvm-migration-Allow-migrate-recover-to-run-multiple-time.patch +# For bz#2098076 - virtio-blk: Can't boot fresh installation from used virtio-blk dasd disk under certain conditions +Patch145: kvm-pc-bios-s390-ccw-virtio-Introduce-a-macro-for-the-DA.patch +# For bz#2098076 - virtio-blk: Can't boot fresh installation from used virtio-blk dasd disk under certain conditions +Patch146: kvm-pc-bios-s390-ccw-bootmap-Improve-the-guessing-logic-.patch +# For bz#2098076 - virtio-blk: Can't boot fresh installation from used virtio-blk dasd disk under certain conditions +Patch147: kvm-pc-bios-s390-ccw-virtio-blkdev-Simplify-fix-virtio_i.patch +# For bz#2098076 - virtio-blk: Can't boot fresh installation from used virtio-blk dasd disk under certain conditions +Patch148: kvm-pc-bios-s390-ccw-virtio-blkdev-Remove-virtio_assume_.patch +# For bz#2098076 - virtio-blk: Can't boot fresh installation from used virtio-blk dasd disk under certain conditions +Patch149: kvm-pc-bios-s390-ccw-virtio-Set-missing-status-bits-whil.patch +# For bz#2098076 - virtio-blk: Can't boot fresh installation from used virtio-blk dasd disk under certain conditions +Patch150: kvm-pc-bios-s390-ccw-virtio-Read-device-config-after-fea.patch +# For bz#2098076 - virtio-blk: Can't boot fresh installation from used virtio-blk dasd disk under certain conditions +Patch151: kvm-pc-bios-s390-ccw-virtio-Beautify-the-code-for-readin.patch +# For bz#2098076 - virtio-blk: Can't boot fresh installation from used virtio-blk dasd disk under certain conditions +Patch152: kvm-pc-bios-s390-ccw-Split-virtio-scsi-code-from-virtio_.patch +# For bz#2098076 - virtio-blk: Can't boot fresh installation from used virtio-blk dasd disk under certain conditions +Patch153: kvm-pc-bios-s390-ccw-virtio-blkdev-Request-the-right-fea.patch +# For bz#2105410 - Stalled IO Operations in VM +Patch154: kvm-linux-aio-fix-unbalanced-plugged-counter-in-laio_io_.patch +# For bz#2105410 - Stalled IO Operations in VM +Patch155: kvm-linux-aio-explain-why-max-batch-is-checked-in-laio_i.patch +# For bz#2110203 - zerocopy capability can be enabled when set migrate capabilities with multifd and compress/xbzrle together +Patch156: kvm-migration-Introduce-ram_transferred_add.patch +# For bz#2110203 - zerocopy capability can be enabled when set migrate capabilities with multifd and compress/xbzrle together +Patch157: kvm-migration-Tally-pre-copy-downtime-and-post-copy-byte.patch +# For bz#2110203 - zerocopy capability can be enabled when set migrate capabilities with multifd and compress/xbzrle together +Patch158: kvm-QIOChannelSocket-Fix-zero-copy-flush-returning-code-.patch +# For bz#2110203 - zerocopy capability can be enabled when set migrate capabilities with multifd and compress/xbzrle together +Patch159: kvm-Add-dirty-sync-missed-zero-copy-migration-stat.patch +# For bz#2110203 - zerocopy capability can be enabled when set migrate capabilities with multifd and compress/xbzrle together +Patch160: kvm-migration-multifd-Report-to-user-when-zerocopy-not-w.patch +# For bz#2110203 - zerocopy capability can be enabled when set migrate capabilities with multifd and compress/xbzrle together +Patch161: kvm-migration-Avoid-false-positive-on-non-supported-scen.patch +# For bz#2110203 - zerocopy capability can be enabled when set migrate capabilities with multifd and compress/xbzrle together +Patch162: kvm-migration-add-remaining-params-has_-true-in-migratio.patch +# For bz#2110203 - zerocopy capability can be enabled when set migrate capabilities with multifd and compress/xbzrle together +Patch163: kvm-QIOChannelSocket-Add-support-for-MSG_ZEROCOPY-IPV6.patch +# For bz#2112296 - virtio-blk: Can't boot fresh installation from used 512 cluster_size image under certain conditions +Patch164: kvm-pc-bios-s390-ccw-Fix-booting-with-logical-block-size.patch +# For bz#2120279 - Wrong max_sectors_kb and Maximum transfer length on the pass-through device [rhel-8.7] +Patch165: kvm-scsi-generic-Fix-emulated-block-limits-VPD-page.patch +# For bz#2116743 - [RHEL8.7] Guests in VMX root operation fail to reboot with QEMU's 'system_reset' command +Patch166: kvm-i386-reset-KVM-nested-state-upon-CPU-reset.patch +# For bz#2116743 - [RHEL8.7] Guests in VMX root operation fail to reboot with QEMU's 'system_reset' command +Patch167: kvm-i386-do-kvm_put_msr_feature_control-first-thing-when.patch BuildRequires: wget BuildRequires: rpm-build @@ -1426,6 +1600,151 @@ sh %{_sysconfdir}/sysconfig/modules/kvm.modules &> /dev/null || : %changelog +* Wed Sep 14 2022 Miroslav Rezanina - 6.2.0-20.el8.1 +- kvm-i386-reset-KVM-nested-state-upon-CPU-reset.patch [bz#2116743] +- kvm-i386-do-kvm_put_msr_feature_control-first-thing-when.patch [bz#2116743] +- Resolves: bz#2116743 + ([RHEL8.7] Guests in VMX root operation fail to reboot with QEMU's 'system_reset' command) + +* Fri Aug 26 2022 Jon Maloy - 6.2.0-20 +- kvm-scsi-generic-Fix-emulated-block-limits-VPD-page.patch [bz#2120279] +- Resolves: bz#2120279 + (Wrong max_sectors_kb and Maximum transfer length on the pass-through device [rhel-8.7]) + +* Tue Aug 16 2022 Miroslav Rezanina - 6.2.0-19 +- kvm-migration-Introduce-ram_transferred_add.patch [bz#2110203] +- kvm-migration-Tally-pre-copy-downtime-and-post-copy-byte.patch [bz#2110203] +- kvm-QIOChannelSocket-Fix-zero-copy-flush-returning-code-.patch [bz#2110203] +- kvm-Add-dirty-sync-missed-zero-copy-migration-stat.patch [bz#2110203] +- kvm-migration-multifd-Report-to-user-when-zerocopy-not-w.patch [bz#2110203] +- kvm-migration-Avoid-false-positive-on-non-supported-scen.patch [bz#2110203] +- kvm-migration-add-remaining-params-has_-true-in-migratio.patch [bz#2110203] +- kvm-QIOChannelSocket-Add-support-for-MSG_ZEROCOPY-IPV6.patch [bz#2110203] +- kvm-pc-bios-s390-ccw-Fix-booting-with-logical-block-size.patch [bz#2112296] +- Resolves: bz#2110203 + (zerocopy capability can be enabled when set migrate capabilities with multifd and compress/xbzrle together) +- Resolves: bz#2112296 + (virtio-blk: Can't boot fresh installation from used 512 cluster_size image under certain conditions) + +* Tue Jul 19 2022 Camilla Conte - 6.2.0-18 +- kvm-linux-aio-fix-unbalanced-plugged-counter-in-laio_io_.patch [bz#2105410] +- kvm-linux-aio-explain-why-max-batch-is-checked-in-laio_i.patch [bz#2105410] +- Resolves: bz#2105410 + (Stalled IO Operations in VM) + +* Tue Jul 12 2022 Camilla Conte - 6.2.0-17 +- kvm-migration-Never-call-twice-qemu_target_page_size.patch [bz#2072049] +- kvm-multifd-Rename-used-field-to-num.patch [bz#2072049] +- kvm-multifd-Add-missing-documentation.patch [bz#2072049] +- kvm-multifd-The-variable-is-only-used-inside-the-loop.patch [bz#2072049] +- kvm-multifd-remove-used-parameter-from-send_prepare-meth.patch [bz#2072049] +- kvm-multifd-remove-used-parameter-from-send_recv_pages-m.patch [bz#2072049] +- kvm-multifd-Fill-offset-and-block-for-reception.patch [bz#2072049] +- kvm-multifd-Make-zstd-compression-method-not-use-iovs.patch [bz#2072049] +- kvm-multifd-Make-zlib-compression-method-not-use-iovs.patch [bz#2072049] +- kvm-migration-All-this-fields-are-unsigned.patch [bz#2072049] +- kvm-multifd-Move-iov-from-pages-to-params.patch [bz#2072049] +- kvm-multifd-Make-zlib-use-iov-s.patch [bz#2072049] +- kvm-multifd-Make-zstd-use-iov-s.patch [bz#2072049] +- kvm-multifd-Remove-send_write-method.patch [bz#2072049] +- kvm-multifd-Use-a-single-writev-on-the-send-side.patch [bz#2072049] +- kvm-multifd-Use-normal-pages-array-on-the-send-side.patch [bz#2072049] +- kvm-QIOChannel-Add-flags-on-io_writev-and-introduce-io_f.patch [bz#2072049] +- kvm-QIOChannelSocket-Implement-io_writev-zero-copy-flag-.patch [bz#2072049] +- kvm-migration-Add-zero-copy-send-parameter-for-QMP-HMP-f.patch [bz#2072049] +- kvm-migration-Add-migrate_use_tls-helper.patch [bz#2072049] +- kvm-multifd-multifd_send_sync_main-now-returns-negative-.patch [bz#2072049] +- kvm-multifd-Send-header-packet-without-flags-if-zero-cop.patch [bz#2072049] +- kvm-multifd-Implement-zero-copy-write-in-multifd-migrati.patch [bz#2072049] +- kvm-QIOChannelSocket-Introduce-assert-and-reduce-ifdefs-.patch [bz#2072049] +- kvm-QIOChannelSocket-Fix-zero-copy-send-so-socket-flush-.patch [bz#2072049] +- kvm-migration-Change-zero_copy_send-from-migration-param.patch [bz#2072049] +- kvm-migration-Add-migration_incoming_transport_cleanup.patch [bz#2097652] +- kvm-migration-Allow-migrate-recover-to-run-multiple-time.patch [bz#2097652] +- kvm-pc-bios-s390-ccw-virtio-Introduce-a-macro-for-the-DA.patch [bz#2098076] +- kvm-pc-bios-s390-ccw-bootmap-Improve-the-guessing-logic-.patch [bz#2098076] +- kvm-pc-bios-s390-ccw-virtio-blkdev-Simplify-fix-virtio_i.patch [bz#2098076] +- kvm-pc-bios-s390-ccw-virtio-blkdev-Remove-virtio_assume_.patch [bz#2098076] +- kvm-pc-bios-s390-ccw-virtio-Set-missing-status-bits-whil.patch [bz#2098076] +- kvm-pc-bios-s390-ccw-virtio-Read-device-config-after-fea.patch [bz#2098076] +- kvm-pc-bios-s390-ccw-virtio-Beautify-the-code-for-readin.patch [bz#2098076] +- kvm-pc-bios-s390-ccw-Split-virtio-scsi-code-from-virtio_.patch [bz#2098076] +- kvm-pc-bios-s390-ccw-virtio-blkdev-Request-the-right-fea.patch [bz#2098076] +- Resolves: bz#2072049 + (Pull MSG_ZEROCOPY on QEMU Live Migration Patches into RHEL 8) +- Resolves: bz#2097652 + (The migration port is not released if use it again for recovering postcopy migration) +- Resolves: bz#2098076 + (virtio-blk: Can't boot fresh installation from used virtio-blk dasd disk under certain conditions) + +* Thu Jun 23 2022 Jon Maloy - 6.2.0-16 +- kvm-virtiofsd-Fix-breakage-due-to-fuse_init_in-size-chan.patch [bz#2097209] +- kvm-hw-block-fdc-Prevent-end-of-track-overrun-CVE-2021-3.patch [bz#1951521] +- kvm-tests-qtest-fdc-test-Add-a-regression-test-for-CVE-2.patch [bz#1951521] +- Resolves: bz#2097209 + ([virtiofs] mount virtiofs failed: SELinux: (dev virtiofs, type virtiofs) getxattr errno 111) +- Resolves: bz#1951521 + (CVE-2021-3507 virt:rhel/qemu-kvm: QEMU: fdc: heap buffer overflow in DMA read data transfers [rhel-8]) + +* Tue Jun 14 2022 Jon Maloy - 6.2.0-15 +- kvm-virtio-gpu-do-not-byteswap-padding.patch [bz#1916415] +- kvm-linux-headers-update-to-5.16-rc1.patch [bz#1916415] +- kvm-linux-headers-Update-headers-to-v5.17-rc1.patch [bz#1916415] +- kvm-linux-headers-include-missing-changes-from-5.17.patch [bz#1916415] +- kvm-x86-Fix-the-64-byte-boundary-enumeration-for-extende.patch [bz#1916415] +- kvm-x86-Add-AMX-XTILECFG-and-XTILEDATA-components.patch [bz#1916415] +- kvm-x86-Grant-AMX-permission-for-guest.patch [bz#1916415] +- kvm-x86-Add-XFD-faulting-bit-for-state-components.patch [bz#1916415] +- kvm-x86-Add-AMX-CPUIDs-enumeration.patch [bz#1916415] +- kvm-x86-add-support-for-KVM_CAP_XSAVE2-and-AMX-state-mig.patch [bz#1916415] +- kvm-x86-Support-XFD-and-AMX-xsave-data-migration.patch [bz#1916415] +- kvm-target-i386-kvm-do-not-access-uninitialized-variable.patch [bz#1916415] +- kvm-KVM-x86-workaround-invalid-CPUID-0xD-9-info-on-some-.patch [bz#1916415] +- kvm-virtio-net-setup-vhost_dev-and-notifiers-for-cvq-onl.patch [bz#2069946] +- kvm-virtio-net-align-ctrl_vq-index-for-non-mq-guest-for-.patch [bz#2069946] +- kvm-vhost-vdpa-fix-improper-cleanup-in-net_init_vhost_vd.patch [bz#2069946] +- kvm-vhost-net-fix-improper-cleanup-in-vhost_net_start.patch [bz#2069946] +- kvm-vhost-vdpa-backend-feature-should-set-only-once.patch [bz#2069946] +- kvm-vhost-vdpa-change-name-and-polarity-for-vhost_vdpa_o.patch [bz#2069946] +- kvm-virtio-net-don-t-handle-mq-request-in-userspace-hand.patch [bz#2069946] +- kvm-ide-Increment-BB-in-flight-counter-for-TRIM-BH.patch [bz#2029980] +- kvm-block-Make-bdrv_refresh_limits-non-recursive.patch [bz#2072932] +- kvm-iotests-Allow-using-QMP-with-the-QSD.patch [bz#2072932] +- kvm-iotests-graph-changes-while-io-New-test.patch [bz#2072932] +- Resolves: bz#1916415 + ([Intel 8.7 FEAT] qemu-kvm Sapphire Rapids (SPR) AMX Instructions) +- Resolves: bz#2069946 + (PXE boot crash qemu when using multiqueue vDPA) +- Resolves: bz#2029980 + (Failed assertion in IDE emulation with Ceph backend) +- Resolves: bz#2072932 + (Qemu coredump when refreshing block limits on an actively used iothread block device [rhel.8.7]) + +* Thu May 19 2022 Jon Maloy - 6.2.0-14 +- kvm-Revert-redhat-Add-some-devices-for-exporting-upstrea.patch [bz#2065043] +- kvm-target-i386-properly-reset-TSC-on-reset.patch [bz#2070417] +- kvm-ui-cursor-fix-integer-overflow-in-cursor_alloc-CVE-2.patch [bz#2040734] +- Resolves: bz#2065043 + (Remove upstream-only devices from the qemu-kvm binary) +- Resolves: bz#2070417 + (Windows guest hangs after updating and restarting from the guest OS [rhel-8.7.0]) +- Resolves: bz#2040734 + (CVE-2021-4206 virt:rhel/qemu-kvm: QEMU: QXL: integer overflow in cursor_alloc() can lead to heap buffer overflow [rhel-8.7]) + +* Tue May 03 2022 Jon Maloy - 6.2.0-13 +- kvm-s390x-ipl-support-extended-kernel-command-line-size.patch [bz#2043830] +- kvm-virtio-net-fix-map-leaking-on-error-during-receive.patch [bz#2063206] +- kvm-qcow2-Improve-refcount-structure-rebuilding.patch [bz#1519071] +- kvm-iotests-108-Test-new-refcount-rebuild-algorithm.patch [bz#1519071] +- kvm-qcow2-Add-errp-to-rebuild_refcount_structure.patch [bz#1519071] +- kvm-iotests-108-Fix-when-missing-user_allow_other.patch [bz#1519071] +- Resolves: bz#2043830 + ([IBM 8.7 FEAT] KVM: Allow long kernel command lines for QEMU) +- Resolves: bz#2063206 + (CVE-2022-26353 virt:rhel/qemu-kvm: QEMU: virtio-net: map leaking on error during receive [rhel-8]) +- Resolves: bz#1519071 + (Fail to rebuild the reference count tables of qcow2 image on host block devices (e.g. LVs)) + * Thu Apr 21 2022 Jon Maloy - 6.2.0-12 - kvm-display-qxl-render-fix-race-condition-in-qxl_cursor-.patch [bz#2040738] - kvm-vhost-vsock-detach-the-virqueue-element-in-case-of-e.patch [bz#2063262]