Blame SOURCES/0001-vhost-improve-dirty-pages-logging-performance.patch

c7ffa4
From 4d8b1e6aa5d7ecfc1d2ee606b4bd838b4f1ac9d2 Mon Sep 17 00:00:00 2001
c7ffa4
From: Maxime Coquelin <maxime.coquelin@redhat.com>
c7ffa4
Date: Thu, 17 May 2018 13:44:47 +0200
c7ffa4
Subject: [PATCH] vhost: improve dirty pages logging performance
c7ffa4
c7ffa4
[ upstream commit c16915b8710911a75f0fbdb1aa5243f4cdfaf26a ]
c7ffa4
c7ffa4
This patch caches all dirty pages logging until the used ring index
c7ffa4
is updated.
c7ffa4
c7ffa4
The goal of this optimization is to fix a performance regression
c7ffa4
introduced when the vhost library started to use atomic operations
c7ffa4
to set bits in the shared dirty log map. While the fix was valid
c7ffa4
as the previous implementation wasn't safe against concurrent accesses,
c7ffa4
contention was induced.
c7ffa4
c7ffa4
With this patch, during migration, we have:
c7ffa4
1. Fewer atomic operations as only a single atomic OR operation
c7ffa4
per 32 or 64 (depending on CPU) pages.
c7ffa4
2. Fewer atomic operations as during a burst, the same page will
c7ffa4
be marked dirty only once.
c7ffa4
3. Fewer write memory barriers.
c7ffa4
c7ffa4
Fixes: 897f13a1f726 ("vhost: make page logging atomic")
c7ffa4
Cc: stable@dpdk.org
c7ffa4
c7ffa4
Suggested-by: Michael S. Tsirkin <mst@redhat.com>
c7ffa4
Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
c7ffa4
Reviewed-by: Tiwei Bie <tiwei.bie@intel.com>
c7ffa4
---
c7ffa4
 lib/librte_vhost/vhost.h      | 119 +++++++++++++++++++++++++++++++++++++++++-
c7ffa4
 lib/librte_vhost/virtio_net.c |  29 ++++++----
c7ffa4
 2 files changed, 137 insertions(+), 11 deletions(-)
c7ffa4
c7ffa4
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
c7ffa4
index 16d6b8913..42c6a3a75 100644
c7ffa4
--- a/lib/librte_vhost/vhost.h
c7ffa4
+++ b/lib/librte_vhost/vhost.h
c7ffa4
@@ -59,6 +59,8 @@
c7ffa4
 
c7ffa4
 #define BUF_VECTOR_MAX 256
c7ffa4
 
c7ffa4
+#define VHOST_LOG_CACHE_NR 32
c7ffa4
+
c7ffa4
 /**
c7ffa4
  * Structure contains buffer address, length and descriptor index
c7ffa4
  * from vring to do scatter RX.
c7ffa4
@@ -92,6 +94,14 @@ struct batch_copy_elem {
c7ffa4
 	uint64_t log_addr;
c7ffa4
 };
c7ffa4
 
c7ffa4
+/*
c7ffa4
+ * Structure that contains the info for batched dirty logging.
c7ffa4
+ */
c7ffa4
+struct log_cache_entry {
c7ffa4
+	uint32_t offset;
c7ffa4
+	unsigned long val;
c7ffa4
+};
c7ffa4
+
c7ffa4
 /**
c7ffa4
  * Structure contains variables relevant to RX/TX virtqueues.
c7ffa4
  */
c7ffa4
@@ -133,6 +143,9 @@ struct vhost_virtqueue {
c7ffa4
 	struct batch_copy_elem	*batch_copy_elems;
c7ffa4
 	uint16_t		batch_copy_nb_elems;
c7ffa4
 
c7ffa4
+	struct log_cache_entry log_cache[VHOST_LOG_CACHE_NR];
c7ffa4
+	uint16_t log_cache_nb_elem;
c7ffa4
+
c7ffa4
 	rte_rwlock_t	iotlb_lock;
c7ffa4
 	rte_rwlock_t	iotlb_pending_lock;
c7ffa4
 	struct rte_mempool *iotlb_pool;
c7ffa4
@@ -266,7 +279,15 @@ struct virtio_net {
c7ffa4
 static __rte_always_inline void
c7ffa4
 vhost_set_bit(unsigned int nr, volatile uint8_t *addr)
c7ffa4
 {
c7ffa4
-	__sync_fetch_and_or_8(addr, (1U << nr));
c7ffa4
+#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 70100)
c7ffa4
+	/*
c7ffa4
+	 * __sync_ built-ins are deprecated, but __atomic_ ones
c7ffa4
+	 * are sub-optimized in older GCC versions.
c7ffa4
+	 */
c7ffa4
+	__sync_fetch_and_or_1(addr, (1U << nr));
c7ffa4
+#else
c7ffa4
+	__atomic_fetch_or(addr, (1U << nr), __ATOMIC_RELAXED);
c7ffa4
+#endif
c7ffa4
 }
c7ffa4
 
c7ffa4
 static __rte_always_inline void
c7ffa4
@@ -297,6 +318,102 @@ vhost_log_write(struct virtio_net *dev, uint64_t addr, uint64_t len)
c7ffa4
 	}
c7ffa4
 }
c7ffa4
 
c7ffa4
+static __rte_always_inline void
c7ffa4
+vhost_log_cache_sync(struct virtio_net *dev, struct vhost_virtqueue *vq)
c7ffa4
+{
c7ffa4
+	unsigned long *log_base;
c7ffa4
+	int i;
c7ffa4
+
c7ffa4
+	if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) ||
c7ffa4
+		   !dev->log_base))
c7ffa4
+		return;
c7ffa4
+
c7ffa4
+	log_base = (unsigned long *)(uintptr_t)dev->log_base;
c7ffa4
+
c7ffa4
+	/*
c7ffa4
+	 * It is expected a write memory barrier has been issued
c7ffa4
+	 * before this function is called.
c7ffa4
+	 */
c7ffa4
+
c7ffa4
+	for (i = 0; i < vq->log_cache_nb_elem; i++) {
c7ffa4
+		struct log_cache_entry *elem = vq->log_cache + i;
c7ffa4
+
c7ffa4
+#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 70100)
c7ffa4
+		/*
c7ffa4
+		 * '__sync' builtins are deprecated, but '__atomic' ones
c7ffa4
+		 * are sub-optimized in older GCC versions.
c7ffa4
+		 */
c7ffa4
+		__sync_fetch_and_or(log_base + elem->offset, elem->val);
c7ffa4
+#else
c7ffa4
+		__atomic_fetch_or(log_base + elem->offset, elem->val,
c7ffa4
+				__ATOMIC_RELAXED);
c7ffa4
+#endif
c7ffa4
+	}
c7ffa4
+
c7ffa4
+	rte_smp_wmb();
c7ffa4
+
c7ffa4
+	vq->log_cache_nb_elem = 0;
c7ffa4
+}
c7ffa4
+
c7ffa4
+static __rte_always_inline void
c7ffa4
+vhost_log_cache_page(struct virtio_net *dev, struct vhost_virtqueue *vq,
c7ffa4
+			uint64_t page)
c7ffa4
+{
c7ffa4
+	uint32_t bit_nr = page % (sizeof(unsigned long) << 3);
c7ffa4
+	uint32_t offset = page / (sizeof(unsigned long) << 3);
c7ffa4
+	int i;
c7ffa4
+
c7ffa4
+	for (i = 0; i < vq->log_cache_nb_elem; i++) {
c7ffa4
+		struct log_cache_entry *elem = vq->log_cache + i;
c7ffa4
+
c7ffa4
+		if (elem->offset == offset) {
c7ffa4
+			elem->val |= (1UL << bit_nr);
c7ffa4
+			return;
c7ffa4
+		}
c7ffa4
+	}
c7ffa4
+
c7ffa4
+	if (unlikely(i >= VHOST_LOG_CACHE_NR)) {
c7ffa4
+		/*
c7ffa4
+		 * No more room for a new log cache entry,
c7ffa4
+		 * so write the dirty log map directly.
c7ffa4
+		 */
c7ffa4
+		rte_smp_wmb();
c7ffa4
+		vhost_log_page((uint8_t *)(uintptr_t)dev->log_base, page);
c7ffa4
+		return;
c7ffa4
+	}
c7ffa4
+
c7ffa4
+	vq->log_cache[i].offset = offset;
c7ffa4
+	vq->log_cache[i].val = (1UL << bit_nr);
c7ffa4
+	vq->log_cache_nb_elem++; /* else sync never flushes this entry */
c7ffa4
+}
c7ffa4
+
c7ffa4
+static __rte_always_inline void
c7ffa4
+vhost_log_cache_write(struct virtio_net *dev, struct vhost_virtqueue *vq,
c7ffa4
+			uint64_t addr, uint64_t len)
c7ffa4
+{
c7ffa4
+	uint64_t page;
c7ffa4
+
c7ffa4
+	if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) ||
c7ffa4
+		   !dev->log_base || !len))
c7ffa4
+		return;
c7ffa4
+
c7ffa4
+	if (unlikely(dev->log_size <= ((addr + len - 1) / VHOST_LOG_PAGE / 8)))
c7ffa4
+		return;
c7ffa4
+
c7ffa4
+	page = addr / VHOST_LOG_PAGE;
c7ffa4
+	while (page * VHOST_LOG_PAGE < addr + len) {
c7ffa4
+		vhost_log_cache_page(dev, vq, page);
c7ffa4
+		page += 1;
c7ffa4
+	}
c7ffa4
+}
c7ffa4
+
c7ffa4
+static __rte_always_inline void
c7ffa4
+vhost_log_cache_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq,
c7ffa4
+			uint64_t offset, uint64_t len)
c7ffa4
+{
c7ffa4
+	vhost_log_cache_write(dev, vq, vq->log_guest_addr + offset, len);
c7ffa4
+}
c7ffa4
+
c7ffa4
 static __rte_always_inline void
c7ffa4
 vhost_log_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq,
c7ffa4
 		     uint64_t offset, uint64_t len)
c7ffa4
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
c7ffa4
index a013c07b0..5f8763d3a 100644
c7ffa4
--- a/lib/librte_vhost/virtio_net.c
c7ffa4
+++ b/lib/librte_vhost/virtio_net.c
c7ffa4
@@ -107,7 +107,7 @@ do_flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
c7ffa4
 	rte_memcpy(&vq->used->ring[to],
c7ffa4
 			&vq->shadow_used_ring[from],
c7ffa4
 			size * sizeof(struct vring_used_elem));
c7ffa4
-	vhost_log_used_vring(dev, vq,
c7ffa4
+	vhost_log_cache_used_vring(dev, vq,
c7ffa4
 			offsetof(struct vring_used, ring[to]),
c7ffa4
 			size * sizeof(struct vring_used_elem));
c7ffa4
 }
c7ffa4
@@ -135,6 +135,8 @@ flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq)
c7ffa4
 
c7ffa4
 	rte_smp_wmb();
c7ffa4
 
c7ffa4
+	vhost_log_cache_sync(dev, vq);
c7ffa4
+
c7ffa4
 	*(volatile uint16_t *)&vq->used->idx += vq->shadow_used_idx;
c7ffa4
 	vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
c7ffa4
 		sizeof(vq->used->idx));
c7ffa4
@@ -159,7 +161,7 @@ do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq)
c7ffa4
 
c7ffa4
 	for (i = 0; i < count; i++) {
c7ffa4
 		rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
c7ffa4
-		vhost_log_write(dev, elem[i].log_addr, elem[i].len);
c7ffa4
+		vhost_log_cache_write(dev, vq, elem[i].log_addr, elem[i].len);
c7ffa4
 		PRINT_PACKET(dev, (uintptr_t)elem[i].dst, elem[i].len, 0);
c7ffa4
 	}
c7ffa4
 }
c7ffa4
@@ -275,7 +277,7 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
c7ffa4
 		virtio_enqueue_offload(m,
c7ffa4
 				(struct virtio_net_hdr *)(uintptr_t)desc_addr);
c7ffa4
 		PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
c7ffa4
-		vhost_log_write(dev, desc_gaddr, dev->vhost_hlen);
c7ffa4
+		vhost_log_cache_write(dev, vq, desc_gaddr, dev->vhost_hlen);
c7ffa4
 	} else {
c7ffa4
 		struct virtio_net_hdr vnet_hdr;
c7ffa4
 		uint64_t remain = dev->vhost_hlen;
c7ffa4
@@ -298,7 +300,7 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
c7ffa4
 					(void *)(uintptr_t)src, len);
c7ffa4
 
c7ffa4
 			PRINT_PACKET(dev, (uintptr_t)dst, len, 0);
c7ffa4
-			vhost_log_write(dev, guest_addr, len);
c7ffa4
+			vhost_log_cache_write(dev, vq, guest_addr, len);
c7ffa4
 			remain -= len;
c7ffa4
 			guest_addr += len;
c7ffa4
 			dst += len;
c7ffa4
@@ -379,7 +381,8 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
c7ffa4
 							desc_offset)),
c7ffa4
 				rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
c7ffa4
 				cpy_len);
c7ffa4
-			vhost_log_write(dev, desc_gaddr + desc_offset, cpy_len);
c7ffa4
+			vhost_log_cache_write(dev, vq, desc_gaddr + desc_offset,
c7ffa4
+					cpy_len);
c7ffa4
 			PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
c7ffa4
 				     cpy_len, 0);
c7ffa4
 		} else {
c7ffa4
@@ -468,7 +471,7 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
c7ffa4
 		vq->used->ring[used_idx].id = desc_indexes[i];
c7ffa4
 		vq->used->ring[used_idx].len = pkts[i]->pkt_len +
c7ffa4
 					       dev->vhost_hlen;
c7ffa4
-		vhost_log_used_vring(dev, vq,
c7ffa4
+		vhost_log_cache_used_vring(dev, vq,
c7ffa4
 			offsetof(struct vring_used, ring[used_idx]),
c7ffa4
 			sizeof(vq->used->ring[used_idx]));
c7ffa4
 	}
c7ffa4
@@ -528,6 +531,8 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
c7ffa4
 
c7ffa4
 	rte_smp_wmb();
c7ffa4
 
c7ffa4
+	vhost_log_cache_sync(dev, vq);
c7ffa4
+
c7ffa4
 	*(volatile uint16_t *)&vq->used->idx += count;
c7ffa4
 	vq->last_used_idx += count;
c7ffa4
 	vhost_log_used_vring(dev, vq,
c7ffa4
@@ -797,7 +802,8 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
c7ffa4
 
c7ffa4
 					PRINT_PACKET(dev, (uintptr_t)dst,
c7ffa4
 							len, 0);
c7ffa4
-					vhost_log_write(dev, guest_addr, len);
c7ffa4
+					vhost_log_cache_write(dev, vq,
c7ffa4
+							guest_addr, len);
c7ffa4
 
c7ffa4
 					remain -= len;
c7ffa4
 					guest_addr += len;
c7ffa4
@@ -806,7 +812,7 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
c7ffa4
 			} else {
c7ffa4
 				PRINT_PACKET(dev, (uintptr_t)hdr_addr,
c7ffa4
 						dev->vhost_hlen, 0);
c7ffa4
-				vhost_log_write(dev, hdr_phys_addr,
c7ffa4
+				vhost_log_cache_write(dev, vq, hdr_phys_addr,
c7ffa4
 						dev->vhost_hlen);
c7ffa4
 			}
c7ffa4
 
c7ffa4
@@ -820,7 +826,8 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
c7ffa4
 							desc_offset)),
c7ffa4
 				rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
c7ffa4
 				cpy_len);
c7ffa4
-			vhost_log_write(dev, desc_gaddr + desc_offset, cpy_len);
c7ffa4
+			vhost_log_cache_write(dev, vq, desc_gaddr + desc_offset,
c7ffa4
+					cpy_len);
c7ffa4
 			PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
c7ffa4
 				cpy_len, 0);
c7ffa4
 		} else {
c7ffa4
@@ -1384,7 +1391,7 @@ update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
c7ffa4
 {
c7ffa4
 	vq->used->ring[used_idx].id  = desc_idx;
c7ffa4
 	vq->used->ring[used_idx].len = 0;
c7ffa4
-	vhost_log_used_vring(dev, vq,
c7ffa4
+	vhost_log_cache_used_vring(dev, vq,
c7ffa4
 			offsetof(struct vring_used, ring[used_idx]),
c7ffa4
 			sizeof(vq->used->ring[used_idx]));
c7ffa4
 }
c7ffa4
@@ -1399,6 +1406,8 @@ update_used_idx(struct virtio_net *dev, struct vhost_virtqueue *vq,
c7ffa4
 	rte_smp_wmb();
c7ffa4
 	rte_smp_rmb();
c7ffa4
 
c7ffa4
+	vhost_log_cache_sync(dev, vq);
c7ffa4
+
c7ffa4
 	vq->used->idx += count;
c7ffa4
 	vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
c7ffa4
 			sizeof(vq->used->idx));
c7ffa4
-- 
c7ffa4
2.14.3
c7ffa4