|
Boris Burkov |
40ef45 |
From bbd4bb4fd663da72d0a9806e53d84601d03532b2 Mon Sep 17 00:00:00 2001
|
|
Boris Burkov |
40ef45 |
Message-Id: <bbd4bb4fd663da72d0a9806e53d84601d03532b2.1668799549.git.boris@bur.io>
|
|
Boris Burkov |
40ef45 |
In-Reply-To: <cover.1668799549.git.boris@bur.io>
|
|
Boris Burkov |
40ef45 |
References: <cover.1668799549.git.boris@bur.io>
|
|
Boris Burkov |
40ef45 |
From: Filipe Manana <fdmanana@suse.com>
|
|
Boris Burkov |
40ef45 |
Date: Tue, 15 Nov 2022 16:25:26 +0000
|
|
Boris Burkov |
40ef45 |
Subject: [PATCH 3/3] btrfs-progs: receive: fix silent data loss after fall
|
|
Boris Burkov |
40ef45 |
back from encoded write
|
|
Boris Burkov |
40ef45 |
|
|
Boris Burkov |
40ef45 |
When attempting an encoded write, if it fails for some specific reason
|
|
Boris Burkov |
40ef45 |
like -EINVAL (when an offset is not sector size aligned) or -ENOSPC, we
|
|
Boris Burkov |
40ef45 |
then fallback into decompressing the data and writing it using regular
|
|
Boris Burkov |
40ef45 |
buffered IO. This logic however is not correct, one of the reasons is
|
|
Boris Burkov |
40ef45 |
that it assumes the encoded offset is smaller than the unencoded file
|
|
Boris Burkov |
40ef45 |
length and that they can be compared, but one is an offset and the other
|
|
Boris Burkov |
40ef45 |
is a length, not an end offset, so they can't be compared to get correct
|
|
Boris Burkov |
40ef45 |
results. This bad logic will often result in not copying all data, or even
|
|
Boris Burkov |
40ef45 |
no data at all, resulting in a silent data loss. This is easily seen in
|
|
Boris Burkov |
40ef45 |
with the following reproducer:
|
|
Boris Burkov |
40ef45 |
|
|
Boris Burkov |
40ef45 |
$ cat test.sh
|
|
Boris Burkov |
40ef45 |
#!/bin/bash
|
|
Boris Burkov |
40ef45 |
|
|
Boris Burkov |
40ef45 |
DEV=/dev/sdj
|
|
Boris Burkov |
40ef45 |
MNT=/mnt/sdj
|
|
Boris Burkov |
40ef45 |
|
|
Boris Burkov |
40ef45 |
umount $DEV &> /dev/null
|
|
Boris Burkov |
40ef45 |
mkfs.btrfs -f $DEV > /dev/null
|
|
Boris Burkov |
40ef45 |
mount -o compress $DEV $MNT
|
|
Boris Burkov |
40ef45 |
|
|
Boris Burkov |
40ef45 |
# File foo has a size of 33K, not aligned to the sector size.
|
|
Boris Burkov |
40ef45 |
xfs_io -f -c "pwrite -S 0xab 0 33K" $MNT/foo
|
|
Boris Burkov |
40ef45 |
|
|
Boris Burkov |
40ef45 |
xfs_io -f -c "pwrite -S 0xcd 0 64K" $MNT/bar
|
|
Boris Burkov |
40ef45 |
|
|
Boris Burkov |
40ef45 |
# Now clone the first 32K of file bar into foo at offset 0.
|
|
Boris Burkov |
40ef45 |
xfs_io -c "reflink $MNT/bar 0 0 32K" $MNT/foo
|
|
Boris Burkov |
40ef45 |
|
|
Boris Burkov |
40ef45 |
# Snapshot the default subvolume and create a full send stream (v2).
|
|
Boris Burkov |
40ef45 |
btrfs subvolume snapshot -r $MNT $MNT/snap
|
|
Boris Burkov |
40ef45 |
|
|
Boris Burkov |
40ef45 |
btrfs send --compressed-data -f /tmp/test.send $MNT/snap
|
|
Boris Burkov |
40ef45 |
|
|
Boris Burkov |
40ef45 |
echo -e "\nFile bar in the original filesystem:"
|
|
Boris Burkov |
40ef45 |
od -A d -t x1 $MNT/snap/bar
|
|
Boris Burkov |
40ef45 |
|
|
Boris Burkov |
40ef45 |
umount $MNT
|
|
Boris Burkov |
40ef45 |
mkfs.btrfs -f $DEV > /dev/null
|
|
Boris Burkov |
40ef45 |
mount $DEV $MNT
|
|
Boris Burkov |
40ef45 |
|
|
Boris Burkov |
40ef45 |
echo -e "\nReceiving stream in a new filesystem..."
|
|
Boris Burkov |
40ef45 |
btrfs receive -f /tmp/test.send $MNT
|
|
Boris Burkov |
40ef45 |
|
|
Boris Burkov |
40ef45 |
echo -e "\nFile bar in the new filesystem:"
|
|
Boris Burkov |
40ef45 |
od -A d -t x1 $MNT/snap/bar
|
|
Boris Burkov |
40ef45 |
|
|
Boris Burkov |
40ef45 |
umount $MNT
|
|
Boris Burkov |
40ef45 |
|
|
Boris Burkov |
40ef45 |
Running the test without this patch:
|
|
Boris Burkov |
40ef45 |
|
|
Boris Burkov |
40ef45 |
$ ./test.sh
|
|
Boris Burkov |
40ef45 |
(...)
|
|
Boris Burkov |
40ef45 |
File bar in the original filesystem:
|
|
Boris Burkov |
40ef45 |
0000000 cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd
|
|
Boris Burkov |
40ef45 |
*
|
|
Boris Burkov |
40ef45 |
0065536
|
|
Boris Burkov |
40ef45 |
|
|
Boris Burkov |
40ef45 |
Receiving stream in a new filesystem...
|
|
Boris Burkov |
40ef45 |
At subvol snap
|
|
Boris Burkov |
40ef45 |
|
|
Boris Burkov |
40ef45 |
File bar in the new filesystem:
|
|
Boris Burkov |
40ef45 |
0000000 cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd
|
|
Boris Burkov |
40ef45 |
*
|
|
Boris Burkov |
40ef45 |
0033792
|
|
Boris Burkov |
40ef45 |
|
|
Boris Burkov |
40ef45 |
We end up with file bar having less data, and a smaller size, than in the
|
|
Boris Burkov |
40ef45 |
original filesystem.
|
|
Boris Burkov |
40ef45 |
|
|
Boris Burkov |
40ef45 |
This happens because when processing file bar, send issues the following
|
|
Boris Burkov |
40ef45 |
commands:
|
|
Boris Burkov |
40ef45 |
|
|
Boris Burkov |
40ef45 |
clone bar - source=foo source offset=0 offset=0 length=32768
|
|
Boris Burkov |
40ef45 |
write bar - offset=32768 length=1024
|
|
Boris Burkov |
40ef45 |
encoded_write bar - offset=33792, len=4096, unencoded_offset=33792, unencoded_file_len=31744, unencoded_len=65536, compression=1, encryption=0
|
|
Boris Burkov |
40ef45 |
|
|
Boris Burkov |
40ef45 |
The first 32K are cloned from file foo, as that file ranged is shared
|
|
Boris Burkov |
40ef45 |
between the files.
|
|
Boris Burkov |
40ef45 |
|
|
Boris Burkov |
40ef45 |
Then there's a regular write operation for the file range [32K, 33K[,
|
|
Boris Burkov |
40ef45 |
since file foo has different data from bar for that file range.
|
|
Boris Burkov |
40ef45 |
|
|
Boris Burkov |
40ef45 |
Finally for the remainder of file bar, the send side issues an encoded
|
|
Boris Burkov |
40ef45 |
write since the extent is compressed in the source filesystem, for the
|
|
Boris Burkov |
40ef45 |
file offset 33792 (33K), remaining 31K of data. The receiver will try the
|
|
Boris Burkov |
40ef45 |
encoded write, but that fails with -EINVAL since the offset 33K is not
|
|
Boris Burkov |
40ef45 |
sector size aligned, so it will fallback to decompressing the data and
|
|
Boris Burkov |
40ef45 |
writing it using regular buffered writes. However that results in doing
|
|
Boris Burkov |
40ef45 |
no writes at decompress_and_write() because 'pos' is initialized to the
|
|
Boris Burkov |
40ef45 |
value of 33K (unencoded_offset) and unencoded_file_len is 31K, so the
|
|
Boris Burkov |
40ef45 |
while loop has no iterations.
|
|
Boris Burkov |
40ef45 |
|
|
Boris Burkov |
40ef45 |
Another case where we can fallback to decompression plus regular buffered
|
|
Boris Burkov |
40ef45 |
writes is when the destination filesystem has a sector size larger then
|
|
Boris Burkov |
40ef45 |
the sector size of the source filesystem (for example when the source
|
|
Boris Burkov |
40ef45 |
filesystem is on x86_64 with a 4K sector size and the destination
|
|
Boris Burkov |
40ef45 |
filesystem is PowerPC with a 64K sector size). In that scenario encoded
|
|
Boris Burkov |
40ef45 |
write attempts wil fail with -EINVAL due to offsets not being aligned with
|
|
Boris Burkov |
40ef45 |
the sector size of the destination filesystem, and the receive will
|
|
Boris Burkov |
40ef45 |
attempt the fallback of decompressing the buffer and writing the
|
|
Boris Burkov |
40ef45 |
decompressed using regular buffered IO.
|
|
Boris Burkov |
40ef45 |
|
|
Boris Burkov |
40ef45 |
Fix this by tracking the number of written bytes instead, and increment
|
|
Boris Burkov |
40ef45 |
it, and the unencoded offset, after each write.
|
|
Boris Burkov |
40ef45 |
|
|
Boris Burkov |
40ef45 |
Fixes: d20e759fc917 ("btrfs-progs: receive: encoded_write fallback to explicit decode and write")
|
|
Boris Burkov |
40ef45 |
Signed-off-by: Filipe Manana <fdmanana@suse.com>
|
|
Boris Burkov |
40ef45 |
Reviewed-by: Boris Burkov <boris@bur.io>
|
|
Boris Burkov |
40ef45 |
---
|
|
Boris Burkov |
40ef45 |
cmds/receive.c | 15 ++++++++-------
|
|
Boris Burkov |
40ef45 |
1 file changed, 8 insertions(+), 7 deletions(-)
|
|
Boris Burkov |
40ef45 |
|
|
Boris Burkov |
40ef45 |
diff --git a/cmds/receive.c b/cmds/receive.c
|
|
Boris Burkov |
40ef45 |
index 9f73b072..7756b42c 100644
|
|
Boris Burkov |
40ef45 |
--- a/cmds/receive.c
|
|
Boris Burkov |
40ef45 |
+++ b/cmds/receive.c
|
|
Boris Burkov |
40ef45 |
@@ -1155,10 +1155,9 @@ static int decompress_and_write(struct btrfs_receive *rctx,
|
|
Boris Burkov |
40ef45 |
u32 compression)
|
|
Boris Burkov |
40ef45 |
{
|
|
Boris Burkov |
40ef45 |
int ret = 0;
|
|
Boris Burkov |
40ef45 |
- size_t pos;
|
|
Boris Burkov |
40ef45 |
- ssize_t w;
|
|
Boris Burkov |
40ef45 |
char *unencoded_data;
|
|
Boris Burkov |
ea6bc9 |
int sector_shift;
|
|
Boris Burkov |
40ef45 |
+ u64 written = 0;
|
|
Boris Burkov |
40ef45 |
|
|
Boris Burkov |
40ef45 |
unencoded_data = calloc(unencoded_len, 1);
|
|
Boris Burkov |
40ef45 |
if (!unencoded_data) {
|
|
Boris Burkov |
40ef45 |
@@ -1209,17 +1208,19 @@ static int decompress_and_write(struct btrfs_receive *rctx,
|
|
Boris Burkov |
40ef45 |
goto out;
|
|
Boris Burkov |
40ef45 |
}
|
|
Boris Burkov |
40ef45 |
|
|
Boris Burkov |
40ef45 |
- pos = unencoded_offset;
|
|
Boris Burkov |
40ef45 |
- while (pos < unencoded_file_len) {
|
|
Boris Burkov |
40ef45 |
- w = pwrite(rctx->write_fd, unencoded_data + pos,
|
|
Boris Burkov |
40ef45 |
- unencoded_file_len - pos, offset);
|
|
Boris Burkov |
40ef45 |
+ while (written < unencoded_file_len) {
|
|
Boris Burkov |
40ef45 |
+ ssize_t w;
|
|
Boris Burkov |
40ef45 |
+
|
|
Boris Burkov |
40ef45 |
+ w = pwrite(rctx->write_fd, unencoded_data + unencoded_offset,
|
|
Boris Burkov |
40ef45 |
+ unencoded_file_len - written, offset);
|
|
Boris Burkov |
40ef45 |
if (w < 0) {
|
|
Boris Burkov |
40ef45 |
ret = -errno;
|
|
Boris Burkov |
40ef45 |
error("writing unencoded data failed: %m");
|
|
Boris Burkov |
40ef45 |
goto out;
|
|
Boris Burkov |
40ef45 |
}
|
|
Boris Burkov |
40ef45 |
- pos += w;
|
|
Boris Burkov |
40ef45 |
+ written += w;
|
|
Boris Burkov |
40ef45 |
offset += w;
|
|
Boris Burkov |
40ef45 |
+ unencoded_offset += w;
|
|
Boris Burkov |
40ef45 |
}
|
|
Boris Burkov |
40ef45 |
out:
|
|
Boris Burkov |
40ef45 |
free(unencoded_data);
|
|
Boris Burkov |
40ef45 |
--
|
|
Boris Burkov |
40ef45 |
2.38.1
|
|
Boris Burkov |
40ef45 |
|