Data Corruption bug with Samba's vfs_iouring and Linux 5.6.7/5.7rc3

Stefan Metzmacher metze at samba.org
Wed May 6 14:08:03 UTC 2020


Am 06.05.20 um 14:41 schrieb Anoop C S:
> On Wed, 2020-05-06 at 12:33 +0200, Stefan Metzmacher wrote:
>> Hi Anoop,
>>
>>> I could reproduce the difference in SHA256 checksum after copying a
>>> directory with 100 copies of test file(provided by reporter) from
>>> io_uring VFS module enabled share using Windows explorer(right-
>>> click-
>>>> copy/paste). Only 5 out of 100 files had correct checksum after
>>>> copy
>>> operation :-/
>>
>> Great! Can you please try to collect level 1 log files with
>> the patch https://bugzilla.samba.org/attachment.cgi?id=15955
>> applied?
> 
> I have attached three log files.
> log.io_uring.smbd -- Copy using Windows explorer
> log.io_uring-mget.smd -- Copy using smbclient
> log.io_uring-powershell.smd -- Copy using `Copy-Item`

Thanks! All of them show short reads like:

> [2020/05/06 17:27:28.130248,  1] ../../source3/modules/vfs_io_uring.c:103(vfs_io_uring_finish_req)
>   vfs_io_uring_finish_req: pread ofs=0 (0x0) len=32768 (0x8000) nread=32768 (0x32768) eof=10000000 (0x989680) blks=4096 blocks=19536 dir/1.bin fnum 1607026405
> [2020/05/06 17:27:28.131049,  1] ../../source3/modules/vfs_io_uring.c:103(vfs_io_uring_finish_req)
>   vfs_io_uring_finish_req: pread ofs=9969664 (0x982000) len=30336 (0x7680) nread=30336 (0x30336) eof=10000000 (0x989680) blks=4096 blocks=19536 dir/1.bin fnum 1607026405
> [2020/05/06 17:27:28.133679,  1] ../../source3/modules/vfs_io_uring.c:103(vfs_io_uring_finish_req)
>   vfs_io_uring_finish_req: pread ofs=61440 (0xf000) len=32768 (0x8000) nread=32768 (0x32768) eof=10000000 (0x989680) blks=4096 blocks=19536 dir/1.bin fnum 1607026405
> [2020/05/06 17:27:28.140184,  0] ../../source3/modules/vfs_io_uring.c:88(vfs_io_uring_finish_req)
>   vfs_io_uring_finish_req: Invalid pread ofs=0 (0x0) len=1048576 (0x100000) nread=32768 (0x32768) eof=10000000 (0x989680) blks=4096 blocks=19536 dir/1.bin fnum 1607026405

It seems the first request is at ofs=0 len=32768 (0x8000) and we get
32768 back.
A follow up request with ofs=0 len=1048576 (0x100000) only gets the
first 32768 bytes which are already in the page cache.

I can easily reproduce this with the Ubuntu 5.4 kernel once I add
state->ur.sqe.rw_flags |= RWF_NOWAIT; to vfs_io_uring_pread_send()
and use this.

echo 1 > /proc/sys/vm/drop_caches
head -c 1024 /root/samba-test/ff1.dat | hexdump -C
00000000  ff ff ff ff ff ff ff ff  ff ff ff ff ff ff ff ff
|................|
*
00000400
smbclient //172.31.9.167/uringff -Uroot%test -c "get ff1.dat"

results in this log entries:
> [2020/05/06 06:51:57.069990,  0] ../../source3/modules/vfs_io_uring.c:89(vfs_io_uring_finish_req)
>   vfs_io_uring_finish_req: Invalid pread ofs=0 (0x0) len=8388608 (0x800000) nread=16384 (0x4000) eof=8388608 (0x800000) blks=4096 blocks=16384 ff1.dat fnum 840153065
> [2020/05/06 06:51:57.076882,  1] ../../source3/modules/vfs_io_uring.c:104(vfs_io_uring_finish_req)
>   vfs_io_uring_finish_req: pread ofs=16384 (0x4000) len=8372224 (0x7fc000) nread=8372224 (0x7fc000) eof=8388608 (0x800000) blks=4096 blocks=16384 ff1.dat fnum 840153065

smbclient is just smart enough to recover itself from the short read.
But the windows client isn't.

The attached test against liburing (git://git.kernel.dk/liburing) should
be able to demonstrate the problem. It can also be found in
https://github.com/metze-samba/liburing/tree/implicit-rwf-nowaithttps://github.com/metze-samba/liburing/commit/eb06dcfde747e46bd08bedf9def2e6cb536c39e3


I added the sqe->rw_flags = RWF_NOWAIT; line in order to demonstrate it
against the Ubuntu 5.3 and 5.4 kernels. They both seem to have the bug.

Can someone run the unmodified test/implicit-rwf_nowait against
a newer kernel?

Thanks!
metze
-------------- next part --------------
From eb06dcfde747e46bd08bedf9def2e6cb536c39e3 Mon Sep 17 00:00:00 2001
From: Stefan Metzmacher <metze at samba.org>
Date: Wed, 6 May 2020 15:33:53 +0200
Subject: [PATCH] TODO!!! test/implicit-rwf_nowait.c

Once this proves the problem, we need a more detailed commit message...
---
 test/Makefile              |   4 +-
 test/implicit-rwf_nowait.c | 162 +++++++++++++++++++++++++++++++++++++
 2 files changed, 164 insertions(+), 2 deletions(-)
 create mode 100644 test/implicit-rwf_nowait.c

diff --git a/test/Makefile b/test/Makefile
index ff4d4b8ec04a..50a75588aca7 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -21,7 +21,7 @@ all_targets += poll poll-cancel ring-leak fsync io_uring_setup io_uring_register
 		file-update accept-reuse poll-v-poll fadvise madvise \
 		short-read openat2 probe shared-wq personality eventfd \
 		send_recv eventfd-ring across-fork sq-poll-kthread splice \
-		lfs-openat lfs-openat-write
+		lfs-openat lfs-openat-write implicit-rwf_nowait
 
 include ../Makefile.quiet
 
@@ -53,7 +53,7 @@ test_srcs := poll.c poll-cancel.c ring-leak.c fsync.c io_uring_setup.c \
 	file-update.c accept-reuse.c poll-v-poll.c fadvise.c \
 	madvise.c short-read.c openat2.c probe.c shared-wq.c \
 	personality.c eventfd.c eventfd-ring.c across-fork.c sq-poll-kthread.c \
-	splice.c lfs-openat.c lfs-openat-write.c
+	splice.c lfs-openat.c lfs-openat-write.c implicit-rwf-nowait.c
 
 ifdef CONFIG_HAVE_STATX
 test_srcs += statx.c
diff --git a/test/implicit-rwf_nowait.c b/test/implicit-rwf_nowait.c
new file mode 100644
index 000000000000..581df209d295
--- /dev/null
+++ b/test/implicit-rwf_nowait.c
@@ -0,0 +1,162 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Description: Implicit RWF_NOWAIT bug
+ *
+ */
+#include <errno.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+
+#include "liburing.h"
+
+#define BLOCK	4096
+
+#ifndef RWF_NOWAIT
+#define RWF_NOWAIT	8
+#endif
+
+static int get_file_fd(void)
+{
+	ssize_t ret;
+	char *buf;
+	int fd;
+
+	fd = open("testfile", O_RDWR | O_CREAT, 0644);
+	if (fd < 0) {
+		perror("open file");
+		return -1;
+	}
+
+	buf = malloc(BLOCK);
+	memset(buf, 0xff, BLOCK);
+	ret = pwrite(fd, buf, BLOCK, 0);
+	if (ret != BLOCK) {
+		if (ret < 0)
+			perror("write");
+		else
+			printf("Short write\n");
+		goto err;
+	}
+	ret = pwrite(fd, buf, BLOCK, BLOCK);
+	if (ret != BLOCK) {
+		if (ret < 0)
+			perror("write");
+		else
+			printf("Short write\n");
+		goto err;
+	}
+	fsync(fd);
+
+	if (posix_fadvise(fd, BLOCK, BLOCK, POSIX_FADV_DONTNEED)) {
+		perror("fadvise");
+err:
+		close(fd);
+		free(buf);
+		return -1;
+	}
+
+	free(buf);
+	return fd;
+}
+
+static void put_file_fd(int fd)
+{
+	close(fd);
+	unlink("testfile");
+}
+
+int main(int argc, char *argv[])
+{
+	struct io_uring ring;
+	struct io_uring_sqe *sqe;
+	struct io_uring_cqe *cqe;
+	struct iovec iov;
+	int ret, fd;
+
+	iov.iov_base = malloc(2*BLOCK);
+	iov.iov_len = BLOCK;
+
+	ret = io_uring_queue_init(1, &ring, 0);
+	if (ret) {
+		printf("ring setup failed\n");
+		return 1;
+
+	}
+
+	sqe = io_uring_get_sqe(&ring);
+	if (!sqe) {
+		printf("get sqe failed\n");
+		return 1;
+	}
+
+	fd = get_file_fd();
+	if (fd < 0)
+		return 1;
+
+	io_uring_prep_readv(sqe, fd, &iov, 1, 0);
+	io_uring_sqe_set_data(sqe, (void *)(uintptr_t)0x11111111);
+	ret = io_uring_submit(&ring);
+	if (ret != 1) {
+		printf("Got submit %d, expected 1\n", ret);
+		goto err;
+	}
+
+	ret = io_uring_wait_cqe(&ring, &cqe);
+	if (ret) {
+		printf("Ring wait got %d\n", ret);
+		goto err;
+	}
+	ret = (uintptr_t)io_uring_cqe_get_data(cqe);
+	if (ret != 0x11111111) {
+		printf("Got invalid data 0x%08x, expected 0x11111111\n", ret);
+		goto err;
+	}
+	io_uring_cq_advance(&ring, 1);
+
+	if (cqe->res != BLOCK) {
+		printf("cqe res=%d != %u\n", cqe->res, BLOCK);
+		goto err;
+	}
+
+	sqe = io_uring_get_sqe(&ring);
+	if (!sqe) {
+		printf("get sqe failed\n");
+		return 1;
+	}
+
+	iov.iov_len = 2*BLOCK;
+	io_uring_prep_readv(sqe, fd, &iov, 1, 0);
+	// Impliet by broken kernels? sqe->rw_flags = RWF_NOWAIT;
+	io_uring_sqe_set_data(sqe, (void *)(uintptr_t)0x22222222);
+	ret = io_uring_submit(&ring);
+	if (ret != 1) {
+		printf("Got submit %d, expected 1\n", ret);
+		goto err;
+	}
+
+	ret = io_uring_wait_cqe(&ring, &cqe);
+	if (ret) {
+		printf("Ring peek got %d\n", ret);
+		goto err;
+	}
+	ret = (uintptr_t)io_uring_cqe_get_data(cqe);
+	if (ret != 0x22222222) {
+		printf("Got invalid data 0x%08x, expected 0x22222222\n", ret);
+		goto err;
+	}
+	io_uring_cq_advance(&ring, 1);
+
+	if (cqe->res != 2*BLOCK) {
+		printf("cqe res=%d != %u\n", cqe->res, 2*BLOCK);
+		goto err;
+	}
+
+	put_file_fd(fd);
+	return 0;
+err:
+	put_file_fd(fd);
+	return 1;
+}
-- 
2.17.1

-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 833 bytes
Desc: OpenPGP digital signature
URL: <http://lists.samba.org/pipermail/samba-technical/attachments/20200506/7a64de39/signature.sig>


More information about the samba-technical mailing list