Data Corruption bug with Samba's vfs_iouring and Linux 5.6.7/5.7rc3

Jeremy Allison jra at samba.org
Fri May 8 06:27:25 UTC 2020


On Fri, May 08, 2020 at 10:39:38AM +0530, Anoop C S wrote:
> 
> Your patch fixes mismatch in SHA256 checksum of 100 bin files copied
> using Windows explorer in my environment.
> 
> > If so I'll fix it up to be production-ready,
> > (for example I think I can get rid of the
> > immediate event useage) fix the pwrite case
> > and then cut it onto bite-sized reviewable chunks.

OK, here's the "production ready" version.
Gets rid of the crappy immediate event.

Still doesn't fix the io_uring pwrite issue
but if can confirm that this version also fixes
the problem (and it's a much cleaner patchset :-)
then I'll add the pwrite fixes next and I'll
try running in CI.

Then we should have a fix for io_uring in 4.12 !

Cheers,

Jeremy.
-------------- next part --------------
From f3c7199d6f7513c549e6acfaef2e49e555089ae9 Mon Sep 17 00:00:00 2001
From: Jeremy Allison <jra at samba.org>
Date: Thu, 7 May 2020 22:39:13 -0700
Subject: [PATCH 1/7] s3: VFS: io_uring. Add opcode field to struct
 vfs_io_uring_request.

Fill it in in the callers. The callback is going to need to
know what kind of operation just completed in order to cope
with short IO read/writes.

BUG: https://bugzilla.samba.org/show_bug.cgi?id=14361

Signed-off-by: Jeremy Allison <jra at samba.org>
---
 source3/modules/vfs_io_uring.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/source3/modules/vfs_io_uring.c b/source3/modules/vfs_io_uring.c
index 378e48d112f..124704fc7bf 100644
--- a/source3/modules/vfs_io_uring.c
+++ b/source3/modules/vfs_io_uring.c
@@ -37,10 +37,17 @@ struct vfs_io_uring_config {
 	struct vfs_io_uring_request *pending;
 };
 
+enum vfs_io_uring_opcode {
+	SMB_VFS_IO_URING_PREAD,
+	SMB_VFS_IO_URING_PWRITE,
+	SMB_VFS_IO_URING_FSYNC
+};
+
 struct vfs_io_uring_request {
 	struct vfs_io_uring_request *prev, *next;
 	struct vfs_io_uring_request **list_head;
 	struct vfs_io_uring_config *config;
+	enum vfs_io_uring_opcode opcode;
 	struct tevent_req *req;
 	void *state;
 	struct io_uring_sqe sqe;
@@ -319,6 +326,7 @@ static struct tevent_req *vfs_io_uring_pread_send(struct vfs_handle_struct *hand
 	state->ur.config = config;
 	state->ur.req = req;
 	state->ur.state = state;
+	state->ur.opcode = SMB_VFS_IO_URING_PREAD;
 
 	SMBPROFILE_BYTES_ASYNC_START(syscall_asys_pread, profile_p,
 				     state->ur.profile_bytes, n);
@@ -399,6 +407,7 @@ static struct tevent_req *vfs_io_uring_pwrite_send(struct vfs_handle_struct *han
 	state->ur.config = config;
 	state->ur.req = req;
 	state->ur.state = state;
+	state->ur.opcode = SMB_VFS_IO_URING_PWRITE;
 
 	SMBPROFILE_BYTES_ASYNC_START(syscall_asys_pwrite, profile_p,
 				     state->ur.profile_bytes, n);
@@ -476,6 +485,7 @@ static struct tevent_req *vfs_io_uring_fsync_send(struct vfs_handle_struct *hand
 	state->ur.config = config;
 	state->ur.req = req;
 	state->ur.state = state;
+	state->ur.opcode = SMB_VFS_IO_URING_FSYNC;
 
 	SMBPROFILE_BYTES_ASYNC_START(syscall_asys_fsync, profile_p,
 				     state->ur.profile_bytes, 0);
-- 
2.20.1


From 4503a5fb6dae2065e27596dd725460953bbc8038 Mon Sep 17 00:00:00 2001
From: Jeremy Allison <jra at samba.org>
Date: Thu, 7 May 2020 22:42:57 -0700
Subject: [PATCH 2/7] s3: VFS: Rename vfs_io_uring_queue_run() ->
 _vfs_io_uring_queue_run().

Call from a wrapper function. We'll need this as
eventually we'll need to call this in a loop for
short IO retries.

BUG: https://bugzilla.samba.org/show_bug.cgi?id=14361

Signed-off-by: Jeremy Allison <jra at samba.org>
---
 source3/modules/vfs_io_uring.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/source3/modules/vfs_io_uring.c b/source3/modules/vfs_io_uring.c
index 124704fc7bf..047b534b384 100644
--- a/source3/modules/vfs_io_uring.c
+++ b/source3/modules/vfs_io_uring.c
@@ -226,7 +226,7 @@ static int vfs_io_uring_connect(vfs_handle_struct *handle, const char *service,
 	return 0;
 }
 
-static void vfs_io_uring_queue_run(struct vfs_io_uring_config *config)
+static void _vfs_io_uring_queue_run(struct vfs_io_uring_config *config)
 {
 	struct vfs_io_uring_request *cur = NULL, *next = NULL;
 	struct io_uring_cqe *cqe = NULL;
@@ -283,6 +283,11 @@ static void vfs_io_uring_queue_run(struct vfs_io_uring_config *config)
 	io_uring_cq_advance(&config->uring, nr);
 }
 
+static void vfs_io_uring_queue_run(struct vfs_io_uring_config *config)
+{
+	_vfs_io_uring_queue_run(config);
+}
+
 static void vfs_io_uring_fd_handler(struct tevent_context *ev,
 				    struct tevent_fd *fde,
 				    uint16_t flags,
-- 
2.20.1


From 0af4ecfa3d49987c3626ea9cab3ec70135e873f4 Mon Sep 17 00:00:00 2001
From: Jeremy Allison <jra at samba.org>
Date: Thu, 7 May 2020 22:46:11 -0700
Subject: [PATCH 3/7] s3: VFS: io_uring. Change _vfs_io_uring_queue_run()
 funtion to return a bool.

If true, it means retry the submit calls. Currently
always returns false.

BUG: https://bugzilla.samba.org/show_bug.cgi?id=14361

Signed-off-by: Jeremy Allison <jra at samba.org>
---
 source3/modules/vfs_io_uring.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/source3/modules/vfs_io_uring.c b/source3/modules/vfs_io_uring.c
index 047b534b384..d1a5af6c7a7 100644
--- a/source3/modules/vfs_io_uring.c
+++ b/source3/modules/vfs_io_uring.c
@@ -226,7 +226,7 @@ static int vfs_io_uring_connect(vfs_handle_struct *handle, const char *service,
 	return 0;
 }
 
-static void _vfs_io_uring_queue_run(struct vfs_io_uring_config *config)
+static bool _vfs_io_uring_queue_run(struct vfs_io_uring_config *config)
 {
 	struct vfs_io_uring_request *cur = NULL, *next = NULL;
 	struct io_uring_cqe *cqe = NULL;
@@ -235,12 +235,13 @@ static void _vfs_io_uring_queue_run(struct vfs_io_uring_config *config)
 	struct timespec start_time;
 	struct timespec end_time;
 	int ret;
+	bool need_retry = false;
 
 	PROFILE_TIMESTAMP(&start_time);
 
 	if (config->uring.ring_fd == -1) {
 		vfs_io_uring_config_destroy(config, -ESTALE, __location__);
-		return;
+		return false;
 	}
 
 	for (cur = config->queue; cur != NULL; cur = next) {
@@ -269,7 +270,7 @@ static void _vfs_io_uring_queue_run(struct vfs_io_uring_config *config)
 		/* We just retry later */
 	} else if (ret < 0) {
 		vfs_io_uring_config_destroy(config, ret, __location__);
-		return;
+		return false;
 	}
 
 	PROFILE_TIMESTAMP(&end_time);
@@ -281,11 +282,16 @@ static void _vfs_io_uring_queue_run(struct vfs_io_uring_config *config)
 	}
 
 	io_uring_cq_advance(&config->uring, nr);
+	return need_retry;
 }
 
 static void vfs_io_uring_queue_run(struct vfs_io_uring_config *config)
 {
-	_vfs_io_uring_queue_run(config);
+	bool need_retry;
+
+	do {
+		need_retry = _vfs_io_uring_queue_run(config);
+	} while (need_retry);
 }
 
 static void vfs_io_uring_fd_handler(struct tevent_context *ev,
-- 
2.20.1


From 8a4eebb653279a8458a95cf2d99f7262203c1ffa Mon Sep 17 00:00:00 2001
From: Jeremy Allison <jra at samba.org>
Date: Thu, 7 May 2020 22:50:07 -0700
Subject: [PATCH 4/7] s3: VFS: Extend struct vfs_io_uring_pread_state to store
 the original request values.

We'll need these later when we decide if a read was short or not.

BUG: https://bugzilla.samba.org/show_bug.cgi?id=14361

Signed-off-by: Jeremy Allison <jra at samba.org>
---
 source3/modules/vfs_io_uring.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/source3/modules/vfs_io_uring.c b/source3/modules/vfs_io_uring.c
index d1a5af6c7a7..18dee5e59d5 100644
--- a/source3/modules/vfs_io_uring.c
+++ b/source3/modules/vfs_io_uring.c
@@ -312,6 +312,12 @@ static void vfs_io_uring_fd_handler(struct tevent_context *ev,
 struct vfs_io_uring_pread_state {
 	struct vfs_io_uring_request ur;
 	struct iovec iov;
+	/* We have to remember the original values in case of short read. */
+	struct files_struct *fsp;
+	void *data;
+	size_t total_toread;
+	ssize_t total_read;
+	off_t offset;
 };
 
 static struct tevent_req *vfs_io_uring_pread_send(struct vfs_handle_struct *handle,
@@ -338,6 +344,10 @@ static struct tevent_req *vfs_io_uring_pread_send(struct vfs_handle_struct *hand
 	state->ur.req = req;
 	state->ur.state = state;
 	state->ur.opcode = SMB_VFS_IO_URING_PREAD;
+	state->fsp = fsp;
+	state->data = data;
+	state->total_toread = n;
+	state->offset = offset;
 
 	SMBPROFILE_BYTES_ASYNC_START(syscall_asys_pread, profile_p,
 				     state->ur.profile_bytes, n);
-- 
2.20.1


From dc14398334d7db8c9e88db49a9600ea9aa4f1422 Mon Sep 17 00:00:00 2001
From: Jeremy Allison <jra at samba.org>
Date: Thu, 7 May 2020 23:00:42 -0700
Subject: [PATCH 5/7] s3: VFS: io_uring. Add is_io_uring_pread_complete()
 funtion to check for short reads.

Currently commented out, not called.

BUG: https://bugzilla.samba.org/show_bug.cgi?id=14361

Signed-off-by: Jeremy Allison <jra at samba.org>
---
 source3/modules/vfs_io_uring.c | 56 ++++++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)

diff --git a/source3/modules/vfs_io_uring.c b/source3/modules/vfs_io_uring.c
index 18dee5e59d5..0afd8c0ed60 100644
--- a/source3/modules/vfs_io_uring.c
+++ b/source3/modules/vfs_io_uring.c
@@ -226,6 +226,25 @@ static int vfs_io_uring_connect(vfs_handle_struct *handle, const char *service,
 	return 0;
 }
 
+#if 0
+static bool is_io_uring_pread_complete(struct vfs_io_uring_request *cur,
+				const struct io_uring_cqe *cqe);
+
+/*
+ * Return true if it was a pread and the bytes transferred
+ * are less than requested.
+ */
+
+static bool vfs_io_uring_complete(struct vfs_io_uring_request *cur,
+				const struct io_uring_cqe *cqe)
+{
+	if (cur->opcode == SMB_VFS_IO_URING_PREAD) {
+		return is_io_uring_pread_complete(cur, cqe);
+	}
+	return true;
+}
+#endif
+
 static bool _vfs_io_uring_queue_run(struct vfs_io_uring_config *config)
 {
 	struct vfs_io_uring_request *cur = NULL, *next = NULL;
@@ -400,6 +419,43 @@ static ssize_t vfs_io_uring_pread_recv(struct tevent_req *req,
 	return ret;
 }
 
+#if 0
+/* Returns false if more to read. Updates the total_read count. */
+
+static bool is_io_uring_pread_complete(struct vfs_io_uring_request *cur,
+				const struct io_uring_cqe *cqe)
+{
+	struct tevent_req *req = talloc_get_type_abort(cur->req,
+				struct tevent_req);
+	struct vfs_io_uring_pread_state *state = tevent_req_data(
+				req, struct vfs_io_uring_pread_state);
+
+	if (cqe->res < 0) {
+		/* Error. Deal with it as normal. */
+		return true;
+	}
+
+	if (cqe->res == 0) {
+		/* EOF. Deal with it as normal. */
+		return true;
+	}
+
+	state->total_read += cqe->res;
+
+	if (state->total_read < state->total_toread ) {
+		/* More needed. */
+		DBG_DEBUG("file %s short read, wanted 0x%"PRIx64" at "
+			"offset 0x%"PRIx64" got 0x%"PRIx64"\n",
+			fsp_str_dbg(state->fsp),
+			(uint64_t)state->total_toread,
+			(uint64_t)state->offset,
+			(uint64_t)state->total_read);
+		return false;
+	}
+	return true;
+}
+#endif
+
 struct vfs_io_uring_pwrite_state {
 	struct vfs_io_uring_request ur;
 	struct iovec iov;
-- 
2.20.1


From cb869ac6da3c00727559cee93942015b65bd7ba9 Mon Sep 17 00:00:00 2001
From: Jeremy Allison <jra at samba.org>
Date: Thu, 7 May 2020 23:07:47 -0700
Subject: [PATCH 6/7] s3: VFS: io_uring. Add vfs_io_uring_pread_op_reschedule()

Reschedules short IO to completion. Not yet called
so commented out.

BUG: https://bugzilla.samba.org/show_bug.cgi?id=14361

Signed-off-by: Jeremy Allison <jra at samba.org>
---
 source3/modules/vfs_io_uring.c | 45 ++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/source3/modules/vfs_io_uring.c b/source3/modules/vfs_io_uring.c
index 0afd8c0ed60..87a8f838247 100644
--- a/source3/modules/vfs_io_uring.c
+++ b/source3/modules/vfs_io_uring.c
@@ -243,6 +243,17 @@ static bool vfs_io_uring_complete(struct vfs_io_uring_request *cur,
 	}
 	return true;
 }
+
+static void vfs_io_uring_pread_op_reschedule(struct vfs_io_uring_config *config,
+				struct vfs_io_uring_request *cur);
+
+static void vfs_io_uring_op_reschedule(struct vfs_io_uring_config *config,
+				struct vfs_io_uring_request *cur)
+{
+	if (cur->opcode == SMB_VFS_IO_URING_PREAD) {
+		vfs_io_uring_pread_op_reschedule(config, cur);
+	}
+}
 #endif
 
 static bool _vfs_io_uring_queue_run(struct vfs_io_uring_config *config)
@@ -454,6 +465,40 @@ static bool is_io_uring_pread_complete(struct vfs_io_uring_request *cur,
 	}
 	return true;
 }
+
+/*
+ * Reschedule short read. Adjust the buffer, length
+ * and offsets and resubmit.
+ */
+
+static void vfs_io_uring_pread_op_reschedule(struct vfs_io_uring_config *config,
+				struct vfs_io_uring_request *cur)
+{
+	struct tevent_req *req = talloc_get_type_abort(cur->req,
+				struct tevent_req);
+	struct vfs_io_uring_pread_state *state = tevent_req_data(
+				req, struct vfs_io_uring_pread_state);
+	off_t offset;
+	uint8_t *new_base = (uint8_t *)state->data + state->total_read;
+
+	/* Set up the parameters for the remaining IO. */
+	state->iov.iov_base = (void *)new_base;
+	state->iov.iov_len = state->total_toread - state->total_read;
+	offset = state->offset + state->total_read;
+
+	DBG_DEBUG("reschedule read on file %s. Get 0x%"PRIx64" at "
+			"offset 0x%"PRIx64"\n",
+			fsp_str_dbg(state->fsp),
+			(uint64_t)state->iov.iov_len,
+			(uint64_t)offset);
+
+	io_uring_prep_readv(&cur->sqe,
+			state->fsp->fh->fd,
+			&state->iov,
+			1,
+			offset);
+	io_uring_sqe_set_data(&cur->sqe, cur);
+}
 #endif
 
 struct vfs_io_uring_pwrite_state {
-- 
2.20.1


From 6f58fd4d442bf043f83dcc0c160a1943aea31743 Mon Sep 17 00:00:00 2001
From: Jeremy Allison <jra at samba.org>
Date: Thu, 7 May 2020 23:12:53 -0700
Subject: [PATCH 7/7] s3: VFS: io_uring. On cqe completion, check for short
 reads.

If any occurred, call the resheduling functions and cause
_vfs_io_uring_queue_run() to be retried.

BUG: https://bugzilla.samba.org/show_bug.cgi?id=14361

Signed-off-by: Jeremy Allison <jra at samba.org>
---
 source3/modules/vfs_io_uring.c | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/source3/modules/vfs_io_uring.c b/source3/modules/vfs_io_uring.c
index 87a8f838247..80ef1ac0161 100644
--- a/source3/modules/vfs_io_uring.c
+++ b/source3/modules/vfs_io_uring.c
@@ -226,7 +226,6 @@ static int vfs_io_uring_connect(vfs_handle_struct *handle, const char *service,
 	return 0;
 }
 
-#if 0
 static bool is_io_uring_pread_complete(struct vfs_io_uring_request *cur,
 				const struct io_uring_cqe *cqe);
 
@@ -254,7 +253,6 @@ static void vfs_io_uring_op_reschedule(struct vfs_io_uring_config *config,
 		vfs_io_uring_pread_op_reschedule(config, cur);
 	}
 }
-#endif
 
 static bool _vfs_io_uring_queue_run(struct vfs_io_uring_config *config)
 {
@@ -307,7 +305,24 @@ static bool _vfs_io_uring_queue_run(struct vfs_io_uring_config *config)
 
 	io_uring_for_each_cqe(&config->uring, cqhead, cqe) {
 		cur = (struct vfs_io_uring_request *)io_uring_cqe_get_data(cqe);
-		vfs_io_uring_finish_req(cur, cqe, end_time, __location__);
+		if (vfs_io_uring_complete(cur, cqe)) {
+			/* Full IO completed. We're done. */
+			vfs_io_uring_finish_req(cur, cqe, end_time, __location__);
+		} else {
+			/* Short read or write. We must reschedule. */
+			/* Take us off the pending list. */
+			DLIST_REMOVE(config->pending, cur);
+			cur->list_head = NULL;
+
+			vfs_io_uring_op_reschedule(config, cur);
+
+			/* Put us back on the queued list. */
+			DLIST_ADD_END(config->queue, cur);
+			cur->list_head = &config->queue;
+
+			/* Cause the caller to call us again. */
+			need_retry = true;
+		}
 		nr++;
 	}
 
@@ -423,14 +438,13 @@ static ssize_t vfs_io_uring_pread_recv(struct tevent_req *req,
 		ret = -1;
 	} else {
 		vfs_aio_state->error = 0;
-		ret = state->ur.cqe.res;
+		ret = state->total_read;
 	}
 
 	tevent_req_received(req);
 	return ret;
 }
 
-#if 0
 /* Returns false if more to read. Updates the total_read count. */
 
 static bool is_io_uring_pread_complete(struct vfs_io_uring_request *cur,
@@ -499,7 +513,6 @@ static void vfs_io_uring_pread_op_reschedule(struct vfs_io_uring_config *config,
 			offset);
 	io_uring_sqe_set_data(&cur->sqe, cur);
 }
-#endif
 
 struct vfs_io_uring_pwrite_state {
 	struct vfs_io_uring_request ur;
-- 
2.20.1



More information about the samba-technical mailing list