Problems replacing epoll with io_uring in tevent

Stefan Metzmacher metze at samba.org
Thu Oct 27 19:25:29 UTC 2022


Hi Jens,

>>> I'm currently trying to prototype for an IORING_POLL_CANCEL_ON_CLOSE
>>> flag that can be passed to POLL_ADD. With that we'll register
>>> the request in &req->file->f_uring_poll (similar to the file->f_ep list for epoll)
>>> Then we only get a real reference to the file during the call to
>>> vfs_poll() otherwise we drop the fget/fput reference and rely on
>>> an io_uring_poll_release_file() (similar to eventpoll_release_file())
>>> to cancel our registered poll request.
>>
>> Yes, this is a bit tricky as we hold the file ref across the operation. I'd
>> be interested in seeing your approach to this, and also how it would
>> interact with registered files...
> 
> Here's my current patch:
> https://git.samba.org/?p=metze/linux/wip.git;a=commitdiff;h=b9cccfac515739fc279c6eec87ce655a96f94685
> It compiles, but I haven't tested it yet. And I'm not sure if the locking is done correctly...

It doesn't deadlock or blow up immediately :-)
And it does fix the problem I had.

So what do you think about that patch?
Am I doing stupid things there?

These points might still change:
- Returning -EBADF instead of -ECANCELED
   might be better and would allow the caller
   to avoid retrying.
- I guess we could use a singly linked list, but
   I'm mostly used to how struct list_head works,
   and I want something that works first.
- We may find a better name than IORING_POLL_CANCEL_ON_CLOSE
- struct io_poll is completely full, and so is io_kiocb->flags
   (maybe we should widen flags to 64 bit?),
   so we need to use some other generic struct io_kiocb space.
   That might also be good, because it makes it possible to keep
   io_poll_add() and io_arm_poll_handler() using common code.
   But we may want to place the new field a bit differently. Note that
   struct io_kiocb (without this patch) still has 32 free bytes before
   four 64-byte cachelines are filled; with my patch 24 bytes are left...
- In struct file it might be possible to share a reference list
   with the epoll code, where each element indicates whether epoll
   or io_uring is using it.
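
For reference, here's a rough userspace sketch of how a caller like
tevent could arm a poll with the new flag via liburing. It's not part
of the patch and the helper names are made up for illustration; it only
assumes the IORING_POLL_CANCEL_ON_CLOSE value from the patch below and
a kernel with the patch applied (the #ifndef is a fallback for liburing
headers that don't know the flag yet):

#include <errno.h>
#include <poll.h>
#include <stdint.h>
#include <stdio.h>
#include <liburing.h>

#ifndef IORING_POLL_CANCEL_ON_CLOSE
#define IORING_POLL_CANCEL_ON_CLOSE	(1U << 4)
#endif

static int poll_add_cancel_on_close(struct io_uring *ring, int fd)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	if (!sqe)
		return -EBUSY;

	io_uring_prep_poll_add(sqe, fd, POLLIN);
	/* poll flags go into sqe->len, like IORING_POLL_ADD_MULTI does */
	sqe->len |= IORING_POLL_CANCEL_ON_CLOSE;
	io_uring_sqe_set_data(sqe, (void *)(uintptr_t)fd);

	return io_uring_submit(ring);
}

static void wait_for_poll_result(struct io_uring *ring)
{
	struct io_uring_cqe *cqe;

	if (io_uring_wait_cqe(ring, &cqe) < 0)
		return;

	/*
	 * If the last reference to the watched file was closed in the
	 * meantime, the request is cancelled and we get a terminal CQE
	 * here instead of the poll staying armed forever.
	 */
	if (cqe->res == -ECANCELED)
		fprintf(stderr, "poll on fd %d was cancelled by close()\n",
			(int)(uintptr_t)io_uring_cqe_get_data(cqe));

	io_uring_cqe_seen(ring, cqe);
}

The point is that once the last reference to the file goes away, the
pending poll completes with an error (currently -ECANCELED, see the
first point above) instead of staying armed forever.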

I'm pasting it below in order to make it easier to get comments...

metze

  fs/file_table.c                |   3 ++
  include/linux/fs.h             |   1 +
  include/linux/io_uring.h       |  12 +++++
  include/linux/io_uring_types.h |   4 ++
  include/uapi/linux/io_uring.h  |   1 +
  io_uring/opdef.c               |   1 +
  io_uring/poll.c                | 100 ++++++++++++++++++++++++++++++++++++++++-
  io_uring/poll.h                |   1 +
  8 files changed, 122 insertions(+), 1 deletion(-)

diff --git a/fs/file_table.c b/fs/file_table.c
index dd88701e54a9..cad408e9c0f5 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -16,6 +16,7 @@
  #include <linux/security.h>
  #include <linux/cred.h>
  #include <linux/eventpoll.h>
+#include <linux/io_uring.h>
  #include <linux/rcupdate.h>
  #include <linux/mount.h>
  #include <linux/capability.h>
@@ -147,6 +148,7 @@ static struct file *__alloc_file(int flags, const struct cred *cred)
  	}

  	atomic_long_set(&f->f_count, 1);
+	INIT_LIST_HEAD(&f->f_uring_poll);
  	rwlock_init(&f->f_owner.lock);
  	spin_lock_init(&f->f_lock);
  	mutex_init(&f->f_pos_lock);
@@ -309,6 +311,7 @@ static void __fput(struct file *file)
  	 * in the file cleanup chain.
  	 */
  	eventpoll_release(file);
+	io_uring_poll_release(file);
  	locks_remove_file(file);

  	ima_file_free(file);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e654435f1651..7f99efa7a1dc 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -972,6 +972,7 @@ struct file {
  	/* Used by fs/eventpoll.c to link all the hooks to this file */
  	struct hlist_head	*f_ep;
  #endif /* #ifdef CONFIG_EPOLL */
+	struct list_head	f_uring_poll;
  	struct address_space	*f_mapping;
  	errseq_t		f_wb_err;
  	errseq_t		f_sb_err; /* for syncfs */
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index 43bc8a2edccf..c931ea92c29a 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -61,6 +61,15 @@ static inline void io_uring_free(struct task_struct *tsk)
  	if (tsk->io_uring)
  		__io_uring_free(tsk);
  }
+
+void io_uring_poll_release_file(struct file *file);
+static inline void io_uring_poll_release(struct file *file)
+{
+	if (likely(list_empty_careful(&file->f_uring_poll)))
+		return;
+
+	io_uring_poll_release_file(file);
+}
  #else
  static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
  			      struct iov_iter *iter, void *ioucmd)
@@ -92,6 +101,9 @@ static inline const char *io_uring_get_opcode(u8 opcode)
  {
  	return "";
  }
+static inline void io_uring_poll_release(struct file *file)
+{
+}
  #endif

  #endif
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index f5b687a787a3..2373e01c57e7 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -547,8 +547,12 @@ struct io_kiocb {
  	union {
  		/* used by request caches, completion batching and iopoll */
  		struct io_wq_work_node	comp_list;
+		struct {
  		/* cache ->apoll->events */
  		__poll_t apoll_events;
+		u8 poll_cancel_on_close:1;
+		struct list_head		f_uring_poll_entry;
+		};
  	};
  	atomic_t			refs;
  	atomic_t			poll_refs;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index a2ce8ba7abb5..fe311667cb8c 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -276,6 +276,7 @@ enum io_uring_op {
  #define IORING_POLL_UPDATE_EVENTS	(1U << 1)
  #define IORING_POLL_UPDATE_USER_DATA	(1U << 2)
  #define IORING_POLL_ADD_LEVEL		(1U << 3)
+#define IORING_POLL_CANCEL_ON_CLOSE	(1U << 4)

  /*
   * ASYNC_CANCEL flags.
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index 34b08c87ffa5..540ee55961a3 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -131,6 +131,7 @@ const struct io_op_def io_op_defs[] = {
  		.name			= "POLL_ADD",
  		.prep			= io_poll_add_prep,
  		.issue			= io_poll_add,
+		.cleanup		= io_poll_cleanup,
  	},
  	[IORING_OP_POLL_REMOVE] = {
  		.audit_skip		= 1,
diff --git a/io_uring/poll.c b/io_uring/poll.c
index 0d9f49c575e0..d4ccf2f2e815 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -163,6 +163,19 @@ static inline void io_poll_remove_entry(struct io_poll *poll)

  static void io_poll_remove_entries(struct io_kiocb *req)
  {
+	if (!list_empty_careful(&req->f_uring_poll_entry)) {
+		spin_lock(&req->file->f_lock);
+		list_del_init_careful(&req->f_uring_poll_entry);
+		/*
+		 * upgrade to a full reference again,
+		 * it will be released in the common
+		 * cleanup code via io_put_file().
+		 */
+		if (!(req->flags & REQ_F_FIXED_FILE))
+			WARN_ON_ONCE(!get_file_rcu(req->file));
+		spin_unlock(&req->file->f_lock);
+	}
+
  	/*
  	 * Nothing to do if neither of those flags are set. Avoid dipping
  	 * into the poll/apoll/double cachelines if we can.
@@ -199,6 +212,54 @@ enum {
  	IOU_POLL_REMOVE_POLL_USE_RES = 2,
  };

+static inline struct file *io_poll_get_additional_file_ref(struct io_kiocb *req,
+							   unsigned issue_flags)
+{
+	if (!(req->poll_cancel_on_close))
+		return NULL;
+
+	if (unlikely(!req->file))
+		return NULL;
+
+	req->flags |= REQ_F_NEED_CLEANUP;
+
+	if (list_empty_careful(&req->f_uring_poll_entry)) {
+		/*
+		 * The first time we need to add ourselves to the
+		 * file->f_uring_poll list.
+		 */
+		spin_lock(&req->file->f_lock);
+		list_add_tail(&req->f_uring_poll_entry, &req->file->f_uring_poll);
+		spin_unlock(&req->file->f_lock);
+		if (!(req->flags & REQ_F_FIXED_FILE)) {
+			/*
+			 * If it's not a fixed file,
+			 * we can allow the caller to drop the existing
+			 * reference.
+			 */
+			return req->file;
+		}
+		/*
+		 * For fixed files we grab an additional reference
+		 */
+	}
+
+	io_ring_submit_lock(req->ctx, issue_flags);
+	if (unlikely(!req->file)) {
+		io_ring_submit_unlock(req->ctx, issue_flags);
+		return NULL;
+	}
+	rcu_read_lock();
+	if (unlikely(!get_file_rcu(req->file))) {
+		req->file = NULL;
+		req->cqe.fd = -1;
+		io_poll_mark_cancelled(req);
+	}
+	rcu_read_unlock();
+	io_ring_submit_unlock(req->ctx, issue_flags);
+	return req->file;
+}
+
  /*
   * All poll tw should go through this. Checks for poll events, manages
   * references, does rewait, etc.
@@ -230,7 +291,12 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked)
  		/* the mask was stashed in __io_poll_execute */
  		if (!req->cqe.res) {
  			struct poll_table_struct pt = { ._key = req->apoll_events };
+			unsigned issue_flags = (!*locked) ? IO_URING_F_UNLOCKED : 0;
+			struct file *file_to_put = io_poll_get_additional_file_ref(req, issue_flags);
+			if (unlikely(!req->file))
+				return -ECANCELED;
  			req->cqe.res = vfs_poll(req->file, &pt) & req->apoll_events;
+			io_put_file(file_to_put);
  		}

  		if ((unlikely(!req->cqe.res)))
@@ -499,6 +565,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
  				 unsigned issue_flags)
  {
  	struct io_ring_ctx *ctx = req->ctx;
+	struct file *file_to_put;
  	int v;

  	INIT_HLIST_NODE(&req->hash_node);
@@ -506,6 +573,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
  	io_init_poll_iocb(poll, mask, io_poll_wake);
  	poll->file = req->file;
  	req->apoll_events = poll->events;
+	INIT_LIST_HEAD(&req->f_uring_poll_entry);

  	ipt->pt._key = mask;
  	ipt->req = req;
@@ -529,7 +597,11 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
  	if (issue_flags & IO_URING_F_UNLOCKED)
  		req->flags &= ~REQ_F_HASH_LOCKED;

+	file_to_put = io_poll_get_additional_file_ref(req, issue_flags);
+	if (unlikely(!req->file))
+		return -ECANCELED;
  	mask = vfs_poll(req->file, &ipt->pt) & poll->events;
+	io_put_file(file_to_put);

  	if (unlikely(ipt->error || !ipt->nr_entries)) {
  		io_poll_remove_entries(req);
@@ -857,11 +929,17 @@ int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
  	if (sqe->buf_index || sqe->off || sqe->addr)
  		return -EINVAL;
  	flags = READ_ONCE(sqe->len);
-	if (flags & ~IORING_POLL_ADD_MULTI)
+	if (flags & ~(IORING_POLL_ADD_MULTI|IORING_POLL_CANCEL_ON_CLOSE))
  		return -EINVAL;
  	if ((flags & IORING_POLL_ADD_MULTI) && (req->flags & REQ_F_CQE_SKIP))
  		return -EINVAL;

+	if (flags & IORING_POLL_CANCEL_ON_CLOSE) {
+		req->poll_cancel_on_close = 1;
+	} else {
+		req->poll_cancel_on_close = 0;
+	}
+
  	poll->events = io_poll_parse_events(sqe, flags);
  	return 0;
  }
@@ -963,3 +1041,23 @@ void io_apoll_cache_free(struct io_cache_entry *entry)
  {
  	kfree(container_of(entry, struct async_poll, cache));
  }
+
+void io_uring_poll_release_file(struct file *file)
+{
+	struct io_kiocb *req, *next;
+
+	list_for_each_entry_safe(req, next, &file->f_uring_poll, f_uring_poll_entry) {
+		io_ring_submit_lock(req->ctx, IO_URING_F_UNLOCKED);
+		io_poll_mark_cancelled(req);
+		list_del_init_careful(&req->f_uring_poll_entry);
+		io_poll_remove_entries(req);
+		req->file = NULL;
+		io_poll_execute(req, 0);
+		io_ring_submit_unlock(req->ctx, IO_URING_F_UNLOCKED);
+	}
+}
+
+void io_poll_cleanup(struct io_kiocb *req)
+{
+	io_poll_remove_entries(req);
+}
diff --git a/io_uring/poll.h b/io_uring/poll.h
index ef25c26fdaf8..43e6b877f1bc 100644
--- a/io_uring/poll.h
+++ b/io_uring/poll.h
@@ -27,6 +27,7 @@ struct async_poll {

  int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
  int io_poll_add(struct io_kiocb *req, unsigned int issue_flags);
+void io_poll_cleanup(struct io_kiocb *req);

  int io_poll_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
  int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags);



