[PATCH] Fix epoll backend to allow separate read/write events on one fd.

Jeremy Allison jra at samba.org
Wed Feb 13 12:31:52 MST 2013


The tevent epoll backend has a bug in that it does
not allow separate read/write events to be added
using the same file descriptor.

Unfortunately Samba uses this functionality. The best example
is inside nb_trans_got_reader() in source3/libsmb/namequery.c.

We do:

        subreq = sock_packet_read_send(
                state, state->ev, state->sock,
                state->reader, state->type, state->trn_id,
                state->validator, state->private_data);
        if (tevent_req_nomem(subreq, req)) {
                return;
        }
        tevent_req_set_callback(subreq, nb_trans_done, req);

        subreq = sendto_send(state, state->ev, state->sock,
                             state->buf, state->buflen, 0, state->dst_addr);

The sock_packet_read_send() calls recvfrom_send()
which adds a TEVENT_FD_READ on state->sock, then a few
lines further down sendto_send() adds a TEVENT_FD_WRITE
on state->sock also, which is the same file descriptor.

The epoll backend doesn't currently cope with this as every
time a new fd event is added it calls epoll_ctl(..., EPOLL_CTL_ADD,..)
which fails with errno == EEXIST if the fd being added
is already associated with the events array. This
causes the epoll backend to fail on adding an event
and call epoll_panic().

The reason we get away with this currently in Samba
as we use the "standard" backend, which silently
falls back to using select() in current master (or
to poll() when my previous patchset is applied). So
we don't notice the epoll backend failure in this
case.

The following patchset fixes this by allowing read and
write fd_events to be multiplexed correctly inside the
tevent epoll backend.

It includes a regression test that demonstrates the
problem and causes smbtorture local.event to fail
if all the tevent backends do not support separate
read/write events using the same fd. It depends on
the previous patchset that fixes the epoll->poll
fallback in the "standard" backend.

Please review and push to master on top of the previous
tevent patchset if you're happy with it !

Cheers,

        Jeremy.
-------------- next part --------------
From 17ced8e1b025c03586538f93177ee22ef97bde54 Mon Sep 17 00:00:00 2001
From: Jeremy Allison <jra at samba.org>
Date: Tue, 12 Feb 2013 13:15:02 -0800
Subject: [PATCH 1/7] Start to fix the epoll backend to support 2 fd events on
 the same fd correctly.

Add a utility function that finds an fde with a duplicate fd.

Signed-off-by: Jeremy Allison <jra at samba.org>
---
 lib/tevent/tevent_epoll.c | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/lib/tevent/tevent_epoll.c b/lib/tevent/tevent_epoll.c
index 0fea916..7585ff2 100644
--- a/lib/tevent/tevent_epoll.c
+++ b/lib/tevent/tevent_epoll.c
@@ -185,6 +185,31 @@ static void epoll_check_reopen(struct epoll_event_context *epoll_ev)
 #define EPOLL_ADDITIONAL_FD_FLAG_GOT_ERROR	(1<<2)
 
 /*
+ Utility function to support multiplexing two fde events,
+ one read, one write, into one epoll event. This function
+ finds an fde event with the same file descriptor that
+ has been added as an epoll event.
+*/
+
+static struct tevent_fd *find_duplicate_fde(struct epoll_event_context *epoll_ev,
+					struct tevent_fd *fde)
+{
+	struct tevent_fd *orig_fde;
+
+	for (orig_fde = epoll_ev->ev->fd_events; orig_fde; orig_fde = orig_fde->next) {
+		if (orig_fde == fde) {
+			continue;
+		}
+		if (orig_fde->fd == fde->fd &&
+		    (orig_fde->additional_flags & EPOLL_ADDITIONAL_FD_FLAG_HAS_EVENT)) {
+			break;
+		}
+	}
+
+	return orig_fde;
+}
+
+/*
  add the epoll event to the given fd_event
 */
 static void epoll_add_event(struct epoll_event_context *epoll_ev, struct tevent_fd *fde)
-- 
1.8.1


From d97a42d197ae08f5d2cdca59894d8aaf04244885 Mon Sep 17 00:00:00 2001
From: Jeremy Allison <jra at samba.org>
Date: Tue, 12 Feb 2013 13:17:59 -0800
Subject: [PATCH 2/7] Add another utility function epoll_add_duplicate_fd().

This will be called by epoll_add_event() to merge two
fde events with the same file descriptor.

Signed-off-by: Jeremy Allison <jra at samba.org>
---
 lib/tevent/tevent_epoll.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/lib/tevent/tevent_epoll.c b/lib/tevent/tevent_epoll.c
index 7585ff2..43784cf 100644
--- a/lib/tevent/tevent_epoll.c
+++ b/lib/tevent/tevent_epoll.c
@@ -183,6 +183,7 @@ static void epoll_check_reopen(struct epoll_event_context *epoll_ev)
 #define EPOLL_ADDITIONAL_FD_FLAG_HAS_EVENT	(1<<0)
 #define EPOLL_ADDITIONAL_FD_FLAG_REPORT_ERROR	(1<<1)
 #define EPOLL_ADDITIONAL_FD_FLAG_GOT_ERROR	(1<<2)
+#define EPOLL_ADDITIONAL_FD_WAS_MULTIPLEXED	(1<<3)
 
 /*
  Utility function to support multiplexing two fde events,
@@ -210,6 +211,50 @@ static struct tevent_fd *find_duplicate_fde(struct epoll_event_context *epoll_ev
 }
 
 /*
+ epoll cannot add the same file descriptor twice, once
+ with read, once with write which is allowed by the
+ tevent backend. Multiplex the existing fde, flag it
+ as such so we can search for the correct fde on
+ event triggering.
+*/
+
+static int epoll_add_duplicate_fd(struct epoll_event_context *epoll_ev,
+				struct tevent_fd *add_fde)
+{
+	struct epoll_event event;
+	struct tevent_fd *orig_fde;
+	int ret;
+
+	/* Find the existing fde that caused the EEXIST error. */
+	orig_fde = find_duplicate_fde(epoll_ev, add_fde);
+	if (orig_fde == NULL) {
+		epoll_panic(epoll_ev, "logic error");
+		return -1;
+	}
+
+	if (orig_fde->additional_flags & EPOLL_ADDITIONAL_FD_WAS_MULTIPLEXED) {
+		/* Logic error. Can't have more than 2 multiplexed fde's. */
+		epoll_panic(epoll_ev, "logic error");
+		return -1;
+	}
+
+	/* Modify the orig_fde to add in the new flags. */
+	ZERO_STRUCT(event);
+	event.events = epoll_map_flags(orig_fde->flags) |
+			epoll_map_flags(add_fde->flags);
+	event.data.ptr = orig_fde;
+	ret = epoll_ctl(epoll_ev->epoll_fd, EPOLL_CTL_MOD, orig_fde->fd, &event);
+	if (ret != 0) {
+		return ret;
+	}
+
+	/* Now flag both fde's as being multiplexed. */
+	orig_fde->additional_flags |= EPOLL_ADDITIONAL_FD_WAS_MULTIPLEXED;
+	add_fde->additional_flags |= EPOLL_ADDITIONAL_FD_WAS_MULTIPLEXED;
+	return ret;
+}
+
+/*
  add the epoll event to the given fd_event
 */
 static void epoll_add_event(struct epoll_event_context *epoll_ev, struct tevent_fd *fde)
-- 
1.8.1


From fe7781bc8f69c1dfcc59b1b2634442a783150d75 Mon Sep 17 00:00:00 2001
From: Jeremy Allison <jra at samba.org>
Date: Tue, 12 Feb 2013 13:20:00 -0800
Subject: [PATCH 3/7] If epoll_ctl(..EPOLL_CTL_ADD,..) failes with EEXIST,
 merge the two fde's into one epoll event.

Signed-off-by: Jeremy Allison <jra at samba.org>
---
 lib/tevent/tevent_epoll.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/lib/tevent/tevent_epoll.c b/lib/tevent/tevent_epoll.c
index 43784cf..47cdd01 100644
--- a/lib/tevent/tevent_epoll.c
+++ b/lib/tevent/tevent_epoll.c
@@ -272,8 +272,16 @@ static void epoll_add_event(struct epoll_event_context *epoll_ev, struct tevent_
 	event.events = epoll_map_flags(fde->flags);
 	event.data.ptr = fde;
 	if (epoll_ctl(epoll_ev->epoll_fd, EPOLL_CTL_ADD, fde->fd, &event) != 0) {
-		epoll_panic(epoll_ev, "EPOLL_CTL_ADD failed");
-		return;
+		if (errno == EEXIST) {
+			if (epoll_add_duplicate_fd(epoll_ev, fde) != 0) {
+				epoll_panic(epoll_ev, "epoll_add_duplicate_fd "
+					"failed");
+				return;
+			}
+		} else {
+			epoll_panic(epoll_ev, "EPOLL_CTL_ADD failed");
+			return;
+		}
 	}
 	fde->additional_flags |= EPOLL_ADDITIONAL_FD_FLAG_HAS_EVENT;
 
-- 
1.8.1


From 9bb1971a127642fb362c5ec0805e0e10259ce1ac Mon Sep 17 00:00:00 2001
From: Jeremy Allison <jra at samba.org>
Date: Tue, 12 Feb 2013 13:22:59 -0800
Subject: [PATCH 4/7] Fix up epoll_del_event to cope with deleting a
 multiplexed fde event.

Signed-off-by: Jeremy Allison <jra at samba.org>
---
 lib/tevent/tevent_epoll.c | 39 ++++++++++++++++++++++++++++++++-------
 1 file changed, 32 insertions(+), 7 deletions(-)

diff --git a/lib/tevent/tevent_epoll.c b/lib/tevent/tevent_epoll.c
index 47cdd01..a418c16 100644
--- a/lib/tevent/tevent_epoll.c
+++ b/lib/tevent/tevent_epoll.c
@@ -305,13 +305,38 @@ static void epoll_del_event(struct epoll_event_context *epoll_ev, struct tevent_
 	/* if there's no epoll_event, we don't need to delete it */
 	if (!(fde->additional_flags & EPOLL_ADDITIONAL_FD_FLAG_HAS_EVENT)) return;
 
-	ZERO_STRUCT(event);
-	event.events = epoll_map_flags(fde->flags);
-	event.data.ptr = fde;
-	if (epoll_ctl(epoll_ev->epoll_fd, EPOLL_CTL_DEL, fde->fd, &event) != 0) {
-		tevent_debug(epoll_ev->ev, TEVENT_DEBUG_FATAL,
-			     "epoll_del_event failed! probable early close bug (%s)\n",
-			     strerror(errno));
+	if (fde->additional_flags & EPOLL_ADDITIONAL_FD_WAS_MULTIPLEXED) {
+		/*
+		 * If this is a multiplexed fde, we need to replace it
+		 * with the remaining event.
+		 */
+		struct tevent_fd *mpx_fde = find_duplicate_fde(epoll_ev, fde);
+		if (mpx_fde == NULL) {
+			epoll_panic(epoll_ev, "find_duplicate_fd failed");
+			return;
+		}
+		ZERO_STRUCT(event);
+		event.events = epoll_map_flags(mpx_fde->flags);
+		event.data.ptr = mpx_fde;
+		if (epoll_ctl(epoll_ev->epoll_fd, EPOLL_CTL_MOD, mpx_fde->fd, &event) != 0) {
+			tevent_debug(epoll_ev->ev, TEVENT_DEBUG_FATAL,
+				"epoll_del_event failed changing mpx fd (%s)\n",
+				strerror(errno));
+			epoll_panic(epoll_ev, "EPOLL_CTL_ADD failed");
+			return;
+		}
+		/* Remove multiplexed flag from both events. */
+		fde->additional_flags &= ~EPOLL_ADDITIONAL_FD_WAS_MULTIPLEXED;
+		mpx_fde->additional_flags &= ~EPOLL_ADDITIONAL_FD_WAS_MULTIPLEXED;
+	} else {
+		ZERO_STRUCT(event);
+		event.events = epoll_map_flags(fde->flags);
+		event.data.ptr = fde;
+		if (epoll_ctl(epoll_ev->epoll_fd, EPOLL_CTL_DEL, fde->fd, &event) != 0) {
+			tevent_debug(epoll_ev->ev, TEVENT_DEBUG_FATAL,
+				"epoll_del_event failed! probable early close bug (%s)\n",
+				strerror(errno));
+		}
 	}
 	fde->additional_flags &= ~EPOLL_ADDITIONAL_FD_FLAG_HAS_EVENT;
 }
-- 
1.8.1


From 4ad68185c97b01850d2dad5c2ae52067bb102f97 Mon Sep 17 00:00:00 2001
From: Jeremy Allison <jra at samba.org>
Date: Tue, 12 Feb 2013 13:24:21 -0800
Subject: [PATCH 5/7] Fix epoll_mod_event() to cope with modifying a
 multiplexed fde event.

Signed-off-by: Jeremy Allison <jra at samba.org>
---
 lib/tevent/tevent_epoll.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/lib/tevent/tevent_epoll.c b/lib/tevent/tevent_epoll.c
index a418c16..d2265fa 100644
--- a/lib/tevent/tevent_epoll.c
+++ b/lib/tevent/tevent_epoll.c
@@ -353,6 +353,18 @@ static void epoll_mod_event(struct epoll_event_context *epoll_ev, struct tevent_
 
 	ZERO_STRUCT(event);
 	event.events = epoll_map_flags(fde->flags);
+	if (fde->additional_flags & EPOLL_ADDITIONAL_FD_WAS_MULTIPLEXED) {
+		/*
+		 * This is a multiplexed fde, we need to include both
+		 * flags in the modified event.
+		 */
+		struct tevent_fd *mpx_fde = find_duplicate_fde(epoll_ev, fde);
+		if (mpx_fde == NULL) {
+			epoll_panic(epoll_ev, "find_duplicate_fd failed");
+			return;
+		}
+		event.events |= epoll_map_flags(mpx_fde->flags);
+	}
 	event.data.ptr = fde;
 	if (epoll_ctl(epoll_ev->epoll_fd, EPOLL_CTL_MOD, fde->fd, &event) != 0) {
 		epoll_panic(epoll_ev, "EPOLL_CTL_MOD failed");
-- 
1.8.1


From 27f9b9d11f41fe290244c2bbe89a1f8973a6f459 Mon Sep 17 00:00:00 2001
From: Jeremy Allison <jra at samba.org>
Date: Tue, 12 Feb 2013 13:25:37 -0800
Subject: [PATCH 6/7] In epoll_event_loop() ensure we trigger the right handler
 for a multiplexed fde event.

Signed-off-by: Jeremy Allison <jra at samba.org>
---
 lib/tevent/tevent_epoll.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/lib/tevent/tevent_epoll.c b/lib/tevent/tevent_epoll.c
index d2265fa..5be9db3 100644
--- a/lib/tevent/tevent_epoll.c
+++ b/lib/tevent/tevent_epoll.c
@@ -478,6 +478,18 @@ static int epoll_event_loop(struct epoll_event_context *epoll_ev, struct timeval
 		}
 		if (events[i].events & EPOLLIN) flags |= TEVENT_FD_READ;
 		if (events[i].events & EPOLLOUT) flags |= TEVENT_FD_WRITE;
+
+		if (fde->additional_flags & EPOLL_ADDITIONAL_FD_WAS_MULTIPLEXED) {
+			/* Ensure we got the right fde. */
+			if ((flags & fde->flags) == 0) {
+				fde = find_duplicate_fde(epoll_ev, fde);
+				if (fde == NULL) {
+					epoll_panic(epoll_ev, "logic error");
+					return -1;
+				}
+			}
+		}
+
 		if (flags) {
 			fde->handler(epoll_ev->ev, fde, flags, fde->private_data);
 			break;
-- 
1.8.1


From fde9624309c5db860690c41f705e370e2bfab829 Mon Sep 17 00:00:00 2001
From: Jeremy Allison <jra at samba.org>
Date: Wed, 13 Feb 2013 10:45:40 -0800
Subject: [PATCH 7/7] Regression test to ensure that a tevent backend can cope
 with separate read/write events on a single fd.

This tests the multiplex fd changes to the epoll backend to
ensure they work correctly.

Signed-off-by: Jeremy Allison <jra at samba.org>
---
 lib/tevent/testsuite.c | 51 ++++++++++++++++++++++++++++++++++----------------
 1 file changed, 35 insertions(+), 16 deletions(-)

diff --git a/lib/tevent/testsuite.c b/lib/tevent/testsuite.c
index 3d2a79a..ba6bbee 100644
--- a/lib/tevent/testsuite.c
+++ b/lib/tevent/testsuite.c
@@ -34,19 +34,26 @@
 #endif
 
 static int fde_count;
+static int fde_wcount;
 
-static void fde_handler(struct tevent_context *ev_ctx, struct tevent_fd *f, 
+static void fde_handler_readwrite(struct tevent_context *ev_ctx, struct tevent_fd *f, 
 			uint16_t flags, void *private_data)
 {
 	int *fd = (int *)private_data;
-	char c;
+	char c = 0;
 #ifdef SA_SIGINFO
 	kill(getpid(), SIGUSR1);
 #endif
 	kill(getpid(), SIGALRM);
-	read(fd[0], &c, 1);
-	write(fd[1], &c, 1);
-	fde_count++;
+
+	if ((flags & TEVENT_FD_WRITE) && (fde_wcount - fde_count < 256)) {
+		/* Don't fill the pipe and block... */
+		write(fd[1], &c, 1);
+		fde_wcount++;
+	} else {
+		read(fd[0], &c, 1);
+		fde_count++;
+	}
 }
 
 static void finished_handler(struct tevent_context *ev_ctx, struct tevent_timer *te,
@@ -70,7 +77,10 @@ static bool test_event_context(struct torture_context *test,
 	int fd[2] = { -1, -1 };
 	const char *backend = (const char *)test_data;
 	int alarm_count=0, info_count=0;
-	struct tevent_fd *fde;
+	struct tevent_fd *fde_read;
+	struct tevent_fd *fde_read_1;
+	struct tevent_fd *fde_write;
+	struct tevent_fd *fde_write_1;
 #ifdef SA_RESTART
 	struct tevent_signal *se1 = NULL;
 #endif
@@ -80,7 +90,6 @@ static bool test_event_context(struct torture_context *test,
 #endif
 	int finished=0;
 	struct timeval t;
-	char c = 0;
 
 	ev_ctx = tevent_context_init_byname(test, backend);
 	if (ev_ctx == NULL) {
@@ -92,13 +101,23 @@ static bool test_event_context(struct torture_context *test,
 
 	/* reset globals */
 	fde_count = 0;
+	fde_wcount = 0;
 
 	/* create a pipe */
 	pipe(fd);
 
-	fde = tevent_add_fd(ev_ctx, ev_ctx, fd[0], TEVENT_FD_READ,
-			    fde_handler, fd);
-	tevent_fd_set_auto_close(fde);
+	fde_read = tevent_add_fd(ev_ctx, ev_ctx, fd[0], TEVENT_FD_READ,
+			    fde_handler_readwrite, fd);
+	fde_write_1 = tevent_add_fd(ev_ctx, ev_ctx, fd[0], TEVENT_FD_WRITE,
+			    fde_handler_readwrite, fd);
+
+	fde_write = tevent_add_fd(ev_ctx, ev_ctx, fd[1], TEVENT_FD_WRITE,
+			    fde_handler_readwrite, fd);
+	fde_read_1 = tevent_add_fd(ev_ctx, ev_ctx, fd[1], TEVENT_FD_READ,
+			    fde_handler_readwrite, fd);
+
+	tevent_fd_set_auto_close(fde_read);
+	tevent_fd_set_auto_close(fde_write);
 
 	tevent_add_timer(ev_ctx, ev_ctx, timeval_current_ofs(2,0),
 			 finished_handler, &finished);
@@ -113,8 +132,6 @@ static bool test_event_context(struct torture_context *test,
 	se3 = tevent_add_signal(ev_ctx, ev_ctx, SIGUSR1, SA_SIGINFO, count_handler, &info_count);
 #endif
 
-	write(fd[1], &c, 1);
-
 	t = timeval_current();
 	while (!finished) {
 		errno = 0;
@@ -124,8 +141,10 @@ static bool test_event_context(struct torture_context *test,
 		}
 	}
 
-	talloc_free(fde);
-	close(fd[1]);
+	talloc_free(fde_read);
+	talloc_free(fde_write);
+	talloc_free(fde_read_1);
+	talloc_free(fde_write_1);
 
 	while (alarm_count < fde_count+1) {
 		if (tevent_loop_once(ev_ctx) == -1) {
@@ -139,11 +158,11 @@ static bool test_event_context(struct torture_context *test,
 	talloc_free(se1);
 #endif
 
-	torture_assert_int_equal(test, alarm_count, 1+fde_count, "alarm count mismatch");
+	torture_assert_int_equal(test, alarm_count, 1+fde_count+fde_wcount, "alarm count mismatch");
 
 #ifdef SA_SIGINFO
 	talloc_free(se3);
-	torture_assert_int_equal(test, info_count, fde_count, "info count mismatch");
+	torture_assert_int_equal(test, info_count, fde_count+fde_wcount, "info count mismatch");
 #endif
 
 	talloc_free(ev_ctx);
-- 
1.8.1



More information about the samba-technical mailing list