[PATCH] Restarting cleanupd when ctdb-messaging is down

Ralph Boehme slow at samba.org
Fri Jul 15 08:34:27 UTC 2016


Hi!

Attached is a patch for bug:
<https://bugzilla.samba.org/show_bug.cgi?id=12022>

When running "ctdb ban" on a node where "ctdb timeout" parameter is
set as well, cleanup may exit because of the "ctdb timeout" when
attempting ctdb database ops.

smbd will then restart cleanupd (since patch for bug 11855), but
cleanup will fail to reinitialize messaging with ctdb (remember: ctdb
node is banned, this causes failure in ctdb_working()) and immediately
exit once again. No further restart attempts are tried, because no
child objects was added to the child list in smbd.

To fix this we need three things:
- keep retrying to start cleanupd at intervals
- queue cleanup events in the parent smbd as long as cleanupd is down
- once cleanupd comes back, send him the cleanup events

Patch attached, please review and push if ok.

Cheerio!
-slow
-------------- next part --------------
From e248222946b670903e27c9c2bdb88449eb11fb51 Mon Sep 17 00:00:00 2001
From: Ralph Boehme <slow at samba.org>
Date: Thu, 14 Jul 2016 16:31:44 +0200
Subject: [PATCH 1/3] s3/smbd: add cleanupd_init_send()/recv()

Previously, without this patch, if cleanupd died for whatever reason, it
would be restarted from smbd. However, if cleanupd initialization
failed and it exitted again, there would be no child entry in smbd for
it and it wouldn't be attempted to restart it again.

This patch adds async send/recv methods for starting cleanupd that will
reschedule restart attempt every second in case initilisation failed.

Bug: https://bugzilla.samba.org/show_bug.cgi?id=12022

Signed-off-by: Ralph Boehme <slow at samba.org>
---
 source3/smbd/server.c | 106 ++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 103 insertions(+), 3 deletions(-)

diff --git a/source3/smbd/server.c b/source3/smbd/server.c
index 6e70edc..9fed646 100644
--- a/source3/smbd/server.c
+++ b/source3/smbd/server.c
@@ -527,6 +527,87 @@ static void cleanupd_stopped(struct tevent_req *req)
 	DBG_WARNING("cleanupd stopped: %s\n", nt_errstr(status));
 }
 
+static void cleanupd_init_timer(struct tevent_context *ev,
+				struct tevent_timer *te,
+				struct timeval current_time,
+				void *private_data);
+
+struct cleanup_init_state {
+	bool ok;
+	struct tevent_context *ev;
+	struct messaging_context *msg;
+	struct tevent_timer *te;
+	struct server_id *ppid;
+};
+
+static struct tevent_req *cleanupd_init_send(struct tevent_context *ev,
+					     TALLOC_CTX *mem_ctx,
+					     struct messaging_context *msg,
+					     struct server_id *ppid)
+{
+	struct tevent_req *req = NULL;
+	struct cleanup_init_state *state = NULL;
+	struct timeval tv;
+
+	req = tevent_req_create(mem_ctx, &state, struct cleanup_init_state);
+	if (req == NULL) {
+		return NULL;
+	}
+
+	*state = (struct cleanup_init_state) {
+		.msg = msg,
+		.ev = ev,
+		.ppid = ppid
+	};
+
+	tv = tevent_timeval_current_ofs(0, 0);
+
+	state->te = tevent_add_timer(ev, state, tv, cleanupd_init_timer, req);
+	if (tevent_req_nomem(state->te, req)) {
+		tevent_req_post(req, ev);
+	}
+
+	return req;
+}
+
+static void cleanupd_init_timer(struct tevent_context *ev,
+				struct tevent_timer *te,
+				struct timeval current_time,
+				void *private_data)
+{
+	struct tevent_req *req = talloc_get_type_abort(
+		private_data, struct tevent_req);
+	struct cleanup_init_state *state = tevent_req_data(
+		req, struct cleanup_init_state);
+	struct timeval tv;
+
+	DBG_NOTICE("Initializing cleanupd from restart timer()\n");
+
+	state->ok = cleanupd_init(state->msg, false, state->ppid);
+	if (state->ok) {
+		DBG_WARNING("cleanupd restarted\n");
+		tevent_req_done(req);
+		return;
+	}
+
+	DBG_ERR("Scheduled cleanupd restart failed, rescheduling\n");
+
+	tv = tevent_timeval_current_ofs(1, 0);
+
+	state->te = tevent_add_timer(ev, state, tv, cleanupd_init_timer, req);
+	if (tevent_req_nomem(state->te, req)) {
+		tevent_req_post(req, ev);
+	}
+}
+
+static bool cleanupd_init_recv(struct tevent_req *req)
+{
+	struct cleanup_init_state *state = tevent_req_data(
+		req, struct cleanup_init_state);
+
+	return state->ok;
+}
+
 /*
   at most every smbd:cleanuptime seconds (default 20), we scan the BRL
   and locking database for entries to cleanup. As a side effect this
@@ -553,6 +634,18 @@ static void cleanup_timeout_fn(struct tevent_context *event_ctx,
 			   MSG_SMB_UNLOCK, NULL, 0);
 }
 
+static void cleanupd_started(struct tevent_req *req)
+{
+	bool ok;
+
+	ok = cleanupd_init_recv(req);
+	TALLOC_FREE(req);
+	if (!ok) {
+		DBG_ERR("Failed to restart cleanupd, giving up\n");
+		return;
+	}
+}
+
 static void remove_child_pid(struct smbd_parent_context *parent,
 			     pid_t pid,
 			     bool unclean_shutdown)
@@ -578,13 +671,20 @@ static void remove_child_pid(struct smbd_parent_context *parent,
 	}
 
 	if (pid == procid_to_pid(&parent->cleanupd)) {
-		bool ok;
+		struct tevent_req *req;
+
+		server_id_set_disconnected(&parent->cleanupd);
 
 		DBG_WARNING("Restarting cleanupd\n");
-		ok = cleanupd_init(parent->msg_ctx, false, &parent->cleanupd);
-		if (!ok) {
+		req = cleanupd_init_send(messaging_tevent_context(parent->msg_ctx),
+					 parent,
+					 parent->msg_ctx,
+					 &parent->cleanupd);
+		if (req == NULL) {
 			DBG_ERR("Failed to restart cleanupd\n");
+			return;
 		}
+		tevent_req_set_callback(req, cleanupd_started, parent);
 		return;
 	}
 
-- 
2.5.0


From 2b9c0422a5dfc58ab0eaf1681b17f93b8ccc750d Mon Sep 17 00:00:00 2001
From: Ralph Boehme <slow at samba.org>
Date: Thu, 14 Jul 2016 16:37:30 +0200
Subject: [PATCH 2/3] s3/smbd: free child outside of the for loop

The next commits needs access to the child after the loop. Other then
that this doesn't change overall behaviour.

Bug: https://bugzilla.samba.org/show_bug.cgi?id=12022

Signed-off-by: Ralph Boehme <slow at samba.org>
---
 source3/smbd/server.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/source3/smbd/server.c b/source3/smbd/server.c
index 9fed646..bbc055d 100644
--- a/source3/smbd/server.c
+++ b/source3/smbd/server.c
@@ -656,9 +656,7 @@ static void remove_child_pid(struct smbd_parent_context *parent,
 
 	for (child = parent->children; child != NULL; child = child->next) {
 		if (child->pid == pid) {
-			struct smbd_child_pid *tmp = child;
 			DLIST_REMOVE(parent->children, child);
-			TALLOC_FREE(tmp);
 			parent->num_children -= 1;
 			break;
 		}
@@ -673,6 +671,7 @@ static void remove_child_pid(struct smbd_parent_context *parent,
 	if (pid == procid_to_pid(&parent->cleanupd)) {
 		struct tevent_req *req;
 
+		TALLOC_FREE(child);
 		server_id_set_disconnected(&parent->cleanupd);
 
 		DBG_WARNING("Restarting cleanupd\n");
@@ -688,6 +687,8 @@ static void remove_child_pid(struct smbd_parent_context *parent,
 		return;
 	}
 
+	TALLOC_FREE(child);
+
 	iov[0] = (struct iovec) { .iov_base = (uint8_t *)&pid,
 				  .iov_len = sizeof(pid) };
 	iov[1] = (struct iovec) { .iov_base = (uint8_t *)&unclean_shutdown,
-- 
2.5.0


From a8e2fb7fd0f7f8c7fc75be529f5d9e1331351c18 Mon Sep 17 00:00:00 2001
From: Ralph Boehme <slow at samba.org>
Date: Thu, 14 Jul 2016 16:43:28 +0200
Subject: [PATCH 3/3] s3/smbd: add queueing of cleanup events

If cleanupd is not running, queue cleanup events in smbd and fire the
off if cleanupd is back again.

Bug: https://bugzilla.samba.org/show_bug.cgi?id=12022

Signed-off-by: Ralph Boehme <slow at samba.org>
---
 source3/smbd/server.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/source3/smbd/server.c b/source3/smbd/server.c
index bbc055d..720091b 100644
--- a/source3/smbd/server.c
+++ b/source3/smbd/server.c
@@ -75,6 +75,9 @@ struct smbd_parent_context {
 	struct server_id cleanupd;
 
 	struct tevent_timer *cleanup_te;
+
+	/* List of pending child cleanups */
+	struct smbd_child_pid *pending_child_cleanups;
 };
 
 struct smbd_open_socket {
@@ -87,6 +90,7 @@ struct smbd_open_socket {
 struct smbd_child_pid {
 	struct smbd_child_pid *prev, *next;
 	pid_t pid;
+	bool unclean_shutdown;
 };
 
 extern void start_epmd(struct tevent_context *ev_ctx,
@@ -637,6 +641,10 @@ static void cleanup_timeout_fn(struct tevent_context *event_ctx,
 static void cleanupd_started(struct tevent_req *req)
 {
 	bool ok;
+	struct smbd_parent_context *parent = tevent_req_callback_data(
+		req, struct smbd_parent_context);
+	struct smbd_child_pid *child;
+	int cleanup_time = lp_parm_int(-1, "smbd", "cleanuptime", 20);
 
 	ok = cleanupd_init_recv(req);
 	TALLOC_FREE(req);
@@ -644,6 +652,45 @@ static void cleanupd_started(struct tevent_req *req)
 		DBG_ERR("Failed to restart cleanupd, giving up\n");
 		return;
 	}
+
+	/* Not effective, but simple and not expected to be called often */
+	while ((child = parent->pending_child_cleanups) != NULL) {
+		struct iovec iov[2];
+		NTSTATUS status;
+
+		DLIST_REMOVE(parent->pending_child_cleanups, child);
+
+		iov[0] = (struct iovec) {
+			.iov_base = (uint8_t *)&child->pid,
+			.iov_len = sizeof(pid_t)
+		};
+
+		iov[1] = (struct iovec) {
+			.iov_base = (uint8_t *)&child->unclean_shutdown,
+			.iov_len = sizeof(bool)
+		};
+
+		status = messaging_send_iov(parent->msg_ctx,
+					    parent->cleanupd,
+					    MSG_SMB_NOTIFY_CLEANUP,
+					    iov, ARRAY_SIZE(iov),
+					    NULL, 0);
+		if (!NT_STATUS_IS_OK(status)) {
+			DBG_ERR("messaging_send_iov returned %s\n",
+				nt_errstr(status));
+		}
+	}
+
+	if (parent->cleanup_te != NULL) {
+		return;
+	}
+
+	parent->cleanup_te = tevent_add_timer(parent->ev_ctx,
+					      parent,
+					      timeval_current_ofs(cleanup_time, 0),
+					      cleanup_timeout_fn,
+					      parent);
+	DBG_NOTICE("Scheduled locks cleanup after cleanupd restart\n");
 }
 
 static void remove_child_pid(struct smbd_parent_context *parent,
@@ -687,6 +734,12 @@ static void remove_child_pid(struct smbd_parent_context *parent,
 		return;
 	}
 
+	if (server_id_is_disconnected(&parent->cleanupd)) {
+		child->unclean_shutdown = unclean_shutdown;
+		DLIST_ADD(parent->pending_child_cleanups, child);
+		return;
+	}
+
 	TALLOC_FREE(child);
 
 	iov[0] = (struct iovec) { .iov_base = (uint8_t *)&pid,
-- 
2.5.0



More information about the samba-technical mailing list