[PATCH] Restarting cleanupd when ctdb-messaging is down

Ralph Böhme slow at samba.org
Fri Jul 22 20:00:25 UTC 2016


On Mon, Jul 18, 2016 at 06:20:49PM +0200, Ralph Böhme wrote:
> As discussed off-list, this will also need some sort of protection
> against cleanup messages lost in-flight, like an ACK message or
> possibly use a local tbd for posting cleanup events. Looking into
> it...

updated patchset attached.

Summary:

o Adds restart retry semantics to cleanupd and notifyd (while I was at
  it)

o cleanupd uses a tdb for passing cleanup events

Passes a private autobuild and smoke testing with:

# ctdb ban 120
# kill $(ps -e -o pid,comm | awk '/ cleanupd$| smbd-notifyd$/ {print $1}')

...wait 120 seconds, watch failing restarts. Finally, after 120
seconds when ctdb messaging is up again, both cleanupd and notifyd are
alive again.

Please review&push if ok. Thanks!

Cheerio!
-slow
-------------- next part --------------
From eec339ab57b20976ed15be63575a724513900f5a Mon Sep 17 00:00:00 2001
From: Ralph Boehme <slow at samba.org>
Date: Thu, 21 Jul 2016 16:53:15 +0200
Subject: [PATCH 1/4] s3/lib: add smbd_cleanupd.tdb

This will be used between cleanupd and smbd for passing information
about exitted smbd childs.

Bug: https://bugzilla.samba.org/show_bug.cgi?id=12022

Signed-off-by: Ralph Boehme <slow at samba.org>
---
 source3/lib/cleanupdb.c | 196 ++++++++++++++++++++++++++++++++++++++++++++++++
 source3/lib/cleanupdb.h |  30 ++++++++
 source3/wscript_build   |   3 +-
 3 files changed, 228 insertions(+), 1 deletion(-)
 create mode 100644 source3/lib/cleanupdb.c
 create mode 100644 source3/lib/cleanupdb.h

diff --git a/source3/lib/cleanupdb.c b/source3/lib/cleanupdb.c
new file mode 100644
index 0000000..386a35e
--- /dev/null
+++ b/source3/lib/cleanupdb.c
@@ -0,0 +1,196 @@
+/*
+   Unix SMB/CIFS implementation.
+   Implementation of reliable cleanup events
+   Copyright (C) Ralph Boehme 2016
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "cleanupdb.h"
+
+struct cleanup_key {
+	pid_t pid;
+};
+
+struct cleanup_rec {
+	/* Storing the pid here as well saves a few lines of code */
+	pid_t pid;
+	bool unclean;
+};
+
+static struct tdb_wrap *cleanup_db(void)
+{
+	static struct tdb_wrap *db;
+	char *db_path = NULL;
+	int tdbflags = TDB_INCOMPATIBLE_HASH | TDB_CLEAR_IF_FIRST |
+		TDB_MUTEX_LOCKING;
+
+	if (db != NULL) {
+		return db;
+	}
+
+	db_path = lock_path("smbd_cleanupd.tdb");
+	if (db_path == NULL) {
+		return NULL;
+	}
+
+	db = tdb_wrap_open(NULL, db_path, 0, tdbflags,
+			   O_CREAT | O_RDWR, 0644);
+	if (db == NULL) {
+		DBG_ERR("Failed to open smbd_cleanupd.tdb\n");
+	}
+
+	TALLOC_FREE(db_path);
+	return db;
+}
+
+static void cleanup_fill_key(const pid_t pid,
+			     struct cleanup_key *key)
+{
+	*key = (struct cleanup_key) {
+		.pid = pid,
+	};
+}
+
+static void cleanup_fill_rec(const pid_t pid,
+			     const bool unclean,
+			     struct cleanup_rec *rec)
+{
+	*rec = (struct cleanup_rec) {
+		.pid = pid,
+		.unclean = unclean
+	};
+}
+
+bool cleanupdb_store_child(const pid_t pid, const bool unclean)
+{
+	struct tdb_wrap *db;
+	struct cleanup_key key;
+	struct cleanup_rec rec;
+	TDB_DATA tdbkey, tdbdata;
+	int result;
+
+	db = cleanup_db();
+	if (db == NULL) {
+		return false;
+	}
+
+	cleanup_fill_key(pid, &key);
+	cleanup_fill_rec(pid, unclean, &rec);
+
+	tdbkey = make_tdb_data((uint8_t *)&key, sizeof(key));
+	tdbdata = make_tdb_data((uint8_t *)&rec, sizeof(rec));
+
+	result = tdb_store(db->tdb, tdbkey, tdbdata, TDB_REPLACE);
+	if (result != 0) {
+		DBG_ERR("tdb_store failed for pid %d\n", (int)pid);
+		return false;
+	}
+
+	return true;
+}
+
+bool cleanupdb_delete_child(const pid_t pid)
+{
+	struct tdb_wrap *db;
+	struct cleanup_key key;
+	TDB_DATA tdbkey;
+	int result;
+
+	db = cleanup_db();
+	if (db == NULL) {
+		return false;
+	}
+
+	cleanup_fill_key(pid, &key);
+	tdbkey = make_tdb_data((uint8_t *)&key, sizeof(key));
+
+	result = tdb_delete(db->tdb, tdbkey);
+	if (result != 0) {
+		DBG_ERR("tdb_delete failed for pid %d\n", (int)pid);
+		return false;
+	}
+
+	return true;
+}
+
+static bool cleanup_rec_parse(TDB_DATA tdbdata,
+			      struct cleanup_rec *cleanup_rec)
+{
+	if (tdbdata.dsize != sizeof(struct cleanup_rec)) {
+		DBG_ERR("Found invalid value length %d in cleanup.tdb\n",
+			(int)tdbdata.dsize);
+		return false;
+	}
+
+	memcpy(cleanup_rec, tdbdata.dptr, sizeof(struct cleanup_rec));
+
+	return true;
+}
+
+struct cleanup_read_state {
+	int (*fn)(const pid_t pid, const bool cleanup, void *private_data);
+	void *private_data;
+};
+
+static int cleanup_traverse_fn(struct tdb_context *tdb,
+			       TDB_DATA key, TDB_DATA value,
+			       void *private_data)
+{
+	struct cleanup_read_state *state =
+		(struct cleanup_read_state *)private_data;
+	struct cleanup_rec rec;
+	bool ok;
+	int result;
+
+	ok = cleanup_rec_parse(value, &rec);
+	if (!ok) {
+		return -1;
+	}
+
+	result = state->fn(rec.pid, rec.unclean, state->private_data);
+	if (result != 0) {
+		return -1;
+	}
+
+	return 0;
+}
+
+int cleanupdb_traverse_read(int (*fn)(const pid_t pid,
+				      const bool cleanup,
+				      void *private_data),
+			    void *private_data)
+{
+	struct tdb_wrap *db;
+	struct cleanup_read_state state;
+	int result;
+
+	db = cleanup_db();
+	if (db == NULL) {
+		return -1;
+	}
+
+	state = (struct cleanup_read_state) {
+		.fn = fn,
+		.private_data = private_data
+	};
+
+	result = tdb_traverse_read(db->tdb, cleanup_traverse_fn, &state);
+	if (result < 0) {
+		DBG_ERR("tdb_traverse_read failed\n");
+		return -1;
+	}
+
+	return result;
+}
diff --git a/source3/lib/cleanupdb.h b/source3/lib/cleanupdb.h
new file mode 100644
index 0000000..71dd695
--- /dev/null
+++ b/source3/lib/cleanupdb.h
@@ -0,0 +1,30 @@
+/*
+   Unix SMB/CIFS implementation.
+   Implementation of reliable cleanup events
+   Copyright (C) Ralph Boehme 2016
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "system/filesys.h"
+#include "util_tdb.h"
+#include "lib/tdb_wrap/tdb_wrap.h"
+
+bool cleanupdb_store_child(const pid_t pid, const bool unclean);
+bool cleanupdb_delete_child(const pid_t pid);
+int cleanupdb_traverse_read(int (*fn)(const pid_t pid,
+				      const bool cleanup,
+				      void *private_data),
+			    void *private_data);
diff --git a/source3/wscript_build b/source3/wscript_build
index b9a2ee6..edf921c 100755
--- a/source3/wscript_build
+++ b/source3/wscript_build
@@ -345,7 +345,8 @@ bld.SAMBA3_SUBSYSTEM('samba3core',
                    lib/tevent_wait.c
                    lib/idmap_cache.c
                    lib/util_ea.c
-                   lib/background.c''',
+                   lib/background.c
+                   lib/cleanupdb.c''',
                    deps='''
                         samba3util
                         LIBTSOCKET
-- 
2.7.4


From a1c3400ac635f79086bd2aa8c0133edd27ae295f Mon Sep 17 00:00:00 2001
From: Ralph Boehme <slow at samba.org>
Date: Thu, 14 Jul 2016 16:31:44 +0200
Subject: [PATCH 2/4] s3/smbd: add cleanupd_init_send()/recv()

Previously, without this patch, if cleanupd died for whatever reason, it
would be restarted from smbd. However, if cleanupd initialization
failed and it exitted again, there would be no child entry in smbd for
it and it wouldn't be attempted to restart it again.

This patch adds async send/recv methods for starting cleanupd that will
reschedule restart attempt every second in case initilisation failed.

Bug: https://bugzilla.samba.org/show_bug.cgi?id=12022

Signed-off-by: Ralph Boehme <slow at samba.org>
---
 source3/smbd/server.c | 108 ++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 105 insertions(+), 3 deletions(-)

diff --git a/source3/smbd/server.c b/source3/smbd/server.c
index 9e3c652..a2ac77e 100644
--- a/source3/smbd/server.c
+++ b/source3/smbd/server.c
@@ -542,6 +542,89 @@ static void cleanupd_stopped(struct tevent_req *req)
 	DBG_WARNING("cleanupd stopped: %s\n", nt_errstr(status));
 }
 
+static void cleanupd_init_trigger(struct tevent_req *req);
+
+struct cleanup_init_state {
+	bool ok;
+	struct tevent_context *ev;
+	struct messaging_context *msg;
+	struct server_id *ppid;
+};
+
+static struct tevent_req *cleanupd_init_send(struct tevent_context *ev,
+					     TALLOC_CTX *mem_ctx,
+					     struct messaging_context *msg,
+					     struct server_id *ppid)
+{
+	struct tevent_req *req = NULL;
+	struct tevent_req *subreq = NULL;
+	struct cleanup_init_state *state = NULL;
+
+	req = tevent_req_create(mem_ctx, &state, struct cleanup_init_state);
+	if (req == NULL) {
+		return NULL;
+	}
+
+	*state = (struct cleanup_init_state) {
+		.msg = msg,
+		.ev = ev,
+		.ppid = ppid
+	};
+
+	subreq = tevent_wakeup_send(req, ev, tevent_timeval_current_ofs(0, 0));
+	if (tevent_req_nomem(subreq, req)) {
+		return tevent_req_post(req, ev);
+	}
+
+	tevent_req_set_callback(subreq, cleanupd_init_trigger, req);
+	return req;
+}
+
+static void cleanupd_init_trigger(struct tevent_req *subreq)
+{
+	struct tevent_req *req = tevent_req_callback_data(
+		subreq, struct tevent_req);
+	struct cleanup_init_state *state = tevent_req_data(
+		req, struct cleanup_init_state);
+	bool ok;
+
+	DBG_NOTICE("Triggering cleanupd startup\n");
+
+	ok = tevent_wakeup_recv(subreq);
+	TALLOC_FREE(subreq);
+	if (!ok) {
+		tevent_req_error(req, ENOMEM);
+		return;
+	}
+
+	state->ok = cleanupd_init(state->msg, false, state->ppid);
+	if (state->ok) {
+		DBG_WARNING("cleanupd restarted\n");
+		tevent_req_done(req);
+		return;
+	}
+
+	DBG_NOTICE("cleanupd startup failed, rescheduling\n");
+
+	subreq = tevent_wakeup_send(req, state->ev,
+				    tevent_timeval_current_ofs(1, 0));
+	if (tevent_req_nomem(subreq, req)) {
+		DBG_ERR("scheduling cleanupd restart failed, giving up\n");
+		return;
+	}
+
+	tevent_req_set_callback(subreq, cleanupd_init_trigger, req);
+	return;
+}
+
+static bool cleanupd_init_recv(struct tevent_req *req)
+{
+	struct cleanup_init_state *state = tevent_req_data(
+		req, struct cleanup_init_state);
+
+	return state->ok;
+}
+
 /*
   at most every smbd:cleanuptime seconds (default 20), we scan the BRL
   and locking database for entries to cleanup. As a side effect this
@@ -568,6 +651,18 @@ static void cleanup_timeout_fn(struct tevent_context *event_ctx,
 			   MSG_SMB_UNLOCK, NULL, 0);
 }
 
+static void cleanupd_started(struct tevent_req *req)
+{
+	bool ok;
+
+	ok = cleanupd_init_recv(req);
+	TALLOC_FREE(req);
+	if (!ok) {
+		DBG_ERR("Failed to restart cleanupd, giving up\n");
+		return;
+	}
+}
+
 static void remove_child_pid(struct smbd_parent_context *parent,
 			     pid_t pid,
 			     bool unclean_shutdown)
@@ -593,13 +688,20 @@ static void remove_child_pid(struct smbd_parent_context *parent,
 	}
 
 	if (pid == procid_to_pid(&parent->cleanupd)) {
-		bool ok;
+		struct tevent_req *req;
+
+		server_id_set_disconnected(&parent->cleanupd);
 
 		DBG_WARNING("Restarting cleanupd\n");
-		ok = cleanupd_init(parent->msg_ctx, false, &parent->cleanupd);
-		if (!ok) {
+		req = cleanupd_init_send(messaging_tevent_context(parent->msg_ctx),
+					 parent,
+					 parent->msg_ctx,
+					 &parent->cleanupd);
+		if (req == NULL) {
 			DBG_ERR("Failed to restart cleanupd\n");
+			return;
 		}
+		tevent_req_set_callback(req, cleanupd_started, parent);
 		return;
 	}
 
-- 
2.7.4


From 6546d928576abb24f4a9d18785c52fc2d4b0186c Mon Sep 17 00:00:00 2001
From: Ralph Boehme <slow at samba.org>
Date: Thu, 21 Jul 2016 19:08:47 +0200
Subject: [PATCH 3/4] s3/cleanupd: use smbd_cleanupd.tdb

Instead of using messaging to send individual cleanup events, it works
this way:

o parent smbd stores cleanup events (ie exitted children) in cleanup.tdb

o it sends cleanupd an empty MSG_SMB_NOTIFY_CLEANUP message

o cleanupd does a traverse on the smbd_cleanupd.tdb and collects all
  childs in a list

o after the traverse cleanupd walks the list and does the real work

It would have beeen possible to optimize for the common case by passing
info about exitted childs with the message (as was done before this
patch), adding a new message type for triggering a db traverse that
would be used when cleanupd had to be restarted and cleanup events may
have been accumulated in cleanup.tdb.

But this could be subject to subtle race conditions and could loose
events if cleanupd dies randomly.

Bug: https://bugzilla.samba.org/show_bug.cgi?id=12022

Signed-off-by: Ralph Boehme <slow at samba.org>
---
 source3/smbd/server.c        |  40 +++++++++++----
 source3/smbd/smbd_cleanupd.c | 120 +++++++++++++++++++++++++++++++++----------
 2 files changed, 121 insertions(+), 39 deletions(-)

diff --git a/source3/smbd/server.c b/source3/smbd/server.c
index a2ac77e..2e005e6 100644
--- a/source3/smbd/server.c
+++ b/source3/smbd/server.c
@@ -51,6 +51,7 @@
 #include "smbd/notifyd/notifyd.h"
 #include "smbd/smbd_cleanupd.h"
 #include "lib/util/sys_rw.h"
+#include "cleanupdb.h"
 
 #ifdef CLUSTER_SUPPORT
 #include "ctdb_protocol.h"
@@ -654,6 +655,9 @@ static void cleanup_timeout_fn(struct tevent_context *event_ctx,
 static void cleanupd_started(struct tevent_req *req)
 {
 	bool ok;
+	NTSTATUS status;
+	struct smbd_parent_context *parent = tevent_req_callback_data(
+		req, struct smbd_parent_context);
 
 	ok = cleanupd_init_recv(req);
 	TALLOC_FREE(req);
@@ -661,6 +665,15 @@ static void cleanupd_started(struct tevent_req *req)
 		DBG_ERR("Failed to restart cleanupd, giving up\n");
 		return;
 	}
+
+	status = messaging_send(parent->msg_ctx,
+				parent->cleanupd,
+				MSG_SMB_NOTIFY_CLEANUP,
+				&data_blob_null);
+	if (!NT_STATUS_IS_OK(status)) {
+		DBG_ERR("messaging_send returned %s\n",
+			nt_errstr(status));
+	}
 }
 
 static void remove_child_pid(struct smbd_parent_context *parent,
@@ -668,8 +681,8 @@ static void remove_child_pid(struct smbd_parent_context *parent,
 			     bool unclean_shutdown)
 {
 	struct smbd_child_pid *child;
-	struct iovec iov[2];
 	NTSTATUS status;
+	bool ok;
 
 	for (child = parent->children; child != NULL; child = child->next) {
 		if (child->pid == pid) {
@@ -706,8 +719,6 @@ static void remove_child_pid(struct smbd_parent_context *parent,
 	}
 
 	if (pid == procid_to_pid(&parent->notifyd)) {
-		bool ok;
-
 		DBG_WARNING("Restarting notifyd\n");
 		ok = smbd_notifyd_init(parent->msg_ctx, false,
 				       &parent->notifyd);
@@ -717,15 +728,22 @@ static void remove_child_pid(struct smbd_parent_context *parent,
 		return;
 	}
 
-	iov[0] = (struct iovec) { .iov_base = (uint8_t *)&pid,
-				  .iov_len = sizeof(pid) };
-	iov[1] = (struct iovec) { .iov_base = (uint8_t *)&unclean_shutdown,
-				  .iov_len = sizeof(bool) };
+	ok = cleanupdb_store_child(pid, unclean_shutdown);
+	if (!ok) {
+		DBG_ERR("cleanupdb_store_child failed\n");
+		return;
+	}
 
-	status = messaging_send_iov(parent->msg_ctx, parent->cleanupd,
-				    MSG_SMB_NOTIFY_CLEANUP,
-				    iov, ARRAY_SIZE(iov), NULL, 0);
-	DEBUG(10, ("messaging_send_iov returned %s\n", nt_errstr(status)));
+	if (!server_id_is_disconnected(&parent->cleanupd)) {
+		status = messaging_send(parent->msg_ctx,
+					parent->cleanupd,
+					MSG_SMB_NOTIFY_CLEANUP,
+					&data_blob_null);
+		if (!NT_STATUS_IS_OK(status)) {
+			DBG_ERR("messaging_send returned %s\n",
+				nt_errstr(status));
+		}
+	}
 
 	if (unclean_shutdown) {
 		/* a child terminated uncleanly so tickle all
diff --git a/source3/smbd/smbd_cleanupd.c b/source3/smbd/smbd_cleanupd.c
index c940ef1..0deb8b0 100644
--- a/source3/smbd/smbd_cleanupd.c
+++ b/source3/smbd/smbd_cleanupd.c
@@ -25,6 +25,7 @@
 #include "smbprofile.h"
 #include "serverid.h"
 #include "locking/proto.h"
+#include "cleanupdb.h"
 
 struct smbd_cleanupd_state {
 	pid_t parent_pid;
@@ -102,6 +103,39 @@ static void smbd_cleanupd_unlock(struct messaging_context *msg,
 	brl_revalidate(msg, private_data, msg_type, server_id, data);
 }
 
+struct cleanup_child {
+	struct cleanup_child *prev, *next;
+	pid_t pid;
+	bool unclean;
+};
+
+struct cleanupdb_traverse_state {
+	TALLOC_CTX *mem_ctx;
+	bool ok;
+	struct cleanup_child *childs;
+};
+
+static int cleanupdb_traverse_fn(const pid_t pid,
+				 const bool unclean,
+				 void *private_data)
+{
+	struct cleanupdb_traverse_state *cleanup_state =
+		(struct cleanupdb_traverse_state *)private_data;
+	struct cleanup_child *child = NULL;
+
+	child = talloc_zero(cleanup_state->mem_ctx, struct cleanup_child);
+	if (child == NULL) {
+		DBG_ERR("talloc_zero failed\n");
+		return -1;
+	}
+
+	child->pid = pid;
+	child->unclean = unclean;
+	DLIST_ADD(cleanup_state->childs, child);
+
+	return 0;
+}
+
 static void smbd_cleanupd_process_exited(struct messaging_context *msg,
 					 void *private_data, uint32_t msg_type,
 					 struct server_id server_id,
@@ -111,43 +145,73 @@ static void smbd_cleanupd_process_exited(struct messaging_context *msg,
 		private_data, struct tevent_req);
 	struct smbd_cleanupd_state *state = tevent_req_data(
 		req, struct smbd_cleanupd_state);
-	pid_t pid;
-	struct server_id child_id;
-	bool unclean_shutdown;
 	int ret;
+	struct cleanupdb_traverse_state cleanup_state;
+	TALLOC_CTX *frame = talloc_stackframe();
+	struct cleanup_child *child = NULL;
 
-	if (data->length != (sizeof(pid) + sizeof(unclean_shutdown))) {
-		DBG_WARNING("Got invalid length: %zu\n", data->length);
-		return;
-	}
-
-	memcpy(&pid, data->data, sizeof(pid));
-	memcpy(&unclean_shutdown, data->data + sizeof(pid),
-	       sizeof(unclean_shutdown));
-
-	DBG_DEBUG("%d exited %sclean\n", (int)pid,
-		  unclean_shutdown ? "un" : "");
+	cleanup_state = (struct cleanupdb_traverse_state) {
+		.mem_ctx = frame
+	};
 
 	/*
-	 * Get child_id before messaging_cleanup which wipes the
-	 * unique_id. Not that it really matters here for functionality (the
-	 * child should have properly cleaned up :-)) though, but it looks
-	 * nicer.
+	 * This merely collect childs in a list, whatever we're
+	 * supposed to cleanup for every child, it has to take place
+	 * *after* the db traverse in a list loop. This is to minimize
+	 * locking interaction between the traverse and writers (ie
+	 * the parent smbd).
 	 */
-	child_id = pid_to_procid(pid);
-
-	smbprofile_cleanup(pid, state->parent_pid);
-
-	ret = messaging_cleanup(msg, pid);
+	ret = cleanupdb_traverse_read(cleanupdb_traverse_fn, &cleanup_state);
+	if (ret < 0) {
+		DBG_ERR("cleanupdb_traverse_read failed\n");
+		TALLOC_FREE(frame);
+		return;
+	}
 
-	if ((ret != 0) && (ret != ENOENT)) {
-		DBG_DEBUG("messaging_cleanup returned %s\n", strerror(ret));
+	if (ret == 0) {
+		DBG_ERR("got 0 cleanup events, expected at least 1\n");
+		TALLOC_FREE(frame);
+		return;
 	}
 
-	if (!serverid_deregister(child_id)) {
-		DEBUG(1, ("Could not remove pid %d from serverid.tdb\n",
-			  (int)pid));
+	for (child = cleanup_state.childs;
+	     child != NULL;
+	     child = child->next)
+	{
+		struct server_id child_id;
+		bool ok;
+
+		ok = cleanupdb_delete_child(child->pid);
+		if (!ok) {
+			DBG_ERR("failed to delete pid %d\n", (int)child->pid);
+		}
+
+		/*
+		 * Get child_id before messaging_cleanup which wipes
+		 * the unique_id. Not that it really matters here for
+		 * functionality (the child should have properly
+		 * cleaned up :-)) though, but it looks nicer.
+		 */
+		child_id = pid_to_procid(child->pid);
+
+		smbprofile_cleanup(child->pid, state->parent_pid);
+
+		ret = messaging_cleanup(msg, child->pid);
+
+		if ((ret != 0) && (ret != ENOENT)) {
+			DBG_DEBUG("messaging_cleanup returned %s\n",
+				  strerror(ret));
+		}
+
+		if (!serverid_deregister(child_id)) {
+			DBG_ERR("Could not remove pid %d from serverid.tdb\n",
+				(int)child->pid);
+		}
+
+		DBG_DEBUG("cleaned up pid %d\n", (int)child->pid);
 	}
+
+	TALLOC_FREE(frame);
 }
 
 NTSTATUS smbd_cleanupd_recv(struct tevent_req *req)
-- 
2.7.4


From b04c33dbf4d34cfa1c3cfc915bc820370637ea57 Mon Sep 17 00:00:00 2001
From: Ralph Boehme <slow at samba.org>
Date: Fri, 22 Jul 2016 08:29:13 +0200
Subject: [PATCH 4/4] s3/notifyd: add async send/recv functions

Previously, without this patch, if notifyd died for whatever reason, it
would be restarted from smbd. However, if its initialization failed and
it exitted again, there would be no child entry in smbd for it and it
wouldn't be attempted to restart it again.

This patch adds async send/recv methods for starting cleanupd that will
reschedule restart attempt every second in case initilisation failed.

Bug: https://bugzilla.samba.org/show_bug.cgi?id=12022

Signed-off-by: Ralph Boehme <slow at samba.org>
---
 source3/smbd/server.c | 111 ++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 108 insertions(+), 3 deletions(-)

diff --git a/source3/smbd/server.c b/source3/smbd/server.c
index 2e005e6..97c0fdc 100644
--- a/source3/smbd/server.c
+++ b/source3/smbd/server.c
@@ -427,6 +427,101 @@ static bool smbd_notifyd_init(struct messaging_context *msg, bool interactive,
 	return tevent_req_poll(req, ev);
 }
 
+static void notifyd_init_trigger(struct tevent_req *req);
+
+struct notifyd_init_state {
+	bool ok;
+	struct tevent_context *ev;
+	struct messaging_context *msg;
+	struct server_id *ppid;
+};
+
+static struct tevent_req *notifyd_init_send(struct tevent_context *ev,
+					    TALLOC_CTX *mem_ctx,
+					    struct messaging_context *msg,
+					    struct server_id *ppid)
+{
+	struct tevent_req *req = NULL;
+	struct tevent_req *subreq = NULL;
+	struct notifyd_init_state *state = NULL;
+
+	req = tevent_req_create(mem_ctx, &state, struct notifyd_init_state);
+	if (req == NULL) {
+		return NULL;
+	}
+
+	*state = (struct notifyd_init_state) {
+		.msg = msg,
+		.ev = ev,
+		.ppid = ppid
+	};
+
+	subreq = tevent_wakeup_send(req, ev, tevent_timeval_current_ofs(1, 0));
+	if (tevent_req_nomem(subreq, req)) {
+		return tevent_req_post(req, ev);
+	}
+
+	tevent_req_set_callback(subreq, notifyd_init_trigger, req);
+	return req;
+}
+
+static void notifyd_init_trigger(struct tevent_req *subreq)
+{
+	struct tevent_req *req = tevent_req_callback_data(
+		subreq, struct tevent_req);
+	struct notifyd_init_state *state = tevent_req_data(
+		req, struct notifyd_init_state);
+	bool ok;
+
+	DBG_NOTICE("Triggering notifyd startup\n");
+
+	ok = tevent_wakeup_recv(subreq);
+	TALLOC_FREE(subreq);
+	if (!ok) {
+		tevent_req_error(req, ENOMEM);
+		return;
+	}
+
+	state->ok = smbd_notifyd_init(state->msg, false, state->ppid);
+	if (state->ok) {
+		DBG_WARNING("notifyd restarted\n");
+		tevent_req_done(req);
+		return;
+	}
+
+	DBG_NOTICE("notifyd startup failed, rescheduling\n");
+
+	subreq = tevent_wakeup_send(req, state->ev,
+				    tevent_timeval_current_ofs(1, 0));
+	if (tevent_req_nomem(subreq, req)) {
+		DBG_ERR("scheduling notifyd restart failed, giving up\n");
+		return;
+	}
+
+	tevent_req_set_callback(subreq, notifyd_init_trigger, req);
+	return;
+}
+
+static bool notifyd_init_recv(struct tevent_req *req)
+{
+	struct notifyd_init_state *state = tevent_req_data(
+		req, struct notifyd_init_state);
+
+	return state->ok;
+}
+
+static void notifyd_started(struct tevent_req *req)
+{
+	bool ok;
+
+	ok = notifyd_init_recv(req);
+	TALLOC_FREE(req);
+	if (!ok) {
+		DBG_ERR("Failed to restart notifyd, giving up\n");
+		return;
+	}
+}
+
 static void cleanupd_stopped(struct tevent_req *req);
 
 static bool cleanupd_init(struct messaging_context *msg, bool interactive,
@@ -719,12 +814,22 @@ static void remove_child_pid(struct smbd_parent_context *parent,
 	}
 
 	if (pid == procid_to_pid(&parent->notifyd)) {
+		struct tevent_req *req;
+		struct tevent_context *ev = messaging_tevent_context(
+			parent->msg_ctx);
+
+		server_id_set_disconnected(&parent->notifyd);
+
 		DBG_WARNING("Restarting notifyd\n");
-		ok = smbd_notifyd_init(parent->msg_ctx, false,
-				       &parent->notifyd);
-		if (!ok) {
+		req = notifyd_init_send(ev,
+					parent,
+					parent->msg_ctx,
+					&parent->notifyd);
+		if (req == NULL) {
 			DBG_ERR("Failed to restart notifyd\n");
+			return;
 		}
+		tevent_req_set_callback(req, notifyd_started, parent);
 		return;
 	}
 
-- 
2.7.4



More information about the samba-technical mailing list