[SCM] CTDB repository - branch 1.13 updated - ctdb-1.42-1-gb550f46

Ronnie Sahlberg sahlberg at samba.org
Wed May 2 22:22:13 MDT 2012


The branch, 1.13 has been updated
       via  b550f4644ea81b7010d401e46cf803b13f16e3c1 (commit)
      from  16282b5b2b8fb14454ab61a7ff7c651528a4a117 (commit)

http://gitweb.samba.org/?p=ctdb.git;a=shortlog;h=1.13


- Log -----------------------------------------------------------------
commit b550f4644ea81b7010d401e46cf803b13f16e3c1
Author: Ronnie Sahlberg <ronniesahlberg at gmail.com>
Date:   Thu May 3 11:42:41 2012 +1000

    Track all child process so we never send a signal to an unrelated process (our child died  and kernel wrapped the pid-space and reused the pid for a different process
    
    Wrap all creation of child processes inside ctdb_fork() which is used to track all processes we have spawned.
    Capture SIGCHLD to track also which child processes have terminated.
    
    Wrap kill() inside ctdb_kill() and make sure that we never send a !0 signal to a child process pid that has already terminated (and might have been replaced with a

-----------------------------------------------------------------------

Summary of changes:
 Makefile.in                 |    2 +-
 common/ctdb_fork.c          |  124 +++++++++++++++++++++++++++++++++++++++++++
 common/ctdb_util.c          |   18 ------
 include/ctdb_private.h      |    5 ++
 server/ctdb_call.c          |    4 +-
 server/ctdb_daemon.c        |   39 ++-----------
 server/ctdb_freeze.c        |    6 +-
 server/ctdb_lockwait.c      |    4 +-
 server/ctdb_recover.c       |    6 +-
 server/ctdb_recoverd.c      |   12 ++--
 server/ctdb_takeover.c      |    6 +-
 server/ctdb_traverse.c      |    2 +-
 server/ctdb_update_record.c |    6 +-
 server/ctdb_vacuum.c        |    2 +-
 server/eventscript.c        |    6 +-
 tests/src/ctdb_test.c       |    1 +
 tests/src/ctdbd_test.c      |    1 +
 17 files changed, 165 insertions(+), 79 deletions(-)
 create mode 100644 common/ctdb_fork.c


Changeset truncated at 500 lines:

diff --git a/Makefile.in b/Makefile.in
index 1794be5..26eeac8 100755
--- a/Makefile.in
+++ b/Makefile.in
@@ -68,7 +68,7 @@ UTIL_OBJ = lib/util/idtree.o lib/util/db_wrap.o lib/util/strlist.o lib/util/util
 CTDB_COMMON_OBJ =  common/ctdb_io.o common/ctdb_util.o \
 	common/ctdb_ltdb.o common/ctdb_message.o common/cmdline.o  \
 	lib/util/debug.o common/rb_tree.o @CTDB_SYSTEM_OBJ@ common/system_common.o \
-	common/ctdb_logging.c
+	common/ctdb_logging.c common/ctdb_fork.o
 
 CTDB_LIB_OBJ = libctdb/ctdb.o libctdb/io_elem.o libctdb/local_tdb.o \
 	libctdb/messages.o libctdb/sync.o libctdb/control.o \
diff --git a/common/ctdb_fork.c b/common/ctdb_fork.c
new file mode 100644
index 0000000..81055c5
--- /dev/null
+++ b/common/ctdb_fork.c
@@ -0,0 +1,124 @@
+/* 
+   functions to track and manage processes
+
+   Copyright (C) Ronnie Sahlberg 2012
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "system/wait.h"
+#include "../include/ctdb_client.h"
+#include "../include/ctdb_private.h"
+#include "../common/rb_tree.h"
+
+/*
+ * This function forks a child process and drops the realtime 
+ * scheduler for the child process.
+ */
+pid_t ctdb_fork(struct ctdb_context *ctdb)
+{
+	pid_t pid;
+	char *process;
+
+	pid = fork();
+	if (pid == -1) {
+		return -1;
+	}
+	if (pid == 0) {
+		if (ctdb->do_setsched) {
+			ctdb_restore_scheduler(ctdb);
+		}
+		ctdb->can_send_controls = false;
+		return 0;
+	}
+
+	if (getpid() != ctdb->ctdbd_pid) {
+		return pid;
+	}
+
+	process = talloc_asprintf(ctdb->child_processes, "process:%d", (int)pid);
+	trbt_insert32(ctdb->child_processes, pid, process);
+
+	return pid;
+}
+
+
+
+static void ctdb_sigchld_handler(struct tevent_context *ev,
+	struct tevent_signal *te, int signum, int count,
+	void *dont_care, 
+	void *private_data)
+{
+	struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
+	int status;
+	pid_t pid = -1;
+
+	while (pid != 0) {
+		pid = waitpid(-1, &status, WNOHANG);
+		if (pid == -1) {
+			DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%d\n", errno));
+			return;
+		}
+		if (pid > 0) {
+			char *process;
+
+			if (getpid() != ctdb->ctdbd_pid) {
+				continue;
+			}
+
+			process = trbt_lookup32(ctdb->child_processes, pid);
+			if (process == NULL) {
+				DEBUG(DEBUG_ERR,("Got SIGCHLD from pid:%d we didn not spawn with ctdb_fork\n", pid));
+			}
+
+			DEBUG(DEBUG_DEBUG, ("SIGCHLD from %d %s\n", (int)pid, process));
+			talloc_free(process);
+		}
+	}
+}
+
+
+struct tevent_signal *
+ctdb_init_sigchld(struct ctdb_context *ctdb)
+{
+	struct tevent_signal *se;
+
+	ctdb->child_processes = trbt_create(ctdb, 0);
+
+	se = tevent_add_signal(ctdb->ev, ctdb, SIGCHLD, 0, ctdb_sigchld_handler, ctdb);
+	return se;
+}
+
+int
+ctdb_kill(struct ctdb_context *ctdb, pid_t pid, int signum)
+{
+	char *process;
+
+	if (signum == 0) {
+		return kill(pid, signum);
+	}
+
+	if (getpid() != ctdb->ctdbd_pid) {
+		return kill(pid, signum);
+	}
+
+	process = trbt_lookup32(ctdb->child_processes, pid);
+	if (process == NULL) {
+		DEBUG(DEBUG_ERR,("ctdb_kill: trying to kill(%d, %d) a process that does not exist\n", pid, signum));
+		return 0;
+	}
+
+	return kill(pid, signum);
+}
diff --git a/common/ctdb_util.c b/common/ctdb_util.c
index 5584c17..b9af95b 100644
--- a/common/ctdb_util.c
+++ b/common/ctdb_util.c
@@ -332,24 +332,6 @@ void ctdb_restore_scheduler(struct ctdb_context *ctdb)
 #endif
 }
 
-/*
- * This function forks a child process and drops the realtime 
- * scheduler for the child process.
- */
-pid_t ctdb_fork(struct ctdb_context *ctdb)
-{
-	pid_t pid;
-
-	pid = fork();
-	if (pid == 0) {
-		if (ctdb->do_setsched) {
-			ctdb_restore_scheduler(ctdb);
-		}
-		ctdb->can_send_controls = false;
-	}
-	return pid;
-}
-
 void set_nonblocking(int fd)
 {
 	unsigned v;
diff --git a/include/ctdb_private.h b/include/ctdb_private.h
index d973fcc..086e427 100644
--- a/include/ctdb_private.h
+++ b/include/ctdb_private.h
@@ -501,6 +501,7 @@ struct ctdb_context {
 	struct ctdb_reloadips_handle *reload_ips;
 
 	const char *public_addresses_file;
+	struct trbt_tree *child_processes; 
 };
 
 struct ctdb_db_context {
@@ -1031,7 +1032,11 @@ void ctdb_node_connected(struct ctdb_node *node);
 bool ctdb_blocking_freeze(struct ctdb_context *ctdb);
 void ctdb_set_scheduler(struct ctdb_context *ctdb);
 void ctdb_restore_scheduler(struct ctdb_context *ctdb);
+
+struct tevent_signal *ctdb_init_sigchld(struct ctdb_context *ctdb);
 pid_t ctdb_fork(struct ctdb_context *ctdb);
+int ctdb_kill(struct ctdb_context *ctdb, pid_t pid, int signum);
+
 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb, 
 				 struct ctdb_req_control *c,
 				 TDB_DATA indata, 
diff --git a/server/ctdb_call.c b/server/ctdb_call.c
index 08b701c..fe7e947 100644
--- a/server/ctdb_call.c
+++ b/server/ctdb_call.c
@@ -1388,7 +1388,7 @@ static int revokechild_destructor(struct revokechild_handle *rc)
 	if (rc->fd[1] != -1) {
 		close(rc->fd[1]);
 	}
-	kill(rc->child, SIGKILL);
+	ctdb_kill(rc->ctdb, rc->child, SIGKILL);
 
 	DLIST_REMOVE(rc->ctdb_db->revokechild_active, rc);
 	return 0;
@@ -1617,7 +1617,7 @@ int ctdb_start_revoke_ro_record(struct ctdb_context *ctdb, struct ctdb_db_contex
 child_finished:
 		write(rc->fd[1], &c, 1);
 		/* make sure we die when our parent dies */
-		while (kill(parent, 0) == 0 || errno != ESRCH) {
+		while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
 			sleep(5);
 		}
 		_exit(0);
diff --git a/server/ctdb_daemon.c b/server/ctdb_daemon.c
index f946ac6..899b3bd 100644
--- a/server/ctdb_daemon.c
+++ b/server/ctdb_daemon.c
@@ -1029,27 +1029,6 @@ failed:
 	return -1;	
 }
 
-static void sig_child_handler(struct event_context *ev,
-	struct signal_event *se, int signum, int count,
-	void *dont_care, 
-	void *private_data)
-{
-//	struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
-	int status;
-	pid_t pid = -1;
-
-	while (pid != 0) {
-		pid = waitpid(-1, &status, WNOHANG);
-		if (pid == -1) {
-			DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%d\n", errno));
-			return;
-		}
-		if (pid > 0) {
-			DEBUG(DEBUG_DEBUG, ("SIGCHLD from %d\n", (int)pid));
-		}
-	}
-}
-
 static void ctdb_setup_event_callback(struct ctdb_context *ctdb, int status,
 				      void *private_data)
 {
@@ -1074,7 +1053,6 @@ int ctdb_start_daemon(struct ctdb_context *ctdb, bool do_fork, bool use_syslog,
 	int res, ret = -1;
 	struct fd_event *fde;
 	const char *domain_socket_name;
-	struct signal_event *se;
 
 	/* get rid of any old sockets */
 	unlink(ctdb->daemon.name);
@@ -1105,7 +1083,6 @@ int ctdb_start_daemon(struct ctdb_context *ctdb, bool do_fork, bool use_syslog,
 	ctdbd_pid = getpid();
 	ctdb->ctdbd_pid = ctdbd_pid;
 
-
 	DEBUG(DEBUG_ERR, ("Starting CTDBD as pid : %u\n", ctdbd_pid));
 
 	if (ctdb->do_setsched) {
@@ -1128,6 +1105,12 @@ int ctdb_start_daemon(struct ctdb_context *ctdb, bool do_fork, bool use_syslog,
 		exit(1);
 	}
 
+	/* set up a handler to pick up sigchld */
+	if (ctdb_init_sigchld(ctdb) == NULL) {
+		DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD\n"));
+		exit(1);
+	}
+
 	ctdb_set_child_logging(ctdb);
 
 	/* initialize statistics collection */
@@ -1203,16 +1186,6 @@ int ctdb_start_daemon(struct ctdb_context *ctdb, bool do_fork, bool use_syslog,
 	/* start the transport going */
 	ctdb_start_transport(ctdb);
 
-	/* set up a handler to pick up sigchld */
-	se = event_add_signal(ctdb->ev, ctdb,
-				     SIGCHLD, 0,
-				     sig_child_handler,
-				     ctdb);
-	if (se == NULL) {
-		DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD\n"));
-		exit(1);
-	}
-
 	ret = ctdb_event_script_callback(ctdb,
 					 ctdb,
 					 ctdb_setup_event_callback,
diff --git a/server/ctdb_freeze.c b/server/ctdb_freeze.c
index 7233317..56dc19c 100644
--- a/server/ctdb_freeze.c
+++ b/server/ctdb_freeze.c
@@ -122,7 +122,7 @@ static int ctdb_freeze_handle_destructor(struct ctdb_freeze_handle *h)
 	ctdb->freeze_mode[h->priority]    = CTDB_FREEZE_NONE;
 	ctdb->freeze_handles[h->priority] = NULL;
 
-	kill(h->child, SIGKILL);
+	ctdb_kill(h->ctdb, h->child, SIGKILL);
 	return 0;
 }
 
@@ -189,7 +189,7 @@ static struct ctdb_freeze_handle *ctdb_freeze_lock(struct ctdb_context *ctdb, ui
 		return NULL;
 	}
 	
-	h->child = fork();
+	h->child = ctdb_fork(ctdb);
 	if (h->child == -1) {
 		DEBUG(DEBUG_ERR,("Failed to fork child for ctdb_freeze_lock\n"));
 		talloc_free(h);
@@ -216,7 +216,7 @@ static struct ctdb_freeze_handle *ctdb_freeze_lock(struct ctdb_context *ctdb, ui
 
 		while (1) {
 			sleep(1);
-			if (kill(ctdb->ctdbd_pid, 0) != 0) {
+			if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) {
 				DEBUG(DEBUG_ERR,("Parent died. Exiting lock wait child\n"));
 
 				_exit(0);
diff --git a/server/ctdb_lockwait.c b/server/ctdb_lockwait.c
index 35f5902..2339dc8 100644
--- a/server/ctdb_lockwait.c
+++ b/server/ctdb_lockwait.c
@@ -72,7 +72,7 @@ static void do_overflow(struct ctdb_db_context *ctdb_db,
 static int lockwait_destructor(struct lockwait_handle *h)
 {
 	CTDB_DECREMENT_STAT(h->ctdb, pending_lockwait_calls);
-	kill(h->child, SIGKILL);
+	ctdb_kill(h->ctdb, h->child, SIGKILL);
 	h->ctdb_db->pending_requests--;
 	DLIST_REMOVE(h->ctdb_db->lockwait_active, h);
 	return 0;
@@ -202,7 +202,7 @@ struct lockwait_handle *ctdb_lockwait(struct ctdb_db_context *ctdb_db,
 		tdb_chainlock(ctdb_db->ltdb->tdb, key);
 		write(result->fd[1], &c, 1);
 		/* make sure we die when our parent dies */
-		while (kill(parent, 0) == 0 || errno != ESRCH) {
+		while (ctdb_kill(ctdb_db->ctdb, parent, 0) == 0 || errno != ESRCH) {
 			sleep(5);
 		}
 		_exit(0);
diff --git a/server/ctdb_recover.c b/server/ctdb_recover.c
index 937a2b8..15e9f03 100644
--- a/server/ctdb_recover.c
+++ b/server/ctdb_recover.c
@@ -626,7 +626,7 @@ static int set_recmode_destructor(struct ctdb_set_recmode_state *state)
 	if (state->fd[1] != -1) {
 		state->fd[1] = -1;
 	}
-	kill(state->child, SIGKILL);
+	ctdb_kill(state->ctdb, state->child, SIGKILL);
 	return 0;
 }
 
@@ -778,7 +778,7 @@ int32_t ctdb_control_set_recmode(struct ctdb_context *ctdb,
 		return -1;
 	}
 
-	state->child = fork();
+	state->child = ctdb_fork(ctdb);
 	if (state->child == (pid_t)-1) {
 		close(state->fd[0]);
 		close(state->fd[1]);
@@ -801,7 +801,7 @@ int32_t ctdb_control_set_recmode(struct ctdb_context *ctdb,
 
 		write(state->fd[1], &cc, 1);
 		/* make sure we die when our parent dies */
-		while (kill(parent, 0) == 0 || errno != ESRCH) {
+		while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
 			sleep(5);
 			write(state->fd[1], &cc, 1);
 		}
diff --git a/server/ctdb_recoverd.c b/server/ctdb_recoverd.c
index 1108257..4cbbece 100644
--- a/server/ctdb_recoverd.c
+++ b/server/ctdb_recoverd.c
@@ -2949,7 +2949,7 @@ static int check_reclock_destructor(struct ctdb_check_reclock_state *state)
 		close(state->fd[1]);
 		state->fd[1] = -1;
 	}
-	kill(state->child, SIGKILL);
+	ctdb_kill(ctdb, state->child, SIGKILL);
 	return 0;
 }
 
@@ -3047,7 +3047,7 @@ static int check_recovery_lock(struct ctdb_context *ctdb)
 
 		write(state->fd[1], &cc, 1);
 		/* make sure we die when our parent dies */
-		while (kill(parent, 0) == 0 || errno != ESRCH) {
+		while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
 			sleep(5);
 			write(state->fd[1], &cc, 1);
 		}
@@ -3166,7 +3166,7 @@ static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
 
 
 	/* verify that the main daemon is still running */
-	if (kill(ctdb->ctdbd_pid, 0) != 0) {
+	if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) {
 		DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
 		exit(-1);
 	}
@@ -3809,7 +3809,7 @@ static void ctdb_check_recd(struct event_context *ev, struct timed_event *te,
 {
 	struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
 
-	if (kill(ctdb->recoverd_pid, 0) != 0) {
+	if (ctdb_kill(ctdb, ctdb->recoverd_pid, 0) != 0) {
 		DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));
 
 		event_add_timed(ctdb->ev, ctdb, timeval_zero(), 
@@ -3861,7 +3861,7 @@ int ctdb_start_recoverd(struct ctdb_context *ctdb)
 
 	ctdb->ctdbd_pid = getpid();
 
-	ctdb->recoverd_pid = fork();
+	ctdb->recoverd_pid = ctdb_fork(ctdb);
 	if (ctdb->recoverd_pid == -1) {
 		return -1;
 	}
@@ -3915,7 +3915,7 @@ void ctdb_stop_recoverd(struct ctdb_context *ctdb)
 	}
 
 	DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
-	kill(ctdb->recoverd_pid, SIGTERM);
+	ctdb_kill(ctdb, ctdb->recoverd_pid, SIGTERM);
 }
 
 static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te, 
diff --git a/server/ctdb_takeover.c b/server/ctdb_takeover.c
index 3668305..fe73997 100644
--- a/server/ctdb_takeover.c
+++ b/server/ctdb_takeover.c
@@ -730,7 +730,7 @@ static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr
 					(unsigned)client->pid,
 					ctdb_addr_to_str(addr),
 					ip->client_id));
-				kill(client->pid, SIGKILL);
+				ctdb_kill(ctdb, client->pid, SIGKILL);
 			}
 		}
 	}
@@ -3656,7 +3656,7 @@ static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
 		ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
 		h->c = NULL;
 	}
-	kill(h->child, SIGKILL);
+	ctdb_kill(h->ctdb, h->child, SIGKILL);
 	return 0;
 }
 
@@ -3831,7 +3831,7 @@ int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_re
 
 		write(h->fd[1], &res, 1);
 		/* make sure we die when our parent dies */
-		while (kill(parent, 0) == 0 || errno != ESRCH) {
+		while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
 			sleep(5);
 		}
 		_exit(0);
diff --git a/server/ctdb_traverse.c b/server/ctdb_traverse.c
index d739934..54ee70f 100644
--- a/server/ctdb_traverse.c
+++ b/server/ctdb_traverse.c
@@ -78,7 +78,7 @@ static void ctdb_traverse_local_handler(uint8_t *rawdata, size_t length, void *p
 static int traverse_local_destructor(struct ctdb_traverse_local_handle *h)
 {
 	DLIST_REMOVE(h->ctdb_db->traverse, h);
-	kill(h->child, SIGKILL);
+	ctdb_kill(h->ctdb_db->ctdb, h->child, SIGKILL);
 	return 0;
 }
 
diff --git a/server/ctdb_update_record.c b/server/ctdb_update_record.c
index 92b304c..1543b46 100644
--- a/server/ctdb_update_record.c
+++ b/server/ctdb_update_record.c
@@ -162,7 +162,7 @@ struct childwrite_handle {
 static int childwrite_destructor(struct childwrite_handle *h)
 {
 	CTDB_DECREMENT_STAT(h->ctdb, pending_childwrite_calls);
-	kill(h->child, SIGKILL);
+	ctdb_kill(h->ctdb, h->child, SIGKILL);
 	return 0;
 }
 
@@ -199,7 +199,7 @@ static void childwrite_handler(struct event_context *ev, struct fd_event *fde,
 
 	callback(c, p);
 
-	kill(child, SIGKILL);


-- 
CTDB repository


More information about the samba-cvs mailing list