[SCM] CTDB repository - branch master updated - ctdb-1.0.84-17-gfc8a364

Fri Jun 19 05:12:03 GMT 2009

The branch, master has been updated
       via  fc8a364eb095ec11ca01246a583bf1dc53510141 (commit)
       via  268c3e4b269a92741a02280c84384178e73de10e (commit)
       via  d177b08f1dc79534491f27726b05405d47e12e20 (commit)
       via  6d1e4321b63973c2e53c63d386e8cc0bd9605cae (commit)
      from  facddcacb4a961cddb117818fa38a3e97770b2fa (commit)

http://gitweb.samba.org/?p=sahlberg/ctdb.git;a=shortlog;h=master


- Log -----------------------------------------------------------------
commit fc8a364eb095ec11ca01246a583bf1dc53510141
Author: Ronnie Sahlberg <ronniesahlberg at gmail.com>
Date:   Fri Jun 19 14:58:06 2009 +1000

    Don't leak file descriptors when set recmode times out

commit 268c3e4b269a92741a02280c84384178e73de10e
Author: Ronnie Sahlberg <ronniesahlberg at gmail.com>
Date:   Fri Jun 19 14:54:22 2009 +1000

    Don't leak file descriptors

commit d177b08f1dc79534491f27726b05405d47e12e20
Author: Ronnie Sahlberg <ronniesahlberg at gmail.com>
Date:   Fri Jun 19 14:44:26 2009 +1000

    In the recovery daemon, use a child process to check that the recovery master can access the recovery lock file and to verify that the lock is not stale.
    This allows us to time out the operation if the underlying filesystem has become temporarily unresponsive, without causing a new recovery.

commit 6d1e4321b63973c2e53c63d386e8cc0bd9605cae
Author: Ronnie Sahlberg <ronniesahlberg at gmail.com>
Date:   Fri Jun 19 13:09:11 2009 +1000

    reduce the timeout we wait for the reclock child process to finish to 5 seconds
    before we log an error and abort

-----------------------------------------------------------------------

Summary of changes:
 server/ctdb_recover.c  |   14 +++-
 server/ctdb_recoverd.c |  185 ++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 185 insertions(+), 14 deletions(-)


Changeset truncated at 500 lines:

diff --git a/server/ctdb_recover.c b/server/ctdb_recover.c
index 7953c6b..526a310 100644
--- a/server/ctdb_recover.c
+++ b/server/ctdb_recover.c
@@ -531,7 +531,13 @@ static int set_recmode_destructor(struct ctdb_set_recmode_state *state)
 	double l = timeval_elapsed(&state->start_time);
 
 	ctdb_reclock_latency(state->ctdb, "daemon reclock", &state->ctdb->statistics.reclock.ctdbd, l);
-		
+
+	if (state->fd[0] != -1) {
+		state->fd[0] = -1;
+	}
+	if (state->fd[1] != -1) {
+		state->fd[1] = -1;
+	}
 	kill(state->child, SIGKILL);
 	return 0;
 }
@@ -645,6 +651,8 @@ int32_t ctdb_control_set_recmode(struct ctdb_context *ctdb,
 	CTDB_NO_MEMORY(ctdb, state);
 
 	state->start_time = timeval_current();
+	state->fd[0] = -1;
+	state->fd[1] = -1;
 
 	if (ctdb->tunable.verify_recovery_lock == 0) {
 		/* dont need to verify the reclock file */
@@ -693,16 +701,18 @@ int32_t ctdb_control_set_recmode(struct ctdb_context *ctdb,
 		_exit(0);
 	}
 	close(state->fd[1]);
+	state->fd[1] = -1;
 
 	talloc_set_destructor(state, set_recmode_destructor);
 
-	state->te = event_add_timed(ctdb->ev, state, timeval_current_ofs(15, 0),
+	state->te = event_add_timed(ctdb->ev, state, timeval_current_ofs(5, 0),
 				    ctdb_set_recmode_timeout, state);
 
 	state->fde = event_add_fd(ctdb->ev, state, state->fd[0],
 				EVENT_FD_READ|EVENT_FD_AUTOCLOSE,
 				set_recmode_handler,
 				(void *)state);
+
 	if (state->fde == NULL) {
 		talloc_free(state);
 		return -1;
diff --git a/server/ctdb_recoverd.c b/server/ctdb_recoverd.c
index 6b2fb5e..07f3f0d 100644
--- a/server/ctdb_recoverd.c
+++ b/server/ctdb_recoverd.c
@@ -2331,6 +2331,176 @@ static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
 	return 0;
 }
 
+enum reclock_child_status { RECLOCK_CHECKING, RECLOCK_OK, RECLOCK_FAILED, RECLOCK_TIMEOUT};
+struct ctdb_check_reclock_state {
+	struct ctdb_context *ctdb;
+	struct timeval start_time;
+	int fd[2];
+	pid_t child;
+	struct timed_event *te;
+	struct fd_event *fde;
+	enum reclock_child_status status;
+};
+
+/* when we free the reclock state we must kill any child process.
+*/
+static int check_reclock_destructor(struct ctdb_check_reclock_state *state)
+{
+	struct ctdb_context *ctdb = state->ctdb;
+
+	ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&state->start_time));
+
+	if (state->fd[0] != -1) {
+		close(state->fd[0]);
+		state->fd[0] = -1;
+	}
+	if (state->fd[1] != -1) {
+		close(state->fd[1]);
+		state->fd[1] = -1;
+	}
+	kill(state->child, SIGKILL);
+	return 0;
+}
+
+/*
+  called if our check_reclock child times out. this would happen if
+  i/o to the reclock file blocks.
+ */
+static void ctdb_check_reclock_timeout(struct event_context *ev, struct timed_event *te, 
+					 struct timeval t, void *private_data)
+{
+	struct ctdb_check_reclock_state *state = talloc_get_type(private_data, 
+					   struct ctdb_check_reclock_state);
+
+	DEBUG(DEBUG_ERR,(__location__ " check_reclock child process hung/timedout CFS slow to grant locks?\n"));
+	state->status = RECLOCK_TIMEOUT;
+}
+
+/* this is called when the child process has completed checking the reclock
+   file and has written data back to us through the pipe.
+*/
+static void reclock_child_handler(struct event_context *ev, struct fd_event *fde, 
+			     uint16_t flags, void *private_data)
+{
+	struct ctdb_check_reclock_state *state= talloc_get_type(private_data, 
+					     struct ctdb_check_reclock_state);
+	char c = 0;
+	int ret;
+
+	/* we got a response from our child process so we can abort the
+	   timeout.
+	*/
+	talloc_free(state->te);
+	state->te = NULL;
+
+	ret = read(state->fd[0], &c, 1);
+	if (ret != 1 || c != RECLOCK_OK) {
+		DEBUG(DEBUG_ERR,(__location__ " reclock child process returned error %d\n", c));
+		state->status = RECLOCK_FAILED;
+
+		return;
+	}
+
+	state->status = RECLOCK_OK;
+	return;
+}
+
+static int check_recovery_lock(struct ctdb_context *ctdb)
+{
+	int ret;
+	struct ctdb_check_reclock_state *state;
+	pid_t parent = getpid();
+
+	if (ctdb->recovery_lock_fd == -1) {
+		DEBUG(DEBUG_CRIT,("recovery master doesn't have the recovery lock\n"));
+		return -1;
+	}
+
+	state = talloc(ctdb, struct ctdb_check_reclock_state);
+	CTDB_NO_MEMORY(ctdb, state);
+
+	state->ctdb = ctdb;
+	state->start_time = timeval_current();
+	state->status = RECLOCK_CHECKING;
+	state->fd[0] = -1;
+	state->fd[1] = -1;
+
+	ret = pipe(state->fd);
+	if (ret != 0) {
+		talloc_free(state);
+		DEBUG(DEBUG_CRIT,(__location__ " Failed to open pipe for check_reclock child\n"));
+		return -1;
+	}
+
+	state->child = fork();
+	if (state->child == (pid_t)-1) {
+		DEBUG(DEBUG_CRIT,(__location__ " fork() failed in check_reclock child\n"));
+		close(state->fd[0]);
+		state->fd[0] = -1;
+		close(state->fd[1]);
+		state->fd[1] = -1;
+		talloc_free(state);
+		return -1;
+	}
+
+	if (state->child == 0) {
+		char cc = RECLOCK_OK;
+		close(state->fd[0]);
+		state->fd[0] = -1;
+
+		if (pread(ctdb->recovery_lock_fd, &cc, 1, 0) == -1) {
+			DEBUG(DEBUG_CRIT,("failed read from recovery_lock_fd - %s\n", strerror(errno)));
+			cc = RECLOCK_FAILED;
+		}
+
+		write(state->fd[1], &cc, 1);
+		/* make sure we die when our parent dies */
+		while (kill(parent, 0) == 0 || errno != ESRCH) {
+			sleep(5);
+			write(state->fd[1], &cc, 1);
+		}
+		_exit(0);
+	}
+	close(state->fd[1]);
+	state->fd[1] = -1;
+
+	talloc_set_destructor(state, check_reclock_destructor);
+
+	state->te = event_add_timed(ctdb->ev, state, timeval_current_ofs(15, 0),
+				    ctdb_check_reclock_timeout, state);
+	if (state->te == NULL) {
+		DEBUG(DEBUG_CRIT,(__location__ " Failed to create a timed event for reclock child\n"));
+		talloc_free(state);
+		return -1;
+	}
+
+	state->fde = event_add_fd(ctdb->ev, state, state->fd[0],
+				EVENT_FD_READ|EVENT_FD_AUTOCLOSE,
+				reclock_child_handler,
+				(void *)state);
+
+	if (state->fde == NULL) {
+		DEBUG(DEBUG_CRIT,(__location__ " Failed to create an fd event for reclock child\n"));
+		talloc_free(state);
+		return -1;
+	}
+
+	while (state->status == RECLOCK_CHECKING) {
+		event_loop_once(ctdb->ev);
+	}
+
+	if (state->status == RECLOCK_FAILED) {
+		DEBUG(DEBUG_ERR,(__location__ " reclock child failed when checking file\n"));
+		close(ctdb->recovery_lock_fd);
+		ctdb->recovery_lock_fd = -1;
+		talloc_free(state);
+		return -1;
+	}
+
+	talloc_free(state);
+	return 0;
+}
+
 /*
   the main monitoring loop
  */
@@ -2346,7 +2516,6 @@ static void monitor_cluster(struct ctdb_context *ctdb)
 	int32_t debug_level;
 	int i, j, ret;
 	struct ctdb_recoverd *rec;
-	char c;
 
 	DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
 
@@ -2663,21 +2832,13 @@ again:
 
 
 	/* we should have the reclock - check its not stale */
-	if (ctdb->recovery_lock_fd == -1) {
-		DEBUG(DEBUG_CRIT,("recovery master doesn't have the recovery lock\n"));
-		do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap, ctdb->pnn);
-		goto again;
-	}
-
-	if (pread(ctdb->recovery_lock_fd, &c, 1, 0) == -1) {
-		DEBUG(DEBUG_CRIT,("failed read from recovery_lock_fd - %s\n", strerror(errno)));
-		close(ctdb->recovery_lock_fd);
-		ctdb->recovery_lock_fd = -1;
+	ret = check_recovery_lock(ctdb);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR,("Failed check_recovery_lock. Force a recovery\n"));
 		do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap, ctdb->pnn);
 		goto again;
 	}
 
-
 	/* get the nodemap for all active remote nodes
 	 */
 	remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map *, nodemap->num);


-- 
CTDB repository