[SCM] CTDB repository - branch master updated - ctdb-1.0.95-7-ga3406c1

Tue Oct 13 21:20:14 MDT 2009

The branch, master has been updated
       via  a3406c10d70f89d332eab25d481083142dff987d (commit)
       via  eb854f65f978f24583e221138eb4f9b917b89285 (commit)
       via  3807681e74f4bfe92befdae6ed616ff5f1a99880 (commit)
      from  343c005367789e108c0320e95d7a264535d68dd8 (commit)

http://gitweb.samba.org/?p=sahlberg/ctdb.git;a=shortlog;h=master


- Log -----------------------------------------------------------------
commit a3406c10d70f89d332eab25d481083142dff987d
Author: Ronnie Sahlberg <ronniesahlberg at gmail.com>
Date:   Wed Oct 14 14:14:28 2009 +1100

    add more debugging output to eventscripts and when a script has timed out,
    print a full "pstree -p" to the log.
    
    Example :
            |-ctdbd(29826)-+-ctdbd(29862)
            |              `-ctdbd(31897)-+-00.ctdb(31898)---sleep(31908)
    
    change the default timeout to 60 seconds for eventscripts

commit eb854f65f978f24583e221138eb4f9b917b89285
Author: Ronnie Sahlberg <ronniesahlberg at gmail.com>
Date:   Wed Oct 14 12:12:04 2009 +1100

    move the logging of the warning "No reclock file used" to the startup case so we only print this warning on "service ctdb start" and not for "service ctdb *"

commit 3807681e74f4bfe92befdae6ed616ff5f1a99880
Author: Ronnie Sahlberg <ronniesahlberg at gmail.com>
Date:   Wed Oct 14 11:59:16 2009 +1100

    when we change state between healthy/unhealthy, make sure we ask the recovery
    master to perform an explicit ip reallocation.
    
    This is more reliable and faster than having the recovery daemon track these
    changes, and since we now have an explicit method to ask the recovery daemon
    to perform an explicit ip reallocation, we should use this.

-----------------------------------------------------------------------

Summary of changes:
 config/ctdb.init       |    7 ++---
 include/ctdb.h         |    5 ++++
 include/ctdb_private.h |    8 +++++++
 server/ctdb_monitor.c  |   25 ++++++++++++++++++++++++
 server/ctdb_tunables.c |    2 +-
 server/eventscript.c   |   49 ++++++++++++++++++++++++++++++++++++++++++++---
 tools/ctdb.c           |    2 +-
 7 files changed, 88 insertions(+), 10 deletions(-)


Changeset truncated at 500 lines:

diff --git a/config/ctdb.init b/config/ctdb.init
index 9b965a9..d4b9424 100755
--- a/config/ctdb.init
+++ b/config/ctdb.init
@@ -43,10 +43,6 @@ unset TMPDIR
 loadconfig network
 loadconfig ctdb
 
-[ -z "$CTDB_RECOVERY_LOCK" ] && {
-    echo "No recovery lock specified. Starting CTDB without split brain prevention"
-}
-
 # check networking is up (for redhat)
 [ "$NETWORKING" = "no" ] && exit 0
 
@@ -83,6 +79,9 @@ build_ctdb_options () {
 	CTDB_OPTIONS="${CTDB_OPTIONS}${CTDB_OPTIONS:+ }${1}${sep}${val}"
     }
 
+    [ -z "$CTDB_RECOVERY_LOCK" ] && {
+        echo "No recovery lock specified. Starting CTDB without split brain prevention"
+    }
     maybe_set "--reclock"                "$CTDB_RECOVERY_LOCK"
 
     # build up CTDB_OPTIONS variable from optional parameters
diff --git a/include/ctdb.h b/include/ctdb.h
index b9a7685..abe09a3 100644
--- a/include/ctdb.h
+++ b/include/ctdb.h
@@ -106,6 +106,11 @@ struct ctdb_call_info {
 */
 #define CTDB_SRVID_DISABLE_IP_CHECK  0xFC00000000000000LL
 
+/* A dummy port used for sending back ipreallocate resposnes to the main
+   daemon
+*/
+#define CTDB_SRVID_TAKEOVER_RUN_RESPONSE  0xFD00000000000000LL
+
 /* used on the domain socket, send a pdu to the local daemon */
 #define CTDB_CURRENT_NODE     0xF0000001
 /* send a broadcast to all nodes in the cluster, active or not */
diff --git a/include/ctdb_private.h b/include/ctdb_private.h
index acaaf5f..af77556 100644
--- a/include/ctdb_private.h
+++ b/include/ctdb_private.h
@@ -46,6 +46,14 @@ struct rd_memdump_reply {
 };
 
 /*
+  description for a TAKEOVER_RUN message reply address
+ */
+struct takeover_run_reply {
+	uint32_t pnn;
+	uint64_t srvid;
+};
+
+/*
   a tcp connection description
  */
 struct ctdb_tcp_connection {
diff --git a/server/ctdb_monitor.c b/server/ctdb_monitor.c
index 4a554d4..fc96fd7 100644
--- a/server/ctdb_monitor.c
+++ b/server/ctdb_monitor.c
@@ -110,10 +110,19 @@ static void ctdb_health_callback(struct ctdb_context *ctdb, int status, void *p)
 	TDB_DATA data;
 	struct ctdb_node_flag_change c;
 	uint32_t next_interval;
+	int ret;
+	TDB_DATA rddata;
+	struct takeover_run_reply rd;
 
 	c.pnn = ctdb->pnn;
 	c.old_flags = node->flags;
 
+	rd.pnn   = ctdb->pnn;
+	rd.srvid = CTDB_SRVID_TAKEOVER_RUN_RESPONSE;
+
+	rddata.dptr = (uint8_t *)&rd;
+	rddata.dsize = sizeof(rd);
+
 	if (status != 0 && !(node->flags & NODE_FLAGS_UNHEALTHY)) {
 		DEBUG(DEBUG_NOTICE,("monitor event failed - disabling node\n"));
 		node->flags |= NODE_FLAGS_UNHEALTHY;
@@ -124,12 +133,28 @@ static void ctdb_health_callback(struct ctdb_context *ctdb, int status, void *p)
 		}
 
 		ctdb_run_notification_script(ctdb, "unhealthy");
+
+		/* ask the recmaster to reallocate all addresses */
+		DEBUG(DEBUG_ERR,("Node became UNHEALTHY. Ask recovery master %u to perform ip reallocation\n", ctdb->recovery_master));
+		ret = ctdb_daemon_send_message(ctdb, ctdb->recovery_master, CTDB_SRVID_TAKEOVER_RUN, rddata);
+		if (ret != 0) {
+			DEBUG(DEBUG_ERR,(__location__ " Failed to send ip takeover run request message to %u\n", ctdb->recovery_master));
+		}
+
 	} else if (status == 0 && (node->flags & NODE_FLAGS_UNHEALTHY)) {
 		DEBUG(DEBUG_NOTICE,("monitor event OK - node re-enabled\n"));
 		node->flags &= ~NODE_FLAGS_UNHEALTHY;
 		ctdb->monitor->next_interval = 1;
 
 		ctdb_run_notification_script(ctdb, "healthy");
+
+		/* ask the recmaster to reallocate all addresses */
+		DEBUG(DEBUG_ERR,("Node became HEALTHY. Ask recovery master %u to perform ip reallocation\n", ctdb->recovery_master));
+		ret = ctdb_daemon_send_message(ctdb, ctdb->recovery_master, CTDB_SRVID_TAKEOVER_RUN, rddata);
+		if (ret != 0) {
+			DEBUG(DEBUG_ERR,(__location__ " Failed to send ip takeover run request message to %u\n", ctdb->recovery_master));
+		}
+
 	}
 
 	next_interval = ctdb->monitor->next_interval;
diff --git a/server/ctdb_tunables.c b/server/ctdb_tunables.c
index a321789..519a7b7 100644
--- a/server/ctdb_tunables.c
+++ b/server/ctdb_tunables.c
@@ -37,7 +37,7 @@ static const struct {
 	{ "TakeoverTimeout",      5,  offsetof(struct ctdb_tunable, takeover_timeout) },
 	{ "MonitorInterval",     15,  offsetof(struct ctdb_tunable, monitor_interval) },
 	{ "TickleUpdateInterval",20,  offsetof(struct ctdb_tunable, tickle_update_interval) },
-	{ "EventScriptTimeout",  20,  offsetof(struct ctdb_tunable, script_timeout) },
+	{ "EventScriptTimeout",  60,  offsetof(struct ctdb_tunable, script_timeout) },
 	{ "EventScriptBanCount",  5,  offsetof(struct ctdb_tunable, script_ban_count) },
 	{ "RecoveryGracePeriod", 120,  offsetof(struct ctdb_tunable, recovery_grace_period) },
 	{ "RecoveryBanPeriod",  300,  offsetof(struct ctdb_tunable, recovery_ban_period) },
diff --git a/server/eventscript.c b/server/eventscript.c
index bbb4016..6d2b370 100644
--- a/server/eventscript.c
+++ b/server/eventscript.c
@@ -36,8 +36,37 @@ static struct {
  */
 static void sigterm(int sig)
 {
-	DEBUG(DEBUG_ERR,("Timed out running script '%s' after %.1f seconds\n", 
-		 child_state.script_running, timeval_elapsed(&child_state.start)));
+	FILE *p;
+
+	DEBUG(DEBUG_ERR,("Timed out running script '%s' after %.1f seconds pid :%d\n", 
+		 child_state.script_running, timeval_elapsed(&child_state.start), getpid()));
+
+	p = popen("pstree -p", "r");
+	if (p == NULL) {
+		DEBUG(DEBUG_ERR,("Failed popen to collect pstree for hung script\n"));
+	} else {
+		char buf[256];
+		int count;
+
+		DEBUG(DEBUG_ERR,("PSTREE:\n"));
+		while(!feof(p)){
+			count=fread(buf, 1, 255, p);
+			if (count == EOF) {
+				break;
+			}
+			if (count < 0) {
+				break;
+			}
+			if (count == 0) {
+				break;
+			}
+			buf[count] = 0;
+			DEBUG(DEBUG_ERR,("%s", buf)); 
+		}
+		DEBUG(DEBUG_ERR,("END OF PSTREE OUTPUT\n"));
+		pclose(p);
+	}
+
 	/* all the child processes will be running in the same process group */
 	kill(-getpgrp(), SIGKILL);
 	exit(1);
@@ -653,7 +682,15 @@ static void ctdb_event_script_timeout(struct event_context *ev, struct timed_eve
 		talloc_get_type(ctdb->script_monitoring_ctx,
 			struct ctdb_monitoring_status);
 
-	DEBUG(DEBUG_ERR,("Event script timed out : %s count : %u\n", state->options, ctdb->event_script_timeouts));
+	DEBUG(DEBUG_ERR,("Event script timed out : %s count : %u  pid : %d\n", state->options, ctdb->event_script_timeouts, state->child));
+	if (kill(state->child, 0) != 0) {
+		DEBUG(DEBUG_ERR,("Event script child process already dead, errno %s(%d)\n", strerror(errno), errno));
+		callback(ctdb, 0, private_data);
+
+		talloc_set_destructor(state, NULL);
+		talloc_free(state);
+		return;
+	}
 
 	options = talloc_strdup(ctdb, state->options);
 	CTDB_NO_MEMORY_VOID(ctdb, options);
@@ -709,7 +746,11 @@ static void ctdb_event_script_timeout(struct event_context *ev, struct timed_eve
 static int event_script_destructor(struct ctdb_event_script_state *state)
 {
 	DEBUG(DEBUG_ERR,(__location__ " Sending SIGTERM to child pid:%d\n", state->child));
-	kill(state->child, SIGTERM);
+
+	if (kill(state->child, SIGTERM) != 0) {
+		DEBUG(DEBUG_ERR,("Failed to kill child process for eventscript, errno %s(%d)\n", strerror(errno), errno));
+	}
+
 	return 0;
 }
 
diff --git a/tools/ctdb.c b/tools/ctdb.c
index ec70fe8..2f78ebe 100644
--- a/tools/ctdb.c
+++ b/tools/ctdb.c
@@ -1619,7 +1619,7 @@ static int control_ipreallocate(struct ctdb_context *ctdb, int argc, const char
 {
 	int i, ret;
 	TDB_DATA data;
-	struct rd_memdump_reply rd;
+	struct takeover_run_reply rd;
 	uint32_t recmaster;
 	struct ctdb_node_map *nodemap=NULL;
 	int retries=0;


-- 
CTDB repository