[SCM] CTDB repository - branch master updated - ctdb-1.0.57-8-gcd69d29

Ronnie Sahlberg sahlberg at samba.org
Tue Sep 9 03:47:42 GMT 2008


The branch, master has been updated
       via  cd69d292292eaab3aac0e9d9fc57cb621597c63c (commit)
      from  e26ce5140ed005725f8b7ac8ba23a180fd7d5337 (commit)

http://gitweb.samba.org/?p=sahlberg/ctdb.git;a=shortlog;h=master


- Log -----------------------------------------------------------------
commit cd69d292292eaab3aac0e9d9fc57cb621597c63c
Author: Ronnie Sahlberg <ronniesahlberg at gmail.com>
Date:   Tue Sep 9 13:44:46 2008 +1000

    additional monitoring between the two daemons.
    
    we currently only monitor that the daemons are running by kill(pid, 0)
    and verifying that the domain socket between them is ok.
    
    this is not sufficient since we can have a situation where the recovery
    daemon is hung.
    
    this new code monitors that the recovery daemon is operating.
    if the recovery daemon hangs, we log this and shut down the main daemon

-----------------------------------------------------------------------

Summary of changes:
 client/ctdb_client.c   |   18 ++++++++++++++++++
 include/ctdb.h         |    2 ++
 include/ctdb_private.h |    4 ++++
 server/ctdb_control.c  |    4 ++++
 server/ctdb_daemon.c   |    3 +++
 server/ctdb_recover.c  |   38 ++++++++++++++++++++++++++++++++++++++
 server/ctdb_recoverd.c |    3 +++
 server/ctdb_tunables.c |    1 +
 8 files changed, 73 insertions(+), 0 deletions(-)


Changeset truncated at 500 lines:

diff --git a/client/ctdb_client.c b/client/ctdb_client.c
index dfcd4d9..6d80efc 100644
--- a/client/ctdb_client.c
+++ b/client/ctdb_client.c
@@ -3280,3 +3280,21 @@ again:
 	talloc_free(h);
 	return 0;
 }
+
+/*
+  recovery daemon ping to main daemon
+ */
+int ctdb_ctrl_recd_ping(struct ctdb_context *ctdb)
+{
+	int ret;
+	int32_t res;
+
+	ret = ctdb_control(ctdb, CTDB_CURRENT_NODE, 0, CTDB_CONTROL_RECD_PING, 0, tdb_null, 
+			   ctdb, NULL, &res, NULL, NULL);
+	if (ret != 0 || res != 0) {
+		DEBUG(DEBUG_ERR,("Failed to send recd ping\n"));
+		return -1;
+	}
+
+	return 0;
+}
diff --git a/include/ctdb.h b/include/ctdb.h
index d43ab50..60fa60b 100644
--- a/include/ctdb.h
+++ b/include/ctdb.h
@@ -566,4 +566,6 @@ int ctdb_transaction_store(struct ctdb_transaction_handle *h,
 			   TDB_DATA key, TDB_DATA data);
 int ctdb_transaction_commit(struct ctdb_transaction_handle *h);
 
+int ctdb_ctrl_recd_ping(struct ctdb_context *ctdb);
+
 #endif
diff --git a/include/ctdb_private.h b/include/ctdb_private.h
index a25674c..b2ded31 100644
--- a/include/ctdb_private.h
+++ b/include/ctdb_private.h
@@ -114,6 +114,7 @@ struct ctdb_tunable {
 	uint32_t reclock_ping_period;
 	uint32_t no_ip_failback;
 	uint32_t verbose_memory_names;
+	uint32_t recd_ping_timeout;
 };
 
 /*
@@ -417,6 +418,7 @@ struct ctdb_context {
 	int start_as_disabled;
 	uint32_t event_script_timeouts; /* counting how many consecutive times an eventscript has timedout */
 	TALLOC_CTX *eventscripts_ctx; /* a context to hold data for the RUN_EVENTSCRIPTS control */
+	TALLOC_CTX *recd_ping_ctx;
 };
 
 struct ctdb_db_context {
@@ -550,6 +552,7 @@ enum ctdb_controls {CTDB_CONTROL_PROCESS_EXISTS          = 0,
 		    CTDB_CONTROL_TRANS2_FINISHED         = 84,
 		    CTDB_CONTROL_TRANS2_ERROR            = 85,
 		    CTDB_CONTROL_TRANS2_COMMIT_RETRY     = 86,
+		    CTDB_CONTROL_RECD_PING		 = 87,
 };	
 
 /*
@@ -1378,5 +1381,6 @@ int32_t ctdb_control_trans2_error(struct ctdb_context *ctdb,
 char *ctdb_addr_to_str(ctdb_sock_addr *addr);
 void ctdb_canonicalize_ip(const ctdb_sock_addr *ip, ctdb_sock_addr *cip);
 
+int32_t ctdb_control_recd_ping(struct ctdb_context *ctdb);
 
 #endif
diff --git a/server/ctdb_control.c b/server/ctdb_control.c
index 4128797..94736fb 100644
--- a/server/ctdb_control.c
+++ b/server/ctdb_control.c
@@ -406,6 +406,10 @@ static int32_t ctdb_control_dispatch(struct ctdb_context *ctdb,
 	case CTDB_CONTROL_TRANS2_FINISHED:
 		return ctdb_control_trans2_finished(ctdb, c);
 
+	case CTDB_CONTROL_RECD_PING:
+		CHECK_CONTROL_DATA_SIZE(0);
+		return ctdb_control_recd_ping(ctdb);
+
 	default:
 		DEBUG(DEBUG_CRIT,(__location__ " Unknown CTDB control opcode %u\n", opcode));
 		return -1;
diff --git a/server/ctdb_daemon.c b/server/ctdb_daemon.c
index efe3d75..885ce7e 100644
--- a/server/ctdb_daemon.c
+++ b/server/ctdb_daemon.c
@@ -103,6 +103,9 @@ static void ctdb_start_transport(struct ctdb_context *ctdb)
 
 	/* start periodic update of tcp tickle lists */
        	ctdb_start_tcp_tickle_update(ctdb);
+
+	/* start listening for recovery daemon pings */
+	ctdb_control_recd_ping(ctdb);
 }
 
 static void block_signal(int signum)
diff --git a/server/ctdb_recover.c b/server/ctdb_recover.c
index 3243f42..6b207d5 100644
--- a/server/ctdb_recover.c
+++ b/server/ctdb_recover.c
@@ -971,3 +971,41 @@ int32_t ctdb_control_get_capabilities(struct ctdb_context *ctdb, TDB_DATA *outda
 	return 0;	
 }
 
+static void ctdb_recd_ping_timeout(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
+{
+	struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
+
+	DEBUG(DEBUG_ERR, (__location__ " Recovery daemon ping timeout. Shutting down ctdb daemon\n"));
+
+	ctdb_stop_recoverd(ctdb);
+	ctdb_stop_keepalive(ctdb);
+	ctdb_stop_monitoring(ctdb);
+	ctdb_release_all_ips(ctdb);
+	if (ctdb->methods != NULL) {
+		ctdb->methods->shutdown(ctdb);
+	}
+	ctdb_event_script(ctdb, "shutdown");
+	DEBUG(DEBUG_ERR, (__location__ " Recovery daemon ping timeout. Daemon has been shut down.\n"));
+	exit(0);
+}
+
+/* The recovery daemon will ping us at regular intervals.
+   If we havent been pinged for a while we assume the recovery
+   daemon is inoperable and we shut down.
+*/
+int32_t ctdb_control_recd_ping(struct ctdb_context *ctdb)
+{
+	talloc_free(ctdb->recd_ping_ctx);
+
+	ctdb->recd_ping_ctx = talloc_new(ctdb);
+	CTDB_NO_MEMORY(ctdb, ctdb->recd_ping_ctx);
+
+	if (ctdb->tunable.recd_ping_timeout != 0) {
+		event_add_timed(ctdb->ev, ctdb->recd_ping_ctx, 
+			timeval_current_ofs(ctdb->tunable.recd_ping_timeout, 0),
+			ctdb_recd_ping_timeout, ctdb);
+	}
+
+	return 0;
+}
+
diff --git a/server/ctdb_recoverd.c b/server/ctdb_recoverd.c
index c6a4ab3..a8c004a 100644
--- a/server/ctdb_recoverd.c
+++ b/server/ctdb_recoverd.c
@@ -2317,6 +2317,9 @@ again:
 		exit(-1);
 	}
 
+	/* ping the local daemon to tell it we are alive */
+	ctdb_ctrl_recd_ping(ctdb);
+
 	if (rec->election_timeout) {
 		/* an election is in progress */
 		goto again;
diff --git a/server/ctdb_tunables.c b/server/ctdb_tunables.c
index d138137..de3e466 100644
--- a/server/ctdb_tunables.c
+++ b/server/ctdb_tunables.c
@@ -50,6 +50,7 @@ static const struct {
 	{ "ReclockPingPeriod",   60,  offsetof(struct ctdb_tunable,  reclock_ping_period) },
 	{ "NoIPFailback",         0,  offsetof(struct ctdb_tunable, no_ip_failback) },
 	{ "VerboseMemoryNames",   0,  offsetof(struct ctdb_tunable, verbose_memory_names) },
+	{ "RecdPingTimeout",	 60,  offsetof(struct ctdb_tunable, recd_ping_timeout) },
 };
 
 /*


-- 
CTDB repository


More information about the samba-cvs mailing list