[SCM] CTDB repository - branch 1.2.39 updated - ctdb-1.9.1-506-g12c9737

Thu Nov 29 20:53:06 MST 2012

The branch, 1.2.39 has been updated
       via  12c9737e00f548599ea88b644b625be72466bb8b (commit)
       via  dbf5e73e54a1df613707837959d33a72a6c3118b (commit)
       via  a630c2f4e201e1b17d5873e8a45231f92b019278 (commit)
      from  0491fdec7b11b0e79750c92977c6cc776d5f88bf (commit)

http://gitweb.samba.org/?p=ctdb.git;a=shortlog;h=1.2.39


- Log -----------------------------------------------------------------
commit 12c9737e00f548599ea88b644b625be72466bb8b
Author: Amitay Isaacs <amitay at gmail.com>
Date:   Fri Nov 30 14:16:21 2012 +1100

    New version 1.2.39-7
    
    Signed-off-by: Amitay Isaacs <amitay at gmail.com>

commit dbf5e73e54a1df613707837959d33a72a6c3118b
Author: Amitay Isaacs <amitay at gmail.com>
Date:   Tue Oct 23 16:23:12 2012 +1100

    recoverd: Track the nodes that fail takeover run and set culprit count
    
    If any of the nodes fails the takeover run started from the main loop
    (either due to a timeout or a failure to complete within the
    takeover_timeout interval), the recovery master gives up on the takeover
    run with the following message:
    
      "Unable to setup public takeover addresses. Try again later"
    
    As a side effect, monitoring stays disabled on all the nodes. Before
    ctdb_takeover_run() is called from the main loop, monitoring gets disabled
    via the startrecovery event. Since ctdb_takeover_run() fails, the recovered
    event is never run and monitoring does not get re-enabled.
    
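    In main_loop, ctdb_takeover_run() is now called with takeover_fail_callback.
    This callback gets called for any node that fails to handle the
    takeip/releaseip/ipreallocated events in ctdb_takeover_run(). It marks the
    failing node as a culprit and sets need_takeover_run so that the takeover
    run is retried.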
    
    Signed-off-by: Amitay Isaacs <amitay at gmail.com>
    
    Cherry-pick-from: cbe68821180e04988edf186dcf6d042edcab81de
    
    Conflicts:
    	server/ctdb_recoverd.c

commit a630c2f4e201e1b17d5873e8a45231f92b019278
Author: Amitay Isaacs <amitay at gmail.com>
Date:   Tue Oct 23 15:06:33 2012 +1100

    daemon: Do not ignore timed out monitor events
    
    If an eventscript times out for the monitor event, it is considered
    successful and the remaining eventscripts are not run. This can make a node
    appear healthy prematurely and cause a healthy node to fail over IPs to this
    node, which will not be able to host those IPs. This causes loss of access
    and, in the case of a NAT-GW configuration, loss of a default route.
    
    Cherry-pick-from: 5205d545e8d8c72d73b9d5fd148df6de30392fc8
    
    Signed-off-by: Amitay Isaacs <amitay at gmail.com>

-----------------------------------------------------------------------

Summary of changes:
 include/ctdb_private.h     |    6 +++---
 packaging/RPM/ctdb.spec.in |    5 ++++-
 server/ctdb_recoverd.c     |   32 ++++++++++++++++++++++++++++----
 server/ctdb_takeover.c     |   14 +++++++++++---
 server/ctdb_tunables.c     |    2 +-
 server/eventscript.c       |    1 -
 6 files changed, 47 insertions(+), 13 deletions(-)


Changeset truncated at 500 lines:

diff --git a/include/ctdb_private.h b/include/ctdb_private.h
index e3a9a15..32e8c68 100644
--- a/include/ctdb_private.h
+++ b/include/ctdb_private.h
@@ -1118,6 +1118,8 @@ int ctdb_sys_send_tcp(const ctdb_sock_addr *dest,
 		      const ctdb_sock_addr *src,
 		      uint32_t seq, uint32_t ack, int rst);
 
+typedef void (*client_async_callback)(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data);
+
 int ctdb_set_public_addresses(struct ctdb_context *ctdb, const char *alist);
 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
 			      const char *iface,
@@ -1125,7 +1127,7 @@ int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
 int ctdb_set_event_script(struct ctdb_context *ctdb, const char *script);
 int ctdb_set_event_script_dir(struct ctdb_context *ctdb, const char *script_dir);
 int ctdb_set_notification_script(struct ctdb_context *ctdb, const char *script);
-int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap);
+int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, client_async_callback fail_callback, void *callback_data);
 
 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id, 
 				TDB_DATA indata);
@@ -1264,8 +1266,6 @@ int32_t ctdb_monitoring_mode(struct ctdb_context *ctdb);
 int ctdb_set_child_logging(struct ctdb_context *ctdb);
 void ctdb_lockdown_memory(struct ctdb_context *ctdb);
 
-typedef void (*client_async_callback)(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data);
-
 struct client_async_data {
 	enum ctdb_controls opcode;
 	bool dont_log_errors;
diff --git a/packaging/RPM/ctdb.spec.in b/packaging/RPM/ctdb.spec.in
index 12a9e0b..5165c5c 100644
--- a/packaging/RPM/ctdb.spec.in
+++ b/packaging/RPM/ctdb.spec.in
@@ -4,7 +4,7 @@ Summary: Clustered TDB
 Vendor: Samba Team
 Packager: Samba Team <samba at samba.org>
 Version: 1.2.39
-Release: 5GITHASH
+Release: 7GITHASH
 Epoch: 0
 License: GNU GPL version 3
 Group: System Environment/Daemons
@@ -144,6 +144,9 @@ development libraries for ctdb
 %{_libdir}/libctdb.a
 
 %changelog
+* Fri Nov 30 2012 : version 1.2.39-7
+  - Do not ignore timed out monitor events
+  - Track the nodes that fail takeover run and set culprit count
 * Mon Mar 5 2012 : version 1.2.39-5
  - remove change for 35306
 * Wed Feb 29 2012 : version 1.2.39-4
diff --git a/server/ctdb_recoverd.c b/server/ctdb_recoverd.c
index 4963c3f..336a9a7 100644
--- a/server/ctdb_recoverd.c
+++ b/server/ctdb_recoverd.c
@@ -1345,6 +1345,21 @@ static int sync_recovery_lock_file_across_cluster(struct ctdb_recoverd *rec)
 
 
 /*
+ * this callback is called for every node that failed to execute ctdb_takeover_run()
+ * and set flag to re-run takeover run.
+ */
+static void takeover_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
+{
+	struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
+
+	DEBUG(DEBUG_ERR, (__location__ " Node %u failed the takeover run. Setting it as recovery fail culprit\n", node_pnn));
+
+	ctdb_set_culprit(rec, node_pnn);
+	rec->need_takeover_run = true;
+}
+
+
+/*
   we are the recmaster, and recovery is needed - start a recovery run
  */
 static int do_recovery(struct ctdb_recoverd *rec, 
@@ -1631,7 +1646,7 @@ static int do_recovery(struct ctdb_recoverd *rec,
 		return -1;
 	}
 	rec->need_takeover_run = false;
-	ret = ctdb_takeover_run(ctdb, nodemap);
+	ret = ctdb_takeover_run(ctdb, nodemap, NULL, NULL);
 	if (ret != 0) {
 		DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses. ctdb_takeover_run() failed.\n"));
 		rec->need_takeover_run = true;
@@ -2046,7 +2061,7 @@ static void process_ipreallocate_requests(struct ctdb_context *ctdb, struct ctdb
 		rec->need_takeover_run = true;
 	}
 	if (ret == 0) {
-		ret = ctdb_takeover_run(ctdb, rec->nodemap);
+		ret = ctdb_takeover_run(ctdb, rec->nodemap, NULL, NULL);
 		if (ret != 0) {
 			DEBUG(DEBUG_ERR,("Failed to reallocate addresses: ctdb_takeover_run() failed.\n"));
 			rec->need_takeover_run = true;
@@ -3412,9 +3427,18 @@ static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
 			return;
 		}
 
-		ret = ctdb_takeover_run(ctdb, nodemap);
+		/* If takeover run fails, then the offending nodes are
+		 * assigned ban culprit counts. And we re-try takeover.
+		 * If takeover run fails repeatedly, the node would get
+		 * banned.
+		 *
+		 * If rec->need_takeover_run is not set to true at this
+		 * failure, monitoring is disabled cluster-wide (via
+		 * startrecovery eventscript) and will not get enabled.
+		 */
+		ret = ctdb_takeover_run(ctdb, nodemap, takeover_fail_callback, rec);
 		if (ret != 0) {
-			DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses. Try again later\n"));
+			DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses. Trying again\n"));
 			return;
 		}
 
diff --git a/server/ctdb_takeover.c b/server/ctdb_takeover.c
index 9cbafaf..e17b481 100644
--- a/server/ctdb_takeover.c
+++ b/server/ctdb_takeover.c
@@ -1933,7 +1933,8 @@ finished:
 /*
   make any IP alias changes for public addresses that are necessary 
  */
-int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
+int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
+		      client_async_callback fail_callback, void *callback_data)
 {
 	int i;
 	struct ctdb_public_ip ip;
@@ -1965,6 +1966,9 @@ int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
 	async_data = talloc_zero(tmp_ctx, struct client_async_data);
 	CTDB_NO_MEMORY_FATAL(ctdb, async_data);
 
+	async_data->fail_callback = fail_callback;
+	async_data->callback_data = callback_data;
+
 	for (i=0;i<nodemap->num;i++) {
 		/* don't talk to unconnected nodes, but do talk to banned nodes */
 		if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
@@ -2022,6 +2026,10 @@ int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
 	/* tell all nodes to get their own IPs */
 	async_data = talloc_zero(tmp_ctx, struct client_async_data);
 	CTDB_NO_MEMORY_FATAL(ctdb, async_data);
+
+	async_data->fail_callback = fail_callback;
+	async_data->callback_data = callback_data;
+
 	for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
 		if (tmp_ip->pnn == -1) {
 			/* this IP won't be taken over */
@@ -2074,8 +2082,8 @@ ipreallocated:
 	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_RUN_EVENTSCRIPTS,
 				      nodes, 0, TAKEOVER_TIMEOUT(),
 				      false, data,
-				      NULL, NULL,
-				      NULL) != 0) {
+				      NULL, fail_callback,
+				      callback_data) != 0) {
 		DEBUG(DEBUG_ERR, (__location__ " ctdb_control to updatenatgw failed\n"));
 	}
 
diff --git a/server/ctdb_tunables.c b/server/ctdb_tunables.c
index 092c30f..609c71a 100644
--- a/server/ctdb_tunables.c
+++ b/server/ctdb_tunables.c
@@ -37,7 +37,7 @@ static const struct {
 	{ "MonitorInterval",     15,  offsetof(struct ctdb_tunable, monitor_interval) },
 	{ "TickleUpdateInterval",20,  offsetof(struct ctdb_tunable, tickle_update_interval) },
 	{ "EventScriptTimeout",  30,  offsetof(struct ctdb_tunable, script_timeout) },
-	{ "EventScriptTimeoutCount", 1,  offsetof(struct ctdb_tunable, script_timeout_count) },
+	{ "EventScriptTimeoutCount", 20,  offsetof(struct ctdb_tunable, script_timeout_count) },
 	{ "EventScriptUnhealthyOnTimeout", 0, offsetof(struct ctdb_tunable, script_unhealthy_on_timeout) },/* OBSOLETE */
 	{ "RecoveryGracePeriod", 120,  offsetof(struct ctdb_tunable, recovery_grace_period) },
 	{ "RecoveryBanPeriod",  300,  offsetof(struct ctdb_tunable, recovery_ban_period) },
diff --git a/server/eventscript.c b/server/eventscript.c
index c42aaa4..a1bcf01 100644
--- a/server/eventscript.c
+++ b/server/eventscript.c
@@ -561,7 +561,6 @@ static void ctdb_event_script_timeout(struct event_context *ev, struct timed_eve
 	case CTDB_EVENT_TAKE_IP:
 	case CTDB_EVENT_RELEASE_IP:
 	case CTDB_EVENT_STOPPED:
-	case CTDB_EVENT_MONITOR:
 	case CTDB_EVENT_STATUS:
 		state->scripts->scripts[state->current].status = 0;
 		DEBUG(DEBUG_ERR,("Ignoring hung script for %s call %d\n", state->options, state->call));


-- 
CTDB repository