[SCM] CTDB repository - branch master updated - ctdb-2.1-97-g05f785b

Amitay Isaacs amitay at samba.org
Sun May 5 21:36:18 MDT 2013


The branch, master has been updated
       via  05f785b51cfd8b22b3ae35bf034127fbc07005be (commit)
       via  0b7257642f62ebd83c05b6e2922f0dc2737f175c (commit)
      from  b5a8791268e938d7e017056e0e2bd2cbec1fa690 (commit)

http://gitweb.samba.org/?p=ctdb.git;a=shortlog;h=master


- Log -----------------------------------------------------------------
commit 05f785b51cfd8b22b3ae35bf034127fbc07005be
Author: Martin Schwenke <martin at meltin.net>
Date:   Tue Apr 30 17:22:23 2013 +1000

    ctdbd: Avoid freeing non-monitor event callback when monitoring is disabled
    
    When running a non-monitor event, check is made for any active monitor
    events.  If there is an active monitor event, then the active monitor
    event is cancelled.  This is done by freeing state->callback which is
    allocated from monitor_context.
    
    When CTDB is stopped or shutdown, monitoring is disabled by freeing
    monitor_context, which frees callback and then stopped or shutdown event
    is run.  This creates a new callback structure which is allocated at
    the exact same memory location as the monitor callback which was freed.
    So in the check for active monitor events, it frees the new callback
    for non-monitor event.  Since the callback function flags successful
    completion of that event, it is never marked complete and CTDB is stuck
    in a loop waiting for completion.
    
    Move the monitor cancellation to the top of the function so that this
    can't happen.
    
    Follow log snippest highlights the problem.
    
    2013/04/30 16:54:10.673807 [21505]: Received SHUTDOWN command. Stopping CTDB daemon.
    2013/04/30 16:54:10.673814 [21505]: Shutting down recovery daemon
    2013/04/30 16:54:10.673852 [21505]: server/eventscript.c:696 in remove_callback 0x1c6d5c0
    2013/04/30 16:54:10.673858 [21505]: Monitoring has been stopped
    2013/04/30 16:54:10.673899 [21505]: server/eventscript.c:594 Sending SIGTERM to child pid:23847
    2013/04/30 16:54:10.673913 [21505]: server/eventscript.c:629 searching for callback 0x1c6d5c0
    2013/04/30 16:54:10.673932 [21505]: server/eventscript.c:641 running callback
    2013/04/30 16:54:10.673939 [21505]: server/eventscript.c:866 in event_script_callback
    2013/04/30 16:54:10.673946 [21505]: server/eventscript.c:696 in remove_callback 0x1c6d5c0
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>
    Pair-programmed-with: Amitay Isaacs <amitay at gmail.com>

commit 0b7257642f62ebd83c05b6e2922f0dc2737f175c
Author: Martin Schwenke <martin at meltin.net>
Date:   Thu Feb 21 10:43:35 2013 +1100

    recoverd: Interface reference count changes should not cause takeover runs
    
    At the moment a naive compare of the all the interface data is done.
    So, if any IPs move then the reference counts for the the relevant
    interfaces change, interfaces appear to have changed and another
    takeover run is initiated by each node that took/released IPs.
    
    This change stops the spurious takeover runs by changing the interface
    comparison to ignore the reference counts.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

-----------------------------------------------------------------------

Summary of changes:
 server/ctdb_recoverd.c |   70 ++++++++++++++++++++++++++++++++---------------
 server/eventscript.c   |   61 ++++++++++++++++++++---------------------
 2 files changed, 77 insertions(+), 54 deletions(-)


Changeset truncated at 500 lines:

diff --git a/server/ctdb_recoverd.c b/server/ctdb_recoverd.c
index 2cfa9b9..c3a1852 100644
--- a/server/ctdb_recoverd.c
+++ b/server/ctdb_recoverd.c
@@ -2855,17 +2855,61 @@ static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ct
 	return status;
 }
 
+static bool interfaces_have_changed(struct ctdb_context *ctdb,
+				    struct ctdb_recoverd *rec)
+{
+	struct ctdb_control_get_ifaces *ifaces = NULL;
+	TALLOC_CTX *mem_ctx;
+	bool ret = false;
+
+	mem_ctx = talloc_new(NULL);
+
+	/* Read the interfaces from the local node */
+	if (ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(),
+				 CTDB_CURRENT_NODE, mem_ctx, &ifaces) != 0) {
+		DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", ctdb->pnn));
+		/* We could return an error.  However, this will be
+		 * rare so we'll decide that the interfaces have
+		 * actually changed, just in case.
+		 */
+		talloc_free(mem_ctx);
+		return true;
+	}
+
+	if (!rec->ifaces) {
+		/* We haven't been here before so things have changed */
+		ret = true;
+	} else if (rec->ifaces->num != ifaces->num) {
+		/* Number of interfaces has changed */
+		ret = true;
+	} else {
+		/* See if interface names or link states have changed */
+		int i;
+		for (i = 0; i < rec->ifaces->num; i++) {
+			struct ctdb_control_iface_info * iface = &rec->ifaces->ifaces[i];
+			if (strcmp(iface->name, ifaces->ifaces[i].name) != 0 ||
+			    iface->link_state != ifaces->ifaces[i].link_state) {
+				ret = true;
+				break;
+			}
+		}
+	}
+
+	talloc_free(rec->ifaces);
+	rec->ifaces = talloc_steal(rec, ifaces);
+
+	talloc_free(mem_ctx);
+	return ret;
+}
 
 /* called to check that the local allocation of public ip addresses is ok.
 */
 static int verify_local_ip_allocation(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, uint32_t pnn, struct ctdb_node_map *nodemap)
 {
 	TALLOC_CTX *mem_ctx = talloc_new(NULL);
-	struct ctdb_control_get_ifaces *ifaces = NULL;
 	struct ctdb_uptime *uptime1 = NULL;
 	struct ctdb_uptime *uptime2 = NULL;
 	int ret, j;
-	bool need_iface_check = false;
 	bool need_takeover_run = false;
 
 	ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
@@ -2876,27 +2920,7 @@ static int verify_local_ip_allocation(struct ctdb_context *ctdb, struct ctdb_rec
 		return -1;
 	}
 
-
-	/* read the interfaces from the local node */
-	ret = ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ifaces);
-	if (ret != 0) {
-		DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", pnn));
-		talloc_free(mem_ctx);
-		return -1;
-	}
-
-	if (!rec->ifaces) {
-		need_iface_check = true;
-	} else if (rec->ifaces->num != ifaces->num) {
-		need_iface_check = true;
-	} else if (memcmp(rec->ifaces, ifaces, talloc_get_size(ifaces)) != 0) {
-		need_iface_check = true;
-	}
-
-	talloc_free(rec->ifaces);
-	rec->ifaces = talloc_steal(rec, ifaces);
-
-	if (need_iface_check) {
+	if (interfaces_have_changed(ctdb, rec)) {
 		DEBUG(DEBUG_NOTICE, ("The interfaces status has changed on "
 				     "local node %u - force takeover run\n",
 				     pnn));
diff --git a/server/eventscript.c b/server/eventscript.c
index 817cb0c..41bf21d 100644
--- a/server/eventscript.c
+++ b/server/eventscript.c
@@ -707,36 +707,6 @@ static int ctdb_event_script_callback_v(struct ctdb_context *ctdb,
 {
 	struct ctdb_event_script_state *state;
 
-	state = talloc(ctdb->event_script_ctx, struct ctdb_event_script_state);
-	CTDB_NO_MEMORY(ctdb, state);
-
-	/* The callback isn't done if the context is freed. */
-	state->callback = talloc(mem_ctx, struct event_script_callback);
-	CTDB_NO_MEMORY(ctdb, state->callback);
-	DLIST_ADD(ctdb->script_callbacks, state->callback);
-	talloc_set_destructor(state->callback, remove_callback);
-	state->callback->ctdb         = ctdb;
-	state->callback->fn           = callback;
-	state->callback->private_data = private_data;
-
-	state->ctdb = ctdb;
-	state->from_user = from_user;
-	state->call = call;
-	state->options = talloc_vasprintf(state, fmt, ap);
-	state->timeout = timeval_set(ctdb->tunable.script_timeout, 0);
-	state->scripts = NULL;
-	if (state->options == NULL) {
-		DEBUG(DEBUG_ERR, (__location__ " could not allocate state->options\n"));
-		talloc_free(state);
-		return -1;
-	}
-	if (!check_options(state->call, state->options)) {
-		DEBUG(DEBUG_ERR, ("Bad eventscript options '%s' for %s\n",
-				  ctdb_eventscript_call_names[state->call], state->options));
-		talloc_free(state);
-		return -1;
-	}
-
 	if (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL) {
 		/* we guarantee that only some specifically allowed event scripts are run
 		   while in recovery */
@@ -755,7 +725,6 @@ static int ctdb_event_script_callback_v(struct ctdb_context *ctdb,
 		if (i == ARRAY_SIZE(allowed_calls)) {
 			DEBUG(DEBUG_ERR,("Refusing to run event scripts call '%s' while in recovery\n",
 				 ctdb_eventscript_call_names[call]));
-			talloc_free(state);
 			return -1;
 		}
 	}
@@ -778,6 +747,36 @@ static int ctdb_event_script_callback_v(struct ctdb_context *ctdb,
 		ctdb->current_monitor = NULL;
 	}
 
+	state = talloc(ctdb->event_script_ctx, struct ctdb_event_script_state);
+	CTDB_NO_MEMORY(ctdb, state);
+
+	/* The callback isn't done if the context is freed. */
+	state->callback = talloc(mem_ctx, struct event_script_callback);
+	CTDB_NO_MEMORY(ctdb, state->callback);
+	DLIST_ADD(ctdb->script_callbacks, state->callback);
+	talloc_set_destructor(state->callback, remove_callback);
+	state->callback->ctdb         = ctdb;
+	state->callback->fn           = callback;
+	state->callback->private_data = private_data;
+
+	state->ctdb = ctdb;
+	state->from_user = from_user;
+	state->call = call;
+	state->options = talloc_vasprintf(state, fmt, ap);
+	state->timeout = timeval_set(ctdb->tunable.script_timeout, 0);
+	state->scripts = NULL;
+	if (state->options == NULL) {
+		DEBUG(DEBUG_ERR, (__location__ " could not allocate state->options\n"));
+		talloc_free(state);
+		return -1;
+	}
+	if (!check_options(state->call, state->options)) {
+		DEBUG(DEBUG_ERR, ("Bad eventscript options '%s' for %s\n",
+				  ctdb_eventscript_call_names[state->call], state->options));
+		talloc_free(state);
+		return -1;
+	}
+
 	DEBUG(DEBUG_INFO,(__location__ " Starting eventscript %s %s\n",
 			  ctdb_eventscript_call_names[state->call],
 			  state->options));


-- 
CTDB repository


More information about the samba-cvs mailing list