[SCM] CTDB repository - branch master updated - ctdb-1.13-249-gfc18188

Wed Aug 8 20:19:05 MDT 2012

The branch, master has been updated
       via  fc18188b7b63eb0dafbc47e3abf80e306e1dfc31 (commit)
       via  e7dc10da3ced54ea9d719ad167ee42dcca8dce75 (commit)
       via  a0c30c820fd47d4f8620dc060c825be10754f5d1 (commit)
       via  f586e8a2911fc6e7f6698f516653145d8fd45dad (commit)
       via  cc9d96f4248e45ea99c5f00db1526426ac26fbc2 (commit)
       via  9119a568c2b4601318f7751f537dca2f92a7230b (commit)
      from  c29a943f9bbcfecb861e71d007c7698a53dc8773 (commit)

http://gitweb.samba.org/?p=ctdb.git;a=shortlog;h=master


- Log -----------------------------------------------------------------
commit fc18188b7b63eb0dafbc47e3abf80e306e1dfc31
Author: Martin Schwenke <martin at meltin.net>
Date:   Fri Jul 6 20:43:46 2012 +1000

    recoverd: All inactive nodes should yield recovery master role
    
    Not just stopped nodes.  In reality, this means that banned nodes will
    also yield, since nodes in the other inactive states won't be running
    a daemon.
    
    This seems sensible since if another node notices that an inactive
    node is the recovery master then it will force an election anyway.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit e7dc10da3ced54ea9d719ad167ee42dcca8dce75
Author: Martin Schwenke <martin at meltin.net>
Date:   Fri Jul 6 20:36:48 2012 +1000

    recoverd: An inactive node should not force recovery master elections
    
    An inactive node can't become the recovery master.  So if an inactive
    node notices that the recovery master is inactive, it shouldn't force
    an election for recovery master and nominate itself as a candidate.
    This can cause the recovery master to flip-flop between nodes when all
    nodes are inactive.
    
    If there is actually an active node then it will trigger the election.
    
    This is fairly cosmetic but is a step along the way towards ironing
    out weirdness when all nodes are stopped.
    
    Also, fix a related comment.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit a0c30c820fd47d4f8620dc060c825be10754f5d1
Author: Martin Schwenke <martin at meltin.net>
Date:   Tue Jul 3 10:30:29 2012 +1000

    recoverd: main_loop() should not verify local IPs if node is stopped
    
    Doing these checks is pointless and potentially causes unnecessary log
    messages.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit f586e8a2911fc6e7f6698f516653145d8fd45dad
Author: Martin Schwenke <martin at meltin.net>
Date:   Tue Jul 3 10:15:25 2012 +1000

    recoverd: verify_local_ip_allocation() should dup ifaces before early return
    
    If CTDB starts in STOPPED state then it thinks it is in the middle of
    a recovery.  rec->ifaces is also NULL and an early exit further down
    (that checks to see if a recovery is in progress) means that it stays
    that way.
    
    However, each time this function is entered the need for a takeover
    run is re-flagged.  The takeover run never happens due to the
    early exit, causing a couple of unneeded messages to be logged each
    time.
    
    This is avoided by moving the code that sets rec->ifaces so that it is
    executed earlier and, in this case, in the middle of a recovery.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit cc9d96f4248e45ea99c5f00db1526426ac26fbc2
Author: Martin Schwenke <martin at meltin.net>
Date:   Mon Jul 2 17:26:04 2012 +1000

    recoverd: Update a log message that has bit-rotted
    
    This message used to be correct because the ipreallocated event only
    handled updating the NAT gateway.  However, that has changed so the
    message needs to be updated.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit 9119a568c2b4601318f7751f537dca2f92a7230b
Author: Martin Schwenke <martin at meltin.net>
Date:   Fri Jun 22 14:01:02 2012 +1000

    recoverd: Fix bogus info in message about changed flags
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

-----------------------------------------------------------------------

Summary of changes:
 server/ctdb_recoverd.c |   25 +++++++++++++++++--------
 server/ctdb_takeover.c |   11 ++++++++---
 2 files changed, 25 insertions(+), 11 deletions(-)


Changeset truncated at 500 lines:

diff --git a/server/ctdb_recoverd.c b/server/ctdb_recoverd.c
index 7b7435c..02ce69f 100644
--- a/server/ctdb_recoverd.c
+++ b/server/ctdb_recoverd.c
@@ -2484,7 +2484,7 @@ static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid,
 	}
 
 	if (nodemap->nodes[i].flags != c->new_flags) {
-		DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x  was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
+		DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x  was 0x%x\n", c->pnn, c->new_flags, nodemap->nodes[i].flags));
 	}
 
 	disabled_flag_changed =  (nodemap->nodes[i].flags ^ c->new_flags) & NODE_FLAGS_DISABLED;
@@ -2791,6 +2791,9 @@ static int verify_local_ip_allocation(struct ctdb_context *ctdb, struct ctdb_rec
 		need_iface_check = true;
 	}
 
+	talloc_free(rec->ifaces);
+	rec->ifaces = talloc_steal(rec, ifaces);
+
 	if (need_iface_check) {
 		DEBUG(DEBUG_NOTICE, ("The interfaces status has changed on "
 				     "local node %u - force takeover run\n",
@@ -2839,9 +2842,6 @@ static int verify_local_ip_allocation(struct ctdb_context *ctdb, struct ctdb_rec
 		return 0;
 	}
 
-	talloc_free(rec->ifaces);
-	rec->ifaces = talloc_steal(rec, ifaces);
-
 	/* verify that we have the ip addresses we should have
 	   and we dont have ones we shouldnt have.
 	   if we find an inconsistency we set recmode to
@@ -3325,8 +3325,8 @@ static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
 	/* If the local node is stopped, verify we are not the recmaster 
 	   and yield this role if so
 	*/
-	if ((nodemap->nodes[pnn].flags & NODE_FLAGS_STOPPED) && (rec->recmaster == pnn)) {
-		DEBUG(DEBUG_ERR,("Local node is STOPPED. Yielding recmaster role\n"));
+	if ((nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE) && (rec->recmaster == pnn)) {
+		DEBUG(DEBUG_ERR,("Local node is INACTIVE. Yielding recmaster role\n"));
 		force_election(rec, pnn, nodemap);
 		return;
 	}
@@ -3387,7 +3387,7 @@ static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
 		return;
 	}
 
-	/* grap the nodemap from the recovery master to check if it is banned */
+	/* get nodemap from the recovery master to check if it is inactive */
 	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, 
 				   mem_ctx, &recmaster_nodemap);
 	if (ret != 0) {
@@ -3397,12 +3397,21 @@ static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
 	}
 
 
-	if (recmaster_nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
+	if ((recmaster_nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) &&
+	    (rec->node_flags & NODE_FLAGS_INACTIVE) == 0) {
 		DEBUG(DEBUG_NOTICE, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].pnn));
 		force_election(rec, pnn, nodemap);
 		return;
 	}
 
+	/* If this node is stopped then it is not the recovery master
+	 * so the only remaining action is to potentially to verify
+	 * the local IP allocation below.  This won't accomplish
+	 * anything useful so skip it.
+	 */
+	if (rec->node_flags & NODE_FLAGS_STOPPED) {
+		return;
+	}
 
 	/* verify that we have all ip addresses we should have and we dont
 	 * have addresses we shouldnt have.
diff --git a/server/ctdb_takeover.c b/server/ctdb_takeover.c
index 538f776..40bf4bc 100644
--- a/server/ctdb_takeover.c
+++ b/server/ctdb_takeover.c
@@ -2244,8 +2244,13 @@ int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
 	}
 
 ipreallocated:
-	/* tell all nodes to update natwg */
-	/* send the flags update natgw on all connected nodes */
+	/* 
+	 * Tell all nodes to run eventscripts to process the
+	 * "ipreallocated" event.  This can do a lot of things,
+	 * including restarting services to reconfigure them if public
+	 * IPs have moved.  Once upon a time this event only used to
+	 * update natwg.
+	 */
 	data.dptr  = discard_const("ipreallocated");
 	data.dsize = strlen((char *)data.dptr) + 1; 
 	nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
@@ -2254,7 +2259,7 @@ ipreallocated:
 				      false, data,
 				      NULL, NULL,
 				      NULL) != 0) {
-		DEBUG(DEBUG_ERR, (__location__ " ctdb_control to updatenatgw failed\n"));
+		DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
 	}
 
 	talloc_free(tmp_ctx);


-- 
CTDB repository