[SCM] CTDB repository - branch master updated - ctdb-2.4-36-g7108067

Thu Sep 19 04:58:05 CEST 2013

The branch, master has been updated
       via  71080676bb4acbd0d9b595a30cf7fe6dddbf426f (commit)
       via  b39aa2e401fbb581207d986bac93778e9c01acdc (commit)
       via  6d44657a5e5b0df22bab2d487a503dd1c5ba79b4 (commit)
       via  0846c00597adb66bba8c9dbf63443d0c2f91a7d1 (commit)
       via  ac946ee4ad01b1e5cd1006930b9f8a190a0a58ba (commit)
       via  d921b2756d5f1c4ad7a35fe120f6fda9f5bf5686 (commit)
       via  e81589b7084c661adf617e166cc2c25b4939f841 (commit)
       via  ceb30432a9a550778aed0b422a654fc5287b82a3 (commit)
       via  85a5b544ec032173e98c9cc3b5402a76b961aa3b (commit)
       via  c51c1efe5fc7fa668597f2acd435dee16e410fc9 (commit)
       via  4cd727439a0824ebb8dbcf737d9888ffc3c41184 (commit)
       via  d66a072d9b120c78c47e726e9f29a3c1cfdd87ce (commit)
       via  428f800bcdf3dbfe19de8bb36099fbf01ebeaab4 (commit)
       via  0a51a85915486b2a8fded7ba6444b18c6c1ee8e8 (commit)
       via  00db4de53a0d86013e79e6577e7e6cf3ef864e56 (commit)
       via  52050e1c75b21961dafe2bc410268b44240ab24e (commit)
       via  a566fb5e70282c4e9f76654b1be4dc80829dced0 (commit)
       via  c58ee0eddf7ae3283e3ca8bd25575e6e677e1b17 (commit)
       via  e4eae6e3291baa299a1d0f733ab11b138ee699a3 (commit)
       via  d9c22b04d5aa7938a3965bd3144568664eb772ce (commit)
       via  48b603fbf16311daa47b01e7a33d477ed51da56d (commit)
       via  8ed29c60c0a7dd29f2a6efdf694d38e94281e1c4 (commit)
       via  e5f94c7857405bdeac233069003c3769b3dc3616 (commit)
       via  53722430ad35f80935aabd12fa07654126443b8b (commit)
       via  9a3f0c0e61ca5c17e020c6e0463d73c7cf4f7c09 (commit)
       via  f0f48f22f45e4c82eba2582efae307e25385de81 (commit)
       via  403938804caf1322f9773d63197e4303a7b2a788 (commit)
      from  c0bb147ca09e82019b05ec22995623cffc3184e2 (commit)

http://gitweb.samba.org/?p=ctdb.git;a=shortlog;h=master


- Log -----------------------------------------------------------------
commit 71080676bb4acbd0d9b595a30cf7fe6dddbf426f
Author: Martin Schwenke <martin at meltin.net>
Date:   Wed Sep 18 17:07:32 2013 +1000

    recoverd: Disable takeover runs on other nodes for 5 minutes
    
    60 seconds might not be long enough to kill all connections and
    release IPs.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit b39aa2e401fbb581207d986bac93778e9c01acdc
Author: Martin Schwenke <martin at meltin.net>
Date:   Wed Sep 18 17:06:16 2013 +1000

    recoverd: Improve logging for takeover runs
    
    Takeover runs are currently silent when they succeed.  However, they
    are important, so log something by default.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit 6d44657a5e5b0df22bab2d487a503dd1c5ba79b4
Author: Martin Schwenke <martin at meltin.net>
Date:   Wed Sep 18 16:35:18 2013 +1000

    tools/ctdb: Use the standard long timeout when disabling takeover runs
    
    This means that takeover runs will be disabled for about as long as the
    reloadips control can take to complete.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit 0846c00597adb66bba8c9dbf63443d0c2f91a7d1
Author: Martin Schwenke <martin at meltin.net>
Date:   Fri Sep 6 13:20:26 2013 +1000

    tools/ctdb: Fix arguments/semantics of rebalance node
    
    There's no reason why specifying a node should be compulsory.  This is
    a cluster-wide operation because it is implemented by the recovery
    master so multiple nodes should not be specified using -n.  However,
    the command should be able to specify multiple nodes so let it have
    its own nodestring argument.
    
    This change should be backward compatible with the old requirement of
    specifying a single node via -n.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit ac946ee4ad01b1e5cd1006930b9f8a190a0a58ba
Author: Martin Schwenke <martin at meltin.net>
Date:   Fri Sep 6 13:19:09 2013 +1000

    tools/ctdb: Make rebalancenode more robust
    
    Use a broadcast instead of trying to win the race of determining the
    recovery master and then sending the message before the recovery
    master changes.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit d921b2756d5f1c4ad7a35fe120f6fda9f5bf5686
Author: Martin Schwenke <martin at meltin.net>
Date:   Fri Sep 6 11:29:14 2013 +1000

    tests/simple: Fix the reloadips test to cope with changes to reloadips
    
    Specifying nodes to reload no longer uses -n.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit e81589b7084c661adf617e166cc2c25b4939f841
Author: Martin Schwenke <martin at meltin.net>
Date:   Fri Sep 6 11:23:07 2013 +1000

    recoverd: Be careful about freeing the list of IP rebalance target nodes
    
    It can change during a takeover run.  If it does then don't free it.
    
    There are potentially fancier solutions (e.g. check what PNNs are new
    to the list) to this issue but this is the simplest.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit ceb30432a9a550778aed0b422a654fc5287b82a3
Author: Martin Schwenke <martin at meltin.net>
Date:   Fri Sep 6 11:21:10 2013 +1000

    recoverd: reloadips should rebalance target nodes for new IPs
    
    Otherwise, if existing IPs are added to extra nodes (that have,
    perhaps, been disconnected) then those IPs will not be rebalanced
    across the extra nodes.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit 85a5b544ec032173e98c9cc3b5402a76b961aa3b
Author: Martin Schwenke <martin at meltin.net>
Date:   Thu Sep 5 15:56:51 2013 +1000

    ctdbd: Make ctdb_reloadips_child send controls asynchronously
    
    Deleting IPs can take a while because IPs are released and connections
    are killed, so send the delete controls in parallel.  In fact,
    since the set of IPs being added and deleted will be disjoint, send
    all the adds/deletes at the same time and then wait.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit c51c1efe5fc7fa668597f2acd435dee16e410fc9
Author: Martin Schwenke <martin at meltin.net>
Date:   Wed Sep 4 14:30:04 2013 +1000

    recoverd: Fix the implementation of CTDB_SRVID_REBALANCE_NODE
    
    The current implementation has a few flaws:
    
    * A takeover run is called unconditionally when the timer goes even if
      the recovery master role has moved.  This means a node other than
      the recovery master can incorrectly do a takeover run.
    
    * The rebalancing target nodes are cleared in the setup for a takeover
      run, regardless of whether the takeover run succeeds.
    
    * The timer to force a rebalance isn't cleared if another takeover run
      occurs before the deadline.  Any forced rebalancing will happen in
      the first takeover run and when the timer expires some time later
      then an unnecessary takeover run will occur.
    
    * If the recovery master role moves then the rebalancing data will
      stay on the original node and affect the next takeover run to occur
      if the recovery master role should come back to the original node.
    
    Instead, store an array of rebalance target nodes in the recovery
    master context.  This is passed as an extra argument to
    ctdb_takeover_run() each time it is called and is cleared when a
    takeover run succeeds.  The timer hangs off the array of rebalance
    target nodes, which is cleared if the node isn't the recovery master.
    
    This means that it is possible to lose rebalance data if the recovery
    master role moves.  However, that's a difficult problem to solve.  The
    best way of approaching it is probably to try to stop the recovery
    master role from jumping around unnecessarily when inactive nodes join
    the cluster.
    
    The long term solution is to avoid this nonsense completely.  The IP
    allocation algorithm needs to cache state between runs so that it
    knows which nodes have just become healthy.  This also needs recovery
    master stability.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit 4cd727439a0824ebb8dbcf737d9888ffc3c41184
Author: Martin Schwenke <martin at meltin.net>
Date:   Wed Aug 28 15:46:27 2013 +1000

    recoverd: Remove unused CTDB_SRVID_RELOAD_ALL_IPS and handler
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit d66a072d9b120c78c47e726e9f29a3c1cfdd87ce
Author: Martin Schwenke <martin at meltin.net>
Date:   Wed Aug 28 15:38:48 2013 +1000

    tools/ctdb: Reimplement reloadips
    
    This implementation disables takeover runs on all nodes before trying
    to reload IPs.  It also takes "all" or the list of PNNs as an argument
    to the command instead of to -n.  -n can still be specified with a
    single node indicating that node should be considered the current node
    - that might be confusing so could be removed.
    
    This implementation does not use CTDB_SRVID_RELOAD_ALL_IPS, so it can
    be removed.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit 428f800bcdf3dbfe19de8bb36099fbf01ebeaab4
Author: Martin Schwenke <martin at meltin.net>
Date:   Wed Aug 28 11:50:23 2013 +1000

    recoverd: Defer ipreallocated requests when takeover runs are disabled
    
    The takeover run will fail anyway but deferring seems like a cleaner
    option.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit 0a51a85915486b2a8fded7ba6444b18c6c1ee8e8
Author: Martin Schwenke <martin at meltin.net>
Date:   Wed Aug 28 11:32:54 2013 +1000

    recoverd: Reimplement CTDB_SRVID_DISABLE_IP_CHECK
    
    Use disable_takeover_runs_handler() instead of maintaining duplicate
    logic.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit 00db4de53a0d86013e79e6577e7e6cf3ef864e56
Author: Martin Schwenke <martin at meltin.net>
Date:   Tue Aug 27 15:04:40 2013 +1000

    recoverd: New SRVID message CTDB_SRVID_DISABLE_TAKEOVER_RUNS
    
    This implements a superset of CTDB_SRVID_DISABLE_IP_CHECK.  It stops
    the IP checks but also causes any attempted takeover runs to fail and
    be rescheduled.
    
    This is meant to completely stop IP movements.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit 52050e1c75b21961dafe2bc410268b44240ab24e
Author: Martin Schwenke <martin at meltin.net>
Date:   Fri Aug 16 18:47:51 2013 +1000

    tools/ctdb: Add a wait_for_all option to srvid_broadcast()
    
    This will be useful for other SRVIDs.
    
    The error checking in the handler depends on the SRVID responding with
    an int32_t where <0 indicates an error and >=0 is a PNN that
    succeeded.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit a566fb5e70282c4e9f76654b1be4dc80829dced0
Author: Martin Schwenke <martin at meltin.net>
Date:   Fri Aug 16 17:06:23 2013 +1000

    tools/ctdb: Factor out SRVID broadcast code from ipreallocate()
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit c58ee0eddf7ae3283e3ca8bd25575e6e677e1b17
Author: Martin Schwenke <martin at meltin.net>
Date:   Fri Aug 16 16:25:28 2013 +1000

    tools/ctdb: Change ipreallocate() to use a local done flag
    
    Instead of the current global variable.  This is in anticipation of
    abstracting the code.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit e4eae6e3291baa299a1d0f733ab11b138ee699a3
Author: Martin Schwenke <martin at meltin.net>
Date:   Fri Aug 16 20:02:34 2013 +1000

    recoverd: Factor out the SRVID handling code
    
    The code that handles IP reallocate requests can be reused.
    
    This also changes the result back to a SRVID caller to the PNN on
    success or a negative error code on failure.  None of the callers
    currently look at the result so this is harmless... but it will be
    useful later.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit d9c22b04d5aa7938a3965bd3144568664eb772ce
Author: Martin Schwenke <martin at meltin.net>
Date:   Fri Aug 16 20:10:10 2013 +1000

    recoverd: Make the SRVID request structure generic
    
    No need for a separate one for each SRVID.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit 48b603fbf16311daa47b01e7a33d477ed51da56d
Author: Martin Schwenke <martin at meltin.net>
Date:   Tue Sep 3 11:21:09 2013 +1000

    recoverd: Move disabling of IP checks into do_takeover_run()
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit 8ed29c60c0a7dd29f2a6efdf694d38e94281e1c4
Author: Martin Schwenke <martin at meltin.net>
Date:   Tue Sep 3 11:20:01 2013 +1000

    recoverd: do_takeover_run() should mark when a takeover run is in progress
    
    Nested takeover runs should never happen, so they should fail.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit e5f94c7857405bdeac233069003c3769b3dc3616
Author: Martin Schwenke <martin at meltin.net>
Date:   Tue Aug 27 12:19:18 2013 +1000

    recoverd: takeover_fail_callback() doesn't need to set rec->need_takeover_run
    
    It is set on every failure anyway.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit 53722430ad35f80935aabd12fa07654126443b8b
Author: Martin Schwenke <martin at meltin.net>
Date:   Mon Sep 9 12:13:11 2013 +1000

    recoverd: Fail takeover run if "ipreallocated" fails
    
    Previously flagging a failure was probably avoided because of attempts
    to run "ipreallocated" events on stopped and banned nodes, which would
    fail because they are in recovery.  Given the change to a new control
    and that fallback only retries the old method on active nodes, this
    should never fail in reasonable circumstances.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit 9a3f0c0e61ca5c17e020c6e0463d73c7cf4f7c09
Author: Martin Schwenke <martin at meltin.net>
Date:   Tue Aug 27 12:14:34 2013 +1000

    recoverd: New function do_takeover_run()
    
    Factor the calling sequence for ctdb_takeover_run() into a new
    function and call it instead.  This changes rec->need_takeover_run to
    false for each successful takeover run and that seems to be the right
    thing to do.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit f0f48f22f45e4c82eba2582efae307e25385de81
Author: Martin Schwenke <martin at meltin.net>
Date:   Tue Sep 17 12:00:26 2013 +1000

    recoverd: Stabilise the recovery master role
    
    On rare occasions a node that has been inactive will trigger
    an election when it becomes active again.  If that node has been up
    for the longest then it will win the election and the recovery master
    role will spuriously move.
    
    While a node remains inactive we reset the priority time to discourage
    it from winning elections.  The priority time will now reflect roughly
    how long the node has been active rather than how long it has been up.
    That means the most stable node is more likely to win elections.
    
    Having a stable recovery master means that disabling takeover runs
    while reloading IPs is more likely to succeed.  It also improves the
    chances of being able to cache information in the recovery master -
    for example, between takeover runs.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit 403938804caf1322f9773d63197e4303a7b2a788
Author: Martin Schwenke <martin at meltin.net>
Date:   Wed Sep 4 13:54:23 2013 +1000

    recoverd: Banned nodes should not be told to run "ipreallocated" event
    
    They will reject it because they are in recovery.  This can result in
    extra banning credits being applied to banned nodes.
    
    This corresponds to commit 9132e6814ed927fa317f333f03dedb18f75d0e5b
    from the 1.2.40 branch.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

-----------------------------------------------------------------------

Summary of changes:
 include/ctdb_private.h            |   18 +-
 include/ctdb_protocol.h           |    6 +-
 server/ctdb_monitor.c             |    2 +-
 server/ctdb_recoverd.c            |  597 ++++++++++++++++++++++++-------------
 server/ctdb_takeover.c            |  262 +++++++++-------
 tests/simple/18_ctdb_reloadips.sh |    6 +-
 tests/src/ctdb_takeover_tests.c   |   11 +-
 tools/ctdb.c                      |  294 +++++++++++-------
 8 files changed, 736 insertions(+), 460 deletions(-)


Changeset truncated at 500 lines:

diff --git a/include/ctdb_private.h b/include/ctdb_private.h
index 987502e..8eab45f 100644
--- a/include/ctdb_private.h
+++ b/include/ctdb_private.h
@@ -24,19 +24,12 @@
 #include <sys/socket.h>
 
 /*
-  recovery daemon memdump reply address
+ * Structure to support SRVID requests and replies
  */
-struct rd_memdump_reply {
-	uint32_t pnn;
-	uint64_t srvid;
-};
-
-/*
-  description for a TAKEOVER_RUN message reply address
- */
-struct takeover_run_reply {
+struct srvid_request {
 	uint32_t pnn;
 	uint64_t srvid;
+	uint32_t data;
 };
 
 /*
@@ -1234,8 +1227,9 @@ int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
 int ctdb_set_event_script(struct ctdb_context *ctdb, const char *script);
 int ctdb_set_event_script_dir(struct ctdb_context *ctdb, const char *script_dir);
 int ctdb_set_notification_script(struct ctdb_context *ctdb, const char *script);
-void lcp2_forcerebalance(struct ctdb_context *ctdb, uint32_t pnn);
-int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, client_async_callback fail_callback, void *callback_data);
+int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
+		      uint32_t *force_rebalance_nodes,
+		      client_async_callback fail_callback, void *callback_data);
 
 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id, 
 				TDB_DATA indata);
diff --git a/include/ctdb_protocol.h b/include/ctdb_protocol.h
index 7b8298b..73ce0e1 100644
--- a/include/ctdb_protocol.h
+++ b/include/ctdb_protocol.h
@@ -147,10 +147,8 @@ struct ctdb_call_info {
 */
 #define CTDB_SRVID_REBALANCE_NODE 0xFB01000000000000LL
 
-/*
-   a message handler ID meaning to ask recovery master to reload all ips
- */
-#define CTDB_SRVID_RELOAD_ALL_IPS 0xFB02000000000000LL
+/* A message handler ID to stop takeover runs from occurring */
+#define CTDB_SRVID_DISABLE_TAKEOVER_RUNS 0xFB03000000000000LL
 
 /* A message id to ask the recovery daemon to temporarily disable the
    public ip checks
diff --git a/server/ctdb_monitor.c b/server/ctdb_monitor.c
index c23477d..acd68c8 100644
--- a/server/ctdb_monitor.c
+++ b/server/ctdb_monitor.c
@@ -113,7 +113,7 @@ static void ctdb_health_callback(struct ctdb_context *ctdb, int status, void *p)
 	uint32_t next_interval;
 	int ret;
 	TDB_DATA rddata;
-	struct takeover_run_reply rd;
+	struct srvid_request rd;
 	const char *state_str = NULL;
 
 	c.pnn = ctdb->pnn;
diff --git a/server/ctdb_recoverd.c b/server/ctdb_recoverd.c
index da88f16..2e8ba58 100644
--- a/server/ctdb_recoverd.c
+++ b/server/ctdb_recoverd.c
@@ -30,19 +30,93 @@
 #include "dlinklist.h"
 
 
-/* most recent reload all ips request we need to perform during the 
-   next monitoring loop
-*/
-struct reloadips_all_reply *reload_all_ips_request = NULL;
+/* List of SRVID requests that need to be processed */
+struct srvid_list {
+	struct srvid_list *next, *prev;
+	struct srvid_request *request;
+};
 
-/* list of "ctdb ipreallocate" processes to call back when we have
-   finished the takeover run.
-*/
-struct ip_reallocate_list {
-	struct ip_reallocate_list *next;
-	struct rd_memdump_reply *rd;
+struct srvid_requests {
+	struct srvid_list *requests;
 };
 
+static void srvid_request_reply(struct ctdb_context *ctdb,
+				struct srvid_request *request,
+				TDB_DATA result)
+{
+	/* Someone that sent srvid==0 does not want a reply */
+	if (request->srvid == 0) {
+		talloc_free(request);
+		return;
+	}
+
+	if (ctdb_client_send_message(ctdb, request->pnn, request->srvid,
+				     result) == 0) {
+		DEBUG(DEBUG_INFO,("Sent SRVID reply to %u:%llu\n",
+				  (unsigned)request->pnn,
+				  (unsigned long long)request->srvid));
+	} else {
+		DEBUG(DEBUG_ERR,("Failed to send SRVID reply to %u:%llu\n",
+				 (unsigned)request->pnn,
+				 (unsigned long long)request->srvid));
+	}
+
+	talloc_free(request);
+}
+
+static void srvid_requests_reply(struct ctdb_context *ctdb,
+				 struct srvid_requests **requests,
+				 TDB_DATA result)
+{
+	struct srvid_list *r;
+
+	for (r = (*requests)->requests; r != NULL; r = r->next) {
+		srvid_request_reply(ctdb, r->request, result);
+	}
+
+	/* Free the list structure... */
+	TALLOC_FREE(*requests);
+}
+
+static void srvid_request_add(struct ctdb_context *ctdb,
+			      struct srvid_requests **requests,
+			      struct srvid_request *request)
+{
+	struct srvid_list *t;
+	int32_t ret;
+	TDB_DATA result;
+
+	if (*requests == NULL) {
+		*requests = talloc_zero(ctdb, struct srvid_requests);
+		if (*requests == NULL) {
+			goto nomem;
+		}
+	}
+
+	t = talloc_zero(*requests, struct srvid_list);
+	if (t == NULL) {
+		/* If *requests was just allocated above then free it */
+		if ((*requests)->requests == NULL) {
+			TALLOC_FREE(*requests);
+		}
+		goto nomem;
+	}
+
+	t->request = (struct srvid_request *)talloc_steal(t, request);
+	DLIST_ADD((*requests)->requests, t);
+
+	return;
+
+nomem:
+	/* Failed to add the request to the list.  Send a fail. */
+	DEBUG(DEBUG_ERR, (__location__
+			  " Out of memory, failed to queue SRVID request\n"));
+	ret = -ENOMEM;
+	result.dsize = sizeof(ret);
+	result.dptr = (uint8_t *)&ret;
+	srvid_request_reply(ctdb, request, result);
+}
+
 struct ctdb_banning_state {
 	uint32_t count;
 	struct timeval last_reported_time;
@@ -65,11 +139,11 @@ struct ctdb_recoverd {
 	struct timed_event *send_election_te;
 	struct timed_event *election_timeout;
 	struct vacuum_info *vacuum_info;
-	TALLOC_CTX *ip_reallocate_ctx;
-	struct ip_reallocate_list *reallocate_callers;
-	TALLOC_CTX *ip_check_disable_ctx;
+	struct srvid_requests *reallocate_requests;
+	bool takeover_run_in_progress;
+	TALLOC_CTX *takeover_runs_disable_ctx;
 	struct ctdb_control_get_ifaces *ifaces;
-	TALLOC_CTX *deferred_rebalance_ctx;
+	uint32_t *force_rebalance_nodes;
 };
 
 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
@@ -1468,7 +1542,7 @@ static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
 		}
 
 		if (ctdb->do_checkpublicip &&
-		    (rec->ip_check_disable_ctx == NULL) &&
+		    rec->takeover_runs_disable_ctx == NULL &&
 		    verify_remote_ip_allocation(ctdb,
 						 node->known_public_ips,
 						 node->pnn)) {
@@ -1546,7 +1620,6 @@ static void takeover_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn,
 		DEBUG(DEBUG_ERR, ("Setting node %u as recovery fail culprit\n", node_pnn));
 
 		ctdb_set_culprit(rec, node_pnn);
-		rec->need_takeover_run = true;
 	}
 }
 
@@ -1580,6 +1653,102 @@ static void ban_misbehaving_nodes(struct ctdb_recoverd *rec, bool *self_ban)
 	}
 }
 
+static bool do_takeover_run(struct ctdb_recoverd *rec,
+			    struct ctdb_node_map *nodemap,
+			    bool banning_credits_on_fail)
+{
+	uint32_t *nodes = NULL;
+	struct srvid_request dtr;
+	TDB_DATA data;
+	int i;
+	uint32_t *rebalance_nodes = rec->force_rebalance_nodes;
+	int ret;
+	bool ok;
+
+	DEBUG(DEBUG_NOTICE, ("Takeover run starting\n"));
+
+	if (rec->takeover_run_in_progress) {
+		DEBUG(DEBUG_ERR, (__location__
+				  " takeover run already in progress \n"));
+		ok = false;
+		goto done;
+	}
+
+	rec->takeover_run_in_progress = true;
+
+	/* If takeover runs are in disabled then fail... */
+	if (rec->takeover_runs_disable_ctx != NULL) {
+		DEBUG(DEBUG_ERR,
+		      ("Takeover runs are disabled so refusing to run one\n"));
+		ok = false;
+		goto done;
+	}
+
+	/* Disable IP checks (takeover runs, really) on other nodes
+	 * while doing this takeover run.  This will stop those other
+	 * nodes from triggering takeover runs when think they should
+	 * be hosting an IP but it isn't yet on an interface.  Don't
+	 * wait for replies since a failure here might cause some
+	 * noise in the logs but will not actually cause a problem.
+	 */
+	dtr.srvid = 0; /* No reply */
+	dtr.pnn = -1;
+
+	data.dptr  = (uint8_t*)&dtr;
+	data.dsize = sizeof(dtr);
+
+	nodes = list_of_connected_nodes(rec->ctdb, nodemap, rec, false);
+
+	/* Disable for 5 minutes.  This can be a tunable later if
+	 * necessary.
+	 */
+	dtr.data = 300;
+	for (i = 0; i < talloc_array_length(nodes); i++) {
+		if (ctdb_client_send_message(rec->ctdb, nodes[i],
+					     CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
+					     data) != 0) {
+			DEBUG(DEBUG_INFO,("Failed to disable takeover runs\n"));
+		}
+	}
+
+	ret = ctdb_takeover_run(rec->ctdb, nodemap,
+				rec->force_rebalance_nodes,
+				takeover_fail_callback,
+				banning_credits_on_fail ? rec : NULL);
+
+	/* Reenable takeover runs and IP checks on other nodes */
+	dtr.data = 0;
+	for (i = 0; i < talloc_array_length(nodes); i++) {
+		if (ctdb_client_send_message(rec->ctdb, nodes[i],
+					     CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
+					     data) != 0) {
+			DEBUG(DEBUG_INFO,("Failed to reenable takeover runs\n"));
+		}
+	}
+
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR, ("ctdb_takeover_run() failed\n"));
+		ok = false;
+		goto done;
+	}
+
+	ok = true;
+	/* Takeover run was successful so clear force rebalance targets */
+	if (rebalance_nodes == rec->force_rebalance_nodes) {
+		TALLOC_FREE(rec->force_rebalance_nodes);
+	} else {
+		DEBUG(DEBUG_WARNING,
+		      ("Rebalance target nodes changed during takeover run - not clearing\n"));
+	}
+done:
+	rec->need_takeover_run = !ok;
+	talloc_free(nodes);
+	rec->takeover_run_in_progress = false;
+
+	DEBUG(DEBUG_NOTICE, ("Takeover run %s\n", ok ? "completed successfully" : "unsuccessful"));
+	return ok;
+}
+
 
 /*
   we are the recmaster, and recovery is needed - start a recovery run
@@ -1856,12 +2025,8 @@ static int do_recovery(struct ctdb_recoverd *rec,
 		rec->need_takeover_run = true;
 		return -1;
 	}
-	rec->need_takeover_run = false;
-	ret = ctdb_takeover_run(ctdb, nodemap, takeover_fail_callback, NULL);
-	if (ret != 0) {
-		DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses. ctdb_takeover_run() failed.\n"));
-		rec->need_takeover_run = true;
-	}
+
+	do_takeover_run(rec, nodemap, false);
 
 	/* execute the "recovered" event script on all nodes */
 	ret = run_recovered_eventscript(rec, nodemap, "do_recovery");
@@ -2109,14 +2274,14 @@ static void mem_dump_handler(struct ctdb_context *ctdb, uint64_t srvid,
 	TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
 	TDB_DATA *dump;
 	int ret;
-	struct rd_memdump_reply *rd;
+	struct srvid_request *rd;
 
-	if (data.dsize != sizeof(struct rd_memdump_reply)) {
+	if (data.dsize != sizeof(struct srvid_request)) {
 		DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
 		talloc_free(tmp_ctx);
 		return;
 	}
-	rd = (struct rd_memdump_reply *)data.dptr;
+	rd = (struct srvid_request *)data.dptr;
 
 	dump = talloc_zero(tmp_ctx, TDB_DATA);
 	if (dump == NULL) {
@@ -2198,42 +2363,37 @@ static void reload_nodes_handler(struct ctdb_context *ctdb, uint64_t srvid,
 }
 
 
-static void reenable_ip_check(struct event_context *ev, struct timed_event *te, 
-			      struct timeval yt, void *p)
+static void ctdb_rebalance_timeout(struct event_context *ev,
+				   struct timed_event *te,
+				   struct timeval t, void *p)
 {
 	struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
 
-	talloc_free(rec->ip_check_disable_ctx);
-	rec->ip_check_disable_ctx = NULL;
-}
-
-
-static void ctdb_rebalance_timeout(struct event_context *ev, struct timed_event *te, 
-				  struct timeval t, void *p)
-{
-	struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
-	struct ctdb_context *ctdb = rec->ctdb;
-	int ret;
-
-	DEBUG(DEBUG_NOTICE,("Rebalance all nodes that have had ip assignment changes.\n"));
-
-	ret = ctdb_takeover_run(ctdb, rec->nodemap, takeover_fail_callback, NULL);
-	if (ret != 0) {
-		DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses. ctdb_takeover_run() failed.\n"));
-		rec->need_takeover_run = true;
+	if (rec->force_rebalance_nodes == NULL) {
+		DEBUG(DEBUG_ERR,
+		      ("Rebalance timeout occurred - no nodes to rebalance\n"));
+		return;
 	}
 
-	talloc_free(rec->deferred_rebalance_ctx);
-	rec->deferred_rebalance_ctx = NULL;
+	DEBUG(DEBUG_NOTICE,
+	      ("Rebalance timeout occurred - do takeover run\n"));
+	do_takeover_run(rec, rec->nodemap, false);
 }
 
 	
-static void recd_node_rebalance_handler(struct ctdb_context *ctdb, uint64_t srvid, 
-			     TDB_DATA data, void *private_data)
+static void recd_node_rebalance_handler(struct ctdb_context *ctdb,
+					uint64_t srvid,
+					TDB_DATA data, void *private_data)
 {
 	uint32_t pnn;
+	uint32_t *t;
+	int len;
 	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
 
+	if (rec->recmaster != ctdb_get_pnn(ctdb)) {
+		return;
+	}
+
 	if (data.dsize != sizeof(uint32_t)) {
 		DEBUG(DEBUG_ERR,(__location__ " Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(uint32_t)));
 		return;
@@ -2245,14 +2405,33 @@ static void recd_node_rebalance_handler(struct ctdb_context *ctdb, uint64_t srvi
 
 	pnn = *(uint32_t *)&data.dptr[0];
 
-	lcp2_forcerebalance(ctdb, pnn);
-	DEBUG(DEBUG_NOTICE,("Received message to perform node rebalancing for node %d\n", pnn));
+	DEBUG(DEBUG_NOTICE,("Setting up rebalance of IPs to node %u\n", pnn));
 
-	if (rec->deferred_rebalance_ctx != NULL) {
-		talloc_free(rec->deferred_rebalance_ctx);
+	/* Copy any existing list of nodes.  There's probably some
+	 * sort of realloc variant that will do this but we need to
+	 * make sure that freeing the old array also cancels the timer
+	 * event for the timeout... not sure if realloc will do that.
+	 */
+	len = (rec->force_rebalance_nodes != NULL) ?
+		talloc_array_length(rec->force_rebalance_nodes) :
+		0;
+
+	/* This allows duplicates to be added but they don't cause
+	 * harm.  A call to add a duplicate PNN arguably means that
+	 * the timeout should be reset, so this is the simplest
+	 * solution.
+	 */
+	t = talloc_zero_array(rec, uint32_t, len+1);
+	CTDB_NO_MEMORY_VOID(ctdb, t);
+	if (len > 0) {
+		memcpy(t, rec->force_rebalance_nodes, sizeof(uint32_t) * len);
 	}
-	rec->deferred_rebalance_ctx = talloc_new(rec);
-	event_add_timed(ctdb->ev, rec->deferred_rebalance_ctx, 
+	t[len] = pnn;
+
+	talloc_free(rec->force_rebalance_nodes);
+
+	rec->force_rebalance_nodes = t;
+	event_add_timed(ctdb->ev, rec->force_rebalance_nodes,
 			timeval_current_ofs(ctdb->tunable.deferred_rebalance_on_node_add, 0),
 			ctdb_rebalance_timeout, rec);
 }
@@ -2281,153 +2460,163 @@ static void recd_update_ip_handler(struct ctdb_context *ctdb, uint64_t srvid,
 }
 
 
-static void disable_ip_check_handler(struct ctdb_context *ctdb, uint64_t srvid, 
-			     TDB_DATA data, void *private_data)
+static void clear_takeover_runs_disable(struct ctdb_recoverd *rec)
 {
-	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
-	uint32_t timeout;
+	TALLOC_FREE(rec->takeover_runs_disable_ctx);
+}
 
-	if (rec->ip_check_disable_ctx != NULL) {
-		talloc_free(rec->ip_check_disable_ctx);
-		rec->ip_check_disable_ctx = NULL;
-	}
+static void reenable_takeover_runs(struct event_context *ev,
+				   struct timed_event *te,
+				   struct timeval yt, void *p)
+{
+	struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
 
-	if (data.dsize != sizeof(uint32_t)) {
+	DEBUG(DEBUG_NOTICE,("Reenabling takeover runs after timeout\n"));
+	clear_takeover_runs_disable(rec);
+}
+
+static void disable_takeover_runs_handler(struct ctdb_context *ctdb,
+					  uint64_t srvid, TDB_DATA data,
+					  void *private_data)
+{
+	struct ctdb_recoverd *rec = talloc_get_type(private_data,
+						    struct ctdb_recoverd);
+	struct srvid_request *r;
+	uint32_t timeout;
+	TDB_DATA result;
+	int32_t ret = 0;
+
+	/* Validate input data */
+	if (data.dsize != sizeof(struct srvid_request)) {
 		DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
-				 "expexting %lu\n", (long unsigned)data.dsize,
-				 (long unsigned)sizeof(uint32_t)));
-		return;
+				 "expecting %lu\n", (long unsigned)data.dsize,
+				 (long unsigned)sizeof(struct srvid_request)));
+		ret = -EINVAL;
+		goto done;
 	}
 	if (data.dptr == NULL) {
-		DEBUG(DEBUG_ERR,(__location__ " No data recaived\n"));


-- 
CTDB repository