[SCM] CTDB repository - branch master updated - bf1b76955db6ba00ec64686b53084268573ba6a0

Andrew Tridgell tridge at samba.org
Tue May 5 02:18:30 GMT 2009


The branch, master has been updated
       via  bf1b76955db6ba00ec64686b53084268573ba6a0 (commit)
       via  72f1c696ee77899f7973878f2568a60d199d4fea (commit)
       via  329df9e47e6ca8ab5143985a999e68f37c6d88a5 (commit)
       via  94343309992929a592348c936e09a7b4f8b512c1 (commit)
       via  bf8dae63d10498e6b6179bbacdd72f1ff0fc60be (commit)
       via  1b2029dbb055ff07367ebc1f307f5241320227b2 (commit)
       via  459e4ee135bd1cd24c15e5325906eb4ecfd550ec (commit)
       via  70f21428c9eec96bcc787be191e7478ad68956dc (commit)
       via  7af060ded5113a49832f6a08a942523a202586b3 (commit)
       via  1860a365e6ba8212e15c33016c80a2adcf8d10f4 (commit)
       via  69dc3bf60b86d8df6dc5c7c6ebf303e847fb2ba9 (commit)
      from  2f952af1a12e81a652ec9a4794db96f9593f2676 (commit)

http://gitweb.samba.org/?p=tridge/ctdb.git;a=shortlog;h=master


- Log -----------------------------------------------------------------
commit bf1b76955db6ba00ec64686b53084268573ba6a0
Author: root <root at rcn1.VSOFS1.COM>
Date:   Fri May 1 12:37:52 2009 +1000

    new version 1.0.80

commit 72f1c696ee77899f7973878f2568a60d199d4fea
Author: root <root at rcn1.VSOFS1.COM>
Date:   Fri May 1 12:30:26 2009 +1000

    When tracking the ctdb statistics, only decrement num_clients and pending_calls if the counter is > 0.
    
    Otherwise there is the chance that the statistics are reset to zero after the counter has been incremented (client connects), and when the client disconnects we decrement it to a negative number.
    
    This is a purely cosmetic patch with no operational impact on ctdb.

commit 329df9e47e6ca8ab5143985a999e68f37c6d88a5
Author: root <root at rcn1.VSOFS1.COM>
Date:   Fri May 1 01:18:27 2009 +1000

    Add a new tunable, VerifyRecoveryLock, which can be used to disable the check that the recovery daemon properly holds the recovery lock when performing a recovery.

commit 94343309992929a592348c936e09a7b4f8b512c1
Author: Ronnie Sahlberg <ronniesahlberg at gmail.com>
Date:   Thu Apr 30 17:38:30 2009 +1000

    Don't unconditionally kill/restart ctdb on "service ctdb start". Only start ctdb if it is not already running, and print an error message otherwise.

commit bf8dae63d10498e6b6179bbacdd72f1ff0fc60be
Author: Ronnie Sahlberg <ronniesahlberg at gmail.com>
Date:   Sun Apr 26 08:47:38 2009 +1000

    We only need transaction nesting disabled when we start the new transaction for the recovery.

commit 1b2029dbb055ff07367ebc1f307f5241320227b2
Author: Ronnie Sahlberg <ronniesahlberg at gmail.com>
Date:   Sun Apr 26 08:42:54 2009 +1000

    Set the TDB_NO_NESTING flag on the tdb before we start a transaction from within recovery.

commit 459e4ee135bd1cd24c15e5325906eb4ecfd550ec
Author: Ronnie Sahlberg <ronniesahlberg at gmail.com>
Date:   Sun Apr 26 08:38:37 2009 +1000

    Add TDB_NO_NESTING. When this flag is set, tdb will not allow any nested transactions, and tdb_transaction_start() will implicitly _cancel() any pending transaction before starting a new one.

commit 70f21428c9eec96bcc787be191e7478ad68956dc
Author: Ronnie Sahlberg <ronniesahlberg at gmail.com>
Date:   Fri Apr 24 18:23:48 2009 +1000

    Add a tunable, RecoveryDropAllIPs, to control how long a node that is stuck in recovery will wait before yielding all of its public addresses.
    
    This defaults to 60 seconds.
    
    This is useful if a split brain occurs due to network partitioning, since it ensures that the "other half" of the cluster (the half that does not contain the recovery master) will eventually release all IPs, avoiding a duplicate-IP situation for the public addresses.

commit 7af060ded5113a49832f6a08a942523a202586b3
Author: Ronnie Sahlberg <ronniesahlberg at gmail.com>
Date:   Fri Apr 24 18:09:51 2009 +1000

    Increase the log level of the message we print when we automatically release all IPs after having been in recovery for too long.

commit 1860a365e6ba8212e15c33016c80a2adcf8d10f4
Author: Ronnie Sahlberg <ronniesahlberg at gmail.com>
Date:   Fri Apr 24 14:41:21 2009 +1000

    Tweak some timeouts so that we still trigger a ban even if the control hangs/times out.

commit 69dc3bf60b86d8df6dc5c7c6ebf303e847fb2ba9
Author: Ronnie Sahlberg <ronniesahlberg at gmail.com>
Date:   Fri Apr 24 13:58:32 2009 +1000

    If we cannot pull a database from a node during recovery, mark that node as a "culprit" so that it will eventually become banned.

-----------------------------------------------------------------------

Summary of changes:
 config/ctdb.init             |    8 ++++++-
 include/ctdb_private.h       |    2 +
 lib/tdb/common/transaction.c |   18 +++++++++++++---
 lib/tdb/include/tdb.h        |    1 +
 packaging/RPM/ctdb.spec      |   10 ++++++++-
 server/ctdb_daemon.c         |   44 +++++++++++++++++++++++++++++++----------
 server/ctdb_freeze.c         |    2 +
 server/ctdb_recover.c        |   19 ++++++++++++-----
 server/ctdb_recoverd.c       |   28 ++++++++++++++++++++++---
 server/ctdb_tunables.c       |    6 +++-
 10 files changed, 109 insertions(+), 29 deletions(-)


Changeset truncated at 500 lines:

diff --git a/config/ctdb.init b/config/ctdb.init
index 0903c91..a45babf 100755
--- a/config/ctdb.init
+++ b/config/ctdb.init
@@ -95,9 +95,15 @@ set_retval() {
 }
 
 start() {
-        killall -q ctdbd
 	echo -n $"Starting ctdbd service: "
 
+	ctdb ping >& /dev/null
+	if [ $? = "0" ] ; then
+		echo $"CTDB is already running"
+		RETVAL=1
+		return $RETVAL
+        fi
+
 	# check all persistent databases that they look ok
 	PERSISTENT_DB_DIR="/var/ctdb/persistent"
 	[ -z "$CTDB_DBDIR" ] || {
diff --git a/include/ctdb_private.h b/include/ctdb_private.h
index 88e686b..eac27f7 100644
--- a/include/ctdb_private.h
+++ b/include/ctdb_private.h
@@ -105,6 +105,8 @@ struct ctdb_tunable {
 	uint32_t recd_ping_timeout;
 	uint32_t recd_ping_failcount;
 	uint32_t log_latency_ms;
+	uint32_t recovery_drop_all_ips;
+	uint32_t verify_recovery_lock;
 };
 
 /*
diff --git a/lib/tdb/common/transaction.c b/lib/tdb/common/transaction.c
index 4e2127b..6a34c45 100644
--- a/lib/tdb/common/transaction.c
+++ b/lib/tdb/common/transaction.c
@@ -85,6 +85,11 @@
     still available, but no transaction recovery area is used and no
     fsync/msync calls are made.
 
+  - if TDB_NO_NESTING is passed to flags in tdb open then transaction
+    nesting is disabled. tdb_transaction_start() will then implicitly
+    cancel any pending transactions and always start a new transaction
+    context instead of nesting.
+
 */
 
 
@@ -409,10 +414,15 @@ int tdb_transaction_start(struct tdb_context *tdb)
 
 	/* cope with nested tdb_transaction_start() calls */
 	if (tdb->transaction != NULL) {
-		tdb->transaction->nesting++;
-		TDB_LOG((tdb, TDB_DEBUG_TRACE, "tdb_transaction_start: nesting %d\n", 
-			 tdb->transaction->nesting));
-		return 0;
+		if (!(tdb->flags & TDB_NO_NESTING)) {
+			tdb->transaction->nesting++;
+			TDB_LOG((tdb, TDB_DEBUG_TRACE, "tdb_transaction_start: nesting %d\n", 
+				 tdb->transaction->nesting));
+			return 0;
+		} else {
+			tdb_transaction_cancel(tdb);
+			TDB_LOG((tdb, TDB_DEBUG_TRACE, "tdb_transaction_start: cancelling previous transaction\n"));
+		}
 	}
 
 	if (tdb->num_locks != 0 || tdb->global_lock.count) {
diff --git a/lib/tdb/include/tdb.h b/lib/tdb/include/tdb.h
index 0008085..6281181 100644
--- a/lib/tdb/include/tdb.h
+++ b/lib/tdb/include/tdb.h
@@ -47,6 +47,7 @@ extern "C" {
 #define TDB_NOSYNC   64 /* don't use synchronous transactions */
 #define TDB_SEQNUM   128 /* maintain a sequence number */
 #define TDB_VOLATILE   256 /* Activate the per-hashchain freelist, default 5 */
+#define TDB_NO_NESTING 512 /* Don't allow transaction nesting */
 
 #define TDB_ERRCODE(code, ret) ((tdb->ecode = (code)), ret)
 
diff --git a/packaging/RPM/ctdb.spec b/packaging/RPM/ctdb.spec
index 77bb0ba..bbd467b 100644
--- a/packaging/RPM/ctdb.spec
+++ b/packaging/RPM/ctdb.spec
@@ -4,7 +4,7 @@ Summary: Clustered TDB
 Vendor: Samba Team
 Packager: Samba Team <samba at samba.org>
 Name: ctdb
-Version: 1.0.79
+Version: 1.0.80
 Release: 1
 Epoch: 0
 License: GNU GPL version 3
@@ -131,6 +131,14 @@ fi
 %{_libdir}/pkgconfig/ctdb.pc
 
 %changelog
+* Fri May 1 2009 : Version 1.0.80
+ - change init shutdown level to 01 for ctdb so it stops before any of the other services
+ - if we cannot pull a database from a remote node during recovery, mark that node as a culprit so it becomes banned
+ - increase the loglevel when we volunteer to drop all ip addresses after being in recovery mode for too long. Make this timeout tuneable with "RecoveryDropAllIPs" and have it default to 60 seconds
+ - Add a new flag TDB_NO_NESTING to the tdb layer to prevent nested transactions, which ctdb does not use and does not expect. Have ctdb set this flag to prevent nested transactions from occurring.
+ - don't unconditionally kill off ctdb and restart it on "service ctdb start". Fail "service ctdb start" with an error if ctdb is already running.
+ - Add a new tunable "VerifyRecoveryLock" that can be set to 0 to prevent the main ctdb daemon from verifying that the recovery master has locked the reclock file correctly before allowing it to set the recovery mode to active.
+ - fix a cosmetic bug with ctdb statistics where certain counters could become negative.
 * Wed Apr 8 2009 : Version 1.0.79
  - From Mathieu Parent: add a ctdb pkgconfig file
  - Fix bug 6250
diff --git a/server/ctdb_daemon.c b/server/ctdb_daemon.c
index e730d76..8a3f564 100644
--- a/server/ctdb_daemon.c
+++ b/server/ctdb_daemon.c
@@ -178,7 +178,9 @@ static int ctdb_client_destructor(struct ctdb_client *client)
 {
 	ctdb_takeover_client_destructor_hook(client);
 	ctdb_reqid_remove(client->ctdb, client->client_id);
-	client->ctdb->statistics.num_clients--;
+	if (client->ctdb->statistics.num_clients) {
+		client->ctdb->statistics.num_clients--;
+	}
 
 	if (client->num_persistent_updates != 0) {
 		DEBUG(DEBUG_ERR,(__location__ " Client disconnecting with %u persistent updates in flight. Starting recovery\n", client->num_persistent_updates));
@@ -243,7 +245,9 @@ static void daemon_call_from_client_callback(struct ctdb_call_state *state)
 	res = ctdb_daemon_call_recv(state, dstate->call);
 	if (res != 0) {
 		DEBUG(DEBUG_ERR, (__location__ " ctdbd_call_recv() returned error\n"));
-		client->ctdb->statistics.pending_calls--;
+		if (client->ctdb->statistics.pending_calls > 0) {
+			client->ctdb->statistics.pending_calls--;
+		}
 		ctdb_latency(ctdb_db, "call_from_client_cb 1", &client->ctdb->statistics.max_call_latency, dstate->start_time);
 		return;
 	}
@@ -253,7 +257,9 @@ static void daemon_call_from_client_callback(struct ctdb_call_state *state)
 			       length, struct ctdb_reply_call);
 	if (r == NULL) {
 		DEBUG(DEBUG_ERR, (__location__ " Failed to allocate reply_call in ctdb daemon\n"));
-		client->ctdb->statistics.pending_calls--;
+		if (client->ctdb->statistics.pending_calls > 0) {
+			client->ctdb->statistics.pending_calls--;
+		}
 		ctdb_latency(ctdb_db, "call_from_client_cb 2", &client->ctdb->statistics.max_call_latency, dstate->start_time);
 		return;
 	}
@@ -267,7 +273,9 @@ static void daemon_call_from_client_callback(struct ctdb_call_state *state)
 	}
 	ctdb_latency(ctdb_db, "call_from_client_cb 3", &client->ctdb->statistics.max_call_latency, dstate->start_time);
 	talloc_free(dstate);
-	client->ctdb->statistics.pending_calls--;
+	if (client->ctdb->statistics.pending_calls > 0) {
+		client->ctdb->statistics.pending_calls--;
+	}
 }
 
 struct ctdb_daemon_packet_wrap {
@@ -320,13 +328,15 @@ static void daemon_request_call_from_client(struct ctdb_client *client,
 	struct ctdb_daemon_packet_wrap *w;
 
 	ctdb->statistics.total_calls++;
 	ctdb->statistics.pending_calls++;
 
 	ctdb_db = find_ctdb_db(client->ctdb, c->db_id);
 	if (!ctdb_db) {
 		DEBUG(DEBUG_ERR, (__location__ " Unknown database in request. db_id==0x%08x",
 			  c->db_id));
-		ctdb->statistics.pending_calls--;
+		if (client->ctdb->statistics.pending_calls > 0) {
+			ctdb->statistics.pending_calls--;
+		}
 		return;
 	}
 
@@ -344,7 +356,9 @@ static void daemon_request_call_from_client(struct ctdb_client *client,
 					   daemon_incoming_packet_wrap, w, True);
 	if (ret == -2) {
 		/* will retry later */
-		ctdb->statistics.pending_calls--;
+		if (client->ctdb->statistics.pending_calls > 0) {
+			ctdb->statistics.pending_calls--;
+		}
 		return;
 	}
 
@@ -352,7 +366,9 @@ static void daemon_request_call_from_client(struct ctdb_client *client,
 
 	if (ret != 0) {
 		DEBUG(DEBUG_ERR,(__location__ " Unable to fetch record\n"));
-		ctdb->statistics.pending_calls--;
+		if (client->ctdb->statistics.pending_calls > 0) {
+			ctdb->statistics.pending_calls--;
+		}
 		return;
 	}
 
@@ -360,7 +376,9 @@ static void daemon_request_call_from_client(struct ctdb_client *client,
 	if (dstate == NULL) {
 		ctdb_ltdb_unlock(ctdb_db, key);
 		DEBUG(DEBUG_ERR,(__location__ " Unable to allocate dstate\n"));
-		ctdb->statistics.pending_calls--;
+		if (client->ctdb->statistics.pending_calls > 0) {
+			ctdb->statistics.pending_calls--;
+		}
 		return;
 	}
 	dstate->start_time = timeval_current();
@@ -372,7 +390,9 @@ static void daemon_request_call_from_client(struct ctdb_client *client,
 	if (call == NULL) {
 		ctdb_ltdb_unlock(ctdb_db, key);
 		DEBUG(DEBUG_ERR,(__location__ " Unable to allocate call\n"));
-		ctdb->statistics.pending_calls--;
+		if (client->ctdb->statistics.pending_calls > 0) {
+			ctdb->statistics.pending_calls--;
+		}
 		ctdb_latency(ctdb_db, "call_from_client 1", &ctdb->statistics.max_call_latency, dstate->start_time);
 		return;
 	}
@@ -393,7 +413,9 @@ static void daemon_request_call_from_client(struct ctdb_client *client,
 
 	if (state == NULL) {
 		DEBUG(DEBUG_ERR,(__location__ " Unable to setup call send\n"));
-		ctdb->statistics.pending_calls--;
+		if (client->ctdb->statistics.pending_calls > 0) {
+			ctdb->statistics.pending_calls--;
+		}
 		ctdb_latency(ctdb_db, "call_from_client 2", &ctdb->statistics.max_call_latency, dstate->start_time);
 		return;
 	}
diff --git a/server/ctdb_freeze.c b/server/ctdb_freeze.c
index e39332e..6f99f8b 100644
--- a/server/ctdb_freeze.c
+++ b/server/ctdb_freeze.c
@@ -345,7 +345,9 @@ int32_t ctdb_control_transaction_start(struct ctdb_context *ctdb, uint32_t id)
 			}
 		}
 
+		tdb_add_flags(ctdb_db->ltdb->tdb, TDB_NO_NESTING);
 		ret = tdb_transaction_start(ctdb_db->ltdb->tdb);
+		tdb_remove_flags(ctdb_db->ltdb->tdb, TDB_NO_NESTING);
 
 		tdb_remove_flags(ctdb_db->ltdb->tdb, TDB_NOLOCK);
 
diff --git a/server/ctdb_recover.c b/server/ctdb_recover.c
index 12b95de..374b324 100644
--- a/server/ctdb_recover.c
+++ b/server/ctdb_recover.c
@@ -509,7 +509,7 @@ static void ctdb_set_recmode_timeout(struct event_context *ev, struct timed_even
 	   caused by the cluster filesystem being very slow to
 	   arbitrate locks immediately after a node failure.	   
 	 */
-	DEBUG(DEBUG_NOTICE,(__location__ " set_recmode timeout - allowing recmode set\n"));
+	DEBUG(DEBUG_ERR,(__location__ " set_recmode child process hung/timedout CFS slow to grant locks? (allowing recmode set anyway)\n"));
 	state->ctdb->recovery_mode = state->recmode;
 	ctdb_request_control_reply(state->ctdb, state->c, NULL, 0, NULL);
 	talloc_free(state);
@@ -569,7 +569,7 @@ ctdb_drop_all_ips_event(struct event_context *ev, struct timed_event *te,
 {
 	struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
 
-	DEBUG(DEBUG_INFO,(__location__ " Been in recovery mode for too long. Dropping all IPS\n"));
+	DEBUG(DEBUG_ERR,(__location__ " Been in recovery mode for too long. Dropping all IPS\n"));
 	talloc_free(ctdb->release_ips_ctx);
 	ctdb->release_ips_ctx = NULL;
 
@@ -600,7 +600,7 @@ int32_t ctdb_control_set_recmode(struct ctdb_context *ctdb,
 		ctdb->release_ips_ctx = talloc_new(ctdb);
 		CTDB_NO_MEMORY(ctdb, ctdb->release_ips_ctx);
 
-		event_add_timed(ctdb->ev, ctdb->release_ips_ctx, timeval_current_ofs(5,0), ctdb_drop_all_ips_event, ctdb);
+		event_add_timed(ctdb->ev, ctdb->release_ips_ctx, timeval_current_ofs(ctdb->tunable.recovery_drop_all_ips, 0), ctdb_drop_all_ips_event, ctdb);
 	}
 
 
@@ -632,11 +632,17 @@ int32_t ctdb_control_set_recmode(struct ctdb_context *ctdb,
 	state = talloc(ctdb, struct ctdb_set_recmode_state);
 	CTDB_NO_MEMORY(ctdb, state);
 
+
+	if (ctdb->tunable.verify_recovery_lock == 0) {
+		/* dont need to verify the reclock file */
+		ctdb->recovery_mode = recmode;
+		return 0;
+	}
+
 	/* For the rest of what needs to be done, we need to do this in
 	   a child process since 
 	   1, the call to ctdb_recovery_lock() can block if the cluster
 	      filesystem is in the process of recovery.
-	   2, running of the script may take a while.
 	*/
 	ret = pipe(state->fd);
 	if (ret != 0) {
@@ -657,7 +663,7 @@ int32_t ctdb_control_set_recmode(struct ctdb_context *ctdb,
 		char cc = 0;
 		close(state->fd[0]);
 
-		/* we should not be able to get the lock on the nodes list, 
+		/* we should not be able to get the lock on the reclock file, 
 		  as it should  be held by the recovery master 
 		*/
 		if (ctdb_recovery_lock(ctdb, false)) {
@@ -669,6 +675,7 @@ int32_t ctdb_control_set_recmode(struct ctdb_context *ctdb,
 		/* make sure we die when our parent dies */
 		while (kill(parent, 0) == 0 || errno != ESRCH) {
 			sleep(5);
+			write(state->fd[1], &cc, 1);
 		}
 		_exit(0);
 	}
@@ -676,7 +683,7 @@ int32_t ctdb_control_set_recmode(struct ctdb_context *ctdb,
 
 	talloc_set_destructor(state, set_recmode_destructor);
 
-	state->te = event_add_timed(ctdb->ev, state, timeval_current_ofs(3, 0),
+	state->te = event_add_timed(ctdb->ev, state, timeval_current_ofs(15, 0),
 				    ctdb_set_recmode_timeout, state);
 
 	state->fde = event_add_fd(ctdb->ev, state, state->fd[0],
diff --git a/server/ctdb_recoverd.c b/server/ctdb_recoverd.c
index 86f7a46..c6d0a7a 100644
--- a/server/ctdb_recoverd.c
+++ b/server/ctdb_recoverd.c
@@ -244,6 +244,23 @@ static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
 	rec->culprit_counter++;
 }
 
+/*
+  remember the trouble maker
+ */
+static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
+{
+	struct ctdb_context *ctdb = rec->ctdb;
+
+	if (rec->last_culprit != culprit ||
+	    timeval_elapsed(&rec->first_recover_time) > ctdb->tunable.recovery_grace_period) {
+		DEBUG(DEBUG_NOTICE,("New recovery culprit %u\n", culprit));
+		/* either a new node is the culprit, or we've decided to forgive them */
+		rec->last_culprit = culprit;
+		rec->first_recover_time = timeval_current();
+		rec->culprit_counter = 0;
+	}
+	rec->culprit_counter += count;
+}
 
 /* this callback is called for every node that failed to execute the
    start recovery event
@@ -612,7 +629,9 @@ static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
 /*
   pull all the remote database contents into the recdb
  */
-static int pull_remote_database(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, 
+static int pull_remote_database(struct ctdb_context *ctdb,
+				struct ctdb_recoverd *rec, 
+				struct ctdb_node_map *nodemap, 
 				struct tdb_wrap *recdb, uint32_t dbid)
 {
 	int j;
@@ -628,6 +647,7 @@ static int pull_remote_database(struct ctdb_context *ctdb, struct ctdb_node_map
 		if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid) != 0) {
 			DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote database from node %u\n", 
 				 nodemap->nodes[j].pnn));
+			ctdb_set_culprit_count(rec, nodemap->nodes[j].pnn, nodemap->num);
 			return -1;
 		}
 	}
@@ -1244,7 +1264,7 @@ static int recover_database(struct ctdb_recoverd *rec,
 	}
 
 	/* pull all remote databases onto the recdb */
-	ret = pull_remote_database(ctdb, nodemap, recdb, dbid);
+	ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid);
 	if (ret != 0) {
 		DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid));
 		return -1;
@@ -1326,9 +1346,9 @@ static int do_recovery(struct ctdb_recoverd *rec,
 
 	if (rec->culprit_counter > 2*nodemap->num) {
 		DEBUG(DEBUG_NOTICE,("Node %u has caused %u recoveries in %.0f seconds - banning it for %u seconds\n",
-			 culprit, rec->culprit_counter, timeval_elapsed(&rec->first_recover_time),
+			 rec->last_culprit, rec->culprit_counter, timeval_elapsed(&rec->first_recover_time),
 			 ctdb->tunable.recovery_ban_period));
-		ctdb_ban_node(rec, culprit, ctdb->tunable.recovery_ban_period);
+		ctdb_ban_node(rec, rec->last_culprit, ctdb->tunable.recovery_ban_period);
 	}
 
 	if (!ctdb_recovery_lock(ctdb, true)) {
diff --git a/server/ctdb_tunables.c b/server/ctdb_tunables.c
index 31ec89f..bab9aef 100644
--- a/server/ctdb_tunables.c
+++ b/server/ctdb_tunables.c
@@ -31,7 +31,7 @@ static const struct {
 	{ "KeepaliveInterval",    5,  offsetof(struct ctdb_tunable, keepalive_interval) },
 	{ "KeepaliveLimit",       5,  offsetof(struct ctdb_tunable, keepalive_limit) },
 	{ "MaxLACount",           7,  offsetof(struct ctdb_tunable, max_lacount) },
-	{ "RecoverTimeout",      30,  offsetof(struct ctdb_tunable, recover_timeout) },
+	{ "RecoverTimeout",      20,  offsetof(struct ctdb_tunable, recover_timeout) },
 	{ "RecoverInterval",      1,  offsetof(struct ctdb_tunable, recover_interval) },
 	{ "ElectionTimeout",      3,  offsetof(struct ctdb_tunable, election_timeout) },
 	{ "TakeoverTimeout",      5,  offsetof(struct ctdb_tunable, takeover_timeout) },
@@ -39,7 +39,7 @@ static const struct {
 	{ "TickleUpdateInterval",20,  offsetof(struct ctdb_tunable, tickle_update_interval) },
 	{ "EventScriptTimeout",  20,  offsetof(struct ctdb_tunable, script_timeout) },
 	{ "EventScriptBanCount",  5,  offsetof(struct ctdb_tunable, script_ban_count) },
-	{ "RecoveryGracePeriod", 60,  offsetof(struct ctdb_tunable, recovery_grace_period) },
+	{ "RecoveryGracePeriod", 120,  offsetof(struct ctdb_tunable, recovery_grace_period) },
 	{ "RecoveryBanPeriod",  300,  offsetof(struct ctdb_tunable, recovery_ban_period) },
 	{ "DatabaseHashSize", 10000,  offsetof(struct ctdb_tunable, database_hash_size) },
 	{ "DatabaseMaxDead",      5,  offsetof(struct ctdb_tunable, database_max_dead) },
@@ -53,6 +53,8 @@ static const struct {
 	{ "RecdPingTimeout",	 20,  offsetof(struct ctdb_tunable, recd_ping_timeout) },
 	{ "RecdFailCount",	  3,  offsetof(struct ctdb_tunable, recd_ping_failcount) },
 	{ "LogLatencyMs",         0,  offsetof(struct ctdb_tunable, log_latency_ms) },
+	{ "RecoveryDropAllIPs",  60,  offsetof(struct ctdb_tunable, recovery_drop_all_ips) },
+	{ "VerifyRecoveryLock",   1,  offsetof(struct ctdb_tunable, verify_recovery_lock) },
 };
 
 /*


-- 
CTDB repository

