[SCM] CTDB repository - branch master updated - ctdb-1.0.79-7-g1860a36

Fri Apr 24 04:47:49 GMT 2009

The branch, master has been updated
       via  1860a365e6ba8212e15c33016c80a2adcf8d10f4 (commit)
       via  69dc3bf60b86d8df6dc5c7c6ebf303e847fb2ba9 (commit)
       via  2f952af1a12e81a652ec9a4794db96f9593f2676 (commit)
       via  373a04d6a47a5c7cf8d822d60be1d9b9ab4df784 (commit)
       via  43c718ebd402abe2e1e6360ce21b08cc13030c88 (commit)
       via  0f2664a82128e38ac49c6e8e784129b779616a27 (commit)
       via  1a110993af1706e0c5df500fa7d14e16c0668372 (commit)
      from  6c900aa343096c5e1e297e055c36832ffa5028dd (commit)

http://gitweb.samba.org/?p=sahlberg/ctdb.git;a=shortlog;h=master


- Log -----------------------------------------------------------------
commit 1860a365e6ba8212e15c33016c80a2adcf8d10f4
Author: Ronnie Sahlberg <ronniesahlberg at gmail.com>
Date:   Fri Apr 24 14:41:21 2009 +1000

    tweak some timeouts so that we do trigger a banning even if the control hangs/times out

commit 69dc3bf60b86d8df6dc5c7c6ebf303e847fb2ba9
Author: Ronnie Sahlberg <ronniesahlberg at gmail.com>
Date:   Fri Apr 24 13:58:32 2009 +1000

    If we cannot pull a database from a node during recovery, mark this node as a "culprit" so that it will eventually become banned.

commit 2f952af1a12e81a652ec9a4794db96f9593f2676
Author: Andrew Tridgell <tridge at samba.org>
Date:   Thu Apr 23 11:35:42 2009 +1000

    change shutdown level for ctdb to be 01
    
    We want ctdb to shut down first, as it manages many other
    services. With the old level of 36 the NFS service would shut down
    first, and that would trigger ctdb to do a recovery. Then ctdb itself
    would be shut down a few seconds later, which causes a lot of error
    messages in the other nodes' logs.

commit 373a04d6a47a5c7cf8d822d60be1d9b9ab4df784
Merge: 43c718ebd402abe2e1e6360ce21b08cc13030c88 6c900aa343096c5e1e297e055c36832ffa5028dd
Author: Andrew Tridgell <tridge at samba.org>
Date:   Thu Apr 23 11:00:16 2009 +1000

    Merge commit 'ronnie/master'

commit 43c718ebd402abe2e1e6360ce21b08cc13030c88
Merge: 0f2664a82128e38ac49c6e8e784129b779616a27 13e2c9044950f21918e4610726e73ed3d8f76920
Author: Andrew Tridgell <tridge at samba.org>
Date:   Tue Apr 7 17:07:41 2009 +1000

    Merge commit 'ronnie/master'

commit 0f2664a82128e38ac49c6e8e784129b779616a27
Merge: 1a110993af1706e0c5df500fa7d14e16c0668372 ecf26af22245d0f55aded50e8768b0c21495f98c
Author: Andrew Tridgell <tridge at samba.org>
Date:   Fri Mar 6 11:26:20 2009 +1100

    Merge commit 'ronnie/master'

commit 1a110993af1706e0c5df500fa7d14e16c0668372
Merge: 4777b74b1e2eebe54cf27f3303f60e49023e7f6a 59a04a50e64aae0a89b165d0428e23a8bcf8eb24
Author: Andrew Tridgell <tridge at samba.org>
Date:   Mon Feb 9 10:53:47 2009 +1100

    Merge commit 'ronnie/master'

-----------------------------------------------------------------------

Summary of changes:
 config/ctdb.init       |    2 +-
 server/ctdb_recoverd.c |   28 ++++++++++++++++++++++++----
 server/ctdb_tunables.c |    4 ++--
 3 files changed, 27 insertions(+), 7 deletions(-)


Changeset truncated at 500 lines:

diff --git a/config/ctdb.init b/config/ctdb.init
index 24a206d..0903c91 100755
--- a/config/ctdb.init
+++ b/config/ctdb.init
@@ -3,7 +3,7 @@
 ##############################
 # ctdb:                        Starts the clustered tdb daemon
 #
-# chkconfig:           - 90 36
+# chkconfig:           - 90 01
 #
 # description:                 Starts and stops the clustered tdb daemon
 # pidfile:             /var/run/ctdbd/ctdbd.pid
diff --git a/server/ctdb_recoverd.c b/server/ctdb_recoverd.c
index 86f7a46..c6d0a7a 100644
--- a/server/ctdb_recoverd.c
+++ b/server/ctdb_recoverd.c
@@ -244,6 +244,23 @@ static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
 	rec->culprit_counter++;
 }
 
+/*
+  remember the trouble maker
+ */
+static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
+{
+	struct ctdb_context *ctdb = rec->ctdb;
+
+	if (rec->last_culprit != culprit ||
+	    timeval_elapsed(&rec->first_recover_time) > ctdb->tunable.recovery_grace_period) {
+		DEBUG(DEBUG_NOTICE,("New recovery culprit %u\n", culprit));
+		/* either a new node is the culprit, or we've decided to forgive them */
+		rec->last_culprit = culprit;
+		rec->first_recover_time = timeval_current();
+		rec->culprit_counter = 0;
+	}
+	rec->culprit_counter += count;
+}
 
 /* this callback is called for every node that failed to execute the
    start recovery event
@@ -612,7 +629,9 @@ static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
 /*
   pull all the remote database contents into the recdb
  */
-static int pull_remote_database(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, 
+static int pull_remote_database(struct ctdb_context *ctdb,
+				struct ctdb_recoverd *rec, 
+				struct ctdb_node_map *nodemap, 
 				struct tdb_wrap *recdb, uint32_t dbid)
 {
 	int j;
@@ -628,6 +647,7 @@ static int pull_remote_database(struct ctdb_context *ctdb, struct ctdb_node_map
 		if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid) != 0) {
 			DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote database from node %u\n", 
 				 nodemap->nodes[j].pnn));
+			ctdb_set_culprit_count(rec, nodemap->nodes[j].pnn, nodemap->num);
 			return -1;
 		}
 	}
@@ -1244,7 +1264,7 @@ static int recover_database(struct ctdb_recoverd *rec,
 	}
 
 	/* pull all remote databases onto the recdb */
-	ret = pull_remote_database(ctdb, nodemap, recdb, dbid);
+	ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid);
 	if (ret != 0) {
 		DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid));
 		return -1;
@@ -1326,9 +1346,9 @@ static int do_recovery(struct ctdb_recoverd *rec,
 
 	if (rec->culprit_counter > 2*nodemap->num) {
 		DEBUG(DEBUG_NOTICE,("Node %u has caused %u recoveries in %.0f seconds - banning it for %u seconds\n",
-			 culprit, rec->culprit_counter, timeval_elapsed(&rec->first_recover_time),
+			 rec->last_culprit, rec->culprit_counter, timeval_elapsed(&rec->first_recover_time),
 			 ctdb->tunable.recovery_ban_period));
-		ctdb_ban_node(rec, culprit, ctdb->tunable.recovery_ban_period);
+		ctdb_ban_node(rec, rec->last_culprit, ctdb->tunable.recovery_ban_period);
 	}
 
 	if (!ctdb_recovery_lock(ctdb, true)) {
diff --git a/server/ctdb_tunables.c b/server/ctdb_tunables.c
index 31ec89f..330ab19 100644
--- a/server/ctdb_tunables.c
+++ b/server/ctdb_tunables.c
@@ -31,7 +31,7 @@ static const struct {
 	{ "KeepaliveInterval",    5,  offsetof(struct ctdb_tunable, keepalive_interval) },
 	{ "KeepaliveLimit",       5,  offsetof(struct ctdb_tunable, keepalive_limit) },
 	{ "MaxLACount",           7,  offsetof(struct ctdb_tunable, max_lacount) },
-	{ "RecoverTimeout",      30,  offsetof(struct ctdb_tunable, recover_timeout) },
+	{ "RecoverTimeout",      20,  offsetof(struct ctdb_tunable, recover_timeout) },
 	{ "RecoverInterval",      1,  offsetof(struct ctdb_tunable, recover_interval) },
 	{ "ElectionTimeout",      3,  offsetof(struct ctdb_tunable, election_timeout) },
 	{ "TakeoverTimeout",      5,  offsetof(struct ctdb_tunable, takeover_timeout) },
@@ -39,7 +39,7 @@ static const struct {
 	{ "TickleUpdateInterval",20,  offsetof(struct ctdb_tunable, tickle_update_interval) },
 	{ "EventScriptTimeout",  20,  offsetof(struct ctdb_tunable, script_timeout) },
 	{ "EventScriptBanCount",  5,  offsetof(struct ctdb_tunable, script_ban_count) },
-	{ "RecoveryGracePeriod", 60,  offsetof(struct ctdb_tunable, recovery_grace_period) },
+	{ "RecoveryGracePeriod", 120,  offsetof(struct ctdb_tunable, recovery_grace_period) },
 	{ "RecoveryBanPeriod",  300,  offsetof(struct ctdb_tunable, recovery_ban_period) },
 	{ "DatabaseHashSize", 10000,  offsetof(struct ctdb_tunable, database_hash_size) },
 	{ "DatabaseMaxDead",      5,  offsetof(struct ctdb_tunable, database_max_dead) },


-- 
CTDB repository