[SCM] CTDB repository - branch 1.2.40 updated - ctdb-1.2.63-7-g4560186

Wed Jun 19 23:45:03 MDT 2013

The branch, 1.2.40 has been updated
       via  4560186b514221bbde89ebc0124380007a22ed08 (commit)
       via  88e639b55fced3cd223282b55214e7c2d506c50d (commit)
       via  cb700ff66859c585a70f7ca36da356c56bbdcc4f (commit)
       via  14d5f1c2475039ae331429b55f27f8273618a91d (commit)
       via  c7391f66b34406baa7f1677309492b792d0a3122 (commit)
       via  8137e4a0d8f70f6aa55e9c4ca1ac676aafd12f06 (commit)
       via  6400e3bae96771229c22d2e88d923d16e807d456 (commit)
      from  6e46fe96d46332cab6e8f9b6075c2f4ba6ceec04 (commit)

http://gitweb.samba.org/?p=ctdb.git;a=shortlog;h=1.2.40


- Log -----------------------------------------------------------------
commit 4560186b514221bbde89ebc0124380007a22ed08
Author: Amitay Isaacs <amitay at gmail.com>
Date:   Thu Jun 20 14:40:00 2013 +1000

    New version 1.2.64
    
    Signed-off-by: Amitay Isaacs <amitay at gmail.com>

commit 88e639b55fced3cd223282b55214e7c2d506c50d
Author: Amitay Isaacs <amitay at gmail.com>
Date:   Wed Jan 23 14:35:47 2013 +1100

    recoverd: Fix printing of node flags from local information
    
    Signed-off-by: Amitay Isaacs <amitay at gmail.com>
    (cherry picked from commit 124e2a471aeda9c900fd898178a30522d7d74221)

commit cb700ff66859c585a70f7ca36da356c56bbdcc4f
Author: Amitay Isaacs <amitay at gmail.com>
Date:   Tue Jun 18 14:27:34 2013 +1000

    tools/ctdb: Do not exit prematurely on control timeout if retrying in a loop
    
    This avoids premature exits from "ctdb stop" and "ctdb continue" due to
    intermittent control (e.g. getpnn, getnodemap) timeouts.
    
    This needs a proper fix to distinguish between timeout and failure
    conditions and take appropriate action.
    
    Signed-off-by: Amitay Isaacs <amitay at gmail.com>
    (cherry picked from commit c48583fd238496a81ddc46a21892f0b49559036a)

commit 14d5f1c2475039ae331429b55f27f8273618a91d
Author: Martin Schwenke <martin at meltin.net>
Date:   Thu May 23 16:06:47 2013 +1000

    tools/ctdb: Remove duplicate command definition for "sync"
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>
    Pair-programmed-with: Amitay Isaacs <amitay at gmail.com>
    
    (cherry picked from commit 9e7b7cd04adc5e66e2ffa4edf463a682aaea379b)
    
    Conflicts:
    	tools/ctdb.c

commit c7391f66b34406baa7f1677309492b792d0a3122
Author: Amitay Isaacs <amitay at gmail.com>
Date:   Thu May 23 13:04:06 2013 +1000

    tools/ctdb: Fix racy ipreallocate code
    
    This code tried to find the recovery master and send an ipreallocate
    request to that node.  When a node is stopped, this code asked the
    stopped node for recovery master.  Stopped node does not have up-to-date
    information on the current recovery master.  So ipreallocate requests
    were sent to the wrong node and ignored by that node which is not the
    recovery master.
    
    Send ipreallocate request to all active nodes.  That way we guarantee
    that the current recovery master will see it and respond to it.
    
    Signed-off-by: Amitay Isaacs <amitay at gmail.com>
    Pair-Programmed-With: Martin Schwenke <martin at meltin.net>
    
    (cherry picked from commit 0577ce3c68e4febf49a1ef5093e918db9d5ec636)
    
    Conflicts:
    	tools/ctdb.c

commit 8137e4a0d8f70f6aa55e9c4ca1ac676aafd12f06
Author: Martin Schwenke <martin at meltin.net>
Date:   Thu Jun 13 11:56:25 2013 +1000

    eventscripts: New configuration variable $CTDB_NFS_DUMP_STUCK_THREADS
    
    If some nfsd threads are still alive after a shutdown during a restart
    then this indicates the maximum number of threads for which a stack
    trace should be dumped.  This can be useful for trying to determine
    why nfsd is stuck.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>
    (cherry picked from commit 2503245db10d567af708a04edd3a3b488c24f401)

commit 6400e3bae96771229c22d2e88d923d16e807d456
Author: Martin Schwenke <martin at meltin.net>
Date:   Thu Jun 13 10:17:20 2013 +1000

    eventscripts: Add new option $CTDB_MONITOR_NFS_THREAD_COUNT
    
    Consider the following example:
    
    1. There are 256 nfsd threads configured.
    2. 200 threads are "stuck" in system calls, perhaps waiting for the
       underlying filesystem when an attempt is made to restart NFS.
    3. 56 threads exit when NFS is stopped.
    4. 56 new threads are started when NFS is started.
    5. 200 "stuck" threads exit leaving only 56 threads running.
    
    Setting this option to "yes" makes the 60.nfs monitor event look for
    this situation and try to correct it.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>
    (cherry picked from commit 99b0d8b8ecc36dfc493775b9ebced54539c182d2)
    
    Conflicts:
    	config/events.d/60.nfs

-----------------------------------------------------------------------

Summary of changes:
 config/ctdb.sysconfig      |   19 ++++++++
 config/events.d/60.nfs     |   25 +++++++++++
 config/functions           |   24 +++++++++++
 packaging/RPM/ctdb.spec.in |    6 ++-
 server/ctdb_recoverd.c     |    2 +-
 tools/ctdb.c               |   98 ++++++++++++--------------------------------
 6 files changed, 101 insertions(+), 73 deletions(-)


Changeset truncated at 500 lines:

diff --git a/config/ctdb.sysconfig b/config/ctdb.sysconfig
index 08a550f..c0d6f0a 100644
--- a/config/ctdb.sysconfig
+++ b/config/ctdb.sysconfig
@@ -129,6 +129,25 @@ CTDB_RECOVERY_LOCK="/some/place/on/shared/storage"
 # CTDB_MONITOR_FREE_MEMORY_WARN=100
 # CTDB_MONITOR_FREE_MEMORY=10
 
+# Should the 60.nfs monitor event try to correct the number of nfsd
+# threads?  This works around a limitation in some NFS initscripts
+# where some threads can be stuck in host filesystem calls (perhaps
+# due to slow storage), a restart occurs, some threads don't exit, the
+# start only adds the missing number of threads, the stuck threads
+# exit, and the result is a lower than expected thread count.  Note
+# that you must also set $RPCNFSDCOUNT (RedHat/Debian) or
+# $USE_KERNEL_NFSD_NUMBER (SUSE) in your NFS configuration so the
+# monitoring code knows how many threads there should be - if neither
+# of these is set then this option will be ignored.  The default is
+# to not do this check.
+# CTDB_MONITOR_NFS_THREAD_COUNT="yes"
+
+
+# The number of nfsd threads to dump stack traces for if some are
+# still alive after stopping NFS during a restart.  The default is to
+# dump no stack traces.
+# CTDB_NFS_DUMP_STUCK_THREADS=5
+
 # When set to yes, the CTDB node will start in DISABLED mode and not host
 # any public ip addresses. The administrator needs to explicitely enable
 # the node with "ctdb enable"
diff --git a/config/events.d/60.nfs b/config/events.d/60.nfs
index 35055fb..3ae8f24 100755
--- a/config/events.d/60.nfs
+++ b/config/events.d/60.nfs
@@ -16,6 +16,29 @@ service_start="start_nfs"
 service_stop="startstop_nfs stop"
 service_reconfigure="startstop_nfs restart"
 
+# Set the nfsd thread count back to the configured value if it differs.
+nfs_check_thread_count ()
+{
+    [ "$CTDB_MONITOR_NFS_THREAD_COUNT" = "yes" ] || return 0
+
+    # If $RPCNFSDCOUNT/$USE_KERNEL_NFSD_NUMBER isn't set then we could
+    # guess the default from the initscript.  However, let's just
+    # assume that those using the default don't care about the number
+    # of threads and that they have switched on this feature in error.
+    _configured_threads="${RPCNFSDCOUNT:-${USE_KERNEL_NFSD_NUMBER}}"
+    [ -n "$_configured_threads" ] || return 0
+
+    # nfsd should be running the configured number of threads.  If the
+    # running count differs then tell nfsd the correct number.
+    _running_threads=$(get_proc "fs/nfsd/threads")
+    # Intentionally a string comparison, not arithmetic - avoids extra
+    # errors when get_proc() fails...
+    if [ "$_running_threads" != "$_configured_threads" ] ; then
+	echo "Attempting to correct number of nfsd threads from ${_running_threads} to ${_configured_threads}"
+	set_proc "fs/nfsd/threads" "$_configured_threads"
+    fi
+}
+
 loadconfig
 
 [ "$NFS_SERVER_MODE" != "GANESHA" ] || exit 0
@@ -172,6 +195,8 @@ case "$1" in
 		$cmd &
 	}
 
+	nfs_check_thread_count
+
 	# once every 600 seconds, update the statd state database for which
 	# clients need notifications
 	LAST_UPDATE=`stat --printf="%Y" $CTDB_VARDIR/state/statd/update-trigger 2>/dev/null`
diff --git a/config/functions b/config/functions
index 4d82801..b35f60f 100755
--- a/config/functions
+++ b/config/functions
@@ -564,6 +564,7 @@ startstop_nfs() {
 			echo 0 >/proc/fs/nfsd/threads
 			service nfsserver stop > /dev/null 2>&1
 			pkill -9 nfsd
+			nfs_dump_some_threads
 			service nfsserver start
 			;;
 		esac
@@ -583,6 +584,7 @@ startstop_nfs() {
 			service nfs stop > /dev/null 2>&1
 			service nfslock stop > /dev/null 2>&1
 			pkill -9 nfsd
+			nfs_dump_some_threads
 			service nfslock start
 			service nfs start
 			;;
@@ -595,6 +597,28 @@ startstop_nfs() {
 	esac
 }
 
+# Dump stack traces for at most $CTDB_NFS_DUMP_STUCK_THREADS nfsd threads.
+nfs_dump_some_threads ()
+{
+    [ -n "$CTDB_NFS_DUMP_STUCK_THREADS" ] || return 0
+
+    # Optimisation to avoid running an unnecessary pidof
+    [ $CTDB_NFS_DUMP_STUCK_THREADS -gt 0 ] || return 0
+
+    _count=0
+    for _pid in $(pidof nfsd) ; do
+	# _count is the number of traces already dumped, so stop at the limit
+	[ $_count -lt $CTDB_NFS_DUMP_STUCK_THREADS ] || break
+	# Do this first to avoid racing with thread exit
+	_stack=$(get_proc "${_pid}/stack" 2>/dev/null)
+	if [ -n "$_stack" ] ; then
+	    echo "Stack trace for stuck nfsd thread [${_pid}]:"
+	    echo "$_stack"
+	    _count=$(($_count + 1))
+	fi
+    done
+}
+
 ########################################################
 # start/stop the nfs lockmanager service on different platforms
 ########################################################
diff --git a/packaging/RPM/ctdb.spec.in b/packaging/RPM/ctdb.spec.in
index 307760b..1a5ccfc 100644
--- a/packaging/RPM/ctdb.spec.in
+++ b/packaging/RPM/ctdb.spec.in
@@ -3,7 +3,7 @@ Name: ctdb
 Summary: Clustered TDB
 Vendor: Samba Team
 Packager: Samba Team <samba at samba.org>
-Version: 1.2.63
+Version: 1.2.64
 Release: 1GITHASH
 Epoch: 0
 License: GNU GPL version 3
@@ -155,6 +155,10 @@ development libraries for ctdb
 
 %changelog
 
+* Thu Jun 20 2013 : Version 1.2.64
+  - Add configuration variables to maintain configured number of NFS threads
+  - Fix racy code in CTDB commandline tool for ipreallocate/sync
+  - Fix printing of node flags
 * Mon Jun 17 2013 : Version 1.2.63
   - Sync tdb library version to 1.2.12
 * Mon Apr 22 2013 : Version 1.2.62
diff --git a/server/ctdb_recoverd.c b/server/ctdb_recoverd.c
index f419cb7..d7a79fe 100644
--- a/server/ctdb_recoverd.c
+++ b/server/ctdb_recoverd.c
@@ -3475,7 +3475,7 @@ static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
 				  nodemap->nodes[j].pnn, 
 				  nodemap->nodes[i].pnn, 
 				  remote_nodemaps[j]->nodes[i].flags,
-				  nodemap->nodes[j].flags));
+				  nodemap->nodes[i].flags));
 				if (i == j) {
 					DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
 					update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
diff --git a/tools/ctdb.c b/tools/ctdb.c
index b7b5e12..5ed378c 100644
--- a/tools/ctdb.c
+++ b/tools/ctdb.c
@@ -1616,7 +1616,7 @@ control_get_all_public_ips(struct ctdb_context *ctdb, TALLOC_CTX *tmp_ctx, struc
 }
 
 
-static uint32_t ipreallocate_finished;
+static bool ipreallocate_finished;
 
 /*
   handler for receiving the response to ipreallocate
@@ -1624,7 +1624,7 @@ static uint32_t ipreallocate_finished;
 static void ip_reallocate_handler(struct ctdb_context *ctdb, uint64_t srvid, 
 			     TDB_DATA data, void *private_data)
 {
-	ipreallocate_finished = 1;
+	ipreallocate_finished = true;
 }
 
 static void ctdb_every_second(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
@@ -1644,9 +1644,8 @@ static int control_ipreallocate(struct ctdb_context *ctdb, int argc, const char
 	int i, ret;
 	TDB_DATA data;
 	struct takeover_run_reply rd;
-	uint32_t recmaster;
 	struct ctdb_node_map *nodemap=NULL;
-	int retries=0;
+	int count;
 	struct timeval tv = timeval_current();
 
 	/* we need some events to trigger so we can timeout and restart
@@ -1672,82 +1671,42 @@ static int control_ipreallocate(struct ctdb_context *ctdb, int argc, const char
 	data.dsize = sizeof(rd);
 
 again:
-	/* check that there are valid nodes available */
+	/* get the number of nodes and node flags */
 	if (ctdb_ctrl_getnodemap(ctdb, TIMELIMIT(), options.pnn, ctdb, &nodemap) != 0) {
 		DEBUG(DEBUG_ERR, ("Unable to get nodemap from local node\n"));
-		return -1;
-	}
-	for (i=0; i<nodemap->num;i++) {
-		if ((nodemap->nodes[i].flags & (NODE_FLAGS_DELETED|NODE_FLAGS_BANNED|NODE_FLAGS_STOPPED)) == 0) {
-			break;
-		}
-	}
-	if (i==nodemap->num) {
-		DEBUG(DEBUG_ERR,("No recmaster available, no need to wait for cluster convergence\n"));
-		return 0;
-	}
-
-
-	ret = ctdb_ctrl_getrecmaster(ctdb, ctdb, TIMELIMIT(), options.pnn, &recmaster);
-	if (ret != 0) {
-		DEBUG(DEBUG_ERR, ("Unable to get recmaster from node %u\n", options.pnn));
-		return ret;
-	}
-
-	/* verify the node exists */
-	if (ctdb_ctrl_getnodemap(ctdb, TIMELIMIT(), recmaster, ctdb, &nodemap) != 0) {
-		DEBUG(DEBUG_ERR, ("Unable to get nodemap from local node\n"));
-		return -1;
+		sleep(1);
+		goto again;
 	}
 
-
-	/* check tha there are nodes available that can act as a recmaster */
-	for (i=0; i<nodemap->num; i++) {
-		if (nodemap->nodes[i].flags & (NODE_FLAGS_DELETED|NODE_FLAGS_BANNED|NODE_FLAGS_STOPPED)) {
+	ipreallocate_finished = false;
+	count = 0;
+	for (i=0; i<nodemap->num;i++) {
+		if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
 			continue;
+		} else {
+			/* Send to all active nodes. Only recmaster will reply. */
+			ret = ctdb_client_send_message(ctdb, i, CTDB_SRVID_TAKEOVER_RUN, data);
+			if (ret != 0) {
+				DEBUG(DEBUG_ERR,("Failed to send ip takeover run request message to %u\n", options.pnn));
+				return -1;
+			}
+			count++;
 		}
-		break;
 	}
-	if (i == nodemap->num) {
-		DEBUG(DEBUG_ERR,("No possible nodes to host addresses.\n"));
+	if (count == 0) {
+		DEBUG(DEBUG_ERR,("No recmaster available, no need to wait for cluster convergence\n"));
 		return 0;
 	}
 
-	/* verify the recovery master is not STOPPED, nor BANNED */
-	if (nodemap->nodes[recmaster].flags & (NODE_FLAGS_DELETED|NODE_FLAGS_BANNED|NODE_FLAGS_STOPPED)) {
-		DEBUG(DEBUG_ERR,("No suitable recmaster found. Try again\n"));
-		retries++;
-		sleep(1);
-		goto again;
-	} 
-	
-	/* verify the recovery master is not STOPPED, nor BANNED */
-	if (nodemap->nodes[recmaster].flags & (NODE_FLAGS_DELETED|NODE_FLAGS_BANNED|NODE_FLAGS_STOPPED)) {
-		DEBUG(DEBUG_ERR,("No suitable recmaster found. Try again\n"));
-		retries++;
-		sleep(1);
-		goto again;
-	} 
-
-	ipreallocate_finished = 0;
-	ret = ctdb_client_send_message(ctdb, recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
-	if (ret != 0) {
-		DEBUG(DEBUG_ERR,("Failed to send ip takeover run request message to %u\n", options.pnn));
-		return -1;
-	}
-
 	tv = timeval_current();
 	/* this loop will terminate when we have received the reply */
-	while (timeval_elapsed(&tv) < 5.0 && ipreallocate_finished == 0) {
+	while (timeval_elapsed(&tv) < 5.0 && !ipreallocate_finished) {
 		event_loop_once(ctdb->ev);
 	}
-	if (ipreallocate_finished == 1) {
-		return 0;
-	}
 
-	retries++;
-	sleep(1);
-	goto again;
+	if (!ipreallocate_finished) {
+		goto again;
+	}
 
 	return 0;
 }
@@ -2574,10 +2533,9 @@ static int control_stop(struct ctdb_context *ctdb, int argc, const char **argv)
 		/* read the nodemap and verify the change took effect */
 		if (ctdb_ctrl_getnodemap(ctdb, TIMELIMIT(), CTDB_CURRENT_NODE, ctdb, &nodemap) != 0) {
 			DEBUG(DEBUG_ERR, ("Unable to get nodemap from local node\n"));
-			exit(10);
 		}
 
-	} while (!(nodemap->nodes[options.pnn].flags & NODE_FLAGS_STOPPED));
+	} while (nodemap == NULL || !(nodemap->nodes[options.pnn].flags & NODE_FLAGS_STOPPED));
 	ret = control_ipreallocate(ctdb, argc, argv);
 	if (ret != 0) {
 		DEBUG(DEBUG_ERR, ("IP Reallocate failed on node %u\n", options.pnn));
@@ -2608,10 +2566,9 @@ static int control_continue(struct ctdb_context *ctdb, int argc, const char **ar
 		/* read the nodemap and verify the change took effect */
 		if (ctdb_ctrl_getnodemap(ctdb, TIMELIMIT(), CTDB_CURRENT_NODE, ctdb, &nodemap) != 0) {
 			DEBUG(DEBUG_ERR, ("Unable to get nodemap from local node\n"));
-			exit(10);
 		}
 
-	} while (nodemap->nodes[options.pnn].flags & NODE_FLAGS_STOPPED);
+	} while (nodemap == NULL || nodemap->nodes[options.pnn].flags & NODE_FLAGS_STOPPED);
 	ret = control_ipreallocate(ctdb, argc, argv);
 	if (ret != 0) {
 		DEBUG(DEBUG_ERR, ("IP Reallocate failed on node %u\n", options.pnn));
@@ -5181,7 +5138,7 @@ static const struct {
 	{ "showban",         control_showban,           true,	false,  "show ban information"},
 	{ "shutdown",        control_shutdown,          true,	false,  "shutdown ctdbd" },
 	{ "recover",         control_recover,           true,	false,  "force recovery" },
-	{ "sync", 	     control_ipreallocate,      true,	false,  "wait until ctdbd has synced all state changes" },
+	{ "sync", 	     control_ipreallocate,      false,	false,  "wait until ctdbd has synced all state changes" },
 	{ "ipreallocate",    control_ipreallocate,      true,	false,  "force the recovery daemon to perform a ip reallocation procedure" },
 	{ "thaw",            control_thaw,              true,	false,  "thaw databases", "[priority:1-3]" },
 	{ "isnotrecmaster",  control_isnotrecmaster,    false,	false,  "check if the local node is recmaster or not" },
@@ -5226,7 +5183,6 @@ static const struct {
 	{ "setdbreadonly",    control_setdbreadonly,	false,	false, "Set DB readonly capable", "<dbid>"},
 	{ "msglisten",        control_msglisten,	false,	false, "Listen on a srvid port for messages", "<msg srvid>"},
 	{ "msgsend",          control_msgsend,	false,	false, "Send a message to srvid", "<srvid> <message>"},
-	{ "sync", 	     control_ipreallocate,      false,	false,  "wait until ctdbd has synced all state changes" },
 	{ "pfetch", 	     control_pfetch,      	false,	false,  "fetch a record from a persistent database", "<db> <key> [<file>]" },
 	{ "pstore", 	     control_pstore,      	false,	false,  "write a record to a persistent database", "<db> <key> <file containing record>" },
 	{ "tfetch", 	     control_tfetch,      	false,	true,  "fetch a record from a [c]tdb-file", "<tdb-file> <key> [<file>]" },


-- 
CTDB repository