[SCM] CTDB repository - branch master updated - ctdb-1.0.91-5-gfe26da7

Wed Sep 30 05:38:38 MDT 2009

The branch, master has been updated
       via  fe26da7780545b1ecc0a7da5bc1cf8beaeea94cc (commit)
       via  cfe63636a163730ae9ad3554b78519b3c07d8896 (commit)
       via  803cfb4cd2f6d139f466053a6d7e104fcb772ef5 (commit)
       via  096cdc0c12d22d99f8405bee5cb9f05c616c8492 (commit)
       via  a083a1976d621c76121f1fa2c2f484cfa47267bd (commit)
      from  d1332f4d5d3d3e4b4e0cd362a6903d09e0d5fcbb (commit)

http://gitweb.samba.org/?p=sahlberg/ctdb.git;a=shortlog;h=master


- Log -----------------------------------------------------------------
commit fe26da7780545b1ecc0a7da5bc1cf8beaeea94cc
Author: Martin Schwenke <martin at meltin.net>
Date:   Wed Sep 30 21:21:56 2009 +1000

    Minor fixes to 01.reclock eventscript.
    
    test -z really needs its argument to be quoted.  Simplified a status
    test.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit cfe63636a163730ae9ad3554b78519b3c07d8896
Author: Martin Schwenke <martin at meltin.net>
Date:   Wed Sep 30 21:05:16 2009 +1000

    40.vsftpd monitor event only fails after 2 failures to connect to port 21.
    
    Change the monitor event in 40.vsftpd so it only fails if there are 2
    successive failures connecting to port 21.  This reduces the
    likelihood of unhealthy nodes due to vsftpd being restarted for
    reconfiguration due to node failover or system reconfiguration.
    
    New eventscript functions ctdb_counter_init, ctdb_counter_incr,
    ctdb_counter_limit.  These are used to count arbitrary things in
    eventscripts, depending on the eventscript name and a tag that is
    passed, and determine if a specified limit has been hit.  They're good
    for counting failures!
    
    These functions are used in 40.vsftpd and also in 01.reclock - the
    latter used to do the counting without these functions.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit 803cfb4cd2f6d139f466053a6d7e104fcb772ef5
Merge: 096cdc0c12d22d99f8405bee5cb9f05c616c8492 d1332f4d5d3d3e4b4e0cd362a6903d09e0d5fcbb
Author: Martin Schwenke <martin at meltin.net>
Date:   Wed Sep 30 19:22:59 2009 +1000

    Merge commit 'origin/master'

commit 096cdc0c12d22d99f8405bee5cb9f05c616c8492
Merge: a083a1976d621c76121f1fa2c2f484cfa47267bd 6e35feb06ec036b9036c5d1cdd94f7cef140d8a6
Author: Martin Schwenke <martin at meltin.net>
Date:   Tue Sep 29 12:59:10 2009 +1000

    Merge commit 'origin/master'

commit a083a1976d621c76121f1fa2c2f484cfa47267bd
Author: Martin Schwenke <martin at meltin.net>
Date:   Fri Sep 25 18:00:17 2009 +1000

    Test suite: Print debug info on node status timeouts.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

-----------------------------------------------------------------------

Summary of changes:
 config/events.d/01.reclock             |   29 ++++++++++++---------------
 config/events.d/40.vsftpd              |   21 ++++++++++++++++++-
 config/functions                       |   34 ++++++++++++++++++++++++++++++++
 tests/scripts/ctdb_test_functions.bash |   10 ++++++++-
 4 files changed, 76 insertions(+), 18 deletions(-)


Changeset truncated at 500 lines:

diff --git a/config/events.d/01.reclock b/config/events.d/01.reclock
index ccc0075..74b9cea 100755
--- a/config/events.d/01.reclock
+++ b/config/events.d/01.reclock
@@ -9,46 +9,43 @@ shift
 
 PATH=/usr/bin:/bin:/usr/sbin:/sbin:$PATH
 
-# The size of this file represents the number of intervals that have
-# passed when we have tried to but failed to stat the reclock file.
-# after third failure the node becomes unhealthy
-# after the twenteth failure the node we shutdown ctdbd
-RECLOCKCOUNT="$CTDB_BASE/state/reclock-fail-count"
+# Count the number of intervals that have passed when we have tried to
+# but failed to stat the reclock file.  After the third failure the
+# node becomes unhealthy; after the twentieth failure we shut down
+# ctdbd.
+RECLOCKCOUNT="fail-count"
 
 case $cmd in 
      startup)
-	echo -n > $RECLOCKCOUNT
+	ctdb_counter_init "$RECLOCKCOUNT"
 	;;
 
       monitor)
-	echo -n 1 >> $RECLOCKCOUNT
-
-	COUNT=`ls -ln $RECLOCKCOUNT | cut -d" " -f5`
-	[ $COUNT -gt 19 ] && {
+	ctdb_counter_incr "$RECLOCKCOUNT"
+	ctdb_counter_limit "$RECLOCKCOUNT" 20 && {
 		echo "Reclock file can not be accessed. Shutting down."
 		sleep 1
 		ctdb shutdown
 	}
 
 	RECLOCKFILE=`ctdb -Y getreclock`
-	[ -z $RECLOCKFILE ] && {
+	[ -z "$RECLOCKFILE" ] && {
 		# we are not using a reclock file
-		echo -n > $RECLOCKCOUNT
+		ctdb_counter_init "$RECLOCKCOUNT"
 		exit 0
 	}
 
 	# try stat the reclock file as a background process
 	# so that we dont block in case the cluster filesystem is unavailable
 	(
-		stat $RECLOCKFILE
-		[ "$?" -eq 0 ] && {
+		stat $RECLOCKFILE && {
 			# we could stat the file, reset the counter
-			echo -n > $RECLOCKCOUNT
+			ctdb_counter_init "$RECLOCKCOUNT"
 		}
 	) >/dev/null 2>/dev/null &
 
 
-	[ $COUNT -gt 2 ] && {
+	ctdb_counter_limit "$RECLOCKCOUNT" 3 && {
 		echo "Reclock file can not be accessed. Mark node UNHEALTHY."
 		exit 1;
 	}
diff --git a/config/events.d/40.vsftpd b/config/events.d/40.vsftpd
index 539cd80..bec7862 100755
--- a/config/events.d/40.vsftpd
+++ b/config/events.d/40.vsftpd
@@ -10,6 +10,11 @@ loadconfig vsftpd
 cmd="$1"
 shift
 
+# Count the number of consecutive monitor failures.  The cluster only
+# becomes unhealthy after 2 consecutive failures.
+VSFTPD_FAILS="fail-count"
+VSFTPD_LIMIT=2
+
 case $cmd in 
      startup)
 	/bin/mkdir -p $CTDB_BASE/state/vsftpd
@@ -17,6 +22,8 @@ case $cmd in
 	# make sure the service is stopped first
 	service vsftpd stop > /dev/null 2>&1
 	service vsftpd start
+
+	ctdb_counter_init "$VSFTPD_FAILS"
 	;;
 
      shutdown)
@@ -43,7 +50,19 @@ case $cmd in
 	;;
 
      monitor)
-	ctdb_check_tcp_ports "ftp" 21
+	# Subshell catches the "exit 1"
+	if (ctdb_check_tcp_ports "ftp" 21) ; then
+	    ctdb_counter_init "$VSFTPD_FAILS"
+	else
+	    ctdb_counter_incr "$VSFTPD_FAILS"
+	    if ctdb_counter_limit "$VSFTPD_FAILS" $VSFTPD_LIMIT ; then
+		echo "ERROR: more than $VSFTPD_LIMIT consecutive failures, marking cluster unhealthy"
+		exit 1
+	    else
+		echo "WARNING: less than $VSFTPD_LIMIT consecutive failures, not unhealthy yet"
+	    fi
+		
+	fi
 	;;
 esac
 
diff --git a/config/functions b/config/functions
index 8d18828..bec4815 100644
--- a/config/functions
+++ b/config/functions
@@ -498,6 +498,40 @@ remove_ip() {
 }
 
 ########################################################
+# some simple logic for counting events - per eventscript
+# usage: ctdb_counter_init <tag>
+#        ctdb_counter_incr <tag>
+#        ctdb_counter_limit <tag> <limit>
+#        e.g. <tag> = "fail-count"
+# ctdb_counter_limit succeeds when count >= <limit>
+########################################################
+_ctdb_counter_common () {
+    _tag="$1"
+    _eventscript="${0##*/}" # basename
+
+    _counter_file="$CTDB_BASE/state/${_eventscript}-${_tag}"
+    mkdir -p "${_counter_file%/*}" # dirname
+}
+ctdb_counter_init () {
+    _ctdb_counter_common "$1"
+
+    echo -n > "$_counter_file"
+}
+ctdb_counter_incr () {
+    _ctdb_counter_common "$1"
+
+    # unary counting!
+    echo -n 1 >> "$_counter_file"
+}
+ctdb_counter_limit () {
+    _ctdb_counter_common "$1"
+    _limit="$2"
+
+    # unary counting!
+    _size=$(stat -c "%s" "$_counter_file" 2>/dev/null || echo 0)
+    [ $_size -ge $_limit ]
+}
+########################################################
 # load a site local config file
 ########################################################
 
diff --git a/tests/scripts/ctdb_test_functions.bash b/tests/scripts/ctdb_test_functions.bash
index 57cd781..3da0f0e 100644
--- a/tests/scripts/ctdb_test_functions.bash
+++ b/tests/scripts/ctdb_test_functions.bash
@@ -417,7 +417,15 @@ wait_until_node_has_status ()
 
     echo "Waiting until node $pnn has status \"$status\"..."
 
-    onnode any $CTDB_TEST_WRAPPER wait_until $timeout node_has_status "$pnn" "$status"
+    if ! onnode any $CTDB_TEST_WRAPPER wait_until $timeout node_has_status "$pnn" "$status" ; then
+	for i in "onnode -q any ctdb status" "onnode -q any onnode all ctdb scriptstatus" ; do
+	    echo "$i"
+	    $i || true
+	done
+
+	return 1
+    fi
+
 }
 
 # Useful for superficially testing IP failover.


-- 
CTDB repository