[SCM] CTDB repository - branch master updated - ctdb-1.10-25-g1d77a3a

Tue Nov 16 18:06:58 MST 2010

The branch, master has been updated
       via  1d77a3adfff893b3c01b87f791e72c0d3148425c (commit)
       via  26077e6c8eb126584af587e7416154ea4858aea2 (commit)
       via  79c25fe241cf5d8f92e23d3736823ebaf4e1769d (commit)
       via  ad18bfa398e582474afe25340368e39d4e74e3c6 (commit)
       via  a9f5ae2a548e1096c086888adc886cb604d372fa (commit)
       via  9e88466a99b5245d5f0ebab553be8d2b9b9a58e2 (commit)
       via  7303058616fdb1d7f58cce2349c034e9f611275e (commit)
      from  cf778b5aaf6356401e3985acccc7df9e08ab6930 (commit)

http://gitweb.samba.org/?p=sahlberg/ctdb.git;a=shortlog;h=master


- Log -----------------------------------------------------------------
commit 1d77a3adfff893b3c01b87f791e72c0d3148425c
Author: Martin Schwenke <martin at meltin.net>
Date:   Tue Aug 31 17:40:40 2010 +1000

    Eventscripts: make loadconfig() function hookable by the test suite.
    
    Rename loadconfig() to _loadconfig().  Add a new loadconfig() that
    simply calls _loadconfig().
    
    This makes it easy for the test suite to override loadconfig().
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit 26077e6c8eb126584af587e7416154ea4858aea2
Author: Martin Schwenke <martin at meltin.net>
Date:   Tue Nov 16 19:42:31 2010 +1100

    Make a time comparison in 60.nfs eventscript more readable.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit 79c25fe241cf5d8f92e23d3736823ebaf4e1769d
Author: Martin Schwenke <martin at meltin.net>
Date:   Tue Nov 16 19:31:18 2010 +1100

    60.nfs only fails or warns after 10 consecutive nfsd/statd failures.
    
    These failures are sometimes the result of slow restarts so we want to
    avoid dirtying the logs or marking a node unhealthy because of them,
    unless they are excessive.
    
    For these 2 cases we use the existing fail counting code but hack a
    temporary service_name in a subshell to allow separate fail counts.
    
    We also update ctdb_check_rpc() so that it captures the error output
    from rpcinfo and we add a message including the service name to the
    beginning.  The error is printed to stdout but is also stored in
    ctdb_check_rpc_out to allow it to be conditionally used by the caller.
    This function also now returns non-zero rather than exiting on
    failure.
    
    Other direct rpcinfo calls are replaced by calls to ctdb_check_rpc()
    for consistency.
    
    Option handling code for service restarts is cleaned up so that it fits
    in 80 columns.  A more informative restart message is now used in all
    cases, printing the exact command being used to start a service.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit ad18bfa398e582474afe25340368e39d4e74e3c6
Author: Martin Schwenke <martin at meltin.net>
Date:   Tue Oct 12 11:10:38 2010 +1100

    Test suite: fix typo in ctdb ping test grep pattern.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit a9f5ae2a548e1096c086888adc886cb604d372fa
Author: Martin Schwenke <martin at meltin.net>
Date:   Wed Oct 6 16:32:22 2010 +1100

    Test suite: match changed output for ctdb ping to disconnected node.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit 9e88466a99b5245d5f0ebab553be8d2b9b9a58e2
Author: Martin Schwenke <martin at meltin.net>
Date:   Fri Oct 15 15:09:08 2010 +1100

    Test suite: make statistics test cope with changes to statistics output.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit 7303058616fdb1d7f58cce2349c034e9f611275e
Author: Ronnie Sahlberg <ronniesahlberg at gmail.com>
Date:   Mon Nov 15 16:30:44 2010 +1100

    Initialize the statistics to the current time, not the start of the epoch.
    This makes "ctdb statistics" show the correct "start of stats collection".

-----------------------------------------------------------------------

Summary of changes:
 config/events.d/60.nfs             |   77 ++++++++++++++++++++++--------------
 config/functions                   |   18 ++++++--
 server/ctdb_statistics.c           |    1 +
 tests/simple/09_ctdb_ping.sh       |    2 +-
 tests/simple/14_ctdb_statistics.sh |    2 +-
 5 files changed, 63 insertions(+), 37 deletions(-)


Changeset truncated at 500 lines:

diff --git a/config/events.d/60.nfs b/config/events.d/60.nfs
index 57c81d3..038adbb 100755
--- a/config/events.d/60.nfs
+++ b/config/events.d/60.nfs
@@ -51,24 +51,43 @@ case "$1" in
 
 	# check that statd responds to rpc requests
 	# if statd is not running we try to restart it
-	rpcinfo -u localhost 100024 1 > /dev/null || {
-		RPCSTATDOPTS=""
-		[ -n "$STATD_HOSTNAME" ] && RPCSTATDOPTS="$RPCSTATDOPTS -n $STATD_HOSTNAME"
-		[ -n "$STATD_PORT" ] && RPCSTATDOPTS="$RPCSTATDOPTS -p $STATD_PORT"
-		[ -n "$STATD_OUTGOING_PORT" ] && RPCSTATDOPTS="$RPCSTATDOPTS -o $STATD_OUTGOING_PORT"
-		rpc.statd $RPCSTATDOPTS 
-		echo "ERROR: STATD is not responding. Trying to restart it. [rpc.statd $RPCSTATDOPTS]"
-	}
+	if ctdb_check_rpc "STATD" 100024 1 >/dev/null ; then
+		(service_name="nfs_statd"; ctdb_counter_init)
+	else
+		p="rpc.statd" ; cmd="$p"
+		cmd="${cmd}${STATD_HOSTNAME:+ -n }${STATD_HOSTNAME}"
+		cmd="${cmd}${STATD_PORT:+ -p }${STATD_PORT}"
+		cmd="${cmd}${STATD_OUTGOING_PORT:+ -o }${STATD_OUTGOING_PORT}"
+		(
+			service_name="nfs_statd"
+			ctdb_counter_incr
+			ctdb_check_counter_limit 10 quiet >/dev/null
+		) || {
+			echo "$ctdb_check_rpc_out"
+			echo "Trying to restart STATD [$cmd]"
+		}
+		$cmd
+	fi
 
 
 	# check that NFS responds to rpc requests
 	[ "$CTDB_NFS_SKIP_KNFSD_ALIVE_CHECK" = "yes" ] || {
-	    (ctdb_check_rpc "NFS" 100003 3)
-	    [ $? = "0" ] || {
-		echo "Trying to restart NFS service"
-		startstop_nfs restart
-		exit 1
-	    }
+	    if ctdb_check_rpc "NFS" 100003 3 >/dev/null ; then
+		(service_name="nfs_knfsd"; ctdb_counter_init)
+	    else
+		(
+			service_name="nfs_knfsd"
+			ctdb_counter_incr
+			ctdb_check_counter_limit 10 quiet >/dev/null
+		) || {
+			echo "$ctdb_check_rpc_out"
+			echo "Trying to restart NFS service"
+			startstop_nfs restart
+			exit 1
+		}
+		# we haven't hit the failure limit so restart quietly
+		startstop_nfs restart >/dev/null 2>&1 &
+	    fi
 	}
 
 	# and that its directories are available
@@ -79,8 +98,7 @@ case "$1" in
 	} || exit $?
 
 	# check that lockd responds to rpc requests
-	(ctdb_check_rpc "lockd" 100021 1)
-	[ $? = "0" ] || {
+	ctdb_check_rpc "LOCKD" 100021 1 || {
 		echo "Trying to restart lock manager service"
 		startstop_nfs restart
 		startstop_nfslock restart
@@ -89,31 +107,30 @@ case "$1" in
 
 	# mount needs special handling since it is sometimes not started
 	# correctly on RHEL5
-	rpcinfo -u localhost 100005 1 > /dev/null || {
-		echo "ERROR: MOUNTD is not running. Trying to restart it."
-		RPCMOUNTDOPTS=""
-		[ -n "$MOUNTD_PORT" ] && RPCMOUNTDOPTS="$RPCMOUNTDOPTS -p $MOUNTD_PORT"
-		killall -q -9 rpc.mountd
-		rpc.mountd $RPCMOUNTDOPTS &
+	ctdb_check_rpc "MOUNTD" 100005 1 || {
+		p="rpc.mountd"
+		cmd="${p}${MOUNTD_PORT:+ -p }${MOUNTD_PORT}"
+		echo "Trying to restart MOUNTD [${cmd}]"
+		killall -q -9 $p
+		$cmd &
 		exit 1
 	}
 	# rquotad needs special handling since it is sometimes not started
 	# correctly on RHEL5
 	# this is not a critical service so we dont flag the node as unhealthy
-	rpcinfo -u localhost 100011 1 > /dev/null || {
-		echo "ERROR: RQUOTAD is not running. Trying to restart it."
-		RPCRQUOTADOPTS=""
-		[ -n "$RQUOTAD_PORT" ] && RPCRQUOTADOPTS="$RPCRQUOTADOPTS -p $RQUOTAD_PORT"
-		killall -q -9 rpc.rquotad
-		rpc.rquotad $RPCRQUOTADOPTS &
+	ctdb_check_rpc "RQUOTAD" 100011 1 || {
+		p="rpc.rquotad"
+		cmd="${p}${RQUOTAD_PORT:+ -p }${RQUOTAD_PORT}"
+		echo "Trying to restart RQUOTAD [${cmd}]"
+		killall -q -9 $p
+		$cmd &
 	}
 
 	# once every 60 seconds, update the statd state database for which
 	# clients need notifications
 	LAST_UPDATE=`stat --printf="%Y" $CTDB_VARDIR/state/statd/update-trigger`
 	CURRENT_TIME=`date +"%s"`
-	expr "$CURRENT_TIME" ">" "(" "$LAST_UPDATE" "+" "60" ")" >/dev/null 2>/dev/null
-	[ $? = "0" ] && {
+	[ $CURRENT_TIME -ge $(($LAST_UPDATE + 60)) ] && {
 	    mkdir -p $CTDB_VARDIR/state/statd
 	    touch $CTDB_VARDIR/state/statd/update-trigger
 	    $CTDB_BASE/statd-callout updatelocal &
diff --git a/config/functions b/config/functions
index 9659d48..610085b 100755
--- a/config/functions
+++ b/config/functions
@@ -4,7 +4,7 @@ PATH=/bin:/usr/bin:/usr/sbin:/sbin:$PATH
 
 #######################################
 # pull in a system config file, if any
-loadconfig() {
+_loadconfig() {
 
     if [ -z "$1" ] ; then
 	foo="${service_config:-${service_name}}"
@@ -25,6 +25,10 @@ loadconfig() {
     fi
 }
 
+loadconfig () {
+    _loadconfig "$@"
+}
+
 ##############################################################
 # determine on what type of system (init style) we are running
 detect_init_style() {
@@ -144,10 +148,14 @@ ctdb_check_rpc() {
     progname="$1"
     prognum="$2"
     version="$3"
-    rpcinfo -u localhost $prognum $version > /dev/null || {
-	    echo "ERROR: $progname not responding to rpc requests"
-	    exit 1
-    }
+
+    ctdb_check_rpc_out=$(rpcinfo -u localhost $prognum $version 2>&1)
+    if [ $? -ne 0 ] ; then
+	ctdb_check_rpc_out="ERROR: $progname failed RPC check:
+$ctdb_check_rpc_out"
+	echo "$ctdb_check_rpc_out"
+	return 1
+    fi
 }
 
 ######################################################
diff --git a/server/ctdb_statistics.c b/server/ctdb_statistics.c
index aec17ad..29e6d6a 100644
--- a/server/ctdb_statistics.c
+++ b/server/ctdb_statistics.c
@@ -42,6 +42,7 @@ static void ctdb_statistics_update(struct event_context *ev, struct timed_event
 int ctdb_statistics_init(struct ctdb_context *ctdb)
 {
 	bzero(&ctdb->statistics, sizeof(struct ctdb_statistics));
+	ctdb->statistics.statistics_start_time = timeval_current();
 
 	bzero(&ctdb->statistics_current, sizeof(struct ctdb_statistics));
 	ctdb->statistics_current.statistics_start_time = timeval_current();
diff --git a/tests/simple/09_ctdb_ping.sh b/tests/simple/09_ctdb_ping.sh
index 6ca50d4..6610431 100755
--- a/tests/simple/09_ctdb_ping.sh
+++ b/tests/simple/09_ctdb_ping.sh
@@ -53,5 +53,5 @@ try_command_on_node -v 0 "! $CTDB ping -n 1"
 
 sanity_check_output \
     1 \
-    "(: ctdb_control error: 'ctdb_control to disconnected node'|Unable to get ping response from node 1|Node 1 is DISCONNECTED)" \
+    "(: ctdb_control error: ('ctdb_control to disconnected node'|'node is disconnected')|Unable to get ping response from node 1|Node 1 is DISCONNECTED|ctdb_control for getpnn failed|: Can not access node. Node is not operational\.)" \
     "$out"
diff --git a/tests/simple/14_ctdb_statistics.sh b/tests/simple/14_ctdb_statistics.sh
index 9a95a83..e9ecce5 100755
--- a/tests/simple/14_ctdb_statistics.sh
+++ b/tests/simple/14_ctdb_statistics.sh
@@ -33,7 +33,7 @@ set -e
 
 cluster_is_healthy
 
-pattern='^(CTDB version 1|Current time of statistics[[:space:]]*:.*|Statistics collected since[[:space:]]*:.*|Gathered statistics for [[:digit:]]+ nodes|[[:space:]]+[[:alpha:]_]+[[:space:]]+[[:digit:]]+|[[:space:]]+(node|client|timeouts)|[[:space:]]+([[:alpha:]_]+_latency|max_reclock_[[:alpha:]]+)[[:space:]]+[[:digit:]-]+\.[[:digit:]]+[[:space:]]sec)$'
+pattern='^(CTDB version 1|Current time of statistics[[:space:]]*:.*|Statistics collected since[[:space:]]*:.*|Gathered statistics for [[:digit:]]+ nodes|[[:space:]]+[[:alpha:]_]+[[:space:]]+[[:digit:]]+|[[:space:]]+(node|client|timeouts)|[[:space:]]+([[:alpha:]_]+_latency|max_reclock_[[:alpha:]]+)[[:space:]]+[[:digit:]-]+\.[[:digit:]]+[[:space:]]sec|[[:space:]]*(reclock_ctdbd|reclock_recd|call_latency|lockwait_latency|childwrite_latency)[[:space:]]+MIN/AVG/MAX[[:space:]]+[-.[:digit:]]+/[-.[:digit:]]+/[-.[:digit:]]+ sec out of [[:digit:]]+)$'
 
 try_command_on_node -v 1 "$CTDB statistics"
 


-- 
CTDB repository