[SCM] CTDB repository - branch 1.2-nodeflags updated - ctdb-1.9.1-232-gea77f1b

Tue Nov 16 18:06:58 MST 2010

The branch, 1.2-nodeflags has been updated
       via  ea77f1b2a161f194b8b152da3cd94c67cbdd7ed9 (commit)
       via  6d118faada8ffbde7f35e1a0b4d2df21ac67dec2 (commit)
       via  7b91e4fa1d00bcba860dfe502e83161a6d06de6b (commit)
       via  9f0e1e5e0b100c3ccfaa2e459b3249386ec27b44 (commit)
       via  d887c991d48f010b4512ada24b059161614bb168 (commit)
       via  10e005cd0eb5b350a22377629ecea87e876f6bfb (commit)
       via  138e2c912ae718ce024d7e342bc7808416aeec25 (commit)
      from  256b4f44d9de1b394d4f260f26fd11e2ff6adf8f (commit)

http://gitweb.samba.org/?p=sahlberg/ctdb.git;a=shortlog;h=1.2-nodeflags


- Log -----------------------------------------------------------------
commit ea77f1b2a161f194b8b152da3cd94c67cbdd7ed9
Author: Martin Schwenke <martin at meltin.net>
Date:   Tue Aug 31 17:40:40 2010 +1000

    Eventscripts: make loadconfig() function hookable by the test suite.
    
    Rename loadconfig() to _loadconfig().  Add a new loadconfig() that
    simply calls _loadconfig().
    
    This makes it easy for the test suite to override loadconfig().
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit 6d118faada8ffbde7f35e1a0b4d2df21ac67dec2
Author: Martin Schwenke <martin at meltin.net>
Date:   Tue Nov 16 19:42:31 2010 +1100

    Make a time comparison in 60.nfs eventscript more readable.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit 7b91e4fa1d00bcba860dfe502e83161a6d06de6b
Author: Martin Schwenke <martin at meltin.net>
Date:   Tue Nov 16 19:31:18 2010 +1100

    60.nfs only fails or warns after 10 consecutive nfsd/statd failures.
    
    These failures are sometimes the result of slow restarts so we want to
    avoid dirtying the logs or marking a node unhealthy because of them,
    unless they are excessive.
    
    For these 2 cases we use the existing fail counting code but hack a
    temporary service_name in a subshell to allow separate fail counts.
    
    We also update ctdb_check_rpc() so that it captures the error output
    from rpcinfo and we add a message including the service name to the
    beginning.  The error is printed to stdout but is also stored in
    ctdb_check_rpc_out to allow it to be conditionally used by the caller.
    This function also now returns non-zero rather than exiting on
    failure.
    
    Other direct rpcinfo calls are replaced by calls to ctdb_check_rpc()
    for consistency.
    
    Option handling code for service restarts is cleaned up so that it fits
    in 80 columns.  A more informative restart message is now used in all
    cases, printing the exact command being used to start a service.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit 9f0e1e5e0b100c3ccfaa2e459b3249386ec27b44
Author: Martin Schwenke <martin at meltin.net>
Date:   Tue Oct 12 11:10:38 2010 +1100

    Test suite: fix typo in ctdb ping test grep pattern.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit d887c991d48f010b4512ada24b059161614bb168
Author: Martin Schwenke <martin at meltin.net>
Date:   Wed Oct 6 16:32:22 2010 +1100

    Test suite: match changed output for ctdb ping to disconnected node.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit 10e005cd0eb5b350a22377629ecea87e876f6bfb
Author: Martin Schwenke <martin at meltin.net>
Date:   Fri Oct 15 15:09:08 2010 +1100

    Test suite: make statistics test cope with changes to statistics output.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit 138e2c912ae718ce024d7e342bc7808416aeec25
Author: Ronnie Sahlberg <ronniesahlberg at gmail.com>
Date:   Mon Nov 15 16:30:44 2010 +1100

    initialize the statistics to the current time, not start of epoch
    this makes "ctdb statistics" show correct "start of starts collection"

-----------------------------------------------------------------------

Summary of changes:
 config/events.d/60.nfs             |   77 ++++++++++++++++++++++--------------
 config/functions                   |   18 ++++++--
 server/ctdb_statistics.c           |    1 +
 tests/simple/09_ctdb_ping.sh       |    2 +-
 tests/simple/14_ctdb_statistics.sh |    2 +-
 5 files changed, 63 insertions(+), 37 deletions(-)


Changeset truncated at 500 lines:

diff --git a/config/events.d/60.nfs b/config/events.d/60.nfs
index 57c81d3..038adbb 100755
--- a/config/events.d/60.nfs
+++ b/config/events.d/60.nfs
@@ -51,24 +51,43 @@ case "$1" in
 
 	# check that statd responds to rpc requests
 	# if statd is not running we try to restart it
-	rpcinfo -u localhost 100024 1 > /dev/null || {
-		RPCSTATDOPTS=""
-		[ -n "$STATD_HOSTNAME" ] && RPCSTATDOPTS="$RPCSTATDOPTS -n $STATD_HOSTNAME"
-		[ -n "$STATD_PORT" ] && RPCSTATDOPTS="$RPCSTATDOPTS -p $STATD_PORT"
-		[ -n "$STATD_OUTGOING_PORT" ] && RPCSTATDOPTS="$RPCSTATDOPTS -o $STATD_OUTGOING_PORT"
-		rpc.statd $RPCSTATDOPTS 
-		echo "ERROR: STATD is not responding. Trying to restart it. [rpc.statd $RPCSTATDOPTS]"
-	}
+	if ctdb_check_rpc "STATD" 100024 1 >/dev/null ; then
+		(service_name="nfs_statd"; ctdb_counter_init)
+	else
+		p="rpc.statd" ; cmd="$p"
+		cmd="${cmd}${STATD_HOSTNAME:+ -n }${STATD_HOSTNAME}"
+		cmd="${cmd}${STATD_PORT:+ -p }${STATD_PORT}"
+		cmd="${cmd}${STATD_OUTGOING_PORT:+ -o }${STATD_OUTGOING_PORT}"
+		(
+			service_name="nfs_statd"
+			ctdb_counter_incr
+			ctdb_check_counter_limit 10 quiet >/dev/null
+		) || {
+			echo "$ctdb_check_rpc_out"
+			echo "Trying to restart STATD [$cmd]"
+		}
+		$cmd
+	fi
 
 
 	# check that NFS responds to rpc requests
 	[ "$CTDB_NFS_SKIP_KNFSD_ALIVE_CHECK" = "yes" ] || {
-	    (ctdb_check_rpc "NFS" 100003 3)
-	    [ $? = "0" ] || {
-		echo "Trying to restart NFS service"
-		startstop_nfs restart
-		exit 1
-	    }
+	    if ctdb_check_rpc "NFS" 100003 3 >/dev/null ; then
+		(service_name="nfs_knfsd"; ctdb_counter_init)
+	    else
+		(
+			service_name="nfs_knfsd"
+			ctdb_counter_incr
+			ctdb_check_counter_limit 10 quiet >/dev/null
+		) || {
+			echo "$ctdb_check_rpc_out"
+			echo "Trying to restart NFS service"
+			startstop_nfs restart
+			exit 1
+		}
+		# we haven't hit the failure limit so restart quietly
+		startstop_nfs restart >/dev/null 2>&1 &
+	    fi
 	}
 
 	# and that its directories are available
@@ -79,8 +98,7 @@ case "$1" in
 	} || exit $?
 
 	# check that lockd responds to rpc requests
-	(ctdb_check_rpc "lockd" 100021 1)
-	[ $? = "0" ] || {
+	ctdb_check_rpc "LOCKD" 100021 1 || {
 		echo "Trying to restart lock manager service"
 		startstop_nfs restart
 		startstop_nfslock restart
@@ -89,31 +107,30 @@ case "$1" in
 
 	# mount needs special handling since it is sometimes not started
 	# correctly on RHEL5
-	rpcinfo -u localhost 100005 1 > /dev/null || {
-		echo "ERROR: MOUNTD is not running. Trying to restart it."
-		RPCMOUNTDOPTS=""
-		[ -n "$MOUNTD_PORT" ] && RPCMOUNTDOPTS="$RPCMOUNTDOPTS -p $MOUNTD_PORT"
-		killall -q -9 rpc.mountd
-		rpc.mountd $RPCMOUNTDOPTS &
+	ctdb_check_rpc "MOUNTD" 100005 1 || {
+		p="rpc.mountd"
+		cmd="${p}${MOUNTD_PORT:+ -p }${MOUNTD_PORT}"
+		echo "Trying to restart MOUNTD [${cmd}]"
+		killall -q -9 $p
+		$cmd &
 		exit 1
 	}
 	# rquotad needs special handling since it is sometimes not started
 	# correctly on RHEL5
 	# this is not a critical service so we dont flag the node as unhealthy
-	rpcinfo -u localhost 100011 1 > /dev/null || {
-		echo "ERROR: RQUOTAD is not running. Trying to restart it."
-		RPCRQUOTADOPTS=""
-		[ -n "$RQUOTAD_PORT" ] && RPCRQUOTADOPTS="$RPCRQUOTADOPTS -p $RQUOTAD_PORT"
-		killall -q -9 rpc.rquotad
-		rpc.rquotad $RPCRQUOTADOPTS &
+	ctdb_check_rpc "RQUOTAD" 100011 1 || {
+		p="rpc.rquotad"
+		cmd="${p}${RQUOTAD_PORT:+ -p }${RQUOTAD_PORT}"
+		echo "Trying to restart RQUOTAD [${cmd}]"
+		killall -q -9 $p
+		$cmd &
 	}
 
 	# once every 60 seconds, update the statd state database for which
 	# clients need notifications
 	LAST_UPDATE=`stat --printf="%Y" $CTDB_VARDIR/state/statd/update-trigger`
 	CURRENT_TIME=`date +"%s"`
-	expr "$CURRENT_TIME" ">" "(" "$LAST_UPDATE" "+" "60" ")" >/dev/null 2>/dev/null
-	[ $? = "0" ] && {
+	[ $CURRENT_TIME -ge $(($LAST_UPDATE + 60)) ] && {
 	    mkdir -p $CTDB_VARDIR/state/statd
 	    touch $CTDB_VARDIR/state/statd/update-trigger
 	    $CTDB_BASE/statd-callout updatelocal &
diff --git a/config/functions b/config/functions
index 9659d48..610085b 100755
--- a/config/functions
+++ b/config/functions
@@ -4,7 +4,7 @@ PATH=/bin:/usr/bin:/usr/sbin:/sbin:$PATH
 
 #######################################
 # pull in a system config file, if any
-loadconfig() {
+_loadconfig() {
 
     if [ -z "$1" ] ; then
 	foo="${service_config:-${service_name}}"
@@ -25,6 +25,10 @@ loadconfig() {
     fi
 }
 
+loadconfig () {
+    _loadconfig "$@"
+}
+
 ##############################################################
 # determine on what type of system (init style) we are running
 detect_init_style() {
@@ -144,10 +148,14 @@ ctdb_check_rpc() {
     progname="$1"
     prognum="$2"
     version="$3"
-    rpcinfo -u localhost $prognum $version > /dev/null || {
-	    echo "ERROR: $progname not responding to rpc requests"
-	    exit 1
-    }
+
+    ctdb_check_rpc_out=$(rpcinfo -u localhost $prognum $version 2>&1)
+    if [ $? -ne 0 ] ; then
+	ctdb_check_rpc_out="ERROR: $progname failed RPC check:
+$ctdb_check_rpc_out"
+	echo "$ctdb_check_rpc_out"
+	return 1
+    fi
 }
 
 ######################################################
diff --git a/server/ctdb_statistics.c b/server/ctdb_statistics.c
index aec17ad..29e6d6a 100644
--- a/server/ctdb_statistics.c
+++ b/server/ctdb_statistics.c
@@ -42,6 +42,7 @@ static void ctdb_statistics_update(struct event_context *ev, struct timed_event
 int ctdb_statistics_init(struct ctdb_context *ctdb)
 {
 	bzero(&ctdb->statistics, sizeof(struct ctdb_statistics));
+	ctdb->statistics.statistics_start_time = timeval_current();
 
 	bzero(&ctdb->statistics_current, sizeof(struct ctdb_statistics));
 	ctdb->statistics_current.statistics_start_time = timeval_current();
diff --git a/tests/simple/09_ctdb_ping.sh b/tests/simple/09_ctdb_ping.sh
index 6ca50d4..6610431 100755
--- a/tests/simple/09_ctdb_ping.sh
+++ b/tests/simple/09_ctdb_ping.sh
@@ -53,5 +53,5 @@ try_command_on_node -v 0 "! $CTDB ping -n 1"
 
 sanity_check_output \
     1 \
-    "(: ctdb_control error: 'ctdb_control to disconnected node'|Unable to get ping response from node 1|Node 1 is DISCONNECTED)" \
+    "(: ctdb_control error: ('ctdb_control to disconnected node'|'node is disconnected')|Unable to get ping response from node 1|Node 1 is DISCONNECTED|ctdb_control for getpnn failed|: Can not access node. Node is not operational\.)" \
     "$out"
diff --git a/tests/simple/14_ctdb_statistics.sh b/tests/simple/14_ctdb_statistics.sh
index 9a95a83..e9ecce5 100755
--- a/tests/simple/14_ctdb_statistics.sh
+++ b/tests/simple/14_ctdb_statistics.sh
@@ -33,7 +33,7 @@ set -e
 
 cluster_is_healthy
 
-pattern='^(CTDB version 1|Current time of statistics[[:space:]]*:.*|Statistics collected since[[:space:]]*:.*|Gathered statistics for [[:digit:]]+ nodes|[[:space:]]+[[:alpha:]_]+[[:space:]]+[[:digit:]]+|[[:space:]]+(node|client|timeouts)|[[:space:]]+([[:alpha:]_]+_latency|max_reclock_[[:alpha:]]+)[[:space:]]+[[:digit:]-]+\.[[:digit:]]+[[:space:]]sec)$'
+pattern='^(CTDB version 1|Current time of statistics[[:space:]]*:.*|Statistics collected since[[:space:]]*:.*|Gathered statistics for [[:digit:]]+ nodes|[[:space:]]+[[:alpha:]_]+[[:space:]]+[[:digit:]]+|[[:space:]]+(node|client|timeouts)|[[:space:]]+([[:alpha:]_]+_latency|max_reclock_[[:alpha:]]+)[[:space:]]+[[:digit:]-]+\.[[:digit:]]+[[:space:]]sec|[[:space:]]*(reclock_ctdbd|reclock_recd|call_latency|lockwait_latency|childwrite_latency)[[:space:]]+MIN/AVG/MAX[[:space:]]+[-.[:digit:]]+/[-.[:digit:]]+/[-.[:digit:]]+ sec out of [[:digit:]]+)$'
 
 try_command_on_node -v 1 "$CTDB statistics"
 


-- 
CTDB repository