[SCM] CTDB repository - branch master updated - ctdb-1.10-392-ga2c30d8

Ronnie Sahlberg sahlberg at samba.org
Sun Aug 14 23:54:40 MDT 2011


The branch, master has been updated
       via  a2c30d88348da47d1a733a16e4c7d83c3becb6df (commit)
       via  4e670d9bc1bdeb2abd7e846bc36e02f0aa0d7309 (commit)
       via  92f8e31f6995836b1668639a4dac2862efee269a (commit)
       via  0d17daab38d4086f922a8006d4c545133adca191 (commit)
       via  845fb0ba24cf9118470c58fae7103ab8322ce079 (commit)
       via  2e30a2bb4371a846c7a768affa15883211642d5c (commit)
       via  f9899b1b96056d23628356589c855cf2262e5152 (commit)
       via  06b322ad6eff8d4e691f8e014b7d85983b261147 (commit)
       via  0680437bf5f02aeaed6387370e58bbdba2c04f28 (commit)
       via  038916248a73d6a250108c9235c0c4f76dba8e0c (commit)
       via  fc62bf0975c6059ee467285565d0dc3b4daaf238 (commit)
       via  35942841229cc72ce363a7236aec708f1a33136b (commit)
       via  cd1442531ad079b11c60f46ee9d34f5104bef219 (commit)
       via  5f49537889a92c3cb68d9203912188bedf00ecd4 (commit)
       via  3e145ab1bb61ed2087ec5ce6183ee24802686ed3 (commit)
       via  380c9263eb37db5a250264316e250c2160908263 (commit)
       via  7b201c1087b1433cfbc95de76cb4205e484ccd6f (commit)
       via  fa93177442c65c2a4eb2d5d5dba0a0da1c486969 (commit)
       via  3402930319d462eab5525410f6a676952e120182 (commit)
       via  ab443c4d7d282f282792abc6a6ac224ab06abe30 (commit)
       via  f654739080b40b7ac1b7f998cacc689d3d4e3193 (commit)
       via  9b66057964756a6245bafb436eb6106fb6a2866e (commit)
       via  2036764bfd1a4571fcfcca22099c2b9a95a02c57 (commit)
       via  9600cc7a6b7b854fac1a5b080129e3df8fcbd84e (commit)
       via  bd39b91ad12fd05271a7fced0e6f9d8c4eba92e6 (commit)
      from  84ac667af408816e5508719b9fdb7c5e25408640 (commit)

http://gitweb.samba.org/?p=ctdb.git;a=shortlog;h=master


- Log -----------------------------------------------------------------
commit a2c30d88348da47d1a733a16e4c7d83c3becb6df
Author: Ronnie Sahlberg <ronniesahlberg at gmail.com>
Date:   Mon Aug 15 15:53:04 2011 +1000

    Change the errors for 10.interface to clearly state ERROR: for error messages
    
    Update the tests system to catch the new error strings generated by this change

commit 4e670d9bc1bdeb2abd7e846bc36e02f0aa0d7309
Merge: 0d17daab38d4086f922a8006d4c545133adca191 92f8e31f6995836b1668639a4dac2862efee269a
Author: Ronnie Sahlberg <ronniesahlberg at gmail.com>
Date:   Mon Aug 15 15:43:15 2011 +1000

    Merge remote branch 'martins/eventscript_tests'

commit 92f8e31f6995836b1668639a4dac2862efee269a
Author: Martin Schwenke <martin at meltin.net>
Date:   Mon Aug 15 15:40:35 2011 +1000

    Tests - exportfs stub needs to print out export options.
    
    This is needed due to bd39b91ad12fd05271a7fced0e6f9d8c4eba92e6.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit 0d17daab38d4086f922a8006d4c545133adca191
Merge: 845fb0ba24cf9118470c58fae7103ab8322ce079 fc62bf0975c6059ee467285565d0dc3b4daaf238
Author: Ronnie Sahlberg <ronniesahlberg at gmail.com>
Date:   Mon Aug 15 15:27:50 2011 +1000

    Merge remote branch 'martins/eventscript.10.interface'

commit 845fb0ba24cf9118470c58fae7103ab8322ce079
Merge: 2e30a2bb4371a846c7a768affa15883211642d5c bd39b91ad12fd05271a7fced0e6f9d8c4eba92e6
Author: Ronnie Sahlberg <ronniesahlberg at gmail.com>
Date:   Mon Aug 15 15:22:20 2011 +1000

    Merge remote branch 'martins/60_nfs_regression'

commit 2e30a2bb4371a846c7a768affa15883211642d5c
Merge: f9899b1b96056d23628356589c855cf2262e5152 3e145ab1bb61ed2087ec5ce6183ee24802686ed3
Author: Ronnie Sahlberg <ronniesahlberg at gmail.com>
Date:   Mon Aug 15 15:20:18 2011 +1000

    Merge remote branch 'martins/eventscript.60.nfs.rpc'

commit f9899b1b96056d23628356589c855cf2262e5152
Merge: 06b322ad6eff8d4e691f8e014b7d85983b261147 0680437bf5f02aeaed6387370e58bbdba2c04f28
Author: Ronnie Sahlberg <ronniesahlberg at gmail.com>
Date:   Mon Aug 15 15:16:06 2011 +1000

    Merge remote branch 'martins/test_suite'

commit 06b322ad6eff8d4e691f8e014b7d85983b261147
Merge: 038916248a73d6a250108c9235c0c4f76dba8e0c 2036764bfd1a4571fcfcca22099c2b9a95a02c57
Author: Ronnie Sahlberg <ronniesahlberg at gmail.com>
Date:   Mon Aug 15 15:15:12 2011 +1000

    Merge remote branch 'martins/eventscript_tests'

commit 0680437bf5f02aeaed6387370e58bbdba2c04f28
Author: Martin Schwenke <martin at meltin.net>
Date:   Mon Aug 15 13:53:39 2011 +1000

    Tests - ctdb listvars test should allow alphanumericals in tunable names.
    
    This matches the new "LCP2PublicIPs" tunable.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit 038916248a73d6a250108c9235c0c4f76dba8e0c
Author: Ronnie Sahlberg <ronniesahlberg at gmail.com>
Date:   Mon Aug 15 10:23:50 2011 +1000

    Change the default for ip failover to be LCP2 and not DeterministicIPs

commit fc62bf0975c6059ee467285565d0dc3b4daaf238
Author: Martin Schwenke <martin at meltin.net>
Date:   Tue Jul 5 17:21:57 2011 +1000

    Eventscripts: 10.interfaces - make startup event actually mark interfaces up!
    
    The startup event intends to mark interfaces up.  However, it doesn't
    actually do that because $INTERFACES is empty.
    
    This uses the function get_all_interfaces() to list the
    interfaces... and then mark them up.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit 35942841229cc72ce363a7236aec708f1a33136b
Author: Martin Schwenke <martin at meltin.net>
Date:   Tue Jul 5 17:20:09 2011 +1000

    Eventscripts: 10.interfaces - startup comment says assume all interfaces good.
    
    Interfaces are currently marked down.  Mark them up instead, as per
    the comment... and discussion with Ronnie.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit cd1442531ad079b11c60f46ee9d34f5104bef219
Author: Martin Schwenke <martin at meltin.net>
Date:   Tue Jul 5 17:18:30 2011 +1000

    Eventscripts: 10.interfaces - new function get_all_interfaces().
    
    Move existing interface listing code to new function in preparation
    for using it in startup event.
    
    While we're here change the "sort | uniq" into "sort -u" and save some
    complexity.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit 5f49537889a92c3cb68d9203912188bedf00ecd4
Author: Martin Schwenke <martin at meltin.net>
Date:   Tue Jun 28 17:07:39 2011 +1000

    Eventscripts: 10.interface clean-ups - minor tweaks and new comments.
    
    * sed can read files, it doesn't need a file piped to it
    * use $() subshells instead of `` - they seem to quote better in dash
    * tweak the uniquifying code so that it is easier to read
    * add comments
    * remove some extraneous semicolons at ends of lines
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit 3e145ab1bb61ed2087ec5ce6183ee24802686ed3
Author: Martin Schwenke <martin at meltin.net>
Date:   Fri Aug 12 16:30:54 2011 +1000

    Tests: re-enable the NFS eventscript tests - they work again.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit 380c9263eb37db5a250264316e250c2160908263
Author: Martin Schwenke <martin at meltin.net>
Date:   Fri Aug 12 16:28:09 2011 +1000

    Eventscripts: In 60.nfs don't restart NFS when restarting rpc.lockd.
    
    This effectively reverts 953dbfbddad656a64e30a6aca115cb1479d11573 and
    is a policy decision.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit 7b201c1087b1433cfbc95de76cb4205e484ccd6f
Author: Martin Schwenke <martin at meltin.net>
Date:   Tue Jun 28 16:50:47 2011 +1000

    Eventscripts: 10.interface clean-ups - variable name fix-ups.
    
    Change most of the uppercase variable names to lowercase for
    consistency with other variables, readability and so they can be
    easily distinguished from environment/configuration variables.  Change
    the name of 2 of the variables to add some clarity.  Changes are as
    follows:
    
      INTERFACES   -> all_interfaces
      IFACES       -> ctdb_interfaces
      IFACE        -> iface
      I            -> i
      REALIFACE    -> realiface
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit fa93177442c65c2a4eb2d5d5dba0a0da1c486969
Author: Martin Schwenke <martin at meltin.net>
Date:   Tue Jun 28 16:27:01 2011 +1000

    Eventscripts: 10.interfaces clean-ups - push logic into monitor_interfaces().
    
    The logic in the monitor event itself is very complex.  Nearly all of
    it can go away by adding a single check of
    $CTDB_PARTIALLY_ONLINE_INTERFACES to the return logic of
    monitor_interfaces() and reversing the sense of the corresponding
    check.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit 3402930319d462eab5525410f6a676952e120182
Author: Martin Schwenke <martin at meltin.net>
Date:   Tue Jun 28 16:10:23 2011 +1000

    Eventscripts: 10.interfaces clean-up - use more descriptive variable names.
    
    The name of variable $ok gives no clue to its meaning/use so this
    changes that variable to be named $up_interfaces_found.
    
    The return logic relating to $ok and $fail is difficult to read, so
    these variables are given true/false values, allowing the return logic
    to be simplified.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit ab443c4d7d282f282792abc6a6ac224ab06abe30
Author: Martin Schwenke <martin at meltin.net>
Date:   Tue Jun 28 15:53:54 2011 +1000

    Eventscripts: 10.interfaces cleanup - new functions mark_up(), mark_down().
    
    The same few lines of logic are used every time an interface is marked up or down.
    
    This encapsulates those few lines in 2 new functions.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit f654739080b40b7ac1b7f998cacc689d3d4e3193
Author: Martin Schwenke <martin at meltin.net>
Date:   Fri Jan 14 09:40:11 2011 +1100

    Eventscripts: change failure counts and behaviour for statd and nfsd.
    
    We reduce the number of failures before attempting a restart.
    However, after 6 failures we mark the cluster unhealthy and no longer
    try to restart.  If the previous 2 attempts didn't work then there
    isn't any use in bogging the system down with an attempted restart on
    every monitor event.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit 9b66057964756a6245bafb436eb6106fb6a2866e
Author: Martin Schwenke <martin at meltin.net>
Date:   Fri Dec 17 16:25:04 2010 +1100

    Eventscripts: clean up 60.nfs monitor event.
    
    This adds a helper function called nfs_check_rpc_service() and uses it
    to make the monitor event much more readable.  An example of usage is
    as follows:
    
      nfs_check_rpc_service "mountd" \
        -ge 10 "verbose restart:b unhealthy" \
        -eq 5 "restart:b"
    
    The first argument to nfs_check_rpc_service() is the name of the RPC
    service to be checked.  The RPC service corresponding to this command
    is checked for availability using the rpcinfo command.  If the service
    is available then the function succeeds and subsequent arguments are
    ignored.
    
    If the rpcinfo check fails then a failure counter for that particular
    RPC service is incremented and subsequent arguments are processed in
    groups of 3:
    
    1. An integer comparison operator supported by test.
    2. An integer failure limit.
    3. An action string.
    
    The value of the failure counter is checked using (1) and (2) above.
    The first check that succeeds has its action string processed - note
    that this explains the somewhat curious reverse ordering of checks.
    
    In the example above:
    
    * If the counter is >= 10 then a verbose message is printed
      describing the failure, the service is restarted in the background
      and the node is marked as unhealthy (via an "exit 1" from the
      function).
    
    * If the counter is == 5 then the service is restarted in the
      background.
    
    For more action options please see the code.
    
    This also changes the ctdb_check_rpc() function so that it no longer
    takes a program number to check.  It now just takes a real RPC program
    name that rpcinfo can resolve via /etc/rpc.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit 2036764bfd1a4571fcfcca22099c2b9a95a02c57
Author: Martin Schwenke <martin at meltin.net>
Date:   Thu Aug 11 15:33:46 2011 +1000

    Tests: Re-enable the Samba eventscript tests.
    
    They work again.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

commit 9600cc7a6b7b854fac1a5b080129e3df8fcbd84e
Author: Martin Schwenke <martin at meltin.net>
Date:   Thu Aug 11 15:32:28 2011 +1000

    Revert "Tests: tweak some samba tests to cope with debug from ctdb_check_tcp_ports()."
    
    This reverts commit 557ac30e60516742da10b83bfbbbb41430c977a2.

commit bd39b91ad12fd05271a7fced0e6f9d8c4eba92e6
Author: Martin Schwenke <martin at meltin.net>
Date:   Wed Apr 13 12:37:42 2011 +1000

    Eventscripts: fix regression in 60.nfs export checking.
    
    Commit 35a60a63a9b5c7d98dde514ae552239506b691c9 introduced a
    regression, reported by "Jonathan Buzzard" <J.Buzzard at dundee.ac.uk>,
    as follows:
    
      Basically the use of sed in the following code snippet does not work
      for long exports where exportfs wraps the host or network onto the
      next line.
    
             exportfs | grep -v '^#' | grep '^/' |
             sed -e 's/[[:space:]]*[^[:space:]]*$//' |
             ctdb_check_directories
    
      The result is that you get lots of blank lines being sent to
      ctdb_check_directories which causes the host to be marked as
      unhealthy and then thrashing sets in of the managed IP's making the
      whole cluster unusable.
    
    This tightens up the sed expression so that it is less likely to
    produce a spurious empty line.  It also removes an unnecessary "grep -v".
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>

-----------------------------------------------------------------------

Summary of changes:
 config/events.d/10.interface                       |  146 ++++++++++----------
 config/events.d/60.nfs                             |  141 ++++---------------
 config/functions                                   |  134 +++++++++++++++++-
 server/ctdb_tunables.c                             |    4 +-
 .../simple/10.interface.monitor.013.sh             |    2 +-
 .../simple/10.interface.releaseip.001.sh           |    2 +-
 .../eventscripts/simple/10.interface.takeip.001.sh |    2 +-
 tests/eventscripts/simple/50.samba.monitor.103.sh  |    8 +-
 tests/eventscripts/simple/50.samba.monitor.104.sh  |    8 +-
 tests/eventscripts/stubs/exportfs                  |    2 +-
 tests/simple/02_ctdb_listvars.sh                   |    2 +-
 11 files changed, 238 insertions(+), 213 deletions(-)
 mode change 100644 => 100755 tests/eventscripts/simple/50.samba.monitor.001.sh
 mode change 100644 => 100755 tests/eventscripts/simple/50.samba.monitor.050.sh
 mode change 100644 => 100755 tests/eventscripts/simple/50.samba.monitor.051.sh
 mode change 100644 => 100755 tests/eventscripts/simple/50.samba.monitor.101.sh
 mode change 100644 => 100755 tests/eventscripts/simple/50.samba.monitor.102.sh
 mode change 100644 => 100755 tests/eventscripts/simple/50.samba.monitor.103.sh
 mode change 100644 => 100755 tests/eventscripts/simple/50.samba.monitor.104.sh
 mode change 100644 => 100755 tests/eventscripts/simple/50.samba.monitor.105.sh
 mode change 100644 => 100755 tests/eventscripts/simple/50.samba.monitor.106.sh
 mode change 100644 => 100755 tests/eventscripts/simple/60.nfs.monitor.001.sh
 mode change 100644 => 100755 tests/eventscripts/simple/60.nfs.monitor.100.sh
 mode change 100644 => 100755 tests/eventscripts/simple/60.nfs.monitor.101.sh
 mode change 100644 => 100755 tests/eventscripts/simple/60.nfs.monitor.111.sh
 mode change 100644 => 100755 tests/eventscripts/simple/60.nfs.monitor.112.sh
 mode change 100644 => 100755 tests/eventscripts/simple/60.nfs.monitor.121.sh
 mode change 100644 => 100755 tests/eventscripts/simple/60.nfs.monitor.122.sh
 mode change 100644 => 100755 tests/eventscripts/simple/60.nfs.monitor.131.sh
 mode change 100644 => 100755 tests/eventscripts/simple/60.nfs.monitor.132.sh
 mode change 100644 => 100755 tests/eventscripts/simple/60.nfs.monitor.141.sh
 mode change 100644 => 100755 tests/eventscripts/simple/60.nfs.monitor.142.sh
 mode change 100644 => 100755 tests/eventscripts/simple/60.nfs.monitor.151.sh
 mode change 100644 => 100755 tests/eventscripts/simple/60.nfs.monitor.152.sh
 mode change 100644 => 100755 tests/eventscripts/simple/60.nfs.monitor.153.sh


Changeset truncated at 500 lines:

diff --git a/config/events.d/10.interface b/config/events.d/10.interface
index baf3171..f75e6af 100755
--- a/config/events.d/10.interface
+++ b/config/events.d/10.interface
@@ -16,104 +16,117 @@ loadconfig
 	exit 0
 }
 
-monitor_interfaces()
+mark_up ()
+{
+    up_interfaces_found=true
+    ctdb setifacelink $1 up >/dev/null 2>&1
+}
+
+mark_down ()
 {
-	INTERFACES=`cat $CTDB_PUBLIC_ADDRESSES |
-		sed -e "s/^[^\t ]*[\t ]*//" -e "s/,/ /g" -e "s/[\t ]*$//"`
+    fail=true
+    ctdb setifacelink $1 down >/dev/null 2>&1
+}
 
-	[ "$CTDB_PUBLIC_INTERFACE" ] && INTERFACES="$CTDB_PUBLIC_INTERFACE $INTERFACES"
-	[ "$CTDB_NATGW_PUBLIC_IFACE" ] && INTERFACES="$CTDB_NATGW_PUBLIC_IFACE $INTERFACES"
+# This sets $all_interfaces as a side-effect.
+get_all_interfaces ()
+{
+    # Get all the interfaces listed in the public_addresses file
+    all_interfaces=$(sed -e "s/^[^\t ]*[\t ]*//" -e "s/,/ /g" -e "s/[\t ]*$//" $CTDB_PUBLIC_ADDRESSES)
 
+    # Add some special interfaces if they're defined
+    [ "$CTDB_PUBLIC_INTERFACE" ] && all_interfaces="$CTDB_PUBLIC_INTERFACE $all_interfaces"
+    [ "$CTDB_NATGW_PUBLIC_IFACE" ] && all_interfaces="$CTDB_NATGW_PUBLIC_IFACE $all_interfaces"
 
-	# For all but the 1st line, get the 2nd last field with commas
-	# changes to spaces.
-	IFACES=`ctdb -Y ip -v | sed -e '1d' -e 's/:[^:]*:$//' -e 's/^.*://' -e 's/,/ /g'`
+    # For all but the 1st line, get the 2nd last field with commas
+    # changes to spaces.
+    ctdb_ifaces=$(ctdb -Y ip -v | sed -e '1d' -e 's/:[^:]*:$//' -e 's/^.*://' -e 's/,/ /g')
+
+    # Add $ctdb_interfaces and uniquify
+    all_interfaces=$(echo $all_interfaces $ctdb_ifaces | tr ' ' '\n' | sort -u)
+}
+
+monitor_interfaces()
+{
+	get_all_interfaces
 
-	INTERFACES=`for IFACE in $INTERFACES $IFACES ; do echo $IFACE ; done | sort | uniq`
+	fail=false
+	up_interfaces_found=false
 
-	fail=0
-	ok=0
-	for IFACE in $INTERFACES ; do
+	for iface in $all_interfaces ; do
 
-	    ip addr show $IFACE 2>/dev/null >/dev/null || {
-		echo Interface $IFACE does not exist but it is used by public addresses.
+	    ip addr show $iface 2>/dev/null >/dev/null || {
+		echo "WARNING: Interface $iface does not exist but it is used by public addresses."
 		continue
 	    }
 
 	    # These interfaces are sometimes bond devices
 	    # When we use VLANs for bond interfaces, there will only
 	    # be an entry in /proc for the underlying real interface
-	    REALIFACE=`echo $IFACE |sed -e 's/\..*$//'`
-	    bi=$(get_proc "net/bonding/$REALIFACE" 2>/dev/null) && {
+	    realiface=`echo $iface |sed -e 's/\..*$//'`
+	    bi=$(get_proc "net/bonding/$realiface" 2>/dev/null) && {
 		echo "$bi" | grep -q 'Currently Active Slave: None' && {
-			echo "ERROR: No active slaves for bond device $REALIFACE"
-			fail=1
-			ctdb setifacelink $IFACE down >/dev/null 2>/dev/null
-			continue;
+			echo "ERROR: No active slaves for bond device $realiface"
+			mark_down $iface
+			continue
 		}
 		echo "$bi" | grep -q '^MII Status: up' || {
-			echo "ERROR: public network interface $REALIFACE is down"
-			fail=1
-			ctdb setifacelink $IFACE down >/dev/null 2>/dev/null
-			continue;
+			echo "ERROR: public network interface $realiface is down"
+			mark_down $iface
+			continue
 		}
 		echo "$bi" | grep -q '^Bonding Mode: IEEE 802.3ad Dynamic link aggregation' && {
+			# This works around a bug in the driver where the
+			# overall bond status can be up but none of the actual
+			# physical interfaces have a link.
 			echo "$bi" | grep 'MII Status:' | tail -n +2 | grep -q '^MII Status: up' || {
-				echo No active slaves for 802.ad bond device $REALIFACE
-				ctdb setifacelink $IFACE down >/dev/null 2>/dev/null
-				fail=1
+				echo "ERROR: No active slaves for 802.ad bond device $realiface"
+				mark_down $iface
 				continue
 			}
 		}
-		ok=1 # we only set ok for interfaces known to ctdbd
-		ctdb setifacelink $IFACE up >/dev/null 2>/dev/null
-		continue;
+		mark_up $iface
+		continue
 	    }
 
-	    case $IFACE in
+	    case $iface in
 	    lo*)
 		# loopback is always working
-		ok=1 # we only set ok for interfaces known to ctdbd
-		ctdb setifacelink $IFACE up >/dev/null 2>/dev/null
+		mark_up $iface
 		;;
 	    ib*)
 		# we dont know how to test ib links
-		ok=1 # we only set ok for interfaces known to ctdbd
-		ctdb setifacelink $IFACE up >/dev/null 2>/dev/null
+		mark_up $iface
 		;;
 	    *)
-		[ -z "$IFACE" ] || {
-		    [ "$(basename $(readlink /sys/class/net/$IFACE/device/driver) 2>/dev/null)" = virtio_net ] ||
-		    ethtool $IFACE | grep -q 'Link detected: yes' || {
+		[ -z "$iface" ] || {
+		    [ "$(basename $(readlink /sys/class/net/$iface/device/driver) 2>/dev/null)" = virtio_net ] ||
+		    ethtool $iface | grep -q 'Link detected: yes' || {
 			# On some systems, this is not successful when a
 			# cable is plugged but the interface has not been
 			# brought up previously. Bring the interface up and
 			# try again...
-			ip link set $IFACE up
-			ethtool $IFACE | grep -q 'Link detected: yes' || {
-			    echo "ERROR: No link on the public network interface $IFACE"
-			    fail=1
-			    ctdb setifacelink $IFACE down >/dev/null 2>/dev/null
+			ip link set $iface up
+			ethtool $iface | grep -q 'Link detected: yes' || {
+			    echo "ERROR: No link on the public network interface $iface"
+			    mark_down $iface
 			    continue
 			}
 		    }
-		    ok=1 # we only set ok for interfaces known to ctdbd
-		    ctdb setifacelink $IFACE up >/dev/null 2>/dev/null
+		    mark_up $iface
 		}
 		;;
 	    esac
 
 	done
 
-	test x"$fail" = x"0" && {
-		return 0;
-	}
+	$fail || return 0
 
-	test x"$ok" = x"1" && {
-		return 2;
-	}
+	$up_interfaces_found && \
+	    [ "$CTDB_PARTIALLY_ONLINE_INTERFACES" = "yes" ] && \
+	    return 0
 
-	return 1;
+	return 1
 }
 
 case "$1" in 
@@ -132,10 +145,9 @@ case "$1" in
      # and we start the services to become healthy
      startup)
 	# Assume all links are good initially
-	INTERFACES=`for IFACE in $INTERFACES ; do echo $IFACE ; done | sort | uniq`
-
-	for IFACE in $INTERFACES ; do
-		ctdb setifacelink $IFACE down >/dev/null 2>/dev/null
+	get_all_interfaces
+	for iface in $all_interfaces ; do
+		ctdb setifacelink $iface up >/dev/null 2>/dev/null
 	done
 	
 	monitor_interfaces
@@ -147,7 +159,7 @@ case "$1" in
      # called when ctdbd wants to claim an IP address
      takeip)
 	if [ $# != 4 ]; then
-	   echo "must supply interface, IP and maskbits"
+	   echo "ERROR: must supply interface, IP and maskbits"
 	   exit 1
 	fi
 	iface=$2
@@ -170,7 +182,7 @@ case "$1" in
      # called when ctdbd wants to release an IP address
      releaseip)
 	if [ $# != 4 ]; then
-	   echo "must supply interface, IP and maskbits"
+	   echo "ERROR: must supply interface, IP and maskbits"
 	   exit 1
 	fi
 
@@ -209,7 +221,7 @@ case "$1" in
      # called when ctdbd wants to update an IP address
      updateip)
 	if [ $# != 5 ]; then
-	   echo "must supply old interface, new interface, IP and maskbits"
+	   echo "ERROR: must supply old interface, new interface, IP and maskbits"
 	   exit 1
 	fi
 
@@ -270,21 +282,7 @@ case "$1" in
 	;;
 
      monitor)
-	monitor_interfaces
-	ret=$?
-
-	test x"$ret" = x"2" && {
-		test x"$CTDB_PARTIALLY_ONLINE_INTERFACES" != x"yes" && {
-			exit 1;
-		}
-		# as long as we have one interface available don't become
-		# unhealthy
-		ret=0
-	}
-
-	test x"$ret" != x"0" && {
-		exit 1;
-	}
+	monitor_interfaces || exit 1
 	;;
     *)
 	ctdb_standard_event_handler "$@"
diff --git a/config/events.d/60.nfs b/config/events.d/60.nfs
index e778048..19a9ea8 100755
--- a/config/events.d/60.nfs
+++ b/config/events.d/60.nfs
@@ -59,11 +59,11 @@ case "$1" in
 	;;
 
       monitor)
-	# and that its directories are available
+	# Check that directories for shares actually exist.
 	[ "$CTDB_NFS_SKIP_SHARE_CHECK" = "yes" ] || {
-	    exportfs | grep -v '^#' | grep '^/' |
-	    sed -e 's/[[:space:]]\+[^[:space:]]*$//' |
-	    ctdb_check_directories
+	    exportfs -v | grep '^/' | 
+	    sed -r -e 's@[[:space:]]+[^[:space:]()]+\([^[:space:]()]+\)$@@' | 
+	    ctdb_check_directories 
 	} || exit $?
 
 	update_tickles 2049
@@ -73,118 +73,35 @@ case "$1" in
 	# we only do this IF we have a rpc.statd command.
 	# For platforms where rpc.statd does not exist, we skip
 	# the check completely
-	p="rpc.statd"
-	which $p >/dev/null 2>/dev/null && {
-		if ctdb_check_rpc "STATD" 100024 1 >/dev/null ; then
-			(service_name="nfs_statd"; ctdb_counter_init)
-		else
-			cmd="$p"
-			cmd="${cmd}${STATD_HOSTNAME:+ -n }${STATD_HOSTNAME}"
-			cmd="${cmd}${STATD_PORT:+ -p }${STATD_PORT}"
-			cmd="${cmd}${STATD_OUTGOING_PORT:+ -o }${STATD_OUTGOING_PORT}"
-			(
-				service_name="nfs_statd"
-				ctdb_counter_incr
-				ctdb_check_counter_limit 10 quiet >/dev/null
-			) || {
-				echo "$ctdb_check_rpc_out"
-				echo "Trying to restart STATD [$cmd]"
-				$cmd
-			}
-		fi
-	}
+        p="rpc.statd"
+        which $p >/dev/null 2>/dev/null && \
+	    nfs_check_rpc_service "statd" \
+	        -ge 6 "verbose unhealthy" \
+	        -eq 4 "verbose restart" \
+		-eq 2 "restart:bs"
 
 	# check that NFS responds to rpc requests
-	[ "$CTDB_NFS_SKIP_KNFSD_ALIVE_CHECK" = "yes" ] || {
-	    if ctdb_check_rpc "NFS" 100003 3 >/dev/null ; then
-		(service_name="nfs_knfsd"; ctdb_counter_init)
-	    else
-		(
-			service_name="nfs_knfsd"
-			ctdb_counter_incr
-
-			ctdb_check_counter_equal 2 || {
-				echo "Trying to restart NFS service"
-				startstop_nfs restart >/dev/null 2>&1 &
-				exit 0
-			}
-
-			ctdb_check_counter_limit 5 quiet >/dev/null
-		) || {
-			echo "$ctdb_check_rpc_out"
-			echo "Trying to restart NFS service"
-			startstop_nfs restart
-			exit 1
-		}
-	    fi
-	}
-
-	# check that lockd responds to rpc requests
-	if ctdb_check_rpc "LOCKD" 100021 1 >/dev/null ; then
-		(service_name="lockd"; ctdb_counter_init)
-	else
-		(
-			service_name="lockd"
-			ctdb_counter_incr
-
-			ctdb_check_counter_equal 10 || {
-				echo "Trying to restart NFS lock service"
-				startstop_nfs restart >/dev/null 2>&1 &
-				startstop_nfslock restart  >/dev/null 2>&1 &
-				exit 0
-			}
-
-			ctdb_check_counter_limit 15 quiet >/dev/null
-	) || {
-			echo "$ctdb_check_rpc_out"
-			echo "Trying to restart NFS lock service"
-			startstop_nfs restart
-			startstop_nfslock restart
-			exit 1
-		}
-	fi
-
-	# mount needs special handling since it is sometimes not started
-	# correctly on RHEL5
-	if ctdb_check_rpc "MOUNTD" 100005 1 >/dev/null ; then
-		(service_name="nfs_mountd"; ctdb_counter_init)
-	else
-	(
-		service_name="nfs_mountd"
-		ctdb_counter_incr
-
-		ctdb_check_counter_equal 5 || {
-			p="rpc.mountd"
-			cmd="${p}${MOUNTD_PORT:+ -p }${MOUNTD_PORT}"
-			echo "Trying to restart MOUNTD [${cmd}]"
-			killall -q -9 $p
-			$cmd &
-			exit 0
-		}
-
-		ctdb_check_counter_limit 10 quiet >/dev/null
-	) || {
-		echo "$ctdb_check_rpc_out"
-		p="rpc.mountd"
-		cmd="${p}${MOUNTD_PORT:+ -p }${MOUNTD_PORT}"
-		echo "Trying to restart MOUNTD [${cmd}]"
-		killall -q -9 $p
-		$cmd &
-		exit 1
-	}
+	if [ "$CTDB_NFS_SKIP_KNFSD_ALIVE_CHECK" != "yes" ] ; then
+	    nfs_check_rpc_service "knfsd" \
+		-ge 6 "verbose unhealthy" \
+		-eq 4 "verbose restart" \
+		-eq 2 "restart:bs"
 	fi
 
-
-	# rquotad needs special handling since it is sometimes not started
-	# correctly on RHEL5
-	# this is not a critical service so we dont flag the node as unhealthy
-	ctdb_check_rpc "RQUOTAD" 100011 1 || {
-		p="rpc.rquotad"
-		cmd="${p}${RQUOTAD_PORT:+ -p }${RQUOTAD_PORT}"
-		echo "Trying to restart RQUOTAD [${cmd}]"
-		killall -q -9 $p
-		$cmd &
-	}
+	# check that lockd responds to rpc requests
+	nfs_check_rpc_service "lockd" \
+	    -ge 15 "verbose restart unhealthy" \
+	    -eq 10 "restart:bs"
+
+	# mountd is sometimes not started correctly on RHEL5
+	nfs_check_rpc_service "mountd" \
+	    -ge 10 "verbose restart:b unhealthy" \
+	    -eq 5 "restart:b"
+
+	# rquotad is sometimes not started correctly on RHEL5
+	# not a critical service so we dont flag the node as unhealthy
+	nfs_check_rpc_service "rquotad" \
+	    -gt 0 "verbose restart:b"
 
 	# once every 600 seconds, update the statd state database for which
 	# clients need notifications
diff --git a/config/functions b/config/functions
index 2668531..b049652 100755
--- a/config/functions
+++ b/config/functions
@@ -106,17 +106,139 @@ get_proc ()
 }
 
 ######################################################
+# Check that an RPC service is healthy -
+# this includes allowing a certain number of failures
+# before marking the NFS service unhealthy.
+#
+# usage: nfs_check_rpc_service SERVICE_NAME [ triple ...]
+#
+# each triple is a set of 3 arguments: an operator, a 
+# fail count limit and an action string.
+#
+# For example:
+#
+# 	nfs_check_rpc_service "lockd" \
+#	    -ge 15 "verbose restart unhealthy" \
+#	    -eq 10 "restart:bs"
+#
+# says that if lockd is down for 15 iterations then do
+# a verbose restart of lockd and mark the node unhealthy.
+# Before this, after 10 iterations of failure, the
+# service is restarted silently in the background.
+# Order is important: the number of failures need to be
+# specified in reverse order because processing stops
+# after the first condition that is true.
+######################################################
+nfs_check_rpc_service ()
+{
+    _prog_name="$1" ; shift
+
+    _version=1
+    _rpc_prog="$_prog_name"
+    _restart=""
+    _opts=""
+    case "$_prog_name" in
+	knfsd)
+	    _rpc_prog=nfs
+	    _version=3
+	    _restart="echo 'Trying to restart NFS service'"
+	    _restart="${_restart}; startstop_nfs restart"
+	    ;;
+	mountd)
+	    _opts="${MOUNTD_PORT:+ -p }${MOUNTD_PORT}"
+	    ;;
+	rquotad)
+	    _opts="${RQUOTAD_PORT:+ -p }${RQUOTAD_PORT}"
+	    ;;
+	lockd)
+	    _rpc_prog=nlockmgr
+	    _version=4
+	    _restart="echo 'Trying to restart lock manager service'"
+	    _restart="${_restart}; startstop_nfslock restart"
+	    ;;
+	statd)
+	    _rpc_prog=status
+	    _opts="${STATD_HOSTNAME:+ -n }${STATD_HOSTNAME}"
+	    _opts="${_opts}${STATD_PORT:+ -p }${STATD_PORT}"
+	    _opts="${_opts}${STATD_OUTGOING_PORT:+ -o }${STATD_OUTGOING_PORT}"
+	    ;;
+	*)
+	    echo "Internal error: unknown RPC program \"$_prog_name\"."
+	    exit 1
+    esac
+
+    _service_name="nfs_${_prog_name}"
+
+    if ctdb_check_rpc "$_rpc_prog" $_version >/dev/null ; then
+	ctdb_counter_init "$_service_name"
+	return 0
+    fi
+
+    ctdb_counter_incr "$_service_name"
+
+    while [ -n "$3" ] ; do
+	ctdb_check_counter "quiet" "$1" "$2" "$_service_name" || {
+	    for _action in $3 ; do
+		case "$_action" in
+		    verbose)
+			echo "$ctdb_check_rpc_out"
+			;;
+		    restart|restart:*)
+			# No explicit command specified, construct rpc command.
+			if [ -z "$_restart" ] ; then
+			    _p="rpc.${_prog_name}"
+			    _restart="echo 'Trying to restart $_prog_name [${_p}${_opts}]'"
+			    _restart="${_restart}; killall -q -9 $_p"
+			    _restart="${_restart}; $_p $_opts"
+			fi
+
+			# Process restart flags...
+			_flags="${_action#restart:}"
+			# There may not have been a colon...
+			[ "$_flags" != "$_action" ] || _flags=""


-- 
CTDB repository


More information about the samba-cvs mailing list