[SCM] CTDB repository - branch master updated - ctdb-1.0.114-277-g230bec8

Mon Aug 30 02:20:31 MDT 2010

The branch, master has been updated
       via  230bec8d375b778b20ff3cb7f9864c26323997f3 (commit)
       via  fc0678d351187cfa4c71123f97c0f493aacd5d16 (commit)
       via  7f4c591388adae20e98984001385cba26598ec67 (commit)
      from  292d7435a360efd7f15a7a99f658a605e07c0a81 (commit)

http://gitweb.samba.org/?p=sahlberg/ctdb.git;a=shortlog;h=master


- Log -----------------------------------------------------------------
commit 230bec8d375b778b20ff3cb7f9864c26323997f3
Author: Ronnie Sahlberg <ronniesahlberg at gmail.com>
Date:   Mon Aug 30 18:15:41 2010 +1000

    Remove the mention of the tickle and statd directories in shared storage, now that we are removing these and migrating to storing the data inside ctdbd or persistent databases

commit fc0678d351187cfa4c71123f97c0f493aacd5d16
Author: Ronnie Sahlberg <ronniesahlberg at gmail.com>
Date:   Mon Aug 30 18:13:28 2010 +1000

    Remove the dependency on the underlying cluster filesystem for handling
    the clusterwide persistent data associated with the lock manager and
    statd notifications.
    
    Use persistent databases to store this data instead of a shared directory.

commit 7f4c591388adae20e98984001385cba26598ec67
Author: Ronnie Sahlberg <ronniesahlberg at gmail.com>
Date:   Mon Aug 30 18:08:38 2010 +1000

        Add a new event "ipreallocated"
        This is called every time a reallocation is performed.
    
        While STARTRECOVERY/RECOVERED events are only called when
        we do ipreallocation as part of a full database/cluster recovery,
        this new event can be used to trigger on when we just do a light
        failover due to a node becoming unhealthy.
    
        I.e. situations where we do a failover but we do not perform a full
        cluster recovery.
    
        Use this to trigger for natgw so we select a new natgw master node
        when failover happens and not just when cluster rebuilds happen.

-----------------------------------------------------------------------

Summary of changes:
 common/ctdb_util.c       |    1 +
 config/events.d/11.natgw |    2 +-
 config/events.d/60.nfs   |   49 +++++++--------
 config/statd-callout     |  154 +++++++++++++++++++++++++++++-----------------
 include/ctdb_protocol.h  |    1 +
 server/ctdb_takeover.c   |   17 +++++-
 server/eventscript.c     |    1 +
 web/nfs.html             |   19 ------
 8 files changed, 140 insertions(+), 104 deletions(-)


Changeset truncated at 500 lines:

diff --git a/common/ctdb_util.c b/common/ctdb_util.c
index add2c3a..4244a04 100644
--- a/common/ctdb_util.c
+++ b/common/ctdb_util.c
@@ -685,5 +685,6 @@ const char *ctdb_eventscript_call_names[] = {
 	"status",
 	"shutdown",
 	"reload",
+	"ipreallocated",
 	"updateip"
 };
diff --git a/config/events.d/11.natgw b/config/events.d/11.natgw
index 46de7fa..61c09e9 100755
--- a/config/events.d/11.natgw
+++ b/config/events.d/11.natgw
@@ -46,7 +46,7 @@ case "$1" in
 	ctdb setnatgwstate on
 	;;
 
-    recovered|updatenatgw)
+    recovered|updatenatgw|ipreallocated)
 	MYPNN=`ctdb pnn | cut -d: -f2`
 	NATGWMASTER=`ctdb natgwlist | head -1 | sed -e "s/ .*//"`
 	NATGWIP=`ctdb natgwlist | head -1 | sed -e "s/^[^ ]* *//"`
diff --git a/config/events.d/60.nfs b/config/events.d/60.nfs
index 637efe8..15c2b89 100755
--- a/config/events.d/60.nfs
+++ b/config/events.d/60.nfs
@@ -4,36 +4,29 @@
 start_nfs() {
 	/bin/mkdir -p $CTDB_BASE/state/nfs
 	/bin/mkdir -p $CTDB_BASE/state/statd/ip
-	/bin/mkdir -p $STATD_SHARED_DIRECTORY
 	startstop_nfs stop
 	startstop_nfs start
 }
 
-reconfigure_nfs() {
-	# always restart the lockmanager so that we start with a clusterwide
-	# graceperiod when ip addresses has changed
-	[ -x $CTDB_BASE/statd-callout ] && {
-		$CTDB_BASE/statd-callout notify &
-	} >/dev/null 2>&1
-
-}
-
 . $CTDB_BASE/functions
 
 service_name="nfs"
 service_start="start_nfs"
 service_stop="startstop_nfs stop"
-service_reconfigure="reconfigure_nfs"
 
 loadconfig
 
-[ -z "$STATD_SHARED_DIRECTORY" ] && exit 0
-
 ctdb_start_stop_service
 
+echo XX 60.nfs   $@
+
 case "$1" in 
+     init)
+	# read statd from persistent database
+	;;
      startup)
 	ctdb_service_start
+	touch $CTDB_BASE/state/statd/update-trigger
 	;;
 
      shutdown)
@@ -42,20 +35,10 @@ case "$1" in
 
      takeip)
 	ctdb_service_set_reconfigure
-	touch $CTDB_BASE/state/statd/ip/$3
 	;;
 
      releaseip)
 	ctdb_service_set_reconfigure
-	/bin/rm -f $CTDB_BASE/state/statd/ip/$3
-	;;
-
-     recovered)
-	# if we have taken or released any ips we must 
-	# restart the lock manager so that we enter a clusterwide grace period
-	if ctdb_service_needs_reconfigure ; then
-	    ctdb_service_reconfigure
-	fi
 	;;
 
       monitor)
@@ -95,8 +78,6 @@ case "$1" in
 
 	# check that lockd responds to rpc requests
 	ctdb_check_rpc "lockd" 100021 1
-	echo "$STATD_SHARED_DIRECTORY" | ctdb_check_directories "statd" || \
-	    exit $?
 
 	# mount needs special handling since it is sometimes not started
 	# correctly on RHEL5
@@ -118,8 +99,26 @@ case "$1" in
 		killall -q -9 rpc.rquotad
 		rpc.rquotad $RPCRQUOTADOPTS &
 	}
+
+	# once every 60 seconds, update the statd state database for which
+	# clients need notifications
+	LAST_UPDATE=`stat --printf="%Y" $CTDB_BASE/state/statd/update-trigger`
+	CURRENT_TIME=`date +"%s"`
+	expr "$CURRENT_TIME" ">" "(" "$LAST_UPDATE" "+" "60" ")" >/dev/null 2>/dev/null
+	[ $? = "0" ] && {
+	    touch $CTDB_BASE/state/statd/update-trigger
+	    $CTDB_BASE/statd-callout updatelocal &
+	    $CTDB_BASE/statd-callout updateremote &
+	}
        	;;
 
+    ipreallocated)
+	# if the ips have been reallocated, we must restart the lockmanager
+	# across all nodes and ping all statd listeners
+	[ -x $CTDB_BASE/statd-callout ] && {
+		$CTDB_BASE/statd-callout notify &
+	} >/dev/null 2>&1
+	;;
     *)
 	ctdb_standard_event_handler "$@"
 	;;
diff --git a/config/statd-callout b/config/statd-callout
index 168975c..461cd81 100755
--- a/config/statd-callout
+++ b/config/statd-callout
@@ -13,13 +13,6 @@
 loadconfig ctdb
 loadconfig nfs
 
-[ -z "$STATD_SHARED_DIRECTORY" ] && {
-	echo STATD_SHARED_DIRECTORY not configured. statd-callout failed.
-	exit 0
-}
-
-[ -d $STATD_SHARED_DIRECTORY ] || exit 0
-
 [ -z $NFS_HOSTNAME ] && {
 	echo NFS_HOSTNAME is not configured. statd-callout failed.
 	exit 0
@@ -29,20 +22,85 @@ case "$1" in
   add-client)
 	# the callout does not tell us to which ip the client connected
 	# so we must add it to all the ips that we serve
-        for f in $CTDB_BASE/state/statd/ip/*; do
-	    ip=`basename $f`
-	    [ -d $STATD_SHARED_DIRECTORY/$ip ] || /bin/mkdir $STATD_SHARED_DIRECTORY/$ip
-	    touch $STATD_SHARED_DIRECTORY/$ip/$2
+	PNN=`ctdb xpnn | sed -e "s/.*://"`
+	ctdb ip -Y | while read LINE; do
+		NODE=`echo $LINE | cut -f3 -d:`
+		[ "$NODE" = "$PNN" ] || {
+			# not us
+			continue
+		} 
+		IP=`echo $LINE | cut -f2 -d:`
+		/bin/mkdir -p $CTDB_BASE/state/statd/ip/$IP
+		touch $CTDB_BASE/state/statd/ip/$IP/$2
 	done
 	;;
   del-client)
-	# the callout does not tell us to which ip the client connected
-	# so we must add it to all the ips that we serve
-        for f in $CTDB_BASE/state/statd/ip/*; do
-	    ip=`basename $f`
-	    /bin/rm -f $STATD_SHARED_DIRECTORY/$ip/$2
+	# the callout does not tell us to which ip the client disconnected
+	# so we must remove it from all the ips that we serve
+	PNN=`ctdb xpnn | sed -e "s/.*://"`
+	ctdb ip -Y | while read LINE; do
+		NODE=`echo $LINE | cut -f3 -d:`
+		[ "$NODE" = "$PNN" ] || {
+			# not us
+			continue
+		} 
+		IP=`echo $LINE | cut -f2 -d:`
+		/bin/rm -f $CTDB_BASE/state/statd/ip/$IP/$2
+	done
+	;;
+  updatelocal)
+	# For all IPs we serve, collect info and push to the config database
+	PNN=`ctdb xpnn | sed -e "s/.*://"`
+	ctdb ip -Y | tail -n +2 | while read LINE; do
+		NODE=`echo $LINE | cut -f3 -d:`
+		[ "$NODE" = "$PNN" ] || {
+			continue
+		} 
+		IP=`echo $LINE | cut -f2 -d:`
+
+		rm -f $CTDB_BASE/state/statd/ip/$IP.tar
+		tar cfP $CTDB_BASE/state/statd/ip/$IP.tar $CTDB_BASE/state/statd/ip/$IP
+
+		rm -f $CTDB_BASE/state/statd/ip/$IP.rec
+		ctdb pfetch ctdb.tdb statd-state:$IP $CTDB_BASE/state/statd/ip/$IP.rec 2>/dev/null
+		[ "$?" = "0" ] || {
+			# something went wrong,  try storing this data
+			echo No record. Store STATD state data for $IP
+			ctdb pstore ctdb.tdb statd-state:$IP $CTDB_BASE/state/statd/ip/$IP.tar 2>/dev/null
+			continue
+		}
+
+		cmp $CTDB_BASE/state/statd/ip/$IP.tar $CTDB_BASE/state/statd/ip/$IP.rec >/dev/null 2>/dev/null
+		[ "$?" = "0" ] || {
+			# something went wrong,  try storing this data
+			echo Updated record. Store STATD state data for $IP
+			ctdb pstore ctdb.tdb statd-state:$IP $CTDB_BASE/state/statd/ip/$IP.tar 2>/dev/null
+			continue
+		}
 	done
 	;;
+
+  updateremote)
+	# For all IPs we dont serve, pull the state from the database
+	PNN=`ctdb xpnn | sed -e "s/.*://"`
+	ctdb ip -Y | tail -n +2 | while read LINE; do
+		NODE=`echo $LINE | cut -f3 -d:`
+		[ "$NODE" = "$PNN" ] && {
+			continue
+		} 
+		IP=`echo $LINE | cut -f2 -d:`
+
+		rm -f $CTDB_BASE/state/statd/ip/$IP.rec
+		ctdb pfetch ctdb.tdb statd-state:$IP $CTDB_BASE/state/statd/ip/$IP.rec 2>/dev/null
+		[ "$?" = "0" ] || {
+			continue
+		}
+
+		rm -f $CTDB_BASE/state/statd/ip/$IP/*
+		tar xfP $CTDB_BASE/state/statd/ip/$IP.rec
+	done
+	;;
+
   notify)
 	# we must restart the lockmanager (on all nodes) so that we get
 	# a clusterwide grace period (so other clients dont take out
@@ -55,26 +113,19 @@ case "$1" in
 	#echo 0 > /proc/sys/net/ipv4/tcp_max_tw_buckets
 	#echo 0 > /proc/sys/net/ipv4/tcp_max_orphans
 
-	# rebuild the state directory for the local statd to use the correct
-	# state value and to initally send notifications to all clients
+	# Delete the notification list for statd, we dont want it to 
+	# ping any clients
 	rm -f /var/lib/nfs/statd/sm/*
 	rm -f /var/lib/nfs/statd/sm.bak/*
-	cat $STATD_SHARED_DIRECTORY/state >/var/lib/nfs/statd/state
-
 
 	# we must keep a monotonically increasing state variable for the entire
 	# cluster  so state always increases when ip addresses fail from one
 	# node to another
-	[ ! -f $STATD_SHARED_DIRECTORY/state ] && {
-		echo 1 | awk '{printf("%c%c%c%c", $0, $0/256, $0/256/256, $0/256/256/256);}' >$STATD_SHARED_DIRECTORY/state
-	}
-	# read current state
-	STATE=`od -t d4 $STATD_SHARED_DIRECTORY/state | head -1 | sed -e "s/^[0-9]*[^0-9]*//"`
-	# write current state+2 back to the state file
-	# the /2 *2 are to ensure that state is odd. state must be odd.
-	STATE=`expr $STATE "/" 2 "*" 2 "+" 3`
-	echo $STATE | awk '{printf("%c%c%c%c", $0, $0/256, $0/256/256, $0/256/256/256);}' >$STATD_SHARED_DIRECTORY/state
-	
+	# We use epoch and hope the nodes are close enough in clock.
+	# Even numbers mean service is shut down, odd numbers mean
+	# service is started.
+	STATE=`date +"%s"`
+	STATE=`expr "$STATE" "/" "2"`
 
 
 	# we must also let some time pass between stopping and restarting the
@@ -85,17 +136,6 @@ case "$1" in
 	startstop_nfslock stop > /dev/null 2>&1
 	sleep 2
 
-	# copy all monitored clients on this node to the local lockmanager
-	for f in `/bin/ls $CTDB_BASE/state/statd/ip/* 2>/dev/null`; do
-	    ip=`basename $f`
-	    [ -d $STATD_SHARED_DIRECTORY/$ip ] && [ -x /usr/bin/smnotify ] && {
-		for g in `/bin/ls $STATD_SHARED_DIRECTORY/$ip/* 2>/dev/null`; do
-			client=`basename $g`
-			touch /var/lib/nfs/statd/sm/$client
-		done
-	    }
-	done
-
 	# now start lockmanager again with the new state directory.
 	startstop_nfslock start > /dev/null 2>&1
 
@@ -127,23 +167,23 @@ case "$1" in
 	# Both 2a and 2b are commonly used in lockmanagers since they maximize
 	# probability that the client will accept the statd notify packet and
 	# not just ignore it.
-        for f in `/bin/ls $CTDB_BASE/state/statd/ip/* 2>/dev/null`; do
-	    ip=`basename $f`
-	    [ -d $STATD_SHARED_DIRECTORY/$ip ] && [ -x /usr/bin/smnotify ] && {
-		for g in `/bin/ls $STATD_SHARED_DIRECTORY/$ip/* 2>/dev/null`; do
-			client=`basename $g`
-#			/bin/rm -f $g
-			# send out notifications from the "correct" address
-			# (the same addresse as where the lock was taken out
-			# on)   some clients require that the source address
-			# matches where the lock was taken out.
-			# also send it both as a name that the client
-			# hopefully can resolve into the server ip and
-			# and also by specifying the raw ip address as name.
-			/usr/bin/smnotify --client=$client --ip=$ip --server=$ip --stateval=$STATE
-			/usr/bin/smnotify --client=$client --ip=$ip --server=$NFS_HOSTNAME --stateval=$STATE
+	# For all IPs we serve, collect info and push to the config database
+	PNN=`ctdb xpnn | sed -e "s/.*://"`
+	ctdb ip -Y | tail -n +2 | while read LINE; do
+		NODE=`echo $LINE | cut -f3 -d:`
+		[ "$NODE" = "$PNN" ] || {
+			continue
+		} 
+		IP=`echo $LINE | cut -f2 -d:`
+
+		ls $CTDB_BASE/state/statd/ip/$IP | while read CLIENT; do
+			rm $CTDB_BASE/state/statd/ip/$IP/$CLIENT
+			/usr/bin/smnotify --client=$CLIENT --ip=$IP --server=$ip --stateval=$STATE
+			/usr/bin/smnotify --client=$CLIENT --ip=$IP --server=$NFS_HOSTNAME --stateval=$STATE
+			STATE=`expr "$STATE" "+" "1"`
+			/usr/bin/smnotify --client=$CLIENT --ip=$IP --server=$ip --stateval=$STATE
+			/usr/bin/smnotify --client=$CLIENT --ip=$IP --server=$NFS_HOSTNAME --stateval=$STATE
 		done
-	    }
 	done
 	;;
 esac
diff --git a/include/ctdb_protocol.h b/include/ctdb_protocol.h
index 99765d4..df6e90c 100644
--- a/include/ctdb_protocol.h
+++ b/include/ctdb_protocol.h
@@ -183,6 +183,7 @@ enum ctdb_eventscript_call {
 	CTDB_EVENT_SHUTDOWN,		/* CTDB shutting down: no args. */
 	CTDB_EVENT_RELOAD,		/* magic */
 	CTDB_EVENT_UPDATE_IP,		/* IP updating: old interface, new interface, IP address, netmask bits. */
+	CTDB_EVENT_IPREALLOCATED,	/* when a takeover_run() completes */
 	CTDB_EVENT_MAX
 };
 
diff --git a/server/ctdb_takeover.c b/server/ctdb_takeover.c
index 44a68ee..06494c8 100644
--- a/server/ctdb_takeover.c
+++ b/server/ctdb_takeover.c
@@ -1215,10 +1215,10 @@ create_merged_ip_list(struct ctdb_context *ctdb)
  */
 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
 {
-	int i, num_healthy, retries;
+	int i, num_healthy, retries, num_ips;
 	struct ctdb_public_ip ip;
 	struct ctdb_public_ipv4 ipv4;
-	uint32_t mask;
+	uint32_t mask, *nodes;
 	struct ctdb_public_ip_list *all_ips, *tmp_ip;
 	int maxnode, maxnum=0, minnode, minnum=0, num;
 	TDB_DATA data;
@@ -1529,6 +1529,19 @@ finished:
 		return -1;
 	}
 
+	/* tell all nodes to update natwg */
+	/* send the flags update natgw on all connected nodes */
+	data.dptr  = discard_const("ipreallocated");
+	data.dsize = strlen((char *)data.dptr) + 1; 
+	nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
+	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_RUN_EVENTSCRIPTS,
+				      nodes, 0, TAKEOVER_TIMEOUT(),
+				      false, data,
+				      NULL, NULL,
+				      NULL) != 0) {
+		DEBUG(DEBUG_ERR, (__location__ " ctdb_control to updatenatgw failed\n"));
+	}
+
 	talloc_free(tmp_ctx);
 	return 0;
 }
diff --git a/server/eventscript.c b/server/eventscript.c
index 139f59f..ce2fd89 100644
--- a/server/eventscript.c
+++ b/server/eventscript.c
@@ -633,6 +633,7 @@ static bool check_options(enum ctdb_eventscript_call call, const char *options)
 	case CTDB_EVENT_STATUS:
 	case CTDB_EVENT_SHUTDOWN:
 	case CTDB_EVENT_RELOAD:
+	case CTDB_EVENT_IPREALLOCATED:
 		return count_words(options) == 0;
 
 	case CTDB_EVENT_TAKE_IP: /* interface, IP address, netmask bits. */
diff --git a/web/nfs.html b/web/nfs.html
index f5b626c..a4a6fb5 100644
--- a/web/nfs.html
+++ b/web/nfs.html
@@ -39,8 +39,6 @@ which causes problems on some clients.<br>
 
 This file should look something like :
 <pre>
-  NFS_TICKLE_SHARED_DIRECTORY=/gpfs0/nfs-tickles
-  STATD_SHARED_DIRECTORY=/gpfs0/nfs-state
   NFS_HOSTNAME=ctdb
   STATD_PORT=595
   STATD_OUTGOING_PORT=596
@@ -56,23 +54,6 @@ This file should look something like :
 You need to make sure that the lock manager runs on the same port on all nodes in the cluster since some clients will have "issues" and take very long to recover if the port suddenly changes.<br>
 599 above is only an example. You can run the lock manager on any available port as long as you use the same port on all nodes.<br><br>
 
-STATD_SHARED_DIRECTORY is the shared directory where statd and the statd-callout script expects that the state variables and lists of clients to notify are found.<br>
-
-This directory must be stored on the shared cluster filesystem so that all nodes can access the same data.<br><br>
-
-Don't forget to create this directory:
-<pre>
-  mkdir /gpfs0/nfs-state
-</pre>
-
-NFS_TICKLE_SHARED_DIRECTORY is where ctdb will store information about which
-clients have established tcp connections to the cluster. This information
-is used during failover of ip addresses.
-This allows the node that takes over an ip address to very quickly 'tickle' and reset any tcp connections for the ip address it took over.<br>
-The reason to do this is to improve the speed at which a client will detect
-that the tcp connection for NFS needs to be reestablished and to speed up
-recovery in the client.<br>
-
 NFS_HOSTNAME is the dns name for the ctdb cluster and which is used when clients map nfs shares. This name must be in DNS and resolve back into the public ip addresses of the cluster.<br>
 Always use the same name here as you use for the samba hostname.
 


-- 
CTDB repository