[SCM] CTDB repository - branch master updated - 2c292039a0139dcf5bb2bd964eb6f8902d094c50

Thu May 15 05:18:51 GMT 2008

The branch, master has been updated
       via  2c292039a0139dcf5bb2bd964eb6f8902d094c50 (commit)
       via  f0169ac8166a19d65ce254496e21d095aed87c2f (commit)
       via  3038d0b74895b51af4f85f2f304508ed16d245f4 (commit)
       via  d5fb4489f83f1f956b2c083cfad1861c5ddde283 (commit)
       via  788d38812d73729f11d12e9812b16092c0ae4123 (commit)
       via  e3cdb8f2be6a44ec877efcd75c7297edb008a80b (commit)
       via  b616961c16667328a81efa00a1c880efa4e791f1 (commit)
       via  3e6160e5d90a0661eb833b163c11be2267117d0b (commit)
       via  7b624add53c270f803177237c08e867f70bc85cc (commit)
      from  406a2a1e364cf71eb15e5aeec3b87c62f825da92 (commit)

http://gitweb.samba.org/?p=sahlberg/ctdb.git;a=shortlog;h=master


- Log -----------------------------------------------------------------
commit 2c292039a0139dcf5bb2bd964eb6f8902d094c50
Author: Ronnie Sahlberg <ronniesahlberg at gmail.com>
Date:   Thu May 15 15:01:01 2008 +1000

    Don't check whether the "recovered" event was successful or not.
    This event won't run unless the recovery mode is normal, but we
    cannot know what the recovery mode will be in the future on a remote node.
    Since we issue these commands to execute in the future at some other node,
    it is pointless to try to check whether they worked or not.
    
    In particular, a "failure to successfully run the eventscript" would then trigger a full new recovery, which is disruptive and expensive.

commit f0169ac8166a19d65ce254496e21d095aed87c2f
Author: Ronnie Sahlberg <ronniesahlberg at gmail.com>
Date:   Thu May 15 13:28:19 2008 +1000

    remove some unnecessary tests of whether ->vnn is null or not

commit 3038d0b74895b51af4f85f2f304508ed16d245f4
Author: Ronnie Sahlberg <ronniesahlberg at gmail.com>
Date:   Thu May 15 12:28:52 2008 +1000

    Update some debug statements. Don't say that recovery failed if the failed function was invoked from outside of recovery

commit d5fb4489f83f1f956b2c083cfad1861c5ddde283
Merge: 406a2a1e364cf71eb15e5aeec3b87c62f825da92 788d38812d73729f11d12e9812b16092c0ae4123
Author: Ronnie Sahlberg <ronniesahlberg at gmail.com>
Date:   Thu May 15 08:02:51 2008 +1000

    Merge git://git.samba.org/tridge/ctdb

commit 788d38812d73729f11d12e9812b16092c0ae4123
Author: Andrew Tridgell <tridge at samba.org>
Date:   Wed May 14 22:05:09 2008 +1000

    put the return in the right place
    
    We were refusing the 'startrecovery' event

commit e3cdb8f2be6a44ec877efcd75c7297edb008a80b
Author: Andrew Tridgell <tridge at samba.org>
Date:   Wed May 14 20:57:04 2008 +1000

    Fix the chicken and egg problem with ctdb/samba and a registry smb.conf
    
    This attempts to fix the problem of ctdb event scripts blocking due to
    attempted access to the ctdb databases during recovery. The changes are:
    
      - now only the 'shutdown' and 'startrecovery' events can be called
        with the databases locked in recovery. The event scripts must ensure
        that for these two events no database access is attempted
    
      - the recovered, takeip and releaseip events could previously be called
        inside a recovery. The code now ensures that this doesn't happen, delaying
        the events till after recovery has finished
    
      - the 50.samba event script now avoids using testparm unless it is really
        needed
    
    This needs extensive testing.

commit b616961c16667328a81efa00a1c880efa4e791f1
Merge: 3e6160e5d90a0661eb833b163c11be2267117d0b 13d3eb9a8bc7fad14fcd3e7e023c1336657424d6
Author: Andrew Tridgell <tridge at samba.org>
Date:   Wed May 14 14:37:11 2008 +1000

    Merge commit 'ronnie-ctdb/master' into tridge

commit 3e6160e5d90a0661eb833b163c11be2267117d0b
Author: Andrew Tridgell <tridge at samba.org>
Date:   Sat May 10 09:35:13 2008 +1000

    need to specify tree to git archive

commit 7b624add53c270f803177237c08e867f70bc85cc
Author: Andrew Tridgell <tridge at samba.org>
Date:   Sat May 10 09:24:51 2008 +1000

    use git archive to create tarball

-----------------------------------------------------------------------

Summary of changes:
 config/events.d/50.samba  |   12 +++--
 config/events.d/README    |    5 ++-
 packaging/RPM/makerpms.sh |   34 +------------
 server/ctdb_recoverd.c    |  119 +++++++++++++++++++++++---------------------
 server/eventscript.c      |   24 +++++++--
 5 files changed, 93 insertions(+), 101 deletions(-)


Changeset truncated at 500 lines:

diff --git a/config/events.d/50.samba b/config/events.d/50.samba
index c67dbda..784c059 100755
--- a/config/events.d/50.samba
+++ b/config/events.d/50.samba
@@ -16,9 +16,9 @@ shift
     SAMBA_CLEANUP_PERIOD=10
 }
 
-
-# autodetect use of winbind if not set in config file
-[ -z "$CTDB_MANAGES_WINBIND" ] && {
+# function to see if ctdb manages winbind
+check_ctdb_manages_winbind() {
+  [ -z "$CTDB_MANAGES_WINBIND" ] && {
     secmode=`testparm -s --parameter-name=security 2> /dev/null`
     case $secmode in
 	ADS|DOMAIN)
@@ -28,6 +28,7 @@ shift
 	    CTDB_MANAGES_WINBIND="no";
 	    ;;
     esac
+  }
 }
 
 ###########################
@@ -53,11 +54,12 @@ case $cmd in
 	}
 
 	# restart the winbind service
+	check_ctdb_manages_winbind
 	[ "$CTDB_MANAGES_WINBIND" = "yes" ] && {
 		service winbind stop > /dev/null 2>&1
 		killall -0 -q winbindd && {
 		    sleep 1
-		    # make absolutely sure winbindd is dead
+          	    # make absolutely sure winbindd is dead
 		    killall -q -9 winbindd
 		}
 		service winbind start
@@ -87,6 +89,7 @@ case $cmd in
 	service smb stop
 
 	# stop the winbind service
+	check_ctdb_manages_winbind
 	[ "$CTDB_MANAGES_WINBIND" = "yes" ] && {
 		service winbind stop
 	}
@@ -116,6 +119,7 @@ case $cmd in
 	ctdb_check_tcp_ports "Samba" $smb_ports
 
 	# check winbind is OK
+	check_ctdb_manages_winbind
 	[ "$CTDB_MANAGES_WINBIND" = "yes" ] && {
 		ctdb_check_command "winbind" "wbinfo -p"
 	}
diff --git a/config/events.d/README b/config/events.d/README
index bfa4372..a75da38 100644
--- a/config/events.d/README
+++ b/config/events.d/README
@@ -18,6 +18,9 @@ The eventscripts are called with varying number of arguments.
 The first argument is the "event" and the rest of the arguments depend
 on which event was triggered.
 
+All of the events except the 'shutdown' and 'startrecovery' events will be
+called with the ctdb daemon in NORMAL mode (ie. not in recovery)
+
 The events currently implemented are
 startup
 	This event does not take any additional arguments.
@@ -74,7 +77,7 @@ takeip
 
 	Before this event there will always be a 'startrecovery' event.
 
-	This event will always be followed by a 'recovered' event onse
+	This event will always be followed by a 'recovered' event once
 	all ipaddresses have been reassigned to new nodes and the ctdb database
 	has been recovered.
 	If multiple ip addresses are reassigned during recovery it is
diff --git a/packaging/RPM/makerpms.sh b/packaging/RPM/makerpms.sh
index 71c8db5..7b5012a 100755
--- a/packaging/RPM/makerpms.sh
+++ b/packaging/RPM/makerpms.sh
@@ -26,50 +26,19 @@ SRCDIR=`rpm --eval %_sourcedir`
 
 # At this point the SPECDIR and SRCDIR vaiables must have a value!
 
-USERID=`id -u`
-GRPID=`id -g`
 VERSION='1.0'
 REVISION=''
 SPECFILE="ctdb.spec"
-RPMVER=`rpm --version | awk '{print $3}'`
 RPMBUILD="rpmbuild"
 
-##
-## Check the RPM version (paranoid)
-##
-case $RPMVER in
-    4*)
-       echo "Supported RPM version [$RPMVER]"
-       ;;
-    *)
-       echo "Unknown RPM version: `rpm --version`"
-       exit 1
-       ;;
-esac
-
-if [ -f Makefile ]; then 
-	make distclean
-fi
-
-pushd .
-BASEDIR=`basename $PWD`
-cd ..
-chown -R ${USERID}.${GRPID} $BASEDIR
-rm -f ctdb-${VERSION}
-ln -s $BASEDIR ctdb-${VERSION} || exit 1
-REMOVE_LN=$PWD/ctdb-$VERSION
-
 echo -n "Creating ctdb-${VERSION}.tar.gz ... "
-tar --exclude=.bzr --exclude=.git --exclude .bzrignore --exclude="*~" --exclude=configure --exclude="test.db*" --exclude="#*" --exclude="push*.sh" --exclude="publish*.sh" -cf - ctdb-${VERSION}/. | gzip -9 --rsyncable > ${SRCDIR}/ctdb-${VERSION}.tar.gz
+git archive --prefix=ctdb-${VERSION}/ HEAD | gzip -9 --rsyncable > ${SRCDIR}/ctdb-${VERSION}.tar.gz
 echo "Done."
 if [ $? -ne 0 ]; then
         echo "Build failed!"
-	[ ${REMOVE_LN} ] && rm $REMOVE_LN
         exit 1
 fi
 
-popd
-
 
 ##
 ## copy additional source files
@@ -84,6 +53,5 @@ cd ${SPECDIR}
 ${RPMBUILD} -ba --clean --rmsource $EXTRA_OPTIONS $SPECFILE || exit 1
 
 echo "$(basename $0): Done."
-[ ${REMOVE_LN} ] && /bin/rm -f $REMOVE_LN
 
 exit 0
diff --git a/server/ctdb_recoverd.c b/server/ctdb_recoverd.c
index 7aca7cb..9a33819 100644
--- a/server/ctdb_recoverd.c
+++ b/server/ctdb_recoverd.c
@@ -203,7 +203,7 @@ enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEED
 /*
   run the "recovered" eventscript on all nodes
  */
-static int run_recovered_eventscript(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
+static int run_recovered_eventscript(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, const char *caller)
 {
 	TALLOC_CTX *tmp_ctx;
 
@@ -213,7 +213,8 @@ static int run_recovered_eventscript(struct ctdb_context *ctdb, struct ctdb_node
 	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
 			list_of_active_nodes(ctdb, nodemap, tmp_ctx, true),
 			CONTROL_TIMEOUT(), false, tdb_null, NULL) != 0) {
-		DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event. Recovery failed.\n"));
+		DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event when called from %s\n", caller));
+
 		talloc_free(tmp_ctx);
 		return -1;
 	}
@@ -247,7 +248,8 @@ static int run_startrecovery_eventscript(struct ctdb_context *ctdb, struct ctdb_
 static void async_getcap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata)
 {
 	if ( (outdata.dsize != sizeof(uint32_t)) || (outdata.dptr == NULL) ) {
-		DEBUG(DEBUG_ERR, (__location__ " Invalid lenght/pointer for getcap callback : %d %p\n", outdata.dsize, outdata.dptr));
+		DEBUG(DEBUG_ERR, (__location__ " Invalid lenght/pointer for getcap callback : %u %p\n", 
+				  (unsigned)outdata.dsize, outdata.dptr));
 		return;
 	}
 	ctdb->nodes[node_pnn]->capabilities = *((uint32_t *)outdata.dptr);
@@ -1451,6 +1453,15 @@ static int do_recovery(struct ctdb_recoverd *rec,
 	
 	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
 
+	/* disable recovery mode */
+	ret = set_recovery_mode(ctdb, nodemap, CTDB_RECOVERY_NORMAL);
+	if (ret!=0) {
+		DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to normal on cluster\n"));
+		return -1;
+	}
+
+	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - disabled recovery mode\n"));
+
 	/*
 	  tell nodes to takeover their public IPs
 	 */
@@ -1463,23 +1474,14 @@ static int do_recovery(struct ctdb_recoverd *rec,
 	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - takeip finished\n"));
 
 	/* execute the "recovered" event script on all nodes */
-	ret = run_recovered_eventscript(ctdb, nodemap);
+	ret = run_recovered_eventscript(ctdb, nodemap, "do_recovery");
 	if (ret!=0) {
-		DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster\n"));
+		DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
 		return -1;
 	}
 
 	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - finished the recovered event\n"));
 
-	/* disable recovery mode */
-	ret = set_recovery_mode(ctdb, nodemap, CTDB_RECOVERY_NORMAL);
-	if (ret!=0) {
-		DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to normal on cluster\n"));
-		return -1;
-	}
-
-	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - disabled recovery mode\n"));
-
 	/* send a message to all clients telling them that the cluster 
 	   has been reconfigured */
 	ctdb_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_RECONFIGURE, tdb_null);
@@ -1873,8 +1875,7 @@ static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid,
 	
 	if (ret == 0 &&
 	    ctdb->recovery_master == ctdb->pnn &&
-	    ctdb->recovery_mode == CTDB_RECOVERY_NORMAL &&
-	    ctdb->vnn) {
+	    ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
 		/* Only do the takeover run if the perm disabled or unhealthy
 		   flags changed since these will cause an ip failover but not
 		   a recovery.
@@ -2450,46 +2451,44 @@ again:
 	}
 
 	/* verify that the public ip address allocation is consistent */
-	if (ctdb->vnn != NULL) {
-		ret = ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ips);
-		if (ret != 0) {
-			DEBUG(DEBUG_ERR, ("Unable to get public ips from node %u\n", i));
-			goto again;
-		}
-		for (j=0; j<ips->num; j++) {
-			/* verify that we have the ip addresses we should have
-			   and we dont have ones we shouldnt have.
-			   if we find an inconsistency we set recmode to
-			   active on the local node and wait for the recmaster
-			   to do a full blown recovery
-			*/
-			if (ips->ips[j].pnn == pnn) {
-				if (!ctdb_sys_have_ip(ips->ips[j].sin)) {
-					DEBUG(DEBUG_CRIT,("Public address '%s' is missing and we should serve this ip\n", inet_ntoa(ips->ips[j].sin.sin_addr)));
-					ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
-					if (ret != 0) {
-						DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to public ip address mismatches\n"));
-						goto again;
-					}
-					ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
-					if (ret != 0) {
-						DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to public ip address mismatches\n"));
-						goto again;
-					}
+	ret = ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ips);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR, ("Unable to get public ips from node %u\n", i));
+		goto again;
+	}
+	for (j=0; j<ips->num; j++) {
+		/* verify that we have the ip addresses we should have
+		   and we dont have ones we shouldnt have.
+		   if we find an inconsistency we set recmode to
+		   active on the local node and wait for the recmaster
+		   to do a full blown recovery
+		*/
+		if (ips->ips[j].pnn == pnn) {
+			if (!ctdb_sys_have_ip(ips->ips[j].sin)) {
+				DEBUG(DEBUG_CRIT,("Public address '%s' is missing and we should serve this ip\n", inet_ntoa(ips->ips[j].sin.sin_addr)));
+				ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
+				if (ret != 0) {
+					DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to public ip address mismatches\n"));
+					goto again;
 				}
-			} else {
-				if (ctdb_sys_have_ip(ips->ips[j].sin)) {
-					DEBUG(DEBUG_CRIT,("We are still serving a public address '%s' that we should not be serving.\n", inet_ntoa(ips->ips[j].sin.sin_addr)));
-					ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
-					if (ret != 0) {
-						DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to public ip address mismatches\n"));
-						goto again;
-					}
-					ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
-					if (ret != 0) {
-						DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to public ip address mismatches\n"));
-						goto again;
-					}
+				ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
+				if (ret != 0) {
+					DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to public ip address mismatches\n"));
+					goto again;
+				}
+			}
+		} else {
+			if (ctdb_sys_have_ip(ips->ips[j].sin)) {
+				DEBUG(DEBUG_CRIT,("We are still serving a public address '%s' that we should not be serving.\n", inet_ntoa(ips->ips[j].sin.sin_addr)));
+				ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
+				if (ret != 0) {
+					DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to public ip address mismatches\n"));
+					goto again;
+				}
+				ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
+				if (ret != 0) {
+					DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to public ip address mismatches\n"));
+					goto again;
 				}
 			}
 		}
@@ -2748,12 +2747,18 @@ again:
 		}
 
 		/* execute the "recovered" event script on all nodes */
-		ret = run_recovered_eventscript(ctdb, nodemap);
+		ret = run_recovered_eventscript(ctdb, nodemap, "monitor_cluster");
+#if 0
+// we cant check whether the event completed successfully
+// since this script WILL fail if the node is in recovery mode
+// and if that race happens, the code here would just cause a second
+// cascading recovery.
 		if (ret!=0) {
-			DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster\n"));
+			DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Update of public ips failed.\n"));
 			do_recovery(rec, mem_ctx, pnn, nodemap, 
 				    vnnmap, ctdb->pnn);
 		}
+#endif
 	}
 
 	goto again;
diff --git a/server/eventscript.c b/server/eventscript.c
index f6afd47..0a60901 100644
--- a/server/eventscript.c
+++ b/server/eventscript.c
@@ -52,7 +52,6 @@ static int ctdb_event_script_v(struct ctdb_context *ctdb, const char *fmt, va_li
 {
 	char *options, *cmdstr;
 	int ret;
-	va_list ap2;
 	struct stat st;
 	TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
 	trbt_tree_t *tree;
@@ -60,6 +59,24 @@ static int ctdb_event_script_v(struct ctdb_context *ctdb, const char *fmt, va_li
 	struct dirent *de;
 	char *script;
 
+	options  = talloc_vasprintf(tmp_ctx, fmt, ap);
+	CTDB_NO_MEMORY(ctdb, options);
+
+	if (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL) {
+		/* we guarantee that only some specifically allowed event scripts are run
+		   while in recovery */
+		const char *allowed_scripts[] = {"startrecovery", "shutdown" };
+		int i;
+		for (i=0;i<ARRAY_SIZE(allowed_scripts);i++) {
+			if (strcmp(options, allowed_scripts[i]) == 0) break;
+		}
+		if (i == ARRAY_SIZE(allowed_scripts)) {
+			DEBUG(0,("Refusing to run event scripts with option '%s' while in recovery\n",
+				 options));
+			return -1;
+		}
+	}
+
 	if (setpgid(0,0) != 0) {
 		DEBUG(DEBUG_ERR,("Failed to create process group for event scripts - %s\n",
 			 strerror(errno)));
@@ -146,11 +163,6 @@ static int ctdb_event_script_v(struct ctdb_context *ctdb, const char *fmt, va_li
 	   them
 	 */
 	while ((script=trbt_findfirstarray32(tree, 1)) != NULL) {
-		va_copy(ap2, ap);
-		options  = talloc_vasprintf(tmp_ctx, fmt, ap2);
-		va_end(ap2);
-		CTDB_NO_MEMORY(ctdb, options);
-
 		cmdstr = talloc_asprintf(tmp_ctx, "%s/%s %s", 
 				ctdb->event_script_dir,
 				script, options);


-- 
CTDB repository