[SCM] CTDB repository - branch master updated -
2c292039a0139dcf5bb2bd964eb6f8902d094c50
Ronnie Sahlberg
sahlberg at samba.org
Thu May 15 05:18:51 GMT 2008
The branch, master has been updated
via 2c292039a0139dcf5bb2bd964eb6f8902d094c50 (commit)
via f0169ac8166a19d65ce254496e21d095aed87c2f (commit)
via 3038d0b74895b51af4f85f2f304508ed16d245f4 (commit)
via d5fb4489f83f1f956b2c083cfad1861c5ddde283 (commit)
via 788d38812d73729f11d12e9812b16092c0ae4123 (commit)
via e3cdb8f2be6a44ec877efcd75c7297edb008a80b (commit)
via b616961c16667328a81efa00a1c880efa4e791f1 (commit)
via 3e6160e5d90a0661eb833b163c11be2267117d0b (commit)
via 7b624add53c270f803177237c08e867f70bc85cc (commit)
from 406a2a1e364cf71eb15e5aeec3b87c62f825da92 (commit)
http://gitweb.samba.org/?p=sahlberg/ctdb.git;a=shortlog;h=master
- Log -----------------------------------------------------------------
commit 2c292039a0139dcf5bb2bd964eb6f8902d094c50
Author: Ronnie Sahlberg <ronniesahlberg at gmail.com>
Date: Thu May 15 15:01:01 2008 +1000
Don't check whether the "recovered" event was successful or not,
since this event won't run unless the recovery mode is normal, and we
cannot know what the recovery mode will be in the future on a remote node.
Since we issue these commands to be executed at some point in the future on some other node,
it is pointless to try to check whether they worked or not,
in particular since "failure to successfully run the eventscript" would then trigger a full new recovery, which is disruptive and expensive.
commit f0169ac8166a19d65ce254496e21d095aed87c2f
Author: Ronnie Sahlberg <ronniesahlberg at gmail.com>
Date: Thu May 15 13:28:19 2008 +1000
remove some unnecessary tests of whether ->vnn is null or not
commit 3038d0b74895b51af4f85f2f304508ed16d245f4
Author: Ronnie Sahlberg <ronniesahlberg at gmail.com>
Date: Thu May 15 12:28:52 2008 +1000
Update some debug statements. Don't say that recovery failed if the failed function was invoked from outside of recovery
commit d5fb4489f83f1f956b2c083cfad1861c5ddde283
Merge: 406a2a1e364cf71eb15e5aeec3b87c62f825da92 788d38812d73729f11d12e9812b16092c0ae4123
Author: Ronnie Sahlberg <ronniesahlberg at gmail.com>
Date: Thu May 15 08:02:51 2008 +1000
Merge git://git.samba.org/tridge/ctdb
commit 788d38812d73729f11d12e9812b16092c0ae4123
Author: Andrew Tridgell <tridge at samba.org>
Date: Wed May 14 22:05:09 2008 +1000
put the return in the right place
We were refusing the 'startrecovery' event
commit e3cdb8f2be6a44ec877efcd75c7297edb008a80b
Author: Andrew Tridgell <tridge at samba.org>
Date: Wed May 14 20:57:04 2008 +1000
Fix the chicken and egg problem with ctdb/samba and a registry smb.conf
This attempts to fix the problem of ctdb event scripts blocking due to
attempted access to the ctdb databases during recovery. The changes are:
- now only the 'shutdown' and 'startrecovery' events can be called
with the databases locked in recovery. The event scripts must ensure
that for these two events no database access is attempted
- the recovered, takeip and releaseip events could previously be called
inside a recovery. The code now ensures that this doesn't happen, delaying
the events till after recovery has finished
- the 50.samba event script now avoids using testparm unless it is really
needed
This needs extensive testing.
commit b616961c16667328a81efa00a1c880efa4e791f1
Merge: 3e6160e5d90a0661eb833b163c11be2267117d0b 13d3eb9a8bc7fad14fcd3e7e023c1336657424d6
Author: Andrew Tridgell <tridge at samba.org>
Date: Wed May 14 14:37:11 2008 +1000
Merge commit 'ronnie-ctdb/master' into tridge
commit 3e6160e5d90a0661eb833b163c11be2267117d0b
Author: Andrew Tridgell <tridge at samba.org>
Date: Sat May 10 09:35:13 2008 +1000
need to specify tree to git archive
commit 7b624add53c270f803177237c08e867f70bc85cc
Author: Andrew Tridgell <tridge at samba.org>
Date: Sat May 10 09:24:51 2008 +1000
use git archive to create tarball
-----------------------------------------------------------------------
Summary of changes:
config/events.d/50.samba | 12 +++--
config/events.d/README | 5 ++-
packaging/RPM/makerpms.sh | 34 +------------
server/ctdb_recoverd.c | 119 +++++++++++++++++++++++---------------------
server/eventscript.c | 24 +++++++--
5 files changed, 93 insertions(+), 101 deletions(-)
Changeset truncated at 500 lines:
diff --git a/config/events.d/50.samba b/config/events.d/50.samba
index c67dbda..784c059 100755
--- a/config/events.d/50.samba
+++ b/config/events.d/50.samba
@@ -16,9 +16,9 @@ shift
SAMBA_CLEANUP_PERIOD=10
}
-
-# autodetect use of winbind if not set in config file
-[ -z "$CTDB_MANAGES_WINBIND" ] && {
+# function to see if ctdb manages winbind
+check_ctdb_manages_winbind() {
+ [ -z "$CTDB_MANAGES_WINBIND" ] && {
secmode=`testparm -s --parameter-name=security 2> /dev/null`
case $secmode in
ADS|DOMAIN)
@@ -28,6 +28,7 @@ shift
CTDB_MANAGES_WINBIND="no";
;;
esac
+ }
}
###########################
@@ -53,11 +54,12 @@ case $cmd in
}
# restart the winbind service
+ check_ctdb_manages_winbind
[ "$CTDB_MANAGES_WINBIND" = "yes" ] && {
service winbind stop > /dev/null 2>&1
killall -0 -q winbindd && {
sleep 1
- # make absolutely sure winbindd is dead
+ # make absolutely sure winbindd is dead
killall -q -9 winbindd
}
service winbind start
@@ -87,6 +89,7 @@ case $cmd in
service smb stop
# stop the winbind service
+ check_ctdb_manages_winbind
[ "$CTDB_MANAGES_WINBIND" = "yes" ] && {
service winbind stop
}
@@ -116,6 +119,7 @@ case $cmd in
ctdb_check_tcp_ports "Samba" $smb_ports
# check winbind is OK
+ check_ctdb_manages_winbind
[ "$CTDB_MANAGES_WINBIND" = "yes" ] && {
ctdb_check_command "winbind" "wbinfo -p"
}
diff --git a/config/events.d/README b/config/events.d/README
index bfa4372..a75da38 100644
--- a/config/events.d/README
+++ b/config/events.d/README
@@ -18,6 +18,9 @@ The eventscripts are called with varying number of arguments.
The first argument is the "event" and the rest of the arguments depend
on which event was triggered.
+All of the events except the 'shutdown' and 'startrecovery' events will be
+called with the ctdb daemon in NORMAL mode (ie. not in recovery)
+
The events currently implemented are
startup
This event does not take any additional arguments.
@@ -74,7 +77,7 @@ takeip
Before this event there will always be a 'startrecovery' event.
- This event will always be followed by a 'recovered' event onse
+ This event will always be followed by a 'recovered' event once
all ipaddresses have been reassigned to new nodes and the ctdb database
has been recovered.
If multiple ip addresses are reassigned during recovery it is
diff --git a/packaging/RPM/makerpms.sh b/packaging/RPM/makerpms.sh
index 71c8db5..7b5012a 100755
--- a/packaging/RPM/makerpms.sh
+++ b/packaging/RPM/makerpms.sh
@@ -26,50 +26,19 @@ SRCDIR=`rpm --eval %_sourcedir`
# At this point the SPECDIR and SRCDIR vaiables must have a value!
-USERID=`id -u`
-GRPID=`id -g`
VERSION='1.0'
REVISION=''
SPECFILE="ctdb.spec"
-RPMVER=`rpm --version | awk '{print $3}'`
RPMBUILD="rpmbuild"
-##
-## Check the RPM version (paranoid)
-##
-case $RPMVER in
- 4*)
- echo "Supported RPM version [$RPMVER]"
- ;;
- *)
- echo "Unknown RPM version: `rpm --version`"
- exit 1
- ;;
-esac
-
-if [ -f Makefile ]; then
- make distclean
-fi
-
-pushd .
-BASEDIR=`basename $PWD`
-cd ..
-chown -R ${USERID}.${GRPID} $BASEDIR
-rm -f ctdb-${VERSION}
-ln -s $BASEDIR ctdb-${VERSION} || exit 1
-REMOVE_LN=$PWD/ctdb-$VERSION
-
echo -n "Creating ctdb-${VERSION}.tar.gz ... "
-tar --exclude=.bzr --exclude=.git --exclude .bzrignore --exclude="*~" --exclude=configure --exclude="test.db*" --exclude="#*" --exclude="push*.sh" --exclude="publish*.sh" -cf - ctdb-${VERSION}/. | gzip -9 --rsyncable > ${SRCDIR}/ctdb-${VERSION}.tar.gz
+git archive --prefix=ctdb-${VERSION}/ HEAD | gzip -9 --rsyncable > ${SRCDIR}/ctdb-${VERSION}.tar.gz
echo "Done."
if [ $? -ne 0 ]; then
echo "Build failed!"
- [ ${REMOVE_LN} ] && rm $REMOVE_LN
exit 1
fi
-popd
-
##
## copy additional source files
@@ -84,6 +53,5 @@ cd ${SPECDIR}
${RPMBUILD} -ba --clean --rmsource $EXTRA_OPTIONS $SPECFILE || exit 1
echo "$(basename $0): Done."
-[ ${REMOVE_LN} ] && /bin/rm -f $REMOVE_LN
exit 0
diff --git a/server/ctdb_recoverd.c b/server/ctdb_recoverd.c
index 7aca7cb..9a33819 100644
--- a/server/ctdb_recoverd.c
+++ b/server/ctdb_recoverd.c
@@ -203,7 +203,7 @@ enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEED
/*
run the "recovered" eventscript on all nodes
*/
-static int run_recovered_eventscript(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
+static int run_recovered_eventscript(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, const char *caller)
{
TALLOC_CTX *tmp_ctx;
@@ -213,7 +213,8 @@ static int run_recovered_eventscript(struct ctdb_context *ctdb, struct ctdb_node
if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
list_of_active_nodes(ctdb, nodemap, tmp_ctx, true),
CONTROL_TIMEOUT(), false, tdb_null, NULL) != 0) {
- DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event. Recovery failed.\n"));
+ DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event when called from %s\n", caller));
+
talloc_free(tmp_ctx);
return -1;
}
@@ -247,7 +248,8 @@ static int run_startrecovery_eventscript(struct ctdb_context *ctdb, struct ctdb_
static void async_getcap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata)
{
if ( (outdata.dsize != sizeof(uint32_t)) || (outdata.dptr == NULL) ) {
- DEBUG(DEBUG_ERR, (__location__ " Invalid lenght/pointer for getcap callback : %d %p\n", outdata.dsize, outdata.dptr));
+ DEBUG(DEBUG_ERR, (__location__ " Invalid lenght/pointer for getcap callback : %u %p\n",
+ (unsigned)outdata.dsize, outdata.dptr));
return;
}
ctdb->nodes[node_pnn]->capabilities = *((uint32_t *)outdata.dptr);
@@ -1451,6 +1453,15 @@ static int do_recovery(struct ctdb_recoverd *rec,
DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
+ /* disable recovery mode */
+ ret = set_recovery_mode(ctdb, nodemap, CTDB_RECOVERY_NORMAL);
+ if (ret!=0) {
+ DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to normal on cluster\n"));
+ return -1;
+ }
+
+ DEBUG(DEBUG_NOTICE, (__location__ " Recovery - disabled recovery mode\n"));
+
/*
tell nodes to takeover their public IPs
*/
@@ -1463,23 +1474,14 @@ static int do_recovery(struct ctdb_recoverd *rec,
DEBUG(DEBUG_NOTICE, (__location__ " Recovery - takeip finished\n"));
/* execute the "recovered" event script on all nodes */
- ret = run_recovered_eventscript(ctdb, nodemap);
+ ret = run_recovered_eventscript(ctdb, nodemap, "do_recovery");
if (ret!=0) {
- DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster\n"));
+ DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
return -1;
}
DEBUG(DEBUG_NOTICE, (__location__ " Recovery - finished the recovered event\n"));
- /* disable recovery mode */
- ret = set_recovery_mode(ctdb, nodemap, CTDB_RECOVERY_NORMAL);
- if (ret!=0) {
- DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to normal on cluster\n"));
- return -1;
- }
-
- DEBUG(DEBUG_NOTICE, (__location__ " Recovery - disabled recovery mode\n"));
-
/* send a message to all clients telling them that the cluster
has been reconfigured */
ctdb_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_RECONFIGURE, tdb_null);
@@ -1873,8 +1875,7 @@ static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid,
if (ret == 0 &&
ctdb->recovery_master == ctdb->pnn &&
- ctdb->recovery_mode == CTDB_RECOVERY_NORMAL &&
- ctdb->vnn) {
+ ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
/* Only do the takeover run if the perm disabled or unhealthy
flags changed since these will cause an ip failover but not
a recovery.
@@ -2450,46 +2451,44 @@ again:
}
/* verify that the public ip address allocation is consistent */
- if (ctdb->vnn != NULL) {
- ret = ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ips);
- if (ret != 0) {
- DEBUG(DEBUG_ERR, ("Unable to get public ips from node %u\n", i));
- goto again;
- }
- for (j=0; j<ips->num; j++) {
- /* verify that we have the ip addresses we should have
- and we dont have ones we shouldnt have.
- if we find an inconsistency we set recmode to
- active on the local node and wait for the recmaster
- to do a full blown recovery
- */
- if (ips->ips[j].pnn == pnn) {
- if (!ctdb_sys_have_ip(ips->ips[j].sin)) {
- DEBUG(DEBUG_CRIT,("Public address '%s' is missing and we should serve this ip\n", inet_ntoa(ips->ips[j].sin.sin_addr)));
- ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
- if (ret != 0) {
- DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to public ip address mismatches\n"));
- goto again;
- }
- ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
- if (ret != 0) {
- DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to public ip address mismatches\n"));
- goto again;
- }
+ ret = ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ips);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR, ("Unable to get public ips from node %u\n", i));
+ goto again;
+ }
+ for (j=0; j<ips->num; j++) {
+ /* verify that we have the ip addresses we should have
+ and we dont have ones we shouldnt have.
+ if we find an inconsistency we set recmode to
+ active on the local node and wait for the recmaster
+ to do a full blown recovery
+ */
+ if (ips->ips[j].pnn == pnn) {
+ if (!ctdb_sys_have_ip(ips->ips[j].sin)) {
+ DEBUG(DEBUG_CRIT,("Public address '%s' is missing and we should serve this ip\n", inet_ntoa(ips->ips[j].sin.sin_addr)));
+ ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to public ip address mismatches\n"));
+ goto again;
}
- } else {
- if (ctdb_sys_have_ip(ips->ips[j].sin)) {
- DEBUG(DEBUG_CRIT,("We are still serving a public address '%s' that we should not be serving.\n", inet_ntoa(ips->ips[j].sin.sin_addr)));
- ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
- if (ret != 0) {
- DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to public ip address mismatches\n"));
- goto again;
- }
- ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
- if (ret != 0) {
- DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to public ip address mismatches\n"));
- goto again;
- }
+ ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to public ip address mismatches\n"));
+ goto again;
+ }
+ }
+ } else {
+ if (ctdb_sys_have_ip(ips->ips[j].sin)) {
+ DEBUG(DEBUG_CRIT,("We are still serving a public address '%s' that we should not be serving.\n", inet_ntoa(ips->ips[j].sin.sin_addr)));
+ ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to public ip address mismatches\n"));
+ goto again;
+ }
+ ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to public ip address mismatches\n"));
+ goto again;
}
}
}
@@ -2748,12 +2747,18 @@ again:
}
/* execute the "recovered" event script on all nodes */
- ret = run_recovered_eventscript(ctdb, nodemap);
+ ret = run_recovered_eventscript(ctdb, nodemap, "monitor_cluster");
+#if 0
+// we cant check whether the event completed successfully
+// since this script WILL fail if the node is in recovery mode
+// and if that race happens, the code here would just cause a second
+// cascading recovery.
if (ret!=0) {
- DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster\n"));
+ DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Update of public ips failed.\n"));
do_recovery(rec, mem_ctx, pnn, nodemap,
vnnmap, ctdb->pnn);
}
+#endif
}
goto again;
diff --git a/server/eventscript.c b/server/eventscript.c
index f6afd47..0a60901 100644
--- a/server/eventscript.c
+++ b/server/eventscript.c
@@ -52,7 +52,6 @@ static int ctdb_event_script_v(struct ctdb_context *ctdb, const char *fmt, va_li
{
char *options, *cmdstr;
int ret;
- va_list ap2;
struct stat st;
TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
trbt_tree_t *tree;
@@ -60,6 +59,24 @@ static int ctdb_event_script_v(struct ctdb_context *ctdb, const char *fmt, va_li
struct dirent *de;
char *script;
+ options = talloc_vasprintf(tmp_ctx, fmt, ap);
+ CTDB_NO_MEMORY(ctdb, options);
+
+ if (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL) {
+ /* we guarantee that only some specifically allowed event scripts are run
+ while in recovery */
+ const char *allowed_scripts[] = {"startrecovery", "shutdown" };
+ int i;
+ for (i=0;i<ARRAY_SIZE(allowed_scripts);i++) {
+ if (strcmp(options, allowed_scripts[i]) == 0) break;
+ }
+ if (i == ARRAY_SIZE(allowed_scripts)) {
+ DEBUG(0,("Refusing to run event scripts with option '%s' while in recovery\n",
+ options));
+ return -1;
+ }
+ }
+
if (setpgid(0,0) != 0) {
DEBUG(DEBUG_ERR,("Failed to create process group for event scripts - %s\n",
strerror(errno)));
@@ -146,11 +163,6 @@ static int ctdb_event_script_v(struct ctdb_context *ctdb, const char *fmt, va_li
them
*/
while ((script=trbt_findfirstarray32(tree, 1)) != NULL) {
- va_copy(ap2, ap);
- options = talloc_vasprintf(tmp_ctx, fmt, ap2);
- va_end(ap2);
- CTDB_NO_MEMORY(ctdb, options);
-
cmdstr = talloc_asprintf(tmp_ctx, "%s/%s %s",
ctdb->event_script_dir,
script, options);
--
CTDB repository
More information about the samba-cvs
mailing list