[SCM] Samba Shared Repository - branch master updated

Tue Feb 18 20:08:04 MST 2014

The branch, master has been updated
       via  c743fc4 ctdb-scripts: Update a misleading comment
       via  d4298e9 ctdb-tests: Improvements to tests INSTALL script
       via  24b734f ctdb-recoverd: LCP2 cleanups
       via  9e5ef44 ctdb-recoverd: Optimise check for rebalance candidates in LCP2
       via  2532149 ctdb-scripts: Enhancements to hung script debugging
      from  79e2725 s3-auth: Pass mem_ctx to do_map_to_guest_server_info().

http://gitweb.samba.org/?p=samba.git;a=shortlog;h=master


- Log -----------------------------------------------------------------
commit c743fc434545236a7f733b8339f6f8f88fd864c8
Author: Martin Schwenke <martin at meltin.net>
Date:   Fri Feb 14 16:59:08 2014 +1100

    ctdb-scripts: Update a misleading comment
    
    This comment was true when 50.samba was spaghetti because it tried to
    automatically manage both smbd (and nmbd) and winbind.  It isn't true
    anymore.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>
    Reviewed-by: Amitay Isaacs <amitay at gmail.com>
    
    Autobuild-User(master): Amitay Isaacs <amitay at samba.org>
    Autobuild-Date(master): Wed Feb 19 04:07:12 CET 2014 on sn-devel-104

commit d4298e9e9dd4cc2a9a79f895e00477e802b2e052
Author: Martin Schwenke <martin at meltin.net>
Date:   Mon Feb 17 11:32:15 2014 +1100

    ctdb-tests: Improvements to tests INSTALL script
    
    * Should stop on 1st error
    
    * Fix up value of CTDB_TESTS_ARE_INSTALLED
    
    * Improve fixing of broken symlinks in INSTALL
    
      This is all of the links in tests/eventscript/etc-ctdb/ so no need
      to list them.  Just find and fix them.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>
    Reviewed-by: Amitay Isaacs <amitay at gmail.com>

commit 24b734f084de36160d065dc639100eab3b186f6c
Author: Martin Schwenke <martin at meltin.net>
Date:   Fri Feb 7 17:32:12 2014 +1100

    ctdb-recoverd: LCP2 cleanups
    
    * Remove unnecessary candimbl parameter.
    
      This parameter can be cheaply calculated in
      lcp2_failback_candidate().  The compiler will probably do an
      excellent job optimising it.  :-)
    
    * Clarify a debug statement
    
      This is much clearer than doing a complex recalculation of a known
      value.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>
    Reviewed-by: Amitay Isaacs <amitay at gmail.com>

commit 9e5ef44f32fad6606bd95e619f0720a72344e441
Author: Martin Schwenke <martin at meltin.net>
Date:   Fri Feb 7 14:28:54 2014 +1100

    ctdb-recoverd: Optimise check for rebalance candidates in LCP2
    
    Currently this can be checked many times.  However, there's no point
    calling the rebalance/failback code at all if there are no rebalance
    candidates.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>
    Reviewed-by: Amitay Isaacs <amitay at gmail.com>

commit 2532149f8f9bbe6d3c8f5ac6e5e4bc2ad1681e27
Author: Martin Schwenke <martin at meltin.net>
Date:   Fri Feb 7 17:37:00 2014 +1100

    ctdb-scripts: Enhancements to hung script debugging
    
    * Add stack dumps for "interesting" processes.  These sometimes get
      stuck, so try to print stack traces for them if they appear in the
      pstree output.
    
    * Add new configuration variables CTDB_DEBUG_HUNG_SCRIPT_LOGFILE and
      CTDB_DEBUG_HUNG_SCRIPT_STACKPAT.  These are primarily for testing
      but the latter may be useful for live debugging.
    
    * Load CTDB configuration so that above configuration variables can be
      set/changed without restarting ctdbd.
    
    Add a test that tries to ensure that all of this is working.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>
    Reviewed-by: Amitay Isaacs <amitay at gmail.com>

-----------------------------------------------------------------------

Summary of changes:
 ctdb/config/debug-hung-script.sh           |   34 ++++++++++-
 ctdb/config/functions                      |    9 +---
 ctdb/doc/ctdbd.conf.5.xml                  |   30 +++++++++
 ctdb/server/ctdb_takeover.c                |   47 +++++++-------
 ctdb/tests/INSTALL                         |    9 ++-
 ctdb/tests/complex/90_debug_hung_script.sh |   91 ++++++++++++++++++++++++++++
 6 files changed, 185 insertions(+), 35 deletions(-)
 create mode 100755 ctdb/tests/complex/90_debug_hung_script.sh


Changeset truncated at 500 lines:

diff --git a/ctdb/config/debug-hung-script.sh b/ctdb/config/debug-hung-script.sh
index 1984242..63d695f 100755
--- a/ctdb/config/debug-hung-script.sh
+++ b/ctdb/config/debug-hung-script.sh
@@ -1,18 +1,48 @@
 #!/bin/sh
 
+[ -n "$CTDB_BASE" ] || \
+    export CTDB_BASE=$(cd -P $(dirname "$0") ; echo "$PWD")
+
+. "$CTDB_BASE/functions"
+
+loadconfig ctdb
+
+# Testing hook
+if [ -n "$CTDB_DEBUG_HUNG_SCRIPT_LOGFILE" ] ; then
+    exec >>"$CTDB_DEBUG_HUNG_SCRIPT_LOGFILE" 2>&1
+fi
+
 (
     flock --wait 2 9 || exit 1
 
     echo "===== Start of hung script debug for PID=\"$1\", event=\"$2\" ====="
 
     echo "pstree -p -a ${1}:"
-    pstree -p -a $1
+    out=$(pstree -p -a $1)
+    echo "$out"
+
+    # Check for processes matching a regular expression and print
+    # stack traces.  This could help confirm that certain processes
+    # are stuck in certain places such as the cluster filesystem.  The
+    # regexp should separate items with "\|" and should not contain
+    # parentheses.  The default pattern can be replaced for testing.
+    default_pat='exportfs\|rpcinfo'
+    pat="${CTDB_DEBUG_HUNG_SCRIPT_STACKPAT:-${default_pat}}"
+    echo "$out" |
+    sed -n "s@.*-\(.*${pat}.*\),\([0-9]*\).*@\2 \1@p" |
+    while read pid name ; do
+	trace=$(cat "/proc/${pid}/stack" 2>/dev/null)
+	if [ $? -eq 0 ] ; then
+	    echo "---- Stack trace of interesting process ${pid}[${name}] ----"
+	    echo "$trace"
+	fi
+    done
 
     if [ "$2" = "init" ] ; then
 	exit 0
     fi
 
-    echo "ctdb scriptstatus ${2}:"
+    echo "---- ctdb scriptstatus ${2}: ----"
     # No use running several of these in parallel if, say, "releaseip"
     # event hangs for multiple IPs.  In that case the output would be
     # interleaved in the log and would just be confusing.
diff --git a/ctdb/config/functions b/ctdb/config/functions
index 4363d3d..954e681 100755
--- a/ctdb/config/functions
+++ b/ctdb/config/functions
@@ -1289,14 +1289,7 @@ ctdb_service_stop ()
 
 # Default service_start() and service_stop() functions.
  
-# These may be overridden in an eventscript.  When overriding, the
-# following convention must be followed.  If these functions are
-# called with no arguments then they may use internal logic to
-# determine whether the service is managed and, therefore, whether
-# they should take any action.  However, if the service name is
-# specified as an argument then an attempt must be made to start or
-# stop the service.  This is because the auto-start/stop code calls
-# them with the service name as an argument.
+# These may be overridden in an eventscript.
 service_start ()
 {
     service "$service_name" start
diff --git a/ctdb/doc/ctdbd.conf.5.xml b/ctdb/doc/ctdbd.conf.5.xml
index a1f6db5..37b1cf9 100644
--- a/ctdb/doc/ctdbd.conf.5.xml
+++ b/ctdb/doc/ctdbd.conf.5.xml
@@ -1375,6 +1375,36 @@ CTDB_SET_MonitorInterval=20
       </varlistentry>
 
       <varlistentry>
+	<term>CTDB_DEBUG_HUNG_SCRIPT_LOGFILE=<parameter>FILENAME</parameter></term>
+	<listitem>
+	  <para>
+	    FILENAME specifies where log messages should go when
+	    debugging hung eventscripts.  This is a testing option.
+	    See also <citetitle>CTDB_DEBUG_HUNG_SCRIPT</citetitle>.
+	  </para>
+	  <para>
+	    No default.  Messages go to stdout/stderr and are logged
+	    to the same place as other CTDB log messages.
+	  </para>
+	</listitem>
+      </varlistentry>
+
+      <varlistentry>
+	<term>CTDB_DEBUG_HUNG_SCRIPT_STACKPAT=<parameter>REGEXP</parameter></term>
+	<listitem>
+	  <para>
+	    REGEXP specifies interesting processes for which stack
+	    traces should be logged when debugging hung eventscripts
+	    and those processes are matched in pstree output.  See
+	    also <citetitle>CTDB_DEBUG_HUNG_SCRIPT</citetitle>.
+	  </para>
+	  <para>
+	    Default is "exportfs\|rpcinfo".
+	  </para>
+	</listitem>
+      </varlistentry>
+
+      <varlistentry>
 	<term>CTDB_DEBUG_LOCKS=<parameter>FILENAME</parameter></term>
 	<listitem>
 	  <para>
diff --git a/ctdb/server/ctdb_takeover.c b/ctdb/server/ctdb_takeover.c
index d3a6e25..d2b2a9e 100644
--- a/ctdb/server/ctdb_takeover.c
+++ b/ctdb/server/ctdb_takeover.c
@@ -1880,7 +1880,6 @@ static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
 				    struct ctdb_ipflags *ipflags,
 				    struct ctdb_public_ip_list *all_ips,
 				    int srcnode,
-				    uint32_t candimbl,
 				    uint32_t *lcp2_imbalances,
 				    bool *rebalance_candidates)
 {
@@ -1900,7 +1899,8 @@ static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
 	numnodes = talloc_array_length(ipflags);
 
 	DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
-	DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n", srcnode, candimbl));
+	DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n",
+			   srcnode, lcp2_imbalances[srcnode]));
 
 	for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
 		/* Only consider addresses on srcnode. */
@@ -1910,7 +1910,7 @@ static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
 
 		/* What is this IP address costing the source node? */
 		srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
-		srcimbl = candimbl - srcdsum;
+		srcimbl = lcp2_imbalances[srcnode] - srcdsum;
 
 		/* Consider this IP address would cost each potential
 		 * destination node.  Destination nodes are limited to
@@ -1933,11 +1933,12 @@ static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
 			dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
 			dstimbl = lcp2_imbalances[dstnode] + dstdsum;
 			DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
-					   srcnode, srcimbl - lcp2_imbalances[srcnode],
+					   srcnode, -srcdsum,
 					   ctdb_addr_to_str(&(tmp_ip->addr)),
-					   dstnode, dstimbl - lcp2_imbalances[dstnode]));
+					   dstnode, dstdsum));
 
-			if ((dstimbl < candimbl) && (dstdsum < srcdsum) && \
+			if ((dstimbl < lcp2_imbalances[srcnode]) &&
+			    (dstdsum < srcdsum) &&			\
 			    ((mindstnode == -1) ||				\
 			     ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
 
@@ -1998,28 +1999,13 @@ static void lcp2_failback(struct ctdb_context *ctdb,
 			  uint32_t *lcp2_imbalances,
 			  bool *rebalance_candidates)
 {
-	int i, num_rebalance_candidates, numnodes;
+	int i, numnodes;
 	struct lcp2_imbalance_pnn * lips;
 	bool again;
 
 	numnodes = talloc_array_length(ipflags);
 
 try_again:
-
-	/* It is only worth continuing if we have suitable target
-	 * nodes to transfer IPs to.  This check is much cheaper than
-	 * continuing on...
-	 */
-	num_rebalance_candidates = 0;
-	for (i=0; i<numnodes; i++) {
-		if (rebalance_candidates[i]) {
-			num_rebalance_candidates++;
-		}
-	}
-	if (num_rebalance_candidates == 0) {
-		return;
-	}
-
 	/* Put the imbalances and nodes into an array, sort them and
 	 * iterate through candidates.  Usually the 1st one will be
 	 * used, so this doesn't cost much...
@@ -2048,7 +2034,6 @@ try_again:
 					    ipflags,
 					    all_ips,
 					    lips[i].pnn,
-					    lips[i].imbalance,
 					    lcp2_imbalances,
 					    rebalance_candidates)) {
 			again = true;
@@ -2153,6 +2138,7 @@ static void ip_alloc_lcp2(struct ctdb_context *ctdb,
 {
 	uint32_t *lcp2_imbalances;
 	bool *rebalance_candidates;
+	int numnodes, num_rebalance_candidates, i;
 
 	TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
 
@@ -2168,6 +2154,21 @@ static void ip_alloc_lcp2(struct ctdb_context *ctdb,
 		goto finished;
 	}
 
+	/* It is only worth continuing if we have suitable target
+	 * nodes to transfer IPs to.  This check is much cheaper than
+	 * continuing on...
+	 */
+	numnodes = talloc_array_length(ipflags);
+	num_rebalance_candidates = 0;
+	for (i=0; i<numnodes; i++) {
+		if (rebalance_candidates[i]) {
+			num_rebalance_candidates++;
+		}
+	}
+	if (num_rebalance_candidates == 0) {
+		goto finished;
+	}
+
 	/* Now, try to make sure the ip adresses are evenly distributed
 	   across the nodes.
 	*/
diff --git a/ctdb/tests/INSTALL b/ctdb/tests/INSTALL
index 5581989..23dcdaf 100755
--- a/ctdb/tests/INSTALL
+++ b/ctdb/tests/INSTALL
@@ -1,5 +1,8 @@
 #!/bin/sh
 
+# Stop on 1st error
+set -e
+
 # Script to install the CTDB testsuite on a machine.
 
 usage ()
@@ -71,7 +74,9 @@ for d in $data_subdirs ; do
 done
 # Some of the unit tests have relative symlinks back to in-tree bits
 # and pieces.  These links will be broken!
-for i in "events.d" "functions" "nfs-rpc-checks.d" ; do
+link_dir="${ctdb_datadir}/eventscripts/etc-ctdb/"
+broken_links=$(find "$link_dir" -type l | sed -e "s@^${link_dir}@@")
+for i in $broken_links ; do
     ln -sf "${etcdir}/ctdb/${i}" "${ctdb_datadir}/eventscripts/etc-ctdb/${i}"
 done
 # test_wrap needs to set TEST_BIN_DIR
@@ -86,6 +91,6 @@ ctdb_bindir="${destdir}${bindir}"
 echo "Installing wrapper scripts into ${ctdb_bindir}..."
 mkdir -p "${ctdb_bindir}"
 out="${ctdb_bindir}/ctdb_run_tests"
-sed -e "s@^test_dir=.*@test_dir=${datarootdir}/ctdb-tests\nexport TEST_BIN_DIR=\"${libdir}/ctdb-tests\"@" "tests/run_tests.sh" >"$out"
+sed -e "s@^test_dir=.*@test_dir=${datarootdir}/ctdb-tests\nexport TEST_BIN_DIR=\"${libdir}/ctdb-tests\"@" -e 's@^\(export CTDB_TESTS_ARE_INSTALLED\)=false@\1=true@' "tests/run_tests.sh" >"$out"
 chmod 755 "$out"
 ln -s "ctdb_run_tests" "${ctdb_bindir}/ctdb_run_cluster_tests"
diff --git a/ctdb/tests/complex/90_debug_hung_script.sh b/ctdb/tests/complex/90_debug_hung_script.sh
new file mode 100755
index 0000000..ef6216c
--- /dev/null
+++ b/ctdb/tests/complex/90_debug_hung_script.sh
@@ -0,0 +1,91 @@
+#!/bin/bash
+
+test_info()
+{
+    cat <<EOF
+Verify CTDB's debugging of timed out eventscripts
+
+Prerequisites:
+
+* An active CTDB cluster with monitoring enabled
+
+Expected results:
+
+* When an eventscript times out the correct debugging is executed.
+EOF
+}
+
+. "${TEST_SCRIPTS_DIR}/integration.bash"
+
+set -e
+
+ctdb_test_init "$@"
+
+ctdb_test_check_real_cluster
+
+cluster_is_healthy
+
+# No need for restart when done
+
+# This is overkill but it at least provides a valid test node
+select_test_node_and_ips
+
+####################
+
+# Set this if CTDB is installed in a non-standard location on cluster
+# nodes
+[ -n "$CTDB_BASE" ] || CTDB_BASE="/etc/ctdb"
+
+####################
+
+echo "Enable eventscript for testing timeouts..."
+ctdb_test_exit_hook_add "onnode -q $test_node $CTDB disablescript 99.timeout"
+try_command_on_node $test_node $CTDB enablescript "99.timeout"
+
+####################
+
+echo "Setting monitor events to time out..."
+rc_local_d="${CTDB_BASE}/rc.local.d"
+try_command_on_node $test_node mkdir -p "$rc_local_d"
+
+rc_local_f="${rc_local_d}/timeout_config.$$"
+ctdb_test_exit_hook_add "onnode $test_node rm -f $rc_local_f"
+
+try_command_on_node $test_node mktemp
+debug_output="$out"
+ctdb_test_exit_hook_add "onnode $test_node rm -f $debug_output"
+
+try_command_on_node -i $test_node tee "$rc_local_f" <<<"\
+CTDB_RUN_TIMEOUT_MONITOR=yes
+CTDB_DEBUG_HUNG_SCRIPT_LOGFILE=\"$debug_output\"
+CTDB_DEBUG_HUNG_SCRIPT_STACKPAT='exportfs\|rpcinfo\|sleep'"
+
+try_command_on_node $test_node chmod +x "$rc_local_f"
+
+####################
+
+wait_for_monitor_event $test_node
+
+echo "Checking output of hung script debugging..."
+try_command_on_node -v $test_node cat "$debug_output"
+
+while IFS="" read pattern ; do
+    if grep -- "^${pattern}\$" <<<"$out" >/dev/null ; then
+	echo "GOOD: output contains \"$pattern\""
+    else
+	echo "BAD: output does not contain \"$pattern\""
+	exit 1
+    fi
+done <<'EOF'
+===== Start of hung script debug for PID=".*", event="monitor" =====
+===== End of hung script debug for PID=".*", event="monitor" =====
+pstree -p -a .*:
+ *\`-99\\.timeout,.* /etc/ctdb/events.d/99.timeout monitor
+ *\`-sleep,.*
+---- Stack trace of interesting process [0-9]*\\[sleep\\] ----
+[<[0-9a-f]*>] .*sleep+.*
+---- ctdb scriptstatus monitor: ----
+[0-9]* scripts were executed last monitor cycle
+99\\.timeout *Status:TIMEDOUT.*
+ *OUTPUT:sleeping for [0-9]* seconds\\.\\.\\.
+EOF


-- 
Samba Shared Repository