[SCM] Samba Shared Repository - branch master updated

Amitay Isaacs amitay at samba.org
Tue May 31 05:57:01 UTC 2022


The branch, master has been updated
       via  b20ee18031c ctdb-tests: Fix a cut and paste error in a comment
       via  90a96f06a93 ctdb-recoverd: Do not ban on unknown error when taking cluster lock
       via  a400f4e7cc0 ctdb-doc: Fix typos in the policy routing documentation
       via  da9decfc5e3 ctdb-daemon: Remove unused #includes of rb_tree.h
       via  80de84d36e9 ctdb-daemon: Log per-database summary of resent calls
      from  3567f4130d8 debug: update comments about setup_logging()

https://git.samba.org/?p=samba.git;a=shortlog;h=master


- Log -----------------------------------------------------------------
commit b20ee18031c5280c78ad1ec9d2c5cc6d28be719f
Author: Martin Schwenke <martin at meltin.net>
Date:   Tue May 24 11:30:20 2022 +1000

    ctdb-tests: Fix a cut and paste error in a comment
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>
    Reviewed-by: Amitay Isaacs <amitay at gmail.com>
    
    Autobuild-User(master): Amitay Isaacs <amitay at samba.org>
    Autobuild-Date(master): Tue May 31 05:56:43 UTC 2022 on sn-devel-184

commit 90a96f06a938d5fdae33c959c71103add3064b96
Author: Martin Schwenke <martin at meltin.net>
Date:   Thu May 19 15:09:41 2022 +1000

    ctdb-recoverd: Do not ban on unknown error when taking cluster lock
    
    If the cluster filesystem is unavailable then I/O errors may occur.
    This is no worse than contention, so don't ban.  This avoids having
    services unavailable for longer than necessary.
    
    Update the associated test to simply confirm that this results in a
    leaderless cluster, and leadership is restored when the lock can once
    again be taken.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>
    Reviewed-by: Amitay Isaacs <amitay at gmail.com>

commit a400f4e7cc0f48efd7b88354d1a550615fec6d64
Author: Martin Schwenke <martin at meltin.net>
Date:   Fri Mar 4 08:52:32 2022 +1100

    ctdb-doc: Fix typos in the policy routing documentation
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>
    Reviewed-by: Amitay Isaacs <amitay at gmail.com>

commit da9decfc5e35d7d59987904d4dc03cf2ce1db3e7
Author: Martin Schwenke <martin at meltin.net>
Date:   Wed Feb 23 09:57:33 2022 +1100

    ctdb-daemon: Remove unused #includes of rb_tree.h
    
    ctdb_takeover.c and eventscript.c no longer use this.
    ipalloc_common.c has never used it.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>
    Reviewed-by: Amitay Isaacs <amitay at gmail.com>

commit 80de84d36e9c29d9506976f991560fb5dde99471
Author: Martin Schwenke <martin at meltin.net>
Date:   Fri Oct 15 11:10:46 2021 +1100

    ctdb-daemon: Log per-database summary of resent calls
    
    After a recovery that takes a significant amount of time the logs are
    flooded with messages about every resent call.
    
    Log a summary instead and demote per-call messages to INFO level.
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>
    Reviewed-by: Amitay Isaacs <amitay at gmail.com>

-----------------------------------------------------------------------

Summary of changes:
 ctdb/doc/ctdb.7.xml                                |  4 +-
 ctdb/server/ctdb_call.c                            | 12 ++-
 ctdb/server/ctdb_recoverd.c                        |  7 --
 ctdb/server/ctdb_takeover.c                        |  1 -
 ctdb/server/eventscript.c                          |  1 -
 ctdb/server/ipalloc_common.c                       |  1 -
 .../cluster.008.capability_leader_yield_no_lock.sh |  3 +-
 .../simple/cluster.016.reclock_move_lock_dir.sh    | 90 +++++++++++-----------
 8 files changed, 60 insertions(+), 59 deletions(-)


Changeset truncated at 500 lines:

diff --git a/ctdb/doc/ctdb.7.xml b/ctdb/doc/ctdb.7.xml
index 6daa092f837..351f3e8b6f7 100644
--- a/ctdb/doc/ctdb.7.xml
+++ b/ctdb/doc/ctdb.7.xml
@@ -895,7 +895,7 @@ CTDB_NATGW_DEFAULT_GATEWAY=10.0.0.1
       </para>
 
       <screen>
-  192.168.1.99	192.168.1.1/24
+  192.168.1.99	192.168.1.0/24
       </screen>
 
       <para>
@@ -918,7 +918,7 @@ CTDB_NATGW_DEFAULT_GATEWAY=10.0.0.1
       </screen>
 
       <para>  
-	This causes traffic from 192.168.1.1 to 192.168.1.0/24 go via
+	This causes traffic from 192.168.1.99 to 192.168.1.0/24 to go via
 	eth2.
       </para>
 
diff --git a/ctdb/server/ctdb_call.c b/ctdb/server/ctdb_call.c
index 14baa797bd6..1d5dea32962 100644
--- a/ctdb/server/ctdb_call.c
+++ b/ctdb/server/ctdb_call.c
@@ -1398,8 +1398,10 @@ static void ctdb_call_resend(struct ctdb_call_state *state)
 	state->c->hdr.destnode = ctdb->pnn;
 
 	ctdb_queue_packet(ctdb, &state->c->hdr);
-	DEBUG(DEBUG_NOTICE,("resent ctdb_call for db %s reqid %u generation %u\n",
-			    state->ctdb_db->db_name, state->reqid, state->generation));
+	D_INFO("resent ctdb_call for db %s reqid %u generation %u\n",
+	       state->ctdb_db->db_name,
+	       state->reqid,
+	       state->generation);
 }
 
 /*
@@ -1408,11 +1410,17 @@ static void ctdb_call_resend(struct ctdb_call_state *state)
 void ctdb_call_resend_db(struct ctdb_db_context *ctdb_db)
 {
 	struct ctdb_call_state *state, *next;
+	unsigned int count = 0;
 
 	for (state = ctdb_db->pending_calls; state; state = next) {
 		next = state->next;
 		ctdb_call_resend(state);
+		count++;
 	}
+	D_NOTICE("Resent calls for database=%s, generation=%u, count=%u\n",
+		 ctdb_db->db_name,
+		 ctdb_db->generation,
+		 count);
 }
 
 void ctdb_call_resend_all(struct ctdb_context *ctdb)
diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c
index 03698ef2928..c293aa7f037 100644
--- a/ctdb/server/ctdb_recoverd.c
+++ b/ctdb/server/ctdb_recoverd.c
@@ -839,13 +839,6 @@ static void take_cluster_lock_handler(char status,
 
 	default:
 		D_ERR("Unable to take cluster lock - unknown error\n");
-
-		{
-			struct ctdb_recoverd *rec = s->rec;
-
-			D_ERR("Banning this node\n");
-			ctdb_ban_node(rec, rec->pnn);
-		}
 	}
 
 	s->done = true;
diff --git a/ctdb/server/ctdb_takeover.c b/ctdb/server/ctdb_takeover.c
index 4e9de8fd6af..c1e4f683784 100644
--- a/ctdb/server/ctdb_takeover.c
+++ b/ctdb/server/ctdb_takeover.c
@@ -38,7 +38,6 @@
 #include "ctdb_private.h"
 #include "ctdb_client.h"
 
-#include "common/rb_tree.h"
 #include "common/reqid.h"
 #include "common/system.h"
 #include "common/system_socket.h"
diff --git a/ctdb/server/eventscript.c b/ctdb/server/eventscript.c
index 50aee915aa0..3ea7d74e955 100644
--- a/ctdb/server/eventscript.c
+++ b/ctdb/server/eventscript.c
@@ -36,7 +36,6 @@
 
 #include "ctdb_private.h"
 
-#include "common/rb_tree.h"
 #include "common/common.h"
 #include "common/logging.h"
 #include "common/reqid.h"
diff --git a/ctdb/server/ipalloc_common.c b/ctdb/server/ipalloc_common.c
index 53d96d11e47..437c5114ebb 100644
--- a/ctdb/server/ipalloc_common.c
+++ b/ctdb/server/ipalloc_common.c
@@ -30,7 +30,6 @@
 #include "common/logging.h"
 
 #include "common/common.h"
-#include "common/rb_tree.h"
 
 #include "protocol/protocol_util.h"
 
diff --git a/ctdb/tests/INTEGRATION/simple/cluster.008.capability_leader_yield_no_lock.sh b/ctdb/tests/INTEGRATION/simple/cluster.008.capability_leader_yield_no_lock.sh
index b84ae4f42d3..4489bc580c6 100755
--- a/ctdb/tests/INTEGRATION/simple/cluster.008.capability_leader_yield_no_lock.sh
+++ b/ctdb/tests/INTEGRATION/simple/cluster.008.capability_leader_yield_no_lock.sh
@@ -1,6 +1,7 @@
 #!/usr/bin/env bash
 
-# Verify that 'ctdb ban' causes a node to yield the leader role
+# Verify that removing the leader capability causes a node to
+# yield the leader role
 
 . "${TEST_SCRIPTS_DIR}/integration.bash"
 
diff --git a/ctdb/tests/INTEGRATION/simple/cluster.016.reclock_move_lock_dir.sh b/ctdb/tests/INTEGRATION/simple/cluster.016.reclock_move_lock_dir.sh
index a2ba112c68c..ca2e7157dfc 100755
--- a/ctdb/tests/INTEGRATION/simple/cluster.016.reclock_move_lock_dir.sh
+++ b/ctdb/tests/INTEGRATION/simple/cluster.016.reclock_move_lock_dir.sh
@@ -1,11 +1,11 @@
 #!/usr/bin/env bash
 
-# Verify that if the directory containing the recovery lock is moved
-# then all nodes are banned (because they can't take the lock).
-# Confirm that if the directory is moved back and the bans time out
-# then the cluster returns to good health.
+# Verify that if the directory containing the cluster lock is moved
+# then the current cluster leader no longer claims to be leader, and
+# no other node claims to be leader.  Confirm that if the directory is
+# moved back then a node will become leader.
 
-# This simulates the cluster filesystem containing the recovery lock
+# This simulates the cluster filesystem containing the cluster lock
 # being unmounted and remounted.
 
 . "${TEST_SCRIPTS_DIR}/integration.bash"
@@ -19,21 +19,9 @@ ctdb_test_init -n
 echo "Starting CTDB with cluster lock recheck time set to 5s..."
 ctdb_nodes_start_custom -r 5
 
-all_nodes_are_banned ()
-{
-	node="$1"
-
-	ctdb_onnode "$node" nodestatus
-	[ $? -eq 1 ]
-
-	# shellcheck disable=SC2154
-	# $out set by ctdb_onnode() above
-	[ "$out" = "Warning: All nodes are banned." ]
-}
-
 select_test_node
 
-echo "Get recovery lock setting"
+echo "Get cluster lock setting"
 # shellcheck disable=SC2154
 # $test_node set by select_test_node() above
 ctdb_onnode "$test_node" getreclock
@@ -42,49 +30,63 @@ ctdb_onnode "$test_node" getreclock
 reclock_setting="$out"
 
 if [ -z "$reclock_setting" ] ; then
-	ctdb_test_skip "Recovery lock is not set"
+	ctdb_test_skip "Cluster lock is not set"
 fi
 
 t="${reclock_setting% 5}"
 reclock="${t##* }"
 
 if [ ! -f "$reclock" ] ; then
-	ctdb_test_error "Recovery lock file \"${reclock}\" is missing"
+	ctdb_test_error "Cluster lock file \"${reclock}\" is missing"
 fi
 
-echo "Recovery lock setting is \"${reclock_setting}\""
-echo "Recovery lock file is \"${reclock}\""
-echo
-
-echo "Set ban period to 30s"
-ctdb_onnode all setvar RecoveryBanPeriod 30
+echo "Cluster lock setting is \"${reclock_setting}\""
+echo "Cluster lock file is \"${reclock}\""
 echo
 
-# Avoid a race where the election handler can be called before the
-# tunables are updated in the recovery daemon.  Ideally, since
-# everything is idle, this should take one RecoverInterval
-# (i.e. iteration of the monitor loop in the recovery daemon).
-# However, this is the interval between loops and each loop can take
-# an arbitrary amount of time.  The only way to be sure that the
-# tunables have definitely been updated is to do 2 recoveries - this
-# guarantees the tunables were read at the top of the loop between the
-# 2 recoveries.
-echo "2 recoveries to ensure that tunables have been re-read"
-ctdb_onnode "$test_node" "recover"
-ctdb_onnode "$test_node" "recover"
+leader_get "$test_node"
 
 dir=$(dirname "$reclock")
 
-echo "Rename recovery lock directory"
+echo "Rename cluster lock directory"
 mv "$dir" "${dir}.$$"
+
+wait_until_leader_has_changed "$test_node"
 echo
 
-echo "Wait until all nodes are banned"
-wait_until 60 all_nodes_are_banned "$test_node"
+# shellcheck disable=SC2154
+# $leader set by leader_get() & wait_until_leader_has_changed(), above
+if [ "$leader" != "UNKNOWN" ]; then
+	test_fail "BAD: leader is ${leader}"
+fi
+
+echo "OK: leader is UNKNOWN"
 echo
 
-echo "Restore recovery lock directory"
-mv "${dir}.$$" "$dir"
+echo 'Get "leader timeout":'
+conf_tool="${CTDB_SCRIPTS_HELPER_BINDIR}/ctdb-config"
+# shellcheck disable=SC2154
+# $test_node set by select_test_node() above
+try_command_on_node "$test_node" "${conf_tool} get cluster 'leader timeout'"
+# shellcheck disable=SC2154
+# $out set by try_command_on_node() above
+leader_timeout="$out"
+echo "Leader timeout is ${leader_timeout}s"
 echo
 
-wait_until_ready 60
+sleep_time=$((2 * leader_timeout))
+echo "Waiting for ${sleep_time}s to confirm leader stays UNKNOWN"
+sleep_for $sleep_time
+
+leader_get "$test_node"
+if [ "$leader" = "UNKNOWN" ]; then
+	echo "OK: leader is UNKNOWN"
+	echo
+else
+	test_fail "BAD: leader is ${leader}"
+fi
+
+echo "Restore cluster lock directory"
+mv "${dir}.$$" "$dir"
+
+wait_until_leader_has_changed "$test_node"


-- 
Samba Shared Repository



More information about the samba-cvs mailing list