[SCM] Samba Shared Repository - branch master updated

Amitay Isaacs amitay at samba.org
Mon Nov 5 05:53:02 UTC 2018


The branch, master has been updated
       via  27df4f0 ctdb-recovery: Ban a node that causes recovery failure
      from  3338a3e traffic: Machine accounts were generated as critical objects

https://git.samba.org/?p=samba.git;a=shortlog;h=master


- Log -----------------------------------------------------------------
commit 27df4f002a594dbb2f2a38afaccf3e22f19818e1
Author: Martin Schwenke <martin at meltin.net>
Date:   Mon Oct 29 14:33:08 2018 +1100

    ctdb-recovery: Ban a node that causes recovery failure
    
    ... instead of applying banning credits.
    
    There have been a couple of cases where recovery repeatedly takes just
    over 2 minutes to fail.  Therefore, banning credits expire between
    failures and a continuously problematic node is never banned,
    resulting in endless recoveries.  This is because it takes 2
    applications of banning credits before a node is banned, which
    generally involves 2 recovery failures.
    
    The recovery helper makes up to 3 attempts to recover each database
    during a single run.  If a node causes 3 failures then this is really
    equivalent to 3 recovery failures in the model that existed before the
    recovery helper added retries.  In that case the node would have been
    banned after 2 failures.
    
    So, instead of applying banning credits to the "most failing" node,
    simply ban it directly from the recovery helper.
    
    If multiple nodes are causing recovery failures then this can cause a
    node to be banned more quickly than it might otherwise have been, even
    pre-recovery-helper.  However, 90 seconds (i.e. 3 failures) is a long
    time to be in recovery, so banning earlier seems like the best
    approach.
    
    BUG: https://bugzilla.samba.org/show_bug.cgi?id=13670
    
    Signed-off-by: Martin Schwenke <martin at meltin.net>
    Reviewed-by: Amitay Isaacs <amitay at gmail.com>
    
    Autobuild-User(master): Amitay Isaacs <amitay at samba.org>
    Autobuild-Date(master): Mon Nov  5 06:52:33 CET 2018 on sn-devel-144

-----------------------------------------------------------------------

Summary of changes:
 ctdb/server/ctdb_recovery_helper.c | 46 +++++++++++++++++++++++++-------------
 1 file changed, 31 insertions(+), 15 deletions(-)


Changeset truncated at 500 lines:

diff --git a/ctdb/server/ctdb_recovery_helper.c b/ctdb/server/ctdb_recovery_helper.c
index 7495eb3..7fdcc2e 100644
--- a/ctdb/server/ctdb_recovery_helper.c
+++ b/ctdb/server/ctdb_recovery_helper.c
@@ -2571,22 +2571,28 @@ static void recovery_db_recovery_done(struct tevent_req *subreq)
 
 		/* If pulling database fails multiple times */
 		if (max_credits >= NUM_RETRIES) {
-			struct ctdb_req_message message;
-
-			D_ERR("Assigning banning credits to node %u\n",
-			      max_pnn);
-
-			message.srvid = CTDB_SRVID_BANNING;
-			message.data.pnn = max_pnn;
-
-			subreq = ctdb_client_message_send(
-					state, state->ev, state->client,
-					ctdb_client_pnn(state->client),
-					&message);
+			struct ctdb_ban_state ban_state = {
+				.pnn = max_pnn,
+				.time = state->tun_list->recovery_ban_period,
+			};
+
+			D_ERR("Banning node %u for %u seconds\n",
+			      ban_state.pnn,
+			      ban_state.time);
+
+			ctdb_req_control_set_ban_state(&request,
+						       &ban_state);
+			subreq = ctdb_client_control_send(state,
+							  state->ev,
+							  state->client,
+							  ban_state.pnn,
+							  TIMEOUT(),
+							  &request);
 			if (tevent_req_nomem(subreq, req)) {
 				return;
 			}
-			tevent_req_set_callback(subreq, recovery_failed_done,
+			tevent_req_set_callback(subreq,
+						recovery_failed_done,
 						req);
 		} else {
 			tevent_req_error(req, EIO);
@@ -2609,15 +2615,25 @@ static void recovery_failed_done(struct tevent_req *subreq)
 {
 	struct tevent_req *req = tevent_req_callback_data(
 		subreq, struct tevent_req);
+	struct recovery_state *state = tevent_req_data(
+		req, struct recovery_state);
+	struct ctdb_reply_control *reply;
 	int ret;
 	bool status;
 
-	status = ctdb_client_message_recv(subreq, &ret);
+	status = ctdb_client_control_recv(subreq, &ret, state, &reply);
 	TALLOC_FREE(subreq);
 	if (! status) {
-		D_ERR("failed to assign banning credits, ret=%d\n", ret);
+		D_ERR("failed to ban node, ret=%d\n", ret);
+		goto done;
+	}
+
+	ret = ctdb_reply_control_set_ban_state(reply);
+	if (ret != 0) {
+		D_ERR("control SET_BAN_STATE failed, ret=%d\n", ret);
 	}
 
+done:
 	tevent_req_error(req, EIO);
 }
 


-- 
Samba Shared Repository



More information about the samba-cvs mailing list