[SCM] CTDB repository - branch master updated - 3059ab5f3f21e331b80728773c36a74620e46677

Ronnie Sahlberg sahlberg at samba.org
Fri Aug 8 03:34:44 GMT 2008


The branch, master has been updated
       via  3059ab5f3f21e331b80728773c36a74620e46677 (commit)
       via  b3b9707dd8244758ff1080401a9e03e74766e1ab (commit)
       via  7f29c50ccbc7789bfbc20bcb4b65758af9ebe6c5 (commit)
       via  7c6b621f7307dc39ffcd7d965ac613642af201b8 (commit)
       via  e75cc3a030a8ccb43961cf80ff10d41ec81a24b0 (commit)
       via  2426b9010ef45f5e96ffc12b8a69a3b0566b4f98 (commit)
       via  a4814aa8b0b165b9d6c4c55fc5aee33cd1a570bd (commit)
       via  7ed5fbe7fa3bc3cb729d9b516d2a73d52e28d22d (commit)
       via  6915661a460cd589b441ac7cd8695f35c4e83113 (commit)
      from  58e6dc722ad1e2415b71baf1d471885169dde14d (commit)

http://gitweb.samba.org/?p=sahlberg/ctdb.git;a=shortlog;h=master


- Log -----------------------------------------------------------------
commit 3059ab5f3f21e331b80728773c36a74620e46677
Merge: e75cc3a030a8ccb43961cf80ff10d41ec81a24b0 b3b9707dd8244758ff1080401a9e03e74766e1ab
Author: Ronnie Sahlberg <ronniesahlberg at gmail.com>
Date:   Fri Aug 8 13:11:07 2008 +1000

    Merge git://git.samba.org/tridge/ctdb

commit b3b9707dd8244758ff1080401a9e03e74766e1ab
Author: Andrew Tridgell <tridge at samba.org>
Date:   Fri Aug 8 13:11:41 2008 +1000

    added retry handling in client

commit 7f29c50ccbc7789bfbc20bcb4b65758af9ebe6c5
Author: Andrew Tridgell <tridge at samba.org>
Date:   Fri Aug 8 13:11:28 2008 +1000

    added a new control CTDB_CONTROL_TRANS2_COMMIT_RETRY so we can tell
    the difference between an initial commit attempt and a retry, which
    allows us to get the persistent updates counter right for retries

commit 7c6b621f7307dc39ffcd7d965ac613642af201b8
Author: Andrew Tridgell <tridge at samba.org>
Date:   Fri Aug 8 11:04:21 2008 +1000

    imported failure handling from dbwrap_ctdb.c

commit e75cc3a030a8ccb43961cf80ff10d41ec81a24b0
Merge: a4814aa8b0b165b9d6c4c55fc5aee33cd1a570bd 7c6b621f7307dc39ffcd7d965ac613642af201b8
Author: Ronnie Sahlberg <ronniesahlberg at gmail.com>
Date:   Fri Aug 8 10:59:40 2008 +1000

    Merge git://git.samba.org/tridge/ctdb

commit 2426b9010ef45f5e96ffc12b8a69a3b0566b4f98
Author: Andrew Tridgell <tridge at samba.org>
Date:   Fri Aug 8 10:15:23 2008 +1000

    save writing the same data twice

commit a4814aa8b0b165b9d6c4c55fc5aee33cd1a570bd
Author: Ronnie Sahlberg <ronniesahlberg at gmail.com>
Date:   Fri Aug 8 10:01:20 2008 +1000

    new version 1.0.54

commit 7ed5fbe7fa3bc3cb729d9b516d2a73d52e28d22d
Author: Andrew Tridgell <tridge at samba.org>
Date:   Fri Aug 8 10:00:33 2008 +1000

    up release number

commit 6915661a460cd589b441ac7cd8695f35c4e83113
Author: Andrew Tridgell <tridge at samba.org>
Date:   Fri Aug 8 09:58:49 2008 +1000

    return a more detailed error code from a trans2 commit error

-----------------------------------------------------------------------

Summary of changes:
 client/ctdb_client.c     |   78 +++++++++++++++++++++++++++++++++++----------
 include/ctdb_private.h   |   11 ++++++
 packaging/RPM/ctdb.spec  |    6 +++-
 server/ctdb_control.c    |    1 +
 server/ctdb_persistent.c |   35 +++++++++++++++++---
 5 files changed, 107 insertions(+), 24 deletions(-)


Changeset truncated at 500 lines:

diff --git a/client/ctdb_client.c b/client/ctdb_client.c
index 2b31d81..0d85374 100644
--- a/client/ctdb_client.c
+++ b/client/ctdb_client.c
@@ -3068,12 +3068,13 @@ int ctdb_transaction_store(struct ctdb_transaction_handle *h,
 {
 	TALLOC_CTX *tmp_ctx = talloc_new(h);
 	struct ctdb_ltdb_header header;
+	TDB_DATA olddata;
 	int ret;
 
 	ZERO_STRUCT(header);
 
 	/* we need the header so we can update the RSN */
-	ret = ctdb_ltdb_fetch(h->ctdb_db, key, &header, tmp_ctx, NULL);
+	ret = ctdb_ltdb_fetch(h->ctdb_db, key, &header, tmp_ctx, &olddata);
 	if (ret == -1 && header.dmaster == (uint32_t)-1) {
 		/* the record doesn't exist - create one with us as dmaster.
 		   This is only safe because we are in a transaction and this
@@ -3086,6 +3087,13 @@ int ctdb_transaction_store(struct ctdb_transaction_handle *h,
 		return ret;
 	}
 
+	if (data.dsize == olddata.dsize &&
+	    memcmp(data.dptr, olddata.dptr, data.dsize) == 0) {
+		/* save writing the same data */
+		talloc_free(tmp_ctx);
+		return 0;
+	}
+
 	header.rsn++;
 
 	if (!h->in_replay) {
@@ -3095,13 +3103,13 @@ int ctdb_transaction_store(struct ctdb_transaction_handle *h,
 			talloc_free(tmp_ctx);
 			return -1;
 		}
-		
-		h->m_write = ctdb_marshall_add(h, h->m_write, h->ctdb_db->db_id, 0, key, &header, data);
-		if (h->m_write == NULL) {
-			DEBUG(DEBUG_ERR,(__location__ " Failed to add to marshalling record\n"));
-			talloc_free(tmp_ctx);
-			return -1;
-		}
+	}		
+
+	h->m_write = ctdb_marshall_add(h, h->m_write, h->ctdb_db->db_id, 0, key, &header, data);
+	if (h->m_write == NULL) {
+		DEBUG(DEBUG_ERR,(__location__ " Failed to add to marshalling record\n"));
+		talloc_free(tmp_ctx);
+		return -1;
 	}
 	
 	ret = ctdb_ltdb_store(h->ctdb_db, key, &header, data);
@@ -3120,6 +3128,8 @@ static int ctdb_replay_transaction(struct ctdb_transaction_handle *h)
 	struct ctdb_rec_data *rec = NULL;
 
 	h->in_replay = true;
+	talloc_free(h->m_write);
+	h->m_write = NULL;
 
 	ret = ctdb_transaction_fetch_start(h);
 	if (ret != 0) {
@@ -3171,16 +3181,11 @@ failed:
  */
 int ctdb_transaction_commit(struct ctdb_transaction_handle *h)
 {
-	int ret;
+	int ret, retries=0;
 	int32_t status;
 	struct ctdb_context *ctdb = h->ctdb_db->ctdb;
 	struct timeval timeout;
-
-	if (h->m_write == NULL) {
-		/* no changes were made */
-		talloc_free(h);
-		return 0;
-	}
+	enum ctdb_controls failure_control = CTDB_CONTROL_TRANS2_ERROR;
 
 	talloc_set_destructor(h, NULL);
 
@@ -3200,24 +3205,61 @@ int ctdb_transaction_commit(struct ctdb_transaction_handle *h)
 	*/
 
 again:
+	if (h->m_write == NULL) {
+		/* no changes were made */
+		tdb_transaction_cancel(h->ctdb_db->ltdb->tdb);
+		talloc_free(h);
+		return 0;
+	}
+
 	/* tell ctdbd to commit to the other nodes */
 	timeout = timeval_current_ofs(1, 0);
 	ret = ctdb_control(ctdb, CTDB_CURRENT_NODE, h->ctdb_db->db_id, 
-			   CTDB_CONTROL_TRANS2_COMMIT, 0, 
+			   retries==0?CTDB_CONTROL_TRANS2_COMMIT:CTDB_CONTROL_TRANS2_COMMIT_RETRY, 0, 
 			   ctdb_marshall_finish(h->m_write), NULL, NULL, &status, 
 			   &timeout, NULL);
 	if (ret != 0 || status != 0) {
 		tdb_transaction_cancel(h->ctdb_db->ltdb->tdb);
 		sleep(1);
+
+		if (ret != 0) {
+			failure_control = CTDB_CONTROL_TRANS2_ERROR;
+		} else {
+			/* work out what error code we will give if we 
+			   have to fail the operation */
+			switch ((enum ctdb_trans2_commit_error)status) {
+			case CTDB_TRANS2_COMMIT_SUCCESS:
+			case CTDB_TRANS2_COMMIT_SOMEFAIL:
+			case CTDB_TRANS2_COMMIT_TIMEOUT:
+				failure_control = CTDB_CONTROL_TRANS2_ERROR;
+				break;
+			case CTDB_TRANS2_COMMIT_ALLFAIL:
+				failure_control = CTDB_CONTROL_TRANS2_FINISHED;
+				break;
+			}
+		}
+
+		if (++retries == 10) {
+			DEBUG(DEBUG_ERR,(__location__ " Giving up transaction on db 0x%08x after %d retries failure_control=%u\n", 
+					 h->ctdb_db->db_id, retries, (unsigned)failure_control));
+			ctdb_control(ctdb, CTDB_CURRENT_NODE, h->ctdb_db->db_id, 
+				     failure_control, CTDB_CTRL_FLAG_NOREPLY, 
+				     tdb_null, NULL, NULL, NULL, NULL, NULL);		
+			talloc_free(h);
+			return -1;
+		}		
+
 		if (ctdb_replay_transaction(h) != 0) {
 			DEBUG(DEBUG_ERR,(__location__ " Failed to replay transaction\n"));
 			ctdb_control(ctdb, CTDB_CURRENT_NODE, h->ctdb_db->db_id, 
-				     CTDB_CONTROL_TRANS2_ERROR, CTDB_CTRL_FLAG_NOREPLY, 
+				     failure_control, CTDB_CTRL_FLAG_NOREPLY, 
 				     tdb_null, NULL, NULL, NULL, NULL, NULL);		
 			talloc_free(h);
 			return -1;
 		}
 		goto again;
+	} else {
+		failure_control = CTDB_CONTROL_TRANS2_ERROR;
 	}
 
 	/* do the real commit locally */
@@ -3225,7 +3267,7 @@ again:
 	if (ret != 0) {
 		DEBUG(DEBUG_ERR,(__location__ " Failed to commit transaction\n"));
 		ctdb_control(ctdb, CTDB_CURRENT_NODE, h->ctdb_db->db_id, 
-			     CTDB_CONTROL_TRANS2_ERROR, CTDB_CTRL_FLAG_NOREPLY, 
+			     failure_control, CTDB_CTRL_FLAG_NOREPLY, 
 			     tdb_null, NULL, NULL, NULL, NULL, NULL);		
 		talloc_free(h);
 		return ret;
diff --git a/include/ctdb_private.h b/include/ctdb_private.h
index ff4d271..f44a940 100644
--- a/include/ctdb_private.h
+++ b/include/ctdb_private.h
@@ -549,6 +549,7 @@ enum ctdb_controls {CTDB_CONTROL_PROCESS_EXISTS          = 0,
 		    CTDB_CONTROL_TRANS2_COMMIT           = 83,
 		    CTDB_CONTROL_TRANS2_FINISHED         = 84,
 		    CTDB_CONTROL_TRANS2_ERROR            = 85,
+		    CTDB_CONTROL_TRANS2_COMMIT_RETRY     = 86,
 };	
 
 /*
@@ -789,6 +790,16 @@ struct ctdb_req_keepalive {
 	struct ctdb_req_header hdr;
 };
 
+
+/* types of failures possible from TRANS2_COMMIT */
+enum ctdb_trans2_commit_error {
+	CTDB_TRANS2_COMMIT_SUCCESS=0, /* all nodes committed successfully */
+	CTDB_TRANS2_COMMIT_TIMEOUT=1, /* at least one node timed out */
+	CTDB_TRANS2_COMMIT_ALLFAIL=2, /* all nodes failed the commit */
+	CTDB_TRANS2_COMMIT_SOMEFAIL=3 /* some nodes failed the commit, some allowed it */
+};
+
+
 /* internal prototypes */
 void ctdb_set_error(struct ctdb_context *ctdb, const char *fmt, ...) PRINTF_ATTRIBUTE(2,3);
 void ctdb_fatal(struct ctdb_context *ctdb, const char *msg);
diff --git a/packaging/RPM/ctdb.spec b/packaging/RPM/ctdb.spec
index fcaf2ee..ea9ffbd 100644
--- a/packaging/RPM/ctdb.spec
+++ b/packaging/RPM/ctdb.spec
@@ -5,7 +5,7 @@ Vendor: Samba Team
 Packager: Samba Team <samba at samba.org>
 Name: ctdb
 Version: 1.0
-Release: 53
+Release: 54
 Epoch: 0
 License: GNU GPL version 3
 Group: System Environment/Daemons
@@ -118,6 +118,10 @@ fi
 %{_includedir}/ctdb_private.h
 
 %changelog
+* Fri Aug 8 2008 : Version 1.0.54
+ - fix a looping error in the transaction code
+ - provide a more detailed error code for persistent store errors
+   so clients can make more intelligent choices on how to try to recover
 * Thu Aug 7 2008 : Version 1.0.53
  - Remove the reclock.pnn file   it can cause gpfs to fail to umount
  - New transaction code
diff --git a/server/ctdb_control.c b/server/ctdb_control.c
index 59b0657..edfe344 100644
--- a/server/ctdb_control.c
+++ b/server/ctdb_control.c
@@ -397,6 +397,7 @@ static int32_t ctdb_control_dispatch(struct ctdb_context *ctdb,
 		return ctdb_control_cancel_persistent_update(ctdb, c, indata);
 
 	case CTDB_CONTROL_TRANS2_COMMIT:
+	case CTDB_CONTROL_TRANS2_COMMIT_RETRY:
 		return ctdb_control_trans2_commit(ctdb, c, indata, async_reply);
 
 	case CTDB_CONTROL_TRANS2_ERROR:
diff --git a/server/ctdb_persistent.c b/server/ctdb_persistent.c
index 5b88b4b..42b148c 100644
--- a/server/ctdb_persistent.c
+++ b/server/ctdb_persistent.c
@@ -32,9 +32,17 @@ struct ctdb_persistent_state {
 	const char *errormsg;
 	uint32_t num_pending;
 	int32_t status;
+	uint32_t num_failed, num_sent;
 };
 
 /*
+  1) all nodes fail, and all nodes reply
+  2) some nodes fail, all nodes reply
+  3) some nodes timeout
+  4) all nodes succeed
+ */
+
+/*
   called when a node has acknowledged a ctdb_control_update_record call
  */
 static void ctdb_persistent_callback(struct ctdb_context *ctdb,
@@ -50,10 +58,19 @@ static void ctdb_persistent_callback(struct ctdb_context *ctdb,
 			 status, errormsg));
 		state->status = status;
 		state->errormsg = errormsg;
+		state->num_failed++;
 	}
 	state->num_pending--;
 	if (state->num_pending == 0) {
-		ctdb_request_control_reply(state->ctdb, state->c, NULL, state->status, state->errormsg);
+		enum ctdb_trans2_commit_error etype;
+		if (state->num_failed == state->num_sent) {
+			etype = CTDB_TRANS2_COMMIT_ALLFAIL;
+		} else if (state->num_failed != 0) {
+			etype = CTDB_TRANS2_COMMIT_SOMEFAIL;
+		} else {
+			etype = CTDB_TRANS2_COMMIT_SUCCESS;
+		}
+		ctdb_request_control_reply(state->ctdb, state->c, NULL, etype, state->errormsg);
 		talloc_free(state);
 	}
 }
@@ -66,7 +83,8 @@ static void ctdb_persistent_store_timeout(struct event_context *ev, struct timed
 {
 	struct ctdb_persistent_state *state = talloc_get_type(private_data, struct ctdb_persistent_state);
 	
-	ctdb_request_control_reply(state->ctdb, state->c, NULL, -1, "timeout in ctdb_persistent_state");
+	ctdb_request_control_reply(state->ctdb, state->c, NULL, CTDB_TRANS2_COMMIT_TIMEOUT, 
+				   "timeout in ctdb_persistent_state");
 
 	talloc_free(state);
 }
@@ -103,12 +121,18 @@ int32_t ctdb_control_trans2_commit(struct ctdb_context *ctdb,
 	        then have it decremented in ctdb_control_trans2_error
 	        or ctdb_control_trans2_finished
 	*/
-	if (c->opcode == CTDB_CONTROL_PERSISTENT_STORE) {
+	switch (c->opcode) {
+	case CTDB_CONTROL_PERSISTENT_STORE:
 		if (client->num_persistent_updates > 0) {
 			client->num_persistent_updates--;
-		}		
-	} else {
+		}
+		break;
+	case CTDB_CONTROL_TRANS2_COMMIT:
 		client->num_persistent_updates++;
+		break;
+	case CTDB_CONTROL_TRANS2_COMMIT_RETRY:
+		/* already updated from the first commit */
+		break;
 	}
 
 	state = talloc_zero(ctdb, struct ctdb_persistent_state);
@@ -141,6 +165,7 @@ int32_t ctdb_control_trans2_commit(struct ctdb_context *ctdb,
 		}
 
 		state->num_pending++;
+		state->num_sent++;
 	}
 
 	if (state->num_pending == 0) {


-- 
CTDB repository


More information about the samba-cvs mailing list