[SCM] CTDB repository - branch 1.2.40 updated - ctdb-1.9.1-549-g545c343

Ronnie Sahlberg sahlberg at samba.org
Thu Mar 1 20:46:09 MST 2012


The branch, 1.2.40 has been updated
       via  545c343b19258fce01562b15f274eaf1a1deafc8 (commit)
       via  9bde066f6eb46124168e5686fc41a323e67401e8 (commit)
      from  c51154b79be94198324c321aaaa037045bb85cd9 (commit)

http://gitweb.samba.org/?p=ctdb.git;a=shortlog;h=1.2.40


- Log -----------------------------------------------------------------
commit 545c343b19258fce01562b15f274eaf1a1deafc8
Author: Ronnie Sahlberg <ronniesahlberg at gmail.com>
Date:   Wed Feb 8 13:42:30 2012 +1100

    STATISTICS: add total counts for number of delegations and number of revokes
    
    Every time we give a delegation to another node we count this as one delegation.
    If the same record is delegated to several nodes we count one delegation for each node.

    Every time a record has all of its delegations revoked we count this as one revoke.
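
As a rough illustration of the counting semantics described in this message, here is a minimal standalone C sketch. The struct and main() below are illustrative only; the real counters are the two new fields in struct ctdb_statistics and are bumped with CTDB_INCREMENT_STAT, as shown in the diff further down.

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative stand-in for the two new counters; only the field
     * names mirror the patch, the struct itself does not. */
    struct ro_counters {
        uint32_t total_ro_delegations; /* one per node a record is delegated to */
        uint32_t total_ro_revokes;     /* one per record whose delegations are all revoked */
    };

    int main(void)
    {
        struct ro_counters c = { 0, 0 };
        int node;

        /* delegating the same record to three nodes counts three delegations */
        for (node = 0; node < 3; node++) {
            c.total_ro_delegations++;
        }

        /* revoking all of those delegations counts exactly one revoke */
        c.total_ro_revokes++;

        printf("delegations=%u revokes=%u\n",
               c.total_ro_delegations, c.total_ro_revokes);
        return 0;
    }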

commit 9bde066f6eb46124168e5686fc41a323e67401e8
Author: Ronnie Sahlberg <ronniesahlberg at gmail.com>
Date:   Fri Mar 2 14:12:37 2012 +1100

    READONLY: readonly fetch collapse. Make sure we keep only a single readonly fetch for a record in flight at a time.
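
The core of the collapse is: the first readonly fetch for a record is sent as normal, while duplicates that arrive before it completes are queued and replayed in arrival order once it finishes (or a timeout tears the queue down). Below is a minimal single-record sketch of that pattern with illustrative types and names; the real code in the diff keys a per-database rb-tree and replays the deferred calls through the event loop.

    #include <stddef.h>
    #include <stdlib.h>

    /* one deferred duplicate request, replayed once the first fetch finishes */
    struct pending_fetch {
        struct pending_fetch *next;
        void (*replay)(void *ctx);
        void *ctx;
    };

    /* per-record state: is a readonly fetch in flight, and who is waiting */
    struct fetch_slot {
        int in_flight;
        struct pending_fetch *deferred;
    };

    /* returns 1 if the request was deferred, 0 if the caller should send it */
    static int collapse_or_send(struct fetch_slot *slot, struct pending_fetch *pf)
    {
        struct pending_fetch **tail;

        if (!slot->in_flight) {
            slot->in_flight = 1;
            return 0;            /* first fetch for this record: send it */
        }

        /* duplicate: append so deferred calls replay in arrival order */
        for (tail = &slot->deferred; *tail != NULL; tail = &(*tail)->next)
            ;
        pf->next = NULL;
        *tail = pf;
        return 1;
    }

    /* when the in-flight fetch completes (or times out), replay the queue;
     * assumes each entry was malloc()ed by the caller */
    static void fetch_completed(struct fetch_slot *slot)
    {
        while (slot->deferred != NULL) {
            struct pending_fetch *pf = slot->deferred;
            slot->deferred = pf->next;
            pf->replay(pf->ctx);
            free(pf);
        }
        slot->in_flight = 0;
    }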

-----------------------------------------------------------------------

Summary of changes:
 include/ctdb_private.h    |    4 +
 include/ctdb_protocol.h   |    2 +
 server/ctdb_call.c        |    2 +
 server/ctdb_daemon.c      |  209 +++++++++++++++++++++++++++++++++++++++++++++
 server/ctdb_ltdb_server.c |   10 ++
 tools/ctdb.c              |    2 +
 6 files changed, 229 insertions(+), 0 deletions(-)


Changeset truncated at 500 lines:

diff --git a/include/ctdb_private.h b/include/ctdb_private.h
index 8180722..86b664b 100644
--- a/include/ctdb_private.h
+++ b/include/ctdb_private.h
@@ -533,6 +533,10 @@ struct ctdb_db_context {
 				  struct ctdb_ltdb_header *header,
 				  TDB_DATA data);
 
+	/* used to track which records we are currently fetching with readonly
+	   requests so we can avoid sending duplicates
+	*/
+	struct trbt_tree *deferred_ro_fetch;
 };
 
 
diff --git a/include/ctdb_protocol.h b/include/ctdb_protocol.h
index efcc2cf..c874148 100644
--- a/include/ctdb_protocol.h
+++ b/include/ctdb_protocol.h
@@ -634,6 +634,8 @@ struct ctdb_statistics {
 	uint32_t num_recoveries;
 	struct timeval statistics_start_time;
 	struct timeval statistics_current_time;
+	uint32_t total_ro_delegations;
+	uint32_t total_ro_revokes;
 };
 
 /*
diff --git a/server/ctdb_call.c b/server/ctdb_call.c
index 2657f8b..1ece85a 100644
--- a/server/ctdb_call.c
+++ b/server/ctdb_call.c
@@ -513,6 +513,7 @@ void ctdb_request_call(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
 
 	if (header.flags & CTDB_REC_RO_REVOKE_COMPLETE) {
 		header.flags &= ~(CTDB_REC_RO_HAVE_DELEGATIONS|CTDB_REC_RO_HAVE_READONLY|CTDB_REC_RO_REVOKING_READONLY|CTDB_REC_RO_REVOKE_COMPLETE);
+		CTDB_INCREMENT_STAT(ctdb, total_ro_revokes);
 		if (ctdb_ltdb_store(ctdb_db, call->key, &header, data) != 0) {
 			ctdb_fatal(ctdb, "Failed to write header with cleared REVOKE flag");
 		}
@@ -619,6 +620,7 @@ void ctdb_request_call(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
 		}
 
 		ctdb_queue_packet(ctdb, &r->hdr);
+		CTDB_INCREMENT_STAT(ctdb, total_ro_delegations);
 
 		talloc_free(r);
 		return;
diff --git a/server/ctdb_daemon.c b/server/ctdb_daemon.c
index c4f46b1..b09aaf3 100644
--- a/server/ctdb_daemon.c
+++ b/server/ctdb_daemon.c
@@ -27,6 +27,7 @@
 #include "system/wait.h"
 #include "../include/ctdb_client.h"
 #include "../include/ctdb_private.h"
+#include "../common/rb_tree.h"
 #include <sys/socket.h>
 
 struct ctdb_client_pid_list {
@@ -384,6 +385,192 @@ static void daemon_incoming_packet_wrap(void *p, struct ctdb_req_header *hdr)
 }
 
 
+struct ctdb_deferred_fetch_call {
+	struct ctdb_deferred_fetch_call *next, *prev;
+	struct ctdb_req_call *c;
+	struct ctdb_daemon_packet_wrap *w;
+};
+
+struct ctdb_deferred_fetch_queue {
+	struct ctdb_deferred_fetch_call *deferred_calls;
+};
+
+struct ctdb_deferred_requeue {
+	struct ctdb_deferred_fetch_call *dfc;
+	struct ctdb_client *client;
+};
+
+
+/* called from a timer event; starts reprocessing the deferred call. */
+static void reprocess_deferred_call(struct event_context *ev, struct timed_event *te, 
+				       struct timeval t, void *private_data)
+{
+	struct ctdb_deferred_requeue *dfr = (struct ctdb_deferred_requeue *)private_data;
+	struct ctdb_client *client = dfr->client;
+
+	talloc_steal(client, dfr->dfc->c);
+	daemon_incoming_packet(client, (struct ctdb_req_header *)dfr->dfc->c);
+	talloc_free(dfr);
+}
+
+/* the deferral context is destroyed either after a timeout or when the
+   initial fetch-lock has finished.
+   At that stage, immediately start reprocessing the queued-up deferred
+   calls so they are handled right away (and since we are dmaster at
+   this stage, this triggers the waiting smbd processes to pick up and
+   acquire the record immediately).
+*/
+static int deferred_fetch_queue_destructor(struct ctdb_deferred_fetch_queue *dfq)
+{
+
+	/* we need to reprocess the packets from the queue explicitly instead
+	   of just relying on a normal destructor, since we must call the
+	   clients in the same order as the requests were queued
+	*/
+	while (dfq->deferred_calls != NULL) {
+		struct ctdb_client *client;
+		struct ctdb_deferred_fetch_call *dfc = dfq->deferred_calls;
+		struct ctdb_deferred_requeue *dfr;
+
+		DLIST_REMOVE(dfq->deferred_calls, dfc);
+
+		client = ctdb_reqid_find(dfc->w->ctdb, dfc->w->client_id, struct ctdb_client);
+		if (client == NULL) {
+			DEBUG(DEBUG_ERR,(__location__ " Packet for disconnected client %u\n",
+				 dfc->w->client_id));
+			continue;
+		}
+
+		/* process it by pushing it back onto the eventloop */
+		dfr = talloc(client, struct ctdb_deferred_requeue);
+		if (dfr == NULL) {
+			DEBUG(DEBUG_ERR,("Failed to allocate deferred fetch requeue structure\n"));
+			continue;
+		}
+
+		dfr->dfc    = talloc_steal(dfr, dfc);
+		dfr->client = client;
+
+		event_add_timed(dfc->w->ctdb->ev, client, timeval_zero(), reprocess_deferred_call, dfr);
+	}
+
+	return 0;
+}
+
+/* insert the new deferral context into the rb tree.
+   There should never be a pre-existing context here, but check for it;
+   warn about and destroy the previous context if there is already a
+   deferral context for this key.
+*/
+static void *insert_dfq_callback(void *parm, void *data)
+{
+        if (data) {
+		DEBUG(DEBUG_ERR,("Already have DFQ registered. Free old %p and create new %p\n", data, parm));
+                talloc_free(data);
+        }
+        return parm;
+}
+
+/* if the original fetch-lock did not complete within a reasonable time,
+   free the deferral context; its destructor causes all deferred requests
+   to be re-inserted into the event system.
+*/
+static void dfq_timeout(struct event_context *ev, struct timed_event *te, 
+				  struct timeval t, void *private_data)
+{
+	talloc_free(private_data);
+}
+
+/* This function is used in the local daemon to register a KEY in a database
+   as being "fetched".
+   While the remote fetch is in flight, any further attempts to re-fetch the
+   same record will be deferred until the fetch completes.
+*/
+static int setup_deferred_fetch_locks(struct ctdb_db_context *ctdb_db, struct trbt_tree *tree, struct ctdb_call *call)
+{
+	uint32_t *k;
+	struct ctdb_deferred_fetch_queue *dfq;
+
+	k = talloc_zero_size(call, ((call->key.dsize + 3) & 0xfffffffc) + 4);
+	if (k == NULL) {
+		DEBUG(DEBUG_ERR,("Failed to allocate key for deferred fetch\n"));
+		return -1;
+	}
+
+	k[0] = (call->key.dsize + 3) / 4 + 1;
+	memcpy(&k[1], call->key.dptr, call->key.dsize);
+
+	dfq  = talloc(call, struct ctdb_deferred_fetch_queue);
+	if (dfq == NULL) {
+		DEBUG(DEBUG_ERR,("Failed to allocate deferred fetch queue structure\n"));
+		talloc_free(k);
+		return -1;
+	}
+	dfq->deferred_calls = NULL;
+
+	trbt_insertarray32_callback(tree, k[0], &k[0], insert_dfq_callback, dfq);
+
+	talloc_set_destructor(dfq, deferred_fetch_queue_destructor);
+
+	/* if the fetch hasn't completed in 30 seconds, just tear it all down
+	   and let it try again as the events are reissued */
+	event_add_timed(ctdb_db->ctdb->ev, dfq, timeval_current_ofs(30, 0), dfq_timeout, dfq);
+
+	talloc_free(k);
+	return 0;
+}
+
+/* check if this is a duplicate request for a fetch already in flight;
+   if it is, defer this call so it is reprocessed later, when
+   the in-flight fetch completes.
+*/
+static int requeue_duplicate_fetch(struct ctdb_db_context *ctdb_db, struct trbt_tree *tree, struct ctdb_client *client, TDB_DATA key, struct ctdb_req_call *c)
+{
+	uint32_t *k;
+	struct ctdb_deferred_fetch_queue *dfq;
+	struct ctdb_deferred_fetch_call *dfc;
+
+	k = talloc_zero_size(c, ((key.dsize + 3) & 0xfffffffc) + 4);
+	if (k == NULL) {
+		DEBUG(DEBUG_ERR,("Failed to allocate key for deferred fetch\n"));
+		return -1;
+	}
+
+	k[0] = (key.dsize + 3) / 4 + 1;
+	memcpy(&k[1], key.dptr, key.dsize);
+
+	dfq = trbt_lookuparray32(tree, k[0], &k[0]);
+	if (dfq == NULL) {
+		talloc_free(k);
+		return -1;
+	}
+
+
+	talloc_free(k);
+
+	dfc = talloc(dfq, struct ctdb_deferred_fetch_call);
+	if (dfc == NULL) {
+		DEBUG(DEBUG_ERR, ("Failed to allocate deferred fetch call structure\n"));
+		return -1;
+	}
+
+	dfc->w = talloc(dfc, struct ctdb_daemon_packet_wrap);
+	if (dfc->w == NULL) {
+		DEBUG(DEBUG_ERR,("Failed to allocate deferred fetch daemon packet wrap structure\n"));
+		talloc_free(dfc);
+		return -1;
+	}
+
+	dfc->c = talloc_steal(dfc, c);
+	dfc->w->ctdb = ctdb_db->ctdb;
+	dfc->w->client_id = client->client_id;
+
+	DLIST_ADD_END(dfq->deferred_calls, dfc, NULL);
+
+	return 0;
+}
+
+
 /*
   this is called when the ctdb daemon received a ctdb request call
   from a local client over the unix domain socket
@@ -453,8 +640,23 @@ static void daemon_request_call_from_client(struct ctdb_client *client,
 		c->flags &= ~CTDB_WANT_READONLY;
 	}
 
+	if (c->flags & CTDB_WANT_READONLY) {
+		/* check if this fetch-lock request is a duplicate of a
+		   request we already have in flight. If so, defer it until
+		   the first request completes.
+		 */
+		if (requeue_duplicate_fetch(ctdb_db, ctdb_db->deferred_ro_fetch, client, key, c) == 0) {
+			ret = ctdb_ltdb_unlock(ctdb_db, key);
+			if (ret != 0) {
+				DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
+			}
+			return;
+		}
+	}
+
 	if (header.flags & CTDB_REC_RO_REVOKE_COMPLETE) {
 		header.flags &= ~(CTDB_REC_RO_HAVE_DELEGATIONS|CTDB_REC_RO_HAVE_READONLY|CTDB_REC_RO_REVOKING_READONLY|CTDB_REC_RO_REVOKE_COMPLETE);
+		CTDB_INCREMENT_STAT(ctdb, total_ro_revokes);
 		if (ctdb_ltdb_store(ctdb_db, key, &header, data) != 0) {
 			ctdb_fatal(ctdb, "Failed to write header with cleared REVOKE flag");
 		}
@@ -549,6 +751,13 @@ static void daemon_request_call_from_client(struct ctdb_client *client,
 		state = ctdb_call_local_send(ctdb_db, call, &header, &data);
 	} else {
 		state = ctdb_daemon_call_send_remote(ctdb_db, call, &header);
+		if (call->flags & CTDB_WANT_READONLY) {
+			/* This request triggered a remote ro fetch.
+			   Set up a deferral for this key so any additional
+			   ro fetches are deferred until this one finishes.
+			 */
+			setup_deferred_fetch_locks(ctdb_db, ctdb_db->deferred_ro_fetch, call);
+		}
 	}
 
 	ret = ctdb_ltdb_unlock(ctdb_db, key);
diff --git a/server/ctdb_ltdb_server.c b/server/ctdb_ltdb_server.c
index 27d773b..d600278 100644
--- a/server/ctdb_ltdb_server.c
+++ b/server/ctdb_ltdb_server.c
@@ -964,6 +964,16 @@ again:
 		}
 	}
 
+	/* set up an rb tree we can use to track which records we have a
+	   fetch-lock in flight for, so we can defer any additional calls
+	   for the same record.
+	 */
+	ctdb_db->deferred_ro_fetch = trbt_create(ctdb_db, 0);
+	if (ctdb_db->deferred_ro_fetch == NULL) {
+		DEBUG(DEBUG_ERR,("Failed to create deferred ro fetch rb tree for ctdb database\n"));
+		talloc_free(ctdb_db);
+		return -1;
+	}
 
 	DLIST_ADD(ctdb->db_list, ctdb_db);
 
diff --git a/tools/ctdb.c b/tools/ctdb.c
index 92ef63d..2c7fdc3 100644
--- a/tools/ctdb.c
+++ b/tools/ctdb.c
@@ -201,6 +201,8 @@ static void show_statistics(struct ctdb_statistics *s, int show_header)
 		STATISTICS_FIELD(pending_childwrite_calls),
 		STATISTICS_FIELD(memory_used),
 		STATISTICS_FIELD(max_hop_count),
+		STATISTICS_FIELD(total_ro_delegations),
+		STATISTICS_FIELD(total_ro_revokes),
 	};
 	tmp = s->statistics_current_time.tv_sec - s->statistics_start_time.tv_sec;
 	seconds = tmp%60;
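
Both setup_deferred_fetch_locks() and requeue_duplicate_fetch() in the diff above pack the record key into a zero-padded, length-prefixed array of 32-bit words before touching the rb-tree. Here is a standalone sketch of that sizing arithmetic; pack_key32 is an illustrative helper, not part of the patch, and the real code allocates with talloc and frees the buffer after the tree operation.

    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>

    /* Pack an arbitrary-length key into a uint32_t array whose first element
     * holds the total word count (including itself), mirroring the
     * ((dsize + 3) & 0xfffffffc) + 4 and (dsize + 3) / 4 + 1 expressions
     * in the patch. Returns NULL on allocation failure; caller frees. */
    static uint32_t *pack_key32(const uint8_t *dptr, size_t dsize)
    {
        size_t bytes = ((dsize + 3) & ~(size_t)3) + sizeof(uint32_t);
        uint32_t *k = calloc(1, bytes);

        if (k == NULL) {
            return NULL;
        }
        k[0] = (uint32_t)((dsize + 3) / 4 + 1); /* data words plus the length word */
        memcpy(&k[1], dptr, dsize);             /* key bytes, zero padded to a word */
        return k;
    }

The same packed form is used both when registering the in-flight fetch (trbt_insertarray32_callback) and when checking for a duplicate (trbt_lookuparray32), so a later request for the same key finds the queue created by the first one.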


-- 
CTDB repository

