[SCM] CTDB repository - branch master updated - ctdb-1.0.90-28-gd1332f4

Ronnie Sahlberg sahlberg at samba.org
Mon Sep 28 21:34:36 MDT 2009


The branch, master has been updated
       via  d1332f4d5d3d3e4b4e0cd362a6903d09e0d5fcbb (commit)
       via  95a3ee551241aa164967991fe5efe078e1714bde (commit)
      from  6e35feb06ec036b9036c5d1cdd94f7cef140d8a6 (commit)

http://gitweb.samba.org/?p=sahlberg/ctdb.git;a=shortlog;h=master


- Log -----------------------------------------------------------------
commit d1332f4d5d3d3e4b4e0cd362a6903d09e0d5fcbb
Author: Ronnie Sahlberg <ronniesahlberg at gmail.com>
Date:   Tue Sep 29 13:31:41 2009 +1000

    New version 1.0.91

commit 95a3ee551241aa164967991fe5efe078e1714bde
Author: Ronnie Sahlberg <ronniesahlberg at gmail.com>
Date:   Tue Sep 29 13:20:18 2009 +1000

    From Wolfgang Mueller-Friedt
    
    Remove the explicit vacuum/repack commands from the 00.ctdb eventscript
    and implement this in the ctdb daemon.
    
    Combine vacuuming and repacking into one cheap read traverse that
    enumerates all candidate records, plus one write traverse that both
    repacks the database and deletes records locally where we are the
    lmaster and the records have already been deleted remotely.
    
    This code also adds initial autotuning heuristics for the vacuum
    intervals and for how many records to delete in each iteration.
    
    Minor stylistic changes made by Ronnie S.

-----------------------------------------------------------------------

Summary of changes:
 config/events.d/00.ctdb |   23 --
 include/ctdb_private.h  |    3 +
 packaging/RPM/ctdb.spec |    9 +-
 server/ctdb_tunables.c  |    5 +-
 server/ctdb_vacuum.c    |  630 +++++++++++++++++++++++++++++++++++++++++++----
 5 files changed, 600 insertions(+), 70 deletions(-)


Changeset truncated at 500 lines:

diff --git a/config/events.d/00.ctdb b/config/events.d/00.ctdb
index 8821995..90fd92f 100755
--- a/config/events.d/00.ctdb
+++ b/config/events.d/00.ctdb
@@ -18,18 +18,6 @@ PATH=/bin:/usr/bin:$PATH
 cmd="$1"
 shift
 
-# set default samba cleanup period - in minutes
-[ -z "$CTDB_VACUUM_PERIOD" ] && {
-    CTDB_VACUUM_PERIOD=5
-}
-
-###########################
-# periodic vacuum function
-periodic_vacuum() {
-    # this cleans up dead records and repacks the databases
-    ( time ctdb vacuum 200000 -T 30 ) > $CTDB_BASE/state/vacuum.log 2>&1 &
-}
-
 case $cmd in 
      startup)
         # make sure we have a blank state directory for the scripts to work with
@@ -47,17 +35,6 @@ case $cmd in
 	;;
 
     monitor)
-	# Create a dummy file to track when we need to do periodic cleanup
-	# of samba databases
-	[ -f $CTDB_BASE/state/periodic_vacuum ] || {
-		touch $CTDB_BASE/state/periodic_vacuum
-	}
-	[ `/usr/bin/find $CTDB_BASE/state/periodic_vacuum -mmin +$CTDB_VACUUM_PERIOD | wc -l` -eq 1 ] && {
-		# vacuum the databases
-		touch $CTDB_BASE/state/periodic_vacuum
-	    	periodic_vacuum
-	}
-
 	# monitor that we are not running out of memory
 	[ -z "$CTDB_MONITOR_FREE_MEMORY" ] || {
 		FREE_MEM=`free -m | grep "buffers/cache" | while read A B C D ;do /bin/echo -n $D ; done`
diff --git a/include/ctdb_private.h b/include/ctdb_private.h
index 3528a5c..a9b4e22 100644
--- a/include/ctdb_private.h
+++ b/include/ctdb_private.h
@@ -111,6 +111,9 @@ struct ctdb_tunable {
 	uint32_t vacuum_default_interval;
 	uint32_t vacuum_max_run_time;
 	uint32_t repack_limit;
+	uint32_t vacuum_limit;
+	uint32_t vacuum_min_interval;
+	uint32_t vacuum_max_interval;
 };
 
 /*
diff --git a/packaging/RPM/ctdb.spec b/packaging/RPM/ctdb.spec
index 0954f74..5ddf836 100644
--- a/packaging/RPM/ctdb.spec
+++ b/packaging/RPM/ctdb.spec
@@ -4,7 +4,7 @@ Summary: Clustered TDB
 Vendor: Samba Team
 Packager: Samba Team <samba at samba.org>
 Name: ctdb
-Version: 1.0.90
+Version: 1.0.91
 Release: 1
 Epoch: 0
 License: GNU GPL version 3
@@ -132,6 +132,13 @@ fi
 %{_libdir}/pkgconfig/ctdb.pc
 
 %changelog
+* Tue Sep 29 2009 : Version 1.0.91
+ - New vacuum and repack design from Wolfgang Mueller.
+ - Add a new eventscript 01.reclock that will first mark a node unhealthy and later ban the node if the reclock file can not be accessed.
+ - Add machinereadable output to the ctdb getreclock command
+ - merge transaction updates from Michael Adam
+ - In the new banning code, reset the culprit count to 0 for all nodes that could successfully complete a full recovery.
+ - Don't mark the recovery master as a ban culprit just because a node in the cluster needs a recovery. This happens naturally when using the ctdb recover command, so don't let it cause a node to be banned.
 * Sat Sep 12 2009 : Version 1.0.90
  - Be more forgiving for eventscripts that hang during startup
  - Fix for a banning bug in the new banning logic
diff --git a/server/ctdb_tunables.c b/server/ctdb_tunables.c
index 85cbc11..a321789 100644
--- a/server/ctdb_tunables.c
+++ b/server/ctdb_tunables.c
@@ -56,9 +56,12 @@ static const struct {
 	{ "RecLockLatencyMs",  1000,  offsetof(struct ctdb_tunable, reclock_latency_ms) },
 	{ "RecoveryDropAllIPs",  60,  offsetof(struct ctdb_tunable, recovery_drop_all_ips) },
 	{ "VerifyRecoveryLock",   1,  offsetof(struct ctdb_tunable, verify_recovery_lock) },
-	{ "VacuumDefaultInterval", 120,  offsetof(struct ctdb_tunable, vacuum_default_interval) },
+	{ "VacuumDefaultInterval", 300,  offsetof(struct ctdb_tunable, vacuum_default_interval) },
 	{ "VacuumMaxRunTime",     30,  offsetof(struct ctdb_tunable, vacuum_max_run_time) },
 	{ "RepackLimit",      10000,  offsetof(struct ctdb_tunable, repack_limit) },
+	{ "VacuumLimit",       5000,  offsetof(struct ctdb_tunable, vacuum_limit) },
+	{ "VacuumMinInterval",   60,  offsetof(struct ctdb_tunable, vacuum_min_interval) },
+	{ "VacuumMaxInterval",  600,  offsetof(struct ctdb_tunable, vacuum_max_interval) }
 };
 
 /*
diff --git a/server/ctdb_vacuum.c b/server/ctdb_vacuum.c
index 4188c85..69991b5 100644
--- a/server/ctdb_vacuum.c
+++ b/server/ctdb_vacuum.c
@@ -28,7 +28,10 @@
 #include "lib/util/dlinklist.h"
 #include "lib/events/events.h"
 #include "../include/ctdb_private.h"
+#include "../common/rb_tree.h"
 
+#define TIMELIMIT() timeval_current_ofs(10, 0)
+#define TUNINGDBNAME "vactune.tdb"
 
 enum vacuum_child_status { VACUUM_RUNNING, VACUUM_OK, VACUUM_ERROR, VACUUM_TIMEOUT};
 
@@ -46,39 +49,376 @@ struct ctdb_vacuum_handle {
 };
 
 
-static void ctdb_vacuum_event(struct event_context *ev, struct timed_event *te, struct timeval t, void *private_data);
-
-struct traverse_state {
-	bool error;
+/*  a list of records to possibly delete */
+struct vacuum_data {
+	uint32_t vacuum_limit;
+	uint32_t repack_limit;
+	struct ctdb_context *ctdb;
+	struct ctdb_db_context *ctdb_db;
 	struct tdb_context *dest_db;
+	trbt_tree_t *delete_tree;
+	uint32_t delete_count;
+	struct ctdb_marshall_buffer **list;
+	struct timeval start;
+	bool traverse_error;
+	bool vacuum;
+	uint32_t total;
+	uint32_t vacuumed;
+	uint32_t copied;
+};
+
+/* tuning information stored for every db */
+struct vacuum_tuning_data {
+	uint32_t last_num_repack;
+	uint32_t last_num_empty;
+	uint32_t last_interval;
+	uint32_t new_interval;
+	struct timeval last_start;
+	double   last_duration;
+};
+
+/* this structure contains the information for one record to be deleted */
+struct delete_record_data {
+	struct ctdb_context *ctdb;
+	struct ctdb_db_context *ctdb_db;
+	struct ctdb_ltdb_header hdr;
+	TDB_DATA key;
 };
 
+struct delete_records_list {
+	struct ctdb_marshall_buffer *records;
+};
+
+static void ctdb_vacuum_event(struct event_context *ev, struct timed_event *te, 
+							  struct timeval t, void *private_data);
+
+
 /*
-  traverse function for repacking
+ * traverse function for gathering the records that can be deleted
  */
-static int repack_traverse(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *private)
+static int vacuum_traverse(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *private)
 {
-	struct traverse_state *state = (struct traverse_state *)private;
-	if (tdb_store(state->dest_db, key, data, TDB_INSERT) != 0) {
-		state->error = true;
+	struct vacuum_data *vdata = talloc_get_type(private, struct vacuum_data);
+	struct ctdb_context *ctdb = vdata->ctdb;
+	struct ctdb_db_context *ctdb_db = vdata->ctdb_db;
+	uint32_t lmaster;
+	struct ctdb_ltdb_header *hdr;
+	struct ctdb_rec_data *rec;
+	size_t old_size;
+	       
+	lmaster = ctdb_lmaster(ctdb, &key);
+	if (lmaster >= ctdb->vnn_map->size) {
+		return 0;
+	}
+
+	if (data.dsize != sizeof(struct ctdb_ltdb_header)) {
+		/* its not a deleted record */
+		return 0;
+	}
+
+	hdr = (struct ctdb_ltdb_header *)data.dptr;
+
+	if (hdr->dmaster != ctdb->pnn) {
+		return 0;
+	}
+
+	/* is this a record we could possibly delete? I.e.
+	   if the record is empty and we are both lmaster
+	   and dmaster for the record, we should be able to delete it
+	*/
+	if (lmaster == ctdb->pnn) {
+		uint32_t hash;
+
+		hash = ctdb_hash(&key);
+		if (trbt_lookup32(vdata->delete_tree, hash)) {
+			DEBUG(DEBUG_INFO, (__location__ " Hash collission when vacuuming, skipping this record.\n"));
+		} 
+		else {
+			struct delete_record_data *dd;
+
+			/* store key and header indexed by the key hash */
+			dd = talloc_zero(vdata->delete_tree, struct delete_record_data);
+			if (dd == NULL) {
+				DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
+				return -1;
+			}
+			dd->ctdb      = ctdb;
+			dd->ctdb_db   = ctdb_db;
+			dd->key.dsize = key.dsize;
+			dd->key.dptr  = talloc_memdup(dd, key.dptr, key.dsize);
+			if (dd->key.dptr == NULL) {
+				DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
+				return -1;
+			}
+
+			dd->hdr = *hdr;
+	
+			trbt_insert32(vdata->delete_tree, hash, dd);
+
+			vdata->delete_count++;
+		}
+	}
+
+	/* add the record to the blob ready to send to the nodes */
+	rec = ctdb_marshall_record(vdata->list[lmaster], ctdb->pnn, key, NULL, tdb_null);
+	if (rec == NULL) {
+		DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
+		vdata->traverse_error = true;
+		return -1;
+	}
+	old_size = talloc_get_size(vdata->list[lmaster]);
+	vdata->list[lmaster] = talloc_realloc_size(NULL, vdata->list[lmaster], 
+						   old_size + rec->length);
+	if (vdata->list[lmaster] == NULL) {
+		DEBUG(DEBUG_ERR,(__location__ " Failed to expand\n"));
+		vdata->traverse_error = true;
 		return -1;
 	}
+	vdata->list[lmaster]->count++;
+	memcpy(old_size+(uint8_t *)vdata->list[lmaster], rec, rec->length);
+	talloc_free(rec);
+
+	vdata->total++;
+
 	return 0;
 }
 
 /*
-  repack a tdb
+ * traverse the tree of records to delete and marshall them into
+ * a blob
  */
-static int ctdb_repack_tdb(struct tdb_context *tdb, TALLOC_CTX *mem_ctx)
+static void delete_traverse(void *param, void *data)
 {
-	struct tdb_context *tmp_db;
-	struct traverse_state *state;
+	struct delete_record_data *dd = talloc_get_type(data, struct delete_record_data);
+	struct delete_records_list *recs = talloc_get_type(param, struct delete_records_list);
+	struct ctdb_rec_data *rec;
+	size_t old_size;
+
+	rec = ctdb_marshall_record(dd, recs->records->db_id, dd->key, &dd->hdr, tdb_null);
+	if (rec == NULL) {
+		DEBUG(DEBUG_ERR, (__location__ " failed to marshall record\n"));
+		return;
+	}
+
+	old_size = talloc_get_size(recs->records);
+	recs->records = talloc_realloc_size(NULL, recs->records, old_size + rec->length);
+	if (recs->records == NULL) {
+		DEBUG(DEBUG_ERR,(__location__ " Failed to expand\n"));
+		return;
+	}
+	recs->records->count++;
+	memcpy(old_size+(uint8_t *)(recs->records), rec, rec->length);
+}
+
+/* 
+ * read-only traverse the database in order to find
+ * records that can be deleted and try to delete these
+ * records on the other nodes
+ * this executes in the child context
+ */
+static int ctdb_vacuum_db(struct ctdb_db_context *ctdb_db, struct vacuum_data *vdata)
+{
+	struct ctdb_context *ctdb = ctdb_db->ctdb;
+	const char *name = ctdb_db->db_name;
+	int ret, i, pnn;
+
+	ret = ctdb_ctrl_getvnnmap(ctdb, TIMELIMIT(), CTDB_CURRENT_NODE, ctdb, &ctdb->vnn_map);
+	if (ret != 0) {
+		DEBUG(DEBUG_ERR, ("Unable to get vnnmap from local node\n"));
+		return ret;
+	}
+
+	pnn = ctdb_ctrl_getpnn(ctdb, TIMELIMIT(), CTDB_CURRENT_NODE);
+	if (pnn == -1) {
+		DEBUG(DEBUG_ERR, ("Unable to get pnn from local node\n"));
+		return -1;
+	}
 
-	state = talloc(mem_ctx, struct traverse_state);
-	if (!state) {
+	ctdb->pnn = pnn;
+	/* the list needs to be of length num_nodes */
+	vdata->list = talloc_array(vdata, struct ctdb_marshall_buffer *, ctdb->vnn_map->size);
+	if (vdata->list == NULL) {
 		DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
 		return -1;
 	}
+	for (i = 0; i < ctdb->vnn_map->size; i++) {
+		vdata->list[i] = (struct ctdb_marshall_buffer *)
+			talloc_zero_size(vdata->list, 
+							 offsetof(struct ctdb_marshall_buffer, data));
+		if (vdata->list[i] == NULL) {
+			DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
+			return -1;
+		}
+		vdata->list[i]->db_id = ctdb_db->db_id;
+	}
+
+	/* read-only traverse, looking for records that might be able to be vacuumed */
+	if (tdb_traverse_read(ctdb_db->ltdb->tdb, vacuum_traverse, vdata) == -1 ||
+	    vdata->traverse_error) {
+		DEBUG(DEBUG_ERR,(__location__ " Traverse error in vacuuming '%s'\n", name));
+		return -1;		
+	}
+
+	for ( i = 0; i < ctdb->vnn_map->size; i++) {
+		if (vdata->list[i]->count == 0) {
+			continue;
+		}
+
+		/* for records where we are not the lmaster, tell the lmaster to fetch the record */
+		if (ctdb->vnn_map->map[i] != ctdb->pnn) {
+			TDB_DATA data;
+			DEBUG(DEBUG_NOTICE,("Found %u records for lmaster %u in '%s'\n", 
+								vdata->list[i]->count, i, name));
+
+			data.dsize = talloc_get_size(vdata->list[i]);
+			data.dptr  = (void *)vdata->list[i];
+			if (ctdb_send_message(ctdb, ctdb->vnn_map->map[i], CTDB_SRVID_VACUUM_FETCH, data) != 0) {
+				DEBUG(DEBUG_ERR,(__location__ " Failed to send vacuum fetch message to %u\n",
+					 ctdb->vnn_map->map[i]));
+				return -1;		
+			}
+			continue;
+		}
+	}	
+
+	/* Process all records we can delete (if any) */
+	if (vdata->delete_count > 0) {
+		struct delete_records_list *recs;
+		TDB_DATA indata, outdata;
+		int32_t res;
+
+		recs = talloc_zero(vdata, struct delete_records_list);
+		if (recs == NULL) {
+			DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
+			return -1;
+		}
+		recs->records = (struct ctdb_marshall_buffer *)
+			talloc_zero_size(vdata, 
+				    offsetof(struct ctdb_marshall_buffer, data));
+		if (recs->records == NULL) {
+			DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
+			return -1;
+		}
+		recs->records->db_id = ctdb_db->db_id;
+
+		/* 
+		 * traverse the tree of all records we want to delete and
+		 * create a blob we can send to the other nodes.
+		 */
+		trbt_traversearray32(vdata->delete_tree, 1, delete_traverse, recs);
+
+		indata.dsize = talloc_get_size(recs->records);
+		indata.dptr  = (void *)recs->records;
+
+		/* 
+		 * now tell all the other nodes to delete all these records
+		 * (if possible)
+		 */
+		for (i = 0; i < ctdb->vnn_map->size; i++) {
+			struct ctdb_marshall_buffer *records;
+			struct ctdb_rec_data *rec;
+
+			if (ctdb->vnn_map->map[i] == ctdb->pnn) {
+				/* we dont delete the records on the local node just yet */
+				continue;
+			}
+
+			ret = ctdb_control(ctdb, ctdb->vnn_map->map[i], 0,
+					CTDB_CONTROL_TRY_DELETE_RECORDS, 0,
+					indata, recs, &outdata, &res,
+					NULL, NULL);
+			if (ret != 0 || res != 0) {
+				DEBUG(DEBUG_ERR,("Failed to delete records on node %u\n", ctdb->vnn_map->map[i]));
+				return -1;
+			}
+
+			/* 
+			 * outdata contains the list of records coming back
+			 * from the node which the node could not delete
+			 */
+			records = (struct ctdb_marshall_buffer *)outdata.dptr;
+			rec = (struct ctdb_rec_data *)&records->data[0];
+			while (records->count-- > 1) {
+				TDB_DATA reckey, recdata;
+				struct ctdb_ltdb_header *rechdr;
+
+				reckey.dptr = &rec->data[0];
+				reckey.dsize = rec->keylen;
+				recdata.dptr = &rec->data[reckey.dsize];
+				recdata.dsize = rec->datalen;
+
+				if (recdata.dsize < sizeof(struct ctdb_ltdb_header)) {
+					DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
+					return -1;
+				}
+				rechdr = (struct ctdb_ltdb_header *)recdata.dptr;
+				recdata.dptr += sizeof(*rechdr);
+				recdata.dsize -= sizeof(*rechdr);
+
+				/* 
+				 * that other node couldn't delete the record
+				 * so we should delete it and thereby remove it from the tree
+				 */
+				talloc_free(trbt_lookup32(vdata->delete_tree, ctdb_hash(&reckey)));
+
+				rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec);
+			}	    
+		}
+
+		/* 
+		 * The only records remaining in the tree would be those
+		 * records where all other nodes could successfully
+		 * delete them, so we can safely delete them on the
+		 * lmaster as well. Deletion implicitly happens while
+		 * we repack the database. The repack algorithm revisits 
+		 * the tree in order to find the records that don't need
+		 * to be copied / repacked.
+		 */
+	}
+
+	/* this ensures we run our event queue */
+	ctdb_ctrl_getpnn(ctdb, TIMELIMIT(), CTDB_CURRENT_NODE);
+
+	return 0;
+}
+
+
+/*
+ * traverse function for repacking
+ */
+static int repack_traverse(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *private)
+{
+	struct vacuum_data *vdata = (struct vacuum_data *)private;
+
+	if (vdata->vacuum) {
+		uint32_t hash = ctdb_hash(&key);
+		struct delete_record_data *kd;
+		/*
+		 * check if we can ignore this record because it's in the delete_tree
+		 */
+		kd = (struct delete_record_data *)trbt_lookup32(vdata->delete_tree, hash);
+		/*
+		 * there might be hash collisions so we have to compare the keys here to be sure
+		 */
+		if (kd && kd->key.dsize == key.dsize && memcmp(kd->key.dptr, key.dptr, key.dsize) == 0) {
+			vdata->vacuumed++;
+			return 0;
+		}
+	}
+	if (tdb_store(vdata->dest_db, key, data, TDB_INSERT) != 0) {
+		vdata->traverse_error = true;
+		return -1;
+	}
+	vdata->copied++;
+	return 0;
+}
+
+/*
+ * repack a tdb


-- 
CTDB repository


More information about the samba-cvs mailing list