[SCM] CTDB repository - branch master updated - ctdb-2.4-48-g713c9ec

Amitay Isaacs amitay at samba.org
Fri Sep 27 02:45:31 CEST 2013


The branch, master has been updated
       via  713c9ecc791e3319a2d109838471833de5a158c8 (commit)
       via  37e22fc3ac3eb64732f2e67058f5b7b06c093fbf (commit)
       via  482ac708cb79cb6378d814a79c2cf13f88435bc4 (commit)
       via  25e9cf86328252f96215b54b94551dd7bbdd2db4 (commit)
       via  abd51a9f41ebb178c4ea4491bdedf9a9433e7232 (commit)
       via  e4aba8598b00a810e721de64ac44dccc9af04ab6 (commit)
      from  9e18f3c173863919587e25d704f66372624ed8ed (commit)

http://gitweb.samba.org/?p=ctdb.git;a=shortlog;h=master


- Log -----------------------------------------------------------------
commit 713c9ecc791e3319a2d109838471833de5a158c8
Author: Amitay Isaacs <amitay at gmail.com>
Date:   Wed Sep 25 19:10:13 2013 +1000

    tests: Add a simple test to test cluster wide database traverse
    
    Signed-off-by: Amitay Isaacs <amitay at gmail.com>

commit 37e22fc3ac3eb64732f2e67058f5b7b06c093fbf
Author: Amitay Isaacs <amitay at gmail.com>
Date:   Mon Sep 9 12:46:26 2013 +1000

    traverse: Send traverse end record from traverse child process
    
    Traverse records are sent directly from the traverse child process, but
    the last empty record signalling the end of traverse is sent from ctdbd.
    This creates a race condition between ctdbd and the traverse child.
    There are two fds from the traverse child to ctdbd - a pipe to track the
    status of the child process and a unix socket connection for sending
    records.  It's possible that the last few records are still sitting in
    the unix socket buffer when ctdbd reads the status written by the
    traverse child.  ctdbd will interpret this as the end of traverse and
    send the last empty record to the originating node before it has
    processed the pending packets in the unix socket connection.
    
    The race is avoided by sending the last empty record marking end of
    traverse from the child process.
    
    Signed-off-by: Amitay Isaacs <amitay at gmail.com>

commit 482ac708cb79cb6378d814a79c2cf13f88435bc4
Author: Amitay Isaacs <amitay at gmail.com>
Date:   Tue Sep 10 17:52:26 2013 +1000

    traverse: Wait till all data has been flushed from output queue
    
    To improve the traverse performance, records are directly sent from
    traverse child process to the originating node.  Make sure that all the
    data is sent via socket, before informing ctdbd that traverse is complete.
    
    Without waiting for all the packets to be flushed from the queue,
    the child process can incorrectly signal ctdbd that the traverse has
    ended.  The records still pending in the queue will then never make it
    to the originating node, and the traverse information will be incomplete.
    
    Signed-off-by: Amitay Isaacs <amitay at gmail.com>

commit 25e9cf86328252f96215b54b94551dd7bbdd2db4
Author: Amitay Isaacs <amitay at gmail.com>
Date:   Fri Sep 13 13:28:31 2013 +1000

    traverse: Use ctdb local variable for convenience
    
    Signed-off-by: Amitay Isaacs <amitay at gmail.com>

commit abd51a9f41ebb178c4ea4491bdedf9a9433e7232
Author: Amitay Isaacs <amitay at gmail.com>
Date:   Fri Sep 6 18:11:40 2013 +1000

    traverse: Check if local traverse failed or succeeded
    
    Passing the result of tdb_traverse_read() allows ctdbd to determine
    whether the local traverse succeeded.  In case of a problem with the
    local traverse, ctdbd can log an error.
    
    Signed-off-by: Amitay Isaacs <amitay at gmail.com>

commit e4aba8598b00a810e721de64ac44dccc9af04ab6
Author: Amitay Isaacs <amitay at gmail.com>
Date:   Fri Sep 6 14:51:54 2013 +1000

    traverse: Log information when traverse starts and ends
    
    Signed-off-by: Amitay Isaacs <amitay at gmail.com>

-----------------------------------------------------------------------

Summary of changes:
 server/ctdb_traverse.c           |  116 +++++++++++++++++++++++++-------------
 tests/simple/80_ctdb_traverse.sh |   73 ++++++++++++++++++++++++
 2 files changed, 150 insertions(+), 39 deletions(-)
 create mode 100755 tests/simple/80_ctdb_traverse.sh


Changeset truncated at 500 lines:

diff --git a/server/ctdb_traverse.c b/server/ctdb_traverse.c
index 4edae50..99e7e8f 100644
--- a/server/ctdb_traverse.c
+++ b/server/ctdb_traverse.c
@@ -44,6 +44,8 @@ struct ctdb_traverse_local_handle {
 	ctdb_traverse_fn_t callback;
 	bool withemptyrecords;
 	struct tevent_fd *fde;
+	int records_failed;
+	int records_sent;
 };
 
 /*
@@ -56,12 +58,25 @@ static void ctdb_traverse_child_handler(struct tevent_context *ev, struct tevent
 							struct ctdb_traverse_local_handle);
 	ctdb_traverse_fn_t callback = h->callback;
 	void *p = h->private_data;
-	char res;
+	int res;
+	ssize_t n;
+
+	/* Read the number of records sent by traverse child */
+	n = read(h->fd[0], &res, sizeof(res));
+	if (n < 0 || n != sizeof(res)) {
+		/* Traverse child failed */
+		DEBUG(DEBUG_ERR, ("Local traverse failed db:%s reqid:%d\n",
+				  h->ctdb_db->db_name, h->reqid));
+	} else if (res < 0) {
+		/* Traverse failed */
+		res = -res;
+		DEBUG(DEBUG_ERR, ("Local traverse failed db:%s reqid:%d records:%d\n",
+				  h->ctdb_db->db_name, h->reqid, res));
+	} else {
+		DEBUG(DEBUG_INFO, ("Local traverse end db:%s reqid:%d records:%d\n",
+				   h->ctdb_db->db_name, h->reqid, res));
+	}
 
-	/* FIXME: There is no way to distinguish between failed traverse and
-	 * successful traverse.  The only way to signal the end is by sending
-	 * tdb_null for key and data. */
-	read(h->fd[0], &res, 1);
 	callback(p, tdb_null, tdb_null);
 }
 
@@ -106,6 +121,7 @@ static int ctdb_traverse_local_fn(struct tdb_context *tdb, TDB_DATA key, TDB_DAT
 	d = ctdb_marshall_record(h, h->reqid, key, NULL, data);
 	if (d == NULL) {
 		/* error handling is tricky in this child code .... */
+		h->records_failed++;
 		return -1;
 	}
 
@@ -115,9 +131,11 @@ static int ctdb_traverse_local_fn(struct tdb_context *tdb, TDB_DATA key, TDB_DAT
 	res = ctdb_control(h->ctdb_db->ctdb, h->srcnode, 0, CTDB_CONTROL_TRAVERSE_DATA,
 			   CTDB_CTRL_FLAG_NOREPLY, outdata, NULL, NULL, &status, NULL, NULL);
 	if (res != 0 || status != 0) {
+		h->records_failed++;
 		return -1;
 	}
 
+	h->records_sent++;
 	return 0;
 }
 
@@ -177,25 +195,57 @@ static struct ctdb_traverse_local_handle *ctdb_traverse_local(struct ctdb_db_con
 
 	if (h->child == 0) {
 		/* start the traverse in the child */
-		char res = 0;
+		int res, status;
 		pid_t parent = getpid();
+		struct ctdb_context *ctdb = ctdb_db->ctdb;
+		struct ctdb_rec_data *d;
+		TDB_DATA outdata;
 
 		close(h->fd[0]);
 
 		ctdb_set_process_name("ctdb_traverse");
-		if (switch_from_server_to_client(ctdb_db->ctdb,
-						 "traverse_local-%s:",
+		if (switch_from_server_to_client(ctdb, "traverse_local-%s:",
 						 ctdb_db->db_name) != 0) {
 			DEBUG(DEBUG_CRIT, ("Failed to switch traverse child into client mode\n"));
-			res = -1;
+			_exit(0);
+		}
+
+		d = ctdb_marshall_record(h, h->reqid, tdb_null, NULL, tdb_null);
+		if (d == NULL) {
+			res = 0;
+			write(h->fd[1], &res, sizeof(int));
+			_exit(0);
+		}
+
+		res = tdb_traverse_read(ctdb_db->ltdb->tdb, ctdb_traverse_local_fn, h);
+		if (res == -1 || h->records_failed > 0) {
+			/* traverse failed */
+			res = -(h->records_sent);
+		} else {
+			res = h->records_sent;
+		}
+
+		/* Wait till all the data is flushed from output queue */
+		while (ctdb_queue_length(ctdb->daemon.queue) > 0) {
+			tevent_loop_once(ctdb->ev);
 		}
 
-		if (tdb_traverse_read(ctdb_db->ltdb->tdb, ctdb_traverse_local_fn, h) != 0) {
-			res = -1;
+		/* End traverse by sending empty record */
+		outdata.dptr = (uint8_t *)d;
+		outdata.dsize = d->length;
+		ret = ctdb_control(ctdb, h->srcnode, 0,
+				   CTDB_CONTROL_TRAVERSE_DATA,
+				   CTDB_CTRL_FLAG_NOREPLY, outdata,
+				   NULL, NULL, &status, NULL, NULL);
+		if (ret == -1 || status == -1) {
+			if (res > 0) {
+				res = -res;
+			}
 		}
-		write(h->fd[1], &res, 1);
 
-		while (ctdb_kill(ctdb_db->ctdb, parent, 0) == 0 || errno != ESRCH) {
+		write(h->fd[1], &res, sizeof(res));
+
+		while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
 			sleep(5);
 		}
 		_exit(0);
@@ -279,6 +329,7 @@ struct traverse_start_state {
 	uint32_t db_id;
 	uint64_t srvid;
 	bool withemptyrecords;
+	int num_records;
 };
 
 
@@ -381,6 +432,9 @@ static struct ctdb_traverse_all_handle *ctdb_daemon_traverse_all(struct ctdb_db_
 		return NULL;
 	}
 
+	DEBUG(DEBUG_NOTICE,("Starting traverse on DB %s (id %d)\n",
+			    ctdb_db->db_name, state->reqid));
+
 	/* timeout the traverse */
 	event_add_timed(ctdb->ev, state, 
 			timeval_current_ofs(ctdb->tunable.traverse_timeout, 0), 
@@ -390,35 +444,14 @@ static struct ctdb_traverse_all_handle *ctdb_daemon_traverse_all(struct ctdb_db_
 }
 
 /*
-  called for each record during a traverse all 
+  called when local traverse ends
  */
 static void traverse_all_callback(void *p, TDB_DATA key, TDB_DATA data)
 {
 	struct traverse_all_state *state = talloc_get_type(p, struct traverse_all_state);
-	int ret;
-	struct ctdb_rec_data *d;
-	TDB_DATA cdata;
-
-	d = ctdb_marshall_record(state, state->reqid, key, NULL, data);
-	if (d == NULL) {
-		/* darn .... */
-		DEBUG(DEBUG_ERR,("Out of memory in traverse_all_callback\n"));
-		return;
-	}
-
-	cdata.dptr = (uint8_t *)d;
-	cdata.dsize = d->length;
-
-	ret = ctdb_daemon_send_control(state->ctdb, state->srcnode, 0, CTDB_CONTROL_TRAVERSE_DATA,
-				       0, CTDB_CTRL_FLAG_NOREPLY, cdata, NULL, NULL);
-	if (ret != 0) {
-		DEBUG(DEBUG_ERR,("Failed to send traverse data\n"));
-	}
 
-	if (key.dsize == 0 && data.dsize == 0) {
-		/* we're done */
-		talloc_free(state);
-	}
+	/* we're done */
+	talloc_free(state);
 }
 
 /*
@@ -643,6 +676,10 @@ static void traverse_start_callback(void *p, TDB_DATA key, TDB_DATA data)
 
 	ctdb_dispatch_message(state->ctdb, state->srvid, cdata);
 	if (key.dsize == 0 && data.dsize == 0) {
+		DEBUG(DEBUG_NOTICE, ("Ending traverse on DB %s (id %d), records %d\n",
+				     state->h->ctdb_db->db_name, state->h->reqid,
+				     state->num_records));
+
 	    	if (state->h->timedout) {
 		    	/* timed out, send TRAVERSE_KILL control */
 			talloc_free(state);
@@ -651,6 +688,8 @@ static void traverse_start_callback(void *p, TDB_DATA key, TDB_DATA data)
 			talloc_set_destructor(state, NULL);
 			talloc_free(state);
 		}
+	} else {
+		state->num_records++;
 	}
 }
 
@@ -706,8 +745,7 @@ int32_t ctdb_control_traverse_start_ext(struct ctdb_context *ctdb,
 	state->db_id = d->db_id;
 	state->ctdb = ctdb;
 	state->withemptyrecords = d->withemptyrecords;
-
-	DEBUG(DEBUG_NOTICE,("Stating traverse on DB %s\n", ctdb_db->db_name));
+	state->num_records = 0;
 
 	state->h = ctdb_daemon_traverse_all(ctdb_db, traverse_start_callback, state);
 	if (state->h == NULL) {
diff --git a/tests/simple/80_ctdb_traverse.sh b/tests/simple/80_ctdb_traverse.sh
new file mode 100755
index 0000000..65a991a
--- /dev/null
+++ b/tests/simple/80_ctdb_traverse.sh
@@ -0,0 +1,73 @@
+#!/bin/bash
+
+test_info()
+{
+    cat <<EOF
+Test CTDB cluster wide traverse code.
+
+Prerequisites:
+
+* An active CTDB cluster with at least 2 active nodes.
+
+Steps:
+
+1. Create a test database
+2. Add records on different nodes
+3. Run traverse
+
+Expected results:
+
+* All records are retrieved.
+
+EOF
+}
+
+. "${TEST_SCRIPTS_DIR}/integration.bash"
+
+ctdb_test_init "$@"
+
+set -e
+
+cluster_is_healthy
+
+# Reset configuration
+ctdb_restart_when_done
+
+try_command_on_node 0 "$CTDB listnodes"
+num_nodes=$(echo "$out" | wc -l)
+
+num_records=1000
+
+TESTDB="traverse_test.tdb"
+
+echo "create test database $TESTDB"
+try_command_on_node -q 0 $CTDB_TEST_WRAPPER ctdb attach $TESTDB
+
+echo "wipe test database $TESTDB"
+try_command_on_node -q 0 $CTDB_TEST_WRAPPER ctdb wipedb $TESTDB
+
+echo "Add $num_records records to database"
+i=0
+while [ $i -lt $num_records ]; do
+	key=$(printf "key-%04x" $i)
+	value="value-$i"
+
+	n=$[ $i % $num_nodes ]
+	try_command_on_node -q $n $CTDB_TEST_WRAPPER ctdb writekey $TESTDB $key $value
+
+	i=$[ $i + 1 ]
+done
+
+echo "Start a traverse and collect records"
+try_command_on_node -q 0 $CTDB_TEST_WRAPPER ctdb catdb $TESTDB
+
+num_read=$(echo "$out" | tail -n 1 | cut -d\  -f2)
+if [ $num_read -eq $num_records ]; then
+	echo "GOOD: All $num_records records retrieved"
+	status=0
+else
+	echo "BAD: Only $num_read/$num_records records retrieved"
+	status=1
+fi
+
+exit $status


-- 
CTDB repository


More information about the samba-cvs mailing list