[SCM] CTDB repository - branch master updated - ctdb-1.0.65-20-g32d460b

Fri Dec 5 05:16:39 GMT 2008

The branch, master has been updated
       via  32d460b8469eb53145f04161a5d01166f9b5f09e (commit)
      from  5d3018c37179966f75183d9a98790eaaaf1d2cfc (commit)

http://gitweb.samba.org/?p=sahlberg/ctdb.git;a=shortlog;h=master


- Log -----------------------------------------------------------------
commit 32d460b8469eb53145f04161a5d01166f9b5f09e
Author: root <root at test1n1.VSOFS1.COM>
Date:   Fri Dec 5 16:32:30 2008 +1100

    Redo and update how we synchronize flags across the cluster.
    This simplifies the code and should close a race condition between the local recovery daemon and a remote node when flags are changing.

-----------------------------------------------------------------------

Summary of changes:
 server/ctdb_recoverd.c |  168 ++++++++++++++++++++++++++++++------------------
 tcp/tcp_connect.c      |    1 -
 tools/ctdb.c           |   45 +++++++++++++
 3 files changed, 151 insertions(+), 63 deletions(-)


Changeset truncated at 500 lines:

diff --git a/server/ctdb_recoverd.c b/server/ctdb_recoverd.c
index 4faa2f8..468977c 100644
--- a/server/ctdb_recoverd.c
+++ b/server/ctdb_recoverd.c
@@ -639,27 +639,12 @@ static int pull_remote_database(struct ctdb_context *ctdb, struct ctdb_node_map
 /*
   update flags on all active nodes
  */
-static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
-{
-	int i;
-	for (i=0;i<nodemap->num;i++) {
-		int ret;
-
-		ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[i].pnn, nodemap->nodes[i].flags, ~nodemap->nodes[i].flags);
-		if (ret != 0) {
-			DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
-			return -1;
-		}
-	}
-	return 0;
-}
-
-static int update_our_flags_on_all_nodes(struct ctdb_context *ctdb, uint32_t pnn, struct ctdb_node_map *nodemap)
+static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn, uint32_t flags)
 {
 	int ret;
 
-	ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[pnn].pnn, nodemap->nodes[pnn].flags, ~nodemap->nodes[pnn].flags);
-	if (ret != 0) {
+	ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
+		if (ret != 0) {
 		DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
 		return -1;
 	}
@@ -1513,12 +1498,18 @@ static int do_recovery(struct ctdb_recoverd *rec,
 	/*
 	  update all nodes to have the same flags that we have
 	 */
-	ret = update_flags_on_all_nodes(ctdb, nodemap);
-	if (ret != 0) {
-		DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes\n"));
-		return -1;
+	for (i=0;i<nodemap->num;i++) {
+		if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
+			continue;
+		}
+
+		ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
+		if (ret != 0) {
+			DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
+			return -1;
+		}
 	}
-	
+
 	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
 
 	/* disable recovery mode */
@@ -2271,6 +2262,51 @@ static int verify_ip_allocation(struct ctdb_context *ctdb, uint32_t pnn)
 	return 0;
 }
 
+
+static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
+{
+	struct ctdb_node_map **remote_nodemaps = callback_data;
+
+	if (node_pnn >= ctdb->num_nodes) {
+		DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
+		return;
+	}
+
+	remote_nodemaps[node_pnn] = (struct ctdb_node_map *)talloc_steal(remote_nodemaps, outdata.dptr);
+
+}
+
+static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
+	struct ctdb_node_map *nodemap,
+	struct ctdb_node_map ***remote_nodemaps)
+{
+	uint32_t *nodes;
+	int i;
+
+	*remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map *, nodemap->num);
+	if (*remote_nodemaps == NULL) {
+		DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
+		return -1;
+	}
+	for(i=0; i<nodemap->num; i++) {
+		(*remote_nodemaps)[i] = NULL;
+	}
+
+	nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
+	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
+					nodes,
+					CONTROL_TIMEOUT(), false, tdb_null,
+					async_getnodemap_callback,
+					NULL,
+					*remote_nodemaps) != 0) {
+		DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
+
+		return -1;
+	}
+
+	return 0;
+}
+
 /*
   the main monitoring loop
  */
@@ -2279,7 +2315,8 @@ static void monitor_cluster(struct ctdb_context *ctdb)
 	uint32_t pnn;
 	TALLOC_CTX *mem_ctx=NULL;
 	struct ctdb_node_map *nodemap=NULL;
-	struct ctdb_node_map *remote_nodemap=NULL;
+	struct ctdb_node_map *recmaster_nodemap=NULL;
+	struct ctdb_node_map **remote_nodemaps=NULL;
 	struct ctdb_vnn_map *vnnmap=NULL;
 	struct ctdb_vnn_map *remote_vnnmap=NULL;
 	int32_t debug_level;
@@ -2484,7 +2521,7 @@ again:
 
 	/* grap the nodemap from the recovery master to check if it is banned */
 	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, 
-				   mem_ctx, &remote_nodemap);
+				   mem_ctx, &recmaster_nodemap);
 	if (ret != 0) {
 		DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from recovery master %u\n", 
 			  nodemap->nodes[j].pnn));
@@ -2492,21 +2529,13 @@ again:
 	}
 
 
-	if (remote_nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
+	if (recmaster_nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
 		DEBUG(DEBUG_NOTICE, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].pnn));
 		force_election(rec, pnn, nodemap);
 		goto again;
 	}
 
 
-	/* verify that we and the recmaster agrees on our flags */
-	if (nodemap->nodes[pnn].flags != remote_nodemap->nodes[pnn].flags) {
-		DEBUG(DEBUG_ERR, (__location__ " Recmaster disagrees on our flags flags:0x%x recmaster_flags:0x%x  Broadcasting out flags.\n", nodemap->nodes[pnn].flags, remote_nodemap->nodes[pnn].flags));
-
-		update_our_flags_on_all_nodes(ctdb, pnn, nodemap);
-	}
-
-
 	/* verify that we have all ip addresses we should have and we dont
 	 * have addresses we shouldnt have.
 	 */ 
@@ -2619,31 +2648,27 @@ again:
 		goto again;
 	}
 
-	/* get the nodemap for all active remote nodes and verify
-	   they are the same as for this node
+
+	/* get the nodemap for all active remote nodes
 	 */
+	if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, &remote_nodemaps) != 0) {
+		DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
+		goto again;
+	} 
+
+	/* verify that all other nodes have the same nodemap as we have
+	*/
 	for (j=0; j<nodemap->num; j++) {
-		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
-			continue;
-		}
-		if (nodemap->nodes[j].pnn == pnn) {
+		if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
 			continue;
 		}
 
-		ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, 
-					   mem_ctx, &remote_nodemap);
-		if (ret != 0) {
-			DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n", 
-				  nodemap->nodes[j].pnn));
-			goto again;
-		}
-
-		/* if the nodes disagree on how many nodes there are
+ 		/* if the nodes disagree on how many nodes there are
 		   then this is a good reason to try recovery
 		 */
-		if (remote_nodemap->num != nodemap->num) {
+		if (remote_nodemaps[j]->num != nodemap->num) {
 			DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
-				  nodemap->nodes[j].pnn, remote_nodemap->num, nodemap->num));
+				  nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
 			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap, nodemap->nodes[j].pnn);
 			goto again;
 		}
@@ -2652,25 +2677,44 @@ again:
 		   active, then that is also a good reason to do recovery
 		 */
 		for (i=0;i<nodemap->num;i++) {
-			if (remote_nodemap->nodes[i].pnn != nodemap->nodes[i].pnn) {
+			if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
 				DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n", 
 					  nodemap->nodes[j].pnn, i, 
-					  remote_nodemap->nodes[i].pnn, nodemap->nodes[i].pnn));
-				do_recovery(rec, mem_ctx, pnn, nodemap, 
-					    vnnmap, nodemap->nodes[j].pnn);
-				goto again;
-			}
-			if ((remote_nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) != 
-			    (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
-				DEBUG(DEBUG_WARNING, (__location__ " Remote node:%u has different nodemap flag for %d (0x%x vs 0x%x)\n", 
-					  nodemap->nodes[j].pnn, i,
-					  remote_nodemap->nodes[i].flags, nodemap->nodes[i].flags));
+					  remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
 				do_recovery(rec, mem_ctx, pnn, nodemap, 
 					    vnnmap, nodemap->nodes[j].pnn);
 				goto again;
 			}
 		}
 
+		/* verify the flags are consistent
+		*/
+		for (i=0; i<nodemap->num; i++) {
+			if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
+				continue;
+			}
+			
+			if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
+				DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n", 
+				  nodemap->nodes[j].pnn, 
+				  nodemap->nodes[i].pnn, 
+				  remote_nodemaps[j]->nodes[i].flags,
+				  nodemap->nodes[j].flags));
+				if (i == j) {
+					DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
+					update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
+					do_recovery(rec, mem_ctx, pnn, nodemap, 
+						    vnnmap, nodemap->nodes[j].pnn);
+					goto again;
+				} else {
+					DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
+					update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
+					do_recovery(rec, mem_ctx, pnn, nodemap, 
+						    vnnmap, nodemap->nodes[j].pnn);
+					goto again;
+				}
+			}
+		}
 	}
 
 
diff --git a/tcp/tcp_connect.c b/tcp/tcp_connect.c
index cd0693c..6aa377b 100644
--- a/tcp/tcp_connect.c
+++ b/tcp/tcp_connect.c
@@ -153,7 +153,6 @@ void ctdb_tcp_node_connect(struct event_context *ev, struct timed_event *te,
 		return;
 	}
 
-	DEBUG(DEBUG_ERR,("create socket...\n"));
 	tnode->fd = socket(sock_out.sa.sa_family, SOCK_STREAM, IPPROTO_TCP);
 	set_nonblocking(tnode->fd);
 	set_close_on_exec(tnode->fd);
diff --git a/tools/ctdb.c b/tools/ctdb.c
index d6240ea..034a02a 100644
--- a/tools/ctdb.c
+++ b/tools/ctdb.c
@@ -2280,6 +2280,50 @@ static int control_restoredb(struct ctdb_context *ctdb, int argc, const char **a
 }
 
 /*
+ * set flags of a node in the nodemap
+ */
+static int control_setflags(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+	int ret;
+	int32_t status;
+	int node;
+	int flags;
+	TDB_DATA data;
+	struct ctdb_node_flag_change c;
+
+	if (argc != 2) {
+		usage();
+		return -1;
+	}
+
+	if (sscanf(argv[0], "%d", &node) != 1) {
+		DEBUG(DEBUG_ERR, ("Badly formed node\n"));
+		usage();
+		return -1;
+	}
+	if (sscanf(argv[1], "0x%x", &flags) != 1) {
+		DEBUG(DEBUG_ERR, ("Badly formed flags\n"));
+		usage();
+		return -1;
+	}
+
+	c.pnn       = node;
+	c.old_flags = 0;
+	c.new_flags = flags;
+
+	data.dsize = sizeof(c);
+	data.dptr = (unsigned char *)&c;
+
+	ret = ctdb_control(ctdb, options.pnn, 0, CTDB_CONTROL_MODIFY_FLAGS, 0, 
+			   data, NULL, NULL, &status, NULL, NULL);
+	if (ret != 0 || status != 0) {
+		DEBUG(DEBUG_ERR,("Failed to modify flags\n"));
+		return -1;
+	}
+	return 0;
+}
+
+/*
   dump memory usage
  */
 static int control_dumpmemory(struct ctdb_context *ctdb, int argc, const char **argv)
@@ -2483,6 +2527,7 @@ static const struct {
 	{ "backupdb",        control_backupdb,          false, "backup the database into a file.", "<database> <file>"},
 	{ "restoredb",        control_restoredb,          false, "restore the database from a file.", "<file>"},
 	{ "recmaster",        control_recmaster,          false, "show the pnn for the recovery master."},
+	{ "setflags",        control_setflags,            false, "set flags for a node in the nodemap.", "<node> <flags>"},
 };
 
 /*


-- 
CTDB repository