Rev 645: - catch ESTALE in the recovery lock by trying a read() in
http://samba.org/~tridge/ctdb
tridge at samba.org
tridge at samba.org
Fri Oct 5 03:28:23 GMT 2007
------------------------------------------------------------
revno: 645
revision-id: tridge at samba.org-20071005032821-gnm4te9ogcb2tmpw
parent: tridge at samba.org-20071005020140-in6xybrd2o71xrlk
committer: Andrew Tridgell <tridge at samba.org>
branch nick: tridge
timestamp: Fri 2007-10-05 13:28:21 +1000
message:
- catch ESTALE in the recovery lock by trying a read()
- prioritise nodes that are unbanned and healthy in the election
modified:
server/ctdb_recoverd.c recoverd.c-20070503213540-bvxuyd9jm1f7ig90-1
=== modified file 'server/ctdb_recoverd.c'
--- a/server/ctdb_recoverd.c 2007-10-05 02:01:40 +0000
+++ b/server/ctdb_recoverd.c 2007-10-05 03:28:21 +0000
@@ -45,6 +45,7 @@
struct timeval priority_time;
bool need_takeover_run;
bool need_recovery;
+ uint32_t node_flags;
};
#define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
@@ -974,6 +975,7 @@
uint32_t num_connected;
struct timeval priority_time;
uint32_t pnn;
+ uint32_t node_flags;
};
/*
@@ -989,6 +991,7 @@
em->pnn = rec->ctdb->pnn;
em->priority_time = rec->priority_time;
+ em->node_flags = rec->node_flags;
ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
if (ret != 0) {
@@ -1009,12 +1012,36 @@
static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
{
struct election_message myem;
- int cmp;
+ int cmp = 0;
ctdb_election_data(rec, &myem);
+ /* try to use a unbanned node */
+ if ((em->node_flags & NODE_FLAGS_BANNED) &&
+ !(myem.node_flags & NODE_FLAGS_BANNED)) {
+ cmp = 1;
+ }
+ if (!(em->node_flags & NODE_FLAGS_BANNED) &&
+ (myem.node_flags & NODE_FLAGS_BANNED)) {
+ cmp = -1;
+ }
+
+ /* try to use a healthy node */
+ if (cmp == 0) {
+ if ((em->node_flags & NODE_FLAGS_UNHEALTHY) &&
+ !(myem.node_flags & NODE_FLAGS_UNHEALTHY)) {
+ cmp = 1;
+ }
+ if (!(em->node_flags & NODE_FLAGS_UNHEALTHY) &&
+ (myem.node_flags & NODE_FLAGS_UNHEALTHY)) {
+ cmp = -1;
+ }
+ }
+
/* try to use the most connected node */
- cmp = (int)myem.num_connected - (int)em->num_connected;
+ if (cmp == 0) {
+ cmp = (int)myem.num_connected - (int)em->num_connected;
+ }
/* then the longest running node */
if (cmp == 0) {
@@ -1444,6 +1471,7 @@
int i, j, ret;
struct ctdb_recoverd *rec;
struct ctdb_all_public_ips *ips;
+ char c;
rec = talloc_zero(ctdb, struct ctdb_recoverd);
CTDB_NO_MEMORY_FATAL(ctdb, rec);
@@ -1508,6 +1536,8 @@
goto again;
}
+ /* remember our own node flags */
+ rec->node_flags = nodemap->nodes[pnn].flags;
/* count how many active nodes there are */
num_active = 0;
@@ -1608,7 +1638,6 @@
goto again;
}
-
/* update the list of public ips that a node can handle for
all connected nodes
*/
@@ -1670,6 +1699,20 @@
}
+ /* we should have the reclock - check its not stale */
+ if (ctdb->recovery_lock_fd == -1) {
+ DEBUG(0,("recovery master doesn't have the recovery lock\n"));
+ do_recovery(rec, mem_ctx, pnn, num_active, nodemap, vnnmap, pnn);
+ goto again;
+ }
+
+ if (read(ctdb->recovery_lock_fd, &c, 1) == -1) {
+ DEBUG(0,("failed read from recovery_lock_fd - %s\n", strerror(errno)));
+ close(ctdb->recovery_lock_fd);
+ ctdb->recovery_lock_fd = -1;
+ do_recovery(rec, mem_ctx, pnn, num_active, nodemap, vnnmap, pnn);
+ goto again;
+ }
/* get the nodemap for all active remote nodes and verify
they are the same as for this node
More information about the samba-cvs
mailing list