[SCM] CTDB repository - branch master updated - ctdb-1.0.82-3-gafdbf3c

Ronnie Sahlberg sahlberg at samba.org
Thu May 14 09:03:47 GMT 2009


The branch, master has been updated
       via  afdbf3c0df02decd823615134294abf2c8a8a5f3 (commit)
       via  722171fc94a36ffe9e0a5c64502b916fde0a13a4 (commit)
       via  042377ed803bb8f7ca9d6ea1a387427b7b8ba45a (commit)
      from  82ee458329968001bb03b2aec42e65f532f007b3 (commit)

http://gitweb.samba.org/?p=sahlberg/ctdb.git;a=shortlog;h=master


- Log -----------------------------------------------------------------
commit afdbf3c0df02decd823615134294abf2c8a8a5f3
Author: Christian Ambach <christian.ambach at de.ibm.com>
Date:   Wed May 6 19:01:58 2009 +0200

    Remove error messages about a non-existing /var/log/log.ctdb when running ctdb with logging to syslog

commit 722171fc94a36ffe9e0a5c64502b916fde0a13a4
Author: Ronnie Sahlberg <ronniesahlberg at gmail.com>
Date:   Thu May 14 18:25:00 2009 +1000

    add additional log info to track if/why we cant switch to client mode.

commit 042377ed803bb8f7ca9d6ea1a387427b7b8ba45a
Author: Ronnie Sahlberg <ronniesahlberg at gmail.com>
Date:   Thu May 14 10:33:25 2009 +1000

    Track how long it takes to take out the recovery lock from both the main dameon and also from the recovery daemon.
    Log this in "ctdb statistics".
    
    Also add a varaible "RecLockLatencyMs" that will log an error everytime it takes longer than this to access the reclock file.

-----------------------------------------------------------------------

Summary of changes:
 client/ctdb_client.c   |   23 +++++++++++++++++++++++
 common/ctdb_util.c     |   16 ++++++++++++++++
 include/ctdb_private.h |    8 ++++++++
 server/ctdb_control.c  |    4 ++++
 server/ctdb_recover.c  |    6 ++++++
 server/ctdb_recoverd.c |    5 +++++
 server/ctdb_tunables.c |    1 +
 tools/ctdb.c           |    3 +++
 tools/ctdb_diagnostics |    4 ++--
 9 files changed, 68 insertions(+), 2 deletions(-)


Changeset truncated at 500 lines:

diff --git a/client/ctdb_client.c b/client/ctdb_client.c
index 3bdb4b2..df9fa06 100644
--- a/client/ctdb_client.c
+++ b/client/ctdb_client.c
@@ -263,6 +263,7 @@ int ctdb_socket_connect(struct ctdb_context *ctdb)
 
 	ctdb->daemon.sd = socket(AF_UNIX, SOCK_STREAM, 0);
 	if (ctdb->daemon.sd == -1) {
+		DEBUG(DEBUG_ERR,(__location__ " Failed to open client socket. Errno:%s(%d)\n", strerror(errno), errno));
 		return -1;
 	}
 
@@ -272,6 +273,7 @@ int ctdb_socket_connect(struct ctdb_context *ctdb)
 	if (connect(ctdb->daemon.sd, (struct sockaddr *)&addr, sizeof(addr)) == -1) {
 		close(ctdb->daemon.sd);
 		ctdb->daemon.sd = -1;
+		DEBUG(DEBUG_ERR,(__location__ " Failed to connect client socket to daemon. Errno:%s(%d)\n", strerror(errno), errno));
 		return -1;
 	}
 
@@ -3615,3 +3617,24 @@ int ctdb_ctrl_getscriptstatus(struct ctdb_context *ctdb,
 	return 0;
 }
 
+/*
+  tell the main daemon how long it took to lock the reclock file
+ */
+int ctdb_ctrl_report_recd_lock_latency(struct ctdb_context *ctdb, struct timeval timeout, double latency)
+{
+	int ret;
+	int32_t res;
+	TDB_DATA data;
+
+	data.dptr = (uint8_t *)&latency;
+	data.dsize = sizeof(latency);
+
+	ret = ctdb_control(ctdb, CTDB_CURRENT_NODE, 0, CTDB_CONTROL_RECD_RECLOCK_LATENCY, 0, data, 
+			   ctdb, NULL, &res, NULL, NULL);
+	if (ret != 0 || res != 0) {
+		DEBUG(DEBUG_ERR,("Failed to send recd reclock latency\n"));
+		return -1;
+	}
+
+	return 0;
+}
diff --git a/common/ctdb_util.c b/common/ctdb_util.c
index a3f1731..3604cc8 100644
--- a/common/ctdb_util.c
+++ b/common/ctdb_util.c
@@ -138,6 +138,22 @@ void ctdb_latency(struct ctdb_db_context *ctdb_db, const char *name, double *lat
 	}
 }
 
+/*
+  update a reclock latency number
+ */
+void ctdb_reclock_latency(struct ctdb_context *ctdb, const char *name, double *latency, double l)
+{
+	if (l > *latency) {
+		*latency = l;
+	}
+
+	if (ctdb->tunable.reclock_latency_ms !=0) {
+		if (l*1000 > ctdb->tunable.reclock_latency_ms) {
+			DEBUG(DEBUG_ERR, ("High RECLOCK latency %fs for operation %s\n", l, name));
+		}
+	}
+}
+
 uint32_t ctdb_reqid_new(struct ctdb_context *ctdb, void *state)
 {
 	uint32_t id;
diff --git a/include/ctdb_private.h b/include/ctdb_private.h
index f37560a..5ecf8a6 100644
--- a/include/ctdb_private.h
+++ b/include/ctdb_private.h
@@ -105,6 +105,7 @@ struct ctdb_tunable {
 	uint32_t recd_ping_timeout;
 	uint32_t recd_ping_failcount;
 	uint32_t log_latency_ms;
+	uint32_t reclock_latency_ms;
 	uint32_t recovery_drop_all_ips;
 	uint32_t verify_recovery_lock;
 };
@@ -299,6 +300,10 @@ struct ctdb_statistics {
 		uint32_t control;
 		uint32_t traverse;
 	} timeouts;
+	struct {
+		double ctdbd;
+		double recd;
+	} reclock;
 	uint32_t total_calls;
 	uint32_t pending_calls;
 	uint32_t lockwait_calls;
@@ -562,6 +567,7 @@ enum ctdb_controls {CTDB_CONTROL_PROCESS_EXISTS          = 0,
 		    CTDB_CONTROL_EVENT_SCRIPT_FINISHED   = 95,
 		    CTDB_CONTROL_GET_EVENT_SCRIPT_STATUS = 96,
 		    CTDB_CONTROL_TRAVERSE_KILL		 = 97,
+		    CTDB_CONTROL_RECD_RECLOCK_LATENCY    = 98,
 };	
 
 /*
@@ -954,6 +960,7 @@ void ctdb_recv_raw_pkt(void *p, uint8_t *data, uint32_t length);
 int ctdb_socket_connect(struct ctdb_context *ctdb);
 
 void ctdb_latency(struct ctdb_db_context *ctdb_db, const char *name, double *latency, struct timeval t);
+void ctdb_reclock_latency(struct ctdb_context *ctdb, const char *name, double *latency, double l);
 
 uint32_t ctdb_reqid_new(struct ctdb_context *ctdb, void *state);
 void *_ctdb_reqid_find(struct ctdb_context *ctdb, uint32_t reqid, const char *type, const char *location);
@@ -1431,5 +1438,6 @@ int32_t ctdb_control_event_script_finished(struct ctdb_context *ctdb);
 int32_t ctdb_control_get_event_script_status(struct ctdb_context *ctdb, TDB_DATA *outdata);
 
 int ctdb_log_event_script_output(struct ctdb_context *ctdb, char *str, uint16_t len);
+int ctdb_ctrl_report_recd_lock_latency(struct ctdb_context *ctdb, struct timeval timeout, double latency);
 
 #endif
diff --git a/server/ctdb_control.c b/server/ctdb_control.c
index d5d4766..72e60c5 100644
--- a/server/ctdb_control.c
+++ b/server/ctdb_control.c
@@ -441,6 +441,10 @@ static int32_t ctdb_control_dispatch(struct ctdb_context *ctdb,
 		CHECK_CONTROL_DATA_SIZE(0);
 		return ctdb_control_get_event_script_status(ctdb, outdata);
 
+	case CTDB_CONTROL_RECD_RECLOCK_LATENCY:
+		CHECK_CONTROL_DATA_SIZE(sizeof(double));
+		ctdb_reclock_latency(ctdb, "recd reclock", &ctdb->statistics.reclock.recd, *((double *)indata.dptr));
+		return 0;
 	default:
 		DEBUG(DEBUG_CRIT,(__location__ " Unknown CTDB control opcode %u\n", opcode));
 		return -1;
diff --git a/server/ctdb_recover.c b/server/ctdb_recover.c
index c979398..b4428fa 100644
--- a/server/ctdb_recover.c
+++ b/server/ctdb_recover.c
@@ -492,6 +492,7 @@ struct ctdb_set_recmode_state {
 	struct timed_event *te;
 	struct fd_event *fde;
 	pid_t child;
+	struct timeval start_time;
 };
 
 /*
@@ -520,6 +521,10 @@ static void ctdb_set_recmode_timeout(struct event_context *ev, struct timed_even
 */
 static int set_recmode_destructor(struct ctdb_set_recmode_state *state)
 {
+	double l = timeval_elapsed(&state->start_time);
+
+	ctdb_reclock_latency(state->ctdb, "daemon reclock", &state->ctdb->statistics.reclock.ctdbd, l);
+		
 	kill(state->child, SIGKILL);
 	return 0;
 }
@@ -632,6 +637,7 @@ int32_t ctdb_control_set_recmode(struct ctdb_context *ctdb,
 	state = talloc(ctdb, struct ctdb_set_recmode_state);
 	CTDB_NO_MEMORY(ctdb, state);
 
+	state->start_time = timeval_current();
 
 	if (ctdb->tunable.verify_recovery_lock == 0) {
 		/* dont need to verify the reclock file */
diff --git a/server/ctdb_recoverd.c b/server/ctdb_recoverd.c
index c6d0a7a..995284f 100644
--- a/server/ctdb_recoverd.c
+++ b/server/ctdb_recoverd.c
@@ -1328,6 +1328,7 @@ static int do_recovery(struct ctdb_recoverd *rec,
 	struct ctdb_dbid_map *dbmap;
 	TDB_DATA data;
 	uint32_t *nodes;
+	struct timeval start_time;
 
 	DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
 
@@ -1351,11 +1352,15 @@ static int do_recovery(struct ctdb_recoverd *rec,
 		ctdb_ban_node(rec, rec->last_culprit, ctdb->tunable.recovery_ban_period);
 	}
 
+	DEBUG(DEBUG_ERR,("Taking out recovery lock from recovery daemon\n"));
+	start_time = timeval_current();
 	if (!ctdb_recovery_lock(ctdb, true)) {
 		ctdb_set_culprit(rec, pnn);
 		DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery\n"));
 		return -1;
 	}
+	ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&start_time));
+	DEBUG(DEBUG_ERR,("Recovery lock taken successfully by recovery daemon\n"));
 
 	DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", culprit));
 
diff --git a/server/ctdb_tunables.c b/server/ctdb_tunables.c
index bab9aef..6a8876f 100644
--- a/server/ctdb_tunables.c
+++ b/server/ctdb_tunables.c
@@ -53,6 +53,7 @@ static const struct {
 	{ "RecdPingTimeout",	 20,  offsetof(struct ctdb_tunable, recd_ping_timeout) },
 	{ "RecdFailCount",	  3,  offsetof(struct ctdb_tunable, recd_ping_failcount) },
 	{ "LogLatencyMs",         0,  offsetof(struct ctdb_tunable, log_latency_ms) },
+	{ "RecLockLatencyMs",  1000,  offsetof(struct ctdb_tunable, reclock_latency_ms) },
 	{ "RecoveryDropAllIPs",  60,  offsetof(struct ctdb_tunable, recovery_drop_all_ips) },
 	{ "VerifyRecoveryLock",   1,  offsetof(struct ctdb_tunable, verify_recovery_lock) },
 };
diff --git a/tools/ctdb.c b/tools/ctdb.c
index 35a4190..b85354c 100644
--- a/tools/ctdb.c
+++ b/tools/ctdb.c
@@ -209,6 +209,9 @@ static void show_statistics(struct ctdb_statistics *s)
 		       preflen?0:4, "",
 		       *(uint32_t *)(fields[i].offset+(uint8_t *)s));
 	}
+	printf(" %-30s     %.6f sec\n", "max_reclock_ctdbd", s->reclock.ctdbd);
+	printf(" %-30s     %.6f sec\n", "max_reclock_recd", s->reclock.recd);
+
 	printf(" %-30s     %.6f sec\n", "max_call_latency", s->max_call_latency);
 	printf(" %-30s     %.6f sec\n", "max_lockwait_latency", s->max_lockwait_latency);
 	printf(" %-30s     %.6f sec\n", "max_childwrite_latency", s->max_childwrite_latency);
diff --git a/tools/ctdb_diagnostics b/tools/ctdb_diagnostics
index a9531d6..7b48299 100755
--- a/tools/ctdb_diagnostics
+++ b/tools/ctdb_diagnostics
@@ -133,10 +133,10 @@ show_all "ctdb statistics"
 show_all "ctdb uptime"
 
 echo "Showing log.ctdb"
-show_all "tail -100 /var/log/log.ctdb"
+show_all "test -f /var/log/log.ctdb && tail -100 /var/log/log.ctdb"
 
 echo "Showing log.ctdb"
-show_all "tail -100 /var/log/log.ctdb"
+show_all "test -f /var/log/log.ctdb && tail -100 /var/log/log.ctdb"
 
 show_all "tail -200 /var/log/messages"
 show_all "tail -200 /etc/ctdb/state/vacuum.log"


-- 
CTDB repository


More information about the samba-cvs mailing list