[SCM] SAMBA-CTDB repository - branch v3-4-ctdb updated - 3.4.2-ctdb-22-8-gc20e3e4

Tue Feb 16 08:56:40 MST 2010

The branch, v3-4-ctdb has been updated
       via  c20e3e402bd3b256b4ea5dd7f9f84b5336930bb7 (commit)
       via  5c54ae35b4356549caa691abd8d2b3eab36de5ec (commit)
       via  31a3d1c5fef0688548cde5224f2196c84d866b57 (commit)
       via  8136b31a24716a405c5e29f00d291e3979bc939a (commit)
       via  f373961ff38f0d9f6b7ef866b6cb30f653fc76b7 (commit)
       via  e577980623ea64a852bac569bf50fb9c92d98b7b (commit)
       via  0357851c0c8e115e92c89956b28c0468182e93d1 (commit)
       via  dd9ab62a6d8b247ee95f304b3d24469cff6553b8 (commit)
      from  968e731d7c2b905d816ba283a096195b1a964539 (commit)

http://gitweb.samba.org/?p=obnox/samba-ctdb.git;a=shortlog;h=v3-4-ctdb


- Log -----------------------------------------------------------------
commit c20e3e402bd3b256b4ea5dd7f9f84b5336930bb7
Author: Volker Lendecke <vl at samba.org>
Date:   Tue Feb 16 16:44:58 2010 +0100

    v3-4-ctdb: Bump ctdb vendor patch level to 23

commit 5c54ae35b4356549caa691abd8d2b3eab36de5ec
Author: Volker Lendecke <vl at samba.org>
Date:   Tue Feb 16 15:21:25 2010 +0100

    s3: Fix timeout calculation if g_lock_lock is given a timeout < 60s
    
    Detected while showing this code to obnox :-)

commit 31a3d1c5fef0688548cde5224f2196c84d866b57
Author: Volker Lendecke <vl at samba.org>
Date:   Tue Feb 16 12:31:58 2010 +0100

    s3: Slightly increase parallelism in g_lock
    
    There's no need to still hold the g_lock tdb-level lock while telling the
    waiters to retry

commit 8136b31a24716a405c5e29f00d291e3979bc939a
Author: Volker Lendecke <vl at samba.org>
Date:   Tue Feb 16 12:28:53 2010 +0100

    s3: Avoid starving locks when many processes die at the same time
    
    In g_lock_unlock we have a little race between the process_exists and
    messaging_send calls: we only send to 5 waiters now, and they all might have
    died between us checking their existence and sending the message. This change
    makes g_lock_lock retry at least once every minute.

commit f373961ff38f0d9f6b7ef866b6cb30f653fc76b7
Author: Volker Lendecke <vl at samba.org>
Date:   Tue Feb 16 12:22:08 2010 +0100

    s3: Avoid a thundering herd in g_lock_unlock
    
    Only notify the first 5 pending lock waiters. This avoids a thundering herd
    problem that is really nasty in a cluster. It also makes acquiring a lock a bit
    more FIFO, because lock waiters are added to the end of the array.

commit e577980623ea64a852bac569bf50fb9c92d98b7b
Author: Volker Lendecke <vl at samba.org>
Date:   Mon Feb 15 16:57:16 2010 +0100

    s3: Optimize g_lock_lock for a heavily contended case
    
    Only check the existence of the lock owner in g_lock_parse; check the rest of
    the records only once we have got the lock successfully. This reduces the load
    on process_exists, which can involve a network roundtrip in the clustered case.

commit 0357851c0c8e115e92c89956b28c0468182e93d1
Author: Volker Lendecke <vl at samba.org>
Date:   Mon Feb 15 16:49:46 2010 +0100

    s3: Fix handling of processes that died in g_lock
    
    g_lock_parse might have thrown away entries from the locks array because the
    processes were not around anymore. Don't store the orphaned entries.

commit dd9ab62a6d8b247ee95f304b3d24469cff6553b8
Author: Volker Lendecke <vl at samba.org>
Date:   Mon Feb 15 16:35:06 2010 +0100

    s3: Fix a typo

-----------------------------------------------------------------------

Summary of changes:
 source3/VERSION      |    2 +-
 source3/lib/g_lock.c |   85 ++++++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 73 insertions(+), 14 deletions(-)


Changeset truncated at 500 lines:

diff --git a/source3/VERSION b/source3/VERSION
index ab42fc4..2c495b8 100644
--- a/source3/VERSION
+++ b/source3/VERSION
@@ -85,7 +85,7 @@ SAMBA_VERSION_IS_GIT_SNAPSHOT=no
 #                                                      #
 ########################################################
 SAMBA_VERSION_VENDOR_SUFFIX="ctdb"
-SAMBA_VERSION_VENDOR_PATCH=22
+SAMBA_VERSION_VENDOR_PATCH=23
 
 ########################################################
 # This can be set by vendors if they want..            #
diff --git a/source3/lib/g_lock.c b/source3/lib/g_lock.c
index 9971d71..4e31761 100644
--- a/source3/lib/g_lock.c
+++ b/source3/lib/g_lock.c
@@ -108,6 +108,34 @@ static bool g_lock_parse(TALLOC_CTX *mem_ctx, TDB_DATA data,
 			      (locks[i].lock_type & G_LOCK_PENDING) ?
 			      "(pending)" : "(owner)"));
 
+		if (((locks[i].lock_type & G_LOCK_PENDING) == 0)
+		    && !process_exists(locks[i].pid)) {
+
+			DEBUGADD(10, ("lock owner %s died -- discarding\n",
+				      procid_str(talloc_tos(),
+						 &locks[i].pid)));
+
+			if (i < (num_locks-1)) {
+				locks[i] = locks[num_locks-1];
+			}
+			num_locks -= 1;
+		}
+	}
+
+	*plocks = locks;
+	*pnum_locks = num_locks;
+	return true;
+}
+
+static void g_lock_cleanup(int *pnum_locks, struct g_lock_rec *locks)
+{
+	int i, num_locks;
+
+	num_locks = *pnum_locks;
+
+	DEBUG(10, ("g_lock_cleanup: %d locks\n", num_locks));
+
+	for (i=0; i<num_locks; i++) {
 		if (process_exists(locks[i].pid)) {
 			continue;
 		}
@@ -119,19 +147,18 @@ static bool g_lock_parse(TALLOC_CTX *mem_ctx, TDB_DATA data,
 		}
 		num_locks -= 1;
 	}
-
-	*plocks = locks;
 	*pnum_locks = num_locks;
-	return true;
+	return;
 }
 
 static struct g_lock_rec *g_lock_addrec(TALLOC_CTX *mem_ctx,
 					struct g_lock_rec *locks,
-					int num_locks,
+					int *pnum_locks,
 					const struct server_id pid,
 					enum g_lock_type lock_type)
 {
 	struct g_lock_rec *result;
+	int num_locks = *pnum_locks;
 
 	result = talloc_realloc(mem_ctx, locks, struct g_lock_rec,
 				num_locks+1);
@@ -141,6 +168,7 @@ static struct g_lock_rec *g_lock_addrec(TALLOC_CTX *mem_ctx,
 
 	result[num_locks].pid = pid;
 	result[num_locks].lock_type = lock_type;
+	*pnum_locks += 1;
 	return result;
 }
 
@@ -225,7 +253,7 @@ again:
 	if (our_index == -1) {
 		/* First round, add ourself */
 
-		locks = g_lock_addrec(talloc_tos(), locks, num_locks,
+		locks = g_lock_addrec(talloc_tos(), locks, &num_locks,
 				      self, lock_type);
 		if (locks == NULL) {
 			DEBUG(10, ("g_lock_addrec failed\n"));
@@ -241,7 +269,14 @@ again:
 		locks[our_index].lock_type = lock_type;
 	}
 
-	data = make_tdb_data((uint8_t *)locks, talloc_get_size(locks));
+	if (NT_STATUS_IS_OK(status) && ((lock_type & G_LOCK_PENDING) == 0)) {
+		/*
+		 * Walk through the list of locks, search for dead entries
+		 */
+		g_lock_cleanup(&num_locks, locks);
+	}
+
+	data = make_tdb_data((uint8_t *)locks, num_locks * sizeof(*locks));
 	store_status = rec->store(rec, data, 0);
 	if (!NT_STATUS_IS_OK(store_status)) {
 		DEBUG(1, ("rec->store failed: %s\n",
@@ -267,7 +302,6 @@ NTSTATUS g_lock_lock(struct g_lock_ctx *ctx, const char *name,
 	NTSTATUS status;
 	bool retry = false;
 	struct timeval timeout_end;
-	struct timeval timeout_remaining;
 	struct timeval time_now;
 
 	DEBUG(10, ("Trying to acquire lock %d for %s\n", (int)lock_type,
@@ -306,6 +340,7 @@ NTSTATUS g_lock_lock(struct g_lock_ctx *ctx, const char *name,
 		fd_set *r_fds = NULL;
 		int max_fd = 0;
 		int ret;
+		struct timeval timeout_remaining, select_timeout;
 
 		status = g_lock_trylock(ctx, name, lock_type);
 		if (NT_STATUS_IS_OK(status)) {
@@ -340,7 +375,7 @@ NTSTATUS g_lock_lock(struct g_lock_ctx *ctx, const char *name,
 		 * for writing and some other process already holds a lock
 		 * on the registry.tdb.
 		 *
-		 * So as a quick fix, we act a little corasely here: we do
+		 * So as a quick fix, we act a little coarsely here: we do
 		 * a select on the ctdb connection fd and when it is readable
 		 * or we get EINTR, then we retry without actually parsing
 		 * any ctdb packages or dispatching messages. This means that
@@ -364,10 +399,13 @@ NTSTATUS g_lock_lock(struct g_lock_ctx *ctx, const char *name,
 
 		time_now = timeval_current();
 		timeout_remaining = timeval_until(&time_now, &timeout_end);
+		select_timeout = timeval_set(60, 0);
 
-		ret = sys_select(max_fd + 1, r_fds, NULL, NULL,
-				 &timeout_remaining);
+		select_timeout = timeval_min(&select_timeout,
+					     &timeout_remaining);
 
+		ret = sys_select(max_fd + 1, r_fds, NULL, NULL,
+				 &select_timeout);
 		if (ret == -1) {
 			if (errno != EINTR) {
 				DEBUG(1, ("error calling select: %s\n",
@@ -388,7 +426,7 @@ NTSTATUS g_lock_lock(struct g_lock_ctx *ctx, const char *name,
 				break;
 			} else {
 				DEBUG(10, ("select returned 0 but timeout not "
-					   "not expired: strange - retrying\n"));
+					   "not expired, retrying\n"));
 			}
 		} else if (ret != 1) {
 			DEBUG(1, ("invalid return code of select: %d\n", ret));
@@ -504,14 +542,26 @@ static NTSTATUS g_lock_force_unlock(struct g_lock_ctx *ctx, const char *name,
 		goto done;
 	}
 
+	TALLOC_FREE(rec);
+
 	if ((lock_type & G_LOCK_PENDING) == 0) {
+		int num_wakeups = 0;
+
 		/*
-		 * We've been the lock holder. Tell all others to retry.
+		 * We've been the lock holder. Others to retry. Don't
+		 * tell all others to avoid a thundering herd. In case
+		 * this leads to a complete stall because we miss some
+		 * processes, the loop in g_lock_lock tries at least
+		 * once a minute.
 		 */
+
 		for (i=0; i<num_locks; i++) {
 			if ((locks[i].lock_type & G_LOCK_PENDING) == 0) {
 				continue;
 			}
+			if (!process_exists(locks[i].pid)) {
+				continue;
+			}
 
 			/*
 			 * Ping all waiters to retry
@@ -524,13 +574,22 @@ static NTSTATUS g_lock_force_unlock(struct g_lock_ctx *ctx, const char *name,
 					  procid_str(debug_ctx(),
 						     &locks[i].pid),
 					  nt_errstr(status)));
+			} else {
+				num_wakeups += 1;
+			}
+			if (num_wakeups > 5) {
+				break;
 			}
 		}
 	}
 done:
+	/*
+	 * For the error path, TALLOC_FREE(rec) as well. In the good
+	 * path we have already freed it.
+	 */
+	TALLOC_FREE(rec);
 
 	TALLOC_FREE(locks);
-	TALLOC_FREE(rec);
 	return status;
 }
 


-- 
SAMBA-CTDB repository