TDB mutex support breaks CTDB

Stefan (metze) Metzmacher metze at samba.org
Fri Dec 12 05:06:07 MST 2014


Am 12.12.2014 um 06:34 schrieb Amitay Isaacs:
> On Thu, Dec 11, 2014 at 7:55 PM, Stefan (metze) Metzmacher <metze at samba.org>
> wrote:
> 
>> Am 11.12.2014 um 05:50 schrieb Amitay Isaacs:
>>> Hi Volker/Metze,
>>>
>>> The TDB robust mutex changes make certain assumptions which completely
>>> breaks CTDB.  Here are some of the findings which are causing concern.
>>>
>>> 1. CTDB does not always specify all the matching flags (i.e. the flags
>> used
>>> for creation) to tdb_open_ex() when opening tdb database subsequently (or
>>> in not-first case).  This does not work for databases created with
>>> TDB_MUTEX_LOCKING.
>>>
>>> 2. If I specify TDB_MUTEX_LOCKING in tdb_flags for subsequent (non-first)
>>> tdb_open_ex(), it does not work unless I also include TDB_CLEAR_IF_FIRST.
>>> If the process knows that it's not the first opener, why should it need
>> to
>>> specify TDB_CLEAR_IF_FIRST?
>>
>> As indication that it's a non persistent database.
>>
>>> 3. CTDB database recovery uses transactions to update the databases after
>>> recovery.  With TDB_MUTEX_LOCKING enabled, CTDB cannot do database
>> recovery.
>>
>> Can the all record lock be used instead of a transaction?
>> I wonder why we didn't notice this before...
>>
> 
> Changing from transaction to all record lock is going to be a major
> change.  What's the reason for not allowing transactions on mutex enabled
> databases?

I discussed this again with Volker and Michael, we all think it's ok to
remove
this restriction.

Please test the attached patches.

A possible backport will go via
https://bugzilla.samba.org/show_bug.cgi?id=11004.

metze
-------------- next part --------------
From cabb1713f171c6c6f6d1cd9c45ecbf65b943bb29 Mon Sep 17 00:00:00 2001
From: Stefan Metzmacher <metze at samba.org>
Date: Fri, 12 Dec 2014 11:22:47 +0100
Subject: [PATCH 1/4] tdb: allow transactions on on tdb's with
 TDB_MUTEX_LOCKING

There's no real reason to disallow transactions as the
allrecord lock is also available with mutexes enabled.

E.g. ctdbd requires transactions also on non-persistent databases
opened with TDB_CLEAR_IF_FIRST and TDB_MUTEX_LOCKING.

Bug: https://bugzilla.samba.org/show_bug.cgi?id=11004

Signed-off-by: Stefan Metzmacher <metze at samba.org>
---
 lib/tdb/common/transaction.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/tdb/common/transaction.c b/lib/tdb/common/transaction.c
index caef0be..0dd057b 100644
--- a/lib/tdb/common/transaction.c
+++ b/lib/tdb/common/transaction.c
@@ -421,7 +421,7 @@ static int _tdb_transaction_start(struct tdb_context *tdb,
 				  enum tdb_lock_flags lockflags)
 {
 	/* some sanity checks */
-	if (tdb->read_only || (tdb->flags & (TDB_INTERNAL|TDB_MUTEX_LOCKING))
+	if (tdb->read_only || (tdb->flags & TDB_INTERNAL)
 	    || tdb->traverse_read) {
 		TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: cannot start a transaction on a read-only or internal db\n"));
 		tdb->ecode = TDB_ERR_EINVAL;
-- 
1.9.1


From c475663364571fea9dcef51438e49e52b44bd259 Mon Sep 17 00:00:00 2001
From: Stefan Metzmacher <metze at samba.org>
Date: Fri, 12 Dec 2014 12:24:50 +0100
Subject: [PATCH 2/4] tdb/test: add tdb1-run-mutex-transaction1 test

Bug: https://bugzilla.samba.org/show_bug.cgi?id=11004

Signed-off-by: Stefan Metzmacher <metze at samba.org>
---
 lib/tdb/test/run-mutex-transaction1.c | 236 ++++++++++++++++++++++++++++++++++
 lib/tdb/wscript                       |   1 +
 2 files changed, 237 insertions(+)
 create mode 100644 lib/tdb/test/run-mutex-transaction1.c

diff --git a/lib/tdb/test/run-mutex-transaction1.c b/lib/tdb/test/run-mutex-transaction1.c
new file mode 100644
index 0000000..7b9f7b1
--- /dev/null
+++ b/lib/tdb/test/run-mutex-transaction1.c
@@ -0,0 +1,236 @@
+#include "../common/tdb_private.h"
+#include "../common/io.c"
+#include "../common/tdb.c"
+#include "../common/lock.c"
+#include "../common/freelist.c"
+#include "../common/traverse.c"
+#include "../common/transaction.c"
+#include "../common/error.c"
+#include "../common/open.c"
+#include "../common/check.c"
+#include "../common/hash.c"
+#include "../common/mutex.c"
+#include "tap-interface.h"
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <stdarg.h>
+
+static TDB_DATA key, data;
+
+static void log_fn(struct tdb_context *tdb, enum tdb_debug_level level,
+		   const char *fmt, ...)
+{
+	va_list ap;
+	va_start(ap, fmt);
+	vfprintf(stderr, fmt, ap);
+	va_end(ap);
+}
+
+static int do_child(int tdb_flags, int to, int from)
+{
+	struct tdb_context *tdb;
+	unsigned int log_count;
+	struct tdb_logging_context log_ctx = { log_fn, &log_count };
+	int ret;
+	char c = 0;
+
+	tdb = tdb_open_ex("mutex-transaction1.tdb", 3, tdb_flags,
+			  O_RDWR|O_CREAT, 0755, &log_ctx, NULL);
+	ok(tdb, "tdb_open_ex should succeed");
+
+	ret = tdb_transaction_start(tdb);
+	ok(ret == 0, "tdb_transaction_start should succeed");
+
+	ret = tdb_store(tdb, key, data, TDB_INSERT);
+	ok(ret == 0, "tdb_store(tdb, key, data, TDB_INSERT) should succeed");
+
+	write(to, &c, sizeof(c));
+	read(from, &c, sizeof(c));
+
+	ret = tdb_transaction_cancel(tdb);
+	ok(ret == 0, "tdb_transaction_cancel should succeed");
+
+	write(to, &c, sizeof(c));
+	read(from, &c, sizeof(c));
+
+	ret = tdb_transaction_start(tdb);
+	ok(ret == 0, "tdb_transaction_start should succeed");
+
+	ret = tdb_store(tdb, key, data, TDB_INSERT);
+	ok(ret == 0, "tdb_store(tdb, key, data, TDB_INSERT) should succeed");
+
+	write(to, &c, sizeof(c));
+	read(from, &c, sizeof(c));
+
+	ret = tdb_transaction_commit(tdb);
+	ok(ret == 0, "tdb_transaction_commit should succeed");
+
+	write(to, &c, sizeof(c));
+	read(from, &c, sizeof(c));
+
+	ret = tdb_transaction_start(tdb);
+	ok(ret == 0, "tdb_transaction_start should succeed");
+
+	ret = tdb_store(tdb, key, key, TDB_REPLACE);
+	ok(ret == 0, "tdb_store(tdb, key, data, TDB_REPLACE) should succeed");
+
+	write(to, &c, sizeof(c));
+	read(from, &c, sizeof(c));
+
+	ret = tdb_transaction_commit(tdb);
+	ok(ret == 0, "tdb_transaction_commit should succeed");
+
+	write(to, &c, sizeof(c));
+	read(from, &c, sizeof(c));
+
+	return 0;
+}
+
+/* The code should barf on TDBs created with rwlocks. */
+int main(int argc, char *argv[])
+{
+	struct tdb_context *tdb;
+	unsigned int log_count;
+	struct tdb_logging_context log_ctx = { log_fn, &log_count };
+	int ret, status;
+	pid_t child, wait_ret;
+	int fromchild[2];
+	int tochild[2];
+	TDB_DATA val;
+	char c;
+	int tdb_flags;
+	bool runtime_support;
+
+	runtime_support = tdb_runtime_check_for_robust_mutexes();
+
+	if (!runtime_support) {
+		skip(1, "No robust mutex support");
+		return exit_status();
+	}
+
+	key.dsize = strlen("hi");
+	key.dptr = discard_const_p(uint8_t, "hi");
+	data.dsize = strlen("world");
+	data.dptr = discard_const_p(uint8_t, "world");
+
+	pipe(fromchild);
+	pipe(tochild);
+
+	tdb_flags = TDB_INCOMPATIBLE_HASH|
+		TDB_MUTEX_LOCKING|
+		TDB_CLEAR_IF_FIRST;
+
+	child = fork();
+	if (child == 0) {
+		close(fromchild[0]);
+		close(tochild[1]);
+		return do_child(tdb_flags, fromchild[1], tochild[0]);
+	}
+	close(fromchild[1]);
+	close(tochild[0]);
+
+	read(fromchild[0], &c, sizeof(c));
+
+	tdb = tdb_open_ex("mutex-transaction1.tdb", 0,
+			  tdb_flags, O_RDWR|O_CREAT, 0755,
+			  &log_ctx, NULL);
+	ok(tdb, "tdb_open_ex should succeed");
+
+	/*
+	 * The child has the transaction running
+	 */
+	ret = tdb_transaction_start_nonblock(tdb);
+	ok(ret == -1, "tdb_transaction_start_nonblock not succeed");
+
+	ret = tdb_chainlock_nonblock(tdb, key);
+	ok(ret == -1, "tdb_chainlock_nonblock should not succeed");
+
+	/*
+	 * We can still read
+	 */
+	ret = tdb_exists(tdb, key);
+	ok(ret == 0, "tdb_exists(tdb, key) should return 0");
+
+	val = tdb_fetch(tdb, key);
+	ok(val.dsize == 0, "tdb_fetch(tdb, key) should return an empty value");
+
+	write(tochild[1], &c, sizeof(c));
+
+	/*
+	 * When the child canceled we can start...
+	 */
+	ret = tdb_transaction_start(tdb);
+	ok(ret == 0, "tdb_transaction_start should succeed");
+
+	read(fromchild[0], &c, sizeof(c));
+	write(tochild[1], &c, sizeof(c));
+
+	ret = tdb_transaction_cancel(tdb);
+	ok(ret == 0, "tdb_transaction_cancel should succeed");
+
+	/*
+	 * When we canceled the child can start and store...
+	 */
+	read(fromchild[0], &c, sizeof(c));
+
+	/*
+	 * We still see the old values before the child commits...
+	 */
+	ret = tdb_exists(tdb, key);
+	ok(ret == 0, "tdb_exists(tdb, key) should return 0");
+
+	val = tdb_fetch(tdb, key);
+	ok(val.dsize == 0, "tdb_fetch(tdb, key) should return an empty value");
+
+	write(tochild[1], &c, sizeof(c));
+	read(fromchild[0], &c, sizeof(c));
+
+	/*
+	 * We see the new values after the commit...
+	 */
+	ret = tdb_exists(tdb, key);
+	ok(ret == 1, "tdb_exists(tdb, key) should return 1");
+
+	val = tdb_fetch(tdb, key);
+	ok(val.dsize != 0, "tdb_fetch(tdb, key) should return a value");
+	ok(val.dsize == data.dsize, "tdb_fetch(tdb, key) should return a value");
+	ok(memcmp(val.dptr, data.dptr, data.dsize) == 0, "tdb_fetch(tdb, key) should return a value");
+
+	write(tochild[1], &c, sizeof(c));
+	read(fromchild[0], &c, sizeof(c));
+
+	/*
+	 * The child started a new transaction and replaces the value,
+	 * but we still see the old values before the child commits...
+	 */
+	ret = tdb_exists(tdb, key);
+	ok(ret == 1, "tdb_exists(tdb, key) should return 1");
+
+	val = tdb_fetch(tdb, key);
+	ok(val.dsize != 0, "tdb_fetch(tdb, key) should return a value");
+	ok(val.dsize == data.dsize, "tdb_fetch(tdb, key) should return a value");
+	ok(memcmp(val.dptr, data.dptr, data.dsize) == 0, "tdb_fetch(tdb, key) should return a value");
+
+	write(tochild[1], &c, sizeof(c));
+	read(fromchild[0], &c, sizeof(c));
+
+	/*
+	 * We see the new values after the commit...
+	 */
+	ret = tdb_exists(tdb, key);
+	ok(ret == 1, "tdb_exists(tdb, key) should return 1");
+
+	val = tdb_fetch(tdb, key);
+	ok(val.dsize != 0, "tdb_fetch(tdb, key) should return a value");
+	ok(val.dsize == key.dsize, "tdb_fetch(tdb, key) should return a value");
+	ok(memcmp(val.dptr, key.dptr, key.dsize) == 0, "tdb_fetch(tdb, key) should return a value");
+
+	write(tochild[1], &c, sizeof(c));
+
+	wait_ret = wait(&status);
+	ok(wait_ret == child, "child should have exited correctly");
+
+	diag("done");
+	return exit_status();
+}
diff --git a/lib/tdb/wscript b/lib/tdb/wscript
index d129b24..bae5f37 100644
--- a/lib/tdb/wscript
+++ b/lib/tdb/wscript
@@ -46,6 +46,7 @@ tdb1_unit_tests = [
     'run-mutex-allrecord-bench',
     'run-mutex-allrecord-trylock',
     'run-mutex-allrecord-block',
+    'run-mutex-transaction1',
     'run-mutex-die',
     'run-mutex1',
 ]
-- 
1.9.1


From 1d8ffa424aac0e3e47adeb0074d4ae092fdbec3d Mon Sep 17 00:00:00 2001
From: Stefan Metzmacher <metze at samba.org>
Date: Fri, 12 Dec 2014 12:53:37 +0100
Subject: [PATCH 3/4] tdb/toos: allow transactions with TDB_MUTEX_LOCKING

Bug: https://bugzilla.samba.org/show_bug.cgi?id=11004

Signed-off-by: Stefan Metzmacher <metze at samba.org>
---
 lib/tdb/tools/tdbtorture.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/lib/tdb/tools/tdbtorture.c b/lib/tdb/tools/tdbtorture.c
index 9278688..e4b8f69 100644
--- a/lib/tdb/tools/tdbtorture.c
+++ b/lib/tdb/tools/tdbtorture.c
@@ -120,7 +120,6 @@ static void addrec_db(void)
 
 #if TRANSACTION_PROB
 	if (in_transaction == 0 &&
-	    ((tdb_get_flags(db) & TDB_MUTEX_LOCKING) == 0) &&
 	    (always_transaction || random() % TRANSACTION_PROB == 0)) {
 		if (tdb_transaction_start(db) != 0) {
 			fatal("tdb_transaction_start failed");
-- 
1.9.1


From 2f0234c152b9a421222aa8afe5e8928d3648539f Mon Sep 17 00:00:00 2001
From: Stefan Metzmacher <metze at samba.org>
Date: Fri, 12 Dec 2014 12:28:47 +0100
Subject: [PATCH 4/4] tdb: version 1.3.4

Transactions are supported with TDB_MUTEX_LOCKING.

This fixes https://bugzilla.samba.org/show_bug.cgi?id=11004

Signed-off-by: Stefan Metzmacher <metze at samba.org>
---
 lib/tdb/ABI/tdb-1.3.4.sigs | 68 ++++++++++++++++++++++++++++++++++++++++++++++
 lib/tdb/wscript            |  2 +-
 2 files changed, 69 insertions(+), 1 deletion(-)
 create mode 100644 lib/tdb/ABI/tdb-1.3.4.sigs

diff --git a/lib/tdb/ABI/tdb-1.3.4.sigs b/lib/tdb/ABI/tdb-1.3.4.sigs
new file mode 100644
index 0000000..7d3e469
--- /dev/null
+++ b/lib/tdb/ABI/tdb-1.3.4.sigs
@@ -0,0 +1,68 @@
+tdb_add_flags: void (struct tdb_context *, unsigned int)
+tdb_append: int (struct tdb_context *, TDB_DATA, TDB_DATA)
+tdb_chainlock: int (struct tdb_context *, TDB_DATA)
+tdb_chainlock_mark: int (struct tdb_context *, TDB_DATA)
+tdb_chainlock_nonblock: int (struct tdb_context *, TDB_DATA)
+tdb_chainlock_read: int (struct tdb_context *, TDB_DATA)
+tdb_chainlock_unmark: int (struct tdb_context *, TDB_DATA)
+tdb_chainunlock: int (struct tdb_context *, TDB_DATA)
+tdb_chainunlock_read: int (struct tdb_context *, TDB_DATA)
+tdb_check: int (struct tdb_context *, int (*)(TDB_DATA, TDB_DATA, void *), void *)
+tdb_close: int (struct tdb_context *)
+tdb_delete: int (struct tdb_context *, TDB_DATA)
+tdb_dump_all: void (struct tdb_context *)
+tdb_enable_seqnum: void (struct tdb_context *)
+tdb_error: enum TDB_ERROR (struct tdb_context *)
+tdb_errorstr: const char *(struct tdb_context *)
+tdb_exists: int (struct tdb_context *, TDB_DATA)
+tdb_fd: int (struct tdb_context *)
+tdb_fetch: TDB_DATA (struct tdb_context *, TDB_DATA)
+tdb_firstkey: TDB_DATA (struct tdb_context *)
+tdb_freelist_size: int (struct tdb_context *)
+tdb_get_flags: int (struct tdb_context *)
+tdb_get_logging_private: void *(struct tdb_context *)
+tdb_get_seqnum: int (struct tdb_context *)
+tdb_hash_size: int (struct tdb_context *)
+tdb_increment_seqnum_nonblock: void (struct tdb_context *)
+tdb_jenkins_hash: unsigned int (TDB_DATA *)
+tdb_lock_nonblock: int (struct tdb_context *, int, int)
+tdb_lockall: int (struct tdb_context *)
+tdb_lockall_mark: int (struct tdb_context *)
+tdb_lockall_nonblock: int (struct tdb_context *)
+tdb_lockall_read: int (struct tdb_context *)
+tdb_lockall_read_nonblock: int (struct tdb_context *)
+tdb_lockall_unmark: int (struct tdb_context *)
+tdb_log_fn: tdb_log_func (struct tdb_context *)
+tdb_map_size: size_t (struct tdb_context *)
+tdb_name: const char *(struct tdb_context *)
+tdb_nextkey: TDB_DATA (struct tdb_context *, TDB_DATA)
+tdb_null: dptr = 0xXXXX, dsize = 0
+tdb_open: struct tdb_context *(const char *, int, int, int, mode_t)
+tdb_open_ex: struct tdb_context *(const char *, int, int, int, mode_t, const struct tdb_logging_context *, tdb_hash_func)
+tdb_parse_record: int (struct tdb_context *, TDB_DATA, int (*)(TDB_DATA, TDB_DATA, void *), void *)
+tdb_printfreelist: int (struct tdb_context *)
+tdb_remove_flags: void (struct tdb_context *, unsigned int)
+tdb_reopen: int (struct tdb_context *)
+tdb_reopen_all: int (int)
+tdb_repack: int (struct tdb_context *)
+tdb_rescue: int (struct tdb_context *, void (*)(TDB_DATA, TDB_DATA, void *), void *)
+tdb_runtime_check_for_robust_mutexes: bool (void)
+tdb_set_logging_function: void (struct tdb_context *, const struct tdb_logging_context *)
+tdb_set_max_dead: void (struct tdb_context *, int)
+tdb_setalarm_sigptr: void (struct tdb_context *, volatile sig_atomic_t *)
+tdb_store: int (struct tdb_context *, TDB_DATA, TDB_DATA, int)
+tdb_summary: char *(struct tdb_context *)
+tdb_transaction_cancel: int (struct tdb_context *)
+tdb_transaction_commit: int (struct tdb_context *)
+tdb_transaction_prepare_commit: int (struct tdb_context *)
+tdb_transaction_start: int (struct tdb_context *)
+tdb_transaction_start_nonblock: int (struct tdb_context *)
+tdb_transaction_write_lock_mark: int (struct tdb_context *)
+tdb_transaction_write_lock_unmark: int (struct tdb_context *)
+tdb_traverse: int (struct tdb_context *, tdb_traverse_func, void *)
+tdb_traverse_read: int (struct tdb_context *, tdb_traverse_func, void *)
+tdb_unlock: int (struct tdb_context *, int, int)
+tdb_unlockall: int (struct tdb_context *)
+tdb_unlockall_read: int (struct tdb_context *)
+tdb_validate_freelist: int (struct tdb_context *, int *)
+tdb_wipe_all: int (struct tdb_context *)
diff --git a/lib/tdb/wscript b/lib/tdb/wscript
index bae5f37..b283795 100644
--- a/lib/tdb/wscript
+++ b/lib/tdb/wscript
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 
 APPNAME = 'tdb'
-VERSION = '1.3.3'
+VERSION = '1.3.4'
 
 blddir = 'bin'
 
-- 
1.9.1
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 181 bytes
Desc: OpenPGP digital signature
URL: <http://lists.samba.org/pipermail/samba-technical/attachments/20141212/575e2161/attachment.pgp>


More information about the samba-technical mailing list