tdb transaction_prepare

Howard Chu hyc at highlandsun.com
Tue Mar 31 00:25:02 GMT 2009


tridge at samba.org wrote:
> Hi Howard,
>
>   >  Tridge, here's a first pass as we discussed on irc.
>
> Thanks! This looks really good, with only a couple of very minor
> quibbles.
>
> - I'd prefer a bool for 'prepared', and 'ret' over 'i' for the check
>    of the return value of tdb_transaction_prepare().

OK.
>
> - I think the function should be called
>    tdb_transaction_prepare_commit(), just to make it clear from the
>    name that it isn't a function called to prepare the start of a
>    transaction.

OK.
>
> - I think we probably want some paranoia checks on calling prepare
>    twice, and on doing any modify ops once a prepare is done. So if
>    someone tries to store a record after having done a prepare then
>    that should be a transaction error I think (either that, or we need
>    to remove the prepared flag)

OK. I added the check in transaction_read() as well, although, as you point out,
reads are harmless and we could choose to allow them.

> - we will need this to be added to the test suite, the headers and the
>    docs. We also need to make it clear in the docs that a failed
>    prepare does an implicit cancel. I think that is the right behaviour
>    (and it is what you coded), but it isn't necessarily obvious to a
>    user of this function.

I haven't looked at the test suite yet, but added a note to the README.

>   >  I think that zeroing out the recovery data from the magic_offset
>   >  needs to be moved into transaction_cancel but I haven't tried that
>   >  yet.
>
> I think you're right. I also think that is an existing bug, although
> it's probably harmless in most cases. If the tdb_expand_file() fails
> with the current code, we don't remove the recovery marker, so we
> would do a recovery on the next open. Zeroing the magic_offset marker
> in the transaction_cancel would fix that.
>
> Andrew just pointed out to me that we should be using
> tdb_transaction_prepare_commit() in the ldb code too, as ldb in Samba4
> uses a separate tdb for each ldap partition. By using
> tdb_transaction_prepare_commit() we can commit safely across all those
> databases.
>
> Thanks for suggesting this!
>
> Cheers, Tridge
>


-- 
   -- Howard Chu
   CTO, Symas Corp.           http://www.symas.com
   Director, Highland Sun     http://highlandsun.com/hyc/
   Chief Architect, OpenLDAP  http://www.openldap.org/project/
-------------- next part --------------
diff --git a/lib/tdb/common/transaction.c b/lib/tdb/common/transaction.c
index 7acda64..1cb7063 100644
--- a/lib/tdb/common/transaction.c
+++ b/lib/tdb/common/transaction.c
@@ -116,6 +116,10 @@ struct tdb_transaction {
 	   but don't create a new transaction */
 	int nesting;
 
+	/* set when a prepare has already occurred */
+	bool prepared;
+	tdb_off_t magic_offset;
+
 	/* old file size before transaction */
 	tdb_len_t old_map_size;
 };
@@ -130,6 +134,14 @@ static int transaction_read(struct tdb_context *tdb, tdb_off_t off, void *buf,
 {
 	uint32_t blk;
 
+	/* Only a commit is allowed on a prepared transaction */
+	if (tdb->transaction->prepared) {
+		tdb->ecode = TDB_ERR_EINVAL;
+		TDB_LOG((tdb, TDB_DEBUG_FATAL, "transaction_read: transaction already prepared, read not allowed\n"));
+		tdb->transaction->transaction_error = 1;
+		return -1;
+	}
+
 	/* break it down into block sized ops */
 	while (len + (off % tdb->transaction->block_size) > tdb->transaction->block_size) {
 		tdb_len_t len2 = tdb->transaction->block_size - (off % tdb->transaction->block_size);
@@ -187,6 +199,14 @@ static int transaction_write(struct tdb_context *tdb, tdb_off_t off,
 {
 	uint32_t blk;
 
+	/* Only a commit is allowed on a prepared transaction */
+	if (tdb->transaction->prepared) {
+		tdb->ecode = TDB_ERR_EINVAL;
+		TDB_LOG((tdb, TDB_DEBUG_FATAL, "transaction_write: transaction already prepared, write not allowed\n"));
+		tdb->transaction->transaction_error = 1;
+		return -1;
+	}
+
 	/* if the write is to a hash head, then update the transaction
 	   hash heads */
 	if (len == sizeof(tdb_off_t) && off >= FREELIST_TOP &&
@@ -498,11 +518,37 @@ fail:
 
 
 /*
+  sync to disk
+*/
+static int transaction_sync(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t length)
+{	
+	if (fsync(tdb->fd) != 0) {
+		tdb->ecode = TDB_ERR_IO;
+		TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction: fsync failed\n"));
+		return -1;
+	}
+#ifdef HAVE_MMAP
+	if (tdb->map_ptr) {
+		tdb_off_t moffset = offset & ~(tdb->page_size-1);
+		if (msync(moffset + (char *)tdb->map_ptr, 
+			  length + (offset - moffset), MS_SYNC) != 0) {
+			tdb->ecode = TDB_ERR_IO;
+			TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction: msync failed - %s\n",
+				 strerror(errno)));
+			return -1;
+		}
+	}
+#endif
+	return 0;
+}
+
+
+/*
   cancel the current transaction
 */
 int tdb_transaction_cancel(struct tdb_context *tdb)
 {	
-	int i;
+	int i, ret = 0;
 
 	if (tdb->transaction == NULL) {
 		TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_cancel: no transaction\n"));
@@ -525,6 +571,18 @@ int tdb_transaction_cancel(struct tdb_context *tdb)
 	}
 	SAFE_FREE(tdb->transaction->blocks);
 
+	if (tdb->transaction->magic_offset) {
+		const struct tdb_methods *methods = tdb->transaction->io_methods;
+		uint32_t zero = 0;
+
+		/* remove the recovery marker */
+		if (methods->tdb_write(tdb, tdb->transaction->magic_offset, &zero, 4) == -1 ||
+		transaction_sync(tdb, tdb->transaction->magic_offset, 4) == -1) {
+			TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_cancel: failed to remove recovery magic\n"));
+			ret = -1;
+		}
+	}
+
 	/* remove any global lock created during the transaction */
 	if (tdb->global_lock.count != 0) {
 		tdb_brlock(tdb, FREELIST_TOP, F_UNLCK, F_SETLKW, 0, 4*tdb->header.hash_size);
@@ -550,32 +608,7 @@ int tdb_transaction_cancel(struct tdb_context *tdb)
 	SAFE_FREE(tdb->transaction->hash_heads);
 	SAFE_FREE(tdb->transaction);
 	
-	return 0;
-}
-
-/*
-  sync to disk
-*/
-static int transaction_sync(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t length)
-{	
-	if (fsync(tdb->fd) != 0) {
-		tdb->ecode = TDB_ERR_IO;
-		TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction: fsync failed\n"));
-		return -1;
-	}
-#ifdef HAVE_MMAP
-	if (tdb->map_ptr) {
-		tdb_off_t moffset = offset & ~(tdb->page_size-1);
-		if (msync(moffset + (char *)tdb->map_ptr, 
-			  length + (offset - moffset), MS_SYNC) != 0) {
-			tdb->ecode = TDB_ERR_IO;
-			TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction: msync failed - %s\n",
-				 strerror(errno)));
-			return -1;
-		}
-	}
-#endif
-	return 0;
+	return ret;
 }
 
 
@@ -826,36 +859,39 @@ static int transaction_setup_recovery(struct tdb_context *tdb,
 }
 
 /*
-  commit the current transaction
+  prepare to commit the current transaction
 */
-int tdb_transaction_commit(struct tdb_context *tdb)
+int tdb_transaction_prepare_commit(struct tdb_context *tdb)
 {	
 	const struct tdb_methods *methods;
-	tdb_off_t magic_offset = 0;
-	uint32_t zero = 0;
 	int i;
 
 	if (tdb->transaction == NULL) {
-		TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_commit: no transaction\n"));
+		TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_prepare_commit: no transaction\n"));
+		return -1;
+	}
+
+	if (tdb->transaction->prepared) {
+		tdb->ecode = TDB_ERR_EINVAL;
+		tdb_transaction_cancel(tdb);
+		TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_prepare_commit: transaction already prepared\n"));
 		return -1;
 	}
 
 	if (tdb->transaction->transaction_error) {
 		tdb->ecode = TDB_ERR_IO;
 		tdb_transaction_cancel(tdb);
-		TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_commit: transaction error pending\n"));
+		TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_prepare_commit: transaction error pending\n"));
 		return -1;
 	}
 
 
 	if (tdb->transaction->nesting != 0) {
-		tdb->transaction->nesting--;
 		return 0;
 	}		
 
 	/* check for a null transaction */
 	if (tdb->transaction->blocks == NULL) {
-		tdb_transaction_cancel(tdb);
 		return 0;
 	}
 
@@ -865,14 +901,14 @@ int tdb_transaction_commit(struct tdb_context *tdb)
 	   nested their locks properly, so fail the transaction */
 	if (tdb->num_locks || tdb->global_lock.count) {
 		tdb->ecode = TDB_ERR_LOCK;
-		TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_commit: locks pending on commit\n"));
+		TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_prepare_commit: locks pending on commit\n"));
 		tdb_transaction_cancel(tdb);
 		return -1;
 	}
 
 	/* upgrade the main transaction lock region to a write lock */
 	if (tdb_brlock_upgrade(tdb, FREELIST_TOP, 0) == -1) {
-		TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: failed to upgrade hash locks\n"));
+		TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_prepare_commit: failed to upgrade hash locks\n"));
 		tdb->ecode = TDB_ERR_LOCK;
 		tdb_transaction_cancel(tdb);
 		return -1;
@@ -881,7 +917,7 @@ int tdb_transaction_commit(struct tdb_context *tdb)
 	/* get the global lock - this prevents new users attaching to the database
 	   during the commit */
 	if (tdb_brlock(tdb, GLOBAL_LOCK, F_WRLCK, F_SETLKW, 0, 1) == -1) {
-		TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_commit: failed to get global lock\n"));
+		TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_prepare_commit: failed to get global lock\n"));
 		tdb->ecode = TDB_ERR_LOCK;
 		tdb_transaction_cancel(tdb);
 		return -1;
@@ -889,21 +925,23 @@ int tdb_transaction_commit(struct tdb_context *tdb)
 
 	if (!(tdb->flags & TDB_NOSYNC)) {
 		/* write the recovery data to the end of the file */
-		if (transaction_setup_recovery(tdb, &magic_offset) == -1) {
-			TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_commit: failed to setup recovery data\n"));
+		if (transaction_setup_recovery(tdb, &tdb->transaction->magic_offset) == -1) {
+			TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_prepare_commit: failed to setup recovery data\n"));
 			tdb_brlock(tdb, GLOBAL_LOCK, F_UNLCK, F_SETLKW, 0, 1);
 			tdb_transaction_cancel(tdb);
 			return -1;
 		}
 	}
 
+	tdb->transaction->prepared = true;
+
 	/* expand the file to the new size if needed */
 	if (tdb->map_size != tdb->transaction->old_map_size) {
 		if (methods->tdb_expand_file(tdb, tdb->transaction->old_map_size, 
 					     tdb->map_size - 
 					     tdb->transaction->old_map_size) == -1) {
 			tdb->ecode = TDB_ERR_IO;
-			TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_commit: expansion failed\n"));
+			TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_prepare_commit: expansion failed\n"));
 			tdb_brlock(tdb, GLOBAL_LOCK, F_UNLCK, F_SETLKW, 0, 1);
 			tdb_transaction_cancel(tdb);
 			return -1;
@@ -912,6 +950,51 @@ int tdb_transaction_commit(struct tdb_context *tdb)
 		methods->tdb_oob(tdb, tdb->map_size + 1, 1);
 	}
 
+	/* Keep the global lock until the actual commit */
+
+	return 0;
+}
+
+/*
+  commit the current transaction
+*/
+int tdb_transaction_commit(struct tdb_context *tdb)
+{	
+	const struct tdb_methods *methods;
+	int i;
+
+	if (tdb->transaction == NULL) {
+		TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_commit: no transaction\n"));
+		return -1;
+	}
+
+	if (tdb->transaction->transaction_error) {
+		tdb->ecode = TDB_ERR_IO;
+		tdb_transaction_cancel(tdb);
+		TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_commit: transaction error pending\n"));
+		return -1;
+	}
+
+
+	if (tdb->transaction->nesting != 0) {
+		tdb->transaction->nesting--;
+		return 0;
+	}
+
+	/* check for a null transaction */
+	if (tdb->transaction->blocks == NULL) {
+		tdb_transaction_cancel(tdb);
+		return 0;
+	}
+
+	if (!tdb->transaction->prepared) {
+		int ret = tdb_transaction_prepare_commit(tdb);
+		if (ret)
+			return ret;
+	}
+
+	methods = tdb->transaction->io_methods;
+
 	/* perform all the writes */
 	for (i=0;i<tdb->transaction->num_blocks;i++) {
 		tdb_off_t offset;
@@ -953,17 +1036,6 @@ int tdb_transaction_commit(struct tdb_context *tdb)
 		if (transaction_sync(tdb, 0, tdb->map_size) == -1) {
 			return -1;
 		}
-
-		/* remove the recovery marker */
-		if (methods->tdb_write(tdb, magic_offset, &zero, 4) == -1) {
-			TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_commit: failed to remove recovery magic\n"));
-			return -1;
-		}
-
-		/* ensure the recovery marker has been removed on disk */
-		if (transaction_sync(tdb, magic_offset, 4) == -1) {
-			return -1;
-		}
 	}
 
 	tdb_brlock(tdb, GLOBAL_LOCK, F_UNLCK, F_SETLKW, 0, 1);
diff --git a/lib/tdb/docs/README b/lib/tdb/docs/README
index 63fcf5e..454e4ba 100644
--- a/lib/tdb/docs/README
+++ b/lib/tdb/docs/README
@@ -236,3 +236,12 @@ int tdb_transaction_commit(TDB_CONTEXT *tdb)
    commit a current transaction, updating the database and releasing
    the transaction locks.
 
+----------------------------------------------------------------------
+int tdb_transaction_prepare_commit(TDB_CONTEXT *tdb)
+
+   prepare to commit a current transaction, for two-phase commits.
+   Once prepared for commit, the only allowed calls are
+   tdb_transaction_commit() or tdb_transaction_cancel(). Preparing
+   allocates disk space for the pending updates, so a subsequent
+   commit should succeed (barring any hardware failures). If the
+   prepare fails, the transaction is implicitly cancelled.
diff --git a/lib/tdb/include/tdb.h b/lib/tdb/include/tdb.h
index 94b5e36..22496f5 100644
--- a/lib/tdb/include/tdb.h
+++ b/lib/tdb/include/tdb.h
@@ -129,6 +129,7 @@ int tdb_fd(struct tdb_context *tdb);
 tdb_log_func tdb_log_fn(struct tdb_context *tdb);
 void *tdb_get_logging_private(struct tdb_context *tdb);
 int tdb_transaction_start(struct tdb_context *tdb);
+int tdb_transaction_prepare_commit(struct tdb_context *tdb);
 int tdb_transaction_commit(struct tdb_context *tdb);
 int tdb_transaction_cancel(struct tdb_context *tdb);
 int tdb_transaction_recover(struct tdb_context *tdb);


More information about the samba-technical mailing list