[PATCH] minor tdb fixes

Mon Jul 10 10:00:05 UTC 2017

On Mon, Jul 03, 2017 at 10:25:19AM +0200, Ralph Böhme wrote:
> On Mon, Jul 03, 2017 at 10:06:28AM +0200, Stefan Metzmacher wrote:
> > > Attached is a small patchset that fixes the BUCKET() macro in tdb and the offset
> > > calculations.
> > > 
> > > I stumpled upon this when looking at the `tdbtool DB list` output: the freelist
> > > was showing the entries from another chain.
> > > 
> > > Turns out that the BUCKET(-1) macro returns wrong results because of sign
> > > conversion, eg -1 is sign converted to an unsigned int 4294967295 and 4294967295
> > > % 10 = 5. The expected result is -1, not 5.
> > > 
> > > This doesn't cause more severe issues because we consistenly lock the wrong
> > > chain when trying to lock the freelist. It doesn't deadlock because in tdb_store
> > > we lock the freelist before the hashchain. And finally in freelist.c we use the
> > > FREELIST_TOP define directly, not the TDB_HASH_TOP macro that uses BUCKET.
> > > 
> > > Thoughts?
> > 
> > Sadly it's now a feature as existing tdb versions use this logic.
> > So we can't just fix this as it's an incompatible change.
> 
> oh, that sucks, but you're right.
> 
> I'll prepare patches that fix `tdbtool DB list` differently and that document
> the broken BUCKET macro.

attached.

Please review & push if ok. Thanks!

Cheerio!
-slow
-------------- next part --------------
From 82c550b3116474250d63e9c58130ffe7693d7a36 Mon Sep 17 00:00:00 2001
From: Ralph Boehme <slow at samba.org>
Date: Sun, 2 Jul 2017 07:46:17 +0200
Subject: [PATCH 1/4] tdb: rename struct tdb_traverse_lock hash member to list

The variable stores the hashtable bucket, not the hash. No change in
behaviour.

Signed-off-by: Ralph Boehme <slow at samba.org>
---
 lib/tdb/common/tdb_private.h |  2 +-
 lib/tdb/common/traverse.c    | 40 ++++++++++++++++++++--------------------
 2 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/lib/tdb/common/tdb_private.h b/lib/tdb/common/tdb_private.h
index 7ff29aa..e549af2 100644
--- a/lib/tdb/common/tdb_private.h
+++ b/lib/tdb/common/tdb_private.h
@@ -178,7 +178,7 @@ struct tdb_lock_type {
 struct tdb_traverse_lock {
 	struct tdb_traverse_lock *next;
 	uint32_t off;
-	uint32_t hash;
+	uint32_t list;
 	int lock_rw;
 };
 
diff --git a/lib/tdb/common/traverse.c b/lib/tdb/common/traverse.c
index f62306e..9b83379 100644
--- a/lib/tdb/common/traverse.c
+++ b/lib/tdb/common/traverse.c
@@ -37,8 +37,8 @@ static tdb_off_t tdb_next_lock(struct tdb_context *tdb, struct tdb_traverse_lock
 	int want_next = (tlock->off != 0);
 
 	/* Lock each chain from the start one. */
-	for (; tlock->hash < tdb->hash_size; tlock->hash++) {
-		if (!tlock->off && tlock->hash != 0) {
+	for (; tlock->list < tdb->hash_size; tlock->list++) {
+		if (!tlock->off && tlock->list != 0) {
 			/* this is an optimisation for the common case where
 			   the hash chain is empty, which is particularly
 			   common for the use of tdb with ldb, where large
@@ -67,18 +67,18 @@ static tdb_off_t tdb_next_lock(struct tdb_context *tdb, struct tdb_traverse_lock
 			   factor of around 80 in speed on a linux 2.6.x
 			   system (testing using ldbtest).
 			*/
-			tdb->methods->next_hash_chain(tdb, &tlock->hash);
-			if (tlock->hash == tdb->hash_size) {
+			tdb->methods->next_hash_chain(tdb, &tlock->list);
+			if (tlock->list == tdb->hash_size) {
 				continue;
 			}
 		}
 
-		if (tdb_lock(tdb, tlock->hash, tlock->lock_rw) == -1)
+		if (tdb_lock(tdb, tlock->list, tlock->lock_rw) == -1)
 			return TDB_NEXT_LOCK_ERR;
 
 		/* No previous record?  Start at top of chain. */
 		if (!tlock->off) {
-			if (tdb_ofs_read(tdb, TDB_HASH_TOP(tlock->hash),
+			if (tdb_ofs_read(tdb, TDB_HASH_TOP(tlock->list),
 				     &tlock->off) == -1)
 				goto fail;
 		} else {
@@ -121,7 +121,7 @@ static tdb_off_t tdb_next_lock(struct tdb_context *tdb, struct tdb_traverse_lock
 			    tdb_do_delete(tdb, current, rec) != 0)
 				goto fail;
 		}
-		tdb_unlock(tdb, tlock->hash, tlock->lock_rw);
+		tdb_unlock(tdb, tlock->list, tlock->lock_rw);
 		want_next = 0;
 	}
 	/* We finished iteration without finding anything */
@@ -130,7 +130,7 @@ static tdb_off_t tdb_next_lock(struct tdb_context *tdb, struct tdb_traverse_lock
 
  fail:
 	tlock->off = 0;
-	if (tdb_unlock(tdb, tlock->hash, tlock->lock_rw) != 0)
+	if (tdb_unlock(tdb, tlock->list, tlock->lock_rw) != 0)
 		TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_next_lock: On error unlock failed!\n"));
 	return TDB_NEXT_LOCK_ERR;
 }
@@ -181,7 +181,7 @@ static int tdb_traverse_internal(struct tdb_context *tdb,
 
 			if (key.dptr == NULL) {
 				ret = -1;
-				if (tdb_unlock(tdb, tl->hash, tl->lock_rw)
+				if (tdb_unlock(tdb, tl->list, tl->lock_rw)
 				    != 0) {
 					goto out;
 				}
@@ -205,7 +205,7 @@ static int tdb_traverse_internal(struct tdb_context *tdb,
 					       key.dptr, full_len, 0);
 		if (nread == -1) {
 			ret = -1;
-			if (tdb_unlock(tdb, tl->hash, tl->lock_rw) != 0)
+			if (tdb_unlock(tdb, tl->list, tl->lock_rw) != 0)
 				goto out;
 			if (tdb_unlock_record(tdb, tl->off) != 0)
 				TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_traverse: key.dptr == NULL and unlock_record failed!\n"));
@@ -218,7 +218,7 @@ static int tdb_traverse_internal(struct tdb_context *tdb,
 		tdb_trace_1rec_retrec(tdb, "traverse", key, dbuf);
 
 		/* Drop chain lock, call out */
-		if (tdb_unlock(tdb, tl->hash, tl->lock_rw) != 0) {
+		if (tdb_unlock(tdb, tl->list, tl->lock_rw) != 0) {
 			ret = -1;
 			goto out;
 		}
@@ -314,7 +314,7 @@ _PUBLIC_ TDB_DATA tdb_firstkey(struct tdb_context *tdb)
 	/* release any old lock */
 	if (tdb_unlock_record(tdb, tdb->travlocks.off) != 0)
 		return tdb_null;
-	tdb->travlocks.off = tdb->travlocks.hash = 0;
+	tdb->travlocks.off = tdb->travlocks.list = 0;
 	tdb->travlocks.lock_rw = F_RDLCK;
 
 	/* Grab first record: locks chain and returned record. */
@@ -330,7 +330,7 @@ _PUBLIC_ TDB_DATA tdb_firstkey(struct tdb_context *tdb)
 	tdb_trace_retrec(tdb, "tdb_firstkey", key);
 
 	/* Unlock the hash chain of the record we just read. */
-	if (tdb_unlock(tdb, tdb->travlocks.hash, tdb->travlocks.lock_rw) != 0)
+	if (tdb_unlock(tdb, tdb->travlocks.list, tdb->travlocks.lock_rw) != 0)
 		TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_firstkey: error occurred while tdb_unlocking!\n"));
 	return key;
 }
@@ -338,7 +338,7 @@ _PUBLIC_ TDB_DATA tdb_firstkey(struct tdb_context *tdb)
 /* find the next entry in the database, returning its key */
 _PUBLIC_ TDB_DATA tdb_nextkey(struct tdb_context *tdb, TDB_DATA oldkey)
 {
-	uint32_t oldhash;
+	uint32_t oldlist;
 	TDB_DATA key = tdb_null;
 	struct tdb_record rec;
 	unsigned char *k = NULL;
@@ -346,7 +346,7 @@ _PUBLIC_ TDB_DATA tdb_nextkey(struct tdb_context *tdb, TDB_DATA oldkey)
 
 	/* Is locked key the old key?  If so, traverse will be reliable. */
 	if (tdb->travlocks.off) {
-		if (tdb_lock(tdb,tdb->travlocks.hash,tdb->travlocks.lock_rw))
+		if (tdb_lock(tdb,tdb->travlocks.list,tdb->travlocks.lock_rw))
 			return tdb_null;
 		if (tdb_rec_read(tdb, tdb->travlocks.off, &rec) == -1
 		    || !(k = tdb_alloc_read(tdb,tdb->travlocks.off+sizeof(rec),
@@ -359,7 +359,7 @@ _PUBLIC_ TDB_DATA tdb_nextkey(struct tdb_context *tdb, TDB_DATA oldkey)
 				SAFE_FREE(k);
 				return tdb_null;
 			}
-			if (tdb_unlock(tdb, tdb->travlocks.hash, tdb->travlocks.lock_rw) != 0) {
+			if (tdb_unlock(tdb, tdb->travlocks.list, tdb->travlocks.lock_rw) != 0) {
 				SAFE_FREE(k);
 				return tdb_null;
 			}
@@ -376,13 +376,13 @@ _PUBLIC_ TDB_DATA tdb_nextkey(struct tdb_context *tdb, TDB_DATA oldkey)
 			tdb_trace_1rec_retrec(tdb, "tdb_nextkey", oldkey, tdb_null);
 			return tdb_null;
 		}
-		tdb->travlocks.hash = BUCKET(rec.full_hash);
+		tdb->travlocks.list = BUCKET(rec.full_hash);
 		if (tdb_lock_record(tdb, tdb->travlocks.off) != 0) {
 			TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_nextkey: lock_record failed (%s)!\n", strerror(errno)));
 			return tdb_null;
 		}
 	}
-	oldhash = tdb->travlocks.hash;
+	oldlist = tdb->travlocks.list;
 
 	/* Grab next record: locks chain and returned record,
 	   unlocks old record */
@@ -392,11 +392,11 @@ _PUBLIC_ TDB_DATA tdb_nextkey(struct tdb_context *tdb, TDB_DATA oldkey)
 		key.dptr = tdb_alloc_read(tdb, tdb->travlocks.off+sizeof(rec),
 					  key.dsize);
 		/* Unlock the chain of this new record */
-		if (tdb_unlock(tdb, tdb->travlocks.hash, tdb->travlocks.lock_rw) != 0)
+		if (tdb_unlock(tdb, tdb->travlocks.list, tdb->travlocks.lock_rw) != 0)
 			TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_nextkey: WARNING tdb_unlock failed!\n"));
 	}
 	/* Unlock the chain of old record */
-	if (tdb_unlock(tdb, BUCKET(oldhash), tdb->travlocks.lock_rw) != 0)
+	if (tdb_unlock(tdb, oldlist, tdb->travlocks.lock_rw) != 0)
 		TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_nextkey: WARNING tdb_unlock failed!\n"));
 	tdb_trace_1rec_retrec(tdb, "tdb_nextkey", oldkey, key);
 	return key;
-- 
2.9.4


From 67430196ecb4d1b7a029dc63a9d700c906d1e876 Mon Sep 17 00:00:00 2001
From: Ralph Boehme <slow at samba.org>
Date: Sat, 1 Jul 2017 22:21:26 +0200
Subject: [PATCH 2/4] tbd: BUCKET(-1) returns wrong offset because
 tdb->hash_size is an unsigned int

The following C program demonstrates the issue:

  #include <stdio.h>
  #include <stdlib.h>
  #include <stdarg.h>
  #include <stdbool.h>
  #include <string.h>
  #include <unistd.h>
  #include <errno.h>
  #include <dirent.h>
  #include <sys/types.h>

  int main(int argc, char **argv)
  {
      int hash = -1;
      int tsize_signed = 10;
      unsigned int tsize_unsigned = 10;
      int bucket;

  #define BUCKET(hash, tsize) ((hash) % (tsize))

      bucket = BUCKET(hash, tsize_unsigned);
      printf("hash [%d] tsize [%d] bucket [%d]\n", hash, tsize_unsigned, bucket);

      bucket = BUCKET(hash, tsize_signed);
      printf("hash [%d] tsize [%d] bucket [%d]\n", hash, tsize_signed, bucket);

      return 0;
  }

Output:

$ ./tmp
hash [-1] tsize [10] bucket [5]
hash [-1] tsize [10] bucket [-1]

The first version is what the current BUCKET() macro does. As a result
we lock the hashtable chain in bucket 5, NOT the freelist.

-1 is sign converted to an unsigned int 4294967295 and

  4294967295 % 10 = 5.

As all callers will lock the same wrong list consistently locking is
still consistent.

Stumpled across this when looking at the output of `tdbtool DB list`
which always printed some other hashchain and not the freelist.

The freelist bucket offset computation doesn't use the BUCKET macro in
freelist.c (directly or indirectly) when working on the freelist, it
just directly uses the FREELIST_TOP define, so this problem only affects
tdbtool list.

Signed-off-by: Ralph Boehme <slow at samba.org>
---
 lib/tdb/common/tdb_private.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/lib/tdb/common/tdb_private.h b/lib/tdb/common/tdb_private.h
index e549af2..a60d556 100644
--- a/lib/tdb/common/tdb_private.h
+++ b/lib/tdb/common/tdb_private.h
@@ -126,6 +126,22 @@ void tdb_trace_2rec_retrec(struct tdb_context *tdb, const char *op,
 #define SAFE_FREE(x) do { if ((x) != NULL) {free(x); (x)=NULL;} } while(0)
 #endif
 
+/*
+ * Note: the BUCKET macro is broken as it returns an unexpected result when
+ * called as BUCKET(-1) for the freelist:
+ *
+ * -1 is sign converted to an unsigned int 4294967295 and then the modulo
+ * tdb->hashtable_size is computed. So rg with a hashtable_size of 10 the result
+ * is
+ *
+ *   4294967295 % hashtable_size = 5.
+ *
+ * where it should be -1 (C uses symmetric modulo).
+ *
+ * As all callers will lock the same wrong list consistently locking is still
+ * consistent. We can not change this without an incompatible on-disk format
+ * change, otherwise different tdb versions would use incompatible locking.
+ */
 #define BUCKET(hash) ((hash) % tdb->hash_size)
 
 #define DOCONV() (tdb->flags & TDB_CONVERT)
-- 
2.9.4


From fcbad25dfe8cd463b047491f3121db3dc3633f23 Mon Sep 17 00:00:00 2001
From: Ralph Boehme <slow at samba.org>
Date: Sun, 2 Jul 2017 11:41:43 +0200
Subject: [PATCH 3/4] tdb: document hashtable bucket offset calculation madness

Signed-off-by: Ralph Boehme <slow at samba.org>
---
 lib/tdb/common/lock.c | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/lib/tdb/common/lock.c b/lib/tdb/common/lock.c
index e330201..60956e7 100644
--- a/lib/tdb/common/lock.c
+++ b/lib/tdb/common/lock.c
@@ -137,7 +137,33 @@ static int fcntl_unlock(struct tdb_context *tdb, int rw, off_t off, off_t len)
 	return fcntl(tdb->fd, F_SETLKW, &fl);
 }
 
-/* list -1 is the alloc list, otherwise a hash chain. */
+/*
+ * Calculate the lock offset for a list
+ *
+ * list -1 is the freelist, otherwise a hash chain.
+ *
+ * Note that we consistently (but without real reason) lock hash chains at an
+ * offset that is 4 bytes below the real offset of the corresponding list head
+ * in the db.
+ *
+ * This it the memory layout of the hashchain array:
+ *
+ * FREELIST_TOP + 0 = freelist
+ * FREELIST_TOP + 4 = hashtbale list 0
+ * FREELIST_TOP + 8 = hashtbale list 1
+ * ...
+ *
+ * Otoh lock_offset computes:
+ *
+ * freelist = FREELIST_TOP - 4
+ * list 0   = FREELIST_TOP + 0
+ * list 1   = FREELIST_TOP + 4
+ * ...
+ *
+ * Unfortunately we can't change this calculation in order to align the locking
+ * offset with the memory layout, as that would make the locking incompatible
+ * between different tdb versions.
+ */
 static tdb_off_t lock_offset(int list)
 {
 	return FREELIST_TOP + 4*list;
-- 
2.9.4


From b012da31a76cb2f4ce267daf2595505b7135dcd1 Mon Sep 17 00:00:00 2001
From: Ralph Boehme <slow at samba.org>
Date: Sun, 9 Jul 2017 18:39:27 +0200
Subject: [PATCH 4/4] tdb: fix tbdtool list freelist output

Due to the non-fixable bug in the BUCKET macro tdbtool list printed some
other hash chainlist, not the freelist.

Bug: https://bugzilla.samba.org/show_bug.cgi?id=12888

Signed-off-by: Ralph Boehme <slow at samba.org>
---
 lib/tdb/common/dump.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/lib/tdb/common/dump.c b/lib/tdb/common/dump.c
index 5f6a78b..73286b8 100644
--- a/lib/tdb/common/dump.c
+++ b/lib/tdb/common/dump.c
@@ -62,7 +62,11 @@ static int tdb_dump_chain(struct tdb_context *tdb, int i)
 {
 	tdb_off_t rec_ptr, top;
 
-	top = TDB_HASH_TOP(i);
+	if (i == -1) {
+		top = FREELIST_TOP;
+	} else {
+		top = TDB_HASH_TOP(i);
+	}
 
 	if (tdb_lock(tdb, i, F_WRLCK) != 0)
 		return -1;
-- 
2.9.4