[PATCH] minor tdb fixes

Tue Aug 22 20:28:58 UTC 2017

On Mon, Jul 10, 2017 at 12:00:05PM +0200, Ralph Böhme via samba-technical wrote:
> On Mon, Jul 03, 2017 at 10:25:19AM +0200, Ralph Böhme wrote:
> > On Mon, Jul 03, 2017 at 10:06:28AM +0200, Stefan Metzmacher wrote:
> > > > Attached is a small patchset that fixes the BUCKET() macro in tdb and the offset
> > > > calculations.
> > > > 
> > > > I stumpled upon this when looking at the `tdbtool DB list` output: the freelist
> > > > was showing the entries from another chain.
> > > > 
> > > > Turns out that the BUCKET(-1) macro returns wrong results because of sign
> > > > conversion, eg -1 is sign converted to an unsigned int 4294967295 and 4294967295
> > > > % 10 = 5. The expected result is -1, not 5.
> > > > 
> > > > This doesn't cause more severe issues because we consistenly lock the wrong
> > > > chain when trying to lock the freelist. It doesn't deadlock because in tdb_store
> > > > we lock the freelist before the hashchain. And finally in freelist.c we use the
> > > > FREELIST_TOP define directly, not the TDB_HASH_TOP macro that uses BUCKET.
> > > > 
> > > > Thoughts?
> > > 
> > > Sadly it's now a feature as existing tdb versions use this logic.
> > > So we can't just fix this as it's an incompatible change.
> > 
> > oh, that sucks, but you're right.
> > 
> > I'll prepare patches that fix `tdbtool DB list` differently and that document
> > the broken BUCKET macro.
> 
> attached.
> 
> Please review & push if ok. Thanks!

Finally got time to go through this. LGTM. Pushed !

> From 82c550b3116474250d63e9c58130ffe7693d7a36 Mon Sep 17 00:00:00 2001
> From: Ralph Boehme <slow at samba.org>
> Date: Sun, 2 Jul 2017 07:46:17 +0200
> Subject: [PATCH 1/4] tdb: rename struct tdb_traverse_lock hash member to list
> 
> The variable stores the hashtable bucket, not the hash. No change in
> behaviour.
> 
> Signed-off-by: Ralph Boehme <slow at samba.org>
> ---
>  lib/tdb/common/tdb_private.h |  2 +-
>  lib/tdb/common/traverse.c    | 40 ++++++++++++++++++++--------------------
>  2 files changed, 21 insertions(+), 21 deletions(-)
> 
> diff --git a/lib/tdb/common/tdb_private.h b/lib/tdb/common/tdb_private.h
> index 7ff29aa..e549af2 100644
> --- a/lib/tdb/common/tdb_private.h
> +++ b/lib/tdb/common/tdb_private.h
> @@ -178,7 +178,7 @@ struct tdb_lock_type {
>  struct tdb_traverse_lock {
>  	struct tdb_traverse_lock *next;
>  	uint32_t off;
> -	uint32_t hash;
> +	uint32_t list;
>  	int lock_rw;
>  };
>  
> diff --git a/lib/tdb/common/traverse.c b/lib/tdb/common/traverse.c
> index f62306e..9b83379 100644
> --- a/lib/tdb/common/traverse.c
> +++ b/lib/tdb/common/traverse.c
> @@ -37,8 +37,8 @@ static tdb_off_t tdb_next_lock(struct tdb_context *tdb, struct tdb_traverse_lock
>  	int want_next = (tlock->off != 0);
>  
>  	/* Lock each chain from the start one. */
> -	for (; tlock->hash < tdb->hash_size; tlock->hash++) {
> -		if (!tlock->off && tlock->hash != 0) {
> +	for (; tlock->list < tdb->hash_size; tlock->list++) {
> +		if (!tlock->off && tlock->list != 0) {
>  			/* this is an optimisation for the common case where
>  			   the hash chain is empty, which is particularly
>  			   common for the use of tdb with ldb, where large
> @@ -67,18 +67,18 @@ static tdb_off_t tdb_next_lock(struct tdb_context *tdb, struct tdb_traverse_lock
>  			   factor of around 80 in speed on a linux 2.6.x
>  			   system (testing using ldbtest).
>  			*/
> -			tdb->methods->next_hash_chain(tdb, &tlock->hash);
> -			if (tlock->hash == tdb->hash_size) {
> +			tdb->methods->next_hash_chain(tdb, &tlock->list);
> +			if (tlock->list == tdb->hash_size) {
>  				continue;
>  			}
>  		}
>  
> -		if (tdb_lock(tdb, tlock->hash, tlock->lock_rw) == -1)
> +		if (tdb_lock(tdb, tlock->list, tlock->lock_rw) == -1)
>  			return TDB_NEXT_LOCK_ERR;
>  
>  		/* No previous record?  Start at top of chain. */
>  		if (!tlock->off) {
> -			if (tdb_ofs_read(tdb, TDB_HASH_TOP(tlock->hash),
> +			if (tdb_ofs_read(tdb, TDB_HASH_TOP(tlock->list),
>  				     &tlock->off) == -1)
>  				goto fail;
>  		} else {
> @@ -121,7 +121,7 @@ static tdb_off_t tdb_next_lock(struct tdb_context *tdb, struct tdb_traverse_lock
>  			    tdb_do_delete(tdb, current, rec) != 0)
>  				goto fail;
>  		}
> -		tdb_unlock(tdb, tlock->hash, tlock->lock_rw);
> +		tdb_unlock(tdb, tlock->list, tlock->lock_rw);
>  		want_next = 0;
>  	}
>  	/* We finished iteration without finding anything */
> @@ -130,7 +130,7 @@ static tdb_off_t tdb_next_lock(struct tdb_context *tdb, struct tdb_traverse_lock
>  
>   fail:
>  	tlock->off = 0;
> -	if (tdb_unlock(tdb, tlock->hash, tlock->lock_rw) != 0)
> +	if (tdb_unlock(tdb, tlock->list, tlock->lock_rw) != 0)
>  		TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_next_lock: On error unlock failed!\n"));
>  	return TDB_NEXT_LOCK_ERR;
>  }
> @@ -181,7 +181,7 @@ static int tdb_traverse_internal(struct tdb_context *tdb,
>  
>  			if (key.dptr == NULL) {
>  				ret = -1;
> -				if (tdb_unlock(tdb, tl->hash, tl->lock_rw)
> +				if (tdb_unlock(tdb, tl->list, tl->lock_rw)
>  				    != 0) {
>  					goto out;
>  				}
> @@ -205,7 +205,7 @@ static int tdb_traverse_internal(struct tdb_context *tdb,
>  					       key.dptr, full_len, 0);
>  		if (nread == -1) {
>  			ret = -1;
> -			if (tdb_unlock(tdb, tl->hash, tl->lock_rw) != 0)
> +			if (tdb_unlock(tdb, tl->list, tl->lock_rw) != 0)
>  				goto out;
>  			if (tdb_unlock_record(tdb, tl->off) != 0)
>  				TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_traverse: key.dptr == NULL and unlock_record failed!\n"));
> @@ -218,7 +218,7 @@ static int tdb_traverse_internal(struct tdb_context *tdb,
>  		tdb_trace_1rec_retrec(tdb, "traverse", key, dbuf);
>  
>  		/* Drop chain lock, call out */
> -		if (tdb_unlock(tdb, tl->hash, tl->lock_rw) != 0) {
> +		if (tdb_unlock(tdb, tl->list, tl->lock_rw) != 0) {
>  			ret = -1;
>  			goto out;
>  		}
> @@ -314,7 +314,7 @@ _PUBLIC_ TDB_DATA tdb_firstkey(struct tdb_context *tdb)
>  	/* release any old lock */
>  	if (tdb_unlock_record(tdb, tdb->travlocks.off) != 0)
>  		return tdb_null;
> -	tdb->travlocks.off = tdb->travlocks.hash = 0;
> +	tdb->travlocks.off = tdb->travlocks.list = 0;
>  	tdb->travlocks.lock_rw = F_RDLCK;
>  
>  	/* Grab first record: locks chain and returned record. */
> @@ -330,7 +330,7 @@ _PUBLIC_ TDB_DATA tdb_firstkey(struct tdb_context *tdb)
>  	tdb_trace_retrec(tdb, "tdb_firstkey", key);
>  
>  	/* Unlock the hash chain of the record we just read. */
> -	if (tdb_unlock(tdb, tdb->travlocks.hash, tdb->travlocks.lock_rw) != 0)
> +	if (tdb_unlock(tdb, tdb->travlocks.list, tdb->travlocks.lock_rw) != 0)
>  		TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_firstkey: error occurred while tdb_unlocking!\n"));
>  	return key;
>  }
> @@ -338,7 +338,7 @@ _PUBLIC_ TDB_DATA tdb_firstkey(struct tdb_context *tdb)
>  /* find the next entry in the database, returning its key */
>  _PUBLIC_ TDB_DATA tdb_nextkey(struct tdb_context *tdb, TDB_DATA oldkey)
>  {
> -	uint32_t oldhash;
> +	uint32_t oldlist;
>  	TDB_DATA key = tdb_null;
>  	struct tdb_record rec;
>  	unsigned char *k = NULL;
> @@ -346,7 +346,7 @@ _PUBLIC_ TDB_DATA tdb_nextkey(struct tdb_context *tdb, TDB_DATA oldkey)
>  
>  	/* Is locked key the old key?  If so, traverse will be reliable. */
>  	if (tdb->travlocks.off) {
> -		if (tdb_lock(tdb,tdb->travlocks.hash,tdb->travlocks.lock_rw))
> +		if (tdb_lock(tdb,tdb->travlocks.list,tdb->travlocks.lock_rw))
>  			return tdb_null;
>  		if (tdb_rec_read(tdb, tdb->travlocks.off, &rec) == -1
>  		    || !(k = tdb_alloc_read(tdb,tdb->travlocks.off+sizeof(rec),
> @@ -359,7 +359,7 @@ _PUBLIC_ TDB_DATA tdb_nextkey(struct tdb_context *tdb, TDB_DATA oldkey)
>  				SAFE_FREE(k);
>  				return tdb_null;
>  			}
> -			if (tdb_unlock(tdb, tdb->travlocks.hash, tdb->travlocks.lock_rw) != 0) {
> +			if (tdb_unlock(tdb, tdb->travlocks.list, tdb->travlocks.lock_rw) != 0) {
>  				SAFE_FREE(k);
>  				return tdb_null;
>  			}
> @@ -376,13 +376,13 @@ _PUBLIC_ TDB_DATA tdb_nextkey(struct tdb_context *tdb, TDB_DATA oldkey)
>  			tdb_trace_1rec_retrec(tdb, "tdb_nextkey", oldkey, tdb_null);
>  			return tdb_null;
>  		}
> -		tdb->travlocks.hash = BUCKET(rec.full_hash);
> +		tdb->travlocks.list = BUCKET(rec.full_hash);
>  		if (tdb_lock_record(tdb, tdb->travlocks.off) != 0) {
>  			TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_nextkey: lock_record failed (%s)!\n", strerror(errno)));
>  			return tdb_null;
>  		}
>  	}
> -	oldhash = tdb->travlocks.hash;
> +	oldlist = tdb->travlocks.list;
>  
>  	/* Grab next record: locks chain and returned record,
>  	   unlocks old record */
> @@ -392,11 +392,11 @@ _PUBLIC_ TDB_DATA tdb_nextkey(struct tdb_context *tdb, TDB_DATA oldkey)
>  		key.dptr = tdb_alloc_read(tdb, tdb->travlocks.off+sizeof(rec),
>  					  key.dsize);
>  		/* Unlock the chain of this new record */
> -		if (tdb_unlock(tdb, tdb->travlocks.hash, tdb->travlocks.lock_rw) != 0)
> +		if (tdb_unlock(tdb, tdb->travlocks.list, tdb->travlocks.lock_rw) != 0)
>  			TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_nextkey: WARNING tdb_unlock failed!\n"));
>  	}
>  	/* Unlock the chain of old record */
> -	if (tdb_unlock(tdb, BUCKET(oldhash), tdb->travlocks.lock_rw) != 0)
> +	if (tdb_unlock(tdb, oldlist, tdb->travlocks.lock_rw) != 0)
>  		TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_nextkey: WARNING tdb_unlock failed!\n"));
>  	tdb_trace_1rec_retrec(tdb, "tdb_nextkey", oldkey, key);
>  	return key;
> -- 
> 2.9.4
> 
> 
> From 67430196ecb4d1b7a029dc63a9d700c906d1e876 Mon Sep 17 00:00:00 2001
> From: Ralph Boehme <slow at samba.org>
> Date: Sat, 1 Jul 2017 22:21:26 +0200
> Subject: [PATCH 2/4] tbd: BUCKET(-1) returns wrong offset because
>  tdb->hash_size is an unsigned int
> 
> The following C program demonstrates the issue:
> 
>   #include <stdio.h>
>   #include <stdlib.h>
>   #include <stdarg.h>
>   #include <stdbool.h>
>   #include <string.h>
>   #include <unistd.h>
>   #include <errno.h>
>   #include <dirent.h>
>   #include <sys/types.h>
> 
>   int main(int argc, char **argv)
>   {
>       int hash = -1;
>       int tsize_signed = 10;
>       unsigned int tsize_unsigned = 10;
>       int bucket;
> 
>   #define BUCKET(hash, tsize) ((hash) % (tsize))
> 
>       bucket = BUCKET(hash, tsize_unsigned);
>       printf("hash [%d] tsize [%d] bucket [%d]\n", hash, tsize_unsigned, bucket);
> 
>       bucket = BUCKET(hash, tsize_signed);
>       printf("hash [%d] tsize [%d] bucket [%d]\n", hash, tsize_signed, bucket);
> 
>       return 0;
>   }
> 
> Output:
> 
> $ ./tmp
> hash [-1] tsize [10] bucket [5]
> hash [-1] tsize [10] bucket [-1]
> 
> The first version is what the current BUCKET() macro does. As a result
> we lock the hashtable chain in bucket 5, NOT the freelist.
> 
> -1 is sign converted to an unsigned int 4294967295 and
> 
>   4294967295 % 10 = 5.
> 
> As all callers will lock the same wrong list consistently locking is
> still consistent.
> 
> Stumpled across this when looking at the output of `tdbtool DB list`
> which always printed some other hashchain and not the freelist.
> 
> The freelist bucket offset computation doesn't use the BUCKET macro in
> freelist.c (directly or indirectly) when working on the freelist, it
> just directly uses the FREELIST_TOP define, so this problem only affects
> tdbtool list.
> 
> Signed-off-by: Ralph Boehme <slow at samba.org>
> ---
>  lib/tdb/common/tdb_private.h | 16 ++++++++++++++++
>  1 file changed, 16 insertions(+)
> 
> diff --git a/lib/tdb/common/tdb_private.h b/lib/tdb/common/tdb_private.h
> index e549af2..a60d556 100644
> --- a/lib/tdb/common/tdb_private.h
> +++ b/lib/tdb/common/tdb_private.h
> @@ -126,6 +126,22 @@ void tdb_trace_2rec_retrec(struct tdb_context *tdb, const char *op,
>  #define SAFE_FREE(x) do { if ((x) != NULL) {free(x); (x)=NULL;} } while(0)
>  #endif
>  
> +/*
> + * Note: the BUCKET macro is broken as it returns an unexpected result when
> + * called as BUCKET(-1) for the freelist:
> + *
> + * -1 is sign converted to an unsigned int 4294967295 and then the modulo
> + * tdb->hashtable_size is computed. So rg with a hashtable_size of 10 the result
> + * is
> + *
> + *   4294967295 % hashtable_size = 5.
> + *
> + * where it should be -1 (C uses symmetric modulo).
> + *
> + * As all callers will lock the same wrong list consistently locking is still
> + * consistent. We can not change this without an incompatible on-disk format
> + * change, otherwise different tdb versions would use incompatible locking.
> + */
>  #define BUCKET(hash) ((hash) % tdb->hash_size)
>  
>  #define DOCONV() (tdb->flags & TDB_CONVERT)
> -- 
> 2.9.4
> 
> 
> From fcbad25dfe8cd463b047491f3121db3dc3633f23 Mon Sep 17 00:00:00 2001
> From: Ralph Boehme <slow at samba.org>
> Date: Sun, 2 Jul 2017 11:41:43 +0200
> Subject: [PATCH 3/4] tdb: document hashtable bucket offset calculation madness
> 
> Signed-off-by: Ralph Boehme <slow at samba.org>
> ---
>  lib/tdb/common/lock.c | 28 +++++++++++++++++++++++++++-
>  1 file changed, 27 insertions(+), 1 deletion(-)
> 
> diff --git a/lib/tdb/common/lock.c b/lib/tdb/common/lock.c
> index e330201..60956e7 100644
> --- a/lib/tdb/common/lock.c
> +++ b/lib/tdb/common/lock.c
> @@ -137,7 +137,33 @@ static int fcntl_unlock(struct tdb_context *tdb, int rw, off_t off, off_t len)
>  	return fcntl(tdb->fd, F_SETLKW, &fl);
>  }
>  
> -/* list -1 is the alloc list, otherwise a hash chain. */
> +/*
> + * Calculate the lock offset for a list
> + *
> + * list -1 is the freelist, otherwise a hash chain.
> + *
> + * Note that we consistently (but without real reason) lock hash chains at an
> + * offset that is 4 bytes below the real offset of the corresponding list head
> + * in the db.
> + *
> + * This it the memory layout of the hashchain array:
> + *
> + * FREELIST_TOP + 0 = freelist
> + * FREELIST_TOP + 4 = hashtbale list 0
> + * FREELIST_TOP + 8 = hashtbale list 1
> + * ...
> + *
> + * Otoh lock_offset computes:
> + *
> + * freelist = FREELIST_TOP - 4
> + * list 0   = FREELIST_TOP + 0
> + * list 1   = FREELIST_TOP + 4
> + * ...
> + *
> + * Unfortunately we can't change this calculation in order to align the locking
> + * offset with the memory layout, as that would make the locking incompatible
> + * between different tdb versions.
> + */
>  static tdb_off_t lock_offset(int list)
>  {
>  	return FREELIST_TOP + 4*list;
> -- 
> 2.9.4
> 
> 
> From b012da31a76cb2f4ce267daf2595505b7135dcd1 Mon Sep 17 00:00:00 2001
> From: Ralph Boehme <slow at samba.org>
> Date: Sun, 9 Jul 2017 18:39:27 +0200
> Subject: [PATCH 4/4] tdb: fix tbdtool list freelist output
> 
> Due to the non-fixable bug in the BUCKET macro tdbtool list printed some
> other hash chainlist, not the freelist.
> 
> Bug: https://bugzilla.samba.org/show_bug.cgi?id=12888
> 
> Signed-off-by: Ralph Boehme <slow at samba.org>
> ---
>  lib/tdb/common/dump.c | 6 +++++-
>  1 file changed, 5 insertions(+), 1 deletion(-)
> 
> diff --git a/lib/tdb/common/dump.c b/lib/tdb/common/dump.c
> index 5f6a78b..73286b8 100644
> --- a/lib/tdb/common/dump.c
> +++ b/lib/tdb/common/dump.c
> @@ -62,7 +62,11 @@ static int tdb_dump_chain(struct tdb_context *tdb, int i)
>  {
>  	tdb_off_t rec_ptr, top;
>  
> -	top = TDB_HASH_TOP(i);
> +	if (i == -1) {
> +		top = FREELIST_TOP;
> +	} else {
> +		top = TDB_HASH_TOP(i);
> +	}
>  
>  	if (tdb_lock(tdb, i, F_WRLCK) != 0)
>  		return -1;
> -- 
> 2.9.4
>