[PATCH] minor tdb fixes
Jeremy Allison
jra at samba.org
Tue Aug 22 20:28:58 UTC 2017
On Mon, Jul 10, 2017 at 12:00:05PM +0200, Ralph Böhme via samba-technical wrote:
> On Mon, Jul 03, 2017 at 10:25:19AM +0200, Ralph Böhme wrote:
> > On Mon, Jul 03, 2017 at 10:06:28AM +0200, Stefan Metzmacher wrote:
> > > > Attached is a small patchset that fixes the BUCKET() macro in tdb and the offset
> > > > calculations.
> > > >
> > > > I stumpled upon this when looking at the `tdbtool DB list` output: the freelist
> > > > was showing the entries from another chain.
> > > >
> > > > Turns out that the BUCKET(-1) macro returns wrong results because of sign
> > > > conversion, eg -1 is sign converted to an unsigned int 4294967295 and 4294967295
> > > > % 10 = 5. The expected result is -1, not 5.
> > > >
> > > > This doesn't cause more severe issues because we consistenly lock the wrong
> > > > chain when trying to lock the freelist. It doesn't deadlock because in tdb_store
> > > > we lock the freelist before the hashchain. And finally in freelist.c we use the
> > > > FREELIST_TOP define directly, not the TDB_HASH_TOP macro that uses BUCKET.
> > > >
> > > > Thoughts?
> > >
> > > Sadly it's now a feature as existing tdb versions use this logic.
> > > So we can't just fix this as it's an incompatible change.
> >
> > oh, that sucks, but you're right.
> >
> > I'll prepare patches that fix `tdbtool DB list` differently and that document
> > the broken BUCKET macro.
>
> attached.
>
> Please review & push if ok. Thanks!
Finally got time to go through this. LGTM. Pushed !
> From 82c550b3116474250d63e9c58130ffe7693d7a36 Mon Sep 17 00:00:00 2001
> From: Ralph Boehme <slow at samba.org>
> Date: Sun, 2 Jul 2017 07:46:17 +0200
> Subject: [PATCH 1/4] tdb: rename struct tdb_traverse_lock hash member to list
>
> The variable stores the hashtable bucket, not the hash. No change in
> behaviour.
>
> Signed-off-by: Ralph Boehme <slow at samba.org>
> ---
> lib/tdb/common/tdb_private.h | 2 +-
> lib/tdb/common/traverse.c | 40 ++++++++++++++++++++--------------------
> 2 files changed, 21 insertions(+), 21 deletions(-)
>
> diff --git a/lib/tdb/common/tdb_private.h b/lib/tdb/common/tdb_private.h
> index 7ff29aa..e549af2 100644
> --- a/lib/tdb/common/tdb_private.h
> +++ b/lib/tdb/common/tdb_private.h
> @@ -178,7 +178,7 @@ struct tdb_lock_type {
> struct tdb_traverse_lock {
> struct tdb_traverse_lock *next;
> uint32_t off;
> - uint32_t hash;
> + uint32_t list;
> int lock_rw;
> };
>
> diff --git a/lib/tdb/common/traverse.c b/lib/tdb/common/traverse.c
> index f62306e..9b83379 100644
> --- a/lib/tdb/common/traverse.c
> +++ b/lib/tdb/common/traverse.c
> @@ -37,8 +37,8 @@ static tdb_off_t tdb_next_lock(struct tdb_context *tdb, struct tdb_traverse_lock
> int want_next = (tlock->off != 0);
>
> /* Lock each chain from the start one. */
> - for (; tlock->hash < tdb->hash_size; tlock->hash++) {
> - if (!tlock->off && tlock->hash != 0) {
> + for (; tlock->list < tdb->hash_size; tlock->list++) {
> + if (!tlock->off && tlock->list != 0) {
> /* this is an optimisation for the common case where
> the hash chain is empty, which is particularly
> common for the use of tdb with ldb, where large
> @@ -67,18 +67,18 @@ static tdb_off_t tdb_next_lock(struct tdb_context *tdb, struct tdb_traverse_lock
> factor of around 80 in speed on a linux 2.6.x
> system (testing using ldbtest).
> */
> - tdb->methods->next_hash_chain(tdb, &tlock->hash);
> - if (tlock->hash == tdb->hash_size) {
> + tdb->methods->next_hash_chain(tdb, &tlock->list);
> + if (tlock->list == tdb->hash_size) {
> continue;
> }
> }
>
> - if (tdb_lock(tdb, tlock->hash, tlock->lock_rw) == -1)
> + if (tdb_lock(tdb, tlock->list, tlock->lock_rw) == -1)
> return TDB_NEXT_LOCK_ERR;
>
> /* No previous record? Start at top of chain. */
> if (!tlock->off) {
> - if (tdb_ofs_read(tdb, TDB_HASH_TOP(tlock->hash),
> + if (tdb_ofs_read(tdb, TDB_HASH_TOP(tlock->list),
> &tlock->off) == -1)
> goto fail;
> } else {
> @@ -121,7 +121,7 @@ static tdb_off_t tdb_next_lock(struct tdb_context *tdb, struct tdb_traverse_lock
> tdb_do_delete(tdb, current, rec) != 0)
> goto fail;
> }
> - tdb_unlock(tdb, tlock->hash, tlock->lock_rw);
> + tdb_unlock(tdb, tlock->list, tlock->lock_rw);
> want_next = 0;
> }
> /* We finished iteration without finding anything */
> @@ -130,7 +130,7 @@ static tdb_off_t tdb_next_lock(struct tdb_context *tdb, struct tdb_traverse_lock
>
> fail:
> tlock->off = 0;
> - if (tdb_unlock(tdb, tlock->hash, tlock->lock_rw) != 0)
> + if (tdb_unlock(tdb, tlock->list, tlock->lock_rw) != 0)
> TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_next_lock: On error unlock failed!\n"));
> return TDB_NEXT_LOCK_ERR;
> }
> @@ -181,7 +181,7 @@ static int tdb_traverse_internal(struct tdb_context *tdb,
>
> if (key.dptr == NULL) {
> ret = -1;
> - if (tdb_unlock(tdb, tl->hash, tl->lock_rw)
> + if (tdb_unlock(tdb, tl->list, tl->lock_rw)
> != 0) {
> goto out;
> }
> @@ -205,7 +205,7 @@ static int tdb_traverse_internal(struct tdb_context *tdb,
> key.dptr, full_len, 0);
> if (nread == -1) {
> ret = -1;
> - if (tdb_unlock(tdb, tl->hash, tl->lock_rw) != 0)
> + if (tdb_unlock(tdb, tl->list, tl->lock_rw) != 0)
> goto out;
> if (tdb_unlock_record(tdb, tl->off) != 0)
> TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_traverse: key.dptr == NULL and unlock_record failed!\n"));
> @@ -218,7 +218,7 @@ static int tdb_traverse_internal(struct tdb_context *tdb,
> tdb_trace_1rec_retrec(tdb, "traverse", key, dbuf);
>
> /* Drop chain lock, call out */
> - if (tdb_unlock(tdb, tl->hash, tl->lock_rw) != 0) {
> + if (tdb_unlock(tdb, tl->list, tl->lock_rw) != 0) {
> ret = -1;
> goto out;
> }
> @@ -314,7 +314,7 @@ _PUBLIC_ TDB_DATA tdb_firstkey(struct tdb_context *tdb)
> /* release any old lock */
> if (tdb_unlock_record(tdb, tdb->travlocks.off) != 0)
> return tdb_null;
> - tdb->travlocks.off = tdb->travlocks.hash = 0;
> + tdb->travlocks.off = tdb->travlocks.list = 0;
> tdb->travlocks.lock_rw = F_RDLCK;
>
> /* Grab first record: locks chain and returned record. */
> @@ -330,7 +330,7 @@ _PUBLIC_ TDB_DATA tdb_firstkey(struct tdb_context *tdb)
> tdb_trace_retrec(tdb, "tdb_firstkey", key);
>
> /* Unlock the hash chain of the record we just read. */
> - if (tdb_unlock(tdb, tdb->travlocks.hash, tdb->travlocks.lock_rw) != 0)
> + if (tdb_unlock(tdb, tdb->travlocks.list, tdb->travlocks.lock_rw) != 0)
> TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_firstkey: error occurred while tdb_unlocking!\n"));
> return key;
> }
> @@ -338,7 +338,7 @@ _PUBLIC_ TDB_DATA tdb_firstkey(struct tdb_context *tdb)
> /* find the next entry in the database, returning its key */
> _PUBLIC_ TDB_DATA tdb_nextkey(struct tdb_context *tdb, TDB_DATA oldkey)
> {
> - uint32_t oldhash;
> + uint32_t oldlist;
> TDB_DATA key = tdb_null;
> struct tdb_record rec;
> unsigned char *k = NULL;
> @@ -346,7 +346,7 @@ _PUBLIC_ TDB_DATA tdb_nextkey(struct tdb_context *tdb, TDB_DATA oldkey)
>
> /* Is locked key the old key? If so, traverse will be reliable. */
> if (tdb->travlocks.off) {
> - if (tdb_lock(tdb,tdb->travlocks.hash,tdb->travlocks.lock_rw))
> + if (tdb_lock(tdb,tdb->travlocks.list,tdb->travlocks.lock_rw))
> return tdb_null;
> if (tdb_rec_read(tdb, tdb->travlocks.off, &rec) == -1
> || !(k = tdb_alloc_read(tdb,tdb->travlocks.off+sizeof(rec),
> @@ -359,7 +359,7 @@ _PUBLIC_ TDB_DATA tdb_nextkey(struct tdb_context *tdb, TDB_DATA oldkey)
> SAFE_FREE(k);
> return tdb_null;
> }
> - if (tdb_unlock(tdb, tdb->travlocks.hash, tdb->travlocks.lock_rw) != 0) {
> + if (tdb_unlock(tdb, tdb->travlocks.list, tdb->travlocks.lock_rw) != 0) {
> SAFE_FREE(k);
> return tdb_null;
> }
> @@ -376,13 +376,13 @@ _PUBLIC_ TDB_DATA tdb_nextkey(struct tdb_context *tdb, TDB_DATA oldkey)
> tdb_trace_1rec_retrec(tdb, "tdb_nextkey", oldkey, tdb_null);
> return tdb_null;
> }
> - tdb->travlocks.hash = BUCKET(rec.full_hash);
> + tdb->travlocks.list = BUCKET(rec.full_hash);
> if (tdb_lock_record(tdb, tdb->travlocks.off) != 0) {
> TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_nextkey: lock_record failed (%s)!\n", strerror(errno)));
> return tdb_null;
> }
> }
> - oldhash = tdb->travlocks.hash;
> + oldlist = tdb->travlocks.list;
>
> /* Grab next record: locks chain and returned record,
> unlocks old record */
> @@ -392,11 +392,11 @@ _PUBLIC_ TDB_DATA tdb_nextkey(struct tdb_context *tdb, TDB_DATA oldkey)
> key.dptr = tdb_alloc_read(tdb, tdb->travlocks.off+sizeof(rec),
> key.dsize);
> /* Unlock the chain of this new record */
> - if (tdb_unlock(tdb, tdb->travlocks.hash, tdb->travlocks.lock_rw) != 0)
> + if (tdb_unlock(tdb, tdb->travlocks.list, tdb->travlocks.lock_rw) != 0)
> TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_nextkey: WARNING tdb_unlock failed!\n"));
> }
> /* Unlock the chain of old record */
> - if (tdb_unlock(tdb, BUCKET(oldhash), tdb->travlocks.lock_rw) != 0)
> + if (tdb_unlock(tdb, oldlist, tdb->travlocks.lock_rw) != 0)
> TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_nextkey: WARNING tdb_unlock failed!\n"));
> tdb_trace_1rec_retrec(tdb, "tdb_nextkey", oldkey, key);
> return key;
> --
> 2.9.4
>
>
> From 67430196ecb4d1b7a029dc63a9d700c906d1e876 Mon Sep 17 00:00:00 2001
> From: Ralph Boehme <slow at samba.org>
> Date: Sat, 1 Jul 2017 22:21:26 +0200
> Subject: [PATCH 2/4] tbd: BUCKET(-1) returns wrong offset because
> tdb->hash_size is an unsigned int
>
> The following C program demonstrates the issue:
>
> #include <stdio.h>
> #include <stdlib.h>
> #include <stdarg.h>
> #include <stdbool.h>
> #include <string.h>
> #include <unistd.h>
> #include <errno.h>
> #include <dirent.h>
> #include <sys/types.h>
>
> int main(int argc, char **argv)
> {
> int hash = -1;
> int tsize_signed = 10;
> unsigned int tsize_unsigned = 10;
> int bucket;
>
> #define BUCKET(hash, tsize) ((hash) % (tsize))
>
> bucket = BUCKET(hash, tsize_unsigned);
> printf("hash [%d] tsize [%d] bucket [%d]\n", hash, tsize_unsigned, bucket);
>
> bucket = BUCKET(hash, tsize_signed);
> printf("hash [%d] tsize [%d] bucket [%d]\n", hash, tsize_signed, bucket);
>
> return 0;
> }
>
> Output:
>
> $ ./tmp
> hash [-1] tsize [10] bucket [5]
> hash [-1] tsize [10] bucket [-1]
>
> The first version is what the current BUCKET() macro does. As a result
> we lock the hashtable chain in bucket 5, NOT the freelist.
>
> -1 is sign converted to an unsigned int 4294967295 and
>
> 4294967295 % 10 = 5.
>
> As all callers will lock the same wrong list consistently locking is
> still consistent.
>
> Stumpled across this when looking at the output of `tdbtool DB list`
> which always printed some other hashchain and not the freelist.
>
> The freelist bucket offset computation doesn't use the BUCKET macro in
> freelist.c (directly or indirectly) when working on the freelist, it
> just directly uses the FREELIST_TOP define, so this problem only affects
> tdbtool list.
>
> Signed-off-by: Ralph Boehme <slow at samba.org>
> ---
> lib/tdb/common/tdb_private.h | 16 ++++++++++++++++
> 1 file changed, 16 insertions(+)
>
> diff --git a/lib/tdb/common/tdb_private.h b/lib/tdb/common/tdb_private.h
> index e549af2..a60d556 100644
> --- a/lib/tdb/common/tdb_private.h
> +++ b/lib/tdb/common/tdb_private.h
> @@ -126,6 +126,22 @@ void tdb_trace_2rec_retrec(struct tdb_context *tdb, const char *op,
> #define SAFE_FREE(x) do { if ((x) != NULL) {free(x); (x)=NULL;} } while(0)
> #endif
>
> +/*
> + * Note: the BUCKET macro is broken as it returns an unexpected result when
> + * called as BUCKET(-1) for the freelist:
> + *
> + * -1 is sign converted to an unsigned int 4294967295 and then the modulo
> + * tdb->hashtable_size is computed. So rg with a hashtable_size of 10 the result
> + * is
> + *
> + * 4294967295 % hashtable_size = 5.
> + *
> + * where it should be -1 (C uses symmetric modulo).
> + *
> + * As all callers will lock the same wrong list consistently locking is still
> + * consistent. We can not change this without an incompatible on-disk format
> + * change, otherwise different tdb versions would use incompatible locking.
> + */
> #define BUCKET(hash) ((hash) % tdb->hash_size)
>
> #define DOCONV() (tdb->flags & TDB_CONVERT)
> --
> 2.9.4
>
>
> From fcbad25dfe8cd463b047491f3121db3dc3633f23 Mon Sep 17 00:00:00 2001
> From: Ralph Boehme <slow at samba.org>
> Date: Sun, 2 Jul 2017 11:41:43 +0200
> Subject: [PATCH 3/4] tdb: document hashtable bucket offset calculation madness
>
> Signed-off-by: Ralph Boehme <slow at samba.org>
> ---
> lib/tdb/common/lock.c | 28 +++++++++++++++++++++++++++-
> 1 file changed, 27 insertions(+), 1 deletion(-)
>
> diff --git a/lib/tdb/common/lock.c b/lib/tdb/common/lock.c
> index e330201..60956e7 100644
> --- a/lib/tdb/common/lock.c
> +++ b/lib/tdb/common/lock.c
> @@ -137,7 +137,33 @@ static int fcntl_unlock(struct tdb_context *tdb, int rw, off_t off, off_t len)
> return fcntl(tdb->fd, F_SETLKW, &fl);
> }
>
> -/* list -1 is the alloc list, otherwise a hash chain. */
> +/*
> + * Calculate the lock offset for a list
> + *
> + * list -1 is the freelist, otherwise a hash chain.
> + *
> + * Note that we consistently (but without real reason) lock hash chains at an
> + * offset that is 4 bytes below the real offset of the corresponding list head
> + * in the db.
> + *
> + * This it the memory layout of the hashchain array:
> + *
> + * FREELIST_TOP + 0 = freelist
> + * FREELIST_TOP + 4 = hashtbale list 0
> + * FREELIST_TOP + 8 = hashtbale list 1
> + * ...
> + *
> + * Otoh lock_offset computes:
> + *
> + * freelist = FREELIST_TOP - 4
> + * list 0 = FREELIST_TOP + 0
> + * list 1 = FREELIST_TOP + 4
> + * ...
> + *
> + * Unfortunately we can't change this calculation in order to align the locking
> + * offset with the memory layout, as that would make the locking incompatible
> + * between different tdb versions.
> + */
> static tdb_off_t lock_offset(int list)
> {
> return FREELIST_TOP + 4*list;
> --
> 2.9.4
>
>
> From b012da31a76cb2f4ce267daf2595505b7135dcd1 Mon Sep 17 00:00:00 2001
> From: Ralph Boehme <slow at samba.org>
> Date: Sun, 9 Jul 2017 18:39:27 +0200
> Subject: [PATCH 4/4] tdb: fix tbdtool list freelist output
>
> Due to the non-fixable bug in the BUCKET macro tdbtool list printed some
> other hash chainlist, not the freelist.
>
> Bug: https://bugzilla.samba.org/show_bug.cgi?id=12888
>
> Signed-off-by: Ralph Boehme <slow at samba.org>
> ---
> lib/tdb/common/dump.c | 6 +++++-
> 1 file changed, 5 insertions(+), 1 deletion(-)
>
> diff --git a/lib/tdb/common/dump.c b/lib/tdb/common/dump.c
> index 5f6a78b..73286b8 100644
> --- a/lib/tdb/common/dump.c
> +++ b/lib/tdb/common/dump.c
> @@ -62,7 +62,11 @@ static int tdb_dump_chain(struct tdb_context *tdb, int i)
> {
> tdb_off_t rec_ptr, top;
>
> - top = TDB_HASH_TOP(i);
> + if (i == -1) {
> + top = FREELIST_TOP;
> + } else {
> + top = TDB_HASH_TOP(i);
> + }
>
> if (tdb_lock(tdb, i, F_WRLCK) != 0)
> return -1;
> --
> 2.9.4
>
More information about the samba-technical
mailing list