process shared robust mutexes for tdb

Mon Mar 25 17:07:01 MDT 2013

Volker Lendecke <Volker.Lendecke at SerNet.DE> writes:
> On Sat, Jan 12, 2013 at 10:45:57AM +0100, Volker Lendecke wrote:
>> Hi!
>> 
>> On Sun, Jan 06, 2013 at 07:18:00PM -0500, Ira Cooper wrote:
>> > Can you run the same test with 1 bucket?  Not that we'd actually want to
>> > deploy that way, but, the performance of fcntl TDB shouldn't drop that
>> > much... and it'd give us an estimate of how much we need the 2048 locks vs.
>> > just using 1.
>> 
>> Under the usual branch
>> 
>> https://git.samba.org/?p=vl/samba.git/.git;a=shortlog;h=refs/heads/tdb
>> 
>> Now find a patchset that survived autobuild for me. It
>> contains some additional stuff, like a tdb feature flags
>> field. This could for example take a flag for a dual-linked
>> freelist and other future improvements.
>> 
>> This is something that I would like to see broader testing
>> and review of. Getting rid of the fcntl locks in tdb was
>> something I have wanted to do for a very long time, and
>> while this is not complete it is a start in that direction.
>
> The above branch now contains code that has survived testing
> in a cluster with thousands of clients. The cluster survived
> a recovery just fine, something which was entirely
> impossible due to the fcntl thundering herd before. The fact
> that unmodified ctdb is happy with it shows that the API is
> covered pretty completely.
>
> I squashed all the patches into one. In the end, I had
> almost 100 unsorted patches, no way to sort them into a sane
> order.
>
> Comments?

What can I say, other than "Cool!" :)  This is a really impressive piece
of work!

Detailed review follows, but it's all minor.

> diff --git a/lib/tdb/common/check.c b/lib/tdb/common/check.c
> index dc38102..511e11b 100644
> --- a/lib/tdb/common/check.c
> +++ b/lib/tdb/common/check.c
> @@ -39,7 +39,9 @@ static bool tdb_check_header(struct tdb_context
> *tdb, tdb_off> _t *recovery)
>  	if (hdr.version != TDB_VERSION)
>  		goto corrupt;
>  
> -	if (hdr.rwlocks != 0 && hdr.rwlocks != TDB_HASH_RWLOCK_MAGIC)
> +	if (hdr.rwlocks != 0 &&
> +	    hdr.rwlocks != TDB_FEATURE_FLAG_MAGIC &&
> +	    hdr.rwlocks != TDB_HASH_RWLOCK_MAGIC)
>  		goto corrupt;

Unfortunate, but probably necessary: this new tdb will look corrupt to
old tdb versions.

> +#ifdef USE_MUTEX_LOCKING
> +
> +/*
> + * All offsets are relative to "behind the mutex area". Here we adjust the
> + * offsets of all syscalls that touch the file. mutex.c has an explanation for
> + * this.
> + */

I found this comment a bit awkward, how about :

/* We prepend the mutex area, so fixup offsets: see mutex.c for details */

> +
> +static bool tdb_adjust_offset(struct tdb_context *tdb, off_t *off)
> +{
> +	size_t mutex_size = tdb_mutex_size(tdb);
> +
> +	if (*off + mutex_size < *off) {
> +		errno = EIO;
> +		return false;
> +	}
> +	*off += mutex_size;
> +	return true;
> +}

...
> +static int tdb_fstat(struct tdb_context *tdb, struct stat *buf)
> +{
> +	size_t mutex_len;
> +	int ret;
> +
> +	ret = fstat(tdb->fd, buf);
> +	if (ret == -1) {
> +		return -1;
> +	}
> +
> +	mutex_len = tdb_mutex_size(tdb);
> +	if (buf->st_size < mutex_len) {
> +		errno = EIO;
> +		return -1;
> +	}
> +	buf->st_size -= mutex_len;
> +
> +	return ret;
> +}
> +
> +static ssize_t tdb_pwrite(struct tdb_context *tdb, const void *buf,
> +			  size_t count, off_t offset)
> +{
> +	if (!tdb_adjust_offset(tdb, &offset)) {
> +		return -1;
> +	}
> +	return pwrite(tdb->fd, buf, count, offset);
> +}
> +
> +static ssize_t tdb_pread(struct tdb_context *tdb, void *buf,
> +			 size_t count, off_t offset)
> +{
> +	if (!tdb_adjust_offset(tdb, &offset)) {
> +		return -1;
> +	}
> +	return pread(tdb->fd, buf, count, offset);
> +}
> +
> +static int tdb_ftruncate(struct tdb_context *tdb, off_t length)
> +{
> +	if (!tdb_adjust_offset(tdb, &length)) {
> +		return -1;
> +	}
> +	return ftruncate(tdb->fd, length);
> +}
> +
> +#else /* USE_MUTEX_LOCKING */
> +
> +static int tdb_fstat(struct tdb_context *tdb, struct stat *buf)
> +{
> +	return fstat(tdb->fd, buf);
> +}
> +
> +static ssize_t tdb_pwrite(struct tdb_context *tdb, const void *buf,
> +			  size_t count, off_t offset)
> +{
> +	return pwrite(tdb->fd, buf, count, offset);
> +}
> +
> +static ssize_t tdb_pread(struct tdb_context *tdb, void *buf,
> +			 size_t count, off_t offset)
> +{
> +	return pread(tdb->fd, buf, count, offset);
> +}
> +
> +static int tdb_ftruncate(struct tdb_context *tdb, off_t length)
> +{
> +	return ftruncate(tdb->fd, length);
> +}
> +
> +#endif

Actually, you can just #ifdef around the tdb_adjust_offset definition:
gcc is pretty good and will produce the obvious result for the
!USE_MUTEX_LOCKING case.

> diff --git a/lib/tdb/common/lock.c b/lib/tdb/common/lock.c
> index b59dfbc..49954f2 100644
> --- a/lib/tdb/common/lock.c
> +++ b/lib/tdb/common/lock.c
> @@ -38,6 +38,15 @@ static int fcntl_lock(struct tdb_context *tdb,
>  	struct flock fl;
>  	int cmd;
>  
> +#ifdef USE_MUTEX_LOCKING
> +	{
> +		int ret;
> +		if (tdb_mutex_lock(tdb, rw, off, len, waitflag, &ret)) {
> +			return ret;
> +		}
> +	}
> +#endif

This is the only place you call this, so it's a bit neater to open-code
it rather than pass the ret ptr:

        if (tdb_have_mutexes(tdb)) {
                return tdb_mutex_lock(tdb, rw, off, len, waitflag);
        }

And make tdb_mutex_lock() abort for !USE_MUTEX_LOCKING.

> +#ifdef USE_MUTEX_LOCKING
> +	{
> +		int ret;
> +		if (tdb_mutex_unlock(tdb, rw, off, len, &ret)) {
> +			return ret;
> +		}
> +	}
> +#endif

And here.

> +/*
> + * If we run with mutexes, we store the "struct tdb_mutexes" at the
> + * beginning of the file. We store an additional tdb_header right
> + * beyond the mutex area, page aligned. All the offsets within the tdb
> + * are relative to the area behind the mutex area. tdb->map_ptr points
> + * behind the mmap area as well, so the read and write path in the
> + * mutex case can remain unchanged.

I think this comment has dated.  The tdb->map_ptr is now a complete mmap
area, so that sentence can be deleted.

>      So we had to mmap the first bytes of the file with a second
> + * mmap call. With that scheme, very weird errors happened that could be
> + * easily fixed by doing the mutex mmap in a second file. It seemed that
> + * mapping the same memory area twice does not end up in accessing the same
> + * physical page, looking at the mutexes in gdb it seemed that old data showed
> + * up after some re-mapping. 

Weird!  I would have expected any modern architecture to have physically
tagged caches.  Maybe something else is screwed up: it's not a common
case I guess.

>     To avoid a separate mutex file, the code now puts
> + * the real content of the tdb file behind the mutex area. This way we do not
> + * have overlapping mmap areas, the mutex area is mmapped once and not
> + * changed, the tdb data area's mmap is constantly changed but does not
> + * overlap.

s/behind/after/?  At least I think of it that way.

Another argument is that you can still fall back to read/write for huge
tdbs and limited memory.

> +again:
> +	ret = chain_mutex_lock(chain, waitflag);
> +	if (ret == EBUSY) {
> +		ret = EAGAIN;
> +	}
> +	if (ret != 0) {
> +		errno = ret;
> +		goto fail;
> +	}
> +
> +	if (tdb->num_chain_mutexes_locked != 0) {
> +		/*
> +		 * We can only check the allrecord lock once. If we do it with
> +		 * one chain mutex locked, we will deadlock with the allrecord
> +		 * locker process in the following way: We lock the first hash
> +		 * chain, we check for the allrecord lock. We keep the hash
> +		 * chain locked. Then the allrecord locker comes and takes the
> +		 * allrecord lock. It walks the list of chain mutexes, locking
> +		 * them all in sequence. Meanwhile, we have the chain mutex
> +		 * locked, so the allrecord locker blocks trying to lock our
> +		 * chain mutex. Then we come in and try to lock the second
> +		 * chain lock, which in most cases will be the freelist. We
> +		 * see that the allrecord lock is locked and put ourselves on
> +		 * the allrecord_waiters condition variable. This will never
> +		 * be signalled though because the allrecord locker waits for
> +		 * us to give up the chain lock.
> +		 */

Yeah, if we're already doing locking, we need to finish it: the
allrecord lock just has to wait.

> +		tdb->num_chain_mutexes_locked += 1;
> +		*pret = 0;
> +		return true;
> +	}

This += 1 and -= 1 appears a few places here: it's a bit weird...

> +
> +	allrecord_ok = false;
> +
> +	if (m->allrecord_lock == 0) {
> +		/*
> +		 * allrecord lock not taken
> +		 */
> +		allrecord_ok = true;
> +	}
> +
> +	if ((m->allrecord_lock == 1) && (rw == F_RDLCK)) {
> +		/*
> +		 * allrecord shared lock taken, but we only want to read
> +		 */
> +		allrecord_ok = true;
> +	}

OK, I found this hard to read.  An enum for allrecord_lock values?  Or
actually, you could use a u32 and the values F_RDLCK, F_WRLCK and
F_UNLCK which might be even better?

A comment just above allrecord_ok = false might help, say:

        /* Check if someone is has the allrecord lock: queue if so. */

> @@ -384,12 +437,65 @@ _PUBLIC_ struct tdb_context *tdb_open_ex(const
> char *name> , int hash_size, int td
>  		goto fail;
>  
>  	if (header.rwlocks != 0 &&
> +	    header.rwlocks != TDB_FEATURE_FLAG_MAGIC &&
>  	    header.rwlocks != TDB_HASH_RWLOCK_MAGIC) {
>  		TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: spinlocks
> no longer > supported\n"));
>  		goto fail;
>  	}
>  	tdb->hash_size = header.hash_size;
>  
> +	if (header.rwlocks == TDB_FEATURE_FLAG_MAGIC) {
> +		tdb->feature_flags = header.feature_flags;
> +	}
> +
> +	{
> +		uint32_t mutex_size = tdb_mutex_size(tdb);
> +		if (mutex_size != header.mutex_size) {
> +			TDB_LOG((tdb, TDB_DEBUG_ERROR,
> +				 "Mutex size changed\n"));
> +			errno = EINVAL;
> +			goto fail;
> +		}
> +	}

Don't need the extra { } here:

        if (tdb_mutex_size(tdb) != header.mutex_size) {
....

> +
> +	if (tdb->feature_flags & TDB_FEATURE_FLAG_MUTEX) {
> +
> +		if (tdb->flags & TDB_NOMMAP) {
> +			/*
> +			 * We need to mmap the mutex area
> +			 */
> +			TDB_LOG((tdb, TDB_DEBUG_ERROR, "Can not open a tdb "
> +				 "with mutexes without mmap\n"));
> +			errno = EINVAL;
> +			goto fail;
> +		}

I think we should still mmap the mutexed in this case, just not the
rest.  It's great for debugging...

> +
> +		/*
> +		 * Normal operation of mutexed databases is only allowed for
> +		 * CLEAR_IF_FIRST because the pthread_mutex_t implementation
> +		 * is not portable at all. We can not detect wether the mutex
> +		 * implementation in the tdb file is compatible with what the
> +		 * current opener is using, so we only allow CLEAR_IF_FIRST.
> +		 *
> +		 * To allow inspecting tdb files offline on a different system
> +		 * we allow opening the mutexed tdb file if the opener
> +		 * requests TDB_NOLOCK. In that case we don't even look at the
> +		 * mutexes.
> +		 */
> +
> +		if (!(tdb_flags & (TDB_CLEAR_IF_FIRST|TDB_NOLOCK))) {
> +			/*
> +			 * tdb files with mutexes are not portable, so only
> +			 * allow mutexes for CLEAR_IF_FIRST databases or
> +			 * without locking.
> +			 */
> +			TDB_LOG((tdb, TDB_DEBUG_ERROR, "Can use mutexes only "
> +				 "with CLEAR_IF_FIRST or NOLOCK\n"));
> +			errno = EINVAL;
> +			goto fail;
> +		}
> +	}
> +
>  	if ((header.magic1_hash == 0) && (header.magic2_hash == 0)) {
>  		/* older TDB without magic hash references */
>  		tdb->hash_fn = tdb_old_hash;
> @@ -420,19 +526,48 @@ _PUBLIC_ struct tdb_context *tdb_open_ex(const
> char *name> , int hash_size, int td
>  	}
>  
>  	/* Beware truncation! */
> -	tdb->map_size = st.st_size;
> -	if (tdb->map_size != st.st_size) {
> -		/* Ensure ecode is set for log fn. */
> -		tdb->ecode = TDB_ERR_IO;
> -		TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_open_ex: "
> -			 "len %llu too large!\n", (long long)st.st_size));
> -		errno = EIO;
> -		goto fail;
> +
> +	{
> +		uint32_t map_size = st.st_size;
> +		if (map_size != st.st_size) {
> +			/* Ensure ecode is set for log fn. */
> +			tdb->ecode = TDB_ERR_IO;
> +			TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_open_ex: "
> +				 "len %llu too large!\n",
> +				 (long long)st.st_size));
> +			errno = EIO;
> +			goto fail;
> +		}
>  	}

Temporary block here is ugly... perhaps:

        if ((tdb_off_t)st.st_size != st.st_size) ...

>  
>  	tdb->device = st.st_dev;
>  	tdb->inode = st.st_ino;
> -	tdb_mmap(tdb);
> +
> +	tdb->map_size = 0;
> +	ret = tdb->methods->tdb_oob(tdb, 0, 1, 0);
> +	if (ret == -1) {
> +		goto fail;
> +	}
> +
> +#ifdef USE_MUTEX_LOCKING
> +	if (((tdb->flags & TDB_NOLOCK) == 0) &&
> +	    ((tdb->feature_flags & TDB_FEATURE_FLAG_MUTEX) != 0)) {
> +		ret = tdb_mutex_mmap(tdb);
> +		if (ret != 0) {
> +			goto fail;
> +		}
> +	}
> +#else
> +	if (tdb->feature_flags & TDB_FEATURE_FLAG_MUTEX) {

First tests are of weird form, how about:

        if (!(tdb->flags & TDB_NOLOCK) &&
	    (tdb->feature_flags & TDB_FEATURE_FLAG_MUTEX)) {

> +		/*
> +		 * Database was created with transactions, but our
> +		 * caller here did not request them.

This comment is wrong... we're here because we they asked for mutexes
and we don't support them.

Perhaps implement tdb_mutex_map(tdb) as return -1 if !USE_MUTEX_LOCKING?

> +		 */
> +		errno = EINVAL;
> +		goto fail;
> +	}
> +#endif
> +
>  	if (locked) {
>  		if (tdb_nest_unlock(tdb, ACTIVE_LOCK, F_WRLCK, false) == -1) {
>  			TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: "
> @@ -542,6 +677,11 @@ _PUBLIC_ int tdb_close(struct tdb_context *tdb)
>  		else
>  			tdb_munmap(tdb);
>  	}
> +
> +#ifdef USE_MUTEX_LOCKING
> +	tdb_mutex_munmap(tdb);
> +#endif

Similarly, perhaps make tdb_mutex_munmap() a noop if !USE_MUTEX_LOCKING.

Will finish review this afternoon, but wanted to send this off now.

Cheers!
Rusty.