process shared robust mutexes for tdb
Rusty Russell
rusty at samba.org
Mon Mar 25 17:07:01 MDT 2013
Volker Lendecke <Volker.Lendecke at SerNet.DE> writes:
> On Sat, Jan 12, 2013 at 10:45:57AM +0100, Volker Lendecke wrote:
>> Hi!
>>
>> On Sun, Jan 06, 2013 at 07:18:00PM -0500, Ira Cooper wrote:
>> > Can you run the same test with 1 bucket? Not that we'd actually want to
>> > deploy that way, but, the performance of fcntl TDB shouldn't drop that
>> > much... and it'd give us an estimate of how much we need the 2048 locks vs.
>> > just using 1.
>>
>> Under the usual branch
>>
>> https://git.samba.org/?p=vl/samba.git/.git;a=shortlog;h=refs/heads/tdb
>>
>> Now find a patchset that survived autobuild for me. It
>> contains some additional stuff, like a tdb feature flags
>> field. This could for example take a flag for a dual-linked
>> freelist and other future improvements.
>>
>> This is something that I would like to see broader testing
>> and review of. Getting rid of the fcntl locks in tdb was
>> something I have wanted to do for a very long time, and
>> while this is not complete it is a start in that direction.
>
> The above branch now contains code that has survived testing
> in a cluster with thousands of clients. The cluster survived
> a recovery just fine, something which was entirely
> impossible due to the fcntl thundering herd before. The fact
> that unmodified ctdb is happy with it shows that the API is
> covered pretty completely.
>
> I squashed all the patches into one. In the end, I had
> almost 100 unsorted patches, no way to sort them into a sane
> order.
>
> Comments?
What can I say, other than "Cool!" :) This is a really impressive piece
of work!
Detailed review follows, but it's all minor.
> diff --git a/lib/tdb/common/check.c b/lib/tdb/common/check.c
> index dc38102..511e11b 100644
> --- a/lib/tdb/common/check.c
> +++ b/lib/tdb/common/check.c
> @@ -39,7 +39,9 @@ static bool tdb_check_header(struct tdb_context
> *tdb, tdb_off> _t *recovery)
> if (hdr.version != TDB_VERSION)
> goto corrupt;
>
> - if (hdr.rwlocks != 0 && hdr.rwlocks != TDB_HASH_RWLOCK_MAGIC)
> + if (hdr.rwlocks != 0 &&
> + hdr.rwlocks != TDB_FEATURE_FLAG_MAGIC &&
> + hdr.rwlocks != TDB_HASH_RWLOCK_MAGIC)
> goto corrupt;
Unfortunate, but probably necessary: this new tdb will look corrupt to
old tdb versions.
> +#ifdef USE_MUTEX_LOCKING
> +
> +/*
> + * All offsets are relative to "behind the mutex area". Here we adjust the
> + * offsets of all syscalls that touch the file. mutex.c has an explanation for
> + * this.
> + */
I found this comment a bit awkward, how about :
/* We prepend the mutex area, so fixup offsets: see mutex.c for details */
> +
> +static bool tdb_adjust_offset(struct tdb_context *tdb, off_t *off)
> +{
> + size_t mutex_size = tdb_mutex_size(tdb);
> +
> + if (*off + mutex_size < *off) {
> + errno = EIO;
> + return false;
> + }
> + *off += mutex_size;
> + return true;
> +}
...
> +static int tdb_fstat(struct tdb_context *tdb, struct stat *buf)
> +{
> + size_t mutex_len;
> + int ret;
> +
> + ret = fstat(tdb->fd, buf);
> + if (ret == -1) {
> + return -1;
> + }
> +
> + mutex_len = tdb_mutex_size(tdb);
> + if (buf->st_size < mutex_len) {
> + errno = EIO;
> + return -1;
> + }
> + buf->st_size -= mutex_len;
> +
> + return ret;
> +}
> +
> +static ssize_t tdb_pwrite(struct tdb_context *tdb, const void *buf,
> + size_t count, off_t offset)
> +{
> + if (!tdb_adjust_offset(tdb, &offset)) {
> + return -1;
> + }
> + return pwrite(tdb->fd, buf, count, offset);
> +}
> +
> +static ssize_t tdb_pread(struct tdb_context *tdb, void *buf,
> + size_t count, off_t offset)
> +{
> + if (!tdb_adjust_offset(tdb, &offset)) {
> + return -1;
> + }
> + return pread(tdb->fd, buf, count, offset);
> +}
> +
> +static int tdb_ftruncate(struct tdb_context *tdb, off_t length)
> +{
> + if (!tdb_adjust_offset(tdb, &length)) {
> + return -1;
> + }
> + return ftruncate(tdb->fd, length);
> +}
> +
> +#else /* USE_MUTEX_LOCKING */
> +
> +static int tdb_fstat(struct tdb_context *tdb, struct stat *buf)
> +{
> + return fstat(tdb->fd, buf);
> +}
> +
> +static ssize_t tdb_pwrite(struct tdb_context *tdb, const void *buf,
> + size_t count, off_t offset)
> +{
> + return pwrite(tdb->fd, buf, count, offset);
> +}
> +
> +static ssize_t tdb_pread(struct tdb_context *tdb, void *buf,
> + size_t count, off_t offset)
> +{
> + return pread(tdb->fd, buf, count, offset);
> +}
> +
> +static int tdb_ftruncate(struct tdb_context *tdb, off_t length)
> +{
> + return ftruncate(tdb->fd, length);
> +}
> +
> +#endif
Actually, you can just #ifdef around the tdb_adjust_offset definition:
gcc is pretty good and will produce the obvious result for the
!USE_MUTEX_LOCKING case.
> diff --git a/lib/tdb/common/lock.c b/lib/tdb/common/lock.c
> index b59dfbc..49954f2 100644
> --- a/lib/tdb/common/lock.c
> +++ b/lib/tdb/common/lock.c
> @@ -38,6 +38,15 @@ static int fcntl_lock(struct tdb_context *tdb,
> struct flock fl;
> int cmd;
>
> +#ifdef USE_MUTEX_LOCKING
> + {
> + int ret;
> + if (tdb_mutex_lock(tdb, rw, off, len, waitflag, &ret)) {
> + return ret;
> + }
> + }
> +#endif
This is the only place you call this, so it's a bit neater to open-code
it rather than pass the ret ptr:
if (tdb_have_mutexes(tdb)) {
return tdb_mutex_lock(tdb, rw, off, len, waitflag);
}
And make tdb_mutex_lock() abort for !USE_MUTEX_LOCKING.
> +#ifdef USE_MUTEX_LOCKING
> + {
> + int ret;
> + if (tdb_mutex_unlock(tdb, rw, off, len, &ret)) {
> + return ret;
> + }
> + }
> +#endif
And here.
> +/*
> + * If we run with mutexes, we store the "struct tdb_mutexes" at the
> + * beginning of the file. We store an additional tdb_header right
> + * beyond the mutex area, page aligned. All the offsets within the tdb
> + * are relative to the area behind the mutex area. tdb->map_ptr points
> + * behind the mmap area as well, so the read and write path in the
> + * mutex case can remain unchanged.
I think this comment has dated. The tdb->map_ptr is now a complete mmap
area, so that sentence can be deleted.
> So we had to mmap the first bytes of the file with a second
> + * mmap call. With that scheme, very weird errors happened that could be
> + * easily fixed by doing the mutex mmap in a second file. It seemed that
> + * mapping the same memory area twice does not end up in accessing the same
> + * physical page, looking at the mutexes in gdb it seemed that old data showed
> + * up after some re-mapping.
Weird! I would have expected any modern architecture to have physically
tagged caches. Maybe something else is screwed up: it's not a common
case I guess.
> To avoid a separate mutex file, the code now puts
> + * the real content of the tdb file behind the mutex area. This way we do not
> + * have overlapping mmap areas, the mutex area is mmapped once and not
> + * changed, the tdb data area's mmap is constantly changed but does not
> + * overlap.
s/behind/after/? At least I think of it that way.
Another argument is that you can still fall back to read/write for huge
tdbs and limited memory.
> +again:
> + ret = chain_mutex_lock(chain, waitflag);
> + if (ret == EBUSY) {
> + ret = EAGAIN;
> + }
> + if (ret != 0) {
> + errno = ret;
> + goto fail;
> + }
> +
> + if (tdb->num_chain_mutexes_locked != 0) {
> + /*
> + * We can only check the allrecord lock once. If we do it with
> + * one chain mutex locked, we will deadlock with the allrecord
> + * locker process in the following way: We lock the first hash
> + * chain, we check for the allrecord lock. We keep the hash
> + * chain locked. Then the allrecord locker comes and takes the
> + * allrecord lock. It walks the list of chain mutexes, locking
> + * them all in sequence. Meanwhile, we have the chain mutex
> + * locked, so the allrecord locker blocks trying to lock our
> + * chain mutex. Then we come in and try to lock the second
> + * chain lock, which in most cases will be the freelist. We
> + * see that the allrecord lock is locked and put ourselves on
> + * the allrecord_waiters condition variable. This will never
> + * be signalled though because the allrecord locker waits for
> + * us to give up the chain lock.
> + */
Yeah, if we're already doing locking, we need to finish it: the
allrecord lock just has to wait.
> + tdb->num_chain_mutexes_locked += 1;
> + *pret = 0;
> + return true;
> + }
This += 1 and -= 1 appears a few places here: it's a bit weird...
> +
> + allrecord_ok = false;
> +
> + if (m->allrecord_lock == 0) {
> + /*
> + * allrecord lock not taken
> + */
> + allrecord_ok = true;
> + }
> +
> + if ((m->allrecord_lock == 1) && (rw == F_RDLCK)) {
> + /*
> + * allrecord shared lock taken, but we only want to read
> + */
> + allrecord_ok = true;
> + }
OK, I found this hard to read. An enum for allrecord_lock values? Or
actually, you could use a u32 and the values F_RDLCK, F_WRLCK and
F_UNLCK which might be even better?
A comment just above allrecord_ok = false might help, say:
/* Check if someone is has the allrecord lock: queue if so. */
> @@ -384,12 +437,65 @@ _PUBLIC_ struct tdb_context *tdb_open_ex(const
> char *name> , int hash_size, int td
> goto fail;
>
> if (header.rwlocks != 0 &&
> + header.rwlocks != TDB_FEATURE_FLAG_MAGIC &&
> header.rwlocks != TDB_HASH_RWLOCK_MAGIC) {
> TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: spinlocks
> no longer > supported\n"));
> goto fail;
> }
> tdb->hash_size = header.hash_size;
>
> + if (header.rwlocks == TDB_FEATURE_FLAG_MAGIC) {
> + tdb->feature_flags = header.feature_flags;
> + }
> +
> + {
> + uint32_t mutex_size = tdb_mutex_size(tdb);
> + if (mutex_size != header.mutex_size) {
> + TDB_LOG((tdb, TDB_DEBUG_ERROR,
> + "Mutex size changed\n"));
> + errno = EINVAL;
> + goto fail;
> + }
> + }
Don't need the extra { } here:
if (tdb_mutex_size(tdb) != header.mutex_size) {
....
> +
> + if (tdb->feature_flags & TDB_FEATURE_FLAG_MUTEX) {
> +
> + if (tdb->flags & TDB_NOMMAP) {
> + /*
> + * We need to mmap the mutex area
> + */
> + TDB_LOG((tdb, TDB_DEBUG_ERROR, "Can not open a tdb "
> + "with mutexes without mmap\n"));
> + errno = EINVAL;
> + goto fail;
> + }
I think we should still mmap the mutexed in this case, just not the
rest. It's great for debugging...
> +
> + /*
> + * Normal operation of mutexed databases is only allowed for
> + * CLEAR_IF_FIRST because the pthread_mutex_t implementation
> + * is not portable at all. We can not detect wether the mutex
> + * implementation in the tdb file is compatible with what the
> + * current opener is using, so we only allow CLEAR_IF_FIRST.
> + *
> + * To allow inspecting tdb files offline on a different system
> + * we allow opening the mutexed tdb file if the opener
> + * requests TDB_NOLOCK. In that case we don't even look at the
> + * mutexes.
> + */
> +
> + if (!(tdb_flags & (TDB_CLEAR_IF_FIRST|TDB_NOLOCK))) {
> + /*
> + * tdb files with mutexes are not portable, so only
> + * allow mutexes for CLEAR_IF_FIRST databases or
> + * without locking.
> + */
> + TDB_LOG((tdb, TDB_DEBUG_ERROR, "Can use mutexes only "
> + "with CLEAR_IF_FIRST or NOLOCK\n"));
> + errno = EINVAL;
> + goto fail;
> + }
> + }
> +
> if ((header.magic1_hash == 0) && (header.magic2_hash == 0)) {
> /* older TDB without magic hash references */
> tdb->hash_fn = tdb_old_hash;
> @@ -420,19 +526,48 @@ _PUBLIC_ struct tdb_context *tdb_open_ex(const
> char *name> , int hash_size, int td
> }
>
> /* Beware truncation! */
> - tdb->map_size = st.st_size;
> - if (tdb->map_size != st.st_size) {
> - /* Ensure ecode is set for log fn. */
> - tdb->ecode = TDB_ERR_IO;
> - TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_open_ex: "
> - "len %llu too large!\n", (long long)st.st_size));
> - errno = EIO;
> - goto fail;
> +
> + {
> + uint32_t map_size = st.st_size;
> + if (map_size != st.st_size) {
> + /* Ensure ecode is set for log fn. */
> + tdb->ecode = TDB_ERR_IO;
> + TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_open_ex: "
> + "len %llu too large!\n",
> + (long long)st.st_size));
> + errno = EIO;
> + goto fail;
> + }
> }
Temporary block here is ugly... perhaps:
if ((tdb_off_t)st.st_size != st.st_size) ...
>
> tdb->device = st.st_dev;
> tdb->inode = st.st_ino;
> - tdb_mmap(tdb);
> +
> + tdb->map_size = 0;
> + ret = tdb->methods->tdb_oob(tdb, 0, 1, 0);
> + if (ret == -1) {
> + goto fail;
> + }
> +
> +#ifdef USE_MUTEX_LOCKING
> + if (((tdb->flags & TDB_NOLOCK) == 0) &&
> + ((tdb->feature_flags & TDB_FEATURE_FLAG_MUTEX) != 0)) {
> + ret = tdb_mutex_mmap(tdb);
> + if (ret != 0) {
> + goto fail;
> + }
> + }
> +#else
> + if (tdb->feature_flags & TDB_FEATURE_FLAG_MUTEX) {
First tests are of weird form, how about:
if (!(tdb->flags & TDB_NOLOCK) &&
(tdb->feature_flags & TDB_FEATURE_FLAG_MUTEX)) {
> + /*
> + * Database was created with transactions, but our
> + * caller here did not request them.
This comment is wrong... we're here because we they asked for mutexes
and we don't support them.
Perhaps implement tdb_mutex_map(tdb) as return -1 if !USE_MUTEX_LOCKING?
> + */
> + errno = EINVAL;
> + goto fail;
> + }
> +#endif
> +
> if (locked) {
> if (tdb_nest_unlock(tdb, ACTIVE_LOCK, F_WRLCK, false) == -1) {
> TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: "
> @@ -542,6 +677,11 @@ _PUBLIC_ int tdb_close(struct tdb_context *tdb)
> else
> tdb_munmap(tdb);
> }
> +
> +#ifdef USE_MUTEX_LOCKING
> + tdb_mutex_munmap(tdb);
> +#endif
Similarly, perhaps make tdb_mutex_munmap() a noop if !USE_MUTEX_LOCKING.
Will finish review this afternoon, but wanted to send this off now.
Cheers!
Rusty.
More information about the samba-technical
mailing list