[RFC] tdb_traverse_read_lite()

Rusty Russell rusty at rustcorp.com.au
Wed Feb 27 20:41:18 MST 2013


Amitay asked me to look at a lightweight traverse for tdb.  This is my
first attempt (against his ctdb tree, but it's pretty simple).

Normal traverse drops the chain lock when calling the callback, and thus
has to handle deleting the record while it's traversing, which it does
by grabbing a lock on the record itself, which is then tested in
delete...

For simple callback functions, that's overkill.  So I implemented a
traverse which keeps the chain lock held across the callback, to
measure performance.

From 1de5f5259caa4c34a5d66d1d09c986a5de98f6ef Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty at rustcorp.com.au>
Date: Thu, 28 Feb 2013 14:06:53 +1030
Subject: [PATCH] tdb_traverse_read_lite(): lightweight traverse for tdb.

This holds the lock while calling the function, which is much faster
and avoids the "what happens if someone deletes underneath" case.  For
short hash chains this should be a performance win.

Signed-off-by: Rusty Russell <rusty at rustcorp.com.au>

diff --git a/lib/tdb/common/traverse.c b/lib/tdb/common/traverse.c
index d77086a..32e6b4c 100644
--- a/lib/tdb/common/traverse.c
+++ b/lib/tdb/common/traverse.c
@@ -208,6 +208,87 @@ out:
 		return count;
 }
 
+/* Read traverse that keeps each chain's read lock held across fn(). We could do write, but we'd have to stash rec.next and fixup on delete */
+static int do_traverse_read_lite(struct tdb_context *tdb,
+				 tdb_traverse_func fn, void *private_data)
+{
+	int count = 0;	/* live records visited so far; returned on success */
+	uint32_t i;	/* index of the hash chain currently being walked */
+
+	for (i = 0, tdb->methods->next_hash_chain(tdb, &i);	/* NOTE(review): next_hash_chain() presumably skips to the next used chain - confirm */
+	     i < tdb->header.hash_size;
+	     i++, tdb->methods->next_hash_chain(tdb, &i)) {
+		tdb_off_t rec_ptr;
+		TDB_DATA key, dbuf;
+		struct tdb_record rec;
+
+		if (tdb_lock(tdb, i, F_RDLCK) != 0) {	/* read-lock chain i for the whole walk of that chain */
+			goto fail;
+		}
+		if (tdb_ofs_read(tdb, TDB_HASH_TOP(i), &rec_ptr) != 0) {	/* offset of the first record in chain i */
+			goto unlock_fail;
+		}
+		while (rec_ptr) {	/* a zero offset ends the chain */
+			if (tdb_rec_read(tdb, rec_ptr, &rec) == -1) {
+				goto unlock_fail;
+			}
+
+			if (TDB_DEAD(&rec)) {	/* dead records are skipped and not counted */
+				rec_ptr = rec.next;
+				continue;
+			}
+			/* now read the full record: key and data in one allocation */
+			key.dptr = tdb_alloc_read(tdb,
+						  rec_ptr + sizeof(rec), 
+						  rec.key_len + rec.data_len);
+			if (!key.dptr) {
+				goto unlock_fail;
+			}
+
+			key.dsize = rec.key_len;
+			dbuf.dptr = key.dptr + rec.key_len;	/* data starts right after the key in the same buffer */
+			dbuf.dsize = rec.data_len;
+
+			count++;	/* counted before fn() runs, and also when fn is NULL */
+			if (fn && fn(tdb, key, dbuf, private_data)) {	/* non-zero from fn() stops the traverse early */
+				SAFE_FREE(key.dptr);	/* releases the shared key/data buffer */
+				tdb_unlock(tdb, i, F_RDLCK);
+				goto out;	/* returns count, including this record */
+			}
+			SAFE_FREE(key.dptr);
+			rec_ptr = rec.next;
+		}
+		if (tdb_unlock(tdb, i, F_RDLCK) != 0) {	/* a failed unlock turns the whole traverse into -1 */
+			goto fail;
+		}
+	}
+
+out:
+	return count;	/* reached after a full traverse or an early stop requested by fn() */
+
+unlock_fail:
+	tdb_unlock(tdb, i, F_RDLCK);	/* result not checked: we are already returning -1 */
+fail:
+	return -1;
+}
+
+/*
+  a read traverse, which doesn't drop the chain lock while calling fn().
+  Great if fn() is fast, since traverse is simpler and faster, but will
+  block any write accesses to that chain.  Returns do_traverse_read_lite()'s result.
+ */
+int tdb_traverse_read_lite(struct tdb_context *tdb, 
+			   tdb_traverse_func fn, void *private_data)
+{
+	int ret;
+
+	/* This enables safety checks that they don't write to db */
+	tdb->traverse_read++;
+	tdb_trace(tdb, "tdb_traverse_read_lite");
+	ret = do_traverse_read_lite(tdb, fn, private_data);
+	tdb->traverse_read--;	/* balance the increment above, whatever the traverse returned */
+	return ret;
+}
 
 /*
   a write style traverse - temporarily marks the db read only
diff --git a/lib/tdb/include/tdb.h b/lib/tdb/include/tdb.h
index 536a0b3..69cf81a 100644
--- a/lib/tdb/include/tdb.h
+++ b/lib/tdb/include/tdb.h
@@ -122,6 +122,7 @@ _PUBLIC_ TDB_DATA tdb_firstkey(struct tdb_context *tdb);
 _PUBLIC_ TDB_DATA tdb_nextkey(struct tdb_context *tdb, TDB_DATA key);
 _PUBLIC_ int tdb_traverse(struct tdb_context *tdb, tdb_traverse_func fn, void *);
 _PUBLIC_ int tdb_traverse_read(struct tdb_context *tdb, tdb_traverse_func fn, void *);
+_PUBLIC_ int tdb_traverse_read_lite(struct tdb_context *tdb, tdb_traverse_func fn, void *);
 _PUBLIC_ int tdb_exists(struct tdb_context *tdb, TDB_DATA key);
 _PUBLIC_ int tdb_lockall(struct tdb_context *tdb);
 _PUBLIC_ int tdb_lockall_nonblock(struct tdb_context *tdb);
diff --git a/lib/tdb/tools/tdbtorture.c b/lib/tdb/tools/tdbtorture.c
index 79fe3cd..1edabba 100644
--- a/lib/tdb/tools/tdbtorture.c
+++ b/lib/tdb/tools/tdbtorture.c
@@ -22,6 +22,7 @@
 #define LOCKSTORE_PROB 5
 #define TRAVERSE_PROB 20
 #define TRAVERSE_READ_PROB 20
+#define TRAVERSE_READ_LITE_PROB 20
 #define CULL_PROB 100
 #define KEYLEN 3
 #define DATALEN 100
@@ -199,6 +200,13 @@ static void addrec_db(void)
 	}
 #endif
 
+#if TRAVERSE_READ_LITE_PROB
+	if (random() % TRAVERSE_READ_LITE_PROB == 0) {
+		tdb_traverse_read_lite(db, NULL, NULL);
+		goto next;
+	}
+#endif
+
 	data = tdb_fetch(db, key);
 	if (data.dptr) free(data.dptr);
 


More information about the samba-technical mailing list