Proposed API change to tdb.

Jeremy Allison jra at samba.org
Wed Dec 22 13:24:30 MST 2010


On Wed, Dec 22, 2010 at 12:17:23PM +1030, Rusty Russell wrote:
> On Wed, 22 Dec 2010 06:02:42 am simo wrote:
> > On Tue, 2010-12-21 at 11:25 -0800, Jeremy Allison wrote:
> > > Hi Rusty & friends,
> > > 
> > > I'd like to propose an API addition to tdb.
> > > 
> > > Currently, when we expand a tdb file on running out
> > > of freelist, we have a heuristic that we follow that
> > > states:
> > > 
> > > "always make room for at least 100 more records, and at
> > > least 25% more space."
> > > 
> > > and then rounded up to a multiple of a page size.
> > > 
> > > I am working with an OEM that is running Samba on
> > > a memory contrained box, and they are storing some
> > > of the tdb's in an in-memory filesystem, to prevent
> > > disk spin-up and consequent power drain.
> > > 
> > > The problem with the above heuristic is it creates
> > > tdb files that are far too large for their box and
> > > prevents them storing them on the ramfs.
> 
> Nothing wrong with this, but AFAICT you're missing the root cause.
> 
> Neither the 25% nor the 100 record heuristics are the problem; it's the
> fact that we disabled right merging of freed blocks for performance.  This
> means our tdbs get fragmented.  As a heuristic, we repack the entire db when
> we expand inside a transaction.
> 
> So smaller expansions == more regular repacking.  Perhaps try turning the
> current #ifdef USE_RIGHT_MERGES into a customisable parameter, and turn
> that on instead?

Actually I'm not sure that is the cause. This is happening in the
winbindd_cache tdb when the box is first joined to a domain and
winbindd is being requested to enumerate all the users in a large
domain, which then get added into the cache.

As there hasn't been time for anything to time out, this means
that all the activity is due to additions, not freelist fragmentation.

However, your idea is certainly a good one, so I've changed
the patch to add this in. Attached is the one I'm testing
right now.

It changes the tuning call to a more generic :

int tdb_set_tuning_parameter(struct tdb_context *tdb,
		enum TDB_TUNE_PARAMETER tuning_param, ...)

varargs function, where there are currently 3 possible
tuning parameters defined:

enum TDB_TUNE_PARAMETER { TDB_TUNE_RECORD_NUMBER_EXPANSION_FACTOR = 1,
		TDB_TUNE_MIN_FILESIZE_EXPANSION_FACTOR,
		TDB_TUNE_FREELIST_MERGE_RIGHT};

This way we can add more in a forwards-compatible way
if we need to which won't break any code that uses
the tdb_set_tuning_parameter() call.

Comments welcome !

Jeremy.
-------------- next part --------------
diff --git a/lib/tdb/common/freelist.c b/lib/tdb/common/freelist.c
index 2f2a4c3..f485bd5 100644
--- a/lib/tdb/common/freelist.c
+++ b/lib/tdb/common/freelist.c
@@ -27,12 +27,6 @@
 
 #include "tdb_private.h"
 
-/* 'right' merges can involve O(n^2) cost when combined with a
-   traverse, so they are disabled until we find a way to do them in 
-   O(1) time
-*/
-#define USE_RIGHT_MERGES 0
-
 /* read a freelist record and check for simple errors */
 int tdb_rec_free_read(struct tdb_context *tdb, tdb_off_t off, struct list_struct *rec)
 {
@@ -62,8 +56,11 @@ int tdb_rec_free_read(struct tdb_context *tdb, tdb_off_t off, struct list_struct
 }
 
 
-#if USE_RIGHT_MERGES
 /* Remove an element from the freelist.  Must have alloc lock. */
+/* 'right' merges can involve O(n^2) cost when combined with a
+   traverse, so they are disabled by default, but available as a
+   tuning parameter, until we find a way to do them in O(1) time
+*/
 static int remove_from_freelist(struct tdb_context *tdb, tdb_off_t off, tdb_off_t next)
 {
 	tdb_off_t last_ptr, i;
@@ -81,8 +78,6 @@ static int remove_from_freelist(struct tdb_context *tdb, tdb_off_t off, tdb_off_
 	TDB_LOG((tdb, TDB_DEBUG_FATAL,"remove_from_freelist: not on list at off=%d\n", off));
 	return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
 }
-#endif
-
 
 /* update a record tailer (must hold allocation lock) */
 static int update_tailer(struct tdb_context *tdb, tdb_off_t offset,
@@ -110,32 +105,33 @@ int tdb_free(struct tdb_context *tdb, tdb_off_t offset, struct list_struct *rec)
 		goto fail;
 	}
 
-#if USE_RIGHT_MERGES
-	/* Look right first (I'm an Australian, dammit) */
-	if (offset + sizeof(*rec) + rec->rec_len + sizeof(*rec) <= tdb->map_size) {
-		tdb_off_t right = offset + sizeof(*rec) + rec->rec_len;
-		struct list_struct r;
+	if (tdb->freelist_merge_right) {
+		/* Look right first (I'm an Australian, dammit) */
+		if (offset + sizeof(*rec) + rec->rec_len + sizeof(*rec) <= tdb->map_size) {
+			tdb_off_t right = offset + sizeof(*rec) + rec->rec_len;
+			struct list_struct r;
 
-		if (tdb->methods->tdb_read(tdb, right, &r, sizeof(r), DOCONV()) == -1) {
-			TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: right read failed at %u\n", right));
-			goto left;
-		}
-
-		/* If it's free, expand to include it. */
-		if (r.magic == TDB_FREE_MAGIC) {
-			if (remove_from_freelist(tdb, right, r.next) == -1) {
-				TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: right free failed at %u\n", right));
+			if (tdb->methods->tdb_read(tdb, right, &r, sizeof(r), DOCONV()) == -1) {
+				TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: right read failed at %u\n", right));
 				goto left;
 			}
-			rec->rec_len += sizeof(r) + r.rec_len;
-			if (update_tailer(tdb, offset, rec) == -1) {
-				TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: update_tailer failed at %u\n", offset));
-				goto fail;
+
+			/* If it's free, expand to include it. */
+			if (r.magic == TDB_FREE_MAGIC) {
+				if (remove_from_freelist(tdb, right, r.next) == -1) {
+					TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: right free failed at %u\n", right));
+					goto left;
+				}
+				rec->rec_len += sizeof(r) + r.rec_len;
+				if (update_tailer(tdb, offset, rec) == -1) {
+					TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: update_tailer failed at %u\n", offset));
+					goto fail;
+				}
 			}
 		}
 	}
+
 left:
-#endif
 
 	/* Look left */
 	if (offset - sizeof(tdb_off_t) > TDB_DATA_START(tdb->header.hash_size)) {
diff --git a/lib/tdb/common/io.c b/lib/tdb/common/io.c
index 661f761..2e53296 100644
--- a/lib/tdb/common/io.c
+++ b/lib/tdb/common/io.c
@@ -306,10 +306,11 @@ int tdb_expand(struct tdb_context *tdb, tdb_off_t size)
 	/* must know about any previous expansions by another process */
 	tdb->methods->tdb_oob(tdb, tdb->map_size + 1, 1);
 
-	/* always make room for at least 100 more records, and at
-           least 25% more space. Round the database up to a multiple
+	/* always make room for at least tdb->record_number_expansion_factor more records, and at
+           least tdb->min_filesize_expansion_factor% more space. Round the database up to a multiple
            of the page size */
-	new_size = MAX(tdb->map_size + size*100, tdb->map_size * 1.25);
+	new_size = MAX(tdb->map_size + size*tdb->record_number_expansion_factor,
+			tdb->map_size * tdb->min_filesize_expansion_factor);
 	size = TDB_ALIGN(new_size, tdb->page_size) - tdb->map_size;
 
 	if (!(tdb->flags & TDB_INTERNAL))
diff --git a/lib/tdb/common/open.c b/lib/tdb/common/open.c
index 49b8e85..656298c 100644
--- a/lib/tdb/common/open.c
+++ b/lib/tdb/common/open.c
@@ -180,6 +180,9 @@ struct tdb_context *tdb_open_ex(const char *name, int hash_size, int tdb_flags,
 	}
 
 	tdb->max_dead_records = (tdb_flags & TDB_VOLATILE) ? 5 : 0;
+	tdb->record_number_expansion_factor = TDB_DEFAULT_RECORD_NUMBER_EXPANSION_FACTOR;
+	tdb->min_filesize_expansion_factor = TDB_DEFAULT_MIN_FILESIZE_EXPANSION_FACTOR;
+	tdb->freelist_merge_right = false;
 
 	if ((open_flags & O_ACCMODE) == O_WRONLY) {
 		TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: can't open tdb %s write-only\n",
diff --git a/lib/tdb/common/tdb.c b/lib/tdb/common/tdb.c
index b59bb15..bb810a4 100644
--- a/lib/tdb/common/tdb.c
+++ b/lib/tdb/common/tdb.c
@@ -894,3 +894,38 @@ int tdb_repack(struct tdb_context *tdb)
 
 	return 0;
 }
+
+int tdb_set_tuning_parameter(struct tdb_context *tdb,
+		enum TDB_TUNE_PARAMETER tuning_param, ...)
+{
+	va_list ap;
+
+	va_start(ap, tuning_param);
+
+	switch ((int)tuning_param) {
+		case TDB_TUNE_RECORD_NUMBER_EXPANSION_FACTOR:
+		{
+			unsigned int record_number_expansion_factor = (unsigned int)va_arg(ap, int);
+			if (record_number_expansion_factor == 0) {
+				record_number_expansion_factor = TDB_DEFAULT_RECORD_NUMBER_EXPANSION_FACTOR;
+			}
+			tdb->record_number_expansion_factor = record_number_expansion_factor;
+			return 0;
+		}
+		case TDB_TUNE_MIN_FILESIZE_EXPANSION_FACTOR:
+		{
+			double min_filesize_expansion_factor = va_arg(ap, double);
+			if (min_filesize_expansion_factor == 0.0) {
+				min_filesize_expansion_factor = TDB_DEFAULT_MIN_FILESIZE_EXPANSION_FACTOR;
+			}
+			tdb->min_filesize_expansion_factor = min_filesize_expansion_factor;
+			return 0;
+		}
+		case TDB_TUNE_FREELIST_MERGE_RIGHT:
+		{
+			tdb->freelist_merge_right = (bool)va_arg(ap, int);
+			return 0;
+		}
+	}
+	return -1;
+}
diff --git a/lib/tdb/common/tdb_private.h b/lib/tdb/common/tdb_private.h
index ffac89f..0fe6096 100644
--- a/lib/tdb/common/tdb_private.h
+++ b/lib/tdb/common/tdb_private.h
@@ -63,6 +63,9 @@ typedef uint32_t tdb_off_t;
 #define TDB_PAD_BYTE 0x42
 #define TDB_PAD_U32  0x42424242
 
+#define TDB_DEFAULT_RECORD_NUMBER_EXPANSION_FACTOR 100
+#define TDB_DEFAULT_MIN_FILESIZE_EXPANSION_FACTOR 1.25
+
 /* NB assumes there is a local variable called "tdb" that is the
  * current context, also takes doubly-parenthesized print-style
  * argument. */
@@ -168,6 +171,9 @@ struct tdb_context {
 	int max_dead_records;
 	bool have_transaction_lock;
 	volatile sig_atomic_t *interrupt_sig_ptr;
+	unsigned int record_number_expansion_factor;
+	double min_filesize_expansion_factor;
+	bool freelist_merge_right;
 };
 
 
diff --git a/lib/tdb/include/tdb.h b/lib/tdb/include/tdb.h
index 94b5e36..8f3feba 100644
--- a/lib/tdb/include/tdb.h
+++ b/lib/tdb/include/tdb.h
@@ -77,6 +77,11 @@ typedef struct TDB_DATA {
 #endif
 #endif
 
+/* possible tdb tuning constants. */
+enum TDB_TUNE_PARAMETER { TDB_TUNE_RECORD_NUMBER_EXPANSION_FACTOR = 1,
+			TDB_TUNE_MIN_FILESIZE_EXPANSION_FACTOR,
+			TDB_TUNE_FREELIST_MERGE_RIGHT};
+
 /* this is the context structure that is returned from a db open */
 typedef struct tdb_context TDB_CONTEXT;
 
@@ -140,6 +145,7 @@ void tdb_add_flags(struct tdb_context *tdb, unsigned flag);
 void tdb_remove_flags(struct tdb_context *tdb, unsigned flag);
 void tdb_enable_seqnum(struct tdb_context *tdb);
 void tdb_increment_seqnum_nonblock(struct tdb_context *tdb);
+int tdb_set_tuning_parameter(struct tdb_context *tdb, enum TDB_TUNE_PARAMETER tuning_param, ...);
 
 /* Low level locking functions: use with care */
 int tdb_chainlock(struct tdb_context *tdb, TDB_DATA key);


More information about the samba-technical mailing list