Blocksizes and TDB.

Rusty Russell rusty at rustcorp.com.au
Tue Dec 20 00:29:09 MST 2011


On Mon, 19 Dec 2011 20:48:23 -0500, Ira Cooper <ira at wakeful.net> wrote:
Non-text part: multipart/alternative
> On Mon, Dec 19, 2011 at 8:41 PM, Rusty Russell <rusty at rustcorp.com.au>wrote:
> 
> > On Mon, 19 Dec 2011 19:19:43 -0500, Ira Cooper <ira at wakeful.net> wrote:
> > Non-text part: multipart/alternative
> > > On Mon, Dec 19, 2011 at 5:39 PM, Rusty Russell <rusty at rustcorp.com.au
> > >wrote:
> > > > I think I prefer to coalesce records on expand.  It's fairly easy to
> > do;
> > > > sweep the database and try to merge.
> > >
> > > It does, the issue I suspect it stops after checking 50 records, if I
> > > understand the code.  If you get fragmentation sufficient to make those
> > > records useless... it's toast, is my suspicion.
> >
> > Hmm, I don't see any such thing.  Am I looking in the wrong place?
> >
> > We repack if we expand in a transaction, but we don't do anything for
> > non-transaction expands.
> >
> > Does the locking tdb do this inside a transaction?  In which case,
> > perhaps I introduced a bug in 094ab60053bcc0bc3542af8144e394d83270053e
> > back in April?
> 
> I don't think the code uses any transactions, but the code running in prod
> is actually pre-3.6.
> 
> So.. I don't think that's the issue, unless that is a long standing issue.

Amitay reported a similar bug.  An ldb passed 4G and stuff broke, yet it
was mainly free space.  We watched it happen again, and it was caused by
the recovery area expansion.

I've replicated the problem with a simple test program, and here are the
fixes.  They're pretty straight-forward:

http://git.samba.org/rusty/samba.git/?p=rusty/samba.git;a=shortlog;h=refs/heads/tdb-expansion-wip

Here they are for wider reading:

tdb: use same expansion factor logic when expanding for new recovery area.

If we're expanding because the current recovery area is too small, we
expand only the amount we need.  This can quickly lead to exponential
growth when we have a slowly-expanding record (hence a
slowly-expanding transaction size).

Signed-off-by: Rusty Russell <rusty at rustcorp.com.au>

diff --git a/lib/tdb/common/io.c b/lib/tdb/common/io.c
index 78bbf2e..34d25a0 100644
--- a/lib/tdb/common/io.c
+++ b/lib/tdb/common/io.c
@@ -294,41 +294,49 @@ static int tdb_expand_file(struct tdb_context *tdb, tdb_off_t size, tdb_off_t ad
 }
 
 
-/* expand the database at least size bytes by expanding the underlying
-   file and doing the mmap again if necessary */
-int tdb_expand(struct tdb_context *tdb, tdb_off_t size)
+/* You need 'size', this tells you how much you should expand by. */
+tdb_off_t tdb_expand_adjust(tdb_off_t map_size, tdb_off_t size, int page_size)
 {
-	struct tdb_record rec;
-	tdb_off_t offset, new_size, top_size, map_size;
-
-	if (tdb_lock(tdb, -1, F_WRLCK) == -1) {
-		TDB_LOG((tdb, TDB_DEBUG_ERROR, "lock failed in tdb_expand\n"));
-		return -1;
-	}
-
-	/* must know about any previous expansions by another process */
-	tdb->methods->tdb_oob(tdb, tdb->map_size + 1, 1);
+	tdb_off_t new_size, top_size;
 
 	/* limit size in order to avoid using up huge amounts of memory for
 	 * in memory tdbs if an oddball huge record creeps in */
 	if (size > 100 * 1024) {
-		top_size = tdb->map_size + size * 2;
+		top_size = map_size + size * 2;
 	} else {
-		top_size = tdb->map_size + size * 100;
+		top_size = map_size + size * 100;
 	}
 
 	/* always make room for at least top_size more records, and at
 	   least 25% more space. if the DB is smaller than 100MiB,
 	   otherwise grow it by 10% only. */
-	if (tdb->map_size > 100 * 1024 * 1024) {
-		map_size = tdb->map_size * 1.10;
+	if (map_size > 100 * 1024 * 1024) {
+		new_size = map_size * 1.10;
 	} else {
-		map_size = tdb->map_size * 1.25;
+		new_size = map_size * 1.25;
 	}
 
 	/* Round the database up to a multiple of the page size */
-	new_size = MAX(top_size, map_size);
-	size = TDB_ALIGN(new_size, tdb->page_size) - tdb->map_size;
+	new_size = MAX(top_size, new_size);
+	return TDB_ALIGN(new_size, page_size) - map_size;
+}
+
+/* expand the database at least size bytes by expanding the underlying
+   file and doing the mmap again if necessary */
+int tdb_expand(struct tdb_context *tdb, tdb_off_t size)
+{
+	struct tdb_record rec;
+	tdb_off_t offset;
+
+	if (tdb_lock(tdb, -1, F_WRLCK) == -1) {
+		TDB_LOG((tdb, TDB_DEBUG_ERROR, "lock failed in tdb_expand\n"));
+		return -1;
+	}
+
+	/* must know about any previous expansions by another process */
+	tdb->methods->tdb_oob(tdb, tdb->map_size + 1, 1);
+
+	size = tdb_expand_adjust(tdb->map_size, size, tdb->page_size);
 
 	if (!(tdb->flags & TDB_INTERNAL))
 		tdb_munmap(tdb);
diff --git a/lib/tdb/common/tdb_private.h b/lib/tdb/common/tdb_private.h
index 140d4ec..9d277ee 100644
--- a/lib/tdb/common/tdb_private.h
+++ b/lib/tdb/common/tdb_private.h
@@ -271,6 +271,7 @@ tdb_off_t tdb_find_lock_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t has
 			   struct tdb_record *rec);
 void tdb_io_init(struct tdb_context *tdb);
 int tdb_expand(struct tdb_context *tdb, tdb_off_t size);
+tdb_off_t tdb_expand_adjust(tdb_off_t map_size, tdb_off_t size, int page_size);
 int tdb_rec_free_read(struct tdb_context *tdb, tdb_off_t off,
 		      struct tdb_record *rec);
 bool tdb_write_all(int fd, const void *buf, size_t count);
diff --git a/lib/tdb/common/transaction.c b/lib/tdb/common/transaction.c
index e4573cb..ce551e2 100644
--- a/lib/tdb/common/transaction.c
+++ b/lib/tdb/common/transaction.c
@@ -729,7 +729,11 @@ static int tdb_recovery_allocate(struct tdb_context *tdb,
 	*recovery_size = tdb_recovery_size(tdb);
 
 	/* round up to a multiple of page size */
-	*recovery_max_size = TDB_ALIGN(sizeof(rec) + *recovery_size, tdb->page_size) - sizeof(rec);
+	*recovery_max_size = tdb_expand_adjust(tdb->map_size,
+					       *recovery_size,
+					       tdb->page_size)
+		- sizeof(rec);
+
 	*recovery_offset = tdb->map_size;
 	recovery_head = *recovery_offset;
 


tdb: don't free old recovery area when expanding if already at EOF.

We allocate a new recovery area by expanding the file.  But if the
recovery area is already at the end of file (as shown in at least one
client case), we can simply expand the record, rather than freeing it
and creating a new one.

Signed-off-by: Rusty Russell <rusty at rustcorp.com.au>

diff --git a/lib/tdb/common/transaction.c b/lib/tdb/common/transaction.c
index ce551e2..4e1c1f6 100644
--- a/lib/tdb/common/transaction.c
+++ b/lib/tdb/common/transaction.c
@@ -697,7 +697,7 @@ static int tdb_recovery_allocate(struct tdb_context *tdb,
 {
 	struct tdb_record rec;
 	const struct tdb_methods *methods = tdb->transaction->io_methods;
-	tdb_off_t recovery_head;
+	tdb_off_t recovery_head, new_end;
 
 	if (tdb_recovery_area(tdb, methods, &recovery_head, &rec) == -1) {
 		TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to read recovery head\n"));
@@ -706,6 +706,7 @@ static int tdb_recovery_allocate(struct tdb_context *tdb,
 
 	*recovery_size = tdb_recovery_size(tdb);
 
+	/* Existing recovery area? */
 	if (recovery_head != 0 && *recovery_size <= rec.rec_len) {
 		/* it fits in the existing area */
 		*recovery_max_size = rec.rec_len;
@@ -713,33 +714,45 @@ static int tdb_recovery_allocate(struct tdb_context *tdb,
 		return 0;
 	}
 
-	/* we need to free up the old recovery area, then allocate a
-	   new one at the end of the file. Note that we cannot use
-	   tdb_allocate() to allocate the new one as that might return
-	   us an area that is being currently used (as of the start of
-	   the transaction) */
-	if (recovery_head != 0) {
-		if (tdb_free(tdb, recovery_head, &rec) == -1) {
-			TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to free previous recovery area\n"));
-			return -1;
+	/* If recovery area in middle of file, we need a new one. */
+	if (recovery_head == 0
+	    || recovery_head + sizeof(rec) + rec.rec_len != tdb->map_size) {
+		/* we need to free up the old recovery area, then allocate a
+		   new one at the end of the file. Note that we cannot use
+		   tdb_allocate() to allocate the new one as that might return
+		   us an area that is being currently used (as of the start of
+		   the transaction) */
+		if (recovery_head) {
+			if (tdb_free(tdb, recovery_head, &rec) == -1) {
+				TDB_LOG((tdb, TDB_DEBUG_FATAL,
+					 "tdb_recovery_allocate: failed to"
+					 " free previous recovery area\n"));
+				return -1;
+			}
+
+			/* the tdb_free() call might have increased
+			 * the recovery size */
+			*recovery_size = tdb_recovery_size(tdb);
 		}
+
+		/* New head will be at end of file. */
+		recovery_head = tdb->map_size;
 	}
 
-	/* the tdb_free() call might have increased the recovery size */
-	*recovery_size = tdb_recovery_size(tdb);
+	/* Now we know where it will be. */
+	*recovery_offset = recovery_head;
 
-	/* round up to a multiple of page size */
+	/* Expand by more than we need, so we don't do it often. */
 	*recovery_max_size = tdb_expand_adjust(tdb->map_size,
 					       *recovery_size,
 					       tdb->page_size)
 		- sizeof(rec);
 
-	*recovery_offset = tdb->map_size;
-	recovery_head = *recovery_offset;
+	new_end = recovery_head + sizeof(rec) + *recovery_max_size;
 
 	if (methods->tdb_expand_file(tdb, tdb->transaction->old_map_size, 
-				     (tdb->map_size - tdb->transaction->old_map_size) +
-				     sizeof(rec) + *recovery_max_size) == -1) {
+				     new_end - tdb->transaction->old_map_size)
+	    == -1) {
 		TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to create recovery area\n"));
 		return -1;
 	}


More information about the samba-technical mailing list