[PATCH] Improve tdb for ENOSPC

Volker Lendecke Volker.Lendecke at SerNet.DE
Wed Aug 23 14:24:41 UTC 2017


Hi!

Review appreciated!

Thanks, Volker

-- 
SerNet GmbH, Bahnhofsallee 1b, 37081 Göttingen
phone: +49-551-370000-0, fax: +49-551-370000-9
AG Göttingen, HRB 2816, GF: Dr. Johannes Loxen
http://www.sernet.de, mailto:kontakt at sernet.de
-------------- next part --------------
From a07cbb8ea4671486f126acdc2c11ff7ab8b8d08d Mon Sep 17 00:00:00 2001
From: Volker Lendecke <vl at samba.org>
Date: Tue, 22 Aug 2017 14:51:18 +0200
Subject: [PATCH 1/6] tdb: Fix a typo

Signed-off-by: Volker Lendecke <vl at samba.org>
---
 lib/tdb/common/transaction.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/tdb/common/transaction.c b/lib/tdb/common/transaction.c
index 8ec00256924..7d281fc75ff 100644
--- a/lib/tdb/common/transaction.c
+++ b/lib/tdb/common/transaction.c
@@ -43,7 +43,7 @@
     tdb_free() the old record to place it on the normal tdb freelist
     before allocating the new record
 
-  - during transactions, keep a linked list of writes all that have
+  - during transactions, keep a linked list of all writes that have
     been performed by intercepting all tdb_write() calls. The hooked
     transaction versions of tdb_read() and tdb_write() check this
     linked list and try to use the elements of the list in preference
-- 
2.11.0


From 58fd6a8a270ff0f7ecd2ff2c06d3316d03c97522 Mon Sep 17 00:00:00 2001
From: Volker Lendecke <vl at samba.org>
Date: Tue, 22 Aug 2017 17:09:01 +0200
Subject: [PATCH 2/6] configure: Centralize check for posix_fallocate

This checks for posix_fallocate unless we are sitting on an ancient glibc.
With this we don't need HAVE_BROKEN_POSIX_FALLOCATE anymore,
HAVE_POSIX_FALLOCATE will only be defined if we have a valid [g]libc.

./configure tested on Debian, FreeBSD (which does have posix_fallocate) and
OpenBSD (which does not have posix_fallocate). Also tested with changing the
not have an old-enough glibc around. All did the right thing.

Signed-off-by: Volker Lendecke <vl at samba.org>
---
 lib/replace/wscript  | 12 ++++++++++++
 source3/lib/system.c |  2 +-
 source3/wscript      | 17 ++---------------
 3 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/lib/replace/wscript b/lib/replace/wscript
index 7c50e1daf85..6972f2d6e40 100644
--- a/lib/replace/wscript
+++ b/lib/replace/wscript
@@ -249,6 +249,18 @@ def configure(conf):
     if conf.CONFIG_SET('HAVE_MEMALIGN'):
         conf.CHECK_DECLS('memalign', headers='malloc.h')
 
+    # glibc up to 2.3.6 had dangerously broken posix_fallocate(). DON'T USE IT.
+    if conf.CHECK_CODE('''
+#define _XOPEN_SOURCE 600
+#include <stdlib.h>
+#if defined(__GLIBC__) && ((__GLIBC__ < 2) || (__GLIBC__ == 2 && __GLIBC_MINOR__ < 4))
+#error probably broken posix_fallocate
+#endif
+''',
+                       '_POSIX_FALLOCATE_CAPABLE_LIBC',
+                       msg='Checking for posix_fallocate-capable libc'):
+        conf.CHECK_FUNCS('posix_fallocate')
+
     conf.CHECK_FUNCS('prctl dirname basename')
 
     # libbsd on some platforms provides strlcpy and strlcat
diff --git a/source3/lib/system.c b/source3/lib/system.c
index 99462b631c7..70ddf6a4dea 100644
--- a/source3/lib/system.c
+++ b/source3/lib/system.c
@@ -438,7 +438,7 @@ int sys_lstat(const char *fname,SMB_STRUCT_STAT *sbuf,
 ********************************************************************/
 int sys_posix_fallocate(int fd, off_t offset, off_t len)
 {
-#if defined(HAVE_POSIX_FALLOCATE) && !defined(HAVE_BROKEN_POSIX_FALLOCATE)
+#if defined(HAVE_POSIX_FALLOCATE)
 	return posix_fallocate(fd, offset, len);
 #elif defined(F_RESVSP64)
 	/* this handles XFS on IRIX */
diff --git a/source3/wscript b/source3/wscript
index 534ff440a40..62edb3e1cd6 100644
--- a/source3/wscript
+++ b/source3/wscript
@@ -106,7 +106,7 @@ def configure(conf):
     conf.CHECK_FUNCS('innetgr')
     conf.CHECK_FUNCS('initgroups select poll rdchk getgrnam getgrent pathconf')
     conf.CHECK_FUNCS('setpriv setgidx setuidx setgroups syscall sysconf')
-    conf.CHECK_FUNCS('atexit grantpt posix_openpt fallocate posix_fallocate')
+    conf.CHECK_FUNCS('atexit grantpt posix_openpt fallocate')
     conf.CHECK_FUNCS('fseeko setluid')
     conf.CHECK_FUNCS('getpwnam', headers='sys/types.h pwd.h')
     conf.CHECK_FUNCS('fdopendir')
@@ -397,7 +397,7 @@ llseek _llseek __llseek _lseek __lseek
 _lstat __lstat lutimes
 __lxstat memalign mknod mlock mlockall munlock munlockall
 _open __open _opendir __opendir
-pathconf poll posix_fallocate
+pathconf poll
 posix_memalign pread _pread __pread
 pwrite _pwrite __pwrite
 rdchk _read __read _readdir __readdir
@@ -1154,19 +1154,6 @@ err:
             execute=True,
             msg="Checking whether fcntl lock supports open file description locks")
 
-# glibc up to 2.3.6 had dangerously broken posix_fallocate(). DON'T USE IT.
-    if not conf.CHECK_CODE('''
-#define _XOPEN_SOURCE 600
-#include <stdlib.h>
-#if defined(__GLIBC__) && ((__GLIBC__ < 2) || (__GLIBC__ == 2 && __GLIBC_MINOR__ < 4))
-#error probably broken posix_fallocate
-#endif
-''',
-                           '_HAVE_UNBROKEN_POSIX_FALLOCATE',
-                           msg='Checking for broken posix_fallocate'):
-        conf.DEFINE('HAVE_BROKEN_POSIX_FALLOCATE', '1')
-
-
     conf.CHECK_STRUCTURE_MEMBER('struct stat', 'st_mtim.tv_nsec',
                                 define='HAVE_STRUCT_STAT_ST_MTIM_TV_NSEC') # Linux, Solaris
     conf.CHECK_STRUCTURE_MEMBER('struct stat', 'st_mtimensec',
-- 
2.11.0


From 32e1647ee59d2785d57df09deddc81e9cab840f4 Mon Sep 17 00:00:00 2001
From: Volker Lendecke <vl at samba.org>
Date: Wed, 23 Aug 2017 12:00:00 +0200
Subject: [PATCH 3/6] tdb: Protect against EINTR

Signed-off-by: Volker Lendecke <vl at samba.org>
---
 lib/tdb/common/io.c | 27 ++++++++++++++++++++++++---
 1 file changed, 24 insertions(+), 3 deletions(-)

diff --git a/lib/tdb/common/io.c b/lib/tdb/common/io.c
index fe47d18a5a4..1b422affdca 100644
--- a/lib/tdb/common/io.c
+++ b/lib/tdb/common/io.c
@@ -52,27 +52,48 @@ static bool tdb_adjust_offset(struct tdb_context *tdb, off_t *off)
 static ssize_t tdb_pwrite(struct tdb_context *tdb, const void *buf,
 			  size_t count, off_t offset)
 {
+	ssize_t ret;
+
 	if (!tdb_adjust_offset(tdb, &offset)) {
 		return -1;
 	}
-	return pwrite(tdb->fd, buf, count, offset);
+
+	do {
+		ret = pwrite(tdb->fd, buf, count, offset);
+	} while ((ret == -1) && (errno == EINTR));
+
+	return ret;
 }
 
 static ssize_t tdb_pread(struct tdb_context *tdb, void *buf,
 			 size_t count, off_t offset)
 {
+	ssize_t ret;
+
 	if (!tdb_adjust_offset(tdb, &offset)) {
 		return -1;
 	}
-	return pread(tdb->fd, buf, count, offset);
+
+	do {
+		ret = pread(tdb->fd, buf, count, offset);
+	} while ((ret == -1) && (errno == EINTR));
+
+	return ret;
 }
 
 static int tdb_ftruncate(struct tdb_context *tdb, off_t length)
 {
+	ssize_t ret;
+
 	if (!tdb_adjust_offset(tdb, &length)) {
 		return -1;
 	}
-	return ftruncate(tdb->fd, length);
+
+	do {
+		ret = ftruncate(tdb->fd, length);
+	} while ((ret == -1) && (errno == EINTR));
+
+	return ret;
 }
 
 static int tdb_fstat(struct tdb_context *tdb, struct stat *buf)
-- 
2.11.0


From 7e2b66182a43b5d25d38fea6a01631698acb96f8 Mon Sep 17 00:00:00 2001
From: Volker Lendecke <vl at samba.org>
Date: Wed, 23 Aug 2017 12:48:03 +0200
Subject: [PATCH 4/6] tdb: Truncate the file after expand failure

Without this it's very easy to create virtually huge files: ftruncate expands a
file, the pwrites fail with ENOSPC, thus the write fails. The next writer runs
into the same situation, and ftruncate-expands the file even further. tdb_check
will then spend ages reading the 4GB of zeros byte by byte.

Here we hold the freelist lock or are inside a transaction, so it is safe to
cut the file again. Nobody can have used the space that we have tried to
allocate, so we can't have any stray pointers corrupting the database.

Signed-off-by: Volker Lendecke <vl at samba.org>
---
 lib/tdb/common/io.c | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/lib/tdb/common/io.c b/lib/tdb/common/io.c
index 1b422affdca..f7a12c34dc9 100644
--- a/lib/tdb/common/io.c
+++ b/lib/tdb/common/io.c
@@ -430,14 +430,14 @@ static int tdb_expand_file(struct tdb_context *tdb, tdb_off_t size, tdb_off_t ad
 			TDB_LOG((tdb, TDB_DEBUG_FATAL, "expand_file write "
 				"returned 0 twice: giving up!\n"));
 			errno = ENOSPC;
-			return -1;
+			goto fail;
 		}
 		if (written == -1) {
 			tdb->ecode = TDB_ERR_OOM;
 			TDB_LOG((tdb, TDB_DEBUG_FATAL, "expand_file write of "
 				 "%u bytes failed (%s)\n", (int)n,
 				 strerror(errno)));
-			return -1;
+			goto fail;
 		}
 		if (written != n) {
 			TDB_LOG((tdb, TDB_DEBUG_WARNING, "expand_file: wrote "
@@ -448,6 +448,29 @@ static int tdb_expand_file(struct tdb_context *tdb, tdb_off_t size, tdb_off_t ad
 		size += written;
 	}
 	return 0;
+
+fail:
+	{
+		int err = errno;
+		int ret;
+
+		/*
+		 * We're holding the freelist lock or are inside a
+		 * transaction. Cutting the file is safe, the space we
+		 * tried to allocate can't have been used anywhere in
+		 * the meantime.
+		 */
+
+		ret = tdb_ftruncate(tdb, size);
+		if (ret == -1) {
+			TDB_LOG((tdb, TDB_DEBUG_WARNING, "expand_file: "
+				 "retruncate to %ju failed\n",
+				 (uintmax_t)size));
+		}
+		errno = err;
+	}
+
+	return -1;
 }
 
 
-- 
2.11.0


From 473a95e4c788740668e15308bce7e623d7a002e0 Mon Sep 17 00:00:00 2001
From: Volker Lendecke <vl at samba.org>
Date: Wed, 23 Aug 2017 12:59:19 +0200
Subject: [PATCH 5/6] tdb: Add an intermediate variable

More README.Coding, but I need "ret" in the next commit as well :-)

Signed-off-by: Volker Lendecke <vl at samba.org>
---
 lib/tdb/common/io.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/lib/tdb/common/io.c b/lib/tdb/common/io.c
index f7a12c34dc9..ff3f2197ed2 100644
--- a/lib/tdb/common/io.c
+++ b/lib/tdb/common/io.c
@@ -379,6 +379,7 @@ static int tdb_expand_file(struct tdb_context *tdb, tdb_off_t size, tdb_off_t ad
 {
 	char buf[8192];
 	tdb_off_t new_size;
+	int ret;
 
 	if (tdb->read_only || tdb->traverse_read) {
 		tdb->ecode = TDB_ERR_RDONLY;
@@ -394,7 +395,8 @@ static int tdb_expand_file(struct tdb_context *tdb, tdb_off_t size, tdb_off_t ad
 		return -1;
 	}
 
-	if (tdb_ftruncate(tdb, new_size) == -1) {
+	ret = tdb_ftruncate(tdb, new_size);
+	if (ret == -1) {
 		char b = 0;
 		ssize_t written = tdb_pwrite(tdb, &b, 1, new_size - 1);
 		if (written == 0) {
@@ -452,7 +454,6 @@ static int tdb_expand_file(struct tdb_context *tdb, tdb_off_t size, tdb_off_t ad
 fail:
 	{
 		int err = errno;
-		int ret;
 
 		/*
 		 * We're holding the freelist lock or are inside a
-- 
2.11.0


From eb70b4873825888f7915428382db0bf537d55e66 Mon Sep 17 00:00:00 2001
From: Volker Lendecke <vl at samba.org>
Date: Wed, 23 Aug 2017 13:02:57 +0200
Subject: [PATCH 6/6] tdb: Use posix_fallocate

This should be significantly faster than pwriting.

openbsd doesn't have posix_fallocate, so we do need the fallback. Also, it
might have weird failure modes, so we keep the old code in place except for
posix_fallocate returning success or ENOSPC.

Signed-off-by: Volker Lendecke <vl at samba.org>
---
 lib/tdb/common/io.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/lib/tdb/common/io.c b/lib/tdb/common/io.c
index ff3f2197ed2..15ba5b7d497 100644
--- a/lib/tdb/common/io.c
+++ b/lib/tdb/common/io.c
@@ -96,6 +96,24 @@ static int tdb_ftruncate(struct tdb_context *tdb, off_t length)
 	return ret;
 }
 
+#if HAVE_POSIX_FALLOCATE
+static int tdb_posix_fallocate(struct tdb_context *tdb, off_t offset,
+			       off_t len)
+{
+	ssize_t ret;
+
+	if (!tdb_adjust_offset(tdb, &offset)) {
+		return -1;
+	}
+
+	do {
+		ret = posix_fallocate(tdb->fd, offset, len);
+	} while ((ret == -1) && (errno == EINTR));
+
+	return ret;
+}
+#endif
+
 static int tdb_fstat(struct tdb_context *tdb, struct stat *buf)
 {
 	int ret;
@@ -395,6 +413,34 @@ static int tdb_expand_file(struct tdb_context *tdb, tdb_off_t size, tdb_off_t ad
 		return -1;
 	}
 
+#if HAVE_POSIX_FALLOCATE
+	ret = tdb_posix_fallocate(tdb, size, addition);
+	if (ret == 0) {
+		return 0;
+	}
+	if (ret == ENOSPC) {
+		/*
+		 * The Linux glibc (at least as of 2.24) fallback if
+		 * the file system does not support fallocate does not
+		 * reset the file size back to where it was. Also, to
+		 * me it is unclear from the posix spec of
+		 * posix_fallocate whether this is allowed or
+		 * not. Better be safe than sorry and "goto fail" but
+		 * "return -1" here, leaving the EOF pointer too
+		 * large.
+		 */
+		goto fail;
+	}
+
+	/*
+	 * Retry the "old" way. Possibly unnecessary, but looking at
+	 * our configure script there seem to be weird failure modes
+	 * for posix_fallocate. See commit 3264a98ff16de, which
+	 * probably refers to
+	 * https://sourceware.org/bugzilla/show_bug.cgi?id=1083.
+	 */
+#endif
+
 	ret = tdb_ftruncate(tdb, new_size);
 	if (ret == -1) {
 		char b = 0;
-- 
2.11.0



More information about the samba-technical mailing list