[SCM] Samba Shared Repository - branch master updated - release-4-0-0alpha6-1070-g8ec9903

Tim Prouty tprouty at samba.org
Sun Feb 22 01:38:42 GMT 2009


The branch, master has been updated
       via  8ec9903426ec4e559df8ac8306a8ebcdf0706176 (commit)
       via  0dcfa9ce1baa9f2074a002fdb5c8b88cc5db28db (commit)
      from  1ff9696306894c136015f83456e4c6e039e31e26 (commit)

http://gitweb.samba.org/?p=samba.git;a=shortlog;h=master


- Log -----------------------------------------------------------------
commit 8ec9903426ec4e559df8ac8306a8ebcdf0706176
Author: Tim Prouty <tprouty at samba.org>
Date:   Fri Feb 20 13:27:39 2009 -0800

    s3 OneFS: Add an atomic sendfile implementation

commit 0dcfa9ce1baa9f2074a002fdb5c8b88cc5db28db
Author: Tim Prouty <tprouty at samba.org>
Date:   Fri Feb 20 13:28:36 2009 -0800

    s3: If sendfile returns 0 bytes read, fall back to the normal read path
    
    This allows sendfile implementations that are atomic to avoid having
    to send zeros or kill the client connection on a short read (usually
    the file was truncated).

-----------------------------------------------------------------------

Summary of changes:
 source3/modules/onefs.h        |   10 ++
 source3/modules/onefs_system.c |  257 ++++++++++++++++++++++++++++++++++++++++
 source3/modules/vfs_onefs.c    |   15 +++
 source3/smbd/reply.c           |   24 ++++
 4 files changed, 306 insertions(+), 0 deletions(-)


Changeset truncated at 500 lines:

diff --git a/source3/modules/onefs.h b/source3/modules/onefs.h
index ea452a4..a70664b 100644
--- a/source3/modules/onefs.h
+++ b/source3/modules/onefs.h
@@ -47,6 +47,8 @@ enum onefs_acl_wire_format
 #define PARM_ATIME_STATIC_DEFAULT NULL
 #define PARM_ATIME_SLOP		"atime now slop"
 #define PARM_ATIME_SLOP_DEFAULT	 0
+#define PARM_ATOMIC_SENDFILE "atomic sendfile"
+#define PARM_ATOMIC_SENDFILE_DEFAULT true
 #define PARM_CREATOR_OWNER_GETS_FULL_CONTROL "creator owner gets full control"
 #define PARM_CREATOR_OWNER_GETS_FULL_CONTROL_DEFAULT true
 #define PARM_CTIME_NOW		"ctime now files"
@@ -63,6 +65,10 @@ enum onefs_acl_wire_format
 #define PARM_MTIME_SLOP_DEFAULT	0
 #define PARM_USE_READDIRPLUS "use readdirplus"
 #define PARM_USE_READDIRPLUS_DEFAULT true
+#define PARM_SENDFILE_LARGE_READS "sendfile large reads"
+#define PARM_SENDFILE_LARGE_READS_DEFAULT false
+#define PARM_SENDFILE_SAFE "sendfile safe"
+#define PARM_SENDFILE_SAFE_DEFAULT true
 #define PARM_SIMPLE_FILE_SHARING_COMPATIBILITY_MODE "simple file sharing compatibility mode"
 #define PARM_SIMPLE_FILE_SHARING_COMPATIBILITY_MODE_DEFAULT false
 #define PARM_UNMAPPABLE_SIDS_DENY_EVERYONE "unmappable sids deny everyone"
@@ -254,6 +260,10 @@ int onefs_sys_create_file(connection_struct *conn,
 			  uint32_t ntfs_flags,
 			  int *granted_oplock);
 
+ssize_t onefs_sys_sendfile(connection_struct *conn, int tofd, int fromfd,
+			   const DATA_BLOB *header, SMB_OFF_T offset,
+			   size_t count);
+
 ssize_t onefs_sys_recvfile(int fromfd, int tofd, SMB_OFF_T offset,
 			   size_t count);
 
diff --git a/source3/modules/onefs_system.c b/source3/modules/onefs_system.c
index 3a86b4b..1080289 100644
--- a/source3/modules/onefs_system.c
+++ b/source3/modules/onefs_system.c
@@ -178,6 +178,263 @@ int onefs_sys_create_file(connection_struct *conn,
 }
 
 /**
+ * FreeBSD based sendfile implementation that allows for atomic semantics.
+ *
+ * Sends header (may be NULL) then count file bytes from offset; SF_ATOMIC is
+ * passed when atomic is set.  Returns count + header length on success, -1
+ * on error, or 0 when an atomic sendfile wrote nothing.
+ */
+static ssize_t onefs_sys_do_sendfile(int tofd, int fromfd,
+    const DATA_BLOB *header, SMB_OFF_T offset, size_t count, bool atomic)
+{
+	size_t total=0;
+	struct sf_hdtr hdr;
+	struct iovec hdtrl;
+	size_t hdr_len = 0;
+	int flags = 0;
+
+	if (atomic) {
+		flags = SF_ATOMIC;
+	}
+
+	hdr.headers = &hdtrl;
+	hdr.hdr_cnt = 1;
+	hdr.trailers = NULL;
+	hdr.trl_cnt = 0;
+
+	/* Set up the header iovec. */
+	if (header) {
+		hdtrl.iov_base = header->data;
+		hdtrl.iov_len = hdr_len = header->length;
+	} else {
+		hdtrl.iov_base = NULL;
+		hdtrl.iov_len = 0;
+	}
+
+	total = count;
+	while (total + hdtrl.iov_len) {
+		SMB_OFF_T nwritten;
+		int ret;
+
+		/*
+		 * FreeBSD sendfile returns 0 on success, -1 on error.
+		 * Remember, the tofd and fromfd are reversed..... :-).
+		 * nwritten includes the header data sent.
+		 */
+		do {
+			ret = sendfile(fromfd, tofd, offset, total, &hdr,
+				       &nwritten, flags);
+		} while (ret == -1 && errno == EINTR);
+
+		/* On error we're done. */
+		if (ret == -1) {
+			return -1;
+		}
+
+		/*
+		 * For an ATOMIC sendfile, nwritten == 0 doesn't necessarily
+		 * indicate an error.  It could mean count is greater than
+		 * what sendfile can handle atomically (usually 64K) or that
+		 * there was a short read due to the file being truncated.
+		 */
+		if (nwritten == 0) {
+			return atomic ? 0 : -1;
+		}
+
+		/* An atomic sendfile should never send partial data! */
+		if (atomic && nwritten != total + hdtrl.iov_len) {
+			DEBUG(0,("Atomic sendfile() sent partial data: "
+				 "%lld of %llu\n", (long long)nwritten,
+				 (unsigned long long)(total + hdtrl.iov_len)));
+			return -1;
+		}
+
+		/*
+		 * If this was a short (signal interrupted) write we may need
+		 * to subtract it from the header data, or null out the header
+		 * data altogether if we wrote more than hdtrl.iov_len bytes.
+		 * We change nwritten to be the number of file bytes written.
+		 */
+		if (hdtrl.iov_base && hdtrl.iov_len) {
+			if (nwritten >= hdtrl.iov_len) {
+				nwritten -= hdtrl.iov_len;
+				hdtrl.iov_base = NULL;
+				hdtrl.iov_len = 0;
+			} else {
+				hdtrl.iov_base =
+				    (caddr_t)hdtrl.iov_base + nwritten;
+				hdtrl.iov_len -= nwritten;
+				nwritten = 0;
+			}
+		}
+		total -= nwritten;
+		offset += nwritten;
+	}
+	return count + hdr_len;
+}
+
+/**
+ * Handles the subtleties of using sendfile with CIFS.
+ *
+ * Returns the number of bytes sent (header plus file data) on success, 0
+ * when nothing was written to the socket and the caller should fall back to
+ * the standard read path, or -1 on error.  header may be NULL.
+ */
+ssize_t onefs_sys_sendfile(connection_struct *conn, int tofd, int fromfd,
+			   const DATA_BLOB *header, SMB_OFF_T offset,
+			   size_t count)
+{
+	ssize_t ret = 0;
+	bool atomic = lp_parm_bool(SNUM(conn), PARM_ONEFS_TYPE,
+				   PARM_ATOMIC_SENDFILE,
+				   PARM_ATOMIC_SENDFILE_DEFAULT);
+
+	/* Try the sendfile */
+	ret = onefs_sys_do_sendfile(tofd, fromfd, header, offset, count,
+				    atomic);
+
+	/* If the sendfile wasn't atomic, we're done. */
+	if (!atomic) {
+		DEBUG(10, ("non-atomic sendfile read %ld bytes\n", (long)ret));
+		return ret;
+	}
+
+	/*
+	 * Atomic sendfile takes care to not write anything to the socket
+	 * until all of the requested bytes have been read from the file.
+	 * There are two atomic cases that need to be handled.
+	 *
+	 *  1. The file was truncated causing less data to be read than was
+	 *     requested.  In this case, we return back to the caller to
+	 *     indicate 0 bytes were written to the socket.  This should
+	 *     prompt the caller to fall back to the standard read path: read
+	 *     the data, create a header that indicates how many bytes were
+	 *     actually read, and send the header/data back to the client.
+	 *
+	 *     This saves us from standard sendfile behavior of sending a
+	 *     header promising more data than will actually be sent.  The
+	 *     only two options are to close the socket and kill the client
+	 *     connection, or write a bunch of 0s.  Closing the client
+	 *     connection is bad because there could actually be multiple
+	 *     sessions multiplexed from the same client that are all dropped
+	 *     because of a truncate.  Writing the remaining data as 0s also
+	 *     isn't good, because the client will have an incorrect version
+	 *     of the file.  If the file is written back to the server, the 0s
+	 *     will be written back.  Fortunately, atomic sendfile allows us
+	 *     to avoid making this choice in most cases.
+	 *
+	 *  2. One downside of atomic sendfile, is that there is a limit on
+	 *     the number of bytes that can be sent atomically.  The kernel
+	 *     has a limited amount of mbuf space that it can read file data
+	 *     into without exhausting the system's mbufs, so a buffer of
+	 *     length xfsize is used.  The xfsize at the time of writing this
+	 *     is 64K.  xfsize bytes are read from the file, and subsequently
+	 *     written to the socket.  This makes it impossible to do the
+	 *     sendfile atomically for a byte count > xfsize.
+	 *
+	 *     To cope with large requests, atomic sendfile returns -1 with
+	 *     errno set to E2BIG.  Since windows maxes out at 64K writes,
+	 *     this is currently only a concern with non-windows clients.
+	 *     Posix extensions allow the full 24bit bytecount field to be
+	 *     used in ReadAndX, and clients such as smbclient and the linux
+	 *     cifs client can request up to 16MB reads!  There are a few
+	 *     options for handling large sendfile requests.
+	 *
+	 *	a. Fall back to the standard read path.  This is unacceptable
+	 *	   because it would require prohibitively large mallocs.
+	 *
+	 *	b. Fall back to using samba's fake_send_file which emulates
+	 *	   the kernel sendfile in userspace.  This still has the same
+	 *	   problem of sending the header before all of the data has
+	 *	   been read, so it doesn't buy us anything, and has worse
+	 *	   performance than the kernel's zero-copy sendfile.
+	 *
+	 *	c. Use non-atomic sendfile syscall to attempt a zero copy
+	 *	   read, and hope that there isn't a short read due to
+	 *	   truncation.  In the case of a short read, there are two
+	 *	   options:
+	 *
+	 *	    1. Kill the client connection
+	 *
+	 *	    2. Write zeros to the socket for the remaining bytes
+	 *	       promised in the header.
+	 *
+	 *	   It is safer from a data corruption perspective to kill the
+	 *	   client connection, so this is our default behavior, but if
+	 *	   this causes problems this can be configured to write zeros
+	 *	   via smb.conf.
+	 */
+
+	/* Handle case 1: short read -> truncated file. */
+	if (ret == 0) {
+		return ret;
+	}
+
+	/* Handle case 2: large read. */
+	if (ret == -1 && errno == E2BIG) {
+
+		if (!lp_parm_bool(SNUM(conn), PARM_ONEFS_TYPE,
+				 PARM_SENDFILE_LARGE_READS,
+				 PARM_SENDFILE_LARGE_READS_DEFAULT)) {
+			DEBUG(3, ("Not attempting non-atomic large sendfile: "
+				  "%lu bytes\n", (unsigned long)count));
+			return 0;
+		}
+
+		if (count < 0x10000) {
+			DEBUG(0, ("Count < 2^16 and E2BIG was returned! %lu\n",
+				  (unsigned long)count));
+		}
+
+		DEBUG(10, ("attempting non-atomic large sendfile: %lu bytes\n",
+			   (unsigned long)count));
+
+		/* Try a non-atomic sendfile. */
+		ret = onefs_sys_do_sendfile(tofd, fromfd, header, offset,
+					    count, false);
+		/* Real error: kill the client connection. */
+		if (ret == -1) {
+			DEBUG(1, ("error on non-atomic large sendfile "
+				  "(%lu bytes): %s\n", (unsigned long)count,
+				  strerror(errno)));
+			return ret;
+		}
+
+		/* Short read: kill the client connection. */
+		if ((size_t)ret != count + (header ? header->length : 0)) {
+			DEBUG(1, ("short read on non-atomic large sendfile "
+				  "(%ld of %lu bytes): %s\n", (long)ret,
+				  (unsigned long)count, strerror(errno)));
+
+			/*
+			 * Returning ret here would cause us to drop into the
+			 * codepath that calls sendfile_short_send, which
+			 * sends the client a bunch of zeros instead.
+			 * Returning -1 kills the connection.
+			 */
+			if (lp_parm_bool(SNUM(conn), PARM_ONEFS_TYPE,
+				PARM_SENDFILE_SAFE,
+				PARM_SENDFILE_SAFE_DEFAULT)) {
+				return -1;
+			}
+
+			return ret;
+		}
+
+		DEBUG(10, ("non-atomic large sendfile successful\n"));
+	}
+
+	/* There was error in the atomic sendfile. */
+	if (ret == -1) {
+		DEBUG(1, ("error on %s sendfile (%lu bytes): %s\n",
+			  atomic ? "atomic" : "non-atomic",
+			  (unsigned long)count, strerror(errno)));
+	}
+
+	return ret;
+}
+
+/**
  * Only talloc the spill buffer once (reallocing when necessary).
  */
 static char *get_spill_buffer(size_t new_count)
diff --git a/source3/modules/vfs_onefs.c b/source3/modules/vfs_onefs.c
index f0c6a9d..60c2c97 100644
--- a/source3/modules/vfs_onefs.c
+++ b/source3/modules/vfs_onefs.c
@@ -156,6 +156,19 @@ static int onefs_open(vfs_handle_struct *handle, const char *fname,
 	return SMB_VFS_NEXT_OPEN(handle, fname, fsp, flags, mode);
 }
 
+/* VFS sendfile op: forwards to onefs_sys_sendfile() inside the profile macros. */
+static ssize_t onefs_sendfile(vfs_handle_struct *handle, int tofd,
+			      files_struct *fromfsp, const DATA_BLOB *header,
+			      SMB_OFF_T offset, size_t count)
+{
+	ssize_t nsent;	/* whatever onefs_sys_sendfile() reports */
+	START_PROFILE_BYTES(syscall_sendfile, count);
+	nsent = onefs_sys_sendfile(handle->conn, tofd, fromfsp->fh->fd,
+				   header, offset, count);
+	END_PROFILE(syscall_sendfile);
+	return nsent;
+}
+
 static ssize_t onefs_recvfile(vfs_handle_struct *handle, int fromfd,
 			      files_struct *tofsp, SMB_OFF_T offset,
 			      size_t count)
@@ -340,6 +353,8 @@ static vfs_op_tuple onefs_ops[] = {
 	 SMB_VFS_LAYER_OPAQUE},
 	{SMB_VFS_OP(onefs_close), SMB_VFS_OP_CLOSE,
 	 SMB_VFS_LAYER_TRANSPARENT},
+	{SMB_VFS_OP(onefs_sendfile), SMB_VFS_OP_SENDFILE,
+	 SMB_VFS_LAYER_OPAQUE},
 	{SMB_VFS_OP(onefs_recvfile), SMB_VFS_OP_RECVFILE,
 	 SMB_VFS_LAYER_OPAQUE},
 	{SMB_VFS_OP(onefs_rename), SMB_VFS_OP_RENAME,
diff --git a/source3/smbd/reply.c b/source3/smbd/reply.c
index 457f941..b30ef23 100644
--- a/source3/smbd/reply.c
+++ b/source3/smbd/reply.c
@@ -2788,6 +2788,18 @@ static void send_file_readbraw(connection_struct *conn,
 			DEBUG(0,("send_file_readbraw: sendfile failed for file %s (%s). Terminating\n",
 				fsp->fsp_name, strerror(errno) ));
 			exit_server_cleanly("send_file_readbraw sendfile failed");
+		} else if (sendfile_read == 0) {
+			/*
+			 * Some sendfile implementations return 0 to indicate
+			 * that there was a short read, but nothing was
+			 * actually written to the socket.  In this case,
+			 * fall back to the normal read path so the header gets
+			 * the correct byte count.
+			 */
+			DEBUG(3, ("send_file_readbraw: sendfile sent zero "
+				  "bytes falling back to the normal read: "
+				  "%s\n", fsp->fsp_name));
+			goto normal_readbraw;
 		}
 
 		/* Deal with possible short send. */
@@ -3284,6 +3296,18 @@ static void send_file_readX(connection_struct *conn, struct smb_request *req,
 			DEBUG(0,("send_file_readX: sendfile failed for file %s (%s). Terminating\n",
 				fsp->fsp_name, strerror(errno) ));
 			exit_server_cleanly("send_file_readX sendfile failed");
+		} else if (nread == 0) {
+			/*
+			 * Some sendfile implementations return 0 to indicate
+			 * that there was a short read, but nothing was
+			 * actually written to the socket.  In this case,
+			 * fall back to the normal read path so the header gets
+			 * the correct byte count.
+			 */
+			DEBUG(3, ("send_file_readX: sendfile sent zero bytes "
+				  "falling back to the normal read: %s\n",
+				  fsp->fsp_name));
+			goto normal_read;
 		}
 
 		DEBUG( 3, ( "send_file_readX: sendfile fnum=%d max=%d nread=%d\n",


-- 
Samba Shared Repository


More information about the samba-cvs mailing list