Patchset to add asynchronous open/close to master

Jeremy Allison jra at samba.org
Wed Jun 20 16:47:13 MDT 2012


On Wed, Jun 20, 2012 at 09:42:15PM +0200, Volker Lendecke wrote:
> 
> Let me try to give a technical argument: Metze's concern
> just also came to my mind. In the Terminal Server case, we
> can have multiple users on the same TCP connection. User A
> does the SMB create call, we trigger the thread. User B
> comes in and does another call. We setuid to User B. What
> happens if User A would have been allowed to create the
> file, but the kernel denies it to User B, which we did the
> setuid to?

Ok, so here is go-around number #3 :-).

It addresses the following (additional) issues.

1). The terminal server case - if the smbd process
changes uid to a user who has no rights to create
the file, then the create fails with EACESS. In
this new code any async create failing with EACESS
is retried synchronously once the thread has terminated.

It's one more go around the full create path, but
happens extremely rarely (if the create was going
to return EACESS for that user then it would already
have done so in the create path when checking for create
permissions, so this will only happen when smbd
has changed uid after the thread has been created
but not yet scheduled which is a pretty rare case).

2). A more subtle case, raised by Simo in
a phone call. It works like this:

The problem:

Create code path checks that user A
has the rights to create a file in
directory B - they do. The create
path then schedules an async create.

Thread does not get scheduled, smbd then switches
to user root. Evil user then moves directory B
and makes the previous path B a symlink to privileged
directory C. Thread is then scheduled and new file is
created in privileged directory C. Oops.

The fix:

The async open setup code now does an open(O_DIRECTORY)
on the parent directory B, instead of the $cwd "."
which would have been the directory exported as the
root of the share (as the previous iteration did).

The thread now uses openat(fd, name) where
fd is the file descriptor of parent directory
B, and name is the last component of the
original path. Even if smbd is root this
means that the new file will *always* be
created in directory B, no matter what is
done to the path in between. O_NOFOLLOW
is used in addition to the O_EXCL (although
that's not strictly necessary I think).

The fixup (if needed) is then done on the fd
returned from the openat(), by looking at an fstat
on the fd returned by the open(O_DIRECTORY).
These are guarenteed to be the correct
paths with no races possible.

I'm only posting the async open part of
the patch this time, as all the other
components are unchanged.

Cheers,

	Jeremy.

PS. There is an *evil* Linux-specific
way to cause the uid of the async open
thread to be pinned to the initial uid
it was invoked as, but I really don't
want to have to use that :-). Still it's
available..
-------------- next part --------------
diff --git a/source3/modules/vfs_aio_pthread.c b/source3/modules/vfs_aio_pthread.c
index 9b676d6..d553220 100644
--- a/source3/modules/vfs_aio_pthread.c
+++ b/source3/modules/vfs_aio_pthread.c
@@ -583,6 +583,459 @@ static int aio_pthread_suspend(struct vfs_handle_struct *handle,
 	return ret;
 }
 
+/************************************************************************
+ NB. This threadpool is shared over all instances of this VFS module,
+ as is the current jobid.
+*************************************************************************/
+
+static struct pthreadpool *open_pool;
+static int aio_pthread_open_jobid;
+
+struct aio_open_private_data {
+	struct aio_open_private_data *prev, *next;
+	int jobid;
+	int dir_fd;
+	int flags;
+	mode_t mode;
+	uint64_t mid;
+	bool in_progress;
+	const char *fname;
+	char *dname;
+	struct smbd_server_connection *sconn;
+	uid_t uid;
+	gid_t gid;
+	/* Returns. */
+	int ret_fd;
+	int ret_errno;
+};
+
+/* List of outstanding requests we have. */
+static struct aio_open_private_data *open_pd_list;
+
+/************************************************************************
+ Find the open private data by jobid.
+*************************************************************************/
+
+static struct aio_open_private_data *find_open_private_data_by_jobid(int jobid)
+{
+	struct aio_open_private_data *opd;
+
+	for (opd = open_pd_list; opd != NULL; opd = opd->next) {
+		if (opd->jobid == jobid) {
+			return opd;
+		}
+	}
+
+	return NULL;
+}
+
+/************************************************************************
+ Find the open private data by mid.
+*************************************************************************/
+
+static struct aio_open_private_data *find_open_private_data_by_mid(uint64_t mid)
+{
+	struct aio_open_private_data *opd;
+
+	for (opd = open_pd_list; opd != NULL; opd = opd->next) {
+		if (opd->mid == mid) {
+			return opd;
+		}
+	}
+
+	return NULL;
+}
+
+/************************************************************************
+ Callback when an open completes.
+*************************************************************************/
+
+static void aio_open_handle_completion(struct event_context *event_ctx,
+				struct fd_event *event,
+				uint16 flags,
+				void *p)
+{
+	struct aio_open_private_data *opd = NULL;
+	int jobid = 0;
+	int ret;
+
+	DEBUG(10, ("aio_open_handle_completion called with flags=%d\n",
+			(int)flags));
+
+	if ((flags & EVENT_FD_READ) == 0) {
+		return;
+	}
+
+	ret = pthreadpool_finished_job(open_pool, &jobid);
+	if (ret) {
+		smb_panic("aio_open_handle_completion");
+		return;
+	}
+
+	opd = find_open_private_data_by_jobid(jobid);
+	if (opd == NULL) {
+		DEBUG(1, ("aio_open_handle_completion cannot find jobid %d\n",
+			jobid));
+		smb_panic("aio_open_handle_completion - no jobid");
+		return;
+	}
+
+	DEBUG(10,("aio_open_handle_completion: jobid %d completed\n",
+		jobid ));
+
+	opd->in_progress = false;
+
+	/* Find outstanding event and reschdule. */
+	if (!schedule_deferred_open_message_smb(opd->sconn, opd->mid)) {
+		/* Outstanding event didn't exist or was
+		   cancelled. Free up the fd and throw
+		   away the result. */
+		if (opd->ret_fd != -1) {
+			close(opd->ret_fd);
+			opd->ret_fd = -1;
+		}
+		TALLOC_FREE(opd);
+	}
+}
+
+/*****************************************************************
+ The core of the async open code - the worker function. Note we
+ use the new openat() system call to avoid any problems with
+ current working directory changes.
+******************************************************************/
+
+static void aio_open_worker(void *private_data)
+{
+	struct aio_open_private_data *opd =
+		(struct aio_open_private_data *)private_data;
+	/* Force O_EXCL to deal with race condition. */
+	int oflags = opd->flags | O_EXCL;
+
+#if defined(O_NOFOLLOW)
+	oflags |= O_NOFOLLOW;
+#endif
+
+	opd->ret_fd = openat(opd->dir_fd,
+			opd->fname,
+			oflags,
+			opd->mode);
+
+	if (opd->ret_fd == -1) {
+		opd->ret_errno = errno;
+	} else {
+		/* Create was successful. */
+		opd->ret_errno = 0;
+	}
+}
+
+/************************************************************************
+ Open private data destructor.
+*************************************************************************/
+
+static int opd_destructor(struct aio_open_private_data *opd)
+{
+	if (opd->dir_fd != -1) {
+		close(opd->dir_fd);
+		opd->dir_fd = -1;
+	}
+	DLIST_REMOVE(open_pd_list, opd);
+	return 0;
+}
+
+/************************************************************************
+ Create and initialize a private data struct for async open.
+*************************************************************************/
+
+static struct aio_open_private_data *create_private_open_data(const files_struct *fsp,
+					int flags,
+					mode_t mode)
+{
+	const char *fname = NULL;
+	struct aio_open_private_data *opd = talloc_zero(NULL,
+					struct aio_open_private_data);
+
+	if (!opd) {
+		return NULL;
+	}
+
+	if (!parent_dirname(opd,
+			fsp->fsp_name->base_name,
+			&opd->dname,
+			&fname)) {
+		TALLOC_FREE(opd);
+		return NULL;
+	}
+
+	opd->jobid = aio_pthread_open_jobid++;
+	opd->dir_fd = -1;
+	opd->ret_fd = -1;
+	opd->ret_errno = EINPROGRESS;
+	opd->flags = flags;
+	opd->mode = mode;
+	opd->mid = fsp->mid;
+	opd->in_progress = true;
+	opd->uid = geteuid();
+	opd->gid = getegid();
+	opd->sconn = fsp->conn->sconn;
+	opd->fname = talloc_strdup(opd, fname);
+
+	if (opd->fname == NULL) {
+		TALLOC_FREE(opd);
+		return NULL;
+	}
+
+	/* Open the parent directory. */
+
+#if defined(O_DIRECTORY)
+	opd->dir_fd = open(opd->dname, O_RDONLY|O_DIRECTORY);
+#else
+	opd->dir_fd = open(opd->dname, O_RDONLY);
+#endif
+	if (opd->dir_fd == -1) {
+		TALLOC_FREE(opd);
+		return NULL;
+	}
+
+	DEBUG(5,("create_private_open_data: file %s in dir %s\n",
+		opd->fname,
+		opd->dname));
+
+	talloc_set_destructor(opd, opd_destructor);
+	DLIST_ADD_END(open_pd_list, opd, struct aio_open_private_data *);
+	return opd;
+}
+
+/*****************************************************************
+ Setup an async open.
+*****************************************************************/
+
+static int open_async(const files_struct *fsp,
+			int flags,
+			mode_t mode)
+{
+	struct aio_open_private_data *opd = NULL;
+	int ret;
+
+	if (!init_aio_threadpool(&open_pool,
+			aio_open_handle_completion)) {
+		return -1;
+	}
+
+	opd = create_private_open_data(fsp, flags, mode);
+	if (opd == NULL) {
+		DEBUG(10, ("open_async: Could not create private data.\n"));
+		return -1;
+	}
+
+	ret = pthreadpool_add_job(open_pool,
+				opd->jobid,
+				aio_open_worker,
+				(void *)opd);
+	if (ret) {
+		errno = ret;
+		return -1;
+	}
+
+	/* Cause the calling code to reschedule us. */
+	errno = EINTR; /* Maps to NT_STATUS_RETRY. */
+	return -1;
+}
+
+/*****************************************************************
+ If the parent directory has the SETGID bit set, ensure the opd
+ gid is that of the parent directory, not the creating process.
+*****************************************************************/
+
+static void parent_dir_setguid_adjust(struct aio_open_private_data *opd)
+{
+	struct stat st;
+
+	if (fstat(opd->dir_fd, &st) == -1) {
+		smb_panic("parent_dir_setguid_adjust: fstat failed !\n");
+		return;
+	}
+	if (st.st_mode & S_ISGID) {
+		opd->gid = st.st_gid;
+	}
+}
+
+/*****************************************************************
+ We know the file didn't exist, and now it does. If it doesn't
+ have the right ownership we hit a process uid change and must fix
+ it up.
+*****************************************************************/
+
+static void fix_file_ownership(struct aio_open_private_data *opd)
+{
+	int ret;
+	struct stat st;
+
+	if (opd->ret_fd == -1) {
+		return;
+	}
+
+	ret = fstat(opd->ret_fd, &st);
+	if (ret == -1) {
+		smb_panic("fix_file_ownership: fstat failed !\n");
+		return;
+	}
+
+	if (st.st_uid == opd->uid && st.st_gid == opd->gid) {
+		/* The common case. */
+		return;
+	}
+
+	/*
+	 * We know we created this (see O_EXCL above).
+	 * So make sure it has our uid and the correct
+	 * gid. NB. POSIX ACL inheritance doesn't affect
+	 * us if we have the correct uid and gid set
+	 * on the file.
+	 */
+
+	parent_dir_setguid_adjust(opd);
+
+	/*
+	 * If the parent had setgid then maybe
+	 * we're ok after the setgid adjust.
+	 */
+
+	if (st.st_uid == opd->uid && st.st_gid == opd->gid) {
+		return;
+	}
+
+	become_root();
+	ret = fchown(opd->ret_fd, opd->uid, opd->gid);
+	unbecome_root();
+
+	DEBUG(2,("fix_file_ownership: fchown on file %s "
+		"in dir %s to uid %u gid %u returned %d\n",
+		opd->fname,
+		opd->dname,
+		opd->uid,
+		opd->gid,
+		ret));
+
+	if (ret == -1) {
+		smb_panic("fix_file_ownership: fchown failed !\n");
+	}
+}
+
+/*****************************************************************
+ Look for a matching mid. If we find it we're rescheduled,
+ just return the completed open.
+*****************************************************************/
+
+static bool find_completed_open(files_struct *fsp,
+				int *p_fd,
+				int *p_errno)
+{
+	struct aio_open_private_data *opd;
+
+	opd = find_open_private_data_by_mid(fsp->mid);
+	if (!opd) {
+		return false;
+	}
+
+	DEBUG(5,("find_completed_open: Found existing open for "
+		"file %s\n",
+		smb_fname_str_dbg(fsp->fsp_name)));
+
+	if (opd->in_progress) {
+		/* Disaster ! This is an open
+		   timeout. Just panic. */
+		smb_panic("find_completed_open - in_progress\n");
+	}
+
+	if (opd->ret_fd == -1) {
+		bool retry_open_sync = false;
+
+		if (errno == EACCES) {
+			/*
+			 * We might have tried the
+			 * openat as another user and
+			 * been denied by mistake.
+			 */
+			retry_open_sync = true;
+		} else if (!(opd->flags & O_EXCL) &&
+				opd->ret_errno == EEXIST) {
+			/*
+			 * EEXIST and we didn't have O_EXCL in the
+			 * passed in flags - retry.
+			 */
+			retry_open_sync = true;
+		}
+
+		if (retry_open_sync) {
+			opd->ret_fd = openat(opd->dir_fd,
+					opd->fname,
+					opd->flags,
+					opd->mode);
+			if (opd->ret_fd == -1) {
+				opd->ret_errno = errno;
+			} else {
+				opd->ret_errno = 0;
+			}
+		}
+	} else {
+		/* Fix up the ownership if we have to. */
+		fix_file_ownership(opd);
+	}
+
+	*p_fd = opd->ret_fd;
+	*p_errno = opd->ret_errno;
+
+	/* Now we can free the opd. */
+	TALLOC_FREE(opd);
+	return true;
+}
+
+static int aio_open_fn(vfs_handle_struct *handle,
+			struct smb_filename *smb_fname,
+			files_struct *fsp,
+			int flags,
+			mode_t mode)
+{
+	int my_errno = 0;
+	int fd = -1;
+	bool aio_allow_open = lp_parm_bool(
+		SNUM(handle->conn), "aio_pthread", "aio open", false);
+
+	if (!aio_allow_open) {
+		/* aio opens turned off. */
+		return open(smb_fname->base_name, flags, mode);
+	}
+
+	if (!(flags & O_CREAT)) {
+		/* Only creates matter. */
+		return open(smb_fname->base_name, flags, mode);
+	}
+
+	if (VALID_STAT(fsp->fsp_name->st)) {
+		/* Only creates on a non-existent file matter. */
+		return open(smb_fname->base_name, flags, mode);
+	}
+
+	if (smb_fname->stream_name) {
+		/* Don't handle stream opens. */
+		return open(smb_fname->base_name, flags, mode);
+	}
+
+	/* See if this is a reentrant call - i.e. is this a
+	   restart of an existing open that just completed.
+	 */
+
+	if (find_completed_open(fsp,
+				&fd,
+				&my_errno)) {
+		errno = my_errno;
+		return fd;
+	}
+
+	/* Ok, it's a create call on a new file - pass it to a thread helper. */
+	return open_async(fsp, flags, mode);
+}
+
 static int aio_pthread_connect(vfs_handle_struct *handle, const char *service,
 			       const char *user)
 {
@@ -595,6 +1048,10 @@ static int aio_pthread_connect(vfs_handle_struct *handle, const char *service,
 	 *     requests and Windows clients don't use this anyway.
 	 * Essentially we want this to be unlimited unless smb.conf
 	 * says different.
+	 *
+	 * NB. Also note - this is the maximum number of threads
+	 * used in a threadpool across *ALL* instances of this module
+	 * loaded into an smbd.
 	 *********************************************************************/
 	aio_pending_size = lp_parm_int(
 		SNUM(handle->conn), "aio_pthread", "aio num threads", 100);
@@ -603,6 +1060,7 @@ static int aio_pthread_connect(vfs_handle_struct *handle, const char *service,
 
 static struct vfs_fn_pointers vfs_aio_pthread_fns = {
 	.connect_fn = aio_pthread_connect,
+	.open_fn = aio_open_fn,
 	.aio_read_fn = aio_pthread_read,
 	.aio_write_fn = aio_pthread_write,
 	.aio_return_fn = aio_pthread_return_fn,


More information about the samba-technical mailing list