Fix for offline child showstopper in 3.0.25b

Jeremy Allison jra at samba.org
Thu Jun 21 06:52:31 GMT 2007


On Wed, Jun 20, 2007 at 11:01:46PM -0700, Jeremy Allison wrote:
> On Wed, Jun 20, 2007 at 08:20:41PM -0700, Jeremy Allison wrote:
> > I think this is the correct fix. Call me for details.
> 
> Damn, hang on a min - I've found fault with this....
> Still working on it.

Ok - let's try this. My mistake was adding the
extra schedule_async_request() to the async_reply_recv()
call in the !success code branch.

The correct thing to do is note the pid of the
child we're waiting on (an extra child_pid member
of the struct winbindd_async_request) and on
any read/write/timeout fail *always* call :

if (state->child_pid == state->child->pid) {
	kill(state->child_pid, SIGTERM);
	winbind_child_died(state->child_pid);
}

At this point we know we can't recover and must
kill the child anyway - we use the pid of the
child we were waiting on to detect a signal
having already been received and killed and
restarted the child (in this case
state->child_pid != state->child->pid so we
don't do the kill/winbindd_child_died calls).

This compiles but it's too late (midnight)
to test. Let's give this a good working over
tomorrow, but I think this should work for
3.0.25b.

Jeremy.
-------------- next part --------------
Index: nsswitch/winbindd_dual.c
===================================================================
--- nsswitch/winbindd_dual.c	(revision 23557)
+++ nsswitch/winbindd_dual.c	(working copy)
@@ -97,6 +97,8 @@
 	struct winbindd_response *response;
 	void (*continuation)(void *private_data, BOOL success);
 	struct timed_event *reply_timeout_event;
+	pid_t child_pid; /* pid of the child we're waiting on. Used to detect
+			    a restart of the child (child->pid != child_pid). */
 	void *private_data;
 };
 
@@ -129,6 +131,7 @@
 	state->response = response;
 	state->continuation = continuation;
 	state->private_data = private_data;
+	state->child_pid = child->pid;
 
 	DLIST_ADD_END(child->requests, state, struct winbindd_async_request *);
 
@@ -174,36 +177,53 @@
 	struct winbindd_async_request *state =
 		talloc_get_type_abort(private_data, struct winbindd_async_request);
 
+	DEBUG(0,("async_request_timeout_handler: child pid %u is not responding. "
+		"Closing connection to it.\n",
+		state->child_pid ));
+
 	/* Deal with the reply - set to error. */
-
 	async_reply_recv(private_data, False);
+}
 
-	DEBUG(0,("async_request_timeout_handler: child pid %u is not responding. "
-		"Closing connection to it.\n",
-		state->child->pid ));
+/**************************************************************
+ Common function called on both async send and recv fail.
+ Cleans up the child and schedules the next request.
+**************************************************************/
 
-	/* Send kill signal to child. */
-	kill(state->child->pid, SIGTERM);
+static void async_request_fail(struct winbindd_async_request *state)
+{
+	DLIST_REMOVE(state->child->requests, state);
 
-	/* 
-	 * Close the socket to the child.
-	 */
+	if (state->reply_timeout_event) {
+		TALLOC_FREE(state->reply_timeout_event);
+	}
 
-	winbind_child_died(state->child->pid);
+	SMB_ASSERT(state->child_pid != (pid_t)0);
+
+	/* If not already reaped, send kill signal to child. */
+	if (state->child->pid == state->child_pid) {
+		kill(state->child_pid, SIGTERM);
+
+		/* 
+		 * Close the socket to the child.
+		 */
+		winbind_child_died(state->child_pid);
+	}
+
+	state->response->length = sizeof(struct winbindd_response);
+	state->response->result = WINBINDD_ERROR;
+	state->continuation(state->private_data, False);
 }
 
 static void async_request_sent(void *private_data_data, BOOL success)
 {
-	uint32_t timeout = 30;
 	struct winbindd_async_request *state =
 		talloc_get_type_abort(private_data_data, struct winbindd_async_request);
 
 	if (!success) {
-		DEBUG(5, ("Could not send async request\n"));
-
-		state->response->length = sizeof(struct winbindd_response);
-		state->response->result = WINBINDD_ERROR;
-		state->continuation(state->private_data, False);
+		DEBUG(5, ("Could not send async request to child pid %u\n",
+			(unsigned int)state->child_pid ));
+		async_request_fail(state);
 		return;
 	}
 
@@ -215,25 +235,14 @@
 			 async_reply_recv, state);
 
 	/* 
-	 * Normal timeouts are 30s, but auth requests may take a long
-	 * time to timeout.
-	 */
-
-	if (state->request->cmd == WINBINDD_PAM_AUTH ||
-			state->request->cmd == WINBINDD_PAM_AUTH_CRAP ) {
-
-		timeout = 300;
-	}
-
-	/* 
-	 * Set up a timeout of 30 seconds for the response.
+	 * Set up a timeout of 300 seconds for the response.
 	 * If we don't get it close the child socket and
 	 * report failure.
 	 */
 
 	state->reply_timeout_event = event_add_timed(winbind_event_context(),
 							NULL,
-							timeval_current_ofs(timeout,0),
+							timeval_current_ofs(300,0),
 							"async_request_timeout",
 							async_request_timeout_handler,
 							state);
@@ -255,20 +264,18 @@
 	state->response->length = sizeof(struct winbindd_response);
 
 	if (!success) {
-		DEBUG(5, ("Could not receive async reply\n"));
+		DEBUG(5, ("Could not receive async reply from child pid %u\n",
+			(unsigned int)state->child_pid ));
 
-		cache_cleanup_response(child->pid);
-		DLIST_REMOVE(child->requests, state);
-
-		state->response->result = WINBINDD_ERROR;
-		state->continuation(state->private_data, False);
+		cache_cleanup_response(state->child_pid);
+		async_request_fail(state);
 		return;
 	}
 
-	SMB_ASSERT(cache_retrieve_response(child->pid,
+	SMB_ASSERT(cache_retrieve_response(state->child_pid,
 					   state->response));
 
-	cache_cleanup_response(child->pid);
+	cache_cleanup_response(state->child_pid);
 	
 	DLIST_REMOVE(child->requests, state);
 
@@ -517,7 +524,7 @@
 	}
 
 	if (child == NULL) {
-		DEBUG(0, ("Unknown child %d died!\n", pid));
+		DEBUG(5, ("Already reaped child %u died\n", (unsigned int)pid));
 		return;
 	}
 


More information about the samba-technical mailing list