PATCH/RFC: Another stab at the Cygwin hang problem

Sat Jul 12 20:52:59 EST 2003

On Wed, Jul 09, 2003 at 06:47:35AM -0400, Tillman, James wrote:
> 
> 
> > -----Original Message-----
> > From: jw schultz [mailto:jw at pegasys.ws]
> > Sent: Wednesday, July 09, 2003 5:59 AM
> > To: rsync at lists.samba.org
> > Subject: Re: PATCH/RFC: Another stab at the Cygwin hang problem
> > 
> > 
> > > I can't quite place why but my instincts inform me that you
> > > have latched onto something.  Some sort of one character
> > > buffering error in the io libraries under cygwin.  Most
> > > likely in the Windows libs.
> > > 
> > > Well, we have two reports of this fixing the rsync hang
> > > problem when signals failed.  I'd like a little more testing
> > > before mainlining it.
> > 
> > Nope!  This is a no-go.  It intermittently produces
> > 
> > 	error (10) -- error in socket IO
> > 
> > on both network and local transfers.
> > 
> 
> I guess I'd better double check my processes to make sure that I'm getting a
> satisfactory success rate on my own servers.  If I see any clues, I'll
> report them here.  Any hope for a fix, or does this look like an inherent
> problem in the method being used?

It looks like the method is fairly sound.  The problem seems
to primarily be in dealing with the child termination.

 	io_set_error_fd(-1);
-	kill(pid, SIGUSR2);
-	wait_process(pid, &status);
+	write(cleanup_pipe[1], ".", 1);
+	if (waitpid(pid, &status, 0) != pid) {
+		rprintf(FERROR,"cleanup in do_recv failed\n");
+		exit_cleanup(RERR_SOCKETIO);
+	}
 	return status;

There is a huge window between the write() and the return of
waitpid() that, depending on scheduling and signal delivery,
allows the child pid to be reaped by the SIGCHLD handler.  That
results in this waitpid() returning -1 with errno of ECHILD.
EINTR would also be possible.  The timing dependencies
account for the intermittency of the error.

I've attached an altered patch.  I've only dealt with this
one location, which produced errors doing an ssh pull.  I
haven't addressed the local transfer errors, but I suspect
that they derive from this waitpid error.  Further testing will
still be needed to ensure that ssh push and rsyncd usage are
unbroken.  This really needs testing in Cygwin, which I don't
have.  If it takes care of the Cygwin hang then we can
polish it.  There remains the issue of an error status
when the only failure is termination.

-- 
________________________________________________________________
	J.W. Schultz            Pegasystems Technologies
	email address:		jw at pegasys.ws

		Remember Cernan and Schmitt
-------------- next part --------------
? main.2.5.5
Index: cleanup.c
===================================================================
RCS file: /data/cvs/rsync/cleanup.c,v
retrieving revision 1.18
diff -u -r1.18 cleanup.c

--- cleanup.c	21 Mar 2003 23:43:50 -0000	1.18
+++ cleanup.c	12 Jul 2003 10:31:04 -0000
@@ -96,7 +96,6 @@
 	inside_cleanup++;
 
 	signal(SIGUSR1, SIG_IGN);
-	signal(SIGUSR2, SIG_IGN);
 
 	if (verbose > 3)
 		rprintf(FINFO,"_exit_cleanup(code=%d, file=%s, line=%d): entered\n", 
Index: main.c
===================================================================
RCS file: /data/cvs/rsync/main.c,v
retrieving revision 1.169
diff -u -r1.169 main.c
--- main.c	4 Jul 2003 15:11:46 -0000	1.169
+++ main.c	12 Jul 2003 10:31:04 -0000
@@ -391,6 +391,7 @@
 	int status=0;
 	int recv_pipe[2];
 	int error_pipe[2];
+	int cleanup_pipe[2];
 	extern int preserve_hard_links;
 	extern int delete_after;
 	extern int recurse;
@@ -417,11 +418,19 @@
 		exit_cleanup(RERR_SOCKETIO);
 	}
 
+	if (pipe(cleanup_pipe) < 0) {
+		rprintf(FERROR,"cleanup pipe failed in do_recv\n");
+		exit_cleanup(RERR_SOCKETIO);
+	}
+  
 	io_flush();
 
 	if ((pid=do_fork()) == 0) {
+		char tmp;
+
 		close(recv_pipe[0]);
 		close(error_pipe[0]);
+		close(cleanup_pipe[1]);
 		if (f_in != f_out) close(f_out);
 
 		/* we can't let two processes write to the socket at one time */
@@ -437,15 +446,21 @@
 		write_int(recv_pipe[1],1);
 		close(recv_pipe[1]);
 		io_flush();
-		/* finally we go to sleep until our parent kills us
-		   with a USR2 signal. We sleep for a short time as on
-		   some OSes a signal won't interrupt a sleep! */
-		while (msleep(20))
-			;
+		do {
+			status = read(cleanup_pipe[0], &tmp, 1);
+		} while (status == -1 && errno == EINTR);
+		if (status != 1) {
+			rprintf(FERROR,"cleanup read returned %d in do_recv\n", status);
+			if (status == -1)
+				rprintf(FERROR,"with errno %d (%s)\n", errno, strerror(errno));
+			_exit(RERR_PARTIAL);
+		}
+		_exit(0);
 	}
 
 	close(recv_pipe[1]);
 	close(error_pipe[1]);
+	close(cleanup_pipe[0]);
 	if (f_in != f_out) close(f_in);
 
 	io_start_buffering(f_out);
@@ -463,7 +478,7 @@
 	io_flush();
 
 	io_set_error_fd(-1);
-	kill(pid, SIGUSR2);
+	write(cleanup_pipe[1], ".", 1);
 	wait_process(pid, &status);
 	return status;
 }
@@ -881,12 +896,6 @@
 	exit_cleanup(RERR_SIGNAL);
 }
 
-static RETSIGTYPE sigusr2_handler(int UNUSED(val)) {
-	extern int log_got_error;
-	if (log_got_error) _exit(RERR_PARTIAL);
-	_exit(0);
-}
-
 static RETSIGTYPE sigchld_handler(int UNUSED(val)) {
 #ifdef WNOHANG
 	int cnt, status;
@@ -976,7 +985,6 @@
 	orig_argv = argv;
 
 	signal(SIGUSR1, sigusr1_handler);
-	signal(SIGUSR2, sigusr2_handler);
 	signal(SIGCHLD, sigchld_handler);
 #ifdef MAINTAINER_MODE
 	signal(SIGSEGV, rsync_panic_handler);