Different problem (Re: [distcc] Problems with distcc hanging on large compiles (Patch not effective))

Hien D. Ngo hien at moses.xp.com
Thu Aug 29 18:21:00 GMT 2002


As luck would have it, two compile sessions hung.  The backtrace info for the
other side is not very useful, though.

I'll try the HAVE_SENDFILE recompile and report back.

Hien

=======
distccd
=======

build01 ngoh> netstat -an | grep 4200
tcp        0      0 0.0.0.0:4200            0.0.0.0:*               LISTEN
tcp    71742      0 172.19.21.37:4200       192.168.0.252:54522     ESTABLISHED
build01 ngoh> lsof -i:54522
COMMAND   PID USER   FD   TYPE   DEVICE SIZE NODE NAME
distccd 11496 ngoh    5u  IPv4 45861285       TCP build01.foo.com:4200->bldmaster.foo.com:54522 (ESTABLISHED)
build01 ngoh> ps auxwww | grep 11496
ngoh     11496  0.0  0.0  1688  768 ?        SN   07:35   0:00 /usr/local/utils/bin/distccd --verbose --log-file=/tmp/distcc.log --daemon
ngoh     15759  0.0  0.0  1476  464 pts/1    S    22:33   0:00 grep 11496
build01 ngoh> /usr/local/gdb-5.1.1/bin/gdb /usr/local/utils/bin/distccd 11496
GNU gdb 5.1.1
Copyright 2002 Free Software Foundation, Inc.
GDB is free software, covered by the GNU General Public License, and you are
welcome to change it and/or distribute copies of it under certain
conditions.
Type "show copying" to see the conditions.
There is absolutely no warranty for GDB.  Type "show warranty" for details.
This GDB was configured as "i686-pc-linux-gnu"...
/home/ngoh/11496: No such file or directory.
Attaching to program: /usr/local/utils/bin/distccd, process 11496
Reading symbols from /lib/libnsl.so.1...done.
Loaded symbols for /lib/libnsl.so.1
Reading symbols from /lib/i686/libc.so.6...done.
Loaded symbols for /lib/i686/libc.so.6
Reading symbols from /lib/ld-linux.so.2...done.
Loaded symbols for /lib/ld-linux.so.2
Reading symbols from /lib/libnss_files.so.2...done.
Loaded symbols for /lib/libnss_files.so.2
Reading symbols from /lib/libnss_nisplus.so.2...done.
Loaded symbols for /lib/libnss_nisplus.so.2
Reading symbols from /lib/libnss_dns.so.2...done.
Loaded symbols for /lib/libnss_dns.so.2
Reading symbols from /lib/libresolv.so.2...done.
Loaded symbols for /lib/libresolv.so.2
0x40127474 in __libc_open () from /lib/i686/libc.so.6
(gdb) backtrace full
#0  0x40127474 in __libc_open () from /lib/i686/libc.so.6
No locals.
#1  0x00000000 in ?? ()
No symbol table info available.


======
distcc
======

bldmaster ngoh> netstat -an | grep 54522
tcp        0  53576 192.168.0.252:54522     172.19.21.37:4200       ESTABLISHED
bldmaster ngoh> lsof -i:54522
COMMAND   PID USER   FD   TYPE  DEVICE SIZE NODE NAME
distcc  24229 ngoh    5u  IPv4 7385721       TCP bldmaster.foo.com:54522->build01.foo.com:4200 (ESTABLISHED)
bldmaster ngoh> ps auxwww | grep 24229
ngoh       541  0.0  0.3  9980 7932 pts/11   T    22:39   0:00
/usr/local/gdb-5.1.1/bin/gdb /usr/local/utils/bin/distcc 24229
ngoh     24229  0.0  0.0  1680  740 pts/9    T    07:35   0:00
/usr/local/utils/bin/distcc g++ -fPIC -g -O -Wall -pipe -pthread
-Wno-non-template-friend -I./shadow/linux -I../../tsquant/linux.bld -I.
-I/local/scratch/ngoh/ver/hdr/shadow/linux -I/local/scratch/ngoh/ver/hdr
-I/local/scratch/ngoh/ver/hdr/shadow/linux -I/local/scratch/ngoh/ver/hdr
-DRW_NO_STL -ftemplate-depth-50 -c -o
/local/scratch/ngoh/ccache/tmp.hash.24183.o
/local/scratch/ngoh/ccache/tmp.stdout.24183.i
ngoh      1308  0.0  0.0  1740  604 pts/11   S    22:59   0:00 grep 24229


(gdb) backtrace full
#0  0x40134949 in sendfile () from /lib/i686/libc.so.6
No locals.
#1  0x00079538 in ?? ()
No symbol table info available.
#2  0x0804a5c5 in dcc_pump_sendfile (ofd=5, ifd=6, size=496952) at io.c:114
        ofd = 5
        ifd = 496952
        size = 5
#3  0x0804bf3b in dcc_x_file (ofd=5,
    fname=0x804f660 "/local/scratch/ngoh/ccache/tmp.stdout.24183.i",
    token=0x804c9e8 "DOTI", size_out=0x0) at bulk.c:128
        ofd = 5
        fname = 0x804f660 "/local/scratch/ngoh/ccache/tmp.stdout.24183.i"
        token = 0x804c9e8 "DOTI"
        ifd = 6
        f_size = 496952
#4  0x080491d9 in dcc_compile_remote (argv=0x804f3d8,
    cpp_fname=0x804f660 "/local/scratch/ngoh/ccache/tmp.stdout.24183.i",
    output_fname=0xbfffeba1 "/local/scratch/ngoh/ccache/tmp.hash.24183.o",
    cpp_pid=0, host=0x804f440, status=0xbfffd4c4) at distcc.c:173
        argv = (char **) 0x804f3d8
        cpp_fname = 0x1e97a <Address 0x1e97a out of bounds>
        output_fname = 0x1e97a <Address 0x1e97a out of bounds>
        host = (struct dcc_hostdef *) 0x804f440
        status = (int *) 0xbfffd4c4
        fd = 5
        stime_usec = -1073752892
        utime_usec = 18756
        ret = 125306
#5  0x08049485 in dcc_build_somewhere (argv=0x804f3d8, status=0xbfffd4c4)
    at distcc.c:319
        status = (int *) 0xbfffd4c4
        input_fname = 0xbfffebcd
"/local/scratch/ngoh/ccache/tmp.stdout.24183.i"
        output_fname = 0xbfffeba1
"/local/scratch/ngoh/ccache/tmp.hash.24183.o"
        cpp_fname = 0x804f660
"/local/scratch/ngoh/ccache/tmp.stdout.24183.i"
        cpp_pid = 0
        ret = 125306
        host = (struct dcc_hostdef *) 0x804f440
#6  0x08049616 in main (argc=22, argv=0xbfffd534) at distcc.c:381
        argc = 22
        status = 134541752
#7  0x40063507 in __libc_start_main (main=0x8049584 <main>, argc=22,
    ubp_av=0xbfffd534, init=0x8048af8 <_init>, fini=0x804c614 <_fini>,
    rtld_fini=0x4000dc14 <_dl_fini>, stack_end=0xbfffd52c)
    at ../sysdeps/generic/libc-start.c:129
        ubp_av = (char **) 0xbfffd534
        fini = (void (*)()) 0x400168e4 <_dl_debug_mask>
        rtld_fini = (void (*)()) 0x6
        ubp_ev = (char **) 0xbfffd590



---- Original Message ----
From:		Martin Pool
Date:		Thu 8/29/02 18:58
To:		Hien D. Ngo
Cc:		distcc at lists.samba.org
Subject:	Re: Different problem (Re: [distcc] Problems with distcc hanging on large compiles (Patch not effective))

On 29 Aug 2002, "Hien D. Ngo" <hien at moses.xp.com> wrote:
> 
> I removed the RH 6 machines from my DISTCC_HOSTS and am just using all RH 7.2 and 
> 7.3 boxes now.  I'm getting a different error.  The compile process stopped 
> responding with 'netstat' reporting established connections.  I then attached the 
> debugger, got a more reasonable backtrace, then detached.  At which point, things 
> started running again (attaching/detaching with the debugger unfroze
> the compile.)

Thanks for that.  I really need to see the netstat and backtrace on
both the client and server though.

You might try commenting out HAVE_SENDFILE in config.h after
configuring, then doing "make clean all", in case there's a bug either
in RedHat's sendfile or in distcc.

Attaching the debugger probably interrupts the sendfile() system call.
distcc (in CVS, not in 0.8) will cope with this and restart it.  I am
not sure why transmission would hang in sendfile unless there was
something wrong on the other machine, which is why I need to see its
backtrace.

Are the two machines in question on the same ethernet hub, or is there
a firewall or something between them?

Thanks,
-- 
Martin
_______________________________________________
distcc mailing list
distcc at lists.samba.org
http://lists.samba.org/cgi-bin/mailman/listinfo/distcc




More information about the distcc mailing list