Regarding Bug 3204 - winbindd: Exceeding 200 client connections, no idle connection found

Jeremy Allison jra at samba.org
Fri Jul 18 13:09:47 MDT 2014


On Thu, Jul 17, 2014 at 02:24:24PM -0700, Hemanth Thummala wrote:
> Hi All,
> 
> Recently we hit this issue twice at our customer site, which is running the
> samba 3.6.12+ stack. Increasing the "winbind max clients" to 400 also did
> not help.
> 
> [2014/07/17 11:23:02.223054,  0]
> winbindd/winbindd.c:947(winbindd_listen_fde_handler)
>   winbindd: Exceeding 400 client connections, no idle connection found
> [2014/07/17 11:23:02.224055,  0]
> winbindd/winbindd.c:947(winbindd_listen_fde_handler)
>   winbindd: Exceeding 400 client connections, no idle connection found
> 
> Winbindd became unresponsive, and we found that there are a lot (>10K) of
> open file handles for the stuck winbindd process. We had to restart winbindd
> to restore user activity.
> 
>  ....
>  ....
>  3378 winbindd         10294 s - rw------   1       0 UDS
> /usr/local/var/locks/winbindd_privileged/pipe
>  3378 winbindd         10295 s - rw------   1       0 UDS
> /usr/local/var/locks/winbindd_privileged/pipe
>  3378 winbindd         *10296* s - rw------   1       0 UDS
> /usr/local/var/locks/winbindd_privileged/pipe
> 
> Gdb stack of stuck winbindd process is updated in the bug.
> 
> I have seen that a few people have reported the same with post-3.3 stacks. I
> would like to know if someone has been able to figure out the reason and a
> fix for this issue.

Ok, as this went to samba-technical here is a (test) patch
for 3.6.x only that attempts to fix this bug.

It adds a new [global] parameter:

winbind request timeout

default value of 60 (seconds). What it does is terminate every client
connection that has either remained idle for 60 seconds, or has not replied
within 60 seconds. Initially I worried this was a little aggressive, but I
don't think so - if a request has taken > 60 seconds it's almost certainly dead,
and pruning idle clients after 60 seconds is also probably ok. Also it's
tuneable :-).

If this works for people I can forward port to 4.1.next and 4.0.next.

It's also added to the bug report.

Let me know if it helps !

Cheers,

	Jeremy.
-------------- next part --------------
diff --git a/source3/include/proto.h b/source3/include/proto.h
index 7303e76..9ef3517 100644
--- a/source3/include/proto.h
+++ b/source3/include/proto.h
@@ -1696,6 +1696,7 @@ char lp_magicchar(const struct share_params *p );
 int lp_winbind_cache_time(void);
 int lp_winbind_reconnect_delay(void);
 int lp_winbind_max_clients(void);
+int lp_winbind_request_timeout(void);
 const char **lp_winbind_nss_info(void);
 int lp_algorithmic_rid_base(void);
 int lp_name_cache_timeout(void);
diff --git a/source3/param/loadparm.c b/source3/param/loadparm.c
index dd63339..c4b3191 100644
--- a/source3/param/loadparm.c
+++ b/source3/param/loadparm.c
@@ -266,6 +266,7 @@ struct global {
 	int winbind_cache_time;
 	int winbind_reconnect_delay;
 	int winbind_max_clients;
+	int winbind_request_timeout;
 	char **szWinbindNssInfo;
 	int iLockSpinTime;
 	char *szLdapMachineSuffix;
@@ -4772,6 +4773,15 @@ static struct parm_struct parm_table[] = {
 		.flags		= FLAG_ADVANCED,
 	},
 	{
+		.label		= "winbind request timeout",
+		.type		= P_INTEGER,
+		.p_class	= P_GLOBAL,
+		.ptr		= &Globals.winbind_request_timeout,
+		.special	= NULL,
+		.enum_list	= NULL,
+		.flags		= FLAG_ADVANCED,
+	},
+	{
 		.label		= "create krb5 conf",
 		.type		= P_BOOL,
 		.p_class	= P_GLOBAL,
@@ -5435,6 +5445,7 @@ static void init_globals(bool reinit_globals)
 	Globals.winbind_cache_time = 300;	/* 5 minutes */
 	Globals.winbind_reconnect_delay = 30;	/* 30 seconds */
 	Globals.winbind_max_clients = 200;
+	Globals.winbind_request_timeout = 60;	/* 60 seconds */
 	Globals.bWinbindEnumUsers = False;
 	Globals.bWinbindEnumGroups = False;
 	Globals.bWinbindUseDefaultDomain = False;
@@ -6052,6 +6063,7 @@ FN_LOCAL_CHAR(lp_magicchar, magic_char)
 FN_GLOBAL_INTEGER(lp_winbind_cache_time, &Globals.winbind_cache_time)
 FN_GLOBAL_INTEGER(lp_winbind_reconnect_delay, &Globals.winbind_reconnect_delay)
 FN_GLOBAL_INTEGER(lp_winbind_max_clients, &Globals.winbind_max_clients)
+FN_GLOBAL_INTEGER(lp_winbind_request_timeout, &Globals.winbind_request_timeout)
 FN_GLOBAL_LIST(lp_winbind_nss_info, &Globals.szWinbindNssInfo)
 FN_GLOBAL_INTEGER(lp_algorithmic_rid_base, &Globals.AlgorithmicRidBase)
 FN_GLOBAL_INTEGER(lp_name_cache_timeout, &Globals.name_cache_timeout)
diff --git a/source3/winbindd/winbindd.c b/source3/winbindd/winbindd.c
index f447059..6a8a854 100644
--- a/source3/winbindd/winbindd.c
+++ b/source3/winbindd/winbindd.c
@@ -923,6 +923,38 @@ static bool remove_idle_client(void)
 	return False;
 }
 
+/*
+ * Terminate every client that has been idle, or whose request has been
+ * outstanding, for more than "winbind request timeout" seconds, i.e.
+ * whose last_access + timeout lies strictly before the current time.
+ */
+
+static void remove_timed_out_clients(void)
+{
+	struct winbindd_cli_state *state, *next = NULL;
+	time_t curr_time = time(NULL);
+	int timeout_val = lp_winbind_request_timeout();
+
+	for (state = winbindd_client_list(); state; state = next) {
+		/* Grab the successor now; state may be removed below. */
+		next = state->next;
+		if (curr_time > state->last_access + timeout_val) {
+			if (client_is_idle(state)) {
+				DEBUG(5,("Idle client timed out, "
+					"shutting down sock %d, pid %u\n",
+					state->sock,
+					(unsigned int)state->pid));
+			} else {
+				DEBUG(5,("Client request timed out, "
+					"shutting down sock %d, pid %u\n",
+					state->sock,
+					(unsigned int)state->pid));
+			}
+			remove_client(state);
+		}
+	}
+}
+
 struct winbindd_listen_state {
 	bool privileged;
 	int fd;
@@ -948,6 +980,7 @@ static void winbindd_listen_fde_handler(struct tevent_context *ev,
 			break;
 		}
 	}
+	remove_timed_out_clients();
 	new_connection(s->fd, s->privileged);
 }
 


More information about the samba-technical mailing list