Reworking of winbindd reconnect delay code

Jeremy Allison jra at samba.org
Sat Aug 16 01:44:42 GMT 2008


Volker, Michael & Jerry,

As promised, here is a re-working of the winbindd
reconnect code to cope with rebooting a DC. This
replaces the code I asked Volker to revert.

Not yet tested (beyond checking that it compiles), but
the logic is pretty simple. It adds a new parameter,
"winbind reconnect delay", set to 30 seconds by
default, which determines how long to wait between
connection attempts.

To avoid overwhelming the box with DC-probe
forked children, the code now keeps track of
the DC probe child per winbindd_domain struct
and only starts a new one if the existing one
has died.

I also added a little logic to make sure the
DC probe child always sends a message, whatever
the reason for its exit, so we will always
reschedule another connect attempt.

Review & testing welcome :-).

Cheers,

	Jeremy.
-------------- next part --------------
diff --git a/source/include/proto.h b/source/include/proto.h
index a1595dd..cf3406b 100644
--- a/source/include/proto.h
+++ b/source/include/proto.h
@@ -6091,6 +6091,7 @@ int lp_directory_name_cache_size(int );
 int lp_smb_encrypt(int );
 char lp_magicchar(const struct share_params *p );
 int lp_winbind_cache_time(void);
+int lp_winbind_reconnect_delay(void);
 const char **lp_winbind_nss_info(void);
 int lp_algorithmic_rid_base(void);
 int lp_name_cache_timeout(void);
diff --git a/source/param/loadparm.c b/source/param/loadparm.c
index bc111df..6817eca 100644
--- a/source/param/loadparm.c
+++ b/source/param/loadparm.c
@@ -240,6 +240,7 @@ struct global {
 	int map_to_guest;
 	int oplock_break_wait_time;
 	int winbind_cache_time;
+	int winbind_reconnect_delay;
 	int winbind_max_idle_children;
 	char **szWinbindNssInfo;
 	int iLockSpinTime;
@@ -4363,6 +4364,15 @@ static struct parm_struct parm_table[] = {
 		.flags		= FLAG_ADVANCED,
 	},
 	{
+		.label		= "winbind reconnect delay",
+		.type		= P_INTEGER,
+		.p_class	= P_GLOBAL,
+		.ptr		= &Globals.winbind_reconnect_delay,
+		.special	= NULL,
+		.enum_list	= NULL,
+		.flags		= FLAG_ADVANCED,
+	},
+	{
 		.label		= "winbind enum users",
 		.type		= P_BOOL,
 		.p_class	= P_GLOBAL,
@@ -4829,6 +4839,7 @@ static void init_globals(bool first_time_only)
 	Globals.clustering = False;
 
 	Globals.winbind_cache_time = 300;	/* 5 minutes */
+	Globals.winbind_reconnect_delay = 30;	/* 30 seconds */
 	Globals.bWinbindEnumUsers = False;
 	Globals.bWinbindEnumGroups = False;
 	Globals.bWinbindUseDefaultDomain = False;
@@ -5341,6 +5352,7 @@ FN_LOCAL_INTEGER(lp_directory_name_cache_size, iDirectoryNameCacheSize)
 FN_LOCAL_INTEGER(lp_smb_encrypt, ismb_encrypt)
 FN_LOCAL_CHAR(lp_magicchar, magic_char)
 FN_GLOBAL_INTEGER(lp_winbind_cache_time, &Globals.winbind_cache_time)
+FN_GLOBAL_INTEGER(lp_winbind_reconnect_delay, &Globals.winbind_reconnect_delay)
 FN_GLOBAL_LIST(lp_winbind_nss_info, &Globals.szWinbindNssInfo)
 FN_GLOBAL_INTEGER(lp_algorithmic_rid_base, &Globals.AlgorithmicRidBase)
 FN_GLOBAL_INTEGER(lp_name_cache_timeout, &Globals.name_cache_timeout)
diff --git a/source/winbindd/winbindd.h b/source/winbindd/winbindd.h
index fe0c076..1b8cd91 100644
--- a/source/winbindd/winbindd.h
+++ b/source/winbindd/winbindd.h
@@ -204,6 +204,7 @@ struct winbindd_domain {
 	uint32_t id_range_low, id_range_high;
 
 	/* A working DC */
+	pid_t dc_probe_pid; /* Child we're using to detect the DC. */
 	fstring dcname;
 	struct sockaddr_storage dcaddr;
 
diff --git a/source/winbindd/winbindd_cm.c b/source/winbindd/winbindd_cm.c
index 47df4e4..69e95b1 100644
--- a/source/winbindd/winbindd_cm.c
+++ b/source/winbindd/winbindd_cm.c
@@ -171,20 +171,33 @@ static bool fork_child_dc_connect(struct winbindd_domain *domain)
 	struct dc_name_ip *dcs = NULL;
 	int num_dcs = 0;
 	TALLOC_CTX *mem_ctx = NULL;
-	pid_t child_pid;
 	pid_t parent_pid = sys_getpid();
 
 	/* Stop zombies */
 	CatchChild();
 
-	child_pid = sys_fork();
+	if (domain->dc_probe_pid != (pid_t)-1) {
+		/*
+		 * We might already have a DC probe
+		 * child working, check.
+		 */
+		if (process_exists_by_pid(domain->dc_probe_pid)) {
+			DEBUG(10,("fork_child_dc_connect: pid %u already "
+				"checking for DC's.\n",
+				(unsigned int)domain->dc_probe_pid));
+			return true;
+		}
+		domain->dc_probe_pid = (pid_t)-1;
+	}
 
-	if (child_pid == -1) {
+	domain->dc_probe_pid = sys_fork();
+
+	if (domain->dc_probe_pid == (pid_t)-1) {
 		DEBUG(0, ("fork_child_dc_connect: Could not fork: %s\n", strerror(errno)));
 		return False;
 	}
 
-	if (child_pid != 0) {
+	if (domain->dc_probe_pid != (pid_t)0) {
 		/* Parent */
 		messaging_register(winbind_messaging_context(), NULL,
 				   MSG_WINBIND_TRY_TO_GO_ONLINE,
@@ -201,6 +214,11 @@ static bool fork_child_dc_connect(struct winbindd_domain *domain)
 
 	if (!reinit_after_fork(winbind_messaging_context(), true)) {
 		DEBUG(0,("reinit_after_fork() failed\n"));
+		messaging_send_buf(winbind_messaging_context(),
+				   pid_to_procid(parent_pid),
+				   MSG_WINBIND_FAILED_TO_GO_ONLINE,
+				   (uint8 *)domain->name,
+				   strlen(domain->name)+1);
 		_exit(0);
 	}
 
@@ -218,6 +236,11 @@ static bool fork_child_dc_connect(struct winbindd_domain *domain)
 	mem_ctx = talloc_init("fork_child_dc_connect");
 	if (!mem_ctx) {
 		DEBUG(0,("talloc_init failed.\n"));
+		messaging_send_buf(winbind_messaging_context(),
+				   pid_to_procid(parent_pid),
+				   MSG_WINBIND_FAILED_TO_GO_ONLINE,
+				   (uint8 *)domain->name,
+				   strlen(domain->name)+1);
 		_exit(0);
 	}
 
@@ -291,12 +314,12 @@ static void check_domain_online_handler(struct event_context *ctx,
 
 static void calc_new_online_timeout_check(struct winbindd_domain *domain)
 {
-	int wbc = lp_winbind_cache_time();
+	int wbr = lp_winbind_reconnect_delay();
 
 	if (domain->startup) {
 		domain->check_online_timeout = 10;
-	} else if (domain->check_online_timeout < wbc) {
-		domain->check_online_timeout = wbc;
+	} else if (domain->check_online_timeout < wbr) {
+		domain->check_online_timeout = wbr;
 	}
 }
 
@@ -336,7 +359,7 @@ void set_domain_offline(struct winbindd_domain *domain)
 	}
 
 	/* If we're in statup mode, check again in 10 seconds, not in
-	   lp_winbind_cache_time() seconds (which is 5 mins by default). */
+	   lp_winbind_reconnect_delay() seconds (which is 30 seconds by default). */
 
 	calc_new_online_timeout_check(domain);
 
diff --git a/source/winbindd/winbindd_util.c b/source/winbindd/winbindd_util.c
index 77b1778..4668d37 100644
--- a/source/winbindd/winbindd_util.c
+++ b/source/winbindd/winbindd_util.c
@@ -180,11 +180,11 @@ static struct winbindd_domain *add_trusted_domain(const char *domain_name, const
 	domain->initialized = False;
 	domain->online = is_internal_domain(sid);
 	domain->check_online_timeout = 0;
+	domain->dc_probe_pid = (pid_t)-1;
 	if (sid) {
 		sid_copy(&domain->sid, sid);
 	}
 
-	
 	/* Link to domain list */
 	DLIST_ADD_END(_domain_list, domain, struct winbindd_domain *);
         


More information about the samba-technical mailing list