Winbind reconnect failures

Michael Steffens michael.steffens at hp.com
Fri Jan 24 10:55:00 GMT 2003


Hi,

For quite some time I have been observing an intermittent (thus
annoying) problem. Winbindd apparently loses connections to DCs
now and then. As multiple winbind instances are affected
simultaneously, I suppose the reason for the connections dying is
external. But they all then fail to reconnect. Restarting winbindd
always fixed it, but this can't be the solution.

Dug into it (after learning how to disconnect a TCP link
without killing the process owning the socket on HP-UX :).

The reason seems to be that the retry code in

   cm_get_lsa_handle()
   cm_get_sam_handle()

in winbindd_cm.c does not actually get executed. After a
connection has died, but before an actual read or write is attempted,
the cached handle still passes policy_handle_is_valid(), and
the functions above return success without trying to write
to the corresponding pipe.

As a result, functions in winbindd_rpc.c do fail on subsequent
rpc calls, but without retrying.

A solution - if we don't want cm_get_*_handle() to probe pipes
on every invocation - would be to move retry triggers to their
callers.

Did so. The attached patch is for 2.2. If I have read the CVS
sources correctly, however, 3.0 should suffer from the same problem.

Adds a lot of stupid code, unfortunately, as every caller needs
to be equipped individually. Maybe there is a more elegant way?
But it seems to work fine here.

(The retry code in cm_get_*_handle() seems redundant with the
patch applied, but it doesn't hurt. Didn't change it.)

Cheers!
Michael



-------------- next part --------------
Index: source/nsswitch/winbindd_rpc.c
===================================================================
RCS file: /cvsroot/samba/source/nsswitch/winbindd_rpc.c,v
retrieving revision 1.22.2.13
diff -u -r1.22.2.13 winbindd_rpc.c
--- source/nsswitch/winbindd_rpc.c	6 Jan 2003 07:33:13 -0000	1.22.2.13
+++ source/nsswitch/winbindd_rpc.c	24 Jan 2003 09:51:22 -0000
@@ -52,12 +52,16 @@
 	CLI_POLICY_HND *hnd;
 	NTSTATUS result;
 	POLICY_HND dom_pol;
-	BOOL got_dom_pol = False;
+	BOOL got_dom_pol;
 	uint32 des_access = SEC_RIGHTS_MAXIMUM_ALLOWED;
 	int i;
+	BOOL do_retry = True;
 
+ retry:
+	hnd = NULL;
 	*num_entries = 0;
 	*info = NULL;
+	got_dom_pol = False;
 
 	/* Get sam handle */
 
@@ -132,6 +136,14 @@
 	if (got_dom_pol)
 		cli_samr_close(hnd->cli, mem_ctx, &dom_pol);
 
+	if (!NT_STATUS_IS_OK(result) && do_retry &&
+	    hnd && hnd->cli && hnd->cli->fd == -1) {
+		DEBUG(3, ("Failed with %s for domain %s, retrying\n", 
+			  nt_errstr(result), domain));
+		do_retry = False;
+		goto retry;
+	}
+
 	return result;
 }
 
@@ -147,18 +159,21 @@
 	POLICY_HND dom_pol;
 	NTSTATUS result;
 	uint32 start = 0;
+	BOOL do_retry = True;
 
+ retry:
+	hnd = NULL;
 	*num_entries = 0;
 	*info = NULL;
 
 	if (!NT_STATUS_IS_OK(result = cm_get_sam_handle(domain->name, &hnd)))
-		return result;
+		goto done;
 
 	result = cli_samr_open_domain(hnd->cli, mem_ctx,
 				      &hnd->pol, des_access, &domain->sid, &dom_pol);
 
 	if (!NT_STATUS_IS_OK(result))
-		return result;
+		goto done;
 
 	do {
 		struct acct_info *info2 = NULL;
@@ -193,6 +208,16 @@
 
 	cli_samr_close(hnd->cli, mem_ctx, &dom_pol);
 
+ done:
+
+	if (!NT_STATUS_IS_OK(result) && do_retry &&
+	    hnd && hnd->cli && hnd->cli->fd == -1) {
+		DEBUG(3, ("Failed with %s for domain %s, retrying\n", 
+			  nt_errstr(result), domain));
+		do_retry = False;
+		goto retry;
+	}
+
 	return result;
 }
 
@@ -205,26 +230,32 @@
 	TALLOC_CTX *mem_ctx;
 	CLI_POLICY_HND *hnd;
 	NTSTATUS result;
-	DOM_SID *sids = NULL;
-	uint32 *types = NULL;
+	DOM_SID *sids;
+	uint32 *types;
 	const char *full_name;
+	BOOL do_retry = True;
 
 	if (!(mem_ctx = talloc_init_named("name_to_sid[rpc] for [%s]\\[%s]", domain->name, name))) {
 		DEBUG(0, ("talloc_init failed!\n"));
 		return NT_STATUS_NO_MEMORY;
 	}
         
+ retry:
+	hnd = NULL;
+	sids = NULL;
+	types = NULL;
+
 	if (!NT_STATUS_IS_OK(result = cm_get_lsa_handle(domain->name, &hnd))) {
-		talloc_destroy(mem_ctx);
-		return NT_STATUS_UNSUCCESSFUL;
+		result = NT_STATUS_UNSUCCESSFUL;
+		goto done;
 	}
         
 	full_name = talloc_asprintf(mem_ctx, "%s\\%s", domain->name, name);
 	
 	if (!full_name) {
 		DEBUG(0, ("talloc_asprintf failed!\n"));
-		talloc_destroy(mem_ctx);
-		return NT_STATUS_NO_MEMORY;
+		result = NT_STATUS_NO_MEMORY;
+		goto done;
 	}
 
 	result = cli_lsa_lookup_names(hnd->cli, mem_ctx, &hnd->pol, 1, 
@@ -237,6 +268,16 @@
 		*type = types[0];
 	}
 
+ done:
+
+	if (!NT_STATUS_IS_OK(result) && do_retry &&
+	    hnd && hnd->cli && hnd->cli->fd == -1) {
+		DEBUG(3, ("Failed with %s for domain %s, retrying\n", 
+			  nt_errstr(result), domain));
+		do_retry = False;
+		goto retry;
+	}
+
 	talloc_destroy(mem_ctx);
 	return result;
 }
@@ -255,9 +296,15 @@
 	char **names;
 	uint32 *types;
 	NTSTATUS result;
+	BOOL do_retry = True;
 
-	if (!NT_STATUS_IS_OK(result = cm_get_lsa_handle(domain->name, &hnd)))
-		return NT_STATUS_UNSUCCESSFUL;
+ retry:
+	hnd = NULL;
+
+	if (!NT_STATUS_IS_OK(result = cm_get_lsa_handle(domain->name, &hnd))) {
+		result = NT_STATUS_UNSUCCESSFUL;
+		goto done;
+	}
         
 	result = cli_lsa_lookup_sids(hnd->cli, mem_ctx, &hnd->pol,
 				     1, sid, &domains, &names, &types);
@@ -270,10 +317,21 @@
 		/* Paranoia */
 		if (strcasecmp(domain->name, domains[0]) != 0) {
 			DEBUG(1, ("domain name from domain param and PDC lookup return differ! (%s vs %s)\n", domain->name, domains[0]));
-			return NT_STATUS_UNSUCCESSFUL;
+			result = NT_STATUS_UNSUCCESSFUL;
+			goto done;
 		}
 	}
 
+ done:
+
+	if (!NT_STATUS_IS_OK(result) && do_retry &&
+	    hnd && hnd->cli && hnd->cli->fd == -1) {
+		DEBUG(3, ("Failed with %s for domain %s, retrying\n", 
+			  nt_errstr(result), domain));
+		do_retry = False;
+		goto retry;
+	}
+
 	return result;
 }
 
@@ -286,8 +344,14 @@
 	CLI_POLICY_HND *hnd;
 	NTSTATUS result;
 	POLICY_HND dom_pol, user_pol;
-	BOOL got_dom_pol = False, got_user_pol = False;
+	BOOL got_dom_pol, got_user_pol;
 	SAM_USERINFO_CTR *ctr;
+	BOOL do_retry = True;
+
+ retry:
+	hnd = NULL;
+	got_user_pol = False;
+	got_dom_pol = False;
 
 	/* Get sam handle */
 
@@ -339,6 +403,14 @@
 	if (got_dom_pol)
 		cli_samr_close(hnd->cli, mem_ctx, &dom_pol);
 
+	if (!NT_STATUS_IS_OK(result) && do_retry &&
+	    hnd && hnd->cli && hnd->cli->fd == -1) {
+		DEBUG(3, ("Failed with %s for domain %s, retrying\n", 
+			  nt_errstr(result), domain));
+		do_retry = False;
+		goto retry;
+	}
+
 	return result;
 }                                   
 
@@ -355,9 +427,14 @@
 	BOOL got_dom_pol = False, got_user_pol = False;
 	DOM_GID *user_groups;
 	int i;
+	BOOL do_retry = True;
 
+ retry:
+	hnd = NULL;
 	*num_groups = 0;
 	*user_gids = NULL;
+	got_user_pol = False;
+	got_dom_pol = False;
 
 	/* Get sam handle */
 
@@ -403,6 +480,14 @@
 	if (got_dom_pol)
 		cli_samr_close(hnd->cli, mem_ctx, &dom_pol);
 
+	if (!NT_STATUS_IS_OK(result) && do_retry &&
+	    hnd && hnd->cli && hnd->cli->fd == -1) {
+		DEBUG(3, ("Failed with %s for domain %s, retrying\n", 
+			  nt_errstr(result), domain));
+		do_retry = False;
+		goto retry;
+	}
+
 	return result;
 }
 
@@ -420,15 +505,20 @@
         POLICY_HND dom_pol, group_pol;
         uint32 des_access = SEC_RIGHTS_MAXIMUM_ALLOWED;
         BOOL got_dom_pol = False, got_group_pol = False;
+	BOOL do_retry = True;
 
+ retry:
+	hnd = NULL;
 	*num_names = 0;
+	got_group_pol = False;
+	got_dom_pol = False;
 
-        /* Get sam handle */
+         /* Get sam handle */
 
         if (!NT_STATUS_IS_OK(result = cm_get_sam_handle(domain->name, &hnd)))
                 goto done;
 
-        /* Get domain handle */
+       /* Get domain handle */
 
         result = cli_samr_open_domain(hnd->cli, mem_ctx, &hnd->pol,
                                       des_access, &domain->sid, &dom_pol);
@@ -507,6 +597,14 @@
         if (got_dom_pol)
                 cli_samr_close(hnd->cli, mem_ctx, &dom_pol);
 
+	if (!NT_STATUS_IS_OK(result) && do_retry &&
+	    hnd && hnd->cli && hnd->cli->fd == -1) {
+		DEBUG(3, ("Failed with %s for domain %s, retrying\n", 
+			  nt_errstr(result), domain));
+		do_retry = False;
+		goto retry;
+	}
+
         return result;
 }
 
@@ -606,16 +704,21 @@
 	SAM_UNK_CTR ctr;
 	uint16 switch_value = 2;
 	NTSTATUS result;
-	uint32 seqnum = DOM_SEQUENCE_NONE;
+	uint32 seqnum;
 	POLICY_HND dom_pol;
 	BOOL got_dom_pol = False;
 	uint32 des_access = SEC_RIGHTS_MAXIMUM_ALLOWED;
+	BOOL do_retry = True;
 
 	*seq = DOM_SEQUENCE_NONE;
 
 	if (!(mem_ctx = talloc_init_named("sequence_number[rpc]")))
 		return NT_STATUS_NO_MEMORY;
 
+ retry:
+	seqnum = DOM_SEQUENCE_NONE;
+	got_dom_pol = False;
+
 	/* Get sam handle */
 
 	if (!NT_STATUS_IS_OK(result = cm_get_sam_handle(domain->name, &hnd)))
@@ -666,6 +769,14 @@
 	if (got_dom_pol)
 		cli_samr_close(hnd->cli, mem_ctx, &dom_pol);
 
+	if (!NT_STATUS_IS_OK(result) && do_retry && hnd->cli->fd == -1) {
+		DEBUG(3, ("Failed with %s for domain %s, retrying\n", 
+			  nt_errstr(result), domain));
+		*seq = DOM_SEQUENCE_NONE;
+		do_retry = False;
+		goto retry;
+	}
+
 	talloc_destroy(mem_ctx);
 
 	*seq = seqnum;
@@ -682,10 +793,14 @@
 {
 	CLI_POLICY_HND *hnd;
 	NTSTATUS result;
-	uint32 enum_ctx = 0;
+	uint32 enum_ctx;
+	BOOL do_retry = True;
 
+ retry:
+	hnd = NULL;
 	*num_domains = 0;
-
+	enum_ctx = 0;
+	
 	if (!NT_STATUS_IS_OK(result = cm_get_lsa_handle(lp_workgroup(), &hnd)))
 		goto done;
 
@@ -693,6 +808,15 @@
 					&hnd->pol, &enum_ctx, num_domains, 
 					names, dom_sids);
 done:
+
+	if (!NT_STATUS_IS_OK(result) && do_retry &&
+	    hnd && hnd->cli && hnd->cli->fd == -1) {
+		DEBUG(3, ("Failed with %s for domain %s, retrying\n", 
+			  nt_errstr(result), domain));
+		do_retry = False;
+		goto retry;
+	}
+
 	return result;
 }
 
@@ -703,10 +827,14 @@
 	TALLOC_CTX *mem_ctx;
 	CLI_POLICY_HND *hnd;
 	fstring level5_dom;
+	BOOL do_retry = True;
 
 	if (!(mem_ctx = talloc_init_named("domain_sid[rpc]")))
 		return NT_STATUS_NO_MEMORY;
 
+ retry:
+	hnd = NULL;
+
 	/* Get sam handle */
 
 	if (!NT_STATUS_IS_OK(result = cm_get_lsa_handle(domain->name, &hnd)))
@@ -716,6 +844,15 @@
 					   &hnd->pol, 0x05, level5_dom, sid);
 
 done:
+
+	if (!NT_STATUS_IS_OK(result) && do_retry &&
+	    hnd && hnd->cli && hnd->cli->fd == -1) {
+		DEBUG(3, ("Failed with %s for domain %s, retrying\n", 
+			  nt_errstr(result), domain));
+		do_retry = False;
+		goto retry;
+	}
+
 	talloc_destroy(mem_ctx);
 	return result;
 }


More information about the samba-technical mailing list