[PATCH] Add process restart to pre-fork process model

Tim Beale timbeale at catalyst.net.nz
Wed Nov 21 22:49:39 UTC 2018


Hi Gary,

Here's some feedback on the changes. I figured it was easier to provide
it in patch form (attached). If you're happy with my suggestions, then I
can squash them into your merge request and re-push it.

Cheers,
Tim

On 20/11/18 5:18 PM, Gary Lockyer via samba-technical wrote:
> Conflict markers removed
>
>
> Gary
>
> On 20/11/18 17:13, Andrew Bartlett wrote:
>> On Tue, 2018-11-20 at 17:08 +1300, Gary Lockyer via samba-technical
>> wrote:
>>> Updated patch set that applies to current master, conflicted with latest
>>> WHATSNEW.
>> Sorry Gary, you left the conflict markers in the diff.
>>
>> Andrew Bartlett
>>
-------------- next part --------------
From 46f7cab8e8fd709c2b0698a1fd51324526757c54 Mon Sep 17 00:00:00 2001
From: Tim Beale <timbeale at catalyst.net.nz>
Date: Wed, 21 Nov 2018 11:21:00 +1300
Subject: [PATCH 1/7] SQUASH INTO #1: source4 smbd test: prefork process
 restart [trivial]

Feedback:
- prefork_restart.py: replace hardcoded number
- README: add description for new testenv
- how about using 'prockill' instead of 'prfrkrstrt'?
- moved testenv in ENV_DEPS (it's not at all related to backup testenvs)
- changed provision_ad_dc() slightly (first part of realm and DC name were
  the same, not sure if that's intentional, but it looks different to the
  other testenvs)
- tests.py: removed the environ variables as they didn't appear to be used
  by the test
- fixed some typos

Signed-off-by: Tim Beale <timbeale at catalyst.net.nz>
---
 python/samba/tests/prefork_restart.py |  4 ++--
 selftest/target/README                |  6 ++++++
 selftest/target/Samba.pm              |  2 +-
 selftest/target/Samba4.pm             | 15 ++++++++-------
 source4/selftest/tests.py             |  4 +---
 5 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/python/samba/tests/prefork_restart.py b/python/samba/tests/prefork_restart.py
index 90037d9..c101177 100644
--- a/python/samba/tests/prefork_restart.py
+++ b/python/samba/tests/prefork_restart.py
@@ -266,7 +266,7 @@ class PreforkProcessRestartTests(TestCase):
 
         # check that the worker processes have restarted
         new_workers = self.get_worker_pids("ldap", NUM_WORKERS)
-        for x in range(4):
+        for x in range(NUM_WORKERS):
             self.assertNotEquals(workers[x], new_workers[x])
 
         # check that the previous server entries have been removed.
@@ -380,7 +380,7 @@ class PreforkProcessRestartTests(TestCase):
 
         # check that the worker processes have restarted
         new_workers = self.get_worker_pids("rpc", NUM_WORKERS)
-        for x in range(4):
+        for x in range(NUM_WORKERS):
             self.assertNotEquals(workers[x], new_workers[x])
 
         # check that the previous server entries have been removed.
diff --git a/selftest/target/README b/selftest/target/README
index 237cd6c..3fd283e 100644
--- a/selftest/target/README
+++ b/selftest/target/README
@@ -101,3 +101,9 @@ the issue, without fear of never seeing the problem again.
 
 You could even spin up a 'lab DC' inside a testenv, by taking a backup of a
 real network DC.
+
+preforkrestartdc testenv
+------------------------
+Used to test killing and restarting processes under the pre-fork model. Due to
+the destructive nature of the tests, it's not recommended to use this testenv
+for anything else.
diff --git a/selftest/target/Samba.pm b/selftest/target/Samba.pm
index 5f83645..f5c490f 100644
--- a/selftest/target/Samba.pm
+++ b/selftest/target/Samba.pm
@@ -424,7 +424,7 @@ sub get_interface($)
     $interfaces{"labdc"} = 43;
     $interfaces{"offlinebackupdc"} = 44;
     $interfaces{"customdc"} = 45;
-    $interfaces{"prfrkrstrt"} = 46;
+    $interfaces{"prockilldc"} = 46;
 
     # update lib/socket_wrapper/socket_wrapper.c
     #  #define MAX_WRAPPED_INTERFACES 64
diff --git a/selftest/target/Samba4.pm b/selftest/target/Samba4.pm
index e667acc..c2e9fdb 100755
--- a/selftest/target/Samba4.pm
+++ b/selftest/target/Samba4.pm
@@ -2185,6 +2185,7 @@ sub check_env($$)
 	ad_dc_ntvfs          => [],
 	backupfromdc         => [],
 	customdc             => [],
+	preforkrestartdc     => [],
 
 	fl2008r2dc           => ["ad_dc"],
 	fl2003dc             => ["ad_dc"],
@@ -2206,7 +2207,6 @@ sub check_env($$)
 	renamedc             => ["backupfromdc"],
 	offlinebackupdc      => ["backupfromdc"],
 	labdc                => ["backupfromdc"],
-	preforkrestartdc     => [],
 
 	none                 => [],
 );
@@ -2629,9 +2629,8 @@ sub setup_ad_dc_no_ntlm
 }
 
 #
-# ad_dc test environment used solely to test pre-frork process restarts
-# as processes get killed off and restarted it should not be used for other
-# tests.
+# AD DC test environment used solely to test pre-fork process restarts.
+# As processes get killed off and restarted it should not be used for other
 sub setup_preforkrestartdc
 {
 	my ($self, $path) = @_;
@@ -2641,11 +2640,13 @@ sub setup_preforkrestartdc
 	       return "UNKNOWN";
 	}
 
+	# note DC name must be <= 15 chars so we use 'prockill' instead of
+	# 'preforkrestart'
 	my $env = $self->provision_ad_dc(
 		$path,
-		"prfrkrstrt",
-		"PRFRKRSTRTDOM",
-		"prfrkrstrt.samba.example.com",
+		"prockilldc",
+		"PROCKILLDOMAIN",
+		"prockilldom.samba.example.com",
 		"prefork backoff increment = 5\nprefork maximum backoff=10");
 	unless ($env) {
 		return undef;
diff --git a/source4/selftest/tests.py b/source4/selftest/tests.py
index 60e6824..6a1e144 100755
--- a/source4/selftest/tests.py
+++ b/source4/selftest/tests.py
@@ -1234,13 +1234,11 @@ plantestsuite("samba4.dsdb.samdb.ldb_modules.group_audit.errors", "none",
               [os.path.join(bindir(), "test_group_audit_errors")])
 
 # process restart and limit tests, these break the environment so need to run
-# in their own specific environaments
+# in their own specific environment
 planoldpythontestsuite("preforkrestartdc:local",
                        "samba.tests.prefork_restart",
                        extra_path=[
                            os.path.join(srcdir(), 'python/samba/tests')],
                        extra_args=['-U"$USERNAME%$PASSWORD"'],
-                       environ={'CLIENT_IP': '127.0.0.11',
-                                'SOCKET_WRAPPER_DEFAULT_IFACE': 11},
                        name="samba.tests.prefork_restart",
                        py3_compatible=True)
-- 
2.7.4


From 83f9ae1543b1b6d840c591372894ca5180798c2f Mon Sep 17 00:00:00 2001
From: Tim Beale <timbeale at catalyst.net.nz>
Date: Wed, 21 Nov 2018 16:47:53 +1300
Subject: [PATCH 2/7] SQUASH INTO #1: source4 smbd test: prefork process
 restart [flappy tests]

Tests were slightly flappy. Usually they pass, but occasionally you can
get unlucky and they fail. Dropping the wait_for_process() sleep from 1
sec to 0.01s makes the flappy tests fail 100%.

The problem was that restarting the master process restarts all the
worker processes as well, but the tests were only waiting for the master
to restart. So the next test case/assertion could start before the
worker had finihsed restarting.

Also noticed that test_rpc_worker_zero_restart() was checking for the
wrong PID.

Signed-off-by: Tim Beale <timbeale at catalyst.net.nz>
---
 python/samba/tests/prefork_restart.py | 27 ++++++++++++++++++++-------
 1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/python/samba/tests/prefork_restart.py b/python/samba/tests/prefork_restart.py
index c101177..1586bbc 100644
--- a/python/samba/tests/prefork_restart.py
+++ b/python/samba/tests/prefork_restart.py
@@ -78,6 +78,12 @@ class PreforkProcessRestartTests(TestCase):
             self.assertIsNotNone(pids[x])
         return pids
 
+    def wait_for_workers(self, name, workers):
+        num_workers = len(workers)
+        for x in range(num_workers):
+            process_name = "prefork-worker-{0}-{1}".format(name, x)
+            self.wait_for_process(process_name, workers[x], 0, 1, 30)
+
     def wait_for_process(self, name, pid, initial_delay, wait, timeout):
         time.sleep(initial_delay)
         delay = initial_delay
@@ -177,6 +183,10 @@ class PreforkProcessRestartTests(TestCase):
         # wait for the process to restart
         self.wait_for_process("prefork-master-ldap", pid, 1, 1, 30)
 
+        # restarting the master restarts the workers as well, so make sure
+        # they have finished restarting
+        self.wait_for_workers("ldap", workers)
+
         # get ldap master process
         new_pid = self.get_process("prefork-master-ldap")
         self.assertIsNotNone(new_pid)
@@ -253,9 +263,7 @@ class PreforkProcessRestartTests(TestCase):
             os.kill(x, signal.SIGTERM)
 
         # wait for the worker processes to restart
-        for x in range(NUM_WORKERS):
-            self.wait_for_process(
-                "prefork-worker-ldap-{0}".format(x), workers[x], 0, 1, 30)
+        self.wait_for_workers("ldap", workers)
 
         # get ldap master process
         new_pid = self.get_process("prefork-master-ldap")
@@ -292,6 +300,9 @@ class PreforkProcessRestartTests(TestCase):
         # wait for the process to restart
         self.wait_for_process("prefork-master-rpc", pid, 1, 1, 30)
 
+        # wait for workers to restart as well
+        self.wait_for_workers("rpc", workers)
+
         # get ldap master process
         new_pid = self.get_process("prefork-master-rpc")
         self.assertIsNotNone(new_pid)
@@ -326,7 +337,7 @@ class PreforkProcessRestartTests(TestCase):
         os.kill(workers[0], signal.SIGTERM)
 
         # wait for the process to restart
-        self.wait_for_process("prefork-worker-rpc-0", pid, 1, 1, 30)
+        self.wait_for_process("prefork-worker-rpc-0", workers[0], 1, 1, 30)
 
         # get rpc master process
         new_pid = self.get_process("prefork-master-rpc")
@@ -396,9 +407,6 @@ class PreforkProcessRestartTests(TestCase):
         pid = self.get_process("prefork-master-kdc")
         self.assertIsNotNone(pid)
 
-        # Get the worker processes
-        workers = self.get_worker_pids("kdc", NUM_WORKERS)
-
         #
         # Check that the processes get backed off as expected
         #
@@ -407,11 +415,16 @@ class PreforkProcessRestartTests(TestCase):
         # Set the expected values to one less than the next ideal value
         # 0, 5, 10, 10, ...
         for expected in [4, 9, 14, 14]:
+            # Get the worker processes
+            workers = self.get_worker_pids("kdc", NUM_WORKERS)
+
             process = self.get_process("prefork-master-kdc")
             os.kill(process, signal.SIGTERM)
             # wait for the process to restart
             start = time.time()
             self.wait_for_process("prefork-master-kdc", process, 0, 1, 30)
+            # wait for the workers to restart as well
+            self.wait_for_workers("kdc", workers)
             end = time.time()
             duration = end - start
             self.assertTrue(duration < expected)
-- 
2.7.4


From a973159488f0b491892fe97cff90b3349fb580a9 Mon Sep 17 00:00:00 2001
From: Tim Beale <timbeale at catalyst.net.nz>
Date: Wed, 21 Nov 2018 17:17:34 +1300
Subject: [PATCH 3/7] SQUASH INTO #1: source4 smbd test: prefork process
 restart [backoff tests]

The backoff tests would actually pass without the backoff code present
in the server. The problem is we're only asserting the duration is less
than the next increment. Zero (i.e. no delay) is less than the next
increment, so the tests still pass. Add an assertion that the elapsed
duration is greater than the expected delay.

Signed-off-by: Tim Beale <timbeale at catalyst.net.nz>
---
 python/samba/tests/prefork_restart.py | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/python/samba/tests/prefork_restart.py b/python/samba/tests/prefork_restart.py
index 1586bbc..5233ca1 100644
--- a/python/samba/tests/prefork_restart.py
+++ b/python/samba/tests/prefork_restart.py
@@ -412,9 +412,8 @@ class PreforkProcessRestartTests(TestCase):
         #
         # have prefork backoff increment = 5
         #      prefork maximum backoff   = 10
-        # Set the expected values to one less than the next ideal value
-        # 0, 5, 10, 10, ...
-        for expected in [4, 9, 14, 14]:
+        backoff_increment = 5
+        for expected in [0, 5, 10, 10]:
             # Get the worker processes
             workers = self.get_worker_pids("kdc", NUM_WORKERS)
 
@@ -427,7 +426,12 @@ class PreforkProcessRestartTests(TestCase):
             self.wait_for_workers("kdc", workers)
             end = time.time()
             duration = end - start
-            self.assertTrue(duration < expected)
+
+            # process restart will take some time. Check that the elapsed
+            # duration falls somewhere in the expected range, i.e. we haven't
+            # taken longer than the backoff increment
+            self.assertLess(duration, expected + backoff_increment)
+            self.assertGreaterEqual(duration, expected)
 
         # check that the worker processes have restarted
         new_workers = self.get_worker_pids("kdc", NUM_WORKERS)
@@ -443,9 +447,8 @@ class PreforkProcessRestartTests(TestCase):
         #
         # have prefork backoff increment = 5
         #      prefork maximum backoff   = 10
-        # Set the expected values to one less than the next ideal value
-        # 0, 5, 10, 10, ...
-        for expected in [4, 9, 14, 14]:
+        backoff_increment = 5
+        for expected in [0, 5, 10, 10]:
             process = self.get_process("prefork-worker-kdc-2")
             self.assertIsNotNone(process)
             os.kill(process, signal.SIGTERM)
@@ -454,6 +457,11 @@ class PreforkProcessRestartTests(TestCase):
             self.wait_for_process("prefork-worker-kdc-2", process, 0, 1, 30)
             end = time.time()
             duration = end - start
-            self.assertTrue(duration < expected)
+
+            # process restart will take some time. Check that the elapsed
+            # duration falls somewhere in the expected range, i.e. we haven't
+            # taken longer than the backoff increment
+            self.assertLess(duration, expected + backoff_increment)
+            self.assertGreaterEqual(duration, expected)
 
         self.check_for_duplicate_processes()
-- 
2.7.4


From ec11a253bd7772bcecfcc84ff7a0e173e81ad6ed Mon Sep 17 00:00:00 2001
From: Tim Beale <timbeale at catalyst.net.nz>
Date: Thu, 22 Nov 2018 09:51:17 +1300
Subject: [PATCH 4/7] SQUASH INTO #3: source4 smdb prefork: Restart failed
 processes

fix copy-n-paste error?

Signed-off-by: Tim Beale <timbeale at catalyst.net.nz>
---
 source4/smbd/process_prefork.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/source4/smbd/process_prefork.c b/source4/smbd/process_prefork.c
index f892f7f..f36cdfd 100644
--- a/source4/smbd/process_prefork.c
+++ b/source4/smbd/process_prefork.c
@@ -361,7 +361,7 @@ static void prefork_fork_master(
 			smb_panic("Unable to create worker control pipe\n");
 		}
 		smb_set_close_on_exec(control_pipe[0]);
-		smb_set_close_on_exec(control_pipe[0]);
+		smb_set_close_on_exec(control_pipe[1]);
 	}
 
 	/*
-- 
2.7.4


From 9841bd22a3d25f481f661dbd1386e0a474787426 Mon Sep 17 00:00:00 2001
From: Tim Beale <timbeale at catalyst.net.nz>
Date: Thu, 22 Nov 2018 10:36:10 +1300
Subject: [PATCH 5/7] SQUASH INTO #5: source4 smbd prefork: restart on non zero
 exit code

Should probably include SEGV, just in case --disable-fault-handling is
used?

Signed-off-by: Tim Beale <timbeale at catalyst.net.nz>
---
 source4/smbd/process_prefork.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/source4/smbd/process_prefork.c b/source4/smbd/process_prefork.c
index f36cdfd..0d004ad 100644
--- a/source4/smbd/process_prefork.c
+++ b/source4/smbd/process_prefork.c
@@ -492,7 +492,7 @@ static void prefork_child_pipe_handler(struct tevent_context *ev,
 		DBG_ERR("Parent %d, Child %d terminated with signal %d\n",
 			getpid(), pid, status);
 		if (status == SIGABRT || status == SIGBUS || status == SIGFPE ||
-		    status == SIGILL || status == SIGSYS) {
+		    status == SIGILL || status == SIGSYS || status == SIGSEGV) {
 
 			prefork_restart(ev, rc);
 		}
-- 
2.7.4


From f628c8327e0c0f4f2a615014d6bdea91db8f526f Mon Sep 17 00:00:00 2001
From: Tim Beale <timbeale at catalyst.net.nz>
Date: Thu, 22 Nov 2018 10:41:00 +1300
Subject: [PATCH 6/7] SQUASH INTO #6: source4 smbd prefork: Add backoff to
 process restart

The 2 & 10 values here don't line up with defaults used anywhere else...
Might make more sense to just initialize them to zero?

Signed-off-by: Tim Beale <timbeale at catalyst.net.nz>
---
 source4/smbd/process_prefork.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/source4/smbd/process_prefork.c b/source4/smbd/process_prefork.c
index 0d004ad..c62f08f 100644
--- a/source4/smbd/process_prefork.c
+++ b/source4/smbd/process_prefork.c
@@ -391,11 +391,10 @@ static void prefork_fork_master(
 static void prefork_restart(struct tevent_context *ev,
 			    struct restart_context *rc)
 {
-	unsigned max_backoff = 10;
-	unsigned backoff = 2;
+	unsigned max_backoff = 0;
+	unsigned backoff = 0;
 	unsigned restart_delay = rc->restart_delay;
-
-	unsigned default_value;
+	unsigned default_value = 0;
 
 	default_value = lpcfg_prefork_backoff_increment(rc->lp_ctx);
 	backoff = lpcfg_parm_int(rc->lp_ctx,
-- 
2.7.4


From d3671e811d071f24a8b8a57e943bacb6197c9115 Mon Sep 17 00:00:00 2001
From: Tim Beale <timbeale at catalyst.net.nz>
Date: Thu, 22 Nov 2018 10:35:01 +1300
Subject: [PATCH 7/7] source4 smbd prefork: Add code comments

Add some comments to the prefork code explaining what's going on.

Signed-off-by: Tim Beale <timbeale at catalyst.net.nz>
---
 source4/smbd/process_prefork.c | 51 +++++++++++++++++++++++++++++++++++++-----
 1 file changed, 46 insertions(+), 5 deletions(-)

diff --git a/source4/smbd/process_prefork.c b/source4/smbd/process_prefork.c
index c62f08f..49446d2 100644
--- a/source4/smbd/process_prefork.c
+++ b/source4/smbd/process_prefork.c
@@ -22,7 +22,16 @@
    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
-
+/*
+ * The pre-fork process model distributes the server workload amongst several
+ * designated worker threads (e.g. 'prefork-worker-ldap-0',
+ * 'prefork-worker-ldap-1', etc). The number of worker threads is controlled
+ * by the 'prefork children' conf setting. The worker threads are controlled
+ * by a prefork master process (e.g. 'prefork-master-ldap'). The prefork master
+ * doesn't handle the server workload (i.e. processing messages) itself, but is
+ * responsible for restarting workers if they exit unexpectedly. The top-level
+ * samba process is responsible for restarting the master process if it exits.
+ */
 #include "includes.h"
 #include <unistd.h>
 
@@ -181,8 +190,10 @@ static void irpc_cleanup(
 }
 
 /*
-  handle EOF on the parent-to-all-children pipe in the child
-*/
+ * handle EOF on the parent-to-all-children pipe in the child, i.e.
+ * the parent has died and its end of the pipe has been closed.
+ * The child handles this by exiting as well.
+ */
 static void prefork_pipe_handler(struct tevent_context *event_ctx,
 		                 struct tevent_fd *fde, uint16_t flags,
 				 void *private_data)
@@ -209,7 +220,7 @@ static void prefork_pipe_handler(struct tevent_context *event_ctx,
 
 
 /*
- * called to create a new server task
+ * Called by the top-level samba process to create a new prefork master process
  */
 static void prefork_fork_master(
     struct tevent_context *ev,
@@ -354,6 +365,13 @@ static void prefork_fork_master(
 	}
 	DBG_NOTICE("Forking %d %s worker processes\n",
 		   num_children, service_name);
+
+	/*
+	 * the prefork master creates its own control pipe, so the prefork
+	 * workers can detect if the master exits (in which case an EOF gets
+	 * written). (Whereas from_parent_fd is the control pipe from the
+	 * top-level process that the prefork master listens on)
+	 */
 	{
 		int ret;
 		ret = pipe(control_pipe);
@@ -386,8 +404,11 @@ static void prefork_fork_master(
 	/* We need to keep ev2 until we're finished for the messaging to work */
 	TALLOC_FREE(ev2);
 	exit(0);
-
 }
+
+/*
+ * Restarts a child process if it exits unexpectedly
+ */
 static void prefork_restart(struct tevent_context *ev,
 			    struct restart_context *rc)
 {
@@ -396,6 +417,11 @@ static void prefork_restart(struct tevent_context *ev,
 	unsigned restart_delay = rc->restart_delay;
 	unsigned default_value = 0;
 
+	/*
+	 * If the child process is constantly exiting, then restarting it can
+	 * consume a lot of resources. In which case, we want to backoff a bit
+	 * before respawning it
+	 */
 	default_value = lpcfg_prefork_backoff_increment(rc->lp_ctx);
 	backoff = lpcfg_parm_int(rc->lp_ctx,
 				 NULL,
@@ -447,6 +473,7 @@ static void prefork_restart(struct tevent_context *ev,
 				    &pd);
 	}
 }
+
 /*
   handle EOF on the child pipe in the parent, so we know when a
   process terminates without using SIGCHLD or waiting on all possible pids.
@@ -585,6 +612,9 @@ static void setup_handlers(
 	}
 }
 
+/*
+ * Called by the prefork master to create a new prefork worker process
+ */
 static void prefork_fork_worker(struct task_server *task,
 				struct tevent_context *ev,
 				struct tevent_context *ev2,
@@ -609,6 +639,10 @@ static void prefork_fork_worker(struct task_server *task,
 		int fd = tfork_event_fd(w);
 		struct restart_context *rc = NULL;
 
+		/*
+		 * we're the parent (prefork master), so store enough info to
+		 * restart the worker/child if it exits unexpectedly
+		 */
 		rc = talloc_zero(ev, struct restart_context);
 		if (rc == NULL) {
 			smb_panic("OOM allocating restart context\n");
@@ -637,8 +671,15 @@ static void prefork_fork_worker(struct task_server *task,
 		}
 		tevent_fd_set_auto_close(fde);
 	} else {
+
+		/*
+		 * we're the child (prefork-worker). We never write to the
+		 * control pipe, but listen on the read end in case our parent
+		 * (the pre-fork master) exits
+		 */
 		close(control_pipe[1]);
 		setup_handlers(ev2, lp_ctx, control_pipe[0]);
+
 		/*
 		 * tfork uses malloc
 		 */
-- 
2.7.4



More information about the samba-technical mailing list