[PATCH] Add process restart to pre-fork process model

Gary Lockyer gary at catalyst.net.nz
Tue Nov 20 04:08:06 UTC 2018


Updated patch set that applies to current master, conflicted with latest
WHATSNEW.

With the patch this time.

Thanks

Gary

On 14/11/18 13:22, Gary Lockyer via samba-technical wrote:
> This patch set modifies the pre-fork process model so that it will
> restart failed processes.  Restart attempts are backed of using a linear
> back off strategy up to a configurable maximum delay.
> 
> CI results: https://gitlab.com/samba-team/devel/samba/pipelines/36482415
> 
> Reviews and comments appreciated
> 
> Gary
> 


-------------- next part --------------
From 585a9888469070d6a9f5bbeb0786ff95c716b722 Mon Sep 17 00:00:00 2001
From: Gary Lockyer <gary at catalyst.net.nz>
Date: Tue, 18 Sep 2018 08:37:02 +1200
Subject: [PATCH 01/11] source4 smbd test: prefork process restart

Add tests for the restarting of failed/terminated process, by the
pre-fork process model.

Signed-off-by: Gary Lockyer <gary at catalyst.net.nz>
---
 python/samba/tests/prefork_restart.py | 446 ++++++++++++++++++++++++++
 selftest/knownfail.d/prefork_restart  |   8 +
 selftest/target/Samba.pm              |   1 +
 selftest/target/Samba4.pm             |  39 +++
 source4/selftest/tests.py             |  12 +
 5 files changed, 506 insertions(+)
 create mode 100644 python/samba/tests/prefork_restart.py
 create mode 100644 selftest/knownfail.d/prefork_restart

diff --git a/python/samba/tests/prefork_restart.py b/python/samba/tests/prefork_restart.py
new file mode 100644
index 00000000000..90037d99461
--- /dev/null
+++ b/python/samba/tests/prefork_restart.py
@@ -0,0 +1,446 @@
+# Tests for process restarting in the pre-fork process model
+#
+# Copyright (C) Andrew Bartlett <abartlet at samba.org> 2018
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+from __future__ import print_function
+"""Tests process restarting in the pre-fork process model.
+   NOTE: As this test kills samba processes it won't play nicely with other
+         tests, so needs to be run in it's own environment.
+"""
+
+
+import os
+import signal
+import time
+
+import samba
+from samba.tests import TestCase, delete_force
+from samba.dcerpc import echo, netlogon
+from samba.messaging import Messaging
+from samba.samdb import SamDB
+from samba.credentials import Credentials, DONT_USE_KERBEROS
+from samba.compat import get_string
+from samba.dsdb import (
+    UF_WORKSTATION_TRUST_ACCOUNT,
+    UF_PASSWD_NOTREQD)
+from samba.dcerpc.misc import SEC_CHAN_WKSTA
+from samba.auth import system_session
+
+NUM_WORKERS = 4
+MACHINE_NAME = "PFRS"
+
+
+class PreforkProcessRestartTests(TestCase):
+
+    def setUp(self):
+        super(PreforkProcessRestartTests, self).setUp()
+        lp_ctx = self.get_loadparm()
+        self.msg_ctx = Messaging(lp_ctx=lp_ctx)
+
+    def tearDown(self):
+        super(PreforkProcessRestartTests, self).tearDown()
+
+    def get_process_data(self):
+        services = self.msg_ctx.irpc_all_servers()
+
+        processes = []
+        for service in services:
+            for id in service.ids:
+                processes.append((service.name, id.pid))
+        return processes
+
+    def get_process(self, name):
+        processes = self.get_process_data()
+        for pname, pid in processes:
+            if name == pname:
+                return pid
+        return None
+
+    def get_worker_pids(self, name, workers):
+        pids = []
+        for x in range(workers):
+            process_name = "prefork-worker-{0}-{1}".format(name, x)
+            pids.append(self.get_process(process_name))
+            self.assertIsNotNone(pids[x])
+        return pids
+
+    def wait_for_process(self, name, pid, initial_delay, wait, timeout):
+        time.sleep(initial_delay)
+        delay = initial_delay
+        while delay < timeout:
+            p = self.get_process(name)
+            if p is not None and p != pid:
+                # process has restarted
+                return
+            time.sleep(wait)
+            delay += wait
+        self.fail("Times out after {0} seconds waiting for {1} to restart".
+                  format(delay, name))
+
+    def check_for_duplicate_processes(self):
+            processes = self.get_process_data()
+            process_map = {}
+            for name, p in processes:
+                if (name.startswith("prefork-") or
+                    name.endswith("_server") or
+                    name.endswith("srv")):
+
+                    if name in process_map:
+                        if p != process_map[name]:
+                            self.fail(
+                                "Duplicate process for {0}, pids {1} and {2}".
+                                format(name, p, process_map[name]))
+
+    def simple_bind(self):
+        creds = self.insta_creds(template=self.get_credentials())
+        creds.set_bind_dn("%s\\%s" % (creds.get_domain(),
+                                      creds.get_username()))
+
+        self.samdb = SamDB(url="ldaps://%s" % os.environ["SERVER"],
+                           lp=self.get_loadparm(),
+                           credentials=creds)
+
+    def rpc_echo(self):
+        conn = echo.rpcecho("ncalrpc:", self.get_loadparm())
+        self.assertEquals([1, 2, 3], conn.EchoData([1, 2, 3]))
+
+    def netlogon(self):
+        server = os.environ["SERVER"]
+        host = os.environ["SERVER_IP"]
+        lp = self.get_loadparm()
+
+        credentials = self.get_credentials()
+
+        session = system_session()
+        ldb = SamDB(url="ldap://%s" % host,
+                    session_info=session,
+                    credentials=credentials,
+                    lp=lp)
+        machine_pass = samba.generate_random_password(32, 32)
+        machine_name = MACHINE_NAME
+        machine_dn = "cn=%s,%s" % (machine_name, ldb.domain_dn())
+
+        delete_force(ldb, machine_dn)
+
+        utf16pw = ('"%s"' % get_string(machine_pass)).encode('utf-16-le')
+        ldb.add({
+            "dn": machine_dn,
+            "objectclass": "computer",
+            "sAMAccountName": "%s$" % machine_name,
+            "userAccountControl":
+                str(UF_WORKSTATION_TRUST_ACCOUNT | UF_PASSWD_NOTREQD),
+            "unicodePwd": utf16pw})
+
+        machine_creds = Credentials()
+        machine_creds.guess(lp)
+        machine_creds.set_secure_channel_type(SEC_CHAN_WKSTA)
+        machine_creds.set_kerberos_state(DONT_USE_KERBEROS)
+        machine_creds.set_password(machine_pass)
+        machine_creds.set_username(machine_name + "$")
+        machine_creds.set_workstation(machine_name)
+
+        netlogon.netlogon(
+            "ncacn_ip_tcp:%s[schannel,seal]" % server,
+            lp,
+            machine_creds)
+
+        delete_force(ldb, machine_dn)
+
+    def test_ldap_master_restart(self):
+        # check ldap connection, do a simple bind
+        self.simple_bind()
+
+        # get ldap master process
+        pid = self.get_process("prefork-master-ldap")
+        self.assertIsNotNone(pid)
+
+        # Get the worker processes
+        workers = self.get_worker_pids("ldap", NUM_WORKERS)
+
+        # kill it
+        os.kill(pid, signal.SIGTERM)
+
+        # wait for the process to restart
+        self.wait_for_process("prefork-master-ldap", pid, 1, 1, 30)
+
+        # get ldap master process
+        new_pid = self.get_process("prefork-master-ldap")
+        self.assertIsNotNone(new_pid)
+
+        # check that the pid has changed
+        self.assertNotEquals(pid, new_pid)
+
+        # check that the worker processes have restarted
+        new_workers = self.get_worker_pids("ldap", NUM_WORKERS)
+        for x in range(NUM_WORKERS):
+            self.assertNotEquals(workers[x], new_workers[x])
+
+        # check that the previous server entries have been removed.
+        self.check_for_duplicate_processes()
+
+        # check ldap connection, another simple bind
+        self.simple_bind()
+
+    def test_ldap_worker_restart(self):
+        # check ldap connection, do a simple bind
+        self.simple_bind()
+
+        # get ldap master process
+        pid = self.get_process("prefork-master-ldap")
+        self.assertIsNotNone(pid)
+
+        # Get the worker processes
+        workers = self.get_worker_pids("ldap", NUM_WORKERS)
+
+        # kill worker 0
+        os.kill(workers[0], signal.SIGTERM)
+
+        # wait for the process to restart
+        self.wait_for_process("prefork-worker-ldap-0", pid, 1, 1, 30)
+
+        # get ldap master process
+        new_pid = self.get_process("prefork-master-ldap")
+        self.assertIsNotNone(new_pid)
+
+        # check that the pid has not changed
+        self.assertEquals(pid, new_pid)
+
+        # check that the worker processes have restarted
+        new_workers = self.get_worker_pids("ldap", NUM_WORKERS)
+        # process 0 should have a new pid the others should be unchanged
+        self.assertNotEquals(workers[0], new_workers[0])
+        self.assertEquals(workers[1], new_workers[1])
+        self.assertEquals(workers[2], new_workers[2])
+        self.assertEquals(workers[3], new_workers[3])
+
+        # check that the previous server entries have been removed.
+        self.check_for_duplicate_processes()
+
+        # check ldap connection, another simple bind
+        self.simple_bind()
+
+    #
+    # Kill all the ldap worker processes and ensure that they are restarted
+    # correctly
+    #
+    def test_ldap_all_workers_restart(self):
+        # check ldap connection, do a simple bind
+        self.simple_bind()
+
+        # get ldap master process
+        pid = self.get_process("prefork-master-ldap")
+        self.assertIsNotNone(pid)
+
+        # Get the worker processes
+        workers = self.get_worker_pids("ldap", NUM_WORKERS)
+
+        # kill all the worker processes
+        for x in workers:
+            os.kill(x, signal.SIGTERM)
+
+        # wait for the worker processes to restart
+        for x in range(NUM_WORKERS):
+            self.wait_for_process(
+                "prefork-worker-ldap-{0}".format(x), workers[x], 0, 1, 30)
+
+        # get ldap master process
+        new_pid = self.get_process("prefork-master-ldap")
+        self.assertIsNotNone(new_pid)
+
+        # check that the pid has not changed
+        self.assertEquals(pid, new_pid)
+
+        # check that the worker processes have restarted
+        new_workers = self.get_worker_pids("ldap", NUM_WORKERS)
+        for x in range(4):
+            self.assertNotEquals(workers[x], new_workers[x])
+
+        # check that the previous server entries have been removed.
+        self.check_for_duplicate_processes()
+
+        # check ldap connection, another simple bind
+        self.simple_bind()
+
+    def test_rpc_master_restart(self):
+        # check rpc connection, make a rpc echo request
+        self.rpc_echo()
+
+        # get rpc master process
+        pid = self.get_process("prefork-master-rpc")
+        self.assertIsNotNone(pid)
+
+        # Get the worker processes
+        workers = self.get_worker_pids("rpc", NUM_WORKERS)
+
+        # kill it
+        os.kill(pid, signal.SIGTERM)
+
+        # wait for the process to restart
+        self.wait_for_process("prefork-master-rpc", pid, 1, 1, 30)
+
+        # get ldap master process
+        new_pid = self.get_process("prefork-master-rpc")
+        self.assertIsNotNone(new_pid)
+
+        # check that the pid has changed
+        self.assertNotEquals(pid, new_pid)
+
+        # check that the worker processes have restarted
+        new_workers = self.get_worker_pids("rpc", NUM_WORKERS)
+        for x in range(NUM_WORKERS):
+            self.assertNotEquals(workers[x], new_workers[x])
+
+        # check that the previous server entries have been removed.
+        self.check_for_duplicate_processes()
+
+        # check rpc connection, another rpc echo request
+        self.rpc_echo()
+
+    def test_rpc_worker_zero_restart(self):
+        # check rpc connection, make a rpc echo request and a netlogon request
+        self.rpc_echo()
+        self.netlogon()
+
+        # get rpc master process
+        pid = self.get_process("prefork-master-rpc")
+        self.assertIsNotNone(pid)
+
+        # Get the worker processes
+        workers = self.get_worker_pids("rpc", NUM_WORKERS)
+
+        # kill worker 0
+        os.kill(workers[0], signal.SIGTERM)
+
+        # wait for the process to restart
+        self.wait_for_process("prefork-worker-rpc-0", pid, 1, 1, 30)
+
+        # get rpc master process
+        new_pid = self.get_process("prefork-master-rpc")
+        self.assertIsNotNone(new_pid)
+
+        # check that the pid has not changed
+        self.assertEquals(pid, new_pid)
+
+        # check that the worker processes have restarted
+        new_workers = self.get_worker_pids("rpc", NUM_WORKERS)
+        # process 0 should have a new pid the others should be unchanged
+        self.assertNotEquals(workers[0], new_workers[0])
+        self.assertEquals(workers[1], new_workers[1])
+        self.assertEquals(workers[2], new_workers[2])
+        self.assertEquals(workers[3], new_workers[3])
+
+        # check that the previous server entries have been removed.
+        self.check_for_duplicate_processes()
+
+        # check rpc connection, another rpc echo request, and netlogon request
+        self.rpc_echo()
+        self.netlogon()
+
+    def test_rpc_all_workers_restart(self):
+        # check rpc connection, make a rpc echo request, and a netlogon request
+        self.rpc_echo()
+        self.netlogon()
+
+        # get rpc master process
+        pid = self.get_process("prefork-master-rpc")
+        self.assertIsNotNone(pid)
+
+        # Get the worker processes
+        workers = self.get_worker_pids("rpc", NUM_WORKERS)
+
+        # kill all the worker processes
+        for x in workers:
+            os.kill(x, signal.SIGTERM)
+
+        # wait for the worker processes to restart
+        for x in range(NUM_WORKERS):
+            self.wait_for_process(
+                "prefork-worker-rpc-{0}".format(x), workers[x], 0, 1, 30)
+
+        # get rpc master process
+        new_pid = self.get_process("prefork-master-rpc")
+        self.assertIsNotNone(new_pid)
+
+        # check that the pid has not changed
+        self.assertEquals(pid, new_pid)
+
+        # check that the worker processes have restarted
+        new_workers = self.get_worker_pids("rpc", NUM_WORKERS)
+        for x in range(4):
+            self.assertNotEquals(workers[x], new_workers[x])
+
+        # check that the previous server entries have been removed.
+        self.check_for_duplicate_processes()
+
+        # check rpc connection, another rpc echo request and netlogon
+        self.rpc_echo()
+        self.netlogon()
+
+    def test_master_restart_backoff(self):
+
+        # get kdc master process
+        pid = self.get_process("prefork-master-kdc")
+        self.assertIsNotNone(pid)
+
+        # Get the worker processes
+        workers = self.get_worker_pids("kdc", NUM_WORKERS)
+
+        #
+        # Check that the processes get backed off as expected
+        #
+        # have prefork backoff increment = 5
+        #      prefork maximum backoff   = 10
+        # Set the expected values to one less than the next ideal value
+        # 0, 5, 10, 10, ...
+        for expected in [4, 9, 14, 14]:
+            process = self.get_process("prefork-master-kdc")
+            os.kill(process, signal.SIGTERM)
+            # wait for the process to restart
+            start = time.time()
+            self.wait_for_process("prefork-master-kdc", process, 0, 1, 30)
+            end = time.time()
+            duration = end - start
+            self.assertTrue(duration < expected)
+
+        # check that the worker processes have restarted
+        new_workers = self.get_worker_pids("kdc", NUM_WORKERS)
+        for x in range(NUM_WORKERS):
+            self.assertNotEquals(workers[x], new_workers[x])
+
+        # check that the previous server entries have been removed.
+        self.check_for_duplicate_processes()
+
+    def test_worker_restart_backoff(self):
+        #
+        # Check that the processes get backed off as expected
+        #
+        # have prefork backoff increment = 5
+        #      prefork maximum backoff   = 10
+        # Set the expected values to one less than the next ideal value
+        # 0, 5, 10, 10, ...
+        for expected in [4, 9, 14, 14]:
+            process = self.get_process("prefork-worker-kdc-2")
+            self.assertIsNotNone(process)
+            os.kill(process, signal.SIGTERM)
+            # wait for the process to restart
+            start = time.time()
+            self.wait_for_process("prefork-worker-kdc-2", process, 0, 1, 30)
+            end = time.time()
+            duration = end - start
+            self.assertTrue(duration < expected)
+
+        self.check_for_duplicate_processes()
diff --git a/selftest/knownfail.d/prefork_restart b/selftest/knownfail.d/prefork_restart
new file mode 100644
index 00000000000..020199cd9b4
--- /dev/null
+++ b/selftest/knownfail.d/prefork_restart
@@ -0,0 +1,8 @@
+^samba.tests.prefork_restart.samba.tests.prefork_restart.PreforkProcessRestartTests.test_ldap_all_workers_restart
+^samba.tests.prefork_restart.samba.tests.prefork_restart.PreforkProcessRestartTests.test_ldap_master_restart
+^samba.tests.prefork_restart.samba.tests.prefork_restart.PreforkProcessRestartTests.test_ldap_worker_restart
+^samba.tests.prefork_restart.samba.tests.prefork_restart.PreforkProcessRestartTests.test_master_restart_backoff
+^samba.tests.prefork_restart.samba.tests.prefork_restart.PreforkProcessRestartTests.test_rpc_all_workers_restart
+^samba.tests.prefork_restart.samba.tests.prefork_restart.PreforkProcessRestartTests.test_rpc_master_restart
+^samba.tests.prefork_restart.samba.tests.prefork_restart.PreforkProcessRestartTests.test_rpc_worker_zero_restart
+^samba.tests.prefork_restart.samba.tests.prefork_restart.PreforkProcessRestartTests.test_worker_restart_backoff
diff --git a/selftest/target/Samba.pm b/selftest/target/Samba.pm
index aa6ec9ee931..5f83645476f 100644
--- a/selftest/target/Samba.pm
+++ b/selftest/target/Samba.pm
@@ -424,6 +424,7 @@ sub get_interface($)
     $interfaces{"labdc"} = 43;
     $interfaces{"offlinebackupdc"} = 44;
     $interfaces{"customdc"} = 45;
+    $interfaces{"prfrkrstrt"} = 46;
 
     # update lib/socket_wrapper/socket_wrapper.c
     #  #define MAX_WRAPPED_INTERFACES 64
diff --git a/selftest/target/Samba4.pm b/selftest/target/Samba4.pm
index 5de0a706f35..e667acce4e3 100755
--- a/selftest/target/Samba4.pm
+++ b/selftest/target/Samba4.pm
@@ -2206,6 +2206,7 @@ sub check_env($$)
 	renamedc             => ["backupfromdc"],
 	offlinebackupdc      => ["backupfromdc"],
 	labdc                => ["backupfromdc"],
+	preforkrestartdc     => [],
 
 	none                 => [],
 );
@@ -2627,6 +2628,44 @@ sub setup_ad_dc_no_ntlm
 	return $env;
 }
 
+#
+# ad_dc test environment used solely to test pre-frork process restarts
+# as processes get killed off and restarted it should not be used for other
+# tests.
+sub setup_preforkrestartdc
+{
+	my ($self, $path) = @_;
+
+	# If we didn't build with ADS, pretend this env was never available
+	if (not $self->{target3}->have_ads()) {
+	       return "UNKNOWN";
+	}
+
+	my $env = $self->provision_ad_dc(
+		$path,
+		"prfrkrstrt",
+		"PRFRKRSTRTDOM",
+		"prfrkrstrt.samba.example.com",
+		"prefork backoff increment = 5\nprefork maximum backoff=10");
+	unless ($env) {
+		return undef;
+	}
+
+	$env->{NSS_WRAPPER_MODULE_SO_PATH} = undef;
+	$env->{NSS_WRAPPER_MODULE_FN_PREFIX} = undef;
+
+	if (not defined($self->check_or_start($env, "prefork"))) {
+	    return undef;
+	}
+
+	my $upn_array = ["$env->{REALM}.upn"];
+	my $spn_array = ["$env->{REALM}.spn"];
+
+	$self->setup_namespaces($env, $upn_array, $spn_array);
+
+	return $env;
+}
+
 # Sets up a DC that's solely used to do a domain backup from. We then use the
 # backupfrom-DC to create the restore-DC - this proves that the backup/restore
 # process will create a Samba DC that will actually start up.
diff --git a/source4/selftest/tests.py b/source4/selftest/tests.py
index c4b7d18444c..60e682495fe 100755
--- a/source4/selftest/tests.py
+++ b/source4/selftest/tests.py
@@ -1232,3 +1232,15 @@ plantestsuite("samba4.dsdb.samdb.ldb_modules.group_audit", "none",
               [os.path.join(bindir(), "test_group_audit")])
 plantestsuite("samba4.dsdb.samdb.ldb_modules.group_audit.errors", "none",
               [os.path.join(bindir(), "test_group_audit_errors")])
+
+# process restart and limit tests, these break the environment so need to run
+# in their own specific environaments
+planoldpythontestsuite("preforkrestartdc:local",
+                       "samba.tests.prefork_restart",
+                       extra_path=[
+                           os.path.join(srcdir(), 'python/samba/tests')],
+                       extra_args=['-U"$USERNAME%$PASSWORD"'],
+                       environ={'CLIENT_IP': '127.0.0.11',
+                                'SOCKET_WRAPPER_DEFAULT_IFACE': 11},
+                       name="samba.tests.prefork_restart",
+                       py3_compatible=True)
-- 
2.17.1


From b2da9787c716b77aab7b03bc031975f1fa60fdc2 Mon Sep 17 00:00:00 2001
From: Gary Lockyer <gary at catalyst.net.nz>
Date: Fri, 31 Aug 2018 11:40:18 +1200
Subject: [PATCH 02/11] source4 smdb prefork: Pass restart information

Pass information about the pre-fork master and worker processes that
will allow them to be restarted.

Signed-off-by: Gary Lockyer <gary at catalyst.net.nz>
---
 source4/smbd/process_prefork.c | 176 ++++++++++++++++++++++++---------
 1 file changed, 129 insertions(+), 47 deletions(-)

diff --git a/source4/smbd/process_prefork.c b/source4/smbd/process_prefork.c
index 016385c762c..c8ed942b741 100644
--- a/source4/smbd/process_prefork.c
+++ b/source4/smbd/process_prefork.c
@@ -37,6 +37,35 @@
 
 NTSTATUS process_model_prefork_init(void);
 
+/*
+ * State needed to restart the master process or a worker process if they
+ * terminate early.
+ */
+struct master_restart_context {
+	struct loadparm_context *lp_ctx;
+	struct task_server *(*new_task_fn)(struct tevent_context *,
+					   struct loadparm_context *lp_ctx,
+					   struct server_id,
+					   void *,
+					   void *);
+	void *private_data;
+};
+
+struct worker_restart_context {
+	unsigned int instance;
+	struct task_server *task;
+	struct tevent_context *ev2;
+};
+
+struct restart_context {
+	struct tfork *t;
+	int from_parent_fd;
+	const struct service_details *service_details;
+	const char *service_name;
+	struct master_restart_context *master;
+	struct worker_restart_context *worker;
+};
+
 static void sighup_signal_handler(struct tevent_context *ev,
 				struct tevent_signal *se,
 				int signum, int count, void *siginfo,
@@ -110,7 +139,7 @@ static void prefork_child_pipe_handler(struct tevent_context *ev,
 				       uint16_t flags,
 				       void *private_data)
 {
-	struct tfork *t = NULL;
+	struct restart_context *rc = NULL;
 	int status = 0;
 	pid_t pid = 0;
 
@@ -119,11 +148,10 @@ static void prefork_child_pipe_handler(struct tevent_context *ev,
 
 	/* the child has closed the pipe, assume its dead */
 
-	/* tfork allocates tfork structures with malloc  */
-	t = (struct tfork*)private_data;
-	pid = tfork_child_pid(t);
+	rc = talloc_get_type_abort(private_data, struct restart_context);
+	pid = tfork_child_pid(rc->t);
 	errno = 0;
-	status = tfork_status(&t, false);
+	status = tfork_status(&rc->t, false);
 	if (status == -1) {
 		DBG_ERR("Parent %d, Child %d terminated, "
 			"unable to get status code from tfork\n",
@@ -138,7 +166,8 @@ static void prefork_child_pipe_handler(struct tevent_context *ev,
 			getpid(), pid, status);
 	}
 	/* tfork allocates tfork structures with malloc */
-	free(t);
+	free(rc->t);
+	TALLOC_FREE(rc);
 	return;
 }
 
@@ -220,6 +249,72 @@ static void setup_handlers(struct tevent_context *ev, int from_parent_fd) {
 	}
 }
 
+static void prefork_fork_worker(struct task_server *task,
+				struct tevent_context *ev,
+				struct tevent_context *ev2,
+				const struct service_details *service_details,
+				const char *service_name,
+				int from_parent_fd,
+				struct process_details *pd)
+{
+	struct tfork *w = NULL;
+	pid_t pid;
+
+	w = tfork_create();
+	if (w == NULL) {
+		smb_panic("failure in tfork\n");
+	}
+
+	pid = tfork_child_pid(w);
+	if (pid != 0) {
+		struct tevent_fd *fde = NULL;
+		int fd = tfork_event_fd(w);
+		struct restart_context *rc = NULL;
+
+		rc = talloc_zero(ev, struct restart_context);
+		if (rc == NULL) {
+			smb_panic("OOM allocating restart context\n");
+		}
+		rc->t = w;
+		rc->service_name = service_name;
+		rc->service_details = service_details;
+		rc->from_parent_fd = from_parent_fd;
+		rc->master = NULL;
+		rc->worker = talloc_zero(rc, struct worker_restart_context);
+		if (rc->worker == NULL) {
+			smb_panic("OOM allocating master restart context\n");
+		}
+		rc->worker->ev2 = ev2;
+		rc->worker->instance = pd->instances;
+		rc->worker->task = task;
+
+		fde = tevent_add_fd(
+		    ev, ev, fd, TEVENT_FD_READ, prefork_child_pipe_handler, rc);
+		if (fde == NULL) {
+			smb_panic("Failed to add child pipe handler, "
+				  "after fork");
+		}
+		tevent_fd_set_auto_close(fde);
+	} else {
+		/*
+		 * tfork uses malloc
+		 */
+		free(w);
+
+		TALLOC_FREE(ev);
+		setproctitle("task[%s] pre-forked worker(%d)",
+			     service_name,
+			     pd->instances);
+		prefork_reload_after_fork();
+		setup_handlers(ev2, from_parent_fd);
+		if (service_details->post_fork != NULL) {
+			service_details->post_fork(task, pd);
+		}
+		tevent_loop_wait(ev2);
+		talloc_free(ev2);
+		exit(0);
+	}
+}
 /*
  * called to create a new server task
  */
@@ -251,12 +346,30 @@ static void prefork_new_task(
 	if (pid != 0) {
 		struct tevent_fd *fde = NULL;
 		int fd = tfork_event_fd(t);
+		struct restart_context *rc = NULL;
 
 		/* Register a pipe handler that gets called when the prefork
 		 * master process terminates.
 		 */
-		fde = tevent_add_fd(ev, ev, fd, TEVENT_FD_READ,
-				    prefork_child_pipe_handler, t);
+		rc = talloc_zero(ev, struct restart_context);
+		if (rc == NULL) {
+			smb_panic("OOM allocating restart context\n");
+		}
+		rc->t = t;
+		rc->service_name = service_name;
+		rc->service_details = service_details;
+		rc->from_parent_fd = from_parent_fd;
+		rc->master = talloc_zero(rc, struct master_restart_context);
+		if (rc->master == NULL) {
+			smb_panic("OOM allocating master restart context\n");
+		}
+
+		rc->master->lp_ctx = lp_ctx;
+		rc->master->new_task_fn = new_task_fn;
+		rc->master->private_data = private_data;
+
+		fde = tevent_add_fd(
+		    ev, ev, fd, TEVENT_FD_READ, prefork_child_pipe_handler, rc);
 		if (fde == NULL) {
 			smb_panic("Failed to add child pipe handler, "
 				  "after fork");
@@ -330,45 +443,14 @@ static void prefork_new_task(
 	 * We are now free to spawn some worker processes
 	 */
 	for (i=0; i < num_children; i++) {
-		struct tfork* w = NULL;
-
-		w = tfork_create();
-		if (w == NULL) {
-			smb_panic("failure in tfork\n");
-		}
-
-		pid = tfork_child_pid(w);
-		if (pid != 0) {
-			struct tevent_fd *fde = NULL;
-			int fd = tfork_event_fd(w);
-
-			fde = tevent_add_fd(ev, ev, fd, TEVENT_FD_READ,
-					    prefork_child_pipe_handler, w);
-			if (fde == NULL) {
-				smb_panic("Failed to add child pipe handler, "
-					  "after fork");
-			}
-			tevent_fd_set_auto_close(fde);
-			pd.instances++;
-		} else {
-			/*
-			 * tfork uses malloc
-			 */
-			free(w);
-
-			TALLOC_FREE(ev);
-			setproctitle("task[%s] pre-forked worker(%d)",
-				     service_name,
-				     pd.instances);
-			prefork_reload_after_fork();
-			setup_handlers(ev2, from_parent_fd);
-			if (service_details->post_fork != NULL) {
-				service_details->post_fork(task, &pd);
-			}
-			tevent_loop_wait(ev2);
-			talloc_free(ev2);
-			exit(0);
-		}
+		prefork_fork_worker(task,
+				    ev,
+				    ev2,
+				    service_details,
+				    service_name,
+				    from_parent_fd,
+				    &pd);
+		pd.instances++;
 	}
 
 	/* Don't listen on the sockets we just gave to the children */
-- 
2.17.1


From 8418f7c2ee926a2f6d4e523368041d0fc6f94233 Mon Sep 17 00:00:00 2001
From: Gary Lockyer <gary at catalyst.net.nz>
Date: Mon, 3 Sep 2018 09:34:17 +1200
Subject: [PATCH 03/11] source4 smdb prefork: Restart failed processes

Restart any pre-forked master or worker process that terminated with SIGABRT,
SIGBUS, SIGFPE, SIGILL or SIGSYS

Signed-off-by: Gary Lockyer <gary at catalyst.net.nz>
---
 source4/smbd/process_prefork.c | 75 ++++++++++++++++++++++++++++++++--
 1 file changed, 71 insertions(+), 4 deletions(-)

diff --git a/source4/smbd/process_prefork.c b/source4/smbd/process_prefork.c
index c8ed942b741..35513bc7118 100644
--- a/source4/smbd/process_prefork.c
+++ b/source4/smbd/process_prefork.c
@@ -36,6 +36,25 @@
 #include "lib/util/tfork.h"
 
 NTSTATUS process_model_prefork_init(void);
+static void prefork_new_task(
+    struct tevent_context *ev,
+    struct loadparm_context *lp_ctx,
+    const char *service_name,
+    struct task_server *(*new_task_fn)(struct tevent_context *,
+				       struct loadparm_context *lp_ctx,
+				       struct server_id,
+				       void *,
+				       void *),
+    void *private_data,
+    const struct service_details *service_details,
+    int from_parent_fd);
+static void prefork_fork_worker(struct task_server *task,
+				struct tevent_context *ev,
+				struct tevent_context *ev2,
+				const struct service_details *service_details,
+				const char *service_name,
+				int control_pipe[2],
+				struct process_details *pd);
 
 /*
  * State needed to restart the master process or a worker process if they
@@ -55,6 +74,7 @@ struct worker_restart_context {
 	unsigned int instance;
 	struct task_server *task;
 	struct tevent_context *ev2;
+	int control_pipe[2];
 };
 
 struct restart_context {
@@ -164,6 +184,39 @@ static void prefork_child_pipe_handler(struct tevent_context *ev,
 		status = WTERMSIG(status);
 		DBG_ERR("Parent %d, Child %d terminated with signal %d\n",
 			getpid(), pid, status);
+		if (status == SIGABRT || status == SIGBUS || status == SIGFPE ||
+		    status == SIGILL || status == SIGSYS) {
+			/*
+			 * Lets restart the process.
+			 *
+			 */
+			tfork_destroy(&rc->t);
+			if (rc->master != NULL) {
+				DBG_ERR("Restarting [%s] pre-fork master\n",
+					rc->service_name);
+				prefork_new_task(ev,
+						 rc->master->lp_ctx,
+						 rc->service_name,
+						 rc->master->new_task_fn,
+						 rc->master->private_data,
+						 rc->service_details,
+						 rc->from_parent_fd);
+			} else if (rc->worker != NULL) {
+				struct process_details pd =
+				    initial_process_details;
+				DBG_ERR("Restarting [%s] pre-fork worker(%d)\n",
+					rc->service_name,
+					rc->worker->instance);
+				pd.instances = rc->worker->instance;
+				prefork_fork_worker(rc->worker->task,
+						    ev,
+						    rc->worker->ev2,
+						    rc->service_details,
+						    rc->service_name,
+						    rc->worker->control_pipe,
+						    &pd);
+			}
+		}
 	}
 	/* tfork allocates tfork structures with malloc */
 	free(rc->t);
@@ -254,7 +307,7 @@ static void prefork_fork_worker(struct task_server *task,
 				struct tevent_context *ev2,
 				const struct service_details *service_details,
 				const char *service_name,
-				int from_parent_fd,
+				int control_pipe[2],
 				struct process_details *pd)
 {
 	struct tfork *w = NULL;
@@ -278,7 +331,6 @@ static void prefork_fork_worker(struct task_server *task,
 		rc->t = w;
 		rc->service_name = service_name;
 		rc->service_details = service_details;
-		rc->from_parent_fd = from_parent_fd;
 		rc->master = NULL;
 		rc->worker = talloc_zero(rc, struct worker_restart_context);
 		if (rc->worker == NULL) {
@@ -287,6 +339,8 @@ static void prefork_fork_worker(struct task_server *task,
 		rc->worker->ev2 = ev2;
 		rc->worker->instance = pd->instances;
 		rc->worker->task = task;
+		rc->worker->control_pipe[0] = control_pipe[0];
+		rc->worker->control_pipe[1] = control_pipe[1];
 
 		fde = tevent_add_fd(
 		    ev, ev, fd, TEVENT_FD_READ, prefork_child_pipe_handler, rc);
@@ -296,6 +350,8 @@ static void prefork_fork_worker(struct task_server *task,
 		}
 		tevent_fd_set_auto_close(fde);
 	} else {
+		close(control_pipe[1]);
+		setup_handlers(ev2, control_pipe[0]);
 		/*
 		 * tfork uses malloc
 		 */
@@ -306,7 +362,6 @@ static void prefork_fork_worker(struct task_server *task,
 			     service_name,
 			     pd->instances);
 		prefork_reload_after_fork();
-		setup_handlers(ev2, from_parent_fd);
 		if (service_details->post_fork != NULL) {
 			service_details->post_fork(task, pd);
 		}
@@ -336,12 +391,14 @@ static void prefork_new_task(
 	struct tevent_context *ev2;
 	struct task_server *task = NULL;
 	struct process_details pd = initial_process_details;
+	int control_pipe[2];
 
 	t = tfork_create();
 	if (t == NULL) {
 		smb_panic("failure in tfork\n");
 	}
 
+	DBG_NOTICE("Forking [%s] pre-fork master process\n", service_name);
 	pid = tfork_child_pid(t);
 	if (pid != 0) {
 		struct tevent_fd *fde = NULL;
@@ -439,6 +496,16 @@ static void prefork_new_task(
 	}
 	DBG_NOTICE("Forking %d %s worker processes\n",
 		   num_children, service_name);
+	{
+		int ret;
+		ret = pipe(control_pipe);
+		if (ret != 0) {
+			smb_panic("Unable to create worker control pipe\n");
+		}
+		smb_set_close_on_exec(control_pipe[0]);
+		smb_set_close_on_exec(control_pipe[0]);
+	}
+
 	/*
 	 * We are now free to spawn some worker processes
 	 */
@@ -448,7 +515,7 @@ static void prefork_new_task(
 				    ev2,
 				    service_details,
 				    service_name,
-				    from_parent_fd,
+				    control_pipe,
 				    &pd);
 		pd.instances++;
 	}
-- 
2.17.1


From d937375fac4d62b07265e67dda15976cca7ac837 Mon Sep 17 00:00:00 2001
From: Gary Lockyer <gary at catalyst.net.nz>
Date: Tue, 4 Sep 2018 10:09:38 +1200
Subject: [PATCH 04/11] source4 smbd process: pass the fatal flag to terminate

Pass the fatal flag supplied to task_server_terminate to the process
task_terminate method.  It will be used by the task_terminate methods to
set an appropriate exit code.  The process_prefork model will use a non
zero exit code to indicate that the process should be restarted.

Signed-off-by: Gary Lockyer <gary at catalyst.net.nz>
---
 source4/smbd/process_model.h    | 3 ++-
 source4/smbd/process_prefork.c  | 1 +
 source4/smbd/process_single.c   | 1 +
 source4/smbd/process_standard.c | 1 +
 source4/smbd/service_stream.c   | 3 ++-
 source4/smbd/service_task.c     | 4 ++--
 6 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/source4/smbd/process_model.h b/source4/smbd/process_model.h
index 17d70254cf2..2a226cef8cb 100644
--- a/source4/smbd/process_model.h
+++ b/source4/smbd/process_model.h
@@ -69,7 +69,8 @@ struct model_ops {
 	void (*terminate)(struct tevent_context *,
 			  struct loadparm_context *lp_ctx,
 			  const char *reason,
-			  void * process_context);
+			  bool fatal,
+			  void *process_context);
 
 	/* function to set a title for the connection or task */
 	void (*set_title)(struct tevent_context *, const char *title);
diff --git a/source4/smbd/process_prefork.c b/source4/smbd/process_prefork.c
index 35513bc7118..762f32aa977 100644
--- a/source4/smbd/process_prefork.c
+++ b/source4/smbd/process_prefork.c
@@ -534,6 +534,7 @@ static void prefork_new_task(
 static void prefork_terminate(struct tevent_context *ev,
 			      struct loadparm_context *lp_ctx,
 			      const char *reason,
+			      bool fatal,
 			      void *process_context)
 {
 	DBG_DEBUG("called with reason[%s]\n", reason);
diff --git a/source4/smbd/process_single.c b/source4/smbd/process_single.c
index 174c1572812..061f09673dd 100644
--- a/source4/smbd/process_single.c
+++ b/source4/smbd/process_single.c
@@ -124,6 +124,7 @@ static void single_new_task(struct tevent_context *ev,
 static void single_terminate(struct tevent_context *ev,
 			     struct loadparm_context *lp_ctx,
 			     const char *reason,
+			     bool fatal,
 			     void *process_context)
 {
 	DBG_NOTICE("single_terminate: reason[%s]\n",reason);
diff --git a/source4/smbd/process_standard.c b/source4/smbd/process_standard.c
index 91dfa9753c5..536a6be3007 100644
--- a/source4/smbd/process_standard.c
+++ b/source4/smbd/process_standard.c
@@ -513,6 +513,7 @@ static void standard_new_task(struct tevent_context *ev,
 static void standard_terminate(struct tevent_context *ev,
 			       struct loadparm_context *lp_ctx,
 			       const char *reason,
+			       bool fatal,
 			       void *process_context)
 {
 	struct process_context *proc_ctx = NULL;
diff --git a/source4/smbd/service_stream.c b/source4/smbd/service_stream.c
index fc996d942e6..336a0cb32aa 100644
--- a/source4/smbd/service_stream.c
+++ b/source4/smbd/service_stream.c
@@ -57,6 +57,7 @@ void stream_terminate_connection(struct stream_connection *srv_conn, const char
 	const struct model_ops *model_ops = srv_conn->model_ops;
 	struct loadparm_context *lp_ctx = srv_conn->lp_ctx;
 	void *process_context = srv_conn->process_context;
+	bool fatal = true;
 	TALLOC_CTX *frame = NULL;
 
 	if (!reason) reason = "unknown reason";
@@ -91,7 +92,7 @@ void stream_terminate_connection(struct stream_connection *srv_conn, const char
 	srv_conn->event.fde = NULL;
 	imessaging_cleanup(srv_conn->msg_ctx);
 	TALLOC_FREE(srv_conn);
-	model_ops->terminate(event_ctx, lp_ctx, reason, process_context);
+	model_ops->terminate(event_ctx, lp_ctx, reason, fatal, process_context);
 	TALLOC_FREE(frame);
 }
 
diff --git a/source4/smbd/service_task.c b/source4/smbd/service_task.c
index 15e480ec043..cb1f4d5ad1f 100644
--- a/source4/smbd/service_task.c
+++ b/source4/smbd/service_task.c
@@ -54,8 +54,8 @@ void task_server_terminate(struct task_server *task, const char *reason, bool fa
 
 	imessaging_cleanup(task->msg_ctx);
 
-	model_ops->terminate(event_ctx, task->lp_ctx, reason,
-			     task->process_context);
+	model_ops->terminate(
+	    event_ctx, task->lp_ctx, reason, fatal, task->process_context);
 	/* don't free this above, it might contain the 'reason' being printed */
 	talloc_free(task);
 }
-- 
2.17.1


From 35e839bbde165396d7dd4c9b1d93fdb222f90f71 Mon Sep 17 00:00:00 2001
From: Gary Lockyer <gary at catalyst.net.nz>
Date: Tue, 4 Sep 2018 12:12:49 +1200
Subject: [PATCH 05/11] source4 smbd prefork: restart on non zero exit code

Restart any pre-fork master or worker process that exits with a non
zero exit code.

Signed-off-by: Gary Lockyer <gary at catalyst.net.nz>
---
 source4/kdc/kdc-heimdal.c       |  3 +-
 source4/smbd/process_model.h    | 17 ++++--
 source4/smbd/process_prefork.c  | 97 ++++++++++++++++++++-------------
 source4/smbd/process_single.c   | 30 +++++++---
 source4/smbd/process_standard.c | 32 +++++++----
 source4/smbd/service_stream.c   |  4 +-
 source4/smbd/service_task.c     |  2 +-
 7 files changed, 118 insertions(+), 67 deletions(-)

diff --git a/source4/kdc/kdc-heimdal.c b/source4/kdc/kdc-heimdal.c
index 67c07c3d81a..b5de5a790d4 100644
--- a/source4/kdc/kdc-heimdal.c
+++ b/source4/kdc/kdc-heimdal.c
@@ -276,7 +276,8 @@ static NTSTATUS kdc_task_init(struct task_server *task)
 		return NT_STATUS_INVALID_DOMAIN_ROLE;
 	case ROLE_DOMAIN_PDC:
 	case ROLE_DOMAIN_BDC:
-		task_server_terminate(task, "Cannot start KDC as a 'classic Samba' DC", true);
+		task_server_terminate(
+		    task, "Cannot start KDC as a 'classic Samba' DC", false);
 		return NT_STATUS_INVALID_DOMAIN_ROLE;
 	case ROLE_ACTIVE_DIRECTORY_DC:
 		/* Yes, we want a KDC */
diff --git a/source4/smbd/process_model.h b/source4/smbd/process_model.h
index 2a226cef8cb..2892dd43c04 100644
--- a/source4/smbd/process_model.h
+++ b/source4/smbd/process_model.h
@@ -65,12 +65,17 @@ struct model_ops {
 			 const struct service_details*,
 			 const int);
 
-	/* function to terminate a connection or task */
-	void (*terminate)(struct tevent_context *,
-			  struct loadparm_context *lp_ctx,
-			  const char *reason,
-			  bool fatal,
-			  void *process_context);
+	/* function to terminate a task */
+	void (*terminate_task)(struct tevent_context *,
+			       struct loadparm_context *lp_ctx,
+			       const char *reason,
+			       bool fatal,
+			       void *process_context);
+	/* function to terminate a connection */
+	void (*terminate_connection)(struct tevent_context *,
+				     struct loadparm_context *lp_ctx,
+				     const char *reason,
+				     void *process_context);
 
 	/* function to set a title for the connection or task */
 	void (*set_title)(struct tevent_context *, const char *title);
diff --git a/source4/smbd/process_prefork.c b/source4/smbd/process_prefork.c
index 762f32aa977..37b258d75ed 100644
--- a/source4/smbd/process_prefork.c
+++ b/source4/smbd/process_prefork.c
@@ -147,6 +147,33 @@ static void prefork_pipe_handler(struct tevent_context *event_ctx,
 	exit(0);
 }
 
+static void prefork_restart(struct tevent_context *ev,
+			    struct restart_context *rc)
+{
+	if (rc->master != NULL) {
+		DBG_ERR("Restarting [%s] pre-fork master\n", rc->service_name);
+		prefork_new_task(ev,
+				 rc->master->lp_ctx,
+				 rc->service_name,
+				 rc->master->new_task_fn,
+				 rc->master->private_data,
+				 rc->service_details,
+				 rc->from_parent_fd);
+	} else if (rc->worker != NULL) {
+		struct process_details pd = initial_process_details;
+		DBG_ERR("Restarting [%s] pre-fork worker(%d)\n",
+			rc->service_name,
+			rc->worker->instance);
+		pd.instances = rc->worker->instance;
+		prefork_fork_worker(rc->worker->task,
+				    ev,
+				    rc->worker->ev2,
+				    rc->service_details,
+				    rc->service_name,
+				    rc->worker->control_pipe,
+				    &pd);
+	}
+}
 /*
   handle EOF on the child pipe in the parent, so we know when a
   process terminates without using SIGCHLD or waiting on all possible pids.
@@ -176,49 +203,26 @@ static void prefork_child_pipe_handler(struct tevent_context *ev,
 		DBG_ERR("Parent %d, Child %d terminated, "
 			"unable to get status code from tfork\n",
 			getpid(), pid);
+		prefork_restart(ev, rc);
 	} else if (WIFEXITED(status)) {
 		status = WEXITSTATUS(status);
 		DBG_ERR("Parent %d, Child %d exited with status %d\n",
 			 getpid(), pid,  status);
+		if (status != 0) {
+			prefork_restart(ev, rc);
+		}
 	} else if (WIFSIGNALED(status)) {
 		status = WTERMSIG(status);
 		DBG_ERR("Parent %d, Child %d terminated with signal %d\n",
 			getpid(), pid, status);
 		if (status == SIGABRT || status == SIGBUS || status == SIGFPE ||
 		    status == SIGILL || status == SIGSYS) {
-			/*
-			 * Lets restart the process.
-			 *
-			 */
-			tfork_destroy(&rc->t);
-			if (rc->master != NULL) {
-				DBG_ERR("Restarting [%s] pre-fork master\n",
-					rc->service_name);
-				prefork_new_task(ev,
-						 rc->master->lp_ctx,
-						 rc->service_name,
-						 rc->master->new_task_fn,
-						 rc->master->private_data,
-						 rc->service_details,
-						 rc->from_parent_fd);
-			} else if (rc->worker != NULL) {
-				struct process_details pd =
-				    initial_process_details;
-				DBG_ERR("Restarting [%s] pre-fork worker(%d)\n",
-					rc->service_name,
-					rc->worker->instance);
-				pd.instances = rc->worker->instance;
-				prefork_fork_worker(rc->worker->task,
-						    ev,
-						    rc->worker->ev2,
-						    rc->service_details,
-						    rc->service_name,
-						    rc->worker->control_pipe,
-						    &pd);
-			}
+
+			prefork_restart(ev, rc);
 		}
 	}
 	/* tfork allocates tfork structures with malloc */
+	tfork_destroy(&rc->t);
 	free(rc->t);
 	TALLOC_FREE(rc);
 	return;
@@ -529,15 +533,31 @@ static void prefork_new_task(
 
 }
 
-
-/* called when a task goes down */
-static void prefork_terminate(struct tevent_context *ev,
-			      struct loadparm_context *lp_ctx,
-			      const char *reason,
-			      bool fatal,
-			      void *process_context)
+/*
+ * called when a task terminates
+ */
+static void prefork_terminate_task(struct tevent_context *ev,
+				   struct loadparm_context *lp_ctx,
+				   const char *reason,
+				   bool fatal,
+				   void *process_context)
 {
 	DBG_DEBUG("called with reason[%s]\n", reason);
+	if (fatal == true) {
+		exit(127);
+	} else {
+		exit(0);
+	}
+}
+
+/*
+ * called when a connection completes
+ */
+static void prefork_terminate_connection(struct tevent_context *ev,
+					 struct loadparm_context *lp_ctx,
+					 const char *reason,
+					 void *process_context)
+{
 }
 
 /* called to set a title of a task or connection */
@@ -550,7 +570,8 @@ static const struct model_ops prefork_ops = {
 	.model_init		= prefork_model_init,
 	.accept_connection	= prefork_accept_connection,
 	.new_task		= prefork_new_task,
-	.terminate		= prefork_terminate,
+	.terminate_task		= prefork_terminate_task,
+	.terminate_connection	= prefork_terminate_connection,
 	.set_title		= prefork_set_title,
 };
 
diff --git a/source4/smbd/process_single.c b/source4/smbd/process_single.c
index 061f09673dd..12ed2e30a74 100644
--- a/source4/smbd/process_single.c
+++ b/source4/smbd/process_single.c
@@ -119,17 +119,28 @@ static void single_new_task(struct tevent_context *ev,
 	}
 }
 
-
-/* called when a task goes down */
-static void single_terminate(struct tevent_context *ev,
-			     struct loadparm_context *lp_ctx,
-			     const char *reason,
-			     bool fatal,
-			     void *process_context)
+/*
+ * Called when a task goes down
+ */
+static void single_terminate_task(struct tevent_context *ev,
+				  struct loadparm_context *lp_ctx,
+				  const char *reason,
+				  bool fatal,
+				  void *process_context)
 {
 	DBG_NOTICE("single_terminate: reason[%s]\n",reason);
 }
 
+/*
+ * Called when a connection has ended
+ */
+static void single_terminate_connection(struct tevent_context *ev,
+					struct loadparm_context *lp_ctx,
+					const char *reason,
+					void *process_context)
+{
+}
+
 /* called to set a title of a task or connection */
 static void single_set_title(struct tevent_context *ev, const char *title) 
 {
@@ -138,9 +149,10 @@ static void single_set_title(struct tevent_context *ev, const char *title)
 const struct model_ops single_ops = {
 	.name			= "single",
 	.model_init		= single_model_init,
-	.new_task               = single_new_task,
+	.new_task		= single_new_task,
 	.accept_connection	= single_accept_connection,
-	.terminate              = single_terminate,
+	.terminate_task		= single_terminate_task,
+	.terminate_connection	= single_terminate_connection,
 	.set_title		= single_set_title,
 };
 
diff --git a/source4/smbd/process_standard.c b/source4/smbd/process_standard.c
index 536a6be3007..d25538716ba 100644
--- a/source4/smbd/process_standard.c
+++ b/source4/smbd/process_standard.c
@@ -510,15 +510,27 @@ static void standard_new_task(struct tevent_context *ev,
 
 
 /* called when a task goes down */
-static void standard_terminate(struct tevent_context *ev,
-			       struct loadparm_context *lp_ctx,
-			       const char *reason,
-			       bool fatal,
-			       void *process_context)
+static void standard_terminate_task(struct tevent_context *ev,
+				    struct loadparm_context *lp_ctx,
+				    const char *reason,
+				    bool fatal,
+				    void *process_context)
+{
+	if (fatal == true) {
+		exit(127);
+	}
+	exit(0);
+}
+
+/* called when a connection terminates*/
+static void standard_terminate_connection(struct tevent_context *ev,
+					  struct loadparm_context *lp_ctx,
+					  const char *reason,
+					  void *process_context)
 {
 	struct process_context *proc_ctx = NULL;
 
-	DBG_DEBUG("process terminating reason[%s]\n", reason);
+	DBG_DEBUG("connection terminating reason[%s]\n", reason);
 	if (process_context == NULL) {
 		smb_panic("Panicking process_context is NULL");
 	}
@@ -547,7 +559,6 @@ static void standard_terminate(struct tevent_context *ev,
 	/* terminate this process */
 	exit(0);
 }
-
 /* called to set a title of a task or connection */
 static void standard_set_title(struct tevent_context *ev, const char *title) 
 {
@@ -562,9 +573,10 @@ static const struct model_ops standard_ops = {
 	.name			= "standard",
 	.model_init		= standard_model_init,
 	.accept_connection	= standard_accept_connection,
-	.new_task               = standard_new_task,
-	.terminate              = standard_terminate,
-	.set_title              = standard_set_title,
+	.new_task		= standard_new_task,
+	.terminate_task		= standard_terminate_task,
+	.terminate_connection	= standard_terminate_connection,
+	.set_title		= standard_set_title,
 };
 
 /*
diff --git a/source4/smbd/service_stream.c b/source4/smbd/service_stream.c
index 336a0cb32aa..d0ad45aca3a 100644
--- a/source4/smbd/service_stream.c
+++ b/source4/smbd/service_stream.c
@@ -57,7 +57,6 @@ void stream_terminate_connection(struct stream_connection *srv_conn, const char
 	const struct model_ops *model_ops = srv_conn->model_ops;
 	struct loadparm_context *lp_ctx = srv_conn->lp_ctx;
 	void *process_context = srv_conn->process_context;
-	bool fatal = true;
 	TALLOC_CTX *frame = NULL;
 
 	if (!reason) reason = "unknown reason";
@@ -92,7 +91,8 @@ void stream_terminate_connection(struct stream_connection *srv_conn, const char
 	srv_conn->event.fde = NULL;
 	imessaging_cleanup(srv_conn->msg_ctx);
 	TALLOC_FREE(srv_conn);
-	model_ops->terminate(event_ctx, lp_ctx, reason, fatal, process_context);
+	model_ops->terminate_connection(
+	    event_ctx, lp_ctx, reason, process_context);
 	TALLOC_FREE(frame);
 }
 
diff --git a/source4/smbd/service_task.c b/source4/smbd/service_task.c
index cb1f4d5ad1f..d911027db0a 100644
--- a/source4/smbd/service_task.c
+++ b/source4/smbd/service_task.c
@@ -54,7 +54,7 @@ void task_server_terminate(struct task_server *task, const char *reason, bool fa
 
 	imessaging_cleanup(task->msg_ctx);
 
-	model_ops->terminate(
+	model_ops->terminate_task(
 	    event_ctx, task->lp_ctx, reason, fatal, task->process_context);
 	/* don't free this above, it might contain the 'reason' being printed */
 	talloc_free(task);
-- 
2.17.1


From 871ebb10c611ff8fa5a5bbc7c431461b4f54d375 Mon Sep 17 00:00:00 2001
From: Gary Lockyer <gary at catalyst.net.nz>
Date: Wed, 5 Sep 2018 07:31:22 +1200
Subject: [PATCH 06/11] source4 smbd prefork: Add backoff to process restart

Add new smbd.conf variables 'prefork backoff increment' and
'prefork maximum backoff' to control the rate at which failed pre-forked
processes are restarted.

Signed-off-by: Gary Lockyer <gary at catalyst.net.nz>
---
 .../base/preforkbackoffincrement.xml          |  25 ++
 .../smbdotconf/base/preforkmaximumbackoff.xml |  13 +
 lib/param/loadparm.c                          |   2 +
 source3/param/loadparm.c                      |   2 +
 source4/smbd/process_prefork.c                | 378 +++++++++++-------
 5 files changed, 270 insertions(+), 150 deletions(-)
 create mode 100644 docs-xml/smbdotconf/base/preforkbackoffincrement.xml
 create mode 100644 docs-xml/smbdotconf/base/preforkmaximumbackoff.xml

diff --git a/docs-xml/smbdotconf/base/preforkbackoffincrement.xml b/docs-xml/smbdotconf/base/preforkbackoffincrement.xml
new file mode 100644
index 00000000000..2cb1cc32fd8
--- /dev/null
+++ b/docs-xml/smbdotconf/base/preforkbackoffincrement.xml
@@ -0,0 +1,25 @@
+<samba:parameter name="prefork backoff increment"
+                 context="G"
+                 type="integer"
+                 xmlns:samba="http://www.samba.org/samba/DTD/samba-doc">
+<description>
+	<para>This option specifies the number of seconds added to the delay
+		before a prefork master or worker process is restarted.  The
+		restart is initially zero, the prefork backoff increment is
+		added to the delay on each restart up to the value specified by
+		"prefork maximum backoff".
+	</para>
+
+	<para>Additionally the the backoff for an individual service by using
+		"prefork backoff increment: service name"
+		i.e. "prefork backoff increment:ldap = 2" to set the
+		backoff increment to 2.</para>
+
+	<para>If the backoff increment is 2 and the maximum backoff is 5.
+		There will be a zero second delay for the first restart. A two
+		second delay for the second restart. A four second delay for the
+		third and any subsequent restarts</para>
+</description>
+
+<value type="default">10</value>
+</samba:parameter>
diff --git a/docs-xml/smbdotconf/base/preforkmaximumbackoff.xml b/docs-xml/smbdotconf/base/preforkmaximumbackoff.xml
new file mode 100644
index 00000000000..17e530d5db8
--- /dev/null
+++ b/docs-xml/smbdotconf/base/preforkmaximumbackoff.xml
@@ -0,0 +1,13 @@
+<samba:parameter name="prefork maximum backoff"
+                 context="G"
+                 type="integer"
+                 xmlns:samba="http://www.samba.org/samba/DTD/samba-doc">
+<description>
+	<para>This option controls the maximum delay before a failed pre-fork
+		process is restarted.
+	</para>
+
+</description>
+
+<value type="default">120</value>
+</samba:parameter>
diff --git a/lib/param/loadparm.c b/lib/param/loadparm.c
index 484c8913009..a7dbc6f8f0b 100644
--- a/lib/param/loadparm.c
+++ b/lib/param/loadparm.c
@@ -2997,6 +2997,8 @@ struct loadparm_context *loadparm_init(TALLOC_CTX *mem_ctx)
 				  "49152-65535");
 
 	lpcfg_do_global_parameter(lp_ctx, "prefork children", "4");
+	lpcfg_do_global_parameter(lp_ctx, "prefork backoff increment", "10");
+	lpcfg_do_global_parameter(lp_ctx, "prefork maximum backoff", "120");
 
 	lpcfg_do_global_parameter(lp_ctx, "check parent directory delete on close", "no");
 
diff --git a/source3/param/loadparm.c b/source3/param/loadparm.c
index 495683083cb..994f9b0ce43 100644
--- a/source3/param/loadparm.c
+++ b/source3/param/loadparm.c
@@ -958,6 +958,8 @@ static void init_globals(struct loadparm_context *lp_ctx, bool reinit_globals)
 	Globals.rpc_low_port = SERVER_TCP_LOW_PORT;
 	Globals.rpc_high_port = SERVER_TCP_HIGH_PORT;
 	Globals.prefork_children = 4;
+	Globals.prefork_backoff_increment = 10;
+	Globals.prefork_maximum_backoff = 120;
 
 	/* Now put back the settings that were set with lp_set_cmdline() */
 	apply_lp_set_cmdline();
diff --git a/source4/smbd/process_prefork.c b/source4/smbd/process_prefork.c
index 37b258d75ed..f963b83afd1 100644
--- a/source4/smbd/process_prefork.c
+++ b/source4/smbd/process_prefork.c
@@ -35,6 +35,8 @@
 #include "ldb_wrap.h"
 #include "lib/util/tfork.h"
 
+#define min(a, b) (((a) < (b)) ? (a) : (b))
+
 NTSTATUS process_model_prefork_init(void);
 static void prefork_new_task(
     struct tevent_context *ev,
@@ -51,17 +53,23 @@ static void prefork_new_task(
 static void prefork_fork_worker(struct task_server *task,
 				struct tevent_context *ev,
 				struct tevent_context *ev2,
+				struct loadparm_context *lp_ctx,
 				const struct service_details *service_details,
 				const char *service_name,
 				int control_pipe[2],
+				unsigned restart_delay,
 				struct process_details *pd);
+static void prefork_child_pipe_handler(struct tevent_context *ev,
+				       struct tevent_fd *fde,
+				       uint16_t flags,
+				       void *private_data);
+static void setup_handlers(struct tevent_context *ev, int from_parent_fd);
 
 /*
  * State needed to restart the master process or a worker process if they
  * terminate early.
  */
 struct master_restart_context {
-	struct loadparm_context *lp_ctx;
 	struct task_server *(*new_task_fn)(struct tevent_context *,
 					   struct loadparm_context *lp_ctx,
 					   struct server_id,
@@ -78,10 +86,12 @@ struct worker_restart_context {
 };
 
 struct restart_context {
+	struct loadparm_context *lp_ctx;
 	struct tfork *t;
 	int from_parent_fd;
 	const struct service_details *service_details;
 	const char *service_name;
+	unsigned restart_delay;
 	struct master_restart_context *master;
 	struct worker_restart_context *worker;
 };
@@ -147,18 +157,214 @@ static void prefork_pipe_handler(struct tevent_context *event_ctx,
 	exit(0);
 }
 
+
+/*
+ * called to create a new server task
+ */
+static void prefork_fork_master(
+    struct tevent_context *ev,
+    struct loadparm_context *lp_ctx,
+    const char *service_name,
+    struct task_server *(*new_task_fn)(struct tevent_context *,
+				       struct loadparm_context *lp_ctx,
+				       struct server_id,
+				       void *,
+				       void *),
+    void *private_data,
+    const struct service_details *service_details,
+    unsigned restart_delay,
+    int from_parent_fd)
+{
+	pid_t pid;
+	struct tfork* t = NULL;
+	int i, num_children;
+
+	struct tevent_context *ev2;
+	struct task_server *task = NULL;
+	struct process_details pd = initial_process_details;
+	int control_pipe[2];
+
+	t = tfork_create();
+	if (t == NULL) {
+		smb_panic("failure in tfork\n");
+	}
+
+	DBG_NOTICE("Forking [%s] pre-fork master process\n", service_name);
+	pid = tfork_child_pid(t);
+	if (pid != 0) {
+		struct tevent_fd *fde = NULL;
+		int fd = tfork_event_fd(t);
+		struct restart_context *rc = NULL;
+
+		/* Register a pipe handler that gets called when the prefork
+		 * master process terminates.
+		 */
+		rc = talloc_zero(ev, struct restart_context);
+		if (rc == NULL) {
+			smb_panic("OOM allocating restart context\n");
+		}
+		rc->t = t;
+		rc->lp_ctx = lp_ctx;
+		rc->service_name = service_name;
+		rc->service_details = service_details;
+		rc->from_parent_fd = from_parent_fd;
+		rc->restart_delay = restart_delay;
+		rc->master = talloc_zero(rc, struct master_restart_context);
+		if (rc->master == NULL) {
+			smb_panic("OOM allocating master restart context\n");
+		}
+
+		rc->master->new_task_fn = new_task_fn;
+		rc->master->private_data = private_data;
+
+		fde = tevent_add_fd(
+		    ev, ev, fd, TEVENT_FD_READ, prefork_child_pipe_handler, rc);
+		if (fde == NULL) {
+			smb_panic("Failed to add child pipe handler, "
+				  "after fork");
+		}
+		tevent_fd_set_auto_close(fde);
+		return;
+	}
+
+	pid = getpid();
+	setproctitle("task[%s] pre-fork master", service_name);
+
+	/*
+	 * this will free all the listening sockets and all state that
+	 * is not associated with this new connection
+	 */
+	if (tevent_re_initialise(ev) != 0) {
+		smb_panic("Failed to re-initialise tevent after fork");
+	}
+	prefork_reload_after_fork();
+	setup_handlers(ev, from_parent_fd);
+
+	if (service_details->inhibit_pre_fork) {
+		task = new_task_fn(
+		    ev, lp_ctx, cluster_id(pid, 0), private_data, NULL);
+		/*
+		 * The task does not support pre-fork
+		 */
+		if (task != NULL && service_details->post_fork != NULL) {
+			service_details->post_fork(task, &pd);
+		}
+		tevent_loop_wait(ev);
+		TALLOC_FREE(ev);
+		exit(0);
+	}
+
+	/*
+	 * This is now the child code. We need a completely new event_context
+	 * to work with
+	 */
+	ev2 = s4_event_context_init(NULL);
+
+	/* setup this new connection: process will bind to it's sockets etc
+	 *
+	 * While we can use ev for the child, which has been re-initialised
+	 * above we must run the new task under ev2 otherwise the children would
+	 * be listening on the sockets.  Also we don't want the top level
+	 * process accepting and handling requests, it's responsible for
+	 * monitoring and controlling the child work processes.
+	 */
+	task = new_task_fn(ev2, lp_ctx, cluster_id(pid, 0), private_data, NULL);
+	if (task == NULL) {
+		TALLOC_FREE(ev);
+		TALLOC_FREE(ev2);
+		exit(0);
+	}
+
+	{
+		int default_children;
+		default_children = lpcfg_prefork_children(lp_ctx);
+		num_children = lpcfg_parm_int(lp_ctx, NULL, "prefork children",
+			                      service_name, default_children);
+	}
+	if (num_children == 0) {
+		DBG_WARNING("Number of pre-fork children for %s is zero, "
+			    "NO worker processes will be started for %s\n",
+			    service_name, service_name);
+	}
+	DBG_NOTICE("Forking %d %s worker processes\n",
+		   num_children, service_name);
+	{
+		int ret;
+		ret = pipe(control_pipe);
+		if (ret != 0) {
+			smb_panic("Unable to create worker control pipe\n");
+		}
+		smb_set_close_on_exec(control_pipe[0]);
+		smb_set_close_on_exec(control_pipe[0]);
+	}
+
+	/*
+	 * We are now free to spawn some worker processes
+	 */
+	for (i=0; i < num_children; i++) {
+		prefork_fork_worker(task,
+				    ev,
+				    ev2,
+				    lp_ctx,
+				    service_details,
+				    service_name,
+				    control_pipe,
+				    0,
+				    &pd);
+		pd.instances++;
+	}
+
+	/* Don't listen on the sockets we just gave to the children */
+	tevent_loop_wait(ev);
+	TALLOC_FREE(ev);
+	/* We need to keep ev2 until we're finished for the messaging to work */
+	TALLOC_FREE(ev2);
+	exit(0);
+
+}
 static void prefork_restart(struct tevent_context *ev,
 			    struct restart_context *rc)
 {
+	unsigned max_backoff = 10;
+	unsigned backoff = 2;
+	unsigned restart_delay = rc->restart_delay;
+
+	unsigned default_value;
+
+	default_value = lpcfg_prefork_backoff_increment(rc->lp_ctx);
+	backoff = lpcfg_parm_int(rc->lp_ctx,
+				 NULL,
+				 "prefork backoff increment",
+				 rc->service_name,
+				 default_value);
+
+	default_value = lpcfg_prefork_maximum_backoff(rc->lp_ctx);
+	max_backoff = lpcfg_parm_int(rc->lp_ctx,
+				     NULL,
+				     "prefork maximum backoff",
+				     rc->service_name,
+				     default_value);
+
+	if (restart_delay > 0) {
+		DBG_ERR("Restarting [%s] pre-fork %s in (%d) seconds\n",
+			rc->service_name,
+			(rc->master == NULL) ? "worker" : "master",
+			restart_delay);
+		sleep(restart_delay);
+	}
+	restart_delay += backoff;
+	restart_delay = min(restart_delay, max_backoff);
+
 	if (rc->master != NULL) {
 		DBG_ERR("Restarting [%s] pre-fork master\n", rc->service_name);
-		prefork_new_task(ev,
-				 rc->master->lp_ctx,
-				 rc->service_name,
-				 rc->master->new_task_fn,
-				 rc->master->private_data,
-				 rc->service_details,
-				 rc->from_parent_fd);
+		prefork_fork_master(ev,
+				    rc->lp_ctx,
+				    rc->service_name,
+				    rc->master->new_task_fn,
+				    rc->master->private_data,
+				    rc->service_details,
+				    restart_delay,
+				    rc->from_parent_fd);
 	} else if (rc->worker != NULL) {
 		struct process_details pd = initial_process_details;
 		DBG_ERR("Restarting [%s] pre-fork worker(%d)\n",
@@ -168,9 +374,11 @@ static void prefork_restart(struct tevent_context *ev,
 		prefork_fork_worker(rc->worker->task,
 				    ev,
 				    rc->worker->ev2,
+				    rc->lp_ctx,
 				    rc->service_details,
 				    rc->service_name,
 				    rc->worker->control_pipe,
+				    restart_delay,
 				    &pd);
 	}
 }
@@ -309,9 +517,11 @@ static void setup_handlers(struct tevent_context *ev, int from_parent_fd) {
 static void prefork_fork_worker(struct task_server *task,
 				struct tevent_context *ev,
 				struct tevent_context *ev2,
+				struct loadparm_context *lp_ctx,
 				const struct service_details *service_details,
 				const char *service_name,
 				int control_pipe[2],
+				unsigned restart_delay,
 				struct process_details *pd)
 {
 	struct tfork *w = NULL;
@@ -333,8 +543,10 @@ static void prefork_fork_worker(struct task_server *task,
 			smb_panic("OOM allocating restart context\n");
 		}
 		rc->t = w;
+		rc->lp_ctx = lp_ctx;
 		rc->service_name = service_name;
 		rc->service_details = service_details;
+		rc->restart_delay = restart_delay;
 		rc->master = NULL;
 		rc->worker = talloc_zero(rc, struct worker_restart_context);
 		if (rc->worker == NULL) {
@@ -388,148 +600,14 @@ static void prefork_new_task(
 	const struct service_details *service_details,
 	int from_parent_fd)
 {
-	pid_t pid;
-	struct tfork* t = NULL;
-	int i, num_children;
-
-	struct tevent_context *ev2;
-	struct task_server *task = NULL;
-	struct process_details pd = initial_process_details;
-	int control_pipe[2];
-
-	t = tfork_create();
-	if (t == NULL) {
-		smb_panic("failure in tfork\n");
-	}
-
-	DBG_NOTICE("Forking [%s] pre-fork master process\n", service_name);
-	pid = tfork_child_pid(t);
-	if (pid != 0) {
-		struct tevent_fd *fde = NULL;
-		int fd = tfork_event_fd(t);
-		struct restart_context *rc = NULL;
-
-		/* Register a pipe handler that gets called when the prefork
-		 * master process terminates.
-		 */
-		rc = talloc_zero(ev, struct restart_context);
-		if (rc == NULL) {
-			smb_panic("OOM allocating restart context\n");
-		}
-		rc->t = t;
-		rc->service_name = service_name;
-		rc->service_details = service_details;
-		rc->from_parent_fd = from_parent_fd;
-		rc->master = talloc_zero(rc, struct master_restart_context);
-		if (rc->master == NULL) {
-			smb_panic("OOM allocating master restart context\n");
-		}
-
-		rc->master->lp_ctx = lp_ctx;
-		rc->master->new_task_fn = new_task_fn;
-		rc->master->private_data = private_data;
-
-		fde = tevent_add_fd(
-		    ev, ev, fd, TEVENT_FD_READ, prefork_child_pipe_handler, rc);
-		if (fde == NULL) {
-			smb_panic("Failed to add child pipe handler, "
-				  "after fork");
-		}
-		tevent_fd_set_auto_close(fde);
-		return;
-	}
-
-	pid = getpid();
-	setproctitle("task[%s] pre-fork master", service_name);
-
-	/*
-	 * this will free all the listening sockets and all state that
-	 * is not associated with this new connection
-	 */
-	if (tevent_re_initialise(ev) != 0) {
-		smb_panic("Failed to re-initialise tevent after fork");
-	}
-	prefork_reload_after_fork();
-	setup_handlers(ev, from_parent_fd);
-
-	if (service_details->inhibit_pre_fork) {
-		task = new_task_fn(
-		    ev, lp_ctx, cluster_id(pid, 0), private_data, NULL);
-		/*
-		 * The task does not support pre-fork
-		 */
-		if (task != NULL && service_details->post_fork != NULL) {
-			service_details->post_fork(task, &pd);
-		}
-		tevent_loop_wait(ev);
-		TALLOC_FREE(ev);
-		exit(0);
-	}
-
-	/*
-	 * This is now the child code. We need a completely new event_context
-	 * to work with
-	 */
-	ev2 = s4_event_context_init(NULL);
-
-	/* setup this new connection: process will bind to it's sockets etc
-	 *
-	 * While we can use ev for the child, which has been re-initialised
-	 * above we must run the new task under ev2 otherwise the children would
-	 * be listening on the sockets.  Also we don't want the top level
-	 * process accepting and handling requests, it's responsible for
-	 * monitoring and controlling the child work processes.
-	 */
-	task = new_task_fn(ev2, lp_ctx, cluster_id(pid, 0), private_data, NULL);
-	if (task == NULL) {
-		TALLOC_FREE(ev);
-		TALLOC_FREE(ev2);
-		exit(0);
-	}
-
-	{
-		int default_children;
-		default_children = lpcfg_prefork_children(lp_ctx);
-		num_children = lpcfg_parm_int(lp_ctx, NULL, "prefork children",
-			                      service_name, default_children);
-	}
-	if (num_children == 0) {
-		DBG_WARNING("Number of pre-fork children for %s is zero, "
-			    "NO worker processes will be started for %s\n",
-			    service_name, service_name);
-	}
-	DBG_NOTICE("Forking %d %s worker processes\n",
-		   num_children, service_name);
-	{
-		int ret;
-		ret = pipe(control_pipe);
-		if (ret != 0) {
-			smb_panic("Unable to create worker control pipe\n");
-		}
-		smb_set_close_on_exec(control_pipe[0]);
-		smb_set_close_on_exec(control_pipe[0]);
-	}
-
-	/*
-	 * We are now free to spawn some worker processes
-	 */
-	for (i=0; i < num_children; i++) {
-		prefork_fork_worker(task,
-				    ev,
-				    ev2,
-				    service_details,
-				    service_name,
-				    control_pipe,
-				    &pd);
-		pd.instances++;
-	}
-
-	/* Don't listen on the sockets we just gave to the children */
-	tevent_loop_wait(ev);
-	TALLOC_FREE(ev);
-	/* We need to keep ev2 until we're finished for the messaging to work */
-	TALLOC_FREE(ev2);
-	exit(0);
+	prefork_fork_master(ev,
+			    lp_ctx,
+			    service_name,
+			    new_task_fn,
+			    private_data,
+			    service_details,
+			    0,
+			    from_parent_fd);
 
 }
 
-- 
2.17.1


From 491fab0433abecf843e04c6692c5e89a26837b38 Mon Sep 17 00:00:00 2001
From: Gary Lockyer <gary at catalyst.net.nz>
Date: Fri, 14 Sep 2018 09:43:59 +1200
Subject: [PATCH 07/11] source4 messaging: clean up terminated processes

Now that the smbd pre-fork process model restarts failed processes rather than
terminating, we end up with names registered to defunct processes.
This patch adds a function to clean up all the names registered to a process.

Signed-off-by: Gary Lockyer <gary at catalyst.net.nz>
---
 source4/lib/messaging/messaging.c | 42 +++++++++++++++++++++++++++++++
 source4/lib/messaging/messaging.h |  2 ++
 2 files changed, 44 insertions(+)

diff --git a/source4/lib/messaging/messaging.c b/source4/lib/messaging/messaging.c
index 935951f3fba..413a19445eb 100644
--- a/source4/lib/messaging/messaging.c
+++ b/source4/lib/messaging/messaging.c
@@ -248,6 +248,48 @@ static void imessaging_dgm_recv(struct tevent_context *ev,
 /* Keep a list of imessaging contexts */
 static struct imessaging_context *msg_ctxs;
 
+/*
+ * A process has terminated, clean-up any names it has registered.
+ */
+NTSTATUS imessaging_process_cleanup(
+	struct imessaging_context *msg_ctx,
+	pid_t pid)
+{
+	struct irpc_name_records *names = NULL;
+	int i = 0;
+	int j = 0;
+	TALLOC_CTX *mem_ctx = talloc_new(NULL);
+
+	if (mem_ctx == NULL) {
+		DBG_ERR("OOM unable to clean up messaging for process (%d)\n",
+			pid);
+		return NT_STATUS_NO_MEMORY;
+	}
+
+	names = irpc_all_servers(msg_ctx, mem_ctx);
+	if (names == NULL) {
+		TALLOC_FREE(mem_ctx);
+		return NT_STATUS_OK;
+	}
+	for (i = 0; i < names->num_records; i++) {
+		for (j = 0; j < names->names[i]->count; j++) {
+			if (names->names[i]->ids[j].pid == pid) {
+				int ret = server_id_db_prune_name(
+					msg_ctx->names,
+					names->names[i]->name,
+					names->names[i]->ids[j]);
+				if (ret != 0 && ret != ENOENT) {
+					TALLOC_FREE(mem_ctx);
+					return map_nt_error_from_unix_common(
+					    ret);
+				}
+			}
+		}
+	}
+	TALLOC_FREE(mem_ctx);
+	return NT_STATUS_OK;
+}
+
 static int imessaging_context_destructor(struct imessaging_context *msg)
 {
 	DLIST_REMOVE(msg_ctxs, msg);
diff --git a/source4/lib/messaging/messaging.h b/source4/lib/messaging/messaging.h
index 668464dc061..8eb195dfb2e 100644
--- a/source4/lib/messaging/messaging.h
+++ b/source4/lib/messaging/messaging.h
@@ -54,5 +54,7 @@ NTSTATUS imessaging_send_ptr(struct imessaging_context *msg, struct server_id se
 			    uint32_t msg_type, void *ptr);
 void imessaging_deregister(struct imessaging_context *msg, uint32_t msg_type, void *private_data);
 struct server_id imessaging_get_server_id(struct imessaging_context *msg_ctx);
+NTSTATUS imessaging_process_cleanup(struct imessaging_context *msg_ctx,
+				    pid_t pid);
 
 #endif
-- 
2.17.1


From 6190b4730f91e0d200a8bfad3c4e13850f83bc03 Mon Sep 17 00:00:00 2001
From: Gary Lockyer <gary at catalyst.net.nz>
Date: Fri, 14 Sep 2018 09:45:38 +1200
Subject: [PATCH 08/11] source4 smbd prefork: Cleanup messaging on restart

Clean up names registered in messaging for a terminated process.

Signed-off-by: Gary Lockyer <gary at catalyst.net.nz>
---
 selftest/knownfail.d/prefork_restart |   8 ---
 source4/smbd/process_prefork.c       | 104 ++++++++++++++++++++++++---
 2 files changed, 95 insertions(+), 17 deletions(-)
 delete mode 100644 selftest/knownfail.d/prefork_restart

diff --git a/selftest/knownfail.d/prefork_restart b/selftest/knownfail.d/prefork_restart
deleted file mode 100644
index 020199cd9b4..00000000000
--- a/selftest/knownfail.d/prefork_restart
+++ /dev/null
@@ -1,8 +0,0 @@
-^samba.tests.prefork_restart.samba.tests.prefork_restart.PreforkProcessRestartTests.test_ldap_all_workers_restart
-^samba.tests.prefork_restart.samba.tests.prefork_restart.PreforkProcessRestartTests.test_ldap_master_restart
-^samba.tests.prefork_restart.samba.tests.prefork_restart.PreforkProcessRestartTests.test_ldap_worker_restart
-^samba.tests.prefork_restart.samba.tests.prefork_restart.PreforkProcessRestartTests.test_master_restart_backoff
-^samba.tests.prefork_restart.samba.tests.prefork_restart.PreforkProcessRestartTests.test_rpc_all_workers_restart
-^samba.tests.prefork_restart.samba.tests.prefork_restart.PreforkProcessRestartTests.test_rpc_master_restart
-^samba.tests.prefork_restart.samba.tests.prefork_restart.PreforkProcessRestartTests.test_rpc_worker_zero_restart
-^samba.tests.prefork_restart.samba.tests.prefork_restart.PreforkProcessRestartTests.test_worker_restart_backoff
diff --git a/source4/smbd/process_prefork.c b/source4/smbd/process_prefork.c
index f963b83afd1..f892f7f10d8 100644
--- a/source4/smbd/process_prefork.c
+++ b/source4/smbd/process_prefork.c
@@ -34,6 +34,7 @@
 #include "param/param.h"
 #include "ldb_wrap.h"
 #include "lib/util/tfork.h"
+#include "lib/messaging/irpc.h"
 
 #define min(a, b) (((a) < (b)) ? (a) : (b))
 
@@ -63,7 +64,9 @@ static void prefork_child_pipe_handler(struct tevent_context *ev,
 				       struct tevent_fd *fde,
 				       uint16_t flags,
 				       void *private_data);
-static void setup_handlers(struct tevent_context *ev, int from_parent_fd);
+static void setup_handlers(struct tevent_context *ev,
+			   struct loadparm_context *lp_ctx,
+                           int from_parent_fd);
 
 /*
  * State needed to restart the master process or a worker process if they
@@ -120,7 +123,7 @@ static void sigterm_signal_handler(struct tevent_context *ev,
 	}
 #endif
 	DBG_NOTICE("Exiting pid %d on SIGTERM\n", getpid());
-	talloc_free(ev);
+	TALLOC_FREE(ev);
 	exit(127);
 }
 
@@ -143,6 +146,40 @@ static void prefork_reload_after_fork(void)
 	}
 }
 
+/*
+ * clean up any messaging associated with the old process.
+ *
+ */
+static void irpc_cleanup(
+	struct loadparm_context *lp_ctx,
+	struct tevent_context *ev,
+	pid_t pid)
+{
+	TALLOC_CTX *mem_ctx = talloc_new(NULL);
+	struct imessaging_context *msg_ctx = NULL;
+	NTSTATUS status = NT_STATUS_OK;
+
+	if (mem_ctx == NULL) {
+		DBG_ERR("OOM cleaning up irpc\n");
+		return;
+	}
+	msg_ctx = imessaging_client_init(mem_ctx, lp_ctx, ev);
+	if (msg_ctx == NULL) {
+		DBG_ERR("Unable to create imessaging_context\n");
+		TALLOC_FREE(mem_ctx);
+		return;
+	}
+	status = imessaging_process_cleanup(msg_ctx, pid);
+	if (!NT_STATUS_IS_OK(status)) {
+		DBG_ERR("imessaging_process_cleanup returned (%s)\n",
+			nt_errstr(status));
+		TALLOC_FREE(mem_ctx);
+		return;
+	}
+
+	TALLOC_FREE(mem_ctx);
+}
+
 /*
   handle EOF on the parent-to-all-children pipe in the child
 */
@@ -150,10 +187,23 @@ static void prefork_pipe_handler(struct tevent_context *event_ctx,
 		                 struct tevent_fd *fde, uint16_t flags,
 				 void *private_data)
 {
-	/* free the fde which removes the event and stops it firing again */
+	struct loadparm_context *lp_ctx = NULL;
+	pid_t pid;
+
+	/*
+	 * free the fde which removes the event and stops it firing again
+	 */
 	TALLOC_FREE(fde);
+
+	/*
+	 * Clean up any irpc end points this process had.
+	 */
+	pid = getpid();
+	lp_ctx = talloc_get_type_abort(private_data, struct loadparm_context);
+	irpc_cleanup(lp_ctx, event_ctx, pid);
+
 	DBG_NOTICE("Child %d exiting\n", getpid());
-	talloc_free(event_ctx);
+	TALLOC_FREE(event_ctx);
 	exit(0);
 }
 
@@ -238,7 +288,7 @@ static void prefork_fork_master(
 		smb_panic("Failed to re-initialise tevent after fork");
 	}
 	prefork_reload_after_fork();
-	setup_handlers(ev, from_parent_fd);
+	setup_handlers(ev, lp_ctx, from_parent_fd);
 
 	if (service_details->inhibit_pre_fork) {
 		task = new_task_fn(
@@ -272,7 +322,23 @@ static void prefork_fork_master(
 	if (task == NULL) {
 		TALLOC_FREE(ev);
 		TALLOC_FREE(ev2);
-		exit(0);
+		exit(127);
+	}
+
+	/*
+	 * Register an irpc name that can be used by the samba-tool processes
+	 * command
+	 */
+	{
+		struct talloc_ctx *ctx = talloc_new(NULL);
+		char *name = NULL;
+		if (ctx == NULL) {
+			DBG_ERR("Out of memory");
+			exit(127);
+		}
+		name = talloc_asprintf(ctx, "prefork-master-%s", service_name);
+		irpc_add_name(task->msg_ctx, name);
+		TALLOC_FREE(ctx);
 	}
 
 	{
@@ -406,6 +472,8 @@ static void prefork_child_pipe_handler(struct tevent_context *ev,
 	rc = talloc_get_type_abort(private_data, struct restart_context);
 	pid = tfork_child_pid(rc->t);
 	errno = 0;
+
+	irpc_cleanup(rc->lp_ctx, ev, pid);
 	status = tfork_status(&rc->t, false);
 	if (status == -1) {
 		DBG_ERR("Parent %d, Child %d terminated, "
@@ -483,12 +551,16 @@ static void prefork_accept_connection(
 		 private_data, process_context);
 }
 
-static void setup_handlers(struct tevent_context *ev, int from_parent_fd) {
+static void setup_handlers(
+	struct tevent_context *ev,
+	struct loadparm_context *lp_ctx,
+	int from_parent_fd)
+{
 	struct tevent_fd *fde = NULL;
 	struct tevent_signal *se = NULL;
 
 	fde = tevent_add_fd(ev, ev, from_parent_fd, TEVENT_FD_READ,
-		      prefork_pipe_handler, NULL);
+		      prefork_pipe_handler, lp_ctx);
 	if (fde == NULL) {
 		smb_panic("Failed to add fd handler after fork");
 	}
@@ -567,7 +639,7 @@ static void prefork_fork_worker(struct task_server *task,
 		tevent_fd_set_auto_close(fde);
 	} else {
 		close(control_pipe[1]);
-		setup_handlers(ev2, control_pipe[0]);
+		setup_handlers(ev2, lp_ctx, control_pipe[0]);
 		/*
 		 * tfork uses malloc
 		 */
@@ -581,6 +653,19 @@ static void prefork_fork_worker(struct task_server *task,
 		if (service_details->post_fork != NULL) {
 			service_details->post_fork(task, pd);
 		}
+		{
+			struct talloc_ctx *ctx = talloc_new(NULL);
+			char *name = NULL;
+			if (ctx == NULL) {
+				smb_panic("OOM allocating talloc context\n");
+			}
+			name = talloc_asprintf(ctx,
+					       "prefork-worker-%s-%d",
+					       service_name,
+					       pd->instances);
+			irpc_add_name(task->msg_ctx, name);
+			TALLOC_FREE(ctx);
+		}
 		tevent_loop_wait(ev2);
 		talloc_free(ev2);
 		exit(0);
@@ -621,6 +706,7 @@ static void prefork_terminate_task(struct tevent_context *ev,
 				   void *process_context)
 {
 	DBG_DEBUG("called with reason[%s]\n", reason);
+	TALLOC_FREE(ev);
 	if (fatal == true) {
 		exit(127);
 	} else {
-- 
2.17.1


From 8ebc53b9e083e5a9e520621dbe828d68281c3bc7 Mon Sep 17 00:00:00 2001
From: Gary Lockyer <gary at catalyst.net.nz>
Date: Tue, 11 Sep 2018 07:38:06 +1200
Subject: [PATCH 09/11] source4 dcerpc_server: remove irpc_add_name

Remove the irpc_add_name from dcesrv_sock_accept, as it results in two
identical names being registered for a process.

Signed-off-by: Gary Lockyer <gary at catalyst.net.nz>
---
 source4/rpc_server/dcerpc_server.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/source4/rpc_server/dcerpc_server.c b/source4/rpc_server/dcerpc_server.c
index b900623b627..7949c66323a 100644
--- a/source4/rpc_server/dcerpc_server.c
+++ b/source4/rpc_server/dcerpc_server.c
@@ -2723,8 +2723,6 @@ static void dcesrv_sock_accept(struct stream_connection *srv_conn)
 
 	srv_conn->private_data = dcesrv_conn;
 
-	irpc_add_name(srv_conn->msg_ctx, "rpc_server");
-
 	subreq = dcerpc_read_ncacn_packet_send(dcesrv_conn,
 					       dcesrv_conn->event_ctx,
 					       dcesrv_conn->stream);
-- 
2.17.1


From 376f90f3129bc2094243347ba132d2124c5f4209 Mon Sep 17 00:00:00 2001
From: Gary Lockyer <gary at catalyst.net.nz>
Date: Fri, 14 Sep 2018 09:38:56 +1200
Subject: [PATCH 10/11] samba-tool processes: display pre-fork masters and
 workers

Tag prefork work processes with "(worker 0)", and sort the process list
on server name to get a consistent order.

 Service:                          PID
 --------------------------------------
 cldap_server                     15588
 ...
 ldap_server                      15584
 ldap_server(worker 0)            15627
 ldap_server(worker 1)            15630
 ldap_server(worker 2)            15632
 ldap_server(worker 3)            15634
 nbt_server                       15576
 notify-daemon                    15638
 ...
 samba                                0
 ...
 wrepl_server                     15580

Signed-off-by: Gary Lockyer <gary at catalyst.net.nz>
---
 python/samba/netcmd/processes.py | 75 +++++++++++++++++++++++++++++---
 1 file changed, 69 insertions(+), 6 deletions(-)

diff --git a/python/samba/netcmd/processes.py b/python/samba/netcmd/processes.py
index d04a548abd7..0406b1859ca 100644
--- a/python/samba/netcmd/processes.py
+++ b/python/samba/netcmd/processes.py
@@ -49,6 +49,42 @@ class cmd_processes(Command):
 
     takes_args = []
 
+    #
+    # Get details of the samba services currently registered in irpc
+    # The prefork process model registers names in the form:
+    #     prefork-master-<service> and prefork-worker-<service>-<instance>
+    #
+    # To allow this routine to identify pre-fork master and worker process
+    #
+    # returns a tuple (filtered, masters, workers)
+    #
+    #  filtered - is a list of services with the prefork-* removed
+    #  masters  - dictionary keyed on service name of prefork master processes
+    #  workers  - dictionary keyed on service name containing an ordered list
+    #             of worker processes.
+    def get_service_data(self, msg_ctx):
+        services = msg_ctx.irpc_all_servers()
+        filtered = []
+        masters = {}
+        workers = {}
+        for service in services:
+            for id in service.ids:
+                if service.name.startswith("prefork-master"):
+                    ns = service.name.split("-")
+                    name = ns[2] + "_server"
+                    masters[name] = service.ids[0].pid
+                elif service.name.startswith("prefork-worker"):
+                    ns = service.name.split("-")
+                    name = ns[2] + "_server"
+                    instance = int(ns[3])
+                    pid = service.ids[0].pid
+                    if name not in workers:
+                        workers[name] = {}
+                    workers[name][instance] = (instance, pid)
+                else:
+                    filtered.append(service)
+        return (filtered, masters, workers)
+
     def run(self, sambaopts, versionopts, section_name=None,
             name=None, pid=None):
 
@@ -72,9 +108,36 @@ class cmd_processes(Command):
                     if server_id.pid == int(pid):
                         self.outf.write("%s\n" % name.name)
         else:
-            names = msg_ctx.irpc_all_servers()
-            self.outf.write(" Service:                PID \n")
-            self.outf.write("-----------------------------\n")
-            for name in names:
-                for server_id in name.ids:
-                    self.outf.write("%-16s      %6d\n" % (name.name, server_id.pid))
+            seen = {}     # Service entries already printed, service names can
+            #               be registered multiple times against a process
+            #               but we should only display them once.
+            prefork = {}  # Services running in the prefork process model
+            #               want to ensure that the master process and workers
+            #               are grouped to together.
+            (services, masters, workers) = self.get_service_data(msg_ctx)
+            self.outf.write(" Service:                          PID\n")
+            self.outf.write("--------------------------------------\n")
+
+            for service in sorted(services, key=lambda x: x.name):
+                if service.name in masters:
+                    # If this service is running in a pre-forked process we
+                    # want to print the master process followed by all the
+                    # worker processes
+                    pid = masters[service.name]
+                    if pid not in prefork:
+                        prefork[pid] = True
+                        self.outf.write("%-26s      %6d\n" %
+                            (service.name, pid))
+                        if service.name in workers:
+                            ws = workers[service.name]
+                            for w in ws:
+                                (instance, pid) = ws[w]
+                                sn = "{0}(worker {1})".format(
+                                    service.name, instance)
+                                self.outf.write("%-26s      %6d\n" % (sn, pid))
+                else:
+                    for server_id in service.ids:
+                        if (service.name, server_id.pid) not in seen:
+                            self.outf.write("%-26s      %6d\n"
+                                % (service.name, server_id.pid))
+                            seen[(service.name, server_id.pid)] = True
-- 
2.17.1


From 2284563b416e083cab935dc043343ccf04c1c25c Mon Sep 17 00:00:00 2001
From: Gary Lockyer <gary at catalyst.net.nz>
Date: Wed, 19 Sep 2018 15:25:02 +1200
Subject: [PATCH 11/11] WHATSNEW: prefork restart

Signed-off-by: Gary Lockyer <gary at catalyst.net.nz>
---
 WHATSNEW.txt | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/WHATSNEW.txt b/WHATSNEW.txt
index 98ada34e5c6..b6243c0beb0 100644
--- a/WHATSNEW.txt
+++ b/WHATSNEW.txt
@@ -45,6 +45,7 @@ netlogon prefork
 DCERPC now supports pre-forked NETLOGON processes. The netlogon processes are
 pre-forked when the prefork process model is selected for samba.
 
+<<<<<<< HEAD
 Offline domain backups
 ----------------------
 
@@ -63,6 +64,18 @@ information about how the users are spread across groups in your domain.
 The 'samba-tool group list --verbose' command has also been updated to include
 the number of users in each group.
 
+prefork process restart
+-----------------------
+
+The pre-fork process model now restarts failed processes. The delay between
+restart attempts is controlled by the "prefork backoff increment" (default = 10)
+and "prefork maximum backoff" (default = 120) smbd.conf parameters.  A linear
+back off strategy is used with "prefork backoff increment" added to the
+delay between restart attempts up until it reaches "prefork maximum backoff".
+
+Using the default sequence the restart delays (in seconds) are:
+  0, 10, 20, ..., 120, 120, ...
+
 REMOVED FEATURES
 ================
 
@@ -75,8 +88,12 @@ The samba_backup script has been removed. This has now been replaced by the
 smb.conf changes
 ================
 
-  Parameter Name                     Description             Default
-  --------------                     -----------             -------
+  Parameter Name                     Description                Default
+  --------------                     -----------                -------
+  prefork backoff increment   Delay added to process restart    10 (seconds)
+                              between attempts.
+  prefork maximum backoff     Maximum delay for process between 120 (seconds)
+                              process restart attempts
 
 
 KNOWN ISSUES
-- 
2.17.1

-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 488 bytes
Desc: OpenPGP digital signature
URL: <http://lists.samba.org/pipermail/samba-technical/attachments/20181120/b9abf432/signature-0001.sig>


More information about the samba-technical mailing list