[PATCH] Improvements to traffic_replay user generation

Wed Oct 17 19:49:23 UTC 2018

These patches make some improvements to the traffic_replay tool, when
using the --generate-users-only to populate a large DB (e.g. 10,000+ users).

The changes allow the tool to generate the users a lot faster, and tries
to improve how users were randomly being assigned to groups.

CI pass: https://gitlab.com/catalyst-samba/samba/pipelines/33218214

Review appreciated. Thanks.

-------------- next part --------------
From c08770436ba78ce895584120b25c9ac4bac61775 Mon Sep 17 00:00:00 2001
From: Tim Beale <timbeale at catalyst.net.nz>
Date: Thu, 11 Oct 2018 14:47:28 +1300
Subject: [PATCH 01/11] traffic_replay: Generate users faster by writing to
 local DB

We can create user accounts much faster if the LDB connection talks
directly to the local sam.ldb file rather than going via LDAP. This
patch allows the 'host' argument to the tool to be a .ldb file (e.g.
"/usr/local/samba/private/sam.ldb") instead of a server name/IP.

In most cases, the traffic_replay tool wants to run on a remote device
(because the point of it is to send traffic to the DC). However, the
--generate-users-only is one case where the tool can be run locally,
directly on the test DC. (The traffic_replay user generation is handy
for standalone testing, because it also handles assigning group
memberships to the generated user accounts).

Note that you also need to use '--option="ldb:nosync = true"' to get
the improvement in performance.

Signed-off-by: Tim Beale <timbeale at catalyst.net.nz>
---
 script/traffic_replay | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/script/traffic_replay b/script/traffic_replay
index 6578c84..3f4ff9b 100755
--- a/script/traffic_replay
+++ b/script/traffic_replay
@@ -30,6 +30,8 @@ from samba import gensec, get_debug_level
 from samba.emulate import traffic
 import samba.getopt as options
 from samba.logger import get_samba_logger
+from samba.samdb import SamDB
+from samba.auth import system_session
 
 
 def print_err(*args, **kwargs):
@@ -306,8 +308,16 @@ def main():
                          opts.number_of_groups)))
         sys.exit(1)
 
+    # Get an LDB connection.
     try:
-        ldb = traffic.openLdb(host, creds, lp)
+        # if we're only adding users, then it's OK to pass a sam.ldb filepath
+        # as the host, which creates the users much faster. In all other cases
+        # we should be connecting to a remote DC
+        if opts.generate_users_only and os.path.isfile(host):
+            ldb = SamDB(url="ldb://{0}".format(host),
+                        session_info=system_session(), lp=lp)
+        else:
+            ldb = traffic.openLdb(host, creds, lp)
     except:
         logger.error(("\nInitial LDAP connection failed! Did you supply "
                       "a DNS host name and the correct credentials?"))
-- 
2.7.4


From 804200a95ba3df677a42383e25ddf9bdb19062d0 Mon Sep 17 00:00:00 2001
From: Tim Beale <timbeale at catalyst.net.nz>
Date: Wed, 17 Oct 2018 11:26:28 +1300
Subject: [PATCH 02/11] traffic_replay: Change print() to use logger()

This reduces noise, so the messages only come out if you specify
--debug.

Signed-off-by: Tim Beale <timbeale at catalyst.net.nz>
---
 python/samba/emulate/traffic.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/samba/emulate/traffic.py b/python/samba/emulate/traffic.py
index 35100ca..677ad82 100644
--- a/python/samba/emulate/traffic.py
+++ b/python/samba/emulate/traffic.py
@@ -1892,7 +1892,7 @@ def add_users_to_groups(db, instance_id, assignments):
         db.modify(m)
         end = time.time()
         duration = end - start
-        print("%f\t0\tadd\tuser\t%f\tTrue\t" % (end, duration))
+        LOGGER.info("%f\t0\tadd\tuser\t%f\tTrue\t" % (end, duration))
 
 
 def generate_stats(statsdir, timing_file):
-- 
2.7.4


From a5710bf9571ed1816a5bd2712c597a3c4815a3d2 Mon Sep 17 00:00:00 2001
From: Tim Beale <timbeale at catalyst.net.nz>
Date: Wed, 17 Oct 2018 11:56:58 +1300
Subject: [PATCH 03/11] logger: Add custom log level for important 'info'-esque
 logs

Currently info logs are only displayed if --debug or --verbose is
specified. We have a certain category of logs that are not
warnings/errors/critical messages, but we still want the user to see by
default. This adds a custom 'STATUS' level for such status updates.

The new level can be used: logger.log(LOG_STATUS, "Blah...")

Signed-off-by: Tim Beale <timbeale at catalyst.net.nz>
---
 python/samba/logger.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/python/samba/logger.py b/python/samba/logger.py
index a3db3be..c8d3a14 100644
--- a/python/samba/logger.py
+++ b/python/samba/logger.py
@@ -18,7 +18,11 @@
 
 import sys
 import logging
-from samba.colour import GREY, YELLOW, GREEN, RED, DARK_RED, C_NORMAL
+from samba.colour import GREY, YELLOW, GREEN, RED, DARK_RED, C_NORMAL, BLUE
+
+# custom level which is not a "warning", but has a level higher than WARNING
+# to ensure it gets displayed by default
+LOG_STATUS = logging.WARNING + 1
 
 LEVEL_COLORS = {
     logging.CRITICAL: DARK_RED,
@@ -26,6 +30,7 @@ LEVEL_COLORS = {
     logging.WARNING: YELLOW,
     logging.INFO: GREEN,
     logging.DEBUG: GREY,
+    LOG_STATUS: BLUE,
 }
 
 
@@ -55,6 +60,7 @@ def get_samba_logger(
                  (quiet and logging.WARNING) or logging.INFO)
 
     logger.setLevel(level)
+    logging.addLevelName(LOG_STATUS, "STATUS")
 
     Formatter = use_color and ColoredFormatter or logging.Formatter
     formatter = Formatter(fmt=fmt, datefmt=datefmt)
-- 
2.7.4


From bc21672ecfbcbbf81b592a89f136ad475814bada Mon Sep 17 00:00:00 2001
From: Tim Beale <timbeale at catalyst.net.nz>
Date: Mon, 15 Oct 2018 11:23:47 +1300
Subject: [PATCH 04/11] traffic_replay: Change user generation debug to use
 logger

Currently the messages that display the user generation's progresss lack
any timestamp information, so it's hard to tell how long things are
taking. Change it to user logger, which will include a timestamp.

Note that we want these logs to always be displayed, however
LOGGER.info() will only be displayed if the --debug option is set. These
aren't really warning/critical messages, but we can use a higher log
level so that they get displayed by default.

Signed-off-by: Tim Beale <timbeale at catalyst.net.nz>
---
 python/samba/emulate/traffic.py | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/python/samba/emulate/traffic.py b/python/samba/emulate/traffic.py
index 677ad82..df0a6fe 100644
--- a/python/samba/emulate/traffic.py
+++ b/python/samba/emulate/traffic.py
@@ -51,7 +51,7 @@ from samba.dcerpc.misc import SEC_CHAN_BDC
 from samba import gensec
 from samba import sd_utils
 from samba.compat import get_string
-from samba.logger import get_samba_logger
+from samba.logger import get_samba_logger, LOG_STATUS
 
 SLEEP_OVERHEAD = 3e-4
 
@@ -1784,31 +1784,30 @@ def generate_users_and_groups(ldb, instance_id, password,
 
     create_ou(ldb, instance_id)
 
-    print("Generating dummy user accounts", file=sys.stderr)
+    LOGGER.log(LOG_STATUS, "Generating dummy user accounts")
     users_added = generate_users(ldb, instance_id, number_of_users, password)
 
     if number_of_groups > 0:
-        print("Generating dummy groups", file=sys.stderr)
+        LOGGER.log(LOG_STATUS, "Generating dummy groups")
         groups_added = generate_groups(ldb, instance_id, number_of_groups)
 
     if group_memberships > 0:
-        print("Assigning users to groups", file=sys.stderr)
+        LOGGER.log(LOG_STATUS, "Assigning users to groups")
         assignments = assign_groups(number_of_groups,
                                     groups_added,
                                     number_of_users,
                                     users_added,
                                     group_memberships)
-        print("Adding users to groups", file=sys.stderr)
+        LOGGER.log(LOG_STATUS, "Adding users to groups")
         add_users_to_groups(ldb, instance_id, assignments)
 
     if (groups_added > 0 and users_added == 0 and
        number_of_groups != groups_added):
-        print("Warning: the added groups will contain no members",
-              file=sys.stderr)
+        LOGGER.warning("Warning: the added groups will contain no members")
 
-    print(("Added %d users, %d groups and %d group memberships" %
-           (users_added, groups_added, len(assignments))),
-          file=sys.stderr)
+    LOGGER.log(LOG_STATUS,
+               "Added %d users, %d groups and %d group memberships" %
+               (users_added, groups_added, len(assignments)))
 
 
 def assign_groups(number_of_groups,
-- 
2.7.4


From 4136996ee6452b53cf01389a0e040fd490c264a8 Mon Sep 17 00:00:00 2001
From: Tim Beale <timbeale at catalyst.net.nz>
Date: Wed, 17 Oct 2018 12:25:49 +1300
Subject: [PATCH 05/11] traffic_replay: Improve debug when populating large DB

The current debug is not that helpful - we get one message per user, but
it doesn't tell us how far through adding 10,000 users we are.

Instead, just log every x many operations. Adding users is the slowest,
whereas creating groups and adding memberships are both a lot faster, so
vary the ratio of operation we log, so that we spit out a log every few
seconds.

Logger already includes timestamps, so we can remove that timestamp
information.

Signed-off-by: Tim Beale <timbeale at catalyst.net.nz>
---
 python/samba/emulate/traffic.py | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/python/samba/emulate/traffic.py b/python/samba/emulate/traffic.py
index df0a6fe..13b3a20 100644
--- a/python/samba/emulate/traffic.py
+++ b/python/samba/emulate/traffic.py
@@ -1630,6 +1630,8 @@ def generate_traffic_accounts(ldb, instance_id, number, password):
             netbios_name = "STGM-%d-%d" % (instance_id, i)
             create_machine_account(ldb, instance_id, netbios_name, password)
             added += 1
+            if added % 5 == 0:
+                LOGGER.info("Created %u/%u machine accounts" % (added, number))
         except LdbError as e:
             (status, _) = e.args
             if status == 68:
@@ -1646,6 +1648,9 @@ def generate_traffic_accounts(ldb, instance_id, number, password):
             username = "STGU-%d-%d" % (instance_id, i)
             create_user_account(ldb, instance_id, username, password)
             added += 1
+            if added % 5 == 0:
+                LOGGER.info("Created %u/%u users" % (added, number))
+
         except LdbError as e:
             (status, _) = e.args
             if status == 68:
@@ -1683,7 +1688,6 @@ def create_user_account(ldb, instance_id, username, userpass):
     ou = ou_name(ldb, instance_id)
     user_dn = "cn=%s,%s" % (username, ou)
     utf16pw = ('"%s"' % get_string(userpass)).encode('utf-16-le')
-    start = time.time()
     ldb.add({
         "dn": user_dn,
         "objectclass": "user",
@@ -1696,25 +1700,17 @@ def create_user_account(ldb, instance_id, username, userpass):
     sdutils = sd_utils.SDUtils(ldb)
     sdutils.dacl_add_ace(user_dn, "(A;;WP;;;PS)")
 
-    end = time.time()
-    duration = end - start
-    LOGGER.info("%f\t0\tcreate\tuser\t%f\tTrue\t" % (end, duration))
-
 
 def create_group(ldb, instance_id, name):
     """Create a group via ldap."""
 
     ou = ou_name(ldb, instance_id)
     dn = "cn=%s,%s" % (name, ou)
-    start = time.time()
     ldb.add({
         "dn": dn,
         "objectclass": "group",
         "sAMAccountName": name,
     })
-    end = time.time()
-    duration = end - start
-    LOGGER.info("%f\t0\tcreate\tgroup\t%f\tTrue\t" % (end, duration))
 
 
 def user_name(instance_id, i):
@@ -1740,6 +1736,8 @@ def generate_users(ldb, instance_id, number, password):
         if name not in existing_objects:
             create_user_account(ldb, instance_id, name, password)
             users += 1
+            if users % 5 == 0:
+                LOGGER.info("Created %u/%u users" % (users, number))
 
     return users
 
@@ -1758,6 +1756,8 @@ def generate_groups(ldb, instance_id, number):
         if name not in existing_objects:
             create_group(ldb, instance_id, name)
             groups += 1
+            if groups % 100 == 0:
+                LOGGER.info("Created %u/%u groups" % (groups, number))
 
     return groups
 
@@ -1876,6 +1876,8 @@ def add_users_to_groups(db, instance_id, assignments):
     assign the users to their specified groups."""
 
     ou = ou_name(db, instance_id)
+    count = 0
+    total = len(assignments)
 
     def build_dn(name):
         return("cn=%s,%s" % (name, ou))
@@ -1887,11 +1889,10 @@ def add_users_to_groups(db, instance_id, assignments):
         m = ldb.Message()
         m.dn = ldb.Dn(db, group_dn)
         m["member"] = ldb.MessageElement(user_dn, ldb.FLAG_MOD_ADD, "member")
-        start = time.time()
         db.modify(m)
-        end = time.time()
-        duration = end - start
-        LOGGER.info("%f\t0\tadd\tuser\t%f\tTrue\t" % (end, duration))
+        count += 1
+        if count % 1000 == 0:
+            LOGGER.info("Added %u/%u memberships" % (count, total))
 
 
 def generate_stats(statsdir, timing_file):
-- 
2.7.4


From 8f58020b73ab6d84b9c5e15c07f497d76492884d Mon Sep 17 00:00:00 2001
From: Tim Beale <timbeale at catalyst.net.nz>
Date: Wed, 17 Oct 2018 12:44:37 +1300
Subject: [PATCH 06/11] traffic_replay: Add helper class for group assignments

Wrap up the group assignment calculations in a helper class. We're going
to tweak the internals a bit in subsequent patches, but the rest of the
code doesn't really need to know about these changes.

Signed-off-by: Tim Beale <timbeale at catalyst.net.nz>
---
 python/samba/emulate/traffic.py | 128 ++++++++++++++++++++++------------------
 1 file changed, 69 insertions(+), 59 deletions(-)

diff --git a/python/samba/emulate/traffic.py b/python/samba/emulate/traffic.py
index 13b3a20..ab53c25 100644
--- a/python/samba/emulate/traffic.py
+++ b/python/samba/emulate/traffic.py
@@ -1779,7 +1779,7 @@ def generate_users_and_groups(ldb, instance_id, password,
                               group_memberships):
     """Generate the required users and groups, allocating the users to
        those groups."""
-    assignments = []
+    memberships_added = 0
     groups_added  = 0
 
     create_ou(ldb, instance_id)
@@ -1793,13 +1793,14 @@ def generate_users_and_groups(ldb, instance_id, password,
 
     if group_memberships > 0:
         LOGGER.log(LOG_STATUS, "Assigning users to groups")
-        assignments = assign_groups(number_of_groups,
-                                    groups_added,
-                                    number_of_users,
-                                    users_added,
-                                    group_memberships)
+        assignments = GroupAssignments(number_of_groups,
+                                       groups_added,
+                                       number_of_users,
+                                       users_added,
+                                       group_memberships)
         LOGGER.log(LOG_STATUS, "Adding users to groups")
         add_users_to_groups(ldb, instance_id, assignments)
+        memberships_added = assignments.total()
 
     if (groups_added > 0 and users_added == 0 and
        number_of_groups != groups_added):
@@ -1807,66 +1808,75 @@ def generate_users_and_groups(ldb, instance_id, password,
 
     LOGGER.log(LOG_STATUS,
                "Added %d users, %d groups and %d group memberships" %
-               (users_added, groups_added, len(assignments)))
+               (users_added, groups_added, memberships_added))
 
 
-def assign_groups(number_of_groups,
-                  groups_added,
-                  number_of_users,
-                  users_added,
-                  group_memberships):
-    """Allocate users to groups.
+class GroupAssignments(object):
+    def __init__(self, number_of_groups, groups_added, number_of_users,
+                 users_added, group_memberships):
+        self.assignments = self.assign_groups(number_of_groups,
+                                              groups_added,
+                                              number_of_users,
+                                              users_added,
+                                              group_memberships)
 
-    The intention is to have a few users that belong to most groups, while
-    the majority of users belong to a few groups.
+    def assign_groups(self, number_of_groups, groups_added, number_of_users,
+                      users_added, group_memberships):
+        """Allocate users to groups.
 
-    A few groups will contain most users, with the remaining only having a
-    few users.
-    """
+        The intention is to have a few users that belong to most groups, while
+        the majority of users belong to a few groups.
 
-    def generate_user_distribution(n):
-        """Probability distribution of a user belonging to a group.
+        A few groups will contain most users, with the remaining only having a
+        few users.
         """
-        dist = []
-        for x in range(1, n + 1):
-            p = 1 / (x + 0.001)
-            dist.append(p)
-        return dist
-
-    def generate_group_distribution(n):
-        """Probability distribution of a group containing a user."""
-        dist = []
-        for x in range(1, n + 1):
-            p = 1 / (x**1.3)
-            dist.append(p)
-        return dist
-
-    assignments = set()
-    if group_memberships <= 0:
-        return assignments
-
-    group_dist = generate_group_distribution(number_of_groups)
-    user_dist  = generate_user_distribution(number_of_users)
 
-    # Calculate the number of group menberships required
-    group_memberships = math.ceil(
-        float(group_memberships) *
-        (float(users_added) / float(number_of_users)))
-
-    existing_users  = number_of_users  - users_added  - 1
-    existing_groups = number_of_groups - groups_added - 1
-    while len(assignments) < group_memberships:
-        user        = random.randint(0, number_of_users - 1)
-        group       = random.randint(0, number_of_groups - 1)
-        probability = group_dist[group] * user_dist[user]
+        def generate_user_distribution(n):
+            """Probability distribution of a user belonging to a group.
+            """
+            dist = []
+            for x in range(1, n + 1):
+                p = 1 / (x + 0.001)
+                dist.append(p)
+            return dist
+
+        def generate_group_distribution(n):
+            """Probability distribution of a group containing a user."""
+            dist = []
+            for x in range(1, n + 1):
+                p = 1 / (x**1.3)
+                dist.append(p)
+            return dist
+
+        assignments = set()
+        if group_memberships <= 0:
+            return assignments
+
+        group_dist = generate_group_distribution(number_of_groups)
+        user_dist  = generate_user_distribution(number_of_users)
+
+        # Calculate the number of group menberships required
+        group_memberships = math.ceil(
+            float(group_memberships) *
+            (float(users_added) / float(number_of_users)))
+
+        existing_users  = number_of_users  - users_added  - 1
+        existing_groups = number_of_groups - groups_added - 1
+        while len(assignments) < group_memberships:
+            user        = random.randint(0, number_of_users - 1)
+            group       = random.randint(0, number_of_groups - 1)
+            probability = group_dist[group] * user_dist[user]
+
+            if ((random.random() < probability * 10000) and
+               (group > existing_groups or user > existing_users)):
+                # the + 1 converts the array index to the corresponding
+                # group or user number
+                assignments.add(((user + 1), (group + 1)))
 
-        if ((random.random() < probability * 10000) and
-           (group > existing_groups or user > existing_users)):
-            # the + 1 converts the array index to the corresponding
-            # group or user number
-            assignments.add(((user + 1), (group + 1)))
+        return assignments
 
-    return assignments
+    def total(self):
+        return len(self.assignments)
 
 
 def add_users_to_groups(db, instance_id, assignments):
@@ -1877,12 +1887,12 @@ def add_users_to_groups(db, instance_id, assignments):
 
     ou = ou_name(db, instance_id)
     count = 0
-    total = len(assignments)
+    total = assignments.total()
 
     def build_dn(name):
         return("cn=%s,%s" % (name, ou))
 
-    for (user, group) in assignments:
+    for (user, group) in assignments.assignments:
         user_dn  = build_dn(user_name(instance_id, user))
         group_dn = build_dn(group_name(instance_id, group))
 
-- 
2.7.4


From 5dfa148fa19ae4dbe3d4afa7162b12c7904afc42 Mon Sep 17 00:00:00 2001
From: Tim Beale <timbeale at catalyst.net.nz>
Date: Wed, 17 Oct 2018 12:54:03 +1300
Subject: [PATCH 07/11] traffic_replay: Split out random group membership 
 generation logic

This doesn't change functionality at all. It just moves the probability
calculations out into separate functions.

We want to tweak the logic/implementation behind this code, but the
rest of assign_groups() doesn't really care how the underlying
probabilities are worked out, so long as it gets a suitably random
user/group membership each time round the loop.

Signed-off-by: Tim Beale <timbeale at catalyst.net.nz>
---
 python/samba/emulate/traffic.py | 63 +++++++++++++++++++++++------------------
 1 file changed, 36 insertions(+), 27 deletions(-)

diff --git a/python/samba/emulate/traffic.py b/python/samba/emulate/traffic.py
index ab53c25..8933732 100644
--- a/python/samba/emulate/traffic.py
+++ b/python/samba/emulate/traffic.py
@@ -1814,14 +1814,46 @@ def generate_users_and_groups(ldb, instance_id, password,
 class GroupAssignments(object):
     def __init__(self, number_of_groups, groups_added, number_of_users,
                  users_added, group_memberships):
+
+        self.generate_group_distribution(number_of_groups)
+        self.generate_user_distribution(number_of_users)
         self.assignments = self.assign_groups(number_of_groups,
                                               groups_added,
                                               number_of_users,
                                               users_added,
                                               group_memberships)
 
-    def assign_groups(self, number_of_groups, groups_added, number_of_users,
-                      users_added, group_memberships):
+    def generate_user_distribution(self, n):
+        """Probability distribution of a user belonging to a group.
+        """
+        self.user_dist = []
+        for x in range(1, n + 1):
+            p = 1 / (x + 0.001)
+            self.user_dist.append(p)
+
+        self.num_users = n
+
+    def generate_group_distribution(self, n):
+        """Probability distribution of a group containing a user."""
+        self.group_dist = []
+        for x in range(1, n + 1):
+            p = 1 / (x**1.3)
+            self.group_dist.append(p)
+
+        self.num_groups = n
+
+    def generate_random_membership(self):
+        """Returns a randomly generated user-group membership"""
+        while True:
+            user        = random.randint(0, self.num_users - 1)
+            group       = random.randint(0, self.num_groups - 1)
+            probability = self.group_dist[group] * self.user_dist[user]
+
+            if random.random() < probability * 10000:
+                return user, group
+
+    def assign_groups(self, number_of_groups, groups_added,
+                      number_of_users, users_added, group_memberships):
         """Allocate users to groups.
 
         The intention is to have a few users that belong to most groups, while
@@ -1831,30 +1863,10 @@ class GroupAssignments(object):
         few users.
         """
 
-        def generate_user_distribution(n):
-            """Probability distribution of a user belonging to a group.
-            """
-            dist = []
-            for x in range(1, n + 1):
-                p = 1 / (x + 0.001)
-                dist.append(p)
-            return dist
-
-        def generate_group_distribution(n):
-            """Probability distribution of a group containing a user."""
-            dist = []
-            for x in range(1, n + 1):
-                p = 1 / (x**1.3)
-                dist.append(p)
-            return dist
-
         assignments = set()
         if group_memberships <= 0:
             return assignments
 
-        group_dist = generate_group_distribution(number_of_groups)
-        user_dist  = generate_user_distribution(number_of_users)
-
         # Calculate the number of group menberships required
         group_memberships = math.ceil(
             float(group_memberships) *
@@ -1863,12 +1875,9 @@ class GroupAssignments(object):
         existing_users  = number_of_users  - users_added  - 1
         existing_groups = number_of_groups - groups_added - 1
         while len(assignments) < group_memberships:
-            user        = random.randint(0, number_of_users - 1)
-            group       = random.randint(0, number_of_groups - 1)
-            probability = group_dist[group] * user_dist[user]
+            user, group = self.generate_random_membership()
 
-            if ((random.random() < probability * 10000) and
-               (group > existing_groups or user > existing_users)):
+            if group > existing_groups or user > existing_users:
                 # the + 1 converts the array index to the corresponding
                 # group or user number
                 assignments.add(((user + 1), (group + 1)))
-- 
2.7.4


From 35b742a7ffe9c21b46b47444e1c4df5e03f6dafe Mon Sep 17 00:00:00 2001
From: Tim Beale <timbeale at catalyst.net.nz>
Date: Mon, 15 Oct 2018 16:24:00 +1300
Subject: [PATCH 08/11] traffic_replay: Improve assign_groups() performance
 with large domains

When assigning 10,000 users to 15 groups each (on average),
assign_groups() would take over 30 seconds. This did not include any DB
operations whatsoever. This patch improves things, so that it takes less
than a second in the same situation.

The problem was the code was looping ~23 million times where the
'random.random() < probability * 10000' condition was not met. The
problem is individual group/user probabilities get lower as the number
of groups/users increases. And so with large numbers of users, most of
the time the calculated probability was very small and didn't meet the
threshold.

This patch changes it so we can select a user/group in one go, avoiding
the need to loop multiple times.

Basically we distribute the users (or groups) between 0.0 and 1.0, so
that each user has their own 'slice', and this slice is proporational to
their weighted probability. random.random() generates a value between
0.0 and 1.0, so we can use this to pick a 'slice' (or rather, we use
this as an index into the list, using .bisect()). Users/groups with
larger probabilities end up with larger slices, so are more likely to
get picked.

The end result is roughly the same distribution as before, although the
first 10 or so user/groups seem to get picked more frequently, so the
weighted-probability calculations may need tweaking some more.

Signed-off-by: Tim Beale <timbeale at catalyst.net.nz>
---
 python/samba/emulate/traffic.py | 47 ++++++++++++++++++++++++++++++-----------
 1 file changed, 35 insertions(+), 12 deletions(-)

diff --git a/python/samba/emulate/traffic.py b/python/samba/emulate/traffic.py
index 8933732..2c5232e 100644
--- a/python/samba/emulate/traffic.py
+++ b/python/samba/emulate/traffic.py
@@ -52,6 +52,7 @@ from samba import gensec
 from samba import sd_utils
 from samba.compat import get_string
 from samba.logger import get_samba_logger, LOG_STATUS
+import bisect
 
 SLEEP_OVERHEAD = 3e-4
 
@@ -1823,34 +1824,56 @@ class GroupAssignments(object):
                                               users_added,
                                               group_memberships)
 
+    def cumulative_distribution(self, weights):
+        # make sure the probabilities conform to a cumulative distribution
+        # spread between 0.0 and 1.0. Dividing by the weighted total gives each
+        # probability a proportional share of 1.0. Higher probabilities get a
+        # bigger share, so are more likely to be picked. We use the cumulative
+        # value, so we can use random.random() as a simple index into the list
+        dist = []
+        total = sum(weights)
+        cumulative = 0.0
+        for probability in weights:
+            cumulative += probability
+            dist.append(cumulative / total)
+        return dist
+
     def generate_user_distribution(self, n):
         """Probability distribution of a user belonging to a group.
         """
-        self.user_dist = []
+        # Assign a weighted probability to each user. Probability decreases
+        # as the user-ID increases
+        weights = []
         for x in range(1, n + 1):
             p = 1 / (x + 0.001)
-            self.user_dist.append(p)
+            weights.append(p)
 
-        self.num_users = n
+        # convert the weights to a cumulative distribution between 0.0 and 1.0
+        self.user_dist = self.cumulative_distribution(weights)
 
     def generate_group_distribution(self, n):
         """Probability distribution of a group containing a user."""
-        self.group_dist = []
+
+        # Assign a weighted probability to each user. Probability decreases
+        # as the group-ID increases
+        weights = []
         for x in range(1, n + 1):
             p = 1 / (x**1.3)
-            self.group_dist.append(p)
+            weights.append(p)
 
-        self.num_groups = n
+        # convert the weights to a cumulative distribution between 0.0 and 1.0
+        self.group_dist = self.cumulative_distribution(weights)
 
     def generate_random_membership(self):
         """Returns a randomly generated user-group membership"""
-        while True:
-            user        = random.randint(0, self.num_users - 1)
-            group       = random.randint(0, self.num_groups - 1)
-            probability = self.group_dist[group] * self.user_dist[user]
 
-            if random.random() < probability * 10000:
-                return user, group
+        # the list items are cumulative distribution values between 0.0 and
+        # 1.0, which makes random() a handy way to index the list to get a
+        # weighted random user/group
+        user = bisect.bisect(self.user_dist, random.random())
+        group = bisect.bisect(self.group_dist, random.random())
+
+        return user, group
 
     def assign_groups(self, number_of_groups, groups_added,
                       number_of_users, users_added, group_memberships):
-- 
2.7.4


From 0276a140e78a89a41f89e789b9245deed49feae4 Mon Sep 17 00:00:00 2001
From: Tim Beale <timbeale at catalyst.net.nz>
Date: Tue, 16 Oct 2018 10:57:29 +1300
Subject: [PATCH 09/11] traffic_replay: Change user distribution to use Pareto
 Distribution

The current probability we were assigning to users roughly approximates
the Pareto Distribution (with shape=1.0). This means the code now uses a
documented algorithm (i.e. explanation on Wikipedia). It also allows us
to vary the distribution by changing the shape parameter.

Signed-off-by: Tim Beale <timbeale at catalyst.net.nz>
---
 python/samba/emulate/traffic.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/python/samba/emulate/traffic.py b/python/samba/emulate/traffic.py
index 2c5232e..32b77c0 100644
--- a/python/samba/emulate/traffic.py
+++ b/python/samba/emulate/traffic.py
@@ -1841,11 +1841,12 @@ class GroupAssignments(object):
     def generate_user_distribution(self, n):
         """Probability distribution of a user belonging to a group.
         """
-        # Assign a weighted probability to each user. Probability decreases
-        # as the user-ID increases
+        # Assign a weighted probability to each user. Use the Pareto
+        # Distribution so that some users are in a lot of groups, and the
+        # bulk of users are in only a few groups
         weights = []
         for x in range(1, n + 1):
-            p = 1 / (x + 0.001)
+            p = random.paretovariate(1.0)
             weights.append(p)
 
         # convert the weights to a cumulative distribution between 0.0 and 1.0
-- 
2.7.4


From b83a163dbd536b3ec7416fd9bc790195e6d4de1d Mon Sep 17 00:00:00 2001
From: Tim Beale <timbeale at catalyst.net.nz>
Date: Tue, 16 Oct 2018 16:01:25 +1300
Subject: [PATCH 10/11] traffic_replay: Prevent users having 1000+ memberOf
 links

When adding 10,000 users, one user would end up in over 1000 groups.
With 100,000 users, it would be more like 10,000 groups. While it makes
sense to have groups with large numbers of users, having a single user
in 1000s of groups is probably less realistic.

This patch changes the shape of the Pareto distribution that we use to
assign users to groups. The aim is to cap users at belonging to at most
~500 groups. Increasing the shape of the Pareto distribution pushes the
user assignments so they're closer to the average, and the tail (with
users in lots of groups) is not so large).

Signed-off-by: Tim Beale <timbeale at catalyst.net.nz>
---
 python/samba/emulate/traffic.py | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/python/samba/emulate/traffic.py b/python/samba/emulate/traffic.py
index 32b77c0..eb09dc8 100644
--- a/python/samba/emulate/traffic.py
+++ b/python/samba/emulate/traffic.py
@@ -1817,7 +1817,7 @@ class GroupAssignments(object):
                  users_added, group_memberships):
 
         self.generate_group_distribution(number_of_groups)
-        self.generate_user_distribution(number_of_users)
+        self.generate_user_distribution(number_of_users, group_memberships)
         self.assignments = self.assign_groups(number_of_groups,
                                               groups_added,
                                               number_of_users,
@@ -1838,15 +1838,27 @@ class GroupAssignments(object):
             dist.append(cumulative / total)
         return dist
 
-    def generate_user_distribution(self, n):
+    def generate_user_distribution(self, num_users, num_memberships):
         """Probability distribution of a user belonging to a group.
         """
         # Assign a weighted probability to each user. Use the Pareto
         # Distribution so that some users are in a lot of groups, and the
-        # bulk of users are in only a few groups
+        # bulk of users are in only a few groups. If we're assigning a large
+        # number of group memberships, use a higher shape. This means slightly
+        # fewer outlying users that are in large numbers of groups. The aim is
+        # to have no users belonging to more than ~500 groups.
+        if num_memberships > 5000000:
+            shape = 3.0
+        elif num_memberships > 2000000:
+            shape = 2.5
+        elif num_memberships > 300000:
+            shape = 2.25
+        else:
+            shape = 1.75
+
         weights = []
-        for x in range(1, n + 1):
-            p = random.paretovariate(1.0)
+        for x in range(1, num_users + 1):
+            p = random.paretovariate(shape)
             weights.append(p)
 
         # convert the weights to a cumulative distribution between 0.0 and 1.0
-- 
2.7.4


From d42c0cdf9638e5c8fd8b5c9a9fc5512785cef803 Mon Sep 17 00:00:00 2001
From: Tim Beale <timbeale at catalyst.net.nz>
Date: Wed, 17 Oct 2018 13:22:59 +1300
Subject: [PATCH 11/11] traffic_replay: Write group memberships once per group

Each user-group membership was being written to the DB in a single
operation. With large numbers of users (e.g. 10,000 in average 15 groups
each), this becomes a lot of operations (e.g. 150,000). This patch
reworks the code so that we write *all* the memberships for a group in
one operation. E.g. instead of 150,000 DB operations, we might make
1,500. This makes writing the group memberships several times
faster.

To do this, we reorganize the assignments so instead of being a set of
tuples, it's a dictionary where key=group and
value=list-of-users-in-group.

Signed-off-by: Tim Beale <timbeale at catalyst.net.nz>
---
 python/samba/emulate/traffic.py | 57 ++++++++++++++++++++++++++++-------------
 1 file changed, 39 insertions(+), 18 deletions(-)

diff --git a/python/samba/emulate/traffic.py b/python/samba/emulate/traffic.py
index eb09dc8..3286b69 100644
--- a/python/samba/emulate/traffic.py
+++ b/python/samba/emulate/traffic.py
@@ -1816,13 +1816,17 @@ class GroupAssignments(object):
     def __init__(self, number_of_groups, groups_added, number_of_users,
                  users_added, group_memberships):
 
+        # populate a dictionary of empty sets (group is the dictionary key,
+        # because that optimizes for DB membership writes)
+        self.count = 0
+        self.assignments = {}
+        for i in range(number_of_groups, 0, -1):
+            self.assignments[i] = set()
+
         self.generate_group_distribution(number_of_groups)
         self.generate_user_distribution(number_of_users, group_memberships)
-        self.assignments = self.assign_groups(number_of_groups,
-                                              groups_added,
-                                              number_of_users,
-                                              users_added,
-                                              group_memberships)
+        self.assign_groups(number_of_groups, groups_added, number_of_users,
+                           users_added, group_memberships)
 
     def cumulative_distribution(self, weights):
         # make sure the probabilities conform to a cumulative distribution
@@ -1888,6 +1892,17 @@ class GroupAssignments(object):
 
         return user, group
 
+    def add_assignment(self, user, group):
+        if user not in self.assignments[group]:
+            self.assignments[group].add(user)
+            self.count += 1
+
+    def users_in_group(self, group):
+        return self.assignments[group]
+
+    def get_groups(self):
+        return self.assignments.keys()
+
     def assign_groups(self, number_of_groups, groups_added,
                       number_of_users, users_added, group_memberships):
         """Allocate users to groups.
@@ -1899,9 +1914,8 @@ class GroupAssignments(object):
         few users.
         """
 
-        assignments = set()
         if group_memberships <= 0:
-            return assignments
+            return
 
         # Calculate the number of group menberships required
         group_memberships = math.ceil(
@@ -1910,18 +1924,16 @@ class GroupAssignments(object):
 
         existing_users  = number_of_users  - users_added  - 1
         existing_groups = number_of_groups - groups_added - 1
-        while len(assignments) < group_memberships:
+        while self.total() < group_memberships:
             user, group = self.generate_random_membership()
 
             if group > existing_groups or user > existing_users:
                 # the + 1 converts the array index to the corresponding
                 # group or user number
-                assignments.add(((user + 1), (group + 1)))
-
-        return assignments
+                self.add_assignment((user + 1), (group + 1))
 
     def total(self):
-        return len(self.assignments)
+        return self.count
 
 
 def add_users_to_groups(db, instance_id, assignments):
@@ -1931,23 +1943,32 @@ def add_users_to_groups(db, instance_id, assignments):
     assign the users to their specified groups."""
 
     ou = ou_name(db, instance_id)
+    added = 0
     count = 0
     total = assignments.total()
 
     def build_dn(name):
         return("cn=%s,%s" % (name, ou))
 
-    for (user, group) in assignments.assignments:
-        user_dn  = build_dn(user_name(instance_id, user))
-        group_dn = build_dn(group_name(instance_id, group))
+    for group in assignments.get_groups():
+        users_in_group = assignments.users_in_group(group)
+        if len(users_in_group) == 0:
+            continue
 
+        group_dn = build_dn(group_name(instance_id, group))
         m = ldb.Message()
         m.dn = ldb.Dn(db, group_dn)
-        m["member"] = ldb.MessageElement(user_dn, ldb.FLAG_MOD_ADD, "member")
+
+        for user in users_in_group:
+            user_dn  = build_dn(user_name(instance_id, user))
+            idx = "member-" + str(user)
+            m[idx] = ldb.MessageElement(user_dn, ldb.FLAG_MOD_ADD, "member")
+
         db.modify(m)
+        added += len(users_in_group)
         count += 1
-        if count % 1000 == 0:
-            LOGGER.info("Added %u/%u memberships" % (count, total))
+        if count % 100 == 0:
+            LOGGER.info("Added %u/%u memberships" % (added, total))
 
 
 def generate_stats(statsdir, timing_file):
-- 
2.7.4