[PATCH] Add max-members option to traffic_replay

Tim Beale timbeale at catalyst.net.nz
Tue Nov 27 03:08:38 UTC 2018


The attached patch adds a new --max-members option to the traffic_replay
user generation.

This allows us to have finer-grained control over how the users are
assigned to groups, e.g. if we have 100K users but want to limit the
maximum number of users in a group to 5K.

CI link: https://gitlab.com/samba-team/devel/samba/pipelines/38004655

Merge request: https://gitlab.com/samba-team/samba/merge_requests/113

Review appreciated. Thanks.

-------------- next part --------------
From 3fb5febe01d3bbeca085d92d62be4f7c0e988f45 Mon Sep 17 00:00:00 2001
From: Tim Beale <timbeale at catalyst.net.nz>
Date: Tue, 27 Nov 2018 11:45:51 +1300
Subject: [PATCH 1/4] netcmd: Minor changes to 'group stats' command

These changes were inadvertently left off 0c910245fca70948a3.
(They were made to the 2nd patch-set iteration posted to the
mailing-list, but for some reason the first patch-set got delivered).

Changes are:
+ rework some variable names for better readability
+ Average members was calculated using integer division, so it lost any
floating point precision.
+ Replace 'Min members' (which was fairly meaningless) with 'Median
members per group'.
+ Fix flake8 long line warnings

Signed-off-by: Tim Beale <timbeale at catalyst.net.nz>
---
 python/samba/netcmd/group.py | 43 +++++++++++++++++++++++++------------------
 1 file changed, 25 insertions(+), 18 deletions(-)

diff --git a/python/samba/netcmd/group.py b/python/samba/netcmd/group.py
index 121161c..3d55222 100644
--- a/python/samba/netcmd/group.py
+++ b/python/samba/netcmd/group.py
@@ -358,7 +358,8 @@ class cmd_group_list(Command):
                     self.outf.write("Distribution     Universal")
                 else:
                     self.outf.write("                          ")
-                self.outf.write("   %u\n" % len(msg.get("member", default=[])))
+                num_members = len(msg.get("member", default=[]))
+                self.outf.write("    %6u\n" % num_members)
         else:
             for msg in res:
                 self.outf.write("%s\n" % msg.get("samaccountname", idx=0))
@@ -630,39 +631,45 @@ class cmd_group_stats(Command):
 
         for msg in res:
             name = str(msg.get("samaccountname"))
-            memberships = len(msg.get("member", default=[]))
-            group_assignments[name] = memberships
-            total_memberships += memberships
+            num_members = len(msg.get("member", default=[]))
+            group_assignments[name] = num_members
+            total_memberships += num_members
 
+        num_groups = res.count
         self.outf.write("Group membership statistics*\n")
         self.outf.write("-------------------------------------------------\n")
-        self.outf.write("Total groups: {0}\n".format(res.count))
+        self.outf.write("Total groups: {0}\n".format(num_groups))
         self.outf.write("Total memberships: {0}\n".format(total_memberships))
-        average = float(total_memberships / res.count)
+        average = total_memberships / float(num_groups)
         self.outf.write("Average members per group: %.2f\n" % average)
+
+        # find the max and median memberships (note that some default groups
+        # always have zero members, so displaying the min is not very helpful)
         group_names = list(group_assignments.keys())
         group_members = list(group_assignments.values())
-        # note that some builtin groups have no members, so this doesn't tell us much
-        idx = group_members.index(min(group_members))
-        self.outf.write("Min members: {0} ({1})\n".format(group_members[idx],
-                                                          group_names[idx]))
         idx = group_members.index(max(group_members))
         max_members = group_members[idx]
-        self.outf.write("Max members: {0} ({1})\n\n".format(max_members,
-                                                            group_names[idx]))
+        self.outf.write("Max members: {0} ({1})\n".format(max_members,
+                                                          group_names[idx]))
+        group_members.sort()
+        midpoint = num_groups // 2
+        median = group_members[midpoint]
+        if num_groups % 2 == 0:
+            median = (median + group_members[midpoint - 1]) / 2
+        self.outf.write("Median members per group: {0}\n\n".format(median))
 
         # convert this to the frequency of group membership, i.e. how many
         # groups have 5 members, how many have 6 members, etc
         group_freqs = defaultdict(int)
-        for group, count in group_assignments.items():
-            group_freqs[count] += 1
+        for group, num_members in group_assignments.items():
+            group_freqs[num_members] += 1
 
         # now squash this down even further, so that we just display the number
         # of groups that fall into one of the following membership bands
-        bands = [(0, 1), (2, 4), (5, 9), (10, 14), (15, 19), (20, 24), (25, 29),
-                 (30, 39), (40, 49), (50, 59), (60, 69), (70, 79), (80, 89),
-                 (90, 99), (100, 149), (150, 199), (200, 249), (250, 299),
-                 (300, 399), (400, 499), (500, 999), (1000, 1999),
+        bands = [(0, 1), (2, 4), (5, 9), (10, 14), (15, 19), (20, 24),
+                 (25, 29), (30, 39), (40, 49), (50, 59), (60, 69), (70, 79),
+                 (80, 89), (90, 99), (100, 149), (150, 199), (200, 249),
+                 (250, 299), (300, 399), (400, 499), (500, 999), (1000, 1999),
                  (2000, 2999), (3000, 3999), (4000, 4999), (5000, 9999),
                  (10000, max_members)]
 
-- 
2.7.4


From dcab499be3ff8cab6dabab37fecd45da9ffd26e5 Mon Sep 17 00:00:00 2001
From: Tim Beale <timbeale at catalyst.net.nz>
Date: Tue, 27 Nov 2018 11:51:51 +1300
Subject: [PATCH 2/4] tests: Add test-case for 'group list --verbose'

Check that the number of members reported is correct.
(This change somehow got left off the ca570bd4827aa commit that was
actually delivered).

Signed-off-by: Tim Beale <timbeale at catalyst.net.nz>
---
 python/samba/tests/samba_tool/group.py | 41 ++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/python/samba/tests/samba_tool/group.py b/python/samba/tests/samba_tool/group.py
index bb701e9..9862251 100644
--- a/python/samba/tests/samba_tool/group.py
+++ b/python/samba/tests/samba_tool/group.py
@@ -117,6 +117,47 @@ class GroupCmdTestCase(SambaToolCmdTest):
             found = self.assertMatch(out, name,
                                      "group '%s' not found" % name)
 
+    def test_list_verbose(self):
+        (result, out, err) = self.runsubcmd("group", "list", "--verbose",
+                                            "-H", "ldap://%s" % os.environ["DC_SERVER"],
+                                            "-U%s%%%s" % (os.environ["DC_USERNAME"],
+                                                          os.environ["DC_PASSWORD"]))
+        self.assertCmdSuccess(result, out, err, "Error running list --verbose")
+
+        # use the output to build a dictionary, where key=group-name,
+        # value=num-members
+        output_memberships = {}
+
+        # split the output by line, skipping the first 2 header lines
+        group_lines = out.split('\n')[2:-1]
+        for line in group_lines:
+            # split line by column whitespace (but keep the group name together
+            # if it contains spaces)
+            values = line.split("   ")
+            name = values[0]
+            num_members = int(values[-1])
+            output_memberships[name] = num_members
+
+        # build up a similar dict using an LDAP search
+        search_filter = "(objectClass=group)"
+        grouplist = self.samdb.search(base=self.samdb.domain_dn(),
+                                      scope=ldb.SCOPE_SUBTREE,
+                                      expression=search_filter,
+                                      attrs=["samaccountname", "member"])
+        self.assertTrue(len(grouplist) > 0, "no groups found in samdb")
+
+        ldap_memberships = {}
+        for groupobj in grouplist:
+            name = str(groupobj.get("samaccountname", idx=0))
+            num_members = len(groupobj.get("member", default=[]))
+            ldap_memberships[name] = num_members
+
+        # check the command output matches LDAP
+        self.assertTrue(output_memberships == ldap_memberships,
+                        "Command output doesn't match LDAP results.\n" +
+                        "Command='%s'\nLDAP='%s'" %(output_memberships,
+                                                    ldap_memberships))
+
     def test_listmembers(self):
         (result, out, err) = self.runsubcmd("group", "listmembers", "Domain Users",
                                             "-H", "ldap://%s" % os.environ["DC_SERVER"],
-- 
2.7.4


From 9ff2076100a7f6620a6f395cfebc2a37d859043d Mon Sep 17 00:00:00 2001
From: Tim Beale <timbeale at catalyst.net.nz>
Date: Tue, 27 Nov 2018 10:47:48 +1300
Subject: [PATCH 3/4] traffic: Rework how assignments are generated slightly

We want to cap the number of members that can be in a group. But first,
we need to tweak how the assignment dict gets generated, so that we get
rid of the intermediary set.

Signed-off-by: Tim Beale <timbeale at catalyst.net.nz>
---
 python/samba/emulate/traffic.py | 33 ++++++++++++++-------------------
 1 file changed, 14 insertions(+), 19 deletions(-)

diff --git a/python/samba/emulate/traffic.py b/python/samba/emulate/traffic.py
index 069c410..fd886e3 100644
--- a/python/samba/emulate/traffic.py
+++ b/python/samba/emulate/traffic.py
@@ -1813,11 +1813,9 @@ class GroupAssignments(object):
         self.count = 0
         self.generate_group_distribution(number_of_groups)
         self.generate_user_distribution(number_of_users, group_memberships)
-        self.assignments = self.assign_groups(number_of_groups,
-                                              groups_added,
-                                              number_of_users,
-                                              users_added,
-                                              group_memberships)
+        self.assignments = defaultdict(list)
+        self.assign_groups(number_of_groups, groups_added, number_of_users,
+                           users_added, group_memberships)
 
     def cumulative_distribution(self, weights):
         # make sure the probabilities conform to a cumulative distribution
@@ -1890,6 +1888,14 @@ class GroupAssignments(object):
     def get_groups(self):
         return self.assignments.keys()
 
+    def add_assignment(self, user, group):
+        # the assignments are stored in a dictionary where key=group,
+        # value=list-of-users-in-group (indexing by group-ID allows us to
+        # optimize for DB membership writes)
+        if user not in self.assignments[group]:
+            self.assignments[group].append(user)
+            self.count += 1
+
     def assign_groups(self, number_of_groups, groups_added,
                       number_of_users, users_added, group_memberships):
         """Allocate users to groups.
@@ -1901,9 +1907,8 @@ class GroupAssignments(object):
         few users.
         """
 
-        assignments = set()
         if group_memberships <= 0:
-            return {}
+            return
 
         # Calculate the number of group menberships required
         group_memberships = math.ceil(
@@ -1912,23 +1917,13 @@ class GroupAssignments(object):
 
         existing_users  = number_of_users  - users_added  - 1
         existing_groups = number_of_groups - groups_added - 1
-        while len(assignments) < group_memberships:
+        while self.total() < group_memberships:
             user, group = self.generate_random_membership()
 
             if group > existing_groups or user > existing_users:
                 # the + 1 converts the array index to the corresponding
                 # group or user number
-                assignments.add(((user + 1), (group + 1)))
-
-        # convert the set into a dictionary, where key=group, value=list-of-
-        # users-in-group (indexing by group-ID allows us to optimize for
-        # DB membership writes)
-        assignment_dict = defaultdict(list)
-        for (user, group) in assignments:
-            assignment_dict[group].append(user)
-            self.count += 1
-
-        return assignment_dict
+                self.add_assignment(user + 1, group + 1)
 
     def total(self):
         return self.count
-- 
2.7.4


From 6e192cb0930973367f71bcb3022764da8bd7500f Mon Sep 17 00:00:00 2001
From: Tim Beale <timbeale at catalyst.net.nz>
Date: Tue, 27 Nov 2018 13:50:32 +1300
Subject: [PATCH 4/4] traffic_replay: Add a max-members option to cap group
 size

traffic_replay tries to distribute the users among the groups in a
realistic manner - some groups will have almost all users in them.
However, this becomes a problem when testing a really large database,
e.g. we may want 100K users, but no more than 5K users in each group.

This patch adds a max-members option so we can limit how big the groups
actually get.

If we detect that a group has reached max-members, we reset the group's
probability (of getting selected) to zero, and then recalculate the
cumulative distribution. This means that the group should no longer get
selected by generate_random_membership(). (Note we can't completely
remove the group from the list because that changes the
list-index-to-group-ID mapping).

Signed-off-by: Tim Beale <timbeale at catalyst.net.nz>
---
 python/samba/emulate/traffic.py | 34 ++++++++++++++++++++++++++++++----
 script/traffic_replay           |  4 ++++
 2 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/python/samba/emulate/traffic.py b/python/samba/emulate/traffic.py
index fd886e3..291162f 100644
--- a/python/samba/emulate/traffic.py
+++ b/python/samba/emulate/traffic.py
@@ -1764,8 +1764,8 @@ def clean_up_accounts(ldb, instance_id):
 
 def generate_users_and_groups(ldb, instance_id, password,
                               number_of_users, number_of_groups,
-                              group_memberships, machine_accounts,
-                              traffic_accounts=True):
+                              group_memberships, max_members,
+                              machine_accounts, traffic_accounts=True):
     """Generate the required users and groups, allocating the users to
        those groups."""
     memberships_added = 0
@@ -1792,7 +1792,8 @@ def generate_users_and_groups(ldb, instance_id, password,
                                        groups_added,
                                        number_of_users,
                                        users_added,
-                                       group_memberships)
+                                       group_memberships,
+                                       max_members)
         LOGGER.info("Adding users to groups")
         add_users_to_groups(ldb, instance_id, assignments)
         memberships_added = assignments.total()
@@ -1808,11 +1809,12 @@ def generate_users_and_groups(ldb, instance_id, password,
 
 class GroupAssignments(object):
     def __init__(self, number_of_groups, groups_added, number_of_users,
-                 users_added, group_memberships):
+                 users_added, group_memberships, max_members):
 
         self.count = 0
         self.generate_group_distribution(number_of_groups)
         self.generate_user_distribution(number_of_users, group_memberships)
+        self.max_members = max_members
         self.assignments = defaultdict(list)
         self.assign_groups(number_of_groups, groups_added, number_of_users,
                            users_added, group_memberships)
@@ -1825,6 +1827,9 @@ class GroupAssignments(object):
         # value, so we can use random.random() as a simple index into the list
         dist = []
         total = sum(weights)
+        if total == 0:
+            return None
+
         cumulative = 0.0
         for probability in weights:
             cumulative += probability
@@ -1868,6 +1873,7 @@ class GroupAssignments(object):
             weights.append(p)
 
         # convert the weights to a cumulative distribution between 0.0 and 1.0
+        self.group_weights = weights
         self.group_dist = self.cumulative_distribution(weights)
 
     def generate_random_membership(self):
@@ -1888,6 +1894,18 @@ class GroupAssignments(object):
     def get_groups(self):
         return self.assignments.keys()
 
+    def cap_group_membership(self, group, max_members):
+        """Prevent the group's membership from exceeding the max specified"""
+        num_members = len(self.assignments[group])
+        if num_members >= max_members:
+            LOGGER.info("Group {0} has {1} members".format(group, num_members))
+
+            # remove this group and then recalculate the cumulative
+            # distribution, so this group is no longer selected
+            self.group_weights[group - 1] = 0
+            new_dist = self.cumulative_distribution(self.group_weights)
+            self.group_dist = new_dist
+
     def add_assignment(self, user, group):
         # the assignments are stored in a dictionary where key=group,
         # value=list-of-users-in-group (indexing by group-ID allows us to
@@ -1896,6 +1914,10 @@ class GroupAssignments(object):
             self.assignments[group].append(user)
             self.count += 1
 
+        # check if there's a cap on how big the groups can grow
+        if self.max_members:
+            self.cap_group_membership(group, self.max_members)
+
     def assign_groups(self, number_of_groups, groups_added,
                       number_of_users, users_added, group_memberships):
         """Allocate users to groups.
@@ -1915,6 +1937,10 @@ class GroupAssignments(object):
             float(group_memberships) *
             (float(users_added) / float(number_of_users)))
 
+        if self.max_members:
+            group_memberships = min(group_memberships,
+                                    self.max_members * number_of_groups)
+
         existing_users  = number_of_users  - users_added  - 1
         existing_groups = number_of_groups - groups_added - 1
         while self.total() < group_memberships:
diff --git a/script/traffic_replay b/script/traffic_replay
index 991c9a9..0ee0f9b 100755
--- a/script/traffic_replay
+++ b/script/traffic_replay
@@ -112,6 +112,8 @@ def main():
     user_gen_group.add_option('--group-memberships', type='int', default=0,
                               help='Total memberships to assign across all '
                               'test users and all groups')
+    user_gen_group.add_option('--max-members', type='int', default=None,
+                              help='Max users to add to any one group')
     parser.add_option_group(user_gen_group)
 
     sambaopts = options.SambaOptions(parser)
@@ -333,6 +335,7 @@ def main():
                                           opts.number_of_users,
                                           opts.number_of_groups,
                                           opts.group_memberships,
+                                          opts.max_members,
                                           machine_accounts=computer_accounts,
                                           traffic_accounts=False)
         sys.exit()
@@ -346,6 +349,7 @@ def main():
                                       number_of_users,
                                       opts.number_of_groups,
                                       opts.group_memberships,
+                                      opts.max_members,
                                       machine_accounts=len(conversations),
                                       traffic_accounts=True)
 
-- 
2.7.4



More information about the samba-technical mailing list