[PATCH] Improvements to traffic_replay user generation

Thu Oct 18 03:54:54 UTC 2018

Updated patch-set. I've split out most of the logging changes and will
resend those separately.

CI link: https://gitlab.com/catalyst-samba/samba/pipelines/33374613

On 18/10/18 8:49 AM, Tim Beale via samba-technical wrote:
> These patches make some improvements to the traffic_replay tool, when
> using the --generate-users-only to populate a large DB (e.g. 10,000+ users).
>
> The changes allow the tool to generate the users a lot faster, and tries
> to improve how users were randomly being assigned to groups.
>
> CI pass: https://gitlab.com/catalyst-samba/samba/pipelines/33218214
>
> Review appreciated. Thanks.
>
-------------- next part --------------
From 7d2d24109357a7b34d367aa755d3cc3e969174ad Mon Sep 17 00:00:00 2001
From: Tim Beale <timbeale at catalyst.net.nz>
Date: Thu, 11 Oct 2018 14:47:28 +1300
Subject: [PATCH 1/8] traffic_replay: Generate users faster by writing to local
 DB

We can create user accounts much faster if the LDB connection talks
directly to the local sam.ldb file rather than going via LDAP. This
patch allows the 'host' argument to the tool to be a .ldb file (e.g.
"/usr/local/samba/private/sam.ldb") instead of a server name/IP.

In most cases, the traffic_replay tool wants to run on a remote device
(because the point of it is to send traffic to the DC). However, the
--generate-users-only is one case where the tool can be run locally,
directly on the test DC. (The traffic_replay user generation is handy
for standalone testing, because it also handles assigning group
memberships to the generated user accounts).

Note that you also need to use '--option="ldb:nosync = true"' to get
the improvement in performance.

Signed-off-by: Tim Beale <timbeale at catalyst.net.nz>
---
 script/traffic_replay | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/script/traffic_replay b/script/traffic_replay
index 6578c84..3f4ff9b 100755
--- a/script/traffic_replay
+++ b/script/traffic_replay
@@ -30,6 +30,8 @@ from samba import gensec, get_debug_level
 from samba.emulate import traffic
 import samba.getopt as options
 from samba.logger import get_samba_logger
+from samba.samdb import SamDB
+from samba.auth import system_session
 
 
 def print_err(*args, **kwargs):
@@ -306,8 +308,16 @@ def main():
                          opts.number_of_groups)))
         sys.exit(1)
 
+    # Get an LDB connection.
     try:
-        ldb = traffic.openLdb(host, creds, lp)
+        # if we're only adding users, then it's OK to pass a sam.ldb filepath
+        # as the host, which creates the users much faster. In all other cases
+        # we should be connecting to a remote DC
+        if opts.generate_users_only and os.path.isfile(host):
+            ldb = SamDB(url="ldb://{0}".format(host),
+                        session_info=system_session(), lp=lp)
+        else:
+            ldb = traffic.openLdb(host, creds, lp)
     except:
         logger.error(("\nInitial LDAP connection failed! Did you supply "
                       "a DNS host name and the correct credentials?"))
-- 
2.7.4


From a4bd9d1893bd4a77bb978ec72f19a6cdafbcec1c Mon Sep 17 00:00:00 2001
From: Tim Beale <timbeale at catalyst.net.nz>
Date: Wed, 17 Oct 2018 11:26:28 +1300
Subject: [PATCH 2/8] traffic_replay: Change print() to use logger()

This reduces noise, so the messages only come out if you specify
--debug.

Signed-off-by: Tim Beale <timbeale at catalyst.net.nz>
---
 python/samba/emulate/traffic.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/samba/emulate/traffic.py b/python/samba/emulate/traffic.py
index 35100ca..677ad82 100644
--- a/python/samba/emulate/traffic.py
+++ b/python/samba/emulate/traffic.py
@@ -1892,7 +1892,7 @@ def add_users_to_groups(db, instance_id, assignments):
         db.modify(m)
         end = time.time()
         duration = end - start
-        print("%f\t0\tadd\tuser\t%f\tTrue\t" % (end, duration))
+        LOGGER.info("%f\t0\tadd\tuser\t%f\tTrue\t" % (end, duration))
 
 
 def generate_stats(statsdir, timing_file):
-- 
2.7.4


From b099c87d395a2a39addf090ffec9fee62d903337 Mon Sep 17 00:00:00 2001
From: Tim Beale <timbeale at catalyst.net.nz>
Date: Thu, 18 Oct 2018 16:36:44 +1300
Subject: [PATCH 3/8] traffic_replay: Add helper class for group assignments

Wrap up the group assignment calculations in a helper class. We're going
to tweak the internals a bit in subsequent patches, but the rest of the
code doesn't really need to know about these changes.

Signed-off-by: Tim Beale <timbeale at catalyst.net.nz>
---
 python/samba/emulate/traffic.py | 124 ++++++++++++++++++++++------------------
 1 file changed, 67 insertions(+), 57 deletions(-)

diff --git a/python/samba/emulate/traffic.py b/python/samba/emulate/traffic.py
index 677ad82..cdff76d 100644
--- a/python/samba/emulate/traffic.py
+++ b/python/samba/emulate/traffic.py
@@ -1779,7 +1779,7 @@ def generate_users_and_groups(ldb, instance_id, password,
                               group_memberships):
     """Generate the required users and groups, allocating the users to
        those groups."""
-    assignments = []
+    memberships_added = 0
     groups_added  = 0
 
     create_ou(ldb, instance_id)
@@ -1793,13 +1793,14 @@ def generate_users_and_groups(ldb, instance_id, password,
 
     if group_memberships > 0:
         print("Assigning users to groups", file=sys.stderr)
-        assignments = assign_groups(number_of_groups,
-                                    groups_added,
-                                    number_of_users,
-                                    users_added,
-                                    group_memberships)
+        assignments = GroupAssignments(number_of_groups,
+                                       groups_added,
+                                       number_of_users,
+                                       users_added,
+                                       group_memberships)
         print("Adding users to groups", file=sys.stderr)
         add_users_to_groups(ldb, instance_id, assignments)
+        memberships_added = assignments.total()
 
     if (groups_added > 0 and users_added == 0 and
        number_of_groups != groups_added):
@@ -1807,67 +1808,76 @@ def generate_users_and_groups(ldb, instance_id, password,
               file=sys.stderr)
 
     print(("Added %d users, %d groups and %d group memberships" %
-           (users_added, groups_added, len(assignments))),
+           (users_added, groups_added, memberships_added)),
           file=sys.stderr)
 
 
-def assign_groups(number_of_groups,
-                  groups_added,
-                  number_of_users,
-                  users_added,
-                  group_memberships):
-    """Allocate users to groups.
+class GroupAssignments(object):
+    def __init__(self, number_of_groups, groups_added, number_of_users,
+                 users_added, group_memberships):
+        self.assignments = self.assign_groups(number_of_groups,
+                                              groups_added,
+                                              number_of_users,
+                                              users_added,
+                                              group_memberships)
 
-    The intention is to have a few users that belong to most groups, while
-    the majority of users belong to a few groups.
+    def assign_groups(self, number_of_groups, groups_added, number_of_users,
+                      users_added, group_memberships):
+        """Allocate users to groups.
 
-    A few groups will contain most users, with the remaining only having a
-    few users.
-    """
+        The intention is to have a few users that belong to most groups, while
+        the majority of users belong to a few groups.
 
-    def generate_user_distribution(n):
-        """Probability distribution of a user belonging to a group.
+        A few groups will contain most users, with the remaining only having a
+        few users.
         """
-        dist = []
-        for x in range(1, n + 1):
-            p = 1 / (x + 0.001)
-            dist.append(p)
-        return dist
-
-    def generate_group_distribution(n):
-        """Probability distribution of a group containing a user."""
-        dist = []
-        for x in range(1, n + 1):
-            p = 1 / (x**1.3)
-            dist.append(p)
-        return dist
-
-    assignments = set()
-    if group_memberships <= 0:
-        return assignments
-
-    group_dist = generate_group_distribution(number_of_groups)
-    user_dist  = generate_user_distribution(number_of_users)
 
-    # Calculate the number of group menberships required
-    group_memberships = math.ceil(
-        float(group_memberships) *
-        (float(users_added) / float(number_of_users)))
-
-    existing_users  = number_of_users  - users_added  - 1
-    existing_groups = number_of_groups - groups_added - 1
-    while len(assignments) < group_memberships:
-        user        = random.randint(0, number_of_users - 1)
-        group       = random.randint(0, number_of_groups - 1)
-        probability = group_dist[group] * user_dist[user]
+        def generate_user_distribution(n):
+            """Probability distribution of a user belonging to a group.
+            """
+            dist = []
+            for x in range(1, n + 1):
+                p = 1 / (x + 0.001)
+                dist.append(p)
+            return dist
+
+        def generate_group_distribution(n):
+            """Probability distribution of a group containing a user."""
+            dist = []
+            for x in range(1, n + 1):
+                p = 1 / (x**1.3)
+                dist.append(p)
+            return dist
+
+        assignments = set()
+        if group_memberships <= 0:
+            return assignments
+
+        group_dist = generate_group_distribution(number_of_groups)
+        user_dist  = generate_user_distribution(number_of_users)
+
+        # Calculate the number of group menberships required
+        group_memberships = math.ceil(
+            float(group_memberships) *
+            (float(users_added) / float(number_of_users)))
+
+        existing_users  = number_of_users  - users_added  - 1
+        existing_groups = number_of_groups - groups_added - 1
+        while len(assignments) < group_memberships:
+            user        = random.randint(0, number_of_users - 1)
+            group       = random.randint(0, number_of_groups - 1)
+            probability = group_dist[group] * user_dist[user]
+
+            if ((random.random() < probability * 10000) and
+               (group > existing_groups or user > existing_users)):
+                # the + 1 converts the array index to the corresponding
+                # group or user number
+                assignments.add(((user + 1), (group + 1)))
 
-        if ((random.random() < probability * 10000) and
-           (group > existing_groups or user > existing_users)):
-            # the + 1 converts the array index to the corresponding
-            # group or user number
-            assignments.add(((user + 1), (group + 1)))
+        return assignments
 
-    return assignments
+    def total(self):
+        return len(self.assignments)
 
 
 def add_users_to_groups(db, instance_id, assignments):
-- 
2.7.4


From df6820f3bc3207f1988db9dbd60689f2b31bf4ff Mon Sep 17 00:00:00 2001
From: Tim Beale <timbeale at catalyst.net.nz>
Date: Wed, 17 Oct 2018 12:54:03 +1300
Subject: [PATCH 4/8] traffic_replay: Split out random group membership 
 generation logic

This doesn't change functionality at all. It just moves the probability
calculations out into separate functions.

We want to tweak the logic/implementation behind this code, but the
rest of assign_groups() doesn't really care how the underlying
probabilities are worked out, so long as it gets a suitably random
user/group membership each time round the loop.

Signed-off-by: Tim Beale <timbeale at catalyst.net.nz>
---
 python/samba/emulate/traffic.py | 63 +++++++++++++++++++++++------------------
 1 file changed, 36 insertions(+), 27 deletions(-)

diff --git a/python/samba/emulate/traffic.py b/python/samba/emulate/traffic.py
index cdff76d..55dadd3 100644
--- a/python/samba/emulate/traffic.py
+++ b/python/samba/emulate/traffic.py
@@ -1815,14 +1815,46 @@ def generate_users_and_groups(ldb, instance_id, password,
 class GroupAssignments(object):
     def __init__(self, number_of_groups, groups_added, number_of_users,
                  users_added, group_memberships):
+
+        self.generate_group_distribution(number_of_groups)
+        self.generate_user_distribution(number_of_users)
         self.assignments = self.assign_groups(number_of_groups,
                                               groups_added,
                                               number_of_users,
                                               users_added,
                                               group_memberships)
 
-    def assign_groups(self, number_of_groups, groups_added, number_of_users,
-                      users_added, group_memberships):
+    def generate_user_distribution(self, n):
+        """Probability distribution of a user belonging to a group.
+        """
+        self.user_dist = []
+        for x in range(1, n + 1):
+            p = 1 / (x + 0.001)
+            self.user_dist.append(p)
+
+        self.num_users = n
+
+    def generate_group_distribution(self, n):
+        """Probability distribution of a group containing a user."""
+        self.group_dist = []
+        for x in range(1, n + 1):
+            p = 1 / (x**1.3)
+            self.group_dist.append(p)
+
+        self.num_groups = n
+
+    def generate_random_membership(self):
+        """Returns a randomly generated user-group membership"""
+        while True:
+            user        = random.randint(0, self.num_users - 1)
+            group       = random.randint(0, self.num_groups - 1)
+            probability = self.group_dist[group] * self.user_dist[user]
+
+            if random.random() < probability * 10000:
+                return user, group
+
+    def assign_groups(self, number_of_groups, groups_added,
+                      number_of_users, users_added, group_memberships):
         """Allocate users to groups.
 
         The intention is to have a few users that belong to most groups, while
@@ -1832,30 +1864,10 @@ class GroupAssignments(object):
         few users.
         """
 
-        def generate_user_distribution(n):
-            """Probability distribution of a user belonging to a group.
-            """
-            dist = []
-            for x in range(1, n + 1):
-                p = 1 / (x + 0.001)
-                dist.append(p)
-            return dist
-
-        def generate_group_distribution(n):
-            """Probability distribution of a group containing a user."""
-            dist = []
-            for x in range(1, n + 1):
-                p = 1 / (x**1.3)
-                dist.append(p)
-            return dist
-
         assignments = set()
         if group_memberships <= 0:
             return assignments
 
-        group_dist = generate_group_distribution(number_of_groups)
-        user_dist  = generate_user_distribution(number_of_users)
-
         # Calculate the number of group menberships required
         group_memberships = math.ceil(
             float(group_memberships) *
@@ -1864,12 +1876,9 @@ class GroupAssignments(object):
         existing_users  = number_of_users  - users_added  - 1
         existing_groups = number_of_groups - groups_added - 1
         while len(assignments) < group_memberships:
-            user        = random.randint(0, number_of_users - 1)
-            group       = random.randint(0, number_of_groups - 1)
-            probability = group_dist[group] * user_dist[user]
+            user, group = self.generate_random_membership()
 
-            if ((random.random() < probability * 10000) and
-               (group > existing_groups or user > existing_users)):
+            if group > existing_groups or user > existing_users:
                 # the + 1 converts the array index to the corresponding
                 # group or user number
                 assignments.add(((user + 1), (group + 1)))
-- 
2.7.4


From 3aa7938cce4c4ed2c43d02149f625f418242d683 Mon Sep 17 00:00:00 2001
From: Tim Beale <timbeale at catalyst.net.nz>
Date: Mon, 15 Oct 2018 16:24:00 +1300
Subject: [PATCH 5/8] traffic_replay: Improve assign_groups() performance with
 large domains

When assigning 10,000 users to 15 groups each (on average),
assign_groups() would take over 30 seconds. This did not include any DB
operations whatsoever. This patch improves things, so that it takes less
than a second in the same situation.

The problem was the code was looping ~23 million times where the
'random.random() < probability * 10000' condition was not met. The
problem is individual group/user probabilities get lower as the number
of groups/users increases. And so with large numbers of users, most of
the time the calculated probability was very small and didn't meet the
threshold.

This patch changes it so we can select a user/group in one go, avoiding
the need to loop multiple times.

Basically we distribute the users (or groups) between 0.0 and 1.0, so
that each user has their own 'slice', and this slice is proporational to
their weighted probability. random.random() generates a value between
0.0 and 1.0, so we can use this to pick a 'slice' (or rather, we use
this as an index into the list, using .bisect()). Users/groups with
larger probabilities end up with larger slices, so are more likely to
get picked.

The end result is roughly the same distribution as before, although the
first 10 or so user/groups seem to get picked more frequently, so the
weighted-probability calculations may need tweaking some more.

Signed-off-by: Tim Beale <timbeale at catalyst.net.nz>
---
 python/samba/emulate/traffic.py | 47 ++++++++++++++++++++++++++++++-----------
 1 file changed, 35 insertions(+), 12 deletions(-)

diff --git a/python/samba/emulate/traffic.py b/python/samba/emulate/traffic.py
index 55dadd3..b79fc28 100644
--- a/python/samba/emulate/traffic.py
+++ b/python/samba/emulate/traffic.py
@@ -52,6 +52,7 @@ from samba import gensec
 from samba import sd_utils
 from samba.compat import get_string
 from samba.logger import get_samba_logger
+import bisect
 
 SLEEP_OVERHEAD = 3e-4
 
@@ -1824,34 +1825,56 @@ class GroupAssignments(object):
                                               users_added,
                                               group_memberships)
 
+    def cumulative_distribution(self, weights):
+        # make sure the probabilities conform to a cumulative distribution
+        # spread between 0.0 and 1.0. Dividing by the weighted total gives each
+        # probability a proportional share of 1.0. Higher probabilities get a
+        # bigger share, so are more likely to be picked. We use the cumulative
+        # value, so we can use random.random() as a simple index into the list
+        dist = []
+        total = sum(weights)
+        cumulative = 0.0
+        for probability in weights:
+            cumulative += probability
+            dist.append(cumulative / total)
+        return dist
+
     def generate_user_distribution(self, n):
         """Probability distribution of a user belonging to a group.
         """
-        self.user_dist = []
+        # Assign a weighted probability to each user. Probability decreases
+        # as the user-ID increases
+        weights = []
         for x in range(1, n + 1):
             p = 1 / (x + 0.001)
-            self.user_dist.append(p)
+            weights.append(p)
 
-        self.num_users = n
+        # convert the weights to a cumulative distribution between 0.0 and 1.0
+        self.user_dist = self.cumulative_distribution(weights)
 
     def generate_group_distribution(self, n):
         """Probability distribution of a group containing a user."""
-        self.group_dist = []
+
+        # Assign a weighted probability to each user. Probability decreases
+        # as the group-ID increases
+        weights = []
         for x in range(1, n + 1):
             p = 1 / (x**1.3)
-            self.group_dist.append(p)
+            weights.append(p)
 
-        self.num_groups = n
+        # convert the weights to a cumulative distribution between 0.0 and 1.0
+        self.group_dist = self.cumulative_distribution(weights)
 
     def generate_random_membership(self):
         """Returns a randomly generated user-group membership"""
-        while True:
-            user        = random.randint(0, self.num_users - 1)
-            group       = random.randint(0, self.num_groups - 1)
-            probability = self.group_dist[group] * self.user_dist[user]
 
-            if random.random() < probability * 10000:
-                return user, group
+        # the list items are cumulative distribution values between 0.0 and
+        # 1.0, which makes random() a handy way to index the list to get a
+        # weighted random user/group
+        user = bisect.bisect(self.user_dist, random.random())
+        group = bisect.bisect(self.group_dist, random.random())
+
+        return user, group
 
     def assign_groups(self, number_of_groups, groups_added,
                       number_of_users, users_added, group_memberships):
-- 
2.7.4


From d93126503feaffd488ea22699e1cd45579cb3f77 Mon Sep 17 00:00:00 2001
From: Tim Beale <timbeale at catalyst.net.nz>
Date: Tue, 16 Oct 2018 10:57:29 +1300
Subject: [PATCH 6/8] traffic_replay: Change user distribution to use Pareto
 Distribution

The current probability we were assigning to users roughly approximates
the Pareto Distribution (with shape=1.0). This means the code now uses a
documented algorithm (i.e. explanation on Wikipedia). It also allows us
to vary the distribution by changing the shape parameter.

Signed-off-by: Tim Beale <timbeale at catalyst.net.nz>
---
 python/samba/emulate/traffic.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/python/samba/emulate/traffic.py b/python/samba/emulate/traffic.py
index b79fc28..5ab56b2 100644
--- a/python/samba/emulate/traffic.py
+++ b/python/samba/emulate/traffic.py
@@ -1842,11 +1842,12 @@ class GroupAssignments(object):
     def generate_user_distribution(self, n):
         """Probability distribution of a user belonging to a group.
         """
-        # Assign a weighted probability to each user. Probability decreases
-        # as the user-ID increases
+        # Assign a weighted probability to each user. Use the Pareto
+        # Distribution so that some users are in a lot of groups, and the
+        # bulk of users are in only a few groups
         weights = []
         for x in range(1, n + 1):
-            p = 1 / (x + 0.001)
+            p = random.paretovariate(1.0)
             weights.append(p)
 
         # convert the weights to a cumulative distribution between 0.0 and 1.0
-- 
2.7.4


From 080a1f8ba9fe00e221c24a628db1707fe093400b Mon Sep 17 00:00:00 2001
From: Tim Beale <timbeale at catalyst.net.nz>
Date: Tue, 16 Oct 2018 16:01:25 +1300
Subject: [PATCH 7/8] traffic_replay: Prevent users having 1000+ memberOf links

When adding 10,000 users, one user would end up in over 1000 groups.
With 100,000 users, it would be more like 10,000 groups. While it makes
sense to have groups with large numbers of users, having a single user
in 1000s of groups is probably less realistic.

This patch changes the shape of the Pareto distribution that we use to
assign users to groups. The aim is to cap users at belonging to at most
~500 groups. Increasing the shape of the Pareto distribution pushes the
user assignments so they're closer to the average, and the tail (with
users in lots of groups) is not so large).

Signed-off-by: Tim Beale <timbeale at catalyst.net.nz>
---
 python/samba/emulate/traffic.py | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/python/samba/emulate/traffic.py b/python/samba/emulate/traffic.py
index 5ab56b2..04a5cbe 100644
--- a/python/samba/emulate/traffic.py
+++ b/python/samba/emulate/traffic.py
@@ -1818,7 +1818,7 @@ class GroupAssignments(object):
                  users_added, group_memberships):
 
         self.generate_group_distribution(number_of_groups)
-        self.generate_user_distribution(number_of_users)
+        self.generate_user_distribution(number_of_users, group_memberships)
         self.assignments = self.assign_groups(number_of_groups,
                                               groups_added,
                                               number_of_users,
@@ -1839,15 +1839,27 @@ class GroupAssignments(object):
             dist.append(cumulative / total)
         return dist
 
-    def generate_user_distribution(self, n):
+    def generate_user_distribution(self, num_users, num_memberships):
         """Probability distribution of a user belonging to a group.
         """
         # Assign a weighted probability to each user. Use the Pareto
         # Distribution so that some users are in a lot of groups, and the
-        # bulk of users are in only a few groups
+        # bulk of users are in only a few groups. If we're assigning a large
+        # number of group memberships, use a higher shape. This means slightly
+        # fewer outlying users that are in large numbers of groups. The aim is
+        # to have no users belonging to more than ~500 groups.
+        if num_memberships > 5000000:
+            shape = 3.0
+        elif num_memberships > 2000000:
+            shape = 2.5
+        elif num_memberships > 300000:
+            shape = 2.25
+        else:
+            shape = 1.75
+
         weights = []
-        for x in range(1, n + 1):
-            p = random.paretovariate(1.0)
+        for x in range(1, num_users + 1):
+            p = random.paretovariate(shape)
             weights.append(p)
 
         # convert the weights to a cumulative distribution between 0.0 and 1.0
-- 
2.7.4


From 66f08ae486c09770f19bedb97f16dc93d8c6195e Mon Sep 17 00:00:00 2001
From: Tim Beale <timbeale at catalyst.net.nz>
Date: Wed, 17 Oct 2018 13:22:59 +1300
Subject: [PATCH 8/8] traffic_replay: Write group memberships once per group

Each user-group membership was being written to the DB in a single
operation. With large numbers of users (e.g. 10,000 in average 15 groups
each), this becomes a lot of operations (e.g. 150,000). This patch
reworks the code so that we write *all* the memberships for a group in
one operation. E.g. instead of 150,000 DB operations, we might make
1,500. This makes writing the group memberships several times
faster.

To do this, we reorganize the assignments so instead of being a set of
tuples, it's a dictionary where key=group and
value=list-of-users-in-group.

Signed-off-by: Tim Beale <timbeale at catalyst.net.nz>
---
 python/samba/emulate/traffic.py | 51 ++++++++++++++++++++++++++++-------------
 1 file changed, 35 insertions(+), 16 deletions(-)

diff --git a/python/samba/emulate/traffic.py b/python/samba/emulate/traffic.py
index 04a5cbe..89e876f 100644
--- a/python/samba/emulate/traffic.py
+++ b/python/samba/emulate/traffic.py
@@ -1817,13 +1817,17 @@ class GroupAssignments(object):
     def __init__(self, number_of_groups, groups_added, number_of_users,
                  users_added, group_memberships):
 
+        # populate a dictionary of empty sets (group is the dictionary key,
+        # because that optimizes for DB membership writes)
+        self.count = 0
+        self.assignments = {}
+        for i in range(number_of_groups, 0, -1):
+            self.assignments[i] = set()
+
         self.generate_group_distribution(number_of_groups)
         self.generate_user_distribution(number_of_users, group_memberships)
-        self.assignments = self.assign_groups(number_of_groups,
-                                              groups_added,
-                                              number_of_users,
-                                              users_added,
-                                              group_memberships)
+        self.assign_groups(number_of_groups, groups_added, number_of_users,
+                           users_added, group_memberships)
 
     def cumulative_distribution(self, weights):
         # make sure the probabilities conform to a cumulative distribution
@@ -1889,6 +1893,17 @@ class GroupAssignments(object):
 
         return user, group
 
+    def add_assignment(self, user, group):
+        if user not in self.assignments[group]:
+            self.assignments[group].add(user)
+            self.count += 1
+
+    def users_in_group(self, group):
+        return self.assignments[group]
+
+    def get_groups(self):
+        return self.assignments.keys()
+
     def assign_groups(self, number_of_groups, groups_added,
                       number_of_users, users_added, group_memberships):
         """Allocate users to groups.
@@ -1900,9 +1915,8 @@ class GroupAssignments(object):
         few users.
         """
 
-        assignments = set()
         if group_memberships <= 0:
-            return assignments
+            return
 
         # Calculate the number of group menberships required
         group_memberships = math.ceil(
@@ -1911,18 +1925,16 @@ class GroupAssignments(object):
 
         existing_users  = number_of_users  - users_added  - 1
         existing_groups = number_of_groups - groups_added - 1
-        while len(assignments) < group_memberships:
+        while self.total() < group_memberships:
             user, group = self.generate_random_membership()
 
             if group > existing_groups or user > existing_users:
                 # the + 1 converts the array index to the corresponding
                 # group or user number
-                assignments.add(((user + 1), (group + 1)))
-
-        return assignments
+                self.add_assignment((user + 1), (group + 1))
 
     def total(self):
-        return len(self.assignments)
+        return self.count
 
 
 def add_users_to_groups(db, instance_id, assignments):
@@ -1936,14 +1948,21 @@ def add_users_to_groups(db, instance_id, assignments):
     def build_dn(name):
         return("cn=%s,%s" % (name, ou))
 
-    for (user, group) in assignments:
-        user_dn  = build_dn(user_name(instance_id, user))
-        group_dn = build_dn(group_name(instance_id, group))
+    for group in assignments.get_groups():
+        users_in_group = assignments.users_in_group(group)
+        if len(users_in_group) == 0:
+            continue
 
+        group_dn = build_dn(group_name(instance_id, group))
         m = ldb.Message()
         m.dn = ldb.Dn(db, group_dn)
-        m["member"] = ldb.MessageElement(user_dn, ldb.FLAG_MOD_ADD, "member")
         start = time.time()
+
+        for user in users_in_group:
+            user_dn  = build_dn(user_name(instance_id, user))
+            idx = "member-" + str(user)
+            m[idx] = ldb.MessageElement(user_dn, ldb.FLAG_MOD_ADD, "member")
+
         db.modify(m)
         end = time.time()
         duration = end - start
-- 
2.7.4