[SCM] Samba Shared Repository - branch master updated

Wed Jul 24 07:08:01 UTC 2019

The branch, master has been updated
       via  9d2fd082498 netcmd: Better error message for backup with no RID pool
       via  6c691bf84e4 partition: reversing partition unlocking
       via  7f4bc0ea81f partition: correcting lock ordering
      from  54af94ff21a s4/source4/common: clang: Fix 'Dereference of undefined pointer value'

https://git.samba.org/?p=samba.git;a=shortlog;h=master


- Log -----------------------------------------------------------------
commit 9d2fd08249881702a4b4b881688d91576cc71ede
Author: Tim Beale <timbeale at catalyst.net.nz>
Date:   Wed Jul 24 14:17:06 2019 +1200

    netcmd: Better error message for backup with no RID pool
    
    Add a better error message (and what to do about it) if the user tries
    to back up a DC that hasn't initialized its RID pool yet.
    
    Seems to be a fairly common problem hit by users.
    
    BUG: https://bugzilla.samba.org/show_bug.cgi?id=14048
    RN: Added more informative error message if the 'samba-tool domain
    backup' command fails due to no RID pool being present on the DC.
    
    Signed-off-by: Tim Beale <timbeale at catalyst.net.nz>
    Reviewed-by: Andrew Bartlett <abartlet at samba.org>
    
    Autobuild-User(master): Andrew Bartlett <abartlet at samba.org>
    Autobuild-Date(master): Wed Jul 24 07:07:01 UTC 2019 on sn-devel-184

commit 6c691bf84e41b1edd3228c219f7a94e108795d28
Author: Aaron Haslett <aaronhaslett at catalyst.net.nz>
Date:   Mon Jul 15 13:32:41 2019 +1200

    partition: reversing partition unlocking
    
    Unlock partition databases in the reverse order from which they were
    acquired. This is separated from the previous commit for future
    bisecting purposes, since the last commit was made to fix specific CI
    failures, while this one is a speculative fix made based on code
    inspection.
    
    Signed-off-by: Aaron Haslett <aaronhaslett at catalyst.net.nz>
    Reviewed-by: Garming Sam <garming at catalyst.net.nz>
    Reviewed-by: Andrew Bartlett <abartlet at samba.org>

commit 7f4bc0ea81f2b34607849911f1271b030be8ca02
Author: Aaron Haslett <aaronhaslett at catalyst.net.nz>
Date:   Thu Jul 11 17:12:06 2019 +1200

    partition: correcting lock ordering
    
    A schema reading bug was traced to a lock ordering issue in partition.c.
    This patch fixes the problem by:
    1. Releasing locks/transactions in the order they were acquired.
    2. Always lock/start_trans on metadata.tdb first, before any other
    databases, and release it last, after all others. This is so that we are
    never exposed to MDB's lock semantics, which we don't support.
    
    Signed-off-by: Aaron Haslett <aaronhaslett at catalyst.net.nz>
    Reviewed-by: Garming Sam <garming at catalyst.net.nz>
    Reviewed-by: Andrew Bartlett <abartlet at samba.org>

-----------------------------------------------------------------------

Summary of changes:
 python/samba/netcmd/domain_backup.py       |  18 +-
 source4/dsdb/samdb/ldb_modules/partition.c | 260 +++++++++++++++++------------
 2 files changed, 169 insertions(+), 109 deletions(-)


Changeset truncated at 500 lines:

diff --git a/python/samba/netcmd/domain_backup.py b/python/samba/netcmd/domain_backup.py
index 4e32b4b9b1c..cca6db49b43 100644
--- a/python/samba/netcmd/domain_backup.py
+++ b/python/samba/netcmd/domain_backup.py
@@ -59,7 +59,7 @@ from samba.ndr import ndr_pack
 # work out a SID (based on a free RID) to use when the domain gets restored.
 # This ensures that the restored DC's SID won't clash with any other RIDs
 # already in use in the domain
-def get_sid_for_restore(samdb):
+def get_sid_for_restore(samdb, logger):
     # Find the DN of the RID set of the server
     res = samdb.search(base=ldb.Dn(samdb, samdb.get_serverName()),
                        scope=ldb.SCOPE_BASE, attrs=["serverReference"])
@@ -78,7 +78,15 @@ def get_sid_for_restore(samdb):
                               'rIDNextRID'])
 
     # Decode the bounds of the RID allocation pools
-    rid = int(res[0].get('rIDNextRID')[0])
+    try:
+        rid = int(res[0].get('rIDNextRID')[0])
+    except IndexError:
+        logger.info("The RID pool for this DC is not initalized "
+                    "(e.g. it may be a fairly new DC).")
+        logger.info("To initialize it, create a temporary user on this DC "
+                    "(you can delete it later).")
+        raise CommandError("Cannot create backup - "
+                           "please initialize this DC's RID pool first.")
 
     def split_val(num):
         high = (0xFFFFFFFF00000000 & int(num)) >> 32
@@ -255,7 +263,7 @@ class cmd_domain_backup_online(samba.netcmd.Command):
         # Get a free RID to use as the new DC's SID (when it gets restored)
         remote_sam = SamDB(url='ldap://' + server, credentials=creds,
                            session_info=system_session(), lp=lp)
-        new_sid = get_sid_for_restore(remote_sam)
+        new_sid = get_sid_for_restore(remote_sam, logger)
         realm = remote_sam.domain_dns_name()
 
         # Grab the remote DC's sysvol files and bundle them into a tar file
@@ -838,7 +846,7 @@ class cmd_domain_backup_rename(samba.netcmd.Command):
         # get a free RID to use as the new DC's SID (when it gets restored)
         remote_sam = SamDB(url='ldap://' + server, credentials=creds,
                            session_info=system_session(), lp=lp)
-        new_sid = get_sid_for_restore(remote_sam)
+        new_sid = get_sid_for_restore(remote_sam, logger)
 
         # Grab the remote DC's sysvol files and bundle them into a tar file.
         # Note we end up with 2 sysvol dirs - the original domain's files (that
@@ -1016,7 +1024,7 @@ class cmd_domain_backup_offline(samba.netcmd.Command):
         check_targetdir(logger, targetdir)
 
         samdb = SamDB(url=paths.samdb, session_info=system_session(), lp=lp)
-        sid = get_sid_for_restore(samdb)
+        sid = get_sid_for_restore(samdb, logger)
 
         backup_dirs = [paths.private_dir, paths.state_dir,
                        os.path.dirname(paths.smbconf)]  # etc dir
diff --git a/source4/dsdb/samdb/ldb_modules/partition.c b/source4/dsdb/samdb/ldb_modules/partition.c
index 4cfcf6f3ba7..e34ba35680b 100644
--- a/source4/dsdb/samdb/ldb_modules/partition.c
+++ b/source4/dsdb/samdb/ldb_modules/partition.c
@@ -1032,8 +1032,8 @@ static int partition_rename(struct ldb_module *module, struct ldb_request *req)
 /* start a transaction */
 int partition_start_trans(struct ldb_module *module)
 {
-	int i;
-	int ret;
+	int i = 0;
+	int ret = 0;
 	struct partition_private_data *data = talloc_get_type(ldb_module_get_private(module),
 							      struct partition_private_data);
 	/* Look at base DN */
@@ -1043,18 +1043,58 @@ int partition_start_trans(struct ldb_module *module)
 		ldb_debug(ldb_module_get_ctx(module), LDB_DEBUG_TRACE, "partition_start_trans() -> (metadata partition)");
 	}
 
-	/* This order must match that in prepare_commit() and read_lock() */
+	/*
+	 * We start a transaction on metadata.tdb first and end it last in
+	 * end_trans. This makes locking semantics follow TDB rather than MDB,
+	 * and effectively locks all partitions at once.
+	 * Detail:
+	 * Samba AD is special in that the partitions module (this file)
+	 * combines multiple independently locked databases into one overall
+	 * transaction. Changes across multiple partition DBs in a single
+	 * transaction must ALL be either visible or invisible.
+	 * The way this is achieved is by taking out a write lock on
+	 * metadata.tdb at the start of prepare_commit, while unlocking it at
+	 * the end of end_trans. This is matched by read_lock, ensuring it
+	 * can't progress until that write lock is released.
+	 *
+	 * metadata.tdb needs to be a TDB file because MDB uses independent
+	 * locks, which means a read lock and a write lock can be held at the
+	 * same time, whereas in TDB, the two locks block each other. The TDB
+	 * behaviour is required to implement the functionality described
+	 * above.
+	 *
+	 * An important additional detail here is that if prepare_commit is
+	 * called on a TDB without any changes being made, no write lock is
+	 * taken. We address this by storing a sequence number in metadata.tdb
+	 * which is updated every time a replicated attribute is modified.
+	 * The possibility of a few unreplicated attributes being out of date
+	 * turns out not to be a problem.
+	 * For this reason, a lock on sam.ldb (which is a TDB) won't achieve
+	 * the same end as locking metadata.tdb, unless we made a modification
+	 * to the @ records found there before every prepare_commit.
+	 */
+	ret = partition_metadata_start_trans(module);
+	if (ret != LDB_SUCCESS) {
+		return ret;
+	}
+
 	ret = ldb_next_start_trans(module);
 	if (ret != LDB_SUCCESS) {
+		partition_metadata_del_trans(module);
 		return ret;
 	}
 
 	ret = partition_reload_if_required(module, data, NULL);
 	if (ret != LDB_SUCCESS) {
 		ldb_next_del_trans(module);
+		partition_metadata_del_trans(module);
 		return ret;
 	}
 
+	/*
+	 * The following per partition locks are required mostly because TDB
+	 * and MDB require locks before read and write ops are permitted.
+	 */
 	for (i=0; data && data->partitions && data->partitions[i]; i++) {
 		if ((module && ldb_module_flags(ldb_module_get_ctx(module)) & LDB_FLG_ENABLE_TRACING)) {
 			ldb_debug(ldb_module_get_ctx(module), LDB_DEBUG_TRACE, "partition_start_trans() -> %s",
@@ -1072,20 +1112,6 @@ int partition_start_trans(struct ldb_module *module)
 		}
 	}
 
-	/*
-	 * Because in prepare_commit this must come last, to ensure
-	 * lock ordering we have to do this last here also 
-	 */
-	ret = partition_metadata_start_trans(module);
-	if (ret != LDB_SUCCESS) {
-		/* Back it out, if it fails on one */
-		for (i--; i >= 0; i--) {
-			ldb_next_del_trans(data->partitions[i]->module);
-		}
-		ldb_next_del_trans(module);
-		return ret;
-	}
-
 	data->in_transaction++;
 
 	return LDB_SUCCESS;
@@ -1099,6 +1125,15 @@ int partition_prepare_commit(struct ldb_module *module)
 							      struct partition_private_data);
 	int ret;
 
+	/*
+	 * Order of prepare_commit calls must match that in
+	 * partition_start_trans. See comment in that function for detail.
+	 */
+	ret = partition_metadata_prepare_commit(module);
+	if (ret != LDB_SUCCESS) {
+		return ret;
+	}
+
 	ret = ldb_next_prepare_commit(module);
 	if (ret != LDB_SUCCESS) {
 		return ret;
@@ -1122,9 +1157,7 @@ int partition_prepare_commit(struct ldb_module *module)
 		ldb_debug(ldb_module_get_ctx(module), LDB_DEBUG_TRACE, "partition_prepare_commit() -> (metadata partition)");
 	}
 
-	/* metadata prepare commit must come last, as other partitions could modify
-	 * the database inside the prepare commit method of a module */
-	return partition_metadata_prepare_commit(module);
+	return LDB_SUCCESS;
 }
 
 
@@ -1132,9 +1165,11 @@ int partition_prepare_commit(struct ldb_module *module)
 int partition_end_trans(struct ldb_module *module)
 {
 	int ret, ret2;
-	unsigned int i;
+	int i;
+	struct ldb_context *ldb = ldb_module_get_ctx(module);
 	struct partition_private_data *data = talloc_get_type(ldb_module_get_private(module),
 							      struct partition_private_data);
+	bool trace = module && ldb_module_flags(ldb) & LDB_FLG_ENABLE_TRACING;
 
 	ret = LDB_SUCCESS;
 
@@ -1145,22 +1180,32 @@ int partition_end_trans(struct ldb_module *module)
 		data->in_transaction--;
 	}
 
-
-	for (i=0; data && data->partitions && data->partitions[i]; i++) {
-		if ((module && ldb_module_flags(ldb_module_get_ctx(module)) & LDB_FLG_ENABLE_TRACING)) {
-			ldb_debug(ldb_module_get_ctx(module), LDB_DEBUG_TRACE, "partition_end_trans() -> %s",
-				  ldb_dn_get_linearized(data->partitions[i]->ctrl->dn));
-		}
-		ret2 = ldb_next_end_trans(data->partitions[i]->module);
-		if (ret2 != LDB_SUCCESS) {
-			ldb_asprintf_errstring(ldb_module_get_ctx(module), "end_trans error on %s: %s",
-					       ldb_dn_get_linearized(data->partitions[i]->ctrl->dn),
-					       ldb_errstring(ldb_module_get_ctx(module)));
-			ret = ret2;
+	/*
+	 * Order of end_trans calls must be the reverse of that in
+	 * partition_start_trans. See comment in that function for detail.
+	 */
+	if (data && data->partitions) {
+		for (i=0; data->partitions[i]; i++);;
+		for (i--; i>=0; i--) {
+			struct dsdb_partition *p = data->partitions[i];
+			if (trace) {
+				ldb_debug(ldb,
+					  LDB_DEBUG_TRACE,
+					  "partition_end_trans() -> %s",
+					  ldb_dn_get_linearized(p->ctrl->dn));
+			}
+			ret2 = ldb_next_end_trans(p->module);
+			if (ret2 != LDB_SUCCESS) {
+				ldb_asprintf_errstring(ldb,
+					"end_trans error on %s: %s",
+					ldb_dn_get_linearized(p->ctrl->dn),
+					ldb_errstring(ldb));
+				ret = ret2;
+			}
 		}
 	}
 
-	if ((module && ldb_module_flags(ldb_module_get_ctx(module)) & LDB_FLG_ENABLE_TRACING)) {
+	if (trace) {
 		ldb_debug(ldb_module_get_ctx(module), LDB_DEBUG_TRACE, "partition_end_trans() -> (metadata partition)");
 	}
 	ret2 = ldb_next_end_trans(module);
@@ -1180,27 +1225,38 @@ int partition_end_trans(struct ldb_module *module)
 int partition_del_trans(struct ldb_module *module)
 {
 	int ret, final_ret = LDB_SUCCESS;
-	unsigned int i;
+	int i;
+	struct ldb_context *ldb = ldb_module_get_ctx(module);
 	struct partition_private_data *data = talloc_get_type(ldb_module_get_private(module),
 							      struct partition_private_data);
+	bool trace = module && ldb_module_flags(ldb) & LDB_FLG_ENABLE_TRACING;
 
-	for (i=0; data && data->partitions && data->partitions[i]; i++) {
-		if (ldb_module_flags(ldb_module_get_ctx(module)) &
-		    LDB_FLG_ENABLE_TRACING) {
-			ldb_debug(ldb_module_get_ctx(module), LDB_DEBUG_TRACE, "partition_del_trans() -> %s",
-				  ldb_dn_get_linearized(data->partitions[i]->ctrl->dn));
-		}
-		ret = ldb_next_del_trans(data->partitions[i]->module);
-		if (ret != LDB_SUCCESS) {
-			ldb_asprintf_errstring(ldb_module_get_ctx(module), "del_trans error on %s: %s",
-					       ldb_dn_get_linearized(data->partitions[i]->ctrl->dn),
-					       ldb_errstring(ldb_module_get_ctx(module)));
-			final_ret = ret;
+	/*
+	 * Order of del_trans calls must be the reverse of that in
+	 * partition_start_trans. See comment in that function for detail.
+	 */
+	if (data && data->partitions) {
+		for (i=0; data->partitions[i]; i++);;
+		for (i--; i>=0; i--) {
+			struct dsdb_partition *p = data->partitions[i];
+			if (trace) {
+				ldb_debug(ldb,
+					  LDB_DEBUG_TRACE,
+					  "partition_del_trans() -> %s",
+					  ldb_dn_get_linearized(p->ctrl->dn));
+			}
+			ret = ldb_next_del_trans(p->module);
+			if (ret != LDB_SUCCESS) {
+				ldb_asprintf_errstring(ldb,
+					"del_trans error on %s: %s",
+					ldb_dn_get_linearized(p->ctrl->dn),
+					ldb_errstring(ldb));
+				final_ret = ret;
+			}
 		}
-	}	
+	}
 
-	if (ldb_module_flags(ldb_module_get_ctx(module)) &
-	    LDB_FLG_ENABLE_TRACING) {
+	if (trace) {
 		ldb_debug(ldb_module_get_ctx(module), LDB_DEBUG_TRACE, "partition_del_trans() -> (metadata partition)");
 	}
 	ret = ldb_next_del_trans(module);
@@ -1382,9 +1438,9 @@ static int partition_sequence_number(struct ldb_module *module, struct ldb_reque
 /* lock all the backends */
 int partition_read_lock(struct ldb_module *module)
 {
-	int i;
-	int ret;
-	int ret2;
+	int i = 0;
+	int ret = 0;
+	int ret2 = 0;
 	struct ldb_context *ldb = ldb_module_get_ctx(module);
 	struct partition_private_data *data = \
 		talloc_get_type(ldb_module_get_private(module),
@@ -1430,9 +1486,8 @@ int partition_read_lock(struct ldb_module *module)
 	}
 
 	/*
-	 * This will lock the metadata partition (sam.ldb) and
-	 * will also call event loops, so we do it before we
-	 * get the whole db lock.
+	 * This will lock sam.ldb and will also call event loops,
+	 * so we do it before we get the whole db lock.
 	 */
 	ret = partition_reload_if_required(module, data, NULL);
 	if (ret != LDB_SUCCESS) {
@@ -1440,8 +1495,20 @@ int partition_read_lock(struct ldb_module *module)
 	}
 
 	/*
-	 * This order must match that in prepare_commit(), start with
-	 * the top level DB (sam.ldb) lock
+	 * Order of read_lock calls must match that in partition_start_trans.
+	 * See comment in that function for detail.
+	 */
+	ret = partition_metadata_read_lock(module);
+	if (ret != LDB_SUCCESS) {
+		goto failed;
+	}
+
+	/*
+	 * The top level DB (sam.ldb) lock is not enough to block another
+	 * process in prepare_commit(), because if nothing was changed in the
+	 * specific backend, then prepare_commit() is a no-op. Therefore the
+	 * metadata.tdb lock is taken out above, as it is the best we can do
+	 * right now.
 	 */
 	ret = ldb_next_read_lock(module);
 	if (ret != LDB_SUCCESS) {
@@ -1455,12 +1522,8 @@ int partition_read_lock(struct ldb_module *module)
 	}
 
 	/*
-	 * The top level DB (sam.ldb) lock is not
-	 * enough to block another process in prepare_commit(),
-	 * because prepare_commit() is a no-op, if nothing
-	 * was changed in the specific backend.
-	 *
-	 * That means the following per partition locks are required.
+	 * The following per partition locks are required mostly because TDB
+	 * and MDB require locks before reads are permitted.
 	 */
 	for (i=0; data && data->partitions && data->partitions[i]; i++) {
 		if ((module && ldb_module_flags(ldb) & LDB_FLG_ENABLE_TRACING)) {
@@ -1485,15 +1548,6 @@ int partition_read_lock(struct ldb_module *module)
 		goto failed;
 	}
 
-	/*
-	 * Because in prepare_commit this must come last, to ensure
-	 * lock ordering we have to do this last here also
-	 */
-	ret = partition_metadata_read_lock(module);
-	if (ret != LDB_SUCCESS) {
-		goto failed;
-	}
-
 	return LDB_SUCCESS;
 
 failed:
@@ -1529,40 +1583,42 @@ int partition_read_unlock(struct ldb_module *module)
 	struct partition_private_data *data = \
 		talloc_get_type(ldb_module_get_private(module),
 				struct partition_private_data);
+	bool trace = module && ldb_module_flags(ldb) & LDB_FLG_ENABLE_TRACING;
 
 	/*
-	 * This order must be similar to partition_{end,del}_trans()
-	 * the metadata partition (sam.ldb) unlock must be at the end.
+	 * Order of read_unlock calls must be the reverse of that in
+	 * partition_start_trans. See comment in that function for detail.
 	 */
-
-	for (i=0; data && data->partitions && data->partitions[i]; i++) {
-		if ((module && ldb_module_flags(ldb) & LDB_FLG_ENABLE_TRACING)) {
-			ldb_debug(ldb, LDB_DEBUG_TRACE,
-				  "partition_read_unlock() -> %s",
-				  ldb_dn_get_linearized(
-					  data->partitions[i]->ctrl->dn));
-		}
-		ret2 = ldb_next_read_unlock(data->partitions[i]->module);
-		if (ret2 != LDB_SUCCESS) {
-			ldb_debug_set(ldb,
-				      LDB_DEBUG_FATAL,
-				      "Failed to lock db: %s / %s for %s",
-				      ldb_errstring(ldb),
-				      ldb_strerror(ret),
-				      ldb_dn_get_linearized(
-					      data->partitions[i]->ctrl->dn));
-
-			/*
-			 * Don't overwrite the original failure code
-			 * if there was one
-			 */
-			if (ret == LDB_SUCCESS) {
-				ret = ret2;
+	if (data && data->partitions) {
+		for (i=0; data->partitions[i]; i++);;
+		for (i--; i>=0; i--) {
+			struct dsdb_partition *p = data->partitions[i];
+			if (trace) {
+				ldb_debug(ldb, LDB_DEBUG_TRACE,
+					  "partition_read_unlock() -> %s",
+					  ldb_dn_get_linearized(p->ctrl->dn));
+			}
+			ret2 = ldb_next_read_unlock(p->module);
+			if (ret2 != LDB_SUCCESS) {
+				ldb_debug_set(ldb,
+					   LDB_DEBUG_FATAL,
+					   "Failed to lock db: %s / %s for %s",
+					   ldb_errstring(ldb),
+					   ldb_strerror(ret),
+					   ldb_dn_get_linearized(p->ctrl->dn));
+
+				/*
+				 * Don't overwrite the original failure code
+				 * if there was one
+				 */
+				if (ret == LDB_SUCCESS) {
+					ret = ret2;
+				}
 			}
 		}
 	}
 
-	if (ldb_module_flags(ldb) & LDB_FLG_ENABLE_TRACING) {
+	if (trace) {
 		ldb_debug(ldb, LDB_DEBUG_TRACE,
 			  "partition_read_unlock() -> (metadata partition)");
 	}
@@ -1584,10 +1640,6 @@ int partition_read_unlock(struct ldb_module *module)
 		}
 	}
 
-	/*
-	 * Because in prepare_commit this must come last, to ensure
-	 * lock ordering we have to do this last here also
-	 */
 	ret = partition_metadata_read_unlock(module);
 
 	/*


-- 
Samba Shared Repository