[SCM] The rsync repository. - branch master updated

Rsync CVS commit messages rsync-cvs at lists.samba.org
Thu Sep 15 17:30:48 UTC 2022


The branch, master has been updated
       via  5b27d2e6 Pre-compute FILE_SUM_EXTRA_CNT.
       via  7e634f53 We always add a slash now that path is cleaned.
       via  8fe8cfd6 Use string length diff heuristic to skip Levenshtein Algo (#369)
      from  7a2dbf71 Make the implied-arg adding for --relative more efficient.

https://git.samba.org/?p=rsync.git;a=shortlog;h=master


- Log -----------------------------------------------------------------
commit 5b27d2e6f31dc0f7bd259771121e778486030577
Author: Wayne Davison <wayne at opencoder.net>
Date:   Thu Sep 15 10:25:32 2022 -0700

    Pre-compute FILE_SUM_EXTRA_CNT.

commit 7e634f5355d6dd96c5be7957837cba5a8154c70c
Author: Wayne Davison <wayne at opencoder.net>
Date:   Thu Sep 15 10:13:07 2022 -0700

    We always add a slash now that path is cleaned.

commit 8fe8cfd60af417f5467f7723075f5ad050b806c8
Author: Kenneth Finnegan <kennethfinnegan2007 at gmail.com>
Date:   Thu Sep 15 10:12:02 2022 -0700

    Use string length diff heuristic to skip Levenshtein Algo (#369)
    
    When using the --fuzzy option to try and find close matches locally,
    the edit distance algorithm used is O(N^2), which can get painful on
    CPU constrained systems when working in folders with tens of thousands
    of files in it.
    
    The lower bound on the calculated Levenshtein distance is the difference
    of the two strings being compared, so if that difference is larger than
    the current best match, the calculation of the exact edit distance between
    the two strings can be skipped.
    
    Testing on the OpenSUSE package repo has shown a 50% reduction in the CPU time
    required to plan the rsync transaction.

-----------------------------------------------------------------------

Summary of changes:
 checksum.c  | 4 +++-
 exclude.c   | 7 ++-----
 generator.c | 9 ++++++---
 rsync.h     | 4 ++--
 util1.c     | 9 ++++++++-
 5 files changed, 21 insertions(+), 12 deletions(-)


Changeset truncated at 500 lines:

diff --git a/checksum.c b/checksum.c
index 68ea0fa0..b5363bca 100644
--- a/checksum.c
+++ b/checksum.c
@@ -104,7 +104,7 @@ const EVP_MD *xfer_sum_evp_md;
 int xfer_sum_len;
 struct name_num_item *file_sum_nni; /* used for the pre-transfer --checksum computations */
 const EVP_MD *file_sum_evp_md;
-int file_sum_len;
+int file_sum_len, file_sum_extra_cnt;
 
 #ifdef USE_OPENSSL
 EVP_MD_CTX *ctx_evp = NULL;
@@ -197,6 +197,8 @@ void parse_checksum_choice(int final_call)
 	xfer_sum_evp_md = csum_evp_md(xfer_sum_nni);
 	file_sum_evp_md = csum_evp_md(file_sum_nni);
 
+	file_sum_extra_cnt = (file_sum_len + EXTRA_LEN - 1) / EXTRA_LEN;
+
 	if (xfer_sum_nni->num == CSUM_NONE)
 		whole_file = 1;
 
diff --git a/exclude.c b/exclude.c
index 5c0b0a69..ffe55b16 100644
--- a/exclude.c
+++ b/exclude.c
@@ -555,15 +555,12 @@ void add_implied_include(const char *arg, int skip_daemon_module)
 				p += arg_len;
 			}
 		}
-		if (p[-1] != '/') {
-			*p++ = '/';
-			slash_cnt++;
-		}
+		*p++ = '/';
 		*p++ = '*';
 		if (recurse)
 			*p++ = '*';
 		*p = '\0';
-		rule->u.slash_cnt = slash_cnt;
+		rule->u.slash_cnt = slash_cnt + 1;
 		rule->next = implied_filter_list.head;
 		implied_filter_list.head = rule;
 		if (DEBUG_GTE(FILTER, 3))
diff --git a/generator.c b/generator.c
index 935c84f9..21c4a595 100644
--- a/generator.c
+++ b/generator.c
@@ -875,9 +875,12 @@ static struct file_struct *find_fuzzy(struct file_struct *file, struct file_list
 			len = strlen(name);
 			suf = find_filename_suffix(name, len, &suf_len);
 
-			dist = fuzzy_distance(name, len, fname, fname_len);
-			/* Add some extra weight to how well the suffixes match. */
-			dist += fuzzy_distance(suf, suf_len, fname_suf, fname_suf_len) * 10;
+			dist = fuzzy_distance(name, len, fname, fname_len, lowest_dist);
+			/* Add some extra weight to how well the suffixes match unless we've already disqualified
+			 * this file based on a heuristic. */
+			if (dist < 0xFFFF0000U) {
+				dist += fuzzy_distance(suf, suf_len, fname_suf, fname_suf_len, 0xFFFF0000U) * 10;
+			}
 			if (DEBUG_GTE(FUZZY, 2)) {
 				rprintf(FINFO, "fuzzy distance for %s = %d.%05d\n",
 					f_name(fp, NULL), (int)(dist>>16), (int)(dist&0xFFFF));
diff --git a/rsync.h b/rsync.h
index 0a5ff809..2c5e5376 100644
--- a/rsync.h
+++ b/rsync.h
@@ -826,7 +826,7 @@ extern int uid_ndx;
 extern int gid_ndx;
 extern int acls_ndx;
 extern int xattrs_ndx;
-extern int file_sum_len;
+extern int file_sum_extra_cnt;
 
 #ifdef USE_FLEXIBLE_ARRAY
 #define FILE_STRUCT_LEN (sizeof (struct file_struct))
@@ -837,7 +837,7 @@ extern int file_sum_len;
 #define DEV_EXTRA_CNT 2
 #define DIRNODE_EXTRA_CNT 3
 #define EXTRA64_CNT ((sizeof (union file_extras64) + EXTRA_LEN - 1) / EXTRA_LEN)
-#define SUM_EXTRA_CNT ((file_sum_len + EXTRA_LEN - 1) / EXTRA_LEN)
+#define SUM_EXTRA_CNT file_sum_extra_cnt
 
 #define REQ_EXTRA(f,ndx) ((union file_extras*)(f) - (ndx))
 #define OPT_EXTRA(f,bump) ((union file_extras*)(f) - file_extra_cnt - 1 - (bump))
diff --git a/util1.c b/util1.c
index 671f3c75..da50ff1e 100644
--- a/util1.c
+++ b/util1.c
@@ -1487,12 +1487,19 @@ const char *find_filename_suffix(const char *fn, int fn_len, int *len_ptr)
 
 #define UNIT (1 << 16)
 
-uint32 fuzzy_distance(const char *s1, unsigned len1, const char *s2, unsigned len2)
+uint32 fuzzy_distance(const char *s1, unsigned len1, const char *s2, unsigned len2, uint32 upperlimit)
 {
 	uint32 a[MAXPATHLEN], diag, above, left, diag_inc, above_inc, left_inc;
 	int32 cost;
 	unsigned i1, i2;
 
+	/* Check to see if the Levenshtein distance must be greater than the
+	 * upper limit defined by the previously found lowest distance using
+	 * the heuristic that the Levenshtein distance is greater than the
+	 * difference in length of the two strings */
+	if ((len1 > len2 ? len1 - len2 : len2 - len1) * UNIT > upperlimit)
+		return 0xFFFFU * UNIT + 1;
+
 	if (!len1 || !len2) {
 		if (!len1) {
 			s1 = s2;


-- 
The rsync repository.



More information about the rsync-cvs mailing list