Storage compression patch for Rsync (unfinished)

Craig Barratt craig at atheros.com
Sat Jan 18 07:24:00 EST 2003


> While the idea of rsyncing with compression is mildly
> attractive i can't say i care for the new compression
> format.  It would be better just to use the standard gzip or
> other format.  If you are going to create a new file type
> you could at least discuss storing the blocksums in it so
> that the receiver wouldn't have to generate them.

Yes!  Caching the block checksums and file checksums could yield a large
improvement for the receiver.  However, an integer checksum seed is mixed
into every block and file MD4 checksum.  By default the seed is the
server's Unix time(), which is sent to the client at startup, so it
differs on every run.

So currently you can't cache block and file checksums, because a cached
value is only valid for the seed it was computed with.  (Technically it
is possible for block checksums: the checksum seed is appended at the end
of each block, so you could cache the MD4 state prior to the checksum
seed being added.  For files you can't, since the checksum seed is at the
start.)

Enter a new option, --checksum-seed=NUM, that allows the checksum seed to
be fixed.  I've attached a patch below against 2.5.6pre1.

The motivation for this is that BackupPC (http://backuppc.sourceforge.net)
will shortly release rsync support, and I plan to support caching
block and file checksums (in addition to the existing compression,
hardlinking among any identical files, etc.).  So it would be really
great if this patch, or something similar, could make it into 2.5.6,
or at a minimum into the contributed patch area in 2.5.6.

[Also, this option is convenient for debugging because it makes the
rsync traffic identical between runs, assuming the file states at
each end are the same too.]

Thanks,
Craig

###########################################################################
diff -bur rsync-2.5.6pre1/checksum.c rsync-2.5.6pre1-csum/checksum.c
--- rsync-2.5.6pre1/checksum.c	Mon Apr  8 01:31:57 2002
+++ rsync-2.5.6pre1-csum/checksum.c	Thu Jan 16 23:38:47 2003
@@ -23,7 +23,7 @@
 
 #define CSUM_CHUNK 64
 
-int checksum_seed = 0;
+extern int checksum_seed;
 extern int remote_version;
 
 /*
diff -bur rsync-2.5.6pre1/compat.c rsync-2.5.6pre1-csum/compat.c
--- rsync-2.5.6pre1/compat.c	Sun Apr  7 20:50:13 2002
+++ rsync-2.5.6pre1-csum/compat.c	Fri Jan 17 21:18:35 2003
@@ -35,7 +35,7 @@
 extern int preserve_times;
 extern int always_checksum;
 extern int checksum_seed;
-
+extern int checksum_seed_set;
 
 extern int remote_version;
 extern int verbose;
@@ -64,11 +64,16 @@
 	
 	if (remote_version >= 12) {
 		if (am_server) {
-		    if (read_batch || write_batch) /* dw */
+		    if (read_batch || write_batch) { /* dw */
+			if ( !checksum_seed_set )
 			checksum_seed = 32761;
-		    else
+		    } else {
+			if ( !checksum_seed_set )
 			checksum_seed = time(NULL);
+		    }
+			/* Send the seed in batch mode too: the client
+			 * side always does read_int() for it. */
 			write_int(f_out,checksum_seed);
 		} else {
 			checksum_seed = read_int(f_in);
 		}
diff -bur rsync-2.5.6pre1/options.c rsync-2.5.6pre1-csum/options.c
--- rsync-2.5.6pre1/options.c	Fri Jan 10 17:30:11 2003
+++ rsync-2.5.6pre1-csum/options.c	Thu Jan 16 23:39:17 2003
@@ -116,6 +116,8 @@
 char *backup_dir = NULL;
 int rsync_port = RSYNC_PORT;
 int link_dest = 0;
+int checksum_seed = 0;
+int checksum_seed_set;
 
 int verbose = 0;
 int quiet = 0;
@@ -274,6 +276,7 @@
   rprintf(F,"     --bwlimit=KBPS          limit I/O bandwidth, KBytes per second\n");
   rprintf(F,"     --write-batch=PREFIX    write batch fileset starting with PREFIX\n");
   rprintf(F,"     --read-batch=PREFIX     read batch fileset starting with PREFIX\n");
+  rprintf(F,"     --checksum-seed=NUM     set MD4 checksum seed\n");
   rprintf(F," -h, --help                  show this help screen\n");
 #ifdef INET6
   rprintf(F," -4                          prefer IPv4\n");
@@ -293,7 +296,7 @@
       OPT_COPY_UNSAFE_LINKS, OPT_SAFE_LINKS, OPT_COMPARE_DEST, OPT_LINK_DEST,
       OPT_LOG_FORMAT, OPT_PASSWORD_FILE, OPT_SIZE_ONLY, OPT_ADDRESS,
       OPT_DELETE_AFTER, OPT_EXISTING, OPT_MAX_DELETE, OPT_BACKUP_DIR, 
-      OPT_IGNORE_ERRORS, OPT_BWLIMIT, OPT_BLOCKING_IO,
+      OPT_IGNORE_ERRORS, OPT_BWLIMIT, OPT_BLOCKING_IO, OPT_CHECKSUM_SEED,
       OPT_NO_BLOCKING_IO, OPT_WHOLE_FILE, OPT_NO_WHOLE_FILE,
       OPT_MODIFY_WINDOW, OPT_READ_BATCH, OPT_WRITE_BATCH, OPT_IGNORE_EXISTING};
 
@@ -306,6 +309,7 @@
   {"ignore-times",    'I', POPT_ARG_NONE,   &ignore_times , 0, 0, 0 },
   {"size-only",        0,  POPT_ARG_NONE,   &size_only , 0, 0, 0 },
   {"modify-window",    0,  POPT_ARG_INT,    &modify_window, OPT_MODIFY_WINDOW, 0, 0 },
+  {"checksum-seed",    0,  POPT_ARG_INT,    &checksum_seed, OPT_CHECKSUM_SEED, 0, 0 },
   {"one-file-system", 'x', POPT_ARG_NONE,   &one_file_system , 0, 0, 0 },
   {"delete",           0,  POPT_ARG_NONE,   &delete_mode , 0, 0, 0 },
   {"existing",         0,  POPT_ARG_NONE,   &only_existing , 0, 0, 0 },
@@ -489,6 +493,13 @@
 			modify_window_set = 1;
 			break;
 
+		case OPT_CHECKSUM_SEED:
+                        /* The value has already been set by popt, but
+                         * we need to remember that we're using a
+                         * non-default setting. */
+			checksum_seed_set = 1;
+			break;
+
 		case OPT_DELETE_AFTER:
 			delete_after = 1;
 			delete_mode = 1;
@@ -661,6 +672,7 @@
 	static char iotime[30];
 	static char mdelete[30];
 	static char mwindow[30];
+	static char csumseed[50];
 	static char bw[50];
 	/* Leave room for ``--(write|read)-batch='' */
 	static char fext[MAXPATHLEN + 15];
@@ -795,6 +807,12 @@
 	        snprintf(mwindow,sizeof(mwindow),"--modify-window=%d",
 			 modify_window);
 		args[ac++] = mwindow;
+	}
+
+	if (checksum_seed_set) {
+	        snprintf(csumseed,sizeof(csumseed),"--checksum-seed=%d",
+			 checksum_seed);
+		args[ac++] = csumseed;
 	}
 
 	if (keep_partial)
diff -bur rsync-2.5.6pre1/rsync.yo rsync-2.5.6pre1-csum/rsync.yo
--- rsync-2.5.6pre1/rsync.yo	Wed Jan  8 19:55:10 2003
+++ rsync-2.5.6pre1-csum/rsync.yo	Fri Jan 17 21:19:58 2003
@@ -349,6 +349,7 @@
      --bwlimit=KBPS          limit I/O bandwidth, KBytes per second
      --read-batch=PREFIX     read batch fileset starting with PREFIX
      --write-batch=PREFIX    write batch fileset starting with PREFIX
+     --checksum-seed=NUM     set MD4 checksum seed
  -h, --help                  show this help screen
 
 
@@ -816,6 +817,17 @@
 dit(bf(--read-batch=PREFIX)) Apply a previously generated change batch,
 using the fileset whose filenames start with PREFIX. See the "BATCH
 MODE" section for details.
+
+dit(bf(--checksum-seed=NUM)) Set the MD4 checksum seed to the given
+number.  The 4 byte checksum seed is included in each block and file
+MD4 checksum calculation.  By default the checksum seed is generated
+by the server and defaults to the current time(), or 32761 if
+bf(--write-batch) or bf(--read-batch) are specified.  This causes
+the MD4 block and file checksums to be different each time rsync
+is run, further reducing the probability that different blocks or
+data always produce colliding MD4 checksums.  For applications that
+cache the block or file checksums the checksum seed needs to be fixed
+each time rsync runs using this option.
 
 enddit()
 



More information about the rsync mailing list