Storage compression patch for Rsync (unfinished)

Dave Dykstra dwd at drdykstra.us
Sun Jan 26 21:02:01 EST 2003


Is there any reason why caching programs would need to set the
checksum seed themselves, rather than it just being a fixed value?
I think it is hard to describe what this option is for and what it
should be set to.  Maybe a --fixed-checksum-seed option would make
some sense, or a caching mechanism could be built into rsync if it is
shown to be very valuable.  I don't think I'll include the option in
2.5.6.  I know people have proposed some caching mechanisms in the
past and they've been rejected for one reason or another.

- Dave

On Fri, Jan 17, 2003 at 11:19:35PM -0800, Craig Barratt wrote:
> > While the idea of rsyncing with compression is mildly
> > attractive, I can't say I care for the new compression
> > format.  It would be better just to use the standard gzip or
> > other format.  If you are going to create a new file type
> > you could at least discuss storing the blocksums in it so
> > that the receiver wouldn't have to generate them.
> 
> Yes!  Caching the block checksums and file checksums could yield a large
> improvement for the receiver.  However, an integer checksum seed is used
> in each block and file MD4 checksum. The default value is unix time() on
> the server, sent to the client at startup.
> 
> So currently you can't cache block and file checksums (technically it is
> possible for block checksums since the checksum seed is appended at the
> end of each block, so you could cache the MD4 state prior to the checksum
> seed being added; for files you can't since the checksum seed is at the
> start).
> 
> Enter a new option, --checksum-seed=NUM, that allows the checksum seed to
> be fixed.  I've attached a patch below against 2.5.6pre1.
> 
> The motivation for this is that BackupPC (http://backuppc.sourceforge.net)
> will shortly release rsync support, and I plan to support caching
> block and file checksums (in addition to the existing compression,
> hardlinking among any identical files, etc.).  So it would be really
> great if this patch, or something similar, could make it into 2.5.6
> or at a minimum the contributed patch area in 2.5.6.
> 
> [Also, this option is convenient for debugging because it makes the
> rsync traffic identical between runs, assuming the file states at
> each end are the same too.]
> 
> Thanks,
> Craig
> 
> ###########################################################################
> diff -bur rsync-2.5.6pre1/checksum.c rsync-2.5.6pre1-csum/checksum.c
> --- rsync-2.5.6pre1/checksum.c	Mon Apr  8 01:31:57 2002
> +++ rsync-2.5.6pre1-csum/checksum.c	Thu Jan 16 23:38:47 2003
> @@ -23,7 +23,7 @@
>  
>  #define CSUM_CHUNK 64
>  
> -int checksum_seed = 0;
> +extern int checksum_seed;
>  extern int remote_version;
>  
>  /*
> diff -bur rsync-2.5.6pre1/compat.c rsync-2.5.6pre1-csum/compat.c
> --- rsync-2.5.6pre1/compat.c	Sun Apr  7 20:50:13 2002
> +++ rsync-2.5.6pre1-csum/compat.c	Fri Jan 17 21:18:35 2003
> @@ -35,7 +35,7 @@
>  extern int preserve_times;
>  extern int always_checksum;
>  extern int checksum_seed;
> -
> +extern int checksum_seed_set;
>  
>  extern int remote_version;
>  extern int verbose;
> @@ -64,11 +64,14 @@
>  	
>  	if (remote_version >= 12) {
>  		if (am_server) {
> -		    if (read_batch || write_batch) /* dw */
> +		    if (read_batch || write_batch) { /* dw */
> +			if ( !checksum_seed_set )
>  			checksum_seed = 32761;
> -		    else
> +		    } else {
> +			if ( !checksum_seed_set )
>  			checksum_seed = time(NULL);
>  			write_int(f_out,checksum_seed);
> +		    }
>  		} else {
>  			checksum_seed = read_int(f_in);
>  		}
> diff -bur rsync-2.5.6pre1/options.c rsync-2.5.6pre1-csum/options.c
> --- rsync-2.5.6pre1/options.c	Fri Jan 10 17:30:11 2003
> +++ rsync-2.5.6pre1-csum/options.c	Thu Jan 16 23:39:17 2003
> @@ -116,6 +116,8 @@
>  char *backup_dir = NULL;
>  int rsync_port = RSYNC_PORT;
>  int link_dest = 0;
> +int checksum_seed = 0;
> +int checksum_seed_set;
>  
>  int verbose = 0;
>  int quiet = 0;
> @@ -274,6 +276,7 @@
>    rprintf(F,"     --bwlimit=KBPS          limit I/O bandwidth, KBytes per second\n");
>    rprintf(F,"     --write-batch=PREFIX    write batch fileset starting with PREFIX\n");
>    rprintf(F,"     --read-batch=PREFIX     read batch fileset starting with PREFIX\n");
> +  rprintf(F,"     --checksum-seed=NUM     set MD4 checksum seed\n");
>    rprintf(F," -h, --help                  show this help screen\n");
>  #ifdef INET6
>    rprintf(F," -4                          prefer IPv4\n");
> @@ -293,7 +296,7 @@
>        OPT_COPY_UNSAFE_LINKS, OPT_SAFE_LINKS, OPT_COMPARE_DEST, OPT_LINK_DEST,
>        OPT_LOG_FORMAT, OPT_PASSWORD_FILE, OPT_SIZE_ONLY, OPT_ADDRESS,
>        OPT_DELETE_AFTER, OPT_EXISTING, OPT_MAX_DELETE, OPT_BACKUP_DIR, 
> -      OPT_IGNORE_ERRORS, OPT_BWLIMIT, OPT_BLOCKING_IO,
> +      OPT_IGNORE_ERRORS, OPT_BWLIMIT, OPT_BLOCKING_IO, OPT_CHECKSUM_SEED,
>        OPT_NO_BLOCKING_IO, OPT_WHOLE_FILE, OPT_NO_WHOLE_FILE,
>        OPT_MODIFY_WINDOW, OPT_READ_BATCH, OPT_WRITE_BATCH, OPT_IGNORE_EXISTING};
>  
> @@ -306,6 +309,7 @@
>    {"ignore-times",    'I', POPT_ARG_NONE,   &ignore_times , 0, 0, 0 },
>    {"size-only",        0,  POPT_ARG_NONE,   &size_only , 0, 0, 0 },
>    {"modify-window",    0,  POPT_ARG_INT,    &modify_window, OPT_MODIFY_WINDOW, 0, 0 },
> +  {"checksum-seed",    0,  POPT_ARG_INT,    &checksum_seed, OPT_CHECKSUM_SEED, 0, 0 },
>    {"one-file-system", 'x', POPT_ARG_NONE,   &one_file_system , 0, 0, 0 },
>    {"delete",           0,  POPT_ARG_NONE,   &delete_mode , 0, 0, 0 },
>    {"existing",         0,  POPT_ARG_NONE,   &only_existing , 0, 0, 0 },
> @@ -489,6 +493,13 @@
>  			modify_window_set = 1;
>  			break;
>  
> +		case OPT_CHECKSUM_SEED:
> +                        /* The value has already been set by popt, but
> +                         * we need to remember that we're using a
> +                         * non-default setting. */
> +			checksum_seed_set = 1;
> +			break;
> +
>  		case OPT_DELETE_AFTER:
>  			delete_after = 1;
>  			delete_mode = 1;
> @@ -661,6 +672,7 @@
>  	static char iotime[30];
>  	static char mdelete[30];
>  	static char mwindow[30];
> +	static char csumseed[50];
>  	static char bw[50];
>  	/* Leave room for ``--(write|read)-batch='' */
>  	static char fext[MAXPATHLEN + 15];
> @@ -795,6 +807,12 @@
>  	        snprintf(mwindow,sizeof(mwindow),"--modify-window=%d",
>  			 modify_window);
>  		args[ac++] = mwindow;
> +	}
> +
> +	if (checksum_seed_set) {
> +	        snprintf(csumseed,sizeof(csumseed),"--checksum-seed=%d",
> +			 checksum_seed);
> +		args[ac++] = csumseed;
>  	}
>  
>  	if (keep_partial)
> diff -bur rsync-2.5.6pre1/rsync.yo rsync-2.5.6pre1-csum/rsync.yo
> --- rsync-2.5.6pre1/rsync.yo	Wed Jan  8 19:55:10 2003
> +++ rsync-2.5.6pre1-csum/rsync.yo	Fri Jan 17 21:19:58 2003
> @@ -349,6 +349,7 @@
>       --bwlimit=KBPS          limit I/O bandwidth, KBytes per second
>       --read-batch=PREFIX     read batch fileset starting with PREFIX
>       --write-batch=PREFIX    write batch fileset starting with PREFIX
> +     --checksum-seed=NUM     set MD4 checksum seed
>   -h, --help                  show this help screen
>  
>  
> @@ -816,6 +817,17 @@
>  dit(bf(--read-batch=PREFIX)) Apply a previously generated change batch,
>  using the fileset whose filenames start with PREFIX. See the "BATCH
>  MODE" section for details.
> +
> +dit(bf(--checksum-seed=NUM)) Set the MD4 checksum seed to the given
> +number.  The 4 byte checksum seed is included in each block and file
> +MD4 checksum calculation.  By default the checksum seed is generated
> +by the server and defaults to the current time(), or 32761 if
> +bf(--write-batch) or bf(--read-batch) are specified.  This causes
> +the MD4 block and file checksums to be different each time rsync
> +is run, further reducing the probability that different blocks or
> +data always produce colliding MD4 checksums.  For applications that
> +cache the block or file checksums the checksum seed needs to be fixed
> +each time rsync runs using this option.
>  
>  enddit()



More information about the rsync mailing list