[RFC] dynamic checksum size

jw schultz jw at pegasys.ws
Sun Mar 23 07:40:26 EST 2003


Currently rsync has a bit of a problem with very large
files.  Dynamic block sizes were introduced to try handle that
automatically if the user didn't specify a block size.
Unfortunately that isn't enough and the block size would
need to grow faster than the file.  Besides, overly large block
sizes mean large amounts of data need to be copied even for
small changes.

The maths indicate that increasing the checksum length would
rapidly get eliminate the problem.  However, increasing the
checksum length regardless of file size burns bandwidth so
per-file checksum sizes were wanted. Unfortunately that
required a protocol bump which was undesirable at the time.

The two attached patches implement per-file dynamic checksum
lengths.

The dynsumlen patch lays the groundwork.  It makes the
checksum length a per-file attribute alongside the block
size.  This required a protocol bump and reaching into
almost every routine that touches block checksums.  I found
that there was a great deal of code duplication involved
with reading and writing the sum_struct fields.  I have
eliminated the duplication by consolidating that transmission
in the new (read|write)_sum_head functions.  A added bonus
is that write_sum_head replaces the send_null_sums function.
The Adleresque checksum1 is in no way affected.

The varsumlen patch builds on that groundwork by
implementing a simple heuristic to generate the per-file
sum2 lengths.  It remains two bytes until the file is 8193
blocks at which it increments and continues incrementing
each time the block count quadruples.  This builds on the
dynamic block size code (which is consolidated as well) so
that the point where we go to a 3 byte sum2 is approximately
125MB.  Specifying a fixed block size on the command-line
will have an effect on the heuristic i present here.

Here is a quick table showing the file sizes at which
transition occurs, the sum2 length and the jump in size of
the block sum table transmitted:

	filesize     sum2len	transition
	<125MB		2
	 125MB		3	  8KB
	   1GB		4	 32KB
	   4GB		5	128KB
	  16GB		6	523KB
	  64GB		7	  2MB

This heuristic is a stab in the dark.  It may be that it is
too aggressive in the rate of increase or insufficiently
aggressive at starting the increase.  It may be that a 8
bits of sum2 in conjunction with the 32 bit adler is enough
for smaller files.  We may also wish to revisit the
heuristic for block sizes that fixes the block count at 1000
for files between 700KB and 16MB, or coordinate them so that
the transitions would be easier.  Both heuristics can be
changed at will without breaking compatibility.

These patches are against current CVS.  The varsumlen patch
depends on the dynsumlen patch.  I have done only minimal
testing.

-- 
________________________________________________________________
	J.W. Schultz            Pegasystems Technologies
	email address:		jw at pegasys.ws

		Remember Cernan and Schmitt
-------------- next part --------------
--- rsync.h		Sat Mar 22 02:34:59 2003
+++ rsync.h.dynsumlen	Sat Mar 22 09:47:43 2003
@@ -50,7 +50,7 @@
 #define SAME_TIME (1<<7)
 
 /* update this if you make incompatible changes */
-#define PROTOCOL_VERSION 26
+#define PROTOCOL_VERSION 27
 
 /* We refuse to interoperate with versions that are not in this range.
  * Note that we assume we'll work with later versions: the onus is on
@@ -385,7 +385,8 @@
 	OFF_T flength;		/**< total file length */
 	size_t count;		/**< how many chunks */
 	size_t remainder;	/**< flength % block_length */
-	size_t n;		/**< block_length */
+	size_t blength;		/**< block_length */
+	size_t s2length;	/**< csum_length */
 	struct sum_buf *sums;	/**< points to info for each chunk */
 };
 
--- generator.c			Sat Mar 22 03:38:21 2003
+++ generator.c.dynsumlen	Sat Mar 22 09:47:43 2003
@@ -109,13 +109,21 @@
 
 
 /*
-  send a header that says "we have no checksums" down the f_out fd
+ * 	NULL sum_struct means we have no checksums
   */
-static void send_null_sums(int f_out)
+
+void write_sum_head(int f, struct sum_struct *sum)
 {
-	write_int(f_out, 0);
-	write_int(f_out, block_size);
-	write_int(f_out, 0);
+	static struct sum_struct null_sum;
+
+	if (sum == (struct sum_struct *)NULL)
+		sum = &null_sum;
+
+	write_int(f, sum->count);
+	write_int(f, sum->blength);
+	if (remote_version >= 27)
+		write_int(f, sum->s2length);
+	write_int(f, sum->remainder);
 }
 
 
@@ -157,19 +165,18 @@
 
 	sum.count = (len + (block_len - 1)) / block_len;
 	sum.remainder = (len % block_len);
-	sum.n = block_len;
+	sum.blength	= block_len;
 	sum.flength = len;
+	sum.s2length	= csum_length;
 	/* not needed here  sum.sums = NULL; */
 
 	if (sum.count && verbose > 3) {
 		rprintf(FINFO, "count=%ld rem=%ld n=%ld flength=%.0f\n",
 			(long) sum.count, (long) sum.remainder,
-			(long) sum.n, (double) sum.flength);
+			(long) sum.blength, (double) sum.flength);
 	}
 
-	write_int(f_out, sum.count);
-	write_int(f_out, sum.n);
-	write_int(f_out, sum.remainder);
+	write_sum_head(f_out, &sum);
 
 	for (i = 0; i < sum.count; i++) {
 		int n1 = MIN(len, block_len);
@@ -185,7 +192,7 @@
 				i, (double) offset, n1, (unsigned long) sum1);
 		}
 		write_int(f_out, sum1);
-		write_buf(f_out, sum2, csum_length);
+		write_buf(f_out, sum2, sum.s2length);
 		len -= n1;
 		offset += n1;
 	}
@@ -377,7 +384,7 @@
 	if (statret == -1) {
 		if (errno == ENOENT) {
 			write_int(f_out,i);
-			if (!dry_run) send_null_sums(f_out);
+			if (!dry_run) write_sum_head(f_out, NULL);
 		} else {
 			if (verbose > 1)
 				rprintf(FERROR, RSYNC_NAME
@@ -394,7 +401,7 @@
 
 		/* now pretend the file didn't exist */
 		write_int(f_out,i);
-		if (!dry_run) send_null_sums(f_out);
+		if (!dry_run) write_sum_head(f_out, NULL);
 		return;
 	}
 
@@ -423,7 +430,7 @@
 
 	if (disable_deltas_p()) {
 		write_int(f_out,i);
-		send_null_sums(f_out);
+		write_sum_head(f_out, NULL);
 		return;
 	}
 
@@ -434,7 +441,7 @@
 		rprintf(FERROR,RSYNC_NAME": failed to open \"%s\", continuing : %s\n",fnamecmp,strerror(errno));
 		/* pretend the file didn't exist */
 		write_int(f_out,i);
-		send_null_sums(f_out);
+		write_sum_head(f_out, NULL);
 		return;
 	}
 
--- match.c	Sat Mar 22 03:39:11 2003
+++ match.c.dynsumlen	Sat Mar 22 09:47:43 2003
@@ -19,8 +19,6 @@
 
 #include "rsync.h"
 
-extern int csum_length;
-
 extern int verbose;
 extern int am_server;
 
@@ -154,11 +152,11 @@
 
 	if (verbose > 2)
 		rprintf(FINFO,"hash search b=%ld len=%.0f\n",
-			(long) s->n, (double)len);
+			(long) s->blength, (double)len);
 
-	/* cast is to make s->n signed; it should always be reasonably
+	/* cast is to make s->blength signed; it should always be reasonably
 	 * small */
-	k = MIN(len, (OFF_T) s->n);
+	k = MIN(len, (OFF_T) s->blength);
 	
 	map = (schar *)map_ptr(buf,0,k);
 	
@@ -173,8 +171,8 @@
 	end = len + 1 - s->sums[s->count-1].len;
 	
 	if (verbose > 3)
-		rprintf(FINFO, "hash search s->n=%ld len=%.0f count=%ld\n",
-			(long) s->n, (double) len, (long) s->count);
+		rprintf(FINFO, "hash search s->blength=%ld len=%.0f count=%ld\n",
+			(long) s->blength, (double) len, (long) s->count);
 	
 	do {
 		tag t = gettag2(s1,s2);
@@ -196,7 +194,7 @@
 			if (sum != s->sums[i].sum1) continue;
 			
 			/* also make sure the two blocks are the same length */
-			l = MIN(s->n,len-offset);
+			l = MIN(s->blength,len-offset);
 			if (l != s->sums[i].len) continue;			
 
 			if (verbose > 3)
@@ -209,7 +207,7 @@
 				done_csum2 = 1;
 			}
 			
-			if (memcmp(sum2,s->sums[i].sum2,csum_length) != 0) {
+			if (memcmp(sum2,s->sums[i].sum2,s->s2length) != 0) {
 				false_alarms++;
 				continue;
 			}
@@ -220,7 +218,7 @@
 				int i2 = targets[j].i;
 				if (i2 == last_i + 1) {
 					if (sum != s->sums[i2].sum1) break;
-					if (memcmp(sum2,s->sums[i2].sum2,csum_length) != 0) break;
+					if (memcmp(sum2,s->sums[i2].sum2,s->s2length) != 0) break;
 					/* we've found an adjacent match - the RLL coder 
 					   will be happy */
 					i = i2;
@@ -232,7 +230,7 @@
 			
 			matched(f,s,buf,offset,i);
 			offset += s->sums[i].len - 1;
-			k = MIN((len-offset), s->n);
+			k = MIN((len-offset), s->blength);
 			map = (schar *)map_ptr(buf,offset,k);
 			sum = get_checksum1((char *)map, k);
 			s1 = sum & 0xFFFF;
@@ -262,9 +260,9 @@
 		   running match, the checksum update and the
 		   literal send. */
 		if (offset > last_match &&
-		    offset-last_match >= CHUNK_SIZE+s->n && 
+		    offset-last_match >= CHUNK_SIZE+s->blength && 
 		    (end-offset > CHUNK_SIZE)) {
-			matched(f,s,buf,offset - s->n, -2);
+			matched(f,s,buf,offset - s->blength, -2);
 		}
 	} while (++offset < end);
 	
--- receiver.c			Sat Mar 22 08:23:21 2003
+++ receiver.c.dynsumlen	Sat Mar 22 09:47:43 2003
@@ -207,7 +207,8 @@
 			OFF_T total_size)
 {
 	int i;
-	unsigned int n,remainder,len,count;
+	struct sum_struct sum;
+	unsigned int len;
 	OFF_T offset = 0;
 	OFF_T offset2;
 	char *data;
@@ -215,9 +216,7 @@
 	static char file_sum2[MD4_SUM_LENGTH];
 	char *map=NULL;
 	
-	count = read_int(f_in);
-	n = read_int(f_in);
-	remainder = read_int(f_in);
+	read_sum_head(f_in, &sum);
 	
 	sum_init();
 	
@@ -247,10 +246,10 @@
 		} 
 
 		i = -(i+1);
-		offset2 = i*(OFF_T)n;
-		len = n;
-		if (i == (int) count-1 && remainder != 0)
-			len = remainder;
+		offset2 = i*(OFF_T)sum.blength;
+		len = sum.blength;
+		if (i == (int) sum.count-1 && sum.remainder != 0)
+			len = sum.remainder;
 		
 		stats.matched_data += len;
 		
--- sender.c		Sat Mar 22 03:39:34 2003
+++ sender.c.dynsumlen	Sat Mar 22 10:33:00 2003
@@ -37,6 +37,19 @@
  **/
 
 
+void read_sum_head(int f, struct sum_struct *sum)
+{
+	sum->count = read_int(f);
+	sum->blength = read_int(f);
+	if (remote_version < 27)
+	{
+		sum->s2length = csum_length;
+	} else {
+		sum->s2length = read_int(f);  
+	}
+	sum->remainder = read_int(f);  
+}
+
 /**
  * Receive the checksums for a buffer
  **/
@@ -49,14 +62,14 @@
 	s = (struct sum_struct *)malloc(sizeof(*s));
 	if (!s) out_of_memory("receive_sums");
 
-	s->count = read_int(f);
-	s->n = read_int(f);
-	s->remainder = read_int(f);  
+	read_sum_head(f, s);
+
 	s->sums = NULL;
 
 	if (verbose > 3)
 		rprintf(FINFO,"count=%ld n=%ld rem=%ld\n",
-			(long) s->count, (long) s->n, (long) s->remainder);
+			(long) s->count, (long) s->blength,
+			(long) s->remainder);
 
 	if (s->count == 0) 
 		return(s);
@@ -66,7 +79,7 @@
 
 	for (i=0; i < (int) s->count;i++) {
 		s->sums[i].sum1 = read_int(f);
-		read_buf(f,s->sums[i].sum2,csum_length);
+		read_buf(f,s->sums[i].sum2,s->s2length);
 
 		s->sums[i].offset = offset;
 		s->sums[i].i = i;
@@ -74,7 +87,7 @@
 		if (i == (int) s->count-1 && s->remainder != 0) {
 			s->sums[i].len = s->remainder;
 		} else {
-			s->sums[i].len = s->n;
+			s->sums[i].len = s->blength;
 		}
 		offset += s->sums[i].len;
 
@@ -211,9 +224,7 @@
 			if (write_batch)
 				write_batch_delta_file((char *)&i,sizeof(i));
 
-			write_int(f_out,s->count);
-			write_int(f_out,s->n);
-			write_int(f_out,s->remainder);
+			write_sum_head(f_out, s);
 		}
 	  
 		if (verbose > 2)
@@ -240,9 +251,7 @@
                        }
                        else {
                          write_int(f_out,j);
-                         write_int(f_out,s->count);
-                         write_int(f_out,s->n);
-                         write_int(f_out,s->remainder);
+			 write_sum_head(f_out, s);
                          done=0;
                          while (!done) {
                             read_batch_delta_file( (char *) &buff_len, sizeof(int) );
-------------- next part --------------
--- rsync.h		Sat Mar 22 09:47:43 2003
+++ rsync.h.varsumlen	Sat Mar 22 10:41:24 2003
@@ -322,6 +322,7 @@
 /* the length of the md4 checksum */
 #define MD4_SUM_LENGTH 16
 #define SUM_LENGTH 16
+#define SHORT_SUM_LENGTH 2
 
 #ifndef MAXPATHLEN
 #define MAXPATHLEN 1024
--- generator.c			Sat Mar 22 09:47:43 2003
+++ generator.c.varsumlen	Sat Mar 22 10:41:12 2003
@@ -32,7 +32,6 @@
 extern int preserve_hard_links;
 extern int update_only;
 extern int opt_ignore_existing;
-extern int block_size;
 extern int csum_length;
 extern int ignore_times;
 extern int size_only;
@@ -93,20 +92,6 @@
 }
 
 
-/* use a larger block size for really big files */
-static int adapt_block_size(struct file_struct *file, int bsize)
-{
-	int ret;
-
-	if (bsize != BLOCK_SIZE) return bsize;
-
-	ret = file->length / (10000); /* rough heuristic */
-	ret = ret & ~15; /* multiple of 16 */
-	if (ret < bsize) ret = bsize;
-	if (ret > CHUNK_SIZE/2) ret = CHUNK_SIZE/2;
-	return ret;
-}
-
 
 /*
  * 	NULL sum_struct means we have no checksums
@@ -156,24 +141,50 @@
  *
  * Generate approximately one checksum every block_len bytes.
  */
-static void generate_and_send_sums(struct map_struct *buf, OFF_T len,
-				   int block_len, int f_out)
+static void generate_and_send_sums(struct map_struct *buf, OFF_T len, int f_out)
 {
+	extern int block_size;
+	int block_len;
 	size_t i;
 	struct sum_struct sum;
 	OFF_T offset = 0;
 
+	/*	adapt block size if not fixed	*/
+	if (block_size)	
+	{
+		block_len = block_size;
+	} else {
+		block_len = (len / 10000) & ~15; /* rough heuristic */
+		block_len = MIN(block_len, BLOCK_SIZE);
+		block_len = MAX(block_len, (CHUNK_SIZE / 2));
+	}
+
 	sum.count	= (len + (block_len - 1)) / block_len;
 	sum.remainder	= (len % block_len);
 	sum.blength	= block_len;
 	sum.flength	= len;
-	sum.s2length	= csum_length;
+
+	/* adapt checksum length if protocol allows	*/
+	if (remote_version < 27) {
+		sum.s2length	= csum_length;
+	} else {
+		size_t	l = SHORT_SUM_LENGTH;
+		size_t	c = sum.count >> 13;
+		while(c && l < SUM_LENGTH)
+		{
+			c >>= 2;
+			l++;
+		}
+		sum.s2length = l;
+	}
+
 	/* not needed here  sum.sums = NULL; */
 
 	if (sum.count && verbose > 3) {
-		rprintf(FINFO, "count=%ld rem=%ld n=%ld flength=%.0f\n",
+		rprintf(FINFO, "count=%ld rem=%ld blength=%ld s2length=%ld flength=%.0f\n",
 			(long) sum.count, (long) sum.remainder,
-			(long) sum.blength, (double) sum.flength);
+			(long) sum.blength, (long) sum.s2length,
+			(double) sum.flength);
 	}
 
 	write_sum_head(f_out, &sum);
@@ -458,8 +469,7 @@
 		rprintf(FINFO, "generating and sending sums for %d\n", i);
 
 	write_int(f_out,i);
-	generate_and_send_sums(buf, st.st_size,
-			       adapt_block_size(file, block_size), f_out);
+	generate_and_send_sums(buf, st.st_size, f_out);
 
 	close(fd);
 	if (buf) unmap_file(buf);
--- options.c		Sat Mar 22 11:16:03 2003
+++ options.c.varsumlen	Sat Mar 22 10:40:13 2003
@@ -72,7 +72,7 @@
 int keep_partial=0;
 int safe_symlinks=0;
 int copy_unsafe_links=0;
-int block_size=BLOCK_SIZE;
+int block_size=0;
 int size_only=0;
 int bwlimit=0;
 int delete_after=0;
@@ -725,7 +725,7 @@
 
 	if (x != 1) args[ac++] = argstr;
 
-	if (block_size != BLOCK_SIZE) {
+	if (block_size) {
 		snprintf(bsize,sizeof(bsize),"-B%d",block_size);
 		args[ac++] = bsize;
 	}


More information about the rsync mailing list