[RFC][patch] dynamic rolling block and sum sizes II
jw schultz
jw at pegasys.ws
Sun Mar 30 10:56:30 EST 2003
Mark II of the patch set.
The first patch (dynsumlen2.patch) increments the protocol
version to support per-file dynamic block checksum sizes.
It is a prerequisite for varsumlen2.patch.
varsumlen2.patch implements per-file dynamic block and checksum
sizes.
The current block size calculation only applies to files
between 7MB and 160MB setting the block size to 1/10,0000 of
the file length for a fixed count of 10,001 blocks.
Starting at 160MB the block size would stay at 16KB and the
count would be 1/15,536 of the file length. A logic error
prevents --block-size=700 from locking the block size.
While an improvement over fixed 700 byte blocks for very
large files the block count would grow linearly with
file size to outrageous numbers (65536/GB). An iso image
would need over 40,000 blocks and with the fixed sum size of
4+2 bytes had a measurable sum collision rate. With the sum
collisions the file would have to be redone with 4+16 byte
sums killing efficiency. Additionally all those blocks
meant a good deal of processing to find matching blocks and
a sizable memory and network load.
varsumlen2.patch calculates the block size to be a rounded square
root of the file size with a 700byte minimum. In this way
block sizes and counts grow smoothly with file size for
files larger than 478KB. A 1MB file will have 1024 1KB
blocks and a 4GB file would have 65,536 64KB blocks.
--block-size will override this regardless of what block
size is specified.
The variable checksum size takes block size into account
using an approximation of the formula provided by Donovan
Baarda on this list (blocksum_bits includes the 32bit sum1).
blocksum_bits = BIAS + 2*log2(file_len) - log2(block_len)
This should result in increased efficiency as files grow,
reduce the likelihood of file-redo and raise the ceiling on
how large a file can be processed by rsync. The one case
where efficiency could be decreased would be huge files with
many tiny changes scattered throughout.
The first table illustrates this showing pertinent data for
a sampling of file sizes. xmit sums shows the relative
network load of just sending checksums. array_size is the
memory footprint sums array which could compounded by the
sum hash could be a limiting factor on the size file a given
machine can reasonably handle.
The second table shows the effect of --block-size=16384
and is indicative of what setting a MAX_BLOCK_SIZE would do.
file length block_len block_count s2length xmit sums array_size
50 700 1 2 6 36
831K 920 926 2 5556 32K
1439K 1208 1221 2 7326 42K
2493K 1592 1604 2 9624 56K
4317K 2096 2110 2 12K 74K
7475K 2760 2774 2 16K 97K
12M 3640 3642 2 21K 128K
21M 4784 4799 2 28K 168K
32M 5824 5835 3 39K 205K
56M 7664 7678 3 52K 269K
97M 9K 9K 3 69K 355K
168M 12K 12K 3 90K 467K
291M 17K 17K 3 119K 614K
504M 22K 22K 3 157K 808K
873M 29K 29K 3 206K 1064K
1513M 38K 38K 3 272K 1400K
2070M 45K 45K 4 366K 1647K
4195M 119K 35K 4 280K 1261K
16G 251K 65K 4 525K 2365K
66G 509K 133K 5 1198K 4795K
261G 1022K 262K 5 2358K 9434K
1033G 2047K 516K 5 4650K 18M
2092G 2047K 1046K 6 10M 36M
4239G 4095K 1060K 6 10M 37M
16T 8191K 2091K 6 20M 73M
64T 15M 4126K 6 40M 145M
130T 15M 8359K 7 89M 293M
file length block_len block_count s2length xmit sums array_size
50 16K 1 2 6 36
65M 16K 4202 3 28K 147K
1063M 16K 66K 4 531K 2392K
16G 16K 1034K 5 9312K 36M
261G 16K 16M 6 163M 589M
4239G 16K 264M 7 2914M 9539M
64T 16K 30M 8 365M 1096M
--
________________________________________________________________
J.W. Schultz Pegasystems Technologies
email address: jw at pegasys.ws
Remember Cernan and Schmitt
-------------- next part --------------
--- rsync.h Sat Mar 29 11:11:30 2003
+++ rsync.h Sat Mar 29 12:15:15 2003
@@ -50,7 +50,7 @@
#define SAME_TIME (1<<7)
/* update this if you make incompatible changes */
-#define PROTOCOL_VERSION 26
+#define PROTOCOL_VERSION 27
/* We refuse to interoperate with versions that are not in this range.
* Note that we assume we'll work with later versions: the onus is on
@@ -406,7 +406,8 @@
OFF_T flength; /**< total file length */
size_t count; /**< how many chunks */
size_t remainder; /**< flength % block_length */
- size_t n; /**< block_length */
+ size_t blength; /**< block_length */
+ size_t s2length; /**< sum2_length */
struct sum_buf *sums; /**< points to info for each chunk */
};
--- proto.h Sat Mar 29 12:18:02 2003
+++ proto.h Sat Mar 29 12:15:38 2003
@@ -91,6 +91,7 @@
struct file_list *flist_new(void);
void flist_free(struct file_list *flist);
char *f_name(struct file_struct *f);
+void write_sum_head(int f, struct sum_struct *sum);
void recv_generator(char *fname, struct file_list *flist, int i, int f_out);
void generate_files(int f,struct file_list *flist,char *local_name,int f_recv);
int main(int argc, char *argv[]);
@@ -190,6 +191,7 @@
int report);
void sig_int(void);
void finish_transfer(char *fname, char *fnametmp, struct file_struct *file);
+void read_sum_head(int f, struct sum_struct *sum);
void send_files(struct file_list *flist,int f_out,int f_in);
int try_bind_local(int s,
int ai_family, int ai_socktype,
--- generator.c Sat Mar 29 11:11:30 2003
+++ generator.c Sat Mar 29 12:16:02 2003
@@ -116,13 +116,21 @@
/*
- send a header that says "we have no checksums" down the f_out fd
+ * NULL sum_struct means we have no checksums
*/
-static void send_null_sums(int f_out)
+
+void write_sum_head(int f, struct sum_struct *sum)
{
- write_int(f_out, 0);
- write_int(f_out, block_size);
- write_int(f_out, 0);
+ static struct sum_struct null_sum;
+
+ if (sum == (struct sum_struct *)NULL)
+ sum = &null_sum;
+
+ write_int(f, sum->count);
+ write_int(f, sum->blength);
+ if (remote_version >= 27)
+ write_int(f, sum->s2length);
+ write_int(f, sum->remainder);
}
@@ -164,19 +172,18 @@
sum.count = (len + (block_len - 1)) / block_len;
sum.remainder = (len % block_len);
- sum.n = block_len;
+ sum.blength = block_len;
sum.flength = len;
+ sum.s2length = csum_length;
/* not needed here sum.sums = NULL; */
if (sum.count && verbose > 3) {
rprintf(FINFO, "count=%ld rem=%ld n=%ld flength=%.0f\n",
(long) sum.count, (long) sum.remainder,
- (long) sum.n, (double) sum.flength);
+ (long) sum.blength, (double) sum.flength);
}
- write_int(f_out, sum.count);
- write_int(f_out, sum.n);
- write_int(f_out, sum.remainder);
+ write_sum_head(f_out, &sum);
for (i = 0; i < sum.count; i++) {
int n1 = MIN(len, block_len);
@@ -192,7 +199,7 @@
i, (double) offset, n1, (unsigned long) sum1);
}
write_int(f_out, sum1);
- write_buf(f_out, sum2, csum_length);
+ write_buf(f_out, sum2, sum.s2length);
len -= n1;
offset += n1;
}
@@ -384,7 +391,7 @@
if (statret == -1) {
if (errno == ENOENT) {
write_int(f_out,i);
- if (!dry_run) send_null_sums(f_out);
+ if (!dry_run) write_sum_head(f_out, NULL);
} else {
if (verbose > 1)
rprintf(FERROR, RSYNC_NAME
@@ -401,7 +408,7 @@
/* now pretend the file didn't exist */
write_int(f_out,i);
- if (!dry_run) send_null_sums(f_out);
+ if (!dry_run) write_sum_head(f_out, NULL);
return;
}
@@ -430,7 +437,7 @@
if (disable_deltas_p()) {
write_int(f_out,i);
- send_null_sums(f_out);
+ write_sum_head(f_out, NULL);
return;
}
@@ -441,7 +448,7 @@
rprintf(FERROR,RSYNC_NAME": failed to open \"%s\", continuing : %s\n",fnamecmp,strerror(errno));
/* pretend the file didn't exist */
write_int(f_out,i);
- send_null_sums(f_out);
+ write_sum_head(f_out, NULL);
return;
}
--- receiver.c Sat Mar 29 11:11:30 2003
+++ receiver.c Sat Mar 29 12:16:23 2003
@@ -230,7 +230,8 @@
OFF_T total_size)
{
int i;
- unsigned int n,remainder,len,count;
+ struct sum_struct sum;
+ unsigned int len;
OFF_T offset = 0;
OFF_T offset2;
char *data;
@@ -238,9 +239,7 @@
static char file_sum2[MD4_SUM_LENGTH];
char *map=NULL;
- count = read_int(f_in);
- n = read_int(f_in);
- remainder = read_int(f_in);
+ read_sum_head(f_in, &sum);
sum_init();
@@ -270,10 +269,10 @@
}
i = -(i+1);
- offset2 = i*(OFF_T)n;
- len = n;
- if (i == (int) count-1 && remainder != 0)
- len = remainder;
+ offset2 = i*(OFF_T)sum.blength;
+ len = sum.blength;
+ if (i == (int) sum.count-1 && sum.remainder != 0)
+ len = sum.remainder;
stats.matched_data += len;
--- sender.c Sat Mar 29 11:11:30 2003
+++ sender.c Sat Mar 29 12:16:32 2003
@@ -37,6 +37,19 @@
**/
+void read_sum_head(int f, struct sum_struct *sum)
+{
+ sum->count = read_int(f);
+ sum->blength = read_int(f);
+ if (remote_version < 27)
+ {
+ sum->s2length = csum_length;
+ } else {
+ sum->s2length = read_int(f);
+ }
+ sum->remainder = read_int(f);
+}
+
/**
* Receive the checksums for a buffer
**/
@@ -49,14 +62,14 @@
s = (struct sum_struct *)malloc(sizeof(*s));
if (!s) out_of_memory("receive_sums");
- s->count = read_int(f);
- s->n = read_int(f);
- s->remainder = read_int(f);
+ read_sum_head(f, s);
+
s->sums = NULL;
if (verbose > 3)
rprintf(FINFO,"count=%ld n=%ld rem=%ld\n",
- (long) s->count, (long) s->n, (long) s->remainder);
+ (long) s->count, (long) s->blength,
+ (long) s->remainder);
if (s->count == 0)
return(s);
@@ -66,7 +79,7 @@
for (i=0; i < (int) s->count;i++) {
s->sums[i].sum1 = read_int(f);
- read_buf(f,s->sums[i].sum2,csum_length);
+ read_buf(f,s->sums[i].sum2,s->s2length);
s->sums[i].offset = offset;
s->sums[i].i = i;
@@ -74,7 +87,7 @@
if (i == (int) s->count-1 && s->remainder != 0) {
s->sums[i].len = s->remainder;
} else {
- s->sums[i].len = s->n;
+ s->sums[i].len = s->blength;
}
offset += s->sums[i].len;
@@ -211,9 +224,7 @@
if (write_batch)
write_batch_delta_file((char *)&i,sizeof(i));
- write_int(f_out,s->count);
- write_int(f_out,s->n);
- write_int(f_out,s->remainder);
+ write_sum_head(f_out, s);
}
if (verbose > 2)
@@ -240,9 +251,7 @@
}
else {
write_int(f_out,j);
- write_int(f_out,s->count);
- write_int(f_out,s->n);
- write_int(f_out,s->remainder);
+ write_sum_head(f_out, s);
done=0;
while (!done) {
read_batch_delta_file( (char *) &buff_len, sizeof(int) );
--- match.c Sat Mar 29 11:11:30 2003
+++ match.c Sat Mar 29 12:16:13 2003
@@ -19,8 +19,6 @@
#include "rsync.h"
-extern int csum_length;
-
extern int verbose;
extern int am_server;
@@ -154,11 +152,11 @@
if (verbose > 2)
rprintf(FINFO,"hash search b=%ld len=%.0f\n",
- (long) s->n, (double)len);
+ (long) s->blength, (double)len);
- /* cast is to make s->n signed; it should always be reasonably
+ /* cast is to make s->blength signed; it should always be reasonably
* small */
- k = MIN(len, (OFF_T) s->n);
+ k = MIN(len, (OFF_T) s->blength);
map = (schar *)map_ptr(buf,0,k);
@@ -173,8 +171,8 @@
end = len + 1 - s->sums[s->count-1].len;
if (verbose > 3)
- rprintf(FINFO, "hash search s->n=%ld len=%.0f count=%ld\n",
- (long) s->n, (double) len, (long) s->count);
+ rprintf(FINFO, "hash search s->blength=%ld len=%.0f count=%ld\n",
+ (long) s->blength, (double) len, (long) s->count);
do {
tag t = gettag2(s1,s2);
@@ -196,7 +194,7 @@
if (sum != s->sums[i].sum1) continue;
/* also make sure the two blocks are the same length */
- l = MIN(s->n,len-offset);
+ l = MIN(s->blength,len-offset);
if (l != s->sums[i].len) continue;
if (verbose > 3)
@@ -209,7 +207,7 @@
done_csum2 = 1;
}
- if (memcmp(sum2,s->sums[i].sum2,csum_length) != 0) {
+ if (memcmp(sum2,s->sums[i].sum2,s->s2length) != 0) {
false_alarms++;
continue;
}
@@ -220,7 +218,7 @@
int i2 = targets[j].i;
if (i2 == last_i + 1) {
if (sum != s->sums[i2].sum1) break;
- if (memcmp(sum2,s->sums[i2].sum2,csum_length) != 0) break;
+ if (memcmp(sum2,s->sums[i2].sum2,s->s2length) != 0) break;
/* we've found an adjacent match - the RLL coder
will be happy */
i = i2;
@@ -232,7 +230,7 @@
matched(f,s,buf,offset,i);
offset += s->sums[i].len - 1;
- k = MIN((len-offset), s->n);
+ k = MIN((len-offset), s->blength);
map = (schar *)map_ptr(buf,offset,k);
sum = get_checksum1((char *)map, k);
s1 = sum & 0xFFFF;
@@ -262,9 +260,9 @@
running match, the checksum update and the
literal send. */
if (offset > last_match &&
- offset-last_match >= CHUNK_SIZE+s->n &&
+ offset-last_match >= CHUNK_SIZE+s->blength &&
(end-offset > CHUNK_SIZE)) {
- matched(f,s,buf,offset - s->n, -2);
+ matched(f,s,buf,offset - s->blength, -2);
}
} while (++offset < end);
-------------- next part --------------
--- rsync.h Sat Mar 29 12:24:35 2003
+++ rsync.h Sat Mar 29 15:11:04 2003
@@ -339,6 +339,8 @@
/* the length of the md4 checksum */
#define MD4_SUM_LENGTH 16
#define SUM_LENGTH 16
+#define SHORT_SUM_LENGTH 2
+#define BLOCKSUM_BIAS 10
#ifndef MAXPATHLEN
#define MAXPATHLEN 1024
--- options.c Sat Mar 29 12:25:34 2003
+++ options.c Sat Mar 29 15:10:42 2003
@@ -72,7 +72,7 @@
int keep_partial=0;
int safe_symlinks=0;
int copy_unsafe_links=0;
-int block_size=BLOCK_SIZE;
+int block_size=0;
int size_only=0;
int bwlimit=0;
int delete_after=0;
@@ -725,7 +725,7 @@
if (x != 1) args[ac++] = argstr;
- if (block_size != BLOCK_SIZE) {
+ if (block_size) {
snprintf(bsize,sizeof(bsize),"-B%d",block_size);
args[ac++] = bsize;
}
--- generator.c Sat Mar 29 12:24:49 2003
+++ generator.c Sat Mar 29 16:21:55 2003
@@ -32,7 +32,6 @@
extern int preserve_hard_links;
extern int update_only;
extern int opt_ignore_existing;
-extern int block_size;
extern int csum_length;
extern int ignore_times;
extern int size_only;
@@ -100,24 +99,9 @@
}
-/* use a larger block size for really big files */
-static int adapt_block_size(struct file_struct *file, int bsize)
-{
- int ret;
-
- if (bsize != BLOCK_SIZE) return bsize;
-
- ret = file->length / (10000); /* rough heuristic */
- ret = ret & ~15; /* multiple of 16 */
- if (ret < bsize) ret = bsize;
- if (ret > CHUNK_SIZE/2) ret = CHUNK_SIZE/2;
- return ret;
-}
-
-
/*
* NULL sum_struct means we have no checksums
- */
+ */
void write_sum_head(int f, struct sum_struct *sum)
{
@@ -133,7 +117,87 @@
write_int(f, sum->remainder);
}
+/*
+ * set (initialize) the size entries in the per-file sum_struct
+ * calulating dynamic block ans checksum sizes.
+ *
+ * This is only called from generate_and_send_sums() but is a seperate
+ * function to encapsulate the logic.
+ *
+ * The block size is a rounded square root of file length.
+ *
+ * The checksum size is determined according to:
+ * blocksum_bits = BLOCKSUM_EXP + 2*log2(file_len) - log2(block_len)
+ * provided by Donovan Baarda which gives a probability of rsync
+ * algorithm corrupting data and falling back using the whole md4
+ * checksums.
+ *
+ * This might be made one of several selectable heuristics.
+ */
+static void sum_sizes_sqroot_baarda(struct sum_struct *sum, uint64 len)
+{
+ extern int block_size;
+ int blength, s2length, b;
+ uint32 c;
+ uint64 l;
+
+ if (block_size) {
+ blength = block_size;
+ } else if (len <= BLOCK_SIZE * BLOCK_SIZE) {
+ blength = BLOCK_SIZE;
+ } else {
+ l = len;
+ c = 1;
+ while (l >>= 2) {
+ c <<= 1;
+ }
+ blength = 0;
+ do {
+ blength |= c;
+ if (len < (uint64)(blength * blength))
+ blength &= ~c;
+ c >>= 1;
+ } while (c >= 8); /* round to multiple of 8 */
+ blength = MAX(blength, BLOCK_SIZE);
+ }
+
+ if (remote_version < 27) {
+ s2length = csum_length;
+ } else if (csum_length == SUM_LENGTH) {
+ s2length = SUM_LENGTH;
+ } else {
+ b = BLOCKSUM_BIAS;
+ l = len;
+ while (l >>= 1) {
+ b += 2;
+ }
+ c = blength;
+ while (c >>= 1 && b) {
+ b--;
+ }
+ s2length = (b + 1 - 32 + 7) / 8; /* add a bit,
+ * subtract rollsum,
+ * round up
+ * --optimize in compiler--
+ */
+ s2length = MAX(s2length, csum_length);
+ s2length = MIN(s2length, SUM_LENGTH);
+ }
+
+ sum->flength = len;
+ sum->blength = blength;
+ sum->s2length = s2length;
+ sum->count = (len + (blength - 1)) / blength;
+ sum->remainder = (len % blength);
+
+ if (sum->count && verbose > 2) {
+ rprintf(FINFO, "count=%ld rem=%ld blength=%ld s2length=%ld flength=%.0f\n",
+ (long) sum->count, (long) sum->remainder,
+ (long) sum->blength, (long) sum->s2length,
+ (double) sum->flength);
+ }
+}
/**
* Perhaps we want to just send an empty checksum set for this file,
@@ -163,30 +227,18 @@
*
* Generate approximately one checksum every block_len bytes.
*/
-static void generate_and_send_sums(struct map_struct *buf, OFF_T len,
- int block_len, int f_out)
+static void generate_and_send_sums(struct map_struct *buf, OFF_T len, int f_out)
{
size_t i;
struct sum_struct sum;
OFF_T offset = 0;
- sum.count = (len + (block_len - 1)) / block_len;
- sum.remainder = (len % block_len);
- sum.blength = block_len;
- sum.flength = len;
- sum.s2length = csum_length;
- /* not needed here sum.sums = NULL; */
-
- if (sum.count && verbose > 3) {
- rprintf(FINFO, "count=%ld rem=%ld n=%ld flength=%.0f\n",
- (long) sum.count, (long) sum.remainder,
- (long) sum.blength, (double) sum.flength);
- }
+ sum_sizes_sqroot_baarda(&sum, len);
write_sum_head(f_out, &sum);
for (i = 0; i < sum.count; i++) {
- int n1 = MIN(len, block_len);
+ int n1 = MIN(len, sum.blength);
char *map = map_ptr(buf, offset, n1);
uint32 sum1 = get_checksum1(map, n1);
char sum2[SUM_LENGTH];
@@ -465,8 +517,7 @@
rprintf(FINFO, "generating and sending sums for %d\n", i);
write_int(f_out,i);
- generate_and_send_sums(buf, st.st_size,
- adapt_block_size(file, block_size), f_out);
+ generate_and_send_sums(buf, st.st_size, f_out);
close(fd);
if (buf) unmap_file(buf);
More information about the rsync
mailing list