rsync filename heuristics

Martin Pool mbp at sourcefrog.net
Wed Jan 5 04:23:59 GMT 2005


On  5 Jan 2005, Rusty Russell <rusty at rustcorp.com.au> wrote:
> On Tue, 2005-01-04 at 18:24 +0100, Robert Lemmen wrote:
> > hi rusty,
> > 
> > i read on some webpage about rsync and debian that you wrote a patch to
> > rsync that lets it use heuristics when deciding which local file to
> > use. could you tell me whether this is planned to be included in a rsync
> > release? could i have that patch?
> 
> Hmm, good question.  This is from 2.5.4, and I can't remember how well it
> worked.  Good luck!

I'm not the rsync maintainer anymore, but I think it would be cool if
this were merged, if the current team feels OK about it.


> 
> Rusty.
> 
> diff -urN rsync-2.5.4/Makefile.in rsync-2.5.4-fuzzy/Makefile.in
> --- rsync-2.5.4/Makefile.in	2002-02-26 05:48:25.000000000 +1100
> +++ rsync-2.5.4-fuzzy/Makefile.in	2002-04-03 16:35:55.000000000 +1000
> @@ -28,7 +28,7 @@
>  ZLIBOBJ=zlib/deflate.o zlib/infblock.o zlib/infcodes.o zlib/inffast.o \
>  	zlib/inflate.o zlib/inftrees.o zlib/infutil.o zlib/trees.o \
>  	zlib/zutil.o zlib/adler32.o 
> -OBJS1=rsync.o generator.o receiver.o cleanup.o sender.o exclude.o util.o main.o checksum.o match.o syscall.o log.o backup.o
> +OBJS1=rsync.o generator.o receiver.o cleanup.o sender.o exclude.o util.o main.o checksum.o match.o syscall.o log.o backup.o alternate.o
>  OBJS2=options.o flist.o io.o compat.o hlink.o token.o uidlist.o socket.o fileio.o batch.o \
>  	clientname.o
>  DAEMON_OBJ = params.o loadparm.o clientserver.o access.o connection.o authenticate.o
> diff -urN rsync-2.5.4/alternate.c rsync-2.5.4-fuzzy/alternate.c
> --- rsync-2.5.4/alternate.c	1970-01-01 10:00:00.000000000 +1000
> +++ rsync-2.5.4-fuzzy/alternate.c	2002-04-03 17:04:15.000000000 +1000
> @@ -0,0 +1,117 @@
> +#include "rsync.h"
> +
> +extern char *compare_dest;
> +extern int verbose;
> +
> +/* Alternate methods for opening files, if local doesn't exist */
> +/* Sanity check that we are about to open regular file */
> +int do_open_regular(char *fname)
> +{
> +	STRUCT_STAT st;
> +
> +	if (do_stat(fname, &st) == 0 && S_ISREG(st.st_mode))
> +		return do_open(fname, O_RDONLY, 0);
> +
> +	return -1;
> +}
> +
> +static void split_names(char *fname, char **dirname, char **basename)
> +{
> +	char *slash;
> +
> +	slash = strrchr(fname, '/');
> +	if (slash) {
> +		*dirname = fname;
> +		*slash = '\0';
> +		*basename = slash+1;
> +	} else {
> +		*basename = fname;
> +		*dirname = ".";
> +	}
> +}
> +
> +static unsigned int measure_name(const char *name,
> +				 const char *basename,
> +				 const char *ext)
> +{
> +	int namelen = strlen(name);
> +	int extlen = strlen(ext);
> +	unsigned int score = 0;
> +
> +	/* Extensions must match */
> +	if (namelen <= extlen || strcmp(name+namelen-extlen, ext) != 0)
> +		return 0;
> +
> +	/* Now score depends on similarity of prefix */
> +	for (; *name==*basename && *name; name++, basename++)
> +		score++;
> +	return score;
> +}
> +
> +int open_alternate_base_fuzzy(const char *fname)
> +{
> +	DIR *d;
> +	struct dirent *di;
> +	char *basename, *dirname;
> +	char mangled_name[MAXPATHLEN];
> +	char bestname[MAXPATHLEN];
> +	unsigned int bestscore = 0;
> +	const char *ext;
> +
> +	/* FIXME: can we assume fname fits here? */
> +	strcpy(mangled_name, fname);
> +
> +	split_names(mangled_name, &dirname, &basename);
> +	d = opendir(dirname);
> +	if (!d) {
> +		rprintf(FERROR,"recv_generator opendir(%s): %s\n",
> +			dirname,strerror(errno));
> +		return -1;
> +	}
> +
> +	/* Get final extension, eg. .gz; never full basename though. */
> +	ext = strrchr(basename + 1, '.');
> +	if (!ext)
> +		ext = basename + strlen(basename); /* ext = "" */
> +
> +	while ((di = readdir(d)) != NULL) {
> +		const char *dname = d_name(di);
> +		unsigned int score;
> +
> +		if (strcmp(dname,".")==0 ||
> +		    strcmp(dname,"..")==0)
> +			continue;
> +		
> +		score = measure_name(dname, basename, ext);
> +		if (verbose > 4)
> +			rprintf(FINFO,"fuzzy score for %s = %u\n",
> +				dname, score);
> +		if (score > bestscore) {
> +			strcpy(bestname, dname); 
> +			bestscore = score;
> +		}
> +	}
> +	closedir(d);
> +
> +	/* Found a candidate. */
> +	if (bestscore != 0) {
> +		char fuzzyname[MAXPATHLEN];
> +
> +		snprintf(fuzzyname,MAXPATHLEN,"%s/%s", dirname, bestname);
> +		if (verbose > 2)
> +			rprintf(FINFO,"fuzzy match %s->%s\n",
> +				fname, fuzzyname);
> +		return do_open_regular(fuzzyname);
> +	}
> +	return -1;
> +}
> +
> +int open_alternate_base_comparedir(const char *fname)
> +{
> +	char fnamebuf[MAXPATHLEN];
> +	/* try the file at compare_dest instead */
> +	snprintf(fnamebuf,MAXPATHLEN,"%s/%s",compare_dest,fname);
> +
> +	/* FIXME: now follows symlinks... */
> +	return do_open_regular(fnamebuf);
> +}
> diff -urN rsync-2.5.4/generator.c rsync-2.5.4-fuzzy/generator.c
> --- rsync-2.5.4/generator.c	2002-02-08 03:36:12.000000000 +1100
> +++ rsync-2.5.4-fuzzy/generator.c	2002-04-03 17:00:06.000000000 +1000
> @@ -42,11 +42,12 @@
>  extern int always_checksum;
>  extern int modify_window;
>  extern char *compare_dest;
> +extern int fuzzy;
>  
>  
>  /* choose whether to skip a particular file */
>  static int skip_file(char *fname,
> -		     struct file_struct *file, STRUCT_STAT *st)
> +		     struct file_struct *file, const STRUCT_STAT *st)
>  {
>  	if (st->st_size != file->length) {
>  		return 0;
> @@ -185,7 +186,61 @@
>  	return s;
>  }
>  
> +/* Returns -1 for can't open (null file), -2 for skip */
> +static int open_base_file(struct file_struct *file,
> +			  char *fname, 
> +			  int statret, 
> +			  STRUCT_STAT *st)
> +{
> +	int fd = -1;
> +
> +	if (statret == 0) {
> +		if (S_ISREG(st->st_mode)) {
> +			if (update_only
> +			    && cmp_modtime(st->st_mtime, file->modtime) > 0) {
> +				if (verbose > 1)
> +					rprintf(FINFO,"%s is newer\n",fname);
> +				return -2;
> +			}
> +			if (skip_file(fname, file, st)) {
> +				set_perms(fname, file, st, 1);
> +				return -2;
> +			}
> +		 	fd = do_open(fname, O_RDONLY, 0);
> +			if (fd == -1) {
> +				rprintf(FERROR,"failed to open %s, continuing : %s\n",fname,strerror(errno));
> +				return -1;
> +			} else
> +				return fd;
> +		} else {
> +			/* Try to use symlink contents */
> +			if (S_ISLNK(st->st_mode)) {
> +				fd = do_open_regular(fname);
> +				/* Don't delete yet; receiver will need it */
> +			} else {
> +				if (delete_file(fname) != 0) {
> +					if (fd != -1)
> +						close(fd);
> +					return -2;
> +				}
> +			}
> +		}
> +	}
> +
> +	if (fd == -1 && compare_dest != NULL)
> +		fd = open_alternate_base_comparedir(fname);
>  
> +	if (fd == -1 && fuzzy)
> +		fd = open_alternate_base_fuzzy(fname);
> +
> +	/* Update stat to understand size */
> +	if (fd != -1) {
> +		if (do_fstat(fd, st) != 0)
> +			rprintf(FERROR,"fstat %s : %s\n",fname,strerror(errno));
> +	}
> +
> +	return fd;
> +}
>  
>  /*
>   * Acts on file number I from FLIST, whose name is fname.
> @@ -203,9 +258,6 @@
>  	struct sum_struct *s;
>  	int statret;
>  	struct file_struct *file = flist->files[i];
> -	char *fnamecmp;
> -	char fnamecmpbuf[MAXPATHLEN];
> -	extern char *compare_dest;
>  	extern int list_only;
>  	extern int preserve_perms;
>  	extern int only_existing;
> @@ -341,82 +393,29 @@
>  		return;
>  	}
>  
> -	fnamecmp = fname;
> -
> -	if ((statret == -1) && (compare_dest != NULL)) {
> -		/* try the file at compare_dest instead */
> -		int saveerrno = errno;
> -		snprintf(fnamecmpbuf,MAXPATHLEN,"%s/%s",compare_dest,fname);
> -		statret = link_stat(fnamecmpbuf,&st);
> -		if (!S_ISREG(st.st_mode))
> -			statret = -1;
> -		if (statret == -1)
> -			errno = saveerrno;
> -		else
> -			fnamecmp = fnamecmpbuf;
> -	}
> -
> -	if (statret == -1) {
> -		if (errno == ENOENT) {
> -			write_int(f_out,i);
> -			if (!dry_run) send_sums(NULL,f_out);
> -		} else {
> -			if (verbose > 1)
> -				rprintf(FERROR, RSYNC_NAME
> -					": recv_generator failed to open \"%s\": %s\n",
> -					fname, strerror(errno));
> -		}
> -		return;
> -	}
> -
> -	if (!S_ISREG(st.st_mode)) {
> -		if (delete_file(fname) != 0) {
> -			return;
> -		}
> -
> -		/* now pretend the file didn't exist */
> -		write_int(f_out,i);
> -		if (!dry_run) send_sums(NULL,f_out);    
> -		return;
> -	}
> -
> -	if (opt_ignore_existing && fnamecmp == fname) { 
> -		if (verbose > 1)
> -			rprintf(FINFO,"%s exists\n",fname);
> -		return;
> -	} 
> -
> -	if (update_only && cmp_modtime(st.st_mtime,file->modtime)>0 && fnamecmp == fname) {
> +	/* Failed to stat for some other reason. */
> +	if (statret == -1 && errno != ENOENT) {
>  		if (verbose > 1)
> -			rprintf(FINFO,"%s is newer\n",fname);
> +			rprintf(FERROR, RSYNC_NAME
> +				": recv_generator failed to open \"%s\": %s\n",
> +				fname, strerror(errno));
>  		return;
>  	}
>  
> -	if (skip_file(fname, file, &st)) {
> -		if (fnamecmp == fname)
> -			set_perms(fname,file,&st,1);
> -		return;
> -	}
> -
> -	if (dry_run) {
> -		write_int(f_out,i);
> +	fd = open_base_file(file, fname, statret, &st);
> +	if (fd == -2)
>  		return;
> -	}
> -
> -	if (whole_file) {
> -		write_int(f_out,i);
> -		send_sums(NULL,f_out);    
> -		return;
> -	}
> -
> -	/* open the file */  
> -	fd = do_open(fnamecmp, O_RDONLY, 0);
>  
> -	if (fd == -1) {
> -		rprintf(FERROR,RSYNC_NAME": failed to open \"%s\", continuing : %s\n",fnamecmp,strerror(errno));
> -		/* pretend the file didn't exist */
> +	if ((whole_file || dry_run) && fd != -1) {
> +		close(fd);
> +		fd = -1;
> + 	}
> + 
> + 	if (fd == -1) {
> +		/* the file didn't exist, or we can pretend it doesn't */
>  		write_int(f_out,i);
> -		send_sums(NULL,f_out);
> +		if (!dry_run)
> +			send_sums(NULL,f_out);
>  		return;
>  	}
>  
> @@ -427,7 +426,7 @@
>  	}
>  
>  	if (verbose > 3)
> -		rprintf(FINFO,"gen mapped %s of size %.0f\n",fnamecmp,(double)st.st_size);
> +		rprintf(FINFO,"gen mapped %s of size %.0f\n",fname,(double)st.st_size);
>  
>  	s = generate_sums(buf,st.st_size,adapt_block_size(file, block_size));
>  
> diff -urN rsync-2.5.4/options.c rsync-2.5.4-fuzzy/options.c
> --- rsync-2.5.4/options.c	2002-02-28 09:49:57.000000000 +1100
> +++ rsync-2.5.4-fuzzy/options.c	2002-04-03 16:43:54.000000000 +1000
> @@ -73,6 +73,7 @@
>  #else
>  int modify_window=0;
>  #endif
> +int fuzzy=0;
>  int blocking_io=-1;
>  
>  /** Network address family. **/
> @@ -245,6 +246,7 @@
>    rprintf(F,"     --bwlimit=KBPS          limit I/O bandwidth, KBytes per second\n");
>    rprintf(F,"     --write-batch=PREFIX    write batch fileset starting with PREFIX\n");
>    rprintf(F,"     --read-batch=PREFIX     read batch fileset starting with PREFIX\n");
> +  rprintf(F,"     --fuzzy	          use similar file as basis if it does't exist\n");
>    rprintf(F," -h, --help                  show this help screen\n");
>  #ifdef INET6
>    rprintf(F," -4                          prefer IPv4\n");
> @@ -340,6 +342,7 @@
>    {"hard-links",      'H', POPT_ARG_NONE,   &preserve_hard_links},
>    {"read-batch",       0,  POPT_ARG_STRING, &batch_prefix, OPT_READ_BATCH},
>    {"write-batch",      0,  POPT_ARG_STRING, &batch_prefix, OPT_WRITE_BATCH},
> +  {"fuzzy",	       0,  POPT_ARG_NONE,   &fuzzy},
>  #ifdef INET6
>    {0,		      '4', POPT_ARG_VAL,    &default_af_hint,   AF_INET },
>    {0,		      '6', POPT_ARG_VAL,    &default_af_hint,   AF_INET6 },
> @@ -757,7 +760,9 @@
>  		args[ac++] = "--compare-dest";
>  		args[ac++] = compare_dest;
>  	}
> -
> +	
> +	if (fuzzy && am_sender)
> +		args[ac++] = "--fuzzy";
>  
>  	*argc = ac;
>  }
> diff -urN rsync-2.5.4/proto.h rsync-2.5.4-fuzzy/proto.h
> --- rsync-2.5.4/proto.h	2002-02-23 11:05:06.000000000 +1100
> +++ rsync-2.5.4-fuzzy/proto.h	2002-04-03 16:35:25.000000000 +1000
> @@ -256,3 +256,6 @@
>  int cmp_modtime(time_t file1, time_t file2);
>  int _Insure_trap_error(int a1, int a2, int a3, int a4, int a5, int a6);
>  int sys_gettimeofday(struct timeval *tv);
> +int do_open_regular(char *fname);
> +int open_alternate_base_fuzzy(const char *fname);
> +int open_alternate_base_comparedir(const char *fname);
> diff -urN rsync-2.5.4/receiver.c rsync-2.5.4-fuzzy/receiver.c
> --- rsync-2.5.4/receiver.c	2002-02-14 05:42:20.000000000 +1100
> +++ rsync-2.5.4-fuzzy/receiver.c	2002-04-03 16:46:46.000000000 +1000
> @@ -36,6 +36,7 @@
>  extern char *compare_dest;
>  extern int make_backups;
>  extern char *backup_suffix;
> +extern int fuzzy;
>  
>  static struct delete_list {
>  	DEV64_T dev;
> @@ -307,8 +308,6 @@
>  	char *fname;
>  	char template[MAXPATHLEN];
>  	char fnametmp[MAXPATHLEN];
> -	char *fnamecmp;
> -	char fnamecmpbuf[MAXPATHLEN];
>  	struct map_struct *buf;
>  	int i;
>  	struct file_struct *file;
> @@ -366,28 +365,24 @@
>  		if (verbose > 2)
>  			rprintf(FINFO,"recv_files(%s)\n",fname);
>  
> -		fnamecmp = fname;
> -
>  		/* open the file */  
> -		fd1 = do_open(fnamecmp, O_RDONLY, 0);
> +		fd1 = do_open(fname, O_RDONLY, 0);
>  
> -		if ((fd1 == -1) && (compare_dest != NULL)) {
> -			/* try the file at compare_dest instead */
> -			snprintf(fnamecmpbuf,MAXPATHLEN,"%s/%s",
> -						compare_dest,fname);
> -			fnamecmp = fnamecmpbuf;
> -			fd1 = do_open(fnamecmp, O_RDONLY, 0);
> -		}
> +		if (fd1 == -1 && compare_dest != NULL)
> +			fd1 = open_alternate_base_comparedir(fname);
> +
> +		if (fd1 == -1 && fuzzy)
> +			fd1 = open_alternate_base_fuzzy(fname);
>  
>  		if (fd1 != -1 && do_fstat(fd1,&st) != 0) {
> -			rprintf(FERROR,"fstat %s : %s\n",fnamecmp,strerror(errno));
> +			rprintf(FERROR,"fstat %s : %s\n",fname,strerror(errno));
>  			receive_data(f_in,NULL,-1,NULL,file->length);
>  			close(fd1);
>  			continue;
>  		}
>  
>  		if (fd1 != -1 && !S_ISREG(st.st_mode)) {
> -			rprintf(FERROR,"%s : not a regular file (recv_files)\n",fnamecmp);
> +			rprintf(FERROR,"%s : not a regular file (recv_files)\n",fname);
>  			receive_data(f_in,NULL,-1,NULL,file->length);
>  			close(fd1);
>  			continue;
> @@ -403,7 +398,7 @@
>  		if (fd1 != -1 && st.st_size > 0) {
>  			buf = map_file(fd1,st.st_size);
>  			if (verbose > 2)
> -				rprintf(FINFO,"recv mapped %s of size %.0f\n",fnamecmp,(double)st.st_size);
> +				rprintf(FINFO,"recv mapped %s of size %.0f\n",fname,(double)st.st_size);
>  		} else {
>  			buf = NULL;
>  		}
> 
> -- 
> A bad analogy is like a leaky screwdriver -- Richard Braakman
-- 
Martin


More information about the rsync mailing list