[patch] Add `--link-by-hash' option.

Jason M. Felice jfelice at cronosys.com
Mon Feb 9 20:48:07 GMT 2004


This patch adds the --link-by-hash=DIR option, which hard links received
files into a link farm arranged by MD4 file hash.  The result is that the
system stores only one copy of each unique file content, regardless of
the file's name.

Does anyone have an example of an MD4 collision so I can test that case? :)

Patch Summary:

    -1   +1    Makefile.in
    -0   +304  hashlink.c (new)
    -1   +21   options.c
    -0   +6    proto.h
    -5   +21   receiver.c
    -0   +6    rsync.c
    -0   +7    rsync.h
-------------- next part --------------
patchwork diff options.c
--- options.c	2004-02-09 15:42:40.000000000 -0500
+++ options.c	2004-02-09 15:42:40.000000000 -0500
@@ -117,6 +117,7 @@
 char *password_file = NULL;
 char *rsync_path = RSYNC_PATH;
 char *backup_dir = NULL;
+char *link_by_hash_dir = NULL;
 int rsync_port = RSYNC_PORT;
 int link_dest = 0;
 
@@ -258,6 +259,7 @@
   rprintf(F," -T  --temp-dir=DIR          create temporary files in directory DIR\n");
   rprintf(F,"     --compare-dest=DIR      also compare destination files relative to DIR\n");
   rprintf(F,"     --link-dest=DIR         create hardlinks to DIR for unchanged files\n");
+  rprintf(F,"     --link-by-hash=DIR      create hardlinks by hash to DIR for regular files\n");
   rprintf(F," -P                          equivalent to --partial --progress\n");
   rprintf(F," -z, --compress              compress file data\n");
   rprintf(F," -C, --cvs-exclude           auto ignore files in the same way CVS does\n");
@@ -297,7 +299,7 @@
 enum {OPT_VERSION = 1000, OPT_SENDER, OPT_EXCLUDE, OPT_EXCLUDE_FROM,
       OPT_DELETE_AFTER, OPT_DELETE_EXCLUDED, OPT_LINK_DEST,
       OPT_INCLUDE, OPT_INCLUDE_FROM, OPT_MODIFY_WINDOW,
-      OPT_READ_BATCH, OPT_WRITE_BATCH};
+      OPT_READ_BATCH, OPT_WRITE_BATCH, OPT_LINK_BY_HASH};
 
 static struct poptOption long_options[] = {
   /* longName, shortName, argInfo, argPtr, value, descrip, argDesc */
@@ -353,6 +355,7 @@
   {"temp-dir",        'T', POPT_ARG_STRING, &tmpdir, 0, 0, 0 },
   {"compare-dest",     0,  POPT_ARG_STRING, &compare_dest, 0, 0, 0 },
   {"link-dest",        0,  POPT_ARG_STRING, 0,              OPT_LINK_DEST, 0, 0 },
+  {"link-by-hash",     0,  POPT_ARG_STRING, 0,              OPT_LINK_BY_HASH, 0, 0},
   /* TODO: Should this take an optional int giving the compression level? */
   {"compress",        'z', POPT_ARG_NONE,   &do_compression, 0, 0, 0 },
   {"daemon",           0,  POPT_ARG_NONE,   &am_daemon, 0, 0, 0 },
@@ -585,6 +588,18 @@
 			return 0;
 #endif
 
+                case OPT_LINK_BY_HASH:
+#if HAVE_LINK
+			link_by_hash_dir = (char *)poptGetOptArg(pc);
+			break;
+#else
+			snprintf(err_buf, sizeof err_buf,
+				 "hard links are not supported on this %s\n",
+				 am_server ? "server" : "client");
+			rprintf(FERROR, "ERROR: %s", err_buf);
+			return 0;
+#endif
+
 
 		default:
 			/* FIXME: If --daemon is specified, then errors for later
@@ -889,6 +904,11 @@
 		args[ac++] = compare_dest;
 	}
 
+	if (link_by_hash_dir && am_sender) {
+		args[ac++] = "--link-by-hash";
+		args[ac++] = link_by_hash_dir;
+	}
+
 	if (files_from && (!am_sender || remote_filesfrom_file)) {
 		if (remote_filesfrom_file) {
 			args[ac++] = "--files-from";
patchwork diff rsync.c
--- rsync.c	2004-02-09 15:42:40.000000000 -0500
+++ rsync.c	2004-02-09 15:42:40.000000000 -0500
@@ -30,6 +30,7 @@
 extern int preserve_gid;
 extern int preserve_perms;
 extern int make_backups;
+extern char *link_by_hash_dir;
 
 
 /*
@@ -268,4 +269,9 @@
 	} else {
 		set_perms(fname,file,NULL,0);
 	}
+
+#ifdef HAVE_LINK
+	if (link_by_hash_dir)
+		link_by_hash(fname,file);
+#endif
 }
patchwork diff receiver.c
--- receiver.c	2004-02-09 15:42:40.000000000 -0500
+++ receiver.c	2004-02-09 15:42:40.000000000 -0500
@@ -237,10 +237,11 @@
 
 
 static int receive_data(int f_in,struct map_struct *buf,int fd,char *fname,
-			OFF_T total_size)
+			OFF_T total_size,char *md4)
 {
 	int i;
 	struct sum_struct sum;
+	struct mdfour mdfour_data;
 	unsigned int len;
 	OFF_T offset = 0;
 	OFF_T offset2;
@@ -250,6 +251,8 @@
 	char *map=NULL;
 	
 	read_sum_head(f_in, &sum);
+	if (md4)
+		mdfour_begin(&mdfour_data);
 	
 	sum_init();
 	
@@ -269,6 +272,8 @@
 			cleanup_got_literal = 1;
       
 			sum_update(data,i);
+			if (md4)
+				mdfour_update(&mdfour_data,data,i);
 
 			if (fd != -1 && write_file(fd,data,i) != i) {
 				rprintf(FERROR, "write failed on %s: %s\n",
@@ -296,6 +301,8 @@
 		
 			see_token(map, len);
 			sum_update(map,len);
+			if (md4)
+				mdfour_update(&mdfour_data,map,len);
 		}
 		
 		if (fd != -1 && write_file(fd,map,len) != (int) len) {
@@ -316,6 +323,9 @@
 	}
 
 	sum_end(file_sum1);
+	if (md4) {
+		mdfour_result(&mdfour_data, (unsigned char*)md4);
+	}
 
 	read_buf(f_in,file_sum2,MD4_SUM_LENGTH);
 	if (verbose > 2) {
@@ -351,6 +361,7 @@
 	extern int preserve_perms;
 	extern int delete_after;
 	extern int orig_umask;
+	extern char *link_by_hash_dir;
 	struct stats initial_stats;
 
 	if (verbose > 2) {
@@ -416,7 +427,7 @@
 		if (fd1 != -1 && do_fstat(fd1,&st) != 0) {
 			rprintf(FERROR, "fstat %s failed: %s\n",
 				full_fname(fnamecmp), strerror(errno));
-			receive_data(f_in,NULL,-1,NULL,file->length);
+			receive_data(f_in,NULL,-1,NULL,file->length,NULL);
 			close(fd1);
 			continue;
 		}
@@ -429,7 +440,7 @@
 			 */
 			rprintf(FERROR,"recv_files: %s is a directory\n",
 				full_fname(fnamecmp));
-			receive_data(f_in, NULL, -1, NULL, file->length);
+			receive_data(f_in,NULL,-1,NULL,file->length,NULL);
 			close(fd1);
 			continue;
 		}
@@ -482,7 +493,7 @@
 		if (fd2 == -1) {
 			rprintf(FERROR, "mkstemp %s failed: %s\n",
 				full_fname(fnametmp), strerror(errno));
-			receive_data(f_in,buf,-1,NULL,file->length);
+			receive_data(f_in,buf,-1,NULL,file->length,NULL);
 			if (buf) unmap_file(buf);
 			if (fd1 != -1) close(fd1);
 			continue;
@@ -495,7 +506,12 @@
 		}
 
 		/* recv file data */
-		recv_ok = receive_data(f_in,buf,fd2,fname,file->length);
+#ifdef HAVE_LINK
+		if (link_by_hash_dir) {
+			file->sum = (char*)malloc (MD4_SUM_LENGTH);
+		}
+#endif
+		recv_ok = receive_data(f_in,buf,fd2,fname,file->length,file->sum);
 
 		log_recv(file, &initial_stats);
 		
patchwork diff rsync.h
--- rsync.h	2004-02-09 15:42:40.000000000 -0500
+++ rsync.h	2004-02-09 15:42:40.000000000 -0500
@@ -453,6 +453,13 @@
 	int num_transferred_files;
 };
 
+struct hashfile_struct {
+	struct hashfile_struct *next;	/* links of a circular doubly-linked list */
+	struct hashfile_struct *prev;
+	char *name;	/* heap-allocated path of the candidate; freed by kill_hashfile() */
+	int fd;		/* descriptor opened O_RDONLY on name; closed by kill_hashfile() */
+};
+
 
 /* we need this function because of the silly way in which duplicate
    entries are handled in the file lists - we can't change this
patchwork diff hashlink.c
--- hashlink.c	1969-12-31 19:00:00.000000000 -0500
+++ hashlink.c	2004-02-09 15:42:40.000000000 -0500
@@ -0,0 +1,304 @@
+/*
+   Copyright (C) Cronosys, LLC 2004
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*/
+
+/* This file contains code used by the --link-by-hash option. */
+
+#include "rsync.h"
+
+extern char *link_by_hash_dir;
+
+#ifdef HAVE_LINK
+/* Build the hash-farm path for file: link_by_hash_dir, then the first 4
+ * bytes of file->sum as 8 hex digits, a '/', and the next 12 bytes as 24. */
+char* make_hash_name(struct file_struct *file)
+{
+	/* 32 hex digits + one '/' + the terminating NUL need 34 bytes. */
+	char hash[34], *dst = hash;
+	const unsigned char *src = (const unsigned char*)file->sum;
+	static const char hexdigits[] = "0123456789abcdef";
+	int i;
+
+	for (i = 0; i < 16; i++, src++) {
+		/* The first 4 bytes name the bucket directory, the
+		 * remaining 12 name the entry beneath it. */
+		if (i == 4)
+			*dst++ = '/';
+		*dst++ = hexdigits[*src >> 4];
+		*dst++ = hexdigits[*src & 0x0f];
+	}
+	*dst = 0;
+
+	/* The result is heap-allocated and owned by the caller.  asprintf()
+	 * may leave dst undefined on failure, so return a definite NULL
+	 * rather than a garbage pointer in that case. */
+	if (asprintf(&dst,"%s/%s",link_by_hash_dir,hash) < 0)
+		return NULL;
+	return dst;
+}
+/* Release one candidate: free its name, close its descriptor, then free
+ * the node itself.  A NULL argument is ignored. */
+void kill_hashfile(struct hashfile_struct *hashfile)
+{
+	if (hashfile != NULL) {
+		free(hashfile->name);
+		close(hashfile->fd);
+		free(hashfile);
+	}
+}
+/* Destroy every node of the circular list headed by hashfiles; a NULL
+ * (empty) list is accepted and does nothing. */
+void kill_hashfiles(struct hashfile_struct *hashfiles)
+{
+	struct hashfile_struct *cur = hashfiles, *following;
+	if (cur == NULL)
+		return;
+	do {
+		following = cur->next;
+		kill_hashfile(cur);
+		cur = following;
+	} while (cur != hashfiles);
+}
+/* Collect the entries of directory hashname whose size equals size into a
+ * circular list, each opened read-only; *fnbr gets the largest entry number. */
+struct hashfile_struct *find_hashfiles(char *hashname, int64 size, long *fnbr)
+{
+	DIR *d;
+	struct dirent *di;
+	struct hashfile_struct *hashfiles = NULL, *hashfile;
+	STRUCT_STAT st;
+	long this_fnbr;
+
+	*fnbr = 0;
+
+	/* hashname remains owned by the caller, which keeps using it and
+	 * frees it, so it is deliberately not freed here on failure. */
+	if ((d = opendir(hashname)) == NULL) {
+		rprintf(FERROR,"opendir \"%s\": %s\n",hashname,strerror(errno));
+		return NULL;
+	}
+	while ((di = readdir(d)) != NULL) {
+		if (!strcmp(di->d_name,".") || !strcmp(di->d_name,".."))
+			continue;
+
+		/* We need to have the largest fnbr in case we need to store
+		 * a new file. */
+		this_fnbr = atol(di->d_name);
+		if (this_fnbr > *fnbr)
+			*fnbr = this_fnbr;
+
+		/* On allocation failure stop scanning; return what we have. */
+		if ((hashfile = malloc(sizeof *hashfile)) == NULL)
+			break;
+		if (asprintf(&hashfile->name,"%s/%s",hashname,di->d_name) < 0) {
+			free(hashfile);
+			break;
+		}
+		/* fd stays -1 unless the entry has the right size and opens. */
+		hashfile->fd = -1;
+		if (do_stat(hashfile->name,&st) == -1) {
+			rprintf(FERROR,"%s: %s\n",hashfile->name,strerror(errno));
+		} else if (st.st_size == size) {
+			hashfile->fd = open(hashfile->name,O_RDONLY|O_BINARY);
+			if (hashfile->fd == -1)
+				rprintf(FERROR,"%s: %s\n",hashfile->name,strerror(errno));
+		}
+		if (hashfile->fd == -1) {
+			/* Rejected before any descriptor was opened: free the
+			 * node directly instead of close()ing a bogus fd. */
+			free(hashfile->name);
+			free(hashfile);
+			continue;
+		}
+		if (hashfiles == NULL)
+			hashfiles = hashfile->next = hashfile->prev = hashfile;
+		else {
+			hashfile->next = hashfiles;
+			hashfile->prev = hashfiles->prev;
+			hashfile->next->prev = hashfile;
+			hashfile->prev->next = hashfile;
+		}
+	}
+	closedir(d);
+
+	return hashfiles;
+}
+/* Compare the data read from fd against every candidate in the circular
+ * list files, chunk by chunk.  The list is consumed: candidates that
+ * differ are destroyed, and so are surplus identical ones.  Returns one
+ * identical candidate (to be released with kill_hashfile() by the caller),
+ * or NULL if files is NULL, nothing matches, or reading fd fails. */
+struct hashfile_struct *compare_hashfiles(int fd,struct hashfile_struct *files)
+{
+	int amt, hamt;
+	char buffer[BUFSIZ], cmpbuffer[BUFSIZ];
+	struct hashfile_struct *iter, *next;
+	int remaining, i;
+
+	if (!files)
+		return NULL;
+
+	while ((amt = read(fd, buffer, BUFSIZ)) > 0) {
+		/* Count the survivors first so that each of them is visited
+		 * exactly once for this chunk, even when the head of the list
+		 * is removed during the walk.  Skipping one would leave its
+		 * file offset behind and could return an unverified file. */
+		remaining = 1;
+		for (iter = files->next; iter != files; iter = iter->next)
+			remaining++;
+
+		for (iter = files, i = 0; i < remaining; i++, iter = next) {
+			next = iter->next;
+
+			/* A short or failed read on the candidate makes hamt
+			 * differ from amt, which counts as a mismatch. */
+			hamt = read(iter->fd, cmpbuffer, BUFSIZ);
+			if (amt == hamt && !memcmp(buffer, cmpbuffer, amt))
+				continue;
+
+			if (next == iter) {
+				/* The last candidate differs: no matches. */
+				kill_hashfile(iter);
+				return NULL;
+			}
+			/* Unlink the mismatching node, moving the list head
+			 * along if it is the node being removed. */
+			if (iter == files)
+				files = next;
+			iter->next->prev = iter->prev;
+			iter->prev->next = iter->next;
+			kill_hashfile(iter);
+		}
+	}
+
+	if (amt == -1) {
+		/* Reading the newly received file itself failed. */
+		rprintf(FERROR,"%s\n",strerror(errno));
+		kill_hashfiles(files);
+		return NULL;
+	}
+
+	/* All files which remain in the list are identical and should have
+	 * the same size.  We shouldn't have more than one, but if we do
+	 * we just pick one. */
+	iter = files;
+	while (iter->next != iter) {
+		next = iter->next;
+		iter->prev->next = iter->next;
+		iter->next->prev = iter->prev;
+		kill_hashfile(iter);
+		iter = next;
+	}
+
+	return iter;
+}
+/* Hard link fname into the hash farm under link_by_hash_dir, or replace it
+ * with a link to an identical file that is already stored there. */
+void link_by_hash(char *fname,struct file_struct *file)
+{
+	STRUCT_STAT st;
+	char *hashname = make_hash_name(file);
+	int first = 0;
+	char *linkname = NULL;
+
+	rprintf(FINFO, "hash link %s -> \"%s\"\n", full_fname(fname), hashname);
+	if (do_stat(hashname, &st) == -1) {
+		char *dirname;
+
+		/* Directory does not exist. */
+		dirname = strdup(hashname);
+		*strrchr(dirname,'/') = 0;
+		if (do_mkdir(dirname, 0755) == -1 && errno != EEXIST) {
+			rprintf(FERROR, "mkdir %s: %s\n", dirname, strerror(errno));
+			free(dirname);
+			goto cleanup;
+		}
+		free(dirname);
+
+		if (do_mkdir(hashname, 0755) == -1 && errno != EEXIST) {
+			rprintf(FERROR, "mkdir %s: %s\n", hashname, strerror(errno));
+			goto cleanup;
+		}
+
+		first = 1;
+		asprintf(&linkname,"%s/0",hashname);
+	} else {
+		struct hashfile_struct *hashfiles, *hashfile;
+		int fd;
+		long last_fnbr;
+
+		if (do_stat(fname,&st) == -1) {
+			rprintf(FERROR,"%s: %s\n",fname,strerror(errno));
+			goto cleanup;
+		}
+		hashfiles = find_hashfiles(hashname, st.st_size, &last_fnbr);
+
+		if (hashfiles == NULL) {
+			/* No same-sized candidate.  Entry 0 may still exist
+			 * with another size, so only use it when it is free;
+			 * otherwise take the next unused number. */
+			first = 1;
+			asprintf(&linkname,"%s/0",hashname);
+			if (do_stat(linkname,&st) == 0) {
+				free(linkname);
+				asprintf(&linkname, "%s/%ld", hashname,
+					 last_fnbr + 1);
+			}
+		} else {
+			/* Search for one identical to us. */
+			if ((fd = open(fname,O_RDONLY|O_BINARY)) == -1) {
+				rprintf(FERROR,"%s: %s\n",fname,strerror(errno));
+				kill_hashfiles(hashfiles);
+				goto cleanup;
+			}
+			hashfile = compare_hashfiles(fd, hashfiles);
+			hashfiles = NULL;
+			close(fd);	/* only needed for the comparison */
+
+			if (hashfile) {
+				first = 0;
+				linkname = strdup(hashfile->name);
+				kill_hashfile(hashfile);
+			} else {
+				first = 1;
+				asprintf(&linkname, "%s/%ld", hashname,
+					 last_fnbr + 1);
+			}
+		}
+	}
+
+	if (first) {
+		if (do_link(fname, linkname) == -1) {
+			rprintf(FERROR,"link \"%s\" -> \"%s\": %s\n",
+				full_fname(fname),linkname,strerror(errno));
+		}
+	} else {
+		if (do_unlink(fname) == -1) {
+			rprintf(FERROR,"unlink \"%s\": %s\n",
+				full_fname(fname),strerror(errno));
+		} else if (do_link(linkname, fname) == -1) {
+			rprintf(FERROR,"link \"%s\" -> \"%s\": %s\n",
+				linkname,full_fname(fname),strerror(errno));
+		}
+	}
+
+cleanup:	/* single exit: hashname and linkname are freed on every path */
+	free(linkname);
+	free(hashname);
+}
+
+#endif
patchwork diff Makefile.in
--- Makefile.in	2004-02-09 15:42:40.000000000 -0500
+++ Makefile.in	2004-02-09 15:42:40.000000000 -0500
@@ -35,7 +35,7 @@
 	main.o checksum.o match.o syscall.o log.o backup.o
 OBJS2=options.o flist.o io.o compat.o hlink.o token.o uidlist.o socket.o \
 	fileio.o batch.o clientname.o
-OBJS3=progress.o pipe.o
+OBJS3=progress.o pipe.o hashlink.o
 DAEMON_OBJ = params.o loadparm.o clientserver.o access.o connection.o authenticate.o
 popt_OBJS=popt/findme.o  popt/popt.o  popt/poptconfig.o \
 	popt/popthelp.o popt/poptparse.o
patchwork diff proto.h
--- proto.h	2004-02-09 15:42:50.000000000 -0500
+++ proto.h	2004-02-09 15:42:52.000000000 -0500
@@ -90,6 +90,12 @@
 void write_sum_head(int f, struct sum_struct *sum);
 void recv_generator(char *fname, struct file_list *flist, int i, int f_out);
 void generate_files(int f,struct file_list *flist,char *local_name,int f_recv);
+char* make_hash_name(struct file_struct *file);
+void kill_hashfile(struct hashfile_struct *hashfile);
+void kill_hashfiles(struct hashfile_struct *hashfiles);
+struct hashfile_struct *find_hashfiles(char *hashname, int64 size, long *fnbr);
+struct hashfile_struct *compare_hashfiles(int fd,struct hashfile_struct *files);
+void link_by_hash(char *fname,struct file_struct *file);
 void init_hard_links(struct file_list *flist);
 int check_hard_link(struct file_struct *file);
 void do_hard_links(void);


More information about the rsync mailing list