[patch] Add `--link-by-hash' option (rev 3).

Jason M. Felice jfelice at cronosys.com
Tue Feb 17 18:13:22 GMT 2004


This patch adds the --link-by-hash=DIR option, which hard links received
files in a link farm arranged by MD4 file hash.  The result is that the system
will only store one copy of the unique contents of each file, regardless of
the file's name.

(rev 3)
* Don't link empty files.
* Roll over to new file when filesystem maximum link count is reached.
* If link fails for another reason, leave non-linked file there.
* Depends on rsync-rename.diff

(rev 2)
* This revision is actually against CVS HEAD (I didn't realize I was working
  from a stale rsync'd CVS).
* Apply permissions after linking (permissions were lost if we already had
  a copy of the file in the link farm).

Patch Summary:

    -1   +1    Makefile.in
    -0   +351  hashlink.c (new)
    -1   +21   options.c
    -0   +6    proto.h
    -6   +21   receiver.c
    -3   +11   rsync.c
    -0   +8    rsync.h
-------------- next part --------------
patchwork diff hashlink.c
--- hashlink.c	1969-12-31 19:00:00.000000000 -0500
+++ hashlink.c	2004-02-17 12:53:46.000000000 -0500
@@ -0,0 +1,351 @@
+/*
+   Copyright (C) Cronosys, LLC 2004
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*/
+
+/* This file contains code used by the --link-by-hash option. */
+
+#include "rsync.h"
+
+extern char *link_by_hash_dir;
+
+#ifdef HAVE_LINK
+
+char* make_hash_name(struct file_struct *file)
+{
+	char hash[33], *dst;
+	unsigned char *src;
+	unsigned char c;
+	int i;
+
+	src = (unsigned char*)file->u.sum;
+	for (dst = hash, i = 0; i < 4; i++, src++) {
+		c = *src >> 4;
+		*(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
+		c = *src & 0x0f;
+		*(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
+	}
+	*dst++ = '/';
+	for (i = 0; i < 12; i++, src++) {
+		c = *src >> 4;
+		*(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
+		c = *src & 0x0f;
+		*(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
+	}
+	*dst = 0;
+
+	asprintf(&dst,"%s/%s",link_by_hash_dir,hash);
+	return dst;
+}
+
+
+void kill_hashfile(struct hashfile_struct *hashfile)
+{
+	if (!hashfile)
+		return;
+	free(hashfile->name);
+	close(hashfile->fd);
+	free(hashfile);
+}
+
+
+void kill_hashfiles(struct hashfile_struct *hashfiles)
+{
+	struct hashfile_struct *iter, *next;
+	if ((iter = hashfiles) != NULL) {
+		do {
+			next = iter->next;
+			kill_hashfile(iter);
+			iter = next;
+		} while (iter != hashfiles);
+	}
+}
+
+
+struct hashfile_struct *find_hashfiles(char *hashname, int64 size, long *fnbr)
+{
+	DIR *d;
+	struct dirent *di;
+	struct hashfile_struct *hashfiles = NULL, *hashfile;
+	STRUCT_STAT st;
+	long this_fnbr;
+
+	*fnbr = 0;
+	
+	/* Build a list of potential candidates and open
+	 * them. */
+	if ((d = opendir(hashname)) == NULL) {
+		rprintf(FERROR,"opendir \"%s\": %s\n",
+			hashname, strerror(errno));
+		free(hashname);
+		return NULL;
+	}
+	while ((di = readdir(d)) != NULL) {
+		if (!strcmp(di->d_name,".") || !strcmp(di->d_name,"..")) {
+			continue;
+		}
+
+		/* We need to have the largest fnbr in case we need to store
+		 * a new file. */
+		this_fnbr = atol(di->d_name);
+		if (this_fnbr > *fnbr)
+			*fnbr = this_fnbr;
+
+		hashfile = (struct hashfile_struct*)malloc(sizeof(struct hashfile_struct));
+		asprintf(&hashfile->name,"%s/%s",hashname,
+			 di->d_name);
+		if (do_stat(hashfile->name,&st) == -1) {
+			rprintf(FERROR,"%s: %s", hashfile->name,
+				strerror(errno));
+			kill_hashfile(hashfile);
+			continue;
+		}
+		if (st.st_size != size) {
+			kill_hashfile(hashfile);
+			continue;
+		}
+		hashfile->nlink = st.st_nlink;
+		hashfile->fd = open(hashfile->name,O_RDONLY|O_BINARY);
+		if (hashfile->fd == -1) {
+			rprintf(FERROR,"%s: %s\n", hashfile->name,
+				strerror(errno));
+			kill_hashfile(hashfile);
+			continue;
+		}
+		if (hashfiles == NULL)
+			hashfiles = hashfile->next = hashfile->prev = hashfile;
+		else {
+			hashfile->next = hashfiles;
+			hashfile->prev = hashfiles->prev;
+			hashfile->next->prev = hashfile;
+			hashfile->prev->next = hashfile;
+		}
+	}
+	closedir(d);
+
+	return hashfiles;
+}
+
+
+struct hashfile_struct *compare_hashfiles(int fd,struct hashfile_struct *files)
+{
+	int amt, hamt;
+	char buffer[BUFSIZ], cmpbuffer[BUFSIZ];
+	struct hashfile_struct *iter, *next, *best;
+	uint32 nlink;
+
+	if (!files)
+		return NULL;
+
+	iter = files; /* in case files are 0 bytes */
+	while ((amt = read(fd, buffer, BUFSIZ)) > 0) {
+		iter = files;
+		do {
+			/* Icky bit to resync when we steal the first node. */
+			if (!files)
+				files = iter;
+
+			next = iter->next;
+
+			hamt = read(iter->fd, cmpbuffer, BUFSIZ);
+			if (amt != hamt || memcmp(buffer, cmpbuffer, amt)) {
+				if (iter == files) {
+					files = files->prev;
+				}
+				if (iter->next == iter) {
+					files = next = NULL;
+				} else {
+					next = iter->next;
+					if (iter == files) {
+						/* So we know to resync */
+						files = NULL;
+					}
+				}
+				iter->next->prev = iter->prev;
+				iter->prev->next = iter->next;
+				kill_hashfile(iter);
+			}
+
+			iter = next;
+		} while (iter != files);
+
+		if (iter == NULL && files == NULL) {
+			/* There are no matches. */
+			return NULL;
+		}
+		
+	}
+
+	if (amt == -1) {
+		rprintf(FERROR,"%s",strerror(errno));
+		kill_hashfiles(files);
+		return NULL;
+	}
+
+	/* If we only have one file left, use it. */
+	if (files == files->next) {
+		return files;
+	}
+
+	/* All files which remain in the list are identical and should have
+	 * the same size.  We pick the one with the lowest link count (we
+	 * may have rolled over because we hit the maximum link count for
+	 * the filesystem). */
+	best = iter = files;
+	nlink = iter->nlink;
+	do {
+		if (iter->nlink < nlink) {
+			nlink = iter->nlink;
+			best = iter;
+		}
+		iter = iter->next;
+	} while (iter != files);
+
+	best->next->prev = best->prev;
+	best->prev->next = best->next;
+	if (files == best)
+		files = files->next;
+	kill_hashfiles(files);
+	return best;
+}
+
+
+int link_by_hash(char *fnametmp,char *fname,struct file_struct *file)
+{
+	STRUCT_STAT st;
+	char *hashname = make_hash_name(file);		
+	int first = 0, rc;
+	char *linkname;
+	long last_fnbr;
+
+	if (file->length == 0) {
+		return robust_rename(fnametmp,fname);
+	}
+
+	if (do_stat(hashname, &st) == -1) {
+		char *dirname;
+
+		/* Directory does not exist. */
+		dirname = strdup(hashname);
+		*strrchr(dirname,'/') = 0;
+		if (do_mkdir(dirname, 0755) == -1 && errno != EEXIST) {
+			rprintf(FERROR, "mkdir %s: %s\n", dirname,
+				strerror(errno));
+			free(hashname);
+			free(dirname);
+			return robust_rename(fnametmp,fname);
+		}
+		free(dirname);
+
+		if (do_mkdir(hashname, 0755) == -1 && errno != EEXIST) {
+			rprintf(FERROR, "mkdir %s: %s\n", hashname,
+				strerror(errno));
+			free(hashname);
+			return robust_rename(fnametmp,fname);
+		}
+
+		first = 1;
+		asprintf(&linkname,"%s/0",hashname);
+		rprintf(FINFO, "(1) linkname = %s\n", linkname);
+			
+	} else {
+		struct hashfile_struct *hashfiles, *hashfile;
+		int fd;
+
+		if (do_stat(fnametmp,&st) == -1) {
+			rprintf(FERROR,"%s: %s\n",fname,strerror(errno));
+			return -1;
+		}
+		hashfiles = find_hashfiles(hashname, st.st_size, &last_fnbr);
+
+		if (hashfiles == NULL) {
+			first = 1;
+			asprintf(&linkname,"%s/0",hashname);
+			rprintf(FINFO, "(2) linkname = %s\n", linkname);
+		} else {
+			
+			/* Search for one identical to us. */
+			if ((fd = open(fnametmp,O_RDONLY|O_BINARY)) == -1) {
+				rprintf(FERROR,"%s: %s\n",fnametmp,
+					strerror(errno));
+				kill_hashfiles(hashfiles);
+				return -1;
+			}
+			hashfile = compare_hashfiles(fd, hashfiles);
+			hashfiles = NULL;
+
+			if (hashfile) {
+				first = 0;
+				linkname = strdup(hashfile->name);
+				rprintf(FINFO, "(3) linkname = %s\n", linkname);
+				kill_hashfile(hashfile);
+			} else {
+				first = 1;
+				asprintf(&linkname, "%s/%ld", hashname,
+					 last_fnbr + 1);
+				rprintf(FINFO, "(4) linkname = %s\n", linkname);
+			}
+		}
+	}
+
+	if (!first) {
+		rprintf(FINFO, "link-by-hash (existing): \"%s\" -> %s\n",
+				linkname, full_fname(fname));
+		rc = do_link(linkname, fname);
+		if (rc == -1) {
+			if (errno == EMLINK) {
+				first = 1;
+				free(linkname);
+				asprintf(&linkname,"%s/%ld",hashname,
+					 last_fnbr + 1);
+				rprintf(FINFO, "(5) linkname = %s\n", linkname);
+				rprintf(FINFO,"link-by-hash: max link count exceeded, starting new file \"%s\".\n", linkname);
+			} else {
+				rprintf(FERROR,"link \"%s\" -> %s: %s\n",
+					linkname,full_fname(fname),
+					strerror(errno));
+				robust_unlink(fname);
+				rc = robust_rename(fnametmp,fname);
+			}
+		} else {
+			do_unlink(fnametmp);
+		}
+	}
+
+	if (first) {
+		rprintf(FINFO, "link-by-hash (new): %s -> \"%s\"\n",
+				full_fname(fname),linkname);
+
+		rc = robust_rename(fnametmp,fname);
+		if (rc != 0) {
+			rprintf(FERROR,"rename \"%s\" -> \"%s\": %s\n",
+				full_fname(fnametmp),full_fname(fname),
+				strerror(errno));
+		}
+		rc = do_link(fname,linkname);
+		if (rc != 0) {
+			rprintf(FERROR,"link \"%s\" -> \"%s\": %s\n",
+				full_fname(fname),linkname,
+				strerror(errno));
+		}
+	}
+
+	free(linkname);
+	free(hashname);
+	return rc;
+}
+
+#endif
patchwork diff Makefile.in
--- Makefile.in	2004-02-17 10:36:43.000000000 -0500
+++ Makefile.in	2004-02-17 10:36:44.000000000 -0500
@@ -35,7 +35,7 @@
 	main.o checksum.o match.o syscall.o log.o backup.o
 OBJS2=options.o flist.o io.o compat.o hlink.o token.o uidlist.o socket.o \
 	fileio.o batch.o clientname.o
-OBJS3=progress.o pipe.o
+OBJS3=progress.o pipe.o hashlink.o
 DAEMON_OBJ = params.o loadparm.o clientserver.o access.o connection.o authenticate.o
 popt_OBJS=popt/findme.o  popt/popt.o  popt/poptconfig.o \
 	popt/popthelp.o popt/poptparse.o
patchwork diff options.c
--- options.c	2004-02-17 10:36:44.000000000 -0500
+++ options.c	2004-02-17 10:36:44.000000000 -0500
@@ -119,6 +119,7 @@
 char *password_file = NULL;
 char *rsync_path = RSYNC_PATH;
 char *backup_dir = NULL;
+char *link_by_hash_dir = NULL;
 char backup_dir_buf[MAXPATHLEN];
 int rsync_port = RSYNC_PORT;
 int link_dest = 0;
@@ -264,6 +265,7 @@
   rprintf(F," -T  --temp-dir=DIR          create temporary files in directory DIR\n");
   rprintf(F,"     --compare-dest=DIR      also compare destination files relative to DIR\n");
   rprintf(F,"     --link-dest=DIR         create hardlinks to DIR for unchanged files\n");
+  rprintf(F,"     --link-by-hash=DIR      create hardlinks by hash to DIR for regular files\n");
   rprintf(F," -P                          equivalent to --partial --progress\n");
   rprintf(F," -z, --compress              compress file data\n");
   rprintf(F," -C, --cvs-exclude           auto ignore files in the same way CVS does\n");
@@ -303,7 +305,7 @@
 enum {OPT_VERSION = 1000, OPT_SENDER, OPT_EXCLUDE, OPT_EXCLUDE_FROM,
       OPT_DELETE_AFTER, OPT_DELETE_EXCLUDED, OPT_LINK_DEST,
       OPT_INCLUDE, OPT_INCLUDE_FROM, OPT_MODIFY_WINDOW,
-      OPT_READ_BATCH, OPT_WRITE_BATCH};
+      OPT_READ_BATCH, OPT_WRITE_BATCH, OPT_LINK_BY_HASH};
 
 static struct poptOption long_options[] = {
   /* longName, shortName, argInfo, argPtr, value, descrip, argDesc */
@@ -359,6 +361,7 @@
   {"temp-dir",        'T', POPT_ARG_STRING, &tmpdir, 0, 0, 0 },
   {"compare-dest",     0,  POPT_ARG_STRING, &compare_dest, 0, 0, 0 },
   {"link-dest",        0,  POPT_ARG_STRING, 0,              OPT_LINK_DEST, 0, 0 },
+  {"link-by-hash",     0,  POPT_ARG_STRING, 0,              OPT_LINK_BY_HASH, 0, 0},
   /* TODO: Should this take an optional int giving the compression level? */
   {"compress",        'z', POPT_ARG_NONE,   &do_compression, 0, 0, 0 },
   {"daemon",           0,  POPT_ARG_NONE,   &daemon_opt, 0, 0, 0 },
@@ -577,6 +580,18 @@
 			return 0;
 #endif
 
+                case OPT_LINK_BY_HASH:
+#if HAVE_LINK
+			link_by_hash_dir = (char *)poptGetOptArg(pc);
+			checksum_seed = FIXED_CHECKSUM_SEED;
+			break;
+#else
+			snprintf(err_buf, sizeof err_buf,
+				 "hard links are not supported on this %s\n",
+				 am_server ? "server" : "client");
+			rprintf(FERROR, "ERROR: %s", err_buf);
+			return 0;
+#endif
 
 		default:
 			snprintf(err_buf, sizeof err_buf,
@@ -921,6 +936,11 @@
 		args[ac++] = compare_dest;
 	}
 
+	if (link_by_hash_dir && am_sender) {
+		args[ac++] = "--link-by-hash";
+		args[ac++] = link_by_hash_dir;
+	}
+
 	if (files_from && (!am_sender || remote_filesfrom_file)) {
 		if (remote_filesfrom_file) {
 			args[ac++] = "--files-from";
patchwork diff proto.h
--- proto.h	2004-02-17 10:36:44.000000000 -0500
+++ proto.h	2004-02-17 10:43:31.000000000 -0500
@@ -93,6 +93,12 @@
 void write_sum_head(int f, struct sum_struct *sum);
 void recv_generator(char *fname, struct file_struct *file, int i, int f_out);
 void generate_files(int f, struct file_list *flist, char *local_name);
+char* make_hash_name(struct file_struct *file);
+void kill_hashfile(struct hashfile_struct *hashfile);
+void kill_hashfiles(struct hashfile_struct *hashfiles);
+struct hashfile_struct *find_hashfiles(char *hashname, int64 size, long *fnbr);
+struct hashfile_struct *compare_hashfiles(int fd,struct hashfile_struct *files);
+int link_by_hash(char *fnametmp,char *fname,struct file_struct *file);
 void init_hard_links(struct file_list *flist);
 int hard_link_check(struct file_struct *file, int skip);
 void do_hard_links(void);
patchwork diff receiver.c
--- receiver.c	2004-02-17 10:36:44.000000000 -0500
+++ receiver.c	2004-02-17 13:10:23.000000000 -0500
@@ -186,10 +186,11 @@
 
 
 static int receive_data(int f_in,struct map_struct *mapbuf,int fd,char *fname,
-			OFF_T total_size)
+			OFF_T total_size,char *md4)
 {
 	int i;
 	struct sum_struct sum;
+	struct mdfour mdfour_data;
 	unsigned int len;
 	OFF_T offset = 0;
 	OFF_T offset2;
@@ -199,7 +200,9 @@
 	char *map=NULL;
 
 	read_sum_head(f_in, &sum);
-
+	if (md4)
+		mdfour_begin(&mdfour_data);
+	
 	sum_init();
 
 	while ((i = recv_token(f_in, &data)) != 0) {
@@ -216,6 +219,8 @@
 			cleanup_got_literal = 1;
 
 			sum_update(data,i);
+			if (md4)
+				mdfour_update(&mdfour_data,data,i);
 
 			if (fd != -1 && write_file(fd,data,i) != i) {
 				rprintf(FERROR, "write failed on %s: %s\n",
@@ -243,6 +248,8 @@
 
 			see_token(map, len);
 			sum_update(map,len);
+			if (md4)
+				mdfour_update(&mdfour_data,map,len);
 		}
 
 		if (fd != -1 && write_file(fd,map,len) != (int) len) {
@@ -265,6 +272,8 @@
 	}
 
 	sum_end(file_sum1);
+	if (md4)
+		mdfour_result(&mdfour_data, (unsigned char*)md4);
 
 	read_buf(f_in,file_sum2,MD4_SUM_LENGTH);
 	if (verbose > 2) {
@@ -299,6 +308,7 @@
 	extern int preserve_perms;
 	extern int delete_after;
 	extern int orig_umask;
+	extern char *link_by_hash_dir;
 	struct stats initial_stats;
 
 	if (verbose > 2) {
@@ -372,7 +382,7 @@
 		if (fd1 != -1 && do_fstat(fd1,&st) != 0) {
 			rprintf(FERROR, "fstat %s failed: %s\n",
 				full_fname(fnamecmp), strerror(errno));
-			receive_data(f_in,NULL,-1,NULL,file->length);
+			receive_data(f_in,NULL,-1,NULL,file->length,NULL);
 			close(fd1);
 			continue;
 		}
@@ -385,7 +395,7 @@
 			 */
 			rprintf(FERROR,"recv_files: %s is a directory\n",
 				full_fname(fnamecmp));
-			receive_data(f_in, NULL, -1, NULL, file->length);
+			receive_data(f_in,NULL,-1,NULL,file->length,NULL);
 			close(fd1);
 			continue;
 		}
@@ -437,7 +447,7 @@
 		if (fd2 == -1) {
 			rprintf(FERROR, "mkstemp %s failed: %s\n",
 				full_fname(fnametmp), strerror(errno));
-			receive_data(f_in,mapbuf,-1,NULL,file->length);
+			receive_data(f_in,mapbuf,-1,NULL,file->length,NULL);
 			if (mapbuf) unmap_file(mapbuf);
 			if (fd1 != -1) close(fd1);
 			continue;
@@ -450,7 +460,12 @@
 		}
 
 		/* recv file data */
-		recv_ok = receive_data(f_in,mapbuf,fd2,fname,file->length);
+#ifdef HAVE_LINK
+		if (link_by_hash_dir) {
+			file->u.sum = (char*)malloc (MD4_SUM_LENGTH);
+		}
+#endif
+		recv_ok = receive_data(f_in,mapbuf,fd2,fname,file->length,file->u.sum);
 
 		log_recv(file, &initial_stats);
 
patchwork diff rsync.c
--- rsync.c	2004-02-17 10:36:44.000000000 -0500
+++ rsync.c	2004-02-17 10:43:07.000000000 -0500
@@ -33,6 +33,7 @@
 extern int preserve_gid;
 extern int preserve_perms;
 extern int make_backups;
+extern char *link_by_hash_dir;
 
 
 /*
@@ -234,13 +235,20 @@
 	if (make_backups && !make_backup(fname))
 		return;
 
+#ifdef HAVE_LINK
+	if (link_by_hash_dir) {
+		if (link_by_hash(fnametmp,fname,file) != 0)
+			return;
+	} else
+#endif
 	/* move tmp file over real file */
 	if (robust_rename(fnametmp,fname) != 0) {
 		rprintf(FERROR,"rename %s -> \"%s\": %s\n",
 			full_fname(fnametmp), fname, strerror(errno));
-	} else {
-		set_perms(fname,file,NULL,0);
-	}
+		return;
+	}	
+
+	set_perms(fname,file,NULL,0);
 }
 
 const char *who_am_i(void)
patchwork diff rsync.h
--- rsync.h	2004-02-17 10:36:44.000000000 -0500
+++ rsync.h	2004-02-17 10:36:44.000000000 -0500
@@ -513,6 +513,14 @@
 	int current_file_index;
 };
 
+struct hashfile_struct {
+	struct hashfile_struct *next;
+	struct hashfile_struct *prev;
+	char *name;
+	int fd;
+	uint32 nlink;
+};
+
 
 /* we need this function because of the silly way in which duplicate
    entries are handled in the file lists - we can't change this


More information about the rsync mailing list