patch to enable faster mirroring of large filesystems

Andrew J. Schorr ajschorr at yahoo.com
Tue Nov 20 08:06:45 EST 2001


I have attached a patch that adds 4 options to rsync that have helped
me to speed up my mirroring.  I hope this is useful to someone else,
but I fear that my relative inexperience with rsync has caused me to
miss a way to do what I want without having to patch the code.  So please
let me know if I'm all wet.

Here's my story: I have a large filesystem (around 20 gigabytes of data)
that I'm mirroring over a T1 link to a backup site.  Each night, 
about 600 megabytes of data needs to be transferred to the backup site.
Much of this data has been appended to the end of various existing files,
so a tool like rsync that sends partial updates instead of the whole
file is appropriate.

Normally, one could just use rsync with the --recursive and --delete features
to do this.  However, this takes a lot more time than necessary, basically
because rsync spends a lot of time walking through the directory tree
(which contains over 300,000 files).

One can speed this up by caching a listing of the directory tree.  I maintain
an additional state file at the backup site that contains a listing
of the state of the tree after the last backup operation.  This is essentially
equivalent to saving the output of "find . -ls" in a file.

Then, the next night, one generates the updated directory tree for the source
file system and does a diff with the directory listing on the backup file
system to find out what has changed.  This seems to be much faster than
using rsync's recursive and delete features.

I have my own script and programs to delete any files that have been removed,
and then I just need to update the files that have been added or changed.
One could use cpio for this, but it's too slow when only partial files
have changed.

So I added the following options to rsync:

     --source-list           SRC arg will be a (local) file name containing a list of files, or - to read file names from stdin
     --null                  used with --source-list to indicate that the file names will be separated by null (zero) bytes instead of linefeed characters; useful with gfind -print0
     --send-dirs             send directory entries even though not in recursive mode
     --no-implicit-dirs      do not send implicit directories (parents of the file being sent)

The --source-list option allows me to supply an explicit list of filenames
to transport without using the --recursive feature and without playing
around with include and exclude files.  I'm not really clear on whether
the include and exclude files could have gotten me to the same place, but it
seems to me that they work hand-in-hand with the --recursive feature that
I don't want to use.

The --null flag allows me to handle file names with embedded linefeeds.  This
is in the style of GNU find's -print0 option.

The --send-dirs option overcomes a problem where rsync refuses to send directories
unless it's in recursive mode.  One needs this to make sure that even
empty directories get mirrored.

And the --no-implicit-dirs option turns off the default behavior in which
all the parent directories of a file are transmitted before sending the
file.  That default behavior is very inefficient in my scenario where I
am taking the responsibility for sending those directories myself.

So, the patch is attached.  If you think it's an abomination, please let
me know what the better solution is.  If you would like some elaboration
on how this stuff really works, please let me know.

Cheers,
Andy
-------------- next part --------------
--- flist.c.orig	Tue Sep  5 22:46:43 2000
+++ flist.c	Fri Nov  9 12:01:56 2001
@@ -30,6 +30,7 @@
 extern int cvs_exclude;
 
 extern int recurse;
+extern int send_dirs;
 
 extern int one_file_system;
 extern int make_backups;
@@ -501,8 +502,8 @@
 	/* we use noexcludes from backup.c */
 	if (noexcludes) goto skip_excludes;
 
-	if (S_ISDIR(st.st_mode) && !recurse) {
-		rprintf(FINFO,"skipping directory %s\n",fname);
+	if (S_ISDIR(st.st_mode) && !recurse && !send_dirs) {
+		rprintf(FINFO,"make_file: skipping directory %s\n",fname);
 		return NULL;
 	}
 	
@@ -689,14 +690,16 @@
 }
 
 
-struct file_list *send_file_list(int f,int argc,char *argv[])
+static struct file_list *send_file_list_proc(int f,char *(*ffunc)(), void *opq)
 {
-	int i,l;
+	int l;
 	STRUCT_STAT st;
 	char *p,*dir,*olddir;
 	char lastpath[MAXPATHLEN]="";
 	struct file_list *flist;
 	int64 start_write;
+	char *in_fn;
+	extern int implicit_dirs;
 
 	if (verbose && recurse && !am_server && f != -1) {
 		rprintf(FINFO,"building file list ... ");
@@ -711,10 +714,10 @@
 		io_start_buffering(f);
 	}
 
-	for (i=0;i<argc;i++) {
+	while ((in_fn = (*ffunc)(opq)) != NULL) {
 		char *fname = topsrcname;
 
-		strlcpy(fname,argv[i],MAXPATHLEN);
+		strlcpy(fname,in_fn,MAXPATHLEN);
 
 		l = strlen(fname);
 		if (l != 1 && fname[l-1] == '/') {
@@ -738,8 +741,8 @@
 			continue;
 		}
 
-		if (S_ISDIR(st.st_mode) && !recurse) {
-			rprintf(FINFO,"skipping directory %s\n",fname);
+		if (S_ISDIR(st.st_mode) && !recurse && !send_dirs) {
+			rprintf(FINFO,"send_file_list: skipping directory %s\n",fname);
 			continue;
 		}
 
@@ -756,7 +759,7 @@
 					dir = fname;      
 				fname = p+1;      
 			}
-		} else if (f != -1 && (p=strrchr(fname,'/'))) {
+		} else if (f != -1 && (p=strrchr(fname,'/')) && implicit_dirs) {
 			/* this ensures we send the intermediate directories,
 			   thus getting their permissions right */
 			*p = 0;
@@ -849,6 +852,49 @@
 	return flist;
 }
 
+struct argv_data {   /* cursor over an argv-style array; consumed by get_arg() */
+   int argc;   /* number of entries not yet handed out */
+   char **argv;   /* points at the next entry to hand out */
+};
+
+static char *
+get_arg(struct argv_data *ad)   /* name source passed to send_file_list_proc: hands out argv entries one at a time */
+{
+   return (ad->argc-- > 0) ? *(ad->argv++) : NULL;   /* next entry (advancing the cursor), or NULL once the count is used up */
+}
+
+struct file_list *send_file_list(int f,int argc,char *argv[])   /* original entry point: file names come from an argv-style array */
+{
+   struct argv_data arg_info;   /* iteration state for get_arg; lives only for this call */
+
+   arg_info.argc = argc;
+   arg_info.argv = argv;
+   return send_file_list_proc(f,get_arg,&arg_info);   /* send_file_list_proc calls get_arg(&arg_info) until it returns NULL */
+}
+
+/* Read the next file name from fp: bytes up to the record separator list_rs (or EOF) are collected in a static buffer. Names longer than MAXPATHLEN-1 are silently truncated here, which loses nothing because send_file_list_proc copies the name with strlcpy(...,MAXPATHLEN) anyway.
+   Returns NULL only when EOF is hit with nothing read. NOTE(review): an empty record (e.g. a blank line) is returned as "" rather than skipped, and the caller then evaluates fname[l-1] with l == 0 -- confirm whether empty records should be filtered. */
+static char *
+get_stdio(FILE *fp)
+{
+   static char fnbuf[MAXPATHLEN];   /* static: the returned pointer is overwritten by the next call */
+   char *s = fnbuf;   /* write cursor */
+   char *eob = &fnbuf[sizeof(fnbuf)-1];   /* last slot, reserved for the terminating NUL */
+   int cc;   /* int, not char, so EOF stays distinguishable from data bytes */
+   extern int list_rs;   /* record separator: '\n' by default, '\0' with --null */
+
+   while (((cc = getc(fp)) != list_rs) && (cc != EOF)) {
+      if (s < eob)   /* excess bytes are still consumed, just not stored */
+	 *(s++) = cc;
+   }
+   *s = '\0';
+   return ((cc == EOF) && (s == fnbuf)) ? NULL : fnbuf;   /* a final name without a trailing separator is still returned */
+}
+
+struct file_list *send_file_list_fp(int f,FILE *fp)   /* --source-list entry point: file names are read from the stream fp */
+{
+   return send_file_list_proc(f,get_stdio,fp);   /* send_file_list_proc calls get_stdio(fp) until it returns NULL */
+}
 
 struct file_list *recv_file_list(int f)
 {
--- main.c.orig	Fri Nov  2 14:48:47 2001
+++ main.c	Thu Nov  8 18:01:37 2001
@@ -23,6 +23,8 @@
 
 struct stats stats;
 
+static FILE *src_list_fp;
+
 extern int verbose;
 
 
@@ -480,11 +482,13 @@
 		extern int cvs_exclude;
 		extern int delete_mode;
 		extern int delete_excluded;
+		extern int source_list;
 		if (cvs_exclude)
 			add_cvs_excludes();
 		if (delete_mode && !delete_excluded) 
 			send_exclude_list(f_out);
-		flist = send_file_list(f_out,argc,argv);
+		flist = (source_list ? send_file_list_fp(f_out,src_list_fp) :
+			 send_file_list(f_out,argc,argv));
 		if (verbose > 3) 
 			rprintf(FINFO,"file list sent\n");
 
@@ -677,6 +681,7 @@
 	extern int dry_run;
 	extern int am_daemon;
 	extern int am_server;
+	extern int source_list;
 
 	signal(SIGUSR1, sigusr1_handler);
 	signal(SIGUSR2, sigusr2_handler);
@@ -702,6 +707,14 @@
 	argc -= optind;
 	argv += optind;
 	optind = 0;
+
+	if (source_list &&
+	    ((argc != 2) ||
+	     !(src_list_fp = (strcmp(argv[0],"-") ?
+			      fopen(argv[0],"r") : stdin)))) {
+		usage(FERROR);
+		exit_cleanup(RERR_SYNTAX);
+	}
 
 	signal(SIGCHLD,SIG_IGN);
 	signal(SIGINT,SIGNAL_CAST sig_int);
--- options.c.orig	Tue Sep  5 22:46:43 2000
+++ options.c	Fri Nov  9 12:03:39 2001
@@ -53,6 +53,10 @@
 int module_id = -1;
 int am_server = 0;
 int am_sender=0;
+int source_list=0;
+int list_rs='\n';
+int send_dirs=0;
+int implicit_dirs=1;
 int recurse = 0;
 int am_daemon=0;
 int do_stats=0;
@@ -172,6 +176,10 @@
   rprintf(F,"     --log-format=FORMAT     log file transfers using specified format\n");  
   rprintf(F,"     --password-file=FILE    get password from FILE\n");
   rprintf(F,"     --bwlimit=KBPS          limit I/O bandwidth, KBytes per second\n");
+  rprintf(F,"     --source-list           SRC arg will be a (local) file name containing a list of files, or - to read file names from stdin\n");
+  rprintf(F,"     --null                  used with --source-list to indicate that the file names will be separated by null (zero) bytes instead of linefeed characters; useful with gfind -print0\n");
+  rprintf(F,"     --send-dirs             send directory entries even though not in recursive mode\n");
+  rprintf(F,"     --no-implicit-dirs      do not send implicit directories (parents of the file being sent)\n");
   rprintf(F," -h, --help                  show this help screen\n");
 
   rprintf(F,"\n");
@@ -188,7 +196,8 @@
       OPT_LOG_FORMAT, OPT_PASSWORD_FILE, OPT_SIZE_ONLY, OPT_ADDRESS,
       OPT_DELETE_AFTER, OPT_EXISTING, OPT_MAX_DELETE, OPT_BACKUP_DIR, 
       OPT_IGNORE_ERRORS, OPT_BWLIMIT, OPT_BLOCKING_IO,
-      OPT_MODIFY_WINDOW};
+      OPT_MODIFY_WINDOW, OPT_SOURCE_LIST, OPT_NULL, OPT_SEND_DIRS,
+      OPT_NO_IMPLICIT_DIRS};
 
 static char *short_options = "oblLWHpguDCtcahvqrRIxnSe:B:T:zP";
 
@@ -255,6 +264,10 @@
   {"address",     1,     0,    OPT_ADDRESS},
   {"max-delete",  1,     0,    OPT_MAX_DELETE},
   {"backup-dir",  1,     0,    OPT_BACKUP_DIR},
+  {"source-list", 0,     0,    OPT_SOURCE_LIST},
+  {"null",        0,     0,    OPT_NULL},
+  {"send-dirs",   0,     0,    OPT_SEND_DIRS},
+  {"no-implicit-dirs", 0, 0,   OPT_NO_IMPLICIT_DIRS},
   {0,0,0,0}};
 
 
@@ -594,6 +607,22 @@
 
 		case OPT_BACKUP_DIR:
 			backup_dir = optarg;
+			break;
+
+		case OPT_SOURCE_LIST:
+			source_list = 1;
+			break;
+
+		case OPT_NULL:
+			list_rs = '\0';
+			break;
+
+		case OPT_SEND_DIRS:
+			send_dirs = 1;
+			break;
+
+		case OPT_NO_IMPLICIT_DIRS:
+			implicit_dirs = 0;
 			break;
 
 		default:
--- proto.h.orig	Tue Sep  5 22:46:43 2000
+++ proto.h	Wed Nov  7 17:08:59 2001
@@ -46,6 +46,7 @@
 void send_file_name(int f,struct file_list *flist,char *fname,
 			   int recursive, unsigned base_flags);
 struct file_list *send_file_list(int f,int argc,char *argv[]);
+struct file_list *send_file_list_fp(int f,FILE *fp);
 struct file_list *recv_file_list(int f);
 int file_compare(struct file_struct **f1,struct file_struct **f2);
 int flist_find(struct file_list *flist,struct file_struct *f);


More information about the rsync mailing list