TODO hardlink performance optimizations

jw schultz jw at pegasys.ws
Sun Jan 4 13:30:03 GMT 2004


On Sun, Jan 04, 2004 at 06:35:03AM -0600, John Van Essen wrote:
> Lester,
> 
> You articulated your situation clearly enough for me.  Thanks.
> 
> I'll address your issue: when rsync is running locally to sync /vol/N
> to /vol/N_mirror, it exhausts all of the RAM and swap.
> 
> If you haven't read jw schultz's "How Rsync Works" page, here's the link:
> 
>   http://www.pegasys.ws/how-rsync-works.html
> 
> The sender, receiver, and generator each have a full copy of the file
> list (each file's entry uses 100 bytes on average).
> 
> Additionally, the --hard-links option creates yet *another* full copy of
> the file list in the receiver, so that's even more memory consumed.
> 
> So you are in a world o' hurt rsyncing an entire /vol/N internally
> with --hard-links, since there will be FOUR copies of the file list.
> 
> I'd suggest breaking the /vol/N rsync up into separate rsyncs for each
> of the maxdepth 1 hierarchies.  If I understand your situation correctly,
> all hard link groups are self contained within each of those hierarchies
> so you will be OK.
> 
> I've modified hlink.c to use a list of file struct pointers instead of
> copies of the actual file structs themselves, so that will save memory.
> I'll submit that patch for review in a day or two after I've tested it.

I've just done the same.  It reduces the memory requirements
of the hlink list to 1/18th.  It is also somewhat faster to
build that way because we don't have to walk the list.

If we built the hlink_list one element at a time, the way we
do the file_list, putting only those files that we might link
into it, it would be smaller, but building it would be slower.

I've only done a little testing, but it seems to be working,
and, warnings about theory v. practice aside, it should be
good.


===================================================================
RCS file: /data/cvs/rsync/hlink.c,v
retrieving revision 1.23
diff -p -u -r1.23 hlink.c
--- hlink.c	2 Jan 2004 07:34:49 -0000	1.23
+++ hlink.c	4 Jan 2004 13:21:14 -0000
@@ -24,45 +24,43 @@ extern int dry_run;
 extern int verbose;
 
 #if SUPPORT_HARD_LINKS
-static int hlink_compare(struct file_struct *f1, struct file_struct *f2)
+static int hlink_compare(struct file_struct **f1, struct file_struct **f2)
 {
-	if (!S_ISREG(f1->mode) && !S_ISREG(f2->mode))
+	if (!S_ISREG((*f1)->mode) && !S_ISREG((*f2)->mode))
 		return 0;
-	if (!S_ISREG(f1->mode))
+	if (!S_ISREG((*f1)->mode))
 		return -1;
-	if (!S_ISREG(f2->mode))
+	if (!S_ISREG((*f2)->mode))
 		return 1;
 
-	if (f1->dev != f2->dev)
-		return (int) (f1->dev > f2->dev ? 1 : -1);
+	if ((*f1)->dev != (*f2)->dev)
+		return (int) ((*f1)->dev > (*f2)->dev ? 1 : -1);
 
-	if (f1->inode != f2->inode)
-		return (int) (f1->inode > f2->inode ? 1 : -1);
+	if ((*f1)->inode != (*f2)->inode)
+		return (int) ((*f1)->inode > (*f2)->inode ? 1 : -1);
 
-	return file_compare(&f1, &f2);
+
+	return file_compare(f1, f2);
 }
 
 
-static struct file_struct *hlink_list;
+static struct file_struct **hlink_list;
 static int hlink_count;
 #endif
 
 void init_hard_links(struct file_list *flist)
 {
 #if SUPPORT_HARD_LINKS
-	int i;
 	if (flist->count < 2)
 		return;
 
 	if (hlink_list)
 		free(hlink_list);
 
-	if (!(hlink_list = new_array(struct file_struct, flist->count)))
+	if (!(hlink_list = new_array(struct file_struct *, flist->count)))
 		out_of_memory("init_hard_links");
-
-	for (i = 0; i < flist->count; i++)
-		memcpy(&hlink_list[i], flist->files[i],
-		       sizeof(hlink_list[0]));
+	
+	memcpy(hlink_list, flist->files, sizeof(hlink_list[0]) * flist->count);	
 
 	qsort(hlink_list, flist->count,
 	      sizeof(hlink_list[0]), (int (*)()) hlink_compare);
@@ -84,7 +82,7 @@ int check_hard_link(struct file_struct *
 
 	while (low != high) {
 		int mid = (low + high) / 2;
-		ret = hlink_compare(&hlink_list[mid], file);
+		ret = hlink_compare(&hlink_list[mid], &file);
 		if (ret == 0) {
 			low = mid;
 			break;
@@ -97,16 +95,16 @@ int check_hard_link(struct file_struct *
 
 	/* XXX: To me this looks kind of dodgy -- why do we use [low]
 	 * here and [low-1] below? -- mbp */
-	if (hlink_compare(&hlink_list[low], file) != 0)
+	if (hlink_compare(&hlink_list[low], &file) != 0)
 		return 0;
 
 	if (low > 0 &&
-	    S_ISREG(hlink_list[low - 1].mode) &&
-	    file->dev == hlink_list[low - 1].dev &&
-	    file->inode == hlink_list[low - 1].inode) {
+	    S_ISREG(hlink_list[low - 1]->mode) &&
+	    file->dev == hlink_list[low - 1]->dev &&
+	    file->inode == hlink_list[low - 1]->inode) {
 		if (verbose >= 2) {
 			rprintf(FINFO, "check_hard_link: \"%s\" is a hard link to file %d, \"%s\"\n",
-				f_name(file), low-1, f_name(&hlink_list[low-1]));
+				f_name(file), low-1, f_name(hlink_list[low-1]));
 		}
 		return 1;
 	}
@@ -120,12 +118,12 @@ int check_hard_link(struct file_struct *
 static void hard_link_one(int i)
 {
 	STRUCT_STAT st1, st2;
-	char *hlink2, *hlink1 = f_name(&hlink_list[i - 1]);
+	char *hlink2, *hlink1 = f_name(hlink_list[i - 1]);
 
 	if (link_stat(hlink1, &st1) != 0)
 		return;
 
-	hlink2 = f_name(&hlink_list[i]);
+	hlink2 = f_name(hlink_list[i]);
 	if (link_stat(hlink2, &st2) != 0) {
 		if (do_link(hlink1, hlink2)) {
 			if (verbose > 0) {
@@ -166,11 +164,11 @@ void do_hard_links(void)
 		return;
 
 	for (i = 1; i < hlink_count; i++) {
-		if (S_ISREG(hlink_list[i].mode) &&
-		    S_ISREG(hlink_list[i - 1].mode) &&
-		    hlink_list[i].basename && hlink_list[i - 1].basename &&
-		    hlink_list[i].dev == hlink_list[i - 1].dev &&
-		    hlink_list[i].inode == hlink_list[i - 1].inode) {
+		if (S_ISREG(hlink_list[i]->mode) &&
+		    S_ISREG(hlink_list[i - 1]->mode) &&
+		    hlink_list[i]->basename && hlink_list[i - 1]->basename &&
+		    hlink_list[i]->dev == hlink_list[i - 1]->dev &&
+		    hlink_list[i]->inode == hlink_list[i - 1]->inode) {
 			hard_link_one(i);
 		}
 	}



More information about the rsync mailing list