TODO hardlink performance optimizations
John Van Essen
vanes002 at umn.edu
Mon Jan 5 07:44:37 GMT 2004
On Sun, 4 Jan 2004, jw schultz <jw at pegasys.ws> wrote:
> On Sun, Jan 04, 2004 at 06:35:03AM -0600, John Van Essen wrote:
....
>> I've modified hlink.c to use a list of file struct pointers instead of
>> copies of the actual file structs themselves, so that will save memory.
>> I'll submit that patch for review in a day or two after I've tested it.
>
> I've just done the same. It reduces the memory requirements
> of the hlink list to 1/18th. It is also somewhat faster to
> build that way because we don't have to walk the list.
>
> If we built the hlink_list one element at a time the way we
> do the file_list only putting those files that we might link
> in it it would be smaller but building it would be slower.
Right. But as you noted in a later followup, it's a real time-saver
for searches in a filelist with a typical percentage of hardlinks.
> I've only done a little testing but it seems to be working
> and warnings about theory v. practice aside it should be
> good.
Your changes are almost identical to mine, so I will address only the
main differences...
> ===================================================================
> RCS file: /data/cvs/rsync/hlink.c,v
> retrieving revision 1.23
> diff -p -u -r1.23 hlink.c
> --- hlink.c 2 Jan 2004 07:34:49 -0000 1.23
> +++ hlink.c 4 Jan 2004 13:21:14 -0000
> @@ -24,45 +24,43 @@ extern int dry_run;
> extern int verbose;
>
> #if SUPPORT_HARD_LINKS
> -static int hlink_compare(struct file_struct *f1, struct file_struct *f2)
> +static int hlink_compare(struct file_struct **f1, struct file_struct **f2)
> {
> - if (!S_ISREG(f1->mode) && !S_ISREG(f2->mode))
> + if (!S_ISREG((*f1)->mode) && !S_ISREG((*f2)->mode))
> return 0;
> - if (!S_ISREG(f1->mode))
> + if (!S_ISREG((*f1)->mode))
> return -1;
> - if (!S_ISREG(f2->mode))
> + if (!S_ISREG((*f2)->mode))
> return 1;
>
> - if (f1->dev != f2->dev)
> - return (int) (f1->dev > f2->dev ? 1 : -1);
> + if ((*f1)->dev != (*f2)->dev)
> + return (int) ((*f1)->dev > (*f2)->dev ? 1 : -1);
>
> - if (f1->inode != f2->inode)
> - return (int) (f1->inode > f2->inode ? 1 : -1);
> + if ((*f1)->inode != (*f2)->inode)
> + return (int) ((*f1)->inode > (*f2)->inode ? 1 : -1);
>
> - return file_compare(&f1, &f2);
> +
> + return file_compare(f1, f2);
> }
I did that differently (and more simply, I think). Changed only one line
and added two lines:
@@ -24,8 +24,11 @@
extern int verbose;
#if SUPPORT_HARD_LINKS
-static int hlink_compare(struct file_struct *f1, struct file_struct *f2)
+static int hlink_compare(struct file_struct **f1p, struct file_struct **f2p)
{
+ struct file_struct *f1 = *f1p;
+ struct file_struct *f2 = *f2p;
+
if (!S_ISREG(f1->mode) && !S_ISREG(f2->mode))
return 0;
if (!S_ISREG(f1->mode))
>
>
> -static struct file_struct *hlink_list;
> +static struct file_struct **hlink_list;
> static int hlink_count;
> #endif
>
> void init_hard_links(struct file_list *flist)
> {
> #if SUPPORT_HARD_LINKS
> - int i;
> if (flist->count < 2)
> return;
>
> if (hlink_list)
> free(hlink_list);
>
> - if (!(hlink_list = new_array(struct file_struct, flist->count)))
> + if (!(hlink_list = new_array(struct file_struct *, flist->count)))
> out_of_memory("init_hard_links");
> -
> - for (i = 0; i < flist->count; i++)
> - memcpy(&hlink_list[i], flist->files[i],
> - sizeof(hlink_list[0]));
> +
> + memcpy(hlink_list, flist->files, sizeof(hlink_list[0]) * flist->count);
Nice optimization but we will need to walk it anyway to extract only
the candidates for hardlinking (discussed separately):
>
> qsort(hlink_list, flist->count,
> sizeof(hlink_list[0]), (int (*)()) hlink_compare);
> @@ -84,7 +82,7 @@ int check_hard_link(struct file_struct *
>
> while (low != high) {
> int mid = (low + high) / 2;
> - ret = hlink_compare(&hlink_list[mid], file);
> + ret = hlink_compare(&hlink_list[mid], &file);
> if (ret == 0) {
> low = mid;
> break;
> @@ -97,16 +95,16 @@ int check_hard_link(struct file_struct *
>
> /* XXX: To me this looks kind of dodgy -- why do we use [low]
> * here and [low-1] below? -- mbp */
> - if (hlink_compare(&hlink_list[low], file) != 0)
> + if (hlink_compare(&hlink_list[low], &file) != 0)
> return 0;
>
> if (low > 0 &&
I think it's time to explain what's going on, so I changed the comment
and added another one. I also optimized to skip the extra hlink_compare
if it was already matched in the loop.
....
- /* XXX: To me this looks kind of dodgy -- why do we use [low]
- * here and [low-1] below? -- mbp */
- if (hlink_compare(&hlink_list[low], file) != 0)
+ /* Verify the match for this file. If no match, cannot skip. */
+ if (!(ret == 0 || hlink_compare(&hlink_list[low], &file) == 0))
return 0;
+ /* If the immediately preceding entry in the hlink list is the
+ * same dev and inode, then there is at least one matching
+ * entry earlier in the list and this file can be skipped
+ * since it will eventually be hardlinked to the earlier file.
+ */
if (low > 0 &&
My changes from here on down were the same as yours.
--
John Van Essen Univ of MN Alumnus <vanes002 at umn.edu>
More information about the rsync
mailing list