TODO hardlink performance optimizations

John Van Essen vanes002 at umn.edu
Mon Jan 5 07:44:37 GMT 2004


On Sun, 4 Jan 2004, jw schultz <jw at pegasys.ws> wrote:
> On Sun, Jan 04, 2004 at 06:35:03AM -0600, John Van Essen wrote:
....
>> I've modified hlink.c to use a list of file struct pointers instead of
>> copies of the actual file structs themselves, so that will save memory.
>> I'll submit that patch for review in a day or two after I've tested it.
> 
> I've just done the same.  It reduces the memory requirements
> of the hlink list to 1/18th.  It is also somewhat faster to
> build that way because we don't have to walk the list.
> 
> If we built the hlink_list one element at a time the way we
> do the file_list only putting those files that we might link
> in it it would be smaller but building it would be slower.

Right.  But as you noted in a later followup, it's a real time-saver
for searches in a filelist with a typical percentage of hardlinks.
 
> I've only done a little testing but it seems to be working
> and warnings about theory v. practice aside it should be
> good.

Your changes are almost identical to mine, so I will address only the
main differences...

> ===================================================================
> RCS file: /data/cvs/rsync/hlink.c,v
> retrieving revision 1.23
> diff -p -u -r1.23 hlink.c
> --- hlink.c        2 Jan 2004 07:34:49 -0000        1.23
> +++ hlink.c        4 Jan 2004 13:21:14 -0000
> @@ -24,45 +24,43 @@ extern int dry_run;
>  extern int verbose;
> 
>  #if SUPPORT_HARD_LINKS
> -static int hlink_compare(struct file_struct *f1, struct file_struct *f2)
> +static int hlink_compare(struct file_struct **f1, struct file_struct **f2)
>  {
> -        if (!S_ISREG(f1->mode) && !S_ISREG(f2->mode))
> +        if (!S_ISREG((*f1)->mode) && !S_ISREG((*f2)->mode))
>                  return 0;
> -        if (!S_ISREG(f1->mode))
> +        if (!S_ISREG((*f1)->mode))
>                  return -1;
> -        if (!S_ISREG(f2->mode))
> +        if (!S_ISREG((*f2)->mode))
>                  return 1;
> 
> -        if (f1->dev != f2->dev)
> -                return (int) (f1->dev > f2->dev ? 1 : -1);
> +        if ((*f1)->dev != (*f2)->dev)
> +                return (int) ((*f1)->dev > (*f2)->dev ? 1 : -1);
> 
> -        if (f1->inode != f2->inode)
> -                return (int) (f1->inode > f2->inode ? 1 : -1);
> +        if ((*f1)->inode != (*f2)->inode)
> +                return (int) ((*f1)->inode > (*f2)->inode ? 1 : -1);
> 
> -        return file_compare(&f1, &f2);
> +
> +        return file_compare(f1, f2);
>  }


I did that differently (and more simply, I think).  Changed only one line
and added two lines:

@@ -24,8 +24,11 @@
 extern int verbose;
 
 #if SUPPORT_HARD_LINKS
-static int hlink_compare(struct file_struct *f1, struct file_struct *f2)
+static int hlink_compare(struct file_struct **f1p, struct file_struct **f2p)
 {
+       struct file_struct *f1 = *f1p;
+       struct file_struct *f2 = *f2p;
+
        if (!S_ISREG(f1->mode) && !S_ISREG(f2->mode))
                return 0;
        if (!S_ISREG(f1->mode))


> 
> 
> -static struct file_struct *hlink_list;
> +static struct file_struct **hlink_list;
>  static int hlink_count;
>  #endif
> 
>  void init_hard_links(struct file_list *flist)
>  {
>  #if SUPPORT_HARD_LINKS
> -        int i;
>          if (flist->count < 2)
>                  return;
> 
>          if (hlink_list)
>                  free(hlink_list);
> 
> -        if (!(hlink_list = new_array(struct file_struct, flist->count)))
> +        if (!(hlink_list = new_array(struct file_struct *, flist->count)))
>                  out_of_memory("init_hard_links");
> -
> -        for (i = 0; i < flist->count; i++)
> -                memcpy(&hlink_list[i], flist->files[i],
> -                       sizeof(hlink_list[0]));
> +       
> +        memcpy(hlink_list, flist->files, sizeof(hlink_list[0]) * flist->count);


Nice optimization but we will need to walk it anyway to extract only
the candidates for hardlinking (discussed separately):


> 
>          qsort(hlink_list, flist->count,
>                sizeof(hlink_list[0]), (int (*)()) hlink_compare);
> @@ -84,7 +82,7 @@ int check_hard_link(struct file_struct *
> 
>          while (low != high) {
>                  int mid = (low + high) / 2;
> -                ret = hlink_compare(&hlink_list[mid], file);
> +                ret = hlink_compare(&hlink_list[mid], &file);
>                  if (ret == 0) {
>                          low = mid;
>                          break;
> @@ -97,16 +95,16 @@ int check_hard_link(struct file_struct *
> 
>          /* XXX: To me this looks kind of dodgy -- why do we use [low]
>           * here and [low-1] below? -- mbp */
> -        if (hlink_compare(&hlink_list[low], file) != 0)
> +        if (hlink_compare(&hlink_list[low], &file) != 0)
>                  return 0;
> 
>          if (low > 0 &&


I think it's time to explain what's going on, so I changed the comment
and added another one.  I also optimized to skip the extra hlink_compare
if it was already matched in the loop.

....
-       /* XXX: To me this looks kind of dodgy -- why do we use [low]
-        * here and [low-1] below? -- mbp */
-       if (hlink_compare(&hlink_list[low], file) != 0)
+       /* Verify the match for this file.  If no match, cannot skip. */
+       if (!(ret == 0 || hlink_compare(&hlink_list[low], &file) == 0))
                return 0;
 
+       /* If the immediately preceding entry in the hlink list is the
+        *  same dev and inode, then there is at least one matching
+        *  entry earlier in the list and this file can be skipped
+        *  since it will eventually be hardlinked to the earlier file.
+        */
        if (low > 0 &&


My changes from here on down were the same as yours.
-- 
        John Van Essen  Univ of MN Alumnus  <vanes002 at umn.edu>



More information about the rsync mailing list