[ccache] PATCH: Look at include files' mtimes

Justin Lebar justin.lebar at gmail.com
Sun May 20 14:49:33 MDT 2012


This patch lets ccache, in direct mode, check an include file's mtime
and size instead of hashing it.  If the mtime and size don't match the
values recorded in the manifest, we fall back to hashing.

The net result is roughly a factor-of-two speedup in ccache hits (*),
on my machine.

I'm not sure if this is a desirable feature, because obviously mtimes
can be tampered with.

I didn't provide a way to disable the feature in this patch because,
presuming we wanted to take this patch, I'm not sure if we'd want
mtime-snooping enabled by default.  Since most projects already rely
on accurate mtimes in their build systems, turning this on by default
doesn't seem particularly outrageous to me.

Please let me know what you think about this.

Regards,
-Justin

(*) Experimental procedure: In a Firefox debug objdir
(CCACHE_HARDLINK, Linux-64, Ubuntu 12.04, 4 CPU cores):

* Let |nop| be the average real time from a few runs of

    $ time make -C dom -sj16

  when there's nothing to do.

* Let |orig| be the average real time from a few runs of

    $ find dom -name '*.o' && time make -C dom -sj16

  with ccache master (701f13192ee) (discarding the first run, of course).

* Let |mtime| be the real time from the same command as |orig|, but
  with patched ccache.

Speedup is (orig - nop) / (mtime - nop).  On my machine, nop = 3.71,
orig = 4.88, mtime = 4.31, so the speedup is 1.17 / 0.60, or about
1.95.  Yes, our nop build times are atrocious.
-------------- next part --------------
From 2bd9951a076993f9cd1874fc2413660711b7a07a Mon Sep 17 00:00:00 2001
From: Justin Lebar <justin.lebar at gmail.com>
Date: Sun, 20 May 2012 15:18:44 -0400
Subject: [PATCH] Look at mtime before hashing include files.

---
 ccache.c   |    2 +-
 ccache.h   |    1 +
 manifest.c |   73 +++++++++++++++++++++++++++++++++++++++++++++++++++---------
 test.sh    |    2 +-
 4 files changed, 66 insertions(+), 12 deletions(-)

diff --git a/ccache.c b/ccache.c
index 8b50c36..af8898e 100644
--- a/ccache.c
+++ b/ccache.c
@@ -129,7 +129,7 @@ static char *manifest_path;
  * Time of compilation. Used to see if include files have changed after
  * compilation.
  */
-static time_t time_of_compilation;
+time_t time_of_compilation;
 
 /*
  * Files included by the preprocessor and their hashes/sizes. Key: file path.
diff --git a/ccache.h b/ccache.h
index 7e25883..3d0d93a 100644
--- a/ccache.h
+++ b/ccache.h
@@ -211,6 +211,7 @@ void lockfile_release(const char *path);
 /* ------------------------------------------------------------------------- */
 /* ccache.c */
 
+extern time_t time_of_compilation;
 bool cc_process_args(struct args *orig_args, struct args **preprocessor_args,
                     struct args **compiler_args);
 void cc_reset(void);
diff --git a/manifest.c b/manifest.c
index fc60503..9e58ee5 100644
--- a/manifest.c
+++ b/manifest.c
@@ -41,10 +41,12 @@
  * <index[0]>      index of include file path          (4 bytes unsigned int)
  * <hash[0]>       hash of include file                (<hash_size> bytes)
  * <size[0]>       size of include file                (4 bytes unsigned int)
+ * <mtime[0]>      mtime of include file               (8 bytes signed int)
  * ...
+ * <index[n-1]>
  * <hash[n-1]>
  * <size[n-1]>
- * <index[n-1]>
+ * <mtime[n-1]>
  * ----------------------------------------------------------------------------
  * <n>             number of object name entries       (4 bytes unsigned int)
  * <m[0]>          number of include file hash indexes (4 bytes unsigned int)
@@ -63,7 +65,7 @@
  */
 
 static const uint32_t MAGIC = 0x63436d46U;
-static const uint8_t  VERSION = 0;
+static const uint8_t  VERSION = 1;
 static const uint32_t MAX_MANIFEST_ENTRIES = 100;
 
 #define static_assert(e) do { enum { static_assert__ = 1/(e) }; } while (false)
@@ -75,6 +77,8 @@ struct file_info {
 	uint8_t hash[16];
 	/* Size of referenced file. */
 	uint32_t size;
+	/* mtime of referenced file. */
+	int64_t mtime;
 };
 
 struct object {
@@ -109,10 +113,15 @@ struct manifest {
 	struct object *objects;
 };
 
+struct file_mtime_and_size {
+	uint32_t size;
+	int64_t mtime;
+};
+
 static unsigned int
 hash_from_file_info(void *key)
 {
-	static_assert(sizeof(struct file_info) == 24); /* No padding. */
+	static_assert(sizeof(struct file_info) == 32); /* No padding. */
 	return murmurhashneutral2(key, sizeof(struct file_info), 0);
 }
 
@@ -123,7 +132,8 @@ file_infos_equal(void *key1, void *key2)
 	struct file_info *fi2 = (struct file_info *)key2;
 	return fi1->index == fi2->index
 	       && memcmp(fi1->hash, fi2->hash, 16) == 0
-	       && fi1->size == fi2->size;
+	       && fi1->size == fi2->size
+	       && fi1->mtime == fi2->mtime;
 }
 
 static void
@@ -262,6 +272,7 @@ read_manifest(gzFile f)
 		READ_INT(4, mf->file_infos[i].index);
 		READ_BYTES(mf->hash_size, mf->file_infos[i].hash);
 		READ_INT(4, mf->file_infos[i].size);
+		READ_INT(8, mf->file_infos[i].mtime);
 	}
 
 	READ_INT(4, mf->n_objects);
@@ -335,6 +346,7 @@ write_manifest(gzFile f, const struct manifest *mf)
 		WRITE_INT(4, mf->file_infos[i].index);
 		WRITE_BYTES(mf->hash_size, mf->file_infos[i].hash);
 		WRITE_INT(4, mf->file_infos[i].size);
+		WRITE_INT(8, mf->file_infos[i].mtime);
 	}
 
 	WRITE_INT(4, mf->n_objects);
@@ -356,23 +368,49 @@ error:
 
 static int
 verify_object(struct conf *conf, struct manifest *mf, struct object *obj,
-              struct hashtable *hashed_files)
+              struct hashtable *stated_files, struct hashtable *hashed_files)
 {
 	uint32_t i;
 	struct file_info *fi;
 	struct file_hash *actual;
+	struct file_mtime_and_size *mtime_and_size;
 	struct mdfour hash;
 	int result;
+	char *path;
 
 	for (i = 0; i < obj->n_file_info_indexes; i++) {
 		fi = &mf->file_infos[obj->file_info_indexes[i]];
-		actual = hashtable_search(hashed_files, mf->files[fi->index]);
+		path = mf->files[fi->index];
+		mtime_and_size = hashtable_search(hashed_files, path);
+		if (!mtime_and_size) {
+			struct stat file_stat;
+			if (stat(path, &file_stat) == -1) {
+				cc_log("Failed to stat include file %s: %s", path, strerror(errno));
+				return 0;
+			}
+			mtime_and_size = x_malloc(sizeof(*mtime_and_size));
+			mtime_and_size->size = file_stat.st_size;
+			mtime_and_size->mtime = file_stat.st_mtime;
+			hashtable_insert(stated_files, x_strdup(path), mtime_and_size);
+		}
+
+		if (fi->size == mtime_and_size->size
+		    && fi->mtime == mtime_and_size->mtime
+                    && fi->mtime >= time_of_compilation) {
+			cc_log("Got size/mtime hit for %s.", path);
+			continue;
+		}
+		else {
+			cc_log("size/mtime miss for %s.", path);
+		}
+
+		actual = hashtable_search(hashed_files, path);
 		if (!actual) {
 			actual = x_malloc(sizeof(*actual));
 			hash_start(&hash);
-			result = hash_source_code_file(conf, &hash, mf->files[fi->index]);
+			result = hash_source_code_file(conf, &hash, path);
 			if (result & HASH_SOURCE_CODE_ERROR) {
-				cc_log("Failed hashing %s", mf->files[fi->index]);
+				cc_log("Failed hashing %s", path);
 				free(actual);
 				return 0;
 			}
@@ -382,7 +420,7 @@ verify_object(struct conf *conf, struct manifest *mf, struct object *obj,
 			}
 			hash_result_as_bytes(&hash, actual->hash);
 			actual->size = hash.totalN;
-			hashtable_insert(hashed_files, x_strdup(mf->files[fi->index]), actual);
+			hashtable_insert(hashed_files, x_strdup(path), actual);
 		}
 		if (memcmp(fi->hash, actual->hash, mf->hash_size) != 0
 		    || fi->size != actual->size) {
@@ -458,11 +496,19 @@ get_file_hash_index(struct manifest *mf,
 	struct file_info fi;
 	uint32_t *fi_index;
 	uint32_t n;
+	struct stat file_stat;
 
 	fi.index = get_include_file_index(mf, path, mf_files);
 	memcpy(fi.hash, file_hash->hash, sizeof(fi.hash));
 	fi.size = file_hash->size;
 
+	if (stat(path, &file_stat) != -1) {
+		fi.mtime = file_stat.st_mtime;
+	}
+	else {
+		fi.mtime = -1;
+	}
+
 	fi_index = hashtable_search(mf_file_infos, &fi);
 	if (fi_index) {
 		return *fi_index;
@@ -540,6 +586,7 @@ manifest_get(struct conf *conf, const char *manifest_path)
 	gzFile f = NULL;
 	struct manifest *mf = NULL;
 	struct hashtable *hashed_files = NULL; /* path --> struct file_hash */
+	struct hashtable *stated_files = NULL; /* path --> struct file_mtime_and_size */
 	uint32_t i;
 	struct file_hash *fh = NULL;
 
@@ -562,10 +609,12 @@ manifest_get(struct conf *conf, const char *manifest_path)
 	}
 
 	hashed_files = create_hashtable(1000, hash_from_string, strings_equal);
+	stated_files = create_hashtable(1000, hash_from_string, strings_equal);
 
 	/* Check newest object first since it's a bit more likely to match. */
 	for (i = mf->n_objects; i > 0; i--) {
-		if (verify_object(conf, mf, &mf->objects[i - 1], hashed_files)) {
+		if (verify_object(conf, mf, &mf->objects[i - 1],
+		                  stated_files, hashed_files)) {
 			fh = x_malloc(sizeof(*fh));
 			*fh = mf->objects[i - 1].hash;
 			goto out;
@@ -576,6 +625,9 @@ out:
 	if (hashed_files) {
 		hashtable_destroy(hashed_files, 1);
 	}
+	if (stated_files) {
+		hashtable_destroy(stated_files, 1);
+	}
 	if (f) {
 		gzclose(f);
 	}
@@ -688,6 +740,7 @@ out:
 bool
 manifest_dump(const char *manifest_path, FILE *stream)
 {
+	/* XXX modify, add mtime, then fix tests. */
 	struct manifest *mf = NULL;
 	int fd;
 	gzFile f = NULL;
diff --git a/test.sh b/test.sh
index e632557..aa0acf1 100755
--- a/test.sh
+++ b/test.sh
@@ -1232,7 +1232,7 @@ EOF
         >manifest.dump
     cat <<EOF >expected.dump
 Magic: cCmF
-Version: 0
+Version: 1
 Hash size: 16
 Reserved field: 0
 File paths (3):
-- 
1.7.9.5


More information about the ccache mailing list