Merge branch 'ds/commit-graph-bloom-updates' into master

Updates to the changed-paths bloom filter.

* ds/commit-graph-bloom-updates:
  commit-graph: check all leading directories in changed path Bloom filters
  revision: empty pathspecs should not use Bloom filters
  revision.c: fix whitespace
  commit-graph: check chunk sizes after writing
  commit-graph: simplify chunk writes into loop
  commit-graph: unify the signatures of all write_graph_chunk_*() functions
  commit-graph: persist existence of changed-paths
  bloom: fix logic in get_bloom_filter()
  commit-graph: change test to die on parse, not load
  commit-graph: place bloom_settings in context
This commit is contained in:
Junio C Hamano 2020-07-30 13:20:31 -07:00
commit 70cdbbe3a7
9 changed files with 217 additions and 74 deletions

View File

@ -62,7 +62,10 @@ existing commit-graph file.
With the `--changed-paths` option, compute and write information about the
paths changed between a commit and its first parent. This operation can
take a while on large repositories. It provides significant performance gains
for getting history of a directory or a file with `git log -- <path>`.
for getting history of a directory or a file with `git log -- <path>`. If
this option is given, future commit-graph writes will automatically assume
that this option was intended. Use `--no-changed-paths` to stop storing this
data.
+
With the `--split[=<strategy>]` option, write the commit-graph as a
chain of multiple commit-graph files stored in

14
bloom.c
View File

@ -187,7 +187,7 @@ struct bloom_filter *get_bloom_filter(struct repository *r,
struct diff_options diffopt;
int max_changes = 512;
if (bloom_filters.slab_size == 0)
if (!bloom_filters.slab_size)
return NULL;
filter = bloom_filter_slab_at(&bloom_filters, c);
@ -195,16 +195,14 @@ struct bloom_filter *get_bloom_filter(struct repository *r,
if (!filter->data) {
load_commit_graph_info(r, c);
if (commit_graph_position(c) != COMMIT_NOT_FROM_GRAPH &&
r->objects->commit_graph->chunk_bloom_indexes) {
if (load_bloom_filter_from_graph(r->objects->commit_graph, filter, c))
return filter;
else
return NULL;
}
r->objects->commit_graph->chunk_bloom_indexes)
load_bloom_filter_from_graph(r->objects->commit_graph, filter, c);
}
if (filter->data || !compute_if_not_present)
if (filter->data)
return filter;
if (!compute_if_not_present)
return NULL;
repo_diff_setup(r, &diffopt);
diffopt.flags.recursive = 1;

View File

@ -201,6 +201,7 @@ static int graph_write(int argc, const char **argv)
};
opts.progress = isatty(2);
opts.enable_changed_paths = -1;
split_opts.size_multiple = 2;
split_opts.max_commits = 0;
split_opts.expire_time = 0;
@ -221,7 +222,9 @@ static int graph_write(int argc, const char **argv)
flags |= COMMIT_GRAPH_WRITE_SPLIT;
if (opts.progress)
flags |= COMMIT_GRAPH_WRITE_PROGRESS;
if (opts.enable_changed_paths ||
if (!opts.enable_changed_paths)
flags |= COMMIT_GRAPH_NO_WRITE_BLOOM_FILTERS;
if (opts.enable_changed_paths == 1 ||
git_env_bool(GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS, 0))
flags |= COMMIT_GRAPH_WRITE_BLOOM_FILTERS;

View File

@ -17,6 +17,8 @@
#include "bloom.h"
#include "commit-slab.h"
#include "shallow.h"
#include "json-writer.h"
#include "trace2.h"
void git_test_write_commit_graph_or_die(void)
{
@ -617,10 +619,6 @@ static int prepare_commit_graph(struct repository *r)
return !!r->objects->commit_graph;
r->objects->commit_graph_attempted = 1;
if (git_env_bool(GIT_TEST_COMMIT_GRAPH_DIE_ON_LOAD, 0))
die("dying as requested by the '%s' variable on commit-graph load!",
GIT_TEST_COMMIT_GRAPH_DIE_ON_LOAD);
prepare_repo_settings(r);
if (!git_env_bool(GIT_TEST_COMMIT_GRAPH, 0) &&
@ -849,6 +847,14 @@ static int parse_commit_in_graph_one(struct repository *r,
int parse_commit_in_graph(struct repository *r, struct commit *item)
{
static int checked_env = 0;
if (!checked_env &&
git_env_bool(GIT_TEST_COMMIT_GRAPH_DIE_ON_PARSE, 0))
die("dying as requested by the '%s' variable on commit-graph parse!",
GIT_TEST_COMMIT_GRAPH_DIE_ON_PARSE);
checked_env = 1;
if (!prepare_commit_graph(r))
return 0;
return parse_commit_in_graph_one(r, r->objects->commit_graph, item);
@ -941,9 +947,10 @@ struct write_commit_graph_context {
const struct split_commit_graph_opts *split_opts;
size_t total_bloom_filter_data_size;
const struct bloom_filter_settings *bloom_settings;
};
static void write_graph_chunk_fanout(struct hashfile *f,
static int write_graph_chunk_fanout(struct hashfile *f,
struct write_commit_graph_context *ctx)
{
int i, count = 0;
@ -965,17 +972,21 @@ static void write_graph_chunk_fanout(struct hashfile *f,
hashwrite_be32(f, count);
}
return 0;
}
static void write_graph_chunk_oids(struct hashfile *f, int hash_len,
static int write_graph_chunk_oids(struct hashfile *f,
struct write_commit_graph_context *ctx)
{
struct commit **list = ctx->commits.list;
int count;
for (count = 0; count < ctx->commits.nr; count++, list++) {
display_progress(ctx->progress, ++ctx->progress_cnt);
hashwrite(f, (*list)->object.oid.hash, (int)hash_len);
hashwrite(f, (*list)->object.oid.hash, the_hash_algo->rawsz);
}
return 0;
}
static const unsigned char *commit_to_sha1(size_t index, void *table)
@ -984,7 +995,7 @@ static const unsigned char *commit_to_sha1(size_t index, void *table)
return commits[index]->object.oid.hash;
}
static void write_graph_chunk_data(struct hashfile *f, int hash_len,
static int write_graph_chunk_data(struct hashfile *f,
struct write_commit_graph_context *ctx)
{
struct commit **list = ctx->commits.list;
@ -1002,7 +1013,7 @@ static void write_graph_chunk_data(struct hashfile *f, int hash_len,
die(_("unable to parse commit %s"),
oid_to_hex(&(*list)->object.oid));
tree = get_commit_tree_oid(*list);
hashwrite(f, tree->hash, hash_len);
hashwrite(f, tree->hash, the_hash_algo->rawsz);
parent = (*list)->parents;
@ -1082,9 +1093,11 @@ static void write_graph_chunk_data(struct hashfile *f, int hash_len,
list++;
}
return 0;
}
static void write_graph_chunk_extra_edges(struct hashfile *f,
static int write_graph_chunk_extra_edges(struct hashfile *f,
struct write_commit_graph_context *ctx)
{
struct commit **list = ctx->commits.list;
@ -1134,9 +1147,11 @@ static void write_graph_chunk_extra_edges(struct hashfile *f,
list++;
}
return 0;
}
static void write_graph_chunk_bloom_indexes(struct hashfile *f,
static int write_graph_chunk_bloom_indexes(struct hashfile *f,
struct write_commit_graph_context *ctx)
{
struct commit **list = ctx->commits.list;
@ -1145,30 +1160,54 @@ static void write_graph_chunk_bloom_indexes(struct hashfile *f,
while (list < last) {
struct bloom_filter *filter = get_bloom_filter(ctx->r, *list, 0);
cur_pos += filter->len;
size_t len = filter ? filter->len : 0;
cur_pos += len;
display_progress(ctx->progress, ++ctx->progress_cnt);
hashwrite_be32(f, cur_pos);
list++;
}
return 0;
}
static void write_graph_chunk_bloom_data(struct hashfile *f,
struct write_commit_graph_context *ctx,
const struct bloom_filter_settings *settings)
static void trace2_bloom_filter_settings(struct write_commit_graph_context *ctx)
{
struct json_writer jw = JSON_WRITER_INIT;
jw_object_begin(&jw, 0);
jw_object_intmax(&jw, "hash_version", ctx->bloom_settings->hash_version);
jw_object_intmax(&jw, "num_hashes", ctx->bloom_settings->num_hashes);
jw_object_intmax(&jw, "bits_per_entry", ctx->bloom_settings->bits_per_entry);
jw_end(&jw);
trace2_data_json("bloom", ctx->r, "settings", &jw);
jw_release(&jw);
}
static int write_graph_chunk_bloom_data(struct hashfile *f,
struct write_commit_graph_context *ctx)
{
struct commit **list = ctx->commits.list;
struct commit **last = ctx->commits.list + ctx->commits.nr;
hashwrite_be32(f, settings->hash_version);
hashwrite_be32(f, settings->num_hashes);
hashwrite_be32(f, settings->bits_per_entry);
trace2_bloom_filter_settings(ctx);
hashwrite_be32(f, ctx->bloom_settings->hash_version);
hashwrite_be32(f, ctx->bloom_settings->num_hashes);
hashwrite_be32(f, ctx->bloom_settings->bits_per_entry);
while (list < last) {
struct bloom_filter *filter = get_bloom_filter(ctx->r, *list, 0);
size_t len = filter ? filter->len : 0;
display_progress(ctx->progress, ++ctx->progress_cnt);
hashwrite(f, filter->data, filter->len * sizeof(unsigned char));
if (len)
hashwrite(f, filter->data, len * sizeof(unsigned char));
list++;
}
return 0;
}
static int oid_compare(const void *_a, const void *_b)
@ -1579,9 +1618,13 @@ static int write_graph_chunk_base(struct hashfile *f,
return 0;
}
typedef int (*chunk_write_fn)(struct hashfile *f,
struct write_commit_graph_context *ctx);
struct chunk_info {
uint32_t id;
uint64_t size;
chunk_write_fn write_fn;
};
static int write_commit_graph_file(struct write_commit_graph_context *ctx)
@ -1596,7 +1639,15 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
int num_chunks = 3;
uint64_t chunk_offset;
struct object_id file_hash;
const struct bloom_filter_settings bloom_settings = DEFAULT_BLOOM_FILTER_SETTINGS;
struct bloom_filter_settings bloom_settings = DEFAULT_BLOOM_FILTER_SETTINGS;
if (!ctx->bloom_settings) {
bloom_settings.bits_per_entry = git_env_ulong("GIT_TEST_BLOOM_SETTINGS_BITS_PER_ENTRY",
bloom_settings.bits_per_entry);
bloom_settings.num_hashes = git_env_ulong("GIT_TEST_BLOOM_SETTINGS_NUM_HASHES",
bloom_settings.num_hashes);
ctx->bloom_settings = &bloom_settings;
}
if (ctx->split) {
struct strbuf tmp_file = STRBUF_INIT;
@ -1644,27 +1695,34 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
chunks[0].id = GRAPH_CHUNKID_OIDFANOUT;
chunks[0].size = GRAPH_FANOUT_SIZE;
chunks[0].write_fn = write_graph_chunk_fanout;
chunks[1].id = GRAPH_CHUNKID_OIDLOOKUP;
chunks[1].size = hashsz * ctx->commits.nr;
chunks[1].write_fn = write_graph_chunk_oids;
chunks[2].id = GRAPH_CHUNKID_DATA;
chunks[2].size = (hashsz + 16) * ctx->commits.nr;
chunks[2].write_fn = write_graph_chunk_data;
if (ctx->num_extra_edges) {
chunks[num_chunks].id = GRAPH_CHUNKID_EXTRAEDGES;
chunks[num_chunks].size = 4 * ctx->num_extra_edges;
chunks[num_chunks].write_fn = write_graph_chunk_extra_edges;
num_chunks++;
}
if (ctx->changed_paths) {
chunks[num_chunks].id = GRAPH_CHUNKID_BLOOMINDEXES;
chunks[num_chunks].size = sizeof(uint32_t) * ctx->commits.nr;
chunks[num_chunks].write_fn = write_graph_chunk_bloom_indexes;
num_chunks++;
chunks[num_chunks].id = GRAPH_CHUNKID_BLOOMDATA;
chunks[num_chunks].size = sizeof(uint32_t) * 3
+ ctx->total_bloom_filter_data_size;
chunks[num_chunks].write_fn = write_graph_chunk_bloom_data;
num_chunks++;
}
if (ctx->num_commit_graphs_after > 1) {
chunks[num_chunks].id = GRAPH_CHUNKID_BASE;
chunks[num_chunks].size = hashsz * (ctx->num_commit_graphs_after - 1);
chunks[num_chunks].write_fn = write_graph_chunk_base;
num_chunks++;
}
@ -1700,19 +1758,19 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
progress_title.buf,
num_chunks * ctx->commits.nr);
}
write_graph_chunk_fanout(f, ctx);
write_graph_chunk_oids(f, hashsz, ctx);
write_graph_chunk_data(f, hashsz, ctx);
if (ctx->num_extra_edges)
write_graph_chunk_extra_edges(f, ctx);
if (ctx->changed_paths) {
write_graph_chunk_bloom_indexes(f, ctx);
write_graph_chunk_bloom_data(f, ctx, &bloom_settings);
}
if (ctx->num_commit_graphs_after > 1 &&
write_graph_chunk_base(f, ctx)) {
for (i = 0; i < num_chunks; i++) {
uint64_t start_offset = f->total + f->offset;
if (chunks[i].write_fn(f, ctx))
return -1;
if (f->total + f->offset != start_offset + chunks[i].size)
BUG("expected to write %"PRId64" bytes to chunk %"PRIx32", but wrote %"PRId64" instead",
chunks[i].size, chunks[i].id,
f->total + f->offset - start_offset);
}
stop_progress(&ctx->progress);
strbuf_release(&progress_title);
@ -2046,9 +2104,23 @@ int write_commit_graph(struct object_directory *odb,
ctx->report_progress = flags & COMMIT_GRAPH_WRITE_PROGRESS ? 1 : 0;
ctx->split = flags & COMMIT_GRAPH_WRITE_SPLIT ? 1 : 0;
ctx->split_opts = split_opts;
ctx->changed_paths = flags & COMMIT_GRAPH_WRITE_BLOOM_FILTERS ? 1 : 0;
ctx->total_bloom_filter_data_size = 0;
if (flags & COMMIT_GRAPH_WRITE_BLOOM_FILTERS)
ctx->changed_paths = 1;
if (!(flags & COMMIT_GRAPH_NO_WRITE_BLOOM_FILTERS)) {
struct commit_graph *g;
prepare_commit_graph_one(ctx->r, ctx->odb);
g = ctx->r->objects->commit_graph;
/* We have changed-paths already. Keep them in the next graph */
if (g && g->chunk_bloom_data) {
ctx->changed_paths = 1;
ctx->bloom_settings = g->bloom_filter_settings;
}
}
if (ctx->split) {
struct commit_graph *g;
prepare_commit_graph(ctx->r);

View File

@ -6,7 +6,7 @@
#include "oidset.h"
#define GIT_TEST_COMMIT_GRAPH "GIT_TEST_COMMIT_GRAPH"
#define GIT_TEST_COMMIT_GRAPH_DIE_ON_LOAD "GIT_TEST_COMMIT_GRAPH_DIE_ON_LOAD"
#define GIT_TEST_COMMIT_GRAPH_DIE_ON_PARSE "GIT_TEST_COMMIT_GRAPH_DIE_ON_PARSE"
#define GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS "GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS"
/*
@ -92,6 +92,7 @@ enum commit_graph_write_flags {
COMMIT_GRAPH_WRITE_PROGRESS = (1 << 1),
COMMIT_GRAPH_WRITE_SPLIT = (1 << 2),
COMMIT_GRAPH_WRITE_BLOOM_FILTERS = (1 << 3),
COMMIT_GRAPH_NO_WRITE_BLOOM_FILTERS = (1 << 4),
};
enum commit_graph_split_flags {

View File

@ -633,7 +633,6 @@ static unsigned int count_bloom_filter_maybe;
static unsigned int count_bloom_filter_definitely_not;
static unsigned int count_bloom_filter_false_positive;
static unsigned int count_bloom_filter_not_present;
static unsigned int count_bloom_filter_length_zero;
static void trace2_bloom_filter_statistics_atexit(void)
{
@ -641,7 +640,6 @@ static void trace2_bloom_filter_statistics_atexit(void)
jw_object_begin(&jw, 0);
jw_object_intmax(&jw, "filter_not_present", count_bloom_filter_not_present);
jw_object_intmax(&jw, "zero_length_filter", count_bloom_filter_length_zero);
jw_object_intmax(&jw, "maybe", count_bloom_filter_maybe);
jw_object_intmax(&jw, "definitely_not", count_bloom_filter_definitely_not);
jw_object_intmax(&jw, "false_positive", count_bloom_filter_false_positive);
@ -670,9 +668,10 @@ static void prepare_to_use_bloom_filter(struct rev_info *revs)
{
struct pathspec_item *pi;
char *path_alloc = NULL;
const char *path;
const char *path, *p;
int last_index;
int len;
size_t len;
int path_component_nr = 1;
if (!revs->commits)
return;
@ -704,9 +703,38 @@ static void prepare_to_use_bloom_filter(struct rev_info *revs)
path = pi->match;
len = strlen(path);
if (!len) {
revs->bloom_filter_settings = NULL;
return;
}
revs->bloom_key = xmalloc(sizeof(struct bloom_key));
fill_bloom_key(path, len, revs->bloom_key, revs->bloom_filter_settings);
p = path;
while (*p) {
/*
* At this point, the path is normalized to use Unix-style
* path separators. This is required due to how the
* changed-path Bloom filters store the paths.
*/
if (*p == '/')
path_component_nr++;
p++;
}
revs->bloom_keys_nr = path_component_nr;
ALLOC_ARRAY(revs->bloom_keys, revs->bloom_keys_nr);
fill_bloom_key(path, len, &revs->bloom_keys[0],
revs->bloom_filter_settings);
path_component_nr = 1;
p = path + len - 1;
while (p > path) {
if (*p == '/')
fill_bloom_key(path, p - path,
&revs->bloom_keys[path_component_nr++],
revs->bloom_filter_settings);
p--;
}
if (trace2_is_enabled() && !bloom_filter_atexit_registered) {
atexit(trace2_bloom_filter_statistics_atexit);
@ -720,7 +748,7 @@ static int check_maybe_different_in_bloom_filter(struct rev_info *revs,
struct commit *commit)
{
struct bloom_filter *filter;
int result;
int result = 1, j;
if (!revs->repo->objects->commit_graph)
return -1;
@ -735,14 +763,11 @@ static int check_maybe_different_in_bloom_filter(struct rev_info *revs,
return -1;
}
if (!filter->len) {
count_bloom_filter_length_zero++;
return -1;
}
for (j = 0; result && j < revs->bloom_keys_nr; j++) {
result = bloom_filter_contains(filter,
revs->bloom_key,
&revs->bloom_keys[j],
revs->bloom_filter_settings);
}
if (result)
count_bloom_filter_maybe++;
@ -782,7 +807,7 @@ static int rev_compare_tree(struct rev_info *revs,
return REV_TREE_SAME;
}
if (revs->bloom_key && !nth_parent) {
if (revs->bloom_keys_nr && !nth_parent) {
bloom_ret = check_maybe_different_in_bloom_filter(revs, commit);
if (bloom_ret == 0)

View File

@ -301,8 +301,10 @@ struct rev_info {
struct topo_walk_info *topo_walk_info;
/* Commit graph bloom filter fields */
/* The bloom filter key for the pathspec */
struct bloom_key *bloom_key;
/* The bloom filter key(s) for the pathspec */
struct bloom_key *bloom_keys;
int bloom_keys_nr;
/*
* The bloom filter settings used to generate the key.
* This is loaded from the commit-graph being used.

View File

@ -60,7 +60,7 @@ setup () {
test_bloom_filters_used () {
log_args=$1
bloom_trace_prefix="statistics:{\"filter_not_present\":0,\"zero_length_filter\":0,\"maybe\""
bloom_trace_prefix="statistics:{\"filter_not_present\":0,\"maybe\""
setup "$log_args" &&
grep -q "$bloom_trace_prefix" "$TRASH_DIRECTORY/trace.perf" &&
test_cmp log_wo_bloom log_w_bloom &&
@ -112,6 +112,10 @@ test_expect_success 'git log -- multiple path specs does not use Bloom filters'
test_bloom_filters_not_used "-- file4 A/file1"
'
test_expect_success 'git log -- "." pathspec at root does not use Bloom filters' '
test_bloom_filters_not_used "-- ."
'
test_expect_success 'git log with wildcard that resolves to a single path uses Bloom filters' '
test_bloom_filters_used "-- *4" &&
test_bloom_filters_used "-- *renamed"
@ -126,7 +130,7 @@ test_expect_success 'setup - add commit-graph to the chain without Bloom filters
test_commit c14 A/anotherFile2 &&
test_commit c15 A/B/anotherFile2 &&
test_commit c16 A/B/C/anotherFile2 &&
GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS=0 git commit-graph write --reachable --split &&
git commit-graph write --reachable --split --no-changed-paths &&
test_line_count = 2 .git/objects/info/commit-graphs/commit-graph-chain
'
@ -142,7 +146,7 @@ test_expect_success 'setup - add commit-graph to the chain with Bloom filters' '
test_bloom_filters_used_when_some_filters_are_missing () {
log_args=$1
bloom_trace_prefix="statistics:{\"filter_not_present\":3,\"zero_length_filter\":0,\"maybe\":8,\"definitely_not\":6"
bloom_trace_prefix="statistics:{\"filter_not_present\":3,\"maybe\":6,\"definitely_not\":8"
setup "$log_args" &&
grep -q "$bloom_trace_prefix" "$TRASH_DIRECTORY/trace.perf" &&
test_cmp log_wo_bloom log_w_bloom
@ -152,4 +156,39 @@ test_expect_success 'Use Bloom filters if they exist in the latest but not all c
test_bloom_filters_used_when_some_filters_are_missing "-- A/B"
'
test_expect_success 'persist filter settings' '
test_when_finished rm -rf .git/objects/info/commit-graph* &&
rm -rf .git/objects/info/commit-graph* &&
GIT_TRACE2_EVENT="$(pwd)/trace2.txt" \
GIT_TRACE2_EVENT_NESTING=5 \
GIT_TEST_BLOOM_SETTINGS_NUM_HASHES=9 \
GIT_TEST_BLOOM_SETTINGS_BITS_PER_ENTRY=15 \
git commit-graph write --reachable --changed-paths &&
grep "{\"hash_version\":1,\"num_hashes\":9,\"bits_per_entry\":15}" trace2.txt &&
GIT_TRACE2_EVENT="$(pwd)/trace2-auto.txt" \
GIT_TRACE2_EVENT_NESTING=5 \
git commit-graph write --reachable --changed-paths &&
grep "{\"hash_version\":1,\"num_hashes\":9,\"bits_per_entry\":15}" trace2-auto.txt
'
test_expect_success 'correctly report changes over limit' '
git init 513changes &&
(
cd 513changes &&
for i in $(test_seq 1 513)
do
echo $i >file$i.txt || return 1
done &&
git add . &&
git commit -m "files" &&
git commit-graph write --reachable --changed-paths &&
for i in $(test_seq 1 513)
do
git -c core.commitGraph=false log -- file$i.txt >expect &&
git log -- file$i.txt >actual &&
test_cmp expect actual || return 1
done
)
'
test_done

View File

@ -476,7 +476,7 @@ corrupt_graph_verify() {
cp $objdir/info/commit-graph commit-graph-pre-write-test
fi &&
git status --short &&
GIT_TEST_COMMIT_GRAPH_DIE_ON_LOAD=true git commit-graph write &&
GIT_TEST_COMMIT_GRAPH_DIE_ON_PARSE=true git commit-graph write &&
chmod u+w $objdir/info/commit-graph &&
git commit-graph verify
}