Merge branch 'jk/pack-delta-reuse-with-bitmap'

When creating a thin pack, which allows objects to be made into a
delta against another object that is not in the resulting pack but
is known to be present on the receiving end, the code learned to
take advantage of the reachability bitmap; this allows the server
to send a delta against a base beyond the "boundary" commit.

* jk/pack-delta-reuse-with-bitmap:
  pack-objects: reuse on-disk deltas for thin "have" objects
  pack-bitmap: save "have" bitmap from walk
  t/perf: add perf tests for fetches from a bitmapped server
  t/perf: add infrastructure for measuring sizes
  t/perf: factor out percent calculations
  t/perf: factor boilerplate out of test_perf
This commit is contained in:
Junio C Hamano 2018-09-17 13:53:53 -07:00
commit 3ebdef2e1b
9 changed files with 262 additions and 54 deletions

View File

@ -41,6 +41,7 @@
#define DELTA_CHILD(obj) oe_delta_child(&to_pack, obj)
#define DELTA_SIBLING(obj) oe_delta_sibling(&to_pack, obj)
#define SET_DELTA(obj, val) oe_set_delta(&to_pack, obj, val)
#define SET_DELTA_EXT(obj, oid) oe_set_delta_ext(&to_pack, obj, oid)
#define SET_DELTA_SIZE(obj, val) oe_set_delta_size(&to_pack, obj, val)
#define SET_DELTA_CHILD(obj, val) oe_set_delta_child(&to_pack, obj, val)
#define SET_DELTA_SIBLING(obj, val) oe_set_delta_sibling(&to_pack, obj, val)
@ -60,6 +61,7 @@ static struct packing_data to_pack;
static struct pack_idx_entry **written_list;
static uint32_t nr_result, nr_written, nr_seen;
static struct bitmap_index *bitmap_git;
static int non_empty;
static int reuse_delta = 1, reuse_object = 1;
@ -80,6 +82,7 @@ static unsigned long pack_size_limit;
static int depth = 50;
static int delta_search_threads;
static int pack_to_stdout;
static int thin;
static int num_preferred_base;
static struct progress *progress_state;
@ -1538,11 +1541,15 @@ static void check_object(struct object_entry *entry)
break;
}
if (base_ref && (base_entry = packlist_find(&to_pack, base_ref, NULL))) {
if (base_ref && (
(base_entry = packlist_find(&to_pack, base_ref, NULL)) ||
(thin &&
bitmap_has_sha1_in_uninteresting(bitmap_git, base_ref)))) {
/*
* If base_ref was set above that means we wish to
* reuse delta data, and we even found that base
* in the list of objects we want to pack. Goodie!
* reuse delta data, and either we found that object in
* the list of objects we want to pack, or it's one we
* know the receiver has.
*
* Depth value does not matter - find_deltas() will
* never consider reused delta as the base object to
@ -1551,10 +1558,16 @@ static void check_object(struct object_entry *entry)
*/
oe_set_type(entry, entry->in_pack_type);
SET_SIZE(entry, in_pack_size); /* delta size */
SET_DELTA(entry, base_entry);
SET_DELTA_SIZE(entry, in_pack_size);
entry->delta_sibling_idx = base_entry->delta_child_idx;
SET_DELTA_CHILD(base_entry, entry);
if (base_entry) {
SET_DELTA(entry, base_entry);
entry->delta_sibling_idx = base_entry->delta_child_idx;
SET_DELTA_CHILD(base_entry, entry);
} else {
SET_DELTA_EXT(entry, base_ref);
}
unuse_pack(&w_curs);
return;
}
@ -2979,7 +2992,6 @@ static int pack_options_allow_reuse(void)
static int get_object_list_from_bitmap(struct rev_info *revs)
{
struct bitmap_index *bitmap_git;
if (!(bitmap_git = prepare_bitmap_walk(revs)))
return -1;
@ -2995,7 +3007,6 @@ static int get_object_list_from_bitmap(struct rev_info *revs)
}
traverse_bitmap_commit_list(bitmap_git, &add_object_entry_from_bitmap);
free_bitmap_index(bitmap_git);
return 0;
}
@ -3143,7 +3154,6 @@ static int option_parse_unpack_unreachable(const struct option *opt,
int cmd_pack_objects(int argc, const char **argv, const char *prefix)
{
int use_internal_rev_list = 0;
int thin = 0;
int shallow = 0;
int all_progress_implied = 0;
struct argv_array rp = ARGV_ARRAY_INIT;

View File

@ -86,6 +86,9 @@ struct bitmap_index {
/* Bitmap result of the last performed walk */
struct bitmap *result;
/* "have" bitmap from the last performed walk */
struct bitmap *haves;
/* Version of the bitmap index */
unsigned int version;
@ -759,8 +762,8 @@ struct bitmap_index *prepare_bitmap_walk(struct rev_info *revs)
bitmap_and_not(wants_bitmap, haves_bitmap);
bitmap_git->result = wants_bitmap;
bitmap_git->haves = haves_bitmap;
bitmap_free(haves_bitmap);
return bitmap_git;
cleanup:
@ -1114,5 +1117,25 @@ void free_bitmap_index(struct bitmap_index *b)
free(b->ext_index.objects);
free(b->ext_index.hashes);
bitmap_free(b->result);
bitmap_free(b->haves);
free(b);
}
int bitmap_has_sha1_in_uninteresting(struct bitmap_index *bitmap_git,
const unsigned char *sha1)
{
int pos;
if (!bitmap_git)
return 0; /* no bitmap loaded */
if (!bitmap_git->result)
BUG("failed to perform bitmap walk before querying");
if (!bitmap_git->haves)
return 0; /* walk had no "haves" */
pos = bitmap_position_packfile(bitmap_git, sha1);
if (pos < 0)
return 0;
return bitmap_get(bitmap_git->haves, pos);
}

View File

@ -53,6 +53,13 @@ int rebuild_existing_bitmaps(struct bitmap_index *, struct packing_data *mapping
khash_sha1 *reused_bitmaps, int show_progress);
void free_bitmap_index(struct bitmap_index *);
/*
* After a traversal has been performed on the bitmap_index, this can be
* queried to see if a particular object was reachable from any of the
* objects flagged as UNINTERESTING.
*/
int bitmap_has_sha1_in_uninteresting(struct bitmap_index *, const unsigned char *sha1);
void bitmap_writer_show_progress(int show);
void bitmap_writer_set_checksum(unsigned char *sha1);
void bitmap_writer_build_type_index(struct packing_data *to_pack,

View File

@ -181,3 +181,22 @@ struct object_entry *packlist_alloc(struct packing_data *pdata,
return new_entry;
}
void oe_set_delta_ext(struct packing_data *pdata,
struct object_entry *delta,
const unsigned char *sha1)
{
struct object_entry *base;
ALLOC_GROW(pdata->ext_bases, pdata->nr_ext + 1, pdata->alloc_ext);
base = &pdata->ext_bases[pdata->nr_ext++];
memset(base, 0, sizeof(*base));
hashcpy(base->idx.oid.hash, sha1);
/* These flags mark that we are not part of the actual pack output. */
base->preferred_base = 1;
base->filled = 1;
delta->ext_base = 1;
delta->delta_idx = base - pdata->ext_bases + 1;
}

View File

@ -112,6 +112,7 @@ struct object_entry {
unsigned filled:1; /* assigned write-order */
unsigned dfs_state:OE_DFS_STATE_BITS;
unsigned depth:OE_DEPTH_BITS;
unsigned ext_base:1; /* delta_idx points outside packlist */
/*
* pahole results on 64-bit linux (gcc and clang)
@ -147,6 +148,14 @@ struct packing_data {
pthread_mutex_t lock;
#endif
/*
* This list contains entries for bases which we know the other side
* has (e.g., via reachability bitmaps), but which aren't in our
* "objects" list.
*/
struct object_entry *ext_bases;
uint32_t nr_ext, alloc_ext;
uintmax_t oe_size_limit;
uintmax_t oe_delta_size_limit;
};
@ -249,9 +258,12 @@ static inline struct object_entry *oe_delta(
const struct packing_data *pack,
const struct object_entry *e)
{
if (e->delta_idx)
if (!e->delta_idx)
return NULL;
if (e->ext_base)
return &pack->ext_bases[e->delta_idx - 1];
else
return &pack->objects[e->delta_idx - 1];
return NULL;
}
static inline void oe_set_delta(struct packing_data *pack,
@ -264,6 +276,10 @@ static inline void oe_set_delta(struct packing_data *pack,
e->delta_idx = 0;
}
void oe_set_delta_ext(struct packing_data *pack,
struct object_entry *e,
const unsigned char *sha1);
static inline struct object_entry *oe_delta_child(
const struct packing_data *pack,
const struct object_entry *e)

View File

@ -168,3 +168,28 @@ that
While we have tried to make sure that it can cope with embedded
whitespace and other special characters, it will not work with
multi-line data.
Rather than tracking the performance by run-time as `test_perf` does, you
may also track output size by using `test_size`. The stdout of the
function should be a single numeric value, which will be captured and
shown in the aggregated output. For example:
test_perf 'time foo' '
./foo >foo.out
'
test_size 'output size'
wc -c <foo.out
'
might produce output like:
Test origin HEAD
-------------------------------------------------------------
1234.1 time foo 0.37(0.79+0.02) 0.26(0.51+0.02) -29.7%
1234.2 output size 4.3M 3.6M -14.7%
The item being measured (and its units) is up to the test; the context
and the test title should make it clear to the user whether bigger or
smaller numbers are better. Unlike test_perf, the test code will only be
run once, since output sizes tend to be more deterministic than timings.

View File

@ -13,27 +13,42 @@ sub get_times {
my $line = <$fh>;
return undef if not defined $line;
close $fh or die "cannot close $name: $!";
$line =~ /^(?:(\d+):)?(\d+):(\d+(?:\.\d+)?) (\d+(?:\.\d+)?) (\d+(?:\.\d+)?)$/
or die "bad input line: $line";
my $rt = ((defined $1 ? $1 : 0.0)*60+$2)*60+$3;
return ($rt, $4, $5);
# times
if ($line =~ /^(?:(\d+):)?(\d+):(\d+(?:\.\d+)?) (\d+(?:\.\d+)?) (\d+(?:\.\d+)?)$/) {
my $rt = ((defined $1 ? $1 : 0.0)*60+$2)*60+$3;
return ($rt, $4, $5);
# size
} elsif ($line =~ /^\d+$/) {
return $&;
} else {
die "bad input line: $line";
}
}
sub relative_change {
my ($r, $firstr) = @_;
if ($firstr > 0) {
return sprintf "%+.1f%%", 100.0*($r-$firstr)/$firstr;
} elsif ($r == 0) {
return "=";
} else {
return "+inf";
}
}
sub format_times {
my ($r, $u, $s, $firstr) = @_;
# no value means we did not finish the test
if (!defined $r) {
return "<missing>";
}
my $out = sprintf "%.2f(%.2f+%.2f)", $r, $u, $s;
if (defined $firstr) {
if ($firstr > 0) {
$out .= sprintf " %+.1f%%", 100.0*($r-$firstr)/$firstr;
} elsif ($r == 0) {
$out .= " =";
} else {
$out .= " +inf";
}
# a single value means we have a size, not times
if (!defined $u) {
return format_size($r, $firstr);
}
# otherwise, we have real/user/system times
my $out = sprintf "%.2f(%.2f+%.2f)", $r, $u, $s;
$out .= ' ' . relative_change($r, $firstr) if defined $firstr;
return $out;
}
@ -51,6 +66,25 @@ EOT
exit(1);
}
sub human_size {
my $n = shift;
my @units = ('', qw(K M G));
while ($n > 900 && @units > 1) {
$n /= 1000;
shift @units;
}
return $n unless length $units[0];
return sprintf '%.1f%s', $n, $units[0];
}
sub format_size {
my ($size, $first) = @_;
# match the width of a time: 0.00(0.00+0.00)
my $out = sprintf '%15s', human_size($size);
$out .= ' ' . relative_change($size, $first) if defined $first;
return $out;
}
my (@dirs, %dirnames, %dirabbrevs, %prefixes, @tests,
$codespeed, $sortby, $subsection, $reponame);
@ -181,7 +215,14 @@ sub print_default_results {
my $firstr;
for my $i (0..$#dirs) {
my $d = $dirs[$i];
$times{$prefixes{$d}.$t} = [get_times("$resultsdir/$prefixes{$d}$t.times")];
my $base = "$resultsdir/$prefixes{$d}$t";
$times{$prefixes{$d}.$t} = [];
foreach my $type (qw(times size)) {
if (-e "$base.$type") {
$times{$prefixes{$d}.$t} = [get_times("$base.$type")];
last;
}
}
my ($r,$u,$s) = @{$times{$prefixes{$d}.$t}};
my $w = length format_times($r,$u,$s,$firstr);
$colwidth[$i] = $w if $w > $colwidth[$i];

View File

@ -0,0 +1,45 @@
#!/bin/sh
test_description='performance of fetches from bitmapped packs'
. ./perf-lib.sh
test_perf_default_repo
test_expect_success 'create bitmapped server repo' '
git config pack.writebitmaps true &&
git config pack.writebitmaphashcache true &&
git repack -ad
'
# simulate a fetch from a repository that last fetched N days ago, for
# various values of N. We do so by following the first-parent chain,
# and assume the first entry in the chain that is N days older than the current
# HEAD is where the HEAD would have been then.
for days in 1 2 4 8 16 32 64 128; do
title=$(printf '%10s' "($days days)")
test_expect_success "setup revs from $days days ago" '
now=$(git log -1 --format=%ct HEAD) &&
then=$(($now - ($days * 86400))) &&
tip=$(git rev-list -1 --first-parent --until=$then HEAD) &&
{
echo HEAD &&
echo ^$tip
} >revs
'
test_perf "server $title" '
git pack-objects --stdout --revs \
--thin --delta-base-offset \
<revs >tmp.pack
'
test_size "size $title" '
wc -c <tmp.pack
'
test_perf "client $title" '
git index-pack --stdin --fix-thin <tmp.pack
'
done
test_done

View File

@ -179,8 +179,8 @@ exit $ret' >&3 2>&4
return "$eval_ret"
}
test_perf () {
test_wrapper_ () {
test_wrapper_func_=$1; shift
test_start_
test "$#" = 3 && { test_prereq=$1; shift; } || test_prereq=
test "$#" = 2 ||
@ -191,37 +191,59 @@ test_perf () {
base=$(basename "$0" .sh)
echo "$test_count" >>"$perf_results_dir"/$base.subtests
echo "$1" >"$perf_results_dir"/$base.$test_count.descr
if test -z "$verbose"; then
printf "%s" "perf $test_count - $1:"
else
echo "perf $test_count - $1:"
fi
for i in $(test_seq 1 $GIT_PERF_REPEAT_COUNT); do
say >&3 "running: $2"
if test_run_perf_ "$2"
then
if test -z "$verbose"; then
printf " %s" "$i"
else
echo "* timing run $i/$GIT_PERF_REPEAT_COUNT:"
fi
else
test -z "$verbose" && echo
test_failure_ "$@"
break
fi
done
if test -z "$verbose"; then
echo " ok"
else
test_ok_ "$1"
fi
base="$perf_results_dir"/"$perf_results_prefix$(basename "$0" .sh)"."$test_count"
"$TEST_DIRECTORY"/perf/min_time.perl test_time.* >"$base".times
"$test_wrapper_func_" "$@"
fi
test_finish_
}
test_perf_ () {
if test -z "$verbose"; then
printf "%s" "perf $test_count - $1:"
else
echo "perf $test_count - $1:"
fi
for i in $(test_seq 1 $GIT_PERF_REPEAT_COUNT); do
say >&3 "running: $2"
if test_run_perf_ "$2"
then
if test -z "$verbose"; then
printf " %s" "$i"
else
echo "* timing run $i/$GIT_PERF_REPEAT_COUNT:"
fi
else
test -z "$verbose" && echo
test_failure_ "$@"
break
fi
done
if test -z "$verbose"; then
echo " ok"
else
test_ok_ "$1"
fi
"$TEST_DIRECTORY"/perf/min_time.perl test_time.* >"$base".times
}
test_perf () {
test_wrapper_ test_perf_ "$@"
}
test_size_ () {
say >&3 "running: $2"
if test_eval_ "$2" 3>"$base".size; then
test_ok_ "$1"
else
test_failure_ "$@"
fi
}
test_size () {
test_wrapper_ test_size_ "$@"
}
# We extend test_done to print timings at the end (./run disables this
# and does it after running everything)
test_at_end_hook_ () {