git/commit-reach.c

805 lines
20 KiB
C
Raw Normal View History

#include "cache.h"
#include "commit.h"
#include "commit-graph.h"
#include "decorate.h"
#include "prio-queue.h"
#include "tree.h"
#include "ref-filter.h"
#include "revision.h"
#include "tag.h"
#include "commit-reach.h"
/* Remember to update object flag allocation in object.h */
#define PARENT1 (1u<<16)
#define PARENT2 (1u<<17)
#define STALE (1u<<18)
#define RESULT (1u<<19)
static const unsigned all_flags = (PARENT1 | PARENT2 | STALE | RESULT);
static int queue_has_nonstale(struct prio_queue *queue)
{
int i;
for (i = 0; i < queue->nr; i++) {
struct commit *commit = queue->array[i].data;
if (!(commit->object.flags & STALE))
return 1;
}
return 0;
}
/* all input commits in one and twos[] must have been parsed! */
static struct commit_list *paint_down_to_common(struct repository *r,
struct commit *one, int n,
struct commit **twos,
int min_generation)
{
struct prio_queue queue = { compare_commits_by_gen_then_commit_date };
struct commit_list *result = NULL;
int i;
uint32_t last_gen = GENERATION_NUMBER_INFINITY;
if (!min_generation)
queue.compare = compare_commits_by_commit_date;
one->object.flags |= PARENT1;
if (!n) {
commit_list_append(one, &result);
return result;
}
prio_queue_put(&queue, one);
for (i = 0; i < n; i++) {
twos[i]->object.flags |= PARENT2;
prio_queue_put(&queue, twos[i]);
}
while (queue_has_nonstale(&queue)) {
struct commit *commit = prio_queue_get(&queue);
struct commit_list *parents;
int flags;
uint32_t generation = commit_graph_generation(commit);
if (min_generation && generation > last_gen)
BUG("bad generation skip %8x > %8x at %s",
generation, last_gen,
oid_to_hex(&commit->object.oid));
last_gen = generation;
if (generation < min_generation)
break;
flags = commit->object.flags & (PARENT1 | PARENT2 | STALE);
if (flags == (PARENT1 | PARENT2)) {
if (!(commit->object.flags & RESULT)) {
commit->object.flags |= RESULT;
commit_list_insert_by_date(commit, &result);
}
/* Mark parents of a found merge stale */
flags |= STALE;
}
parents = commit->parents;
while (parents) {
struct commit *p = parents->item;
parents = parents->next;
if ((p->object.flags & flags) == flags)
continue;
if (repo_parse_commit(r, p))
return NULL;
p->object.flags |= flags;
prio_queue_put(&queue, p);
}
}
clear_prio_queue(&queue);
return result;
}
static struct commit_list *merge_bases_many(struct repository *r,
struct commit *one, int n,
struct commit **twos)
{
struct commit_list *list = NULL;
struct commit_list *result = NULL;
int i;
for (i = 0; i < n; i++) {
if (one == twos[i])
/*
* We do not mark this even with RESULT so we do not
* have to clean it up.
*/
return commit_list_insert(one, &result);
}
if (repo_parse_commit(r, one))
return NULL;
for (i = 0; i < n; i++) {
if (repo_parse_commit(r, twos[i]))
return NULL;
}
list = paint_down_to_common(r, one, n, twos, 0);
while (list) {
struct commit *commit = pop_commit(&list);
if (!(commit->object.flags & STALE))
commit_list_insert_by_date(commit, &result);
}
return result;
}
struct commit_list *get_octopus_merge_bases(struct commit_list *in)
{
struct commit_list *i, *j, *k, *ret = NULL;
if (!in)
return ret;
commit_list_insert(in->item, &ret);
for (i = in->next; i; i = i->next) {
struct commit_list *new_commits = NULL, *end = NULL;
for (j = ret; j; j = j->next) {
struct commit_list *bases;
bases = get_merge_bases(i->item, j->item);
if (!new_commits)
new_commits = bases;
else
end->next = bases;
for (k = bases; k; k = k->next)
end = k;
}
ret = new_commits;
}
return ret;
}
static int remove_redundant(struct repository *r, struct commit **array, int cnt)
{
/*
* Some commit in the array may be an ancestor of
* another commit. Move such commit to the end of
* the array, and return the number of commits that
* are independent from each other.
*/
struct commit **work;
unsigned char *redundant;
int *filled_index;
int i, j, filled;
work = xcalloc(cnt, sizeof(*work));
redundant = xcalloc(cnt, 1);
ALLOC_ARRAY(filled_index, cnt - 1);
for (i = 0; i < cnt; i++)
repo_parse_commit(r, array[i]);
for (i = 0; i < cnt; i++) {
struct commit_list *common;
uint32_t min_generation = commit_graph_generation(array[i]);
if (redundant[i])
continue;
for (j = filled = 0; j < cnt; j++) {
uint32_t curr_generation;
if (i == j || redundant[j])
continue;
filled_index[filled] = j;
work[filled++] = array[j];
curr_generation = commit_graph_generation(array[j]);
if (curr_generation < min_generation)
min_generation = curr_generation;
}
common = paint_down_to_common(r, array[i], filled,
work, min_generation);
if (array[i]->object.flags & PARENT2)
redundant[i] = 1;
for (j = 0; j < filled; j++)
if (work[j]->object.flags & PARENT1)
redundant[filled_index[j]] = 1;
clear_commit_marks(array[i], all_flags);
clear_commit_marks_many(filled, work, all_flags);
free_commit_list(common);
}
/* Now collect the result */
COPY_ARRAY(work, array, cnt);
for (i = filled = 0; i < cnt; i++)
if (!redundant[i])
array[filled++] = work[i];
for (j = filled, i = 0; i < cnt; i++)
if (redundant[i])
array[j++] = work[i];
free(work);
free(redundant);
free(filled_index);
return filled;
}
static struct commit_list *get_merge_bases_many_0(struct repository *r,
struct commit *one,
int n,
struct commit **twos,
int cleanup)
{
struct commit_list *list;
struct commit **rslt;
struct commit_list *result;
int cnt, i;
result = merge_bases_many(r, one, n, twos);
for (i = 0; i < n; i++) {
if (one == twos[i])
return result;
}
if (!result || !result->next) {
if (cleanup) {
clear_commit_marks(one, all_flags);
clear_commit_marks_many(n, twos, all_flags);
}
return result;
}
/* There are more than one */
cnt = commit_list_count(result);
rslt = xcalloc(cnt, sizeof(*rslt));
for (list = result, i = 0; list; list = list->next)
rslt[i++] = list->item;
free_commit_list(result);
clear_commit_marks(one, all_flags);
clear_commit_marks_many(n, twos, all_flags);
cnt = remove_redundant(r, rslt, cnt);
result = NULL;
for (i = 0; i < cnt; i++)
commit_list_insert_by_date(rslt[i], &result);
free(rslt);
return result;
}
struct commit_list *repo_get_merge_bases_many(struct repository *r,
struct commit *one,
int n,
struct commit **twos)
{
return get_merge_bases_many_0(r, one, n, twos, 1);
}
struct commit_list *repo_get_merge_bases_many_dirty(struct repository *r,
struct commit *one,
int n,
struct commit **twos)
{
return get_merge_bases_many_0(r, one, n, twos, 0);
}
struct commit_list *repo_get_merge_bases(struct repository *r,
struct commit *one,
struct commit *two)
{
return get_merge_bases_many_0(r, one, 1, &two, 1);
}
/*
* Is "commit" a descendant of one of the elements on the "with_commit" list?
*/
int repo_is_descendant_of(struct repository *r,
struct commit *commit,
struct commit_list *with_commit)
{
if (!with_commit)
return 1;
commit-reach: use can_all_from_reach The is_descendant_of method previously used in_merge_bases() to check if the commit can reach any of the commits in the provided list. This had two performance problems: 1. The performance is quadratic in worst-case. 2. A single in_merge_bases() call requires walking beyond the target commit in order to find the full set of boundary commits that may be merge-bases. The can_all_from_reach method avoids this quadratic behavior and can limit the search beyond the target commits using generation numbers. It requires a small prototype adjustment to stop using commit-date as a cutoff, as that optimization is no longer appropriate here. Since in_merge_bases() uses paint_down_to_common(), is_descendant_of() naturally found cutoffs to avoid walking the entire commit graph. Since we want to always return the correct result, we cannot use the min_commit_date cutoff in can_all_from_reach. We then rely on generation numbers to provide the cutoff. Since not all repos will have a commit-graph file, nor will we always have generation numbers computed for a commit-graph file, create a new method, generation_numbers_enabled(), that checks for a commit-graph file and sees if the first commit in the file has a non-zero generation number. In the case that we do not have generation numbers, use the old logic for is_descendant_of(). Performance was meausured on a copy of the Linux repository using the 'test-tool reach is_descendant_of' command using this input: A:v4.9 X:v4.10 X:v4.11 X:v4.12 X:v4.13 X:v4.14 X:v4.15 X:v4.16 X:v4.17 X.v3.0 Note that this input is tailored to demonstrate the quadratic nature of the previous method, as it will compute merge-bases for v4.9 versus all of the later versions before checking against v4.1. Before: 0.26 s After: 0.21 s Since we previously used the is_descendant_of method in the ref_newer method, we also measured performance there using 'test-tool reach ref_newer' with this input: A:v4.9 B:v3.19 Before: 0.10 s After: 0.08 s By adding a new commit with parent v3.19, we test the non-reachable case of ref_newer: Before: 0.09 s After: 0.08 s Signed-off-by: Derrick Stolee <dstolee@microsoft.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-07-21 00:33:30 +08:00
if (generation_numbers_enabled(the_repository)) {
struct commit_list *from_list = NULL;
int result;
commit_list_insert(commit, &from_list);
result = can_all_from_reach(from_list, with_commit, 0);
free_commit_list(from_list);
return result;
} else {
while (with_commit) {
struct commit *other;
other = with_commit->item;
with_commit = with_commit->next;
commit-reach: use fast logic in repo_in_merge_base The repo_is_descendant_of() method is aware of the existence of the commit-graph file. It checks for generation_numbers_enabled() before deciding on using can_all_from_reach() or repo_in_merge_bases() depending on the situation. The reason here is that can_all_from_reach() uses a depth-first search that is limited by the minimum generation number of the target commits, and that algorithm can be very slow when generation numbers are not present. The alternative uses paint_down_to_common() which will walk the entire merge-base boundary, which is typically slower. This method is used by commands like "git tag --contains" and "git branch --contains" for very fast results when a commit-graph file exists. Unfortunately, it is _not_ used in commands like "git merge-base --is-ancestor" which is doing an even simpler request. This issue was raised recently [1] with respect to a change to how generation numbers are stored, but was also reported much earlier [2] before commit-reach.c existed to simplify these reachability queries. [1] https://lore.kernel.org/git/20200607195347.GA8232@szeder.dev/ [2] https://lore.kernel.org/git/87608bawoa.fsf@evledraar.gmail.com/ The root cause is that builtin/merge-base.c has a method handle_is_ancestor() that calls in_merge_bases(), an older version of repo_in_merge_bases(). It would be better if we have every caller to in_merge_bases() use the logic in can_all_from_reach() when possible. This is where things get a little tricky: repo_is_descendant_of() calls repo_in_merge_bases() in the non-generation numbers enabled case! If we simply update repo_in_merge_bases() to call repo_is_descendant_of() instead of repo_in_merge_bases_many(), then we will get a recursive call loop. Thankfully, this is caught by the test suite in the default mode (i.e. GIT_TEST_COMMIT_GRAPH=0). The trick, then, is to make the non-generation number case for repo_is_descendant_of() call repo_in_merge_bases_many() directly, skipping the non-_many version. This allows us to take advantage of this faster code path, when possible. The easiest way to measure the performance impact is to test the following command on the Linux kernel repository: git merge-base --is-ancestor <A> <B> | A | B | Time Before | Time After | |------|------|-------------|------------| | v3.0 | v5.7 | 0.459s | 0.028s | | v4.0 | v5.7 | 0.267s | 0.021s | | v5.0 | v5.7 | 0.074s | 0.013s | Note that each of these samples return success. The old code performed the same operation when <A> and <B> are swapped. However, can_all_from_reach() will return immediately if the generation numbers show that <A> has larger generation number than <B>. Thus, the time for the swapped case is universally 0.004s in each case. Reported-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com> Reported-by: SZEDER Gábor <szeder.dev@gmail.com> Signed-off-by: Derrick Stolee <dstolee@microsoft.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-06-18 01:24:29 +08:00
if (repo_in_merge_bases_many(r, other, 1, &commit))
commit-reach: use can_all_from_reach The is_descendant_of method previously used in_merge_bases() to check if the commit can reach any of the commits in the provided list. This had two performance problems: 1. The performance is quadratic in worst-case. 2. A single in_merge_bases() call requires walking beyond the target commit in order to find the full set of boundary commits that may be merge-bases. The can_all_from_reach method avoids this quadratic behavior and can limit the search beyond the target commits using generation numbers. It requires a small prototype adjustment to stop using commit-date as a cutoff, as that optimization is no longer appropriate here. Since in_merge_bases() uses paint_down_to_common(), is_descendant_of() naturally found cutoffs to avoid walking the entire commit graph. Since we want to always return the correct result, we cannot use the min_commit_date cutoff in can_all_from_reach. We then rely on generation numbers to provide the cutoff. Since not all repos will have a commit-graph file, nor will we always have generation numbers computed for a commit-graph file, create a new method, generation_numbers_enabled(), that checks for a commit-graph file and sees if the first commit in the file has a non-zero generation number. In the case that we do not have generation numbers, use the old logic for is_descendant_of(). Performance was meausured on a copy of the Linux repository using the 'test-tool reach is_descendant_of' command using this input: A:v4.9 X:v4.10 X:v4.11 X:v4.12 X:v4.13 X:v4.14 X:v4.15 X:v4.16 X:v4.17 X.v3.0 Note that this input is tailored to demonstrate the quadratic nature of the previous method, as it will compute merge-bases for v4.9 versus all of the later versions before checking against v4.1. Before: 0.26 s After: 0.21 s Since we previously used the is_descendant_of method in the ref_newer method, we also measured performance there using 'test-tool reach ref_newer' with this input: A:v4.9 B:v3.19 Before: 0.10 s After: 0.08 s By adding a new commit with parent v3.19, we test the non-reachable case of ref_newer: Before: 0.09 s After: 0.08 s Signed-off-by: Derrick Stolee <dstolee@microsoft.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-07-21 00:33:30 +08:00
return 1;
}
return 0;
}
}
/*
* Is "commit" an ancestor of one of the "references"?
*/
int repo_in_merge_bases_many(struct repository *r, struct commit *commit,
int nr_reference, struct commit **reference)
{
struct commit_list *bases;
int ret = 0, i;
uint32_t generation, min_generation = GENERATION_NUMBER_INFINITY;
if (repo_parse_commit(r, commit))
return ret;
for (i = 0; i < nr_reference; i++) {
if (repo_parse_commit(r, reference[i]))
return ret;
generation = commit_graph_generation(reference[i]);
if (generation < min_generation)
min_generation = generation;
}
generation = commit_graph_generation(commit);
if (generation > min_generation)
return ret;
bases = paint_down_to_common(r, commit,
nr_reference, reference,
generation);
if (commit->object.flags & PARENT2)
ret = 1;
clear_commit_marks(commit, all_flags);
clear_commit_marks_many(nr_reference, reference, all_flags);
free_commit_list(bases);
return ret;
}
/*
* Is "commit" an ancestor of (i.e. reachable from) the "reference"?
*/
int repo_in_merge_bases(struct repository *r,
struct commit *commit,
struct commit *reference)
{
commit-reach: use fast logic in repo_in_merge_base The repo_is_descendant_of() method is aware of the existence of the commit-graph file. It checks for generation_numbers_enabled() before deciding on using can_all_from_reach() or repo_in_merge_bases() depending on the situation. The reason here is that can_all_from_reach() uses a depth-first search that is limited by the minimum generation number of the target commits, and that algorithm can be very slow when generation numbers are not present. The alternative uses paint_down_to_common() which will walk the entire merge-base boundary, which is typically slower. This method is used by commands like "git tag --contains" and "git branch --contains" for very fast results when a commit-graph file exists. Unfortunately, it is _not_ used in commands like "git merge-base --is-ancestor" which is doing an even simpler request. This issue was raised recently [1] with respect to a change to how generation numbers are stored, but was also reported much earlier [2] before commit-reach.c existed to simplify these reachability queries. [1] https://lore.kernel.org/git/20200607195347.GA8232@szeder.dev/ [2] https://lore.kernel.org/git/87608bawoa.fsf@evledraar.gmail.com/ The root cause is that builtin/merge-base.c has a method handle_is_ancestor() that calls in_merge_bases(), an older version of repo_in_merge_bases(). It would be better if we have every caller to in_merge_bases() use the logic in can_all_from_reach() when possible. This is where things get a little tricky: repo_is_descendant_of() calls repo_in_merge_bases() in the non-generation numbers enabled case! If we simply update repo_in_merge_bases() to call repo_is_descendant_of() instead of repo_in_merge_bases_many(), then we will get a recursive call loop. Thankfully, this is caught by the test suite in the default mode (i.e. GIT_TEST_COMMIT_GRAPH=0). The trick, then, is to make the non-generation number case for repo_is_descendant_of() call repo_in_merge_bases_many() directly, skipping the non-_many version. This allows us to take advantage of this faster code path, when possible. The easiest way to measure the performance impact is to test the following command on the Linux kernel repository: git merge-base --is-ancestor <A> <B> | A | B | Time Before | Time After | |------|------|-------------|------------| | v3.0 | v5.7 | 0.459s | 0.028s | | v4.0 | v5.7 | 0.267s | 0.021s | | v5.0 | v5.7 | 0.074s | 0.013s | Note that each of these samples return success. The old code performed the same operation when <A> and <B> are swapped. However, can_all_from_reach() will return immediately if the generation numbers show that <A> has larger generation number than <B>. Thus, the time for the swapped case is universally 0.004s in each case. Reported-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com> Reported-by: SZEDER Gábor <szeder.dev@gmail.com> Signed-off-by: Derrick Stolee <dstolee@microsoft.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-06-18 01:24:29 +08:00
int res;
struct commit_list *list = NULL;
struct commit_list **next = &list;
next = commit_list_append(commit, next);
res = repo_is_descendant_of(r, reference, list);
free_commit_list(list);
return res;
}
struct commit_list *reduce_heads(struct commit_list *heads)
{
struct commit_list *p;
struct commit_list *result = NULL, **tail = &result;
struct commit **array;
int num_head, i;
if (!heads)
return NULL;
/* Uniquify */
for (p = heads; p; p = p->next)
p->item->object.flags &= ~STALE;
for (p = heads, num_head = 0; p; p = p->next) {
if (p->item->object.flags & STALE)
continue;
p->item->object.flags |= STALE;
num_head++;
}
array = xcalloc(num_head, sizeof(*array));
for (p = heads, i = 0; p; p = p->next) {
if (p->item->object.flags & STALE) {
array[i++] = p->item;
p->item->object.flags &= ~STALE;
}
}
num_head = remove_redundant(the_repository, array, num_head);
for (i = 0; i < num_head; i++)
tail = &commit_list_insert(array[i], tail)->next;
free(array);
return result;
}
void reduce_heads_replace(struct commit_list **heads)
{
struct commit_list *result = reduce_heads(*heads);
free_commit_list(*heads);
*heads = result;
}
int ref_newer(const struct object_id *new_oid, const struct object_id *old_oid)
{
struct object *o;
struct commit *old_commit, *new_commit;
struct commit_list *old_commit_list = NULL;
int ret;
/*
* Both new_commit and old_commit must be commit-ish and new_commit is descendant of
* old_commit. Otherwise we require --force.
*/
o = deref_tag(the_repository, parse_object(the_repository, old_oid),
NULL, 0);
if (!o || o->type != OBJ_COMMIT)
return 0;
old_commit = (struct commit *) o;
o = deref_tag(the_repository, parse_object(the_repository, new_oid),
NULL, 0);
if (!o || o->type != OBJ_COMMIT)
return 0;
new_commit = (struct commit *) o;
if (parse_commit(new_commit) < 0)
return 0;
commit_list_insert(old_commit, &old_commit_list);
ret = repo_is_descendant_of(the_repository,
new_commit, old_commit_list);
free_commit_list(old_commit_list);
return ret;
}
/*
* Mimicking the real stack, this stack lives on the heap, avoiding stack
* overflows.
*
* At each recursion step, the stack items points to the commits whose
* ancestors are to be inspected.
*/
struct contains_stack {
int nr, alloc;
struct contains_stack_entry {
struct commit *commit;
struct commit_list *parents;
} *contains_stack;
};
static int in_commit_list(const struct commit_list *want, struct commit *c)
{
for (; want; want = want->next)
if (oideq(&want->item->object.oid, &c->object.oid))
return 1;
return 0;
}
/*
* Test whether the candidate is contained in the list.
* Do not recurse to find out, though, but return -1 if inconclusive.
*/
static enum contains_result contains_test(struct commit *candidate,
const struct commit_list *want,
struct contains_cache *cache,
uint32_t cutoff)
{
enum contains_result *cached = contains_cache_at(cache, candidate);
/* If we already have the answer cached, return that. */
if (*cached)
return *cached;
/* or are we it? */
if (in_commit_list(want, candidate)) {
*cached = CONTAINS_YES;
return CONTAINS_YES;
}
/* Otherwise, we don't know; prepare to recurse */
parse_commit_or_die(candidate);
if (commit_graph_generation(candidate) < cutoff)
return CONTAINS_NO;
return CONTAINS_UNKNOWN;
}
static void push_to_contains_stack(struct commit *candidate, struct contains_stack *contains_stack)
{
ALLOC_GROW(contains_stack->contains_stack, contains_stack->nr + 1, contains_stack->alloc);
contains_stack->contains_stack[contains_stack->nr].commit = candidate;
contains_stack->contains_stack[contains_stack->nr++].parents = candidate->parents;
}
static enum contains_result contains_tag_algo(struct commit *candidate,
const struct commit_list *want,
struct contains_cache *cache)
{
struct contains_stack contains_stack = { 0, 0, NULL };
enum contains_result result;
uint32_t cutoff = GENERATION_NUMBER_INFINITY;
const struct commit_list *p;
for (p = want; p; p = p->next) {
uint32_t generation;
struct commit *c = p->item;
load_commit_graph_info(the_repository, c);
generation = commit_graph_generation(c);
if (generation < cutoff)
cutoff = generation;
}
result = contains_test(candidate, want, cache, cutoff);
if (result != CONTAINS_UNKNOWN)
return result;
push_to_contains_stack(candidate, &contains_stack);
while (contains_stack.nr) {
struct contains_stack_entry *entry = &contains_stack.contains_stack[contains_stack.nr - 1];
struct commit *commit = entry->commit;
struct commit_list *parents = entry->parents;
if (!parents) {
*contains_cache_at(cache, commit) = CONTAINS_NO;
contains_stack.nr--;
}
/*
* If we just popped the stack, parents->item has been marked,
* therefore contains_test will return a meaningful yes/no.
*/
else switch (contains_test(parents->item, want, cache, cutoff)) {
case CONTAINS_YES:
*contains_cache_at(cache, commit) = CONTAINS_YES;
contains_stack.nr--;
break;
case CONTAINS_NO:
entry->parents = parents->next;
break;
case CONTAINS_UNKNOWN:
push_to_contains_stack(parents->item, &contains_stack);
break;
}
}
free(contains_stack.contains_stack);
return contains_test(candidate, want, cache, cutoff);
}
int commit_contains(struct ref_filter *filter, struct commit *commit,
struct commit_list *list, struct contains_cache *cache)
{
if (filter->with_commit_tag_algo)
return contains_tag_algo(commit, list, cache) == CONTAINS_YES;
return repo_is_descendant_of(the_repository, commit, list);
}
commit-reach: make can_all_from_reach... linear The can_all_from_reach_with_flags() algorithm is currently quadratic in the worst case, because it calls the reachable() method for every 'from' without tracking which commits have already been walked or which can already reach a commit in 'to'. Rewrite the algorithm to walk each commit a constant number of times. We also add some optimizations that should work for the main consumer of this method: fetch negotitation (haves/wants). The first step includes using a depth-first-search (DFS) from each 'from' commit, sorted by ascending generation number. We do not walk beyond the minimum generation number or the minimum commit date. This DFS is likely to be faster than the existing reachable() method because we expect previous ref values to be along the first-parent history. If we find a target commit, then we mark everything in the DFS stack as a RESULT. This expands the set of targets for the other 'from' commits. We also mark the visited commits using 'assign_flag' to prevent re- walking the same commits. We still need to clear our flags at the end, which is why we will have a total of three visits to each commit. Performance was measured on the Linux repository using 'test-tool reach can_all_from_reach'. The input included rows seeded by tag values. The "small" case included X-rows as v4.[0-9]* and Y-rows as v3.[0-9]*. This mimics a (very large) fetch that says "I have all major v3 releases and want all major v4 releases." The "large" case included X-rows as "v4.*" and Y-rows as "v3.*". This adds all release-candidate tags to the set, which does not greatly increase the number of objects that are considered, but does increase the number of 'from' commits, demonstrating the quadratic nature of the previous code. Small Case: Before: 1.52 s After: 0.26 s Large Case: Before: 3.50 s After: 0.27 s Note how the time increases between the two cases in the two versions. The new code increases relative to the number of commits that need to be walked, but not directly relative to the number of 'from' commits. Signed-off-by: Derrick Stolee <dstolee@microsoft.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-07-21 00:33:28 +08:00
static int compare_commits_by_gen(const void *_a, const void *_b)
{
const struct commit *a = *(const struct commit * const *)_a;
const struct commit *b = *(const struct commit * const *)_b;
uint32_t generation_a = commit_graph_generation(a);
uint32_t generation_b = commit_graph_generation(b);
if (generation_a < generation_b)
commit-reach: make can_all_from_reach... linear The can_all_from_reach_with_flags() algorithm is currently quadratic in the worst case, because it calls the reachable() method for every 'from' without tracking which commits have already been walked or which can already reach a commit in 'to'. Rewrite the algorithm to walk each commit a constant number of times. We also add some optimizations that should work for the main consumer of this method: fetch negotitation (haves/wants). The first step includes using a depth-first-search (DFS) from each 'from' commit, sorted by ascending generation number. We do not walk beyond the minimum generation number or the minimum commit date. This DFS is likely to be faster than the existing reachable() method because we expect previous ref values to be along the first-parent history. If we find a target commit, then we mark everything in the DFS stack as a RESULT. This expands the set of targets for the other 'from' commits. We also mark the visited commits using 'assign_flag' to prevent re- walking the same commits. We still need to clear our flags at the end, which is why we will have a total of three visits to each commit. Performance was measured on the Linux repository using 'test-tool reach can_all_from_reach'. The input included rows seeded by tag values. The "small" case included X-rows as v4.[0-9]* and Y-rows as v3.[0-9]*. This mimics a (very large) fetch that says "I have all major v3 releases and want all major v4 releases." The "large" case included X-rows as "v4.*" and Y-rows as "v3.*". This adds all release-candidate tags to the set, which does not greatly increase the number of objects that are considered, but does increase the number of 'from' commits, demonstrating the quadratic nature of the previous code. Small Case: Before: 1.52 s After: 0.26 s Large Case: Before: 3.50 s After: 0.27 s Note how the time increases between the two cases in the two versions. The new code increases relative to the number of commits that need to be walked, but not directly relative to the number of 'from' commits. Signed-off-by: Derrick Stolee <dstolee@microsoft.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-07-21 00:33:28 +08:00
return -1;
if (generation_a > generation_b)
commit-reach: make can_all_from_reach... linear The can_all_from_reach_with_flags() algorithm is currently quadratic in the worst case, because it calls the reachable() method for every 'from' without tracking which commits have already been walked or which can already reach a commit in 'to'. Rewrite the algorithm to walk each commit a constant number of times. We also add some optimizations that should work for the main consumer of this method: fetch negotitation (haves/wants). The first step includes using a depth-first-search (DFS) from each 'from' commit, sorted by ascending generation number. We do not walk beyond the minimum generation number or the minimum commit date. This DFS is likely to be faster than the existing reachable() method because we expect previous ref values to be along the first-parent history. If we find a target commit, then we mark everything in the DFS stack as a RESULT. This expands the set of targets for the other 'from' commits. We also mark the visited commits using 'assign_flag' to prevent re- walking the same commits. We still need to clear our flags at the end, which is why we will have a total of three visits to each commit. Performance was measured on the Linux repository using 'test-tool reach can_all_from_reach'. The input included rows seeded by tag values. The "small" case included X-rows as v4.[0-9]* and Y-rows as v3.[0-9]*. This mimics a (very large) fetch that says "I have all major v3 releases and want all major v4 releases." The "large" case included X-rows as "v4.*" and Y-rows as "v3.*". This adds all release-candidate tags to the set, which does not greatly increase the number of objects that are considered, but does increase the number of 'from' commits, demonstrating the quadratic nature of the previous code. Small Case: Before: 1.52 s After: 0.26 s Large Case: Before: 3.50 s After: 0.27 s Note how the time increases between the two cases in the two versions. The new code increases relative to the number of commits that need to be walked, but not directly relative to the number of 'from' commits. Signed-off-by: Derrick Stolee <dstolee@microsoft.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-07-21 00:33:28 +08:00
return 1;
return 0;
}
int can_all_from_reach_with_flag(struct object_array *from,
unsigned int with_flag,
unsigned int assign_flag,
commit-reach: make can_all_from_reach... linear The can_all_from_reach_with_flags() algorithm is currently quadratic in the worst case, because it calls the reachable() method for every 'from' without tracking which commits have already been walked or which can already reach a commit in 'to'. Rewrite the algorithm to walk each commit a constant number of times. We also add some optimizations that should work for the main consumer of this method: fetch negotitation (haves/wants). The first step includes using a depth-first-search (DFS) from each 'from' commit, sorted by ascending generation number. We do not walk beyond the minimum generation number or the minimum commit date. This DFS is likely to be faster than the existing reachable() method because we expect previous ref values to be along the first-parent history. If we find a target commit, then we mark everything in the DFS stack as a RESULT. This expands the set of targets for the other 'from' commits. We also mark the visited commits using 'assign_flag' to prevent re- walking the same commits. We still need to clear our flags at the end, which is why we will have a total of three visits to each commit. Performance was measured on the Linux repository using 'test-tool reach can_all_from_reach'. The input included rows seeded by tag values. The "small" case included X-rows as v4.[0-9]* and Y-rows as v3.[0-9]*. This mimics a (very large) fetch that says "I have all major v3 releases and want all major v4 releases." The "large" case included X-rows as "v4.*" and Y-rows as "v3.*". This adds all release-candidate tags to the set, which does not greatly increase the number of objects that are considered, but does increase the number of 'from' commits, demonstrating the quadratic nature of the previous code. Small Case: Before: 1.52 s After: 0.26 s Large Case: Before: 3.50 s After: 0.27 s Note how the time increases between the two cases in the two versions. The new code increases relative to the number of commits that need to be walked, but not directly relative to the number of 'from' commits. Signed-off-by: Derrick Stolee <dstolee@microsoft.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-07-21 00:33:28 +08:00
time_t min_commit_date,
uint32_t min_generation)
{
commit-reach: make can_all_from_reach... linear The can_all_from_reach_with_flags() algorithm is currently quadratic in the worst case, because it calls the reachable() method for every 'from' without tracking which commits have already been walked or which can already reach a commit in 'to'. Rewrite the algorithm to walk each commit a constant number of times. We also add some optimizations that should work for the main consumer of this method: fetch negotitation (haves/wants). The first step includes using a depth-first-search (DFS) from each 'from' commit, sorted by ascending generation number. We do not walk beyond the minimum generation number or the minimum commit date. This DFS is likely to be faster than the existing reachable() method because we expect previous ref values to be along the first-parent history. If we find a target commit, then we mark everything in the DFS stack as a RESULT. This expands the set of targets for the other 'from' commits. We also mark the visited commits using 'assign_flag' to prevent re- walking the same commits. We still need to clear our flags at the end, which is why we will have a total of three visits to each commit. Performance was measured on the Linux repository using 'test-tool reach can_all_from_reach'. The input included rows seeded by tag values. The "small" case included X-rows as v4.[0-9]* and Y-rows as v3.[0-9]*. This mimics a (very large) fetch that says "I have all major v3 releases and want all major v4 releases." The "large" case included X-rows as "v4.*" and Y-rows as "v3.*". This adds all release-candidate tags to the set, which does not greatly increase the number of objects that are considered, but does increase the number of 'from' commits, demonstrating the quadratic nature of the previous code. Small Case: Before: 1.52 s After: 0.26 s Large Case: Before: 3.50 s After: 0.27 s Note how the time increases between the two cases in the two versions. The new code increases relative to the number of commits that need to be walked, but not directly relative to the number of 'from' commits. Signed-off-by: Derrick Stolee <dstolee@microsoft.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-07-21 00:33:28 +08:00
struct commit **list = NULL;
int i;
int nr_commits;
commit-reach: make can_all_from_reach... linear The can_all_from_reach_with_flags() algorithm is currently quadratic in the worst case, because it calls the reachable() method for every 'from' without tracking which commits have already been walked or which can already reach a commit in 'to'. Rewrite the algorithm to walk each commit a constant number of times. We also add some optimizations that should work for the main consumer of this method: fetch negotitation (haves/wants). The first step includes using a depth-first-search (DFS) from each 'from' commit, sorted by ascending generation number. We do not walk beyond the minimum generation number or the minimum commit date. This DFS is likely to be faster than the existing reachable() method because we expect previous ref values to be along the first-parent history. If we find a target commit, then we mark everything in the DFS stack as a RESULT. This expands the set of targets for the other 'from' commits. We also mark the visited commits using 'assign_flag' to prevent re- walking the same commits. We still need to clear our flags at the end, which is why we will have a total of three visits to each commit. Performance was measured on the Linux repository using 'test-tool reach can_all_from_reach'. The input included rows seeded by tag values. The "small" case included X-rows as v4.[0-9]* and Y-rows as v3.[0-9]*. This mimics a (very large) fetch that says "I have all major v3 releases and want all major v4 releases." The "large" case included X-rows as "v4.*" and Y-rows as "v3.*". This adds all release-candidate tags to the set, which does not greatly increase the number of objects that are considered, but does increase the number of 'from' commits, demonstrating the quadratic nature of the previous code. Small Case: Before: 1.52 s After: 0.26 s Large Case: Before: 3.50 s After: 0.27 s Note how the time increases between the two cases in the two versions. The new code increases relative to the number of commits that need to be walked, but not directly relative to the number of 'from' commits. Signed-off-by: Derrick Stolee <dstolee@microsoft.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-07-21 00:33:28 +08:00
int result = 1;
commit-reach: make can_all_from_reach... linear The can_all_from_reach_with_flags() algorithm is currently quadratic in the worst case, because it calls the reachable() method for every 'from' without tracking which commits have already been walked or which can already reach a commit in 'to'. Rewrite the algorithm to walk each commit a constant number of times. We also add some optimizations that should work for the main consumer of this method: fetch negotitation (haves/wants). The first step includes using a depth-first-search (DFS) from each 'from' commit, sorted by ascending generation number. We do not walk beyond the minimum generation number or the minimum commit date. This DFS is likely to be faster than the existing reachable() method because we expect previous ref values to be along the first-parent history. If we find a target commit, then we mark everything in the DFS stack as a RESULT. This expands the set of targets for the other 'from' commits. We also mark the visited commits using 'assign_flag' to prevent re- walking the same commits. We still need to clear our flags at the end, which is why we will have a total of three visits to each commit. Performance was measured on the Linux repository using 'test-tool reach can_all_from_reach'. The input included rows seeded by tag values. The "small" case included X-rows as v4.[0-9]* and Y-rows as v3.[0-9]*. This mimics a (very large) fetch that says "I have all major v3 releases and want all major v4 releases." The "large" case included X-rows as "v4.*" and Y-rows as "v3.*". This adds all release-candidate tags to the set, which does not greatly increase the number of objects that are considered, but does increase the number of 'from' commits, demonstrating the quadratic nature of the previous code. Small Case: Before: 1.52 s After: 0.26 s Large Case: Before: 3.50 s After: 0.27 s Note how the time increases between the two cases in the two versions. The new code increases relative to the number of commits that need to be walked, but not directly relative to the number of 'from' commits. Signed-off-by: Derrick Stolee <dstolee@microsoft.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-07-21 00:33:28 +08:00
ALLOC_ARRAY(list, from->nr);
nr_commits = 0;
for (i = 0; i < from->nr; i++) {
struct object *from_one = from->objects[i].item;
if (!from_one || from_one->flags & assign_flag)
continue;
from_one = deref_tag(the_repository, from_one,
"a from object", 0);
if (!from_one || from_one->type != OBJ_COMMIT) {
/*
* no way to tell if this is reachable by
* looking at the ancestry chain alone, so
* leave a note to ourselves not to worry about
* this object anymore.
*/
from->objects[i].item->flags |= assign_flag;
continue;
}
list[nr_commits] = (struct commit *)from_one;
if (parse_commit(list[nr_commits]) ||
commit_graph_generation(list[nr_commits]) < min_generation) {
result = 0;
goto cleanup;
}
nr_commits++;
}
commit-reach: make can_all_from_reach... linear The can_all_from_reach_with_flags() algorithm is currently quadratic in the worst case, because it calls the reachable() method for every 'from' without tracking which commits have already been walked or which can already reach a commit in 'to'. Rewrite the algorithm to walk each commit a constant number of times. We also add some optimizations that should work for the main consumer of this method: fetch negotitation (haves/wants). The first step includes using a depth-first-search (DFS) from each 'from' commit, sorted by ascending generation number. We do not walk beyond the minimum generation number or the minimum commit date. This DFS is likely to be faster than the existing reachable() method because we expect previous ref values to be along the first-parent history. If we find a target commit, then we mark everything in the DFS stack as a RESULT. This expands the set of targets for the other 'from' commits. We also mark the visited commits using 'assign_flag' to prevent re- walking the same commits. We still need to clear our flags at the end, which is why we will have a total of three visits to each commit. Performance was measured on the Linux repository using 'test-tool reach can_all_from_reach'. The input included rows seeded by tag values. The "small" case included X-rows as v4.[0-9]* and Y-rows as v3.[0-9]*. This mimics a (very large) fetch that says "I have all major v3 releases and want all major v4 releases." The "large" case included X-rows as "v4.*" and Y-rows as "v3.*". This adds all release-candidate tags to the set, which does not greatly increase the number of objects that are considered, but does increase the number of 'from' commits, demonstrating the quadratic nature of the previous code. Small Case: Before: 1.52 s After: 0.26 s Large Case: Before: 3.50 s After: 0.27 s Note how the time increases between the two cases in the two versions. The new code increases relative to the number of commits that need to be walked, but not directly relative to the number of 'from' commits. Signed-off-by: Derrick Stolee <dstolee@microsoft.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-07-21 00:33:28 +08:00
QSORT(list, nr_commits, compare_commits_by_gen);
commit-reach: make can_all_from_reach... linear The can_all_from_reach_with_flags() algorithm is currently quadratic in the worst case, because it calls the reachable() method for every 'from' without tracking which commits have already been walked or which can already reach a commit in 'to'. Rewrite the algorithm to walk each commit a constant number of times. We also add some optimizations that should work for the main consumer of this method: fetch negotitation (haves/wants). The first step includes using a depth-first-search (DFS) from each 'from' commit, sorted by ascending generation number. We do not walk beyond the minimum generation number or the minimum commit date. This DFS is likely to be faster than the existing reachable() method because we expect previous ref values to be along the first-parent history. If we find a target commit, then we mark everything in the DFS stack as a RESULT. This expands the set of targets for the other 'from' commits. We also mark the visited commits using 'assign_flag' to prevent re- walking the same commits. We still need to clear our flags at the end, which is why we will have a total of three visits to each commit. Performance was measured on the Linux repository using 'test-tool reach can_all_from_reach'. The input included rows seeded by tag values. The "small" case included X-rows as v4.[0-9]* and Y-rows as v3.[0-9]*. This mimics a (very large) fetch that says "I have all major v3 releases and want all major v4 releases." The "large" case included X-rows as "v4.*" and Y-rows as "v3.*". This adds all release-candidate tags to the set, which does not greatly increase the number of objects that are considered, but does increase the number of 'from' commits, demonstrating the quadratic nature of the previous code. Small Case: Before: 1.52 s After: 0.26 s Large Case: Before: 3.50 s After: 0.27 s Note how the time increases between the two cases in the two versions. The new code increases relative to the number of commits that need to be walked, but not directly relative to the number of 'from' commits. Signed-off-by: Derrick Stolee <dstolee@microsoft.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-07-21 00:33:28 +08:00
for (i = 0; i < nr_commits; i++) {
commit-reach: make can_all_from_reach... linear The can_all_from_reach_with_flags() algorithm is currently quadratic in the worst case, because it calls the reachable() method for every 'from' without tracking which commits have already been walked or which can already reach a commit in 'to'. Rewrite the algorithm to walk each commit a constant number of times. We also add some optimizations that should work for the main consumer of this method: fetch negotitation (haves/wants). The first step includes using a depth-first-search (DFS) from each 'from' commit, sorted by ascending generation number. We do not walk beyond the minimum generation number or the minimum commit date. This DFS is likely to be faster than the existing reachable() method because we expect previous ref values to be along the first-parent history. If we find a target commit, then we mark everything in the DFS stack as a RESULT. This expands the set of targets for the other 'from' commits. We also mark the visited commits using 'assign_flag' to prevent re- walking the same commits. We still need to clear our flags at the end, which is why we will have a total of three visits to each commit. Performance was measured on the Linux repository using 'test-tool reach can_all_from_reach'. The input included rows seeded by tag values. The "small" case included X-rows as v4.[0-9]* and Y-rows as v3.[0-9]*. This mimics a (very large) fetch that says "I have all major v3 releases and want all major v4 releases." The "large" case included X-rows as "v4.*" and Y-rows as "v3.*". This adds all release-candidate tags to the set, which does not greatly increase the number of objects that are considered, but does increase the number of 'from' commits, demonstrating the quadratic nature of the previous code. Small Case: Before: 1.52 s After: 0.26 s Large Case: Before: 3.50 s After: 0.27 s Note how the time increases between the two cases in the two versions. The new code increases relative to the number of commits that need to be walked, but not directly relative to the number of 'from' commits. Signed-off-by: Derrick Stolee <dstolee@microsoft.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-07-21 00:33:28 +08:00
/* DFS from list[i] */
struct commit_list *stack = NULL;
list[i]->object.flags |= assign_flag;
commit_list_insert(list[i], &stack);
while (stack) {
struct commit_list *parent;
commit-reach: fix first-parent heuristic The algorithm in can_all_from_reach_with_flags() performs a depth- first-search, terminated by generation number, intending to use a hueristic that "important" commits are found in the first-parent history. This heuristic is valuable in scenarios like fetch negotiation. However, there is a problem! After the search finds a target commit, it should pop all commits off the stack and mark them as "can reach". This logic is incorrect, so the algorithm instead walks all reachable commits above the generation-number cutoff. The existing algorithm is still an improvement over the previous algorithm, as the worst-case complexity went from quadratic to linear. The performance measurement at the time was good, but not dramatic. By fixing this heuristic, we reduce the number of walked commits. We can also re-run the performance tests from commit 4fbcca4e "commit-reach: make can_all_from_reach... linear". Performance was measured on the Linux repository using 'test-tool reach can_all_from_reach'. The input included rows seeded by tag values. The "small" case included X-rows as v4.[0-9]* and Y-rows as v3.[0-9]*. This mimics a (very large) fetch that says "I have all major v3 releases and want all major v4 releases." The "large" case included X-rows as "v4.*" and Y-rows as "v3.*". This adds all release-candidate tags to the set, which does not greatly increase the number of objects that are considered, but does increase the number of 'from' commits, demonstrating the quadratic nature of the previous code. Small Case: 4fbcca4e~1: 0.85 s 4fbcca4e: 0.26 s (num_walked: 1,011,035) HEAD: 0.14 s (num_walked: 8,601) Large Case: 4fbcca4e~1: 24.0 s 4fbcca4e: 0.12 s (num_walked: 503,925) HEAD: 0.06 s (num_walked: 217,243) Signed-off-by: Derrick Stolee <dstolee@microsoft.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-10-19 01:24:40 +08:00
if (stack->item->object.flags & (with_flag | RESULT)) {
commit-reach: make can_all_from_reach... linear The can_all_from_reach_with_flags() algorithm is currently quadratic in the worst case, because it calls the reachable() method for every 'from' without tracking which commits have already been walked or which can already reach a commit in 'to'. Rewrite the algorithm to walk each commit a constant number of times. We also add some optimizations that should work for the main consumer of this method: fetch negotitation (haves/wants). The first step includes using a depth-first-search (DFS) from each 'from' commit, sorted by ascending generation number. We do not walk beyond the minimum generation number or the minimum commit date. This DFS is likely to be faster than the existing reachable() method because we expect previous ref values to be along the first-parent history. If we find a target commit, then we mark everything in the DFS stack as a RESULT. This expands the set of targets for the other 'from' commits. We also mark the visited commits using 'assign_flag' to prevent re- walking the same commits. We still need to clear our flags at the end, which is why we will have a total of three visits to each commit. Performance was measured on the Linux repository using 'test-tool reach can_all_from_reach'. The input included rows seeded by tag values. The "small" case included X-rows as v4.[0-9]* and Y-rows as v3.[0-9]*. This mimics a (very large) fetch that says "I have all major v3 releases and want all major v4 releases." The "large" case included X-rows as "v4.*" and Y-rows as "v3.*". This adds all release-candidate tags to the set, which does not greatly increase the number of objects that are considered, but does increase the number of 'from' commits, demonstrating the quadratic nature of the previous code. Small Case: Before: 1.52 s After: 0.26 s Large Case: Before: 3.50 s After: 0.27 s Note how the time increases between the two cases in the two versions. The new code increases relative to the number of commits that need to be walked, but not directly relative to the number of 'from' commits. Signed-off-by: Derrick Stolee <dstolee@microsoft.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-07-21 00:33:28 +08:00
pop_commit(&stack);
commit-reach: fix first-parent heuristic The algorithm in can_all_from_reach_with_flags() performs a depth- first-search, terminated by generation number, intending to use a hueristic that "important" commits are found in the first-parent history. This heuristic is valuable in scenarios like fetch negotiation. However, there is a problem! After the search finds a target commit, it should pop all commits off the stack and mark them as "can reach". This logic is incorrect, so the algorithm instead walks all reachable commits above the generation-number cutoff. The existing algorithm is still an improvement over the previous algorithm, as the worst-case complexity went from quadratic to linear. The performance measurement at the time was good, but not dramatic. By fixing this heuristic, we reduce the number of walked commits. We can also re-run the performance tests from commit 4fbcca4e "commit-reach: make can_all_from_reach... linear". Performance was measured on the Linux repository using 'test-tool reach can_all_from_reach'. The input included rows seeded by tag values. The "small" case included X-rows as v4.[0-9]* and Y-rows as v3.[0-9]*. This mimics a (very large) fetch that says "I have all major v3 releases and want all major v4 releases." The "large" case included X-rows as "v4.*" and Y-rows as "v3.*". This adds all release-candidate tags to the set, which does not greatly increase the number of objects that are considered, but does increase the number of 'from' commits, demonstrating the quadratic nature of the previous code. Small Case: 4fbcca4e~1: 0.85 s 4fbcca4e: 0.26 s (num_walked: 1,011,035) HEAD: 0.14 s (num_walked: 8,601) Large Case: 4fbcca4e~1: 24.0 s 4fbcca4e: 0.12 s (num_walked: 503,925) HEAD: 0.06 s (num_walked: 217,243) Signed-off-by: Derrick Stolee <dstolee@microsoft.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-10-19 01:24:40 +08:00
if (stack)
stack->item->object.flags |= RESULT;
commit-reach: make can_all_from_reach... linear The can_all_from_reach_with_flags() algorithm is currently quadratic in the worst case, because it calls the reachable() method for every 'from' without tracking which commits have already been walked or which can already reach a commit in 'to'. Rewrite the algorithm to walk each commit a constant number of times. We also add some optimizations that should work for the main consumer of this method: fetch negotitation (haves/wants). The first step includes using a depth-first-search (DFS) from each 'from' commit, sorted by ascending generation number. We do not walk beyond the minimum generation number or the minimum commit date. This DFS is likely to be faster than the existing reachable() method because we expect previous ref values to be along the first-parent history. If we find a target commit, then we mark everything in the DFS stack as a RESULT. This expands the set of targets for the other 'from' commits. We also mark the visited commits using 'assign_flag' to prevent re- walking the same commits. We still need to clear our flags at the end, which is why we will have a total of three visits to each commit. Performance was measured on the Linux repository using 'test-tool reach can_all_from_reach'. The input included rows seeded by tag values. The "small" case included X-rows as v4.[0-9]* and Y-rows as v3.[0-9]*. This mimics a (very large) fetch that says "I have all major v3 releases and want all major v4 releases." The "large" case included X-rows as "v4.*" and Y-rows as "v3.*". This adds all release-candidate tags to the set, which does not greatly increase the number of objects that are considered, but does increase the number of 'from' commits, demonstrating the quadratic nature of the previous code. Small Case: Before: 1.52 s After: 0.26 s Large Case: Before: 3.50 s After: 0.27 s Note how the time increases between the two cases in the two versions. The new code increases relative to the number of commits that need to be walked, but not directly relative to the number of 'from' commits. Signed-off-by: Derrick Stolee <dstolee@microsoft.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-07-21 00:33:28 +08:00
continue;
}
for (parent = stack->item->parents; parent; parent = parent->next) {
if (parent->item->object.flags & (with_flag | RESULT))
stack->item->object.flags |= RESULT;
if (!(parent->item->object.flags & assign_flag)) {
parent->item->object.flags |= assign_flag;
if (parse_commit(parent->item) ||
parent->item->date < min_commit_date ||
commit_graph_generation(parent->item) < min_generation)
commit-reach: make can_all_from_reach... linear The can_all_from_reach_with_flags() algorithm is currently quadratic in the worst case, because it calls the reachable() method for every 'from' without tracking which commits have already been walked or which can already reach a commit in 'to'. Rewrite the algorithm to walk each commit a constant number of times. We also add some optimizations that should work for the main consumer of this method: fetch negotitation (haves/wants). The first step includes using a depth-first-search (DFS) from each 'from' commit, sorted by ascending generation number. We do not walk beyond the minimum generation number or the minimum commit date. This DFS is likely to be faster than the existing reachable() method because we expect previous ref values to be along the first-parent history. If we find a target commit, then we mark everything in the DFS stack as a RESULT. This expands the set of targets for the other 'from' commits. We also mark the visited commits using 'assign_flag' to prevent re- walking the same commits. We still need to clear our flags at the end, which is why we will have a total of three visits to each commit. Performance was measured on the Linux repository using 'test-tool reach can_all_from_reach'. The input included rows seeded by tag values. The "small" case included X-rows as v4.[0-9]* and Y-rows as v3.[0-9]*. This mimics a (very large) fetch that says "I have all major v3 releases and want all major v4 releases." The "large" case included X-rows as "v4.*" and Y-rows as "v3.*". This adds all release-candidate tags to the set, which does not greatly increase the number of objects that are considered, but does increase the number of 'from' commits, demonstrating the quadratic nature of the previous code. Small Case: Before: 1.52 s After: 0.26 s Large Case: Before: 3.50 s After: 0.27 s Note how the time increases between the two cases in the two versions. The new code increases relative to the number of commits that need to be walked, but not directly relative to the number of 'from' commits. Signed-off-by: Derrick Stolee <dstolee@microsoft.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-07-21 00:33:28 +08:00
continue;
commit_list_insert(parent->item, &stack);
break;
}
}
if (!parent)
pop_commit(&stack);
}
if (!(list[i]->object.flags & (with_flag | RESULT))) {
result = 0;
goto cleanup;
}
}
cleanup:
clear_commit_marks_many(nr_commits, list, RESULT | assign_flag);
free(list);
for (i = 0; i < from->nr; i++)
from->objects[i].item->flags &= ~assign_flag;
commit-reach: make can_all_from_reach... linear The can_all_from_reach_with_flags() algorithm is currently quadratic in the worst case, because it calls the reachable() method for every 'from' without tracking which commits have already been walked or which can already reach a commit in 'to'. Rewrite the algorithm to walk each commit a constant number of times. We also add some optimizations that should work for the main consumer of this method: fetch negotitation (haves/wants). The first step includes using a depth-first-search (DFS) from each 'from' commit, sorted by ascending generation number. We do not walk beyond the minimum generation number or the minimum commit date. This DFS is likely to be faster than the existing reachable() method because we expect previous ref values to be along the first-parent history. If we find a target commit, then we mark everything in the DFS stack as a RESULT. This expands the set of targets for the other 'from' commits. We also mark the visited commits using 'assign_flag' to prevent re- walking the same commits. We still need to clear our flags at the end, which is why we will have a total of three visits to each commit. Performance was measured on the Linux repository using 'test-tool reach can_all_from_reach'. The input included rows seeded by tag values. The "small" case included X-rows as v4.[0-9]* and Y-rows as v3.[0-9]*. This mimics a (very large) fetch that says "I have all major v3 releases and want all major v4 releases." The "large" case included X-rows as "v4.*" and Y-rows as "v3.*". This adds all release-candidate tags to the set, which does not greatly increase the number of objects that are considered, but does increase the number of 'from' commits, demonstrating the quadratic nature of the previous code. Small Case: Before: 1.52 s After: 0.26 s Large Case: Before: 3.50 s After: 0.27 s Note how the time increases between the two cases in the two versions. The new code increases relative to the number of commits that need to be walked, but not directly relative to the number of 'from' commits. Signed-off-by: Derrick Stolee <dstolee@microsoft.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-07-21 00:33:28 +08:00
return result;
}
int can_all_from_reach(struct commit_list *from, struct commit_list *to,
int cutoff_by_min_date)
{
struct object_array from_objs = OBJECT_ARRAY_INIT;
time_t min_commit_date = cutoff_by_min_date ? from->item->date : 0;
struct commit_list *from_iter = from, *to_iter = to;
int result;
commit-reach: make can_all_from_reach... linear The can_all_from_reach_with_flags() algorithm is currently quadratic in the worst case, because it calls the reachable() method for every 'from' without tracking which commits have already been walked or which can already reach a commit in 'to'. Rewrite the algorithm to walk each commit a constant number of times. We also add some optimizations that should work for the main consumer of this method: fetch negotitation (haves/wants). The first step includes using a depth-first-search (DFS) from each 'from' commit, sorted by ascending generation number. We do not walk beyond the minimum generation number or the minimum commit date. This DFS is likely to be faster than the existing reachable() method because we expect previous ref values to be along the first-parent history. If we find a target commit, then we mark everything in the DFS stack as a RESULT. This expands the set of targets for the other 'from' commits. We also mark the visited commits using 'assign_flag' to prevent re- walking the same commits. We still need to clear our flags at the end, which is why we will have a total of three visits to each commit. Performance was measured on the Linux repository using 'test-tool reach can_all_from_reach'. The input included rows seeded by tag values. The "small" case included X-rows as v4.[0-9]* and Y-rows as v3.[0-9]*. This mimics a (very large) fetch that says "I have all major v3 releases and want all major v4 releases." The "large" case included X-rows as "v4.*" and Y-rows as "v3.*". This adds all release-candidate tags to the set, which does not greatly increase the number of objects that are considered, but does increase the number of 'from' commits, demonstrating the quadratic nature of the previous code. Small Case: Before: 1.52 s After: 0.26 s Large Case: Before: 3.50 s After: 0.27 s Note how the time increases between the two cases in the two versions. The new code increases relative to the number of commits that need to be walked, but not directly relative to the number of 'from' commits. Signed-off-by: Derrick Stolee <dstolee@microsoft.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-07-21 00:33:28 +08:00
uint32_t min_generation = GENERATION_NUMBER_INFINITY;
while (from_iter) {
add_object_array(&from_iter->item->object, NULL, &from_objs);
if (!parse_commit(from_iter->item)) {
uint32_t generation;
if (from_iter->item->date < min_commit_date)
min_commit_date = from_iter->item->date;
commit-reach: make can_all_from_reach... linear The can_all_from_reach_with_flags() algorithm is currently quadratic in the worst case, because it calls the reachable() method for every 'from' without tracking which commits have already been walked or which can already reach a commit in 'to'. Rewrite the algorithm to walk each commit a constant number of times. We also add some optimizations that should work for the main consumer of this method: fetch negotitation (haves/wants). The first step includes using a depth-first-search (DFS) from each 'from' commit, sorted by ascending generation number. We do not walk beyond the minimum generation number or the minimum commit date. This DFS is likely to be faster than the existing reachable() method because we expect previous ref values to be along the first-parent history. If we find a target commit, then we mark everything in the DFS stack as a RESULT. This expands the set of targets for the other 'from' commits. We also mark the visited commits using 'assign_flag' to prevent re- walking the same commits. We still need to clear our flags at the end, which is why we will have a total of three visits to each commit. Performance was measured on the Linux repository using 'test-tool reach can_all_from_reach'. The input included rows seeded by tag values. The "small" case included X-rows as v4.[0-9]* and Y-rows as v3.[0-9]*. This mimics a (very large) fetch that says "I have all major v3 releases and want all major v4 releases." The "large" case included X-rows as "v4.*" and Y-rows as "v3.*". This adds all release-candidate tags to the set, which does not greatly increase the number of objects that are considered, but does increase the number of 'from' commits, demonstrating the quadratic nature of the previous code. Small Case: Before: 1.52 s After: 0.26 s Large Case: Before: 3.50 s After: 0.27 s Note how the time increases between the two cases in the two versions. The new code increases relative to the number of commits that need to be walked, but not directly relative to the number of 'from' commits. Signed-off-by: Derrick Stolee <dstolee@microsoft.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-07-21 00:33:28 +08:00
generation = commit_graph_generation(from_iter->item);
if (generation < min_generation)
min_generation = generation;
}
from_iter = from_iter->next;
}
while (to_iter) {
if (!parse_commit(to_iter->item)) {
uint32_t generation;
if (to_iter->item->date < min_commit_date)
min_commit_date = to_iter->item->date;
commit-reach: make can_all_from_reach... linear The can_all_from_reach_with_flags() algorithm is currently quadratic in the worst case, because it calls the reachable() method for every 'from' without tracking which commits have already been walked or which can already reach a commit in 'to'. Rewrite the algorithm to walk each commit a constant number of times. We also add some optimizations that should work for the main consumer of this method: fetch negotitation (haves/wants). The first step includes using a depth-first-search (DFS) from each 'from' commit, sorted by ascending generation number. We do not walk beyond the minimum generation number or the minimum commit date. This DFS is likely to be faster than the existing reachable() method because we expect previous ref values to be along the first-parent history. If we find a target commit, then we mark everything in the DFS stack as a RESULT. This expands the set of targets for the other 'from' commits. We also mark the visited commits using 'assign_flag' to prevent re- walking the same commits. We still need to clear our flags at the end, which is why we will have a total of three visits to each commit. Performance was measured on the Linux repository using 'test-tool reach can_all_from_reach'. The input included rows seeded by tag values. The "small" case included X-rows as v4.[0-9]* and Y-rows as v3.[0-9]*. This mimics a (very large) fetch that says "I have all major v3 releases and want all major v4 releases." The "large" case included X-rows as "v4.*" and Y-rows as "v3.*". This adds all release-candidate tags to the set, which does not greatly increase the number of objects that are considered, but does increase the number of 'from' commits, demonstrating the quadratic nature of the previous code. Small Case: Before: 1.52 s After: 0.26 s Large Case: Before: 3.50 s After: 0.27 s Note how the time increases between the two cases in the two versions. The new code increases relative to the number of commits that need to be walked, but not directly relative to the number of 'from' commits. Signed-off-by: Derrick Stolee <dstolee@microsoft.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-07-21 00:33:28 +08:00
generation = commit_graph_generation(to_iter->item);
if (generation < min_generation)
min_generation = generation;
}
to_iter->item->object.flags |= PARENT2;
to_iter = to_iter->next;
}
result = can_all_from_reach_with_flag(&from_objs, PARENT2, PARENT1,
commit-reach: make can_all_from_reach... linear The can_all_from_reach_with_flags() algorithm is currently quadratic in the worst case, because it calls the reachable() method for every 'from' without tracking which commits have already been walked or which can already reach a commit in 'to'. Rewrite the algorithm to walk each commit a constant number of times. We also add some optimizations that should work for the main consumer of this method: fetch negotitation (haves/wants). The first step includes using a depth-first-search (DFS) from each 'from' commit, sorted by ascending generation number. We do not walk beyond the minimum generation number or the minimum commit date. This DFS is likely to be faster than the existing reachable() method because we expect previous ref values to be along the first-parent history. If we find a target commit, then we mark everything in the DFS stack as a RESULT. This expands the set of targets for the other 'from' commits. We also mark the visited commits using 'assign_flag' to prevent re- walking the same commits. We still need to clear our flags at the end, which is why we will have a total of three visits to each commit. Performance was measured on the Linux repository using 'test-tool reach can_all_from_reach'. The input included rows seeded by tag values. The "small" case included X-rows as v4.[0-9]* and Y-rows as v3.[0-9]*. This mimics a (very large) fetch that says "I have all major v3 releases and want all major v4 releases." The "large" case included X-rows as "v4.*" and Y-rows as "v3.*". This adds all release-candidate tags to the set, which does not greatly increase the number of objects that are considered, but does increase the number of 'from' commits, demonstrating the quadratic nature of the previous code. Small Case: Before: 1.52 s After: 0.26 s Large Case: Before: 3.50 s After: 0.27 s Note how the time increases between the two cases in the two versions. The new code increases relative to the number of commits that need to be walked, but not directly relative to the number of 'from' commits. Signed-off-by: Derrick Stolee <dstolee@microsoft.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-07-21 00:33:28 +08:00
min_commit_date, min_generation);
while (from) {
clear_commit_marks(from->item, PARENT1);
from = from->next;
}
while (to) {
clear_commit_marks(to->item, PARENT2);
to = to->next;
}
object_array_clear(&from_objs);
return result;
}
commit-reach: implement get_reachable_subset The existing reachability algorithms in commit-reach.c focus on finding merge-bases or determining if all commits in a set X can reach at least one commit in a set Y. However, for two commits sets X and Y, we may also care about which commits in Y are reachable from at least one commit in X. Implement get_reachable_subset() which answers this question. Given two arrays of commits, 'from' and 'to', return a commit_list with every commit from the 'to' array that is reachable from at least one commit in the 'from' array. The algorithm is a simple walk starting at the 'from' commits, using the PARENT2 flag to indicate "this commit has already been added to the walk queue". By marking the 'to' commits with the PARENT1 flag, we can determine when we see a commit from the 'to' array. We remove the PARENT1 flag as we add that commit to the result list to avoid duplicates. The order of the resulting list is a reverse of the order that the commits are discovered in the walk. There are a couple shortcuts to avoid walking more than we need: 1. We determine the minimum generation number of commits in the 'to' array. We do not walk commits with generation number below this minimum. 2. We count how many distinct commits are in the 'to' array, and decrement this count when we discover a 'to' commit during the walk. If this number reaches zero, then we can terminate the walk. Tests will be added using the 'test-tool reach' helper in a subsequent commit. Signed-off-by: Derrick Stolee <dstolee@microsoft.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-11-02 21:14:45 +08:00
struct commit_list *get_reachable_subset(struct commit **from, int nr_from,
struct commit **to, int nr_to,
unsigned int reachable_flag)
{
struct commit **item;
struct commit *current;
struct commit_list *found_commits = NULL;
struct commit **to_last = to + nr_to;
struct commit **from_last = from + nr_from;
uint32_t min_generation = GENERATION_NUMBER_INFINITY;
int num_to_find = 0;
struct prio_queue queue = { compare_commits_by_gen_then_commit_date };
for (item = to; item < to_last; item++) {
uint32_t generation;
commit-reach: implement get_reachable_subset The existing reachability algorithms in commit-reach.c focus on finding merge-bases or determining if all commits in a set X can reach at least one commit in a set Y. However, for two commits sets X and Y, we may also care about which commits in Y are reachable from at least one commit in X. Implement get_reachable_subset() which answers this question. Given two arrays of commits, 'from' and 'to', return a commit_list with every commit from the 'to' array that is reachable from at least one commit in the 'from' array. The algorithm is a simple walk starting at the 'from' commits, using the PARENT2 flag to indicate "this commit has already been added to the walk queue". By marking the 'to' commits with the PARENT1 flag, we can determine when we see a commit from the 'to' array. We remove the PARENT1 flag as we add that commit to the result list to avoid duplicates. The order of the resulting list is a reverse of the order that the commits are discovered in the walk. There are a couple shortcuts to avoid walking more than we need: 1. We determine the minimum generation number of commits in the 'to' array. We do not walk commits with generation number below this minimum. 2. We count how many distinct commits are in the 'to' array, and decrement this count when we discover a 'to' commit during the walk. If this number reaches zero, then we can terminate the walk. Tests will be added using the 'test-tool reach' helper in a subsequent commit. Signed-off-by: Derrick Stolee <dstolee@microsoft.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-11-02 21:14:45 +08:00
struct commit *c = *item;
parse_commit(c);
generation = commit_graph_generation(c);
if (generation < min_generation)
min_generation = generation;
commit-reach: implement get_reachable_subset The existing reachability algorithms in commit-reach.c focus on finding merge-bases or determining if all commits in a set X can reach at least one commit in a set Y. However, for two commits sets X and Y, we may also care about which commits in Y are reachable from at least one commit in X. Implement get_reachable_subset() which answers this question. Given two arrays of commits, 'from' and 'to', return a commit_list with every commit from the 'to' array that is reachable from at least one commit in the 'from' array. The algorithm is a simple walk starting at the 'from' commits, using the PARENT2 flag to indicate "this commit has already been added to the walk queue". By marking the 'to' commits with the PARENT1 flag, we can determine when we see a commit from the 'to' array. We remove the PARENT1 flag as we add that commit to the result list to avoid duplicates. The order of the resulting list is a reverse of the order that the commits are discovered in the walk. There are a couple shortcuts to avoid walking more than we need: 1. We determine the minimum generation number of commits in the 'to' array. We do not walk commits with generation number below this minimum. 2. We count how many distinct commits are in the 'to' array, and decrement this count when we discover a 'to' commit during the walk. If this number reaches zero, then we can terminate the walk. Tests will be added using the 'test-tool reach' helper in a subsequent commit. Signed-off-by: Derrick Stolee <dstolee@microsoft.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-11-02 21:14:45 +08:00
if (!(c->object.flags & PARENT1)) {
c->object.flags |= PARENT1;
num_to_find++;
}
}
for (item = from; item < from_last; item++) {
struct commit *c = *item;
if (!(c->object.flags & PARENT2)) {
c->object.flags |= PARENT2;
parse_commit(c);
prio_queue_put(&queue, *item);
}
}
while (num_to_find && (current = prio_queue_get(&queue)) != NULL) {
struct commit_list *parents;
if (current->object.flags & PARENT1) {
current->object.flags &= ~PARENT1;
current->object.flags |= reachable_flag;
commit_list_insert(current, &found_commits);
num_to_find--;
}
for (parents = current->parents; parents; parents = parents->next) {
struct commit *p = parents->item;
parse_commit(p);
if (commit_graph_generation(p) < min_generation)
commit-reach: implement get_reachable_subset The existing reachability algorithms in commit-reach.c focus on finding merge-bases or determining if all commits in a set X can reach at least one commit in a set Y. However, for two commits sets X and Y, we may also care about which commits in Y are reachable from at least one commit in X. Implement get_reachable_subset() which answers this question. Given two arrays of commits, 'from' and 'to', return a commit_list with every commit from the 'to' array that is reachable from at least one commit in the 'from' array. The algorithm is a simple walk starting at the 'from' commits, using the PARENT2 flag to indicate "this commit has already been added to the walk queue". By marking the 'to' commits with the PARENT1 flag, we can determine when we see a commit from the 'to' array. We remove the PARENT1 flag as we add that commit to the result list to avoid duplicates. The order of the resulting list is a reverse of the order that the commits are discovered in the walk. There are a couple shortcuts to avoid walking more than we need: 1. We determine the minimum generation number of commits in the 'to' array. We do not walk commits with generation number below this minimum. 2. We count how many distinct commits are in the 'to' array, and decrement this count when we discover a 'to' commit during the walk. If this number reaches zero, then we can terminate the walk. Tests will be added using the 'test-tool reach' helper in a subsequent commit. Signed-off-by: Derrick Stolee <dstolee@microsoft.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-11-02 21:14:45 +08:00
continue;
if (p->object.flags & PARENT2)
continue;
p->object.flags |= PARENT2;
prio_queue_put(&queue, p);
}
}
clear_commit_marks_many(nr_to, to, PARENT1);
clear_commit_marks_many(nr_from, from, PARENT2);
return found_commits;
}