Merge branch 'en/ort-perf-batch-9'

The ort merge backend has been optimized by skipping irrelevant
renames.

* en/ort-perf-batch-9:
  diffcore-rename: avoid doing basename comparisons for irrelevant sources
  merge-ort: skip rename detection entirely if possible
  merge-ort: use relevant_sources to filter possible rename sources
  merge-ort: precompute whether directory rename detection is needed
  merge-ort: introduce wrappers for alternate tree traversal
  merge-ort: add data structures for an alternate tree traversal
  merge-ort: precompute subset of sources for which we need rename detection
  diffcore-rename: enable filtering possible rename sources
This commit is contained in:
Junio C Hamano 2021-04-08 13:23:26 -07:00
commit 1b31224e59
4 changed files with 354 additions and 15 deletions

View File

@ -527,6 +527,7 @@ static void update_dir_rename_counts(struct dir_rename_info *info,
}
static void initialize_dir_rename_info(struct dir_rename_info *info,
struct strset *relevant_sources,
struct strset *dirs_removed,
struct strmap *dir_rename_count)
{
@ -534,7 +535,7 @@ static void initialize_dir_rename_info(struct dir_rename_info *info,
struct strmap_entry *entry;
int i;
if (!dirs_removed) {
if (!dirs_removed && !relevant_sources) {
info->setup = 0;
return;
}
@ -549,7 +550,20 @@ static void initialize_dir_rename_info(struct dir_rename_info *info,
strmap_init_with_options(&info->dir_rename_guess, NULL, 0);
/* Setup info->relevant_source_dirs */
info->relevant_source_dirs = dirs_removed;
info->relevant_source_dirs = NULL;
if (dirs_removed || !relevant_sources) {
	/* Borrow the caller's set; nothing is allocated here. */
	info->relevant_source_dirs = dirs_removed; /* might be NULL */
} else {
	/*
	 * No dirs_removed was supplied but relevant_sources was: build a
	 * private set holding the directory of every relevant source.
	 * The set is a strset, so size the allocation from the pointee
	 * rather than naming an unrelated struct type.  This set is owned
	 * by info and is released in cleanup_dir_rename_info(), which
	 * recognizes it by it being different from dirs_removed.
	 */
	info->relevant_source_dirs =
		xmalloc(sizeof(*info->relevant_source_dirs));
	strset_init(info->relevant_source_dirs);
	strset_for_each_entry(relevant_sources, &iter, entry) {
		char *dirname = get_dirname(entry->key);
		/* dirs_removed is NULL on this branch, so every dir is kept */
		if (!dirs_removed ||
		    strset_contains(dirs_removed, dirname))
			strset_add(info->relevant_source_dirs, dirname);
		free(dirname);
	}
}
/*
* Loop setting up both info->idx_map, and doing setup of
@ -627,6 +641,13 @@ static void cleanup_dir_rename_info(struct dir_rename_info *info,
/* dir_rename_guess */
strmap_clear(&info->dir_rename_guess, 1);
/* relevant_source_dirs */
if (info->relevant_source_dirs &&
info->relevant_source_dirs != dirs_removed) {
strset_clear(info->relevant_source_dirs);
FREE_AND_NULL(info->relevant_source_dirs);
}
/* dir_rename_count */
if (!keep_dir_rename_count) {
partial_clear_dir_rename_count(info->dir_rename_count);
@ -749,6 +770,7 @@ static int idx_possible_rename(char *filename, struct dir_rename_info *info)
static int find_basename_matches(struct diff_options *options,
int minimum_score,
struct dir_rename_info *info,
struct strset *relevant_sources,
struct strset *dirs_removed)
{
/*
@ -839,6 +861,11 @@ static int find_basename_matches(struct diff_options *options,
intptr_t src_index;
intptr_t dst_index;
/* Skip irrelevant sources */
if (relevant_sources &&
!strset_contains(relevant_sources, filename))
continue;
/*
* If the basename is unique among remaining sources, then
* src_index will equal 'i' and we can attempt to match it
@ -991,11 +1018,12 @@ static int find_renames(struct diff_score *mx,
return count;
}
static void remove_unneeded_paths_from_src(int detecting_copies)
static void remove_unneeded_paths_from_src(int detecting_copies,
struct strset *interesting)
{
int i, new_num_src;
if (detecting_copies)
if (detecting_copies && !interesting)
return; /* nothing to remove */
if (break_idx)
return; /* culling incompatible with break detection */
@ -1022,12 +1050,18 @@ static void remove_unneeded_paths_from_src(int detecting_copies)
* from rename_src here.
*/
for (i = 0, new_num_src = 0; i < rename_src_nr; i++) {
struct diff_filespec *one = rename_src[i].p->one;
/*
* renames are stored in rename_dst, so if a rename has
* already been detected using this source, we can just
* remove the source knowing rename_dst has its info.
*/
if (rename_src[i].p->one->rename_used)
if (!detecting_copies && one->rename_used)
continue;
/* If we don't care about the source path, skip it */
if (interesting && !strset_contains(interesting, one->path))
continue;
if (new_num_src < i)
@ -1040,6 +1074,7 @@ static void remove_unneeded_paths_from_src(int detecting_copies)
}
void diffcore_rename_extended(struct diff_options *options,
struct strset *relevant_sources,
struct strset *dirs_removed,
struct strmap *dir_rename_count)
{
@ -1060,6 +1095,8 @@ void diffcore_rename_extended(struct diff_options *options,
want_copies = (detect_rename == DIFF_DETECT_COPY);
if (dirs_removed && (break_idx || want_copies))
BUG("dirs_removed incompatible with break/copy detection");
if (break_idx && relevant_sources)
BUG("break detection incompatible with source specification");
if (!minimum_score)
minimum_score = DEFAULT_RENAME_SCORE;
@ -1127,9 +1164,10 @@ void diffcore_rename_extended(struct diff_options *options,
/*
* Cull sources:
* - remove ones corresponding to exact renames
* - remove ones not found in relevant_sources
*/
trace2_region_enter("diff", "cull after exact", options->repo);
remove_unneeded_paths_from_src(want_copies);
remove_unneeded_paths_from_src(want_copies, relevant_sources);
trace2_region_leave("diff", "cull after exact", options->repo);
} else {
/* Determine minimum score to match basenames */
@ -1148,12 +1186,12 @@ void diffcore_rename_extended(struct diff_options *options,
* - remove ones involved in renames (found via exact match)
*/
trace2_region_enter("diff", "cull after exact", options->repo);
remove_unneeded_paths_from_src(want_copies);
remove_unneeded_paths_from_src(want_copies, NULL);
trace2_region_leave("diff", "cull after exact", options->repo);
/* Preparation for basename-driven matching. */
trace2_region_enter("diff", "dir rename setup", options->repo);
initialize_dir_rename_info(&info,
initialize_dir_rename_info(&info, relevant_sources,
dirs_removed, dir_rename_count);
trace2_region_leave("diff", "dir rename setup", options->repo);
@ -1161,15 +1199,18 @@ void diffcore_rename_extended(struct diff_options *options,
trace2_region_enter("diff", "basename matches", options->repo);
rename_count += find_basename_matches(options,
min_basename_score,
&info, dirs_removed);
&info,
relevant_sources,
dirs_removed);
trace2_region_leave("diff", "basename matches", options->repo);
/*
* Cull sources, again:
* - remove ones involved in renames (found via basenames)
* - remove ones not found in relevant_sources
*/
trace2_region_enter("diff", "cull basename", options->repo);
remove_unneeded_paths_from_src(want_copies);
remove_unneeded_paths_from_src(want_copies, relevant_sources);
trace2_region_leave("diff", "cull basename", options->repo);
}
@ -1341,5 +1382,5 @@ void diffcore_rename_extended(struct diff_options *options,
void diffcore_rename(struct diff_options *options)
{
diffcore_rename_extended(options, NULL, NULL);
diffcore_rename_extended(options, NULL, NULL, NULL);
}

View File

@ -166,6 +166,7 @@ void partial_clear_dir_rename_count(struct strmap *dir_rename_count);
void diffcore_break(struct repository *, int);
void diffcore_rename(struct diff_options *);
void diffcore_rename_extended(struct diff_options *options,
struct strset *relevant_sources,
struct strset *dirs_removed,
struct strmap *dir_rename_count);
void diffcore_merge_broken(void);

View File

@ -51,6 +51,12 @@ enum merge_side {
MERGE_SIDE2 = 2
};
/*
 * One saved invocation of a tree-traversal callback.
 *
 * traverse_trees_wrapper_callback() fills one of these per entry it is
 * called with, so that traverse_trees_wrapper() can replay the call on
 * the original callback afterwards.
 */
struct traversal_callback_data {
/* 'mask' argument the callback was invoked with */
unsigned long mask;
/* 'dirmask' argument the callback was invoked with */
unsigned long dirmask;
/* copy of the three name entries the callback was invoked with */
struct name_entry names[3];
};
struct rename_info {
/*
* All variables that are arrays of size 3 correspond to data tracked
@ -88,6 +94,44 @@ struct rename_info {
*/
struct strmap dir_renames[3];
/*
* relevant_sources: deleted paths for which we need rename detection
*
* relevant_sources is a set of deleted paths on each side of
* history for which we need rename detection. If a path is deleted
* on one side of history, we need to detect if it is part of a
* rename if either
* * we need to detect renames for an ancestor directory
* * the file is modified/deleted on the other side of history
* If neither of those are true, we can skip rename detection for
* that path.
*/
struct strset relevant_sources[3];
/*
* dir_rename_mask:
* 0: optimization removing unmodified potential rename source okay
* 2 or 4: optimization okay, but must check for files added to dir
* 7: optimization forbidden; need rename source in case of dir rename
*/
unsigned dir_rename_mask:3;
/*
* callback_data_*: supporting data structures for alternate traversal
*
* We sometimes need to be able to traverse through all the files
* in a given tree before all immediate subdirectories within that
* tree. Since traverse_trees() doesn't do that naturally, we have
* a traverse_trees_wrapper() that stores any immediate
* subdirectories while traversing files, then traverses the
* immediate subdirectories later. These callback_data* variables
* store the information for the subdirectories so that we can do
* that traversal order.
*/
struct traversal_callback_data *callback_data;
int callback_data_nr, callback_data_alloc;
char *callback_data_traverse_path;
/*
* needed_limit: value needed for inexact rename detection to run
*
@ -358,6 +402,8 @@ static void clear_or_reinit_internal_opts(struct merge_options_internal *opti,
strmap_clear(&renames->dir_rename_count[i], 1);
strmap_func(&renames->dir_renames[i], 0);
strset_func(&renames->relevant_sources[i]);
}
if (!reinitialize) {
@ -380,6 +426,12 @@ static void clear_or_reinit_internal_opts(struct merge_options_internal *opti,
}
strmap_clear(&opti->output, 0);
}
renames->dir_rename_mask = 0;
/* Clean out callback_data as well. */
FREE_AND_NULL(renames->callback_data);
renames->callback_data_nr = renames->callback_data_alloc = 0;
}
static int err(struct merge_options *opt, const char *err, ...)
@ -470,6 +522,82 @@ static char *unique_path(struct strmap *existing_paths,
/*** Function Grouping: functions related to collect_merge_info() ***/
static int traverse_trees_wrapper_callback(int n,
unsigned long mask,
unsigned long dirmask,
struct name_entry *names,
struct traverse_info *info)
{
struct merge_options *opt = info->data;
struct rename_info *renames = &opt->priv->renames;
unsigned filemask = mask & ~dirmask;
assert(n==3);
if (!renames->callback_data_traverse_path)
renames->callback_data_traverse_path = xstrdup(info->traverse_path);
if (filemask && filemask == renames->dir_rename_mask)
renames->dir_rename_mask = 0x07;
ALLOC_GROW(renames->callback_data, renames->callback_data_nr + 1,
renames->callback_data_alloc);
renames->callback_data[renames->callback_data_nr].mask = mask;
renames->callback_data[renames->callback_data_nr].dirmask = dirmask;
COPY_ARRAY(renames->callback_data[renames->callback_data_nr].names,
names, 3);
renames->callback_data_nr++;
return mask;
}
/*
 * Much like traverse_trees(), BUT:
 *   - read all the tree entries FIRST, saving them
 *   - note that the above step provides an opportunity to compute necessary
 *     additional details before the "real" traversal
 *   - loop through the saved entries and call the original callback on them
 *
 * Returns 0 on success, or the negative value from traverse_trees() if the
 * collecting pass fails.  On both paths info->fn, the saved-entry count and
 * renames->callback_data_traverse_path are put back to what they were on
 * entry, so a failed traversal neither leaks the duplicated path nor leaves
 * the collecting callback installed in info.
 */
static int traverse_trees_wrapper(struct index_state *istate,
				  int n,
				  struct tree_desc *t,
				  struct traverse_info *info)
{
	int ret, i, old_offset;
	traverse_callback_t old_fn;
	char *old_callback_data_traverse_path;

	struct merge_options *opt = info->data;
	struct rename_info *renames = &opt->priv->renames;

	assert(renames->dir_rename_mask == 2 || renames->dir_rename_mask == 4);

	/* Save the state this call overrides, so it can be unwound below */
	old_callback_data_traverse_path = renames->callback_data_traverse_path;
	old_fn = info->fn;
	old_offset = renames->callback_data_nr;

	/* Pass 1: collect entries instead of processing them */
	renames->callback_data_traverse_path = NULL;
	info->fn = traverse_trees_wrapper_callback;
	ret = traverse_trees(istate, n, t, info);
	info->fn = old_fn;
	if (ret < 0)
		goto cleanup;
	ret = 0;

	/*
	 * Pass 2: replay the saved entries on the original callback.
	 * NOTE(review): the replayed callback's return value is ignored,
	 * exactly as before; confirm that is intended.
	 */
	info->traverse_path = renames->callback_data_traverse_path;
	for (i = old_offset; i < renames->callback_data_nr; ++i) {
		info->fn(n,
			 renames->callback_data[i].mask,
			 renames->callback_data[i].dirmask,
			 renames->callback_data[i].names,
			 info);
	}
	/* The path it pointed at is freed below; do not leave it dangling */
	info->traverse_path = NULL;

cleanup:
	/* Drop the entries this call saved and restore the outer state */
	renames->callback_data_nr = old_offset;
	free(renames->callback_data_traverse_path);
	renames->callback_data_traverse_path = old_callback_data_traverse_path;
	return ret;
}
static void setup_path_info(struct merge_options *opt,
struct string_list_item *result,
const char *current_dir_name,
@ -533,12 +661,22 @@ static void add_pair(struct merge_options *opt,
struct name_entry *names,
const char *pathname,
unsigned side,
unsigned is_add /* if false, is_delete */)
unsigned is_add /* if false, is_delete */,
unsigned match_mask,
unsigned dir_rename_mask)
{
struct diff_filespec *one, *two;
struct rename_info *renames = &opt->priv->renames;
int names_idx = is_add ? side : 0;
if (!is_add) {
unsigned content_relevant = (match_mask == 0);
unsigned location_relevant = (dir_rename_mask == 0x07);
if (content_relevant || location_relevant)
strset_add(&renames->relevant_sources[side], pathname);
}
one = alloc_filespec(pathname);
two = alloc_filespec(pathname);
fill_filespec(is_add ? two : one,
@ -557,6 +695,36 @@ static void collect_rename_info(struct merge_options *opt,
struct rename_info *renames = &opt->priv->renames;
unsigned side;
/*
* Update dir_rename_mask (determines ignore-rename-source validity)
*
* dir_rename_mask helps us keep track of when directory rename
* detection may be relevant. Basically, whenever a directory is
* removed on one side of history, and a file is added to that
* directory on the other side of history, directory rename
* detection is relevant (meaning we have to detect renames for all
* files within that directory to deduce where the directory
* moved). Also, whenever a directory needs directory rename
* detection, due to the "majority rules" choice for where to move
* it (see t6423 testcase 1f), we also need to detect renames for
* all files within subdirectories of that directory as well.
*
* Here we haven't looked at files within the directory yet, we are
* just looking at the directory itself. So, if we aren't yet in
* a case where a parent directory needed directory rename detection
* (i.e. dir_rename_mask != 0x07), and if the directory was removed
* on one side of history, record the mask of the other side of
* history in dir_rename_mask.
*/
if (renames->dir_rename_mask != 0x07 &&
(dirmask == 3 || dirmask == 5)) {
/* simple sanity check */
assert(renames->dir_rename_mask == 0 ||
renames->dir_rename_mask == (dirmask & ~1));
/* update dir_rename_mask; have it record mask of new side */
renames->dir_rename_mask = (dirmask & ~1);
}
/* Update dirs_removed, as needed */
if (dirmask == 1 || dirmask == 3 || dirmask == 5) {
/* absent_mask = 0x07 - dirmask; sides = absent_mask/2 */
@ -575,11 +743,15 @@ static void collect_rename_info(struct merge_options *opt,
/* Check for deletion on side */
if ((filemask & 1) && !(filemask & side_mask))
add_pair(opt, names, fullname, side, 0 /* delete */);
add_pair(opt, names, fullname, side, 0 /* delete */,
match_mask & filemask,
renames->dir_rename_mask);
/* Check for addition on side */
if (!(filemask & 1) && (filemask & side_mask))
add_pair(opt, names, fullname, side, 1 /* add */);
add_pair(opt, names, fullname, side, 1 /* add */,
match_mask & filemask,
renames->dir_rename_mask);
}
}
@ -597,12 +769,14 @@ static int collect_merge_info_callback(int n,
*/
struct merge_options *opt = info->data;
struct merge_options_internal *opti = opt->priv;
struct rename_info *renames = &opt->priv->renames;
struct string_list_item pi; /* Path Info */
struct conflict_info *ci; /* typed alias to pi.util (which is void*) */
struct name_entry *p;
size_t len;
char *fullpath;
const char *dirname = opti->current_dir_name;
unsigned prev_dir_rename_mask = renames->dir_rename_mask;
unsigned filemask = mask & ~dirmask;
unsigned match_mask = 0; /* will be updated below */
unsigned mbase_null = !(mask & 1);
@ -743,8 +917,13 @@ static int collect_merge_info_callback(int n,
original_dir_name = opti->current_dir_name;
opti->current_dir_name = pi.string;
ret = traverse_trees(NULL, 3, t, &newinfo);
if (renames->dir_rename_mask == 0 ||
renames->dir_rename_mask == 0x07)
ret = traverse_trees(NULL, 3, t, &newinfo);
else
ret = traverse_trees_wrapper(NULL, 3, t, &newinfo);
opti->current_dir_name = original_dir_name;
renames->dir_rename_mask = prev_dir_rename_mask;
for (i = MERGE_BASE; i <= MERGE_SIDE2; i++)
free(buf[i]);
@ -1977,6 +2156,19 @@ static int process_renames(struct merge_options *opt,
return clean_merge;
}
/*
 * Returns 1 if the given side has at least one queued pair and a
 * non-empty set of relevant sources, 0 otherwise.
 */
static inline int possible_side_renames(struct rename_info *renames,
					unsigned side_index)
{
	/* No pairs on this side: nothing to consider */
	if (renames->pairs[side_index].nr <= 0)
		return 0;

	return !strset_empty(&renames->relevant_sources[side_index]);
}
/*
 * Returns 1 if possible_side_renames() holds for side 1 or side 2,
 * 0 otherwise.
 */
static inline int possible_renames(struct rename_info *renames)
{
	if (possible_side_renames(renames, 1))
		return 1;

	return possible_side_renames(renames, 2);
}
static void resolve_diffpair_statuses(struct diff_queue_struct *q)
{
/*
@ -2013,6 +2205,16 @@ static void detect_regular_renames(struct merge_options *opt,
struct diff_options diff_opts;
struct rename_info *renames = &opt->priv->renames;
if (!possible_side_renames(renames, side_index)) {
/*
* No rename detection needed for this side, but we still need
* to make sure 'adds' are marked correctly in case the other
* side had directory renames.
*/
resolve_diffpair_statuses(&renames->pairs[side_index]);
return;
}
repo_diff_setup(opt->repo, &diff_opts);
diff_opts.flags.recursive = 1;
diff_opts.flags.rename_empty = 0;
@ -2028,6 +2230,7 @@ static void detect_regular_renames(struct merge_options *opt,
diff_queued_diff = renames->pairs[side_index];
trace2_region_enter("diff", "diffcore_rename", opt->repo);
diffcore_rename_extended(&diff_opts,
&renames->relevant_sources[side_index],
&renames->dirs_removed[side_index],
&renames->dir_rename_count[side_index]);
trace2_region_leave("diff", "diffcore_rename", opt->repo);
@ -2129,6 +2332,8 @@ static int detect_and_process_renames(struct merge_options *opt,
int need_dir_renames, s, clean = 1;
memset(&combined, 0, sizeof(combined));
if (!possible_renames(renames))
goto cleanup;
trace2_region_enter("merge", "regular renames", opt->repo);
detect_regular_renames(opt, MERGE_SIDE1);
@ -2163,6 +2368,25 @@ static int detect_and_process_renames(struct merge_options *opt,
clean &= process_renames(opt, &combined);
trace2_region_leave("merge", "process renames", opt->repo);
goto simple_cleanup; /* collect_renames() handles some of cleanup */
cleanup:
/*
* Free now unneeded filepairs, which would have been handled
* in collect_renames() normally but we skipped that code.
*/
for (s = MERGE_SIDE1; s <= MERGE_SIDE2; s++) {
struct diff_queue_struct *side_pairs;
int i;
side_pairs = &renames->pairs[s];
for (i = 0; i < side_pairs->nr; ++i) {
struct diff_filepair *p = side_pairs->queue[i];
diff_free_filepair(p);
}
}
simple_cleanup:
/* Free memory for renames->pairs[] and combined */
for (s = MERGE_SIDE1; s <= MERGE_SIDE2; s++) {
free(renames->pairs[s].queue);
@ -3226,6 +3450,8 @@ static void merge_start(struct merge_options *opt, struct merge_result *result)
NULL, 1);
strmap_init_with_options(&renames->dir_renames[i],
NULL, 0);
strset_init_with_options(&renames->relevant_sources[i],
NULL, 0);
}
/*

View File

@ -4895,6 +4895,77 @@ test_expect_merge_algorithm failure success '12f: Trivial directory resolve, cac
)
'
# Testcase 12g, Testcase with two kinds of "relevant" renames
# Commit O: somefile_O, subdir/{a_O,b_O}
# Commit A: somefile_A, subdir/{a_O,b_O,c_A}
# Commit B: newfile_B, newdir/{a_B,b_B}
# Expected: newfile_{merged}, newdir/{a_B,b_B,c_A}
# Build repository 12g with three branches (O, A, B) for testcase 12g.
test_setup_12g () {
test_create_repo 12g &&
(
cd 12g &&
# Commit O (common ancestor): somefile plus subdir/{a,b}
mkdir -p subdir &&
test_write_lines upon a time there was a >somefile &&
test_write_lines 1 2 3 4 5 6 7 8 9 10 >subdir/a &&
test_write_lines one two three four five six >subdir/b &&
git add . &&
test_tick &&
git commit -m "O" &&
git branch O &&
git branch A &&
git branch B &&
# Commit A: modify somefile in place and add an empty subdir/c
git switch A &&
test_write_lines once upon a time there was a >somefile &&
> subdir/c &&
git add somefile subdir/c &&
test_tick &&
git commit -m "A" &&
# Commit B: rename somefile -> newfile and subdir/ -> newdir/,
# then modify every moved file
git checkout B &&
git mv somefile newfile &&
git mv subdir newdir &&
echo repo >>newfile &&
test_write_lines 1 2 3 4 5 6 7 8 9 10 11 >newdir/a &&
test_write_lines one two three four five six seven >newdir/b &&
git add newfile newdir &&
test_tick &&
git commit -m "B"
)
}
# Merge B into A with merge.directoryRenames=true and check that:
#   - newfile holds both A's edit and B's edit of the renamed somefile
#   - the index has exactly four entries
#   - newdir/{a,b} match B, and A's subdir/c ended up as newdir/c
#   - nothing remains under subdir/
test_expect_success '12g: Testcase with two kinds of "relevant" renames' '
test_setup_12g &&
(
cd 12g &&
git checkout A^0 &&
git -c merge.directoryRenames=true merge -s recursive B^0 &&
test_write_lines once upon a time there was a repo >expect &&
test_cmp expect newfile &&
git ls-files -s >out &&
test_line_count = 4 out &&
git rev-parse >actual \
HEAD:newdir/a HEAD:newdir/b HEAD:newdir/c &&
git rev-parse >expect \
B:newdir/a B:newdir/b A:subdir/c &&
test_cmp expect actual &&
test_must_fail git rev-parse HEAD:subdir/a &&
test_must_fail git rev-parse HEAD:subdir/b &&
test_must_fail git rev-parse HEAD:subdir/c &&
test_path_is_missing subdir/ &&
test_path_is_file newdir/c
)
'
###########################################################################
# SECTION 13: Checking informational and conflict messages
#