2005-05-21 17:39:09 +08:00
|
|
|
/*
|
diff: restrict when prefetching occurs
Commit 7fbbcb21b1 ("diff: batch fetching of missing blobs", 2019-04-08)
optimized "diff" by prefetching blobs in a partial clone, but there are
some cases wherein blobs do not need to be prefetched. In these cases,
any command that uses the diff machinery will unnecessarily fetch blobs.
diffcore_std() may read blobs when it calls the following functions:
(1) diffcore_skip_stat_unmatch() (controlled by the config variable
diff.autorefreshindex)
(2) diffcore_break() and diffcore_merge_broken() (for break-rewrite
detection)
(3) diffcore_rename() (for rename detection)
(4) diffcore_pickaxe() (for detecting addition/deletion of specified
string)
Instead of always prefetching blobs, teach diffcore_skip_stat_unmatch(),
diffcore_break(), and diffcore_rename() to prefetch blobs upon the first
read of a missing object. This covers (1), (2), and (3): to cover the
rest, teach diffcore_std() to prefetch if the output type is one that
includes blob data (and hence blob data will be required later anyway),
or if it knows that (4) will be run.
Helped-by: Jeff King <peff@peff.net>
Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-04-08 06:11:43 +08:00
|
|
|
*
|
2005-05-21 17:39:09 +08:00
|
|
|
* Copyright (C) 2005 Junio C Hamano
|
|
|
|
*/
|
|
|
|
#include "cache.h"
|
|
|
|
#include "diff.h"
|
|
|
|
#include "diffcore.h"
|
2018-05-16 07:42:15 +08:00
|
|
|
#include "object-store.h"
|
2013-11-15 03:20:26 +08:00
|
|
|
#include "hashmap.h"
|
2011-02-20 17:51:16 +08:00
|
|
|
#include "progress.h"
|
diff: restrict when prefetching occurs
Commit 7fbbcb21b1 ("diff: batch fetching of missing blobs", 2019-04-08)
optimized "diff" by prefetching blobs in a partial clone, but there are
some cases wherein blobs do not need to be prefetched. In these cases,
any command that uses the diff machinery will unnecessarily fetch blobs.
diffcore_std() may read blobs when it calls the following functions:
(1) diffcore_skip_stat_unmatch() (controlled by the config variable
diff.autorefreshindex)
(2) diffcore_break() and diffcore_merge_broken() (for break-rewrite
detection)
(3) diffcore_rename() (for rename detection)
(4) diffcore_pickaxe() (for detecting addition/deletion of specified
string)
Instead of always prefetching blobs, teach diffcore_skip_stat_unmatch(),
diffcore_break(), and diffcore_rename() to prefetch blobs upon the first
read of a missing object. This covers (1), (2), and (3): to cover the
rest, teach diffcore_std() to prefetch if the output type is one that
includes blob data (and hence blob data will be required later anyway),
or if it knows that (4) will be run.
Helped-by: Jeff King <peff@peff.net>
Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-04-08 06:11:43 +08:00
|
|
|
#include "promisor-remote.h"
|
2005-05-21 17:39:09 +08:00
|
|
|
|
2005-05-24 16:10:48 +08:00
|
|
|
/* Table of rename/copy destinations */
|
|
|
|
|
|
|
|
/*
 * One candidate rename/copy destination.
 *
 * The rename_dst array is kept sorted by two->path (find_rename_dst()
 * does a binary search on it with strcmp()).
 */
struct diff_rename_dst {
	/* filespec of the destination path; allocated by add_rename_dst() */
	struct diff_filespec *two;
	/* NULL until record_rename_pair() matches this destination */
	struct diff_filepair *pair;
};

/* The table itself, the number of used entries, and its allocated size. */
static struct diff_rename_dst *rename_dst;
static int rename_dst_nr;
static int rename_dst_alloc;
|
2005-05-21 17:39:09 +08:00
|
|
|
|
2015-02-27 09:39:48 +08:00
|
|
|
static int find_rename_dst(struct diff_filespec *two)
|
2005-05-21 17:39:09 +08:00
|
|
|
{
|
2005-05-24 16:10:48 +08:00
|
|
|
int first, last;
|
|
|
|
|
|
|
|
first = 0;
|
|
|
|
last = rename_dst_nr;
|
|
|
|
while (last > first) {
|
2019-06-14 01:51:56 +08:00
|
|
|
int next = first + ((last - first) >> 1);
|
2005-05-24 16:10:48 +08:00
|
|
|
struct diff_rename_dst *dst = &(rename_dst[next]);
|
|
|
|
int cmp = strcmp(two->path, dst->two->path);
|
|
|
|
if (!cmp)
|
2015-02-27 09:39:48 +08:00
|
|
|
return next;
|
2005-05-24 16:10:48 +08:00
|
|
|
if (cmp < 0) {
|
|
|
|
last = next;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
first = next+1;
|
|
|
|
}
|
2015-02-27 09:39:48 +08:00
|
|
|
return -first - 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct diff_rename_dst *locate_rename_dst(struct diff_filespec *two)
|
|
|
|
{
|
|
|
|
int ofs = find_rename_dst(two);
|
|
|
|
return ofs < 0 ? NULL : &rename_dst[ofs];
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Returns 0 on success, -1 if we found a duplicate.
|
|
|
|
*/
|
|
|
|
static int add_rename_dst(struct diff_filespec *two)
|
|
|
|
{
|
|
|
|
int first = find_rename_dst(two);
|
|
|
|
|
|
|
|
if (first >= 0)
|
|
|
|
return -1;
|
|
|
|
first = -first - 1;
|
|
|
|
|
2005-05-24 16:10:48 +08:00
|
|
|
/* insert to make it at "first" */
|
2014-03-04 06:31:54 +08:00
|
|
|
ALLOC_GROW(rename_dst, rename_dst_nr + 1, rename_dst_alloc);
|
2005-05-24 16:10:48 +08:00
|
|
|
rename_dst_nr++;
|
|
|
|
if (first < rename_dst_nr)
|
2018-01-23 01:50:09 +08:00
|
|
|
MOVE_ARRAY(rename_dst + first + 1, rename_dst + first,
|
|
|
|
rename_dst_nr - first - 1);
|
2005-09-16 07:13:43 +08:00
|
|
|
rename_dst[first].two = alloc_filespec(two->path);
|
2017-05-31 01:30:50 +08:00
|
|
|
fill_filespec(rename_dst[first].two, &two->oid, two->oid_valid,
|
2016-06-25 07:09:23 +08:00
|
|
|
two->mode);
|
2005-05-24 16:10:48 +08:00
|
|
|
rename_dst[first].pair = NULL;
|
2015-02-27 09:39:48 +08:00
|
|
|
return 0;
|
2005-05-21 17:39:09 +08:00
|
|
|
}
|
|
|
|
|
2005-05-28 06:55:55 +08:00
|
|
|
/* Table of rename/copy src files */
|
2005-05-24 16:10:48 +08:00
|
|
|
/*
 * One candidate rename/copy source.
 *
 * The rename_src array is kept sorted by p->one->path
 * (register_rename_src() inserts at the position found by binary
 * search).
 */
struct diff_rename_src {
	/* the queued filepair whose "one" side is the source file */
	struct diff_filepair *p;
	/* to remember the break score */
	unsigned short score;
};

/* The table itself, the number of used entries, and its allocated size. */
static struct diff_rename_src *rename_src;
static int rename_src_nr;
static int rename_src_alloc;
|
2005-05-21 17:39:09 +08:00
|
|
|
|
2011-01-07 05:50:05 +08:00
|
|
|
static struct diff_rename_src *register_rename_src(struct diff_filepair *p)
|
2005-05-24 16:10:48 +08:00
|
|
|
{
|
|
|
|
int first, last;
|
2011-01-07 05:50:05 +08:00
|
|
|
struct diff_filespec *one = p->one;
|
|
|
|
unsigned short score = p->score;
|
2005-05-24 16:10:48 +08:00
|
|
|
|
|
|
|
first = 0;
|
|
|
|
last = rename_src_nr;
|
|
|
|
while (last > first) {
|
2019-06-14 01:51:56 +08:00
|
|
|
int next = first + ((last - first) >> 1);
|
2005-05-24 16:10:48 +08:00
|
|
|
struct diff_rename_src *src = &(rename_src[next]);
|
2011-01-07 05:50:05 +08:00
|
|
|
int cmp = strcmp(one->path, src->p->one->path);
|
2005-05-24 16:10:48 +08:00
|
|
|
if (!cmp)
|
|
|
|
return src;
|
|
|
|
if (cmp < 0) {
|
|
|
|
last = next;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
first = next+1;
|
|
|
|
}
|
2005-05-28 06:55:55 +08:00
|
|
|
|
2005-05-24 16:10:48 +08:00
|
|
|
/* insert to make it at "first" */
|
2014-03-04 06:31:54 +08:00
|
|
|
ALLOC_GROW(rename_src, rename_src_nr + 1, rename_src_alloc);
|
2005-05-24 16:10:48 +08:00
|
|
|
rename_src_nr++;
|
|
|
|
if (first < rename_src_nr)
|
2018-01-23 01:50:09 +08:00
|
|
|
MOVE_ARRAY(rename_src + first + 1, rename_src + first,
|
|
|
|
rename_src_nr - first - 1);
|
2011-01-07 05:50:05 +08:00
|
|
|
rename_src[first].p = p;
|
2006-04-09 11:17:46 +08:00
|
|
|
rename_src[first].score = score;
|
2005-05-24 16:10:48 +08:00
|
|
|
return &(rename_src[first]);
|
2005-05-21 17:39:09 +08:00
|
|
|
}
|
|
|
|
|
2007-06-21 19:52:11 +08:00
|
|
|
static int basename_same(struct diff_filespec *src, struct diff_filespec *dst)
|
|
|
|
{
|
|
|
|
int src_len = strlen(src->path), dst_len = strlen(dst->path);
|
|
|
|
while (src_len && dst_len) {
|
|
|
|
char c1 = src->path[--src_len];
|
|
|
|
char c2 = dst->path[--dst_len];
|
|
|
|
if (c1 != c2)
|
|
|
|
return 0;
|
|
|
|
if (c1 == '/')
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
return (!src_len || src->path[src_len - 1] == '/') &&
|
|
|
|
(!dst_len || dst->path[dst_len - 1] == '/');
|
|
|
|
}
|
|
|
|
|
2005-05-21 17:39:09 +08:00
|
|
|
/*
 * One <source, destination> similarity candidate.
 *
 * score_compare() orders these with the best match first and treats a
 * negative dst as "unused slot", sinking such entries to the end.
 */
struct diff_score {
	int src;		/* index in rename_src */
	int dst;		/* index in rename_dst */
	unsigned short score;	/* primary sort key, higher is better */
	short name_score;	/* tie-breaker when scores are equal */
};
|
|
|
|
|
diff: restrict when prefetching occurs
Commit 7fbbcb21b1 ("diff: batch fetching of missing blobs", 2019-04-08)
optimized "diff" by prefetching blobs in a partial clone, but there are
some cases wherein blobs do not need to be prefetched. In these cases,
any command that uses the diff machinery will unnecessarily fetch blobs.
diffcore_std() may read blobs when it calls the following functions:
(1) diffcore_skip_stat_unmatch() (controlled by the config variable
diff.autorefreshindex)
(2) diffcore_break() and diffcore_merge_broken() (for break-rewrite
detection)
(3) diffcore_rename() (for rename detection)
(4) diffcore_pickaxe() (for detecting addition/deletion of specified
string)
Instead of always prefetching blobs, teach diffcore_skip_stat_unmatch(),
diffcore_break(), and diffcore_rename() to prefetch blobs upon the first
read of a missing object. This covers (1), (2), and (3): to cover the
rest, teach diffcore_std() to prefetch if the output type is one that
includes blob data (and hence blob data will be required later anyway),
or if it knows that (4) will be run.
Helped-by: Jeff King <peff@peff.net>
Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-04-08 06:11:43 +08:00
|
|
|
/*
 * Data handed to prefetch() through the missing-object callback that
 * estimate_similarity() installs.
 */
struct prefetch_options {
	/* repository passed on to the fetch helpers */
	struct repository *repo;
	/* when non-zero, sources whose pair is unmodified are not fetched */
	int skip_unmodified;
};
|
|
|
|
static void prefetch(void *prefetch_options)
|
|
|
|
{
|
|
|
|
struct prefetch_options *options = prefetch_options;
|
|
|
|
int i;
|
|
|
|
struct oid_array to_fetch = OID_ARRAY_INIT;
|
|
|
|
|
|
|
|
for (i = 0; i < rename_dst_nr; i++) {
|
|
|
|
if (rename_dst[i].pair)
|
|
|
|
/*
|
|
|
|
* The loop in diffcore_rename() will not need these
|
|
|
|
* blobs, so skip prefetching.
|
|
|
|
*/
|
|
|
|
continue; /* already found exact match */
|
|
|
|
diff_add_if_missing(options->repo, &to_fetch,
|
|
|
|
rename_dst[i].two);
|
|
|
|
}
|
|
|
|
for (i = 0; i < rename_src_nr; i++) {
|
|
|
|
if (options->skip_unmodified &&
|
|
|
|
diff_unmodified_pair(rename_src[i].p))
|
|
|
|
/*
|
|
|
|
* The loop in diffcore_rename() will not need these
|
|
|
|
* blobs, so skip prefetching.
|
|
|
|
*/
|
|
|
|
continue;
|
|
|
|
diff_add_if_missing(options->repo, &to_fetch,
|
|
|
|
rename_src[i].p->one);
|
|
|
|
}
|
|
|
|
promisor_remote_get_direct(options->repo, to_fetch.oid, to_fetch.nr);
|
|
|
|
oid_array_clear(&to_fetch);
|
|
|
|
}
|
|
|
|
|
2018-09-21 23:57:19 +08:00
|
|
|
static int estimate_similarity(struct repository *r,
|
|
|
|
struct diff_filespec *src,
|
2005-05-21 17:39:09 +08:00
|
|
|
struct diff_filespec *dst,
|
diff: restrict when prefetching occurs
Commit 7fbbcb21b1 ("diff: batch fetching of missing blobs", 2019-04-08)
optimized "diff" by prefetching blobs in a partial clone, but there are
some cases wherein blobs do not need to be prefetched. In these cases,
any command that uses the diff machinery will unnecessarily fetch blobs.
diffcore_std() may read blobs when it calls the following functions:
(1) diffcore_skip_stat_unmatch() (controlled by the config variable
diff.autorefreshindex)
(2) diffcore_break() and diffcore_merge_broken() (for break-rewrite
detection)
(3) diffcore_rename() (for rename detection)
(4) diffcore_pickaxe() (for detecting addition/deletion of specified
string)
Instead of always prefetching blobs, teach diffcore_skip_stat_unmatch(),
diffcore_break(), and diffcore_rename() to prefetch blobs upon the first
read of a missing object. This covers (1), (2), and (3): to cover the
rest, teach diffcore_std() to prefetch if the output type is one that
includes blob data (and hence blob data will be required later anyway),
or if it knows that (4) will be run.
Helped-by: Jeff King <peff@peff.net>
Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-04-08 06:11:43 +08:00
|
|
|
int minimum_score,
|
|
|
|
int skip_unmodified)
|
2005-05-21 17:39:09 +08:00
|
|
|
{
|
|
|
|
/* src points at a file that existed in the original tree (or
|
|
|
|
* optionally a file in the destination tree) and dst points
|
|
|
|
* at a newly created file. They may be quite similar, in which
|
|
|
|
* case we want to say src is renamed to dst or src is copied into
|
|
|
|
* dst, and then some edit has been applied to dst.
|
|
|
|
*
|
|
|
|
* Compare them and return how similar they are, representing
|
2005-05-28 06:56:38 +08:00
|
|
|
* the score as an integer between 0 and MAX_SCORE.
|
|
|
|
*
|
|
|
|
* When there is an exact match, it is considered a better
|
|
|
|
* match than anything else; the destination does not even
|
|
|
|
* call into this function in that case.
|
2005-05-21 17:39:09 +08:00
|
|
|
*/
|
2006-03-13 14:26:34 +08:00
|
|
|
unsigned long max_size, delta_size, base_size, src_copied, literal_added;
|
2005-05-21 17:39:09 +08:00
|
|
|
int score;
|
2020-04-08 06:11:41 +08:00
|
|
|
struct diff_populate_filespec_options dpf_options = {
|
|
|
|
.check_size_only = 1
|
|
|
|
};
|
diff: restrict when prefetching occurs
Commit 7fbbcb21b1 ("diff: batch fetching of missing blobs", 2019-04-08)
optimized "diff" by prefetching blobs in a partial clone, but there are
some cases wherein blobs do not need to be prefetched. In these cases,
any command that uses the diff machinery will unnecessarily fetch blobs.
diffcore_std() may read blobs when it calls the following functions:
(1) diffcore_skip_stat_unmatch() (controlled by the config variable
diff.autorefreshindex)
(2) diffcore_break() and diffcore_merge_broken() (for break-rewrite
detection)
(3) diffcore_rename() (for rename detection)
(4) diffcore_pickaxe() (for detecting addition/deletion of specified
string)
Instead of always prefetching blobs, teach diffcore_skip_stat_unmatch(),
diffcore_break(), and diffcore_rename() to prefetch blobs upon the first
read of a missing object. This covers (1), (2), and (3): to cover the
rest, teach diffcore_std() to prefetch if the output type is one that
includes blob data (and hence blob data will be required later anyway),
or if it knows that (4) will be run.
Helped-by: Jeff King <peff@peff.net>
Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-04-08 06:11:43 +08:00
|
|
|
struct prefetch_options prefetch_options = {r, skip_unmodified};
|
|
|
|
|
|
|
|
if (r == the_repository && has_promisor_remote()) {
|
|
|
|
dpf_options.missing_object_cb = prefetch;
|
|
|
|
dpf_options.missing_object_data = &prefetch_options;
|
|
|
|
}
|
2005-05-21 17:39:09 +08:00
|
|
|
|
2005-05-23 12:24:49 +08:00
|
|
|
/* We deal only with regular files. Symlink renames are handled
|
|
|
|
* only when they are exact matches --- in other words, no edits
|
|
|
|
* after renaming.
|
|
|
|
*/
|
|
|
|
if (!S_ISREG(src->mode) || !S_ISREG(dst->mode))
|
|
|
|
return 0;
|
|
|
|
|
2007-10-27 07:51:28 +08:00
|
|
|
/*
|
|
|
|
* Need to check that source and destination sizes are
|
|
|
|
* filled in before comparing them.
|
|
|
|
*
|
|
|
|
* If we already have "cnt_data" filled in, we know it's
|
|
|
|
* all good (avoid checking the size for zero, as that
|
|
|
|
* is a possible size - we really should have a flag to
|
|
|
|
* say whether the size is valid or not!)
|
|
|
|
*/
|
2014-08-16 11:08:04 +08:00
|
|
|
if (!src->cnt_data &&
|
2020-04-08 06:11:41 +08:00
|
|
|
diff_populate_filespec(r, src, &dpf_options))
|
2007-10-27 07:51:28 +08:00
|
|
|
return 0;
|
2014-08-16 11:08:04 +08:00
|
|
|
if (!dst->cnt_data &&
|
2020-04-08 06:11:41 +08:00
|
|
|
diff_populate_filespec(r, dst, &dpf_options))
|
2007-10-27 07:51:28 +08:00
|
|
|
return 0;
|
|
|
|
|
2006-03-13 14:26:34 +08:00
|
|
|
max_size = ((src->size > dst->size) ? src->size : dst->size);
|
2005-05-22 06:55:18 +08:00
|
|
|
base_size = ((src->size < dst->size) ? src->size : dst->size);
|
2006-03-13 14:26:34 +08:00
|
|
|
delta_size = max_size - base_size;
|
2005-05-21 17:39:09 +08:00
|
|
|
|
2005-05-22 06:55:18 +08:00
|
|
|
/* We would not consider edits that change the file size so
|
|
|
|
* drastically. delta_size must be smaller than
|
2005-05-22 16:31:28 +08:00
|
|
|
* (MAX_SCORE-minimum_score)/MAX_SCORE * min(src->size, dst->size).
|
2005-05-28 06:56:38 +08:00
|
|
|
*
|
2005-05-22 06:55:18 +08:00
|
|
|
* Note that base_size == 0 case is handled here already
|
|
|
|
* and the final score computation below would not have a
|
|
|
|
* divide-by-zero issue.
|
2005-05-21 17:39:09 +08:00
|
|
|
*/
|
2011-02-19 12:12:06 +08:00
|
|
|
if (max_size * (MAX_SCORE-minimum_score) < delta_size * MAX_SCORE)
|
2005-05-21 17:39:09 +08:00
|
|
|
return 0;
|
|
|
|
|
diff: restrict when prefetching occurs
Commit 7fbbcb21b1 ("diff: batch fetching of missing blobs", 2019-04-08)
optimized "diff" by prefetching blobs in a partial clone, but there are
some cases wherein blobs do not need to be prefetched. In these cases,
any command that uses the diff machinery will unnecessarily fetch blobs.
diffcore_std() may read blobs when it calls the following functions:
(1) diffcore_skip_stat_unmatch() (controlled by the config variable
diff.autorefreshindex)
(2) diffcore_break() and diffcore_merge_broken() (for break-rewrite
detection)
(3) diffcore_rename() (for rename detection)
(4) diffcore_pickaxe() (for detecting addition/deletion of specified
string)
Instead of always prefetching blobs, teach diffcore_skip_stat_unmatch(),
diffcore_break(), and diffcore_rename() to prefetch blobs upon the first
read of a missing object. This covers (1), (2), and (3): to cover the
rest, teach diffcore_std() to prefetch if the output type is one that
includes blob data (and hence blob data will be required later anyway),
or if it knows that (4) will be run.
Helped-by: Jeff King <peff@peff.net>
Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-04-08 06:11:43 +08:00
|
|
|
dpf_options.check_size_only = 0;
|
|
|
|
|
|
|
|
if (!src->cnt_data && diff_populate_filespec(r, src, &dpf_options))
|
2009-01-20 23:59:57 +08:00
|
|
|
return 0;
|
diff: restrict when prefetching occurs
Commit 7fbbcb21b1 ("diff: batch fetching of missing blobs", 2019-04-08)
optimized "diff" by prefetching blobs in a partial clone, but there are
some cases wherein blobs do not need to be prefetched. In these cases,
any command that uses the diff machinery will unnecessarily fetch blobs.
diffcore_std() may read blobs when it calls the following functions:
(1) diffcore_skip_stat_unmatch() (controlled by the config variable
diff.autorefreshindex)
(2) diffcore_break() and diffcore_merge_broken() (for break-rewrite
detection)
(3) diffcore_rename() (for rename detection)
(4) diffcore_pickaxe() (for detecting addition/deletion of specified
string)
Instead of always prefetching blobs, teach diffcore_skip_stat_unmatch(),
diffcore_break(), and diffcore_rename() to prefetch blobs upon the first
read of a missing object. This covers (1), (2), and (3): to cover the
rest, teach diffcore_std() to prefetch if the output type is one that
includes blob data (and hence blob data will be required later anyway),
or if it knows that (4) will be run.
Helped-by: Jeff King <peff@peff.net>
Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-04-08 06:11:43 +08:00
|
|
|
if (!dst->cnt_data && diff_populate_filespec(r, dst, &dpf_options))
|
2009-01-20 23:59:57 +08:00
|
|
|
return 0;
|
|
|
|
|
2018-09-21 23:57:19 +08:00
|
|
|
if (diffcore_count_changes(r, src, dst,
|
2006-03-12 19:22:10 +08:00
|
|
|
&src->cnt_data, &dst->cnt_data,
|
2006-03-01 08:01:36 +08:00
|
|
|
&src_copied, &literal_added))
|
2005-05-25 03:09:32 +08:00
|
|
|
return 0;
|
2005-06-03 16:36:03 +08:00
|
|
|
|
2006-03-03 14:11:25 +08:00
|
|
|
/* How similar are they?
|
|
|
|
* what percentage of material in dst are from source?
|
2005-05-21 17:39:09 +08:00
|
|
|
*/
|
2006-03-13 14:26:34 +08:00
|
|
|
if (!dst->size)
|
2006-03-03 14:11:25 +08:00
|
|
|
score = 0; /* should not happen */
|
2007-06-25 06:23:28 +08:00
|
|
|
else
|
2007-03-07 09:44:37 +08:00
|
|
|
score = (int)(src_copied * MAX_SCORE / max_size);
|
2005-05-21 17:39:09 +08:00
|
|
|
return score;
|
|
|
|
}
|
|
|
|
|
2005-09-16 07:13:43 +08:00
|
|
|
static void record_rename_pair(int dst_index, int src_index, int score)
|
2005-05-21 17:39:09 +08:00
|
|
|
{
|
2007-10-26 02:19:10 +08:00
|
|
|
struct diff_filespec *src, *dst;
|
2005-05-24 16:10:48 +08:00
|
|
|
struct diff_filepair *dp;
|
[PATCH] Rename/copy detection fix.
The rename/copy detection logic in earlier round was only good
enough to show patch output and discussion on the mailing list
about the diff-raw format updates revealed many problems with
it. This patch fixes all the ones known to me, without making
things I want to do later impossible, mostly related to patch
reordering.
(1) Earlier rename/copy detector determined which one is rename
and which one is copy too early, which made it impossible
to later introduce diffcore transformers to reorder
patches. This patch fixes it by moving that logic to the
very end of the processing.
(2) Earlier output routine diff_flush() was pruning all the
"no-change" entries indiscriminatingly. This was done due
to my false assumption that one of the requirements in the
diff-raw output was not to show such an entry (which
resulted in my incorrect comment about "diff-helper never
being able to be equivalent to built-in diff driver"). My
special thanks go to Linus for correcting me about this.
When we produce diff-raw output, for the downstream to be
able to tell renames from copies, sometimes it _is_
necessary to output "no-change" entries, and this patch
adds diffcore_prune() function for doing it.
(3) Earlier diff_filepair structure was trying to be not too
specific about rename/copy operations, but the purpose of
the structure was to record one or two paths, which _was_
indeed about rename/copy. This patch discards xfrm_msg
field which was trying to be generic for this wrong reason,
and introduces a couple of fields (rename_score and
rename_rank) that are explicitly specific to rename/copy
logic. One thing to note is that the information in a
single diff_filepair structure _still_ does not distinguish
renames from copies, and it is deliberately so. This is to
allow patches to be reordered in later stages.
(4) This patch also adds some tests about diff-raw format
output and makes sure that necessary "no-change" entries
appear on the output.
Signed-off-by: Junio C Hamano <junkio@cox.net>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-05-23 12:26:09 +08:00
|
|
|
|
2005-05-24 16:10:48 +08:00
|
|
|
if (rename_dst[dst_index].pair)
|
|
|
|
die("internal error: dst already matched.");
|
2005-05-21 17:39:09 +08:00
|
|
|
|
2011-01-07 05:50:05 +08:00
|
|
|
src = rename_src[src_index].p->one;
|
2007-10-26 02:20:56 +08:00
|
|
|
src->rename_used++;
|
2007-10-26 02:19:10 +08:00
|
|
|
src->count++;
|
2005-05-21 17:39:09 +08:00
|
|
|
|
2005-05-24 16:10:48 +08:00
|
|
|
dst = rename_dst[dst_index].two;
|
2007-10-26 02:19:10 +08:00
|
|
|
dst->count++;
|
2005-05-21 17:39:09 +08:00
|
|
|
|
2007-10-26 02:19:10 +08:00
|
|
|
dp = diff_queue(NULL, src, dst);
|
2006-08-04 03:01:01 +08:00
|
|
|
dp->renamed_pair = 1;
|
2006-04-09 11:17:46 +08:00
|
|
|
if (!strcmp(src->path, dst->path))
|
|
|
|
dp->score = rename_src[src_index].score;
|
|
|
|
else
|
|
|
|
dp->score = score;
|
2005-05-24 16:10:48 +08:00
|
|
|
rename_dst[dst_index].pair = dp;
|
2005-05-21 17:39:09 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We sort the rename similarity matrix with the score, in descending
|
2005-05-28 06:55:55 +08:00
|
|
|
* order (the most similar first).
|
2005-05-21 17:39:09 +08:00
|
|
|
*/
|
|
|
|
static int score_compare(const void *a_, const void *b_)
|
|
|
|
{
|
|
|
|
const struct diff_score *a = a_, *b = b_;
|
2007-06-25 06:23:28 +08:00
|
|
|
|
Optimize rename detection for a huge diff
When there are N deleted paths and M created paths, we used to
allocate (N x M) "struct diff_score" that record how similar
each of the pair is, and picked the <src,dst> pair that gives
the best match first, and then went on to process worse matches.
This sorting is done so that when two new files in the postimage
that are similar to the same file deleted from the preimage, we
can process the more similar one first, and when processing the
second one, it can notice "Ah, the source I was planning to say
I am a copy of is already taken by somebody else" and continue
on to match itself with another file in the preimage with a
lessor match. This matters to a change introduced between
1.5.3.X series and 1.5.4-rc, that lets the code to favor unused
matches first and then falls back to using already used
matches.
This instead allocates and keeps only a handful rename source
candidates per new files in the postimage. I.e. it makes the
memory requirement from O(N x M) to O(M).
For each dst, we compute similarlity with all sources (i.e. the
number of similarity estimate computations is still O(N x M)),
but we keep handful best src candidates for each dst.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2008-01-30 12:54:56 +08:00
|
|
|
/* sink the unused ones to the bottom */
|
|
|
|
if (a->dst < 0)
|
|
|
|
return (0 <= b->dst);
|
|
|
|
else if (b->dst < 0)
|
|
|
|
return -1;
|
|
|
|
|
2007-06-25 06:23:28 +08:00
|
|
|
if (a->score == b->score)
|
|
|
|
return b->name_score - a->name_score;
|
|
|
|
|
2005-05-21 17:39:09 +08:00
|
|
|
return b->score - a->score;
|
|
|
|
}
|
|
|
|
|
2007-10-26 02:23:26 +08:00
|
|
|
struct file_similarity {
|
2013-11-15 03:20:26 +08:00
|
|
|
struct hashmap_entry entry;
|
2013-11-15 03:19:34 +08:00
|
|
|
int index;
|
2007-10-26 02:23:26 +08:00
|
|
|
struct diff_filespec *filespec;
|
|
|
|
};
|
|
|
|
|
2018-09-21 23:57:19 +08:00
|
|
|
static unsigned int hash_filespec(struct repository *r,
|
|
|
|
struct diff_filespec *filespec)
|
2013-11-15 03:19:04 +08:00
|
|
|
{
|
2016-06-25 07:09:24 +08:00
|
|
|
if (!filespec->oid_valid) {
|
2020-04-08 06:11:41 +08:00
|
|
|
if (diff_populate_filespec(r, filespec, NULL))
|
2013-11-15 03:19:04 +08:00
|
|
|
return 0;
|
2020-01-31 04:32:22 +08:00
|
|
|
hash_object_file(r->hash_algo, filespec->data, filespec->size,
|
|
|
|
"blob", &filespec->oid);
|
2013-11-15 03:19:04 +08:00
|
|
|
}
|
2019-06-20 15:41:49 +08:00
|
|
|
return oidhash(&filespec->oid);
|
2013-11-15 03:19:04 +08:00
|
|
|
}
|
|
|
|
|
2013-11-15 03:20:26 +08:00
|
|
|
static int find_identical_files(struct hashmap *srcs,
|
2013-11-15 03:19:34 +08:00
|
|
|
int dst_index,
|
2011-02-19 11:55:19 +08:00
|
|
|
struct diff_options *options)
|
2007-10-26 02:23:26 +08:00
|
|
|
{
|
|
|
|
int renames = 0;
|
2013-11-15 03:19:34 +08:00
|
|
|
struct diff_filespec *target = rename_dst[dst_index].two;
|
2014-07-03 06:22:11 +08:00
|
|
|
struct file_similarity *p, *best = NULL;
|
2013-11-15 03:19:04 +08:00
|
|
|
int i = 100, best_score = -1;
|
2019-10-07 07:30:35 +08:00
|
|
|
unsigned int hash = hash_filespec(options->repo, target);
|
2013-11-15 03:19:04 +08:00
|
|
|
|
|
|
|
/*
|
2013-11-15 03:19:34 +08:00
|
|
|
* Find the best source match for specified destination.
|
2013-11-15 03:19:04 +08:00
|
|
|
*/
|
2019-10-07 07:30:35 +08:00
|
|
|
p = hashmap_get_entry_from_hash(srcs, hash, NULL,
|
|
|
|
struct file_similarity, entry);
|
2019-10-07 07:30:41 +08:00
|
|
|
hashmap_for_each_entry_from(srcs, p, entry) {
|
2013-11-15 03:19:04 +08:00
|
|
|
int score;
|
|
|
|
struct diff_filespec *source = p->filespec;
|
|
|
|
|
|
|
|
/* False hash collision? */
|
2018-08-29 05:22:48 +08:00
|
|
|
if (!oideq(&source->oid, &target->oid))
|
2013-11-15 03:19:04 +08:00
|
|
|
continue;
|
|
|
|
/* Non-regular files? If so, the modes must match! */
|
|
|
|
if (!S_ISREG(source->mode) || !S_ISREG(target->mode)) {
|
|
|
|
if (source->mode != target->mode)
|
2011-02-19 12:10:32 +08:00
|
|
|
continue;
|
2007-10-26 02:23:26 +08:00
|
|
|
}
|
2013-11-15 03:19:04 +08:00
|
|
|
/* Give higher scores to sources that haven't been used already */
|
|
|
|
score = !source->rename_used;
|
|
|
|
if (source->rename_used && options->detect_rename != DIFF_DETECT_COPY)
|
|
|
|
continue;
|
|
|
|
score += basename_same(source, target);
|
|
|
|
if (score > best_score) {
|
|
|
|
best = p;
|
|
|
|
best_score = score;
|
|
|
|
if (score == 2)
|
|
|
|
break;
|
2007-10-26 02:23:26 +08:00
|
|
|
}
|
2013-11-15 03:19:04 +08:00
|
|
|
|
|
|
|
/* Too many identical alternatives? Pick one */
|
|
|
|
if (!--i)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (best) {
|
2013-11-15 03:19:34 +08:00
|
|
|
record_rename_pair(dst_index, best->index, MAX_SCORE);
|
2013-11-15 03:19:04 +08:00
|
|
|
renames++;
|
|
|
|
}
|
2007-10-26 02:23:26 +08:00
|
|
|
return renames;
|
|
|
|
}
|
|
|
|
|
2018-09-21 23:57:19 +08:00
|
|
|
static void insert_file_table(struct repository *r,
|
|
|
|
struct hashmap *table, int index,
|
|
|
|
struct diff_filespec *filespec)
|
2007-10-26 02:23:26 +08:00
|
|
|
{
|
|
|
|
struct file_similarity *entry = xmalloc(sizeof(*entry));
|
|
|
|
|
|
|
|
entry->index = index;
|
|
|
|
entry->filespec = filespec;
|
|
|
|
|
2019-10-07 07:30:27 +08:00
|
|
|
hashmap_entry_init(&entry->entry, hash_filespec(r, filespec));
|
2019-10-07 07:30:29 +08:00
|
|
|
hashmap_add(table, &entry->entry);
|
2007-10-26 02:23:26 +08:00
|
|
|
}
|
|
|
|
|
2007-10-26 02:17:55 +08:00
|
|
|
/*
|
|
|
|
* Find exact renames first.
|
|
|
|
*
|
|
|
|
* The first round matches up the up-to-date entries,
|
|
|
|
* and then during the second round we try to match
|
|
|
|
* cache-dirty entries as well.
|
|
|
|
*/
|
2011-02-19 11:55:19 +08:00
|
|
|
static int find_exact_renames(struct diff_options *options)
|
2007-10-26 02:17:55 +08:00
|
|
|
{
|
2013-11-15 03:19:34 +08:00
|
|
|
int i, renames = 0;
|
2013-11-15 03:20:26 +08:00
|
|
|
struct hashmap file_table;
|
2007-10-26 02:17:55 +08:00
|
|
|
|
diffcore: fix iteration order of identical files during rename detection
If the two paths 'dir/A/file' and 'dir/B/file' have identical content
and the parent directory is renamed, e.g. 'git mv dir other-dir', then
diffcore reports the following exact renames:
renamed: dir/B/file -> other-dir/A/file
renamed: dir/A/file -> other-dir/B/file
While technically not wrong, this is confusing not only for the user,
but also for git commands that make decisions based on rename
information, e.g. 'git log --follow other-dir/A/file' follows
'dir/B/file' past the rename.
This behavior is a side effect of commit v2.0.0-rc4~8^2~14
(diffcore-rename.c: simplify finding exact renames, 2013-11-14): the
hashmap storing sources returns entries from the same bucket, i.e.
sources matching the current destination, in LIFO order. Thus the
iteration first examines 'other-dir/A/file' and 'dir/B/file' and, upon
finding identical content and basename, reports an exact rename.
Other hashmap users are apparently happy with the current iteration
order over the entries of a bucket. Changing the iteration order
would risk upsetting other hashmap users and would increase the memory
footprint of each bucket by a pointer to the tail element.
Fill the hashmap with source entries in reverse order to restore the
original exact rename detection behavior.
Reported-by: Bill Okara <billokara@gmail.com>
Signed-off-by: SZEDER Gábor <szeder@ira.uka.de>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2016-03-30 16:35:07 +08:00
|
|
|
/* Add all sources to the hash table in reverse order, because
|
|
|
|
* later on they will be retrieved in LIFO order.
|
|
|
|
*/
|
2017-07-01 03:14:05 +08:00
|
|
|
hashmap_init(&file_table, NULL, NULL, rename_src_nr);
|
diffcore: fix iteration order of identical files during rename detection
If the two paths 'dir/A/file' and 'dir/B/file' have identical content
and the parent directory is renamed, e.g. 'git mv dir other-dir', then
diffcore reports the following exact renames:
renamed: dir/B/file -> other-dir/A/file
renamed: dir/A/file -> other-dir/B/file
While technically not wrong, this is confusing not only for the user,
but also for git commands that make decisions based on rename
information, e.g. 'git log --follow other-dir/A/file' follows
'dir/B/file' past the rename.
This behavior is a side effect of commit v2.0.0-rc4~8^2~14
(diffcore-rename.c: simplify finding exact renames, 2013-11-14): the
hashmap storing sources returns entries from the same bucket, i.e.
sources matching the current destination, in LIFO order. Thus the
iteration first examines 'other-dir/A/file' and 'dir/B/file' and, upon
finding identical content and basename, reports an exact rename.
Other hashmap users are apparently happy with the current iteration
order over the entries of a bucket. Changing the iteration order
would risk upsetting other hashmap users and would increase the memory
footprint of each bucket by a pointer to the tail element.
Fill the hashmap with source entries in reverse order to restore the
original exact rename detection behavior.
Reported-by: Bill Okara <billokara@gmail.com>
Signed-off-by: SZEDER Gábor <szeder@ira.uka.de>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2016-03-30 16:35:07 +08:00
|
|
|
for (i = rename_src_nr-1; i >= 0; i--)
|
2018-09-21 23:57:19 +08:00
|
|
|
insert_file_table(options->repo,
|
|
|
|
&file_table, i,
|
|
|
|
rename_src[i].p->one);
|
2007-10-26 02:23:26 +08:00
|
|
|
|
2013-11-15 03:19:34 +08:00
|
|
|
/* Walk the destinations and find best source match */
|
2007-10-26 02:23:26 +08:00
|
|
|
for (i = 0; i < rename_dst_nr; i++)
|
2013-11-15 03:19:34 +08:00
|
|
|
renames += find_identical_files(&file_table, i, options);
|
2007-10-26 02:23:26 +08:00
|
|
|
|
2013-11-15 03:20:26 +08:00
|
|
|
/* Free the hash data structure and entries */
|
2019-10-07 07:30:40 +08:00
|
|
|
hashmap_free_entries(&file_table, struct file_similarity, entry);
|
2007-10-26 02:23:26 +08:00
|
|
|
|
2013-11-15 03:19:34 +08:00
|
|
|
return renames;
|
2007-10-26 02:17:55 +08:00
|
|
|
}
|
|
|
|
|
Optimize rename detection for a huge diff
When there are N deleted paths and M created paths, we used to
allocate (N x M) "struct diff_score" that record how similar
each of the pair is, and picked the <src,dst> pair that gives
the best match first, and then went on to process worse matches.
This sorting is done so that when two new files in the postimage
that are similar to the same file deleted from the preimage, we
can process the more similar one first, and when processing the
second one, it can notice "Ah, the source I was planning to say
I am a copy of is already taken by somebody else" and continue
on to match itself with another file in the preimage with a
lesser match. This matters to a change introduced between
1.5.3.X series and 1.5.4-rc, that lets the code favor unused
matches first and then fall back to using already used
matches.
This instead allocates and keeps only a handful of rename source
candidates per new file in the postimage. I.e. it makes the
memory requirement go from O(N x M) to O(M).
For each dst, we compute similarity with all sources (i.e. the
number of similarity estimate computations is still O(N x M)),
but we keep only a handful of the best src candidates for each dst.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2008-01-30 12:54:56 +08:00
|
|
|
#define NUM_CANDIDATE_PER_DST 4
|
|
|
|
static void record_if_better(struct diff_score m[], struct diff_score *o)
|
|
|
|
{
|
|
|
|
int i, worst;
|
|
|
|
|
|
|
|
/* find the worst one */
|
|
|
|
worst = 0;
|
|
|
|
for (i = 1; i < NUM_CANDIDATE_PER_DST; i++)
|
|
|
|
if (score_compare(&m[i], &m[worst]) > 0)
|
|
|
|
worst = i;
|
|
|
|
|
|
|
|
/* is it better than the worst one? */
|
|
|
|
if (score_compare(&m[worst], o) > 0)
|
|
|
|
m[worst] = *o;
|
|
|
|
}
|
|
|
|
|
2011-01-07 05:50:06 +08:00
|
|
|
/*
|
|
|
|
* Returns:
|
|
|
|
* 0 if we are under the limit;
|
|
|
|
* 1 if we need to disable inexact rename detection;
|
|
|
|
* 2 if we would be under the limit if we were given -C instead of -C -C.
|
|
|
|
*/
|
2011-01-07 05:50:04 +08:00
|
|
|
static int too_many_rename_candidates(int num_create,
|
|
|
|
struct diff_options *options)
|
|
|
|
{
|
|
|
|
int rename_limit = options->rename_limit;
|
|
|
|
int num_src = rename_src_nr;
|
2011-01-07 05:50:06 +08:00
|
|
|
int i;
|
2011-01-07 05:50:04 +08:00
|
|
|
|
|
|
|
options->needed_rename_limit = 0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This basically does a test for the rename matrix not
|
|
|
|
* growing larger than a "rename_limit" square matrix, ie:
|
|
|
|
*
|
|
|
|
* num_create * num_src > rename_limit * rename_limit
|
|
|
|
*/
|
2017-11-30 04:11:54 +08:00
|
|
|
if (rename_limit <= 0)
|
|
|
|
rename_limit = 32767;
|
2011-01-07 05:50:04 +08:00
|
|
|
if ((num_create <= rename_limit || num_src <= rename_limit) &&
|
2017-11-14 04:15:59 +08:00
|
|
|
((uint64_t)num_create * (uint64_t)num_src
|
|
|
|
<= (uint64_t)rename_limit * (uint64_t)rename_limit))
|
2011-01-07 05:50:04 +08:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
options->needed_rename_limit =
|
|
|
|
num_src > num_create ? num_src : num_create;
|
2011-01-07 05:50:06 +08:00
|
|
|
|
|
|
|
/* Are we running under -C -C? */
|
2017-11-01 02:19:11 +08:00
|
|
|
if (!options->flags.find_copies_harder)
|
2011-01-07 05:50:06 +08:00
|
|
|
return 1;
|
|
|
|
|
|
|
|
/* Would we bust the limit if we were running under -C? */
|
|
|
|
for (num_src = i = 0; i < rename_src_nr; i++) {
|
|
|
|
if (diff_unmodified_pair(rename_src[i].p))
|
|
|
|
continue;
|
|
|
|
num_src++;
|
|
|
|
}
|
|
|
|
if ((num_create <= rename_limit || num_src <= rename_limit) &&
|
2017-11-14 04:15:59 +08:00
|
|
|
((uint64_t)num_create * (uint64_t)num_src
|
|
|
|
<= (uint64_t)rename_limit * (uint64_t)rename_limit))
|
2011-01-07 05:50:06 +08:00
|
|
|
return 2;
|
2011-01-07 05:50:04 +08:00
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2011-02-19 12:10:32 +08:00
|
|
|
static int find_renames(struct diff_score *mx, int dst_cnt, int minimum_score, int copies)
|
|
|
|
{
|
|
|
|
int count = 0, i;
|
|
|
|
|
|
|
|
for (i = 0; i < dst_cnt * NUM_CANDIDATE_PER_DST; i++) {
|
|
|
|
struct diff_rename_dst *dst;
|
|
|
|
|
|
|
|
if ((mx[i].dst < 0) ||
|
|
|
|
(mx[i].score < minimum_score))
|
|
|
|
break; /* there is no more usable pair. */
|
|
|
|
dst = &rename_dst[mx[i].dst];
|
|
|
|
if (dst->pair)
|
|
|
|
continue; /* already done, either exact or fuzzy. */
|
2011-01-07 05:50:05 +08:00
|
|
|
if (!copies && rename_src[mx[i].src].p->one->rename_used)
|
2011-02-19 12:10:32 +08:00
|
|
|
continue;
|
|
|
|
record_rename_pair(mx[i].dst, mx[i].src, mx[i].score);
|
|
|
|
count++;
|
|
|
|
}
|
|
|
|
return count;
|
|
|
|
}
|
|
|
|
|
2005-09-21 15:18:27 +08:00
|
|
|
void diffcore_rename(struct diff_options *options)
|
2005-05-21 17:39:09 +08:00
|
|
|
{
|
2005-09-21 15:18:27 +08:00
|
|
|
int detect_rename = options->detect_rename;
|
|
|
|
int minimum_score = options->rename_score;
|
2005-05-22 10:40:36 +08:00
|
|
|
struct diff_queue_struct *q = &diff_queued_diff;
|
2005-09-16 07:13:43 +08:00
|
|
|
struct diff_queue_struct outq;
|
2005-05-21 17:39:09 +08:00
|
|
|
struct diff_score *mx;
|
2011-01-07 05:50:06 +08:00
|
|
|
int i, j, rename_count, skip_unmodified = 0;
|
2011-04-29 17:42:41 +08:00
|
|
|
int num_create, dst_cnt;
|
2011-02-20 17:51:16 +08:00
|
|
|
struct progress *progress = NULL;
|
2005-05-21 17:39:09 +08:00
|
|
|
|
2005-05-22 14:33:32 +08:00
|
|
|
if (!minimum_score)
|
[PATCH] Add -B flag to diff-* brothers.
A new diffcore transformation, diffcore-break.c, is introduced.
When the -B flag is given, a patch that represents a complete
rewrite is broken into a deletion followed by a creation. This
makes it easier to review such a complete rewrite patch.
The -B flag takes the same syntax as the -M and -C flags to
specify the minimum amount of non-source material the resulting
file needs to have to be considered a complete rewrite, and
defaults to 99% if not specified.
As the new test t4008-diff-break-rewrite.sh demonstrates, if a
file is a complete rewrite, it is broken into a delete/create
pair, which can further be subjected to the usual rename
detection if -M or -C is used. For example, if file0 gets
completely rewritten to make it as if it were rather based on
file1 which itself disappeared, the following happens:
The original change looks like this:
file0 --> file0' (quite different from file0)
file1 --> /dev/null
After diffcore-break runs, it would become this:
file0 --> /dev/null
/dev/null --> file0'
file1 --> /dev/null
Then diffcore-rename matches them up:
file1 --> file0'
The internal score values are finer grained now. Earlier
maximum of 10000 has been raised to 60000; there is no user
visible changes but there is no reason to waste available bits.
Signed-off-by: Junio C Hamano <junkio@cox.net>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-05-30 15:08:37 +08:00
|
|
|
minimum_score = DEFAULT_RENAME_SCORE;
|
2005-05-21 17:39:09 +08:00
|
|
|
|
|
|
|
for (i = 0; i < q->nr; i++) {
|
2005-05-21 17:40:01 +08:00
|
|
|
struct diff_filepair *p = q->queue[i];
|
2006-11-02 16:02:11 +08:00
|
|
|
if (!DIFF_FILE_VALID(p->one)) {
|
2005-05-22 10:42:18 +08:00
|
|
|
if (!DIFF_FILE_VALID(p->two))
|
2005-05-23 12:24:49 +08:00
|
|
|
continue; /* unmerged */
|
2006-11-02 16:02:11 +08:00
|
|
|
else if (options->single_follow &&
|
|
|
|
strcmp(options->single_follow, p->two->path))
|
|
|
|
continue; /* not interested */
|
2017-11-01 02:19:11 +08:00
|
|
|
else if (!options->flags.rename_empty &&
|
2017-05-31 01:31:08 +08:00
|
|
|
is_empty_blob_oid(&p->two->oid))
|
teach diffcore-rename to optionally ignore empty content
Our rename detection is a heuristic, matching pairs of
removed and added files with similar or identical content.
It's unlikely to be wrong when there is actual content to
compare, and we already take care not to do inexact rename
detection when there is not enough content to produce good
results.
However, we always do exact rename detection, even when the
blob is tiny or empty. It's easy to get false positives with
an empty blob, simply because it is an obvious content to
use as a boilerplate (e.g., when telling git that an empty
directory is worth tracking via an empty .gitignore).
This patch lets callers specify whether or not they are
interested in using empty files as rename sources and
destinations. The default is "yes", keeping the original
behavior. It works by detecting the empty-blob sha1 for
rename sources and destinations.
One more flexible alternative would be to allow the caller
to specify a minimum size for a blob to be "interesting" for
rename detection. But that would catch small boilerplate
files, not large ones (e.g., if you had the GPL COPYING file
in many directories).
A better alternative would be to allow a "-rename"
gitattribute to allow boilerplate files to be marked as
such. I'll leave the complexity of that solution until such
time as somebody actually wants it. The complaints we've
seen so far revolve around empty files, so let's start with
the simple thing.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-03-23 06:52:13 +08:00
|
|
|
continue;
|
2015-02-27 09:42:27 +08:00
|
|
|
else if (add_rename_dst(p->two) < 0) {
|
|
|
|
warning("skipping rename detection, detected"
|
|
|
|
" duplicate destination '%s'",
|
|
|
|
p->two->path);
|
|
|
|
goto cleanup;
|
|
|
|
}
|
2006-11-02 16:02:11 +08:00
|
|
|
}
|
2017-11-01 02:19:11 +08:00
|
|
|
else if (!options->flags.rename_empty &&
|
2017-05-31 01:31:08 +08:00
|
|
|
is_empty_blob_oid(&p->one->oid))
|
teach diffcore-rename to optionally ignore empty content
Our rename detection is a heuristic, matching pairs of
removed and added files with similar or identical content.
It's unlikely to be wrong when there is actual content to
compare, and we already take care not to do inexact rename
detection when there is not enough content to produce good
results.
However, we always do exact rename detection, even when the
blob is tiny or empty. It's easy to get false positives with
an empty blob, simply because it is an obvious content to
use as a boilerplate (e.g., when telling git that an empty
directory is worth tracking via an empty .gitignore).
This patch lets callers specify whether or not they are
interested in using empty files as rename sources and
destinations. The default is "yes", keeping the original
behavior. It works by detecting the empty-blob sha1 for
rename sources and destinations.
One more flexible alternative would be to allow the caller
to specify a minimum size for a blob to be "interesting" for
rename detection. But that would catch small boilerplate
files, not large ones (e.g., if you had the GPL COPYING file
in many directories).
A better alternative would be to allow a "-rename"
gitattribute to allow boilerplate files to be marked as
such. I'll leave the complexity of that solution until such
time as somebody actually wants it. The complaints we've
seen so far revolve around empty files, so let's start with
the simple thing.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-03-23 06:52:13 +08:00
|
|
|
continue;
|
diffcore-rename: don't consider unmerged path as source
Since e9c8409 (diff-index --cached --raw: show tree entry on the LHS for
unmerged entries., 2007-01-05), an unmerged entry should be detected by
using DIFF_PAIR_UNMERGED(p), not by noticing both one and two sides of
the filepair records mode=0 entries. However, it forgot to update some
parts of the rename detection logic.
This only makes difference in the "diff --cached" codepath where an
unmerged filepair carries information on the entries that came from the
tree. It probably hasn't been noticed for a long time because nobody
would run "diff -M" during a conflict resolution, but "git status" uses
rename detection when it internally runs "diff-index" and "diff-files"
and gives nonsense results.
In an unmerged pair, "one" side can have a valid filespec to record the
tree entry (e.g. what's in HEAD) when running "diff --cached". This can
be used as a rename source to other paths in the index that are not
unmerged. The path that is unmerged by definition does not have the
final content yet (i.e. "two" side cannot have a valid filespec), so it
can never be a rename destination.
Use the DIFF_PAIR_UNMERGED() to detect unmerged filepair correctly, and
allow the valid "one" side of an unmerged filepair to be considered a
potential rename source, but never to be considered a rename destination.
Commit message and first two test cases by Junio, the rest by Martin.
Signed-off-by: Martin von Zweigbergk <martin.von.zweigbergk@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2011-03-24 10:41:01 +08:00
|
|
|
else if (!DIFF_PAIR_UNMERGED(p) && !DIFF_FILE_VALID(p->two)) {
|
2007-10-26 02:20:56 +08:00
|
|
|
/*
|
|
|
|
* If the source is a broken "delete", and
|
2005-06-12 11:55:20 +08:00
|
|
|
* they did not really want to get broken,
|
|
|
|
* that means the source actually stays.
|
2007-10-26 02:20:56 +08:00
|
|
|
* So we increment the "rename_used" score
|
|
|
|
* by one, to indicate ourselves as a user
|
|
|
|
*/
|
|
|
|
if (p->broken_pair && !p->score)
|
|
|
|
p->one->rename_used++;
|
2011-01-07 05:50:05 +08:00
|
|
|
register_rename_src(p);
|
2007-10-26 02:20:56 +08:00
|
|
|
}
|
|
|
|
else if (detect_rename == DIFF_DETECT_COPY) {
|
|
|
|
/*
|
|
|
|
* Increment the "rename_used" score by
|
|
|
|
* one, to indicate ourselves as a user.
|
2005-06-12 11:55:20 +08:00
|
|
|
*/
|
2007-10-26 02:20:56 +08:00
|
|
|
p->one->rename_used++;
|
2011-01-07 05:50:05 +08:00
|
|
|
register_rename_src(p);
|
2005-06-12 11:55:20 +08:00
|
|
|
}
|
2005-05-21 17:39:09 +08:00
|
|
|
}
|
Fix the rename detection limit checking
This adds more proper rename detection limits. Instead of just checking
the limit against the number of potential rename destinations, we verify
that the rename matrix (which is what really matters) doesn't grow
ridiculously large, and we also make sure that we don't overflow when
doing the matrix size calculation.
This also changes the default limits from unlimited, to a rename matrix
that is limited to 100 entries on a side. You can raise it with the config
entry, or by using the "-l<n>" command line flag, but at least the default
is now a sane number that avoids spending lots of time (and memory) in
situations that likely don't merit it.
The choice of default value is of course very debatable. Limiting the
rename matrix to a 100x100 size will mean that even if you have just one
obvious rename, but you also create (or delete) 10,000 files, the rename
matrix will be so big that we disable the heuristics. Sounds reasonable to
me, but let's see if people hit this (and, perhaps more importantly,
actually *care*) in real life.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2007-09-15 01:39:48 +08:00
|
|
|
if (rename_dst_nr == 0 || rename_src_nr == 0)
|
2005-05-21 17:39:09 +08:00
|
|
|
goto cleanup; /* nothing to do */
|
|
|
|
|
Do exact rename detection regardless of rename limits
Now that the exact rename detection is linear-time (with a very small
constant factor to boot), there is no longer any reason to limit it by
the number of files involved.
In some trivial testing, I created a repository with a directory that
had a hundred thousand files in it (all with different contents), and
then moved that directory to show the effects of renaming 100,000 files.
With the new code, that resulted in
[torvalds@woody big-rename]$ time ~/git/git show -C | wc -l
400006
real 0m2.071s
user 0m1.520s
sys 0m0.576s
ie the code can correctly detect the hundred thousand renames in about 2
seconds (the number "400006" comes from four lines for each rename:
diff --git a/really-big-dir/file-1-1-1-1-1 b/moved-big-dir/file-1-1-1-1-1
similarity index 100%
rename from really-big-dir/file-1-1-1-1-1
rename to moved-big-dir/file-1-1-1-1-1
and the extra six lines is from a one-liner commit message and all the
commit information and spacing).
Most of those two seconds weren't even really the rename detection, it's
really all the other stuff needed to get there.
With the old code, this wouldn't have been practically possible. Doing
a pairwise check of the ten billion possible pairs would have been
prohibitively expensive. In fact, even with the rename limiter in
place, the old code would waste a lot of time just on the diff_filespec
checks, and despite not even trying to find renames, it used to look
like:
[torvalds@woody big-rename]$ time git show -C | wc -l
1400006
real 0m12.337s
user 0m12.285s
sys 0m0.192s
ie we used to take 12 seconds for this load and not even do any rename
detection! (The number 1400006 comes from fourteen lines per file moved:
seven lines each for the delete and the create of a one-liner file, and
the same extra six lines of commit information).
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2007-10-26 02:24:47 +08:00
|
|
|
/*
|
|
|
|
* We really want to cull the candidates list early
|
|
|
|
* with cheap tests in order to avoid doing deltas.
|
|
|
|
*/
|
2011-02-19 11:55:19 +08:00
|
|
|
rename_count = find_exact_renames(options);
|
Do exact rename detection regardless of rename limits
Now that the exact rename detection is linear-time (with a very small
constant factor to boot), there is no longer any reason to limit it by
the number of files involved.
In some trivial testing, I created a repository with a directory that
had a hundred thousand files in it (all with different contents), and
then moved that directory to show the effects of renaming 100,000 files.
With the new code, that resulted in
[torvalds@woody big-rename]$ time ~/git/git show -C | wc -l
400006
real 0m2.071s
user 0m1.520s
sys 0m0.576s
ie the code can correctly detect the hundred thousand renames in about 2
seconds (the number "400006" comes from four lines for each rename:
diff --git a/really-big-dir/file-1-1-1-1-1 b/moved-big-dir/file-1-1-1-1-1
similarity index 100%
rename from really-big-dir/file-1-1-1-1-1
rename to moved-big-dir/file-1-1-1-1-1
and the extra six lines is from a one-liner commit message and all the
commit information and spacing).
Most of those two seconds weren't even really the rename detection, it's
really all the other stuff needed to get there.
With the old code, this wouldn't have been practically possible. Doing
a pairwise check of the ten billion possible pairs would have been
prohibitively expensive. In fact, even with the rename limiter in
place, the old code would waste a lot of time just on the diff_filespec
checks, and despite not even trying to find renames, it used to look
like:
[torvalds@woody big-rename]$ time git show -C | wc -l
1400006
real 0m12.337s
user 0m12.285s
sys 0m0.192s
ie we used to take 12 seconds for this load and not even do any rename
detection! (The number 1400006 comes from fourteen lines per file moved:
seven lines each for the delete and the create of a one-liner file, and
the same extra six lines of commit information).
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2007-10-26 02:24:47 +08:00
|
|
|
|
2007-10-27 07:56:34 +08:00
|
|
|
/* Did we only want exact renames? */
|
|
|
|
if (minimum_score == MAX_SCORE)
|
|
|
|
goto cleanup;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Calculate how many renames are left (but all the source
|
|
|
|
* files still remain as options for rename/copies!)
|
|
|
|
*/
|
|
|
|
num_create = (rename_dst_nr - rename_count);
|
|
|
|
|
|
|
|
/* All done? */
|
|
|
|
if (!num_create)
|
|
|
|
goto cleanup;
|
|
|
|
|
2011-01-07 05:50:06 +08:00
|
|
|
switch (too_many_rename_candidates(num_create, options)) {
|
|
|
|
case 1:
|
Fix the rename detection limit checking
This adds more proper rename detection limits. Instead of just checking
the limit against the number of potential rename destinations, we verify
that the rename matrix (which is what really matters) doesn't grow
ridiculously large, and we also make sure that we don't overflow when
doing the matrix size calculation.
This also changes the default limits from unlimited, to a rename matrix
that is limited to 100 entries on a side. You can raise it with the config
entry, or by using the "-l<n>" command line flag, but at least the default
is now a sane number that avoids spending lots of time (and memory) in
situations that likely don't merit it.
The choice of default value is of course very debatable. Limiting the
rename matrix to a 100x100 size will mean that even if you have just one
obvious rename, but you also create (or delete) 10,000 files, the rename
matrix will be so big that we disable the heuristics. Sounds reasonable to
me, but let's see if people hit this (and, perhaps more importantly,
actually *care*) in real life.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2007-09-15 01:39:48 +08:00
|
|
|
goto cleanup;
|
2011-01-07 05:50:06 +08:00
|
|
|
case 2:
|
|
|
|
options->degraded_cc_to_c = 1;
|
|
|
|
skip_unmodified = 1;
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
Fix the rename detection limit checking
This adds more proper rename detection limits. Instead of just checking
the limit against the number of potential rename destinations, we verify
that the rename matrix (which is what really matters) doesn't grow
ridiculously large, and we also make sure that we don't overflow when
doing the matrix size calculation.
This also changes the default limits from unlimited, to a rename matrix
that is limited to 100 entries on a side. You can raise it with the config
entry, or by using the "-l<n>" command line flag, but at least the default
is now a sane number that avoids spending lots of time (and memory) in
situations that likely don't merit it.
The choice of default value is of course very debatable. Limiting the
rename matrix to a 100x100 size will mean that even if you have just one
obvious rename, but you also create (or delete) 10,000 files, the rename
matrix will be so big that we disable the heuristics. Sounds reasonable to
me, but let's see if people hit this (and, perhaps more importantly,
actually *care*) in real life.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2007-09-15 01:39:48 +08:00
|
|
|
|
2011-02-20 17:51:16 +08:00
|
|
|
if (options->show_rename_progress) {
|
progress: simplify "delayed" progress API
We used to expose the full power of the delayed progress API to the
callers, so that they can specify, not just the message to show and
expected total amount of work that is used to compute the percentage
of work performed so far, the percent-threshold parameter P and the
delay-seconds parameter N. The progress meter starts to show at N
seconds into the operation only if we have not yet completed P per-cent
of the total work.
Most callers used either (0%, 2s) or (50%, 1s) as (P, N), but there
are oddballs that chose more random-looking values like 95%.
For a smoother workload, (50%, 1s) would allow us to start showing
the progress meter earlier than (0%, 2s), while keeping the chance
of not showing progress meter for long running operation the same as
the latter. For a task that would take 2s or more to complete, it
is likely that less than half of it would complete within the first
second, if the workload is smooth. But for a spiky workload whose
earlier part is easier, such a setting is likely to fail to show the
progress meter entirely and (0%, 2s) is more appropriate.
But that is merely a theory. Realistically, it is of dubious value
to ask each codepath to carefully consider smoothness of their
workload and specify their own setting by passing two extra
parameters. Let's simplify the API by dropping both parameters and
have everybody use (0%, 2s).
Oh, by the way, the percent-threshold parameter and the structure
member were consistently misspelled, which also is now fixed ;-)
Helped-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-08-20 01:39:41 +08:00
|
|
|
progress = start_delayed_progress(
|
2014-02-21 20:50:18 +08:00
|
|
|
_("Performing inexact rename detection"),
|
2017-11-14 04:15:58 +08:00
|
|
|
(uint64_t)rename_dst_nr * (uint64_t)rename_src_nr);
|
2011-02-20 17:51:16 +08:00
|
|
|
}
|
|
|
|
|
2016-07-31 02:18:31 +08:00
|
|
|
mx = xcalloc(st_mult(NUM_CANDIDATE_PER_DST, num_create), sizeof(*mx));
|
2005-05-24 16:10:48 +08:00
|
|
|
for (dst_cnt = i = 0; i < rename_dst_nr; i++) {
|
|
|
|
struct diff_filespec *two = rename_dst[i].two;
|
Optimize rename detection for a huge diff
When there are N deleted paths and M created paths, we used to
allocate (N x M) "struct diff_score" that record how similar
each of the pair is, and picked the <src,dst> pair that gives
the best match first, and then went on to process worse matches.
This sorting is done so that when two new files in the postimage
that are similar to the same file deleted from the preimage, we
can process the more similar one first, and when processing the
second one, it can notice "Ah, the source I was planning to say
I am a copy of is already taken by somebody else" and continue
on to match itself with another file in the preimage with a
lesser match. This matters to a change introduced between
1.5.3.X series and 1.5.4-rc, that lets the code favor unused
matches first and then fall back to using already used
matches.
This instead allocates and keeps only a handful of rename source
candidates per new file in the postimage. I.e. it makes the
memory requirement go from O(N x M) to O(M).
For each dst, we compute similarity with all sources (i.e. the
number of similarity estimate computations is still O(N x M)),
but we keep only a handful of the best src candidates for each dst.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2008-01-30 12:54:56 +08:00
|
|
|
struct diff_score *m;
|
|
|
|
|
2005-05-24 16:10:48 +08:00
|
|
|
if (rename_dst[i].pair)
|
2005-05-21 17:39:09 +08:00
|
|
|
continue; /* dealt with exact match already. */
|
Optimize rename detection for a huge diff
When there are N deleted paths and M created paths, we used to
allocate (N x M) "struct diff_score" that record how similar
each of the pair is, and picked the <src,dst> pair that gives
the best match first, and then went on to process worse matches.
This sorting is done so that when two new files in the postimage
that are similar to the same file deleted from the preimage, we
can process the more similar one first, and when processing the
second one, it can notice "Ah, the source I was planning to say
I am a copy of is already taken by somebody else" and continue
on to match itself with another file in the preimage with a
lesser match. This matters to a change introduced between
1.5.3.X series and 1.5.4-rc, that lets the code favor unused
matches first and then fall back to using already used
matches.
This instead allocates and keeps only a handful of rename source
candidates per new file in the postimage. I.e. it makes the
memory requirement go from O(N x M) to O(M).
For each dst, we compute similarity with all sources (i.e. the
number of similarity estimate computations is still O(N x M)),
but we keep only a handful of the best src candidates for each dst.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2008-01-30 12:54:56 +08:00
|
|
|
|
|
|
|
m = &mx[dst_cnt * NUM_CANDIDATE_PER_DST];
|
|
|
|
for (j = 0; j < NUM_CANDIDATE_PER_DST; j++)
|
|
|
|
m[j].dst = -1;
|
|
|
|
|
2005-05-24 16:10:48 +08:00
|
|
|
for (j = 0; j < rename_src_nr; j++) {
|
2011-01-07 05:50:05 +08:00
|
|
|
struct diff_filespec *one = rename_src[j].p->one;
|
Optimize rename detection for a huge diff
When there are N deleted paths and M created paths, we used to
allocate (N x M) "struct diff_score" that record how similar
each of the pair is, and picked the <src,dst> pair that gives
the best match first, and then went on to process worse matches.
This sorting is done so that when two new files in the postimage
that are similar to the same file deleted from the preimage, we
can process the more similar one first, and when processing the
second one, it can notice "Ah, the source I was planning to say
I am a copy of is already taken by somebody else" and continue
on to match itself with another file in the preimage with a
lesser match. This matters to a change introduced between the
1.5.3.X series and 1.5.4-rc, that lets the code favor unused
matches first and then fall back to using already used
matches.
This instead allocates and keeps only a handful of rename source
candidates per new file in the postimage, i.e. it reduces the
memory requirement from O(N x M) to O(M).
For each dst, we compute similarity with all sources (i.e. the
number of similarity estimate computations is still O(N x M)),
but we keep a handful of the best src candidates for each dst.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2008-01-30 12:54:56 +08:00
|
|
|
struct diff_score this_src;
|
2011-01-07 05:50:06 +08:00
|
|
|
|
|
|
|
if (skip_unmodified &&
|
|
|
|
diff_unmodified_pair(rename_src[j].p))
|
|
|
|
continue;
|
|
|
|
|
2018-09-21 23:57:19 +08:00
|
|
|
this_src.score = estimate_similarity(options->repo,
|
|
|
|
one, two,
|
diff: restrict when prefetching occurs
Commit 7fbbcb21b1 ("diff: batch fetching of missing blobs", 2019-04-08)
optimized "diff" by prefetching blobs in a partial clone, but there are
some cases wherein blobs do not need to be prefetched. In these cases,
any command that uses the diff machinery will unnecessarily fetch blobs.
diffcore_std() may read blobs when it calls the following functions:
(1) diffcore_skip_stat_unmatch() (controlled by the config variable
diff.autorefreshindex)
(2) diffcore_break() and diffcore_merge_broken() (for break-rewrite
detection)
(3) diffcore_rename() (for rename detection)
(4) diffcore_pickaxe() (for detecting addition/deletion of specified
string)
Instead of always prefetching blobs, teach diffcore_skip_stat_unmatch(),
diffcore_break(), and diffcore_rename() to prefetch blobs upon the first
read of a missing object. This covers (1), (2), and (3): to cover the
rest, teach diffcore_std() to prefetch if the output type is one that
includes blob data (and hence blob data will be required later anyway),
or if it knows that (4) will be run.
Helped-by: Jeff King <peff@peff.net>
Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-04-08 06:11:43 +08:00
|
|
|
minimum_score,
|
|
|
|
skip_unmodified);
|
Optimize rename detection for a huge diff
When there are N deleted paths and M created paths, we used to
allocate (N x M) "struct diff_score" that record how similar
each of the pair is, and picked the <src,dst> pair that gives
the best match first, and then went on to process worse matches.
This sorting is done so that when two new files in the postimage
that are similar to the same file deleted from the preimage, we
can process the more similar one first, and when processing the
second one, it can notice "Ah, the source I was planning to say
I am a copy of is already taken by somebody else" and continue
on to match itself with another file in the preimage with a
lesser match. This matters to a change introduced between the
1.5.3.X series and 1.5.4-rc, that lets the code favor unused
matches first and then fall back to using already used
matches.
This instead allocates and keeps only a handful of rename source
candidates per new file in the postimage, i.e. it reduces the
memory requirement from O(N x M) to O(M).
For each dst, we compute similarity with all sources (i.e. the
number of similarity estimate computations is still O(N x M)),
but we keep a handful of the best src candidates for each dst.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2008-01-30 12:54:56 +08:00
|
|
|
this_src.name_score = basename_same(one, two);
|
|
|
|
this_src.dst = i;
|
|
|
|
this_src.src = j;
|
|
|
|
record_if_better(m, &this_src);
|
2009-11-21 14:13:47 +08:00
|
|
|
/*
|
|
|
|
* Once we run estimate_similarity,
|
|
|
|
* We do not need the text anymore.
|
|
|
|
*/
|
2007-10-03 12:01:03 +08:00
|
|
|
diff_free_filespec_blob(one);
|
2009-11-21 14:13:47 +08:00
|
|
|
diff_free_filespec_blob(two);
|
2005-05-21 17:39:09 +08:00
|
|
|
}
|
|
|
|
dst_cnt++;
|
2017-11-14 04:15:58 +08:00
|
|
|
display_progress(progress, (uint64_t)(i+1)*(uint64_t)rename_src_nr);
|
2005-05-21 17:39:09 +08:00
|
|
|
}
|
2011-02-20 17:51:16 +08:00
|
|
|
stop_progress(&progress);
|
Optimize rename detection for a huge diff
When there are N deleted paths and M created paths, we used to
allocate (N x M) "struct diff_score" that record how similar
each of the pair is, and picked the <src,dst> pair that gives
the best match first, and then went on to process worse matches.
This sorting is done so that when two new files in the postimage
that are similar to the same file deleted from the preimage, we
can process the more similar one first, and when processing the
second one, it can notice "Ah, the source I was planning to say
I am a copy of is already taken by somebody else" and continue
on to match itself with another file in the preimage with a
lesser match. This matters to a change introduced between the
1.5.3.X series and 1.5.4-rc, that lets the code favor unused
matches first and then fall back to using already used
matches.
This instead allocates and keeps only a handful of rename source
candidates per new file in the postimage, i.e. it reduces the
memory requirement from O(N x M) to O(M).
For each dst, we compute similarity with all sources (i.e. the
number of similarity estimate computations is still O(N x M)),
but we keep a handful of the best src candidates for each dst.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2008-01-30 12:54:56 +08:00
|
|
|
|
2005-05-21 17:39:09 +08:00
|
|
|
/* cost matrix sorted by most to least similar pair */
|
2019-10-01 01:21:55 +08:00
|
|
|
STABLE_QSORT(mx, dst_cnt * NUM_CANDIDATE_PER_DST, score_compare);
|
Optimize rename detection for a huge diff
When there are N deleted paths and M created paths, we used to
allocate (N x M) "struct diff_score" that record how similar
each of the pair is, and picked the <src,dst> pair that gives
the best match first, and then went on to process worse matches.
This sorting is done so that when two new files in the postimage
that are similar to the same file deleted from the preimage, we
can process the more similar one first, and when processing the
second one, it can notice "Ah, the source I was planning to say
I am a copy of is already taken by somebody else" and continue
on to match itself with another file in the preimage with a
lesser match. This matters to a change introduced between the
1.5.3.X series and 1.5.4-rc, that lets the code favor unused
matches first and then fall back to using already used
matches.
This instead allocates and keeps only a handful of rename source
candidates per new file in the postimage, i.e. it reduces the
memory requirement from O(N x M) to O(M).
For each dst, we compute similarity with all sources (i.e. the
number of similarity estimate computations is still O(N x M)),
but we keep a handful of the best src candidates for each dst.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2008-01-30 12:54:56 +08:00
|
|
|
|
2011-02-19 12:10:32 +08:00
|
|
|
rename_count += find_renames(mx, dst_cnt, minimum_score, 0);
|
|
|
|
if (detect_rename == DIFF_DETECT_COPY)
|
|
|
|
rename_count += find_renames(mx, dst_cnt, minimum_score, 1);
|
2005-05-21 17:39:09 +08:00
|
|
|
free(mx);
|
|
|
|
|
2005-05-28 06:55:55 +08:00
|
|
|
cleanup:
|
2005-05-21 17:39:09 +08:00
|
|
|
/* At this point, we have found some renames and copies and they
|
2005-09-16 07:13:43 +08:00
|
|
|
* are recorded in rename_dst. The original list is still in *q.
|
2005-05-21 17:39:09 +08:00
|
|
|
*/
|
2010-05-07 12:52:27 +08:00
|
|
|
DIFF_QUEUE_CLEAR(&outq);
|
2005-05-21 17:39:09 +08:00
|
|
|
for (i = 0; i < q->nr; i++) {
|
2005-05-24 16:10:48 +08:00
|
|
|
struct diff_filepair *p = q->queue[i];
|
|
|
|
struct diff_filepair *pair_to_free = NULL;
|
|
|
|
|
diffcore-rename: don't consider unmerged path as source
Since e9c8409 (diff-index --cached --raw: show tree entry on the LHS for
unmerged entries., 2007-01-05), an unmerged entry should be detected by
using DIFF_PAIR_UNMERGED(p), not by noticing both one and two sides of
the filepair records mode=0 entries. However, it forgot to update some
parts of the rename detection logic.
This only makes a difference in the "diff --cached" codepath where an
unmerged filepair carries information on the entries that came from the
tree. It probably hasn't been noticed for a long time because nobody
would run "diff -M" during a conflict resolution, but "git status" uses
rename detection when it internally runs "diff-index" and "diff-files"
and gives nonsense results.
In an unmerged pair, "one" side can have a valid filespec to record the
tree entry (e.g. what's in HEAD) when running "diff --cached". This can
be used as a rename source to other paths in the index that are not
unmerged. The path that is unmerged by definition does not have the
final content yet (i.e. "two" side cannot have a valid filespec), so it
can never be a rename destination.
Use the DIFF_PAIR_UNMERGED() to detect unmerged filepair correctly, and
allow the valid "one" side of an unmerged filepair to be considered a
potential rename source, but never to be considered a rename destination.
Commit message and first two test cases by Junio, the rest by Martin.
Signed-off-by: Martin von Zweigbergk <martin.von.zweigbergk@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2011-03-24 10:41:01 +08:00
|
|
|
if (DIFF_PAIR_UNMERGED(p)) {
|
|
|
|
diff_q(&outq, p);
|
|
|
|
}
|
|
|
|
else if (!DIFF_FILE_VALID(p->one) && DIFF_FILE_VALID(p->two)) {
|
2005-05-30 15:08:07 +08:00
|
|
|
/*
|
|
|
|
* Creation
|
|
|
|
*
|
|
|
|
* We would output this create record if it has
|
|
|
|
* not been turned into a rename/copy already.
|
|
|
|
*/
|
2015-02-27 09:39:48 +08:00
|
|
|
struct diff_rename_dst *dst = locate_rename_dst(p->two);
|
2005-05-30 15:08:07 +08:00
|
|
|
if (dst && dst->pair) {
|
2005-05-24 16:10:48 +08:00
|
|
|
diff_q(&outq, dst->pair);
|
|
|
|
pair_to_free = p;
|
|
|
|
}
|
|
|
|
else
|
2005-05-30 15:08:07 +08:00
|
|
|
/* no matching rename/copy source, so
|
|
|
|
* record this as a creation.
|
2005-05-24 16:10:48 +08:00
|
|
|
*/
|
|
|
|
diff_q(&outq, p);
|
2005-05-21 17:39:09 +08:00
|
|
|
}
|
2005-05-30 15:08:07 +08:00
|
|
|
else if (DIFF_FILE_VALID(p->one) && !DIFF_FILE_VALID(p->two)) {
|
|
|
|
/*
|
|
|
|
* Deletion
|
|
|
|
*
|
[PATCH] Add -B flag to diff-* brothers.
A new diffcore transformation, diffcore-break.c, is introduced.
When the -B flag is given, a patch that represents a complete
rewrite is broken into a deletion followed by a creation. This
makes it easier to review such a complete rewrite patch.
The -B flag takes the same syntax as the -M and -C flags to
specify the minimum amount of non-source material the resulting
file needs to have to be considered a complete rewrite, and
defaults to 99% if not specified.
As the new test t4008-diff-break-rewrite.sh demonstrates, if a
file is a complete rewrite, it is broken into a delete/create
pair, which can further be subjected to the usual rename
detection if -M or -C is used. For example, if file0 gets
completely rewritten to make it as if it were rather based on
file1 which itself disappeared, the following happens:
The original change looks like this:
file0 --> file0' (quite different from file0)
file1 --> /dev/null
After diffcore-break runs, it would become this:
file0 --> /dev/null
/dev/null --> file0'
file1 --> /dev/null
Then diffcore-rename matches them up:
file1 --> file0'
The internal score values are finer grained now. The earlier
maximum of 10000 has been raised to 60000; there are no user-visible
changes, but there is no reason to waste available bits.
Signed-off-by: Junio C Hamano <junkio@cox.net>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-05-30 15:08:37 +08:00
|
|
|
* We would output this delete record if:
|
|
|
|
*
|
|
|
|
* (1) this is a broken delete and the counterpart
|
|
|
|
* broken create remains in the output; or
|
2005-09-16 07:13:43 +08:00
|
|
|
* (2) this is not a broken delete, and rename_dst
|
|
|
|
* does not have a rename/copy to move p->one->path
|
|
|
|
* out of existence.
|
[PATCH] Add -B flag to diff-* brothers.
A new diffcore transformation, diffcore-break.c, is introduced.
When the -B flag is given, a patch that represents a complete
rewrite is broken into a deletion followed by a creation. This
makes it easier to review such a complete rewrite patch.
The -B flag takes the same syntax as the -M and -C flags to
specify the minimum amount of non-source material the resulting
file needs to have to be considered a complete rewrite, and
defaults to 99% if not specified.
As the new test t4008-diff-break-rewrite.sh demonstrates, if a
file is a complete rewrite, it is broken into a delete/create
pair, which can further be subjected to the usual rename
detection if -M or -C is used. For example, if file0 gets
completely rewritten to make it as if it were rather based on
file1 which itself disappeared, the following happens:
The original change looks like this:
file0 --> file0' (quite different from file0)
file1 --> /dev/null
After diffcore-break runs, it would become this:
file0 --> /dev/null
/dev/null --> file0'
file1 --> /dev/null
Then diffcore-rename matches them up:
file1 --> file0'
The internal score values are finer grained now. The earlier
maximum of 10000 has been raised to 60000; there are no user-visible
changes, but there is no reason to waste available bits.
Signed-off-by: Junio C Hamano <junkio@cox.net>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-05-30 15:08:37 +08:00
|
|
|
*
|
|
|
|
* Otherwise, the counterpart broken create
|
|
|
|
* has been turned into a rename-edit; or
|
|
|
|
* delete did not have a matching create to
|
|
|
|
* begin with.
|
2005-05-30 15:08:07 +08:00
|
|
|
*/
|
[PATCH] Add -B flag to diff-* brothers.
A new diffcore transformation, diffcore-break.c, is introduced.
When the -B flag is given, a patch that represents a complete
rewrite is broken into a deletion followed by a creation. This
makes it easier to review such a complete rewrite patch.
The -B flag takes the same syntax as the -M and -C flags to
specify the minimum amount of non-source material the resulting
file needs to have to be considered a complete rewrite, and
defaults to 99% if not specified.
As the new test t4008-diff-break-rewrite.sh demonstrates, if a
file is a complete rewrite, it is broken into a delete/create
pair, which can further be subjected to the usual rename
detection if -M or -C is used. For example, if file0 gets
completely rewritten to make it as if it were rather based on
file1 which itself disappeared, the following happens:
The original change looks like this:
file0 --> file0' (quite different from file0)
file1 --> /dev/null
After diffcore-break runs, it would become this:
file0 --> /dev/null
/dev/null --> file0'
file1 --> /dev/null
Then diffcore-rename matches them up:
file1 --> file0'
The internal score values are finer grained now. The earlier
maximum of 10000 has been raised to 60000; there are no user-visible
changes, but there is no reason to waste available bits.
Signed-off-by: Junio C Hamano <junkio@cox.net>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-05-30 15:08:37 +08:00
|
|
|
if (DIFF_PAIR_BROKEN(p)) {
|
|
|
|
/* broken delete */
|
2015-02-27 09:39:48 +08:00
|
|
|
struct diff_rename_dst *dst = locate_rename_dst(p->one);
|
[PATCH] Add -B flag to diff-* brothers.
A new diffcore transformation, diffcore-break.c, is introduced.
When the -B flag is given, a patch that represents a complete
rewrite is broken into a deletion followed by a creation. This
makes it easier to review such a complete rewrite patch.
The -B flag takes the same syntax as the -M and -C flags to
specify the minimum amount of non-source material the resulting
file needs to have to be considered a complete rewrite, and
defaults to 99% if not specified.
As the new test t4008-diff-break-rewrite.sh demonstrates, if a
file is a complete rewrite, it is broken into a delete/create
pair, which can further be subjected to the usual rename
detection if -M or -C is used. For example, if file0 gets
completely rewritten to make it as if it were rather based on
file1 which itself disappeared, the following happens:
The original change looks like this:
file0 --> file0' (quite different from file0)
file1 --> /dev/null
After diffcore-break runs, it would become this:
file0 --> /dev/null
/dev/null --> file0'
file1 --> /dev/null
Then diffcore-rename matches them up:
file1 --> file0'
The internal score values are finer grained now. The earlier
maximum of 10000 has been raised to 60000; there are no user-visible
changes, but there is no reason to waste available bits.
Signed-off-by: Junio C Hamano <junkio@cox.net>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-05-30 15:08:37 +08:00
|
|
|
if (dst && dst->pair)
|
|
|
|
/* counterpart is now rename/copy */
|
|
|
|
pair_to_free = p;
|
|
|
|
}
|
|
|
|
else {
|
2007-10-26 02:20:56 +08:00
|
|
|
if (p->one->rename_used)
|
[PATCH] Add -B flag to diff-* brothers.
A new diffcore transformation, diffcore-break.c, is introduced.
When the -B flag is given, a patch that represents a complete
rewrite is broken into a deletion followed by a creation. This
makes it easier to review such a complete rewrite patch.
The -B flag takes the same syntax as the -M and -C flags to
specify the minimum amount of non-source material the resulting
file needs to have to be considered a complete rewrite, and
defaults to 99% if not specified.
As the new test t4008-diff-break-rewrite.sh demonstrates, if a
file is a complete rewrite, it is broken into a delete/create
pair, which can further be subjected to the usual rename
detection if -M or -C is used. For example, if file0 gets
completely rewritten to make it as if it were rather based on
file1 which itself disappeared, the following happens:
The original change looks like this:
file0 --> file0' (quite different from file0)
file1 --> /dev/null
After diffcore-break runs, it would become this:
file0 --> /dev/null
/dev/null --> file0'
file1 --> /dev/null
Then diffcore-rename matches them up:
file1 --> file0'
The internal score values are finer grained now. The earlier
maximum of 10000 has been raised to 60000; there are no user-visible
changes, but there is no reason to waste available bits.
Signed-off-by: Junio C Hamano <junkio@cox.net>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-05-30 15:08:37 +08:00
|
|
|
/* this path remains */
|
|
|
|
pair_to_free = p;
|
|
|
|
}
|
2005-05-30 15:08:07 +08:00
|
|
|
|
|
|
|
if (pair_to_free)
|
|
|
|
;
|
|
|
|
else
|
|
|
|
diff_q(&outq, p);
|
|
|
|
}
|
2005-05-24 16:10:48 +08:00
|
|
|
else if (!diff_unmodified_pair(p))
|
2005-05-28 06:55:55 +08:00
|
|
|
/* all the usual ones need to be kept */
|
2005-05-24 16:10:48 +08:00
|
|
|
diff_q(&outq, p);
|
2005-05-28 06:55:55 +08:00
|
|
|
else
|
|
|
|
/* no need to keep unmodified pairs */
|
|
|
|
pair_to_free = p;
|
|
|
|
|
2005-05-28 06:50:30 +08:00
|
|
|
if (pair_to_free)
|
|
|
|
diff_free_filepair(pair_to_free);
|
2005-05-21 17:39:09 +08:00
|
|
|
}
|
2005-05-24 16:10:48 +08:00
|
|
|
diff_debug_queue("done copying original", &outq);
|
2005-05-21 17:39:09 +08:00
|
|
|
|
2005-05-24 16:10:48 +08:00
|
|
|
free(q->queue);
|
|
|
|
*q = outq;
|
|
|
|
diff_debug_queue("done collapsing", q);
|
2005-05-21 17:39:09 +08:00
|
|
|
|
2007-10-26 02:19:10 +08:00
|
|
|
for (i = 0; i < rename_dst_nr; i++)
|
|
|
|
free_filespec(rename_dst[i].two);
|
2005-09-16 07:13:43 +08:00
|
|
|
|
2017-06-16 07:15:46 +08:00
|
|
|
FREE_AND_NULL(rename_dst);
|
2005-05-24 16:10:48 +08:00
|
|
|
rename_dst_nr = rename_dst_alloc = 0;
|
2017-06-16 07:15:46 +08:00
|
|
|
FREE_AND_NULL(rename_src);
|
2005-05-24 16:10:48 +08:00
|
|
|
rename_src_nr = rename_src_alloc = 0;
|
2005-05-21 17:39:09 +08:00
|
|
|
return;
|
|
|
|
}
|