mirror of
https://github.com/git/git.git
synced 2024-12-03 23:14:23 +08:00
[PATCH] Update rename/copy similarity estimator.
The second round similarity estimator simply used the size of the xdelta itself to estimate the extent of damage. This patch keeps that logic to detect big insertions to terminate the check early, but otherwise looks at the generated delta in order to estimate the extent of edit more accurately. Signed-off-by: Junio C Hamano <junkio@cox.net> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
This commit is contained in:
parent
bba0f401ee
commit
8597697458
5
Makefile
5
Makefile
@ -46,8 +46,9 @@ LIB_H=cache.h object.h blob.h tree.h commit.h tag.h delta.h
|
||||
LIB_H += strbuf.h
|
||||
LIB_OBJS += strbuf.o
|
||||
|
||||
LIB_H += diff.h
|
||||
LIB_OBJS += diff.o diffcore-rename.o diffcore-pickaxe.o diffcore-pathspec.o
|
||||
LIB_H += diff.h count-delta.h
|
||||
LIB_OBJS += diff.o diffcore-rename.o diffcore-pickaxe.o diffcore-pathspec.o \
|
||||
count-delta.o
|
||||
|
||||
LIB_OBJS += gitenv.o
|
||||
|
||||
|
93
count-delta.c
Normal file
93
count-delta.c
Normal file
@ -0,0 +1,93 @@
|
||||
/*
|
||||
* Copyright (C) 2005 Junio C Hamano
|
||||
* The delta-parsing part is almost straight copy of patch-delta.c
|
||||
* which is (C) 2005 Nicolas Pitre <nico@cam.org>.
|
||||
*/
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <limits.h>
|
||||
#include "delta.h"
|
||||
#include "count-delta.h"
|
||||
|
||||
/*
 * Read one variable-length size field from the delta header.
 *
 * The field starts with a bitmask byte: bit N set means that byte N of
 * the value (least significant first) follows in the stream; bytes
 * whose bit is clear are zero.
 *
 * On success the value is stored in *sizep, *datap is advanced past
 * the field and 0 is returned.  -1 is returned, with *datap and *sizep
 * untouched, when the field would run past "top" (one past the last
 * byte of the delta) or when the value does not fit in unsigned long.
 */
static int get_hdr_size(const unsigned char **datap,
			const unsigned char *top,
			unsigned long *sizep)
{
	const unsigned char *data = *datap;
	unsigned long size = 0;
	unsigned int shift = 0;
	unsigned char cmd;

	if (data >= top)
		return -1;
	cmd = *data++;
	while (cmd) {
		if (cmd & 1) {
			unsigned long byte;

			if (data >= top)
				return -1;
			byte = *data++;
			/*
			 * Shift in unsigned long, never in int: an int
			 * shift overflows for bytes >= 0x80 at bit 24 and
			 * is undefined for shift counts of 32 or more.
			 */
			if (shift < sizeof(size) * CHAR_BIT)
				size |= byte << shift;
			else if (byte)
				return -1; /* not representable */
		}
		shift += 8;
		cmd >>= 1;
	}
	*datap = data;
	*sizep = size;
	return 0;
}

/*
 * NOTE.  We do not _interpret_ the delta fully.  As an approximation,
 * we just count the number of bytes that are copied from the source,
 * and the number of literal data bytes that are inserted.  The number
 * of source bytes that are _not_ copied is the deletion, the number of
 * inserted literal bytes is the addition, and their sum is what we
 * return.
 *
 * xdelta can express an edit that copies data inside of the
 * destination which originally came from the source.  We do not count
 * that here, so we undercount the source material that remains in the
 * final output that way.  Conversely, a delta may copy the same source
 * region more than once; the deletion is clamped at zero in that case
 * instead of being allowed to wrap around.
 *
 * delta_buf points at delta_size bytes of delta data.  Returns the
 * estimated edit size, or UINT_MAX when the delta is too short,
 * truncated, or does not produce the destination size recorded in its
 * header.
 */
unsigned long count_delta(void *delta_buf, unsigned long delta_size)
{
	unsigned long copied_from_source = 0, added_literal = 0, out = 0;
	unsigned long src_size, dst_size, deleted;
	const unsigned char *data, *top;

	/* the smallest delta size possible is 6 bytes */
	if (delta_size < 6)
		return UINT_MAX;

	data = delta_buf;
	top = data + delta_size; /* arithmetic on void * is not standard C */

	if (get_hdr_size(&data, top, &src_size) ||
	    get_hdr_size(&data, top, &dst_size))
		return UINT_MAX;

	while (data < top) {
		unsigned char cmd = *data++;

		if (cmd & 0x80) {
			/*
			 * Copy command: bits 0x01..0x08 select the offset
			 * bytes that follow, bits 0x10 and 0x20 the size
			 * bytes.
			 */
			unsigned long cp_size = 0;
			unsigned int operands = 0;
			unsigned int bit;

			for (bit = 0x01; bit <= 0x20; bit <<= 1)
				if (cmd & bit)
					operands++;
			if ((unsigned long)(top - data) < operands)
				return UINT_MAX; /* truncated command */

			/* the copy offset is irrelevant for counting; skip it */
			for (bit = 0x01; bit <= 0x08; bit <<= 1)
				if (cmd & bit)
					data++;
			if (cmd & 0x10)
				cp_size = *data++;
			if (cmd & 0x20)
				cp_size |= (unsigned long)*data++ << 8;
			if (cp_size == 0)
				cp_size = 0x10000;

			/* 0x40 marks a copy from the destination, not the source */
			if (!(cmd & 0x40))
				copied_from_source += cp_size;
			out += cp_size;
		} else {
			/* literal command: cmd bytes of data follow */
			if ((unsigned long)(top - data) < cmd)
				return UINT_MAX; /* truncated literal */
			added_literal += cmd;
			out += cmd;
			data += cmd;
		}
	}

	/* sanity check */
	if (data != top || out != dst_size)
		return UINT_MAX;

	/*
	 * delete size is what was _not_ copied from source; it cannot be
	 * negative even if the same source bytes were copied repeatedly.
	 * edit size is that and literal additions.
	 */
	deleted = (copied_from_source < src_size)
		? src_size - copied_from_source : 0;
	return deleted + added_literal;
}
|
9
count-delta.h
Normal file
9
count-delta.h
Normal file
@ -0,0 +1,9 @@
|
||||
/*
|
||||
* Copyright (C) 2005 Junio C Hamano
|
||||
*/
|
||||
#ifndef COUNT_DELTA_H
|
||||
#define COUNT_DELTA_H
|
||||
|
||||
/*
 * Estimate the size of the edit described by a delta: the number of
 * source bytes the delta does not copy plus the number of literal
 * bytes it inserts.
 *
 * delta_buf points at the delta data and delta_size is its length in
 * bytes.  Returns UINT_MAX (from <limits.h>) when the delta fails its
 * sanity checks.
 */
unsigned long count_delta(void *delta_buf, unsigned long delta_size);
|
||||
|
||||
#endif
|
@ -5,6 +5,7 @@
|
||||
#include "diff.h"
|
||||
#include "diffcore.h"
|
||||
#include "delta.h"
|
||||
#include "count-delta.h"
|
||||
|
||||
/* Table of rename/copy destinations */
|
||||
|
||||
@ -158,13 +159,18 @@ static int estimate_similarity(struct diff_filespec *src,
|
||||
delta = diff_delta(src->data, src->size,
|
||||
dst->data, dst->size,
|
||||
&delta_size);
|
||||
/*
|
||||
* We currently punt here, but we may later end up parsing the
|
||||
* delta to really assess the extent of damage. A big consecutive
|
||||
* remove would produce small delta_size that affects quite a
|
||||
* big portion of the file.
|
||||
|
||||
/* A delta that has a lot of literal additions would have
|
||||
* big delta_size no matter what else it does.
|
||||
*/
|
||||
if (minimum_score < MAX_SCORE * delta_size / base_size)
|
||||
return 0;
|
||||
|
||||
/* Estimate the edit size by interpreting delta. */
|
||||
delta_size = count_delta(delta, delta_size);
|
||||
free(delta);
|
||||
if (delta_size == UINT_MAX)
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* Now we will give some score to it. 100% edit gets 0 points
|
||||
|
Loading…
Reference in New Issue
Block a user