diff --git a/Makefile b/Makefile
index b957cec1a0..92d0e87535 100644
--- a/Makefile
+++ b/Makefile
@@ -46,8 +46,9 @@ LIB_H=cache.h object.h blob.h tree.h commit.h tag.h delta.h
 LIB_H += strbuf.h
 LIB_OBJS += strbuf.o
 
-LIB_H += diff.h
-LIB_OBJS += diff.o diffcore-rename.o diffcore-pickaxe.o diffcore-pathspec.o
+LIB_H += diff.h count-delta.h
+LIB_OBJS += diff.o diffcore-rename.o diffcore-pickaxe.o diffcore-pathspec.o \
+	count-delta.o
 
 LIB_OBJS += gitenv.o
 
diff --git a/count-delta.c b/count-delta.c
new file mode 100644
index 0000000000..dd81e92965
--- /dev/null
+++ b/count-delta.c
@@ -0,0 +1,131 @@
+/*
+ * Copyright (C) 2005 Junio C Hamano
+ * The delta-parsing part is almost straight copy of patch-delta.c
+ * which is (C) 2005 Nicolas Pitre.
+ */
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include "delta.h"
+#include "count-delta.h"
+
+/*
+ * Decode one size header starting at *datap. The first byte is a
+ * bitmask: bit k set means one byte follows that supplies bits
+ * 8k..8k+7 of the value. On success store the value in *sizep,
+ * advance *datap past the header and return 0. Return -1 without
+ * touching *datap or *sizep if the header runs past "top".
+ */
+static int get_hdr_size(const unsigned char **datap,
+			const unsigned char *top,
+			unsigned long *sizep)
+{
+	const unsigned char *data = *datap;
+	unsigned long size = 0;
+	unsigned int shift = 0;
+	unsigned char cmd;
+
+	if (data >= top)
+		return -1;
+	cmd = *data++;
+	for (; cmd; cmd >>= 1, shift += 8) {
+		unsigned long byte;
+
+		if (!(cmd & 1))
+			continue;
+		if (data >= top)
+			return -1;
+		byte = *data++;
+		/* bits that do not fit in unsigned long are dropped */
+		if (shift < sizeof(size) * CHAR_BIT)
+			size |= byte << shift;
+	}
+	*datap = data;
+	*sizep = size;
+	return 0;
+}
+
+/*
+ * NOTE. We do not _interpret_ delta fully. As an approximation, we
+ * just count the number of bytes that are copied from the source, and
+ * the number of literal data bytes that are inserted. Number of
+ * bytes that are _not_ copied from the source is deletion, and number
+ * of inserted literal bytes are addition, so sum of them is what we
+ * return. xdelta can express an edit that copies data inside of the
+ * destination which originally came from the source. We do not count
+ * that in the following routine, so we are undercounting the source
+ * material that remains in the final output that way.
+ *
+ * Returns UINT_MAX when the delta is shorter than 6 bytes, is cut
+ * short in the middle of a header or a command, or does not produce
+ * exactly the number of bytes its header announces.
+ */
+unsigned long count_delta(void *delta_buf, unsigned long delta_size)
+{
+	unsigned long copied_from_source, added_literal;
+	const unsigned char *data, *top;
+	unsigned char cmd;
+	unsigned long src_size, dst_size, out;
+
+	/* the smallest delta size possible is 6 bytes */
+	if (delta_size < 6)
+		return UINT_MAX;
+
+	data = delta_buf;
+	top = data + delta_size;
+
+	if (get_hdr_size(&data, top, &src_size) ||
+	    get_hdr_size(&data, top, &dst_size))
+		return UINT_MAX;
+
+	added_literal = copied_from_source = out = 0;
+	while (data < top) {
+		cmd = *data++;
+		if (cmd & 0x80) {
+			unsigned long cp_size = 0;
+			unsigned long operands = 0;
+			unsigned char bits;
+
+			/* one operand byte follows per low flag bit set */
+			for (bits = cmd & 0x3f; bits; bits >>= 1)
+				operands += bits & 1;
+			if ((unsigned long)(top - data) < operands)
+				return UINT_MAX;
+
+			/* the copy offset does not affect the count */
+			if (cmd & 0x01) data++;
+			if (cmd & 0x02) data++;
+			if (cmd & 0x04) data++;
+			if (cmd & 0x08) data++;
+			if (cmd & 0x10) cp_size = *data++;
+			if (cmd & 0x20) cp_size |= (*data++ << 8);
+			if (cp_size == 0) cp_size = 0x10000;
+
+			if (cmd & 0x40)
+				/* copy from dst */
+				;
+			else
+				copied_from_source += cp_size;
+			out += cp_size;
+		} else {
+			/* write literal into dst */
+			if ((unsigned long)(top - data) < cmd)
+				return UINT_MAX;
+			added_literal += cmd;
+			out += cmd;
+			data += cmd;
+		}
+	}
+
+	/* sanity check */
+	if (data != top || out != dst_size)
+		return UINT_MAX;
+
+	/* delete size is what was _not_ copied from source.
+	 * edit size is that and literal additions.
+	 */
+	if (src_size < copied_from_source)
+		/* more was copied than the source holds; nothing deleted */
+		return added_literal;
+	return (src_size - copied_from_source) + added_literal;
+}
diff --git a/count-delta.h b/count-delta.h
new file mode 100644
index 0000000000..4e6b584f43
--- /dev/null
+++ b/count-delta.h
@@ -0,0 +1,10 @@
+/*
+ * Copyright (C) 2005 Junio C Hamano
+ */
+#ifndef COUNT_DELTA_H
+#define COUNT_DELTA_H
+
+/* Estimate the edit size a delta describes; UINT_MAX on a bad delta. */
+unsigned long count_delta(void *delta_buf, unsigned long delta_size);
+
+#endif
diff --git a/diffcore-rename.c b/diffcore-rename.c
index 34e83dac8d..07782f4b7b 100644
--- a/diffcore-rename.c
+++ b/diffcore-rename.c
@@ -5,6 +5,7 @@
 #include "diff.h"
 #include "diffcore.h"
 #include "delta.h"
+#include "count-delta.h"
 
 /* Table of rename/copy destinations */
 
@@ -158,13 +159,20 @@ static int estimate_similarity(struct diff_filespec *src,
 	delta = diff_delta(src->data, src->size,
 			   dst->data, dst->size,
 			   &delta_size);
-	/*
-	 * We currently punt here, but we may later end up parsing the
-	 * delta to really assess the extent of damage. A big consecutive
-	 * remove would produce small delta_size that affects quite a
-	 * big portion of the file.
+
+	/* A delta that has a lot of literal additions would have
+	 * big delta_size no matter what else it does.
 	 */
+	if (minimum_score < MAX_SCORE * delta_size / base_size) {
+		free(delta);
+		return 0;
+	}
+
+	/* Estimate the edit size by interpreting delta. */
+	delta_size = count_delta(delta, delta_size);
 	free(delta);
+	if (delta_size == UINT_MAX)
+		return 0;
 
 	/*
 	 * Now we will give some score to it. 100% edit gets 0 points