2023-02-24 08:09:30 +08:00
|
|
|
#include "git-compat-util.h"
|
2006-03-01 08:01:36 +08:00
|
|
|
#include "diffcore.h"
|
2006-03-04 19:21:55 +08:00
|
|
|
|
|
|
|
/*
 * Idea here is very simple.
 *
 * Almost all data we are interested in are text, but sometimes we have
 * to deal with binary data.  So we cut them into chunks delimited by
 * LF byte, or 64-byte sequence, whichever comes first, and hash them.
 *
 * For each such chunk, if the source buffer has more instances of it
 * than the destination buffer, that means the difference is the
 * number of bytes not copied from source to destination.  If the
 * counts are the same, everything was copied from source to
 * destination.  If the destination has more, everything was copied,
 * and destination added more.
 *
 * We are doing an approximation so we do not really have to waste
 * memory by actually storing the sequence.  We just hash them into
 * somewhere around 2^16 hashbuckets and count the occurrences.
 */
|
|
|
|
|
2006-03-12 19:22:10 +08:00
|
|
|
/* Wild guess at the initial hash size (log2 of the number of slots) */
#define INITIAL_HASH_SIZE 9

/*
 * Number of entries a table with (1 << sz_log2) slots may take before
 * it has to grow.  We leave more room in smaller hash but do not let it
 * grow to have unused hole too much.
 *
 * Every use of the argument is parenthesized so that an expression
 * argument (e.g. "a | b") is evaluated as a unit.  The argument is
 * still evaluated more than once, so it must be free of side effects.
 */
#define INITIAL_FREE(sz_log2) \
	((1 << (sz_log2)) * ((sz_log2) - 3) / (sz_log2))

/*
 * A prime rather carefully chosen between 2^16..2^17, so that
 * HASHBASE < INITIAL_FREE(17).  Hash values are reduced modulo
 * HASHBASE, so there can never be more than HASHBASE distinct entries;
 * a table of 1 << 17 slots can hold that many before it would have to
 * overflow to a table of 1 << 18 slots, which keeps the maximum
 * hashtable size at 1 << 17.
 */
#define HASHBASE 107927
|
|
|
|
|
2006-03-12 19:22:10 +08:00
|
|
|
/*
 * One slot of the chunk-hash table.  A slot whose cnt is zero is
 * unused; the insertion and rehash code relies on that to find free
 * slots.
 */
struct spanhash {
	unsigned int hashval;	/* hash of the chunk(s) counted in this slot */
	unsigned int cnt;	/* total bytes seen with this hash; 0 = empty */
};
|
|
|
|
struct spanhash_top {
|
|
|
|
int alloc_log2;
|
|
|
|
int free;
|
|
|
|
struct spanhash data[FLEX_ARRAY];
|
|
|
|
};
|
|
|
|
|
|
|
|
static struct spanhash_top *spanhash_rehash(struct spanhash_top *orig)
|
|
|
|
{
|
2018-02-15 02:59:40 +08:00
|
|
|
struct spanhash_top *new_spanhash;
|
2006-03-12 19:22:10 +08:00
|
|
|
int i;
|
|
|
|
int osz = 1 << orig->alloc_log2;
|
|
|
|
int sz = osz << 1;
|
|
|
|
|
2018-02-15 02:59:40 +08:00
|
|
|
new_spanhash = xmalloc(st_add(sizeof(*orig),
|
2016-02-23 06:44:35 +08:00
|
|
|
st_mult(sizeof(struct spanhash), sz)));
|
2018-02-15 02:59:40 +08:00
|
|
|
new_spanhash->alloc_log2 = orig->alloc_log2 + 1;
|
|
|
|
new_spanhash->free = INITIAL_FREE(new_spanhash->alloc_log2);
|
|
|
|
memset(new_spanhash->data, 0, sizeof(struct spanhash) * sz);
|
2006-03-12 19:22:10 +08:00
|
|
|
for (i = 0; i < osz; i++) {
|
|
|
|
struct spanhash *o = &(orig->data[i]);
|
|
|
|
int bucket;
|
|
|
|
if (!o->cnt)
|
|
|
|
continue;
|
|
|
|
bucket = o->hashval & (sz - 1);
|
|
|
|
while (1) {
|
2018-02-15 02:59:40 +08:00
|
|
|
struct spanhash *h = &(new_spanhash->data[bucket++]);
|
2006-03-12 19:22:10 +08:00
|
|
|
if (!h->cnt) {
|
|
|
|
h->hashval = o->hashval;
|
|
|
|
h->cnt = o->cnt;
|
2018-02-15 02:59:40 +08:00
|
|
|
new_spanhash->free--;
|
2006-03-12 19:22:10 +08:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (sz <= bucket)
|
|
|
|
bucket = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
free(orig);
|
2018-02-15 02:59:40 +08:00
|
|
|
return new_spanhash;
|
2006-03-12 19:22:10 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static struct spanhash_top *add_spanhash(struct spanhash_top *top,
|
2006-03-15 16:37:57 +08:00
|
|
|
unsigned int hashval, int cnt)
|
2006-03-12 19:22:10 +08:00
|
|
|
{
|
|
|
|
int bucket, lim;
|
|
|
|
struct spanhash *h;
|
|
|
|
|
|
|
|
lim = (1 << top->alloc_log2);
|
|
|
|
bucket = hashval & (lim - 1);
|
|
|
|
while (1) {
|
|
|
|
h = &(top->data[bucket++]);
|
|
|
|
if (!h->cnt) {
|
|
|
|
h->hashval = hashval;
|
2006-03-15 16:37:57 +08:00
|
|
|
h->cnt = cnt;
|
2006-03-12 19:22:10 +08:00
|
|
|
top->free--;
|
|
|
|
if (top->free < 0)
|
|
|
|
return spanhash_rehash(top);
|
|
|
|
return top;
|
|
|
|
}
|
|
|
|
if (h->hashval == hashval) {
|
2006-03-15 16:37:57 +08:00
|
|
|
h->cnt += cnt;
|
2006-03-12 19:22:10 +08:00
|
|
|
return top;
|
|
|
|
}
|
|
|
|
if (lim <= bucket)
|
|
|
|
bucket = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2007-10-03 10:28:19 +08:00
|
|
|
static int spanhash_cmp(const void *a_, const void *b_)
|
|
|
|
{
|
|
|
|
const struct spanhash *a = a_;
|
|
|
|
const struct spanhash *b = b_;
|
|
|
|
|
|
|
|
/* A count of zero compares at the end.. */
|
|
|
|
if (!a->cnt)
|
|
|
|
return !b->cnt ? 0 : 1;
|
|
|
|
if (!b->cnt)
|
|
|
|
return -1;
|
|
|
|
return a->hashval < b->hashval ? -1 :
|
|
|
|
a->hashval > b->hashval ? 1 : 0;
|
|
|
|
}
|
|
|
|
|
2018-09-21 23:57:19 +08:00
|
|
|
static struct spanhash_top *hash_chars(struct repository *r,
|
|
|
|
struct diff_filespec *one)
|
2006-03-01 08:01:36 +08:00
|
|
|
{
|
2006-03-15 16:37:57 +08:00
|
|
|
int i, n;
|
2006-03-15 16:37:57 +08:00
|
|
|
unsigned int accum1, accum2, hashval;
|
2006-03-12 19:22:10 +08:00
|
|
|
struct spanhash_top *hash;
|
2007-06-29 14:14:13 +08:00
|
|
|
unsigned char *buf = one->data;
|
|
|
|
unsigned int sz = one->size;
|
2018-09-21 23:57:19 +08:00
|
|
|
int is_text = !diff_filespec_is_binary(r, one);
|
2006-03-12 19:22:10 +08:00
|
|
|
|
|
|
|
i = INITIAL_HASH_SIZE;
|
2016-02-23 06:44:35 +08:00
|
|
|
hash = xmalloc(st_add(sizeof(*hash),
|
2021-12-01 08:29:01 +08:00
|
|
|
st_mult(sizeof(struct spanhash), (size_t)1 << i)));
|
2006-03-12 19:22:10 +08:00
|
|
|
hash->alloc_log2 = i;
|
2006-03-13 08:39:51 +08:00
|
|
|
hash->free = INITIAL_FREE(i);
|
2021-12-01 08:29:01 +08:00
|
|
|
memset(hash->data, 0, sizeof(struct spanhash) * ((size_t)1 << i));
|
2006-03-01 08:01:36 +08:00
|
|
|
|
2006-03-15 16:37:57 +08:00
|
|
|
n = 0;
|
|
|
|
accum1 = accum2 = 0;
|
2006-03-04 19:21:55 +08:00
|
|
|
while (sz) {
|
2006-03-15 16:37:57 +08:00
|
|
|
unsigned int c = *buf++;
|
|
|
|
unsigned int old_1 = accum1;
|
2006-03-04 19:21:55 +08:00
|
|
|
sz--;
|
2007-06-29 14:14:13 +08:00
|
|
|
|
|
|
|
/* Ignore CR in CRLF sequence if text */
|
|
|
|
if (is_text && c == '\r' && sz && *buf == '\n')
|
|
|
|
continue;
|
|
|
|
|
2006-03-15 16:37:57 +08:00
|
|
|
accum1 = (accum1 << 7) ^ (accum2 >> 25);
|
|
|
|
accum2 = (accum2 << 7) ^ (old_1 >> 25);
|
2006-03-15 16:37:57 +08:00
|
|
|
accum1 += c;
|
|
|
|
if (++n < 64 && c != '\n')
|
|
|
|
continue;
|
|
|
|
hashval = (accum1 + accum2 * 0x61) % HASHBASE;
|
|
|
|
hash = add_spanhash(hash, hashval, n);
|
|
|
|
n = 0;
|
|
|
|
accum1 = accum2 = 0;
|
2006-03-01 08:01:36 +08:00
|
|
|
}
|
2024-01-13 12:26:13 +08:00
|
|
|
if (n > 0) {
|
|
|
|
hashval = (accum1 + accum2 * 0x61) % HASHBASE;
|
|
|
|
hash = add_spanhash(hash, hashval, n);
|
|
|
|
}
|
2021-12-01 08:29:01 +08:00
|
|
|
QSORT(hash->data, (size_t)1ul << hash->alloc_log2, spanhash_cmp);
|
2006-03-12 19:22:10 +08:00
|
|
|
return hash;
|
2006-03-01 08:01:36 +08:00
|
|
|
}
|
|
|
|
|
2018-09-21 23:57:19 +08:00
|
|
|
int diffcore_count_changes(struct repository *r,
|
|
|
|
struct diff_filespec *src,
|
2007-06-29 13:54:37 +08:00
|
|
|
struct diff_filespec *dst,
|
2006-03-12 19:22:10 +08:00
|
|
|
void **src_count_p,
|
|
|
|
void **dst_count_p,
|
2006-03-01 08:01:36 +08:00
|
|
|
unsigned long *src_copied,
|
|
|
|
unsigned long *literal_added)
|
|
|
|
{
|
2007-10-03 10:28:19 +08:00
|
|
|
struct spanhash *s, *d;
|
2006-03-12 19:22:10 +08:00
|
|
|
struct spanhash_top *src_count, *dst_count;
|
2006-03-04 19:21:55 +08:00
|
|
|
unsigned long sc, la;
|
|
|
|
|
2006-03-12 19:22:10 +08:00
|
|
|
src_count = dst_count = NULL;
|
|
|
|
if (src_count_p)
|
|
|
|
src_count = *src_count_p;
|
|
|
|
if (!src_count) {
|
2018-09-21 23:57:19 +08:00
|
|
|
src_count = hash_chars(r, src);
|
2006-03-12 19:22:10 +08:00
|
|
|
if (src_count_p)
|
|
|
|
*src_count_p = src_count;
|
|
|
|
}
|
|
|
|
if (dst_count_p)
|
|
|
|
dst_count = *dst_count_p;
|
|
|
|
if (!dst_count) {
|
2018-09-21 23:57:19 +08:00
|
|
|
dst_count = hash_chars(r, dst);
|
2006-03-12 19:22:10 +08:00
|
|
|
if (dst_count_p)
|
|
|
|
*dst_count_p = dst_count;
|
|
|
|
}
|
2006-03-04 19:21:55 +08:00
|
|
|
sc = la = 0;
|
2006-03-12 19:22:10 +08:00
|
|
|
|
2007-10-03 10:28:19 +08:00
|
|
|
s = src_count->data;
|
|
|
|
d = dst_count->data;
|
|
|
|
for (;;) {
|
2006-03-12 19:22:10 +08:00
|
|
|
unsigned dst_cnt, src_cnt;
|
|
|
|
if (!s->cnt)
|
2007-10-03 10:28:19 +08:00
|
|
|
break; /* we checked all in src */
|
|
|
|
while (d->cnt) {
|
|
|
|
if (d->hashval >= s->hashval)
|
|
|
|
break;
|
2009-12-05 04:07:47 +08:00
|
|
|
la += d->cnt;
|
2007-10-03 10:28:19 +08:00
|
|
|
d++;
|
|
|
|
}
|
2006-03-12 19:22:10 +08:00
|
|
|
src_cnt = s->cnt;
|
2009-12-05 04:07:47 +08:00
|
|
|
dst_cnt = 0;
|
|
|
|
if (d->cnt && d->hashval == s->hashval) {
|
|
|
|
dst_cnt = d->cnt;
|
|
|
|
d++;
|
|
|
|
}
|
2006-03-12 19:22:10 +08:00
|
|
|
if (src_cnt < dst_cnt) {
|
|
|
|
la += dst_cnt - src_cnt;
|
|
|
|
sc += src_cnt;
|
2006-03-04 19:21:55 +08:00
|
|
|
}
|
2006-03-12 19:22:10 +08:00
|
|
|
else
|
|
|
|
sc += dst_cnt;
|
2007-10-03 10:28:19 +08:00
|
|
|
s++;
|
2006-03-04 19:21:55 +08:00
|
|
|
}
|
2009-12-05 04:07:47 +08:00
|
|
|
while (d->cnt) {
|
|
|
|
la += d->cnt;
|
|
|
|
d++;
|
|
|
|
}
|
2006-03-12 19:22:10 +08:00
|
|
|
|
|
|
|
if (!src_count_p)
|
|
|
|
free(src_count);
|
|
|
|
if (!dst_count_p)
|
|
|
|
free(dst_count);
|
2006-03-04 19:21:55 +08:00
|
|
|
*src_copied = sc;
|
|
|
|
*literal_added = la;
|
|
|
|
return 0;
|
2006-03-01 08:01:36 +08:00
|
|
|
}
|