git/xdiff-interface.c

324 lines
7.5 KiB
C
Raw Normal View History

#include "cache.h"
#include "config.h"
#include "object-store.h"
#include "xdiff-interface.h"
#include "xdiff/xtypes.h"
#include "xdiff/xdiffi.h"
#include "xdiff/xemit.h"
#include "xdiff/xmacros.h"
#include "xdiff/xutils.h"
/*
 * State shared by the emit callbacks in this file; it is handed to
 * them through their opaque "priv_" argument.
 */
struct xdiff_emit_state {
/* invoked by xdiff_out_hunk() for each hunk header */
xdiff_emit_hunk_fn hunk_fn;
/* invoked by consume_one() for each line; xdiff_outf() is a no-op when NULL */
xdiff_emit_line_fn line_fn;
/* opaque pointer passed as the first argument to hunk_fn and line_fn */
void *consume_callback_data;
/* bytes of a not-yet-terminated line collected by xdiff_outf() */
struct strbuf remainder;
};
/*
 * Hunk callback: forwards the hunk header fields unchanged to the
 * caller-supplied hunk_fn.  Always returns 0.
 */
static int xdiff_out_hunk(void *priv_,
			  long old_begin, long old_nr,
			  long new_begin, long new_nr,
			  const char *func, long funclen)
{
	struct xdiff_emit_state *state = priv_;
	void *cb_data;

	/* a hunk header must not arrive while a partial line is pending */
	if (state->remainder.len != 0)
		BUG("xdiff emitted hunk in the middle of a line");

	cb_data = state->consume_callback_data;
	state->hunk_fn(cb_data, old_begin, old_nr, new_begin, new_nr,
		       func, funclen);
	return 0;
}
/*
 * Split the "size" bytes at "s" at each '\n' and pass every piece
 * (newline included) to line_fn.  A trailing piece without a newline
 * is passed as-is.
 */
static void consume_one(void *priv_, char *s, unsigned long size)
{
	struct xdiff_emit_state *state = priv_;
	char *cursor = s;
	unsigned long left = size;

	while (left > 0) {
		char *nl = memchr(cursor, '\n', left);
		unsigned long chunk;

		if (nl)
			chunk = nl - cursor + 1;
		else
			chunk = left;
		state->line_fn(state->consume_callback_data, cursor, chunk);
		cursor += chunk;
		left -= chunk;
	}
}
/*
 * Line callback: receives "nbuf" buffers and hands their contents to
 * line_fn (via consume_one()) one line at a time.
 *
 * A buffer that does not end in '\n' is appended to the remainder; a
 * buffer that does end in '\n' completes whatever is pending there.
 * Anything still pending after the last buffer is flushed before
 * returning.  Does nothing when no line_fn was supplied.
 *
 * Always returns 0.
 */
static int xdiff_outf(void *priv_, mmbuffer_t *mb, int nbuf)
{
	struct xdiff_emit_state *priv = priv_;
	int i;

	if (!priv->line_fn)
		return 0;

	for (i = 0; i < nbuf; i++) {
		/*
		 * An empty buffer has no last byte to inspect; looking at
		 * ptr[size - 1] would read before the start of the buffer.
		 * It contributes nothing, so just skip it.
		 */
		if (!mb[i].size)
			continue;

		if (mb[i].ptr[mb[i].size - 1] != '\n') {
			/* Incomplete line */
			strbuf_add(&priv->remainder, mb[i].ptr, mb[i].size);
			continue;
		}

		/* we have a complete line */
		if (!priv->remainder.len) {
			consume_one(priv, mb[i].ptr, mb[i].size);
			continue;
		}
		strbuf_add(&priv->remainder, mb[i].ptr, mb[i].size);
		consume_one(priv, priv->remainder.buf, priv->remainder.len);
		strbuf_reset(&priv->remainder);
	}

	/* flush a trailing line that never got its newline */
	if (priv->remainder.len) {
		consume_one(priv, priv->remainder.buf, priv->remainder.len);
		strbuf_reset(&priv->remainder);
	}
	return 0;
}
/*
 * Trim down common substring at the end of the buffers,
 * but end on a complete line.
 *
 * Only the size fields of "a" and "b" are modified.
 */
static void trim_common_tail(mmfile_t *a, mmfile_t *b)
{
	const int blk = 1024;
	long trimmed = 0, recovered = 0;
	/*
	 * Computing even a zero offset from a NULL pointer is undefined
	 * behavior, so only form the end pointers when there is data.
	 * The test is on the size rather than the pointer so that a NULL
	 * pointer paired with a non-zero size still faults instead of
	 * being silently treated as empty.
	 */
	char *ap = a->size ? a->ptr + a->size : a->ptr;
	char *bp = b->size ? b->ptr + b->size : b->ptr;
	long smaller = (a->size < b->size) ? a->size : b->size;

	/* walk backwards over identical blk-sized blocks */
	while (blk + trimmed <= smaller && !memcmp(ap - blk, bp - blk, blk)) {
		trimmed += blk;
		ap -= blk;
		bp -= blk;
	}

	/*
	 * Give back bytes from the start of the trimmed region up to and
	 * including its first newline, so the kept part ends on a
	 * complete line.
	 */
	while (recovered < trimmed)
		if (ap[recovered++] == '\n')
			break;
	a->size -= trimmed - recovered;
	b->size -= trimmed - recovered;
}
/*
 * Diff mf1 against mf2 with xdl_diff(), working on local copies of the
 * two mmfile_t descriptors so the caller's structs are left untouched.
 *
 * Returns -1 without diffing when either input is larger than
 * MAX_XDIFF_SIZE; otherwise returns whatever xdl_diff() returns.
 */
int xdi_diff(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
	     xdemitconf_t const *xecfg, xdemitcb_t *xecb)
{
	mmfile_t a = *mf1;
	mmfile_t b = *mf2;

	/*
	 * The xdiff code uses "int" in many places and is not prepared
	 * to handle extremely large files; reject them here rather than
	 * risk overflow and silently wrong output.
	 */
	if (mf1->size > MAX_XDIFF_SIZE || mf2->size > MAX_XDIFF_SIZE)
		return -1;

	/* the common tail is only dropped when no context is requested */
	if (!xecfg->ctxlen && !(xecfg->flags & XDL_EMIT_FUNCCONTEXT))
		trim_common_tail(&a, &b);

	return xdl_diff(&a, &b, xpp, xecfg, xecb);
}
/*
 * Hunk callback that does nothing: every argument is ignored.
 */
void discard_hunk_line(void *priv,
long ob, long on, long nb, long nn,
const char *func, long funclen)
{
/* intentionally empty */
}
/*
 * Diff mf1 against mf2 and report the result through callbacks:
 * hunk_fn (optional) for hunk headers and line_fn for output lines,
 * each receiving consume_callback_data as its first argument.
 *
 * Returns the result of xdi_diff().
 */
int xdi_diff_outf(mmfile_t *mf1, mmfile_t *mf2,
		  xdiff_emit_hunk_fn hunk_fn,
		  xdiff_emit_line_fn line_fn,
		  void *consume_callback_data,
		  xpparam_t const *xpp, xdemitconf_t const *xecfg)
{
	struct xdiff_emit_state emit;
	xdemitcb_t cb;
	int status;

	memset(&emit, 0, sizeof(emit));
	emit.consume_callback_data = consume_callback_data;
	emit.line_fn = line_fn;
	emit.hunk_fn = hunk_fn;
	strbuf_init(&emit.remainder, 0);

	memset(&cb, 0, sizeof(cb));
	cb.priv = &emit;
	cb.out_line = xdiff_outf;
	/* the hunk callback is only installed when the caller wants it */
	if (hunk_fn != NULL)
		cb.out_hunk = xdiff_out_hunk;

	status = xdi_diff(mf1, mf2, xpp, xecfg, &cb);
	strbuf_release(&emit.remainder);
	return status;
}
/*
 * Read the whole of "filename" into a freshly allocated buffer and
 * store it in *ptr.
 *
 * Returns 0 on success.  On failure the result of error_errno() or
 * error() is returned; if the read itself failed, the buffer is freed
 * and *ptr is reset to NULL/0 so nothing is leaked and no dangling
 * pointer is left behind.
 */
int read_mmfile(mmfile_t *ptr, const char *filename)
{
	struct stat st;
	FILE *f;
	size_t sz;

	if (stat(filename, &st))
		return error_errno("Could not stat %s", filename);
	if ((f = fopen(filename, "rb")) == NULL)
		return error_errno("Could not open %s", filename);
	sz = xsize_t(st.st_size);
	/* allocate at least one byte so an empty file still gets a buffer */
	ptr->ptr = xmalloc(sz ? sz : 1);
	if (sz && fread(ptr->ptr, sz, 1, f) != 1) {
		fclose(f);
		free(ptr->ptr);
		ptr->ptr = NULL;
		ptr->size = 0;
		return error("Could not read %s", filename);
	}
	fclose(f);
	ptr->size = sz;
	return 0;
}
/*
 * Load the blob named by "oid" into *ptr.  The null object id yields
 * an empty, separately allocated string.  Dies if the object cannot
 * be read or is not a blob.
 */
void read_mmblob(mmfile_t *ptr, const struct object_id *oid)
{
	enum object_type type;
	unsigned long size;

	if (!oideq(oid, &null_oid)) {
		ptr->ptr = read_object_file(oid, &type, &size);
		if (ptr->ptr == NULL || type != OBJ_BLOB)
			die("unable to read blob object %s", oid_to_hex(oid));
		ptr->size = size;
	} else {
		ptr->ptr = xstrdup("");
		ptr->size = 0;
	}
}
#define FIRST_FEW_BYTES 8000
/*
 * Report whether a NUL byte occurs within the first FIRST_FEW_BYTES
 * bytes of the buffer (or within all of it, when it is shorter).
 * Returns 1 if so, 0 otherwise.
 */
int buffer_is_binary(const char *ptr, unsigned long size)
{
	unsigned long scan = size > FIRST_FEW_BYTES ? FIRST_FEW_BYTES : size;

	return memchr(ptr, 0, scan) != NULL;
}
/*
 * Private data for ff_regexp(): the compiled patterns built by
 * xdiff_set_find_func() and released by xdiff_clear_find_func().
 */
struct ff_regs {
/* number of entries in array */
int nr;
struct ff_reg {
/* compiled pattern */
regex_t re;
/* non-zero when the pattern text began with '!' */
int negate;
} *array;
};
/*
 * find_func implementation backed by the regexps in "priv"
 * (a struct ff_regs).
 *
 * The patterns are tried in order against "line" (without its line
 * terminator).  The first one that matches decides: a negated pattern
 * rejects the line, any other one accepts it.  For an accepted line,
 * the first capture group (or the whole match when the group did not
 * participate) is copied to "buffer", limited to buffer_size bytes and
 * with trailing whitespace dropped.
 *
 * Returns the number of bytes copied, or -1 when the line is rejected
 * or no pattern matches.
 */
static long ff_regexp(const char *line, long len,
		char *buffer, long buffer_size, void *priv)
{
	struct ff_regs *regs = priv;
	regmatch_t pmatch[2];
	int idx, group, out_len;
	int hit = 0;

	/* Exclude terminating newline (and cr) from matching */
	if (len > 0 && line[len - 1] == '\n') {
		len--;
		if (len > 0 && line[len - 1] == '\r')
			len--;
	}

	for (idx = 0; idx < regs->nr; idx++) {
		struct ff_reg *reg = &regs->array[idx];

		if (regexec_buf(&reg->re, line, len, 2, pmatch, 0))
			continue;
		if (reg->negate)
			return -1;
		hit = 1;
		break;
	}
	if (!hit)
		return -1;

	group = (pmatch[1].rm_so >= 0) ? 1 : 0;
	line += pmatch[group].rm_so;
	out_len = pmatch[group].rm_eo - pmatch[group].rm_so;
	if (out_len > buffer_size)
		out_len = buffer_size;
	for (; out_len > 0; out_len--)
		if (!isspace(line[out_len - 1]))
			break;
	memcpy(buffer, line, out_len);
	return out_len;
}
/*
 * Install ff_regexp() as xecfg's find_func, with its patterns taken
 * from "value": one regular expression per '\n'-separated line, each
 * compiled with "cflags".  A line starting with '!' is a negated
 * pattern (the '!' itself is not part of the expression).
 *
 * Dies if the last pattern is negated or if any pattern fails to
 * compile.
 */
void xdiff_set_find_func(xdemitconf_t *xecfg, const char *value, int cflags)
{
	int i;
	struct ff_regs *regs;

	xecfg->find_func = ff_regexp;
	regs = xecfg->find_func_priv = xmalloc(sizeof(struct ff_regs));
	/* one pattern per line of "value" */
	for (i = 0, regs->nr = 1; value[i]; i++)
		if (value[i] == '\n')
			regs->nr++;
	ALLOC_ARRAY(regs->array, regs->nr);
	for (i = 0; i < regs->nr; i++) {
		struct ff_reg *reg = regs->array + i;
		const char *ep, *expression;
		char *buffer = NULL;

		if (!value)
			BUG("mismatch between line count and parsing");
		ep = strchr(value, '\n');
		reg->negate = (*value == '!');
		if (reg->negate && i == regs->nr - 1)
			die("Last expression must not be negated: %s", value);
		if (*value == '!')
			value++;
		/* every line but the last needs a NUL-terminated copy */
		if (ep)
			expression = buffer = xstrndup(value, ep - value);
		else
			expression = value;
		if (regcomp(&reg->re, expression, cflags))
			die("Invalid regexp to look for hunk header: %s", expression);
		/* buffer stays NULL for the last line; free(NULL) is a no-op */
		free(buffer);
		value = ep ? ep + 1 : NULL;
	}
}
/*
 * Release the patterns attached to xecfg's find_func and clear both
 * find_func and find_func_priv.  Does nothing when no find_func is
 * set.
 */
void xdiff_clear_find_func(xdemitconf_t *xecfg)
{
	struct ff_regs *regs;
	int idx = 0;

	if (!xecfg->find_func)
		return;

	regs = xecfg->find_func_priv;
	while (idx < regs->nr) {
		regfree(&regs->array[idx].re);
		idx++;
	}
	free(regs->array);
	free(regs);

	xecfg->find_func = NULL;
	xecfg->find_func_priv = NULL;
}
/*
 * Hash the "len" bytes starting at "s" with xdl_hash_record(),
 * passing "flags" through unchanged.
 */
unsigned long xdiff_hash_string(const char *s, size_t len, long flags)
{
	const char *cursor = s;
	const char *end = s + len;

	return xdl_hash_record(&cursor, end, flags);
}
/*
 * Compare line l1 (s1 bytes) with line l2 (s2 bytes) using
 * xdl_recmatch() and return its result; "flags" is passed through
 * unchanged.
 */
int xdiff_compare_lines(const char *l1, long s1,
			const char *l2, long s2, long flags)
{
	int matched = xdl_recmatch(l1, s1, l2, s2, flags);

	return matched;
}
/* conflict style chosen via merge.conflictstyle; stays -1 until git_xmerge_config() sets it */
int git_xmerge_style = -1;
/*
 * Config callback: handles "merge.conflictstyle" by setting
 * git_xmerge_style ("diff3" or "merge"; anything else is fatal) and
 * returns 0.  Every other variable is passed on to
 * git_default_config().
 */
int git_xmerge_config(const char *var, const char *value, void *cb)
{
	if (strcmp(var, "merge.conflictstyle") != 0)
		return git_default_config(var, value, cb);

	if (value == NULL)
		die("'%s' is not a boolean", var);

	/*
	 * Please update _git_checkout() in
	 * git-completion.bash when you add new merge config
	 */
	if (strcmp(value, "diff3") == 0)
		git_xmerge_style = XDL_MERGE_DIFF3;
	else if (strcmp(value, "merge") == 0)
		git_xmerge_style = 0;
	else
		die("unknown style '%s' given for '%s'",
		    value, var);
	return 0;
}