fast-import: tighten path unquoting

Path parsing in fast-import is inconsistent and many unquoting errors
are suppressed or not checked.

<path> appears in the grammar in these places:

    filemodify ::= 'M' SP <mode> (<dataref> | 'inline') SP <path> LF
    filedelete ::= 'D' SP <path> LF
    filecopy   ::= 'C' SP <path> SP <path> LF
    filerename ::= 'R' SP <path> SP <path> LF
    ls         ::= 'ls' SP <dataref> SP <path> LF
    ls-commit  ::= 'ls' SP <path> LF

and fast-import.c parses them in five different ways:

1. For filemodify and filedelete:
   Try to unquote <path>. If it unquotes without errors, use the
   unquoted version; otherwise, treat it as literal bytes to the end of
   the line (including any number of SP).
2. For filecopy (source) and filerename (source):
   Try to unquote <path>. If it unquotes without errors, use the
   unquoted version; otherwise, treat it as literal bytes up to, but not
   including, the next SP.
3. For filecopy (dest) and filerename (dest):
   Like 1., but an unquoted empty string is forbidden.
4. For ls:
   If <path> starts with `"`, unquote it and report parse errors;
   otherwise, treat it as literal bytes to the end of the line
   (including any number of SP).
5. For ls-commit:
   Unquote <path> and report parse errors.
   (It must start with `"` to disambiguate from ls.)

In the first three, any errors from trying to unquote a string are
suppressed, so a quoted string that contains invalid escapes would be
interpreted as literal bytes. For example, `"\xff"` would fail to
unquote (because hex escapes are not supported), and it would instead be
interpreted as the byte sequence '"', '\\', 'x', 'f', 'f', '"', which is
certainly not intended. Some front-ends erroneously use their language's
standard quoting routine instead of matching Git's, which could silently
introduce escapes that would be incorrectly parsed due to this and lead
to data corruption.

The documentation states “To use a source path that contains SP the path
must be quoted.”, so it is expected that some implementations depend on
spaces being allowed in paths in the final position. Thus we have two
documented ways to parse paths, so simplify the implementation to that.

Now we have:

1. `parse_path_eol` for filemodify, filedelete, filecopy (dest),
   filerename (dest), ls, and ls-commit:

   If <path> starts with `"`, unquote it and report parse errors;
   otherwise, treat it as literal bytes to the end of the line
   (including any number of SP).

2. `parse_path_space` for filecopy (source) and filerename (source):

   If <path> starts with `"`, unquote it and report parse errors;
   otherwise, treat it as literal bytes up to, but not including, the
   next SP. It must be followed by SP.

There remain two special cases: The dest <path> in filecopy and rename
cannot be an unquoted empty string (this will be addressed subsequently)
and <path> in ls-commit must be quoted to disambiguate it from ls.

Signed-off-by: Thalia Archibald <thalia@archibald.dev>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
This commit is contained in:
Thalia Archibald 2024-04-14 01:11:37 +00:00 committed by Junio C Hamano
parent 91ec36f2cc
commit 0df86b6689
2 changed files with 322 additions and 44 deletions

View File

@ -2258,10 +2258,60 @@ static uintmax_t parse_mark_ref_space(const char **p)
return mark;
}
/*
* Parse the path string into the strbuf. The path can either be quoted with
* escape sequences or unquoted without escape sequences. Unquoted strings may
* contain spaces only if `is_last_field` is nonzero; otherwise, it stops
* parsing at the first space.
*/
static void parse_path(struct strbuf *sb, const char *p, const char **endp,
int is_last_field, const char *field)
{
if (*p == '"') {
if (unquote_c_style(sb, p, endp))
die("Invalid %s: %s", field, command_buf.buf);
} else {
/*
* Unless we are parsing the last field of a line,
* SP is the end of this field.
*/
*endp = is_last_field
? p + strlen(p)
: strchrnul(p, ' ');
strbuf_add(sb, p, *endp - p);
}
}
/*
* Parse the path string into the strbuf, and complain if this is not the end of
* the string. Unquoted strings may contain spaces.
*/
static void parse_path_eol(struct strbuf *sb, const char *p, const char *field)
{
const char *end;
parse_path(sb, p, &end, 1, field);
if (*end)
die("Garbage after %s: %s", field, command_buf.buf);
}
/*
* Parse the path string into the strbuf, and ensure it is followed by a space.
* Unquoted strings may not contain spaces. Update *endp to point to the first
* character after the space.
*/
static void parse_path_space(struct strbuf *sb, const char *p,
const char **endp, const char *field)
{
parse_path(sb, p, endp, 0, field);
if (**endp != ' ')
die("Missing space after %s: %s", field, command_buf.buf);
(*endp)++;
}
static void file_change_m(const char *p, struct branch *b)
{
static struct strbuf uq = STRBUF_INIT;
const char *endp;
struct object_entry *oe;
struct object_id oid;
uint16_t mode, inline_data = 0;
@ -2299,11 +2349,8 @@ static void file_change_m(const char *p, struct branch *b)
}
strbuf_reset(&uq);
if (!unquote_c_style(&uq, p, &endp)) {
if (*endp)
die("Garbage after path in: %s", command_buf.buf);
p = uq.buf;
}
parse_path_eol(&uq, p, "path");
p = uq.buf;
/* Git does not track empty, non-toplevel directories. */
if (S_ISDIR(mode) && is_empty_tree_oid(&oid) && *p) {
@ -2367,48 +2414,29 @@ static void file_change_m(const char *p, struct branch *b)
static void file_change_d(const char *p, struct branch *b)
{
static struct strbuf uq = STRBUF_INIT;
const char *endp;
strbuf_reset(&uq);
if (!unquote_c_style(&uq, p, &endp)) {
if (*endp)
die("Garbage after path in: %s", command_buf.buf);
p = uq.buf;
}
parse_path_eol(&uq, p, "path");
p = uq.buf;
tree_content_remove(&b->branch_tree, p, NULL, 1);
}
static void file_change_cr(const char *s, struct branch *b, int rename)
static void file_change_cr(const char *p, struct branch *b, int rename)
{
const char *d;
const char *s, *d;
static struct strbuf s_uq = STRBUF_INIT;
static struct strbuf d_uq = STRBUF_INIT;
const char *endp;
struct tree_entry leaf;
strbuf_reset(&s_uq);
if (!unquote_c_style(&s_uq, s, &endp)) {
if (*endp != ' ')
die("Missing space after source: %s", command_buf.buf);
} else {
endp = strchr(s, ' ');
if (!endp)
die("Missing space after source: %s", command_buf.buf);
strbuf_add(&s_uq, s, endp - s);
}
parse_path_space(&s_uq, p, &p, "source");
s = s_uq.buf;
endp++;
if (!*endp)
if (!*p)
die("Missing dest: %s", command_buf.buf);
d = endp;
strbuf_reset(&d_uq);
if (!unquote_c_style(&d_uq, d, &endp)) {
if (*endp)
die("Garbage after dest in: %s", command_buf.buf);
d = d_uq.buf;
}
parse_path_eol(&d_uq, p, "dest");
d = d_uq.buf;
memset(&leaf, 0, sizeof(leaf));
if (rename)
@ -3152,6 +3180,7 @@ static void print_ls(int mode, const unsigned char *hash, const char *path)
static void parse_ls(const char *p, struct branch *b)
{
static struct strbuf uq = STRBUF_INIT;
struct tree_entry *root = NULL;
struct tree_entry leaf = {NULL};
@ -3168,16 +3197,9 @@ static void parse_ls(const char *p, struct branch *b)
root->versions[1].mode = S_IFDIR;
load_tree(root);
}
if (*p == '"') {
static struct strbuf uq = STRBUF_INIT;
const char *endp;
strbuf_reset(&uq);
if (unquote_c_style(&uq, p, &endp))
die("Invalid path: %s", command_buf.buf);
if (*endp)
die("Garbage after path in: %s", command_buf.buf);
p = uq.buf;
}
strbuf_reset(&uq);
parse_path_eol(&uq, p, "path");
p = uq.buf;
tree_content_get(root, p, &leaf, 1);
/*
* A directory in preparation would have a sha1 of zero

View File

@ -2142,6 +2142,7 @@ test_expect_success 'Q: deny note on empty branch' '
EOF
test_must_fail git fast-import <input
'
###
### series R (feature and option)
###
@ -2790,7 +2791,7 @@ test_expect_success 'R: blob appears only once' '
'
###
### series S
### series S (mark and path parsing)
###
#
# Make sure missing spaces and EOLs after mark references
@ -3060,6 +3061,261 @@ test_expect_success 'S: ls with garbage after sha1 must fail' '
test_grep "space after tree-ish" err
'
#
# Path parsing
#
# There are two sorts of ways a path can be parsed, depending on whether it is
# the last field on the line. Additionally, ls without a <dataref> has a special
# case. Test every occurrence of <path> in the grammar against every error case.
#
#
# Valid paths at the end of a line: filemodify, filedelete, filecopy (dest),
# filerename (dest), and ls.
#
# commit :301 from root -- modify hello.c (for setup)
# commit :302 from :301 -- modify $path
# commit :303 from :302 -- delete $path
# commit :304 from :301 -- copy hello.c $path
# commit :305 from :301 -- rename hello.c $path
# ls :305 $path
#
test_path_eol_success () {
local test="$1" path="$2" unquoted_path="$3"
test_expect_success "S: paths at EOL with $test must work" '
test_when_finished "git branch -D S-path-eol" &&
git fast-import --export-marks=marks.out <<-EOF >out 2>err &&
blob
mark :401
data <<BLOB
hello world
BLOB
blob
mark :402
data <<BLOB
hallo welt
BLOB
commit refs/heads/S-path-eol
mark :301
committer $GIT_COMMITTER_NAME <$GIT_COMMITTER_EMAIL> $GIT_COMMITTER_DATE
data <<COMMIT
initial commit
COMMIT
M 100644 :401 hello.c
commit refs/heads/S-path-eol
mark :302
committer $GIT_COMMITTER_NAME <$GIT_COMMITTER_EMAIL> $GIT_COMMITTER_DATE
data <<COMMIT
commit filemodify
COMMIT
from :301
M 100644 :402 $path
commit refs/heads/S-path-eol
mark :303
committer $GIT_COMMITTER_NAME <$GIT_COMMITTER_EMAIL> $GIT_COMMITTER_DATE
data <<COMMIT
commit filedelete
COMMIT
from :302
D $path
commit refs/heads/S-path-eol
mark :304
committer $GIT_COMMITTER_NAME <$GIT_COMMITTER_EMAIL> $GIT_COMMITTER_DATE
data <<COMMIT
commit filecopy dest
COMMIT
from :301
C hello.c $path
commit refs/heads/S-path-eol
mark :305
committer $GIT_COMMITTER_NAME <$GIT_COMMITTER_EMAIL> $GIT_COMMITTER_DATE
data <<COMMIT
commit filerename dest
COMMIT
from :301
R hello.c $path
ls :305 $path
EOF
commit_m=$(grep :302 marks.out | cut -d\ -f2) &&
commit_d=$(grep :303 marks.out | cut -d\ -f2) &&
commit_c=$(grep :304 marks.out | cut -d\ -f2) &&
commit_r=$(grep :305 marks.out | cut -d\ -f2) &&
blob1=$(grep :401 marks.out | cut -d\ -f2) &&
blob2=$(grep :402 marks.out | cut -d\ -f2) &&
(
printf "100644 blob $blob2\t$unquoted_path\n" &&
printf "100644 blob $blob1\thello.c\n"
) | sort >tree_m.exp &&
git ls-tree $commit_m | sort >tree_m.out &&
test_cmp tree_m.exp tree_m.out &&
printf "100644 blob $blob1\thello.c\n" >tree_d.exp &&
git ls-tree $commit_d >tree_d.out &&
test_cmp tree_d.exp tree_d.out &&
(
printf "100644 blob $blob1\t$unquoted_path\n" &&
printf "100644 blob $blob1\thello.c\n"
) | sort >tree_c.exp &&
git ls-tree $commit_c | sort >tree_c.out &&
test_cmp tree_c.exp tree_c.out &&
printf "100644 blob $blob1\t$unquoted_path\n" >tree_r.exp &&
git ls-tree $commit_r >tree_r.out &&
test_cmp tree_r.exp tree_r.out &&
test_cmp out tree_r.exp
'
}
test_path_eol_success 'quoted spaces' '" hello world.c "' ' hello world.c '
test_path_eol_success 'unquoted spaces' ' hello world.c ' ' hello world.c '
#
# Valid paths before a space: filecopy (source) and filerename (source).
#
# commit :301 from root -- modify $path (for setup)
# commit :302 from :301 -- copy $path hello2.c
# commit :303 from :301 -- rename $path hello2.c
#
test_path_space_success () {
local test="$1" path="$2" unquoted_path="$3"
test_expect_success "S: paths before space with $test must work" '
test_when_finished "git branch -D S-path-space" &&
git fast-import --export-marks=marks.out <<-EOF 2>err &&
blob
mark :401
data <<BLOB
hello world
BLOB
commit refs/heads/S-path-space
mark :301
committer $GIT_COMMITTER_NAME <$GIT_COMMITTER_EMAIL> $GIT_COMMITTER_DATE
data <<COMMIT
initial commit
COMMIT
M 100644 :401 $path
commit refs/heads/S-path-space
mark :302
committer $GIT_COMMITTER_NAME <$GIT_COMMITTER_EMAIL> $GIT_COMMITTER_DATE
data <<COMMIT
commit filecopy source
COMMIT
from :301
C $path hello2.c
commit refs/heads/S-path-space
mark :303
committer $GIT_COMMITTER_NAME <$GIT_COMMITTER_EMAIL> $GIT_COMMITTER_DATE
data <<COMMIT
commit filerename source
COMMIT
from :301
R $path hello2.c
EOF
commit_c=$(grep :302 marks.out | cut -d\ -f2) &&
commit_r=$(grep :303 marks.out | cut -d\ -f2) &&
blob=$(grep :401 marks.out | cut -d\ -f2) &&
(
printf "100644 blob $blob\t$unquoted_path\n" &&
printf "100644 blob $blob\thello2.c\n"
) | sort >tree_c.exp &&
git ls-tree $commit_c | sort >tree_c.out &&
test_cmp tree_c.exp tree_c.out &&
printf "100644 blob $blob\thello2.c\n" >tree_r.exp &&
git ls-tree $commit_r >tree_r.out &&
test_cmp tree_r.exp tree_r.out
'
}
test_path_space_success 'quoted spaces' '" hello world.c "' ' hello world.c '
test_path_space_success 'no unquoted spaces' 'hello_world.c' 'hello_world.c'
#
# Test a single commit change with an invalid path. Run it with all occurrences
# of <path> in the grammar against all error kinds.
#
test_path_fail () {
local change="$1" what="$2" prefix="$3" path="$4" suffix="$5" err_grep="$6"
test_expect_success "S: $change with $what must fail" '
test_must_fail git fast-import <<-EOF 2>err &&
blob
mark :1
data <<BLOB
hello world
BLOB
commit refs/heads/S-path-fail
mark :2
committer $GIT_COMMITTER_NAME <$GIT_COMMITTER_EMAIL> $GIT_COMMITTER_DATE
data <<COMMIT
commit setup
COMMIT
M 100644 :1 hello.c
commit refs/heads/S-path-fail
committer $GIT_COMMITTER_NAME <$GIT_COMMITTER_EMAIL> $GIT_COMMITTER_DATE
data <<COMMIT
commit with bad path
COMMIT
from :2
$prefix$path$suffix
EOF
test_grep "$err_grep" err
'
}
test_path_base_fail () {
local change="$1" prefix="$2" field="$3" suffix="$4"
test_path_fail "$change" 'unclosed " in '"$field" "$prefix" '"hello.c' "$suffix" "Invalid $field"
test_path_fail "$change" "invalid escape in quoted $field" "$prefix" '"hello\xff"' "$suffix" "Invalid $field"
}
test_path_eol_quoted_fail () {
local change="$1" prefix="$2" field="$3"
test_path_base_fail "$change" "$prefix" "$field" ''
test_path_fail "$change" "garbage after quoted $field" "$prefix" '"hello.c"' 'x' "Garbage after $field"
test_path_fail "$change" "space after quoted $field" "$prefix" '"hello.c"' ' ' "Garbage after $field"
}
test_path_eol_fail () {
local change="$1" prefix="$2" field="$3"
test_path_eol_quoted_fail "$change" "$prefix" "$field"
}
test_path_space_fail () {
local change="$1" prefix="$2" field="$3"
test_path_base_fail "$change" "$prefix" "$field" ' world.c'
test_path_fail "$change" "missing space after quoted $field" "$prefix" '"hello.c"' 'x world.c' "Missing space after $field"
test_path_fail "$change" "missing space after unquoted $field" "$prefix" 'hello.c' '' "Missing space after $field"
}
test_path_eol_fail filemodify 'M 100644 :1 ' path
test_path_eol_fail filedelete 'D ' path
test_path_space_fail filecopy 'C ' source
test_path_eol_fail filecopy 'C hello.c ' dest
test_path_space_fail filerename 'R ' source
test_path_eol_fail filerename 'R hello.c ' dest
test_path_eol_fail 'ls (in commit)' 'ls :2 ' path
# When 'ls' has no <dataref>, the <path> must be quoted.
test_path_eol_quoted_fail 'ls (without dataref in commit)' 'ls ' path
###
### series T (ls)
###