2006-12-23 05:06:08 +08:00
|
|
|
#include "git-compat-util.h"
|
2008-11-11 01:47:00 +08:00
|
|
|
#include "strbuf.h"
|
2006-12-23 05:06:08 +08:00
|
|
|
#include "utf8.h"
|
|
|
|
|
2023-11-24 11:35:13 +08:00
|
|
|
/* This code is originally from https://www.cl.cam.ac.uk/~mgk25/ucs/ */
|
2006-12-23 05:06:08 +08:00
|
|
|
|
Support working-tree-encoding "UTF-16LE-BOM"
Users who want UTF-16 files in the working tree set the .gitattributes
like this:
test.txt working-tree-encoding=UTF-16
The unicode standard itself defines 3 allowed ways how to encode UTF-16.
The following 3 versions convert all back to 'g' 'i' 't' in UTF-8:
a) UTF-16, without BOM, big endian:
$ printf "\000g\000i\000t" | iconv -f UTF-16 -t UTF-8 | od -c
0000000 g i t
b) UTF-16, with BOM, little endian:
$ printf "\377\376g\000i\000t\000" | iconv -f UTF-16 -t UTF-8 | od -c
0000000 g i t
c) UTF-16, with BOM, big endian:
$ printf "\376\377\000g\000i\000t" | iconv -f UTF-16 -t UTF-8 | od -c
0000000 g i t
Git uses libiconv to convert from UTF-8 in the index into UTF-16 in the
working tree.
After a checkout, the resulting file has a BOM and is encoded in "UTF-16",
in the version (c) above.
This is what iconv generates, more details follow below.
iconv (and libiconv) can generate UTF-16, UTF-16LE or UTF-16BE:
d) UTF-16
$ printf 'git' | iconv -f UTF-8 -t UTF-16 | od -c
0000000 376 377 \0 g \0 i \0 t
e) UTF-16LE
$ printf 'git' | iconv -f UTF-8 -t UTF-16LE | od -c
0000000 g \0 i \0 t \0
f) UTF-16BE
$ printf 'git' | iconv -f UTF-8 -t UTF-16BE | od -c
0000000 \0 g \0 i \0 t
There is no way to generate version (b) from above in a Git working tree,
but that is what some applications need.
(All fully unicode aware applications should be able to read all 3 variants,
but in practice we are not there yet).
When producing UTF-16 as an output, iconv generates the big endian version
with a BOM. (big endian is probably chosen for historical reasons).
iconv can produce UTF-16 files with little endianness by using "UTF-16LE"
as encoding, and that file does not have a BOM.
Not all users (especially under Windows) are happy with this.
Some tools are not fully unicode aware and can only handle version (b).
Today there is no way to produce version (b) with iconv (or libiconv).
Looking into the history of iconv, it seems as if version (c) will
be used in all future iconv versions (for compatibility reasons).
Solve this dilemma and introduce a Git-specific "UTF-16LE-BOM".
libiconv cannot handle the encoding, so Git picks it up, handles the BOM
and uses libiconv to convert the rest of the stream.
(UTF-16BE-BOM is added for consistency)
Reported-by: Adrián Gimeno Balaguer <adrigibal@gmail.com>
Signed-off-by: Torsten Bögershausen <tboegi@web.de>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-01-30 23:01:52 +08:00
|
|
|
/*
 * Byte-order-mark byte sequences for UTF-16 and UTF-32, in big-endian (be)
 * and little-endian (le) byte order.  The arrays are deliberately not
 * NUL-terminated; use sizeof() to obtain their length.
 */
static const char utf16_be_bom[] = { '\xFE', '\xFF' };
static const char utf16_le_bom[] = { '\xFF', '\xFE' };
static const char utf32_be_bom[] = { '\0', '\0', '\xFE', '\xFF' };
static const char utf32_le_bom[] = { '\xFF', '\xFE', '\0', '\0' };
|
|
|
|
|
2006-12-23 05:06:08 +08:00
|
|
|
struct interval {
|
2014-02-17 00:06:04 +08:00
|
|
|
ucs_char_t first;
|
|
|
|
ucs_char_t last;
|
2006-12-23 05:06:08 +08:00
|
|
|
};
|
|
|
|
|
2013-04-19 07:08:52 +08:00
|
|
|
size_t display_mode_esc_sequence_len(const char *s)
|
2013-04-19 07:08:44 +08:00
|
|
|
{
|
|
|
|
const char *p = s;
|
|
|
|
if (*p++ != '\033')
|
|
|
|
return 0;
|
|
|
|
if (*p++ != '[')
|
|
|
|
return 0;
|
|
|
|
while (isdigit(*p) || *p == ';')
|
|
|
|
p++;
|
|
|
|
if (*p++ != 'm')
|
|
|
|
return 0;
|
|
|
|
return p - s;
|
|
|
|
}
|
|
|
|
|
2006-12-23 05:06:08 +08:00
|
|
|
/* auxiliary function for binary search in interval table */
|
2007-11-09 07:35:32 +08:00
|
|
|
static int bisearch(ucs_char_t ucs, const struct interval *table, int max)
|
|
|
|
{
|
2006-12-23 05:06:08 +08:00
|
|
|
int min = 0;
|
|
|
|
int mid;
|
|
|
|
|
|
|
|
if (ucs < table[0].first || ucs > table[max].last)
|
|
|
|
return 0;
|
|
|
|
while (max >= min) {
|
2017-10-09 02:29:37 +08:00
|
|
|
mid = min + (max - min) / 2;
|
2006-12-23 05:06:08 +08:00
|
|
|
if (ucs > table[mid].last)
|
|
|
|
min = mid + 1;
|
|
|
|
else if (ucs < table[mid].first)
|
|
|
|
max = mid - 1;
|
|
|
|
else
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* The following two functions define the column width of an ISO 10646
|
|
|
|
* character as follows:
|
|
|
|
*
|
|
|
|
* - The null character (U+0000) has a column width of 0.
|
|
|
|
*
|
|
|
|
* - Other C0/C1 control characters and DEL will lead to a return
|
|
|
|
* value of -1.
|
|
|
|
*
|
|
|
|
* - Non-spacing and enclosing combining characters (general
|
|
|
|
* category code Mn or Me in the Unicode database) have a
|
|
|
|
* column width of 0.
|
|
|
|
*
|
|
|
|
* - SOFT HYPHEN (U+00AD) has a column width of 1.
|
|
|
|
*
|
|
|
|
* - Other format characters (general category code Cf in the Unicode
|
|
|
|
* database) and ZERO WIDTH SPACE (U+200B) have a column width of 0.
|
|
|
|
*
|
|
|
|
* - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF)
|
|
|
|
* have a column width of 0.
|
|
|
|
*
|
|
|
|
* - Spacing characters in the East Asian Wide (W) or East Asian
|
|
|
|
* Full-width (F) category as defined in Unicode Technical
|
|
|
|
* Report #11 have a column width of 2.
|
|
|
|
*
|
|
|
|
* - All remaining characters (including all printable
|
|
|
|
* ISO 8859-1 and WGL4 characters, Unicode control characters,
|
|
|
|
* etc.) have a column width of 1.
|
|
|
|
*
|
2007-03-04 02:28:57 +08:00
|
|
|
* This implementation assumes that ucs_char_t characters are encoded
|
2006-12-23 05:06:08 +08:00
|
|
|
* in ISO 10646.
|
|
|
|
*/
|
|
|
|
|
2007-05-08 12:46:08 +08:00
|
|
|
static int git_wcwidth(ucs_char_t ch)
|
2006-12-23 05:06:08 +08:00
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Sorted list of non-overlapping intervals of non-spacing characters,
|
|
|
|
*/
|
2018-04-11 05:26:17 +08:00
|
|
|
#include "unicode-width.h"
|
2006-12-23 05:06:08 +08:00
|
|
|
|
|
|
|
/* test for 8-bit control characters */
|
|
|
|
if (ch == 0)
|
|
|
|
return 0;
|
|
|
|
if (ch < 32 || (ch >= 0x7f && ch < 0xa0))
|
|
|
|
return -1;
|
|
|
|
|
|
|
|
/* binary search in table of non-spacing characters */
|
2019-10-12 02:41:23 +08:00
|
|
|
if (bisearch(ch, zero_width, ARRAY_SIZE(zero_width) - 1))
|
2006-12-23 05:06:08 +08:00
|
|
|
return 0;
|
|
|
|
|
2014-05-10 05:51:38 +08:00
|
|
|
/* binary search in table of double width characters */
|
2019-10-12 02:41:23 +08:00
|
|
|
if (bisearch(ch, double_width, ARRAY_SIZE(double_width) - 1))
|
2014-05-10 05:51:38 +08:00
|
|
|
return 2;
|
2006-12-23 05:06:08 +08:00
|
|
|
|
2014-05-10 05:51:38 +08:00
|
|
|
return 1;
|
2006-12-23 05:06:08 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2008-01-07 11:02:22 +08:00
|
|
|
* Pick one ucs character starting from the location *start points at,
|
|
|
|
* and return it, while updating the *start pointer to point at the
|
2008-01-02 17:49:58 +08:00
|
|
|
* end of that character. When remainder_p is not NULL, the location
|
|
|
|
* holds the number of bytes remaining in the string that we are allowed
|
|
|
|
* to pick from. Otherwise we are allowed to pick up to the NUL that
|
|
|
|
* would eventually appear in the string. *remainder_p is also reduced
|
|
|
|
* by the number of bytes we have consumed.
|
2008-01-07 11:02:22 +08:00
|
|
|
*
|
|
|
|
* If the string was not a valid UTF-8, *start pointer is set to NULL
|
|
|
|
* and the return value is undefined.
|
2006-12-23 05:06:08 +08:00
|
|
|
*/
|
2010-01-12 14:32:29 +08:00
|
|
|
static ucs_char_t pick_one_utf8_char(const char **start, size_t *remainder_p)
|
2006-12-23 05:06:08 +08:00
|
|
|
{
|
|
|
|
unsigned char *s = (unsigned char *)*start;
|
2007-03-04 02:28:57 +08:00
|
|
|
ucs_char_t ch;
|
2008-01-02 17:49:58 +08:00
|
|
|
size_t remainder, incr;
|
2006-12-23 05:06:08 +08:00
|
|
|
|
2008-01-02 17:49:58 +08:00
|
|
|
/*
|
|
|
|
* A caller that assumes NUL terminated text can choose
|
|
|
|
* not to bother with the remainder length. We will
|
|
|
|
* stop at the first NUL.
|
|
|
|
*/
|
|
|
|
remainder = (remainder_p ? *remainder_p : 999);
|
|
|
|
|
|
|
|
if (remainder < 1) {
|
|
|
|
goto invalid;
|
|
|
|
} else if (*s < 0x80) {
|
2006-12-23 05:06:08 +08:00
|
|
|
/* 0xxxxxxx */
|
|
|
|
ch = *s;
|
2008-01-02 17:49:58 +08:00
|
|
|
incr = 1;
|
2006-12-23 05:06:08 +08:00
|
|
|
} else if ((s[0] & 0xe0) == 0xc0) {
|
|
|
|
/* 110XXXXx 10xxxxxx */
|
2008-01-02 17:49:58 +08:00
|
|
|
if (remainder < 2 ||
|
|
|
|
(s[1] & 0xc0) != 0x80 ||
|
|
|
|
(s[0] & 0xfe) == 0xc0)
|
2006-12-23 05:06:08 +08:00
|
|
|
goto invalid;
|
|
|
|
ch = ((s[0] & 0x1f) << 6) | (s[1] & 0x3f);
|
2008-01-02 17:49:58 +08:00
|
|
|
incr = 2;
|
2006-12-23 05:06:08 +08:00
|
|
|
} else if ((s[0] & 0xf0) == 0xe0) {
|
|
|
|
/* 1110XXXX 10Xxxxxx 10xxxxxx */
|
2008-01-02 17:49:58 +08:00
|
|
|
if (remainder < 3 ||
|
|
|
|
(s[1] & 0xc0) != 0x80 ||
|
|
|
|
(s[2] & 0xc0) != 0x80 ||
|
|
|
|
/* overlong? */
|
|
|
|
(s[0] == 0xe0 && (s[1] & 0xe0) == 0x80) ||
|
|
|
|
/* surrogate? */
|
|
|
|
(s[0] == 0xed && (s[1] & 0xe0) == 0xa0) ||
|
|
|
|
/* U+FFFE or U+FFFF? */
|
|
|
|
(s[0] == 0xef && s[1] == 0xbf &&
|
|
|
|
(s[2] & 0xfe) == 0xbe))
|
2006-12-23 05:06:08 +08:00
|
|
|
goto invalid;
|
|
|
|
ch = ((s[0] & 0x0f) << 12) |
|
|
|
|
((s[1] & 0x3f) << 6) | (s[2] & 0x3f);
|
2008-01-02 17:49:58 +08:00
|
|
|
incr = 3;
|
2006-12-23 05:06:08 +08:00
|
|
|
} else if ((s[0] & 0xf8) == 0xf0) {
|
|
|
|
/* 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx */
|
2008-01-02 17:49:58 +08:00
|
|
|
if (remainder < 4 ||
|
|
|
|
(s[1] & 0xc0) != 0x80 ||
|
|
|
|
(s[2] & 0xc0) != 0x80 ||
|
|
|
|
(s[3] & 0xc0) != 0x80 ||
|
|
|
|
/* overlong? */
|
|
|
|
(s[0] == 0xf0 && (s[1] & 0xf0) == 0x80) ||
|
|
|
|
/* > U+10FFFF? */
|
|
|
|
(s[0] == 0xf4 && s[1] > 0x8f) || s[0] > 0xf4)
|
2006-12-23 05:06:08 +08:00
|
|
|
goto invalid;
|
|
|
|
ch = ((s[0] & 0x07) << 18) | ((s[1] & 0x3f) << 12) |
|
|
|
|
((s[2] & 0x3f) << 6) | (s[3] & 0x3f);
|
2008-01-02 17:49:58 +08:00
|
|
|
incr = 4;
|
2006-12-23 05:06:08 +08:00
|
|
|
} else {
|
|
|
|
invalid:
|
|
|
|
*start = NULL;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2008-01-02 17:49:58 +08:00
|
|
|
*start += incr;
|
|
|
|
if (remainder_p)
|
|
|
|
*remainder_p = remainder - incr;
|
2008-01-07 11:02:22 +08:00
|
|
|
return ch;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This function returns the number of columns occupied by the character
|
|
|
|
* pointed to by the variable start. The pointer is updated to point at
|
2008-01-02 17:49:58 +08:00
|
|
|
* the next character. When remainder_p is not NULL, it points at the
|
|
|
|
* location that stores the number of remaining bytes we can use to pick
|
|
|
|
* a character (see pick_one_utf8_char() above).
|
2008-01-07 11:02:22 +08:00
|
|
|
*/
|
2008-01-02 17:49:58 +08:00
|
|
|
int utf8_width(const char **start, size_t *remainder_p)
|
2008-01-07 11:02:22 +08:00
|
|
|
{
|
2008-01-02 17:49:58 +08:00
|
|
|
ucs_char_t ch = pick_one_utf8_char(start, remainder_p);
|
2008-01-07 11:02:22 +08:00
|
|
|
if (!*start)
|
|
|
|
return 0;
|
2007-05-08 12:46:08 +08:00
|
|
|
return git_wcwidth(ch);
|
2006-12-23 05:06:08 +08:00
|
|
|
}
|
|
|
|
|
2009-01-30 17:41:28 +08:00
|
|
|
/*
 * Returns the total number of columns required by the first len bytes
 * of string, assuming that they are UTF-8.  Returns len instead if the
 * bytes do not look like valid UTF-8.
 *
 * When skip_ansi is non-zero, display mode escape sequences are stepped
 * over and do not count towards the width.  Characters for which
 * utf8_width() reports a non-positive width add nothing.
 *
 * NOTE(review): characters are decoded without a byte limit, so this
 * presumably expects a NUL somewhere at or after string + len -- confirm
 * against the callers.
 */
int utf8_strnwidth(const char *string, size_t len, int skip_ansi)
{
	const char *end = string + len;
	const char *p = string;
	size_t columns = 0;

	while (p && p < end) {
		int w;

		if (skip_ansi) {
			size_t esc;

			while ((esc = display_mode_esc_sequence_len(p)) != 0)
				p += esc;
		}

		w = utf8_width(&p, NULL);
		if (w > 0)
			columns += w;
	}

	/*
	 * TODO: fix the interface of this function and `utf8_strwidth()` to
	 * return `size_t` instead of `int`.
	 */
	return cast_size_t_to_int(p ? columns : len);
}
|
|
|
|
|
|
|
|
/*
 * Display width of a NUL-terminated string: utf8_strnwidth() applied to
 * the whole string, without skipping escape sequences.
 */
int utf8_strwidth(const char *string)
{
	size_t len = strlen(string);

	return utf8_strnwidth(string, len, 0);
}
|
|
|
|
|
2006-12-23 05:06:08 +08:00
|
|
|
/*
 * Returns 1 if the NUL-terminated text decodes as UTF-8 from start to
 * end, 0 as soon as a malformed sequence is met.
 */
int is_utf8(const char *text)
{
	const char *p = text;

	/* utf8_width() sets p to NULL when it hits a malformed sequence. */
	while (p && *p) {
		switch (*p) {
		case '\n':
		case '\t':
		case '\r':
			/* plain whitespace: nothing to decode */
			p++;
			break;
		default:
			utf8_width(&p, NULL);
			break;
		}
	}

	return p != NULL;
}
|
|
|
|
|
2009-11-23 00:15:29 +08:00
|
|
|
/*
 * Append text to buf one line at a time without any wrapping, putting
 * indent spaces in front of the first line and indent2 spaces in front
 * of every following line.  A negative indent is treated as 0.
 */
static void strbuf_add_indented_text(struct strbuf *buf, const char *text,
				     int indent, int indent2)
{
	int pad = indent < 0 ? 0 : indent;
	const char *line, *next;

	for (line = text; *line; line = next, pad = indent2) {
		next = strchrnul(line, '\n');
		if (*next == '\n')
			next++;	/* the newline stays with its line */

		strbuf_addchars(buf, ' ', pad);
		strbuf_add(buf, line, next - line);
	}
}
|
|
|
|
|
2006-12-23 05:06:08 +08:00
|
|
|
/*
|
|
|
|
* Wrap the text, if necessary. The variable indent is the indent for the
|
|
|
|
* first line, indent2 is the indent for all other lines.
|
2007-02-27 23:20:31 +08:00
|
|
|
* If indent is negative, assume that already -indent columns have been
|
|
|
|
* consumed (and no extra indent is necessary for the first line).
|
2006-12-23 05:06:08 +08:00
|
|
|
*/
|
2012-12-11 13:59:22 +08:00
|
|
|
void strbuf_add_wrapped_text(struct strbuf *buf,
|
2010-02-20 06:20:44 +08:00
|
|
|
const char *text, int indent1, int indent2, int width)
|
2006-12-23 05:06:08 +08:00
|
|
|
{
|
2010-02-20 06:20:44 +08:00
|
|
|
int indent, w, assume_utf8 = 1;
|
|
|
|
const char *bol, *space, *start = text;
|
|
|
|
size_t orig_len = buf->len;
|
2006-12-23 05:06:08 +08:00
|
|
|
|
2009-10-19 14:40:35 +08:00
|
|
|
if (width <= 0) {
|
2010-02-20 06:20:44 +08:00
|
|
|
strbuf_add_indented_text(buf, text, indent1, indent2);
|
2012-12-11 13:59:22 +08:00
|
|
|
return;
|
2009-10-19 14:40:35 +08:00
|
|
|
}
|
|
|
|
|
2010-02-20 06:20:44 +08:00
|
|
|
retry:
|
|
|
|
bol = text;
|
|
|
|
w = indent = indent1;
|
|
|
|
space = NULL;
|
2007-02-27 23:20:31 +08:00
|
|
|
if (indent < 0) {
|
|
|
|
w = -indent;
|
|
|
|
space = text;
|
|
|
|
}
|
|
|
|
|
2006-12-23 05:06:08 +08:00
|
|
|
for (;;) {
|
2009-11-24 06:40:03 +08:00
|
|
|
char c;
|
|
|
|
size_t skip;
|
|
|
|
|
|
|
|
while ((skip = display_mode_esc_sequence_len(text)))
|
|
|
|
text += skip;
|
|
|
|
|
|
|
|
c = *text;
|
2006-12-23 05:06:08 +08:00
|
|
|
if (!c || isspace(c)) {
|
utf8: fix off-by-one wrapping of text
The wrapping logic in strbuf_add_wrapped_text() does currently not allow
lines that entirely fill the allowed width, instead it wraps the line one
character too early.
For example, the text "This is the sixth commit." formatted via
"%w(11,1,2)" (wrap at 11 characters, 1 char indent of first line, 2 char
indent of following lines) results in four lines: " This is", " the",
" sixth", " commit." This is wrong, because " the sixth" is exactly
11 characters long, and thus allowed.
Fix this by allowing the (width+1) character of a line to be a valid
wrapping point if it is a whitespace character.
Signed-off-by: Jan H. Schönherr <schnhrr@cs.tu-berlin.de>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-10-18 22:43:28 +08:00
|
|
|
if (w <= width || !space) {
|
2006-12-23 05:06:08 +08:00
|
|
|
const char *start = bol;
|
2007-11-11 22:14:15 +08:00
|
|
|
if (!c && text == start)
|
2012-12-11 13:59:22 +08:00
|
|
|
return;
|
2006-12-23 05:06:08 +08:00
|
|
|
if (space)
|
|
|
|
start = space;
|
|
|
|
else
|
2010-02-20 06:15:55 +08:00
|
|
|
strbuf_addchars(buf, ' ', indent);
|
2010-02-20 06:16:45 +08:00
|
|
|
strbuf_add(buf, start, text - start);
|
2007-02-27 23:20:31 +08:00
|
|
|
if (!c)
|
2012-12-11 13:59:22 +08:00
|
|
|
return;
|
2006-12-23 05:06:08 +08:00
|
|
|
space = text;
|
2007-11-11 22:14:15 +08:00
|
|
|
if (c == '\t')
|
|
|
|
w |= 0x07;
|
|
|
|
else if (c == '\n') {
|
|
|
|
space++;
|
|
|
|
if (*space == '\n') {
|
2010-02-20 06:16:45 +08:00
|
|
|
strbuf_addch(buf, '\n');
|
2007-11-11 22:14:15 +08:00
|
|
|
goto new_line;
|
|
|
|
}
|
|
|
|
else if (!isalnum(*space))
|
|
|
|
goto new_line;
|
|
|
|
else
|
2010-02-20 06:16:45 +08:00
|
|
|
strbuf_addch(buf, ' ');
|
2007-11-11 22:14:15 +08:00
|
|
|
}
|
2006-12-23 05:06:08 +08:00
|
|
|
w++;
|
|
|
|
text++;
|
|
|
|
}
|
|
|
|
else {
|
2007-11-11 22:14:15 +08:00
|
|
|
new_line:
|
2010-02-20 06:16:45 +08:00
|
|
|
strbuf_addch(buf, '\n');
|
2007-03-02 22:28:00 +08:00
|
|
|
text = bol = space + isspace(*space);
|
2006-12-23 05:06:08 +08:00
|
|
|
space = NULL;
|
|
|
|
w = indent = indent2;
|
|
|
|
}
|
|
|
|
continue;
|
|
|
|
}
|
2010-02-20 06:20:44 +08:00
|
|
|
if (assume_utf8) {
|
2008-01-02 17:49:58 +08:00
|
|
|
w += utf8_width(&text, NULL);
|
2010-02-20 06:20:44 +08:00
|
|
|
if (!text) {
|
|
|
|
assume_utf8 = 0;
|
|
|
|
text = start;
|
|
|
|
strbuf_setlen(buf, orig_len);
|
|
|
|
goto retry;
|
|
|
|
}
|
|
|
|
} else {
|
2006-12-23 05:06:08 +08:00
|
|
|
w++;
|
|
|
|
text++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2006-12-24 15:36:55 +08:00
|
|
|
|
2012-12-11 13:59:22 +08:00
|
|
|
/*
 * Like strbuf_add_wrapped_text(), for data given as a pointer and a
 * length: a temporary copy made with xstrndup() is wrapped and freed
 * again afterwards.
 */
void strbuf_add_wrapped_bytes(struct strbuf *buf, const char *data, int len,
			     int indent, int indent2, int width)
{
	char *copy = xstrndup(data, len);

	strbuf_add_wrapped_text(buf, copy, indent, indent2, width);
	free(copy);
}
|
|
|
|
|
2013-04-19 07:08:51 +08:00
|
|
|
void strbuf_utf8_replace(struct strbuf *sb_src, int pos, int width,
|
|
|
|
const char *subst)
|
|
|
|
{
|
2022-12-01 22:47:15 +08:00
|
|
|
const char *src = sb_src->buf, *end = sb_src->buf + sb_src->len;
|
|
|
|
struct strbuf dst;
|
|
|
|
int w = 0;
|
2013-04-19 07:08:51 +08:00
|
|
|
|
2022-12-01 22:47:15 +08:00
|
|
|
strbuf_init(&dst, sb_src->len);
|
2013-04-19 07:08:51 +08:00
|
|
|
|
|
|
|
while (src < end) {
|
2022-12-01 22:47:15 +08:00
|
|
|
const char *old;
|
2022-12-01 22:47:10 +08:00
|
|
|
int glyph_width;
|
2013-04-19 07:08:51 +08:00
|
|
|
size_t n;
|
|
|
|
|
|
|
|
while ((n = display_mode_esc_sequence_len(src))) {
|
2022-12-01 22:47:15 +08:00
|
|
|
strbuf_add(&dst, src, n);
|
2013-04-19 07:08:51 +08:00
|
|
|
src += n;
|
|
|
|
}
|
|
|
|
|
2014-08-10 15:05:21 +08:00
|
|
|
if (src >= end)
|
|
|
|
break;
|
|
|
|
|
2013-04-19 07:08:51 +08:00
|
|
|
old = src;
|
2022-12-01 22:47:10 +08:00
|
|
|
glyph_width = utf8_width((const char**)&src, NULL);
|
|
|
|
if (!src) /* broken utf-8, do nothing */
|
2017-08-31 02:20:16 +08:00
|
|
|
goto out;
|
2022-12-01 22:47:10 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* In case we see a control character we copy it into the
|
|
|
|
* buffer, but don't add it to the width.
|
|
|
|
*/
|
|
|
|
if (glyph_width < 0)
|
|
|
|
glyph_width = 0;
|
|
|
|
|
|
|
|
if (glyph_width && w >= pos && w < pos + width) {
|
2013-04-19 07:08:51 +08:00
|
|
|
if (subst) {
|
2022-12-01 22:47:15 +08:00
|
|
|
strbuf_addstr(&dst, subst);
|
2013-04-19 07:08:51 +08:00
|
|
|
subst = NULL;
|
|
|
|
}
|
2022-12-01 22:47:15 +08:00
|
|
|
} else {
|
|
|
|
strbuf_add(&dst, old, src - old);
|
2013-04-19 07:08:51 +08:00
|
|
|
}
|
2022-12-01 22:47:15 +08:00
|
|
|
|
2022-12-01 22:47:10 +08:00
|
|
|
w += glyph_width;
|
2013-04-19 07:08:51 +08:00
|
|
|
}
|
2022-12-01 22:47:15 +08:00
|
|
|
|
|
|
|
strbuf_swap(sb_src, &dst);
|
2017-08-31 02:20:16 +08:00
|
|
|
out:
|
2022-12-01 22:47:15 +08:00
|
|
|
strbuf_release(&dst);
|
2013-04-19 07:08:51 +08:00
|
|
|
}
|
|
|
|
|
2018-04-16 02:16:04 +08:00
|
|
|
/*
 * Returns true (1) if both src and dst name the same UTF encoding,
 * comparing case-insensitively and ignoring an optional dash after the
 * "UTF" prefix.  E.g. UTF-16BE is the same as UTF16BE.  Returns 0 when
 * either name does not start with "UTF".
 */
static int same_utf_encoding(const char *src, const char *dst)
{
	if (!skip_iprefix(src, "utf", &src) || !skip_iprefix(dst, "utf", &dst))
		return 0;

	skip_prefix(src, "-", &src);
	skip_prefix(dst, "-", &dst);
	return !strcasecmp(src, dst);
}
|
|
|
|
|
2006-12-31 04:20:43 +08:00
|
|
|
/*
 * Returns 1 if name denotes UTF-8 (in any spelling accepted by
 * same_utf_encoding()) or is NULL, and 0 otherwise.
 */
int is_encoding_utf8(const char *name)
{
	/* no encoding name at all counts as UTF-8 */
	return !name || same_utf_encoding("utf-8", name);
}
|
|
|
|
|
2012-10-19 13:41:56 +08:00
|
|
|
/*
 * Returns 1 if src and dst name the same encoding, 0 otherwise.  A NULL
 * name stands for UTF-8.  Alternative spellings of UTF encodings are
 * recognized; anything else is compared case-insensitively.
 */
int same_encoding(const char *src, const char *dst)
{
	static const char utf8[] = "UTF-8";
	const char *a = src ? src : utf8;
	const char *b = dst ? dst : utf8;

	return same_utf_encoding(a, b) || !strcasecmp(a, b);
}
|
|
|
|
|
2013-02-09 14:31:09 +08:00
|
|
|
/*
|
|
|
|
* Wrapper for fprintf and returns the total number of columns required
|
|
|
|
* for the printed string, assuming that the string is utf8.
|
|
|
|
*/
|
|
|
|
int utf8_fprintf(FILE *stream, const char *format, ...)
|
|
|
|
{
|
|
|
|
struct strbuf buf = STRBUF_INIT;
|
|
|
|
va_list arg;
|
|
|
|
int columns;
|
|
|
|
|
|
|
|
va_start(arg, format);
|
|
|
|
strbuf_vaddf(&buf, format, arg);
|
|
|
|
va_end(arg);
|
|
|
|
|
|
|
|
columns = fputs(buf.buf, stream);
|
|
|
|
if (0 <= columns) /* keep the error from the I/O */
|
|
|
|
columns = utf8_strwidth(buf.buf);
|
|
|
|
strbuf_release(&buf);
|
|
|
|
return columns;
|
|
|
|
}
|
|
|
|
|
2006-12-24 15:36:55 +08:00
|
|
|
/*
|
|
|
|
* Given a buffer and its encoding, return it re-encoded
|
|
|
|
* with iconv. If the conversion fails, returns NULL.
|
|
|
|
*/
|
|
|
|
#ifndef NO_ICONV
|
2009-06-06 07:36:12 +08:00
|
|
|
/*
 * iconv_ibp is the pointer type used for the iconv input buffer.  It is
 * const-qualified when OLD_ICONV is defined, or on __sun__ without
 * _XPG6 -- presumably to match how iconv() is declared there.
 */
#if defined(OLD_ICONV) || (defined(__sun__) && !defined(_XPG6))
typedef const char *iconv_ibp;
#else
typedef char *iconv_ibp;
#endif
|
Support working-tree-encoding "UTF-16LE-BOM"
Users who want UTF-16 files in the working tree set the .gitattributes
like this:
test.txt working-tree-encoding=UTF-16
The unicode standard itself defines 3 allowed ways how to encode UTF-16.
The following 3 versions convert all back to 'g' 'i' 't' in UTF-8:
a) UTF-16, without BOM, big endian:
$ printf "\000g\000i\000t" | iconv -f UTF-16 -t UTF-8 | od -c
0000000 g i t
b) UTF-16, with BOM, little endian:
$ printf "\377\376g\000i\000t\000" | iconv -f UTF-16 -t UTF-8 | od -c
0000000 g i t
c) UTF-16, with BOM, big endian:
$ printf "\376\377\000g\000i\000t" | iconv -f UTF-16 -t UTF-8 | od -c
0000000 g i t
Git uses libiconv to convert from UTF-8 in the index into UTF-16 in the
working tree.
After a checkout, the resulting file has a BOM and is encoded in "UTF-16",
in the version (c) above.
This is what iconv generates, more details follow below.
iconv (and libiconv) can generate UTF-16, UTF-16LE or UTF-16BE:
d) UTF-16
$ printf 'git' | iconv -f UTF-8 -t UTF-16 | od -c
0000000 376 377 \0 g \0 i \0 t
e) UTF-16LE
$ printf 'git' | iconv -f UTF-8 -t UTF-16LE | od -c
0000000 g \0 i \0 t \0
f) UTF-16BE
$ printf 'git' | iconv -f UTF-8 -t UTF-16BE | od -c
0000000 \0 g \0 i \0 t
There is no way to generate version (b) from above in a Git working tree,
but that is what some applications need.
(All fully unicode aware applications should be able to read all 3 variants,
but in practise we are not there yet).
When producing UTF-16 as an output, iconv generates the big endian version
with a BOM. (big endian is probably chosen for historical reasons).
iconv can produce UTF-16 files with little endianess by using "UTF-16LE"
as encoding, and that file does not have a BOM.
Not all users (especially under Windows) are happy with this.
Some tools are not fully unicode aware and can only handle version (b).
Today there is no way to produce version (b) with iconv (or libiconv).
Looking into the history of iconv, it seems as if version (c) will
be used in all future iconv versions (for compatibility reasons).
Solve this dilemma and introduce a Git-specific "UTF-16LE-BOM".
libiconv cannot handle the encoding, so Git picks it up, handles the BOM
and uses libiconv to convert the rest of the stream.
(UTF-16BE-BOM is added for consistency)
Rported-by: Adrián Gimeno Balaguer <adrigibal@gmail.com>
Signed-off-by: Torsten Bögershausen <tboegi@web.de>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-01-30 23:01:52 +08:00
|
|
|
char *reencode_string_iconv(const char *in, size_t insz, iconv_t conv,
|
|
|
|
size_t bom_len, size_t *outsz_p)
|
2006-12-24 15:36:55 +08:00
|
|
|
{
|
git on Mac OS and precomposed unicode
Mac OS X mangles file names containing unicode on file systems HFS+,
VFAT or SAMBA. When a file using unicode code points outside ASCII
is created on a HFS+ drive, the file name is converted into
decomposed unicode and written to disk. No conversion is done if
the file name is already decomposed unicode.
Calling open("\xc3\x84", ...) with a precomposed "Ä" yields the same
result as open("\x41\xcc\x88",...) with a decomposed "Ä".
As a consequence, readdir() returns the file names in decomposed
unicode, even if the user expects precomposed unicode. Unlike on
HFS+, Mac OS X stores files on a VFAT drive (e.g. an USB drive) in
precomposed unicode, but readdir() still returns file names in
decomposed unicode. When a git repository is stored on a network
share using SAMBA, file names are sent over the wire and written to
disk on the remote system in precomposed unicode, but Mac OS X
readdir() returns decomposed unicode to be compatible with its
behaviour on HFS+ and VFAT.
The unicode decomposition causes many problems:
- The names "git add" and other commands get from the end user may
often be precomposed form (the decomposed form is not easily input
from the keyboard), but when the commands read from the filesystem
to see what it is going to update the index with already is on the
filesystem, readdir() will give decomposed form, which is different.
- Similarly "git log", "git mv" and all other commands that need to
compare pathnames found on the command line (often but not always
precomposed form; a command line input resulting from globbing may
be in decomposed) with pathnames found in the tree objects (should
be precomposed form to be compatible with other systems and for
consistency in general).
- The same for names stored in the index, which should be
precomposed, that may need to be compared with the names read from
readdir().
NFS mounted from Linux is fully transparent and does not suffer from
the above.
As Mac OS X treats precomposed and decomposed file names as equal,
we can
- wrap readdir() on Mac OS X to return the precomposed form, and
- normalize decomposed form given from the command line also to the
precomposed form,
to ensure that all pathnames used in Git are always in the
precomposed form. This behaviour can be requested by setting
"core.precomposedunicode" configuration variable to true.
The code in compat/precomposed_utf8.c implements basically 4 new
functions: precomposed_utf8_opendir(), precomposed_utf8_readdir(),
precomposed_utf8_closedir() and precompose_argv(). The first three
are to wrap opendir(3), readdir(3), and closedir(3) functions.
The argv[] conversion allows to use the TAB filename completion done
by the shell on command line. It tolerates other tools which use
readdir() to feed decomposed file names into git.
When creating a new git repository with "git init" or "git clone",
"core.precomposedunicode" will be set "false".
The user needs to activate this feature manually. She typically
sets core.precomposedunicode to "true" on HFS and VFAT, or file
systems mounted via SAMBA.
Helped-by: Junio C Hamano <gitster@pobox.com>
Signed-off-by: Torsten Bögershausen <tboegi@web.de>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-07-08 21:50:25 +08:00
|
|
|
size_t outsz, outalloc;
|
2007-03-04 02:29:03 +08:00
|
|
|
char *out, *outpos;
|
|
|
|
iconv_ibp cp;
|
2006-12-24 15:36:55 +08:00
|
|
|
|
|
|
|
outsz = insz;
|
Support working-tree-encoding "UTF-16LE-BOM"
Users who want UTF-16 files in the working tree set the .gitattributes
like this:
test.txt working-tree-encoding=UTF-16
The unicode standard itself defines 3 allowed ways how to encode UTF-16.
The following 3 versions convert all back to 'g' 'i' 't' in UTF-8:
a) UTF-16, without BOM, big endian:
$ printf "\000g\000i\000t" | iconv -f UTF-16 -t UTF-8 | od -c
0000000 g i t
b) UTF-16, with BOM, little endian:
$ printf "\377\376g\000i\000t\000" | iconv -f UTF-16 -t UTF-8 | od -c
0000000 g i t
c) UTF-16, with BOM, big endian:
$ printf "\376\377\000g\000i\000t" | iconv -f UTF-16 -t UTF-8 | od -c
0000000 g i t
Git uses libiconv to convert from UTF-8 in the index into UTF-16 in the
working tree.
After a checkout, the resulting file has a BOM and is encoded in "UTF-16",
in the version (c) above.
This is what iconv generates, more details follow below.
iconv (and libiconv) can generate UTF-16, UTF-16LE or UTF-16BE:
d) UTF-16
$ printf 'git' | iconv -f UTF-8 -t UTF-16 | od -c
0000000 376 377 \0 g \0 i \0 t
e) UTF-16LE
$ printf 'git' | iconv -f UTF-8 -t UTF-16LE | od -c
0000000 g \0 i \0 t \0
f) UTF-16BE
$ printf 'git' | iconv -f UTF-8 -t UTF-16BE | od -c
0000000 \0 g \0 i \0 t
There is no way to generate version (b) from above in a Git working tree,
but that is what some applications need.
(All fully unicode aware applications should be able to read all 3 variants,
but in practise we are not there yet).
When producing UTF-16 as an output, iconv generates the big endian version
with a BOM. (big endian is probably chosen for historical reasons).
iconv can produce UTF-16 files with little endianess by using "UTF-16LE"
as encoding, and that file does not have a BOM.
Not all users (especially under Windows) are happy with this.
Some tools are not fully unicode aware and can only handle version (b).
Today there is no way to produce version (b) with iconv (or libiconv).
Looking into the history of iconv, it seems as if version (c) will
be used in all future iconv versions (for compatibility reasons).
Solve this dilemma and introduce a Git-specific "UTF-16LE-BOM".
libiconv cannot handle the encoding, so Git picks it up, handles the BOM
and uses libiconv to convert the rest of the stream.
(UTF-16BE-BOM is added for consistency)
Reported-by: Adrián Gimeno Balaguer <adrigibal@gmail.com>
Signed-off-by: Torsten Bögershausen <tboegi@web.de>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-01-30 23:01:52 +08:00
|
|
|
outalloc = st_add(outsz, 1 + bom_len); /* for terminating NUL */
|
2006-12-24 15:36:55 +08:00
|
|
|
out = xmalloc(outalloc);
|
Support working-tree-encoding "UTF-16LE-BOM"
Users who want UTF-16 files in the working tree set the .gitattributes
like this:
test.txt working-tree-encoding=UTF-16
The unicode standard itself defines 3 allowed ways how to encode UTF-16.
The following 3 versions convert all back to 'g' 'i' 't' in UTF-8:
a) UTF-16, without BOM, big endian:
$ printf "\000g\000i\000t" | iconv -f UTF-16 -t UTF-8 | od -c
0000000 g i t
b) UTF-16, with BOM, little endian:
$ printf "\377\376g\000i\000t\000" | iconv -f UTF-16 -t UTF-8 | od -c
0000000 g i t
c) UTF-16, with BOM, big endian:
$ printf "\376\377\000g\000i\000t" | iconv -f UTF-16 -t UTF-8 | od -c
0000000 g i t
Git uses libiconv to convert from UTF-8 in the index into UTF-16 in the
working tree.
After a checkout, the resulting file has a BOM and is encoded in "UTF-16",
in the version (c) above.
This is what iconv generates, more details follow below.
iconv (and libiconv) can generate UTF-16, UTF-16LE or UTF-16BE:
d) UTF-16
$ printf 'git' | iconv -f UTF-8 -t UTF-16 | od -c
0000000 376 377 \0 g \0 i \0 t
e) UTF-16LE
$ printf 'git' | iconv -f UTF-8 -t UTF-16LE | od -c
0000000 g \0 i \0 t \0
f) UTF-16BE
$ printf 'git' | iconv -f UTF-8 -t UTF-16BE | od -c
0000000 \0 g \0 i \0 t
There is no way to generate version (b) from above in a Git working tree,
but that is what some applications need.
(All fully unicode aware applications should be able to read all 3 variants,
but in practise we are not there yet).
When producing UTF-16 as an output, iconv generates the big endian version
with a BOM. (big endian is probably chosen for historical reasons).
iconv can produce UTF-16 files with little endianess by using "UTF-16LE"
as encoding, and that file does not have a BOM.
Not all users (especially under Windows) are happy with this.
Some tools are not fully unicode aware and can only handle version (b).
Today there is no way to produce version (b) with iconv (or libiconv).
Looking into the history of iconv, it seems as if version (c) will
be used in all future iconv versions (for compatibility reasons).
Solve this dilemma and introduce a Git-specific "UTF-16LE-BOM".
libiconv cannot handle the encoding, so Git picks it up, handles the BOM
and uses libiconv to convert the rest of the stream.
(UTF-16BE-BOM is added for consistency)
Reported-by: Adrián Gimeno Balaguer <adrigibal@gmail.com>
Signed-off-by: Torsten Bögershausen <tboegi@web.de>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-01-30 23:01:52 +08:00
|
|
|
outpos = out + bom_len;
|
2007-03-04 02:29:03 +08:00
|
|
|
cp = (iconv_ibp)in;
|
2006-12-24 15:36:55 +08:00
|
|
|
|
|
|
|
while (1) {
|
|
|
|
size_t cnt = iconv(conv, &cp, &insz, &outpos, &outsz);
|
|
|
|
|
2014-02-17 00:06:03 +08:00
|
|
|
if (cnt == (size_t) -1) {
|
2006-12-24 15:36:55 +08:00
|
|
|
size_t sofar;
|
|
|
|
if (errno != E2BIG) {
|
|
|
|
free(out);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
/* insz has remaining number of bytes.
|
|
|
|
* since we started outsz the same as insz,
|
|
|
|
* it is likely that insz is not enough for
|
|
|
|
* converting the rest.
|
|
|
|
*/
|
|
|
|
sofar = outpos - out;
|
reencode_string: use st_add/st_mult helpers
When converting a string with iconv, if the output buffer
isn't big enough, we grow it. But our growth is done without
any concern for integer overflow. So when we add:
outalloc = sofar + insz * 2 + 32;
we may end up wrapping outalloc (which is a size_t), and
allocating a too-small buffer. We then manipulate it
further:
outsz = outalloc - sofar - 1;
and feed outsz back to iconv. If outalloc is wrapped and
smaller than sofar, we'll end up with a small allocation but
feed a very large outsz to iconv, which could result in it
overflowing the buffer.
Can we use this to construct an attack wherein the victim
clones a repository with a very large commit object with an
encoding header, and running "git log" reencodes it into
utf8, causing an overflow?
An attack of this sort is likely impossible in practice.
"sofar" is how many output bytes we've written total, and
"insz" is the number of input bytes remaining. Imagine our
input doubles in size as we output it (which is easy to do
by converting latin1 to utf8, for example), and that we
start with N input bytes. Our initial output buffer also
starts at N bytes, so after the first call we'd have N/2
input bytes remaining (insz), and have written N bytes
(sofar). That means our next allocation will be
(N + N/2 * 2 + 32) bytes, or (2N + 32).
We can therefore overflow a 32-bit size_t with a commit
message that's just under 2^31 bytes, assuming it consists
mostly of "doubling" sequences (e.g., latin1 0xe1 which
becomes utf8 0xc3 0xa1).
But we'll never make it that far with such a message. We'll
be spending 2^31 bytes on the original string. And our
initial output buffer will also be 2^31 bytes. Which is not
going to succeed on a system with a 32-bit size_t, since
there will be other things using the address space, too. The
initial malloc will fail.
If we imagine instead that we can triple the size when
converting, then our second allocation becomes
(N + 2/3N * 2 + 32), or (7/3N + 32). That still requires two
allocations of 3/7 of our address space (6/7 of the total)
to succeed.
If we imagine we can quadruple, it becomes (5/2N + 32); we
need to be able to allocate 4/5 of the address space to
succeed.
This might start to get plausible. But is it possible to get
a 4-to-1 increase in size? Probably if you're converting to
some obscure encoding. But since git defaults to utf8 for
its output, that's the likely destination encoding for an
attack. And while there are 4-character utf8 sequences, it's
unlikely that you'd be able find a single-byte source
sequence in any encoding.
So this is certainly buggy code which should be fixed, but
it is probably not a useful attack vector.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-07-24 18:50:10 +08:00
|
|
|
outalloc = st_add3(sofar, st_mult(insz, 2), 32);
|
2006-12-24 15:36:55 +08:00
|
|
|
out = xrealloc(out, outalloc);
|
|
|
|
outpos = out + sofar;
|
|
|
|
outsz = outalloc - sofar - 1;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
*outpos = '\0';
|
2013-04-19 07:08:46 +08:00
|
|
|
if (outsz_p)
|
|
|
|
*outsz_p = outpos - out;
|
2006-12-24 15:36:55 +08:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
git on Mac OS and precomposed unicode
Mac OS X mangles file names containing unicode on file systems HFS+,
VFAT or SAMBA. When a file using unicode code points outside ASCII
is created on a HFS+ drive, the file name is converted into
decomposed unicode and written to disk. No conversion is done if
the file name is already decomposed unicode.
Calling open("\xc3\x84", ...) with a precomposed "Ä" yields the same
result as open("\x41\xcc\x88",...) with a decomposed "Ä".
As a consequence, readdir() returns the file names in decomposed
unicode, even if the user expects precomposed unicode. Unlike on
HFS+, Mac OS X stores files on a VFAT drive (e.g. an USB drive) in
precomposed unicode, but readdir() still returns file names in
decomposed unicode. When a git repository is stored on a network
share using SAMBA, file names are sent over the wire and written to
disk on the remote system in precomposed unicode, but Mac OS X
readdir() returns decomposed unicode to be compatible with its
behaviour on HFS+ and VFAT.
The unicode decomposition causes many problems:
- The names "git add" and other commands get from the end user may
often be precomposed form (the decomposed form is not easily input
from the keyboard), but when the commands read from the filesystem
to see what it is going to update the index with already is on the
filesystem, readdir() will give decomposed form, which is different.
- Similarly "git log", "git mv" and all other commands that need to
compare pathnames found on the command line (often but not always
precomposed form; a command line input resulting from globbing may
be in decomposed) with pathnames found in the tree objects (should
be precomposed form to be compatible with other systems and for
consistency in general).
- The same for names stored in the index, which should be
precomposed, that may need to be compared with the names read from
readdir().
NFS mounted from Linux is fully transparent and does not suffer from
the above.
As Mac OS X treats precomposed and decomposed file names as equal,
we can
- wrap readdir() on Mac OS X to return the precomposed form, and
- normalize decomposed form given from the command line also to the
precomposed form,
to ensure that all pathnames used in Git are always in the
precomposed form. This behaviour can be requested by setting
"core.precomposedunicode" configuration variable to true.
The code in compat/precomposed_utf8.c implements basically 4 new
functions: precomposed_utf8_opendir(), precomposed_utf8_readdir(),
precomposed_utf8_closedir() and precompose_argv(). The first three
are to wrap opendir(3), readdir(3), and closedir(3) functions.
The argv[] conversion allows to use the TAB filename completion done
by the shell on command line. It tolerates other tools which use
readdir() to feed decomposed file names into git.
When creating a new git repository with "git init" or "git clone",
"core.precomposedunicode" will be set "false".
The user needs to activate this feature manually. She typically
sets core.precomposedunicode to "true" on HFS and VFAT, or file
systems mounted via SAMBA.
Helped-by: Junio C Hamano <gitster@pobox.com>
Signed-off-by: Torsten Bögershausen <tboegi@web.de>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-07-08 21:50:25 +08:00
|
|
|
return out;
|
|
|
|
}
|
|
|
|
|
2016-09-27 09:09:48 +08:00
|
|
|
/*
 * Pick an alternative spelling for the encoding `name`.
 *
 * Returns "UTF-8" when is_encoding_utf8() accepts `name`, "ISO-8859-1"
 * when `name` is "latin-1" (compared case-insensitively), and `name`
 * itself otherwise.
 */
static const char *fallback_encoding(const char *name)
{
	const char *canonical = name;

	if (is_encoding_utf8(name)) {
		/*
		 * Not every platform knows the many spellings of UTF-8,
		 * so offer the most official one. This is meant purely as
		 * a fallback, to cover a platform that understands the
		 * user's spelling but not the official one.
		 */
		canonical = "UTF-8";
	} else if (strcasecmp(name, "latin-1") == 0) {
		/*
		 * "latin-1" still shows up in e-mail headers, yet some
		 * platforms only install ISO-8859-1.
		 */
		canonical = "ISO-8859-1";
	}

	return canonical;
}
|
|
|
|
|
2018-07-24 18:50:33 +08:00
|
|
|
char *reencode_string_len(const char *in, size_t insz,
|
2013-04-19 07:08:46 +08:00
|
|
|
const char *out_encoding, const char *in_encoding,
|
2018-07-24 18:50:33 +08:00
|
|
|
size_t *outsz)
|
git on Mac OS and precomposed unicode
Mac OS X mangles file names containing unicode on file systems HFS+,
VFAT or SAMBA. When a file using unicode code points outside ASCII
is created on a HFS+ drive, the file name is converted into
decomposed unicode and written to disk. No conversion is done if
the file name is already decomposed unicode.
Calling open("\xc3\x84", ...) with a precomposed "Ä" yields the same
result as open("\x41\xcc\x88",...) with a decomposed "Ä".
As a consequence, readdir() returns the file names in decomposed
unicode, even if the user expects precomposed unicode. Unlike on
HFS+, Mac OS X stores files on a VFAT drive (e.g. an USB drive) in
precomposed unicode, but readdir() still returns file names in
decomposed unicode. When a git repository is stored on a network
share using SAMBA, file names are sent over the wire and written to
disk on the remote system in precomposed unicode, but Mac OS X
readdir() returns decomposed unicode to be compatible with its
behaviour on HFS+ and VFAT.
The unicode decomposition causes many problems:
- The names "git add" and other commands get from the end user may
often be precomposed form (the decomposed form is not easily input
from the keyboard), but when the commands read from the filesystem
to see what it is going to update the index with already is on the
filesystem, readdir() will give decomposed form, which is different.
- Similarly "git log", "git mv" and all other commands that need to
compare pathnames found on the command line (often but not always
precomposed form; a command line input resulting from globbing may
be in decomposed) with pathnames found in the tree objects (should
be precomposed form to be compatible with other systems and for
consistency in general).
- The same for names stored in the index, which should be
precomposed, that may need to be compared with the names read from
readdir().
NFS mounted from Linux is fully transparent and does not suffer from
the above.
As Mac OS X treats precomposed and decomposed file names as equal,
we can
- wrap readdir() on Mac OS X to return the precomposed form, and
- normalize decomposed form given from the command line also to the
precomposed form,
to ensure that all pathnames used in Git are always in the
precomposed form. This behaviour can be requested by setting
"core.precomposedunicode" configuration variable to true.
The code in compat/precomposed_utf8.c implements basically 4 new
functions: precomposed_utf8_opendir(), precomposed_utf8_readdir(),
precomposed_utf8_closedir() and precompose_argv(). The first three
are to wrap opendir(3), readdir(3), and closedir(3) functions.
The argv[] conversion allows to use the TAB filename completion done
by the shell on command line. It tolerates other tools which use
readdir() to feed decomposed file names into git.
When creating a new git repository with "git init" or "git clone",
"core.precomposedunicode" will be set "false".
The user needs to activate this feature manually. She typically
sets core.precomposedunicode to "true" on HFS and VFAT, or file
systems mounted via SAMBA.
Helped-by: Junio C Hamano <gitster@pobox.com>
Signed-off-by: Torsten Bögershausen <tboegi@web.de>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-07-08 21:50:25 +08:00
|
|
|
{
|
|
|
|
iconv_t conv;
|
|
|
|
char *out;
|
Support working-tree-encoding "UTF-16LE-BOM"
Users who want UTF-16 files in the working tree set the .gitattributes
like this:
test.txt working-tree-encoding=UTF-16
The unicode standard itself defines 3 allowed ways how to encode UTF-16.
The following 3 versions convert all back to 'g' 'i' 't' in UTF-8:
a) UTF-16, without BOM, big endian:
$ printf "\000g\000i\000t" | iconv -f UTF-16 -t UTF-8 | od -c
0000000 g i t
b) UTF-16, with BOM, little endian:
$ printf "\377\376g\000i\000t\000" | iconv -f UTF-16 -t UTF-8 | od -c
0000000 g i t
c) UTF-16, with BOM, big endian:
$ printf "\376\377\000g\000i\000t" | iconv -f UTF-16 -t UTF-8 | od -c
0000000 g i t
Git uses libiconv to convert from UTF-8 in the index into UTF-16 in the
working tree.
After a checkout, the resulting file has a BOM and is encoded in "UTF-16",
in the version (c) above.
This is what iconv generates, more details follow below.
iconv (and libiconv) can generate UTF-16, UTF-16LE or UTF-16BE:
d) UTF-16
$ printf 'git' | iconv -f UTF-8 -t UTF-16 | od -c
0000000 376 377 \0 g \0 i \0 t
e) UTF-16LE
$ printf 'git' | iconv -f UTF-8 -t UTF-16LE | od -c
0000000 g \0 i \0 t \0
f) UTF-16BE
$ printf 'git' | iconv -f UTF-8 -t UTF-16BE | od -c
0000000 \0 g \0 i \0 t
There is no way to generate version (b) from above in a Git working tree,
but that is what some applications need.
(All fully unicode aware applications should be able to read all 3 variants,
but in practise we are not there yet).
When producing UTF-16 as an output, iconv generates the big endian version
with a BOM. (big endian is probably chosen for historical reasons).
iconv can produce UTF-16 files with little endianess by using "UTF-16LE"
as encoding, and that file does not have a BOM.
Not all users (especially under Windows) are happy with this.
Some tools are not fully unicode aware and can only handle version (b).
Today there is no way to produce version (b) with iconv (or libiconv).
Looking into the history of iconv, it seems as if version (c) will
be used in all future iconv versions (for compatibility reasons).
Solve this dilemma and introduce a Git-specific "UTF-16LE-BOM".
libiconv cannot handle the encoding, so Git picks it up, handles the BOM
and uses libiconv to convert the rest of the stream.
(UTF-16BE-BOM is added for consistency)
Reported-by: Adrián Gimeno Balaguer <adrigibal@gmail.com>
Signed-off-by: Torsten Bögershausen <tboegi@web.de>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-01-30 23:01:52 +08:00
|
|
|
const char *bom_str = NULL;
|
|
|
|
size_t bom_len = 0;
|
git on Mac OS and precomposed unicode
Mac OS X mangles file names containing unicode on file systems HFS+,
VFAT or SAMBA. When a file using unicode code points outside ASCII
is created on a HFS+ drive, the file name is converted into
decomposed unicode and written to disk. No conversion is done if
the file name is already decomposed unicode.
Calling open("\xc3\x84", ...) with a precomposed "Ä" yields the same
result as open("\x41\xcc\x88",...) with a decomposed "Ä".
As a consequence, readdir() returns the file names in decomposed
unicode, even if the user expects precomposed unicode. Unlike on
HFS+, Mac OS X stores files on a VFAT drive (e.g. an USB drive) in
precomposed unicode, but readdir() still returns file names in
decomposed unicode. When a git repository is stored on a network
share using SAMBA, file names are sent over the wire and written to
disk on the remote system in precomposed unicode, but Mac OS X
readdir() returns decomposed unicode to be compatible with its
behaviour on HFS+ and VFAT.
The unicode decomposition causes many problems:
- The names "git add" and other commands get from the end user may
often be precomposed form (the decomposed form is not easily input
from the keyboard), but when the commands read from the filesystem
to see what it is going to update the index with already is on the
filesystem, readdir() will give decomposed form, which is different.
- Similarly "git log", "git mv" and all other commands that need to
compare pathnames found on the command line (often but not always
precomposed form; a command line input resulting from globbing may
be in decomposed) with pathnames found in the tree objects (should
be precomposed form to be compatible with other systems and for
consistency in general).
- The same for names stored in the index, which should be
precomposed, that may need to be compared with the names read from
readdir().
NFS mounted from Linux is fully transparent and does not suffer from
the above.
As Mac OS X treats precomposed and decomposed file names as equal,
we can
- wrap readdir() on Mac OS X to return the precomposed form, and
- normalize decomposed form given from the command line also to the
precomposed form,
to ensure that all pathnames used in Git are always in the
precomposed form. This behaviour can be requested by setting
"core.precomposedunicode" configuration variable to true.
The code in compat/precomposed_utf8.c implements basically 4 new
functions: precomposed_utf8_opendir(), precomposed_utf8_readdir(),
precomposed_utf8_closedir() and precompose_argv(). The first three
are to wrap opendir(3), readdir(3), and closedir(3) functions.
The argv[] conversion allows to use the TAB filename completion done
by the shell on command line. It tolerates other tools which use
readdir() to feed decomposed file names into git.
When creating a new git repository with "git init" or "git clone",
"core.precomposedunicode" will be set "false".
The user needs to activate this feature manually. She typically
sets core.precomposedunicode to "true" on HFS and VFAT, or file
systems mounted via SAMBA.
Helped-by: Junio C Hamano <gitster@pobox.com>
Signed-off-by: Torsten Bögershausen <tboegi@web.de>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-07-08 21:50:25 +08:00
|
|
|
|
|
|
|
if (!in_encoding)
|
|
|
|
return NULL;
|
utf8: accept alternate spellings of UTF-8
The iconv implementation on many platforms will accept
variants of UTF-8, including "UTF8", "utf-8", and "utf8",
but some do not. We make allowances in our code to treat
them all identically, but we sometimes hand the string from
the user directly to iconv. In this case, the platform iconv
may or may not work.
There are really four levels of platform iconv support for
these synonyms:
1. All synonyms understood (e.g., glibc).
2. Only the official "UTF-8" understood (e.g., Windows).
3. Official "UTF-8" not understood, but some other synonym
understood (it's not known whether such a platform exists).
4. Neither "UTF-8" nor any synonym understood (e.g.,
ancient systems, or ones without utf8 support
installed).
This patch teaches git to fall back to using the official
"UTF-8" spelling when iconv_open fails (and the encoding was
one of the synonym spellings). This makes things more
convenient to users of type 2 systems, as they can now use
any of the synonyms for the log output encoding.
Type 1 systems are not affected, as iconv already works on
the first try.
Type 4 systems are not affected, as both attempts already
fail.
Type 3 systems will not benefit from the feature, but
because we only use "UTF-8" as a fallback, they will not be
regressed (i.e., you can continue to use "utf8" if your
platform supports it). We could try all the various
synonyms, but since such systems are not even known to
exist, it's not worth the effort.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2013-02-26 04:31:00 +08:00
|
|
|
|
Support working-tree-encoding "UTF-16LE-BOM"
Users who want UTF-16 files in the working tree set the .gitattributes
like this:
test.txt working-tree-encoding=UTF-16
The unicode standard itself defines 3 allowed ways how to encode UTF-16.
The following 3 versions convert all back to 'g' 'i' 't' in UTF-8:
a) UTF-16, without BOM, big endian:
$ printf "\000g\000i\000t" | iconv -f UTF-16 -t UTF-8 | od -c
0000000 g i t
b) UTF-16, with BOM, little endian:
$ printf "\377\376g\000i\000t\000" | iconv -f UTF-16 -t UTF-8 | od -c
0000000 g i t
c) UTF-16, with BOM, big endian:
$ printf "\376\377\000g\000i\000t" | iconv -f UTF-16 -t UTF-8 | od -c
0000000 g i t
Git uses libiconv to convert from UTF-8 in the index into UTF-16 in the
working tree.
After a checkout, the resulting file has a BOM and is encoded in "UTF-16",
in the version (c) above.
This is what iconv generates, more details follow below.
iconv (and libiconv) can generate UTF-16, UTF-16LE or UTF-16BE:
d) UTF-16
$ printf 'git' | iconv -f UTF-8 -t UTF-16 | od -c
0000000 376 377 \0 g \0 i \0 t
e) UTF-16LE
$ printf 'git' | iconv -f UTF-8 -t UTF-16LE | od -c
0000000 g \0 i \0 t \0
f) UTF-16BE
$ printf 'git' | iconv -f UTF-8 -t UTF-16BE | od -c
0000000 \0 g \0 i \0 t
There is no way to generate version (b) from above in a Git working tree,
but that is what some applications need.
(All fully unicode aware applications should be able to read all 3 variants,
but in practise we are not there yet).
When producing UTF-16 as an output, iconv generates the big endian version
with a BOM. (big endian is probably chosen for historical reasons).
iconv can produce UTF-16 files with little endianess by using "UTF-16LE"
as encoding, and that file does not have a BOM.
Not all users (especially under Windows) are happy with this.
Some tools are not fully unicode aware and can only handle version (b).
Today there is no way to produce version (b) with iconv (or libiconv).
Looking into the history of iconv, it seems as if version (c) will
be used in all future iconv versions (for compatibility reasons).
Solve this dilemma and introduce a Git-specific "UTF-16LE-BOM".
libiconv cannot handle the encoding, so Git picks it up, handles the BOM
and uses libiconv to convert the rest of the stream.
(UTF-16BE-BOM is added for consistency)
Reported-by: Adrián Gimeno Balaguer <adrigibal@gmail.com>
Signed-off-by: Torsten Bögershausen <tboegi@web.de>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-01-30 23:01:52 +08:00
|
|
|
/* UTF-16LE-BOM is the same as UTF-16 for reading */
|
|
|
|
if (same_utf_encoding("UTF-16LE-BOM", in_encoding))
|
|
|
|
in_encoding = "UTF-16";
|
|
|
|
|
|
|
|
/*
|
|
|
|
* For writing, UTF-16 iconv typically creates "UTF-16BE-BOM"
|
|
|
|
* Some users under Windows want the little endian version
|
utf8: handle systems that don't write BOM for UTF-16
When serializing UTF-16 (and UTF-32), there are three possible ways to
write the stream. One can write the data with a BOM in either big-endian
or little-endian format, or one can write the data without a BOM in
big-endian format.
Most systems' iconv implementations choose to write it with a BOM in
some endianness, since this is the most foolproof, and it is resistant
to misinterpretation on Windows, where UTF-16 and the little-endian
serialization are very common. For compatibility with Windows and to
avoid accidental misuse there, Git always wants to write UTF-16 with a
BOM, and will refuse to read UTF-16 without it.
However, musl's iconv implementation writes UTF-16 without a BOM,
relying on the user to interpret it as big-endian. This causes t0028 and
the related functionality to fail, since Git won't read the file without
a BOM.
Add a Makefile and #define knob, ICONV_OMITS_BOM, that can be set if the
iconv implementation has this behavior. When set, Git will write a BOM
manually for UTF-16 and UTF-32 and then force the data to be written in
UTF-16BE or UTF-32BE. We choose big-endian behavior here because the
tests use the raw "UTF-16" encoding, which will be big-endian when the
implementation requires this knob to be set.
Update the tests to detect this case and write test data with an added
BOM if necessary. Always write the BOM in the tests in big-endian
format, since all iconv implementations that omit a BOM must use
big-endian serialization according to the Unicode standard.
Preserve the existing behavior for systems which do not have this knob
enabled, since they may use optimized implementations, including
defaulting to the native endianness, which may improve performance.
Signed-off-by: brian m. carlson <sandals@crustytoothpaste.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-02-12 08:52:06 +08:00
|
|
|
*
|
|
|
|
* We handle UTF-16 and UTF-32 ourselves only if the platform does not
|
|
|
|
* provide a BOM (which we require), since we want to match the behavior
|
|
|
|
* of the system tools and libc as much as possible.
|
Support working-tree-encoding "UTF-16LE-BOM"
Users who want UTF-16 files in the working tree set the .gitattributes
like this:
test.txt working-tree-encoding=UTF-16
The unicode standard itself defines 3 allowed ways how to encode UTF-16.
The following 3 versions convert all back to 'g' 'i' 't' in UTF-8:
a) UTF-16, without BOM, big endian:
$ printf "\000g\000i\000t" | iconv -f UTF-16 -t UTF-8 | od -c
0000000 g i t
b) UTF-16, with BOM, little endian:
$ printf "\377\376g\000i\000t\000" | iconv -f UTF-16 -t UTF-8 | od -c
0000000 g i t
c) UTF-16, with BOM, big endian:
$ printf "\376\377\000g\000i\000t" | iconv -f UTF-16 -t UTF-8 | od -c
0000000 g i t
Git uses libiconv to convert from UTF-8 in the index into UTF-16 in the
working tree.
After a checkout, the resulting file has a BOM and is encoded in "UTF-16",
in the version (c) above.
This is what iconv generates, more details follow below.
iconv (and libiconv) can generate UTF-16, UTF-16LE or UTF-16BE:
d) UTF-16
$ printf 'git' | iconv -f UTF-8 -t UTF-16 | od -c
0000000 376 377 \0 g \0 i \0 t
e) UTF-16LE
$ printf 'git' | iconv -f UTF-8 -t UTF-16LE | od -c
0000000 g \0 i \0 t \0
f) UTF-16BE
$ printf 'git' | iconv -f UTF-8 -t UTF-16BE | od -c
0000000 \0 g \0 i \0 t
There is no way to generate version (b) from above in a Git working tree,
but that is what some applications need.
(All fully unicode aware applications should be able to read all 3 variants,
but in practise we are not there yet).
When producing UTF-16 as an output, iconv generates the big endian version
with a BOM. (big endian is probably chosen for historical reasons).
iconv can produce UTF-16 files with little endianess by using "UTF-16LE"
as encoding, and that file does not have a BOM.
Not all users (especially under Windows) are happy with this.
Some tools are not fully unicode aware and can only handle version (b).
Today there is no way to produce version (b) with iconv (or libiconv).
Looking into the history of iconv, it seems as if version (c) will
be used in all future iconv versions (for compatibility reasons).
Solve this dilemma and introduce a Git-specific "UTF-16LE-BOM".
libiconv cannot handle the encoding, so Git picks it up, handles the BOM
and uses libiconv to convert the rest of the stream.
(UTF-16BE-BOM is added for consistency)
Reported-by: Adrián Gimeno Balaguer <adrigibal@gmail.com>
Signed-off-by: Torsten Bögershausen <tboegi@web.de>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-01-30 23:01:52 +08:00
|
|
|
*/
|
|
|
|
if (same_utf_encoding("UTF-16LE-BOM", out_encoding)) {
|
|
|
|
bom_str = utf16_le_bom;
|
|
|
|
bom_len = sizeof(utf16_le_bom);
|
|
|
|
out_encoding = "UTF-16LE";
|
|
|
|
} else if (same_utf_encoding("UTF-16BE-BOM", out_encoding)) {
|
|
|
|
bom_str = utf16_be_bom;
|
|
|
|
bom_len = sizeof(utf16_be_bom);
|
|
|
|
out_encoding = "UTF-16BE";
|
utf8: handle systems that don't write BOM for UTF-16
When serializing UTF-16 (and UTF-32), there are three possible ways to
write the stream. One can write the data with a BOM in either big-endian
or little-endian format, or one can write the data without a BOM in
big-endian format.
Most systems' iconv implementations choose to write it with a BOM in
some endianness, since this is the most foolproof, and it is resistant
to misinterpretation on Windows, where UTF-16 and the little-endian
serialization are very common. For compatibility with Windows and to
avoid accidental misuse there, Git always wants to write UTF-16 with a
BOM, and will refuse to read UTF-16 without it.
However, musl's iconv implementation writes UTF-16 without a BOM,
relying on the user to interpret it as big-endian. This causes t0028 and
the related functionality to fail, since Git won't read the file without
a BOM.
Add a Makefile and #define knob, ICONV_OMITS_BOM, that can be set if the
iconv implementation has this behavior. When set, Git will write a BOM
manually for UTF-16 and UTF-32 and then force the data to be written in
UTF-16BE or UTF-32BE. We choose big-endian behavior here because the
tests use the raw "UTF-16" encoding, which will be big-endian when the
implementation requires this knob to be set.
Update the tests to detect this case and write test data with an added
BOM if necessary. Always write the BOM in the tests in big-endian
format, since all iconv implementations that omit a BOM must use
big-endian serialization according to the Unicode standard.
Preserve the existing behavior for systems which do not have this knob
enabled, since they may use optimized implementations, including
defaulting to the native endianness, which may improve performance.
Signed-off-by: brian m. carlson <sandals@crustytoothpaste.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-02-12 08:52:06 +08:00
|
|
|
#ifdef ICONV_OMITS_BOM
|
|
|
|
} else if (same_utf_encoding("UTF-16", out_encoding)) {
|
|
|
|
bom_str = utf16_be_bom;
|
|
|
|
bom_len = sizeof(utf16_be_bom);
|
|
|
|
out_encoding = "UTF-16BE";
|
|
|
|
} else if (same_utf_encoding("UTF-32", out_encoding)) {
|
|
|
|
bom_str = utf32_be_bom;
|
|
|
|
bom_len = sizeof(utf32_be_bom);
|
|
|
|
out_encoding = "UTF-32BE";
|
|
|
|
#endif
|
Support working-tree-encoding "UTF-16LE-BOM"
Users who want UTF-16 files in the working tree set the .gitattributes
like this:
test.txt working-tree-encoding=UTF-16
The unicode standard itself defines 3 allowed ways how to encode UTF-16.
The following 3 versions convert all back to 'g' 'i' 't' in UTF-8:
a) UTF-16, without BOM, big endian:
$ printf "\000g\000i\000t" | iconv -f UTF-16 -t UTF-8 | od -c
0000000 g i t
b) UTF-16, with BOM, little endian:
$ printf "\377\376g\000i\000t\000" | iconv -f UTF-16 -t UTF-8 | od -c
0000000 g i t
c) UTF-16, with BOM, big endian:
$ printf "\376\377\000g\000i\000t" | iconv -f UTF-16 -t UTF-8 | od -c
0000000 g i t
Git uses libiconv to convert from UTF-8 in the index into UTF-16 in the
working tree.
After a checkout, the resulting file has a BOM and is encoded in "UTF-16",
in the version (c) above.
This is what iconv generates, more details follow below.
iconv (and libiconv) can generate UTF-16, UTF-16LE or UTF-16BE:
d) UTF-16
$ printf 'git' | iconv -f UTF-8 -t UTF-16 | od -c
0000000 376 377 \0 g \0 i \0 t
e) UTF-16LE
$ printf 'git' | iconv -f UTF-8 -t UTF-16LE | od -c
0000000 g \0 i \0 t \0
f) UTF-16BE
$ printf 'git' | iconv -f UTF-8 -t UTF-16BE | od -c
0000000 \0 g \0 i \0 t
There is no way to generate version (b) from above in a Git working tree,
but that is what some applications need.
(All fully unicode aware applications should be able to read all 3 variants,
but in practise we are not there yet).
When producing UTF-16 as an output, iconv generates the big endian version
with a BOM. (big endian is probably chosen for historical reasons).
iconv can produce UTF-16 files with little endianness by using "UTF-16LE"
as encoding, and that file does not have a BOM.
Not all users (especially under Windows) are happy with this.
Some tools are not fully unicode aware and can only handle version (b).
Today there is no way to produce version (b) with iconv (or libiconv).
Looking into the history of iconv, it seems as if version (c) will
be used in all future iconv versions (for compatibility reasons).
Solve this dilemma and introduce a Git-specific "UTF-16LE-BOM".
libiconv can not handle the encoding, so Git picks it up, handles the BOM
and uses libiconv to convert the rest of the stream.
(UTF-16BE-BOM is added for consistency)
Reported-by: Adrián Gimeno Balaguer <adrigibal@gmail.com>
Signed-off-by: Torsten Bögershausen <tboegi@web.de>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-01-30 23:01:52 +08:00
|
|
|
}
|
|
|
|
|
git on Mac OS and precomposed unicode
Mac OS X mangles file names containing unicode on file systems HFS+,
VFAT or SAMBA. When a file using unicode code points outside ASCII
is created on a HFS+ drive, the file name is converted into
decomposed unicode and written to disk. No conversion is done if
the file name is already decomposed unicode.
Calling open("\xc3\x84", ...) with a precomposed "Ä" yields the same
result as open("\x41\xcc\x88",...) with a decomposed "Ä".
As a consequence, readdir() returns the file names in decomposed
unicode, even if the user expects precomposed unicode. Unlike on
HFS+, Mac OS X stores files on a VFAT drive (e.g. a USB drive) in
precomposed unicode, but readdir() still returns file names in
decomposed unicode. When a git repository is stored on a network
share using SAMBA, file names are sent over the wire and written to
disk on the remote system in precomposed unicode, but Mac OS X
readdir() returns decomposed unicode to be compatible with its
behaviour on HFS+ and VFAT.
The unicode decomposition causes many problems:
- The names "git add" and other commands get from the end user may
often be precomposed form (the decomposed form is not easily input
from the keyboard), but when the commands read from the filesystem
to see what it is going to update the index with already is on the
filesystem, readdir() will give decomposed form, which is different.
- Similarly "git log", "git mv" and all other commands that need to
compare pathnames found on the command line (often but not always
precomposed form; a command line input resulting from globbing may
be in decomposed) with pathnames found in the tree objects (should
be precomposed form to be compatible with other systems and for
consistency in general).
- The same for names stored in the index, which should be
precomposed, that may need to be compared with the names read from
readdir().
NFS mounted from Linux is fully transparent and does not suffer from
the above.
As Mac OS X treats precomposed and decomposed file names as equal,
we can
- wrap readdir() on Mac OS X to return the precomposed form, and
- normalize decomposed form given from the command line also to the
precomposed form,
to ensure that all pathnames used in Git are always in the
precomposed form. This behaviour can be requested by setting
"core.precomposedunicode" configuration variable to true.
The code in compat/precomposed_utf8.c implements basically 4 new
functions: precomposed_utf8_opendir(), precomposed_utf8_readdir(),
precomposed_utf8_closedir() and precompose_argv(). The first three
are to wrap opendir(3), readdir(3), and closedir(3) functions.
The argv[] conversion allows to use the TAB filename completion done
by the shell on command line. It tolerates other tools which use
readdir() to feed decomposed file names into git.
When creating a new git repository with "git init" or "git clone",
"core.precomposedunicode" will be set "false".
The user needs to activate this feature manually. She typically
sets core.precomposedunicode to "true" on HFS and VFAT, or file
systems mounted via SAMBA.
Helped-by: Junio C Hamano <gitster@pobox.com>
Signed-off-by: Torsten Bögershausen <tboegi@web.de>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-07-08 21:50:25 +08:00
|
|
|
conv = iconv_open(out_encoding, in_encoding);
|
utf8: accept alternate spellings of UTF-8
The iconv implementation on many platforms will accept
variants of UTF-8, including "UTF8", "utf-8", and "utf8",
but some do not. We make allowances in our code to treat
them all identically, but we sometimes hand the string from
the user directly to iconv. In this case, the platform iconv
may or may not work.
There are really four levels of platform iconv support for
these synonyms:
1. All synonyms understood (e.g., glibc).
2. Only the official "UTF-8" understood (e.g., Windows).
3. Official "UTF-8" not understood, but some other synonym
understood (it's not known whether such a platform exists).
4. Neither "UTF-8" nor any synonym understood (e.g.,
ancient systems, or ones without utf8 support
installed).
This patch teaches git to fall back to using the official
"UTF-8" spelling when iconv_open fails (and the encoding was
one of the synonym spellings). This makes things more
convenient to users of type 2 systems, as they can now use
any of the synonyms for the log output encoding.
Type 1 systems are not affected, as iconv already works on
the first try.
Type 4 systems are not affected, as both attempts already
fail.
Type 3 systems will not benefit from the feature, but
because we only use "UTF-8" as a fallback, they will not be
regressed (i.e., you can continue to use "utf8" if your
platform supports it). We could try all the various
synonyms, but since such systems are not even known to
exist, it's not worth the effort.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2013-02-26 04:31:00 +08:00
|
|
|
if (conv == (iconv_t) -1) {
|
2016-09-27 09:09:48 +08:00
|
|
|
in_encoding = fallback_encoding(in_encoding);
|
|
|
|
out_encoding = fallback_encoding(out_encoding);
|
|
|
|
|
utf8: accept alternate spellings of UTF-8
The iconv implementation on many platforms will accept
variants of UTF-8, including "UTF8", "utf-8", and "utf8",
but some do not. We make allowances in our code to treat
them all identically, but we sometimes hand the string from
the user directly to iconv. In this case, the platform iconv
may or may not work.
There are really four levels of platform iconv support for
these synonyms:
1. All synonyms understood (e.g., glibc).
2. Only the official "UTF-8" understood (e.g., Windows).
3. Official "UTF-8" not understood, but some other synonym
understood (it's not known whether such a platform exists).
4. Neither "UTF-8" nor any synonym understood (e.g.,
ancient systems, or ones without utf8 support
installed).
This patch teaches git to fall back to using the official
"UTF-8" spelling when iconv_open fails (and the encoding was
one of the synonym spellings). This makes things more
convenient to users of type 2 systems, as they can now use
any of the synonyms for the log output encoding.
Type 1 systems are not affected, as iconv already works on
the first try.
Type 4 systems are not affected, as both attempts already
fail.
Type 3 systems will not benefit from the feature, but
because we only use "UTF-8" as a fallback, they will not be
regressed (i.e., you can continue to use "utf8" if your
platform supports it). We could try all the various
synonyms, but since such systems are not even known to
exist, it's not worth the effort.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2013-02-26 04:31:00 +08:00
|
|
|
conv = iconv_open(out_encoding, in_encoding);
|
|
|
|
if (conv == (iconv_t) -1)
|
|
|
|
return NULL;
|
|
|
|
}
|
Support working-tree-encoding "UTF-16LE-BOM"
Users who want UTF-16 files in the working tree set the .gitattributes
like this:
test.txt working-tree-encoding=UTF-16
The unicode standard itself defines 3 allowed ways how to encode UTF-16.
The following 3 versions convert all back to 'g' 'i' 't' in UTF-8:
a) UTF-16, without BOM, big endian:
$ printf "\000g\000i\000t" | iconv -f UTF-16 -t UTF-8 | od -c
0000000 g i t
b) UTF-16, with BOM, little endian:
$ printf "\377\376g\000i\000t\000" | iconv -f UTF-16 -t UTF-8 | od -c
0000000 g i t
c) UTF-16, with BOM, big endian:
$ printf "\376\377\000g\000i\000t" | iconv -f UTF-16 -t UTF-8 | od -c
0000000 g i t
Git uses libiconv to convert from UTF-8 in the index into UTF-16 in the
working tree.
After a checkout, the resulting file has a BOM and is encoded in "UTF-16",
in the version (c) above.
This is what iconv generates, more details follow below.
iconv (and libiconv) can generate UTF-16, UTF-16LE or UTF-16BE:
d) UTF-16
$ printf 'git' | iconv -f UTF-8 -t UTF-16 | od -c
0000000 376 377 \0 g \0 i \0 t
e) UTF-16LE
$ printf 'git' | iconv -f UTF-8 -t UTF-16LE | od -c
0000000 g \0 i \0 t \0
f) UTF-16BE
$ printf 'git' | iconv -f UTF-8 -t UTF-16BE | od -c
0000000 \0 g \0 i \0 t
There is no way to generate version (b) from above in a Git working tree,
but that is what some applications need.
(All fully unicode aware applications should be able to read all 3 variants,
but in practise we are not there yet).
When producing UTF-16 as an output, iconv generates the big endian version
with a BOM. (big endian is probably chosen for historical reasons).
iconv can produce UTF-16 files with little endianness by using "UTF-16LE"
as encoding, and that file does not have a BOM.
Not all users (especially under Windows) are happy with this.
Some tools are not fully unicode aware and can only handle version (b).
Today there is no way to produce version (b) with iconv (or libiconv).
Looking into the history of iconv, it seems as if version (c) will
be used in all future iconv versions (for compatibility reasons).
Solve this dilemma and introduce a Git-specific "UTF-16LE-BOM".
libiconv can not handle the encoding, so Git picks it up, handles the BOM
and uses libiconv to convert the rest of the stream.
(UTF-16BE-BOM is added for consistency)
Reported-by: Adrián Gimeno Balaguer <adrigibal@gmail.com>
Signed-off-by: Torsten Bögershausen <tboegi@web.de>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-01-30 23:01:52 +08:00
|
|
|
out = reencode_string_iconv(in, insz, conv, bom_len, outsz);
|
2006-12-24 15:36:55 +08:00
|
|
|
iconv_close(conv);
|
Support working-tree-encoding "UTF-16LE-BOM"
Users who want UTF-16 files in the working tree set the .gitattributes
like this:
test.txt working-tree-encoding=UTF-16
The unicode standard itself defines 3 allowed ways how to encode UTF-16.
The following 3 versions convert all back to 'g' 'i' 't' in UTF-8:
a) UTF-16, without BOM, big endian:
$ printf "\000g\000i\000t" | iconv -f UTF-16 -t UTF-8 | od -c
0000000 g i t
b) UTF-16, with BOM, little endian:
$ printf "\377\376g\000i\000t\000" | iconv -f UTF-16 -t UTF-8 | od -c
0000000 g i t
c) UTF-16, with BOM, big endian:
$ printf "\376\377\000g\000i\000t" | iconv -f UTF-16 -t UTF-8 | od -c
0000000 g i t
Git uses libiconv to convert from UTF-8 in the index into UTF-16 in the
working tree.
After a checkout, the resulting file has a BOM and is encoded in "UTF-16",
in the version (c) above.
This is what iconv generates, more details follow below.
iconv (and libiconv) can generate UTF-16, UTF-16LE or UTF-16BE:
d) UTF-16
$ printf 'git' | iconv -f UTF-8 -t UTF-16 | od -c
0000000 376 377 \0 g \0 i \0 t
e) UTF-16LE
$ printf 'git' | iconv -f UTF-8 -t UTF-16LE | od -c
0000000 g \0 i \0 t \0
f) UTF-16BE
$ printf 'git' | iconv -f UTF-8 -t UTF-16BE | od -c
0000000 \0 g \0 i \0 t
There is no way to generate version (b) from above in a Git working tree,
but that is what some applications need.
(All fully unicode aware applications should be able to read all 3 variants,
but in practise we are not there yet).
When producing UTF-16 as an output, iconv generates the big endian version
with a BOM. (big endian is probably chosen for historical reasons).
iconv can produce UTF-16 files with little endianness by using "UTF-16LE"
as encoding, and that file does not have a BOM.
Not all users (especially under Windows) are happy with this.
Some tools are not fully unicode aware and can only handle version (b).
Today there is no way to produce version (b) with iconv (or libiconv).
Looking into the history of iconv, it seems as if version (c) will
be used in all future iconv versions (for compatibility reasons).
Solve this dilemma and introduce a Git-specific "UTF-16LE-BOM".
libiconv can not handle the encoding, so Git picks it up, handles the BOM
and uses libiconv to convert the rest of the stream.
(UTF-16BE-BOM is added for consistency)
Reported-by: Adrián Gimeno Balaguer <adrigibal@gmail.com>
Signed-off-by: Torsten Bögershausen <tboegi@web.de>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-01-30 23:01:52 +08:00
|
|
|
if (out && bom_str && bom_len)
|
|
|
|
memcpy(out, bom_str, bom_len);
|
2006-12-24 15:36:55 +08:00
|
|
|
return out;
|
|
|
|
}
|
|
|
|
#endif
|
2013-03-07 18:55:07 +08:00
|
|
|
|
2018-04-16 02:16:05 +08:00
|
|
|
/*
 * Report whether the buffer `data` (of `len` bytes) starts with the byte
 * order mark `bom` (of `bom_len` bytes).
 *
 * Returns 1 on a match and 0 otherwise. A NULL `data` or `bom`, or a
 * buffer shorter than the mark, never matches.
 */
static int has_bom_prefix(const char *data, size_t len,
			  const char *bom, size_t bom_len)
{
	if (!data || !bom)
		return 0;
	if (len < bom_len)
		return 0;
	return memcmp(data, bom, bom_len) == 0;
}
|
|
|
|
|
|
|
|
int has_prohibited_utf_bom(const char *enc, const char *data, size_t len)
|
|
|
|
{
|
|
|
|
return (
|
|
|
|
(same_utf_encoding("UTF-16BE", enc) ||
|
|
|
|
same_utf_encoding("UTF-16LE", enc)) &&
|
|
|
|
(has_bom_prefix(data, len, utf16_be_bom, sizeof(utf16_be_bom)) ||
|
|
|
|
has_bom_prefix(data, len, utf16_le_bom, sizeof(utf16_le_bom)))
|
|
|
|
) || (
|
|
|
|
(same_utf_encoding("UTF-32BE", enc) ||
|
|
|
|
same_utf_encoding("UTF-32LE", enc)) &&
|
|
|
|
(has_bom_prefix(data, len, utf32_be_bom, sizeof(utf32_be_bom)) ||
|
|
|
|
has_bom_prefix(data, len, utf32_le_bom, sizeof(utf32_le_bom)))
|
|
|
|
);
|
|
|
|
}
|
|
|
|
|
2018-04-16 02:16:06 +08:00
|
|
|
int is_missing_required_utf_bom(const char *enc, const char *data, size_t len)
|
|
|
|
{
|
|
|
|
return (
|
|
|
|
(same_utf_encoding(enc, "UTF-16")) &&
|
|
|
|
!(has_bom_prefix(data, len, utf16_be_bom, sizeof(utf16_be_bom)) ||
|
|
|
|
has_bom_prefix(data, len, utf16_le_bom, sizeof(utf16_le_bom)))
|
|
|
|
) || (
|
|
|
|
(same_utf_encoding(enc, "UTF-32")) &&
|
|
|
|
!(has_bom_prefix(data, len, utf32_be_bom, sizeof(utf32_be_bom)) ||
|
|
|
|
has_bom_prefix(data, len, utf32_le_bom, sizeof(utf32_le_bom)))
|
|
|
|
);
|
|
|
|
}
|
|
|
|
|
2013-03-07 18:55:07 +08:00
|
|
|
/*
 * Consume the first character of the multi-byte string `*text`,
 * interpreted according to `encoding`, and return its length in bytes.
 *
 * - `*text` is advanced past the consumed character.
 * - When `remainder_p` is not NULL, `*remainder_p` holds on entry the
 *   number of bytes that may be consumed from `*text`; on exit it has
 *   been reduced by the returned length. If no bytes remain, 0 is
 *   returned and nothing is modified.
 * - When `remainder_p` is NULL, `*text` is treated as limited by NUL.
 */
int mbs_chrlen(const char **text, size_t *remainder_p, const char *encoding)
{
	const char *cursor = *text;
	size_t avail = remainder_p ? *remainder_p : SIZE_MAX;
	/*
	 * TODO use iconv to decode one char and obtain its chrlen
	 * for now, let's treat encodings != UTF-8 as one-byte
	 */
	int chrlen = 1;

	if (avail < 1)
		return 0;

	if (is_encoding_utf8(encoding)) {
		pick_one_utf8_char(&cursor, &avail);

		/* a NULL cursor means: not valid UTF-8 -> raw byte sequence */
		if (cursor)
			chrlen = cursor - *text;
	}

	*text += chrlen;
	if (remainder_p)
		*remainder_p -= chrlen;

	return chrlen;
}
|
2014-12-16 06:56:59 +08:00
|
|
|
|
|
|
|
/*
|
2014-12-23 16:45:36 +08:00
|
|
|
* Pick the next char from the stream, ignoring codepoints an HFS+ would.
|
|
|
|
* Note that this is _not_ complete by any means. It's just enough
|
2014-12-16 06:56:59 +08:00
|
|
|
* to make is_hfs_dotgit() work, and should not be used otherwise.
|
|
|
|
*/
|
|
|
|
static ucs_char_t next_hfs_char(const char **in)
|
|
|
|
{
|
|
|
|
while (1) {
|
|
|
|
ucs_char_t out = pick_one_utf8_char(in, NULL);
|
|
|
|
/*
|
|
|
|
* check for malformed utf8. Technically this
|
|
|
|
* gets converted to a percent-sequence, but
|
|
|
|
* returning 0 is good enough for is_hfs_dotgit
|
|
|
|
* to realize it cannot be .git
|
|
|
|
*/
|
|
|
|
if (!*in)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
/* these code points are ignored completely */
|
|
|
|
switch (out) {
|
|
|
|
case 0x200c: /* ZERO WIDTH NON-JOINER */
|
|
|
|
case 0x200d: /* ZERO WIDTH JOINER */
|
|
|
|
case 0x200e: /* LEFT-TO-RIGHT MARK */
|
|
|
|
case 0x200f: /* RIGHT-TO-LEFT MARK */
|
|
|
|
case 0x202a: /* LEFT-TO-RIGHT EMBEDDING */
|
|
|
|
case 0x202b: /* RIGHT-TO-LEFT EMBEDDING */
|
|
|
|
case 0x202c: /* POP DIRECTIONAL FORMATTING */
|
|
|
|
case 0x202d: /* LEFT-TO-RIGHT OVERRIDE */
|
|
|
|
case 0x202e: /* RIGHT-TO-LEFT OVERRIDE */
|
|
|
|
case 0x206a: /* INHIBIT SYMMETRIC SWAPPING */
|
|
|
|
case 0x206b: /* ACTIVATE SYMMETRIC SWAPPING */
|
|
|
|
case 0x206c: /* INHIBIT ARABIC FORM SHAPING */
|
|
|
|
case 0x206d: /* ACTIVATE ARABIC FORM SHAPING */
|
|
|
|
case 0x206e: /* NATIONAL DIGIT SHAPES */
|
|
|
|
case 0x206f: /* NOMINAL DIGIT SHAPES */
|
|
|
|
case 0xfeff: /* ZERO WIDTH NO-BREAK SPACE */
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2014-12-23 16:45:36 +08:00
|
|
|
return out;
|
2014-12-16 06:56:59 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-05-03 03:23:45 +08:00
|
|
|
static int is_hfs_dot_generic(const char *path,
|
|
|
|
const char *needle, size_t needle_len)
|
2014-12-16 06:56:59 +08:00
|
|
|
{
|
|
|
|
ucs_char_t c;
|
|
|
|
|
2014-12-23 16:45:36 +08:00
|
|
|
c = next_hfs_char(&path);
|
|
|
|
if (c != '.')
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* there's a great deal of other case-folding that occurs
|
2018-05-03 03:23:45 +08:00
|
|
|
* in HFS+, but this is enough to catch our fairly vanilla
|
|
|
|
* hard-coded needles.
|
2014-12-23 16:45:36 +08:00
|
|
|
*/
|
2018-05-03 03:23:45 +08:00
|
|
|
for (; needle_len > 0; needle++, needle_len--) {
|
|
|
|
c = next_hfs_char(&path);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We know our needles contain only ASCII, so we clamp here to
|
|
|
|
* make the results of tolower() sane.
|
|
|
|
*/
|
|
|
|
if (c > 127)
|
|
|
|
return 0;
|
|
|
|
if (tolower(c) != *needle)
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2014-12-16 06:56:59 +08:00
|
|
|
c = next_hfs_char(&path);
|
|
|
|
if (c && !is_dir_sep(c))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
return 1;
|
|
|
|
}
|
2015-04-17 01:45:29 +08:00
|
|
|
|
2018-05-03 03:23:45 +08:00
|
|
|
/*
 * Convenience wrapper around is_hfs_dot_generic() for NUL-terminated
 * needles. Being inline, it lets the compiler evaluate strlen() on a
 * string literal at compile time.
 */
static inline int is_hfs_dot_str(const char *path, const char *needle)
{
	size_t needle_len = strlen(needle);

	return is_hfs_dot_generic(path, needle, needle_len);
}
|
|
|
|
|
|
|
|
/*
 * Return 1 when the first component of `path` is an HFS+-equivalent
 * spelling of ".git" (see is_hfs_dot_generic()), 0 otherwise.
 */
int is_hfs_dotgit(const char *path)
{
	const char *needle = "git";

	return is_hfs_dot_str(path, needle);
}
|
|
|
|
|
|
|
|
/*
 * Return 1 when the first component of `path` is an HFS+-equivalent
 * spelling of ".gitmodules" (see is_hfs_dot_generic()), 0 otherwise.
 */
int is_hfs_dotgitmodules(const char *path)
{
	const char *needle = "gitmodules";

	return is_hfs_dot_str(path, needle);
}
|
|
|
|
|
|
|
|
/*
 * Return 1 when the first component of `path` is an HFS+-equivalent
 * spelling of ".gitignore" (see is_hfs_dot_generic()), 0 otherwise.
 */
int is_hfs_dotgitignore(const char *path)
{
	const char *needle = "gitignore";

	return is_hfs_dot_str(path, needle);
}
|
|
|
|
|
|
|
|
/*
 * Return 1 when the first component of `path` is an HFS+-equivalent
 * spelling of ".gitattributes" (see is_hfs_dot_generic()), 0 otherwise.
 */
int is_hfs_dotgitattributes(const char *path)
{
	const char *needle = "gitattributes";

	return is_hfs_dot_str(path, needle);
}
|
|
|
|
|
t0060: test ntfs/hfs-obscured dotfiles
We have tests that cover various filesystem-specific spellings of
".gitmodules", because we need to reliably identify that path for some
security checks. These are from dc2d9ba318 (is_{hfs,ntfs}_dotgitmodules:
add tests, 2018-05-12), with the actual code coming from e7cb0b4455
(is_ntfs_dotgit: match other .git files, 2018-05-11) and 0fc333ba20
(is_hfs_dotgit: match other .git files, 2018-05-02).
Those latter two commits also added similar matching functions for
.gitattributes and .gitignore. These ended up not being used in the
final series, and are currently dead code. But in preparation for them
being used in some fsck checks, let's make sure they actually work by
throwing a few basic tests at them. Likewise, let's cover .mailmap
(which does need matching code added).
I didn't bother with the whole battery of tests that we cover for
.gitmodules. These functions are all based on the same generic matcher,
so it's sufficient to test most of the corner cases just once.
Note that the ntfs magic prefix names in the tests come from the
algorithm described in e7cb0b4455 (and are different for each file).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-05-04 04:43:22 +08:00
|
|
|
/*
 * Return 1 when the first component of `path` is an HFS+-equivalent
 * spelling of ".mailmap" (see is_hfs_dot_generic()), 0 otherwise.
 */
int is_hfs_dotmailmap(const char *path)
{
	const char *needle = "mailmap";

	return is_hfs_dot_str(path, needle);
}
|
|
|
|
|
2015-04-17 01:45:29 +08:00
|
|
|
/* The byte order mark U+FEFF encoded as UTF-8 (EF BB BF). */
const char utf8_bom[] = "\357\273\277";

/*
 * If the `len` bytes at `*text` begin with the UTF-8 BOM, advance
 * `*text` past it and return 1. Otherwise leave `*text` untouched and
 * return 0.
 */
int skip_utf8_bom(char **text, size_t len)
{
	const size_t bom_len = strlen(utf8_bom);

	if (len >= bom_len && !memcmp(*text, utf8_bom, bom_len)) {
		*text += bom_len;
		return 1;
	}
	return 0;
}
|
2015-09-10 23:48:19 +08:00
|
|
|
|
|
|
|
/*
 * Append `s` to `buf`, padded with spaces to `width` display columns
 * according to `position` (ALIGN_LEFT, ALIGN_MIDDLE or ALIGN_RIGHT).
 *
 * The display width comes from utf8_strnwidth(). When `s` is already
 * at least `width` columns wide it is appended unchanged.
 */
void strbuf_utf8_align(struct strbuf *buf, align_type position, unsigned int width,
		       const char *s)
{
	size_t byte_len = strlen(s);
	int columns = utf8_strnwidth(s, byte_len, 0);
	/*
	 * printf-style field widths count bytes, not display columns, so
	 * widen the field by the bytes that take up no column.
	 */
	int extra_bytes = byte_len - columns;

	if (columns >= width) {
		strbuf_addstr(buf, s);
		return;
	}

	if (position == ALIGN_LEFT) {
		strbuf_addf(buf, "%-*s", width + extra_bytes, s);
		return;
	}

	if (position == ALIGN_MIDDLE) {
		/* padding to the left; the remainder ends up on the right */
		int left = (width - columns) / 2;

		strbuf_addf(buf, "%*s%-*s", left, "", width - left + extra_bytes, s);
		return;
	}

	if (position == ALIGN_RIGHT)
		strbuf_addf(buf, "%*s", width + extra_bytes, s);
}
|