mirror of
https://git.code.sf.net/p/ntfs-3g/ntfs-3g.git
synced 2024-11-23 18:14:24 +08:00
unistr.c: Enable encoding broken UTF-16 into broken UTF-8, A.K.A. WTF-8.
Windows filenames may contain invalid UTF-16 sequences (specifically broken surrogate pairs), which cannot be converted to UTF-8 if we do strict conversion. This patch enables encoding broken UTF-16 into similarly broken UTF-8 by encoding any surrogate character that don't have a match into a separate 3-byte UTF-8 sequence. This is "sort of" valid UTF-8, but not valid Unicode since the code points used for surrogate pair encoding are not supposed to occur in a valid Unicode string... but on the other hand the source UTF-16 data is also broken, so we aren't really making things any worse. This format is sometimes referred to as WTF-8 (Wobbly Translation Format, 8-bit encoding) and is a common solution to represent broken UTF-16 as UTF-8. It is a lossless round-trip conversion, i.e converting from broken UTF-16 to "WTF-8" and back to UTF-16 yields the same broken UTF-16 sequence. Because of this property it enables accessing these files by filename through ntfs-3g and the ntfsprogs (e.g. ls -la works as expected). To disable this behaviour you can pass the preprocessor/compiler flag '-DALLOW_BROKEN_SURROGATES=0' when building ntfs-3g.
This commit is contained in:
parent
ebdff7d4ee
commit
d9c61dd60e
@ -61,6 +61,11 @@
|
||||
|
||||
#define NOREVBOM 0 /* JPA rejecting U+FFFE and U+FFFF, open to debate */
|
||||
|
||||
#ifndef ALLOW_BROKEN_SURROGATES
|
||||
/* Erik allowing broken UTF-16 surrogate pairs by default, open to debate. */
|
||||
#define ALLOW_BROKEN_SURROGATES 1
|
||||
#endif /* !defined(ALLOW_BROKEN_SURROGATES) */
|
||||
|
||||
/*
|
||||
* IMPORTANT
|
||||
* =========
|
||||
@ -462,8 +467,22 @@ static int utf16_to_utf8_size(const ntfschar *ins, const int ins_len, int outs_l
|
||||
if ((c >= 0xdc00) && (c < 0xe000)) {
|
||||
surrog = FALSE;
|
||||
count += 4;
|
||||
} else
|
||||
} else {
|
||||
#if ALLOW_BROKEN_SURROGATES
|
||||
/* The first UTF-16 unit of a surrogate pair has
|
||||
* a value between 0xd800 and 0xdc00. It can be
|
||||
* encoded as an individual UTF-8 sequence if we
|
||||
* cannot combine it with the next UTF-16 unit
|
||||
* unit as a surrogate pair. */
|
||||
surrog = FALSE;
|
||||
count += 3;
|
||||
|
||||
--i;
|
||||
continue;
|
||||
#else
|
||||
goto fail;
|
||||
#endif /* ALLOW_BROKEN_SURROGATES */
|
||||
}
|
||||
} else
|
||||
if (c < 0x80)
|
||||
count++;
|
||||
@ -473,6 +492,10 @@ static int utf16_to_utf8_size(const ntfschar *ins, const int ins_len, int outs_l
|
||||
count += 3;
|
||||
else if (c < 0xdc00)
|
||||
surrog = TRUE;
|
||||
#if ALLOW_BROKEN_SURROGATES
|
||||
else if (c < 0xe000)
|
||||
count += 3;
|
||||
#endif /* ALLOW_BROKEN_SURROGATES */
|
||||
#if NOREVBOM
|
||||
else if ((c >= 0xe000) && (c < 0xfffe))
|
||||
#else
|
||||
@ -487,7 +510,11 @@ static int utf16_to_utf8_size(const ntfschar *ins, const int ins_len, int outs_l
|
||||
}
|
||||
}
|
||||
if (surrog)
|
||||
#if ALLOW_BROKEN_SURROGATES
|
||||
count += 3; /* ending with a single surrogate */
|
||||
#else
|
||||
goto fail;
|
||||
#endif /* ALLOW_BROKEN_SURROGATES */
|
||||
|
||||
ret = count;
|
||||
out:
|
||||
@ -548,8 +575,24 @@ static int ntfs_utf16_to_utf8(const ntfschar *ins, const int ins_len,
|
||||
*t++ = 0x80 + ((c >> 6) & 15) + ((halfpair & 3) << 4);
|
||||
*t++ = 0x80 + (c & 63);
|
||||
halfpair = 0;
|
||||
} else
|
||||
} else {
|
||||
#if ALLOW_BROKEN_SURROGATES
|
||||
/* The first UTF-16 unit of a surrogate pair has
|
||||
* a value between 0xd800 and 0xdc00. It can be
|
||||
* encoded as an individual UTF-8 sequence if we
|
||||
* cannot combine it with the next UTF-16 unit
|
||||
* unit as a surrogate pair. */
|
||||
*t++ = 0xe0 | (halfpair >> 12);
|
||||
*t++ = 0x80 | ((halfpair >> 6) & 0x3f);
|
||||
*t++ = 0x80 | (halfpair & 0x3f);
|
||||
halfpair = 0;
|
||||
|
||||
--i;
|
||||
continue;
|
||||
#else
|
||||
goto fail;
|
||||
#endif /* ALLOW_BROKEN_SURROGATES */
|
||||
}
|
||||
} else if (c < 0x80) {
|
||||
*t++ = c;
|
||||
} else {
|
||||
@ -562,6 +605,13 @@ static int ntfs_utf16_to_utf8(const ntfschar *ins, const int ins_len,
|
||||
*t++ = 0x80 | (c & 0x3f);
|
||||
} else if (c < 0xdc00)
|
||||
halfpair = c;
|
||||
#if ALLOW_BROKEN_SURROGATES
|
||||
else if (c < 0xe000) {
|
||||
*t++ = 0xe0 | (c >> 12);
|
||||
*t++ = 0x80 | ((c >> 6) & 0x3f);
|
||||
*t++ = 0x80 | (c & 0x3f);
|
||||
}
|
||||
#endif /* ALLOW_BROKEN_SURROGATES */
|
||||
else if (c >= 0xe000) {
|
||||
*t++ = 0xe0 | (c >> 12);
|
||||
*t++ = 0x80 | ((c >> 6) & 0x3f);
|
||||
@ -570,6 +620,13 @@ static int ntfs_utf16_to_utf8(const ntfschar *ins, const int ins_len,
|
||||
goto fail;
|
||||
}
|
||||
}
|
||||
#if ALLOW_BROKEN_SURROGATES
|
||||
if (halfpair) { /* ending with a single surrogate */
|
||||
*t++ = 0xe0 | (halfpair >> 12);
|
||||
*t++ = 0x80 | ((halfpair >> 6) & 0x3f);
|
||||
*t++ = 0x80 | (halfpair & 0x3f);
|
||||
}
|
||||
#endif /* ALLOW_BROKEN_SURROGATES */
|
||||
*t = '\0';
|
||||
|
||||
#if defined(__APPLE__) || defined(__DARWIN__)
|
||||
@ -693,10 +750,16 @@ static int utf8_to_unicode(u32 *wc, const char *s)
|
||||
/* Check valid ranges */
|
||||
#if NOREVBOM
|
||||
if (((*wc >= 0x800) && (*wc <= 0xD7FF))
|
||||
#if ALLOW_BROKEN_SURROGATES
|
||||
|| ((*wc >= 0xD800) && (*wc <= 0xDFFF))
|
||||
#endif /* ALLOW_BROKEN_SURROGATES */
|
||||
|| ((*wc >= 0xe000) && (*wc <= 0xFFFD)))
|
||||
return 3;
|
||||
#else
|
||||
if (((*wc >= 0x800) && (*wc <= 0xD7FF))
|
||||
#if ALLOW_BROKEN_SURROGATES
|
||||
|| ((*wc >= 0xD800) && (*wc <= 0xDFFF))
|
||||
#endif /* ALLOW_BROKEN_SURROGATES */
|
||||
|| ((*wc >= 0xe000) && (*wc <= 0xFFFF)))
|
||||
return 3;
|
||||
#endif
|
||||
|
Loading…
Reference in New Issue
Block a user