From d9c61dd60ec484909f70b7a916ada3a93af94b60 Mon Sep 17 00:00:00 2001 From: Erik Larsson Date: Fri, 8 Apr 2016 05:39:48 +0200 Subject: [PATCH] unistr.c: Enable encoding broken UTF-16 into broken UTF-8, A.K.A. WTF-8. Windows filenames may contain invalid UTF-16 sequences (specifically broken surrogate pairs), which cannot be converted to UTF-8 if we do strict conversion. This patch enables encoding broken UTF-16 into similarly broken UTF-8 by encoding any surrogate character that doesn't have a match into a separate 3-byte UTF-8 sequence. This is "sort of" valid UTF-8, but not valid Unicode since the code points used for surrogate pair encoding are not supposed to occur in a valid Unicode string... but on the other hand the source UTF-16 data is also broken, so we aren't really making things any worse. This format is sometimes referred to as WTF-8 (Wobbly Transformation Format, 8-bit encoding) and is a common solution to represent broken UTF-16 as UTF-8. It is a lossless round-trip conversion, i.e. converting from broken UTF-16 to "WTF-8" and back to UTF-16 yields the same broken UTF-16 sequence. Because of this property it enables accessing these files by filename through ntfs-3g and the ntfsprogs (e.g. ls -la works as expected). To disable this behaviour you can pass the preprocessor/compiler flag '-DALLOW_BROKEN_SURROGATES=0' when building ntfs-3g. --- libntfs-3g/unistr.c | 67 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 65 insertions(+), 2 deletions(-) diff --git a/libntfs-3g/unistr.c b/libntfs-3g/unistr.c index 7f278cd1..71802aa7 100644 --- a/libntfs-3g/unistr.c +++ b/libntfs-3g/unistr.c @@ -61,6 +61,11 @@ #define NOREVBOM 0 /* JPA rejecting U+FFFE and U+FFFF, open to debate */ +#ifndef ALLOW_BROKEN_SURROGATES +/* Erik allowing broken UTF-16 surrogate pairs by default, open to debate.
*/ +#define ALLOW_BROKEN_SURROGATES 1 +#endif /* !defined(ALLOW_BROKEN_SURROGATES) */ + /* * IMPORTANT * ========= @@ -462,8 +467,22 @@ static int utf16_to_utf8_size(const ntfschar *ins, const int ins_len, int outs_l if ((c >= 0xdc00) && (c < 0xe000)) { surrog = FALSE; count += 4; - } else + } else { +#if ALLOW_BROKEN_SURROGATES + /* The first UTF-16 unit of a surrogate pair has + * a value between 0xd800 and 0xdc00. It can be + * encoded as an individual UTF-8 sequence if we + * cannot combine it with the next UTF-16 unit + * as a surrogate pair. */ + surrog = FALSE; + count += 3; + + --i; + continue; +#else goto fail; +#endif /* ALLOW_BROKEN_SURROGATES */ + } } else if (c < 0x80) count++; @@ -473,6 +492,10 @@ static int utf16_to_utf8_size(const ntfschar *ins, const int ins_len, int outs_l count += 3; else if (c < 0xdc00) surrog = TRUE; +#if ALLOW_BROKEN_SURROGATES + else if (c < 0xe000) + count += 3; +#endif /* ALLOW_BROKEN_SURROGATES */ #if NOREVBOM else if ((c >= 0xe000) && (c < 0xfffe)) #else @@ -487,7 +510,11 @@ static int utf16_to_utf8_size(const ntfschar *ins, const int ins_len, int outs_l } } if (surrog) +#if ALLOW_BROKEN_SURROGATES + count += 3; /* ending with a single surrogate */ +#else goto fail; +#endif /* ALLOW_BROKEN_SURROGATES */ ret = count; out: @@ -548,8 +575,24 @@ static int ntfs_utf16_to_utf8(const ntfschar *ins, const int ins_len, *t++ = 0x80 + ((c >> 6) & 15) + ((halfpair & 3) << 4); *t++ = 0x80 + (c & 63); halfpair = 0; - } else + } else { +#if ALLOW_BROKEN_SURROGATES + /* The first UTF-16 unit of a surrogate pair has + * a value between 0xd800 and 0xdc00. It can be + * encoded as an individual UTF-8 sequence if we + * cannot combine it with the next UTF-16 unit + * as a surrogate pair.
*/ + *t++ = 0xe0 | (halfpair >> 12); + *t++ = 0x80 | ((halfpair >> 6) & 0x3f); + *t++ = 0x80 | (halfpair & 0x3f); + halfpair = 0; + + --i; + continue; +#else goto fail; +#endif /* ALLOW_BROKEN_SURROGATES */ + } } else if (c < 0x80) { *t++ = c; } else { @@ -562,6 +605,13 @@ static int ntfs_utf16_to_utf8(const ntfschar *ins, const int ins_len, *t++ = 0x80 | (c & 0x3f); } else if (c < 0xdc00) halfpair = c; +#if ALLOW_BROKEN_SURROGATES + else if (c < 0xe000) { + *t++ = 0xe0 | (c >> 12); + *t++ = 0x80 | ((c >> 6) & 0x3f); + *t++ = 0x80 | (c & 0x3f); + } +#endif /* ALLOW_BROKEN_SURROGATES */ else if (c >= 0xe000) { *t++ = 0xe0 | (c >> 12); *t++ = 0x80 | ((c >> 6) & 0x3f); @@ -570,6 +620,13 @@ static int ntfs_utf16_to_utf8(const ntfschar *ins, const int ins_len, goto fail; } } +#if ALLOW_BROKEN_SURROGATES + if (halfpair) { /* ending with a single surrogate */ + *t++ = 0xe0 | (halfpair >> 12); + *t++ = 0x80 | ((halfpair >> 6) & 0x3f); + *t++ = 0x80 | (halfpair & 0x3f); + } +#endif /* ALLOW_BROKEN_SURROGATES */ *t = '\0'; #if defined(__APPLE__) || defined(__DARWIN__) @@ -693,10 +750,16 @@ static int utf8_to_unicode(u32 *wc, const char *s) /* Check valid ranges */ #if NOREVBOM if (((*wc >= 0x800) && (*wc <= 0xD7FF)) +#if ALLOW_BROKEN_SURROGATES + || ((*wc >= 0xD800) && (*wc <= 0xDFFF)) +#endif /* ALLOW_BROKEN_SURROGATES */ || ((*wc >= 0xe000) && (*wc <= 0xFFFD))) return 3; #else if (((*wc >= 0x800) && (*wc <= 0xD7FF)) +#if ALLOW_BROKEN_SURROGATES + || ((*wc >= 0xD800) && (*wc <= 0xDFFF)) +#endif /* ALLOW_BROKEN_SURROGATES */ || ((*wc >= 0xe000) && (*wc <= 0xFFFF))) return 3; #endif