Integrated full utf-8 to utf-16le conversions, based on code by Berhard Kaindl

This commit is contained in:
jpandre 2008-08-21 12:04:51 +00:00
parent f4bd4e5b52
commit 13552eba52
3 changed files with 326 additions and 1 deletions

View File

@ -65,5 +65,7 @@ extern ntfschar *ntfs_str2ucs(const char *s, int *len);
extern void ntfs_ucsfree(ntfschar *ucs);
extern int ntfs_set_char_encoding(const char *locale);
#endif /* defined _NTFS_UNISTR_H */

View File

@ -3,6 +3,8 @@
*
* Copyright (c) 2000-2004 Anton Altaparmakov
* Copyright (c) 2002-2008 Szabolcs Szakacsits
* Copyright (c) 2008 Jean-Pierre Andre
* Copyright (c) 2008 Bernhard Kaindl
*
* This program/include file is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as published
@ -39,6 +41,9 @@
#ifdef HAVE_ERRNO_H
#include <errno.h>
#endif
#ifdef HAVE_LOCALE_H
#include <locale.h>
#endif
#include "attrib.h"
#include "types.h"
@ -47,6 +52,8 @@
#include "logging.h"
#include "misc.h"
#define NOREVBOM 0 /* JPA rejecting U+FFFE and U+FFFF, open to debate */
/*
* IMPORTANT
* =========
@ -55,6 +62,8 @@
* encoding inside the strings!!!
*/
static int use_utf8 = 1; /* use UTF-8 encoding for file names */
/*
* This is used by the name collation functions to quickly determine what
* characters are (in)valid.
@ -378,6 +387,293 @@ int ntfs_file_values_compare(const FILE_NAME_ATTR *file_name_attr1,
err_val, ic, upcase, upcase_len);
}
/*
#
# NTFS uses Unicode (UTF-16LE [NTFS-3G uses UCS-2LE, which is enough
# for now]) for path names, but the Unicode code points need to be
# converted before a path can be accessed under NTFS. For 7 bit ASCII/ANSI,
# glibc does this even without a locale in a hard-coded fashion as that
# appears to be is easy because the low 7-bit ASCII range appears to be
# available # in all charsets but it does not convert anything if
# there was some error with the locale setup or none set up like
# when mount is called during early boot where he (by policy) do
# not use locales (and may be not available if /usr is not yet mounted),
# so this patch fixes the resulting issues for systems which use
# UTF-8 and for others, specifying the locale in fstab brings them
# the encoding which they want.
#
# If no locale is defined or there was a problem with setting one
# up and whenever nl_langinfo(CODESET) returns a sting starting with
# "ANSI", use an internal UCS-2LE <-> UTF-8 codeset converter to fix
# the bug where NTFS-3G does not show any path names which include
# international characters!!! (and also fails on creating them) as result.
#
# Author: Bernhard Kaindl <bk@suse.de>
#
*/
/* Return the amount of 16-bit elements in UTF-16LE needed (without
* the terminating null to store given UTF-8 string and -1 if it does
* not fit into PATH_MAX
*
* JPA : made compliant with RFC3629 / RFC2781
*/
static int utf16_to_utf8_size(const ntfschar *ins, const int ins_len, int outs_len)
{
int i;
int count = 0;
BOOL surrog;
surrog = FALSE;
for (i = 0; i < ins_len && ins[i]; i++) {
unsigned short c = le16_to_cpu(ins[i]);
if (surrog) {
if ((c >= 0xdc00) && (c < 0xe000)) {
surrog = FALSE;
count += 4;
} else goto fail;
} else
if (c < 0x80)
count++;
else if (c < 0x800)
count += 2;
else if (c < 0xd800)
count += 3;
else if (c < 0xdc00)
surrog = TRUE;
#if NOREVBOM
else if ((c >= 0xe000) && (c < 0xfffe))
#else
else if (c >= 0xe000)
#endif
count += 3;
else goto fail;
if (count > outs_len)
goto fail;
}
if (surrog) goto fail;
return count;
fail:
return -1;
}
/*
* ntfs_utf16_to_utf8 - convert a little endian UTF16LE string to an UTF-8 string
* @ins: input utf16 string buffer
* @ins_len: length of input string in utf16 characters
* @outs: on return contains the (allocated) output multibyte string
* @outs_len: length of output buffer in bytes
*
* JPA : made compliant with RFC3629 / RFC2781
*/
static int ntfs_utf16_to_utf8(const ntfschar *ins, const int ins_len,
char **outs, int outs_len)
{
char *t;
int i, size;
ntfschar halfpair;
halfpair = 0;
if (!*outs)
outs_len = PATH_MAX;
size = utf16_to_utf8_size(ins, ins_len, outs_len);
if (size < 0) {
errno = ENAMETOOLONG;
goto fail;
}
if (!*outs)
*outs = ntfs_malloc((outs_len = size + 1));
t = *outs;
for (i = 0; i < ins_len && ins[i]; i++) {
unsigned short c = le16_to_cpu(ins[i]);
/* size not double-checked */
if (halfpair) {
if ((c >= 0xdc00) && (c < 0xe000)) {
*t++ = 0xf0 + (((halfpair + 64) >> 8) & 7);
*t++ = 0x80 + (((halfpair + 64) >> 2) & 63);
*t++ = 0x80 + ((c >> 6) & 15) + ((halfpair & 3) << 4);
*t++ = 0x80 + (c & 63);
halfpair = 0;
} else goto fail;
} else if (c < 0x80) {
*t++ = c;
} else {
if (c < 0x800) {
*t++ = (0xc0 | ((c >> 6) & 0x3f));
*t++ = 0x80 | (c & 0x3f);
} else if (c < 0xd800) {
*t++ = 0xe0 | (c >> 12);
*t++ = 0x80 | ((c >> 6) & 0x3f);
*t++ = 0x80 | (c & 0x3f);
} else if (c < 0xdc00)
halfpair = c;
else if (c >= 0xe000) {
*t++ = 0xe0 | (c >> 12);
*t++ = 0x80 | ((c >> 6) & 0x3f);
*t++ = 0x80 | (c & 0x3f);
} else goto fail;
}
}
*t = '\0';
return t - *outs;
fail:
return -1;
}
/* Return the amount of 16-bit elements in UTF-16LE needed (without
* the terminating null to store given UTF-8 string and -1 if it does
* not fit into PATH_MAX
*
* Note : this does not check whether the input sequence is a valid utf8
* string, and should be used only in context where such check is made
*
* JPA : made compliant with RFC3629 / RFC2781
*
*/
static int utf8_to_utf16_size(const char *s)
{
unsigned int byte;
size_t count = 0;
while ((byte = *((const unsigned char *)s++))) {
if (++count >= PATH_MAX || byte >= 0xF5)
goto fail;
if (!*s) break;
if (byte >= 0xC0) s++;
if (!*s) break;
if (byte >= 0xE0) s++;
if (!*s) break;
if (byte >= 0xF0) {
s++;
if (++count >= PATH_MAX)
goto fail;
}
}
return count;
fail:
return -1;
}
/* This converts one UTF-8 sequence to cpu-endian Unicode value
* within range U+0 .. U+10ffff and excluding U+D800 .. U+DFFF
* Returns the number of used utf8 bytes or -1 if sequence is invalid
*
* JPA : made compliant with RFC3629 / RFC2781
*/
static int utf8_to_unicode(u32 *wc, const char *s)
{
unsigned int byte = *((const unsigned char *)s);
/* single byte */
if (byte == 0) {
*wc = (u32) 0;
return 0;
} else if (byte < 0x80) {
*wc = (u32) byte;
return 1;
/* double byte */
} else if (byte < 0xc2) {
goto fail;
} else if (byte < 0xE0) {
if (strlen(s) < 2)
goto fail;
if ((s[1] & 0xC0) == 0x80) {
*wc = ((u32)(byte & 0x1F) << 6)
| ((u32)(s[1] & 0x3F));
return 2;
} else
goto fail;
/* three-byte */
} else if (byte < 0xF0) {
if (strlen(s) < 3)
goto fail;
if (((s[1] & 0xC0) == 0x80) && ((s[2] & 0xC0) == 0x80)) {
*wc = ((u32)(byte & 0x0F) << 12)
| ((u32)(s[1] & 0x3F) << 6)
| ((u32)(s[2] & 0x3F));
/* Check valid ranges */
#if NOREVBOM
if (((*wc >= 0x800) && (*wc <= 0xD7FF))
|| ((*wc >= 0xe000) && (*wc <= 0xFFFD)))
return 3;
#else
if (((*wc >= 0x800) && (*wc <= 0xD7FF))
|| ((*wc >= 0xe000) && (*wc <= 0xFFFF)))
return 3;
#endif
}
goto fail;
/* four-byte */
} else if (byte < 0xF5) {
if (strlen(s) < 4)
goto fail;
if (((s[1] & 0xC0) == 0x80) && ((s[2] & 0xC0) == 0x80)
&& ((s[3] & 0xC0) == 0x80)) {
*wc = ((u32)(byte & 0x07) << 18)
| ((u32)(s[1] & 0x3F) << 12)
| ((u32)(s[2] & 0x3F) << 6)
| ((u32)(s[3] & 0x3F));
/* Check valid ranges */
if ((*wc <= 0x10ffff) && (*wc >= 0x10000))
return 4;
}
goto fail;
}
fail:
return -1;
}
/**
* ntfs_utf8_to_utf16 - convert a UTF-8 string to a UTF-16LE string
* @ins: input multibyte string buffer
* @outs: on return contains the (allocated) output utf16 string
* @outs_len: length of output buffer in utf16 characters
*
* JPA : made compliant with RFC3629 / RFC2781
*/
static int ntfs_utf8_to_utf16(const char *ins, ntfschar **outs)
{
const char *t = ins;
u32 wc;
ntfschar *outpos;
int shorts = utf8_to_utf16_size(ins);
if (shorts < 0) {
errno = EILSEQ;
goto fail;
}
if (!*outs)
*outs = ntfs_malloc((shorts+1) * sizeof(ntfschar));
outpos = *outs;
while(1) {
int m = utf8_to_unicode(&wc, t);
if (m < 0) {
errno = EILSEQ;
goto fail;
}
if (wc < 0x10000)
*outpos++ = cpu_to_le16(wc);
else {
wc -= 0x10000;
*outpos++ = cpu_to_le16((wc >> 10) + 0xd800);
*outpos++ = cpu_to_le16((wc & 0x3ff) + 0xdc00);
}
if (m == 0)
break;
t += m;
}
return --outpos - *outs;
fail:
return -1;
}
/**
* ntfs_ucstombs - convert a little endian Unicode string to a multibyte string
* @ins: input Unicode string buffer
@ -424,6 +720,8 @@ int ntfs_ucstombs(const ntfschar *ins, const int ins_len, char **outs,
errno = ENAMETOOLONG;
return -1;
}
if (use_utf8)
return ntfs_utf16_to_utf8(ins, ins_len, outs, outs_len);
if (!mbs) {
mbs_len = (ins_len + 1) * MB_CUR_MAX;
mbs = ntfs_malloc(mbs_len);
@ -530,6 +828,9 @@ int ntfs_mbstoucs(const char *ins, ntfschar **outs)
return -1;
}
if (use_utf8)
return ntfs_utf8_to_utf16(ins, outs);
/* Determine the size of the multi-byte string in bytes. */
ins_size = strlen(ins);
/* Determine the length of the multi-byte string. */
@ -739,3 +1040,25 @@ void ntfs_ucsfree(ntfschar *ucs)
free(ucs);
}
/*
* Define the character encoding to be used
*
* Using UTF-8 unless specified otherwise
*/
int ntfs_set_char_encoding(const char *locale)
{
use_utf8 = 0;
if (!locale || strstr(locale,"utf8") || strstr(locale,"UTF8")
|| strstr(locale,"utf-8") || strstr(locale,"UTF-8"))
use_utf8 = 1;
else
if (setlocale(LC_ALL, locale))
use_utf8 = 0;
else {
ntfs_log_error("Invalid locale, encoding to UTF-8\n");
use_utf8 = 1;
}
return (0); /* always successful */
}

View File

@ -2352,7 +2352,7 @@ static char *parse_mount_options(const char *orig_opts)
} else if (!strcmp(opt, "locale")) {
if (missing_option_value(val, "locale"))
goto err_exit;
if (!setlocale(LC_ALL, val))
if (ntfs_set_char_encoding(LC_ALL, val))
ntfs_log_error(locale_msg, val);
} else if (!strcmp(opt, "streams_interface")) {
if (missing_option_value(val, "streams_interface"))