unicode: s/FEATURE_ASSUME_UNICODE/UNICODE_SUPPORT, add UNICODE_USING_LOCALE

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
This commit is contained in:
Denys Vlasenko 2010-03-26 14:06:56 +01:00
parent aa167556cd
commit 19158a837d
19 changed files with 481 additions and 596 deletions

View File

@ -119,7 +119,7 @@ config LOCALE_SUPPORT
Enable this if your system has locale support and you would like Enable this if your system has locale support and you would like
busybox to support locale settings. busybox to support locale settings.
config FEATURE_ASSUME_UNICODE config UNICODE_SUPPORT
bool "Support Unicode" bool "Support Unicode"
default n default n
help help
@ -131,10 +131,18 @@ config FEATURE_ASSUME_UNICODE
Probably by the time when busybox will be fully Unicode-clean, Probably by the time when busybox will be fully Unicode-clean,
other encodings will be mainly of historic interest. other encodings will be mainly of historic interest.
config UNICODE_USING_LOCALE
bool "Use libc routines for Unicode (else uses internal ones)"
default n
depends on UNICODE_SUPPORT && LOCALE_SUPPORT
help
With this option on, Unicode support is implemented using libc
routines. Otherwise, internal implementation is used.
config FEATURE_CHECK_UNICODE_IN_ENV config FEATURE_CHECK_UNICODE_IN_ENV
bool "Check $LANG environment variable" bool "Check $LANG environment variable"
default y default y
depends on FEATURE_ASSUME_UNICODE && !LOCALE_SUPPORT depends on UNICODE_SUPPORT && !UNICODE_USING_LOCALE
help help
With this option on, Unicode support is activated With this option on, Unicode support is activated
only if LANG variable has the value of the form "xxxx.utf8" only if LANG variable has the value of the form "xxxx.utf8"
@ -143,7 +151,7 @@ config FEATURE_CHECK_UNICODE_IN_ENV
config SUBST_WCHAR config SUBST_WCHAR
int "Character code to substitute unprintable characters with" int "Character code to substitute unprintable characters with"
depends on FEATURE_ASSUME_UNICODE depends on UNICODE_SUPPORT
default 63 default 63
help help
Typical values are 63 for '?' (works with any output device), Typical values are 63 for '?' (works with any output device),
@ -152,7 +160,7 @@ config SUBST_WCHAR
config LAST_SUPPORTED_WCHAR config LAST_SUPPORTED_WCHAR
int "Range of supported Unicode characters" int "Range of supported Unicode characters"
depends on FEATURE_ASSUME_UNICODE depends on UNICODE_SUPPORT
default 767 default 767
help help
Any character with Unicode value bigger than this is assumed Any character with Unicode value bigger than this is assumed
@ -183,7 +191,7 @@ config LAST_SUPPORTED_WCHAR
config UNICODE_COMBINING_WCHARS config UNICODE_COMBINING_WCHARS
bool "Allow zero-width Unicode characters on output" bool "Allow zero-width Unicode characters on output"
default n default n
depends on FEATURE_ASSUME_UNICODE depends on UNICODE_SUPPORT
help help
With this option off, any Unicode char with width of 0 With this option off, any Unicode char with width of 0
is substituted on output. is substituted on output.
@ -191,7 +199,7 @@ config UNICODE_COMBINING_WCHARS
config UNICODE_WIDE_WCHARS config UNICODE_WIDE_WCHARS
bool "Allow wide Unicode characters on output" bool "Allow wide Unicode characters on output"
default n default n
depends on FEATURE_ASSUME_UNICODE depends on UNICODE_SUPPORT
help help
With this option off, any Unicode char with width > 1 With this option off, any Unicode char with width > 1
is substituted on output. is substituted on output.
@ -199,7 +207,7 @@ config UNICODE_WIDE_WCHARS
config UNICODE_BIDI_SUPPORT config UNICODE_BIDI_SUPPORT
bool "Bidirectional character-aware line input" bool "Bidirectional character-aware line input"
default n default n
depends on FEATURE_ASSUME_UNICODE && !LOCALE_SUPPORT depends on UNICODE_SUPPORT && !UNICODE_USING_LOCALE
help help
With this option on, right-to-left Unicode characters With this option on, right-to-left Unicode characters
are treated differently on input (e.g. cursor movement). are treated differently on input (e.g. cursor movement).

4
TODO
View File

@ -324,8 +324,8 @@ This is useful if you build against uclibc with locale support disabled.
Unicode-dependent applets must call check_unicode_in_env() when they Unicode-dependent applets must call check_unicode_in_env() when they
begin executing. begin executing.
Applet code may conditionalize on FEATURE_ASSUME_UNICODE Applet code may conditionalize on UNICODE_SUPPORT in order to use
in order to use more efficient code if unicode support is not requested. more efficient code if unicode support is not requested.
Available functions (if you need more, implement them in libbb/unicode.c Available functions (if you need more, implement them in libbb/unicode.c
so that they work without LOCALE_SUPPORT too): so that they work without LOCALE_SUPPORT too):

View File

@ -24,7 +24,7 @@ CONFIG_FEATURE_VERBOSE_USAGE=y
CONFIG_FEATURE_COMPRESS_USAGE=y CONFIG_FEATURE_COMPRESS_USAGE=y
CONFIG_FEATURE_INSTALLER=y CONFIG_FEATURE_INSTALLER=y
# CONFIG_LOCALE_SUPPORT is not set # CONFIG_LOCALE_SUPPORT is not set
# CONFIG_FEATURE_ASSUME_UNICODE is not set # CONFIG_UNICODE_SUPPORT is not set
# CONFIG_FEATURE_CHECK_UNICODE_IN_ENV is not set # CONFIG_FEATURE_CHECK_UNICODE_IN_ENV is not set
CONFIG_LONG_OPTS=y CONFIG_LONG_OPTS=y
CONFIG_FEATURE_DEVPTS=y CONFIG_FEATURE_DEVPTS=y

View File

@ -87,8 +87,8 @@ int cal_main(int argc UNUSED_PARAM, char **argv)
/* "Su Mo Tu We Th Fr Sa" */ /* "Su Mo Tu We Th Fr Sa" */
/* -j heading: */ /* -j heading: */
/* " Su Mo Tu We Th Fr Sa" */ /* " Su Mo Tu We Th Fr Sa" */
char day_headings[ENABLE_FEATURE_ASSUME_UNICODE ? 28 * 6 : 28]; char day_headings[ENABLE_UNICODE_SUPPORT ? 28 * 6 : 28];
IF_FEATURE_ASSUME_UNICODE(char *hp = day_headings;) IF_UNICODE_SUPPORT(char *hp = day_headings;)
char buf[40]; char buf[40];
init_unicode(); init_unicode();
@ -134,7 +134,7 @@ int cal_main(int argc UNUSED_PARAM, char **argv)
zero_tm.tm_wday = i; zero_tm.tm_wday = i;
/* abbreviated weekday name according to locale */ /* abbreviated weekday name according to locale */
strftime(buf, sizeof(buf), "%a", &zero_tm); strftime(buf, sizeof(buf), "%a", &zero_tm);
#if ENABLE_FEATURE_ASSUME_UNICODE #if ENABLE_UNICODE_SUPPORT
if (julian) if (julian)
*hp++ = ' '; *hp++ = ' ';
{ {
@ -149,7 +149,7 @@ int cal_main(int argc UNUSED_PARAM, char **argv)
#endif #endif
} }
} while (++i < 12); } while (++i < 12);
IF_FEATURE_ASSUME_UNICODE(hp[-1] = '\0';) IF_UNICODE_SUPPORT(hp[-1] = '\0';)
if (month) { if (month) {
unsigned row, len, days[MAXDAYS]; unsigned row, len, days[MAXDAYS];

View File

@ -174,7 +174,7 @@ int df_main(int argc UNUSED_PARAM, char **argv)
} }
#endif #endif
#if ENABLE_FEATURE_ASSUME_UNICODE #if ENABLE_UNICODE_SUPPORT
{ {
uni_stat_t uni_stat; uni_stat_t uni_stat;
char *uni_dev = unicode_conv_to_printable(&uni_stat, device); char *uni_dev = unicode_conv_to_printable(&uni_stat, device);

View File

@ -48,7 +48,7 @@ static void expand(FILE *file, unsigned tab_size, unsigned opt)
if (c == '\t') { if (c == '\t') {
unsigned len; unsigned len;
*ptr = '\0'; *ptr = '\0';
# if ENABLE_FEATURE_ASSUME_UNICODE # if ENABLE_UNICODE_SUPPORT
{ {
uni_stat_t uni_stat; uni_stat_t uni_stat;
printable_string(&uni_stat, ptr_strbeg); printable_string(&uni_stat, ptr_strbeg);
@ -107,7 +107,7 @@ static void unexpand(FILE *file, unsigned tab_size, unsigned opt)
} }
n = strcspn(ptr, "\t "); n = strcspn(ptr, "\t ");
printf("%*s%.*s", len, "", n, ptr); printf("%*s%.*s", len, "", n, ptr);
# if ENABLE_FEATURE_ASSUME_UNICODE # if ENABLE_UNICODE_SUPPORT
{ {
char c; char c;
uni_stat_t uni_stat; uni_stat_t uni_stat;

View File

@ -5,7 +5,7 @@
#ifndef UNICODE_H #ifndef UNICODE_H
#define UNICODE_H 1 #define UNICODE_H 1
#if ENABLE_LOCALE_SUPPORT #if ENABLE_UNICODE_USING_LOCALE
# include <wchar.h> # include <wchar.h>
# include <wctype.h> # include <wctype.h>
#endif #endif
@ -21,7 +21,7 @@ enum {
#define unicode_bidi_isrtl(wc) 0 #define unicode_bidi_isrtl(wc) 0
#define unicode_bidi_is_neutral_wchar(wc) (wc <= 126 && !isalpha(wc)) #define unicode_bidi_is_neutral_wchar(wc) (wc <= 126 && !isalpha(wc))
#if !ENABLE_FEATURE_ASSUME_UNICODE #if !ENABLE_UNICODE_SUPPORT
# define unicode_strlen(string) strlen(string) # define unicode_strlen(string) strlen(string)
# define unicode_status UNICODE_OFF # define unicode_status UNICODE_OFF
@ -50,7 +50,7 @@ char* FAST_FUNC unicode_conv_to_printable(uni_stat_t *stats, const char *src);
char* FAST_FUNC unicode_conv_to_printable_maxwidth(uni_stat_t *stats, const char *src, unsigned maxwidth); char* FAST_FUNC unicode_conv_to_printable_maxwidth(uni_stat_t *stats, const char *src, unsigned maxwidth);
char* FAST_FUNC unicode_conv_to_printable_fixedwidth(uni_stat_t *stats, const char *src, unsigned width); char* FAST_FUNC unicode_conv_to_printable_fixedwidth(uni_stat_t *stats, const char *src, unsigned width);
# if ENABLE_LOCALE_SUPPORT # if ENABLE_UNICODE_USING_LOCALE
extern uint8_t unicode_status; extern uint8_t unicode_status;
void init_unicode(void) FAST_FUNC; void init_unicode(void) FAST_FUNC;
@ -102,9 +102,9 @@ int unicode_bidi_is_neutral_wchar(wint_t wc) FAST_FUNC;
# endif # endif
# endif /* !LOCALE_SUPPORT */ # endif /* !UNICODE_USING_LOCALE */
#endif /* FEATURE_ASSUME_UNICODE */ #endif /* UNICODE_SUPPORT */
POP_SAVED_FUNCTION_VISIBILITY POP_SAVED_FUNCTION_VISIBILITY

View File

@ -124,7 +124,7 @@ lib-y += xrealloc_vector.o
# and objects which may fail to build (SELinux on selinux-less system) # and objects which may fail to build (SELinux on selinux-less system)
lib-$(CONFIG_SELINUX) += selinux_common.o lib-$(CONFIG_SELINUX) += selinux_common.o
lib-$(CONFIG_FEATURE_MTAB_SUPPORT) += mtab.o lib-$(CONFIG_FEATURE_MTAB_SUPPORT) += mtab.o
lib-$(CONFIG_FEATURE_ASSUME_UNICODE) += unicode.o lib-$(CONFIG_UNICODE_SUPPORT) += unicode.o
lib-$(CONFIG_FEATURE_CHECK_NAMES) += die_if_bad_username.o lib-$(CONFIG_FEATURE_CHECK_NAMES) += die_if_bad_username.o
lib-$(CONFIG_LOSETUP) += loop.o lib-$(CONFIG_LOSETUP) += loop.o

View File

@ -67,7 +67,7 @@
#undef CHAR_T #undef CHAR_T
#if ENABLE_FEATURE_ASSUME_UNICODE #if ENABLE_UNICODE_SUPPORT
# define BB_NUL L'\0' # define BB_NUL L'\0'
# define CHAR_T wchar_t # define CHAR_T wchar_t
static bool BB_isspace(CHAR_T c) { return ((unsigned)c < 256 && isspace(c)); } static bool BB_isspace(CHAR_T c) { return ((unsigned)c < 256 && isspace(c)); }
@ -202,7 +202,7 @@ static void deinit_S(void)
#define DEINIT_S() deinit_S() #define DEINIT_S() deinit_S()
#if ENABLE_FEATURE_ASSUME_UNICODE #if ENABLE_UNICODE_SUPPORT
static size_t load_string(const char *src, int maxsize) static size_t load_string(const char *src, int maxsize)
{ {
ssize_t len = mbstowcs(command_ps, src, maxsize - 1); ssize_t len = mbstowcs(command_ps, src, maxsize - 1);
@ -932,7 +932,7 @@ static void input_tab(smallint *lastWasTab)
#define matchBuf (S.input_tab__matchBuf) #define matchBuf (S.input_tab__matchBuf)
int find_type; int find_type;
int recalc_pos; int recalc_pos;
#if ENABLE_FEATURE_ASSUME_UNICODE #if ENABLE_UNICODE_SUPPORT
/* cursor pos in command converted to multibyte form */ /* cursor pos in command converted to multibyte form */
int cursor_mb; int cursor_mb;
#endif #endif
@ -942,7 +942,7 @@ static void input_tab(smallint *lastWasTab)
/* Make a local copy of the string -- /* Make a local copy of the string --
* up to the position of the cursor */ * up to the position of the cursor */
save_string(matchBuf, cursor + 1); save_string(matchBuf, cursor + 1);
#if ENABLE_FEATURE_ASSUME_UNICODE #if ENABLE_UNICODE_SUPPORT
cursor_mb = strlen(matchBuf); cursor_mb = strlen(matchBuf);
#endif #endif
tmp = matchBuf; tmp = matchBuf;
@ -1015,7 +1015,7 @@ static void input_tab(smallint *lastWasTab)
} }
len_found = strlen(tmp); len_found = strlen(tmp);
#if !ENABLE_FEATURE_ASSUME_UNICODE #if !ENABLE_UNICODE_SUPPORT
/* have space to place the match? */ /* have space to place the match? */
/* The result consists of three parts with these lengths: */ /* The result consists of three parts with these lengths: */
/* (cursor - recalc_pos) + len_found + (command_len - cursor) */ /* (cursor - recalc_pos) + len_found + (command_len - cursor) */
@ -1088,7 +1088,7 @@ static void save_command_ps_at_cur_history(void)
int cur = state->cur_history; int cur = state->cur_history;
free(state->history[cur]); free(state->history[cur]);
# if ENABLE_FEATURE_ASSUME_UNICODE # if ENABLE_UNICODE_SUPPORT
{ {
char tbuf[MAX_LINELEN]; char tbuf[MAX_LINELEN];
save_string(tbuf, sizeof(tbuf)); save_string(tbuf, sizeof(tbuf));
@ -1659,7 +1659,7 @@ static int lineedit_read_key(char *read_key_buffer)
{ {
int64_t ic; int64_t ic;
int timeout = -1; int timeout = -1;
#if ENABLE_FEATURE_ASSUME_UNICODE #if ENABLE_UNICODE_SUPPORT
char unicode_buf[MB_CUR_MAX + 1]; char unicode_buf[MB_CUR_MAX + 1];
int unicode_idx = 0; int unicode_idx = 0;
#endif #endif
@ -1674,7 +1674,7 @@ static int lineedit_read_key(char *read_key_buffer)
*/ */
ic = read_key(STDIN_FILENO, read_key_buffer, timeout); ic = read_key(STDIN_FILENO, read_key_buffer, timeout);
if (errno) { if (errno) {
#if ENABLE_FEATURE_ASSUME_UNICODE #if ENABLE_UNICODE_SUPPORT
if (errno == EAGAIN && unicode_idx != 0) if (errno == EAGAIN && unicode_idx != 0)
goto pushback; goto pushback;
#endif #endif
@ -1700,7 +1700,7 @@ static int lineedit_read_key(char *read_key_buffer)
} }
#endif #endif
#if ENABLE_FEATURE_ASSUME_UNICODE #if ENABLE_UNICODE_SUPPORT
if (unicode_status == UNICODE_ON) { if (unicode_status == UNICODE_ON) {
wchar_t wc; wchar_t wc;
@ -1817,7 +1817,7 @@ int FAST_FUNC read_line_input(const char *prompt, char *command, int maxsize, li
/* prepare before init handlers */ /* prepare before init handlers */
cmdedit_y = 0; /* quasireal y, not true if line > xt*yt */ cmdedit_y = 0; /* quasireal y, not true if line > xt*yt */
command_len = 0; command_len = 0;
#if ENABLE_FEATURE_ASSUME_UNICODE #if ENABLE_UNICODE_SUPPORT
command_ps = xzalloc(maxsize * sizeof(command_ps[0])); command_ps = xzalloc(maxsize * sizeof(command_ps[0]));
#else #else
command_ps = command; command_ps = command;
@ -2199,8 +2199,8 @@ int FAST_FUNC read_line_input(const char *prompt, char *command, int maxsize, li
// } // }
// } // }
if (ic < ' ' if (ic < ' '
|| (!ENABLE_FEATURE_ASSUME_UNICODE && ic >= 256) || (!ENABLE_UNICODE_SUPPORT && ic >= 256)
|| (ENABLE_FEATURE_ASSUME_UNICODE && ic >= VI_CMDMODE_BIT) || (ENABLE_UNICODE_SUPPORT && ic >= VI_CMDMODE_BIT)
) { ) {
/* If VI_CMDMODE_BIT is set, ic is >= 256 /* If VI_CMDMODE_BIT is set, ic is >= 256
* and vi mode ignores unexpected chars. * and vi mode ignores unexpected chars.
@ -2268,7 +2268,7 @@ int FAST_FUNC read_line_input(const char *prompt, char *command, int maxsize, li
/* Stop bug catching using "command_must_not_be_used" trick */ /* Stop bug catching using "command_must_not_be_used" trick */
#undef command #undef command
#if ENABLE_FEATURE_ASSUME_UNICODE #if ENABLE_UNICODE_SUPPORT
command[0] = '\0'; command[0] = '\0';
if (command_len > 0) if (command_len > 0)
command_len = save_string(command, maxsize - 1); command_len = save_string(command, maxsize - 1);

View File

@ -36,7 +36,7 @@ const char* FAST_FUNC printable_string(uni_stat_t *stats, const char *str)
s++; s++;
} }
#if ENABLE_FEATURE_ASSUME_UNICODE #if ENABLE_UNICODE_SUPPORT
dst = unicode_conv_to_printable(stats, str); dst = unicode_conv_to_printable(stats, str);
#else #else
{ {

View File

@ -78,7 +78,7 @@ void FAST_FUNC bb_progress_update(bb_progress_t *p,
if (ratio > 100) ratio = 100; if (ratio > 100) ratio = 100;
} }
#if ENABLE_FEATURE_ASSUME_UNICODE #if ENABLE_UNICODE_SUPPORT
init_unicode(); init_unicode();
/* libbb candidate? */ /* libbb candidate? */
{ {

View File

@ -14,12 +14,12 @@
uint8_t unicode_status; uint8_t unicode_status;
#endif #endif
/* This file is compiled only if FEATURE_ASSUME_UNICODE is on. /* This file is compiled only if UNICODE_SUPPORT is on.
* We check other options and decide whether to use libc support * We check other options and decide whether to use libc support
* via locale, or use our own logic: * via locale, or use our own logic:
*/ */
#if ENABLE_LOCALE_SUPPORT #if ENABLE_UNICODE_USING_LOCALE
/* Unicode support using libc locale support. */ /* Unicode support using libc locale support. */
@ -139,7 +139,7 @@ size_t FAST_FUNC wcstombs(char *dest, const wchar_t *src, size_t n)
return org_n - n; return org_n - n;
} }
#define ERROR_WCHAR (~(wchar_t)0) # define ERROR_WCHAR (~(wchar_t)0)
static const char *mbstowc_internal(wchar_t *res, const char *src) static const char *mbstowc_internal(wchar_t *res, const char *src)
{ {
@ -239,7 +239,427 @@ int FAST_FUNC iswpunct(wint_t wc)
return (unsigned)wc <= 0x7f && ispunct(wc); return (unsigned)wc <= 0x7f && ispunct(wc);
} }
#include "unicode_wcwidth.c"
# if LAST_SUPPORTED_WCHAR >= 0x300
/* One closed range [first, last] of 16-bit code points */
struct interval {
	uint16_t first;
	uint16_t last;
};

/*
 * Binary search helper: report whether ucs lies inside any of the
 * ranges in table[0..max] (max is the index of the last element,
 * not the element count). The table must be sorted and the ranges
 * must not overlap.
 * Returns 1 if ucs is covered by some range, 0 otherwise.
 */
static int in_interval_table(unsigned ucs, const struct interval *table, unsigned max)
{
	unsigned lo = 0;
	unsigned hi = max;

	/* Cheap rejection: ucs is outside the span of the whole table */
	if (ucs < table[0].first)
		return 0;
	if (ucs > table[hi].last)
		return 0;

	for (;;) {
		unsigned probe;

		if (hi < lo)
			return 0;
		probe = (lo + hi) / 2;
		if (ucs > table[probe].last) {
			lo = probe + 1;
			continue;
		}
		if (ucs < table[probe].first) {
			/* probe cannot be 0 here: ucs >= table[0].first was checked above */
			hi = probe - 1;
			continue;
		}
		return 1;
	}
}
/*
 * Binary search in a table of small ranges packed into uint16_t.
 * Each element encodes one range as (first << 2) | (last - first),
 * i.e. the upper 14 bits hold the first code point and the low 2 bits
 * hold the range length minus one (0..3).
 * table[0..max] must be sorted; max is the index of the last element.
 * Returns 1 if ucs falls inside some encoded range, 0 otherwise.
 */
static int in_uint16_table(unsigned ucs, const uint16_t *table, unsigned max)
{
	unsigned min;
	unsigned mid;
	unsigned first, last;

	/* Quick rejection: below the start of the first range... */
	first = table[0] >> 2;
	if (ucs < first)
		return 0;
	/* ...or above the end of the LAST range (not of the first one:
	 * comparing against table[0]'s end would reject nearly every
	 * value before the search even starts) */
	last = (table[max] >> 2) + (table[max] & 3);
	if (ucs > last)
		return 0;

	min = 0;
	while (max >= min) {
		mid = (min + max) / 2;
		first = table[mid] >> 2;
		last = first + (table[mid] & 3);
		if (ucs > last)
			min = mid + 1;
		else if (ucs < first)
			max = mid - 1; /* mid > 0 here since ucs >= start of table[0] */
		else
			return 1;
	}
	return 0;
}
# endif
/*
* This is an implementation of wcwidth() and wcswidth() (defined in
* IEEE Std 1003.1-2001) for Unicode.
*
* http://www.opengroup.org/onlinepubs/007904975/functions/wcwidth.html
* http://www.opengroup.org/onlinepubs/007904975/functions/wcswidth.html
*
* In fixed-width output devices, Latin characters all occupy a single
* "cell" position of equal width, whereas ideographic CJK characters
* occupy two such cells. Interoperability between terminal-line
* applications and (teletype-style) character terminals using the
* UTF-8 encoding requires agreement on which character should advance
* the cursor by how many cell positions. No established formal
* standards exist at present on which Unicode character shall occupy
* how many cell positions on character terminals. These routines are
* a first attempt of defining such behavior based on simple rules
* applied to data provided by the Unicode Consortium.
*
* For some graphical characters, the Unicode standard explicitly
* defines a character-cell width via the definition of the East Asian
* FullWidth (F), Wide (W), Half-width (H), and Narrow (Na) classes.
* In all these cases, there is no ambiguity about which width a
* terminal shall use. For characters in the East Asian Ambiguous (A)
* class, the width choice depends purely on a preference of backward
* compatibility with either historic CJK or Western practice.
* Choosing single-width for these characters is easy to justify as
* the appropriate long-term solution, as the CJK practice of
* displaying these characters as double-width comes from historic
* implementation simplicity (8-bit encoded characters were displayed
* single-width and 16-bit ones double-width, even for Greek,
* Cyrillic, etc.) and not any typographic considerations.
*
* Much less clear is the choice of width for the Not East Asian
* (Neutral) class. Existing practice does not dictate a width for any
* of these characters. It would nevertheless make sense
* typographically to allocate two character cells to characters such
* as for instance EM SPACE or VOLUME INTEGRAL, which cannot be
* represented adequately with a single-width glyph. The following
* routines at present merely assign a single-cell width to all
* neutral characters, in the interest of simplicity. This is not
* entirely satisfactory and should be reconsidered before
* establishing a formal standard in this area. At the moment, the
* decision which Not East Asian (Neutral) characters should be
* represented by double-width glyphs cannot yet be answered by
* applying a simple rule from the Unicode database content. Setting
* up a proper standard for the behavior of UTF-8 character terminals
* will require a careful analysis not only of each Unicode character,
* but also of each presentation form, something the author of these
* routines has avoided to do so far.
*
* http://www.unicode.org/unicode/reports/tr11/
*
* Markus Kuhn -- 2007-05-26 (Unicode 5.0)
*
* Permission to use, copy, modify, and distribute this software
* for any purpose and without fee is hereby granted. The author
* disclaims all warranties with regard to this software.
*
* Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
*/
/* Assigned Unicode character ranges:
* Plane Range
* 0 0000-FFFF Basic Multilingual Plane
* 1 10000-1FFFF Supplementary Multilingual Plane
* 2 20000-2FFFF Supplementary Ideographic Plane
* 3 30000-3FFFF Tertiary Ideographic Plane (no chars assigned yet)
* 4-13 40000-DFFFF currently unassigned
* 14 E0000-EFFFF Supplementary Special-purpose Plane
* 15 F0000-FFFFF Supplementary Private Use Area-A
* 16 100000-10FFFF Supplementary Private Use Area-B
*
* "Supplementary Special-purpose Plane currently contains non-graphical
* characters in two blocks of 128 and 240 characters. The first block
* is for language tag characters for use when language cannot be indicated
* through other protocols (such as the xml:lang attribute in XML).
* The other block contains glyph variation selectors to indicate
* an alternate glyph for a character that cannot be determined by context."
*
* In simpler terms: it is a tool to fix the "Han unification" mess
* created by Unicode committee, to select Chinese/Japanese/Korean/Taiwan
* version of a character. (They forgot that the whole purpose of the Unicode
* was to be able to write all chars in one charset without such tricks).
* Until East Asian users say it is actually necessary to support these
* code points in console applications like busybox
* (i.e. do these chars ever appear in filenames, hostnames, text files
* and such?), we are treating these code points as invalid.
*
* Tertiary Ideographic Plane is also ignored for now,
* until Unicode committee assigns something there.
*/
/* The following two functions define the column width of an ISO 10646
* character as follows:
*
* - The null character (U+0000) has a column width of 0.
*
* - Other C0/C1 control characters and DEL will lead to a return
* value of -1.
*
* - Non-spacing and enclosing combining characters (general
* category code Mn or Me in the Unicode database) have a
* column width of 0.
*
* - SOFT HYPHEN (U+00AD) has a column width of 1.
*
* - Other format characters (general category code Cf in the Unicode
* database) and ZERO WIDTH SPACE (U+200B) have a column width of 0.
*
* - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF)
* have a column width of 0.
*
* - Spacing characters in the East Asian Wide (W) or East Asian
* Full-width (F) category as defined in Unicode Technical
* Report #11 have a column width of 2.
*
* - All remaining characters (including all printable
* ISO 8859-1 and WGL4 characters, Unicode control characters,
* etc.) have a column width of 1.
*
* This implementation assumes that wchar_t characters are encoded
* in ISO 10646.
*/
/*
 * Return the number of terminal columns occupied by code point ucs:
 *   0  for U+0000 and for combining/non-spacing characters (found via
 *      the tables and the explicit range checks below),
 *  -1  for C0/C1 control characters, DEL, anything above
 *      LAST_SUPPORTED_WCHAR and, when those ranges are compiled in,
 *      surrogates, the e000..f8ff private use area and noncharacters,
 *   2  for the wide ranges tested in the final return expression,
 *   1  for everything else.
 * Tables and checks that only matter for code points above
 * LAST_SUPPORTED_WCHAR are removed at compile time by the #if's.
 */
static int wcwidth(unsigned ucs)
{
# if LAST_SUPPORTED_WCHAR >= 0x300
/* sorted list of non-overlapping intervals of non-spacing characters */
/* generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" */
/* ARRAY is expanded three times with different BIG_/PAIR definitions. */
/* First expansion: keep only BIG_ entries, as { first, last } intervals */
static const struct interval combining[] = {
# define BIG_(a,b) { a, b },
# define PAIR(a,b)
# define ARRAY /* PAIR if < 0x4000 and no more than 4 chars big */ \
BIG_(0x0300, 0x036F) \
PAIR(0x0483, 0x0486) \
PAIR(0x0488, 0x0489) \
BIG_(0x0591, 0x05BD) \
PAIR(0x05BF, 0x05BF) \
PAIR(0x05C1, 0x05C2) \
PAIR(0x05C4, 0x05C5) \
PAIR(0x05C7, 0x05C7) \
PAIR(0x0600, 0x0603) \
BIG_(0x0610, 0x0615) \
BIG_(0x064B, 0x065E) \
PAIR(0x0670, 0x0670) \
BIG_(0x06D6, 0x06E4) \
PAIR(0x06E7, 0x06E8) \
PAIR(0x06EA, 0x06ED) \
PAIR(0x070F, 0x070F) \
PAIR(0x0711, 0x0711) \
BIG_(0x0730, 0x074A) \
BIG_(0x07A6, 0x07B0) \
BIG_(0x07EB, 0x07F3) \
PAIR(0x0901, 0x0902) \
PAIR(0x093C, 0x093C) \
BIG_(0x0941, 0x0948) \
PAIR(0x094D, 0x094D) \
PAIR(0x0951, 0x0954) \
PAIR(0x0962, 0x0963) \
PAIR(0x0981, 0x0981) \
PAIR(0x09BC, 0x09BC) \
PAIR(0x09C1, 0x09C4) \
PAIR(0x09CD, 0x09CD) \
PAIR(0x09E2, 0x09E3) \
PAIR(0x0A01, 0x0A02) \
PAIR(0x0A3C, 0x0A3C) \
PAIR(0x0A41, 0x0A42) \
PAIR(0x0A47, 0x0A48) \
PAIR(0x0A4B, 0x0A4D) \
PAIR(0x0A70, 0x0A71) \
PAIR(0x0A81, 0x0A82) \
PAIR(0x0ABC, 0x0ABC) \
BIG_(0x0AC1, 0x0AC5) \
PAIR(0x0AC7, 0x0AC8) \
PAIR(0x0ACD, 0x0ACD) \
PAIR(0x0AE2, 0x0AE3) \
PAIR(0x0B01, 0x0B01) \
PAIR(0x0B3C, 0x0B3C) \
PAIR(0x0B3F, 0x0B3F) \
PAIR(0x0B41, 0x0B43) \
PAIR(0x0B4D, 0x0B4D) \
PAIR(0x0B56, 0x0B56) \
PAIR(0x0B82, 0x0B82) \
PAIR(0x0BC0, 0x0BC0) \
PAIR(0x0BCD, 0x0BCD) \
PAIR(0x0C3E, 0x0C40) \
PAIR(0x0C46, 0x0C48) \
PAIR(0x0C4A, 0x0C4D) \
PAIR(0x0C55, 0x0C56) \
PAIR(0x0CBC, 0x0CBC) \
PAIR(0x0CBF, 0x0CBF) \
PAIR(0x0CC6, 0x0CC6) \
PAIR(0x0CCC, 0x0CCD) \
PAIR(0x0CE2, 0x0CE3) \
PAIR(0x0D41, 0x0D43) \
PAIR(0x0D4D, 0x0D4D) \
PAIR(0x0DCA, 0x0DCA) \
PAIR(0x0DD2, 0x0DD4) \
PAIR(0x0DD6, 0x0DD6) \
PAIR(0x0E31, 0x0E31) \
BIG_(0x0E34, 0x0E3A) \
BIG_(0x0E47, 0x0E4E) \
PAIR(0x0EB1, 0x0EB1) \
BIG_(0x0EB4, 0x0EB9) \
PAIR(0x0EBB, 0x0EBC) \
BIG_(0x0EC8, 0x0ECD) \
PAIR(0x0F18, 0x0F19) \
PAIR(0x0F35, 0x0F35) \
PAIR(0x0F37, 0x0F37) \
PAIR(0x0F39, 0x0F39) \
BIG_(0x0F71, 0x0F7E) \
BIG_(0x0F80, 0x0F84) \
PAIR(0x0F86, 0x0F87) \
PAIR(0x0FC6, 0x0FC6) \
BIG_(0x0F90, 0x0F97) \
BIG_(0x0F99, 0x0FBC) \
PAIR(0x102D, 0x1030) \
PAIR(0x1032, 0x1032) \
PAIR(0x1036, 0x1037) \
PAIR(0x1039, 0x1039) \
PAIR(0x1058, 0x1059) \
BIG_(0x1160, 0x11FF) \
PAIR(0x135F, 0x135F) \
PAIR(0x1712, 0x1714) \
PAIR(0x1732, 0x1734) \
PAIR(0x1752, 0x1753) \
PAIR(0x1772, 0x1773) \
PAIR(0x17B4, 0x17B5) \
BIG_(0x17B7, 0x17BD) \
PAIR(0x17C6, 0x17C6) \
BIG_(0x17C9, 0x17D3) \
PAIR(0x17DD, 0x17DD) \
PAIR(0x180B, 0x180D) \
PAIR(0x18A9, 0x18A9) \
PAIR(0x1920, 0x1922) \
PAIR(0x1927, 0x1928) \
PAIR(0x1932, 0x1932) \
PAIR(0x1939, 0x193B) \
PAIR(0x1A17, 0x1A18) \
PAIR(0x1B00, 0x1B03) \
PAIR(0x1B34, 0x1B34) \
BIG_(0x1B36, 0x1B3A) \
PAIR(0x1B3C, 0x1B3C) \
PAIR(0x1B42, 0x1B42) \
BIG_(0x1B6B, 0x1B73) \
BIG_(0x1DC0, 0x1DCA) \
PAIR(0x1DFE, 0x1DFF) \
BIG_(0x200B, 0x200F) \
BIG_(0x202A, 0x202E) \
PAIR(0x2060, 0x2063) \
BIG_(0x206A, 0x206F) \
BIG_(0x20D0, 0x20EF) \
BIG_(0x302A, 0x302F) \
PAIR(0x3099, 0x309A) \
/* Too big to be packed in PAIRs: */ \
BIG_(0xA806, 0xA806) \
BIG_(0xA80B, 0xA80B) \
BIG_(0xA825, 0xA826) \
BIG_(0xFB1E, 0xFB1E) \
BIG_(0xFE00, 0xFE0F) \
BIG_(0xFE20, 0xFE23) \
BIG_(0xFEFF, 0xFEFF) \
BIG_(0xFFF9, 0xFFFB)
ARRAY
# undef BIG_
# undef PAIR
};
/* Second expansion: keep only PAIR entries, each packed into one
 * uint16_t as (first << 2) | (last - first) */
# define BIG_(a,b)
# define PAIR(a,b) (a << 2) | (b-a),
static const uint16_t combining1[] = { ARRAY };
# undef BIG_
# undef PAIR
/* Third expansion: compile-time sanity check. An entry filed under the
 * wrong macro (a BIG_ that would fit the packed form, or a PAIR that
 * would not) produces an array of size -1, i.e. a build error */
# define BIG_(a,b) char big_##a[b < 0x4000 && b-a <= 3 ? -1 : 1];
# define PAIR(a,b) char pair##a[b >= 0x4000 || b-a > 3 ? -1 : 1];
struct CHECK { ARRAY };
# undef BIG_
# undef PAIR
# undef ARRAY
# endif
if (ucs == 0)
return 0;
/* Test for 8-bit control characters (00-1f, 80-9f, 7f) */
if ((ucs & ~0x80) < 0x20 || ucs == 0x7f)
return -1;
/* Quick abort if it is an obviously invalid char */
if (ucs > LAST_SUPPORTED_WCHAR)
return -1;
/* Optimization: no combining chars below 0x300 */
if (LAST_SUPPORTED_WCHAR < 0x300 || ucs < 0x300)
return 1;
# if LAST_SUPPORTED_WCHAR >= 0x300
/* Binary search in table of non-spacing characters */
if (in_interval_table(ucs, combining, ARRAY_SIZE(combining) - 1))
return 0;
if (in_uint16_table(ucs, combining1, ARRAY_SIZE(combining1) - 1))
return 0;
/* Optimization: all chars below 0x1100 are not double-width */
if (LAST_SUPPORTED_WCHAR < 0x1100 || ucs < 0x1100)
return 1;
# if LAST_SUPPORTED_WCHAR >= 0x1100
/* Invalid code points: */
/* High (d800..dbff) and low (dc00..dfff) surrogates (valid only in UTF16) */
/* Private Use Area (e000..f8ff) */
/* Noncharacters fdd0..fdef */
if ((LAST_SUPPORTED_WCHAR >= 0xd800 && ucs >= 0xd800 && ucs <= 0xf8ff)
|| (LAST_SUPPORTED_WCHAR >= 0xfdd0 && ucs >= 0xfdd0 && ucs <= 0xfdef)
) {
return -1;
}
/* 0xfffe and 0xffff in every plane are invalid */
if (LAST_SUPPORTED_WCHAR >= 0xfffe && ((ucs & 0xfffe) == 0xfffe)) {
return -1;
}
# if LAST_SUPPORTED_WCHAR >= 0x10000
if (ucs >= 0x10000) {
/* Combining chars in Supplementary Multilingual Plane 0x1xxxx */
static const struct interval combining0x10000[] = {
{ 0x0A01, 0x0A03 }, { 0x0A05, 0x0A06 }, { 0x0A0C, 0x0A0F },
{ 0x0A38, 0x0A3A }, { 0x0A3F, 0x0A3F }, { 0xD167, 0xD169 },
{ 0xD173, 0xD182 }, { 0xD185, 0xD18B }, { 0xD1AA, 0xD1AD },
{ 0xD242, 0xD244 }
};
/* Binary search in table of non-spacing characters in Supplementary Multilingual Plane */
if (in_interval_table(ucs ^ 0x10000, combining0x10000, ARRAY_SIZE(combining0x10000) - 1))
return 0;
/* Check a few non-spacing chars in Supplementary Special-purpose Plane 0xExxxx */
if (LAST_SUPPORTED_WCHAR >= 0xE0001
&& ( ucs == 0xE0001
|| (ucs >= 0xE0020 && ucs <= 0xE007F)
|| (ucs >= 0xE0100 && ucs <= 0xE01EF)
)
) {
return 0;
}
}
# endif
/* If we arrive here, ucs is not a combining or C0/C1 control character.
* Check whether it's 1 char or 2-char wide.
*/
return 1 +
( (/*ucs >= 0x1100 &&*/ ucs <= 0x115f) /* Hangul Jamo init. consonants */
|| ucs == 0x2329 /* left-pointing angle bracket; also CJK punct. char */
|| ucs == 0x232a /* right-pointing angle bracket; also CJK punct. char */
|| (ucs >= 0x2e80 && ucs <= 0xa4cf && ucs != 0x303f) /* CJK ... Yi */
# if LAST_SUPPORTED_WCHAR >= 0xac00
|| (ucs >= 0xac00 && ucs <= 0xd7a3) /* Hangul Syllables */
|| (ucs >= 0xf900 && ucs <= 0xfaff) /* CJK Compatibility Ideographs */
|| (ucs >= 0xfe10 && ucs <= 0xfe19) /* Vertical forms */
|| (ucs >= 0xfe30 && ucs <= 0xfe6f) /* CJK Compatibility Forms */
|| (ucs >= 0xff00 && ucs <= 0xff60) /* Fullwidth Forms */
|| (ucs >= 0xffe0 && ucs <= 0xffe6)
|| ((ucs >> 17) == (2 >> 1)) /* 20000..3ffff: Supplementary and Tertiary Ideographic Planes */
# endif
);
# endif /* >= 0x1100 */
# endif /* >= 0x300 */
}
# if ENABLE_UNICODE_BIDI_SUPPORT # if ENABLE_UNICODE_BIDI_SUPPORT
int FAST_FUNC unicode_bidi_isrtl(wint_t wc) int FAST_FUNC unicode_bidi_isrtl(wint_t wc)
@ -592,7 +1012,7 @@ static char* FAST_FUNC unicode_conv_to_printable2(uni_stat_t *stats, const char
int w; int w;
wchar_t wc; wchar_t wc;
#if ENABLE_LOCALE_SUPPORT #if ENABLE_UNICODE_USING_LOCALE
{ {
mbstate_t mbst = { 0 }; mbstate_t mbst = { 0 };
ssize_t rc = mbsrtowcs(&wc, &src, 1, &mbst); ssize_t rc = mbsrtowcs(&wc, &src, 1, &mbst);
@ -647,7 +1067,7 @@ static char* FAST_FUNC unicode_conv_to_printable2(uni_stat_t *stats, const char
uni_count++; uni_count++;
uni_width += w; uni_width += w;
dst = xrealloc(dst, dst_len + MB_CUR_MAX); dst = xrealloc(dst, dst_len + MB_CUR_MAX);
#if ENABLE_LOCALE_SUPPORT #if ENABLE_UNICODE_USING_LOCALE
{ {
mbstate_t mbst = { 0 }; mbstate_t mbst = { 0 };
dst_len += wcrtomb(&dst[dst_len], wc, &mbst); dst_len += wcrtomb(&dst[dst_len], wc, &mbst);
@ -699,7 +1119,7 @@ unsigned FAST_FUNC unicode_padding_to_width(unsigned width, const char *src)
int w; int w;
wchar_t wc; wchar_t wc;
#if ENABLE_LOCALE_SUPPORT #if ENABLE_UNICODE_USING_LOCALE
{ {
mbstate_t mbst = { 0 }; mbstate_t mbst = { 0 };
ssize_t rc = mbsrtowcs(&wc, &src, 1, &mbst); ssize_t rc = mbsrtowcs(&wc, &src, 1, &mbst);

View File

@ -1,543 +0,0 @@
/*
* This is an implementation of wcwidth() and wcswidth() (defined in
* IEEE Std 1003.1-2001) for Unicode.
*
* http://www.opengroup.org/onlinepubs/007904975/functions/wcwidth.html
* http://www.opengroup.org/onlinepubs/007904975/functions/wcswidth.html
*
* In fixed-width output devices, Latin characters all occupy a single
* "cell" position of equal width, whereas ideographic CJK characters
* occupy two such cells. Interoperability between terminal-line
* applications and (teletype-style) character terminals using the
* UTF-8 encoding requires agreement on which character should advance
* the cursor by how many cell positions. No established formal
* standards exist at present on which Unicode character shall occupy
* how many cell positions on character terminals. These routines are
* a first attempt of defining such behavior based on simple rules
* applied to data provided by the Unicode Consortium.
*
* For some graphical characters, the Unicode standard explicitly
* defines a character-cell width via the definition of the East Asian
* FullWidth (F), Wide (W), Half-width (H), and Narrow (Na) classes.
* In all these cases, there is no ambiguity about which width a
* terminal shall use. For characters in the East Asian Ambiguous (A)
* class, the width choice depends purely on a preference of backward
* compatibility with either historic CJK or Western practice.
* Choosing single-width for these characters is easy to justify as
* the appropriate long-term solution, as the CJK practice of
* displaying these characters as double-width comes from historic
* implementation simplicity (8-bit encoded characters were displayed
* single-width and 16-bit ones double-width, even for Greek,
* Cyrillic, etc.) and not any typographic considerations.
*
* Much less clear is the choice of width for the Not East Asian
* (Neutral) class. Existing practice does not dictate a width for any
* of these characters. It would nevertheless make sense
* typographically to allocate two character cells to characters such
* as for instance EM SPACE or VOLUME INTEGRAL, which cannot be
* represented adequately with a single-width glyph. The following
* routines at present merely assign a single-cell width to all
* neutral characters, in the interest of simplicity. This is not
* entirely satisfactory and should be reconsidered before
* establishing a formal standard in this area. At the moment, the
* decision which Not East Asian (Neutral) characters should be
* represented by double-width glyphs cannot yet be answered by
* applying a simple rule from the Unicode database content. Setting
* up a proper standard for the behavior of UTF-8 character terminals
* will require a careful analysis not only of each Unicode character,
* but also of each presentation form, something the author of these
* routines has avoided to do so far.
*
* http://www.unicode.org/unicode/reports/tr11/
*
* Markus Kuhn -- 2007-05-26 (Unicode 5.0)
*
* Permission to use, copy, modify, and distribute this software
* for any purpose and without fee is hereby granted. The author
* disclaims all warranties with regard to this software.
*
* Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
*/
/* Assigned Unicode character ranges:
* Plane Range
* 0 0000-FFFF Basic Multilingual Plane
* 1 10000-1FFFF Supplementary Multilingual Plane
* 2 20000-2FFFF Supplementary Ideographic Plane
* 3 30000-3FFFF Tertiary Ideographic Plane (no chars assigned yet)
* 4-13 40000-DFFFF currently unassigned
* 14 E0000-EFFFF Supplementary Special-purpose Plane
* 15 F0000-FFFFF Supplementary Private Use Area-A
* 16 100000-10FFFF Supplementary Private Use Area-B
*
* "Supplementary Special-purpose Plane currently contains non-graphical
* characters in two blocks of 128 and 240 characters. The first block
* is for language tag characters for use when language cannot be indicated
* through other protocols (such as the xml:lang attribute in XML).
* The other block contains glyph variation selectors to indicate
* an alternate glyph for a character that cannot be determined by context."
*
* In simpler terms: it is a tool to fix the "Han unification" mess
* created by Unicode committee, to select Chinese/Japanese/Korean/Taiwan
* version of a character. (They forgot that the whole purpose of the Unicode
* was to be able to write all chars in one charset without such tricks).
* Until East Asian users say it is actually necessary to support these
* code points in console applications like busybox
* (i.e. do these chars ever appear in filenames, hostnames, text files
* and such?), we are treating these code points as invalid.
*
* Tertiary Ideographic Plane is also ignored for now,
* until Unicode committee assigns something there.
*/
#if LAST_SUPPORTED_WCHAR >= 0x300
/* One closed range [first, last] of 16-bit code points */
struct interval {
	uint16_t first;
	uint16_t last;
};

/*
 * Binary search for ucs in a table of intervals.
 *
 * ucs:   code point to look up
 * table: intervals to search; the search assumes they are sorted in
 *        ascending order and do not overlap
 * max:   index of the LAST table entry (not the number of entries)
 *
 * Returns 1 if some interval contains ucs, 0 otherwise.
 */
static int in_interval_table(unsigned ucs, const struct interval *table, unsigned max)
{
	unsigned lo = 0;

	/*
	 * Reject anything outside the span covered by the whole table.
	 * This also guarantees that "probe - 1" below is never evaluated
	 * with probe == 0, so the unsigned upper bound cannot wrap.
	 */
	if (ucs < table[0].first || ucs > table[max].last)
		return 0;

	while (lo <= max) {
		unsigned probe = (lo + max) / 2;
		const struct interval *iv = &table[probe];

		if (ucs >= iv->first && ucs <= iv->last)
			return 1;
		if (ucs > iv->last)
			lo = probe + 1;   /* continue in the upper half */
		else
			max = probe - 1;  /* continue in the lower half */
	}
	return 0;
}
/*
 * Binary search for ucs in a table of packed short intervals.
 *
 * Each uint16_t entry encodes one interval: the upper 14 bits hold the
 * first code point and the low 2 bits hold (last - first), so an entry
 * covers 1 to 4 consecutive code points.
 *
 * ucs:   code point to look up
 * table: packed intervals; the search assumes they are sorted in
 *        ascending order and do not overlap
 * max:   index of the LAST table entry (not the number of entries)
 *
 * Returns 1 if some interval contains ucs, 0 otherwise.
 */
static int in_uint16_table(unsigned ucs, const uint16_t *table, unsigned max)
{
	unsigned min;
	unsigned mid;
	unsigned first, last;

	/*
	 * Quick reject below the first interval. Besides saving time, this
	 * guarantees "mid - 1" below is never evaluated with mid == 0.
	 */
	first = table[0] >> 2;
	if (ucs < first)
		return 0;
	/*
	 * Quick reject above the LAST interval. The bound has to come from
	 * table[max]: taking it from table[0] would throw away every code
	 * point that lies beyond the first entry.
	 */
	last = (table[max] >> 2) + (table[max] & 3);
	if (ucs > last)
		return 0;

	min = 0;
	while (max >= min) {
		mid = (min + max) / 2;
		first = table[mid] >> 2;
		last = first + (table[mid] & 3);
		if (ucs > last)
			min = mid + 1;
		else if (ucs < first)
			max = mid - 1;
		else
			return 1;
	}
	return 0;
}
#endif
/* The following function defines the column width of an ISO 10646
* character as follows:
*
* - The null character (U+0000) has a column width of 0.
*
* - Other C0/C1 control characters and DEL will lead to a return
* value of -1.
*
* - Non-spacing and enclosing combining characters (general
* category code Mn or Me in the Unicode database) have a
* column width of 0.
*
* - SOFT HYPHEN (U+00AD) has a column width of 1.
*
* - Other format characters (general category code Cf in the Unicode
* database) and ZERO WIDTH SPACE (U+200B) have a column width of 0.
*
* - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF)
* have a column width of 0.
*
* - Spacing characters in the East Asian Wide (W) or East Asian
* Full-width (F) category as defined in Unicode Technical
* Report #11 have a column width of 2.
*
* - All remaining characters (including all printable
* ISO 8859-1 and WGL4 characters, Unicode control characters,
* etc.) have a column width of 1.
*
* This implementation assumes that wchar_t characters are encoded
* in ISO 10646.
*/
/*
* Return the number of terminal columns occupied by code point ucs:
* 0 for NUL and for code points matched by the combining-character checks,
* -1 for 8-bit control characters, for code points above
* LAST_SUPPORTED_WCHAR and for the invalid code points rejected below,
* 2 for the wide ranges listed in the final expression, and 1 otherwise.
* Which of these checks are compiled in depends on LAST_SUPPORTED_WCHAR.
*/
static int wcwidth(unsigned ucs)
{
#if LAST_SUPPORTED_WCHAR >= 0x300
/* sorted list of non-overlapping intervals of non-spacing characters */
/* generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" */
static const struct interval combining[] = {
/* In this table each BIG_ entry becomes a { first, last } element
* and each PAIR entry expands to nothing.
*/
#define BIG_(a,b) { a, b },
#define PAIR(a,b)
/* PAIR if < 0x4000 and no more than 4 chars big */
BIG_(0x0300, 0x036F)
PAIR(0x0483, 0x0486)
PAIR(0x0488, 0x0489)
BIG_(0x0591, 0x05BD)
PAIR(0x05BF, 0x05BF)
PAIR(0x05C1, 0x05C2)
PAIR(0x05C4, 0x05C5)
PAIR(0x05C7, 0x05C7)
PAIR(0x0600, 0x0603)
BIG_(0x0610, 0x0615)
BIG_(0x064B, 0x065E)
PAIR(0x0670, 0x0670)
BIG_(0x06D6, 0x06E4)
PAIR(0x06E7, 0x06E8)
PAIR(0x06EA, 0x06ED)
PAIR(0x070F, 0x070F)
PAIR(0x0711, 0x0711)
BIG_(0x0730, 0x074A)
BIG_(0x07A6, 0x07B0)
BIG_(0x07EB, 0x07F3)
PAIR(0x0901, 0x0902)
PAIR(0x093C, 0x093C)
BIG_(0x0941, 0x0948)
PAIR(0x094D, 0x094D)
PAIR(0x0951, 0x0954)
PAIR(0x0962, 0x0963)
PAIR(0x0981, 0x0981)
PAIR(0x09BC, 0x09BC)
PAIR(0x09C1, 0x09C4)
PAIR(0x09CD, 0x09CD)
PAIR(0x09E2, 0x09E3)
PAIR(0x0A01, 0x0A02)
PAIR(0x0A3C, 0x0A3C)
PAIR(0x0A41, 0x0A42)
PAIR(0x0A47, 0x0A48)
PAIR(0x0A4B, 0x0A4D)
PAIR(0x0A70, 0x0A71)
PAIR(0x0A81, 0x0A82)
PAIR(0x0ABC, 0x0ABC)
BIG_(0x0AC1, 0x0AC5)
PAIR(0x0AC7, 0x0AC8)
PAIR(0x0ACD, 0x0ACD)
PAIR(0x0AE2, 0x0AE3)
PAIR(0x0B01, 0x0B01)
PAIR(0x0B3C, 0x0B3C)
PAIR(0x0B3F, 0x0B3F)
PAIR(0x0B41, 0x0B43)
PAIR(0x0B4D, 0x0B4D)
PAIR(0x0B56, 0x0B56)
PAIR(0x0B82, 0x0B82)
PAIR(0x0BC0, 0x0BC0)
PAIR(0x0BCD, 0x0BCD)
PAIR(0x0C3E, 0x0C40)
PAIR(0x0C46, 0x0C48)
PAIR(0x0C4A, 0x0C4D)
PAIR(0x0C55, 0x0C56)
PAIR(0x0CBC, 0x0CBC)
PAIR(0x0CBF, 0x0CBF)
PAIR(0x0CC6, 0x0CC6)
PAIR(0x0CCC, 0x0CCD)
PAIR(0x0CE2, 0x0CE3)
PAIR(0x0D41, 0x0D43)
PAIR(0x0D4D, 0x0D4D)
PAIR(0x0DCA, 0x0DCA)
PAIR(0x0DD2, 0x0DD4)
PAIR(0x0DD6, 0x0DD6)
PAIR(0x0E31, 0x0E31)
BIG_(0x0E34, 0x0E3A)
BIG_(0x0E47, 0x0E4E)
PAIR(0x0EB1, 0x0EB1)
BIG_(0x0EB4, 0x0EB9)
PAIR(0x0EBB, 0x0EBC)
BIG_(0x0EC8, 0x0ECD)
PAIR(0x0F18, 0x0F19)
PAIR(0x0F35, 0x0F35)
PAIR(0x0F37, 0x0F37)
PAIR(0x0F39, 0x0F39)
BIG_(0x0F71, 0x0F7E)
BIG_(0x0F80, 0x0F84)
PAIR(0x0F86, 0x0F87)
PAIR(0x0FC6, 0x0FC6)
BIG_(0x0F90, 0x0F97)
BIG_(0x0F99, 0x0FBC)
PAIR(0x102D, 0x1030)
PAIR(0x1032, 0x1032)
PAIR(0x1036, 0x1037)
PAIR(0x1039, 0x1039)
PAIR(0x1058, 0x1059)
BIG_(0x1160, 0x11FF)
PAIR(0x135F, 0x135F)
PAIR(0x1712, 0x1714)
PAIR(0x1732, 0x1734)
PAIR(0x1752, 0x1753)
PAIR(0x1772, 0x1773)
PAIR(0x17B4, 0x17B5)
BIG_(0x17B7, 0x17BD)
PAIR(0x17C6, 0x17C6)
BIG_(0x17C9, 0x17D3)
PAIR(0x17DD, 0x17DD)
PAIR(0x180B, 0x180D)
PAIR(0x18A9, 0x18A9)
PAIR(0x1920, 0x1922)
PAIR(0x1927, 0x1928)
PAIR(0x1932, 0x1932)
PAIR(0x1939, 0x193B)
PAIR(0x1A17, 0x1A18)
PAIR(0x1B00, 0x1B03)
PAIR(0x1B34, 0x1B34)
BIG_(0x1B36, 0x1B3A)
PAIR(0x1B3C, 0x1B3C)
PAIR(0x1B42, 0x1B42)
BIG_(0x1B6B, 0x1B73)
BIG_(0x1DC0, 0x1DCA)
PAIR(0x1DFE, 0x1DFF)
BIG_(0x200B, 0x200F)
BIG_(0x202A, 0x202E)
PAIR(0x2060, 0x2063)
BIG_(0x206A, 0x206F)
BIG_(0x20D0, 0x20EF)
BIG_(0x302A, 0x302F)
PAIR(0x3099, 0x309A)
/* Too big to be packed in PAIRs: */
{ 0xA806, 0xA806 },
{ 0xA80B, 0xA80B },
{ 0xA825, 0xA826 },
{ 0xFB1E, 0xFB1E },
{ 0xFE00, 0xFE0F },
{ 0xFE20, 0xFE23 },
{ 0xFEFF, 0xFEFF },
{ 0xFFF9, 0xFFFB }
#undef BIG_
#undef PAIR
};
static const uint16_t combining1[] = {
/* Here the roles are swapped: BIG_ entries expand to nothing and each
* PAIR is packed into one uint16_t as (first << 2) | (last - first).
*/
#define BIG_(a,b)
#define PAIR(a,b) (a << 2) | (b-a),
/* Exact copy-n-paste of the above: */
BIG_(0x0300, 0x036F)
PAIR(0x0483, 0x0486)
PAIR(0x0488, 0x0489)
BIG_(0x0591, 0x05BD)
PAIR(0x05BF, 0x05BF)
PAIR(0x05C1, 0x05C2)
PAIR(0x05C4, 0x05C5)
PAIR(0x05C7, 0x05C7)
PAIR(0x0600, 0x0603)
BIG_(0x0610, 0x0615)
BIG_(0x064B, 0x065E)
PAIR(0x0670, 0x0670)
BIG_(0x06D6, 0x06E4)
PAIR(0x06E7, 0x06E8)
PAIR(0x06EA, 0x06ED)
PAIR(0x070F, 0x070F)
PAIR(0x0711, 0x0711)
BIG_(0x0730, 0x074A)
BIG_(0x07A6, 0x07B0)
BIG_(0x07EB, 0x07F3)
PAIR(0x0901, 0x0902)
PAIR(0x093C, 0x093C)
BIG_(0x0941, 0x0948)
PAIR(0x094D, 0x094D)
PAIR(0x0951, 0x0954)
PAIR(0x0962, 0x0963)
PAIR(0x0981, 0x0981)
PAIR(0x09BC, 0x09BC)
PAIR(0x09C1, 0x09C4)
PAIR(0x09CD, 0x09CD)
PAIR(0x09E2, 0x09E3)
PAIR(0x0A01, 0x0A02)
PAIR(0x0A3C, 0x0A3C)
PAIR(0x0A41, 0x0A42)
PAIR(0x0A47, 0x0A48)
PAIR(0x0A4B, 0x0A4D)
PAIR(0x0A70, 0x0A71)
PAIR(0x0A81, 0x0A82)
PAIR(0x0ABC, 0x0ABC)
BIG_(0x0AC1, 0x0AC5)
PAIR(0x0AC7, 0x0AC8)
PAIR(0x0ACD, 0x0ACD)
PAIR(0x0AE2, 0x0AE3)
PAIR(0x0B01, 0x0B01)
PAIR(0x0B3C, 0x0B3C)
PAIR(0x0B3F, 0x0B3F)
PAIR(0x0B41, 0x0B43)
PAIR(0x0B4D, 0x0B4D)
PAIR(0x0B56, 0x0B56)
PAIR(0x0B82, 0x0B82)
PAIR(0x0BC0, 0x0BC0)
PAIR(0x0BCD, 0x0BCD)
PAIR(0x0C3E, 0x0C40)
PAIR(0x0C46, 0x0C48)
PAIR(0x0C4A, 0x0C4D)
PAIR(0x0C55, 0x0C56)
PAIR(0x0CBC, 0x0CBC)
PAIR(0x0CBF, 0x0CBF)
PAIR(0x0CC6, 0x0CC6)
PAIR(0x0CCC, 0x0CCD)
PAIR(0x0CE2, 0x0CE3)
PAIR(0x0D41, 0x0D43)
PAIR(0x0D4D, 0x0D4D)
PAIR(0x0DCA, 0x0DCA)
PAIR(0x0DD2, 0x0DD4)
PAIR(0x0DD6, 0x0DD6)
PAIR(0x0E31, 0x0E31)
BIG_(0x0E34, 0x0E3A)
BIG_(0x0E47, 0x0E4E)
PAIR(0x0EB1, 0x0EB1)
BIG_(0x0EB4, 0x0EB9)
PAIR(0x0EBB, 0x0EBC)
BIG_(0x0EC8, 0x0ECD)
PAIR(0x0F18, 0x0F19)
PAIR(0x0F35, 0x0F35)
PAIR(0x0F37, 0x0F37)
PAIR(0x0F39, 0x0F39)
BIG_(0x0F71, 0x0F7E)
BIG_(0x0F80, 0x0F84)
PAIR(0x0F86, 0x0F87)
PAIR(0x0FC6, 0x0FC6)
BIG_(0x0F90, 0x0F97)
BIG_(0x0F99, 0x0FBC)
PAIR(0x102D, 0x1030)
PAIR(0x1032, 0x1032)
PAIR(0x1036, 0x1037)
PAIR(0x1039, 0x1039)
PAIR(0x1058, 0x1059)
BIG_(0x1160, 0x11FF)
PAIR(0x135F, 0x135F)
PAIR(0x1712, 0x1714)
PAIR(0x1732, 0x1734)
PAIR(0x1752, 0x1753)
PAIR(0x1772, 0x1773)
PAIR(0x17B4, 0x17B5)
BIG_(0x17B7, 0x17BD)
PAIR(0x17C6, 0x17C6)
BIG_(0x17C9, 0x17D3)
PAIR(0x17DD, 0x17DD)
PAIR(0x180B, 0x180D)
PAIR(0x18A9, 0x18A9)
PAIR(0x1920, 0x1922)
PAIR(0x1927, 0x1928)
PAIR(0x1932, 0x1932)
PAIR(0x1939, 0x193B)
PAIR(0x1A17, 0x1A18)
PAIR(0x1B00, 0x1B03)
PAIR(0x1B34, 0x1B34)
BIG_(0x1B36, 0x1B3A)
PAIR(0x1B3C, 0x1B3C)
PAIR(0x1B42, 0x1B42)
BIG_(0x1B6B, 0x1B73)
BIG_(0x1DC0, 0x1DCA)
PAIR(0x1DFE, 0x1DFF)
BIG_(0x200B, 0x200F)
BIG_(0x202A, 0x202E)
PAIR(0x2060, 0x2063)
BIG_(0x206A, 0x206F)
BIG_(0x20D0, 0x20EF)
BIG_(0x302A, 0x302F)
PAIR(0x3099, 0x309A)
#undef BIG_
#undef PAIR
};
/* Compile-time sanity check for the list above: with these definitions
* a BIG_ entry spanning 4 chars or fewer, or a PAIR entry spanning more
* than 4 chars, would declare an array of size -1 and fail to compile.
* NOTE(review): no entries are listed between the #defines and #undefs
* here, so as written the struct is empty and nothing is checked.
*/
struct CHECK {
#define BIG_(a,b) char big##a[b-a <= 3 ? -1 : 1];
#define PAIR(a,b) char pair##a[b-a > 3 ? -1 : 1];
/* Copy-n-paste it here again to verify correctness */
#undef BIG_
#undef PAIR
};
#endif
if (ucs == 0)
return 0;
/* Test for 8-bit control characters (00-1f, 80-9f, 7f) */
if ((ucs & ~0x80) < 0x20 || ucs == 0x7f)
return -1;
/* Quick abort if it is an obviously invalid char */
if (ucs > LAST_SUPPORTED_WCHAR)
return -1;
/* Optimization: no combining chars below 0x300 */
if (LAST_SUPPORTED_WCHAR < 0x300 || ucs < 0x300)
return 1;
#if LAST_SUPPORTED_WCHAR >= 0x300
/* Binary search in table of non-spacing characters */
if (in_interval_table(ucs, combining, ARRAY_SIZE(combining) - 1))
return 0;
/* Same lookup in the packed table of short (PAIR) intervals */
if (in_uint16_table(ucs, combining1, ARRAY_SIZE(combining1) - 1))
return 0;
/* Optimization: all chars below 0x1100 are not double-width */
if (LAST_SUPPORTED_WCHAR < 0x1100 || ucs < 0x1100)
return 1;
# if LAST_SUPPORTED_WCHAR >= 0x1100
/* Invalid code points: */
/* High (d800..dbff) and low (dc00..dfff) surrogates (valid only in UTF16) */
/* Private Use Area (e000..f8ff) */
/* Noncharacters fdd0..fdef */
if ((LAST_SUPPORTED_WCHAR >= 0xd800 && ucs >= 0xd800 && ucs <= 0xf8ff)
|| (LAST_SUPPORTED_WCHAR >= 0xfdd0 && ucs >= 0xfdd0 && ucs <= 0xfdef)
) {
return -1;
}
/* 0xfffe and 0xffff in every plane are invalid */
if (LAST_SUPPORTED_WCHAR >= 0xfffe && ((ucs & 0xfffe) == 0xfffe)) {
return -1;
}
# if LAST_SUPPORTED_WCHAR >= 0x10000
if (ucs >= 0x10000) {
/* Combining chars in Supplementary Multilingual Plane 0x1xxxx */
/* (stored without the leading 0x10000 so they fit in uint16_t fields) */
static const struct interval combining0x10000[] = {
{ 0x0A01, 0x0A03 }, { 0x0A05, 0x0A06 }, { 0x0A0C, 0x0A0F },
{ 0x0A38, 0x0A3A }, { 0x0A3F, 0x0A3F }, { 0xD167, 0xD169 },
{ 0xD173, 0xD182 }, { 0xD185, 0xD18B }, { 0xD1AA, 0xD1AD },
{ 0xD242, 0xD244 }
};
/* Binary search in table of non-spacing characters in Supplementary Multilingual Plane */
/* ucs ^ 0x10000 yields the low 16 bits for 0x1xxxx; for code points
* in higher planes the result stays above 0xffff, which is beyond
* every table entry.
*/
if (in_interval_table(ucs ^ 0x10000, combining0x10000, ARRAY_SIZE(combining0x10000) - 1))
return 0;
/* Check a few non-spacing chars in Supplementary Special-purpose Plane 0xExxxx */
if (LAST_SUPPORTED_WCHAR >= 0xE0001
&& ( ucs == 0xE0001
|| (ucs >= 0xE0020 && ucs <= 0xE007F)
|| (ucs >= 0xE0100 && ucs <= 0xE01EF)
)
) {
return 0;
}
}
# endif
/* If we arrive here, ucs is not a combining or C0/C1 control character.
* Check whether it's 1 char or 2-char wide.
*/
/* The parenthesized condition evaluates to 0 or 1, giving a result of 1 or 2 */
return 1 +
( (/*ucs >= 0x1100 &&*/ ucs <= 0x115f) /* Hangul Jamo init. consonants */
|| ucs == 0x2329 /* left-pointing angle bracket; also CJK punct. char */
|| ucs == 0x232a /* right-pointing angle bracket; also CJK punct. char */
|| (ucs >= 0x2e80 && ucs <= 0xa4cf && ucs != 0x303f) /* CJK ... Yi */
# if LAST_SUPPORTED_WCHAR >= 0xac00
|| (ucs >= 0xac00 && ucs <= 0xd7a3) /* Hangul Syllables */
|| (ucs >= 0xf900 && ucs <= 0xfaff) /* CJK Compatibility Ideographs */
|| (ucs >= 0xfe10 && ucs <= 0xfe19) /* Vertical forms */
|| (ucs >= 0xfe30 && ucs <= 0xfe6f) /* CJK Compatibility Forms */
|| (ucs >= 0xff00 && ucs <= 0xff60) /* Fullwidth Forms */
|| (ucs >= 0xffe0 && ucs <= 0xffe6)
|| ((ucs >> 17) == (2 >> 1)) /* 20000..3ffff: Supplementary and Tertiary Ideographic Planes */
# endif
);
# endif /* >= 0x1100 */
#endif /* >= 0x300 */
}

View File

@ -60,7 +60,7 @@ int lsmod_main(int argc UNUSED_PARAM, char **argv UNUSED_PARAM)
token[3][strlen(token[3])-1] = '\0'; token[3][strlen(token[3])-1] = '\0';
} else } else
token[3] = (char *) ""; token[3] = (char *) "";
# if ENABLE_FEATURE_ASSUME_UNICODE # if ENABLE_UNICODE_SUPPORT
{ {
uni_stat_t uni_stat; uni_stat_t uni_stat;
char *uni_name = unicode_conv_to_printable(&uni_stat, token[0]); char *uni_name = unicode_conv_to_printable(&uni_stat, token[0]);
@ -78,7 +78,7 @@ int lsmod_main(int argc UNUSED_PARAM, char **argv UNUSED_PARAM)
// or comma-separated list ended by comma // or comma-separated list ended by comma
// so trimming the trailing char is just what we need! // so trimming the trailing char is just what we need!
token[3][strlen(token[3])-1] = '\0'; token[3][strlen(token[3])-1] = '\0';
# if ENABLE_FEATURE_ASSUME_UNICODE # if ENABLE_UNICODE_SUPPORT
{ {
uni_stat_t uni_stat; uni_stat_t uni_stat;
char *uni_name = unicode_conv_to_printable(&uni_stat, token[0]); char *uni_name = unicode_conv_to_printable(&uni_stat, token[0]);

View File

@ -66,7 +66,7 @@ int dumpleases_main(int argc UNUSED_PARAM, char **argv)
fmt = ":%02x"; fmt = ":%02x";
} }
addr.s_addr = lease.lease_nip; addr.s_addr = lease.lease_nip;
#if ENABLE_FEATURE_ASSUME_UNICODE #if ENABLE_UNICODE_SUPPORT
{ {
char *uni_name = unicode_conv_to_printable_fixedwidth(NULL, lease.hostname, 19); char *uni_name = unicode_conv_to_printable_fixedwidth(NULL, lease.hostname, 19);
printf(" %-16s%s ", inet_ntoa(addr), uni_name); printf(" %-16s%s ", inet_ntoa(addr), uni_name);

View File

@ -24,7 +24,7 @@ CONFIG_FEATURE_VERBOSE_USAGE=y
CONFIG_FEATURE_COMPRESS_USAGE=y CONFIG_FEATURE_COMPRESS_USAGE=y
CONFIG_FEATURE_INSTALLER=y CONFIG_FEATURE_INSTALLER=y
CONFIG_LOCALE_SUPPORT=y CONFIG_LOCALE_SUPPORT=y
CONFIG_FEATURE_ASSUME_UNICODE=y CONFIG_UNICODE_SUPPORT=y
# CONFIG_FEATURE_CHECK_UNICODE_IN_ENV is not set # CONFIG_FEATURE_CHECK_UNICODE_IN_ENV is not set
CONFIG_LONG_OPTS=y CONFIG_LONG_OPTS=y
CONFIG_FEATURE_DEVPTS=y CONFIG_FEATURE_DEVPTS=y

View File

@ -50,7 +50,7 @@ cat .config \
| grep -v ^CONFIG_BUILD_LIBBUSYBOX= \ | grep -v ^CONFIG_BUILD_LIBBUSYBOX= \
| grep -v ^CONFIG_PAM= \ | grep -v ^CONFIG_PAM= \
| grep -v ^CONFIG_TASKSET= \ | grep -v ^CONFIG_TASKSET= \
| grep -v ^CONFIG_FEATURE_ASSUME_UNICODE= \ | grep -v ^CONFIG_UNICODE_SUPPORT= \
| grep -v ^CONFIG_PIE= \ | grep -v ^CONFIG_PIE= \
| grep -v CONFIG_STATIC \ | grep -v CONFIG_STATIC \
| grep -v CONFIG_CROSS_COMPILER_PREFIX \ | grep -v CONFIG_CROSS_COMPILER_PREFIX \

View File

@ -20,7 +20,7 @@ Su Mo Tu We Th Fr Sa
" "" "" " "" ""
test x"$CONFIG_LOCALE_SUPPORT" = x"y" \ test x"$CONFIG_LOCALE_SUPPORT" = x"y" \
&& test x"$CONFIG_FEATURE_ASSUME_UNICODE" = x"y" \ && test x"$CONFIG_UNICODE_SUPPORT" = x"y" \
&& test x"$CONFIG_LAST_SUPPORTED_WCHAR" = x"0" \ && test x"$CONFIG_LAST_SUPPORTED_WCHAR" = x"0" \
&& test x"$CONFIG_UNICODE_WIDE_WCHARS" = x"y" \ && test x"$CONFIG_UNICODE_WIDE_WCHARS" = x"y" \
&& test x"$CONFIG_STATIC" != x"y" \ && test x"$CONFIG_STATIC" != x"y" \

View File

@ -14,7 +14,7 @@ mkdir ls.testdir || exit 1
# With Unicode provided by libc locale, I'm not sure this test can pass. # With Unicode provided by libc locale, I'm not sure this test can pass.
# I suspect we might fail to skip exactly correct number of bytes # I suspect we might fail to skip exactly correct number of bytes
# over broked unicode sequences. # over broked unicode sequences.
test x"$CONFIG_FEATURE_ASSUME_UNICODE" = x"y" \ test x"$CONFIG_UNICODE_SUPPORT" = x"y" \
&& test x"$CONFIG_LOCALE_SUPPORT" != x"y" \ && test x"$CONFIG_LOCALE_SUPPORT" != x"y" \
&& test x"$CONFIG_SUBST_WCHAR" = x"63" \ && test x"$CONFIG_SUBST_WCHAR" = x"63" \
&& test x"$CONFIG_LAST_SUPPORTED_WCHAR" = x"767" \ && test x"$CONFIG_LAST_SUPPORTED_WCHAR" = x"767" \
@ -133,7 +133,7 @@ test x"$CONFIG_FEATURE_ASSUME_UNICODE" = x"y" \
' "" "" ' "" ""
# Currently fails on "0080_4.2.2__U-000007FF_=_e0_9f_bf" line # Currently fails on "0080_4.2.2__U-000007FF_=_e0_9f_bf" line
test x"$CONFIG_FEATURE_ASSUME_UNICODE" = x"y" \ test x"$CONFIG_UNICODE_SUPPORT" = x"y" \
&& test x"$CONFIG_LOCALE_SUPPORT" != x"y" \ && test x"$CONFIG_LOCALE_SUPPORT" != x"y" \
&& test x"$CONFIG_SUBST_WCHAR" = x"63" \ && test x"$CONFIG_SUBST_WCHAR" = x"63" \
&& test x"$CONFIG_LAST_SUPPORTED_WCHAR" = x"0" \ && test x"$CONFIG_LAST_SUPPORTED_WCHAR" = x"0" \