* iconv/gconv_builtin.c: Include <endian.h>.
	* iconv/gconv_builtin.h: Add UCS-BE aliases.
	Add UCS-4LE transformation.  Define UNICODEBIG and UNICODELITTLE
	according to current platform.
	* iconv/gconv_int.h: Declare __gconv_transform_ucs2reverse_internal,
	__gconv_transform_internal_ucs2reverse, and
	__gconv_transform_internal_ucs4le.
	* iconv/gconv_simple.c: Implement __gconv_transform_internal_ucs4le,
	__gconv_transform_ucs2reverse_internal and
	__gconv_transform_internal_ucs2reverse.
	* iconvdata/Makefile (modules): Add UNICODE.
	(distribute): Add unicode.c.
	* iconvdata/gconv-modules: Add definitions for UNICODE module.
	* iconvdata/unicode.c: New file.

	* iconvdata/utf-16.c: Rewrite code to emit BOM.  Correct code to
	determine byte order of input and convert accordingly.
This commit is contained in:
Ulrich Drepper 2000-03-21 20:18:34 +00:00
parent bc4831b956
commit 8d617a716d
10 changed files with 355 additions and 38 deletions

View File

@ -1,5 +1,23 @@
2000-03-21 Ulrich Drepper <drepper@redhat.com>
* iconv/gconv_builtin.c: Include <endian.h>.
* iconv/gconv_builtin.h: Add UCS-BE aliases.
Add UCS-4LE transformation. Define UNICODEBIG and UNICODELITTLE
according to current platform.
* iconv/gconv_int.h: Declare __gconv_transform_ucs2reverse_internal,
__gconv_transform_internal_ucs2reverse, and
__gconv_transform_internal_ucs4le.
* iconv/gconv_simple.c: Implement __gconv_transform_internal_ucs4le,
__gconv_transform_ucs2reverse_internal and
__gconv_transform_internal_ucs2reverse.
* iconvdata/Makefile (modules): Add UNICODE.
(distribute): Add unicode.c.
* iconvdata/gconv-modules: Add definitions for UNICODE module.
* iconvdata/unicode.c: New file.
* iconvdata/utf-16.c: Rewrite code to emit BOM. Correct code to
determine byte order of input and convert accordingly.
* iconvdata/utf-16.c (gconv_init): Fix typo.
2000-03-20 Ulrich Drepper <drepper@redhat.com>

View File

@ -1,5 +1,5 @@
/* Table for builtin transformation mapping.
Copyright (C) 1997, 1998, 1999 Free Software Foundation, Inc.
Copyright (C) 1997, 1998, 1999, 2000 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
@ -18,6 +18,7 @@
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
#include <endian.h>
#include <limits.h>
#include <string.h>

View File

@ -20,12 +20,14 @@
BUILTIN_ALIAS ("UCS4//", "ISO-10646/UCS4/")
BUILTIN_ALIAS ("UCS-4//", "ISO-10646/UCS4/")
BUILTIN_ALIAS ("UCS-4BE//", "ISO-10646/UCS4/")
BUILTIN_ALIAS ("CSUCS4//", "ISO-10646/UCS4/")
BUILTIN_ALIAS ("ISO-10646//", "ISO-10646/UCS4/")
BUILTIN_ALIAS ("10646-1:1993//", "ISO-10646/UCS4/")
BUILTIN_ALIAS ("10646-1:1993/UCS4/", "ISO-10646/UCS4/")
BUILTIN_ALIAS ("OSF00010104//", "ISO-10646/UCS4/")
BUILTIN_ALIAS ("OSF00010105//", "ISO-10646/UCS4/")
BUILTIN_ALIAS ("OSF00010106//", "ISO-10646/UCS4/")
BUILTIN_ALIAS ("OSF00010104//", "ISO-10646/UCS4/") /* level 1 */
BUILTIN_ALIAS ("OSF00010105//", "ISO-10646/UCS4/") /* level 2 */
BUILTIN_ALIAS ("OSF00010106//", "ISO-10646/UCS4/") /* level 3 */
BUILTIN_TRANSFORMATION (NULL, "INTERNAL", 8,
"ISO-10646/UCS4/", 1, "=INTERNAL->ucs4",
@ -37,6 +39,16 @@ BUILTIN_TRANSFORMATION (NULL, "ISO-10646/UCS4/", 15,
4, 4, 4, 4)
/* Please note that we need only one function for both directions. */
BUILTIN_TRANSFORMATION (NULL, "INTERNAL", 8,
"UCS-4LE//", 1, "=INTERNAL->ucs4le",
__gconv_transform_internal_ucs4le, NULL, NULL,
4, 4, 4, 4)
/* UCS-4LE -> INTERNAL.  The third argument is the length of the constant
   prefix string that precedes it: strlen ("UCS-4LE//") == 9 (the value 15
   belongs to "ISO-10646/UCS4/").  The same function serves both directions
   because the conversion is a plain byte swap (or copy).  */
BUILTIN_TRANSFORMATION (NULL, "UCS-4LE//", 9,
			"INTERNAL", 1, "=ucs4le->INTERNAL",
			__gconv_transform_internal_ucs4le, NULL, NULL,
			4, 4, 4, 4)
/* Please note that we need only one function for both directions. */
BUILTIN_ALIAS ("UTF8//", "ISO-10646/UTF8/")
BUILTIN_ALIAS ("UTF-8//", "ISO-10646/UTF8/")
BUILTIN_ALIAS ("ISO-IR-193//", "ISO-10646/UTF8/")
@ -54,11 +66,9 @@ BUILTIN_TRANSFORMATION ("ISO-10646/UTF-?8/", "ISO-10646/UTF", 13,
BUILTIN_ALIAS ("UCS2//", "ISO-10646/UCS2/")
BUILTIN_ALIAS ("UCS-2//", "ISO-10646/UCS2/")
BUILTIN_ALIAS ("UNICODE//", "ISO-10646/UCS2/")
BUILTIN_ALIAS ("UNICODEBIG//", "ISO-10646/UCS2/")
BUILTIN_ALIAS ("OSF00010100//", "ISO-10646/UCS2/")
BUILTIN_ALIAS ("OSF00010101//", "ISO-10646/UCS2/")
BUILTIN_ALIAS ("OSF00010102//", "ISO-10646/UCS2/")
BUILTIN_ALIAS ("OSF00010100//", "ISO-10646/UCS2/") /* level 1 */
BUILTIN_ALIAS ("OSF00010101//", "ISO-10646/UCS2/") /* level 2 */
BUILTIN_ALIAS ("OSF00010102//", "ISO-10646/UCS2/") /* level 3 */
BUILTIN_TRANSFORMATION (NULL, "ISO-10646/UCS2/", 15, "INTERNAL",
1, "=ucs2->INTERNAL",
@ -71,12 +81,34 @@ BUILTIN_TRANSFORMATION (NULL, "INTERNAL", 8, "ISO-10646/UCS2/",
4, 4, 2, 2)
#if BYTE_ORDER == BIG_ENDIAN
BUILTIN_ALIAS ("UNICODEBIG//", "ISO-10646/UCS2/")
BUILTIN_ALIAS ("UCS-2BE//", "ISO-10646/UCS2/")
BUILTIN_ALIAS ("UCS-2LE//", "UNICODELITTLE//")
BUILTIN_TRANSFORMATION (NULL, "UNICODELITTLE//", 15, "INTERNAL",
1, "=ucs2little->INTERNAL",
__gconv_transform_ucs2little_internal, NULL, NULL,
1, "=ucs2reverse->INTERNAL",
__gconv_transform_ucs2reverse_internal, NULL, NULL,
2, 2, 4, 4)
BUILTIN_TRANSFORMATION (NULL, "INTERNAL", 8, "UNICODELITTLE//",
1, "=INTERNAL->ucs2little",
__gconv_transform_internal_ucs2little, NULL, NULL,
1, "=INTERNAL->ucs2reverse",
__gconv_transform_internal_ucs2reverse, NULL, NULL,
4, 4, 2, 2)
#else
BUILTIN_ALIAS ("UNICODELITTLE//", "ISO-10646/UCS2/")
BUILTIN_ALIAS ("UCS-2LE//", "ISO-10646/UCS2/")
BUILTIN_ALIAS ("UCS-2BE//", "UNICODEBIG//")
/* UNICODEBIG -> INTERNAL on a little-endian host: the 16-bit units have to
   be byte-swapped.  The third argument is the length of the constant
   prefix string: strlen ("UNICODEBIG//") == 12 (15 is the length of
   "UNICODELITTLE//" used in the big-endian branch).  */
BUILTIN_TRANSFORMATION (NULL, "UNICODEBIG//", 12, "INTERNAL",
			1, "=ucs2reverse->INTERNAL",
			__gconv_transform_ucs2reverse_internal, NULL, NULL,
			2, 2, 4, 4)
BUILTIN_TRANSFORMATION (NULL, "INTERNAL", 8, "UNICODEBIG//",
1, "=INTERNAL->ucs2reverse",
__gconv_transform_internal_ucs2reverse, NULL, NULL,
4, 4, 2, 2)
#endif

View File

@ -169,9 +169,12 @@ __BUILTIN_TRANS (__gconv_transform_utf8_internal);
__BUILTIN_TRANS (__gconv_transform_internal_utf8);
__BUILTIN_TRANS (__gconv_transform_ucs2_internal);
__BUILTIN_TRANS (__gconv_transform_internal_ucs2);
__BUILTIN_TRANS (__gconv_transform_ucs2reverse_internal);
__BUILTIN_TRANS (__gconv_transform_internal_ucs2reverse);
__BUILTIN_TRANS (__gconv_transform_ucs2little_internal);
__BUILTIN_TRANS (__gconv_transform_internal_ucs2little);
__BUILTIN_TRANS (__gconv_transform_internal_ucs4);
__BUILTIN_TRANS (__gconv_transform_internal_ucs4le);
__BUILTIN_TRANS (__gconv_transform_internal_utf16);
__BUILTIN_TRANS (__gconv_transform_utf16_internal);
/* Drop the helper macro used for the declarations above so that it does
   not leak into files including this header.  (The name was previously
   misspelled, which made the #undef a no-op.)  */
# undef __BUILTIN_TRANS

View File

@ -105,6 +105,61 @@ internal_ucs4_loop (const unsigned char **inptrp, const unsigned char *inend,
#include <iconv/skeleton.c>
/* Similarly for the other byte order. */
#define DEFINE_INIT 0
#define DEFINE_FINI 0
#define MIN_NEEDED_FROM 4
#define MIN_NEEDED_TO 4
#define FROM_DIRECTION 1
#define FROM_LOOP internal_ucs4le_loop
#define TO_LOOP internal_ucs4le_loop /* This is not used. */
#define FUNCTION_NAME __gconv_transform_internal_ucs4le
/* Convert between the internal (host byte order, 4 bytes per character)
   representation and little-endian UCS4.

   As many complete 4-byte units as fit into both the remaining input
   [*INPTRP, INEND) and the remaining output [*OUTPTRP, OUTEND) are
   transferred.  On a little-endian host the data is copied verbatim, on a
   big-endian host every unit is byte-swapped.  *INPTRP and *OUTPTRP are
   advanced past the processed bytes and, if CONVERTED is not NULL, the
   number of transferred units is added to *CONVERTED.  STATE and DATA are
   not used.

   Returns __GCONV_FULL_OUTPUT if the output buffer is now full,
   otherwise __GCONV_EMPTY_INPUT if all input was consumed, otherwise
   __GCONV_INCOMPLETE_INPUT.  */
static inline int
internal_ucs4le_loop (const unsigned char **inptrp, const unsigned char *inend,
		      unsigned char **outptrp, unsigned char *outend,
		      mbstate_t *state, void *data, size_t *converted)
{
  const unsigned char *inptr = *inptrp;
  unsigned char *outptr = *outptrp;
  size_t n_convert = MIN (inend - inptr, outend - outptr) / 4;
  int result;

#if __BYTE_ORDER == __BIG_ENDIAN
  /* Sigh, we have to do some real work.  */
  size_t cnt;

  /* Store through a plain cast and advance the byte pointer separately;
     incrementing the result of a cast is not valid C.  */
  for (cnt = 0; cnt < n_convert; ++cnt, inptr += 4, outptr += 4)
    *(uint32_t *) outptr = bswap_32 (*(const uint32_t *) inptr);

  *inptrp = inptr;
  *outptrp = outptr;
#elif __BYTE_ORDER == __LITTLE_ENDIAN
  /* Simply copy the data.  */
  memcpy (outptr, inptr, n_convert * 4);
  *inptrp = inptr + n_convert * 4;
  *outptrp = outptr + n_convert * 4;
#else
# error "This endianess is not supported."
#endif

  /* Determine the status.  */
  if (*outptrp == outend)
    result = __GCONV_FULL_OUTPUT;
  else if (*inptrp == inend)
    result = __GCONV_EMPTY_INPUT;
  else
    result = __GCONV_INCOMPLETE_INPUT;

  /* Report the number of converted characters to the caller.  The counter
     itself must be updated, not the local copy of the pointer.  */
  if (converted != NULL)
    *converted += n_convert;

  return result;
}
#include <iconv/skeleton.c>
/* Convert from ISO 646-IRV to the internal (UCS4-like) format. */
#define DEFINE_INIT 0
#define DEFINE_FINI 0
@ -408,7 +463,7 @@ internal_ucs4_loop (const unsigned char **inptrp, const unsigned char *inend,
#define FROM_DIRECTION 1
#define FROM_LOOP ucs2little_internal_loop
#define TO_LOOP ucs2little_internal_loop /* This is not used.*/
#define FUNCTION_NAME __gconv_transform_ucs2little_internal
#define FUNCTION_NAME __gconv_transform_ucs2reverse_internal
#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
@ -433,7 +488,7 @@ internal_ucs4_loop (const unsigned char **inptrp, const unsigned char *inend,
#define FROM_DIRECTION 1
#define FROM_LOOP internal_ucs2little_loop
#define TO_LOOP internal_ucs2little_loop /* This is not used.*/
#define FUNCTION_NAME __gconv_transform_internal_ucs2little
#define FUNCTION_NAME __gconv_transform_internal_ucs2reverse
#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO

View File

@ -45,7 +45,7 @@ modules := ISO8859-1 ISO8859-2 ISO8859-3 ISO8859-4 ISO8859-5 \
INIS-CYRILLIC ISO_6937-2 ISO_2033 ISO_5427 ISO_5427-EXT \
ISO_5428 ISO_10367-BOX MAC-IS MAC-UK NATS-DANO NATS-SEFI \
SAMI-WS2 ISO-IR-197 TIS-620 KOI8-U GBK ISIRI-3342 GBGBK \
ISO-2022-CN libISOIR165 UTF-16
ISO-2022-CN libISOIR165 UTF-16 UNICODE
modules.so := $(addsuffix .so, $(modules))
@ -118,7 +118,7 @@ distribute := gconv-modules extra-module.mk gap.awk gaptab.awk \
macintosh.c mac-is.c mac-uk.c nats-dano.c nats-sefi.c sjis.c \
t.61.c uhc.c sami-ws2.c iso-ir-197.c tis-620.c koi8-u.c \
isiri-3342.c gbgbk.c iso-2022-cn.c cns11643l2.h iso8859-16.c \
utf-16.c
utf-16.c unicode.c
# We build the transformation modules only when we build shared libs.
ifeq (yes,$(build-shared))

View File

@ -1184,3 +1184,8 @@ module INTERNAL UTF-16LE// UTF-16 1
# from to module cost
module UTF-16BE// INTERNAL UTF-16 1
module INTERNAL UTF-16BE// UTF-16 1
# from to module cost
alias CSUNICODE// UNICODE//
module UNICODE// INTERNAL UNICODE 1
module INTERNAL UNICODE// UNICODE 1

190
iconvdata/unicode.c Normal file
View File

@ -0,0 +1,190 @@
/* Conversion module for Unicode
Copyright (C) 1999, 2000 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Ulrich Drepper <drepper@cygnus.com>, 1999.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with the GNU C Library; see the file COPYING.LIB. If not,
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
#include <byteswap.h>
#include <gconv.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
/* This is the Byte Order Mark character (BOM). */
#define BOM 0xfeff
/* And in the other endian format. */
#define BOM_OE 0xfffe
/* Definitions used in the body of the `gconv' function. */
#define FROM_LOOP from_unicode_loop
#define TO_LOOP to_unicode_loop
#define DEFINE_INIT 0
#define DEFINE_FINI 0
/* Bytes per character on the UNICODE side (2) and on the INTERNAL side (4);
   gconv_init stores these in the step's __min/__max_needed_* fields. */
#define MIN_NEEDED_FROM 2
#define MIN_NEEDED_TO 4
/* `dir' is the local variable declared by PREPARE_LOOP below. */
#define FROM_DIRECTION (dir == from_unicode)
/* Per-call setup run before the conversion loop.  It relies on `step',
   `data', `inptr', `inbufend', `outbuf' and `outend' being in scope at the
   point of expansion (presumably provided by <iconv/skeleton.c>; confirm
   there).

   - UNICODE -> INTERNAL, first invocation (__invocation_counter == 0):
     look at the first 16-bit unit.  With fewer than two input bytes
     return __GCONV_EMPTY_INPUT.  A BOM is skipped; a byte-swapped BOM
     (BOM_OE) is skipped as well and `swap' is set in the step data.
     If there is no BOM nothing is consumed and `swap' keeps the value
     gconv_init stored (0).
   - INTERNAL -> UNICODE, first invocation and not __internal_use:
     write a BOM (as a host-order uint16_t) to the output, or return
     __GCONV_FULL_OUTPUT if there is no room for it.

   The local `swap' is loaded from the step data only after the BOM check,
   so a swap detected in this very call is already in effect. */
#define PREPARE_LOOP \
enum direction dir = ((struct unicode_data *) step->__data)->dir; \
int swap; \
if (FROM_DIRECTION) \
{ \
if (data->__invocation_counter == 0) \
{ \
/* We have to find out which byte order the file is encoded in. */ \
if (inptr + 2 > inbufend) \
return __GCONV_EMPTY_INPUT; \
\
if (*(uint16_t *) inptr == BOM) \
/* Simply ignore the BOM character. */ \
inptr += 2; \
else if (*(uint16_t *) inptr == BOM_OE) \
{ \
((struct unicode_data *) step->__data)->swap = 1; \
inptr += 2; \
} \
} \
} \
else if (!data->__internal_use && data->__invocation_counter == 0) \
{ \
/* Emit the Byte Order Mark. */ \
if (outbuf + 2 > outend) \
return __GCONV_FULL_OUTPUT; \
\
*(uint16_t *) outbuf = BOM; \
outbuf += 2; \
} \
swap = ((struct unicode_data *) step->__data)->swap;
/* Extra arguments handed to the loop functions; they match the
   EXTRA_LOOP_DECLS parameter lists defined for both loops. */
#define EXTRA_LOOP_ARGS , data, swap
/* Direction of the transformation. */
enum direction
{
/* Placeholder used to initialize `dir' in gconv_init before the name check. */
illegal_dir,
/* INTERNAL -> UNICODE. */
to_unicode,
/* UNICODE -> INTERNAL. */
from_unicode
};
/* Per-step private data, allocated in gconv_init, stored in step->__data
   and released in gconv_end. */
struct unicode_data
{
/* Conversion direction selected in gconv_init. */
enum direction dir;
/* Nonzero if 16-bit input units must be byte-swapped.  Initialized to 0
   and set to 1 by PREPARE_LOOP when a byte-swapped BOM is found. */
int swap;
};
/* Module initialization.  Determine the conversion direction from the
   step's source charset name, allocate the private `struct unicode_data'
   and attach it to STEP together with the per-character size limits.

   Returns __GCONV_OK on success and __GCONV_NOMEM if the private data
   cannot be allocated (STEP is left untouched in that case).  */
int
gconv_init (struct __gconv_step *step)
{
  struct unicode_data *new_data;
  enum direction dir;

  /* Determine which direction.  The module is registered in gconv-modules
     under the name "UNICODE//", so the name presumably arrives here with
     the trailing slashes (TODO confirm against the gconv database code);
     the bare spelling is still accepted.  Comparing only against
     "UNICODE" would classify "UNICODE//" as the to_unicode direction.  */
  if (__strcasecmp (step->__from_name, "UNICODE//") == 0
      || __strcasecmp (step->__from_name, "UNICODE") == 0)
    dir = from_unicode;
  else
    dir = to_unicode;

  new_data = malloc (sizeof *new_data);
  if (new_data == NULL)
    return __GCONV_NOMEM;

  new_data->dir = dir;
  /* Assume host byte order until PREPARE_LOOP sees a swapped BOM.  */
  new_data->swap = 0;
  step->__data = new_data;

  if (dir == from_unicode)
    {
      step->__min_needed_from = MIN_NEEDED_FROM;
      step->__max_needed_from = MIN_NEEDED_FROM;
      step->__min_needed_to = MIN_NEEDED_TO;
      step->__max_needed_to = MIN_NEEDED_TO;
    }
  else
    {
      /* Opposite direction: the roles of the two sizes are exchanged.  */
      step->__min_needed_from = MIN_NEEDED_TO;
      step->__max_needed_from = MIN_NEEDED_TO;
      step->__min_needed_to = MIN_NEEDED_FROM;
      step->__max_needed_to = MIN_NEEDED_FROM;
    }

  step->__stateful = 0;

  return __GCONV_OK;
}
/* Module teardown: release the private data attached to the step.  */
void
gconv_end (struct __gconv_step *data)
{
  void *private_data = data->__data;

  free (private_data);
}
/* Convert from the internal (UCS4-like) format to UCS2. */
/* Input units are 4 bytes (MIN_NEEDED_TO), output units 2 bytes
   (MIN_NEEDED_FROM); the generated loop function is TO_LOOP. */
#define MIN_NEEDED_INPUT MIN_NEEDED_TO
#define MIN_NEEDED_OUTPUT MIN_NEEDED_FROM
#define LOOPFCT TO_LOOP
/* One conversion step: read a 32-bit value, reject anything >= 0x10000
   with __GCONV_ILLEGAL_INPUT (it does not fit into 16 bits), otherwise
   store it as a uint16_t.  `swap' is not consulted here, so the output is
   written in host byte order.  `inptr', `outptr' and `result' are expected
   to come from the including loop template. */
#define BODY \
{ \
uint32_t c = *((uint32_t *) inptr); \
\
if (c >= 0x10000) \
{ \
result = __GCONV_ILLEGAL_INPUT; \
break; \
} \
\
*((uint16_t *) outptr) = c; \
\
outptr += 2; \
inptr += 4; \
}
/* Additional loop parameters; they correspond to EXTRA_LOOP_ARGS. */
#define EXTRA_LOOP_DECLS \
, struct __gconv_step_data *step_data, int swap
#include <iconv/loop.c>
/* Convert from UCS2 to the internal (UCS4-like) format. */
/* Input units are 2 bytes (MIN_NEEDED_FROM), output units 4 bytes
   (MIN_NEEDED_TO); the generated loop function is FROM_LOOP. */
#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
#define LOOPFCT FROM_LOOP
/* One conversion step: read a 16-bit unit, byte-swap it when `swap' is
   nonzero (set by PREPARE_LOOP after seeing a byte-swapped BOM), and store
   it zero-extended as a 32-bit value.  No range check is made on the
   value read. */
#define BODY \
{ \
uint16_t u1 = *(uint16_t *) inptr; \
\
if (swap) \
u1 = bswap_16 (u1); \
\
*((uint32_t *) outptr) = u1; \
\
inptr += 2; \
outptr += 4; \
}
/* Additional loop parameters; they correspond to EXTRA_LOOP_ARGS. */
#define EXTRA_LOOP_DECLS \
, struct __gconv_step_data *step_data, int swap
#include <iconv/loop.c>
/* Now define the toplevel functions. */
#include <iconv/skeleton.c>

View File

@ -27,6 +27,8 @@
/* This is the Byte Order Mark character (BOM). */
#define BOM 0xfeff
/* And in the other byte order. */
#define BOM_OE 0xfffe
/* Definitions used in the body of the `gconv' function. */
@ -41,8 +43,27 @@
#define PREPARE_LOOP \
enum direction dir = ((struct utf16_data *) step->__data)->dir; \
enum variant var = ((struct utf16_data *) step->__data)->var; \
if (!FROM_DIRECTION && var == UTF_16 && !data->__internal_use \
&& data->__invocation_counter == 0) \
int swap = ((struct utf16_data *) step->__data)->swap; \
if (FROM_DIRECTION || var == UTF_16) \
{ \
if (data->__invocation_counter == 0) \
{ \
/* We have to find out which byte order the file is encoded in. */ \
if (inptr + 2 > inbufend) \
return __GCONV_EMPTY_INPUT; \
\
if (*(uint16_t *) inptr == BOM) \
/* Simply ignore the BOM character. */ \
inptr += 2; \
else if (*(uint16_t *) inptr == BOM_OE) \
{ \
((struct utf16_data *) step->__data)->swap = 1; \
inptr += 2; \
} \
} \
} \
else if (!FROM_DIRECTION && var == UTF_16 && !data->__internal_use \
&& data->__invocation_counter == 0) \
{ \
/* Emit the Byte Order Mark. */ \
if (outbuf + 2 > outend) \
@ -51,7 +72,7 @@
*(uint16_t *) outbuf = BOM; \
outbuf += 2; \
}
#define EXTRA_LOOP_ARGS , var, data
#define EXTRA_LOOP_ARGS , var, data, swap
/* Direction of the transformation. */
@ -74,6 +95,7 @@ struct utf16_data
{
enum direction dir;
enum variant var;
int swap;
};
@ -127,6 +149,9 @@ gconv_init (struct __gconv_step *step)
{
new_data->dir = dir;
new_data->var = var;
new_data->swap = ((var == UTF_16LE && BYTE_ORDER == BIG_ENDIAN)
|| (var == UTF_16BE
&& BYTE_ORDER == LITTLE_ENDIAN));
step->__data = new_data;
if (dir == from_utf16)
@ -170,8 +195,7 @@ gconv_end (struct __gconv_step *data)
{ \
uint32_t c = *((uint32_t *) inptr); \
\
if ((__BYTE_ORDER == __LITTLE_ENDIAN && var == UTF_16BE) \
|| (__BYTE_ORDER == __BIG_ENDIAN && var == UTF_16LE)) \
if (swap) \
{ \
if (c >= 0x10000) \
{ \
@ -225,7 +249,7 @@ gconv_end (struct __gconv_step *data)
inptr += 4; \
}
#define EXTRA_LOOP_DECLS \
, enum variant var, struct __gconv_step_data *step_data
, enum variant var, struct __gconv_step_data *step_data, int swap
#include <iconv/loop.c>
@ -238,8 +262,7 @@ gconv_end (struct __gconv_step *data)
{ \
uint16_t u1 = *(uint16_t *) inptr; \
\
if ((__BYTE_ORDER == __LITTLE_ENDIAN && var == UTF_16BE) \
|| (__BYTE_ORDER == __BIG_ENDIAN && var == UTF_16LE)) \
if (swap) \
{ \
u1 = bswap_16 (u1); \
\
@ -277,16 +300,6 @@ gconv_end (struct __gconv_step *data)
} \
else \
{ \
if (u1 == BOM && var == UTF_16 && !step_data->__internal_use \
&& step_data->__invocation_counter == 0 && inptr == *inptrp) \
{ \
/* This is the first word in the file and it is the BOM and \
we are converting a file without specified byte order. \
Simply sack the BOM. */ \
inptr += 2; \
continue; \
} \
\
if (u1 < 0xd800 || u1 > 0xdfff) \
{ \
/* No surrogate. */ \
@ -322,7 +335,7 @@ gconv_end (struct __gconv_step *data)
outptr += 4; \
}
#define EXTRA_LOOP_DECLS \
, enum variant var, struct __gconv_step_data *step_data
, enum variant var, struct __gconv_step_data *step_data, int swap
#include <iconv/loop.c>

View File

@ -116,7 +116,7 @@ case $add_ons in
*)
message="\
*** WARNING: Are you sure you do not want to use the \`linuxthreads'
*** and \`crypt' add-ons?"
*** add-on?"
;;
esac