cpython/Modules/_codecsmodule.c
Guido van Rossum 24bdb0474f Marc-Andre Lemburg:
The attached patch set includes a workaround to get Python with
Unicode compile on BSDI 4.x (courtesy Thomas Wouters; the cause
is a bug in the BSDI wchar.h header file) and Python interfaces
for the MBCS codec donated by Mark Hammond.

Also included are some minor corrections w/r to the docs of
the new "es" and "es#" parser markers (use PyMem_Free() instead
of free(); thanks to Mark Hammond for finding these).

The unicodedata tests are now in a separate file
(test_unicodedata.py) to avoid problems if the module cannot
be found.
2000-03-28 20:29:59 +00:00

576 lines
13 KiB
C

/* ------------------------------------------------------------------------
_codecs -- Provides access to the codec registry and the builtin
codecs.
This module should never be imported directly. The standard library
module "codecs" wraps this builtin module for use within Python.
The codec registry is accessible via:
register(search_function) -> None
lookup(encoding) -> (encoder, decoder, stream_reader, stream_writer)
The builtin Unicode codecs use the following interface:
<encoding>_encode(Unicode_object[,errors='strict']) ->
(string object, bytes consumed)
<encoding>_decode(char_buffer_obj[,errors='strict']) ->
(Unicode object, bytes consumed)
These <encoding>s are available: utf_8, unicode_escape,
raw_unicode_escape, unicode_internal, latin_1, ascii (7-bit)
Written by Marc-Andre Lemburg (mal@lemburg.com).
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
------------------------------------------------------------------------ */
#include "Python.h"
/* --- Registry ----------------------------------------------------------- */
static
PyObject *codecregister(PyObject *self, PyObject *args)
{
PyObject *search_function;
if (!PyArg_ParseTuple(args, "O:register", &search_function))
goto onError;
if (PyCodec_Register(search_function))
goto onError;
Py_INCREF(Py_None);
return Py_None;
onError:
return NULL;
}
static
PyObject *codeclookup(PyObject *self, PyObject *args)
{
char *encoding;
if (!PyArg_ParseTuple(args, "s:lookup", &encoding))
goto onError;
return _PyCodec_Lookup(encoding);
onError:
return NULL;
}
/* --- Helpers ------------------------------------------------------------ */
static
PyObject *codec_tuple(PyObject *unicode,
int len)
{
PyObject *v,*w;
if (unicode == NULL)
return NULL;
v = PyTuple_New(2);
if (v == NULL) {
Py_DECREF(unicode);
return NULL;
}
PyTuple_SET_ITEM(v,0,unicode);
w = PyInt_FromLong(len);
if (w == NULL) {
Py_DECREF(v);
return NULL;
}
PyTuple_SET_ITEM(v,1,w);
return v;
}
/* --- Decoder ------------------------------------------------------------ */
static PyObject *
unicode_internal_decode(PyObject *self,
PyObject *args)
{
const char *data;
int size;
const char *errors = NULL;
if (!PyArg_ParseTuple(args, "s#|z:unicode_internal_decode",
&data, &size, &errors))
return NULL;
return codec_tuple(PyUnicode_FromUnicode((Py_UNICODE *)data,
size / sizeof(Py_UNICODE)),
size);
}
static PyObject *
utf_8_decode(PyObject *self,
PyObject *args)
{
const char *data;
int size;
const char *errors = NULL;
if (!PyArg_ParseTuple(args, "t#|z:utf_8_decode",
&data, &size, &errors))
return NULL;
return codec_tuple(PyUnicode_DecodeUTF8(data, size, errors),
size);
}
static PyObject *
utf_16_decode(PyObject *self,
PyObject *args)
{
const char *data;
int size;
const char *errors = NULL;
int byteorder = 0;
if (!PyArg_ParseTuple(args, "t#|z:utf_16_decode",
&data, &size, &errors))
return NULL;
return codec_tuple(PyUnicode_DecodeUTF16(data, size, errors, &byteorder),
size);
}
static PyObject *
utf_16_le_decode(PyObject *self,
PyObject *args)
{
const char *data;
int size;
const char *errors = NULL;
int byteorder = -1;
if (!PyArg_ParseTuple(args, "t#|z:utf_16_le_decode",
&data, &size, &errors))
return NULL;
return codec_tuple(PyUnicode_DecodeUTF16(data, size, errors, &byteorder),
size);
}
static PyObject *
utf_16_be_decode(PyObject *self,
PyObject *args)
{
const char *data;
int size;
const char *errors = NULL;
int byteorder = 1;
if (!PyArg_ParseTuple(args, "t#|z:utf_16_be_decode",
&data, &size, &errors))
return NULL;
return codec_tuple(PyUnicode_DecodeUTF16(data, size, errors, &byteorder),
size);
}
/* This non-standard version also provides access to the byteorder
parameter of the builtin UTF-16 codec.
It returns a tuple (unicode, bytesread, byteorder) with byteorder
being the value in effect at the end of data.
*/
static PyObject *
utf_16_ex_decode(PyObject *self,
PyObject *args)
{
const char *data;
int size;
const char *errors = NULL;
int byteorder = 0;
PyObject *unicode, *tuple;
if (!PyArg_ParseTuple(args, "t#|zi:utf_16_ex_decode",
&data, &size, &errors, &byteorder))
return NULL;
unicode = PyUnicode_DecodeUTF16(data, size, errors, &byteorder);
if (unicode == NULL)
return NULL;
tuple = Py_BuildValue("Oii", unicode, size, byteorder);
Py_DECREF(unicode);
return tuple;
}
static PyObject *
unicode_escape_decode(PyObject *self,
PyObject *args)
{
const char *data;
int size;
const char *errors = NULL;
if (!PyArg_ParseTuple(args, "t#|z:unicode_escape_decode",
&data, &size, &errors))
return NULL;
return codec_tuple(PyUnicode_DecodeUnicodeEscape(data, size, errors),
size);
}
static PyObject *
raw_unicode_escape_decode(PyObject *self,
PyObject *args)
{
const char *data;
int size;
const char *errors = NULL;
if (!PyArg_ParseTuple(args, "t#|z:raw_unicode_escape_decode",
&data, &size, &errors))
return NULL;
return codec_tuple(PyUnicode_DecodeRawUnicodeEscape(data, size, errors),
size);
}
static PyObject *
latin_1_decode(PyObject *self,
PyObject *args)
{
const char *data;
int size;
const char *errors = NULL;
if (!PyArg_ParseTuple(args, "t#|z:latin_1_decode",
&data, &size, &errors))
return NULL;
return codec_tuple(PyUnicode_DecodeLatin1(data, size, errors),
size);
}
static PyObject *
ascii_decode(PyObject *self,
PyObject *args)
{
const char *data;
int size;
const char *errors = NULL;
if (!PyArg_ParseTuple(args, "t#|z:ascii_decode",
&data, &size, &errors))
return NULL;
return codec_tuple(PyUnicode_DecodeASCII(data, size, errors),
size);
}
static PyObject *
charmap_decode(PyObject *self,
PyObject *args)
{
const char *data;
int size;
const char *errors = NULL;
PyObject *mapping = NULL;
if (!PyArg_ParseTuple(args, "t#|zO:charmap_decode",
&data, &size, &errors, &mapping))
return NULL;
if (mapping == Py_None)
mapping = NULL;
return codec_tuple(PyUnicode_DecodeCharmap(data, size, mapping, errors),
size);
}
#ifdef MS_WIN32
static PyObject *
mbcs_decode(PyObject *self,
PyObject *args)
{
const char *data;
int size;
const char *errors = NULL;
if (!PyArg_ParseTuple(args, "t#|z:mbcs_decode",
&data, &size, &errors))
return NULL;
return codec_tuple(PyUnicode_DecodeMBCS(data, size, errors),
size);
}
#endif /* MS_WIN32 */
/* --- Encoder ------------------------------------------------------------ */
static PyObject *
readbuffer_encode(PyObject *self,
PyObject *args)
{
const char *data;
int size;
const char *errors = NULL;
if (!PyArg_ParseTuple(args, "s#|z:readbuffer_encode",
&data, &size, &errors))
return NULL;
return codec_tuple(PyString_FromStringAndSize(data, size),
size);
}
static PyObject *
charbuffer_encode(PyObject *self,
PyObject *args)
{
const char *data;
int size;
const char *errors = NULL;
if (!PyArg_ParseTuple(args, "t#|z:charbuffer_encode",
&data, &size, &errors))
return NULL;
return codec_tuple(PyString_FromStringAndSize(data, size),
size);
}
static PyObject *
utf_8_encode(PyObject *self,
PyObject *args)
{
PyObject *str;
const char *errors = NULL;
if (!PyArg_ParseTuple(args, "U|z:utf_8_encode",
&str, &errors))
return NULL;
return codec_tuple(PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(str),
PyUnicode_GET_SIZE(str),
errors),
PyUnicode_GET_SIZE(str));
}
/* This version provides access to the byteorder parameter of the
builtin UTF-16 codecs as optional third argument. It defaults to 0
which means: use the native byte order and prepend the data with a
BOM mark.
*/
static PyObject *
utf_16_encode(PyObject *self,
PyObject *args)
{
PyObject *str;
const char *errors = NULL;
int byteorder = 0;
if (!PyArg_ParseTuple(args, "U|zi:utf_16_encode",
&str, &errors, &byteorder))
return NULL;
return codec_tuple(PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(str),
PyUnicode_GET_SIZE(str),
errors,
byteorder),
PyUnicode_GET_SIZE(str));
}
static PyObject *
utf_16_le_encode(PyObject *self,
PyObject *args)
{
PyObject *str;
const char *errors = NULL;
if (!PyArg_ParseTuple(args, "U|zi:utf_16_le_encode",
&str, &errors))
return NULL;
return codec_tuple(PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(str),
PyUnicode_GET_SIZE(str),
errors,
-1),
PyUnicode_GET_SIZE(str));
}
static PyObject *
utf_16_be_encode(PyObject *self,
PyObject *args)
{
PyObject *str;
const char *errors = NULL;
if (!PyArg_ParseTuple(args, "U|zi:utf_16_be_encode",
&str, &errors))
return NULL;
return codec_tuple(PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(str),
PyUnicode_GET_SIZE(str),
errors,
+1),
PyUnicode_GET_SIZE(str));
}
static PyObject *
unicode_escape_encode(PyObject *self,
PyObject *args)
{
PyObject *str;
const char *errors = NULL;
if (!PyArg_ParseTuple(args, "U|z:unicode_escape_encode",
&str, &errors))
return NULL;
return codec_tuple(PyUnicode_EncodeUnicodeEscape(
PyUnicode_AS_UNICODE(str),
PyUnicode_GET_SIZE(str)),
PyUnicode_GET_SIZE(str));
}
static PyObject *
raw_unicode_escape_encode(PyObject *self,
PyObject *args)
{
PyObject *str;
const char *errors = NULL;
if (!PyArg_ParseTuple(args, "U|z:raw_unicode_escape_encode",
&str, &errors))
return NULL;
return codec_tuple(PyUnicode_EncodeRawUnicodeEscape(
PyUnicode_AS_UNICODE(str),
PyUnicode_GET_SIZE(str)),
PyUnicode_GET_SIZE(str));
}
static PyObject *
latin_1_encode(PyObject *self,
PyObject *args)
{
PyObject *str;
const char *errors = NULL;
if (!PyArg_ParseTuple(args, "U|z:latin_1_encode",
&str, &errors))
return NULL;
return codec_tuple(PyUnicode_EncodeLatin1(
PyUnicode_AS_UNICODE(str),
PyUnicode_GET_SIZE(str),
errors),
PyUnicode_GET_SIZE(str));
}
static PyObject *
ascii_encode(PyObject *self,
PyObject *args)
{
PyObject *str;
const char *errors = NULL;
if (!PyArg_ParseTuple(args, "U|z:ascii_encode",
&str, &errors))
return NULL;
return codec_tuple(PyUnicode_EncodeASCII(
PyUnicode_AS_UNICODE(str),
PyUnicode_GET_SIZE(str),
errors),
PyUnicode_GET_SIZE(str));
}
static PyObject *
charmap_encode(PyObject *self,
PyObject *args)
{
PyObject *str;
const char *errors = NULL;
PyObject *mapping = NULL;
if (!PyArg_ParseTuple(args, "U|zO:charmap_encode",
&str, &errors, &mapping))
return NULL;
if (mapping == Py_None)
mapping = NULL;
return codec_tuple(PyUnicode_EncodeCharmap(
PyUnicode_AS_UNICODE(str),
PyUnicode_GET_SIZE(str),
mapping,
errors),
PyUnicode_GET_SIZE(str));
}
#ifdef MS_WIN32
static PyObject *
mbcs_encode(PyObject *self,
PyObject *args)
{
PyObject *str;
const char *errors = NULL;
if (!PyArg_ParseTuple(args, "U|z:mbcs_encode",
&str, &errors))
return NULL;
return codec_tuple(PyUnicode_EncodeMBCS(
PyUnicode_AS_UNICODE(str),
PyUnicode_GET_SIZE(str),
errors),
PyUnicode_GET_SIZE(str));
}
#endif /* MS_WIN32 */
/* --- Module API --------------------------------------------------------- */
static PyMethodDef _codecs_functions[] = {
{"register", codecregister, 1},
{"lookup", codeclookup, 1},
{"utf_8_encode", utf_8_encode, 1},
{"utf_8_decode", utf_8_decode, 1},
{"utf_16_encode", utf_16_encode, 1},
{"utf_16_le_encode", utf_16_le_encode, 1},
{"utf_16_be_encode", utf_16_be_encode, 1},
{"utf_16_decode", utf_16_decode, 1},
{"utf_16_le_decode", utf_16_le_decode, 1},
{"utf_16_be_decode", utf_16_be_decode, 1},
{"utf_16_ex_decode", utf_16_ex_decode, 1},
{"unicode_escape_encode", unicode_escape_encode, 1},
{"unicode_escape_decode", unicode_escape_decode, 1},
{"unicode_internal_encode", readbuffer_encode, 1},
{"unicode_internal_decode", unicode_internal_decode, 1},
{"raw_unicode_escape_encode", raw_unicode_escape_encode, 1},
{"raw_unicode_escape_decode", raw_unicode_escape_decode, 1},
{"latin_1_encode", latin_1_encode, 1},
{"latin_1_decode", latin_1_decode, 1},
{"ascii_encode", ascii_encode, 1},
{"ascii_decode", ascii_decode, 1},
{"charmap_encode", charmap_encode, 1},
{"charmap_decode", charmap_decode, 1},
{"readbuffer_encode", readbuffer_encode, 1},
{"charbuffer_encode", charbuffer_encode, 1},
#ifdef MS_WIN32
{"mbcs_encode", mbcs_encode, 1},
{"mbcs_decode", mbcs_decode, 1},
#endif
{NULL, NULL} /* sentinel */
};
DL_EXPORT(void)
init_codecs()
{
Py_InitModule("_codecs", _codecs_functions);
}