mirror of
https://github.com/python/cpython.git
synced 2024-12-11 18:53:56 +08:00
c9a8df24cc
svn+ssh://pythondev@svn.python.org/python/branches/py3k ........ r84655 | antoine.pitrou | 2010-09-09 22:30:23 +0200 (jeu., 09 sept. 2010) | 6 lines Issue #9804: ascii() now always represents unicode surrogate pairs as a single `\UXXXXXXXX`, regardless of whether the character is printable or not. Also, the "backslashreplace" error handler now joins surrogate pairs into a single character on UCS-2 builds. ........
1085 lines
31 KiB
C
1085 lines
31 KiB
C
/* ------------------------------------------------------------------------

   Python Codec Registry and support functions

Written by Marc-Andre Lemburg (mal@lemburg.com).

Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */
|
|
|
|
#include "Python.h"
|
|
#include <ctype.h>
|
|
|
|
/* --- Codec Registry ----------------------------------------------------- */
|
|
|
|
/* Import the standard encodings package which will register the first
   codec search function.

   This is done in a lazy way so that the Unicode implementation does
   not downgrade startup time of scripts not needing it.

   ImportErrors are silently ignored by this function. Only one try is
   made.

*/
|
|
|
|
static int _PyCodecRegistry_Init(void); /* Forward */
|
|
|
|
/* Append search_function to the interpreter's list of codec search
   functions.

   The codec registry is created on first use.  search_function must be a
   non-NULL callable, otherwise an exception is set.

   Returns the result of PyList_Append() on the search path, or -1 when
   the registry cannot be initialized or the argument is rejected. */
int
PyCodec_Register(PyObject *search_function)
{
    PyInterpreterState *interp = PyThreadState_GET()->interp;

    /* Lazily set up the registry; a non-zero result means it failed. */
    if (interp->codec_search_path == NULL && _PyCodecRegistry_Init() != 0)
        return -1;

    if (search_function == NULL) {
        PyErr_BadArgument();
        return -1;
    }
    if (!PyCallable_Check(search_function)) {
        PyErr_SetString(PyExc_TypeError, "argument must be callable");
        return -1;
    }
    return PyList_Append(interp->codec_search_path, search_function);
}
|
|
|
|
/* Convert a string to a normalized Python string: all characters are
|
|
converted to lower case, spaces are replaced with underscores. */
|
|
|
|
static
|
|
PyObject *normalizestring(const char *string)
|
|
{
|
|
register size_t i;
|
|
size_t len = strlen(string);
|
|
char *p;
|
|
PyObject *v;
|
|
|
|
if (len > PY_SSIZE_T_MAX) {
|
|
PyErr_SetString(PyExc_OverflowError, "string is too large");
|
|
return NULL;
|
|
}
|
|
|
|
p = PyMem_Malloc(len + 1);
|
|
if (p == NULL)
|
|
return NULL;
|
|
for (i = 0; i < len; i++) {
|
|
register char ch = string[i];
|
|
if (ch == ' ')
|
|
ch = '-';
|
|
else
|
|
ch = tolower(Py_CHARMASK(ch));
|
|
p[i] = ch;
|
|
}
|
|
p[i] = '\0';
|
|
v = PyUnicode_FromString(p);
|
|
if (v == NULL)
|
|
return NULL;
|
|
PyMem_Free(p);
|
|
return v;
|
|
}
|
|
|
|
/* Lookup the given encoding and return a tuple providing the codec
|
|
facilities.
|
|
|
|
The encoding string is looked up converted to all lower-case
|
|
characters. This makes encodings looked up through this mechanism
|
|
effectively case-insensitive.
|
|
|
|
If no codec is found, a LookupError is set and NULL returned.
|
|
|
|
As side effect, this tries to load the encodings package, if not
|
|
yet done. This is part of the lazy load strategy for the encodings
|
|
package.
|
|
|
|
*/
|
|
|
|
PyObject *_PyCodec_Lookup(const char *encoding)
|
|
{
|
|
PyInterpreterState *interp;
|
|
PyObject *result, *args = NULL, *v;
|
|
Py_ssize_t i, len;
|
|
|
|
if (encoding == NULL) {
|
|
PyErr_BadArgument();
|
|
goto onError;
|
|
}
|
|
|
|
interp = PyThreadState_GET()->interp;
|
|
if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
|
|
goto onError;
|
|
|
|
/* Convert the encoding to a normalized Python string: all
|
|
characters are converted to lower case, spaces and hyphens are
|
|
replaced with underscores. */
|
|
v = normalizestring(encoding);
|
|
if (v == NULL)
|
|
goto onError;
|
|
PyUnicode_InternInPlace(&v);
|
|
|
|
/* First, try to lookup the name in the registry dictionary */
|
|
result = PyDict_GetItem(interp->codec_search_cache, v);
|
|
if (result != NULL) {
|
|
Py_INCREF(result);
|
|
Py_DECREF(v);
|
|
return result;
|
|
}
|
|
|
|
/* Next, scan the search functions in order of registration */
|
|
args = PyTuple_New(1);
|
|
if (args == NULL)
|
|
goto onError;
|
|
PyTuple_SET_ITEM(args,0,v);
|
|
|
|
len = PyList_Size(interp->codec_search_path);
|
|
if (len < 0)
|
|
goto onError;
|
|
if (len == 0) {
|
|
PyErr_SetString(PyExc_LookupError,
|
|
"no codec search functions registered: "
|
|
"can't find encoding");
|
|
goto onError;
|
|
}
|
|
|
|
for (i = 0; i < len; i++) {
|
|
PyObject *func;
|
|
|
|
func = PyList_GetItem(interp->codec_search_path, i);
|
|
if (func == NULL)
|
|
goto onError;
|
|
result = PyEval_CallObject(func, args);
|
|
if (result == NULL)
|
|
goto onError;
|
|
if (result == Py_None) {
|
|
Py_DECREF(result);
|
|
continue;
|
|
}
|
|
if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
|
|
PyErr_SetString(PyExc_TypeError,
|
|
"codec search functions must return 4-tuples");
|
|
Py_DECREF(result);
|
|
goto onError;
|
|
}
|
|
break;
|
|
}
|
|
if (i == len) {
|
|
/* XXX Perhaps we should cache misses too ? */
|
|
PyErr_Format(PyExc_LookupError,
|
|
"unknown encoding: %s", encoding);
|
|
goto onError;
|
|
}
|
|
|
|
/* Cache and return the result */
|
|
if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
|
|
Py_DECREF(result);
|
|
goto onError;
|
|
}
|
|
Py_DECREF(args);
|
|
return result;
|
|
|
|
onError:
|
|
Py_XDECREF(args);
|
|
return NULL;
|
|
}
|
|
|
|
/* Codec registry encoding check API. */
|
|
|
|
int PyCodec_KnownEncoding(const char *encoding)
|
|
{
|
|
PyObject *codecs;
|
|
|
|
codecs = _PyCodec_Lookup(encoding);
|
|
if (!codecs) {
|
|
PyErr_Clear();
|
|
return 0;
|
|
}
|
|
else {
|
|
Py_DECREF(codecs);
|
|
return 1;
|
|
}
|
|
}
|
|
|
|
static
|
|
PyObject *args_tuple(PyObject *object,
|
|
const char *errors)
|
|
{
|
|
PyObject *args;
|
|
|
|
args = PyTuple_New(1 + (errors != NULL));
|
|
if (args == NULL)
|
|
return NULL;
|
|
Py_INCREF(object);
|
|
PyTuple_SET_ITEM(args,0,object);
|
|
if (errors) {
|
|
PyObject *v;
|
|
|
|
v = PyUnicode_FromString(errors);
|
|
if (v == NULL) {
|
|
Py_DECREF(args);
|
|
return NULL;
|
|
}
|
|
PyTuple_SET_ITEM(args, 1, v);
|
|
}
|
|
return args;
|
|
}
|
|
|
|
/* Helper function to get a codec item */
|
|
|
|
static
|
|
PyObject *codec_getitem(const char *encoding, int index)
|
|
{
|
|
PyObject *codecs;
|
|
PyObject *v;
|
|
|
|
codecs = _PyCodec_Lookup(encoding);
|
|
if (codecs == NULL)
|
|
return NULL;
|
|
v = PyTuple_GET_ITEM(codecs, index);
|
|
Py_DECREF(codecs);
|
|
Py_INCREF(v);
|
|
return v;
|
|
}
|
|
|
|
/* Helper function to create an incremental codec. */
|
|
|
|
static
|
|
PyObject *codec_getincrementalcodec(const char *encoding,
|
|
const char *errors,
|
|
const char *attrname)
|
|
{
|
|
PyObject *codecs, *ret, *inccodec;
|
|
|
|
codecs = _PyCodec_Lookup(encoding);
|
|
if (codecs == NULL)
|
|
return NULL;
|
|
inccodec = PyObject_GetAttrString(codecs, attrname);
|
|
Py_DECREF(codecs);
|
|
if (inccodec == NULL)
|
|
return NULL;
|
|
if (errors)
|
|
ret = PyObject_CallFunction(inccodec, "s", errors);
|
|
else
|
|
ret = PyObject_CallFunction(inccodec, NULL);
|
|
Py_DECREF(inccodec);
|
|
return ret;
|
|
}
|
|
|
|
/* Helper function to create a stream codec. */
|
|
|
|
static
|
|
PyObject *codec_getstreamcodec(const char *encoding,
|
|
PyObject *stream,
|
|
const char *errors,
|
|
const int index)
|
|
{
|
|
PyObject *codecs, *streamcodec, *codeccls;
|
|
|
|
codecs = _PyCodec_Lookup(encoding);
|
|
if (codecs == NULL)
|
|
return NULL;
|
|
|
|
codeccls = PyTuple_GET_ITEM(codecs, index);
|
|
if (errors != NULL)
|
|
streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
|
|
else
|
|
streamcodec = PyObject_CallFunction(codeccls, "O", stream);
|
|
Py_DECREF(codecs);
|
|
return streamcodec;
|
|
}
|
|
|
|
/* Convenience APIs to query the Codec registry.

   All APIs return a codec object with incremented refcount.

 */
|
|
|
|
/* Return element 0 of the codec tuple registered for encoding (the
   encoder) as a new reference, or NULL if the lookup fails. */
PyObject *
PyCodec_Encoder(const char *encoding)
{
    PyObject *encoder = codec_getitem(encoding, 0);

    return encoder;
}
|
|
|
|
/* Return element 1 of the codec tuple registered for encoding (the
   decoder) as a new reference, or NULL if the lookup fails. */
PyObject *
PyCodec_Decoder(const char *encoding)
{
    PyObject *decoder = codec_getitem(encoding, 1);

    return decoder;
}
|
|
|
|
/* Create an incremental encoder for encoding by calling the codec
   entry's "incrementalencoder" attribute; errors is forwarded when it is
   not NULL.  Returns NULL on failure. */
PyObject *
PyCodec_IncrementalEncoder(const char *encoding, const char *errors)
{
    static const char factory_attr[] = "incrementalencoder";

    return codec_getincrementalcodec(encoding, errors, factory_attr);
}
|
|
|
|
/* Create an incremental decoder for encoding by calling the codec
   entry's "incrementaldecoder" attribute; errors is forwarded when it is
   not NULL.  Returns NULL on failure. */
PyObject *
PyCodec_IncrementalDecoder(const char *encoding, const char *errors)
{
    static const char factory_attr[] = "incrementaldecoder";

    return codec_getincrementalcodec(encoding, errors, factory_attr);
}
|
|
|
|
/* Create a stream reader for stream by calling element 2 of the codec
   tuple registered for encoding; errors is forwarded when it is not
   NULL.  Returns NULL on failure. */
PyObject *
PyCodec_StreamReader(const char *encoding,
                     PyObject *stream,
                     const char *errors)
{
    PyObject *reader = codec_getstreamcodec(encoding, stream, errors, 2);

    return reader;
}
|
|
|
|
/* Create a stream writer for stream by calling element 3 of the codec
   tuple registered for encoding; errors is forwarded when it is not
   NULL.  Returns NULL on failure. */
PyObject *
PyCodec_StreamWriter(const char *encoding,
                     PyObject *stream,
                     const char *errors)
{
    PyObject *writer = codec_getstreamcodec(encoding, stream, errors, 3);

    return writer;
}
|
|
|
|
/* Encode an object (e.g. an Unicode object) using the given encoding
|
|
and return the resulting encoded object (usually a Python string).
|
|
|
|
errors is passed to the encoder factory as argument if non-NULL. */
|
|
|
|
PyObject *PyCodec_Encode(PyObject *object,
|
|
const char *encoding,
|
|
const char *errors)
|
|
{
|
|
PyObject *encoder = NULL;
|
|
PyObject *args = NULL, *result = NULL;
|
|
PyObject *v = NULL;
|
|
|
|
encoder = PyCodec_Encoder(encoding);
|
|
if (encoder == NULL)
|
|
goto onError;
|
|
|
|
args = args_tuple(object, errors);
|
|
if (args == NULL)
|
|
goto onError;
|
|
|
|
result = PyEval_CallObject(encoder, args);
|
|
if (result == NULL)
|
|
goto onError;
|
|
|
|
if (!PyTuple_Check(result) ||
|
|
PyTuple_GET_SIZE(result) != 2) {
|
|
PyErr_SetString(PyExc_TypeError,
|
|
"encoder must return a tuple (object, integer)");
|
|
goto onError;
|
|
}
|
|
v = PyTuple_GET_ITEM(result,0);
|
|
Py_INCREF(v);
|
|
/* We don't check or use the second (integer) entry. */
|
|
|
|
Py_DECREF(args);
|
|
Py_DECREF(encoder);
|
|
Py_DECREF(result);
|
|
return v;
|
|
|
|
onError:
|
|
Py_XDECREF(result);
|
|
Py_XDECREF(args);
|
|
Py_XDECREF(encoder);
|
|
return NULL;
|
|
}
|
|
|
|
/* Decode an object (usually a Python string) using the given encoding
|
|
and return an equivalent object (e.g. an Unicode object).
|
|
|
|
errors is passed to the decoder factory as argument if non-NULL. */
|
|
|
|
PyObject *PyCodec_Decode(PyObject *object,
|
|
const char *encoding,
|
|
const char *errors)
|
|
{
|
|
PyObject *decoder = NULL;
|
|
PyObject *args = NULL, *result = NULL;
|
|
PyObject *v;
|
|
|
|
decoder = PyCodec_Decoder(encoding);
|
|
if (decoder == NULL)
|
|
goto onError;
|
|
|
|
args = args_tuple(object, errors);
|
|
if (args == NULL)
|
|
goto onError;
|
|
|
|
result = PyEval_CallObject(decoder,args);
|
|
if (result == NULL)
|
|
goto onError;
|
|
if (!PyTuple_Check(result) ||
|
|
PyTuple_GET_SIZE(result) != 2) {
|
|
PyErr_SetString(PyExc_TypeError,
|
|
"decoder must return a tuple (object,integer)");
|
|
goto onError;
|
|
}
|
|
v = PyTuple_GET_ITEM(result,0);
|
|
Py_INCREF(v);
|
|
/* We don't check or use the second (integer) entry. */
|
|
|
|
Py_DECREF(args);
|
|
Py_DECREF(decoder);
|
|
Py_DECREF(result);
|
|
return v;
|
|
|
|
onError:
|
|
Py_XDECREF(args);
|
|
Py_XDECREF(decoder);
|
|
Py_XDECREF(result);
|
|
return NULL;
|
|
}
|
|
|
|
/* Register the error handling callback function error under the name
|
|
name. This function will be called by the codec when it encounters
|
|
an unencodable characters/undecodable bytes and doesn't know the
|
|
callback name, when name is specified as the error parameter
|
|
in the call to the encode/decode function.
|
|
Return 0 on success, -1 on error */
|
|
int PyCodec_RegisterError(const char *name, PyObject *error)
|
|
{
|
|
PyInterpreterState *interp = PyThreadState_GET()->interp;
|
|
if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
|
|
return -1;
|
|
if (!PyCallable_Check(error)) {
|
|
PyErr_SetString(PyExc_TypeError, "handler must be callable");
|
|
return -1;
|
|
}
|
|
return PyDict_SetItemString(interp->codec_error_registry,
|
|
(char *)name, error);
|
|
}
|
|
|
|
/* Lookup the error handling callback function registered under the
|
|
name error. As a special case NULL can be passed, in which case
|
|
the error handling callback for strict encoding will be returned. */
|
|
PyObject *PyCodec_LookupError(const char *name)
|
|
{
|
|
PyObject *handler = NULL;
|
|
|
|
PyInterpreterState *interp = PyThreadState_GET()->interp;
|
|
if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
|
|
return NULL;
|
|
|
|
if (name==NULL)
|
|
name = "strict";
|
|
handler = PyDict_GetItemString(interp->codec_error_registry, (char *)name);
|
|
if (!handler)
|
|
PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
|
|
else
|
|
Py_INCREF(handler);
|
|
return handler;
|
|
}
|
|
|
|
/* Set a TypeError saying that an error callback cannot handle the class
   of exc.  The message names exc.__class__.__name__; if either attribute
   lookup fails, no TypeError is set here. */
static void
wrong_exception_type(PyObject *exc)
{
    PyObject *cls, *cls_name;

    cls = PyObject_GetAttrString(exc, "__class__");
    if (cls == NULL)
        return;

    cls_name = PyObject_GetAttrString(cls, "__name__");
    Py_DECREF(cls);
    if (cls_name == NULL)
        return;

    PyErr_Format(PyExc_TypeError,
                 "don't know how to handle %S in error callback", cls_name);
    Py_DECREF(cls_name);
}
|
|
|
|
/* "strict" error callback: raise exc itself when it is an exception
   instance, otherwise set a TypeError.  Always returns NULL. */
PyObject *
PyCodec_StrictErrors(PyObject *exc)
{
    if (!PyExceptionInstance_Check(exc)) {
        PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
        return NULL;
    }
    PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
    return NULL;
}
|
|
|
|
|
|
/* "ignore" error callback: return the tuple (empty string, end), where
   end is the end position stored in the Unicode encode, decode or
   translate error exc.  Any other exception type results in a TypeError
   and a NULL return. */
PyObject *
PyCodec_IgnoreErrors(PyObject *exc)
{
    Py_ssize_t end;
    int failed;

    if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError))
        failed = PyUnicodeEncodeError_GetEnd(exc, &end);
    else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError))
        failed = PyUnicodeDecodeError_GetEnd(exc, &end);
    else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError))
        failed = PyUnicodeTranslateError_GetEnd(exc, &end);
    else {
        wrong_exception_type(exc);
        return NULL;
    }
    if (failed)
        return NULL;

    /* ouch: passing NULL, 0, pos gives None instead of u''.
       &end merely serves as a non-NULL pointer for the zero-length
       "u#" argument. */
    return Py_BuildValue("(u#n)", &end, 0, end);
}
|
|
|
|
|
|
PyObject *PyCodec_ReplaceErrors(PyObject *exc)
|
|
{
|
|
PyObject *restuple;
|
|
Py_ssize_t start;
|
|
Py_ssize_t end;
|
|
Py_ssize_t i;
|
|
|
|
if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
|
|
PyObject *res;
|
|
Py_UNICODE *p;
|
|
if (PyUnicodeEncodeError_GetStart(exc, &start))
|
|
return NULL;
|
|
if (PyUnicodeEncodeError_GetEnd(exc, &end))
|
|
return NULL;
|
|
res = PyUnicode_FromUnicode(NULL, end-start);
|
|
if (res == NULL)
|
|
return NULL;
|
|
for (p = PyUnicode_AS_UNICODE(res), i = start;
|
|
i<end; ++p, ++i)
|
|
*p = '?';
|
|
restuple = Py_BuildValue("(On)", res, end);
|
|
Py_DECREF(res);
|
|
return restuple;
|
|
}
|
|
else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
|
|
Py_UNICODE res = Py_UNICODE_REPLACEMENT_CHARACTER;
|
|
if (PyUnicodeDecodeError_GetEnd(exc, &end))
|
|
return NULL;
|
|
return Py_BuildValue("(u#n)", &res, 1, end);
|
|
}
|
|
else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) {
|
|
PyObject *res;
|
|
Py_UNICODE *p;
|
|
if (PyUnicodeTranslateError_GetStart(exc, &start))
|
|
return NULL;
|
|
if (PyUnicodeTranslateError_GetEnd(exc, &end))
|
|
return NULL;
|
|
res = PyUnicode_FromUnicode(NULL, end-start);
|
|
if (res == NULL)
|
|
return NULL;
|
|
for (p = PyUnicode_AS_UNICODE(res), i = start;
|
|
i<end; ++p, ++i)
|
|
*p = Py_UNICODE_REPLACEMENT_CHARACTER;
|
|
restuple = Py_BuildValue("(On)", res, end);
|
|
Py_DECREF(res);
|
|
return restuple;
|
|
}
|
|
else {
|
|
wrong_exception_type(exc);
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
|
|
{
|
|
if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
|
|
PyObject *restuple;
|
|
PyObject *object;
|
|
Py_ssize_t start;
|
|
Py_ssize_t end;
|
|
PyObject *res;
|
|
Py_UNICODE *p;
|
|
Py_UNICODE *startp;
|
|
Py_UNICODE *outp;
|
|
int ressize;
|
|
if (PyUnicodeEncodeError_GetStart(exc, &start))
|
|
return NULL;
|
|
if (PyUnicodeEncodeError_GetEnd(exc, &end))
|
|
return NULL;
|
|
if (!(object = PyUnicodeEncodeError_GetObject(exc)))
|
|
return NULL;
|
|
startp = PyUnicode_AS_UNICODE(object);
|
|
for (p = startp+start, ressize = 0; p < startp+end; ++p) {
|
|
if (*p<10)
|
|
ressize += 2+1+1;
|
|
else if (*p<100)
|
|
ressize += 2+2+1;
|
|
else if (*p<1000)
|
|
ressize += 2+3+1;
|
|
else if (*p<10000)
|
|
ressize += 2+4+1;
|
|
#ifndef Py_UNICODE_WIDE
|
|
else
|
|
ressize += 2+5+1;
|
|
#else
|
|
else if (*p<100000)
|
|
ressize += 2+5+1;
|
|
else if (*p<1000000)
|
|
ressize += 2+6+1;
|
|
else
|
|
ressize += 2+7+1;
|
|
#endif
|
|
}
|
|
/* allocate replacement */
|
|
res = PyUnicode_FromUnicode(NULL, ressize);
|
|
if (res == NULL) {
|
|
Py_DECREF(object);
|
|
return NULL;
|
|
}
|
|
/* generate replacement */
|
|
for (p = startp+start, outp = PyUnicode_AS_UNICODE(res);
|
|
p < startp+end; ++p) {
|
|
Py_UNICODE c = *p;
|
|
int digits;
|
|
int base;
|
|
*outp++ = '&';
|
|
*outp++ = '#';
|
|
if (*p<10) {
|
|
digits = 1;
|
|
base = 1;
|
|
}
|
|
else if (*p<100) {
|
|
digits = 2;
|
|
base = 10;
|
|
}
|
|
else if (*p<1000) {
|
|
digits = 3;
|
|
base = 100;
|
|
}
|
|
else if (*p<10000) {
|
|
digits = 4;
|
|
base = 1000;
|
|
}
|
|
#ifndef Py_UNICODE_WIDE
|
|
else {
|
|
digits = 5;
|
|
base = 10000;
|
|
}
|
|
#else
|
|
else if (*p<100000) {
|
|
digits = 5;
|
|
base = 10000;
|
|
}
|
|
else if (*p<1000000) {
|
|
digits = 6;
|
|
base = 100000;
|
|
}
|
|
else {
|
|
digits = 7;
|
|
base = 1000000;
|
|
}
|
|
#endif
|
|
while (digits-->0) {
|
|
*outp++ = '0' + c/base;
|
|
c %= base;
|
|
base /= 10;
|
|
}
|
|
*outp++ = ';';
|
|
}
|
|
restuple = Py_BuildValue("(On)", res, end);
|
|
Py_DECREF(res);
|
|
Py_DECREF(object);
|
|
return restuple;
|
|
}
|
|
else {
|
|
wrong_exception_type(exc);
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
/* Lower-case hexadecimal digits, indexed by nibble value (0..15).
   The table is only ever read, so it is declared const. */
static const Py_UNICODE hexdigits[] = {
    '0', '1', '2', '3', '4', '5', '6', '7',
    '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'
};
|
|
|
|
/* "backslashreplace" error callback.

   Only encode errors are handled.  Every character in [start, end) of
   the string being encoded is replaced by a backslash escape:
   \xXX below 0x100, \uXXXX below 0x10000 and \UXXXXXXXX from 0x10000 on.
   On builds without Py_UNICODE_WIDE a high surrogate directly followed
   by a low surrogate inside the range is joined into one \UXXXXXXXX
   escape.  The result is the tuple (replacement, end).  Other exception
   types give a TypeError and a NULL return. */
PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
{
#ifndef Py_UNICODE_WIDE
#define IS_SURROGATE_PAIR(p, end) \
    (*(p) >= 0xD800 && *(p) <= 0xDBFF && ((p) + 1) < (end) && \
     *((p) + 1) >= 0xDC00 && *((p) + 1) <= 0xDFFF)
#else
#define IS_SURROGATE_PAIR(p, end) 0
#endif
    if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
        PyObject *restuple;
        PyObject *object;
        Py_ssize_t start;
        Py_ssize_t end;
        PyObject *res;
        Py_UNICODE *p;
        Py_UNICODE *startp;
        Py_UNICODE *outp;
        int ressize;
        if (PyUnicodeEncodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
            return NULL;
        startp = PyUnicode_AS_UNICODE(object);
        /* First pass: compute the size of the replacement
           (backslash + type letter + 2, 4 or 8 hex digits). */
        for (p = startp+start, ressize = 0; p < startp+end; ++p) {
#ifdef Py_UNICODE_WIDE
            if (*p >= 0x00010000)
                ressize += 1+1+8;
            else
#endif
            if (*p >= 0x100) {
                if (IS_SURROGATE_PAIR(p, startp+end)) {
                    ressize += 1+1+8;
                    /* Skip the low surrogate, it is part of this escape. */
                    ++p;
                }
                else
                    ressize += 1+1+4;
            }
            else
                ressize += 1+1+2;
        }
        res = PyUnicode_FromUnicode(NULL, ressize);
        if (res==NULL) {
            /* Release the string fetched from the exception; it used to
               leak on this path. */
            Py_DECREF(object);
            return NULL;
        }
        /* Second pass: write the escapes. */
        for (p = startp+start, outp = PyUnicode_AS_UNICODE(res);
            p < startp+end; ++p) {
            Py_UCS4 c = (Py_UCS4) *p;
            *outp++ = '\\';
            if (IS_SURROGATE_PAIR(p, startp+end)) {
                c = ((*p & 0x3FF) << 10) + (*(p + 1) & 0x3FF) + 0x10000;
                ++p;
            }
            if (c >= 0x00010000) {
                *outp++ = 'U';
                *outp++ = hexdigits[(c>>28)&0xf];
                *outp++ = hexdigits[(c>>24)&0xf];
                *outp++ = hexdigits[(c>>20)&0xf];
                *outp++ = hexdigits[(c>>16)&0xf];
                *outp++ = hexdigits[(c>>12)&0xf];
                *outp++ = hexdigits[(c>>8)&0xf];
            }
            else if (c >= 0x100) {
                *outp++ = 'u';
                *outp++ = hexdigits[(c>>12)&0xf];
                *outp++ = hexdigits[(c>>8)&0xf];
            }
            else
                *outp++ = 'x';
            /* The two lowest hex digits are common to all three forms. */
            *outp++ = hexdigits[(c>>4)&0xf];
            *outp++ = hexdigits[c&0xf];
        }

        restuple = Py_BuildValue("(On)", res, end);
        Py_DECREF(res);
        Py_DECREF(object);
        return restuple;
    }
    else {
        wrong_exception_type(exc);
        return NULL;
    }
#undef IS_SURROGATE_PAIR
}
|
|
|
|
/* This handler is declared static until someone demonstrates
|
|
a need to call it directly. */
|
|
static PyObject *
|
|
PyCodec_SurrogatePassErrors(PyObject *exc)
|
|
{
|
|
PyObject *restuple;
|
|
PyObject *object;
|
|
Py_ssize_t start;
|
|
Py_ssize_t end;
|
|
PyObject *res;
|
|
if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
|
|
Py_UNICODE *p;
|
|
Py_UNICODE *startp;
|
|
char *outp;
|
|
if (PyUnicodeEncodeError_GetStart(exc, &start))
|
|
return NULL;
|
|
if (PyUnicodeEncodeError_GetEnd(exc, &end))
|
|
return NULL;
|
|
if (!(object = PyUnicodeEncodeError_GetObject(exc)))
|
|
return NULL;
|
|
startp = PyUnicode_AS_UNICODE(object);
|
|
res = PyBytes_FromStringAndSize(NULL, 3*(end-start));
|
|
if (!res) {
|
|
Py_DECREF(object);
|
|
return NULL;
|
|
}
|
|
outp = PyBytes_AsString(res);
|
|
for (p = startp+start; p < startp+end; p++) {
|
|
Py_UNICODE ch = *p;
|
|
if (ch < 0xd800 || ch > 0xdfff) {
|
|
/* Not a surrogate, fail with original exception */
|
|
PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
|
|
Py_DECREF(res);
|
|
Py_DECREF(object);
|
|
return NULL;
|
|
}
|
|
*outp++ = (char)(0xe0 | (ch >> 12));
|
|
*outp++ = (char)(0x80 | ((ch >> 6) & 0x3f));
|
|
*outp++ = (char)(0x80 | (ch & 0x3f));
|
|
}
|
|
restuple = Py_BuildValue("(On)", res, end);
|
|
Py_DECREF(res);
|
|
Py_DECREF(object);
|
|
return restuple;
|
|
}
|
|
else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
|
|
unsigned char *p;
|
|
Py_UNICODE ch = 0;
|
|
if (PyUnicodeDecodeError_GetStart(exc, &start))
|
|
return NULL;
|
|
if (!(object = PyUnicodeDecodeError_GetObject(exc)))
|
|
return NULL;
|
|
if (!(p = (unsigned char*)PyBytes_AsString(object))) {
|
|
Py_DECREF(object);
|
|
return NULL;
|
|
}
|
|
/* Try decoding a single surrogate character. If
|
|
there are more, let the codec call us again. */
|
|
p += start;
|
|
if ((p[0] & 0xf0) == 0xe0 ||
|
|
(p[1] & 0xc0) == 0x80 ||
|
|
(p[2] & 0xc0) == 0x80) {
|
|
/* it's a three-byte code */
|
|
ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
|
|
if (ch < 0xd800 || ch > 0xdfff)
|
|
/* it's not a surrogate - fail */
|
|
ch = 0;
|
|
}
|
|
Py_DECREF(object);
|
|
if (ch == 0) {
|
|
PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
|
|
return NULL;
|
|
}
|
|
return Py_BuildValue("(u#n)", &ch, 1, start+3);
|
|
}
|
|
else {
|
|
wrong_exception_type(exc);
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
/* "surrogateescape" error callback.

   Encode errors: every character in [start, end) must lie in
   0xdc80..0xdcff and is turned into the byte ch - 0xdc00; the result is
   (bytes, end).  Any other character re-raises exc.

   Decode errors: up to four bytes starting at start (and before end)
   are turned into the characters 0xdc00 + byte, stopping at the first
   byte below 128; the result is (characters, start + count).  If not
   even one byte could be escaped, exc is re-raised.

   Other exception types give a TypeError and a NULL return. */
static PyObject *
PyCodec_SurrogateEscapeErrors(PyObject *exc)
{
    PyObject *pair;
    PyObject *input;
    PyObject *bytes;
    Py_ssize_t start;
    Py_ssize_t end;

    if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
        Py_UNICODE *chars;
        Py_UNICODE *cur;
        Py_UNICODE *stop;
        char *out;

        if (PyUnicodeEncodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
            return NULL;
        input = PyUnicodeEncodeError_GetObject(exc);
        if (input == NULL)
            return NULL;
        chars = PyUnicode_AS_UNICODE(input);

        /* One output byte per escaped character. */
        bytes = PyBytes_FromStringAndSize(NULL, end - start);
        if (bytes == NULL) {
            Py_DECREF(input);
            return NULL;
        }
        out = PyBytes_AsString(bytes);
        stop = chars + end;
        for (cur = chars + start; cur < stop; cur++) {
            Py_UNICODE ch = *cur;

            if (!(ch >= 0xdc80 && ch <= 0xdcff)) {
                /* Not a UTF-8b surrogate, fail with original exception */
                PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
                Py_DECREF(bytes);
                Py_DECREF(input);
                return NULL;
            }
            *out++ = ch - 0xdc00;
        }
        pair = Py_BuildValue("(On)", bytes, end);
        Py_DECREF(bytes);
        Py_DECREF(input);
        return pair;
    }

    if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
        unsigned char *data;
        Py_UNICODE escaped[4]; /* decode up to 4 bad bytes. */
        int count;

        if (PyUnicodeDecodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeDecodeError_GetEnd(exc, &end))
            return NULL;
        input = PyUnicodeDecodeError_GetObject(exc);
        if (input == NULL)
            return NULL;
        data = (unsigned char *)PyBytes_AsString(input);
        if (data == NULL) {
            Py_DECREF(input);
            return NULL;
        }
        for (count = 0; count < 4 && count < end - start; count++) {
            unsigned char byte = data[start + count];

            /* Refuse to escape ASCII bytes. */
            if (byte < 128)
                break;
            escaped[count] = 0xdc00 + byte;
        }
        Py_DECREF(input);
        if (count == 0) {
            /* codec complained about ASCII byte. */
            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
            return NULL;
        }
        return Py_BuildValue("(u#n)", escaped, count, start + count);
    }

    wrong_exception_type(exc);
    return NULL;
}
|
|
|
|
|
|
/* Method-table entry point for the "strict" handler: forwards exc to
   PyCodec_StrictErrors(); self is not used. */
static PyObject *
strict_errors(PyObject *self, PyObject *exc)
{
    PyObject *outcome = PyCodec_StrictErrors(exc);

    return outcome;
}
|
|
|
|
|
|
/* Method-table entry point for the "ignore" handler: forwards exc to
   PyCodec_IgnoreErrors(); self is not used. */
static PyObject *
ignore_errors(PyObject *self, PyObject *exc)
{
    PyObject *outcome = PyCodec_IgnoreErrors(exc);

    return outcome;
}
|
|
|
|
|
|
/* Method-table entry point for the "replace" handler: forwards exc to
   PyCodec_ReplaceErrors(); self is not used. */
static PyObject *
replace_errors(PyObject *self, PyObject *exc)
{
    PyObject *outcome = PyCodec_ReplaceErrors(exc);

    return outcome;
}
|
|
|
|
|
|
/* Method-table entry point for the "xmlcharrefreplace" handler: forwards
   exc to PyCodec_XMLCharRefReplaceErrors(); self is not used. */
static PyObject *
xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
{
    PyObject *outcome = PyCodec_XMLCharRefReplaceErrors(exc);

    return outcome;
}
|
|
|
|
|
|
/* Method-table entry point for the "backslashreplace" handler: forwards
   exc to PyCodec_BackslashReplaceErrors(); self is not used. */
static PyObject *
backslashreplace_errors(PyObject *self, PyObject *exc)
{
    PyObject *outcome = PyCodec_BackslashReplaceErrors(exc);

    return outcome;
}
|
|
|
|
/* Method-table entry point for the "surrogatepass" handler: forwards exc
   to PyCodec_SurrogatePassErrors(); self is not used. */
static PyObject *
surrogatepass_errors(PyObject *self, PyObject *exc)
{
    PyObject *outcome = PyCodec_SurrogatePassErrors(exc);

    return outcome;
}
|
|
|
|
/* Method-table entry point for the "surrogateescape" handler: forwards
   exc to PyCodec_SurrogateEscapeErrors(); self is not used. */
static PyObject *
surrogateescape_errors(PyObject *self, PyObject *exc)
{
    PyObject *outcome = PyCodec_SurrogateEscapeErrors(exc);

    return outcome;
}
|
|
|
|
static int _PyCodecRegistry_Init(void)
|
|
{
|
|
static struct {
|
|
char *name;
|
|
PyMethodDef def;
|
|
} methods[] =
|
|
{
|
|
{
|
|
"strict",
|
|
{
|
|
"strict_errors",
|
|
strict_errors,
|
|
METH_O,
|
|
PyDoc_STR("Implements the 'strict' error handling, which "
|
|
"raises a UnicodeError on coding errors.")
|
|
}
|
|
},
|
|
{
|
|
"ignore",
|
|
{
|
|
"ignore_errors",
|
|
ignore_errors,
|
|
METH_O,
|
|
PyDoc_STR("Implements the 'ignore' error handling, which "
|
|
"ignores malformed data and continues.")
|
|
}
|
|
},
|
|
{
|
|
"replace",
|
|
{
|
|
"replace_errors",
|
|
replace_errors,
|
|
METH_O,
|
|
PyDoc_STR("Implements the 'replace' error handling, which "
|
|
"replaces malformed data with a replacement marker.")
|
|
}
|
|
},
|
|
{
|
|
"xmlcharrefreplace",
|
|
{
|
|
"xmlcharrefreplace_errors",
|
|
xmlcharrefreplace_errors,
|
|
METH_O,
|
|
PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
|
|
"which replaces an unencodable character with the "
|
|
"appropriate XML character reference.")
|
|
}
|
|
},
|
|
{
|
|
"backslashreplace",
|
|
{
|
|
"backslashreplace_errors",
|
|
backslashreplace_errors,
|
|
METH_O,
|
|
PyDoc_STR("Implements the 'backslashreplace' error handling, "
|
|
"which replaces an unencodable character with a "
|
|
"backslashed escape sequence.")
|
|
}
|
|
},
|
|
{
|
|
"surrogatepass",
|
|
{
|
|
"surrogatepass",
|
|
surrogatepass_errors,
|
|
METH_O
|
|
}
|
|
},
|
|
{
|
|
"surrogateescape",
|
|
{
|
|
"surrogateescape",
|
|
surrogateescape_errors,
|
|
METH_O
|
|
}
|
|
}
|
|
};
|
|
|
|
PyInterpreterState *interp = PyThreadState_GET()->interp;
|
|
PyObject *mod;
|
|
unsigned i;
|
|
|
|
if (interp->codec_search_path != NULL)
|
|
return 0;
|
|
|
|
interp->codec_search_path = PyList_New(0);
|
|
interp->codec_search_cache = PyDict_New();
|
|
interp->codec_error_registry = PyDict_New();
|
|
|
|
if (interp->codec_error_registry) {
|
|
for (i = 0; i < sizeof(methods)/sizeof(methods[0]); ++i) {
|
|
PyObject *func = PyCFunction_New(&methods[i].def, NULL);
|
|
int res;
|
|
if (!func)
|
|
Py_FatalError("can't initialize codec error registry");
|
|
res = PyCodec_RegisterError(methods[i].name, func);
|
|
Py_DECREF(func);
|
|
if (res)
|
|
Py_FatalError("can't initialize codec error registry");
|
|
}
|
|
}
|
|
|
|
if (interp->codec_search_path == NULL ||
|
|
interp->codec_search_cache == NULL ||
|
|
interp->codec_error_registry == NULL)
|
|
Py_FatalError("can't initialize codec registry");
|
|
|
|
mod = PyImport_ImportModuleNoBlock("encodings");
|
|
if (mod == NULL) {
|
|
if (PyErr_ExceptionMatches(PyExc_ImportError)) {
|
|
/* Ignore ImportErrors... this is done so that
|
|
distributions can disable the encodings package. Note
|
|
that other errors are not masked, e.g. SystemErrors
|
|
raised to inform the user of an error in the Python
|
|
configuration are still reported back to the user. */
|
|
PyErr_Clear();
|
|
return 0;
|
|
}
|
|
return -1;
|
|
}
|
|
Py_DECREF(mod);
|
|
interp->codecs_initialized = 1;
|
|
return 0;
|
|
}
|