mirror of
https://github.com/python/cpython.git
synced 2024-12-18 14:24:33 +08:00
91106cd9ff
* Add -X utf8 command line option, PYTHONUTF8 environment variable and a new sys.flags.utf8_mode flag. * If the LC_CTYPE locale is "C" at startup: enable automatically the UTF-8 mode. * Add _winapi.GetACP(). encodings._alias_mbcs() now calls _winapi.GetACP() to get the ANSI code page * locale.getpreferredencoding() now returns 'UTF-8' in the UTF-8 mode. As a side effect, open() now uses the UTF-8 encoding by default in this mode. * Py_DecodeLocale() and Py_EncodeLocale() now use the UTF-8 encoding in the UTF-8 Mode. * Update subprocess._args_from_interpreter_flags() to handle -X utf8 * Skip some tests relying on the current locale if the UTF-8 mode is enabled. * Add test_utf8mode.py. * _Py_DecodeUTF8_surrogateescape() gets a new optional parameter to return also the length (number of wide characters). * pymain_get_global_config() and pymain_set_global_config() now always copy flag values, rather than only copying if the new value is greater than the old value.
171 lines
5.5 KiB
Python
171 lines
5.5 KiB
Python
""" Standard "encodings" Package
|
|
|
|
Standard Python encoding modules are stored in this package
|
|
directory.
|
|
|
|
Codec modules must have names corresponding to normalized encoding
|
|
names as defined in the normalize_encoding() function below, e.g.
|
|
'utf-8' must be implemented by the module 'utf_8.py'.
|
|
|
|
Each codec module must export the following interface:
|
|
|
|
* getregentry() -> codecs.CodecInfo object
|
|
The getregentry() API must return a CodecInfo object with encoder, decoder,
|
|
incrementalencoder, incrementaldecoder, streamwriter and streamreader
|
|
attributes which adhere to the Python Codec Interface Standard.
|
|
|
|
In addition, a module may optionally also define the following
|
|
APIs which are then used by the package's codec search function:
|
|
|
|
* getaliases() -> sequence of encoding name strings to use as aliases
|
|
|
|
Alias names returned by getaliases() must be normalized encoding
|
|
names as defined by normalize_encoding().
|
|
|
|
Written by Marc-Andre Lemburg (mal@lemburg.com).
|
|
|
|
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
|
|
|
|
"""#"
|
|
|
|
import codecs
|
|
import sys
|
|
from . import aliases
|
|
|
|
# Maps each encoding name passed to search_function() to its CodecInfo
# entry, or to None when no codec module was found (negative cache).
_cache = {}
# Sentinel default for _cache.get(), so that a cached None (a known miss)
# can be told apart from a name that has never been looked up.
_unknown = '--unknown--'
# Non-empty fromlist for __import__(), so that the codec submodule itself
# is returned rather than the top-level 'encodings' package.
_import_tail = ['*']
# Alias table mapping normalized encoding names to codec module names;
# search_function() also adds the aliases reported by codec modules to it.
_aliases = aliases.aliases
|
|
|
|
class CodecRegistryError(LookupError, SystemError):
    """Raised when a codec module returns a malformed registry entry."""
|
|
|
|
def normalize_encoding(encoding):
    """Return the normalized form of an encoding name.

    Every run of characters that is neither alphanumeric nor a dot
    (dots are kept because they are used in Python package names) is
    collapsed into a single underscore, so that e.g. ' -;#' becomes
    '_'.  Runs at the very start or end of the name are dropped, so
    the result never has a leading or trailing underscore.

    A bytes argument is first decoded as ASCII.

    Note that encoding names should be ASCII only; if they do use
    non-ASCII characters, these must be Latin-1 compatible.

    """
    if isinstance(encoding, bytes):
        encoding = str(encoding, "ascii")

    pieces = []
    separator_pending = False
    for ch in encoding:
        if not (ch.isalnum() or ch == '.'):
            # Remember that a separator run was seen; it is only
            # emitted once another kept character follows it.
            separator_pending = True
            continue
        # Skip the underscore while nothing has been kept yet, so the
        # result never starts with one.
        if separator_pending and pieces:
            pieces.append('_')
        separator_pending = False
        pieces.append(ch)
    return ''.join(pieces)
|
|
|
|
def search_function(encoding):
    """Codec search function for the 'encodings' package.

    Looks up the codec module for *encoding* inside the 'encodings'
    package and returns its registry entry.

    The result (including a failed lookup) is cached in _cache under the
    name exactly as passed in, so repeated lookups are cheap.

    Parameters:
        encoding -- the encoding name to look up; it is normalized with
                    normalize_encoding() before the module search.

    Returns:
        A codecs.CodecInfo object, or None if no codec module could be
        found for the name.

    Raises:
        CodecRegistryError -- if the module's getregentry() returns
        something that is neither a CodecInfo nor a sequence of 4 to 7
        items whose codec slots are callables (or None where permitted).

    """
    # Cache lookup
    entry = _cache.get(encoding, _unknown)
    if entry is not _unknown:
        return entry

    # Import the module:
    #
    # First try to find an alias for the normalized encoding
    # name and lookup the module using the aliased name, then try to
    # lookup the module using the standard import scheme, i.e. first
    # try in the encodings package, then at top-level.
    #
    norm_encoding = normalize_encoding(encoding)
    aliased_encoding = _aliases.get(norm_encoding) or \
                       _aliases.get(norm_encoding.replace('.', '_'))
    if aliased_encoding is not None:
        modnames = [aliased_encoding,
                    norm_encoding]
    else:
        modnames = [norm_encoding]
    for modname in modnames:
        # Empty names and dotted names are never imported.
        if not modname or '.' in modname:
            continue
        try:
            # Import is absolute to prevent the possibly malicious import of a
            # module with side-effects that is not in the 'encodings' package.
            mod = __import__('encodings.' + modname, fromlist=_import_tail,
                             level=0)
        except ImportError:
            # ImportError may occur because 'encodings.(modname)' does not exist,
            # or because it imports a name that does not exist (see mbcs and oem)
            pass
        else:
            break
    else:
        mod = None

    # Also covers mod being None: None has no getregentry attribute.
    try:
        getregentry = mod.getregentry
    except AttributeError:
        # Not a codec module
        mod = None

    if mod is None:
        # Cache misses
        _cache[encoding] = None
        return None

    # Now ask the module for the registry entry
    entry = getregentry()
    if not isinstance(entry, codecs.CodecInfo):
        # Legacy sequence form: validate it, then convert to CodecInfo.
        if not 4 <= len(entry) <= 7:
            raise CodecRegistryError('module "%s" (%s) failed to register'
                                     % (mod.__name__, mod.__file__))
        if not callable(entry[0]) or not callable(entry[1]) or \
           (entry[2] is not None and not callable(entry[2])) or \
           (entry[3] is not None and not callable(entry[3])) or \
           (len(entry) > 4 and entry[4] is not None and not callable(entry[4])) or \
           (len(entry) > 5 and entry[5] is not None and not callable(entry[5])):
            raise CodecRegistryError('incompatible codecs in module "%s" (%s)'
                                     % (mod.__name__, mod.__file__))
        if len(entry)<7 or entry[6] is None:
            # Pad the optional slots with None and put the module's short
            # name in the name slot.  Only the first six items are kept so
            # that a 7-item entry whose name is None has that slot
            # replaced; appending to it would produce 8 items and make the
            # CodecInfo() call below fail.
            entry = (tuple(entry[:6]) + (None,)*(6-len(entry)) +
                     (mod.__name__.split(".", 1)[1],))
        entry = codecs.CodecInfo(*entry)

    # Cache the codec registry entry
    _cache[encoding] = entry

    # Register its aliases (without overwriting previously registered
    # aliases)
    try:
        codecaliases = mod.getaliases()
    except AttributeError:
        pass
    else:
        for alias in codecaliases:
            if alias not in _aliases:
                # modname is the name under which the import succeeded.
                _aliases[alias] = modname

    # Return the registry entry
    return entry
|
|
|
|
# Register the search_function in the Python codec registry, so that
# codec lookups consult the modules of this package.
codecs.register(search_function)
|
|
|
|
if sys.platform == 'win32':
    def _alias_mbcs(encoding):
        """Codec search function mapping the ANSI code page to mbcs.

        Returns the registry entry of the 'mbcs' codec when *encoding*
        equals "cp<N>", where <N> is the value returned by
        _winapi.GetACP(); returns None otherwise.
        """
        try:
            import _winapi
            if encoding != "cp%s" % _winapi.GetACP():
                return None
            import encodings.mbcs
            return encodings.mbcs.getregentry()
        except ImportError:
            # Imports may fail while we are shutting down
            return None

    codecs.register(_alias_mbcs)
|