cpython/Lib/encodings/utf_16.py

""" Python 'utf-16' Codec


Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""
import codecs, sys

### Codec APIs

# Stateless encode entry point: the built-in codecs.utf_16_encode is exposed
# directly under the name the codec registry entry (getregentry) refers to.
encode = codecs.utf_16_encode

def decode(input, errors='strict'):
    """Stateless UTF-16 decode of a complete byte string.

    Delegates to codecs.utf_16_decode with the trailing flag set to True,
    i.e. the input is treated as final rather than as a partial chunk.
    Returns whatever codecs.utf_16_decode returns: (text, bytes consumed).
    """
    is_final = True
    decoded = codecs.utf_16_decode(input, errors, is_final)
    return decoded

class IncrementalEncoder(codecs.IncrementalEncoder):
    """Incremental UTF-16 encoder.

    The first chunk goes through codecs.utf_16_encode; every later chunk is
    encoded with the byte-order specific encoder matching sys.byteorder.
    ``self.encoder`` is None until that first chunk has been encoded.
    """

    def __init__(self, errors='strict'):
        codecs.IncrementalEncoder.__init__(self, errors)
        # None means "nothing encoded yet": the next chunk uses utf_16_encode.
        self.encoder = None

    def encode(self, input, final=False):
        """Encode *input* and return the resulting bytes."""
        if self.encoder is not None:
            return self.encoder(input, self.errors)[0]
        # First chunk: encode before switching, so a failing encode leaves
        # the encoder in its initial state.
        data, _ = codecs.utf_16_encode(input, self.errors)
        self.encoder = (codecs.utf_16_le_encode
                        if sys.byteorder == 'little'
                        else codecs.utf_16_be_encode)
        return data

    def reset(self):
        """Return to the initial state (no chunk encoded yet)."""
        codecs.IncrementalEncoder.reset(self)
        self.encoder = None

    def getstate(self):
        """Return the encoder state as an integer.

        0: stream is in natural order for this platform
        2: endianness hasn't been determined yet
        (unnatural order is never written, so 1 is never returned)
        """
        if self.encoder is None:
            return 2
        return 0

    def setstate(self, state):
        """Restore a state from getstate(); any truthy value means 'initial'."""
        if state:
            self.encoder = None
            return
        self.encoder = (codecs.utf_16_le_encode
                        if sys.byteorder == 'little'
                        else codecs.utf_16_be_encode)

class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental UTF-16 decoder that picks its byte order from the BOM.

    ``self.decoder`` stays None until codecs.utf_16_ex_decode reports a byte
    order; afterwards it holds the matching LE or BE decode function.
    """

    def __init__(self, errors='strict'):
        codecs.BufferedIncrementalDecoder.__init__(self, errors)
        # None means the byte order has not been determined yet.
        self.decoder = None

    def _buffer_decode(self, input, errors, final):
        """Decode *input*; return (text, number of bytes consumed).

        Raises UnicodeError when at least two bytes were consumed without a
        byte order having been detected.
        """
        if self.decoder is not None:
            return self.decoder(input, self.errors, final)

        text, used, order = codecs.utf_16_ex_decode(input, errors, 0, final)
        if order == -1:
            self.decoder = codecs.utf_16_le_decode
        elif order == 1:
            self.decoder = codecs.utf_16_be_decode
        elif used >= 2:
            raise UnicodeError("UTF-16 stream does not start with BOM")
        return (text, used)

    def reset(self):
        """Forget buffered input and the detected byte order."""
        codecs.BufferedIncrementalDecoder.reset(self)
        self.decoder = None

    def getstate(self):
        """Return (buffered input, byte-order flag).

        Flag values passed to the caller:
        0: stream is in natural order for this platform
        1: stream is in unnatural order
        2: endianness hasn't been determined yet
        """
        # additional state info from the base class must be None here,
        # as it isn't passed along to the caller
        pending = codecs.BufferedIncrementalDecoder.getstate(self)[0]
        if self.decoder is None:
            return (pending, 2)
        host_is_big = sys.byteorder == "big"
        stream_is_big = self.decoder is codecs.utf_16_be_decode
        return (pending, int(host_is_big != stream_is_big))

    def setstate(self, state):
        """Restore a (buffered input, byte-order flag) pair from getstate()."""
        # state[1] will be ignored by BufferedIncrementalDecoder.setstate()
        codecs.BufferedIncrementalDecoder.setstate(self, state)
        flag = state[1]
        host_is_big = sys.byteorder == "big"
        if flag == 0:
            # natural order: same endianness as the host
            if host_is_big:
                self.decoder = codecs.utf_16_be_decode
            else:
                self.decoder = codecs.utf_16_le_decode
        elif flag == 1:
            # unnatural order: opposite endianness to the host
            if host_is_big:
                self.decoder = codecs.utf_16_le_decode
            else:
                self.decoder = codecs.utf_16_be_decode
        else:
            self.decoder = None

class StreamWriter(codecs.StreamWriter):
    """UTF-16 stream writer.

    The first encode() after construction or reset() uses
    codecs.utf_16_encode; later calls use the encoder for sys.byteorder.
    """

    def __init__(self, stream, errors='strict'):
        codecs.StreamWriter.__init__(self, stream, errors)
        # None until the first chunk has been encoded.
        self.encoder = None

    def reset(self):
        """Reset the writer so the next encode() is treated as the first."""
        codecs.StreamWriter.reset(self)
        self.encoder = None

    def encode(self, input, errors='strict'):
        """Encode *input*; return the tuple produced by the underlying encoder."""
        if self.encoder is not None:
            return self.encoder(input, errors)
        # First chunk: only switch encoders once this call has succeeded.
        encoded = codecs.utf_16_encode(input, errors)
        self.encoder = (codecs.utf_16_le_encode
                        if sys.byteorder == 'little'
                        else codecs.utf_16_be_encode)
        return encoded

class StreamReader(codecs.StreamReader):
    """UTF-16 stream reader that detects the byte order on first decode.

    Once a byte order is known, decode() rebinds ``self.decode`` on the
    instance to the matching LE/BE decode function; reset() removes that
    instance attribute so the detecting method is used again.
    """

    def reset(self):
        """Reset the reader and discard any detected byte order."""
        codecs.StreamReader.reset(self)
        try:
            # Remove the instance-level override installed by decode().
            del self.decode
        except AttributeError:
            # Nothing installed yet: byte order was never detected.
            pass

    def decode(self, input, errors='strict'):
        """Decode *input*, detecting byte order; return (text, bytes consumed).

        Raises UnicodeError when at least two bytes were consumed without a
        byte order having been detected.
        """
        text, used, order = codecs.utf_16_ex_decode(input, errors, 0, False)
        if order == -1:
            self.decode = codecs.utf_16_le_decode
        elif order == 1:
            self.decode = codecs.utf_16_be_decode
        elif used >= 2:
            raise UnicodeError("UTF-16 stream does not start with BOM")
        return (text, used)

### encodings module API

def getregentry():
    """Build the codecs.CodecInfo record describing this 'utf-16' codec.

    The record bundles the module-level encode/decode functions with the
    incremental and stream classes defined in this module.
    """
    entry = codecs.CodecInfo(
        encode=encode,
        decode=decode,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        name='utf-16',
    )
    return entry
Marc-Andre Lemburg: Unicode encodings. 2000-03-11 07:17:24 +08:00			`""" Python 'utf-16' Codec`


			`Written by Marc-Andre Lemburg (mal@lemburg.com).`

			`(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.`

			`"""`
This patch by Martin v. Loewis changes the UTF-16 codec to only write a BOM at the start of the stream and also to only read it as BOM at the start of a stream. Subsequent reading/writing of BOMs will read/write the BOM as ZWNBSP character. This is in sync with the Unicode specifications. Note that UTF-16 files will now have to start with a BOM mark in order to be readable by the codec. 2001-06-20 04:07:51 +08:00			`import codecs, sys`
Marc-Andre Lemburg: Unicode encodings. 2000-03-11 07:17:24 +08:00
			`### Codec APIs`

SF patch #998993: The UTF-8 and the UTF-16 stateful decoders now support decoding incomplete input (when the input stream is temporarily exhausted). codecs.StreamReader now implements buffering, which enables proper readline support for the UTF-16 decoders. codecs.StreamReader.read() has a new argument chars which specifies the number of characters to return. codecs.StreamReader.readline() and codecs.StreamReader.readlines() have a new argument keepends. Trailing "\n"s will be stripped from the lines if keepends is false. Added C APIs PyUnicode_DecodeUTF8Stateful and PyUnicode_DecodeUTF16Stateful. 2004-09-08 04:24:22 +08:00			`encode = codecs.utf_16_encode`
Marc-Andre Lemburg: Unicode encodings. 2000-03-11 07:17:24 +08:00
SF patch #998993: The UTF-8 and the UTF-16 stateful decoders now support decoding incomplete input (when the input stream is temporarily exhausted). codecs.StreamReader now implements buffering, which enables proper readline support for the UTF-16 decoders. codecs.StreamReader.read() has a new argument chars which specifies the number of characters to return. codecs.StreamReader.readline() and codecs.StreamReader.readlines() have a new argument keepends. Trailing "\n"s will be stripped from the lines if keepends is false. Added C APIs PyUnicode_DecodeUTF8Stateful and PyUnicode_DecodeUTF16Stateful. 2004-09-08 04:24:22 +08:00			`def decode(input, errors='strict'):`
			`return codecs.utf_16_decode(input, errors, True)`
Marc-Andre Lemburg: Unicode encodings. 2000-03-11 07:17:24 +08:00
Merge part of the trunk changes into the p3yk branch. This merges from 43030 (branch-creation time) up to 43067. 43068 and 43069 contain a little swapping action between re.py and sre.py, and this mightily confuses svn merge, so later changes are going in separately. This merge should break no additional tests. The last-merged revision is going in a 'last_merge' property on '.' (the branch directory.) Arbitrarily chosen, really; if there's a BCP for this, I couldn't find it, but we can easily change it afterwards ;) 2006-04-21 17:43:23 +08:00			`class IncrementalEncoder(codecs.IncrementalEncoder):`
			`def __init__(self, errors='strict'):`
			`codecs.IncrementalEncoder.__init__(self, errors)`
			`self.encoder = None`

			`def encode(self, input, final=False):`
			`if self.encoder is None:`
			`result = codecs.utf_16_encode(input, self.errors)[0]`
			`if sys.byteorder == 'little':`
			`self.encoder = codecs.utf_16_le_encode`
			`else:`
			`self.encoder = codecs.utf_16_be_encode`
			`return result`
			`return self.encoder(input, self.errors)[0]`

			`def reset(self):`
			`codecs.IncrementalEncoder.reset(self)`
			`self.encoder = None`

Apply SF patch #1698994: Add getstate() and setstate() methods to incrementalcodecs. Also forward port r54786 (fix the incremental utf_8_sig decoder). 2007-04-17 06:10:50 +08:00			`def getstate(self):`
			`# state info we return to the caller:`
			`# 0: stream is in natural order for this platform`
			`# 2: endianness hasn't been determined yet`
			`# (we're never writing in unnatural order)`
			`return (2 if self.encoder is None else 0)`

			`def setstate(self, state):`
			`if state:`
			`self.encoder = None`
			`else:`
			`if sys.byteorder == 'little':`
			`self.encoder = codecs.utf_16_le_encode`
			`else:`
			`self.encoder = codecs.utf_16_be_encode`

Merge part of the trunk changes into the p3yk branch. This merges from 43030 (branch-creation time) up to 43067. 43068 and 43069 contain a little swapping action between re.py and sre.py, and this mightily confuses svn merge, so later changes are going in separately. This merge should break no additional tests. The last-merged revision is going in a 'last_merge' property on '.' (the branch directory.) Arbitrarily chosen, really; if there's a BCP for this, I couldn't find it, but we can easily change it afterwards ;) 2006-04-21 17:43:23 +08:00			`class IncrementalDecoder(codecs.BufferedIncrementalDecoder):`
			`def __init__(self, errors='strict'):`
			`codecs.BufferedIncrementalDecoder.__init__(self, errors)`
			`self.decoder = None`

			`def _buffer_decode(self, input, errors, final):`
			`if self.decoder is None:`
			`(output, consumed, byteorder) = \`
			`codecs.utf_16_ex_decode(input, errors, 0, final)`
			`if byteorder == -1:`
			`self.decoder = codecs.utf_16_le_decode`
			`elif byteorder == 1:`
			`self.decoder = codecs.utf_16_be_decode`
			`elif consumed >= 2:`
			`raise UnicodeError("UTF-16 stream does not start with BOM")`
			`return (output, consumed)`
			`return self.decoder(input, self.errors, final)`

			`def reset(self):`
			`codecs.BufferedIncrementalDecoder.reset(self)`
			`self.decoder = None`

Apply SF patch #1698994: Add getstate() and setstate() methods to incrementalcodecs. Also forward port r54786 (fix the incremental utf_8_sig decoder). 2007-04-17 06:10:50 +08:00			`def getstate(self):`
			`# additional state info from the base class must be None here,`
			`# as it isn't passed along to the caller`
			`state = codecs.BufferedIncrementalDecoder.getstate(self)[0]`
			`# additional state info we pass to the caller:`
			`# 0: stream is in natural order for this platform`
			`# 1: stream is in unnatural order`
			`# 2: endianness hasn't been determined yet`
			`if self.decoder is None:`
			`return (state, 2)`
			`addstate = int((sys.byteorder == "big") !=`
			`(self.decoder is codecs.utf_16_be_decode))`
			`return (state, addstate)`

			`def setstate(self, state):`
			`# state[1] will be ignored by BufferedIncrementalDecoder.setstate()`
			`codecs.BufferedIncrementalDecoder.setstate(self, state)`
			`state = state[1]`
			`if state == 0:`
			`self.decoder = (codecs.utf_16_be_decode`
			`if sys.byteorder == "big"`
			`else codecs.utf_16_le_decode)`
			`elif state == 1:`
			`self.decoder = (codecs.utf_16_le_decode`
			`if sys.byteorder == "big"`
			`else codecs.utf_16_be_decode)`
			`else:`
			`self.decoder = None`

SF patch #998993: The UTF-8 and the UTF-16 stateful decoders now support decoding incomplete input (when the input stream is temporarily exhausted). codecs.StreamReader now implements buffering, which enables proper readline support for the UTF-16 decoders. codecs.StreamReader.read() has a new argument chars which specifies the number of characters to return. codecs.StreamReader.readline() and codecs.StreamReader.readlines() have a new argument keepends. Trailing "\n"s will be stripped from the lines if keepends is false. Added C APIs PyUnicode_DecodeUTF8Stateful and PyUnicode_DecodeUTF16Stateful. 2004-09-08 04:24:22 +08:00			`class StreamWriter(codecs.StreamWriter):`
This patch by Martin v. Loewis changes the UTF-16 codec to only write a BOM at the start of the stream and also to only read it as BOM at the start of a stream. Subsequent reading/writing of BOMs will read/write the BOM as ZWNBSP character. This is in sync with the Unicode specifications. Note that UTF-16 files will now have to start with a BOM mark in order to be readable by the codec. 2001-06-20 04:07:51 +08:00			`def __init__(self, stream, errors='strict'):`
			`codecs.StreamWriter.__init__(self, stream, errors)`
Merged revisions 81471-81472 via svnmerge from svn+ssh://pythondev@svn.python.org/python/trunk ........ r81471 \| victor.stinner \| 2010-05-22 15:37:56 +0200 (sam., 22 mai 2010) \| 7 lines Issue #6268: More bugfixes about BOM, UTF-16 and UTF-32 * Fix seek() method of codecs.open(), don't write the BOM twice after seek(0) * Fix reset() method of codecs, UTF-16, UTF-32 and StreamWriter classes * test_codecs: use "w+" mode instead of "wt+". "t" mode is not supported by Solaris or Windows, but does it really exist? I found it the in the issue. ........ r81472 \| victor.stinner \| 2010-05-22 15:44:25 +0200 (sam., 22 mai 2010) \| 4 lines Fix my last commit (r81471) about codecs Rememder: don't touch the code just before a commit ........ 2010-05-23 00:59:09 +08:00			`self.encoder = None`

			`def reset(self):`
			`codecs.StreamWriter.reset(self)`
			`self.encoder = None`
This patch by Martin v. Loewis changes the UTF-16 codec to only write a BOM at the start of the stream and also to only read it as BOM at the start of a stream. Subsequent reading/writing of BOMs will read/write the BOM as ZWNBSP character. This is in sync with the Unicode specifications. Note that UTF-16 files will now have to start with a BOM mark in order to be readable by the codec. 2001-06-20 04:07:51 +08:00
SF patch #998993: The UTF-8 and the UTF-16 stateful decoders now support decoding incomplete input (when the input stream is temporarily exhausted). codecs.StreamReader now implements buffering, which enables proper readline support for the UTF-16 decoders. codecs.StreamReader.read() has a new argument chars which specifies the number of characters to return. codecs.StreamReader.readline() and codecs.StreamReader.readlines() have a new argument keepends. Trailing "\n"s will be stripped from the lines if keepends is false. Added C APIs PyUnicode_DecodeUTF8Stateful and PyUnicode_DecodeUTF16Stateful. 2004-09-08 04:24:22 +08:00			`def encode(self, input, errors='strict'):`
Merged revisions 81471-81472 via svnmerge from svn+ssh://pythondev@svn.python.org/python/trunk ........ r81471 \| victor.stinner \| 2010-05-22 15:37:56 +0200 (sam., 22 mai 2010) \| 7 lines Issue #6268: More bugfixes about BOM, UTF-16 and UTF-32 * Fix seek() method of codecs.open(), don't write the BOM twice after seek(0) * Fix reset() method of codecs, UTF-16, UTF-32 and StreamWriter classes * test_codecs: use "w+" mode instead of "wt+". "t" mode is not supported by Solaris or Windows, but does it really exist? I found it the in the issue. ........ r81472 \| victor.stinner \| 2010-05-22 15:44:25 +0200 (sam., 22 mai 2010) \| 4 lines Fix my last commit (r81471) about codecs Rememder: don't touch the code just before a commit ........ 2010-05-23 00:59:09 +08:00			`if self.encoder is None:`
			`result = codecs.utf_16_encode(input, errors)`
			`if sys.byteorder == 'little':`
			`self.encoder = codecs.utf_16_le_encode`
			`else:`
			`self.encoder = codecs.utf_16_be_encode`
			`return result`
SF patch #998993: The UTF-8 and the UTF-16 stateful decoders now support decoding incomplete input (when the input stream is temporarily exhausted). codecs.StreamReader now implements buffering, which enables proper readline support for the UTF-16 decoders. codecs.StreamReader.read() has a new argument chars which specifies the number of characters to return. codecs.StreamReader.readline() and codecs.StreamReader.readlines() have a new argument keepends. Trailing "\n"s will be stripped from the lines if keepends is false. Added C APIs PyUnicode_DecodeUTF8Stateful and PyUnicode_DecodeUTF16Stateful. 2004-09-08 04:24:22 +08:00			`else:`
Merged revisions 81471-81472 via svnmerge from svn+ssh://pythondev@svn.python.org/python/trunk ........ r81471 \| victor.stinner \| 2010-05-22 15:37:56 +0200 (sam., 22 mai 2010) \| 7 lines Issue #6268: More bugfixes about BOM, UTF-16 and UTF-32 * Fix seek() method of codecs.open(), don't write the BOM twice after seek(0) * Fix reset() method of codecs, UTF-16, UTF-32 and StreamWriter classes * test_codecs: use "w+" mode instead of "wt+". "t" mode is not supported by Solaris or Windows, but does it really exist? I found it the in the issue. ........ r81472 \| victor.stinner \| 2010-05-22 15:44:25 +0200 (sam., 22 mai 2010) \| 4 lines Fix my last commit (r81471) about codecs Rememder: don't touch the code just before a commit ........ 2010-05-23 00:59:09 +08:00			`return self.encoder(input, errors)`
Whitespace normalization. 2002-08-09 04:19:19 +08:00
SF patch #998993: The UTF-8 and the UTF-16 stateful decoders now support decoding incomplete input (when the input stream is temporarily exhausted). codecs.StreamReader now implements buffering, which enables proper readline support for the UTF-16 decoders. codecs.StreamReader.read() has a new argument chars which specifies the number of characters to return. codecs.StreamReader.readline() and codecs.StreamReader.readlines() have a new argument keepends. Trailing "\n"s will be stripped from the lines if keepends is false. Added C APIs PyUnicode_DecodeUTF8Stateful and PyUnicode_DecodeUTF16Stateful. 2004-09-08 04:24:22 +08:00			`class StreamReader(codecs.StreamReader):`

Reset internal buffers when seek() is called. This fixes SF bug #1156259. 2005-03-15 03:06:30 +08:00			`def reset(self):`
			`codecs.StreamReader.reset(self)`
			`try:`
			`del self.decode`
			`except AttributeError:`
			`pass`

SF patch #998993: The UTF-8 and the UTF-16 stateful decoders now support decoding incomplete input (when the input stream is temporarily exhausted). codecs.StreamReader now implements buffering, which enables proper readline support for the UTF-16 decoders. codecs.StreamReader.read() has a new argument chars which specifies the number of characters to return. codecs.StreamReader.readline() and codecs.StreamReader.readlines() have a new argument keepends. Trailing "\n"s will be stripped from the lines if keepends is false. Added C APIs PyUnicode_DecodeUTF8Stateful and PyUnicode_DecodeUTF16Stateful. 2004-09-08 04:24:22 +08:00			`def decode(self, input, errors='strict'):`
			`(object, consumed, byteorder) = \`
			`codecs.utf_16_ex_decode(input, errors, 0, False)`
			`if byteorder == -1:`
			`self.decode = codecs.utf_16_le_decode`
			`elif byteorder == 1:`
			`self.decode = codecs.utf_16_be_decode`
			`elif consumed>=2:`
Raise statement normalization in Lib/. 2007-08-30 09:19:48 +08:00			`raise UnicodeError("UTF-16 stream does not start with BOM")`
SF patch #998993: The UTF-8 and the UTF-16 stateful decoders now support decoding incomplete input (when the input stream is temporarily exhausted). codecs.StreamReader now implements buffering, which enables proper readline support for the UTF-16 decoders. codecs.StreamReader.read() has a new argument chars which specifies the number of characters to return. codecs.StreamReader.readline() and codecs.StreamReader.readlines() have a new argument keepends. Trailing "\n"s will be stripped from the lines if keepends is false. Added C APIs PyUnicode_DecodeUTF8Stateful and PyUnicode_DecodeUTF16Stateful. 2004-09-08 04:24:22 +08:00			`return (object, consumed)`
Fix for bug #222395: UTF-16 et al. don't handle .readline(). They now raise a NotImplementedError to hint at the truth ;-) 2002-04-05 20:12:00 +08:00
Marc-Andre Lemburg: Unicode encodings. 2000-03-11 07:17:24 +08:00			`### encodings module API`

			`def getregentry():`
Merge part of the trunk changes into the p3yk branch. This merges from 43030 (branch-creation time) up to 43067. 43068 and 43069 contain a little swapping action between re.py and sre.py, and this mightily confuses svn merge, so later changes are going in separately. This merge should break no additional tests. The last-merged revision is going in a 'last_merge' property on '.' (the branch directory.) Arbitrarily chosen, really; if there's a BCP for this, I couldn't find it, but we can easily change it afterwards ;) 2006-04-21 17:43:23 +08:00			`return codecs.CodecInfo(`
			`name='utf-16',`
			`encode=encode,`
			`decode=decode,`
			`incrementalencoder=IncrementalEncoder,`
			`incrementaldecoder=IncrementalDecoder,`
			`streamreader=StreamReader,`
			`streamwriter=StreamWriter,`
			`)`