diff --git a/Doc/library/os.rst b/Doc/library/os.rst index 9ab5fb19a8a..8df98cf1ff0 100644 --- a/Doc/library/os.rst +++ b/Doc/library/os.rst @@ -155,13 +155,26 @@ process and user. These functions are described in :ref:`os-file-dir`. -.. function:: fsencode(value) +.. function:: fsencode(filename) - Encode *value* to bytes for use in the file system, environment variables or - the command line. Use :func:`sys.getfilesystemencoding` and - ``'surrogateescape'`` error handler for strings and return bytes unchanged. - On Windows, use ``'strict'`` error handler for strings if the file system - encoding is ``'mbcs'`` (which is the default encoding). + Encode *filename* to the filesystem encoding with ``'surrogateescape'`` + error handler, return :class:`bytes` unchanged. On Windows, use ``'strict'`` + error handler if the filesystem encoding is ``'mbcs'`` (which is the default + encoding). + + :func:`fsdecode` is the reverse function. + + .. versionadded:: 3.2 + + +.. function:: fsdecode(filename) + + Decode *filename* from the filesystem encoding with ``'surrogateescape'`` + error handler, return :class:`str` unchanged. On Windows, use ``'strict'`` + error handler if the filesystem encoding is ``'mbcs'`` (which is the default + encoding). + + :func:`fsencode` is the reverse function. .. versionadded:: 3.2 diff --git a/Doc/whatsnew/3.2.rst b/Doc/whatsnew/3.2.rst index 3927f4b62c4..2ec93d2bb2e 100644 --- a/Doc/whatsnew/3.2.rst +++ b/Doc/whatsnew/3.2.rst @@ -237,13 +237,16 @@ Major performance enhancements have been added: * Stub -Unicode -======= +Filenames and Unicode +===================== The filesystem encoding can be specified by setting the :envvar:`PYTHONFSENCODING` environment variable before running the interpreter. The value should be a string in the form ``<encoding>``, e.g. ``utf-8``. +The :mod:`os` module has two new functions: :func:`os.fsencode` and +:func:`os.fsdecode`. 
+ IDLE ==== diff --git a/Lib/os.py b/Lib/os.py index c7abc2a18e5..60dc12fcf2c 100644 --- a/Lib/os.py +++ b/Lib/os.py @@ -402,8 +402,7 @@ def get_exec_path(env=None): path_list = path_listb if path_list is not None and isinstance(path_list, bytes): - path_list = path_list.decode(sys.getfilesystemencoding(), - 'surrogateescape') + path_list = fsdecode(path_list) if path_list is None: path_list = defpath @@ -536,19 +535,39 @@ if supports_bytes_environ: __all__.extend(("environb", "getenvb")) -def fsencode(value): - """Encode value for use in the file system, environment variables - or the command line.""" - if isinstance(value, bytes): - return value - elif isinstance(value, str): +def fsencode(filename): + """ + Encode filename to the filesystem encoding with 'surrogateescape' error + handler, return bytes unchanged. On Windows, use 'strict' error handler if + the file system encoding is 'mbcs' (which is the default encoding). + """ + if isinstance(filename, bytes): + return filename + elif isinstance(filename, str): encoding = sys.getfilesystemencoding() if encoding == 'mbcs': - return value.encode(encoding) + return filename.encode(encoding) else: - return value.encode(encoding, 'surrogateescape') + return filename.encode(encoding, 'surrogateescape') else: - raise TypeError("expect bytes or str, not %s" % type(value).__name__) + raise TypeError("expect bytes or str, not %s" % type(filename).__name__) + +def fsdecode(filename): + """ + Decode filename from the filesystem encoding with 'surrogateescape' error + handler, return str unchanged. On Windows, use 'strict' error handler if + the file system encoding is 'mbcs' (which is the default encoding). 
+ """ + if isinstance(filename, str): + return filename + elif isinstance(filename, bytes): + encoding = sys.getfilesystemencoding() + if encoding == 'mbcs': + return filename.decode(encoding) + else: + return filename.decode(encoding, 'surrogateescape') + else: + raise TypeError("expect bytes or str, not %s" % type(filename).__name__) def _exists(name): return name in globals() diff --git a/Lib/test/test_os.py b/Lib/test/test_os.py index f56280abf21..cd8a1b973ee 100644 --- a/Lib/test/test_os.py +++ b/Lib/test/test_os.py @@ -897,14 +897,6 @@ if sys.platform != 'win32': class Pep383Tests(unittest.TestCase): def setUp(self): - def fsdecode(filename): - encoding = sys.getfilesystemencoding() - if encoding == 'mbcs': - errors = 'strict' - else: - errors = 'surrogateescape' - return filename.decode(encoding, errors) - if support.TESTFN_UNENCODABLE: self.dir = support.TESTFN_UNENCODABLE else: @@ -930,7 +922,7 @@ if sys.platform != 'win32': for fn in bytesfn: f = open(os.path.join(self.bdir, fn), "w") f.close() - fn = fsdecode(fn) + fn = os.fsdecode(fn) if fn in self.unicodefn: raise ValueError("duplicate filename") self.unicodefn.add(fn) @@ -1139,12 +1131,43 @@ class Win32SymlinkTests(unittest.TestCase): self.assertNotEqual(os.lstat(link), os.stat(link)) -class MiscTests(unittest.TestCase): +class FSEncodingTests(unittest.TestCase): + def test_nop(self): + self.assertEquals(os.fsencode(b'abc\xff'), b'abc\xff') + self.assertEquals(os.fsdecode('abc\u0141'), 'abc\u0141') - @unittest.skipIf(os.name == "nt", "POSIX specific test") - def test_fsencode(self): - self.assertEquals(os.fsencode(b'ab\xff'), b'ab\xff') - self.assertEquals(os.fsencode('ab\uDCFF'), b'ab\xff') + def test_identity(self): + # assert fsdecode(fsencode(x)) == x + for fn in ('unicode\u0141', 'latin\xe9', 'ascii'): + try: + bytesfn = os.fsencode(fn) + except UnicodeEncodeError: + continue + self.assertEquals(os.fsdecode(bytesfn), fn) + + def get_output(self, fs_encoding, func): + env = os.environ.copy() + 
env['PYTHONIOENCODING'] = 'utf-8' + env['PYTHONFSENCODING'] = fs_encoding + code = 'import os; print(%s, end="")' % func + process = subprocess.Popen( + [sys.executable, "-c", code], + stdout=subprocess.PIPE, env=env) + stdout, stderr = process.communicate() + self.assertEqual(process.returncode, 0) + return stdout.decode('utf-8') + + def test_encodings(self): + def check(encoding, bytesfn, unicodefn): + encoded = self.get_output(encoding, 'repr(os.fsencode(%a))' % unicodefn) + self.assertEqual(encoded, repr(bytesfn)) + + decoded = self.get_output(encoding, 'repr(os.fsdecode(%a))' % bytesfn) + self.assertEqual(decoded, repr(unicodefn)) + + check('ascii', b'abc\xff', 'abc\udcff') + check('utf-8', b'\xc3\xa9\x80', '\xe9\udc80') + check('iso-8859-15', b'\xef\xa4', '\xef\u20ac') def test_main(): @@ -1163,7 +1186,7 @@ def test_main(): Pep383Tests, Win32KillTests, Win32SymlinkTests, - MiscTests, + FSEncodingTests, ) if __name__ == "__main__": diff --git a/Misc/NEWS b/Misc/NEWS index 23245a8cb15..31fec7adf2e 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -116,6 +116,9 @@ Extensions Library ------- +- Create os.fsdecode(): decode from the filesystem encoding with + surrogateescape error handler, or strict error handler on Windows. + - Issue #3488: Provide convenient shorthand functions ``gzip.compress`` and ``gzip.decompress``. Original patch by Anand B. Pillai.