From 7a919e993052deec2826d6ba823156c805b1a9d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lars=20Gust=C3=A4bel?= Date: Sat, 5 May 2012 18:15:03 +0200 Subject: [PATCH] Issue #13815: TarFile.extractfile() now returns io.BufferedReader objects. The ExFileObject class was removed, some of its code went into _FileInFile. --- Doc/library/tarfile.rst | 13 +-- Lib/tarfile.py | 196 +++++++++------------------------------ Lib/test/test_tarfile.py | 69 +++++++------- Misc/NEWS | 2 + 4 files changed, 80 insertions(+), 200 deletions(-) diff --git a/Doc/library/tarfile.rst b/Doc/library/tarfile.rst index 92e9df7b50c..86dd33df381 100644 --- a/Doc/library/tarfile.rst +++ b/Doc/library/tarfile.rst @@ -376,15 +376,12 @@ be finalized; only the internally used file object will be closed. See the .. method:: TarFile.extractfile(member) Extract a member from the archive as a file object. *member* may be a filename - or a :class:`TarInfo` object. If *member* is a regular file, a :term:`file-like - object` is returned. If *member* is a link, a file-like object is constructed from - the link's target. If *member* is none of the above, :const:`None` is returned. + or a :class:`TarInfo` object. If *member* is a regular file or a link, an + :class:`io.BufferedReader` object is returned. Otherwise, :const:`None` is + returned. - .. note:: - - The file-like object is read-only. It provides the methods - :meth:`read`, :meth:`readline`, :meth:`readlines`, :meth:`seek`, :meth:`tell`, - and :meth:`close`, and also supports iteration over its lines. + .. versionchanged:: 3.3 + Return an :class:`io.BufferedReader` object. .. method:: TarFile.add(name, arcname=None, recursive=True, exclude=None, *, filter=None) diff --git a/Lib/tarfile.py b/Lib/tarfile.py index efb277309ba..e273787695e 100644 --- a/Lib/tarfile.py +++ b/Lib/tarfile.py @@ -668,6 +668,8 @@ class _FileInFile(object): self.offset = offset self.size = size self.position = 0 + self.name = getattr(fileobj, "name", None) + self.closed = False if blockinfo is None: blockinfo = [(0, size)] @@ -686,10 +688,16 @@ class _FileInFile(object): if lastpos < self.size: self.map.append((False, lastpos, self.size, None)) + def flush(self): + pass + + def readable(self): + return True + + def writable(self): + return False + def seekable(self): - if not hasattr(self.fileobj, "seekable"): - # XXX gzip.GzipFile and bz2.BZ2File - return True return self.fileobj.seekable() def tell(self): @@ -697,10 +705,21 @@ class _FileInFile(object): """ return self.position - def seek(self, position): + def seek(self, position, whence=io.SEEK_SET): """Seek to a position in the file. """ - self.position = position + if whence == io.SEEK_SET: + self.position = min(max(position, 0), self.size) + elif whence == io.SEEK_CUR: + if position < 0: + self.position = max(self.position + position, 0) + else: + self.position = min(self.position + position, self.size) + elif whence == io.SEEK_END: + self.position = max(min(self.size + position, self.size), 0) + else: + raise ValueError("Invalid argument") + return self.position def read(self, size=None): """Read data from the file. @@ -729,146 +748,16 @@ class _FileInFile(object): size -= length self.position += length return buf -#class _FileInFile - -class ExFileObject(object): - """File-like object for reading an archive member. - Is returned by TarFile.extractfile(). - """ - blocksize = 1024 - - def __init__(self, tarfile, tarinfo): - self.fileobj = _FileInFile(tarfile.fileobj, - tarinfo.offset_data, - tarinfo.size, - tarinfo.sparse) - self.name = tarinfo.name - self.mode = "r" - self.closed = False - self.size = tarinfo.size - - self.position = 0 - self.buffer = b"" - - def readable(self): - return True - - def writable(self): - return False - - def seekable(self): - return self.fileobj.seekable() - - def read(self, size=None): - """Read at most size bytes from the file. If size is not - present or None, read all data until EOF is reached. - """ - if self.closed: - raise ValueError("I/O operation on closed file") - - buf = b"" - if self.buffer: - if size is None: - buf = self.buffer - self.buffer = b"" - else: - buf = self.buffer[:size] - self.buffer = self.buffer[size:] - - if size is None: - buf += self.fileobj.read() - else: - buf += self.fileobj.read(size - len(buf)) - - self.position += len(buf) - return buf - - # XXX TextIOWrapper uses the read1() method. - read1 = read - - def readline(self, size=-1): - """Read one entire line from the file. If size is present - and non-negative, return a string with at most that - size, which may be an incomplete line. - """ - if self.closed: - raise ValueError("I/O operation on closed file") - - pos = self.buffer.find(b"\n") + 1 - if pos == 0: - # no newline found. - while True: - buf = self.fileobj.read(self.blocksize) - self.buffer += buf - if not buf or b"\n" in buf: - pos = self.buffer.find(b"\n") + 1 - if pos == 0: - # no newline found. - pos = len(self.buffer) - break - - if size != -1: - pos = min(size, pos) - - buf = self.buffer[:pos] - self.buffer = self.buffer[pos:] - self.position += len(buf) - return buf - - def readlines(self): - """Return a list with all remaining lines. - """ - result = [] - while True: - line = self.readline() - if not line: break - result.append(line) - return result - - def tell(self): - """Return the current file position. - """ - if self.closed: - raise ValueError("I/O operation on closed file") - - return self.position - - def seek(self, pos, whence=io.SEEK_SET): - """Seek to a position in the file. - """ - if self.closed: - raise ValueError("I/O operation on closed file") - - if whence == io.SEEK_SET: - self.position = min(max(pos, 0), self.size) - elif whence == io.SEEK_CUR: - if pos < 0: - self.position = max(self.position + pos, 0) - else: - self.position = min(self.position + pos, self.size) - elif whence == io.SEEK_END: - self.position = max(min(self.size + pos, self.size), 0) - else: - raise ValueError("Invalid argument") - - self.buffer = b"" - self.fileobj.seek(self.position) + def readinto(self, b): + buf = self.read(len(b)) + b[:len(buf)] = buf + return len(buf) def close(self): - """Close the file object. - """ self.closed = True +#class _FileInFile - def __iter__(self): - """Get an iterator over the file's lines. - """ - while True: - line = self.readline() - if not line: - break - yield line -#class ExFileObject #------------------ # Exported Classes @@ -1554,7 +1443,8 @@ class TarFile(object): tarinfo = TarInfo # The default TarInfo class to use. - fileobject = ExFileObject # The default ExFileObject class to use. + fileobject = None # The file-object for extractfile() or + # io.BufferedReader if None. def __init__(self, name=None, mode="r", fileobj=None, format=None, tarinfo=None, dereference=None, ignore_zeros=None, encoding=None, @@ -2178,12 +2068,9 @@ class TarFile(object): def extractfile(self, member): """Extract a member from the archive as a file object. `member' may be - a filename or a TarInfo object. If `member' is a regular file, a - file-like object is returned. If `member' is a link, a file-like - object is constructed from the link's target. If `member' is none of - the above, None is returned. - The file-like object is read-only and provides the following - methods: read(), readline(), readlines(), seek() and tell() + a filename or a TarInfo object. If `member' is a regular file or a + link, an io.BufferedReader object is returned. Otherwise, None is + returned. """ self._check("r") @@ -2192,13 +2079,14 @@ class TarFile(object): else: tarinfo = member - if tarinfo.isreg(): - return self.fileobject(self, tarinfo) - - elif tarinfo.type not in SUPPORTED_TYPES: - # If a member's type is unknown, it is treated as a - # regular file. - return self.fileobject(self, tarinfo) + if tarinfo.isreg() or tarinfo.type not in SUPPORTED_TYPES: + # Members with unknown types are treated as regular files. + if self.fileobject is None: + fileobj = _FileInFile(self.fileobj, tarinfo.offset_data, tarinfo.size, tarinfo.sparse) + return io.BufferedReader(fileobj) + else: + # Keep the traditional pre-3.3 API intact. + return self.fileobject(self, tarinfo) elif tarinfo.islnk() or tarinfo.issym(): if isinstance(self.fileobj, _Stream): diff --git a/Lib/test/test_tarfile.py b/Lib/test/test_tarfile.py index 4967ad087cc..c7bf774b426 100644 --- a/Lib/test/test_tarfile.py +++ b/Lib/test/test_tarfile.py @@ -56,13 +56,10 @@ class UstarReadTest(ReadTest): def test_fileobj_regular_file(self): tarinfo = self.tar.getmember("ustar/regtype") - fobj = self.tar.extractfile(tarinfo) - try: + with self.tar.extractfile(tarinfo) as fobj: data = fobj.read() self.assertTrue((len(data), md5sum(data)) == (tarinfo.size, md5_regtype), "regular file extraction failed") - finally: - fobj.close() def test_fileobj_readlines(self): self.tar.extract("ustar/regtype", TEMPDIR) @@ -70,8 +67,7 @@ class UstarReadTest(ReadTest): with open(os.path.join(TEMPDIR, "ustar/regtype"), "r") as fobj1: lines1 = fobj1.readlines() - fobj = self.tar.extractfile(tarinfo) - try: + with self.tar.extractfile(tarinfo) as fobj: fobj2 = io.TextIOWrapper(fobj) lines2 = fobj2.readlines() self.assertTrue(lines1 == lines2, @@ -81,21 +77,16 @@ class UstarReadTest(ReadTest): self.assertTrue(lines2[83] == "I will gladly admit that Python is not the fastest running scripting language.\n", "fileobj.readlines() failed") - finally: - fobj.close() def test_fileobj_iter(self): self.tar.extract("ustar/regtype", TEMPDIR) tarinfo = self.tar.getmember("ustar/regtype") with open(os.path.join(TEMPDIR, "ustar/regtype"), "r") as fobj1: lines1 = fobj1.readlines() - fobj2 = self.tar.extractfile(tarinfo) - try: + with self.tar.extractfile(tarinfo) as fobj2: lines2 = list(io.TextIOWrapper(fobj2)) self.assertTrue(lines1 == lines2, "fileobj.__iter__() failed") - finally: - fobj2.close() def test_fileobj_seek(self): self.tar.extract("ustar/regtype", TEMPDIR) @@ -147,17 +138,24 @@ class UstarReadTest(ReadTest): "read() after readline() failed") fobj.close() + def test_fileobj_text(self): + with self.tar.extractfile("ustar/regtype") as fobj: + fobj = io.TextIOWrapper(fobj) + data = fobj.read().encode("iso8859-1") + self.assertEqual(md5sum(data), md5_regtype) + try: + fobj.seek(100) + except AttributeError: + # Issue #13815: seek() complained about a missing + # flush() method. + self.fail("seeking failed in text mode") + # Test if symbolic and hard links are resolved by extractfile(). The # test link members each point to a regular member whose data is # supposed to be exported. def _test_fileobj_link(self, lnktype, regtype): - a = self.tar.extractfile(lnktype) - b = self.tar.extractfile(regtype) - try: + with self.tar.extractfile(lnktype) as a, self.tar.extractfile(regtype) as b: self.assertEqual(a.name, b.name) - finally: - a.close() - b.close() def test_fileobj_link1(self): self._test_fileobj_link("ustar/lnktype", "ustar/regtype") @@ -265,9 +263,8 @@ class MiscReadTest(CommonReadTest): t = tar.next() name = t.name offset = t.offset - f = tar.extractfile(t) - data = f.read() - f.close() + with tar.extractfile(t) as f: + data = f.read() finally: tar.close() @@ -439,27 +436,26 @@ class StreamReadTest(CommonReadTest): for tarinfo in self.tar: if not tarinfo.isreg(): continue - fobj = self.tar.extractfile(tarinfo) - while True: - try: - buf = fobj.read(512) - except tarfile.StreamError: - self.fail("simple read-through using TarFile.extractfile() failed") - if not buf: - break - fobj.close() + with self.tar.extractfile(tarinfo) as fobj: + while True: + try: + buf = fobj.read(512) + except tarfile.StreamError: + self.fail("simple read-through using TarFile.extractfile() failed") + if not buf: + break def test_fileobj_regular_file(self): tarinfo = self.tar.next() # get "regtype" (can't use getmember) - fobj = self.tar.extractfile(tarinfo) - data = fobj.read() + with self.tar.extractfile(tarinfo) as fobj: + data = fobj.read() self.assertTrue((len(data), md5sum(data)) == (tarinfo.size, md5_regtype), "regular file extraction failed") def test_provoke_stream_error(self): tarinfos = self.tar.getmembers() - f = self.tar.extractfile(tarinfos[0]) # read the first member - self.assertRaises(tarfile.StreamError, f.read) + with self.tar.extractfile(tarinfos[0]) as f: # read the first member + self.assertRaises(tarfile.StreamError, f.read) def test_compare_members(self): tar1 = tarfile.open(tarname, encoding="iso8859-1") @@ -1484,12 +1480,9 @@ class AppendTest(unittest.TestCase): with tarfile.open(tarname, encoding="iso8859-1") as src: t = src.getmember("ustar/regtype") t.name = "foo" - f = src.extractfile(t) - try: + with src.extractfile(t) as f: with tarfile.open(self.tarname, mode) as tar: tar.addfile(t, f) - finally: - f.close() def _test(self, names=["bar"], fileobj=None): with tarfile.open(self.tarname, fileobj=fileobj) as tar: diff --git a/Misc/NEWS b/Misc/NEWS index 6251da6416d..92139d9fbf2 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -89,6 +89,8 @@ Core and Builtins Library ------- +- Issue #13815: TarFile.extractfile() now returns io.BufferedReader objects. + - Issue #14371: Support bzip2 in zipfile module. Patch by Serhiy Storchaka.