# Mirror of https://github.com/python/cpython.git
# (synced 2024-12-15 12:54:31 +08:00), commit 17b1d5d4e3.
#
# Commit message:
#   The SAX parser no longer processes general external entities by default
#   to increase security. Before, the parser created network connections to
#   fetch remote files or loaded local files from the file system for DTD
#   and entities.
#   Signed-off-by: Christian Heimes <christian@python.org>
#   https://bugs.python.org/issue17239
#
# File: 357 lines, 13 KiB, Python.
import io
import unittest
import xml.sax

from xml.sax.xmlreader import AttributesImpl
from xml.sax.handler import feature_external_ges
from xml.dom import pulldom

from test.support import findfile
|
|
|
|
|
|
# Location of the "test.xml" sample looked up in the "xmltestdata" subdirectory
# via test.support.findfile; used by the file-based parsing test.
tstfile = findfile("test.xml", subdir="xmltestdata")
|
|
|
|
# A handy XML snippet, containing attributes, a namespace prefix, and a
# self-closing tag.  The line breaks between the elements are significant:
# the parsing tests expect a CHARACTERS event for each of them.
SMALL_SAMPLE = """<?xml version="1.0"?>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:xdc="http://www.xml.com/books">
<!-- A comment -->
<title>Introduction to XSL</title>
<hr/>
<p><xdc:author xdc:attrib="prefixed attribute" attrib="other attrib">A. Namespace</xdc:author></p>
</html>"""
|
|
|
|
|
|
class PullDOMTestCase(unittest.TestCase):
    """Tests for pulldom.parse()/parseString() and the DOMEventStream they
    return, driven by real XML input."""

    def test_parse(self):
        """Minimal test of DOMEventStream.parse()"""

        # This just tests that parsing from a stream works. Actual parser
        # semantics are tested using parseString with a more focused XML
        # fragment.

        # Test with a filename:
        handler = pulldom.parse(tstfile)
        # pulldom opened the file itself, so the test has to close it.
        self.addCleanup(handler.stream.close)
        list(handler)

        # Test with a file object:
        with open(tstfile, "rb") as fin:
            list(pulldom.parse(fin))

    def test_parse_semantics(self):
        """Test DOMEventStream parsing semantics."""

        items = pulldom.parseString(SMALL_SAMPLE)
        evt, node = next(items)
        # Just check the node is a Document:
        self.assertTrue(hasattr(node, "createElement"))
        self.assertEqual(pulldom.START_DOCUMENT, evt)
        evt, node = next(items)
        self.assertEqual(pulldom.START_ELEMENT, evt)
        self.assertEqual("html", node.tagName)
        self.assertEqual(2, len(node.attributes))
        self.assertEqual(node.attributes.getNamedItem("xmlns:xdc").value,
                         "http://www.xml.com/books")
        evt, node = next(items)
        self.assertEqual(pulldom.CHARACTERS, evt)  # Line break
        evt, node = next(items)
        # XXX - A COMMENT event should be reported here (see test_comment),
        # but the comment is swallowed, so what arrives is the line break
        # that follows it:
        self.assertEqual(pulldom.CHARACTERS, evt)
        evt, node = next(items)
        self.assertEqual(pulldom.START_ELEMENT, evt)
        self.assertEqual("title", node.tagName)
        title_node = node
        evt, node = next(items)
        self.assertEqual(pulldom.CHARACTERS, evt)
        self.assertEqual("Introduction to XSL", node.data)
        evt, node = next(items)
        self.assertEqual(pulldom.END_ELEMENT, evt)
        self.assertEqual("title", node.tagName)
        # The end event must hand back the very node the start event did.
        self.assertIs(title_node, node)
        evt, node = next(items)
        self.assertEqual(pulldom.CHARACTERS, evt)
        evt, node = next(items)
        self.assertEqual(pulldom.START_ELEMENT, evt)
        self.assertEqual("hr", node.tagName)
        evt, node = next(items)
        self.assertEqual(pulldom.END_ELEMENT, evt)
        self.assertEqual("hr", node.tagName)
        evt, node = next(items)
        self.assertEqual(pulldom.CHARACTERS, evt)
        evt, node = next(items)
        self.assertEqual(pulldom.START_ELEMENT, evt)
        self.assertEqual("p", node.tagName)
        evt, node = next(items)
        self.assertEqual(pulldom.START_ELEMENT, evt)
        self.assertEqual("xdc:author", node.tagName)
        evt, node = next(items)
        self.assertEqual(pulldom.CHARACTERS, evt)
        evt, node = next(items)
        self.assertEqual(pulldom.END_ELEMENT, evt)
        self.assertEqual("xdc:author", node.tagName)
        evt, node = next(items)
        self.assertEqual(pulldom.END_ELEMENT, evt)
        self.assertEqual("p", node.tagName)
        evt, node = next(items)
        self.assertEqual(pulldom.CHARACTERS, evt)
        evt, node = next(items)
        self.assertEqual(pulldom.END_ELEMENT, evt)
        self.assertEqual("html", node.tagName)
        # XXX No END_DOCUMENT item is ever obtained (see test_end_document).

    def test_expandItem(self):
        """Ensure expandNode works as expected."""
        items = pulldom.parseString(SMALL_SAMPLE)
        # Loop through the nodes until we get to a "title" start tag:
        for evt, item in items:
            if evt == pulldom.START_ELEMENT and item.tagName == "title":
                items.expandNode(item)
                self.assertEqual(1, len(item.childNodes))
                break
        else:
            self.fail("No \"title\" element detected in SMALL_SAMPLE!")
        # Loop until we get to the next start-element:
        for evt, node in items:
            if evt == pulldom.START_ELEMENT:
                break
        else:
            self.fail("No start-element event followed the expanded "
                      "\"title\" element.")
        self.assertEqual("hr", node.tagName,
            "expandNode did not leave DOMEventStream in the correct state.")
        # Attempt to expand a standalone element:
        items.expandNode(node)
        self.assertEqual(next(items)[0], pulldom.CHARACTERS)
        evt, node = next(items)
        self.assertEqual(node.tagName, "p")
        items.expandNode(node)
        next(items)  # Skip character data
        evt, node = next(items)
        self.assertEqual(node.tagName, "html")
        with self.assertRaises(StopIteration):
            next(items)
        # clear() must drop the references to the parser and the stream.
        items.clear()
        self.assertIsNone(items.parser)
        self.assertIsNone(items.stream)

    @unittest.expectedFailure
    def test_comment(self):
        """PullDOM does not receive "comment" events."""
        items = pulldom.parseString(SMALL_SAMPLE)
        for evt, _ in items:
            if evt == pulldom.COMMENT:
                break
        else:
            self.fail("No comment was encountered")

    @unittest.expectedFailure
    def test_end_document(self):
        """PullDOM does not receive "end-document" events."""
        items = pulldom.parseString(SMALL_SAMPLE)
        # Read all of the nodes up to and including </html>:
        for evt, node in items:
            if evt == pulldom.END_ELEMENT and node.tagName == "html":
                break
        try:
            # Assert that the next node is END_DOCUMENT:
            evt, node = next(items)
            self.assertEqual(pulldom.END_DOCUMENT, evt)
        except StopIteration:
            self.fail(
                "Ran out of events, but should have received END_DOCUMENT")

    def test_getitem_deprecation(self):
        """Indexing a DOMEventStream must emit a DeprecationWarning."""
        parser = pulldom.parseString(SMALL_SAMPLE)
        with self.assertWarnsRegex(DeprecationWarning,
                                   r'Use iterator protocol instead'):
            # This should have returned 'END_ELEMENT'.
            self.assertEqual(parser[-1][0], pulldom.START_DOCUMENT)

    def test_external_ges_default(self):
        """The underlying SAX parser must have external general entities
        switched off by default."""
        parser = pulldom.parseString(SMALL_SAMPLE)
        saxparser = parser.parser
        ges = saxparser.getFeature(feature_external_ges)
        self.assertEqual(ges, False)
|
|
|
|
|
|
class ThoroughTestCase(unittest.TestCase):
    """Test the hard-to-reach parts of pulldom."""

    def test_thorough_parse(self):
        """Test some of the hard-to-reach parts of PullDOM."""
        self._test_thorough(pulldom.parse(None, parser=SAXExerciser()))

    @unittest.expectedFailure
    def test_sax2dom_fail(self):
        """SAX2DOM can't handle a PI before the root element."""
        pd = SAX2DOMTestHelper(None, SAXExerciser(), 12)
        self._test_thorough(pd)

    def test_thorough_sax2dom(self):
        """Test some of the hard-to-reach parts of SAX2DOM."""
        pd = SAX2DOMTestHelper(None, SAX2DOMExerciser(), 12)
        self._test_thorough(pd, False)

    def _test_thorough(self, pd, before_root=True):
        """Test some of the hard-to-reach parts of the parser, using a mock
        parser.

        *pd* is the event stream to consume.  When *before_root* is true, a
        comment and a processing instruction are expected ahead of the root
        element's start event.
        """

        evt, node = next(pd)
        self.assertEqual(pulldom.START_DOCUMENT, evt)
        # Just check the node is a Document:
        self.assertTrue(hasattr(node, "createElement"))

        if before_root:
            evt, node = next(pd)
            self.assertEqual(pulldom.COMMENT, evt)
            self.assertEqual("a comment", node.data)
            evt, node = next(pd)
            self.assertEqual(pulldom.PROCESSING_INSTRUCTION, evt)
            self.assertEqual("target", node.target)
            self.assertEqual("data", node.data)

        evt, node = next(pd)
        self.assertEqual(pulldom.START_ELEMENT, evt)
        self.assertEqual("html", node.tagName)

        evt, node = next(pd)
        self.assertEqual(pulldom.COMMENT, evt)
        self.assertEqual("a comment", node.data)
        evt, node = next(pd)
        self.assertEqual(pulldom.PROCESSING_INSTRUCTION, evt)
        self.assertEqual("target", node.target)
        self.assertEqual("data", node.data)

        evt, node = next(pd)
        self.assertEqual(pulldom.START_ELEMENT, evt)
        self.assertEqual("p", node.tagName)

        evt, node = next(pd)
        self.assertEqual(pulldom.CHARACTERS, evt)
        self.assertEqual("text", node.data)
        evt, node = next(pd)
        self.assertEqual(pulldom.END_ELEMENT, evt)
        self.assertEqual("p", node.tagName)
        evt, node = next(pd)
        self.assertEqual(pulldom.END_ELEMENT, evt)
        self.assertEqual("html", node.tagName)
        evt, node = next(pd)
        self.assertEqual(pulldom.END_DOCUMENT, evt)
|
|
|
|
|
|
class SAXExerciser:
    """A fake sax parser that calls some of the harder-to-reach sax methods to
    ensure it emits the correct events"""

    def setContentHandler(self, handler):
        """Remember *handler*; parse() sends all of its events to it."""
        self._handler = handler

    def parse(self, _):
        """Fire a fixed sequence of SAX events at the content handler.

        The argument (normally the input source) is ignored.
        """
        h = self._handler
        h.startDocument()

        # The next two items ensure that items preceding the first
        # start_element are properly stored and emitted:
        h.comment("a comment")
        h.processingInstruction("target", "data")

        h.startElement("html", AttributesImpl({}))

        h.comment("a comment")
        h.processingInstruction("target", "data")

        h.startElement("p", AttributesImpl({"class": "paraclass"}))
        h.characters("text")
        h.endElement("p")
        h.endElement("html")
        h.endDocument()

    def stub(self, *args, **kwargs):
        """Stub method. Does nothing."""
        pass

    # Configuration calls are accepted and ignored.
    setProperty = stub
    setFeature = stub
|
|
|
|
|
|
class SAX2DOMExerciser(SAXExerciser):
    """The same as SAXExerciser, but without the processing instruction and
    comment before the root element, because S2D can't handle it"""

    def parse(self, _):
        """Fire the fixed event sequence, starting directly with the root
        element.  The argument is ignored."""
        h = self._handler
        h.startDocument()
        h.startElement("html", AttributesImpl({}))
        h.comment("a comment")
        h.processingInstruction("target", "data")
        h.startElement("p", AttributesImpl({"class": "paraclass"}))
        h.characters("text")
        h.endElement("p")
        h.endElement("html")
        h.endDocument()
|
|
|
|
|
|
class SAX2DOMTestHelper(pulldom.DOMEventStream):
    """Allows us to drive SAX2DOM from a DOMEventStream."""

    def reset(self):
        """Install a fresh SAX2DOM instance as the parser's content handler."""
        self.pulldom = pulldom.SAX2DOM()
        # This content handler relies on namespace support
        self.parser.setFeature(xml.sax.handler.feature_namespaces, True)
        self.parser.setContentHandler(self.pulldom)
|
|
|
|
|
|
class SAX2DOMTestCase(unittest.TestCase):
    """Tests for the SAX2DOM content handler."""

    def confirm(self, test, testname="Test"):
        """Assert that *test* is true, reporting *testname* on failure."""
        self.assertTrue(test, testname)

    def test_basic(self):
        """Ensure SAX2DOM can parse from a stream."""
        with io.StringIO(SMALL_SAMPLE) as fin:
            sd = SAX2DOMTestHelper(fin, xml.sax.make_parser(),
                                   len(SMALL_SAMPLE))
            for evt, node in sd:
                if evt == pulldom.START_ELEMENT and node.tagName == "html":
                    break
            else:
                self.fail("No \"html\" start-element event was produced.")
            # Because the buffer is the same length as the XML, all the
            # nodes should have been parsed and added:
            self.assertGreater(len(node.childNodes), 0)

    def testSAX2DOM(self):
        """Ensure SAX2DOM expands nodes as expected."""
        sax2dom = pulldom.SAX2DOM()
        sax2dom.startDocument()
        sax2dom.startElement("doc", {})
        sax2dom.characters("text")
        sax2dom.startElement("subelm", {})
        sax2dom.characters("text")
        sax2dom.endElement("subelm")
        sax2dom.characters("text")
        sax2dom.endElement("doc")
        sax2dom.endDocument()

        doc = sax2dom.document
        # Unlink the document even if one of the assertions below fails.
        self.addCleanup(doc.unlink)
        root = doc.documentElement
        (text1, elm1, text2) = root.childNodes
        text3 = elm1.childNodes[0]

        # Sibling links:
        self.assertIsNone(text1.previousSibling)
        self.assertIs(text1.nextSibling, elm1)
        self.assertIs(elm1.previousSibling, text1)
        self.assertIs(elm1.nextSibling, text2)
        self.assertIs(text2.previousSibling, elm1)
        self.assertIsNone(text2.nextSibling)
        self.assertIsNone(text3.previousSibling)
        self.assertIsNone(text3.nextSibling)

        # Parent links:
        self.assertIs(root.parentNode, doc)
        self.assertIs(text1.parentNode, root)
        self.assertIs(elm1.parentNode, root)
        self.assertIs(text2.parentNode, root)
        self.assertIs(text3.parentNode, elm1)
|
|
|
|
|
|
# Run the test suite when this file is executed as a script.
if __name__ == "__main__":
    unittest.main()
|