#13358: HTMLParser now calls handle_data only once for each CDATA.

This commit is contained in:
Ezio Melotti 2011-11-18 18:01:49 +02:00
parent 8008f2aba0
commit 15cb489234
3 changed files with 26 additions and 3 deletions

View File

@ -14,7 +14,6 @@ import re
# Regular expressions used for parsing
# Matches a single '&' or '<' character.
interesting_normal = re.compile('[&<]')
# Matches '<' when it is followed by '/' or by the end of the string.
interesting_cdata = re.compile(r'<(/|\Z)')
# Matches '&' followed by an ASCII letter or '#'.
incomplete = re.compile('&[a-zA-Z#]')
# Matches '&', a name (group 1: a letter, then letters, digits, '-' or '.'),
# and one following character that is not an ASCII letter or digit.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
@ -149,8 +148,8 @@ class HTMLParser(_markupbase.ParserBase):
return self.__starttag_text
def set_cdata_mode(self, elem):
    """Switch the parser into CDATA mode for the element *elem*.

    Stores the lower-cased element name in ``self.cdata_elem`` and sets
    ``self.interesting`` to a case-insensitive pattern that matches only
    that element's closing tag (``</elem>``, with optional whitespace
    around the name).
    """
    self.cdata_elem = elem.lower()
    # Build the end-tag pattern directly from the element name.  A prior
    # assignment of the generic ``interesting_cdata`` pattern would be a
    # dead store, since this line always replaces it.
    self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
def clear_cdata_mode(self):
self.interesting = interesting_normal
@ -168,6 +167,8 @@ class HTMLParser(_markupbase.ParserBase):
if match:
j = match.start()
else:
if self.cdata_elem:
break
j = n
if i < j: self.handle_data(rawdata[i:j])
i = self.updatepos(i, j)
@ -250,7 +251,7 @@ class HTMLParser(_markupbase.ParserBase):
else:
assert 0, "interesting.search() lied"
# end while
if end and i < n:
if end and i < n and not self.cdata_elem:
self.handle_data(rawdata[i:n])
i = self.updatepos(i, n)
self.rawdata = rawdata[i:]

View File

@ -301,7 +301,27 @@ DOCTYPE html [
("data", content),
("endtag", element_lower)])
def test_cdata_with_closing_tags(self):
    # Regression test for issue #13358: the whole CDATA content must be
    # reported through exactly one handle_data call.
    #
    # EventCollector.get_events normalizes the recorded events, so a
    # subclass is used here that hands back the raw event list untouched.
    class Collector(EventCollector):
        def get_events(self):
            return self.events

    content = """<!-- not a comment --> &not-an-entity-ref;
<a href="" /> </p><p> <span></span></style>
'</script' + '>'"""
    # Closing-tag spellings with whitespace on either side of the name.
    closing_names = [' script', 'script ', ' script ',
                     '\nscript', 'script\n', '\nscript\n']
    for element in closing_names:
        tag_name = element.lower().strip()
        markup = '<script>{content}</{element}>'.format(element=element,
                                                        content=content)
        expected_events = [
            ("starttag", tag_name, []),
            ("data", content),
            ("endtag", tag_name),
        ]
        self._run_check(markup, expected_events, collector=Collector())
class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):

View File

@ -76,6 +76,8 @@ Core and Builtins
Library
-------
- Issue #13358: HTMLParser now calls handle_data only once for the content of
  each CDATA element (e.g. <script> or <style>).
- Issue #4147: minidom's toprettyxml no longer adds whitespace around a text
node when it is the only child of an element. Initial patch by Dan
Kenigsberg.