cpython/Lib/markupbase.py

"""Shared support for scanning document type declarations in HTML and XHTML."""

import re
import string

# Matches a declaration name token -- an ASCII letter followed by any run
# of letters, digits, '-', '_' or '.' -- together with the whitespace that
# trails it.  Only the bound .match method is kept.
_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
# Matches one single- or double-quoted string literal (which may not
# contain its own quote character) together with the whitespace that
# trails it.  Only the bound .match method is kept.
_declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match

# The compiled matchers above are all this module needs from re, so the
# module name is removed from the namespace again.
del re


class ParserBase:
    """Parser base class which provides some common support methods used
    by the SGML/HTML and XHTML parsers.

    The class cannot be instantiated directly.  Besides overriding
    error(), a subclass has to supply what the scanning methods use but
    this class does not define:

      self.rawdata          -- the text buffer being scanned
      self.handle_decl()    -- called with the text of a DOCTYPE declaration
      self.parse_comment()  -- called for "<!--" inside an internal subset

    error() is expected to raise; the scanning code does not try to
    recover when it returns.

    All scanning methods take indexes into self.rawdata and return either
    the index at which scanning should continue, or -1 when the buffer
    ends before the construct is complete.
    """

    def __init__(self):
        if self.__class__ is ParserBase:
            raise RuntimeError(
                "markupbase.ParserBase must be subclassed")

    def error(self, message):
        """Report a syntax error; subclasses must override this."""
        raise NotImplementedError(
            "subclasses of ParserBase must override error()")

    def reset(self):
        """Set the tracked position back to line 1, offset 0."""
        self.lineno = 1
        self.offset = 0

    def getpos(self):
        """Return current line number and offset."""
        return self.lineno, self.offset

    # Internal -- update line number and offset.  This should be
    # called for each piece of data exactly once, in order -- in other
    # words the concatenation of all the input strings to this
    # function should be exactly the entire input.
    def updatepos(self, i, j):
        """Account for rawdata[i:j] in lineno/offset and return j."""
        if i >= j:
            return j
        rawdata = self.rawdata
        nlines = rawdata.count("\n", i, j)
        if nlines:
            self.lineno = self.lineno + nlines
            # count() found at least one newline, so rindex() cannot fail.
            pos = rawdata.rindex("\n", i, j)
            self.offset = j - (pos + 1)
        else:
            self.offset = self.offset + j - i
        return j

    # Extra characters parse_declaration() silently skips inside a
    # declaration; reset to '' whenever a DOCTYPE declaration is seen.
    _decl_otherchars = ''

    # Internal -- parse declaration (for use by subclasses).
    def parse_declaration(self, i):
        """Parse the "<!...>" declaration starting at rawdata[i].

        Calls handle_decl() for a DOCTYPE declaration and unknown_decl()
        for any other one, passing the text between "<!" and ">".
        Returns the index just past the closing ">", or -1 if the
        declaration is incomplete (or is the start of a comment).
        """
        # This is some sort of declaration; in "HTML as
        # deployed," this should only be the document type
        # declaration ("<!DOCTYPE html...>").
        rawdata = self.rawdata
        j = i + 2
        assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
        if rawdata[j:j+1] in ("-", ""):
            # Start of comment followed by buffer boundary,
            # or just a buffer boundary.
            return -1
        # in practice, this should look like: ((name|stringlit) S*)+ '>'
        n = len(rawdata)
        decltype, j = self._scan_name(j, i)
        if j < 0:
            return j
        if decltype == "doctype":
            self._decl_otherchars = ''
        while j < n:
            c = rawdata[j]
            if c == ">":
                # end of declaration syntax
                data = rawdata[i+2:j]
                if decltype == "doctype":
                    self.handle_decl(data)
                else:
                    self.unknown_decl(data)
                return j + 1
            if c in "\"'":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1 # incomplete
                j = m.end()
            elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
                name, j = self._scan_name(j, i)
            elif c in self._decl_otherchars:
                j = j + 1
            elif c == "[":
                if decltype == "doctype":
                    j = self._parse_doctype_subset(j + 1, i)
                else:
                    self.error("unexpected '[' char in declaration")
            else:
                # repr() rather than backticks: same text, and valid in
                # every Python version.
                self.error(
                    "unexpected %s char in declaration" % repr(rawdata[j]))
            if j < 0:
                return j
        return -1 # incomplete

    # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
    # returning the index of the '>' that follows the trailing ']' (and any
    # whitespace after it).
    def _parse_doctype_subset(self, i, declstartpos):
        """Scan the "[...]" internal subset whose content starts at i.

        declstartpos is the index of the enclosing declaration; it is
        only used to update the position before an error is reported.
        Returns the index of the ">" ending the DOCTYPE, or -1 if the
        buffer ends first.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        j = i
        while j < n:
            c = rawdata[j]
            if c == "<":
                s = rawdata[j:j+2]
                if s == "<":
                    # end of buffer; incomplete
                    return -1
                if s != "<!":
                    self.updatepos(declstartpos, j + 1)
                    self.error("unexpected char in internal subset (in %s)"
                               % repr(s))
                if (j + 4) > n:
                    # Not enough data left to tell a comment from a
                    # declaration; incomplete.
                    return -1
                if rawdata[j:j+4] == "<!--":
                    j = self.parse_comment(j, report=0)
                    if j < 0:
                        return j
                    continue
                name, j = self._scan_name(j + 2, declstartpos)
                if j == -1:
                    return -1
                if name not in ("attlist", "element", "entity", "notation"):
                    self.updatepos(declstartpos, j + 2)
                    self.error(
                        "unknown declaration %s in internal subset"
                        % repr(name))
                # handle the individual names
                meth = getattr(self, "_parse_doctype_" + name)
                j = meth(j, declstartpos)
                if j < 0:
                    return j
            elif c == "%":
                # parameter entity reference
                if (j + 1) == n:
                    # end of buffer; incomplete
                    return -1
                s, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                # _scan_name() never returns the end of the buffer as a
                # position, so rawdata[j] exists here.
                if rawdata[j] == ";":
                    j = j + 1
            elif c == "]":
                j = j + 1
                while j < n and rawdata[j] in string.whitespace:
                    j = j + 1
                if j < n:
                    if rawdata[j] == ">":
                        return j
                    self.updatepos(declstartpos, j)
                    self.error("unexpected char after internal subset")
                else:
                    return -1
            elif c in string.whitespace:
                j = j + 1
            else:
                self.updatepos(declstartpos, j)
                self.error("unexpected char %s in internal subset" % repr(c))
        # end of buffer reached
        return -1

    # Internal -- scan past <!ELEMENT declarations
    def _parse_doctype_element(self, i, declstartpos):
        """Skip an <!ELEMENT declaration whose element name starts at i.

        Returns the index just past the closing ">", or -1 if incomplete.
        """
        name, j = self._scan_name(i, declstartpos)
        if j == -1:
            return -1
        # style content model; just skip until '>'
        end = self.rawdata.find(">", j)
        if end < 0:
            return -1
        return end + 1

    # Internal -- scan past <!ATTLIST declarations
    def _parse_doctype_attlist(self, i, declstartpos):
        """Skip an <!ATTLIST declaration whose element name starts at i.

        Returns the index just past the closing ">", or -1 if incomplete.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        name, j = self._scan_name(i, declstartpos)
        if j < 0:
            return j
        c = rawdata[j:j+1]
        if c == "":
            return -1
        if c == ">":
            return j + 1
        while 1:
            # scan a series of attribute descriptions; simplified:
            #   name type [value] [#constraint]
            name, j = self._scan_name(j, declstartpos)
            if j < 0:
                return j
            c = rawdata[j:j+1]
            if c == "":
                return -1
            if c == "(":
                # an enumerated type; look for ')'
                close = rawdata.find(")", j)
                if close < 0:
                    return -1
                j = close + 1
                # The bound on j matters: an empty slice is "in" every
                # string, so testing rawdata[j:j+1] alone would spin
                # forever once the end of the buffer is reached.
                while j < n and rawdata[j] in string.whitespace:
                    j = j + 1
                if j >= n:
                    # end of buffer, incomplete
                    return -1
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == "#":
                if rawdata[j:] == "#":
                    # end of buffer
                    return -1
                name, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == '>':
                # all done
                return j + 1

    # Internal -- scan past <!NOTATION declarations
    def _parse_doctype_notation(self, i, declstartpos):
        """Skip a <!NOTATION declaration whose name starts at i.

        Returns the index just past the closing ">", or -1 if incomplete.
        """
        name, j = self._scan_name(i, declstartpos)
        if j < 0:
            return j
        rawdata = self.rawdata
        while 1:
            c = rawdata[j:j+1]
            if not c:
                # end of buffer; incomplete
                return -1
            if c == '>':
                return j + 1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1
                j = m.end()
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan past <!ENTITY declarations
    def _parse_doctype_entity(self, i, declstartpos):
        """Skip an <!ENTITY declaration whose body starts at i.

        A leading "%" (parameter entity) and the whitespace after it are
        skipped first.  Returns the index just past the closing ">", or
        -1 if incomplete.
        """
        rawdata = self.rawdata
        if rawdata[i:i+1] == "%":
            j = i + 1
            while 1:
                c = rawdata[j:j+1]
                if not c:
                    return -1
                if c in string.whitespace:
                    j = j + 1
                else:
                    break
        else:
            j = i
        name, j = self._scan_name(j, declstartpos)
        if j < 0:
            return j
        while 1:
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1    # incomplete
            elif c == ">":
                return j + 1
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan a name token; return the lowercased name and the
    # position following it (and any trailing whitespace), or (None, -1)
    # if we've reached the end of the buffer.
    def _scan_name(self, i, declstartpos):
        """Scan the name token at rawdata[i].

        Returns (name, newpos) with name lowercased and newpos just past
        the name and its trailing whitespace.  Returns (None, -1) when i
        is the end of the buffer or the match runs up to the end of the
        buffer.  When no name starts at i, the position is updated and
        error() is called; nothing is returned after that.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = _declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1  # end of buffer
            return name.lower(), m.end()
        else:
            self.updatepos(declstartpos, i)
            self.error("expected name token")

    # To be overridden -- handlers for unknown objects
    def unknown_decl(self, data):
        """Handle a declaration that is not a DOCTYPE; ignored by default."""
        pass
New base class for the SGMLParser and HTMLParser classes from the sgmllib and HTMLParser modules (and indirectly for the htmllib.HTMLParser class). This has all the support for scanning over DOCTYPE declarations; it warrants having a base class since this is a fair amount of tedious code (since it's fairly strict), and should be in a separate module to avoid compiling many REs that are not used (which would happen if this were placed in either the sgmllib or HTMLParser module). 2001-09-25 04:01:28 +08:00			`"""Shared support for scanning document type declarations in HTML and XHTML."""`

			`import re`
			`import string`

			`_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match`
			`_declstringlit_match = re.compile(r'(\'[^\']*\'\|"[^"]*")\s*').match`

			`del re`


			`class ParserBase:`
			`"""Parser base class which provides some common support methods used`
			`by the SGML/HTML and XHTML parsers."""`

Re-arrange things and remove some unused variables/imports to keep pychecker happy. (This does not cover everything it complained about, though.) 2001-10-27 02:02:28 +08:00			`def __init__(self):`
			`if self.__class__ is ParserBase:`
			`raise RuntimeError(`
			`"markupbase.ParserBase must be subclassed")`

			`def error(self, message):`
			`raise NotImplementedError(`
			`"subclasses of ParserBase must override error()")`

New base class for the SGMLParser and HTMLParser classes from the sgmllib and HTMLParser modules (and indirectly for the htmllib.HTMLParser class). This has all the support for scanning over DOCTYPE declarations; it warrants having a base class since this is a fair amount of tedious code (since it's fairly strict), and should be in a separate module to avoid compiling many REs that are not used (which would happen if this were placed in either then sgmllib or HTMLParser module). 2001-09-25 04:01:28 +08:00			`def reset(self):`
			`self.lineno = 1`
			`self.offset = 0`

			`def getpos(self):`
			`"""Return current line number and offset."""`
			`return self.lineno, self.offset`

			`# Internal -- update line number and offset. This should be`
			`# called for each piece of data exactly once, in order -- in other`
			`# words the concatenation of all the input strings to this`
			`# function should be exactly the entire input.`
			`def updatepos(self, i, j):`
			`if i >= j:`
			`return j`
			`rawdata = self.rawdata`
Use string methods where possible, and remove import string 2002-05-31 22:13:04 +08:00			`nlines = rawdata.count("\n", i, j)`
New base class for the SGMLParser and HTMLParser classes from the sgmllib and HTMLParser modules (and indirectly for the htmllib.HTMLParser class). This has all the support for scanning over DOCTYPE declarations; it warrants having a base class since this is a fair amount of tedious code (since it's fairly strict), and should be in a separate module to avoid compiling many REs that are not used (which would happen if this were placed in either then sgmllib or HTMLParser module). 2001-09-25 04:01:28 +08:00			`if nlines:`
			`self.lineno = self.lineno + nlines`
Use string methods where possible, and remove import string 2002-05-31 22:13:04 +08:00			`pos = rawdata.rindex("\n", i, j) # Should not fail`
New base class for the SGMLParser and HTMLParser classes from the sgmllib and HTMLParser modules (and indirectly for the htmllib.HTMLParser class). This has all the support for scanning over DOCTYPE declarations; it warrants having a base class since this is a fair amount of tedious code (since it's fairly strict), and should be in a separate module to avoid compiling many REs that are not used (which would happen if this were placed in either then sgmllib or HTMLParser module). 2001-09-25 04:01:28 +08:00			`self.offset = j-(pos+1)`
			`else:`
			`self.offset = self.offset + j-i`
			`return j`

			`_decl_otherchars = ''`

			`# Internal -- parse declaration (for use by subclasses).`
			`def parse_declaration(self, i):`
			`# This is some sort of declaration; in "HTML as`
			`# deployed," this should only be the document type`
			`# declaration ("<!DOCTYPE html...>").`
			`rawdata = self.rawdata`
			`j = i + 2`
			`assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"`
			`if rawdata[j:j+1] in ("-", ""):`
			`# Start of comment followed by buffer boundary,`
			`# or just a buffer boundary.`
			`return -1`
			`# in practice, this should look like: ((name\|stringlit) S*)+ '>'`
			`n = len(rawdata)`
			`decltype, j = self._scan_name(j, i)`
			`if j < 0:`
			`return j`
			`if decltype == "doctype":`
			`self._decl_otherchars = ''`
			`while j < n:`
			`c = rawdata[j]`
			`if c == ">":`
			`# end of declaration syntax`
			`data = rawdata[i+2:j]`
			`if decltype == "doctype":`
			`self.handle_decl(data)`
			`else:`
			`self.unknown_decl(data)`
			`return j + 1`
			`if c in "\"'":`
			`m = _declstringlit_match(rawdata, j)`
			`if not m:`
			`return -1 # incomplete`
			`j = m.end()`
			`elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":`
			`name, j = self._scan_name(j, i)`
			`elif c in self._decl_otherchars:`
			`j = j + 1`
			`elif c == "[":`
			`if decltype == "doctype":`
			`j = self._parse_doctype_subset(j + 1, i)`
			`else:`
			`self.error("unexpected '[' char in declaration")`
			`else:`
			`self.error(`
			"unexpected %s char in declaration" % `rawdata[j]`)
			`if j < 0:`
			`return j`
			`return -1 # incomplete`

			`# Internal -- scan past the internal subset in a <!DOCTYPE declaration,`
			`# returning the index just past any whitespace following the trailing ']'.`
			`def _parse_doctype_subset(self, i, declstartpos):`
			`rawdata = self.rawdata`
			`n = len(rawdata)`
			`j = i`
			`while j < n:`
			`c = rawdata[j]`
			`if c == "<":`
			`s = rawdata[j:j+2]`
			`if s == "<":`
			`# end of buffer; incomplete`
			`return -1`
			`if s != "<!":`
			`self.updatepos(declstartpos, j + 1)`
			`self.error("unexpected char in internal subset (in %s)"`
			% `s`)
			`if (j + 2) == n:`
			`# end of buffer; incomplete`
			`return -1`
			`if (j + 4) > n:`
			`# end of buffer; incomplete`
			`return -1`
			`if rawdata[j:j+4] == "<!--":`
			`j = self.parse_comment(j, report=0)`
			`if j < 0:`
			`return j`
			`continue`
			`name, j = self._scan_name(j + 2, declstartpos)`
			`if j == -1:`
			`return -1`
			`if name not in ("attlist", "element", "entity", "notation"):`
			`self.updatepos(declstartpos, j + 2)`
			`self.error(`
			"unknown declaration %s in internal subset" % `name`)
			`# handle the individual names`
			`meth = getattr(self, "_parse_doctype_" + name)`
			`j = meth(j, declstartpos)`
			`if j < 0:`
			`return j`
			`elif c == "%":`
			`# parameter entity reference`
			`if (j + 1) == n:`
			`# end of buffer; incomplete`
			`return -1`
			`s, j = self._scan_name(j + 1, declstartpos)`
			`if j < 0:`
			`return j`
			`if rawdata[j] == ";":`
			`j = j + 1`
			`elif c == "]":`
			`j = j + 1`
			`while j < n and rawdata[j] in string.whitespace:`
			`j = j + 1`
			`if j < n:`
			`if rawdata[j] == ">":`
			`return j`
			`self.updatepos(declstartpos, j)`
			`self.error("unexpected char after internal subset")`
			`else:`
			`return -1`
			`elif c in string.whitespace:`
			`j = j + 1`
			`else:`
			`self.updatepos(declstartpos, j)`
			self.error("unexpected char %s in internal subset" % `c`)
			`# end of buffer reached`
			`return -1`

			`# Internal -- scan past <!ELEMENT declarations`
			`def _parse_doctype_element(self, i, declstartpos):`
			`name, j = self._scan_name(i, declstartpos)`
			`if j == -1:`
			`return -1`
			`# style content model; just skip until '>'`
Re-arrange things and remove some unused variables/imports to keep pychecker happy. (This does not cover everything it complained about, though.) 2001-10-27 02:02:28 +08:00			`rawdata = self.rawdata`
New base class for the SGMLParser and HTMLParser classes from the sgmllib and HTMLParser modules (and indirectly for the htmllib.HTMLParser class). This has all the support for scanning over DOCTYPE declarations; it warrants having a base class since this is a fair amount of tedious code (since it's fairly strict), and should be in a separate module to avoid compiling many REs that are not used (which would happen if this were placed in either then sgmllib or HTMLParser module). 2001-09-25 04:01:28 +08:00			`if '>' in rawdata[j:]:`
Use string methods where possible, and remove import string 2002-05-31 22:13:04 +08:00			`return rawdata.find(">", j) + 1`
New base class for the SGMLParser and HTMLParser classes from the sgmllib and HTMLParser modules (and indirectly for the htmllib.HTMLParser class). This has all the support for scanning over DOCTYPE declarations; it warrants having a base class since this is a fair amount of tedious code (since it's fairly strict), and should be in a separate module to avoid compiling many REs that are not used (which would happen if this were placed in either then sgmllib or HTMLParser module). 2001-09-25 04:01:28 +08:00			`return -1`

			`# Internal -- scan past <!ATTLIST declarations`
			`def _parse_doctype_attlist(self, i, declstartpos):`
			`rawdata = self.rawdata`
			`name, j = self._scan_name(i, declstartpos)`
			`c = rawdata[j:j+1]`
			`if c == "":`
			`return -1`
			`if c == ">":`
			`return j + 1`
			`while 1:`
			`# scan a series of attribute descriptions; simplified:`
			`# name type [value] [#constraint]`
			`name, j = self._scan_name(j, declstartpos)`
			`if j < 0:`
			`return j`
			`c = rawdata[j:j+1]`
			`if c == "":`
			`return -1`
			`if c == "(":`
			`# an enumerated type; look for ')'`
			`if ")" in rawdata[j:]:`
Use string methods where possible, and remove import string 2002-05-31 22:13:04 +08:00			`j = rawdata.find(")", j) + 1`
New base class for the SGMLParser and HTMLParser classes from the sgmllib and HTMLParser modules (and indirectly for the htmllib.HTMLParser class). This has all the support for scanning over DOCTYPE declarations; it warrants having a base class since this is a fair amount of tedious code (since it's fairly strict), and should be in a separate module to avoid compiling many REs that are not used (which would happen if this were placed in either then sgmllib or HTMLParser module). 2001-09-25 04:01:28 +08:00			`else:`
			`return -1`
			`while rawdata[j:j+1] in string.whitespace:`
			`j = j + 1`
			`if not rawdata[j:]:`
			`# end of buffer, incomplete`
			`return -1`
			`else:`
			`name, j = self._scan_name(j, declstartpos)`
			`c = rawdata[j:j+1]`
			`if not c:`
			`return -1`
			`if c in "'\"":`
			`m = _declstringlit_match(rawdata, j)`
			`if m:`
			`j = m.end()`
			`else:`
			`return -1`
			`c = rawdata[j:j+1]`
			`if not c:`
			`return -1`
			`if c == "#":`
			`if rawdata[j:] == "#":`
			`# end of buffer`
			`return -1`
			`name, j = self._scan_name(j + 1, declstartpos)`
			`if j < 0:`
			`return j`
			`c = rawdata[j:j+1]`
			`if not c:`
			`return -1`
			`if c == '>':`
			`# all done`
			`return j + 1`

			`# Internal -- scan past <!NOTATION declarations`
			`def _parse_doctype_notation(self, i, declstartpos):`
			`name, j = self._scan_name(i, declstartpos)`
			`if j < 0:`
			`return j`
			`rawdata = self.rawdata`
			`while 1:`
			`c = rawdata[j:j+1]`
			`if not c:`
			`# end of buffer; incomplete`
			`return -1`
			`if c == '>':`
			`return j + 1`
			`if c in "'\"":`
			`m = _declstringlit_match(rawdata, j)`
			`if not m:`
			`return -1`
			`j = m.end()`
			`else:`
			`name, j = self._scan_name(j, declstartpos)`
			`if j < 0:`
			`return j`

			`# Internal -- scan past <!ENTITY declarations`
			`def _parse_doctype_entity(self, i, declstartpos):`
			`rawdata = self.rawdata`
			`if rawdata[i:i+1] == "%":`
			`j = i + 1`
			`while 1:`
			`c = rawdata[j:j+1]`
			`if not c:`
			`return -1`
			`if c in string.whitespace:`
			`j = j + 1`
			`else:`
			`break`
			`else:`
			`j = i`
			`name, j = self._scan_name(j, declstartpos)`
			`if j < 0:`
			`return j`
			`while 1:`
			`c = self.rawdata[j:j+1]`
			`if not c:`
			`return -1`
			`if c in "'\"":`
			`m = _declstringlit_match(rawdata, j)`
			`if m:`
			`j = m.end()`
			`else:`
			`return -1 # incomplete`
			`elif c == ">":`
			`return j + 1`
			`else:`
			`name, j = self._scan_name(j, declstartpos)`
			`if j < 0:`
			`return j`

			`# Internal -- scan a name token and the new position and the token, or`
			`# return -1 if we've reached the end of the buffer.`
			`def _scan_name(self, i, declstartpos):`
			`rawdata = self.rawdata`
			`n = len(rawdata)`
			`if i == n:`
			`return None, -1`
			`m = _declname_match(rawdata, i)`
			`if m:`
			`s = m.group()`
			`name = s.strip()`
			`if (i + len(s)) == n:`
			`return None, -1 # end of buffer`
Use string methods where possible, and remove import string 2002-05-31 22:13:04 +08:00			`return name.lower(), m.end()`
New base class for the SGMLParser and HTMLParser classes from the sgmllib and HTMLParser modules (and indirectly for the htmllib.HTMLParser class). This has all the support for scanning over DOCTYPE declarations; it warrants having a base class since this is a fair amount of tedious code (since it's fairly strict), and should be in a separate module to avoid compiling many REs that are not used (which would happen if this were placed in either then sgmllib or HTMLParser module). 2001-09-25 04:01:28 +08:00			`else:`
			`self.updatepos(declstartpos, i)`
Remove extra param from call to self.error(). Reported by Neal Norwitz. 2001-10-13 23:59:47 +08:00			`self.error("expected name token")`
Re-arrange things and remove some unused variables/imports to keep pychecker happy. (This does not cover everything it complained about, though.) 2001-10-27 02:02:28 +08:00
			`# To be overridden -- handlers for unknown objects`
			`def unknown_decl(self, data):`
			`pass`