Merge in Anthony's new parser code, from the anthony-parser-branch:

> ----------------------------
> revision 1.20.4.4
> date: 2003/06/12 09:14:17;  author: anthonybaxter;  state: Exp;  lines: +13 -6
> preamble is None when missing, not ''.
> Handle a couple of bogus formatted messages - now parses my main testsuite.
> Handle message/external-body.
> ----------------------------
> revision 1.20.4.3
> date: 2003/06/12 07:16:40;  author: anthonybaxter;  state: Exp;  lines: +6 -4
> epilogue-processing is now the same as the old parser - the newline at the
> end of the line with the --endboundary-- is included as part of the epilogue.
> Note that any whitespace after the boundary is _not_ part of the epilogue.
> ----------------------------
> revision 1.20.4.2
> date: 2003/06/12 06:39:09;  author: anthonybaxter;  state: Exp;  lines: +6 -4
> message/delivery-status fixed.
> HeaderParser fixed.
> ----------------------------
> revision 1.20.4.1
> date: 2003/06/12 06:08:56;  author: anthonybaxter;  state: Exp;  lines: +163 -129
> A work-in-progress snapshot of the new parser. A couple of known problems:
>
> - first (blank) line of MIME epilogues is being consumed
> - message/delivery-status isn't quite right
>
> It still needs a lot of cleanup, but right now it parses a whole lot of
> badness that the old parser failed on. I also need to think about adding
> back the old 'strict' flag in some way.
> =============================================================================
This commit is contained in:
Thomas Wouters 2004-03-20 17:31:29 +00:00
parent d4079e1fc2
commit 0813d76cb0

View File

@ -22,6 +22,75 @@ except NameError:
NLCRE = re.compile('\r\n|\r|\n')


class TextUtil:
    """Wrap a file object, adding line push-back and pattern scanning.

    Lines handed to unreadline() are kept on a stack and are returned, most
    recently pushed first, before any further data is taken from the
    wrapped file object.
    """

    def __init__(self, fp):
        """Wrap fp, an object providing readline() and read()."""
        self.fp = fp
        # Stack of pushed-back lines; the last element is the next line
        # that readline() will hand out.
        self.unread = []

    def readline(self):
        """Return the next line of data.

        If lines have been pushed back with unreadline(), the most recently
        pushed one is returned; otherwise the line comes from the wrapped
        file object.
        """
        if self.unread:
            return self.unread.pop()
        return self.fp.readline()

    def unreadline(self, line):
        """Push a line back so that the next readline() returns it."""
        self.unread.append(line)

    def peekline(self):
        """Return the next line without consuming it."""
        line = self.readline()
        self.unreadline(line)
        return line

    def read(self):
        """Return all remaining data and clear the push-back stack.

        Pushed-back lines come first, in the order readline() would have
        returned them, followed by whatever is left in the wrapped file
        object.  The lines are concatenated as-is; no separator is added.
        """
        r = self.fp.read()
        if self.unread:
            # The stack pops from the end, so reverse it to get read order.
            r = ''.join(self.unread[::-1]) + r
            self.unread = []
        return r

    def readuntil(self, re, afterblank=0, includematch=0):
        """Read a line at a time until a line matches the given RE.

        re is a compiled pattern (anything with a match() method); note
        that the parameter name hides the re module inside this method.

        Returns a 2-tuple of the text read before the matching line (plus
        the matching line itself if includematch is true) and the RE match
        object.  If afterblank is true, a line only counts as a match when
        the line immediately before it was blank; blank lines are kept in
        the returned text.  The matching line is consumed, so the next read
        starts at the line following it.  If end-of-file is reached first,
        the text read so far is returned with None as the match object.
        """
        prematch = []
        blankseen = 0
        while 1:
            line = self.readline()
            if not line:
                # end of file
                return ''.join(prematch), None
            m = re.match(line)
            # blankseen still describes the *previous* line at this point.
            if m and (blankseen or not afterblank):
                if includematch:
                    prematch.append(line)
                return ''.join(prematch), m
            if afterblank:
                # A blank line is one that starts with a line ending.
                blankseen = NLCRE.match(line) is not None
            prematch.append(line)
class Parser:
@ -59,9 +128,13 @@ class Parser:
meaning it parses the entire contents of the file.
"""
root = self._class()
firstbodyline = self._parseheaders(root, fp)
fp = TextUtil(fp)
self._parseheaders(root, fp)
if not headersonly:
self._parsebody(root, fp, firstbodyline)
obj = self._parsemessage(root, fp)
trailer = fp.read()
if obj and trailer:
self._attach_trailer(obj, trailer)
return root
def parsestr(self, text, headersonly=False):
@ -80,7 +153,6 @@ class Parser:
lastheader = ''
lastvalue = []
lineno = 0
firstbodyline = None
while True:
# Don't strip the line before we test for the end condition,
# because whitespace-only header lines are RFC compliant
@ -129,7 +201,7 @@ class Parser:
# There was no separating blank line as mandated by RFC
# 2822, but we're in non-strict mode. So just offer up
# this current line as the first body line.
firstbodyline = line
fp.unreadline(line)
break
if lastheader:
container[lastheader] = NL.join(lastvalue)
@ -138,140 +210,114 @@ class Parser:
# Make sure we retain the last header
if lastheader:
container[lastheader] = NL.join(lastvalue)
return firstbodyline
return
def _parsebody(self, container, fp, firstbodyline=None):
# Parse the body, but first split the payload on the content-type
# boundary if present.
def _parsemessage(self, container, fp):
# Parse the body. We walk through the body from top to bottom,
# keeping track of the current multipart nesting as we go.
# We return the object that gets the data at the end of this
# block.
boundary = container.get_boundary()
isdigest = (container.get_content_type() == 'multipart/digest')
# If there's a boundary, split the payload text into its constituent
# parts and parse each separately. Otherwise, just parse the rest of
# the body as a single message. Note: any exceptions raised in the
# recursive parse need to have their line numbers coerced.
if boundary:
preamble = epilogue = None
# Split into subparts. The first boundary we're looking for won't
# always have a leading newline since we're at the start of the
# body text, and there's not always a preamble before the first
# boundary.
if boundary:
separator = '--' + boundary
payload = fp.read()
if firstbodyline is not None:
payload = firstbodyline + '\n' + payload
# We use an RE here because boundaries can have trailing
# whitespace.
mo = re.search(
r'(?P<sep>' + re.escape(separator) + r')(?P<ws>[ \t]*)',
payload)
if not mo:
if self._strict:
raise Errors.BoundaryError(
"Couldn't find starting boundary: %s" % boundary)
container.set_payload(payload)
return
start = mo.start()
if start > 0:
# there's some pre-MIME boundary preamble
preamble = payload[0:start]
# Find out what kind of line endings we're using
start += len(mo.group('sep')) + len(mo.group('ws'))
mo = NLCRE.search(payload, start)
if mo:
start += len(mo.group(0))
# We create a compiled regexp first because we need to be able to
# specify the start position, and the module function doesn't
# support this signature. :(
cre = re.compile('(?P<sep>\r\n|\r|\n)' +
re.escape(separator) + '--')
mo = cre.search(payload, start)
if mo:
terminator = mo.start()
linesep = mo.group('sep')
if mo.end() < len(payload):
# There's some post-MIME boundary epilogue
epilogue = payload[mo.end():]
elif self._strict:
raise Errors.BoundaryError(
"Couldn't find terminating boundary: %s" % boundary)
else:
# Handle the case of no trailing boundary. Check that it ends
# in a blank line. Some cases (spamspamspam) don't even have
# that!
mo = re.search('(?P<sep>\r\n|\r|\n){2}$', payload)
if not mo:
mo = re.search('(?P<sep>\r\n|\r|\n)$', payload)
if not mo:
raise Errors.BoundaryError(
'No terminating boundary and no trailing empty line')
linesep = mo.group('sep')
terminator = len(payload)
# We split the textual payload on the boundary separator, which
# includes the trailing newline. If the container is a
# multipart/digest then the subparts are by default message/rfc822
# instead of text/plain. In that case, they'll have a optional
# block of MIME headers, then an empty line followed by the
# message headers.
parts = re.split(
linesep + re.escape(separator) + r'[ \t]*' + linesep,
payload[start:terminator])
for part in parts:
if isdigest:
if part.startswith(linesep):
# There's no header block so create an empty message
# object as the container, and lop off the newline so
# we can parse the sub-subobject
msgobj = self._class()
part = part[len(linesep):]
else:
parthdrs, part = part.split(linesep+linesep, 1)
# msgobj in this case is the "message/rfc822" container
msgobj = self.parsestr(parthdrs, headersonly=1)
# while submsgobj is the message itself
msgobj.set_default_type('message/rfc822')
maintype = msgobj.get_content_maintype()
if maintype in ('message', 'multipart'):
submsgobj = self.parsestr(part)
msgobj.attach(submsgobj)
else:
msgobj.set_payload(part)
else:
msgobj = self.parsestr(part)
boundaryRE = re.compile(
r'(?P<sep>' + re.escape(separator) +
r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)$')
preamble, matchobj = fp.readuntil(boundaryRE)
if not matchobj:
# Broken - we hit the end of file. Just set the body
# to the text.
container.set_payload(preamble)
return container
if preamble:
container.preamble = preamble
container.epilogue = epilogue
container.attach(msgobj)
elif container.get_main_type() == 'multipart':
else:
# The module docs specify an empty preamble is None, not ''
container.preamble = None
while 1:
subobj = self._class()
if isdigest:
subobj.set_default_type('message/rfc822')
firstline = fp.peekline()
if firstline.strip():
# we have MIME headers. all good.
self._parseheaders(subobj, fp)
else:
# no MIME headers. this is allowed for multipart/digest
# Consume the extra blank line
fp.readline()
pass
else:
self._parseheaders(subobj, fp)
container.attach(subobj)
maintype = subobj.get_content_maintype()
hassubparts = (subobj.get_content_maintype() in
( "message", "multipart" ))
if hassubparts:
subobj = self._parsemessage(subobj, fp)
trailer, matchobj = fp.readuntil(boundaryRE)
if matchobj is None or trailer:
mo = re.search('(?P<sep>\r\n|\r|\n){2}$', trailer)
if not mo:
mo = re.search('(?P<sep>\r\n|\r|\n)$', trailer)
if not mo:
raise Errors.BoundaryError(
'No terminating boundary and no trailing empty line')
linesep = mo.group('sep')
trailer = trailer[:-len(linesep)]
if trailer:
self._attach_trailer(subobj, trailer)
if matchobj is None or matchobj.group('end'):
# That was the last piece of data. Let our caller attach
# the epilogue to us. But before we do that, push the
# line ending of the match group back into the readline
# buffer, as it's part of the epilogue.
if matchobj:
fp.unreadline(matchobj.group('linesep'))
return container
elif container.get_content_maintype() == "multipart":
# Very bad. A message is a multipart with no boundary!
raise Errors.BoundaryError(
'multipart message with no defined boundary')
elif container.get_type() == 'message/delivery-status':
# This special kind of type contains blocks of headers separated
# by a blank line. We'll represent each header block as a
# separate Message object
blocks = []
while True:
blockmsg = self._class()
self._parseheaders(blockmsg, fp)
if not len(blockmsg):
# No more header blocks left
break
blocks.append(blockmsg)
container.set_payload(blocks)
elif container.get_main_type() == 'message':
# Create a container for the payload, but watch out for there not
# being any headers left
try:
msg = self.parse(fp)
except Errors.HeaderParseError:
'multipart message with no defined boundary')
elif container.get_content_maintype() == "message":
ct = container.get_content_type()
if ct == "message/rfc822":
submessage = self._class()
self._parseheaders(submessage, fp)
self._parsemessage(submessage, fp)
container.attach(submessage)
return submessage
elif ct == "message/delivery-status":
# This special kind of type contains blocks of headers
# separated by a blank line. We'll represent each header
# block as a separate Message object
while 1:
nextblock = self._class()
self._parseheaders(nextblock, fp)
container.attach(nextblock)
# next peek ahead to see whether we've hit the end or not
nextline = fp.peekline()
if nextline[:2] == "--":
break
return container
else:
# Other sort of message object (e.g. external-body)
msg = self._class()
self._parsebody(msg, fp)
container.attach(msg)
self._parsemessage(msg, fp)
container.attach(msg)
return msg
else:
text = fp.read()
if firstbodyline is not None:
text = firstbodyline + '\n' + text
container.set_payload(text)
# single body section. We let our caller set the payload.
return container
def _attach_trailer(self, obj, trailer):
if obj.get_content_maintype() in ("message", "multipart"):
obj.epilogue = trailer
else:
obj.set_payload(trailer)
class HeaderParser(Parser):
@ -284,9 +330,8 @@ class HeaderParser(Parser):
Parsing with this subclass can be considerably faster if all you're
interested in is the message headers.
"""
def _parsebody(self, container, fp, firstbodyline=None):
def _parsemessage(self, container, fp):
# Consume but do not parse, the body
text = fp.read()
if firstbodyline is not None:
text = firstbodyline + '\n' + text
container.set_payload(text)
return None