1995-08-04 12:23:30 +08:00
|
|
|
# New HTML class
|
1995-02-27 21:16:55 +08:00
|
|
|
|
1995-08-04 12:23:30 +08:00
|
|
|
# XXX Check against HTML 2.0 spec
|
1995-02-27 21:16:55 +08:00
|
|
|
|
1995-08-04 12:23:30 +08:00
|
|
|
# XXX reorder methods according to hierarchy
# - html structure: head, body, title, isindex
# - headers
# - lists, items
# - paragraph styles
# - forms
# - character styles
# - images
# - bookkeeping
# - output generation
|
1995-02-27 21:16:55 +08:00
|
|
|
|
|
|
|
|
|
|
|
import sys
|
1995-08-04 12:23:30 +08:00
|
|
|
import regsub
|
1995-02-27 21:16:55 +08:00
|
|
|
import string
|
1995-08-04 12:23:30 +08:00
|
|
|
from sgmllib import SGMLParser
|
1995-02-27 21:16:55 +08:00
|
|
|
|
|
|
|
|
1995-08-04 12:23:30 +08:00
|
|
|
# Character style codes pushed onto HTMLParser.styles by the
# physical/logical character tag handlers (start_tt, start_b, ...).
ROMAN = 0
ITALIC = 1
BOLD = 2
FIXED = 3
|
1995-02-27 21:16:55 +08:00
|
|
|
|
|
|
|
|
1995-08-04 12:23:30 +08:00
|
|
|
class HTMLParser(SGMLParser):

    # HTML parser layered on SGMLParser.
    #
    # Tag handlers (start_xxx / end_xxx / do_xxx) translate markup into
    # calls on a small set of hooks that subclasses may override:
    #   write_data(data)  -- emit text (default: sys.stdout)
    #   new_para()        -- called from para_bgn()
    #   new_style()       -- called after the style stack changes
    #   anchor_bgn(href, name, type) / anchor_end()
    #   handle_image(src, alt)

    def __init__(self):
        SGMLParser.__init__(self)
        self.savedata = None      # text being captured by save_bgn(); None = not capturing
        self.isindex = 0          # set to 1 by <isindex>
        self.title = ''           # filled in by </title>
        self.para = None          # current paragraph tag, set by para_bgn()
        self.lists = []           # stack of open list tags ('ul', 'ol', 'dl')
        self.styles = []          # stack of active character styles
        self.nofill = 0           # nesting depth of <pre>-like blocks
        self.nospace = 1          # true: drop a leading space from the next data
        self.softspace = 0        # true: a space is owed before the next data
        self._open_anchors = []   # per <a>: 1 if anchor_bgn() was called, else 0

    # --- Data

    # Default image handling: treat the ALT text as ordinary data.
    def handle_image(self, src, alt):
        self.handle_data(alt)

    # Normalize whitespace in running text and pass it to output_data().
    # Inside <pre>-like blocks (nofill) the text is passed through verbatim.
    def handle_data(self, data):
        if self.nofill:
            self.handle_literal(data)
            return
        data = regsub.gsub('[ \t\n\r]+', ' ', data)
        if self.nospace and data[:1] == ' ':
            data = data[1:]
        if not data:
            return
        self.nospace = 0
        if self.softspace and data[:1] != ' ':
            data = ' ' + data
        # The owed space (if any) is now part of data; clear the flag so
        # it is not emitted again before later chunks ("a <b>b</b>c"
        # must not become "a b c").
        self.softspace = 0
        if data[-1:] == ' ':
            # Hold back a trailing space until more text arrives.
            data = data[:-1]
            self.softspace = 1
        self.output_data(data)

    # Emit data verbatim and reset the spacing state.
    def handle_literal(self, data):
        self.nospace = 0
        self.softspace = 0
        self.output_data(data)

    # Route data either to the capture buffer or to write_data().
    def output_data(self, data):
        if self.savedata is not None:
            self.savedata = self.savedata + data
        else:
            self.write_data(data)

    # Final output hook; override to send text elsewhere.
    def write_data(self, data):
        sys.stdout.write(data)

    # Start capturing output in self.savedata instead of writing it.
    def save_bgn(self):
        self.savedata = ''
        self.nospace = 1
        self.softspace = 0

    # Stop capturing and return the captured text.
    def save_end(self):
        saved = self.savedata
        self.savedata = None
        self.nospace = 1
        self.softspace = 0
        return saved

    # Hook: called by para_bgn().  Does nothing by default.
    def new_para(self):
        pass

    # Hook: called after push_style()/pop_style().  Does nothing by default.
    def new_style(self):
        pass

    # --- Generic style changes

    # Begin a paragraph: emit a newline unless at the start of a line,
    # and record tag as the current paragraph type unless it is None.
    def para_bgn(self, tag):
        if not self.nospace:
            self.handle_literal('\n')
        self.nospace = 1
        self.softspace = 0
        if tag is not None:
            self.para = tag
        # NOTE(review): indentation was lost in the available source;
        # new_para() is assumed to run for every call, including
        # tag=None -- confirm against the original file.
        self.new_para()

    # End a paragraph; resets the paragraph type to ''.
    def para_end(self):
        self.para_bgn('')

    def push_list(self, tag):
        self.lists.append(tag)
        self.para_bgn(None)

    def pop_list(self):
        # Tolerate a stray list end tag instead of raising IndexError.
        if self.lists:
            del self.lists[-1]
        self.para_end()

    def literal_bgn(self, tag, attrs):
        self.para_bgn(tag)

    def literal_end(self, tag):
        self.para_end()

    def push_style(self, tag):
        self.styles.append(tag)
        self.new_style()

    def pop_style(self):
        # Tolerate a stray style end tag instead of raising IndexError.
        if self.styles:
            del self.styles[-1]
        self.new_style()

    # Hook: an anchor with an href or name begins.  Pushes style 'a'
    # for links (href given) and None for pure name anchors.
    def anchor_bgn(self, href, name, type):
        self.push_style(href and 'a' or None)

    # Hook: the anchor opened by anchor_bgn() ends.
    def anchor_end(self):
        self.pop_style()

    # --- Top level tags

    def start_html(self, attrs): pass
    def end_html(self): pass

    def start_head(self, attrs): pass
    def end_head(self): pass

    def start_body(self, attrs): pass
    def end_body(self): pass

    def do_isindex(self, attrs):
        self.isindex = 1

    def start_title(self, attrs):
        self.save_bgn()

    def end_title(self):
        self.title = self.save_end()

    # --- Old HTML 'literal text' tags

    def start_listing(self, attrs):
        self.setliteral('listing')
        self.literal_bgn('listing', attrs)

    def end_listing(self):
        self.literal_end('listing')

    def start_xmp(self, attrs):
        self.setliteral('xmp')
        self.literal_bgn('xmp', attrs)

    def end_xmp(self):
        self.literal_end('xmp')

    def do_plaintext(self, attrs):
        self.setnomoretags()
        self.literal_bgn('plaintext', attrs)

    # --- Anchors

    # Collect href/name/type and call anchor_bgn() when the anchor has
    # an href or a name.  Whether anchor_bgn() was called is recorded so
    # that end_a() only calls anchor_end() for anchors that were really
    # opened; otherwise </a> would pop an unrelated style.
    def start_a(self, attrs):
        href = ''
        name = ''
        anchor_type = ''
        for attrname, value in attrs:
            if attrname == 'href':
                href = value
            if attrname == 'name':
                name = value
            if attrname == 'type':
                anchor_type = string.lower(value)
        if href or name:
            self._open_anchors.append(1)
            self.anchor_bgn(href, name, anchor_type)
        else:
            self._open_anchors.append(0)

    def end_a(self):
        if not self._open_anchors:
            # Stray </a> with no matching <a>: nothing to undo.
            return
        opened = self._open_anchors[-1]
        del self._open_anchors[-1]
        if opened:
            self.anchor_end()

    # --- Paragraph tags

    def do_p(self, attrs):
        self.para_bgn(None)

    def do_br(self, attrs):
        self.handle_literal('\n')
        self.nospace = 1
        self.softspace = 0

    def do_hr(self, attrs):
        self.para_bgn(None)
        self.handle_literal('-'*40)
        self.para_end()

    def start_h1(self, attrs):
        self.para_bgn('h1')

    def start_h2(self, attrs):
        self.para_bgn('h2')

    def start_h3(self, attrs):
        self.para_bgn('h3')

    def start_h4(self, attrs):
        self.para_bgn('h4')

    def start_h5(self, attrs):
        self.para_bgn('h5')

    def start_h6(self, attrs):
        self.para_bgn('h6')

    def end_h1(self):
        self.para_end()

    end_h2 = end_h1
    end_h3 = end_h2
    end_h4 = end_h3
    end_h5 = end_h4
    end_h6 = end_h5

    def start_ul(self, attrs):
        self.para_bgn(None)
        self.push_list('ul')

    def start_ol(self, attrs):
        self.para_bgn(None)
        self.push_list('ol')

    def end_ul(self):
        self.pop_list()
        self.para_end()

    # The paragraph tag encodes the list nesting depth, e.g. 'li2'.
    def do_li(self, attrs):
        self.para_bgn('li%d' % len(self.lists))

    start_dir = start_menu = start_ul
    end_dir = end_menu = end_ol = end_ul

    def start_dl(self, attrs):
        self.para_bgn(None)
        self.push_list('dl')

    def end_dl(self):
        self.pop_list()
        self.para_end()

    def do_dt(self, attrs):
        self.para_bgn('dt%d' % len(self.lists))

    def do_dd(self, attrs):
        self.para_bgn('dd%d' % len(self.lists))

    def start_address(self, attrs):
        self.para_bgn('address')

    def end_address(self):
        self.para_end()

    def start_pre(self, attrs):
        self.para_bgn('pre')
        self.nofill = self.nofill + 1

    def end_pre(self):
        # Never let the depth go negative on a stray end tag: a negative
        # value is true and would leave nofill mode switched on.
        if self.nofill > 0:
            self.nofill = self.nofill - 1
        self.para_end()

    start_typewriter = start_pre
    end_typewriter = end_pre

    # Extract src/alt and hand them to handle_image(); alt defaults to
    # the placeholder ' (image) '.
    def do_img(self, attrs):
        src = ''
        alt = ' (image) '
        for attrname, value in attrs:
            if attrname == 'alt':
                alt = value
            if attrname == 'src':
                src = value
        self.handle_image(src, alt)

    # --- Character tags -- physical styles

    def start_tt(self, attrs): self.push_style(FIXED)
    def end_tt(self): self.pop_style()

    def start_b(self, attrs): self.push_style(BOLD)
    def end_b(self): self.pop_style()

    def start_i(self, attrs): self.push_style(ITALIC)
    def end_i(self): self.pop_style()

    def start_u(self, attrs): self.push_style(ITALIC) # Underline???
    def end_u(self): self.pop_style()

    def start_r(self, attrs): self.push_style(ROMAN) # Not official
    def end_r(self): self.pop_style()

    # --- Character tags -- logical styles

    start_em = start_i
    end_em = end_i

    start_strong = start_b
    end_strong = end_b

    start_code = start_tt
    end_code = end_tt

    start_samp = start_tt
    end_samp = end_tt

    start_kbd = start_tt
    end_kbd = end_tt

    start_file = start_tt # unofficial
    end_file = end_tt

    start_var = start_i
    end_var = end_i

    start_dfn = start_i
    end_dfn = end_i

    start_cite = start_i
    end_cite = end_i

    start_hp1 = start_i
    end_hp1 = end_i         # was start_i, which pushed instead of popping

    start_hp2 = start_b
    end_hp2 = end_b

    # --- Form tags

    def start_form(self, attrs):
        self.para_bgn(None)

    def end_form(self):
        self.para_end()

    # --- Unhandled tags

    def unknown_starttag(self, tag, attrs):
        pass

    def unknown_endtag(self, tag):
        pass
|
1995-02-27 21:16:55 +08:00
|
|
|
|
|
|
|
|
|
|
|
# Simple self-test: parse 'test.html' from the current directory and
# let the default HTMLParser write the extracted text to stdout.
def test():
    file = 'test.html'
    f = open(file, 'r')
    try:
        data = f.read()
    finally:
        # Close the file even if reading fails.
        f.close()
    p = HTMLParser()
    p.feed(data)
    p.close()
|
1995-02-27 21:16:55 +08:00
|
|
|
|
|
|
|
|
|
|
|
# Run the self-test when executed as a script.
if __name__ == '__main__':
    test()
|