It looks like I can use HTMLParser, apart from a few fringe issues (bogus comments and nonstandard end tags), by subclassing the following class and overriding its onStartTag() and onEndTag() methods.
from HTMLParser import HTMLParser
class VerbatimParser(HTMLParser):
    """HTML parser that writes the markup it parses back out to a stream.

    Every handler re-serializes the construct it receives and passes the
    text to ``emit()``, which writes it to ``out``.  Subclasses customize
    behavior by overriding ``onStartTag()`` and ``onEndTag()``.

    Known deviations from a byte-for-byte copy, visible in the handlers:

    * End tags are rebuilt as ``'</' + tag + '>'`` from the tag name handed
      to ``handle_endtag()``, not copied from the input text.
    * Self-closing tags (``handle_startendtag``) are emitted as-is; they do
      not touch ``tagstack`` and do not trigger the two hooks.
    """

    # Characters that may appear in a marked-section keyword.
    _NAME_CHARS = (
        'abcdefghijklmnopqrstuvwxyz'
        'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
        '0123456789-_.'
    )

    # Marked-section keywords whose terminator is "]]>".  Any other keyword
    # (e.g. the MS Office "if"/"else"/"endif") is closed with "]>".
    _DOUBLE_BRACKET_SECTIONS = ('temp', 'cdata', 'ignore', 'include', 'rcdata')

    def __init__(self, out):
        """Create a parser that writes its output to *out*.

        *out* only needs a ``write(text)`` method.
        """
        # HTMLParser uses old-style classes here, so super() cannot be used;
        # call the base initializer directly.
        HTMLParser.__init__(self)
        self.out = out
        # Names of the start tags seen so far that have not been closed.
        self.tagstack = []

    def emit(self, text):
        """Write *text* to the output stream."""
        self.out.write(text)

    def handle_starttag(self, tag, attrs):
        """Record *tag* as open, echo its source text, then run the hook."""
        self.tagstack.append(tag)
        self.emit(self.get_starttag_text())
        self.onStartTag(tag, attrs)

    def onStartTag(self, tag, attrs):
        """Hook called after a start tag has been emitted; no-op by default."""
        pass

    def onEndTag(self, tag):
        """Hook called before an end tag is emitted; no-op by default."""
        pass

    def handle_endtag(self, tag):
        """Run the hook, unwind ``tagstack`` and emit ``</tag>``."""
        self.onEndTag(tag)
        # Pop the last occurrence of tag, along with any more recent tags
        # (elements that were opened inside it and never closed).  An end
        # tag with no matching open tag leaves the stack untouched.
        try:
            k = self.tagstack[::-1].index(tag)
            del self.tagstack[-k-1:]
        except ValueError:
            pass
        self.emit('</')
        self.emit(tag)
        self.emit('>')

    def handle_startendtag(self, tag, attrs):
        """Echo the source text of a self-closing tag such as ``<br/>``."""
        self.emit(self.get_starttag_text())

    def handle_data(self, data):
        """Emit character data unchanged."""
        self.emit(data)

    def handle_entityref(self, name):
        """Emit a named entity reference as ``&name;``."""
        self.emit('&')
        self.emit(name)
        self.emit(';')

    def handle_charref(self, name):
        """Emit a numeric character reference as ``&#name;``."""
        self.emit('&#')
        self.emit(name)
        self.emit(';')

    def handle_comment(self, data):
        """Emit a comment as ``<!--data-->``."""
        self.emit('<!--')
        self.emit(data)
        self.emit('-->')

    def handle_decl(self, decl):
        """Emit a declaration (e.g. a doctype) as ``<!decl>``."""
        self.emit('<!')
        self.emit(decl)
        self.emit('>')

    def handle_pi(self, data):
        """Emit a processing instruction as ``<?data>``."""
        self.emit('<?')
        self.emit(data)
        self.emit('>')

    def unknown_decl(self, data):
        """Emit a marked section as ``<![data]]>`` or ``<![data]>``.

        *data* arrives without its terminator, so the terminator is chosen
        from the leading keyword: the keywords listed in
        ``_DOUBLE_BRACKET_SECTIONS`` (``CDATA`` among them, compared
        case-insensitively) get ``]]>``; everything else gets ``]>``.
        Always writing ``]>`` would drop one bracket from CDATA sections.
        """
        # Split off the leading keyword without needing the re module.
        remainder = data.lstrip(self._NAME_CHARS)
        keyword = data[:len(data) - len(remainder)].lower()
        if keyword in self._DOUBLE_BRACKET_SECTIONS:
            closer = ']]>'
        else:
            closer = ']>'
        self.emit('<![')
        self.emit(data)
        self.emit(closer)
def doit(infile, outfile):
    """Parse the file at *infile* and write the re-emitted markup to *outfile*.

    The input is read completely before the output is opened.  Mode ``'w'``
    truncates the file on open, so opening the output first would erase the
    input when both paths name the same file, and would leave an empty
    output file behind when the input cannot be opened.
    """
    with open(infile) as fin:
        markup = fin.read()
    with open(outfile, 'w') as fout:
        parser = VerbatimParser(fout)
        parser.feed(markup)
        # close() makes the parser process any input it is still buffering.
        parser.close()