From 3ff5e72736c08e47f538a4d6ab0e4448202b2966 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Sun, 5 May 2013 22:33:29 +0100 Subject: [PATCH 1/2] Fix #6: dom2sax crash by replacing dom2sax with a generic to_sax This moves the functionality to a new treeadapters module (where later the adapters from test_treewalker.py will get moved) and removes the previous dom2sax function. --- CHANGES.rst | 4 ++ html5lib/constants.py | 18 +++++++++ html5lib/html5parser.py | 16 +------- html5lib/treeadapters/__init__.py | 0 html5lib/treeadapters/sax.py | 44 +++++++++++++++++++++ html5lib/treebuilders/dom.py | 65 +------------------------------ 6 files changed, 69 insertions(+), 78 deletions(-) create mode 100644 html5lib/treeadapters/__init__.py create mode 100644 html5lib/treeadapters/sax.py diff --git a/CHANGES.rst b/CHANGES.rst index ac08ae1f..2f691352 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -36,6 +36,10 @@ Released on May 17, 2013 longer supported. ``html5lib.treebuilders.getTreeBuilder("dom")`` will return the default DOM treebuilder, which uses ``xml.dom.minidom``. +* Removed ``dom2sax`` from DOM treebuilders. It has been replaced by + ``treeadapters.sax.to_sax`` which is generic and supports any + treewalker; it also resolves all known bugs with ``dom2sax``. + * Optional heuristic character encoding detection now based on ``charade`` for Python 2.6 - 3.3 compatibility. 
diff --git a/html5lib/constants.py b/html5lib/constants.py index 1866dd78..e7089846 100644 --- a/html5lib/constants.py +++ b/html5lib/constants.py @@ -433,6 +433,24 @@ (namespaces["mathml"], "mtext") )) +adjustForeignAttributes = { + "xlink:actuate": ("xlink", "actuate", namespaces["xlink"]), + "xlink:arcrole": ("xlink", "arcrole", namespaces["xlink"]), + "xlink:href": ("xlink", "href", namespaces["xlink"]), + "xlink:role": ("xlink", "role", namespaces["xlink"]), + "xlink:show": ("xlink", "show", namespaces["xlink"]), + "xlink:title": ("xlink", "title", namespaces["xlink"]), + "xlink:type": ("xlink", "type", namespaces["xlink"]), + "xml:base": ("xml", "base", namespaces["xml"]), + "xml:lang": ("xml", "lang", namespaces["xml"]), + "xml:space": ("xml", "space", namespaces["xml"]), + "xmlns": (None, "xmlns", namespaces["xmlns"]), + "xmlns:xlink": ("xmlns", "xlink", namespaces["xmlns"]) +} + +unadjustForeignAttributes = dict([((ns, local), qname) for qname, (prefix, local, ns) in + adjustForeignAttributes.items()]) + spaceCharacters = frozenset(( "\t", "\n", diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py index 22c2b75c..0518c410 100644 --- a/html5lib/html5parser.py +++ b/html5lib/html5parser.py @@ -17,6 +17,7 @@ from .constants import cdataElements, rcdataElements from .constants import tokenTypes, ReparseException, namespaces from .constants import htmlIntegrationPointElements, mathmlTextIntegrationPointElements +from .constants import adjustForeignAttributes as adjustForeignAttributesMap def parse(doc, treebuilder="etree", encoding=None, @@ -333,20 +334,7 @@ def adjustSVGAttributes(self, token): del token["data"][originalName] def adjustForeignAttributes(self, token): - replacements = { - "xlink:actuate": ("xlink", "actuate", namespaces["xlink"]), - "xlink:arcrole": ("xlink", "arcrole", namespaces["xlink"]), - "xlink:href": ("xlink", "href", namespaces["xlink"]), - "xlink:role": ("xlink", "role", namespaces["xlink"]), - "xlink:show": ("xlink", "show", 
namespaces["xlink"]), - "xlink:title": ("xlink", "title", namespaces["xlink"]), - "xlink:type": ("xlink", "type", namespaces["xlink"]), - "xml:base": ("xml", "base", namespaces["xml"]), - "xml:lang": ("xml", "lang", namespaces["xml"]), - "xml:space": ("xml", "space", namespaces["xml"]), - "xmlns": (None, "xmlns", namespaces["xmlns"]), - "xmlns:xlink": ("xmlns", "xlink", namespaces["xmlns"]) - } + replacements = adjustForeignAttributesMap for originalName in token["data"].keys(): if originalName in replacements: diff --git a/html5lib/treeadapters/__init__.py b/html5lib/treeadapters/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/html5lib/treeadapters/sax.py b/html5lib/treeadapters/sax.py new file mode 100644 index 00000000..ad47df95 --- /dev/null +++ b/html5lib/treeadapters/sax.py @@ -0,0 +1,44 @@ +from __future__ import absolute_import, division, unicode_literals + +from xml.sax.xmlreader import AttributesNSImpl + +from ..constants import adjustForeignAttributes, unadjustForeignAttributes + +prefix_mapping = {} +for prefix, localName, namespace in adjustForeignAttributes.values(): + if prefix is not None: + prefix_mapping[prefix] = namespace + + +def to_sax(walker, handler): + """Call SAX-like content handler based on treewalker walker""" + handler.startDocument() + for prefix, namespace in prefix_mapping.items(): + handler.startPrefixMapping(prefix, namespace) + + for token in walker: + type = token["type"] + if type == "Doctype": + continue + elif type in ("StartTag", "EmptyTag"): + attrs = AttributesNSImpl(token["data"], + unadjustForeignAttributes) + handler.startElementNS((token["namespace"], token["name"]), + token["name"], + attrs) + if type == "EmptyTag": + handler.endElementNS((token["namespace"], token["name"]), + token["name"]) + elif type == "EndTag": + handler.endElementNS((token["namespace"], token["name"]), + token["name"]) + elif type in ("Characters", "SpaceCharacters"): + handler.characters(token["data"]) + elif type == 
"Comment": + pass + else: + assert False, "Unknown token type" + + for prefix, namespace in prefix_mapping.items(): + handler.endPrefixMapping(prefix) + handler.endDocument() diff --git a/html5lib/treebuilders/dom.py b/html5lib/treebuilders/dom.py index f9e0d76e..61e5ed79 100644 --- a/html5lib/treebuilders/dom.py +++ b/html5lib/treebuilders/dom.py @@ -1,7 +1,7 @@ from __future__ import absolute_import, division, unicode_literals -from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE +from xml.dom import minidom, Node import weakref from . import _base @@ -220,69 +220,6 @@ def serializeElement(element, indent=0): return "\n".join(rv) - def dom2sax(node, handler, nsmap={'xml': XML_NAMESPACE}): - if node.nodeType == Node.ELEMENT_NODE: - if not nsmap: - handler.startElement(node.nodeName, node.attributes) - for child in node.childNodes: - dom2sax(child, handler, nsmap) - handler.endElement(node.nodeName) - else: - attributes = dict(node.attributes.itemsNS()) - - # gather namespace declarations - prefixes = [] - for attrname in list(node.attributes.keys()): - attr = node.getAttributeNode(attrname) - if (attr.namespaceURI == XMLNS_NAMESPACE or - (attr.namespaceURI is None and attr.nodeName.startswith('xmlns'))): - prefix = (attr.nodeName != 'xmlns' and attr.nodeName or None) - handler.startPrefixMapping(prefix, attr.nodeValue) - prefixes.append(prefix) - nsmap = nsmap.copy() - nsmap[prefix] = attr.nodeValue - del attributes[(attr.namespaceURI, attr.nodeName)] - - # apply namespace declarations - for attrname in list(node.attributes.keys()): - attr = node.getAttributeNode(attrname) - if attr.namespaceURI is None and ':' in attr.nodeName: - prefix = attr.nodeName.split(':')[0] - if prefix in nsmap: - del attributes[(attr.namespaceURI, attr.nodeName)] - attributes[(nsmap[prefix], attr.nodeName)] = attr.nodeValue - - # SAX events - ns = node.namespaceURI or nsmap.get(None, None) - handler.startElementNS((ns, node.nodeName), node.nodeName, attributes) - for child 
in node.childNodes: - dom2sax(child, handler, nsmap) - handler.endElementNS((ns, node.nodeName), node.nodeName) - for prefix in prefixes: - handler.endPrefixMapping(prefix) - - elif node.nodeType in [Node.TEXT_NODE, Node.CDATA_SECTION_NODE]: - handler.characters(node.nodeValue) - - elif node.nodeType == Node.DOCUMENT_NODE: - handler.startDocument() - for child in node.childNodes: - dom2sax(child, handler, nsmap) - handler.endDocument() - - elif node.nodeType == Node.DOCUMENT_FRAGMENT_NODE: - for child in node.childNodes: - dom2sax(child, handler, nsmap) - - else: - # ATTRIBUTE_NODE - # ENTITY_NODE - # PROCESSING_INSTRUCTION_NODE - # COMMENT_NODE - # DOCUMENT_TYPE_NODE - # NOTATION_NODE - pass - return locals() From c5cbd409c4b77efac35e0450be1b0711b47cb33c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Langa?= Date: Sun, 5 May 2013 01:19:10 +0200 Subject: [PATCH 2/2] Add test for sax treeadapter. --- html5lib/tests/support.py | 45 +++++++++++++++++++++++++++++ html5lib/tests/test_treeadapters.py | 40 +++++++++++++++++++++++++ 2 files changed, 85 insertions(+) create mode 100644 html5lib/tests/test_treeadapters.py diff --git a/html5lib/tests/support.py b/html5lib/tests/support.py index ebe90c2c..41f2d2a0 100644 --- a/html5lib/tests/support.py +++ b/html5lib/tests/support.py @@ -4,6 +4,7 @@ import sys import codecs import glob +import xml.sax.handler base_path = os.path.split(__file__)[0] @@ -130,3 +131,47 @@ def errorMessage(input, expected, actual): if sys.version_info.major == 2: msg = msg.encode("ascii", "backslashreplace") return msg + + +class TracingSaxHandler(xml.sax.handler.ContentHandler): + def __init__(self): + xml.sax.handler.ContentHandler.__init__(self) + self.visited = [] + + def startDocument(self): + self.visited.append('startDocument') + + def endDocument(self): + self.visited.append('endDocument') + + def startPrefixMapping(self, prefix, uri): + # These are ignored as their order is not guaranteed + pass + + def endPrefixMapping(self, 
prefix): + # These are ignored as their order is not guaranteed + pass + + def startElement(self, name, attrs): + self.visited.append(('startElement', name, attrs)) + + def endElement(self, name): + self.visited.append(('endElement', name)) + + def startElementNS(self, name, qname, attrs): + self.visited.append(('startElementNS', name, qname, dict(attrs))) + + def endElementNS(self, name, qname): + self.visited.append(('endElementNS', name, qname)) + + def characters(self, content): + self.visited.append(('characters', content)) + + def ignorableWhitespace(self, whitespace): + self.visited.append(('ignorableWhitespace', whitespace)) + + def processingInstruction(self, target, data): + self.visited.append(('processingInstruction', target, data)) + + def skippedEntity(self, name): + self.visited.append(('skippedEntity', name)) diff --git a/html5lib/tests/test_treeadapters.py b/html5lib/tests/test_treeadapters.py new file mode 100644 index 00000000..5f38b6c3 --- /dev/null +++ b/html5lib/tests/test_treeadapters.py @@ -0,0 +1,40 @@ +from __future__ import absolute_import, division, unicode_literals + +from . import support # flake8: noqa + +import html5lib +from html5lib.treeadapters import sax +from html5lib.treewalkers import getTreeWalker + + +def test_to_sax(): + handler = support.TracingSaxHandler() + tree = html5lib.parse("""<html xml:lang="en">
+ <title>Directory Listing</title>
+ <a href="/"><b/></p>
+ """, treebuilder="etree") + walker = getTreeWalker("etree") + sax.to_sax(walker(tree), handler) + expected = [ + 'startDocument', + ('startElementNS', ('http://www.w3.org/1999/xhtml', 'html'), + 'html', {(None, 'xml:lang'): 'en'}), + ('startElementNS', ('http://www.w3.org/1999/xhtml', 'head'), 'head', {}), + ('startElementNS', ('http://www.w3.org/1999/xhtml', 'title'), 'title', {}), + ('characters', 'Directory Listing'), + ('endElementNS', ('http://www.w3.org/1999/xhtml', 'title'), 'title'), + ('characters', '\n '), + ('endElementNS', ('http://www.w3.org/1999/xhtml', 'head'), 'head'), + ('startElementNS', ('http://www.w3.org/1999/xhtml', 'body'), 'body', {}), + ('startElementNS', ('http://www.w3.org/1999/xhtml', 'a'), 'a', {(None, 'href'): '/'}), + ('startElementNS', ('http://www.w3.org/1999/xhtml', 'b'), 'b', {}), + ('startElementNS', ('http://www.w3.org/1999/xhtml', 'p'), 'p', {}), + ('endElementNS', ('http://www.w3.org/1999/xhtml', 'p'), 'p'), + ('characters', '\n '), + ('endElementNS', ('http://www.w3.org/1999/xhtml', 'b'), 'b'), + ('endElementNS', ('http://www.w3.org/1999/xhtml', 'a'), 'a'), + ('endElementNS', ('http://www.w3.org/1999/xhtml', 'body'), 'body'), + ('endElementNS', ('http://www.w3.org/1999/xhtml', 'html'), 'html'), + 'endDocument', + ] + assert expected == handler.visited