From b2ac32a45fc1f481c6d8727061b50f82e875d0ca Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Thu, 11 Apr 2013 18:25:29 +0100 Subject: [PATCH 01/13] Ignore .coverage as well. --- .gitignore | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 9c5970ee..81b7d66c 100644 --- a/.gitignore +++ b/.gitignore @@ -7,4 +7,7 @@ /MANIFEST # Generated by parse.py -p -stats.prof \ No newline at end of file +stats.prof + +# From cover (esp. in combination with nose) +.coverage From e90eeee5bc956a196659eba8015a1f3118ad6197 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Thu, 11 Apr 2013 18:59:36 +0100 Subject: [PATCH 02/13] Add diff to error messages from treewalker tests. I've spent too long straining to see subtle differences. This helps. --- html5lib/tests/test_treewalkers.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/html5lib/tests/test_treewalkers.py b/html5lib/tests/test_treewalkers.py index 97eee4cd..a876d49a 100644 --- a/html5lib/tests/test_treewalkers.py +++ b/html5lib/tests/test_treewalkers.py @@ -1,5 +1,6 @@ from __future__ import absolute_import, division, unicode_literals +from difflib import unified_diff import os import sys import unittest @@ -280,10 +281,15 @@ def runTreewalkerTest(innerHTML, input, expected, errors, treeClass): output = convertTokens(treeClass["walker"](document)) output = attrlist.sub(sortattrs, output) expected = attrlist.sub(sortattrs, convertExpected(expected)) + add_lf = lambda x: x + "\n" + diff = "".join(unified_diff(map(add_lf, expected.splitlines()), + map(add_lf, output.splitlines()), + "Expected", "Received")) assert expected == output, "\n".join([ "", "Input:", input, "", "Expected:", expected, - "", "Received:", output + "", "Received:", output, + "", "Diff:", diff, ]) except NotImplementedError: pass # Amnesty for those that confess... 
From 3a669294933c85b2b8bbfbd5f8e6c967eb8f7a11 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Thu, 11 Apr 2013 19:35:30 +0100 Subject: [PATCH 03/13] Re-enable Genshi treewalker. The Genshi treewalker was never fully updated when support for namespaces was added to treewalkers. This finally fixes this, allowing all tests to once again pass. This removes the ignoring of children of void elements as the commonly used treewalkers don't support this and there was some subtle bug I couldn't see in the code. Given most don't support this, this seems like little loss. (You will, however, still get an error from the treewalker if a void element has children.) --- html5lib/tests/test_treewalkers.py | 22 ++++++++------ html5lib/treewalkers/genshistream.py | 45 +++++++++++++--------------- 2 files changed, 34 insertions(+), 33 deletions(-) diff --git a/html5lib/tests/test_treewalkers.py b/html5lib/tests/test_treewalkers.py index a876d49a..4376288f 100644 --- a/html5lib/tests/test_treewalkers.py +++ b/html5lib/tests/test_treewalkers.py @@ -127,15 +127,19 @@ def GenshiAdapter(tree): name = "{%s}%s" % (token["namespace"], token["name"]) else: name = token["name"] - yield (START, - (QName(name), - Attrs([(QName(attr),value) for attr,value in token["data"]])), - (None, -1, -1)) + attrs = Attrs([(QName("{%s}%s" % attr if attr[0] is not None else attr[1]), value) + for attr, value in token["data"].items()]) + yield (START, (QName(name), attrs), (None, -1, -1)) if type == "EmptyTag": type = "EndTag" if type == "EndTag": - yield END, QName(token["name"]), (None, -1, -1) + if token["namespace"]: + name = "{%s}%s" % (token["namespace"], token["name"]) + else: + name = token["name"] + + yield END, QName(name), (None, -1, -1) elif type == "Comment": yield COMMENT, token["data"], (None, -1, -1) @@ -150,10 +154,10 @@ def GenshiAdapter(tree): if text is not None: yield TEXT, text, (None, -1, -1) - #treeTypes["genshi"] = \ - # {"builder": treebuilders.getTreeBuilder("simpletree"), - # 
"adapter": GenshiAdapter, - # "walker": treewalkers.getTreeWalker("genshi")} + treeTypes["genshi"] = \ + {"builder": treebuilders.getTreeBuilder("simpletree"), + "adapter": GenshiAdapter, + "walker": treewalkers.getTreeWalker("genshi")} except ImportError: pass diff --git a/html5lib/treewalkers/genshistream.py b/html5lib/treewalkers/genshistream.py index cfd4e216..6267b6d8 100644 --- a/html5lib/treewalkers/genshistream.py +++ b/html5lib/treewalkers/genshistream.py @@ -1,52 +1,49 @@ from __future__ import absolute_import, division, unicode_literals +from genshi.core import QName from genshi.core import START, END, XML_NAMESPACE, DOCTYPE, TEXT from genshi.core import START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT from genshi.output import NamespaceFlattener from . import _base -from html5lib.constants import voidElements +from html5lib.constants import voidElements, namespaces class TreeWalker(_base.TreeWalker): def __iter__(self): - depth = 0 - ignore_until = None - previous = None + # Buffer the events so we can pass in the following one + previous = None for event in self.tree: if previous is not None: - if previous[0] == START: - depth += 1 - if ignore_until <= depth: - ignore_until = None - if ignore_until is None: - for token in self.tokens(previous, event): - yield token - if token["type"] == "EmptyTag": - ignore_until = depth - if previous[0] == END: - depth -= 1 + for token in self.tokens(previous, event): + yield token previous = event + + # Don't forget the final event! 
if previous is not None: - if ignore_until is None or ignore_until <= depth: - for token in self.tokens(previous, None): - yield token - elif ignore_until is not None: - raise ValueError("Illformed DOM event stream: void element without END_ELEMENT") + for token in self.tokens(previous, None): + yield token def tokens(self, event, next): kind, data, pos = event if kind == START: - tag, attrib = data + tag, attribs = data name = tag.localname namespace = tag.namespace - if tag in voidElements: - for token in self.emptyTag(namespace, name, list(attrib), + converted_attribs = {} + for k, v in attribs: + if isinstance(k, QName): + converted_attribs[(k.namespace, k.localname)] = v + else: + converted_attribs[(None, k)] = v + + if namespace == namespaces["html"] and name in voidElements: + for token in self.emptyTag(namespace, name, converted_attribs, not next or next[0] != END or next[1] != tag): yield token else: - yield self.startTag(namespace, name, list(attrib)) + yield self.startTag(namespace, name, converted_attribs) elif kind == END: name = data.localname From 71cb2fe8fc301e7f8d5042ac9b4a0088ef5808b5 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Thu, 11 Apr 2013 19:50:43 +0100 Subject: [PATCH 04/13] Add requirements files for our optional dependencies and test with them. 
--- .travis.yml | 8 +++++++- requirements-optional-cpython.txt | 3 +++ requirements-optional.txt | 11 +++++++++++ 3 files changed, 21 insertions(+), 1 deletion(-) create mode 100644 requirements-optional-cpython.txt create mode 100644 requirements-optional.txt diff --git a/.travis.yml b/.travis.yml index fa01a7d8..e9c542b2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,11 +6,17 @@ python: - "3.3" - "pypy" +env: + - USE_OPTIONAL=true + - USE_OPTIONAL=false + before_install: - git submodule update --init --recursive install: - pip install -r requirements.txt -r requirements-test.txt --use-mirrors + - if [[ $USE_OPTIONAL == "true" ]]; then pip install -r requirements-optional.txt --use-mirrors; fi + - if [[ $TRAVIS_PYTHON_VERSION != "pypy" && $USE_OPTIONAL == "true" ]]; then pip install -r requirements-optional-cpython.txt --use-mirrors; fi script: - - nosetests \ No newline at end of file + - nosetests diff --git a/requirements-optional-cpython.txt b/requirements-optional-cpython.txt new file mode 100644 index 00000000..0dfbeed4 --- /dev/null +++ b/requirements-optional-cpython.txt @@ -0,0 +1,3 @@ +# lxml is supported with its own treebuilder ("lxml") and otherwise +# uses the standard ElementTree support +lxml diff --git a/requirements-optional.txt b/requirements-optional.txt new file mode 100644 index 00000000..b8663ef3 --- /dev/null +++ b/requirements-optional.txt @@ -0,0 +1,11 @@ +# We support a Genshi treewalker that can be used to serialize Genshi +# streams. +genshi + +# chardet can be used as a fallback in case we are unable to determine +# the encoding of a document. +chardet + +# DATrie can be used in place of our Python trie implementation for +# slightly better parsing performance. +datrie From 0e1d6288463a5d398edfc4ef8962e840f24ddf08 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Thu, 11 Apr 2013 19:57:10 +0100 Subject: [PATCH 05/13] Fix DATrie support under Python 2. 
This is a simple issue of using `str` to refer to what should be `six.text_type`. --- html5lib/trie/datrie.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/html5lib/trie/datrie.py b/html5lib/trie/datrie.py index e02efc10..fc98bdc3 100644 --- a/html5lib/trie/datrie.py +++ b/html5lib/trie/datrie.py @@ -3,6 +3,7 @@ from itertools import chain from datrie import Trie as DATrie +from six import text_type from ._base import Trie as ABCTrie @@ -10,7 +11,7 @@ class Trie(ABCTrie): def __init__(self, data): chars = set() for key in data.keys(): - if not isinstance(key, str): + if not isinstance(key, text_type): raise TypeError("All keys must be strings") for char in key: chars.add(char) From 3cd3977a4af9a13fd32a7ae1ad7db857372c96c6 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Thu, 11 Apr 2013 20:02:25 +0100 Subject: [PATCH 06/13] fixup! Add diff to error messages from treewalker tests. --- html5lib/tests/test_treewalkers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/html5lib/tests/test_treewalkers.py b/html5lib/tests/test_treewalkers.py index 4376288f..544bc1db 100644 --- a/html5lib/tests/test_treewalkers.py +++ b/html5lib/tests/test_treewalkers.py @@ -286,8 +286,8 @@ def runTreewalkerTest(innerHTML, input, expected, errors, treeClass): output = attrlist.sub(sortattrs, output) expected = attrlist.sub(sortattrs, convertExpected(expected)) add_lf = lambda x: x + "\n" - diff = "".join(unified_diff(map(add_lf, expected.splitlines()), - map(add_lf, output.splitlines()), + diff = "".join(unified_diff(list(map(add_lf, expected.splitlines())), + list(map(add_lf, output.splitlines())), "Expected", "Received")) assert expected == output, "\n".join([ "", "Input:", input, From 971f9845bbf78925832b9807918dd516eca05b73 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Thu, 11 Apr 2013 20:20:54 +0100 Subject: [PATCH 07/13] Fix chardet test under Python 3 to read file as bytes --- html5lib/tests/test_encoding.py | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/html5lib/tests/test_encoding.py b/html5lib/tests/test_encoding.py index 2cc18a96..74730e60 100644 --- a/html5lib/tests/test_encoding.py +++ b/html5lib/tests/test_encoding.py @@ -54,7 +54,7 @@ def test_encoding(): try: import chardet def test_chardet(): - data = open(os.path.join(test_dir, "encoding" , "chardet", "test_big5.txt")).read() + data = open(os.path.join(test_dir, "encoding" , "chardet", "test_big5.txt"), "rb").read() encoding = inputstream.HTMLInputStream(data).charEncoding assert encoding[0].lower() == "big5" except ImportError: From 23fda02cd36153995d9e96dce03d3ccf4cebb201 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Thu, 11 Apr 2013 20:39:45 +0100 Subject: [PATCH 08/13] Split requirements-optional so chardet works on Python 2 and 3. As the comment says, I am well aware how evil this is hardcoding the Debian URL, but this allows for testing of all supported optional extras, and given chardet is packaged for Python 3 in various places, this seems worthwhile to support. 
--- .travis.yml | 2 ++ requirements-optional-2.txt | 3 +++ requirements-optional-3.txt | 6 ++++++ requirements-optional.txt | 4 ---- 4 files changed, 11 insertions(+), 4 deletions(-) create mode 100644 requirements-optional-2.txt create mode 100644 requirements-optional-3.txt diff --git a/.travis.yml b/.travis.yml index e9c542b2..24fd61b4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -16,6 +16,8 @@ before_install: install: - pip install -r requirements.txt -r requirements-test.txt --use-mirrors - if [[ $USE_OPTIONAL == "true" ]]; then pip install -r requirements-optional.txt --use-mirrors; fi + - if [[ $TRAVIS_PYTHON_VERSION != "3.*" && $USE_OPTIONAL == "true" ]]; then pip install -r requirements-optional-2.txt --use-mirrors; fi + - if [[ $TRAVIS_PYTHON_VERSION == "3.*" && $USE_OPTIONAL == "true" ]]; then pip install -r requirements-optional-3.txt --use-mirrors; fi - if [[ $TRAVIS_PYTHON_VERSION != "pypy" && $USE_OPTIONAL == "true" ]]; then pip install -r requirements-optional-cpython.txt --use-mirrors; fi script: diff --git a/requirements-optional-2.txt b/requirements-optional-2.txt new file mode 100644 index 00000000..e1353e21 --- /dev/null +++ b/requirements-optional-2.txt @@ -0,0 +1,3 @@ +# chardet can be used as a fallback in case we are unable to determine +# the encoding of a document. +chardet diff --git a/requirements-optional-3.txt b/requirements-optional-3.txt new file mode 100644 index 00000000..97bd26d7 --- /dev/null +++ b/requirements-optional-3.txt @@ -0,0 +1,6 @@ +# chardet can be used as a fallback in case we are unable to determine +# the encoding of a document. +# And yes, I know it's evil hardcoding the Debian URL here. But alas, +# there has never been a formal release supporting Python 3, though +# several distros provide packages for it that support Python 3. 
+http://ftp.us.debian.org/debian/pool/main/p/python3-chardet/python3-chardet_2.0.1.orig.tar.gz diff --git a/requirements-optional.txt b/requirements-optional.txt index b8663ef3..1e7ec0ea 100644 --- a/requirements-optional.txt +++ b/requirements-optional.txt @@ -2,10 +2,6 @@ # streams. genshi -# chardet can be used as a fallback in case we are unable to determine -# the encoding of a document. -chardet - # DATrie can be used in place of our Python trie implementation for # slightly better parsing performance. datrie From e23acf9fee2c8508ef7bdcebb52ca55ac55f5791 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Thu, 11 Apr 2013 20:51:39 +0100 Subject: [PATCH 09/13] fixup! Split requirements-optional so chardet works on Python 2 and 3. --- .travis.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 24fd61b4..36829014 100644 --- a/.travis.yml +++ b/.travis.yml @@ -16,8 +16,8 @@ before_install: install: - pip install -r requirements.txt -r requirements-test.txt --use-mirrors - if [[ $USE_OPTIONAL == "true" ]]; then pip install -r requirements-optional.txt --use-mirrors; fi - - if [[ $TRAVIS_PYTHON_VERSION != "3.*" && $USE_OPTIONAL == "true" ]]; then pip install -r requirements-optional-2.txt --use-mirrors; fi - - if [[ $TRAVIS_PYTHON_VERSION == "3.*" && $USE_OPTIONAL == "true" ]]; then pip install -r requirements-optional-3.txt --use-mirrors; fi + - if [[ $TRAVIS_PYTHON_VERSION != 3.* && $USE_OPTIONAL == "true" ]]; then pip install -r requirements-optional-2.txt --use-mirrors; fi + - if [[ $TRAVIS_PYTHON_VERSION == 3.* && $USE_OPTIONAL == "true" ]]; then pip install -r requirements-optional-3.txt --use-mirrors; fi - if [[ $TRAVIS_PYTHON_VERSION != "pypy" && $USE_OPTIONAL == "true" ]]; then pip install -r requirements-optional-cpython.txt --use-mirrors; fi script: From 16733bbd1183069bbf1d3487a39efceec6cba482 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Sat, 13 Apr 2013 15:08:40 +0100 Subject: [PATCH 
10/13] fixup! Add diff to error messages from treewalker tests. --- html5lib/tests/test_treewalkers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/html5lib/tests/test_treewalkers.py b/html5lib/tests/test_treewalkers.py index 544bc1db..f0f33c91 100644 --- a/html5lib/tests/test_treewalkers.py +++ b/html5lib/tests/test_treewalkers.py @@ -1,10 +1,10 @@ from __future__ import absolute_import, division, unicode_literals -from difflib import unified_diff import os import sys import unittest import warnings +from difflib import unified_diff try: unittest.TestCase.assertEqual From 24bdea4960a2c2614af5b2a65588fd1dc327c1b4 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Sat, 13 Apr 2013 15:09:55 +0100 Subject: [PATCH 11/13] fixup! Re-enable Genshi treewalker. --- html5lib/treewalkers/genshistream.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/html5lib/treewalkers/genshistream.py b/html5lib/treewalkers/genshistream.py index 6267b6d8..5d96645f 100644 --- a/html5lib/treewalkers/genshistream.py +++ b/html5lib/treewalkers/genshistream.py @@ -12,7 +12,7 @@ class TreeWalker(_base.TreeWalker): def __iter__(self): # Buffer the events so we can pass in the following one - previous = None + previous = None for event in self.tree: if previous is not None: for token in self.tokens(previous, event): From 27eabb91859459cc3063c7bddc6402b178f710b7 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Sat, 13 Apr 2013 15:11:49 +0100 Subject: [PATCH 12/13] fixup! Add diff to error messages from treewalker tests. 
--- html5lib/tests/test_treewalkers.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/html5lib/tests/test_treewalkers.py b/html5lib/tests/test_treewalkers.py index f0f33c91..50bac4eb 100644 --- a/html5lib/tests/test_treewalkers.py +++ b/html5lib/tests/test_treewalkers.py @@ -285,9 +285,8 @@ def runTreewalkerTest(innerHTML, input, expected, errors, treeClass): output = convertTokens(treeClass["walker"](document)) output = attrlist.sub(sortattrs, output) expected = attrlist.sub(sortattrs, convertExpected(expected)) - add_lf = lambda x: x + "\n" - diff = "".join(unified_diff(list(map(add_lf, expected.splitlines())), - list(map(add_lf, output.splitlines())), + diff = "".join(unified_diff([line + "\n" for line in expected.splitlines()], + [line + "\n" for line in output.splitlines()] "Expected", "Received")) assert expected == output, "\n".join([ "", "Input:", input, From 6dcc84954baa1fa72f1cdfee02cf6e08fa187289 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Sat, 13 Apr 2013 15:17:27 +0100 Subject: [PATCH 13/13] fixup! fixup! Add diff to error messages from treewalker tests. --- html5lib/tests/test_treewalkers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/html5lib/tests/test_treewalkers.py b/html5lib/tests/test_treewalkers.py index 50bac4eb..fa550276 100644 --- a/html5lib/tests/test_treewalkers.py +++ b/html5lib/tests/test_treewalkers.py @@ -286,7 +286,7 @@ def runTreewalkerTest(innerHTML, input, expected, errors, treeClass): output = attrlist.sub(sortattrs, output) expected = attrlist.sub(sortattrs, convertExpected(expected)) diff = "".join(unified_diff([line + "\n" for line in expected.splitlines()], - [line + "\n" for line in output.splitlines()] + [line + "\n" for line in output.splitlines()], "Expected", "Received")) assert expected == output, "\n".join([ "", "Input:", input,