From b2ac32a45fc1f481c6d8727061b50f82e875d0ca Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon <geoffers@gmail.com>
Date: Thu, 11 Apr 2013 18:25:29 +0100
Subject: [PATCH 01/13] Ignore .coverage as well.

---
 .gitignore | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 9c5970ee..81b7d66c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,4 +7,7 @@
 /MANIFEST
 
 # Generated by parse.py -p
-stats.prof
\ No newline at end of file
+stats.prof
+
+# From cover (esp. in combination with nose)
+.coverage

From e90eeee5bc956a196659eba8015a1f3118ad6197 Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon <geoffers@gmail.com>
Date: Thu, 11 Apr 2013 18:59:36 +0100
Subject: [PATCH 02/13] Add diff to error messages from treewalker tests.

I've spent too long straining to see subtle differences. This helps.
---
 html5lib/tests/test_treewalkers.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/html5lib/tests/test_treewalkers.py b/html5lib/tests/test_treewalkers.py
index 97eee4cd..a876d49a 100644
--- a/html5lib/tests/test_treewalkers.py
+++ b/html5lib/tests/test_treewalkers.py
@@ -1,5 +1,6 @@
 from __future__ import absolute_import, division, unicode_literals
 
+from difflib import unified_diff
 import os
 import sys
 import unittest
@@ -280,10 +281,15 @@ def runTreewalkerTest(innerHTML, input, expected, errors, treeClass):
         output = convertTokens(treeClass["walker"](document))
         output = attrlist.sub(sortattrs, output)
         expected = attrlist.sub(sortattrs, convertExpected(expected))
+        add_lf = lambda x: x + "\n"
+        diff = "".join(unified_diff(map(add_lf, expected.splitlines()),
+                                    map(add_lf, output.splitlines()),
+                                    "Expected", "Received"))
         assert expected == output, "\n".join([
                 "", "Input:", input,
                 "", "Expected:", expected,
-                "", "Received:", output
+                "", "Received:", output,
+                "", "Diff:", diff,
                 ])
     except NotImplementedError:
         pass # Amnesty for those that confess...

From 3a669294933c85b2b8bbfbd5f8e6c967eb8f7a11 Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon <geoffers@gmail.com>
Date: Thu, 11 Apr 2013 19:35:30 +0100
Subject: [PATCH 03/13] Re-enable Genshi treewalker.

The Genshi treewalker was never fully updated when support for
namespaces was added to treewalkers. This commit finally completes
that update, allowing all tests to pass once again.

This removes the ignoring of children of void elements as the commonly
used treewalkers don't support this and there was some subtle bug I
couldn't see in the code. Given most don't support this, this seems
like little loss. (You will, however, still get an error from the
treewalker if a void element has children.)
---
 html5lib/tests/test_treewalkers.py   | 22 ++++++++------
 html5lib/treewalkers/genshistream.py | 45 +++++++++++++---------------
 2 files changed, 34 insertions(+), 33 deletions(-)

diff --git a/html5lib/tests/test_treewalkers.py b/html5lib/tests/test_treewalkers.py
index a876d49a..4376288f 100644
--- a/html5lib/tests/test_treewalkers.py
+++ b/html5lib/tests/test_treewalkers.py
@@ -127,15 +127,19 @@ def GenshiAdapter(tree):
                     name = "{%s}%s" % (token["namespace"], token["name"])
                 else:
                     name = token["name"]
-                yield (START,
-                       (QName(name),
-                        Attrs([(QName(attr),value) for attr,value in token["data"]])),
-                       (None, -1, -1))
+                attrs = Attrs([(QName("{%s}%s" % attr if attr[0] is not None else attr[1]), value)
+                               for attr, value in token["data"].items()])
+                yield (START, (QName(name), attrs), (None, -1, -1))
                 if type == "EmptyTag":
                     type = "EndTag"
 
             if type == "EndTag":
-                yield END, QName(token["name"]), (None, -1, -1)
+                if token["namespace"]:
+                    name = "{%s}%s" % (token["namespace"], token["name"])
+                else:
+                    name = token["name"]
+
+                yield END, QName(name), (None, -1, -1)
 
             elif type == "Comment":
                 yield COMMENT, token["data"], (None, -1, -1)
@@ -150,10 +154,10 @@ def GenshiAdapter(tree):
         if text is not None:
             yield TEXT, text, (None, -1, -1)
 
-    #treeTypes["genshi"] = \
-    #    {"builder": treebuilders.getTreeBuilder("simpletree"),
-    #     "adapter": GenshiAdapter,
-    #     "walker":  treewalkers.getTreeWalker("genshi")}
+    treeTypes["genshi"] = \
+       {"builder": treebuilders.getTreeBuilder("simpletree"),
+        "adapter": GenshiAdapter,
+        "walker":  treewalkers.getTreeWalker("genshi")}
 except ImportError:
     pass
 
diff --git a/html5lib/treewalkers/genshistream.py b/html5lib/treewalkers/genshistream.py
index cfd4e216..6267b6d8 100644
--- a/html5lib/treewalkers/genshistream.py
+++ b/html5lib/treewalkers/genshistream.py
@@ -1,52 +1,49 @@
 from __future__ import absolute_import, division, unicode_literals
 
+from genshi.core import QName
 from genshi.core import START, END, XML_NAMESPACE, DOCTYPE, TEXT
 from genshi.core  import  START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT
 from genshi.output import NamespaceFlattener
 
 from . import _base
 
-from html5lib.constants import voidElements
+from html5lib.constants import voidElements, namespaces
 
 class TreeWalker(_base.TreeWalker):
     def __iter__(self):
-        depth = 0
-        ignore_until = None
-        previous = None
+        # Buffer the events so we can pass in the following one
+        previous = None        
         for event in self.tree:
             if previous is not None:
-                if previous[0] == START:
-                    depth += 1
-                if ignore_until <= depth:
-                    ignore_until = None
-                if ignore_until is None:
-                    for token in self.tokens(previous, event):
-                        yield token
-                        if token["type"] == "EmptyTag":
-                            ignore_until = depth
-                if previous[0] == END:
-                    depth -= 1
+                for token in self.tokens(previous, event):
+                    yield token
             previous = event
+
+        # Don't forget the final event!
         if previous is not None:
-            if ignore_until is None or ignore_until <= depth:
-                for token in self.tokens(previous, None):
-                    yield token
-            elif ignore_until is not None:
-                raise ValueError("Illformed DOM event stream: void element without END_ELEMENT")
+            for token in self.tokens(previous, None):
+                yield token
 
     def tokens(self, event, next):
         kind, data, pos = event
         if kind == START:
-            tag, attrib = data
+            tag, attribs = data
             name = tag.localname
             namespace = tag.namespace
-            if tag in voidElements:
-                for token in self.emptyTag(namespace, name, list(attrib),
+            converted_attribs = {}
+            for k, v in attribs:
+                if isinstance(k, QName):
+                    converted_attribs[(k.namespace, k.localname)] = v
+                else:
+                    converted_attribs[(None, k)] = v
+
+            if namespace == namespaces["html"] and name in voidElements:
+                for token in self.emptyTag(namespace, name, converted_attribs,
                                            not next or next[0] != END 
                                            or next[1] != tag):
                     yield token
             else:
-                yield self.startTag(namespace, name, list(attrib))
+                yield self.startTag(namespace, name, converted_attribs)
 
         elif kind == END:
             name = data.localname

From 71cb2fe8fc301e7f8d5042ac9b4a0088ef5808b5 Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon <geoffers@gmail.com>
Date: Thu, 11 Apr 2013 19:50:43 +0100
Subject: [PATCH 04/13] Add requirements files for our optional dependencies
 and test with them.

---
 .travis.yml                       |  8 +++++++-
 requirements-optional-cpython.txt |  3 +++
 requirements-optional.txt         | 11 +++++++++++
 3 files changed, 21 insertions(+), 1 deletion(-)
 create mode 100644 requirements-optional-cpython.txt
 create mode 100644 requirements-optional.txt

diff --git a/.travis.yml b/.travis.yml
index fa01a7d8..e9c542b2 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -6,11 +6,17 @@ python:
   - "3.3"
   - "pypy"
 
+env:
+  - USE_OPTIONAL=true
+  - USE_OPTIONAL=false
+
 before_install:
   - git submodule update --init --recursive
 
 install:
   - pip install -r requirements.txt -r requirements-test.txt --use-mirrors
+  - if [[ $USE_OPTIONAL == "true" ]]; then pip install -r requirements-optional.txt --use-mirrors; fi
+  - if [[ $TRAVIS_PYTHON_VERSION != "pypy" && $USE_OPTIONAL == "true" ]]; then pip install -r requirements-optional-cpython.txt --use-mirrors; fi
 
 script:
-  - nosetests
\ No newline at end of file
+  - nosetests
diff --git a/requirements-optional-cpython.txt b/requirements-optional-cpython.txt
new file mode 100644
index 00000000..0dfbeed4
--- /dev/null
+++ b/requirements-optional-cpython.txt
@@ -0,0 +1,3 @@
+# lxml is supported with its own treebuilder ("lxml") and otherwise
+# uses the standard ElementTree support
+lxml
diff --git a/requirements-optional.txt b/requirements-optional.txt
new file mode 100644
index 00000000..b8663ef3
--- /dev/null
+++ b/requirements-optional.txt
@@ -0,0 +1,11 @@
+# We support a Genshi treewalker that can be used to serialize Genshi
+# streams.
+genshi
+
+# chardet can be used as a fallback in case we are unable to determine
+# the encoding of a document.
+chardet
+
+# DATrie can be used in place of our Python trie implementation for
+# slightly better parsing performance.
+datrie

From 0e1d6288463a5d398edfc4ef8962e840f24ddf08 Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon <geoffers@gmail.com>
Date: Thu, 11 Apr 2013 19:57:10 +0100
Subject: [PATCH 05/13] Fix DATrie support under Python 2.

This is a simple issue of using `str` to refer to what should be
`six.text_type`.
---
 html5lib/trie/datrie.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/html5lib/trie/datrie.py b/html5lib/trie/datrie.py
index e02efc10..fc98bdc3 100644
--- a/html5lib/trie/datrie.py
+++ b/html5lib/trie/datrie.py
@@ -3,6 +3,7 @@
 from itertools import chain
 
 from datrie import Trie as DATrie
+from six import text_type
 
 from ._base import Trie as ABCTrie
 
@@ -10,7 +11,7 @@ class Trie(ABCTrie):
     def __init__(self, data):
         chars = set()
         for key in data.keys():
-            if not isinstance(key, str):
+            if not isinstance(key, text_type):
                 raise TypeError("All keys must be strings")
             for char in key:
                 chars.add(char)

From 3cd3977a4af9a13fd32a7ae1ad7db857372c96c6 Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon <geoffers@gmail.com>
Date: Thu, 11 Apr 2013 20:02:25 +0100
Subject: [PATCH 06/13] fixup! Add diff to error messages from treewalker
 tests.

---
 html5lib/tests/test_treewalkers.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/html5lib/tests/test_treewalkers.py b/html5lib/tests/test_treewalkers.py
index 4376288f..544bc1db 100644
--- a/html5lib/tests/test_treewalkers.py
+++ b/html5lib/tests/test_treewalkers.py
@@ -286,8 +286,8 @@ def runTreewalkerTest(innerHTML, input, expected, errors, treeClass):
         output = attrlist.sub(sortattrs, output)
         expected = attrlist.sub(sortattrs, convertExpected(expected))
         add_lf = lambda x: x + "\n"
-        diff = "".join(unified_diff(map(add_lf, expected.splitlines()),
-                                    map(add_lf, output.splitlines()),
+        diff = "".join(unified_diff(list(map(add_lf, expected.splitlines())),
+                                    list(map(add_lf, output.splitlines())),
                                     "Expected", "Received"))
         assert expected == output, "\n".join([
                 "", "Input:", input,

From 971f9845bbf78925832b9807918dd516eca05b73 Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon <geoffers@gmail.com>
Date: Thu, 11 Apr 2013 20:20:54 +0100
Subject: [PATCH 07/13] Fix chardet test under Python 3 to read file as bytes

---
 html5lib/tests/test_encoding.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/html5lib/tests/test_encoding.py b/html5lib/tests/test_encoding.py
index 2cc18a96..74730e60 100644
--- a/html5lib/tests/test_encoding.py
+++ b/html5lib/tests/test_encoding.py
@@ -54,7 +54,7 @@ def test_encoding():
 try:
     import chardet
     def test_chardet():
-        data = open(os.path.join(test_dir, "encoding" , "chardet", "test_big5.txt")).read()
+        data = open(os.path.join(test_dir, "encoding" , "chardet", "test_big5.txt"), "rb").read()
         encoding = inputstream.HTMLInputStream(data).charEncoding
         assert encoding[0].lower() == "big5"
 except ImportError:

From 23fda02cd36153995d9e96dce03d3ccf4cebb201 Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon <geoffers@gmail.com>
Date: Thu, 11 Apr 2013 20:39:45 +0100
Subject: [PATCH 08/13] Split requirements-optional so chardet works on Python
 2 and 3.

As the comment says, I am well aware how evil it is to hardcode the
Debian URL, but this allows for testing of all supported optional
extras, and given chardet is packaged for Python 3 in various places,
this seems worthwhile to support.
---
 .travis.yml                 | 2 ++
 requirements-optional-2.txt | 3 +++
 requirements-optional-3.txt | 6 ++++++
 requirements-optional.txt   | 4 ----
 4 files changed, 11 insertions(+), 4 deletions(-)
 create mode 100644 requirements-optional-2.txt
 create mode 100644 requirements-optional-3.txt

diff --git a/.travis.yml b/.travis.yml
index e9c542b2..24fd61b4 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -16,6 +16,8 @@ before_install:
 install:
   - pip install -r requirements.txt -r requirements-test.txt --use-mirrors
   - if [[ $USE_OPTIONAL == "true" ]]; then pip install -r requirements-optional.txt --use-mirrors; fi
+  - if [[ $TRAVIS_PYTHON_VERSION != "3.*" && $USE_OPTIONAL == "true" ]]; then pip install -r requirements-optional-2.txt --use-mirrors; fi
+  - if [[ $TRAVIS_PYTHON_VERSION == "3.*" && $USE_OPTIONAL == "true" ]]; then pip install -r requirements-optional-3.txt --use-mirrors; fi
   - if [[ $TRAVIS_PYTHON_VERSION != "pypy" && $USE_OPTIONAL == "true" ]]; then pip install -r requirements-optional-cpython.txt --use-mirrors; fi
 
 script:
diff --git a/requirements-optional-2.txt b/requirements-optional-2.txt
new file mode 100644
index 00000000..e1353e21
--- /dev/null
+++ b/requirements-optional-2.txt
@@ -0,0 +1,3 @@
+# chardet can be used as a fallback in case we are unable to determine
+# the encoding of a document.
+chardet
diff --git a/requirements-optional-3.txt b/requirements-optional-3.txt
new file mode 100644
index 00000000..97bd26d7
--- /dev/null
+++ b/requirements-optional-3.txt
@@ -0,0 +1,6 @@
+# chardet can be used as a fallback in case we are unable to determine
+# the encoding of a document.
+# And yes, I know it's evil hardcoding the Debian URL here. But alas,
+# there has never been a formal release supporting Python 3, though
+# several distros provide packages for it that support Python 3.
+http://ftp.us.debian.org/debian/pool/main/p/python3-chardet/python3-chardet_2.0.1.orig.tar.gz
diff --git a/requirements-optional.txt b/requirements-optional.txt
index b8663ef3..1e7ec0ea 100644
--- a/requirements-optional.txt
+++ b/requirements-optional.txt
@@ -2,10 +2,6 @@
 # streams.
 genshi
 
-# chardet can be used as a fallback in case we are unable to determine
-# the encoding of a document.
-chardet
-
 # DATrie can be used in place of our Python trie implementation for
 # slightly better parsing performance.
 datrie

From e23acf9fee2c8508ef7bdcebb52ca55ac55f5791 Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon <geoffers@gmail.com>
Date: Thu, 11 Apr 2013 20:51:39 +0100
Subject: [PATCH 09/13] fixup! Split requirements-optional so chardet works on
 Python 2 and 3.

---
 .travis.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 24fd61b4..36829014 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -16,8 +16,8 @@ before_install:
 install:
   - pip install -r requirements.txt -r requirements-test.txt --use-mirrors
   - if [[ $USE_OPTIONAL == "true" ]]; then pip install -r requirements-optional.txt --use-mirrors; fi
-  - if [[ $TRAVIS_PYTHON_VERSION != "3.*" && $USE_OPTIONAL == "true" ]]; then pip install -r requirements-optional-2.txt --use-mirrors; fi
-  - if [[ $TRAVIS_PYTHON_VERSION == "3.*" && $USE_OPTIONAL == "true" ]]; then pip install -r requirements-optional-3.txt --use-mirrors; fi
+  - if [[ $TRAVIS_PYTHON_VERSION != 3.* && $USE_OPTIONAL == "true" ]]; then pip install -r requirements-optional-2.txt --use-mirrors; fi
+  - if [[ $TRAVIS_PYTHON_VERSION == 3.* && $USE_OPTIONAL == "true" ]]; then pip install -r requirements-optional-3.txt --use-mirrors; fi
   - if [[ $TRAVIS_PYTHON_VERSION != "pypy" && $USE_OPTIONAL == "true" ]]; then pip install -r requirements-optional-cpython.txt --use-mirrors; fi
 
 script:

From 16733bbd1183069bbf1d3487a39efceec6cba482 Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon <geoffers@gmail.com>
Date: Sat, 13 Apr 2013 15:08:40 +0100
Subject: [PATCH 10/13] fixup! Add diff to error messages from treewalker
 tests.

---
 html5lib/tests/test_treewalkers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/html5lib/tests/test_treewalkers.py b/html5lib/tests/test_treewalkers.py
index 544bc1db..f0f33c91 100644
--- a/html5lib/tests/test_treewalkers.py
+++ b/html5lib/tests/test_treewalkers.py
@@ -1,10 +1,10 @@
 from __future__ import absolute_import, division, unicode_literals
 
-from difflib import unified_diff
 import os
 import sys
 import unittest
 import warnings
+from difflib import unified_diff
 
 try:
     unittest.TestCase.assertEqual

From 24bdea4960a2c2614af5b2a65588fd1dc327c1b4 Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon <geoffers@gmail.com>
Date: Sat, 13 Apr 2013 15:09:55 +0100
Subject: [PATCH 11/13] fixup! Re-enable Genshi treewalker.

---
 html5lib/treewalkers/genshistream.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/html5lib/treewalkers/genshistream.py b/html5lib/treewalkers/genshistream.py
index 6267b6d8..5d96645f 100644
--- a/html5lib/treewalkers/genshistream.py
+++ b/html5lib/treewalkers/genshistream.py
@@ -12,7 +12,7 @@
 class TreeWalker(_base.TreeWalker):
     def __iter__(self):
         # Buffer the events so we can pass in the following one
-        previous = None        
+        previous = None
         for event in self.tree:
             if previous is not None:
                 for token in self.tokens(previous, event):

From 27eabb91859459cc3063c7bddc6402b178f710b7 Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon <geoffers@gmail.com>
Date: Sat, 13 Apr 2013 15:11:49 +0100
Subject: [PATCH 12/13] fixup! Add diff to error messages from treewalker
 tests.

---
 html5lib/tests/test_treewalkers.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/html5lib/tests/test_treewalkers.py b/html5lib/tests/test_treewalkers.py
index f0f33c91..50bac4eb 100644
--- a/html5lib/tests/test_treewalkers.py
+++ b/html5lib/tests/test_treewalkers.py
@@ -285,9 +285,8 @@ def runTreewalkerTest(innerHTML, input, expected, errors, treeClass):
         output = convertTokens(treeClass["walker"](document))
         output = attrlist.sub(sortattrs, output)
         expected = attrlist.sub(sortattrs, convertExpected(expected))
-        add_lf = lambda x: x + "\n"
-        diff = "".join(unified_diff(list(map(add_lf, expected.splitlines())),
-                                    list(map(add_lf, output.splitlines())),
+        diff = "".join(unified_diff([line + "\n" for line in expected.splitlines()],
+                                    [line + "\n" for line in output.splitlines()]
                                     "Expected", "Received"))
         assert expected == output, "\n".join([
                 "", "Input:", input,

From 6dcc84954baa1fa72f1cdfee02cf6e08fa187289 Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon <geoffers@gmail.com>
Date: Sat, 13 Apr 2013 15:17:27 +0100
Subject: [PATCH 13/13] fixup! fixup! Add diff to error messages from
 treewalker tests.

---
 html5lib/tests/test_treewalkers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/html5lib/tests/test_treewalkers.py b/html5lib/tests/test_treewalkers.py
index 50bac4eb..fa550276 100644
--- a/html5lib/tests/test_treewalkers.py
+++ b/html5lib/tests/test_treewalkers.py
@@ -286,7 +286,7 @@ def runTreewalkerTest(innerHTML, input, expected, errors, treeClass):
         output = attrlist.sub(sortattrs, output)
         expected = attrlist.sub(sortattrs, convertExpected(expected))
         diff = "".join(unified_diff([line + "\n" for line in expected.splitlines()],
-                                    [line + "\n" for line in output.splitlines()]
+                                    [line + "\n" for line in output.splitlines()],
                                     "Expected", "Received"))
         assert expected == output, "\n".join([
                 "", "Input:", input,