Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,7 @@
/MANIFEST

# Generated by parse.py -p
stats.prof
stats.prof

# Generated by coverage.py (esp. when run in combination with nose)
.coverage
10 changes: 9 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,19 @@ python:
- "3.3"
- "pypy"

env:
- USE_OPTIONAL=true
- USE_OPTIONAL=false

before_install:
- git submodule update --init --recursive

install:
- pip install -r requirements.txt -r requirements-test.txt --use-mirrors
- if [[ $USE_OPTIONAL == "true" ]]; then pip install -r requirements-optional.txt --use-mirrors; fi
- if [[ $TRAVIS_PYTHON_VERSION != 3.* && $USE_OPTIONAL == "true" ]]; then pip install -r requirements-optional-2.txt --use-mirrors; fi
- if [[ $TRAVIS_PYTHON_VERSION == 3.* && $USE_OPTIONAL == "true" ]]; then pip install -r requirements-optional-3.txt --use-mirrors; fi
- if [[ $TRAVIS_PYTHON_VERSION != "pypy" && $USE_OPTIONAL == "true" ]]; then pip install -r requirements-optional-cpython.txt --use-mirrors; fi

script:
- nosetests
- nosetests
2 changes: 1 addition & 1 deletion html5lib/tests/test_encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def test_encoding():
try:
import chardet
def test_chardet():
data = open(os.path.join(test_dir, "encoding" , "chardet", "test_big5.txt")).read()
data = open(os.path.join(test_dir, "encoding" , "chardet", "test_big5.txt"), "rb").read()
encoding = inputstream.HTMLInputStream(data).charEncoding
assert encoding[0].lower() == "big5"
except ImportError:
Expand Down
29 changes: 19 additions & 10 deletions html5lib/tests/test_treewalkers.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import sys
import unittest
import warnings
from difflib import unified_diff

try:
unittest.TestCase.assertEqual
Expand Down Expand Up @@ -126,15 +127,19 @@ def GenshiAdapter(tree):
name = "{%s}%s" % (token["namespace"], token["name"])
else:
name = token["name"]
yield (START,
(QName(name),
Attrs([(QName(attr),value) for attr,value in token["data"]])),
(None, -1, -1))
attrs = Attrs([(QName("{%s}%s" % attr if attr[0] is not None else attr[1]), value)
for attr, value in token["data"].items()])
yield (START, (QName(name), attrs), (None, -1, -1))
if type == "EmptyTag":
type = "EndTag"

if type == "EndTag":
yield END, QName(token["name"]), (None, -1, -1)
if token["namespace"]:
name = "{%s}%s" % (token["namespace"], token["name"])
else:
name = token["name"]

yield END, QName(name), (None, -1, -1)

elif type == "Comment":
yield COMMENT, token["data"], (None, -1, -1)
Expand All @@ -149,10 +154,10 @@ def GenshiAdapter(tree):
if text is not None:
yield TEXT, text, (None, -1, -1)

#treeTypes["genshi"] = \
# {"builder": treebuilders.getTreeBuilder("simpletree"),
# "adapter": GenshiAdapter,
# "walker": treewalkers.getTreeWalker("genshi")}
treeTypes["genshi"] = \
{"builder": treebuilders.getTreeBuilder("simpletree"),
"adapter": GenshiAdapter,
"walker": treewalkers.getTreeWalker("genshi")}
except ImportError:
pass

Expand Down Expand Up @@ -280,10 +285,14 @@ def runTreewalkerTest(innerHTML, input, expected, errors, treeClass):
output = convertTokens(treeClass["walker"](document))
output = attrlist.sub(sortattrs, output)
expected = attrlist.sub(sortattrs, convertExpected(expected))
diff = "".join(unified_diff([line + "\n" for line in expected.splitlines()],
[line + "\n" for line in output.splitlines()],
"Expected", "Received"))
assert expected == output, "\n".join([
"", "Input:", input,
"", "Expected:", expected,
"", "Received:", output
"", "Received:", output,
"", "Diff:", diff,
])
except NotImplementedError:
pass # Amnesty for those that confess...
Expand Down
43 changes: 20 additions & 23 deletions html5lib/treewalkers/genshistream.py
Original file line number Diff line number Diff line change
@@ -1,52 +1,49 @@
from __future__ import absolute_import, division, unicode_literals

from genshi.core import QName
from genshi.core import START, END, XML_NAMESPACE, DOCTYPE, TEXT
from genshi.core import START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT
from genshi.output import NamespaceFlattener

from . import _base

from html5lib.constants import voidElements
from html5lib.constants import voidElements, namespaces

class TreeWalker(_base.TreeWalker):
def __iter__(self):
depth = 0
ignore_until = None
# Buffer the events so we can pass in the following one
previous = None
for event in self.tree:
if previous is not None:
if previous[0] == START:
depth += 1
if ignore_until <= depth:
ignore_until = None
if ignore_until is None:
for token in self.tokens(previous, event):
yield token
if token["type"] == "EmptyTag":
ignore_until = depth
if previous[0] == END:
depth -= 1
for token in self.tokens(previous, event):
yield token
previous = event

# Don't forget the final event!
if previous is not None:
if ignore_until is None or ignore_until <= depth:
for token in self.tokens(previous, None):
yield token
elif ignore_until is not None:
raise ValueError("Illformed DOM event stream: void element without END_ELEMENT")
for token in self.tokens(previous, None):
yield token

def tokens(self, event, next):
kind, data, pos = event
if kind == START:
tag, attrib = data
tag, attribs = data
name = tag.localname
namespace = tag.namespace
if tag in voidElements:
for token in self.emptyTag(namespace, name, list(attrib),
converted_attribs = {}
for k, v in attribs:
if isinstance(k, QName):
converted_attribs[(k.namespace, k.localname)] = v
else:
converted_attribs[(None, k)] = v

if namespace == namespaces["html"] and name in voidElements:
for token in self.emptyTag(namespace, name, converted_attribs,
not next or next[0] != END
or next[1] != tag):
yield token
else:
yield self.startTag(namespace, name, list(attrib))
yield self.startTag(namespace, name, converted_attribs)

elif kind == END:
name = data.localname
Expand Down
3 changes: 2 additions & 1 deletion html5lib/trie/datrie.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,15 @@
from itertools import chain

from datrie import Trie as DATrie
from six import text_type

from ._base import Trie as ABCTrie

class Trie(ABCTrie):
def __init__(self, data):
chars = set()
for key in data.keys():
if not isinstance(key, str):
if not isinstance(key, text_type):
raise TypeError("All keys must be strings")
for char in key:
chars.add(char)
Expand Down
3 changes: 3 additions & 0 deletions requirements-optional-2.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# chardet can be used as a fallback in case we are unable to determine
# the encoding of a document.
chardet
6 changes: 6 additions & 0 deletions requirements-optional-3.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# chardet can be used as a fallback in case we are unable to determine
# the encoding of a document.
# Hardcoding the Debian URL here is admittedly evil, but there has never
# been a formal chardet release supporting Python 3; several distros do,
# however, ship their own packages of it that work on Python 3.
http://ftp.us.debian.org/debian/pool/main/p/python3-chardet/python3-chardet_2.0.1.orig.tar.gz
3 changes: 3 additions & 0 deletions requirements-optional-cpython.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# lxml is supported with its own treebuilder ("lxml") and otherwise
# uses the standard ElementTree support
lxml
7 changes: 7 additions & 0 deletions requirements-optional.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# We support a Genshi treewalker that can be used to serialize Genshi
# streams.
genshi

# DATrie can be used in place of our Python trie implementation for
# slightly better parsing performance.
datrie