Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 16 additions & 5 deletions html5lib/serializer/htmlserializer.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,15 +92,17 @@ class HTMLSerializer(object):
resolve_entities = True

# miscellaneous options
alphabetical_attributes = False
inject_meta_charset = True
strip_whitespace = False
sanitize = False

options = ("quote_attr_values", "quote_char", "use_best_quote_char",
"minimize_boolean_attributes", "use_trailing_solidus",
"space_before_trailing_solidus", "omit_optional_tags",
"strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs",
"escape_rcdata", "resolve_entities", "sanitize")
"omit_optional_tags", "minimize_boolean_attributes",
"use_trailing_solidus", "space_before_trailing_solidus",
"escape_lt_in_attrs", "escape_rcdata", "resolve_entities",
"alphabetical_attributes", "inject_meta_charset",
"strip_whitespace", "sanitize")

def __init__(self, **kwargs):
"""Initialize HTMLSerializer.
Expand Down Expand Up @@ -143,6 +145,8 @@ def __init__(self, **kwargs):
See `html5lib user documentation`_
omit_optional_tags=True|False
Omit start/end tags that are optional.
alphabetical_attributes=False|True
Reorder attributes to be in alphabetical order.

.. _html5lib user documentation: http://code.google.com/p/html5lib/wiki/UserDocumentation
"""
Expand Down Expand Up @@ -171,10 +175,11 @@ def serialize(self, treewalker, encoding=None):
self.encoding = encoding
in_cdata = False
self.errors = []

if encoding and self.inject_meta_charset:
from ..filters.inject_meta_charset import Filter
treewalker = Filter(treewalker, encoding)
# XXX: WhitespaceFilter should be used before OptionalTagFilter
# WhitespaceFilter should be used before OptionalTagFilter
# for maximum efficiency of the latter filter
if self.strip_whitespace:
from ..filters.whitespace import Filter
Expand All @@ -185,6 +190,12 @@ def serialize(self, treewalker, encoding=None):
if self.omit_optional_tags:
from ..filters.optionaltags import Filter
treewalker = Filter(treewalker)
# Alphabetical attributes must be last, as other filters
# could add attributes and alter the order
if self.alphabetical_attributes:
from ..filters.alphabeticalattributes import Filter
treewalker = Filter(treewalker)

for token in treewalker:
type = token["type"]
if type == "Doctype":
Expand Down
17 changes: 9 additions & 8 deletions html5lib/tests/test_serializer.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@
unittest.TestCase.assertEqual = unittest.TestCase.assertEquals

import html5lib
from html5lib import serializer, constants
from html5lib.filters.alphabeticalattributes import Filter as AlphabeticalAttributesFilter
from html5lib import constants
from html5lib.serializer import HTMLSerializer, serialize
from html5lib.treewalkers._base import TreeWalker

optionals_loaded = []
Expand Down Expand Up @@ -82,8 +82,9 @@ def _convertAttrib(self, attribs):

def serialize_html(input, options):
options = dict([(str(k), v) for k, v in options.items()])
stream = AlphabeticalAttributesFilter(JsonWalker(input))
return serializer.HTMLSerializer(**options).render(stream, options.get("encoding", None))
stream = JsonWalker(input)
serializer = HTMLSerializer(alphabetical_attributes=True, **options)
return serializer.render(stream, options.get("encoding", None))


def runSerializerTest(input, expected, options):
Expand Down Expand Up @@ -147,24 +148,24 @@ class LxmlTestCase(unittest.TestCase):
def setUp(self):
self.parser = etree.XMLParser(resolve_entities=False)
self.treewalker = html5lib.getTreeWalker("lxml")
self.serializer = serializer.HTMLSerializer()
self.serializer = HTMLSerializer()

def testEntityReplacement(self):
doc = """<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>"""
tree = etree.fromstring(doc, parser=self.parser).getroottree()
result = serializer.serialize(tree, tree="lxml", omit_optional_tags=False)
result = serialize(tree, tree="lxml", omit_optional_tags=False)
self.assertEqual("""<!DOCTYPE html SYSTEM "about:legacy-compat"><html>\u03B2</html>""", result)

def testEntityXML(self):
doc = """<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&gt;</html>"""
tree = etree.fromstring(doc, parser=self.parser).getroottree()
result = serializer.serialize(tree, tree="lxml", omit_optional_tags=False)
result = serialize(tree, tree="lxml", omit_optional_tags=False)
self.assertEqual("""<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&gt;</html>""", result)

def testEntityNoResolve(self):
doc = """<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>"""
tree = etree.fromstring(doc, parser=self.parser).getroottree()
result = serializer.serialize(tree, tree="lxml", omit_optional_tags=False,
result = serialize(tree, tree="lxml", omit_optional_tags=False,
resolve_entities=False)
self.assertEqual("""<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>""", result)

Expand Down