diff --git a/html5lib/ihatexml.py b/html5lib/ihatexml.py
index bf17bb5e..0fc79308 100644
--- a/html5lib/ihatexml.py
+++ b/html5lib/ihatexml.py
@@ -179,6 +179,9 @@ def escapeRegexp(string):
nonXmlNameFirstBMPRegexp = re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3
040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
+# Matches any character that is NOT allowed in an XML public identifier
+# (the PubidChar production: space, CR, LF, ASCII letters and digits, and
+# -'()+,./:=?;!*#@$_%).  The backslash before "-" is written as "\\" so the
+# string literal contains no invalid escape sequence; the compiled pattern
+# still treats the dash as a literal inside the character class.
+nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\\-'()+,./:=?;!*#@$_%]")
+
class InfosetFilter(object):
replacementRegexp = re.compile(r"U[\dA-F]{5,5}")
@@ -188,7 +191,8 @@ def __init__(self, replaceChars=None,
dropXmlnsAttrNs=False,
preventDoubleDashComments=False,
preventDashAtCommentEnd=False,
- replaceFormFeedCharacters=True):
+ replaceFormFeedCharacters=True,
+ preventSingleQuotePubid=False):
self.dropXmlnsLocalName = dropXmlnsLocalName
self.dropXmlnsAttrNs = dropXmlnsAttrNs
@@ -198,6 +202,8 @@ def __init__(self, replaceChars=None,
self.replaceFormFeedCharacters = replaceFormFeedCharacters
+ self.preventSingleQuotePubid = preventSingleQuotePubid
+
self.replaceCache = {}
def coerceAttribute(self, name, namespace=None):
@@ -229,6 +235,17 @@ def coerceCharacters(self, data):
# Other non-xml characters
return data
+    def coercePubid(self, data):
+        """Return ``data`` with characters unsuitable for a pubid replaced.
+
+        Every match of ``nonPubidCharRegexp`` in ``data`` triggers a
+        ``DataLossWarning`` and is replaced by the string that
+        ``getReplacementCharacter`` returns for it.  When
+        ``preventSingleQuotePubid`` is true, single quotes are replaced in
+        the same way (with their own warning).
+        """
+        coerced = data
+        for invalidChar in nonPubidCharRegexp.findall(data):
+            warnings.warn("Coercing non-XML pubid", DataLossWarning)
+            coerced = coerced.replace(
+                invalidChar, self.getReplacementCharacter(invalidChar))
+
+        # A single quote is matched as valid by the regexp above, so it is
+        # only rewritten when the caller explicitly asked for that.
+        if self.preventSingleQuotePubid and "'" in coerced:
+            warnings.warn("Pubid cannot contain single quote", DataLossWarning)
+            coerced = coerced.replace("'", self.getReplacementCharacter("'"))
+        return coerced
+
def toXmlName(self, name):
nameFirst = name[0]
nameRest = name[1:]
@@ -260,7 +277,7 @@ def fromXmlName(self, name):
return name
def escapeChar(self, char):
- replacement = "U" + hex(ord(char))[2:].upper().rjust(5, "0")
+ # "U" followed by the code point in upper-case hex, zero-padded to at
+ # least five digits; the result is stored in replaceCache before returning.
+ replacement = "U%05X" % ord(char)
self.replaceCache[char] = replacement
return replacement
diff --git a/html5lib/tests/test_parser.py b/html5lib/tests/test_parser.py
index 35e0fd5e..ec363fe8 100644
--- a/html5lib/tests/test_parser.py
+++ b/html5lib/tests/test_parser.py
@@ -27,28 +27,25 @@ def convertTreeDump(data):
def runParserTest(innerHTML, input, expected, errors, treeClass,
namespaceHTMLElements):
- warnings.resetwarnings()
- warnings.simplefilter("error")
- # XXX - move this out into the setup function
- # concatenate all consecutive character tokens into a single token
- try:
+ with warnings.catch_warnings(record=True) as w:
+ warnings.simplefilter("always")
p = html5parser.HTMLParser(tree=treeClass,
namespaceHTMLElements=namespaceHTMLElements)
- except constants.DataLossWarning:
- return
- try:
- if innerHTML:
- document = p.parseFragment(input, innerHTML)
- else:
- try:
+ try:
+ if innerHTML:
+ document = p.parseFragment(input, innerHTML)
+ else:
document = p.parse(input)
- except constants.DataLossWarning:
- return
- except:
- errorMsg = "\n".join(["\n\nInput:", input, "\nExpected:", expected,
- "\nTraceback:", traceback.format_exc()])
- assert False, errorMsg
+ except:
+ errorMsg = "\n".join(["\n\nInput:", input, "\nExpected:", expected,
+ "\nTraceback:", traceback.format_exc()])
+ assert False, errorMsg
+
+ otherW = [x for x in w if not issubclass(x.category, constants.DataLossWarning)]
+ assert len(otherW) == 0, [(x.category, x.message) for x in otherW]
+ if len(w):
+ return
output = convertTreeDump(p.tree.testSerializer(document))
diff --git a/html5lib/treebuilders/etree_lxml.py b/html5lib/treebuilders/etree_lxml.py
index aa46ea49..35d08efa 100644
--- a/html5lib/treebuilders/etree_lxml.py
+++ b/html5lib/treebuilders/etree_lxml.py
@@ -291,11 +291,16 @@ def insertDoctype(self, token):
publicId = token["publicId"]
systemId = token["systemId"]
- if not name or ihatexml.nonXmlNameBMPRegexp.search(name) or name[0] == '"':
- warnings.warn("lxml cannot represent null or non-xml doctype", DataLossWarning)
+ if not name:
+ warnings.warn("lxml cannot represent empty doctype", DataLossWarning)
+ self.doctype = None
+ else:
+ coercedName = self.infosetFilter.coerceElement(name)
+ if coercedName != name:
+ warnings.warn("lxml cannot represent non-xml doctype", DataLossWarning)
- doctype = self.doctypeClass(name, publicId, systemId)
- self.doctype = doctype
+ doctype = self.doctypeClass(coercedName, publicId, systemId)
+ self.doctype = doctype
def insertCommentInitial(self, data, parent=None):
self.initial_comments.append(data)
@@ -313,12 +318,24 @@ def insertRoot(self, token):
# Therefore we need to use the built-in parser to create our iniial
# tree, after which we can add elements like normal
docStr = ""
- if self.doctype and self.doctype.name and not self.doctype.name.startswith('"'):
+ if self.doctype:
+ assert self.doctype.name
docStr += "= 0 and sysid.find('"') >= 0:
+ warnings.warn("DOCTYPE system cannot contain single and double quotes", DataLossWarning)
+ sysid = sysid.replace("'", 'U00027')
+ if sysid.find("'") >= 0:
+ docStr += '"%s"' % sysid
+ else:
+ docStr += "'%s'" % sysid
+ else:
+ docStr += "''"
docStr += ">"
if self.doctype.name != token["name"]:
warnings.warn("lxml cannot represent doctype with a different name to the root element", DataLossWarning)