diff --git a/html5lib/ihatexml.py b/html5lib/ihatexml.py index bf17bb5e..0fc79308 100644 --- a/html5lib/ihatexml.py +++ b/html5lib/ihatexml.py @@ -179,6 +179,9 @@ def escapeRegexp(string): nonXmlNameFirstBMPRegexp = re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') +# Simpler things +nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\-\'()+,./:=?;!*#@$_%]") + class InfosetFilter(object): replacementRegexp = re.compile(r"U[\dA-F]{5,5}") @@ -188,7 +191,8 @@ def __init__(self, replaceChars=None, dropXmlnsAttrNs=False, preventDoubleDashComments=False, preventDashAtCommentEnd=False, - replaceFormFeedCharacters=True): + replaceFormFeedCharacters=True, + preventSingleQuotePubid=False): self.dropXmlnsLocalName = dropXmlnsLocalName self.dropXmlnsAttrNs = dropXmlnsAttrNs @@ -198,6 +202,8 @@ def __init__(self, replaceChars=None, self.replaceFormFeedCharacters = replaceFormFeedCharacters + self.preventSingleQuotePubid = preventSingleQuotePubid + self.replaceCache = {} def coerceAttribute(self, name, namespace=None): @@ -229,6 +235,17 @@ def coerceCharacters(self, data): # Other non-xml characters return data + def coercePubid(self, data): + dataOutput = data + for char in nonPubidCharRegexp.findall(data): + warnings.warn("Coercing non-XML pubid", DataLossWarning) + replacement = self.getReplacementCharacter(char) + dataOutput = dataOutput.replace(char, replacement) + if self.preventSingleQuotePubid and dataOutput.find("'") >= 0: + warnings.warn("Pubid cannot contain single quote", DataLossWarning) + dataOutput = dataOutput.replace("'", self.getReplacementCharacter("'")) + return dataOutput + def toXmlName(self, name): nameFirst = name[0] nameRest = name[1:] @@ -260,7 +277,7 @@ def fromXmlName(self, name): return name def escapeChar(self, char): - replacement = "U" + hex(ord(char))[2:].upper().rjust(5, "0") + replacement = "U%05X" % ord(char) self.replaceCache[char] = replacement return replacement diff --git a/html5lib/tests/test_parser.py b/html5lib/tests/test_parser.py index 35e0fd5e..ec363fe8 100644 --- a/html5lib/tests/test_parser.py +++ b/html5lib/tests/test_parser.py @@ -27,28 +27,25 @@ def convertTreeDump(data): def runParserTest(innerHTML, input, expected, errors, treeClass, namespaceHTMLElements): - warnings.resetwarnings() - warnings.simplefilter("error") - # XXX - move this out into the setup function - # concatenate all consecutive character tokens into a single token - try: + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") p = html5parser.HTMLParser(tree=treeClass, namespaceHTMLElements=namespaceHTMLElements) - except constants.DataLossWarning: - return - try: - if innerHTML: - document = p.parseFragment(input, innerHTML) - else: - try: + try: + if innerHTML: + document = p.parseFragment(input, innerHTML) + else: document = p.parse(input) - except constants.DataLossWarning: - return - except: - errorMsg = "\n".join(["\n\nInput:", input, "\nExpected:", expected, - "\nTraceback:", traceback.format_exc()]) - assert False, errorMsg + except: + errorMsg = "\n".join(["\n\nInput:", input, "\nExpected:", expected, + "\nTraceback:", traceback.format_exc()]) + assert False, errorMsg + + otherW = [x for x in w if not issubclass(x.category, constants.DataLossWarning)] + assert len(otherW) == 0, [(x.category, x.message) for x in otherW] + if len(w): + return output = convertTreeDump(p.tree.testSerializer(document)) diff --git a/html5lib/treebuilders/etree_lxml.py b/html5lib/treebuilders/etree_lxml.py index aa46ea49..35d08efa 100644 --- a/html5lib/treebuilders/etree_lxml.py +++ b/html5lib/treebuilders/etree_lxml.py @@ -291,11 +291,16 @@ def insertDoctype(self, token): publicId = token["publicId"] systemId = token["systemId"] - if not name or ihatexml.nonXmlNameBMPRegexp.search(name) or name[0] == '"': - warnings.warn("lxml cannot represent null or non-xml doctype", DataLossWarning) + if not name: + warnings.warn("lxml cannot represent empty doctype", DataLossWarning) + self.doctype = None + else: + coercedName = self.infosetFilter.coerceElement(name) + if coercedName != name: + warnings.warn("lxml cannot represent non-xml doctype", DataLossWarning) - doctype = self.doctypeClass(name, publicId, systemId) - self.doctype = doctype + doctype = self.doctypeClass(coercedName, publicId, systemId) + self.doctype = doctype def insertCommentInitial(self, data, parent=None): self.initial_comments.append(data) @@ -313,12 +318,24 @@ def insertRoot(self, token): # Therefore we need to use the built-in parser to create our iniial # tree, after which we can add elements like normal docStr = "" - if self.doctype and self.doctype.name and not self.doctype.name.startswith('"'): + if self.doctype: + assert self.doctype.name docStr += "= 0 and sysid.find('"') >= 0: + warnings.warn("DOCTYPE system cannot contain single and double quotes", DataLossWarning) + sysid = sysid.replace("'", 'U00027') + if sysid.find("'") >= 0: + docStr += '"%s"' % sysid + else: + docStr += "'%s'" % sysid + else: + docStr += "''" docStr += ">" if self.doctype.name != token["name"]: warnings.warn("lxml cannot represent doctype with a different name to the root element", DataLossWarning)