Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 19 additions & 2 deletions html5lib/ihatexml.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,9 @@ def escapeRegexp(string):

nonXmlNameFirstBMPRegexp = re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')

# Simpler things
nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\-\'()+,./:=?;!*#@$_%]")


class InfosetFilter(object):
replacementRegexp = re.compile(r"U[\dA-F]{5,5}")
Expand All @@ -188,7 +191,8 @@ def __init__(self, replaceChars=None,
dropXmlnsAttrNs=False,
preventDoubleDashComments=False,
preventDashAtCommentEnd=False,
replaceFormFeedCharacters=True):
replaceFormFeedCharacters=True,
preventSingleQuotePubid=False):

self.dropXmlnsLocalName = dropXmlnsLocalName
self.dropXmlnsAttrNs = dropXmlnsAttrNs
Expand All @@ -198,6 +202,8 @@ def __init__(self, replaceChars=None,

self.replaceFormFeedCharacters = replaceFormFeedCharacters

self.preventSingleQuotePubid = preventSingleQuotePubid

self.replaceCache = {}

def coerceAttribute(self, name, namespace=None):
Expand Down Expand Up @@ -229,6 +235,17 @@ def coerceCharacters(self, data):
# Other non-xml characters
return data

def coercePubid(self, data):
dataOutput = data
for char in nonPubidCharRegexp.findall(data):
warnings.warn("Coercing non-XML pubid", DataLossWarning)
replacement = self.getReplacementCharacter(char)
dataOutput = dataOutput.replace(char, replacement)
if self.preventSingleQuotePubid and dataOutput.find("'") >= 0:
warnings.warn("Pubid cannot contain single quote", DataLossWarning)
dataOutput = dataOutput.replace("'", self.getReplacementCharacter("'"))
return dataOutput

def toXmlName(self, name):
nameFirst = name[0]
nameRest = name[1:]
Expand Down Expand Up @@ -260,7 +277,7 @@ def fromXmlName(self, name):
return name

def escapeChar(self, char):
replacement = "U" + hex(ord(char))[2:].upper().rjust(5, "0")
replacement = "U%05X" % ord(char)
self.replaceCache[char] = replacement
return replacement

Expand Down
33 changes: 15 additions & 18 deletions html5lib/tests/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,28 +27,25 @@ def convertTreeDump(data):

def runParserTest(innerHTML, input, expected, errors, treeClass,
namespaceHTMLElements):
warnings.resetwarnings()
warnings.simplefilter("error")
# XXX - move this out into the setup function
# concatenate all consecutive character tokens into a single token
try:
with warnings.catch_warnings(record=True) as w:
warnings.simplefilter("always")
p = html5parser.HTMLParser(tree=treeClass,
namespaceHTMLElements=namespaceHTMLElements)
except constants.DataLossWarning:
return

try:
if innerHTML:
document = p.parseFragment(input, innerHTML)
else:
try:
try:
if innerHTML:
document = p.parseFragment(input, innerHTML)
else:
document = p.parse(input)
except constants.DataLossWarning:
return
except:
errorMsg = "\n".join(["\n\nInput:", input, "\nExpected:", expected,
"\nTraceback:", traceback.format_exc()])
assert False, errorMsg
except:
errorMsg = "\n".join(["\n\nInput:", input, "\nExpected:", expected,
"\nTraceback:", traceback.format_exc()])
assert False, errorMsg

otherW = [x for x in w if not issubclass(x.category, constants.DataLossWarning)]
assert len(otherW) == 0, [(x.category, x.message) for x in otherW]
if len(w):
return

output = convertTreeDump(p.tree.testSerializer(document))

Expand Down
31 changes: 24 additions & 7 deletions html5lib/treebuilders/etree_lxml.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,11 +291,16 @@ def insertDoctype(self, token):
publicId = token["publicId"]
systemId = token["systemId"]

if not name or ihatexml.nonXmlNameBMPRegexp.search(name) or name[0] == '"':
warnings.warn("lxml cannot represent null or non-xml doctype", DataLossWarning)
if not name:
warnings.warn("lxml cannot represent empty doctype", DataLossWarning)
self.doctype = None
else:
coercedName = self.infosetFilter.coerceElement(name)
if coercedName != name:
warnings.warn("lxml cannot represent non-xml doctype", DataLossWarning)

doctype = self.doctypeClass(name, publicId, systemId)
self.doctype = doctype
doctype = self.doctypeClass(coercedName, publicId, systemId)
self.doctype = doctype

def insertCommentInitial(self, data, parent=None):
self.initial_comments.append(data)
Expand All @@ -313,12 +318,24 @@ def insertRoot(self, token):
# Therefore we need to use the built-in parser to create our iniial
# tree, after which we can add elements like normal
docStr = ""
if self.doctype and self.doctype.name and not self.doctype.name.startswith('"'):
if self.doctype:
assert self.doctype.name
docStr += "<!DOCTYPE %s" % self.doctype.name
if (self.doctype.publicId is not None or
self.doctype.systemId is not None):
docStr += ' PUBLIC "%s" "%s"' % (self.doctype.publicId or "",
self.doctype.systemId or "")
docStr += (' PUBLIC "%s" ' %
(self.infosetFilter.coercePubid(self.doctype.publicId or "")))
if self.doctype.systemId:
sysid = self.doctype.systemId
if sysid.find("'") >= 0 and sysid.find('"') >= 0:
warnings.warn("DOCTYPE system cannot contain single and double quotes", DataLossWarning)
sysid = sysid.replace("'", 'U00027')
if sysid.find("'") >= 0:
docStr += '"%s"' % sysid
else:
docStr += "'%s'" % sysid
else:
docStr += "''"
docStr += ">"
if self.doctype.name != token["name"]:
warnings.warn("lxml cannot represent doctype with a different name to the root element", DataLossWarning)
Expand Down