[utils] Decode HTML5 entities

Used in test_Vporn_1. Also related to #9270
This commit is contained in:
Yen Chi Hsuan 2016-06-10 15:11:55 +08:00
parent 9631a94fb5
commit 55b2f099c0
No known key found for this signature in database
GPG key ID: 3FDDD575826C5C30
2 changed files with 12 additions and 2 deletions

View file

@ -249,6 +249,8 @@ def test_unescape_html(self):
self.assertEqual(unescapeHTML('&#47;'), '/') self.assertEqual(unescapeHTML('&#47;'), '/')
self.assertEqual(unescapeHTML('&eacute;'), 'é') self.assertEqual(unescapeHTML('&eacute;'), 'é')
self.assertEqual(unescapeHTML('&#2013266066;'), '&#2013266066;') self.assertEqual(unescapeHTML('&#2013266066;'), '&#2013266066;')
# HTML5 entities
self.assertEqual(unescapeHTML('&period;&apos;'), '.\'')
def test_date_from_str(self): def test_date_from_str(self):
self.assertEqual(date_from_str('yesterday'), date_from_str('now-1day')) self.assertEqual(date_from_str('yesterday'), date_from_str('now-1day'))

View file

@ -39,6 +39,7 @@
compat_chr, compat_chr,
compat_etree_fromstring, compat_etree_fromstring,
compat_html_entities, compat_html_entities,
compat_html_entities_html5,
compat_http_client, compat_http_client,
compat_kwargs, compat_kwargs,
compat_parse_qs, compat_parse_qs,
@ -456,12 +457,19 @@ def orderedSet(iterable):
return res return res
def _htmlentity_transform(entity): def _htmlentity_transform(entity_with_semicolon):
"""Transforms an HTML entity to a character.""" """Transforms an HTML entity to a character."""
entity = entity_with_semicolon[:-1]
# Known non-numeric HTML entity # Known non-numeric HTML entity
if entity in compat_html_entities.name2codepoint: if entity in compat_html_entities.name2codepoint:
return compat_chr(compat_html_entities.name2codepoint[entity]) return compat_chr(compat_html_entities.name2codepoint[entity])
# TODO: HTML5 allows entities without a semicolon. For example,
# '&Eacuteric' should be decoded as 'Éric'.
if entity_with_semicolon in compat_html_entities_html5:
return compat_html_entities_html5[entity_with_semicolon]
mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity) mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
if mobj is not None: if mobj is not None:
numstr = mobj.group(1) numstr = mobj.group(1)
@ -486,7 +494,7 @@ def unescapeHTML(s):
assert type(s) == compat_str assert type(s) == compat_str
return re.sub( return re.sub(
r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s) r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
def get_subprocess_encoding(): def get_subprocess_encoding():