commit f78546272c
parent 387db16a78

[compat] compat_etree_fromstring: also decode the text attribute

Deletes parse_xml from utils, since compat_etree_fromstring now does the same thing.

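For context, a minimal standard-library sketch (not part of the diff) of the quirk this commit works around: with the stock ElementTree on Python 2, attribute values and element text are not always unicode objects, while Python 3 always returns str.

    # Illustrative only; the types below are the Python 2 behaviour that
    # compat_etree_fromstring is meant to hide.
    import xml.etree.ElementTree as ET

    doc = ET.fromstring(b'<root foo="bar"><normal>foo</normal></root>')
    # On Python 2 these can be byte strings; on Python 3 they are always unicode.
    print(type(doc.attrib['foo']))
    print(type(doc.find('normal').text))
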
@@ -74,10 +74,19 @@ def test_compat_shlex_split(self):
         self.assertEqual(compat_shlex_split('-option "one two"'), ['-option', 'one two'])
 
     def test_compat_etree_fromstring(self):
-        xml = '<el foo="bar" spam="中文"></el>'
+        xml = '''
+            <root foo="bar" spam="中文">
+                <normal>foo</normal>
+                <chinese>中文</chinese>
+                <foo><bar>spam</bar></foo>
+            </root>
+        '''
         doc = compat_etree_fromstring(xml.encode('utf-8'))
         self.assertTrue(isinstance(doc.attrib['foo'], compat_str))
         self.assertTrue(isinstance(doc.attrib['spam'], compat_str))
+        self.assertTrue(isinstance(doc.find('normal').text, compat_str))
+        self.assertTrue(isinstance(doc.find('chinese').text, compat_str))
+        self.assertTrue(isinstance(doc.find('foo/bar').text, compat_str))
 
 if __name__ == '__main__':
     unittest.main()

@@ -216,9 +216,19 @@ def data_open(self, req):
 if sys.version_info[0] >= 3:
     compat_etree_fromstring = xml.etree.ElementTree.fromstring
 else:
-    # on python 2.x the the attributes of a node aren't always unicode objects
+    # on python 2.x the attributes and text of a node aren't always unicode
+    # objects
     etree = xml.etree.ElementTree
 
+    try:
+        _etree_iter = etree.Element.iter
+    except AttributeError:  # Python <=2.6
+        def _etree_iter(root):
+            for el in root.findall('*'):
+                yield el
+                for sub in _etree_iter(el):
+                    yield sub
+
     # on 2.6 XML doesn't have a parser argument, function copied from CPython
     # 2.7 source
     def _XML(text, parser=None):

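The try/except added above prefers Element.iter and otherwise falls back to a recursive generator. Since findall('*') only returns direct children, the recursion is what reaches every descendant. A standalone sketch of the same idea (illustrative, not the diff's code):

    import xml.etree.ElementTree as ET

    def iter_descendants(root):
        # Same shape as the _etree_iter fallback: depth-first over all descendant
        # elements (unlike Element.iter, the root itself is not yielded).
        for el in root.findall('*'):
            yield el
            for sub in iter_descendants(el):
                yield sub

    tree = ET.fromstring('<a><b><c/></b><d/></a>')
    print([el.tag for el in iter_descendants(tree)])  # ['b', 'c', 'd']
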
@@ -235,7 +245,11 @@ def _element_factory(*args, **kwargs):
         return el
 
     def compat_etree_fromstring(text):
-        return _XML(text, parser=etree.XMLParser(target=etree.TreeBuilder(element_factory=_element_factory)))
+        doc = _XML(text, parser=etree.XMLParser(target=etree.TreeBuilder(element_factory=_element_factory)))
+        for el in _etree_iter(doc):
+            if el.text is not None and isinstance(el.text, bytes):
+                el.text = el.text.decode('utf-8')
+        return doc
 
 try:
     from urllib.parse import parse_qs as compat_parse_qs

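With both compat hunks applied, the Python 2 branch matches Python 3: callers pass UTF-8 bytes and get compat_str back for attribute values and, after this commit, element text. A usage sketch mirroring the updated test (the import path is an assumption; the page does not name the files):

    from youtube_dl.compat import compat_etree_fromstring, compat_str

    doc = compat_etree_fromstring('<root foo="bar"><normal>foo</normal></root>'.encode('utf-8'))
    assert isinstance(doc.attrib['foo'], compat_str)         # attributes were already decoded
    assert isinstance(doc.find('normal').text, compat_str)   # text is decoded as of this commit
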
@@ -14,8 +14,8 @@
     parse_duration,
     unified_strdate,
     xpath_text,
-    parse_xml,
 )
+from ..compat import compat_etree_fromstring
 
 
 class ARDMediathekIE(InfoExtractor):

@@ -161,7 +161,7 @@ def _real_extract(self, url):
             raise ExtractorError('This program is only suitable for those aged 12 and older. Video %s is therefore only available between 20 pm and 6 am.' % video_id, expected=True)
 
         if re.search(r'[\?&]rss($|[=&])', url):
-            doc = parse_xml(webpage)
+            doc = compat_etree_fromstring(webpage.encode('utf-8'))
             if doc.tag == 'rss':
                 return GenericIE()._extract_rss(url, video_id, doc)
 

@@ -9,6 +9,7 @@
 from .common import InfoExtractor
 from .youtube import YoutubeIE
 from ..compat import (
+    compat_etree_fromstring,
     compat_urllib_parse_unquote,
     compat_urllib_request,
     compat_urlparse,

@@ -21,7 +22,6 @@
     HEADRequest,
     is_html,
     orderedSet,
-    parse_xml,
     smuggle_url,
     unescapeHTML,
     unified_strdate,

@@ -1237,7 +1237,7 @@ def _real_extract(self, url):
 
         # Is it an RSS feed, a SMIL file or a XSPF playlist?
         try:
-            doc = parse_xml(webpage)
+            doc = compat_etree_fromstring(webpage.encode('utf-8'))
             if doc.tag == 'rss':
                 return self._extract_rss(url, video_id, doc)
             elif re.match(r'^(?:{[^}]+})?smil$', doc.tag):

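The context lines in the hunk above show the detection pattern the extractor uses: parse the downloaded page and dispatch on the root tag. A simplified standalone sketch (stdlib parser here; the extractor itself calls compat_etree_fromstring):

    import re
    import xml.etree.ElementTree as ET

    def classify(webpage):
        doc = ET.fromstring(webpage.encode('utf-8'))
        if doc.tag == 'rss':
            return 'rss'
        if re.match(r'^(?:{[^}]+})?smil$', doc.tag):  # tolerate a namespaced <smil> root
            return 'smil'
        return 'other'

    print(classify('<rss version="2.0"><channel></channel></rss>'))  # -> rss
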
@@ -1652,29 +1652,6 @@ def encode_dict(d, encoding='utf-8'):
     return dict((k.encode(encoding), v.encode(encoding)) for k, v in d.items())
 
 
-try:
-    etree_iter = xml.etree.ElementTree.Element.iter
-except AttributeError:  # Python <=2.6
-    etree_iter = lambda n: n.findall('.//*')
-
-
-def parse_xml(s):
-    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
-        def doctype(self, name, pubid, system):
-            pass  # Ignore doctypes
-
-    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
-    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
-    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
-    # Fix up XML parser in Python 2.x
-    if sys.version_info < (3, 0):
-        for n in etree_iter(tree):
-            if n.text is not None:
-                if not isinstance(n.text, compat_str):
-                    n.text = n.text.decode('utf-8')
-    return tree
-
-
 US_RATINGS = {
     'G': 0,
     'PG': 10,

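The removed helper encoded its unicode input to UTF-8 internally; with it gone, call sites encode explicitly and rely on compat_etree_fromstring for the Python 2 decoding, as the extractor hunks above show. A before/after sketch (import path assumed):

    from youtube_dl.compat import compat_etree_fromstring

    webpage = '<rss version="2.0"><channel></channel></rss>'
    # before: doc = parse_xml(webpage)                       (utils helper, removed here)
    doc = compat_etree_fromstring(webpage.encode('utf-8'))   # caller encodes explicitly
    print(doc.tag)  # rss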