Use a wrapper around xml.etree.ElementTree.fromstring in python 2.x (#7178)

Attributes aren't unicode objects, so they couldn't be directly used in info_dict fields (for example '--write-description' doesn't work with bytes).
This commit is contained in:
Jaime Marquínez Ferrándiz 2015-10-25 20:04:55 +01:00
parent 755ff8d22c
commit 36e6f62cd0
11 changed files with 61 additions and 21 deletions

View file

@ -13,8 +13,10 @@
from youtube_dl.utils import get_filesystem_encoding from youtube_dl.utils import get_filesystem_encoding
from youtube_dl.compat import ( from youtube_dl.compat import (
compat_getenv, compat_getenv,
compat_etree_fromstring,
compat_expanduser, compat_expanduser,
compat_shlex_split, compat_shlex_split,
compat_str,
compat_urllib_parse_unquote, compat_urllib_parse_unquote,
compat_urllib_parse_unquote_plus, compat_urllib_parse_unquote_plus,
) )
@ -71,5 +73,10 @@ def test_compat_urllib_parse_unquote_plus(self):
def test_compat_shlex_split(self): def test_compat_shlex_split(self):
self.assertEqual(compat_shlex_split('-option "one two"'), ['-option', 'one two']) self.assertEqual(compat_shlex_split('-option "one two"'), ['-option', 'one two'])
def test_compat_etree_fromstring(self):
    # Parse a UTF-8 encoded document and check that the attribute value
    # is exposed as a compat_str instance.
    root = compat_etree_fromstring('<el foo="bar"></el>'.encode('utf-8'))
    foo_value = root.attrib['foo']
    self.assertTrue(isinstance(foo_value, compat_str))
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()

View file

@ -68,6 +68,9 @@
cli_valueless_option, cli_valueless_option,
cli_bool_option, cli_bool_option,
) )
from youtube_dl.compat import (
compat_etree_fromstring,
)
class TestUtil(unittest.TestCase): class TestUtil(unittest.TestCase):
@ -242,7 +245,7 @@ def test_find_xpath_attr(self):
<node x="b" y="d" /> <node x="b" y="d" />
<node x="" /> <node x="" />
</root>''' </root>'''
doc = xml.etree.ElementTree.fromstring(testxml) doc = compat_etree_fromstring(testxml)
self.assertEqual(find_xpath_attr(doc, './/fourohfour', 'n'), None) self.assertEqual(find_xpath_attr(doc, './/fourohfour', 'n'), None)
self.assertEqual(find_xpath_attr(doc, './/fourohfour', 'n', 'v'), None) self.assertEqual(find_xpath_attr(doc, './/fourohfour', 'n', 'v'), None)
@ -263,7 +266,7 @@ def test_xpath_with_ns(self):
<url>http://server.com/download.mp3</url> <url>http://server.com/download.mp3</url>
</media:song> </media:song>
</root>''' </root>'''
doc = xml.etree.ElementTree.fromstring(testxml) doc = compat_etree_fromstring(testxml)
find = lambda p: doc.find(xpath_with_ns(p, {'media': 'http://example.com/'})) find = lambda p: doc.find(xpath_with_ns(p, {'media': 'http://example.com/'}))
self.assertTrue(find('media:song') is not None) self.assertTrue(find('media:song') is not None)
self.assertEqual(find('media:song/media:author').text, 'The Author') self.assertEqual(find('media:song/media:author').text, 'The Author')
@ -285,7 +288,7 @@ def test_xpath_text(self):
<p>Foo</p> <p>Foo</p>
</div> </div>
</root>''' </root>'''
doc = xml.etree.ElementTree.fromstring(testxml) doc = compat_etree_fromstring(testxml)
self.assertEqual(xpath_text(doc, 'div/p'), 'Foo') self.assertEqual(xpath_text(doc, 'div/p'), 'Foo')
self.assertEqual(xpath_text(doc, 'div/bar', default='default'), 'default') self.assertEqual(xpath_text(doc, 'div/bar', default='default'), 'default')
self.assertTrue(xpath_text(doc, 'div/bar') is None) self.assertTrue(xpath_text(doc, 'div/bar') is None)
@ -297,7 +300,7 @@ def test_xpath_attr(self):
<p x="a">Foo</p> <p x="a">Foo</p>
</div> </div>
</root>''' </root>'''
doc = xml.etree.ElementTree.fromstring(testxml) doc = compat_etree_fromstring(testxml)
self.assertEqual(xpath_attr(doc, 'div/p', 'x'), 'a') self.assertEqual(xpath_attr(doc, 'div/p', 'x'), 'a')
self.assertEqual(xpath_attr(doc, 'div/bar', 'x'), None) self.assertEqual(xpath_attr(doc, 'div/bar', 'x'), None)
self.assertEqual(xpath_attr(doc, 'div/p', 'y'), None) self.assertEqual(xpath_attr(doc, 'div/p', 'y'), None)

View file

@ -14,6 +14,7 @@
import subprocess import subprocess
import sys import sys
import itertools import itertools
import xml.etree.ElementTree
try: try:
@ -212,6 +213,29 @@ def data_open(self, req):
except ImportError: # Python 2.6 except ImportError: # Python 2.6
from xml.parsers.expat import ExpatError as compat_xml_parse_error from xml.parsers.expat import ExpatError as compat_xml_parse_error
if sys.version_info[0] >= 3:
    # No wrapper is needed on Python 3; use the stdlib function directly.
    compat_etree_fromstring = xml.etree.ElementTree.fromstring
else:
    # On Python 2.x the attributes of a node can be str (byte string)
    # objects instead of unicode, so they are decoded while the tree is
    # being built.
    etree = xml.etree.ElementTree

    # On 2.6 XML doesn't have a parser argument; function copied from the
    # CPython 2.7 source.
    def _XML(text, parser=None):
        """Parse *text* with *parser* (a default XMLParser if omitted) and
        return the root element."""
        if not parser:
            parser = etree.XMLParser(target=etree.TreeBuilder())
        parser.feed(text)
        return parser.close()

    def _element_factory(*args, **kwargs):
        """Create an Element whose byte-string attribute values are decoded
        from UTF-8 to unicode."""
        el = etree.Element(*args, **kwargs)
        for k, v in el.items():
            # Only byte strings need decoding. Calling .decode() on a value
            # that is already unicode makes Python 2 implicitly encode it
            # as ASCII first, which raises UnicodeEncodeError for non-ASCII
            # attribute values.
            if isinstance(v, bytes):
                el.set(k, v.decode('utf-8'))
        return el

    def compat_etree_fromstring(text):
        """Parse the XML document *text* and return its root element, with
        attribute values as unicode objects."""
        return _XML(text, parser=etree.XMLParser(target=etree.TreeBuilder(element_factory=_element_factory)))
try: try:
from urllib.parse import parse_qs as compat_parse_qs from urllib.parse import parse_qs as compat_parse_qs
@ -507,6 +531,7 @@ def compat_itertools_count(start=0, step=1):
'compat_chr', 'compat_chr',
'compat_cookiejar', 'compat_cookiejar',
'compat_cookies', 'compat_cookies',
'compat_etree_fromstring',
'compat_expanduser', 'compat_expanduser',
'compat_get_terminal_size', 'compat_get_terminal_size',
'compat_getenv', 'compat_getenv',

View file

@ -5,10 +5,10 @@
import itertools import itertools
import os import os
import time import time
import xml.etree.ElementTree as etree
from .fragment import FragmentFD from .fragment import FragmentFD
from ..compat import ( from ..compat import (
compat_etree_fromstring,
compat_urlparse, compat_urlparse,
compat_urllib_error, compat_urllib_error,
compat_urllib_parse_urlparse, compat_urllib_parse_urlparse,
@ -290,7 +290,7 @@ def real_download(self, filename, info_dict):
man_url = urlh.geturl() man_url = urlh.geturl()
manifest = urlh.read() manifest = urlh.read()
doc = etree.fromstring(manifest) doc = compat_etree_fromstring(manifest)
formats = [(int(f.attrib.get('bitrate', -1)), f) formats = [(int(f.attrib.get('bitrate', -1)), f)
for f in self._get_unencrypted_media(doc)] for f in self._get_unencrypted_media(doc)]
if requested_bitrate is None: if requested_bitrate is None:

View file

@ -2,7 +2,6 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import re import re
import xml.etree.ElementTree
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
@ -14,7 +13,10 @@
remove_end, remove_end,
unescapeHTML, unescapeHTML,
) )
from ..compat import compat_HTTPError from ..compat import (
compat_etree_fromstring,
compat_HTTPError,
)
class BBCCoUkIE(InfoExtractor): class BBCCoUkIE(InfoExtractor):
@ -344,7 +346,7 @@ def _download_media_selector_url(self, url, programme_id=None):
url, programme_id, 'Downloading media selection XML') url, programme_id, 'Downloading media selection XML')
except ExtractorError as ee: except ExtractorError as ee:
if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403:
media_selection = xml.etree.ElementTree.fromstring(ee.cause.read().decode('utf-8')) media_selection = compat_etree_fromstring(ee.cause.read().decode('utf-8'))
else: else:
raise raise
return self._process_media_selector(media_selection, programme_id) return self._process_media_selector(media_selection, programme_id)

View file

@ -4,9 +4,11 @@
import re import re
import itertools import itertools
import json import json
import xml.etree.ElementTree as ET
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import (
compat_etree_fromstring,
)
from ..utils import ( from ..utils import (
int_or_none, int_or_none,
unified_strdate, unified_strdate,
@ -88,7 +90,7 @@ def _real_extract(self, url):
except ValueError: except ValueError:
pass pass
lq_doc = ET.fromstring(lq_page) lq_doc = compat_etree_fromstring(lq_page)
lq_durls = lq_doc.findall('./durl') lq_durls = lq_doc.findall('./durl')
hq_doc = self._download_xml( hq_doc = self._download_xml(

View file

@ -3,10 +3,10 @@
import re import re
import json import json
import xml.etree.ElementTree
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import ( from ..compat import (
compat_etree_fromstring,
compat_parse_qs, compat_parse_qs,
compat_str, compat_str,
compat_urllib_parse, compat_urllib_parse,
@ -119,7 +119,7 @@ def _build_brighcove_url(cls, object_str):
object_str = fix_xml_ampersands(object_str) object_str = fix_xml_ampersands(object_str)
try: try:
object_doc = xml.etree.ElementTree.fromstring(object_str.encode('utf-8')) object_doc = compat_etree_fromstring(object_str.encode('utf-8'))
except compat_xml_parse_error: except compat_xml_parse_error:
return return

View file

@ -10,7 +10,6 @@
import socket import socket
import sys import sys
import time import time
import xml.etree.ElementTree
from ..compat import ( from ..compat import (
compat_cookiejar, compat_cookiejar,
@ -23,6 +22,7 @@
compat_urllib_request, compat_urllib_request,
compat_urlparse, compat_urlparse,
compat_str, compat_str,
compat_etree_fromstring,
) )
from ..utils import ( from ..utils import (
NO_DEFAULT, NO_DEFAULT,
@ -461,7 +461,7 @@ def _download_xml(self, url_or_request, video_id,
return xml_string return xml_string
if transform_source: if transform_source:
xml_string = transform_source(xml_string) xml_string = transform_source(xml_string)
return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8')) return compat_etree_fromstring(xml_string.encode('utf-8'))
def _download_json(self, url_or_request, video_id, def _download_json(self, url_or_request, video_id,
note='Downloading JSON metadata', note='Downloading JSON metadata',

View file

@ -5,12 +5,12 @@
import json import json
import base64 import base64
import zlib import zlib
import xml.etree.ElementTree
from hashlib import sha1 from hashlib import sha1
from math import pow, sqrt, floor from math import pow, sqrt, floor
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import ( from ..compat import (
compat_etree_fromstring,
compat_urllib_parse, compat_urllib_parse,
compat_urllib_parse_unquote, compat_urllib_parse_unquote,
compat_urllib_request, compat_urllib_request,
@ -234,7 +234,7 @@ def ass_bool(strvalue):
return output return output
def _extract_subtitles(self, subtitle): def _extract_subtitles(self, subtitle):
sub_root = xml.etree.ElementTree.fromstring(subtitle) sub_root = compat_etree_fromstring(subtitle)
return [{ return [{
'ext': 'srt', 'ext': 'srt',
'data': self._convert_subtitles_to_srt(sub_root), 'data': self._convert_subtitles_to_srt(sub_root),

View file

@ -1,10 +1,10 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import re import re
import xml.etree.ElementTree
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import ( from ..compat import (
compat_etree_fromstring,
compat_urllib_request, compat_urllib_request,
) )
from ..utils import ( from ..utils import (
@ -97,7 +97,7 @@ def _formats_from_json(self, video_info):
if last_version['version'] == -1: if last_version['version'] == -1:
raise ExtractorError('Unable to extract last version of the video') raise ExtractorError('Unable to extract last version of the video')
renditions = xml.etree.ElementTree.fromstring(last_version['data']) renditions = compat_etree_fromstring(last_version['data'])
formats = [] formats = []
# Already sorted from worst to best quality # Already sorted from worst to best quality
for rend in renditions.findall('rendition'): for rend in renditions.findall('rendition'):
@ -114,7 +114,7 @@ def _formats_from_json(self, video_info):
def _formats_from_smil(self, smil_xml): def _formats_from_smil(self, smil_xml):
formats = [] formats = []
smil_doc = xml.etree.ElementTree.fromstring(smil_xml.encode('utf-8')) smil_doc = compat_etree_fromstring(smil_xml.encode('utf-8'))
els = smil_doc.findall('.//{http://www.w3.org/2001/SMIL20/Language}video') els = smil_doc.findall('.//{http://www.w3.org/2001/SMIL20/Language}video')
for el in els: for el in els:
src = el.attrib['src'] src = el.attrib['src']

View file

@ -36,6 +36,7 @@
from .compat import ( from .compat import (
compat_basestring, compat_basestring,
compat_chr, compat_chr,
compat_etree_fromstring,
compat_html_entities, compat_html_entities,
compat_http_client, compat_http_client,
compat_kwargs, compat_kwargs,
@ -1974,7 +1975,7 @@ def parse_node(node):
return out return out
dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8')) dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
out = [] out = []
paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p') paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')