From da0d84258bf8163550f0c952393efb43d44ece17 Mon Sep 17 00:00:00 2001 From: Marcel Date: Thu, 17 Nov 2022 00:11:51 +0100 Subject: [PATCH 01/15] [test/test_utils] refactor test_get_element_text_and_html_by_tag() --- test/test_utils.py | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 3045b6d7e..334423619 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -4,6 +4,7 @@ import os import re import sys +import textwrap import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) @@ -1768,32 +1769,31 @@ Line 1 self.assertEqual(list(get_elements_text_and_html_by_attribute( 'class', 'foo', 'nicenice', tag='a')), [('nice', 'nice')]) - GET_ELEMENT_BY_TAG_TEST_STRING = ''' - random text lorem ipsum

-
- this should be returned - this should also be returned -
- this should also be returned -
- closing tag above should not trick, so this should also be returned -
- but this text should not be returned - ''' - GET_ELEMENT_BY_TAG_RES_OUTERDIV_HTML = GET_ELEMENT_BY_TAG_TEST_STRING.strip()[32:276] - GET_ELEMENT_BY_TAG_RES_OUTERDIV_TEXT = GET_ELEMENT_BY_TAG_RES_OUTERDIV_HTML[5:-6] - GET_ELEMENT_BY_TAG_RES_INNERSPAN_HTML = GET_ELEMENT_BY_TAG_TEST_STRING.strip()[78:119] - GET_ELEMENT_BY_TAG_RES_INNERSPAN_TEXT = GET_ELEMENT_BY_TAG_RES_INNERSPAN_HTML[6:-7] - def test_get_element_text_and_html_by_tag(self): - html = self.GET_ELEMENT_BY_TAG_TEST_STRING + get_element_by_tag_test_string = ''' + random text lorem ipsum

+
+ this should be returned + this should also be returned +
+ this should also be returned +
+ closing tag above should not trick, so this should also be returned +
+ but this text should not be returned + ''' + html = textwrap.indent(textwrap.dedent(get_element_by_tag_test_string), ' ' * 4) + get_element_by_tag_res_outerdiv_html = html.strip()[32:276] + get_element_by_tag_res_outerdiv_text = get_element_by_tag_res_outerdiv_html[5:-6] + get_element_by_tag_res_innerspan_html = html.strip()[78:119] + get_element_by_tag_res_innerspan_text = get_element_by_tag_res_innerspan_html[6:-7] self.assertEqual( get_element_text_and_html_by_tag('div', html), - (self.GET_ELEMENT_BY_TAG_RES_OUTERDIV_TEXT, self.GET_ELEMENT_BY_TAG_RES_OUTERDIV_HTML)) + (get_element_by_tag_res_outerdiv_text, get_element_by_tag_res_outerdiv_html)) self.assertEqual( get_element_text_and_html_by_tag('span', html), - (self.GET_ELEMENT_BY_TAG_RES_INNERSPAN_TEXT, self.GET_ELEMENT_BY_TAG_RES_INNERSPAN_HTML)) + (get_element_by_tag_res_innerspan_text, get_element_by_tag_res_innerspan_html)) self.assertRaises(compat_HTMLParseError, get_element_text_and_html_by_tag, 'article', html) def test_iri_to_uri(self): From af03fa454299de0a39fb31e257e02d269f7ef6b2 Mon Sep 17 00:00:00 2001 From: Marcel Date: Thu, 17 Nov 2022 01:20:25 +0100 Subject: [PATCH 02/15] [utils] more forgiving html parsing + unit tests --- test/test_utils.py | 19 +++++++++++++++++++ yt_dlp/utils.py | 12 +++++------- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 334423619..022e821a6 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1796,6 +1796,25 @@ Line 1 (get_element_by_tag_res_innerspan_text, get_element_by_tag_res_innerspan_html)) self.assertRaises(compat_HTMLParseError, get_element_text_and_html_by_tag, 'article', html) + def test_get_element_text_and_html_by_tag_malformed(self): + inner_text = 'inner_text' + malnested_elements = f'{inner_text}' + html = f'
{malnested_elements}
' + + self.assertEqual(get_element_text_and_html_by_tag('div', html), (malnested_elements, html)) + self.assertEqual( + get_element_text_and_html_by_tag('malnested_a', html), + (f'{inner_text}', + f'{inner_text}')) + self.assertEqual( + get_element_text_and_html_by_tag('malnested_b', html), + (f'{inner_text}', + f'{inner_text}')) + self.assertRaises( + compat_HTMLParseError, get_element_text_and_html_by_tag, 'orphan', f'{html}') + self.assertRaises( + compat_HTMLParseError, get_element_text_and_html_by_tag, 'orphan', f'{html}') + def test_iri_to_uri(self): self.assertEqual( iri_to_uri('https://www.google.com/search?q=foo&ie=utf-8&oe=utf-8&client=firefox-b'), diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 8c2c5593c..de058b0e6 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -466,17 +466,13 @@ class HTMLBreakOnClosingTagParser(html.parser.HTMLParser): pass def handle_starttag(self, tag, _): - self.tagstack.append(tag) + self.tagstack.appendleft(tag) def handle_endtag(self, tag): if not self.tagstack: raise compat_HTMLParseError('no tags in the stack') - while self.tagstack: - inner_tag = self.tagstack.pop() - if inner_tag == tag: - break - else: - raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found') + with contextlib.suppress(ValueError): + self.tagstack.remove(tag) if not self.tagstack: raise self.HTMLBreakOnClosingTagException() @@ -510,6 +506,8 @@ def get_element_text_and_html_by_tag(tag, html): next_closing_tag_end = next_closing_tag_start + len(closing_tag) try: parser.feed(html[offset:offset + next_closing_tag_end]) + if tag not in parser.tagstack: + raise HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException() offset += next_closing_tag_end except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException: return html[content_start:offset + next_closing_tag_start], \ From 5e3894df3fa043b1cd7bc731f5e5954bc17295e2 Mon Sep 17 00:00:00 2001 From: Marcel Date: Tue, 22 Nov 2022 14:07:14 +0100 Subject: [PATCH 03/15] [parsing] add new module containing various HTML parser classes as replacement for utils.get_html_... functions * performance is mostly better for large HTML data and on PyPy --- test/test_utils.py | 73 ++++++++++++++- yt_dlp/parsing.py | 219 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 288 insertions(+), 4 deletions(-) create mode 100644 yt_dlp/parsing.py diff --git a/test/test_utils.py b/test/test_utils.py index 022e821a6..d9a62258c 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -21,6 +21,14 @@ from yt_dlp.compat import ( compat_HTMLParseError, compat_os_name, ) +from yt_dlp.parsing import ( + HTMLTagParser, + FirstMatchingElementParser, +) + +# some testcases don't work with current functions +get_element_text_and_html_by_tag = FirstMatchingElementParser.get_element_text_and_html_by_tag + from yt_dlp.utils import ( Config, DateRange, @@ -60,7 +68,6 @@ from yt_dlp.utils import ( get_element_by_class, get_element_html_by_attribute, get_element_html_by_class, - get_element_text_and_html_by_tag, get_elements_by_attribute, get_elements_by_class, get_elements_html_by_attribute, @@ -1797,11 +1804,14 @@ Line 1 self.assertRaises(compat_HTMLParseError, get_element_text_and_html_by_tag, 'article', html) def test_get_element_text_and_html_by_tag_malformed(self): - inner_text = 'inner_text' + inner_text = 'inner text' malnested_elements = f'{inner_text}' - html = f'
{malnested_elements}
' + commented_html = '' + outerdiv_html = f'
{malnested_elements}
' + html = f'{commented_html}{outerdiv_html}' - self.assertEqual(get_element_text_and_html_by_tag('div', html), (malnested_elements, html)) + self.assertEqual( + get_element_text_and_html_by_tag('div', html), (malnested_elements, outerdiv_html)) self.assertEqual( get_element_text_and_html_by_tag('malnested_a', html), (f'{inner_text}', @@ -1815,6 +1825,61 @@ Line 1 self.assertRaises( compat_HTMLParseError, get_element_text_and_html_by_tag, 'orphan', f'{html}') + def test_strict_html_parsing(self): + class StrictTagParser(HTMLTagParser): + STRICT = True + + parser = StrictTagParser() + with self.assertRaisesRegex(compat_HTMLParseError, "stray closing tag 'p'"): + parser.taglist('

', reset=True) + with self.assertRaisesRegex(compat_HTMLParseError, "unclosed tag 'p', 'div'"): + parser.taglist('

', reset=True) + with self.assertRaisesRegex(compat_HTMLParseError, "malnested closing tag 'div', expected after '

'"): + parser.taglist('

', reset=True) + with self.assertRaisesRegex(compat_HTMLParseError, "malnested closing tag 'div', expected after '

'"): + parser.taglist('

/p>

', reset=True) + with self.assertRaisesRegex(compat_HTMLParseError, "malformed closing tag 'p<<'"): + parser.taglist('

', reset=True) + with self.assertRaisesRegex(compat_HTMLParseError, "stray closing tag 'img'"): + parser.taglist('must be empty', reset=True) + + def test_relaxed_html_parsing(self): + Tag = HTMLTagParser.Tag + parser = HTMLTagParser() + + self.assertEqual(parser.taglist('

', reset=True), []) + self.assertEqual(parser.taglist('

', reset=True), []) + + tags = parser.taglist('

', reset=True) + self.assertEqual(tags, [Tag('div'), Tag('p')]) + + tags = parser.taglist('

/p>

', reset=True) + self.assertEqual(tags, [Tag('div')]) + + tags = parser.taglist('

paragraph

', reset=True) + self.assertEqual(tags, [Tag('p'), Tag('div')]) + self.assertEqual(tags[0].text_and_html(), ('paragraph', '

paragraphmust be empty', reset=True) + self.assertEqual(tags, [Tag('img')]) + self.assertEqual(tags[0].text_and_html(), ('', '')) + + def test_compliant_html_parsing(self): + # certain elements don't need to be closed (see HTMLTagParser.VOID_TAGS) + Tag = HTMLTagParser.Tag + html = ''' + no error without closing tag: + self closing is ok: + ''' + parser = HTMLTagParser() + tags = parser.taglist(html, reset=True) + self.assertEqual(tags, [Tag('img'), Tag('img')]) + + # don't get fooled by '>' in attributes + html = '''''' + tags = parser.taglist(html, reset=True) + self.assertEqual(tags[0].text_and_html(), ('', html)) + def test_iri_to_uri(self): self.assertEqual( iri_to_uri('https://www.google.com/search?q=foo&ie=utf-8&oe=utf-8&client=firefox-b'), diff --git a/yt_dlp/parsing.py b/yt_dlp/parsing.py new file mode 100644 index 000000000..d0dcf450a --- /dev/null +++ b/yt_dlp/parsing.py @@ -0,0 +1,219 @@ +import collections +import contextlib +import itertools +import re +from html.parser import HTMLParser + +from .utils import orderedSet + +from .compat import compat_HTMLParseError + + +class HTMLTagParser(HTMLParser): + """HTML parser which acts as iterator + returns found elements as instances of Tag + nested elements will be returned before its parents + + strict=True raises compat_HTMLParseError on malformed html + + two modes of usage: + # as an lazy iterator: + for tag_obj in HTMLTagParser(html): + tag_obj.text_and_html() + + # or return a list with all found tag objects + # this is faster by factor 2-5 compared to iteration + for tag_obj in HTMLTagParser(html).taglist(): + tag_obj.text_and_html() + """ + + STRICT = False + ANY_TAG_REGEX = re.compile(r'''<(?:"[^"]*"|'[^']*'|[^"'>])*?>''') + CLOSING_TAG_REGEX = re.compile(r']+(?:\s*>)?') + VOID_TAGS = { + 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', + 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr', + } + + class Tag: + __slots__ = 'name', 'string', 'start', 'start_len', 'stop', 'attrs' + + def __init__(self, name, *, string='', start=None, stop=None, attrs=()): + self.name = name + self.string = string + self.start = start + self.start_len = 0 + self.stop = stop + self.attrs = tuple(attrs) + + def __str__(self): + return self.name + + def __repr__(self): + return f'{self.__class__.__name__}({str(self)!r})' + + def __eq__(self, other): + return self.name == other + + def html(self): + return self.string[self.start:self.stop] + + def text_and_html(self): + assert isinstance(self.start, int) + if not self.start_len: + match = HTMLTagParser.ANY_TAG_REGEX.match(self.string[self.start:]) + assert match + self.start_len = len(match.group()) + if self.stop is None: + return '', self.string[self.start: self.start + self.start_len] + html = self.html() + cidx = html.rindex('') or tag in self.VOID_TAGS: + if self.callback(obj) is not False: + self.found_tags.append(obj) + return + else: + obj = None + + self.tagstack.appendleft(obj or tag) + + handle_startendtag = handle_starttag + + def handle_endtag(self, tag): + if '<' in tag: + if self.STRICT: + raise compat_HTMLParseError(f'malformed closing tag {tag!r}') + tag = tag[:tag.index('<')] + + try: + idx = self.tagstack.index(tag) + if self.STRICT and idx: + open_tags = ''.join(f'' for tag in itertools.islice(self.tagstack, idx)) + raise compat_HTMLParseError( + f'malnested closing tag {tag!r}, expected after {open_tags!r}') + tag_obj = self.tagstack[idx] + self.tagstack.remove(tag) + if not isinstance(tag_obj, str): + # since we landed here we'll always find a closing tag + match = self.CLOSING_TAG_REGEX.match(self.rawdata[self._offset:]) + tag_obj.stop = self._offset + match.end() + if self.callback(tag_obj) is not False: + self.found_tags.append(tag_obj) + except ValueError as exc: + if isinstance(exc, compat_HTMLParseError): + raise + elif self.STRICT: + raise compat_HTMLParseError(f'stray closing tag {tag!r}') + + +class ClassParser(HTMLTagParser): + def __init__(self, attribute, matchfunc, stop): + super().__init__() + self.search_attr = attribute + self.matchfunc = matchfunc + self.stop = stop + self.processing = 0 + + def predicate(self, tag, attrs): + if self.processing <= 0 and self.stop is not None and self._offset > self.stop: + self.abort() + string = dict(attrs).get(self.search_attr, '') + if self.matchfunc(string): + self.processing += 1 + return True + return False + + def callback(self, tag_obj): + if self.stop is None: + self.abort(tag_obj) + self.processing -= 1 + + @classmethod + def get_elements_html_by_class(cls, class_name, html): + regex = re.compile(rf'[\w\- ]*\b{re.escape(class_name)}\b') + it = re.finditer(rf'<.+ class=[\'"]{regex.pattern}', html) + start = stop = None + for match in it: + if start is None: + start = match.start() + else: + stop = match.end() + if start is None: + return [] + parser = cls('class', lambda x: regex.match(x), stop) + return [tag.html() for tag in parser.taglist(html[start:])] + + +class FirstMatchingElementParser(HTMLTagParser): + def __init__(self, matchfunc): + super().__init__() + self.matchfunc = matchfunc + self.found = False + + def predicate(self, tag, attrs): + if not self.found and self.matchfunc(tag, attrs): + self.found = True + return True + return False + + def callback(self, obj): + self.abort(obj) + + @classmethod + def get_element_text_and_html_by_tag(cls, tag, html): + """ + For the first element with the specified tag in the given HTML document + return its content (text) and the whole element (html) + """ + parser = cls(lambda _tag, _: _tag == tag) + for tag_obj in parser.taglist(html): + return tag_obj.text_and_html() + raise compat_HTMLParseError(f'tag {tag} not found') From e092ba9922191886c542972461ec27b1d82a466d Mon Sep 17 00:00:00 2001 From: Marcel Date: Tue, 22 Nov 2022 22:37:14 +0100 Subject: [PATCH 04/15] [test] rollback test_utils.py and add related tests to test_parsing.py --- test/test_parsing.py | 218 +++++++++++++++++++++++++++++++++++++++++++ test/test_utils.py | 124 ++++-------------------- 2 files changed, 238 insertions(+), 104 deletions(-) create mode 100644 test/test_parsing.py diff --git a/test/test_parsing.py b/test/test_parsing.py new file mode 100644 index 000000000..782a1196d --- /dev/null +++ b/test/test_parsing.py @@ -0,0 +1,218 @@ +import textwrap +import unittest + +from parsing import ( + FirstMatchingElementParser, + HTMLTagParser, + MatchingElementParser, +) + +from yt_dlp.compat import compat_HTMLParseError + +get_element_by_attribute = FirstMatchingElementParser +get_element_by_class = FirstMatchingElementParser +get_element_html_by_attribute = FirstMatchingElementParser +get_element_html_by_class = FirstMatchingElementParser.get_element_html_by_class +get_element_text_and_html_by_tag = FirstMatchingElementParser.get_element_text_and_html_by_tag +get_elements_by_attribute = MatchingElementParser +get_elements_by_class = MatchingElementParser +get_elements_html_by_attribute = MatchingElementParser +get_elements_html_by_class = FirstMatchingElementParser.get_elements_html_by_class +get_elements_text_and_html_by_attribute = MatchingElementParser + + +class TestParsing(unittest.TestCase): + GET_ELEMENT_BY_CLASS_TEST_STRING = ''' + nice + ''' + + def test_get_element_by_class(self): + html = self.GET_ELEMENT_BY_CLASS_TEST_STRING + + self.assertEqual(get_element_by_class('foo', html), 'nice') + self.assertEqual(get_element_by_class('no-such-class', html), None) + + def test_get_element_html_by_class(self): + html = self.GET_ELEMENT_BY_CLASS_TEST_STRING + + self.assertEqual(get_element_html_by_class('foo', html), html.strip()) + self.assertEqual(get_element_by_class('no-such-class', html), None) + + GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING = ''' +

+ ''' + + def test_get_element_by_attribute(self): + html = self.GET_ELEMENT_BY_CLASS_TEST_STRING + + self.assertEqual(get_element_by_attribute('class', 'foo bar', html), 'nice') + self.assertEqual(get_element_by_attribute('class', 'foo', html), None) + self.assertEqual(get_element_by_attribute('class', 'no-such-foo', html), None) + + html = self.GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING + + self.assertEqual(get_element_by_attribute('itemprop', 'author', html), 'foo') + + def test_get_element_html_by_attribute(self): + html = self.GET_ELEMENT_BY_CLASS_TEST_STRING + + self.assertEqual(get_element_html_by_attribute('class', 'foo bar', html), html.strip()) + self.assertEqual(get_element_html_by_attribute('class', 'foo', html), None) + self.assertEqual(get_element_html_by_attribute('class', 'no-such-foo', html), None) + + html = self.GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING + + self.assertEqual(get_element_html_by_attribute('itemprop', 'author', html), html.strip()) + + GET_ELEMENTS_BY_CLASS_TEST_STRING = ''' + nice + also nice + ''' + GET_ELEMENTS_BY_CLASS_RES = [ + 'nice', + 'also nice' + ] + + def test_get_elements_by_class(self): + html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING + + self.assertEqual(get_elements_by_class('foo', html), ['nice', 'also nice']) + self.assertEqual(get_elements_by_class('no-such-class', html), []) + + def test_get_elements_html_by_class(self): + html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING + + self.assertEqual(get_elements_html_by_class('foo', html), self.GET_ELEMENTS_BY_CLASS_RES) + self.assertEqual(get_elements_html_by_class('no-such-class', html), []) + + def test_get_elements_by_attribute(self): + html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING + + self.assertEqual(get_elements_by_attribute('class', 'foo bar', html), ['nice', 'also nice']) + self.assertEqual(get_elements_by_attribute('class', 'foo', html), []) + self.assertEqual(get_elements_by_attribute('class', 'no-such-foo', html), []) + + def test_get_elements_html_by_attribute(self): + html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING + + self.assertEqual(get_elements_html_by_attribute('class', 'foo bar', html), + self.GET_ELEMENTS_BY_CLASS_RES) + self.assertEqual(get_elements_html_by_attribute('class', 'foo', html), []) + self.assertEqual(get_elements_html_by_attribute('class', 'no-such-foo', html), []) + + def test_get_elements_text_and_html_by_attribute(self): + html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING + + self.assertEqual( + get_elements_text_and_html_by_attribute('class', 'foo bar', html), + list(zip(['nice', 'also nice'], self.GET_ELEMENTS_BY_CLASS_RES))) + self.assertEqual(get_elements_text_and_html_by_attribute('class', 'foo', html), []) + self.assertEqual(get_elements_text_and_html_by_attribute('class', 'no-such-foo', html), []) + + self.assertEqual(get_elements_text_and_html_by_attribute( + 'class', 'foo', 'nicenice', tag='a'), + [('nice', 'nice')]) + + def test_get_element_text_and_html_by_tag(self): + get_element_by_tag_test_string = ''' + random text lorem ipsum

+
+ this should be returned + this should also be returned +
+ this should also be returned +
+ closing tag above should not trick, so this should also be returned +
+ but this text should not be returned + ''' + html = textwrap.indent(textwrap.dedent(get_element_by_tag_test_string), ' ' * 4) + get_element_by_tag_res_outerdiv_html = html.strip()[32:276] + get_element_by_tag_res_outerdiv_text = get_element_by_tag_res_outerdiv_html[5:-6] + get_element_by_tag_res_innerspan_html = html.strip()[78:119] + get_element_by_tag_res_innerspan_text = get_element_by_tag_res_innerspan_html[6:-7] + + self.assertEqual( + get_element_text_and_html_by_tag('div', html), + (get_element_by_tag_res_outerdiv_text, get_element_by_tag_res_outerdiv_html)) + self.assertEqual( + get_element_text_and_html_by_tag('span', html), + (get_element_by_tag_res_innerspan_text, get_element_by_tag_res_innerspan_html)) + self.assertRaises(compat_HTMLParseError, get_element_text_and_html_by_tag, 'article', html) + + def test_get_element_text_and_html_by_tag_malformed(self): + inner_text = 'inner text' + malnested_elements = f'{inner_text}' + commented_html = '' + outerdiv_html = f'
{malnested_elements}
' + html = f'{commented_html}{outerdiv_html}' + + self.assertEqual( + get_element_text_and_html_by_tag('div', html), (malnested_elements, outerdiv_html)) + self.assertEqual( + get_element_text_and_html_by_tag('malnested_a', html), + (f'{inner_text}', + f'{inner_text}')) + self.assertEqual( + get_element_text_and_html_by_tag('malnested_b', html), + (f'{inner_text}', + f'{inner_text}')) + self.assertRaises( + compat_HTMLParseError, get_element_text_and_html_by_tag, 'orphan', f'{html}') + self.assertRaises( + compat_HTMLParseError, get_element_text_and_html_by_tag, 'orphan', f'{html}') + + def test_strict_html_parsing(self): + class StrictTagParser(HTMLTagParser): + STRICT = True + + parser = StrictTagParser() + with self.assertRaisesRegex(compat_HTMLParseError, "stray closing tag 'p'"): + parser.taglist('

', reset=True) + with self.assertRaisesRegex(compat_HTMLParseError, "unclosed tag 'p', 'div'"): + parser.taglist('

', reset=True) + with self.assertRaisesRegex(compat_HTMLParseError, "malnested closing tag 'div', expected after '

'"): + parser.taglist('

', reset=True) + with self.assertRaisesRegex(compat_HTMLParseError, "malnested closing tag 'div', expected after '

'"): + parser.taglist('

/p>

', reset=True) + with self.assertRaisesRegex(compat_HTMLParseError, "malformed closing tag 'p<<'"): + parser.taglist('

', reset=True) + with self.assertRaisesRegex(compat_HTMLParseError, "stray closing tag 'img'"): + parser.taglist('must be empty', reset=True) + + def test_relaxed_html_parsing(self): + Tag = HTMLTagParser.Tag + parser = HTMLTagParser() + + self.assertEqual(parser.taglist('

', reset=True), []) + self.assertEqual(parser.taglist('

', reset=True), []) + + tags = parser.taglist('

', reset=True) + self.assertEqual(tags, [Tag('div'), Tag('p')]) + + tags = parser.taglist('

/p>

', reset=True) + self.assertEqual(tags, [Tag('div')]) + + tags = parser.taglist('

paragraph

', reset=True) + self.assertEqual(tags, [Tag('p'), Tag('div')]) + self.assertEqual(tags[0].text_and_html(), ('paragraph', '

paragraphmust be empty', reset=True) + self.assertEqual(tags, [Tag('img')]) + self.assertEqual(tags[0].text_and_html(), ('', '')) + + def test_compliant_html_parsing(self): + # certain elements don't need to be closed (see HTMLTagParser.VOID_TAGS) + Tag = HTMLTagParser.Tag + html = ''' + no error without closing tag: + self closing is ok: + ''' + parser = HTMLTagParser() + tags = parser.taglist(html, reset=True) + self.assertEqual(tags, [Tag('img'), Tag('img')]) + + # don't get fooled by '>' in attributes + html = '''''' + tags = parser.taglist(html, reset=True) + self.assertEqual(tags[0].text_and_html(), ('', html)) diff --git a/test/test_utils.py b/test/test_utils.py index d9a62258c..3045b6d7e 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -4,7 +4,6 @@ import os import re import sys -import textwrap import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) @@ -21,14 +20,6 @@ from yt_dlp.compat import ( compat_HTMLParseError, compat_os_name, ) -from yt_dlp.parsing import ( - HTMLTagParser, - FirstMatchingElementParser, -) - -# some testcases don't work with current functions -get_element_text_and_html_by_tag = FirstMatchingElementParser.get_element_text_and_html_by_tag - from yt_dlp.utils import ( Config, DateRange, @@ -68,6 +59,7 @@ from yt_dlp.utils import ( get_element_by_class, get_element_html_by_attribute, get_element_html_by_class, + get_element_text_and_html_by_tag, get_elements_by_attribute, get_elements_by_class, get_elements_html_by_attribute, @@ -1776,110 +1768,34 @@ Line 1 self.assertEqual(list(get_elements_text_and_html_by_attribute( 'class', 'foo', 'nicenice', tag='a')), [('nice', 'nice')]) - def test_get_element_text_and_html_by_tag(self): - get_element_by_tag_test_string = ''' - random text lorem ipsum

+ GET_ELEMENT_BY_TAG_TEST_STRING = ''' + random text lorem ipsum

+
+ this should be returned + this should also be returned
- this should be returned - this should also be returned -
- this should also be returned -
- closing tag above should not trick, so this should also be returned + this should also be returned
- but this text should not be returned - ''' - html = textwrap.indent(textwrap.dedent(get_element_by_tag_test_string), ' ' * 4) - get_element_by_tag_res_outerdiv_html = html.strip()[32:276] - get_element_by_tag_res_outerdiv_text = get_element_by_tag_res_outerdiv_html[5:-6] - get_element_by_tag_res_innerspan_html = html.strip()[78:119] - get_element_by_tag_res_innerspan_text = get_element_by_tag_res_innerspan_html[6:-7] + closing tag above should not trick, so this should also be returned +
+ but this text should not be returned + ''' + GET_ELEMENT_BY_TAG_RES_OUTERDIV_HTML = GET_ELEMENT_BY_TAG_TEST_STRING.strip()[32:276] + GET_ELEMENT_BY_TAG_RES_OUTERDIV_TEXT = GET_ELEMENT_BY_TAG_RES_OUTERDIV_HTML[5:-6] + GET_ELEMENT_BY_TAG_RES_INNERSPAN_HTML = GET_ELEMENT_BY_TAG_TEST_STRING.strip()[78:119] + GET_ELEMENT_BY_TAG_RES_INNERSPAN_TEXT = GET_ELEMENT_BY_TAG_RES_INNERSPAN_HTML[6:-7] + + def test_get_element_text_and_html_by_tag(self): + html = self.GET_ELEMENT_BY_TAG_TEST_STRING self.assertEqual( get_element_text_and_html_by_tag('div', html), - (get_element_by_tag_res_outerdiv_text, get_element_by_tag_res_outerdiv_html)) + (self.GET_ELEMENT_BY_TAG_RES_OUTERDIV_TEXT, self.GET_ELEMENT_BY_TAG_RES_OUTERDIV_HTML)) self.assertEqual( get_element_text_and_html_by_tag('span', html), - (get_element_by_tag_res_innerspan_text, get_element_by_tag_res_innerspan_html)) + (self.GET_ELEMENT_BY_TAG_RES_INNERSPAN_TEXT, self.GET_ELEMENT_BY_TAG_RES_INNERSPAN_HTML)) self.assertRaises(compat_HTMLParseError, get_element_text_and_html_by_tag, 'article', html) - def test_get_element_text_and_html_by_tag_malformed(self): - inner_text = 'inner text' - malnested_elements = f'{inner_text}' - commented_html = '' - outerdiv_html = f'
{malnested_elements}
' - html = f'{commented_html}{outerdiv_html}' - - self.assertEqual( - get_element_text_and_html_by_tag('div', html), (malnested_elements, outerdiv_html)) - self.assertEqual( - get_element_text_and_html_by_tag('malnested_a', html), - (f'{inner_text}', - f'{inner_text}')) - self.assertEqual( - get_element_text_and_html_by_tag('malnested_b', html), - (f'{inner_text}', - f'{inner_text}')) - self.assertRaises( - compat_HTMLParseError, get_element_text_and_html_by_tag, 'orphan', f'{html}') - self.assertRaises( - compat_HTMLParseError, get_element_text_and_html_by_tag, 'orphan', f'{html}') - - def test_strict_html_parsing(self): - class StrictTagParser(HTMLTagParser): - STRICT = True - - parser = StrictTagParser() - with self.assertRaisesRegex(compat_HTMLParseError, "stray closing tag 'p'"): - parser.taglist('

', reset=True) - with self.assertRaisesRegex(compat_HTMLParseError, "unclosed tag 'p', 'div'"): - parser.taglist('

', reset=True) - with self.assertRaisesRegex(compat_HTMLParseError, "malnested closing tag 'div', expected after '

'"): - parser.taglist('

', reset=True) - with self.assertRaisesRegex(compat_HTMLParseError, "malnested closing tag 'div', expected after '

'"): - parser.taglist('

/p>

', reset=True) - with self.assertRaisesRegex(compat_HTMLParseError, "malformed closing tag 'p<<'"): - parser.taglist('

', reset=True) - with self.assertRaisesRegex(compat_HTMLParseError, "stray closing tag 'img'"): - parser.taglist('must be empty', reset=True) - - def test_relaxed_html_parsing(self): - Tag = HTMLTagParser.Tag - parser = HTMLTagParser() - - self.assertEqual(parser.taglist('

', reset=True), []) - self.assertEqual(parser.taglist('

', reset=True), []) - - tags = parser.taglist('

', reset=True) - self.assertEqual(tags, [Tag('div'), Tag('p')]) - - tags = parser.taglist('

/p>

', reset=True) - self.assertEqual(tags, [Tag('div')]) - - tags = parser.taglist('

paragraph

', reset=True) - self.assertEqual(tags, [Tag('p'), Tag('div')]) - self.assertEqual(tags[0].text_and_html(), ('paragraph', '

paragraphmust be empty', reset=True) - self.assertEqual(tags, [Tag('img')]) - self.assertEqual(tags[0].text_and_html(), ('', '')) - - def test_compliant_html_parsing(self): - # certain elements don't need to be closed (see HTMLTagParser.VOID_TAGS) - Tag = HTMLTagParser.Tag - html = ''' - no error without closing tag: - self closing is ok: - ''' - parser = HTMLTagParser() - tags = parser.taglist(html, reset=True) - self.assertEqual(tags, [Tag('img'), Tag('img')]) - - # don't get fooled by '>' in attributes - html = '''''' - tags = parser.taglist(html, reset=True) - self.assertEqual(tags[0].text_and_html(), ('', html)) - def test_iri_to_uri(self): self.assertEqual( iri_to_uri('https://www.google.com/search?q=foo&ie=utf-8&oe=utf-8&client=firefox-b'), From 176a156c651defe95f4ed6714ddf47d599ecef50 Mon Sep 17 00:00:00 2001 From: Marcel Date: Tue, 22 Nov 2022 19:58:06 +0100 Subject: [PATCH 05/15] [parsing] rework interface, implemented all get_element(s) functions + extract_attributes() as MatchingElementParser class methods and improve performance --- test/test_parsing.py | 168 +++++++++++++++---- yt_dlp/parsing.py | 373 ++++++++++++++++++++++++++++++------------- 2 files changed, 399 insertions(+), 142 deletions(-) diff --git a/test/test_parsing.py b/test/test_parsing.py index 782a1196d..75ed8ebf3 100644 --- a/test/test_parsing.py +++ b/test/test_parsing.py @@ -1,29 +1,71 @@ import textwrap import unittest -from parsing import ( - FirstMatchingElementParser, - HTMLTagParser, +from yt_dlp.compat import compat_HTMLParseError +from yt_dlp.parsing import ( MatchingElementParser, + HTMLCommentRanges, + HTMLTagParser, ) -from yt_dlp.compat import compat_HTMLParseError - -get_element_by_attribute = FirstMatchingElementParser -get_element_by_class = FirstMatchingElementParser -get_element_html_by_attribute = FirstMatchingElementParser -get_element_html_by_class = FirstMatchingElementParser.get_element_html_by_class -get_element_text_and_html_by_tag = FirstMatchingElementParser.get_element_text_and_html_by_tag -get_elements_by_attribute = MatchingElementParser -get_elements_by_class = MatchingElementParser -get_elements_html_by_attribute = MatchingElementParser -get_elements_html_by_class = FirstMatchingElementParser.get_elements_html_by_class -get_elements_text_and_html_by_attribute = MatchingElementParser +extract_attributes = MatchingElementParser.extract_attributes +get_element_by_attribute = MatchingElementParser.get_element_by_attribute +get_element_by_class = MatchingElementParser.get_element_by_class +get_element_html_by_attribute = MatchingElementParser.get_element_html_by_attribute +get_element_html_by_class = MatchingElementParser.get_element_html_by_class +get_element_text_and_html_by_tag = MatchingElementParser.get_element_text_and_html_by_tag +get_elements_by_attribute = MatchingElementParser.get_elements_by_attribute +get_elements_by_class = MatchingElementParser.get_elements_by_class +get_elements_html_by_attribute = MatchingElementParser.get_elements_html_by_attribute +get_elements_html_by_class = MatchingElementParser.get_elements_html_by_class +get_elements_text_and_html_by_attribute = MatchingElementParser.get_elements_text_and_html_by_attribute +get_elements_text_and_html_by_tag = MatchingElementParser.get_elements_text_and_html_by_tag class TestParsing(unittest.TestCase): + def test_extract_attributes(self): + self.assertEqual(extract_attributes(''), {'x': 'y'}) + self.assertEqual(extract_attributes(""), {'x': 'y'}) + self.assertEqual(extract_attributes(''), {'x': 'y'}) + self.assertEqual(extract_attributes(''), {'x': "a 'b' c"}) + self.assertEqual(extract_attributes(''), {'x': 'a "b" c'}) + self.assertEqual(extract_attributes(''), {'x': 'y'}) + self.assertEqual(extract_attributes(''), {'x': 'y'}) + self.assertEqual(extract_attributes(''), {'x': '&'}) # XML + self.assertEqual(extract_attributes(''), {'x': '"'}) + self.assertEqual(extract_attributes(''), {'x': '£'}) # HTML 3.2 + self.assertEqual(extract_attributes(''), {'x': 'λ'}) # HTML 4.0 + self.assertEqual(extract_attributes(''), {'x': '&foo'}) + self.assertEqual(extract_attributes(''), {'x': "'"}) + self.assertEqual(extract_attributes(''), {'x': '"'}) + self.assertEqual(extract_attributes(''), {'x': None}) + self.assertEqual(extract_attributes(''), {'x': 'y', 'a': None}) + self.assertEqual(extract_attributes(''), {'x': 'y'}) + self.assertEqual(extract_attributes(''), {'y': '2', 'x': '3'}) + self.assertEqual(extract_attributes(''), {'x': 'y'}) + self.assertEqual(extract_attributes(''), {'x': 'y'}) + self.assertEqual(extract_attributes(""), {'x': 'y'}) + self.assertEqual(extract_attributes(''), {'x': '\ny\n'}) + self.assertEqual(extract_attributes(''), {'caps': 'x'}) # Names lowercased + self.assertEqual(extract_attributes(''), {'x': '2'}) + self.assertEqual(extract_attributes(''), {'x': '2'}) + self.assertEqual(extract_attributes(''), {'_:funny-name1': '1'}) + self.assertEqual(extract_attributes(''), {'x': 'Fáilte 世界 \U0001f600'}) + self.assertEqual(extract_attributes(''), {'x': 'décompose\u0301'}) + # "Narrow" Python builds don't support unicode code points outside BMP. + try: + chr(0x10000) + supports_outside_bmp = True + except ValueError: + supports_outside_bmp = False + if supports_outside_bmp: + self.assertEqual(extract_attributes(''), {'x': 'Smile \U0001f600!'}) + # Malformed HTML should not break attributes extraction on older Python + self.assertEqual(extract_attributes(''), {}) + GET_ELEMENT_BY_CLASS_TEST_STRING = ''' nice +

also nice
''' def test_get_element_by_class(self): @@ -35,7 +77,8 @@ class TestParsing(unittest.TestCase): def test_get_element_html_by_class(self): html = self.GET_ELEMENT_BY_CLASS_TEST_STRING - self.assertEqual(get_element_html_by_class('foo', html), html.strip()) + self.assertEqual(get_element_html_by_class('foo', html), + 'nice') self.assertEqual(get_element_by_class('no-such-class', html), None) GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING = ''' @@ -48,6 +91,7 @@ class TestParsing(unittest.TestCase): self.assertEqual(get_element_by_attribute('class', 'foo bar', html), 'nice') self.assertEqual(get_element_by_attribute('class', 'foo', html), None) self.assertEqual(get_element_by_attribute('class', 'no-such-foo', html), None) + self.assertEqual(get_element_by_attribute('class', 'foo bar', html, tag='div'), 'also nice') html = self.GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING @@ -56,7 +100,8 @@ class TestParsing(unittest.TestCase): def test_get_element_html_by_attribute(self): html = self.GET_ELEMENT_BY_CLASS_TEST_STRING - self.assertEqual(get_element_html_by_attribute('class', 'foo bar', html), html.strip()) + self.assertEqual(get_element_html_by_attribute('class', 'foo bar', html), + 'nice') self.assertEqual(get_element_html_by_attribute('class', 'foo', html), None) self.assertEqual(get_element_html_by_attribute('class', 'no-such-foo', html), None) @@ -110,7 +155,7 @@ class TestParsing(unittest.TestCase): self.assertEqual(get_elements_text_and_html_by_attribute('class', 'no-such-foo', html), []) self.assertEqual(get_elements_text_and_html_by_attribute( - 'class', 'foo', 'nicenice', tag='a'), + 'class', 'foo', 'nicenot nice', tag='a'), [('nice', 'nice')]) def test_get_element_text_and_html_by_tag(self): @@ -138,7 +183,16 @@ class TestParsing(unittest.TestCase): self.assertEqual( get_element_text_and_html_by_tag('span', html), (get_element_by_tag_res_innerspan_text, get_element_by_tag_res_innerspan_html)) - self.assertRaises(compat_HTMLParseError, get_element_text_and_html_by_tag, 'article', html) + self.assertIsNone(get_element_text_and_html_by_tag('article', html)) + + def test_get_elements_text_and_html_by_tag(self): + test_string = ''' + + + ignore + ''' + items = get_elements_text_and_html_by_tag('img', test_string) + self.assertListEqual(items, [('', ''), ('', '')]) def test_get_element_text_and_html_by_tag_malformed(self): inner_text = 'inner text' @@ -157,10 +211,8 @@ class TestParsing(unittest.TestCase): get_element_text_and_html_by_tag('malnested_b', html), (f'{inner_text}', f'{inner_text}')) - self.assertRaises( - compat_HTMLParseError, get_element_text_and_html_by_tag, 'orphan', f'{html}') - self.assertRaises( - compat_HTMLParseError, get_element_text_and_html_by_tag, 'orphan', f'{html}') + self.assertIsNone(get_element_text_and_html_by_tag('orphan', f'{html}')) + self.assertIsNone(get_element_text_and_html_by_tag('orphan', f'{html}')) def test_strict_html_parsing(self): class StrictTagParser(HTMLTagParser): @@ -188,14 +240,14 @@ class TestParsing(unittest.TestCase): self.assertEqual(parser.taglist('

', reset=True), []) tags = parser.taglist('

', reset=True) - self.assertEqual(tags, [Tag('div'), Tag('p')]) + self.assertEqual(tags, [Tag('p'), Tag('div')]) tags = parser.taglist('

/p>

', reset=True) self.assertEqual(tags, [Tag('div')]) - tags = parser.taglist('

paragraph

', reset=True) - self.assertEqual(tags, [Tag('p'), Tag('div')]) - self.assertEqual(tags[0].text_and_html(), ('paragraph', '

paragraph

paragraph

', reset=True) + self.assertEqual(tags, [Tag('div'), Tag('p')]) + self.assertEqual(tags[1].text_and_html(), ('paragraph', '

paragraph')) tags = parser.taglist('must be empty', reset=True) self.assertEqual(tags, [Tag('img')]) @@ -216,3 +268,65 @@ class TestParsing(unittest.TestCase): html = '''''' tags = parser.taglist(html, reset=True) self.assertEqual(tags[0].text_and_html(), ('', html)) + + def test_tag_return_order(self): + Tag = HTMLTagParser.Tag + html = ''' + + + + + + + + + + + + + + ''' + parser = HTMLTagParser() + tags = parser.taglist(html, reset=True) + self.assertEqual( + str(tags), str([Tag('t0'), Tag('t1'), Tag('t2'), Tag('t3'), Tag('t4'), + Tag('t5'), Tag('t6'), Tag('t7'), Tag('t8')])) + + tags = parser.taglist(html, reset=True, depth_first=True) + self.assertEqual( + str(tags), str([Tag('t3'), Tag('t4'), Tag('t2'), Tag('t1'), Tag('t6'), + Tag('t5'), Tag('t0'), Tag('t8'), Tag('t7')])) + + # return tags in nested order + tags = parser.taglist(html, reset=True, depth_first=None) + self.assertEqual( + str(tags), str([ + [Tag('t0'), + [Tag('t1'), + [Tag('t2'), Tag('t3'), Tag('t4')]], + [Tag('t5'), Tag('t6')]], + [Tag('t7'), Tag('t8')]])) + + def test_within_html_comment(self): + def mark_comments(_string, char='^', nochar='-'): + cmts = HTMLCommentRanges(_string) + return "".join(char if _idx in cmts else nochar for _idx in range(len(_string))) + + html_string = ''' + no comments in this line + --------------------------------------------------------------------- + + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + before after + -------^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^-------- + here is and end + ------------^^^^^^^^^^^^^^^^^^---------^^^^^^^^^^^^^^^^^^^^^^^^------ + this ends here --> and not here + -----^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^---------------------------- + stray --> comment closings --> are ignored ' encountered + note: markers within quotes are not ignored + """ + + def __init__(self, html): + self._range_iter = self.ranges(html) + self._range = next(self._range_iter, None) + self._last_offset = 0 + + @staticmethod + def ranges(string, sopen=''): + assert not (sopen.startswith(sclose) or sclose.startswith(sopen)) + open_iter = iter_find(string, sopen) + close_len = len(sclose) + close_iter = (idx + close_len for idx in iter_find(string, sclose)) + next_open = next(open_iter, None) + next_close = next(close_iter, None) + + while True: + if next_open is None: + return + while next_close is not None and next_open > next_close: + next_close = next(close_iter, None) + yield slice(next_open, next_close) + if next_close is None: + return + while next_open is not None and next_open < next_close: + next_open = next(open_iter, None) + + def __contains__(self, offset): + assert isinstance(offset, int) + assert offset >= self._last_offset, 'offset must be in increasing order' + self._last_offset = offset + while self._range and self._range.stop is not None and offset >= self._range.stop: + self._range = next(self._range_iter, None) + + return not (self._range is None or offset < self._range.start) class HTMLTagParser(HTMLParser): - """HTML parser which acts as iterator - returns found elements as instances of Tag - nested elements will be returned before its parents + """HTML parser which returns found elements as instances of 'Tag' + when STRICT=True can raise compat_HTMLParseError() on malformed HTML elements - strict=True raises compat_HTMLParseError on malformed html - - two modes of usage: - # as an lazy iterator: - for tag_obj in HTMLTagParser(html): + usage: + parser = HTMLTagParser() + for tag_obj in parser.taglist(html): tag_obj.text_and_html() - # or return a list with all found tag objects - # this is faster by factor 2-5 compared to iteration - for tag_obj in HTMLTagParser(html).taglist(): - tag_obj.text_and_html() """ STRICT = False ANY_TAG_REGEX = re.compile(r'''<(?:"[^"]*"|'[^']*'|[^"'>])*?>''') - CLOSING_TAG_REGEX = re.compile(r']+(?:\s*>)?') VOID_TAGS = { 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr', } class Tag: - __slots__ = 'name', 'string', 'start', 'start_len', 'stop', 'attrs' + __slots__ = 'name', 'string', 'attrs', '_openrange', '_closerange' - def __init__(self, name, *, string='', start=None, stop=None, attrs=()): + def __init__(self, name, *, string='', attrs=()): self.name = name self.string = string - self.start = start - self.start_len = 0 - self.stop = stop self.attrs = tuple(attrs) + self._openrange = None + self._closerange = None def __str__(self): return self.name @@ -55,52 +97,81 @@ class HTMLTagParser(HTMLParser): def __eq__(self, other): return self.name == other + def openrange(self, offset, startlen=0): + if isinstance(offset, slice): + self._openrange = offset + else: + self._openrange = slice(offset, offset + startlen) + + def closerange(self, offset, stoplen=0): + if isinstance(offset, slice): + self._closerange = offset + else: + self._closerange = slice(offset, offset + stoplen) + + def opentag(self): + return self.string[self._openrange] if self._openrange else '' + def html(self): - return self.string[self.start:self.stop] + if not self._openrange: + return '' + if self._closerange: + return self.string[self._openrange.start:self._closerange.stop] + return self.string[self._openrange] + + def text(self): + if self._openrange and self._closerange: + return self.string[self._openrange.stop:self._closerange.start] + return '' def text_and_html(self): - assert isinstance(self.start, int) - if not self.start_len: - match = HTMLTagParser.ANY_TAG_REGEX.match(self.string[self.start:]) - assert match - self.start_len = len(match.group()) - if self.stop is None: - return '', self.string[self.start: self.start + self.start_len] - html = self.html() - cidx = html.rindex('') or tag in self.VOID_TAGS: - if self.callback(obj) is not False: - self.found_tags.append(obj) - return - else: - obj = None - self.tagstack.appendleft(obj or tag) + tag_obj = tag + if self.predicate(tag, attrs): + tag_obj = self.Tag(tag, string=self.rawdata, attrs=attrs) + tag_obj.openrange(self._offset, len(tag_text)) + if tag_text.endswith('/>') or tag in self.VOID_TAGS: + self._nestedtags[-1].append(tag_obj) + self.callback(tag_obj) + return + nesting = [] + self._nestedtags[-1].append(nesting) + self._nestedtags.append(nesting) + self.tagstack.appendleft(tag_obj) handle_startendtag = handle_starttag @@ -141,79 +213,150 @@ class HTMLTagParser(HTMLParser): f'malnested closing tag {tag!r}, expected after {open_tags!r}') tag_obj = self.tagstack[idx] self.tagstack.remove(tag) - if not isinstance(tag_obj, str): - # since we landed here we'll always find a closing tag - match = self.CLOSING_TAG_REGEX.match(self.rawdata[self._offset:]) - tag_obj.stop = self._offset + match.end() - if self.callback(tag_obj) is not False: - self.found_tags.append(tag_obj) + if isinstance(tag_obj, self.Tag): + close_idx = self.rawdata.find('>', self._offset) + 1 + tag_obj.closerange(self._offset, close_idx - self._offset) + self._nestedtags.pop().insert(0, tag_obj) + self.callback(tag_obj) except ValueError as exc: if isinstance(exc, compat_HTMLParseError): raise - elif self.STRICT: - raise compat_HTMLParseError(f'stray closing tag {tag!r}') + if self.STRICT: + raise compat_HTMLParseError(f'stray closing tag {tag!r}') from exc -class ClassParser(HTMLTagParser): - def __init__(self, attribute, matchfunc, stop): - super().__init__() - self.search_attr = attribute - self.matchfunc = matchfunc - self.stop = stop - self.processing = 0 - - def predicate(self, tag, attrs): - if self.processing <= 0 and self.stop is not None and self._offset > self.stop: - self.abort() - string = dict(attrs).get(self.search_attr, '') - if self.matchfunc(string): - self.processing += 1 - return True - return False - - def callback(self, tag_obj): - if self.stop is None: - self.abort(tag_obj) - self.processing -= 1 - - @classmethod - def get_elements_html_by_class(cls, class_name, html): - regex = re.compile(rf'[\w\- ]*\b{re.escape(class_name)}\b') - it = re.finditer(rf'<.+ class=[\'"]{regex.pattern}', html) - start = stop = None - for match in it: - if start is None: - start = match.start() - else: - stop = match.end() - if start is None: - return [] - parser = cls('class', lambda x: regex.match(x), stop) - return [tag.html() for tag in parser.taglist(html[start:])] - - -class FirstMatchingElementParser(HTMLTagParser): +class MatchingElementParser(HTMLTagParser): + """ optimized version of HTMLTagParser + """ def __init__(self, matchfunc): super().__init__() self.matchfunc = matchfunc - self.found = False + self.found_none = True + + def reset(self): + super().reset() + self.found_none = True + + def callback(self, tag_obj): + raise self.AbortException() def predicate(self, tag, attrs): - if not self.found and self.matchfunc(tag, attrs): - self.found = True + if self.found_none and self.matchfunc(tag, attrs): + self.found_none = False return True return False - def callback(self, obj): - self.abort(obj) + @staticmethod + def class_value_regex(class_name): + return rf'[\w\s\-]*(?"']|"[^"]*"|'[^']*')*)? + \s{re.escape(attribute)}\s*=\s*(?P<_q>['"])(?-x:{value_regex})(?P=_q) + ''' + + @classmethod + def iter_tags(cls, regex, html, *, matchfunc): + comments = HTMLCommentRanges(html) + parser = cls(matchfunc) + for match in re.finditer(regex, html): + if match.start() not in comments: + yield from parser.taglist(html[match.start():], reset=True) + + @classmethod + def tags_by_name(cls, tag, html): + def matchfunc(tag_str, _attrs): + return tag_str == tag + + yield from cls.iter_tags(rf'<\s*{re.escape(tag)}[\s>]', html, matchfunc=matchfunc) + + @classmethod + def tags_by_attribute(cls, attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True): + def matchfunc(_tag_str, attrs): + return any(attr == attribute and re.fullmatch(value, value_str) + for attr, value_str in attrs) + + tag_regex = cls.matching_tag_regex(tag, attribute, value, escape_value) + yield from cls.iter_tags(tag_regex, html, matchfunc=matchfunc) + + @classmethod + def extract_attributes(cls, html): + attr_dict = {} + + def matchfunc(_tag, attrs): + attr_dict.update(attrs) + raise cls.AbortException() + + with contextlib.suppress(cls.AbortException): + cls(matchfunc).feed(html) + + return attr_dict + + @classmethod + def get_elements_text_and_html_by_tag(cls, tag, html): + return [tag.text_and_html() for tag in cls.tags_by_name(tag, html)] @classmethod def get_element_text_and_html_by_tag(cls, tag, html): - """ - For the first element with the specified tag in the given HTML document - return its content (text) and the whole element (html) - """ - parser = cls(lambda _tag, _: _tag == tag) - for tag_obj in parser.taglist(html): - return tag_obj.text_and_html() - raise compat_HTMLParseError(f'tag {tag} not found') + tag = next(cls.tags_by_name(tag, html), None) + return tag and tag.text_and_html() + + @classmethod + def get_elements_text_and_html_by_attribute(cls, *args, **kwargs): + return [tag.text_and_html() for tag in cls.tags_by_attribute(*args, **kwargs)] + + @classmethod + def get_elements_by_attribute(cls, *args, **kwargs): + return [tag.text_and_html()[0] for tag in cls.tags_by_attribute(*args, **kwargs)] + + @classmethod + def get_elements_html_by_attribute(cls, *args, **kwargs): + return [tag.html() for tag in cls.tags_by_attribute(*args, **kwargs)] + + @classmethod + def get_element_by_attribute(cls, *args, **kwargs): + tag = next(cls.tags_by_attribute(*args, **kwargs), None) + return tag and tag.text() + + @classmethod + def get_element_html_by_attribute(cls, *args, **kwargs): + tag = next(cls.tags_by_attribute(*args, **kwargs), None) + return tag and tag.html() + + @classmethod + def get_elements_by_class(cls, class_name, html): + value = cls.class_value_regex(class_name) + return [tag.text() for tag + in cls.tags_by_attribute('class', value, html, escape_value=False)] + + @classmethod + def get_elements_html_by_class(cls, class_name, html): + value = cls.class_value_regex(class_name) + return [tag.html() for tag + in cls.tags_by_attribute('class', value, html, escape_value=False)] + + @classmethod + def get_elements_text_and_html_by_class(cls, class_name, html): + value = cls.class_value_regex(class_name) + return [tag.text() for tag + in cls.tags_by_attribute('class', value, html, escape_value=False)] + + @classmethod + def get_element_html_by_class(cls, class_name, html): + value = cls.class_value_regex(class_name) + tag = next(cls.tags_by_attribute('class', value, html, escape_value=False), None) + return tag and tag.html() + + @classmethod + def get_element_by_class(cls, class_name, html): + value = cls.class_value_regex(class_name) + tag = next(cls.tags_by_attribute('class', value, html, escape_value=False), None) + return tag and tag.text() From 8451074b501f51cb66c4d5463260320763b9ff69 Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 27 Nov 2022 16:22:03 +0100 Subject: [PATCH 06/15] [parsing] fix: don't push unmatched void tags onto queue --- test/test_parsing.py | 7 +++++++ yt_dlp/parsing.py | 14 ++++++++------ 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/test/test_parsing.py b/test/test_parsing.py index 75ed8ebf3..880c41a34 100644 --- a/test/test_parsing.py +++ b/test/test_parsing.py @@ -186,6 +186,9 @@ class TestParsing(unittest.TestCase): self.assertIsNone(get_element_text_and_html_by_tag('article', html)) def test_get_elements_text_and_html_by_tag(self): + class StrictParser(MatchingElementParser): + STRICT = True + test_string = ''' @@ -194,6 +197,10 @@ class TestParsing(unittest.TestCase): items = get_elements_text_and_html_by_tag('img', test_string) self.assertListEqual(items, [('', ''), ('', '')]) + self.assertEqual( + StrictParser.get_element_text_and_html_by_tag('use', ''), + ('', '')) + def test_get_element_text_and_html_by_tag_malformed(self): inner_text = 'inner text' malnested_elements = f'{inner_text}' diff --git a/yt_dlp/parsing.py b/yt_dlp/parsing.py index bcc48c4d3..8fbb4db14 100644 --- a/yt_dlp/parsing.py +++ b/yt_dlp/parsing.py @@ -185,17 +185,19 @@ class HTMLTagParser(HTMLParser): tag_text = HTMLTagParser.ANY_TAG_REGEX.match(self.rawdata[self._offset:]).group() tag_obj = tag + tag_is_open = not (tag_text.endswith('/>') or tag in self.VOID_TAGS) if self.predicate(tag, attrs): tag_obj = self.Tag(tag, string=self.rawdata, attrs=attrs) tag_obj.openrange(self._offset, len(tag_text)) - if tag_text.endswith('/>') or tag in self.VOID_TAGS: + if tag_is_open: + nesting = [] + self._nestedtags[-1].append(nesting) + self._nestedtags.append(nesting) + else: self._nestedtags[-1].append(tag_obj) self.callback(tag_obj) - return - nesting = [] - self._nestedtags[-1].append(nesting) - self._nestedtags.append(nesting) - self.tagstack.appendleft(tag_obj) + if tag_is_open: + self.tagstack.appendleft(tag_obj) handle_startendtag = handle_starttag From dbf350c12291279c0be56cb82922c2fae1c87eb2 Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 27 Nov 2022 16:34:06 +0100 Subject: [PATCH 07/15] [parsing] return unclosed matched tags --- test/test_parsing.py | 9 +++++---- yt_dlp/parsing.py | 4 ++-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/test/test_parsing.py b/test/test_parsing.py index 880c41a34..588711518 100644 --- a/test/test_parsing.py +++ b/test/test_parsing.py @@ -218,8 +218,9 @@ class TestParsing(unittest.TestCase): get_element_text_and_html_by_tag('malnested_b', html), (f'{inner_text}', f'{inner_text}')) + self.assertEqual( + get_element_text_and_html_by_tag('orphan', f'{html}'), ('', '')) self.assertIsNone(get_element_text_and_html_by_tag('orphan', f'{html}')) - self.assertIsNone(get_element_text_and_html_by_tag('orphan', f'{html}')) def test_strict_html_parsing(self): class StrictTagParser(HTMLTagParser): @@ -244,13 +245,13 @@ class TestParsing(unittest.TestCase): parser = HTMLTagParser() self.assertEqual(parser.taglist('

', reset=True), []) - self.assertEqual(parser.taglist('

', reset=True), []) + self.assertEqual(parser.taglist('

', reset=True), [Tag('div'), Tag('p')]) tags = parser.taglist('

', reset=True) - self.assertEqual(tags, [Tag('p'), Tag('div')]) + self.assertEqual(tags, [Tag('div'), Tag('p')]) tags = parser.taglist('

/p>

', reset=True) - self.assertEqual(tags, [Tag('div')]) + self.assertEqual(tags, [Tag('div'), Tag('p')]) tags = parser.taglist('

paragraph

', reset=True) self.assertEqual(tags, [Tag('div'), Tag('p')]) diff --git a/yt_dlp/parsing.py b/yt_dlp/parsing.py index 8fbb4db14..5ecd6b75c 100644 --- a/yt_dlp/parsing.py +++ b/yt_dlp/parsing.py @@ -190,7 +190,7 @@ class HTMLTagParser(HTMLParser): tag_obj = self.Tag(tag, string=self.rawdata, attrs=attrs) tag_obj.openrange(self._offset, len(tag_text)) if tag_is_open: - nesting = [] + nesting = [tag_obj] self._nestedtags[-1].append(nesting) self._nestedtags.append(nesting) else: @@ -218,7 +218,7 @@ class HTMLTagParser(HTMLParser): if isinstance(tag_obj, self.Tag): close_idx = self.rawdata.find('>', self._offset) + 1 tag_obj.closerange(self._offset, close_idx - self._offset) - self._nestedtags.pop().insert(0, tag_obj) + self._nestedtags.pop() self.callback(tag_obj) except ValueError as exc: if isinstance(exc, compat_HTMLParseError): From 7a67a2028f49f71c2cd4bae0611c2a04e313e840 Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 27 Nov 2022 21:26:58 +0100 Subject: [PATCH 08/15] [parsing] tweak tag regex --- yt_dlp/parsing.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/yt_dlp/parsing.py b/yt_dlp/parsing.py index 5ecd6b75c..d2c260954 100644 --- a/yt_dlp/parsing.py +++ b/yt_dlp/parsing.py @@ -261,7 +261,7 @@ class MatchingElementParser(HTMLTagParser): return rf'''(?x) <(?:{tag}) - (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)? + (?:\s(?:[^>"'\\]|"[^"\\]*"|'[^'\\]*')*)? \s{re.escape(attribute)}\s*=\s*(?P<_q>['"])(?-x:{value_regex})(?P=_q) ''' @@ -278,7 +278,8 @@ class MatchingElementParser(HTMLTagParser): def matchfunc(tag_str, _attrs): return tag_str == tag - yield from cls.iter_tags(rf'<\s*{re.escape(tag)}[\s>]', html, matchfunc=matchfunc) + tag_regex = rf'''<\s*{re.escape(tag)}(?:\s(?:[^>"'\\]|"[^"\\]*"|'[^'\\]*')*)?>''' + yield from cls.iter_tags(tag_regex, html, matchfunc=matchfunc) @classmethod def tags_by_attribute(cls, attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True): From 29278a3323be5106809e43d2977efcd0e3159a4f Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 27 Nov 2022 16:56:45 +0100 Subject: [PATCH 09/15] [parsing] fix return value --- test/test_parsing.py | 16 +++++++++++++--- yt_dlp/parsing.py | 4 ++-- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/test/test_parsing.py b/test/test_parsing.py index 588711518..e21299df0 100644 --- a/test/test_parsing.py +++ b/test/test_parsing.py @@ -195,7 +195,7 @@ class TestParsing(unittest.TestCase): ignore ''' items = get_elements_text_and_html_by_tag('img', test_string) - self.assertListEqual(items, [('', ''), ('', '')]) + self.assertEqual(items, [('', ''), ('', '')]) self.assertEqual( StrictParser.get_element_text_and_html_by_tag('use', ''), @@ -245,16 +245,26 @@ class TestParsing(unittest.TestCase): parser = HTMLTagParser() self.assertEqual(parser.taglist('

', reset=True), []) - self.assertEqual(parser.taglist('

', reset=True), [Tag('div'), Tag('p')]) + + tags = parser.taglist('

', reset=True) + self.assertEqual(tags, [Tag('div'), Tag('p')]) + self.assertEqual(tags[0].text_and_html(), ('', '

')) + self.assertEqual(tags[1].text_and_html(), ('', '

')) tags = parser.taglist('

', reset=True) self.assertEqual(tags, [Tag('div'), Tag('p')]) + self.assertEqual(tags[0].text_and_html(), ('

', '

')) + self.assertEqual(tags[1].text_and_html(), ('
', '

')) tags = parser.taglist('

/p>

', reset=True) self.assertEqual(tags, [Tag('div'), Tag('p')]) + self.assertEqual(tags[0].text_and_html(), ('

/p>', '

/p>

')) + self.assertEqual(tags[1].text_and_html(), ('', '

')) tags = parser.taglist('

paragraph

', reset=True) self.assertEqual(tags, [Tag('div'), Tag('p')]) + self.assertEqual(tags[0].text_and_html(), + ('

paragraph', '

paragraph

')) self.assertEqual(tags[1].text_and_html(), ('paragraph', '

paragraph')) tags = parser.taglist('must be empty', reset=True) @@ -315,7 +325,7 @@ class TestParsing(unittest.TestCase): [Tag('t5'), Tag('t6')]], [Tag('t7'), Tag('t8')]])) - def test_within_html_comment(self): + def test_html_comment_ranges(self): def mark_comments(_string, char='^', nochar='-'): cmts = HTMLCommentRanges(_string) return "".join(char if _idx in cmts else nochar for _idx in range(len(_string))) diff --git a/yt_dlp/parsing.py b/yt_dlp/parsing.py index d2c260954..8751cd5f9 100644 --- a/yt_dlp/parsing.py +++ b/yt_dlp/parsing.py @@ -318,7 +318,7 @@ class MatchingElementParser(HTMLTagParser): @classmethod def get_elements_by_attribute(cls, *args, **kwargs): - return [tag.text_and_html()[0] for tag in cls.tags_by_attribute(*args, **kwargs)] + return [tag.text() for tag in cls.tags_by_attribute(*args, **kwargs)] @classmethod def get_elements_html_by_attribute(cls, *args, **kwargs): @@ -349,7 +349,7 @@ class MatchingElementParser(HTMLTagParser): @classmethod def get_elements_text_and_html_by_class(cls, class_name, html): value = cls.class_value_regex(class_name) - return [tag.text() for tag + return [tag.text_and_html() for tag in cls.tags_by_attribute('class', value, html, escape_value=False)] @classmethod From 6169b3eca81ccde2d6c0116295b2c38e807befb2 Mon Sep 17 00:00:00 2001 From: Marcel Date: Tue, 29 Nov 2022 00:25:52 +0100 Subject: [PATCH 10/15] [parsing] replace HTMLCommentRanges with HTMLIgnoreRanges * ignore matches within CDATA elements and comments --- test/test_parsing.py | 25 +++++++++------- yt_dlp/parsing.py | 71 +++++++++++++++++--------------------------- 2 files changed, 43 insertions(+), 53 deletions(-) diff --git a/test/test_parsing.py b/test/test_parsing.py index e21299df0..1898ee8ab 100644 --- a/test/test_parsing.py +++ b/test/test_parsing.py @@ -4,7 +4,7 @@ import unittest from yt_dlp.compat import compat_HTMLParseError from yt_dlp.parsing import ( MatchingElementParser, - HTMLCommentRanges, + HTMLIgnoreRanges, HTMLTagParser, ) @@ -325,26 +325,31 @@ class TestParsing(unittest.TestCase): [Tag('t5'), Tag('t6')]], [Tag('t7'), Tag('t8')]])) - def test_html_comment_ranges(self): + def test_html_ignored_ranges(self): def mark_comments(_string, char='^', nochar='-'): - cmts = HTMLCommentRanges(_string) + cmts = HTMLIgnoreRanges(_string) return "".join(char if _idx in cmts else nochar for _idx in range(len(_string))) html_string = ''' no comments in this line --------------------------------------------------------------------- - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + ----^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^--- before after - -------^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^-------- + -----------^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^----------- + this is a leftover comment --> and end - ------------^^^^^^^^^^^^^^^^^^---------^^^^^^^^^^^^^^^^^^^^^^^^------ - this ends here --> and not here - -----^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^---------------------------- - stray --> comment closings --> are ignored ' encountered - note: markers within quotes are not ignored + usage: + ranges = HTMLIgnoreRanges(html) + if offset in ranges: + ... """ + REGEX = re.compile(r'|]*>') def __init__(self, html): - self._range_iter = self.ranges(html) - self._range = next(self._range_iter, None) - self._last_offset = 0 - - @staticmethod - def ranges(string, sopen=''): - assert not (sopen.startswith(sclose) or sclose.startswith(sopen)) - open_iter = iter_find(string, sopen) - close_len = len(sclose) - close_iter = (idx + close_len for idx in iter_find(string, sclose)) - next_open = next(open_iter, None) - next_close = next(close_iter, None) - - while True: - if next_open is None: - return - while next_close is not None and next_open > next_close: - next_close = next(close_iter, None) - yield slice(next_open, next_close) - if next_close is None: - return - while next_open is not None and next_open < next_close: - next_open = next(open_iter, None) + self.html = html + self._last_match = None + self._final = False def __contains__(self, offset): assert isinstance(offset, int) - assert offset >= self._last_offset, 'offset must be in increasing order' - self._last_offset = offset - while self._range and self._range.stop is not None and offset >= self._range.stop: - self._range = next(self._range_iter, None) - return not (self._range is None or offset < self._range.start) + if not self._final and (self._last_match is None or offset >= self._last_match.end()): + match = self.REGEX.search(self.html, offset) + if match: + self._last_match = match + else: + self._final = True + + if self._last_match is None: + return False + match_string = self._last_match.group() + if match_string.startswith('': + return offset < self._last_match.start() + return offset >= self._last_match.end() class HTMLTagParser(HTMLParser): @@ -267,10 +252,10 @@ class MatchingElementParser(HTMLTagParser): @classmethod def iter_tags(cls, regex, html, *, matchfunc): - comments = HTMLCommentRanges(html) + ignored = HTMLIgnoreRanges(html) parser = cls(matchfunc) for match in re.finditer(regex, html): - if match.start() not in comments: + if match.start() not in ignored: yield from parser.taglist(html[match.start():], reset=True) @classmethod From 65f91148fc6fcbce967d775527edb95b567db0cb Mon Sep 17 00:00:00 2001 From: Marcel Date: Tue, 29 Nov 2022 15:01:18 +0100 Subject: [PATCH 11/15] [parsing] search for case-insensitive tag names --- test/test_parsing.py | 4 ++++ yt_dlp/parsing.py | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/test/test_parsing.py b/test/test_parsing.py index 1898ee8ab..8a36beda4 100644 --- a/test/test_parsing.py +++ b/test/test_parsing.py @@ -222,6 +222,10 @@ class TestParsing(unittest.TestCase): get_element_text_and_html_by_tag('orphan', f'{html}'), ('', '')) self.assertIsNone(get_element_text_and_html_by_tag('orphan', f'{html}')) + # ignore case on tags + ci_html = f'{html}' + self.assertEqual(get_element_text_and_html_by_tag('span', ci_html), (html, ci_html)) + def test_strict_html_parsing(self): class StrictTagParser(HTMLTagParser): STRICT = True diff --git a/yt_dlp/parsing.py b/yt_dlp/parsing.py index 1698591e3..1db6704dd 100644 --- a/yt_dlp/parsing.py +++ b/yt_dlp/parsing.py @@ -245,7 +245,7 @@ class MatchingElementParser(HTMLTagParser): value_regex = re.escape(value_regex) return rf'''(?x) - <(?:{tag}) + <(?i:{tag}) (?:\s(?:[^>"'\\]|"[^"\\]*"|'[^'\\]*')*)? \s{re.escape(attribute)}\s*=\s*(?P<_q>['"])(?-x:{value_regex})(?P=_q) ''' @@ -263,7 +263,7 @@ class MatchingElementParser(HTMLTagParser): def matchfunc(tag_str, _attrs): return tag_str == tag - tag_regex = rf'''<\s*{re.escape(tag)}(?:\s(?:[^>"'\\]|"[^"\\]*"|'[^'\\]*')*)?>''' + tag_regex = rf'''<\s*(?i:{re.escape(tag)})(?:\s(?:[^>"'\\]|"[^"\\]*"|'[^'\\]*')*)?>''' yield from cls.iter_tags(tag_regex, html, matchfunc=matchfunc) @classmethod From 8d87bb4d91ed732bc08bd39ce114bdcca63abf68 Mon Sep 17 00:00:00 2001 From: Marcel Date: Wed, 30 Nov 2022 17:21:09 +0100 Subject: [PATCH 12/15] [parsing] unify tag nesting --- test/test_parsing.py | 6 +++--- yt_dlp/parsing.py | 5 ++--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/test/test_parsing.py b/test/test_parsing.py index 8a36beda4..a7e7ec7d4 100644 --- a/test/test_parsing.py +++ b/test/test_parsing.py @@ -325,9 +325,9 @@ class TestParsing(unittest.TestCase): str(tags), str([ [Tag('t0'), [Tag('t1'), - [Tag('t2'), Tag('t3'), Tag('t4')]], - [Tag('t5'), Tag('t6')]], - [Tag('t7'), Tag('t8')]])) + [Tag('t2'), [Tag('t3')], [Tag('t4')]]], + [Tag('t5'), [Tag('t6')]]], + [Tag('t7'), [Tag('t8')]]])) def test_html_ignored_ranges(self): def mark_comments(_string, char='^', nochar='-'): diff --git a/yt_dlp/parsing.py b/yt_dlp/parsing.py index 1db6704dd..c6748d2d8 100644 --- a/yt_dlp/parsing.py +++ b/yt_dlp/parsing.py @@ -174,12 +174,11 @@ class HTMLTagParser(HTMLParser): if self.predicate(tag, attrs): tag_obj = self.Tag(tag, string=self.rawdata, attrs=attrs) tag_obj.openrange(self._offset, len(tag_text)) + nesting = [tag_obj] + self._nestedtags[-1].append(nesting) if tag_is_open: - nesting = [tag_obj] - self._nestedtags[-1].append(nesting) self._nestedtags.append(nesting) else: - self._nestedtags[-1].append(tag_obj) self.callback(tag_obj) if tag_is_open: self.tagstack.appendleft(tag_obj) From 7a9dd3d35fa793f8f6fd1bff7ab9d500e025f9b4 Mon Sep 17 00:00:00 2001 From: Marcel Date: Fri, 2 Dec 2022 20:54:04 +0100 Subject: [PATCH 13/15] [parsing] inline tag_obj.closerange() --- yt_dlp/parsing.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/yt_dlp/parsing.py b/yt_dlp/parsing.py index c6748d2d8..256ba8e6c 100644 --- a/yt_dlp/parsing.py +++ b/yt_dlp/parsing.py @@ -200,8 +200,7 @@ class HTMLTagParser(HTMLParser): tag_obj = self.tagstack[idx] self.tagstack.remove(tag) if isinstance(tag_obj, self.Tag): - close_idx = self.rawdata.find('>', self._offset) + 1 - tag_obj.closerange(self._offset, close_idx - self._offset) + tag_obj.closerange(slice(self._offset, self.rawdata.find('>', self._offset) + 1)) self._nestedtags.pop() self.callback(tag_obj) except ValueError as exc: From c34166d7c8d64f065eb05a6447e268a7b7dc3e6e Mon Sep 17 00:00:00 2001 From: flashdagger Date: Mon, 13 Nov 2023 06:54:28 +0100 Subject: [PATCH 14/15] [parsing] support uppercase SCRIPT tags as suggested by github-advanced-security bot --- test/test_parsing.py | 2 +- yt_dlp/parsing.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_parsing.py b/test/test_parsing.py index a7e7ec7d4..0e006298f 100644 --- a/test/test_parsing.py +++ b/test/test_parsing.py @@ -345,7 +345,7 @@ class TestParsing(unittest.TestCase): ^^^^^^^^^^^^^^^^^^^^^^^^^^^------------^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ here is and end ----------------^^^^^^^^^^^----------------^^^^^^^^^^^^^^^^^--------- - + --------^^^^^^^^^^^^^-----------------------------^^^^^^^^^^--------- ''' diff --git a/yt_dlp/parsing.py b/yt_dlp/parsing.py index 256ba8e6c..f4aaf1ac4 100644 --- a/yt_dlp/parsing.py +++ b/yt_dlp/parsing.py @@ -20,7 +20,7 @@ class HTMLIgnoreRanges: if offset in ranges: ... """ - REGEX = re.compile(r'|]*>') + REGEX = re.compile(r'|]*>', flags=re.IGNORECASE) def __init__(self, html): self.html = html From a91d9e1084ca87472b952d189eb897dc8a52fec5 Mon Sep 17 00:00:00 2001 From: flashdagger Date: Mon, 13 Nov 2023 07:14:14 +0100 Subject: [PATCH 15/15] [parsing] support comment end tag '--!>' as suggested by github-advanced-security bot --- test/test_parsing.py | 2 +- yt_dlp/parsing.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/test_parsing.py b/test/test_parsing.py index 0e006298f..9641df91d 100644 --- a/test/test_parsing.py +++ b/test/test_parsing.py @@ -343,7 +343,7 @@ class TestParsing(unittest.TestCase): -----------^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^----------- this is a leftover comment --> and end + here is and end ----------------^^^^^^^^^^^----------------^^^^^^^^^^^^^^^^^--------- --------^^^^^^^^^^^^^-----------------------------^^^^^^^^^^--------- diff --git a/yt_dlp/parsing.py b/yt_dlp/parsing.py index f4aaf1ac4..72d7e448b 100644 --- a/yt_dlp/parsing.py +++ b/yt_dlp/parsing.py @@ -20,7 +20,7 @@ class HTMLIgnoreRanges: if offset in ranges: ... """ - REGEX = re.compile(r'|]*>', flags=re.IGNORECASE) + REGEX = re.compile(r'': + if match_string.startswith('', '--!>'): return offset < self._last_match.start() return offset >= self._last_match.end()