diff --git a/test/test_parsing.py b/test/test_parsing.py new file mode 100644 index 000000000..9641df91d --- /dev/null +++ b/test/test_parsing.py @@ -0,0 +1,359 @@ +import textwrap +import unittest + +from yt_dlp.compat import compat_HTMLParseError +from yt_dlp.parsing import ( + MatchingElementParser, + HTMLIgnoreRanges, + HTMLTagParser, +) + +extract_attributes = MatchingElementParser.extract_attributes +get_element_by_attribute = MatchingElementParser.get_element_by_attribute +get_element_by_class = MatchingElementParser.get_element_by_class +get_element_html_by_attribute = MatchingElementParser.get_element_html_by_attribute +get_element_html_by_class = MatchingElementParser.get_element_html_by_class +get_element_text_and_html_by_tag = MatchingElementParser.get_element_text_and_html_by_tag +get_elements_by_attribute = MatchingElementParser.get_elements_by_attribute +get_elements_by_class = MatchingElementParser.get_elements_by_class +get_elements_html_by_attribute = MatchingElementParser.get_elements_html_by_attribute +get_elements_html_by_class = MatchingElementParser.get_elements_html_by_class +get_elements_text_and_html_by_attribute = MatchingElementParser.get_elements_text_and_html_by_attribute +get_elements_text_and_html_by_tag = MatchingElementParser.get_elements_text_and_html_by_tag + + +class TestParsing(unittest.TestCase): + def test_extract_attributes(self): + self.assertEqual(extract_attributes(''), {'x': 'y'}) + self.assertEqual(extract_attributes(""), {'x': 'y'}) + self.assertEqual(extract_attributes(''), {'x': 'y'}) + self.assertEqual(extract_attributes(''), {'x': "a 'b' c"}) + self.assertEqual(extract_attributes(''), {'x': 'a "b" c'}) + self.assertEqual(extract_attributes(''), {'x': 'y'}) + self.assertEqual(extract_attributes(''), {'x': 'y'}) + self.assertEqual(extract_attributes(''), {'x': '&'}) # XML + self.assertEqual(extract_attributes(''), {'x': '"'}) + self.assertEqual(extract_attributes(''), {'x': '£'}) # HTML 3.2 + 
self.assertEqual(extract_attributes(''), {'x': 'λ'}) # HTML 4.0 + self.assertEqual(extract_attributes(''), {'x': '&foo'}) + self.assertEqual(extract_attributes(''), {'x': "'"}) + self.assertEqual(extract_attributes(''), {'x': '"'}) + self.assertEqual(extract_attributes(''), {'x': None}) + self.assertEqual(extract_attributes(''), {'x': 'y', 'a': None}) + self.assertEqual(extract_attributes(''), {'x': 'y'}) + self.assertEqual(extract_attributes(''), {'y': '2', 'x': '3'}) + self.assertEqual(extract_attributes(''), {'x': 'y'}) + self.assertEqual(extract_attributes(''), {'x': 'y'}) + self.assertEqual(extract_attributes(""), {'x': 'y'}) + self.assertEqual(extract_attributes(''), {'x': '\ny\n'}) + self.assertEqual(extract_attributes(''), {'caps': 'x'}) # Names lowercased + self.assertEqual(extract_attributes(''), {'x': '2'}) + self.assertEqual(extract_attributes(''), {'x': '2'}) + self.assertEqual(extract_attributes(''), {'_:funny-name1': '1'}) + self.assertEqual(extract_attributes(''), {'x': 'Fáilte 世界 \U0001f600'}) + self.assertEqual(extract_attributes(''), {'x': 'décompose\u0301'}) + # "Narrow" Python builds don't support unicode code points outside BMP. + try: + chr(0x10000) + supports_outside_bmp = True + except ValueError: + supports_outside_bmp = False + if supports_outside_bmp: + self.assertEqual(extract_attributes(''), {'x': 'Smile \U0001f600!'}) + # Malformed HTML should not break attributes extraction on older Python + self.assertEqual(extract_attributes(''), {}) + + GET_ELEMENT_BY_CLASS_TEST_STRING = ''' + nice +
also nice
+ ''' + + def test_get_element_by_class(self): + html = self.GET_ELEMENT_BY_CLASS_TEST_STRING + + self.assertEqual(get_element_by_class('foo', html), 'nice') + self.assertEqual(get_element_by_class('no-such-class', html), None) + + def test_get_element_html_by_class(self): + html = self.GET_ELEMENT_BY_CLASS_TEST_STRING + + self.assertEqual(get_element_html_by_class('foo', html), + 'nice') + self.assertEqual(get_element_by_class('no-such-class', html), None) + + GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING = ''' + + ''' + + def test_get_element_by_attribute(self): + html = self.GET_ELEMENT_BY_CLASS_TEST_STRING + + self.assertEqual(get_element_by_attribute('class', 'foo bar', html), 'nice') + self.assertEqual(get_element_by_attribute('class', 'foo', html), None) + self.assertEqual(get_element_by_attribute('class', 'no-such-foo', html), None) + self.assertEqual(get_element_by_attribute('class', 'foo bar', html, tag='div'), 'also nice') + + html = self.GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING + + self.assertEqual(get_element_by_attribute('itemprop', 'author', html), 'foo') + + def test_get_element_html_by_attribute(self): + html = self.GET_ELEMENT_BY_CLASS_TEST_STRING + + self.assertEqual(get_element_html_by_attribute('class', 'foo bar', html), + 'nice') + self.assertEqual(get_element_html_by_attribute('class', 'foo', html), None) + self.assertEqual(get_element_html_by_attribute('class', 'no-such-foo', html), None) + + html = self.GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING + + self.assertEqual(get_element_html_by_attribute('itemprop', 'author', html), html.strip()) + + GET_ELEMENTS_BY_CLASS_TEST_STRING = ''' + nice + also nice + ''' + GET_ELEMENTS_BY_CLASS_RES = [ + 'nice', + 'also nice' + ] + + def test_get_elements_by_class(self): + html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING + + self.assertEqual(get_elements_by_class('foo', html), ['nice', 'also nice']) + self.assertEqual(get_elements_by_class('no-such-class', html), []) + + def test_get_elements_html_by_class(self): + html = 
self.GET_ELEMENTS_BY_CLASS_TEST_STRING + + self.assertEqual(get_elements_html_by_class('foo', html), self.GET_ELEMENTS_BY_CLASS_RES) + self.assertEqual(get_elements_html_by_class('no-such-class', html), []) + + def test_get_elements_by_attribute(self): + html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING + + self.assertEqual(get_elements_by_attribute('class', 'foo bar', html), ['nice', 'also nice']) + self.assertEqual(get_elements_by_attribute('class', 'foo', html), []) + self.assertEqual(get_elements_by_attribute('class', 'no-such-foo', html), []) + + def test_get_elements_html_by_attribute(self): + html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING + + self.assertEqual(get_elements_html_by_attribute('class', 'foo bar', html), + self.GET_ELEMENTS_BY_CLASS_RES) + self.assertEqual(get_elements_html_by_attribute('class', 'foo', html), []) + self.assertEqual(get_elements_html_by_attribute('class', 'no-such-foo', html), []) + + def test_get_elements_text_and_html_by_attribute(self): + html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING + + self.assertEqual( + get_elements_text_and_html_by_attribute('class', 'foo bar', html), + list(zip(['nice', 'also nice'], self.GET_ELEMENTS_BY_CLASS_RES))) + self.assertEqual(get_elements_text_and_html_by_attribute('class', 'foo', html), []) + self.assertEqual(get_elements_text_and_html_by_attribute('class', 'no-such-foo', html), []) + + self.assertEqual(get_elements_text_and_html_by_attribute( + 'class', 'foo', 'nicenot nice', tag='a'), + [('nice', 'nice')]) + + def test_get_element_text_and_html_by_tag(self): + get_element_by_tag_test_string = ''' + random text lorem ipsum

+
+ this should be returned + this should also be returned +
+ this should also be returned +
+ closing tag above should not trick, so this should also be returned +
+ but this text should not be returned + ''' + html = textwrap.indent(textwrap.dedent(get_element_by_tag_test_string), ' ' * 4) + get_element_by_tag_res_outerdiv_html = html.strip()[32:276] + get_element_by_tag_res_outerdiv_text = get_element_by_tag_res_outerdiv_html[5:-6] + get_element_by_tag_res_innerspan_html = html.strip()[78:119] + get_element_by_tag_res_innerspan_text = get_element_by_tag_res_innerspan_html[6:-7] + + self.assertEqual( + get_element_text_and_html_by_tag('div', html), + (get_element_by_tag_res_outerdiv_text, get_element_by_tag_res_outerdiv_html)) + self.assertEqual( + get_element_text_and_html_by_tag('span', html), + (get_element_by_tag_res_innerspan_text, get_element_by_tag_res_innerspan_html)) + self.assertIsNone(get_element_text_and_html_by_tag('article', html)) + + def test_get_elements_text_and_html_by_tag(self): + class StrictParser(MatchingElementParser): + STRICT = True + + test_string = ''' + + + ignore + ''' + items = get_elements_text_and_html_by_tag('img', test_string) + self.assertEqual(items, [('', ''), ('', '')]) + + self.assertEqual( + StrictParser.get_element_text_and_html_by_tag('use', ''), + ('', '')) + + def test_get_element_text_and_html_by_tag_malformed(self): + inner_text = 'inner text' + malnested_elements = f'{inner_text}' + commented_html = '' + outerdiv_html = f'
{malnested_elements}
' + html = f'{commented_html}{outerdiv_html}' + + self.assertEqual( + get_element_text_and_html_by_tag('div', html), (malnested_elements, outerdiv_html)) + self.assertEqual( + get_element_text_and_html_by_tag('malnested_a', html), + (f'{inner_text}', + f'{inner_text}')) + self.assertEqual( + get_element_text_and_html_by_tag('malnested_b', html), + (f'{inner_text}', + f'{inner_text}')) + self.assertEqual( + get_element_text_and_html_by_tag('orphan', f'{html}'), ('', '')) + self.assertIsNone(get_element_text_and_html_by_tag('orphan', f'{html}')) + + # ignore case on tags + ci_html = f'{html}' + self.assertEqual(get_element_text_and_html_by_tag('span', ci_html), (html, ci_html)) + + def test_strict_html_parsing(self): + class StrictTagParser(HTMLTagParser): + STRICT = True + + parser = StrictTagParser() + with self.assertRaisesRegex(compat_HTMLParseError, "stray closing tag 'p'"): + parser.taglist('

', reset=True) + with self.assertRaisesRegex(compat_HTMLParseError, "unclosed tag 'p', 'div'"): + parser.taglist('

', reset=True) + with self.assertRaisesRegex(compat_HTMLParseError, "malnested closing tag 'div', expected after '

'"): + parser.taglist('

', reset=True) + with self.assertRaisesRegex(compat_HTMLParseError, "malnested closing tag 'div', expected after '

'"): + parser.taglist('

/p>

', reset=True) + with self.assertRaisesRegex(compat_HTMLParseError, "malformed closing tag 'p<<'"): + parser.taglist('

', reset=True) + with self.assertRaisesRegex(compat_HTMLParseError, "stray closing tag 'img'"): + parser.taglist('must be empty', reset=True) + + def test_relaxed_html_parsing(self): + Tag = HTMLTagParser.Tag + parser = HTMLTagParser() + + self.assertEqual(parser.taglist('

', reset=True), []) + + tags = parser.taglist('

', reset=True) + self.assertEqual(tags, [Tag('div'), Tag('p')]) + self.assertEqual(tags[0].text_and_html(), ('', '

')) + self.assertEqual(tags[1].text_and_html(), ('', '

')) + + tags = parser.taglist('

', reset=True) + self.assertEqual(tags, [Tag('div'), Tag('p')]) + self.assertEqual(tags[0].text_and_html(), ('

', '

')) + self.assertEqual(tags[1].text_and_html(), ('
', '

')) + + tags = parser.taglist('

/p>

', reset=True) + self.assertEqual(tags, [Tag('div'), Tag('p')]) + self.assertEqual(tags[0].text_and_html(), ('

/p>', '

/p>

')) + self.assertEqual(tags[1].text_and_html(), ('', '

')) + + tags = parser.taglist('

paragraph

', reset=True) + self.assertEqual(tags, [Tag('div'), Tag('p')]) + self.assertEqual(tags[0].text_and_html(), + ('

paragraph', '

paragraph

')) + self.assertEqual(tags[1].text_and_html(), ('paragraph', '

paragraph')) + + tags = parser.taglist('must be empty', reset=True) + self.assertEqual(tags, [Tag('img')]) + self.assertEqual(tags[0].text_and_html(), ('', '')) + + def test_compliant_html_parsing(self): + # certain elements don't need to be closed (see HTMLTagParser.VOID_TAGS) + Tag = HTMLTagParser.Tag + html = ''' + no error without closing tag: + self closing is ok: + ''' + parser = HTMLTagParser() + tags = parser.taglist(html, reset=True) + self.assertEqual(tags, [Tag('img'), Tag('img')]) + + # don't get fooled by '>' in attributes + html = '''''' + tags = parser.taglist(html, reset=True) + self.assertEqual(tags[0].text_and_html(), ('', html)) + + def test_tag_return_order(self): + Tag = HTMLTagParser.Tag + html = ''' + + + + + + + + + + + + + + ''' + parser = HTMLTagParser() + tags = parser.taglist(html, reset=True) + self.assertEqual( + str(tags), str([Tag('t0'), Tag('t1'), Tag('t2'), Tag('t3'), Tag('t4'), + Tag('t5'), Tag('t6'), Tag('t7'), Tag('t8')])) + + tags = parser.taglist(html, reset=True, depth_first=True) + self.assertEqual( + str(tags), str([Tag('t3'), Tag('t4'), Tag('t2'), Tag('t1'), Tag('t6'), + Tag('t5'), Tag('t0'), Tag('t8'), Tag('t7')])) + + # return tags in nested order + tags = parser.taglist(html, reset=True, depth_first=None) + self.assertEqual( + str(tags), str([ + [Tag('t0'), + [Tag('t1'), + [Tag('t2'), [Tag('t3')], [Tag('t4')]]], + [Tag('t5'), [Tag('t6')]]], + [Tag('t7'), [Tag('t8')]]])) + + def test_html_ignored_ranges(self): + def mark_comments(_string, char='^', nochar='-'): + cmts = HTMLIgnoreRanges(_string) + return "".join(char if _idx in cmts else nochar for _idx in range(len(_string))) + + html_string = ''' + no comments in this line + --------------------------------------------------------------------- + + ----^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^--- + before after + -----------^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^----------- + this is a leftover comment --> and end + 
----------------^^^^^^^^^^^----------------^^^^^^^^^^^^^^^^^--------- + + --------^^^^^^^^^^^^^-----------------------------^^^^^^^^^^--------- + ''' + + lines = textwrap.dedent(html_string).strip().splitlines() + for line, marker in zip(lines[0::2], lines[1::2]): + self.assertEqual((line, mark_comments(line)), (line, marker)) + + # yet we must be able to match script elements + test_string = '''''' + items = get_element_text_and_html_by_tag('script', test_string) + self.assertEqual(items, ("var foo = 'bar';", test_string)) diff --git a/yt_dlp/parsing.py b/yt_dlp/parsing.py new file mode 100644 index 000000000..72d7e448b --- /dev/null +++ b/yt_dlp/parsing.py @@ -0,0 +1,348 @@ +import collections +import contextlib +import itertools +import re +from html.parser import HTMLParser + +from .compat import compat_HTMLParseError +from .utils import orderedSet + + +class HTMLIgnoreRanges: + """check if an offset is within CDATA content elements (script, style) or XML comments + + note: + * given offsets must be in increasing order + * no detection of nested constructs (e.g. comments within script tags) + + usage: + ranges = HTMLIgnoreRanges(html) + if offset in ranges: + ... 
class HTMLTagParser(HTMLParser):
    """HTML parser which returns found elements as instances of 'Tag'

    When STRICT=True, compat_HTMLParseError() is raised for malformed,
    stray, malnested and unclosed closing tags; otherwise these are tolerated.

    Subclasses may override predicate() to select which opening tags are
    turned into Tag objects and callback() to react when such a tag is closed.

    usage:
        parser = HTMLTagParser()
        for tag_obj in parser.taglist(html):
            tag_obj.text_and_html()

    """

    STRICT = False
    ANY_TAG_REGEX = re.compile(r'''<(?:"[^"]*"|'[^']*'|[^"'>])*?>''')
    # elements which are treated as closed as soon as they are opened
    VOID_TAGS = {
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr',
    }

    class Tag:
        """An element of the parsed document

        'string' is the text the element was found in; the open and close
        ranges are slices into it locating the opening and the closing tag.
        """
        __slots__ = 'name', 'string', 'attrs', '_openrange', '_closerange'

        def __init__(self, name, *, string='', attrs=()):
            self.name = name
            self.string = string
            self.attrs = tuple(attrs)
            self._openrange = None
            self._closerange = None

        def __str__(self):
            return self.name

        def __repr__(self):
            return f'{self.__class__.__name__}({str(self)!r})'

        def __eq__(self, other):
            # compared by name only, so a Tag also equals its plain tag name;
            # note: defining __eq__ without __hash__ leaves Tag unhashable
            return self.name == other

        def openrange(self, offset, startlen=0):
            """ set the range of the opening tag from a slice or from offset and length """
            if isinstance(offset, slice):
                self._openrange = offset
            else:
                self._openrange = slice(offset, offset + startlen)

        def closerange(self, offset, stoplen=0):
            """ set the range of the closing tag from a slice or from offset and length """
            if isinstance(offset, slice):
                self._closerange = offset
            else:
                self._closerange = slice(offset, offset + stoplen)

        def opentag(self):
            """ return the opening tag or '' if its range is unknown """
            return self.string[self._openrange] if self._openrange else ''

        def html(self):
            """ return the element including its tags (only the opening tag if never closed) """
            if not self._openrange:
                return ''
            if self._closerange:
                return self.string[self._openrange.start:self._closerange.stop]
            return self.string[self._openrange]

        def text(self):
            """ return the content between opening and closing tag or '' if either is missing """
            if self._openrange and self._closerange:
                return self.string[self._openrange.stop:self._closerange.start]
            return ''

        def text_and_html(self):
            """ return the tuple (text(), html()) """
            return self.text(), self.html()

    class AbortException(Exception):
        """ raised from predicate() or callback() to stop parsing; suppressed by taglist() """

    def __init__(self):
        # both must exist before HTMLParser.__init__() calls reset()
        self.tagstack = collections.deque()
        self._nestedtags = [[]]
        super().__init__()
        self._offset = self.offset

    def predicate(self, tag, attrs):
        """ return True for every encountered opening tag that should be processed """
        return True

    def callback(self, tag_obj):
        """ this will be called when the requested tag is closed """

    def reset(self):
        """ reset the parser including the open tag stack and the collected tags """
        super().reset()
        self.tagstack.clear()
        # also drop tags collected by a parse which was aborted by an exception,
        # otherwise they would show up in the result of the next taglist() call
        self._nestedtags = [[]]
        self._offset = self.offset

    def taglist(self, data, reset=True, depth_first=False):
        """ parse data and return found tag objects
        @param data: html string
        @param reset: reset state
        @param depth_first: return order: as opened (False), as closed (True), nested (None)
        @return: list of Tag objects
        @raise compat_HTMLParseError: if STRICT and tags are left unclosed
        """
        def flatten(_list, first=True):
            # every nested list starts with its own tag followed by its children;
            # for depth first order the children are yielded before the tag itself
            rlist = _list if first or not depth_first else itertools.chain(_list[1:], _list[:1])
            for item in rlist:
                if isinstance(item, list):
                    yield from flatten(item, first=False)
                else:
                    yield item

        if reset:
            self.reset()
        with contextlib.suppress(HTMLTagParser.AbortException):
            self.feed(data)
        if self.STRICT and self.tagstack:
            orphans = ', '.join(map(repr, map(str, orderedSet(self.tagstack, lazy=True))))
            raise compat_HTMLParseError(f'unclosed tag {orphans}')
        taglist = self._nestedtags[0] if depth_first is None else list(flatten(self._nestedtags[0]))
        self._nestedtags = [[]]
        return taglist

    def updatepos(self, i, j):
        # remember the offset the parser advanced to, handlers use it to locate their tag
        offset = self._offset = super().updatepos(i, j)
        return offset

    def handle_starttag(self, tag, attrs):
        # public accessor for the text of the most recently opened start tag
        tag_text = self.get_starttag_text()
        if tag_text is None:
            # fall back to matching the tag in place (no copy of the remaining data)
            found = self.ANY_TAG_REGEX.match(self.rawdata, self._offset)
            tag_text = found.group() if found else ''

        tag_obj = tag
        tag_is_open = not (tag_text.endswith('/>') or tag in self.VOID_TAGS)
        if self.predicate(tag, attrs):
            tag_obj = self.Tag(tag, string=self.rawdata, attrs=attrs)
            tag_obj.openrange(self._offset, len(tag_text))
            nesting = [tag_obj]
            self._nestedtags[-1].append(nesting)
            if tag_is_open:
                self._nestedtags.append(nesting)
            else:
                self.callback(tag_obj)
        if tag_is_open:
            # unprocessed tags are tracked by name so their closing tags can be paired
            self.tagstack.appendleft(tag_obj)

    handle_startendtag = handle_starttag

    def handle_endtag(self, tag):
        if '<' in tag:
            if self.STRICT:
                raise compat_HTMLParseError(f'malformed closing tag {tag!r}')
            tag = tag[:tag.index('<')]

        # only the lookup is guarded, a ValueError raised by callback() must not
        # be mistaken for a stray closing tag
        try:
            idx = self.tagstack.index(tag)
        except ValueError as exc:
            if self.STRICT:
                raise compat_HTMLParseError(f'stray closing tag {tag!r}') from exc
            return

        if self.STRICT and idx:
            open_tags = ''.join(
                f'</{open_tag}>' for open_tag in itertools.islice(self.tagstack, idx))
            raise compat_HTMLParseError(
                f'malnested closing tag {tag!r}, expected after {open_tags!r}')

        tag_obj = self.tagstack[idx]
        del self.tagstack[idx]
        if isinstance(tag_obj, self.Tag):
            close_end = self.rawdata.find('>', self._offset) + 1
            if not close_end:
                # no terminating '>' found, let the closing tag extend to the end of data
                close_end = len(self.rawdata)
            tag_obj.closerange(slice(self._offset, close_end))
            # the root list must survive: taglist() starts a new root while tags
            # opened by a previous call with reset=False may still be on the stack
            if len(self._nestedtags) > 1:
                self._nestedtags.pop()
            self.callback(tag_obj)
+ \s{re.escape(attribute)}\s*=\s*(?P<_q>['"])(?-x:{value_regex})(?P=_q) + ''' + + @classmethod + def iter_tags(cls, regex, html, *, matchfunc): + ignored = HTMLIgnoreRanges(html) + parser = cls(matchfunc) + for match in re.finditer(regex, html): + if match.start() not in ignored: + yield from parser.taglist(html[match.start():], reset=True) + + @classmethod + def tags_by_name(cls, tag, html): + def matchfunc(tag_str, _attrs): + return tag_str == tag + + tag_regex = rf'''<\s*(?i:{re.escape(tag)})(?:\s(?:[^>"'\\]|"[^"\\]*"|'[^'\\]*')*)?>''' + yield from cls.iter_tags(tag_regex, html, matchfunc=matchfunc) + + @classmethod + def tags_by_attribute(cls, attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True): + def matchfunc(_tag_str, attrs): + return any(attr == attribute and re.fullmatch(value, value_str) + for attr, value_str in attrs) + + tag_regex = cls.matching_tag_regex(tag, attribute, value, escape_value) + yield from cls.iter_tags(tag_regex, html, matchfunc=matchfunc) + + @classmethod + def extract_attributes(cls, html): + attr_dict = {} + + def matchfunc(_tag, attrs): + attr_dict.update(attrs) + raise cls.AbortException() + + with contextlib.suppress(cls.AbortException): + cls(matchfunc).feed(html) + + return attr_dict + + @classmethod + def get_elements_text_and_html_by_tag(cls, tag, html): + return [tag.text_and_html() for tag in cls.tags_by_name(tag, html)] + + @classmethod + def get_element_text_and_html_by_tag(cls, tag, html): + tag = next(cls.tags_by_name(tag, html), None) + return tag and tag.text_and_html() + + @classmethod + def get_elements_text_and_html_by_attribute(cls, *args, **kwargs): + return [tag.text_and_html() for tag in cls.tags_by_attribute(*args, **kwargs)] + + @classmethod + def get_elements_by_attribute(cls, *args, **kwargs): + return [tag.text() for tag in cls.tags_by_attribute(*args, **kwargs)] + + @classmethod + def get_elements_html_by_attribute(cls, *args, **kwargs): + return [tag.html() for tag in 
cls.tags_by_attribute(*args, **kwargs)] + + @classmethod + def get_element_by_attribute(cls, *args, **kwargs): + tag = next(cls.tags_by_attribute(*args, **kwargs), None) + return tag and tag.text() + + @classmethod + def get_element_html_by_attribute(cls, *args, **kwargs): + tag = next(cls.tags_by_attribute(*args, **kwargs), None) + return tag and tag.html() + + @classmethod + def get_elements_by_class(cls, class_name, html): + value = cls.class_value_regex(class_name) + return [tag.text() for tag + in cls.tags_by_attribute('class', value, html, escape_value=False)] + + @classmethod + def get_elements_html_by_class(cls, class_name, html): + value = cls.class_value_regex(class_name) + return [tag.html() for tag + in cls.tags_by_attribute('class', value, html, escape_value=False)] + + @classmethod + def get_elements_text_and_html_by_class(cls, class_name, html): + value = cls.class_value_regex(class_name) + return [tag.text_and_html() for tag + in cls.tags_by_attribute('class', value, html, escape_value=False)] + + @classmethod + def get_element_html_by_class(cls, class_name, html): + value = cls.class_value_regex(class_name) + tag = next(cls.tags_by_attribute('class', value, html, escape_value=False), None) + return tag and tag.html() + + @classmethod + def get_element_by_class(cls, class_name, html): + value = cls.class_value_regex(class_name) + tag = next(cls.tags_by_attribute('class', value, html, escape_value=False), None) + return tag and tag.text() diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index b63766912..666f69685 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -395,17 +395,13 @@ class HTMLBreakOnClosingTagParser(html.parser.HTMLParser): pass def handle_starttag(self, tag, _): - self.tagstack.append(tag) + self.tagstack.appendleft(tag) def handle_endtag(self, tag): if not self.tagstack: raise compat_HTMLParseError('no tags in the stack') - while self.tagstack: - inner_tag = self.tagstack.pop() - if inner_tag == tag: - 
break - else: - raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found') + with contextlib.suppress(ValueError): + self.tagstack.remove(tag) if not self.tagstack: raise self.HTMLBreakOnClosingTagException() @@ -439,6 +435,8 @@ def get_element_text_and_html_by_tag(tag, html): next_closing_tag_end = next_closing_tag_start + len(closing_tag) try: parser.feed(html[offset:offset + next_closing_tag_end]) + if tag not in parser.tagstack: + raise HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException() offset += next_closing_tag_end except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException: return html[content_start:offset + next_closing_tag_start], \