[parsing] replace HTMLCommentRanges with HTMLIgnoreRanges

* ignore matches within CDATA content elements (script, style) and comments
2024-11-14 12:23:19 +00:00 · 2022-11-29 00:25:52 +01:00 · 2022-11-29 00:25:52 +01:00 · 6169b3eca8
parent 29278a3323
commit 6169b3eca8
2 changed files with 43 additions and 53 deletions
--- a/test/test_parsing.py
+++ b/test/test_parsing.py
@ -4,7 +4,7 @@
 from yt_dlp.compat import compat_HTMLParseError
 from yt_dlp.parsing import (
    MatchingElementParser,
-    HTMLCommentRanges,
+    HTMLIgnoreRanges,
    HTMLTagParser,
 )
@ -325,26 +325,31 @@ def test_tag_return_order(self):
                 [Tag('t5'), Tag('t6')]],
                [Tag('t7'), Tag('t8')]]))
-    def test_html_comment_ranges(self):
+    def test_html_ignored_ranges(self):
        def mark_comments(_string, char='^', nochar='-'):
-            cmts = HTMLCommentRanges(_string)
+            cmts = HTMLIgnoreRanges(_string)
            return "".join(char if _idx in cmts else nochar for _idx in range(len(_string)))
        html_string = '''
        no              comments         in            this              line
        ---------------------------------------------------------------------
        <!--                 whole line represents a comment              -->
-        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+        ----^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^---
        before <!--                      comment                  -->   after
-        -------^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^--------
+        -----------^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^-----------
        this is a leftover comment -->     <!-- a new comment without closing
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^------------^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
        here   is   <!-- a comment -->   and   <!-- another comment -->   end
-        ------------^^^^^^^^^^^^^^^^^^---------^^^^^^^^^^^^^^^^^^^^^^^^------
+        ----------------^^^^^^^^^^^----------------^^^^^^^^^^^^^^^^^---------
-        this <!-- nested  <!--     comment    -->  ends here --> and not here
+        <script> ignore here </script>            <script> and here </script>
-        -----^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^----------------------------
+        --------^^^^^^^^^^^^^-----------------------------^^^^^^^^^^---------
        stray --> comment closings --> are ignored <!-- but not <!-- openings
        -------------------------------------------^^^^^^^^^^^^^^^^^^^^^^^^^^
        '''
        lines = textwrap.dedent(html_string).strip().splitlines()
        for line, marker in zip(lines[0::2], lines[1::2]):
            self.assertEqual((line, mark_comments(line)), (line, marker))
        # yet we must be able to match script elements
        test_string = '''<script type="text/javascript">var foo = 'bar';</script>'''
        items = get_element_text_and_html_by_tag('script', test_string)
        self.assertEqual(items, ("var foo = 'bar';", test_string))
--- a/yt_dlp/parsing.py
+++ b/yt_dlp/parsing.py
@ -8,56 +8,41 @@
 from .utils import orderedSet
-def iter_find(string, sub: str):
+class HTMLIgnoreRanges:
-    size = len(sub)
+    """check if an offset is within CDATA content elements (script, style) or XML comments
    idx = -size
    while True:
        idx = string.find(sub, idx + size)
        if idx == -1:
            return
        yield idx
        note:
            * given offsets must be in increasing order
            * no detection of nested constructs (e.g. comments within script tags)
-class HTMLCommentRanges:
+        usage:
-    """computes the offsets of HTML comments
+            ranges = HTMLIgnoreRanges(html)
-
+            if offset in ranges:
-    comments start with '<!--' and end with the first '-->' encountered
+                ...
    note: markers within quotes are not ignored
    """
    REGEX = re.compile(r'<!--|-->|</?\s*(?:script|style)\b[^>]*>')
    def __init__(self, html):
-        self._range_iter = self.ranges(html)
+        self.html = html
-        self._range = next(self._range_iter, None)
+        self._last_match = None
-        self._last_offset = 0
+        self._final = False
    @staticmethod
    def ranges(string, sopen='<!--', sclose='-->'):
        assert not (sopen.startswith(sclose) or sclose.startswith(sopen))
        open_iter = iter_find(string, sopen)
        close_len = len(sclose)
        close_iter = (idx + close_len for idx in iter_find(string, sclose))
        next_open = next(open_iter, None)
        next_close = next(close_iter, None)
        while True:
            if next_open is None:
                return
            while next_close is not None and next_open > next_close:
                next_close = next(close_iter, None)
            yield slice(next_open, next_close)
            if next_close is None:
                return
            while next_open is not None and next_open < next_close:
                next_open = next(open_iter, None)
    def __contains__(self, offset):
        assert isinstance(offset, int)
        assert offset >= self._last_offset, 'offset must be in increasing order'
        self._last_offset = offset
        while self._range and self._range.stop is not None and offset >= self._range.stop:
            self._range = next(self._range_iter, None)
-        return not (self._range is None or offset < self._range.start)
+        if not self._final and (self._last_match is None or offset >= self._last_match.end()):
            match = self.REGEX.search(self.html, offset)
            if match:
                self._last_match = match
            else:
                self._final = True
        if self._last_match is None:
            return False
        match_string = self._last_match.group()
        if match_string.startswith('</') or match_string == '-->':
            return offset < self._last_match.start()
        return offset >= self._last_match.end()
 class HTMLTagParser(HTMLParser):
@ -267,10 +252,10 @@ def matching_tag_regex(tag, attribute, value_regex, escape=True):
    @classmethod
    def iter_tags(cls, regex, html, *, matchfunc):
-        comments = HTMLCommentRanges(html)
+        ignored = HTMLIgnoreRanges(html)
        parser = cls(matchfunc)
        for match in re.finditer(regex, html):
-            if match.start() not in comments:
+            if match.start() not in ignored:
                yield from parser.taglist(html[match.start():], reset=True)
    @classmethod