[parsing] replace HTMLCommentRanges with HTMLIgnoreRanges

* ignore matches within CDATA elements and comments
2024-11-13 20:03:17 +00:00 · 2022-11-29 00:25:52 +01:00 · 2022-11-29 00:25:52 +01:00 · 6169b3eca8
parent 29278a3323
commit 6169b3eca8
2 changed files with 43 additions and 53 deletions
--- a/test/test_parsing.py
+++ b/test/test_parsing.py
@@ -4,7 +4,7 @@
 from yt_dlp.compat import compat_HTMLParseError
 from yt_dlp.parsing import (
    MatchingElementParser,
-    HTMLCommentRanges,
+    HTMLIgnoreRanges,
    HTMLTagParser,
 )

@@ -325,26 +325,31 @@ def test_tag_return_order(self):
                 [Tag('t5'), Tag('t6')]],
                [Tag('t7'), Tag('t8')]]))

-    def test_html_comment_ranges(self):
+    def test_html_ignored_ranges(self):
        def mark_comments(_string, char='^', nochar='-'):
-            cmts = HTMLCommentRanges(_string)
+            cmts = HTMLIgnoreRanges(_string)
            return "".join(char if _idx in cmts else nochar for _idx in range(len(_string)))

        html_string = '''
        no              comments         in            this              line
        ---------------------------------------------------------------------
        <!--                 whole line represents a comment              -->
-        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+        ----^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^---
        before <!--                      comment                  -->   after
-        -------^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^--------
+        -----------^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^-----------
+        this is a leftover comment -->     <!-- a new comment without closing
+        ^^^^^^^^^^^^^^^^^^^^^^^^^^^------------^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
        here   is   <!-- a comment -->   and   <!-- another comment -->   end
-        ------------^^^^^^^^^^^^^^^^^^---------^^^^^^^^^^^^^^^^^^^^^^^^------
-        this <!-- nested  <!--     comment    -->  ends here --> and not here
-        -----^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^----------------------------
-        stray --> comment closings --> are ignored <!-- but not <!-- openings
-        -------------------------------------------^^^^^^^^^^^^^^^^^^^^^^^^^^
+        ----------------^^^^^^^^^^^----------------^^^^^^^^^^^^^^^^^---------
+        <script> ignore here </script>            <script> and here </script>
+        --------^^^^^^^^^^^^^-----------------------------^^^^^^^^^^---------
        '''

        lines = textwrap.dedent(html_string).strip().splitlines()
        for line, marker in zip(lines[0::2], lines[1::2]):
            self.assertEqual((line, mark_comments(line)), (line, marker))
+
+        # yet we must be able to match script elements
+        test_string = '''<script type="text/javascript">var foo = 'bar';</script>'''
+        items = get_element_text_and_html_by_tag('script', test_string)
+        self.assertEqual(items, ("var foo = 'bar';", test_string))
--- a/yt_dlp/parsing.py
+++ b/yt_dlp/parsing.py
@@ -8,56 +8,41 @@
 from .utils import orderedSet


-def iter_find(string, sub: str):
-    size = len(sub)
-    idx = -size
-    while True:
-        idx = string.find(sub, idx + size)
-        if idx == -1:
-            return
-        yield idx
+class HTMLIgnoreRanges:
+    """check if an offset is within CDATA content elements (script, style) or XML comments

+        note:
+            * given offsets must be in increasing order
+            * no detection of nested constructs (e.g. comments within script tags)

-class HTMLCommentRanges:
-    """computes the offsets of HTML comments
-
-    comments start with '<!--' and end with the first '-->' encountered
-    note: markers within quotes are not ignored
+        usage:
+            ranges = HTMLIgnoreRanges(html)
+            if offset in ranges:
+                ...
    """
+    REGEX = re.compile(r'<!--|-->|</?\s*(?:script|style)\b[^>]*>')

    def __init__(self, html):
-        self._range_iter = self.ranges(html)
-        self._range = next(self._range_iter, None)
-        self._last_offset = 0
-
-    @staticmethod
-    def ranges(string, sopen='<!--', sclose='-->'):
-        assert not (sopen.startswith(sclose) or sclose.startswith(sopen))
-        open_iter = iter_find(string, sopen)
-        close_len = len(sclose)
-        close_iter = (idx + close_len for idx in iter_find(string, sclose))
-        next_open = next(open_iter, None)
-        next_close = next(close_iter, None)
-
-        while True:
-            if next_open is None:
-                return
-            while next_close is not None and next_open > next_close:
-                next_close = next(close_iter, None)
-            yield slice(next_open, next_close)
-            if next_close is None:
-                return
-            while next_open is not None and next_open < next_close:
-                next_open = next(open_iter, None)
+        self.html = html
+        self._last_match = None
+        self._final = False

    def __contains__(self, offset):
        assert isinstance(offset, int)
-        assert offset >= self._last_offset, 'offset must be in increasing order'
-        self._last_offset = offset
-        while self._range and self._range.stop is not None and offset >= self._range.stop:
-            self._range = next(self._range_iter, None)

-        return not (self._range is None or offset < self._range.start)
+        if not self._final and (self._last_match is None or offset >= self._last_match.end()):
+            match = self.REGEX.search(self.html, offset)
+            if match:
+                self._last_match = match
+            else:
+                self._final = True
+
+        if self._last_match is None:
+            return False
+        match_string = self._last_match.group()
+        if match_string.startswith('</') or match_string == '-->':
+            return offset < self._last_match.start()
+        return offset >= self._last_match.end()


 class HTMLTagParser(HTMLParser):
@@ -267,10 +252,10 @@ def matching_tag_regex(tag, attribute, value_regex, escape=True):

    @classmethod
    def iter_tags(cls, regex, html, *, matchfunc):
-        comments = HTMLCommentRanges(html)
+        ignored = HTMLIgnoreRanges(html)
        parser = cls(matchfunc)
        for match in re.finditer(regex, html):
-            if match.start() not in comments:
+            if match.start() not in ignored:
                yield from parser.taglist(html[match.start():], reset=True)

    @classmethod