[parsing] replace HTMLCommentRanges with HTMLIgnoreRanges

* ignore matches within CDATA elements and comments
This commit is contained in:
Marcel 2022-11-29 00:25:52 +01:00
parent 29278a3323
commit 6169b3eca8
No known key found for this signature in database
GPG key ID: 7813C97693AD6AAE
2 changed files with 43 additions and 53 deletions

View file

@ -4,7 +4,7 @@
from yt_dlp.compat import compat_HTMLParseError
from yt_dlp.parsing import (
MatchingElementParser,
HTMLCommentRanges,
HTMLIgnoreRanges,
HTMLTagParser,
)
@ -325,26 +325,31 @@ def test_tag_return_order(self):
[Tag('t5'), Tag('t6')]],
[Tag('t7'), Tag('t8')]]))
# NOTE(review): this span appears to be a diff rendering -- two `def` headers,
# two `cmts = ...` assignments and, for some fixture rows, two marker lines are
# present side by side (HTMLCommentRanges vs. HTMLIgnoreRanges variants).
def test_html_comment_ranges(self):
def test_html_ignored_ranges(self):
# Helper: build a string of the same length as `_string` holding `char` at every
# offset the ranges object reports as contained and `nochar` everywhere else.
# Offsets are probed in increasing order (range(len(_string))).
def mark_comments(_string, char='^', nochar='-'):
cmts = HTMLCommentRanges(_string)
cmts = HTMLIgnoreRanges(_string)
return "".join(char if _idx in cmts else nochar for _idx in range(len(_string)))
# Fixture: input lines followed by their expected marker lines
# ('^' = offset reported as ignored, '-' = offset not ignored).
html_string = '''
no comments in this line
---------------------------------------------------------------------
<!-- whole line represents a comment -->
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
----^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^---
before <!-- comment --> after
-------^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^--------
-----------^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^-----------
this is a leftover comment --> <!-- a new comment without closing
^^^^^^^^^^^^^^^^^^^^^^^^^^^------------^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
here is <!-- a comment --> and <!-- another comment --> end
------------^^^^^^^^^^^^^^^^^^---------^^^^^^^^^^^^^^^^^^^^^^^^------
this <!-- nested <!-- comment --> ends here --> and not here
-----^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^----------------------------
stray --> comment closings --> are ignored <!-- but not <!-- openings
-------------------------------------------^^^^^^^^^^^^^^^^^^^^^^^^^^
----------------^^^^^^^^^^^----------------^^^^^^^^^^^^^^^^^---------
<script> ignore here </script> <script> and here </script>
--------^^^^^^^^^^^^^-----------------------------^^^^^^^^^^---------
'''
# Dedent and strip the fixture, then pair every even-indexed line (input) with
# the odd-indexed line after it (expected markers). The input line is included
# in both tuples so a failing assertion shows which line did not match.
lines = textwrap.dedent(html_string).strip().splitlines()
for line, marker in zip(lines[0::2], lines[1::2]):
self.assertEqual((line, mark_comments(line)), (line, marker))
# yet we must be able to match script elements
test_string = '''<script type="text/javascript">var foo = 'bar';</script>'''
items = get_element_text_and_html_by_tag('script', test_string)
self.assertEqual(items, ("var foo = 'bar';", test_string))

View file

@ -8,56 +8,41 @@
from .utils import orderedSet
def iter_find(string, sub: str):
    """Yield the start offset of each non-overlapping occurrence of *sub*.

    The search runs left to right; after a hit it resumes directly behind
    the matched text, so occurrences never overlap.

    An empty *sub* matches at every position, including the end of
    *string* (the same positions ``str.count('')`` counts). The scan then
    advances one character at a time instead of re-finding offset 0
    forever.

    Args:
        string: the text to search (anything offering ``find(sub, start)``).
        sub: the text to look for.

    Yields:
        int: offsets into *string*, in increasing order.
    """
    # Always move forward by at least one character: with a zero-length
    # needle ``find`` would otherwise keep returning the same offset.
    step = max(len(sub), 1)
    idx = string.find(sub)
    while idx != -1:
        yield idx
        idx = string.find(sub, idx + step)
class HTMLIgnoreRanges:
"""check if an offset is within CDATA content elements (script, style) or XML comments
note:
* given offsets must be in increasing order
* no detection of nested constructs (e.g. comments within script tags)
class HTMLCommentRanges:
"""computes the offsets of HTML comments
comments start with '<!--' and end with the first '-->' encountered
note: markers within quotes are not ignored
usage:
ranges = HTMLIgnoreRanges(html)
if offset in ranges:
...
"""
REGEX = re.compile(r'<!--|-->|</?\s*(?:script|style)\b[^>]*>')
# Constructor of the range-iterator variant.
# NOTE(review): in this diff rendering the body lines of the other variant
# (`self.html`, `self._last_match`, `self._final`) appear further down, after
# `ranges` -- confirm which body belongs to which class before editing.
def __init__(self, html):
# lazy generator of slice objects produced by `ranges`
self._range_iter = self.ranges(html)
# current candidate range; None when `ranges` yields nothing
self._range = next(self._range_iter, None)
# last offset queried; `__contains__` asserts offsets never decrease
self._last_offset = 0
@staticmethod
def ranges(string, sopen='<!--', sclose='-->'):
    """Yield one ``slice`` per ``sopen`` ... ``sclose`` range in *string*.

    A range starts at the offset of an opening marker and stops just
    behind the first closing marker that ends after that offset. Opening
    markers inside an already open range are skipped and closing markers
    without an open range are ignored. A range that is never closed is
    reported as ``slice(start, None)`` and ends the iteration.

    Args:
        string: the text to scan.
        sopen: marker that opens a range.
        sclose: marker that closes a range.

    Yields:
        slice: ``start`` is the offset of the opening marker, ``stop`` is
        the offset just past the closing marker, or None if unterminated.
    """
    # neither marker may be a prefix of the other
    assert not (sopen.startswith(sclose) or sclose.startswith(sopen))

    open_iter = iter_find(string, sopen)
    close_len = len(sclose)
    # work with the END offset of each closing marker
    close_iter = (idx + close_len for idx in iter_find(string, sclose))
    next_open = next(open_iter, None)
    next_close = next(close_iter, None)

    while next_open is not None:
        # Discard closers that end at or before the opener. `>=` (not `>`)
        # matters for back-to-back ranges such as '<!--a--><!--b-->': the
        # first closer ends exactly where the second opener starts, and
        # with `>` neither cursor would advance again, yielding the empty
        # slice(n, n) forever.
        while next_close is not None and next_open >= next_close:
            next_close = next(close_iter, None)

        yield slice(next_open, next_close)
        if next_close is None:
            # unterminated range swallows the rest of the string
            return

        # skip openers that lie inside the range just reported
        while next_open is not None and next_open < next_close:
            next_open = next(open_iter, None)
self.html = html
self._last_match = None
self._final = False
# Membership test: `offset in ranges` answers whether `offset` lies inside an
# ignored region.
# NOTE(review): this span appears to interleave two alternative bodies under a
# single header -- a range-iterator variant (uses `_range`, `_range_iter`,
# `_last_offset`) followed by a regex variant (uses `REGEX`, `html`,
# `_last_match`, `_final`). Confirm against the real file before editing.
def __contains__(self, offset):
assert isinstance(offset, int)
# --- range-iterator variant ---
# offsets must arrive in non-decreasing order because ranges are consumed lazily
assert offset >= self._last_offset, 'offset must be in increasing order'
self._last_offset = offset
# drop ranges that stop at or before `offset`; a range whose stop is None
# (unterminated) is never dropped
while self._range and self._range.stop is not None and offset >= self._range.stop:
self._range = next(self._range_iter, None)
# contained when a current range exists and `offset` is not before its start
return not (self._range is None or offset < self._range.start)
# --- regex variant ---
# search for the next marker only if none is cached yet or `offset` has moved
# past the end of the cached one; after the first failed search (`_final`)
# no further searches are made and the cached match is kept
if not self._final and (self._last_match is None or offset >= self._last_match.end()):
match = self.REGEX.search(self.html, offset)
if match:
self._last_match = match
else:
self._final = True
# no marker has been found at all
if self._last_match is None:
return False
match_string = self._last_match.group()
# cached marker is a closer ('-->' or a '</...' tag): report offsets that lie
# before the start of that closer as contained
if match_string.startswith('</') or match_string == '-->':
return offset < self._last_match.start()
# cached marker is an opener: report offsets at or behind its end as contained
return offset >= self._last_match.end()
class HTMLTagParser(HTMLParser):
@ -267,10 +252,10 @@ def matching_tag_regex(tag, attribute, value_regex, escape=True):
# Generator: for every match of `regex` in `html` that does not start inside an
# ignored region, yield whatever `parser.taglist` produces for the document
# tail beginning at that match.
# NOTE(review): the assignment and the `if` below each appear twice
# (HTMLCommentRanges/`comments` vs. HTMLIgnoreRanges/`ignored`); this looks like
# removed and added diff lines shown together -- confirm against the real file.
@classmethod
def iter_tags(cls, regex, html, *, matchfunc):
# lookup object answering `offset in ...` for regions that must be skipped
comments = HTMLCommentRanges(html)
ignored = HTMLIgnoreRanges(html)
# one parser instance is created from `matchfunc` and reused for all matches
parser = cls(matchfunc)
# re.finditer scans left to right, so match.start() values are probed in
# increasing order
for match in re.finditer(regex, html):
if match.start() not in comments:
if match.start() not in ignored:
# parse from the match start to the end of `html`; reset=True is passed
# on every call (presumably clears parser state -- verify in taglist)
yield from parser.taglist(html[match.start():], reset=True)
@classmethod