[parsing] replace HTMLCommentRanges with HTMLIgnoreRanges

* ignore matches within CDATA elements and comments
This commit is contained in:
Marcel 2022-11-29 00:25:52 +01:00
parent 29278a3323
commit 6169b3eca8
No known key found for this signature in database
GPG key ID: 7813C97693AD6AAE
2 changed files with 43 additions and 53 deletions

View file

@ -4,7 +4,7 @@
from yt_dlp.compat import compat_HTMLParseError from yt_dlp.compat import compat_HTMLParseError
from yt_dlp.parsing import ( from yt_dlp.parsing import (
MatchingElementParser, MatchingElementParser,
HTMLCommentRanges, HTMLIgnoreRanges,
HTMLTagParser, HTMLTagParser,
) )
@ -325,26 +325,31 @@ def test_tag_return_order(self):
[Tag('t5'), Tag('t6')]], [Tag('t5'), Tag('t6')]],
[Tag('t7'), Tag('t8')]])) [Tag('t7'), Tag('t8')]]))
def test_html_comment_ranges(self): def test_html_ignored_ranges(self):
def mark_comments(_string, char='^', nochar='-'): def mark_comments(_string, char='^', nochar='-'):
cmts = HTMLCommentRanges(_string) cmts = HTMLIgnoreRanges(_string)
return "".join(char if _idx in cmts else nochar for _idx in range(len(_string))) return "".join(char if _idx in cmts else nochar for _idx in range(len(_string)))
html_string = ''' html_string = '''
no comments in this line no comments in this line
--------------------------------------------------------------------- ---------------------------------------------------------------------
<!-- whole line represents a comment --> <!-- whole line represents a comment -->
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ----^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^---
before <!-- comment --> after before <!-- comment --> after
-------^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^-------- -----------^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^-----------
this is a leftover comment --> <!-- a new comment without closing
^^^^^^^^^^^^^^^^^^^^^^^^^^^------------^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
here is <!-- a comment --> and <!-- another comment --> end here is <!-- a comment --> and <!-- another comment --> end
------------^^^^^^^^^^^^^^^^^^---------^^^^^^^^^^^^^^^^^^^^^^^^------ ----------------^^^^^^^^^^^----------------^^^^^^^^^^^^^^^^^---------
this <!-- nested <!-- comment --> ends here --> and not here <script> ignore here </script> <script> and here </script>
-----^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^---------------------------- --------^^^^^^^^^^^^^-----------------------------^^^^^^^^^^---------
stray --> comment closings --> are ignored <!-- but not <!-- openings
-------------------------------------------^^^^^^^^^^^^^^^^^^^^^^^^^^
''' '''
lines = textwrap.dedent(html_string).strip().splitlines() lines = textwrap.dedent(html_string).strip().splitlines()
for line, marker in zip(lines[0::2], lines[1::2]): for line, marker in zip(lines[0::2], lines[1::2]):
self.assertEqual((line, mark_comments(line)), (line, marker)) self.assertEqual((line, mark_comments(line)), (line, marker))
# yet we must be able to match script elements
test_string = '''<script type="text/javascript">var foo = 'bar';</script>'''
items = get_element_text_and_html_by_tag('script', test_string)
self.assertEqual(items, ("var foo = 'bar';", test_string))

View file

@ -8,56 +8,41 @@
from .utils import orderedSet from .utils import orderedSet
def iter_find(string, sub: str): class HTMLIgnoreRanges:
size = len(sub) """check if an offset is within CDATA content elements (script, style) or XML comments
idx = -size
while True:
idx = string.find(sub, idx + size)
if idx == -1:
return
yield idx
note:
* given offsets must be in increasing order
* no detection of nested constructs (e.g. comments within script tags)
class HTMLCommentRanges: usage:
"""computes the offsets of HTML comments ranges = HTMLIgnoreRanges(html)
if offset in ranges:
comments start with '<!--' and end with the first '-->' encountered ...
note: markers within quotes are not ignored
""" """
REGEX = re.compile(r'<!--|-->|</?\s*(?:script|style)\b[^>]*>')
def __init__(self, html): def __init__(self, html):
self._range_iter = self.ranges(html) self.html = html
self._range = next(self._range_iter, None) self._last_match = None
self._last_offset = 0 self._final = False
@staticmethod
def ranges(string, sopen='<!--', sclose='-->'):
assert not (sopen.startswith(sclose) or sclose.startswith(sopen))
open_iter = iter_find(string, sopen)
close_len = len(sclose)
close_iter = (idx + close_len for idx in iter_find(string, sclose))
next_open = next(open_iter, None)
next_close = next(close_iter, None)
while True:
if next_open is None:
return
while next_close is not None and next_open > next_close:
next_close = next(close_iter, None)
yield slice(next_open, next_close)
if next_close is None:
return
while next_open is not None and next_open < next_close:
next_open = next(open_iter, None)
def __contains__(self, offset): def __contains__(self, offset):
assert isinstance(offset, int) assert isinstance(offset, int)
assert offset >= self._last_offset, 'offset must be in increasing order'
self._last_offset = offset
while self._range and self._range.stop is not None and offset >= self._range.stop:
self._range = next(self._range_iter, None)
return not (self._range is None or offset < self._range.start) if not self._final and (self._last_match is None or offset >= self._last_match.end()):
match = self.REGEX.search(self.html, offset)
if match:
self._last_match = match
else:
self._final = True
if self._last_match is None:
return False
match_string = self._last_match.group()
if match_string.startswith('</') or match_string == '-->':
return offset < self._last_match.start()
return offset >= self._last_match.end()
class HTMLTagParser(HTMLParser): class HTMLTagParser(HTMLParser):
@ -267,10 +252,10 @@ def matching_tag_regex(tag, attribute, value_regex, escape=True):
@classmethod @classmethod
def iter_tags(cls, regex, html, *, matchfunc): def iter_tags(cls, regex, html, *, matchfunc):
comments = HTMLCommentRanges(html) ignored = HTMLIgnoreRanges(html)
parser = cls(matchfunc) parser = cls(matchfunc)
for match in re.finditer(regex, html): for match in re.finditer(regex, html):
if match.start() not in comments: if match.start() not in ignored:
yield from parser.taglist(html[match.start():], reset=True) yield from parser.taglist(html[match.start():], reset=True)
@classmethod @classmethod