[extractor/bitchute] Improve BitChuteChannelIE (#5066)

Authored by: flashdagger, pukkandan
This commit is contained in:
MMM 2022-11-09 04:30:15 +01:00 committed by GitHub
parent 8fddc232bf
commit c61473c1d6
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 99 additions and 41 deletions

View file

@ -1,14 +1,18 @@
import itertools import functools
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
HEADRequest, HEADRequest,
OnDemandPagedList,
clean_html, clean_html,
get_element_by_class, get_element_by_class,
get_elements_html_by_class,
int_or_none, int_or_none,
orderedSet, orderedSet,
parse_count,
parse_duration,
traverse_obj, traverse_obj,
unified_strdate, unified_strdate,
urlencode_postdata, urlencode_postdata,
@ -109,51 +113,103 @@ def _real_extract(self, url):
class BitChuteChannelIE(InfoExtractor): class BitChuteChannelIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?bitchute\.com/channel/(?P<id>[^/?#&]+)' _VALID_URL = r'https?://(?:www\.)?bitchute\.com/(?P<type>channel|playlist)/(?P<id>[^/?#&]+)'
_TEST = { _TESTS = [{
'url': 'https://www.bitchute.com/channel/victoriaxrave/', 'url': 'https://www.bitchute.com/channel/bitchute/',
'playlist_mincount': 185,
'info_dict': { 'info_dict': {
'id': 'victoriaxrave', 'id': 'bitchute',
'title': 'BitChute',
'description': 'md5:5329fb3866125afa9446835594a9b138',
},
'playlist': [
{
'md5': '7e427d7ed7af5a75b5855705ec750e2b',
'info_dict': {
'id': 'UGlrF9o9b-Q',
'ext': 'mp4',
'filesize': None,
'title': 'This is the first video on #BitChute !',
'description': 'md5:a0337e7b1fe39e32336974af8173a034',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'BitChute',
'upload_date': '20170103',
'duration': 16,
'view_count': int,
}, },
} }
],
'params': {
'skip_download': True,
'playlist_items': '-1',
},
}, {
'url': 'https://www.bitchute.com/playlist/wV9Imujxasw9/',
'playlist_mincount': 20,
'info_dict': {
'id': 'wV9Imujxasw9',
'title': 'Bruce MacDonald and "The Light of Darkness"',
'description': 'md5:04913227d2714af1d36d804aa2ab6b1e',
}
}]
_TOKEN = 'zyG6tQcGPE5swyAEFLqKUwMuMMuF6IO2DZ6ZDQjGfsL0e4dcTLwqkTTul05Jdve7' _TOKEN = 'zyG6tQcGPE5swyAEFLqKUwMuMMuF6IO2DZ6ZDQjGfsL0e4dcTLwqkTTul05Jdve7'
PAGE_SIZE = 25
HTML_CLASS_NAMES = {
'channel': {
'container': 'channel-videos-container',
'title': 'channel-videos-title',
'description': 'channel-videos-text',
},
'playlist': {
'container': 'playlist-video',
'title': 'title',
'description': 'description',
}
def _entries(self, channel_id): }
channel_url = 'https://www.bitchute.com/channel/%s/' % channel_id
offset = 0 @staticmethod
for page_num in itertools.count(1): def _make_url(playlist_id, playlist_type):
return f'https://www.bitchute.com/{playlist_type}/{playlist_id}/'
def _fetch_page(self, playlist_id, playlist_type, page_num):
playlist_url = self._make_url(playlist_id, playlist_type)
data = self._download_json( data = self._download_json(
'%sextend/' % channel_url, channel_id, f'{playlist_url}extend/', playlist_id, f'Downloading page {page_num}',
'Downloading channel page %d' % page_num,
data=urlencode_postdata({ data=urlencode_postdata({
'csrfmiddlewaretoken': self._TOKEN, 'csrfmiddlewaretoken': self._TOKEN,
'name': '', 'name': '',
'offset': offset, 'offset': page_num * self.PAGE_SIZE,
}), headers={ }), headers={
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'Referer': channel_url, 'Referer': playlist_url,
'X-Requested-With': 'XMLHttpRequest', 'X-Requested-With': 'XMLHttpRequest',
'Cookie': 'csrftoken=%s' % self._TOKEN, 'Cookie': f'csrftoken={self._TOKEN}',
}) })
if data.get('success') is False: if not data.get('success'):
break return
html = data.get('html') classes = self.HTML_CLASS_NAMES[playlist_type]
if not html: for video_html in get_elements_html_by_class(classes['container'], data.get('html')):
break video_id = self._search_regex(
video_ids = re.findall( r'<a\s[^>]*\bhref=["\']/video/([^"\'/]+)', video_html, 'video id', default=None)
r'class=["\']channel-videos-image-container[^>]+>\s*<a\b[^>]+\bhref=["\']/video/([^"\'/]+)', if not video_id:
html) continue
if not video_ids:
break
offset += len(video_ids)
for video_id in video_ids:
yield self.url_result( yield self.url_result(
'https://www.bitchute.com/video/%s' % video_id, f'https://www.bitchute.com/video/{video_id}', BitChuteIE, video_id, url_transparent=True,
ie=BitChuteIE.ie_key(), video_id=video_id) title=clean_html(get_element_by_class(classes['title'], video_html)),
description=clean_html(get_element_by_class(classes['description'], video_html)),
duration=parse_duration(get_element_by_class('video-duration', video_html)),
view_count=parse_count(clean_html(get_element_by_class('video-views', video_html))))
def _real_extract(self, url): def _real_extract(self, url):
channel_id = self._match_id(url) playlist_type, playlist_id = self._match_valid_url(url).group('type', 'id')
webpage = self._download_webpage(self._make_url(playlist_id, playlist_type), playlist_id)
page_func = functools.partial(self._fetch_page, playlist_id, playlist_type)
return self.playlist_result( return self.playlist_result(
self._entries(channel_id), playlist_id=channel_id) OnDemandPagedList(page_func, self.PAGE_SIZE), playlist_id,
title=self._html_extract_title(webpage, default=None),
description=self._html_search_meta(
('description', 'og:description', 'twitter:description'), webpage, default=None),
playlist_count=int_or_none(self._html_search_regex(
r'<span>(\d+)\s+videos?</span>', webpage, 'playlist count', default=None)))

View file

@ -418,6 +418,8 @@ def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w
Return the text (content) and the html (whole) of the tag with the specified Return the text (content) and the html (whole) of the tag with the specified
attribute in the passed HTML document attribute in the passed HTML document
""" """
if not value:
return
quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?' quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'