From 6d8a53d870ff6795f509085bfbf3981417999038 Mon Sep 17 00:00:00 2001 From: Podiumnoche <134448981+Podiumnoche@users.noreply.github.com> Date: Fri, 17 May 2024 00:41:34 +0200 Subject: [PATCH] [ie/cda] Fix age-gated web extraction (#9939) Closes #5980, Closes #6638 Authored by: Podiumnoche, Szpachlarz, dirkf, emqi --- yt_dlp/extractor/cda.py | 62 +++++++++++++++++++++++++---------------- 1 file changed, 38 insertions(+), 24 deletions(-) diff --git a/yt_dlp/extractor/cda.py b/yt_dlp/extractor/cda.py index 90b4d082e..0a5a524c1 100644 --- a/yt_dlp/extractor/cda.py +++ b/yt_dlp/extractor/cda.py @@ -16,7 +16,6 @@ from ..utils import ( merge_dicts, multipart_encode, parse_duration, - random_birthday, traverse_obj, try_call, try_get, @@ -63,38 +62,57 @@ class CDAIE(InfoExtractor): 'description': 'md5:60d76b71186dcce4e0ba6d4bbdb13e1a', 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'crash404', - 'view_count': int, 'average_rating': float, 'duration': 137, 'age_limit': 0, + 'upload_date': '20160220', + 'timestamp': 1455968218, } }, { - # Age-restricted - 'url': 'http://www.cda.pl/video/1273454c4', + # Age-restricted with vfilm redirection + 'url': 'https://www.cda.pl/video/8753244c4', + 'md5': 'd8eeb83d63611289507010d3df3bb8b3', 'info_dict': { - 'id': '1273454c4', + 'id': '8753244c4', 'ext': 'mp4', - 'title': 'Bronson (2008) napisy HD 1080p', - 'description': 'md5:1b6cb18508daf2dc4e0fa4db77fec24c', + 'title': '[18+] Bez Filtra: Rezerwowe Psy czyli... najwulgarniejsza polska gra?', + 'description': 'md5:ae80bac31bd6a9f077a6cce03c7c077e', 'height': 1080, - 'uploader': 'boniek61', + 'uploader': 'arhn eu', 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 5554, + 'duration': 991, 'age_limit': 18, - 'view_count': int, 'average_rating': float, - }, + 'timestamp': 1633888264, + 'upload_date': '20211010', + } + }, { + # Age-restricted without vfilm redirection + 'url': 'https://www.cda.pl/video/17028157b8', + 'md5': 'c1fe5ff4582bace95d4f0ce0fbd0f992', + 'info_dict': { + 'id': '17028157b8', + 'ext': 'mp4', + 'title': 'STENDUPY MICHAŁ OGIŃSKI', + 'description': 'md5:5851f3272bfc31f762d616040a1d609a', + 'height': 480, + 'uploader': 'oginski', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 18855, + 'age_limit': 18, + 'average_rating': float, + 'timestamp': 1699705901, + 'upload_date': '20231111', + } }, { 'url': 'http://ebd.cda.pl/0x0/5749950c', 'only_matching': True, }] def _download_age_confirm_page(self, url, video_id, *args, **kwargs): - form_data = random_birthday('rok', 'miesiac', 'dzien') - form_data.update({'return': url, 'module': 'video', 'module_id': video_id}) - data, content_type = multipart_encode(form_data) + data, content_type = multipart_encode({'age_confirm': ''}) return self._download_webpage( - urljoin(url, '/a/validatebirth'), video_id, *args, + url, video_id, *args, data=data, headers={ 'Referer': url, 'Content-Type': content_type, @@ -164,7 +182,7 @@ class CDAIE(InfoExtractor): if 'Authorization' in self._API_HEADERS: return self._api_extract(video_id) else: - return self._web_extract(video_id, url) + return self._web_extract(video_id) def _api_extract(self, video_id): meta = self._download_json( @@ -197,9 +215,9 @@ class CDAIE(InfoExtractor): 'view_count': meta.get('views'), } - def _web_extract(self, video_id, url): + def _web_extract(self, video_id): self._set_cookie('cda.pl', 'cda.player', 'html5') - webpage = self._download_webpage( + webpage, urlh = self._download_webpage_handle( f'{self._BASE_URL}/video/{video_id}/vfilm', video_id) if 'Ten film jest dostępny dla użytkowników premium' in webpage: @@ -209,10 +227,10 @@ class CDAIE(InfoExtractor): self.raise_geo_restricted() need_confirm_age = False - if self._html_search_regex(r'(]+action="[^"]*/a/validatebirth[^"]*")', + if self._html_search_regex(r'(]+name="[^"]*age_confirm[^"]*")', webpage, 'birthday validate form', default=None): webpage = self._download_age_confirm_page( - url, video_id, note='Confirming age') + urlh.url, video_id, note='Confirming age') need_confirm_age = True formats = [] @@ -222,9 +240,6 @@ class CDAIE(InfoExtractor): (?:<\1[^>]*>[^<]*|(?!)(?:.|\n))*? <(span|meta)[^>]+itemprop=(["\'])name\4[^>]*>(?P[^<]+) ''', webpage, 'uploader', default=None, group='uploader') - view_count = self._search_regex( - r'Odsłony:(?:\s| )*([0-9]+)', webpage, - 'view_count', default=None) average_rating = self._search_regex( (r'<(?:span|meta)[^>]+itemprop=(["\'])ratingValue\1[^>]*>(?P[0-9.]+)', r']+\bclass=["\']rating["\'][^>]*>(?P[0-9.]+)'), webpage, 'rating', fatal=False, @@ -235,7 +250,6 @@ class CDAIE(InfoExtractor): 'title': self._og_search_title(webpage), 'description': self._og_search_description(webpage), 'uploader': uploader, - 'view_count': int_or_none(view_count), 'average_rating': float_or_none(average_rating), 'thumbnail': self._og_search_thumbnail(webpage), 'formats': formats,