[extractor/mlb] Add MLBArticle extractor (#4832)

Closes #3475
Authored by: HobbyistDev
This commit is contained in:
HobbyistDev 2022-10-25 19:30:03 +09:00 committed by GitHub
parent c9bd65185c
commit e091fb92da
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 34 additions and 0 deletions

View file

@ -1003,6 +1003,7 @@
MLBIE, MLBIE,
MLBVideoIE, MLBVideoIE,
MLBTVIE, MLBTVIE,
MLBArticleIE,
) )
from .mlssoccer import MLSSoccerIE from .mlssoccer import MLSSoccerIE
from .mnet import MnetIE from .mnet import MnetIE

View file

@ -348,3 +348,36 @@ def _real_extract(self, url):
'subtitles': subtitles, 'subtitles': subtitles,
'http_headers': {'Authorization': f'Bearer {self._access_token}'}, 'http_headers': {'Authorization': f'Bearer {self._access_token}'},
} }
class MLBArticleIE(InfoExtractor):
    """Extractor for mlb.com news articles.

    Collects the video parts referenced by an article and returns them as a
    playlist whose entries are delegated to MLBVideoIE.
    """

    _VALID_URL = r'https?://www\.mlb\.com/news/(?P<id>[\w-]+)'
    _TESTS = [{
        'url': 'https://www.mlb.com/news/manny-machado-robs-guillermo-heredia-reacts',
        'info_dict': {
            'id': '36db7394-343c-4ea3-b8ca-ead2e61bca9a',
            'title': 'Machado\'s grab draws hilarious irate reaction',
            'modified_timestamp': 1650130737,
            'description': 'md5:a19d4eb0487b2cb304e9a176f6b67676',
            'modified_date': '20220416',
        },
        'playlist_count': 2,
    }]

    def _real_extract(self, url):
        """Build a playlist result from the videos listed in the article page."""
        article_slug = self._match_id(url)
        page_html = self._download_webpage(url, article_slug)

        # The page embeds its state as JSON assigned to `window.initState`;
        # everything needed here lives under its 'apolloCache' key.
        init_state = self._search_json(
            r'window\.initState\s*=', page_html, 'window.initState', article_slug)
        cache = init_state['apolloCache']

        def is_forge_content_query(key, _):
            return key.startswith('getForgeContent')

        def is_video_part(_, part):
            return part['typename'] == 'Video'

        def video_page_url(ref):
            # Each video part only carries a cache key; the slug is stored on
            # the cache entry that key points to.
            slug = cache[ref]['slug']
            return f'https://www.mlb.com/video/{slug}'

        # ROOT_QUERY maps query names to references; take the 'id' of the
        # first entry whose name starts with 'getForgeContent' and follow it.
        article_ref = traverse_obj(
            cache, ('ROOT_QUERY', is_forge_content_query, 'id'), get_all=False)
        article = cache[article_ref]

        video_refs = traverse_obj(article, ('parts', is_video_part, 'id'))
        playlist_id = article.get('_translationId')
        playlist_title = self._html_search_meta('og:title', page_html)
        summary = article.get('summary')
        last_updated = parse_iso8601(article.get('lastUpdatedDate'))

        return self.playlist_from_matches(
            video_refs,
            getter=video_page_url,
            ie=MLBVideoIE,
            playlist_id=playlist_id,
            title=playlist_title,
            description=summary,
            modified_timestamp=last_updated)