[LinkedInLearning] Add subtitles (#1077)

Authored by: Ashish0804
Closes #1072
This commit is contained in:
Ashish Gupta 2021-09-25 16:55:33 +05:30 committed by GitHub
parent e99b2d2771
commit 8dc831f715
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 24 additions and 1 deletion

View File

@ -1,6 +1,7 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
from itertools import zip_longest
import re import re
from .common import InfoExtractor from .common import InfoExtractor
@ -8,6 +9,8 @@ from ..utils import (
ExtractorError, ExtractorError,
float_or_none, float_or_none,
int_or_none, int_or_none,
srt_subtitles_timecode,
try_get,
urlencode_postdata, urlencode_postdata,
urljoin, urljoin,
) )
@ -86,6 +89,16 @@ class LinkedInLearningIE(LinkedInLearningBaseIE):
}, },
} }
def json2srt(self, transcript_lines, duration=None):
    """Convert transcript lines into SubRip (SRT) subtitle text.

    transcript_lines is a list of dicts, each with a 'transcriptStartAt'
    value (divided by 1000 here to get seconds, so presumably milliseconds)
    and a 'caption' string. duration is the optional video length in
    seconds, used to close the final cue.

    Returns the SRT document as a string; an empty list yields ''.
    """
    cues = []
    # Pair every line with its successor; the last line is paired with None.
    pairs = zip_longest(transcript_lines, transcript_lines[1:])
    for index, (line_dict, next_dict) in enumerate(pairs, 1):
        start_time = line_dict['transcriptStartAt'] / 1000
        if next_dict:
            # A cue stays on screen until the next one begins.
            end_time = next_dict['transcriptStartAt'] / 1000
        elif duration and duration > start_time:
            end_time = duration
        else:
            # No usable duration (missing, or not after the cue start, e.g.
            # because it was truncated to whole seconds): show for one second.
            end_time = start_time + 1
        # SRT requires a blank line after every cue, hence the trailing '\n\n'.
        cues.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(start_time),
            srt_subtitles_timecode(end_time),
            line_dict['caption']))
    return ''.join(cues)
def _real_extract(self, url): def _real_extract(self, url):
course_slug, video_slug = self._match_valid_url(url).groups() course_slug, video_slug = self._match_valid_url(url).groups()
@ -101,6 +114,7 @@ class LinkedInLearningIE(LinkedInLearningBaseIE):
formats.append({ formats.append({
'format_id': 'progressive-%dp' % height, 'format_id': 'progressive-%dp' % height,
'url': progressive_url, 'url': progressive_url,
'ext': 'mp4',
'height': height, 'height': height,
'width': width, 'width': width,
'source_preference': 1, 'source_preference': 1,
@ -128,6 +142,14 @@ class LinkedInLearningIE(LinkedInLearningBaseIE):
# However, unless someone can confirm this, the old # However, unless someone can confirm this, the old
# behaviour is being kept as-is # behaviour is being kept as-is
self._sort_formats(formats, ('res', 'source_preference')) self._sort_formats(formats, ('res', 'source_preference'))
subtitles = {}
duration = int_or_none(video_data.get('durationInSeconds'))
transcript_lines = try_get(video_data, lambda x: x['transcript']['lines'], expected_type=list)
if transcript_lines:
subtitles['en'] = [{
'ext': 'srt',
'data': self.json2srt(transcript_lines, duration)
}]
return { return {
'id': self._get_video_id(video_data, course_slug, video_slug), 'id': self._get_video_id(video_data, course_slug, video_slug),
@ -135,7 +157,8 @@ class LinkedInLearningIE(LinkedInLearningBaseIE):
'formats': formats, 'formats': formats,
'thumbnail': video_data.get('defaultThumbnail'), 'thumbnail': video_data.get('defaultThumbnail'),
'timestamp': float_or_none(video_data.get('publishedOn'), 1000), 'timestamp': float_or_none(video_data.get('publishedOn'), 1000),
'duration': int_or_none(video_data.get('durationInSeconds')), 'duration': duration,
'subtitles': subtitles,
} }