Added yt_dlp directly, added rename format options, added xclip clipboard subproc, added copy name context menu option

This commit is contained in:
2022-12-02 20:00:26 -06:00
parent e4e5e08cb4
commit b84fd38523
976 changed files with 191451 additions and 6 deletions

View File

@@ -0,0 +1,54 @@
import os
from ..utils import load_plugins
# Records whether the pre-built ``lazy_extractors`` module was imported.
_LAZY_LOADER = False
# The lazy module is only tried when YTDLP_NO_LAZY_EXTRACTORS is unset or empty.
if not os.environ.get('YTDLP_NO_LAZY_EXTRACTORS'):
    try:
        from .lazy_extractors import *
        from .lazy_extractors import _ALL_CLASSES
        _LAZY_LOADER = True
    except ImportError:
        # lazy_extractors could not be imported; the full import below is used.
        pass
if not _LAZY_LOADER:
    from .extractors import *
    # Gather every module-level name ending in 'IE'. GenericIE is excluded
    # here so that it can be appended as the very last entry.
    # NOTE: this must stay a comprehension - it has its own scope, so iterating
    # globals() does not add loop variables to the dict being iterated.
    _ALL_CLASSES = [
        klass
        for name, klass in globals().items()
        if name.endswith('IE') and name != 'GenericIE'
    ]
    _ALL_CLASSES.append(GenericIE)
# Plugin-provided classes are placed ahead of the built-in ones.
_PLUGIN_CLASSES = load_plugins('extractor', 'IE', globals())
_ALL_CLASSES = list(_PLUGIN_CLASSES.values()) + _ALL_CLASSES
def gen_extractor_classes():
    """Return the list of supported extractor classes.

    Ordering is significant: the first extractor that matches a URL is the
    one that handles it. The module-level list itself is returned, not a copy.
    """
    extractor_classes = _ALL_CLASSES
    return extractor_classes
def gen_extractors():
    """Return a list holding one fresh instance of every supported extractor.

    Ordering is significant: the first extractor that matches a URL is the
    one that handles it.
    """
    instances = []
    for extractor_class in gen_extractor_classes():
        instances.append(extractor_class())
    return instances
def list_extractors(age_limit):
    """Return the extractor instances suitable for ``age_limit``.

    The result is sorted by the lower-cased ``IE_NAME`` of each extractor.
    """
    suitable = [ie for ie in gen_extractors() if ie.is_suitable(age_limit)]
    # list.sort is stable, exactly like sorted(), so ties keep their order.
    suitable.sort(key=lambda ie: ie.IE_NAME.lower())
    return suitable
def get_info_extractor(ie_name):
    """Look up and return the extractor class named ``ie_name`` + 'IE'.

    The lookup is done in this module's globals; a KeyError is raised when no
    such name exists.
    """
    class_name = ie_name + 'IE'
    module_namespace = globals()
    return module_namespace[class_name]

View File

@@ -0,0 +1,318 @@
from __future__ import unicode_literals
import hashlib
import hmac
import re
import time
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
dict_get,
ExtractorError,
js_to_json,
int_or_none,
parse_iso8601,
str_or_none,
try_get,
unescapeHTML,
update_url_query,
)
class ABCIE(InfoExtractor):
    """Extractor for abc.net.au news and BTN article pages."""

    IE_NAME = 'abc.net.au'
    _VALID_URL = r'https?://(?:www\.)?abc\.net\.au/(?:news|btn)/(?:[^/]+/){1,4}(?P<id>\d{5,})'

    _TESTS = [{
        'url': 'http://www.abc.net.au/news/2014-11-05/australia-to-staff-ebola-treatment-centre-in-sierra-leone/5868334',
        'md5': 'cb3dd03b18455a661071ee1e28344d9f',
        'info_dict': {
            'id': '5868334',
            'ext': 'mp4',
            'title': 'Australia to help staff Ebola treatment centre in Sierra Leone',
            'description': 'md5:809ad29c67a05f54eb41f2a105693a67',
        },
        'skip': 'this video has expired',
    }, {
        'url': 'http://www.abc.net.au/news/2015-08-17/warren-entsch-introduces-same-sex-marriage-bill/6702326',
        'md5': '4ebd61bdc82d9a8b722f64f1f4b4d121',
        'info_dict': {
            'id': 'NvqvPeNZsHU',
            'ext': 'mp4',
            'upload_date': '20150816',
            'uploader': 'ABC News (Australia)',
            'description': 'Government backbencher Warren Entsch introduces a cross-party sponsored bill to legalise same-sex marriage, saying the bill is designed to promote "an inclusive Australia, not a divided one.". Read more here: http://ab.co/1Mwc6ef',
            'uploader_id': 'NewsOnABC',
            'title': 'Marriage Equality: Warren Entsch introduces same sex marriage bill',
        },
        'add_ie': ['Youtube'],
        'skip': 'Not accessible from Travis CI server',
    }, {
        'url': 'http://www.abc.net.au/news/2015-10-23/nab-lifts-interest-rates-following-westpac-and-cba/6880080',
        'md5': 'b96eee7c9edf4fc5a358a0252881cc1f',
        'info_dict': {
            'id': '6880080',
            'ext': 'mp3',
            'title': 'NAB lifts interest rates, following Westpac and CBA',
            'description': 'md5:f13d8edc81e462fce4a0437c7dc04728',
        },
    }, {
        'url': 'http://www.abc.net.au/news/2015-10-19/6866214',
        'only_matching': True,
    }, {
        'url': 'https://www.abc.net.au/btn/classroom/wwi-centenary/10527914',
        'info_dict': {
            'id': '10527914',
            'ext': 'mp4',
            'title': 'WWI Centenary',
            'description': 'md5:c2379ec0ca84072e86b446e536954546',
        }
    }, {
        'url': 'https://www.abc.net.au/news/programs/the-world/2020-06-10/black-lives-matter-protests-spawn-support-for/12342074',
        'info_dict': {
            'id': '12342074',
            'ext': 'mp4',
            'title': 'Black Lives Matter protests spawn support for Papuans in Indonesia',
            'description': 'md5:2961a17dc53abc558589ccd0fb8edd6f',
        }
    }, {
        'url': 'https://www.abc.net.au/btn/newsbreak/btn-newsbreak-20200814/12560476',
        'info_dict': {
            'id': 'tDL8Ld4dK_8',
            'ext': 'mp4',
            'title': 'Fortnite Banned From Apple and Google App Stores',
            'description': 'md5:a6df3f36ce8f816b74af4bd6462f5651',
            'upload_date': '20200813',
            'uploader': 'Behind the News',
            'uploader_id': 'behindthenews',
        }
    }]

    def _real_extract(self, url):
        """Extract the media embedded in an article page.

        The page is probed, in order, for: a direct audio download link, a
        YouTube link or iframe, a ``"sources"`` JSON array, and finally an
        ``inline(Video|Audio|YouTube)Data.push(...)`` call.

        Returns a playlist of YouTube URL results when the media is hosted on
        YouTube, otherwise an info dict with the collected formats.

        Raises ExtractorError when the page reports expired media or when no
        media description can be found at all.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        mobj = re.search(r'<a\s+href="(?P<url>[^"]+)"\s+data-duration="\d+"\s+title="Download audio directly">', webpage)
        if mobj:
            urls_info = mobj.groupdict()
            youtube = False
            video = False
        else:
            mobj = re.search(r'<a href="(?P<url>http://www\.youtube\.com/watch\?v=[^"]+)"><span><strong>External Link:</strong>',
                             webpage)
            if mobj is None:
                mobj = re.search(r'<iframe width="100%" src="(?P<url>//www\.youtube-nocookie\.com/embed/[^?"]+)', webpage)
            if mobj:
                urls_info = mobj.groupdict()
                youtube = True
                video = True

        if mobj is None:
            # The empty (?P<type>) group makes mobj.group('type') valid for
            # both patterns tried in this branch.
            mobj = re.search(r'(?P<type>)"sources": (?P<json_data>\[[^\]]+\]),', webpage)
            if mobj is None:
                mobj = re.search(
                    r'inline(?P<type>Video|Audio|YouTube)Data\.push\((?P<json_data>[^)]+)\);',
                    webpage)
                if mobj is None:
                    expired = self._html_search_regex(r'(?s)class="expired-(?:video|audio)".+?<span>(.+?)</span>', webpage, 'expired', None)
                    if expired:
                        raise ExtractorError('%s said: %s' % (self.IE_NAME, expired), expected=True)
                    raise ExtractorError('Unable to extract video urls')

            urls_info = self._parse_json(
                mobj.group('json_data'), video_id, transform_source=js_to_json)
            # Normalise to a list BEFORE looking at the first entry: the pushed
            # payload may be a single object, in which case indexing it with
            # [0] would raise KeyError.
            if not isinstance(urls_info, list):
                urls_info = [urls_info]
            media_type = mobj.group('type')
            youtube = media_type == 'YouTube'
            first_info = urls_info[0] if urls_info else None
            # A missing 'contentType' key simply means "not known to be video".
            video = media_type == 'Video' or (
                isinstance(first_info, dict)
                and first_info.get('contentType') == 'video/mp4')

        # The regex branches above produce a single groupdict().
        if not isinstance(urls_info, list):
            urls_info = [urls_info]

        if youtube:
            return self.playlist_result([
                self.url_result(url_info['url']) for url_info in urls_info])

        formats = []
        for url_info in urls_info:
            height = int_or_none(url_info.get('height'))
            bitrate = int_or_none(url_info.get('bitrate'))
            width = int_or_none(url_info.get('width'))
            format_id = None
            # File names end in either "_<height>.mp4" or "_<bitrate>k.mp4".
            mobj = re.search(r'_(?:(?P<height>\d+)|(?P<bitrate>\d+)k)\.mp4$', url_info['url'])
            if mobj:
                height_from_url = mobj.group('height')
                if height_from_url:
                    height = height or int_or_none(height_from_url)
                    width = width or int_or_none(url_info.get('label'))
                else:
                    bitrate = bitrate or int_or_none(mobj.group('bitrate'))
                    format_id = str_or_none(url_info.get('label'))
            formats.append({
                'url': url_info['url'],
                'vcodec': url_info.get('codec') if video else 'none',
                'width': width,
                'height': height,
                'tbr': bitrate,
                'filesize': int_or_none(url_info.get('filesize')),
                'format_id': format_id
            })

        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': self._og_search_title(webpage),
            'formats': formats,
            'description': self._og_search_description(webpage),
            'thumbnail': self._og_search_thumbnail(webpage),
        }
class ABCIViewIE(InfoExtractor):
    """Extractor for single programs on iview.abc.net.au."""

    IE_NAME = 'abc.net.au:iview'
    _VALID_URL = r'https?://iview\.abc\.net\.au/(?:[^/]+/)*video/(?P<id>[^/?#]+)'
    _GEO_COUNTRIES = ['AU']

    # ABC iview programs are normally available for 14 days only.
    _TESTS = [{
        'url': 'https://iview.abc.net.au/show/gruen/series/11/video/LE1927H001S00',
        'md5': '67715ce3c78426b11ba167d875ac6abf',
        'info_dict': {
            'id': 'LE1927H001S00',
            'ext': 'mp4',
            'title': "Series 11 Ep 1",
            'series': "Gruen",
            'description': 'md5:52cc744ad35045baf6aded2ce7287f67',
            'upload_date': '20190925',
            'uploader_id': 'abc1',
            'timestamp': 1569445289,
        },
        'params': {
            'skip_download': True,
        },
    }]

    def _real_extract(self, url):
        """Fetch program metadata, sign an HLS token request and build formats.

        Returns an info dict for the program.

        Raises ExtractorError when the program's playlist contains no
        'program' or 'livestream' entry.
        """
        video_id = self._match_id(url)
        video_params = self._download_json(
            'https://iview.abc.net.au/api/programs/' + video_id, video_id)
        title = unescapeHTML(video_params.get('title') or video_params['seriesTitle'])
        # Use a default so that a missing entry yields a clear extractor error
        # instead of a bare StopIteration escaping from next().
        stream = next(
            (s for s in video_params['playlist']
             if s.get('type') in ('program', 'livestream')),
            None)
        if stream is None:
            raise ExtractorError('Unable to find a program or livestream entry in the playlist')

        house_number = video_params.get('episodeHouseNumber') or video_id
        path = '/auth/hls/sign?ts={0}&hn={1}&d=android-tablet'.format(
            int(time.time()), house_number)
        # The request path is signed with HMAC-SHA256 and sent as ``sig``.
        sig = hmac.new(
            b'android.content.res.Resources',
            path.encode('utf-8'), hashlib.sha256).hexdigest()
        token = self._download_webpage(
            'http://iview.abc.net.au{0}&sig={1}'.format(path, sig), video_id)

        def tokenize_url(url, token):
            return update_url_query(url, {
                'hdnea': token,
            })

        # Must be initialised: when no rendition URL is present the loop body
        # never assigns it and the sort below would hit an unbound local.
        formats = []
        # Renditions are tried in this order; the first that yields formats wins.
        for sd in ('720', 'sd', 'sd-low'):
            sd_url = try_get(
                stream, lambda x: x['streams']['hls'][sd], compat_str)
            if not sd_url:
                continue
            formats = self._extract_m3u8_formats(
                tokenize_url(sd_url, token), video_id, 'mp4',
                entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)
            if formats:
                break
        self._sort_formats(formats)

        subtitles = {}
        # ``captions`` may be present but null, so do not rely on .get's default.
        src_vtt = (stream.get('captions') or {}).get('src-vtt')
        if src_vtt:
            subtitles['en'] = [{
                'url': src_vtt,
                'ext': 'vtt',
            }]

        is_live = video_params.get('livestream') == '1'

        return {
            'id': video_id,
            'title': title,
            'description': video_params.get('description'),
            'thumbnail': video_params.get('thumbnail'),
            'duration': int_or_none(video_params.get('eventDuration')),
            'timestamp': parse_iso8601(video_params.get('pubDate'), ' '),
            'series': unescapeHTML(video_params.get('seriesTitle')),
            'series_id': video_params.get('seriesHouseNumber') or video_id[:7],
            'season_number': int_or_none(self._search_regex(
                r'\bSeries\s+(\d+)\b', title, 'season number', default=None)),
            'episode_number': int_or_none(self._search_regex(
                r'\bEp\s+(\d+)\b', title, 'episode number', default=None)),
            'episode_id': house_number,
            'uploader_id': video_params.get('channel'),
            'formats': formats,
            'subtitles': subtitles,
            'is_live': is_live,
        }
class ABCIViewShowSeriesIE(InfoExtractor):
    """Extractor for show / series overview pages on iview.abc.net.au."""

    IE_NAME = 'abc.net.au:iview:showseries'
    _VALID_URL = r'https?://iview\.abc\.net\.au/show/(?P<id>[^/]+)(?:/series/\d+)?$'
    _GEO_COUNTRIES = ['AU']

    _TESTS = [{
        'url': 'https://iview.abc.net.au/show/upper-middle-bogan',
        'info_dict': {
            'id': '124870-1',
            'title': 'Series 1',
            'description': 'md5:93119346c24a7c322d446d8eece430ff',
            'series': 'Upper Middle Bogan',
            'season': 'Series 1',
            'thumbnail': r're:^https?://cdn\.iview\.abc\.net\.au/thumbs/.*\.jpg$'
        },
        'playlist_count': 8,
    }, {
        'url': 'https://iview.abc.net.au/show/upper-middle-bogan',
        'info_dict': {
            'id': 'CO1108V001S00',
            'ext': 'mp4',
            'title': 'Series 1 Ep 1 I\'m A Swan',
            'description': 'md5:7b676758c1de11a30b79b4d301e8da93',
            'series': 'Upper Middle Bogan',
            'uploader_id': 'abc1',
            'upload_date': '20210630',
            'timestamp': 1625036400,
        },
        'params': {
            'noplaylist': True,
            'skip_download': 'm3u8',
        },
    }]

    def _real_extract(self, url):
        """Return the selected series as a playlist, or its highlight video.

        The page embeds its state as an escaped string assigned to
        ``window.__INITIAL_STATE__``; that string is unescaped and parsed.
        """
        show_id = self._match_id(url)
        webpage = self._download_webpage(url, show_id)
        raw_state = self._search_regex(
            r'window\.__INITIAL_STATE__\s*=\s*[\'"](.+?)[\'"]\s*;',
            webpage, 'initial state')
        decoded_state = unescapeHTML(raw_state).encode('utf-8').decode('unicode_escape')
        page_state = self._parse_json(decoded_state, show_id)
        embedded = page_state['route']['pageData']['_embedded']

        highlight = try_get(embedded, lambda x: x['highlightVideo']['shareUrl'])
        wants_playlist = self._yes_playlist(
            show_id, bool(highlight), video_label='highlight video')
        if not wants_playlist:
            return self.url_result(highlight, ie=ABCIViewIE.ie_key())

        series = embedded['selectedSeries']
        episode_entries = []
        for episode in series['_embedded']['videoEpisodes']:
            episode_entries.append(self.url_result(episode['shareUrl']))

        # The same two keys feed both the playlist title and the season name.
        season_title = dict_get(series, ('title', 'displaySubtitle'))
        return {
            '_type': 'playlist',
            'entries': episode_entries,
            'id': series.get('id'),
            'title': season_title,
            'description': series.get('description'),
            'series': dict_get(series, ('showTitle', 'displayTitle')),
            'season': season_title,
            'thumbnail': series.get('thumbnail'),
        }

View File

@@ -0,0 +1,157 @@
# coding: utf-8
from __future__ import unicode_literals
from .amp import AMPIE
from .common import InfoExtractor
from ..utils import (
parse_duration,
parse_iso8601,
try_get,
)
class AbcNewsVideoIE(AMPIE):
    """Extractor for individual abcnews.go.com videos, resolved via the item feed."""

    IE_NAME = 'abcnews:video'
    # Verbose-mode pattern; the literal is kept exactly as in the original.
    _VALID_URL = r'''(?x)
https?://
(?:
abcnews\.go\.com/
(?:
(?:[^/]+/)*video/(?P<display_id>[0-9a-z-]+)-|
video/(?:embed|itemfeed)\?.*?\bid=
)|
fivethirtyeight\.abcnews\.go\.com/video/embed/\d+/
)
(?P<id>\d+)
'''

    _TESTS = [{
        'url': 'http://abcnews.go.com/ThisWeek/video/week-exclusive-irans-foreign-minister-zarif-20411932',
        'info_dict': {
            'id': '20411932',
            'ext': 'mp4',
            'display_id': 'week-exclusive-irans-foreign-minister-zarif',
            'title': '\'This Week\' Exclusive: Iran\'s Foreign Minister Zarif',
            'description': 'George Stephanopoulos goes one-on-one with Iranian Foreign Minister Dr. Javad Zarif.',
            'duration': 180,
            'thumbnail': r're:^https?://.*\.jpg$',
            'timestamp': 1380454200,
            'upload_date': '20130929',
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }, {
        'url': 'http://abcnews.go.com/video/embed?id=46979033',
        'only_matching': True,
    }, {
        'url': 'http://abcnews.go.com/2020/video/2020-husband-stands-teacher-jail-student-affairs-26119478',
        'only_matching': True,
    }, {
        'url': 'http://abcnews.go.com/video/itemfeed?id=46979033',
        'only_matching': True,
    }, {
        'url': 'https://abcnews.go.com/GMA/News/video/history-christmas-story-67894761',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Fetch the item feed for the video and attach the ids parsed from the URL."""
        match = self._match_valid_url(url)
        video_id = match.group('id')
        feed_url = 'http://abcnews.go.com/video/itemfeed?id=%s' % video_id
        info = self._extract_feed_info(feed_url)
        # ``display_id`` is None for embed/itemfeed style URLs.
        info.update({
            'id': video_id,
            'display_id': match.group('display_id'),
        })
        return info
class AbcNewsIE(InfoExtractor):
    """Extractor for abcnews.go.com story pages; yields the videos they embed."""

    IE_NAME = 'abcnews'
    _VALID_URL = r'https?://abcnews\.go\.com/(?:[^/]+/)+(?P<display_id>[0-9a-z-]+)/story\?id=(?P<id>\d+)'

    _TESTS = [{
        # Youtube Embeds
        'url': 'https://abcnews.go.com/Entertainment/peter-billingsley-child-actor-christmas-story-hollywood-power/story?id=51286501',
        'info_dict': {
            'id': '51286501',
            'title': "Peter Billingsley: From child actor in 'A Christmas Story' to Hollywood power player",
            'description': 'Billingsley went from a child actor to Hollywood power player.',
        },
        'playlist_count': 5,
    }, {
        'url': 'http://abcnews.go.com/Entertainment/justin-timberlake-performs-stop-feeling-eurovision-2016/story?id=39125818',
        'info_dict': {
            'id': '38897857',
            'ext': 'mp4',
            'title': 'Justin Timberlake Drops Hints For Secret Single',
            'description': 'Lara Spencer reports the buzziest stories of the day in "GMA" Pop News.',
            'upload_date': '20160505',
            'timestamp': 1462442280,
        },
        'params': {
            # m3u8 download
            'skip_download': True,
            # The embedded YouTube video is blocked due to copyright issues
            'playlist_items': '1',
        },
        'add_ie': ['AbcNewsVideo'],
    }, {
        'url': 'http://abcnews.go.com/Technology/exclusive-apple-ceo-tim-cook-iphone-cracking-software/story?id=37173343',
        'only_matching': True,
    }, {
        # inline.type == 'video'
        'url': 'http://abcnews.go.com/Technology/exclusive-apple-ceo-tim-cook-iphone-cracking-software/story?id=37173343',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Build a playlist from the story's featured video and inline embeds."""
        story_id = self._match_id(url)
        webpage = self._download_webpage(url, story_id)
        page_data = self._parse_json(self._search_regex(
            r"window\['__abcnews__'\]\s*=\s*({.+?});",
            webpage, 'data'), story_id)
        story = page_data['page']['content']['story']['everscroll'][0]
        article_contents = story.get('articleContents') or {}

        def entries():
            # The featured video, when it carries a feed URL, comes first.
            featured_video = story.get('featuredVideo') or {}
            feed = try_get(featured_video, lambda x: x['video']['feed'])
            if feed:
                yield {
                    '_type': 'url',
                    'id': featured_video.get('id'),
                    'title': featured_video.get('name'),
                    'url': feed,
                    'thumbnail': featured_video.get('images'),
                    'description': featured_video.get('description'),
                    'timestamp': parse_iso8601(featured_video.get('uploadDate')),
                    'duration': parse_duration(featured_video.get('duration')),
                    'ie_key': AbcNewsVideoIE.ie_key(),
                }

            for inline in (article_contents.get('inlines') or []):
                inline_type = inline.get('type')
                if inline_type == 'iframe':
                    iframe_url = try_get(inline, lambda x: x['attrs']['src'])
                    if iframe_url:
                        yield self.url_result(iframe_url)
                    continue
                if inline_type != 'video':
                    continue
                video_id = inline.get('id')
                if not video_id:
                    continue
                yield {
                    '_type': 'url',
                    'id': video_id,
                    'url': 'http://abcnews.go.com/video/embed?id=' + video_id,
                    'thumbnail': inline.get('imgSrc') or inline.get('imgDefault'),
                    'description': inline.get('description'),
                    'duration': parse_duration(inline.get('duration')),
                    'ie_key': AbcNewsVideoIE.ie_key(),
                }

        headline = article_contents.get('headline')
        sub_head = article_contents.get('subHead')
        return self.playlist_result(entries(), story_id, headline, sub_head)

View File

@@ -0,0 +1,136 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
dict_get,
int_or_none,
try_get,
)
class ABCOTVSIE(InfoExtractor):
    """Extractor for story pages of the ABC Owned Television Stations sites."""

    IE_NAME = 'abcotvs'
    IE_DESC = 'ABC Owned Television Stations'
    _VALID_URL = r'https?://(?P<site>abc(?:7(?:news|ny|chicago)?|11|13|30)|6abc)\.com(?:(?:/[^/]+)*/(?P<display_id>[^/]+))?/(?P<id>\d+)'
    _TESTS = [
        {
            'url': 'http://abc7news.com/entertainment/east-bay-museum-celebrates-vintage-synthesizers/472581/',
            'info_dict': {
                'id': '472548',
                'display_id': 'east-bay-museum-celebrates-vintage-synthesizers',
                'ext': 'mp4',
                'title': 'East Bay museum celebrates synthesized music',
                'description': 'md5:24ed2bd527096ec2a5c67b9d5a9005f3',
                'thumbnail': r're:^https?://.*\.jpg$',
                'timestamp': 1421118520,
                'upload_date': '20150113',
            },
            'params': {
                # m3u8 download
                'skip_download': True,
            },
        },
        {
            'url': 'http://abc7news.com/472581',
            'only_matching': True,
        },
        {
            'url': 'https://6abc.com/man-75-killed-after-being-struck-by-vehicle-in-chester/5725182/',
            'only_matching': True,
        },
    ]
    # Maps the site name captured from the URL to the station code used by the API.
    _SITE_MAP = {
        '6abc': 'wpvi',
        'abc11': 'wtvd',
        'abc13': 'ktrk',
        'abc30': 'kfsn',
        'abc7': 'kabc',
        'abc7chicago': 'wls',
        'abc7news': 'kgo',
        'abc7ny': 'wabc',
    }

    def _real_extract(self, url):
        """Query the content API for the story and build its HLS / MP4 formats."""
        site, display_id, video_id = self._match_valid_url(url).groups()
        if not display_id:
            display_id = video_id
        station = self._SITE_MAP[site]

        api_query = {
            'id': video_id,
            'key': 'otv.web.%s.story' % station,
            'station': station,
        }
        data = self._download_json(
            'https://api.abcotvs.com/v2/content', display_id, query=api_query)['data']
        # Prefer the featured media's video object; otherwise use the story data itself.
        video = try_get(data, lambda x: x['featuredMedia']['video'], dict) or data
        video_id = compat_str(dict_get(video, ('id', 'publishedKey'), video_id))
        title = video.get('title') or video['linkText']

        hls_url = video.get('m3u8')
        if hls_url:
            # The query string is dropped from the manifest URL before use.
            formats = self._extract_m3u8_formats(
                hls_url.split('?')[0], display_id, 'mp4', m3u8_id='hls', fatal=False)
        else:
            formats = []
        progressive_url = video.get('mp4')
        if progressive_url:
            formats.append({
                'abr': 128,
                'format_id': 'https',
                'height': 360,
                'url': progressive_url,
                'width': 640,
            })
        self._sort_formats(formats)

        image = video.get('image') or {}
        meta_description = try_get(video, lambda x: x['meta']['description'])

        return {
            'id': video_id,
            'display_id': display_id,
            'title': title,
            'description': dict_get(video, ('description', 'caption'), meta_description),
            'thumbnail': dict_get(image, ('source', 'dynamicSource')),
            'timestamp': int_or_none(video.get('date')),
            'duration': int_or_none(video.get('length')),
            'formats': formats,
        }
class ABCOTVSClipsIE(InfoExtractor):
    """Extractor for clips hosted on clips.abcotvs.com."""

    IE_NAME = 'abcotvs:clips'
    _VALID_URL = r'https?://clips\.abcotvs\.com/(?:[^/]+/)*video/(?P<id>\d+)'
    _TEST = {
        'url': 'https://clips.abcotvs.com/kabc/video/214814',
        'info_dict': {
            'id': '214814',
            'ext': 'mp4',
            'title': 'SpaceX launch pad explosion destroys rocket, satellite',
            'description': 'md5:9f186e5ad8f490f65409965ee9c7be1b',
            'upload_date': '20160901',
            'timestamp': 1472756695,
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }

    def _real_extract(self, url):
        """Look the clip up by id and extract its HLS formats."""
        video_id = self._match_id(url)
        response = self._download_json(
            'https://clips.abcotvs.com/vogo/video/getByIds?ids=' + video_id, video_id)
        clip = response['results'][0]
        title = clip['title']
        # The query string is dropped from the manifest URL before use.
        manifest_url = clip['videoURL'].split('?')[0]
        formats = self._extract_m3u8_formats(manifest_url, video_id, 'mp4')
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'description': clip.get('description'),
            'thumbnail': clip.get('thumbnailURL'),
            'duration': int_or_none(clip.get('duration')),
            'timestamp': int_or_none(clip.get('pubDate')),
            'formats': formats,
        }

View File

@@ -0,0 +1,41 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
class AcademicEarthCourseIE(InfoExtractor):
    """Extractor for academicearth.org playlists (courses)."""

    _VALID_URL = r'^https?://(?:www\.)?academicearth\.org/playlists/(?P<id>[^?#/]+)'
    IE_NAME = 'AcademicEarth:Course'
    _TEST = {
        'url': 'http://academicearth.org/playlists/laws-of-nature/',
        'info_dict': {
            'id': 'laws-of-nature',
            'title': 'Laws of Nature',
            'description': 'Introduce yourself to the laws of nature with these free online college lectures from Yale, Harvard, and MIT.',
        },
        'playlist_count': 3,
    }

    def _real_extract(self, url):
        """Scrape the playlist page and return its lectures as URL results."""
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)

        title = self._html_search_regex(
            r'<h1 class="playlist-name"[^>]*?>(.*?)</h1>', webpage, 'title')
        # The description is optional (fatal=False).
        description = self._html_search_regex(
            r'<p class="excerpt"[^>]*?>(.*?)</p>',
            webpage, 'description', fatal=False)

        entries = []
        for lecture_url in re.findall(
                r'<li class="lecture-preview">\s*?<a target="_blank" href="([^"]+)">',
                webpage):
            entries.append(self.url_result(lecture_url))

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': title,
            'description': description,
            'entries': entries,
        }

View File

@@ -0,0 +1,125 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
clean_html,
clean_podcast_url,
int_or_none,
parse_iso8601,
)
class ACastBaseIE(InfoExtractor):
    """Shared helpers for the acast.com extractors."""

    def _extract_episode(self, episode, show_info):
        """Build an info dict for ``episode`` and merge ``show_info`` into it.

        Keys present in ``show_info`` override the episode-derived ones.
        """
        episode_title = episode['title']
        description_html = episode.get('description') or episode.get('summary')
        result = {
            'id': episode['id'],
            'display_id': episode.get('episodeUrl'),
            'url': clean_podcast_url(episode['url']),
            'title': episode_title,
            'description': clean_html(description_html),
            'thumbnail': episode.get('image'),
            'timestamp': parse_iso8601(episode.get('publishDate')),
            'duration': int_or_none(episode.get('duration')),
            'filesize': int_or_none(episode.get('contentLength')),
            'season_number': int_or_none(episode.get('season')),
            'episode': episode_title,
            'episode_number': int_or_none(episode.get('episode')),
        }
        result.update(show_info)
        return result

    def _extract_show_info(self, show):
        """Return the show-level fields (creator and series) taken from ``show``."""
        creator = show.get('author')
        series = show.get('title')
        return {
            'creator': creator,
            'series': series,
        }

    def _call_api(self, path, video_id, query=None):
        """Download and return the JSON found at ``path`` below the shows API."""
        api_url = 'https://feeder.acast.com/api/v1/shows/' + path
        return self._download_json(api_url, video_id, query=query)
class ACastIE(ACastBaseIE):
    """Extractor for a single acast.com episode."""

    IE_NAME = 'acast'
    # Verbose-mode pattern; the literal is kept exactly as in the original.
    _VALID_URL = r'''(?x)
https?://
(?:
(?:(?:embed|www)\.)?acast\.com/|
play\.acast\.com/s/
)
(?P<channel>[^/]+)/(?P<id>[^/#?]+)
'''
    _TESTS = [{
        'url': 'https://www.acast.com/sparpodcast/2.raggarmordet-rosterurdetforflutna',
        'md5': 'f5598f3ad1e4776fed12ec1407153e4b',
        'info_dict': {
            'id': '2a92b283-1a75-4ad8-8396-499c641de0d9',
            'ext': 'mp3',
            'title': '2. Raggarmordet - Röster ur det förflutna',
            'description': 'md5:a992ae67f4d98f1c0141598f7bebbf67',
            'timestamp': 1477346700,
            'upload_date': '20161024',
            'duration': 2766,
            'creator': 'Anton Berg & Martin Johnson',
            'series': 'Spår',
            'episode': '2. Raggarmordet - Röster ur det förflutna',
        }
    }, {
        'url': 'http://embed.acast.com/adambuxton/ep.12-adam-joeschristmaspodcast2015',
        'only_matching': True,
    }, {
        'url': 'https://play.acast.com/s/rattegangspodden/s04e09styckmordetihelenelund-del2-2',
        'only_matching': True,
    }, {
        'url': 'https://play.acast.com/s/sparpodcast/2a92b283-1a75-4ad8-8396-499c641de0d9',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Fetch the episode (with show info) from the API and convert it."""
        match = self._match_valid_url(url)
        channel = match.group('channel')
        display_id = match.group('id')
        api_path = '%s/episodes/%s' % (channel, display_id)
        episode = self._call_api(api_path, display_id, {'showInfo': 'true'})
        show_info = self._extract_show_info(episode.get('show') or {})
        return self._extract_episode(episode, show_info)
class ACastChannelIE(ACastBaseIE):
    """Extractor for an acast.com show, returned as a playlist of its episodes."""

    IE_NAME = 'acast:channel'
    # Verbose-mode pattern; the literal is kept exactly as in the original.
    _VALID_URL = r'''(?x)
https?://
(?:
(?:www\.)?acast\.com/|
play\.acast\.com/s/
)
(?P<id>[^/#?]+)
'''
    _TESTS = [{
        'url': 'https://www.acast.com/todayinfocus',
        'info_dict': {
            'id': '4efc5294-5385-4847-98bd-519799ce5786',
            'title': 'Today in Focus',
            'description': 'md5:c09ce28c91002ce4ffce71d6504abaae',
        },
        'playlist_mincount': 200,
    }, {
        'url': 'http://play.acast.com/s/ft-banking-weekly',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Decline any URL that the single-episode extractor already accepts."""
        if ACastIE.suitable(url):
            return False
        return super(ACastChannelIE, cls).suitable(url)

    def _real_extract(self, url):
        """Fetch the show from the API and turn every listed episode into an entry."""
        show_slug = self._match_id(url)
        show = self._call_api(show_slug, show_slug)
        show_info = self._extract_show_info(show)
        entries = [
            self._extract_episode(episode, show_info)
            for episode in (show.get('episodes') or [])
        ]
        return self.playlist_result(
            entries, show.get('id'), show.get('title'), show.get('description'))

View File

@@ -0,0 +1,262 @@
# coding: utf-8
from __future__ import unicode_literals
import base64
import binascii
import json
import os
import random
from .common import InfoExtractor
from ..aes import aes_cbc_decrypt_bytes, unpad_pkcs7
from ..compat import (
compat_HTTPError,
compat_b64decode,
)
from ..utils import (
ass_subtitles_timecode,
bytes_to_intlist,
bytes_to_long,
ExtractorError,
float_or_none,
int_or_none,
intlist_to_bytes,
long_to_bytes,
pkcs1pad,
strip_or_none,
try_get,
unified_strdate,
urlencode_postdata,
)
class ADNIE(InfoExtractor):
    """Extractor for videos on animedigitalnetwork.fr."""
    IE_DESC = 'Anime Digital Network'
    _VALID_URL = r'https?://(?:www\.)?animedigitalnetwork\.fr/video/[^/]+/(?P<id>\d+)'
    _TEST = {
        'url': 'http://animedigitalnetwork.fr/video/blue-exorcist-kyoto-saga/7778-episode-1-debut-des-hostilites',
        'md5': '0319c99885ff5547565cacb4f3f9348d',
        'info_dict': {
            'id': '7778',
            'ext': 'mp4',
            'title': 'Blue Exorcist - Kyôto Saga - Episode 1',
            'description': 'md5:2f7b5aa76edbc1a7a92cedcda8a528d5',
            'series': 'Blue Exorcist - Kyôto Saga',
            'duration': 1467,
            'release_date': '20170106',
            'comment_count': int,
            'average_rating': float,
            'season_number': 2,
            'episode': 'Début des hostilités',
            'episode_number': 1,
        }
    }

    _NETRC_MACHINE = 'animedigitalnetwork'
    _BASE_URL = 'http://animedigitalnetwork.fr'
    _API_BASE_URL = 'https://gw.api.animedigitalnetwork.fr/'
    _PLAYER_BASE_URL = _API_BASE_URL + 'player/'
    # Extra request headers; replaced by an authorization header after a
    # successful login in _real_initialize.
    _HEADERS = {}
    _LOGIN_ERR_MESSAGE = 'Unable to log in'
    # (modulus, public exponent) pair used to encrypt the links request token.
    _RSA_KEY = (0x9B42B08905199A5CCE2026274399CA560ECB209EE9878A708B1C0812E1BB8CB5D1FB7441861147C1A1F2F3A0476DD63A9CAC20D3E983613346850AA6CB38F16DC7D720FD7D86FC6E5B3D5BBC72E14CD0BF9E869F2CEA2CCAD648F1DCE38F1FF916CEFB2D339B64AA0264372344BC775E265E8A852F88144AB0BD9AA06C1A4ABB, 65537)
    # The two maps below are summed to form the SSA ``\a`` alignment code;
    # unknown values contribute 2 (position) and 0 (line) respectively.
    _POS_ALIGN_MAP = {
        'start': 1,
        'end': 3,
    }
    _LINE_ALIGN_MAP = {
        'middle': 8,
        'end': 4,
    }

    def _get_subtitles(self, sub_url, video_id):
        """Download, decrypt and convert the subtitles found at ``sub_url``.

        Returns a dict mapping language codes to lists of subtitle entries
        (one 'json' and one 'ssa' entry per language), or None when there is
        no URL, nothing could be downloaded, or the decrypted payload does not
        parse as JSON.

        Reads ``self._K``, which is only assigned in _real_extract.
        """
        if not sub_url:
            return None

        # First request: a JSON document that may point at the real location.
        enc_subtitles = self._download_webpage(
            sub_url, video_id, 'Downloading subtitles location', fatal=False) or '{}'
        subtitle_location = (self._parse_json(enc_subtitles, video_id, fatal=False) or {}).get('location')
        if subtitle_location:
            enc_subtitles = self._download_webpage(
                subtitle_location, video_id, 'Downloading subtitles data',
                fatal=False, headers={'Origin': 'https://animedigitalnetwork.fr'})
        if not enc_subtitles:
            return None

        # http://animedigitalnetwork.fr/components/com_vodvideo/videojs/adn-vjs.min.js
        # The payload is split at character 24, each part is base64-decoded, and
        # the hex key is self._K followed by a fixed suffix.
        # NOTE(review): presumably the arguments are (ciphertext, key, iv) -
        # confirm against aes_cbc_decrypt_bytes.
        dec_subtitles = unpad_pkcs7(aes_cbc_decrypt_bytes(
            compat_b64decode(enc_subtitles[24:]),
            binascii.unhexlify(self._K + 'ab9f52f5baae7c72'),
            compat_b64decode(enc_subtitles[:24])))
        subtitles_json = self._parse_json(dec_subtitles.decode(), None, fatal=False)
        if not subtitles_json:
            return None

        subtitles = {}
        for sub_lang, sub in subtitles_json.items():
            # Fixed SSA header; one 'Dialogue' line per cue is appended below.
            ssa = '''[Script Info]
ScriptType:V4.00
[V4 Styles]
Format: Name,Fontname,Fontsize,PrimaryColour,SecondaryColour,TertiaryColour,BackColour,Bold,Italic,BorderStyle,Outline,Shadow,Alignment,MarginL,MarginR,MarginV,AlphaLevel,Encoding
Style: Default,Arial,18,16777215,16777215,16777215,0,-1,0,1,1,0,2,20,20,20,0,0
[Events]
Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text'''
            for current in sub:
                start, end, text, line_align, position_align = (
                    float_or_none(current.get('startTime')),
                    float_or_none(current.get('endTime')),
                    current.get('text'), current.get('lineAlign'),
                    current.get('positionAlign'))
                # Cues lacking a start time, end time or text are skipped.
                if start is None or end is None or text is None:
                    continue
                alignment = self._POS_ALIGN_MAP.get(position_align, 2) + self._LINE_ALIGN_MAP.get(line_align, 0)
                # An alignment override tag is only emitted when it differs from 2.
                ssa += os.linesep + 'Dialogue: Marked=0,%s,%s,Default,,0,0,0,,%s%s' % (
                    ass_subtitles_timecode(start),
                    ass_subtitles_timecode(end),
                    '{\\a%d}' % alignment if alignment != 2 else '',
                    text.replace('\n', '\\N').replace('<i>', '{\\i1}').replace('</i>', '{\\i0}'))

            # 'vostf' subtitles are filed under the 'fr' language code.
            if sub_lang == 'vostf':
                sub_lang = 'fr'
            subtitles.setdefault(sub_lang, []).extend([{
                'ext': 'json',
                'data': json.dumps(sub),
            }, {
                'ext': 'ssa',
                'data': ssa,
            }])
        return subtitles

    def _real_initialize(self):
        """Log in when credentials are available and store the bearer token.

        Does nothing when no username is configured. Any ExtractorError raised
        during login is reported as a warning rather than propagated.
        """
        username, password = self._get_login_info()
        if not username:
            return
        try:
            access_token = (self._download_json(
                self._API_BASE_URL + 'authentication/login', None,
                'Logging in', self._LOGIN_ERR_MESSAGE, fatal=False,
                data=urlencode_postdata({
                    'password': password,
                    'rememberMe': False,
                    'source': 'Web',
                    'username': username,
                })) or {}).get('accessToken')
            if access_token:
                self._HEADERS = {'authorization': 'Bearer ' + access_token}
        except ExtractorError as e:
            # NOTE(review): the request above passes fatal=False, so this
            # handler may be unreachable in practice - confirm.
            message = None
            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
                resp = self._parse_json(
                    e.cause.read().decode(), None, fatal=False) or {}
                message = resp.get('message') or resp.get('code')
            self.report_warning(message or self._LOGIN_ERR_MESSAGE)

    def _real_extract(self, url):
        """Resolve the player configuration, request the stream links and
        assemble the info dict for the video.

        Raises ExtractorError when the links request keeps failing with
        HTTP 401 after three attempts, or on any other HTTP error returned by
        the links endpoint.
        """
        video_id = self._match_id(url)
        video_base_url = self._PLAYER_BASE_URL + 'video/%s/' % video_id
        player = self._download_json(
            video_base_url + 'configuration', video_id,
            'Downloading player config JSON metadata',
            headers=self._HEADERS)['player']
        options = player['options']

        user = options['user']
        if not user.get('hasAccess'):
            self.raise_login_required()

        token = self._download_json(
            user.get('refreshTokenUrl') or (self._PLAYER_BASE_URL + 'refresh/token'),
            video_id, 'Downloading access token', headers={
                'x-player-refresh-token': user['refreshToken']
            }, data=b'')['token']

        links_url = try_get(options, lambda x: x['video']['url']) or (video_base_url + 'link')
        # Random 16-hex-digit value sent to the server inside the encrypted
        # message; _get_subtitles later uses it as part of the decryption key.
        self._K = ''.join([random.choice('0123456789abcdef') for _ in range(16)])
        message = bytes_to_intlist(json.dumps({
            'k': self._K,
            't': token,
        }))

        # Sometimes authentication fails for no good reason, retry with
        # a different random padding
        links_data = None
        for _ in range(3):
            padded_message = intlist_to_bytes(pkcs1pad(message, 128))
            n, e = self._RSA_KEY
            # Modular exponentiation with the key pair above (textbook RSA).
            encrypted_message = long_to_bytes(pow(bytes_to_long(padded_message), e, n))
            authorization = base64.b64encode(encrypted_message).decode()

            try:
                links_data = self._download_json(
                    links_url, video_id, 'Downloading links JSON metadata', headers={
                        'X-Player-Token': authorization
                    }, query={
                        'freeWithAds': 'true',
                        'adaptive': 'false',
                        'withMetadata': 'true',
                        'source': 'Web'
                    })
                break
            except ExtractorError as e:
                # Only HTTP errors are handled here; anything else propagates.
                if not isinstance(e.cause, compat_HTTPError):
                    raise e

                if e.cause.code == 401:
                    # This usually goes away with a different random pkcs1pad, so retry
                    continue

                # From here on ``message`` holds the server's error text; this
                # is safe because every path below raises.
                error = self._parse_json(e.cause.read(), video_id)
                message = error.get('message')
                if e.cause.code == 403 and error.get('code') == 'player-bad-geolocation-country':
                    self.raise_geo_restricted(msg=message)
                raise ExtractorError(message)
        else:
            # Reached only when all three attempts ended in ``continue``.
            raise ExtractorError('Giving up retrying')

        links = links_data.get('links') or {}
        metas = links_data.get('metadata') or {}
        sub_url = (links.get('subtitles') or {}).get('all')
        video_info = links_data.get('video') or {}
        title = metas['title']

        formats = []
        for format_id, qualities in (links.get('streaming') or {}).items():
            if not isinstance(qualities, dict):
                continue
            for quality, load_balancer_url in qualities.items():
                # Each quality URL returns JSON whose 'location' is the manifest URL.
                load_balancer_data = self._download_json(
                    load_balancer_url, video_id,
                    'Downloading %s %s JSON metadata' % (format_id, quality),
                    fatal=False) or {}
                m3u8_url = load_balancer_data.get('location')
                if not m3u8_url:
                    continue
                m3u8_formats = self._extract_m3u8_formats(
                    m3u8_url, video_id, 'mp4', 'm3u8_native',
                    m3u8_id=format_id, fatal=False)
                # Formats under the 'vf' id are tagged as French-language.
                if format_id == 'vf':
                    for f in m3u8_formats:
                        f['language'] = 'fr'
                formats.extend(m3u8_formats)
        self._sort_formats(formats)

        # Optional extra metadata; every field taken from it below tolerates
        # its absence.
        video = (self._download_json(
            self._API_BASE_URL + 'video/%s' % video_id, video_id,
            'Downloading additional video metadata', fatal=False) or {}).get('video') or {}
        show = video.get('show') or {}

        return {
            'id': video_id,
            'title': title,
            'description': strip_or_none(metas.get('summary') or video.get('summary')),
            'thumbnail': video_info.get('image') or player.get('image'),
            'formats': formats,
            'subtitles': self.extract_subtitles(sub_url, video_id),
            'episode': metas.get('subtitle') or video.get('name'),
            'episode_number': int_or_none(video.get('shortNumber')),
            'series': show.get('title'),
            'season_number': int_or_none(video.get('season')),
            'duration': int_or_none(video_info.get('duration') or video.get('duration')),
            'release_date': unified_strdate(video.get('releaseDate')),
            'average_rating': float_or_none(video.get('rating') or metas.get('rating')),
            'comment_count': int_or_none(video.get('commentsCount')),
        }

View File

@@ -0,0 +1,37 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import (
compat_parse_qs,
compat_urlparse,
)
class AdobeConnectIE(InfoExtractor):
    """Extractor for pages served from ``*.adobeconnect.com``.

    Stream parameters are read from the query string of the ``swfUrl``
    assignment found in the page markup; one format is produced per
    comma-separated entry of its ``conStrings`` value.
    """
    _VALID_URL = r'https?://\w+\.adobeconnect\.com/(?P<id>[\w-]+)'

    def _real_extract(self, url):
        """Download the page at *url* and build the info dict from its swfUrl parameters."""
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)
        title = self._html_search_regex(r'<title>(.+?)</title>', webpage, 'title')

        swf_url = self._search_regex(r"swfUrl\s*=\s*'([^']+)'", webpage, 'swf url')
        # The player parameters are the segment following the first '?'.
        qs = compat_parse_qs(swf_url.split('?')[1])
        is_live = qs.get('isLive', ['false'])[0] == 'true'

        formats = [{
            'format_id': con_string.split('://')[0],
            'app': compat_urlparse.quote('?' + con_string.split('?')[1] + 'flvplayerapp/' + qs['appInstance'][0]),
            'ext': 'flv',
            'play_path': 'mp4:' + qs['streamName'][0],
            'rtmp_conn': 'S:' + qs['ticket'][0],
            'rtmp_live': is_live,
            'url': con_string,
        } for con_string in qs['conStrings'][0].split(',')]

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            'is_live': is_live,
        }

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,289 @@
from __future__ import unicode_literals
import functools
import re
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
float_or_none,
int_or_none,
ISO639Utils,
join_nonempty,
OnDemandPagedList,
parse_duration,
str_or_none,
str_to_int,
unified_strdate,
)
class AdobeTVBaseIE(InfoExtractor):
    """Helpers shared by the Adobe TV extractors: API access and metadata parsing."""

    def _call_api(self, path, video_id, query, note=None):
        """Request *path* from the v4 API and return the ``data`` member of the JSON reply."""
        response = self._download_json(
            'http://tv.adobe.com/api/v4/' + path,
            video_id, note, query=query)
        return response['data']

    def _parse_subtitles(self, video_data, url_key):
        """Collect VTT subtitle tracks from ``video_data['translations']``.

        *url_key* names the field of each translation entry that holds the
        track URL; entries without it are ignored.  Returns a dict mapping a
        language code to a list of ``{'ext', 'url'}`` dicts.
        """
        tracks = {}
        for entry in video_data.get('translations', []):
            track_url = entry.get(url_key)
            if track_url:
                # Prefer the explicit language code; otherwise convert the long name.
                code = entry.get('language_w3c') or ISO639Utils.long2short(entry['language_medium'])
                tracks.setdefault(code, []).append({
                    'ext': 'vtt',
                    'url': track_url,
                })
        return tracks

    def _parse_video_data(self, video_data):
        """Turn one API video record into an info dict (formats, subtitles, metadata)."""
        video_id = compat_str(video_data['id'])
        title = video_data['title']

        formats = []
        have_s3_original = False
        for rendition in video_data.get('videos', []):
            rendition_url = rendition.get('url')
            if not rendition_url:
                continue
            fmt = {
                'format_id': rendition.get('quality_level'),
                'fps': int_or_none(rendition.get('frame_rate')),
                'height': int_or_none(rendition.get('height')),
                'tbr': int_or_none(rendition.get('video_data_rate')),
                'width': int_or_none(rendition.get('width')),
                'url': rendition_url,
            }
            filename = rendition.get('original_filename')
            if filename:
                if not (fmt.get('height') and fmt.get('width')):
                    # Fall back to a "_<width>x<height>" marker in the file name.
                    size = re.search(r'_(\d+)x(\d+)', filename)
                    if size:
                        fmt['height'] = int(size.group(2))
                        fmt['width'] = int(size.group(1))
                if filename.startswith('s3://') and not have_s3_original:
                    # Only the first s3:// original is exposed as its own format.
                    formats.append({
                        'format_id': 'original',
                        'quality': 1,
                        'url': filename.replace('s3://', 'https://s3.amazonaws.com/'),
                    })
                    have_s3_original = True
            formats.append(fmt)
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'description': video_data.get('description'),
            'thumbnail': video_data.get('thumbnail'),
            'upload_date': unified_strdate(video_data.get('start_date')),
            'duration': parse_duration(video_data.get('duration')),
            'view_count': str_to_int(video_data.get('playcount')),
            'formats': formats,
            'subtitles': self._parse_subtitles(video_data, 'vtt'),
        }
class AdobeTVEmbedIE(AdobeTVBaseIE):
    """Extractor for ``tv.adobe.com/embed/<n>/<id>`` URLs."""
    IE_NAME = 'adobetv:embed'
    _VALID_URL = r'https?://tv\.adobe\.com/embed/\d+/(?P<id>\d+)'
    _TEST = {
        'url': 'https://tv.adobe.com/embed/22/4153',
        'md5': 'c8c0461bf04d54574fc2b4d07ac6783a',
        'info_dict': {
            'id': '4153',
            'ext': 'flv',
            'title': 'Creating Graphics Optimized for BlackBerry',
            'description': 'md5:eac6e8dced38bdaae51cd94447927459',
            'thumbnail': r're:https?://.*\.jpg$',
            'upload_date': '20091109',
            'duration': 377,
            'view_count': int,
        },
    }

    def _real_extract(self, url):
        """Fetch the episode record for the id in *url* and parse it."""
        video_id = self._match_id(url)
        episodes = self._call_api(
            'episode/' + video_id, video_id, {'disclosure': 'standard'})
        # The API answers with a list; the first element is the episode.
        return self._parse_video_data(episodes[0])
class AdobeTVIE(AdobeTVBaseIE):
    """Extractor for ``tv.adobe.com/[<lang>/]watch/<show>/<episode>`` URLs."""
    IE_NAME = 'adobetv'
    _VALID_URL = r'https?://tv\.adobe\.com/(?:(?P<language>fr|de|es|jp)/)?watch/(?P<show_urlname>[^/]+)/(?P<id>[^/]+)'
    _TEST = {
        'url': 'http://tv.adobe.com/watch/the-complete-picture-with-julieanne-kost/quick-tip-how-to-draw-a-circle-around-an-object-in-photoshop/',
        'md5': '9bc5727bcdd55251f35ad311ca74fa1e',
        'info_dict': {
            'id': '10981',
            'ext': 'mp4',
            'title': 'Quick Tip - How to Draw a Circle Around an Object in Photoshop',
            'description': 'md5:99ec318dc909d7ba2a1f2b038f7d2311',
            'thumbnail': r're:https?://.*\.jpg$',
            'upload_date': '20110914',
            'duration': 60,
            'view_count': int,
        },
    }

    def _real_extract(self, url):
        """Look the episode up by show and episode URL names, then parse it."""
        language, show_urlname, urlname = self._match_valid_url(url).groups()
        api_query = {
            'disclosure': 'standard',
            # URLs without a language prefix are requested as English.
            'language': language or 'en',
            'show_urlname': show_urlname,
            'urlname': urlname,
        }
        episodes = self._call_api('episode/get', urlname, api_query)
        return self._parse_video_data(episodes[0])
class AdobeTVPlaylistBaseIE(AdobeTVBaseIE):
    """Base class for paged Adobe TV listings.

    Subclasses provide ``_RESOURCE`` (the API path to page through) and
    ``_process_data`` (turns one element of a page into a playlist entry).
    """
    _PAGE_SIZE = 25

    def _fetch_page(self, display_id, query, page):
        """Yield the processed entries of one page.

        *page* is the zero-based index passed in by the paged list; the API
        is asked for ``page + 1``.  *query* is left untouched: a per-page copy
        carries the ``page`` parameter, so the caller's dict (shared between
        all lazily fetched pages) is not mutated.
        """
        page += 1
        page_query = dict(query)
        page_query['page'] = page
        for element_data in self._call_api(
                self._RESOURCE, display_id, page_query, 'Download Page %d' % page):
            yield self._process_data(element_data)

    def _extract_playlist_entries(self, display_id, query):
        """Return a lazily evaluated paged list of entries for *query*."""
        return OnDemandPagedList(functools.partial(
            self._fetch_page, display_id, query), self._PAGE_SIZE)
class AdobeTVShowIE(AdobeTVPlaylistBaseIE):
    """Playlist extractor for ``tv.adobe.com/[<lang>/]show/<name>`` URLs."""
    IE_NAME = 'adobetv:show'
    _VALID_URL = r'https?://tv\.adobe\.com/(?:(?P<language>fr|de|es|jp)/)?show/(?P<id>[^/]+)'
    _TEST = {
        'url': 'http://tv.adobe.com/show/the-complete-picture-with-julieanne-kost',
        'info_dict': {
            'id': '36',
            'title': 'The Complete Picture with Julieanne Kost',
            'description': 'md5:fa50867102dcd1aa0ddf2ab039311b27',
        },
        'playlist_mincount': 136,
    }
    _RESOURCE = 'episode'
    # Every element of an 'episode' page is parsed as a full video record.
    _process_data = AdobeTVBaseIE._parse_video_data

    def _real_extract(self, url):
        """Fetch the show record and return a playlist of its episodes."""
        language, show_urlname = self._match_valid_url(url).groups()
        query = {
            'disclosure': 'standard',
            # URLs without a language prefix are requested as English.
            'language': language or 'en',
            'show_urlname': show_urlname,
        }
        shows = self._call_api('show/get', show_urlname, query)
        show_data = shows[0]
        entries = self._extract_playlist_entries(show_urlname, query)
        return self.playlist_result(
            entries,
            str_or_none(show_data.get('id')),
            show_data.get('show_name'),
            show_data.get('show_description'))
class AdobeTVChannelIE(AdobeTVPlaylistBaseIE):
    """Playlist extractor for ``tv.adobe.com/[<lang>/]channel/<name>[/<category>]`` URLs."""
    IE_NAME = 'adobetv:channel'
    _VALID_URL = r'https?://tv\.adobe\.com/(?:(?P<language>fr|de|es|jp)/)?channel/(?P<id>[^/]+)(?:/(?P<category_urlname>[^/]+))?'
    _TEST = {
        'url': 'http://tv.adobe.com/channel/development',
        'info_dict': {
            'id': 'development',
        },
        'playlist_mincount': 96,
    }
    _RESOURCE = 'show'

    def _process_data(self, show_data):
        """Wrap one show record as a URL result handled by ``AdobeTVShow``."""
        show_id = str_or_none(show_data.get('id'))
        return self.url_result(show_data['url'], 'AdobeTVShow', show_id)

    def _real_extract(self, url):
        """Return a playlist of the shows listed for the channel (and optional category)."""
        language, channel_urlname, category_urlname = self._match_valid_url(url).groups()
        query = {
            'channel_urlname': channel_urlname,
            # URLs without a language prefix are requested as English.
            'language': language or 'en',
        }
        if category_urlname:
            query['category_urlname'] = category_urlname
        entries = self._extract_playlist_entries(channel_urlname, query)
        return self.playlist_result(entries, channel_urlname)
class AdobeTVVideoIE(AdobeTVBaseIE):
    """Extractor for ``video.tv.adobe.com/v/<id>`` player pages.

    Metadata is read from the JSON value assigned to ``var bridge`` in the
    page source.
    """
    IE_NAME = 'adobetv:video'
    _VALID_URL = r'https?://video\.tv\.adobe\.com/v/(?P<id>\d+)'
    _TEST = {
        # From https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners
        'url': 'https://video.tv.adobe.com/v/2456/',
        'md5': '43662b577c018ad707a63766462b1e87',
        'info_dict': {
            'id': '2456',
            'ext': 'mp4',
            'title': 'New experience with Acrobat DC',
            'description': 'New experience with Acrobat DC',
            'duration': 248.667,
        },
    }

    def _real_extract(self, url):
        """Parse the bridged JSON of the player page into an info dict.

        ``duration`` is ``None`` when no source reports one, and
        ``thumbnail`` is ``None`` when the ``video`` member is missing or null.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)
        video_data = self._parse_json(self._search_regex(
            r'var\s+bridge\s*=\s*([^;]+);', webpage, 'bridged data'), video_id)
        title = video_data['title']

        formats = []
        sources = video_data.get('sources') or []
        for source in sources:
            source_src = source.get('src')
            if not source_src:
                continue
            formats.append({
                'filesize': int_or_none(source.get('kilobytes') or None, invscale=1000),
                'format_id': join_nonempty(source.get('format'), source.get('label')),
                'height': int_or_none(source.get('height') or None),
                'tbr': int_or_none(source.get('bitrate') or None),
                'width': int_or_none(source.get('width') or None),
                'url': source_src,
            })
        self._sort_formats(formats)

        # For both metadata and downloaded files the duration varies among
        # formats, so the largest reported value is used.  max() raises
        # ValueError on an empty sequence, hence the explicit guard for
        # pages whose sources carry no (non-zero) duration at all.
        durations = [
            value for value in (
                float_or_none(source.get('duration'), scale=1000)
                for source in sources)
            if value]
        duration = max(durations) if durations else None

        return {
            'id': video_id,
            'formats': formats,
            'title': title,
            'description': video_data.get('description'),
            # 'video' may be present but null, so do not rely on the .get() default.
            'thumbnail': (video_data.get('video') or {}).get('poster'),
            'duration': duration,
            'subtitles': self._parse_subtitles(video_data, 'vttPath'),
        }

View File

@@ -0,0 +1,201 @@
# coding: utf-8
from __future__ import unicode_literals
import json
from .turner import TurnerBaseIE
from ..utils import (
determine_ext,
float_or_none,
int_or_none,
mimetype2ext,
parse_age_limit,
parse_iso8601,
strip_or_none,
try_get,
)
class AdultSwimIE(TurnerBaseIE):
    """Extractor for ``adultswim.com/videos/<show>[/<episode>]`` URLs.

    A URL with an episode path yields a single video; a URL with only a show
    path yields a playlist of that show's videos.
    """
    _VALID_URL = r'https?://(?:www\.)?adultswim\.com/videos/(?P<show_path>[^/?#]+)(?:/(?P<episode_path>[^/?#]+))?'
    _TESTS = [{
        'url': 'http://adultswim.com/videos/rick-and-morty/pilot',
        'info_dict': {
            'id': 'rQxZvXQ4ROaSOqq-or2Mow',
            'ext': 'mp4',
            'title': 'Rick and Morty - Pilot',
            'description': 'Rick moves in with his daughter\'s family and establishes himself as a bad influence on his grandson, Morty.',
            'timestamp': 1543294800,
            'upload_date': '20181127',
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
        'expected_warnings': ['Unable to download f4m manifest'],
    }, {
        'url': 'http://www.adultswim.com/videos/tim-and-eric-awesome-show-great-job/dr-steve-brule-for-your-wine/',
        'info_dict': {
            'id': 'sY3cMUR_TbuE4YmdjzbIcQ',
            'ext': 'mp4',
            'title': 'Tim and Eric Awesome Show Great Job! - Dr. Steve Brule, For Your Wine',
            'description': 'Dr. Brule reports live from Wine Country with a special report on wines. \nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.',
            'upload_date': '20080124',
            'timestamp': 1201150800,
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
        'skip': '404 Not Found',
    }, {
        'url': 'http://www.adultswim.com/videos/decker/inside-decker-a-new-hero/',
        'info_dict': {
            'id': 'I0LQFQkaSUaFp8PnAWHhoQ',
            'ext': 'mp4',
            'title': 'Decker - Inside Decker: A New Hero',
            'description': 'The guys recap the conclusion of the season. They announce a new hero, take a peek into the Victorville Film Archive and welcome back the talented James Dean.',
            'timestamp': 1469480460,
            'upload_date': '20160725',
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
        'expected_warnings': ['Unable to download f4m manifest'],
    }, {
        'url': 'http://www.adultswim.com/videos/attack-on-titan',
        'info_dict': {
            'id': 'attack-on-titan',
            'title': 'Attack on Titan',
            'description': 'md5:41caa9416906d90711e31dc00cb7db7e',
        },
        'playlist_mincount': 12,
    }, {
        'url': 'http://www.adultswim.com/videos/streams/williams-stream',
        'info_dict': {
            'id': 'd8DEBj7QRfetLsRgFnGEyg',
            'ext': 'mp4',
            'title': r're:^Williams Stream \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
            'description': 'original programming',
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
        'skip': '404 Not Found',
    }]

    def _real_extract(self, url):
        """Query the site's search API for the show and build a video or playlist result."""
        show_path, episode_path = self._match_valid_url(url).groups()
        display_id = episode_path or show_path

        # Outer GraphQL query; "%%s" leaves a placeholder for the field list
        # that is filled in below depending on the kind of URL.
        query = '''query {
getShowBySlug(slug:"%s") {
%%s
}
}''' % show_path
        if episode_path:
            query = query % '''title
getVideoBySlug(slug:"%s") {
_id
auth
description
duration
episodeNumber
launchDate
mediaID
seasonNumber
poster
title
tvRating
}''' % episode_path
        else:
            query = query % '''metaDescription
title
videos(first:1000,sort:["episode_number"]) {
edges {
node {
_id
slug
}
}
}'''

        show_data = self._download_json(
            'https://www.adultswim.com/api/search', display_id,
            data=json.dumps({'query': query}).encode(),
            headers={'Content-Type': 'application/json'})['data']['getShowBySlug']

        if episode_path:
            video_data = show_data['getVideoBySlug']
            video_id = video_data['_id']
            episode_title = title = video_data['title']
            series = show_data.get('title')
            if series:
                title = '%s - %s' % (series, title)
            info = {
                'id': video_id,
                'title': title,
                'description': strip_or_none(video_data.get('description')),
                'duration': float_or_none(video_data.get('duration')),
                'formats': [],
                'subtitles': {},
                'age_limit': parse_age_limit(video_data.get('tvRating')),
                'thumbnail': video_data.get('poster'),
                'timestamp': parse_iso8601(video_data.get('launchDate')),
                'series': series,
                'season_number': int_or_none(video_data.get('seasonNumber')),
                'episode': episode_title,
                'episode_number': int_or_none(video_data.get('episodeNumber')),
            }

            auth = video_data.get('auth')
            media_id = video_data.get('mediaID')
            if media_id:
                info.update(self._extract_ngtv_info(media_id, {
                    # CDN_TOKEN_APP_ID from:
                    # https://d2gg02c3xr550i.cloudfront.net/assets/asvp.e9c8bef24322d060ef87.bundle.js
                    'appId': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhcHBJZCI6ImFzLXR2ZS1kZXNrdG9wLXB0enQ2bSIsInByb2R1Y3QiOiJ0dmUiLCJuZXR3b3JrIjoiYXMiLCJwbGF0Zm9ybSI6ImRlc2t0b3AiLCJpYXQiOjE1MzI3MDIyNzl9.BzSCk-WYOZ2GMCIaeVb8zWnzhlgnXuJTCu0jGp_VaZE',
                }, {
                    'url': url,
                    'site_name': 'AdultSwim',
                    'auth_required': auth,
                }))

            if not auth:
                # Videos that need no authentication also list their stream
                # assets through the shows API; failure here is non-fatal.
                extract_data = self._download_json(
                    'https://www.adultswim.com/api/shows/v1/videos/' + video_id,
                    video_id, query={'fields': 'stream'}, fatal=False) or {}
                assets = try_get(extract_data, lambda x: x['data']['video']['stream']['assets'], list) or []
                for asset in assets:
                    asset_url = asset.get('url')
                    if not asset_url:
                        continue
                    ext = determine_ext(asset_url, mimetype2ext(asset.get('mime_type')))
                    if ext == 'm3u8':
                        info['formats'].extend(self._extract_m3u8_formats(
                            asset_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
                    elif ext == 'f4m':
                        # f4m (HDS) assets are skipped; no formats are extracted from them.
                        continue
                    elif ext in ('scc', 'ttml', 'vtt'):
                        info['subtitles'].setdefault('en', []).append({
                            'url': asset_url,
                        })
            self._sort_formats(info['formats'])

            return info

        # Show URL: one playlist entry per video that has a slug.  try_get
        # tolerates a missing or null 'videos'/'edges' member.
        entries = []
        for edge in try_get(show_data, lambda x: x['videos']['edges'], list) or []:
            video = edge.get('node') or {}
            slug = video.get('slug')
            if not slug:
                continue
            entries.append(self.url_result(
                'http://adultswim.com/videos/%s/%s' % (show_path, slug),
                'AdultSwim', video.get('_id')))
        return self.playlist_result(
            entries, show_path, show_data.get('title'),
            strip_or_none(show_data.get('metaDescription')))

View File

@@ -0,0 +1,341 @@
# coding: utf-8
from __future__ import unicode_literals
from .theplatform import ThePlatformIE
from ..utils import (
ExtractorError,
GeoRestrictedError,
int_or_none,
update_url_query,
urlencode_postdata,
)
class AENetworksBaseIE(ThePlatformIE):
    """Shared logic for the sites listed in ``_DOMAIN_MAP``."""
    _BASE_URL_REGEX = r'''(?x)https?://
(?:(?:www|play|watch)\.)?
(?P<domain>
(?:history(?:vault)?|aetv|mylifetime|lifetimemovieclub)\.com|
fyi\.tv
)/'''
    _THEPLATFORM_KEY = '43jXaGRQud'
    _THEPLATFORM_SECRET = 'S10BPXHMlb'
    # domain -> (requestor id used for MVPD auth, brand used in API URLs)
    _DOMAIN_MAP = {
        'history.com': ('HISTORY', 'history'),
        'aetv.com': ('AETV', 'aetv'),
        'mylifetime.com': ('LIFETIME', 'lifetime'),
        'lifetimemovieclub.com': ('LIFETIMEMOVIECLUB', 'lmc'),
        'fyi.tv': ('FYI', 'fyi'),
        'historyvault.com': (None, 'historyvault'),
        'biography.com': (None, 'biography'),
    }

    def _extract_aen_smil(self, smil_url, video_id, auth=None):
        """Fetch formats and subtitles for *smil_url* using several query variants.

        Each variant is signed and requested in turn.  A geo-restriction error
        is re-raised immediately; any other extractor error is remembered and
        re-raised only if no variant produced a format.
        """
        common_query = {'mbr': 'true'}
        if auth:
            common_query['auth'] = auth
        variants = [{
            'assetTypes': 'high_video_ak',
            'switch': 'hls_high_ak'
        }, {
            'assetTypes': 'high_video_s3'
        }, {
            'assetTypes': 'high_video_s3',
            'switch': 'hls_high_fastly',
        }]

        formats, subtitles = [], {}
        last_error = None
        for variant in variants:
            variant.update(common_query)
            signed_url = self._sign_url(
                update_url_query(smil_url, variant),
                self._THEPLATFORM_KEY, self._THEPLATFORM_SECRET)
            note = 'Downloading %s SMIL data' % (variant.get('switch') or variant['assetTypes'])
            try:
                tp_formats, tp_subtitles = self._extract_theplatform_smil(
                    signed_url, video_id, note)
            except GeoRestrictedError:
                raise
            except ExtractorError as e:
                last_error = e
                continue
            formats.extend(tp_formats)
            subtitles = self._merge_subtitles(subtitles, tp_subtitles)
        if last_error and not formats:
            raise last_error
        self._sort_formats(formats)

        return {
            'id': video_id,
            'formats': formats,
            'subtitles': subtitles,
        }

    def _extract_aetn_info(self, domain, filter_key, filter_value, url):
        """Look a video up in the brand feed and return its complete info dict.

        The feed is filtered by ``filter[<filter_key>] = <filter_value>`` and
        the first result is used.  When the metadata marks the video as
        behind a wall, an MVPD auth token is obtained before the SMIL is
        requested.
        """
        requestor_id, brand = self._DOMAIN_MAP[domain]
        feed = self._download_json(
            'https://feeds.video.aetnd.com/api/v2/%s/videos' % brand,
            filter_value, query={'filter[%s]' % filter_key: filter_value})
        result = feed['results'][0]
        title = result['title']
        video_id = result['id']
        media_url = result['publicUrl']

        theplatform_path = self._search_regex(
            r'https?://link\.theplatform\.com/s/([^?]+)', media_url, 'theplatform_path')
        theplatform_metadata = self._download_theplatform_metadata(theplatform_path, video_id)
        info = self._parse_theplatform_metadata(theplatform_metadata)

        auth = None
        if theplatform_metadata.get('AETN$isBehindWall'):
            program_id = (
                theplatform_metadata.get('AETN$PPL_pplProgramId')
                or theplatform_metadata.get('AETN$PPL_pplProgramId_OLD'))
            resource = self._get_mvpd_resource(
                requestor_id, theplatform_metadata['title'], program_id,
                theplatform_metadata['ratings'][0]['rating'])
            auth = self._extract_mvpd_auth(
                url, video_id, requestor_id, resource)

        info.update(self._extract_aen_smil(media_url, video_id, auth))
        info.update({
            'title': title,
            'series': result.get('seriesName'),
            'season_number': int_or_none(result.get('tvSeasonNumber')),
            'episode_number': int_or_none(result.get('tvSeasonEpisodeNumber')),
        })
        return info
class AENetworksIE(AENetworksBaseIE):
    """Single-video extractor for episode, movie, special and video pages."""
    IE_NAME = 'aenetworks'
    IE_DESC = 'A+E Networks: A&E, Lifetime, History.com, FYI Network and History Vault'
    _VALID_URL = AENetworksBaseIE._BASE_URL_REGEX + r'''(?P<id>
shows/[^/]+/season-\d+/episode-\d+|
(?:
(?:movie|special)s/[^/]+|
(?:shows/[^/]+/)?videos
)/[^/?#&]+
)'''
    _TESTS = [{
        'url': 'http://www.history.com/shows/mountain-men/season-1/episode-1',
        'info_dict': {
            'id': '22253814',
            'ext': 'mp4',
            'title': 'Winter is Coming',
            'description': 'md5:641f424b7a19d8e24f26dea22cf59d74',
            'timestamp': 1338306241,
            'upload_date': '20120529',
            'uploader': 'AENE-NEW',
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
        'add_ie': ['ThePlatform'],
        'skip': 'This video is only available for users of participating TV providers.',
    }, {
        'url': 'http://www.aetv.com/shows/duck-dynasty/season-9/episode-1',
        'info_dict': {
            'id': '600587331957',
            'ext': 'mp4',
            'title': 'Inlawful Entry',
            'description': 'md5:57c12115a2b384d883fe64ca50529e08',
            'timestamp': 1452634428,
            'upload_date': '20160112',
            'uploader': 'AENE-NEW',
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
        'add_ie': ['ThePlatform'],
    }, {
        'url': 'http://www.fyi.tv/shows/tiny-house-nation/season-1/episode-8',
        'only_matching': True
    }, {
        'url': 'http://www.mylifetime.com/shows/project-runway-junior/season-1/episode-6',
        'only_matching': True
    }, {
        'url': 'http://www.mylifetime.com/movies/center-stage-on-pointe/full-movie',
        'only_matching': True
    }, {
        'url': 'https://watch.lifetimemovieclub.com/movies/10-year-reunion/full-movie',
        'only_matching': True
    }, {
        'url': 'http://www.history.com/specials/sniper-into-the-kill-zone/full-special',
        'only_matching': True
    }, {
        'url': 'https://www.aetv.com/specials/hunting-jonbenets-killer-the-untold-story/preview-hunting-jonbenets-killer-the-untold-story',
        'only_matching': True
    }, {
        'url': 'http://www.history.com/videos/history-of-valentines-day',
        'only_matching': True
    }, {
        'url': 'https://play.aetv.com/shows/duck-dynasty/videos/best-of-duck-dynasty-getting-quack-in-shape',
        'only_matching': True
    }]

    def _real_extract(self, url):
        """Look the video up by its canonical path (the matched id with a leading '/')."""
        domain, canonical = self._match_valid_url(url).groups()
        canonical_path = '/' + canonical
        return self._extract_aetn_info(domain, 'canonical', canonical_path, url)
class AENetworksListBaseIE(AENetworksBaseIE):
    """Base class for playlist extractors backed by the GraphQL endpoint.

    Subclasses define ``_RESOURCE``, ``_FIELDS``, ``_ITEMS_KEY``,
    ``_PLAYLIST_TITLE_KEY``, ``_PLAYLIST_DESCRIPTION_KEY`` and ``_get_doc``.
    """

    def _call_api(self, resource, slug, brand, fields):
        """POST a GraphQL query for *resource* and return ``data[resource]`` of the reply."""
        graphql = '''{
%s(slug: "%s") {
%s
}
}''' % (resource, slug, fields)
        response = self._download_json(
            'https://yoga.appsvcs.aetnd.com/graphql',
            slug, query={'brand': brand},
            data=urlencode_postdata({'query': graphql}))
        return response['data'][resource]

    def _real_extract(self, url):
        """Build a playlist of ``AENetworksIE`` URL results from the listed items."""
        domain, slug = self._match_valid_url(url).groups()
        brand = self._DOMAIN_MAP[domain][1]
        playlist = self._call_api(self._RESOURCE, slug, brand, self._FIELDS)
        base_url = 'http://watch.%s' % domain

        entries = []
        for item in (playlist.get(self._ITEMS_KEY) or []):
            doc = self._get_doc(item)
            canonical = doc.get('canonical')
            if canonical:
                # Items without a canonical path cannot be turned into a URL.
                entries.append(self.url_result(
                    base_url + canonical, AENetworksIE.ie_key(), doc.get('id')))

        description_key = self._PLAYLIST_DESCRIPTION_KEY
        description = playlist.get(description_key) if description_key else None

        return self.playlist_result(
            entries, playlist.get('id'),
            playlist.get(self._PLAYLIST_TITLE_KEY), description)
class AENetworksCollectionIE(AENetworksListBaseIE):
    """Playlist extractor for ``.../list/<slug>`` and ``.../collections/<slug>`` URLs."""
    IE_NAME = 'aenetworks:collection'
    _VALID_URL = AENetworksBaseIE._BASE_URL_REGEX + r'(?:[^/]+/)*(?:list|collections)/(?P<id>[^/?#&]+)/?(?:[?#&]|$)'
    _TESTS = [{
        'url': 'https://watch.historyvault.com/list/america-the-story-of-us',
        'info_dict': {
            'id': '282',
            'title': 'America The Story of Us',
        },
        'playlist_mincount': 12,
    }, {
        'url': 'https://watch.historyvault.com/shows/america-the-story-of-us-2/season-1/list/america-the-story-of-us',
        'only_matching': True
    }, {
        'url': 'https://www.historyvault.com/collections/mysteryquest',
        'only_matching': True
    }]
    _RESOURCE = 'list'
    _ITEMS_KEY = 'items'
    _PLAYLIST_TITLE_KEY = 'display_title'
    # No description field is requested for lists.
    _PLAYLIST_DESCRIPTION_KEY = None
    _FIELDS = '''id
display_title
items {
... on ListVideoItem {
doc {
canonical
id
}
}
}'''

    def _get_doc(self, item):
        """Return the ``doc`` member of a list item, or an empty dict when absent or falsy."""
        doc = item.get('doc')
        return doc if doc else {}
class AENetworksShowIE(AENetworksListBaseIE):
    """Playlist extractor for ``.../shows/<slug>`` URLs."""
    IE_NAME = 'aenetworks:show'
    _VALID_URL = AENetworksBaseIE._BASE_URL_REGEX + r'shows/(?P<id>[^/?#&]+)/?(?:[?#&]|$)'
    _TESTS = [{
        'url': 'http://www.history.com/shows/ancient-aliens',
        'info_dict': {
            'id': 'SERIES1574',
            'title': 'Ancient Aliens',
            'description': 'md5:3f6d74daf2672ff3ae29ed732e37ea7f',
        },
        'playlist_mincount': 150,
    }]
    _RESOURCE = 'series'
    _ITEMS_KEY = 'episodes'
    _PLAYLIST_TITLE_KEY = 'title'
    _PLAYLIST_DESCRIPTION_KEY = 'description'
    _FIELDS = '''description
id
title
episodes {
canonical
id
}'''

    def _get_doc(self, item):
        """Episodes carry ``canonical`` and ``id`` directly, so the item is returned as is."""
        return item
class HistoryTopicIE(AENetworksBaseIE):
    """Redirects ``history.com/topics/<topic>/<name>-video`` pages to ``/videos/<name>``."""
    IE_NAME = 'history:topic'
    IE_DESC = 'History.com Topic'
    _VALID_URL = r'https?://(?:www\.)?history\.com/topics/[^/]+/(?P<id>[\w+-]+?)-video'
    _TESTS = [{
        'url': 'https://www.history.com/topics/valentines-day/history-of-valentines-day-video',
        'info_dict': {
            'id': '40700995724',
            'ext': 'mp4',
            'title': "History of Valentines Day",
            'description': 'md5:7b57ea4829b391995b405fa60bd7b5f7',
            'timestamp': 1375819729,
            'upload_date': '20130806',
            'uploader': 'AENE-NEW',
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
        'add_ie': ['ThePlatform'],
    }]

    def _real_extract(self, url):
        """Hand the corresponding ``/videos/`` URL over to ``AENetworksIE``."""
        display_id = self._match_id(url)
        video_url = 'http://www.history.com/videos/' + display_id
        return self.url_result(video_url, AENetworksIE.ie_key())
class HistoryPlayerIE(AENetworksBaseIE):
    """Extractor for ``/player/<id>`` URLs on history.com and biography.com."""
    IE_NAME = 'history:player'
    _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:history|biography)\.com)/player/(?P<id>\d+)'
    _TESTS = []

    def _real_extract(self, url):
        """Look the video up in the brand feed by its numeric id."""
        mobj = self._match_valid_url(url)
        domain, video_id = mobj.groups()
        return self._extract_aetn_info(domain, 'id', video_id, url)
class BiographyIE(AENetworksBaseIE):
    """Extractor for ``biography.com/video/<slug>`` pages.

    The page embeds a player URL in a ``<phoenix-iframe>`` tag; extraction is
    delegated to ``HistoryPlayerIE`` for that URL.
    """
    _VALID_URL = r'https?://(?:www\.)?biography\.com/video/(?P<id>[^/?#&]+)'
    _TESTS = [{
        'url': 'https://www.biography.com/video/vincent-van-gogh-full-episode-2075049808',
        'info_dict': {
            'id': '30322987',
            'ext': 'mp4',
            'title': 'Vincent Van Gogh - Full Episode',
            'description': 'A full biography about the most influential 20th century painter, Vincent Van Gogh.',
            'timestamp': 1311970571,
            'upload_date': '20110729',
            'uploader': 'AENE-NEW',
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
        'add_ie': ['ThePlatform'],
    }]

    def _real_extract(self, url):
        """Find the embedded player URL in the page and return it as a URL result."""
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)
        # The player URL must itself match HistoryPlayerIE's URL pattern.
        player_pattern = r'<phoenix-iframe[^>]+src="(%s)' % HistoryPlayerIE._VALID_URL
        player_url = self._search_regex(player_pattern, webpage, 'player URL')
        return self.url_result(player_url, HistoryPlayerIE.ie_key())

View File

@@ -0,0 +1,479 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import compat_xpath
from ..utils import (
date_from_str,
determine_ext,
ExtractorError,
int_or_none,
qualities,
traverse_obj,
unified_strdate,
unified_timestamp,
update_url_query,
url_or_none,
urlencode_postdata,
xpath_text,
)
class AfreecaTVIE(InfoExtractor):
IE_NAME = 'afreecatv'
IE_DESC = 'afreecatv.com'
_VALID_URL = r'''(?x)
https?://
(?:
(?:(?:live|afbbs|www)\.)?afreeca(?:tv)?\.com(?::\d+)?
(?:
/app/(?:index|read_ucc_bbs)\.cgi|
/player/[Pp]layer\.(?:swf|html)
)\?.*?\bnTitleNo=|
vod\.afreecatv\.com/PLAYER/STATION/
)
(?P<id>\d+)
'''
_NETRC_MACHINE = 'afreecatv'
_TESTS = [{
'url': 'http://live.afreecatv.com:8079/app/index.cgi?szType=read_ucc_bbs&szBjId=dailyapril&nStationNo=16711924&nBbsNo=18605867&nTitleNo=36164052&szSkin=',
'md5': 'f72c89fe7ecc14c1b5ce506c4996046e',
'info_dict': {
'id': '36164052',
'ext': 'mp4',
'title': '데일리 에이프릴 요정들의 시상식!',
'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$',
'uploader': 'dailyapril',
'uploader_id': 'dailyapril',
'upload_date': '20160503',
},
'skip': 'Video is gone',
}, {
'url': 'http://afbbs.afreecatv.com:8080/app/read_ucc_bbs.cgi?nStationNo=16711924&nTitleNo=36153164&szBjId=dailyapril&nBbsNo=18605867',
'info_dict': {
'id': '36153164',
'title': "BJ유트루와 함께하는 '팅커벨 메이크업!'",
'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$',
'uploader': 'dailyapril',
'uploader_id': 'dailyapril',
},
'playlist_count': 2,
'playlist': [{
'md5': 'd8b7c174568da61d774ef0203159bf97',
'info_dict': {
'id': '36153164_1',
'ext': 'mp4',
'title': "BJ유트루와 함께하는 '팅커벨 메이크업!'",
'upload_date': '20160502',
},
}, {
'md5': '58f2ce7f6044e34439ab2d50612ab02b',
'info_dict': {
'id': '36153164_2',
'ext': 'mp4',
'title': "BJ유트루와 함께하는 '팅커벨 메이크업!'",
'upload_date': '20160502',
},
}],
'skip': 'Video is gone',
}, {
'url': 'http://vod.afreecatv.com/PLAYER/STATION/18650793',
'info_dict': {
'id': '18650793',
'ext': 'mp4',
'title': '오늘은 다르다! 쏘님의 우월한 위아래~ 댄스리액션!',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': '윈아디',
'uploader_id': 'badkids',
'duration': 107,
},
'params': {
'skip_download': True,
},
}, {
'url': 'http://vod.afreecatv.com/PLAYER/STATION/10481652',
'info_dict': {
'id': '10481652',
'title': "BJ유트루와 함께하는 '팅커벨 메이크업!'",
'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$',
'uploader': 'dailyapril',
'uploader_id': 'dailyapril',
'duration': 6492,
},
'playlist_count': 2,
'playlist': [{
'md5': 'd8b7c174568da61d774ef0203159bf97',
'info_dict': {
'id': '20160502_c4c62b9d_174361386_1',
'ext': 'mp4',
'title': "BJ유트루와 함께하는 '팅커벨 메이크업!' (part 1)",
'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$',
'uploader': 'dailyapril',
'uploader_id': 'dailyapril',
'upload_date': '20160502',
'duration': 3601,
},
}, {
'md5': '58f2ce7f6044e34439ab2d50612ab02b',
'info_dict': {
'id': '20160502_39e739bb_174361386_2',
'ext': 'mp4',
'title': "BJ유트루와 함께하는 '팅커벨 메이크업!' (part 2)",
'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$',
'uploader': 'dailyapril',
'uploader_id': 'dailyapril',
'upload_date': '20160502',
'duration': 2891,
},
}],
'params': {
'skip_download': True,
},
}, {
# non standard key
'url': 'http://vod.afreecatv.com/PLAYER/STATION/20515605',
'info_dict': {
'id': '20170411_BE689A0E_190960999_1_2_h',
'ext': 'mp4',
'title': '혼자사는여자집',
'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$',
'uploader': '♥이슬이',
'uploader_id': 'dasl8121',
'upload_date': '20170411',
'duration': 213,
},
'params': {
'skip_download': True,
},
}, {
# PARTIAL_ADULT
'url': 'http://vod.afreecatv.com/PLAYER/STATION/32028439',
'info_dict': {
'id': '20180327_27901457_202289533_1',
'ext': 'mp4',
'title': '[생]빨개요♥ (part 1)',
'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$',
'uploader': '[SA]서아',
'uploader_id': 'bjdyrksu',
'upload_date': '20180327',
'duration': 3601,
},
'params': {
'skip_download': True,
},
'expected_warnings': ['adult content'],
}, {
'url': 'http://www.afreecatv.com/player/Player.swf?szType=szBjId=djleegoon&nStationNo=11273158&nBbsNo=13161095&nTitleNo=36327652',
'only_matching': True,
}, {
'url': 'http://vod.afreecatv.com/PLAYER/STATION/15055030',
'only_matching': True,
}]
@staticmethod
def parse_video_key(key):
video_key = {}
m = re.match(r'^(?P<upload_date>\d{8})_\w+_(?P<part>\d+)$', key)
if m:
video_key['upload_date'] = m.group('upload_date')
video_key['part'] = int(m.group('part'))
return video_key
def _real_initialize(self):
self._login()
def _login(self):
    """Log in with the configured credentials, if any.

    Does nothing when no username is available.  Raises ``ExtractorError``
    (marked as expected) when the login reply's ``RESULT`` is not 1; the
    message is looked up from the known result codes, with a generic
    fallback for unknown ones.
    """
    username, password = self._get_login_info()
    if username is None:
        return

    response = self._download_json(
        'https://login.afreecatv.com/app/LoginAction.php', None,
        'Logging in', data=urlencode_postdata({
            'szWork': 'login',
            'szType': 'json',
            'szUid': username,
            'szPassword': password,
            'isSaveId': 'false',
            'szScriptVar': 'oLoginRet',
            'szAction': '',
        }))

    # Result code -> message (or URL) reported to the user.
    _ERRORS = {
        -4: 'Your account has been suspended due to a violation of our terms and policies.',
        -5: 'https://member.afreecatv.com/app/user_delete_progress.php',
        -6: 'https://login.afreecatv.com/membership/changeMember.php',
        -8: "Hello! AfreecaTV here.\nThe username you have entered belongs to \n an account that requires a legal guardian's consent. \nIf you wish to use our services without restriction, \nplease make sure to go through the necessary verification process.",
        -9: 'https://member.afreecatv.com/app/pop_login_block.php',
        -11: 'https://login.afreecatv.com/afreeca/second_login.php',
        -12: 'https://member.afreecatv.com/app/user_security.php',
        0: 'The username does not exist or you have entered the wrong password.',
        -1: 'The username does not exist or you have entered the wrong password.',
        -3: 'You have entered your username/password incorrectly.',
        -7: 'You cannot use your Global AfreecaTV account to access Korean AfreecaTV.',
        -10: 'Sorry for the inconvenience. \nYour account has been blocked due to an unauthorized access. \nPlease contact our Help Center for assistance.',
        -32008: 'You have failed to log in. Please contact our Help Center.',
    }

    result = int_or_none(response.get('RESULT'))
    if result == 1:
        return
    error = _ERRORS.get(result, 'You have failed to log in.')
    raise ExtractorError(
        'Unable to login: %s said: %s' % (self.IE_NAME, error),
        expected=True)
def _real_extract(self, url):
    """Extract an AfreecaTV VOD page.

    The video info XML is requested up to twice: a ``PARTIAL_ADULT`` or
    ``ADULT`` flag in the first response triggers one retry with the
    matching query parameter set.

    Returns:
        A ``multi_video`` result with one entry per ``./file`` element when
        the ``./track/video`` element has no URL text of its own, otherwise
        a single-video info dict (HLS formats, or an ``mp4:`` play path).

    Raises:
        ExtractorError: if the page reports the video as deleted, if the
            API flag is not ``SUCCEED`` after the retry, or if the XML has
            no usable ``./track/video`` element.
    """
    video_id = self._match_id(url)
    webpage = self._download_webpage(url, video_id)

    if re.search(r'alert\(["\']This video has been deleted', webpage):
        raise ExtractorError(
            'Video %s has been deleted' % video_id, expected=True)

    station_id = self._search_regex(
        r'nStationNo\s*=\s*(\d+)', webpage, 'station')
    bbs_id = self._search_regex(
        r'nBbsNo\s*=\s*(\d+)', webpage, 'bbs')
    video_id = self._search_regex(
        r'nTitleNo\s*=\s*(\d+)', webpage, 'title', default=video_id)

    partial_view = False
    adult_view = False
    for _ in range(2):
        query = {
            'nTitleNo': video_id,
            'nStationNo': station_id,
            'nBbsNo': bbs_id,
        }
        if partial_view:
            query['partialView'] = 'SKIP_ADULT'
        if adult_view:
            query['adultView'] = 'ADULT_VIEW'
        # The original call passed video_id a second time positionally,
        # which landed in the errnote slot; pass a real error note instead.
        video_xml = self._download_xml(
            'http://afbbs.afreecatv.com:8080/api/video/get_video_info.php',
            video_id, 'Downloading video info XML%s'
            % (' (skipping adult)' if partial_view else ''),
            errnote='Unable to download video info XML',
            headers={
                'Referer': url,
            }, query=query)

        flag = xpath_text(video_xml, './track/flag', 'flag', default=None)
        if flag and flag == 'SUCCEED':
            break
        if flag == 'PARTIAL_ADULT':
            self.report_warning(
                'In accordance with local laws and regulations, underage users are restricted from watching adult content. '
                'Only content suitable for all ages will be downloaded. '
                'Provide account credentials if you wish to download restricted content.')
            partial_view = True
            continue
        elif flag == 'ADULT':
            if not adult_view:
                adult_view = True
                continue
            error = 'Only users older than 19 are able to watch this video. Provide account credentials to download this content.'
        else:
            error = flag
        raise ExtractorError(
            '%s said: %s' % (self.IE_NAME, error), expected=True)
    else:
        # Both attempts ended in ``continue`` without ever succeeding.
        raise ExtractorError('Unable to download video info')

    # findall() returns a (possibly empty) list; indexing [-1] on an empty
    # list would raise IndexError before the "does not exist" check runs.
    video_elements = video_xml.findall(compat_xpath('./track/video'))
    video_element = video_elements[-1] if video_elements else None
    if video_element is None or video_element.text is None:
        raise ExtractorError(
            'Video %s does not exist' % video_id, expected=True)

    video_url = video_element.text.strip()

    title = xpath_text(video_xml, './track/title', 'title', fatal=True)

    uploader = xpath_text(video_xml, './track/nickname', 'uploader')
    uploader_id = xpath_text(video_xml, './track/bj_id', 'uploader id')
    duration = int_or_none(xpath_text(
        video_xml, './track/duration', 'duration'))
    thumbnail = xpath_text(video_xml, './track/titleImage', 'thumbnail')

    # Fields shared by the top-level result and every per-file entry.
    common_entry = {
        'uploader': uploader,
        'uploader_id': uploader_id,
        'thumbnail': thumbnail,
    }
    info = common_entry.copy()
    info.update({
        'id': video_id,
        'title': title,
        'duration': duration,
    })

    if not video_url:
        # No URL on the <video> element itself: each <file> child is a part.
        entries = []
        file_elements = video_element.findall(compat_xpath('./file'))
        one = len(file_elements) == 1
        for file_num, file_element in enumerate(file_elements, start=1):
            file_url = url_or_none(file_element.text)
            if not file_url:
                continue
            key = file_element.get('key', '')
            upload_date = unified_strdate(self._search_regex(
                r'^(\d{8})_', key, 'upload date', default=None))
            if upload_date is not None:
                # sometimes the upload date isn't included in the file name
                # instead, another random ID is, which may parse as a valid
                # date but be wildly out of a reasonable range
                parsed_date = date_from_str(upload_date)
                if parsed_date.year < 2000 or parsed_date.year >= 2100:
                    upload_date = None
            file_duration = int_or_none(file_element.get('duration'))
            format_id = key if key else '%s_%s' % (video_id, file_num)
            if determine_ext(file_url) == 'm3u8':
                formats = self._extract_m3u8_formats(
                    file_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id='hls',
                    note='Downloading part %d m3u8 information' % file_num)
            else:
                formats = [{
                    'url': file_url,
                    'format_id': 'http',
                }]
            if not formats and not self.get_param('ignore_no_formats'):
                continue
            self._sort_formats(formats)
            file_info = common_entry.copy()
            file_info.update({
                'id': format_id,
                'title': title if one else '%s (part %d)' % (title, file_num),
                'upload_date': upload_date,
                'duration': file_duration,
                'formats': formats,
            })
            entries.append(file_info)
        entries_info = info.copy()
        entries_info.update({
            '_type': 'multi_video',
            'entries': entries,
        })
        return entries_info

    info = {
        'id': video_id,
        'title': title,
        'uploader': uploader,
        'uploader_id': uploader_id,
        'duration': duration,
        'thumbnail': thumbnail,
    }

    if determine_ext(video_url) == 'm3u8':
        info['formats'] = self._extract_m3u8_formats(
            video_url, video_id, 'mp4', entry_protocol='m3u8_native',
            m3u8_id='hls')
    else:
        # Everything before 'mp4:' is used as the URL, the rest as the
        # play path.
        app, playpath = video_url.split('mp4:')
        info.update({
            'url': app,
            'ext': 'flv',
            'play_path': 'mp4:' + playpath,
            'rtmp_live': True,  # downloading won't end without this
        })

    return info
class AfreecaTVLiveIE(AfreecaTVIE):
    """Extractor for live broadcasts on play.afreecatv.com / play.afreeca.com."""

    IE_NAME = 'afreecatv:live'
    _VALID_URL = r'https?://play\.afreeca(?:tv)?\.com/(?P<id>[^/]+)(?:/(?P<bno>\d+))?'
    _TESTS = [{
        'url': 'https://play.afreecatv.com/pyh3646/237852185',
        'info_dict': {
            'id': '237852185',
            'ext': 'mp4',
            'title': '【 우루과이 오늘은 무슨일이? 】',
            'uploader': '박진우[JINU]',
            'uploader_id': 'pyh3646',
            'timestamp': 1640661495,
            'is_live': True,
        },
        'skip': 'Livestream has ended',
    }, {
        'url': 'http://play.afreeca.com/pyh3646/237852185',
        'only_matching': True,
    }, {
        'url': 'http://play.afreeca.com/pyh3646',
        'only_matching': True,
    }]

    _LIVE_API_URL = 'https://live.afreecatv.com/afreeca/player_live_api.php'

    # Requested one by one; the tuple order is also the quality ranking.
    _QUALITIES = ('sd', 'hd', 'hd2k', 'original')

    def _real_extract(self, url):
        """Return the info dict for a live broadcast.

        The broadcaster id and broadcast number come from the URL but are
        overridden by the live API's ``CHANNEL`` data when present.

        Raises:
            ExtractorError: if no broadcast number can be determined.
        """
        broadcaster_id, broadcast_no = self._match_valid_url(url).group('id', 'bno')

        live_info = self._download_json(
            self._LIVE_API_URL, broadcaster_id, fatal=False,
            data=urlencode_postdata({'bid': broadcaster_id})) or {}
        channel_info = live_info.get('CHANNEL') or {}
        broadcaster_id = channel_info.get('BJID') or broadcaster_id
        broadcast_no = channel_info.get('BNO') or broadcast_no
        if not broadcast_no:
            raise ExtractorError(f'Unable to extract broadcast number ({broadcaster_id} may not be live)', expected=True)

        rank_quality = qualities(self._QUALITIES)
        stream_base_url = channel_info.get('RMD') or 'https://livestream-manager.afreecatv.com'

        formats = []
        for quality_name in self._QUALITIES:
            # Each quality needs its own access token ("aid").
            token_response = self._download_json(
                self._LIVE_API_URL, broadcast_no, fatal=False,
                data=urlencode_postdata({
                    'bno': broadcast_no,
                    'stream_type': 'common',
                    'type': 'aid',
                    'quality': quality_name,
                }),
                note=f'Downloading access token for {quality_name} stream',
                errnote=f'Unable to download access token for {quality_name} stream')
            aid = traverse_obj(token_response, ('CHANNEL', 'AID'))
            if not aid:
                continue

            stream_info = self._download_json(
                f'{stream_base_url}/broad_stream_assign.html', broadcast_no, fatal=False,
                query={
                    'return_type': channel_info.get('CDN', 'gcp_cdn'),
                    'broad_key': f'{broadcast_no}-common-{quality_name}-hls',
                },
                note=f'Downloading metadata for {quality_name} stream',
                errnote=f'Unable to download metadata for {quality_name} stream') or {}
            view_url = stream_info.get('view_url')
            if not view_url:
                continue

            formats.append({
                'format_id': quality_name,
                'url': update_url_query(view_url, {'aid': aid}),
                'ext': 'mp4',
                'protocol': 'm3u8',
                'quality': rank_quality(quality_name),
            })

        self._sort_formats(formats)

        # Station metadata is only a fallback for title and uploader, and
        # the source of the broadcast start time.
        station_info = self._download_json(
            'https://st.afreecatv.com/api/get_station_status.php', broadcast_no,
            query={'szBjId': broadcaster_id}, fatal=False,
            note='Downloading channel metadata', errnote='Unable to download channel metadata') or {}

        return {
            'id': broadcast_no,
            'title': channel_info.get('TITLE') or station_info.get('station_title'),
            'uploader': channel_info.get('BJNICK') or station_info.get('station_name'),
            'uploader_id': broadcaster_id,
            'timestamp': unified_timestamp(station_info.get('broad_start')),
            'formats': formats,
            'is_live': True,
        }

View File

@@ -0,0 +1,66 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
int_or_none,
parse_duration,
parse_iso8601,
)
class AirMozillaIE(InfoExtractor):
    """Extractor for air.mozilla.org pages that embed a vid.ly player."""

    _VALID_URL = r'https?://air\.mozilla\.org/(?P<id>[0-9a-z-]+)/?'
    _TEST = {
        'url': 'https://air.mozilla.org/privacy-lab-a-meetup-for-privacy-minded-people-in-san-francisco/',
        'md5': '8d02f53ee39cf006009180e21df1f3ba',
        'info_dict': {
            'id': '6x4q2w',
            'ext': 'mp4',
            'title': 'Privacy Lab - a meetup for privacy minded people in San Francisco',
            'thumbnail': r're:https?://.*/poster\.jpg',
            'description': 'Brings together privacy professionals and others interested in privacy at for-profits, non-profits, and NGOs in an effort to contribute to the state of the ecosystem...',
            'timestamp': 1422487800,
            'upload_date': '20150128',
            'location': 'SFO Commons',
            'duration': 3780,
            'view_count': int,
            'categories': ['Main', 'Privacy'],
        }
    }

    def _real_extract(self, url):
        """Build the info dict from the vid.ly embed config plus page metadata."""
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)
        video_id = self._html_search_regex(r'//vid\.ly/(.*?)/embed', webpage, 'id')

        # The embed page hands a JSON argument to initCallback(); its
        # 'config' member is the JW Player configuration.
        embed_page = self._download_webpage('https://vid.ly/{0}/embed'.format(video_id), video_id)
        callback_json = self._search_regex(
            r'initCallback\((.*)\);', embed_page, 'metadata')
        player_config = self._parse_json(callback_json, video_id)['config']

        info_dict = self._parse_jwplayer_data(player_config, video_id)

        view_count = int_or_none(self._html_search_regex(
            r'Views since archived: ([0-9]+)',
            webpage, 'view count', fatal=False))
        timestamp = parse_iso8601(self._html_search_regex(
            r'<time datetime="(.*?)"', webpage, 'timestamp', fatal=False))
        duration = parse_duration(self._search_regex(
            r'Duration:\s*(\d+\s*hours?\s*\d+\s*minutes?)',
            webpage, 'duration', fatal=False))

        page_fields = {
            'id': video_id,
            'title': self._og_search_title(webpage),
            'url': self._og_search_url(webpage),
            'display_id': display_id,
            'description': self._og_search_description(webpage),
            'timestamp': timestamp,
            'location': self._html_search_regex(r'Location: (.*)', webpage, 'location', default=None),
            'duration': duration,
            'view_count': view_count,
            'categories': re.findall(r'<a href=".*?" class="channel">(.*?)</a>', webpage),
        }
        info_dict.update(page_fields)

        return info_dict

View File

@@ -0,0 +1,53 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
float_or_none,
try_get,
)
class AliExpressLiveIE(InfoExtractor):
    """Extractor for live.aliexpress.com stream pages."""

    _VALID_URL = r'https?://live\.aliexpress\.com/live/(?P<id>\d+)'
    _TEST = {
        'url': 'https://live.aliexpress.com/live/2800002704436634',
        'md5': 'e729e25d47c5e557f2630eaf99b740a5',
        'info_dict': {
            'id': '2800002704436634',
            'ext': 'mp4',
            'title': 'CASIMA7.22',
            'thumbnail': r're:http://.*\.jpg',
            'uploader': 'CASIMA Official Store',
            'timestamp': 1500717600,
            'upload_date': '20170722',
        },
    }

    def _real_extract(self, url):
        """Read the page's ``runParams`` object and build the info dict from it."""
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        run_params = self._search_regex(
            r'(?s)runParams\s*=\s*({.+?})\s*;?\s*var',
            webpage, 'runParams')
        data = self._parse_json(run_params, video_id)

        # 'title' and 'replyStreamUrl' are required keys; the rest are optional.
        title = data['title']
        formats = self._extract_m3u8_formats(
            data['replyStreamUrl'], video_id, 'mp4',
            entry_protocol='m3u8_native', m3u8_id='hls')

        uploader = try_get(
            data, lambda x: x['followBar']['name'], compat_str)
        timestamp = float_or_none(data.get('startTimeLong'), scale=1000)

        return {
            'id': video_id,
            'title': title,
            'thumbnail': data.get('coverUrl'),
            'uploader': uploader,
            'timestamp': timestamp,
            'formats': formats,
        }

View File

@@ -0,0 +1,86 @@
# coding: utf-8
from __future__ import unicode_literals
import json
from .common import InfoExtractor
from ..utils import (
try_get,
)
class AlJazeeraIE(InfoExtractor):
    """Extractor for Al Jazeera article pages; hands off to Brightcove or Generic."""

    _VALID_URL = r'https?://(?P<base>\w+\.aljazeera\.\w+)/(?P<type>programs?/[^/]+|(?:feature|video|new)s)?/\d{4}/\d{1,2}/\d{1,2}/(?P<id>[^/?&#]+)'

    _TESTS = [{
        'url': 'https://balkans.aljazeera.net/videos/2021/11/6/pojedini-domovi-u-sarajevu-jos-pod-vodom-mjestanima-se-dostavlja-hrana',
        'info_dict': {
            'id': '6280641530001',
            'ext': 'mp4',
            'title': 'Pojedini domovi u Sarajevu još pod vodom, mještanima se dostavlja hrana',
            'timestamp': 1636219149,
            'description': 'U sarajevskim naseljima Rajlovac i Reljevo stambeni objekti, ali i industrijska postrojenja i dalje su pod vodom.',
            'upload_date': '20211106',
        }
    }, {
        'url': 'https://balkans.aljazeera.net/videos/2021/11/6/djokovic-usao-u-finale-mastersa-u-parizu',
        'info_dict': {
            'id': '6280654936001',
            'ext': 'mp4',
            'title': 'Đoković ušao u finale Mastersa u Parizu',
            'timestamp': 1636221686,
            'description': 'Novak Đoković je u polufinalu Mastersa u Parizu nakon preokreta pobijedio Poljaka Huberta Hurkacza.',
            'upload_date': '20211106',
        },
    }]
    BRIGHTCOVE_URL_RE = r'https?://players.brightcove.net/(?P<account>\d+)/(?P<player_id>[a-zA-Z0-9]+)_(?P<embed>[^/]+)/index.html\?videoId=(?P<id>\d+)'

    def _real_extract(self, url):
        """Resolve the article to a Brightcove player URL.

        The site's GraphQL endpoint is queried first. If it yields no video
        id, the page HTML is searched for a Brightcove player URL. If that
        also fails, the URL is delegated to the Generic extractor.
        """
        base, post_type, slug = self._match_valid_url(url).group('base', 'type', 'id')

        # Value for the 'wp-site' query parameter and header, per host.
        site_codes = {
            'balkans.aljazeera.net': 'ajb',
            'chinese.aljazeera.net': 'chinese',
            'mubasher.aljazeera.net': 'ajm',
        }
        wp_site = site_codes.get(base) or 'aje'

        # URL section -> 'postType' value sent to the API.
        post_types = {
            'features': 'post',
            'program': 'episode',
            'programs': 'episode',
            'videos': 'video',
            'news': 'news',
        }
        post_type = post_types[post_type.split('/')[0]]

        article = self._download_json(
            f'https://{base}/graphql', slug, query={
                'wp-site': wp_site,
                'operationName': 'ArchipelagoSingleArticleQuery',
                'variables': json.dumps({
                    'name': slug,
                    'postType': post_type,
                }),
            }, headers={
                'wp-site': wp_site,
            })
        video = try_get(article, lambda x: x['data']['article']['video']) or {}

        video_id = video.get('id')
        account = video.get('accountId') or '911432371001'
        player_id = video.get('playerId') or 'csvTfAlKW'
        embed = 'default'

        if video_id is None:
            # Fall back to a Brightcove player URL in the page itself.
            webpage = self._download_webpage(url, slug)
            account, player_id, embed, video_id = self._search_regex(
                self.BRIGHTCOVE_URL_RE, webpage, 'video id',
                group=(1, 2, 3, 4), default=(None, None, None, None))

        if video_id is None:
            return {
                '_type': 'url_transparent',
                'url': url,
                'ie_key': 'Generic'
            }

        return {
            '_type': 'url_transparent',
            'url': f'https://players.brightcove.net/{account}/{player_id}_{embed}/index.html?videoId={video_id}',
            'ie_key': 'BrightcoveNew'
        }

View File

@@ -0,0 +1,132 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
int_or_none,
qualities,
remove_end,
try_get,
unified_timestamp,
url_basename,
)
class AllocineIE(InfoExtractor):
    """Extractor for allocine.fr article, film and video pages."""

    _VALID_URL = r'https?://(?:www\.)?allocine\.fr/(?:article|video|film)/(?:fichearticle_gen_carticle=|player_gen_cmedia=|fichefilm_gen_cfilm=|video-)(?P<id>[0-9]+)(?:\.html)?'

    _TESTS = [{
        'url': 'http://www.allocine.fr/article/fichearticle_gen_carticle=18635087.html',
        'md5': '0c9fcf59a841f65635fa300ac43d8269',
        'info_dict': {
            'id': '19546517',
            'display_id': '18635087',
            'ext': 'mp4',
            'title': 'Astérix - Le Domaine des Dieux Teaser VF',
            'description': 'md5:4a754271d9c6f16c72629a8a993ee884',
            'thumbnail': r're:http://.*\.jpg',
            'duration': 39,
            'timestamp': 1404273600,
            'upload_date': '20140702',
            'view_count': int,
        },
    }, {
        'url': 'http://www.allocine.fr/video/player_gen_cmedia=19540403&cfilm=222257.html',
        'md5': 'd0cdce5d2b9522ce279fdfec07ff16e0',
        'info_dict': {
            'id': '19540403',
            'display_id': '19540403',
            'ext': 'mp4',
            'title': 'Planes 2 Bande-annonce VF',
            'description': 'Regardez la bande annonce du film Planes 2 (Planes 2 Bande-annonce VF). Planes 2, un film de Roberts Gannaway',
            'thumbnail': r're:http://.*\.jpg',
            'duration': 69,
            'timestamp': 1385659800,
            'upload_date': '20131128',
            'view_count': int,
        },
    }, {
        'url': 'http://www.allocine.fr/video/player_gen_cmedia=19544709&cfilm=181290.html',
        'md5': '101250fb127ef9ca3d73186ff22a47ce',
        'info_dict': {
            'id': '19544709',
            'display_id': '19544709',
            'ext': 'mp4',
            'title': 'Dragons 2 - Bande annonce finale VF',
            'description': 'md5:6cdd2d7c2687d4c6aafe80a35e17267a',
            'thumbnail': r're:http://.*\.jpg',
            'duration': 144,
            'timestamp': 1397589900,
            'upload_date': '20140415',
            'view_count': int,
        },
    }, {
        'url': 'http://www.allocine.fr/video/video-19550147/',
        'md5': '3566c0668c0235e2d224fd8edb389f67',
        'info_dict': {
            'id': '19550147',
            'ext': 'mp4',
            'title': 'Faux Raccord N°123 - Les gaffes de Cliffhanger',
            'description': 'md5:bc734b83ffa2d8a12188d9eb48bb6354',
            'thumbnail': r're:http://.*\.jpg',
        },
    }]

    def _real_extract(self, url):
        """Extract formats from the page's ``data-model`` JSON or the legacy web service.

        When the page carries a ``data-model`` attribute, its first video is
        used. Otherwise the ``AcVisiondataV5`` endpoint is queried with the
        id taken from the URL.
        """
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)

        formats = []
        rank = qualities(['ld', 'md', 'hd'])

        model = self._html_search_regex(
            r'data-model="([^"]+)"', webpage, 'data model', default=None)
        if not model:
            video_id = display_id
            media_data = self._download_json(
                'http://www.allocine.fr/ws/AcVisiondataV5.ashx?media=%s' % video_id, display_id)
            page_title = self._html_search_regex(
                r'(?s)<title>(.+?)</title>', webpage, 'title').strip()
            title = remove_end(page_title, ' - AlloCiné')
            # Keys ending in "Path" hold one URL per format; the prefix is
            # the format id.
            for key, value in media_data['video'].items():
                if key.endswith('Path'):
                    format_id = key[:-len('Path')]
                    formats.append({
                        'format_id': format_id,
                        'quality': rank(format_id),
                        'url': value,
                    })
            duration = view_count = timestamp = None
        else:
            model_data = self._parse_json(model, display_id)
            video = model_data['videos'][0]
            title = video['title']
            for video_url in video['sources'].values():
                # The basename's first two '_'-separated parts are used as
                # the video id and the format id.
                video_id, format_id = url_basename(video_url).split('_')[:2]
                formats.append({
                    'format_id': format_id,
                    'quality': rank(format_id),
                    'url': video_url,
                })
            duration = int_or_none(video.get('duration'))
            view_count = int_or_none(video.get('view_count'))
            added_at = try_get(
                video, lambda x: x['added_at']['date'], compat_str)
            timestamp = unified_timestamp(added_at)

        self._sort_formats(formats)

        return {
            'id': video_id,
            'display_id': display_id,
            'title': title,
            'description': self._og_search_description(webpage),
            'thumbnail': self._og_search_thumbnail(webpage),
            'duration': duration,
            'timestamp': timestamp,
            'view_count': view_count,
            'formats': formats,
        }

View File

@@ -0,0 +1,77 @@
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
parse_iso8601,
parse_duration,
parse_filesize,
int_or_none,
)
class AlphaPornoIE(InfoExtractor):
    """Extractor for alphaporno.com video pages."""

    _VALID_URL = r'https?://(?:www\.)?alphaporno\.com/videos/(?P<id>[^/]+)'
    _TEST = {
        'url': 'http://www.alphaporno.com/videos/sensual-striptease-porn-with-samantha-alexandra/',
        'md5': 'feb6d3bba8848cd54467a87ad34bd38e',
        'info_dict': {
            'id': '258807',
            'display_id': 'sensual-striptease-porn-with-samantha-alexandra',
            'ext': 'mp4',
            'title': 'Sensual striptease porn with Samantha Alexandra',
            'thumbnail': r're:https?://.*\.jpg$',
            'timestamp': 1418694611,
            'upload_date': '20141216',
            'duration': 387,
            'filesize_approx': 54120000,
            'tbr': 1145,
            'categories': list,
            'age_limit': 18,
        }
    }

    def _real_extract(self, url):
        """Collect the video URL and metadata from inline script values and meta tags."""
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)

        def meta(name, label, **kwargs):
            # Shorthand for a meta-tag lookup on this page.
            return self._html_search_meta(name, webpage, label, **kwargs)

        video_id = self._search_regex(
            r"video_id\s*:\s*'([^']+)'", webpage, 'video id', default=None)
        video_url = self._search_regex(
            r"video_url\s*:\s*'([^']+)'", webpage, 'video url')

        # The first character of the meta value is dropped (the default
        # '.mp4' becomes 'mp4').
        ext = meta('encodingFormat', 'ext', default='.mp4')[1:]

        title_patterns = [
            r'<meta content="([^"]+)" itemprop="description">',
            r'class="title" itemprop="name">([^<]+)<',
        ]
        title = self._search_regex(title_patterns, webpage, 'title')

        thumbnail = meta('thumbnail', 'thumbnail')
        timestamp = parse_iso8601(meta('uploadDate', 'upload date'))
        duration = parse_duration(meta('duration', 'duration'))
        filesize_approx = parse_filesize(meta('contentSize', 'file size'))
        bitrate = int_or_none(meta('bitrate', 'bitrate'))
        keywords = meta('keywords', 'categories', default='')
        categories = keywords.split(',')

        age_limit = self._rta_search(webpage)

        return {
            'id': video_id,
            'display_id': display_id,
            'url': video_url,
            'ext': ext,
            'title': title,
            'thumbnail': thumbnail,
            'timestamp': timestamp,
            'duration': duration,
            'filesize_approx': filesize_approx,
            'tbr': bitrate,
            'categories': categories,
            'age_limit': age_limit,
        }

View File

@@ -0,0 +1,179 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import (
compat_urlparse,
)
from ..utils import (
urlencode_postdata,
urljoin,
int_or_none,
clean_html,
ExtractorError
)
class AluraIE(InfoExtractor):
    """Extractor for a single video task of an Alura course."""

    _VALID_URL = r'https?://(?:cursos\.)?alura\.com\.br/course/(?P<course_name>[^/]+)/task/(?P<id>\d+)'
    _LOGIN_URL = 'https://cursos.alura.com.br/loginForm?urlAfterLogin=/loginForm'
    _VIDEO_URL = 'https://cursos.alura.com.br/course/%s/task/%s/video'
    _NETRC_MACHINE = 'alura'
    _TESTS = [{
        'url': 'https://cursos.alura.com.br/course/clojure-mutabilidade-com-atoms-e-refs/task/60095',
        'info_dict': {
            'id': '60095',
            'ext': 'mp4',
            'title': 'Referências, ref-set e alter'
        },
        'skip': 'Requires alura account credentials'
    }, {
        # URL without video
        'url': 'https://cursos.alura.com.br/course/clojure-mutabilidade-com-atoms-e-refs/task/60098',
        'only_matching': True
    }, {
        'url': 'https://cursos.alura.com.br/course/fundamentos-market-digital/task/55219',
        'only_matching': True
    }]

    def _real_extract(self, url):
        """Extract the HLS formats and title of one course task.

        Returns:
            An info dict with ``id``, ``title`` and ``formats``, or ``None``
            when the task's video endpoint returns no video objects.
        """
        # A match object cannot be unpacked directly; take the two named
        # groups explicitly.
        course, video_id = self._match_valid_url(url).group('course_name', 'id')
        video_url = self._VIDEO_URL % (course, video_id)

        video_dict = self._download_json(video_url, video_id, 'Searching for videos')
        if not video_dict:
            # Tasks without a video yield nothing (same as the previous
            # implicit return).
            return None

        webpage = self._download_webpage(url, video_id)
        video_title = clean_html(self._search_regex(
            r'<span[^>]+class=(["\'])task-body-header-title-text\1[^>]*>(?P<title>[^<]+)',
            webpage, 'title', group='title'))

        formats = []
        for video_obj in video_dict:
            video_url_m3u8 = video_obj.get('link')
            video_format = self._extract_m3u8_formats(
                video_url_m3u8, None, 'mp4', entry_protocol='m3u8_native',
                m3u8_id='hls', fatal=False)
            for f in video_format:
                # When the manifest gave no height, derive one from the
                # "-<res>.mp4" marker in the URL: 'hd' -> 720, else 480.
                m = re.search(r'^[\w \W]*-(?P<res>\w*).mp4[\W \w]*', f['url'])
                if m and not f.get('height'):
                    f['height'] = 720 if m.group('res') == 'hd' else 480
            formats.extend(video_format)

        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': video_title,
            "formats": formats
        }

    def _real_initialize(self):
        """Initialise the extractor by delegating to ``_login``."""
        self._login()

    def _login(self):
        """Log in through the Alura login form when credentials are available.

        Does nothing if no username is configured or if the login page
        already shows a signed-in session.

        Raises:
            ExtractorError: if the response to the login POST does not look
                signed in; the site's alert message is included when found.
        """
        username, password = self._get_login_info()
        if username is None:
            return

        login_page = self._download_webpage(
            self._LOGIN_URL, None, 'Downloading login popup')

        def is_logged(webpage):
            # A sign-out link or a "Logout" label is taken as proof of an
            # active session.
            return any(re.search(p, webpage) for p in (
                r'href=[\"|\']?/signout[\"|\']',
                r'>Logout<'))

        # already logged in
        if is_logged(login_page):
            return

        login_form = self._hidden_inputs(login_page)

        login_form.update({
            'username': username,
            'password': password,
        })

        post_url = self._search_regex(
            r'<form[^>]+class=["|\']signin-form["|\'] action=["|\'](?P<url>.+?)["|\']', login_page,
            'post url', default=self._LOGIN_URL, group='url')

        if not post_url.startswith('http'):
            post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url)

        response = self._download_webpage(
            post_url, None, 'Logging in',
            data=urlencode_postdata(login_form),
            headers={'Content-Type': 'application/x-www-form-urlencoded'})

        if not is_logged(response):
            error = self._html_search_regex(
                r'(?s)<p[^>]+class="alert-message[^"]*">(.+?)</p>',
                response, 'error message', default=None)
            if error:
                raise ExtractorError('Unable to login: %s' % error, expected=True)
            raise ExtractorError('Unable to log in')
class AluraCourseIE(AluraIE):
    """Playlist extractor for a whole Alura course."""

    _VALID_URL = r'https?://(?:cursos\.)?alura\.com\.br/course/(?P<id>[^/]+)'
    _LOGIN_URL = 'https://cursos.alura.com.br/loginForm?urlAfterLogin=/loginForm'
    _NETRC_MACHINE = 'aluracourse'
    _TESTS = [{
        'url': 'https://cursos.alura.com.br/course/clojure-mutabilidade-com-atoms-e-refs',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Decline URLs that AluraIE accepts; this pattern is a prefix of AluraIE's."""
        return False if AluraIE.suitable(url) else super(AluraCourseIE, cls).suitable(url)

    def _real_extract(self, url):
        """Build a playlist with one ``url_transparent`` entry per video task.

        The course page is scanned for section links, and each section page
        is scanned for video task links.
        """
        course_path = self._match_id(url)
        webpage = self._download_webpage(url, course_path)

        course_title = self._search_regex(
            r'<h1.*?>(.*?)<strong>(?P<course_title>.*?)</strong></h[0-9]>', webpage,
            'course title', default=course_path, group='course_title')

        entries = []
        if webpage:
            for path in re.findall(r'<a\b(?=[^>]* class="[^"]*(?<=[" ])courseSectionList-section[" ])(?=[^>]* href="([^"]*))', webpage):
                page_url = urljoin(url, path)
                section_path = self._download_webpage(page_url, course_path)
                for path_video in re.findall(r'<a\b(?=[^>]* class="[^"]*(?<=[" ])task-menu-nav-item-link-VIDEO[" ])(?=[^>]* href="([^"]*))', section_path):
                    chapter = clean_html(
                        self._search_regex(
                            r'<h3[^>]+class=(["\'])task-menu-section-title-text\1[^>]*>(?P<chapter>[^<]+)',
                            section_path,
                            'chapter',
                            group='chapter'))

                    chapter_number = int_or_none(
                        self._search_regex(
                            r'<span[^>]+class=(["\'])task-menu-section-title-number[^>]*>(.*?)<strong>(?P<chapter_number>[^<]+)</strong>',
                            section_path,
                            'chapter number',
                            group='chapter_number'))
                    video_url = urljoin(url, path_video)

                    entry = {
                        '_type': 'url_transparent',
                        'id': self._match_id(video_url),
                        'url': video_url,
                        # Was the misspelt key 'id_key' with this class's
                        # own key; the entry is a video task, so name the
                        # video extractor under the proper 'ie_key'.
                        'ie_key': AluraIE.ie_key(),
                        'chapter': chapter,
                        'chapter_number': chapter_number
                    }
                    entries.append(entry)
        return self.playlist_result(entries, course_path, course_title)

View File

@@ -0,0 +1,103 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from .youtube import YoutubeIE
from .vimeo import VimeoIE
from ..utils import (
int_or_none,
parse_iso8601,
update_url_query,
)
class AmaraIE(InfoExtractor):
    """Extractor for amara.org video pages, with their published subtitles."""

    _VALID_URL = r'https?://(?:www\.)?amara\.org/(?:\w+/)?videos/(?P<id>\w+)'
    _TESTS = [{
        # Youtube
        'url': 'https://amara.org/en/videos/jVx79ZKGK1ky/info/why-jury-trials-are-becoming-less-common/?tab=video',
        'md5': 'ea10daf2b6154b8c1ecf9922aca5e8ae',
        'info_dict': {
            'id': 'h6ZuVdvYnfE',
            'ext': 'mp4',
            'title': 'Why jury trials are becoming less common',
            'description': 'md5:a61811c319943960b6ab1c23e0cbc2c1',
            'thumbnail': r're:^https?://.*\.jpg$',
            'subtitles': dict,
            'upload_date': '20160813',
            'uploader': 'PBS NewsHour',
            'uploader_id': 'PBSNewsHour',
            'timestamp': 1549639570,
        }
    }, {
        # Vimeo
        'url': 'https://amara.org/en/videos/kYkK1VUTWW5I/info/vimeo-at-ces-2011',
        'md5': '99392c75fa05d432a8f11df03612195e',
        'info_dict': {
            'id': '18622084',
            'ext': 'mov',
            'title': 'Vimeo at CES 2011!',
            'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
            'thumbnail': r're:^https?://.*\.jpg$',
            'subtitles': dict,
            'timestamp': 1294763658,
            'upload_date': '20110111',
            'uploader': 'Sam Morrill',
            'uploader_id': 'sammorrill'
        }
    }, {
        # Direct Link
        'url': 'https://amara.org/en/videos/s8KL7I3jLmh6/info/the-danger-of-a-single-story/',
        'md5': 'd3970f08512738ee60c5807311ff5d3f',
        'info_dict': {
            'id': 's8KL7I3jLmh6',
            'ext': 'mp4',
            'title': 'The danger of a single story',
            'description': 'md5:d769b31139c3b8bb5be9177f62ea3f23',
            'thumbnail': r're:^https?://.*\.jpg$',
            'subtitles': dict,
            'upload_date': '20091007',
            'timestamp': 1254942511,
        }
    }]

    def _real_extract(self, url):
        """Fetch video metadata from the Amara API and attach its subtitles.

        When the first entry of ``all_urls`` is a YouTube or Vimeo URL, the
        result is a ``url_transparent`` hand-off to that extractor;
        otherwise the URL is returned as a direct link.
        """
        video_id = self._match_id(url)
        meta = self._download_json(
            'https://amara.org/api/videos/%s/' % video_id,
            video_id, query={'format': 'json'})

        title = meta['title']
        video_url = meta['all_urls'][0]

        subtitles = {}
        for language in (meta.get('languages') or []):
            subtitles_uri = language.get('subtitles_uri')
            # Only languages with a subtitles URI and a truthy 'published'
            # flag are included.
            if not subtitles_uri or not language.get('published'):
                continue
            tracks = subtitles.setdefault(language.get('code') or 'en', [])
            tracks.extend({
                'ext': sub_format,
                'url': update_url_query(subtitles_uri, {'format': sub_format}),
            } for sub_format in ('json', 'srt', 'vtt'))

        info = {
            'url': video_url,
            'id': video_id,
            'subtitles': subtitles,
            'title': title,
            'description': meta.get('description'),
            'thumbnail': meta.get('thumbnail'),
            'duration': int_or_none(meta.get('duration')),
            'timestamp': parse_iso8601(meta.get('created')),
        }

        # First hosting extractor that accepts the URL, if any.
        hosting_ie = next(
            (ie for ie in (YoutubeIE, VimeoIE) if ie.suitable(video_url)),
            None)
        if hosting_ie is not None:
            info.update({
                '_type': 'url_transparent',
                'ie_key': hosting_ie.ie_key(),
            })

        return info

View File

@@ -0,0 +1,53 @@
# coding: utf-8
from .common import InfoExtractor
from ..utils import int_or_none
class AmazonStoreIE(InfoExtractor):
    """Playlist extractor for the videos listed on an Amazon product page."""

    _VALID_URL = r'https?://(?:www\.)?amazon\.(?:[a-z]{2,3})(?:\.[a-z]{2})?/(?:[^/]+/)?(?:dp|gp/product)/(?P<id>[^/&#$?]+)'

    _TESTS = [{
        'url': 'https://www.amazon.co.uk/dp/B098XNCHLD/',
        'info_dict': {
            'id': 'B098XNCHLD',
            'title': 'md5:5f3194dbf75a8dcfc83079bd63a2abed',
        },
        'playlist_mincount': 1,
        'playlist': [{
            'info_dict': {
                'id': 'A1F83G8C2ARO7P',
                'ext': 'mp4',
                'title': 'mcdodo usb c cable 100W 5a',
                'thumbnail': r're:^https?://.*\.jpg$',
            },
        }]
    }, {
        'url': 'https://www.amazon.in/Sony-WH-1000XM4-Cancelling-Headphones-Bluetooth/dp/B0863TXGM3',
        'info_dict': {
            'id': 'B0863TXGM3',
            'title': 'md5:b0bde4881d3cfd40d63af19f7898b8ff',
        },
        'playlist_mincount': 4,
    }, {
        'url': 'https://www.amazon.com/dp/B0845NXCXF/',
        'info_dict': {
            'id': 'B0845NXCXF',
            'title': 'md5:2145cd4e3c7782f1ee73649a3cff1171',
        },
        # Was spelt 'playlist-mincount', unlike the two cases above.
        'playlist_mincount': 1,
    }]

    def _real_extract(self, url):
        """Parse the JSON passed to ``jQuery.parseJSON`` in the page into a playlist.

        Only items flagged ``isVideo`` that carry a ``url`` become entries.
        """
        product_id = self._match_id(url)
        webpage = self._download_webpage(url, product_id)
        data_json = self._parse_json(
            self._html_search_regex(
                r'var\s?obj\s?=\s?jQuery\.parseJSON\(\'(.*)\'\)', webpage, 'data'),
            product_id)
        entries = [{
            'id': video['marketPlaceID'],
            'url': video['url'],
            'title': video.get('title'),
            'thumbnail': video.get('thumbUrl') or video.get('thumb'),
            'duration': video.get('durationSeconds'),
            'height': int_or_none(video.get('videoHeight')),
            'width': int_or_none(video.get('videoWidth')),
        } for video in (data_json.get('videos') or []) if video.get('isVideo') and video.get('url')]
        return self.playlist_result(entries, playlist_id=product_id, playlist_title=data_json['title'])

View File

@@ -0,0 +1,150 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .theplatform import ThePlatformIE
from ..utils import (
int_or_none,
parse_age_limit,
try_get,
update_url_query,
)
class AMCNetworksIE(ThePlatformIE):
    """Extractor for the AMC Networks sites matched by ``_VALID_URL``.

    Page data is fetched from the AMCN content-delivery gateway and the
    media itself is resolved through thePlatform helpers of the base class.
    """

    _VALID_URL = r'https?://(?:www\.)?(?P<site>amc|bbcamerica|ifc|(?:we|sundance)tv)\.com/(?P<id>(?:movies|shows(?:/[^/]+)+)/[^/?#&]+)'
    _TESTS = [{
        'url': 'https://www.bbcamerica.com/shows/the-graham-norton-show/videos/tina-feys-adorable-airline-themed-family-dinner--51631',
        'info_dict': {
            'id': '4Lq1dzOnZGt0',
            'ext': 'mp4',
            'title': "The Graham Norton Show - Season 28 - Tina Fey's Adorable Airline-Themed Family Dinner",
            'description': "It turns out child stewardesses are very generous with the wine! All-new episodes of 'The Graham Norton Show' premiere Fridays at 11/10c on BBC America.",
            'upload_date': '20201120',
            'timestamp': 1605904350,
            'uploader': 'AMCN',
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }, {
        'url': 'http://www.bbcamerica.com/shows/the-hunt/full-episodes/season-1/episode-01-the-hardest-challenge',
        'only_matching': True,
    }, {
        'url': 'http://www.amc.com/shows/preacher/full-episodes/season-01/episode-00/pilot',
        'only_matching': True,
    }, {
        'url': 'http://www.wetv.com/shows/million-dollar-matchmaker/season-01/episode-06-the-dumped-dj-and-shallow-hal',
        'only_matching': True,
    }, {
        'url': 'http://www.ifc.com/movies/chaos',
        'only_matching': True,
    }, {
        'url': 'http://www.bbcamerica.com/shows/doctor-who/full-episodes/the-power-of-the-daleks/episode-01-episode-1-color-version',
        'only_matching': True,
    }, {
        'url': 'http://www.wetv.com/shows/mama-june-from-not-to-hot/full-episode/season-01/thin-tervention',
        'only_matching': True,
    }, {
        'url': 'http://www.wetv.com/shows/la-hair/videos/season-05/episode-09-episode-9-2/episode-9-sneak-peek-3',
        'only_matching': True,
    }, {
        'url': 'https://www.sundancetv.com/shows/riviera/full-episodes/season-1/episode-01-episode-1',
        'only_matching': True,
    }]

    # Maps the site part of the URL to the requestor id used for the
    # gateway request and for MVPD authentication.
    _REQUESTOR_ID_MAP = {
        'amc': 'AMC',
        'bbcamerica': 'BBCA',
        'ifc': 'IFC',
        'sundancetv': 'SUNDANCE',
        'wetv': 'WETV',
    }

    def _real_extract(self, url):
        """Build the info dict for a single AMC Networks page URL."""
        site, display_id = self._match_valid_url(url).groups()
        requestor_id = self._REQUESTOR_ID_MAP[site]
        api_url = (
            'https://content-delivery-gw.svc.ds.amcn.com/api/v2/content/amcn/%s/url/%s'
            % (requestor_id.lower(), display_id))
        page_data = self._download_json(api_url, display_id)['data']
        properties = page_data.get('properties') or {}
        query = {
            'mbr': 'true',
            'manifest': 'm3u',
        }

        # Each 'video-player' child overwrites the path, so the last complete
        # one wins; a missing key anywhere simply stops the scan.
        player_total = 0
        try:
            for child in page_data['children']:
                if child.get('type') != 'video-player':
                    continue
                release_pid = child['properties']['currentVideo']['meta']['releasePid']
                tp_path = 'M_UwQC/' + release_pid
                media_url = 'https://link.theplatform.com/s/' + tp_path
                player_total += 1
        except KeyError:
            pass

        if player_total > 1:
            self.report_warning(
                'The JSON data has %d video players. Only one will be extracted' % player_total)

        # Fall back to videoPid if releasePid not found.
        # TODO: Fall back to videoPid if releasePid manifest uses DRM.
        if player_total == 0:
            tp_path = 'M_UwQC/media/' + properties['videoPid']
            media_url = 'https://link.theplatform.com/s/' + tp_path

        theplatform_metadata = self._download_theplatform_metadata(tp_path, display_id)
        info = self._parse_theplatform_metadata(theplatform_metadata)
        video_id = theplatform_metadata['pid']
        title = theplatform_metadata['title']
        rating = try_get(
            theplatform_metadata, lambda meta: meta['ratings'][0]['rating'])

        # Categories ending in '-Auth' need an MVPD token in the query.
        video_category = properties.get('videoCategory')
        if video_category and video_category.endswith('-Auth'):
            resource = self._get_mvpd_resource(
                requestor_id, title, video_id, rating)
            query['auth'] = self._extract_mvpd_auth(
                url, video_id, requestor_id, resource)

        media_url = update_url_query(media_url, query)
        formats, subtitles = self._extract_theplatform_smil(
            media_url, video_id)
        self._sort_formats(formats)

        candidate_urls = [properties.get('imageDesktop')]
        if 'thumbnail' in info:
            candidate_urls.append(info.pop('thumbnail'))
        thumbnails = []
        for candidate in filter(None, candidate_urls):
            # Dimensions are taken from a 'WxH' fragment of the URL, if any.
            size = re.search(r'(\d+)x(\d+)', candidate)
            if size:
                width, height = int(size.group(1)), int(size.group(2))
            else:
                width = height = None
            thumbnails.append({
                'url': candidate,
                'width': width,
                'height': height,
            })

        info.update({
            'age_limit': parse_age_limit(rating),
            'formats': formats,
            'id': video_id,
            'subtitles': subtitles,
            'thumbnails': thumbnails,
        })

        namespaces = theplatform_metadata.get('$xmlns', {}).keys()
        if namespaces:
            # Custom fields are prefixed with the first declared namespace.
            ns = list(namespaces)[0]
            info.update({
                'episode': theplatform_metadata.get(ns + '$episodeTitle') or None,
                'episode_number': int_or_none(
                    theplatform_metadata.get(ns + '$episode')),
                'season_number': int_or_none(
                    theplatform_metadata.get(ns + '$season')),
                'series': theplatform_metadata.get(ns + '$show') or None,
            })
        return info

View File

@@ -0,0 +1,158 @@
# coding: utf-8
from __future__ import unicode_literals
import json
from .common import InfoExtractor
from ..utils import (
clean_html,
int_or_none,
try_get,
unified_strdate,
unified_timestamp,
)
class AmericasTestKitchenIE(InfoExtractor):
    """Extractor for single episodes/videos of America's Test Kitchen sites.

    Metadata comes from the site's v6 API; playback is delegated to the
    Zype extractor through a ``url_transparent`` result.
    """

    _VALID_URL = r'https?://(?:www\.)?(?:americastestkitchen|cooks(?:country|illustrated))\.com/(?P<resource_type>episode|videos)/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://www.americastestkitchen.com/episode/582-weeknight-japanese-suppers',
        'md5': 'b861c3e365ac38ad319cfd509c30577f',
        'info_dict': {
            'id': '5b400b9ee338f922cb06450c',
            'title': 'Japanese Suppers',
            'ext': 'mp4',
            'description': 'md5:64e606bfee910627efc4b5f050de92b3',
            'thumbnail': r're:^https?://',
            'timestamp': 1523318400,
            'upload_date': '20180410',
            'release_date': '20180410',
            'series': "America's Test Kitchen",
            'season_number': 18,
            'episode': 'Japanese Suppers',
            'episode_number': 15,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # Metadata parsing behaves differently for newer episodes (705) as opposed to older episodes (582 above)
        'url': 'https://www.americastestkitchen.com/episode/705-simple-chicken-dinner',
        'md5': '06451608c57651e985a498e69cec17e5',
        'info_dict': {
            'id': '5fbe8c61bda2010001c6763b',
            'title': 'Simple Chicken Dinner',
            'ext': 'mp4',
            'description': 'md5:eb68737cc2fd4c26ca7db30139d109e7',
            'thumbnail': r're:^https?://',
            'timestamp': 1610755200,
            'upload_date': '20210116',
            'release_date': '20210116',
            'series': "America's Test Kitchen",
            'season_number': 21,
            'episode': 'Simple Chicken Dinner',
            'episode_number': 3,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://www.americastestkitchen.com/videos/3420-pan-seared-salmon',
        'only_matching': True,
    }, {
        'url': 'https://www.cookscountry.com/episode/564-when-only-chocolate-will-do',
        'only_matching': True,
    }, {
        'url': 'https://www.cooksillustrated.com/videos/4478-beef-wellington',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return a url_transparent result pointing at the Zype embed."""
        resource_type, video_id = self._match_valid_url(url).groups()
        is_episode = resource_type == 'episode'
        # The API path uses the plural 'episodes' but keeps 'videos' as is.
        api_section = 'episodes' if is_episode else resource_type

        resource = self._download_json(
            'https://www.americastestkitchen.com/api/v6/%s/%s' % (api_section, video_id), video_id)

        # Episode responses nest the video; video responses may nest an episode.
        if is_episode:
            video, episode = resource['video'], resource
        else:
            video, episode = resource, resource.get('episode') or {}

        publish_date = video.get('publishDate')
        return {
            '_type': 'url_transparent',
            'url': 'https://player.zype.com/embed/%s.js?api_key=jZ9GUhRmxcPvX7M3SlfejB6Hle9jyHTdk2jVxG7wOHPLODgncEKVdPYBhuz9iWXQ' % video['zypeId'],
            'ie_key': 'Zype',
            'description': clean_html(video.get('description')),
            'timestamp': unified_timestamp(publish_date),
            'release_date': unified_strdate(publish_date),
            'episode_number': int_or_none(episode.get('number')),
            'season_number': int_or_none(episode.get('season')),
            'series': try_get(episode, lambda ep: ep['show']['title']),
            'episode': episode.get('title'),
        }
class AmericasTestKitchenSeasonIE(InfoExtractor):
    """Playlist extractor for a whole season of ATK / Cook's Country.

    Episodes are looked up through the site's Algolia search index and
    handed over to ``AmericasTestKitchenIE`` one by one.
    """

    _VALID_URL = r'https?://(?:www\.)?(?P<show>americastestkitchen|cookscountry)\.com/episodes/browse/season_(?P<id>\d+)'
    _TESTS = [{
        # ATK Season
        'url': 'https://www.americastestkitchen.com/episodes/browse/season_1',
        'info_dict': {
            'id': 'season_1',
            'title': 'Season 1',
        },
        'playlist_count': 13,
    }, {
        # Cooks Country Season
        'url': 'https://www.cookscountry.com/episodes/browse/season_12',
        'info_dict': {
            'id': 'season_12',
            'title': 'Season 12',
        },
        'playlist_count': 13,
    }]

    def _real_extract(self, url):
        """Query the search index for one season and return it as a playlist."""
        show_name, season_number = self._match_valid_url(url).groups()
        season_number = int(season_number)

        # Short show code used in the index name and in field names.
        if show_name == 'americastestkitchen':
            slug = 'atk'
        else:
            slug = 'cco'

        season = 'Season %d' % season_number
        facet_filters = json.dumps([
            'search_season_list:' + season,
            'search_document_klass:episode',
            'search_show_slug:' + slug,
        ])
        season_search = self._download_json(
            'https://y1fnzxui30-dsn.algolia.net/1/indexes/everest_search_%s_season_desc_production' % slug,
            season, headers={
                'Origin': 'https://www.%s.com' % show_name,
                'X-Algolia-API-Key': '8d504d0099ed27c1b73708d22871d805',
                'X-Algolia-Application-Id': 'Y1FNZXUI30',
            }, query={
                'facetFilters': facet_filters,
                'attributesToRetrieve': 'description,search_%s_episode_number,search_document_date,search_url,title' % slug,
                'attributesToHighlight': '',
                'hitsPerPage': 1000,
            })

        def entries():
            # Hits without a site-relative URL cannot be delegated; skip them.
            for hit in (season_search.get('hits') or []):
                relative_url = hit.get('search_url')
                if not relative_url:
                    continue
                yield {
                    '_type': 'url',
                    'url': 'https://www.%s.com%s' % (show_name, relative_url),
                    'id': try_get(hit, lambda e: e['objectID'].split('_')[-1]),
                    'title': hit.get('title'),
                    'description': hit.get('description'),
                    'timestamp': unified_timestamp(hit.get('search_document_date')),
                    'season_number': season_number,
                    'episode_number': int_or_none(hit.get('search_%s_episode_number' % slug)),
                    'ie_key': AmericasTestKitchenIE.ie_key(),
                }

        return self.playlist_result(
            entries(), 'season_%d' % season_number, season)

View File

@@ -0,0 +1,103 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
determine_ext,
ExtractorError,
int_or_none,
mimetype2ext,
parse_iso8601,
unified_timestamp,
url_or_none,
)
class AMPIE(InfoExtractor):
    # parse Akamai Adaptive Media Player feed
    def _extract_feed_info(self, url):
        """Download an Akamai AMP JSON feed and turn it into an info dict.

        Returns a dict with id, title, description, thumbnails, timestamp,
        duration, subtitles and formats.

        Raises ExtractorError when the feed carries no item, quoting the
        feed's own 'error' value when it provides one.
        """
        feed = self._download_json(
            url, None, 'Downloading Akamai AMP feed',
            'Unable to download Akamai AMP feed')
        # 'channel' may be absent or null; both mean there is no item.
        item = (feed.get('channel') or {}).get('item')
        if not item:
            error = feed.get('error')
            if error:
                raise ExtractorError('%s said: %s' % (self.IE_NAME, error))
            # No item and no error text: report that instead of a KeyError.
            raise ExtractorError('No item found in Akamai AMP feed')

        video_id = item['guid']

        def get_media_node(name, default=None):
            # Look in the media group first, then on the item itself, first
            # under the 'media-' prefixed name and finally under the bare name.
            media_name = 'media-%s' % name
            media_group = item.get('media-group') or item
            return media_group.get(media_name) or item.get(media_name) or item.get(name, default)

        thumbnails = []
        media_thumbnail = get_media_node('thumbnail')
        if media_thumbnail:
            # A single node is delivered as a dict rather than a list.
            if isinstance(media_thumbnail, dict):
                media_thumbnail = [media_thumbnail]
            for thumbnail_data in media_thumbnail:
                thumbnail = thumbnail_data.get('@attributes', {})
                thumbnail_url = url_or_none(thumbnail.get('url'))
                if not thumbnail_url:
                    continue
                thumbnails.append({
                    'url': self._proto_relative_url(thumbnail_url, 'http:'),
                    'width': int_or_none(thumbnail.get('width')),
                    'height': int_or_none(thumbnail.get('height')),
                })

        subtitles = {}
        media_subtitle = get_media_node('subTitle')
        if media_subtitle:
            if isinstance(media_subtitle, dict):
                media_subtitle = [media_subtitle]
            for subtitle_data in media_subtitle:
                subtitle = subtitle_data.get('@attributes', {})
                subtitle_href = url_or_none(subtitle.get('href'))
                if not subtitle_href:
                    continue
                subtitles.setdefault(subtitle.get('lang') or 'en', []).append({
                    'url': subtitle_href,
                    'ext': mimetype2ext(subtitle.get('type')) or determine_ext(subtitle_href),
                })

        formats = []
        # A missing content node becomes an empty list so that the loop and
        # the duration lookup below do not fail on None.
        media_content = get_media_node('content') or []
        if isinstance(media_content, dict):
            media_content = [media_content]
        for media_data in media_content:
            media = media_data.get('@attributes', {})
            media_url = url_or_none(media.get('url'))
            if not media_url:
                continue
            ext = mimetype2ext(media.get('type')) or determine_ext(media_url)
            if ext == 'f4m':
                formats.extend(self._extract_f4m_formats(
                    media_url + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124',
                    video_id, f4m_id='hds', fatal=False))
            elif ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    media_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
            else:
                formats.append({
                    'format_id': media_data.get('media-category', {}).get('@attributes', {}).get('label'),
                    'url': media_url,
                    'tbr': int_or_none(media.get('bitrate')),
                    'filesize': int_or_none(media.get('fileSize')),
                    'ext': ext,
                })

        self._sort_formats(formats)

        timestamp = unified_timestamp(item.get('pubDate'), ' ') or parse_iso8601(item.get('dc-date'))

        # Duration is read from the first content node, when there is one.
        first_media = media_content[0] if media_content else {}

        return {
            'id': video_id,
            'title': get_media_node('title'),
            'description': get_media_node('description'),
            'thumbnails': thumbnails,
            'timestamp': timestamp,
            'duration': int_or_none(first_media.get('@attributes', {}).get('duration')),
            'subtitles': subtitles,
            'formats': formats,
        }

View File

@@ -0,0 +1,285 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
ExtractorError,
urlencode_postdata,
int_or_none,
str_or_none,
determine_ext,
)
from ..compat import compat_HTTPError
class AnimeLabBaseIE(InfoExtractor):
    """Shared login handling for the AnimeLab extractors."""

    _LOGIN_REQUIRED = True
    _LOGIN_URL = 'https://www.animelab.com/login'
    _NETRC_MACHINE = 'animelab'

    def _login(self):
        """Log in with the configured credentials unless already logged in.

        Raises ExtractorError when the credentials are rejected (HTTP 400)
        or when the response still looks like a logged-out page.
        """
        def is_logged_in(login_webpage):
            # A page without the 'Sign In' text is treated as logged in.
            return 'Sign In' not in login_webpage

        landing_page = self._download_webpage(
            self._LOGIN_URL, None, 'Downloading login page')

        # Check if already logged in
        if is_logged_in(landing_page):
            return

        username, password = self._get_login_info()
        if username is None and self._LOGIN_REQUIRED:
            self.raise_login_required('Login is required to access any AnimeLab content')

        post_data = urlencode_postdata({
            'email': username,
            'password': password,
        })
        try:
            response = self._download_webpage(
                self._LOGIN_URL, None, 'Logging in', 'Wrong login info',
                data=post_data,
                headers={'Content-Type': 'application/x-www-form-urlencoded'})
        except ExtractorError as e:
            # Only a 400 is reported as a credentials problem.
            if not (isinstance(e.cause, compat_HTTPError) and e.cause.code == 400):
                raise
            raise ExtractorError('Unable to log in (wrong credentials?)', expected=True)

        # if login was successful
        if not is_logged_in(response):
            raise ExtractorError('Unable to login (cannot verify if logged in)')

    def _real_initialize(self):
        self._login()
class AnimeLabIE(AnimeLabBaseIE):
    """Extractor for a single AnimeLab player page."""

    _VALID_URL = r'https?://(?:www\.)?animelab\.com/player/(?P<id>[^/]+)'

    # the following tests require authentication, but a free account will suffice
    # just set 'usenetrc' to true in test/local_parameters.json if you use a .netrc file
    # or you can set 'username' and 'password' there
    # the tests also select a specific format so that the same video is downloaded
    # regardless of whether the user is premium or not (needs testing on a premium account)
    _TEST = {
        'url': 'https://www.animelab.com/player/fullmetal-alchemist-brotherhood-episode-42',
        'md5': '05bde4b91a5d1ff46ef5b94df05b0f7f',
        'info_dict': {
            'id': '383',
            'ext': 'mp4',
            'display_id': 'fullmetal-alchemist-brotherhood-episode-42',
            'title': 'Fullmetal Alchemist: Brotherhood - Episode 42 - Signs of a Counteroffensive',
            'description': 'md5:103eb61dd0a56d3dfc5dbf748e5e83f4',
            'series': 'Fullmetal Alchemist: Brotherhood',
            'episode': 'Signs of a Counteroffensive',
            'episode_number': 42,
            'duration': 1469,
            'season': 'Season 1',
            'season_number': 1,
            'season_id': '38',
        },
        'params': {
            'format': '[format_id=21711_yeshardsubbed_ja-JP][height=480]',
        },
        'skip': 'All AnimeLab content requires authentication',
    }

    def _real_extract(self, url):
        """Collect metadata and formats from the subtitled and dubbed player pages."""
        display_id = self._match_id(url)

        # unfortunately we can get different URLs for the same formats
        # e.g. if we are using a "free" account so no dubs available
        # (so _remove_duplicate_formats is not effective)
        # so we use a dictionary as a workaround
        formats = {}
        page_templates = (
            'https://www.animelab.com/player/%s/subtitles',
            'https://www.animelab.com/player/%s/dubbed',
        )
        for page_template in page_templates:
            actual_url = page_template % display_id
            webpage = self._download_webpage(actual_url, display_id, 'Downloading URL ' + actual_url)

            collection_json = self._search_regex(
                r'new\s+?AnimeLabApp\.VideoCollection\s*?\((.*?)\);', webpage, 'AnimeLab VideoCollection')
            video_collection = self._parse_json(collection_json, display_id)
            position = int_or_none(self._search_regex(
                r'playlistPosition\s*?=\s*?(\d+)', webpage, 'Playlist Position'))

            raw_data = video_collection[position]['videoEntry']
            video_id = str_or_none(raw_data['id'])

            # create a title from many sources (while grabbing other info)
            # TODO use more fallback sources to get some of these
            series = raw_data.get('showTitle')
            video_type = raw_data.get('videoEntryType', {}).get('name')
            episode_number = raw_data.get('episodeNumber')
            episode_name = raw_data.get('name')

            title_parts = (series, video_type, episode_number, episode_name)
            if None in title_parts:
                title = episode_name
            else:
                title = '%s - %s %s - %s' % title_parts

            description = raw_data.get('synopsis') or self._og_search_description(webpage, default=None)
            duration = int_or_none(raw_data.get('duration'))

            thumbnails = []
            for image in raw_data.get('images', []):
                for instance in image['imageInstances']:
                    image_info = instance.get('imageInfo', {})
                    thumbnails.append({
                        'id': str_or_none(image_info.get('id')),
                        'url': image_info.get('fullPath'),
                        'width': image_info.get('width'),
                        'height': image_info.get('height'),
                    })

            season_data = raw_data.get('season', {}) or {}
            season = str_or_none(season_data.get('name'))
            season_number = int_or_none(season_data.get('seasonNumber'))
            season_id = str_or_none(season_data.get('id'))

            for video_data in raw_data['videoList']:
                language = video_data.get('language', {}).get('languageCode')
                is_hardsubbed = video_data.get('hardSubbed')

                for video_instance in video_data['videoInstances']:
                    # Prefer the HTTP URL and fall back to RTMP.
                    stream_url = video_instance.get('httpUrl') or video_instance.get('rtmpUrl')
                    if stream_url is None:
                        # this video format is unavailable to the user (not premium etc.)
                        continue

                    # Format id: instance id, hardsub marker (if known), language.
                    id_parts = [str_or_none(video_instance.get('id'))]
                    if is_hardsubbed is not None:
                        id_parts.append('yeshardsubbed' if is_hardsubbed else 'nothardsubbed')
                    id_parts.append(language)
                    format_id = '_'.join([part for part in id_parts if part is not None])

                    ext = determine_ext(stream_url)
                    if ext in ('m3u8', 'mpd'):
                        if ext == 'm3u8':
                            manifest_formats = self._extract_m3u8_formats(
                                stream_url, video_id, m3u8_id=format_id, fatal=False)
                        else:
                            manifest_formats = self._extract_mpd_formats(
                                stream_url, video_id, mpd_id=format_id, fatal=False)
                        for manifest_format in manifest_formats:
                            formats[manifest_format['format_id']] = manifest_format
                        continue

                    current_format = {
                        'language': language,
                        'url': stream_url,
                    }

                    quality_data = video_instance.get('videoQuality')
                    quality = None
                    if quality_data:
                        quality = quality_data.get('name') or quality_data.get('description')

                    height = None
                    if quality:
                        height = int_or_none(self._search_regex(
                            r'(\d+)p?$', quality, 'Video format height', default=None))

                    if height is not None:
                        current_format['height'] = height
                    else:
                        self.report_warning('Could not get height of video')

                    current_format['format_id'] = format_id
                    formats[format_id] = current_format

        formats = list(formats.values())
        self._sort_formats(formats)

        return {
            'id': video_id,
            'display_id': display_id,
            'title': title,
            'description': description,
            'series': series,
            'episode': episode_name,
            'episode_number': int_or_none(episode_number),
            'thumbnails': thumbnails,
            'duration': duration,
            'formats': formats,
            'season': season,
            'season_number': season_number,
            'season_id': season_id,
        }
class AnimeLabShowsIE(AnimeLabBaseIE):
    """Playlist extractor for an AnimeLab show page."""

    _VALID_URL = r'https?://(?:www\.)?animelab\.com/shows/(?P<id>[^/]+)'
    _TEST = {
        'url': 'https://www.animelab.com/shows/attack-on-titan',
        'info_dict': {
            'id': '45',
            'title': 'Attack on Titan',
            'description': 'md5:989d95a2677e9309368d5cf39ba91469',
        },
        'playlist_count': 59,
        'skip': 'All AnimeLab content requires authentication',
    }

    def _real_extract(self, url):
        """Return a playlist with one AnimeLab player entry per listed video."""
        base_url = 'http://www.animelab.com'
        shows_api_path = '/api/videoentries/show/videos/'
        display_id = self._match_id(url)

        webpage = self._download_webpage(url, display_id, 'Downloading requested URL')

        show_data = self._parse_json(
            self._search_regex(r'({"id":.*}),\svideoEntry', webpage, 'AnimeLab show data'),
            display_id)

        show_id = str_or_none(show_data.get('id'))
        title = show_data.get('name')
        description = show_data.get('shortSynopsis') or show_data.get('longSynopsis')

        entries = []
        for season in show_data['seasons']:
            season_id = season['id']
            # despite using urlencode_postdata, we are sending a GET request
            query_string = urlencode_postdata({
                'seasonId': season_id,
                'limit': 1000,
            }).decode('utf-8')
            target_url = base_url + shows_api_path + show_id + "?" + query_string

            response = self._download_webpage(
                target_url,
                None, 'Season id %s' % season_id)
            season_data = self._parse_json(response, display_id)

            entries.extend(
                self.url_result(
                    base_url + '/player/' + video_data['slug'], 'AnimeLab',
                    str_or_none(video_data.get('id')), video_data.get('name'))
                for video_data in season_data['list'])

        return {
            '_type': 'playlist',
            'id': show_id,
            'title': title,
            'description': description,
            'entries': entries,
        }
# TODO implement myqueue

View File

@@ -0,0 +1,291 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
determine_ext,
extract_attributes,
ExtractorError,
join_nonempty,
url_or_none,
urlencode_postdata,
urljoin,
)
class AnimeOnDemandIE(InfoExtractor):
    """Playlist extractor for anime-on-demand.de anime pages."""

    _VALID_URL = r'https?://(?:www\.)?anime-on-demand\.de/anime/(?P<id>\d+)'
    _LOGIN_URL = 'https://www.anime-on-demand.de/users/sign_in'
    _APPLY_HTML5_URL = 'https://www.anime-on-demand.de/html5apply'
    _NETRC_MACHINE = 'animeondemand'
    # German-speaking countries of Europe
    _GEO_COUNTRIES = ['AT', 'CH', 'DE', 'LI', 'LU']
    _TESTS = [{
        # jap, OmU
        'url': 'https://www.anime-on-demand.de/anime/161',
        'info_dict': {
            'id': '161',
            'title': 'Grimgar, Ashes and Illusions (OmU)',
            'description': 'md5:6681ce3c07c7189d255ac6ab23812d31',
        },
        'playlist_mincount': 4,
    }, {
        # Film wording is used instead of Episode, ger/jap, Dub/OmU
        'url': 'https://www.anime-on-demand.de/anime/39',
        'only_matching': True,
    }, {
        # Episodes without titles, jap, OmU
        'url': 'https://www.anime-on-demand.de/anime/162',
        'only_matching': True,
    }, {
        # ger/jap, Dub/OmU, account required
        'url': 'https://www.anime-on-demand.de/anime/169',
        'only_matching': True,
    }, {
        # Full length film, non-series, ger/jap, Dub/OmU, account required
        'url': 'https://www.anime-on-demand.de/anime/185',
        'only_matching': True,
    }, {
        # Flash videos
        'url': 'https://www.anime-on-demand.de/anime/12',
        'only_matching': True,
    }]

    def _login(self):
        """Log in when credentials are configured; do nothing otherwise.

        Raises ExtractorError when the response shows no sign of a
        logged-in session, quoting the site's alert text if one is found.
        """
        username, password = self._get_login_info()
        if username is None:
            return

        login_page = self._download_webpage(
            self._LOGIN_URL, None, 'Downloading login page')

        if '>Our licensing terms allow the distribution of animes only to German-speaking countries of Europe' in login_page:
            self.raise_geo_restricted(
                '%s is only available in German-speaking countries of Europe' % self.IE_NAME)

        login_form = self._form_hidden_inputs('new_user', login_page)

        login_form.update({
            'user[login]': username,
            'user[password]': password,
        })

        post_url = self._search_regex(
            r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page,
            'post url', default=self._LOGIN_URL, group='url')

        # The form action may be relative to the login page.
        if not post_url.startswith('http'):
            post_url = urljoin(self._LOGIN_URL, post_url)

        response = self._download_webpage(
            post_url, None, 'Logging in',
            data=urlencode_postdata(login_form), headers={
                'Referer': self._LOGIN_URL,
            })

        # Neither logout marker present means the login did not succeed.
        if all(p not in response for p in ('>Logout<', 'href="/users/sign_out"')):
            error = self._search_regex(
                r'<p[^>]+\bclass=(["\'])(?:(?!\1).)*\balert\b(?:(?!\1).)*\1[^>]*>(?P<error>.+?)</p>',
                response, 'error', default=None, group='error')
            if error:
                raise ExtractorError('Unable to login: %s' % error, expected=True)
            raise ExtractorError('Unable to log in')

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Return a playlist of the episodes (or the film) on an anime page."""
        anime_id = self._match_id(url)

        webpage = self._download_webpage(url, anime_id)

        # Without playlist attributes the page is requested again after
        # hitting the HTML5 opt-in URL.
        if 'data-playlist=' not in webpage:
            self._download_webpage(
                self._APPLY_HTML5_URL, anime_id,
                'Activating HTML5 beta', 'Unable to apply HTML5 beta')
            webpage = self._download_webpage(url, anime_id)

        csrf_token = self._html_search_meta(
            'csrf-token', webpage, 'csrf token', fatal=True)

        anime_title = self._html_search_regex(
            r'(?s)<h1[^>]+itemprop="name"[^>]*>(.+?)</h1>',
            webpage, 'anime name')
        anime_description = self._html_search_regex(
            r'(?s)<div[^>]+itemprop="description"[^>]*>(.+?)</div>',
            webpage, 'anime description', default=None)

        def extract_info(html, video_id, num=None):
            """Collect title, description and formats from one HTML fragment."""
            title, description = [None] * 2
            formats = []

            for input_ in re.findall(
                    r'<input[^>]+class=["\'].*?streamstarter[^>]+>', html):
                attributes = extract_attributes(input_)
                title = attributes.get('data-dialog-header')
                playlist_urls = []
                for playlist_key in ('data-playlist', 'data-otherplaylist', 'data-stream'):
                    playlist_url = attributes.get(playlist_key)
                    if isinstance(playlist_url, compat_str) and re.match(
                            r'/?[\da-zA-Z]+', playlist_url):
                        playlist_urls.append(attributes[playlist_key])
                if not playlist_urls:
                    continue

                lang = attributes.get('data-lang')
                lang_note = attributes.get('value')

                for playlist_url in playlist_urls:
                    kind = self._search_regex(
                        r'videomaterialurl/\d+/([^/]+)/',
                        playlist_url, 'media kind', default=None)
                    # This id only labels the download note below. Fall back
                    # to the item number when there is one, never to 'None'.
                    format_id = join_nonempty(lang, kind)
                    if not format_id and num is not None:
                        format_id = str(num)
                    format_note = join_nonempty(kind, lang_note, delim=', ')
                    item_id_list = []
                    if format_id:
                        item_id_list.append(format_id)
                    item_id_list.append('videomaterial')
                    playlist = self._download_json(
                        urljoin(url, playlist_url), video_id,
                        'Downloading %s JSON' % ' '.join(item_id_list),
                        headers={
                            'X-Requested-With': 'XMLHttpRequest',
                            'X-CSRF-Token': csrf_token,
                            'Referer': url,
                            'Accept': 'application/json, text/javascript, */*; q=0.01',
                        }, fatal=False)
                    if not playlist:
                        continue
                    stream_url = url_or_none(playlist.get('streamurl'))
                    if stream_url:
                        rtmp = re.search(
                            r'^(?P<url>rtmpe?://(?P<host>[^/]+)/(?P<app>.+/))(?P<playpath>mp[34]:.+)',
                            stream_url)
                        if rtmp:
                            formats.append({
                                'url': rtmp.group('url'),
                                'app': rtmp.group('app'),
                                'play_path': rtmp.group('playpath'),
                                'page_url': url,
                                'player_url': 'https://www.anime-on-demand.de/assets/jwplayer.flash-55abfb34080700304d49125ce9ffb4a6.swf',
                                'rtmp_real_time': True,
                                'format_id': 'rtmp',
                                'ext': 'flv',
                            })
                            continue
                    start_video = playlist.get('startvideo', 0)
                    playlist = playlist.get('playlist')
                    if not playlist or not isinstance(playlist, list):
                        continue
                    playlist = playlist[start_video]
                    title = playlist.get('title')
                    if not title:
                        continue
                    description = playlist.get('description')
                    for source in playlist.get('sources', []):
                        file_ = source.get('file')
                        if not file_:
                            continue
                        ext = determine_ext(file_)
                        # Only HLS sources are extracted; DASH and any other
                        # source type are skipped.
                        if ext != 'm3u8':
                            continue
                        format_id = join_nonempty(
                            lang, kind,
                            'hls' if ext == 'm3u8' else None,
                            'dash' if source.get('type') == 'video/dash' or ext == 'mpd' else None)
                        file_formats = self._extract_m3u8_formats(
                            file_, video_id, 'mp4',
                            entry_protocol='m3u8_native', m3u8_id=format_id, fatal=False)
                        for f in file_formats:
                            f.update({
                                'language': lang,
                                'format_note': format_note,
                            })
                        formats.extend(file_formats)

            return {
                'title': title,
                'description': description,
                'formats': formats,
            }

        def extract_entries(html, video_id, common_info, num=None):
            """Yield the full entry, or a teaser/trailer entry as a fallback."""
            info = extract_info(html, video_id, num)

            if info['formats']:
                self._sort_formats(info['formats'])
                f = common_info.copy()
                f.update(info)
                yield f

            # Extract teaser/trailer only when full episode is not available
            if not info['formats']:
                m = re.search(
                    r'data-dialog-header=(["\'])(?P<title>.+?)\1[^>]+href=(["\'])(?P<href>.+?)\3[^>]*>(?P<kind>Teaser|Trailer)<',
                    html)
                if m:
                    f = common_info.copy()
                    f.update({
                        'id': '%s-%s' % (f['id'], m.group('kind').lower()),
                        'title': m.group('title'),
                        'url': urljoin(url, m.group('href')),
                    })
                    yield f

        def extract_episodes(html):
            """Yield entries for every episode box found in the page."""
            for num, episode_html in enumerate(re.findall(
                    r'(?s)<h3[^>]+class="episodebox-title".+?>Episodeninhalt<', html), 1):
                episodebox_title = self._search_regex(
                    (r'class="episodebox-title"[^>]+title=(["\'])(?P<title>.+?)\1',
                     r'class="episodebox-title"[^>]+>(?P<title>.+?)<'),
                    episode_html, 'episodebox title', default=None, group='title')
                if not episodebox_title:
                    continue

                # Use the position on the page when the title has no number.
                episode_number = int(self._search_regex(
                    r'(?:Episode|Film)\s*(\d+)',
                    episodebox_title, 'episode number', default=num))
                episode_title = self._search_regex(
                    r'(?:Episode|Film)\s*\d+\s*-\s*(.+)',
                    episodebox_title, 'episode title', default=None)

                video_id = 'episode-%d' % episode_number

                common_info = {
                    'id': video_id,
                    'series': anime_title,
                    'episode': episode_title,
                    'episode_number': episode_number,
                }

                for e in extract_entries(episode_html, video_id, common_info):
                    yield e

        def extract_film(html, video_id):
            """Yield entries for a page treated as a single film."""
            common_info = {
                'id': anime_id,
                'title': anime_title,
                'description': anime_description,
            }
            for e in extract_entries(html, video_id, common_info):
                yield e

        def entries():
            # Treat the page as a film only when it yields no episode entries.
            has_episodes = False
            for e in extract_episodes(webpage):
                has_episodes = True
                yield e

            if not has_episodes:
                for e in extract_film(webpage, anime_id):
                    yield e

        return self.playlist_result(
            entries(), anime_id, anime_title, anime_description)

View File

@@ -0,0 +1,399 @@
# coding: utf-8
from __future__ import unicode_literals
import base64
import hashlib
import json
import random
import re
import time
from .common import InfoExtractor
from ..aes import aes_encrypt
from ..compat import compat_str
from ..utils import (
bytes_to_intlist,
determine_ext,
intlist_to_bytes,
int_or_none,
join_nonempty,
strip_jsonp,
unescapeHTML,
unsmuggle_url,
)
# This import causes a ModuleNotFoundError on some systems for unknown reason.
# See issues:
# https://github.com/yt-dlp/yt-dlp/issues/35
# https://github.com/ytdl-org/youtube-dl/issues/27449
# https://github.com/animelover1984/youtube-dl/issues/17
try:
from .anvato_token_generator import NFLTokenGenerator
except ImportError:
NFLTokenGenerator = None
def md5_text(s):
    """Return the hex MD5 digest of *s*, converting it to text first if needed."""
    text = s if isinstance(s, compat_str) else compat_str(s)
    digest = hashlib.md5(text.encode('utf-8'))
    return digest.hexdigest()
class AnvatoIE(InfoExtractor):
_VALID_URL = r'anvato:(?P<access_key_or_mcp>[^:]+):(?P<id>\d+)'
# Copied from anvplayer.min.js
_ANVACK_TABLE = {
'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ',
'nbcu_nbcd_desktop_web_qa_1a6f01bdd0dc45a439043b694c8a031d': 'eSxJUbA2UUKBTXryyQ2d6NuM8oEqaPySvaPzfKNA',
'nbcu_nbcd_desktop_web_acc_eb2ff240a5d4ae9a63d4c297c32716b6c523a129': '89JR3RtUGbvKuuJIiKOMK0SoarLb5MUx8v89RcbP',
'nbcu_nbcd_watchvod_web_prod_e61107507180976724ec8e8319fe24ba5b4b60e1': 'Uc7dFt7MJ9GsBWB5T7iPvLaMSOt8BBxv4hAXk5vv',
'nbcu_nbcd_watchvod_web_qa_42afedba88a36203db5a4c09a5ba29d045302232': 'T12oDYVFP2IaFvxkmYMy5dKxswpLHtGZa4ZAXEi7',
'nbcu_nbcd_watchvod_web_acc_9193214448e2e636b0ffb78abacfd9c4f937c6ca': 'MmobcxUxMedUpohNWwXaOnMjlbiyTOBLL6d46ZpR',
'nbcu_local_monitor_web_acc_f998ad54eaf26acd8ee033eb36f39a7b791c6335': 'QvfIoPYrwsjUCcASiw3AIkVtQob2LtJHfidp9iWg',
'nbcu_cable_monitor_web_acc_a413759603e8bedfcd3c61b14767796e17834077': 'uwVPJLShvJWSs6sWEIuVem7MTF8A4IknMMzIlFto',
'nbcu_nbcd_mcpstage_web_qa_4c43a8f6e95a88dbb40276c0630ba9f693a63a4e': 'PxVYZVwjhgd5TeoPRxL3whssb5OUPnM3zyAzq8GY',
'nbcu_comcast_comcast_web_prod_074080762ad4ce956b26b43fb22abf153443a8c4': 'afnaRZfDyg1Z3WZHdupKfy6xrbAG2MHqe3VfuSwh',
'nbcu_comcast_comcast_web_qa_706103bb93ead3ef70b1de12a0e95e3c4481ade0': 'DcjsVbX9b3uoPlhdriIiovgFQZVxpISZwz0cx1ZK',
'nbcu_comcast_comcastcable_web_prod_669f04817536743563d7331c9293e59fbdbe3d07': '0RwMN2cWy10qhAhOscq3eK7aEe0wqnKt3vJ0WS4D',
'nbcu_comcast_comcastcable_web_qa_3d9d2d66219094127f0f6b09cc3c7bb076e3e1ca': '2r8G9DEya7PCqBceKZgrn2XkXgASjwLMuaFE1Aad',
'hearst_hearst_demo_web_stage_960726dfef3337059a01a78816e43b29ec04dfc7': 'cuZBPXTR6kSdoTCVXwk5KGA8rk3NrgGn4H6e9Dsp',
'anvato_mcpqa_demo_web_stage_18b55e00db5a13faa8d03ae6e41f6f5bcb15b922': 'IOaaLQ8ymqVyem14QuAvE5SndQynTcH5CrLkU2Ih',
'anvato_nextmedia_demo_web_stage_9787d56a02ff6b9f43e9a2b0920d8ca88beb5818': 'Pqu9zVzI1ApiIzbVA3VkGBEQHvdKSUuKpD6s2uaR',
'anvato_scripps_app_web_prod_0837996dbe373629133857ae9eb72e740424d80a': 'du1ccmn7RxzgizwbWU7hyUaGodNlJn7HtXI0WgXW',
'anvato_scripps_app_web_stage_360797e00fe2826be142155c4618cc52fce6c26c': '2PMrQ0BRoqCWl7nzphj0GouIMEh2mZYivAT0S1Su',
'fs2go_fs2go_go_all_prod_21934911ccfafc03a075894ead2260d11e2ddd24': 'RcuHlKikW2IJw6HvVoEkqq2UsuEJlbEl11pWXs4Q',
'fs2go_fs2go_go_web_prod_ead4b0eec7460c1a07783808db21b49cf1f2f9a7': '4K0HTT2u1zkQA2MaGaZmkLa1BthGSBdr7jllrhk5',
'fs2go_fs2go_go_web_stage_407585454a4400355d4391691c67f361': 'ftnc37VKRJBmHfoGGi3kT05bHyeJzilEzhKJCyl3',
'fs2go_fs2go_go_android_stage_44b714db6f8477f29afcba15a41e1d30': 'CtxpPvVpo6AbZGomYUhkKs7juHZwNml9b9J0J2gI',
'anvato_cbslocal_app_web_prod_547f3e49241ef0e5d30c79b2efbca5d92c698f67': 'Pw0XX5KBDsyRnPS0R2JrSrXftsy8Jnz5pAjaYC8s',
'anvato_cbslocal_app_web_stage_547a5f096594cd3e00620c6f825cad1096d28c80': '37OBUhX2uwNyKhhrNzSSNHSRPZpApC3trdqDBpuz',
'fs2go_att_att_web_prod_1042dddd089a05438b6a08f972941176f699ffd8': 'JLcF20JwYvpv6uAGcLWIaV12jKwaL1R8us4b6Zkg',
'fs2go_att_att_web_stage_807c5001955fc114a3331fe027ddc76e': 'gbu1oO1y0JiOFh4SUipt86P288JHpyjSqolrrT1x',
'fs2go_fs2go_tudor_web_prod_a7dd8e5a7cdc830cae55eae6f3e9fee5ee49eb9b': 'ipcp87VCEZXPPe868j3orLqzc03oTy7DXsGkAXXH',
'anvato_mhz_app_web_prod_b808218b30de7fdf60340cbd9831512bc1bf6d37': 'Stlm5Gs6BEhJLRTZHcNquyzxGqr23EuFmE5DCgjX',
'fs2go_charter_charter_web_stage_c2c6e5a68375a1bf00fff213d3ff8f61a835a54c': 'Lz4hbJp1fwL6jlcz4M2PMzghM4jp4aAmybtT5dPc',
'fs2go_charter_charter_web_prod_ebfe3b10f1af215a7321cd3d629e0b81dfa6fa8c': 'vUJsK345A1bVmyYDRhZX0lqFIgVXuqhmuyp1EtPK',
'anvato_epfox_app_web_prod_b3373168e12f423f41504f207000188daf88251b': 'GDKq1ixvX3MoBNdU5IOYmYa2DTUXYOozPjrCJnW7',
'anvato_epfox_app_web_stage_a3c2ce60f8f83ef374a88b68ee73a950f8ab87ce': '2jz2NH4BsXMaDsoJ5qkHMbcczAfIReo2eFYuVC1C',
'fs2go_verizon_verizon_web_stage_08e6df0354a4803f1b1f2428b5a9a382e8dbcd62': 'rKTVapNaAcmnUbGL4ZcuOoY4SE7VmZSQsblPFr7e',
'fs2go_verizon_verizon_web_prod_f909564cb606eff1f731b5e22e0928676732c445': 'qLSUuHerM3u9eNPzaHyUK52obai5MvE4XDJfqYe1',
'fs2go_foxcom_synd_web_stage_f7b9091f00ea25a4fdaaae77fca5b54cdc7e7043': '96VKF2vLd24fFiDfwPFpzM5llFN4TiIGAlodE0Re',
'fs2go_foxcom_synd_web_prod_0f2cdd64d87e4ab6a1d54aada0ff7a7c8387a064': 'agiPjbXEyEZUkbuhcnmVPhe9NNVbDjCFq2xkcx51',
'anvato_own_app_web_stage_1214ade5d28422c4dae9d03c1243aba0563c4dba': 'mzhamNac3swG4WsJAiUTacnGIODi6SWeVWk5D7ho',
'anvato_own_app_web_prod_944e162ed927ec3e9ed13eb68ed2f1008ee7565e': '9TSxh6G2TXOLBoYm9ro3LdNjjvnXpKb8UR8KoIP9',
'anvato_scripps_app_ftv_prod_a10a10468edd5afb16fb48171c03b956176afad1': 'COJ2i2UIPK7xZqIWswxe7FaVBOVgRkP1F6O6qGoH',
'anvato_scripps_app_ftv_stage_77d3ad2bdb021ec37ca2e35eb09acd396a974c9a': 'Q7nnopNLe2PPfGLOTYBqxSaRpl209IhqaEuDZi1F',
'anvato_univision_app_web_stage_551236ef07a0e17718c3995c35586b5ed8cb5031': 'D92PoLS6UitwxDRA191HUGT9OYcOjV6mPMa5wNyo',
'anvato_univision_app_web_prod_039a5c0a6009e637ae8ac906718a79911e0e65e1': '5mVS5u4SQjtw6NGw2uhMbKEIONIiLqRKck5RwQLR',
'nbcu_cnbc_springfield_ios_prod_670207fae43d6e9a94c351688851a2ce': 'M7fqCCIP9lW53oJbHs19OlJlpDrVyc2OL8gNeuTa',
'nbcu_cnbc_springfieldvod_ios_prod_7a5f04b1ceceb0e9c9e2264a44aa236e08e034c2': 'Yia6QbJahW0S7K1I0drksimhZb4UFq92xLBmmMvk',
'anvato_cox_app_web_prod_ce45cda237969f93e7130f50ee8bb6280c1484ab': 'cc0miZexpFtdoqZGvdhfXsLy7FXjRAOgb9V0f5fZ',
'anvato_cox_app_web_stage_c23dbe016a8e9d8c7101d10172b92434f6088bf9': 'yivU3MYHd2eDZcOfmLbINVtqxyecKTOp8OjOuoGJ',
'anvato_chnzero_app_web_stage_b1164d1352b579e792e542fddf13ee34c0eeb46b': 'A76QkXMmVH8lTCfU15xva1mZnSVcqeY4Xb22Kp7m',
'anvato_chnzero_app_web_prod_253d358928dc08ec161eda2389d53707288a730c': 'OA5QI3ZWZZkdtUEDqh28AH8GedsF6FqzJI32596b',
'anvato_discovery_vodpoc_web_stage_9fa7077b5e8af1f8355f65d4fb8d2e0e9d54e2b7': 'q3oT191tTQ5g3JCP67PkjLASI9s16DuWZ6fYmry3',
'anvato_discovery_vodpoc_web_prod_688614983167a1af6cdf6d76343fda10a65223c1': 'qRvRQCTVHd0VVOHsMvvfidyWmlYVrTbjby7WqIuK',
'nbcu_cnbc_springfieldvod_ftv_stage_826040aad1925a46ac5dfb4b3c5143e648c6a30d': 'JQaSb5a8Tz0PT4ti329DNmzDO30TnngTHmvX8Vua',
'nbcu_cnbc_springfield_ftv_stage_826040aad1925a46ac5dfb4b3c5143e648c6a30d': 'JQaSb5a8Tz0PT4ti329DNmzDO30TnngTHmvX8Vua',
'nbcu_nbcd_capture_web_stage_4dd9d585bfb984ebf856dee35db027b2465cc4ae': '0j1Ov4Vopyi2HpBZJYdL2m8ERJVGYh3nNpzPiO8F',
'nbcu_nbcd_watch3_android_prod_7712ca5fcf1c22f19ec1870a9650f9c37db22dcf': '3LN2UB3rPUAMu7ZriWkHky9vpLMXYha8JbSnxBlx',
'nbcu_nbcd_watchvod3_android_prod_0910a3a4692d57c0b5ff4316075bc5d096be45b9': 'mJagcQ2II30vUOAauOXne7ERwbf5S9nlB3IP17lQ',
'anvato_scripps_app_atv_prod_790deda22e16e71e83df58f880cd389908a45d52': 'CB6trI1mpoDIM5o54DNTsji90NDBQPZ4z4RqBNSH',
'nbcu_nbcd_watchv4_android_prod_ff67cef9cb409158c6f8c3533edddadd0b750507': 'j8CHQCUWjlYERj4NFRmUYOND85QNbHViH09UwuKm',
'nbcu_nbcd_watchvodv4_android_prod_a814d781609989dea6a629d50ae4c7ad8cc8e907': 'rkVnUXxdA9rawVLUlDQtMue9Y4Q7lFEaIotcUhjt',
'rvVKpA50qlOPLFxMjrCGf5pdkdQDm7qn': '1J7ZkY5Qz5lMLi93QOH9IveE7EYB3rLl',
'nbcu_dtv_local_web_prod_b266cf49defe255fd4426a97e27c09e513e9f82f': 'HuLnJDqzLa4saCzYMJ79zDRSQpEduw1TzjMNQu2b',
'nbcu_att_local_web_prod_4cef038b2d969a6b7d700a56a599040b6a619f67': 'Q0Em5VDc2KpydUrVwzWRXAwoNBulWUxCq2faK0AV',
'nbcu_dish_local_web_prod_c56dcaf2da2e9157a4266c82a78195f1dd570f6b': 'bC1LWmRz9ayj2AlzizeJ1HuhTfIaJGsDBnZNgoRg',
'nbcu_verizon_local_web_prod_88bebd2ce006d4ed980de8133496f9a74cb9b3e1': 'wzhDKJZpgvUSS1EQvpCQP8Q59qVzcPixqDGJefSk',
'nbcu_charter_local_web_prod_9ad90f7fc4023643bb718f0fe0fd5beea2382a50': 'PyNbxNhEWLzy1ZvWEQelRuIQY88Eub7xbSVRMdfT',
'nbcu_suddenlink_local_web_prod_20fb711725cac224baa1c1cb0b1c324d25e97178': '0Rph41lPXZbb3fqeXtHjjbxfSrNbtZp1Ygq7Jypa',
'nbcu_wow_local_web_prod_652d9ce4f552d9c2e7b5b1ed37b8cb48155174ad': 'qayIBZ70w1dItm2zS42AptXnxW15mkjRrwnBjMPv',
'nbcu_centurylink_local_web_prod_2034402b029bf3e837ad46814d9e4b1d1345ccd5': 'StePcPMkjsX51PcizLdLRMzxMEl5k2FlsMLUNV4k',
'nbcu_atlanticbrd_local_web_prod_8d5f5ecbf7f7b2f5e6d908dd75d90ae3565f682e': 'NtYLb4TFUS0pRs3XTkyO5sbVGYjVf17bVbjaGscI',
'nbcu_nbcd_watchvod_web_dev_08bc05699be47c4f31d5080263a8cfadc16d0f7c': 'hwxi2dgDoSWgfmVVXOYZm14uuvku4QfopstXckhr',
'anvato_nextmedia_app_web_prod_a4fa8c7204aa65e71044b57aaf63711980cfe5a0': 'tQN1oGPYY1nM85rJYePWGcIb92TG0gSqoVpQTWOw',
'anvato_mcp_lin_web_prod_4c36fbfd4d8d8ecae6488656e21ac6d1ac972749': 'GUXNf5ZDX2jFUpu4WT2Go4DJ5nhUCzpnwDRRUx1K',
'anvato_mcp_univision_web_prod_37fe34850c99a3b5cdb71dab10a417dd5cdecafa': 'bLDYF8JqfG42b7bwKEgQiU9E2LTIAtnKzSgYpFUH',
'anvato_mcp_fs2go_web_prod_c7b90a93e171469cdca00a931211a2f556370d0a': 'icgGoYGipQMMSEvhplZX1pwbN69srwKYWksz3xWK',
'anvato_mcp_sps_web_prod_54bdc90dd6ba21710e9f7074338365bba28da336': 'fA2iQdI7RDpynqzQYIpXALVS83NTPr8LLFK4LFsu',
'anvato_mcp_anv_web_prod_791407490f4c1ef2a4bcb21103e0cb1bcb3352b3': 'rMOUZqe9lwcGq2mNgG3EDusm6lKgsUnczoOX3mbg',
'anvato_mcp_gray_web_prod_4c10f067c393ed8fc453d3930f8ab2b159973900': 'rMOUZqe9lwcGq2mNgG3EDusm6lKgsUnczoOX3mbg',
'anvato_mcp_hearst_web_prod_5356c3de0fc7c90a3727b4863ca7fec3a4524a99': 'P3uXJ0fXXditBPCGkfvlnVScpPEfKmc64Zv7ZgbK',
'anvato_mcp_cbs_web_prod_02f26581ff80e5bda7aad28226a8d369037f2cbe': 'mGPvo5ZA5SgjOFAPEPXv7AnOpFUICX8hvFQVz69n',
'anvato_mcp_telemundo_web_prod_c5278d51ad46fda4b6ca3d0ea44a7846a054f582': 'qyT6PXXLjVNCrHaRVj0ugAhalNRS7Ee9BP7LUokD',
'nbcu_nbcd_watchvodv4_web_stage_4108362fba2d4ede21f262fea3c4162cbafd66c7': 'DhaU5lj0W2gEdcSSsnxURq8t7KIWtJfD966crVDk',
'anvato_scripps_app_ios_prod_409c41960c60b308db43c3cc1da79cab9f1c3d93': 'WPxj5GraLTkYCyj3M7RozLqIycjrXOEcDGFMIJPn',
'EZqvRyKBJLrgpClDPDF8I7Xpdp40Vx73': '4OxGd2dEakylntVKjKF0UK9PDPYB6A9W',
'M2v78QkpleXm9hPp9jUXI63x5vA6BogR': 'ka6K32k7ZALmpINkjJUGUo0OE42Md1BQ',
'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6_secure': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ',
'X8POa4zPPaKVZHqmWjuEzfP31b1QM9VN': 'Dn5vOY9ooDw7VSl9qztjZI5o0g08mA0z',
'M2v78QkBMpNJlSPp9diX5F2PBmBy6Bog': 'ka6K32kyo7nDZfNkjQCGWf1lpApXMd1B',
'bvJ0dQpav07l0hG5JgfVLF2dv1vARwpP': 'BzoQW24GrJZoJfmNodiJKSPeB9B8NOxj',
'lxQMLg2XZKuEZaWgsqubBxV9INZ6bryY': 'Vm2Mx6noKds9jB71h6urazwlTG3m9x8l',
'04EnjvXeoSmkbJ9ckPs7oY0mcxv7PlyN': 'aXERQP9LMfQVlEDsgGs6eEA1SWznAQ8P',
'mQbO2ge6BFRWVPYCYpU06YvNt80XLvAX': 'E2BV1NGmasN5v7eujECVPJgwflnLPm2A',
'g43oeBzJrCml7o6fa5fRL1ErCdeD8z4K': 'RX34mZ6zVH4Nr6whbxIGLv9WSbxEKo8V',
'VQrDJoP7mtdBzkxhXbSPwGB1coeElk4x': 'j2VejQx0VFKQepAF7dI0mJLKtOVJE18z',
'WxA5NzLRjCrmq0NUgaU5pdMDuZO7RJ4w': 'lyY5ADLKaIOLEgAsGQCveEMAcqnx3rY9',
'M4lpMXB71ie0PjMCjdFzVXq0SeRVqz49': 'n2zVkOqaLIv3GbLfBjcwW51LcveWOZ2e',
'dyDZGEqN8u8nkJZcJns0oxYmtP7KbGAn': 'VXOEqQW9BtEVLajfZQSLEqxgS5B7qn2D',
'E7QNjrVY5u5mGvgu67IoDgV1CjEND8QR': 'rz8AaDmdKIkLmPNhB5ILPJnjS5PnlL8d',
'a4zrqjoKlfzg0dwHEWtP31VqcLBpjm4g': 'LY9J16gwETdGWa3hjBu5o0RzuoQDjqXQ',
'dQP5BZroMsMVLO1hbmT5r2Enu86GjxA6': '7XR3oOdbPF6x3PRFLDCq9RkgsRjAo48V',
'M4lKNBO1NFe0PjMCj1tzVXq0SeRVqzA9': 'n2zoRqGLRUv3GbLfBmTwW51LcveWOZYe',
'nAZ7MZdpGCGg1pqFEbsoJOz2C60mv143': 'dYJgdqA9aT4yojETqGi7yNgoFADxqmXP',
'3y1MERYgOuE9NzbFgwhV6Wv2F0YKvbyz': '081xpZDQgC4VadLTavhWQxrku56DAgXV',
'bmQvmEXr5HWklBMCZOcpE2Z3HBYwqGyl': 'zxXPbVNyMiMAZldhr9FkOmA0fl4aKr2v',
'wA7oDNYldfr6050Hwxi52lPZiVlB86Ap': 'ZYK16aA7ni0d3l3c34uwpxD7CbReMm8Q',
'g43MbKMWmFml7o7sJoSRkXxZiXRvJ3QK': 'RX3oBJonvs4Nr6rUWBCGn3matRGqJPXV',
'mA9VdlqpLS0raGaSDvtoqNrBTzb8XY4q': '0XN4OjBD3fnW7r7IbmtJB4AyfOmlrE2r',
'mAajOwgkGt17oGoFmEuklMP9H0GnW54d': 'lXbBLPGyzikNGeGujAuAJGjZiwLRxyXR',
'vy8vjJ9kbUwrRqRu59Cj5dWZfzYErlAb': 'K8l7gpwaGcBpnAnCLNCmPZRdin3eaQX0',
'xQMWBpR8oHEZaWaSMGUb0avOHjLVYn4Y': 'm2MrN4vEaf9jB7BFy5Srb40jTrN67AYl',
'xyKEmVO3miRr6D6UVkt7oB8jtD6aJEAv': 'g2ddDebqDfqdgKgswyUKwGjbTWwzq923',
'7Qk0wa2D9FjKapacoJF27aLvUDKkLGA0': 'b2kgBEkephJaMkMTL7s1PLe4Ua6WyP2P',
'3QLg6nqmNTJ5VvVTo7f508LPidz1xwyY': 'g2L1GgpraipmAOAUqmIbBnPxHOmw4MYa',
'3y1B7zZjXTE9NZNSzZSVNPZaTNLjo6Qz': '081b5G6wzH4VagaURmcWbN5mT4JGEe2V',
'lAqnwvkw6SG6D8DSqmUg6DRLUp0w3G4x': 'O2pbP0xPDFNJjpjIEvcdryOJtpkVM4X5',
'awA7xd1N0Hr6050Hw2c52lPZiVlB864p': 'GZYKpn4aoT0d3l3c3PiwpxD7CbReMmXQ',
'jQVqPLl9YHL1WGWtR1HDgWBGT63qRNyV': '6X03ne6vrU4oWyWUN7tQVoajikxJR3Ye',
'GQRMR8mL7uZK797t7xH3eNzPIP5dOny1': 'm2vqPWGd4U31zWzSyasDRAoMT1PKRp8o',
'zydq9RdmRhXLkNkfNoTJlMzaF0lWekQB': '3X7LnvE7vH5nkEkSqLiey793Un7dLB8e',
'VQrDzwkB2IdBzjzu9MHPbEYkSB50gR4x': 'j2VebLzoKUKQeEesmVh0gM1eIp9jKz8z',
'mAa2wMamBs17oGoFmktklMP9H0GnW54d': 'lXbgP74xZTkNGeGujVUAJGjZiwLRxy8R',
'7yjB6ZLG6sW8R6RF2xcan1KGfJ5dNoyd': 'wXQkPorvPHZ45N5t4Jf6qwg5Tp4xvw29',
'a4zPpNeWGuzg0m0iX3tPeanGSkRKWXQg': 'LY9oa3QAyHdGW9Wu3Ri5JGeEik7l1N8Q',
'k2rneA2M38k25cXDwwSknTJlxPxQLZ6M': '61lyA2aEVDzklfdwmmh31saPxQx2VRjp',
'bK9Zk4OvPnvxduLgxvi8VUeojnjA02eV': 'o5jANYjbeMb4nfBaQvcLAt1jzLzYx6ze',
'5VD6EydM3R9orHmNMGInGCJwbxbQvGRw': 'w3zjmX7g4vnxzCxElvUEOiewkokXprkZ',
'70X35QbVYVYNPUmP9YfbzI06YqYQk2R1': 'vG4Aj2BMjMjoztB7zeFOnCVPJpJ8lMOa',
'26qYwQVG9p1Bks2GgBckjfDJOXOAMgG1': 'r4ev9X0mv5zqJc0yk5IBDcQOwZw8mnwQ',
'rvVKpA56MBXWlSxMw3cobT5pdkd4Dm7q': '1J7ZkY53pZ645c93owcLZuveE7E8B3rL',
'qN1zdy1zlYL23IWZGWtDvfV6WeWQWkJo': 'qN1zdy1zlYL23IWZGWtDvfV6WeWQWkJo',
'jdKqRGF16dKsBviMDae7IGDl7oTjEbVV': 'Q09l7vhlNxPFErIOK6BVCe7KnwUW5DVV',
'3QLkogW1OUJ5VvPsrDH56DY2u7lgZWyY': 'g2LRE1V9espmAOPhE4ubj4ZdUA57yDXa',
'wyJvWbXGBSdbkEzhv0CW8meou82aqRy8': 'M2wolPvyBIpQGkbT4juedD4ruzQGdK2y',
'7QkdZrzEkFjKap6IYDU2PB0oCNZORmA0': 'b2kN1l96qhJaMkPs9dt1lpjBfwqZoA8P',
'pvA05113MHG1w3JTYxc6DVlRCjErVz4O': 'gQXeAbblBUnDJ7vujbHvbRd1cxlz3AXO',
'mA9blJDZwT0raG1cvkuoeVjLC7ZWd54q': '0XN9jRPwMHnW7rvumgfJZOD9CJgVkWYr',
'5QwRN5qKJTvGKlDTmnf7xwNZcjRmvEy9': 'R2GP6LWBJU1QlnytwGt0B9pytWwAdDYy',
'eyn5rPPbkfw2KYxH32fG1q58CbLJzM40': 'p2gyqooZnS56JWeiDgfmOy1VugOQEBXn',
'3BABn3b5RfPJGDwilbHe7l82uBoR05Am': '7OYZG7KMVhbPdKJS3xcWEN3AuDlLNmXj',
'xA5zNGXD3HrmqMlF6OS5pdMDuZO7RJ4w': 'yY5DAm6r1IOLE3BCVMFveEMAcqnx3r29',
'g43PgW3JZfml7o6fDEURL1ErCdeD8zyK': 'RX3aQn1zrS4Nr6whDgCGLv9WSbxEKo2V',
'lAqp8WbGgiG6D8LTKJcg3O72CDdre1Qx': 'O2pnm6473HNJjpKuVosd3vVeh975yrX5',
'wyJbYEDxKSdbkJ6S6RhW8meou82aqRy8': 'M2wPm7EgRSpQGlAh70CedD4ruzQGdKYy',
'M4lgW28nLCe0PVdtaXszVXq0SeRVqzA9': 'n2zmJvg4jHv3G0ETNgiwW51LcveWOZ8e',
'5Qw3OVvp9FvGKlDTmOC7xwNZcjRmvEQ9': 'R2GzDdml9F1Qlnytw9s0B9pytWwAdD8y',
'vy8a98X7zCwrRqbHrLUjYzwDiK2b70Qb': 'K8lVwzyjZiBpnAaSGeUmnAgxuGOBxmY0',
'g4eGjJLLoiqRD3Pf9oT5O03LuNbLRDQp': '6XqD59zzpfN4EwQuaGt67qNpSyRBlnYy',
'g43OPp9boIml7o6fDOIRL1ErCdeD8z4K': 'RX33alNB4s4Nr6whDPUGLv9WSbxEKoXV',
'xA2ng9OkBcGKzDbTkKsJlx7dUK8R3dA5': 'z2aPnJvzBfObkwGC3vFaPxeBhxoMqZ8K',
'xyKEgBajZuRr6DEC0Kt7XpD1cnNW9gAv': 'g2ddlEBvRsqdgKaI4jUK9PrgfMexGZ23',
'BAogww51jIMa2JnH1BcYpXM5F658RNAL': 'rYWDmm0KptlkGv4FGJFMdZmjs9RDE6XR',
'BAokpg62VtMa2JnH1mHYpXM5F658RNAL': 'rYWryDnlNslkGv4FG4HMdZmjs9RDE62R',
'a4z1Px5e2hzg0m0iMMCPeanGSkRKWXAg': 'LY9eorNQGUdGW9WuKKf5JGeEik7l1NYQ',
'kAx69R58kF9nY5YcdecJdl2pFXP53WyX': 'gXyRxELpbfPvLeLSaRil0mp6UEzbZJ8L',
'BAoY13nwViMa2J2uo2cY6BlETgmdwryL': 'rYWwKzJmNFlkGvGtNoUM9bzwIJVzB1YR',
}
_MCP_TO_ACCESS_KEY_TABLE = {
'qa': 'anvato_mcpqa_demo_web_stage_18b55e00db5a13faa8d03ae6e41f6f5bcb15b922',
'lin': 'anvato_mcp_lin_web_prod_4c36fbfd4d8d8ecae6488656e21ac6d1ac972749',
'univison': 'anvato_mcp_univision_web_prod_37fe34850c99a3b5cdb71dab10a417dd5cdecafa',
'uni': 'anvato_mcp_univision_web_prod_37fe34850c99a3b5cdb71dab10a417dd5cdecafa',
'dev': 'anvato_mcp_fs2go_web_prod_c7b90a93e171469cdca00a931211a2f556370d0a',
'sps': 'anvato_mcp_sps_web_prod_54bdc90dd6ba21710e9f7074338365bba28da336',
'spsstg': 'anvato_mcp_sps_web_prod_54bdc90dd6ba21710e9f7074338365bba28da336',
'anv': 'anvato_mcp_anv_web_prod_791407490f4c1ef2a4bcb21103e0cb1bcb3352b3',
'gray': 'anvato_mcp_gray_web_prod_4c10f067c393ed8fc453d3930f8ab2b159973900',
'hearst': 'anvato_mcp_hearst_web_prod_5356c3de0fc7c90a3727b4863ca7fec3a4524a99',
'cbs': 'anvato_mcp_cbs_web_prod_02f26581ff80e5bda7aad28226a8d369037f2cbe',
'telemundo': 'anvato_mcp_telemundo_web_prod_c5278d51ad46fda4b6ca3d0ea44a7846a054f582'
}
_TOKEN_GENERATORS = {
'GXvEgwyJeWem8KCYXfeoHWknwP48Mboj': NFLTokenGenerator,
}
_API_KEY = '3hwbSuqqT690uxjNYBktSQpa5ZrpYYR0Iofx7NcJHyA'
_ANVP_RE = r'<script[^>]+\bdata-anvp\s*=\s*(["\'])(?P<anvp>(?:(?!\1).)+)\1'
_AUTH_KEY = b'\x31\xc2\x42\x84\x9e\x73\xa0\xce'
_TESTS = [{
# from https://www.boston25news.com/news/watch-humpback-whale-breaches-right-next-to-fishing-boat-near-nh/817484874
'url': 'anvato:8v9BEynrwx8EFLYpgfOWcG1qJqyXKlRM:4465496',
'info_dict': {
'id': '4465496',
'ext': 'mp4',
'title': 'VIDEO: Humpback whale breaches right next to NH boat',
'description': 'VIDEO: Humpback whale breaches right next to NH boat. Footage courtesy: Zach Fahey.',
'duration': 22,
'timestamp': 1534855680,
'upload_date': '20180821',
'uploader': 'ANV',
},
'params': {
'skip_download': True,
},
}, {
# from https://sanfrancisco.cbslocal.com/2016/06/17/source-oakland-cop-on-leave-for-having-girlfriend-help-with-police-reports/
'url': 'anvato:DVzl9QRzox3ZZsP9bNu5Li3X7obQOnqP:3417601',
'only_matching': True,
}]
def __init__(self, *args, **kwargs):
    """Initialise the extractor and reset the cached server time."""
    # All arguments are forwarded unchanged to the parent initializer.
    super(AnvatoIE, self).__init__(*args, **kwargs)
    # Filled in lazily by _server_time() on its first call.
    self.__server_time = None
def _server_time(self, access_key, video_id):
if self.__server_time is not None:
return self.__server_time
self.__server_time = int(self._download_json(
self._api_prefix(access_key) + 'server_time?anvack=' + access_key, video_id,
note='Fetching server time')['server_time'])
return self.__server_time
def _api_prefix(self, access_key):
return 'https://tkx2-%s.anvato.net/rest/v2/' % ('prod' if 'prod' in access_key else 'stage')
def _get_video_json(self, access_key, video_id):
    """Download and return the metadata JSON for ``video_id``.

    The request URL carries an ``X-Anvato-Adst-Auth`` query parameter: the
    string ``<server_time>~<md5(url)>~<md5(server_time)>`` (first 64
    characters), AES-encrypted with ``_AUTH_KEY`` and base64-encoded. The
    request data is a JSON object ``{"api": {...}}`` holding a random
    request id, the server time and either a generated token (``anvstk2``)
    or an MD5-based one (``anvstk``).
    """
    # See et() in anvplayer.min.js, which is an alias of getVideoJSON()
    endpoint = 'mcp/video/%s?anvack=%s' % (video_id, access_key)
    video_data_url = self._api_prefix(access_key) + endpoint
    server_time = self._server_time(access_key, video_id)

    input_data = '%d~%s~%s' % (
        server_time, md5_text(video_data_url), md5_text(server_time))
    encrypted = aes_encrypt(
        bytes_to_intlist(input_data[:64]), bytes_to_intlist(self._AUTH_KEY))
    auth_secret = intlist_to_bytes(encrypted)
    video_data_url += '&X-Anvato-Adst-Auth=' + base64.b64encode(auth_secret).decode('ascii')

    # Random per-request identifier, truncated to 30 hex characters.
    anvrid = md5_text(time.time() * 1000 * random.random())[:30]
    api = {
        'anvrid': anvrid,
        'anvts': server_time,
    }

    token_generator = self._TOKEN_GENERATORS.get(access_key)
    if token_generator is None:
        # No dedicated generator: sign with the per-key secret, falling
        # back to the generic API key for unknown access keys.
        secret = self._ANVACK_TABLE.get(access_key, self._API_KEY)
        api['anvstk'] = md5_text('%s|%s|%d|%s' % (
            access_key, anvrid, server_time, secret))
    else:
        api['anvstk2'] = token_generator.generate(self, access_key, video_id)

    payload = json.dumps({'api': api}).encode('utf-8')
    return self._download_json(
        video_data_url, video_id, transform_source=strip_jsonp,
        data=payload)
def _get_anvato_videos(self, access_key, video_id):
    """Build the info dict for an Anvato video.

    Downloads the video JSON via ``_get_video_json`` and converts every
    entry of ``published_urls`` into one or more formats (SMIL and HLS
    variant playlists are expanded, everything else becomes a single
    format). Captions are collected into a subtitles dict keyed by
    language.

    Returns a dict with ``id``, ``formats``, ``subtitles`` and the
    metadata fields read from the video JSON.
    """
    video_data = self._get_video_json(access_key, video_id)

    formats = []
    for published_url in video_data['published_urls']:
        video_url = published_url['embed_url']
        media_format = published_url.get('format')
        ext = determine_ext(video_url)

        if ext == 'smil' or media_format == 'smil':
            formats.extend(self._extract_smil_formats(video_url, video_id))
            continue

        tbr = int_or_none(published_url.get('kbps'))
        a_format = {
            'url': video_url,
            'format_id': join_nonempty('http', published_url.get('cdn_name')).lower(),
            'tbr': tbr or None,
        }

        if media_format == 'm3u8' and tbr is not None:
            # A single-bitrate HLS rendition: keep it as one format.
            a_format.update({
                'format_id': join_nonempty('hls', tbr),
                'ext': 'mp4',
            })
        elif media_format == 'm3u8-variant' or ext == 'm3u8':
            # A variant playlist: expand it into its individual formats.
            formats.extend(self._extract_m3u8_formats(
                video_url, video_id, 'mp4', entry_protocol='m3u8_native',
                m3u8_id='hls', fatal=False))
            continue
        elif ext == 'mp3' or media_format == 'mp3':
            a_format['vcodec'] = 'none'
        else:
            a_format.update({
                'width': int_or_none(published_url.get('width')),
                'height': int_or_none(published_url.get('height')),
            })
        formats.append(a_format)

    self._sort_formats(formats)

    subtitles = {}
    # ``or []`` also covers an explicit null value, which dict.get's
    # default argument would not.
    for caption in video_data.get('captions') or []:
        a_caption = {
            'url': caption['url'],
            'ext': 'tt' if caption.get('format') == 'SMPTE-TT' else None
        }
        subtitles.setdefault(caption['language'], []).append(a_caption)

    return {
        'id': video_id,
        'formats': formats,
        'title': video_data.get('def_title'),
        'description': video_data.get('def_description'),
        # ``or ''`` guards against a null ``def_tags`` value.
        'tags': (video_data.get('def_tags') or '').split(','),
        'categories': video_data.get('categories'),
        'thumbnail': video_data.get('src_image_url') or video_data.get('thumbnail'),
        'timestamp': int_or_none(video_data.get(
            'ts_published') or video_data.get('ts_added')),
        'uploader': video_data.get('mcp_id'),
        'duration': int_or_none(video_data.get('duration')),
        'subtitles': subtitles,
    }
@staticmethod
def _extract_urls(ie, webpage, video_id):
    """Return URL results for every Anvato player embedded in ``webpage``.

    Each ``data-anvp`` script attribute is parsed as JSON (non-fatally) with
    ``ie``. Entries are skipped when the JSON is empty, when ``video`` is
    not an all-digit string, or when no access key can be determined from
    ``accessKey`` or from the ``mcp`` lookup table.
    """
    entries = []
    for mobj in re.finditer(AnvatoIE._ANVP_RE, webpage):
        anvplayer_data = ie._parse_json(
            mobj.group('anvp'), video_id, transform_source=unescapeHTML,
            fatal=False)
        if not anvplayer_data:
            continue

        video = anvplayer_data.get('video')
        if not (isinstance(video, compat_str) and video.isdigit()):
            continue

        access_key = anvplayer_data.get('accessKey')
        # Only consult the MCP table when no explicit access key is given.
        mcp = None if access_key else anvplayer_data.get('mcp')
        if mcp:
            access_key = AnvatoIE._MCP_TO_ACCESS_KEY_TABLE.get(mcp.lower())
        if not access_key:
            continue

        anvato_url = 'anvato:%s:%s' % (access_key, video)
        entries.append(ie.url_result(
            anvato_url, ie=AnvatoIE.ie_key(), video_id=video))
    return entries
def _extract_anvato_videos(self, webpage, video_id):
anvplayer_data = self._parse_json(
self._html_search_regex(
self._ANVP_RE, webpage, 'Anvato player data', group='anvp'),
video_id)
return self._get_anvato_videos(
anvplayer_data['accessKey'], anvplayer_data['video'])
def _real_extract(self, url):
    """Extract an ``anvato:<access key or MCP>:<id>`` URL.

    Smuggled ``geo_countries`` data, if any, is used to initialise geo
    bypass. When the first URL component is not a known access key it is
    looked up in the MCP table, and used unchanged if that lookup yields
    nothing.
    """
    url, smuggled_data = unsmuggle_url(url, {})
    geo_countries = smuggled_data.get('geo_countries')
    self._initialize_geo_bypass({
        'countries': geo_countries,
    })

    mobj = self._match_valid_url(url)
    access_key = mobj.group('access_key_or_mcp')
    video_id = mobj.group('id')

    if access_key not in self._ANVACK_TABLE:
        mapped_key = self._MCP_TO_ACCESS_KEY_TABLE.get(access_key)
        if mapped_key:
            access_key = mapped_key
    return self._get_anvato_videos(access_key, video_id)

View File

@@ -0,0 +1,7 @@
from __future__ import unicode_literals
from .nfl import NFLTokenGenerator
__all__ = [
'NFLTokenGenerator',
]

View File

@@ -0,0 +1,6 @@
from __future__ import unicode_literals
class TokenGenerator:
    """Base class for token generators; subclasses provide ``generate``."""

    def generate(self, anvack, mcp_id):
        """Always raise ``NotImplementedError``; subclasses must override."""
        message = 'This method must be implemented by subclasses'
        raise NotImplementedError(message)

View File

@@ -0,0 +1,30 @@
from __future__ import unicode_literals
import json
from .common import TokenGenerator
class NFLTokenGenerator(TokenGenerator):
    """Token generator that requests media tokens from api.nfl.com."""

    # Cached "<token_type> <access_token>" header value; stored on the class
    # so it is fetched only once and reused by later calls.
    _AUTHORIZATION = None

    @staticmethod
    def generate(ie, anvack, mcp_id):
        """Return the media token for ``mcp_id`` under access key ``anvack``.

        ``ie`` is the extractor whose ``_download_json`` performs the
        requests. Declared as a static method because it takes no instance
        argument; this keeps ``NFLTokenGenerator.generate(ie, ...)`` working
        and also makes the call valid on an instance.

        Raises KeyError if either response lacks the expected fields.
        """
        if not NFLTokenGenerator._AUTHORIZATION:
            # Obtain client credentials once and remember them.
            reroute = ie._download_json(
                'https://api.nfl.com/v1/reroute', mcp_id,
                data=b'grant_type=client_credentials',
                headers={'X-Domain-Id': 100})
            NFLTokenGenerator._AUTHORIZATION = '%s %s' % (reroute.get('token_type') or 'Bearer', reroute['access_token'])
        return ie._download_json(
            'https://api.nfl.com/v3/shield/', mcp_id, data=json.dumps({
                'query': '''{
viewer {
mediaToken(anvack: "%s", id: %s) {
token
}
}
}''' % (anvack, mcp_id),
            }).encode(), headers={
                'Authorization': NFLTokenGenerator._AUTHORIZATION,
                'Content-Type': 'application/json',
            })['data']['viewer']['mediaToken']['token']

View File

@@ -0,0 +1,136 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .yahoo import YahooIE
from ..utils import (
ExtractorError,
int_or_none,
parse_qs,
url_or_none,
)
class AolIE(YahooIE):
IE_NAME = 'aol.com'
_VALID_URL = r'(?:aol-video:|https?://(?:www\.)?aol\.(?:com|ca|co\.uk|de|jp)/video/(?:[^/]+/)*)(?P<id>\d{9}|[0-9a-f]{24}|[0-9a-f]{8}-(?:[0-9a-f]{4}-){3}[0-9a-f]{12})'
_TESTS = [{
# video with 5min ID
'url': 'https://www.aol.com/video/view/u-s--official-warns-of-largest-ever-irs-phone-scam/518167793/',
'md5': '18ef68f48740e86ae94b98da815eec42',
'info_dict': {
'id': '518167793',
'ext': 'mp4',
'title': 'U.S. Official Warns Of \'Largest Ever\' IRS Phone Scam',
'description': 'A major phone scam has cost thousands of taxpayers more than $1 million, with less than a month until income tax returns are due to the IRS.',
'timestamp': 1395405060,
'upload_date': '20140321',
'uploader': 'Newsy Studio',
},
'params': {
# m3u8 download
'skip_download': True,
}
}, {
# video with vidible ID
'url': 'https://www.aol.com/video/view/netflix-is-raising-rates/5707d6b8e4b090497b04f706/',
'info_dict': {
'id': '5707d6b8e4b090497b04f706',
'ext': 'mp4',
'title': 'Netflix is Raising Rates',
'description': 'Netflix is rewarding millions of its long-standing members with an increase in cost. Veuers Carly Figueroa has more.',
'upload_date': '20160408',
'timestamp': 1460123280,
'uploader': 'Veuer',
},
'params': {
# m3u8 download
'skip_download': True,
}
}, {
'url': 'https://www.aol.com/video/view/park-bench-season-2-trailer/559a1b9be4b0c3bfad3357a7/',
'only_matching': True,
}, {
'url': 'https://www.aol.com/video/view/donald-trump-spokeswoman-tones-down-megyn-kelly-attacks/519442220/',
'only_matching': True,
}, {
'url': 'aol-video:5707d6b8e4b090497b04f706',
'only_matching': True,
}, {
'url': 'https://www.aol.com/video/playlist/PL8245/5ca79d19d21f1a04035db606/',
'only_matching': True,
}, {
'url': 'https://www.aol.ca/video/view/u-s-woman-s-family-arrested-for-murder-first-pinned-on-panhandler-police/5c7ccf45bc03931fa04b2fe1/',
'only_matching': True,
}, {
'url': 'https://www.aol.co.uk/video/view/-one-dead-and-22-hurt-in-bus-crash-/5cb3a6f3d21f1a072b457347/',
'only_matching': True,
}, {
'url': 'https://www.aol.de/video/view/eva-braun-privataufnahmen-von-hitlers-geliebter-werden-digitalisiert/5cb2d49de98ab54c113d3d5d/',
'only_matching': True,
}, {
'url': 'https://www.aol.jp/video/playlist/5a28e936a1334d000137da0c/5a28f3151e642219fde19831/',
'only_matching': True,
}, {
# Yahoo video
'url': 'https://www.aol.com/video/play/991e6700-ac02-11ea-99ff-357400036f61/24bbc846-3e30-3c46-915e-fe8ccd7fcc46/',
'only_matching': True,
}]
def _real_extract(self, url):
    """Extract an AOL video.

    IDs containing a hyphen are delegated to ``_extract_yahoo_video``. All
    other IDs are looked up through the AOL feed API; formats come from
    the master playlist and from each rendition, whose dimensions are read
    from a ``<width>x<height>`` fragment in its URL or, failing that, from
    its ``w``/``h`` query parameters.

    Raises ExtractorError (expected) when the API status is not ``Ok``.
    """
    video_id = self._match_id(url)
    if '-' in video_id:
        return self._extract_yahoo_video(video_id, 'us')

    details_url = 'https://feedapi.b2c.on.aol.com/v1.0/app/videos/aolon/%s/details' % video_id
    response = self._download_json(details_url, video_id)['response']
    status_text = response['statusText']
    if status_text != 'Ok':
        raise ExtractorError('%s said: %s' % (self.IE_NAME, status_text), expected=True)

    video_data = response['data']
    formats = []

    master_playlist = url_or_none(video_data.get('videoMasterPlaylist'))
    if master_playlist:
        formats.extend(self._extract_m3u8_formats(
            master_playlist, video_id, 'mp4', m3u8_id='hls', fatal=False))

    for rendition in video_data.get('renditions', []):
        rendition_url = url_or_none(rendition.get('url'))
        if not rendition_url:
            continue

        if rendition.get('format') == 'm3u8':
            formats.extend(self._extract_m3u8_formats(
                rendition_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
            continue

        # Prefer dimensions embedded in the URL path, else query parameters.
        size = re.search(r'(\d+)x(\d+)', rendition_url)
        if size:
            width = int(size.group(1))
            height = int(size.group(2))
        else:
            qs = parse_qs(rendition_url)
            width = int_or_none(qs.get('w', [None])[0])
            height = int_or_none(qs.get('h', [None])[0])

        formats.append({
            'url': rendition_url,
            'format_id': rendition.get('quality'),
            'width': width,
            'height': height,
        })

    self._sort_formats(formats)

    return {
        'id': video_id,
        'title': video_data['title'],
        'duration': int_or_none(video_data.get('duration')),
        'timestamp': int_or_none(video_data.get('publishDate')),
        'view_count': int_or_none(video_data.get('views')),
        'description': video_data.get('description'),
        'uploader': video_data.get('videoOwner'),
        'formats': formats,
    }

View File

@@ -0,0 +1,95 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
determine_ext,
int_or_none,
url_or_none,
)
class APAIE(InfoExtractor):
_VALID_URL = r'(?P<base_url>https?://[^/]+\.apa\.at)/embed/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
_TESTS = [{
'url': 'http://uvp.apa.at/embed/293f6d17-692a-44e3-9fd5-7b178f3a1029',
'md5': '2b12292faeb0a7d930c778c7a5b4759b',
'info_dict': {
'id': '293f6d17-692a-44e3-9fd5-7b178f3a1029',
'ext': 'mp4',
'title': '293f6d17-692a-44e3-9fd5-7b178f3a1029',
'thumbnail': r're:^https?://.*\.jpg$',
},
}, {
'url': 'https://uvp-apapublisher.sf.apa.at/embed/2f94e9e6-d945-4db2-9548-f9a41ebf7b78',
'only_matching': True,
}, {
'url': 'http://uvp-rma.sf.apa.at/embed/70404cca-2f47-4855-bbb8-20b1fae58f76',
'only_matching': True,
}, {
'url': 'http://uvp-kleinezeitung.sf.apa.at/embed/f1c44979-dba2-4ebf-b021-e4cf2cac3c81',
'only_matching': True,
}]
@staticmethod
def _extract_urls(webpage):
return [
mobj.group('url')
for mobj in re.finditer(
r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//[^/]+\.apa\.at/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}.*?)\1',
webpage)]
def _real_extract(self, url):
    """Extract an APA embed.

    Downloads the ``/player/<id>`` page. If it names a JW Platform media
    id, the result is a URL result for the ``JWPlatform`` extractor.
    Otherwise title, description, poster and the ``hls``/``progressive``
    source URLs are read from quoted ``key: value`` pairs in the page.
    """
    mobj = self._match_valid_url(url)
    video_id, base_url = mobj.group('id', 'base_url')

    player_url = '%s/player/%s' % (base_url, video_id)
    webpage = self._download_webpage(player_url, video_id)

    jwplatform_id = self._search_regex(
        r'media[iI]d\s*:\s*["\'](?P<id>[a-zA-Z0-9]{8})', webpage,
        'jwplatform id', default=None)
    if jwplatform_id:
        return self.url_result(
            'jwplatform:' + jwplatform_id, ie='JWPlatform',
            video_id=video_id)

    def extract(field, name=None):
        # Value of a quoted `field": "..."` pair in the page, or None.
        pattern = r'\b%s["\']\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1' % field
        return self._search_regex(
            pattern, webpage, name or field, default=None, group='value')

    title = extract('title') or video_id
    description = extract('description')
    thumbnail = extract('poster', 'thumbnail')

    formats = []
    for format_id in ('hls', 'progressive'):
        source_url = url_or_none(extract(format_id))
        if not source_url:
            continue
        if determine_ext(source_url) == 'm3u8':
            formats.extend(self._extract_m3u8_formats(
                source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                m3u8_id='hls', fatal=False))
            continue
        # The number directly before ".mp4" in the URL is used as the height.
        raw_height = self._search_regex(
            r'(\d+)\.mp4', source_url, 'height', default=None)
        formats.append({
            'url': source_url,
            'format_id': format_id,
            'height': int_or_none(raw_height),
        })
    self._sort_formats(formats)

    return {
        'id': video_id,
        'title': title,
        'description': description,
        'thumbnail': thumbnail,
        'formats': formats,
    }

View File

@@ -0,0 +1,91 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
get_element_by_id,
int_or_none,
merge_dicts,
mimetype2ext,
url_or_none,
)
class AparatIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?aparat\.com/(?:v/|video/video/embed/videohash/)(?P<id>[a-zA-Z0-9]+)'
_TESTS = [{
'url': 'http://www.aparat.com/v/wP8On',
'md5': '131aca2e14fe7c4dcb3c4877ba300c89',
'info_dict': {
'id': 'wP8On',
'ext': 'mp4',
'title': 'تیم گلکسی 11 - زومیت',
'description': 'md5:096bdabcdcc4569f2b8a5e903a3b3028',
'duration': 231,
'timestamp': 1387394859,
'upload_date': '20131218',
'view_count': int,
},
}, {
# multiple formats
'url': 'https://www.aparat.com/v/8dflw/',
'only_matching': True,
}]
def _parse_options(self, webpage, video_id, fatal=True):
return self._parse_json(self._search_regex(
r'options\s*=\s*({.+?})\s*;', webpage, 'options', default='{}'), video_id)
def _real_extract(self, url):
    """Extract an Aparat video.

    The regular video page is tried first (non-fatally); if no player
    options are found there, the embed page is downloaded instead. Formats
    are built from the ``multiSRC`` option, and remaining metadata comes
    from JSON-LD, the ``videoTitle`` element or page meta tags.
    """
    video_id = self._match_id(url)

    # If available, provides more metadata
    webpage = self._download_webpage(url, video_id, fatal=False)
    options = self._parse_options(webpage, video_id, fatal=False)

    if not options:
        embed_url = 'http://www.aparat.com/video/video/embed/vt/frame/showvideo/yes/videohash/' + video_id
        webpage = self._download_webpage(
            embed_url, video_id, 'Downloading embed webpage')
        options = self._parse_options(webpage, video_id)

    formats = []
    for sources in (options.get('multiSRC') or []):
        for item in sources:
            if not isinstance(item, dict):
                continue
            file_url = url_or_none(item.get('src'))
            if not file_url:
                continue

            item_type = item.get('type')
            if item_type == 'application/vnd.apple.mpegurl':
                formats.extend(self._extract_m3u8_formats(
                    file_url, video_id, 'mp4',
                    entry_protocol='m3u8_native', m3u8_id='hls',
                    fatal=False))
                continue

            ext = mimetype2ext(item_type)
            label = item.get('label')
            # The height is read from a label such as "720p", when present.
            raw_height = self._search_regex(
                r'(\d+)[pP]', label or '', 'height', default=None)
            formats.append({
                'url': file_url,
                'ext': ext,
                'format_id': 'http-%s' % (label or ext),
                'height': int_or_none(raw_height),
            })
    self._sort_formats(formats)

    info = self._search_json_ld(webpage, video_id, default={})

    if not info.get('title'):
        title = get_element_by_id('videoTitle', webpage)
        if not title:
            title = self._html_search_meta(
                ['og:title', 'twitter:title', 'DC.Title', 'title'], webpage, fatal=True)
        info['title'] = title

    extracted = {
        'id': video_id,
        'thumbnail': url_or_none(options.get('poster')),
        'duration': int_or_none(options.get('duration')),
        'formats': formats,
    }
    return merge_dicts(info, extracted)

View File

@@ -0,0 +1,53 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
str_to_int,
ExtractorError
)
class AppleConnectIE(InfoExtractor):
    """Extractor for videos attached to posts on itunes.apple.com."""

    _VALID_URL = r'https?://itunes\.apple\.com/\w{0,2}/?post/(?:id)?sa\.(?P<id>[\w-]+)'
    _TESTS = [{
        'url': 'https://itunes.apple.com/us/post/idsa.4ab17a39-2720-11e5-96c5-a5b38f6c42d3',
        'md5': 'c1d41f72c8bcaf222e089434619316e4',
        'info_dict': {
            'id': '4ab17a39-2720-11e5-96c5-a5b38f6c42d3',
            'ext': 'm4v',
            'title': 'Energy',
            'uploader': 'Drake',
            'thumbnail': r're:^https?://.*\.jpg$',
            'upload_date': '20150710',
            'timestamp': 1436545535,
        },
    }, {
        'url': 'https://itunes.apple.com/us/post/sa.0fe0229f-2457-11e5-9f40-1bb645f2d5d9',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return the info dict for the video embedded in a post page.

        Raises an expected ExtractorError when the page carries no
        ``auc-video-data`` JSON blob, i.e. the post has no video.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        try:
            video_json = self._html_search_regex(
                r'class="auc-video-data">(\{.*?\})', webpage, 'json')
        except ExtractorError:
            raise ExtractorError('This post doesn\'t contain a video', expected=True)

        video_data = self._parse_json(video_json, video_id)

        # Timestamp and like count are optional metadata: a missing
        # attribute must not abort extraction of a playable video.
        timestamp = str_to_int(self._html_search_regex(
            r'data-timestamp="(\d+)"', webpage, 'timestamp', default=None))
        like_count = str_to_int(self._html_search_regex(
            r'(\d+) Loves', webpage, 'like count', default=None))

        return {
            'id': video_id,
            # URL and title are mandatory, so hard indexing is intentional.
            'url': video_data['sslSrc'],
            'title': video_data['title'],
            # Everything below is optional and therefore read tolerantly.
            'description': video_data.get('description'),
            'uploader': video_data.get('artistName'),
            'thumbnail': video_data.get('artworkUrl'),
            'timestamp': timestamp,
            'like_count': like_count,
        }

View File

@@ -0,0 +1,62 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
clean_podcast_url,
int_or_none,
parse_iso8601,
try_get,
)
class ApplePodcastsIE(InfoExtractor):
    """Extractor for single episodes on podcasts.apple.com."""

    _VALID_URL = r'https?://podcasts\.apple\.com/(?:[^/]+/)?podcast(?:/[^/]+){1,2}.*?\bi=(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://podcasts.apple.com/us/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777',
        'md5': 'df02e6acb11c10e844946a39e7222b08',
        'info_dict': {
            'id': '1000482637777',
            'ext': 'mp3',
            'title': '207 - Whitney Webb Returns',
            'description': 'md5:13a73bade02d2e43737751e3987e1399',
            'upload_date': '20200705',
            'timestamp': 1593921600,
            'duration': 6425,
            'series': 'The Tim Dillon Show',
        }
    }, {
        'url': 'https://podcasts.apple.com/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777',
        'only_matching': True,
    }, {
        'url': 'https://podcasts.apple.com/podcast/207-whitney-webb-returns?i=1000482637777',
        'only_matching': True,
    }, {
        'url': 'https://podcasts.apple.com/podcast/id1135137367?i=1000482637777',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return the info dict for the episode identified by ``i=<id>``.

        Episode data is read from the ``shoebox-ember-data-store`` JSON
        embedded in the page.
        """
        episode_id = self._match_id(url)
        webpage = self._download_webpage(url, episode_id)

        store_json = self._search_regex(
            r'id="shoebox-ember-data-store"[^>]*>\s*({.+?})\s*<',
            webpage, 'ember data')
        store = self._parse_json(store_json, episode_id)
        # The store is either keyed by episode id or is the record itself.
        store = store.get(episode_id) or store

        attributes = store['data']['attributes']
        description = attributes.get('description') or {}

        # The series name comes from the included 'media/podcast' record;
        # when several are present the last one wins.
        series = None
        for included in store.get('included') or []:
            if included.get('type') != 'media/podcast':
                continue
            series = try_get(included, lambda x: x['attributes']['name'])

        return {
            'id': episode_id,
            'title': attributes['name'],
            'url': clean_podcast_url(attributes['assetUrl']),
            'description': description.get('standard') or description.get('short'),
            'timestamp': parse_iso8601(attributes.get('releaseDateTime')),
            'duration': int_or_none(attributes.get('durationInMilliseconds'), 1000),
            'series': series,
        }

View File

@@ -0,0 +1,283 @@
from __future__ import unicode_literals
import re
import json
from .common import InfoExtractor
from ..compat import compat_urlparse
from ..utils import (
int_or_none,
parse_duration,
unified_strdate,
)
class AppleTrailersIE(InfoExtractor):
    """Extractor for movie pages on trailers.apple.com (one playlist per movie)."""

    IE_NAME = 'appletrailers'
    _VALID_URL = r'https?://(?:www\.|movie)?trailers\.apple\.com/(?:trailers|ca)/(?P<company>[^/]+)/(?P<movie>[^/]+)'
    _TESTS = [{
        'url': 'http://trailers.apple.com/trailers/wb/manofsteel/',
        'info_dict': {
            'id': '5111',
            'title': 'Man of Steel',
        },
        'playlist': [
            {
                'md5': 'd97a8e575432dbcb81b7c3acb741f8a8',
                'info_dict': {
                    'id': 'manofsteel-trailer4',
                    'ext': 'mov',
                    'duration': 111,
                    'title': 'Trailer 4',
                    'upload_date': '20130523',
                    'uploader_id': 'wb',
                },
            },
            {
                'md5': 'b8017b7131b721fb4e8d6f49e1df908c',
                'info_dict': {
                    'id': 'manofsteel-trailer3',
                    'ext': 'mov',
                    'duration': 182,
                    'title': 'Trailer 3',
                    'upload_date': '20130417',
                    'uploader_id': 'wb',
                },
            },
            {
                'md5': 'd0f1e1150989b9924679b441f3404d48',
                'info_dict': {
                    'id': 'manofsteel-trailer',
                    'ext': 'mov',
                    'duration': 148,
                    'title': 'Trailer',
                    'upload_date': '20121212',
                    'uploader_id': 'wb',
                },
            },
            {
                'md5': '5fe08795b943eb2e757fa95cb6def1cb',
                'info_dict': {
                    'id': 'manofsteel-teaser',
                    'ext': 'mov',
                    'duration': 93,
                    'title': 'Teaser',
                    'upload_date': '20120721',
                    'uploader_id': 'wb',
                },
            },
        ]
    }, {
        'url': 'http://trailers.apple.com/trailers/magnolia/blackthorn/',
        'info_dict': {
            'id': '4489',
            'title': 'Blackthorn',
        },
        'playlist_mincount': 2,
        'expected_warnings': ['Unable to download JSON metadata'],
    }, {
        # json data only available from http://trailers.apple.com/trailers/feeds/data/15881.json
        'url': 'http://trailers.apple.com/trailers/fox/kungfupanda3/',
        'info_dict': {
            'id': '15881',
            'title': 'Kung Fu Panda 3',
        },
        'playlist_mincount': 4,
    }, {
        'url': 'http://trailers.apple.com/ca/metropole/autrui/',
        'only_matching': True,
    }, {
        'url': 'http://movietrailers.apple.com/trailers/focus_features/kuboandthetwostrings/',
        'only_matching': True,
    }]

    _JSON_RE = r'iTunes.playURL\((.*?)\);'

    def _real_extract(self, url):
        """Return a playlist of all clips published for one movie.

        The per-film JSON feed is preferred. When it cannot be downloaded
        the legacy ``includes/playlists/itunes.inc`` fragment is parsed as
        XML and each trailer's settings JSON is fetched individually.
        """
        match = self._match_valid_url(url)
        movie = match.group('movie')
        uploader_id = match.group('company')

        webpage = self._download_webpage(url, movie)
        film_id = self._search_regex(r"FilmId\s*=\s*'(\d+)'", webpage, 'film id')

        film_data = self._download_json(
            'http://trailers.apple.com/trailers/feeds/data/%s.json' % film_id,
            film_id, fatal=False)

        if film_data:
            entries = []
            for clip in film_data.get('clips', []):
                clip_title = clip['title']

                # One format per (version, size) pair that carries a source URL.
                formats = [
                    {
                        'format_id': '%s-%s' % (version, size),
                        'url': re.sub(r'_(\d+p\.mov)', r'_h\1', size_data['src']),
                        'width': int_or_none(size_data.get('width')),
                        'height': int_or_none(size_data.get('height')),
                        'language': version[:2],
                    }
                    for version, version_data in clip.get('versions', {}).items()
                    for size, size_data in version_data.get('sizes', {}).items()
                    if size_data.get('src')
                ]
                self._sort_formats(formats)

                entries.append({
                    'id': movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', clip_title).lower(),
                    'formats': formats,
                    'title': clip_title,
                    'thumbnail': clip.get('screen') or clip.get('thumb'),
                    'duration': parse_duration(clip.get('runtime') or clip.get('faded')),
                    'upload_date': unified_strdate(clip.get('posted')),
                    'uploader_id': uploader_id,
                })

            movie_title = film_data.get('page', {}).get('movie_title')
            return self.playlist_result(entries, film_id, movie_title)

        # Legacy path: scrape the playlist include of the movie page.
        playlist_url = compat_urlparse.urljoin(url, 'includes/playlists/itunes.inc')

        def fix_html(s):
            """Massage the HTML fragment until it parses as XML."""
            s = re.sub(r'(?s)<script[^<]*?>.*?</script>', '', s)
            s = re.sub(r'<img ([^<]*?)/?>', r'<img \1/>', s)

            # The ' in the onClick attributes are not escaped, it couldn't be parsed
            # like: http://trailers.apple.com/trailers/wb/gravity/
            def _clean_json(m):
                return 'iTunes.playURL(%s);' % m.group(1).replace('\'', '&#39;')

            s = re.sub(self._JSON_RE, _clean_json, s)
            return '<html>%s</html>' % s

        doc = self._download_xml(playlist_url, movie, transform_source=fix_html)

        playlist = []
        for item in doc.findall('./div/ul/li'):
            on_click = item.find('.//a').attrib['onClick']
            trailer_info = json.loads(self._search_regex(
                self._JSON_RE, on_click, 'trailer info'))

            first_url = trailer_info.get('url')
            if not first_url:
                continue

            title = trailer_info['title']
            video_id = movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', title).lower()
            thumbnail = item.find('.//img').attrib['src']
            upload_date = trailer_info['posted'].replace('-', '')

            # Runtime is given as "M:SS"; anything else leaves duration unset.
            runtime_match = re.search(
                r'(?P<minutes>[0-9]+):(?P<seconds>[0-9]{1,2})', trailer_info['runtime'])
            duration = (
                60 * int(runtime_match.group('minutes')) + int(runtime_match.group('seconds'))
                if runtime_match else None)

            trailer_id = first_url.split('/')[-1].rpartition('_')[0].lower()
            settings_json_url = compat_urlparse.urljoin(url, 'includes/settings/%s.json' % trailer_id)
            settings = self._download_json(settings_json_url, trailer_id, 'Downloading settings json')

            # The src is a file pointing to the real video file
            formats = [
                {
                    'url': re.sub(r'_(\d*p\.mov)', r'_h\1', size_info['src']),
                    'format': size_info['type'],
                    'width': int_or_none(size_info['width']),
                    'height': int_or_none(size_info['height']),
                }
                for size_info in settings['metadata']['sizes']
            ]
            self._sort_formats(formats)

            playlist.append({
                '_type': 'video',
                'id': video_id,
                'formats': formats,
                'title': title,
                'duration': duration,
                'thumbnail': thumbnail,
                'upload_date': upload_date,
                'uploader_id': uploader_id,
                'http_headers': {
                    'User-Agent': 'QuickTime compatible (yt-dlp)',
                },
            })

        return {
            '_type': 'playlist',
            'id': movie,
            'entries': playlist,
        }
class AppleTrailersSectionIE(InfoExtractor):
    """Extractor for the ``#section=...`` listings on trailers.apple.com."""

    IE_NAME = 'appletrailers:section'
    # Maps the URL fragment value to the feed file name and display title.
    _SECTIONS = {
        'justadded': {
            'feed_path': 'just_added',
            'title': 'Just Added',
        },
        'exclusive': {
            'feed_path': 'exclusive',
            'title': 'Exclusive',
        },
        'justhd': {
            'feed_path': 'just_hd',
            'title': 'Just HD',
        },
        'mostpopular': {
            'feed_path': 'most_pop',
            'title': 'Most Popular',
        },
        'moviestudios': {
            'feed_path': 'studios',
            'title': 'Movie Studios',
        },
    }
    _VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/#section=(?P<id>%s)' % '|'.join(_SECTIONS)
    _TESTS = [{
        'url': 'http://trailers.apple.com/#section=justadded',
        'info_dict': {
            'title': 'Just Added',
            'id': 'justadded',
        },
        'playlist_mincount': 80,
    }, {
        'url': 'http://trailers.apple.com/#section=exclusive',
        'info_dict': {
            'title': 'Exclusive',
            'id': 'exclusive',
        },
        'playlist_mincount': 80,
    }, {
        'url': 'http://trailers.apple.com/#section=justhd',
        'info_dict': {
            'title': 'Just HD',
            'id': 'justhd',
        },
        'playlist_mincount': 80,
    }, {
        'url': 'http://trailers.apple.com/#section=mostpopular',
        'info_dict': {
            'title': 'Most Popular',
            'id': 'mostpopular',
        },
        'playlist_mincount': 30,
    }, {
        'url': 'http://trailers.apple.com/#section=moviestudios',
        'info_dict': {
            'title': 'Movie Studios',
            'id': 'moviestudios',
        },
        'playlist_mincount': 80,
    }]

    def _real_extract(self, url):
        """Return a playlist of URL results, one per item in the section feed."""
        section = self._match_id(url)
        section_info = self._SECTIONS[section]

        feed_url = 'http://trailers.apple.com/trailers/home/feeds/%s.json' % section_info['feed_path']
        section_data = self._download_json(feed_url, section)

        entries = []
        for item in section_data:
            entries.append(self.url_result('http://trailers.apple.com' + item['location']))

        return self.playlist_result(entries, section, section_info['title'])

View File

@@ -0,0 +1,673 @@
# coding: utf-8
from __future__ import unicode_literals
import re
import json
from .common import InfoExtractor
from .youtube import YoutubeIE, YoutubeBaseInfoExtractor
from ..compat import (
compat_urllib_parse_unquote,
compat_urllib_parse_unquote_plus,
compat_HTTPError
)
from ..utils import (
bug_reports_message,
clean_html,
dict_get,
extract_attributes,
ExtractorError,
get_element_by_id,
HEADRequest,
int_or_none,
join_nonempty,
KNOWN_EXTENSIONS,
merge_dicts,
mimetype2ext,
orderedSet,
parse_duration,
parse_qs,
str_to_int,
str_or_none,
traverse_obj,
try_get,
unified_strdate,
unified_timestamp,
urlhandle_detect_ext,
url_or_none
)
class ArchiveOrgIE(InfoExtractor):
    """Extractor for audio/video items hosted on archive.org.

    An item either resolves to a single media file or to a playlist of the
    files listed by its embeddable player.
    """

    IE_NAME = 'archive.org'
    IE_DESC = 'archive.org video and audio'
    _VALID_URL = r'https?://(?:www\.)?archive\.org/(?:details|embed)/(?P<id>[^?#]+)(?:[?].*)?$'
    _TESTS = [{
        'url': 'http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
        'md5': '8af1d4cf447933ed3c7f4871162602db',
        'info_dict': {
            'id': 'XD300-23_68HighlightsAResearchCntAugHumanIntellect',
            'ext': 'ogv',
            'title': '1968 Demo - FJCC Conference Presentation Reel #1',
            'description': 'md5:da45c349df039f1cc8075268eb1b5c25',
            'release_date': '19681210',
            'timestamp': 1268695290,
            'upload_date': '20100315',
            'creator': 'SRI International',
            'uploader': 'laura@archive.org',
        },
    }, {
        'url': 'https://archive.org/details/Cops1922',
        'md5': '0869000b4ce265e8ca62738b336b268a',
        'info_dict': {
            'id': 'Cops1922',
            'ext': 'mp4',
            'title': 'Buster Keaton\'s "Cops" (1922)',
            'description': 'md5:43a603fd6c5b4b90d12a96b921212b9c',
            'uploader': 'yorkmba99@hotmail.com',
            'timestamp': 1387699629,
            'upload_date': '20131222',
        },
    }, {
        'url': 'http://archive.org/embed/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
        'only_matching': True,
    }, {
        'url': 'https://archive.org/details/Election_Ads',
        'md5': '284180e857160cf866358700bab668a3',
        'info_dict': {
            'id': 'Election_Ads/Commercial-JFK1960ElectionAdCampaignJingle.mpg',
            'title': 'Commercial-JFK1960ElectionAdCampaignJingle.mpg',
            'ext': 'mp4',
        },
    }, {
        'url': 'https://archive.org/details/Election_Ads/Commercial-Nixon1960ElectionAdToughonDefense.mpg',
        'md5': '7915213ef02559b5501fe630e1a53f59',
        'info_dict': {
            'id': 'Election_Ads/Commercial-Nixon1960ElectionAdToughonDefense.mpg',
            'title': 'Commercial-Nixon1960ElectionAdToughonDefense.mpg',
            'ext': 'mp4',
            'timestamp': 1205588045,
            'uploader': 'mikedavisstripmaster@yahoo.com',
            'description': '1960 Presidential Campaign Election Commercials John F Kennedy, Richard M Nixon',
            'upload_date': '20080315',
        },
    }, {
        'url': 'https://archive.org/details/gd1977-05-08.shure57.stevenson.29303.flac16',
        'md5': '7d07ffb42aba6537c28e053efa4b54c9',
        'info_dict': {
            'id': 'gd1977-05-08.shure57.stevenson.29303.flac16/gd1977-05-08d01t01.flac',
            'title': 'Turning',
            'ext': 'flac',
        },
    }, {
        'url': 'https://archive.org/details/gd1977-05-08.shure57.stevenson.29303.flac16/gd1977-05-08d01t07.flac',
        'md5': 'a07cd8c6ab4ee1560f8a0021717130f3',
        'info_dict': {
            'id': 'gd1977-05-08.shure57.stevenson.29303.flac16/gd1977-05-08d01t07.flac',
            'title': 'Deal',
            'ext': 'flac',
            'timestamp': 1205895624,
            'uploader': 'mvernon54@yahoo.com',
            'description': 'md5:6a31f1996db0aa0fc9da6d6e708a1bb0',
            'upload_date': '20080319',
            'location': 'Barton Hall - Cornell University',
        },
    }, {
        'url': 'https://archive.org/details/lp_the-music-of-russia_various-artists-a-askaryan-alexander-melik',
        'md5': '7cb019baa9b332e82ea7c10403acd180',
        'info_dict': {
            'id': 'lp_the-music-of-russia_various-artists-a-askaryan-alexander-melik/disc1/01.01. Bells Of Rostov.mp3',
            'title': 'Bells Of Rostov',
            'ext': 'mp3',
        },
    }, {
        'url': 'https://archive.org/details/lp_the-music-of-russia_various-artists-a-askaryan-alexander-melik/disc1/02.02.+Song+And+Chorus+In+The+Polovetsian+Camp+From+%22Prince+Igor%22+(Act+2%2C+Scene+1).mp3',
        'md5': '1d0aabe03edca83ca58d9ed3b493a3c3',
        'info_dict': {
            'id': 'lp_the-music-of-russia_various-artists-a-askaryan-alexander-melik/disc1/02.02. Song And Chorus In The Polovetsian Camp From "Prince Igor" (Act 2, Scene 1).mp3',
            'title': 'Song And Chorus In The Polovetsian Camp From "Prince Igor" (Act 2, Scene 1)',
            'ext': 'mp3',
            'timestamp': 1569662587,
            'uploader': 'associate-joygen-odiongan@archive.org',
            'description': 'md5:012b2d668ae753be36896f343d12a236',
            'upload_date': '20190928',
        },
    }]

    @staticmethod
    def _playlist_data(webpage):
        """Return the decoded playlist stored on the embed page.

        The playlist is the JSON held in the ``value`` attribute of the
        ``<input class="js-play8-playlist">`` element. An IndexError is
        raised when the page contains no such element.
        """
        element = re.findall(r'''(?xs)
            <input
            (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
            \s+class=['"]?js-play8-playlist['"]?
            (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
            \s*/>
        ''', webpage)[0]

        return json.loads(extract_attributes(element)['value'])

    def _real_extract(self, url):
        """Return a single-video info dict or a playlist for an item.

        The URL id has the form ``<identifier>[/<file path>]``; when a file
        path is given only that playlist entry is extracted.
        """
        video_id = compat_urllib_parse_unquote_plus(self._match_id(url))
        identifier, entry_id = (video_id.split('/', 1) + [None])[:2]

        # Archive.org metadata API doesn't clearly demarcate playlist entries
        # or subtitle tracks, so we get them from the embeddable player.
        embed_page = self._download_webpage(f'https://archive.org/embed/{identifier}', identifier)
        playlist = self._playlist_data(embed_page)

        entries = {}
        for p in playlist:
            # If the user specified a playlist entry in the URL, ignore the
            # rest of the playlist.
            if entry_id and p['orig'] != entry_id:
                continue

            entries[p['orig']] = {
                'formats': [],
                'thumbnails': [],
                'artist': p.get('artist'),
                'track': p.get('title'),
                'subtitles': {},
            }

            for track in p.get('tracks', []):
                if track['kind'] != 'subtitles':
                    continue

                # Subtitles belong in the entry's 'subtitles' mapping
                # (label -> list of subtitle formats). They used to be
                # written as top-level entry keys named after the label,
                # which left 'subtitles' permanently empty.
                entries[p['orig']]['subtitles'].setdefault(track['label'], []).append({
                    'url': 'https://archive.org/' + track['file'].lstrip('/')
                })

        metadata = self._download_json('http://archive.org/metadata/' + identifier, identifier)
        m = metadata['metadata']
        # Use the identifier as reported by the metadata API from here on.
        identifier = m['identifier']

        info = {
            'id': identifier,
            'title': m['title'],
            'description': clean_html(m.get('description')),
            'uploader': dict_get(m, ['uploader', 'adder']),
            'creator': m.get('creator'),
            'license': m.get('licenseurl'),
            'release_date': unified_strdate(m.get('date')),
            'timestamp': unified_timestamp(dict_get(m, ['publicdate', 'addeddate'])),
            'webpage_url': f'https://archive.org/details/{identifier}',
            'location': m.get('venue'),
            'release_year': int_or_none(m.get('year'))}

        for f in metadata['files']:
            if f['name'] in entries:
                # The file is itself a playlist entry: attach its metadata.
                entries[f['name']] = merge_dicts(entries[f['name']], {
                    'id': identifier + '/' + f['name'],
                    'title': f.get('title') or f['name'],
                    'display_id': f['name'],
                    'description': clean_html(f.get('description')),
                    'creator': f.get('creator'),
                    'duration': parse_duration(f.get('length')),
                    'track_number': int_or_none(f.get('track')),
                    'album': f.get('album'),
                    'discnumber': int_or_none(f.get('disc')),
                    'release_year': int_or_none(f.get('year'))})
                entry = entries[f['name']]
            elif traverse_obj(f, 'original', expected_type=str) in entries:
                # The file is derived from a playlist entry.
                entry = entries[f['original']]
            else:
                continue

            if f.get('format') == 'Thumbnail':
                entry['thumbnails'].append({
                    'id': f['name'],
                    'url': 'https://archive.org/download/' + identifier + '/' + f['name'],
                    'width': int_or_none(f.get('width')),
                    # Was read from 'width', reporting every thumbnail as square.
                    'height': int_or_none(f.get('height')),
                    'filesize': int_or_none(f.get('size'))})

            extension = (f['name'].rsplit('.', 1) + [None])[1]
            if extension in KNOWN_EXTENSIONS:
                entry['formats'].append({
                    'url': 'https://archive.org/download/' + identifier + '/' + f['name'],
                    'format': f.get('format'),
                    'width': int_or_none(f.get('width')),
                    'height': int_or_none(f.get('height')),
                    'filesize': int_or_none(f.get('size')),
                    'protocol': 'https'})

        for entry in entries.values():
            self._sort_formats(entry['formats'])

        if len(entries) == 1:
            # If there's only one item, use it as the main info dict
            only_video = next(iter(entries.values()))
            if entry_id:
                # An explicitly requested file takes priority over item data.
                info = merge_dicts(only_video, info)
            else:
                info = merge_dicts(info, only_video)
        else:
            # Otherwise, we have a playlist.
            info['_type'] = 'playlist'
            info['entries'] = list(entries.values())

        if metadata.get('reviews'):
            info['comments'] = []
            for review in metadata['reviews']:
                info['comments'].append({
                    'id': review.get('review_id'),
                    'author': review.get('reviewer'),
                    # Both parts are optional; a missing body used to raise
                    # TypeError on the string concatenation.
                    'text': str_or_none(review.get('reviewtitle'), '') + '\n\n' + str_or_none(review.get('reviewbody'), ''),
                    'timestamp': unified_timestamp(review.get('createdate')),
                    'parent': 'root'})

        return info
class YoutubeWebArchiveIE(InfoExtractor):
IE_NAME = 'web.archive:youtube'
IE_DESC = 'web.archive.org saved youtube videos, "ytarchive:" prefix'
_VALID_URL = r'''(?x)(?:(?P<prefix>ytarchive:)|
(?:https?://)?web\.archive\.org/
(?:web/)?(?:(?P<date>[0-9]{14})?[0-9A-Za-z_*]*/)? # /web and the version index is optional
(?:https?(?::|%3[Aa])//)?(?:
(?:\w+\.)?youtube\.com(?::(?:80|443))?/watch(?:\.php)?(?:\?|%3[fF])(?:[^\#]+(?:&|%26))?v(?:=|%3[dD]) # Youtube URL
|(?:wayback-fakeurl\.archive\.org/yt/) # Or the internal fake url
)
)(?P<id>[0-9A-Za-z_-]{11})
(?(prefix)
(?::(?P<date2>[0-9]{14}))?$|
(?:%26|[#&]|$)
)'''
_TESTS = [
{
'url': 'https://web.archive.org/web/20150415002341/https://www.youtube.com/watch?v=aYAGB11YrSs',
'info_dict': {
'id': 'aYAGB11YrSs',
'ext': 'webm',
'title': 'Team Fortress 2 - Sandviches!',
'description': 'md5:4984c0f9a07f349fc5d8e82ab7af4eaf',
'upload_date': '20110926',
'uploader': 'Zeurel',
'channel_id': 'UCukCyHaD-bK3in_pKpfH9Eg',
'duration': 32,
'uploader_id': 'Zeurel',
'uploader_url': 'http://www.youtube.com/user/Zeurel'
}
}, {
# Internal link
'url': 'https://web.archive.org/web/2oe/http://wayback-fakeurl.archive.org/yt/97t7Xj_iBv0',
'info_dict': {
'id': '97t7Xj_iBv0',
'ext': 'mp4',
'title': 'Why Machines That Bend Are Better',
'description': 'md5:00404df2c632d16a674ff8df1ecfbb6c',
'upload_date': '20190312',
'uploader': 'Veritasium',
'channel_id': 'UCHnyfMqiRRG1u-2MsSQLbXA',
'duration': 771,
'uploader_id': '1veritasium',
'uploader_url': 'http://www.youtube.com/user/1veritasium'
}
}, {
# Video from 2012, webm format itag 45. Newest capture is deleted video, with an invalid description.
# Should use the date in the link. Title ends with '- Youtube'. Capture has description in eow-description
'url': 'https://web.archive.org/web/20120712231619/http://www.youtube.com/watch?v=AkhihxRKcrs&gl=US&hl=en',
'info_dict': {
'id': 'AkhihxRKcrs',
'ext': 'webm',
'title': 'Limited Run: Mondo\'s Modern Classic 1 of 3 (SDCC 2012)',
'upload_date': '20120712',
'duration': 398,
'description': 'md5:ff4de6a7980cb65d951c2f6966a4f2f3',
'uploader_id': 'machinima',
'uploader_url': 'http://www.youtube.com/user/machinima'
}
}, {
# FLV video. Video file URL does not provide itag information
'url': 'https://web.archive.org/web/20081211103536/http://www.youtube.com/watch?v=jNQXAC9IVRw',
'info_dict': {
'id': 'jNQXAC9IVRw',
'ext': 'flv',
'title': 'Me at the zoo',
'upload_date': '20050423',
'channel_id': 'UC4QobU6STFB0P71PMvOGN5A',
'duration': 19,
'description': 'md5:10436b12e07ac43ff8df65287a56efb4',
'uploader_id': 'jawed',
'uploader_url': 'http://www.youtube.com/user/jawed'
}
}, {
'url': 'https://web.archive.org/web/20110712231407/http://www.youtube.com/watch?v=lTx3G6h2xyA',
'info_dict': {
'id': 'lTx3G6h2xyA',
'ext': 'flv',
'title': 'Madeon - Pop Culture (live mashup)',
'upload_date': '20110711',
'uploader': 'Madeon',
'channel_id': 'UCqMDNf3Pn5L7pcNkuSEeO3w',
'duration': 204,
'description': 'md5:f7535343b6eda34a314eff8b85444680',
'uploader_id': 'itsmadeon',
'uploader_url': 'http://www.youtube.com/user/itsmadeon'
}
}, {
# First capture is of dead video, second is the oldest from CDX response.
'url': 'https://web.archive.org/https://www.youtube.com/watch?v=1JYutPM8O6E',
'info_dict': {
'id': '1JYutPM8O6E',
'ext': 'mp4',
'title': 'Fake Teen Doctor Strikes AGAIN! - Weekly Weird News',
'upload_date': '20160218',
'channel_id': 'UCdIaNUarhzLSXGoItz7BHVA',
'duration': 1236,
'description': 'md5:21032bae736421e89c2edf36d1936947',
'uploader_id': 'MachinimaETC',
'uploader_url': 'http://www.youtube.com/user/MachinimaETC'
}
}, {
# First capture of dead video, capture date in link links to dead capture.
'url': 'https://web.archive.org/web/20180803221945/https://www.youtube.com/watch?v=6FPhZJGvf4E',
'info_dict': {
'id': '6FPhZJGvf4E',
'ext': 'mp4',
'title': 'WTF: Video Games Still Launch BROKEN?! - T.U.G.S.',
'upload_date': '20160219',
'channel_id': 'UCdIaNUarhzLSXGoItz7BHVA',
'duration': 798,
'description': 'md5:a1dbf12d9a3bd7cb4c5e33b27d77ffe7',
'uploader_id': 'MachinimaETC',
'uploader_url': 'http://www.youtube.com/user/MachinimaETC'
},
'expected_warnings': [
r'unable to download capture webpage \(it may not be archived\)'
]
}, { # Very old YouTube page, has - YouTube in title.
'url': 'http://web.archive.org/web/20070302011044/http://youtube.com/watch?v=-06-KB9XTzg',
'info_dict': {
'id': '-06-KB9XTzg',
'ext': 'flv',
'title': 'New Coin Hack!! 100% Safe!!'
}
}, {
'url': 'web.archive.org/https://www.youtube.com/watch?v=dWW7qP423y8',
'info_dict': {
'id': 'dWW7qP423y8',
'ext': 'mp4',
'title': 'It\'s Bootleg AirPods Time.',
'upload_date': '20211021',
'channel_id': 'UC7Jwj9fkrf1adN4fMmTkpug',
'channel_url': 'http://www.youtube.com/channel/UC7Jwj9fkrf1adN4fMmTkpug',
'duration': 810,
'description': 'md5:7b567f898d8237b256f36c1a07d6d7bc',
'uploader': 'DankPods',
'uploader_id': 'UC7Jwj9fkrf1adN4fMmTkpug',
'uploader_url': 'http://www.youtube.com/channel/UC7Jwj9fkrf1adN4fMmTkpug'
}
}, {
# player response contains '};' See: https://github.com/ytdl-org/youtube-dl/issues/27093
'url': 'https://web.archive.org/web/20200827003909if_/http://www.youtube.com/watch?v=6Dh-RL__uN4',
'info_dict': {
'id': '6Dh-RL__uN4',
'ext': 'mp4',
'title': 'bitch lasagna',
'upload_date': '20181005',
'channel_id': 'UC-lHJZR3Gqxm24_Vd_AJ5Yw',
'channel_url': 'http://www.youtube.com/channel/UC-lHJZR3Gqxm24_Vd_AJ5Yw',
'duration': 135,
'description': 'md5:2dbe4051feeff2dab5f41f82bb6d11d0',
'uploader': 'PewDiePie',
'uploader_id': 'PewDiePie',
'uploader_url': 'http://www.youtube.com/user/PewDiePie'
}
}, {
'url': 'https://web.archive.org/web/http://www.youtube.com/watch?v=kH-G_aIBlFw',
'only_matching': True
}, {
'url': 'https://web.archive.org/web/20050214000000_if/http://www.youtube.com/watch?v=0altSZ96U4M',
'only_matching': True
}, {
# Video not archived, only capture is unavailable video page
'url': 'https://web.archive.org/web/20210530071008/https://www.youtube.com/watch?v=lHJTf93HL1s&spfreload=10',
'only_matching': True
}, { # Encoded url
'url': 'https://web.archive.org/web/20120712231619/http%3A//www.youtube.com/watch%3Fgl%3DUS%26v%3DAkhihxRKcrs%26hl%3Den',
'only_matching': True
}, {
'url': 'https://web.archive.org/web/20120712231619/http%3A//www.youtube.com/watch%3Fv%3DAkhihxRKcrs%26gl%3DUS%26hl%3Den',
'only_matching': True
}, {
'url': 'https://web.archive.org/web/20060527081937/http://www.youtube.com:80/watch.php?v=ELTFsLT73fA&amp;search=soccer',
'only_matching': True
}, {
'url': 'https://web.archive.org/http://www.youtube.com:80/watch?v=-05VVye-ffg',
'only_matching': True
}, {
'url': 'ytarchive:BaW_jenozKc:20050214000000',
'only_matching': True
}, {
'url': 'ytarchive:BaW_jenozKc',
'only_matching': True
},
]
_YT_INITIAL_DATA_RE = r'(?:(?:(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;)|%s)' % YoutubeBaseInfoExtractor._YT_INITIAL_DATA_RE
_YT_INITIAL_PLAYER_RESPONSE_RE = r'(?:(?:(?:window\s*\[\s*["\']ytInitialPlayerResponse["\']\s*\]|ytInitialPlayerResponse)\s*=[(\s]*({.+?})[)\s]*;)|%s)' % YoutubeBaseInfoExtractor._YT_INITIAL_PLAYER_RESPONSE_RE
_YT_INITIAL_BOUNDARY_RE = r'(?:(?:var\s+meta|</script|\n)|%s)' % YoutubeBaseInfoExtractor._YT_INITIAL_BOUNDARY_RE
_YT_DEFAULT_THUMB_SERVERS = ['i.ytimg.com'] # thumbnails most likely archived on these servers
_YT_ALL_THUMB_SERVERS = orderedSet(
_YT_DEFAULT_THUMB_SERVERS + ['img.youtube.com', *[f'{c}{n or ""}.ytimg.com' for c in ('i', 's') for n in (*range(0, 5), 9)]])
_WAYBACK_BASE_URL = 'https://web.archive.org/web/%sif_/'
_OLDEST_CAPTURE_DATE = 20050214000000
_NEWEST_CAPTURE_DATE = 20500101000000
def _call_cdx_api(self, item_id, url, filters: list = None, collapse: list = None, query: dict = None, note='Downloading CDX API JSON'):
    """Query the Wayback CDX API and return its rows as dicts.

    The first row of the JSON response holds the field names; every
    following row is zipped with it. Returns None for an empty response,
    and also (after a warning) for a response that is not a list or that
    contains only the header row.
    """
    # CDX docs: https://github.com/internetarchive/wayback/blob/master/wayback-cdx-server/README.md
    cdx_query = {
        'url': url,
        'output': 'json',
        'fl': 'original,mimetype,length,timestamp',
        'limit': 500,
        'filter': ['statuscode:200'] + (filters or []),
        'collapse': collapse or [],
    }
    # Caller-supplied parameters override the defaults above.
    cdx_query.update(query or {})

    rows = self._download_json(
        'https://web.archive.org/cdx/search/cdx', item_id, note, query=cdx_query)

    if isinstance(rows, list):
        if len(rows) >= 2:
            # format response to make it easier to use
            header, *captures = rows
            return [dict(zip(header, capture)) for capture in captures]
        if not rows:
            return None
    self.report_warning('Error while parsing CDX API response' + bug_reports_message())
def _extract_yt_initial_variable(self, webpage, regex, video_id, name):
    """Find and decode a JSON object assigned to a YouTube page variable.

    The pattern anchored by the boundary regex is tried first, then the
    bare ``regex``. Without a match the default ``'{}'`` is parsed;
    parsing is non-fatal.
    """
    patterns = (
        r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE),
        regex,
    )
    raw_json = self._search_regex(patterns, webpage, name, default='{}')
    return self._parse_json(raw_json, video_id, fatal=False)
def _extract_webpage_title(self, webpage):
    """Return the video title taken from the page's ``<title>`` element.

    The 'YouTube -' prefix or '- YouTube' suffix is stripped; an empty
    string is returned when neither form matches.
    """
    raw_title = self._html_search_regex(
        r'<title>([^<]*)</title>', webpage, 'title', default='')
    # YouTube video pages appear to always have either 'YouTube -' as prefix or '- YouTube' as suffix.
    title_re = r'(?:YouTube\s*-\s*(.*)$)|(?:(.*)\s*-\s*YouTube$)'
    return self._html_search_regex(title_re, raw_title, 'title', default='')
def _extract_metadata(self, video_id, webpage):
    """Collect video metadata from an archived YouTube watch page.

    video_id -- id passed to the JSON parser for diagnostics
    webpage  -- HTML of the capture; a falsy value is treated as an
                empty page, in which case every field comes back empty

    Each field is taken from the first source that provides it: the
    embedded player response, the embedded initial data, and finally
    markup and meta tags of older page layouts.
    """
    # Only the meta-tag lookup used to be guarded against a missing page;
    # normalising here keeps the regex and element lookups below from
    # raising TypeError on a falsy value.
    webpage = webpage or ''

    search_meta = ((lambda x: self._html_search_meta(x, webpage, default=None)) if webpage else (lambda x: None))
    player_response = self._extract_yt_initial_variable(
        webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE, video_id, 'initial player response') or {}
    # The diagnostic name used to be a copy of the player-response one.
    initial_data = self._extract_yt_initial_variable(
        webpage, self._YT_INITIAL_DATA_RE, video_id, 'initial data') or {}

    initial_data_video = traverse_obj(
        initial_data, ('contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents', ..., 'videoPrimaryInfoRenderer'),
        expected_type=dict, get_all=False, default={})

    video_details = traverse_obj(
        player_response, 'videoDetails', expected_type=dict, get_all=False, default={})

    microformats = traverse_obj(
        player_response, ('microformat', 'playerMicroformatRenderer'), expected_type=dict, get_all=False, default={})

    video_title = (
        video_details.get('title')
        or YoutubeBaseInfoExtractor._get_text(microformats, 'title')
        or YoutubeBaseInfoExtractor._get_text(initial_data_video, 'title')
        or self._extract_webpage_title(webpage)
        or search_meta(['og:title', 'twitter:title', 'title']))

    channel_id = str_or_none(
        video_details.get('channelId')
        or microformats.get('externalChannelId')
        or search_meta('channelId')
        or self._search_regex(
            r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1',  # @b45a9e6
            webpage, 'channel id', default=None, group='id'))
    channel_url = f'http://www.youtube.com/channel/{channel_id}' if channel_id else None

    duration = int_or_none(
        video_details.get('lengthSeconds')
        or microformats.get('lengthSeconds')
        or parse_duration(search_meta('duration')))
    description = (
        video_details.get('shortDescription')
        or YoutubeBaseInfoExtractor._get_text(microformats, 'description')
        or clean_html(get_element_by_id('eow-description', webpage))  # @9e6dd23
        or search_meta(['description', 'og:description', 'twitter:description']))

    uploader = video_details.get('author')

    # Uploader ID and URL
    uploader_mobj = re.search(
        r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',  # @fd05024
        webpage)
    if uploader_mobj is not None:
        uploader_id, uploader_url = uploader_mobj.group('uploader_id'), uploader_mobj.group('uploader_url')
    else:
        # @a6211d2
        uploader_url = url_or_none(microformats.get('ownerProfileUrl'))
        uploader_id = self._search_regex(
            r'(?:user|channel)/([^/]+)', uploader_url or '', 'uploader id', default=None)

    upload_date = unified_strdate(
        dict_get(microformats, ('uploadDate', 'publishDate'))
        or search_meta(['uploadDate', 'datePublished'])
        or self._search_regex(
            [r'(?s)id="eow-date.*?>(.*?)</span>',
             r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],  # @7998520
            webpage, 'upload date', default=None))

    return {
        'title': video_title,
        'description': description,
        'upload_date': upload_date,
        'uploader': uploader,
        'channel_id': channel_id,
        'channel_url': channel_url,
        'duration': duration,
        'uploader_url': uploader_url,
        'uploader_id': uploader_id,
    }
def _extract_thumbnails(self, video_id):
    """Return thumbnail dicts for captures of the video's thumbnails.

    By default only the default thumbnail servers and JPEG paths are
    queried and the search stops at the first server with results. With
    'thumbnails' in the 'check_all' extractor argument every known
    server and both JPEG and WebP paths are queried.
    """
    try_all = 'thumbnails' in self._configuration_arg('check_all')
    servers = self._YT_ALL_THUMB_SERVERS if try_all else self._YT_DEFAULT_THUMB_SERVERS
    extensions = ('jpg', 'webp') if try_all else ('jpg',)

    thumbnail_base_urls = []
    for server in servers:
        for ext in extensions:
            thumbnail_base_urls.append('http://{server}/vi{webp}/{video_id}'.format(
                webp='_webp' if ext == 'webp' else '', video_id=video_id, server=server))

    thumbnails = []
    for base_url in thumbnail_base_urls:
        captures = self._call_cdx_api(
            video_id, base_url, filters=['mimetype:image/(?:webp|jpeg)'],
            collapse=['urlkey'], query={'matchType': 'prefix'})
        if not captures:
            continue
        for capture in captures:
            capture_date = int_or_none(capture.get('timestamp')) or self._OLDEST_CAPTURE_DATE
            thumbnails.append({
                'url': (self._WAYBACK_BASE_URL % capture_date) + capture.get('original'),
                'filesize': int_or_none(capture.get('length')),
                # The capture's length doubles as its preference.
                'preference': int_or_none(capture.get('length')),
            })
        if not try_all:
            break

    self._remove_duplicate_formats(thumbnails)
    return thumbnails
def _get_capture_dates(self, video_id, url_date):
    """Return the capture timestamps to try for *video_id*, in priority order.

    The order is: the earliest capture from July 2020 onwards, the date
    taken from the input URL, the earliest capture overall, optionally
    every known capture (when the ``check_all`` extractor argument contains
    ``captures``), and finally the class-level oldest/newest fallback
    dates.  Falsy entries are dropped and the result is passed through
    ``orderedSet``.
    """
    # Note: CDX API will not find watch pages with extra params in the url.
    records = self._call_cdx_api(
        video_id, f'https://www.youtube.com/watch?v={video_id}',
        filters=['mimetype:text/html'], collapse=['timestamp:6', 'digest'], query={'matchType': 'prefix'}) or []
    timestamps = (int_or_none(record['timestamp']) for record in records)
    all_captures = sorted(ts for ts in timestamps if ts is not None)

    # Prefer the new polymer UI captures as we support extracting more metadata from them
    # WBM captures seem to all switch to this layout ~July 2020
    modern_captures = [ts for ts in all_captures if ts >= 20200701000000]

    # Slicing with [:1] yields "first element or nothing" without a branch.
    candidates = modern_captures[:1] + [url_date] + all_captures[:1]
    if 'captures' in self._configuration_arg('check_all'):
        candidates += modern_captures + all_captures

    # Fallbacks if any of the above fail
    candidates += [self._OLDEST_CAPTURE_DATE, self._NEWEST_CAPTURE_DATE]
    return orderedSet(filter(None, candidates))
def _real_extract(self, url):
    """Extract an archived YouTube video from the Wayback Machine.

    First a HEAD request resolves the archived video file URL; a 404 there
    is reported through ``raise_no_formats``.  Then the candidate capture
    dates are walked, merging metadata from every capture page that yields
    a title (only the first such page unless the ``check_all`` extractor
    argument contains ``captures``).  Finally thumbnails are attached and,
    when the HEAD request succeeded, a single format is built from the
    playback URL and the response headers.

    Returns the info dict; ``title`` falls back to the video id.
    """
    video_id, url_date, url_date_2 = self._match_valid_url(url).group('id', 'date', 'date2')
    url_date = url_date or url_date_2

    video_file_handle = None
    try:
        video_file_handle = self._request_webpage(
            HEADRequest('https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s' % video_id),
            video_id, note='Fetching archived video file url', expected_status=True)
    except ExtractorError as e:
        # HTTP Error 404 is expected if the video is not saved.
        is_not_found = isinstance(e.cause, compat_HTTPError) and e.cause.code == 404
        if not is_not_found:
            raise
        self.raise_no_formats(
            'The requested video is not archived, indexed, or there is an issue with web.archive.org',
            expected=True)

    capture_dates = self._get_capture_dates(video_id, int_or_none(url_date))
    self.write_debug('Captures to try: ' + join_nonempty(*capture_dates, delim=', '))

    info = {'id': video_id}
    for capture in capture_dates:
        webpage = self._download_webpage(
            (self._WAYBACK_BASE_URL + 'http://www.youtube.com/watch?v=%s') % (capture, video_id),
            video_id=video_id, fatal=False, errnote='unable to download capture webpage (it may not be archived)',
            note='Downloading capture webpage')
        current_info = self._extract_metadata(video_id, webpage or '')
        # Try avoid getting deleted video metadata
        if not current_info.get('title'):
            continue
        info = merge_dicts(info, current_info)
        if 'captures' not in self._configuration_arg('check_all'):
            break

    info['thumbnails'] = self._extract_thumbnails(video_id)

    if video_file_handle:
        playback_url = compat_urllib_parse_unquote(video_file_handle.geturl())
        playback_qs = parse_qs(playback_url)
        # Attempt to recover any ext & format info from playback url & response headers
        fmt = {
            'url': playback_url,
            'filesize': int_or_none(video_file_handle.headers.get('x-archive-orig-content-length')),
        }
        itag = try_get(playback_qs, lambda x: x['itag'][0])
        if itag and itag in YoutubeIE._formats:
            fmt.update(YoutubeIE._formats[itag])
            fmt.update({'format_id': itag})
        else:
            mime = try_get(playback_qs, lambda x: x['mime'][0])
            fmt.update({'ext': (
                mimetype2ext(mime)
                or urlhandle_detect_ext(video_file_handle)
                or mimetype2ext(video_file_handle.headers.get('x-archive-guessed-content-type')))})
        info['formats'] = [fmt]
        if not info.get('duration'):
            info['duration'] = str_to_int(try_get(playback_qs, lambda x: x['dur'][0]))

    if not info.get('title'):
        info['title'] = video_id
    return info

View File

@@ -0,0 +1,169 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
extract_attributes,
int_or_none,
parse_iso8601,
try_get,
)
class ArcPublishingIE(InfoExtractor):
_UUID_REGEX = r'[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12}'
_VALID_URL = r'arcpublishing:(?P<org>[a-z]+):(?P<id>%s)' % _UUID_REGEX
_TESTS = [{
# https://www.adn.com/politics/2020/11/02/video-senate-candidates-campaign-in-anchorage-on-eve-of-election-day/
'url': 'arcpublishing:adn:8c99cb6e-b29c-4bc9-9173-7bf9979225ab',
'only_matching': True,
}, {
# https://www.bostonglobe.com/video/2020/12/30/metro/footage-released-showing-officer-talking-about-striking-protesters-with-car/
'url': 'arcpublishing:bostonglobe:232b7ae6-7d73-432d-bc0a-85dbf0119ab1',
'only_matching': True,
}, {
# https://www.actionnewsjax.com/video/live-stream/
'url': 'arcpublishing:cmg:cfb1cf1b-3ab5-4d1b-86c5-a5515d311f2a',
'only_matching': True,
}, {
# https://elcomercio.pe/videos/deportes/deporte-total-futbol-peruano-seleccion-peruana-la-valorizacion-de-los-peruanos-en-el-exterior-tras-un-2020-atipico-nnav-vr-video-noticia/
'url': 'arcpublishing:elcomercio:27a7e1f8-2ec7-4177-874f-a4feed2885b3',
'only_matching': True,
}, {
# https://www.clickondetroit.com/video/community/2020/05/15/events-surrounding-woodward-dream-cruise-being-canceled/
'url': 'arcpublishing:gmg:c8793fb2-8d44-4242-881e-2db31da2d9fe',
'only_matching': True,
}, {
# https://www.wabi.tv/video/2020/12/30/trenton-company-making-equipment-pfizer-covid-vaccine/
'url': 'arcpublishing:gray:0b0ba30e-032a-4598-8810-901d70e6033e',
'only_matching': True,
}, {
# https://www.lateja.cr/el-mundo/video-china-aprueba-con-condiciones-su-primera/dfcbfa57-527f-45ff-a69b-35fe71054143/video/
'url': 'arcpublishing:gruponacion:dfcbfa57-527f-45ff-a69b-35fe71054143',
'only_matching': True,
}, {
# https://www.fifthdomain.com/video/2018/03/09/is-america-vulnerable-to-a-cyber-attack/
'url': 'arcpublishing:mco:aa0ca6fe-1127-46d4-b32c-be0d6fdb8055',
'only_matching': True,
}, {
# https://www.vl.no/kultur/2020/12/09/en-melding-fra-en-lytter-endret-julelista-til-lewi-bergrud/
'url': 'arcpublishing:mentormedier:47a12084-650b-4011-bfd0-3699b6947b2d',
'only_matching': True,
}, {
# https://www.14news.com/2020/12/30/whiskey-theft-caught-camera-henderson-liquor-store/
'url': 'arcpublishing:raycom:b89f61f8-79fa-4c09-8255-e64237119bf7',
'only_matching': True,
}, {
# https://www.theglobeandmail.com/world/video-ethiopian-woman-who-became-symbol-of-integration-in-italy-killed-on/
'url': 'arcpublishing:tgam:411b34c1-8701-4036-9831-26964711664b',
'only_matching': True,
}, {
# https://www.pilotonline.com/460f2931-8130-4719-8ea1-ffcb2d7cb685-132.html
'url': 'arcpublishing:tronc:460f2931-8130-4719-8ea1-ffcb2d7cb685',
'only_matching': True,
}]
_POWA_DEFAULTS = [
(['cmg', 'prisa'], '%s-config-prod.api.cdn.arcpublishing.com/video'),
([
'adn', 'advancelocal', 'answers', 'bonnier', 'bostonglobe', 'demo',
'gmg', 'gruponacion', 'infobae', 'mco', 'nzme', 'pmn', 'raycom',
'spectator', 'tbt', 'tgam', 'tronc', 'wapo', 'wweek',
], 'video-api-cdn.%s.arcpublishing.com/api'),
]
@staticmethod
def _extract_urls(webpage):
    """Return ``arcpublishing:<org>:<uuid>`` URLs for POWA player divs in *webpage*.

    A div is considered when its ``class`` contains the word ``powa`` and a
    later ``data-uuid`` attribute matches the class UUID pattern.  Divs
    lacking either ``data-org`` or ``data-uuid`` are ignored.
    """
    # https://arcpublishing.atlassian.net/wiki/spaces/POWA/overview
    powa_div_re = r'(<div[^>]+class="[^"]*\bpowa\b[^"]*"[^>]+data-uuid="%s"[^>]*>)' % ArcPublishingIE._UUID_REGEX
    entries = []
    for powa_tag in re.findall(powa_div_re, webpage):
        attrs = extract_attributes(powa_tag) or {}
        org, uuid = attrs.get('data-org'), attrs.get('data-uuid')
        if not (org and uuid):
            continue
        entries.append('arcpublishing:%s:%s' % (org, uuid))
    return entries
def _real_extract(self, url):
    """Extract a video from an ``arcpublishing:<org>:<uuid>`` URL.

    The API host template is looked up in ``_POWA_DEFAULTS`` by
    organisation, with a generic template as fallback; ``wapo`` is renamed
    to ``washpost`` before the template is filled in.  Every distinct
    stream URL of the first returned video is turned into formats depending
    on its ``stream_type`` (SMIL, HLS/TS, or a direct link).

    Returns the info dict for the video.
    """
    org, uuid = self._match_valid_url(url).groups()

    # First table entry listing this organisation wins; otherwise use the
    # generic host template.
    base_api_tmpl = next(
        (tmpl for orgs, tmpl in self._POWA_DEFAULTS if org in orgs),
        '%s-prod-cdn.video-api.arcpublishing.com/api')
    if org == 'wapo':
        org = 'washpost'
    api_url = 'https://%s/v1/ansvideos/findByUuid' % (base_api_tmpl % org)
    video = self._download_json(api_url, uuid, query={'uuid': uuid})[0]
    title = video['headlines']['basic']
    is_live = video.get('status') == 'live'

    seen_urls = []
    formats = []
    for stream in video.get('streams', []):
        stream_url = stream.get('url')
        if not stream_url or stream_url in seen_urls:
            continue
        seen_urls.append(stream_url)
        stream_type = stream.get('stream_type')

        if stream_type == 'smil':
            smil_formats = self._extract_smil_formats(
                stream_url, uuid, fatal=False)
            for fmt in smil_formats:
                # Only formats served from the '/cfx/st' application are adjusted.
                if not fmt['url'].endswith('/cfx/st'):
                    continue
                fmt['app'] = 'cfx/st'
                if not fmt['play_path'].startswith('mp4:'):
                    fmt['play_path'] = 'mp4:' + fmt['play_path']
                if isinstance(fmt['tbr'], float):
                    fmt['vbr'] = fmt['tbr'] * 1000
                    del fmt['tbr']
                    fmt['format_id'] = 'rtmp-%d' % fmt['vbr']
            formats.extend(smil_formats)
        elif stream_type in ('ts', 'hls'):
            m3u8_formats = self._extract_m3u8_formats(
                stream_url, uuid, 'mp4', 'm3u8' if is_live else 'm3u8_native',
                m3u8_id='hls', fatal=False)
            # Skip playlists whose formats all report no audio codec
            # (this also skips an empty result).
            if all(fmt.get('acodec') == 'none' for fmt in m3u8_formats):
                continue
            for fmt in m3u8_formats:
                height = fmt.get('height')
                if not height:
                    continue
                # The video bitrate is read from the URL, right after the height.
                vbr = self._search_regex(
                    r'[_x]%d[_-](\d+)' % height, fmt['url'], 'vbr', default=None)
                if vbr:
                    fmt['vbr'] = int(vbr)
            formats.extend(m3u8_formats)
        else:
            vbr = int_or_none(stream.get('bitrate'))
            formats.append({
                'format_id': '%s-%d' % (stream_type, vbr) if vbr else stream_type,
                'vbr': vbr,
                'width': int_or_none(stream.get('width')),
                'height': int_or_none(stream.get('height')),
                'filesize': int_or_none(stream.get('filesize')),
                'url': stream_url,
                'quality': -10,
            })
    self._sort_formats(formats)

    # All subtitle URLs are filed under 'en'; the key is only created when
    # at least one URL exists.
    subtitle_entries = try_get(video, lambda x: x['subtitles']['urls'], list) or []
    english = [
        {'url': subtitle_url}
        for subtitle_url in (entry.get('url') for entry in subtitle_entries)
        if subtitle_url]
    subtitles = {'en': english} if english else {}

    return {
        'id': uuid,
        'title': title,
        'thumbnail': try_get(video, lambda x: x['promo_image']['url']),
        'description': try_get(video, lambda x: x['subheadlines']['basic']),
        'formats': formats,
        'duration': int_or_none(video.get('duration'), 100),
        'timestamp': parse_iso8601(video.get('created_date')),
        'subtitles': subtitles,
        'is_live': is_live,
    }

View File

@@ -0,0 +1,640 @@
# coding: utf-8
from __future__ import unicode_literals
import json
import re
from .common import InfoExtractor
from .generic import GenericIE
from ..utils import (
determine_ext,
ExtractorError,
int_or_none,
parse_duration,
qualities,
str_or_none,
try_get,
unified_strdate,
unified_timestamp,
update_url_query,
url_or_none,
xpath_text,
)
from ..compat import compat_etree_fromstring
class ARDMediathekBaseIE(InfoExtractor):
_GEO_COUNTRIES = ['DE']
def _extract_media_info(self, media_info_url, webpage, video_id):
    """Download the media JSON from *media_info_url* and parse it.

    The ``fsk`` flag handed to ``_parse_media_info`` is true when the
    literal text ``"fsk"`` occurs in *webpage*.
    """
    media_json = self._download_json(
        media_info_url, video_id, 'Downloading media JSON')
    has_fsk_marker = '"fsk"' in webpage
    return self._parse_media_info(media_json, video_id, has_fsk_marker)
def _parse_media_info(self, media_info, video_id, fsk):
    """Turn an ARD media-collection dict into a partial info dict.

    When no formats are found, the reason is reported: an FSK restriction
    if *fsk* is truthy, otherwise geo-blocking if ``_geoblocked`` is set.

    Returns a dict with ``id``, ``duration``, ``thumbnail``, ``is_live``,
    ``formats`` and ``subtitles``.
    """
    formats = self._extract_formats(media_info, video_id)

    if not formats and fsk:
        self.raise_no_formats(
            'This video is only available after 20:00', expected=True)
    elif not formats and media_info.get('_geoblocked'):
        self.raise_geo_restricted(
            'This video is not available due to geoblocking',
            countries=self._GEO_COUNTRIES, metadata_available=True)

    self._sort_formats(formats)

    # A single subtitle track, filed under 'de' as TTML.
    subtitle_url = media_info.get('_subtitleUrl')
    subtitles = {'de': [{'ext': 'ttml', 'url': subtitle_url}]} if subtitle_url else {}

    return {
        'id': video_id,
        'duration': int_or_none(media_info.get('_duration')),
        'thumbnail': media_info.get('_previewImage'),
        'is_live': media_info.get('_isLive') is True,
        'formats': formats,
        'subtitles': subtitles,
    }
def _ARD_extract_episode_info(self, title):
"""Try to extract season/episode data from the title."""
res = {}
if not title:
return res
for pattern in [
# Pattern for title like "Homo sapiens (S06/E07) - Originalversion"
# from: https://www.ardmediathek.de/one/sendung/doctor-who/Y3JpZDovL3dkci5kZS9vbmUvZG9jdG9yIHdobw
r'.*(?P<ep_info> \(S(?P<season_number>\d+)/E(?P<episode_number>\d+)\)).*',
# E.g.: title="Fritjof aus Norwegen (2) (AD)"
# from: https://www.ardmediathek.de/ard/sammlung/der-krieg-und-ich/68cMkqJdllm639Skj4c7sS/
r'.*(?P<ep_info> \((?:Folge |Teil )?(?P<episode_number>\d+)(?:/\d+)?\)).*',
r'.*(?P<ep_info>Folge (?P<episode_number>\d+)(?:\:| -|) )\"(?P<episode>.+)\".*',
# E.g.: title="Folge 25/42: Symmetrie"
# from: https://www.ardmediathek.de/ard/video/grips-mathe/folge-25-42-symmetrie/ard-alpha/Y3JpZDovL2JyLmRlL3ZpZGVvLzMyYzI0ZjczLWQ1N2MtNDAxNC05ZmZhLTFjYzRkZDA5NDU5OQ/
# E.g.: title="Folge 1063 - Vertrauen"
# from: https://www.ardmediathek.de/ard/sendung/die-fallers/Y3JpZDovL3N3ci5kZS8yMzAyMDQ4/
r'.*(?P<ep_info>Folge (?P<episode_number>\d+)(?:/\d+)?(?:\:| -|) ).*',
]:
m = re.match(pattern, title)
if m:
groupdict = m.groupdict()
res['season_number'] = int_or_none(groupdict.get('season_number'))
res['episode_number'] = int_or_none(groupdict.get('episode_number'))
res['episode'] = str_or_none(groupdict.get('episode'))
# Build the episode title by removing numeric episode information:
if groupdict.get('ep_info') and not res['episode']:
res['episode'] = str_or_none(
title.replace(groupdict.get('ep_info'), ''))
if res['episode']:
res['episode'] = res['episode'].strip()
break
# As a fallback use the whole title as the episode name:
if not res.get('episode'):
res['episode'] = title.strip()
return res
def _extract_formats(self, media_info, video_id):
type_ = media_info.get('_type')
media_array = media_info.get('_mediaArray', [])
formats = []
for num, media in enumerate(media_array):
for stream in media.get('_mediaStreamArray', []):
stream_urls = stream.get('_stream')
if not stream_urls:
continue
if not isinstance(stream_urls, list):
stream_urls = [stream_urls]
quality = stream.get('_quality')
server = stream.get('_server')
for stream_url in stream_urls:
if not url_or_none(stream_url):
continue
ext = determine_ext(stream_url)
if quality != 'auto' and ext in ('f4m', 'm3u8'):
continue
if ext == 'f4m':
formats.extend(self._extract_f4m_formats(
update_url_query(stream_url, {
'hdcore': '3.1.1',
'plugin': 'aasp-3.1.1.69.124'
}), video_id, f4m_id='hds', fatal=False))
elif ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
stream_url, video_id, 'mp4', 'm3u8_native',
m3u8_id='hls', fatal=False))
else:
if server and server.startswith('rtmp'):
f = {
'url': server,
'play_path': stream_url,
'format_id': 'a%s-rtmp-%s' % (num, quality),
}
else:
f = {
'url': stream_url,
'format_id': 'a%s-%s-%s' % (num, ext, quality)
}
m = re.search(
r'_(?P<width>\d+)x(?P<height>\d+)\.mp4$',
stream_url)
if m:
f.update({
'width': int(m.group('width')),
'height': int(m.group('height')),
})
if type_ == 'audio':
f['vcodec'] = 'none'
formats.append(f)
return formats
class ARDMediathekIE(ARDMediathekBaseIE):
IE_NAME = 'ARD:mediathek'
_VALID_URL = r'^https?://(?:(?:(?:www|classic)\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de|one\.ard\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?'
_TESTS = [{
# available till 26.07.2022
'url': 'http://www.ardmediathek.de/tv/S%C3%9CDLICHT/Was-ist-die-Kunst-der-Zukunft-liebe-Ann/BR-Fernsehen/Video?bcastId=34633636&documentId=44726822',
'info_dict': {
'id': '44726822',
'ext': 'mp4',
'title': 'Was ist die Kunst der Zukunft, liebe Anna McCarthy?',
'description': 'md5:4ada28b3e3b5df01647310e41f3a62f5',
'duration': 1740,
},
'params': {
# m3u8 download
'skip_download': True,
}
}, {
'url': 'https://one.ard.de/tv/Mord-mit-Aussicht/Mord-mit-Aussicht-6-39-T%C3%B6dliche-Nach/ONE/Video?bcastId=46384294&documentId=55586872',
'only_matching': True,
}, {
# audio
'url': 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086',
'only_matching': True,
}, {
'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht',
'only_matching': True,
}, {
# audio
'url': 'http://mediathek.rbb-online.de/radio/Hörspiel/Vor-dem-Fest/kulturradio/Audio?documentId=30796318&topRessort=radio&bcastId=9839158',
'only_matching': True,
}, {
'url': 'https://classic.ardmediathek.de/tv/Panda-Gorilla-Co/Panda-Gorilla-Co-Folge-274/Das-Erste/Video?bcastId=16355486&documentId=58234698',
'only_matching': True,
}]
@classmethod
def suitable(cls, url):
    """Return whether this extractor should handle *url*.

    URLs that ``ARDBetaMediathekIE`` accepts are always declined here;
    everything else is decided by the inherited ``suitable``.
    """
    if ARDBetaMediathekIE.suitable(url):
        return False
    return super(ARDMediathekIE, cls).suitable(url)
def _real_extract(self, url):
    """Extract a video from a classic ARD Mediathek page.

    The video id is the ``documentId`` query parameter when present,
    otherwise the ``video_id`` group of the URL.  Known error banners in
    the page raise an expected ``ExtractorError``.  URLs carrying an
    ``rss`` parameter are handed to the generic RSS extractor when the
    page parses as an ``rss`` document.  Formats come either from
    ``mediaCollection.addMediaStream`` calls found in the page or from the
    media JSON endpoint.

    Returns the info dict, enriched with season/episode data parsed from
    the title.
    """
    # determine video id from url
    mobj = self._match_valid_url(url)

    numid = re.search(r'documentId=([0-9]+)', url)
    if numid:
        document_id = video_id = numid.group(1)
    else:
        document_id, video_id = None, mobj.group('video_id')

    webpage = self._download_webpage(url, video_id)

    known_errors = (
        ('>Leider liegt eine Störung vor.', 'Video %s is unavailable'),
        ('>Der gewünschte Beitrag ist nicht mehr verfügbar.<',
         'Video %s is no longer available'),
    )
    for marker, message in known_errors:
        if marker in webpage:
            raise ExtractorError(message % video_id, expected=True)

    if re.search(r'[\?&]rss($|[=&])', url):
        doc = compat_etree_fromstring(webpage.encode('utf-8'))
        if doc.tag == 'rss':
            return GenericIE()._extract_rss(url, video_id, doc)

    title = self._og_search_title(webpage, default=None)
    if not title:
        title = self._html_search_regex(
            [r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>',
             r'<meta name="dcterms\.title" content="(.*?)"/>',
             r'<h4 class="headline">(.*?)</h4>',
             r'<title[^>]*>(.*?)</title>'],
            webpage, 'title')

    # Description sources, tried in order.  Note that only the first step
    # treats an empty string as "missing"; the later ones test for None.
    description = self._og_search_description(webpage, default=None)
    if not description:
        description = self._html_search_meta(
            'dcterms.abstract', webpage, 'description', default=None)
    if description is None:
        description = self._html_search_meta(
            'description', webpage, 'meta description', default=None)
    if description is None:
        description = self._html_search_regex(
            r'<p\s+class="teasertext">(.+?)</p>',
            webpage, 'teaser text', default=None)

    # Thumbnail is sometimes not present.
    # It is in the mobile version, but that seems to use a different URL
    # structure altogether.
    thumbnail = self._og_search_thumbnail(webpage, default=None)

    # Verbose-mode ((?x)) pattern; whitespace inside the literal is ignored
    # by the regex engine.
    media_streams = re.findall(r'''(?x)
mediaCollection\.addMediaStream\([0-9]+,\s*[0-9]+,\s*"[^"]*",\s*
"([^"]+)"''', webpage)

    if media_streams:
        quality_rank = qualities(['lo', 'hi', 'hq'])
        formats = []
        for furl in set(media_streams):
            if furl.endswith('.f4m'):
                fid = 'f4m'
            else:
                # The format id is the second-to-last dot-separated part of the URL.
                fid_m = re.match(r'.*\.([^.]+)\.[^.]+$', furl)
                fid = fid_m.group(1) if fid_m else None
            formats.append({
                'quality': quality_rank(fid),
                'format_id': fid,
                'url': furl,
            })
        self._sort_formats(formats)
        info = {
            'formats': formats,
        }
    else:
        # request JSON file
        if not document_id:
            video_id = self._search_regex(
                (r'/play/(?:config|media|sola)/(\d+)', r'contentId["\']\s*:\s*(\d+)'),
                webpage, 'media id', default=None)
        info = self._extract_media_info(
            'http://www.ardmediathek.de/play/media/%s' % video_id,
            webpage, video_id)

    info.update({
        'id': video_id,
        'title': title,
        'description': description,
        'thumbnail': thumbnail,
    })
    info.update(self._ARD_extract_episode_info(info['title']))

    return info
class ARDIE(InfoExtractor):
_VALID_URL = r'(?P<mainurl>https?://(?:www\.)?daserste\.de/(?:[^/?#&]+/)+(?P<id>[^/?#&]+))\.html'
_TESTS = [{
# available till 7.01.2022
'url': 'https://www.daserste.de/information/talk/maischberger/videos/maischberger-die-woche-video100.html',
'md5': '867d8aa39eeaf6d76407c5ad1bb0d4c1',
'info_dict': {
'id': 'maischberger-die-woche-video100',
'display_id': 'maischberger-die-woche-video100',
'ext': 'mp4',
'duration': 3687.0,
'title': 'maischberger. die woche vom 7. Januar 2021',
'upload_date': '20210107',
'thumbnail': r're:^https?://.*\.jpg$',
},
}, {
'url': 'https://www.daserste.de/information/politik-weltgeschehen/morgenmagazin/videosextern/dominik-kahun-aus-der-nhl-direkt-zur-weltmeisterschaft-100.html',
'only_matching': True,
}, {
'url': 'https://www.daserste.de/information/nachrichten-wetter/tagesthemen/videosextern/tagesthemen-17736.html',
'only_matching': True,
}, {
'url': 'https://www.daserste.de/unterhaltung/serie/in-aller-freundschaft-die-jungen-aerzte/videos/diversity-tag-sanam-afrashteh100.html',
'only_matching': True,
}, {
'url': 'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html',
'only_matching': True,
}, {
'url': 'https://www.daserste.de/unterhaltung/serie/in-aller-freundschaft-die-jungen-aerzte/Drehpause-100.html',
'only_matching': True,
}, {
'url': 'https://www.daserste.de/unterhaltung/film/filmmittwoch-im-ersten/videos/making-ofwendezeit-video-100.html',
'only_matching': True,
}]
def _real_extract(self, url):
    """Extract a video from a daserste.de page via its ``~playerXml.xml`` document.

    Each ``asset`` element with a ``fileName`` becomes one or more
    formats: m3u8 and f4m URLs are expanded into HLS/HDS formats, other
    assets become a single format that is either served through the
    asset's ``serverPrefix`` (with the file name as play path) or fetched
    directly from the file URL.  Timed-text nodes are collected as German
    subtitles.

    Returns the info dict for the video.
    """
    mobj = self._match_valid_url(url)
    display_id = mobj.group('id')

    player_url = mobj.group('mainurl') + '~playerXml.xml'
    doc = self._download_xml(player_url, display_id)
    video_node = doc.find('./video')
    upload_date = unified_strdate(xpath_text(
        video_node, './broadcastDate'))
    thumbnail = xpath_text(video_node, './/teaserImage//variant/url')

    formats = []
    for a in video_node.findall('.//asset'):
        file_name = xpath_text(a, './fileName', default=None)
        if not file_name:
            continue
        format_type = a.attrib.get('type')
        format_url = url_or_none(file_name)
        if format_url:
            ext = determine_ext(file_name)
            if ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    format_url, display_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id=format_type or 'hls', fatal=False))
                continue
            elif ext == 'f4m':
                formats.extend(self._extract_f4m_formats(
                    update_url_query(format_url, {'hdcore': '3.7.0'}),
                    display_id, f4m_id=format_type or 'hds', fatal=False))
                continue
        f = {
            'format_id': format_type,
            'width': int_or_none(xpath_text(a, './frameWidth')),
            'height': int_or_none(xpath_text(a, './frameHeight')),
            'vbr': int_or_none(xpath_text(a, './bitrateVideo')),
            'abr': int_or_none(xpath_text(a, './bitrateAudio')),
            'vcodec': xpath_text(a, './codecVideo'),
            'tbr': int_or_none(xpath_text(a, './totalBitrate')),
        }
        server_prefix = xpath_text(a, './serverPrefix', default=None)
        if server_prefix:
            f.update({
                'url': server_prefix,
                # Use the same 'play_path' key as the other extractors in
                # this file; this block previously wrote 'playpath'.
                'play_path': file_name,
            })
        else:
            # Without a server prefix the file name itself must be a URL.
            if not format_url:
                continue
            f['url'] = format_url
        formats.append(f)
    self._sort_formats(formats)

    _SUB_FORMATS = (
        ('./dataTimedText', 'ttml'),
        ('./dataTimedTextNoOffset', 'ttml'),
        ('./dataTimedTextVtt', 'vtt'),
    )

    subtitles = {}
    for subsel, subext in _SUB_FORMATS:
        for node in video_node.findall(subsel):
            subtitles.setdefault('de', []).append({
                'url': node.attrib['url'],
                'ext': subext,
            })

    return {
        'id': xpath_text(video_node, './videoId', default=display_id),
        'formats': formats,
        'subtitles': subtitles,
        'display_id': display_id,
        'title': video_node.find('./title').text,
        'duration': parse_duration(video_node.find('./duration').text),
        'upload_date': upload_date,
        'thumbnail': thumbnail,
    }
class ARDBetaMediathekIE(ARDMediathekBaseIE):
_VALID_URL = r'''(?x)https://
(?:(?:beta|www)\.)?ardmediathek\.de/
(?:(?P<client>[^/]+)/)?
(?:player|live|video|(?P<playlist>sendung|sammlung))/
(?:(?P<display_id>[^?#]+)/)?
(?P<id>(?(playlist)|Y3JpZDovL)[a-zA-Z0-9]+)'''
_TESTS = [{
'url': 'https://www.ardmediathek.de/mdr/video/die-robuste-roswita/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy84MWMxN2MzZC0wMjkxLTRmMzUtODk4ZS0wYzhlOWQxODE2NGI/',
'md5': 'a1dc75a39c61601b980648f7c9f9f71d',
'info_dict': {
'display_id': 'die-robuste-roswita',
'id': '78566716',
'title': 'Die robuste Roswita',
'description': r're:^Der Mord.*totgeglaubte Ehefrau Roswita',
'duration': 5316,
'thumbnail': 'https://img.ardmediathek.de/standard/00/78/56/67/84/575672121/16x9/960?mandant=ard',
'timestamp': 1596658200,
'upload_date': '20200805',
'ext': 'mp4',
},
'skip': 'Error',
}, {
'url': 'https://www.ardmediathek.de/video/tagesschau-oder-tagesschau-20-00-uhr/das-erste/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhZ2Vzc2NoYXUvZmM4ZDUxMjgtOTE0ZC00Y2MzLTgzNzAtNDZkNGNiZWJkOTll',
'md5': 'f1837e563323b8a642a8ddeff0131f51',
'info_dict': {
'id': '10049223',
'ext': 'mp4',
'title': 'tagesschau, 20:00 Uhr',
'timestamp': 1636398000,
'description': 'md5:39578c7b96c9fe50afdf5674ad985e6b',
'upload_date': '20211108',
},
}, {
'url': 'https://beta.ardmediathek.de/ard/video/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE',
'only_matching': True,
}, {
'url': 'https://ardmediathek.de/ard/video/saartalk/saartalk-gesellschaftsgift-haltung-gegen-hass/sr-fernsehen/Y3JpZDovL3NyLW9ubGluZS5kZS9TVF84MTY4MA/',
'only_matching': True,
}, {
'url': 'https://www.ardmediathek.de/ard/video/trailer/private-eyes-s01-e01/one/Y3JpZDovL3dkci5kZS9CZWl0cmFnLTE1MTgwYzczLWNiMTEtNGNkMS1iMjUyLTg5MGYzOWQxZmQ1YQ/',
'only_matching': True,
}, {
'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3N3ci5kZS9hZXgvbzEwNzE5MTU/',
'only_matching': True,
}, {
'url': 'https://www.ardmediathek.de/swr/live/Y3JpZDovL3N3ci5kZS8xMzQ4MTA0Mg',
'only_matching': True,
}, {
# playlist of type 'sendung'
'url': 'https://www.ardmediathek.de/ard/sendung/doctor-who/Y3JpZDovL3dkci5kZS9vbmUvZG9jdG9yIHdobw/',
'only_matching': True,
}, {
# playlist of type 'sammlung'
'url': 'https://www.ardmediathek.de/ard/sammlung/team-muenster/5JpTzLSbWUAK8184IOvEir/',
'only_matching': True,
}, {
'url': 'https://www.ardmediathek.de/video/coronavirus-update-ndr-info/astrazeneca-kurz-lockdown-und-pims-syndrom-81/ndr/Y3JpZDovL25kci5kZS84NzE0M2FjNi0wMWEwLTQ5ODEtOTE5NS1mOGZhNzdhOTFmOTI/',
'only_matching': True,
}, {
'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3dkci5kZS9CZWl0cmFnLWQ2NDJjYWEzLTMwZWYtNGI4NS1iMTI2LTU1N2UxYTcxOGIzOQ/tatort-duo-koeln-leipzig-ihr-kinderlein-kommet',
'only_matching': True,
}]
def _ARD_load_playlist_snipped(self, playlist_id, display_id, client, mode, pageNumber):
    """ Query the ARD server for one page of playlist information
    and return the data in "raw" format.

    *mode* ``'sendung'`` queries ``showPage``; any other value is treated
    as ``'sammlung'`` and queries ``morePage``.  In both cases the returned
    dict is the part that holds ``pagination`` and ``teasers``. """
    is_show = mode == 'sendung'
    if is_show:
        query_template = '''{
showPage(
client: "%s"
showId: "%s"
pageNumber: %d
) {
pagination {
pageSize
totalElements
}
teasers { # Array
mediumTitle
links { target { id href title } }
type
}
}}'''
    else:
        # mode == 'sammlung'
        query_template = '''{
morePage(
client: "%s"
compilationId: "%s"
pageNumber: %d
) {
widget {
pagination {
pageSize
totalElements
}
teasers { # Array
mediumTitle
links { target { id href title } }
type
}
}
}}'''
    graphQL = json.dumps({
        'query': query_template % (client, playlist_id, pageNumber),
    }).encode()

    # Resources for ARD graphQL debugging:
    # https://api-test.ardmediathek.de/public-gateway
    response = self._download_json(
        'https://api.ardmediathek.de/public-gateway',
        '[Playlist] %s' % display_id,
        data=graphQL,
        headers={'Content-Type': 'application/json'})['data']

    # align the structure of the returned data:
    if is_show:
        return response['showPage']
    return response['morePage']['widget']
def _ARD_extract_playlist(self, url, playlist_id, display_id, client, mode):
    """ Collects all playlist entries and returns them as info dict.

    Supports playlists of mode 'sendung' and 'sammlung', and also nested
    playlists.  Pages are fetched one after another until the number of
    items covered (page size times pages fetched) reaches the total
    reported by the server. """
    entries = []
    pageNumber = 0
    while True:
        show_page = self._ARD_load_playlist_snipped(
            playlist_id, display_id, client, mode, pageNumber)

        for teaser in show_page['teasers']:
            target = teaser['links']['target']
            # A '/compilation/' link is a nested compilation, e.g. like:
            # https://www.ardmediathek.de/ard/sammlung/die-kirche-bleibt-im-dorf/5eOHzt8XB2sqeFXbIoJlg2/
            # (alternative condition: teaser['type'] == "compilation")
            link_mode = 'sammlung' if '/compilation/' in target['href'] else 'video'

            # perform URL quoting of the episode title similar to ARD:
            slug = (target['title'].lower()
                    .replace('ä', 'ae').replace('ö', 'oe')
                    .replace('ü', 'ue').replace('ß', 'ss'))
            slug = re.sub('[^a-zA-Z0-9]+', '-', slug)  # replace special chars by -
            slug = re.sub('^-|-$', '', slug)  # remove '-' from begin/end

            item_url = 'https://www.ardmediathek.de/%s/%s/%s/%s/%s' % (
                client, link_mode, display_id, slug, target['id'])
            entries.append(self.url_result(
                item_url,
                ie=ARDBetaMediathekIE.ie_key()))

        pageNumber += 1
        pagination = show_page['pagination']
        if pagination['pageSize'] * pageNumber >= pagination['totalElements']:
            # we've processed enough pages to get all playlist entries
            break

    return self.playlist_result(entries, playlist_title=display_id)
def _real_extract(self, url):
    """Extract a video or playlist from the current ARD Mediathek.

    Playlist URLs ('sendung' / 'sammlung') are delegated to
    ``_ARD_extract_playlist``.  For single videos the ``playerPage``
    GraphQL query is issued; when it carries no media collection but a
    content id is known, the media JSON endpoint is tried instead.  The
    age limit is taken from ``maturityContentRating`` or, failing that,
    from a trailing "(FSK nn)" in the synopsis.

    Returns the info dict (or playlist result).
    """
    video_id, display_id, playlist_type, client = self._match_valid_url(url).group(
        'id', 'display_id', 'playlist', 'client')
    display_id = display_id or video_id
    client = client or 'ard'

    if playlist_type:
        return self._ARD_extract_playlist(url, video_id, display_id, client, playlist_type)

    player_page_query = '''{
playerPage(client:"%s", clipId: "%s") {
blockedByFsk
broadcastedOn
maturityContentRating
mediaCollection {
_duration
_geoblocked
_isLive
_mediaArray {
_mediaStreamArray {
_quality
_server
_stream
}
}
_previewImage
_subtitleUrl
_type
}
show {
title
}
synopsis
title
tracking {
atiCustomVars {
contentId
}
}
}
}''' % (client, video_id)
    payload = json.dumps({
        'query': player_page_query,
    }).encode()
    player_page = self._download_json(
        'https://api.ardmediathek.de/public-gateway',
        display_id, data=payload, headers={
            'Content-Type': 'application/json'
        })['data']['playerPage']

    title = player_page['title']
    content_id = str_or_none(try_get(
        player_page, lambda x: x['tracking']['atiCustomVars']['contentId']))

    media_collection = player_page.get('mediaCollection') or {}
    if not media_collection and content_id:
        # Fall back to the media JSON endpoint, keyed by the content id.
        media_collection = self._download_json(
            'https://www.ardmediathek.de/play/media/' + content_id,
            content_id, fatal=False) or {}
    info = self._parse_media_info(
        media_collection, content_id or video_id,
        player_page.get('blockedByFsk'))

    description = player_page.get('synopsis')
    age_limit = None
    rating = player_page.get('maturityContentRating')
    if rating:
        # lstrip removes any leading 'F', 'S' or 'K' characters.
        age_limit = int_or_none(rating.lstrip('FSK'))
    if not age_limit and description:
        age_limit = int_or_none(self._search_regex(
            r'\(FSK\s*(\d+)\)\s*$', description, 'age limit', default=None))

    info.update({
        'age_limit': age_limit,
        'display_id': display_id,
        'title': title,
        'description': description,
        'timestamp': unified_timestamp(player_page.get('broadcastedOn')),
        'series': try_get(player_page, lambda x: x['show']['title']),
    })
    info.update(self._ARD_extract_episode_info(info['title']))
    return info

View File

@@ -0,0 +1,163 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
float_or_none,
int_or_none,
parse_iso8601,
parse_qs,
try_get,
)
class ArkenaIE(InfoExtractor):
    """Extractor for Arkena / Qbrick player URLs.

    Both the ``play2/embed/player`` query-string form and the
    ``play.arkena.com/.../player/media/<id>/<x>/<account>`` path form are
    accepted; metadata is fetched from the Qbrick public media API.
    """

    _VALID_URL = r'''(?x)
                        https?://
                            (?:
                                video\.(?:arkena|qbrick)\.com/play2/embed/player\?|
                                play\.arkena\.com/(?:config|embed)/avp/v\d/player/media/(?P<id>[^/]+)/[^/]+/(?P<account_id>\d+)
                            )
                        '''
    _TESTS = [{
        'url': 'https://video.qbrick.com/play2/embed/player?accountId=1034090&mediaId=d8ab4607-00090107-aab86310',
        'md5': '97f117754e5f3c020f5f26da4a44ebaf',
        'info_dict': {
            'id': 'd8ab4607-00090107-aab86310',
            'ext': 'mp4',
            'title': 'EM_HT20_117_roslund_v2.mp4',
            'timestamp': 1608285912,
            'upload_date': '20201218',
            'duration': 1429.162667,
            'subtitles': {
                'sv': 'count:3',
            },
        },
    }, {
        'url': 'https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411',
        'only_matching': True,
    }, {
        'url': 'https://play.arkena.com/config/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411/?callbackMethod=jQuery1111023664739129262213_1469227693893',
        'only_matching': True,
    }, {
        'url': 'http://play.arkena.com/config/avp/v1/player/media/327336/darkmatter/131064/?callbackMethod=jQuery1111002221189684892677_1469227595972',
        'only_matching': True,
    }, {
        'url': 'http://play.arkena.com/embed/avp/v1/player/media/327336/darkmatter/131064/',
        'only_matching': True,
    }, {
        'url': 'http://video.arkena.com/play2/embed/player?accountId=472718&mediaId=35763b3b-00090078-bf604299&pageStyling=styled',
        'only_matching': True,
    }]

    @staticmethod
    def _extract_url(webpage):
        """Return the first play.arkena.com embed iframe URL in webpage, or None."""
        # See https://support.arkena.com/display/PLAY/Ways+to+embed+your+video
        mobj = re.search(
            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//play\.arkena\.com/embed/avp/.+?)\1',
            webpage)
        if mobj:
            return mobj.group('url')

    def _real_extract(self, url):
        """Build the info dict for one media item.

        Raises ExtractorError (expected) when neither the URL path nor its
        query string yields both a media id and an account id.
        """
        mobj = self._match_valid_url(url)
        video_id = mobj.group('id')
        account_id = mobj.group('account_id')

        # Handle http://video.arkena.com/play2/embed/player URL
        if not video_id:
            qs = parse_qs(url)
            video_id = qs.get('mediaId', [None])[0]
            account_id = qs.get('accountId', [None])[0]
            if not video_id or not account_id:
                raise ExtractorError('Invalid URL', expected=True)

        media = self._download_json(
            'https://video.qbrick.com/api/v1/public/accounts/%s/medias/%s' % (account_id, video_id),
            video_id, query={
                # https://video.qbrick.com/docs/api/examples/library-api.html
                'fields': 'asset/resources/*/renditions/*(height,id,language,links/*(href,mimeType),type,size,videos/*(audios/*(codec,sampleRate),bitrate,codec,duration,height,width),width),created,metadata/*(title,description),tags',
            })
        metadata = media.get('metadata') or {}
        title = metadata['title']

        duration = None
        formats = []
        thumbnails = []
        subtitles = {}
        for resource in media['asset']['resources']:
            for rendition in (resource.get('renditions') or []):
                rendition_type = rendition.get('type')
                for i, link in enumerate(rendition.get('links') or []):
                    href = link.get('href')
                    if not href:
                        continue
                    if rendition_type == 'image':
                        thumbnails.append({
                            'filesize': int_or_none(rendition.get('size')),
                            'height': int_or_none(rendition.get('height')),
                            'id': rendition.get('id'),
                            'url': href,
                            'width': int_or_none(rendition.get('width')),
                        })
                    elif rendition_type == 'subtitle':
                        subtitles.setdefault(rendition.get('language') or 'en', []).append({
                            'url': href,
                        })
                    elif rendition_type == 'video':
                        f = {
                            'filesize': int_or_none(rendition.get('size')),
                            'format_id': rendition.get('id'),
                            'url': href,
                        }
                        # The i-th link is paired with the i-th entry of 'videos'
                        video = try_get(rendition, lambda x: x['videos'][i], dict)
                        if video:
                            if not duration:
                                duration = float_or_none(video.get('duration'))
                            f.update({
                                'height': int_or_none(video.get('height')),
                                'tbr': int_or_none(video.get('bitrate'), 1000),
                                'vcodec': video.get('codec'),
                                'width': int_or_none(video.get('width')),
                            })
                            audio = try_get(video, lambda x: x['audios'][0], dict)
                            if audio:
                                f.update({
                                    'acodec': audio.get('codec'),
                                    'asr': int_or_none(audio.get('sampleRate')),
                                })
                        formats.append(f)
                    elif rendition_type == 'index':
                        # Manifest links: pick the parser from the MIME type
                        mime_type = link.get('mimeType')
                        if mime_type == 'application/smil+xml':
                            formats.extend(self._extract_smil_formats(
                                href, video_id, fatal=False))
                        elif mime_type == 'application/x-mpegURL':
                            formats.extend(self._extract_m3u8_formats(
                                href, video_id, 'mp4', 'm3u8_native',
                                m3u8_id='hls', fatal=False))
                        elif mime_type == 'application/hds+xml':
                            formats.extend(self._extract_f4m_formats(
                                href, video_id, f4m_id='hds', fatal=False))
                        elif mime_type == 'application/dash+xml':
                            # DASH manifests need the MPD parser, not the f4m one
                            formats.extend(self._extract_mpd_formats(
                                href, video_id, mpd_id='dash', fatal=False))
                        elif mime_type == 'application/vnd.ms-sstr+xml':
                            formats.extend(self._extract_ism_formats(
                                href, video_id, ism_id='mss', fatal=False))
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'description': metadata.get('description'),
            'timestamp': parse_iso8601(media.get('created')),
            'thumbnails': thumbnails,
            'subtitles': subtitles,
            'duration': duration,
            'tags': media.get('tags'),
            'formats': formats,
        }

View File

@@ -0,0 +1,102 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import (
compat_parse_qs,
compat_urllib_parse_urlparse,
)
from ..utils import (
format_field,
float_or_none,
int_or_none,
parse_iso8601,
remove_start,
)
class ArnesIE(InfoExtractor):
    """Extractor for video.arnes.si watch, embed and API URLs."""

    IE_NAME = 'video.arnes.si'
    IE_DESC = 'Arnes Video'
    _VALID_URL = r'https?://video\.arnes\.si/(?:[a-z]{2}/)?(?:watch|embed|api/(?:asset|public/video))/(?P<id>[0-9a-zA-Z]{12})'
    _TESTS = [{
        'url': 'https://video.arnes.si/watch/a1qrWTOQfVoU?t=10',
        'md5': '4d0f4d0a03571b33e1efac25fd4a065d',
        'info_dict': {
            'id': 'a1qrWTOQfVoU',
            'ext': 'mp4',
            'title': 'Linearna neodvisnost, definicija',
            'description': 'Linearna neodvisnost, definicija',
            'license': 'PRIVATE',
            'creator': 'Polona Oblak',
            'timestamp': 1585063725,
            'upload_date': '20200324',
            'channel': 'Polona Oblak',
            'channel_id': 'q6pc04hw24cj',
            'channel_url': 'https://video.arnes.si/?channel=q6pc04hw24cj',
            'duration': 596.75,
            'view_count': int,
            'tags': ['linearna_algebra'],
            'start_time': 10,
        }
    }, {
        'url': 'https://video.arnes.si/api/asset/s1YjnV7hadlC/play.mp4',
        'only_matching': True,
    }, {
        'url': 'https://video.arnes.si/embed/s1YjnV7hadlC',
        'only_matching': True,
    }, {
        'url': 'https://video.arnes.si/en/watch/s1YjnV7hadlC',
        'only_matching': True,
    }, {
        'url': 'https://video.arnes.si/embed/s1YjnV7hadlC?t=123&hideRelated=1',
        'only_matching': True,
    }, {
        'url': 'https://video.arnes.si/api/public/video/s1YjnV7hadlC',
        'only_matching': True,
    }]
    _BASE_URL = 'https://video.arnes.si'

    def _real_extract(self, url):
        """Fetch the public video JSON and map it to an info dict.

        The optional ``t`` query parameter of the URL becomes ``start_time``.
        """
        video_id = self._match_id(url)

        video = self._download_json(
            self._BASE_URL + '/api/public/video/' + video_id, video_id)['data']
        title = video['title']

        formats = []
        for media in (video.get('media') or []):
            media_url = media.get('url')
            if not media_url:
                continue
            formats.append({
                # Media URLs are concatenated onto the site base URL
                'url': self._BASE_URL + media_url,
                'format_id': remove_start(media.get('format'), 'FORMAT_'),
                'format_note': media.get('formatTranslation'),
                'width': int_or_none(media.get('width')),
                'height': int_or_none(media.get('height')),
            })
        self._sort_formats(formats)

        channel = video.get('channel') or {}
        channel_id = channel.get('url')
        thumbnail = video.get('thumbnailUrl')

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            # thumbnailUrl is optional; avoid str + None when it is missing
            'thumbnail': self._BASE_URL + thumbnail if thumbnail else None,
            'description': video.get('description'),
            'license': video.get('license'),
            'creator': video.get('author'),
            'timestamp': parse_iso8601(video.get('creationTime')),
            'channel': channel.get('name'),
            'channel_id': channel_id,
            'channel_url': format_field(channel_id, template=f'{self._BASE_URL}/?channel=%s'),
            'duration': float_or_none(video.get('duration'), 1000),
            'view_count': int_or_none(video.get('views')),
            'tags': video.get('hashtags'),
            'start_time': int_or_none(compat_parse_qs(
                compat_urllib_parse_urlparse(url).query).get('t', [None])[0]),
        }

View File

@@ -0,0 +1,255 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import (
compat_str,
)
from ..utils import (
ExtractorError,
int_or_none,
parse_qs,
qualities,
try_get,
unified_strdate,
url_or_none,
)
class ArteTVBaseIE(InfoExtractor):
    # Shared constants for the arte.tv extractors that subclass this base.

    # Regex alternation of the language codes accepted in arte.tv URLs
    _ARTE_LANGUAGES = 'fr|de|en|es|it|pl'
    # Root of the player API; subclasses append '/config/...' or '/collectionData/...'
    _API_BASE = 'https://api.arte.tv/api/player/v1'
class ArteTVIE(ArteTVBaseIE):
    """Extractor for single arte.tv programmes (site pages and player config URLs)."""

    _VALID_URL = r'''(?x)
                    https?://
                        (?:
                            (?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos|
                            api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s)
                        )
                        /(?P<id>\d{6}-\d{3}-[AF])
                    ''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES}
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
        'info_dict': {
            'id': '088501-000-A',
            'ext': 'mp4',
            'title': 'Mexico: Stealing Petrol to Survive',
            'upload_date': '20190628',
        },
    }, {
        'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/',
        'only_matching': True,
    }, {
        'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Download the player config for the programme and build its info dict.

        Raises ExtractorError (expected) when the config carries no 'VSR'
        stream dict; the API-provided error message is used when present.
        """
        mobj = self._match_valid_url(url)
        video_id = mobj.group('id')
        lang = mobj.group('lang') or mobj.group('lang_2')

        info = self._download_json(
            '%s/config/%s/%s' % (self._API_BASE, lang, video_id), video_id)
        player_info = info['videoJsonPlayer']

        vsr = try_get(player_info, lambda x: x['VSR'], dict)
        if not vsr:
            error = None
            if try_get(player_info, lambda x: x['custom_msg']['type']) == 'error':
                error = try_get(
                    player_info, lambda x: x['custom_msg']['msg'], compat_str)
            if not error:
                # Parenthesised: '%' binds tighter than 'or', so without the
                # parentheses the video_id fallback could never be used
                error = 'Video %s is not available' % (player_info.get('VID') or video_id)
            raise ExtractorError(error, expected=True)

        upload_date_str = player_info.get('shootingDate')
        if not upload_date_str:
            upload_date_str = (player_info.get('VRA') or player_info.get('VDA') or '').split(' ')[0]

        title = (player_info.get('VTI') or player_info['VID']).strip()
        # 'VSU' may be present but null; treat that like a missing subtitle
        subtitle = (player_info.get('VSU') or '').strip()
        if subtitle:
            title += ' - %s' % subtitle

        qfunc = qualities(['MQ', 'HQ', 'EQ', 'SQ'])

        LANGS = {
            'fr': 'F',
            'de': 'A',
            'en': 'E[ANG]',
            'es': 'E[ESP]',
            'it': 'E[ITA]',
            'pl': 'E[POL]',
        }

        langcode = LANGS.get(lang, lang)
        escaped_lang = re.escape(langcode)

        # Language preference from most to least priority
        # Reference: section 6.8 of
        # https://www.arte.tv/sites/en/corporate/files/complete-technical-guidelines-arte-geie-v1-07-1.pdf
        # The table depends only on the requested language, so it is compiled
        # once here instead of once per format.
        preferences = tuple(re.compile(template.format(escaped_lang)) for template in (
            # original version in requested language, without subtitles
            r'VO{0}$',
            # original version in requested language, with partial subtitles in requested language
            r'VO{0}-ST{0}$',
            # original version in requested language, with subtitles for the deaf and hard-of-hearing in requested language
            r'VO{0}-STM{0}$',
            # non-original (dubbed) version in requested language, without subtitles
            r'V{0}$',
            # non-original (dubbed) version in requested language, with subtitles partial subtitles in requested language
            r'V{0}-ST{0}$',
            # non-original (dubbed) version in requested language, with subtitles for the deaf and hard-of-hearing in requested language
            r'V{0}-STM{0}$',
            # original version in requested language, with partial subtitles in different language
            r'VO{0}-ST(?!{0}).+?$',
            # original version in requested language, with subtitles for the deaf and hard-of-hearing in different language
            r'VO{0}-STM(?!{0}).+?$',
            # original version in different language, with partial subtitles in requested language
            r'VO(?:(?!{0}).+?)?-ST{0}$',
            # original version in different language, with subtitles for the deaf and hard-of-hearing in requested language
            r'VO(?:(?!{0}).+?)?-STM{0}$',
            # original version in different language, without subtitles
            r'VO(?:(?!{0}))?$',
            # original version in different language, with partial subtitles in different language
            r'VO(?:(?!{0}).+?)?-ST(?!{0}).+?$',
            # original version in different language, with subtitles for the deaf and hard-of-hearing in different language
            r'VO(?:(?!{0}).+?)?-STM(?!{0}).+?$',
        ))

        formats = []
        for format_id, format_dict in vsr.items():
            f = dict(format_dict)
            format_url = url_or_none(f.get('url'))
            streamer = f.get('streamer')
            if not format_url and not streamer:
                continue

            # A missing versionCode matches no pattern and gets the lowest preference
            version_code = f.get('versionCode') or ''
            lang_pref = next(
                (len(preferences) - rank
                 for rank, pattern in enumerate(preferences)
                 if pattern.match(version_code)),
                -1)

            media_type = f.get('mediaType')
            if media_type == 'hls':
                m3u8_formats = self._extract_m3u8_formats(
                    format_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id=format_id, fatal=False)
                for m3u8_format in m3u8_formats:
                    m3u8_format['language_preference'] = lang_pref
                formats.extend(m3u8_formats)
                continue

            fmt = {
                'format_id': format_id,
                'language_preference': lang_pref,
                'format_note': '%s, %s' % (f.get('versionCode'), f.get('versionLibelle')),
                'width': int_or_none(f.get('width')),
                'height': int_or_none(f.get('height')),
                'tbr': int_or_none(f.get('bitrate')),
                'quality': qfunc(f.get('quality')),
            }

            if media_type == 'rtmp':
                fmt['url'] = f['streamer']
                fmt['play_path'] = 'mp4:' + f['url']
                fmt['ext'] = 'flv'
            else:
                fmt['url'] = f['url']

            formats.append(fmt)

        # For this extractor, quality only represents the relative quality
        # with respect to other formats with the same resolution
        self._sort_formats(formats, ('res', 'quality'))

        return {
            'id': player_info.get('VID') or video_id,
            'title': title,
            'description': player_info.get('VDE') or player_info.get('V7T'),
            'upload_date': unified_strdate(upload_date_str),
            'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'),
            'formats': formats,
        }
class ArteTVEmbedIE(InfoExtractor):
    """Extractor for arte.tv player embeds that carry a ``json_url`` parameter."""

    _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
    _TESTS = [{
        'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A',
        'info_dict': {
            'id': '100605-013-A',
            'ext': 'mp4',
            'title': 'United we Stream November Lockdown Edition #13',
            'description': 'md5:be40b667f45189632b78c1425c7c2ce1',
            'upload_date': '20201116',
        },
    }, {
        'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A',
        'only_matching': True,
    }]

    @staticmethod
    def _extract_urls(webpage):
        """Return every arte.tv player URL found in iframe/script src attributes."""
        embed_re = r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1'
        return [match.group('url') for match in re.finditer(embed_re, webpage)]

    def _real_extract(self, url):
        """Hand the embedded ``json_url`` over to ArteTVIE."""
        json_url = parse_qs(url)['json_url'][0]
        return self.url_result(
            json_url,
            ie=ArteTVIE.ie_key(),
            video_id=ArteTVIE._match_id(json_url))
class ArteTVPlaylistIE(ArteTVBaseIE):
    """Extractor for arte.tv collections (``RC-``-prefixed ids)."""

    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',
        'info_dict': {
            'id': 'RC-016954',
            'title': 'Earn a Living',
            'description': 'md5:d322c55011514b3a7241f7fb80d494c2',
        },
        'playlist_mincount': 6,
    }, {
        'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Fetch the collection JSON and return a playlist of url_transparent entries."""
        lang, playlist_id = self._match_valid_url(url).groups()
        collection_url = '%s/collectionData/%s/%s?source=videos' % (
            self._API_BASE, lang, playlist_id)
        collection = self._download_json(collection_url, playlist_id)

        entries = []
        for item in collection['videos']:
            # Skip anything that is not a dict or has no usable URL
            if not isinstance(item, dict):
                continue
            item_url = url_or_none(item.get('url')) or url_or_none(item.get('jsonUrl'))
            if not item_url:
                continue
            entry = {
                '_type': 'url_transparent',
                'url': item_url,
                'id': item.get('programId'),
                'title': item.get('title'),
                'alt_title': item.get('subtitle'),
                'thumbnail': url_or_none(try_get(item, lambda x: x['mainImage']['url'], compat_str)),
                'duration': int_or_none(item.get('durationSeconds')),
                'view_count': int_or_none(item.get('views')),
                'ie_key': ArteTVIE.ie_key(),
            }
            entries.append(entry)

        return self.playlist_result(
            entries, playlist_id,
            collection.get('title'),
            collection.get('shortDescription') or collection.get('teaserText'))

View File

@@ -0,0 +1,200 @@
# coding: utf-8
from __future__ import unicode_literals
import functools
import re
from .common import InfoExtractor
from .kaltura import KalturaIE
from ..utils import (
extract_attributes,
int_or_none,
OnDemandPagedList,
parse_age_limit,
strip_or_none,
try_get,
)
class AsianCrushBaseIE(InfoExtractor):
    """Shared API helpers for the AsianCrush family of sites."""

    _VALID_URL_BASE = r'https?://(?:www\.)?(?P<host>(?:(?:asiancrush|yuyutv|midnightpulp)\.com|(?:cocoro|retrocrush)\.tv))'
    # Fields of a video object that are searched for a Kaltura partner/entry id
    _KALTURA_KEYS = [
        'video_url', 'progressive_url', 'download_url', 'thumbnail_url',
        'widescreen_thumbnail_url', 'screencap_widescreen',
    ]
    # Per-host suffix appended to the 'api' subdomain
    _API_SUFFIX = {'retrocrush.tv': '-ott'}

    def _call_api(self, host, endpoint, video_id, query, resource):
        """Query https://api<suffix>.<host>/<endpoint> and return its 'objects' value."""
        api_url = 'https://api%s.%s/%s' % (self._API_SUFFIX.get(host, ''), host, endpoint)
        note = 'Downloading %s JSON metadata' % resource
        response = self._download_json(
            api_url, video_id, note, query=query,
            headers=self.geo_verification_headers())
        return response['objects']

    def _download_object_data(self, host, object_id, resource):
        """Return the first object the 'search' endpoint yields for object_id."""
        objects = self._call_api(
            host, 'search', object_id, {'id': object_id}, resource)
        return objects[0]

    def _get_object_description(self, obj):
        """Return the stripped long description, falling back to the short one."""
        text = obj.get('long_description') or obj.get('short_description')
        return strip_or_none(text)

    def _parse_video_data(self, video):
        """Turn an API video object into a url_transparent result for KalturaIE."""
        title = video['name']

        # Take the ids from the first Kaltura-style URL that contains them
        entry_id = partner_id = None
        for key in self._KALTURA_KEYS:
            candidate = video.get(key)
            if not candidate:
                continue
            found = re.search(r'/p/(\d+)/.+?/entryId/([^/]+)/', candidate)
            if found:
                partner_id, entry_id = found.groups()
                break

        meta_categories = try_get(video, lambda x: x['meta']['categories'], list) or []
        category_names = [c.get('name') for c in meta_categories]
        categories = [name for name in category_names if name]

        show_info = video.get('show_info') or {}

        return {
            '_type': 'url_transparent',
            'url': 'kaltura:%s:%s' % (partner_id, entry_id),
            'ie_key': KalturaIE.ie_key(),
            'id': entry_id,
            'title': title,
            'description': self._get_object_description(video),
            'age_limit': parse_age_limit(video.get('mpaa_rating') or video.get('tv_rating')),
            'categories': categories,
            'series': show_info.get('show_name'),
            'season_number': int_or_none(show_info.get('season_num')),
            'season_id': show_info.get('season_id'),
            'episode_number': int_or_none(show_info.get('episode_num')),
        }
class AsianCrushIE(AsianCrushBaseIE):
    """Extractor for single videos on the AsianCrush family of sites."""

    _VALID_URL = r'%s/video/(?:[^/]+/)?0+(?P<id>\d+)v\b' % AsianCrushBaseIE._VALID_URL_BASE
    _TESTS = [{
        'url': 'https://www.asiancrush.com/video/004289v/women-who-flirt',
        'md5': 'c3b740e48d0ba002a42c0b72857beae6',
        'info_dict': {
            'id': '1_y4tmjm5r',
            'ext': 'mp4',
            'title': 'Women Who Flirt',
            'description': 'md5:b65c7e0ae03a85585476a62a186f924c',
            'timestamp': 1496936429,
            'upload_date': '20170608',
            'uploader_id': 'craig@crifkin.com',
            'age_limit': 13,
            'categories': 'count:5',
            'duration': 5812,
        },
    }, {
        'url': 'https://www.asiancrush.com/video/she-was-pretty/011886v-pretty-episode-3/',
        'only_matching': True,
    }, {
        'url': 'https://www.yuyutv.com/video/013886v/the-act-of-killing/',
        'only_matching': True,
    }, {
        'url': 'https://www.yuyutv.com/video/peep-show/013922v-warring-factions/',
        'only_matching': True,
    }, {
        'url': 'https://www.midnightpulp.com/video/010400v/drifters/',
        'only_matching': True,
    }, {
        'url': 'https://www.midnightpulp.com/video/mononoke/016378v-zashikiwarashi-part-1/',
        'only_matching': True,
    }, {
        'url': 'https://www.cocoro.tv/video/the-wonderful-wizard-of-oz/008878v-the-wonderful-wizard-of-oz-ep01/',
        'only_matching': True,
    }, {
        'url': 'https://www.retrocrush.tv/video/true-tears/012328v-i...gave-away-my-tears',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Look the video up through the API and delegate to _parse_video_data."""
        host, video_id = self._match_valid_url(url).groups()

        if host == 'cocoro.tv':
            # On cocoro.tv the page's iEmbedVars may supply a different entry id
            webpage = self._download_webpage(url, video_id)
            raw_vars = self._search_regex(
                r'iEmbedVars\s*=\s*({.+?})', webpage, 'embed vars',
                default='{}')
            embed_vars = self._parse_json(raw_vars, video_id, fatal=False) or {}
            video_id = embed_vars.get('entry_id') or video_id

        return self._parse_video_data(
            self._download_object_data(host, video_id, 'video'))
class AsianCrushPlaylistIE(AsianCrushBaseIE):
    """Extractor for series pages on the AsianCrush family of sites."""

    _VALID_URL = r'%s/series/0+(?P<id>\d+)s\b' % AsianCrushBaseIE._VALID_URL_BASE
    _TESTS = [{
        'url': 'https://www.asiancrush.com/series/006447s/fruity-samurai',
        'info_dict': {
            'id': '6447',
            'title': 'Fruity Samurai',
            'description': 'md5:7535174487e4a202d3872a7fc8f2f154',
        },
        'playlist_count': 13,
    }, {
        'url': 'https://www.yuyutv.com/series/013920s/peep-show/',
        'only_matching': True,
    }, {
        'url': 'https://www.midnightpulp.com/series/016375s/mononoke/',
        'only_matching': True,
    }, {
        'url': 'https://www.cocoro.tv/series/008549s/the-wonderful-wizard-of-oz/',
        'only_matching': True,
    }, {
        'url': 'https://www.retrocrush.tv/series/012355s/true-tears',
        'only_matching': True,
    }]
    _PAGE_SIZE = 1000000000

    def _fetch_page(self, domain, parent_id, page):
        """Yield parsed video entries for one zero-based page of a series."""
        query = {
            'max': self._PAGE_SIZE,
            'object_type': 'video',
            'parent_id': parent_id,
            'start': page * self._PAGE_SIZE,
        }
        page_label = 'page %d' % (page + 1)
        for item in self._call_api(
                domain, 'getreferencedobjects', parent_id, query, page_label):
            yield self._parse_video_data(item)

    def _real_extract(self, url):
        """Return a playlist result; cocoro.tv is scraped from HTML, other hosts use the API."""
        host, playlist_id = self._match_valid_url(url).groups()

        if host != 'cocoro.tv':
            show = self._download_object_data(host, playlist_id, 'show')
            title = show.get('name')
            description = self._get_object_description(show)
            entries = OnDemandPagedList(
                functools.partial(self._fetch_page, host, playlist_id),
                self._PAGE_SIZE)
            return self.playlist_result(entries, playlist_id, title, description)

        webpage = self._download_webpage(url, playlist_id)

        # Keep only video links whose anchor tag has class="clearfix"
        entries = []
        for anchor in re.finditer(
                r'<a[^>]+href=(["\'])(?P<url>%s.*?)\1[^>]*>' % AsianCrushIE._VALID_URL,
                webpage):
            if extract_attributes(anchor.group(0)).get('class') != 'clearfix':
                continue
            entries.append(self.url_result(
                anchor.group('url'), ie=AsianCrushIE.ie_key()))

        # Try progressively less specific sources for the title
        title = self._html_search_regex(
            r'(?s)<h1\b[^>]\bid=["\']movieTitle[^>]+>(.+?)</h1>', webpage,
            'title', default=None)
        if not title:
            title = self._og_search_title(webpage, default=None)
        if not title:
            title = self._html_search_meta(
                'twitter:title', webpage, 'title', default=None)
        if not title:
            title = self._search_regex(
                r'<title>([^<]+)</title>', webpage, 'title', fatal=False)
        if title:
            # Drop everything from the first ' | ' separator onwards
            title = re.sub(r'\s*\|\s*.+?$', '', title)

        description = self._og_search_description(webpage, default=None)
        if not description:
            description = self._html_search_meta(
                'twitter:description', webpage, 'description', fatal=False)

        return self.playlist_result(entries, playlist_id, title, description)

View File

@@ -0,0 +1,116 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import compat_HTTPError
from ..utils import (
ExtractorError,
int_or_none,
urlencode_postdata,
)
class AtresPlayerIE(InfoExtractor):
    """Extractor for atresplayer.com episodes, with optional account login."""

    _VALID_URL = r'https?://(?:www\.)?atresplayer\.com/[^/]+/[^/]+/[^/]+/[^/]+/(?P<display_id>.+?)_(?P<id>[0-9a-f]{24})'
    _NETRC_MACHINE = 'atresplayer'
    _TESTS = [
        {
            'url': 'https://www.atresplayer.com/antena3/series/pequenas-coincidencias/temporada-1/capitulo-7-asuntos-pendientes_5d4aa2c57ed1a88fc715a615/',
            'info_dict': {
                'id': '5d4aa2c57ed1a88fc715a615',
                'ext': 'mp4',
                'title': 'Capítulo 7: Asuntos pendientes',
                'description': 'md5:7634cdcb4d50d5381bedf93efb537fbc',
                'duration': 3413,
            },
            'skip': 'This video is only available for registered users'
        },
        {
            'url': 'https://www.atresplayer.com/lasexta/programas/el-club-de-la-comedia/temporada-4/capitulo-10-especial-solidario-nochebuena_5ad08edf986b2855ed47adc4/',
            'only_matching': True,
        },
        {
            'url': 'https://www.atresplayer.com/antena3/series/el-secreto-de-puente-viejo/el-chico-de-los-tres-lunares/capitulo-977-29-12-14_5ad51046986b2886722ccdea/',
            'only_matching': True,
        },
    ]
    _API_BASE = 'https://api.atresplayer.com/'

    def _real_initialize(self):
        self._login()

    def _handle_error(self, e, code):
        """Translate an HTTP error with status ``code`` into a user-facing error.

        Must be called from inside an ``except`` block: any other failure is
        re-raised unchanged with a bare ``raise``.
        """
        if isinstance(e.cause, compat_HTTPError) and e.cause.code == code:
            error = self._parse_json(e.cause.read(), None)
            if error.get('error') == 'required_registered':
                self.raise_login_required()
            raise ExtractorError(error['error_description'], expected=True)
        raise

    def _login(self):
        """Log in with the configured credentials; do nothing when none are set."""
        username, password = self._get_login_info()
        if username is None:
            return

        self._request_webpage(
            self._API_BASE + 'login', None, 'Downloading login page')

        try:
            target_url = self._download_json(
                'https://account.atresmedia.com/api/login', None,
                'Logging in', headers={
                    'Content-Type': 'application/x-www-form-urlencoded'
                }, data=urlencode_postdata({
                    'username': username,
                    'password': password,
                }))['targetUrl']
        except ExtractorError as e:
            self._handle_error(e, 400)

        self._request_webpage(target_url, None, 'Following Target URL')

    def _real_extract(self, url):
        """Fetch the episode JSON and collect formats and subtitles from all sources."""
        display_id, video_id = self._match_valid_url(url).groups()

        try:
            episode = self._download_json(
                self._API_BASE + 'client/v1/player/episode/' + video_id, video_id)
        except ExtractorError as e:
            self._handle_error(e, 403)

        title = episode['titulo']

        formats = []
        subtitles = {}
        for source in episode.get('sources', []):
            src = source.get('src')
            if not src:
                continue
            src_type = source.get('type')
            # The *_and_subtitles variants return a (formats, subtitles) pair.
            # Results are accumulated so a later source cannot overwrite an
            # earlier one.
            if src_type == 'application/vnd.apple.mpegurl':
                fmts, subs = self._extract_m3u8_formats_and_subtitles(
                    src, video_id, 'mp4', 'm3u8_native',
                    m3u8_id='hls', fatal=False)
            elif src_type == 'application/dash+xml':
                fmts, subs = self._extract_mpd_formats_and_subtitles(
                    src, video_id, mpd_id='dash', fatal=False)
            else:
                continue
            formats.extend(fmts)
            self._merge_subtitles(subs, target=subtitles)
        self._sort_formats(formats)

        heartbeat = episode.get('heartbeat') or {}
        omniture = episode.get('omniture') or {}
        get_meta = lambda x: heartbeat.get(x) or omniture.get(x)

        return {
            'display_id': display_id,
            'id': video_id,
            'title': title,
            'description': episode.get('descripcion'),
            'thumbnail': episode.get('imgPoster'),
            'duration': int_or_none(episode.get('duration')),
            'formats': formats,
            'channel': get_meta('channel'),
            'season': get_meta('season'),
            'episode_number': int_or_none(get_meta('episodeNumber')),
            'subtitles': subtitles,
        }

View File

@@ -0,0 +1,55 @@
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import unified_strdate
class ATTTechChannelIE(InfoExtractor):
    """Extractor for techchannel.att.com play-video pages (RTMP streams)."""

    _VALID_URL = r'https?://techchannel\.att\.com/play-video\.cfm/([^/]+/)*(?P<id>.+)'
    _TEST = {
        'url': 'http://techchannel.att.com/play-video.cfm/2014/1/27/ATT-Archives-The-UNIX-System-Making-Computers-Easier-to-Use',
        'info_dict': {
            'id': '11316',
            'display_id': 'ATT-Archives-The-UNIX-System-Making-Computers-Easier-to-Use',
            'ext': 'flv',
            'title': 'AT&T Archives : The UNIX System: Making Computers Easier to Use',
            'description': 'A 1982 film about UNIX is the foundation for software in use around Bell Labs and AT&T.',
            'thumbnail': r're:^https?://.*\.jpg$',
            'upload_date': '20140127',
        },
        'params': {
            # rtmp download
            'skip_download': True,
        },
    }

    def _real_extract(self, url):
        """Scrape the stream URL, media id and metadata from the video page."""
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)

        video_url = self._search_regex(
            r"url\s*:\s*'(rtmp://[^']+)'",
            webpage, 'video URL')
        video_id = self._search_regex(
            r'mediaid\s*=\s*(\d+)',
            webpage, 'video id', fatal=False)

        info = {
            'id': video_id,
            'display_id': display_id,
            'url': video_url,
            'ext': 'flv',
        }
        info['title'] = self._og_search_title(webpage)
        info['description'] = self._og_search_description(webpage)
        info['thumbnail'] = self._og_search_thumbnail(webpage)

        release_date = self._search_regex(
            r'[Rr]elease\s+date:\s*(\d{1,2}/\d{1,2}/\d{4})',
            webpage, 'upload date', fatal=False)
        # Second positional argument False: parse the date month-first
        info['upload_date'] = unified_strdate(release_date, False)
        return info

View File

@@ -0,0 +1,106 @@
# coding: utf-8
from __future__ import unicode_literals
import datetime
from .common import InfoExtractor
from ..utils import (
float_or_none,
jwt_encode_hs256,
try_get,
)
class ATVAtIE(InfoExtractor):
    """Extractor for atv.at TV pages; each page yields a multi_video result."""

    _VALID_URL = r'https?://(?:www\.)?atv\.at/tv/(?:[^/]+/){2,3}(?P<id>.*)'
    _TESTS = [{
        'url': 'https://www.atv.at/tv/bauer-sucht-frau/staffel-18/bauer-sucht-frau/bauer-sucht-frau-staffel-18-folge-3-die-hofwochen',
        'md5': '3c3b4aaca9f63e32b35e04a9c2515903',
        'info_dict': {
            'id': 'v-ce9cgn1e70n5-1',
            'ext': 'mp4',
            'title': 'Bauer sucht Frau - Staffel 18 Folge 3 - Die Hofwochen',
        }
    }, {
        'url': 'https://www.atv.at/tv/bauer-sucht-frau/staffel-18/episode-01/bauer-sucht-frau-staffel-18-vorstellungsfolge-1',
        'only_matching': True,
    }]

    # extracted from bootstrap.js function (search for e.encryption_key and use your browser's debugger)
    _ACCESS_ID = 'x_atv'
    _ENCRYPTION_KEY = 'Hohnaekeishoogh2omaeghooquooshia'

    def _extract_video_info(self, url, content, video):
        """Build the info dict for one clip from its page content and source URLs."""
        clip_id = content.get('splitId', content['id'])

        formats = []
        for protocol, variant in video['urls'].items():
            # Only variants exposing a 'clear' URL are used
            source_url = try_get(variant, lambda x: x['clear']['url'])
            if not source_url:
                continue
            if protocol == 'dash':
                dash_formats = self._extract_mpd_formats(
                    source_url, clip_id, mpd_id=protocol, fatal=False)
                formats.extend(dash_formats)
            elif protocol == 'hls':
                hls_formats = self._extract_m3u8_formats(
                    source_url, clip_id, 'mp4', 'm3u8_native',
                    m3u8_id=protocol, fatal=False)
                formats.extend(hls_formats)
            else:
                formats.append({
                    'url': source_url,
                    'format_id': protocol,
                })
        self._sort_formats(formats)

        return {
            'id': clip_id,
            'title': content.get('title'),
            'duration': float_or_none(content.get('duration')),
            'series': content.get('tvShowTitle'),
            'formats': formats,
        }

    def _real_extract(self, url):
        """Read the page state JSON, request signed sources and return all clips."""
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)
        state = self._search_regex(
            r'<script id="state" type="text/plain">(.*)</script>', webpage, 'json_data')
        json_data = self._parse_json(state, video_id=video_id)

        video_title = json_data['views']['default']['page']['title']
        content_resource = json_data['views']['default']['page']['contentResource']
        content_id = content_resource[0]['id']
        content_ids = []
        for index, content in enumerate(content_resource):
            content_ids.append({
                'id': index,
                'subclip_start': content['start'],
                'subclip_end': content['end'],
            })

        # Token validity window: five minutes either side of the request time
        time_of_request = datetime.datetime.now()
        window = datetime.timedelta(minutes=5)
        not_before = time_of_request - window
        expire = time_of_request + window
        payload = {
            'content_ids': {
                content_id: content_ids,
            },
            'secure_delivery': True,
            'iat': int(time_of_request.timestamp()),
            'nbf': int(not_before.timestamp()),
            'exp': int(expire.timestamp()),
        }
        jwt_token = jwt_encode_hs256(payload, self._ENCRYPTION_KEY, headers={'kid': self._ACCESS_ID})
        videos = self._download_json(
            'https://vas-v4.p7s1video.net/4.0/getsources',
            content_id, 'Downloading videos JSON', query={
                'token': jwt_token.decode('utf-8')
            })

        video_id, videos_data = list(videos['data'].items())[0]
        entries = []
        for video in videos_data:
            # video['id'] is used as an index into the page's contentResource list
            entries.append(self._extract_video_info(
                url, content_resource[video['id']], video))

        return {
            '_type': 'multi_video',
            'id': video_id,
            'title': video_title,
            'entries': entries,
        }

View File

@@ -0,0 +1,93 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
int_or_none,
parse_iso8601,
)
class AudiMediaIE(InfoExtractor):
    """Extractor for Audi MediaCenter (audimediatv) video pages."""

    _VALID_URL = r'https?://(?:www\.)?audi-mediacenter\.com/(?:en|de)/audimediatv/(?:video/)?(?P<id>[^/?#]+)'
    _TESTS = [{
        'url': 'https://www.audi-mediacenter.com/en/audimediatv/60-seconds-of-audi-sport-104-2015-wec-bahrain-rookie-test-1467',
        'md5': '79a8b71c46d49042609795ab59779b66',
        'info_dict': {
            'id': '1565',
            'ext': 'mp4',
            'title': '60 Seconds of Audi Sport 104/2015 - WEC Bahrain, Rookie Test',
            'description': 'md5:60e5d30a78ced725f7b8d34370762941',
            'upload_date': '20151124',
            'timestamp': 1448354940,
            'duration': 74022,
            'view_count': int,
        }
    }, {
        'url': 'https://www.audi-mediacenter.com/en/audimediatv/video/60-seconds-of-audi-sport-104-2015-wec-bahrain-rookie-test-2991',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Resolve the embed payload on the page and query the audimedia.tv API.

        Returns None for the 's' and 'e' stage modes, which are not handled.
        """
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)

        raw_payload = self._search_regex([
            r'class="amtv-embed"[^>]+id="([0-9a-z-]+)"',
            r'id="([0-9a-z-]+)"[^>]+class="amtv-embed"',
            r'class=\\"amtv-embed\\"[^>]+id=\\"([0-9a-z-]+)\\"',
            r'id=\\"([0-9a-z-]+)\\"[^>]+class=\\"amtv-embed\\"',
            r'id=(?:\\)?"(amtve-[a-z]-\d+-[a-z]{2})',
        ], webpage, 'raw payload')
        _, stage_mode, video_id, _ = raw_payload.split('-')

        # TODO: handle s and e stage_mode (live streams and ended live streams)
        if stage_mode in ('s', 'e'):
            return None

        video_data = self._download_json(
            'https://www.audimedia.tv/api/video/v1/videos/' + video_id,
            video_id, query={
                'embed[]': ['video_versions', 'thumbnail_image'],
            })['results']

        formats = []

        hls_url = video_data.get('stream_url_hls')
        if hls_url:
            formats.extend(self._extract_m3u8_formats(
                hls_url, video_id, 'mp4',
                entry_protocol='m3u8_native', m3u8_id='hls', fatal=False))

        hds_url = video_data.get('stream_url_hds')
        if hds_url:
            formats.extend(self._extract_f4m_formats(
                hds_url + '?hdcore=3.4.0',
                video_id, f4m_id='hds', fatal=False))

        for version in video_data.get('video_versions', []):
            version_url = version.get('download_url') or version.get('stream_url')
            if not version_url:
                continue
            fmt = {
                'url': version_url,
                'width': int_or_none(version.get('width')),
                'height': int_or_none(version.get('height')),
                'abr': int_or_none(version.get('audio_bitrate')),
                'vbr': int_or_none(version.get('video_bitrate')),
            }
            # The format id is derived from a '<digits>k' token in the URL, if any
            bitrate = self._search_regex(r'(\d+)k', version_url, 'bitrate', default=None)
            if bitrate:
                fmt['format_id'] = 'http-%s' % bitrate
            formats.append(fmt)
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': video_data['title'],
            'description': video_data.get('subtitle'),
            'thumbnail': video_data.get('thumbnail_image', {}).get('file'),
            'timestamp': parse_iso8601(video_data.get('publication_date')),
            'duration': int_or_none(video_data.get('duration')),
            'view_count': int_or_none(video_data.get('view_count')),
            'formats': formats,
        }

View File

@@ -0,0 +1,73 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
clean_html,
float_or_none,
)
class AudioBoomIE(InfoExtractor):
    """Extractor for audioboom.com posts."""

    _VALID_URL = r'https?://(?:www\.)?audioboom\.com/(?:boos|posts)/(?P<id>[0-9]+)'
    _TESTS = [{
        'url': 'https://audioboom.com/posts/7398103-asim-chaudhry',
        'md5': '7b00192e593ff227e6a315486979a42d',
        'info_dict': {
            'id': '7398103',
            'ext': 'mp3',
            'title': 'Asim Chaudhry',
            'description': 'md5:2f3fef17dacc2595b5362e1d7d3602fc',
            'duration': 4000.99,
            'uploader': 'Sue Perkins: An hour or so with...',
            'uploader_url': r're:https?://(?:www\.)?audioboom\.com/channel/perkins',
        }
    }, {
        'url': 'https://audioboom.com/posts/4279833-3-09-2016-czaban-hour-3?t=0',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Read metadata from the page's clip store, falling back to meta tags."""
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        raw_store = self._html_search_regex(
            r'data-new-clip-store=(["\'])(?P<json>{.+?})\1',
            webpage, 'clip store', default='{}', group='json')
        clip_store = self._parse_json(raw_store, video_id, fatal=False)

        # Use the first clip only when the store has the expected list-of-dicts shape
        clip = None
        if clip_store:
            clips = clip_store.get('clips')
            if clips and isinstance(clips, list) and isinstance(clips[0], dict):
                clip = clips[0]

        def from_clip(field):
            return clip.get(field) if clip else None

        audio_url = from_clip('clipURLPriorToLoading')
        if not audio_url:
            audio_url = self._og_search_property('audio', webpage, 'audio url')

        title = from_clip('title')
        if not title:
            title = self._html_search_meta(
                ['og:title', 'og:audio:title', 'audio_title'], webpage)

        description = (
            from_clip('description')
            or clean_html(from_clip('formattedDescription'))
            or self._og_search_description(webpage))

        duration = float_or_none(from_clip('duration') or self._html_search_meta(
            'weibo:audio:duration', webpage))

        uploader = from_clip('author')
        if not uploader:
            uploader = self._html_search_meta(
                ['og:audio:artist', 'twitter:audio:artist_name', 'audio_artist'], webpage, 'uploader')

        uploader_url = from_clip('author_url')
        if not uploader_url:
            uploader_url = self._html_search_meta(
                'audioboo:channel', webpage, 'uploader url')

        return {
            'id': video_id,
            'url': audio_url,
            'title': title,
            'description': description,
            'duration': duration,
            'uploader': uploader,
            'uploader_url': uploader_url,
        }

View File

@@ -0,0 +1,149 @@
# coding: utf-8
from __future__ import unicode_literals
import itertools
import time
from .common import InfoExtractor
from .soundcloud import SoundcloudIE
from ..compat import compat_str
from ..utils import (
ExtractorError,
url_basename,
)
class AudiomackIE(InfoExtractor):
    """Extractor for single songs on audiomack.com.

    Songs that are only a branded wrapper around a SoundCloud track are
    handed over to ``SoundcloudIE``.
    """

    _VALID_URL = r'https?://(?:www\.)?audiomack\.com/(?:song/|(?=.+/song/))(?P<id>[\w/-]+)'
    IE_NAME = 'audiomack'
    _TESTS = [
        # hosted on audiomack
        {
            'url': 'http://www.audiomack.com/song/roosh-williams/extraordinary',
            'info_dict': {
                'id': '310086',
                'ext': 'mp3',
                'uploader': 'Roosh Williams',
                'title': 'Extraordinary',
            },
        },
        # audiomack wrapper around soundcloud song
        {
            'add_ie': ['Soundcloud'],
            'url': 'http://www.audiomack.com/song/hip-hop-daily/black-mamba-freestyle',
            'info_dict': {
                'id': '258901379',
                'ext': 'mp3',
                'description': 'mamba day freestyle for the legend Kobe Bryant ',
                'title': 'Black Mamba Freestyle [Prod. By Danny Wolf]',
                'uploader': 'ILOVEMAKONNEN',
                'upload_date': '20160414',
            },
            'skip': 'Song has been removed from the site',
        },
    ]

    def _real_extract(self, url):
        """Query the song API and return either an info dict or a SoundCloud redirect."""
        # URLs end with [uploader name]/song/[uploader title]; that title is
        # free-form user input and rarely the proper song title, so the
        # real metadata is taken from the API response instead.
        song_slug = self._match_id(url).replace('/song/', '/')

        # "extended=1" asks for extra fields such as artist and title; the
        # trailing "_" parameter carries the current time.
        api_url = 'http://www.audiomack.com/api/music/url/song/%s?extended=1&_=%d' % (
            song_slug, time.time())
        api_response = self._download_json(api_url, song_slug)

        # The API is inconsistent about how it reports errors, so several
        # symptoms are checked.
        has_url = 'url' in api_response and bool(api_response['url'])
        if not has_url or 'error' in api_response:
            raise ExtractorError('Invalid url %s' % url)

        media_url = api_response['url']

        # Audiomack wraps a lot of soundcloud tracks in their branded wrapper;
        # if so, pass the work off to the soundcloud extractor
        if SoundcloudIE.suitable(media_url):
            return self.url_result(media_url, SoundcloudIE.ie_key())

        return {
            'id': compat_str(api_response.get('id', song_slug)),
            'uploader': api_response.get('artist'),
            'title': api_response.get('title'),
            'url': media_url,
        }
class AudiomackAlbumIE(InfoExtractor):
    """Extractor for albums on audiomack.com, returned as a playlist."""

    _VALID_URL = r'https?://(?:www\.)?audiomack\.com/(?:album/|(?=.+/album/))(?P<id>[\w/-]+)'
    IE_NAME = 'audiomack:album'
    _TESTS = [
        # Standard album playlist
        {
            'url': 'http://www.audiomack.com/album/flytunezcom/tha-tour-part-2-mixtape',
            'playlist_count': 11,
            'info_dict': {
                'id': '812251',
                'title': 'Tha Tour: Part 2 (Official Mixtape)',
            },
        },
        # Album playlist ripped from fakeshoredrive with no metadata
        {
            'url': 'http://www.audiomack.com/album/fakeshoredrive/ppp-pistol-p-project',
            'info_dict': {
                'title': 'PPP (Pistol P Project)',
                'id': '837572',
            },
            'playlist': [{
                'info_dict': {
                    'title': 'PPP (Pistol P Project) - 8. Real (prod by SYK SENSE )',
                    'id': '837576',
                    'ext': 'mp3',
                    'uploader': 'Lil Herb a.k.a. G Herbo',
                },
            }, {
                'info_dict': {
                    'title': 'PPP (Pistol P Project) - 10. 4 Minutes Of Hell Part 4 (prod by DY OF 808 MAFIA)',
                    'id': '837580',
                    'ext': 'mp3',
                    'uploader': 'Lil Herb a.k.a. G Herbo',
                },
            }],
        },
    ]

    def _real_extract(self, url):
        """Walk the per-track album API until it runs out of tracks."""
        # URLs end with [uploader name]/album/[uploader title]; that title is
        # free-form user input, so real metadata comes from the API response.
        album_slug = self._match_id(url).replace('/album/', '/')
        playlist = {'_type': 'playlist', 'entries': []}
        # (key in the playlist result, key in the API response)
        album_fields = (('id', 'album_id'), ('title', 'album_title'))

        # There is no single endpoint for album metadata - it is repeated in
        # each song's metadata. The track count is therefore unknown up
        # front and tracks are requested one by one until the API signals
        # the end.
        for track_no in itertools.count():
            api_response = self._download_json(
                'http://www.audiomack.com/api/music/url/album/%s/%d?extended=1&_=%d'
                % (album_slug, track_no, time.time()), album_slug,
                note='Querying song information (%d)' % (track_no + 1))

            # Total failure, only occurs when the url is totally wrong; it
            # won't happen in the middle of a valid playlist (next case).
            if 'url' not in api_response or 'error' in api_response:
                raise ExtractorError('Invalid url for track %d of album url %s' % (track_no, url))

            track_url = api_response['url']
            # URL is good but the song id doesn't exist - usually means the
            # end of the playlist.
            if not track_url:
                break

            # Copy the album metadata into the result the first time it
            # shows up.
            for result_key, api_key in album_fields:
                if api_key in api_response and result_key not in playlist:
                    playlist[result_key] = compat_str(api_response[api_key])

            # File name without extension, used when the API omits id/title.
            fallback_id = url_basename(track_url).rpartition('.')[0]
            playlist['entries'].append({
                'id': compat_str(api_response.get('id', fallback_id)),
                'uploader': api_response.get('artist'),
                'title': api_response.get('title', fallback_id),
                'url': track_url,
            })

        return playlist

View File

@@ -0,0 +1,274 @@
# coding: utf-8
from __future__ import unicode_literals
import random
from .common import InfoExtractor
from ..utils import ExtractorError, try_get, compat_str, str_or_none
from ..compat import compat_urllib_parse_unquote
class AudiusBaseIE(InfoExtractor):
    """Shared API plumbing for the Audius extractors.

    An API host is selected lazily and stored in ``_API_BASE``; all API
    paths are requested below ``_API_BASE + _API_V``.
    """

    _API_BASE = None
    _API_V = '/v1'

    def _get_response_data(self, response):
        """Return the ``data`` member of an API response.

        Raises ExtractorError (expected) when the response consists only of
        an error ``message``, and a generic ExtractorError for any other
        response without usable ``data``.
        """
        if isinstance(response, dict):
            response_data = response.get('data')
            if response_data is not None:
                return response_data
            if len(response) == 1 and 'message' in response:
                raise ExtractorError('API error: %s' % response['message'],
                                     expected=True)
        raise ExtractorError('Unexpected API response')

    def _select_api_base(self):
        """Selecting one of the currently available API hosts"""
        response = super(AudiusBaseIE, self)._download_json(
            'https://api.audius.co/', None,
            note='Requesting available API hosts',
            errnote='Unable to request available API hosts')
        hosts = self._get_response_data(response)
        # An empty list must be rejected here: random.choice([]) would
        # otherwise raise a bare IndexError instead of an ExtractorError.
        if isinstance(hosts, list) and hosts:
            self._API_BASE = random.choice(hosts)
            return
        raise ExtractorError('Unable to get available API hosts')

    @staticmethod
    def _prepare_url(url, title):
        """
        Audius removes forward slashes from the uri, but leaves backslashes.
        The problem is that the current version of Chrome replaces backslashes
        in the address bar with forward slashes, so if you copy the link from
        there and paste it into youtube-dl, you won't be able to download
        anything from this link, since the Audius API won't be able to resolve
        this url
        """
        url = compat_urllib_parse_unquote(url)
        title = compat_urllib_parse_unquote(title)
        if '/' in title or '%2F' in title:
            # Turn the slashes of the title part back into (encoded) backslashes
            fixed_title = title.replace('/', '%5C').replace('%2F', '%5C')
            return url.replace(title, fixed_title)
        return url

    def _api_request(self, path, item_id=None, note='Downloading JSON metadata',
                     errnote='Unable to download JSON metadata',
                     expected_status=None):
        """Request ``path`` from the selected API host and return its ``data``.

        An API host is selected first if none has been chosen yet.
        """
        if self._API_BASE is None:
            self._select_api_base()
        try:
            response = super(AudiusBaseIE, self)._download_json(
                '%s%s%s' % (self._API_BASE, self._API_V, path), item_id, note=note,
                errnote=errnote, expected_status=expected_status)
        except ExtractorError as exc:
            # some of Audius API hosts may not work as expected and return HTML
            if 'Failed to parse JSON' in compat_str(exc):
                raise ExtractorError('An error occurred while receiving data. Try again',
                                     expected=True)
            # Re-raise the original exception unchanged
            raise
        return self._get_response_data(response)

    def _resolve_url(self, url, item_id):
        """Resolve a public audius.co URL through the API's ``/resolve`` endpoint."""
        # A 404 status is passed as expected_status so its body is still parsed.
        return self._api_request('/resolve?url=%s' % url, item_id,
                                 expected_status=404)
class AudiusIE(AudiusBaseIE):
    """Extractor for single Audius tracks addressed by their audius.co page URL."""

    _VALID_URL = r'''(?x)https?://(?:www\.)?(?:audius\.co/(?P<uploader>[\w\d-]+)(?!/album|/playlist)/(?P<title>\S+))'''
    IE_DESC = 'Audius.co'
    _TESTS = [
        {
            # URL from Chrome address bar which replace backslash to forward slash
            'url': 'https://audius.co/test_acc/t%D0%B5%D0%B5%D0%B5est-1.%5E_%7B%7D/%22%3C%3E.%E2%84%96~%60-198631',
            'md5': '92c35d3e754d5a0f17eef396b0d33582',
            'info_dict': {
                'id': 'xd8gY',
                'title': '''Tеееest/ 1.!@#$%^&*()_+=[]{};'\\\":<>,.?/№~`''',
                'ext': 'mp3',
                'description': 'Description',
                'duration': 30,
                'track': '''Tеееest/ 1.!@#$%^&*()_+=[]{};'\\\":<>,.?/№~`''',
                'artist': 'test',
                'genre': 'Electronic',
                'thumbnail': r're:https?://.*\.jpg',
                'view_count': int,
                'like_count': int,
                'repost_count': int,
            },
        },
        {
            # Regular track
            'url': 'https://audius.co/voltra/radar-103692',
            'md5': '491898a0a8de39f20c5d6a8a80ab5132',
            'info_dict': {
                'id': 'KKdy2',
                'title': 'RADAR',
                'ext': 'mp3',
                'duration': 318,
                'track': 'RADAR',
                'artist': 'voltra',
                'genre': 'Trance',
                'thumbnail': r're:https?://.*\.jpg',
                'view_count': int,
                'like_count': int,
                'repost_count': int,
            },
        },
    ]

    # Artwork size key as used by the API -> thumbnail preference value
    _ARTWORK_MAP = {
        '150x150': 150,
        '480x480': 480,
        '1000x1000': 1000,
    }

    def _real_extract(self, url):
        """Fetch the track metadata and build the info dict.

        Subclasses whose ``_VALID_URL`` defines a ``track_id`` group take
        the direct API route; otherwise the page URL is resolved.
        """
        mobj = self._match_valid_url(url)
        # This class' own pattern has no 'track_id' group; try_get turns the
        # resulting lookup error into None.
        track_id = try_get(mobj, lambda m: m.group('track_id'))
        if track_id is not None:
            # API link
            title = None
            track_data = self._api_request('/tracks/%s' % track_id, track_id)
        else:
            title = mobj.group('title')
            url = self._prepare_url(url, title)
            track_data = self._resolve_url(url, title)

        if not isinstance(track_data, dict):
            raise ExtractorError('Unexpected API response')

        track_id = track_data.get('id')
        if track_id is None:
            raise ExtractorError('Unable to get ID of the track')

        thumbnails = []
        artworks = track_data.get('artwork')
        if isinstance(artworks, dict):
            for size_key, artwork_url in artworks.items():
                entry = {'url': artwork_url}
                preference = self._ARTWORK_MAP.get(size_key)
                # Unknown size keys are kept, just without a preference.
                if preference is not None:
                    entry['preference'] = preference
                thumbnails.append(entry)

        stream_url = '%s/v1/tracks/%s/stream' % (self._API_BASE, track_id)
        return {
            'id': track_id,
            'title': track_data.get('title', title),
            'url': stream_url,
            'ext': 'mp3',
            'description': track_data.get('description'),
            'duration': track_data.get('duration'),
            'track': track_data.get('title'),
            'artist': try_get(track_data, lambda x: x['user']['name'], compat_str),
            'genre': track_data.get('genre'),
            'thumbnails': thumbnails,
            'view_count': track_data.get('play_count'),
            'like_count': track_data.get('favorite_count'),
            'repost_count': track_data.get('repost_count'),
        }
class AudiusTrackIE(AudiusIE):
    """Extractor for Audius tracks given as ``audius:<id>`` or ``audius:<API link>``.

    Extraction itself is inherited from ``AudiusIE``; only the URL pattern
    (which provides the ``track_id`` group) and the names differ.
    """

    _VALID_URL = r'''(?x)(?:audius:)(?:https?://(?:www\.)?.+/v1/tracks/)?(?P<track_id>\w+)'''
    IE_NAME = 'audius:track'
    IE_DESC = 'Audius track ID or API link. Prepend with "audius:"'
    _TESTS = [
        {
            # bare track ID
            'url': 'audius:9RWlo',
            'only_matching': True,
        },
        {
            # full API link
            'url': 'audius:http://discoveryprovider.audius.prod-us-west-2.staked.cloud/v1/tracks/9RWlo',
            'only_matching': True,
        },
    ]
class AudiusPlaylistIE(AudiusBaseIE):
    """Extractor for Audius albums and playlists."""

    _VALID_URL = r'https?://(?:www\.)?audius\.co/(?P<uploader>[\w\d-]+)/(?:album|playlist)/(?P<title>\S+)'
    IE_NAME = 'audius:playlist'
    IE_DESC = 'Audius.co playlists'
    _TEST = {
        'url': 'https://audius.co/test_acc/playlist/test-playlist-22910',
        'info_dict': {
            'id': 'DNvjN',
            'title': 'test playlist',
            'description': 'Test description\n\nlol',
        },
        'playlist_count': 175,
    }

    def _build_playlist(self, tracks):
        """Turn a list of API track objects into ``audius:<id>`` url results.

        Raises ExtractorError for an entry that is not a dict or has no ID.
        """
        def entry_for(track):
            if not isinstance(track, dict):
                raise ExtractorError('Unexpected API response')
            track_id = str_or_none(track.get('id'))
            if not track_id:
                raise ExtractorError('Unable to get track ID from playlist')
            return self.url_result(
                'audius:%s' % track_id,
                ie=AudiusTrackIE.ie_key(), video_id=track_id)

        return [entry_for(track) for track in tracks]

    def _real_extract(self, url):
        """Resolve the playlist URL, then download and wrap its track list."""
        self._select_api_base()
        title = self._match_valid_url(url).group('title')
        url = self._prepare_url(url, title)

        # The resolve endpoint is expected to answer with a one-element list
        # holding the playlist object.
        resolved = self._resolve_url(url, title)
        if not (isinstance(resolved, list) and len(resolved) == 1):
            raise ExtractorError('Unexpected API response')

        playlist_data = resolved[0]
        if not isinstance(playlist_data, dict):
            raise ExtractorError('Unexpected API response')

        playlist_id = playlist_data.get('id')
        if playlist_id is None:
            raise ExtractorError('Unable to get playlist ID')

        tracks = self._api_request(
            '/playlists/%s/tracks' % playlist_id,
            title, note='Downloading playlist tracks metadata',
            errnote='Unable to download playlist tracks metadata')
        if not isinstance(tracks, list):
            raise ExtractorError('Unexpected API response')

        entries = self._build_playlist(tracks)
        playlist_title = playlist_data.get('playlist_name', title)
        playlist_description = playlist_data.get('description')
        return self.playlist_result(
            entries, playlist_id, playlist_title, playlist_description)
class AudiusProfileIE(AudiusPlaylistIE):
    """Extractor for Audius profile/artist pages, returned as a playlist of the user's tracks."""

    IE_NAME = 'audius:artist'
    IE_DESC = 'Audius.co profile/artist pages'
    # The optional host prefix must include the dot ("www."); the previous
    # "(?:www)?" could never match a real https://www.audius.co/... URL.
    _VALID_URL = r'https?://(?:www\.)?audius\.co/(?P<id>[^\/]+)/?(?:[?#]|$)'
    _TEST = {
        'url': 'https://audius.co/pzl/',
        'info_dict': {
            'id': 'ezRo7',
            'description': 'TAMALE\n\nContact: officialpzl@gmail.com',
            'title': 'pzl',
        },
        'playlist_count': 24,
    }

    def _real_extract(self, url):
        """Look up the profile by handle and list its tracks.

        Raises ExtractorError when the profile information cannot be
        downloaded.
        """
        self._select_api_base()
        profile_id = self._match_id(url)
        try:
            _profile_data = self._api_request('/full/users/handle/' + profile_id, profile_id)
        except ExtractorError as e:
            raise ExtractorError('Could not download profile info; ' + str(e))
        # The first element of the response is used as the profile object.
        profile_audius_id = _profile_data[0]['id']
        profile_bio = _profile_data[0].get('bio')

        api_call = self._api_request('/full/users/handle/%s/tracks' % profile_id, profile_id)
        # The URL handle doubles as the playlist title, the bio as its description.
        return self.playlist_result(self._build_playlist(api_call), profile_audius_id, profile_id, profile_bio)

View File

@@ -0,0 +1,187 @@
# coding: utf-8
from __future__ import unicode_literals
import base64
from .common import InfoExtractor
from ..compat import (
compat_urllib_parse_urlencode,
compat_str,
)
from ..utils import (
format_field,
int_or_none,
parse_iso8601,
smuggle_url,
unsmuggle_url,
urlencode_postdata,
)
class AWAANIE(InfoExtractor):
    """Dispatcher for awaan.ae / dcndigital.ae "show" URLs.

    Depending on which IDs the URL carries, the request is forwarded to the
    video extractor or to the season extractor.
    """

    _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?show/(?P<show_id>\d+)/[^/]+(?:/(?P<id>\d+)/(?P<season_id>\d+))?'

    def _real_extract(self, url):
        """Return a url result for the most specific entity named in the URL."""
        show_id, video_id, season_id = self._match_valid_url(url).groups()

        # A positive video ID wins over everything else.
        if video_id and int(video_id) > 0:
            return self.url_result(
                'http://awaan.ae/media/%s' % video_id, 'AWAANVideo')

        # Next comes a positive season ID; the show ID is smuggled along.
        if season_id and int(season_id) > 0:
            season_url = smuggle_url(
                'http://awaan.ae/program/season/%s' % season_id,
                {'show_id': show_id})
            return self.url_result(season_url, 'AWAANSeason')

        # Otherwise only the show itself is known.
        return self.url_result(
            'http://awaan.ae/program/%s' % show_id, 'AWAANSeason')
class AWAANBaseIE(InfoExtractor):
    """Common metadata parsing for the AWAAN video and live extractors."""

    def _parse_video_data(self, video_data, video_id, is_live):
        """Build the basic info dict from an API ``video_data`` object.

        The English title/description is preferred over the Arabic one; a
        missing Arabic title (when no English one exists) raises KeyError.
        """
        title = video_data.get('title_en')
        if not title:
            title = video_data['title_ar']

        description = video_data.get('description_en')
        if not description:
            description = video_data.get('description_ar')

        thumbnail = format_field(
            video_data.get('img'),
            template='http://admin.mangomolo.com/analytics/%s')

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'duration': int_or_none(video_data.get('duration')),
            # create_time is parsed with a space as date/time delimiter
            'timestamp': parse_iso8601(video_data.get('create_time'), ' '),
            'is_live': is_live,
            'uploader_id': video_data.get('user_id'),
        }
class AWAANVideoIE(AWAANBaseIE):
    """Extractor for single AWAAN videos; playback is delegated to ``MangomoloVideo``."""

    IE_NAME = 'awaan:video'
    _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?(?:video(?:/[^/]+)?|media|catchup/[^/]+/[^/]+)/(?P<id>\d+)'
    _TESTS = [{
        'url': 'http://www.dcndigital.ae/#/video/%D8%B1%D8%AD%D9%84%D8%A9-%D8%A7%D9%84%D8%B9%D9%85%D8%B1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/17375',
        'md5': '5f61c33bfc7794315c671a62d43116aa',
        'info_dict': {
            'id': '17375',
            'ext': 'mp4',
            'title': 'رحلة العمر : الحلقة 1',
            'description': 'md5:0156e935d870acb8ef0a66d24070c6d6',
            'duration': 2041,
            'timestamp': 1227504126,
            'upload_date': '20081124',
            'uploader_id': '71',
        },
    }, {
        'url': 'http://awaan.ae/video/26723981/%D8%AF%D8%A7%D8%B1-%D8%A7%D9%84%D8%B3%D9%84%D8%A7%D9%85:-%D8%AE%D9%8A%D8%B1-%D8%AF%D9%88%D8%B1-%D8%A7%D9%84%D8%A3%D9%86%D8%B5%D8%A7%D8%B1',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Fetch the video metadata and point at the Mangomolo embed page."""
        video_id = self._match_id(url)
        video_data = self._download_json(
            'http://admin.mangomolo.com/analytics/index.php/plus/video?id=%s' % video_id,
            video_id, headers={'Origin': 'http://awaan.ae'})
        info = self._parse_video_data(video_data, video_id, False)

        embed_query = compat_urllib_parse_urlencode({
            'id': video_data['id'],
            'user_id': video_data['user_id'],
            'signature': video_data['signature'],
            'countries': 'Q0M=',
            'filter': 'DENY',
        })
        embed_url = 'http://admin.mangomolo.com/analytics/index.php/customers/embed/video?' + embed_query

        # url_transparent: formats come from the embed extractor while the
        # metadata collected above is kept.
        info['_type'] = 'url_transparent'
        info['url'] = embed_url
        info['ie_key'] = 'MangomoloVideo'
        return info
class AWAANLiveIE(AWAANBaseIE):
    """Extractor for AWAAN live channels; playback is delegated to ``MangomoloLive``."""

    IE_NAME = 'awaan:live'
    _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?live/(?P<id>\d+)'
    _TEST = {
        'url': 'http://awaan.ae/live/6/dubai-tv',
        'info_dict': {
            'id': '6',
            'ext': 'mp4',
            'title': 're:Dubai Al Oula [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
            'upload_date': '20150107',
            'timestamp': 1420588800,
            'uploader_id': '71',
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }

    def _real_extract(self, url):
        """Fetch the channel metadata and point at the Mangomolo live embed page."""
        channel_id = self._match_id(url)
        channel_data = self._download_json(
            'http://admin.mangomolo.com/analytics/index.php/plus/getchanneldetails?channel_id=%s' % channel_id,
            channel_id, headers={'Origin': 'http://awaan.ae'})
        info = self._parse_video_data(channel_data, channel_id, True)

        def b64(value):
            # The embed endpoint is given these IDs base64-encoded.
            return base64.b64encode(value.encode()).decode()

        embed_query = compat_urllib_parse_urlencode({
            'id': b64(channel_data['user_id']),
            'channelid': b64(channel_data['id']),
            'signature': channel_data['signature'],
            'countries': 'Q0M=',
            'filter': 'DENY',
        })
        embed_url = 'http://admin.mangomolo.com/analytics/index.php/customers/embed/index?' + embed_query

        # url_transparent: formats come from the embed extractor while the
        # metadata collected above is kept.
        info['_type'] = 'url_transparent'
        info['url'] = embed_url
        info['ie_key'] = 'MangomoloLive'
        return info
class AWAANSeasonIE(InfoExtractor):
    """Extractor for AWAAN programs/seasons, returned as a playlist of videos."""

    IE_NAME = 'awaan:season'
    _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?program/(?:(?P<show_id>\d+)|season/(?P<season_id>\d+))'
    _TEST = {
        'url': 'http://dcndigital.ae/#/program/205024/%D9%85%D8%AD%D8%A7%D8%B6%D8%B1%D8%A7%D8%AA-%D8%A7%D9%84%D8%B4%D9%8A%D8%AE-%D8%A7%D9%84%D8%B4%D8%B9%D8%B1%D8%A7%D9%88%D9%8A',
        'info_dict': {
            'id': '7910',
            'title': 'محاضرات الشيخ الشعراوي',
        },
        'playlist_mincount': 27,
    }

    def _real_extract(self, url):
        """Download the show data and build the playlist for the requested season.

        For a season URL the show ID comes from the smuggled data or, if
        absent, from a separate season lookup. For a show URL the show's
        default season is used.
        """
        url, smuggled_data = unsmuggle_url(url, {})
        show_id, season_id = self._match_valid_url(url).groups()

        post_fields = {}
        if season_id:
            post_fields['season'] = season_id
            show_id = smuggled_data.get('show_id')
            if show_id is None:
                season_info = self._download_json(
                    'http://admin.mangomolo.com/analytics/index.php/plus/season_info?id=%s' % season_id,
                    season_id, headers={'Origin': 'http://awaan.ae'})
                show_id = season_info['id']
        post_fields['show_id'] = show_id

        request_headers = {
            'Origin': 'http://awaan.ae',
            'Content-Type': 'application/x-www-form-urlencoded',
        }
        show = self._download_json(
            'http://admin.mangomolo.com/analytics/index.php/plus/show',
            show_id, data=urlencode_postdata(post_fields), headers=request_headers)

        if not season_id:
            season_id = show['default_season']

        # NOTE: when no season matches, the loop ends without a return and
        # the method yields None.
        for season in show['seasons']:
            if season['id'] != season_id:
                continue

            title = season.get('title_en') or season['title_ar']
            video_ids = [compat_str(video['id']) for video in show['videos']]
            entries = [
                self.url_result(
                    'http://awaan.ae/media/%s' % video_id, 'AWAANVideo', video_id)
                for video_id in video_ids]
            return self.playlist_result(entries, season_id, title)

View File

@@ -0,0 +1,78 @@
# coding: utf-8
from __future__ import unicode_literals
import datetime
import hashlib
import hmac
from .common import InfoExtractor
from ..compat import compat_urllib_parse_urlencode
class AWSIE(InfoExtractor):
    """Base class for extractors that issue AWS Signature Version 4 signed GET requests.

    ``_AWS_PROXY_HOST`` and ``_AWS_API_KEY`` are read here but not defined;
    they are expected to be provided by subclasses.
    """

    _AWS_ALGORITHM = 'AWS4-HMAC-SHA256'
    _AWS_REGION = 'us-east-1'

    def _aws_execute_api(self, aws_dict, video_id, query=None):
        """Sign and perform a GET request, returning the parsed JSON.

        aws_dict -- must contain 'uri', 'access_key' and 'secret_key';
                    an optional 'session_token' is sent as
                    X-Amz-Security-Token
        video_id -- ID used for progress/error reporting
        query    -- optional query parameters (mapping or sequence of
                    pairs)
        """
        query = query or {}
        amz_date = datetime.datetime.utcnow().strftime('%Y%m%dT%H%M%SZ')
        date = amz_date[:8]
        headers = {
            'Accept': 'application/json',
            'Host': self._AWS_PROXY_HOST,
            'X-Amz-Date': amz_date,
            'X-Api-Key': self._AWS_API_KEY
        }
        session_token = aws_dict.get('session_token')
        if session_token:
            headers['X-Amz-Security-Token'] = session_token

        def aws_hash(s):
            return hashlib.sha256(s.encode('utf-8')).hexdigest()

        # Task 1: http://docs.aws.amazon.com/general/latest/gr/sigv4-create-canonical-request.html
        # The canonical query string has to list the parameters sorted by
        # name; relying on the mapping's iteration order can produce a
        # signature the server rejects.
        query_items = query.items() if hasattr(query, 'items') else query
        canonical_querystring = compat_urllib_parse_urlencode(
            sorted(query_items, key=lambda item: item[0]))
        # NOTE(review): urlencode encodes spaces as '+', while the SigV4
        # documentation asks for '%20' - confirm whether any caller passes
        # values containing spaces.
        canonical_headers = ''
        for header_name, header_value in sorted(headers.items()):
            canonical_headers += '%s:%s\n' % (header_name.lower(), header_value)
        signed_headers = ';'.join([header.lower() for header in sorted(headers.keys())])
        canonical_request = '\n'.join([
            'GET',
            aws_dict['uri'],
            canonical_querystring,
            canonical_headers,
            signed_headers,
            # hash of the (empty) request payload
            aws_hash('')
        ])

        # Task 2: http://docs.aws.amazon.com/general/latest/gr/sigv4-create-string-to-sign.html
        credential_scope_list = [date, self._AWS_REGION, 'execute-api', 'aws4_request']
        credential_scope = '/'.join(credential_scope_list)
        string_to_sign = '\n'.join([self._AWS_ALGORITHM, amz_date, credential_scope, aws_hash(canonical_request)])

        # Task 3: http://docs.aws.amazon.com/general/latest/gr/sigv4-calculate-signature.html
        def aws_hmac(key, msg):
            return hmac.new(key, msg.encode('utf-8'), hashlib.sha256)

        def aws_hmac_digest(key, msg):
            return aws_hmac(key, msg).digest()

        def aws_hmac_hexdigest(key, msg):
            return aws_hmac(key, msg).hexdigest()

        # The signing key is derived by chaining HMACs over the scope parts.
        k_signing = ('AWS4' + aws_dict['secret_key']).encode('utf-8')
        for value in credential_scope_list:
            k_signing = aws_hmac_digest(k_signing, value)

        signature = aws_hmac_hexdigest(k_signing, string_to_sign)

        # Task 4: http://docs.aws.amazon.com/general/latest/gr/sigv4-add-signature-to-request.html
        headers['Authorization'] = ', '.join([
            '%s Credential=%s/%s' % (self._AWS_ALGORITHM, aws_dict['access_key'], credential_scope),
            'SignedHeaders=%s' % signed_headers,
            'Signature=%s' % signature,
        ])

        # The very same query string that was signed is sent with the request.
        return self._download_json(
            'https://%s%s%s' % (self._AWS_PROXY_HOST, aws_dict['uri'], '?' + canonical_querystring if canonical_querystring else ''),
            video_id, headers=headers)

View File

@@ -0,0 +1,65 @@
# coding: utf-8
from __future__ import unicode_literals
import json
from .common import InfoExtractor
from .kaltura import KalturaIE
class AZMedienIE(InfoExtractor):
    """Extractor for AZ Medien sites; videos are served through Kaltura."""

    IE_DESC = 'AZ Medien videos'
    # Verbose pattern: host, article slug ending in the numeric article ID,
    # and an optional "#video=<kaltura id>" fragment.
    _VALID_URL = r'''(?x)
https?://
(?:www\.)?
(?P<host>
telezueri\.ch|
telebaern\.tv|
telem1\.ch
)/
[^/]+/
(?P<id>
[^/]+-(?P<article_id>\d+)
)
(?:
\#video=
(?P<kaltura_id>
[_0-9a-z]+
)
)?
'''

    _TESTS = [{
        'url': 'https://www.telezueri.ch/sonntalk/bundesrats-vakanzen-eu-rahmenabkommen-133214569',
        'info_dict': {
            'id': '1_anruz3wy',
            'ext': 'mp4',
            'title': 'Bundesrats-Vakanzen / EU-Rahmenabkommen',
            'uploader_id': 'TVOnline',
            'upload_date': '20180930',
            'timestamp': 1538328802,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://www.telebaern.tv/telebaern-news/montag-1-oktober-2018-ganze-sendung-133531189#video=0_7xjo9lf1',
        'only_matching': True,
    }]
    _API_TEMPL = 'https://www.%s/api/pub/gql/%s/NewsArticleTeaser/a4016f65fe62b81dc6664dd9f4910e4ab40383be'
    _PARTNER_ID = '1719221'

    def _real_extract(self, url):
        """Determine the Kaltura entry ID and delegate to ``KalturaIE``."""
        host, display_id, article_id, entry_id = self._match_valid_url(url).groups()

        if not entry_id:
            # No "#video=" fragment: ask the site's API for the article's
            # main video. The second template slot is the first label of
            # the host name.
            api_url = self._API_TEMPL % (host, host.split('.')[0])
            variables = json.dumps({
                'contextId': 'NewsArticle:' + article_id,
            })
            article = self._download_json(
                api_url, display_id, query={'variables': variables})
            entry_id = article['data']['context']['mainAsset']['video']['kaltura']['kalturaId']

        kaltura_url = 'kaltura:%s:%s' % (self._PARTNER_ID, entry_id)
        return self.url_result(
            kaltura_url, ie=KalturaIE.ie_key(), video_id=entry_id)

View File

@@ -0,0 +1,55 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import unescapeHTML
class BaiduVideoIE(InfoExtractor):
    """Extractor for v.baidu.com series pages, returned as a playlist of episodes."""

    IE_DESC = '百度视频'
    _VALID_URL = r'https?://v\.baidu\.com/(?P<type>[a-z]+)/(?P<id>\d+)\.htm'
    _TESTS = [{
        'url': 'http://v.baidu.com/comic/1069.htm?frp=bdbrand&q=%E4%B8%AD%E5%8D%8E%E5%B0%8F%E5%BD%93%E5%AE%B6',
        'info_dict': {
            'id': '1069',
            'title': '中华小当家 TV版国语',
            'description': 'md5:51be07afe461cf99fa61231421b5397c',
        },
        'playlist_count': 52,
    }, {
        'url': 'http://v.baidu.com/show/11595.htm?frp=bdbrand',
        'info_dict': {
            'id': '11595',
            'title': 're:^奔跑吧兄弟',
            'description': 'md5:1bf88bad6d850930f542d51547c089b8',
        },
        'playlist_mincount': 12,
    }]

    def _call_api(self, path, category, playlist_id, note):
        """Download JSON from the app API endpoint ``path`` for one playlist."""
        # The category is appended directly to the "worktype" value.
        api_url = 'http://app.video.baidu.com/%s/?worktype=adnative%s&id=%s' % (
            path, category, playlist_id)
        return self._download_json(api_url, playlist_id, note)

    def _real_extract(self, url):
        """Fetch playlist details and episode list and combine them."""
        category, playlist_id = self._match_valid_url(url).groups()
        # Two URL categories go by a different name in the API.
        category = {'show': 'tvshow', 'tv': 'tvplay'}.get(category, category)

        playlist_detail = self._call_api(
            'xqinfo', category, playlist_id, 'Download playlist JSON metadata')
        playlist_title = playlist_detail['title']
        playlist_description = unescapeHTML(playlist_detail.get('intro'))

        episodes_detail = self._call_api(
            'xqsingle', category, playlist_id, 'Download episodes JSON metadata')

        entries = []
        for episode in episodes_detail['videos']:
            entries.append(self.url_result(
                episode['url'], video_title=episode['title']))

        return self.playlist_result(
            entries, playlist_id, playlist_title, playlist_description)

View File

@@ -0,0 +1,36 @@
# coding: utf-8
from __future__ import unicode_literals
from .brightcove import BrightcoveNewIE
from ..utils import extract_attributes
class BandaiChannelIE(BrightcoveNewIE):
    """Extractor for b-ch.com titles, which are served through a Brightcove player."""

    IE_NAME = 'bandaichannel'
    _VALID_URL = r'https?://(?:www\.)?b-ch\.com/titles/(?P<id>\d+/\d+)'
    _TESTS = [{
        'url': 'https://www.b-ch.com/titles/514/001',
        'md5': 'a0f2d787baa5729bed71108257f613a4',
        'info_dict': {
            'id': '6128044564001',
            'ext': 'mp4',
            'title': 'メタルファイターMIKU 第1話',
            'timestamp': 1580354056,
            'uploader_id': '5797077852001',
            'upload_date': '20200130',
            'duration': 1387.733,
        },
        'params': {
            'skip_download': True,
        },
    }]

    def _real_extract(self, url):
        """Read the player tag's data attributes and fetch the playback info."""
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        player_tag = self._search_regex(
            r'(<video-js[^>]+\bid="bcplayer"[^>]*>)', webpage, 'player')
        attrs = extract_attributes(player_tag)

        # 'data-info' names the item, 'data-auth' is sent as the API key.
        playback_info = self._download_json(
            'https://pbifcd.b-ch.com/v1/playbackinfo/ST/70/' + attrs['data-info'],
            video_id, headers={'X-API-KEY': attrs['data-auth'].strip()})
        bc = playback_info['bc']

        return self._parse_brightcove_metadata(bc, bc['id'])

View File

@@ -0,0 +1,431 @@
# coding: utf-8
from __future__ import unicode_literals
import random
import re
import time
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
ExtractorError,
float_or_none,
int_or_none,
KNOWN_EXTENSIONS,
parse_filesize,
str_or_none,
try_get,
update_url_query,
unified_strdate,
unified_timestamp,
url_or_none,
urljoin,
)
class BandcampIE(InfoExtractor):
    """Extractor for single tracks on *.bandcamp.com.

    Stream formats come from the page's ``data-tralbum`` attribute; if the
    track offers a free download page, its download formats are added too.
    """

    _VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<id>[^/?#&]+)'
    _TESTS = [{
        'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song',
        'md5': 'c557841d5e50261777a6585648adf439',
        'info_dict': {
            'id': '1812978515',
            'ext': 'mp3',
            'title': "youtube-dl \"'/\\ä↭ - youtube-dl \"'/\\ä↭ - youtube-dl test song \"'/\\ä↭",
            'duration': 9.8485,
            'uploader': 'youtube-dl "\'/\\ä↭',
            'upload_date': '20121129',
            'timestamp': 1354224127,
        },
        '_skip': 'There is a limit of 200 free downloads / month for the test song'
    }, {
        # free download
        'url': 'http://benprunty.bandcamp.com/track/lanius-battle',
        'info_dict': {
            'id': '2650410135',
            'ext': 'aiff',
            'title': 'Ben Prunty - Lanius (Battle)',
            'thumbnail': r're:^https?://.*\.jpg$',
            'uploader': 'Ben Prunty',
            'timestamp': 1396508491,
            'upload_date': '20140403',
            'release_timestamp': 1396483200,
            'release_date': '20140403',
            'duration': 260.877,
            'track': 'Lanius (Battle)',
            'track_number': 1,
            'track_id': '2650410135',
            'artist': 'Ben Prunty',
            'album': 'FTL: Advanced Edition Soundtrack',
        },
    }, {
        # no free download, mp3 128
        'url': 'https://relapsealumni.bandcamp.com/track/hail-to-fire',
        'md5': 'fec12ff55e804bb7f7ebeb77a800c8b7',
        'info_dict': {
            'id': '2584466013',
            'ext': 'mp3',
            'title': 'Mastodon - Hail to Fire',
            'thumbnail': r're:^https?://.*\.jpg$',
            'uploader': 'Mastodon',
            'timestamp': 1322005399,
            'upload_date': '20111122',
            'release_timestamp': 1076112000,
            'release_date': '20040207',
            'duration': 120.79,
            'track': 'Hail to Fire',
            'track_number': 5,
            'track_id': '2584466013',
            'artist': 'Mastodon',
            'album': 'Call of the Mastodon',
        },
    }]

    def _extract_data_attr(self, webpage, video_id, attr='tralbum', fatal=True):
        """Parse the JSON stored in the page's ``data-<attr>`` attribute.

        With fatal=False a missing attribute or invalid JSON yields None
        instead of raising.
        """
        # `fatal` is forwarded to the search as well, so that a missing
        # attribute is not an error for non-fatal lookups.
        raw_data = self._html_search_regex(
            r'data-%s=(["\'])({.+?})\1' % attr, webpage,
            attr + ' data', group=2, fatal=fatal)
        if raw_data is None:
            return None
        return self._parse_json(raw_data, video_id, fatal=fatal)

    def _real_extract(self, url):
        """Collect stream and free-download formats plus metadata for a track."""
        title = self._match_id(url)
        webpage = self._download_webpage(url, title)
        tralbum = self._extract_data_attr(webpage, title)
        thumbnail = self._og_search_thumbnail(webpage)

        track_id = None
        track = None
        track_number = None
        duration = None

        formats = []
        track_info = try_get(tralbum, lambda x: x['trackinfo'][0], dict)
        if track_info:
            file_ = track_info.get('file')
            if isinstance(file_, dict):
                for format_id, format_url in file_.items():
                    if not url_or_none(format_url):
                        continue
                    # Format IDs look like "<ext>-<bitrate>"; partition
                    # tolerates an ID without a dash (abr becomes None).
                    ext, _, abr_str = format_id.partition('-')
                    formats.append({
                        'format_id': format_id,
                        'url': self._proto_relative_url(format_url, 'http:'),
                        'ext': ext,
                        'vcodec': 'none',
                        'acodec': ext,
                        'abr': int_or_none(abr_str),
                    })
            track = track_info.get('title')
            track_id = str_or_none(
                track_info.get('track_id') or track_info.get('id'))
            track_number = int_or_none(track_info.get('track_num'))
            duration = float_or_none(track_info.get('duration'))

        # The embed data is optional; fall back to an empty dict so that the
        # lookups below cannot fail on None.
        embed = self._extract_data_attr(webpage, title, 'embed', False) or {}
        current = tralbum.get('current') or {}
        artist = embed.get('artist') or current.get('artist') or tralbum.get('artist')
        timestamp = unified_timestamp(
            current.get('publish_date') or tralbum.get('album_publish_date'))

        download_link = tralbum.get('freeDownloadPage')
        if download_link:
            track_id = compat_str(tralbum['id'])

            download_webpage = self._download_webpage(
                download_link, track_id, 'Downloading free downloads page')

            blob = self._extract_data_attr(download_webpage, track_id, 'blob')

            info = try_get(
                blob, (lambda x: x['digital_items'][0],
                       lambda x: x['download_items'][0]), dict)
            if info:
                downloads = info.get('downloads')
                if isinstance(downloads, dict):
                    # Fill in metadata the track page did not provide
                    if not track:
                        track = info.get('title')
                    if not artist:
                        artist = info.get('artist')
                    if not thumbnail:
                        thumbnail = info.get('thumb_url')

                    # Map of format name -> file extension (without dot)
                    download_formats = {}
                    download_formats_list = blob.get('download_formats')
                    if isinstance(download_formats_list, list):
                        for f in download_formats_list:
                            name, ext = f.get('name'), f.get('file_extension')
                            if all(isinstance(x, compat_str) for x in (name, ext)):
                                download_formats[name] = ext.strip('.')

                    for format_id, f in downloads.items():
                        format_url = f.get('url')
                        if not format_url:
                            continue
                        # Stat URL generation algorithm is reverse engineered from
                        # download_*_bundle_*.js
                        stat_url = update_url_query(
                            format_url.replace('/download/', '/statdownload/'), {
                                '.rand': int(time.time() * 1000 * random.random()),
                            })
                        format_id = f.get('encoding_name') or format_id
                        stat = self._download_json(
                            stat_url, track_id, 'Downloading %s JSON' % format_id,
                            # keep only the outermost {...} of the response
                            transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1],
                            fatal=False)
                        if not stat:
                            continue
                        retry_url = url_or_none(stat.get('retry_url'))
                        if not retry_url:
                            continue
                        formats.append({
                            'url': self._proto_relative_url(retry_url, 'http:'),
                            'ext': download_formats.get(format_id),
                            'format_id': format_id,
                            'format_note': f.get('description'),
                            'filesize': parse_filesize(f.get('size_mb')),
                            'vcodec': 'none',
                        })

        self._sort_formats(formats)

        # From here on `title` is the display title, no longer the URL slug.
        title = '%s - %s' % (artist, track) if artist else track

        if not duration:
            duration = float_or_none(self._html_search_meta(
                'duration', webpage, default=None))

        return {
            'id': track_id,
            'title': title,
            'thumbnail': thumbnail,
            'uploader': artist,
            'timestamp': timestamp,
            'release_timestamp': unified_timestamp(tralbum.get('album_release_date')),
            'duration': duration,
            'track': track,
            'track_number': track_number,
            'track_id': track_id,
            'artist': artist,
            'album': embed.get('album_title'),
            'formats': formats,
        }
class BandcampAlbumIE(BandcampIE):
    """Playlist extractor for Bandcamp album pages and artist front pages.

    Every track that reports a duration is handed to BandcampIE as a
    separate playlist entry.
    """

    IE_NAME = 'Bandcamp:album'
    _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?!/music)(?:/album/(?P<id>[^/?#&]+))?'
    _TESTS = [{
        'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1',
        'playlist': [
            {
                'md5': '39bc1eded3476e927c724321ddf116cf',
                'info_dict': {
                    'id': '1353101989',
                    'ext': 'mp3',
                    'title': 'Blazo - Intro',
                    'timestamp': 1311756226,
                    'upload_date': '20110727',
                    'uploader': 'Blazo',
                }
            },
            {
                'md5': '1a2c32e2691474643e912cc6cd4bffaa',
                'info_dict': {
                    'id': '38097443',
                    'ext': 'mp3',
                    'title': 'Blazo - Kero One - Keep It Alive (Blazo remix)',
                    'timestamp': 1311757238,
                    'upload_date': '20110727',
                    'uploader': 'Blazo',
                }
            },
        ],
        'info_dict': {
            'title': 'Jazz Format Mixtape vol.1',
            'id': 'jazz-format-mixtape-vol-1',
            'uploader_id': 'blazo',
        },
        'params': {
            'playlistend': 2
        },
        'skip': 'Bandcamp imposes download limits.'
    }, {
        'url': 'http://nightbringer.bandcamp.com/album/hierophany-of-the-open-grave',
        'info_dict': {
            'title': 'Hierophany of the Open Grave',
            'uploader_id': 'nightbringer',
            'id': 'hierophany-of-the-open-grave',
        },
        'playlist_mincount': 9,
    }, {
        'url': 'http://dotscale.bandcamp.com',
        'info_dict': {
            'title': 'Loom',
            'id': 'dotscale',
            'uploader_id': 'dotscale',
        },
        'playlist_mincount': 7,
    }, {
        # with escaped quote in title
        'url': 'https://jstrecords.bandcamp.com/album/entropy-ep',
        'info_dict': {
            'title': '"Entropy" EP',
            'uploader_id': 'jstrecords',
            'id': 'entropy-ep',
            'description': 'md5:0ff22959c943622972596062f2f366a5',
        },
        'playlist_mincount': 3,
    }, {
        # not all tracks have songs
        'url': 'https://insulters.bandcamp.com/album/we-are-the-plague',
        'info_dict': {
            'id': 'we-are-the-plague',
            'title': 'WE ARE THE PLAGUE',
            'uploader_id': 'insulters',
            'description': 'md5:b3cf845ee41b2b1141dc7bde9237255f',
        },
        'playlist_count': 2,
    }]

    @classmethod
    def suitable(cls, url):
        """Return True only if neither BandcampWeeklyIE nor BandcampIE claims ``url``."""
        # The /album/ part of _VALID_URL is optional, so the pattern alone is
        # loose; the more specific extractors get the first say.
        if BandcampWeeklyIE.suitable(url) or BandcampIE.suitable(url):
            return False
        return super().suitable(url)

    def _real_extract(self, url):
        """Build a playlist result from the track list embedded in the page.

        Raises ExtractorError when the page data carries no ``trackinfo``.
        """
        uploader_id, album_id = self._match_valid_url(url).groups()
        # Artist front pages have no album slug; fall back to the subdomain.
        playlist_id = album_id or uploader_id

        webpage = self._download_webpage(url, playlist_id)
        tralbum = self._extract_data_attr(webpage, playlist_id)

        track_info = tralbum.get('trackinfo')
        if not track_info:
            raise ExtractorError('The page doesn\'t contain any tracks')

        entries = []
        for track in track_info:
            # Only tracks with duration info have songs
            if not track.get('duration'):
                continue
            entries.append(self.url_result(
                urljoin(url, track['title_link']),
                BandcampIE.ie_key(),
                str_or_none(track.get('track_id') or track.get('id')),
                track.get('title')))

        current = tralbum.get('current') or {}

        return {
            '_type': 'playlist',
            'uploader_id': uploader_id,
            'id': playlist_id,
            'title': current.get('title'),
            'description': current.get('about'),
            'entries': entries,
        }
class BandcampWeeklyIE(BandcampIE):
    """Extractor for Bandcamp Weekly shows (``bandcamp.com/?show=<id>``)."""

    IE_NAME = 'Bandcamp:weekly'
    _VALID_URL = r'https?://(?:www\.)?bandcamp\.com/?\?(?:.*?&)?show=(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://bandcamp.com/?show=224',
        'md5': 'b00df799c733cf7e0c567ed187dea0fd',
        'info_dict': {
            'id': '224',
            'ext': 'opus',
            'title': 'BC Weekly April 4th 2017 - Magic Moments',
            'description': 'md5:5d48150916e8e02d030623a48512c874',
            'duration': 5829.77,
            'release_date': '20170404',
            'series': 'Bandcamp Weekly',
            'episode': 'Magic Moments',
            'episode_id': '224',
        },
        'params': {
            'format': 'opus-lo',
        },
    }, {
        'url': 'https://bandcamp.com/?blah/blah@&show=228',
        'only_matching': True
    }]

    def _real_extract(self, url):
        """Return the info dict for one show read from the page's ``blob`` data."""
        show_id = self._match_id(url)
        webpage = self._download_webpage(url, show_id)

        blob = self._extract_data_attr(webpage, show_id, 'blob')
        show = blob['bcw_data'][show_id]

        formats = []
        for format_id, format_url in show['audio_stream'].items():
            if not url_or_none(format_url):
                continue
            # First known extension contained in the format id, else None.
            ext = next(
                (known_ext for known_ext in KNOWN_EXTENSIONS
                 if known_ext in format_id),
                None)
            formats.append({
                'format_id': format_id,
                'url': format_url,
                'ext': ext,
                'vcodec': 'none',
            })
        self._sort_formats(formats)

        subtitle = show.get('subtitle')
        title = show.get('audio_title') or 'Bandcamp Weekly'
        if subtitle:
            title = title + ' - %s' % subtitle

        return {
            'id': show_id,
            'title': title,
            'description': show.get('desc') or show.get('short_desc'),
            'duration': float_or_none(show.get('audio_duration')),
            'is_live': False,
            'release_date': unified_strdate(show.get('published_date')),
            'series': 'Bandcamp Weekly',
            'episode': subtitle,
            'episode_id': show_id,
            'formats': formats
        }
class BandcampMusicIE(InfoExtractor):
    """Playlist extractor for an artist's ``/music`` page.

    Every album and track link found in the page becomes one entry, routed
    to the extractor registered for its type in ``_TYPE_IE_DICT``.
    """

    _VALID_URL = r'https?://(?P<id>[^/]+)\.bandcamp\.com/music'
    _TESTS = [{
        'url': 'https://steviasphere.bandcamp.com/music',
        'playlist_mincount': 47,
        'info_dict': {
            'id': 'steviasphere',
        },
    }, {
        'url': 'https://coldworldofficial.bandcamp.com/music',
        'playlist_mincount': 10,
        'info_dict': {
            'id': 'coldworldofficial',
        },
    }, {
        'url': 'https://nuclearwarnowproductions.bandcamp.com/music',
        'playlist_mincount': 399,
        'info_dict': {
            'id': 'nuclearwarnowproductions',
        },
    }
    ]

    # Maps the link type captured by the regex to the extractor key.
    _TYPE_IE_DICT = {
        'album': BandcampAlbumIE.ie_key(),
        'track': BandcampIE.ie_key()
    }

    def _real_extract(self, url):
        """Collect all album/track hrefs from the page into a playlist."""
        uploader = self._match_id(url)
        webpage = self._download_webpage(url, uploader)

        entries = []
        # findall yields (path, type) tuples because the pattern has two groups.
        for path, item_type in re.findall(
                r'href\=\"\/(?P<path>(?P<type>album|track)+/[^\"]+)', webpage):
            entries.append(self.url_result(
                f'https://{uploader}.bandcamp.com/{path}',
                ie=self._TYPE_IE_DICT[item_type]))

        return self.playlist_result(entries, uploader)

View File

@@ -0,0 +1,158 @@
from __future__ import unicode_literals
import json
from .common import InfoExtractor
from ..utils import (
try_get,
int_or_none,
url_or_none,
float_or_none,
unified_timestamp,
)
class BannedVideoIE(InfoExtractor):
    """Extractor for banned.video, backed by the infowarsmedia GraphQL API."""

    # NOTE: the id class used to be ``[0-f]``, a range that also covers
    # punctuation and G-Z; it is now restricted to hexadecimal digits.
    _VALID_URL = r'https?://(?:www\.)?banned\.video/watch\?id=(?P<id>[0-9a-fA-F]{24})'
    _TESTS = [{
        'url': 'https://banned.video/watch?id=5e7a859644e02200c6ef5f11',
        'md5': '14b6e81d41beaaee2215cd75c6ed56e4',
        'info_dict': {
            'id': '5e7a859644e02200c6ef5f11',
            'ext': 'mp4',
            'title': 'China Discovers Origin of Corona Virus: Issues Emergency Statement',
            'thumbnail': r're:^https?://(?:www\.)?assets\.infowarsmedia.com/images/',
            'description': 'md5:560d96f02abbebe6c6b78b47465f6b28',
            'upload_date': '20200324',
            'timestamp': 1585087895,
        }
    }]

    _GRAPHQL_GETMETADATA_QUERY = '''
query GetVideoAndComments($id: String!) {
getVideo(id: $id) {
streamUrl
directUrl
unlisted
live
tags {
name
}
title
summary
playCount
largeImage
videoDuration
channel {
_id
title
}
createdAt
}
getVideoComments(id: $id, limit: 999999, offset: 0) {
_id
content
user {
_id
username
}
voteCount {
positive
}
createdAt
replyCount
}
}'''

    _GRAPHQL_GETCOMMENTSREPLIES_QUERY = '''
query GetCommentReplies($id: String!) {
getCommentReplies(id: $id, limit: 999999, offset: 0) {
_id
content
user {
_id
username
}
voteCount {
positive
}
createdAt
replyCount
}
}'''

    # Operation name -> query text, looked up by _call_api.
    _GRAPHQL_QUERIES = {
        'GetVideoAndComments': _GRAPHQL_GETMETADATA_QUERY,
        'GetCommentReplies': _GRAPHQL_GETCOMMENTSREPLIES_QUERY,
    }

    def _call_api(self, video_id, id, operation, note):
        """POST the named GraphQL operation and return the response's ``data``.

        ``video_id`` is only used for progress/error reporting; ``id`` is the
        value bound to the query's ``$id`` variable. The result may be None
        when the response carries no ``data`` key.
        """
        return self._download_json(
            'https://api.infowarsmedia.com/graphql', video_id, note=note,
            headers={
                'Content-Type': 'application/json; charset=utf-8'
            }, data=json.dumps({
                'variables': {'id': id},
                'operationName': operation,
                'query': self._GRAPHQL_QUERIES[operation]
            }).encode('utf8')).get('data')

    def _get_comments(self, video_id, comments, comment_data):
        """Yield the already parsed root comments, then every reply.

        ``comments`` are the parsed top-level comments; ``comment_data`` is
        the raw API list they were built from. Replies are fetched with one
        API call per comment that reports a positive ``replyCount``.
        """
        yield from comments
        # Iterate a snapshot; tolerate a null comment list from the API.
        for comment in list(comment_data or ()):
            comment_id = comment.get('_id')
            # replyCount may be missing/null; comparing None > 0 would raise.
            if (int_or_none(comment.get('replyCount')) or 0) <= 0:
                continue
            reply_json = self._call_api(
                video_id, comment_id, 'GetCommentReplies',
                f'Downloading replies for comment {comment_id}') or {}
            for reply in reply_json.get('getCommentReplies') or []:
                yield self._parse_comment(reply, comment_id)

    @staticmethod
    def _parse_comment(comment_data, parent):
        """Map one raw API comment to a comment dict with the given parent id."""
        return {
            'id': comment_data.get('_id'),
            'text': comment_data.get('content'),
            'author': try_get(comment_data, lambda x: x['user']['username']),
            'author_id': try_get(comment_data, lambda x: x['user']['_id']),
            'timestamp': unified_timestamp(comment_data.get('createdAt')),
            'parent': parent,
            'like_count': try_get(comment_data, lambda x: x['voteCount']['positive']),
        }

    def _real_extract(self, url):
        """Fetch metadata and root comments in one query and build the info dict."""
        video_id = self._match_id(url)
        video_json = self._call_api(video_id, video_id, 'GetVideoAndComments', 'Downloading video metadata')
        video_info = video_json['getVideo']
        is_live = video_info.get('live')

        # Both lists may come back null; treat that as "none".
        comment_data = video_json.get('getVideoComments') or []
        comments = [self._parse_comment(comment, 'root') for comment in comment_data]

        formats = []
        direct_url = url_or_none(video_info.get('directUrl'))
        if direct_url:
            formats.append({
                'format_id': 'direct',
                'quality': 1,
                'url': direct_url,
                'ext': 'mp4',
            })
        stream_url = video_info.get('streamUrl')
        if stream_url:
            formats.extend(self._extract_m3u8_formats(
                stream_url, video_id, 'mp4',
                entry_protocol='m3u8_native', m3u8_id='hls', live=True))
        self._sort_formats(formats)

        return {
            'id': video_id,
            # Drops the last character of the API title.
            # NOTE(review): presumably a trailing newline/space - confirm.
            'title': video_info.get('title')[:-1],
            'formats': formats,
            'is_live': is_live,
            'description': video_info.get('summary'),
            'channel': try_get(video_info, lambda x: x['channel']['title']),
            'channel_id': try_get(video_info, lambda x: x['channel']['_id']),
            'view_count': int_or_none(video_info.get('playCount')),
            'thumbnail': url_or_none(video_info.get('largeImage')),
            'duration': float_or_none(video_info.get('videoDuration')),
            'timestamp': unified_timestamp(video_info.get('createdAt')),
            'tags': [tag.get('name') for tag in video_info.get('tags') or []],
            'availability': self._availability(is_unlisted=video_info.get('unlisted')),
            'comments': comments,
            '__post_extractor': self.extract_comments(video_id, comments, comment_data)
        }

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,101 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import int_or_none
class BeatportIE(InfoExtractor):
    """Extractor for Beatport track pages (preview streams only)."""

    _VALID_URL = r'https?://(?:www\.|pro\.)?beatport\.com/track/(?P<display_id>[^/]+)/(?P<id>[0-9]+)'
    _TESTS = [{
        'url': 'https://beatport.com/track/synesthesia-original-mix/5379371',
        'md5': 'b3c34d8639a2f6a7f734382358478887',
        'info_dict': {
            'id': '5379371',
            'display_id': 'synesthesia-original-mix',
            'ext': 'mp4',
            'title': 'Froxic - Synesthesia (Original Mix)',
        },
    }, {
        'url': 'https://beatport.com/track/love-and-war-original-mix/3756896',
        'md5': 'e44c3025dfa38c6577fbaeb43da43514',
        'info_dict': {
            'id': '3756896',
            'display_id': 'love-and-war-original-mix',
            'ext': 'mp3',
            'title': 'Wolfgang Gartner - Love & War (Original Mix)',
        },
    }, {
        'url': 'https://beatport.com/track/birds-original-mix/4991738',
        'md5': 'a1fd8e8046de3950fd039304c186c05f',
        'info_dict': {
            'id': '4991738',
            'display_id': 'birds-original-mix',
            'ext': 'mp4',
            'title': "Tos, Middle Milk, Mumblin' Johnsson - Birds (Original Mix)",
        }
    }]

    def _real_extract(self, url):
        """Parse ``window.Playables`` from the page and return the track info.

        The track is located by matching its numeric id against the URL id;
        formats come from the track's ``preview`` map and thumbnails from its
        ``images`` map (the ``dynamic`` image entry is skipped).
        """
        mobj = self._match_valid_url(url)
        track_id = mobj.group('id')
        display_id = mobj.group('display_id')

        webpage = self._download_webpage(url, display_id)

        playables = self._parse_json(
            self._search_regex(
                r'window\.Playables\s*=\s*({.+?});', webpage,
                'playables info', flags=re.DOTALL),
            track_id)

        track = next(t for t in playables['tracks'] if t['id'] == int(track_id))

        title = ', '.join((a['name'] for a in track['artists'])) + ' - ' + track['name']
        if track['mix']:
            title += ' (' + track['mix'] + ')'

        # Audio codec per preview container; both share bitrate/sample rate.
        preview_acodecs = {'mp3': 'mp3', 'mp4': 'aac'}

        formats = []
        for ext, info in track['preview'].items():
            if not info['url']:
                continue
            fmt = {
                'url': info['url'],
                'ext': ext,
                'format_id': ext,
                'vcodec': 'none',
            }
            acodec = preview_acodecs.get(ext)
            if acodec:
                fmt.update({
                    'acodec': acodec,
                    'abr': 96,
                    'asr': 44100,
                })
            formats.append(fmt)
        self._sort_formats(formats)

        images = []
        for name, info in track['images'].items():
            image_url = info.get('url')
            if name == 'dynamic' or not image_url:
                continue
            images.append({
                'id': name,
                'url': image_url,
                'height': int_or_none(info.get('height')),
                'width': int_or_none(info.get('width')),
            })

        return {
            # Apply the fallback BEFORE stringifying: compat_str(None) is the
            # truthy string 'None', so ``compat_str(x) or track_id`` could
            # never reach track_id.
            'id': compat_str(track.get('id') or track_id),
            'display_id': track.get('slug') or display_id,
            'title': title,
            'formats': formats,
            'thumbnails': images,
        }

View File

@@ -0,0 +1,116 @@
from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import (
compat_str,
)
from ..utils import (
int_or_none,
parse_qs,
unified_timestamp,
)
class BeegIE(InfoExtractor):
    """Extractor for beeg.com / beeg.porn videos via the site's v6 JSON API."""

    _VALID_URL = r'https?://(?:www\.)?beeg\.(?:com|porn(?:/video)?)/(?P<id>\d+)'
    _TESTS = [{
        # api/v6 v1
        'url': 'http://beeg.com/5416503',
        'md5': 'a1a1b1a8bc70a89e49ccfd113aed0820',
        'info_dict': {
            'id': '5416503',
            'ext': 'mp4',
            'title': 'Sultry Striptease',
            'description': 'md5:d22219c09da287c14bed3d6c37ce4bc2',
            'timestamp': 1391813355,
            'upload_date': '20140207',
            'duration': 383,
            'tags': list,
            'age_limit': 18,
        }
    }, {
        # api/v6 v2
        'url': 'https://beeg.com/1941093077?t=911-1391',
        'only_matching': True,
    }, {
        # api/v6 v2 w/o t
        'url': 'https://beeg.com/1277207756',
        'only_matching': True,
    }, {
        'url': 'https://beeg.porn/video/5416503',
        'only_matching': True,
    }, {
        'url': 'https://beeg.porn/5416503',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Query the API (main host first, then ``api.``) and build the info dict."""
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)

        beeg_version = self._search_regex(
            r'beeg_version\s*=\s*([\da-zA-Z_-]+)', webpage, 'beeg version',
            default='1546225636701')

        # Ids of 10+ digits use API variant 2, which also accepts an optional
        # start/end pair taken from the URL's ``t=<s>-<e>`` parameter.
        query = {'v': 1}
        if len(video_id) >= 10:
            query = {'v': 2}
            time_range = parse_qs(url).get('t', [''])[0].split('-')
            if len(time_range) > 1:
                query['s'] = time_range[0]
                query['e'] = time_range[1]

        for api_path in ('', 'api.'):
            # Only the last host is fatal, so the first may fail silently.
            is_last_host = api_path == 'api.'
            api_url = 'https://%sbeeg.com/api/v6/%s/video/%s' % (
                api_path, beeg_version, video_id)
            video = self._download_json(
                api_url, video_id, fatal=is_last_host, query=query)
            if video:
                break

        formats = []
        for format_id, video_url in video.items():
            if not video_url:
                continue
            # Only keys shaped like "720p" describe a downloadable format.
            height = self._search_regex(
                r'^(\d+)[pP]$', format_id, 'height', default=None)
            if not height:
                continue
            resolved_url = video_url.replace(
                '{DATA_MARKERS}', 'data=pc_XX__%s_0' % beeg_version)
            formats.append({
                'url': self._proto_relative_url(resolved_url, 'https:'),
                'format_id': format_id,
                'height': int(height),
            })
        self._sort_formats(formats)

        raw_tags = video.get('tags')
        tags = [tag.strip() for tag in video['tags'].split(',')] if raw_tags else None

        return {
            'id': compat_str(video.get('id') or video_id),
            'display_id': video.get('code'),
            'title': video['title'],
            'description': video.get('desc'),
            'series': video.get('ps_name'),
            'timestamp': unified_timestamp(video.get('date')),
            'duration': int_or_none(video.get('duration')),
            'tags': tags,
            'formats': formats,
            'age_limit': self._rta_search(webpage),
        }

View File

@@ -0,0 +1,45 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import url_basename
class BehindKinkIE(InfoExtractor):
    """Extractor for behindkink.com posts (date-based permalink URLs)."""

    _VALID_URL = r'https?://(?:www\.)?behindkink\.com/(?P<year>[0-9]{4})/(?P<month>[0-9]{2})/(?P<day>[0-9]{2})/(?P<id>[^/#?_]+)'
    _TEST = {
        'url': 'http://www.behindkink.com/2014/12/05/what-are-you-passionate-about-marley-blaze/',
        'md5': '507b57d8fdcd75a41a9a7bdb7989c762',
        'info_dict': {
            'id': '37127',
            'ext': 'mp4',
            'title': 'What are you passionate about Marley Blaze',
            'description': 'md5:aee8e9611b4ff70186f752975d9b94b4',
            'upload_date': '20141205',
            'thumbnail': 'http://www.behindkink.com/wp-content/uploads/2014/12/blaze-1.jpg',
            'age_limit': 18,
        }
    }

    def _real_extract(self, url):
        """Return the info dict for the first ``<source>`` found in the page.

        The video id is the part of the media file name before the first
        underscore; the upload date is taken from the URL path.
        """
        mobj = self._match_valid_url(url)
        display_id = mobj.group('id')

        webpage = self._download_webpage(url, display_id)

        video_url = self._search_regex(
            r'<source src="([^"]+)"', webpage, 'video URL')
        file_name = url_basename(video_url)
        video_id = file_name.split('_')[0]

        year, month, day = mobj.group('year'), mobj.group('month'), mobj.group('day')

        return {
            'id': video_id,
            'display_id': display_id,
            'url': video_url,
            'title': self._og_search_title(webpage),
            'thumbnail': self._og_search_thumbnail(webpage),
            'description': self._og_search_description(webpage),
            'upload_date': year + month + day,
            'age_limit': 18,
        }

View File

@@ -0,0 +1,87 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
class BellMediaIE(InfoExtractor):
    """URL-transparent extractor for Bell Media sites.

    It only maps the page URL to a ``9c9media:`` URL that is handled by the
    NineCNineMedia extractor; no page is downloaded here.
    """

    # Verbose-mode pattern: whitespace and newlines inside it are ignored.
    _VALID_URL = r'''(?x)https?://(?:www\.)?
(?P<domain>
(?:
ctv|
tsn|
bnn(?:bloomberg)?|
thecomedynetwork|
discovery|
discoveryvelocity|
sciencechannel|
investigationdiscovery|
animalplanet|
bravo|
mtv|
space|
etalk|
marilyn
)\.ca|
(?:much|cp24)\.com
)/.*?(?:\b(?:vid(?:eoid)?|clipId)=|-vid|~|%7E|/(?:episode)?)(?P<id>[0-9]{6,})'''
    _TESTS = [{
        'url': 'https://www.bnnbloomberg.ca/video/david-cockfield-s-top-picks~1403070',
        'md5': '36d3ef559cfe8af8efe15922cd3ce950',
        'info_dict': {
            'id': '1403070',
            'ext': 'flv',
            'title': 'David Cockfield\'s Top Picks',
            'description': 'md5:810f7f8c6a83ad5b48677c3f8e5bb2c3',
            'upload_date': '20180525',
            'timestamp': 1527288600,
        },
    }, {
        'url': 'http://www.thecomedynetwork.ca/video/player?vid=923582',
        'only_matching': True,
    }, {
        'url': 'http://www.tsn.ca/video/expectations-high-for-milos-raonic-at-us-open~939549',
        'only_matching': True,
    }, {
        'url': 'http://www.bnn.ca/video/berman-s-call-part-two-viewer-questions~939654',
        'only_matching': True,
    }, {
        'url': 'http://www.ctv.ca/YourMorning/Video/S1E6-Monday-August-29-2016-vid938009',
        'only_matching': True,
    }, {
        'url': 'http://www.much.com/shows/atmidnight/episode948007/tuesday-september-13-2016',
        'only_matching': True,
    }, {
        'url': 'http://www.much.com/shows/the-almost-impossible-gameshow/928979/episode-6',
        'only_matching': True,
    }, {
        'url': 'http://www.ctv.ca/DCs-Legends-of-Tomorrow/Video/S2E11-Turncoat-vid1051430',
        'only_matching': True,
    }, {
        'url': 'http://www.etalk.ca/video?videoid=663455',
        'only_matching': True,
    }, {
        'url': 'https://www.cp24.com/video?clipId=1982548',
        'only_matching': True,
    }]

    # Site name (first host label) -> name used in the 9c9media URL.
    # Sites that are not listed are passed through unchanged.
    _DOMAINS = {
        'thecomedynetwork': 'comedy',
        'discoveryvelocity': 'discvel',
        'sciencechannel': 'discsci',
        'investigationdiscovery': 'invdisc',
        'animalplanet': 'aniplan',
        'etalk': 'ctv',
        'bnnbloomberg': 'bnn',
        'marilyn': 'ctv_marilyn',
    }

    def _real_extract(self, url):
        """Translate ``url`` into a ``9c9media:<site>_web:<id>`` transparent URL."""
        mobj = self._match_valid_url(url)
        domain, video_id = mobj.group('domain', 'id')
        # Keep only the first host label, e.g. "ctv" from "ctv.ca".
        site = domain.partition('.')[0]
        site = self._DOMAINS.get(site, site)
        return {
            '_type': 'url_transparent',
            'id': video_id,
            'url': '9c9media:%s_web:%s' % (site, video_id),
            'ie_key': 'NineCNineMedia',
        }

View File

@@ -0,0 +1,82 @@
from __future__ import unicode_literals
from .mtv import MTVServicesInfoExtractor
from ..utils import unified_strdate
# TODO Remove - Reason: Outdated Site
class BetIE(MTVServicesInfoExtractor):
    """Extractor for bet.com pages, built on the MTV services feed machinery."""

    _VALID_URL = r'https?://(?:www\.)?bet\.com/(?:[^/]+/)+(?P<id>.+?)\.html'
    _TESTS = [
        {
            'url': 'http://www.bet.com/news/politics/2014/12/08/in-bet-exclusive-obama-talks-race-and-racism.html',
            'info_dict': {
                'id': '07e96bd3-8850-3051-b856-271b457f0ab8',
                'display_id': 'in-bet-exclusive-obama-talks-race-and-racism',
                'ext': 'flv',
                'title': 'A Conversation With President Obama',
                'description': 'President Obama urges persistence in confronting racism and bias.',
                'duration': 1534,
                'upload_date': '20141208',
                'thumbnail': r're:(?i)^https?://.*\.jpg$',
                'subtitles': {
                    'en': 'mincount:2',
                }
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
        },
        {
            'url': 'http://www.bet.com/video/news/national/2014/justice-for-ferguson-a-community-reacts.html',
            'info_dict': {
                'id': '9f516bf1-7543-39c4-8076-dd441b459ba9',
                'display_id': 'justice-for-ferguson-a-community-reacts',
                'ext': 'flv',
                'title': 'Justice for Ferguson: A Community Reacts',
                'description': 'A BET News special.',
                'duration': 1696,
                'upload_date': '20141125',
                'thumbnail': r're:(?i)^https?://.*\.jpg$',
                'subtitles': {
                    'en': 'mincount:2',
                }
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
        }
    ]

    _FEED_URL = "http://feeds.mtvnservices.com/od/feed/bet-mrss-player"

    def _get_feed_query(self, uri):
        """Return the feed query parameters: the uri is sent as ``uuid``."""
        return {'uuid': uri}

    def _extract_mgid(self, webpage):
        """Return the value of the first ``data-uri`` attribute in the page."""
        return self._search_regex(r'data-uri="([^"]+)', webpage, 'mgid')

    def _real_extract(self, url):
        """Return the first feed entry, enriched with page-level metadata."""
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)

        mgid = self._extract_mgid(webpage)
        info_dict = self._get_videos_info(mgid)['entries'][0]

        # Date and description come from the page's <meta> tags, not the feed.
        upload_date = unified_strdate(self._html_search_meta('date', webpage))
        description = self._html_search_meta('description', webpage)

        info_dict['display_id'] = display_id
        info_dict['description'] = description
        info_dict['upload_date'] = upload_date
        return info_dict

View File

@@ -0,0 +1,37 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import extract_attributes
class BFIPlayerIE(InfoExtractor):
    """Playlist extractor for BFI Player film pages (Ooyala-hosted videos)."""

    IE_NAME = 'bfi:player'
    _VALID_URL = r'https?://player\.bfi\.org\.uk/[^/]+/film/watch-(?P<id>[\w-]+)-online'
    _TEST = {
        'url': 'https://player.bfi.org.uk/free/film/watch-computer-doctor-1974-online',
        'md5': 'e8783ebd8e061ec4bc6e9501ed547de8',
        'info_dict': {
            'id': 'htNnhlZjE60C9VySkQEIBtU-cNV1Xx63',
            'ext': 'mp4',
            'title': 'Computer Doctor',
            'description': 'md5:fb6c240d40c4dbe40428bdd62f78203b',
        },
        'skip': 'BFI Player films cannot be played outside of the UK',
    }

    def _real_extract(self, url):
        """Return one Ooyala entry per ``class="player"`` element with a video id."""
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        entries = []
        player_tags = re.findall(r'(?s)<[^>]+class="player"[^>]*>', webpage)
        for player_tag in player_tags:
            attrs = extract_attributes(player_tag)
            ooyala_id = attrs.get('data-video-id')
            if ooyala_id:
                entries.append(self.url_result(
                    'ooyala:' + ooyala_id, 'Ooyala',
                    ooyala_id, attrs.get('data-label')))

        return self.playlist_result(entries)

View File

@@ -0,0 +1,103 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import extract_attributes
class BFMTVBaseIE(InfoExtractor):
    """Shared URL patterns and Brightcove helper for the BFMTV extractors."""

    _VALID_URL_BASE = r'https?://(?:www\.)?bfmtv\.com/'
    # ``%s`` is filled by subclasses with the page-type letter ('V', 'A').
    _VALID_URL_TMPL = _VALID_URL_BASE + r'(?:[^/]+/)*[^/?&#]+_%s[A-Z]-(?P<id>\d{12})\.html'
    _VIDEO_BLOCK_REGEX = r'(<div[^>]+class="video_block"[^>]*>)'
    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s'

    def _brightcove_url_result(self, video_id, video_block):
        """Return a BrightcoveNew url_result for ``video_id``.

        ``video_block`` is the attribute dict of the video block element;
        its ``accountid``/``playerid`` override the built-in defaults.
        """
        account_id = video_block.get('accountid') or '876450612001'
        player_id = video_block.get('playerid') or 'I2qBTln4u'
        brightcove_url = self.BRIGHTCOVE_URL_TEMPLATE % (
            account_id, player_id, video_id)
        return self.url_result(brightcove_url, 'BrightcoveNew', video_id)
class BFMTVIE(BFMTVBaseIE):
    """Extractor for single-video BFMTV pages (``_V?-<id>.html`` URLs)."""

    IE_NAME = 'bfmtv'
    _VALID_URL = BFMTVBaseIE._VALID_URL_TMPL % 'V'
    _TESTS = [{
        'url': 'https://www.bfmtv.com/politique/emmanuel-macron-l-islam-est-une-religion-qui-vit-une-crise-aujourd-hui-partout-dans-le-monde_VN-202010020146.html',
        'info_dict': {
            'id': '6196747868001',
            'ext': 'mp4',
            'title': 'Emmanuel Macron: "L\'Islam est une religion qui vit une crise aujourdhui, partout dans le monde"',
            'description': 'Le Président s\'exprime sur la question du séparatisme depuis les Mureaux, dans les Yvelines.',
            'uploader_id': '876450610001',
            'upload_date': '20201002',
            'timestamp': 1601629620,
        },
    }]

    def _real_extract(self, url):
        """Resolve the page's first video block to its Brightcove video."""
        bfmtv_id = self._match_id(url)
        webpage = self._download_webpage(url, bfmtv_id)
        block_html = self._search_regex(
            self._VIDEO_BLOCK_REGEX, webpage, 'video block')
        video_block = extract_attributes(block_html)
        return self._brightcove_url_result(video_block['videoid'], video_block)
class BFMTVLiveIE(BFMTVIE):
    """Extractor for BFMTV live pages (``.../en-direct``).

    Only the URL pattern and tests differ; extraction is inherited from
    BFMTVIE.
    """

    IE_NAME = 'bfmtv:live'
    _VALID_URL = BFMTVBaseIE._VALID_URL_BASE + '(?P<id>(?:[^/]+/)?en-direct)'
    _TESTS = [{
        'url': 'https://www.bfmtv.com/en-direct/',
        'info_dict': {
            'id': '5615950982001',
            'ext': 'mp4',
            'title': r're:^le direct BFMTV WEB \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
            'uploader_id': '876450610001',
            'upload_date': '20171018',
            'timestamp': 1508329950,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://www.bfmtv.com/economie/en-direct/',
        'only_matching': True,
    }]
class BFMTVArticleIE(BFMTVBaseIE):
    """Playlist extractor for BFMTV article pages (``_A?-<id>.html`` URLs)."""

    IE_NAME = 'bfmtv:article'
    _VALID_URL = BFMTVBaseIE._VALID_URL_TMPL % 'A'
    _TESTS = [{
        'url': 'https://www.bfmtv.com/sante/covid-19-un-responsable-de-l-institut-pasteur-se-demande-quand-la-france-va-se-reconfiner_AV-202101060198.html',
        'info_dict': {
            'id': '202101060198',
            'title': 'Covid-19: un responsable de l\'Institut Pasteur se demande "quand la France va se reconfiner"',
            'description': 'md5:947974089c303d3ac6196670ae262843',
        },
        'playlist_count': 2,
    }, {
        'url': 'https://www.bfmtv.com/international/pour-bolsonaro-le-bresil-est-en-faillite-mais-il-ne-peut-rien-faire_AD-202101060232.html',
        'only_matching': True,
    }, {
        'url': 'https://www.bfmtv.com/sante/covid-19-oui-le-vaccin-de-pfizer-distribue-en-france-a-bien-ete-teste-sur-des-personnes-agees_AN-202101060275.html',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return a playlist with one entry per video block that has a ``videoid``."""
        bfmtv_id = self._match_id(url)
        webpage = self._download_webpage(url, bfmtv_id)

        entries = []
        for block_html in re.findall(self._VIDEO_BLOCK_REGEX, webpage):
            video_block = extract_attributes(block_html)
            video_id = video_block.get('videoid')
            if video_id:
                entries.append(
                    self._brightcove_url_result(video_id, video_block))

        playlist_title = self._og_search_title(webpage, fatal=False)
        playlist_description = self._html_search_meta(
            ['og:description', 'description'], webpage)
        return self.playlist_result(
            entries, bfmtv_id, playlist_title, playlist_description)

View File

@@ -0,0 +1,30 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
class BibelTVIE(InfoExtractor):
    """Extractor for bibeltv.de Mediathek videos (delegates to BrightcoveNew)."""

    _VALID_URL = r'https?://(?:www\.)?bibeltv\.de/mediathek/videos/(?:crn/)?(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://www.bibeltv.de/mediathek/videos/329703-sprachkurs-in-malaiisch',
        'md5': '252f908192d611de038b8504b08bf97f',
        'info_dict': {
            'id': 'ref:329703',
            'ext': 'mp4',
            'title': 'Sprachkurs in Malaiisch',
            'description': 'md5:3e9f197d29ee164714e67351cf737dfe',
            'timestamp': 1608316701,
            'uploader_id': '5840105145001',
            'upload_date': '20201218',
        }
    }, {
        'url': 'https://www.bibeltv.de/mediathek/videos/crn/326374',
        'only_matching': True,
    }]
    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5840105145001/default_default/index.html?videoId=ref:%s'

    def _real_extract(self, url):
        """Map the numeric id in the URL to a Brightcove ``ref:`` player URL."""
        crn_id = self._match_id(url)
        brightcove_url = self.BRIGHTCOVE_URL_TEMPLATE % crn_id
        return self.url_result(brightcove_url, 'BrightcoveNew')

View File

@@ -0,0 +1,78 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import (
compat_b64decode,
compat_urllib_parse_unquote,
)
class BigflixIE(InfoExtractor):
    """Extractor for bigflix.com; media URLs are URL-quoted base64 in the page."""

    _VALID_URL = r'https?://(?:www\.)?bigflix\.com/.+/(?P<id>[0-9]+)'
    _TESTS = [{
        # 2 formats
        'url': 'http://www.bigflix.com/Tamil-movies/Drama-movies/Madarasapatinam/16070',
        'info_dict': {
            'id': '16070',
            'ext': 'mp4',
            'title': 'Madarasapatinam',
            'description': 'md5:9f0470b26a4ba8e824c823b5d95c2f6b',
            'formats': 'mincount:2',
        },
        'params': {
            'skip_download': True,
        }
    }, {
        # multiple formats
        'url': 'http://www.bigflix.com/Malayalam-movies/Drama-movies/Indian-Rupee/15967',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Collect the ``ContentURL_<height>p`` formats plus the ``file=`` URL."""
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)

        title = self._html_search_regex(
            r'<div[^>]+class=["\']pagetitle["\'][^>]*>(.+?)</div>',
            webpage, 'title')

        def decode_url(quoted_b64_url):
            # Undo the URL quoting first, then the base64 layer.
            unquoted = compat_urllib_parse_unquote(quoted_b64_url)
            return compat_b64decode(unquoted).decode('utf-8')

        formats = []
        content_urls = re.findall(
            r'ContentURL_(\d{3,4})[pP][^=]+=([^&]+)', webpage)
        for height, encoded_url in content_urls:
            video_url = decode_url(encoded_url)
            fmt = {
                'url': video_url,
                'format_id': '%sp' % height,
                'height': int(height),
            }
            if video_url.startswith('rtmp'):
                fmt['ext'] = 'flv'
            formats.append(fmt)

        file_url = self._search_regex(
            r'file=([^&]+)', webpage, 'video url', default=None)
        if file_url:
            video_url = decode_url(file_url)
            # Add the fallback URL only if it is not already among the formats.
            already_listed = any(fmt['url'] == video_url for fmt in formats)
            if not already_listed:
                formats.append({
                    'url': video_url,
                })

        self._sort_formats(formats)

        description = self._html_search_meta('description', webpage)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'formats': formats
        }

View File

@@ -0,0 +1,40 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
int_or_none,
unescapeHTML,
)
class BildIE(InfoExtractor):
    """Extractor for bild.de clips, using the page's JSON view."""

    _VALID_URL = r'https?://(?:www\.)?bild\.de/(?:[^/]+/)+(?P<display_id>[^/]+)-(?P<id>\d+)(?:,auto=true)?\.bild\.html'
    IE_DESC = 'Bild.de'
    _TEST = {
        'url': 'http://www.bild.de/video/clip/apple-ipad-air/das-koennen-die-neuen-ipads-38184146.bild.html',
        'md5': 'dd495cbd99f2413502a1713a1156ac8a',
        'info_dict': {
            'id': '38184146',
            'ext': 'mp4',
            'title': 'Das können die neuen iPads',
            'description': 'md5:a4058c4fa2a804ab59c00d7244bbf62f',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 196,
        }
    }

    def _real_extract(self, url):
        """Download ``<page>,view=json.bild.html`` and return the first source."""
        video_id = self._match_id(url)

        # The JSON view lives at the same path with ",view=json" inserted
        # before the ".bild.html" suffix.
        page_base = url.split('.bild.html')[0]
        video_data = self._download_json(
            page_base + ',view=json.bild.html', video_id)

        title = unescapeHTML(video_data['title']).strip()
        description = unescapeHTML(video_data.get('description'))
        media_url = video_data['clipList'][0]['srces'][0]['src']

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'url': media_url,
            'thumbnail': video_data.get('poster'),
            'duration': int_or_none(video_data.get('durationSec')),
        }

View File

@@ -0,0 +1,953 @@
# coding: utf-8
import base64
import hashlib
import itertools
import functools
import re
import math
from .common import InfoExtractor, SearchInfoExtractor
from ..compat import (
compat_parse_qs,
compat_urlparse,
compat_urllib_parse_urlparse
)
from ..utils import (
ExtractorError,
int_or_none,
float_or_none,
mimetype2ext,
parse_iso8601,
traverse_obj,
parse_count,
smuggle_url,
srt_subtitles_timecode,
str_or_none,
strip_jsonp,
unified_timestamp,
unsmuggle_url,
urlencode_postdata,
url_or_none,
OnDemandPagedList
)
class BiliBiliIE(InfoExtractor):
_VALID_URL = r'''(?x)
https?://
(?:(?:www|bangumi)\.)?
bilibili\.(?:tv|com)/
(?:
(?:
video/[aA][vV]|
anime/(?P<anime_id>\d+)/play\#
)(?P<id>\d+)|
(s/)?video/[bB][vV](?P<id_bv>[^/?#&]+)
)
(?:/?\?p=(?P<page>\d+))?
'''
_TESTS = [{
'url': 'http://www.bilibili.com/video/av1074402/',
'md5': '5f7d29e1a2872f3df0cf76b1f87d3788',
'info_dict': {
'id': '1074402_part1',
'ext': 'mp4',
'title': '【金坷垃】金泡沫',
'uploader_id': '156160',
'uploader': '菊子桑',
'upload_date': '20140420',
'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923',
'timestamp': 1398012678,
},
}, {
# Tested in BiliBiliBangumiIE
'url': 'http://bangumi.bilibili.com/anime/1869/play#40062',
'only_matching': True,
}, {
# bilibili.tv
'url': 'http://www.bilibili.tv/video/av1074402/',
'only_matching': True,
}, {
'url': 'http://bangumi.bilibili.com/anime/5802/play#100643',
'md5': '3f721ad1e75030cc06faf73587cfec57',
'info_dict': {
'id': '100643_part1',
'ext': 'mp4',
'title': 'CHAOS;CHILD',
'description': '如果你是神明并且能够让妄想成为现实。那你会进行怎么样的妄想是淫靡的世界独裁社会毁灭性的制裁还是……2015年涩谷。从6年前发生的大灾害“涩谷地震”之后复兴了的这个街区里新设立的私立高中...',
},
'skip': 'Geo-restricted to China',
}, {
'url': 'http://www.bilibili.com/video/av8903802/',
'info_dict': {
'id': '8903802_part1',
'ext': 'mp4',
'title': '阿滴英文|英文歌分享#6 "Closer',
'upload_date': '20170301',
'description': '滴妹今天唱Closer給你聽! 有史以来,被推最多次也是最久的歌曲,其实歌词跟我原本想像差蛮多的,不过还是好听! 微博@阿滴英文',
'timestamp': 1488382634,
'uploader_id': '65880958',
'uploader': '阿滴英文',
},
'params': {
'skip_download': True,
},
}, {
# new BV video id format
'url': 'https://www.bilibili.com/video/BV1JE411F741',
'only_matching': True,
}, {
# Anthology
'url': 'https://www.bilibili.com/video/BV1bK411W797',
'info_dict': {
'id': 'BV1bK411W797',
'title': '物语中的人物是如何吐槽自己的OP的'
},
'playlist_count': 17,
}]
_APP_KEY = 'iVGUTjsxvpLeuDCf'
_BILIBILI_KEY = 'aHRmhWMLkdeMuILqORnYZocwMBpMEOdt'
def _report_error(self, result):
    """Always raise ExtractorError describing the failed API ``result``.

    Uses the result's ``message`` if present, otherwise its ``code``;
    both of those are raised as expected errors. With neither key a
    generic (unexpected) error is raised.
    """
    if 'message' in result:
        raise ExtractorError('%s said: %s' % (self.IE_NAME, result['message']), expected=True)
    if 'code' in result:
        raise ExtractorError('%s returns error %d' % (self.IE_NAME, result['code']), expected=True)
    raise ExtractorError('Can\'t extract Bangumi episode ID')
def _real_extract(self, url):
"""Extract one Bilibili video, or hand back its anthology playlist.

Resolves the av/BV ids through _get_video_id_set, downloads the page, finds
the cid, gathers formats from the page's __playinfo__ JSON or from the
signed playurl API, and returns a single info dict.
"""
# smuggled_data may carry 'no_bangumi_tip', which suppresses the hint printed below.
url, smuggled_data = unsmuggle_url(url, {})
mobj = self._match_valid_url(url)
video_id = mobj.group('id_bv') or mobj.group('id')
av_id, bv_id = self._get_video_id_set(video_id, mobj.group('id_bv') is not None)
# From here on video_id is the av id reported by the API.
video_id = av_id
info = {}
anime_id = mobj.group('anime_id')
page_id = mobj.group('page')
webpage = self._download_webpage(url, video_id)
# Bilibili anthologies are similar to playlists but all videos share the same video ID as the anthology itself.
# If the video has no page argument, check to see if it's an anthology
if page_id is None:
if not self.get_param('noplaylist'):
r = self._extract_anthology_entries(bv_id, video_id, webpage)
if r is not None:
self.to_screen('Downloading anthology %s - add --no-playlist to just download video' % video_id)
return r
else:
self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
# Regular videos: scrape the cid from the page, trying the page-specific
# pattern first. Anime episodes: ask the bangumi get_source API instead.
if 'anime/' not in url:
cid = self._search_regex(
r'\bcid(?:["\']:|=)(\d+),["\']page(?:["\']:|=)' + str(page_id), webpage, 'cid',
default=None
) or self._search_regex(
r'\bcid(?:["\']:|=)(\d+)', webpage, 'cid',
default=None
) or compat_parse_qs(self._search_regex(
[r'EmbedPlayer\([^)]+,\s*"([^"]+)"\)',
r'EmbedPlayer\([^)]+,\s*\\"([^"]+)\\"\)',
r'<iframe[^>]+src="https://secure\.bilibili\.com/secure,([^"]+)"'],
webpage, 'player parameters'))['cid'][0]
else:
if 'no_bangumi_tip' not in smuggled_data:
self.to_screen('Downloading episode %s. To download all videos in anime %s, re-run yt-dlp with %s' % (
video_id, anime_id, compat_urlparse.urljoin(url, '//bangumi.bilibili.com/anime/%s' % anime_id)))
headers = {
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'Referer': url
}
headers.update(self.geo_verification_headers())
js = self._download_json(
'http://bangumi.bilibili.com/web_api/get_source', video_id,
data=urlencode_postdata({'episode_id': video_id}),
headers=headers)
if 'result' not in js:
self._report_error(js)
cid = js['result']['cid']
# Headers for the playurl API request made in the rendition loop below.
headers = {
'Accept': 'application/json',
'Referer': url
}
headers.update(self.geo_verification_headers())
# Prefer the play info embedded in the page; '{}' is parsed when it is absent.
video_info = self._parse_json(
self._search_regex(r'window.__playinfo__\s*=\s*({.+?})</script>', webpage, 'video info', default=None) or '{}',
video_id, fatal=False)
video_info = video_info.get('data') or {}
durl = traverse_obj(video_info, ('dash', 'video'))
audios = traverse_obj(video_info, ('dash', 'audio')) or []
# NOTE(review): nothing in this method appends to `entries`, so the
# entries-based branches near the end look unreachable - confirm.
entries = []
# Renditions are tried in order; only the last request is made with fatal=True.
RENDITIONS = ('qn=80&quality=80&type=', 'quality=2&type=mp4')
for num, rendition in enumerate(RENDITIONS, start=1):
payload = 'appkey=%s&cid=%s&otype=json&%s' % (self._APP_KEY, cid, rendition)
# Request signature: md5 over the payload followed by the secret key.
sign = hashlib.md5((payload + self._BILIBILI_KEY).encode('utf-8')).hexdigest()
# Only hit the API when the page did not already provide play info.
if not video_info:
video_info = self._download_json(
'http://interface.bilibili.com/v2/playurl?%s&sign=%s' % (payload, sign),
video_id, note='Downloading video info page',
headers=headers, fatal=num == len(RENDITIONS))
if not video_info:
continue
if not durl and 'durl' not in video_info:
if num < len(RENDITIONS):
continue
self._report_error(video_info)
formats = []
# NOTE(review): the loop variable rebinds `durl`; after the loop it refers
# to the last item, which is what the 'duration' lookup below reads.
for idx, durl in enumerate(durl or video_info['durl']):
formats.append({
'url': durl.get('baseUrl') or durl.get('base_url') or durl.get('url'),
'ext': mimetype2ext(durl.get('mimeType') or durl.get('mime_type')),
'fps': int_or_none(durl.get('frameRate') or durl.get('frame_rate')),
'width': int_or_none(durl.get('width')),
'height': int_or_none(durl.get('height')),
'vcodec': durl.get('codecs'),
'acodec': 'none' if audios else None,
'tbr': float_or_none(durl.get('bandwidth'), scale=1000),
'filesize': int_or_none(durl.get('size')),
})
for backup_url in traverse_obj(durl, 'backup_url', expected_type=list) or []:
formats.append({
'url': backup_url,
'quality': -2 if 'hd.mp4' in backup_url else -3,
})
# Attach the page URL as Referer to every format collected so far.
for a_format in formats:
a_format.setdefault('http_headers', {}).update({
'Referer': url,
})
for audio in audios:
formats.append({
'url': audio.get('baseUrl') or audio.get('base_url') or audio.get('url'),
'ext': mimetype2ext(audio.get('mimeType') or audio.get('mime_type')),
'fps': int_or_none(audio.get('frameRate') or audio.get('frame_rate')),
'width': int_or_none(audio.get('width')),
'height': int_or_none(audio.get('height')),
'acodec': audio.get('codecs'),
'vcodec': 'none',
'tbr': float_or_none(audio.get('bandwidth'), scale=1000),
'filesize': int_or_none(audio.get('size'))
})
for backup_url in traverse_obj(audio, 'backup_url', expected_type=list) or []:
formats.append({
'url': backup_url,
# backup URLs have lower priorities
'quality': -3,
})
info.update({
'id': video_id,
'duration': float_or_none(durl.get('length'), 1000),
'formats': formats,
})
# The first rendition that yields formats wins.
break
self._sort_formats(formats)
title = self._html_search_regex((
r'<h1[^>]+title=(["\'])(?P<content>[^"\']+)',
r'(?s)<h1[^>]*>(?P<content>.+?)</h1>',
self._meta_regex('title')
), webpage, 'title', group='content', fatal=False)
# Get part title for anthologies
if page_id is not None:
# TODO: The json is already downloaded by _extract_anthology_entries. Don't redownload for each video.
part_info = traverse_obj(self._download_json(
f'https://api.bilibili.com/x/player/pagelist?bvid={bv_id}&jsonp=jsonp',
video_id, note='Extracting videos in anthology'), 'data', expected_type=list)
# NOTE(review): presumably part_info is None when 'data' is missing or not
# a list, in which case len() would fail - confirm against traverse_obj.
title = title if len(part_info) == 1 else traverse_obj(part_info, (int(page_id) - 1, 'part')) or title
description = self._html_search_meta('description', webpage)
timestamp = unified_timestamp(self._html_search_regex(
r'<time[^>]+datetime="([^"]+)"', webpage, 'upload time',
default=None) or self._html_search_meta(
'uploadDate', webpage, 'timestamp', default=None))
thumbnail = self._html_search_meta(['og:image', 'thumbnailUrl'], webpage)
# TODO 'view_count' requires deobfuscating Javascript
# This overwrites the 'id' and 'duration' values stored inside the loop above.
info.update({
'id': f'{video_id}_part{page_id or 1}',
'cid': cid,
'title': title,
'description': description,
'timestamp': timestamp,
'thumbnail': thumbnail,
'duration': float_or_none(video_info.get('timelength'), scale=1000),
})
# Uploader name and id are scraped from the first space.bilibili.com link.
uploader_mobj = re.search(
r'<a[^>]+href="(?:https?:)?//space\.bilibili\.com/(?P<id>\d+)"[^>]*>\s*(?P<name>[^<]+?)\s*<',
webpage)
if uploader_mobj:
info.update({
'uploader': uploader_mobj.group('name').strip(),
'uploader_id': uploader_mobj.group('id'),
})
if not info.get('uploader'):
info['uploader'] = self._html_search_meta(
'author', webpage, 'uploader', default=None)
top_level_info = {
'tags': traverse_obj(self._download_json(
f'https://api.bilibili.com/x/tag/archive/tags?aid={video_id}',
video_id, fatal=False, note='Downloading tags'), ('data', ..., 'tag_name')),
}
# The danmaku comment stream is exposed as an XML "subtitle" track keyed by cid.
info['subtitles'] = {
'danmaku': [{
'ext': 'xml',
'url': f'https://comment.bilibili.com/{cid}.xml',
}]
}
# The raw string below is disabled code kept as a bare expression statement.
r'''
# Requires https://github.com/m13253/danmaku2ass which is licenced under GPL3
# See https://github.com/animelover1984/youtube-dl
raw_danmaku = self._download_webpage(
f'https://comment.bilibili.com/{cid}.xml', video_id, fatal=False, note='Downloading danmaku comments')
danmaku = NiconicoIE.CreateDanmaku(raw_danmaku, commentType='Bilibili', x=1024, y=576)
entries[0]['subtitles'] = {
'danmaku': [{
'ext': 'ass',
'data': danmaku
}]
}
'''
top_level_info['__post_extractor'] = self.extract_comments(video_id)
for entry in entries:
entry.update(info)
if len(entries) == 1:
entries[0].update(top_level_info)
return entries[0]
for idx, entry in enumerate(entries):
entry['id'] = '%s_part%d' % (video_id, (idx + 1))
# Keys from info and top_level_info override the literals listed before them.
return {
'id': str(video_id),
'bv_id': bv_id,
'title': title,
'description': description,
**info, **top_level_info
}
def _extract_anthology_entries(self, bv_id, video_id, webpage):
    """Build a playlist for a multi-part ("anthology") video.

    The title is scraped from `webpage` first, then the page list for
    `bv_id` is downloaded. Returns a playlist with one `?p=N` URL per page,
    or None when the API's 'data' field is empty.
    """
    anthology_title = self._html_search_regex(
        (r'<h1[^>]+\btitle=(["\'])(?P<title>(?:(?!\1).)+)\1',
         r'(?s)<h1[^>]*>(?P<title>.+?)</h1>',
         r'<title>(?P<title>.+?)</title>'),
        webpage, 'title', group='title')

    pages = self._download_json(
        f'https://api.bilibili.com/x/player/pagelist?bvid={bv_id}&jsonp=jsonp',
        video_id, note='Extracting videos in anthology')['data']
    if not pages:
        return None

    def page_url(entry):
        # Every part shares the BV id and differs only in its page number.
        return 'https://www.bilibili.com/video/%s?p=%d' % (bv_id, entry['page'])

    return self.playlist_from_matches(
        pages, bv_id, anthology_title, ie=BiliBiliIE.ie_key(), getter=page_url)
def _get_video_id_set(self, id, is_bv):
    """Resolve a video id through the view API and return `(aid, bvid)`.

    `id` is sent as `bvid` when `is_bv` is true and as `aid` otherwise.
    Raises ExtractorError when the API answers with a non-zero code.
    """
    lookup = {'aid': id}
    if is_bv:
        lookup = {'bvid': id}

    reply = self._download_json(
        "http://api.bilibili.cn/x/web-interface/view",
        id, query=lookup,
        note='Grabbing original ID via API')

    status = reply['code']
    if status == -400:
        raise ExtractorError('Video ID does not exist', expected=True, video_id=id)
    if status != 0:
        raise ExtractorError(f'Unknown error occurred during API check (code {status})',
                             expected=True, video_id=id)

    video = reply['data']
    return video['aid'], video['bvid']
def _get_comments(self, video_id, commentPageNumber=0):
    """Yield every comment of a video, including nested replies.

    Reply pages are requested starting at page 1 until one comes back
    without replies. `commentPageNumber` is accepted but not used.
    """
    page = 0
    while True:
        page += 1
        payload = self._download_json(
            f'https://api.bilibili.com/x/v2/reply?pn={page}&oid={video_id}&type=1&jsonp=jsonp&sort=2&_=1567227301685',
            video_id, note=f'Extracting comments from page {page}', fatal=False)
        replies = traverse_obj(payload, ('data', 'replies'))
        if not replies:
            break
        for reply in replies:
            yield from self._get_all_children(reply)
def _get_all_children(self, reply):
    """Yield the comment dict for `reply`, then those of its nested replies, depth-first."""
    comment = {
        'author': traverse_obj(reply, ('member', 'uname')),
        'author_id': traverse_obj(reply, ('member', 'mid')),
        'id': reply.get('rpid'),
        'text': traverse_obj(reply, ('content', 'message')),
        'timestamp': reply.get('ctime'),
        # Replies without a parent are attached to the root.
        'parent': reply.get('parent') or 'root',
    }
    yield comment

    for nested in reply.get('replies') or []:
        yield from self._get_all_children(nested)
class BiliBiliBangumiIE(InfoExtractor):
    """Playlist extractor for a whole bangumi.bilibili.com anime season.

    Every episode is delegated to BiliBiliIE through a `url_transparent`
    entry that carries the episode metadata known from the season listing.
    """
    _VALID_URL = r'https?://bangumi\.bilibili\.com/anime/(?P<id>\d+)'

    IE_NAME = 'bangumi.bilibili.com'
    IE_DESC = 'BiliBili番剧'

    _TESTS = [{
        'url': 'http://bangumi.bilibili.com/anime/1869',
        'info_dict': {
            'id': '1869',
            'title': '混沌武士',
            'description': 'md5:6a9622b911565794c11f25f81d6a97d2',
        },
        'playlist_count': 26,
    }, {
        'url': 'http://bangumi.bilibili.com/anime/1869',
        'info_dict': {
            'id': '1869',
            'title': '混沌武士',
            'description': 'md5:6a9622b911565794c11f25f81d6a97d2',
        },
        'playlist': [{
            'md5': '91da8621454dd58316851c27c68b0c13',
            'info_dict': {
                'id': '40062',
                'ext': 'mp4',
                'title': '混沌武士',
                'description': '故事发生在日本的江户时代。风是一个小酒馆的打工女。一日,酒馆里来了一群恶霸,虽然他们的举动令风十分不满,但是毕竟风只是一届女流,无法对他们采取什么行动,只能在心里嘟哝。这时,酒家里又进来了个“不良份子...',
                'timestamp': 1414538739,
                'upload_date': '20141028',
                'episode': '疾风怒涛 Tempestuous Temperaments',
                'episode_number': 1,
            },
        }],
        'params': {
            'playlist_items': '1',
        },
    }]

    @classmethod
    def suitable(cls, url):
        """Defer to BiliBiliIE for any URL that extractor accepts."""
        return False if BiliBiliIE.suitable(url) else super(BiliBiliBangumiIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a playlist of the season's episodes ordered by episode number."""
        bangumi_id = self._match_id(url)

        # Sometimes this API returns a JSONP response
        season_info = self._download_json(
            'http://bangumi.bilibili.com/jsonp/seasoninfo/%s.ver' % bangumi_id,
            bangumi_id, transform_source=strip_jsonp)['result']

        entries = [{
            '_type': 'url_transparent',
            # The smuggled flag suppresses BiliBiliIE's "download the whole anime" hint.
            'url': smuggle_url(episode['webplay_url'], {'no_bangumi_tip': 1}),
            'ie_key': BiliBiliIE.ie_key(),
            'timestamp': parse_iso8601(episode.get('update_time'), delimiter=' '),
            'episode': episode.get('index_title'),
            'episode_number': int_or_none(episode.get('index')),
        } for episode in season_info['episodes']]

        def episode_order(entry):
            # int_or_none yields None for missing or non-numeric indexes, and
            # None cannot be compared with int on Python 3. Sort such entries
            # last; the stable sort keeps their original relative order.
            number = entry.get('episode_number')
            return (number is None, number if number is not None else 0)

        entries.sort(key=episode_order)

        return self.playlist_result(
            entries, bangumi_id,
            season_info.get('bangumi_title'), season_info.get('evaluate'))
class BilibiliChannelIE(InfoExtractor):
    """Playlist extractor for all uploads listed on a space.bilibili.com user page."""
    # The dot after "space" is escaped so that only the real host matches.
    _VALID_URL = r'https?://space\.bilibili\.com/(?P<id>\d+)'
    _API_URL = "https://api.bilibili.com/x/space/arc/search?mid=%s&pn=%d&jsonp=jsonp"
    _TESTS = [{
        'url': 'https://space.bilibili.com/3985676/video',
        'info_dict': {},
        'playlist_mincount': 112,
    }]

    def _entries(self, list_id):
        """Yield one url_result per video, paging through the search API.

        Paging stops when a page has no videos or when the number of videos
        seen reaches the total reported by the API.
        """
        count, max_count = 0, None

        for page_num in itertools.count(1):
            data = self._download_json(
                self._API_URL % (list_id, page_num), list_id, note=f'Downloading page {page_num}')['data']

            # Remember the first truthy total the API reports.
            max_count = max_count or traverse_obj(data, ('page', 'count'))

            entries = traverse_obj(data, ('list', 'vlist'))
            if not entries:
                return
            for entry in entries:
                yield self.url_result(
                    'https://www.bilibili.com/video/%s' % entry['bvid'],
                    BiliBiliIE.ie_key(), entry['bvid'])

            count += len(entries)
            if max_count and count >= max_count:
                return

    def _real_extract(self, url):
        """Return the (lazily generated) playlist for the matched user id."""
        list_id = self._match_id(url)
        return self.playlist_result(self._entries(list_id), list_id)
class BilibiliCategoryIE(InfoExtractor):
    """Playlist extractor for bilibili.com/v/<category>/<subcategory> listings."""
    IE_NAME = 'Bilibili category extractor'
    _MAX_RESULTS = 1000000
    _VALID_URL = r'https?://www\.bilibili\.com/v/[a-zA-Z]+\/[a-zA-Z]+'
    _TESTS = [{
        'url': 'https://www.bilibili.com/v/kichiku/mad',
        'info_dict': {
            'id': 'kichiku: mad',
            'title': 'kichiku: mad'
        },
        'playlist_mincount': 45,
        'params': {
            'playlistend': 45
        }
    }]

    def _fetch_page(self, api_url, num_pages, query, page_num):
        """Yield url_results for one page of the listing.

        `page_num` is forwarded to the API unchanged as `pn`. Raises
        ExtractorError when the page contains no video list.
        """
        parsed_json = self._download_json(
            api_url, query, query={'Search_key': query, 'pn': page_num},
            note='Extracting results from page %s of %s' % (page_num, num_pages))

        video_list = traverse_obj(parsed_json, ('data', 'archives'), expected_type=list)
        if not video_list:
            raise ExtractorError('Failed to retrieve video list for page %d' % page_num)

        for video in video_list:
            yield self.url_result(
                'https://www.bilibili.com/video/%s' % video['bvid'], 'BiliBili', video['bvid'])

    def _entries(self, category, subcategory, query):
        """Return an OnDemandPagedList over every video of the subcategory.

        Raises ExtractorError for unsupported (sub)categories or when the
        first page does not report the paging information.
        """
        # map of categories : subcategories : RIDs
        rid_map = {
            'kichiku': {
                'mad': 26,
                'manual_vocaloid': 126,
                'guide': 22,
                'theatre': 216,
                'course': 127
            },
        }

        if category not in rid_map:
            raise ExtractorError(
                f'The category {category} isn\'t supported. Supported categories: {list(rid_map.keys())}')
        if subcategory not in rid_map[category]:
            raise ExtractorError(
                f'The subcategory {subcategory} isn\'t supported for this category. Supported subcategories: {list(rid_map[category].keys())}')
        rid_value = rid_map[category][subcategory]

        api_url = 'https://api.bilibili.com/x/web-interface/newlist?rid=%d&type=1&ps=20&jsonp=jsonp' % rid_value
        page_json = self._download_json(api_url, query, query={'Search_key': query, 'pn': '1'})
        # Fall back to an empty dict so that missing paging data reaches the
        # ExtractorError below instead of failing with AttributeError on None.
        page_data = traverse_obj(page_json, ('data', 'page'), expected_type=dict) or {}
        count, size = int_or_none(page_data.get('count')), int_or_none(page_data.get('size'))
        if count is None or not size:
            raise ExtractorError('Failed to calculate either page count or size')

        num_pages = math.ceil(count / size)

        return OnDemandPagedList(functools.partial(
            self._fetch_page, api_url, num_pages, query), size)

    def _real_extract(self, url):
        """Derive category and subcategory from the URL path and return the playlist."""
        u = compat_urllib_parse_urlparse(url)
        category, subcategory = u.path.split('/')[2:4]
        query = '%s: %s' % (category, subcategory)

        return self.playlist_result(self._entries(category, subcategory, query), query, query)
class BiliBiliSearchIE(SearchInfoExtractor):
    """Search extractor for Bilibili videos (search key `bilisearch`)."""
    IE_DESC = 'Bilibili video search'
    _MAX_RESULTS = 100000
    _SEARCH_KEY = 'bilisearch'

    def _search_results(self, query):
        """Yield a url_result per search hit, newest first, page by page.

        Stops at the first page without results; otherwise an exhausted
        search would keep requesting further pages forever.
        """
        for page_num in itertools.count(1):
            videos = self._download_json(
                'https://api.bilibili.com/x/web-interface/search/type', query,
                note=f'Extracting results from page {page_num}', query={
                    'Search_key': query,
                    'keyword': query,
                    'page': page_num,
                    'context': '',
                    'order': 'pubdate',
                    'duration': 0,
                    'tids_2': '',
                    '__refresh__': 'true',
                    'search_type': 'video',
                    'tids': 0,
                    'highlight': 1,
                })['data'].get('result')
            if not videos:
                return
            for video in videos:
                yield self.url_result(video['arcurl'], 'BiliBili', str(video['aid']))
class BilibiliAudioBaseIE(InfoExtractor):
    """Common API helper for the Bilibili audio extractors."""

    def _call_api(self, path, sid, query=None):
        """Fetch `path` from the music-service web API and return its 'data' field.

        When `query` is empty or omitted, `{'sid': sid}` is sent instead.
        """
        params = query or {'sid': sid}
        endpoint = 'https://www.bilibili.com/audio/music-service-c/web/' + path
        response = self._download_json(endpoint, sid, query=params)
        return response['data']
class BilibiliAudioIE(BilibiliAudioBaseIE):
    """Extractor for a single Bilibili audio track (`/audio/au<id>`)."""
    _VALID_URL = r'https?://(?:www\.)?bilibili\.com/audio/au(?P<id>\d+)'
    _TEST = {
        'url': 'https://www.bilibili.com/audio/au1003142',
        'md5': 'fec4987014ec94ef9e666d4d158ad03b',
        'info_dict': {
            'id': '1003142',
            'ext': 'm4a',
            'title': '【tsukimi】YELLOW / 神山羊',
            'artist': 'tsukimi',
            'comment_count': int,
            'description': 'YELLOW的mp3版',
            'duration': 183,
            'subtitles': {
                'origin': [{
                    'ext': 'lrc',
                }],
            },
            'thumbnail': r're:^https?://.+\.jpg',
            'timestamp': 1564836614,
            'upload_date': '20190803',
            'uploader': 'tsukimi-つきみぐー',
            'view_count': int,
        },
    }

    def _real_extract(self, url):
        """Return the info dict for the track: one audio-only format plus song metadata."""
        au_id = self._match_id(url)

        # The stream URL is resolved before the song metadata is requested.
        stream = self._call_api('url', au_id)
        audio_format = {
            'url': stream['cdns'][0],
            'filesize': int_or_none(stream.get('size')),
            'vcodec': 'none'
        }

        song = self._call_api('song/info', au_id)
        title = song['title']
        stats = song.get('statistic') or {}

        # Lyrics, when present, are exposed as the 'origin' subtitle track.
        lyric_url = song.get('lyric')
        subtitles = {'origin': [{'url': lyric_url}]} if lyric_url else None

        return {
            'id': au_id,
            'title': title,
            'formats': [audio_format],
            'artist': song.get('author'),
            'comment_count': int_or_none(stats.get('comment')),
            'description': song.get('intro'),
            'duration': int_or_none(song.get('duration')),
            'subtitles': subtitles,
            'thumbnail': song.get('cover'),
            'timestamp': int_or_none(song.get('passtime')),
            'uploader': song.get('uname'),
            'view_count': int_or_none(stats.get('play')),
        }
class BilibiliAudioAlbumIE(BilibiliAudioBaseIE):
    """Playlist extractor for a Bilibili audio menu/album (`/audio/am<id>`)."""
    _VALID_URL = r'https?://(?:www\.)?bilibili\.com/audio/am(?P<id>\d+)'
    _TEST = {
        'url': 'https://www.bilibili.com/audio/am10624',
        'info_dict': {
            'id': '10624',
            'title': '每日新曲推荐每日11:00更新',
            'description': '每天11:00更新为你推送最新音乐',
        },
        'playlist_count': 19,
    }

    def _real_extract(self, url):
        """Return a playlist of the album's tracks.

        Album title and description are attached only when at least one track
        was found and the album info provides a title.
        """
        am_id = self._match_id(url)
        songs = self._call_api(
            'song/of-menu', am_id, {'sid': am_id, 'pn': 1, 'ps': 100})['data']

        # Songs without a usable id are dropped.
        song_ids = (str_or_none(song.get('id')) for song in songs)
        entries = [
            self.url_result(
                'https://www.bilibili.com/audio/au' + sid,
                BilibiliAudioIE.ie_key(), sid)
            for sid in song_ids if sid]

        if not entries:
            return self.playlist_result(entries, am_id)

        album_data = self._call_api('menu/info', am_id) or {}
        album_title = album_data.get('title')
        if not album_title:
            return self.playlist_result(entries, am_id)

        for entry in entries:
            entry['album'] = album_title
        return self.playlist_result(
            entries, am_id, album_title, album_data.get('intro'))
class BiliBiliPlayerIE(InfoExtractor):
    """Redirects embedded-player URLs (`player.html?aid=...`) to BiliBiliIE."""
    _VALID_URL = r'https?://player\.bilibili\.com/player\.html\?.*?\baid=(?P<id>\d+)'
    _TEST = {
        'url': 'http://player.bilibili.com/player.html?aid=92494333&cid=157926707&page=1',
        'only_matching': True,
    }

    def _real_extract(self, url):
        """Hand the av id over to BiliBiliIE via the regular video page URL."""
        aid = self._match_id(url)
        target = 'http://www.bilibili.tv/video/av%s/' % aid
        return self.url_result(target, ie=BiliBiliIE.ie_key(), video_id=aid)
class BiliIntlBaseIE(InfoExtractor):
    """Shared API, subtitle, format and login helpers for the bilibili.tv extractors."""
    _API_URL = 'https://api.bilibili.tv/intl/gateway'
    _NETRC_MACHINE = 'biliintl'

    def _call_api(self, endpoint, *args, **kwargs):
        """Call a gateway endpoint and return the response's 'data' field.

        Extra arguments are forwarded to _download_json. A non-zero 'code' is
        turned into a login-required or geo-restricted error for the known
        codes. Any other code raises ExtractorError when a truthy `fatal` was
        passed and is reported as a warning otherwise.
        """
        response = self._download_json(self._API_URL + endpoint, *args, **kwargs)
        code = response.get('code')
        if code:
            if code in (10004004, 10004005, 10023006):
                self.raise_login_required()
            elif code == 10004001:
                self.raise_geo_restricted()
            else:
                errmsg = kwargs.get('errnote', 'Unable to download JSON metadata')
                # Only quote the server message when it says more than the bare code.
                if response.get('message') and str(code) != response['message']:
                    errmsg = f'{errmsg}: {self.IE_NAME} said: {response["message"]}'
                if kwargs.get('fatal'):
                    raise ExtractorError(errmsg)
                self.report_warning(errmsg)
        return response.get('data')

    def json2srt(self, json):
        """Convert a subtitle JSON document into SRT text.

        Lines of `json['body']` without content are skipped; the cue number
        is the line's position in the original list plus one.
        """
        data = '\n\n'.join(
            f'{i + 1}\n{srt_subtitles_timecode(line["from"])} --> {srt_subtitles_timecode(line["to"])}\n{line["content"]}'
            for i, line in enumerate(json['body']) if line.get('content'))
        return data

    def _get_subtitles(self, ep_id):
        """Download every subtitle track of an episode and return it as SRT data.

        Returns a dict mapping language key ('en' when none is given) to a
        list of subtitle entries. Tracks that fail to download are skipped.
        """
        # _call_api returns None when the response has no 'data'; treat that
        # as "no subtitles" rather than failing on None.
        sub_json = self._call_api(f'/web/v2/subtitle?episode_id={ep_id}&platform=web', ep_id) or {}
        subtitles = {}
        for sub in sub_json.get('subtitles') or []:
            sub_url = sub.get('url')
            if not sub_url:
                continue
            lang = sub.get('lang')
            # Parenthesised on purpose: without it the conditional applied to
            # the whole formatted string and produced an empty note when the
            # language was missing.
            note = 'Downloading subtitles' + (f' for {lang}' if lang else '')
            sub_data = self._download_json(
                sub_url, ep_id, errnote='Unable to download subtitles', fatal=False,
                note=note)
            if not sub_data:
                continue
            subtitles.setdefault(sub.get('lang_key', 'en'), []).append({
                'ext': 'srt',
                'data': self.json2srt(sub_data)
            })
        return subtitles

    def _get_formats(self, ep_id):
        """Return the sorted list of video-only and audio-only formats of an episode."""
        video_json = self._call_api(f'/web/playurl?ep_id={ep_id}&platform=web', ep_id,
                                    note='Downloading video formats', errnote='Unable to download video formats')
        video_json = video_json['playurl']
        formats = []
        for vid in video_json.get('video') or []:
            video_res = vid.get('video_resource') or {}
            video_info = vid.get('stream_info') or {}
            if not video_res.get('url'):
                continue
            formats.append({
                'url': video_res['url'],
                'ext': 'mp4',
                'format_note': video_info.get('desc_words'),
                'width': video_res.get('width'),
                'height': video_res.get('height'),
                'vbr': video_res.get('bandwidth'),
                'acodec': 'none',
                'vcodec': video_res.get('codecs'),
                'filesize': video_res.get('size'),
            })
        for aud in video_json.get('audio_resource') or []:
            if not aud.get('url'):
                continue
            formats.append({
                'url': aud['url'],
                'ext': 'mp4',
                'abr': aud.get('bandwidth'),
                'acodec': aud.get('codecs'),
                'vcodec': 'none',
                'filesize': aud.get('size'),
            })

        self._sort_formats(formats)
        return formats

    def _extract_ep_info(self, episode_data, ep_id):
        """Build the info dict of one episode from its listing entry.

        The episode number is parsed from a leading "E<number>" in
        'title_display' and is None when that field is missing or does not
        match.
        """
        title_display = episode_data.get('title_display')
        return {
            'id': ep_id,
            'title': title_display or episode_data['title'],
            'thumbnail': episode_data.get('cover'),
            # Search an empty string when title_display is absent; passing
            # None to the regex search would raise TypeError.
            'episode_number': int_or_none(self._search_regex(
                r'^E(\d+)(?:$| - )', title_display or '', 'episode number', default=None)),
            'formats': self._get_formats(ep_id),
            'subtitles': self._get_subtitles(ep_id),
            'extractor_key': BiliIntlIE.ie_key(),
        }

    def _login(self):
        """Log in with the configured credentials, if any.

        The password is RSA-encrypted (PKCS#1 v1.5) with the key served by the
        passport endpoint, prefixed by the server-provided hash. Raises
        ExtractorError when no crypto library is available or the login is
        rejected.
        """
        username, password = self._get_login_info()
        if username is None:
            return

        try:
            from Cryptodome.PublicKey import RSA
            from Cryptodome.Cipher import PKCS1_v1_5
        except ImportError:
            try:
                from Crypto.PublicKey import RSA
                from Crypto.Cipher import PKCS1_v1_5
            except ImportError:
                raise ExtractorError('pycryptodomex not found. Please install', expected=True)

        key_data = self._download_json(
            'https://passport.bilibili.tv/x/intl/passport-login/web/key?lang=en-US', None,
            note='Downloading login key', errnote='Unable to download login key')['data']

        public_key = RSA.importKey(key_data['key'])
        password_hash = PKCS1_v1_5.new(public_key).encrypt((key_data['hash'] + password).encode('utf-8'))
        login_post = self._download_json(
            'https://passport.bilibili.tv/x/intl/passport-login/web/login/password?lang=en-US', None, data=urlencode_postdata({
                'username': username,
                'password': base64.b64encode(password_hash).decode('ascii'),
                'keep_me': 'true',
                's_locale': 'en_US',
                'isTrusted': 'true'
            }), note='Logging in', errnote='Unable to log in')
        if login_post.get('code'):
            if login_post.get('message'):
                raise ExtractorError(f'Unable to log in: {self.IE_NAME} said: {login_post["message"]}', expected=True)
            else:
                raise ExtractorError('Unable to log in')

    def _real_initialize(self):
        """Attempt the login once before extraction starts."""
        self._login()
class BiliIntlIE(BiliIntlBaseIE):
    """Extractor for a single episode on bilibili.tv / biliintl.com."""
    _VALID_URL = r'https?://(?:www\.)?bili(?:bili\.tv|intl\.com)/(?:[a-z]{2}/)?play/(?P<season_id>\d+)/(?P<id>\d+)'
    _TESTS = [{
        # Bstation page
        'url': 'https://www.bilibili.tv/en/play/34613/341736',
        'info_dict': {
            'id': '341736',
            'ext': 'mp4',
            'title': 'E2 - The First Night',
            'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.png$',
            'episode_number': 2,
        }
    }, {
        # Non-Bstation page
        'url': 'https://www.bilibili.tv/en/play/1033760/11005006',
        'info_dict': {
            'id': '11005006',
            'ext': 'mp4',
            'title': 'E3 - Who?',
            'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.png$',
            'episode_number': 3,
        }
    }, {
        # Subtitle with empty content
        'url': 'https://www.bilibili.tv/en/play/1005144/10131790',
        'info_dict': {
            'id': '10131790',
            'ext': 'mp4',
            'title': 'E140 - Two Heartbeats: Kabuto\'s Trap',
            'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.png$',
            'episode_number': 140,
        },
        'skip': 'According to the copyright owner\'s request, you may only watch the video after you log in.'
    }, {
        'url': 'https://www.biliintl.com/en/play/34613/341736',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return the episode's info dict.

        Episode data comes from the page's `__INITIAL_DATA__` when available
        and from the season's episode list otherwise. Raises ExtractorError
        when the episode is not part of that list.
        """
        season_id, video_id = self._match_valid_url(url).groups()
        webpage = self._download_webpage(url, video_id)
        # Bstation layout
        initial_data = self._parse_json(self._search_regex(
            r'window\.__INITIAL_DATA__\s*=\s*({.+?});', webpage,
            'preload state', default='{}'), video_id, fatal=False) or {}
        episode_data = traverse_obj(initial_data, ('OgvVideo', 'epDetail'), expected_type=dict)

        if not episode_data:
            # Non-Bstation layout, read through episode list
            season_json = self._call_api(f'/web/v2/ogv/play/episodes?season_id={season_id}&platform=web', video_id)
            episodes = traverse_obj(
                season_json, ('sections', ..., 'episodes', ...), expected_type=dict, default=[])
            # Use a default so a missing episode yields a clear extractor
            # error instead of a bare StopIteration.
            episode_data = next(
                (episode for episode in episodes
                 if str(episode.get('episode_id')) == video_id), None)
            if not episode_data:
                raise ExtractorError(f'Unable to find episode {video_id} in season {season_id}')
        return self._extract_ep_info(episode_data, video_id)
class BiliIntlSeriesIE(BiliIntlBaseIE):
    """Playlist extractor for a whole season on bilibili.tv / biliintl.com."""
    _VALID_URL = r'https?://(?:www\.)?bili(?:bili\.tv|intl\.com)/(?:[a-z]{2}/)?play/(?P<id>\d+)$'
    _TESTS = [{
        'url': 'https://www.bilibili.tv/en/play/34613',
        'playlist_mincount': 15,
        'info_dict': {
            'id': '34613',
            'title': 'Fly Me to the Moon',
            'description': 'md5:a861ee1c4dc0acfad85f557cc42ac627',
            'categories': ['Romance', 'Comedy', 'Slice of life'],
            'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.png$',
            'view_count': int,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://www.biliintl.com/en/play/34613',
        'only_matching': True,
    }]

    def _entries(self, series_id):
        """Yield the info dict of every episode listed for the season."""
        series_json = self._call_api(f'/web/v2/ogv/play/episodes?season_id={series_id}&platform=web', series_id)
        for episode in traverse_obj(series_json, ('sections', ..., 'episodes', ...), expected_type=dict, default=[]):
            episode_id = str(episode.get('episode_id'))
            yield self._extract_ep_info(episode, episode_id)

    def _real_extract(self, url):
        """Return the season playlist together with its title, description and stats."""
        series_id = self._match_id(url)
        # _call_api returns None when the response has no 'data' (e.g. after
        # a non-fatal API error), so guard before reading 'season'.
        season_response = self._call_api(
            f'/web/v2/ogv/play/season_info?season_id={series_id}&platform=web', series_id) or {}
        series_info = season_response.get('season') or {}
        return self.playlist_result(
            self._entries(series_id), series_id, series_info.get('title'), series_info.get('description'),
            categories=traverse_obj(series_info, ('styles', ..., 'title'), expected_type=str_or_none),
            thumbnail=url_or_none(series_info.get('horizontal_cover')), view_count=parse_count(series_info.get('view')))

View File

@@ -0,0 +1,86 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
ExtractorError,
remove_end,
)
class BioBioChileTVIE(InfoExtractor):
    """Extractor for biobiochile.cl article pages that embed a rudo.video player."""
    _VALID_URL = r'https?://(?:tv|www)\.biobiochile\.cl/(?:notas|noticias)/(?:[^/]+/)+(?P<id>[^/]+)\.shtml'

    _TESTS = [{
        'url': 'http://tv.biobiochile.cl/notas/2015/10/21/sobre-camaras-y-camarillas-parlamentarias.shtml',
        'md5': '26f51f03cf580265defefb4518faec09',
        'info_dict': {
            'id': 'sobre-camaras-y-camarillas-parlamentarias',
            'ext': 'mp4',
            'title': 'Sobre Cámaras y camarillas parlamentarias',
            'thumbnail': r're:^https?://.*\.jpg$',
            'uploader': 'Fernando Atria',
        },
        'skip': 'URL expired and redirected to http://www.biobiochile.cl/portada/bbtv/index.html',
    }, {
        # different uploader layout
        'url': 'http://tv.biobiochile.cl/notas/2016/03/18/natalia-valdebenito-repasa-a-diputado-hasbun-paso-a-la-categoria-de-hablar-brutalidades.shtml',
        'md5': 'edc2e6b58974c46d5b047dea3c539ff3',
        'info_dict': {
            'id': 'natalia-valdebenito-repasa-a-diputado-hasbun-paso-a-la-categoria-de-hablar-brutalidades',
            'ext': 'mp4',
            'title': 'Natalia Valdebenito repasa a diputado Hasbún: Pasó a la categoría de hablar brutalidades',
            'thumbnail': r're:^https?://.*\.jpg$',
            'uploader': 'Piangella Obrador',
        },
        'params': {
            'skip_download': True,
        },
        'skip': 'URL expired and redirected to http://www.biobiochile.cl/portada/bbtv/index.html',
    }, {
        'url': 'http://www.biobiochile.cl/noticias/bbtv/comentarios-bio-bio/2016/07/08/edecanes-del-congreso-figuras-decorativas-que-le-cuestan-muy-caro-a-los-chilenos.shtml',
        'info_dict': {
            'id': 'b4xd0LK3SK',
            'ext': 'mp4',
            # TODO: fix url_transparent information overriding
            # 'uploader': 'Juan Pablo Echenique',
            'title': 'Comentario Oscar Cáceres',
        },
        'params': {
            # empty m3u8 manifest
            'skip_download': True,
        },
    }, {
        'url': 'http://tv.biobiochile.cl/notas/2015/10/22/ninos-transexuales-de-quien-es-la-decision.shtml',
        'only_matching': True,
    }, {
        'url': 'http://tv.biobiochile.cl/notas/2015/10/21/exclusivo-hector-pinto-formador-de-chupete-revela-version-del-ex-delantero-albo.shtml',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Locate the rudo.video iframe and return a url_transparent result for it.

        Title, thumbnail and uploader are taken from the article page.
        Raises ExtractorError when the page embeds no rudo.video player.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        embed_pattern = r'<iframe[^>]+src=(?P<q1>[\'"])(?P<url>(?:https?:)?//rudo\.video/vod/[0-9a-zA-Z]+)(?P=q1)'
        rudo_url = self._search_regex(
            embed_pattern, webpage, 'embed URL', None, group='url')
        if not rudo_url:
            raise ExtractorError('No videos found')

        result = {
            '_type': 'url_transparent',
            'url': rudo_url,
            'id': video_id,
        }
        # The Open Graph title carries a site suffix that is stripped here.
        result['title'] = remove_end(self._og_search_title(webpage), ' - BioBioChile TV')
        result['thumbnail'] = self._og_search_thumbnail(webpage)
        result['uploader'] = self._html_search_regex(
            r'<a[^>]+href=["\'](?:https?://(?:busca|www)\.biobiochile\.cl)?/(?:lista/)?(?:author|autor)[^>]+>(.+?)</a>',
            webpage, 'uploader', fatal=False)
        return result

View File

@@ -0,0 +1,105 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from .vk import VKIE
from ..compat import (
compat_b64decode,
compat_urllib_parse_unquote,
)
from ..utils import int_or_none
class BIQLEIE(InfoExtractor):
    """Extractor for biqle.(com|org|ru) watch pages, which wrap VK-hosted or external videos."""
    _VALID_URL = r'https?://(?:www\.)?biqle\.(?:com|org|ru)/watch/(?P<id>-?\d+_\d+)'
    _TESTS = [{
        # Youtube embed
        'url': 'https://biqle.ru/watch/-115995369_456239081',
        'md5': '97af5a06ee4c29bbf9c001bdb1cf5c06',
        'info_dict': {
            'id': '8v4f-avW-VI',
            'ext': 'mp4',
            'title': "PASSE-PARTOUT - L'ete c'est fait pour jouer",
            'description': 'Passe-Partout',
            'uploader_id': 'mrsimpsonstef3',
            'uploader': 'Phanolito',
            'upload_date': '20120822',
        },
    }, {
        'url': 'http://biqle.org/watch/-44781847_168547604',
        'md5': '7f24e72af1db0edf7c1aaba513174f97',
        'info_dict': {
            'id': '-44781847_168547604',
            'ext': 'mp4',
            'title': 'Ребенок в шоке от автоматической мойки',
            'timestamp': 1396633454,
            'uploader': 'Dmitry Kotov',
            'upload_date': '20140404',
            'uploader_id': '47850140',
        },
    }]

    def _real_extract(self, url):
        """Resolve the embedded player and return the video's info dict.

        Embeds that VKIE accepts, and files flagged 'external', are delegated
        via url_result. Otherwise the `video_ext` token (cookie first, embed
        page second) is used to query the VK `video.get` API.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)
        embed_url = self._proto_relative_url(self._search_regex(
            r'<iframe.+?src="((?:https?:)?//(?:daxab\.com|dxb\.to|[^/]+/player)/[^"]+)".*?></iframe>',
            webpage, 'embed url'))
        if VKIE.suitable(embed_url):
            return self.url_result(embed_url, VKIE.ie_key(), video_id)

        embed_page = self._download_webpage(
            embed_url, video_id, headers={'Referer': url})

        # Token source 1: the video_ext cookie set while loading the embed page.
        video_ext = None
        cookie = self._get_cookies(embed_url).get('video_ext')
        if cookie:
            video_ext = compat_urllib_parse_unquote(cookie.value)
        # Token source 2: the base64 value inlined in the embed page.
        if not video_ext:
            encoded = self._search_regex(
                r'video_ext\s*:\s*[\'"]([A-Za-z0-9+/=]+)',
                embed_page, 'video_ext')
            video_ext = compat_b64decode(encoded).decode()

        # The token has four colon-separated fields; the third is unused.
        video_id, sig, _, access_token = video_ext.split(':')
        api_query = {
            'access_token': access_token,
            'sig': sig,
            'v': 5.44,
            'videos': video_id,
        }
        item = self._download_json(
            'https://api.vk.com/method/video.get', video_id,
            headers={'User-Agent': 'okhttp/3.4.1'}, query=api_query)['response']['items'][0]
        title = item['title']

        formats = []
        for f_id, f_url in item.get('files', {}).items():
            if f_id == 'external':
                return self.url_result(f_url)
            # Remaining keys have the form "<ext>_<height>".
            ext, height = f_id.split('_')
            formats.append({
                'format_id': height + 'p',
                'url': f_url,
                'height': int_or_none(height),
                'ext': ext,
            })
        self._sort_formats(formats)

        # Each non-empty "photo_<width>" field becomes one thumbnail.
        thumbnails = [{
            'id': key.replace('photo_', ''),
            'url': photo_url,
            'width': int_or_none(key.replace('photo_', '')),
        } for key, photo_url in item.items() if key.startswith('photo_') and photo_url]

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            'comment_count': int_or_none(item.get('comments')),
            'description': item.get('description'),
            'duration': int_or_none(item.get('duration')),
            'thumbnails': thumbnails,
            'timestamp': int_or_none(item.get('date')),
            'uploader': item.get('owner_id'),
            'view_count': int_or_none(item.get('views')),
        }

View File

@@ -0,0 +1,158 @@
# coding: utf-8
from __future__ import unicode_literals
import itertools
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
GeoRestrictedError,
orderedSet,
unified_strdate,
urlencode_postdata,
)
class BitChuteIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?bitchute\.com/(?:video|embed|torrent/[^/]+)/(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'https://www.bitchute.com/video/UGlrF9o9b-Q/',
'md5': '7e427d7ed7af5a75b5855705ec750e2b',
'info_dict': {
'id': 'szoMrox2JEI',
'ext': 'mp4',
'title': 'This is the first video on #BitChute !',
'description': 'md5:a0337e7b1fe39e32336974af8173a034',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'BitChute',
'upload_date': '20170103',
},
}, {
'url': 'https://www.bitchute.com/embed/lbb5G1hjPhw/',
'only_matching': True,
}, {
'url': 'https://www.bitchute.com/torrent/Zee5BE49045h/szoMrox2JEI.webtorrent',
'only_matching': True,
}]
@staticmethod
def _extract_urls(webpage):
    """Return the BitChute URLs used as ``src`` of <script>/<iframe> tags in *webpage*."""
    # The extractor's own URL pattern is embedded so only URLs it can handle match.
    embed_re = r'<(?:script|iframe)[^>]+\bsrc=(["\'])(?P<url>%s)' % BitChuteIE._VALID_URL
    found = []
    for match in re.finditer(embed_re, webpage):
        found.append(match.group('url'))
    return found
def _real_extract(self, url):
    """Extract one BitChute video.

    The /video/<id> page is downloaded regardless of which URL form matched,
    and formats plus metadata are scraped from that page.

    Raises GeoRestrictedError when the page reports 'Video Unavailable' and
    ExtractorError for any other page without playable media.
    """
    video_id = self._match_id(url)
    page_url = 'https://www.bitchute.com/video/%s' % video_id
    # The page is requested with a fixed desktop-browser User-Agent.
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.57 Safari/537.36',
    }
    webpage = self._download_webpage(page_url, video_id, headers=request_headers)

    # Title fallbacks, in order: on-page title element / <title>, the
    # description meta tag, then the OpenGraph description.
    title = self._html_search_regex(
        (r'<[^>]+\bid=["\']video-title[^>]+>([^<]+)', r'<title>([^<]+)'),
        webpage, 'title', default=None)
    if not title:
        title = self._html_search_meta(
            'description', webpage, 'title', default=None)
    if not title:
        title = self._og_search_description(webpage)

    # Candidate media URLs: addWebSeed(...) calls first, then "as=" parameters.
    format_urls = [
        seed.group('url')
        for seed in re.finditer(
            r'addWebSeed\s*\(\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage)]
    format_urls += re.findall(r'as=(https?://[^&"\']+)', webpage)

    formats = []
    for format_url in orderedSet(format_urls):
        formats.append({'url': format_url})

    if not formats:
        # Nothing scraped: fall back to HTML5 <video>/<audio> parsing.
        entries = self._parse_html5_media_entries(url, webpage, video_id)
        if not entries:
            error = self._html_search_regex(r'<h1 class="page-title">([^<]+)</h1>', webpage, 'error', default='Cannot find video')
            if error == 'Video Unavailable':
                raise GeoRestrictedError(error)
            raise ExtractorError(error)
        formats = entries[0]['formats']

    self._check_formats(formats, video_id)
    self._sort_formats(formats)

    description = self._html_search_regex(
        r'(?s)<div\b[^>]+\bclass=["\']full hidden[^>]+>(.+?)</div>',
        webpage, 'description', fatal=False)

    thumbnail = self._og_search_thumbnail(webpage, default=None)
    if not thumbnail:
        thumbnail = self._html_search_meta(
            'twitter:image:src', webpage, 'thumbnail')

    uploader_patterns = (
        r'(?s)<div class=["\']channel-banner.*?<p\b[^>]+\bclass=["\']name[^>]+>(.+?)</p>',
        r'(?s)<p\b[^>]+\bclass=["\']video-author[^>]+>(.+?)</p>',
    )
    uploader = self._html_search_regex(
        uploader_patterns, webpage, 'uploader', fatal=False)

    publish_date = self._search_regex(
        r'class=["\']video-publish-date[^>]+>[^<]+ at \d+:\d+ UTC on (.+?)\.',
        webpage, 'upload date', fatal=False)
    upload_date = unified_strdate(publish_date)

    return {
        'id': video_id,
        'title': title,
        'description': description,
        'thumbnail': thumbnail,
        'uploader': uploader,
        'upload_date': upload_date,
        'formats': formats,
    }
class BitChuteChannelIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?bitchute\.com/channel/(?P<id>[^/?#&]+)'
_TEST = {
'url': 'https://www.bitchute.com/channel/victoriaxrave/',
'playlist_mincount': 185,
'info_dict': {
'id': 'victoriaxrave',
},
}
_TOKEN = 'zyG6tQcGPE5swyAEFLqKUwMuMMuF6IO2DZ6ZDQjGfsL0e4dcTLwqkTTul05Jdve7'
def _entries(self, channel_id):
    """Yield a url_result for every video of the channel.

    Pages are requested from the channel's ``extend/`` endpoint with a
    growing ``offset`` until the response reports failure, has no HTML, or
    the HTML contains no video links.
    """
    channel_url = 'https://www.bitchute.com/channel/%s/' % channel_id
    extend_url = '%sextend/' % channel_url
    offset = 0
    for page_num in itertools.count(1):
        # The same token is sent both as the form field and as the cookie.
        post_data = urlencode_postdata({
            'csrfmiddlewaretoken': self._TOKEN,
            'name': '',
            'offset': offset,
        })
        request_headers = {
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Referer': channel_url,
            'X-Requested-With': 'XMLHttpRequest',
            'Cookie': 'csrftoken=%s' % self._TOKEN,
        }
        data = self._download_json(
            extend_url, channel_id,
            'Downloading channel page %d' % page_num,
            data=post_data, headers=request_headers)
        if data.get('success') is False or not data.get('html'):
            break
        video_ids = re.findall(
            r'class=["\']channel-videos-image-container[^>]+>\s*<a\b[^>]+\bhref=["\']/video/([^"\'/]+)',
            data['html'])
        if not video_ids:
            break
        # The next page starts after everything found so far.
        offset += len(video_ids)
        for video_id in video_ids:
            yield self.url_result(
                'https://www.bitchute.com/video/%s' % video_id,
                ie=BitChuteIE.ie_key(), video_id=video_id)
def _real_extract(self, url):
    """Return the channel as a playlist whose entries come from ``_entries``."""
    channel_id = self._match_id(url)
    entries = self._entries(channel_id)
    return self.playlist_result(entries, playlist_id=channel_id)

View File

@@ -0,0 +1,61 @@
from __future__ import unicode_literals
from .common import InfoExtractor
class BitwaveReplayIE(InfoExtractor):
IE_NAME = 'bitwave:replay'
_VALID_URL = r'https?://(?:www\.)?bitwave\.tv/(?P<user>\w+)/replay/(?P<id>\w+)/?$'
_TEST = {
'url': 'https://bitwave.tv/RhythmicCarnage/replay/z4P6eq5L7WDrM85UCrVr',
'only_matching': True
}
def _real_extract(self, url):
    """Build the info dict for a replay from the ``/v1/replays/<id>`` API response."""
    replay_id = self._match_id(url)
    api_url = 'https://api.bitwave.tv/v1/replays/' + replay_id
    replay = self._download_json(api_url, replay_id)
    data = replay['data']

    return {
        'id': replay_id,
        'title': data['title'],
        # The API's "name" field is used for both uploader fields.
        'uploader': data['name'],
        'uploader_id': data['name'],
        'url': data['url'],
        'thumbnails': [
            {'url': thumb_url} for thumb_url in data['thumbnails']
        ],
    }
class BitwaveStreamIE(InfoExtractor):
IE_NAME = 'bitwave:stream'
_VALID_URL = r'https?://(?:www\.)?bitwave\.tv/(?P<id>\w+)/?$'
_TEST = {
'url': 'https://bitwave.tv/doomtube',
'only_matching': True
}
def _real_extract(self, url):
    """Build the info dict for a live channel from the ``/v1/channels/<user>`` API response."""
    username = self._match_id(url)
    api_url = 'https://api.bitwave.tv/v1/channels/' + username
    channel = self._download_json(api_url, username)
    data = channel['data']

    # The channel URL is handed to the m3u8 extractor with 'mp4' as extension.
    formats = self._extract_m3u8_formats(data['url'], username, 'mp4')
    self._sort_formats(formats)

    return {
        'id': username,
        'title': data['title'],
        'uploader': username,
        'uploader_id': username,
        'formats': formats,
        'thumbnail': data['thumbnail'],
        'is_live': True,
        'view_count': data['viewCount']
    }

View File

@@ -0,0 +1,67 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import parse_iso8601
class BlackboardCollaborateIE(InfoExtractor):
_VALID_URL = r'''(?x)
https?://
(?P<region>[a-z-]+)\.bbcollab\.com/
(?:
collab/ui/session/playback/load|
recording
)/
(?P<id>[^/]+)'''
_TESTS = [
{
'url': 'https://us-lti.bbcollab.com/collab/ui/session/playback/load/0a633b6a88824deb8c918f470b22b256',
'md5': 'bb7a055682ee4f25fdb5838cdf014541',
'info_dict': {
'id': '0a633b6a88824deb8c918f470b22b256',
'title': 'HESI A2 Information Session - Thursday, May 6, 2021 - recording_1',
'ext': 'mp4',
'duration': 1896000,
'timestamp': 1620331399,
'upload_date': '20210506',
},
},
{
'url': 'https://us.bbcollab.com/collab/ui/session/playback/load/76761522adfe4345a0dee6794bbcabda',
'only_matching': True,
},
{
'url': 'https://ca.bbcollab.com/collab/ui/session/playback/load/b6399dcb44df4f21b29ebe581e22479d',
'only_matching': True,
},
{
'url': 'https://eu.bbcollab.com/recording/51ed7b50810c4444a106e48cefb3e6b5',
'only_matching': True,
},
{
'url': 'https://au.bbcollab.com/collab/ui/session/playback/load/2bccf7165d7c419ab87afc1ec3f3bb15',
'only_matching': True,
},
]
def _real_extract(self, url):
    """Extract a recording via the region-specific ``recordings/<id>/data`` API."""
    match = self._match_valid_url(url)
    region = match.group('region')
    video_id = match.group('id')

    api_url = 'https://{}.bbcollab.com/collab/api/csa/recordings/{}/data'.format(region, video_id)
    info = self._download_json(api_url, video_id)

    duration = info.get('duration')
    title = info['name']
    created = info.get('created')

    # One format per entry of the "streams" mapping, keyed by stream name.
    formats = []
    for stream_name, stream_url in info['streams'].items():
        formats.append({'format_id': stream_name, 'url': stream_url})

    return {
        'duration': duration,
        'formats': formats,
        'id': video_id,
        'timestamp': parse_iso8601(created),
        'title': title,
    }

View File

@@ -0,0 +1,112 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from .amp import AMPIE
from ..utils import (
ExtractorError,
int_or_none,
parse_iso8601,
)
class BleacherReportIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/articles/(?P<id>\d+)'
_TESTS = [{
'url': 'http://bleacherreport.com/articles/2496438-fsu-stat-projections-is-jalen-ramsey-best-defensive-player-in-college-football',
'md5': 'a3ffc3dc73afdbc2010f02d98f990f20',
'info_dict': {
'id': '2496438',
'ext': 'mp4',
'title': 'FSU Stat Projections: Is Jalen Ramsey Best Defensive Player in College Football?',
'uploader_id': 3992341,
'description': 'CFB, ACC, Florida State',
'timestamp': 1434380212,
'upload_date': '20150615',
'uploader': 'Team Stream Now ',
},
'add_ie': ['Ooyala'],
}, {
'url': 'http://bleacherreport.com/articles/2586817-aussie-golfers-get-fright-of-their-lives-after-being-chased-by-angry-kangaroo',
'md5': '6a5cd403418c7b01719248ca97fb0692',
'info_dict': {
'id': '2586817',
'ext': 'webm',
'title': 'Aussie Golfers Get Fright of Their Lives After Being Chased by Angry Kangaroo',
'timestamp': 1446839961,
'uploader': 'Sean Fay',
'description': 'md5:b1601e2314c4d8eec23b6eafe086a757',
'uploader_id': 6466954,
'upload_date': '20151011',
},
'add_ie': ['Youtube'],
}]
def _real_extract(self, url):
    """Resolve an article to its embedded video as a ``url_transparent`` result.

    Article metadata comes from the v1 articles API; the video itself is
    delegated to another extractor through the URL chosen from the video's
    ``type`` field.

    Raises ExtractorError (expected) when the article has no video.
    """
    article_id = self._match_id(url)

    article_data = self._download_json('http://api.bleacherreport.com/api/v1/articles/%s' % article_id, article_id)['article']

    thumbnails = []
    primary_photo = article_data.get('primaryPhoto')
    if primary_photo:
        thumbnails = [{
            'url': primary_photo['url'],
            'width': primary_photo.get('width'),
            'height': primary_photo.get('height'),
        }]

    # dict.get's default only applies to a *missing* key; an explicit
    # "author": null would hand back None and break the chained .get().
    author = article_data.get('author') or {}

    info = {
        '_type': 'url_transparent',
        'id': article_id,
        'title': article_data['title'],
        'uploader': author.get('name'),
        'uploader_id': article_data.get('authorId'),
        'timestamp': parse_iso8601(article_data.get('createdAt')),
        'thumbnails': thumbnails,
        'comment_count': int_or_none(article_data.get('commentsCount')),
        'view_count': int_or_none(article_data.get('hitCount')),
    }

    video = article_data.get('video')
    if video:
        video_type = video['type']
        if video_type in ('cms.bleacherreport.com', 'vid.bleacherreport.com'):
            info['url'] = 'http://bleacherreport.com/video_embed?id=%s' % video['id']
        elif video_type == 'ooyala.com':
            info['url'] = 'ooyala:%s' % video['id']
        elif video_type == 'youtube.com':
            info['url'] = video['id']
        elif video_type == 'vine.co':
            info['url'] = 'https://vine.co/v/%s' % video['id']
        else:
            # Unknown provider: the type and id are simply concatenated.
            info['url'] = video_type + video['id']
        return info
    else:
        raise ExtractorError('no video in the article', expected=True)
class BleacherReportCMSIE(AMPIE):
_VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/video_embed\?id=(?P<id>[0-9a-f-]{36}|\d{5})'
_TESTS = [{
'url': 'http://bleacherreport.com/video_embed?id=8fd44c2f-3dc5-4821-9118-2c825a98c0e1&library=video-cms',
'md5': '670b2d73f48549da032861130488c681',
'info_dict': {
'id': '8fd44c2f-3dc5-4821-9118-2c825a98c0e1',
'ext': 'mp4',
'title': 'Cena vs. Rollins Would Expose the Heavyweight Division',
'description': 'md5:984afb4ade2f9c0db35f3267ed88b36e',
'upload_date': '20150723',
'timestamp': 1437679032,
},
'expected_warnings': [
'Unable to download f4m manifest'
]
}]
def _real_extract(self, url):
    """Extract a CMS video from its ``.akamai`` feed, reporting the URL's id as the video id."""
    video_id = self._match_id(url)
    feed_url = 'http://vid.bleacherreport.com/videos/%s.akamai' % video_id
    info = self._extract_feed_info(feed_url)
    # Overwrite whatever id the feed parser set with the id from the URL.
    info['id'] = video_id
    return info

View File

@@ -0,0 +1,86 @@
from __future__ import unicode_literals
import json
from .common import InfoExtractor
from ..utils import (
remove_start,
int_or_none,
)
class BlinkxIE(InfoExtractor):
_VALID_URL = r'(?:https?://(?:www\.)blinkx\.com/#?ce/|blinkx:)(?P<id>[^?]+)'
IE_NAME = 'blinkx'
_TEST = {
'url': 'http://www.blinkx.com/ce/Da0Gw3xc5ucpNduzLuDDlv4WC9PuI4fDi1-t6Y3LyfdY2SZS5Urbvn-UPJvrvbo8LTKTc67Wu2rPKSQDJyZeeORCR8bYkhs8lI7eqddznH2ofh5WEEdjYXnoRtj7ByQwt7atMErmXIeYKPsSDuMAAqJDlQZ-3Ff4HJVeH_s3Gh8oQ',
'md5': '337cf7a344663ec79bf93a526a2e06c7',
'info_dict': {
'id': 'Da0Gw3xc',
'ext': 'mp4',
'title': 'No Daily Show for John Oliver; HBO Show Renewed - IGN News',
'uploader': 'IGN News',
'upload_date': '20150217',
'timestamp': 1424215740,
'description': 'HBO has renewed Last Week Tonight With John Oliver for two more seasons.',
'duration': 47.743333,
},
}
def _real_extract(self, url):
    """Extract a blinkx video through the ``play_video`` API.

    Each item of the result's ``media`` list is dispatched on its ``type``:
    thumbnails, the duration carrier, a YouTube hand-off, or a direct
    flv/mp4 format.
    """
    video_id = self._match_id(url)
    # Only the first 8 characters of the long id are used for display.
    display_id = video_id[:8]

    api_url = 'https://apib4.blinkx.com/api.php?action=play_video&video=%s' % video_id
    data_json = self._download_webpage(api_url, display_id)
    data = json.loads(data_json)['api']['results'][0]

    duration = None
    thumbnails = []
    formats = []
    for media in data['media']:
        media_type = media['type']
        if media_type == 'jpg':
            thumbnails.append({
                'url': media['link'],
                'width': int(media['w']),
                'height': int(media['h']),
            })
        elif media_type == 'original':
            duration = float(media['d'])
        elif media_type == 'youtube':
            # The whole extraction is delegated as soon as a YouTube item is seen.
            yt_id = media['link']
            self.to_screen('Youtube video detected: %s' % yt_id)
            return self.url_result(yt_id, 'Youtube', video_id=yt_id)
        elif media_type in ('flv', 'mp4'):
            vcodec = remove_start(media['vcodec'], 'ff')
            acodec = remove_start(media['acodec'], 'ff')
            vbr = int_or_none(media.get('vbr') or media.get('vbitrate'), 1000)
            abr = int_or_none(media.get('abr') or media.get('abitrate'), 1000)
            # Total bitrate is only known when both parts are.
            if vbr and abr:
                tbr = vbr + abr
            else:
                tbr = None
            formats.append({
                'format_id': '%s-%sk-%s' % (vcodec, tbr, media['w']),
                'url': media['link'],
                'vcodec': vcodec,
                'acodec': acodec,
                'abr': abr,
                'vbr': vbr,
                'tbr': tbr,
                'width': int_or_none(media.get('w')),
                'height': int_or_none(media.get('h')),
            })
    self._sort_formats(formats)

    return {
        'id': display_id,
        'fullid': video_id,
        'title': data['title'],
        'formats': formats,
        'uploader': data.get('channel_name'),
        'timestamp': data.get('pubdate_epoch'),
        'description': data.get('description'),
        'thumbnails': thumbnails,
        'duration': duration,
    }

View File

@@ -0,0 +1,54 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from ..utils import (
mimetype2ext,
parse_duration,
parse_qs,
str_or_none,
traverse_obj,
)
from .common import InfoExtractor
class BloggerIE(InfoExtractor):
IE_NAME = 'blogger.com'
_VALID_URL = r'https?://(?:www\.)?blogger\.com/video\.g\?token=(?P<id>.+)'
_VALID_EMBED = r'''<iframe[^>]+src=["']((?:https?:)?//(?:www\.)?blogger\.com/video\.g\?token=[^"']+)["']'''
_TESTS = [{
'url': 'https://www.blogger.com/video.g?token=AD6v5dzEe9hfcARr5Hlq1WTkYy6t-fXH3BBahVhGvVHe5szdEUBEloSEDSTA8-b111089KbfWuBvTN7fnbxMtymsHhXAXwVvyzHH4Qch2cfLQdGxKQrrEuFpC1amSl_9GuLWODjPgw',
'md5': 'f1bc19b6ea1b0fd1d81e84ca9ec467ac',
'info_dict': {
'id': 'BLOGGER-video-3c740e3a49197e16-796',
'title': 'BLOGGER-video-3c740e3a49197e16-796',
'ext': 'mp4',
'thumbnail': r're:^https?://.*',
'duration': 76.068,
}
}]
@staticmethod
def _extract_urls(webpage):
    """Return the ``src`` of every Blogger video iframe found in *webpage*."""
    # _VALID_EMBED has a single capturing group: the iframe src.
    return [
        match.group(1)
        for match in re.finditer(BloggerIE._VALID_EMBED, webpage)]
def _real_extract(self, url):
    """Extract a Blogger video from the ``VIDEO_CONFIG`` object embedded in the player page."""
    token_id = self._match_id(url)
    webpage = self._download_webpage(url, token_id)

    raw_config = self._search_regex(r'var\s+VIDEO_CONFIG\s*=\s*(\{.*)', webpage, 'JSON data')
    # Escape sequences in the scraped text are resolved before JSON parsing.
    unescaped_config = raw_config.encode('utf-8').decode('unicode_escape')
    data = self._parse_json(unescaped_config, token_id)
    streams = data['streams']

    formats = []
    for stream in streams:
        # The container type is read from the "mime" query parameter of the play URL.
        mime = traverse_obj(parse_qs(stream['play_url']), ('mime', 0))
        formats.append({
            'ext': mimetype2ext(mime),
            'url': stream['play_url'],
            'format_id': str_or_none(stream.get('format_id')),
        })

    # Duration is taken from the "dur" query parameter of the first stream.
    first_stream_qs = parse_qs(streams[0]['play_url'])

    return {
        'id': data.get('iframe_id', token_id),
        'title': data.get('iframe_id', token_id),
        'formats': formats,
        'thumbnail': data.get('thumbnail'),
        'duration': parse_duration(traverse_obj(first_stream_qs, ('dur', 0))),
    }

View File

@@ -0,0 +1,83 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
class BloombergIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?bloomberg\.com/(?:[^/]+/)*(?P<id>[^/?#]+)'
_TESTS = [{
'url': 'http://www.bloomberg.com/news/videos/b/aaeae121-5949-481e-a1ce-4562db6f5df2',
# The md5 checksum changes
'info_dict': {
'id': 'qurhIVlJSB6hzkVi229d8g',
'ext': 'flv',
'title': 'Shah\'s Presentation on Foreign-Exchange Strategies',
'description': 'md5:a8ba0302912d03d246979735c17d2761',
},
'params': {
'format': 'best[format_id^=hds]',
},
}, {
# video ID in BPlayer(...)
'url': 'http://www.bloomberg.com/features/2016-hello-world-new-zealand/',
'info_dict': {
'id': '938c7e72-3f25-4ddb-8b85-a9be731baa74',
'ext': 'flv',
'title': 'Meet the Real-Life Tech Wizards of Middle Earth',
'description': 'Hello World, Episode 1: New Zealands freaky AI babies, robot exoskeletons, and a virtual you.',
},
'params': {
'format': 'best[format_id^=hds]',
},
}, {
# data-bmmrid=
'url': 'https://www.bloomberg.com/politics/articles/2017-02-08/le-pen-aide-briefed-french-central-banker-on-plan-to-print-money',
'only_matching': True,
}, {
'url': 'http://www.bloomberg.com/news/articles/2015-11-12/five-strange-things-that-have-been-happening-in-financial-markets',
'only_matching': True,
}, {
'url': 'http://www.bloomberg.com/politics/videos/2015-11-25/karl-rove-on-jeb-bush-s-struggles-stopping-trump',
'only_matching': True,
}]
def _real_extract(self, url):
    """Extract a Bloomberg video.

    The video id is scraped from the page (several inline patterns, then a
    ``BPlayer(null, {...})`` call) and the streams are fetched from the
    embed API.
    """
    name = self._match_id(url)
    webpage = self._download_webpage(url, name)

    id_patterns = (
        r'["\']bmmrId["\']\s*:\s*(["\'])(?P<id>(?:(?!\1).)+)\1',
        r'videoId\s*:\s*(["\'])(?P<id>(?:(?!\1).)+)\1',
        r'data-bmmrid=(["\'])(?P<id>(?:(?!\1).)+)\1',
    )
    video_id = self._search_regex(
        id_patterns, webpage, 'id', group='id', default=None)
    if not video_id:
        # Fallback: the id lives in the JSON argument of BPlayer(...).
        bplayer_json = self._search_regex(
            r'BPlayer\(null,\s*({[^;]+})\);', webpage, 'id')
        video_id = self._parse_json(bplayer_json, name)['id']

    # Drop the trailing ": Video" from the OpenGraph title.
    title = re.sub(': Video$', '', self._og_search_title(webpage))

    embed_info = self._download_json(
        'http://www.bloomberg.com/api/embed?id=%s' % video_id, video_id)

    formats = []
    for stream in embed_info['streams']:
        stream_url = stream.get('url')
        if not stream_url:
            continue
        # 'TS' streams are HLS playlists; everything else is treated as HDS.
        if stream['muxing_format'] == 'TS':
            stream_formats = self._extract_m3u8_formats(
                stream_url, video_id, 'mp4', m3u8_id='hls', fatal=False)
        else:
            stream_formats = self._extract_f4m_formats(
                stream_url, video_id, f4m_id='hds', fatal=False)
        formats.extend(stream_formats)
    self._sort_formats(formats)

    return {
        'id': video_id,
        'title': title,
        'formats': formats,
        'description': self._og_search_description(webpage),
        'thumbnail': self._og_search_thumbnail(webpage),
    }

View File

@@ -0,0 +1,59 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import compat_parse_qs
from ..utils import ExtractorError
class BokeCCBaseIE(InfoExtractor):
def _extract_bokecc_formats(self, webpage, video_id, format_id=None):
    """Return sorted formats for the BokeCC player embedded in *webpage*.

    The player's query string supplies ``siteid`` and ``vid``, which are used
    to fetch the playinfo XML; each ``./video/quality`` element becomes one
    format. *format_id* is applied unchanged to every format.
    """
    query_string = self._html_search_regex(
        r'<(?:script|embed)[^>]+src=(?P<q>["\'])(?:https?:)?//p\.bokecc\.com/(?:player|flash/player\.swf)\?(?P<query>.+?)(?P=q)',
        webpage, 'player params', group='query')
    params = compat_parse_qs(query_string)

    playinfo_url = 'http://p.bokecc.com/servlet/playinfo?uid=%s&vid=%s&m=1' % (
        params['siteid'][0], params['vid'][0])
    info_xml = self._download_xml(playinfo_url, video_id)

    formats = []
    for quality in info_xml.findall('./video/quality'):
        formats.append({
            'format_id': format_id,
            'url': quality.find('./copy').attrib['playurl'],
            'quality': int(quality.attrib['value']),
        })

    self._sort_formats(formats)
    return formats
class BokeCCIE(BokeCCBaseIE):
_IE_DESC = 'CC视频'
_VALID_URL = r'https?://union\.bokecc\.com/playvideo\.bo\?(?P<query>.*)'
_TESTS = [{
'url': 'http://union.bokecc.com/playvideo.bo?vid=E0ABAE9D4F509B189C33DC5901307461&uid=FE644790DE9D154A',
'info_dict': {
'id': 'FE644790DE9D154A_E0ABAE9D4F509B189C33DC5901307461',
'ext': 'flv',
'title': 'BokeCC Video',
},
}]
def _real_extract(self, url):
    """Extract a BokeCC video; the id is ``<uid>_<vid>`` from the URL query.

    Raises ExtractorError (expected) when ``vid`` or ``uid`` is absent.
    """
    query = self._match_valid_url(url).group('query')
    qs = compat_parse_qs(query)
    if not (qs.get('vid') and qs.get('uid')):
        raise ExtractorError('Invalid URL', expected=True)

    video_id = '%s_%s' % (qs['uid'][0], qs['vid'][0])
    webpage = self._download_webpage(url, video_id)
    formats = self._extract_bokecc_formats(webpage, video_id)

    return {
        'id': video_id,
        'title': 'BokeCC Video',  # no title provided in the webpage
        'formats': formats,
    }

View File

@@ -0,0 +1,59 @@
from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
int_or_none,
try_get,
urlencode_postdata,
)
class BongaCamsIE(InfoExtractor):
_VALID_URL = r'https?://(?P<host>(?:[^/]+\.)?bongacams\d*\.com)/(?P<id>[^/?&#]+)'
_TESTS = [{
'url': 'https://de.bongacams.com/azumi-8',
'only_matching': True,
}, {
'url': 'https://cn.bongacams.com/azumi-8',
'only_matching': True,
}]
def _real_extract(self, url):
    """Extract a live room via the site's ``tools/amf.php`` ``getRoomData`` call."""
    match = self._match_valid_url(url)
    host, channel_id = match.group('host'), match.group('id')

    # 'args[]' appears twice, so the form is given as a sequence of pairs.
    form = (
        ('method', 'getRoomData'),
        ('args[]', channel_id),
        ('args[]', 'false'),
    )
    amf = self._download_json(
        'https://%s/tools/amf.php' % host, channel_id,
        data=urlencode_postdata(form),
        headers={'X-Requested-With': 'XMLHttpRequest'})

    server_url = amf['localData']['videoServerUrl']

    # Fall back to the channel id from the URL when no username is returned.
    uploader_id = try_get(
        amf, lambda x: x['performerData']['username'], compat_str)
    if not uploader_id:
        uploader_id = channel_id
    uploader = try_get(
        amf, lambda x: x['performerData']['displayName'], compat_str)
    like_count = int_or_none(try_get(
        amf, lambda x: x['performerData']['loversCount']))

    playlist_url = '%s/hls/stream_%s/playlist.m3u8' % (server_url, uploader_id)
    formats = self._extract_m3u8_formats(
        playlist_url, channel_id, 'mp4', m3u8_id='hls', live=True)
    self._sort_formats(formats)

    return {
        'id': channel_id,
        'title': uploader or uploader_id,
        'uploader': uploader,
        'uploader_id': uploader_id,
        'like_count': like_count,
        'age_limit': 18,
        'is_live': True,
        'formats': formats,
    }

View File

@@ -0,0 +1,72 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
extract_attributes,
)
class BostonGlobeIE(InfoExtractor):
_VALID_URL = r'(?i)https?://(?:www\.)?bostonglobe\.com/.*/(?P<id>[^/]+)/\w+(?:\.html)?'
_TESTS = [
{
'url': 'http://www.bostonglobe.com/metro/2017/02/11/tree-finally-succumbs-disease-leaving-hole-neighborhood/h1b4lviqzMTIn9sVy8F3gP/story.html',
'md5': '0a62181079c85c2d2b618c9a738aedaf',
'info_dict': {
'title': 'A tree finally succumbs to disease, leaving a hole in a neighborhood',
'id': '5320421710001',
'ext': 'mp4',
'description': 'It arrived as a sapling when the Back Bay was in its infancy, a spindly American elm tamped down into a square of dirt cut into the brick sidewalk of 1880s Marlborough Street, no higher than the first bay window of the new brownstone behind it.',
'timestamp': 1486877593,
'upload_date': '20170212',
'uploader_id': '245991542',
},
},
{
# Embedded youtube video; we hand it off to the Generic extractor.
'url': 'https://www.bostonglobe.com/lifestyle/names/2017/02/17/does-ben-affleck-play-matt-damon-favorite-version-batman/ruqkc9VxKBYmh5txn1XhSI/story.html',
'md5': '582b40327089d5c0c949b3c54b13c24b',
'info_dict': {
'title': "Who Is Matt Damon's Favorite Batman?",
'id': 'ZW1QCnlA6Qc',
'ext': 'mp4',
'upload_date': '20170217',
'description': 'md5:3b3dccb9375867e0b4d527ed87d307cb',
'uploader': 'The Late Late Show with James Corden',
'uploader_id': 'TheLateLateShow',
},
'expected_warnings': ['404'],
},
]
def _real_extract(self, url):
    """Hand a Boston Globe article off to the right extractor.

    Brightcove <video> tags are turned into player URLs: none found means the
    page goes to the Generic extractor, one yields a single BrightcoveNew
    result, several yield a playlist.
    """
    page_id = self._match_id(url)
    webpage = self._download_webpage(url, page_id)
    page_title = self._og_search_title(webpage, default=None)

    # <video data-brightcove-video-id="5320421710001" data-account="245991542" data-player="SJWAiyYWg" data-embed="default" class="video-js" controls itemscope itemtype="http://schema.org/VideoObject">
    entries = []
    for video_tag in re.findall(r'(?i)(<video[^>]+>)', webpage):
        attrs = extract_attributes(video_tag)
        video_id = attrs.get('data-brightcove-video-id')
        account_id = attrs.get('data-account')
        player_id = attrs.get('data-player')
        embed = attrs.get('data-embed')
        # All four attributes are needed to build the player URL.
        if not (video_id and account_id and player_id and embed):
            continue
        entries.append(
            'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s'
            % (account_id, player_id, embed, video_id))

    if not entries:
        return self.url_result(url, 'Generic')
    if len(entries) == 1:
        return self.url_result(entries[0], 'BrightcoveNew')
    return self.playlist_from_matches(entries, page_id, page_title, ie='BrightcoveNew')

View File

@@ -0,0 +1,97 @@
# coding: utf-8
from __future__ import unicode_literals
import json
from .common import InfoExtractor
from ..utils import (
determine_ext,
parse_iso8601,
# try_get,
update_url_query,
)
class BoxIE(InfoExtractor):
_VALID_URL = r'https?://(?:[^.]+\.)?app\.box\.com/s/(?P<shared_name>[^/]+)/file/(?P<id>\d+)'
_TEST = {
'url': 'https://mlssoccer.app.box.com/s/0evd2o3e08l60lr4ygukepvnkord1o1x/file/510727257538',
'md5': '1f81b2fd3960f38a40a3b8823e5fcd43',
'info_dict': {
'id': '510727257538',
'ext': 'mp4',
'title': 'Garber St. Louis will be 28th MLS team +scarving.mp4',
'uploader': 'MLS Video',
'timestamp': 1566320259,
'upload_date': '20190820',
'uploader_id': '235196876',
}
}
def _real_extract(self, url):
    """Extract a file from a Box shared link.

    Steps: read ``requestToken`` from the page's ``Box.config``, exchange it
    for a per-file read token, fetch the file metadata from the 2.0 API, and
    expose the authenticated download URL as the only format.
    """
    shared_name, file_id = self._match_valid_url(url).groups()
    webpage = self._download_webpage(url, file_id)

    box_config = self._parse_json(self._search_regex(
        r'Box\.config\s*=\s*({.+?});', webpage,
        'Box config'), file_id)
    request_token = box_config['requestToken']

    tokens = self._download_json(
        'https://app.box.com/app-api/enduserapp/elements/tokens', file_id,
        'Downloading token JSON metadata',
        data=json.dumps({'fileIDs': [file_id]}).encode(), headers={
            'Content-Type': 'application/json',
            'X-Request-Token': request_token,
            'X-Box-EndUser-API': 'sharedName=' + shared_name,
        })
    access_token = tokens[file_id]['read']

    shared_link = 'https://app.box.com/s/' + shared_name
    f = self._download_json(
        'https://api.box.com/2.0/files/' + file_id, file_id,
        'Downloading file JSON metadata', headers={
            'Authorization': 'Bearer ' + access_token,
            'BoxApi': 'shared_link=' + shared_link,
            'X-Rep-Hints': '[dash]',  # TODO: extract `hls` formats
        }, query={
            'fields': 'authenticated_download_url,created_at,created_by,description,extension,is_download_available,name,representations,size'
        })
    title = f['name']

    # Appended to the download URL below.
    query = {
        'access_token': access_token,
        'shared_link': shared_link
    }

    formats = []

    # for entry in (try_get(f, lambda x: x['representations']['entries'], list) or []):
    #     entry_url_template = try_get(
    #         entry, lambda x: x['content']['url_template'])
    #     if not entry_url_template:
    #         continue
    #     representation = entry.get('representation')
    #     if representation == 'dash':
    #         TODO: append query to every fragment URL
    #         formats.extend(self._extract_mpd_formats(
    #             entry_url_template.replace('{+asset_path}', 'manifest.mpd'),
    #             file_id, query=query))

    download_url = f.get('authenticated_download_url')
    if download_url and f.get('is_download_available'):
        formats.append({
            'ext': f.get('extension') or determine_ext(title),
            'filesize': f.get('size'),
            'format_id': 'download',
            'url': update_url_query(download_url, query),
        })

    self._sort_formats(formats)

    creator = f.get('created_by') or {}

    return {
        'id': file_id,
        'title': title,
        'formats': formats,
        'description': f.get('description') or None,
        'uploader': creator.get('name'),
        'timestamp': parse_iso8601(f.get('created_at')),
        'uploader_id': creator.get('id'),
    }

View File

@@ -0,0 +1,62 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
js_to_json,
determine_ext,
)
class BpbIE(InfoExtractor):
IE_DESC = 'Bundeszentrale für politische Bildung'
_VALID_URL = r'https?://(?:www\.)?bpb\.de/mediathek/(?P<id>[0-9]+)/'
_TEST = {
'url': 'http://www.bpb.de/mediathek/297/joachim-gauck-zu-1989-und-die-erinnerung-an-die-ddr',
# md5 fails in Python 2.6 due to buggy server response and wrong handling of urllib2
'md5': 'c4f84c8a8044ca9ff68bb8441d300b3f',
'info_dict': {
'id': '297',
'ext': 'mp4',
'title': 'Joachim Gauck zu 1989 und die Erinnerung an die DDR',
'description': 'Joachim Gauck, erster Beauftragter für die Stasi-Unterlagen, spricht auf dem Geschichtsforum über die friedliche Revolution 1989 und eine "gewisse Traurigkeit" im Umgang mit der DDR-Vergangenheit.'
}
}
def _real_extract(self, url):
    """Extract a bpb.de Mediathek video from the JS source objects embedded in the page."""
    video_id = self._match_id(url)
    webpage = self._download_webpage(url, video_id)

    title = self._html_search_regex(
        r'<h2 class="white">(.*?)</h2>', webpage, 'title')

    # Each match is a JS object literal whose src points at film.bpb.de.
    raw_sources = re.findall(
        r"({\s*src\s*:\s*'https?://film\.bpb\.de/[^}]+})", webpage)

    formats = []
    for raw_source in raw_sources:
        source = self._parse_json(
            raw_source, video_id, transform_source=js_to_json, fatal=False)
        if not source:
            continue
        video_url = source.get('src')
        if not video_url:
            continue
        # Only two quality levels are distinguished, by a marker in the URL.
        is_high = '_high' in video_url
        quality = 'high' if is_high else 'low'
        formats.append({
            'url': video_url,
            'quality': 10 if is_high else 0,
            'format_note': quality,
            'format_id': '%s-%s' % (quality, determine_ext(video_url)),
        })

    self._sort_formats(formats)

    return {
        'id': video_id,
        'formats': formats,
        'title': title,
        'description': self._og_search_description(webpage),
    }

View File

@@ -0,0 +1,310 @@
# coding: utf-8
from __future__ import unicode_literals
import json
from .common import InfoExtractor
from ..utils import (
determine_ext,
ExtractorError,
int_or_none,
parse_duration,
parse_iso8601,
xpath_element,
xpath_text,
)
class BRIE(InfoExtractor):
IE_DESC = 'Bayerischer Rundfunk'
_VALID_URL = r'(?P<base_url>https?://(?:www\.)?br(?:-klassik)?\.de)/(?:[a-z0-9\-_]+/)+(?P<id>[a-z0-9\-_]+)\.html'
_TESTS = [
{
'url': 'http://www.br.de/mediathek/video/sendungen/abendschau/betriebliche-altersvorsorge-104.html',
'md5': '83a0477cf0b8451027eb566d88b51106',
'info_dict': {
'id': '48f656ef-287e-486f-be86-459122db22cc',
'ext': 'mp4',
'title': 'Die böse Überraschung',
'description': 'md5:ce9ac81b466ce775b8018f6801b48ac9',
'duration': 180,
'uploader': 'Reinhard Weber',
'upload_date': '20150422',
},
'skip': '404 not found',
},
{
'url': 'http://www.br.de/nachrichten/oberbayern/inhalt/muenchner-polizeipraesident-schreiber-gestorben-100.html',
'md5': 'af3a3a4aa43ff0ce6a89504c67f427ef',
'info_dict': {
'id': 'a4b83e34-123d-4b81-9f4e-c0d3121a4e05',
'ext': 'flv',
'title': 'Manfred Schreiber ist tot',
'description': 'md5:b454d867f2a9fc524ebe88c3f5092d97',
'duration': 26,
},
'skip': '404 not found',
},
{
'url': 'https://www.br-klassik.de/audio/peeping-tom-premierenkritik-dance-festival-muenchen-100.html',
'md5': '8b5b27c0b090f3b35eac4ab3f7a73d3d',
'info_dict': {
'id': '74c603c9-26d3-48bb-b85b-079aeed66e0b',
'ext': 'aac',
'title': 'Kurzweilig und sehr bewegend',
'description': 'md5:0351996e3283d64adeb38ede91fac54e',
'duration': 296,
},
'skip': '404 not found',
},
{
'url': 'http://www.br.de/radio/bayern1/service/team/videos/team-video-erdelt100.html',
'md5': 'dbab0aef2e047060ea7a21fc1ce1078a',
'info_dict': {
'id': '6ba73750-d405-45d3-861d-1ce8c524e059',
'ext': 'mp4',
'title': 'Umweltbewusster Häuslebauer',
'description': 'md5:d52dae9792d00226348c1dbb13c9bae2',
'duration': 116,
}
},
{
'url': 'http://www.br.de/fernsehen/br-alpha/sendungen/kant-fuer-anfaenger/kritik-der-reinen-vernunft/kant-kritik-01-metaphysik100.html',
'md5': '23bca295f1650d698f94fc570977dae3',
'info_dict': {
'id': 'd982c9ce-8648-4753-b358-98abb8aec43d',
'ext': 'mp4',
'title': 'Folge 1 - Metaphysik',
'description': 'md5:bb659990e9e59905c3d41e369db1fbe3',
'duration': 893,
'uploader': 'Eva Maria Steimle',
'upload_date': '20170208',
}
},
]
def _real_extract(self, url):
    """Extract the first media entry described by the page's player data XML.

    The XML location is scraped from the ``BRavFramework`` setup call on the
    page. A warning is reported when several entries exist.

    Raises ExtractorError when the XML has no <video> or <audio> element.
    """
    base_url, display_id = self._match_valid_url(url).groups()
    page = self._download_webpage(url, display_id)
    xml_url = self._search_regex(
        r"return BRavFramework\.register\(BRavFramework\('avPlayer_(?:[a-f0-9-]{36})'\)\.setup\({dataURL:'(/(?:[a-z0-9\-]+/)+[a-z0-9/~_.-]+)'}\)\);", page, 'XMLURL')
    xml = self._download_xml(base_url + xml_url, display_id)

    medias = []
    media_nodes = xml.findall('video') + xml.findall('audio')
    for node in media_nodes:
        media_id = node.get('externalId')
        media = {
            'id': media_id,
            'title': xpath_text(node, 'title', 'title', True),
            'duration': parse_duration(xpath_text(node, 'duration')),
            'formats': self._extract_formats(xpath_element(
                node, 'assets'), media_id),
            'thumbnails': self._extract_thumbnails(xpath_element(
                node, 'teaserImage/variants'), base_url),
            'description': xpath_text(node, 'desc'),
            'webpage_url': xpath_text(node, 'permalink'),
            'uploader': xpath_text(node, 'author'),
        }
        broadcast_date = xpath_text(node, 'broadcastDate')
        if broadcast_date:
            # Dot-separated parts are reversed and joined to form upload_date.
            date_parts = broadcast_date.split('.')
            media['upload_date'] = ''.join(reversed(date_parts))
        medias.append(media)

    if not medias:
        raise ExtractorError('No media entries found')
    if len(medias) > 1:
        self.report_warning(
            'found multiple medias; please '
            'report this with the video URL to http://yt-dl.org/bug')
    return medias[0]
def _extract_formats(self, assets, media_id):
    """Build the sorted format list from an ``assets`` XML element.

    ``assets`` is the element whose ``asset`` children each describe one
    rendition; ``media_id`` is used for download notes. Assets whose ``type``
    starts with ``HDS``/``HLS`` are expanded through their manifests, every
    other asset yields a direct HTTP format and/or an RTMP format.

    Returns a (possibly empty) list of format dicts.
    """
    formats = []
    # NOTE(review): the caller passes the result of xpath_element(), which
    # presumably is None when the media entry has no <assets> child
    asset_nodes = assets.findall('asset') if assets is not None else []
    for asset in asset_nodes:
        format_url = xpath_text(asset, ['downloadUrl', 'url'])
        asset_type = asset.get('type')
        # A missing type attribute must not crash the startswith() checks
        kind = asset_type or ''
        if kind.startswith('HDS'):
            if not format_url:
                continue
            formats.extend(self._extract_f4m_formats(
                format_url + '?hdcore=3.2.0', media_id, f4m_id='hds', fatal=False))
        elif kind.startswith('HLS'):
            if not format_url:
                continue
            # Tagged 'hls' so these formats are not confused with the HDS ones
            formats.extend(self._extract_m3u8_formats(
                format_url, media_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
        else:
            format_info = {
                'ext': xpath_text(asset, 'mediaType'),
                'width': int_or_none(xpath_text(asset, 'frameWidth')),
                'height': int_or_none(xpath_text(asset, 'frameHeight')),
                'tbr': int_or_none(xpath_text(asset, 'bitrateVideo')),
                'abr': int_or_none(xpath_text(asset, 'bitrateAudio')),
                'vcodec': xpath_text(asset, 'codecVideo'),
                'acodec': xpath_text(asset, 'codecAudio'),
                'container': xpath_text(asset, 'mediaType'),
                'filesize': int_or_none(xpath_text(asset, 'size')),
            }

            format_url = self._proto_relative_url(format_url)
            if format_url:
                http_format_info = format_info.copy()
                http_format_info.update({
                    'url': format_url,
                    'format_id': 'http-%s' % asset_type if asset_type else 'http',
                })
                formats.append(http_format_info)

            server_prefix = xpath_text(asset, 'serverPrefix')
            if server_prefix:
                rtmp_format_info = format_info.copy()
                rtmp_format_info.update({
                    'url': server_prefix,
                    'play_path': xpath_text(asset, 'fileName'),
                    'format_id': 'rtmp-%s' % asset_type if asset_type else 'rtmp',
                })
                formats.append(rtmp_format_info)
    self._sort_formats(formats)
    return formats
def _extract_thumbnails(self, variants, base_url):
    """Build the thumbnail list from a ``variants`` XML element.

    Each ``variant`` child that carries a ``url`` becomes one thumbnail whose
    URL is ``base_url`` plus that value. The result is ordered by pixel area,
    largest first; a missing width or height counts as 0 for ordering.

    Returns an empty list when ``variants`` is None.
    """
    if variants is None:
        return []

    thumbnails = []
    for variant in variants.findall('variant'):
        path = xpath_text(variant, 'url')
        if not path:
            continue
        thumbnails.append({
            'url': base_url + path,
            'width': int_or_none(xpath_text(variant, 'width')),
            'height': int_or_none(xpath_text(variant, 'height')),
        })
    # int_or_none() may have produced None; never multiply None
    thumbnails.sort(
        key=lambda t: (t['width'] or 0) * (t['height'] or 0), reverse=True)
    return thumbnails
class BRMediathekIE(InfoExtractor):
    """Extractor for clips on br.de/mediathek, queried through a GraphQL endpoint."""

    IE_DESC = 'Bayerischer Rundfunk Mediathek'
    _VALID_URL = r'https?://(?:www\.)?br\.de/mediathek/video/[^/?&#]*?-(?P<id>av:[0-9a-f]{24})'
    _TESTS = [{
        'url': 'https://www.br.de/mediathek/video/gesundheit-die-sendung-vom-28112017-av:5a1e6a6e8fce6d001871cc8e',
        'md5': 'fdc3d485835966d1622587d08ba632ec',
        'info_dict': {
            'id': 'av:5a1e6a6e8fce6d001871cc8e',
            'ext': 'mp4',
            'title': 'Die Sendung vom 28.11.2017',
            'description': 'md5:6000cdca5912ab2277e5b7339f201ccc',
            'timestamp': 1511942766,
            'upload_date': '20171129',
        }
    }]

    @staticmethod
    def _edge_nodes(container, key):
        """Yield each non-empty ``node`` of ``container[key]['edges']``.

        Any level of the connection may be absent or JSON ``null``; such
        levels are treated as empty instead of raising.
        """
        connection = (container or {}).get(key) or {}
        for edge in connection.get('edges') or []:
            node = (edge or {}).get('node')
            if node:
                yield node

    def _real_extract(self, url):
        """Query the clip by id and map its files, captions and images to an info dict.

        Raises ExtractorError when the response carries no clip object.
        """
        clip_id = self._match_id(url)

        response = self._download_json(
            'https://proxy-base.master.mango.express/graphql',
            clip_id, data=json.dumps({
                "query": """{
viewer {
clip(id: "%s") {
title
description
duration
createdAt
ageRestriction
videoFiles {
edges {
node {
publicLocation
fileSize
videoProfile {
width
height
bitrate
encoding
}
}
}
}
captionFiles {
edges {
node {
publicLocation
}
}
}
teaserImages {
edges {
node {
imageFiles {
edges {
node {
publicLocation
width
height
}
}
}
}
}
}
}
}
}""" % clip_id}).encode(), headers={
                'Content-Type': 'application/json',
            })
        # 'data', 'viewer' or 'clip' may be null in the response
        clip = (((response or {}).get('data') or {}).get('viewer') or {}).get('clip')
        if not clip:
            raise ExtractorError('Unable to find clip data for %s' % clip_id)
        title = clip['title']

        formats = []
        for node in self._edge_nodes(clip, 'videoFiles'):
            n_url = node.get('publicLocation')
            if not n_url:
                continue
            ext = determine_ext(n_url)
            if ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    n_url, clip_id, 'mp4', 'm3u8_native',
                    m3u8_id='hls', fatal=False))
            else:
                video_profile = node.get('videoProfile') or {}
                tbr = int_or_none(video_profile.get('bitrate'))
                format_id = 'http'
                if tbr:
                    format_id += '-%d' % tbr
                formats.append({
                    'format_id': format_id,
                    'url': n_url,
                    'width': int_or_none(video_profile.get('width')),
                    'height': int_or_none(video_profile.get('height')),
                    'tbr': tbr,
                    'filesize': int_or_none(node.get('fileSize')),
                })
        self._sort_formats(formats)

        subtitles = {}
        for node in self._edge_nodes(clip, 'captionFiles'):
            n_url = node.get('publicLocation')
            if not n_url:
                continue
            # All caption files are filed under German
            subtitles.setdefault('de', []).append({
                'url': n_url,
            })

        thumbnails = []
        for teaser in self._edge_nodes(clip, 'teaserImages'):
            for node in self._edge_nodes(teaser, 'imageFiles'):
                n_url = node.get('publicLocation')
                if not n_url:
                    continue
                thumbnails.append({
                    'url': n_url,
                    'width': int_or_none(node.get('width')),
                    'height': int_or_none(node.get('height')),
                })

        return {
            'id': clip_id,
            'title': title,
            'description': clip.get('description'),
            'duration': int_or_none(clip.get('duration')),
            'timestamp': parse_iso8601(clip.get('createdAt')),
            'age_limit': int_or_none(clip.get('ageRestriction')),
            'formats': formats,
            'subtitles': subtitles,
            'thumbnails': thumbnails,
        }

View File

@@ -0,0 +1,120 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .adobepass import AdobePassIE
from ..utils import (
smuggle_url,
update_url_query,
int_or_none,
float_or_none,
try_get,
dict_get,
)
class BravoTVIE(AdobePassIE):
    """Extractor for bravotv.com / oxygen.com pages; hands playback off to ThePlatform."""

    _VALID_URL = r'https?://(?:www\.)?(?P<req_id>bravotv|oxygen)\.com/(?:[^/]+/)+(?P<id>[^/?#]+)'
    _TESTS = [{
        'url': 'https://www.bravotv.com/top-chef/season-16/episode-15/videos/the-top-chef-season-16-winner-is',
        'md5': 'e34684cfea2a96cd2ee1ef3a60909de9',
        'info_dict': {
            'id': 'epL0pmK1kQlT',
            'ext': 'mp4',
            'title': 'The Top Chef Season 16 Winner Is...',
            'description': 'Find out who takes the title of Top Chef!',
            'uploader': 'NBCU-BRAV',
            'upload_date': '20190314',
            'timestamp': 1552591860,
            'season_number': 16,
            'episode_number': 15,
            'series': 'Top Chef',
            'episode': 'The Top Chef Season 16 Winner Is...',
            'duration': 190.0,
        }
    }, {
        'url': 'http://www.bravotv.com/below-deck/season-3/ep-14-reunion-part-1',
        'only_matching': True,
    }, {
        'url': 'https://www.oxygen.com/in-ice-cold-blood/season-2/episode-16/videos/handling-the-horwitz-house-after-the-murder-season-2',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Resolve the page to a ThePlatform link URL and return a url_transparent result.

        The Drupal settings JSON embedded in the page decides between the
        ``ls_tve`` path (optionally Adobe Pass authenticated) and the shared
        playlist path. Metadata from the ThePlatform preview is merged on top
        of the page metadata, but only for fields the preview actually has.
        """
        site, display_id = self._match_valid_url(url).groups()
        webpage = self._download_webpage(url, display_id)
        settings = self._parse_json(self._search_regex(
            r'<script[^>]+data-drupal-selector="drupal-settings-json"[^>]*>({.+?})</script>', webpage, 'drupal settings'),
            display_id)
        info = {}
        query = {
            'mbr': 'true',
        }
        account_pid, release_pid = [None] * 2
        tve = settings.get('ls_tve')
        if tve:
            query['manifest'] = 'm3u'
            mobj = re.search(r'<[^>]+id="pdk-player"[^>]+data-url=["\']?(?:https?:)?//player\.theplatform\.com/p/([^/]+)/(?:[^/]+/)*select/([^?#&"\']+)', webpage)
            if mobj:
                account_pid, tp_path = mobj.groups()
                release_pid = tp_path.strip('/').split('/')[-1]
            else:
                # No player markup in the page: fall back to the fixed account
                account_pid = 'HNK2IC'
                tp_path = release_pid = tve['release_pid']
            if tve.get('entitlement') == 'auth':
                # The key may be present but null
                adobe_pass = settings.get('tve_adobe_auth') or {}
                if site == 'bravotv':
                    site = 'bravo'
                resource = self._get_mvpd_resource(
                    adobe_pass.get('adobePassResourceId') or site,
                    tve['title'], release_pid, tve.get('rating'))
                query['auth'] = self._extract_mvpd_auth(
                    url, release_pid,
                    adobe_pass.get('adobePassRequestorId') or site, resource)
        else:
            shared_playlist = settings['ls_playlist']
            account_pid = shared_playlist['account_pid']
            metadata = shared_playlist['video_metadata'][shared_playlist['default_clip']]
            tp_path = release_pid = metadata.get('release_pid')
            if not release_pid:
                release_pid = metadata['guid']
                tp_path = 'media/guid/2140479951/' + release_pid
            info.update({
                'title': metadata['title'],
                'description': metadata.get('description'),
                'season_number': int_or_none(metadata.get('season_num')),
                'episode_number': int_or_none(metadata.get('episode_num')),
            })
            query['switch'] = 'progressive'

        tp_url = 'http://link.theplatform.com/s/%s/%s' % (account_pid, tp_path)

        tp_metadata = self._download_json(
            update_url_query(tp_url, {'format': 'preview'}),
            display_id, fatal=False)
        if tp_metadata:
            preview_info = {
                'title': tp_metadata.get('title'),
                'description': tp_metadata.get('description'),
                'duration': float_or_none(tp_metadata.get('duration'), 1000),
                'season_number': int_or_none(
                    dict_get(tp_metadata, ('pl1$seasonNumber', 'nbcu$seasonNumber'))),
                'episode_number': int_or_none(
                    dict_get(tp_metadata, ('pl1$episodeNumber', 'nbcu$episodeNumber'))),
                # For some reason the series is sometimes wrapped into a single element array.
                'series': try_get(
                    dict_get(tp_metadata, ('pl1$show', 'nbcu$show')),
                    lambda x: x[0] if isinstance(x, list) else x,
                    expected_type=str),
                'episode': dict_get(
                    tp_metadata, ('pl1$episodeName', 'nbcu$episodeName', 'title')),
            }
            # Do not let a field missing from the preview wipe out the value
            # already taken from the page metadata
            info.update({
                key: value for key, value in preview_info.items()
                if value is not None})

        info.update({
            '_type': 'url_transparent',
            'id': release_pid,
            'url': smuggle_url(update_url_query(tp_url, query), {'force_smil_url': True}),
            'ie_key': 'ThePlatform',
        })
        return info

View File

@@ -0,0 +1,90 @@
from __future__ import unicode_literals
from .common import InfoExtractor
from .youtube import YoutubeIE
from ..utils import (
int_or_none,
url_or_none,
)
class BreakIE(InfoExtractor):
    """Extractor for break.com video pages; pages embedding YouTube are delegated."""

    _VALID_URL = r'https?://(?:www\.)?break\.com/video/(?P<display_id>[^/]+?)(?:-(?P<id>\d+))?(?:[/?#&]|$)'
    _TESTS = [{
        'url': 'http://www.break.com/video/when-girls-act-like-guys-2468056',
        'info_dict': {
            'id': '2468056',
            'ext': 'mp4',
            'title': 'When Girls Act Like D-Bags',
            'age_limit': 13,
        },
    }, {
        # youtube embed
        'url': 'http://www.break.com/video/someone-forgot-boat-brakes-work',
        'info_dict': {
            'id': 'RrrDLdeL2HQ',
            'ext': 'mp4',
            'title': 'Whale Watching Boat Crashing Into San Diego Dock',
            'description': 'md5:afc1b2772f0a8468be51dd80eb021069',
            'upload_date': '20160331',
            'uploader': 'Steve Holden',
            'uploader_id': 'sdholden07',
        },
        'params': {
            'skip_download': True,
        }
    }, {
        'url': 'http://www.break.com/video/ugc/baby-flex-2773063',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return a YouTube url result if the page embeds one, else the page's own formats."""
        display_id, video_id = self._match_valid_url(url).groups()
        page = self._download_webpage(url, display_id)

        embedded_youtube = YoutubeIE._extract_url(page)
        if embedded_youtube:
            return self.url_result(embedded_youtube, ie=YoutubeIE.ie_key())

        raw_sources = self._search_regex(
            r'(?s)content["\']\s*:\s*(\[.+?\])\s*[,\n]', page,
            'content')
        sources = self._parse_json(raw_sources, display_id)

        formats = []
        for source in sources:
            source_url = url_or_none(source.get('url'))
            if source_url:
                # The bitrate, when present, is encoded in the file URL
                tbr = int_or_none(self._search_regex(
                    r'(\d+)_kbps', source_url, 'tbr', default=None))
                if tbr:
                    format_id = 'http-%d' % tbr
                else:
                    format_id = 'http'
                formats.append({
                    'url': source_url,
                    'format_id': format_id,
                    'tbr': tbr,
                })
        self._sort_formats(formats)

        title = self._search_regex(
            (r'title["\']\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1',
             r'<h1[^>]*>(?P<value>[^<]+)'), page, 'title', group='value')

        def numeric_field(key, name):
            """Return the quoted integer stored under *key* in the page, or None."""
            found = self._search_regex(
                r'%s["\']\s*:\s*["\'](\d+)' % key, page, name,
                default=None)
            return int_or_none(found)

        age_limit = numeric_field('ratings', 'age limit')
        # Prefer the id from the URL, then the page's pid, then the slug
        if not video_id:
            video_id = numeric_field('pid', 'video id') or display_id

        return {
            'id': video_id,
            'display_id': display_id,
            'title': title,
            'thumbnail': self._og_search_thumbnail(page),
            'age_limit': age_limit,
            'formats': formats,
        }

View File

@@ -0,0 +1,39 @@
from __future__ import unicode_literals
from .common import InfoExtractor
class BreitBartIE(InfoExtractor):
    """Extractor for breitbart.com video pages backed by a JW Player manifest."""

    # The dot of "breitbart.com" is escaped, "www." is optional, and the id
    # stops at '/', '?' or '#' so that a URL without a trailing slash does not
    # drag its query string into the video id.
    _VALID_URL = r'https?://(?:www\.)?breitbart\.com/videos/v/(?P<id>[^/?#]+)'
    _TESTS = [{
        'url': 'https://www.breitbart.com/videos/v/5cOz1yup/?pl=Ij6NDOji',
        'md5': '0aa6d1d6e183ac5ca09207fe49f17ade',
        'info_dict': {
            'id': '5cOz1yup',
            'ext': 'mp4',
            'title': 'Watch \u2013 Clyburn: Statues in Congress Have to Go Because they Are Honoring Slavery',
            'description': 'md5:bac35eb0256d1cb17f517f54c79404d5',
            'thumbnail': 'https://cdn.jwplayer.com/thumbs/5cOz1yup-1920.jpg',
            'age_limit': 0,
        }
    }, {
        'url': 'https://www.breitbart.com/videos/v/eaiZjVOn/',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Download the page for metadata and the cdn.jwplayer.com manifest for formats."""
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)
        formats = self._extract_m3u8_formats(f'https://cdn.jwplayer.com/manifests/{video_id}.m3u8', video_id, ext='mp4')
        self._sort_formats(formats)
        return {
            'id': video_id,
            # Fall back to the <title> tag when there is no og:title
            'title': self._og_search_title(
                webpage, default=None) or self._html_search_regex(
                r'(?s)<title>(.*?)</title>', webpage, 'video title'),
            'description': self._og_search_description(webpage),
            'thumbnail': self._og_search_thumbnail(webpage),
            'age_limit': self._rta_search(webpage),
            'formats': formats
        }

View File

@@ -0,0 +1,688 @@
# coding: utf-8
from __future__ import unicode_literals
import base64
import re
import struct
from .adobepass import AdobePassIE
from .common import InfoExtractor
from ..compat import (
compat_etree_fromstring,
compat_HTTPError,
compat_parse_qs,
compat_urlparse,
compat_xml_parse_error,
)
from ..utils import (
clean_html,
dict_get,
extract_attributes,
ExtractorError,
find_xpath_attr,
fix_xml_ampersands,
float_or_none,
int_or_none,
js_to_json,
mimetype2ext,
parse_iso8601,
parse_qs,
smuggle_url,
str_or_none,
try_get,
unescapeHTML,
unsmuggle_url,
UnsupportedError,
update_url_query,
url_or_none,
)
class BrightcoveLegacyIE(InfoExtractor):
    """Extractor for legacy Brightcove player URLs.

    It does not download anything itself: it works out the publisher
    (account) id and redirects to the equivalent players.brightcove.net URL
    handled by BrightcoveNewIE.
    """

    IE_NAME = 'brightcove:legacy'
    _VALID_URL = r'(?:https?://.*brightcove\.com/(services|viewer).*?\?|brightcove:)(?P<query>.*)'
    _TESTS = [
        {
            # From http://www.8tv.cat/8aldia/videos/xavier-sala-i-martin-aquesta-tarda-a-8-al-dia/
            'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1654948606001&flashID=myExperience&%40videoPlayer=2371591881001',
            'md5': '5423e113865d26e40624dce2e4b45d95',
            'note': 'Test Brightcove downloads and detection in GenericIE',
            'info_dict': {
                'id': '2371591881001',
                'ext': 'mp4',
                'title': 'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”',
                'uploader': '8TV',
                'description': 'md5:a950cc4285c43e44d763d036710cd9cd',
                'timestamp': 1368213670,
                'upload_date': '20130510',
                'uploader_id': '1589608506001',
            },
            'skip': 'The player has been deactivated by the content owner',
        },
        {
            # From http://medianetwork.oracle.com/video/player/1785452137001
            'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1217746023001&flashID=myPlayer&%40videoPlayer=1785452137001',
            'info_dict': {
                'id': '1785452137001',
                'ext': 'flv',
                'title': 'JVMLS 2012: Arrays 2.0 - Opportunities and Challenges',
                'description': 'John Rose speaks at the JVM Language Summit, August 1, 2012.',
                'uploader': 'Oracle',
                'timestamp': 1344975024,
                'upload_date': '20120814',
                'uploader_id': '1460825906',
            },
            'skip': 'video not playable',
        },
        {
            # From http://mashable.com/2013/10/26/thermoelectric-bracelet-lets-you-control-your-body-temperature/
            'url': 'http://c.brightcove.com/services/viewer/federated_f9?&playerID=1265504713001&publisherID=AQ%7E%7E%2CAAABBzUwv1E%7E%2CxP-xFHVUstiMFlNYfvF4G9yFnNaqCw_9&videoID=2750934548001',
            'info_dict': {
                'id': '2750934548001',
                'ext': 'mp4',
                'title': 'This Bracelet Acts as a Personal Thermostat',
                'description': 'md5:547b78c64f4112766ccf4e151c20b6a0',
                # 'uploader': 'Mashable',
                'timestamp': 1382041798,
                'upload_date': '20131017',
                'uploader_id': '1130468786001',
            },
        },
        {
            # test that the default referer works
            # from http://national.ballet.ca/interact/video/Lost_in_Motion_II/
            'url': 'http://link.brightcove.com/services/player/bcpid756015033001?bckey=AQ~~,AAAApYJi_Ck~,GxhXCegT1Dp39ilhXuxMJxasUhVNZiil&bctid=2878862109001',
            'info_dict': {
                'id': '2878862109001',
                'ext': 'mp4',
                'title': 'Lost in Motion II',
                'description': 'md5:363109c02998fee92ec02211bd8000df',
                'uploader': 'National Ballet of Canada',
            },
            'skip': 'Video gone',
        },
        {
            # test flv videos served by akamaihd.net
            # From http://www.redbull.com/en/bike/stories/1331655643987/replay-uci-dh-world-cup-2014-from-fort-william
            'url': 'http://c.brightcove.com/services/viewer/htmlFederated?%40videoPlayer=ref%3Aevent-stream-356&linkBaseURL=http%3A%2F%2Fwww.redbull.com%2Fen%2Fbike%2Fvideos%2F1331655630249%2Freplay-uci-fort-william-2014-dh&playerKey=AQ%7E%7E%2CAAAApYJ7UqE%7E%2Cxqr_zXk0I-zzNndy8NlHogrCb5QdyZRf&playerID=1398061561001#__youtubedl_smuggle=%7B%22Referer%22%3A+%22http%3A%2F%2Fwww.redbull.com%2Fen%2Fbike%2Fstories%2F1331655643987%2Freplay-uci-dh-world-cup-2014-from-fort-william%22%7D',
            # The md5 checksum changes on each download
            'info_dict': {
                'id': '3750436379001',
                'ext': 'flv',
                'title': 'UCI MTB World Cup 2014: Fort William, UK - Downhill Finals',
                'uploader': 'RBTV Old (do not use)',
                'description': 'UCI MTB World Cup 2014: Fort William, UK - Downhill Finals',
                'timestamp': 1409122195,
                'upload_date': '20140827',
                'uploader_id': '710858724001',
            },
            'skip': 'Video gone',
        },
        {
            # playlist with 'videoList'
            # from http://support.brightcove.com/en/video-cloud/docs/playlist-support-single-video-players
            'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=3550052898001&playerKey=AQ%7E%7E%2CAAABmA9XpXk%7E%2C-Kp7jNgisre1fG5OdqpAFUTcs0lP_ZoL',
            'info_dict': {
                'title': 'Sealife',
                'id': '3550319591001',
            },
            'playlist_mincount': 7,
            'skip': 'Unsupported URL',
        },
        {
            # playlist with 'playlistTab' (https://github.com/ytdl-org/youtube-dl/issues/9965)
            'url': 'http://c.brightcove.com/services/json/experience/runtime/?command=get_programming_for_experience&playerKey=AQ%7E%7E,AAABXlLMdok%7E,NJ4EoMlZ4rZdx9eU1rkMVd8EaYPBBUlg',
            'info_dict': {
                'id': '1522758701001',
                'title': 'Lesson 08',
            },
            'playlist_mincount': 10,
            'skip': 'Unsupported URL',
        },
        {
            # playerID inferred from bcpid
            # from http://www.un.org/chinese/News/story.asp?NewsID=27724
            'url': 'https://link.brightcove.com/services/player/bcpid1722935254001/?bctid=5360463607001&autoStart=false&secureConnections=true&width=650&height=350',
            'only_matching': True,  # Tested in GenericIE
        }
    ]

    @classmethod
    def _build_brightcove_url(cls, object_str):
        """
        Build a Brightcove url from a xml string containing
        <object class="BrightcoveExperience">{params}</object>

        Returns None when the markup cannot be parsed or the video id has an
        unexpected shape; raises ExtractorError when no player id is present.
        """

        # Fix up some stupid HTML, see https://github.com/ytdl-org/youtube-dl/issues/1553
        object_str = re.sub(r'(<param(?:\s+[a-zA-Z0-9_]+="[^"]*")*)>',
                            lambda m: m.group(1) + '/>', object_str)
        # Fix up some stupid XML, see https://github.com/ytdl-org/youtube-dl/issues/1608
        object_str = object_str.replace('<--', '<!--')
        # remove namespace to simplify extraction
        object_str = re.sub(r'(<object[^>]*)(xmlns=".*?")', r'\1', object_str)
        object_str = fix_xml_ampersands(object_str)

        try:
            object_doc = compat_etree_fromstring(object_str.encode('utf-8'))
        except compat_xml_parse_error:
            return

        fv_el = find_xpath_attr(object_doc, './param', 'name', 'flashVars')
        if fv_el is not None:
            # Keep only the first value of each flashVars parameter
            flashvars = {
                k: v[0]
                for k, v in compat_parse_qs(fv_el.attrib['value']).items()}
        else:
            flashvars = {}

        data_url = object_doc.attrib.get('data', '')
        data_url_params = parse_qs(data_url)

        def find_param(name):
            """Look *name* up in flashVars, then <param> children, then the data URL query."""
            if name in flashvars:
                return flashvars[name]
            node = find_xpath_attr(object_doc, './param', 'name', name)
            if node is not None:
                return node.attrib['value']
            return data_url_params.get(name)

        params = {}

        playerID = find_param('playerID') or find_param('playerId')
        if playerID is None:
            raise ExtractorError('Cannot find player ID')
        params['playerID'] = playerID

        playerKey = find_param('playerKey')
        # Not all pages define this value
        if playerKey is not None:
            params['playerKey'] = playerKey
        # These fields hold the id of the video
        videoPlayer = find_param('@videoPlayer') or find_param('videoId') or find_param('videoID') or find_param('@videoList')
        if videoPlayer is not None:
            if isinstance(videoPlayer, list):
                videoPlayer = videoPlayer[0]
            videoPlayer = videoPlayer.strip()
            # UUID is also possible for videoPlayer (e.g.
            # http://www.popcornflix.com/hoodies-vs-hooligans/7f2d2b87-bbf2-4623-acfb-ea942b4f01dd
            # or http://www8.hp.com/cn/zh/home.html)
            if not (re.match(
                    r'^(?:\d+|[\da-fA-F]{8}-?[\da-fA-F]{4}-?[\da-fA-F]{4}-?[\da-fA-F]{4}-?[\da-fA-F]{12})$',
                    videoPlayer) or videoPlayer.startswith('ref:')):
                return None
            params['@videoPlayer'] = videoPlayer
        linkBase = find_param('linkBaseURL')
        if linkBase is not None:
            params['linkBaseURL'] = linkBase
        return cls._make_brightcove_url(params)

    @classmethod
    def _build_brightcove_url_from_js(cls, object_js):
        """Build a Brightcove url from a ``customBC.createVideo(...)`` call, or return None."""
        # The layout of JS is as follows:
        # customBC.createVideo = function (width, height, playerID, playerKey, videoPlayer, VideoRandomID) {
        #   // build Brightcove <object /> XML
        # }
        m = re.search(
            r'''(?x)customBC\.createVideo\(
.*? # skipping width and height
["\'](?P<playerID>\d+)["\']\s*,\s* # playerID
["\'](?P<playerKey>AQ[^"\']{48})[^"\']*["\']\s*,\s* # playerKey begins with AQ and is 50 characters
# in length, however it's appended to itself
# in places, so truncate
["\'](?P<videoID>\d+)["\'] # @videoPlayer
''', object_js)
        if m:
            return cls._make_brightcove_url(m.groupdict())

    @classmethod
    def _make_brightcove_url(cls, params):
        """Return the htmlFederated viewer URL carrying *params* as its query."""
        return update_url_query(
            'http://c.brightcove.com/services/viewer/htmlFederated', params)

    @classmethod
    def _extract_brightcove_url(cls, webpage):
        """Try to extract the brightcove url from the webpage, returns None
        if it can't be found
        """
        urls = cls._extract_brightcove_urls(webpage)
        return urls[0] if urls else None

    @classmethod
    def _extract_brightcove_urls(cls, webpage):
        """Return a list of all Brightcove URLs from the webpage """

        # The host dot is escaped so that only brightcove.com itself matches
        url_m = re.search(
            r'''(?x)
<meta\s+
(?:property|itemprop)=([\'"])(?:og:video|embedURL)\1[^>]+
content=([\'"])(?P<url>https?://(?:secure|c)\.brightcove\.com/(?:(?!\2).)+)\2
''', webpage)
        if url_m:
            url = unescapeHTML(url_m.group('url'))
            # Some sites don't add it, we can't download with this url, for example:
            # http://www.ktvu.com/videos/news/raw-video-caltrain-releases-video-of-man-almost/vCTZdY/
            if 'playerKey' in url or 'videoId' in url or 'idVideo' in url:
                return [url]

        matches = re.findall(
            r'''(?sx)<object
(?:
[^>]+?class=[\'"][^>]*?BrightcoveExperience.*?[\'"] |
[^>]*?>\s*<param\s+name="movie"\s+value="https?://[^/]*brightcove\.com/
).+?>\s*</object>''',
            webpage)
        if matches:
            return list(filter(None, [cls._build_brightcove_url(m) for m in matches]))

        matches = re.findall(r'(customBC\.createVideo\(.+?\);)', webpage)
        if matches:
            return list(filter(None, [
                cls._build_brightcove_url_from_js(custom_bc)
                for custom_bc in matches]))
        return [src for _, src in re.findall(
            r'<iframe[^>]+src=([\'"])((?:https?:)?//link\.brightcove\.com/services/player/(?!\1).+)\1', webpage)]

    def _real_extract(self, url):
        """Redirect a legacy URL to the matching players.brightcove.net URL.

        The publisher id is taken from a numeric ``publisherId`` query value,
        or decoded from the second comma-separated field of the player key
        (taken from the query, or scraped from the ``bcpid`` player page).

        Raises UnsupportedError when no video id or publisher id can be found.
        """
        url, smuggled_data = unsmuggle_url(url, {})

        # Change the 'videoId' and others field to '@videoPlayer'
        url = re.sub(r'(?<=[?&])(videoI(d|D)|idVideo|bctid)', '%40videoPlayer', url)
        # Change bckey (used by bcove.me urls) to playerKey
        url = re.sub(r'(?<=[?&])bckey', 'playerKey', url)

        mobj = self._match_valid_url(url)
        query_str = mobj.group('query')
        query = compat_urlparse.parse_qs(query_str)

        videoPlayer = query.get('@videoPlayer')
        if videoPlayer:
            # We set the original url as the default 'Referer' header
            referer = query.get('linkBaseURL', [None])[0] or smuggled_data.get('Referer', url)
            video_id = videoPlayer[0]
            if 'playerID' not in query:
                mobj = re.search(r'/bcpid(\d+)', url)
                if mobj is not None:
                    query['playerID'] = [mobj.group(1)]

            # Query values are lists; anything but a numeric first value is
            # discarded so the raw list can never end up in the new URL
            publisher_values = query.get('publisherId')
            if publisher_values and publisher_values[0].isdigit():
                publisher_id = publisher_values[0]
            else:
                publisher_id = None

            if not publisher_id:
                # player_key is either a str containing a comma or None,
                # never the raw query list
                player_key = None
                key_values = query.get('playerKey')
                if key_values and ',' in key_values[0]:
                    player_key = key_values[0]
                else:
                    player_id = query.get('playerID')
                    if player_id and player_id[0].isdigit():
                        headers = {}
                        if referer:
                            headers['Referer'] = referer
                        player_page = self._download_webpage(
                            'http://link.brightcove.com/services/player/bcpid' + player_id[0],
                            video_id, headers=headers, fatal=False)
                        if player_page:
                            player_key = self._search_regex(
                                r'<param\s+name="playerKey"\s+value="([\w~,-]+)"',
                                player_page, 'player key', fatal=False)
                if player_key and ',' in player_key:
                    enc_pub_id = player_key.split(',')[1].replace('~', '=')
                    try:
                        publisher_id = struct.unpack('>Q', base64.urlsafe_b64decode(enc_pub_id))[0]
                    except (ValueError, struct.error):
                        # Malformed key: fall through to UnsupportedError
                        publisher_id = None
            if publisher_id:
                brightcove_new_url = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' % (publisher_id, video_id)
                if referer:
                    brightcove_new_url = smuggle_url(brightcove_new_url, {'referrer': referer})
                return self.url_result(brightcove_new_url, BrightcoveNewIE.ie_key(), video_id)
        # TODO: figure out if it's possible to extract playlistId from playerKey
        # elif 'playerKey' in query:
        #     player_key = query['playerKey']
        #     return self._get_playlist_info(player_key[0])
        raise UnsupportedError(url)
class BrightcoveNewIE(AdobePassIE):
IE_NAME = 'brightcove:new'
_VALID_URL = r'https?://players\.brightcove\.net/(?P<account_id>\d+)/(?P<player_id>[^/]+)_(?P<embed>[^/]+)/index\.html\?.*(?P<content_type>video|playlist)Id=(?P<video_id>\d+|ref:[^&]+)'
_TESTS = [{
'url': 'http://players.brightcove.net/929656772001/e41d32dc-ec74-459e-a845-6c69f7b724ea_default/index.html?videoId=4463358922001',
'md5': 'c8100925723840d4b0d243f7025703be',
'info_dict': {
'id': '4463358922001',
'ext': 'mp4',
'title': 'Meet the man behind Popcorn Time',
'description': 'md5:eac376a4fe366edc70279bfb681aea16',
'duration': 165.768,
'timestamp': 1441391203,
'upload_date': '20150904',
'uploader_id': '929656772001',
'formats': 'mincount:20',
},
}, {
# with rtmp streams
'url': 'http://players.brightcove.net/4036320279001/5d112ed9-283f-485f-a7f9-33f42e8bc042_default/index.html?videoId=4279049078001',
'info_dict': {
'id': '4279049078001',
'ext': 'mp4',
'title': 'Titansgrave: Chapter 0',
'description': 'Titansgrave: Chapter 0',
'duration': 1242.058,
'timestamp': 1433556729,
'upload_date': '20150606',
'uploader_id': '4036320279001',
'formats': 'mincount:39',
},
'params': {
# m3u8 download
'skip_download': True,
}
}, {
# playlist stream
'url': 'https://players.brightcove.net/1752604059001/S13cJdUBz_default/index.html?playlistId=5718313430001',
'info_dict': {
'id': '5718313430001',
'title': 'No Audio Playlist',
},
'playlist_count': 7,
'params': {
# m3u8 download
'skip_download': True,
}
}, {
'url': 'http://players.brightcove.net/5690807595001/HyZNerRl7_default/index.html?playlistId=5743160747001',
'only_matching': True,
}, {
# ref: prefixed video id
'url': 'http://players.brightcove.net/3910869709001/21519b5c-4b3b-4363-accb-bdc8f358f823_default/index.html?videoId=ref:7069442',
'only_matching': True,
}, {
# non numeric ref: prefixed video id
'url': 'http://players.brightcove.net/710858724001/default_default/index.html?videoId=ref:event-stream-356',
'only_matching': True,
}, {
# unavailable video without message but with error_code
'url': 'http://players.brightcove.net/1305187701/c832abfb-641b-44eb-9da0-2fe76786505f_default/index.html?videoId=4377407326001',
'only_matching': True,
}]
@staticmethod
def _extract_url(ie, webpage):
    """Return the first Brightcove player URL found in *webpage*, or None."""
    candidates = BrightcoveNewIE._extract_urls(ie, webpage)
    if candidates:
        return candidates[0]
    return None
@staticmethod
def _extract_urls(ie, webpage):
    """Return every players.brightcove.net URL that *webpage* embeds.

    Iframe embeds are taken as they are. ``<video>``/``<video-js>`` tags are
    rebuilt into a player URL from their data attributes and, when present,
    the accompanying player script; tags without such a script are only kept
    if ``ie._is_valid_url`` accepts the rebuilt URL.
    """
    # Reference:
    # 1. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideoiniframe
    # 2. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#tag
    # 3. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideousingjavascript
    # 4. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/in-page-embed-player-implementation.html
    # 5. https://support.brightcove.com/en/video-cloud/docs/dynamically-assigning-videos-player

    # Iframe embeds [1]; protocol-relative sources get an explicit scheme
    iframe_sources = re.findall(
        r'<iframe[^>]+src=(["\'])((?:https?:)?//players\.brightcove\.net/\d+/[^/]+/index\.html.+?)\1', webpage)
    found = [
        src if src.startswith('http') else 'http:' + src
        for _, src in iframe_sources]

    # <video> tags [2] and embed_in_page embeds [3]
    video_embeds = re.findall(
        r'''(?isx)
(<video(?:-js)?\s+[^>]*\bdata-video-id\s*=\s*['"]?[^>]+>)
(?:.*?
(<script[^>]+
src=["\'](?:https?:)?//players\.brightcove\.net/
(\d+)/([^/]+)_([^/]+)/index(?:\.min)?\.js
)
)?
''', webpage)
    for video_tag, script_tag, account_id, player_id, embed in video_embeds:
        attrs = extract_attributes(video_tag)

        # According to examples from [4] it's unclear whether video id
        # may be optional and what to do when it is
        video_id = attrs.get('data-video-id')
        account_id = account_id or attrs.get('data-account')
        if not video_id or not account_id:
            continue

        player_id = player_id or attrs.get('data-player') or 'default'
        embed = embed or attrs.get('data-embed') or 'default'

        bc_url = 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' % (
            account_id, player_id, embed, video_id)

        # Some brightcove videos may be embedded with video tag only and
        # without script tag or any mentioning of brightcove at all. Such
        # embeds are considered ambiguous since they are matched based only
        # on data-video-id and data-account attributes and in the wild may
        # not be brightcove embeds at all. Let's check reconstructed
        # brightcove URLs in case of such embeds and only process valid
        # ones. By this we ensure there is indeed a brightcove embed.
        if script_tag or ie._is_valid_url(
                bc_url, video_id, 'possible brightcove video'):
            found.append(bc_url)

    return found
def _parse_brightcove_metadata(self, json_data, video_id, headers={}):
    """Convert a Brightcove playback API video object into an info dict.

    ``json_data`` is the decoded video object, ``video_id`` the id reported
    in the result and ``headers`` extra HTTP headers attached to every
    format (the default dict is only read, never modified).

    If no format is found and ``json_data`` carries ``errors``, the first
    error is reported through ``raise_no_formats``.
    """
    title = json_data['name'].strip()

    formats, subtitles = [], {}
    sources = json_data.get('sources') or []
    for source in sources:
        container = source.get('container')
        ext = mimetype2ext(source.get('type'))
        src = source.get('src')
        if ext == 'm3u8' or container == 'M2TS':
            if not src:
                continue
            fmts, subs = self._extract_m3u8_formats_and_subtitles(
                src, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)
            subtitles = self._merge_subtitles(subtitles, subs)
        elif ext == 'mpd':
            if not src:
                continue
            fmts, subs = self._extract_mpd_formats_and_subtitles(src, video_id, 'dash', fatal=False)
            subtitles = self._merge_subtitles(subtitles, subs)
        else:
            streaming_src = source.get('streaming_src')
            stream_name, app_name = source.get('stream_name'), source.get('app_name')
            if not src and not streaming_src and (not stream_name or not app_name):
                continue
            tbr = float_or_none(source.get('avg_bitrate'), 1000)
            height = int_or_none(source.get('height'))
            width = int_or_none(source.get('width'))
            f = {
                'tbr': tbr,
                'filesize': int_or_none(source.get('size')),
                'container': container,
                # A source may lack both a mime type and a container name
                'ext': ext or (container.lower() if container else None),
            }
            if width == 0 and height == 0:
                f.update({
                    'vcodec': 'none',
                })
            else:
                f.update({
                    'width': width,
                    'height': height,
                    'vcodec': source.get('codec'),
                })

            def build_format_id(kind):
                """Return *kind* suffixed with the bitrate and height when known."""
                format_id = kind
                if tbr:
                    format_id += '-%dk' % int(tbr)
                if height:
                    format_id += '-%dp' % height
                return format_id

            if src or streaming_src:
                f.update({
                    'url': src or streaming_src,
                    'format_id': build_format_id('http' if src else 'http-streaming'),
                    'source_preference': 0 if src else -1,
                })
            else:
                f.update({
                    'url': app_name,
                    'play_path': stream_name,
                    'format_id': build_format_id('rtmp'),
                })
            fmts = [f]

        # https://support.brightcove.com/playback-api-video-fields-reference#key_systems_object
        if container == 'WVM' or source.get('key_systems') or ext == 'ism':
            for f in fmts:
                f['has_drm'] = True
        formats.extend(fmts)

    if not formats:
        errors = json_data.get('errors')
        if errors:
            error = errors[0]
            self.raise_no_formats(
                error.get('message') or error.get('error_subcode') or error['error_code'], expected=True)

    self._sort_formats(formats)

    for f in formats:
        f.setdefault('http_headers', {}).update(headers)

    # 'text_tracks' may be present but null
    for text_track in json_data.get('text_tracks') or []:
        if text_track.get('kind') != 'captions':
            continue
        text_track_url = url_or_none(text_track.get('src'))
        if not text_track_url:
            continue
        lang = (str_or_none(text_track.get('srclang'))
                or str_or_none(text_track.get('label')) or 'en').lower()
        subtitles.setdefault(lang, []).append({
            'url': text_track_url,
        })

    # A non-positive duration is treated as a live stream
    is_live = False
    duration = float_or_none(json_data.get('duration'), 1000)
    if duration is not None and duration <= 0:
        is_live = True

    # NOTE(review): (480, 720) breaks the 16:9 pattern of every other entry;
    # possibly meant to be (480, 270). Left unchanged, confirm before editing.
    common_res = [(160, 90), (320, 180), (480, 720), (640, 360), (768, 432), (1024, 576), (1280, 720), (1366, 768), (1920, 1080)]
    thumb_base_url = dict_get(json_data, ('poster', 'thumbnail'))
    # Candidate thumbnails are produced by swapping the WxH part of the URL
    thumbnails = [{
        'url': re.sub(r'\d+x\d+', f'{w}x{h}', thumb_base_url),
        'width': w,
        'height': h,
    } for w, h in common_res] if thumb_base_url else None

    return {
        'id': video_id,
        'title': title,
        'description': clean_html(json_data.get('description')),
        'thumbnails': thumbnails,
        'duration': duration,
        'timestamp': parse_iso8601(json_data.get('published_at')),
        'uploader_id': json_data.get('account_id'),
        'formats': formats,
        'subtitles': subtitles,
        'tags': json_data.get('tags', []),
        'is_live': is_live,
    }
def _real_extract(self, url):
    """Query the Brightcove playback API for a video or playlist URL.

    A policy key is read from the downloader cache under
    ``<account_id>_<player_id>``.  When no key is cached, or the cached key
    is rejected with ``INVALID_POLICY_KEY``, a fresh one is scraped from the
    player's ``config.json`` / ``index.min.js`` and the request is retried
    once.  A ``TVE_AUTH`` error in the response triggers an MVPD
    authentication round-trip followed by a second API request carrying the
    obtained token.

    Returns a playlist result when the URL's content type is ``playlist``,
    otherwise the parsed metadata of the single video.
    """
    url, smuggled = unsmuggle_url(url, {})
    self._initialize_geo_bypass({
        'countries': smuggled.get('geo_countries'),
        'ip_blocks': smuggled.get('geo_ip_blocks'),
    })

    account_id, player_id, embed, content_type, video_id = self._match_valid_url(url).groups()

    policy_key_id = '%s_%s' % (account_id, player_id)
    policy_key = self._downloader.cache.load('brightcove', policy_key_id)
    policy_key_extracted = False

    def store_pk(value):
        # Persist (or clear, when value is None) the key for this player.
        return self._downloader.cache.store('brightcove', policy_key_id, value)

    def extract_policy_key():
        # Scrape the key from the player config, then from the player
        # script as a fallback; the result is cached before returning.
        player_root = 'http://players.brightcove.net/%s/%s_%s/' % (account_id, player_id, embed)
        config = self._download_json(
            player_root + 'config.json', video_id, fatal=False) or {}
        found_key = try_get(
            config, lambda x: x['video_cloud']['policy_key'])
        if not found_key:
            player_js = self._download_webpage(
                player_root + 'index.min.js', video_id)

            catalog_src = self._search_regex(
                r'catalog\(({.+?})\);', player_js, 'catalog', default=None)
            catalog = None
            if catalog_src:
                catalog = self._parse_json(
                    js_to_json(catalog_src), video_id, fatal=False)
            if catalog:
                found_key = catalog.get('policyKey')

            if not found_key:
                found_key = self._search_regex(
                    r'policyKey\s*:\s*(["\'])(?P<pk>.+?)\1',
                    player_js, 'policy key', group='pk')

        store_pk(found_key)
        return found_key

    api_url = 'https://edge.api.brightcove.com/playback/v1/accounts/%s/%ss/%s' % (account_id, content_type, video_id)

    headers = {}
    referrer = smuggled.get('referrer')
    if referrer:
        origin = re.search(r'https?://[^/]+', referrer).group(0)
        headers['Referer'] = referrer
        headers['Origin'] = origin

    # At most two attempts: one with the cached key, one with a fresh key.
    for _ in range(2):
        if not policy_key:
            policy_key = extract_policy_key()
            policy_key_extracted = True
        headers['Accept'] = 'application/json;pk=%s' % policy_key
        try:
            json_data = self._download_json(api_url, video_id, headers=headers)
        except ExtractorError as e:
            is_auth_failure = isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 403)
            if not is_auth_failure:
                raise
            json_data = self._parse_json(e.cause.read().decode(), video_id)[0]
            message = json_data.get('message') or json_data['error_code']
            if json_data.get('error_subcode') == 'CLIENT_GEO':
                self.raise_geo_restricted(msg=message)
            elif json_data.get('error_code') == 'INVALID_POLICY_KEY' and not policy_key_extracted:
                # The cached key is stale: drop it and retry with a fresh one.
                policy_key = None
                store_pk(None)
                continue
            raise ExtractorError(message, expected=True)
        else:
            break

    errors = json_data.get('errors')
    if errors and errors[0].get('error_subcode') == 'TVE_AUTH':
        custom_fields = json_data['custom_fields']
        tve_token = self._extract_mvpd_auth(
            smuggled['source_url'], video_id,
            custom_fields['bcadobepassrequestorid'],
            custom_fields['bcadobepassresourceid'])
        json_data = self._download_json(
            api_url, video_id, headers={
                'Accept': 'application/json;pk=%s' % policy_key
            }, query={
                'tveToken': tve_token,
            })

    if content_type == 'playlist':
        entries = [
            self._parse_brightcove_metadata(vid, vid.get('id'), headers)
            for vid in json_data.get('videos', []) if vid.get('id')]
        return self.playlist_result(
            entries, json_data.get('id'), json_data.get('name'),
            json_data.get('description'))

    return self._parse_brightcove_metadata(
        json_data, video_id, headers=headers)

View File

@@ -0,0 +1,48 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from .jwplatform import JWPlatformIE
class BusinessInsiderIE(InfoExtractor):
    """Business Insider article pages; playback is delegated to JWPlatformIE."""

    _VALID_URL = r'https?://(?:[^/]+\.)?businessinsider\.(?:com|nl)/(?:[^/]+/)*(?P<id>[^/?#&]+)'
    _TESTS = [{
        'url': 'http://uk.businessinsider.com/how-much-radiation-youre-exposed-to-in-everyday-life-2016-6',
        'md5': 'ffed3e1e12a6f950aa2f7d83851b497a',
        'info_dict': {
            'id': 'cjGDb0X9',
            'ext': 'mp4',
            'title': "Bananas give you more radiation exposure than living next to a nuclear power plant",
            'description': 'md5:0175a3baf200dd8fa658f94cade841b3',
            'upload_date': '20160611',
            'timestamp': 1465675620,
        },
    }, {
        'url': 'https://www.businessinsider.nl/5-scientifically-proven-things-make-you-less-attractive-2017-7/',
        'md5': '43f438dbc6da0b89f5ac42f68529d84a',
        'info_dict': {
            'id': '5zJwd4FK',
            'ext': 'mp4',
            'title': 'Deze dingen zorgen ervoor dat je minder snel een date scoort',
            'description': 'md5:2af8975825d38a4fed24717bbe51db49',
            'upload_date': '20170705',
            'timestamp': 1499270528,
        },
    }, {
        'url': 'http://www.businessinsider.com/excel-index-match-vlookup-video-how-to-2015-2?IR=T',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Locate the 8-character JW Platform media id in the page and return a URL result for it."""
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # Candidate patterns for the media id embedded in the article markup.
        media_id_patterns = (
            r'data-media-id=["\']([a-zA-Z0-9]{8})',
            r'id=["\']jwplayer_([a-zA-Z0-9]{8})',
            r'id["\']?\s*:\s*["\']?([a-zA-Z0-9]{8})',
            r'(?:jwplatform\.com/players/|jwplayer_)([a-zA-Z0-9]{8})',
        )
        jwplatform_id = self._search_regex(
            media_id_patterns, webpage, 'jwplatform id')

        target = 'jwplatform:%s' % jwplatform_id
        return self.url_result(
            target, ie=JWPlatformIE.ie_key(), video_id=video_id)

View File

@@ -0,0 +1,98 @@
# coding: utf-8
from __future__ import unicode_literals
import json
import re
from .common import InfoExtractor
from .facebook import FacebookIE
class BuzzFeedIE(InfoExtractor):
    """BuzzFeed article pages, returned as a playlist of the embedded videos."""

    _VALID_URL = r'https?://(?:www\.)?buzzfeed\.com/[^?#]*?/(?P<id>[^?#]+)'
    _TESTS = [{
        'url': 'http://www.buzzfeed.com/abagg/this-angry-ram-destroys-a-punching-bag-like-a-boss?utm_term=4ldqpia',
        'info_dict': {
            'id': 'this-angry-ram-destroys-a-punching-bag-like-a-boss',
            'title': 'This Angry Ram Destroys A Punching Bag Like A Boss',
            'description': 'Rambro!',
        },
        'playlist': [{
            'info_dict': {
                'id': 'aVCR29aE_OQ',
                'ext': 'mp4',
                'title': 'Angry Ram destroys a punching bag..',
                'description': 'md5:c59533190ef23fd4458a5e8c8c872345',
                'upload_date': '20141024',
                'uploader_id': 'Buddhanz1',
                'uploader': 'Angry Ram',
            }
        }]
    }, {
        'url': 'http://www.buzzfeed.com/sheridanwatson/look-at-this-cute-dog-omg?utm_term=4ldqpia',
        'params': {
            'skip_download': True,  # Got enough YouTube download tests
        },
        'info_dict': {
            'id': 'look-at-this-cute-dog-omg',
            'description': 're:Munchkin the Teddy Bear is back ?!',
            'title': 'You Need To Stop What You\'re Doing And Watching This Dog Walk On A Treadmill',
        },
        'playlist': [{
            'info_dict': {
                'id': 'mVmBL8B-In0',
                'ext': 'mp4',
                'title': 're:Munchkin the Teddy Bear gets her exercise',
                'description': 'md5:28faab95cda6e361bcff06ec12fc21d8',
                'upload_date': '20141124',
                'uploader_id': 'CindysMunchkin',
                'uploader': 're:^Munchkin the',
            },
        }]
    }, {
        'url': 'http://www.buzzfeed.com/craigsilverman/the-most-adorable-crash-landing-ever#.eq7pX0BAmK',
        'info_dict': {
            'id': 'the-most-adorable-crash-landing-ever',
            'title': 'Watch This Baby Goose Make The Most Adorable Crash Landing',
            'description': 'This gosling knows how to stick a landing.',
        },
        'playlist': [{
            'md5': '763ca415512f91ca62e4621086900a23',
            'info_dict': {
                'id': '971793786185728',
                'ext': 'mp4',
                'title': 'We set up crash pads so that the goslings on our roof would have a safe landi...',
                'uploader': 'Calgary Outdoor Centre-University of Calgary',
            },
        }],
        'add_ie': ['Facebook'],
    }]

    def _real_extract(self, url):
        """Collect every video referenced by the article into one playlist.

        Entries come first from the JSON stored in the ``rel:bf_bucket_data``
        attribute of ``video-embed`` divs, then from the Facebook URLs that
        FacebookIE finds in the page.
        """
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)

        bucket_blobs = re.findall(
            r'(?s)<div class="video-embed[^"]*"..*?rel:bf_bucket_data=\'([^\']+)\'',
            webpage)

        entries = []
        for blob in bucket_blobs:
            bucket = json.loads(blob)
            video = bucket.get('video') or bucket.get('progload_video')
            if video:
                entries.append(self.url_result(video['url']))

        for facebook_url in FacebookIE._extract_urls(webpage):
            entries.append(self.url_result(facebook_url))

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': self._og_search_title(webpage),
            'description': self._og_search_description(webpage),
            'entries': entries,
        }

View File

@@ -0,0 +1,122 @@
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
determine_ext,
merge_dicts,
parse_duration,
url_or_none,
)
class BYUtvIE(InfoExtractor):
    """BYUtv watch/player pages (event pages are excluded by the URL pattern)."""

    _VALID_URL = r'https?://(?:www\.)?byutv\.org/(?:watch|player)/(?!event/)(?P<id>[0-9a-f-]+)(?:/(?P<display_id>[^/?#&]+))?'
    _TESTS = [{
        # ooyalaVOD
        'url': 'http://www.byutv.org/watch/6587b9a3-89d2-42a6-a7f7-fd2f81840a7d/studio-c-season-5-episode-5',
        'info_dict': {
            'id': 'ZvanRocTpW-G5_yZFeltTAMv6jxOU9KH',
            'display_id': 'studio-c-season-5-episode-5',
            'ext': 'mp4',
            'title': 'Season 5 Episode 5',
            'description': 'md5:1d31dc18ef4f075b28f6a65937d22c65',
            'thumbnail': r're:^https?://.*',
            'duration': 1486.486,
        },
        'params': {
            'skip_download': True,
        },
        'add_ie': ['Ooyala'],
    }, {
        # dvr
        'url': 'https://www.byutv.org/player/8f1dab9b-b243-47c8-b525-3e2d021a3451/byu-softball-pacific-vs-byu-41219---game-2',
        'info_dict': {
            'id': '8f1dab9b-b243-47c8-b525-3e2d021a3451',
            'display_id': 'byu-softball-pacific-vs-byu-41219---game-2',
            'ext': 'mp4',
            'title': 'Pacific vs. BYU (4/12/19)',
            'description': 'md5:1ac7b57cb9a78015910a4834790ce1f3',
            'duration': 11645,
        },
        'params': {
            'skip_download': True
        },
    }, {
        'url': 'http://www.byutv.org/watch/6587b9a3-89d2-42a6-a7f7-fd2f81840a7d',
        'only_matching': True,
    }, {
        'url': 'https://www.byutv.org/player/27741493-dc83-40b0-8420-e7ae38a2ae98/byu-football-toledo-vs-byu-93016?listid=4fe0fee5-0d3c-4a29-b725-e4948627f472&listindex=0&q=toledo',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Extract a BYUtv video via the ``getvideosforcontent`` catalog API.

        When the API response has an ``ooyalaVOD`` entry, a ``url_transparent``
        result pointing at the Ooyala extractor is returned.  Otherwise every
        dict-valued entry of the response that carries a ``videoUrl`` adds
        formats (HLS, DASH or a direct URL) and its descriptive metadata.
        The display id is used as the title only when no entry supplies one.
        """
        mobj = self._match_valid_url(url)
        video_id = mobj.group('id')
        display_id = mobj.group('display_id') or video_id

        video = self._download_json(
            'https://api.byutv.org/api3/catalog/getvideosforcontent',
            display_id, query={
                'contentid': video_id,
                'channel': 'byutv',
                'x-byutv-context': 'web$US',
            }, headers={
                'x-byutv-context': 'web$US',
                'x-byutv-platformkey': 'xsaaw9c7y5',
            })

        ep = video.get('ooyalaVOD')
        if ep:
            return {
                '_type': 'url_transparent',
                'ie_key': 'Ooyala',
                'url': 'ooyala:%s' % ep['providerId'],
                'id': video_id,
                'display_id': display_id,
                'title': ep.get('title'),
                'description': ep.get('description'),
                'thumbnail': ep.get('imageThumbnail'),
            }

        info = {}
        formats = []
        subtitles = {}
        for format_id, ep in video.items():
            if not isinstance(ep, dict):
                continue
            video_url = url_or_none(ep.get('videoUrl'))
            if not video_url:
                continue
            ext = determine_ext(video_url)
            if ext == 'm3u8':
                m3u8_fmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
                    video_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id='hls', fatal=False)
                formats.extend(m3u8_fmts)
                subtitles = self._merge_subtitles(subtitles, m3u8_subs)
            elif ext == 'mpd':
                mpd_fmts, mpd_subs = self._extract_mpd_formats_and_subtitles(
                    video_url, video_id, mpd_id='dash', fatal=False)
                formats.extend(mpd_fmts)
                subtitles = self._merge_subtitles(subtitles, mpd_subs)
            else:
                formats.append({
                    'url': video_url,
                    'format_id': format_id,
                })
            # The merged dict is the return value of merge_dicts (as the final
            # return below relies on), so it must be re-bound here; discarding
            # it left `info` empty and lost all of this metadata.
            info = merge_dicts(info, {
                'title': ep.get('title'),
                'description': ep.get('description'),
                'thumbnail': ep.get('imageThumbnail'),
                'duration': parse_duration(ep.get('length')),
            })
        self._sort_formats(formats)

        # `info` is passed first so that metadata gathered from the API is
        # merged ahead of the display-id title fallback.
        return merge_dicts(info, {
            'id': video_id,
            'display_id': display_id,
            'title': display_id,
            'formats': formats,
            'subtitles': subtitles,
        })

View File

@@ -0,0 +1,64 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import js_to_json
class C56IE(InfoExtractor):
    """56.com video pages; pages that embed Sohu video info are handed to the Sohu extractor."""

    _VALID_URL = r'https?://(?:(?:www|player)\.)?56\.com/(?:.+?/)?(?:v_|(?:play_album.+-))(?P<textid>.+?)\.(?:html|swf)'
    IE_NAME = '56.com'
    _TESTS = [{
        'url': 'http://www.56.com/u39/v_OTM0NDA3MTY.html',
        'md5': 'e59995ac63d0457783ea05f93f12a866',
        'info_dict': {
            'id': '93440716',
            'ext': 'flv',
            'title': '网事知多少 第32期:车怒',
            'duration': 283.813,
        },
    }, {
        'url': 'http://www.56.com/u47/v_MTM5NjQ5ODc2.html',
        'md5': '',
        'info_dict': {
            'id': '82247482',
            'title': '爱的诅咒之杜鹃花开',
        },
        'playlist_count': 7,
        'add_ie': ['Sohu'],
    }]

    def _real_extract(self, url):
        """Return a Sohu URL result when the page defines ``sohuVideoInfo``,
        otherwise build formats from the vxml.56.com JSON description."""
        text_id = self._match_valid_url(url).group('textid')

        webpage = self._download_webpage(url, text_id)
        sohu_blob = self._search_regex(
            r'var\s+sohuVideoInfo\s*=\s*({[^}]+});', webpage, 'Sohu video info', default=None)
        if sohu_blob:
            sohu_info = self._parse_json(
                sohu_blob, text_id, transform_source=js_to_json)
            return self.url_result(sohu_info['url'], 'Sohu')

        info_url = 'http://vxml.56.com/json/%s/' % text_id
        info = self._download_json(
            info_url, text_id, 'Downloading video info')['info']

        formats = []
        for rfile in info['rfiles']:
            formats.append({
                'format_id': rfile['type'],
                'filesize': int(rfile['filesize']),
                'url': rfile['url'],
            })
        self._sort_formats(formats)

        return {
            'id': info['vid'],
            'title': info['Subject'],
            # the API value is divided by 1000 to give seconds
            'duration': int(info['duration']) / 1000.0,
            'formats': formats,
            'thumbnail': info.get('bimg') or info.get('img'),
        }

View File

@@ -0,0 +1,34 @@
# coding: utf-8
from .common import InfoExtractor
class CableAVIE(InfoExtractor):
    """cableav.tv video pages, extracted from the page's Open Graph metadata."""

    _VALID_URL = r'https://cableav\.tv/(?P<id>[a-zA-Z0-9]+)'
    _TESTS = [{
        'url': 'https://cableav.tv/lS4iR9lWjN8/',
        'md5': '7e3fe5e49d61c4233b7f5b0f69b15e18',
        'info_dict': {
            'id': 'lS4iR9lWjN8',
            'ext': 'mp4',
            'title': '國產麻豆AV 叮叮映畫 DDF001 情欲小說家 - CableAV',
            'description': '國產AV 480p, 720p 国产麻豆AV 叮叮映画 DDF001 情欲小说家',
            'thumbnail': r're:^https?://.*\.jpg$',
        }
    }]

    def _real_extract(self, url):
        """Treat the Open Graph video URL as an HLS playlist and return its formats."""
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        playlist_url = self._og_search_video_url(webpage, secure=False)
        formats = self._extract_m3u8_formats(playlist_url, video_id, 'mp4')
        self._sort_formats(formats)

        title = self._og_search_title(webpage)
        description = self._og_search_description(webpage)
        thumbnail = self._og_search_thumbnail(webpage)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'formats': formats,
        }

View File

@@ -0,0 +1,114 @@
# coding: utf-8
from .common import InfoExtractor
from ..utils import (
traverse_obj,
float_or_none,
int_or_none
)
class CallinIE(InfoExtractor):
    """callin.com episode pages, extracted from the page's Next.js data."""

    _VALID_URL = r'https?://(?:www\.)?callin\.com/(episode)/(?P<id>[-a-zA-Z]+)'
    _TESTS = [{
        'url': 'https://www.callin.com/episode/the-title-ix-regime-and-the-long-march-through-EBfXYSrsjc',
        'info_dict': {
            'id': '218b979630a35ead12c6fd096f2996c56c37e4d0dc1f6dc0feada32dcf7b31cd',
            'title': 'The Title IX Regime and the Long March Through and Beyond the Institutions',
            'ext': 'ts',
            'display_id': 'the-title-ix-regime-and-the-long-march-through-EBfXYSrsjc',
            'thumbnail': 're:https://.+\\.png',
            'description': 'First episode',
            'uploader': 'Wesley Yang',
            'timestamp': 1639404128.65,
            'upload_date': '20211213',
            'uploader_id': 'wesyang',
            'uploader_url': 'http://wesleyyang.substack.com',
            'channel': 'Conversations in Year Zero',
            'channel_id': '436d1f82ddeb30cd2306ea9156044d8d2cfdc3f1f1552d245117a42173e78553',
            'channel_url': 'https://callin.com/show/conversations-in-year-zero-oJNllRFSfx',
            'duration': 9951.936,
            'view_count': int,
            'categories': ['News & Politics', 'History', 'Technology'],
            'cast': ['Wesley Yang', 'KC Johnson', 'Gabi Abramovich'],
            'series': 'Conversations in Year Zero',
            'series_id': '436d1f82ddeb30cd2306ea9156044d8d2cfdc3f1f1552d245117a42173e78553',
            'episode': 'The Title IX Regime and the Long March Through and Beyond the Institutions',
            'episode_number': 1,
            'episode_id': '218b979630a35ead12c6fd096f2996c56c37e4d0dc1f6dc0feada32dcf7b31cd'
        }
    }]

    def try_get_user_name(self, d):
        """Build a display name from the ``first`` / ``last`` keys of *d*.

        Returns ``"<first> <last>"`` when both keys are present, the first
        truthy one when either is missing, and None when neither is usable.
        """
        names = [d.get(n) for n in ('first', 'last')]
        if None in names:
            # The built-in next() only accepts its default positionally;
            # passing it as ``default=None`` raises TypeError.
            return next((n for n in names if n), None)
        return ' '.join(names)

    def _real_extract(self, url):
        """Extract an episode from the page's Next.js data.

        The show's JSON document is fetched as well (non-fatally) when its
        URL can be derived; it supplies the host, the uploader URL and the
        episode list used to compute the episode number.
        """
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)

        next_data = self._search_nextjs_data(webpage, display_id)
        episode = next_data['props']['pageProps']['episode']

        episode_id = episode['id']
        title = (episode.get('title')
                 or self._og_search_title(webpage, fatal=False)
                 or self._html_search_regex('<title>(.*?)</title>', webpage, 'title'))
        m3u8_url = episode['m3u8']
        formats = self._extract_m3u8_formats(m3u8_url, display_id, ext='ts')
        self._sort_formats(formats)

        show = traverse_obj(episode, ('show', 'title'))
        show_id = traverse_obj(episode, ('show', 'id'))

        show_json = None
        app_slug = (self._html_search_regex(
            '<script\\s+src=["\']/_next/static/([-_a-zA-Z0-9]+)/_',
            webpage, 'app slug', fatal=False) or next_data.get('buildId'))
        show_slug = traverse_obj(episode, ('show', 'linkObj', 'resourceUrl'))
        if app_slug and show_slug and '/' in show_slug:
            # Only the last path component of the show URL is the slug.
            show_slug = show_slug.rsplit('/', 1)[1]
            show_json_url = f'https://www.callin.com/_next/data/{app_slug}/show/{show_slug}.json'
            show_json = self._download_json(show_json_url, display_id, fatal=False)

        host = (traverse_obj(show_json, ('pageProps', 'show', 'hosts', 0))
                or traverse_obj(episode, ('speakers', 0)))

        host_nick = traverse_obj(host, ('linkObj', 'resourceUrl'))
        host_nick = host_nick.rsplit('/', 1)[1] if (host_nick and '/' in host_nick) else None

        cast = list(filter(None, [
            self.try_get_user_name(u) for u in
            traverse_obj(episode, (('speakers', 'callerTags'), ...)) or []
        ]))

        episode_list = traverse_obj(show_json, ('pageProps', 'show', 'episodes')) or []
        # The number is counted from the end of the list, so the last
        # entry is episode 1.
        episode_number = next(
            (len(episode_list) - i for (i, e) in enumerate(episode_list) if e.get('id') == episode_id),
            None)

        return {
            'id': episode_id,
            'display_id': display_id,
            'title': title,
            'formats': formats,
            'thumbnail': traverse_obj(episode, ('show', 'photo')),
            'description': episode.get('description'),
            'uploader': self.try_get_user_name(host) if host else None,
            'timestamp': episode.get('publishedAt'),
            'uploader_id': host_nick,
            'uploader_url': traverse_obj(show_json, ('pageProps', 'show', 'url')),
            'channel': show,
            'channel_id': show_id,
            'channel_url': traverse_obj(episode, ('show', 'linkObj', 'resourceUrl')),
            'duration': float_or_none(episode.get('runtime')),
            'view_count': int_or_none(episode.get('plays')),
            'categories': traverse_obj(episode, ('show', 'categorizations', ..., 'name')),
            'cast': cast if cast else None,
            'series': show,
            'series_id': show_id,
            'episode': title,
            'episode_number': episode_number,
            'episode_id': episode_id
        }

View File

@@ -0,0 +1,35 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
class CAM4IE(InfoExtractor):
    """cam4.com channel pages, extracted as live HLS streams."""

    _VALID_URL = r'https?://(?:[^/]+\.)?cam4\.com/(?P<id>[a-z0-9_]+)'
    _TEST = {
        'url': 'https://www.cam4.com/foxynesss',
        'info_dict': {
            'id': 'foxynesss',
            'ext': 'mp4',
            'title': 're:^foxynesss [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
            'age_limit': 18,
            'live_status': 'is_live',
            'thumbnail': 'https://snapshots.xcdnpro.com/thumbnails/foxynesss',
        }
    }

    def _real_extract(self, url):
        """Read ``cdnURL`` from the channel's streamInfo endpoint and extract it as live HLS."""
        channel_id = self._match_id(url)

        stream_info = self._download_json(
            'https://www.cam4.com/rest/v1.0/profile/{}/streamInfo'.format(channel_id),
            channel_id)
        playlist_url = stream_info.get('cdnURL')

        formats = self._extract_m3u8_formats(
            playlist_url, channel_id, 'mp4', m3u8_id='hls', live=True)
        self._sort_formats(formats)

        return {
            'id': channel_id,
            'title': channel_id,
            'is_live': True,
            'age_limit': 18,
            'formats': formats,
            'thumbnail': f'https://snapshots.xcdnpro.com/thumbnails/{channel_id}',
        }

View File

@@ -0,0 +1,161 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import (
compat_urllib_parse_urlencode,
compat_urlparse,
)
from ..utils import (
clean_html,
parse_duration,
str_to_int,
unified_strdate,
)
class CamdemyIE(InfoExtractor):
    """camdemy.com media pages."""

    _VALID_URL = r'https?://(?:www\.)?camdemy\.com/media/(?P<id>\d+)'
    _TESTS = [{
        # single file
        'url': 'http://www.camdemy.com/media/5181/',
        'md5': '5a5562b6a98b37873119102e052e311b',
        'info_dict': {
            'id': '5181',
            'ext': 'mp4',
            'title': 'Ch1-1 Introduction, Signals (02-23-2012)',
            'thumbnail': r're:^https?://.*\.jpg$',
            'creator': 'ss11spring',
            'duration': 1591,
            'upload_date': '20130114',
            'view_count': int,
        }
    }, {
        # With non-empty description
        # webpage returns "No permission or not login"
        'url': 'http://www.camdemy.com/media/13885',
        'md5': '4576a3bb2581f86c61044822adbd1249',
        'info_dict': {
            'id': '13885',
            'ext': 'mp4',
            'title': 'EverCam + Camdemy QuickStart',
            'thumbnail': r're:^https?://.*\.jpg$',
            'description': 'md5:2a9f989c2b153a2342acee579c6e7db6',
            'creator': 'evercam',
            'duration': 318,
        }
    }, {
        # External source (YouTube)
        'url': 'http://www.camdemy.com/media/14842',
        'info_dict': {
            'id': '2vsYQzNIsJo',
            'ext': 'mp4',
            'title': 'Excel 2013 Tutorial - How to add Password Protection',
            'description': 'Excel 2013 Tutorial for Beginners - How to add Password Protection',
            'upload_date': '20130211',
            'uploader': 'Hun Kim',
            'uploader_id': 'hunkimtutorials',
        },
        'params': {
            'skip_download': True,
        },
    }]

    def _real_extract(self, url):
        """Extract one media item.

        Pages that name an external source are returned as a URL result for
        that source.  Otherwise the oEmbed JSON supplies the title, thumbnail
        and creator, and the media file name is read from ``fileList.xml`` in
        the ``video/`` folder next to the thumbnail.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        external_url = self._html_search_regex(
            r"class=['\"]srcFrom['\"][^>]*>Sources?(?:\s+from)?\s*:\s*<a[^>]+(?:href|title)=(['\"])(?P<url>(?:(?!\1).)+)\1",
            webpage, 'external source', default=None, group='url')
        if external_url:
            return self.url_result(external_url)

        oembed = self._download_json(
            'http://www.camdemy.com/oembed/?format=json&url=' + url, video_id)
        title = oembed['title']
        thumb_url = oembed['thumbnail_url']

        # Media files live in a "video/" folder resolved against the thumbnail URL.
        video_folder = compat_urlparse.urljoin(thumb_url, 'video/')
        file_list_url = compat_urlparse.urljoin(video_folder, 'fileList.xml')
        file_list = self._download_xml(
            file_list_url, video_id, 'Downloading filelist XML')
        file_name = file_list.find('./video/item/fileName').text
        video_url = compat_urlparse.urljoin(video_folder, file_name)

        # Some URLs return "No permission or not login" in a webpage despite being
        # freely available via oembed JSON URL (e.g. http://www.camdemy.com/media/13885)
        published = self._search_regex(
            r'>published on ([^<]+)<', webpage,
            'upload date', default=None)
        upload_date = unified_strdate(published)

        views = self._search_regex(
            r'role=["\']viewCnt["\'][^>]*>([\d,.]+) views',
            webpage, 'view count', default=None)
        view_count = str_to_int(views)

        description = self._html_search_meta('description', webpage, default=None)
        if not description:
            description = clean_html(oembed.get('description'))

        return {
            'id': video_id,
            'url': video_url,
            'title': title,
            'thumbnail': thumb_url,
            'description': description,
            'creator': oembed.get('author_name'),
            'duration': parse_duration(oembed.get('duration')),
            'upload_date': upload_date,
            'view_count': view_count,
        }
class CamdemyFolderIE(InfoExtractor):
    """camdemy.com folder pages, returned as playlists of media URLs."""

    _VALID_URL = r'https?://(?:www\.)?camdemy\.com/folder/(?P<id>\d+)'
    _TESTS = [{
        # links with trailing slash
        'url': 'http://www.camdemy.com/folder/450',
        'info_dict': {
            'id': '450',
            'title': '信號與系統 2012 & 2011 (Signals and Systems)',
        },
        'playlist_mincount': 145
    }, {
        # links without trailing slash
        # and multi-page
        'url': 'http://www.camdemy.com/folder/853',
        'info_dict': {
            'id': '853',
            'title': '科學計算 - 使用 Matlab'
        },
        'playlist_mincount': 20
    }, {
        # with displayMode parameter. For testing the codes to add parameters
        'url': 'http://www.camdemy.com/folder/853/?displayMode=defaultOrderByOrg',
        'info_dict': {
            'id': '853',
            'title': '科學計算 - 使用 Matlab'
        },
        'playlist_mincount': 20
    }]

    def _real_extract(self, url):
        """List every ``/media/<n>`` link of the folder page as a playlist entry.

        The folder's ``keywords`` meta tag is used as the playlist title.
        """
        folder_id = self._match_id(url)

        # Add displayMode=list so that all links are displayed in a single page
        url_parts = compat_urlparse.urlparse(url)
        query = dict(compat_urlparse.parse_qsl(url_parts.query))
        query['displayMode'] = 'list'
        list_url = compat_urlparse.urlunparse(
            url_parts._replace(query=compat_urllib_parse_urlencode(query)))

        page = self._download_webpage(list_url, folder_id)

        entries = []
        for media_path in re.findall(r"href='(/media/\d+/?)'", page):
            entries.append(self.url_result('http://www.camdemy.com' + media_path))

        folder_title = self._html_search_meta('keywords', page)

        return self.playlist_result(entries, folder_id, folder_title)

View File

@@ -0,0 +1,98 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
ExtractorError,
int_or_none,
url_or_none,
)
class CamModelsIE(InfoExtractor):
    """cammodels.com live cam pages."""

    _VALID_URL = r'https?://(?:www\.)?cammodels\.com/cam/(?P<id>[^/?#&]+)'
    _TESTS = [{
        'url': 'https://www.cammodels.com/cam/AutumnKnight/',
        'only_matching': True,
        'age_limit': 18
    }]

    def _real_extract(self, url):
        """Extract the RTMP and HLS encodings listed in the user's manifest JSON.

        Raises ExtractorError when the page has no manifest root.  The error
        is marked as expected when the page contains one of the known status
        messages (offline, private show, performing live).
        """
        user_id = self._match_id(url)

        webpage = self._download_webpage(
            url, user_id, headers=self.geo_verification_headers())

        manifest_root = self._html_search_regex(
            r'manifestUrlRoot=([^&\']+)', webpage, 'manifest', default=None)

        if not manifest_root:
            ERRORS = (
                ("I'm offline, but let's stay connected", 'This user is currently offline'),
                ('in a private show', 'This user is in a private show'),
                ('is currently performing LIVE', 'This model is currently performing live'),
            )
            # The first status message found in the page decides the error text.
            known_error = next(
                (message for pattern, message in ERRORS if pattern in webpage), None)
            if known_error is not None:
                raise ExtractorError(known_error, expected=True)
            raise ExtractorError('Unable to find manifest URL root', expected=False)

        manifest = self._download_json(
            '%s%s.json' % (manifest_root, user_id), user_id)

        formats = []
        for format_id, format_dict in manifest['formats'].items():
            if not isinstance(format_dict, dict):
                continue
            encodings = format_dict.get('encodings')
            if not isinstance(encodings, list):
                continue

            # Protocol-specific fields; any other kind of format is skipped.
            if 'rtmp' in format_id:
                protocol_fields = {'ext': 'flv'}
            elif 'hls' in format_id:
                protocol_fields = {
                    'ext': 'mp4',
                    # hls skips fragments, preferring rtmp
                    'quality': -10,
                }
            else:
                continue

            vcodec = format_dict.get('videoCodec')
            acodec = format_dict.get('audioCodec')
            for media in encodings:
                if not isinstance(media, dict):
                    continue
                media_url = url_or_none(media.get('location'))
                if not media_url:
                    continue

                height = int_or_none(media.get('videoHeight'))
                format_name = format_id
                if height is not None:
                    format_name = '-'.join([format_id, '%dp' % height])

                fmt = {
                    'url': media_url,
                    'format_id': format_name,
                    'width': int_or_none(media.get('videoWidth')),
                    'height': height,
                    'vbr': int_or_none(media.get('videoKbps')),
                    'abr': int_or_none(media.get('audioKbps')),
                    'fps': int_or_none(media.get('fps')),
                    'vcodec': vcodec,
                    'acodec': acodec,
                }
                fmt.update(protocol_fields)
                formats.append(fmt)
        self._sort_formats(formats)

        return {
            'id': user_id,
            'title': user_id,
            'is_live': True,
            'formats': formats,
            'age_limit': 18
        }

View File

@@ -0,0 +1,89 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
int_or_none,
parse_duration,
unified_strdate,
)
class CamWithHerIE(InfoExtractor):
    """camwithher.tv video pages, served over RTMP."""

    _VALID_URL = r'https?://(?:www\.)?camwithher\.tv/view_video\.php\?.*\bviewkey=(?P<id>\w+)'
    _TESTS = [{
        'url': 'http://camwithher.tv/view_video.php?viewkey=6e9a24e2c0e842e1f177&page=&viewtype=&category=',
        'info_dict': {
            'id': '5644',
            'ext': 'flv',
            'title': 'Periscope Tease',
            'description': 'In the clouds teasing on periscope to my favorite song',
            'duration': 240,
            'view_count': int,
            'comment_count': int,
            'uploader': 'MileenaK',
            'upload_date': '20160322',
            'age_limit': 18,
        },
        'params': {
            'skip_download': True,
        }
    }, {
        'url': 'http://camwithher.tv/view_video.php?viewkey=6dfd8b7c97531a459937',
        'only_matching': True,
    }, {
        'url': 'http://camwithher.tv/view_video.php?page=&viewkey=6e9a24e2c0e842e1f177&viewtype=&category=',
        'only_matching': True,
    }, {
        'url': 'http://camwithher.tv/view_video.php?viewkey=b6c3b5bea9515d1a1fc4&page=&viewtype=&category=mv',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Build the RTMP URL from the numeric id in the page's download link
        and scrape the remaining metadata from the page."""
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        flv_id = self._html_search_regex(
            r'<a[^>]+href=["\']/download/\?v=(\d+)', webpage, 'video id')

        # Video URL construction algorithm is reverse-engineered from cwhplayer.swf
        if int(flv_id) > 2010:
            play_name = 'mp4:%s.mp4' % flv_id
        else:
            play_name = flv_id
        rtmp_url = 'rtmp://camwithher.tv/clipshare/%s' % play_name

        title = self._html_search_regex(
            r'<div[^>]+style="float:left"[^>]*>\s*<h2>(.+?)</h2>', webpage, 'title')
        description = self._html_search_regex(
            r'>Description:</span>(.+?)</div>', webpage, 'description', default=None)

        runtime = self._search_regex(
            r'Runtime\s*:\s*(.+?) \|', webpage, 'duration', default=None)
        if runtime:
            # whitespace and dashes are removed before parsing
            runtime = re.sub(r'[\s-]', '', runtime)
        duration = parse_duration(runtime)

        views = self._search_regex(
            r'Views\s*:\s*(\d+)', webpage, 'view count', default=None)
        view_count = int_or_none(views)

        comments = self._search_regex(
            r'Comments\s*:\s*(\d+)', webpage, 'comment count', default=None)
        comment_count = int_or_none(comments)

        uploader = self._search_regex(
            r'Added by\s*:\s*<a[^>]+>([^<]+)</a>', webpage, 'uploader', default=None)

        added_on = self._search_regex(
            r'Added on\s*:\s*([\d-]+)', webpage, 'upload date', default=None)
        upload_date = unified_strdate(added_on)

        return {
            'id': flv_id,
            'url': rtmp_url,
            'ext': 'flv',
            'no_resume': True,
            'title': title,
            'description': description,
            'duration': duration,
            'view_count': view_count,
            'comment_count': comment_count,
            'uploader': uploader,
            'upload_date': upload_date,
            'age_limit': 18
        }

View File

@@ -0,0 +1,98 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
clean_html,
dict_get,
try_get,
unified_strdate,
)
class CanalAlphaIE(InfoExtractor):
    """canalalpha.ch play pages, extracted from the embedded ``__SERVER_STATE__`` JSON."""

    _VALID_URL = r'https?://(?:www\.)?canalalpha\.ch/play/[^/]+/[^/]+/(?P<id>\d+)/?.*'
    _TESTS = [{
        'url': 'https://www.canalalpha.ch/play/le-journal/episode/24520/jeudi-28-octobre-2021',
        'info_dict': {
            'id': '24520',
            'ext': 'mp4',
            'title': 'Jeudi 28 octobre 2021',
            'description': 'md5:d30c6c3e53f8ad40d405379601973b30',
            'thumbnail': 'https://static.canalalpha.ch/poster/journal/journal_20211028.jpg',
            'upload_date': '20211028',
            'duration': 1125,
        },
        'params': {'skip_download': True}
    }, {
        'url': 'https://www.canalalpha.ch/play/le-journal/topic/24512/la-poste-fait-de-neuchatel-un-pole-cryptographique',
        'info_dict': {
            'id': '24512',
            'ext': 'mp4',
            'title': 'La Poste fait de Neuchâtel un pôle cryptographique',
            'description': 'md5:4ba63ae78a0974d1a53d6703b6e1dedf',
            'thumbnail': 'https://static.canalalpha.ch/poster/news/news_39712.jpg',
            'upload_date': '20211028',
            'duration': 138,
        },
        'params': {'skip_download': True}
    }, {
        'url': 'https://www.canalalpha.ch/play/eureka/episode/24484/ces-innovations-qui-veulent-rendre-lagriculture-plus-durable',
        'info_dict': {
            'id': '24484',
            'ext': 'mp4',
            'title': 'Ces innovations qui veulent rendre l’agriculture plus durable',
            'description': 'md5:3de3f151180684621e85be7c10e4e613',
            'thumbnail': 'https://static.canalalpha.ch/poster/magazine/magazine_10236.jpg',
            'upload_date': '20211026',
            'duration': 360,
        },
        'params': {'skip_download': True}
    }, {
        'url': 'https://www.canalalpha.ch/play/avec-le-temps/episode/23516/redonner-de-leclat-grace-au-polissage',
        'info_dict': {
            'id': '23516',
            'ext': 'mp4',
            'title': 'Redonner de l\'éclat grâce au polissage',
            'description': 'md5:0d8fbcda1a5a4d6f6daa3165402177e1',
            'thumbnail': 'https://static.canalalpha.ch/poster/magazine/magazine_9990.png',
            'upload_date': '20210726',
            'duration': 360,
        },
        'params': {'skip_download': True}
    }]

    def _real_extract(self, url):
        """Extract formats from the page's embedded server-state JSON.

        Direct MP4 entries are taken from ``video.mp4``; HLS and DASH formats
        are added when ``video.manifests`` lists them.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)
        data_json = self._parse_json(self._search_regex(
            r'window\.__SERVER_STATE__\s?=\s?({(?:(?!};)[^"]|"([^"]|\\")*")+})\s?;',
            webpage, 'data_json'), video_id)['1']['data']['data']
        manifests = try_get(data_json, lambda x: x['video']['manifests'], expected_type=dict) or {}
        subtitles = {}
        formats = [{
            'url': video['$url'],
            'ext': 'mp4',
            'width': try_get(video, lambda x: x['res']['width'], expected_type=int),
            'height': try_get(video, lambda x: x['res']['height'], expected_type=int),
        } for video in try_get(data_json, lambda x: x['video']['mp4'], expected_type=list) or [] if video.get('$url')]

        # NOTE(review): the manifest entries are assumed to be URLs (they come
        # from the page JSON next to the mp4 `$url` fields), so they have to be
        # downloaded with the _extract_* helpers; the _parse_* helpers take an
        # already-fetched manifest document.  Failures here are non-fatal
        # because the direct MP4 formats may still be usable.
        if manifests.get('hls'):
            m3u8_frmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
                manifests['hls'], video_id, 'mp4', m3u8_id='hls', fatal=False)
            formats.extend(m3u8_frmts)
            subtitles = self._merge_subtitles(subtitles, m3u8_subs)
        if manifests.get('dash'):
            dash_frmts, dash_subs = self._extract_mpd_formats_and_subtitles(
                manifests['dash'], video_id, mpd_id='dash', fatal=False)
            formats.extend(dash_frmts)
            subtitles = self._merge_subtitles(subtitles, dash_subs)
        self._sort_formats(formats)
        return {
            'id': video_id,
            'title': data_json.get('title').strip(),
            'description': clean_html(dict_get(data_json, ('longDesc', 'shortDesc'))),
            'thumbnail': data_json.get('poster'),
            'upload_date': unified_strdate(dict_get(data_json, ('webPublishAt', 'featuredAt', 'diffusionDate'))),
            'duration': try_get(data_json, lambda x: x['video']['duration'], expected_type=int),
            'formats': formats,
            'subtitles': subtitles,
        }

View File

@@ -0,0 +1,73 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import parse_duration
class Canalc2IE(InfoExtractor):
    """canalc2.tv videos, including links from the archives-canalc2.u-strasbg.fr site."""

    IE_NAME = 'canalc2.tv'
    _VALID_URL = r'https?://(?:(?:www\.)?canalc2\.tv/video/|archives-canalc2\.u-strasbg\.fr/video\.asp\?.*\bidVideo=)(?P<id>\d+)'

    _TESTS = [{
        'url': 'http://www.canalc2.tv/video/12163',
        'md5': '060158428b650f896c542dfbb3d6487f',
        'info_dict': {
            'id': '12163',
            'ext': 'mp4',
            'title': 'Terrasses du Numérique',
            'duration': 122,
        },
    }, {
        'url': 'http://archives-canalc2.u-strasbg.fr/video.asp?idVideo=11427&voir=oui',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Extract formats from the canalc2.tv page for the video id.

        Every ``file = "<url>"`` assignment in the page becomes an RTMP or
        HTTP format; when the page has none, the first HTML5 media entry is
        used instead.
        """
        video_id = self._match_id(url)

        # The canonical canalc2.tv page is fetched whichever URL form was given.
        webpage = self._download_webpage(
            'http://www.canalc2.tv/video/%s' % video_id, video_id)

        title = self._html_search_regex(
            r'(?s)class="[^"]*col_description[^"]*">.*?<h3>(.+?)</h3>',
            webpage, 'title')

        formats = []
        for file_match in re.finditer(r'file\s*=\s*(["\'])(.+?)\1', webpage):
            video_url = file_match.group(2)
            if not video_url.startswith('rtmp://'):
                formats.append({
                    'url': video_url,
                    'format_id': 'http',
                })
                continue
            rtmp = re.search(
                r'^(?P<url>rtmp://[^/]+/(?P<app>.+/))(?P<play_path>mp4:.+)$', video_url)
            formats.append({
                'url': rtmp.group('url'),
                'format_id': 'rtmp',
                'ext': 'flv',
                'app': rtmp.group('app'),
                'play_path': rtmp.group('play_path'),
                'page_url': url,
            })

        if formats:
            info = {'formats': formats}
        else:
            info = self._parse_html5_media_entries(url, webpage, url)[0]

        self._sort_formats(info['formats'])

        info['id'] = video_id
        info['title'] = title
        info['duration'] = parse_duration(self._search_regex(
            r'id=["\']video_duree["\'][^>]*>([^<]+)',
            webpage, 'duration', fatal=False))
        return info

Some files were not shown because too many files have changed in this diff Show More