Plugin cleanup and tweaks

This commit is contained in:
2023-02-20 19:18:45 -06:00
parent 372e4ff3dc
commit 3ad9e1c7bb
1138 changed files with 48878 additions and 40445 deletions

View File

@@ -1,33 +1,15 @@
import os
from ..compat.compat_utils import passthrough_module
from ..utils import load_plugins
_LAZY_LOADER = False
if not os.environ.get('YTDLP_NO_LAZY_EXTRACTORS'):
try:
from .lazy_extractors import *
from .lazy_extractors import _ALL_CLASSES
_LAZY_LOADER = True
except ImportError:
pass
if not _LAZY_LOADER:
from .extractors import *
_ALL_CLASSES = [
klass
for name, klass in globals().items()
if name.endswith('IE') and name != 'GenericIE'
]
_ALL_CLASSES.append(GenericIE)
_PLUGIN_CLASSES = load_plugins('extractor', 'IE', globals())
_ALL_CLASSES = list(_PLUGIN_CLASSES.values()) + _ALL_CLASSES
passthrough_module(__name__, '.extractors')
del passthrough_module
def gen_extractor_classes():
""" Return a list of supported extractors.
The order does matter; the first extractor matched is the one handling the URL.
"""
from .extractors import _ALL_CLASSES
return _ALL_CLASSES
@@ -38,17 +20,23 @@ def gen_extractors():
return [klass() for klass in gen_extractor_classes()]
def list_extractors(age_limit):
"""
Return a list of extractors that are suitable for the given age,
sorted by extractor ID.
"""
def list_extractor_classes(age_limit=None):
"""Return a list of extractors that are suitable for the given age, sorted by extractor name"""
from .generic import GenericIE
return sorted(
filter(lambda ie: ie.is_suitable(age_limit), gen_extractors()),
key=lambda ie: ie.IE_NAME.lower())
yield from sorted(filter(
lambda ie: ie.is_suitable(age_limit) and ie != GenericIE,
gen_extractor_classes()), key=lambda ie: ie.IE_NAME.lower())
yield GenericIE
def list_extractors(age_limit=None):
"""Return a list of extractor instances that are suitable for the given age, sorted by extractor name"""
return [ie() for ie in list_extractor_classes(age_limit)]
def get_info_extractor(ie_name):
"""Returns the info extractor class with the given ie_name"""
return globals()[ie_name + 'IE']
from . import extractors
return getattr(extractors, f'{ie_name}IE')

File diff suppressed because it is too large Load Diff

View File

@@ -1,5 +1,3 @@
from __future__ import unicode_literals
import hashlib
import hmac
import re
@@ -157,8 +155,6 @@ class ABCIE(InfoExtractor):
'format_id': format_id
})
self._sort_formats(formats)
return {
'id': video_id,
'title': self._og_search_title(webpage),
@@ -213,7 +209,7 @@ class ABCIViewIE(InfoExtractor):
'hdnea': token,
})
for sd in ('720', 'sd', 'sd-low'):
for sd in ('1080', '720', 'sd', 'sd-low'):
sd_url = try_get(
stream, lambda x: x['streams']['hls'][sd], compat_str)
if not sd_url:
@@ -223,7 +219,6 @@ class ABCIViewIE(InfoExtractor):
entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)
if formats:
break
self._sort_formats(formats)
subtitles = {}
src_vtt = stream.get('captions', {}).get('src-vtt')

View File

@@ -1,7 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
from .amp import AMPIE
from .common import InfoExtractor
from ..utils import (

View File

@@ -1,7 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
@@ -82,7 +78,6 @@ class ABCOTVSIE(InfoExtractor):
'url': mp4_url,
'width': 640,
})
self._sort_formats(formats)
image = video.get('image') or {}
@@ -123,7 +118,6 @@ class ABCOTVSClipsIE(InfoExtractor):
title = video_data['title']
formats = self._extract_m3u8_formats(
video_data['videoURL'].split('?')[0], video_id, 'mp4')
self._sort_formats(formats)
return {
'id': video_id,

View File

@@ -0,0 +1,522 @@
import base64
import binascii
import functools
import hashlib
import hmac
import io
import json
import re
import struct
import time
import urllib.parse
import urllib.request
import urllib.response
import uuid
from .common import InfoExtractor
from ..aes import aes_ecb_decrypt
from ..utils import (
ExtractorError,
bytes_to_intlist,
decode_base_n,
int_or_none,
intlist_to_bytes,
OnDemandPagedList,
request_to_url,
time_seconds,
traverse_obj,
update_url_query,
)
# NOTE: network handler related code is temporary thing until network stack overhaul PRs are merged (#2861/#2862)
def add_opener(ydl, handler):
''' Add a handler for opening URLs, like _download_webpage '''
# https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L426
# https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L605
assert isinstance(ydl._opener, urllib.request.OpenerDirector)
ydl._opener.add_handler(handler)
def remove_opener(ydl, handler):
'''
Remove handler(s) for opening URLs
@param handler Either handler object itself or handler type.
Specifying handler type will remove all handler which isinstance returns True.
'''
# https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L426
# https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L605
opener = ydl._opener
assert isinstance(ydl._opener, urllib.request.OpenerDirector)
if isinstance(handler, (type, tuple)):
find_cp = lambda x: isinstance(x, handler)
else:
find_cp = lambda x: x is handler
removed = []
for meth in dir(handler):
if meth in ["redirect_request", "do_open", "proxy_open"]:
# oops, coincidental match
continue
i = meth.find("_")
protocol = meth[:i]
condition = meth[i + 1:]
if condition.startswith("error"):
j = condition.find("_") + i + 1
kind = meth[j + 1:]
try:
kind = int(kind)
except ValueError:
pass
lookup = opener.handle_error.get(protocol, {})
opener.handle_error[protocol] = lookup
elif condition == "open":
kind = protocol
lookup = opener.handle_open
elif condition == "response":
kind = protocol
lookup = opener.process_response
elif condition == "request":
kind = protocol
lookup = opener.process_request
else:
continue
handlers = lookup.setdefault(kind, [])
if handlers:
handlers[:] = [x for x in handlers if not find_cp(x)]
removed.append(x for x in handlers if find_cp(x))
if removed:
for x in opener.handlers:
if find_cp(x):
x.add_parent(None)
opener.handlers[:] = [x for x in opener.handlers if not find_cp(x)]
class AbemaLicenseHandler(urllib.request.BaseHandler):
handler_order = 499
STRTABLE = '123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz'
HKEY = b'3AF0298C219469522A313570E8583005A642E73EDD58E3EA2FB7339D3DF1597E'
def __init__(self, ie: 'AbemaTVIE'):
# the protocol that this should really handle is 'abematv-license://'
# abematv_license_open is just a placeholder for development purposes
# ref. https://github.com/python/cpython/blob/f4c03484da59049eb62a9bf7777b963e2267d187/Lib/urllib/request.py#L510
setattr(self, 'abematv-license_open', getattr(self, 'abematv_license_open'))
self.ie = ie
def _get_videokey_from_ticket(self, ticket):
to_show = self.ie.get_param('verbose', False)
media_token = self.ie._get_media_token(to_show=to_show)
license_response = self.ie._download_json(
'https://license.abema.io/abematv-hls', None, note='Requesting playback license' if to_show else False,
query={'t': media_token},
data=json.dumps({
'kv': 'a',
'lt': ticket
}).encode('utf-8'),
headers={
'Content-Type': 'application/json',
})
res = decode_base_n(license_response['k'], table=self.STRTABLE)
encvideokey = bytes_to_intlist(struct.pack('>QQ', res >> 64, res & 0xffffffffffffffff))
h = hmac.new(
binascii.unhexlify(self.HKEY),
(license_response['cid'] + self.ie._DEVICE_ID).encode('utf-8'),
digestmod=hashlib.sha256)
enckey = bytes_to_intlist(h.digest())
return intlist_to_bytes(aes_ecb_decrypt(encvideokey, enckey))
def abematv_license_open(self, url):
url = request_to_url(url)
ticket = urllib.parse.urlparse(url).netloc
response_data = self._get_videokey_from_ticket(ticket)
return urllib.response.addinfourl(io.BytesIO(response_data), headers={
'Content-Length': len(response_data),
}, url=url, code=200)
class AbemaTVBaseIE(InfoExtractor):
_USERTOKEN = None
_DEVICE_ID = None
_MEDIATOKEN = None
_SECRETKEY = b'v+Gjs=25Aw5erR!J8ZuvRrCx*rGswhB&qdHd_SYerEWdU&a?3DzN9BRbp5KwY4hEmcj5#fykMjJ=AuWz5GSMY-d@H7DMEh3M@9n2G552Us$$k9cD=3TxwWe86!x#Zyhe'
@classmethod
def _generate_aks(cls, deviceid):
deviceid = deviceid.encode('utf-8')
# add 1 hour and then drop minute and secs
ts_1hour = int((time_seconds() // 3600 + 1) * 3600)
time_struct = time.gmtime(ts_1hour)
ts_1hour_str = str(ts_1hour).encode('utf-8')
tmp = None
def mix_once(nonce):
nonlocal tmp
h = hmac.new(cls._SECRETKEY, digestmod=hashlib.sha256)
h.update(nonce)
tmp = h.digest()
def mix_tmp(count):
nonlocal tmp
for i in range(count):
mix_once(tmp)
def mix_twist(nonce):
nonlocal tmp
mix_once(base64.urlsafe_b64encode(tmp).rstrip(b'=') + nonce)
mix_once(cls._SECRETKEY)
mix_tmp(time_struct.tm_mon)
mix_twist(deviceid)
mix_tmp(time_struct.tm_mday % 5)
mix_twist(ts_1hour_str)
mix_tmp(time_struct.tm_hour % 5)
return base64.urlsafe_b64encode(tmp).rstrip(b'=').decode('utf-8')
def _get_device_token(self):
if self._USERTOKEN:
return self._USERTOKEN
username, _ = self._get_login_info()
AbemaTVBaseIE._USERTOKEN = username and self.cache.load(self._NETRC_MACHINE, username)
if AbemaTVBaseIE._USERTOKEN:
# try authentication with locally stored token
try:
self._get_media_token(True)
return
except ExtractorError as e:
self.report_warning(f'Failed to login with cached user token; obtaining a fresh one ({e})')
AbemaTVBaseIE._DEVICE_ID = str(uuid.uuid4())
aks = self._generate_aks(self._DEVICE_ID)
user_data = self._download_json(
'https://api.abema.io/v1/users', None, note='Authorizing',
data=json.dumps({
'deviceId': self._DEVICE_ID,
'applicationKeySecret': aks,
}).encode('utf-8'),
headers={
'Content-Type': 'application/json',
})
AbemaTVBaseIE._USERTOKEN = user_data['token']
# don't allow adding it 2 times or more, though it's guarded
remove_opener(self._downloader, AbemaLicenseHandler)
add_opener(self._downloader, AbemaLicenseHandler(self))
return self._USERTOKEN
def _get_media_token(self, invalidate=False, to_show=True):
if not invalidate and self._MEDIATOKEN:
return self._MEDIATOKEN
AbemaTVBaseIE._MEDIATOKEN = self._download_json(
'https://api.abema.io/v1/media/token', None, note='Fetching media token' if to_show else False,
query={
'osName': 'android',
'osVersion': '6.0.1',
'osLang': 'ja_JP',
'osTimezone': 'Asia/Tokyo',
'appId': 'tv.abema',
'appVersion': '3.27.1'
}, headers={
'Authorization': f'bearer {self._get_device_token()}',
})['token']
return self._MEDIATOKEN
def _call_api(self, endpoint, video_id, query=None, note='Downloading JSON metadata'):
return self._download_json(
f'https://api.abema.io/{endpoint}', video_id, query=query or {},
note=note,
headers={
'Authorization': f'bearer {self._get_device_token()}',
})
def _extract_breadcrumb_list(self, webpage, video_id):
for jld in re.finditer(
r'(?is)</span></li></ul><script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
webpage):
jsonld = self._parse_json(jld.group('json_ld'), video_id, fatal=False)
if traverse_obj(jsonld, '@type') != 'BreadcrumbList':
continue
items = traverse_obj(jsonld, ('itemListElement', ..., 'name'))
if items:
return items
return []
class AbemaTVIE(AbemaTVBaseIE):
_VALID_URL = r'https?://abema\.tv/(?P<type>now-on-air|video/episode|channels/.+?/slots)/(?P<id>[^?/]+)'
_NETRC_MACHINE = 'abematv'
_TESTS = [{
'url': 'https://abema.tv/video/episode/194-25_s2_p1',
'info_dict': {
'id': '194-25_s2_p1',
'title': '第1話 「チーズケーキ」 「モーニング再び」',
'series': '異世界食堂2',
'series_number': 2,
'episode': '第1話 「チーズケーキ」 「モーニング再び」',
'episode_number': 1,
},
'skip': 'expired',
}, {
'url': 'https://abema.tv/channels/anime-live2/slots/E8tvAnMJ7a9a5d',
'info_dict': {
'id': 'E8tvAnMJ7a9a5d',
'title': 'ゆるキャン△ SEASON 全話一挙【無料ビデオ72時間】',
'series': 'ゆるキャン△ SEASON',
'episode': 'ゆるキャン△ SEASON 全話一挙【無料ビデオ72時間】',
'series_number': 2,
'episode_number': 1,
'description': 'md5:9c5a3172ae763278f9303922f0ea5b17',
},
'skip': 'expired',
}, {
'url': 'https://abema.tv/video/episode/87-877_s1282_p31047',
'info_dict': {
'id': 'E8tvAnMJ7a9a5d',
'title': '第5話『光射す』',
'description': 'md5:56d4fc1b4f7769ded5f923c55bb4695d',
'thumbnail': r're:https://hayabusa\.io/.+',
'series': '相棒',
'episode': '第5話『光射す』',
},
'skip': 'expired',
}, {
'url': 'https://abema.tv/now-on-air/abema-anime',
'info_dict': {
'id': 'abema-anime',
# this varies
# 'title': '女子高生の無駄づかい 全話一挙【無料ビデオ72時間】',
'description': 'md5:55f2e61f46a17e9230802d7bcc913d5f',
'is_live': True,
},
'skip': 'Not supported until yt-dlp implements native live downloader OR AbemaTV can start a local HTTP server',
}]
_TIMETABLE = None
def _perform_login(self, username, password):
self._get_device_token()
if self.cache.load(self._NETRC_MACHINE, username) and self._get_media_token():
self.write_debug('Skipping logging in')
return
if '@' in username: # don't strictly check if it's email address or not
ep, method = 'user/email', 'email'
else:
ep, method = 'oneTimePassword', 'userId'
login_response = self._download_json(
f'https://api.abema.io/v1/auth/{ep}', None, note='Logging in',
data=json.dumps({
method: username,
'password': password
}).encode('utf-8'), headers={
'Authorization': f'bearer {self._get_device_token()}',
'Origin': 'https://abema.tv',
'Referer': 'https://abema.tv/',
'Content-Type': 'application/json',
})
AbemaTVBaseIE._USERTOKEN = login_response['token']
self._get_media_token(True)
self.cache.store(self._NETRC_MACHINE, username, AbemaTVBaseIE._USERTOKEN)
def _real_extract(self, url):
# starting download using infojson from this extractor is undefined behavior,
# and never be fixed in the future; you must trigger downloads by directly specifying URL.
# (unless there's a way to hook before downloading by extractor)
video_id, video_type = self._match_valid_url(url).group('id', 'type')
headers = {
'Authorization': 'Bearer ' + self._get_device_token(),
}
video_type = video_type.split('/')[-1]
webpage = self._download_webpage(url, video_id)
canonical_url = self._search_regex(
r'<link\s+rel="canonical"\s*href="(.+?)"', webpage, 'canonical URL',
default=url)
info = self._search_json_ld(webpage, video_id, default={})
title = self._search_regex(
r'<span\s*class=".+?EpisodeTitleBlock__title">(.+?)</span>', webpage, 'title', default=None)
if not title:
jsonld = None
for jld in re.finditer(
r'(?is)<span\s*class="com-m-Thumbnail__image">(?:</span>)?<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
webpage):
jsonld = self._parse_json(jld.group('json_ld'), video_id, fatal=False)
if jsonld:
break
if jsonld:
title = jsonld.get('caption')
if not title and video_type == 'now-on-air':
if not self._TIMETABLE:
# cache the timetable because it goes to 5MiB in size (!!)
self._TIMETABLE = self._download_json(
'https://api.abema.io/v1/timetable/dataSet?debug=false', video_id,
headers=headers)
now = time_seconds(hours=9)
for slot in self._TIMETABLE.get('slots', []):
if slot.get('channelId') != video_id:
continue
if slot['startAt'] <= now and now < slot['endAt']:
title = slot['title']
break
# read breadcrumb on top of page
breadcrumb = self._extract_breadcrumb_list(webpage, video_id)
if breadcrumb:
# breadcrumb list translates to: (e.g. 1st test for this IE)
# Home > Anime (genre) > Isekai Shokudo 2 (series name) > Episode 1 "Cheese cakes" "Morning again" (episode title)
# hence this works
info['series'] = breadcrumb[-2]
info['episode'] = breadcrumb[-1]
if not title:
title = info['episode']
description = self._html_search_regex(
(r'<p\s+class="com-video-EpisodeDetailsBlock__content"><span\s+class=".+?">(.+?)</span></p><div',
r'<span\s+class=".+?SlotSummary.+?">(.+?)</span></div><div',),
webpage, 'description', default=None, group=1)
if not description:
og_desc = self._html_search_meta(
('description', 'og:description', 'twitter:description'), webpage)
if og_desc:
description = re.sub(r'''(?sx)
^(.+?)(?:
アニメの動画を無料で見るならABEMA| # anime
等、.+ # applies for most of categories
)?
''', r'\1', og_desc)
# canonical URL may contain series and episode number
mobj = re.search(r's(\d+)_p(\d+)$', canonical_url)
if mobj:
seri = int_or_none(mobj.group(1), default=float('inf'))
epis = int_or_none(mobj.group(2), default=float('inf'))
info['series_number'] = seri if seri < 100 else None
# some anime like Detective Conan (though not available in AbemaTV)
# has more than 1000 episodes (1026 as of 2021/11/15)
info['episode_number'] = epis if epis < 2000 else None
is_live, m3u8_url = False, None
if video_type == 'now-on-air':
is_live = True
channel_url = 'https://api.abema.io/v1/channels'
if video_id == 'news-global':
channel_url = update_url_query(channel_url, {'division': '1'})
onair_channels = self._download_json(channel_url, video_id)
for ch in onair_channels['channels']:
if video_id == ch['id']:
m3u8_url = ch['playback']['hls']
break
else:
raise ExtractorError(f'Cannot find on-air {video_id} channel.', expected=True)
elif video_type == 'episode':
api_response = self._download_json(
f'https://api.abema.io/v1/video/programs/{video_id}', video_id,
note='Checking playability',
headers=headers)
ondemand_types = traverse_obj(api_response, ('terms', ..., 'onDemandType'))
if 3 not in ondemand_types:
# cannot acquire decryption key for these streams
self.report_warning('This is a premium-only stream')
m3u8_url = f'https://vod-abematv.akamaized.net/program/{video_id}/playlist.m3u8'
elif video_type == 'slots':
api_response = self._download_json(
f'https://api.abema.io/v1/media/slots/{video_id}', video_id,
note='Checking playability',
headers=headers)
if not traverse_obj(api_response, ('slot', 'flags', 'timeshiftFree'), default=False):
self.report_warning('This is a premium-only stream')
m3u8_url = f'https://vod-abematv.akamaized.net/slot/{video_id}/playlist.m3u8'
else:
raise ExtractorError('Unreachable')
if is_live:
self.report_warning("This is a livestream; yt-dlp doesn't support downloading natively, but FFmpeg cannot handle m3u8 manifests from AbemaTV")
self.report_warning('Please consider using Streamlink to download these streams (https://github.com/streamlink/streamlink)')
formats = self._extract_m3u8_formats(
m3u8_url, video_id, ext='mp4', live=is_live)
info.update({
'id': video_id,
'title': title,
'description': description,
'formats': formats,
'is_live': is_live,
})
return info
class AbemaTVTitleIE(AbemaTVBaseIE):
_VALID_URL = r'https?://abema\.tv/video/title/(?P<id>[^?/]+)'
_PAGE_SIZE = 25
_TESTS = [{
'url': 'https://abema.tv/video/title/90-1597',
'info_dict': {
'id': '90-1597',
'title': 'シャッフルアイランド',
},
'playlist_mincount': 2,
}, {
'url': 'https://abema.tv/video/title/193-132',
'info_dict': {
'id': '193-132',
'title': '真心が届く~僕とスターのオフィス・ラブ!?~',
},
'playlist_mincount': 16,
}, {
'url': 'https://abema.tv/video/title/25-102',
'info_dict': {
'id': '25-102',
'title': 'ソードアート・オンライン アリシゼーション',
},
'playlist_mincount': 24,
}]
def _fetch_page(self, playlist_id, series_version, page):
programs = self._call_api(
f'v1/video/series/{playlist_id}/programs', playlist_id,
note=f'Downloading page {page + 1}',
query={
'seriesVersion': series_version,
'offset': str(page * self._PAGE_SIZE),
'order': 'seq',
'limit': str(self._PAGE_SIZE),
})
yield from (
self.url_result(f'https://abema.tv/video/episode/{x}')
for x in traverse_obj(programs, ('programs', ..., 'id')))
def _entries(self, playlist_id, series_version):
return OnDemandPagedList(
functools.partial(self._fetch_page, playlist_id, series_version),
self._PAGE_SIZE)
def _real_extract(self, url):
playlist_id = self._match_id(url)
series_info = self._call_api(f'v1/video/series/{playlist_id}', playlist_id)
return self.playlist_result(
self._entries(playlist_id, series_info['version']), playlist_id=playlist_id,
playlist_title=series_info.get('title'),
playlist_description=series_info.get('content'))

View File

@@ -1,5 +1,3 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor

View File

@@ -1,7 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
clean_html,

View File

@@ -0,0 +1,199 @@
from .common import InfoExtractor
from ..utils import (
float_or_none,
format_field,
int_or_none,
traverse_obj,
parse_codecs,
parse_qs,
)
class AcFunVideoBaseIE(InfoExtractor):
def _extract_metadata(self, video_id, video_info):
playjson = self._parse_json(video_info['ksPlayJson'], video_id)
formats, subtitles = [], {}
for video in traverse_obj(playjson, ('adaptationSet', 0, 'representation')):
fmts, subs = self._extract_m3u8_formats_and_subtitles(video['url'], video_id, 'mp4', fatal=False)
formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)
for f in fmts:
f.update({
'fps': float_or_none(video.get('frameRate')),
'width': int_or_none(video.get('width')),
'height': int_or_none(video.get('height')),
'tbr': float_or_none(video.get('avgBitrate')),
**parse_codecs(video.get('codecs', ''))
})
return {
'id': video_id,
'formats': formats,
'subtitles': subtitles,
'duration': float_or_none(video_info.get('durationMillis'), 1000),
'timestamp': int_or_none(video_info.get('uploadTime'), 1000),
'http_headers': {'Referer': 'https://www.acfun.cn/'},
}
class AcFunVideoIE(AcFunVideoBaseIE):
_VALID_URL = r'https?://www\.acfun\.cn/v/ac(?P<id>[_\d]+)'
_TESTS = [{
'url': 'https://www.acfun.cn/v/ac35457073',
'info_dict': {
'id': '35457073',
'ext': 'mp4',
'duration': 174.208,
'timestamp': 1656403967,
'title': '1 8 岁 现 状',
'description': '“赶紧回去!班主任查班了!”',
'uploader': '锤子game',
'uploader_id': '51246077',
'thumbnail': r're:^https?://.*\.(jpg|jpeg)',
'upload_date': '20220628',
'like_count': int,
'view_count': int,
'comment_count': int,
'tags': list,
},
}, {
# example for len(video_list) > 1
'url': 'https://www.acfun.cn/v/ac35468952_2',
'info_dict': {
'id': '35468952_2',
'ext': 'mp4',
'title': '【动画剧集】Rocket & Groot Season 12022/火箭浣熊与格鲁特第1季 P02 S01E02 十拿九穩',
'duration': 90.459,
'uploader': '比令',
'uploader_id': '37259967',
'upload_date': '20220629',
'timestamp': 1656479962,
'tags': list,
'like_count': int,
'view_count': int,
'comment_count': int,
'thumbnail': r're:^https?://.*\.(jpg|jpeg)',
'description': 'md5:67583aaf3a0f933bd606bc8a2d3ebb17',
}
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
json_all = self._search_json(r'window.videoInfo\s*=', webpage, 'videoInfo', video_id)
title = json_all.get('title')
video_list = json_all.get('videoList') or []
video_internal_id = traverse_obj(json_all, ('currentVideoInfo', 'id'))
if video_internal_id and len(video_list) > 1:
part_idx, part_video_info = next(
(idx + 1, v) for (idx, v) in enumerate(video_list)
if v['id'] == video_internal_id)
title = f'{title} P{part_idx:02d} {part_video_info["title"]}'
return {
**self._extract_metadata(video_id, json_all['currentVideoInfo']),
'title': title,
'thumbnail': json_all.get('coverUrl'),
'description': json_all.get('description'),
'uploader': traverse_obj(json_all, ('user', 'name')),
'uploader_id': traverse_obj(json_all, ('user', 'href')),
'tags': traverse_obj(json_all, ('tagList', ..., 'name')),
'view_count': int_or_none(json_all.get('viewCount')),
'like_count': int_or_none(json_all.get('likeCountShow')),
'comment_count': int_or_none(json_all.get('commentCountShow')),
}
class AcFunBangumiIE(AcFunVideoBaseIE):
_VALID_URL = r'https?://www\.acfun\.cn/bangumi/(?P<id>aa[_\d]+)'
_TESTS = [{
'url': 'https://www.acfun.cn/bangumi/aa6002917_36188_1745457?ac=2',
'info_dict': {
'id': 'aa6002917_36188_1745457__2',
'ext': 'mp4',
'title': '【7月】租借女友 水原千鹤角色曲『DATE』特别PV',
'upload_date': '20200916',
'timestamp': 1600243813,
'duration': 92.091,
},
}, {
'url': 'https://www.acfun.cn/bangumi/aa5023171_36188_1750645',
'info_dict': {
'id': 'aa5023171_36188_1750645',
'ext': 'mp4',
'title': '红孩儿之趴趴蛙寻石记 第5话 ',
'duration': 760.0,
'season': '红孩儿之趴趴蛙寻石记',
'season_id': 5023171,
'season_number': 1, # series has only 1 season
'episode': 'Episode 5',
'episode_number': 5,
'upload_date': '20181223',
'timestamp': 1545552185,
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)',
'comment_count': int,
},
}, {
'url': 'https://www.acfun.cn/bangumi/aa6065485_36188_1885061',
'info_dict': {
'id': 'aa6065485_36188_1885061',
'ext': 'mp4',
'title': '叽歪老表(第二季) 第5话 坚不可摧',
'season': '叽歪老表(第二季)',
'season_number': 2,
'season_id': 6065485,
'episode': '坚不可摧',
'episode_number': 5,
'upload_date': '20220324',
'timestamp': 1648082786,
'duration': 105.002,
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)',
'comment_count': int,
},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
ac_idx = parse_qs(url).get('ac', [None])[-1]
video_id = f'{video_id}{format_field(ac_idx, None, "__%s")}'
webpage = self._download_webpage(url, video_id)
json_bangumi_data = self._search_json(r'window.bangumiData\s*=', webpage, 'bangumiData', video_id)
if ac_idx:
video_info = json_bangumi_data['hlVideoInfo']
return {
**self._extract_metadata(video_id, video_info),
'title': video_info.get('title'),
}
video_info = json_bangumi_data['currentVideoInfo']
season_id = json_bangumi_data.get('bangumiId')
season_number = season_id and next((
idx for idx, v in enumerate(json_bangumi_data.get('relatedBangumis') or [], 1)
if v.get('id') == season_id), 1)
json_bangumi_list = self._search_json(
r'window\.bangumiList\s*=', webpage, 'bangumiList', video_id, fatal=False)
video_internal_id = int_or_none(traverse_obj(json_bangumi_data, ('currentVideoInfo', 'id')))
episode_number = video_internal_id and next((
idx for idx, v in enumerate(json_bangumi_list.get('items') or [], 1)
if v.get('videoId') == video_internal_id), None)
return {
**self._extract_metadata(video_id, video_info),
'title': json_bangumi_data.get('showTitle'),
'thumbnail': json_bangumi_data.get('image'),
'season': json_bangumi_data.get('bangumiTitle'),
'season_id': season_id,
'season_number': season_number,
'episode': json_bangumi_data.get('title'),
'episode_number': episode_number,
'comment_count': int_or_none(json_bangumi_data.get('commentCount')),
}

View File

@@ -1,6 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
import base64
import binascii
import json
@@ -31,30 +28,34 @@ from ..utils import (
class ADNIE(InfoExtractor):
IE_DESC = 'Anime Digital Network'
_VALID_URL = r'https?://(?:www\.)?animedigitalnetwork\.fr/video/[^/]+/(?P<id>\d+)'
_TEST = {
'url': 'http://animedigitalnetwork.fr/video/blue-exorcist-kyoto-saga/7778-episode-1-debut-des-hostilites',
'md5': '0319c99885ff5547565cacb4f3f9348d',
IE_DESC = 'Animation Digital Network'
_VALID_URL = r'https?://(?:www\.)?(?:animation|anime)digitalnetwork\.fr/video/[^/]+/(?P<id>\d+)'
_TESTS = [{
'url': 'https://animationdigitalnetwork.fr/video/fruits-basket/9841-episode-1-a-ce-soir',
'md5': '1c9ef066ceb302c86f80c2b371615261',
'info_dict': {
'id': '7778',
'id': '9841',
'ext': 'mp4',
'title': 'Blue Exorcist - Kyôto Saga - Episode 1',
'description': 'md5:2f7b5aa76edbc1a7a92cedcda8a528d5',
'series': 'Blue Exorcist - Kyôto Saga',
'duration': 1467,
'release_date': '20170106',
'title': 'Fruits Basket - Episode 1',
'description': 'md5:14be2f72c3c96809b0ca424b0097d336',
'series': 'Fruits Basket',
'duration': 1437,
'release_date': '20190405',
'comment_count': int,
'average_rating': float,
'season_number': 2,
'episode': 'Début des hostilités',
'season_number': 1,
'episode': 'À ce soir !',
'episode_number': 1,
}
}
},
'skip': 'Only available in region (FR, ...)',
}, {
'url': 'http://animedigitalnetwork.fr/video/blue-exorcist-kyoto-saga/7778-episode-1-debut-des-hostilites',
'only_matching': True,
}]
_NETRC_MACHINE = 'animedigitalnetwork'
_BASE_URL = 'http://animedigitalnetwork.fr'
_API_BASE_URL = 'https://gw.api.animedigitalnetwork.fr/'
_NETRC_MACHINE = 'animationdigitalnetwork'
_BASE = 'animationdigitalnetwork.fr'
_API_BASE_URL = 'https://gw.api.' + _BASE + '/'
_PLAYER_BASE_URL = _API_BASE_URL + 'player/'
_HEADERS = {}
_LOGIN_ERR_MESSAGE = 'Unable to log in'
@@ -78,14 +79,14 @@ class ADNIE(InfoExtractor):
if subtitle_location:
enc_subtitles = self._download_webpage(
subtitle_location, video_id, 'Downloading subtitles data',
fatal=False, headers={'Origin': 'https://animedigitalnetwork.fr'})
fatal=False, headers={'Origin': 'https://' + self._BASE})
if not enc_subtitles:
return None
# http://animedigitalnetwork.fr/components/com_vodvideo/videojs/adn-vjs.min.js
# http://animationdigitalnetwork.fr/components/com_vodvideo/videojs/adn-vjs.min.js
dec_subtitles = unpad_pkcs7(aes_cbc_decrypt_bytes(
compat_b64decode(enc_subtitles[24:]),
binascii.unhexlify(self._K + 'ab9f52f5baae7c72'),
binascii.unhexlify(self._K + '7fac1178830cfe0c'),
compat_b64decode(enc_subtitles[:24])))
subtitles_json = self._parse_json(dec_subtitles.decode(), None, fatal=False)
if not subtitles_json:
@@ -126,10 +127,7 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text'''
}])
return subtitles
def _real_initialize(self):
username, password = self._get_login_info()
if not username:
return
def _perform_login(self, username, password):
try:
access_token = (self._download_json(
self._API_BASE_URL + 'authentication/login', None,
@@ -170,7 +168,7 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text'''
}, data=b'')['token']
links_url = try_get(options, lambda x: x['video']['url']) or (video_base_url + 'link')
self._K = ''.join([random.choice('0123456789abcdef') for _ in range(16)])
self._K = ''.join(random.choices('0123456789abcdef', k=16))
message = bytes_to_intlist(json.dumps({
'k': self._K,
't': token,
@@ -237,7 +235,6 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text'''
for f in m3u8_formats:
f['language'] = 'fr'
formats.extend(m3u8_formats)
self._sort_formats(formats)
video = (self._download_json(
self._API_BASE_URL + 'video/%s' % video_id, video_id,

View File

@@ -1,6 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import (
compat_parse_qs,
@@ -14,7 +11,7 @@ class AdobeConnectIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
title = self._html_search_regex(r'<title>(.+?)</title>', webpage, 'title')
title = self._html_extract_title(webpage)
qs = compat_parse_qs(self._search_regex(r"swfUrl\s*=\s*'([^']+)'", webpage, 'swf url').split('?')[1])
is_live = qs.get('isLive', ['false'])[0] == 'true'
formats = []

View File

@@ -1,26 +1,20 @@
# coding: utf-8
from __future__ import unicode_literals
import getpass
import json
import re
import time
import urllib.error
import xml.etree.ElementTree as etree
from .common import InfoExtractor
from ..compat import (
compat_kwargs,
compat_urlparse,
compat_getpass
)
from ..compat import compat_urlparse
from ..utils import (
unescapeHTML,
urlencode_postdata,
unified_timestamp,
ExtractorError,
NO_DEFAULT,
ExtractorError,
unescapeHTML,
unified_timestamp,
urlencode_postdata,
)
MSO_INFO = {
'DTV': {
'name': 'DIRECTV',
@@ -1345,10 +1339,20 @@ MSO_INFO = {
'username_field': 'username',
'password_field': 'password',
},
'Suddenlink': {
'name': 'Suddenlink',
'username_field': 'username',
'password_field': 'password',
},
'AlticeOne': {
'name': 'Optimum TV',
'username_field': 'j_username',
'password_field': 'j_password',
},
}
class AdobePassIE(InfoExtractor):
class AdobePassIE(InfoExtractor): # XXX: Conventionally, base classes should end with BaseIE/InfoExtractor
_SERVICE_PROVIDER_TEMPLATE = 'https://sp.auth.adobe.com/adobe-services/%s'
_USER_AGENT = 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0'
_MVPD_CACHE = 'ap-mvpd'
@@ -1360,7 +1364,7 @@ class AdobePassIE(InfoExtractor):
headers.update(kwargs.get('headers', {}))
kwargs['headers'] = headers
return super(AdobePassIE, self)._download_webpage_handle(
*args, **compat_kwargs(kwargs))
*args, **kwargs)
@staticmethod
def _get_mvpd_resource(provider_id, title, guid, rating):
@@ -1429,32 +1433,34 @@ class AdobePassIE(InfoExtractor):
guid = xml_text(resource, 'guid') if '<' in resource else resource
count = 0
while count < 2:
requestor_info = self._downloader.cache.load(self._MVPD_CACHE, requestor_id) or {}
requestor_info = self.cache.load(self._MVPD_CACHE, requestor_id) or {}
authn_token = requestor_info.get('authn_token')
if authn_token and is_expired(authn_token, 'simpleTokenExpires'):
authn_token = None
if not authn_token:
# TODO add support for other TV Providers
mso_id = self.get_param('ap_mso')
if mso_id:
username, password = self._get_login_info('ap_username', 'ap_password', mso_id)
if not username or not password:
raise_mvpd_required()
mso_info = MSO_INFO[mso_id]
provider_redirect_page_res = self._download_webpage_handle(
self._SERVICE_PROVIDER_TEMPLATE % 'authenticate/saml', video_id,
'Downloading Provider Redirect Page', query={
'noflash': 'true',
'mso_id': mso_id,
'requestor_id': requestor_id,
'no_iframe': 'false',
'domain_name': 'adobe.com',
'redirect_url': url,
})
elif not self._cookies_passed:
raise_mvpd_required()
if not mso_id:
raise_mvpd_required()
username, password = self._get_login_info('ap_username', 'ap_password', mso_id)
if not username or not password:
raise_mvpd_required()
mso_info = MSO_INFO[mso_id]
provider_redirect_page_res = self._download_webpage_handle(
self._SERVICE_PROVIDER_TEMPLATE % 'authenticate/saml', video_id,
'Downloading Provider Redirect Page', query={
'noflash': 'true',
'mso_id': mso_id,
'requestor_id': requestor_id,
'no_iframe': 'false',
'domain_name': 'adobe.com',
'redirect_url': url,
})
if mso_id == 'Comcast_SSO':
pass
elif mso_id == 'Comcast_SSO':
# Comcast page flow varies by video site and whether you
# are on Comcast's network.
provider_redirect_page, urlh = provider_redirect_page_res
@@ -1502,7 +1508,7 @@ class AdobePassIE(InfoExtractor):
'send_confirm_link': False,
'send_token': True
}))
philo_code = compat_getpass('Type auth code you have received [Return]: ')
philo_code = getpass.getpass('Type auth code you have received [Return]: ')
self._download_webpage(
'https://idp.philo.com/auth/update/login_code', video_id, 'Submitting token', data=urlencode_postdata({
'token': philo_code
@@ -1635,6 +1641,58 @@ class AdobePassIE(InfoExtractor):
urlh.geturl(), video_id, 'Sending final bookend',
query=hidden_data)
post_form(mvpd_confirm_page_res, 'Confirming Login')
elif mso_id == 'Suddenlink':
# Suddenlink is similar to SlingTV in using a tab history count and a meta refresh,
# but they also do a dynmaic redirect using javascript that has to be followed as well
first_bookend_page, urlh = post_form(
provider_redirect_page_res, 'Pressing Continue...')
hidden_data = self._hidden_inputs(first_bookend_page)
hidden_data['history_val'] = 1
provider_login_redirect_page_res = self._download_webpage_handle(
urlh.geturl(), video_id, 'Sending First Bookend',
query=hidden_data)
provider_login_redirect_page, urlh = provider_login_redirect_page_res
# Some website partners seem to not have the extra ajaxurl redirect step, so we check if we already
# have the login prompt or not
if 'id="password" type="password" name="password"' in provider_login_redirect_page:
provider_login_page_res = provider_login_redirect_page_res
else:
provider_tryauth_url = self._html_search_regex(
r'url:\s*[\'"]([^\'"]+)', provider_login_redirect_page, 'ajaxurl')
provider_tryauth_page = self._download_webpage(
provider_tryauth_url, video_id, 'Submitting TryAuth',
query=hidden_data)
provider_login_page_res = self._download_webpage_handle(
f'https://authorize.suddenlink.net/saml/module.php/authSynacor/login.php?AuthState={provider_tryauth_page}',
video_id, 'Getting Login Page',
query=hidden_data)
provider_association_redirect, urlh = post_form(
provider_login_page_res, 'Logging in', {
mso_info['username_field']: username,
mso_info['password_field']: password
})
provider_refresh_redirect_url = extract_redirect_url(
provider_association_redirect, url=urlh.geturl())
last_bookend_page, urlh = self._download_webpage_handle(
provider_refresh_redirect_url, video_id,
'Downloading Auth Association Redirect Page')
hidden_data = self._hidden_inputs(last_bookend_page)
hidden_data['history_val'] = 3
mvpd_confirm_page_res = self._download_webpage_handle(
urlh.geturl(), video_id, 'Sending Final Bookend',
query=hidden_data)
post_form(mvpd_confirm_page_res, 'Confirming Login')
else:
# Some providers (e.g. DIRECTV NOW) have another meta refresh
@@ -1652,25 +1710,30 @@ class AdobePassIE(InfoExtractor):
mso_info.get('username_field', 'username'): username,
mso_info.get('password_field', 'password'): password
}
if mso_id == 'Cablevision':
if mso_id in ('Cablevision', 'AlticeOne'):
form_data['_eventId_proceed'] = ''
mvpd_confirm_page_res = post_form(provider_login_page_res, 'Logging in', form_data)
if mso_id != 'Rogers':
post_form(mvpd_confirm_page_res, 'Confirming Login')
session = self._download_webpage(
self._SERVICE_PROVIDER_TEMPLATE % 'session', video_id,
'Retrieving Session', data=urlencode_postdata({
'_method': 'GET',
'requestor_id': requestor_id,
}), headers=mvpd_headers)
try:
session = self._download_webpage(
self._SERVICE_PROVIDER_TEMPLATE % 'session', video_id,
'Retrieving Session', data=urlencode_postdata({
'_method': 'GET',
'requestor_id': requestor_id,
}), headers=mvpd_headers)
except ExtractorError as e:
if not mso_id and isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 401:
raise_mvpd_required()
raise
if '<pendingLogout' in session:
self._downloader.cache.store(self._MVPD_CACHE, requestor_id, {})
self.cache.store(self._MVPD_CACHE, requestor_id, {})
count += 1
continue
authn_token = unescapeHTML(xml_text(session, 'authnToken'))
requestor_info['authn_token'] = authn_token
self._downloader.cache.store(self._MVPD_CACHE, requestor_id, requestor_info)
self.cache.store(self._MVPD_CACHE, requestor_id, requestor_info)
authz_token = requestor_info.get(guid)
if authz_token and is_expired(authz_token, 'simpleTokenTTL'):
@@ -1686,14 +1749,14 @@ class AdobePassIE(InfoExtractor):
'userMeta': '1',
}), headers=mvpd_headers)
if '<pendingLogout' in authorize:
self._downloader.cache.store(self._MVPD_CACHE, requestor_id, {})
self.cache.store(self._MVPD_CACHE, requestor_id, {})
count += 1
continue
if '<error' in authorize:
raise ExtractorError(xml_text(authorize, 'details'), expected=True)
authz_token = unescapeHTML(xml_text(authorize, 'authzToken'))
requestor_info[guid] = authz_token
self._downloader.cache.store(self._MVPD_CACHE, requestor_id, requestor_info)
self.cache.store(self._MVPD_CACHE, requestor_id, requestor_info)
mvpd_headers.update({
'ap_19': xml_text(authn_token, 'simpleSamlNameID'),
@@ -1709,7 +1772,7 @@ class AdobePassIE(InfoExtractor):
'hashed_guid': 'false',
}), headers=mvpd_headers)
if '<pendingLogout' in short_authorize:
self._downloader.cache.store(self._MVPD_CACHE, requestor_id, {})
self.cache.store(self._MVPD_CACHE, requestor_id, {})
count += 1
continue
return short_authorize

View File

@@ -1,5 +1,3 @@
from __future__ import unicode_literals
import functools
import re
@@ -72,7 +70,6 @@ class AdobeTVBaseIE(InfoExtractor):
})
s3_extracted = True
formats.append(f)
self._sort_formats(formats)
return {
'id': video_id,
@@ -234,6 +231,7 @@ class AdobeTVChannelIE(AdobeTVPlaylistBaseIE):
class AdobeTVVideoIE(AdobeTVBaseIE):
IE_NAME = 'adobetv:video'
_VALID_URL = r'https?://video\.tv\.adobe\.com/v/(?P<id>\d+)'
_EMBED_REGEX = [r'<iframe[^>]+src=[\'"](?P<url>(?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]']
_TEST = {
# From https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners
@@ -270,7 +268,6 @@ class AdobeTVVideoIE(AdobeTVBaseIE):
'width': int_or_none(source.get('width') or None),
'url': source_src,
})
self._sort_formats(formats)
# For both metadata and downloaded files the duration varies among
# formats. I just pick the max one

View File

@@ -1,6 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
import json
from .turner import TurnerBaseIE
@@ -183,7 +180,6 @@ class AdultSwimIE(TurnerBaseIE):
info['subtitles'].setdefault('en', []).append({
'url': asset_url,
})
self._sort_formats(info['formats'])
return info
else:

View File

@@ -1,7 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
from .theplatform import ThePlatformIE
from ..utils import (
ExtractorError,
@@ -12,7 +8,7 @@ from ..utils import (
)
class AENetworksBaseIE(ThePlatformIE):
class AENetworksBaseIE(ThePlatformIE): # XXX: Do not subclass from concrete IE
_BASE_URL_REGEX = r'''(?x)https?://
(?:(?:www|play|watch)\.)?
(?P<domain>
@@ -32,14 +28,17 @@ class AENetworksBaseIE(ThePlatformIE):
}
def _extract_aen_smil(self, smil_url, video_id, auth=None):
query = {'mbr': 'true'}
query = {
'mbr': 'true',
'formats': 'M3U+none,MPEG-DASH+none,MPEG4,MP3',
}
if auth:
query['auth'] = auth
TP_SMIL_QUERY = [{
'assetTypes': 'high_video_ak',
'switch': 'hls_high_ak'
'switch': 'hls_high_ak',
}, {
'assetTypes': 'high_video_s3'
'assetTypes': 'high_video_s3',
}, {
'assetTypes': 'high_video_s3',
'switch': 'hls_high_fastly',
@@ -63,7 +62,6 @@ class AENetworksBaseIE(ThePlatformIE):
subtitles = self._merge_subtitles(subtitles, tp_subtitles)
if last_e and not formats:
raise last_e
self._sort_formats(formats)
return {
'id': video_id,
'formats': formats,
@@ -305,7 +303,6 @@ class HistoryTopicIE(AENetworksBaseIE):
class HistoryPlayerIE(AENetworksBaseIE):
IE_NAME = 'history:player'
_VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:history|biography)\.com)/player/(?P<id>\d+)'
_TESTS = []
def _real_extract(self, url):
domain, video_id = self._match_valid_url(url).groups()

View File

@@ -0,0 +1,40 @@
from .common import InfoExtractor
from .vimeo import VimeoIE
class AeonCoIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?aeon\.co/videos/(?P<id>[^/?]+)'
_TESTS = [{
'url': 'https://aeon.co/videos/raw-solar-storm-footage-is-the-punk-rock-antidote-to-sleek-james-webb-imagery',
'md5': 'e5884d80552c9b6ea8d268a258753362',
'info_dict': {
'id': '1284717',
'ext': 'mp4',
'title': 'Brilliant Noise',
'thumbnail': 'https://i.vimeocdn.com/video/21006315-1a1e49da8b07fd908384a982b4ba9ff0268c509a474576ebdf7b1392f4acae3b-d_960',
'uploader': 'Semiconductor',
'uploader_id': 'semiconductor',
'uploader_url': 'https://vimeo.com/semiconductor',
'duration': 348
}
}, {
'url': 'https://aeon.co/videos/dazzling-timelapse-shows-how-microbes-spoil-our-food-and-sometimes-enrich-it',
'md5': '4e5f3dad9dbda0dbfa2da41a851e631e',
'info_dict': {
'id': '728595228',
'ext': 'mp4',
'title': 'Wrought',
'thumbnail': 'https://i.vimeocdn.com/video/1484618528-c91452611f9a4e4497735a533da60d45b2fe472deb0c880f0afaab0cd2efb22a-d_1280',
'uploader': 'Biofilm Productions',
'uploader_id': 'user140352216',
'uploader_url': 'https://vimeo.com/user140352216',
'duration': 1344
}
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
vimeo_id = self._search_regex(r'hosterId":\s*"(?P<id>[0-9]+)', webpage, 'vimeo id')
vimeo_url = VimeoIE._smuggle_referrer(f'https://player.vimeo.com/video/{vimeo_id}', 'https://aeon.co')
return self.url_result(vimeo_url, VimeoIE)

View File

@@ -1,14 +1,12 @@
# coding: utf-8
from __future__ import unicode_literals
import functools
import re
from .common import InfoExtractor
from ..compat import compat_xpath
from ..utils import (
ExtractorError,
OnDemandPagedList,
date_from_str,
determine_ext,
ExtractorError,
int_or_none,
qualities,
traverse_obj,
@@ -32,7 +30,7 @@ class AfreecaTVIE(InfoExtractor):
/app/(?:index|read_ucc_bbs)\.cgi|
/player/[Pp]layer\.(?:swf|html)
)\?.*?\bnTitleNo=|
vod\.afreecatv\.com/PLAYER/STATION/
vod\.afreecatv\.com/(PLAYER/STATION|player)/
)
(?P<id>\d+)
'''
@@ -170,6 +168,9 @@ class AfreecaTVIE(InfoExtractor):
}, {
'url': 'http://vod.afreecatv.com/PLAYER/STATION/15055030',
'only_matching': True,
}, {
'url': 'http://vod.afreecatv.com/player/15055030',
'only_matching': True,
}]
@staticmethod
@@ -181,14 +182,7 @@ class AfreecaTVIE(InfoExtractor):
video_key['part'] = int(m.group('part'))
return video_key
def _real_initialize(self):
self._login()
def _login(self):
username, password = self._get_login_info()
if username is None:
return
def _perform_login(self, username, password):
login_form = {
'szWork': 'login',
'szType': 'json',
@@ -284,7 +278,7 @@ class AfreecaTVIE(InfoExtractor):
else:
raise ExtractorError('Unable to download video info')
video_element = video_xml.findall(compat_xpath('./track/video'))[-1]
video_element = video_xml.findall('./track/video')[-1]
if video_element is None or video_element.text is None:
raise ExtractorError(
'Video %s does not exist' % video_id, expected=True)
@@ -314,7 +308,7 @@ class AfreecaTVIE(InfoExtractor):
if not video_url:
entries = []
file_elements = video_element.findall(compat_xpath('./file'))
file_elements = video_element.findall('./file')
one = len(file_elements) == 1
for file_num, file_element in enumerate(file_elements, start=1):
file_url = url_or_none(file_element.text)
@@ -344,7 +338,6 @@ class AfreecaTVIE(InfoExtractor):
}]
if not formats and not self.get_param('ignore_no_formats'):
continue
self._sort_formats(formats)
file_info = common_entry.copy()
file_info.update({
'id': format_id,
@@ -386,7 +379,7 @@ class AfreecaTVIE(InfoExtractor):
return info
class AfreecaTVLiveIE(AfreecaTVIE):
class AfreecaTVLiveIE(AfreecaTVIE): # XXX: Do not subclass from concrete IE
IE_NAME = 'afreecatv:live'
_VALID_URL = r'https?://play\.afreeca(?:tv)?\.com/(?P<id>[^/]+)(?:/(?P<bno>\d+))?'
@@ -416,26 +409,35 @@ class AfreecaTVLiveIE(AfreecaTVIE):
def _real_extract(self, url):
broadcaster_id, broadcast_no = self._match_valid_url(url).group('id', 'bno')
password = self.get_param('videopassword')
info = self._download_json(self._LIVE_API_URL, broadcaster_id, fatal=False,
data=urlencode_postdata({'bid': broadcaster_id})) or {}
channel_info = info.get('CHANNEL') or {}
broadcaster_id = channel_info.get('BJID') or broadcaster_id
broadcast_no = channel_info.get('BNO') or broadcast_no
password_protected = channel_info.get('BPWD')
if not broadcast_no:
raise ExtractorError(f'Unable to extract broadcast number ({broadcaster_id} may not be live)', expected=True)
if password_protected == 'Y' and password is None:
raise ExtractorError(
'This livestream is protected by a password, use the --video-password option',
expected=True)
formats = []
quality_key = qualities(self._QUALITIES)
for quality_str in self._QUALITIES:
params = {
'bno': broadcast_no,
'stream_type': 'common',
'type': 'aid',
'quality': quality_str,
}
if password is not None:
params['pwd'] = password
aid_response = self._download_json(
self._LIVE_API_URL, broadcast_no, fatal=False,
data=urlencode_postdata({
'bno': broadcast_no,
'stream_type': 'common',
'type': 'aid',
'quality': quality_str,
}),
data=urlencode_postdata(params),
note=f'Downloading access token for {quality_str} stream',
errnote=f'Unable to download access token for {quality_str} stream')
aid = traverse_obj(aid_response, ('CHANNEL', 'AID'))
@@ -461,8 +463,6 @@ class AfreecaTVLiveIE(AfreecaTVIE):
'quality': quality_key(quality_str),
})
self._sort_formats(formats)
station_info = self._download_json(
'https://st.afreecatv.com/api/get_station_status.php', broadcast_no,
query={'szBjId': broadcaster_id}, fatal=False,
@@ -477,3 +477,57 @@ class AfreecaTVLiveIE(AfreecaTVIE):
'formats': formats,
'is_live': True,
}
class AfreecaTVUserIE(InfoExtractor):
IE_NAME = 'afreecatv:user'
_VALID_URL = r'https?://bj\.afreeca(?:tv)?\.com/(?P<id>[^/]+)/vods/?(?P<slug_type>[^/]+)?'
_TESTS = [{
'url': 'https://bj.afreecatv.com/ryuryu24/vods/review',
'info_dict': {
'_type': 'playlist',
'id': 'ryuryu24',
'title': 'ryuryu24 - review',
},
'playlist_count': 218,
}, {
'url': 'https://bj.afreecatv.com/parang1995/vods/highlight',
'info_dict': {
'_type': 'playlist',
'id': 'parang1995',
'title': 'parang1995 - highlight',
},
'playlist_count': 997,
}, {
'url': 'https://bj.afreecatv.com/ryuryu24/vods',
'info_dict': {
'_type': 'playlist',
'id': 'ryuryu24',
'title': 'ryuryu24 - all',
},
'playlist_count': 221,
}, {
'url': 'https://bj.afreecatv.com/ryuryu24/vods/balloonclip',
'info_dict': {
'_type': 'playlist',
'id': 'ryuryu24',
'title': 'ryuryu24 - balloonclip',
},
'playlist_count': 0,
}]
_PER_PAGE = 60
def _fetch_page(self, user_id, user_type, page):
page += 1
info = self._download_json(f'https://bjapi.afreecatv.com/api/{user_id}/vods/{user_type}', user_id,
query={'page': page, 'per_page': self._PER_PAGE, 'orderby': 'reg_date'},
note=f'Downloading {user_type} video page {page}')
for item in info['data']:
yield self.url_result(
f'https://vod.afreecatv.com/player/{item["title_no"]}/', AfreecaTVIE, item['title_no'])
def _real_extract(self, url):
user_id, user_type = self._match_valid_url(url).group('id', 'slug_type')
user_type = user_type or 'all'
entries = OnDemandPagedList(functools.partial(self._fetch_page, user_id, user_type), self._PER_PAGE)
return self.playlist_result(entries, user_id, f'{user_id} - {user_type}')

View File

@@ -0,0 +1,251 @@
import functools
import uuid
from .common import InfoExtractor
from ..utils import (
ExtractorError,
OnDemandPagedList,
int_or_none,
month_by_name,
parse_duration,
try_call,
)
class WyborczaVideoIE(InfoExtractor):
# this id is not an article id, it has to be extracted from the article
_VALID_URL = r'(?:wyborcza:video:|https?://wyborcza\.pl/(?:api-)?video/)(?P<id>\d+)'
IE_NAME = 'wyborcza:video'
_TESTS = [{
'url': 'wyborcza:video:26207634',
'info_dict': {
'id': '26207634',
'ext': 'mp4',
'title': '- Polska w 2020 r. jest innym państwem niż w 2015 r. Nie zmieniła się konstytucja, ale jest to już inny ustrój - mówi Adam Bodnar',
'description': ' ',
'uploader': 'Dorota Roman',
'duration': 2474,
'thumbnail': r're:https://.+\.jpg',
},
}, {
'url': 'https://wyborcza.pl/video/26207634',
'only_matching': True,
}, {
'url': 'https://wyborcza.pl/api-video/26207634',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
meta = self._download_json(f'https://wyborcza.pl/api-video/{video_id}', video_id)
formats = []
base_url = meta['redirector'].replace('http://', 'https://') + meta['basePath']
for quality in ('standard', 'high'):
if not meta['files'].get(quality):
continue
formats.append({
'url': base_url + meta['files'][quality],
'height': int_or_none(
self._search_regex(
r'p(\d+)[a-z]+\.mp4$', meta['files'][quality],
'mp4 video height', default=None)),
'format_id': quality,
})
if meta['files'].get('dash'):
formats.extend(self._extract_mpd_formats(base_url + meta['files']['dash'], video_id))
return {
'id': video_id,
'formats': formats,
'title': meta.get('title'),
'description': meta.get('lead'),
'uploader': meta.get('signature'),
'thumbnail': meta.get('imageUrl'),
'duration': meta.get('duration'),
}
class WyborczaPodcastIE(InfoExtractor):
_VALID_URL = r'''(?x)
https?://(?:www\.)?(?:
wyborcza\.pl/podcast(?:/0,172673\.html)?|
wysokieobcasy\.pl/wysokie-obcasy/0,176631\.html
)(?:\?(?:[^&#]+?&)*podcast=(?P<id>\d+))?
'''
_TESTS = [{
'url': 'https://wyborcza.pl/podcast/0,172673.html?podcast=100720#S.main_topic-K.C-B.6-L.1.podcast',
'info_dict': {
'id': '100720',
'ext': 'mp3',
'title': 'Cyfrodziewczyny. Kim były pionierki polskiej informatyki ',
'uploader': 'Michał Nogaś ',
'upload_date': '20210117',
'description': 'md5:49f0a06ffc4c1931210d3ab1416a651d',
'duration': 3684.0,
'thumbnail': r're:https://.+\.jpg',
},
}, {
'url': 'https://www.wysokieobcasy.pl/wysokie-obcasy/0,176631.html?podcast=100673',
'info_dict': {
'id': '100673',
'ext': 'mp3',
'title': 'Czym jest ubóstwo menstruacyjne i dlaczego dotyczy każdej i każdego z nas?',
'uploader': 'Agnieszka Urazińska ',
'upload_date': '20210115',
'description': 'md5:c161dc035f8dbb60077011fc41274899',
'duration': 1803.0,
'thumbnail': r're:https://.+\.jpg',
},
}, {
'url': 'https://wyborcza.pl/podcast',
'info_dict': {
'id': '334',
'title': 'Gościnnie: Wyborcza, 8:10',
'series': 'Gościnnie: Wyborcza, 8:10',
},
'playlist_mincount': 370,
}, {
'url': 'https://www.wysokieobcasy.pl/wysokie-obcasy/0,176631.html',
'info_dict': {
'id': '395',
'title': 'Gościnnie: Wysokie Obcasy',
'series': 'Gościnnie: Wysokie Obcasy',
},
'playlist_mincount': 12,
}]
def _real_extract(self, url):
podcast_id = self._match_id(url)
if not podcast_id: # playlist
podcast_id = '395' if 'wysokieobcasy.pl/' in url else '334'
return self.url_result(TokFMAuditionIE._create_url(podcast_id), TokFMAuditionIE, podcast_id)
meta = self._download_json('https://wyborcza.pl/api/podcast', podcast_id,
query={'guid': podcast_id, 'type': 'wo' if 'wysokieobcasy.pl/' in url else None})
day, month, year = self._search_regex(r'^(\d\d?) (\w+) (\d{4})$', meta.get('publishedDate'),
'upload date', group=(1, 2, 3), default=(None, None, None))
return {
'id': podcast_id,
'url': meta['url'],
'title': meta.get('title'),
'description': meta.get('description'),
'thumbnail': meta.get('imageUrl'),
'duration': parse_duration(meta.get('duration')),
'uploader': meta.get('author'),
'upload_date': try_call(lambda: f'{year}{month_by_name(month, lang="pl"):0>2}{day:0>2}'),
}
class TokFMPodcastIE(InfoExtractor):
_VALID_URL = r'(?:https?://audycje\.tokfm\.pl/podcast/|tokfm:podcast:)(?P<id>\d+),?'
IE_NAME = 'tokfm:podcast'
_TESTS = [{
'url': 'https://audycje.tokfm.pl/podcast/91275,-Systemowy-rasizm-Czy-zamieszki-w-USA-po-morderstwie-w-Minneapolis-doprowadza-do-zmian-w-sluzbach-panstwowych',
'info_dict': {
'id': '91275',
'ext': 'aac',
'title': 'md5:a9b15488009065556900169fb8061cce',
'episode': 'md5:a9b15488009065556900169fb8061cce',
'series': 'Analizy',
},
}]
def _real_extract(self, url):
media_id = self._match_id(url)
# in case it breaks see this but it returns a lot of useless data
# https://api.podcast.radioagora.pl/api4/getPodcasts?podcast_id=100091&with_guests=true&with_leaders_for_mobile=true
metadata = self._download_json(
f'https://audycje.tokfm.pl/getp/3{media_id}', media_id, 'Downloading podcast metadata')
if not metadata:
raise ExtractorError('No such podcast', expected=True)
metadata = metadata[0]
formats = []
for ext in ('aac', 'mp3'):
url_data = self._download_json(
f'https://api.podcast.radioagora.pl/api4/getSongUrl?podcast_id={media_id}&device_id={uuid.uuid4()}&ppre=false&audio={ext}',
media_id, 'Downloading podcast %s URL' % ext)
# prevents inserting the mp3 (default) multiple times
if 'link_ssl' in url_data and f'.{ext}' in url_data['link_ssl']:
formats.append({
'url': url_data['link_ssl'],
'ext': ext,
'vcodec': 'none',
'acodec': ext,
})
return {
'id': media_id,
'formats': formats,
'title': metadata.get('podcast_name'),
'series': metadata.get('series_name'),
'episode': metadata.get('podcast_name'),
}
class TokFMAuditionIE(InfoExtractor):
_VALID_URL = r'(?:https?://audycje\.tokfm\.pl/audycja/|tokfm:audition:)(?P<id>\d+),?'
IE_NAME = 'tokfm:audition'
_TESTS = [{
'url': 'https://audycje.tokfm.pl/audycja/218,Analizy',
'info_dict': {
'id': '218',
'title': 'Analizy',
'series': 'Analizy',
},
'playlist_count': 1635,
}]
_PAGE_SIZE = 30
_HEADERS = {
'User-Agent': 'Mozilla/5.0 (Linux; Android 9; Redmi 3S Build/PQ3A.190801.002; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/87.0.4280.101 Mobile Safari/537.36',
}
@staticmethod
def _create_url(id):
return f'https://audycje.tokfm.pl/audycja/{id}'
def _real_extract(self, url):
audition_id = self._match_id(url)
data = self._download_json(
f'https://api.podcast.radioagora.pl/api4/getSeries?series_id={audition_id}',
audition_id, 'Downloading audition metadata', headers=self._HEADERS)
if not data:
raise ExtractorError('No such audition', expected=True)
data = data[0]
entries = OnDemandPagedList(functools.partial(
self._fetch_page, audition_id, data), self._PAGE_SIZE)
return {
'_type': 'playlist',
'id': audition_id,
'title': data.get('series_name'),
'series': data.get('series_name'),
'entries': entries,
}
def _fetch_page(self, audition_id, data, page):
for retry in self.RetryManager():
podcast_page = self._download_json(
f'https://api.podcast.radioagora.pl/api4/getPodcasts?series_id={audition_id}&limit=30&offset={page}&with_guests=true&with_leaders_for_mobile=true',
audition_id, f'Downloading podcast list page {page + 1}', headers=self._HEADERS)
if not podcast_page:
retry.error = ExtractorError('Agora returned empty page', expected=True)
for podcast in podcast_page:
yield {
'_type': 'url_transparent',
'url': podcast['podcast_sharing_url'],
'ie_key': TokFMPodcastIE.ie_key(),
'title': podcast.get('podcast_name'),
'episode': podcast.get('podcast_name'),
'description': podcast.get('podcast_description'),
'timestamp': int_or_none(podcast.get('podcast_timestamp')),
'series': data.get('series_name'),
}

View File

@@ -1,6 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor

View File

@@ -0,0 +1,96 @@
from .common import InfoExtractor
from .youtube import YoutubeIE
from ..utils import (
determine_ext,
int_or_none,
mimetype2ext,
parse_iso8601,
traverse_obj
)
class AirTVIE(InfoExtractor):
_VALID_URL = r'https?://www\.air\.tv/watch\?v=(?P<id>\w+)'
_TESTS = [{
# without youtube_id
'url': 'https://www.air.tv/watch?v=W87jcWleSn2hXZN47zJZsQ',
'info_dict': {
'id': 'W87jcWleSn2hXZN47zJZsQ',
'ext': 'mp4',
'release_date': '20221003',
'release_timestamp': 1664792603,
'channel_id': 'vgfManQlRQKgoFQ8i8peFQ',
'title': 'md5:c12d49ed367c3dadaa67659aff43494c',
'upload_date': '20221003',
'duration': 151,
'view_count': int,
'thumbnail': 'https://cdn-sp-gcs.air.tv/videos/W/8/W87jcWleSn2hXZN47zJZsQ/b13fc56464f47d9d62a36d110b9b5a72-4096x2160_9.jpg',
'timestamp': 1664792603,
}
}, {
# with youtube_id
'url': 'https://www.air.tv/watch?v=sv57EC8tRXG6h8dNXFUU1Q',
'info_dict': {
'id': '2ZTqmpee-bQ',
'ext': 'mp4',
'comment_count': int,
'tags': 'count:11',
'channel_follower_count': int,
'like_count': int,
'uploader': 'Newsflare',
'thumbnail': 'https://i.ytimg.com/vi_webp/2ZTqmpee-bQ/maxresdefault.webp',
'availability': 'public',
'title': 'Geese Chase Alligator Across Golf Course',
'uploader_id': 'NewsflareBreaking',
'channel_url': 'https://www.youtube.com/channel/UCzSSoloGEz10HALUAbYhngQ',
'description': 'md5:99b21d9cea59330149efbd9706e208f5',
'age_limit': 0,
'channel_id': 'UCzSSoloGEz10HALUAbYhngQ',
'uploader_url': 'http://www.youtube.com/user/NewsflareBreaking',
'view_count': int,
'categories': ['News & Politics'],
'live_status': 'not_live',
'playable_in_embed': True,
'channel': 'Newsflare',
'duration': 37,
'upload_date': '20180511',
}
}]
def _get_formats_and_subtitle(self, json_data, video_id):
formats, subtitles = [], {}
for source in traverse_obj(json_data, 'sources', 'sources_desktop', ...):
ext = determine_ext(source.get('src'), mimetype2ext(source.get('type')))
if ext == 'm3u8':
fmts, subs = self._extract_m3u8_formats_and_subtitles(source.get('src'), video_id)
formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)
else:
formats.append({'url': source.get('src'), 'ext': ext})
return formats, subtitles
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
nextjs_json = self._search_nextjs_data(webpage, display_id)['props']['pageProps']['initialState']['videos'][display_id]
if nextjs_json.get('youtube_id'):
return self.url_result(
f'https://www.youtube.com/watch?v={nextjs_json.get("youtube_id")}', YoutubeIE)
formats, subtitles = self._get_formats_and_subtitle(nextjs_json, display_id)
return {
'id': display_id,
'title': nextjs_json.get('title') or self._html_search_meta('og:title', webpage),
'formats': formats,
'subtitles': subtitles,
'description': nextjs_json.get('description') or None,
'duration': int_or_none(nextjs_json.get('duration')),
'thumbnails': [
{'url': thumbnail}
for thumbnail in traverse_obj(nextjs_json, ('default_thumbnails', ...))],
'channel_id': traverse_obj(nextjs_json, 'channel', 'channel_slug'),
'timestamp': parse_iso8601(nextjs_json.get('created')),
'release_timestamp': parse_iso8601(nextjs_json.get('published')),
'view_count': int_or_none(nextjs_json.get('views')),
}

View File

@@ -0,0 +1,60 @@
from .common import InfoExtractor
from ..utils import int_or_none, merge_dicts
class AitubeKZVideoIE(InfoExtractor):
_VALID_URL = r'https?://aitube\.kz/(?:video|embed/)\?(?:[^\?]+)?id=(?P<id>[\w-]+)'
_TESTS = [{
# id paramater as first parameter
'url': 'https://aitube.kz/video?id=9291d29b-c038-49a1-ad42-3da2051d353c&playlistId=d55b1f5f-ef2a-4f23-b646-2a86275b86b7&season=1',
'info_dict': {
'id': '9291d29b-c038-49a1-ad42-3da2051d353c',
'ext': 'mp4',
'duration': 2174.0,
'channel_id': '94962f73-013b-432c-8853-1bd78ca860fe',
'like_count': int,
'channel': 'ASTANA TV',
'comment_count': int,
'view_count': int,
'description': 'Смотреть любимые сериалы и видео, поделиться видео и сериалами с друзьями и близкими',
'thumbnail': 'https://cdn.static02.aitube.kz/kz.aitudala.aitube.staticaccess/files/ddf2a2ff-bee3-409b-b5f2-2a8202bba75b',
'upload_date': '20221102',
'timestamp': 1667370519,
'title': 'Ангел хранитель 1 серия',
'channel_follower_count': int,
}
}, {
# embed url
'url': 'https://aitube.kz/embed/?id=9291d29b-c038-49a1-ad42-3da2051d353c',
'only_matching': True,
}, {
# id parameter is not as first paramater
'url': 'https://aitube.kz/video?season=1&id=9291d29b-c038-49a1-ad42-3da2051d353c&playlistId=d55b1f5f-ef2a-4f23-b646-2a86275b86b7',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
nextjs_data = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['videoInfo']
json_ld_data = self._search_json_ld(webpage, video_id)
formats, subtitles = self._extract_m3u8_formats_and_subtitles(
f'https://api-http.aitube.kz/kz.aitudala.aitube.staticaccess/video/{video_id}/video', video_id)
return merge_dicts({
'id': video_id,
'title': nextjs_data.get('title') or self._html_search_meta(['name', 'og:title'], webpage),
'description': nextjs_data.get('description'),
'formats': formats,
'subtitles': subtitles,
'view_count': (nextjs_data.get('viewCount')
or int_or_none(self._html_search_meta('ya:ovs:views_total', webpage))),
'like_count': nextjs_data.get('likeCount'),
'channel': nextjs_data.get('channelTitle'),
'channel_id': nextjs_data.get('channelId'),
'thumbnail': nextjs_data.get('coverUrl'),
'comment_count': nextjs_data.get('commentCount'),
'channel_follower_count': int_or_none(nextjs_data.get('channelSubscriberCount')),
}, json_ld_data)

View File

@@ -1,6 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
@@ -18,7 +15,7 @@ class AliExpressLiveIE(InfoExtractor):
'id': '2800002704436634',
'ext': 'mp4',
'title': 'CASIMA7.22',
'thumbnail': r're:http://.*\.jpg',
'thumbnail': r're:https?://.*\.jpg',
'uploader': 'CASIMA Official Store',
'timestamp': 1500717600,
'upload_date': '20170722',

View File

@@ -1,6 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
import json
from .common import InfoExtractor

View File

@@ -1,12 +1,10 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
int_or_none,
qualities,
remove_end,
strip_or_none,
try_get,
unified_timestamp,
url_basename,
@@ -102,10 +100,7 @@ class AllocineIE(InfoExtractor):
video_id = display_id
media_data = self._download_json(
'http://www.allocine.fr/ws/AcVisiondataV5.ashx?media=%s' % video_id, display_id)
title = remove_end(
self._html_search_regex(
r'(?s)<title>(.+?)</title>', webpage, 'title').strip(),
' - AlloCiné')
title = remove_end(strip_or_none(self._html_extract_title(webpage), ' - AlloCiné'))
for key, value in media_data['video'].items():
if not key.endswith('Path'):
continue
@@ -117,8 +112,6 @@ class AllocineIE(InfoExtractor):
})
duration, view_count, timestamp = [None] * 3
self._sort_formats(formats)
return {
'id': video_id,
'display_id': display_id,

View File

@@ -1,5 +1,3 @@
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
parse_iso8601,

View File

@@ -0,0 +1,83 @@
from .common import InfoExtractor
from ..utils import (
clean_html,
dict_get,
get_element_by_class,
int_or_none,
unified_strdate,
url_or_none,
)
class Alsace20TVBaseIE(InfoExtractor):
def _extract_video(self, video_id, url=None):
info = self._download_json(
'https://www.alsace20.tv/visionneuse/visio_v9_js.php?key=%s&habillage=0&mode=html' % (video_id, ),
video_id) or {}
title = info.get('titre')
formats = []
for res, fmt_url in (info.get('files') or {}).items():
formats.extend(
self._extract_smil_formats(fmt_url, video_id, fatal=False)
if '/smil:_' in fmt_url
else self._extract_mpd_formats(fmt_url, video_id, mpd_id=res, fatal=False))
webpage = (url and self._download_webpage(url, video_id, fatal=False)) or ''
thumbnail = url_or_none(dict_get(info, ('image', 'preview', )) or self._og_search_thumbnail(webpage))
upload_date = self._search_regex(r'/(\d{6})_', thumbnail, 'upload_date', default=None)
upload_date = unified_strdate('20%s-%s-%s' % (upload_date[:2], upload_date[2:4], upload_date[4:])) if upload_date else None
return {
'id': video_id,
'title': title,
'formats': formats,
'description': clean_html(get_element_by_class('wysiwyg', webpage)),
'upload_date': upload_date,
'thumbnail': thumbnail,
'duration': int_or_none(self._og_search_property('video:duration', webpage) if webpage else None),
'view_count': int_or_none(info.get('nb_vues')),
}
class Alsace20TVIE(Alsace20TVBaseIE):
_VALID_URL = r'https?://(?:www\.)?alsace20\.tv/(?:[\w-]+/)+[\w-]+-(?P<id>[\w]+)'
_TESTS = [{
'url': 'https://www.alsace20.tv/VOD/Actu/JT/Votre-JT-jeudi-3-fevrier-lyNHCXpYJh.html',
'info_dict': {
'id': 'lyNHCXpYJh',
'ext': 'mp4',
'description': 'md5:fc0bc4a0692d3d2dba4524053de4c7b7',
'title': 'Votre JT du jeudi 3 février',
'upload_date': '20220203',
'thumbnail': r're:https?://.+\.jpg',
'duration': 1073,
'view_count': int,
},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
return self._extract_video(video_id, url)
class Alsace20TVEmbedIE(Alsace20TVBaseIE):
_VALID_URL = r'https?://(?:www\.)?alsace20\.tv/emb/(?P<id>[\w]+)'
_TESTS = [{
'url': 'https://www.alsace20.tv/emb/lyNHCXpYJh',
# 'md5': 'd91851bf9af73c0ad9b2cdf76c127fbb',
'info_dict': {
'id': 'lyNHCXpYJh',
'ext': 'mp4',
'title': 'Votre JT du jeudi 3 février',
'upload_date': '20220203',
'thumbnail': r're:https?://.+\.jpg',
'view_count': int,
},
'params': {
'format': 'bestvideo',
},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
return self._extract_video(video_id)

View File

@@ -1,6 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
@@ -66,22 +63,13 @@ class AluraIE(InfoExtractor):
f['height'] = int('720' if m.group('res') == 'hd' else '480')
formats.extend(video_format)
self._sort_formats(formats)
return {
'id': video_id,
'title': video_title,
"formats": formats
}
def _real_initialize(self):
self._login()
def _login(self):
username, password = self._get_login_info()
if username is None:
return
pass
def _perform_login(self, username, password):
login_page = self._download_webpage(
self._LOGIN_URL, None, 'Downloading login popup')
@@ -123,7 +111,7 @@ class AluraIE(InfoExtractor):
raise ExtractorError('Unable to log in')
class AluraCourseIE(AluraIE):
class AluraCourseIE(AluraIE): # XXX: Do not subclass from concrete IE
_VALID_URL = r'https?://(?:cursos\.)?alura\.com\.br/course/(?P<id>[^/]+)'
_LOGIN_URL = 'https://cursos.alura.com.br/loginForm?urlAfterLogin=/loginForm'

View File

@@ -1,6 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from .youtube import YoutubeIE
from .vimeo import VimeoIE

View File

@@ -1,6 +1,17 @@
# coding: utf-8
import re
from .common import InfoExtractor
from ..utils import int_or_none
from ..utils import (
ExtractorError,
clean_html,
float_or_none,
get_element_by_attribute,
get_element_by_class,
int_or_none,
js_to_json,
traverse_obj,
url_or_none,
)
class AmazonStoreIE(InfoExtractor):
@@ -10,7 +21,7 @@ class AmazonStoreIE(InfoExtractor):
'url': 'https://www.amazon.co.uk/dp/B098XNCHLD/',
'info_dict': {
'id': 'B098XNCHLD',
'title': 'md5:5f3194dbf75a8dcfc83079bd63a2abed',
'title': str,
},
'playlist_mincount': 1,
'playlist': [{
@@ -19,28 +30,48 @@ class AmazonStoreIE(InfoExtractor):
'ext': 'mp4',
'title': 'mcdodo usb c cable 100W 5a',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 34,
},
}]
}],
'expected_warnings': ['Unable to extract data'],
}, {
'url': 'https://www.amazon.in/Sony-WH-1000XM4-Cancelling-Headphones-Bluetooth/dp/B0863TXGM3',
'info_dict': {
'id': 'B0863TXGM3',
'title': 'md5:b0bde4881d3cfd40d63af19f7898b8ff',
'title': str,
},
'playlist_mincount': 4,
'expected_warnings': ['Unable to extract data'],
}, {
'url': 'https://www.amazon.com/dp/B0845NXCXF/',
'info_dict': {
'id': 'B0845NXCXF',
'title': 'md5:2145cd4e3c7782f1ee73649a3cff1171',
'title': str,
},
'playlist-mincount': 1,
'expected_warnings': ['Unable to extract data'],
}, {
'url': 'https://www.amazon.es/Samsung-Smartphone-s-AMOLED-Quad-c%C3%A1mara-espa%C3%B1ola/dp/B08WX337PQ',
'info_dict': {
'id': 'B08WX337PQ',
'title': str,
},
'playlist_mincount': 1,
'expected_warnings': ['Unable to extract data'],
}]
def _real_extract(self, url):
id = self._match_id(url)
webpage = self._download_webpage(url, id)
data_json = self._parse_json(self._html_search_regex(r'var\s?obj\s?=\s?jQuery\.parseJSON\(\'(.*)\'\)', webpage, 'data'), id)
for retry in self.RetryManager():
webpage = self._download_webpage(url, id)
try:
data_json = self._search_json(
r'var\s?obj\s?=\s?jQuery\.parseJSON\(\'', webpage, 'data', id,
transform_source=js_to_json)
except ExtractorError as e:
retry.error = e
entries = [{
'id': video['marketPlaceID'],
'url': video['url'],
@@ -50,4 +81,90 @@ class AmazonStoreIE(InfoExtractor):
'height': int_or_none(video.get('videoHeight')),
'width': int_or_none(video.get('videoWidth')),
} for video in (data_json.get('videos') or []) if video.get('isVideo') and video.get('url')]
return self.playlist_result(entries, playlist_id=id, playlist_title=data_json['title'])
return self.playlist_result(entries, playlist_id=id, playlist_title=data_json.get('title'))
class AmazonReviewsIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?amazon\.(?:[a-z]{2,3})(?:\.[a-z]{2})?/gp/customer-reviews/(?P<id>[^/&#$?]+)'
_TESTS = [{
'url': 'https://www.amazon.com/gp/customer-reviews/R10VE9VUSY19L3/ref=cm_cr_arp_d_rvw_ttl',
'info_dict': {
'id': 'R10VE9VUSY19L3',
'ext': 'mp4',
'title': 'Get squad #Suspicious',
'description': 'md5:7012695052f440a1e064e402d87e0afb',
'uploader': 'Kimberly Cronkright',
'average_rating': 1.0,
'thumbnail': r're:^https?://.*\.jpg$',
},
'expected_warnings': ['Review body was not found in webpage'],
}, {
'url': 'https://www.amazon.com/gp/customer-reviews/R10VE9VUSY19L3/ref=cm_cr_arp_d_rvw_ttl?language=es_US',
'info_dict': {
'id': 'R10VE9VUSY19L3',
'ext': 'mp4',
'title': 'Get squad #Suspicious',
'description': 'md5:7012695052f440a1e064e402d87e0afb',
'uploader': 'Kimberly Cronkright',
'average_rating': 1.0,
'thumbnail': r're:^https?://.*\.jpg$',
},
'expected_warnings': ['Review body was not found in webpage'],
}, {
'url': 'https://www.amazon.in/gp/customer-reviews/RV1CO8JN5VGXV/',
'info_dict': {
'id': 'RV1CO8JN5VGXV',
'ext': 'mp4',
'title': 'Not sure about its durability',
'description': 'md5:1a252c106357f0a3109ebf37d2e87494',
'uploader': 'Shoaib Gulzar',
'average_rating': 2.0,
'thumbnail': r're:^https?://.*\.jpg$',
},
'expected_warnings': ['Review body was not found in webpage'],
}]
def _real_extract(self, url):
video_id = self._match_id(url)
for retry in self.RetryManager():
webpage = self._download_webpage(url, video_id)
review_body = get_element_by_attribute('data-hook', 'review-body', webpage)
if not review_body:
retry.error = ExtractorError('Review body was not found in webpage', expected=True)
formats, subtitles = [], {}
manifest_url = self._search_regex(
r'data-video-url="([^"]+)"', review_body, 'm3u8 url', default=None)
if url_or_none(manifest_url):
fmts, subtitles = self._extract_m3u8_formats_and_subtitles(
manifest_url, video_id, 'mp4', fatal=False)
formats.extend(fmts)
video_url = self._search_regex(
r'<input[^>]+\bvalue="([^"]+)"[^>]+\bclass="video-url"', review_body, 'mp4 url', default=None)
if url_or_none(video_url):
formats.append({
'url': video_url,
'ext': 'mp4',
'format_id': 'http-mp4',
})
if not formats:
self.raise_no_formats('No video found for this customer review', expected=True)
return {
'id': video_id,
'title': (clean_html(get_element_by_attribute('data-hook', 'review-title', webpage))
or self._html_extract_title(webpage)),
'description': clean_html(traverse_obj(re.findall(
r'<span(?:\s+class="cr-original-review-content")?>(.+?)</span>', review_body), -1)),
'uploader': clean_html(get_element_by_class('a-profile-name', webpage)),
'average_rating': float_or_none(clean_html(get_element_by_attribute(
'data-hook', 'review-star-rating', webpage) or '').partition(' ')[0]),
'thumbnail': self._search_regex(
r'data-thumbnail-url="([^"]+)"', review_body, 'thumbnail', default=None),
'formats': formats,
'subtitles': subtitles,
}

View File

@@ -0,0 +1,290 @@
import json
from .common import InfoExtractor
from ..utils import ExtractorError, int_or_none, traverse_obj, try_get
class AmazonMiniTVBaseIE(InfoExtractor):
def _real_initialize(self):
self._download_webpage(
'https://www.amazon.in/minitv', None,
note='Fetching guest session cookies')
AmazonMiniTVBaseIE.session_id = self._get_cookies('https://www.amazon.in')['session-id'].value
def _call_api(self, asin, data=None, note=None):
device = {'clientId': 'ATVIN', 'deviceLocale': 'en_GB'}
if data:
data['variables'].update({
'contentType': 'VOD',
'sessionIdToken': self.session_id,
**device,
})
resp = self._download_json(
f'https://www.amazon.in/minitv/api/web/{"graphql" if data else "prs"}',
asin, note=note, headers={'Content-Type': 'application/json'},
data=json.dumps(data).encode() if data else None,
query=None if data else {
'deviceType': 'A1WMMUXPCUJL4N',
'contentId': asin,
**device,
})
if resp.get('errors'):
raise ExtractorError(f'MiniTV said: {resp["errors"][0]["message"]}')
elif not data:
return resp
return resp['data'][data['operationName']]
class AmazonMiniTVIE(AmazonMiniTVBaseIE):
_VALID_URL = r'(?:https?://(?:www\.)?amazon\.in/minitv/tp/|amazonminitv:(?:amzn1\.dv\.gti\.)?)(?P<id>[a-f0-9-]+)'
_TESTS = [{
'url': 'https://www.amazon.in/minitv/tp/75fe3a75-b8fe-4499-8100-5c9424344840?referrer=https%3A%2F%2Fwww.amazon.in%2Fminitv',
'info_dict': {
'id': 'amzn1.dv.gti.75fe3a75-b8fe-4499-8100-5c9424344840',
'ext': 'mp4',
'title': 'May I Kiss You?',
'language': 'Hindi',
'thumbnail': r're:^https?://.*\.jpg$',
'description': 'md5:a549bfc747973e04feb707833474e59d',
'release_timestamp': 1644710400,
'release_date': '20220213',
'duration': 846,
'chapters': 'count:2',
'series': 'Couple Goals',
'series_id': 'amzn1.dv.gti.56521d46-b040-4fd5-872e-3e70476a04b0',
'season': 'Season 3',
'season_number': 3,
'season_id': 'amzn1.dv.gti.20331016-d9b9-4968-b991-c89fa4927a36',
'episode': 'May I Kiss You?',
'episode_number': 2,
'episode_id': 'amzn1.dv.gti.75fe3a75-b8fe-4499-8100-5c9424344840',
},
}, {
'url': 'https://www.amazon.in/minitv/tp/280d2564-584f-452f-9c98-7baf906e01ab?referrer=https%3A%2F%2Fwww.amazon.in%2Fminitv',
'info_dict': {
'id': 'amzn1.dv.gti.280d2564-584f-452f-9c98-7baf906e01ab',
'ext': 'mp4',
'title': 'Jahaan',
'language': 'Hindi',
'thumbnail': r're:^https?://.*\.jpg',
'description': 'md5:05eb765a77bf703f322f120ec6867339',
'release_timestamp': 1647475200,
'release_date': '20220317',
'duration': 783,
'chapters': [],
},
}, {
'url': 'https://www.amazon.in/minitv/tp/280d2564-584f-452f-9c98-7baf906e01ab',
'only_matching': True,
}, {
'url': 'amazonminitv:amzn1.dv.gti.280d2564-584f-452f-9c98-7baf906e01ab',
'only_matching': True,
}, {
'url': 'amazonminitv:280d2564-584f-452f-9c98-7baf906e01ab',
'only_matching': True,
}]
_GRAPHQL_QUERY_CONTENT = '''
query content($sessionIdToken: String!, $deviceLocale: String, $contentId: ID!, $contentType: ContentType!, $clientId: String) {
content(
applicationContextInput: {deviceLocale: $deviceLocale, sessionIdToken: $sessionIdToken, clientId: $clientId}
contentId: $contentId
contentType: $contentType
) {
contentId
name
... on Episode {
contentId
vodType
name
images
description {
synopsis
contentLengthInSeconds
}
publicReleaseDateUTC
audioTracks
seasonId
seriesId
seriesName
seasonNumber
episodeNumber
timecode {
endCreditsTime
}
}
... on MovieContent {
contentId
vodType
name
description {
synopsis
contentLengthInSeconds
}
images
publicReleaseDateUTC
audioTracks
}
}
}'''
def _real_extract(self, url):
asin = f'amzn1.dv.gti.{self._match_id(url)}'
prs = self._call_api(asin, note='Downloading playback info')
formats, subtitles = [], {}
for type_, asset in prs['playbackAssets'].items():
if not traverse_obj(asset, 'manifestUrl'):
continue
if type_ == 'hls':
m3u8_fmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
asset['manifestUrl'], asin, ext='mp4', entry_protocol='m3u8_native',
m3u8_id=type_, fatal=False)
formats.extend(m3u8_fmts)
subtitles = self._merge_subtitles(subtitles, m3u8_subs)
elif type_ == 'dash':
mpd_fmts, mpd_subs = self._extract_mpd_formats_and_subtitles(
asset['manifestUrl'], asin, mpd_id=type_, fatal=False)
formats.extend(mpd_fmts)
subtitles = self._merge_subtitles(subtitles, mpd_subs)
else:
self.report_warning(f'Unknown asset type: {type_}')
title_info = self._call_api(
asin, note='Downloading title info', data={
'operationName': 'content',
'variables': {'contentId': asin},
'query': self._GRAPHQL_QUERY_CONTENT,
})
credits_time = try_get(title_info, lambda x: x['timecode']['endCreditsTime'] / 1000)
is_episode = title_info.get('vodType') == 'EPISODE'
return {
'id': asin,
'title': title_info.get('name'),
'formats': formats,
'subtitles': subtitles,
'language': traverse_obj(title_info, ('audioTracks', 0)),
'thumbnails': [{
'id': type_,
'url': url,
} for type_, url in (title_info.get('images') or {}).items()],
'description': traverse_obj(title_info, ('description', 'synopsis')),
'release_timestamp': int_or_none(try_get(title_info, lambda x: x['publicReleaseDateUTC'] / 1000)),
'duration': traverse_obj(title_info, ('description', 'contentLengthInSeconds')),
'chapters': [{
'start_time': credits_time,
'title': 'End Credits',
}] if credits_time else [],
'series': title_info.get('seriesName'),
'series_id': title_info.get('seriesId'),
'season_number': title_info.get('seasonNumber'),
'season_id': title_info.get('seasonId'),
'episode': title_info.get('name') if is_episode else None,
'episode_number': title_info.get('episodeNumber'),
'episode_id': asin if is_episode else None,
}
class AmazonMiniTVSeasonIE(AmazonMiniTVBaseIE):
IE_NAME = 'amazonminitv:season'
_VALID_URL = r'amazonminitv:season:(?:amzn1\.dv\.gti\.)?(?P<id>[a-f0-9-]+)'
IE_DESC = 'Amazon MiniTV Series, "minitv:season:" prefix'
_TESTS = [{
'url': 'amazonminitv:season:amzn1.dv.gti.0aa996eb-6a1b-4886-a342-387fbd2f1db0',
'playlist_mincount': 6,
'info_dict': {
'id': 'amzn1.dv.gti.0aa996eb-6a1b-4886-a342-387fbd2f1db0',
},
}, {
'url': 'amazonminitv:season:0aa996eb-6a1b-4886-a342-387fbd2f1db0',
'only_matching': True,
}]
_GRAPHQL_QUERY = '''
query getEpisodes($sessionIdToken: String!, $clientId: String, $episodeOrSeasonId: ID!, $deviceLocale: String) {
getEpisodes(
applicationContextInput: {sessionIdToken: $sessionIdToken, deviceLocale: $deviceLocale, clientId: $clientId}
episodeOrSeasonId: $episodeOrSeasonId
) {
episodes {
... on Episode {
contentId
name
images
seriesName
seasonId
seriesId
seasonNumber
episodeNumber
description {
synopsis
contentLengthInSeconds
}
publicReleaseDateUTC
}
}
}
}
'''
def _entries(self, asin):
season_info = self._call_api(
asin, note='Downloading season info', data={
'operationName': 'getEpisodes',
'variables': {'episodeOrSeasonId': asin},
'query': self._GRAPHQL_QUERY,
})
for episode in season_info['episodes']:
yield self.url_result(
f'amazonminitv:{episode["contentId"]}', AmazonMiniTVIE, episode['contentId'])
def _real_extract(self, url):
asin = f'amzn1.dv.gti.{self._match_id(url)}'
return self.playlist_result(self._entries(asin), asin)
class AmazonMiniTVSeriesIE(AmazonMiniTVBaseIE):
IE_NAME = 'amazonminitv:series'
_VALID_URL = r'amazonminitv:series:(?:amzn1\.dv\.gti\.)?(?P<id>[a-f0-9-]+)'
_TESTS = [{
'url': 'amazonminitv:series:amzn1.dv.gti.56521d46-b040-4fd5-872e-3e70476a04b0',
'playlist_mincount': 3,
'info_dict': {
'id': 'amzn1.dv.gti.56521d46-b040-4fd5-872e-3e70476a04b0',
},
}, {
'url': 'amazonminitv:series:56521d46-b040-4fd5-872e-3e70476a04b0',
'only_matching': True,
}]
_GRAPHQL_QUERY = '''
query getSeasons($sessionIdToken: String!, $deviceLocale: String, $episodeOrSeasonOrSeriesId: ID!, $clientId: String) {
getSeasons(
applicationContextInput: {deviceLocale: $deviceLocale, sessionIdToken: $sessionIdToken, clientId: $clientId}
episodeOrSeasonOrSeriesId: $episodeOrSeasonOrSeriesId
) {
seasons {
seasonId
}
}
}
'''
def _entries(self, asin):
season_info = self._call_api(
asin, note='Downloading series info', data={
'operationName': 'getSeasons',
'variables': {'episodeOrSeasonOrSeriesId': asin},
'query': self._GRAPHQL_QUERY,
})
for season in season_info['seasons']:
yield self.url_result(f'amazonminitv:season:{season["seasonId"]}', AmazonMiniTVSeasonIE, season['seasonId'])
def _real_extract(self, url):
asin = f'amzn1.dv.gti.{self._match_id(url)}'
return self.playlist_result(self._entries(asin), asin)

View File

@@ -1,6 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .theplatform import ThePlatformIE
@@ -12,7 +9,7 @@ from ..utils import (
)
class AMCNetworksIE(ThePlatformIE):
class AMCNetworksIE(ThePlatformIE): # XXX: Do not subclass from concrete IE
_VALID_URL = r'https?://(?:www\.)?(?P<site>amc|bbcamerica|ifc|(?:we|sundance)tv)\.com/(?P<id>(?:movies|shows(?:/[^/]+)+)/[^/?#&]+)'
_TESTS = [{
'url': 'https://www.bbcamerica.com/shows/the-graham-norton-show/videos/tina-feys-adorable-airline-themed-family-dinner--51631',
@@ -109,7 +106,6 @@ class AMCNetworksIE(ThePlatformIE):
media_url = update_url_query(media_url, query)
formats, subtitles = self._extract_theplatform_smil(
media_url, video_id)
self._sort_formats(formats)
thumbnails = []
thumbnail_urls = [properties.get('imageDesktop')]

View File

@@ -1,6 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
import json
from .common import InfoExtractor
@@ -14,7 +11,7 @@ from ..utils import (
class AmericasTestKitchenIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?(?:americastestkitchen|cooks(?:country|illustrated))\.com/(?P<resource_type>episode|videos)/(?P<id>\d+)'
_VALID_URL = r'https?://(?:www\.)?americastestkitchen\.com/(?:cooks(?:country|illustrated)/)?(?P<resource_type>episode|videos)/(?P<id>\d+)'
_TESTS = [{
'url': 'https://www.americastestkitchen.com/episode/582-weeknight-japanese-suppers',
'md5': 'b861c3e365ac38ad319cfd509c30577f',
@@ -22,15 +19,20 @@ class AmericasTestKitchenIE(InfoExtractor):
'id': '5b400b9ee338f922cb06450c',
'title': 'Japanese Suppers',
'ext': 'mp4',
'display_id': 'weeknight-japanese-suppers',
'description': 'md5:64e606bfee910627efc4b5f050de92b3',
'thumbnail': r're:^https?://',
'timestamp': 1523318400,
'upload_date': '20180410',
'release_date': '20180410',
'series': "America's Test Kitchen",
'season_number': 18,
'timestamp': 1523304000,
'upload_date': '20180409',
'release_date': '20180409',
'series': 'America\'s Test Kitchen',
'season': 'Season 18',
'episode': 'Japanese Suppers',
'season_number': 18,
'episode_number': 15,
'duration': 1376,
'thumbnail': r're:^https?://',
'average_rating': 0,
'view_count': int,
},
'params': {
'skip_download': True,
@@ -43,15 +45,20 @@ class AmericasTestKitchenIE(InfoExtractor):
'id': '5fbe8c61bda2010001c6763b',
'title': 'Simple Chicken Dinner',
'ext': 'mp4',
'display_id': 'atktv_2103_simple-chicken-dinner_full-episode_web-mp4',
'description': 'md5:eb68737cc2fd4c26ca7db30139d109e7',
'thumbnail': r're:^https?://',
'timestamp': 1610755200,
'upload_date': '20210116',
'release_date': '20210116',
'series': "America's Test Kitchen",
'season_number': 21,
'timestamp': 1610737200,
'upload_date': '20210115',
'release_date': '20210115',
'series': 'America\'s Test Kitchen',
'season': 'Season 21',
'episode': 'Simple Chicken Dinner',
'season_number': 21,
'episode_number': 3,
'duration': 1397,
'thumbnail': r're:^https?://',
'view_count': int,
'average_rating': 0,
},
'params': {
'skip_download': True,
@@ -60,10 +67,10 @@ class AmericasTestKitchenIE(InfoExtractor):
'url': 'https://www.americastestkitchen.com/videos/3420-pan-seared-salmon',
'only_matching': True,
}, {
'url': 'https://www.cookscountry.com/episode/564-when-only-chocolate-will-do',
'url': 'https://www.americastestkitchen.com/cookscountry/episode/564-when-only-chocolate-will-do',
'only_matching': True,
}, {
'url': 'https://www.cooksillustrated.com/videos/4478-beef-wellington',
'url': 'https://www.americastestkitchen.com/cooksillustrated/videos/4478-beef-wellington',
'only_matching': True,
}]
@@ -93,7 +100,7 @@ class AmericasTestKitchenIE(InfoExtractor):
class AmericasTestKitchenSeasonIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?(?P<show>americastestkitchen|cookscountry)\.com/episodes/browse/season_(?P<id>\d+)'
_VALID_URL = r'https?://(?:www\.)?americastestkitchen\.com(?P<show>/cookscountry)?/episodes/browse/season_(?P<id>\d+)'
_TESTS = [{
# ATK Season
'url': 'https://www.americastestkitchen.com/episodes/browse/season_1',
@@ -104,7 +111,7 @@ class AmericasTestKitchenSeasonIE(InfoExtractor):
'playlist_count': 13,
}, {
# Cooks Country Season
'url': 'https://www.cookscountry.com/episodes/browse/season_12',
'url': 'https://www.americastestkitchen.com/cookscountry/episodes/browse/season_12',
'info_dict': {
'id': 'season_12',
'title': 'Season 12',
@@ -113,17 +120,17 @@ class AmericasTestKitchenSeasonIE(InfoExtractor):
}]
def _real_extract(self, url):
show_name, season_number = self._match_valid_url(url).groups()
show_path, season_number = self._match_valid_url(url).group('show', 'id')
season_number = int(season_number)
slug = 'atk' if show_name == 'americastestkitchen' else 'cco'
slug = 'cco' if show_path == '/cookscountry' else 'atk'
season = 'Season %d' % season_number
season_search = self._download_json(
'https://y1fnzxui30-dsn.algolia.net/1/indexes/everest_search_%s_season_desc_production' % slug,
season, headers={
'Origin': 'https://www.%s.com' % show_name,
'Origin': 'https://www.americastestkitchen.com',
'X-Algolia-API-Key': '8d504d0099ed27c1b73708d22871d805',
'X-Algolia-Application-Id': 'Y1FNZXUI30',
}, query={
@@ -139,12 +146,12 @@ class AmericasTestKitchenSeasonIE(InfoExtractor):
def entries():
for episode in (season_search.get('hits') or []):
search_url = episode.get('search_url')
search_url = episode.get('search_url') # always formatted like '/episode/123-title-of-episode'
if not search_url:
continue
yield {
'_type': 'url',
'url': 'https://www.%s.com%s' % (show_name, search_url),
'url': f'https://www.americastestkitchen.com{show_path or ""}{search_url}',
'id': try_get(episode, lambda e: e['objectID'].split('_')[-1]),
'title': episode.get('title'),
'description': episode.get('description'),

View File

@@ -1,6 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
determine_ext,
@@ -13,7 +10,7 @@ from ..utils import (
)
class AMPIE(InfoExtractor):
class AMPIE(InfoExtractor): # XXX: Conventionally, base classes should end with BaseIE/InfoExtractor
# parse Akamai Adaptive Media Player feed
def _extract_feed_info(self, url):
feed = self._download_json(
@@ -87,8 +84,6 @@ class AMPIE(InfoExtractor):
'ext': ext,
})
self._sort_formats(formats)
timestamp = unified_timestamp(item.get('pubDate'), ' ') or parse_iso8601(item.get('dc-date'))
return {

View File

@@ -0,0 +1,56 @@
import re
from .common import InfoExtractor
from ..utils import url_or_none, merge_dicts
class AngelIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?angel\.com/watch/(?P<series>[^/?#]+)/episode/(?P<id>[\w-]+)/season-(?P<season_number>\d+)/episode-(?P<episode_number>\d+)/(?P<title>[^/?#]+)'
_TESTS = [{
'url': 'https://www.angel.com/watch/tuttle-twins/episode/2f3d0382-ea82-4cdc-958e-84fbadadc710/season-1/episode-1/when-laws-give-you-lemons',
'md5': '4734e5cfdd64a568e837246aa3eaa524',
'info_dict': {
'id': '2f3d0382-ea82-4cdc-958e-84fbadadc710',
'ext': 'mp4',
'title': 'Tuttle Twins Season 1, Episode 1: When Laws Give You Lemons',
'description': 'md5:73b704897c20ab59c433a9c0a8202d5e',
'thumbnail': r're:^https?://images.angelstudios.com/image/upload/angel-app/.*$',
'duration': 1359.0
}
}, {
'url': 'https://www.angel.com/watch/the-chosen/episode/8dfb714d-bca5-4812-8125-24fb9514cd10/season-1/episode-1/i-have-called-you-by-name',
'md5': 'e4774bad0a5f0ad2e90d175cafdb797d',
'info_dict': {
'id': '8dfb714d-bca5-4812-8125-24fb9514cd10',
'ext': 'mp4',
'title': 'The Chosen Season 1, Episode 1: I Have Called You By Name',
'description': 'md5:aadfb4827a94415de5ff6426e6dee3be',
'thumbnail': r're:^https?://images.angelstudios.com/image/upload/angel-app/.*$',
'duration': 3276.0
}
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
json_ld = self._search_json_ld(webpage, video_id)
formats, subtitles = self._extract_m3u8_formats_and_subtitles(
json_ld.pop('url'), video_id, note='Downloading HD m3u8 information')
info_dict = {
'id': video_id,
'title': self._og_search_title(webpage),
'description': self._og_search_description(webpage),
'formats': formats,
'subtitles': subtitles
}
# Angel uses cloudinary in the background and supports image transformations.
# We remove these transformations and return the source file
base_thumbnail_url = url_or_none(self._og_search_thumbnail(webpage)) or json_ld.pop('thumbnails')
if base_thumbnail_url:
info_dict['thumbnail'] = re.sub(r'(/upload)/.+(/angel-app/.+)$', r'\1\2', base_thumbnail_url)
return merge_dicts(info_dict, json_ld)

View File

@@ -1,285 +0,0 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
ExtractorError,
urlencode_postdata,
int_or_none,
str_or_none,
determine_ext,
)
from ..compat import compat_HTTPError
class AnimeLabBaseIE(InfoExtractor):
_LOGIN_REQUIRED = True
_LOGIN_URL = 'https://www.animelab.com/login'
_NETRC_MACHINE = 'animelab'
def _login(self):
def is_logged_in(login_webpage):
return 'Sign In' not in login_webpage
login_page = self._download_webpage(
self._LOGIN_URL, None, 'Downloading login page')
# Check if already logged in
if is_logged_in(login_page):
return
(username, password) = self._get_login_info()
if username is None and self._LOGIN_REQUIRED:
self.raise_login_required('Login is required to access any AnimeLab content')
login_form = {
'email': username,
'password': password,
}
try:
response = self._download_webpage(
self._LOGIN_URL, None, 'Logging in', 'Wrong login info',
data=urlencode_postdata(login_form),
headers={'Content-Type': 'application/x-www-form-urlencoded'})
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400:
raise ExtractorError('Unable to log in (wrong credentials?)', expected=True)
else:
raise
# if login was successful
if is_logged_in(response):
return
raise ExtractorError('Unable to login (cannot verify if logged in)')
def _real_initialize(self):
self._login()
class AnimeLabIE(AnimeLabBaseIE):
_VALID_URL = r'https?://(?:www\.)?animelab\.com/player/(?P<id>[^/]+)'
# the following tests require authentication, but a free account will suffice
# just set 'usenetrc' to true in test/local_parameters.json if you use a .netrc file
# or you can set 'username' and 'password' there
# the tests also select a specific format so that the same video is downloaded
# regardless of whether the user is premium or not (needs testing on a premium account)
_TEST = {
'url': 'https://www.animelab.com/player/fullmetal-alchemist-brotherhood-episode-42',
'md5': '05bde4b91a5d1ff46ef5b94df05b0f7f',
'info_dict': {
'id': '383',
'ext': 'mp4',
'display_id': 'fullmetal-alchemist-brotherhood-episode-42',
'title': 'Fullmetal Alchemist: Brotherhood - Episode 42 - Signs of a Counteroffensive',
'description': 'md5:103eb61dd0a56d3dfc5dbf748e5e83f4',
'series': 'Fullmetal Alchemist: Brotherhood',
'episode': 'Signs of a Counteroffensive',
'episode_number': 42,
'duration': 1469,
'season': 'Season 1',
'season_number': 1,
'season_id': '38',
},
'params': {
'format': '[format_id=21711_yeshardsubbed_ja-JP][height=480]',
},
'skip': 'All AnimeLab content requires authentication',
}
def _real_extract(self, url):
display_id = self._match_id(url)
# unfortunately we can get different URLs for the same formats
# e.g. if we are using a "free" account so no dubs available
# (so _remove_duplicate_formats is not effective)
# so we use a dictionary as a workaround
formats = {}
for language_option_url in ('https://www.animelab.com/player/%s/subtitles',
'https://www.animelab.com/player/%s/dubbed'):
actual_url = language_option_url % display_id
webpage = self._download_webpage(actual_url, display_id, 'Downloading URL ' + actual_url)
video_collection = self._parse_json(self._search_regex(r'new\s+?AnimeLabApp\.VideoCollection\s*?\((.*?)\);', webpage, 'AnimeLab VideoCollection'), display_id)
position = int_or_none(self._search_regex(r'playlistPosition\s*?=\s*?(\d+)', webpage, 'Playlist Position'))
raw_data = video_collection[position]['videoEntry']
video_id = str_or_none(raw_data['id'])
# create a title from many sources (while grabbing other info)
# TODO use more fallback sources to get some of these
series = raw_data.get('showTitle')
video_type = raw_data.get('videoEntryType', {}).get('name')
episode_number = raw_data.get('episodeNumber')
episode_name = raw_data.get('name')
title_parts = (series, video_type, episode_number, episode_name)
if None not in title_parts:
title = '%s - %s %s - %s' % title_parts
else:
title = episode_name
description = raw_data.get('synopsis') or self._og_search_description(webpage, default=None)
duration = int_or_none(raw_data.get('duration'))
thumbnail_data = raw_data.get('images', [])
thumbnails = []
for thumbnail in thumbnail_data:
for instance in thumbnail['imageInstances']:
image_data = instance.get('imageInfo', {})
thumbnails.append({
'id': str_or_none(image_data.get('id')),
'url': image_data.get('fullPath'),
'width': image_data.get('width'),
'height': image_data.get('height'),
})
season_data = raw_data.get('season', {}) or {}
season = str_or_none(season_data.get('name'))
season_number = int_or_none(season_data.get('seasonNumber'))
season_id = str_or_none(season_data.get('id'))
for video_data in raw_data['videoList']:
current_video_list = {}
current_video_list['language'] = video_data.get('language', {}).get('languageCode')
is_hardsubbed = video_data.get('hardSubbed')
for video_instance in video_data['videoInstances']:
httpurl = video_instance.get('httpUrl')
url = httpurl if httpurl else video_instance.get('rtmpUrl')
if url is None:
# this video format is unavailable to the user (not premium etc.)
continue
current_format = current_video_list.copy()
format_id_parts = []
format_id_parts.append(str_or_none(video_instance.get('id')))
if is_hardsubbed is not None:
if is_hardsubbed:
format_id_parts.append('yeshardsubbed')
else:
format_id_parts.append('nothardsubbed')
format_id_parts.append(current_format['language'])
format_id = '_'.join([x for x in format_id_parts if x is not None])
ext = determine_ext(url)
if ext == 'm3u8':
for format_ in self._extract_m3u8_formats(
url, video_id, m3u8_id=format_id, fatal=False):
formats[format_['format_id']] = format_
continue
elif ext == 'mpd':
for format_ in self._extract_mpd_formats(
url, video_id, mpd_id=format_id, fatal=False):
formats[format_['format_id']] = format_
continue
current_format['url'] = url
quality_data = video_instance.get('videoQuality')
if quality_data:
quality = quality_data.get('name') or quality_data.get('description')
else:
quality = None
height = None
if quality:
height = int_or_none(self._search_regex(r'(\d+)p?$', quality, 'Video format height', default=None))
if height is None:
self.report_warning('Could not get height of video')
else:
current_format['height'] = height
current_format['format_id'] = format_id
formats[current_format['format_id']] = current_format
formats = list(formats.values())
self._sort_formats(formats)
return {
'id': video_id,
'display_id': display_id,
'title': title,
'description': description,
'series': series,
'episode': episode_name,
'episode_number': int_or_none(episode_number),
'thumbnails': thumbnails,
'duration': duration,
'formats': formats,
'season': season,
'season_number': season_number,
'season_id': season_id,
}
class AnimeLabShowsIE(AnimeLabBaseIE):
_VALID_URL = r'https?://(?:www\.)?animelab\.com/shows/(?P<id>[^/]+)'
_TEST = {
'url': 'https://www.animelab.com/shows/attack-on-titan',
'info_dict': {
'id': '45',
'title': 'Attack on Titan',
'description': 'md5:989d95a2677e9309368d5cf39ba91469',
},
'playlist_count': 59,
'skip': 'All AnimeLab content requires authentication',
}
def _real_extract(self, url):
_BASE_URL = 'http://www.animelab.com'
_SHOWS_API_URL = '/api/videoentries/show/videos/'
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id, 'Downloading requested URL')
show_data_str = self._search_regex(r'({"id":.*}),\svideoEntry', webpage, 'AnimeLab show data')
show_data = self._parse_json(show_data_str, display_id)
show_id = str_or_none(show_data.get('id'))
title = show_data.get('name')
description = show_data.get('shortSynopsis') or show_data.get('longSynopsis')
entries = []
for season in show_data['seasons']:
season_id = season['id']
get_data = urlencode_postdata({
'seasonId': season_id,
'limit': 1000,
})
# despite using urlencode_postdata, we are sending a GET request
target_url = _BASE_URL + _SHOWS_API_URL + show_id + "?" + get_data.decode('utf-8')
response = self._download_webpage(
target_url,
None, 'Season id %s' % season_id)
season_data = self._parse_json(response, display_id)
for video_data in season_data['list']:
entries.append(self.url_result(
_BASE_URL + '/player/' + video_data['slug'], 'AnimeLab',
str_or_none(video_data.get('id')), video_data.get('name')
))
return {
'_type': 'playlist',
'id': show_id,
'title': title,
'description': description,
'entries': entries,
}
# TODO implement myqueue

View File

@@ -1,291 +0,0 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
determine_ext,
extract_attributes,
ExtractorError,
join_nonempty,
url_or_none,
urlencode_postdata,
urljoin,
)
class AnimeOnDemandIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?anime-on-demand\.de/anime/(?P<id>\d+)'
_LOGIN_URL = 'https://www.anime-on-demand.de/users/sign_in'
_APPLY_HTML5_URL = 'https://www.anime-on-demand.de/html5apply'
_NETRC_MACHINE = 'animeondemand'
# German-speaking countries of Europe
_GEO_COUNTRIES = ['AT', 'CH', 'DE', 'LI', 'LU']
_TESTS = [{
# jap, OmU
'url': 'https://www.anime-on-demand.de/anime/161',
'info_dict': {
'id': '161',
'title': 'Grimgar, Ashes and Illusions (OmU)',
'description': 'md5:6681ce3c07c7189d255ac6ab23812d31',
},
'playlist_mincount': 4,
}, {
# Film wording is used instead of Episode, ger/jap, Dub/OmU
'url': 'https://www.anime-on-demand.de/anime/39',
'only_matching': True,
}, {
# Episodes without titles, jap, OmU
'url': 'https://www.anime-on-demand.de/anime/162',
'only_matching': True,
}, {
# ger/jap, Dub/OmU, account required
'url': 'https://www.anime-on-demand.de/anime/169',
'only_matching': True,
}, {
# Full length film, non-series, ger/jap, Dub/OmU, account required
'url': 'https://www.anime-on-demand.de/anime/185',
'only_matching': True,
}, {
# Flash videos
'url': 'https://www.anime-on-demand.de/anime/12',
'only_matching': True,
}]
def _login(self):
username, password = self._get_login_info()
if username is None:
return
login_page = self._download_webpage(
self._LOGIN_URL, None, 'Downloading login page')
if '>Our licensing terms allow the distribution of animes only to German-speaking countries of Europe' in login_page:
self.raise_geo_restricted(
'%s is only available in German-speaking countries of Europe' % self.IE_NAME)
login_form = self._form_hidden_inputs('new_user', login_page)
login_form.update({
'user[login]': username,
'user[password]': password,
})
post_url = self._search_regex(
r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page,
'post url', default=self._LOGIN_URL, group='url')
if not post_url.startswith('http'):
post_url = urljoin(self._LOGIN_URL, post_url)
response = self._download_webpage(
post_url, None, 'Logging in',
data=urlencode_postdata(login_form), headers={
'Referer': self._LOGIN_URL,
})
if all(p not in response for p in ('>Logout<', 'href="/users/sign_out"')):
error = self._search_regex(
r'<p[^>]+\bclass=(["\'])(?:(?!\1).)*\balert\b(?:(?!\1).)*\1[^>]*>(?P<error>.+?)</p>',
response, 'error', default=None, group='error')
if error:
raise ExtractorError('Unable to login: %s' % error, expected=True)
raise ExtractorError('Unable to log in')
def _real_initialize(self):
self._login()
def _real_extract(self, url):
anime_id = self._match_id(url)
webpage = self._download_webpage(url, anime_id)
if 'data-playlist=' not in webpage:
self._download_webpage(
self._APPLY_HTML5_URL, anime_id,
'Activating HTML5 beta', 'Unable to apply HTML5 beta')
webpage = self._download_webpage(url, anime_id)
csrf_token = self._html_search_meta(
'csrf-token', webpage, 'csrf token', fatal=True)
anime_title = self._html_search_regex(
r'(?s)<h1[^>]+itemprop="name"[^>]*>(.+?)</h1>',
webpage, 'anime name')
anime_description = self._html_search_regex(
r'(?s)<div[^>]+itemprop="description"[^>]*>(.+?)</div>',
webpage, 'anime description', default=None)
def extract_info(html, video_id, num=None):
title, description = [None] * 2
formats = []
for input_ in re.findall(
r'<input[^>]+class=["\'].*?streamstarter[^>]+>', html):
attributes = extract_attributes(input_)
title = attributes.get('data-dialog-header')
playlist_urls = []
for playlist_key in ('data-playlist', 'data-otherplaylist', 'data-stream'):
playlist_url = attributes.get(playlist_key)
if isinstance(playlist_url, compat_str) and re.match(
r'/?[\da-zA-Z]+', playlist_url):
playlist_urls.append(attributes[playlist_key])
if not playlist_urls:
continue
lang = attributes.get('data-lang')
lang_note = attributes.get('value')
for playlist_url in playlist_urls:
kind = self._search_regex(
r'videomaterialurl/\d+/([^/]+)/',
playlist_url, 'media kind', default=None)
format_id = join_nonempty(lang, kind) if lang or kind else str(num)
format_note = join_nonempty(kind, lang_note, delim=', ')
item_id_list = []
if format_id:
item_id_list.append(format_id)
item_id_list.append('videomaterial')
playlist = self._download_json(
urljoin(url, playlist_url), video_id,
'Downloading %s JSON' % ' '.join(item_id_list),
headers={
'X-Requested-With': 'XMLHttpRequest',
'X-CSRF-Token': csrf_token,
'Referer': url,
'Accept': 'application/json, text/javascript, */*; q=0.01',
}, fatal=False)
if not playlist:
continue
stream_url = url_or_none(playlist.get('streamurl'))
if stream_url:
rtmp = re.search(
r'^(?P<url>rtmpe?://(?P<host>[^/]+)/(?P<app>.+/))(?P<playpath>mp[34]:.+)',
stream_url)
if rtmp:
formats.append({
'url': rtmp.group('url'),
'app': rtmp.group('app'),
'play_path': rtmp.group('playpath'),
'page_url': url,
'player_url': 'https://www.anime-on-demand.de/assets/jwplayer.flash-55abfb34080700304d49125ce9ffb4a6.swf',
'rtmp_real_time': True,
'format_id': 'rtmp',
'ext': 'flv',
})
continue
start_video = playlist.get('startvideo', 0)
playlist = playlist.get('playlist')
if not playlist or not isinstance(playlist, list):
continue
playlist = playlist[start_video]
title = playlist.get('title')
if not title:
continue
description = playlist.get('description')
for source in playlist.get('sources', []):
file_ = source.get('file')
if not file_:
continue
ext = determine_ext(file_)
format_id = join_nonempty(
lang, kind,
'hls' if ext == 'm3u8' else None,
'dash' if source.get('type') == 'video/dash' or ext == 'mpd' else None)
if ext == 'm3u8':
file_formats = self._extract_m3u8_formats(
file_, video_id, 'mp4',
entry_protocol='m3u8_native', m3u8_id=format_id, fatal=False)
elif source.get('type') == 'video/dash' or ext == 'mpd':
continue
file_formats = self._extract_mpd_formats(
file_, video_id, mpd_id=format_id, fatal=False)
else:
continue
for f in file_formats:
f.update({
'language': lang,
'format_note': format_note,
})
formats.extend(file_formats)
return {
'title': title,
'description': description,
'formats': formats,
}
def extract_entries(html, video_id, common_info, num=None):
info = extract_info(html, video_id, num)
if info['formats']:
self._sort_formats(info['formats'])
f = common_info.copy()
f.update(info)
yield f
# Extract teaser/trailer only when full episode is not available
if not info['formats']:
m = re.search(
r'data-dialog-header=(["\'])(?P<title>.+?)\1[^>]+href=(["\'])(?P<href>.+?)\3[^>]*>(?P<kind>Teaser|Trailer)<',
html)
if m:
f = common_info.copy()
f.update({
'id': '%s-%s' % (f['id'], m.group('kind').lower()),
'title': m.group('title'),
'url': urljoin(url, m.group('href')),
})
yield f
def extract_episodes(html):
for num, episode_html in enumerate(re.findall(
r'(?s)<h3[^>]+class="episodebox-title".+?>Episodeninhalt<', html), 1):
episodebox_title = self._search_regex(
(r'class="episodebox-title"[^>]+title=(["\'])(?P<title>.+?)\1',
r'class="episodebox-title"[^>]+>(?P<title>.+?)<'),
episode_html, 'episodebox title', default=None, group='title')
if not episodebox_title:
continue
episode_number = int(self._search_regex(
r'(?:Episode|Film)\s*(\d+)',
episodebox_title, 'episode number', default=num))
episode_title = self._search_regex(
r'(?:Episode|Film)\s*\d+\s*-\s*(.+)',
episodebox_title, 'episode title', default=None)
video_id = 'episode-%d' % episode_number
common_info = {
'id': video_id,
'series': anime_title,
'episode': episode_title,
'episode_number': episode_number,
}
for e in extract_entries(episode_html, video_id, common_info):
yield e
def extract_film(html, video_id):
common_info = {
'id': anime_id,
'title': anime_title,
'description': anime_description,
}
for e in extract_entries(html, video_id, common_info):
yield e
def entries():
has_episodes = False
for e in extract_episodes(webpage):
has_episodes = True
yield e
if not has_episodes:
for e in extract_film(webpage, anime_id):
yield e
return self.playlist_result(
entries(), anime_id, anime_title, anime_description)

View File

@@ -0,0 +1,128 @@
import urllib.parse
from .common import InfoExtractor
from ..utils import (
HEADRequest,
ExtractorError,
determine_ext,
scale_thumbnails_to_max_format_width,
)
class Ant1NewsGrBaseIE(InfoExtractor):
def _download_and_extract_api_data(self, video_id, netloc, cid=None):
url = f'{self.http_scheme()}//{netloc}{self._API_PATH}'
info = self._download_json(url, video_id, query={'cid': cid or video_id})
try:
source = info['url']
except KeyError:
raise ExtractorError('no source found for %s' % video_id)
formats, subs = (self._extract_m3u8_formats_and_subtitles(source, video_id, 'mp4')
if determine_ext(source) == 'm3u8' else ([{'url': source}], {}))
thumbnails = scale_thumbnails_to_max_format_width(
formats, [{'url': info['thumb']}], r'(?<=/imgHandler/)\d+')
return {
'id': video_id,
'title': info.get('title'),
'thumbnails': thumbnails,
'formats': formats,
'subtitles': subs,
}
class Ant1NewsGrWatchIE(Ant1NewsGrBaseIE):
IE_NAME = 'ant1newsgr:watch'
IE_DESC = 'ant1news.gr videos'
_VALID_URL = r'https?://(?P<netloc>(?:www\.)?ant1news\.gr)/watch/(?P<id>\d+)/'
_API_PATH = '/templates/data/player'
_TESTS = [{
'url': 'https://www.ant1news.gr/watch/1506168/ant1-news-09112021-stis-18-45',
'md5': '95925e6b32106754235f2417e0d2dfab',
'info_dict': {
'id': '1506168',
'ext': 'mp4',
'title': 'md5:0ad00fa66ecf8aa233d26ab0dba7514a',
'description': 'md5:18665af715a6dcfeac1d6153a44f16b0',
'thumbnail': 'https://ant1media.azureedge.net/imgHandler/640/26d46bf6-8158-4f02-b197-7096c714b2de.jpg',
},
}]
def _real_extract(self, url):
video_id, netloc = self._match_valid_url(url).group('id', 'netloc')
webpage = self._download_webpage(url, video_id)
info = self._download_and_extract_api_data(video_id, netloc)
info['description'] = self._og_search_description(webpage)
return info
class Ant1NewsGrArticleIE(Ant1NewsGrBaseIE):
IE_NAME = 'ant1newsgr:article'
IE_DESC = 'ant1news.gr articles'
_VALID_URL = r'https?://(?:www\.)?ant1news\.gr/[^/]+/article/(?P<id>\d+)/'
_TESTS = [{
'url': 'https://www.ant1news.gr/afieromata/article/549468/o-tzeims-mpont-sta-meteora-oi-apeiles-kai-o-xesikomos-ton-kalogeron',
'md5': '294f18331bb516539d72d85a82887dcc',
'info_dict': {
'id': '_xvg/m_cmbatw=',
'ext': 'mp4',
'title': 'md5:a93e8ecf2e4073bfdffcb38f59945411',
'timestamp': 1603092840,
'upload_date': '20201019',
'thumbnail': 'https://ant1media.azureedge.net/imgHandler/640/756206d2-d640-40e2-b201-3555abdfc0db.jpg',
},
}, {
'url': 'https://ant1news.gr/Society/article/620286/symmoria-anilikon-dikigoros-thymaton-ithelan-na-toys-apoteleiosoyn',
'info_dict': {
'id': '620286',
'title': 'md5:91fe569e952e4d146485740ae927662b',
},
'playlist_mincount': 2,
'params': {
'skip_download': True,
},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
info = self._search_json_ld(webpage, video_id, expected_type='NewsArticle')
embed_urls = list(Ant1NewsGrEmbedIE._extract_embed_urls(url, webpage))
if not embed_urls:
raise ExtractorError('no videos found for %s' % video_id, expected=True)
return self.playlist_from_matches(
embed_urls, video_id, info.get('title'), ie=Ant1NewsGrEmbedIE.ie_key(),
video_kwargs={'url_transparent': True, 'timestamp': info.get('timestamp')})
class Ant1NewsGrEmbedIE(Ant1NewsGrBaseIE):
IE_NAME = 'ant1newsgr:embed'
IE_DESC = 'ant1news.gr embedded videos'
_BASE_PLAYER_URL_RE = r'(?:https?:)?//(?:[a-zA-Z0-9\-]+\.)?(?:antenna|ant1news)\.gr/templates/pages/player'
_VALID_URL = rf'{_BASE_PLAYER_URL_RE}\?([^#]+&)?cid=(?P<id>[^#&]+)'
_EMBED_REGEX = [rf'<iframe[^>]+?src=(?P<_q1>["\'])(?P<url>{_BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+)(?P=_q1)']
_API_PATH = '/news/templates/data/jsonPlayer'
_TESTS = [{
'url': 'https://www.antenna.gr/templates/pages/player?cid=3f_li_c_az_jw_y_u=&w=670&h=377',
'md5': 'dfc58c3a11a5a9aad2ba316ed447def3',
'info_dict': {
'id': '3f_li_c_az_jw_y_u=',
'ext': 'mp4',
'title': 'md5:a30c93332455f53e1e84ae0724f0adf7',
'thumbnail': 'https://ant1media.azureedge.net/imgHandler/640/bbe31201-3f09-4a4e-87f5-8ad2159fffe2.jpg',
},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
canonical_url = self._request_webpage(
HEADRequest(url), video_id,
note='Resolve canonical player URL',
errnote='Could not resolve canonical player URL').geturl()
_, netloc, _, _, query, _ = urllib.parse.urlparse(canonical_url)
cid = urllib.parse.parse_qs(query)['cid'][0]
return self._download_and_extract_api_data(video_id, netloc, cid=cid)

View File

@@ -1,6 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
import base64
import hashlib
import json
@@ -10,38 +7,68 @@ import time
from .common import InfoExtractor
from ..aes import aes_encrypt
from ..compat import compat_str
from ..utils import (
bytes_to_intlist,
determine_ext,
intlist_to_bytes,
int_or_none,
intlist_to_bytes,
join_nonempty,
smuggle_url,
strip_jsonp,
traverse_obj,
unescapeHTML,
unsmuggle_url,
)
# This import causes a ModuleNotFoundError on some systems for unknown reason.
# See issues:
# https://github.com/yt-dlp/yt-dlp/issues/35
# https://github.com/ytdl-org/youtube-dl/issues/27449
# https://github.com/animelover1984/youtube-dl/issues/17
try:
from .anvato_token_generator import NFLTokenGenerator
except ImportError:
NFLTokenGenerator = None
def md5_text(s):
if not isinstance(s, compat_str):
s = compat_str(s)
return hashlib.md5(s.encode('utf-8')).hexdigest()
return hashlib.md5(str(s).encode()).hexdigest()
class AnvatoIE(InfoExtractor):
_VALID_URL = r'anvato:(?P<access_key_or_mcp>[^:]+):(?P<id>\d+)'
_API_BASE_URL = 'https://tkx.mp.lura.live/rest/v2'
_ANVP_RE = r'<script[^>]+\bdata-anvp\s*=\s*(["\'])(?P<anvp>(?:(?!\1).)+)\1'
_AUTH_KEY = b'\x31\xc2\x42\x84\x9e\x73\xa0\xce' # from anvplayer.min.js
_TESTS = [{
# from https://www.nfl.com/videos/baker-mayfield-s-game-changing-plays-from-3-td-game-week-14
'url': 'anvato:GXvEgwyJeWem8KCYXfeoHWknwP48Mboj:899441',
'md5': '921919dab3cd0b849ff3d624831ae3e2',
'info_dict': {
'id': '899441',
'ext': 'mp4',
'title': 'Baker Mayfield\'s game-changing plays from 3-TD game Week 14',
'description': 'md5:85e05a3cc163f8c344340f220521136d',
'upload_date': '20201215',
'timestamp': 1608009755,
'thumbnail': r're:^https?://.*\.jpg',
'uploader': 'NFL',
'tags': ['Baltimore Ravens at Cleveland Browns (2020-REG-14)', 'Baker Mayfield', 'Game Highlights',
'Player Highlights', 'Cleveland Browns', 'league'],
'duration': 157,
'categories': ['Entertainment', 'Game', 'Highlights'],
},
}, {
# from https://ktla.com/news/99-year-old-woman-learns-to-fly-in-torrance-checks-off-bucket-list-dream/
'url': 'anvato:X8POa4zpGZMmeiq0wqiO8IP5rMqQM9VN:8032455',
'md5': '837718bcfb3a7778d022f857f7a9b19e',
'info_dict': {
'id': '8032455',
'ext': 'mp4',
'title': '99-year-old woman learns to fly plane in Torrance, checks off bucket list dream',
'description': 'md5:0a12bab8159445e78f52a297a35c6609',
'upload_date': '20220928',
'timestamp': 1664408881,
'thumbnail': r're:^https?://.*\.jpg',
'uploader': 'LIN',
'tags': ['video', 'news', '5live'],
'duration': 155,
'categories': ['News'],
},
}]
# Copied from anvplayer.min.js
_ANVACK_TABLE = {
'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ',
@@ -214,86 +241,74 @@ class AnvatoIE(InfoExtractor):
'telemundo': 'anvato_mcp_telemundo_web_prod_c5278d51ad46fda4b6ca3d0ea44a7846a054f582'
}
def _generate_nfl_token(self, anvack, mcp_id):
reroute = self._download_json(
'https://api.nfl.com/v1/reroute', mcp_id, data=b'grant_type=client_credentials',
headers={'X-Domain-Id': 100}, note='Fetching token info')
token_type = reroute.get('token_type') or 'Bearer'
auth_token = f'{token_type} {reroute["access_token"]}'
response = self._download_json(
'https://api.nfl.com/v3/shield/', mcp_id, data=json.dumps({
'query': '''{
viewer {
mediaToken(anvack: "%s", id: %s) {
token
}
}
}''' % (anvack, mcp_id),
}).encode(), headers={
'Authorization': auth_token,
'Content-Type': 'application/json',
}, note='Fetching NFL API token')
return traverse_obj(response, ('data', 'viewer', 'mediaToken', 'token'))
_TOKEN_GENERATORS = {
'GXvEgwyJeWem8KCYXfeoHWknwP48Mboj': NFLTokenGenerator,
'GXvEgwyJeWem8KCYXfeoHWknwP48Mboj': _generate_nfl_token,
}
_API_KEY = '3hwbSuqqT690uxjNYBktSQpa5ZrpYYR0Iofx7NcJHyA'
_ANVP_RE = r'<script[^>]+\bdata-anvp\s*=\s*(["\'])(?P<anvp>(?:(?!\1).)+)\1'
_AUTH_KEY = b'\x31\xc2\x42\x84\x9e\x73\xa0\xce'
_TESTS = [{
# from https://www.boston25news.com/news/watch-humpback-whale-breaches-right-next-to-fishing-boat-near-nh/817484874
'url': 'anvato:8v9BEynrwx8EFLYpgfOWcG1qJqyXKlRM:4465496',
'info_dict': {
'id': '4465496',
'ext': 'mp4',
'title': 'VIDEO: Humpback whale breaches right next to NH boat',
'description': 'VIDEO: Humpback whale breaches right next to NH boat. Footage courtesy: Zach Fahey.',
'duration': 22,
'timestamp': 1534855680,
'upload_date': '20180821',
'uploader': 'ANV',
},
'params': {
'skip_download': True,
},
}, {
# from https://sanfrancisco.cbslocal.com/2016/06/17/source-oakland-cop-on-leave-for-having-girlfriend-help-with-police-reports/
'url': 'anvato:DVzl9QRzox3ZZsP9bNu5Li3X7obQOnqP:3417601',
'only_matching': True,
}]
def __init__(self, *args, **kwargs):
super(AnvatoIE, self).__init__(*args, **kwargs)
self.__server_time = None
def _server_time(self, access_key, video_id):
if self.__server_time is not None:
return self.__server_time
return int_or_none(traverse_obj(self._download_json(
f'{self._API_BASE_URL}/server_time', video_id, query={'anvack': access_key},
note='Fetching server time', fatal=False), 'server_time')) or int(time.time())
self.__server_time = int(self._download_json(
self._api_prefix(access_key) + 'server_time?anvack=' + access_key, video_id,
note='Fetching server time')['server_time'])
return self.__server_time
def _api_prefix(self, access_key):
return 'https://tkx2-%s.anvato.net/rest/v2/' % ('prod' if 'prod' in access_key else 'stage')
def _get_video_json(self, access_key, video_id):
def _get_video_json(self, access_key, video_id, extracted_token):
# See et() in anvplayer.min.js, which is an alias of getVideoJSON()
video_data_url = self._api_prefix(access_key) + 'mcp/video/%s?anvack=%s' % (video_id, access_key)
video_data_url = f'{self._API_BASE_URL}/mcp/video/{video_id}?anvack={access_key}'
server_time = self._server_time(access_key, video_id)
input_data = '%d~%s~%s' % (server_time, md5_text(video_data_url), md5_text(server_time))
input_data = f'{server_time}~{md5_text(video_data_url)}~{md5_text(server_time)}'
auth_secret = intlist_to_bytes(aes_encrypt(
bytes_to_intlist(input_data[:64]), bytes_to_intlist(self._AUTH_KEY)))
video_data_url += '&X-Anvato-Adst-Auth=' + base64.b64encode(auth_secret).decode('ascii')
query = {
'X-Anvato-Adst-Auth': base64.b64encode(auth_secret).decode('ascii'),
'rtyp': 'fp',
}
anvrid = md5_text(time.time() * 1000 * random.random())[:30]
api = {
'anvrid': anvrid,
'anvts': server_time,
}
if self._TOKEN_GENERATORS.get(access_key) is not None:
api['anvstk2'] = self._TOKEN_GENERATORS[access_key].generate(self, access_key, video_id)
if extracted_token is not None:
api['anvstk2'] = extracted_token
elif self._TOKEN_GENERATORS.get(access_key) is not None:
api['anvstk2'] = self._TOKEN_GENERATORS[access_key](self, access_key, video_id)
elif self._ANVACK_TABLE.get(access_key) is not None:
api['anvstk'] = md5_text(f'{access_key}|{anvrid}|{server_time}|{self._ANVACK_TABLE[access_key]}')
else:
api['anvstk'] = md5_text('%s|%s|%d|%s' % (
access_key, anvrid, server_time,
self._ANVACK_TABLE.get(access_key, self._API_KEY)))
api['anvstk2'] = 'default'
return self._download_json(
video_data_url, video_id, transform_source=strip_jsonp,
data=json.dumps({'api': api}).encode('utf-8'))
video_data_url, video_id, transform_source=strip_jsonp, query=query,
data=json.dumps({'api': api}, separators=(',', ':')).encode('utf-8'))
def _get_anvato_videos(self, access_key, video_id):
video_data = self._get_video_json(access_key, video_id)
def _get_anvato_videos(self, access_key, video_id, token):
video_data = self._get_video_json(access_key, video_id, token)
formats = []
for published_url in video_data['published_urls']:
video_url = published_url['embed_url']
video_url = published_url.get('embed_url')
if not video_url:
continue
media_format = published_url.get('format')
ext = determine_ext(video_url)
@@ -308,15 +323,27 @@ class AnvatoIE(InfoExtractor):
'tbr': tbr or None,
}
if media_format == 'm3u8' and tbr is not None:
vtt_subs, hls_subs = {}, {}
if media_format == 'vtt':
_, vtt_subs = self._extract_m3u8_formats_and_subtitles(
video_url, video_id, m3u8_id='vtt', fatal=False)
continue
elif media_format == 'm3u8' and tbr is not None:
a_format.update({
'format_id': join_nonempty('hls', tbr),
'ext': 'mp4',
})
elif media_format == 'm3u8-variant' or ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
video_url, video_id, 'mp4', entry_protocol='m3u8_native',
m3u8_id='hls', fatal=False))
# For some videos the initial m3u8 URL returns JSON instead
manifest_json = self._download_json(
video_url, video_id, note='Downloading manifest JSON', errnote=False)
if manifest_json:
video_url = manifest_json.get('master_m3u8')
if not video_url:
continue
hls_fmts, hls_subs = self._extract_m3u8_formats_and_subtitles(
video_url, video_id, ext='mp4', m3u8_id='hls', fatal=False)
formats.extend(hls_fmts)
continue
elif ext == 'mp3' or media_format == 'mp3':
a_format['vcodec'] = 'none'
@@ -327,8 +354,6 @@ class AnvatoIE(InfoExtractor):
})
formats.append(a_format)
self._sort_formats(formats)
subtitles = {}
for caption in video_data.get('captions', []):
a_caption = {
@@ -336,6 +361,7 @@ class AnvatoIE(InfoExtractor):
'ext': 'tt' if caption.get('format') == 'SMPTE-TT' else None
}
subtitles.setdefault(caption['language'], []).append(a_caption)
subtitles = self._merge_subtitles(subtitles, hls_subs, vtt_subs)
return {
'id': video_id,
@@ -352,30 +378,19 @@ class AnvatoIE(InfoExtractor):
'subtitles': subtitles,
}
@staticmethod
def _extract_urls(ie, webpage, video_id):
entries = []
for mobj in re.finditer(AnvatoIE._ANVP_RE, webpage):
anvplayer_data = ie._parse_json(
mobj.group('anvp'), video_id, transform_source=unescapeHTML,
fatal=False)
if not anvplayer_data:
continue
video = anvplayer_data.get('video')
if not isinstance(video, compat_str) or not video.isdigit():
continue
access_key = anvplayer_data.get('accessKey')
if not access_key:
mcp = anvplayer_data.get('mcp')
if mcp:
access_key = AnvatoIE._MCP_TO_ACCESS_KEY_TABLE.get(
mcp.lower())
@classmethod
def _extract_from_webpage(cls, url, webpage):
for mobj in re.finditer(cls._ANVP_RE, webpage):
anvplayer_data = unescapeHTML(json.loads(mobj.group('anvp'))) or {}
video_id, access_key = anvplayer_data.get('video'), anvplayer_data.get('accessKey')
if not access_key:
access_key = cls._MCP_TO_ACCESS_KEY_TABLE.get((anvplayer_data.get('mcp') or '').lower())
if not (video_id or '').isdigit() or not access_key:
continue
entries.append(ie.url_result(
'anvato:%s:%s' % (access_key, video), ie=AnvatoIE.ie_key(),
video_id=video))
return entries
url = f'anvato:{access_key}:{video_id}'
if anvplayer_data.get('token'):
url = smuggle_url(url, {'token': anvplayer_data['token']})
yield cls.url_result(url, AnvatoIE, video_id)
def _extract_anvato_videos(self, webpage, video_id):
anvplayer_data = self._parse_json(
@@ -383,7 +398,7 @@ class AnvatoIE(InfoExtractor):
self._ANVP_RE, webpage, 'Anvato player data', group='anvp'),
video_id)
return self._get_anvato_videos(
anvplayer_data['accessKey'], anvplayer_data['video'])
anvplayer_data['accessKey'], anvplayer_data['video'], 'default') # cbslocal token = 'default'
def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url, {})
@@ -391,9 +406,7 @@ class AnvatoIE(InfoExtractor):
'countries': smuggled_data.get('geo_countries'),
})
mobj = self._match_valid_url(url)
access_key, video_id = mobj.group('access_key_or_mcp', 'id')
access_key, video_id = self._match_valid_url(url).group('access_key_or_mcp', 'id')
if access_key not in self._ANVACK_TABLE:
access_key = self._MCP_TO_ACCESS_KEY_TABLE.get(
access_key) or access_key
return self._get_anvato_videos(access_key, video_id)
access_key = self._MCP_TO_ACCESS_KEY_TABLE.get(access_key) or access_key
return self._get_anvato_videos(access_key, video_id, smuggled_data.get('token'))

View File

@@ -1,7 +0,0 @@
from __future__ import unicode_literals
from .nfl import NFLTokenGenerator
__all__ = [
'NFLTokenGenerator',
]

View File

@@ -1,6 +0,0 @@
from __future__ import unicode_literals
class TokenGenerator:
def generate(self, anvack, mcp_id):
raise NotImplementedError('This method must be implemented by subclasses')

View File

@@ -1,30 +0,0 @@
from __future__ import unicode_literals
import json
from .common import TokenGenerator
class NFLTokenGenerator(TokenGenerator):
_AUTHORIZATION = None
def generate(ie, anvack, mcp_id):
if not NFLTokenGenerator._AUTHORIZATION:
reroute = ie._download_json(
'https://api.nfl.com/v1/reroute', mcp_id,
data=b'grant_type=client_credentials',
headers={'X-Domain-Id': 100})
NFLTokenGenerator._AUTHORIZATION = '%s %s' % (reroute.get('token_type') or 'Bearer', reroute['access_token'])
return ie._download_json(
'https://api.nfl.com/v3/shield/', mcp_id, data=json.dumps({
'query': '''{
viewer {
mediaToken(anvack: "%s", id: %s) {
token
}
}
}''' % (anvack, mcp_id),
}).encode(), headers={
'Authorization': NFLTokenGenerator._AUTHORIZATION,
'Content-Type': 'application/json',
})['data']['viewer']['mediaToken']['token']

View File

@@ -1,6 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .yahoo import YahooIE
@@ -12,7 +9,7 @@ from ..utils import (
)
class AolIE(YahooIE):
class AolIE(YahooIE): # XXX: Do not subclass from concrete IE
IE_NAME = 'aol.com'
_VALID_URL = r'(?:aol-video:|https?://(?:www\.)?aol\.(?:com|ca|co\.uk|de|jp)/video/(?:[^/]+/)*)(?P<id>\d{9}|[0-9a-f]{24}|[0-9a-f]{8}-(?:[0-9a-f]{4}-){3}[0-9a-f]{12})'
@@ -122,7 +119,6 @@ class AolIE(YahooIE):
'height': int_or_none(qs.get('h', [None])[0]),
})
formats.append(f)
self._sort_formats(formats)
return {
'id': video_id,

View File

@@ -1,8 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
determine_ext,
@@ -13,6 +8,7 @@ from ..utils import (
class APAIE(InfoExtractor):
_VALID_URL = r'(?P<base_url>https?://[^/]+\.apa\.at)/embed/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
_EMBED_REGEX = [r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//[^/]+\.apa\.at/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}.*?)\1']
_TESTS = [{
'url': 'http://uvp.apa.at/embed/293f6d17-692a-44e3-9fd5-7b178f3a1029',
'md5': '2b12292faeb0a7d930c778c7a5b4759b',
@@ -33,14 +29,6 @@ class APAIE(InfoExtractor):
'only_matching': True,
}]
@staticmethod
def _extract_urls(webpage):
return [
mobj.group('url')
for mobj in re.finditer(
r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//[^/]+\.apa\.at/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}.*?)\1',
webpage)]
def _real_extract(self, url):
mobj = self._match_valid_url(url)
video_id, base_url = mobj.group('id', 'base_url')
@@ -84,7 +72,6 @@ class APAIE(InfoExtractor):
'format_id': format_id,
'height': height,
})
self._sort_formats(formats)
return {
'id': video_id,

View File

@@ -1,6 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
get_element_by_id,
@@ -13,6 +10,7 @@ from ..utils import (
class AparatIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?aparat\.com/(?:v/|video/video/embed/videohash/)(?P<id>[a-zA-Z0-9]+)'
_EMBED_REGEX = [r'<iframe .*?src="(?P<url>http://www\.aparat\.com/video/[^"]+)"']
_TESTS = [{
'url': 'http://www.aparat.com/v/wP8On',
@@ -75,7 +73,6 @@ class AparatIE(InfoExtractor):
r'(\d+)[pP]', label or '', 'height',
default=None)),
})
self._sort_formats(formats)
info = self._search_json_ld(webpage, video_id, default={})

View File

@@ -1,6 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
str_to_int,

View File

@@ -1,9 +1,8 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
clean_html,
clean_podcast_url,
get_element_by_class,
int_or_none,
parse_iso8601,
try_get,
@@ -14,16 +13,17 @@ class ApplePodcastsIE(InfoExtractor):
_VALID_URL = r'https?://podcasts\.apple\.com/(?:[^/]+/)?podcast(?:/[^/]+){1,2}.*?\bi=(?P<id>\d+)'
_TESTS = [{
'url': 'https://podcasts.apple.com/us/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777',
'md5': 'df02e6acb11c10e844946a39e7222b08',
'md5': '41dc31cd650143e530d9423b6b5a344f',
'info_dict': {
'id': '1000482637777',
'ext': 'mp3',
'title': '207 - Whitney Webb Returns',
'description': 'md5:13a73bade02d2e43737751e3987e1399',
'description': 'md5:75ef4316031df7b41ced4e7b987f79c6',
'upload_date': '20200705',
'timestamp': 1593921600,
'duration': 6425,
'timestamp': 1593932400,
'duration': 6454,
'series': 'The Tim Dillon Show',
'thumbnail': 're:.+[.](png|jpe?g|webp)',
}
}, {
'url': 'https://podcasts.apple.com/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777',
@@ -39,24 +39,47 @@ class ApplePodcastsIE(InfoExtractor):
def _real_extract(self, url):
episode_id = self._match_id(url)
webpage = self._download_webpage(url, episode_id)
ember_data = self._parse_json(self._search_regex(
r'id="shoebox-ember-data-store"[^>]*>\s*({.+?})\s*<',
webpage, 'ember data'), episode_id)
ember_data = ember_data.get(episode_id) or ember_data
episode = ember_data['data']['attributes']
episode_data = {}
ember_data = {}
# new page type 2021-11
amp_data = self._parse_json(self._search_regex(
r'(?s)id="shoebox-media-api-cache-amp-podcasts"[^>]*>\s*({.+?})\s*<',
webpage, 'AMP data', default='{}'), episode_id, fatal=False) or {}
amp_data = try_get(amp_data,
lambda a: self._parse_json(
next(a[x] for x in iter(a) if episode_id in x),
episode_id),
dict) or {}
amp_data = amp_data.get('d') or []
episode_data = try_get(
amp_data,
lambda a: next(x for x in a
if x['type'] == 'podcast-episodes' and x['id'] == episode_id),
dict)
if not episode_data:
# try pre 2021-11 page type: TODO: consider deleting if no longer used
ember_data = self._parse_json(self._search_regex(
r'(?s)id="shoebox-ember-data-store"[^>]*>\s*({.+?})\s*<',
webpage, 'ember data'), episode_id) or {}
ember_data = ember_data.get(episode_id) or ember_data
episode_data = try_get(ember_data, lambda x: x['data'], dict)
episode = episode_data['attributes']
description = episode.get('description') or {}
series = None
for inc in (ember_data.get('included') or []):
for inc in (amp_data or ember_data.get('included') or []):
if inc.get('type') == 'media/podcast':
series = try_get(inc, lambda x: x['attributes']['name'])
series = series or clean_html(get_element_by_class('podcast-header__identity', webpage))
return {
'id': episode_id,
'title': episode['name'],
'title': episode.get('name'),
'url': clean_podcast_url(episode['assetUrl']),
'description': description.get('standard') or description.get('short'),
'timestamp': parse_iso8601(episode.get('releaseDateTime')),
'duration': int_or_none(episode.get('durationInMilliseconds'), 1000),
'series': series,
'thumbnail': self._og_search_thumbnail(webpage),
'vcodec': 'none',
}

View File

@@ -1,5 +1,3 @@
from __future__ import unicode_literals
import re
import json
@@ -122,7 +120,6 @@ class AppleTrailersIE(InfoExtractor):
'height': int_or_none(size_data.get('height')),
'language': version[:2],
})
self._sort_formats(formats)
entries.append({
'id': movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', clip_title).lower(),
@@ -187,8 +184,6 @@ class AppleTrailersIE(InfoExtractor):
'height': int_or_none(format['height']),
})
self._sort_formats(formats)
playlist.append({
'_type': 'video',
'id': video_id,

View File

@@ -1,39 +1,37 @@
# coding: utf-8
from __future__ import unicode_literals
import re
import json
import re
import urllib.error
import urllib.parse
from .common import InfoExtractor
from .youtube import YoutubeIE, YoutubeBaseInfoExtractor
from ..compat import (
compat_urllib_parse_unquote,
compat_urllib_parse_unquote_plus,
compat_HTTPError
)
from .naver import NaverBaseIE
from .youtube import YoutubeBaseInfoExtractor, YoutubeIE
from ..compat import compat_HTTPError, compat_urllib_parse_unquote
from ..utils import (
KNOWN_EXTENSIONS,
ExtractorError,
HEADRequest,
bug_reports_message,
clean_html,
dict_get,
extract_attributes,
ExtractorError,
get_element_by_id,
HEADRequest,
int_or_none,
join_nonempty,
KNOWN_EXTENSIONS,
js_to_json,
merge_dicts,
mimetype2ext,
orderedSet,
parse_duration,
parse_qs,
str_to_int,
str_or_none,
str_to_int,
traverse_obj,
try_get,
unified_strdate,
unified_timestamp,
url_or_none,
urlhandle_detect_ext,
url_or_none
)
@@ -54,6 +52,11 @@ class ArchiveOrgIE(InfoExtractor):
'upload_date': '20100315',
'creator': 'SRI International',
'uploader': 'laura@archive.org',
'thumbnail': r're:https://archive\.org/download/.*\.jpg',
'release_year': 1968,
'display_id': 'XD300-23_68HighlightsAResearchCntAugHumanIntellect.cdr',
'track': 'XD300-23 68HighlightsAResearchCntAugHumanIntellect',
},
}, {
'url': 'https://archive.org/details/Cops1922',
@@ -62,33 +65,43 @@ class ArchiveOrgIE(InfoExtractor):
'id': 'Cops1922',
'ext': 'mp4',
'title': 'Buster Keaton\'s "Cops" (1922)',
'description': 'md5:43a603fd6c5b4b90d12a96b921212b9c',
'description': 'md5:cd6f9910c35aedd5fc237dbc3957e2ca',
'uploader': 'yorkmba99@hotmail.com',
'timestamp': 1387699629,
'upload_date': '20131222',
'display_id': 'Cops-v2.mp4',
'thumbnail': r're:https://archive\.org/download/.*\.jpg',
'duration': 1091.96,
},
}, {
'url': 'http://archive.org/embed/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
'only_matching': True,
}, {
'url': 'https://archive.org/details/Election_Ads',
'md5': '284180e857160cf866358700bab668a3',
'md5': 'eec5cddebd4793c6a653b69c3b11f2e6',
'info_dict': {
'id': 'Election_Ads/Commercial-JFK1960ElectionAdCampaignJingle.mpg',
'title': 'Commercial-JFK1960ElectionAdCampaignJingle.mpg',
'ext': 'mp4',
'ext': 'mpg',
'thumbnail': r're:https://archive\.org/download/.*\.jpg',
'duration': 59.77,
'display_id': 'Commercial-JFK1960ElectionAdCampaignJingle.mpg',
},
}, {
'url': 'https://archive.org/details/Election_Ads/Commercial-Nixon1960ElectionAdToughonDefense.mpg',
'md5': '7915213ef02559b5501fe630e1a53f59',
'md5': 'ea1eed8234e7d4165f38c8c769edef38',
'info_dict': {
'id': 'Election_Ads/Commercial-Nixon1960ElectionAdToughonDefense.mpg',
'title': 'Commercial-Nixon1960ElectionAdToughonDefense.mpg',
'ext': 'mp4',
'ext': 'mpg',
'timestamp': 1205588045,
'uploader': 'mikedavisstripmaster@yahoo.com',
'description': '1960 Presidential Campaign Election Commercials John F Kennedy, Richard M Nixon',
'upload_date': '20080315',
'display_id': 'Commercial-Nixon1960ElectionAdToughonDefense.mpg',
'duration': 59.51,
'license': 'http://creativecommons.org/licenses/publicdomain/',
'thumbnail': r're:https://archive\.org/download/.*\.jpg',
},
}, {
'url': 'https://archive.org/details/gd1977-05-08.shure57.stevenson.29303.flac16',
@@ -97,6 +110,12 @@ class ArchiveOrgIE(InfoExtractor):
'id': 'gd1977-05-08.shure57.stevenson.29303.flac16/gd1977-05-08d01t01.flac',
'title': 'Turning',
'ext': 'flac',
'track': 'Turning',
'creator': 'Grateful Dead',
'display_id': 'gd1977-05-08d01t01.flac',
'track_number': 1,
'album': '1977-05-08 - Barton Hall - Cornell University',
'duration': 39.8,
},
}, {
'url': 'https://archive.org/details/gd1977-05-08.shure57.stevenson.29303.flac16/gd1977-05-08d01t07.flac',
@@ -107,11 +126,20 @@ class ArchiveOrgIE(InfoExtractor):
'ext': 'flac',
'timestamp': 1205895624,
'uploader': 'mvernon54@yahoo.com',
'description': 'md5:6a31f1996db0aa0fc9da6d6e708a1bb0',
'description': 'md5:6c921464414814720c6593810a5c7e3d',
'upload_date': '20080319',
'location': 'Barton Hall - Cornell University',
'duration': 438.68,
'track': 'Deal',
'creator': 'Grateful Dead',
'album': '1977-05-08 - Barton Hall - Cornell University',
'release_date': '19770508',
'display_id': 'gd1977-05-08d01t07.flac',
'release_year': 1977,
'track_number': 7,
},
}, {
# FIXME: give a better error message than just IndexError when all available formats are restricted
'url': 'https://archive.org/details/lp_the-music-of-russia_various-artists-a-askaryan-alexander-melik',
'md5': '7cb019baa9b332e82ea7c10403acd180',
'info_dict': {
@@ -119,6 +147,7 @@ class ArchiveOrgIE(InfoExtractor):
'title': 'Bells Of Rostov',
'ext': 'mp3',
},
'skip': 'restricted'
}, {
'url': 'https://archive.org/details/lp_the-music-of-russia_various-artists-a-askaryan-alexander-melik/disc1/02.02.+Song+And+Chorus+In+The+Polovetsian+Camp+From+%22Prince+Igor%22+(Act+2%2C+Scene+1).mp3',
'md5': '1d0aabe03edca83ca58d9ed3b493a3c3',
@@ -131,6 +160,52 @@ class ArchiveOrgIE(InfoExtractor):
'description': 'md5:012b2d668ae753be36896f343d12a236',
'upload_date': '20190928',
},
'skip': 'restricted'
}, {
# Original formats are private
'url': 'https://archive.org/details/irelandthemakingofarepublic',
'info_dict': {
'id': 'irelandthemakingofarepublic',
'title': 'Ireland: The Making of a Republic',
'upload_date': '20160610',
'description': 'md5:f70956a156645a658a0dc9513d9e78b7',
'uploader': 'dimitrios@archive.org',
'creator': ['British Broadcasting Corporation', 'Time-Life Films'],
'timestamp': 1465594947,
},
'playlist': [
{
'md5': '0b211261b26590d49df968f71b90690d',
'info_dict': {
'id': 'irelandthemakingofarepublic/irelandthemakingofarepublicreel1_01.mov',
'ext': 'mp4',
'title': 'irelandthemakingofarepublicreel1_01.mov',
'duration': 130.46,
'thumbnail': 'https://archive.org/download/irelandthemakingofarepublic/irelandthemakingofarepublic.thumbs/irelandthemakingofarepublicreel1_01_000117.jpg',
'display_id': 'irelandthemakingofarepublicreel1_01.mov',
},
}, {
'md5': '67335ee3b23a0da930841981c1e79b02',
'info_dict': {
'id': 'irelandthemakingofarepublic/irelandthemakingofarepublicreel1_02.mov',
'ext': 'mp4',
'duration': 1395.13,
'title': 'irelandthemakingofarepublicreel1_02.mov',
'display_id': 'irelandthemakingofarepublicreel1_02.mov',
'thumbnail': 'https://archive.org/download/irelandthemakingofarepublic/irelandthemakingofarepublic.thumbs/irelandthemakingofarepublicreel1_02_001374.jpg',
},
}, {
'md5': 'e470e86787893603f4a341a16c281eb5',
'info_dict': {
'id': 'irelandthemakingofarepublic/irelandthemakingofarepublicreel2.mov',
'ext': 'mp4',
'duration': 1602.67,
'title': 'irelandthemakingofarepublicreel2.mov',
'thumbnail': 'https://archive.org/download/irelandthemakingofarepublic/irelandthemakingofarepublic.thumbs/irelandthemakingofarepublicreel2_001554.jpg',
'display_id': 'irelandthemakingofarepublicreel2.mov',
},
}
]
}]
@staticmethod
@@ -146,7 +221,7 @@ class ArchiveOrgIE(InfoExtractor):
return json.loads(extract_attributes(element)['value'])
def _real_extract(self, url):
video_id = compat_urllib_parse_unquote_plus(self._match_id(url))
video_id = urllib.parse.unquote_plus(self._match_id(url))
identifier, entry_id = (video_id.split('/', 1) + [None])[:2]
# Archive.org metadata API doesn't clearly demarcate playlist entries
@@ -221,17 +296,25 @@ class ArchiveOrgIE(InfoExtractor):
'filesize': int_or_none(f.get('size'))})
extension = (f['name'].rsplit('.', 1) + [None])[1]
if extension in KNOWN_EXTENSIONS:
# We don't want to skip private formats if the user has access to them,
# however without access to an account with such privileges we can't implement/test this.
# For now to be safe, we will only skip them if there is no user logged in.
is_logged_in = bool(self._get_cookies('https://archive.org').get('logged-in-sig'))
if extension in KNOWN_EXTENSIONS and (not f.get('private') or is_logged_in):
entry['formats'].append({
'url': 'https://archive.org/download/' + identifier + '/' + f['name'],
'format': f.get('format'),
'width': int_or_none(f.get('width')),
'height': int_or_none(f.get('height')),
'filesize': int_or_none(f.get('size')),
'protocol': 'https'})
'protocol': 'https',
'source_preference': 0 if f.get('source') == 'original' else -1,
'format_note': f.get('source')
})
for entry in entries.values():
self._sort_formats(entry['formats'])
entry['_format_sort_fields'] = ('source', )
if len(entries) == 1:
# If there's only one item, use it as the main info dict
@@ -287,7 +370,9 @@ class YoutubeWebArchiveIE(InfoExtractor):
'channel_id': 'UCukCyHaD-bK3in_pKpfH9Eg',
'duration': 32,
'uploader_id': 'Zeurel',
'uploader_url': 'http://www.youtube.com/user/Zeurel'
'uploader_url': 'https://www.youtube.com/user/Zeurel',
'thumbnail': r're:https?://.*\.(jpg|webp)',
'channel_url': 'https://www.youtube.com/channel/UCukCyHaD-bK3in_pKpfH9Eg',
}
}, {
# Internal link
@@ -302,7 +387,9 @@ class YoutubeWebArchiveIE(InfoExtractor):
'channel_id': 'UCHnyfMqiRRG1u-2MsSQLbXA',
'duration': 771,
'uploader_id': '1veritasium',
'uploader_url': 'http://www.youtube.com/user/1veritasium'
'uploader_url': 'https://www.youtube.com/user/1veritasium',
'thumbnail': r're:https?://.*\.(jpg|webp)',
'channel_url': 'https://www.youtube.com/channel/UCHnyfMqiRRG1u-2MsSQLbXA',
}
}, {
# Video from 2012, webm format itag 45. Newest capture is deleted video, with an invalid description.
@@ -316,7 +403,9 @@ class YoutubeWebArchiveIE(InfoExtractor):
'duration': 398,
'description': 'md5:ff4de6a7980cb65d951c2f6966a4f2f3',
'uploader_id': 'machinima',
'uploader_url': 'http://www.youtube.com/user/machinima'
'uploader_url': 'https://www.youtube.com/user/machinima',
'thumbnail': r're:https?://.*\.(jpg|webp)',
'uploader': 'machinima'
}
}, {
# FLV video. Video file URL does not provide itag information
@@ -330,7 +419,10 @@ class YoutubeWebArchiveIE(InfoExtractor):
'duration': 19,
'description': 'md5:10436b12e07ac43ff8df65287a56efb4',
'uploader_id': 'jawed',
'uploader_url': 'http://www.youtube.com/user/jawed'
'uploader_url': 'https://www.youtube.com/user/jawed',
'channel_url': 'https://www.youtube.com/channel/UC4QobU6STFB0P71PMvOGN5A',
'thumbnail': r're:https?://.*\.(jpg|webp)',
'uploader': 'jawed',
}
}, {
'url': 'https://web.archive.org/web/20110712231407/http://www.youtube.com/watch?v=lTx3G6h2xyA',
@@ -344,7 +436,9 @@ class YoutubeWebArchiveIE(InfoExtractor):
'duration': 204,
'description': 'md5:f7535343b6eda34a314eff8b85444680',
'uploader_id': 'itsmadeon',
'uploader_url': 'http://www.youtube.com/user/itsmadeon'
'uploader_url': 'https://www.youtube.com/user/itsmadeon',
'channel_url': 'https://www.youtube.com/channel/UCqMDNf3Pn5L7pcNkuSEeO3w',
'thumbnail': r're:https?://.*\.(jpg|webp)',
}
}, {
# First capture is of dead video, second is the oldest from CDX response.
@@ -355,10 +449,13 @@ class YoutubeWebArchiveIE(InfoExtractor):
'title': 'Fake Teen Doctor Strikes AGAIN! - Weekly Weird News',
'upload_date': '20160218',
'channel_id': 'UCdIaNUarhzLSXGoItz7BHVA',
'duration': 1236,
'duration': 1235,
'description': 'md5:21032bae736421e89c2edf36d1936947',
'uploader_id': 'MachinimaETC',
'uploader_url': 'http://www.youtube.com/user/MachinimaETC'
'uploader_url': 'https://www.youtube.com/user/MachinimaETC',
'channel_url': 'https://www.youtube.com/channel/UCdIaNUarhzLSXGoItz7BHVA',
'thumbnail': r're:https?://.*\.(jpg|webp)',
'uploader': 'ETC News',
}
}, {
# First capture of dead video, capture date in link links to dead capture.
@@ -369,10 +466,13 @@ class YoutubeWebArchiveIE(InfoExtractor):
'title': 'WTF: Video Games Still Launch BROKEN?! - T.U.G.S.',
'upload_date': '20160219',
'channel_id': 'UCdIaNUarhzLSXGoItz7BHVA',
'duration': 798,
'duration': 797,
'description': 'md5:a1dbf12d9a3bd7cb4c5e33b27d77ffe7',
'uploader_id': 'MachinimaETC',
'uploader_url': 'http://www.youtube.com/user/MachinimaETC'
'uploader_url': 'https://www.youtube.com/user/MachinimaETC',
'channel_url': 'https://www.youtube.com/channel/UCdIaNUarhzLSXGoItz7BHVA',
'thumbnail': r're:https?://.*\.(jpg|webp)',
'uploader': 'ETC News',
},
'expected_warnings': [
r'unable to download capture webpage \(it may not be archived\)'
@@ -392,12 +492,11 @@ class YoutubeWebArchiveIE(InfoExtractor):
'title': 'It\'s Bootleg AirPods Time.',
'upload_date': '20211021',
'channel_id': 'UC7Jwj9fkrf1adN4fMmTkpug',
'channel_url': 'http://www.youtube.com/channel/UC7Jwj9fkrf1adN4fMmTkpug',
'channel_url': 'https://www.youtube.com/channel/UC7Jwj9fkrf1adN4fMmTkpug',
'duration': 810,
'description': 'md5:7b567f898d8237b256f36c1a07d6d7bc',
'thumbnail': r're:https?://.*\.(jpg|webp)',
'uploader': 'DankPods',
'uploader_id': 'UC7Jwj9fkrf1adN4fMmTkpug',
'uploader_url': 'http://www.youtube.com/channel/UC7Jwj9fkrf1adN4fMmTkpug'
}
}, {
# player response contains '};' See: https://github.com/ytdl-org/youtube-dl/issues/27093
@@ -408,12 +507,135 @@ class YoutubeWebArchiveIE(InfoExtractor):
'title': 'bitch lasagna',
'upload_date': '20181005',
'channel_id': 'UC-lHJZR3Gqxm24_Vd_AJ5Yw',
'channel_url': 'http://www.youtube.com/channel/UC-lHJZR3Gqxm24_Vd_AJ5Yw',
'channel_url': 'https://www.youtube.com/channel/UC-lHJZR3Gqxm24_Vd_AJ5Yw',
'duration': 135,
'description': 'md5:2dbe4051feeff2dab5f41f82bb6d11d0',
'uploader': 'PewDiePie',
'uploader_id': 'PewDiePie',
'uploader_url': 'http://www.youtube.com/user/PewDiePie'
'uploader_url': 'https://www.youtube.com/user/PewDiePie',
'thumbnail': r're:https?://.*\.(jpg|webp)',
}
}, {
# ~June 2010 Capture. swfconfig
'url': 'https://web.archive.org/web/0/https://www.youtube.com/watch?v=8XeW5ilk-9Y',
'info_dict': {
'id': '8XeW5ilk-9Y',
'ext': 'flv',
'title': 'Story of Stuff, The Critique Part 4 of 4',
'duration': 541,
'description': 'md5:28157da06f2c5e94c97f7f3072509972',
'uploader': 'HowTheWorldWorks',
'uploader_id': 'HowTheWorldWorks',
'thumbnail': r're:https?://.*\.(jpg|webp)',
'uploader_url': 'https://www.youtube.com/user/HowTheWorldWorks',
'upload_date': '20090520',
}
}, {
# Jan 2011: watch-video-date/eow-date surrounded by whitespace
'url': 'https://web.archive.org/web/20110126141719/http://www.youtube.com/watch?v=Q_yjX80U7Yc',
'info_dict': {
'id': 'Q_yjX80U7Yc',
'ext': 'flv',
'title': 'Spray Paint Art by Clay Butler: Purple Fantasy Forest',
'uploader_id': 'claybutlermusic',
'description': 'md5:4595264559e3d0a0ceb3f011f6334543',
'upload_date': '20090803',
'uploader': 'claybutlermusic',
'thumbnail': r're:https?://.*\.(jpg|webp)',
'duration': 132,
'uploader_url': 'https://www.youtube.com/user/claybutlermusic',
}
}, {
# ~May 2009 swfArgs. ytcfg is spread out over various vars
'url': 'https://web.archive.org/web/0/https://www.youtube.com/watch?v=c5uJgG05xUY',
'info_dict': {
'id': 'c5uJgG05xUY',
'ext': 'webm',
'title': 'Story of Stuff, The Critique Part 1 of 4',
'uploader_id': 'HowTheWorldWorks',
'uploader': 'HowTheWorldWorks',
'uploader_url': 'https://www.youtube.com/user/HowTheWorldWorks',
'upload_date': '20090513',
'description': 'md5:4ca77d79538064e41e4cc464e93f44f0',
'thumbnail': r're:https?://.*\.(jpg|webp)',
'duration': 754,
}
}, {
# ~June 2012. Upload date is in another lang so cannot extract.
'url': 'https://web.archive.org/web/20120607174520/http://www.youtube.com/watch?v=xWTLLl-dQaA',
'info_dict': {
'id': 'xWTLLl-dQaA',
'ext': 'mp4',
'title': 'Black Nerd eHarmony Video Bio Parody (SPOOF)',
'uploader_url': 'https://www.youtube.com/user/BlackNerdComedy',
'description': 'md5:e25f0133aaf9e6793fb81c18021d193e',
'uploader_id': 'BlackNerdComedy',
'uploader': 'BlackNerdComedy',
'duration': 182,
'thumbnail': r're:https?://.*\.(jpg|webp)',
}
}, {
# ~July 2013
'url': 'https://web.archive.org/web/*/https://www.youtube.com/watch?v=9eO1aasHyTM',
'info_dict': {
'id': '9eO1aasHyTM',
'ext': 'mp4',
'title': 'Polar-oid',
'description': 'Cameras and bears are dangerous!',
'uploader_url': 'https://www.youtube.com/user/punkybird',
'uploader_id': 'punkybird',
'duration': 202,
'channel_id': 'UC62R2cBezNBOqxSerfb1nMQ',
'channel_url': 'https://www.youtube.com/channel/UC62R2cBezNBOqxSerfb1nMQ',
'upload_date': '20060428',
'uploader': 'punkybird',
}
}, {
# April 2020: Player response in player config
'url': 'https://web.archive.org/web/20200416034815/https://www.youtube.com/watch?v=Cf7vS8jc7dY&gl=US&hl=en',
'info_dict': {
'id': 'Cf7vS8jc7dY',
'ext': 'mp4',
'title': 'A Dramatic Pool Story (by Jamie Spicer-Lewis) - Game Grumps Animated',
'duration': 64,
'upload_date': '20200408',
'uploader_id': 'GameGrumps',
'uploader': 'GameGrumps',
'channel_url': 'https://www.youtube.com/channel/UC9CuvdOVfMPvKCiwdGKL3cQ',
'channel_id': 'UC9CuvdOVfMPvKCiwdGKL3cQ',
'thumbnail': r're:https?://.*\.(jpg|webp)',
'description': 'md5:c625bb3c02c4f5fb4205971e468fa341',
'uploader_url': 'https://www.youtube.com/user/GameGrumps',
}
}, {
# watch7-user-header with yt-user-info
'url': 'ytarchive:kbh4T_b4Ixw:20160307085057',
'info_dict': {
'id': 'kbh4T_b4Ixw',
'ext': 'mp4',
'title': 'Shovel Knight OST - Strike the Earth! Plains of Passage 16 bit SNES style remake / remix',
'channel_url': 'https://www.youtube.com/channel/UCnTaGvsHmMy792DWeT6HbGA',
'uploader': 'Nelward music',
'duration': 213,
'description': 'md5:804b4a9ce37b050a5fefdbb23aeba54d',
'thumbnail': r're:https?://.*\.(jpg|webp)',
'upload_date': '20150503',
'channel_id': 'UCnTaGvsHmMy792DWeT6HbGA',
}
}, {
# April 2012
'url': 'https://web.archive.org/web/0/https://www.youtube.com/watch?v=SOm7mPoPskU',
'info_dict': {
'id': 'SOm7mPoPskU',
'ext': 'mp4',
'title': 'Boyfriend - Justin Bieber Parody',
'uploader_url': 'https://www.youtube.com/user/thecomputernerd01',
'uploader': 'thecomputernerd01',
'thumbnail': r're:https?://.*\.(jpg|webp)',
'description': 'md5:dd7fa635519c2a5b4d566beaecad7491',
'duration': 200,
'upload_date': '20120407',
'uploader_id': 'thecomputernerd01',
}
}, {
'url': 'https://web.archive.org/web/http://www.youtube.com/watch?v=kH-G_aIBlFw',
@@ -445,9 +667,11 @@ class YoutubeWebArchiveIE(InfoExtractor):
'only_matching': True
},
]
_YT_INITIAL_DATA_RE = r'(?:(?:(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;)|%s)' % YoutubeBaseInfoExtractor._YT_INITIAL_DATA_RE
_YT_INITIAL_PLAYER_RESPONSE_RE = r'(?:(?:(?:window\s*\[\s*["\']ytInitialPlayerResponse["\']\s*\]|ytInitialPlayerResponse)\s*=[(\s]*({.+?})[)\s]*;)|%s)' % YoutubeBaseInfoExtractor._YT_INITIAL_PLAYER_RESPONSE_RE
_YT_INITIAL_BOUNDARY_RE = r'(?:(?:var\s+meta|</script|\n)|%s)' % YoutubeBaseInfoExtractor._YT_INITIAL_BOUNDARY_RE
_YT_INITIAL_DATA_RE = YoutubeBaseInfoExtractor._YT_INITIAL_DATA_RE
_YT_INITIAL_PLAYER_RESPONSE_RE = fr'''(?x:
(?:window\s*\[\s*["\']ytInitialPlayerResponse["\']\s*\]|ytInitialPlayerResponse)\s*=[(\s]*|
{YoutubeBaseInfoExtractor._YT_INITIAL_PLAYER_RESPONSE_RE}
)'''
_YT_DEFAULT_THUMB_SERVERS = ['i.ytimg.com'] # thumbnails most likely archived on these servers
_YT_ALL_THUMB_SERVERS = orderedSet(
@@ -457,7 +681,7 @@ class YoutubeWebArchiveIE(InfoExtractor):
_OLDEST_CAPTURE_DATE = 20050214000000
_NEWEST_CAPTURE_DATE = 20500101000000
def _call_cdx_api(self, item_id, url, filters: list = None, collapse: list = None, query: dict = None, note='Downloading CDX API JSON'):
def _call_cdx_api(self, item_id, url, filters: list = None, collapse: list = None, query: dict = None, note=None, fatal=False):
# CDX docs: https://github.com/internetarchive/wayback/blob/master/wayback-cdx-server/README.md
query = {
'url': url,
@@ -468,21 +692,17 @@ class YoutubeWebArchiveIE(InfoExtractor):
'collapse': collapse or [],
**(query or {})
}
res = self._download_json('https://web.archive.org/cdx/search/cdx', item_id, note, query=query)
res = self._download_json(
'https://web.archive.org/cdx/search/cdx', item_id,
note or 'Downloading CDX API JSON', query=query, fatal=fatal)
if isinstance(res, list) and len(res) >= 2:
# format response to make it easier to use
return list(dict(zip(res[0], v)) for v in res[1:])
elif not isinstance(res, list) or len(res) != 0:
self.report_warning('Error while parsing CDX API response' + bug_reports_message())
def _extract_yt_initial_variable(self, webpage, regex, video_id, name):
return self._parse_json(self._search_regex(
(r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE),
regex), webpage, name, default='{}'), video_id, fatal=False)
def _extract_webpage_title(self, webpage):
page_title = self._html_search_regex(
r'<title>([^<]*)</title>', webpage, 'title', default='')
page_title = self._html_extract_title(webpage, default='')
# YouTube video pages appear to always have either 'YouTube -' as prefix or '- YouTube' as suffix.
return self._html_search_regex(
r'(?:YouTube\s*-\s*(.*)$)|(?:(.*)\s*-\s*YouTube$)',
@@ -490,10 +710,32 @@ class YoutubeWebArchiveIE(InfoExtractor):
def _extract_metadata(self, video_id, webpage):
search_meta = ((lambda x: self._html_search_meta(x, webpage, default=None)) if webpage else (lambda x: None))
player_response = self._extract_yt_initial_variable(
webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE, video_id, 'initial player response') or {}
initial_data = self._extract_yt_initial_variable(
webpage, self._YT_INITIAL_DATA_RE, video_id, 'initial player response') or {}
player_response = self._search_json(
self._YT_INITIAL_PLAYER_RESPONSE_RE, webpage, 'initial player response',
video_id, default={})
initial_data = self._search_json(
self._YT_INITIAL_DATA_RE, webpage, 'initial data', video_id, default={})
ytcfg = {}
for j in re.findall(r'yt\.setConfig\(\s*(?P<json>{\s*(?s:.+?)\s*})\s*\);', webpage): # ~June 2010
ytcfg.update(self._parse_json(j, video_id, fatal=False, ignore_extra=True, transform_source=js_to_json, errnote='') or {})
# XXX: this also may contain a 'ptchn' key
player_config = (
self._search_json(
r'(?:yt\.playerConfig|ytplayer\.config|swfConfig)\s*=',
webpage, 'player config', video_id, default=None)
or ytcfg.get('PLAYER_CONFIG') or {})
# XXX: this may also contain a 'creator' key.
swf_args = self._search_json(r'swfArgs\s*=', webpage, 'swf config', video_id, default={})
if swf_args and not traverse_obj(player_config, ('args',)):
player_config['args'] = swf_args
if not player_response:
# April 2020
player_response = self._parse_json(
traverse_obj(player_config, ('args', 'player_response')) or '{}', video_id, fatal=False)
initial_data_video = traverse_obj(
initial_data, ('contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents', ..., 'videoPrimaryInfoRenderer'),
@@ -509,21 +751,64 @@ class YoutubeWebArchiveIE(InfoExtractor):
video_details.get('title')
or YoutubeBaseInfoExtractor._get_text(microformats, 'title')
or YoutubeBaseInfoExtractor._get_text(initial_data_video, 'title')
or traverse_obj(player_config, ('args', 'title'))
or self._extract_webpage_title(webpage)
or search_meta(['og:title', 'twitter:title', 'title']))
def id_from_url(url, type_):
return self._search_regex(
rf'(?:{type_})/([^/#&?]+)', url or '', f'{type_} id', default=None)
# XXX: would the get_elements_by_... functions be better suited here?
_CHANNEL_URL_HREF_RE = r'href="[^"]*(?P<url>https?://www\.youtube\.com/(?:user|channel)/[^"]+)"'
uploader_or_channel_url = self._search_regex(
[fr'<(?:link\s*itemprop=\"url\"|a\s*id=\"watch-username\").*?\b{_CHANNEL_URL_HREF_RE}>', # @fd05024
fr'<div\s*id=\"(?:watch-channel-stats|watch-headline-user-info)\"[^>]*>\s*<a[^>]*\b{_CHANNEL_URL_HREF_RE}'], # ~ May 2009, ~June 2012
webpage, 'uploader or channel url', default=None)
owner_profile_url = url_or_none(microformats.get('ownerProfileUrl')) # @a6211d2
# Uploader refers to the /user/ id ONLY
uploader_id = (
id_from_url(owner_profile_url, 'user')
or id_from_url(uploader_or_channel_url, 'user')
or ytcfg.get('VIDEO_USERNAME'))
uploader_url = f'https://www.youtube.com/user/{uploader_id}' if uploader_id else None
# XXX: do we want to differentiate uploader and channel?
uploader = (
self._search_regex(
[r'<a\s*id="watch-username"[^>]*>\s*<strong>([^<]+)</strong>', # June 2010
r'var\s*watchUsername\s*=\s*\'(.+?)\';', # ~May 2009
r'<div\s*\bid=\"watch-channel-stats"[^>]*>\s*<a[^>]*>\s*(.+?)\s*</a', # ~May 2009
r'<a\s*id="watch-userbanner"[^>]*title="\s*(.+?)\s*"'], # ~June 2012
webpage, 'uploader', default=None)
or self._html_search_regex(
[r'(?s)<div\s*class="yt-user-info".*?<a[^>]*[^>]*>\s*(.*?)\s*</a', # March 2016
r'(?s)<a[^>]*yt-user-name[^>]*>\s*(.*?)\s*</a'], # july 2013
get_element_by_id('watch7-user-header', webpage), 'uploader', default=None)
or self._html_search_regex(
r'<button\s*href="/user/[^>]*>\s*<span[^>]*>\s*(.+?)\s*<', # April 2012
get_element_by_id('watch-headline-user-info', webpage), 'uploader', default=None)
or traverse_obj(player_config, ('args', 'creator'))
or video_details.get('author'))
channel_id = str_or_none(
video_details.get('channelId')
or microformats.get('externalChannelId')
or search_meta('channelId')
or self._search_regex(
r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1', # @b45a9e6
webpage, 'channel id', default=None, group='id'))
channel_url = f'http://www.youtube.com/channel/{channel_id}' if channel_id else None
webpage, 'channel id', default=None, group='id')
or id_from_url(owner_profile_url, 'channel')
or id_from_url(uploader_or_channel_url, 'channel')
or traverse_obj(player_config, ('args', 'ucid')))
channel_url = f'https://www.youtube.com/channel/{channel_id}' if channel_id else None
duration = int_or_none(
video_details.get('lengthSeconds')
or microformats.get('lengthSeconds')
or traverse_obj(player_config, ('args', ('length_seconds', 'l')), get_all=False)
or parse_duration(search_meta('duration')))
description = (
video_details.get('shortDescription')
@@ -531,26 +816,13 @@ class YoutubeWebArchiveIE(InfoExtractor):
or clean_html(get_element_by_id('eow-description', webpage)) # @9e6dd23
or search_meta(['description', 'og:description', 'twitter:description']))
uploader = video_details.get('author')
# Uploader ID and URL
uploader_mobj = re.search(
r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">', # @fd05024
webpage)
if uploader_mobj is not None:
uploader_id, uploader_url = uploader_mobj.group('uploader_id'), uploader_mobj.group('uploader_url')
else:
# @a6211d2
uploader_url = url_or_none(microformats.get('ownerProfileUrl'))
uploader_id = self._search_regex(
r'(?:user|channel)/([^/]+)', uploader_url or '', 'uploader id', default=None)
upload_date = unified_strdate(
dict_get(microformats, ('uploadDate', 'publishDate'))
or search_meta(['uploadDate', 'datePublished'])
or self._search_regex(
[r'(?s)id="eow-date.*?>(.*?)</span>',
r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'], # @7998520
[r'(?s)id="eow-date.*?>\s*(.*?)\s*</span>',
r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']', # @7998520
r'class\s*=\s*"(?:watch-video-date|watch-video-added post-date)"[^>]*>\s*([^<]+?)\s*<'], # ~June 2010, ~Jan 2009 (respectively)
webpage, 'upload date', default=None))
return {
@@ -596,7 +868,7 @@ class YoutubeWebArchiveIE(InfoExtractor):
response = self._call_cdx_api(
video_id, f'https://www.youtube.com/watch?v={video_id}',
filters=['mimetype:text/html'], collapse=['timestamp:6', 'digest'], query={'matchType': 'prefix'}) or []
all_captures = sorted([int_or_none(r['timestamp']) for r in response if int_or_none(r['timestamp']) is not None])
all_captures = sorted(int_or_none(r['timestamp']) for r in response if int_or_none(r['timestamp']) is not None)
# Prefer the new polymer UI captures as we support extracting more metadata from them
# WBM captures seem to all switch to this layout ~July 2020
@@ -619,18 +891,22 @@ class YoutubeWebArchiveIE(InfoExtractor):
url_date = url_date or url_date_2
urlh = None
try:
urlh = self._request_webpage(
HEADRequest('https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s' % video_id),
video_id, note='Fetching archived video file url', expected_status=True)
except ExtractorError as e:
# HTTP Error 404 is expected if the video is not saved.
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404:
self.raise_no_formats(
'The requested video is not archived, indexed, or there is an issue with web.archive.org',
expected=True)
else:
raise
retry_manager = self.RetryManager(fatal=False)
for retry in retry_manager:
try:
urlh = self._request_webpage(
HEADRequest('https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s' % video_id),
video_id, note='Fetching archived video file url', expected_status=True)
except ExtractorError as e:
# HTTP Error 404 is expected if the video is not saved.
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404:
self.raise_no_formats(
'The requested video is not archived, indexed, or there is an issue with web.archive.org (try again later)', expected=True)
else:
retry.error = e
if retry_manager.error:
self.raise_no_formats(retry_manager.error, expected=True, video_id=video_id)
capture_dates = self._get_capture_dates(video_id, int_or_none(url_date))
self.write_debug('Captures to try: ' + join_nonempty(*capture_dates, delim=', '))
@@ -671,3 +947,237 @@ class YoutubeWebArchiveIE(InfoExtractor):
if not info.get('title'):
info['title'] = video_id
return info
class VLiveWebArchiveIE(InfoExtractor):
IE_NAME = 'web.archive:vlive'
IE_DESC = 'web.archive.org saved vlive videos'
_VALID_URL = r'''(?x)
(?:https?://)?web\.archive\.org/
(?:web/)?(?:(?P<date>[0-9]{14})?[0-9A-Za-z_*]*/)? # /web and the version index is optional
(?:https?(?::|%3[Aa])//)?(?:
(?:(?:www|m)\.)?vlive\.tv(?::(?:80|443))?/(?:video|embed)/(?P<id>[0-9]+) # VLive URL
)
'''
_TESTS = [{
'url': 'https://web.archive.org/web/20221221144331/http://www.vlive.tv/video/1326',
'md5': 'cc7314812855ce56de70a06a27314983',
'info_dict': {
'id': '1326',
'ext': 'mp4',
'title': "Girl's Day's Broadcast",
'creator': "Girl's Day",
'view_count': int,
'uploader_id': 'muploader_a',
'uploader_url': None,
'uploader': None,
'upload_date': '20150817',
'thumbnail': r're:^https?://.*\.(?:jpg|png)$',
'timestamp': 1439816449,
'like_count': int,
'channel': 'Girl\'s Day',
'channel_id': 'FDF27',
'comment_count': int,
'release_timestamp': 1439818140,
'release_date': '20150817',
'duration': 1014,
},
'params': {
'skip_download': True,
},
}, {
'url': 'https://web.archive.org/web/20221221182103/http://www.vlive.tv/video/16937',
'info_dict': {
'id': '16937',
'ext': 'mp4',
'title': '첸백시 걍방',
'creator': 'EXO',
'view_count': int,
'subtitles': 'mincount:12',
'uploader_id': 'muploader_j',
'uploader_url': 'http://vlive.tv',
'uploader': None,
'upload_date': '20161112',
'thumbnail': r're:^https?://.*\.(?:jpg|png)$',
'timestamp': 1478923074,
'like_count': int,
'channel': 'EXO',
'channel_id': 'F94BD',
'comment_count': int,
'release_timestamp': 1478924280,
'release_date': '20161112',
'duration': 906,
},
'params': {
'skip_download': True,
},
}, {
'url': 'https://web.archive.org/web/20221127190050/http://www.vlive.tv/video/101870',
'info_dict': {
'id': '101870',
'ext': 'mp4',
'title': '[ⓓ xV] “레벨이들 매력에 반해? 안 반해?” 움직이는 HD 포토 (레드벨벳:Red Velvet)',
'creator': 'Dispatch',
'view_count': int,
'subtitles': 'mincount:6',
'uploader_id': 'V__FRA08071',
'uploader_url': 'http://vlive.tv',
'uploader': None,
'upload_date': '20181130',
'thumbnail': r're:^https?://.*\.(?:jpg|png)$',
'timestamp': 1543601327,
'like_count': int,
'channel': 'Dispatch',
'channel_id': 'C796F3',
'comment_count': int,
'release_timestamp': 1543601040,
'release_date': '20181130',
'duration': 279,
},
'params': {
'skip_download': True,
},
}]
# The wayback machine has special timestamp and "mode" values:
# timestamp:
# 1 = the first capture
# 2 = the last capture
# mode:
# id_ = Identity - perform no alterations of the original resource, return it as it was archived.
_WAYBACK_BASE_URL = 'https://web.archive.org/web/2id_/'
def _download_archived_page(self, url, video_id, *, timestamp='2', **kwargs):
for retry in self.RetryManager():
try:
return self._download_webpage(f'https://web.archive.org/web/{timestamp}id_/{url}', video_id, **kwargs)
except ExtractorError as e:
if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 404:
raise ExtractorError('Page was not archived', expected=True)
retry.error = e
continue
def _download_archived_json(self, url, video_id, **kwargs):
page = self._download_archived_page(url, video_id, **kwargs)
if not page:
raise ExtractorError('Page was not archived', expected=True)
else:
return self._parse_json(page, video_id)
def _extract_formats_from_m3u8(self, m3u8_url, params, video_id):
m3u8_doc = self._download_archived_page(m3u8_url, video_id, note='Downloading m3u8', query=params, fatal=False)
if not m3u8_doc:
return
# M3U8 document should be changed to archive domain
m3u8_doc = m3u8_doc.splitlines()
url_base = m3u8_url.rsplit('/', 1)[0]
first_segment = None
for i, line in enumerate(m3u8_doc):
if not line.startswith('#'):
m3u8_doc[i] = f'{self._WAYBACK_BASE_URL}{url_base}/{line}?{urllib.parse.urlencode(params)}'
first_segment = first_segment or m3u8_doc[i]
# Segments may not have been archived. See https://web.archive.org/web/20221127190050/http://www.vlive.tv/video/101870
urlh = self._request_webpage(HEADRequest(first_segment), video_id, errnote=False,
fatal=False, note='Check first segment availablity')
if urlh:
formats, subtitles = self._parse_m3u8_formats_and_subtitles('\n'.join(m3u8_doc), ext='mp4', video_id=video_id)
if subtitles:
self._report_ignoring_subs('m3u8')
return formats
# Closely follows the logic of the ArchiveTeam grab script
# See: https://github.com/ArchiveTeam/vlive-grab/blob/master/vlive.lua
def _real_extract(self, url):
video_id, url_date = self._match_valid_url(url).group('id', 'date')
webpage = self._download_archived_page(f'https://www.vlive.tv/video/{video_id}', video_id, timestamp=url_date)
player_info = self._search_json(r'__PRELOADED_STATE__\s*=', webpage, 'player info', video_id)
user_country = traverse_obj(player_info, ('common', 'userCountry'))
main_script_url = self._search_regex(r'<script\s+src="([^"]+/js/main\.[^"]+\.js)"', webpage, 'main script url')
main_script = self._download_archived_page(main_script_url, video_id, note='Downloading main script')
app_id = self._search_regex(r'appId\s*=\s*"([^"]+)"', main_script, 'app id')
inkey = self._download_archived_json(
f'https://www.vlive.tv/globalv-web/vam-web/video/v1.0/vod/{video_id}/inkey', video_id, note='Fetching inkey', query={
'appId': app_id,
'platformType': 'PC',
'gcc': user_country,
'locale': 'en_US',
}, fatal=False)
vod_id = traverse_obj(player_info, ('postDetail', 'post', 'officialVideo', 'vodId'))
vod_data = self._download_archived_json(
f'https://apis.naver.com/rmcnmv/rmcnmv/vod/play/v2.0/{vod_id}', video_id, note='Fetching vod data', query={
'key': inkey.get('inkey'),
'pid': 'rmcPlayer_16692457559726800', # partially unix time and partially random. Fixed value used by archiveteam project
'sid': '2024',
'ver': '2.0',
'devt': 'html5_pc',
'doct': 'json',
'ptc': 'https',
'sptc': 'https',
'cpt': 'vtt',
'ctls': '%7B%22visible%22%3A%7B%22fullscreen%22%3Atrue%2C%22logo%22%3Afalse%2C%22playbackRate%22%3Afalse%2C%22scrap%22%3Afalse%2C%22playCount%22%3Atrue%2C%22commentCount%22%3Atrue%2C%22title%22%3Atrue%2C%22writer%22%3Atrue%2C%22expand%22%3Afalse%2C%22subtitles%22%3Atrue%2C%22thumbnails%22%3Atrue%2C%22quality%22%3Atrue%2C%22setting%22%3Atrue%2C%22script%22%3Afalse%2C%22logoDimmed%22%3Atrue%2C%22badge%22%3Atrue%2C%22seekingTime%22%3Atrue%2C%22muted%22%3Atrue%2C%22muteButton%22%3Afalse%2C%22viewerNotice%22%3Afalse%2C%22linkCount%22%3Afalse%2C%22createTime%22%3Afalse%2C%22thumbnail%22%3Atrue%7D%2C%22clicked%22%3A%7B%22expand%22%3Afalse%2C%22subtitles%22%3Afalse%7D%7D',
'pv': '4.26.9',
'dr': '1920x1080',
'cpl': 'en_US',
'lc': 'en_US',
'adi': '%5B%7B%22type%22%3A%22pre%22%2C%22exposure%22%3Afalse%2C%22replayExposure%22%3Afalse%7D%5D',
'adu': '%2F',
'videoId': vod_id,
'cc': user_country,
})
formats = []
streams = traverse_obj(vod_data, ('streams', ...))
if len(streams) > 1:
self.report_warning('Multiple streams found. Only the first stream will be downloaded.')
stream = streams[0]
max_stream = max(
stream.get('videos') or [],
key=lambda v: traverse_obj(v, ('bitrate', 'video'), default=0), default=None)
if max_stream is not None:
params = {arg.get('name'): arg.get('value') for arg in stream.get('keys', []) if arg.get('type') == 'param'}
formats = self._extract_formats_from_m3u8(max_stream.get('source'), params, video_id) or []
# For parts of the project MP4 files were archived
max_video = max(
traverse_obj(vod_data, ('videos', 'list', ...)),
key=lambda v: traverse_obj(v, ('bitrate', 'video'), default=0), default=None)
if max_video is not None:
video_url = self._WAYBACK_BASE_URL + max_video.get('source')
urlh = self._request_webpage(HEADRequest(video_url), video_id, errnote=False,
fatal=False, note='Check video availablity')
if urlh:
formats.append({'url': video_url})
return {
'id': video_id,
'formats': formats,
**traverse_obj(player_info, ('postDetail', 'post', {
'title': ('officialVideo', 'title', {str}),
'creator': ('author', 'nickname', {str}),
'channel': ('channel', 'channelName', {str}),
'channel_id': ('channel', 'channelCode', {str}),
'duration': ('officialVideo', 'playTime', {int_or_none}),
'view_count': ('officialVideo', 'playCount', {int_or_none}),
'like_count': ('officialVideo', 'likeCount', {int_or_none}),
'comment_count': ('officialVideo', 'commentCount', {int_or_none}),
'timestamp': ('officialVideo', 'createdAt', {lambda x: int_or_none(x, scale=1000)}),
'release_timestamp': ('officialVideo', 'willStartAt', {lambda x: int_or_none(x, scale=1000)}),
})),
**traverse_obj(vod_data, ('meta', {
'uploader_id': ('user', 'id', {str}),
'uploader': ('user', 'name', {str}),
'uploader_url': ('user', 'url', {url_or_none}),
'thumbnail': ('cover', 'source', {url_or_none}),
}), expected_type=lambda x: x or None),
**NaverBaseIE.process_subtitles(vod_data, lambda x: [self._WAYBACK_BASE_URL + x]),
}

View File

@@ -1,6 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
@@ -73,8 +70,8 @@ class ArcPublishingIE(InfoExtractor):
], 'video-api-cdn.%s.arcpublishing.com/api'),
]
@staticmethod
def _extract_urls(webpage):
@classmethod
def _extract_embed_urls(cls, url, webpage):
entries = []
# https://arcpublishing.atlassian.net/wiki/spaces/POWA/overview
for powa_el in re.findall(r'(<div[^>]+class="[^"]*\bpowa\b[^"]*"[^>]+data-uuid="%s"[^>]*>)' % ArcPublishingIE._UUID_REGEX, webpage):
@@ -124,8 +121,7 @@ class ArcPublishingIE(InfoExtractor):
formats.extend(smil_formats)
elif stream_type in ('ts', 'hls'):
m3u8_formats = self._extract_m3u8_formats(
s_url, uuid, 'mp4', 'm3u8' if is_live else 'm3u8_native',
m3u8_id='hls', fatal=False)
s_url, uuid, 'mp4', live=is_live, m3u8_id='hls', fatal=False)
if all([f.get('acodec') == 'none' for f in m3u8_formats]):
continue
for f in m3u8_formats:
@@ -148,7 +144,6 @@ class ArcPublishingIE(InfoExtractor):
'url': s_url,
'quality': -10,
})
self._sort_formats(formats)
subtitles = {}
for subtitle in (try_get(video, lambda x: x['subtitles']['urls'], list) or []):

View File

@@ -1,6 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
import json
import re
@@ -43,14 +40,15 @@ class ARDMediathekBaseIE(InfoExtractor):
'This video is not available due to geoblocking',
countries=self._GEO_COUNTRIES, metadata_available=True)
self._sort_formats(formats)
subtitles = {}
subtitle_url = media_info.get('_subtitleUrl')
if subtitle_url:
subtitles['de'] = [{
'ext': 'ttml',
'url': subtitle_url,
}, {
'ext': 'vtt',
'url': subtitle_url.replace('/ebutt/', '/webvtt/') + '.vtt',
}]
return {
@@ -265,7 +263,6 @@ class ARDMediathekIE(ARDMediathekBaseIE):
'format_id': fid,
'url': furl,
})
self._sort_formats(formats)
info = {
'formats': formats,
}
@@ -292,16 +289,16 @@ class ARDMediathekIE(ARDMediathekBaseIE):
class ARDIE(InfoExtractor):
_VALID_URL = r'(?P<mainurl>https?://(?:www\.)?daserste\.de/(?:[^/?#&]+/)+(?P<id>[^/?#&]+))\.html'
_TESTS = [{
# available till 7.01.2022
'url': 'https://www.daserste.de/information/talk/maischberger/videos/maischberger-die-woche-video100.html',
'md5': '867d8aa39eeaf6d76407c5ad1bb0d4c1',
# available till 7.12.2023
'url': 'https://www.daserste.de/information/talk/maischberger/videos/maischberger-video-424.html',
'md5': 'a438f671e87a7eba04000336a119ccc4',
'info_dict': {
'id': 'maischberger-die-woche-video100',
'display_id': 'maischberger-die-woche-video100',
'id': 'maischberger-video-424',
'display_id': 'maischberger-video-424',
'ext': 'mp4',
'duration': 3687.0,
'title': 'maischberger. die woche vom 7. Januar 2021',
'upload_date': '20210107',
'duration': 4452.0,
'title': 'maischberger am 07.12.2022',
'upload_date': '20221207',
'thumbnail': r're:^https?://.*\.jpg$',
},
}, {
@@ -374,7 +371,6 @@ class ARDIE(InfoExtractor):
continue
f['url'] = format_url
formats.append(f)
self._sort_formats(formats)
_SUB_FORMATS = (
('./dataTimedText', 'ttml'),
@@ -407,8 +403,9 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
(?:(?:beta|www)\.)?ardmediathek\.de/
(?:(?P<client>[^/]+)/)?
(?:player|live|video|(?P<playlist>sendung|sammlung))/
(?:(?P<display_id>[^?#]+)/)?
(?P<id>(?(playlist)|Y3JpZDovL)[a-zA-Z0-9]+)'''
(?:(?P<display_id>(?(playlist)[^?#]+?|[^?#]+))/)?
(?P<id>(?(playlist)|Y3JpZDovL)[a-zA-Z0-9]+)
(?(playlist)/(?P<season>\d+)?/?(?:[?#]|$))'''
_TESTS = [{
'url': 'https://www.ardmediathek.de/mdr/video/die-robuste-roswita/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy84MWMxN2MzZC0wMjkxLTRmMzUtODk4ZS0wYzhlOWQxODE2NGI/',
@@ -436,6 +433,13 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
'description': 'md5:39578c7b96c9fe50afdf5674ad985e6b',
'upload_date': '20211108',
},
}, {
'url': 'https://www.ardmediathek.de/sendung/beforeigners/beforeigners/staffel-1/Y3JpZDovL2Rhc2Vyc3RlLmRlL2JlZm9yZWlnbmVycw/1',
'playlist_count': 6,
'info_dict': {
'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL2JlZm9yZWlnbmVycw',
'title': 'beforeigners/beforeigners/staffel-1',
},
}, {
'url': 'https://beta.ardmediathek.de/ard/video/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE',
'only_matching': True,
@@ -561,14 +565,15 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
break
pageNumber = pageNumber + 1
return self.playlist_result(entries, playlist_title=display_id)
return self.playlist_result(entries, playlist_id, playlist_title=display_id)
def _real_extract(self, url):
video_id, display_id, playlist_type, client = self._match_valid_url(url).group(
'id', 'display_id', 'playlist', 'client')
video_id, display_id, playlist_type, client, season_number = self._match_valid_url(url).group(
'id', 'display_id', 'playlist', 'client', 'season')
display_id, client = display_id or video_id, client or 'ard'
if playlist_type:
# TODO: Extract only specified season
return self._ARD_extract_playlist(url, video_id, display_id, client, playlist_type)
player_page = self._download_json(

View File

@@ -1,8 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
@@ -22,6 +17,8 @@ class ArkenaIE(InfoExtractor):
play\.arkena\.com/(?:config|embed)/avp/v\d/player/media/(?P<id>[^/]+)/[^/]+/(?P<account_id>\d+)
)
'''
# See https://support.arkena.com/display/PLAY/Ways+to+embed+your+video
_EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//play\.arkena\.com/embed/avp/.+?)\1']
_TESTS = [{
'url': 'https://video.qbrick.com/play2/embed/player?accountId=1034090&mediaId=d8ab4607-00090107-aab86310',
'md5': '97f117754e5f3c020f5f26da4a44ebaf',
@@ -53,15 +50,6 @@ class ArkenaIE(InfoExtractor):
'only_matching': True,
}]
@staticmethod
def _extract_url(webpage):
# See https://support.arkena.com/display/PLAY/Ways+to+embed+your+video
mobj = re.search(
r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//play\.arkena\.com/embed/avp/.+?)\1',
webpage)
if mobj:
return mobj.group('url')
def _real_extract(self, url):
mobj = self._match_valid_url(url)
video_id = mobj.group('id')
@@ -148,7 +136,6 @@ class ArkenaIE(InfoExtractor):
elif mime_type == 'application/vnd.ms-sstr+xml':
formats.extend(self._extract_ism_formats(
href, video_id, ism_id='mss', fatal=False))
self._sort_formats(formats)
return {
'id': video_id,

View File

@@ -1,6 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import (
compat_parse_qs,
@@ -76,7 +73,6 @@ class ArnesIE(InfoExtractor):
'width': int_or_none(media.get('width')),
'height': int_or_none(media.get('height')),
})
self._sort_formats(formats)
channel = video.get('channel') or {}
channel_id = channel.get('url')
@@ -93,7 +89,7 @@ class ArnesIE(InfoExtractor):
'timestamp': parse_iso8601(video.get('creationTime')),
'channel': channel.get('name'),
'channel_id': channel_id,
'channel_url': format_field(channel_id, template=f'{self._BASE_URL}/?channel=%s'),
'channel_url': format_field(channel_id, None, f'{self._BASE_URL}/?channel=%s'),
'duration': float_or_none(video.get('duration'), 1000),
'view_count': int_or_none(video.get('views')),
'tags': video.get('hashtags'),

View File

@@ -1,188 +1,232 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import (
compat_str,
)
from ..utils import (
ExtractorError,
GeoRestrictedError,
int_or_none,
parse_iso8601,
parse_qs,
qualities,
try_get,
unified_strdate,
strip_or_none,
traverse_obj,
url_or_none,
)
class ArteTVBaseIE(InfoExtractor):
_ARTE_LANGUAGES = 'fr|de|en|es|it|pl'
_API_BASE = 'https://api.arte.tv/api/player/v1'
_API_BASE = 'https://api.arte.tv/api/player/v2'
class ArteTVIE(ArteTVBaseIE):
_VALID_URL = r'''(?x)
https?://
(?:https?://
(?:
(?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos|
api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s)
)
/(?P<id>\d{6}-\d{3}-[AF])
|arte://program)
/(?P<id>\d{6}-\d{3}-[AF]|LIVE)
''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES}
_TESTS = [{
'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
'info_dict': {
'id': '088501-000-A',
'ext': 'mp4',
'title': 'Mexico: Stealing Petrol to Survive',
'upload_date': '20190628',
},
'only_matching': True,
}, {
'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/',
'only_matching': True,
'info_dict': {
'id': '100103-000-A',
'title': 'USA: Dyskryminacja na porodówce',
'description': 'md5:242017b7cce59ffae340a54baefcafb1',
'alt_title': 'ARTE Reportage',
'upload_date': '20201103',
'duration': 554,
'thumbnail': r're:https://api-cdn\.arte\.tv/.+940x530',
'timestamp': 1604417980,
'ext': 'mp4',
},
'params': {'skip_download': 'm3u8'}
}, {
'note': 'No alt_title',
'url': 'https://www.arte.tv/fr/videos/110371-000-A/la-chaleur-supplice-des-arbres-de-rue/',
'info_dict': {
'id': '110371-000-A',
'ext': 'mp4',
'upload_date': '20220718',
'duration': 154,
'timestamp': 1658162460,
'description': 'md5:5890f36fe7dccfadb8b7c0891de54786',
'title': 'La chaleur, supplice des arbres de rue',
'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/CPE2sQDtD8GLQgt8DuYHLf/940x530',
},
'params': {'skip_download': 'm3u8'}
}, {
'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A',
'only_matching': True,
}, {
'url': 'https://api.arte.tv/api/player/v2/config/de/LIVE',
'only_matching': True,
}, {
'url': 'https://www.arte.tv/de/videos/110203-006-A/zaz/',
'info_dict': {
'id': '110203-006-A',
'chapters': 'count:16',
'description': 'md5:cf592f1df52fe52007e3f8eac813c084',
'alt_title': 'Zaz',
'title': 'Baloise Session 2022',
'timestamp': 1668445200,
'duration': 4054,
'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/ubQjmVCGyRx3hmBuZEK9QZ/940x530',
'upload_date': '20221114',
'ext': 'mp4',
},
'expected_warnings': ['geo restricted']
}]
_GEO_BYPASS = True
_LANG_MAP = { # ISO639 -> French abbreviations
'fr': 'F',
'de': 'A',
'en': 'E[ANG]',
'es': 'E[ESP]',
'it': 'E[ITA]',
'pl': 'E[POL]',
# XXX: probably means mixed; <https://www.arte.tv/en/videos/107710-029-A/dispatches-from-ukraine-local-journalists-report/>
# uses this code for audio that happens to be in Ukrainian, but the manifest uses the ISO code 'mul' (mixed)
'mul': 'EU',
}
_VERSION_CODE_RE = re.compile(r'''(?x)
V
(?P<original_voice>O?)
(?P<vlang>[FA]|E\[[A-Z]+\]|EU)?
(?P<audio_desc>AUD|)
(?:
(?P<has_sub>-ST)
(?P<sdh_sub>M?)
(?P<sub_lang>[FA]|E\[[A-Z]+\]|EU)
)?
''')
# all obtained by exhaustive testing
_COUNTRIES_MAP = {
'DE_FR': (
'BL', 'DE', 'FR', 'GF', 'GP', 'MF', 'MQ', 'NC',
'PF', 'PM', 'RE', 'WF', 'YT',
),
# with both of the below 'BE' sometimes works, sometimes doesn't
'EUR_DE_FR': (
'AT', 'BL', 'CH', 'DE', 'FR', 'GF', 'GP', 'LI',
'MC', 'MF', 'MQ', 'NC', 'PF', 'PM', 'RE', 'WF',
'YT',
),
'SAT': (
'AD', 'AT', 'AX', 'BG', 'BL', 'CH', 'CY', 'CZ',
'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GF',
'GR', 'HR', 'HU', 'IE', 'IS', 'IT', 'KN', 'LI',
'LT', 'LU', 'LV', 'MC', 'MF', 'MQ', 'MT', 'NC',
'NL', 'NO', 'PF', 'PL', 'PM', 'PT', 'RE', 'RO',
'SE', 'SI', 'SK', 'SM', 'VA', 'WF', 'YT',
),
}
def _real_extract(self, url):
mobj = self._match_valid_url(url)
video_id = mobj.group('id')
lang = mobj.group('lang') or mobj.group('lang_2')
langauge_code = self._LANG_MAP.get(lang)
info = self._download_json(
'%s/config/%s/%s' % (self._API_BASE, lang, video_id), video_id)
player_info = info['videoJsonPlayer']
config = self._download_json(f'{self._API_BASE}/config/{lang}/{video_id}', video_id)
vsr = try_get(player_info, lambda x: x['VSR'], dict)
if not vsr:
error = None
if try_get(player_info, lambda x: x['custom_msg']['type']) == 'error':
error = try_get(
player_info, lambda x: x['custom_msg']['msg'], compat_str)
if not error:
error = 'Video %s is not available' % player_info.get('VID') or video_id
raise ExtractorError(error, expected=True)
geoblocking = traverse_obj(config, ('data', 'attributes', 'restriction', 'geoblocking')) or {}
if geoblocking.get('restrictedArea'):
raise GeoRestrictedError(f'Video restricted to {geoblocking["code"]!r}',
countries=self._COUNTRIES_MAP.get(geoblocking['code'], ('DE', 'FR')))
upload_date_str = player_info.get('shootingDate')
if not upload_date_str:
upload_date_str = (player_info.get('VRA') or player_info.get('VDA') or '').split(' ')[0]
if not traverse_obj(config, ('data', 'attributes', 'rights')):
# Eg: https://www.arte.tv/de/videos/097407-215-A/28-minuten
# Eg: https://www.arte.tv/es/videos/104351-002-A/serviteur-du-peuple-1-23
raise ExtractorError(
'Video is not available in this language edition of Arte or broadcast rights expired', expected=True)
title = (player_info.get('VTI') or player_info['VID']).strip()
subtitle = player_info.get('VSU', '').strip()
if subtitle:
title += ' - %s' % subtitle
formats, subtitles = [], {}
secondary_formats = []
for stream in config['data']['attributes']['streams']:
# official player contains code like `e.get("versions")[0].eStat.ml5`
stream_version = stream['versions'][0]
stream_version_code = stream_version['eStat']['ml5']
qfunc = qualities(['MQ', 'HQ', 'EQ', 'SQ'])
lang_pref = -1
m = self._VERSION_CODE_RE.match(stream_version_code)
if m:
lang_pref = int(''.join('01'[x] for x in (
m.group('vlang') == langauge_code, # we prefer voice in the requested language
not m.group('audio_desc'), # and not the audio description version
bool(m.group('original_voice')), # but if voice is not in the requested language, at least choose the original voice
m.group('sub_lang') == langauge_code, # if subtitles are present, we prefer them in the requested language
not m.group('has_sub'), # but we prefer no subtitles otherwise
not m.group('sdh_sub'), # and we prefer not the hard-of-hearing subtitles if there are subtitles
)))
LANGS = {
'fr': 'F',
'de': 'A',
'en': 'E[ANG]',
'es': 'E[ESP]',
'it': 'E[ITA]',
'pl': 'E[POL]',
}
short_label = traverse_obj(stream_version, 'shortLabel', expected_type=str, default='?')
if stream['protocol'].startswith('HLS'):
fmts, subs = self._extract_m3u8_formats_and_subtitles(
stream['url'], video_id=video_id, ext='mp4', m3u8_id=stream_version_code, fatal=False)
for fmt in fmts:
fmt.update({
'format_note': f'{stream_version.get("label", "unknown")} [{short_label}]',
'language_preference': lang_pref,
})
if any(map(short_label.startswith, ('cc', 'OGsub'))):
secondary_formats.extend(fmts)
else:
formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)
langcode = LANGS.get(lang, lang)
elif stream['protocol'] in ('HTTPS', 'RTMP'):
formats.append({
'format_id': f'{stream["protocol"]}-{stream_version_code}',
'url': stream['url'],
'format_note': f'{stream_version.get("label", "unknown")} [{short_label}]',
'language_preference': lang_pref,
# 'ext': 'mp4', # XXX: may or may not be necessary, at least for HTTPS
})
formats = []
for format_id, format_dict in vsr.items():
f = dict(format_dict)
format_url = url_or_none(f.get('url'))
streamer = f.get('streamer')
if not format_url and not streamer:
continue
versionCode = f.get('versionCode')
l = re.escape(langcode)
# Language preference from most to least priority
# Reference: section 6.8 of
# https://www.arte.tv/sites/en/corporate/files/complete-technical-guidelines-arte-geie-v1-07-1.pdf
PREFERENCES = (
# original version in requested language, without subtitles
r'VO{0}$'.format(l),
# original version in requested language, with partial subtitles in requested language
r'VO{0}-ST{0}$'.format(l),
# original version in requested language, with subtitles for the deaf and hard-of-hearing in requested language
r'VO{0}-STM{0}$'.format(l),
# non-original (dubbed) version in requested language, without subtitles
r'V{0}$'.format(l),
# non-original (dubbed) version in requested language, with subtitles partial subtitles in requested language
r'V{0}-ST{0}$'.format(l),
# non-original (dubbed) version in requested language, with subtitles for the deaf and hard-of-hearing in requested language
r'V{0}-STM{0}$'.format(l),
# original version in requested language, with partial subtitles in different language
r'VO{0}-ST(?!{0}).+?$'.format(l),
# original version in requested language, with subtitles for the deaf and hard-of-hearing in different language
r'VO{0}-STM(?!{0}).+?$'.format(l),
# original version in different language, with partial subtitles in requested language
r'VO(?:(?!{0}).+?)?-ST{0}$'.format(l),
# original version in different language, with subtitles for the deaf and hard-of-hearing in requested language
r'VO(?:(?!{0}).+?)?-STM{0}$'.format(l),
# original version in different language, without subtitles
r'VO(?:(?!{0}))?$'.format(l),
# original version in different language, with partial subtitles in different language
r'VO(?:(?!{0}).+?)?-ST(?!{0}).+?$'.format(l),
# original version in different language, with subtitles for the deaf and hard-of-hearing in different language
r'VO(?:(?!{0}).+?)?-STM(?!{0}).+?$'.format(l),
)
for pref, p in enumerate(PREFERENCES):
if re.match(p, versionCode):
lang_pref = len(PREFERENCES) - pref
break
else:
lang_pref = -1
self.report_warning(f'Skipping stream with unknown protocol {stream["protocol"]}')
media_type = f.get('mediaType')
if media_type == 'hls':
m3u8_formats = self._extract_m3u8_formats(
format_url, video_id, 'mp4', entry_protocol='m3u8_native',
m3u8_id=format_id, fatal=False)
for m3u8_format in m3u8_formats:
m3u8_format['language_preference'] = lang_pref
formats.extend(m3u8_formats)
continue
formats.extend(secondary_formats)
self._remove_duplicate_formats(formats)
format = {
'format_id': format_id,
'language_preference': lang_pref,
'format_note': '%s, %s' % (f.get('versionCode'), f.get('versionLibelle')),
'width': int_or_none(f.get('width')),
'height': int_or_none(f.get('height')),
'tbr': int_or_none(f.get('bitrate')),
'quality': qfunc(f.get('quality')),
}
if media_type == 'rtmp':
format['url'] = f['streamer']
format['play_path'] = 'mp4:' + f['url']
format['ext'] = 'flv'
else:
format['url'] = f['url']
formats.append(format)
# For this extractor, quality only represents the relative quality
# with respect to other formats with the same resolution
self._sort_formats(formats, ('res', 'quality'))
metadata = config['data']['attributes']['metadata']
return {
'id': player_info.get('VID') or video_id,
'title': title,
'description': player_info.get('VDE') or player_info.get('V7T'),
'upload_date': unified_strdate(upload_date_str),
'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'),
'id': metadata['providerId'],
'webpage_url': traverse_obj(metadata, ('link', 'url')),
'title': traverse_obj(metadata, 'subtitle', 'title'),
'alt_title': metadata.get('subtitle') and metadata.get('title'),
'description': metadata.get('description'),
'duration': traverse_obj(metadata, ('duration', 'seconds')),
'language': metadata.get('language'),
'timestamp': traverse_obj(config, ('data', 'attributes', 'rights', 'begin'), expected_type=parse_iso8601),
'is_live': config['data']['attributes'].get('live', False),
'formats': formats,
'subtitles': subtitles,
'thumbnails': [
{'url': image['url'], 'id': image.get('caption')}
for image in metadata.get('images') or [] if url_or_none(image.get('url'))
],
# TODO: chapters may also be in stream['segments']?
'chapters': traverse_obj(config, ('data', 'attributes', 'chapters', 'elements', ..., {
'start_time': 'startTime',
'title': 'title',
})) or None,
}
class ArteTVEmbedIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
_EMBED_REGEX = [r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1']
_TESTS = [{
'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A',
'info_dict': {
@@ -192,17 +236,12 @@ class ArteTVEmbedIE(InfoExtractor):
'description': 'md5:be40b667f45189632b78c1425c7c2ce1',
'upload_date': '20201116',
},
'skip': 'No video available'
}, {
'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A',
'only_matching': True,
}]
@staticmethod
def _extract_urls(webpage):
return [url for _, url in re.findall(
r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1',
webpage)]
def _real_extract(self, url):
qs = parse_qs(url)
json_url = qs['json_url'][0]
@@ -215,41 +254,71 @@ class ArteTVPlaylistIE(ArteTVBaseIE):
_VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES
_TESTS = [{
'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',
'info_dict': {
'id': 'RC-016954',
'title': 'Earn a Living',
'description': 'md5:d322c55011514b3a7241f7fb80d494c2',
},
'playlist_mincount': 6,
'only_matching': True,
}, {
'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/',
'only_matching': True,
'playlist_mincount': 100,
'info_dict': {
'description': 'md5:84e7bf1feda248bc325ebfac818c476e',
'id': 'RC-014123',
'title': 'ARTE Reportage - najlepsze reportaże',
},
}]
def _real_extract(self, url):
lang, playlist_id = self._match_valid_url(url).group('lang', 'id')
playlist = self._download_json(
f'{self._API_BASE}/playlist/{lang}/{playlist_id}', playlist_id)['data']['attributes']
entries = [{
'_type': 'url_transparent',
'url': video['config']['url'],
'ie_key': ArteTVIE.ie_key(),
'id': video.get('providerId'),
'title': video.get('title'),
'alt_title': video.get('subtitle'),
'thumbnail': url_or_none(traverse_obj(video, ('mainImage', 'url'))),
'duration': int_or_none(traverse_obj(video, ('duration', 'seconds'))),
} for video in traverse_obj(playlist, ('items', lambda _, v: v['config']['url']))]
return self.playlist_result(entries, playlist_id,
traverse_obj(playlist, ('metadata', 'title')),
traverse_obj(playlist, ('metadata', 'description')))
class ArteTVCategoryIE(ArteTVBaseIE):
_VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>[\w-]+(?:/[\w-]+)*)/?\s*$' % ArteTVBaseIE._ARTE_LANGUAGES
_TESTS = [{
'url': 'https://www.arte.tv/en/videos/politics-and-society/',
'info_dict': {
'id': 'politics-and-society',
'title': 'Politics and society',
'description': 'Investigative documentary series, geopolitical analysis, and international commentary',
},
'playlist_mincount': 13,
}]
@classmethod
def suitable(cls, url):
return (
not any(ie.suitable(url) for ie in (ArteTVIE, ArteTVPlaylistIE, ))
and super().suitable(url))
def _real_extract(self, url):
lang, playlist_id = self._match_valid_url(url).groups()
collection = self._download_json(
'%s/collectionData/%s/%s?source=videos'
% (self._API_BASE, lang, playlist_id), playlist_id)
entries = []
for video in collection['videos']:
if not isinstance(video, dict):
webpage = self._download_webpage(url, playlist_id)
items = []
for video in re.finditer(
r'<a\b[^>]*?href\s*=\s*(?P<q>"|\'|\b)(?P<url>https?://www\.arte\.tv/%s/videos/[\w/-]+)(?P=q)' % lang,
webpage):
video = video.group('url')
if video == url:
continue
video_url = url_or_none(video.get('url')) or url_or_none(video.get('jsonUrl'))
if not video_url:
continue
video_id = video.get('programId')
entries.append({
'_type': 'url_transparent',
'url': video_url,
'id': video_id,
'title': video.get('title'),
'alt_title': video.get('subtitle'),
'thumbnail': url_or_none(try_get(video, lambda x: x['mainImage']['url'], compat_str)),
'duration': int_or_none(video.get('durationSeconds')),
'view_count': int_or_none(video.get('views')),
'ie_key': ArteTVIE.ie_key(),
})
title = collection.get('title')
description = collection.get('shortDescription') or collection.get('teaserText')
return self.playlist_result(entries, playlist_id, title, description)
if any(ie.suitable(video) for ie in (ArteTVIE, ArteTVPlaylistIE, )):
items.append(video)
title = strip_or_none(self._generic_title('', webpage, default='').rsplit('|', 1)[0]) or None
return self.playlist_from_matches(items, playlist_id=playlist_id, playlist_title=title,
description=self._og_search_description(webpage, default=None))

View File

@@ -1,6 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
import functools
import re
@@ -181,8 +178,7 @@ class AsianCrushPlaylistIE(AsianCrushBaseIE):
'title', default=None) or self._og_search_title(
webpage, default=None) or self._html_search_meta(
'twitter:title', webpage, 'title',
default=None) or self._search_regex(
r'<title>([^<]+)</title>', webpage, 'title', fatal=False)
default=None) or self._html_extract_title(webpage)
if title:
title = re.sub(r'\s*\|\s*.+?$', '', title)

View File

@@ -1,7 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import compat_HTTPError
from ..utils import (
@@ -37,9 +33,6 @@ class AtresPlayerIE(InfoExtractor):
]
_API_BASE = 'https://api.atresplayer.com/'
def _real_initialize(self):
self._login()
def _handle_error(self, e, code):
if isinstance(e.cause, compat_HTTPError) and e.cause.code == code:
error = self._parse_json(e.cause.read(), None)
@@ -48,11 +41,7 @@ class AtresPlayerIE(InfoExtractor):
raise ExtractorError(error['error_description'], expected=True)
raise
def _login(self):
username, password = self._get_login_info()
if username is None:
return
def _perform_login(self, username, password):
self._request_webpage(
self._API_BASE + 'login', None, 'Downloading login page')
@@ -95,7 +84,6 @@ class AtresPlayerIE(InfoExtractor):
elif src_type == 'application/dash+xml':
formats, subtitles = self._extract_mpd_formats(
src, video_id, mpd_id='dash', fatal=False)
self._sort_formats(formats)
heartbeat = episode.get('heartbeat') or {}
omniture = episode.get('omniture') or {}

View File

@@ -0,0 +1,34 @@
import re
from .common import InfoExtractor
class AtScaleConfEventIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?atscaleconference\.com/events/(?P<id>[^/&$?]+)'
_TESTS = [{
'url': 'https://atscaleconference.com/events/data-scale-spring-2022/',
'playlist_mincount': 13,
'info_dict': {
'id': 'data-scale-spring-2022',
'title': 'Data @Scale Spring 2022',
'description': 'md5:7d7ca1c42ac9c6d8a785092a1aea4b55'
},
}, {
'url': 'https://atscaleconference.com/events/video-scale-2021/',
'playlist_mincount': 14,
'info_dict': {
'id': 'video-scale-2021',
'title': 'Video @Scale 2021',
'description': 'md5:7d7ca1c42ac9c6d8a785092a1aea4b55'
},
}]
def _real_extract(self, url):
id = self._match_id(url)
webpage = self._download_webpage(url, id)
return self.playlist_from_matches(
re.findall(r'data-url\s*=\s*"(https?://(?:www\.)?atscaleconference\.com/videos/[^"]+)"', webpage),
ie='Generic', playlist_id=id,
title=self._og_search_title(webpage), description=self._og_search_description(webpage))

View File

@@ -1,5 +1,3 @@
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import unified_strdate

View File

@@ -1,6 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
import datetime
from .common import InfoExtractor
@@ -8,6 +5,7 @@ from ..utils import (
float_or_none,
jwt_encode_hs256,
try_get,
ExtractorError,
)
@@ -51,7 +49,6 @@ class ATVAtIE(InfoExtractor):
'url': source_url,
'format_id': protocol,
})
self._sort_formats(formats)
return {
'id': clip_id,
@@ -94,6 +91,11 @@ class ATVAtIE(InfoExtractor):
})
video_id, videos_data = list(videos['data'].items())[0]
error_msg = try_get(videos_data, lambda x: x['error']['title'])
if error_msg == 'Geo check failed':
self.raise_geo_restricted(error_msg)
elif error_msg:
raise ExtractorError(error_msg)
entries = [
self._extract_video_info(url, contentResource[video['id']], video)
for video in videos_data]

View File

@@ -1,6 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
int_or_none,
@@ -79,7 +76,6 @@ class AudiMediaIE(InfoExtractor):
'format_id': 'http-%s' % bitrate,
})
formats.append(f)
self._sort_formats(formats)
return {
'id': video_id,

View File

@@ -1,27 +1,33 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
clean_html,
float_or_none,
)
from ..utils import clean_html, float_or_none, traverse_obj, unescapeHTML
class AudioBoomIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?audioboom\.com/(?:boos|posts)/(?P<id>[0-9]+)'
_TESTS = [{
'url': 'https://audioboom.com/posts/7398103-asim-chaudhry',
'md5': '7b00192e593ff227e6a315486979a42d',
'md5': '4d68be11c9f9daf3dab0778ad1e010c3',
'info_dict': {
'id': '7398103',
'ext': 'mp3',
'title': 'Asim Chaudhry',
'description': 'md5:2f3fef17dacc2595b5362e1d7d3602fc',
'description': 'md5:0ed714ae0e81e5d9119cac2f618ad679',
'duration': 4000.99,
'uploader': 'Sue Perkins: An hour or so with...',
'uploader_url': r're:https?://(?:www\.)?audioboom\.com/channel/perkins',
}
}, { # Direct mp3-file link
'url': 'https://audioboom.com/posts/8128496.mp3',
'md5': 'e329edf304d450def95c7f86a9165ee1',
'info_dict': {
'id': '8128496',
'ext': 'mp3',
'title': 'TCRNo8 / DAILY 03 - In Control',
'description': 'md5:44665f142db74858dfa21c5b34787948',
'duration': 1689.7,
'uploader': 'Lost Dot Podcast: The Trans Pyrenees and Transcontinental Race',
'uploader_url': r're:https?://(?:www\.)?audioboom\.com/channels/5003904',
}
}, {
'url': 'https://audioboom.com/posts/4279833-3-09-2016-czaban-hour-3?t=0',
'only_matching': True,
@@ -29,45 +35,23 @@ class AudioBoomIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(f'https://audioboom.com/posts/{video_id}', video_id)
webpage = self._download_webpage(url, video_id)
clip = None
clip_store = self._parse_json(
self._html_search_regex(
r'data-new-clip-store=(["\'])(?P<json>{.+?})\1',
webpage, 'clip store', default='{}', group='json'),
video_id, fatal=False)
if clip_store:
clips = clip_store.get('clips')
if clips and isinstance(clips, list) and isinstance(clips[0], dict):
clip = clips[0]
def from_clip(field):
if clip:
return clip.get(field)
audio_url = from_clip('clipURLPriorToLoading') or self._og_search_property(
'audio', webpage, 'audio url')
title = from_clip('title') or self._html_search_meta(
['og:title', 'og:audio:title', 'audio_title'], webpage)
description = from_clip('description') or clean_html(from_clip('formattedDescription')) or self._og_search_description(webpage)
duration = float_or_none(from_clip('duration') or self._html_search_meta(
'weibo:audio:duration', webpage))
uploader = from_clip('author') or self._html_search_meta(
['og:audio:artist', 'twitter:audio:artist_name', 'audio_artist'], webpage, 'uploader')
uploader_url = from_clip('author_url') or self._html_search_meta(
'audioboo:channel', webpage, 'uploader url')
clip_store = self._search_json(
r'data-react-class="V5DetailPagePlayer"\s*data-react-props=["\']',
webpage, 'clip store', video_id, fatal=False, transform_source=unescapeHTML)
clip = traverse_obj(clip_store, ('clips', 0), expected_type=dict) or {}
return {
'id': video_id,
'url': audio_url,
'title': title,
'description': description,
'duration': duration,
'uploader': uploader,
'uploader_url': uploader_url,
'url': clip.get('clipURLPriorToLoading') or self._og_search_property('audio', webpage, 'audio url'),
'title': clip.get('title') or self._html_search_meta(['og:title', 'og:audio:title', 'audio_title'], webpage),
'description': (clip.get('description') or clean_html(clip.get('formattedDescription'))
or self._og_search_description(webpage)),
'duration': float_or_none(clip.get('duration') or self._html_search_meta('weibo:audio:duration', webpage)),
'uploader': clip.get('author') or self._html_search_meta(
['og:audio:artist', 'twitter:audio:artist_name', 'audio_artist'], webpage, 'uploader'),
'uploader_url': clip.get('author_url') or self._html_search_regex(
r'<div class="avatar flex-shrink-0">\s*<a href="(?P<uploader_url>http[^"]+)"',
webpage, 'uploader url', fatal=False),
}

View File

@@ -0,0 +1,93 @@
from .common import InfoExtractor
from ..utils import int_or_none
class AudiodraftBaseIE(InfoExtractor):
def _audiodraft_extract_from_id(self, player_entry_id):
data_json = self._download_json(
'https://www.audiodraft.com/scripts/general/player/getPlayerInfoNew.php', player_entry_id,
headers={
'Content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
'X-Requested-With': 'XMLHttpRequest',
}, data=f'id={player_entry_id}'.encode('utf-8'))
return {
'id': str(data_json['entry_id']),
'title': data_json.get('entry_title'),
'url': data_json['path'],
'vcodec': 'none',
'ext': 'mp3',
'uploader': data_json.get('designer_name'),
'uploader_id': data_json.get('designer_id'),
'webpage_url': data_json.get('entry_url'),
'like_count': int_or_none(data_json.get('entry_likes')),
'average_rating': int_or_none(data_json.get('entry_rating')),
}
class AudiodraftCustomIE(AudiodraftBaseIE):
IE_NAME = 'Audiodraft:custom'
_VALID_URL = r'https?://(?:[-\w]+)\.audiodraft\.com/entry/(?P<id>\d+)'
_TESTS = [{
'url': 'http://nokiatune.audiodraft.com/entry/5874',
'info_dict': {
'id': '9485',
'ext': 'mp3',
'title': 'Hula Hula Calls',
'uploader': 'unclemaki',
'uploader_id': '13512',
'average_rating': 5,
'like_count': int,
},
}, {
'url': 'http://vikinggrace.audiodraft.com/entry/501',
'info_dict': {
'id': '22241',
'ext': 'mp3',
'title': 'MVG Happy',
'uploader': 'frog',
'uploader_id': '19142',
'average_rating': 5,
'like_count': int,
},
}, {
'url': 'http://timferriss.audiodraft.com/entry/765',
'info_dict': {
'id': '19710',
'ext': 'mp3',
'title': 'ferris03',
'uploader': 'malex',
'uploader_id': '17335',
'average_rating': 5,
'like_count': int,
},
}]
def _real_extract(self, url):
id = self._match_id(url)
webpage = self._download_webpage(url, id)
player_entry_id = self._search_regex(r'playAudio\(\'(player_entry_\d+)\'\);', webpage, id, 'play entry id')
return self._audiodraft_extract_from_id(player_entry_id)
class AudiodraftGenericIE(AudiodraftBaseIE):
IE_NAME = 'Audiodraft:generic'
_VALID_URL = r'https?://www\.audiodraft\.com/contests/[^/#]+#entries&eid=(?P<id>\d+)'
_TESTS = [{
'url': 'https://www.audiodraft.com/contests/570-Score-A-Video-Surprise-Us#entries&eid=30138',
'info_dict': {
'id': '30138',
'ext': 'mp3',
'title': 'DROP in sound_V2',
'uploader': 'TiagoSilva',
'uploader_id': '19452',
'average_rating': 4,
'like_count': int,
},
}]
def _real_extract(self, url):
id = self._match_id(url)
return self._audiodraft_extract_from_id(f'player_entry_{id}')

View File

@@ -1,6 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
import itertools
import time
@@ -29,6 +26,7 @@ class AudiomackIE(InfoExtractor):
}
},
# audiomack wrapper around soundcloud song
# Needs new test URL.
{
'add_ie': ['Soundcloud'],
'url': 'http://www.audiomack.com/song/hip-hop-daily/black-mamba-freestyle',

View File

@@ -1,11 +1,8 @@
# coding: utf-8
from __future__ import unicode_literals
import random
from .common import InfoExtractor
from ..utils import ExtractorError, try_get, compat_str, str_or_none
from ..compat import compat_urllib_parse_unquote
from ..compat import compat_str, compat_urllib_parse_unquote
from ..utils import ExtractorError, str_or_none, try_get
class AudiusBaseIE(InfoExtractor):
@@ -171,7 +168,7 @@ class AudiusIE(AudiusBaseIE):
}
class AudiusTrackIE(AudiusIE):
class AudiusTrackIE(AudiusIE): # XXX: Do not subclass from concrete IE
_VALID_URL = r'''(?x)(?:audius:)(?:https?://(?:www\.)?.+/v1/tracks/)?(?P<track_id>\w+)'''
IE_NAME = 'audius:track'
IE_DESC = 'Audius track ID or API link. Prepend with "audius:"'
@@ -246,7 +243,7 @@ class AudiusPlaylistIE(AudiusBaseIE):
playlist_data.get('description'))
class AudiusProfileIE(AudiusPlaylistIE):
class AudiusProfileIE(AudiusPlaylistIE): # XXX: Do not subclass from concrete IE
IE_NAME = 'audius:artist'
IE_DESC = 'Audius.co profile/artist pages'
_VALID_URL = r'https?://(?:www)?audius\.co/(?P<id>[^\/]+)/?(?:[?#]|$)'

View File

@@ -1,6 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
import base64
from .common import InfoExtractor
@@ -44,7 +41,7 @@ class AWAANBaseIE(InfoExtractor):
'id': video_id,
'title': title,
'description': video_data.get('description_en') or video_data.get('description_ar'),
'thumbnail': format_field(img, template='http://admin.mangomolo.com/analytics/%s'),
'thumbnail': format_field(img, None, 'http://admin.mangomolo.com/analytics/%s'),
'duration': int_or_none(video_data.get('duration')),
'timestamp': parse_iso8601(video_data.get('create_time'), ' '),
'is_live': is_live,

View File

@@ -1,6 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
import datetime
import hashlib
import hmac
@@ -9,7 +6,7 @@ from .common import InfoExtractor
from ..compat import compat_urllib_parse_urlencode
class AWSIE(InfoExtractor):
class AWSIE(InfoExtractor): # XXX: Conventionally, base classes should end with BaseIE/InfoExtractor
_AWS_ALGORITHM = 'AWS4-HMAC-SHA256'
_AWS_REGION = 'us-east-1'

View File

@@ -1,6 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
import json
from .common import InfoExtractor
@@ -11,11 +8,12 @@ class AZMedienIE(InfoExtractor):
IE_DESC = 'AZ Medien videos'
_VALID_URL = r'''(?x)
https?://
(?:www\.)?
(?:www\.|tv\.)?
(?P<host>
telezueri\.ch|
telebaern\.tv|
telem1\.ch
telem1\.ch|
tvo-online\.ch
)/
[^/]+/
(?P<id>
@@ -30,7 +28,7 @@ class AZMedienIE(InfoExtractor):
'''
_TESTS = [{
'url': 'https://www.telezueri.ch/sonntalk/bundesrats-vakanzen-eu-rahmenabkommen-133214569',
'url': 'https://tv.telezueri.ch/sonntalk/bundesrats-vakanzen-eu-rahmenabkommen-133214569',
'info_dict': {
'id': '1_anruz3wy',
'ext': 'mp4',
@@ -38,6 +36,9 @@ class AZMedienIE(InfoExtractor):
'uploader_id': 'TVOnline',
'upload_date': '20180930',
'timestamp': 1538328802,
'view_count': int,
'thumbnail': 'http://cfvod.kaltura.com/p/1719221/sp/171922100/thumbnail/entry_id/1_anruz3wy/version/100031',
'duration': 1930
},
'params': {
'skip_download': True,

View File

@@ -1,7 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import unescapeHTML

View File

@@ -0,0 +1,148 @@
import math
from .common import InfoExtractor
from ..compat import (
compat_urllib_parse_urlparse,
compat_parse_qs,
)
from ..utils import (
format_field,
InAdvancePagedList,
traverse_obj,
unified_timestamp,
)
class BanByeBaseIE(InfoExtractor):
_API_BASE = 'https://api.banbye.com'
_CDN_BASE = 'https://cdn.banbye.com'
_VIDEO_BASE = 'https://banbye.com/watch'
@staticmethod
def _extract_playlist_id(url, param='playlist'):
return compat_parse_qs(
compat_urllib_parse_urlparse(url).query).get(param, [None])[0]
def _extract_playlist(self, playlist_id):
data = self._download_json(f'{self._API_BASE}/playlists/{playlist_id}', playlist_id)
return self.playlist_result([
self.url_result(f'{self._VIDEO_BASE}/{video_id}', BanByeIE)
for video_id in data['videoIds']], playlist_id, data.get('name'))
class BanByeIE(BanByeBaseIE):
_VALID_URL = r'https?://(?:www\.)?banbye.com/(?:en/)?watch/(?P<id>\w+)'
_TESTS = [{
'url': 'https://banbye.com/watch/v_ytfmvkVYLE8T',
'md5': '2f4ea15c5ca259a73d909b2cfd558eb5',
'info_dict': {
'id': 'v_ytfmvkVYLE8T',
'ext': 'mp4',
'title': 'md5:5ec098f88a0d796f987648de6322ba0f',
'description': 'md5:4d94836e73396bc18ef1fa0f43e5a63a',
'uploader': 'wRealu24',
'channel_id': 'ch_wrealu24',
'channel_url': 'https://banbye.com/channel/ch_wrealu24',
'timestamp': 1647604800,
'upload_date': '20220318',
'duration': 1931,
'thumbnail': r're:https?://.*\.webp',
'tags': 'count:5',
'like_count': int,
'dislike_count': int,
'view_count': int,
'comment_count': int,
},
}, {
'url': 'https://banbye.com/watch/v_2JjQtqjKUE_F?playlistId=p_Ld82N6gBw_OJ',
'info_dict': {
'title': 'Krzysztof Karoń',
'id': 'p_Ld82N6gBw_OJ',
},
'playlist_count': 9,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
playlist_id = self._extract_playlist_id(url, 'playlistId')
if self._yes_playlist(playlist_id, video_id):
return self._extract_playlist(playlist_id)
data = self._download_json(f'{self._API_BASE}/videos/{video_id}', video_id)
thumbnails = [{
'id': f'{quality}p',
'url': f'{self._CDN_BASE}/video/{video_id}/{quality}.webp',
} for quality in [48, 96, 144, 240, 512, 1080]]
formats = [{
'format_id': f'http-{quality}p',
'quality': quality,
'url': f'{self._CDN_BASE}/video/{video_id}/{quality}.mp4',
} for quality in data['quality']]
return {
'id': video_id,
'title': data.get('title'),
'description': data.get('desc'),
'uploader': traverse_obj(data, ('channel', 'name')),
'channel_id': data.get('channelId'),
'channel_url': format_field(data, 'channelId', 'https://banbye.com/channel/%s'),
'timestamp': unified_timestamp(data.get('publishedAt')),
'duration': data.get('duration'),
'tags': data.get('tags'),
'formats': formats,
'thumbnails': thumbnails,
'like_count': data.get('likes'),
'dislike_count': data.get('dislikes'),
'view_count': data.get('views'),
'comment_count': data.get('commentCount'),
}
class BanByeChannelIE(BanByeBaseIE):
_VALID_URL = r'https?://(?:www\.)?banbye.com/(?:en/)?channel/(?P<id>\w+)'
_TESTS = [{
'url': 'https://banbye.com/channel/ch_wrealu24',
'info_dict': {
'title': 'wRealu24',
'id': 'ch_wrealu24',
'description': 'md5:da54e48416b74dfdde20a04867c0c2f6',
},
'playlist_mincount': 791,
}, {
'url': 'https://banbye.com/channel/ch_wrealu24?playlist=p_Ld82N6gBw_OJ',
'info_dict': {
'title': 'Krzysztof Karoń',
'id': 'p_Ld82N6gBw_OJ',
},
'playlist_count': 9,
}]
_PAGE_SIZE = 100
def _real_extract(self, url):
channel_id = self._match_id(url)
playlist_id = self._extract_playlist_id(url)
if playlist_id:
return self._extract_playlist(playlist_id)
def page_func(page_num):
data = self._download_json(f'{self._API_BASE}/videos', channel_id, query={
'channelId': channel_id,
'sort': 'new',
'limit': self._PAGE_SIZE,
'offset': page_num * self._PAGE_SIZE,
}, note=f'Downloading page {page_num+1}')
return [
self.url_result(f"{self._VIDEO_BASE}/{video['_id']}", BanByeIE)
for video in data['items']
]
channel_data = self._download_json(f'{self._API_BASE}/channels/{channel_id}', channel_id)
entries = InAdvancePagedList(
page_func,
math.ceil(channel_data['videoCount'] / self._PAGE_SIZE),
self._PAGE_SIZE)
return self.playlist_result(
entries, channel_id, channel_data.get('name'), channel_data.get('description'))

View File

@@ -1,11 +1,8 @@
# coding: utf-8
from __future__ import unicode_literals
from .brightcove import BrightcoveNewIE
from .brightcove import BrightcoveNewBaseIE
from ..utils import extract_attributes
class BandaiChannelIE(BrightcoveNewIE):
class BandaiChannelIE(BrightcoveNewBaseIE):
IE_NAME = 'bandaichannel'
_VALID_URL = r'https?://(?:www\.)?b-ch\.com/titles/(?P<id>\d+/\d+)'
_TESTS = [{

View File

@@ -1,6 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
import random
import re
import time
@@ -8,34 +5,42 @@ import time
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
KNOWN_EXTENSIONS,
ExtractorError,
float_or_none,
int_or_none,
KNOWN_EXTENSIONS,
parse_filesize,
str_or_none,
try_get,
update_url_query,
unified_strdate,
unified_timestamp,
update_url_query,
url_or_none,
urljoin,
)
class BandcampIE(InfoExtractor):
_VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<id>[^/?#&]+)'
_VALID_URL = r'https?://(?P<uploader>[^/]+)\.bandcamp\.com/track/(?P<id>[^/?#&]+)'
_EMBED_REGEX = [r'<meta property="og:url"[^>]*?content="(?P<url>.*?bandcamp\.com.*?)"']
_TESTS = [{
'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song',
'md5': 'c557841d5e50261777a6585648adf439',
'info_dict': {
'id': '1812978515',
'ext': 'mp3',
'title': "youtube-dl \"'/\\ä↭ - youtube-dl \"'/\\ä↭ - youtube-dl test song \"'/\\ä↭",
'title': 'youtube-dl "\'/\\ä↭ - youtube-dl "\'/\\ä↭ - youtube-dl test song "\'/\\ä↭',
'duration': 9.8485,
'uploader': 'youtube-dl "\'/\\ä↭',
'uploader': 'youtube-dl "\'/\\ä↭',
'upload_date': '20121129',
'timestamp': 1354224127,
'track': 'youtube-dl "\'/\\ä↭ - youtube-dl test song "\'/\\ä↭',
'album_artist': 'youtube-dl "\'/\\ä↭',
'track_id': '1812978515',
'artist': 'youtube-dl "\'/\\ä↭',
'uploader_url': 'https://youtube-dl.bandcamp.com',
'uploader_id': 'youtube-dl',
'thumbnail': 'https://f4.bcbits.com/img/a3216802731_5.jpg',
},
'_skip': 'There is a limit of 200 free downloads / month for the test song'
}, {
@@ -43,7 +48,8 @@ class BandcampIE(InfoExtractor):
'url': 'http://benprunty.bandcamp.com/track/lanius-battle',
'info_dict': {
'id': '2650410135',
'ext': 'aiff',
'ext': 'm4a',
'acodec': r're:[fa]lac',
'title': 'Ben Prunty - Lanius (Battle)',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'Ben Prunty',
@@ -56,7 +62,10 @@ class BandcampIE(InfoExtractor):
'track_number': 1,
'track_id': '2650410135',
'artist': 'Ben Prunty',
'album_artist': 'Ben Prunty',
'album': 'FTL: Advanced Edition Soundtrack',
'uploader_url': 'https://benprunty.bandcamp.com',
'uploader_id': 'benprunty',
},
}, {
# no free download, mp3 128
@@ -77,7 +86,34 @@ class BandcampIE(InfoExtractor):
'track_number': 5,
'track_id': '2584466013',
'artist': 'Mastodon',
'album_artist': 'Mastodon',
'album': 'Call of the Mastodon',
'uploader_url': 'https://relapsealumni.bandcamp.com',
'uploader_id': 'relapsealumni',
},
}, {
# track from compilation album (artist/album_artist difference)
'url': 'https://diskotopia.bandcamp.com/track/safehouse',
'md5': '19c5337bca1428afa54129f86a2f6a69',
'info_dict': {
'id': '1978174799',
'ext': 'mp3',
'title': 'submerse - submerse - Safehouse',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'submerse',
'timestamp': 1480779297,
'upload_date': '20161203',
'release_timestamp': 1481068800,
'release_date': '20161207',
'duration': 154.066,
'track': 'submerse - Safehouse',
'track_number': 3,
'track_id': '1978174799',
'artist': 'submerse',
'album_artist': 'Diskotopia',
'album': 'DSK F/W 2016-2017 Free Compilation',
'uploader_url': 'https://diskotopia.bandcamp.com',
'uploader_id': 'diskotopia',
},
}]
@@ -87,7 +123,7 @@ class BandcampIE(InfoExtractor):
attr + ' data', group=2), video_id, fatal=fatal)
def _real_extract(self, url):
title = self._match_id(url)
title, uploader = self._match_valid_url(url).group('id', 'uploader')
webpage = self._download_webpage(url, title)
tralbum = self._extract_data_attr(webpage, title)
thumbnail = self._og_search_thumbnail(webpage)
@@ -123,6 +159,9 @@ class BandcampIE(InfoExtractor):
embed = self._extract_data_attr(webpage, title, 'embed', False)
current = tralbum.get('current') or {}
artist = embed.get('artist') or current.get('artist') or tralbum.get('artist')
album_artist = self._html_search_regex(
r'<h3 class="albumTitle">[\S\s]*?by\s*<span>\s*<a href="[^>]+">\s*([^>]+?)\s*</a>',
webpage, 'album artist', fatal=False)
timestamp = unified_timestamp(
current.get('publish_date') or tralbum.get('album_publish_date'))
@@ -183,10 +222,9 @@ class BandcampIE(InfoExtractor):
'format_note': f.get('description'),
'filesize': parse_filesize(f.get('size_mb')),
'vcodec': 'none',
'acodec': format_id.split('-')[0],
})
self._sort_formats(formats)
title = '%s - %s' % (artist, track) if artist else track
if not duration:
@@ -198,6 +236,8 @@ class BandcampIE(InfoExtractor):
'title': title,
'thumbnail': thumbnail,
'uploader': artist,
'uploader_id': uploader,
'uploader_url': f'https://{uploader}.bandcamp.com',
'timestamp': timestamp,
'release_timestamp': unified_timestamp(tralbum.get('album_release_date')),
'duration': duration,
@@ -206,13 +246,14 @@ class BandcampIE(InfoExtractor):
'track_id': track_id,
'artist': artist,
'album': embed.get('album_title'),
'album_artist': album_artist,
'formats': formats,
}
class BandcampAlbumIE(BandcampIE):
class BandcampAlbumIE(BandcampIE): # XXX: Do not subclass from concrete IE
IE_NAME = 'Bandcamp:album'
_VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?!/music)(?:/album/(?P<id>[^/?#&]+))?'
_VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com/album/(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1',
@@ -257,14 +298,6 @@ class BandcampAlbumIE(BandcampIE):
'id': 'hierophany-of-the-open-grave',
},
'playlist_mincount': 9,
}, {
'url': 'http://dotscale.bandcamp.com',
'info_dict': {
'title': 'Loom',
'id': 'dotscale',
'uploader_id': 'dotscale',
},
'playlist_mincount': 7,
}, {
# with escaped quote in title
'url': 'https://jstrecords.bandcamp.com/album/entropy-ep',
@@ -321,7 +354,7 @@ class BandcampAlbumIE(BandcampIE):
}
class BandcampWeeklyIE(BandcampIE):
class BandcampWeeklyIE(BandcampIE): # XXX: Do not subclass from concrete IE
IE_NAME = 'Bandcamp:weekly'
_VALID_URL = r'https?://(?:www\.)?bandcamp\.com/?\?(?:.*?&)?show=(?P<id>\d+)'
_TESTS = [{
@@ -370,7 +403,6 @@ class BandcampWeeklyIE(BandcampIE):
'ext': ext,
'vcodec': 'none',
})
self._sort_formats(formats)
title = show.get('audio_title') or 'Bandcamp Weekly'
subtitle = show.get('subtitle')
@@ -391,41 +423,63 @@ class BandcampWeeklyIE(BandcampIE):
}
class BandcampMusicIE(InfoExtractor):
_VALID_URL = r'https?://(?P<id>[^/]+)\.bandcamp\.com/music'
class BandcampUserIE(InfoExtractor):
IE_NAME = 'Bandcamp:user'
_VALID_URL = r'https?://(?!www\.)(?P<id>[^.]+)\.bandcamp\.com(?:/music)?/?(?:[#?]|$)'
_TESTS = [{
# Type 1 Bandcamp user page.
'url': 'https://adrianvonziegler.bandcamp.com',
'info_dict': {
'id': 'adrianvonziegler',
'title': 'Discography of adrianvonziegler',
},
'playlist_mincount': 23,
}, {
# Bandcamp user page with only one album
'url': 'http://dotscale.bandcamp.com',
'info_dict': {
'id': 'dotscale',
'title': 'Discography of dotscale'
},
'playlist_count': 1,
}, {
# Type 2 Bandcamp user page.
'url': 'https://nightcallofficial.bandcamp.com',
'info_dict': {
'id': 'nightcallofficial',
'title': 'Discography of nightcallofficial',
},
'playlist_count': 4,
}, {
'url': 'https://steviasphere.bandcamp.com/music',
'playlist_mincount': 47,
'info_dict': {
'id': 'steviasphere',
'title': 'Discography of steviasphere',
},
}, {
'url': 'https://coldworldofficial.bandcamp.com/music',
'playlist_mincount': 10,
'info_dict': {
'id': 'coldworldofficial',
'title': 'Discography of coldworldofficial',
},
}, {
'url': 'https://nuclearwarnowproductions.bandcamp.com/music',
'playlist_mincount': 399,
'info_dict': {
'id': 'nuclearwarnowproductions',
'title': 'Discography of nuclearwarnowproductions',
},
}
]
_TYPE_IE_DICT = {
'album': BandcampAlbumIE.ie_key(),
'track': BandcampIE.ie_key()
}
}]
def _real_extract(self, url):
id = self._match_id(url)
webpage = self._download_webpage(url, id)
items = re.findall(r'href\=\"\/(?P<path>(?P<type>album|track)+/[^\"]+)', webpage)
entries = [
self.url_result(
f'https://{id}.bandcamp.com/{item[0]}',
ie=self._TYPE_IE_DICT[item[1]])
for item in items]
return self.playlist_result(entries, id)
uploader = self._match_id(url)
webpage = self._download_webpage(url, uploader)
discography_data = (re.findall(r'<li data-item-id=["\'][^>]+>\s*<a href=["\'](?![^"\'/]*?/merch)([^"\']+)', webpage)
or re.findall(r'<div[^>]+trackTitle["\'][^"\']+["\']([^"\']+)', webpage))
return self.playlist_from_matches(
discography_data, uploader, f'Discography of {uploader}', getter=lambda x: urljoin(url, x))

View File

@@ -1,5 +1,3 @@
from __future__ import unicode_literals
import json
from .common import InfoExtractor
@@ -137,7 +135,6 @@ query GetCommentReplies($id: String!) {
formats.extend(self._extract_m3u8_formats(
video_info.get('streamUrl'), video_id, 'mp4',
entry_protocol='m3u8_native', m3u8_id='hls', live=True))
self._sort_formats(formats)
return {
'id': video_id,

View File

@@ -1,18 +1,12 @@
# coding: utf-8
from __future__ import unicode_literals
import functools
import itertools
import json
import re
import urllib.error
import xml.etree.ElementTree
from .common import InfoExtractor
from ..compat import (
compat_etree_Element,
compat_HTTPError,
compat_str,
compat_urlparse,
)
from ..compat import compat_HTTPError, compat_str, compat_urlparse
from ..utils import (
ExtractorError,
OnDemandPagedList,
@@ -38,7 +32,7 @@ from ..utils import (
class BBCCoUkIE(InfoExtractor):
IE_NAME = 'bbc.co.uk'
IE_DESC = 'BBC iPlayer'
_ID_REGEX = r'(?:[pbm][\da-z]{7}|w[\da-z]{7,14})'
_ID_REGEX = r'(?:[pbml][\da-z]{7}|w[\da-z]{7,14})'
_VALID_URL = r'''(?x)
https?://
(?:www\.)?bbc\.co\.uk/
@@ -52,6 +46,7 @@ class BBCCoUkIE(InfoExtractor):
)
(?P<id>%s)(?!/(?:episodes|broadcasts|clips))
''' % _ID_REGEX
_EMBED_REGEX = [r'setPlaylist\("(?P<url>https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)']
_LOGIN_URL = 'https://account.bbc.com/signin'
_NETRC_MACHINE = 'bbc'
@@ -263,11 +258,7 @@ class BBCCoUkIE(InfoExtractor):
'only_matching': True,
}]
def _login(self):
username, password = self._get_login_info()
if username is None:
return
def _perform_login(self, username, password):
login_page = self._download_webpage(
self._LOGIN_URL, None, 'Downloading signin page')
@@ -293,9 +284,6 @@ class BBCCoUkIE(InfoExtractor):
'Unable to login: %s' % error, expected=True)
raise ExtractorError('Unable to log in')
def _real_initialize(self):
self._login()
class MediaSelectionError(Exception):
def __init__(self, id):
self.id = id
@@ -324,7 +312,7 @@ class BBCCoUkIE(InfoExtractor):
continue
captions = self._download_xml(
cc_url, programme_id, 'Downloading captions', fatal=False)
if not isinstance(captions, compat_etree_Element):
if not isinstance(captions, xml.etree.ElementTree.Element):
continue
subtitles['en'] = [
{
@@ -394,9 +382,17 @@ class BBCCoUkIE(InfoExtractor):
formats.extend(self._extract_mpd_formats(
href, programme_id, mpd_id=format_id, fatal=False))
elif transfer_format == 'hls':
formats.extend(self._extract_m3u8_formats(
href, programme_id, ext='mp4', entry_protocol='m3u8_native',
m3u8_id=format_id, fatal=False))
# TODO: let expected_status be passed into _extract_xxx_formats() instead
try:
fmts = self._extract_m3u8_formats(
href, programme_id, ext='mp4', entry_protocol='m3u8_native',
m3u8_id=format_id, fatal=False)
except ExtractorError as e:
if not (isinstance(e.exc_info[1], urllib.error.HTTPError)
and e.exc_info[1].code in (403, 404)):
raise
fmts = []
formats.extend(fmts)
elif transfer_format == 'hds':
formats.extend(self._extract_f4m_formats(
href, programme_id, f4m_id=format_id, fatal=False))
@@ -579,8 +575,6 @@ class BBCCoUkIE(InfoExtractor):
else:
programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)
self._sort_formats(formats)
return {
'id': programme_id,
'title': title,
@@ -592,10 +586,15 @@ class BBCCoUkIE(InfoExtractor):
}
class BBCIE(BBCCoUkIE):
class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
IE_NAME = 'bbc'
IE_DESC = 'BBC'
_VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
_VALID_URL = r'''(?x)
https?://(?:www\.)?(?:
bbc\.(?:com|co\.uk)|
bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd\.onion|
bbcweb3hytmzhn5d532owbu6oqadra5z3ar726vq5kgwwn6aucdccrad\.onion
)/(?:[^/]+/)+(?P<id>[^/#?]+)'''
_MEDIA_SETS = [
'pc',
@@ -784,21 +783,33 @@ class BBCIE(BBCCoUkIE):
'timestamp': 1437785037,
'upload_date': '20150725',
},
}, {
# video with window.__INITIAL_DATA__ and value as JSON string
'url': 'https://www.bbc.com/news/av/world-europe-59468682',
'info_dict': {
'id': 'p0b71qth',
'ext': 'mp4',
'title': 'Why France is making this woman a national hero',
'description': 'md5:7affdfab80e9c3a1f976230a1ff4d5e4',
'thumbnail': r're:https?://.+/.+\.jpg',
'timestamp': 1638230731,
'upload_date': '20211130',
},
}, {
# single video article embedded with data-media-vpid
'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
'only_matching': True,
}, {
# bbcthreeConfig
'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1',
'info_dict': {
'id': 'p06556y7',
'ext': 'mp4',
'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
'description': 'md5:4b7dfd063d5a789a1512e99662be3ddd',
'title': 'Things Not To Say to people that live on council estates',
'description': "From being labelled a 'chav', to the presumption that they're 'scroungers', people who live on council estates encounter all kinds of prejudices and false assumptions about themselves, their families, and their lifestyles. Here, eight people discuss the common statements, misconceptions, and clichés that they're tired of hearing.",
'duration': 360,
'thumbnail': r're:https?://.+/.+\.jpg',
},
'params': {
'skip_download': True,
}
}, {
# window.__PRELOADED_STATE__
'url': 'https://www.bbc.co.uk/radio/play/b0b9z4yl',
@@ -833,6 +844,12 @@ class BBCIE(BBCCoUkIE):
'upload_date': '20190604',
'categories': ['Psychology'],
},
}, { # onion routes
'url': 'https://www.bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd.onion/news/av/world-europe-63208576',
'only_matching': True,
}, {
'url': 'https://www.bbcweb3hytmzhn5d532owbu6oqadra5z3ar726vq5kgwwn6aucdccrad.onion/sport/av/football/63195681',
'only_matching': True,
}]
@classmethod
@@ -871,7 +888,6 @@ class BBCIE(BBCCoUkIE):
def _extract_from_playlist_sxml(self, url, playlist_id, timestamp):
programme_id, title, description, duration, formats, subtitles = \
self._process_legacy_playlist_url(url, playlist_id)
self._sort_formats(formats)
return {
'id': programme_id,
'title': title,
@@ -890,13 +906,8 @@ class BBCIE(BBCCoUkIE):
json_ld_info = self._search_json_ld(webpage, playlist_id, default={})
timestamp = json_ld_info.get('timestamp')
playlist_title = json_ld_info.get('title')
if not playlist_title:
playlist_title = self._og_search_title(
webpage, default=None) or self._html_search_regex(
r'<title>(.+?)</title>', webpage, 'playlist title', default=None)
if playlist_title:
playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()
playlist_title = json_ld_info.get('title') or re.sub(
r'(.+)\s*-\s*BBC.*?$', r'\1', self._generic_title('', webpage, default='')).strip() or None
playlist_description = json_ld_info.get(
'description') or self._og_search_description(webpage, default=None)
@@ -940,7 +951,6 @@ class BBCIE(BBCCoUkIE):
duration = int_or_none(items[0].get('duration'))
programme_id = items[0].get('vpid')
formats, subtitles = self._download_media_selector(programme_id)
self._sort_formats(formats)
entries.append({
'id': programme_id,
'title': title,
@@ -977,7 +987,6 @@ class BBCIE(BBCCoUkIE):
continue
raise
if entry:
self._sort_formats(entry['formats'])
entries.append(entry)
if entries:
@@ -1001,7 +1010,6 @@ class BBCIE(BBCCoUkIE):
if programme_id:
formats, subtitles = self._download_media_selector(programme_id)
self._sort_formats(formats)
# digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
digital_data = self._parse_json(
self._search_regex(
@@ -1033,7 +1041,6 @@ class BBCIE(BBCCoUkIE):
if version_id:
title = smp_data['title']
formats, subtitles = self._download_media_selector(version_id)
self._sort_formats(formats)
image_url = smp_data.get('holdingImageURL')
display_date = init_data.get('displayDate')
topic_title = init_data.get('topicTitle')
@@ -1075,7 +1082,6 @@ class BBCIE(BBCCoUkIE):
continue
title = lead_media.get('title') or self._og_search_title(webpage)
formats, subtitles = self._download_media_selector(programme_id)
self._sort_formats(formats)
description = lead_media.get('summary')
uploader = lead_media.get('masterBrand')
uploader_id = lead_media.get('mid')
@@ -1104,7 +1110,6 @@ class BBCIE(BBCCoUkIE):
if current_programme and programme_id and current_programme.get('type') == 'playable_item':
title = current_programme.get('titles', {}).get('tertiary') or playlist_title
formats, subtitles = self._download_media_selector(programme_id)
self._sort_formats(formats)
synopses = current_programme.get('synopses') or {}
network = current_programme.get('network') or {}
duration = int_or_none(
@@ -1137,7 +1142,6 @@ class BBCIE(BBCCoUkIE):
clip_title = clip.get('title')
if clip_vpid and clip_title:
formats, subtitles = self._download_media_selector(clip_vpid)
self._sort_formats(formats)
return {
'id': clip_vpid,
'title': clip_title,
@@ -1159,7 +1163,6 @@ class BBCIE(BBCCoUkIE):
if not programme_id:
continue
formats, subtitles = self._download_media_selector(programme_id)
self._sort_formats(formats)
entries.append({
'id': programme_id,
'title': playlist_title,
@@ -1171,9 +1174,16 @@ class BBCIE(BBCCoUkIE):
return self.playlist_result(
entries, playlist_id, playlist_title, playlist_description)
initial_data = self._parse_json(self._search_regex(
r'window\.__INITIAL_DATA__\s*=\s*({.+?});', webpage,
'preload state', default='{}'), playlist_id, fatal=False)
initial_data = self._search_regex(
r'window\.__INITIAL_DATA__\s*=\s*("{.+?}")\s*;', webpage,
'quoted preload state', default=None)
if initial_data is None:
initial_data = self._search_regex(
r'window\.__INITIAL_DATA__\s*=\s*({.+?})\s*;', webpage,
'preload state', default={})
else:
initial_data = self._parse_json(initial_data or '"{}"', playlist_id, fatal=False)
initial_data = self._parse_json(initial_data, playlist_id, fatal=False)
if initial_data:
def parse_media(media):
if not media:
@@ -1184,7 +1194,6 @@ class BBCIE(BBCCoUkIE):
if not (item_id and item_title):
continue
formats, subtitles = self._download_media_selector(item_id)
self._sort_formats(formats)
item_desc = None
blocks = try_get(media, lambda x: x['summary']['blocks'], list)
if blocks:
@@ -1214,8 +1223,11 @@ class BBCIE(BBCCoUkIE):
if name == 'media-experience':
parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict))
elif name == 'article':
for block in (try_get(resp, lambda x: x['data']['blocks'], list) or []):
if block.get('type') != 'media':
for block in (try_get(resp,
(lambda x: x['data']['blocks'],
lambda x: x['data']['content']['model']['blocks'],),
list) or []):
if block.get('type') not in ['media', 'video']:
continue
parse_media(block.get('model'))
return self.playlist_result(
@@ -1282,7 +1294,6 @@ class BBCIE(BBCCoUkIE):
formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
if not formats and not self.get_param('ignore_no_formats'):
continue
self._sort_formats(formats)
video_id = media_meta.get('externalId')
if not video_id:

View File

@@ -0,0 +1,101 @@
from .common import InfoExtractor
from .youtube import YoutubeIE, YoutubeTabIE
class BeatBumpVideoIE(InfoExtractor):
_VALID_URL = r'https://beatbump\.ml/listen\?id=(?P<id>[\w-]+)'
_TESTS = [{
'url': 'https://beatbump.ml/listen?id=MgNrAu2pzNs',
'md5': '5ff3fff41d3935b9810a9731e485fe66',
'info_dict': {
'id': 'MgNrAu2pzNs',
'ext': 'mp4',
'uploader_url': 'http://www.youtube.com/channel/UC-pWHpBjdGG69N9mM2auIAA',
'artist': 'Stephen',
'thumbnail': 'https://i.ytimg.com/vi_webp/MgNrAu2pzNs/maxresdefault.webp',
'channel_url': 'https://www.youtube.com/channel/UC-pWHpBjdGG69N9mM2auIAA',
'upload_date': '20190312',
'categories': ['Music'],
'playable_in_embed': True,
'duration': 169,
'like_count': int,
'alt_title': 'Voyeur Girl',
'view_count': int,
'track': 'Voyeur Girl',
'uploader': 'Stephen - Topic',
'title': 'Voyeur Girl',
'channel_follower_count': int,
'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA',
'age_limit': 0,
'availability': 'public',
'live_status': 'not_live',
'album': 'it\'s too much love to know my dear',
'channel': 'Stephen',
'comment_count': int,
'description': 'md5:7ae382a65843d6df2685993e90a8628f',
'tags': 'count:11',
'creator': 'Stephen',
'channel_id': 'UC-pWHpBjdGG69N9mM2auIAA',
}
}]
def _real_extract(self, url):
id_ = self._match_id(url)
return self.url_result(f'https://music.youtube.com/watch?v={id_}', YoutubeIE, id_)
class BeatBumpPlaylistIE(InfoExtractor):
_VALID_URL = r'https://beatbump\.ml/(?:release\?id=|artist/|playlist/)(?P<id>[\w-]+)'
_TESTS = [{
'url': 'https://beatbump.ml/release?id=MPREb_gTAcphH99wE',
'playlist_count': 50,
'info_dict': {
'id': 'OLAK5uy_l1m0thk3g31NmIIz_vMIbWtyv7eZixlH0',
'availability': 'unlisted',
'view_count': int,
'title': 'Album - Royalty Free Music Library V2 (50 Songs)',
'description': '',
'tags': [],
'modified_date': '20221223',
}
}, {
'url': 'https://beatbump.ml/artist/UC_aEa8K-EOJ3D6gOs7HcyNg',
'playlist_mincount': 1,
'params': {'flatplaylist': True},
'info_dict': {
'id': 'UC_aEa8K-EOJ3D6gOs7HcyNg',
'uploader_url': 'https://www.youtube.com/channel/UC_aEa8K-EOJ3D6gOs7HcyNg',
'channel_url': 'https://www.youtube.com/channel/UC_aEa8K-EOJ3D6gOs7HcyNg',
'uploader_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg',
'channel_follower_count': int,
'title': 'NoCopyrightSounds - Videos',
'uploader': 'NoCopyrightSounds',
'description': 'md5:cd4fd53d81d363d05eee6c1b478b491a',
'channel': 'NoCopyrightSounds',
'tags': 'count:12',
'channel_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg',
},
}, {
'url': 'https://beatbump.ml/playlist/VLPLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq',
'playlist_mincount': 1,
'params': {'flatplaylist': True},
'info_dict': {
'id': 'PLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq',
'uploader_url': 'https://www.youtube.com/@NoCopyrightSounds',
'description': 'Providing you with copyright free / safe music for gaming, live streaming, studying and more!',
'view_count': int,
'channel_url': 'https://www.youtube.com/@NoCopyrightSounds',
'uploader_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg',
'title': 'NCS : All Releases 💿',
'uploader': 'NoCopyrightSounds',
'availability': 'public',
'channel': 'NoCopyrightSounds',
'tags': [],
'modified_date': '20221225',
'channel_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg',
}
}]
def _real_extract(self, url):
id_ = self._match_id(url)
return self.url_result(f'https://music.youtube.com/browse/{id_}', YoutubeTabIE, id_)

View File

@@ -1,6 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
@@ -77,7 +74,6 @@ class BeatportIE(InfoExtractor):
fmt['abr'] = 96
fmt['asr'] = 44100
formats.append(fmt)
self._sort_formats(formats)
images = []
for name, info in track['images'].items():

View File

@@ -1,32 +1,43 @@
from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import (
compat_str,
)
from ..utils import (
int_or_none,
parse_qs,
traverse_obj,
try_get,
unified_timestamp,
)
class BeegIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?beeg\.(?:com|porn(?:/video)?)/(?P<id>\d+)'
_VALID_URL = r'https?://(?:www\.)?beeg\.(?:com(?:/video)?)/-?(?P<id>\d+)'
_TESTS = [{
# api/v6 v1
'url': 'http://beeg.com/5416503',
'md5': 'a1a1b1a8bc70a89e49ccfd113aed0820',
'url': 'https://beeg.com/-0983946056129650',
'md5': '51d235147c4627cfce884f844293ff88',
'info_dict': {
'id': '5416503',
'id': '0983946056129650',
'ext': 'mp4',
'title': 'Sultry Striptease',
'description': 'md5:d22219c09da287c14bed3d6c37ce4bc2',
'timestamp': 1391813355,
'upload_date': '20140207',
'duration': 383,
'title': 'sucked cock and fucked in a private plane',
'duration': 927,
'tags': list,
'age_limit': 18,
'upload_date': '20220131',
'timestamp': 1643656455,
'display_id': 2540839,
}
}, {
'url': 'https://beeg.com/-0599050563103750?t=4-861',
'md5': 'bd8b5ea75134f7f07fad63008db2060e',
'info_dict': {
'id': '0599050563103750',
'ext': 'mp4',
'title': 'Bad Relatives',
'duration': 2060,
'tags': list,
'age_limit': 18,
'description': 'md5:b4fc879a58ae6c604f8f259155b7e3b9',
'timestamp': 1643623200,
'display_id': 2569965,
'upload_date': '20220131',
}
}, {
# api/v6 v2
@@ -36,12 +47,6 @@ class BeegIE(InfoExtractor):
# api/v6 v2 w/o t
'url': 'https://beeg.com/1277207756',
'only_matching': True,
}, {
'url': 'https://beeg.porn/video/5416503',
'only_matching': True,
}, {
'url': 'https://beeg.porn/5416503',
'only_matching': True,
}]
def _real_extract(self, url):
@@ -49,68 +54,36 @@ class BeegIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
beeg_version = self._search_regex(
r'beeg_version\s*=\s*([\da-zA-Z_-]+)', webpage, 'beeg version',
default='1546225636701')
video = self._download_json(
'https://store.externulls.com/facts/file/%s' % video_id,
video_id, 'Downloading JSON for %s' % video_id)
if len(video_id) >= 10:
query = {
'v': 2,
}
qs = parse_qs(url)
t = qs.get('t', [''])[0].split('-')
if len(t) > 1:
query.update({
's': t[0],
'e': t[1],
})
else:
query = {'v': 1}
fc_facts = video.get('fc_facts')
first_fact = {}
for fact in fc_facts:
if not first_fact or try_get(fact, lambda x: x['id'] < first_fact['id']):
first_fact = fact
for api_path in ('', 'api.'):
video = self._download_json(
'https://%sbeeg.com/api/v6/%s/video/%s'
% (api_path, beeg_version, video_id), video_id,
fatal=api_path == 'api.', query=query)
if video:
break
resources = traverse_obj(video, ('file', 'hls_resources')) or first_fact.get('hls_resources')
formats = []
for format_id, video_url in video.items():
if not video_url:
for format_id, video_uri in resources.items():
if not video_uri:
continue
height = self._search_regex(
r'^(\d+)[pP]$', format_id, 'height', default=None)
if not height:
continue
formats.append({
'url': self._proto_relative_url(
video_url.replace('{DATA_MARKERS}', 'data=pc_XX__%s_0' % beeg_version), 'https:'),
'format_id': format_id,
'height': int(height),
})
self._sort_formats(formats)
title = video['title']
video_id = compat_str(video.get('id') or video_id)
display_id = video.get('code')
description = video.get('desc')
series = video.get('ps_name')
timestamp = unified_timestamp(video.get('date'))
duration = int_or_none(video.get('duration'))
tags = [tag.strip() for tag in video['tags'].split(',')] if video.get('tags') else None
height = int_or_none(self._search_regex(r'fl_cdn_(\d+)', format_id, 'height', default=None))
current_formats = self._extract_m3u8_formats(f'https://video.beeg.com/{video_uri}', video_id, ext='mp4', m3u8_id=str(height))
for f in current_formats:
f['height'] = height
formats.extend(current_formats)
return {
'id': video_id,
'display_id': display_id,
'title': title,
'description': description,
'series': series,
'timestamp': timestamp,
'duration': duration,
'tags': tags,
'display_id': first_fact.get('id'),
'title': traverse_obj(video, ('file', 'stuff', 'sf_name')),
'description': traverse_obj(video, ('file', 'stuff', 'sf_story')),
'timestamp': unified_timestamp(first_fact.get('fc_created')),
'duration': int_or_none(traverse_obj(video, ('file', 'fl_duration'))),
'tags': traverse_obj(video, ('tags', ..., 'tg_name')),
'formats': formats,
'age_limit': self._rta_search(webpage),
}

View File

@@ -1,7 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import url_basename

View File

@@ -1,7 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
@@ -28,7 +24,7 @@ class BellMediaIE(InfoExtractor):
)/.*?(?:\b(?:vid(?:eoid)?|clipId)=|-vid|~|%7E|/(?:episode)?)(?P<id>[0-9]{6,})'''
_TESTS = [{
'url': 'https://www.bnnbloomberg.ca/video/david-cockfield-s-top-picks~1403070',
'md5': '36d3ef559cfe8af8efe15922cd3ce950',
'md5': '3e5b8e38370741d5089da79161646635',
'info_dict': {
'id': '1403070',
'ext': 'flv',
@@ -36,6 +32,14 @@ class BellMediaIE(InfoExtractor):
'description': 'md5:810f7f8c6a83ad5b48677c3f8e5bb2c3',
'upload_date': '20180525',
'timestamp': 1527288600,
'season_id': 73997,
'season': '2018',
'thumbnail': 'http://images2.9c9media.com/image_asset/2018_5_25_baf30cbd-b28d-4a18-9903-4bb8713b00f5_PNG_956x536.jpg',
'tags': [],
'categories': ['ETFs'],
'season_number': 8,
'duration': 272.038,
'series': 'Market Call Tonight',
},
}, {
'url': 'http://www.thecomedynetwork.ca/video/player?vid=923582',

View File

@@ -0,0 +1,70 @@
from .common import InfoExtractor
from ..utils import float_or_none, mimetype2ext, traverse_obj
class BerufeTVIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?web\.arbeitsagentur\.de/berufetv/[^?#]+/film;filmId=(?P<id>[\w-]+)'
_TESTS = [{
'url': 'https://web.arbeitsagentur.de/berufetv/studienberufe/wirtschaftswissenschaften/wirtschaftswissenschaften-volkswirtschaft/film;filmId=DvKC3DUpMKvUZ_6fEnfg3u',
'md5': '041b6432ec8e6838f84a5c30f31cc795',
'info_dict': {
'id': 'DvKC3DUpMKvUZ_6fEnfg3u',
'ext': 'mp4',
'title': 'Volkswirtschaftslehre',
'description': 'md5:6bd87d0c63163480a6489a37526ee1c1',
'categories': ['Studien&shy;beruf'],
'tags': ['Studienfilm'],
'duration': 602.440,
'thumbnail': r're:^https://asset-out-cdn\.video-cdn\.net/private/videos/DvKC3DUpMKvUZ_6fEnfg3u/thumbnails/793063\?quality=thumbnail&__token__=[^\s]+$',
}
}]
def _real_extract(self, url):
video_id = self._match_id(url)
movie_metadata = self._download_json(
'https://rest.arbeitsagentur.de/infosysbub/berufetv/pc/v1/film-metadata',
video_id, 'Downloading JSON metadata',
headers={'X-API-Key': '79089773-4892-4386-86e6-e8503669f426'}, fatal=False)
meta = traverse_obj(
movie_metadata, ('metadaten', lambda _, i: video_id == i['miId']),
get_all=False, default={})
video = self._download_json(
f'https://d.video-cdn.net/play/player/8YRzUk6pTzmBdrsLe9Y88W/video/{video_id}',
video_id, 'Downloading video JSON')
formats, subtitles = [], {}
for key, source in video['videoSources']['html'].items():
if key == 'auto':
fmts, subs = self._extract_m3u8_formats_and_subtitles(source[0]['source'], video_id)
formats += fmts
subtitles = subs
else:
formats.append({
'url': source[0]['source'],
'ext': mimetype2ext(source[0]['mimeType']),
'format_id': key,
})
for track in video.get('videoTracks') or []:
if track.get('type') != 'SUBTITLES':
continue
subtitles.setdefault(track['language'], []).append({
'url': track['source'],
'name': track.get('label'),
'ext': 'vtt'
})
return {
'id': video_id,
'title': meta.get('titel') or traverse_obj(video, ('videoMetaData', 'title')),
'description': meta.get('beschreibung'),
'thumbnail': meta.get('thumbnail') or f'https://asset-out-cdn.video-cdn.net/private/videos/{video_id}/thumbnails/active',
'duration': float_or_none(video.get('duration'), scale=1000),
'categories': [meta['kategorie']] if meta.get('kategorie') else None,
'tags': meta.get('themengebiete'),
'subtitles': subtitles,
'formats': formats,
}

View File

@@ -1,5 +1,3 @@
from __future__ import unicode_literals
from .mtv import MTVServicesInfoExtractor
from ..utils import unified_strdate

View File

@@ -1,6 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor

View File

@@ -1,6 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
@@ -8,7 +5,7 @@ from ..utils import extract_attributes
class BFMTVBaseIE(InfoExtractor):
_VALID_URL_BASE = r'https?://(?:www\.)?bfmtv\.com/'
_VALID_URL_BASE = r'https?://(?:www\.|rmc\.)?bfmtv\.com/'
_VALID_URL_TMPL = _VALID_URL_BASE + r'(?:[^/]+/)*[^/?&#]+_%s[A-Z]-(?P<id>\d{12})\.html'
_VIDEO_BLOCK_REGEX = r'(<div[^>]+class="video_block"[^>]*>)'
BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s'
@@ -34,6 +31,9 @@ class BFMTVIE(BFMTVBaseIE):
'uploader_id': '876450610001',
'upload_date': '20201002',
'timestamp': 1601629620,
'duration': 44.757,
'tags': ['bfmactu', 'politique'],
'thumbnail': 'https://cf-images.eu-west-1.prod.boltdns.net/v1/static/876450610001/5041f4c1-bc48-4af8-a256-1b8300ad8ef0/cf2f9114-e8e2-4494-82b4-ab794ea4bc7d/1920x1080/match/image.jpg',
},
}]
@@ -45,7 +45,7 @@ class BFMTVIE(BFMTVBaseIE):
return self._brightcove_url_result(video_block['videoid'], video_block)
class BFMTVLiveIE(BFMTVIE):
class BFMTVLiveIE(BFMTVIE): # XXX: Do not subclass from concrete IE
IE_NAME = 'bfmtv:live'
_VALID_URL = BFMTVBaseIE._VALID_URL_BASE + '(?P<id>(?:[^/]+/)?en-direct)'
_TESTS = [{
@@ -84,6 +84,20 @@ class BFMTVArticleIE(BFMTVBaseIE):
}, {
'url': 'https://www.bfmtv.com/sante/covid-19-oui-le-vaccin-de-pfizer-distribue-en-france-a-bien-ete-teste-sur-des-personnes-agees_AN-202101060275.html',
'only_matching': True,
}, {
'url': 'https://rmc.bfmtv.com/actualites/societe/transports/ce-n-est-plus-tout-rentable-le-bioethanol-e85-depasse-1eu-le-litre-des-automobilistes-regrettent_AV-202301100268.html',
'info_dict': {
'id': '6318445464112',
'ext': 'mp4',
'title': 'Le plein de bioéthanol fait de plus en plus mal à la pompe',
'description': None,
'uploader_id': '876630703001',
'upload_date': '20230110',
'timestamp': 1673341692,
'duration': 109.269,
'tags': ['rmc', 'show', 'apolline de malherbe', 'info', 'talk', 'matinale', 'radio'],
'thumbnail': 'https://cf-images.eu-west-1.prod.boltdns.net/v1/static/876630703001/5bef74b8-9d5e-4480-a21f-60c2e2480c46/96c88b74-f9db-45e1-8040-e199c5da216c/1920x1080/match/image.jpg'
}
}]
def _real_extract(self, url):

View File

@@ -1,6 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor

View File

@@ -1,6 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
@@ -66,8 +63,6 @@ class BigflixIE(InfoExtractor):
'url': decode_url(file_url),
})
self._sort_formats(formats)
description = self._html_search_meta('description', webpage)
return {

View File

@@ -0,0 +1,56 @@
from .common import InfoExtractor
from ..utils import ExtractorError, urlencode_postdata
class BigoIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?bigo\.tv/(?:[a-z]{2,}/)?(?P<id>[^/]+)'
_TESTS = [{
'url': 'https://www.bigo.tv/ja/221338632',
'info_dict': {
'id': '6576287577575737440',
'title': '土よ〜💁‍♂️ 休憩室/REST room',
'thumbnail': r're:https?://.+',
'uploader': '✨Shin💫',
'uploader_id': '221338632',
'is_live': True,
},
'skip': 'livestream',
}, {
'url': 'https://www.bigo.tv/th/Tarlerm1304',
'only_matching': True,
}, {
'url': 'https://bigo.tv/115976881',
'only_matching': True,
}]
def _real_extract(self, url):
user_id = self._match_id(url)
info_raw = self._download_json(
'https://ta.bigo.tv/official_website/studio/getInternalStudioInfo',
user_id, data=urlencode_postdata({'siteId': user_id}))
if not isinstance(info_raw, dict):
raise ExtractorError('Received invalid JSON data')
if info_raw.get('code'):
raise ExtractorError(
'Bigo says: %s (code %s)' % (info_raw.get('msg'), info_raw.get('code')), expected=True)
info = info_raw.get('data') or {}
if not info.get('alive'):
raise ExtractorError('This user is offline.', expected=True)
formats, subs = self._extract_m3u8_formats_and_subtitles(
info.get('hls_src'), user_id, 'mp4', 'm3u8')
return {
'id': info.get('roomId') or user_id,
'title': info.get('roomTopic') or info.get('nick_name') or user_id,
'formats': formats,
'subtitles': subs,
'thumbnail': info.get('snapshot'),
'uploader': info.get('nick_name'),
'uploader_id': user_id,
'is_live': True,
}

View File

@@ -1,6 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
int_or_none,

File diff suppressed because it is too large Load Diff

View File

@@ -1,6 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
ExtractorError,

View File

@@ -1,29 +1,27 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from .vk import VKIE
from ..compat import (
compat_b64decode,
compat_urllib_parse_unquote,
from ..compat import compat_b64decode
from ..utils import (
int_or_none,
js_to_json,
traverse_obj,
unified_timestamp,
)
from ..utils import int_or_none
class BIQLEIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?biqle\.(?:com|org|ru)/watch/(?P<id>-?\d+_\d+)'
_TESTS = [{
# Youtube embed
'url': 'https://biqle.ru/watch/-115995369_456239081',
'md5': '97af5a06ee4c29bbf9c001bdb1cf5c06',
'url': 'https://biqle.ru/watch/-2000421746_85421746',
'md5': 'ae6ef4f04d19ac84e4658046d02c151c',
'info_dict': {
'id': '8v4f-avW-VI',
'id': '-2000421746_85421746',
'ext': 'mp4',
'title': "PASSE-PARTOUT - L'ete c'est fait pour jouer",
'description': 'Passe-Partout',
'uploader_id': 'mrsimpsonstef3',
'uploader': 'Phanolito',
'upload_date': '20120822',
'title': 'Forsaken By Hope Studio Clip',
'description': 'Forsaken By Hope Studio Clip — Смотреть онлайн',
'upload_date': '19700101',
'thumbnail': r're:https://[^/]+/impf/7vN3ACwSTgChP96OdOfzFjUCzFR6ZglDQgWsIw/KPaACiVJJxM\.jpg\?size=800x450&quality=96&keep_aspect_ratio=1&background=000000&sign=b48ea459c4d33dbcba5e26d63574b1cb&type=video_thumb',
'timestamp': 0,
},
}, {
'url': 'http://biqle.org/watch/-44781847_168547604',
@@ -32,54 +30,62 @@ class BIQLEIE(InfoExtractor):
'id': '-44781847_168547604',
'ext': 'mp4',
'title': 'Ребенок в шоке от автоматической мойки',
'description': 'Ребенок в шоке от автоматической мойки — Смотреть онлайн',
'timestamp': 1396633454,
'uploader': 'Dmitry Kotov',
'upload_date': '20140404',
'uploader_id': '47850140',
'thumbnail': r're:https://[^/]+/c535507/u190034692/video/l_b84df002\.jpg',
},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
embed_url = self._proto_relative_url(self._search_regex(
r'<iframe.+?src="((?:https?:)?//(?:daxab\.com|dxb\.to|[^/]+/player)/[^"]+)".*?></iframe>',
webpage, 'embed url'))
title = self._html_search_meta('name', webpage, 'Title', fatal=False)
timestamp = unified_timestamp(self._html_search_meta('uploadDate', webpage, 'Upload Date', default=None))
description = self._html_search_meta('description', webpage, 'Description', default=None)
global_embed_url = self._search_regex(
r'<script[^<]+?window.globEmbedUrl\s*=\s*\'((?:https?:)?//(?:daxab\.com|dxb\.to|[^/]+/player)/[^\']+)\'',
webpage, 'global Embed url')
hash = self._search_regex(
r'<script id="data-embed-video[^<]+?hash: "([^"]+)"[^<]*</script>', webpage, 'Hash')
embed_url = global_embed_url + hash
if VKIE.suitable(embed_url):
return self.url_result(embed_url, VKIE.ie_key(), video_id)
embed_page = self._download_webpage(
embed_url, video_id, headers={'Referer': url})
video_ext = self._get_cookies(embed_url).get('video_ext')
if video_ext:
video_ext = compat_urllib_parse_unquote(video_ext.value)
if not video_ext:
video_ext = compat_b64decode(self._search_regex(
r'video_ext\s*:\s*[\'"]([A-Za-z0-9+/=]+)',
embed_page, 'video_ext')).decode()
video_id, sig, _, access_token = video_ext.split(':')
embed_url, video_id, 'Downloading embed webpage', headers={'Referer': url})
glob_params = self._parse_json(self._search_regex(
r'<script id="globParams">[^<]*window.globParams = ([^;]+);[^<]+</script>',
embed_page, 'Global Parameters'), video_id, transform_source=js_to_json)
host_name = compat_b64decode(glob_params['server'][::-1]).decode()
item = self._download_json(
'https://api.vk.com/method/video.get', video_id,
headers={'User-Agent': 'okhttp/3.4.1'}, query={
'access_token': access_token,
'sig': sig,
'v': 5.44,
f'https://{host_name}/method/video.get/{video_id}', video_id,
headers={'Referer': url}, query={
'token': glob_params['video']['access_token'],
'videos': video_id,
'ckey': glob_params['c_key'],
'credentials': glob_params['video']['credentials'],
})['response']['items'][0]
title = item['title']
formats = []
for f_id, f_url in item.get('files', {}).items():
if f_id == 'external':
return self.url_result(f_url)
ext, height = f_id.split('_')
formats.append({
'format_id': height + 'p',
'url': f_url,
'height': int_or_none(height),
'ext': ext,
})
self._sort_formats(formats)
height_extra_key = traverse_obj(glob_params, ('video', 'partial', 'quality', height))
if height_extra_key:
formats.append({
'format_id': f'{height}p',
'url': f'https://{host_name}/{f_url[8:]}&videos={video_id}&extra_key={height_extra_key}',
'height': int_or_none(height),
'ext': ext,
})
thumbnails = []
for k, v in item.items():
@@ -96,10 +102,9 @@ class BIQLEIE(InfoExtractor):
'title': title,
'formats': formats,
'comment_count': int_or_none(item.get('comments')),
'description': item.get('description'),
'description': description,
'duration': int_or_none(item.get('duration')),
'thumbnails': thumbnails,
'timestamp': int_or_none(item.get('date')),
'uploader': item.get('owner_id'),
'timestamp': timestamp,
'view_count': int_or_none(item.get('views')),
}

View File

@@ -1,14 +1,20 @@
# coding: utf-8
from __future__ import unicode_literals
import itertools
import functools
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
GeoRestrictedError,
HEADRequest,
OnDemandPagedList,
clean_html,
get_element_by_class,
get_element_by_id,
get_elements_html_by_class,
int_or_none,
orderedSet,
parse_count,
parse_duration,
traverse_obj,
unified_strdate,
urlencode_postdata,
)
@@ -16,11 +22,12 @@ from ..utils import (
class BitChuteIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?bitchute\.com/(?:video|embed|torrent/[^/]+)/(?P<id>[^/?#&]+)'
_EMBED_REGEX = [rf'<(?:script|iframe)[^>]+\bsrc=(["\'])(?P<url>{_VALID_URL})']
_TESTS = [{
'url': 'https://www.bitchute.com/video/UGlrF9o9b-Q/',
'md5': '7e427d7ed7af5a75b5855705ec750e2b',
'info_dict': {
'id': 'szoMrox2JEI',
'id': 'UGlrF9o9b-Q',
'ext': 'mp4',
'title': 'This is the first video on #BitChute !',
'description': 'md5:a0337e7b1fe39e32336974af8173a034',
@@ -28,6 +35,31 @@ class BitChuteIE(InfoExtractor):
'uploader': 'BitChute',
'upload_date': '20170103',
},
}, {
# video not downloadable in browser, but we can recover it
'url': 'https://www.bitchute.com/video/2s6B3nZjAk7R/',
'md5': '05c12397d5354bf24494885b08d24ed1',
'info_dict': {
'id': '2s6B3nZjAk7R',
'ext': 'mp4',
'filesize': 71537926,
'title': 'STYXHEXENHAMMER666 - Election Fraud, Clinton 2020, EU Armies, and Gun Control',
'description': 'md5:228ee93bd840a24938f536aeac9cf749',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'BitChute',
'upload_date': '20181113',
},
'params': {'check_formats': None},
}, {
# restricted video
'url': 'https://www.bitchute.com/video/WEnQU7XGcTdl/',
'info_dict': {
'id': 'WEnQU7XGcTdl',
'ext': 'mp4',
'title': 'Impartial Truth - Ein Letzter Appell an die Vernunft',
},
'params': {'skip_download': True},
'skip': 'Georestricted in DE',
}, {
'url': 'https://www.bitchute.com/embed/lbb5G1hjPhw/',
'only_matching': True,
@@ -35,124 +67,167 @@ class BitChuteIE(InfoExtractor):
'url': 'https://www.bitchute.com/torrent/Zee5BE49045h/szoMrox2JEI.webtorrent',
'only_matching': True,
}]
_GEO_BYPASS = False
@staticmethod
def _extract_urls(webpage):
return [
mobj.group('url')
for mobj in re.finditer(
r'<(?:script|iframe)[^>]+\bsrc=(["\'])(?P<url>%s)' % BitChuteIE._VALID_URL,
webpage)]
_HEADERS = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.57 Safari/537.36',
'Referer': 'https://www.bitchute.com/',
}
def _check_format(self, video_url, video_id):
urls = orderedSet(
re.sub(r'(^https?://)(seed\d+)(?=\.bitchute\.com)', fr'\g<1>{host}', video_url)
for host in (r'\g<2>', 'seed150', 'seed151', 'seed152', 'seed153'))
for url in urls:
try:
response = self._request_webpage(
HEADRequest(url), video_id=video_id, note=f'Checking {url}', headers=self._HEADERS)
except ExtractorError as e:
self.to_screen(f'{video_id}: URL is invalid, skipping: {e.cause}')
continue
return {
'url': url,
'filesize': int_or_none(response.headers.get('Content-Length'))
}
def _raise_if_restricted(self, webpage):
page_title = clean_html(get_element_by_class('page-title', webpage)) or ''
if re.fullmatch(r'(?:Channel|Video) Restricted', page_title):
reason = clean_html(get_element_by_id('page-detail', webpage)) or page_title
self.raise_geo_restricted(reason)
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(
'https://www.bitchute.com/video/%s' % video_id, video_id, headers={
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.57 Safari/537.36',
})
f'https://www.bitchute.com/video/{video_id}', video_id, headers=self._HEADERS)
title = self._html_search_regex(
(r'<[^>]+\bid=["\']video-title[^>]+>([^<]+)', r'<title>([^<]+)'),
webpage, 'title', default=None) or self._html_search_meta(
'description', webpage, 'title',
default=None) or self._og_search_description(webpage)
self._raise_if_restricted(webpage)
publish_date = clean_html(get_element_by_class('video-publish-date', webpage))
entries = self._parse_html5_media_entries(url, webpage, video_id)
format_urls = []
for mobj in re.finditer(
r'addWebSeed\s*\(\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage):
format_urls.append(mobj.group('url'))
format_urls.extend(re.findall(r'as=(https?://[^&"\']+)', webpage))
formats = [
{'url': format_url}
for format_url in orderedSet(format_urls)]
formats = []
for format_ in traverse_obj(entries, (0, 'formats', ...)):
if self.get_param('check_formats') is not False:
format_.update(self._check_format(format_.pop('url'), video_id) or {})
if 'url' not in format_:
continue
formats.append(format_)
if not formats:
entries = self._parse_html5_media_entries(
url, webpage, video_id)
if not entries:
error = self._html_search_regex(r'<h1 class="page-title">([^<]+)</h1>', webpage, 'error', default='Cannot find video')
if error == 'Video Unavailable':
raise GeoRestrictedError(error)
raise ExtractorError(error)
formats = entries[0]['formats']
self._check_formats(formats, video_id)
self._sort_formats(formats)
description = self._html_search_regex(
r'(?s)<div\b[^>]+\bclass=["\']full hidden[^>]+>(.+?)</div>',
webpage, 'description', fatal=False)
thumbnail = self._og_search_thumbnail(
webpage, default=None) or self._html_search_meta(
'twitter:image:src', webpage, 'thumbnail')
uploader = self._html_search_regex(
(r'(?s)<div class=["\']channel-banner.*?<p\b[^>]+\bclass=["\']name[^>]+>(.+?)</p>',
r'(?s)<p\b[^>]+\bclass=["\']video-author[^>]+>(.+?)</p>'),
webpage, 'uploader', fatal=False)
upload_date = unified_strdate(self._search_regex(
r'class=["\']video-publish-date[^>]+>[^<]+ at \d+:\d+ UTC on (.+?)\.',
webpage, 'upload date', fatal=False))
self.raise_no_formats(
'Video is unavailable. Please make sure this video is playable in the browser '
'before reporting this issue.', expected=True, video_id=video_id)
return {
'id': video_id,
'title': title,
'description': description,
'thumbnail': thumbnail,
'uploader': uploader,
'upload_date': upload_date,
'title': self._html_extract_title(webpage) or self._og_search_title(webpage),
'description': self._og_search_description(webpage, default=None),
'thumbnail': self._og_search_thumbnail(webpage),
'uploader': clean_html(get_element_by_class('owner', webpage)),
'upload_date': unified_strdate(self._search_regex(
r'at \d+:\d+ UTC on (.+?)\.', publish_date, 'upload date', fatal=False)),
'formats': formats,
}
class BitChuteChannelIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?bitchute\.com/channel/(?P<id>[^/?#&]+)'
_TEST = {
'url': 'https://www.bitchute.com/channel/victoriaxrave/',
'playlist_mincount': 185,
_VALID_URL = r'https?://(?:www\.)?bitchute\.com/(?P<type>channel|playlist)/(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'https://www.bitchute.com/channel/bitchute/',
'info_dict': {
'id': 'victoriaxrave',
'id': 'bitchute',
'title': 'BitChute',
'description': 'md5:5329fb3866125afa9446835594a9b138',
},
}
'playlist': [
{
'md5': '7e427d7ed7af5a75b5855705ec750e2b',
'info_dict': {
'id': 'UGlrF9o9b-Q',
'ext': 'mp4',
'filesize': None,
'title': 'This is the first video on #BitChute !',
'description': 'md5:a0337e7b1fe39e32336974af8173a034',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'BitChute',
'upload_date': '20170103',
'duration': 16,
'view_count': int,
},
}
],
'params': {
'skip_download': True,
'playlist_items': '-1',
},
}, {
'url': 'https://www.bitchute.com/playlist/wV9Imujxasw9/',
'playlist_mincount': 20,
'info_dict': {
'id': 'wV9Imujxasw9',
'title': 'Bruce MacDonald and "The Light of Darkness"',
'description': 'md5:04913227d2714af1d36d804aa2ab6b1e',
}
}]
_TOKEN = 'zyG6tQcGPE5swyAEFLqKUwMuMMuF6IO2DZ6ZDQjGfsL0e4dcTLwqkTTul05Jdve7'
PAGE_SIZE = 25
HTML_CLASS_NAMES = {
'channel': {
'container': 'channel-videos-container',
'title': 'channel-videos-title',
'description': 'channel-videos-text',
},
'playlist': {
'container': 'playlist-video',
'title': 'title',
'description': 'description',
}
def _entries(self, channel_id):
channel_url = 'https://www.bitchute.com/channel/%s/' % channel_id
offset = 0
for page_num in itertools.count(1):
data = self._download_json(
'%sextend/' % channel_url, channel_id,
'Downloading channel page %d' % page_num,
data=urlencode_postdata({
'csrfmiddlewaretoken': self._TOKEN,
'name': '',
'offset': offset,
}), headers={
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'Referer': channel_url,
'X-Requested-With': 'XMLHttpRequest',
'Cookie': 'csrftoken=%s' % self._TOKEN,
})
if data.get('success') is False:
break
html = data.get('html')
if not html:
break
video_ids = re.findall(
r'class=["\']channel-videos-image-container[^>]+>\s*<a\b[^>]+\bhref=["\']/video/([^"\'/]+)',
html)
if not video_ids:
break
offset += len(video_ids)
for video_id in video_ids:
yield self.url_result(
'https://www.bitchute.com/video/%s' % video_id,
ie=BitChuteIE.ie_key(), video_id=video_id)
}
@staticmethod
def _make_url(playlist_id, playlist_type):
return f'https://www.bitchute.com/{playlist_type}/{playlist_id}/'
def _fetch_page(self, playlist_id, playlist_type, page_num):
playlist_url = self._make_url(playlist_id, playlist_type)
data = self._download_json(
f'{playlist_url}extend/', playlist_id, f'Downloading page {page_num}',
data=urlencode_postdata({
'csrfmiddlewaretoken': self._TOKEN,
'name': '',
'offset': page_num * self.PAGE_SIZE,
}), headers={
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'Referer': playlist_url,
'X-Requested-With': 'XMLHttpRequest',
'Cookie': f'csrftoken={self._TOKEN}',
})
if not data.get('success'):
return
classes = self.HTML_CLASS_NAMES[playlist_type]
for video_html in get_elements_html_by_class(classes['container'], data.get('html')):
video_id = self._search_regex(
r'<a\s[^>]*\bhref=["\']/video/([^"\'/]+)', video_html, 'video id', default=None)
if not video_id:
continue
yield self.url_result(
f'https://www.bitchute.com/video/{video_id}', BitChuteIE, video_id, url_transparent=True,
title=clean_html(get_element_by_class(classes['title'], video_html)),
description=clean_html(get_element_by_class(classes['description'], video_html)),
duration=parse_duration(get_element_by_class('video-duration', video_html)),
view_count=parse_count(clean_html(get_element_by_class('video-views', video_html))))
def _real_extract(self, url):
channel_id = self._match_id(url)
playlist_type, playlist_id = self._match_valid_url(url).group('type', 'id')
webpage = self._download_webpage(self._make_url(playlist_id, playlist_type), playlist_id)
page_func = functools.partial(self._fetch_page, playlist_id, playlist_type)
return self.playlist_result(
self._entries(channel_id), playlist_id=channel_id)
OnDemandPagedList(page_func, self.PAGE_SIZE), playlist_id,
title=self._html_extract_title(webpage, default=None),
description=self._html_search_meta(
('description', 'og:description', 'twitter:description'), webpage, default=None),
playlist_count=int_or_none(self._html_search_regex(
r'<span>(\d+)\s+videos?</span>', webpage, 'playlist count', default=None)))

View File

@@ -1,5 +1,3 @@
from __future__ import unicode_literals
from .common import InfoExtractor
@@ -47,7 +45,6 @@ class BitwaveStreamIE(InfoExtractor):
formats = self._extract_m3u8_formats(
channel['data']['url'], username,
'mp4')
self._sort_formats(formats)
return {
'id': username,

View File

@@ -1,7 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import parse_iso8601

View File

@@ -1,6 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from .amp import AMPIE
from ..utils import (

View File

@@ -1,86 +0,0 @@
from __future__ import unicode_literals
import json
from .common import InfoExtractor
from ..utils import (
remove_start,
int_or_none,
)
class BlinkxIE(InfoExtractor):
_VALID_URL = r'(?:https?://(?:www\.)blinkx\.com/#?ce/|blinkx:)(?P<id>[^?]+)'
IE_NAME = 'blinkx'
_TEST = {
'url': 'http://www.blinkx.com/ce/Da0Gw3xc5ucpNduzLuDDlv4WC9PuI4fDi1-t6Y3LyfdY2SZS5Urbvn-UPJvrvbo8LTKTc67Wu2rPKSQDJyZeeORCR8bYkhs8lI7eqddznH2ofh5WEEdjYXnoRtj7ByQwt7atMErmXIeYKPsSDuMAAqJDlQZ-3Ff4HJVeH_s3Gh8oQ',
'md5': '337cf7a344663ec79bf93a526a2e06c7',
'info_dict': {
'id': 'Da0Gw3xc',
'ext': 'mp4',
'title': 'No Daily Show for John Oliver; HBO Show Renewed - IGN News',
'uploader': 'IGN News',
'upload_date': '20150217',
'timestamp': 1424215740,
'description': 'HBO has renewed Last Week Tonight With John Oliver for two more seasons.',
'duration': 47.743333,
},
}
def _real_extract(self, url):
video_id = self._match_id(url)
display_id = video_id[:8]
api_url = ('https://apib4.blinkx.com/api.php?action=play_video&'
+ 'video=%s' % video_id)
data_json = self._download_webpage(api_url, display_id)
data = json.loads(data_json)['api']['results'][0]
duration = None
thumbnails = []
formats = []
for m in data['media']:
if m['type'] == 'jpg':
thumbnails.append({
'url': m['link'],
'width': int(m['w']),
'height': int(m['h']),
})
elif m['type'] == 'original':
duration = float(m['d'])
elif m['type'] == 'youtube':
yt_id = m['link']
self.to_screen('Youtube video detected: %s' % yt_id)
return self.url_result(yt_id, 'Youtube', video_id=yt_id)
elif m['type'] in ('flv', 'mp4'):
vcodec = remove_start(m['vcodec'], 'ff')
acodec = remove_start(m['acodec'], 'ff')
vbr = int_or_none(m.get('vbr') or m.get('vbitrate'), 1000)
abr = int_or_none(m.get('abr') or m.get('abitrate'), 1000)
tbr = vbr + abr if vbr and abr else None
format_id = '%s-%sk-%s' % (vcodec, tbr, m['w'])
formats.append({
'format_id': format_id,
'url': m['link'],
'vcodec': vcodec,
'acodec': acodec,
'abr': abr,
'vbr': vbr,
'tbr': tbr,
'width': int_or_none(m.get('w')),
'height': int_or_none(m.get('h')),
})
self._sort_formats(formats)
return {
'id': display_id,
'fullid': video_id,
'title': data['title'],
'formats': formats,
'uploader': data.get('channel_name'),
'timestamp': data.get('pubdate_epoch'),
'description': data.get('description'),
'thumbnails': thumbnails,
'duration': duration,
}

View File

@@ -1,8 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from ..utils import (
mimetype2ext,
parse_duration,
@@ -16,7 +11,7 @@ from .common import InfoExtractor
class BloggerIE(InfoExtractor):
IE_NAME = 'blogger.com'
_VALID_URL = r'https?://(?:www\.)?blogger\.com/video\.g\?token=(?P<id>.+)'
_VALID_EMBED = r'''<iframe[^>]+src=["']((?:https?:)?//(?:www\.)?blogger\.com/video\.g\?token=[^"']+)["']'''
_EMBED_REGEX = [r'''<iframe[^>]+src=["'](?P<url>(?:https?:)?//(?:www\.)?blogger\.com/video\.g\?token=[^"']+)["']''']
_TESTS = [{
'url': 'https://www.blogger.com/video.g?token=AD6v5dzEe9hfcARr5Hlq1WTkYy6t-fXH3BBahVhGvVHe5szdEUBEloSEDSTA8-b111089KbfWuBvTN7fnbxMtymsHhXAXwVvyzHH4Qch2cfLQdGxKQrrEuFpC1amSl_9GuLWODjPgw',
'md5': 'f1bc19b6ea1b0fd1d81e84ca9ec467ac',
@@ -29,10 +24,6 @@ class BloggerIE(InfoExtractor):
}
}]
@staticmethod
def _extract_urls(webpage):
return re.findall(BloggerIE._VALID_EMBED, webpage)
def _real_extract(self, url):
token_id = self._match_id(url)
webpage = self._download_webpage(url, token_id)

View File

@@ -1,6 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
@@ -10,13 +7,11 @@ class BloombergIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?bloomberg\.com/(?:[^/]+/)*(?P<id>[^/?#]+)'
_TESTS = [{
'url': 'http://www.bloomberg.com/news/videos/b/aaeae121-5949-481e-a1ce-4562db6f5df2',
# The md5 checksum changes
'url': 'https://www.bloomberg.com/news/videos/2021-09-14/apple-unveils-the-new-iphone-13-stock-doesn-t-move-much-video',
'info_dict': {
'id': 'qurhIVlJSB6hzkVi229d8g',
'id': 'V8cFcYMxTHaMcEiiYVr39A',
'ext': 'flv',
'title': 'Shah\'s Presentation on Foreign-Exchange Strategies',
'description': 'md5:a8ba0302912d03d246979735c17d2761',
'title': 'Apple Unveils the New IPhone 13, Stock Doesn\'t Move Much',
},
'params': {
'format': 'best[format_id^=hds]',
@@ -60,7 +55,7 @@ class BloombergIE(InfoExtractor):
title = re.sub(': Video$', '', self._og_search_title(webpage))
embed_info = self._download_json(
'http://www.bloomberg.com/api/embed?id=%s' % video_id, video_id)
'http://www.bloomberg.com/multimedia/api/embed?id=%s' % video_id, video_id)
formats = []
for stream in embed_info['streams']:
stream_url = stream.get('url')
@@ -72,7 +67,6 @@ class BloombergIE(InfoExtractor):
else:
formats.extend(self._extract_f4m_formats(
stream_url, video_id, f4m_id='hds', fatal=False))
self._sort_formats(formats)
return {
'id': video_id,

View File

@@ -1,7 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import compat_parse_qs
from ..utils import ExtractorError
@@ -25,8 +21,6 @@ class BokeCCBaseIE(InfoExtractor):
'quality': int(quality.attrib['value']),
} for quality in info_xml.findall('./video/quality')]
self._sort_formats(formats)
return formats

View File

@@ -1,6 +1,3 @@
from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
@@ -11,13 +8,28 @@ from ..utils import (
class BongaCamsIE(InfoExtractor):
_VALID_URL = r'https?://(?P<host>(?:[^/]+\.)?bongacams\d*\.com)/(?P<id>[^/?&#]+)'
_VALID_URL = r'https?://(?P<host>(?:[^/]+\.)?bongacams\d*\.(?:com|net))/(?P<id>[^/?&#]+)'
_TESTS = [{
'url': 'https://de.bongacams.com/azumi-8',
'only_matching': True,
}, {
'url': 'https://cn.bongacams.com/azumi-8',
'only_matching': True,
}, {
'url': 'https://de.bongacams.net/claireashton',
'info_dict': {
'id': 'claireashton',
'ext': 'mp4',
'title': r're:ClaireAshton \d{4}-\d{2}-\d{2} \d{2}:\d{2}',
'age_limit': 18,
'uploader_id': 'ClaireAshton',
'uploader': 'ClaireAshton',
'like_count': int,
'is_live': True,
},
'params': {
'skip_download': True,
},
}]
def _real_extract(self, url):
@@ -45,7 +57,6 @@ class BongaCamsIE(InfoExtractor):
formats = self._extract_m3u8_formats(
'%s/hls/stream_%s/playlist.m3u8' % (server_url, uploader_id),
channel_id, 'mp4', m3u8_id='hls', live=True)
self._sort_formats(formats)
return {
'id': channel_id,

View File

@@ -0,0 +1,86 @@
from .common import InfoExtractor
from ..utils import int_or_none, str_or_none, traverse_obj
class BooyahBaseIE(InfoExtractor):
_BOOYAH_SESSION_KEY = None
def _real_initialize(self):
BooyahBaseIE._BOOYAH_SESSION_KEY = self._request_webpage(
'https://booyah.live/api/v3/auths/sessions', None, data=b'').getheader('booyah-session-key')
def _get_comments(self, video_id):
comment_json = self._download_json(
f'https://booyah.live/api/v3/playbacks/{video_id}/comments/tops', video_id,
headers={'Booyah-Session-Key': self._BOOYAH_SESSION_KEY}, fatal=False) or {}
return [{
'id': comment.get('comment_id'),
'author': comment.get('from_nickname'),
'author_id': comment.get('from_uid'),
'author_thumbnail': comment.get('from_thumbnail'),
'text': comment.get('content'),
'timestamp': comment.get('create_time'),
'like_count': comment.get('like_cnt'),
} for comment in comment_json.get('comment_list') or ()]
class BooyahClipsIE(BooyahBaseIE):
_VALID_URL = r'https?://booyah.live/clips/(?P<id>\d+)'
_TESTS = [{
'url': 'https://booyah.live/clips/13887261322952306617',
'info_dict': {
'id': '13887261322952306617',
'ext': 'mp4',
'view_count': int,
'duration': 30,
'channel_id': 90565760,
'like_count': int,
'title': 'Cayendo con estilo 😎',
'uploader': '♡LɪMER',
'comment_count': int,
'uploader_id': '90565760',
'thumbnail': 'https://resmambet-a.akamaihd.net/mambet-storage/Clip/90565760/90565760-27204374-fba0-409d-9d7b-63a48b5c0e75.jpg',
'upload_date': '20220617',
'timestamp': 1655490556,
'modified_timestamp': 1655490556,
'modified_date': '20220617',
}
}]
def _real_extract(self, url):
video_id = self._match_id(url)
json_data = self._download_json(
f'https://booyah.live/api/v3/playbacks/{video_id}', video_id,
headers={'Booyah-Session-key': self._BOOYAH_SESSION_KEY})
formats = []
for video_data in json_data['playback']['endpoint_list']:
formats.extend(({
'url': video_data.get('stream_url'),
'ext': 'mp4',
'height': video_data.get('resolution'),
}, {
'url': video_data.get('download_url'),
'ext': 'mp4',
'format_note': 'Watermarked',
'height': video_data.get('resolution'),
'preference': -10,
}))
return {
'id': video_id,
'title': traverse_obj(json_data, ('playback', 'name')),
'thumbnail': traverse_obj(json_data, ('playback', 'thumbnail_url')),
'formats': formats,
'view_count': traverse_obj(json_data, ('playback', 'views')),
'like_count': traverse_obj(json_data, ('playback', 'likes')),
'duration': traverse_obj(json_data, ('playback', 'duration')),
'comment_count': traverse_obj(json_data, ('playback', 'comment_cnt')),
'channel_id': traverse_obj(json_data, ('playback', 'channel_id')),
'uploader': traverse_obj(json_data, ('user', 'nickname')),
'uploader_id': str_or_none(traverse_obj(json_data, ('user', 'uid'))),
'modified_timestamp': int_or_none(traverse_obj(json_data, ('playback', 'update_time_ms')), 1000),
'timestamp': int_or_none(traverse_obj(json_data, ('playback', 'create_time_ms')), 1000),
'__post_extractor': self.extract_comments(video_id, self._get_comments(video_id)),
}

View File

@@ -1,6 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor

View File

@@ -1,6 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
import json
from .common import InfoExtractor
@@ -82,8 +79,6 @@ class BoxIE(InfoExtractor):
'url': update_url_query(authenticated_download_url, query),
})
self._sort_formats(formats)
creator = f.get('created_by') or {}
return {

Some files were not shown because too many files have changed in this diff Show More