Upgrade yt_dlp and download script

This commit is contained in:
2025-05-02 16:11:08 -05:00
parent 3a2e8eeb08
commit d68d9ce4f9
1194 changed files with 60099 additions and 44436 deletions

View File

@@ -2,10 +2,10 @@ import functools
import itertools
import json
import re
import urllib.parse
import xml.etree.ElementTree
from .common import InfoExtractor
from ..compat import compat_str, compat_urlparse
from ..networking.exceptions import HTTPError
from ..utils import (
ExtractorError,
@@ -15,11 +15,13 @@ from ..utils import (
float_or_none,
get_element_by_class,
int_or_none,
join_nonempty,
js_to_json,
parse_duration,
parse_iso8601,
parse_qs,
strip_or_none,
traverse_obj,
try_get,
unescapeHTML,
unified_timestamp,
@@ -33,7 +35,7 @@ class BBCCoUkIE(InfoExtractor):
IE_NAME = 'bbc.co.uk'
IE_DESC = 'BBC iPlayer'
_ID_REGEX = r'(?:[pbml][\da-z]{7}|w[\da-z]{7,14})'
_VALID_URL = r'''(?x)
_VALID_URL = rf'''(?x)
https?://
(?:www\.)?bbc\.co\.uk/
(?:
@@ -41,11 +43,10 @@ class BBCCoUkIE(InfoExtractor):
iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
music/(?:clips|audiovideo/popular)[/#]|
radio/player/|
sounds/play/|
events/[^/]+/play/[^/]+/
)
(?P<id>%s)(?!/(?:episodes|broadcasts|clips))
''' % _ID_REGEX
(?P<id>{_ID_REGEX})(?!/(?:episodes|broadcasts|clips))
'''
_EMBED_REGEX = [r'setPlaylist\("(?P<url>https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)']
_LOGIN_URL = 'https://account.bbc.com/signin'
@@ -74,7 +75,7 @@ class BBCCoUkIE(InfoExtractor):
'params': {
# rtmp download
'skip_download': True,
}
},
},
{
'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
@@ -147,7 +148,7 @@ class BBCCoUkIE(InfoExtractor):
'params': {
# rtmp download
'skip_download': True,
}
},
}, {
'url': 'http://www.bbc.co.uk/music/clips/p025c0zz',
'note': 'Video',
@@ -161,7 +162,7 @@ class BBCCoUkIE(InfoExtractor):
'params': {
# rtmp download
'skip_download': True,
}
},
}, {
'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls',
'info_dict': {
@@ -218,20 +219,6 @@ class BBCCoUkIE(InfoExtractor):
# rtmp download
'skip_download': True,
},
}, {
'url': 'https://www.bbc.co.uk/sounds/play/m0007jzb',
'note': 'Audio',
'info_dict': {
'id': 'm0007jz9',
'ext': 'mp4',
'title': 'BBC Proms, 2019, Prom 34: WestEastern Divan Orchestra',
'description': "Live BBC Proms. WestEastern Divan Orchestra with Daniel Barenboim and Martha Argerich.",
'duration': 9840,
},
'params': {
# rtmp download
'skip_download': True,
}
}, {
'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
'only_matching': True,
@@ -281,19 +268,19 @@ class BBCCoUkIE(InfoExtractor):
error = clean_html(get_element_by_class('form-message', response))
if error:
raise ExtractorError(
'Unable to login: %s' % error, expected=True)
f'Unable to login: {error}', expected=True)
raise ExtractorError('Unable to log in')
class MediaSelectionError(Exception):
def __init__(self, id):
self.id = id
def __init__(self, error_id):
self.id = error_id
def _extract_asx_playlist(self, connection, programme_id):
asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist')
return [ref.get('href') for ref in asx.findall('./Entry/ref')]
def _extract_items(self, playlist):
return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS)
return playlist.findall(f'./{{{self._EMP_PLAYLIST_NS}}}item')
def _extract_medias(self, media_selection):
error = media_selection.get('result')
@@ -325,21 +312,30 @@ class BBCCoUkIE(InfoExtractor):
def _raise_extractor_error(self, media_selection_error):
raise ExtractorError(
'%s returned error: %s' % (self.IE_NAME, media_selection_error.id),
f'{self.IE_NAME} returned error: {media_selection_error.id}',
expected=True)
def _download_media_selector(self, programme_id):
last_exception = None
formats, subtitles = [], {}
for media_set in self._MEDIA_SETS:
try:
return self._download_media_selector_url(
fmts, subs = self._download_media_selector_url(
self._MEDIA_SELECTOR_URL_TEMPL % (media_set, programme_id), programme_id)
formats.extend(fmts)
if subs:
self._merge_subtitles(subs, target=subtitles)
except BBCCoUkIE.MediaSelectionError as e:
if e.id in ('notukerror', 'geolocation', 'selectionunavailable'):
last_exception = e
continue
self._raise_extractor_error(e)
self._raise_extractor_error(last_exception)
if last_exception:
if formats or subtitles:
self.report_warning(f'{self.IE_NAME} returned error: {last_exception.id}')
else:
self._raise_extractor_error(last_exception)
return formats, subtitles
def _download_media_selector_url(self, url, programme_id=None):
media_selection = self._download_json(
@@ -376,7 +372,7 @@ class BBCCoUkIE(InfoExtractor):
for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
formats.append({
'url': ref,
'format_id': 'ref%s_%s' % (i, format_id),
'format_id': f'ref{i}_{format_id}',
})
elif transfer_format == 'dash':
formats.extend(self._extract_mpd_formats(
@@ -398,7 +394,7 @@ class BBCCoUkIE(InfoExtractor):
href, programme_id, f4m_id=format_id, fatal=False))
else:
if not supplier and bitrate:
format_id += '-%d' % bitrate
format_id += f'-{bitrate}'
fmt = {
'format_id': format_id,
'filesize': file_size,
@@ -427,9 +423,9 @@ class BBCCoUkIE(InfoExtractor):
identifier = connection.get('identifier')
server = connection.get('server')
fmt.update({
'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
'url': f'{protocol}://{server}/{application}?{auth_string}',
'play_path': identifier,
'app': '%s?%s' % (application, auth_string),
'app': f'{application}?{auth_string}',
'page_url': 'http://www.bbc.co.uk',
'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
'rtmp_live': False,
@@ -445,7 +441,7 @@ class BBCCoUkIE(InfoExtractor):
def _download_playlist(self, playlist_id):
try:
playlist = self._download_json(
'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id,
f'http://www.bbc.co.uk/programmes/{playlist_id}/playlist.json',
playlist_id, 'Downloading playlist JSON')
formats = []
subtitles = {}
@@ -484,32 +480,32 @@ class BBCCoUkIE(InfoExtractor):
def _process_legacy_playlist(self, playlist_id):
return self._process_legacy_playlist_url(
'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, playlist_id)
f'http://www.bbc.co.uk/iplayer/playlist/{playlist_id}', playlist_id)
def _download_legacy_playlist_url(self, url, playlist_id=None):
return self._download_xml(
url, playlist_id, 'Downloading legacy playlist XML')
def _extract_from_legacy_playlist(self, playlist, playlist_id):
no_items = playlist.find('./{%s}noItems' % self._EMP_PLAYLIST_NS)
no_items = playlist.find(f'./{{{self._EMP_PLAYLIST_NS}}}noItems')
if no_items is not None:
reason = no_items.get('reason')
if reason == 'preAvailability':
msg = 'Episode %s is not yet available' % playlist_id
msg = f'Episode {playlist_id} is not yet available'
elif reason == 'postAvailability':
msg = 'Episode %s is no longer available' % playlist_id
msg = f'Episode {playlist_id} is no longer available'
elif reason == 'noMedia':
msg = 'Episode %s is not currently available' % playlist_id
msg = f'Episode {playlist_id} is not currently available'
else:
msg = 'Episode %s is not available: %s' % (playlist_id, reason)
msg = f'Episode {playlist_id} is not available: {reason}'
raise ExtractorError(msg, expected=True)
for item in self._extract_items(playlist):
kind = item.get('kind')
if kind not in ('programme', 'radioProgramme'):
continue
title = playlist.find('./{%s}title' % self._EMP_PLAYLIST_NS).text
description_el = playlist.find('./{%s}summary' % self._EMP_PLAYLIST_NS)
title = playlist.find(f'./{{{self._EMP_PLAYLIST_NS}}}title').text
description_el = playlist.find(f'./{{{self._EMP_PLAYLIST_NS}}}summary')
description = description_el.text if description_el is not None else None
def get_programme_id(item):
@@ -519,7 +515,7 @@ class BBCCoUkIE(InfoExtractor):
if value and re.match(r'^[pb][\da-z]{7}$', value):
return value
get_from_attributes(item)
mediator = item.find('./{%s}mediator' % self._EMP_PLAYLIST_NS)
mediator = item.find(f'./{{{self._EMP_PLAYLIST_NS}}}mediator')
if mediator is not None:
return get_from_attributes(mediator)
@@ -559,7 +555,7 @@ class BBCCoUkIE(InfoExtractor):
if not programme_id:
programme_id = self._search_regex(
r'"vpid"\s*:\s*"(%s)"' % self._ID_REGEX, webpage, 'vpid', fatal=False, default=None)
rf'"vpid"\s*:\s*"({self._ID_REGEX})"', webpage, 'vpid', fatal=False, default=None)
if programme_id:
formats, subtitles = self._download_media_selector(programme_id)
@@ -606,7 +602,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'url': 'http://www.bbc.com/news/world-europe-32668511',
'info_dict': {
'id': 'world-europe-32668511',
'title': 'Russia stages massive WW2 parade',
'title': 'Russia stages massive WW2 parade despite Western boycott',
'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
},
'playlist_count': 2,
@@ -627,6 +623,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'info_dict': {
'id': '3662a707-0af9-3149-963f-47bea720b460',
'title': 'BUGGER',
'description': r're:BUGGER The recent revelations by the whistleblower Edward Snowden were fascinating. .{211}\.{3}$',
},
'playlist_count': 18,
}, {
@@ -635,16 +632,16 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'info_dict': {
'id': 'p02mprgb',
'ext': 'mp4',
'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
'description': 'md5:2868290467291b37feda7863f7a83f54',
'title': 'Germanwings crash site aerial video',
'description': r're:(?s)Aerial video showed the site where the Germanwings flight 4U 9525, .{156} BFM TV\.$',
'duration': 47,
'timestamp': 1427219242,
'upload_date': '20150324',
'thumbnail': 'https://ichef.bbci.co.uk/news/1024/media/images/81879000/jpg/_81879090_81879089.jpg',
},
'params': {
# rtmp download
'skip_download': True,
}
},
}, {
# article with single video embedded with data-playable containing XML playlist
# with direct video links as progressiveDownloadUrl (for now these are extracted)
@@ -660,21 +657,24 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
},
'params': {
'skip_download': True,
}
},
'skip': 'now SIMORGH_DATA with no video',
}, {
# single video embedded with data-playable containing XML playlists (regional section)
'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
'info_dict': {
'id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
'id': '39275083',
'display_id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
'ext': 'mp4',
'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8',
'description': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
'timestamp': 1434713142,
'upload_date': '20150619',
'thumbnail': 'https://a.files.bbci.co.uk/worldservice/live/assets/images/2015/06/19/150619132146_honduras_hsopitales_militares_640x360_aptn_nocredit.jpg',
},
'params': {
'skip_download': True,
}
},
}, {
# single video from video playlist embedded with vxp-playlist-data JSON
'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
@@ -687,22 +687,21 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
},
'params': {
'skip_download': True,
}
},
'skip': '404 Not Found',
}, {
# single video story with digitalData
# single video story with __PWA_PRELOADED_STATE__
'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
'info_dict': {
'id': 'p02q6gc4',
'ext': 'flv',
'title': 'Sri Lankas spicy secret',
'description': 'As a new train line to Jaffna opens up the countrys north, travellers can experience a truly distinct slice of Tamil culture.',
'timestamp': 1437674293,
'upload_date': '20150723',
'ext': 'mp4',
'title': 'Tasting the spice of life in Jaffna',
'description': r're:(?s)BBC Travel Shows Henry Golding explores the city of Jaffna .{151} aftertaste\.$',
'timestamp': 1646058397,
'upload_date': '20220228',
'duration': 255,
'thumbnail': 'https://ichef.bbci.co.uk/images/ic/1920xn/p02vxvkn.jpg',
},
'params': {
# rtmp download
'skip_download': True,
}
}, {
# single video story without digitalData
'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
@@ -714,12 +713,10 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'timestamp': 1415867444,
'upload_date': '20141113',
},
'params': {
# rtmp download
'skip_download': True,
}
'skip': 'redirects to TopGear home page',
}, {
# single video embedded with Morph
# TODO: replacement test page
'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975',
'info_dict': {
'id': 'p041vhd0',
@@ -730,27 +727,22 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'uploader': 'BBC Sport',
'uploader_id': 'bbc_sport',
},
'params': {
# m3u8 download
'skip_download': True,
},
'skip': 'Georestricted to UK',
'skip': 'Video no longer in page',
}, {
# single video with playlist.sxml URL in playlist param
# single video in __INITIAL_DATA__
'url': 'http://www.bbc.com/sport/0/football/33653409',
'info_dict': {
'id': 'p02xycnp',
'ext': 'mp4',
'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.',
'title': 'Ronaldo to Man Utd, Arsenal to spend?',
'description': r're:(?s)BBC Sport\'s David Ornstein rounds up the latest transfer reports, .{359} here\.$',
'timestamp': 1437750175,
'upload_date': '20150724',
'thumbnail': r're:https?://.+/.+media/images/69320000/png/_69320754_mmgossipcolumnextraaugust18.png',
'duration': 140,
},
'params': {
# rtmp download
'skip_download': True,
}
}, {
# article with multiple videos embedded with playlist.sxml in playlist param
# article with multiple videos embedded with Morph.setPayload
'url': 'http://www.bbc.com/sport/0/football/34475836',
'info_dict': {
'id': '34475836',
@@ -758,6 +750,21 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.',
},
'playlist_count': 3,
}, {
# Testing noplaylist
'url': 'http://www.bbc.com/sport/0/football/34475836',
'info_dict': {
'id': 'p034ppnv',
'ext': 'mp4',
'title': 'All you need to know about Jurgen Klopp',
'timestamp': 1444335081,
'upload_date': '20151008',
'duration': 122.0,
'thumbnail': 'https://ichef.bbci.co.uk/onesport/cps/976/cpsprodpb/7542/production/_85981003_klopp.jpg',
},
'params': {
'noplaylist': True,
},
}, {
# school report article with single video
'url': 'http://www.bbc.co.uk/schoolreport/35744779',
@@ -766,6 +773,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'title': 'School which breaks down barriers in Jerusalem',
},
'playlist_count': 1,
'skip': 'redirects to Young Reporter home page https://www.bbc.co.uk/news/topics/cg41ylwv43pt',
}, {
# single video with playlist URL from weather section
'url': 'http://www.bbc.com/weather/features/33601775',
@@ -782,18 +790,33 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'thumbnail': r're:https?://.+/.+\.jpg',
'timestamp': 1437785037,
'upload_date': '20150725',
'duration': 105,
},
}, {
# video with window.__INITIAL_DATA__ and value as JSON string
'url': 'https://www.bbc.com/news/av/world-europe-59468682',
'info_dict': {
'id': 'p0b71qth',
'id': 'p0b779gc',
'ext': 'mp4',
'title': 'Why France is making this woman a national hero',
'description': 'md5:7affdfab80e9c3a1f976230a1ff4d5e4',
'description': r're:(?s)France is honouring the US-born 20th Century singer and activist Josephine .{208} Second World War.',
'thumbnail': r're:https?://.+/.+\.jpg',
'timestamp': 1638230731,
'upload_date': '20211130',
'timestamp': 1638215626,
'upload_date': '20211129',
'duration': 125,
},
}, {
# video with script id __NEXT_DATA__ and value as JSON string
'url': 'https://www.bbc.com/news/uk-68546268',
'info_dict': {
'id': 'p0hj0lq7',
'ext': 'mp4',
'title': 'Nasser Hospital doctor describes his treatment by IDF',
'description': r're:(?s)Doctor Abu Sabha said he was detained by Israeli forces after .{276} hostages\."$',
'thumbnail': r're:https?://.+/.+\.jpg',
'timestamp': 1710188248,
'upload_date': '20240311',
'duration': 104,
},
}, {
# single video article embedded with data-media-vpid
@@ -821,6 +844,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'uploader': 'Radio 3',
'uploader_id': 'bbc_radio_three',
},
'skip': '404 Not Found',
}, {
'url': 'http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227',
'info_dict': {
@@ -828,6 +852,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'ext': 'mp4',
'title': 'md5:2fabf12a726603193a2879a055f72514',
'description': 'Learn English words and phrases from this story',
'thumbnail': 'https://ichef.bbci.co.uk/images/ic/1200x675/p06pq9gk.jpg',
},
'add_ie': [BBCCoUkIE.ie_key()],
}, {
@@ -836,13 +861,29 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'info_dict': {
'id': 'p07c6sb9',
'ext': 'mp4',
'title': 'How positive thinking is harming your happiness',
'alt_title': 'The downsides of positive thinking',
'description': 'md5:fad74b31da60d83b8265954ee42d85b4',
'title': 'The downsides of positive thinking',
'description': 'The downsides of positive thinking',
'duration': 235,
'thumbnail': r're:https?://.+/p07c9dsr.jpg',
'upload_date': '20190604',
'categories': ['Psychology'],
'thumbnail': r're:https?://.+/p07c9dsr\.(?:jpg|webp|png)',
'upload_date': '20220223',
'timestamp': 1645632746,
},
}, {
# BBC Sounds
'url': 'https://www.bbc.co.uk/sounds/play/w3ct5rgx',
'info_dict': {
'id': 'p0hrw4nr',
'ext': 'mp4',
'title': 'Are our coastlines being washed away?',
'description': r're:(?s)Around the world, coastlines are constantly changing .{2000,} Images\)$',
'timestamp': 1713556800,
'upload_date': '20240419',
'duration': 1588,
'thumbnail': 'https://ichef.bbci.co.uk/images/ic/raw/p0hrnxbl.jpg',
'uploader': 'World Service',
'uploader_id': 'bbc_world_service',
'series': 'CrowdScience',
'chapters': [],
},
}, { # onion routes
'url': 'https://www.bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd.onion/news/av/world-europe-63208576',
@@ -856,7 +897,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
def suitable(cls, url):
EXCLUDE_IE = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerEpisodesIE, BBCCoUkIPlayerGroupIE, BBCCoUkPlaylistIE)
return (False if any(ie.suitable(url) for ie in EXCLUDE_IE)
else super(BBCIE, cls).suitable(url))
else super().suitable(url))
def _extract_from_media_meta(self, media_meta, video_id):
# Direct links to media in media metadata (e.g.
@@ -968,7 +1009,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
if playlist:
entry = None
for key in ('streaming', 'progressiveDownload'):
playlist_url = playlist.get('%sUrl' % key)
playlist_url = playlist.get(f'{key}Url')
if not playlist_url:
continue
try:
@@ -994,18 +1035,17 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
# http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227
group_id = self._search_regex(
r'<div[^>]+\bclass=["\']video["\'][^>]+\bdata-pid=["\'](%s)' % self._ID_REGEX,
rf'<div[^>]+\bclass=["\']video["\'][^>]+\bdata-pid=["\']({self._ID_REGEX})',
webpage, 'group id', default=None)
if group_id:
return self.url_result(
'https://www.bbc.co.uk/programmes/%s' % group_id,
ie=BBCCoUkIE.ie_key())
f'https://www.bbc.co.uk/programmes/{group_id}', BBCCoUkIE)
# single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
programme_id = self._search_regex(
[r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX,
r'<param[^>]+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX,
r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX],
[rf'data-(?:video-player|media)-vpid="({self._ID_REGEX})"',
rf'<param[^>]+name="externalIdentifier"[^>]+value="({self._ID_REGEX})"',
rf'videoId\s*:\s*["\']({self._ID_REGEX})["\']'],
webpage, 'vpid', default=None)
if programme_id:
@@ -1059,76 +1099,133 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
}
# Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
# There are several setPayload calls may be present but the video
# seems to be always related to the first one
morph_payload = self._parse_json(
self._search_regex(
r'Morph\.setPayload\([^,]+,\s*({.+?})\);',
webpage, 'morph payload', default='{}'),
playlist_id, fatal=False)
# Several setPayload calls may be present but the video(s)
# should be in one that mentions leadMedia or videoData
morph_payload = self._search_json(
r'\bMorph\s*\.\s*setPayload\s*\([^,]+,', webpage, 'morph payload', playlist_id,
contains_pattern=r'{(?s:(?:(?!</script>).)+(?:"leadMedia"|\\"videoData\\")\s*:.+)}',
default={})
if morph_payload:
components = try_get(morph_payload, lambda x: x['body']['components'], list) or []
for component in components:
if not isinstance(component, dict):
continue
lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict)
if not lead_media:
continue
identifiers = lead_media.get('identifiers')
if not identifiers or not isinstance(identifiers, dict):
continue
programme_id = identifiers.get('vpid') or identifiers.get('playablePid')
for lead_media in traverse_obj(morph_payload, (
'body', 'components', ..., 'props', 'leadMedia', {dict})):
programme_id = traverse_obj(lead_media, ('identifiers', ('vpid', 'playablePid'), {str}, any))
if not programme_id:
continue
title = lead_media.get('title') or self._og_search_title(webpage)
formats, subtitles = self._download_media_selector(programme_id)
description = lead_media.get('summary')
uploader = lead_media.get('masterBrand')
uploader_id = lead_media.get('mid')
duration = None
duration_d = lead_media.get('duration')
if isinstance(duration_d, dict):
duration = parse_duration(dict_get(
duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration')))
return {
'id': programme_id,
'title': title,
'description': description,
'duration': duration,
'uploader': uploader,
'uploader_id': uploader_id,
'title': lead_media.get('title') or self._og_search_title(webpage),
**traverse_obj(lead_media, {
'description': ('summary', {str}),
'duration': ('duration', ('rawDuration', 'formattedDuration', 'spokenDuration'), {parse_duration}),
'uploader': ('masterBrand', {str}),
'uploader_id': ('mid', {str}),
}),
'formats': formats,
'subtitles': subtitles,
}
body = self._parse_json(traverse_obj(morph_payload, (
'body', 'content', 'article', 'body')), playlist_id, fatal=False)
for video_data in traverse_obj(body, (lambda _, v: v['videoData']['pid'], 'videoData')):
if video_data.get('vpid'):
video_id = video_data['vpid']
formats, subtitles = self._download_media_selector(video_id)
entry = {
'id': video_id,
'formats': formats,
'subtitles': subtitles,
}
else:
video_id = video_data['pid']
entry = self.url_result(
f'https://www.bbc.co.uk/programmes/{video_id}', BBCCoUkIE,
video_id, url_transparent=True)
entry.update({
'timestamp': traverse_obj(morph_payload, (
'body', 'content', 'article', 'dateTimeInfo', 'dateTime', {parse_iso8601}),
),
**traverse_obj(video_data, {
'thumbnail': (('iChefImage', 'image'), {url_or_none}, any),
'title': (('title', 'caption'), {str}, any),
'duration': ('duration', {parse_duration}),
}),
})
if video_data.get('isLead') and not self._yes_playlist(playlist_id, video_id):
return entry
entries.append(entry)
if entries:
playlist_title = traverse_obj(morph_payload, (
'body', 'content', 'article', 'headline', {str})) or playlist_title
return self.playlist_result(
entries, playlist_id, playlist_title, playlist_description)
preload_state = self._parse_json(self._search_regex(
r'window\.__PRELOADED_STATE__\s*=\s*({.+?});', webpage,
'preload state', default='{}'), playlist_id, fatal=False)
if preload_state:
current_programme = preload_state.get('programmes', {}).get('current') or {}
programme_id = current_programme.get('id')
if current_programme and programme_id and current_programme.get('type') == 'playable_item':
title = current_programme.get('titles', {}).get('tertiary') or playlist_title
formats, subtitles = self._download_media_selector(programme_id)
synopses = current_programme.get('synopses') or {}
network = current_programme.get('network') or {}
duration = int_or_none(
current_programme.get('duration', {}).get('value'))
thumbnail = None
image_url = current_programme.get('image_url')
if image_url:
thumbnail = image_url.replace('{recipe}', 'raw')
# various PRELOADED_STATE JSON
preload_state = self._search_json(
r'window\.__(?:PWA_)?PRELOADED_STATE__\s*=', webpage,
'preload state', playlist_id, transform_source=js_to_json, default={})
# PRELOADED_STATE with current programmme
current_programme = traverse_obj(preload_state, ('programmes', 'current', {dict}))
programme_id = traverse_obj(current_programme, ('id', {str}))
if programme_id and current_programme.get('type') == 'playable_item':
title = traverse_obj(current_programme, ('titles', ('tertiary', 'secondary'), {str}, any)) or playlist_title
formats, subtitles = self._download_media_selector(programme_id)
return {
'id': programme_id,
'title': title,
'formats': formats,
**traverse_obj(current_programme, {
'description': ('synopses', ('long', 'medium', 'short'), {str}, any),
'thumbnail': ('image_url', {lambda u: url_or_none(u.replace('{recipe}', 'raw'))}),
'duration': ('duration', 'value', {int_or_none}),
'uploader': ('network', 'short_title', {str}),
'uploader_id': ('network', 'id', {str}),
'timestamp': ((('availability', 'from'), ('release', 'date')), {parse_iso8601}, any),
'series': ('titles', 'primary', {str}),
}),
'subtitles': subtitles,
'chapters': traverse_obj(preload_state, (
'tracklist', 'tracks', lambda _, v: float(v['offset']['start']), {
'title': ('titles', {lambda x: join_nonempty(
'primary', 'secondary', 'tertiary', delim=' - ', from_dict=x)}),
'start_time': ('offset', 'start', {float_or_none}),
'end_time': ('offset', 'end', {float_or_none}),
}),
),
}
# PWA_PRELOADED_STATE with article video asset
asset_id = traverse_obj(preload_state, (
'entities', 'articles', lambda k, _: k.rsplit('/', 1)[-1] == playlist_id,
'assetVideo', 0, {str}, any))
if asset_id:
video_id = traverse_obj(preload_state, ('entities', 'videos', asset_id, 'vpid', {str}))
if video_id:
article = traverse_obj(preload_state, (
'entities', 'articles', lambda _, v: v['assetVideo'][0] == asset_id, any))
def image_url(image_id):
return traverse_obj(preload_state, (
'entities', 'images', image_id, 'url',
{lambda u: url_or_none(u.replace('$recipe', 'raw'))}))
formats, subtitles = self._download_media_selector(video_id)
return {
'id': programme_id,
'title': title,
'description': dict_get(synopses, ('long', 'medium', 'short')),
'thumbnail': thumbnail,
'duration': duration,
'uploader': network.get('short_title'),
'uploader_id': network.get('id'),
'id': video_id,
**traverse_obj(preload_state, ('entities', 'videos', asset_id, {
'title': ('title', {str}),
'description': (('synopsisLong', 'synopsisMedium', 'synopsisShort'), {str}, any),
'thumbnail': (0, {image_url}),
'duration': ('duration', {int_or_none}),
})),
'formats': formats,
'subtitles': subtitles,
'timestamp': traverse_obj(article, ('displayDate', {parse_iso8601})),
}
else:
return self.url_result(
f'https://www.bbc.co.uk/programmes/{asset_id}', BBCCoUkIE,
asset_id, playlist_title, display_id=playlist_id,
description=playlist_description)
bbc3_config = self._parse_json(
self._search_regex(
@@ -1174,17 +1271,52 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
return self.playlist_result(
entries, playlist_id, playlist_title, playlist_description)
def parse_model(model):
"""Extract single video from model structure"""
item_id = traverse_obj(model, ('versions', 0, 'versionId', {str}))
if not item_id:
return
formats, subtitles = self._download_media_selector(item_id)
return {
'id': item_id,
'formats': formats,
'subtitles': subtitles,
**traverse_obj(model, {
'title': ('title', {str}),
'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}),
'description': ('synopses', ('long', 'medium', 'short'), {str}, filter, any),
'duration': ('versions', 0, 'duration', {int}),
'timestamp': ('versions', 0, 'availableFrom', {int_or_none(scale=1000)}),
}),
}
def is_type(*types):
return lambda _, v: v['type'] in types
initial_data = self._search_regex(
r'window\.__INITIAL_DATA__\s*=\s*("{.+?}")\s*;', webpage,
'quoted preload state', default=None)
if initial_data is None:
initial_data = self._search_regex(
r'window\.__INITIAL_DATA__\s*=\s*({.+?})\s*;', webpage,
'preload state', default={})
'preload state', default='{}')
else:
initial_data = self._parse_json(initial_data or '"{}"', playlist_id, fatal=False)
initial_data = self._parse_json(initial_data, playlist_id, fatal=False)
if initial_data:
for video_data in traverse_obj(initial_data, (
'stores', 'article', 'articleBodyContent', is_type('video'))):
model = traverse_obj(video_data, (
'model', 'blocks', is_type('aresMedia'),
'model', 'blocks', is_type('aresMediaMetadata'),
'model', {dict}, any))
entry = parse_model(model)
if entry:
entries.append(entry)
if entries:
return self.playlist_result(
entries, playlist_id, playlist_title, playlist_description)
def parse_media(media):
if not media:
return
@@ -1199,7 +1331,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
if blocks:
summary = []
for block in blocks:
text = try_get(block, lambda x: x['model']['text'], compat_str)
text = try_get(block, lambda x: x['model']['text'], str)
if text:
summary.append(text)
if summary:
@@ -1217,37 +1349,100 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'subtitles': subtitles,
'timestamp': item_time,
'description': strip_or_none(item_desc),
'duration': int_or_none(item.get('duration')),
})
for resp in (initial_data.get('data') or {}).values():
name = resp.get('name')
for resp in traverse_obj(initial_data, ('data', lambda _, v: v['name'])):
name = resp['name']
if name == 'media-experience':
parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict))
elif name == 'article':
for block in (try_get(resp,
(lambda x: x['data']['blocks'],
lambda x: x['data']['content']['model']['blocks'],),
list) or []):
if block.get('type') not in ['media', 'video']:
continue
parse_media(block.get('model'))
for block in traverse_obj(resp, (
'data', (None, ('content', 'model')), 'blocks',
is_type('media', 'video'), 'model', {dict})):
parse_media(block)
return self.playlist_result(
entries, playlist_id, playlist_title, playlist_description)
# extract from SIMORGH_DATA hydration JSON
simorgh_data = self._search_json(
r'window\s*\.\s*SIMORGH_DATA\s*=', webpage,
'simorgh data', playlist_id, default={})
if simorgh_data:
done = False
for video_data in traverse_obj(simorgh_data, (
'pageData', 'content', 'model', 'blocks', is_type('video', 'legacyMedia'))):
model = traverse_obj(video_data, (
'model', 'blocks', is_type('aresMedia'),
'model', 'blocks', is_type('aresMediaMetadata'),
'model', {dict}, any))
if video_data['type'] == 'video':
entry = parse_model(model)
else: # legacyMedia: no duration, subtitles
block_id, entry = traverse_obj(model, ('blockId', {str})), None
media_data = traverse_obj(simorgh_data, (
'pageData', 'promo', 'media',
{lambda x: x if x['id'] == block_id else None}))
formats = traverse_obj(media_data, ('playlist', lambda _, v: url_or_none(v['url']), {
'url': ('url', {url_or_none}),
'ext': ('format', {str}),
'tbr': ('bitrate', {int_or_none(scale=1000)}),
}))
if formats:
entry = {
'id': block_id,
'display_id': playlist_id,
'formats': formats,
'description': traverse_obj(simorgh_data, ('pageData', 'promo', 'summary', {str})),
**traverse_obj(model, {
'title': ('title', {str}),
'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}),
'description': ('synopses', ('long', 'medium', 'short'), {str}, any),
'timestamp': ('firstPublished', {int_or_none(scale=1000)}),
}),
}
done = True
if entry:
entries.append(entry)
if done:
break
if entries:
return self.playlist_result(
entries, playlist_id, playlist_title, playlist_description)
def extract_all(pattern):
return list(filter(None, map(
lambda s: self._parse_json(s, playlist_id, fatal=False),
re.findall(pattern, webpage))))
return list(filter(None, (
self._parse_json(s, playlist_id, fatal=False)
for s in re.findall(pattern, webpage))))
# US accessed article with single embedded video (e.g.
# https://www.bbc.com/news/uk-68546268)
next_data = traverse_obj(self._search_nextjs_data(webpage, playlist_id, default={}),
('props', 'pageProps', 'page'))
model = traverse_obj(next_data, (
..., 'contents', is_type('video'),
'model', 'blocks', is_type('media'),
'model', 'blocks', is_type('mediaMetadata'),
'model', {dict}, any))
if model and (entry := parse_model(model)):
if not entry.get('timestamp'):
entry['timestamp'] = traverse_obj(next_data, (
..., 'contents', is_type('timestamp'), 'model',
'timestamp', {int_or_none(scale=1000)}, any))
entries.append(entry)
return self.playlist_result(
entries, playlist_id, playlist_title, playlist_description)
# Multiple video article (e.g.
# http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX
EMBED_URL = rf'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+{self._ID_REGEX}(?:\b[^"]+)?'
entries = []
for match in extract_all(r'new\s+SMP\(({.+?})\)'):
embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')
if embed_url and re.match(EMBED_URL, embed_url):
entries.append(embed_url)
entries.extend(re.findall(
r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage))
rf'setPlaylist\("({EMBED_URL})"\)', webpage))
if entries:
return self.playlist_result(
[self.url_result(entry_, 'BBCCoUk') for entry_ in entries],
@@ -1297,11 +1492,11 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
video_id = media_meta.get('externalId')
if not video_id:
video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num)
video_id = playlist_id if len(medias) == 1 else f'{playlist_id}-{num}'
title = media_meta.get('caption')
if not title:
title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num)
title = playlist_title if len(medias) == 1 else f'{playlist_title} - Video {num}'
duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration'))
@@ -1362,8 +1557,8 @@ class BBCCoUkArticleIE(InfoExtractor):
class BBCCoUkPlaylistBaseIE(InfoExtractor):
def _entries(self, webpage, url, playlist_id):
single_page = 'page' in compat_urlparse.parse_qs(
compat_urlparse.urlparse(url).query)
single_page = 'page' in urllib.parse.parse_qs(
urllib.parse.urlparse(url).query)
for page_num in itertools.count(2):
for video_id in re.findall(
self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX, webpage):
@@ -1377,8 +1572,8 @@ class BBCCoUkPlaylistBaseIE(InfoExtractor):
if not next_page:
break
webpage = self._download_webpage(
compat_urlparse.urljoin(url, next_page), playlist_id,
'Downloading page %d' % page_num, page_num)
urllib.parse.urljoin(url, next_page), playlist_id,
f'Downloading page {page_num}', page_num)
def _real_extract(self, url):
playlist_id = self._match_id(url)
@@ -1393,7 +1588,7 @@ class BBCCoUkPlaylistBaseIE(InfoExtractor):
class BBCCoUkIPlayerPlaylistBaseIE(InfoExtractor):
_VALID_URL_TMPL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/%%s/(?P<id>%s)' % BBCCoUkIE._ID_REGEX
_VALID_URL_TMPL = rf'https?://(?:www\.)?bbc\.co\.uk/iplayer/%s/(?P<id>{BBCCoUkIE._ID_REGEX})'
@staticmethod
def _get_default(episode, key, default_key='default'):
@@ -1517,11 +1712,11 @@ class BBCCoUkIPlayerEpisodesIE(BBCCoUkIPlayerPlaylistBaseIE):
variables['sliceId'] = series_id
return self._download_json(
'https://graph.ibl.api.bbc.co.uk/', pid, headers={
'Content-Type': 'application/json'
'Content-Type': 'application/json',
}, data=json.dumps({
'id': '5692d93d5aac8d796a0305e895e61551',
'variables': variables,
}).encode('utf-8'))['data']['programme']
}).encode())['data']['programme']
@staticmethod
def _get_playlist_data(data):
@@ -1581,7 +1776,7 @@ class BBCCoUkIPlayerGroupIE(BBCCoUkIPlayerPlaylistBaseIE):
def _call_api(self, pid, per_page, page=1, series_id=None):
return self._download_json(
'http://ibl.api.bbc.co.uk/ibl/v1/groups/%s/episodes' % pid,
f'http://ibl.api.bbc.co.uk/ibl/v1/groups/{pid}/episodes',
pid, query={
'page': page,
'per_page': per_page,
@@ -1597,7 +1792,7 @@ class BBCCoUkIPlayerGroupIE(BBCCoUkIPlayerPlaylistBaseIE):
class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE):
IE_NAME = 'bbc.co.uk:playlist'
_VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P<id>%s)/(?:episodes|broadcasts|clips)' % BBCCoUkIE._ID_REGEX
_VALID_URL = rf'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P<id>{BBCCoUkIE._ID_REGEX})/(?:episodes|broadcasts|clips)'
_URL_TEMPLATE = 'http://www.bbc.co.uk/programmes/%s'
_VIDEO_ID_TEMPLATE = r'data-pid=["\'](%s)'
_TESTS = [{