Upgrade yt_dlp and download script

This commit is contained in:
2025-05-02 16:11:08 -05:00
parent 3a2e8eeb08
commit d68d9ce4f9
1194 changed files with 60099 additions and 44436 deletions

View File

@@ -1,12 +1,12 @@
import re
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
US_RATINGS,
ExtractorError,
determine_ext,
int_or_none,
float_or_none,
int_or_none,
js_to_json,
orderedSet,
strip_jsonp,
@@ -14,7 +14,6 @@ from ..utils import (
traverse_obj,
unified_strdate,
url_or_none,
US_RATINGS,
)
@@ -48,7 +47,7 @@ class PBSIE(InfoExtractor):
(r'video\.kpbs\.org', 'KPBS San Diego (KPBS)'), # http://www.kpbs.org/
(r'video\.kqed\.org', 'KQED (KQED)'), # http://www.kqed.org
(r'vids\.kvie\.org', 'KVIE Public Television (KVIE)'), # http://www.kvie.org
(r'video\.pbssocal\.org', 'PBS SoCal/KOCE (KOCE)'), # http://www.pbssocal.org/
(r'(?:video\.|www\.)pbssocal\.org', 'PBS SoCal/KOCE (KOCE)'), # http://www.pbssocal.org/
(r'video\.valleypbs\.org', 'ValleyPBS (KVPT)'), # http://www.valleypbs.org/
(r'video\.cptv\.org', 'CONNECTICUT PUBLIC TELEVISION (WEDH)'), # http://cptv.org
(r'watch\.knpb\.org', 'KNPB Channel 5 (KNPB)'), # http://www.knpb.org/
@@ -62,7 +61,7 @@ class PBSIE(InfoExtractor):
(r'video\.wyomingpbs\.org', 'Wyoming PBS (KCWC)'), # http://www.wyomingpbs.org
(r'video\.cpt12\.org', 'Colorado Public Television / KBDI 12 (KBDI)'), # http://www.cpt12.org/
(r'video\.kbyueleven\.org', 'KBYU-TV (KBYU)'), # http://www.kbyutv.org/
(r'video\.thirteen\.org', 'Thirteen/WNET New York (WNET)'), # http://www.thirteen.org
(r'(?:video\.|www\.)thirteen\.org', 'Thirteen/WNET New York (WNET)'), # http://www.thirteen.org
(r'video\.wgbh\.org', 'WGBH/Channel 2 (WGBH)'), # http://wgbh.org
(r'video\.wgby\.org', 'WGBY (WGBY)'), # http://www.wgby.org
(r'watch\.njtvonline\.org', 'NJTV Public Media NJ (WNJT)'), # http://www.njtvonline.org/
@@ -182,18 +181,19 @@ class PBSIE(InfoExtractor):
)
IE_NAME = 'pbs'
IE_DESC = 'Public Broadcasting Service (PBS) and member stations: %s' % ', '.join(list(zip(*_STATIONS))[1])
IE_DESC = 'Public Broadcasting Service (PBS) and member stations: {}'.format(', '.join(list(zip(*_STATIONS))[1]))
_VALID_URL = r'''(?x)https?://
(?:
# Direct video URL
(?:%s)/(?:(?:vir|port)alplayer|video)/(?P<id>[0-9]+)(?:[?/]|$) |
# Article with embedded player (or direct video)
(?:www\.)?pbs\.org/(?:[^/]+/){1,5}(?P<presumptive_id>[^/]+?)(?:\.html)?/?(?:$|[?\#]) |
# Player
(?:video|player)\.pbs\.org/(?:widget/)?partnerplayer/(?P<player_id>[^/]+)
# Player
(?:video|player)\.pbs\.org/(?:widget/)?partnerplayer/(?P<player_id>[^/?#]+) |
# Direct video URL, or article with embedded player
(?:{})/(?:
(?:(?:vir|port)alplayer|video)/(?P<id>[0-9]+)(?:[?/#]|$) |
(?:[^/?#]+/){{1,5}}(?P<presumptive_id>[^/?#]+?)(?:\.html)?/?(?:$|[?#])
)
)
''' % '|'.join(list(zip(*_STATIONS))[0])
'''.format('|'.join(next(zip(*_STATIONS))))
_GEO_COUNTRIES = ['US']
@@ -208,16 +208,40 @@ class PBSIE(InfoExtractor):
'description': 'md5:31b664af3c65fd07fa460d306b837d00',
'duration': 3190,
},
'skip': 'dead URL',
},
{
'url': 'https://www.thirteen.org/programs/the-woodwrights-shop/carving-away-with-mary-may-tioglz/',
'info_dict': {
'id': '3004803331',
'ext': 'mp4',
'title': "The Woodwright's Shop - Carving Away with Mary May",
'description': 'md5:7cbaaaa8b9bcc78bd8f0e31911644e28',
'duration': 1606,
'display_id': 'carving-away-with-mary-may-tioglz',
'chapters': [],
'thumbnail': 'https://image.pbs.org/video-assets/NcnTxNl-asset-mezzanine-16x9-K0Keoyv.jpg',
},
},
{
'url': 'http://www.pbs.org/wgbh/pages/frontline/losing-iraq/',
'md5': '6f722cb3c3982186d34b0f13374499c7',
'md5': '372b12b670070de39438b946474df92f',
'info_dict': {
'id': '2365297690',
'ext': 'mp4',
'title': 'FRONTLINE - Losing Iraq',
'description': 'md5:5979a4d069b157f622d02bff62fbe654',
'duration': 5050,
'chapters': [
{'start_time': 0.0, 'end_time': 1234.0, 'title': 'After Saddam, Chaos'},
{'start_time': 1233.0, 'end_time': 1719.0, 'title': 'The Insurgency Takes Root'},
{'start_time': 1718.0, 'end_time': 2461.0, 'title': 'A Light Footprint'},
{'start_time': 2460.0, 'end_time': 3589.0, 'title': 'The Surge '},
{'start_time': 3588.0, 'end_time': 4355.0, 'title': 'The Withdrawal '},
{'start_time': 4354.0, 'end_time': 5051.0, 'title': 'ISIS on the March '},
],
'display_id': 'losing-iraq',
'thumbnail': 'https://image.pbs.org/video-assets/pbs/frontline/138098/images/mezzanine_401.jpg',
},
},
{
@@ -404,6 +428,19 @@ class PBSIE(InfoExtractor):
},
'expected_warnings': ['HTTP Error 403: Forbidden'],
},
{
'url': 'https://www.pbssocal.org/shows/newshour/clip/capehart-johnson-1715984001',
'info_dict': {
'id': '3091549094',
'ext': 'mp4',
'title': 'PBS NewsHour - Capehart and Johnson on the unusual Biden-Trump debate plans',
'description': 'Capehart and Johnson on how the Biden-Trump debates could shape the campaign season',
'display_id': 'capehart-johnson-1715984001',
'duration': 593,
'thumbnail': 'https://image.pbs.org/video-assets/mF3oSVn-asset-mezzanine-16x9-QeXjXPy.jpg',
'chapters': [],
},
},
{
'url': 'http://player.pbs.org/widget/partnerplayer/2365297708/?start=0&end=0&chapterbar=false&endscreen=false&topbar=true',
'only_matching': True,
@@ -415,7 +452,7 @@ class PBSIE(InfoExtractor):
{
'url': 'https://player.pbs.org/portalplayer/3004638221/?uid=',
'only_matching': True,
}
},
]
_ERRORS = {
101: 'We\'re sorry, but this video is not yet available.',
@@ -464,10 +501,12 @@ class PBSIE(InfoExtractor):
r"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'", # frontline video embed
r'class="coveplayerid">([^<]+)<', # coveplayer
r'<section[^>]+data-coveid="(\d+)"', # coveplayer from http://www.pbs.org/wgbh/frontline/film/real-csi/
r'\sclass="passportcoveplayer"[^>]*\sdata-media="(\d+)', # https://www.thirteen.org/programs/the-woodwrights-shop/who-wrote-the-book-of-sloyd-fggvvq/
r'<input type="hidden" id="pbs_video_id_[0-9]+" value="([0-9]+)"/>', # jwplayer
r"(?s)window\.PBS\.playerConfig\s*=\s*{.*?id\s*:\s*'([0-9]+)',",
r'<div[^>]+\bdata-cove-id=["\'](\d+)"', # http://www.pbs.org/wgbh/roadshow/watch/episode/2105-indianapolis-hour-2/
r'<iframe[^>]+\bsrc=["\'](?:https?:)?//video\.pbs\.org/widget/partnerplayer/(\d+)', # https://www.pbs.org/wgbh/masterpiece/episodes/victoria-s2-e1/
r'\bhttps?://player\.pbs\.org/[\w-]+player/(\d+)', # last pattern to avoid false positives
]
media_id = self._search_regex(
@@ -518,7 +557,7 @@ class PBSIE(InfoExtractor):
if not video_id:
video_info = self._extract_video_data(
player_page, 'video data', display_id)
video_id = compat_str(
video_id = str(
video_info.get('id') or video_info['contentID'])
else:
video_id = mobj.group('id')
@@ -539,7 +578,7 @@ class PBSIE(InfoExtractor):
if isinstance(video_id, list):
entries = [self.url_result(
'http://video.pbs.org/video/%s' % vid_id, 'PBS', vid_id)
f'http://video.pbs.org/video/{vid_id}', 'PBS', vid_id)
for vid_id in video_id]
return self.playlist_result(entries, display_id)
@@ -568,11 +607,11 @@ class PBSIE(InfoExtractor):
# Player pages may also serve different qualities
for page in ('widget/partnerplayer', 'portalplayer'):
player = self._download_webpage(
'http://player.pbs.org/%s/%s' % (page, video_id),
display_id, 'Downloading %s page' % page, fatal=False)
f'http://player.pbs.org/{page}/{video_id}',
display_id, f'Downloading {page} page', fatal=False)
if player:
video_info = self._extract_video_data(
player, '%s video data' % page, display_id, fatal=False)
player, f'{page} video data', display_id, fatal=False)
if video_info:
extract_redirect_urls(video_info)
if not info:
@@ -603,7 +642,7 @@ class PBSIE(InfoExtractor):
redirect_id = redirect.get('eeid')
redirect_info = self._download_json(
'%s?format=json' % redirect['url'], display_id,
'{}?format=json'.format(redirect['url']), display_id,
'Downloading %s video url info' % (redirect_id or num),
headers=self.geo_verification_headers())
@@ -614,7 +653,7 @@ class PBSIE(InfoExtractor):
self.raise_geo_restricted(
msg=message, countries=self._GEO_COUNTRIES)
raise ExtractorError(
'%s said: %s' % (self.IE_NAME, message), expected=True)
f'{self.IE_NAME} said: {message}', expected=True)
format_url = redirect_info.get('url')
if not format_url:
@@ -649,7 +688,7 @@ class PBSIE(InfoExtractor):
f_url = re.sub(r'\d+k|baseline', bitrate + 'k', http_url)
# This may produce invalid links sometimes (e.g.
# http://www.pbs.org/wgbh/frontline/film/suicide-plan)
if not self._is_valid_url(f_url, display_id, 'http-%sk video' % bitrate):
if not self._is_valid_url(f_url, display_id, f'http-{bitrate}k video'):
continue
f = m3u8_format.copy()
f.update({
@@ -671,7 +710,7 @@ class PBSIE(InfoExtractor):
captions = info.get('cc') or {}
for caption_url in captions.values():
subtitles.setdefault('en', []).append({
'url': caption_url
'url': caption_url,
})
subtitles = self._merge_subtitles(subtitles, hls_subs)
@@ -715,7 +754,7 @@ class PBSKidsIE(InfoExtractor):
'description': 'md5:d006b2211633685d8ebc8d03b6d5611e',
'categories': ['Episode'],
'upload_date': '20190718',
}
},
},
{
'url': 'https://pbskids.org/video/plum-landing/2365205059',
@@ -730,8 +769,8 @@ class PBSKidsIE(InfoExtractor):
'description': 'md5:657e5fc4356a84ead1c061eb280ff05d',
'categories': ['Episode'],
'upload_date': '20140302',
}
}
},
},
]
def _real_extract(self, url):
@@ -753,5 +792,5 @@ class PBSKidsIE(InfoExtractor):
'series': ('video_obj', 'program_title', {str}),
'title': ('video_obj', 'title', {str}),
'upload_date': ('video_obj', 'air_date', {unified_strdate}),
})
}),
}