Upgrade yt_dlp and download script

This commit is contained in:
2025-05-02 16:11:08 -05:00
parent 3a2e8eeb08
commit d68d9ce4f9
1194 changed files with 60099 additions and 44436 deletions

View File

@@ -1,12 +1,13 @@
import re
from .common import InfoExtractor
from ..networking import HEADRequest
from ..utils import (
ExtractorError,
GeoRestrictedError,
clean_html,
determine_ext,
ExtractorError,
filter_dict,
GeoRestrictedError,
int_or_none,
join_nonempty,
parse_duration,
@@ -27,6 +28,29 @@ class RaiBaseIE(InfoExtractor):
_GEO_COUNTRIES = ['IT']
_GEO_BYPASS = False
def _fix_m3u8_formats(self, media_url, video_id):
fmts = self._extract_m3u8_formats(
media_url, video_id, 'mp4', m3u8_id='hls', fatal=False)
# Fix malformed m3u8 manifests by setting audio-only/video-only formats
for f in fmts:
if not f.get('acodec'):
f['acodec'] = 'mp4a'
if not f.get('vcodec'):
f['vcodec'] = 'avc1'
man_url = f['url']
if re.search(r'chunklist(?:_b\d+)*_ao[_.]', man_url): # audio only
f['vcodec'] = 'none'
elif re.search(r'chunklist(?:_b\d+)*_vo[_.]', man_url): # video only
f['acodec'] = 'none'
else: # video+audio
if f['acodec'] == 'none':
f['acodec'] = 'mp4a'
if f['vcodec'] == 'none':
f['vcodec'] = 'avc1'
return fmts
def _extract_relinker_info(self, relinker_url, video_id, audio_only=False):
def fix_cdata(s):
# remove \r\n\t before and after <![CDATA[ ]]> to avoid
@@ -68,8 +92,7 @@ class RaiBaseIE(InfoExtractor):
'format_id': 'https-mp3',
})
elif ext == 'm3u8' or 'format=m3u8' in media_url:
formats.extend(self._extract_m3u8_formats(
media_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
formats.extend(self._fix_m3u8_formats(media_url, video_id))
elif ext == 'f4m':
# very likely no longer needed. Cannot find any url that uses it.
manifest_url = update_url_query(
@@ -91,7 +114,7 @@ class RaiBaseIE(InfoExtractor):
self.raise_geo_restricted(countries=self._GEO_COUNTRIES, metadata_available=True)
if not audio_only and not is_live:
formats.extend(self._create_http_urls(media_url, relinker_url, formats))
formats.extend(self._create_http_urls(media_url, relinker_url, formats, video_id))
return filter_dict({
'is_live': is_live,
@@ -99,7 +122,7 @@ class RaiBaseIE(InfoExtractor):
'formats': formats,
})
def _create_http_urls(self, manifest_url, relinker_url, fmts):
def _create_http_urls(self, manifest_url, relinker_url, fmts, video_id):
_MANIFEST_REG = r'/(?P<id>\w+)(?:_(?P<quality>[\d\,]+))?(?:\.mp4)?(?:\.csmil)?/playlist\.m3u8'
_MP4_TMPL = '%s&overrideUserAgentRule=mp4-%s'
_QUALITY = {
@@ -120,7 +143,7 @@ class RaiBaseIE(InfoExtractor):
}
def percentage(number, target, pc=20, roof=125):
'''check if the target is in the range of number +/- percent'''
"""check if the target is in the range of number +/- percent"""
if not number or number < 0:
return False
return abs(target - number) < min(float(number) * float(pc) / 100.0, roof)
@@ -152,10 +175,10 @@ class RaiBaseIE(InfoExtractor):
'format_id': f'https-{tbr}',
'width': format_copy.get('width'),
'height': format_copy.get('height'),
'tbr': format_copy.get('tbr'),
'vcodec': format_copy.get('vcodec'),
'acodec': format_copy.get('acodec'),
'fps': format_copy.get('fps'),
'tbr': format_copy.get('tbr') or tbr,
'vcodec': format_copy.get('vcodec') or 'avc1',
'acodec': format_copy.get('acodec') or 'mp4a',
'fps': format_copy.get('fps') or 25,
} if format_copy else {
'format_id': f'https-{tbr}',
'width': _QUALITY[tbr][0],
@@ -166,9 +189,17 @@ class RaiBaseIE(InfoExtractor):
'fps': 25,
}
# Check if MP4 download is available
try:
self._request_webpage(
HEADRequest(_MP4_TMPL % (relinker_url, '*')), video_id, 'Checking MP4 availability')
except ExtractorError as e:
self.to_screen(f'{video_id}: MP4 direct download is not available: {e.cause}')
return []
# filter out single-stream formats
fmts = [f for f in fmts
if not f.get('vcodec') == 'none' and not f.get('acodec') == 'none']
if f.get('vcodec') != 'none' and f.get('acodec') != 'none']
mobj = re.search(_MANIFEST_REG, manifest_url)
if not mobj:
@@ -182,7 +213,7 @@ class RaiBaseIE(InfoExtractor):
'url': _MP4_TMPL % (relinker_url, q),
'protocol': 'https',
'ext': 'mp4',
**get_format_info(q)
**get_format_info(q),
})
return formats
@@ -236,7 +267,7 @@ class RaiPlayIE(RaiBaseIE):
'series': 'Report',
'season': '2013/14',
'subtitles': {'it': 'count:4'},
'release_year': 2022,
'release_year': 2024,
'episode': 'Espresso nel caffè - 07/04/2014',
'timestamp': 1396919880,
'upload_date': '20140408',
@@ -244,7 +275,7 @@ class RaiPlayIE(RaiBaseIE):
},
'params': {'skip_download': True},
}, {
# 1080p direct mp4 url
# 1080p
'url': 'https://www.raiplay.it/video/2021/11/Blanca-S1E1-Senza-occhi-b1255a4a-8e72-4a2f-b9f3-fc1308e00736.html',
'md5': 'aeda7243115380b2dd5e881fd42d949a',
'info_dict': {
@@ -265,10 +296,10 @@ class RaiPlayIE(RaiBaseIE):
'episode': 'Senza occhi',
'timestamp': 1637318940,
'upload_date': '20211119',
'formats': 'count:12',
'formats': 'count:7',
},
'params': {'skip_download': True},
'expected_warnings': ['Video not available. Likely due to geo-restriction.']
'expected_warnings': ['Video not available. Likely due to geo-restriction.'],
}, {
# 1500 quality
'url': 'https://www.raiplay.it/video/2012/09/S1E11---Tutto-cio-che-luccica-0cab3323-732e-45d6-8e86-7704acab6598.html',
@@ -290,6 +321,27 @@ class RaiPlayIE(RaiBaseIE):
'timestamp': 1348495020,
'upload_date': '20120924',
},
}, {
# checking program_info gives false positive for DRM
'url': 'https://www.raiplay.it/video/2022/10/Ad-ogni-costo---Un-giorno-in-Pretura---Puntata-del-15102022-1dfd1295-ea38-4bac-b51e-f87e2881693b.html',
'md5': '572c6f711b7c5f2d670ba419b4ae3b08',
'info_dict': {
'id': '1dfd1295-ea38-4bac-b51e-f87e2881693b',
'ext': 'mp4',
'title': 'Ad ogni costo - Un giorno in Pretura - Puntata del 15/10/2022',
'alt_title': 'St 2022/23 - Un giorno in pretura - Ad ogni costo',
'description': 'md5:4046d97b2687f74f06a8b8270ba5599f',
'uploader': 'Rai 3',
'duration': 3773.0,
'thumbnail': 'https://www.raiplay.it/dl/img/2022/10/12/1665586539957_2048x2048.png',
'creators': ['Rai 3'],
'series': 'Un giorno in pretura',
'season': '2022/23',
'episode': 'Ad ogni costo',
'timestamp': 1665507240,
'upload_date': '20221011',
'release_year': 2025,
},
}, {
'url': 'http://www.raiplay.it/video/2016/11/gazebotraindesi-efebe701-969c-4593-92f3-285f0d1ce750.html?',
'only_matching': True,
@@ -309,9 +361,8 @@ class RaiPlayIE(RaiBaseIE):
media = self._download_json(
f'{base}.json', video_id, 'Downloading video JSON')
if not self.get_param('allow_unplayable_formats'):
if traverse_obj(media, (('program_info', None), 'rights_management', 'rights', 'drm')):
self.report_drm(video_id)
if traverse_obj(media, ('rights_management', 'rights', 'drm')):
self.report_drm(video_id)
video = media['video']
relinker_info = self._extract_relinker_info(video['content_url'], video_id)
@@ -342,7 +393,7 @@ class RaiPlayIE(RaiBaseIE):
'episode_number': int_or_none(media.get('episode')),
'subtitles': self._extract_subtitles(url, video),
'release_year': int_or_none(traverse_obj(media, ('track_info', 'edit_year'))),
**relinker_info
**relinker_info,
}
@@ -518,7 +569,7 @@ class RaiPlaySoundPlaylistIE(InfoExtractor):
'info_dict': {
'id': 'ilruggitodelconiglio',
'title': 'Il Ruggito del Coniglio',
'description': 'md5:48cff6972435964284614d70474132e6',
'description': 'md5:62a627b3a2d0635d08fa8b6e0a04f27e',
},
'playlist_mincount': 65,
}, {
@@ -565,7 +616,7 @@ class RaiIE(RaiBaseIE):
'upload_date': '20140612',
},
'params': {'skip_download': True},
'expected_warnings': ['Video not available. Likely due to geo-restriction.']
'expected_warnings': ['Video not available. Likely due to geo-restriction.'],
}, {
'url': 'https://www.rai.it/dl/RaiTV/programmi/media/ContentItem-efb17665-691c-45d5-a60c-5301333cbb0c.html',
'info_dict': {
@@ -575,7 +626,7 @@ class RaiIE(RaiBaseIE):
'description': 'TG1 edizione integrale ore 20:00 del giorno 03/11/2016',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 2214,
'upload_date': '20161103'
'upload_date': '20161103',
},
'params': {'skip_download': True},
}, {
@@ -601,7 +652,7 @@ class RaiIE(RaiBaseIE):
'ext': media.get('formatoAudio'),
'vcodec': 'none',
'acodec': media.get('formatoAudio'),
}]
}],
}
elif 'Video' in media['type']:
relinker_info = self._extract_relinker_info(media['mediaUri'], content_id)
@@ -621,23 +672,24 @@ class RaiIE(RaiBaseIE):
'upload_date': unified_strdate(media.get('date')),
'duration': parse_duration(media.get('length')),
'subtitles': self._extract_subtitles(url, media),
**relinker_info
**relinker_info,
}
class RaiNewsIE(RaiIE): # XXX: Do not subclass from concrete IE
class RaiNewsIE(RaiBaseIE):
_VALID_URL = rf'https?://(www\.)?rainews\.it/(?!articoli)[^?#]+-(?P<id>{RaiBaseIE._UUID_RE})(?:-[^/?#]+)?\.html'
_EMBED_REGEX = [rf'<iframe[^>]+data-src="(?P<url>/iframe/[^?#]+?{RaiBaseIE._UUID_RE}\.html)']
_TESTS = [{
# new rainews player (#3911)
'url': 'https://www.rainews.it/rubriche/24mm/video/2022/05/24mm-del-29052022-12cf645d-1ffd-4220-b27c-07c226dbdecf.html',
'url': 'https://www.rainews.it/video/2024/02/membri-della-croce-rossa-evacuano-gli-abitanti-di-un-villaggio-nella-regione-ucraina-di-kharkiv-il-filmato-dallucraina--31e8017c-845c-43f5-9c48-245b43c3a079.html',
'info_dict': {
'id': '12cf645d-1ffd-4220-b27c-07c226dbdecf',
'id': '31e8017c-845c-43f5-9c48-245b43c3a079',
'ext': 'mp4',
'title': 'Puntata del 29/05/2022',
'duration': 1589,
'upload_date': '20220529',
'title': 'md5:1e81364b09de4a149042bac3c7d36f0b',
'duration': 196,
'upload_date': '20240225',
'uploader': 'rainews',
'formats': 'count:2',
},
'params': {'skip_download': True},
}, {
@@ -650,7 +702,8 @@ class RaiNewsIE(RaiIE): # XXX: Do not subclass from concrete IE
'description': 'I film in uscita questa settimana.',
'thumbnail': r're:^https?://.*\.png$',
'duration': 833,
'upload_date': '20161103'
'upload_date': '20161103',
'formats': 'count:8',
},
'params': {'skip_download': True},
'expected_warnings': ['unable to extract player_data'],
@@ -675,7 +728,7 @@ class RaiNewsIE(RaiIE): # XXX: Do not subclass from concrete IE
if not relinker_url:
# fallback on old implementation for some old content
try:
return self._extract_from_content_id(video_id, url)
return RaiIE._real_extract(self, url)
except GeoRestrictedError:
raise
except ExtractorError as e:
@@ -688,7 +741,7 @@ class RaiNewsIE(RaiIE): # XXX: Do not subclass from concrete IE
'title': player_data.get('title') or track_info.get('title') or self._og_search_title(webpage),
'upload_date': unified_strdate(track_info.get('date')),
'uploader': strip_or_none(track_info.get('editor') or None),
**relinker_info
**relinker_info,
}