Upgrade yt_dlp and download script
This commit is contained in:
@@ -4,10 +4,13 @@ import types
|
||||
import urllib.parse
|
||||
import xml.etree.ElementTree
|
||||
|
||||
from .common import InfoExtractor # isort: split
|
||||
from .common import InfoExtractor
|
||||
from .commonprotocols import RtmpIE
|
||||
from .youtube import YoutubeIE
|
||||
from ..compat import compat_etree_fromstring
|
||||
from ..cookies import LenientSimpleCookie
|
||||
from ..networking.exceptions import HTTPError
|
||||
from ..networking.impersonate import ImpersonateTarget
|
||||
from ..utils import (
|
||||
KNOWN_EXTENSIONS,
|
||||
MEDIA_EXTENSIONS,
|
||||
@@ -17,6 +20,7 @@ from ..utils import (
|
||||
determine_protocol,
|
||||
dict_get,
|
||||
extract_basic_auth,
|
||||
filter_dict,
|
||||
format_field,
|
||||
int_or_none,
|
||||
is_html,
|
||||
@@ -33,14 +37,17 @@ from ..utils import (
|
||||
unescapeHTML,
|
||||
unified_timestamp,
|
||||
unsmuggle_url,
|
||||
update_url,
|
||||
update_url_query,
|
||||
url_or_none,
|
||||
urlhandle_detect_ext,
|
||||
urljoin,
|
||||
variadic,
|
||||
xpath_attr,
|
||||
xpath_text,
|
||||
xpath_with_ns,
|
||||
)
|
||||
from ..utils._utils import _UnsafeExtensionError
|
||||
|
||||
|
||||
class GenericIE(InfoExtractor):
|
||||
@@ -58,7 +65,9 @@ class GenericIE(InfoExtractor):
|
||||
'ext': 'mp4',
|
||||
'title': 'trailer',
|
||||
'upload_date': '20100513',
|
||||
}
|
||||
'direct': True,
|
||||
'timestamp': 1273772943.0,
|
||||
},
|
||||
},
|
||||
# Direct link to media delivered compressed (until Accept-Encoding is *)
|
||||
{
|
||||
@@ -71,7 +80,7 @@ class GenericIE(InfoExtractor):
|
||||
'upload_date': '20140522',
|
||||
},
|
||||
'expected_warnings': [
|
||||
'URL could be a direct video link, returning it as such.'
|
||||
'URL could be a direct video link, returning it as such.',
|
||||
],
|
||||
'skip': 'URL invalid',
|
||||
},
|
||||
@@ -101,10 +110,12 @@ class GenericIE(InfoExtractor):
|
||||
'ext': 'webm',
|
||||
'title': '5_Lennart_Poettering_-_Systemd',
|
||||
'upload_date': '20141120',
|
||||
'direct': True,
|
||||
'timestamp': 1416498816.0,
|
||||
},
|
||||
'expected_warnings': [
|
||||
'URL could be a direct video link, returning it as such.'
|
||||
]
|
||||
'URL could be a direct video link, returning it as such.',
|
||||
],
|
||||
},
|
||||
# RSS feed
|
||||
{
|
||||
@@ -112,7 +123,7 @@ class GenericIE(InfoExtractor):
|
||||
'info_dict': {
|
||||
'id': 'https://phihag.de/2014/youtube-dl/rss2.xml',
|
||||
'title': 'Zero Punctuation',
|
||||
'description': 're:.*groundbreaking video review series.*'
|
||||
'description': 're:.*groundbreaking video review series.*',
|
||||
},
|
||||
'playlist_mincount': 11,
|
||||
},
|
||||
@@ -133,6 +144,7 @@ class GenericIE(InfoExtractor):
|
||||
'upload_date': '20201204',
|
||||
},
|
||||
}],
|
||||
'skip': 'Dead link',
|
||||
},
|
||||
# RSS feed with item with description and thumbnails
|
||||
{
|
||||
@@ -145,12 +157,12 @@ class GenericIE(InfoExtractor):
|
||||
'playlist': [{
|
||||
'info_dict': {
|
||||
'ext': 'm4a',
|
||||
'id': 'c1c879525ce2cb640b344507e682c36d',
|
||||
'id': '818a5d38-01cd-152f-2231-ee479677fa82',
|
||||
'title': 're:Hydrogen!',
|
||||
'description': 're:.*In this episode we are going.*',
|
||||
'timestamp': 1567977776,
|
||||
'upload_date': '20190908',
|
||||
'duration': 459,
|
||||
'duration': 423,
|
||||
'thumbnail': r're:^https?://.*\.jpg$',
|
||||
'episode_number': 1,
|
||||
'season_number': 1,
|
||||
@@ -267,6 +279,7 @@ class GenericIE(InfoExtractor):
|
||||
'params': {
|
||||
'skip_download': True,
|
||||
},
|
||||
'skip': '404 Not Found',
|
||||
},
|
||||
# MPD from http://dash-mse-test.appspot.com/media.html
|
||||
{
|
||||
@@ -278,6 +291,20 @@ class GenericIE(InfoExtractor):
|
||||
'title': 'car-20120827-manifest',
|
||||
'formats': 'mincount:9',
|
||||
'upload_date': '20130904',
|
||||
'timestamp': 1378272859.0,
|
||||
},
|
||||
},
|
||||
# Live DASH MPD
|
||||
{
|
||||
'url': 'https://livesim2.dashif.org/livesim2/ato_10/testpic_2s/Manifest.mpd',
|
||||
'info_dict': {
|
||||
'id': 'Manifest',
|
||||
'ext': 'mp4',
|
||||
'title': r're:Manifest \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
|
||||
'live_status': 'is_live',
|
||||
},
|
||||
'params': {
|
||||
'skip_download': 'livestream',
|
||||
},
|
||||
},
|
||||
# m3u8 served with Content-Type: audio/x-mpegURL; charset=utf-8
|
||||
@@ -318,14 +345,14 @@ class GenericIE(InfoExtractor):
|
||||
'id': 'cmQHVoWB5FY',
|
||||
'ext': 'mp4',
|
||||
'upload_date': '20130224',
|
||||
'uploader_id': 'TheVerge',
|
||||
'uploader_id': '@TheVerge',
|
||||
'description': r're:^Chris Ziegler takes a look at the\.*',
|
||||
'uploader': 'The Verge',
|
||||
'title': 'First Firefox OS phones side-by-side',
|
||||
},
|
||||
'params': {
|
||||
'skip_download': False,
|
||||
}
|
||||
},
|
||||
},
|
||||
{
|
||||
# redirect in Refresh HTTP header
|
||||
@@ -351,7 +378,7 @@ class GenericIE(InfoExtractor):
|
||||
'ext': 'mp4',
|
||||
'uploader': 'www.hodiho.fr',
|
||||
'title': 'R\u00e9gis plante sa Jeep',
|
||||
}
|
||||
},
|
||||
},
|
||||
# bandcamp page with custom domain
|
||||
{
|
||||
@@ -365,46 +392,6 @@ class GenericIE(InfoExtractor):
|
||||
},
|
||||
'skip': 'There is a limit of 200 free downloads / month for the test song',
|
||||
},
|
||||
# ooyala video
|
||||
{
|
||||
'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
|
||||
'md5': '166dd577b433b4d4ebfee10b0824d8ff',
|
||||
'info_dict': {
|
||||
'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
|
||||
'ext': 'mp4',
|
||||
'title': '2cc213299525360.mov', # that's what we get
|
||||
'duration': 238.231,
|
||||
},
|
||||
'add_ie': ['Ooyala'],
|
||||
},
|
||||
{
|
||||
# ooyala video embedded with http://player.ooyala.com/iframe.js
|
||||
'url': 'http://www.macrumors.com/2015/07/24/steve-jobs-the-man-in-the-machine-first-trailer/',
|
||||
'info_dict': {
|
||||
'id': 'p0MGJndjoG5SOKqO_hZJuZFPB-Tr5VgB',
|
||||
'ext': 'mp4',
|
||||
'title': '"Steve Jobs: Man in the Machine" trailer',
|
||||
'description': 'The first trailer for the Alex Gibney documentary "Steve Jobs: Man in the Machine."',
|
||||
'duration': 135.427,
|
||||
},
|
||||
'params': {
|
||||
'skip_download': True,
|
||||
},
|
||||
'skip': 'movie expired',
|
||||
},
|
||||
# ooyala video embedded with http://player.ooyala.com/static/v4/production/latest/core.min.js
|
||||
{
|
||||
'url': 'http://wnep.com/2017/07/22/steampunk-fest-comes-to-honesdale/',
|
||||
'info_dict': {
|
||||
'id': 'lwYWYxYzE6V5uJMjNGyKtwwiw9ZJD7t2',
|
||||
'ext': 'mp4',
|
||||
'title': 'Steampunk Fest Comes to Honesdale',
|
||||
'duration': 43.276,
|
||||
},
|
||||
'params': {
|
||||
'skip_download': True,
|
||||
}
|
||||
},
|
||||
# embed.ly video
|
||||
{
|
||||
'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
|
||||
@@ -469,19 +456,19 @@ class GenericIE(InfoExtractor):
|
||||
'id': '370908',
|
||||
'title': 'Госзаказ. День 3',
|
||||
'ext': 'mp4',
|
||||
}
|
||||
},
|
||||
}, {
|
||||
'info_dict': {
|
||||
'id': '370905',
|
||||
'title': 'Госзаказ. День 2',
|
||||
'ext': 'mp4',
|
||||
}
|
||||
},
|
||||
}, {
|
||||
'info_dict': {
|
||||
'id': '370902',
|
||||
'title': 'Госзаказ. День 1',
|
||||
'ext': 'mp4',
|
||||
}
|
||||
},
|
||||
}],
|
||||
'params': {
|
||||
# m3u8 download
|
||||
@@ -497,7 +484,8 @@ class GenericIE(InfoExtractor):
|
||||
'title': 'Ужастики, русский трейлер (2015)',
|
||||
'thumbnail': r're:^https?://.*\.jpg$',
|
||||
'duration': 153,
|
||||
}
|
||||
},
|
||||
'skip': 'Site dead',
|
||||
},
|
||||
# XHamster embed
|
||||
{
|
||||
@@ -521,7 +509,7 @@ class GenericIE(InfoExtractor):
|
||||
'title': 'Hidden miracles of the natural world',
|
||||
'uploader': 'Louie Schwartzberg',
|
||||
'description': 'md5:8145d19d320ff3e52f28401f4c4283b9',
|
||||
}
|
||||
},
|
||||
},
|
||||
# nowvideo embed hidden behind percent encoding
|
||||
{
|
||||
@@ -546,7 +534,7 @@ class GenericIE(InfoExtractor):
|
||||
'upload_date': '20140320',
|
||||
},
|
||||
'params': {
|
||||
'skip_download': 'Requires rtmpdump'
|
||||
'skip_download': 'Requires rtmpdump',
|
||||
},
|
||||
'skip': 'video gone',
|
||||
},
|
||||
@@ -567,8 +555,8 @@ class GenericIE(InfoExtractor):
|
||||
'skip_download': True,
|
||||
},
|
||||
'expected_warnings': [
|
||||
'Forbidden'
|
||||
]
|
||||
'Forbidden',
|
||||
],
|
||||
},
|
||||
# Condé Nast embed
|
||||
{
|
||||
@@ -578,7 +566,7 @@ class GenericIE(InfoExtractor):
|
||||
'id': '53501be369702d3275860000',
|
||||
'ext': 'mp4',
|
||||
'title': 'Honda’s New Asimo Robot Is More Human Than Ever',
|
||||
}
|
||||
},
|
||||
},
|
||||
# Dailymotion embed
|
||||
{
|
||||
@@ -625,7 +613,7 @@ class GenericIE(InfoExtractor):
|
||||
'add_ie': ['Youtube'],
|
||||
'params': {
|
||||
'skip_download': True,
|
||||
}
|
||||
},
|
||||
},
|
||||
# MTVServices embed
|
||||
{
|
||||
@@ -654,7 +642,7 @@ class GenericIE(InfoExtractor):
|
||||
},
|
||||
'params': {
|
||||
'skip_download': True,
|
||||
}
|
||||
},
|
||||
},
|
||||
# Flowplayer
|
||||
{
|
||||
@@ -666,7 +654,7 @@ class GenericIE(InfoExtractor):
|
||||
'age_limit': 18,
|
||||
'uploader': 'www.handjobhub.com',
|
||||
'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com',
|
||||
}
|
||||
},
|
||||
},
|
||||
# MLB embed
|
||||
{
|
||||
@@ -710,7 +698,7 @@ class GenericIE(InfoExtractor):
|
||||
'uploader': 'Sophos Security',
|
||||
'title': 'Chet Chat 171 - Oct 29, 2014',
|
||||
'upload_date': '20141029',
|
||||
}
|
||||
},
|
||||
},
|
||||
# Soundcloud multiple embeds
|
||||
{
|
||||
@@ -744,7 +732,7 @@ class GenericIE(InfoExtractor):
|
||||
'ext': 'flv',
|
||||
'upload_date': '20141112',
|
||||
'title': 'Rosetta #CometLanding webcast HL 10',
|
||||
}
|
||||
},
|
||||
},
|
||||
# Another Livestream embed, without 'new.' in URL
|
||||
{
|
||||
@@ -769,15 +757,17 @@ class GenericIE(InfoExtractor):
|
||||
'playlist_mincount': 1,
|
||||
'add_ie': ['Youtube'],
|
||||
},
|
||||
# Cinchcast embed
|
||||
# Libsyn embed
|
||||
{
|
||||
'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/',
|
||||
'info_dict': {
|
||||
'id': '7141703',
|
||||
'id': '3793998',
|
||||
'ext': 'mp3',
|
||||
'upload_date': '20141126',
|
||||
'title': 'Jack Tips: 5 Steps to Permanent Gut Healing',
|
||||
}
|
||||
'title': 'Underground Wellness Radio - Jack Tips: 5 Steps to Permanent Gut Healing',
|
||||
'thumbnail': 'https://assets.libsyn.com/secure/item/3793998/?height=90&width=90',
|
||||
'duration': 3989.0,
|
||||
},
|
||||
},
|
||||
# Cinerama player
|
||||
{
|
||||
@@ -787,7 +777,7 @@ class GenericIE(InfoExtractor):
|
||||
'ext': 'mp4',
|
||||
'uploader': 'www.abc.net.au',
|
||||
'title': 'Game of Thrones with dice - Dungeons and Dragons fantasy role-playing game gets new life - 19/01/2015',
|
||||
}
|
||||
},
|
||||
},
|
||||
# embedded viddler video
|
||||
{
|
||||
@@ -904,7 +894,7 @@ class GenericIE(InfoExtractor):
|
||||
'info_dict': {
|
||||
'id': 'aanslagen-kopenhagen',
|
||||
'title': 'Aanslagen Kopenhagen',
|
||||
}
|
||||
},
|
||||
},
|
||||
# Zapiks embed
|
||||
{
|
||||
@@ -913,7 +903,7 @@ class GenericIE(InfoExtractor):
|
||||
'id': '118046',
|
||||
'ext': 'mp4',
|
||||
'title': 'EP3S5 - Bon Appétit - Baqueira Mi Corazon !',
|
||||
}
|
||||
},
|
||||
},
|
||||
# Kaltura embed (different embed code)
|
||||
{
|
||||
@@ -952,11 +942,11 @@ class GenericIE(InfoExtractor):
|
||||
},
|
||||
'add_ie': ['Kaltura'],
|
||||
'expected_warnings': [
|
||||
'Could not send HEAD request'
|
||||
'Could not send HEAD request',
|
||||
],
|
||||
'params': {
|
||||
'skip_download': True,
|
||||
}
|
||||
},
|
||||
},
|
||||
{
|
||||
# Kaltura embedded, some fileExt broken (#11480)
|
||||
@@ -1083,7 +1073,7 @@ class GenericIE(InfoExtractor):
|
||||
'info_dict': {
|
||||
'id': '8RUoRhRi',
|
||||
'ext': 'mp4',
|
||||
'title': "Fox & Friends Says Protecting Atheists From Discrimination Is Anti-Christian!",
|
||||
'title': 'Fox & Friends Says Protecting Atheists From Discrimination Is Anti-Christian!',
|
||||
'description': 'md5:e1a46ad1650e3a5ec7196d432799127f',
|
||||
'timestamp': 1428207000,
|
||||
'upload_date': '20150405',
|
||||
@@ -1159,7 +1149,7 @@ class GenericIE(InfoExtractor):
|
||||
'uploader': 'clickhole',
|
||||
'upload_date': '20150527',
|
||||
'timestamp': 1432744860,
|
||||
}
|
||||
},
|
||||
},
|
||||
# SnagFilms embed
|
||||
{
|
||||
@@ -1168,7 +1158,7 @@ class GenericIE(InfoExtractor):
|
||||
'id': '74849a00-85a9-11e1-9660-123139220831',
|
||||
'ext': 'mp4',
|
||||
'title': '#whilewewatch',
|
||||
}
|
||||
},
|
||||
},
|
||||
# AdobeTVVideo embed
|
||||
{
|
||||
@@ -1464,7 +1454,7 @@ class GenericIE(InfoExtractor):
|
||||
'upload_date': '20211217',
|
||||
'thumbnail': 'https://www.megatv.com/wp-content/uploads/2021/12/tsiodras-mitsotakis-1024x545.jpg',
|
||||
},
|
||||
}]
|
||||
}],
|
||||
},
|
||||
{
|
||||
'url': 'https://www.ertnews.gr/video/manolis-goyalles-o-anthropos-piso-apo-ti-diadiktyaki-vasilopita/',
|
||||
@@ -1558,16 +1548,6 @@ class GenericIE(InfoExtractor):
|
||||
'title': 'Стас Намин: «Мы нарушили девственность Кремля»',
|
||||
},
|
||||
},
|
||||
{
|
||||
# vzaar embed
|
||||
'url': 'http://help.vzaar.com/article/165-embedding-video',
|
||||
'md5': '7e3919d9d2620b89e3e00bec7fe8c9d4',
|
||||
'info_dict': {
|
||||
'id': '8707641',
|
||||
'ext': 'mp4',
|
||||
'title': 'Building A Business Online: Principal Chairs Q & A',
|
||||
},
|
||||
},
|
||||
{
|
||||
# multiple HTML5 videos on one page
|
||||
'url': 'https://www.paragon-software.com/home/rk-free/keyscenarios.html',
|
||||
@@ -1585,7 +1565,7 @@ class GenericIE(InfoExtractor):
|
||||
'id': '0f64ce6',
|
||||
'title': 'vl14062007715967',
|
||||
'ext': 'mp4',
|
||||
}
|
||||
},
|
||||
},
|
||||
{
|
||||
'url': 'http://www.heidelberg-laureate-forum.org/blog/video/lecture-friday-september-23-2016-sir-c-antony-r-hoare/',
|
||||
@@ -1597,7 +1577,7 @@ class GenericIE(InfoExtractor):
|
||||
'description': 'md5:5a51db84a62def7b7054df2ade403c6c',
|
||||
'timestamp': 1474354800,
|
||||
'upload_date': '20160920',
|
||||
}
|
||||
},
|
||||
},
|
||||
{
|
||||
'url': 'http://www.kidzworld.com/article/30935-trolls-the-beat-goes-on-interview-skylar-astin-and-amanda-leighton',
|
||||
@@ -1689,7 +1669,7 @@ class GenericIE(InfoExtractor):
|
||||
'info_dict': {
|
||||
'id': '83645793',
|
||||
'title': 'Lock up and get excited',
|
||||
'ext': 'mp4'
|
||||
'ext': 'mp4',
|
||||
},
|
||||
'skip': 'TODO: fix nested playlists processing in tests',
|
||||
},
|
||||
@@ -1765,7 +1745,7 @@ class GenericIE(InfoExtractor):
|
||||
'upload_date': '20220110',
|
||||
'thumbnail': 'https://opentv-static.siliconweb.com/imgHandler/1920/70bc39fa-895b-4918-a364-c39d2135fc6d.jpg',
|
||||
|
||||
}
|
||||
},
|
||||
},
|
||||
{
|
||||
# blogger embed
|
||||
@@ -1935,8 +1915,8 @@ class GenericIE(InfoExtractor):
|
||||
'timestamp': 1501941939.0,
|
||||
'title': 'That small heart attack.',
|
||||
'upload_date': '20170805',
|
||||
'uploader': 'Antw87'
|
||||
}
|
||||
'uploader': 'Antw87',
|
||||
},
|
||||
},
|
||||
{
|
||||
# 1080p Reddit-hosted video that will redirect and be processed by RedditIE
|
||||
@@ -1948,8 +1928,8 @@ class GenericIE(InfoExtractor):
|
||||
'title': "The game Didn't want me to Knife that Guy I guess",
|
||||
'uploader': 'paraf1ve',
|
||||
'timestamp': 1636788683.0,
|
||||
'upload_date': '20211113'
|
||||
}
|
||||
'upload_date': '20211113',
|
||||
},
|
||||
},
|
||||
{
|
||||
# MainStreaming player
|
||||
@@ -1961,15 +1941,15 @@ class GenericIE(InfoExtractor):
|
||||
'ext': 'mp4',
|
||||
'live_status': 'not_live',
|
||||
'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster',
|
||||
'duration': 1512
|
||||
}
|
||||
'duration': 1512,
|
||||
},
|
||||
},
|
||||
{
|
||||
# Multiple gfycat iframe embeds
|
||||
'url': 'https://www.gezip.net/bbs/board.php?bo_table=entertaine&wr_id=613422',
|
||||
'info_dict': {
|
||||
'title': '재이, 윤, 세은 황금 드레스를 입고 빛난다',
|
||||
'id': 'board'
|
||||
'id': 'board',
|
||||
},
|
||||
'playlist_count': 8,
|
||||
},
|
||||
@@ -1978,18 +1958,18 @@ class GenericIE(InfoExtractor):
|
||||
'url': 'https://www.gezip.net/bbs/board.php?bo_table=entertaine&wr_id=612199',
|
||||
'info_dict': {
|
||||
'title': '옳게 된 크롭 니트 스테이씨 아이사',
|
||||
'id': 'board'
|
||||
'id': 'board',
|
||||
},
|
||||
'playlist_count': 6
|
||||
'playlist_count': 6,
|
||||
},
|
||||
{
|
||||
# Multiple gfycat embeds, with uppercase "IFR" in urls
|
||||
'url': 'https://kkzz.kr/?vid=2295',
|
||||
'info_dict': {
|
||||
'title': '지방시 앰버서더 에스파 카리나 움짤',
|
||||
'id': '?vid=2295'
|
||||
'id': '?vid=2295',
|
||||
},
|
||||
'playlist_count': 9
|
||||
'playlist_count': 9,
|
||||
},
|
||||
{
|
||||
# Panopto embeds
|
||||
@@ -2022,9 +2002,9 @@ class GenericIE(InfoExtractor):
|
||||
'url': 'https://www.hs.fi/kotimaa/art-2000008762560.html',
|
||||
'info_dict': {
|
||||
'title': 'Koronavirus | Epidemiahuippu voi olla Suomessa ohi, mutta koronaviruksen poistamista yleisvaarallisten tautien joukosta harkitaan vasta syksyllä',
|
||||
'id': 'art-2000008762560'
|
||||
'id': 'art-2000008762560',
|
||||
},
|
||||
'playlist_count': 3
|
||||
'playlist_count': 3,
|
||||
},
|
||||
{
|
||||
# Ruutu embed in hs.fi with a single video
|
||||
@@ -2053,7 +2033,7 @@ class GenericIE(InfoExtractor):
|
||||
'thumbnail': 'https://www.filmarkivet.se/wp-content/uploads/parisdmoll2.jpg',
|
||||
'timestamp': 1652833414,
|
||||
'age_limit': 0,
|
||||
}
|
||||
},
|
||||
},
|
||||
{
|
||||
'url': 'https://www.mollymovieclub.com/p/interstellar?s=r#details',
|
||||
@@ -2093,7 +2073,7 @@ class GenericIE(InfoExtractor):
|
||||
'thumbnail': 'https://cdn.jwplayer.com/v2/media/YTmgRiNU/poster.jpg?width=720',
|
||||
'duration': 5688.0,
|
||||
'upload_date': '20210111',
|
||||
}
|
||||
},
|
||||
},
|
||||
{
|
||||
'note': 'JSON LD with multiple @type',
|
||||
@@ -2109,7 +2089,7 @@ class GenericIE(InfoExtractor):
|
||||
'upload_date': '20200411',
|
||||
'age_limit': 0,
|
||||
'duration': 111.0,
|
||||
}
|
||||
},
|
||||
},
|
||||
{
|
||||
'note': 'JSON LD with unexpected data type',
|
||||
@@ -2124,7 +2104,7 @@ class GenericIE(InfoExtractor):
|
||||
'thumbnail': r're:^https://media.autoweek.nl/m/.+\.jpg$',
|
||||
'age_limit': 0,
|
||||
'direct': True,
|
||||
}
|
||||
},
|
||||
},
|
||||
{
|
||||
'note': 'server returns data in brotli compression by default if `accept-encoding: *` is specified.',
|
||||
@@ -2142,22 +2122,6 @@ class GenericIE(InfoExtractor):
|
||||
'age_limit': 0,
|
||||
},
|
||||
},
|
||||
{
|
||||
'note': 'JW Player embed with unicode-escape sequences in URL',
|
||||
'url': 'https://www.medici.tv/en/concerts/lahav-shani-mozart-mahler-israel-philharmonic-abu-dhabi-classics',
|
||||
'info_dict': {
|
||||
'id': 'm',
|
||||
'ext': 'mp4',
|
||||
'title': 'Lahav Shani conducts the Israel Philharmonic\'s first-ever concert in Abu Dhabi',
|
||||
'description': 'Mahler\'s ',
|
||||
'uploader': 'www.medici.tv',
|
||||
'age_limit': 0,
|
||||
'thumbnail': r're:^https?://.+\.jpg',
|
||||
},
|
||||
'params': {
|
||||
'skip_download': True,
|
||||
},
|
||||
},
|
||||
{
|
||||
'url': 'https://shooshtime.com/videos/284002/just-out-of-the-shower-joi/',
|
||||
'md5': 'e2f0a4c329f7986280b7328e24036d60',
|
||||
@@ -2202,7 +2166,7 @@ class GenericIE(InfoExtractor):
|
||||
|
||||
def report_following_redirect(self, new_url):
|
||||
"""Report information extraction."""
|
||||
self._downloader.to_screen('[redirect] Following redirect to %s' % new_url)
|
||||
self._downloader.to_screen(f'[redirect] Following redirect to {new_url}')
|
||||
|
||||
def report_detected(self, name, num=1, note=None):
|
||||
if num > 1:
|
||||
@@ -2221,7 +2185,15 @@ class GenericIE(InfoExtractor):
|
||||
urllib.parse.urlparse(fragment_query).query or fragment_query
|
||||
or urllib.parse.urlparse(manifest_url).query or None)
|
||||
|
||||
hex_or_none = lambda x: x if re.fullmatch(r'(0x)?[\da-f]+', x, re.IGNORECASE) else None
|
||||
key_query = self._configuration_arg('key_query', [None], casesense=True)[0]
|
||||
if key_query is not None:
|
||||
info['extra_param_to_key_url'] = (
|
||||
urllib.parse.urlparse(key_query).query or key_query
|
||||
or urllib.parse.urlparse(manifest_url).query or None)
|
||||
|
||||
def hex_or_none(value):
|
||||
return value if re.fullmatch(r'(0x)?[\da-f]+', value, re.IGNORECASE) else None
|
||||
|
||||
info['hls_aes'] = traverse_obj(self._configuration_arg('hls_key', casesense=True), {
|
||||
'uri': (0, {url_or_none}), 'key': (0, {hex_or_none}), 'iv': (1, {hex_or_none}),
|
||||
}) or None
|
||||
@@ -2242,10 +2214,21 @@ class GenericIE(InfoExtractor):
|
||||
if is_live is not None:
|
||||
info['live_status'] = 'not_live' if is_live == 'false' else 'is_live'
|
||||
return
|
||||
headers = m3u8_format.get('http_headers') or info.get('http_headers')
|
||||
duration = self._extract_m3u8_vod_duration(
|
||||
m3u8_format['url'], info.get('id'), note='Checking m3u8 live status',
|
||||
errnote='Failed to download m3u8 media playlist', headers=headers)
|
||||
headers = m3u8_format.get('http_headers') or info.get('http_headers') or {}
|
||||
display_id = info.get('id')
|
||||
urlh = self._request_webpage(
|
||||
m3u8_format['url'], display_id, 'Checking m3u8 live status', errnote=False,
|
||||
headers={**headers, 'Accept-Encoding': 'identity'}, fatal=False)
|
||||
if urlh is False:
|
||||
return
|
||||
first_bytes = urlh.read(512)
|
||||
if not first_bytes.startswith(b'#EXTM3U'):
|
||||
return
|
||||
m3u8_doc = self._webpage_read_content(
|
||||
urlh, urlh.url, display_id, prefix=first_bytes, fatal=False, errnote=False)
|
||||
if not m3u8_doc:
|
||||
return
|
||||
duration = self._parse_m3u8_vod_duration(m3u8_doc, display_id)
|
||||
if not duration:
|
||||
info['live_status'] = 'is_live'
|
||||
info['duration'] = info.get('duration') or duration
|
||||
@@ -2298,29 +2281,29 @@ class GenericIE(InfoExtractor):
|
||||
return video_url # not obfuscated
|
||||
|
||||
parsed = urllib.parse.urlparse(video_url[len('function/0/'):])
|
||||
license = cls._kvs_get_license_token(license_code)
|
||||
license_token = cls._kvs_get_license_token(license_code)
|
||||
urlparts = parsed.path.split('/')
|
||||
|
||||
HASH_LENGTH = 32
|
||||
hash = urlparts[3][:HASH_LENGTH]
|
||||
hash_ = urlparts[3][:HASH_LENGTH]
|
||||
indices = list(range(HASH_LENGTH))
|
||||
|
||||
# Swap indices of hash according to the destination calculated from the license token
|
||||
accum = 0
|
||||
for src in reversed(range(HASH_LENGTH)):
|
||||
accum += license[src]
|
||||
accum += license_token[src]
|
||||
dest = (src + accum) % HASH_LENGTH
|
||||
indices[src], indices[dest] = indices[dest], indices[src]
|
||||
|
||||
urlparts[3] = ''.join(hash[index] for index in indices) + urlparts[3][HASH_LENGTH:]
|
||||
urlparts[3] = ''.join(hash_[index] for index in indices) + urlparts[3][HASH_LENGTH:]
|
||||
return urllib.parse.urlunparse(parsed._replace(path='/'.join(urlparts)))
|
||||
|
||||
@staticmethod
|
||||
def _kvs_get_license_token(license):
|
||||
license = license.replace('$', '')
|
||||
license_values = [int(char) for char in license]
|
||||
def _kvs_get_license_token(license_code):
|
||||
license_code = license_code.replace('$', '')
|
||||
license_values = [int(char) for char in license_code]
|
||||
|
||||
modlicense = license.replace('0', '1')
|
||||
modlicense = license_code.replace('0', '1')
|
||||
center = len(modlicense) // 2
|
||||
fronthalf = int(modlicense[:center + 1])
|
||||
backhalf = int(modlicense[center:])
|
||||
@@ -2370,7 +2353,7 @@ class GenericIE(InfoExtractor):
|
||||
'id': flashvars['video_id'],
|
||||
'display_id': display_id,
|
||||
'title': title,
|
||||
'thumbnail': thumbnail,
|
||||
'thumbnail': urljoin(url, thumbnail),
|
||||
'formats': formats,
|
||||
}
|
||||
|
||||
@@ -2385,25 +2368,24 @@ class GenericIE(InfoExtractor):
|
||||
default_search = 'fixup_error'
|
||||
|
||||
if default_search in ('auto', 'auto_warning', 'fixup_error'):
|
||||
if re.match(r'^[^\s/]+\.[^\s/]+/', url):
|
||||
if re.match(r'[^\s/]+\.[^\s/]+/', url):
|
||||
self.report_warning('The url doesn\'t specify the protocol, trying with http')
|
||||
return self.url_result('http://' + url)
|
||||
elif default_search != 'fixup_error':
|
||||
if default_search == 'auto_warning':
|
||||
if re.match(r'^(?:url|URL)$', url):
|
||||
raise ExtractorError(
|
||||
'Invalid URL: %r . Call yt-dlp like this: yt-dlp -v "https://www.youtube.com/watch?v=BaW_jenozKc" ' % url,
|
||||
f'Invalid URL: {url!r} . Call yt-dlp like this: yt-dlp -v "https://www.youtube.com/watch?v=BaW_jenozKc" ',
|
||||
expected=True)
|
||||
else:
|
||||
self.report_warning(
|
||||
'Falling back to youtube search for %s . Set --default-search "auto" to suppress this warning.' % url)
|
||||
f'Falling back to youtube search for {url} . Set --default-search "auto" to suppress this warning.')
|
||||
return self.url_result('ytsearch:' + url)
|
||||
|
||||
if default_search in ('error', 'fixup_error'):
|
||||
raise ExtractorError(
|
||||
'%r is not a valid URL. '
|
||||
'Set --default-search "ytsearch" (or run yt-dlp "ytsearch:%s" ) to search YouTube'
|
||||
% (url, url), expected=True)
|
||||
f'{url!r} is not a valid URL. '
|
||||
f'Set --default-search "ytsearch" (or run yt-dlp "ytsearch:{url}" ) to search YouTube', expected=True)
|
||||
else:
|
||||
if ':' not in default_search:
|
||||
default_search += ':'
|
||||
@@ -2419,6 +2401,11 @@ class GenericIE(InfoExtractor):
|
||||
else:
|
||||
video_id = self._generic_id(url)
|
||||
|
||||
# Do not impersonate by default; see https://github.com/yt-dlp/yt-dlp/issues/11335
|
||||
impersonate = self._configuration_arg('impersonate', ['false'])
|
||||
if 'false' in impersonate:
|
||||
impersonate = None
|
||||
|
||||
# Some webservers may serve compressed content of rather big size (e.g. gzipped flac)
|
||||
# making it impossible to download only chunk of the file (yet we need only 512kB to
|
||||
# test whether it's HTML or not). According to yt-dlp default Accept-Encoding
|
||||
@@ -2427,12 +2414,30 @@ class GenericIE(InfoExtractor):
|
||||
# to accept raw bytes and being able to download only a chunk.
|
||||
# It may probably better to solve this by checking Content-Type for application/octet-stream
|
||||
# after a HEAD request, but not sure if we can rely on this.
|
||||
full_response = self._request_webpage(url, video_id, headers={
|
||||
'Accept-Encoding': 'identity',
|
||||
**smuggled_data.get('http_headers', {})
|
||||
})
|
||||
try:
|
||||
full_response = self._request_webpage(url, video_id, headers=filter_dict({
|
||||
'Accept-Encoding': 'identity',
|
||||
'Referer': smuggled_data.get('referer'),
|
||||
}), impersonate=impersonate)
|
||||
except ExtractorError as e:
|
||||
if not (isinstance(e.cause, HTTPError) and e.cause.status == 403
|
||||
and e.cause.response.get_header('cf-mitigated') == 'challenge'
|
||||
and e.cause.response.extensions.get('impersonate') is None):
|
||||
raise
|
||||
cf_cookie_domain = traverse_obj(
|
||||
LenientSimpleCookie(e.cause.response.get_header('set-cookie')),
|
||||
('__cf_bm', 'domain'))
|
||||
if cf_cookie_domain:
|
||||
self.write_debug(f'Clearing __cf_bm cookie for {cf_cookie_domain}')
|
||||
self.cookiejar.clear(domain=cf_cookie_domain, path='/', name='__cf_bm')
|
||||
msg = 'Got HTTP Error 403 caused by Cloudflare anti-bot challenge; '
|
||||
if not self._downloader._impersonate_target_available(ImpersonateTarget()):
|
||||
msg += ('see https://github.com/yt-dlp/yt-dlp#impersonation for '
|
||||
'how to install the required impersonation dependency, and ')
|
||||
raise ExtractorError(
|
||||
f'{msg}try again with --extractor-args "generic:impersonate"', expected=True)
|
||||
|
||||
new_url = full_response.url
|
||||
url = urllib.parse.urlparse(url)._replace(scheme=urllib.parse.urlparse(new_url).scheme).geturl()
|
||||
if new_url != extract_basic_auth(url)[0]:
|
||||
self.report_following_redirect(new_url)
|
||||
if force_videoid:
|
||||
@@ -2442,29 +2447,29 @@ class GenericIE(InfoExtractor):
|
||||
info_dict = {
|
||||
'id': video_id,
|
||||
'title': self._generic_title(url),
|
||||
'timestamp': unified_timestamp(full_response.headers.get('Last-Modified'))
|
||||
'timestamp': unified_timestamp(full_response.headers.get('Last-Modified')),
|
||||
}
|
||||
|
||||
# Check for direct link to a video
|
||||
content_type = full_response.headers.get('Content-Type', '').lower()
|
||||
m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type)
|
||||
m = re.match(r'(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type)
|
||||
if m:
|
||||
self.report_detected('direct video link')
|
||||
headers = smuggled_data.get('http_headers', {})
|
||||
headers = filter_dict({'Referer': smuggled_data.get('referer')})
|
||||
format_id = str(m.group('format_id'))
|
||||
ext = determine_ext(url)
|
||||
ext = determine_ext(url, default_ext=None) or urlhandle_detect_ext(full_response)
|
||||
subtitles = {}
|
||||
if format_id.endswith('mpegurl') or ext == 'm3u8':
|
||||
formats, subtitles = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4', headers=headers)
|
||||
elif format_id.endswith('mpd') or format_id.endswith('dash+xml') or ext == 'mpd':
|
||||
formats, subtitles = self._extract_mpd_formats_and_subtitles(url, video_id, headers=headers)
|
||||
elif format_id == 'f4m' or ext == 'f4m':
|
||||
formats = self._extract_f4m_formats(url, video_id, headers=headers)
|
||||
# Don't check for DASH/mpd here, do it later w/ first_bytes. Same number of requests either way
|
||||
else:
|
||||
formats = [{
|
||||
'format_id': format_id,
|
||||
'url': url,
|
||||
'vcodec': 'none' if m.group('type') == 'audio' else None
|
||||
'ext': ext,
|
||||
'vcodec': 'none' if m.group('type') == 'audio' else None,
|
||||
}]
|
||||
info_dict['direct'] = True
|
||||
info_dict.update({
|
||||
@@ -2493,9 +2498,13 @@ class GenericIE(InfoExtractor):
|
||||
if not is_html(first_bytes):
|
||||
self.report_warning(
|
||||
'URL could be a direct video link, returning it as such.')
|
||||
ext = determine_ext(url)
|
||||
if ext not in _UnsafeExtensionError.ALLOWED_EXTENSIONS:
|
||||
ext = 'unknown_video'
|
||||
info_dict.update({
|
||||
'direct': True,
|
||||
'url': url,
|
||||
'ext': ext,
|
||||
})
|
||||
return info_dict
|
||||
|
||||
@@ -2512,7 +2521,7 @@ class GenericIE(InfoExtractor):
|
||||
try:
|
||||
doc = compat_etree_fromstring(webpage)
|
||||
except xml.etree.ElementTree.ParseError:
|
||||
doc = compat_etree_fromstring(webpage.encode('utf-8'))
|
||||
doc = compat_etree_fromstring(webpage.encode())
|
||||
if doc.tag == 'rss':
|
||||
self.report_detected('RSS feed')
|
||||
return self._extract_rss(url, video_id, doc)
|
||||
@@ -2529,13 +2538,15 @@ class GenericIE(InfoExtractor):
|
||||
return self.playlist_result(
|
||||
self._parse_xspf(
|
||||
doc, video_id, xspf_url=url,
|
||||
xspf_base_url=full_response.url),
|
||||
xspf_base_url=new_url),
|
||||
video_id)
|
||||
elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag):
|
||||
info_dict['formats'], info_dict['subtitles'] = self._parse_mpd_formats_and_subtitles(
|
||||
doc,
|
||||
mpd_base_url=full_response.url.rpartition('/')[0],
|
||||
# Do not use yt_dlp.utils.base_url here since it will raise on file:// URLs
|
||||
mpd_base_url=update_url(new_url, query=None, fragment=None).rpartition('/')[0],
|
||||
mpd_url=url)
|
||||
info_dict['live_status'] = 'is_live' if doc.get('type') == 'dynamic' else None
|
||||
self._extra_manifest_info(info_dict, url)
|
||||
self.report_detected('DASH manifest')
|
||||
return info_dict
|
||||
@@ -2660,7 +2671,7 @@ class GenericIE(InfoExtractor):
|
||||
})
|
||||
# https://docs.videojs.com/player#addRemoteTextTrack
|
||||
# https://html.spec.whatwg.org/multipage/media.html#htmltrackelement
|
||||
for sub_match in re.finditer(rf'(?s){re.escape(varname)}' r'\.addRemoteTextTrack\(({.+?})\s*,\s*(?:true|false)\)', webpage):
|
||||
for sub_match in re.finditer(rf'(?s){re.escape(varname)}' + r'\.addRemoteTextTrack\(({.+?})\s*,\s*(?:true|false)\)', webpage):
|
||||
sub = self._parse_json(
|
||||
sub_match.group(1), video_id, transform_source=js_to_json, fatal=False) or {}
|
||||
sub_src = str_or_none(sub.get('src'))
|
||||
@@ -2701,7 +2712,7 @@ class GenericIE(InfoExtractor):
|
||||
'url': smuggle_url(json_ld['url'], {
|
||||
'force_videoid': video_id,
|
||||
'to_generic': True,
|
||||
'http_headers': {'Referer': url},
|
||||
'referer': url,
|
||||
}),
|
||||
}, json_ld)]
|
||||
|
||||
@@ -2782,7 +2793,7 @@ class GenericIE(InfoExtractor):
|
||||
REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)'
|
||||
found = re.search(
|
||||
r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
|
||||
r'(?:[a-z-]+="[^"]+"\s+)*?content="%s' % REDIRECT_REGEX,
|
||||
rf'(?:[a-z-]+="[^"]+"\s+)*?content="{REDIRECT_REGEX}',
|
||||
webpage)
|
||||
if not found:
|
||||
# Look also in Refresh HTTP header
|
||||
@@ -2826,7 +2837,7 @@ class GenericIE(InfoExtractor):
|
||||
|
||||
video_id = os.path.splitext(video_id)[0]
|
||||
headers = {
|
||||
'referer': actual_url
|
||||
'referer': actual_url,
|
||||
}
|
||||
|
||||
entry_info_dict = {
|
||||
@@ -2883,5 +2894,5 @@ class GenericIE(InfoExtractor):
|
||||
for num, e in enumerate(entries, start=1):
|
||||
# 'url' results don't have a title
|
||||
if e.get('title') is not None:
|
||||
e['title'] = '%s (%d)' % (e['title'], num)
|
||||
e['title'] = '{} ({})'.format(e['title'], num)
|
||||
return entries
|
||||
|
Reference in New Issue
Block a user