Upgrade yt_dlp and download script

This commit is contained in:
2025-05-02 16:11:08 -05:00
parent 3a2e8eeb08
commit d68d9ce4f9
1194 changed files with 60099 additions and 44436 deletions

View File

@@ -1,8 +1,10 @@
import functools
import itertools
import urllib.parse
from .common import InfoExtractor
from .sproutvideo import VidsIoIE
from .vimeo import VimeoIE
from ..compat import compat_urllib_parse_unquote
from ..networking.exceptions import HTTPError
from ..utils import (
KNOWN_EXTENSIONS,
@@ -12,29 +14,35 @@ from ..utils import (
int_or_none,
mimetype2ext,
parse_iso8601,
smuggle_url,
str_or_none,
traverse_obj,
try_get,
url_or_none,
urljoin,
)
from ..utils.traversal import traverse_obj, value
class PatreonBaseIE(InfoExtractor):
USER_AGENT = 'Patreon/7.6.28 (Android; Android 11; Scale/2.10)'
@functools.cached_property
def patreon_user_agent(self):
# Patreon mobile UA is needed to avoid triggering Cloudflare anti-bot protection.
# Newer UA yields higher res m3u8 formats for locked posts, but gives 401 if not logged-in
if self._get_cookies('https://www.patreon.com/').get('session_id'):
return 'Patreon/72.2.28 (Android; Android 14; Scale/2.10)'
return 'Patreon/7.6.28 (Android; Android 11; Scale/2.10)'
def _call_api(self, ep, item_id, query=None, headers=None, fatal=True, note=None):
if headers is None:
headers = {}
if 'User-Agent' not in headers:
headers['User-Agent'] = self.USER_AGENT
headers['User-Agent'] = self.patreon_user_agent
if query:
query.update({'json-api-version': 1.0})
try:
return self._download_json(
f'https://www.patreon.com/api/{ep}',
item_id, note='Downloading API JSON' if not note else note,
item_id, note=note if note else 'Downloading API JSON',
query=query, fatal=fatal, headers=headers)
except ExtractorError as e:
if not isinstance(e.cause, HTTPError) or mimetype2ext(e.cause.response.headers.get('Content-Type')) != 'json':
@@ -47,6 +55,7 @@ class PatreonBaseIE(InfoExtractor):
class PatreonIE(PatreonBaseIE):
IE_NAME = 'patreon'
_VALID_URL = r'https?://(?:www\.)?patreon\.com/(?:creation\?hid=|posts/(?:[\w-]+-)?)(?P<id>\d+)'
_TESTS = [{
'url': 'http://www.patreon.com/creation?hid=743933',
@@ -54,6 +63,7 @@ class PatreonIE(PatreonBaseIE):
'info_dict': {
'id': '743933',
'ext': 'mp3',
'alt_title': 'cd166.mp3',
'title': 'Episode 166: David Smalley of Dogma Debate',
'description': 'md5:34d207dd29aa90e24f1b3f58841b81c7',
'uploader': 'Cognitive Dissonance Podcast',
@@ -92,7 +102,7 @@ class PatreonIE(PatreonBaseIE):
'thumbnail': 're:^https?://.*$',
'upload_date': '20150211',
'description': 'md5:8af6425f50bd46fbf29f3db0fc3a8364',
'uploader_id': 'TraciJHines',
'uploader_id': '@TraciHinesMusic',
'categories': ['Entertainment'],
'duration': 282,
'view_count': int,
@@ -106,13 +116,16 @@ class PatreonIE(PatreonBaseIE):
'availability': 'public',
'channel_follower_count': int,
'playable_in_embed': True,
'uploader_url': 'http://www.youtube.com/user/TraciJHines',
'uploader_url': 'https://www.youtube.com/@TraciHinesMusic',
'comment_count': int,
'channel_is_verified': True,
'chapters': 'count:4',
'timestamp': 1423689666,
},
'params': {
'noplaylist': True,
'skip_download': True,
}
},
}, {
'url': 'https://www.patreon.com/posts/episode-166-of-743933',
'only_matching': True,
@@ -132,7 +145,7 @@ class PatreonIE(PatreonBaseIE):
'description': 'md5:557a409bd79d3898689419094934ba79',
'uploader_id': '14936315',
},
'skip': 'Patron-only content'
'skip': 'Patron-only content',
}, {
# m3u8 video (https://github.com/yt-dlp/yt-dlp/issues/2277)
'url': 'https://www.patreon.com/posts/video-sketchbook-32452882',
@@ -153,7 +166,7 @@ class PatreonIE(PatreonBaseIE):
'channel_id': '1641751',
'channel_url': 'https://www.patreon.com/loish',
'channel_follower_count': int,
}
},
}, {
# bad videos under media (if media is included). Real one is under post_file
'url': 'https://www.patreon.com/posts/premium-access-70282931',
@@ -176,13 +189,99 @@ class PatreonIE(PatreonBaseIE):
'uploader_url': 'https://www.patreon.com/thenormies',
},
'skip': 'Patron-only content',
}, {
# dead vimeo and embed URLs, need to extract post_file
'url': 'https://www.patreon.com/posts/hunter-x-hunter-34007913',
'info_dict': {
'id': '34007913',
'ext': 'mp4',
'title': 'Hunter x Hunter | Kurapika DESTROYS Uvogin!!!',
'like_count': int,
'uploader': 'YaBoyRoshi',
'timestamp': 1581636833,
'channel_url': 'https://www.patreon.com/yaboyroshi',
'thumbnail': r're:^https?://.*$',
'tags': ['Hunter x Hunter'],
'uploader_id': '14264111',
'comment_count': int,
'channel_follower_count': int,
'description': 'Kurapika is a walking cheat code!',
'upload_date': '20200213',
'channel_id': '2147162',
'uploader_url': 'https://www.patreon.com/yaboyroshi',
},
}, {
# NSFW vimeo embed URL
'url': 'https://www.patreon.com/posts/4k-spiderman-4k-96414599',
'info_dict': {
'id': '902250943',
'ext': 'mp4',
'title': '❤️(4K) Spiderman Girl Yeonhwas Gift ❤️(4K) 스파이더맨걸 연화의 선물',
'description': '❤️(4K) Spiderman Girl Yeonhwas Gift \n❤️(4K) 스파이더맨걸 연화의 선물',
'uploader': 'Npickyeonhwa',
'uploader_id': '90574422',
'uploader_url': 'https://www.patreon.com/Yeonhwa726',
'channel_id': '10237902',
'channel_url': 'https://www.patreon.com/Yeonhwa726',
'duration': 70,
'timestamp': 1705150153,
'upload_date': '20240113',
'comment_count': int,
'like_count': int,
'thumbnail': r're:^https?://.+',
},
'params': {'skip_download': 'm3u8'},
'expected_warnings': ['Failed to parse XML: not well-formed'],
}, {
# multiple attachments/embeds
'url': 'https://www.patreon.com/posts/holy-wars-solos-100601977',
'playlist_count': 3,
'info_dict': {
'id': '100601977',
'title': '"Holy Wars" (Megadeth) Solos Transcription & Lesson/Analysis',
'description': 'md5:d099ab976edfce6de2a65c2b169a88d3',
'uploader': 'Bradley Hall',
'uploader_id': '24401883',
'uploader_url': 'https://www.patreon.com/bradleyhallguitar',
'channel_id': '3193932',
'channel_url': 'https://www.patreon.com/bradleyhallguitar',
'channel_follower_count': int,
'timestamp': 1710777855,
'upload_date': '20240318',
'like_count': int,
'comment_count': int,
'thumbnail': r're:^https?://.+',
},
'skip': 'Patron-only content',
}, {
# Contains a comment reply in the 'included' section
'url': 'https://www.patreon.com/posts/114721679',
'info_dict': {
'id': '114721679',
'ext': 'mp4',
'upload_date': '20241025',
'uploader': 'Japanalysis',
'like_count': int,
'thumbnail': r're:^https?://.+',
'comment_count': int,
'title': 'Karasawa Part 2',
'description': 'Part 2 of this video https://www.youtube.com/watch?v=Azms2-VTASk',
'uploader_url': 'https://www.patreon.com/japanalysis',
'uploader_id': '80504268',
'channel_url': 'https://www.patreon.com/japanalysis',
'channel_follower_count': int,
'timestamp': 1729897015,
'channel_id': '9346307',
},
'params': {'getcomments': True},
}]
_RETURN_TYPE = 'video'
def _real_extract(self, url):
video_id = self._match_id(url)
post = self._call_api(
f'posts/{video_id}', video_id, query={
'fields[media]': 'download_url,mimetype,size_bytes',
'fields[media]': 'download_url,mimetype,size_bytes,file_name',
'fields[post]': 'comment_count,content,embed,image,like_count,post_file,published_at,title,current_user_can_view',
'fields[user]': 'full_name,url',
'fields[post_tag]': 'value',
@@ -191,102 +290,120 @@ class PatreonIE(PatreonBaseIE):
'include': 'audio,user,user_defined_tags,campaign,attachments_media',
})
attributes = post['data']['attributes']
title = attributes['title'].strip()
image = attributes.get('image') or {}
info = {
'id': video_id,
'title': title,
'description': clean_html(attributes.get('content')),
'thumbnail': image.get('large_url') or image.get('url'),
'timestamp': parse_iso8601(attributes.get('published_at')),
'like_count': int_or_none(attributes.get('like_count')),
'comment_count': int_or_none(attributes.get('comment_count')),
}
can_view_post = traverse_obj(attributes, 'current_user_can_view')
if can_view_post and info['comment_count']:
info['__post_extractor'] = self.extract_comments(video_id)
info = traverse_obj(attributes, {
'title': ('title', {str.strip}),
'description': ('content', {clean_html}),
'thumbnail': ('image', ('large_url', 'url'), {url_or_none}, any),
'timestamp': ('published_at', {parse_iso8601}),
'like_count': ('like_count', {int_or_none}),
'comment_count': ('comment_count', {int_or_none}),
})
for i in post.get('included', []):
i_type = i.get('type')
if i_type == 'media':
media_attributes = i.get('attributes') or {}
download_url = media_attributes.get('download_url')
entries = []
idx = 0
for include in traverse_obj(post, ('included', lambda _, v: v['type'])):
include_type = include['type']
if include_type == 'media':
media_attributes = traverse_obj(include, ('attributes', {dict})) or {}
download_url = url_or_none(media_attributes.get('download_url'))
ext = mimetype2ext(media_attributes.get('mimetype'))
# if size_bytes is None, this media file is likely unavailable
# See: https://github.com/yt-dlp/yt-dlp/issues/4608
size_bytes = int_or_none(media_attributes.get('size_bytes'))
if download_url and ext in KNOWN_EXTENSIONS and size_bytes is not None:
# XXX: what happens if there are multiple attachments?
return {
**info,
idx += 1
entries.append({
'id': f'{video_id}-{idx}',
'ext': ext,
'filesize': size_bytes,
'url': download_url,
}
elif i_type == 'user':
user_attributes = i.get('attributes')
if user_attributes:
info.update({
'uploader': user_attributes.get('full_name'),
'uploader_id': str_or_none(i.get('id')),
'uploader_url': user_attributes.get('url'),
'alt_title': traverse_obj(media_attributes, ('file_name', {str})),
})
elif i_type == 'post_tag':
info.setdefault('tags', []).append(traverse_obj(i, ('attributes', 'value')))
elif include_type == 'user':
info.update(traverse_obj(include, {
'uploader': ('attributes', 'full_name', {str}),
'uploader_id': ('id', {str_or_none}),
'uploader_url': ('attributes', 'url', {url_or_none}),
}))
elif i_type == 'campaign':
info.update({
'channel': traverse_obj(i, ('attributes', 'title')),
'channel_id': str_or_none(i.get('id')),
'channel_url': traverse_obj(i, ('attributes', 'url')),
'channel_follower_count': int_or_none(traverse_obj(i, ('attributes', 'patron_count'))),
})
elif include_type == 'post_tag':
if post_tag := traverse_obj(include, ('attributes', 'value', {str})):
info.setdefault('tags', []).append(post_tag)
elif include_type == 'campaign':
info.update(traverse_obj(include, {
'channel': ('attributes', 'title', {str}),
'channel_id': ('id', {str_or_none}),
'channel_url': ('attributes', 'url', {url_or_none}),
'channel_follower_count': ('attributes', 'patron_count', {int_or_none}),
}))
# all-lowercase 'referer' so we can smuggle it to Generic, SproutVideo, Vimeo
headers = {'referer': 'https://patreon.com/'}
# handle Vimeo embeds
if try_get(attributes, lambda x: x['embed']['provider']) == 'Vimeo':
embed_html = try_get(attributes, lambda x: x['embed']['html'])
v_url = url_or_none(compat_urllib_parse_unquote(
self._search_regex(r'(https(?:%3A%2F%2F|://)player\.vimeo\.com.+app_id(?:=|%3D)+\d+)', embed_html, 'vimeo url', fatal=False)))
if v_url:
return {
**info,
'_type': 'url_transparent',
'url': VimeoIE._smuggle_referrer(v_url, 'https://patreon.com'),
'ie_key': 'Vimeo',
}
if traverse_obj(attributes, ('embed', 'provider')) == 'Vimeo':
v_url = urllib.parse.unquote(self._html_search_regex(
r'(https(?:%3A%2F%2F|://)player\.vimeo\.com.+app_id(?:=|%3D)+\d+)',
traverse_obj(attributes, ('embed', 'html', {str})), 'vimeo url', fatal=False) or '')
if url_or_none(v_url) and self._request_webpage(
v_url, video_id, 'Checking Vimeo embed URL', headers=headers,
fatal=False, errnote=False, expected_status=429): # 429 is TLS fingerprint rejection
entries.append(self.url_result(
VimeoIE._smuggle_referrer(v_url, 'https://patreon.com/'),
VimeoIE, url_transparent=True))
embed_url = try_get(attributes, lambda x: x['embed']['url'])
if embed_url:
return {
**info,
'_type': 'url',
'url': embed_url,
}
embed_url = traverse_obj(attributes, ('embed', 'url', {url_or_none}))
if embed_url and (urlh := self._request_webpage(
embed_url, video_id, 'Checking embed URL', headers=headers,
fatal=False, errnote=False, expected_status=403)):
# Vimeo's Cloudflare anti-bot protection will return HTTP status 200 for 404, so we need
# to check for "Sorry, we couldn&amp;rsquo;t find that page" in the meta description tag
meta_description = clean_html(self._html_search_meta(
'description', self._webpage_read_content(urlh, embed_url, video_id, fatal=False), default=None))
# Password-protected vids.io embeds return 403 errors w/o --video-password or session cookie
if ((urlh.status != 403 and meta_description != 'Sorry, we couldnt find that page')
or VidsIoIE.suitable(embed_url)):
entries.append(self.url_result(smuggle_url(embed_url, headers)))
post_file = traverse_obj(attributes, 'post_file')
post_file = traverse_obj(attributes, ('post_file', {dict}))
if post_file:
name = post_file.get('name')
ext = determine_ext(name)
if ext in KNOWN_EXTENSIONS:
return {
**info,
entries.append({
'id': video_id,
'ext': ext,
'url': post_file['url'],
}
elif name == 'video':
})
elif name == 'video' or determine_ext(post_file.get('url')) == 'm3u8':
formats, subtitles = self._extract_m3u8_formats_and_subtitles(post_file['url'], video_id)
return {
**info,
entries.append({
'id': video_id,
'formats': formats,
'subtitles': subtitles,
}
})
if can_view_post is False:
can_view_post = traverse_obj(attributes, 'current_user_can_view')
comments = None
if can_view_post and info.get('comment_count'):
comments = self.extract_comments(video_id)
if not entries and can_view_post is False:
self.raise_no_formats('You do not have access to this post', video_id=video_id, expected=True)
else:
elif not entries:
self.raise_no_formats('No supported media found in this post', video_id=video_id, expected=True)
elif len(entries) == 1:
info.update(entries[0])
else:
for entry in entries:
entry.update(info)
return self.playlist_result(entries, video_id, **info, __post_extractor=comments)
info['id'] = video_id
info['__post_extractor'] = comments
return info
def _get_comments(self, post_id):
@@ -307,29 +424,27 @@ class PatreonIE(PatreonBaseIE):
params.update({'page[cursor]': cursor} if cursor else {})
response = self._call_api(
f'posts/{post_id}/comments', post_id, query=params, note='Downloading comments page %d' % page)
f'posts/{post_id}/comments', post_id, query=params, note=f'Downloading comments page {page}')
cursor = None
for comment in traverse_obj(response, (('data', ('included', lambda _, v: v['type'] == 'comment')), ...)):
for comment in traverse_obj(response, (('data', 'included'), lambda _, v: v['type'] == 'comment' and v['id'])):
count += 1
comment_id = comment.get('id')
attributes = comment.get('attributes') or {}
if comment_id is None:
continue
author_id = traverse_obj(comment, ('relationships', 'commenter', 'data', 'id'))
author_info = traverse_obj(
response, ('included', lambda _, v: v['id'] == author_id and v['type'] == 'user', 'attributes'),
get_all=False, expected_type=dict, default={})
yield {
'id': comment_id,
'text': attributes.get('body'),
'timestamp': parse_iso8601(attributes.get('created')),
'parent': traverse_obj(comment, ('relationships', 'parent', 'data', 'id'), default='root'),
'author_is_uploader': attributes.get('is_by_creator'),
**traverse_obj(comment, {
'id': ('id', {str_or_none}),
'text': ('attributes', 'body', {str}),
'timestamp': ('attributes', 'created', {parse_iso8601}),
'parent': ('relationships', 'parent', 'data', ('id', {value('root')}), {str}, any),
'author_is_uploader': ('attributes', 'is_by_creator', {bool}),
}),
**traverse_obj(response, (
'included', lambda _, v: v['id'] == author_id and v['type'] == 'user', 'attributes', {
'author': ('full_name', {str}),
'author_thumbnail': ('image_url', {url_or_none}),
}), get_all=False),
'author_id': author_id,
'author': author_info.get('full_name'),
'author_thumbnail': author_info.get('image_url'),
}
if count < traverse_obj(response, ('meta', 'count')):
@@ -340,15 +455,19 @@ class PatreonIE(PatreonBaseIE):
class PatreonCampaignIE(PatreonBaseIE):
_VALID_URL = r'https?://(?:www\.)?patreon\.com/(?!rss)(?:(?:m/(?P<campaign_id>\d+))|(?P<vanity>[-\w]+))'
IE_NAME = 'patreon:campaign'
_VALID_URL = r'''(?x)
https?://(?:www\.)?patreon\.com/(?:
(?:m|api/campaigns)/(?P<campaign_id>\d+)|
(?:c/)?(?P<vanity>(?!creation[?/]|posts/|rss[?/])[\w-]+)
)(?:/posts)?/?(?:$|[?#])'''
_TESTS = [{
'url': 'https://www.patreon.com/dissonancepod/',
'info_dict': {
'title': 'Cognitive Dissonance Podcast',
'channel_url': 'https://www.patreon.com/dissonancepod',
'id': '80642',
'description': 'md5:eb2fa8b83da7ab887adeac34da6b7af7',
'description': r're:(?s).*We produce a weekly news podcast focusing on stories that deal with skepticism and religion.*',
'channel_id': '80642',
'channel': 'Cognitive Dissonance Podcast',
'age_limit': 0,
@@ -363,31 +482,66 @@ class PatreonCampaignIE(PatreonBaseIE):
'url': 'https://www.patreon.com/m/4767637/posts',
'info_dict': {
'title': 'Not Just Bikes',
'channel_follower_count': int,
'id': '4767637',
'channel_id': '4767637',
'channel_url': 'https://www.patreon.com/notjustbikes',
'description': 'md5:595c6e7dca76ae615b1d38c298a287a1',
'description': r're:(?s).*Not Just Bikes started as a way to explain why we chose to live in the Netherlands.*',
'age_limit': 0,
'channel': 'Not Just Bikes',
'uploader_url': 'https://www.patreon.com/notjustbikes',
'uploader': 'Not Just Bikes',
'uploader': 'Jason',
'uploader_id': '37306634',
'thumbnail': r're:^https?://.*$',
},
'playlist_mincount': 71
'playlist_mincount': 71,
}, {
'url': 'https://www.patreon.com/api/campaigns/4243769/posts',
'info_dict': {
'title': 'Second Thought',
'channel_follower_count': int,
'id': '4243769',
'channel_id': '4243769',
'channel_url': 'https://www.patreon.com/secondthought',
'description': r're:(?s).*Second Thought is an educational YouTube channel.*',
'age_limit': 0,
'channel': 'Second Thought',
'uploader_url': 'https://www.patreon.com/secondthought',
'uploader': 'JT Chapman',
'uploader_id': '32718287',
'thumbnail': r're:^https?://.*$',
},
'playlist_mincount': 201,
}, {
'url': 'https://www.patreon.com/c/OgSog',
'info_dict': {
'id': '8504388',
'title': 'OGSoG',
'description': r're:(?s)Hello and welcome to our Patreon page. We are Mari, Lasercorn, .+',
'channel': 'OGSoG',
'channel_id': '8504388',
'channel_url': 'https://www.patreon.com/OgSog',
'uploader_url': 'https://www.patreon.com/OgSog',
'uploader_id': '72323575',
'uploader': 'David Moss',
'thumbnail': r're:https?://.+/.+',
'channel_follower_count': int,
'age_limit': 0,
},
'playlist_mincount': 331,
}, {
'url': 'https://www.patreon.com/c/OgSog/posts',
'only_matching': True,
}, {
'url': 'https://www.patreon.com/dissonancepod/posts',
'only_matching': True
'only_matching': True,
}, {
'url': 'https://www.patreon.com/m/5932659',
'only_matching': True
'only_matching': True,
}, {
'url': 'https://www.patreon.com/api/campaigns/4243769',
'only_matching': True,
}]
@classmethod
def suitable(cls, url):
return False if PatreonIE.suitable(url) else super(PatreonCampaignIE, cls).suitable(url)
def _entries(self, campaign_id):
cursor = None
params = {
@@ -401,7 +555,7 @@ class PatreonCampaignIE(PatreonBaseIE):
for page in itertools.count(1):
params.update({'page[cursor]': cursor} if cursor else {})
posts_json = self._call_api('posts', campaign_id, query=params, note='Downloading posts page %d' % page)
posts_json = self._call_api('posts', campaign_id, query=params, note=f'Downloading posts page {page}')
cursor = traverse_obj(posts_json, ('meta', 'pagination', 'cursors', 'next'))
for post_url in traverse_obj(posts_json, ('data', ..., 'attributes', 'patreon_url')):
@@ -414,14 +568,15 @@ class PatreonCampaignIE(PatreonBaseIE):
campaign_id, vanity = self._match_valid_url(url).group('campaign_id', 'vanity')
if campaign_id is None:
webpage = self._download_webpage(url, vanity, headers={'User-Agent': self.USER_AGENT})
campaign_id = self._search_regex(r'https://www.patreon.com/api/campaigns/(\d+)/?', webpage, 'Campaign ID')
webpage = self._download_webpage(url, vanity, headers={'User-Agent': self.patreon_user_agent})
campaign_id = self._search_nextjs_data(
webpage, vanity)['props']['pageProps']['bootstrapEnvelope']['pageBootstrap']['campaign']['data']['id']
params = {
'json-api-use-default-includes': 'false',
'fields[user]': 'full_name,url',
'fields[campaign]': 'name,summary,url,patron_count,creation_count,is_nsfw,avatar_photo_url',
'include': 'creator'
'include': 'creator',
}
campaign_response = self._call_api(