Updated yt-dlp version

2023-08-13 20:13:21 -05:00
parent 5264103f31
commit ee3e042b1b
344 changed files with 20808 additions and 7875 deletions


@@ -1,10 +1,9 @@
import functools
import json
import re
import urllib.error
from .common import InfoExtractor
from .periscope import PeriscopeBaseIE, PeriscopeIE
from ..compat import functools # isort: split
from ..compat import (
compat_parse_qs,
compat_urllib_parse_unquote,
@@ -13,10 +12,12 @@ from ..compat import (
from ..utils import (
ExtractorError,
dict_get,
filter_dict,
float_or_none,
format_field,
int_or_none,
make_archive_id,
remove_end,
str_or_none,
strip_or_none,
traverse_obj,
@@ -30,13 +31,67 @@ from ..utils import (
class TwitterBaseIE(InfoExtractor):
_NETRC_MACHINE = 'twitter'
_API_BASE = 'https://api.twitter.com/1.1/'
_GRAPHQL_API_BASE = 'https://twitter.com/i/api/graphql/'
_TOKENS = {
'AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA': None,
'AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekMxqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28NHfOPqkca3qaAxGfsyKCs0wRbw': None,
}
_BASE_REGEX = r'https?://(?:(?:www|m(?:obile)?)\.)?(?:twitter\.com|twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid\.onion)/'
_AUTH = 'AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'
_LEGACY_AUTH = 'AAAAAAAAAAAAAAAAAAAAAIK1zgAAAAAA2tUWuhGZ2JceoId5GwYWU5GspY4%3DUq7gzFoCZs1QfwGoVdvSac3IniczZEYXIcDyumCauIXpcAPorE'
_flow_token = None
_LOGIN_INIT_DATA = json.dumps({
'input_flow_data': {
'flow_context': {
'debug_overrides': {},
'start_location': {
'location': 'unknown'
}
}
},
'subtask_versions': {
'action_list': 2,
'alert_dialog': 1,
'app_download_cta': 1,
'check_logged_in_account': 1,
'choice_selection': 3,
'contacts_live_sync_permission_prompt': 0,
'cta': 7,
'email_verification': 2,
'end_flow': 1,
'enter_date': 1,
'enter_email': 2,
'enter_password': 5,
'enter_phone': 2,
'enter_recaptcha': 1,
'enter_text': 5,
'enter_username': 2,
'generic_urt': 3,
'in_app_notification': 1,
'interest_picker': 3,
'js_instrumentation': 1,
'menu_dialog': 1,
'notifications_permission_prompt': 2,
'open_account': 2,
'open_home_timeline': 1,
'open_link': 1,
'phone_verification': 4,
'privacy_options': 1,
'security_key': 3,
'select_avatar': 4,
'select_banner': 2,
'settings_list': 7,
'show_code': 1,
'sign_up': 2,
'sign_up_review': 4,
'tweet_selection_urt': 1,
'update_users': 1,
'upload_media': 1,
'user_recommendations_list': 4,
'user_recommendations_urt': 1,
'wait_spinner': 3,
'web_modal': 1
}
}, separators=(',', ':')).encode()
def _extract_variant_formats(self, variant, video_id):
variant_url = variant.get('url')
@@ -88,73 +143,179 @@ class TwitterBaseIE(InfoExtractor):
'height': int(m.group('height')),
})
@functools.cached_property
@property
def is_logged_in(self):
return bool(self._get_cookies(self._API_BASE).get('auth_token'))
def _call_api(self, path, video_id, query={}, graphql=False):
cookies = self._get_cookies(self._API_BASE)
headers = {}
def _fetch_guest_token(self, display_id):
guest_token = traverse_obj(self._download_json(
f'{self._API_BASE}guest/activate.json', display_id, 'Downloading guest token', data=b'',
headers=self._set_base_headers(legacy=display_id and self._configuration_arg('legacy_api'))),
('guest_token', {str}))
if not guest_token:
raise ExtractorError('Could not retrieve guest token')
return guest_token
csrf_cookie = cookies.get('ct0')
if csrf_cookie:
headers['x-csrf-token'] = csrf_cookie.value
def _set_base_headers(self, legacy=False):
bearer_token = self._LEGACY_AUTH if legacy and not self.is_logged_in else self._AUTH
return filter_dict({
'Authorization': f'Bearer {bearer_token}',
'x-csrf-token': try_call(lambda: self._get_cookies(self._API_BASE)['ct0'].value),
})
def _call_login_api(self, note, headers, query={}, data=None):
response = self._download_json(
f'{self._API_BASE}onboarding/task.json', None, note,
headers=headers, query=query, data=data, expected_status=400)
error = traverse_obj(response, ('errors', 0, 'message', {str}))
if error:
raise ExtractorError(f'Login failed, Twitter API says: {error}', expected=True)
elif traverse_obj(response, 'status') != 'success':
raise ExtractorError('Login was unsuccessful')
subtask = traverse_obj(
response, ('subtasks', ..., 'subtask_id', {str}), get_all=False)
if not subtask:
raise ExtractorError('Twitter API did not return next login subtask')
self._flow_token = response['flow_token']
return subtask
def _perform_login(self, username, password):
if self.is_logged_in:
headers.update({
'x-twitter-auth-type': 'OAuth2Session',
'x-twitter-client-language': 'en',
'x-twitter-active-user': 'yes',
})
return
last_error = None
for bearer_token in self._TOKENS:
for first_attempt in (True, False):
headers['Authorization'] = f'Bearer {bearer_token}'
webpage = self._download_webpage('https://twitter.com/', None, 'Downloading login page')
guest_token = self._search_regex(
r'\.cookie\s*=\s*["\']gt=(\d+);', webpage, 'gt', default=None) or self._fetch_guest_token(None)
headers = {
**self._set_base_headers(),
'content-type': 'application/json',
'x-guest-token': guest_token,
'x-twitter-client-language': 'en',
'x-twitter-active-user': 'yes',
'Referer': 'https://twitter.com/',
'Origin': 'https://twitter.com',
}
if not self.is_logged_in:
if not self._TOKENS[bearer_token]:
headers.pop('x-guest-token', None)
guest_token_response = self._download_json(
self._API_BASE + 'guest/activate.json', video_id,
'Downloading guest token', data=b'', headers=headers)
def build_login_json(*subtask_inputs):
return json.dumps({
'flow_token': self._flow_token,
'subtask_inputs': subtask_inputs
}, separators=(',', ':')).encode()
self._TOKENS[bearer_token] = guest_token_response.get('guest_token')
if not self._TOKENS[bearer_token]:
raise ExtractorError('Could not retrieve guest token')
def input_dict(subtask_id, text):
return {
'subtask_id': subtask_id,
'enter_text': {
'text': text,
'link': 'next_link'
}
}
headers['x-guest-token'] = self._TOKENS[bearer_token]
next_subtask = self._call_login_api(
'Downloading flow token', headers, query={'flow_name': 'login'}, data=self._LOGIN_INIT_DATA)
try:
allowed_status = {400, 403, 404} if graphql else {403}
result = self._download_json(
(self._GRAPHQL_API_BASE if graphql else self._API_BASE) + path,
video_id, headers=headers, query=query, expected_status=allowed_status)
while not self.is_logged_in:
if next_subtask == 'LoginJsInstrumentationSubtask':
next_subtask = self._call_login_api(
'Submitting JS instrumentation response', headers, data=build_login_json({
'subtask_id': next_subtask,
'js_instrumentation': {
'response': '{}',
'link': 'next_link'
}
}))
except ExtractorError as e:
if last_error:
raise last_error
elif next_subtask == 'LoginEnterUserIdentifierSSO':
next_subtask = self._call_login_api(
'Submitting username', headers, data=build_login_json({
'subtask_id': next_subtask,
'settings_list': {
'setting_responses': [{
'key': 'user_identifier',
'response_data': {
'text_data': {
'result': username
}
}
}],
'link': 'next_link'
}
}))
if not isinstance(e.cause, urllib.error.HTTPError) or e.cause.code != 404:
raise
elif next_subtask == 'LoginEnterAlternateIdentifierSubtask':
next_subtask = self._call_login_api(
'Submitting alternate identifier', headers,
data=build_login_json(input_dict(next_subtask, self._get_tfa_info(
'one of username, phone number or email that was not used as --username'))))
last_error = e
self.report_warning(
'Twitter API gave 404 response, retrying with deprecated auth token. '
'Only one media item can be extracted')
break # continue outer loop with next bearer_token
elif next_subtask == 'LoginEnterPassword':
next_subtask = self._call_login_api(
'Submitting password', headers, data=build_login_json({
'subtask_id': next_subtask,
'enter_password': {
'password': password,
'link': 'next_link'
}
}))
if result.get('errors'):
errors = traverse_obj(result, ('errors', ..., 'message'), expected_type=str)
if first_attempt and any('bad guest token' in error.lower() for error in errors):
self.to_screen('Guest token has expired. Refreshing guest token')
self._TOKENS[bearer_token] = None
continue
elif next_subtask == 'AccountDuplicationCheck':
next_subtask = self._call_login_api(
'Submitting account duplication check', headers, data=build_login_json({
'subtask_id': next_subtask,
'check_logged_in_account': {
'link': 'AccountDuplicationCheck_false'
}
}))
error_message = ', '.join(set(errors)) or 'Unknown error'
raise ExtractorError(f'Error(s) while querying API: {error_message}', expected=True)
elif next_subtask == 'LoginTwoFactorAuthChallenge':
next_subtask = self._call_login_api(
'Submitting 2FA token', headers, data=build_login_json(input_dict(
next_subtask, self._get_tfa_info('two-factor authentication token'))))
return result
elif next_subtask == 'LoginAcid':
next_subtask = self._call_login_api(
'Submitting confirmation code', headers, data=build_login_json(input_dict(
next_subtask, self._get_tfa_info('confirmation code sent to your email or phone'))))
elif next_subtask == 'ArkoseLogin':
self.raise_login_required('Twitter is requiring captcha for this login attempt', method='cookies')
elif next_subtask == 'DenyLoginSubtask':
self.raise_login_required('Twitter rejected this login attempt as suspicious', method='cookies')
elif next_subtask == 'LoginSuccessSubtask':
raise ExtractorError('Twitter API did not grant auth token cookie')
else:
raise ExtractorError(f'Unrecognized subtask ID "{next_subtask}"')
self.report_login()
def _call_api(self, path, video_id, query={}, graphql=False):
headers = self._set_base_headers(legacy=not graphql and self._configuration_arg('legacy_api'))
headers.update({
'x-twitter-auth-type': 'OAuth2Session',
'x-twitter-client-language': 'en',
'x-twitter-active-user': 'yes',
} if self.is_logged_in else {
'x-guest-token': self._fetch_guest_token(video_id)
})
allowed_status = {400, 401, 403, 404} if graphql else {403}
result = self._download_json(
(self._GRAPHQL_API_BASE if graphql else self._API_BASE) + path,
video_id, headers=headers, query=query, expected_status=allowed_status,
note=f'Downloading {"GraphQL" if graphql else "legacy API"} JSON')
if result.get('errors'):
errors = ', '.join(set(traverse_obj(result, ('errors', ..., 'message', {str}))))
if errors and 'not authorized' in errors:
self.raise_login_required(remove_end(errors, '.'))
raise ExtractorError(f'Error(s) while querying API: {errors or "Unknown error"}')
return result
def _build_graphql_query(self, media_id):
raise NotImplementedError('Method must be implemented to support GraphQL')
@@ -313,6 +474,7 @@ class TwitterIE(TwitterBaseIE):
'comment_count': int,
'repost_count': int,
'like_count': int,
'view_count': int,
'tags': [],
'age_limit': 18,
},
@@ -391,6 +553,7 @@ class TwitterIE(TwitterBaseIE):
'comment_count': int,
'repost_count': int,
'like_count': int,
'view_count': int,
'tags': ['Damndaniel'],
'age_limit': 0,
},
@@ -431,6 +594,7 @@ class TwitterIE(TwitterBaseIE):
'comment_count': int,
'repost_count': int,
'like_count': int,
'view_count': int,
'tags': [],
'age_limit': 0,
},
@@ -451,7 +615,7 @@ class TwitterIE(TwitterBaseIE):
# has mp4 formats via mobile API
'url': 'https://twitter.com/news_al3alm/status/852138619213144067',
'info_dict': {
'id': '852138619213144067',
'id': '852077943283097602',
'ext': 'mp4',
'title': 'عالم الأخبار - كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة',
'description': 'كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة https://t.co/xg6OhpyKfN',
@@ -460,8 +624,16 @@ class TwitterIE(TwitterBaseIE):
'duration': 277.4,
'timestamp': 1492000653,
'upload_date': '20170412',
'display_id': '852138619213144067',
'age_limit': 0,
'uploader_url': 'https://twitter.com/news_al3alm',
'thumbnail': r're:^https?://.*\.jpg',
'tags': [],
'repost_count': int,
'view_count': int,
'like_count': int,
'comment_count': int,
},
'skip': 'Account suspended',
}, {
'url': 'https://twitter.com/i/web/status/910031516746514432',
'info_dict': {
@@ -480,6 +652,7 @@ class TwitterIE(TwitterBaseIE):
'comment_count': int,
'repost_count': int,
'like_count': int,
'view_count': int,
'tags': ['Maria'],
'age_limit': 0,
},
@@ -505,6 +678,7 @@ class TwitterIE(TwitterBaseIE):
'comment_count': int,
'repost_count': int,
'like_count': int,
'view_count': int,
'tags': [],
'age_limit': 0,
},
@@ -517,18 +691,19 @@ class TwitterIE(TwitterBaseIE):
'id': '1087791272830607360',
'display_id': '1087791357756956680',
'ext': 'mp4',
'title': 'Twitter - A new is coming. Some of you got an opt-in to try it now. Check out the emoji button, quick keyboard shortcuts, upgraded trends, advanced search, and more. Let us know your thoughts!',
'title': 'X - A new is coming. Some of you got an opt-in to try it now. Check out the emoji button, quick keyboard shortcuts, upgraded trends, advanced search, and more. Let us know your thoughts!',
'thumbnail': r're:^https?://.*\.jpg',
'description': 'md5:6dfd341a3310fb97d80d2bf7145df976',
'uploader': 'Twitter',
'uploader_id': 'Twitter',
'uploader': 'X',
'uploader_id': 'X',
'duration': 61.567,
'timestamp': 1548184644,
'upload_date': '20190122',
'uploader_url': 'https://twitter.com/Twitter',
'uploader_url': 'https://twitter.com/X',
'comment_count': int,
'repost_count': int,
'like_count': int,
'view_count': int,
'tags': [],
'age_limit': 0,
},
@@ -589,6 +764,7 @@ class TwitterIE(TwitterBaseIE):
'comment_count': int,
'repost_count': int,
'like_count': int,
'view_count': int,
'tags': [],
'age_limit': 0,
},
@@ -597,9 +773,9 @@ class TwitterIE(TwitterBaseIE):
'url': 'https://twitter.com/UltimaShadowX/status/1577719286659006464',
'info_dict': {
'id': '1577719286659006464',
'title': 'Ultima | #\u0432\u029f\u043c - Test',
'title': 'Ultima📛 | #вʟм - Test',
'description': 'Test https://t.co/Y3KEZD7Dad',
'uploader': 'Ultima | #\u0432\u029f\u043c',
'uploader': 'Ultima📛 | #вʟм',
'uploader_id': 'UltimaShadowX',
'uploader_url': 'https://twitter.com/UltimaShadowX',
'upload_date': '20221005',
@@ -630,12 +806,12 @@ class TwitterIE(TwitterBaseIE):
'comment_count': int,
'repost_count': int,
'like_count': int,
'view_count': int,
'tags': ['HurricaneIan'],
'age_limit': 0,
},
}, {
# Adult content, uses old token
# Fails if not logged in (GraphQL)
# Adult content, fails if not logged in (GraphQL)
'url': 'https://twitter.com/Rizdraws/status/1575199173472927762',
'info_dict': {
'id': '1575199163847000068',
@@ -655,9 +831,9 @@ class TwitterIE(TwitterBaseIE):
'age_limit': 18,
'tags': []
},
'expected_warnings': ['404'],
'skip': 'Requires authentication',
}, {
# Description is missing one https://t.co url (GraphQL)
# Playlist result only with auth
'url': 'https://twitter.com/Srirachachau/status/1395079556562706435',
'playlist_mincount': 2,
'info_dict': {
@@ -669,14 +845,13 @@ class TwitterIE(TwitterBaseIE):
'upload_date': '20210519',
'age_limit': 0,
'repost_count': int,
'description': 'Here it is! Finished my gothic western cartoon. Pretty proud of it. It\'s got some goofs and lots of splashy over the top violence, something for everyone, hope you like it https://t.co/fOsG5glUnw https://t.co/kbXZrozlY7',
'description': 'Here it is! Finished my gothic western cartoon. Pretty proud of it. It\'s got some goofs and lots of splashy over the top violence, something for everyone, hope you like it https://t.co/fOsG5glUnw',
'uploader_id': 'Srirachachau',
'comment_count': int,
'uploader_url': 'https://twitter.com/Srirachachau',
'timestamp': 1621447860,
},
}, {
# Description is missing one https://t.co url (GraphQL)
'url': 'https://twitter.com/DavidToons_/status/1578353380363501568',
'playlist_mincount': 2,
'info_dict': {
@@ -688,7 +863,7 @@ class TwitterIE(TwitterBaseIE):
'uploader': str,
'timestamp': 1665143744,
'uploader_url': 'https://twitter.com/DavidToons_',
'description': 'Chris sounds like Linda from Bob\'s Burgers, so as an animator: this had to be done. https://t.co/glfQdgfFXH https://t.co/WgJauwIW1w',
'description': 'Chris sounds like Linda from Bob\'s Burgers, so as an animator: this had to be done. https://t.co/WgJauwIW1w',
'tags': [],
'comment_count': int,
'upload_date': '20221007',
@@ -722,11 +897,15 @@ class TwitterIE(TwitterBaseIE):
'uploader': r're:Monique Camarra.+?',
'uploader_id': 'MoniqueCamarra',
'live_status': 'was_live',
'description': 'md5:acce559345fd49f129c20dbcda3f1201',
'timestamp': 1658407771464,
'release_timestamp': 1658417414,
'description': 'md5:4dc8e972f1d8b3c6580376fabb02a3ad',
'timestamp': 1658407771,
'release_date': '20220721',
'upload_date': '20220721',
},
'add_ie': ['TwitterSpaces'],
'params': {'skip_download': 'm3u8'},
'skip': 'Requires authentication',
}, {
# URL specifies video number but --yes-playlist
'url': 'https://twitter.com/CTVJLaidlaw/status/1600649710662213632/video/1',
@@ -752,7 +931,7 @@ class TwitterIE(TwitterBaseIE):
'info_dict': {
'id': '1600649511827013632',
'ext': 'mp4',
'title': 'md5:dac4f4d4c591fcc4e88a253eba472dc3',
'title': 'md5:7662a0a27ce6faa3e5b160340f3cfab1',
'thumbnail': r're:^https?://.+\.jpg',
'timestamp': 1670459604.0,
'uploader_id': 'CTVJLaidlaw',
@@ -764,6 +943,7 @@ class TwitterIE(TwitterBaseIE):
'uploader_url': 'https://twitter.com/CTVJLaidlaw',
'display_id': '1600649710662213632',
'like_count': int,
'view_count': int,
'description': 'md5:591c19ce66fadc2359725d5cd0d1052c',
'upload_date': '20221208',
'age_limit': 0,
@@ -791,6 +971,7 @@ class TwitterIE(TwitterBaseIE):
'like_count': int,
'repost_count': int,
'comment_count': int,
'view_count': int,
},
}, {
'url': 'https://twitter.com/hlo_again/status/1599108751385972737/video/2',
@@ -806,6 +987,7 @@ class TwitterIE(TwitterBaseIE):
'repost_count': int,
'duration': 9.531,
'comment_count': int,
'view_count': int,
'upload_date': '20221203',
'age_limit': 0,
'timestamp': 1670092210.0,
@@ -815,7 +997,6 @@ class TwitterIE(TwitterBaseIE):
},
'params': {'noplaylist': True},
}, {
# Media view count is GraphQL only, force in test
'url': 'https://twitter.com/MunTheShinobi/status/1600009574919962625',
'info_dict': {
'id': '1600009362759733248',
@@ -826,10 +1007,10 @@ class TwitterIE(TwitterBaseIE):
'view_count': int,
'thumbnail': 'https://pbs.twimg.com/ext_tw_video_thumb/1600009362759733248/pu/img/XVhFQivj75H_YxxV.jpg?name=orig',
'age_limit': 0,
'uploader': 'Mün The Shinobi | BlaqBoi\'s Therapist',
'uploader': 'Mün The Friend Of YWAP',
'repost_count': int,
'upload_date': '20221206',
'title': 'Mün The Shinobi | BlaqBoi\'s Therapist - This is a genius ad by Apple. \U0001f525\U0001f525\U0001f525\U0001f525\U0001f525',
'title': 'Mün The Friend Of YWAP - This is a genius ad by Apple. \U0001f525\U0001f525\U0001f525\U0001f525\U0001f525',
'comment_count': int,
'like_count': int,
'tags': [],
@@ -837,7 +1018,53 @@ class TwitterIE(TwitterBaseIE):
'duration': 139.987,
'timestamp': 1670306984.0,
},
'params': {'extractor_args': {'twitter': {'force_graphql': ['']}}},
}, {
# url to retweet id w/ legacy api
'url': 'https://twitter.com/liberdalau/status/1623739803874349067',
'info_dict': {
'id': '1623274794488659969',
'display_id': '1623739803874349067',
'ext': 'mp4',
'title': 'Johnny Bullets - Me after going viral to over 30million people: Whoopsie-daisy',
'description': 'md5:b06864cd3dc2554821cc327f5348485a',
'uploader': 'Johnny Bullets',
'uploader_id': 'Johnnybull3ts',
'uploader_url': 'https://twitter.com/Johnnybull3ts',
'age_limit': 0,
'tags': [],
'duration': 8.033,
'timestamp': 1675853859.0,
'upload_date': '20230208',
'thumbnail': r're:https://pbs\.twimg\.com/ext_tw_video_thumb/.+',
'like_count': int,
'repost_count': int,
},
'params': {'extractor_args': {'twitter': {'legacy_api': ['']}}},
'skip': 'Protected tweet',
}, {
# orig tweet w/ graphql
'url': 'https://twitter.com/liberdalau/status/1623739803874349067',
'info_dict': {
'id': '1623274794488659969',
'display_id': '1623739803874349067',
'ext': 'mp4',
'title': '@selfisekai@hackerspace.pl 🐀 - RT @Johnnybull3ts: Me after going viral to over 30million people: Whoopsie-daisy',
'description': 'md5:9258bdbb54793bdc124fe1cd47e96c6a',
'uploader': '@selfisekai@hackerspace.pl 🐀',
'uploader_id': 'liberdalau',
'uploader_url': 'https://twitter.com/liberdalau',
'age_limit': 0,
'tags': [],
'duration': 8.033,
'timestamp': 1675964711.0,
'upload_date': '20230209',
'thumbnail': r're:https://pbs\.twimg\.com/ext_tw_video_thumb/.+',
'like_count': int,
'view_count': int,
'repost_count': int,
'comment_count': int,
},
'skip': 'Protected tweet',
}, {
# onion route
'url': 'https://twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid.onion/TwitterBlue/status/1484226494708662273',
@@ -880,15 +1107,23 @@ class TwitterIE(TwitterBaseIE):
result = traverse_obj(data, (
'threaded_conversation_with_injections_v2', 'instructions', 0, 'entries',
lambda _, v: v['entryId'] == f'tweet-{twid}', 'content', 'itemContent',
'tweet_results', 'result', ('tweet', None),
), expected_type=dict, default={}, get_all=False)
'tweet_results', 'result', ('tweet', None), {dict},
), default={}, get_all=False) if self.is_logged_in else traverse_obj(
data, ('tweetResult', 'result', {dict}), default={})
if result.get('__typename') not in ('Tweet', None):
if result.get('__typename') not in ('Tweet', 'TweetTombstone', 'TweetUnavailable', None):
self.report_warning(f'Unknown typename: {result.get("__typename")}', twid, only_once=True)
if 'tombstone' in result:
cause = traverse_obj(result, ('tombstone', 'text', 'text'), expected_type=str)
cause = remove_end(traverse_obj(result, ('tombstone', 'text', 'text', {str})), '. Learn more')
raise ExtractorError(f'Twitter API says: {cause or "Unknown error"}', expected=True)
elif result.get('__typename') == 'TweetUnavailable':
reason = result.get('reason')
if reason == 'NsfwLoggedOut':
self.raise_login_required('NSFW tweet requires authentication')
elif reason == 'Protected':
self.raise_login_required('You are not authorized to view this protected tweet')
raise ExtractorError(reason or 'Requested tweet is unavailable', expected=True)
status = result.get('legacy', {})
status.update(traverse_obj(result, {
@@ -900,7 +1135,7 @@ class TwitterIE(TwitterBaseIE):
# extra transformation is needed since result does not match legacy format
binding_values = {
binding_value.get('key'): binding_value.get('value')
for binding_value in traverse_obj(status, ('card', 'binding_values', ...), expected_type=dict)
for binding_value in traverse_obj(status, ('card', 'binding_values', ..., {dict}))
}
if binding_values:
status['card']['binding_values'] = binding_values
@@ -939,25 +1174,74 @@ class TwitterIE(TwitterBaseIE):
'verified_phone_label_enabled': False,
'vibe_api_enabled': True,
},
} if self.is_logged_in else {
'variables': {
'tweetId': media_id,
'withCommunity': False,
'includePromotedContent': False,
'withVoice': False,
},
'features': {
'creator_subscriptions_tweet_preview_api_enabled': True,
'tweetypie_unmention_optimization_enabled': True,
'responsive_web_edit_tweet_api_enabled': True,
'graphql_is_translatable_rweb_tweet_is_translatable_enabled': True,
'view_counts_everywhere_api_enabled': True,
'longform_notetweets_consumption_enabled': True,
'responsive_web_twitter_article_tweet_consumption_enabled': False,
'tweet_awards_web_tipping_enabled': False,
'freedom_of_speech_not_reach_fetch_enabled': True,
'standardized_nudges_misinfo': True,
'tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled': True,
'longform_notetweets_rich_text_read_enabled': True,
'longform_notetweets_inline_media_enabled': True,
'responsive_web_graphql_exclude_directive_enabled': True,
'verified_phone_label_enabled': False,
'responsive_web_media_download_video_enabled': False,
'responsive_web_graphql_skip_user_profile_image_extensions_enabled': False,
'responsive_web_graphql_timeline_navigation_enabled': True,
'responsive_web_enhance_cards_enabled': False
},
'fieldToggles': {
'withArticleRichContentState': False
}
}
def _real_extract(self, url):
twid, selected_index = self._match_valid_url(url).group('id', 'index')
if self.is_logged_in or self._configuration_arg('force_graphql'):
self.write_debug(f'Using GraphQL API (Auth = {self.is_logged_in})')
result = self._call_graphql_api('zZXycP0V6H7m-2r0mOnFcA/TweetDetail', twid)
status = self._graphql_to_legacy(result, twid)
def _extract_status(self, twid):
if self.is_logged_in:
return self._graphql_to_legacy(
self._call_graphql_api('zZXycP0V6H7m-2r0mOnFcA/TweetDetail', twid), twid)
else:
status = self._call_api(f'statuses/show/{twid}.json', twid, {
try:
if not self._configuration_arg('legacy_api'):
return self._graphql_to_legacy(
self._call_graphql_api('2ICDjqPd81tulZcYrtpTuQ/TweetResultByRestId', twid), twid)
return traverse_obj(self._call_api(f'statuses/show/{twid}.json', twid, {
'cards_platform': 'Web-12',
'include_cards': 1,
'include_reply_count': 1,
'include_user_entities': 0,
'tweet_mode': 'extended',
})
}), 'retweeted_status', None)
title = description = status['full_text'].replace('\n', ' ')
except ExtractorError as e:
if e.expected:
raise
self.report_warning(
f'{e.orig_msg}. Falling back to syndication endpoint; some metadata may be missing', twid)
status = self._download_json(
'https://cdn.syndication.twimg.com/tweet-result', twid, 'Downloading syndication JSON',
headers={'User-Agent': 'Googlebot'}, query={'id': twid})
status['extended_entities'] = {'media': status.get('mediaDetails')}
return status
def _real_extract(self, url):
twid, selected_index = self._match_valid_url(url).group('id', 'index')
status = self._extract_status(twid)
title = description = traverse_obj(
status, (('full_text', 'text'), {lambda x: x.replace('\n', ' ')}), get_all=False) or ''
# strip 'https -_t.co_BJYgOjSeGA' junk from filenames
title = re.sub(r'\s+(https?://[^ ]+)', '', title)
user = status.get('user') or {}
@@ -982,13 +1266,15 @@ class TwitterIE(TwitterBaseIE):
}
def extract_from_video_info(media):
media_id = traverse_obj(media, 'id_str', 'id', expected_type=str_or_none)
media_id = traverse_obj(media, 'id_str', 'id', (
'video_info', 'variants', ..., 'url',
{functools.partial(re.search, r'_video/(\d+)/')}, 1
), get_all=False, expected_type=str_or_none) or twid
self.write_debug(f'Extracting from video info: {media_id}')
video_info = media.get('video_info') or {}
formats = []
subtitles = {}
for variant in video_info.get('variants', []):
for variant in traverse_obj(media, ('video_info', 'variants', ...)):
fmts, subs = self._extract_variant_formats(variant, twid)
subtitles = self._merge_subtitles(subtitles, subs)
formats.extend(fmts)
@@ -1013,7 +1299,7 @@ class TwitterIE(TwitterBaseIE):
'subtitles': subtitles,
'thumbnails': thumbnails,
'view_count': traverse_obj(media, ('mediaStats', 'viewCount', {int_or_none})),
'duration': float_or_none(video_info.get('duration_millis'), 1000),
'duration': float_or_none(traverse_obj(media, ('video_info', 'duration_millis')), 1000),
# The codec of http formats are unknown
'_format_sort_fields': ('res', 'br', 'size', 'proto'),
}
@@ -1098,7 +1384,8 @@ class TwitterIE(TwitterBaseIE):
if self._yes_playlist(twid, selected_index, video_label='URL-specified video number'):
selected_entries = (*map(extract_from_video_info, videos), *extract_from_card_info(status.get('card')))
else:
desired_obj = traverse_obj(status, ('extended_entities', 'media', int(selected_index) - 1, {dict}))
desired_obj = traverse_obj(status, (
(None, 'quoted_status'), 'extended_entities', 'media', int(selected_index) - 1, {dict}), get_all=False)
if not desired_obj:
raise ExtractorError(f'Video #{selected_index} is unavailable', expected=True)
elif desired_obj.get('type') != 'video':
@@ -1120,7 +1407,8 @@ class TwitterIE(TwitterBaseIE):
if not entries:
expanded_url = traverse_obj(status, ('entities', 'urls', 0, 'expanded_url'), expected_type=url_or_none)
if not expanded_url or expanded_url == url:
raise ExtractorError('No video could be found in this tweet', expected=True)
self.raise_no_formats('No video could be found in this tweet', expected=True)
return info
return self.url_result(expanded_url, display_id=twid, **info)
@@ -1244,7 +1532,42 @@ class TwitterSpacesIE(TwitterBaseIE):
'uploader': r're:Lucio Di Gaetano.*?',
'uploader_id': 'luciodigaetano',
'live_status': 'was_live',
'timestamp': 1659877956397,
'timestamp': 1659877956,
'upload_date': '20220807',
'release_timestamp': 1659904215,
'release_date': '20220807',
},
'params': {'skip_download': 'm3u8'},
}, {
# post_live/TimedOut but downloadable
'url': 'https://twitter.com/i/spaces/1vAxRAVQWONJl',
'info_dict': {
'id': '1vAxRAVQWONJl',
'ext': 'm4a',
'title': 'Framing Up FinOps: Billing Tools',
'description': 'Twitter Space participated by rupa, Alfonso Hernandez',
'uploader': 'Google Cloud',
'uploader_id': 'googlecloud',
'live_status': 'post_live',
'timestamp': 1681409554,
'upload_date': '20230413',
'release_timestamp': 1681839000,
'release_date': '20230418',
},
'params': {'skip_download': 'm3u8'},
}, {
# Needs ffmpeg as downloader, see: https://github.com/yt-dlp/yt-dlp/issues/7536
'url': 'https://twitter.com/i/spaces/1eaKbrQbjoRKX',
'info_dict': {
'id': '1eaKbrQbjoRKX',
'ext': 'm4a',
'title': '',
'description': 'Twitter Space participated by nobody yet',
'uploader': '息根とめる🔪Twitchで復活',
'uploader_id': 'tomeru_ikinone',
'live_status': 'was_live',
'timestamp': 1685617198,
'upload_date': '20230601',
},
'params': {'skip_download': 'm3u8'},
}]
@@ -1284,32 +1607,39 @@ class TwitterSpacesIE(TwitterBaseIE):
def _real_extract(self, url):
space_id = self._match_id(url)
if not self.is_logged_in:
self.raise_login_required('Twitter Spaces require authentication')
space_data = self._call_graphql_api('HPEisOmj1epUNLCWTYhUWw/AudioSpaceById', space_id)['audioSpace']
if not space_data:
raise ExtractorError('Twitter Space not found', expected=True)
metadata = space_data['metadata']
live_status = try_call(lambda: self.SPACE_STATUS[metadata['state'].lower()])
is_live = live_status == 'is_live'
formats = []
if live_status == 'is_upcoming':
self.raise_no_formats('Twitter Space not started yet', expected=True)
elif live_status == 'post_live':
self.raise_no_formats('Twitter Space ended but not downloadable yet', expected=True)
else:
source = self._call_api(
f'live_video_stream/status/{metadata["media_key"]}', metadata['media_key'])['source']
# XXX: Native downloader does not work
formats = self._extract_m3u8_formats(
traverse_obj(source, 'noRedirectPlaybackUrl', 'location'),
metadata['media_key'], 'm4a', 'm3u8', live=live_status == 'is_live',
headers={'Referer': 'https://twitter.com/'})
elif not is_live and not metadata.get('is_space_available_for_replay'):
self.raise_no_formats('Twitter Space ended and replay is disabled', expected=True)
elif metadata.get('media_key'):
source = traverse_obj(
self._call_api(f'live_video_stream/status/{metadata["media_key"]}', metadata['media_key']),
('source', ('noRedirectPlaybackUrl', 'location'), {url_or_none}), get_all=False)
formats = self._extract_m3u8_formats( # XXX: Some Spaces need ffmpeg as downloader
source, metadata['media_key'], 'm4a', entry_protocol='m3u8', live=is_live,
headers={'Referer': 'https://twitter.com/'}, fatal=False) if source else []
for fmt in formats:
fmt.update({'vcodec': 'none', 'acodec': 'aac'})
if not is_live:
fmt['container'] = 'm4a_dash'
participants = ', '.join(traverse_obj(
space_data, ('participants', 'speakers', ..., 'display_name'))) or 'nobody yet'
if not formats and live_status == 'post_live':
self.raise_no_formats('Twitter Space ended but not downloadable yet', expected=True)
return {
'id': space_id,
'title': metadata.get('title'),
@@ -1319,7 +1649,9 @@ class TwitterSpacesIE(TwitterBaseIE):
'uploader_id': traverse_obj(
metadata, ('creator_results', 'result', 'legacy', 'screen_name')),
'live_status': live_status,
'timestamp': metadata.get('created_at'),
'release_timestamp': try_call(
lambda: int_or_none(metadata['scheduled_start'], scale=1000)),
'timestamp': int_or_none(metadata.get('created_at'), scale=1000),
'formats': formats,
}
@@ -1335,7 +1667,7 @@ class TwitterShortenerIE(TwitterBaseIE):
if eid:
id = eid
url = self._BASE_URL + id
new_url = self._request_webpage(url, id, headers={'User-Agent': 'curl'}).geturl()
new_url = self._request_webpage(url, id, headers={'User-Agent': 'curl'}).url
__UNSAFE_LINK = "https://twitter.com/safety/unsafe_link_warning?unsafe_link="
if new_url.startswith(__UNSAFE_LINK):
new_url = new_url.replace(__UNSAFE_LINK, "")
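
Usage note: the updated extractor wires up a `legacy_api` extractor argument (see the test `params` in the diff above) and now raises login-required errors for NSFW/protected tweets. A minimal sketch of exercising those options through the bundled yt-dlp, assuming the standard YoutubeDL API; the tweet URL is a placeholder, not something from this commit:

# Minimal usage sketch, assuming the vendored yt-dlp exposes the usual
# yt_dlp.YoutubeDL interface. The URL below is a placeholder.
import yt_dlp

ydl_opts = {
    # Force the older statuses/show endpoint instead of GraphQL,
    # mirroring the 'legacy_api' extractor arg used in the tests above.
    'extractor_args': {'twitter': {'legacy_api': ['']}},
    # NSFW and protected tweets now require authentication; cookies can
    # be supplied if needed, e.g.:
    # 'cookiefile': 'cookies.txt',
}

with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    ydl.download(['https://twitter.com/<user>/status/<tweet_id>'])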