diff --git a/plugins/youtube_download/yt_dlp/YoutubeDL.py b/plugins/youtube_download/yt_dlp/YoutubeDL.py index 4891b3f..666d89b 100644 --- a/plugins/youtube_download/yt_dlp/YoutubeDL.py +++ b/plugins/youtube_download/yt_dlp/YoutubeDL.py @@ -1,9 +1,10 @@ import collections import contextlib +import copy import datetime import errno import fileinput -import functools +import http.cookiejar import io import itertools import json @@ -13,6 +14,7 @@ import os import random import re import shutil +import string import subprocess import sys import tempfile @@ -20,18 +22,27 @@ import time import tokenize import traceback import unicodedata -import urllib.request -from string import ascii_letters from .cache import Cache -from .compat import compat_os_name, compat_shlex_quote -from .cookies import load_cookies +from .compat import functools, urllib # isort: split +from .compat import compat_os_name, compat_shlex_quote, urllib_req_to_req +from .cookies import LenientSimpleCookie, load_cookies from .downloader import FFmpegFD, get_suitable_downloader, shorten_protocol_name from .downloader.rtmp import rtmpdump_version from .extractor import gen_extractor_classes, get_info_extractor from .extractor.common import UnsupportedURLIE from .extractor.openload import PhantomJSwrapper from .minicurses import format_text +from .networking import HEADRequest, Request, RequestDirector +from .networking.common import _REQUEST_HANDLERS, _RH_PREFERENCES +from .networking.exceptions import ( + HTTPError, + NoSupportingHandlers, + RequestError, + SSLError, + _CompatHTTPError, + network_exceptions, +) from .plugins import directories as plugin_directories from .postprocessor import _PLUGIN_CLASSES as plugin_pps from .postprocessor import ( @@ -70,13 +81,11 @@ from .utils import ( ExtractorError, FormatSorter, GeoRestrictedError, - HEADRequest, ISO3166Utils, LazyList, MaxDownloadsReached, Namespace, PagedList, - PerRequestProxyHandler, PlaylistEntries, Popen, PostProcessingError, @@ -85,9 +94,6 
@@ from .utils import ( SameFileError, UnavailableVideoError, UserNotLive, - YoutubeDLCookieProcessor, - YoutubeDLHandler, - YoutubeDLRedirectHandler, age_restricted, args_to_str, bug_reports_message, @@ -100,6 +106,7 @@ from .utils import ( error_to_compat_str, escapeHTML, expand_path, + extract_basic_auth, filter_dict, float_or_none, format_bytes, @@ -115,24 +122,18 @@ from .utils import ( locked_file, make_archive_id, make_dir, - make_HTTPS_handler, - merge_headers, - network_exceptions, number_of_digits, orderedSet, orderedSet_from_options, parse_filesize, preferredencoding, prepend_extension, - register_socks_protocols, remove_terminal_sequences, render_table, replace_extension, sanitize_filename, sanitize_path, sanitize_url, - sanitized_Request, - std_headers, str_or_none, strftime_or_none, subtitles_filename, @@ -150,7 +151,14 @@ from .utils import ( write_json_file, write_string, ) -from .version import RELEASE_GIT_HEAD, VARIANT, __version__ +from .utils._utils import _YDLLogger +from .utils.networking import ( + HTTPHeaderDict, + clean_headers, + clean_proxies, + std_headers, +) +from .version import CHANNEL, RELEASE_GIT_HEAD, VARIANT, __version__ if compat_os_name == 'nt': import ctypes @@ -190,6 +198,8 @@ class YoutubeDL: ap_username: Multiple-system operator account username. ap_password: Multiple-system operator account password. usenetrc: Use netrc for authentication instead. + netrc_location: Location of the netrc file. Defaults to ~/.netrc. + netrc_cmd: Use a shell command to get credentials verbose: Print additional info to stdout. quiet: Do not print messages to stdout. no_warnings: Do not print out anything for warnings. @@ -246,8 +256,6 @@ class YoutubeDL: overwrites: Overwrite all video and metadata files if True, overwrite only non-video files if None and don't overwrite any file if False - For compatibility with youtube-dl, - "nooverwrites" may also be used instead playlist_items: Specific indices of playlist to download. 
playlistrandom: Download playlist items in random order. lazy_playlist: Process playlist entries as they are received. @@ -258,7 +266,7 @@ class YoutubeDL: consoletitle: Display progress in console window's titlebar. writedescription: Write the video description to a .description file writeinfojson: Write the video description to a .info.json file - clean_infojson: Remove private fields from the infojson + clean_infojson: Remove internal metadata from the infojson getcomments: Extract video comments. This will not be written to disk unless writeinfojson is also given writeannotations: Write the video annotations to a .annotations.xml file @@ -280,7 +288,7 @@ class YoutubeDL: subtitles. The language can be prefixed with a "-" to exclude it from the requested languages, e.g. ['all', '-live_chat'] keepvideo: Keep the video file after post-processing - daterange: A DateRange object, download only if the upload_date is in the range. + daterange: A utils.DateRange object, download only if the upload_date is in the range. skip_download: Skip the actual download of the video file cachedir: Location of the cache files in the filesystem. False to disable filesystem cache. @@ -300,8 +308,6 @@ class YoutubeDL: Videos already present in the file are not downloaded again. break_on_existing: Stop the download process after attempting to download a file that is in the archive. - break_on_reject: Stop the download process when encountering a video that - has been filtered out. break_per_url: Whether break_on_reject and break_on_existing should act on each input URL as opposed to for the entire queue cookiefile: File name or text stream from where cookies should be read and dumped to @@ -331,13 +337,13 @@ class YoutubeDL: 'auto' for elaborate guessing encoding: Use this encoding instead of the system-specified. extract_flat: Whether to resolve and process url_results further - * False: Always process (default) + * False: Always process. 
Default for API * True: Never process * 'in_playlist': Do not process inside playlist/multi_video * 'discard': Always process, but don't return the result from inside playlist/multi_video * 'discard_in_playlist': Same as "discard", but only for - playlists (not multi_video) + playlists (not multi_video). Default for CLI wait_for_video: If given, wait for scheduled streams to become available. The value should be a tuple containing the range (min_secs, max_secs) to wait between retries @@ -414,8 +420,15 @@ class YoutubeDL: - If it returns None, the video is downloaded. - If it returns utils.NO_DEFAULT, the user is interactively asked whether to download the video. + - Raise utils.DownloadCancelled(msg) to abort remaining + downloads when a video is rejected. match_filter_func in utils.py is one example for this. - no_color: Do not emit color codes in output. + color: A Dictionary with output stream names as keys + and their respective color policy as values. + Can also just be a single color policy, + in which case it applies to all outputs. + Valid stream names are 'stdout' and 'stderr'. + Valid color policies are one of 'always', 'auto', 'no_color' or 'never'. geo_bypass: Bypass geographic restriction via faking X-Forwarded-For HTTP header geo_bypass_country: @@ -472,7 +485,7 @@ class YoutubeDL: can also be used The following options are used by the extractors: - extractor_retries: Number of times to retry for known errors + extractor_retries: Number of times to retry for known errors (default: 3) dynamic_mpd: Whether to process dynamic DASH manifests (default: True) hls_split_discontinuity: Split HLS playlists to different formats at discontinuities such as ad breaks (default: False) @@ -483,6 +496,9 @@ class YoutubeDL: The following options are deprecated and may be removed in the future: + break_on_reject: Stop the download process when encountering a video that + has been filtered out. 
+ - `raise DownloadCancelled(msg)` in match_filter instead force_generic_extractor: Force downloader to use the generic extractor - Use allowed_extractors = ['generic', 'default'] playliststart: - Use playlist_items @@ -534,6 +550,8 @@ class YoutubeDL: data will be downloaded and processed by extractor. You can reduce network I/O by disabling it if you don't care about HLS. (only for youtube) + no_color: Same as `color='no_color'` + no_overwrites: Same as `overwrites=False` """ _NUMERIC_FIELDS = { @@ -553,7 +571,7 @@ class YoutubeDL: 'width', 'height', 'aspect_ratio', 'resolution', 'dynamic_range', 'tbr', 'abr', 'acodec', 'asr', 'audio_channels', 'vbr', 'fps', 'vcodec', 'container', 'filesize', 'filesize_approx', 'rows', 'columns', 'player_url', 'protocol', 'fragment_base_url', 'fragments', 'is_from_start', - 'preference', 'language', 'language_preference', 'quality', 'source_preference', + 'preference', 'language', 'language_preference', 'quality', 'source_preference', 'cookies', 'http_headers', 'stretched_ratio', 'no_resume', 'has_drm', 'extra_param_to_segment_url', 'hls_aes', 'downloader_options', 'page_url', 'app', 'play_path', 'tc_url', 'flash_version', 'rtmp_live', 'rtmp_conn', 'rtmp_protocol', 'rtmp_real_time' } @@ -585,6 +603,7 @@ class YoutubeDL: self._playlist_level = 0 self._playlist_urls = set() self.cache = Cache(self) + self.__header_cookies = [] stdout = sys.stderr if self.params.get('logtostderr') else sys.stdout self._out_files = Namespace( @@ -600,9 +619,25 @@ class YoutubeDL: except Exception as e: self.write_debug(f'Failed to enable VT mode: {e}') + if self.params.get('no_color'): + if self.params.get('color') is not None: + self.params.setdefault('_warnings', []).append( + 'Overwriting params from "color" with "no_color"') + self.params['color'] = 'no_color' + + term_allow_color = os.environ.get('TERM', '').lower() != 'dumb' + + def process_color_policy(stream): + stream_name = {sys.stdout: 'stdout', sys.stderr: 'stderr'}[stream] + policy = 
traverse_obj(self.params, ('color', (stream_name, None), {str}), get_all=False) + if policy in ('auto', None): + return term_allow_color and supports_terminal_sequences(stream) + assert policy in ('always', 'never', 'no_color'), policy + return {'always': True, 'never': False}.get(policy, policy) + self._allow_colors = Namespace(**{ - type_: not self.params.get('no_color') and supports_terminal_sequences(stream) - for type_, stream in self._out_files.items_ if type_ != 'console' + name: process_color_policy(stream) + for name, stream in self._out_files.items_ if name != 'console' }) # The code is left like this to be reused for future deprecations @@ -614,7 +649,7 @@ class YoutubeDL: '\n You will no longer receive updates on this version') if current_version < MIN_SUPPORTED: msg = 'Python version %d.%d is no longer supported' - self.deprecation_warning( + self.deprecated_feature( f'{msg}! Please update to Python %d.%d or above' % (*current_version, *MIN_RECOMMENDED)) if self.params.get('allow_unplayable_formats'): @@ -645,6 +680,11 @@ class YoutubeDL: raise self.params['compat_opts'] = set(self.params.get('compat_opts', ())) + self.params['http_headers'] = HTTPHeaderDict(std_headers, self.params.get('http_headers')) + self._load_cookies(self.params['http_headers'].get('Cookie')) # compat + self.params['http_headers'].pop('Cookie', None) + self._request_director = self.build_request_director(_REQUEST_HANDLERS.values(), _RH_PREFERENCES) + if auto_init and auto_init != 'no_verbose_header': self.print_debug_header() @@ -715,9 +755,6 @@ class YoutubeDL: else self.params['format'] if callable(self.params['format']) else self.build_format_selector(self.params['format'])) - # Set http_headers defaults according to std_headers - self.params['http_headers'] = merge_headers(std_headers, self.params.get('http_headers', {})) - hooks = { 'post_hooks': self.add_post_hook, 'progress_hooks': self.add_progress_hook, @@ -734,9 +771,6 @@ class YoutubeDL: 
get_postprocessor(pp_def.pop('key'))(self, **pp_def), when=when) - self._setup_opener() - register_socks_protocols() - def preload_download_archive(fn): """Preload the archive, if any is specified""" archive = set() @@ -912,11 +946,17 @@ class YoutubeDL: self.save_console_title() return self + def save_cookies(self): + if self.params.get('cookiefile') is not None: + self.cookiejar.save() + def __exit__(self, *args): self.restore_console_title() + self.close() - if self.params.get('cookiefile') is not None: - self.cookiejar.save(ignore_discard=True, ignore_expires=True) + def close(self): + self.save_cookies() + self._request_director.close() def trouble(self, message=None, tb=None, is_error=True): """Determine action to take when a download problem appears. @@ -959,6 +999,7 @@ class YoutubeDL: ID='green', DELIM='blue', ERROR='red', + BAD_FORMAT='light red', WARNING='yellow', SUPPRESS='light black', ) @@ -972,7 +1013,7 @@ class YoutubeDL: text = text.encode(encoding, 'ignore').decode(encoding) if fallback is not None and text != original_text: text = fallback - return format_text(text, f) if allow_colors else text if fallback is None else fallback + return format_text(text, f) if allow_colors is True else text if fallback is None else fallback def _format_out(self, *args, **kwargs): return self._format_text(self._out_files.out, self._allow_colors.out, *args, **kwargs) @@ -1075,7 +1116,7 @@ class YoutubeDL: # correspondingly that is not what we want since we need to keep # '%%' intact for template dict substitution step. Working around # with boundary-alike separator hack. 
- sep = ''.join(random.choices(ascii_letters, k=32)) + sep = ''.join(random.choices(string.ascii_letters, k=32)) outtmpl = outtmpl.replace('%%', f'%{sep}%').replace('$$', f'${sep}$') # outtmpl should be expand_path'ed before template dict substitution @@ -1153,7 +1194,7 @@ class YoutubeDL: } MATH_FIELD_RE = rf'(?:{FIELD_RE}|-?{NUMBER_RE})' MATH_OPERATORS_RE = r'(?:%s)' % '|'.join(map(re.escape, MATH_FUNCTIONS.keys())) - INTERNAL_FORMAT_RE = re.compile(rf'''(?x) + INTERNAL_FORMAT_RE = re.compile(rf'''(?xs) (?P-)? (?P{FIELD_RE}) (?P(?:{MATH_OPERATORS_RE}{MATH_FIELD_RE})*) @@ -1234,32 +1275,45 @@ class YoutubeDL: return list(obj) return repr(obj) + class _ReplacementFormatter(string.Formatter): + def get_field(self, field_name, args, kwargs): + if field_name.isdigit(): + return args[0], -1 + raise ValueError('Unsupported field') + + replacement_formatter = _ReplacementFormatter() + def create_key(outer_mobj): if not outer_mobj.group('has_key'): return outer_mobj.group(0) key = outer_mobj.group('key') mobj = re.match(INTERNAL_FORMAT_RE, key) - initial_field = mobj.group('fields') if mobj else '' - value, replacement, default = None, None, na + value, replacement, default, last_field = None, None, na, '' while mobj: mobj = mobj.groupdict() default = mobj['default'] if mobj['default'] is not None else default value = get_value(mobj) - replacement = mobj['replacement'] + last_field, replacement = mobj['fields'], mobj['replacement'] if value is None and mobj['alternate']: mobj = re.match(INTERNAL_FORMAT_RE, mobj['remaining'][1:]) else: break - fmt = outer_mobj.group('format') - if fmt == 's' and value is not None and key in field_size_compat_map.keys(): - fmt = f'0{field_size_compat_map[key]:d}d' + if None not in (value, replacement): + try: + value = replacement_formatter.format(replacement, value) + except ValueError: + value, default = None, na - value = default if value is None else value if replacement is None else replacement + fmt = outer_mobj.group('format') + if 
fmt == 's' and last_field in field_size_compat_map.keys() and isinstance(value, int): + fmt = f'0{field_size_compat_map[last_field]:d}d' flags = outer_mobj.group('conversion') or '' str_fmt = f'{fmt[:-1]}s' - if fmt[-1] == 'l': # list + if value is None: + value, fmt = default, 's' + elif fmt[-1] == 'l': # list delim = '\n' if '#' in flags else ', ' value, fmt = delim.join(map(str, variadic(value, allowed_types=(str, bytes)))), str_fmt elif fmt[-1] == 'j': # json @@ -1284,24 +1338,26 @@ class YoutubeDL: value = format_decimal_suffix(value, f'%{num_fmt}f%s' if num_fmt else '%d%s', factor=1024 if '#' in flags else 1000) elif fmt[-1] == 'S': # filename sanitization - value, fmt = filename_sanitizer(initial_field, value, restricted='#' in flags), str_fmt + value, fmt = filename_sanitizer(last_field, value, restricted='#' in flags), str_fmt elif fmt[-1] == 'c': if value: value = str(value)[0] else: fmt = str_fmt - elif fmt[-1] not in 'rs': # numeric + elif fmt[-1] not in 'rsa': # numeric value = float_or_none(value) if value is None: value, fmt = default, 's' if sanitize: + # If value is an object, sanitize might convert it to a string + # So we convert it to repr first if fmt[-1] == 'r': - # If value is an object, sanitize might convert it to a string - # So we convert it to repr first value, fmt = repr(value), str_fmt - if fmt[-1] in 'csr': - value = sanitizer(initial_field, value) + elif fmt[-1] == 'a': + value, fmt = ascii(value), str_fmt + if fmt[-1] in 'csra': + value = sanitizer(last_field, value) key = '%s\0%s' % (key.replace('%', '%\0'), outer_mobj.group('format')) TMPL_DICT[key] = value @@ -1366,7 +1422,7 @@ class YoutubeDL: def _match_entry(self, info_dict, incomplete=False, silent=False): """Returns None if the file should be downloaded""" - _type = info_dict.get('_type', 'video') + _type = 'video' if 'playlist-match-filter' in self.params['compat_opts'] else info_dict.get('_type', 'video') assert incomplete or _type == 'video', 'Only video result can be 
considered complete' video_title = info_dict.get('title', info_dict.get('id', 'entry')) @@ -1407,31 +1463,47 @@ class YoutubeDL: return 'Skipping "%s" because it is age restricted' % video_title match_filter = self.params.get('match_filter') - if match_filter is not None: + if match_filter is None: + return None + + cancelled = None + try: try: ret = match_filter(info_dict, incomplete=incomplete) except TypeError: # For backward compatibility ret = None if incomplete else match_filter(info_dict) - if ret is NO_DEFAULT: - while True: - filename = self._format_screen(self.prepare_filename(info_dict), self.Styles.FILENAME) - reply = input(self._format_screen( - f'Download "{filename}"? (Y/n): ', self.Styles.EMPHASIS)).lower().strip() - if reply in {'y', ''}: - return None - elif reply == 'n': - return f'Skipping {video_title}' - elif ret is not None: - return ret - return None + except DownloadCancelled as err: + if err.msg is not NO_DEFAULT: + raise + ret, cancelled = err.msg, err + + if ret is NO_DEFAULT: + while True: + filename = self._format_screen(self.prepare_filename(info_dict), self.Styles.FILENAME) + reply = input(self._format_screen( + f'Download "{filename}"? 
(Y/n): ', self.Styles.EMPHASIS)).lower().strip() + if reply in {'y', ''}: + return None + elif reply == 'n': + if cancelled: + raise type(cancelled)(f'Skipping {video_title}') + return f'Skipping {video_title}' + return ret if self.in_download_archive(info_dict): - reason = '%s has already been recorded in the archive' % video_title + reason = ''.join(( + format_field(info_dict, 'id', f'{self._format_screen("%s", self.Styles.ID)}: '), + format_field(info_dict, 'title', f'{self._format_screen("%s", self.Styles.EMPHASIS)} '), + 'has already been recorded in the archive')) break_opt, break_err = 'break_on_existing', ExistingVideoReached else: - reason = check_filter() - break_opt, break_err = 'break_on_reject', RejectedVideoReached + try: + reason = check_filter() + except DownloadCancelled as e: + reason, break_opt, break_err = e.msg, 'match_filter', type(e) + else: + break_opt, break_err = 'break_on_reject', RejectedVideoReached if reason is not None: if not silent: self.to_screen('[download] ' + reason) @@ -1484,7 +1556,8 @@ class YoutubeDL: temp_id = ie.get_temp_id(url) if temp_id is not None and self.in_download_archive({'id': temp_id, 'ie_key': key}): - self.to_screen(f'[{key}] {temp_id}: has already been recorded in the archive') + self.to_screen(f'[download] {self._format_screen(temp_id, self.Styles.ID)}: ' + 'has already been recorded in the archive') if self.params.get('break_on_existing', False): raise ExistingVideoReached() break @@ -1572,8 +1645,67 @@ class YoutubeDL: self.to_screen('') raise + def _load_cookies(self, data, *, autoscope=True): + """Loads cookies from a `Cookie` header + + This tries to work around the security vulnerability of passing cookies to every domain. 
+ See: https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-v8mc-9377-rwjj + + @param data The Cookie header as string to load the cookies from + @param autoscope If `False`, scope cookies using Set-Cookie syntax and error for cookie without domains + If `True`, save cookies for later to be stored in the jar with a limited scope + If a URL, save cookies in the jar with the domain of the URL + """ + for cookie in LenientSimpleCookie(data).values(): + if autoscope and any(cookie.values()): + raise ValueError('Invalid syntax in Cookie Header') + + domain = cookie.get('domain') or '' + expiry = cookie.get('expires') + if expiry == '': # 0 is valid + expiry = None + prepared_cookie = http.cookiejar.Cookie( + cookie.get('version') or 0, cookie.key, cookie.value, None, False, + domain, True, True, cookie.get('path') or '', bool(cookie.get('path')), + cookie.get('secure') or False, expiry, False, None, None, {}) + + if domain: + self.cookiejar.set_cookie(prepared_cookie) + elif autoscope is True: + self.deprecated_feature( + 'Passing cookies as a header is a potential security risk; ' + 'they will be scoped to the domain of the downloaded urls. ' + 'Please consider loading cookies from a file or browser instead.') + self.__header_cookies.append(prepared_cookie) + elif autoscope: + self.report_warning( + 'The extractor result contains an unscoped cookie as an HTTP header. ' + f'If you are using yt-dlp with an input URL{bug_reports_message(before=",")}', + only_once=True) + self._apply_header_cookies(autoscope, [prepared_cookie]) + else: + self.report_error('Unscoped cookies are not allowed; please specify some sort of scoping', + tb=False, is_error=False) + + def _apply_header_cookies(self, url, cookies=None): + """Applies stray header cookies to the provided url + + This loads header cookies and scopes them to the domain provided in `url`. 
+ While this is not ideal, it helps reduce the risk of them being sent + to an unintended destination while mostly maintaining compatibility. + """ + parsed = urllib.parse.urlparse(url) + if not parsed.hostname: + return + + for cookie in map(copy.copy, cookies or self.__header_cookies): + cookie.domain = f'.{parsed.hostname}' + self.cookiejar.set_cookie(cookie) + @_handle_extraction_exceptions def __extract_info(self, url, ie, download, extra_info, process): + self._apply_header_cookies(url) + try: ie_result = ie.extract(url) except UserNotLive as e: @@ -1647,7 +1779,7 @@ class YoutubeDL: self.add_extra_info(info_copy, extra_info) info_copy, _ = self.pre_process(info_copy) self._fill_common_fields(info_copy, False) - self.__forced_printings(info_copy, self.prepare_filename(info_copy), incomplete=True) + self.__forced_printings(info_copy) self._raise_pending_errors(info_copy) if self.params.get('force_write_download_archive', False): self.record_download_archive(info_copy) @@ -1851,7 +1983,7 @@ class YoutubeDL: continue entry['__x_forwarded_for_ip'] = ie_result.get('__x_forwarded_for_ip') - if not lazy and 'playlist-index' in self.params.get('compat_opts', []): + if not lazy and 'playlist-index' in self.params['compat_opts']: playlist_index = ie_result['requested_entries'][i] entry_copy = collections.ChainMap(entry, { @@ -1916,7 +2048,7 @@ class YoutubeDL: '!=': operator.ne, } operator_rex = re.compile(r'''(?x)\s* - (?Pwidth|height|tbr|abr|vbr|asr|filesize|filesize_approx|fps)\s* + (?P[\w.-]+)\s* (?P%s)(?P\s*\?)?\s* (?P[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)\s* ''' % '|'.join(map(re.escape, OPERATORS.keys()))) @@ -2033,90 +2165,88 @@ class YoutubeDL: allow_multiple_streams = {'audio': self.params.get('allow_multiple_audio_streams', False), 'video': self.params.get('allow_multiple_video_streams', False)} - check_formats = self.params.get('check_formats') == 'selected' - def _parse_filter(tokens): filter_parts = [] - for type, string, start, _, _ in tokens: - if type 
== tokenize.OP and string == ']': + for type, string_, start, _, _ in tokens: + if type == tokenize.OP and string_ == ']': return ''.join(filter_parts) else: - filter_parts.append(string) + filter_parts.append(string_) def _remove_unused_ops(tokens): # Remove operators that we don't use and join them with the surrounding strings. # E.g. 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9' ALLOWED_OPS = ('/', '+', ',', '(', ')') last_string, last_start, last_end, last_line = None, None, None, None - for type, string, start, end, line in tokens: - if type == tokenize.OP and string == '[': + for type, string_, start, end, line in tokens: + if type == tokenize.OP and string_ == '[': if last_string: yield tokenize.NAME, last_string, last_start, last_end, last_line last_string = None - yield type, string, start, end, line + yield type, string_, start, end, line # everything inside brackets will be handled by _parse_filter - for type, string, start, end, line in tokens: - yield type, string, start, end, line - if type == tokenize.OP and string == ']': + for type, string_, start, end, line in tokens: + yield type, string_, start, end, line + if type == tokenize.OP and string_ == ']': break - elif type == tokenize.OP and string in ALLOWED_OPS: + elif type == tokenize.OP and string_ in ALLOWED_OPS: if last_string: yield tokenize.NAME, last_string, last_start, last_end, last_line last_string = None - yield type, string, start, end, line + yield type, string_, start, end, line elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]: if not last_string: - last_string = string + last_string = string_ last_start = start last_end = end else: - last_string += string + last_string += string_ if last_string: yield tokenize.NAME, last_string, last_start, last_end, last_line def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False): selectors = [] current_selector = None - for type, string, start, _, _ in tokens: + for type, 
string_, start, _, _ in tokens: # ENCODING is only defined in python 3.x if type == getattr(tokenize, 'ENCODING', None): continue elif type in [tokenize.NAME, tokenize.NUMBER]: - current_selector = FormatSelector(SINGLE, string, []) + current_selector = FormatSelector(SINGLE, string_, []) elif type == tokenize.OP: - if string == ')': + if string_ == ')': if not inside_group: # ')' will be handled by the parentheses group tokens.restore_last_token() break - elif inside_merge and string in ['/', ',']: + elif inside_merge and string_ in ['/', ',']: tokens.restore_last_token() break - elif inside_choice and string == ',': + elif inside_choice and string_ == ',': tokens.restore_last_token() break - elif string == ',': + elif string_ == ',': if not current_selector: raise syntax_error('"," must follow a format selector', start) selectors.append(current_selector) current_selector = None - elif string == '/': + elif string_ == '/': if not current_selector: raise syntax_error('"/" must follow a format selector', start) first_choice = current_selector second_choice = _parse_format_selection(tokens, inside_choice=True) current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), []) - elif string == '[': + elif string_ == '[': if not current_selector: current_selector = FormatSelector(SINGLE, 'best', []) format_filter = _parse_filter(tokens) current_selector.filters.append(format_filter) - elif string == '(': + elif string_ == '(': if current_selector: raise syntax_error('Unexpected "("', start) group = _parse_format_selection(tokens, inside_group=True) current_selector = FormatSelector(GROUP, group, []) - elif string == '+': + elif string_ == '+': if not current_selector: raise syntax_error('Unexpected "+"', start) selector_1 = current_selector @@ -2125,7 +2255,7 @@ class YoutubeDL: raise syntax_error('Expected a selector', start) current_selector = FormatSelector(MERGE, (selector_1, selector_2), []) else: - raise syntax_error(f'Operator not recognized: 
"{string}"', start) + raise syntax_error(f'Operator not recognized: "{string_}"', start) elif type == tokenize.ENDMARKER: break if current_selector: @@ -2207,10 +2337,19 @@ class YoutubeDL: return new_dict def _check_formats(formats): - if not check_formats: + if self.params.get('check_formats') == 'selected': + yield from self._check_formats(formats) + return + elif (self.params.get('check_formats') is not None + or self.params.get('allow_unplayable_formats')): yield from formats return - yield from self._check_formats(formats) + + for f in formats: + if f.get('has_drm'): + yield from self._check_formats([f]) + else: + yield f def _build_selector_function(selector): if isinstance(selector, list): # , @@ -2349,12 +2488,34 @@ class YoutubeDL: parsed_selector = _parse_format_selection(iter(TokenIterator(tokens))) return _build_selector_function(parsed_selector) - def _calc_headers(self, info_dict): - res = merge_headers(self.params['http_headers'], info_dict.get('http_headers') or {}) + def _calc_headers(self, info_dict, load_cookies=False): + res = HTTPHeaderDict(self.params['http_headers'], info_dict.get('http_headers')) + clean_headers(res) - cookies = self._calc_cookies(info_dict['url']) + if load_cookies: # For --load-info-json + self._load_cookies(res.get('Cookie'), autoscope=info_dict['url']) # compat + self._load_cookies(info_dict.get('cookies'), autoscope=False) + # The `Cookie` header is removed to prevent leaks and unscoped cookies. 
+ # See: https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-v8mc-9377-rwjj + res.pop('Cookie', None) + cookies = self.cookiejar.get_cookies_for_url(info_dict['url']) if cookies: - res['Cookie'] = cookies + encoder = LenientSimpleCookie() + values = [] + for cookie in cookies: + _, value = encoder.value_encode(cookie.value) + values.append(f'{cookie.name}={value}') + if cookie.domain: + values.append(f'Domain={cookie.domain}') + if cookie.path: + values.append(f'Path={cookie.path}') + if cookie.secure: + values.append('Secure') + if cookie.expires: + values.append(f'Expires={cookie.expires}') + if cookie.version: + values.append(f'Version={cookie.version}') + info_dict['cookies'] = '; '.join(values) if 'X-Forwarded-For' not in res: x_forwarded_for_ip = info_dict.get('__x_forwarded_for_ip') @@ -2364,9 +2525,8 @@ class YoutubeDL: return res def _calc_cookies(self, url): - pr = sanitized_Request(url) - self.cookiejar.add_cookie_header(pr) - return pr.get_header('Cookie') + self.deprecation_warning('"YoutubeDL._calc_cookies" is deprecated and may be removed in a future version') + return self.cookiejar.get_cookie_header(url) def _sort_thumbnails(self, thumbnails): thumbnails.sort(key=lambda t: ( @@ -2561,10 +2721,10 @@ class YoutubeDL: if field_preference: info_dict['_format_sort_fields'] = field_preference - # or None ensures --clean-infojson removes it - info_dict['_has_drm'] = any(f.get('has_drm') for f in formats) or None + info_dict['_has_drm'] = any( # or None ensures --clean-infojson removes it + f.get('has_drm') and f['has_drm'] != 'maybe' for f in formats) or None if not self.params.get('allow_unplayable_formats'): - formats = [f for f in formats if not f.get('has_drm')] + formats = [f for f in formats if not f.get('has_drm') or f['has_drm'] == 'maybe'] if formats and all(f.get('acodec') == f.get('vcodec') == 'none' for f in formats): self.report_warning( @@ -2612,10 +2772,16 @@ class YoutubeDL: format['dynamic_range'] = 'SDR' if 
format.get('aspect_ratio') is None: format['aspect_ratio'] = try_call(lambda: round(format['width'] / format['height'], 2)) - if (info_dict.get('duration') and format.get('tbr') + if (not format.get('manifest_url') # For fragmented formats, "tbr" is often max bitrate and not average + and info_dict.get('duration') and format.get('tbr') and not format.get('filesize') and not format.get('filesize_approx')): format['filesize_approx'] = int(info_dict['duration'] * format['tbr'] * (1024 / 8)) - format['http_headers'] = self._calc_headers(collections.ChainMap(format, info_dict)) + format['http_headers'] = self._calc_headers(collections.ChainMap(format, info_dict), load_cookies=True) + + # Safeguard against old/insecure infojson when using --load-info-json + if info_dict.get('http_headers'): + info_dict['http_headers'] = HTTPHeaderDict(info_dict['http_headers']) + info_dict['http_headers'].pop('Cookie', None) # This is copied to http_headers by the above _calc_headers and can now be removed if '__x_forwarded_for_ip' in info_dict: @@ -2689,33 +2855,31 @@ class YoutubeDL: self.list_formats(info_dict) if list_only: # Without this printing, -F --print-json will not work - self.__forced_printings(info_dict, self.prepare_filename(info_dict), incomplete=True) + self.__forced_printings(info_dict) return info_dict format_selector = self.format_selector - if format_selector is None: - req_format = self._default_format_spec(info_dict, download=download) - self.write_debug('Default format spec: %s' % req_format) - format_selector = self.build_format_selector(req_format) - while True: if interactive_format_selection: - req_format = input( - self._format_screen('\nEnter format selector: ', self.Styles.EMPHASIS)) + req_format = input(self._format_screen('\nEnter format selector ', self.Styles.EMPHASIS) + + '(Press ENTER for default, or Ctrl+C to quit)' + + self._format_screen(': ', self.Styles.EMPHASIS)) try: - format_selector = self.build_format_selector(req_format) + format_selector = 
self.build_format_selector(req_format) if req_format else None except SyntaxError as err: self.report_error(err, tb=False, is_error=False) continue + if format_selector is None: + req_format = self._default_format_spec(info_dict, download=download) + self.write_debug(f'Default format spec: {req_format}') + format_selector = self.build_format_selector(req_format) + formats_to_download = list(format_selector({ 'formats': formats, 'has_merged_format': any('none' not in (f.get('acodec'), f.get('vcodec')) for f in formats), - 'incomplete_formats': ( - # All formats are video-only or - all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats) - # all formats are audio-only - or all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats)), + 'incomplete_formats': (all(f.get('vcodec') == 'none' for f in formats) # No formats with video + or all(f.get('acodec') == 'none' for f in formats)), # OR, No formats with audio })) if interactive_format_selection and not formats_to_download: self.report_error('Requested format is not available', tb=False, is_error=False) @@ -2750,11 +2914,13 @@ class YoutubeDL: new_info.update(fmt) offset, duration = info_dict.get('section_start') or 0, info_dict.get('duration') or float('inf') end_time = offset + min(chapter.get('end_time', duration), duration) + # duration may not be accurate. So allow deviations <1sec + if end_time == float('inf') or end_time > offset + duration + 1: + end_time = None if chapter or offset: new_info.update({ 'section_start': offset + chapter.get('start_time', 0), - # duration may not be accurate. 
So allow deviations <1sec - 'section_end': end_time if end_time <= offset + duration + 1 else None, + 'section_end': end_time, 'section_title': chapter.get('title'), 'section_number': chapter.get('index'), }) @@ -2810,10 +2976,14 @@ class YoutubeDL: self.params.get('subtitleslangs'), {'all': all_sub_langs}, use_regex=True) except re.error as e: raise ValueError(f'Wrong regex for subtitlelangs: {e.pattern}') - elif normal_sub_langs: - requested_langs = ['en'] if 'en' in normal_sub_langs else normal_sub_langs[:1] else: - requested_langs = ['en'] if 'en' in all_sub_langs else all_sub_langs[:1] + requested_langs = LazyList(itertools.chain( + ['en'] if 'en' in normal_sub_langs else [], + filter(lambda f: f.startswith('en'), normal_sub_langs), + ['en'] if 'en' in all_sub_langs else [], + filter(lambda f: f.startswith('en'), all_sub_langs), + normal_sub_langs, all_sub_langs, + ))[:1] if requested_langs: self.to_screen(f'[info] {video_id}: Downloading subtitles: {", ".join(requested_langs)}') @@ -2845,6 +3015,12 @@ class YoutubeDL: if info_dict is None: return info_copy = info_dict.copy() + info_copy.setdefault('filename', self.prepare_filename(info_dict)) + if info_dict.get('requested_formats') is not None: + # For RTMP URLs, also include the playpath + info_copy['urls'] = '\n'.join(f['url'] + f.get('play_path', '') for f in info_dict['requested_formats']) + elif info_dict.get('url'): + info_copy['urls'] = info_dict['url'] + info_dict.get('play_path', '') info_copy['formats_table'] = self.render_formats_table(info_dict) info_copy['thumbnails_table'] = self.render_thumbnails_table(info_dict) info_copy['subtitles_table'] = self.render_subtitles_table(info_dict.get('id'), info_dict.get('subtitles')) @@ -2857,7 +3033,7 @@ class YoutubeDL: fmt = '%({})s' if tmpl.startswith('{'): - tmpl = f'.{tmpl}' + tmpl, fmt = f'.{tmpl}', '%({})j' if tmpl.endswith('='): tmpl, fmt = tmpl[:-1], '{0} = %({0})#j' return '\n'.join(map(fmt.format, [tmpl] if mobj.group('dict') else 
tmpl.split(','))) @@ -2870,46 +3046,36 @@ class YoutubeDL: tmpl = format_tmpl(tmpl) self.to_screen(f'[info] Writing {tmpl!r} to: {filename}') if self._ensure_dir_exists(filename): - with open(filename, 'a', encoding='utf-8') as f: - f.write(self.evaluate_outtmpl(tmpl, info_copy) + '\n') + with open(filename, 'a', encoding='utf-8', newline='') as f: + f.write(self.evaluate_outtmpl(tmpl, info_copy) + os.linesep) - def __forced_printings(self, info_dict, filename, incomplete): - def print_mandatory(field, actual_field=None): - if actual_field is None: - actual_field = field - if (self.params.get('force%s' % field, False) - and (not incomplete or info_dict.get(actual_field) is not None)): - self.to_stdout(info_dict[actual_field]) - - def print_optional(field): - if (self.params.get('force%s' % field, False) - and info_dict.get(field) is not None): - self.to_stdout(info_dict[field]) - - info_dict = info_dict.copy() - if filename is not None: - info_dict['filename'] = filename - if info_dict.get('requested_formats') is not None: - # For RTMP URLs, also include the playpath - info_dict['urls'] = '\n'.join(f['url'] + f.get('play_path', '') for f in info_dict['requested_formats']) - elif info_dict.get('url'): - info_dict['urls'] = info_dict['url'] + info_dict.get('play_path', '') + return info_copy + def __forced_printings(self, info_dict, filename=None, incomplete=True): if (self.params.get('forcejson') or self.params['forceprint'].get('video') or self.params['print_to_file'].get('video')): self.post_extract(info_dict) - self._forceprint('video', info_dict) + if filename: + info_dict['filename'] = filename + info_copy = self._forceprint('video', info_dict) - print_mandatory('title') - print_mandatory('id') - print_mandatory('url', 'urls') - print_optional('thumbnail') - print_optional('description') - print_optional('filename') - if self.params.get('forceduration') and info_dict.get('duration') is not None: - self.to_stdout(formatSeconds(info_dict['duration'])) - 
print_mandatory('format') + def print_field(field, actual_field=None, optional=False): + if actual_field is None: + actual_field = field + if self.params.get(f'force{field}') and ( + info_copy.get(field) is not None or (not optional and not incomplete)): + self.to_stdout(info_copy[actual_field]) + + print_field('title') + print_field('id') + print_field('url', 'urls') + print_field('thumbnail', optional=True) + print_field('description', optional=True) + print_field('filename') + if self.params.get('forceduration') and info_copy.get('duration') is not None: + self.to_stdout(formatSeconds(info_copy['duration'])) + print_field('format') if self.params.get('forcejson'): self.to_stdout(json.dumps(self.sanitize_info(info_dict))) @@ -3123,7 +3289,7 @@ class YoutubeDL: fd, success = None, True if info_dict.get('protocol') or info_dict.get('url'): fd = get_suitable_downloader(info_dict, self.params, to_stdout=temp_filename == '-') - if fd is not FFmpegFD and 'no-direct-merge' not in self.params['compat_opts'] and ( + if fd != FFmpegFD and 'no-direct-merge' not in self.params['compat_opts'] and ( info_dict.get('section_start') or info_dict.get('section_end')): msg = ('This format cannot be partially downloaded' if FFmpegFD.available() else 'You have requested downloading the video partially, but ffmpeg is not installed') @@ -3131,7 +3297,6 @@ class YoutubeDL: return if info_dict.get('requested_formats') is not None: - requested_formats = info_dict['requested_formats'] old_ext = info_dict['ext'] if self.params.get('merge_output_format') is None: if (info_dict['ext'] == 'webm' @@ -3158,19 +3323,22 @@ class YoutubeDL: full_filename = correct_ext(full_filename) temp_filename = correct_ext(temp_filename) dl_filename = existing_video_file(full_filename, temp_filename) + info_dict['__real_download'] = False + # NOTE: Copy so that original format dicts are not modified + info_dict['requested_formats'] = list(map(dict, info_dict['requested_formats'])) merger = FFmpegMergerPP(self) 
downloaded = [] if dl_filename is not None: self.report_file_already_downloaded(dl_filename) elif fd: - for f in requested_formats if fd != FFmpegFD else []: + for f in info_dict['requested_formats'] if fd != FFmpegFD else []: f['filepath'] = fname = prepend_extension( correct_ext(temp_filename, info_dict['ext']), 'f%s' % f['format_id'], info_dict['ext']) downloaded.append(fname) - info_dict['url'] = '\n'.join(f['url'] for f in requested_formats) + info_dict['url'] = '\n'.join(f['url'] for f in info_dict['requested_formats']) success, real_download = self.dl(temp_filename, info_dict) info_dict['__real_download'] = real_download else: @@ -3194,7 +3362,7 @@ class YoutubeDL: f'You have requested downloading multiple formats to stdout {reason}. ' 'The formats will be streamed one after the other') fname = temp_filename - for f in requested_formats: + for f in info_dict['requested_formats']: new_info = dict(info_dict) del new_info['requested_formats'] new_info.update(f) @@ -3285,14 +3453,15 @@ class YoutubeDL: ) for pp in self._pps['post_process']) if not postprocessed_by_ffmpeg: - ffmpeg_fixup(ext == 'm4a' and info_dict.get('container') == 'm4a_dash', + ffmpeg_fixup(fd != FFmpegFD and ext == 'm4a' + and info_dict.get('container') == 'm4a_dash', 'writing DASH m4a. 
Only some players support this container', FFmpegFixupM4aPP) ffmpeg_fixup(downloader == 'hlsnative' and not self.params.get('hls_use_mpegts') or info_dict.get('is_live') and self.params.get('hls_use_mpegts') is None, 'Possible MPEG-TS in MP4 container or malformed AAC timestamps', FFmpegFixupM3u8PP) - ffmpeg_fixup(info_dict.get('is_live') and downloader == 'DashSegmentsFD', + ffmpeg_fixup(info_dict.get('is_live') and downloader == 'dashsegments', 'Possible duplicate MOOV atoms', FFmpegFixupDuplicateMoovPP) ffmpeg_fixup(downloader == 'web_socket_fragment', 'Malformed timestamps detected', FFmpegFixupTimestampPP) @@ -3356,18 +3525,19 @@ class YoutubeDL: [info_filename], mode='r', openhook=fileinput.hook_encoded('utf-8'))) as f: # FileInput doesn't have a read method, we can't call json.load - info = self.sanitize_info(json.loads('\n'.join(f)), self.params.get('clean_infojson', True)) - try: - self.__download_wrapper(self.process_ie_result)(info, download=True) - except (DownloadError, EntryNotInPlaylist, ReExtractInfo) as e: - if not isinstance(e, EntryNotInPlaylist): - self.to_stderr('\r') - webpage_url = info.get('webpage_url') - if webpage_url is not None: + infos = [self.sanitize_info(info, self.params.get('clean_infojson', True)) + for info in variadic(json.loads('\n'.join(f)))] + for info in infos: + try: + self.__download_wrapper(self.process_ie_result)(info, download=True) + except (DownloadError, EntryNotInPlaylist, ReExtractInfo) as e: + if not isinstance(e, EntryNotInPlaylist): + self.to_stderr('\r') + webpage_url = info.get('webpage_url') + if webpage_url is None: + raise self.report_warning(f'The info failed to download: {e}; trying with URL {webpage_url}') - return self.download([webpage_url]) - else: - raise + self.download([webpage_url]) return self._download_retcode @staticmethod @@ -3387,8 +3557,8 @@ class YoutubeDL: if remove_private_keys: reject = lambda k, v: v is None or k.startswith('__') or k in { 'requested_downloads', 'requested_formats', 
'requested_subtitles', 'requested_entries', - 'entries', 'filepath', '_filename', 'infojson_filename', 'original_url', 'playlist_autonumber', - '_format_sort_fields', + 'entries', 'filepath', '_filename', 'filename', 'infojson_filename', 'original_url', + 'playlist_autonumber', '_format_sort_fields', } else: reject = lambda k, v: False @@ -3627,7 +3797,7 @@ class YoutubeDL: def simplified_codec(f, field): assert field in ('acodec', 'vcodec') - codec = f.get(field, 'unknown') + codec = f.get(field) if not codec: return 'unknown' elif codec != 'none': @@ -3649,8 +3819,11 @@ class YoutubeDL: format_field(f, 'fps', '\t%d', func=round), format_field(f, 'dynamic_range', '%s', ignore=(None, 'SDR')).replace('HDR', ''), format_field(f, 'audio_channels', '\t%s'), - delim, - format_field(f, 'filesize', ' \t%s', func=format_bytes) + format_field(f, 'filesize_approx', '~\t%s', func=format_bytes), + delim, ( + format_field(f, 'filesize', ' \t%s', func=format_bytes) + or format_field(f, 'filesize_approx', '≈\t%s', func=format_bytes) + or format_field(try_call(lambda: format_bytes(int(info_dict['duration'] * f['tbr'] * (1024 / 8)))), + None, self._format_out('~\t%s', self.Styles.SUPPRESS))), format_field(f, 'tbr', '\t%dk', func=round), shorten_protocol_name(f.get('protocol', '')), delim, @@ -3659,13 +3832,13 @@ class YoutubeDL: simplified_codec(f, 'acodec'), format_field(f, 'abr', '\t%dk', func=round), format_field(f, 'asr', '\t%s', func=format_decimal_suffix), - join_nonempty( - self._format_out('UNSUPPORTED', 'light red') if f.get('ext') in ('f4f', 'f4m') else None, - format_field(f, 'language', '[%s]'), - join_nonempty(format_field(f, 'format_note'), - format_field(f, 'container', ignore=(None, f.get('ext'))), - delim=', '), - delim=' '), + join_nonempty(format_field(f, 'language', '[%s]'), join_nonempty( + self._format_out('UNSUPPORTED', self.Styles.BAD_FORMAT) if f.get('ext') in ('f4f', 'f4m') else None, + (self._format_out('Maybe DRM', self.Styles.WARNING) if 
f.get('has_drm') == 'maybe' + else self._format_out('DRM', self.Styles.BAD_FORMAT) if f.get('has_drm') else None), + format_field(f, 'format_note'), + format_field(f, 'container', ignore=(None, f.get('ext'))), + delim=', '), delim=' '), ] for f in formats if f.get('preference') is None or f['preference'] >= -1000] header_line = self._list_format_headers( 'ID', 'EXT', 'RESOLUTION', '\tFPS', 'HDR', 'CH', delim, '\tFILESIZE', '\tTBR', 'PROTO', @@ -3714,12 +3887,6 @@ class YoutubeDL: def list_subtitles(self, video_id, subtitles, name='subtitles'): self.__list_table(video_id, name, self.render_subtitles_table, video_id, subtitles) - def urlopen(self, req): - """ Start an HTTP download """ - if isinstance(req, str): - req = sanitized_Request(req) - return self._opener.open(req, timeout=self._socket_timeout) - def print_debug_header(self): if not self.params.get('verbose'): return @@ -3735,9 +3902,14 @@ class YoutubeDL: def get_encoding(stream): ret = str(getattr(stream, 'encoding', 'missing (%s)' % type(stream).__name__)) + additional_info = [] + if os.environ.get('TERM', '').lower() == 'dumb': + additional_info.append('dumb') if not supports_terminal_sequences(stream): from .utils import WINDOWS_VT_MODE # Must be imported locally - ret += ' (No VT)' if WINDOWS_VT_MODE is False else ' (No ANSI)' + additional_info.append('No VT' if WINDOWS_VT_MODE is False else 'No ANSI') + if additional_info: + ret = f'{ret} ({",".join(additional_info)})' return ret encoding_str = 'Encodings: locale %s, fs %s, pref %s, %s' % ( @@ -3760,19 +3932,18 @@ class YoutubeDL: source = detect_variant() if VARIANT not in (None, 'pip'): source += '*' + klass = type(self) write_debug(join_nonempty( f'{"yt-dlp" if REPOSITORY == "yt-dlp/yt-dlp" else REPOSITORY} version', - __version__, - f'[{RELEASE_GIT_HEAD}]' if RELEASE_GIT_HEAD else '', + f'{CHANNEL}@{__version__}', + f'[{RELEASE_GIT_HEAD[:9]}]' if RELEASE_GIT_HEAD else '', '' if source == 'unknown' else f'({source})', - '' if _IN_CLI else 'API', + 
'' if _IN_CLI else 'API' if klass == YoutubeDL else f'API:{self.__module__}.{klass.__qualname__}', delim=' ')) if not _IN_CLI: write_debug(f'params: {self.params}') - write_debug('** This build is unofficial daily builds, provided for ease of use.') - write_debug('** Please do not ask for any support.') if not _LAZY_LOADER: if os.environ.get('YTDLP_NO_LAZY_EXTRACTORS'): write_debug('Lazy loading extractors is forcibly disabled') @@ -3804,13 +3975,8 @@ class YoutubeDL: join_nonempty(*get_package_info(m)) for m in available_dependencies.values() })) or 'none')) - self._setup_opener() - proxy_map = {} - for handler in self._opener.handlers: - if hasattr(handler, 'proxies'): - proxy_map.update(handler.proxies) - write_debug(f'Proxy map: {proxy_map}') - + write_debug(f'Proxy map: {self.proxies}') + # write_debug(f'Request Handlers: {", ".join(rh.RH_NAME for rh in self._request_director.handlers.values())}') for plugin_type, plugins in {'Extractor': plugin_ies, 'Post-Processor': plugin_pps}.items(): display_list = ['%s%s' % ( klass.__name__, '' if klass.__name__ == name else f' as {name}') @@ -3838,58 +4004,110 @@ class YoutubeDL: 'See https://yt-dl.org/update if you need help updating.' 
% latest_version) - def _setup_opener(self): - if hasattr(self, '_opener'): - return - timeout_val = self.params.get('socket_timeout') - self._socket_timeout = 20 if timeout_val is None else float(timeout_val) - - opts_cookiesfrombrowser = self.params.get('cookiesfrombrowser') - opts_cookiefile = self.params.get('cookiefile') + @functools.cached_property + def proxies(self): + """Global proxy configuration""" opts_proxy = self.params.get('proxy') - - self.cookiejar = load_cookies(opts_cookiefile, opts_cookiesfrombrowser, self) - - cookie_processor = YoutubeDLCookieProcessor(self.cookiejar) if opts_proxy is not None: if opts_proxy == '': - proxies = {} - else: - proxies = {'http': opts_proxy, 'https': opts_proxy} + opts_proxy = '__noproxy__' + proxies = {'all': opts_proxy} else: proxies = urllib.request.getproxies() - # Set HTTPS proxy to HTTP one if given (https://github.com/ytdl-org/youtube-dl/issues/805) + # compat. Set HTTPS_PROXY to __noproxy__ to revert if 'http' in proxies and 'https' not in proxies: proxies['https'] = proxies['http'] - proxy_handler = PerRequestProxyHandler(proxies) - debuglevel = 1 if self.params.get('debug_printtraffic') else 0 - https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel) - ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel) - redirect_handler = YoutubeDLRedirectHandler() - data_handler = urllib.request.DataHandler() + return proxies - # When passing our own FileHandler instance, build_opener won't add the - # default FileHandler and allows us to disable the file protocol, which - # can be used for malicious purposes (see - # https://github.com/ytdl-org/youtube-dl/issues/8227) - file_handler = urllib.request.FileHandler() + @functools.cached_property + def cookiejar(self): + """Global cookiejar instance""" + return load_cookies( + self.params.get('cookiefile'), self.params.get('cookiesfrombrowser'), self) - if not self.params.get('enable_file_urls'): - def file_open(*args, **kwargs): - raise 
urllib.error.URLError( - 'file:// URLs are explicitly disabled in yt-dlp for security reasons. ' - 'Use --enable-file-urls to enable at your own risk.') - file_handler.file_open = file_open + @property + def _opener(self): + """ + Get a urllib OpenerDirector from the Urllib handler (deprecated). + """ + self.deprecation_warning('YoutubeDL._opener is deprecated, use YoutubeDL.urlopen()') + handler = self._request_director.handlers['Urllib'] + return handler._get_instance(cookiejar=self.cookiejar, proxies=self.proxies) - opener = urllib.request.build_opener( - proxy_handler, https_handler, cookie_processor, ydlh, redirect_handler, data_handler, file_handler) + def urlopen(self, req): + """ Start an HTTP download """ + if isinstance(req, str): + req = Request(req) + elif isinstance(req, urllib.request.Request): + self.deprecation_warning( + 'Passing a urllib.request.Request object to YoutubeDL.urlopen() is deprecated. ' + 'Use yt_dlp.networking.common.Request instead.') + req = urllib_req_to_req(req) + assert isinstance(req, Request) - # Delete the default user-agent header, which would otherwise apply in - # cases where our custom HTTP handler doesn't come into play - # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details) - opener.addheaders = [] - self._opener = opener + # compat: Assume user:pass url params are basic auth + url, basic_auth_header = extract_basic_auth(req.url) + if basic_auth_header: + req.headers['Authorization'] = basic_auth_header + req.url = sanitize_url(url) + + clean_proxies(proxies=req.proxies, headers=req.headers) + clean_headers(req.headers) + + try: + return self._request_director.send(req) + except NoSupportingHandlers as e: + for ue in e.unsupported_errors: + if not (ue.handler and ue.msg): + continue + if ue.handler.RH_KEY == 'Urllib' and 'unsupported url scheme: "file"' in ue.msg.lower(): + raise RequestError( + 'file:// URLs are disabled by default in yt-dlp for security reasons. 
' + 'Use --enable-file-urls to enable at your own risk.', cause=ue) from ue + raise + except SSLError as e: + if 'UNSAFE_LEGACY_RENEGOTIATION_DISABLED' in str(e): + raise RequestError('UNSAFE_LEGACY_RENEGOTIATION_DISABLED: Try using --legacy-server-connect', cause=e) from e + elif 'SSLV3_ALERT_HANDSHAKE_FAILURE' in str(e): + raise RequestError( + 'SSLV3_ALERT_HANDSHAKE_FAILURE: The server may not support the current cipher list. ' + 'Try using --legacy-server-connect', cause=e) from e + raise + except HTTPError as e: # TODO: Remove in a future release + raise _CompatHTTPError(e) from e + + def build_request_director(self, handlers, preferences=None): + logger = _YDLLogger(self) + headers = self.params['http_headers'].copy() + proxies = self.proxies.copy() + clean_headers(headers) + clean_proxies(proxies, headers) + + director = RequestDirector(logger=logger, verbose=self.params.get('debug_printtraffic')) + for handler in handlers: + director.add_handler(handler( + logger=logger, + headers=headers, + cookiejar=self.cookiejar, + proxies=proxies, + prefer_system_certs='no-certifi' in self.params['compat_opts'], + verify=not self.params.get('nocheckcertificate'), + **traverse_obj(self.params, { + 'verbose': 'debug_printtraffic', + 'source_address': 'source_address', + 'timeout': 'socket_timeout', + 'legacy_ssl_support': 'legacyserverconnect', + 'enable_file_urls': 'enable_file_urls', + 'client_cert': { + 'client_certificate': 'client_certificate', + 'client_certificate_key': 'client_certificate_key', + 'client_certificate_password': 'client_certificate_password', + }, + }), + )) + director.preferences.update(preferences or []) + return director def encode(self, s): if isinstance(s, bytes): @@ -3963,7 +4181,7 @@ class YoutubeDL: # that way it will silently go on when used with unsupporting IE return ret elif not subtitles: - self.to_screen('[info] There\'s no subtitles for the requested languages') + self.to_screen('[info] There are no subtitles for the requested 
languages') return ret sub_filename_base = self.prepare_filename(info_dict, 'subtitle') if not sub_filename_base: @@ -4017,7 +4235,7 @@ class YoutubeDL: if write_all or self.params.get('writethumbnail', False): thumbnails = info_dict.get('thumbnails') or [] if not thumbnails: - self.to_screen(f'[info] There\'s no {label} thumbnails to download') + self.to_screen(f'[info] There are no {label} thumbnails to download') return ret multiple = write_all and len(thumbnails) > 1 @@ -4042,15 +4260,18 @@ class YoutubeDL: else: self.to_screen(f'[info] Downloading {thumb_display_id} ...') try: - uf = self.urlopen(sanitized_Request(t['url'], headers=t.get('http_headers', {}))) + uf = self.urlopen(Request(t['url'], headers=t.get('http_headers', {}))) self.to_screen(f'[info] Writing {thumb_display_id} to: {thumb_filename}') with open(encodeFilename(thumb_filename), 'wb') as thumbf: shutil.copyfileobj(uf, thumbf) ret.append((thumb_filename, thumb_filename_final)) t['filepath'] = thumb_filename except network_exceptions as err: + if isinstance(err, HTTPError) and err.status == 404: + self.to_screen(f'[info] {thumb_display_id.title()} does not exist') + else: + self.report_warning(f'Unable to download {thumb_display_id}: {err}') thumbnails.pop(idx) - self.report_warning(f'Unable to download {thumb_display_id}: {err}') if ret and not write_all: break return ret diff --git a/plugins/youtube_download/yt_dlp/__init__.py b/plugins/youtube_download/yt_dlp/__init__.py index 255b317..991dbcd 100644 --- a/plugins/youtube_download/yt_dlp/__init__.py +++ b/plugins/youtube_download/yt_dlp/__init__.py @@ -13,6 +13,7 @@ import optparse import os import re import sys +import traceback from .compat import compat_shlex_quote from .cookies import SUPPORTED_BROWSERS, SUPPORTED_KEYRINGS @@ -56,11 +57,11 @@ from .utils import ( read_stdin, render_table, setproctitle, - std_headers, traverse_obj, variadic, write_string, ) +from .utils.networking import std_headers from .YoutubeDL import YoutubeDL _IN_CLI 
= False @@ -187,8 +188,8 @@ def validate_options(opts): raise ValueError(f'{max_name} "{max_val}" must be must be greater than or equal to {min_name} "{min_val}"') # Usernames and passwords - validate(not opts.usenetrc or (opts.username is None and opts.password is None), - '.netrc', msg='using {name} conflicts with giving username/password') + validate(sum(map(bool, (opts.usenetrc, opts.netrc_cmd, opts.username))) <= 1, '.netrc', + msg='{name}, netrc command and username/password are mutually exclusive options') validate(opts.password is None or opts.username is not None, 'account username', msg='{name} missing') validate(opts.ap_password is None or opts.ap_username is not None, 'TV Provider account username', msg='{name} missing') @@ -318,31 +319,50 @@ def validate_options(opts): if outtmpl_default == '': opts.skip_download = None del opts.outtmpl['default'] - if outtmpl_default and not os.path.splitext(outtmpl_default)[1] and opts.extractaudio: - raise ValueError( - 'Cannot download a video and extract audio into the same file! ' - f'Use "{outtmpl_default}.%(ext)s" instead of "{outtmpl_default}" as the output template') - def parse_chapters(name, value): - chapters, ranges = [], [] + def parse_chapters(name, value, advanced=False): parse_timestamp = lambda x: float('inf') if x in ('inf', 'infinite') else parse_duration(x) - for regex in value or []: - if regex.startswith('*'): - for range_ in map(str.strip, regex[1:].split(',')): - mobj = range_ != '-' and re.fullmatch(r'([^-]+)?\s*-\s*([^-]+)?', range_) - dur = mobj and (parse_timestamp(mobj.group(1) or '0'), parse_timestamp(mobj.group(2) or 'inf')) - if None in (dur or [None]): - raise ValueError(f'invalid {name} time range "{regex}". 
Must be of the form "*start-end"') - ranges.append(dur) - continue - try: - chapters.append(re.compile(regex)) - except re.error as err: - raise ValueError(f'invalid {name} regex "{regex}" - {err}') - return chapters, ranges + TIMESTAMP_RE = r'''(?x)(?: + (?P-?)(?P[^-]+) + )?\s*-\s*(?: + (?P-?)(?P[^-]+) + )?''' - opts.remove_chapters, opts.remove_ranges = parse_chapters('--remove-chapters', opts.remove_chapters) - opts.download_ranges = download_range_func(*parse_chapters('--download-sections', opts.download_ranges)) + chapters, ranges, from_url = [], [], False + for regex in value or []: + if advanced and regex == '*from-url': + from_url = True + continue + elif not regex.startswith('*'): + try: + chapters.append(re.compile(regex)) + except re.error as err: + raise ValueError(f'invalid {name} regex "{regex}" - {err}') + continue + + for range_ in map(str.strip, regex[1:].split(',')): + mobj = range_ != '-' and re.fullmatch(TIMESTAMP_RE, range_) + dur = mobj and [parse_timestamp(mobj.group('start') or '0'), parse_timestamp(mobj.group('end') or 'inf')] + signs = mobj and (mobj.group('start_sign'), mobj.group('end_sign')) + + err = None + if None in (dur or [None]): + err = 'Must be of the form "*start-end"' + elif not advanced and any(signs): + err = 'Negative timestamps are not allowed' + else: + dur[0] *= -1 if signs[0] else 1 + dur[1] *= -1 if signs[1] else 1 + if dur[1] == float('-inf'): + err = '"-inf" is not a valid end' + if err: + raise ValueError(f'invalid {name} time range "{regex}". 
{err}') + ranges.append(dur) + + return chapters, ranges, from_url + + opts.remove_chapters, opts.remove_ranges, _ = parse_chapters('--remove-chapters', opts.remove_chapters) + opts.download_ranges = download_range_func(*parse_chapters('--download-sections', opts.download_ranges, True)) # Cookies from browser if opts.cookiesfrombrowser: @@ -400,14 +420,19 @@ def validate_options(opts): except Exception as err: raise ValueError(f'Invalid playlist-items {opts.playlist_items!r}: {err}') - geo_bypass_code = opts.geo_bypass_ip_block or opts.geo_bypass_country - if geo_bypass_code is not None: + opts.geo_bypass_country, opts.geo_bypass_ip_block = None, None + if opts.geo_bypass.lower() not in ('default', 'never'): try: - GeoUtils.random_ipv4(geo_bypass_code) + GeoUtils.random_ipv4(opts.geo_bypass) except Exception: - raise ValueError('unsupported geo-bypass country or ip-block') + raise ValueError(f'Unsupported --xff "{opts.geo_bypass}"') + if len(opts.geo_bypass) == 2: + opts.geo_bypass_country = opts.geo_bypass + else: + opts.geo_bypass_ip_block = opts.geo_bypass + opts.geo_bypass = opts.geo_bypass.lower() != 'never' - opts.match_filter = match_filter_func(opts.match_filter) + opts.match_filter = match_filter_func(opts.match_filter, opts.breaking_match_filter) if opts.download_archive is not None: opts.download_archive = expand_path(opts.download_archive) @@ -434,6 +459,10 @@ def validate_options(opts): elif ed and proto == 'default': default_downloader = ed.get_basename() + for policy in opts.color.values(): + if policy not in ('always', 'auto', 'no_color', 'never'): + raise ValueError(f'"{policy}" is not a valid color policy') + warnings, deprecation_warnings = [], [] # Common mistake: -f best @@ -708,7 +737,8 @@ def parse_options(argv=None): 'dumpjson', 'dump_single_json', 'getdescription', 'getduration', 'getfilename', 'getformat', 'getid', 'getthumbnail', 'gettitle', 'geturl' )) - opts.quiet = opts.quiet or any_getting or opts.print_json or bool(opts.forceprint) + 
if opts.quiet is None: + opts.quiet = any_getting or opts.print_json or bool(opts.forceprint) playlist_pps = [pp for pp in postprocessors if pp.get('when') == 'playlist'] write_playlist_infojson = (opts.writeinfojson and not opts.clean_infojson @@ -734,6 +764,7 @@ def parse_options(argv=None): return ParsedOptions(parser, opts, urls, { 'usenetrc': opts.usenetrc, 'netrc_location': opts.netrc_location, + 'netrc_cmd': opts.netrc_cmd, 'username': opts.username, 'password': opts.password, 'twofactor': opts.twofactor, @@ -891,7 +922,7 @@ def parse_options(argv=None): 'playlist_items': opts.playlist_items, 'xattr_set_filesize': opts.xattr_set_filesize, 'match_filter': opts.match_filter, - 'no_color': opts.no_color, + 'color': opts.color, 'ffmpeg_location': opts.ffmpeg_location, 'hls_prefer_native': opts.hls_prefer_native, 'hls_use_mpegts': opts.hls_use_mpegts, @@ -935,14 +966,18 @@ def _real_main(argv=None): if opts.rm_cachedir: ydl.cache.remove() - updater = Updater(ydl) - if opts.update_self and updater.update() and actual_use: - if updater.cmd: - return updater.restart() - # This code is reachable only for zip variant in py < 3.10 - # It makes sense to exit here, but the old behavior is to continue - ydl.report_warning('Restart yt-dlp to use the updated version') - # return 100, 'ERROR: The program must exit for the update to complete' + try: + updater = Updater(ydl, opts.update_self) + if opts.update_self and updater.update() and actual_use: + if updater.cmd: + return updater.restart() + # This code is reachable only for zip variant in py < 3.10 + # It makes sense to exit here, but the old behavior is to continue + ydl.report_warning('Restart yt-dlp to use the updated version') + # return 100, 'ERROR: The program must exit for the update to complete' + except Exception: + traceback.print_exc() + ydl._download_retcode = 100 if not actual_use: if pre_process: @@ -956,6 +991,8 @@ def _real_main(argv=None): parser.destroy() try: if opts.load_info_filename is not None: + 
if all_urls: + ydl.report_warning('URLs are ignored due to --load-info-json') return ydl.download_with_info_file(expand_path(opts.load_info_filename)) else: return ydl.download(all_urls) diff --git a/plugins/youtube_download/yt_dlp/__pyinstaller/hook-yt_dlp.py b/plugins/youtube_download/yt_dlp/__pyinstaller/hook-yt_dlp.py index 057cfef..88c2b8b 100644 --- a/plugins/youtube_download/yt_dlp/__pyinstaller/hook-yt_dlp.py +++ b/plugins/youtube_download/yt_dlp/__pyinstaller/hook-yt_dlp.py @@ -1,30 +1,8 @@ -import ast -import os import sys -from pathlib import Path from PyInstaller.utils.hooks import collect_submodules -def find_attribute_accesses(node, name, path=()): - if isinstance(node, ast.Attribute): - path = [*path, node.attr] - if isinstance(node.value, ast.Name) and node.value.id == name: - yield path[::-1] - for child in ast.iter_child_nodes(node): - yield from find_attribute_accesses(child, name, path) - - -def collect_used_submodules(name, level): - for dirpath, _, filenames in os.walk(Path(__file__).parent.parent): - for filename in filenames: - if not filename.endswith('.py'): - continue - with open(Path(dirpath) / filename, encoding='utf8') as f: - for submodule in find_attribute_accesses(ast.parse(f.read()), name): - yield '.'.join(submodule[:level]) - - def pycryptodome_module(): try: import Cryptodome # noqa: F401 @@ -40,13 +18,10 @@ def pycryptodome_module(): def get_hidden_imports(): - yield 'yt_dlp.compat._legacy' + yield from ('yt_dlp.compat._legacy', 'yt_dlp.compat._deprecated') + yield from ('yt_dlp.utils._legacy', 'yt_dlp.utils._deprecated') + yield pycryptodome_module() yield from collect_submodules('websockets') - - crypto = pycryptodome_module() - for sm in set(collect_used_submodules('Cryptodome', 2)): - yield f'{crypto}.{sm}' - # These are auto-detected, but explicitly add them just in case yield from ('mutagen', 'brotli', 'certifi') diff --git a/plugins/youtube_download/yt_dlp/aes.py b/plugins/youtube_download/yt_dlp/aes.py index 
deff0a2..b3a383c 100644 --- a/plugins/youtube_download/yt_dlp/aes.py +++ b/plugins/youtube_download/yt_dlp/aes.py @@ -5,14 +5,14 @@ from .compat import compat_ord from .dependencies import Cryptodome from .utils import bytes_to_intlist, intlist_to_bytes -if Cryptodome: +if Cryptodome.AES: def aes_cbc_decrypt_bytes(data, key, iv): """ Decrypt bytes with AES-CBC using pycryptodome """ - return Cryptodome.Cipher.AES.new(key, Cryptodome.Cipher.AES.MODE_CBC, iv).decrypt(data) + return Cryptodome.AES.new(key, Cryptodome.AES.MODE_CBC, iv).decrypt(data) def aes_gcm_decrypt_and_verify_bytes(data, key, tag, nonce): """ Decrypt bytes with AES-GCM using pycryptodome """ - return Cryptodome.Cipher.AES.new(key, Cryptodome.Cipher.AES.MODE_GCM, nonce).decrypt_and_verify(data, tag) + return Cryptodome.AES.new(key, Cryptodome.AES.MODE_GCM, nonce).decrypt_and_verify(data, tag) else: def aes_cbc_decrypt_bytes(data, key, iv): diff --git a/plugins/youtube_download/yt_dlp/cache.py b/plugins/youtube_download/yt_dlp/cache.py index 7be91ea..9dd4f2f 100644 --- a/plugins/youtube_download/yt_dlp/cache.py +++ b/plugins/youtube_download/yt_dlp/cache.py @@ -1,5 +1,4 @@ import contextlib -import errno import json import os import re @@ -39,11 +38,7 @@ class Cache: fn = self._get_cache_fn(section, key, dtype) try: - try: - os.makedirs(os.path.dirname(fn)) - except OSError as ose: - if ose.errno != errno.EEXIST: - raise + os.makedirs(os.path.dirname(fn), exist_ok=True) self._ydl.write_debug(f'Saving {section}.{key} to cache') write_json_file({'yt-dlp_version': __version__, 'data': data}, fn) except Exception: diff --git a/plugins/youtube_download/yt_dlp/casefold.py b/plugins/youtube_download/yt_dlp/casefold.py new file mode 100644 index 0000000..41a53e5 --- /dev/null +++ b/plugins/youtube_download/yt_dlp/casefold.py @@ -0,0 +1,5 @@ +import warnings + +warnings.warn(DeprecationWarning(f'{__name__} is deprecated')) + +casefold = str.casefold diff --git 
a/plugins/youtube_download/yt_dlp/compat/__init__.py b/plugins/youtube_download/yt_dlp/compat/__init__.py index c6c0254..832a913 100644 --- a/plugins/youtube_download/yt_dlp/compat/__init__.py +++ b/plugins/youtube_download/yt_dlp/compat/__init__.py @@ -1,14 +1,11 @@ import os import sys -import warnings import xml.etree.ElementTree as etree -from ._deprecated import * # noqa: F401, F403 from .compat_utils import passthrough_module -# XXX: Implement this the same way as other DeprecationWarnings without circular import -passthrough_module(__name__, '._legacy', callback=lambda attr: warnings.warn( - DeprecationWarning(f'{__name__}.{attr} is deprecated'), stacklevel=5)) +passthrough_module(__name__, '._deprecated') +del passthrough_module # HTMLParseError has been deprecated in Python 3.3 and removed in @@ -70,3 +67,13 @@ if compat_os_name in ('nt', 'ce'): return userhome + path[i:] else: compat_expanduser = os.path.expanduser + + +def urllib_req_to_req(urllib_request): + """Convert urllib Request to a networking Request""" + from ..networking import Request + from ..utils.networking import HTTPHeaderDict + return Request( + urllib_request.get_full_url(), data=urllib_request.data, method=urllib_request.get_method(), + headers=HTTPHeaderDict(urllib_request.headers, urllib_request.unredirected_hdrs), + extensions={'timeout': urllib_request.timeout} if hasattr(urllib_request, 'timeout') else None) diff --git a/plugins/youtube_download/yt_dlp/compat/_deprecated.py b/plugins/youtube_download/yt_dlp/compat/_deprecated.py index 342f1f8..607bae9 100644 --- a/plugins/youtube_download/yt_dlp/compat/_deprecated.py +++ b/plugins/youtube_download/yt_dlp/compat/_deprecated.py @@ -1,4 +1,12 @@ """Deprecated - New code should avoid these""" +import warnings + +from .compat_utils import passthrough_module + +# XXX: Implement this the same way as other DeprecationWarnings without circular import +passthrough_module(__name__, '.._legacy', callback=lambda attr: warnings.warn( + 
DeprecationWarning(f'{__name__}.{attr} is deprecated'), stacklevel=6)) +del passthrough_module import base64 import urllib.error @@ -8,7 +16,6 @@ compat_str = str compat_b64decode = base64.b64decode -compat_HTTPError = urllib.error.HTTPError compat_urlparse = urllib.parse compat_parse_qs = urllib.parse.parse_qs compat_urllib_parse_unquote = urllib.parse.unquote diff --git a/plugins/youtube_download/yt_dlp/compat/_legacy.py b/plugins/youtube_download/yt_dlp/compat/_legacy.py index d19333d..90ccf0f 100644 --- a/plugins/youtube_download/yt_dlp/compat/_legacy.py +++ b/plugins/youtube_download/yt_dlp/compat/_legacy.py @@ -1,5 +1,6 @@ """ Do not use! """ +import base64 import collections import ctypes import getpass @@ -15,12 +16,12 @@ import shlex import shutil import socket import struct +import subprocess import tokenize import urllib.error import urllib.parse import urllib.request import xml.etree.ElementTree as etree -from subprocess import DEVNULL # isort: split import asyncio # noqa: F401 @@ -29,10 +30,11 @@ from asyncio import run as compat_asyncio_run # noqa: F401 from re import Pattern as compat_Pattern # noqa: F401 from re import match as compat_Match # noqa: F401 +from . 
import compat_expanduser, compat_HTMLParseError, compat_realpath from .compat_utils import passthrough_module -from ..dependencies import Cryptodome_AES as compat_pycrypto_AES # noqa: F401 from ..dependencies import brotli as compat_brotli # noqa: F401 from ..dependencies import websockets as compat_websockets # noqa: F401 +from ..dependencies.Cryptodome import AES as compat_pycrypto_AES # noqa: F401 passthrough_module(__name__, '...utils', ('WINDOWS_VT_MODE', 'windows_enable_vt_mode')) @@ -47,41 +49,48 @@ def compat_setenv(key, value, env=os.environ): env[key] = value +compat_base64_b64decode = base64.b64decode compat_basestring = str compat_casefold = str.casefold compat_chr = chr compat_collections_abc = collections.abc -compat_cookiejar = http.cookiejar -compat_cookiejar_Cookie = http.cookiejar.Cookie -compat_cookies = http.cookies -compat_cookies_SimpleCookie = http.cookies.SimpleCookie -compat_etree_Element = etree.Element -compat_etree_register_namespace = etree.register_namespace +compat_cookiejar = compat_http_cookiejar = http.cookiejar +compat_cookiejar_Cookie = compat_http_cookiejar_Cookie = http.cookiejar.Cookie +compat_cookies = compat_http_cookies = http.cookies +compat_cookies_SimpleCookie = compat_http_cookies_SimpleCookie = http.cookies.SimpleCookie +compat_etree_Element = compat_xml_etree_ElementTree_Element = etree.Element +compat_etree_register_namespace = compat_xml_etree_register_namespace = etree.register_namespace compat_filter = filter compat_get_terminal_size = shutil.get_terminal_size compat_getenv = os.getenv -compat_getpass = getpass.getpass +compat_getpass = compat_getpass_getpass = getpass.getpass compat_html_entities = html.entities compat_html_entities_html5 = html.entities.html5 -compat_HTMLParser = html.parser.HTMLParser +compat_html_parser_HTMLParseError = compat_HTMLParseError +compat_HTMLParser = compat_html_parser_HTMLParser = html.parser.HTMLParser compat_http_client = http.client compat_http_server = http.server 
+compat_HTTPError = urllib.error.HTTPError compat_input = input compat_integer_types = (int, ) compat_itertools_count = itertools.count compat_kwargs = lambda kwargs: kwargs compat_map = map compat_numeric_types = (int, float, complex) +compat_os_path_expanduser = compat_expanduser +compat_os_path_realpath = compat_realpath compat_print = print compat_shlex_split = shlex.split compat_socket_create_connection = socket.create_connection compat_Struct = struct.Struct compat_struct_pack = struct.pack compat_struct_unpack = struct.unpack -compat_subprocess_get_DEVNULL = lambda: DEVNULL +compat_subprocess_get_DEVNULL = lambda: subprocess.DEVNULL compat_tokenize_tokenize = tokenize.tokenize compat_urllib_error = urllib.error +compat_urllib_HTTPError = urllib.error.HTTPError compat_urllib_parse = urllib.parse +compat_urllib_parse_parse_qs = urllib.parse.parse_qs compat_urllib_parse_quote = urllib.parse.quote compat_urllib_parse_quote_plus = urllib.parse.quote_plus compat_urllib_parse_unquote_plus = urllib.parse.unquote_plus @@ -90,8 +99,10 @@ compat_urllib_parse_urlunparse = urllib.parse.urlunparse compat_urllib_request = urllib.request compat_urllib_request_DataHandler = urllib.request.DataHandler compat_urllib_response = urllib.response -compat_urlretrieve = urllib.request.urlretrieve -compat_xml_parse_error = etree.ParseError +compat_urlretrieve = compat_urllib_request_urlretrieve = urllib.request.urlretrieve +compat_xml_parse_error = compat_xml_etree_ElementTree_ParseError = etree.ParseError compat_xpath = lambda xpath: xpath compat_zip = zip workaround_optparse_bug9161 = lambda: None + +legacy = [] diff --git a/plugins/youtube_download/yt_dlp/compat/compat_utils.py b/plugins/youtube_download/yt_dlp/compat/compat_utils.py index 8956b3b..3ca46d2 100644 --- a/plugins/youtube_download/yt_dlp/compat/compat_utils.py +++ b/plugins/youtube_download/yt_dlp/compat/compat_utils.py @@ -48,7 +48,7 @@ def passthrough_module(parent, child, allowed_attributes=(..., ), *, callback=la 
"""Passthrough parent module into a child module, creating the parent if necessary""" def __getattr__(attr): if _is_package(parent): - with contextlib.suppress(ImportError): + with contextlib.suppress(ModuleNotFoundError): return importlib.import_module(f'.{attr}', parent.__name__) ret = from_child(attr) diff --git a/plugins/youtube_download/yt_dlp/compat/types.py b/plugins/youtube_download/yt_dlp/compat/types.py new file mode 100644 index 0000000..4aa3b0e --- /dev/null +++ b/plugins/youtube_download/yt_dlp/compat/types.py @@ -0,0 +1,13 @@ +# flake8: noqa: F405 +from types import * # noqa: F403 + +from .compat_utils import passthrough_module + +passthrough_module(__name__, 'types') +del passthrough_module + +try: + # NB: pypy has builtin NoneType, so checking NameError won't work + from types import NoneType # >= 3.10 +except ImportError: + NoneType = type(None) diff --git a/plugins/youtube_download/yt_dlp/compat/urllib/__init__.py b/plugins/youtube_download/yt_dlp/compat/urllib/__init__.py new file mode 100644 index 0000000..b27cc61 --- /dev/null +++ b/plugins/youtube_download/yt_dlp/compat/urllib/__init__.py @@ -0,0 +1,10 @@ +# flake8: noqa: F405 +from urllib import * # noqa: F403 + +del request +from . import request # noqa: F401 + +from ..compat_utils import passthrough_module + +passthrough_module(__name__, 'urllib') +del passthrough_module diff --git a/plugins/youtube_download/yt_dlp/compat/urllib/request.py b/plugins/youtube_download/yt_dlp/compat/urllib/request.py new file mode 100644 index 0000000..ff63b2f --- /dev/null +++ b/plugins/youtube_download/yt_dlp/compat/urllib/request.py @@ -0,0 +1,40 @@ +# flake8: noqa: F405 +from urllib.request import * # noqa: F403 + +from ..compat_utils import passthrough_module + +passthrough_module(__name__, 'urllib.request') +del passthrough_module + + +from .. import compat_os_name + +if compat_os_name == 'nt': + # On older python versions, proxies are extracted from Windows registry erroneously. 
[1] + # If the https proxy in the registry does not have a scheme, urllib will incorrectly add https:// to it. [2] + # It is unlikely that the user has actually set it to be https, so we should be fine to safely downgrade + # it to http on these older python versions to avoid issues + # This also applies for ftp proxy type, as ftp:// proxy scheme is not supported. + # 1: https://github.com/python/cpython/issues/86793 + # 2: https://github.com/python/cpython/blob/51f1ae5ceb0673316c4e4b0175384e892e33cc6e/Lib/urllib/request.py#L2683-L2698 + import sys + from urllib.request import getproxies_environment, getproxies_registry + + def getproxies_registry_patched(): + proxies = getproxies_registry() + if ( + sys.version_info >= (3, 10, 5) # https://docs.python.org/3.10/whatsnew/changelog.html#python-3-10-5-final + or (3, 9, 13) <= sys.version_info < (3, 10) # https://docs.python.org/3.9/whatsnew/changelog.html#python-3-9-13-final + ): + return proxies + + for scheme in ('https', 'ftp'): + if scheme in proxies and proxies[scheme].startswith(f'{scheme}://'): + proxies[scheme] = 'http' + proxies[scheme][len(scheme):] + + return proxies + + def getproxies(): + return getproxies_environment() or getproxies_registry_patched() + +del compat_os_name diff --git a/plugins/youtube_download/yt_dlp/cookies.py b/plugins/youtube_download/yt_dlp/cookies.py index 8ca7cea..a71fbc2 100644 --- a/plugins/youtube_download/yt_dlp/cookies.py +++ b/plugins/youtube_download/yt_dlp/cookies.py @@ -1,7 +1,9 @@ import base64 +import collections import contextlib import http.cookiejar import http.cookies +import io import json import os import re @@ -11,6 +13,7 @@ import subprocess import sys import tempfile import time +import urllib.request from datetime import datetime, timedelta, timezone from enum import Enum, auto from hashlib import pbkdf2_hmac @@ -20,6 +23,7 @@ from .aes import ( aes_gcm_decrypt_and_verify_bytes, unpad_pkcs7, ) +from .compat import functools from .dependencies import ( 
_SECRETSTORAGE_UNAVAILABLE_REASON, secretstorage, @@ -28,36 +32,24 @@ from .dependencies import ( from .minicurses import MultilinePrinter, QuietMultilinePrinter from .utils import ( Popen, - YoutubeDLCookieJar, error_to_str, expand_path, is_path_like, + sanitize_url, + str_or_none, try_call, + write_string, ) +from .utils._utils import _YDLLogger +from .utils.networking import normalize_url CHROMIUM_BASED_BROWSERS = {'brave', 'chrome', 'chromium', 'edge', 'opera', 'vivaldi'} SUPPORTED_BROWSERS = CHROMIUM_BASED_BROWSERS | {'firefox', 'safari'} -class YDLLogger: - def __init__(self, ydl=None): - self._ydl = ydl - - def debug(self, message): - if self._ydl: - self._ydl.write_debug(message) - - def info(self, message): - if self._ydl: - self._ydl.to_screen(f'[Cookies] {message}') - - def warning(self, message, only_once=False): - if self._ydl: - self._ydl.report_warning(message, only_once) - - def error(self, message): - if self._ydl: - self._ydl.report_error(message) +class YDLLogger(_YDLLogger): + def warning(self, message, only_once=False): # compat + return super().warning(message, once=only_once) class ProgressBar(MultilinePrinter): _DELAY, _timer = 0.1, 0 @@ -105,7 +97,7 @@ def load_cookies(cookie_file, browser_specification, ydl): jar = YoutubeDLCookieJar(cookie_file) if not is_filename or os.access(cookie_file, os.R_OK): - jar.load(ignore_discard=True, ignore_expires=True) + jar.load() cookie_jars.append(jar) return _merge_cookie_jars(cookie_jars) @@ -146,7 +138,7 @@ def _extract_firefox_cookies(profile, container, logger): containers_path = os.path.join(os.path.dirname(cookie_database_path), 'containers.json') if not os.path.isfile(containers_path) or not os.access(containers_path, os.R_OK): raise FileNotFoundError(f'could not read containers.json in {search_root}') - with open(containers_path) as containers: + with open(containers_path, encoding='utf8') as containers: identities = json.load(containers).get('identities', []) container_id = 
next((context.get('userContextId') for context in identities if container in ( context.get('name'), @@ -346,7 +338,9 @@ class ChromeCookieDecryptor: Linux: - cookies are either v10 or v11 - v10: AES-CBC encrypted with a fixed key + - also attempts empty password if decryption fails - v11: AES-CBC encrypted with an OS protected key (keyring) + - also attempts empty password if decryption fails - v11 keys can be stored in various places depending on the activate desktop environment [2] Mac: @@ -361,7 +355,7 @@ class ChromeCookieDecryptor: Sources: - [1] https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/ - - [2] https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/key_storage_linux.cc + - [2] https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/sync/key_storage_linux.cc - KeyStorageLinux::CreateService """ @@ -383,32 +377,49 @@ class LinuxChromeCookieDecryptor(ChromeCookieDecryptor): def __init__(self, browser_keyring_name, logger, *, keyring=None): self._logger = logger self._v10_key = self.derive_key(b'peanuts') - password = _get_linux_keyring_password(browser_keyring_name, keyring, logger) - self._v11_key = None if password is None else self.derive_key(password) + self._empty_key = self.derive_key(b'') self._cookie_counts = {'v10': 0, 'v11': 0, 'other': 0} + self._browser_keyring_name = browser_keyring_name + self._keyring = keyring + + @functools.cached_property + def _v11_key(self): + password = _get_linux_keyring_password(self._browser_keyring_name, self._keyring, self._logger) + return None if password is None else self.derive_key(password) @staticmethod def derive_key(password): # values from - # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/os_crypt_linux.cc + # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/sync/os_crypt_linux.cc return pbkdf2_sha1(password, salt=b'saltysalt', 
iterations=1, key_length=16) def decrypt(self, encrypted_value): + """ + + following the same approach as the fix in [1]: if cookies fail to decrypt then attempt to decrypt + with an empty password. The failure detection is not the same as what chromium uses so the + results won't be perfect + + References: + - [1] https://chromium.googlesource.com/chromium/src/+/bbd54702284caca1f92d656fdcadf2ccca6f4165%5E%21/ + - a bugfix to try an empty password as a fallback + """ version = encrypted_value[:3] ciphertext = encrypted_value[3:] if version == b'v10': self._cookie_counts['v10'] += 1 - return _decrypt_aes_cbc(ciphertext, self._v10_key, self._logger) + return _decrypt_aes_cbc_multi(ciphertext, (self._v10_key, self._empty_key), self._logger) elif version == b'v11': self._cookie_counts['v11'] += 1 if self._v11_key is None: self._logger.warning('cannot decrypt v11 cookies: no key found', only_once=True) return None - return _decrypt_aes_cbc(ciphertext, self._v11_key, self._logger) + return _decrypt_aes_cbc_multi(ciphertext, (self._v11_key, self._empty_key), self._logger) else: + self._logger.warning(f'unknown cookie version: "{version}"', only_once=True) self._cookie_counts['other'] += 1 return None @@ -423,7 +434,7 @@ class MacChromeCookieDecryptor(ChromeCookieDecryptor): @staticmethod def derive_key(password): # values from - # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/os_crypt_mac.mm + # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/sync/os_crypt_mac.mm return pbkdf2_sha1(password, salt=b'saltysalt', iterations=1003, key_length=16) def decrypt(self, encrypted_value): @@ -436,12 +447,12 @@ class MacChromeCookieDecryptor(ChromeCookieDecryptor): self._logger.warning('cannot decrypt v10 cookies: no key found', only_once=True) return None - return _decrypt_aes_cbc(ciphertext, self._v10_key, self._logger) + return _decrypt_aes_cbc_multi(ciphertext, (self._v10_key,), self._logger) else: 
self._cookie_counts['other'] += 1 # other prefixes are considered 'old data' which were stored as plaintext - # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/os_crypt_mac.mm + # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/sync/os_crypt_mac.mm return encrypted_value @@ -461,7 +472,7 @@ class WindowsChromeCookieDecryptor(ChromeCookieDecryptor): self._logger.warning('cannot decrypt v10 cookies: no key found', only_once=True) return None - # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/os_crypt_win.cc + # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/sync/os_crypt_win.cc # kNonceLength nonce_length = 96 // 8 # boringssl @@ -478,23 +489,27 @@ class WindowsChromeCookieDecryptor(ChromeCookieDecryptor): else: self._cookie_counts['other'] += 1 # any other prefix means the data is DPAPI encrypted - # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/os_crypt_win.cc + # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/sync/os_crypt_win.cc return _decrypt_windows_dpapi(encrypted_value, self._logger).decode() def _extract_safari_cookies(profile, logger): - if profile is not None: - logger.error('safari does not support profiles') if sys.platform != 'darwin': raise ValueError(f'unsupported platform: {sys.platform}') - cookies_path = os.path.expanduser('~/Library/Cookies/Cookies.binarycookies') - - if not os.path.isfile(cookies_path): - logger.debug('Trying secondary cookie location') - cookies_path = os.path.expanduser('~/Library/Containers/com.apple.Safari/Data/Library/Cookies/Cookies.binarycookies') + if profile: + cookies_path = os.path.expanduser(profile) if not os.path.isfile(cookies_path): - raise FileNotFoundError('could not find safari cookies database') + raise FileNotFoundError('custom safari cookies database not found') + + else: + 
cookies_path = os.path.expanduser('~/Library/Cookies/Cookies.binarycookies') + + if not os.path.isfile(cookies_path): + logger.debug('Trying secondary cookie location') + cookies_path = os.path.expanduser('~/Library/Containers/com.apple.Safari/Data/Library/Cookies/Cookies.binarycookies') + if not os.path.isfile(cookies_path): + raise FileNotFoundError('could not find safari cookies database') with open(cookies_path, 'rb') as f: cookies_data = f.read() @@ -657,19 +672,27 @@ class _LinuxDesktopEnvironment(Enum): """ OTHER = auto() CINNAMON = auto() + DEEPIN = auto() GNOME = auto() - KDE = auto() + KDE3 = auto() + KDE4 = auto() + KDE5 = auto() + KDE6 = auto() PANTHEON = auto() + UKUI = auto() UNITY = auto() XFCE = auto() + LXQT = auto() class _LinuxKeyring(Enum): """ - https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/key_storage_util_linux.h + https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/sync/key_storage_util_linux.h SelectedLinuxBackend """ - KWALLET = auto() + KWALLET = auto() # KDE4 + KWALLET5 = auto() + KWALLET6 = auto() GNOMEKEYRING = auto() BASICTEXT = auto() @@ -677,7 +700,7 @@ class _LinuxKeyring(Enum): SUPPORTED_KEYRINGS = _LinuxKeyring.__members__.keys() -def _get_linux_desktop_environment(env): +def _get_linux_desktop_environment(env, logger): """ https://chromium.googlesource.com/chromium/src/+/refs/heads/main/base/nix/xdg_util.cc GetDesktopEnvironment @@ -692,51 +715,97 @@ def _get_linux_desktop_environment(env): return _LinuxDesktopEnvironment.GNOME else: return _LinuxDesktopEnvironment.UNITY + elif xdg_current_desktop == 'Deepin': + return _LinuxDesktopEnvironment.DEEPIN elif xdg_current_desktop == 'GNOME': return _LinuxDesktopEnvironment.GNOME elif xdg_current_desktop == 'X-Cinnamon': return _LinuxDesktopEnvironment.CINNAMON elif xdg_current_desktop == 'KDE': - return _LinuxDesktopEnvironment.KDE + kde_version = env.get('KDE_SESSION_VERSION', None) + if kde_version == '5': + 
return _LinuxDesktopEnvironment.KDE5 + elif kde_version == '6': + return _LinuxDesktopEnvironment.KDE6 + elif kde_version == '4': + return _LinuxDesktopEnvironment.KDE4 + else: + logger.info(f'unknown KDE version: "{kde_version}". Assuming KDE4') + return _LinuxDesktopEnvironment.KDE4 elif xdg_current_desktop == 'Pantheon': return _LinuxDesktopEnvironment.PANTHEON elif xdg_current_desktop == 'XFCE': return _LinuxDesktopEnvironment.XFCE + elif xdg_current_desktop == 'UKUI': + return _LinuxDesktopEnvironment.UKUI + elif xdg_current_desktop == 'LXQt': + return _LinuxDesktopEnvironment.LXQT + else: + logger.info(f'XDG_CURRENT_DESKTOP is set to an unknown value: "{xdg_current_desktop}"') + elif desktop_session is not None: - if desktop_session in ('mate', 'gnome'): + if desktop_session == 'deepin': + return _LinuxDesktopEnvironment.DEEPIN + elif desktop_session in ('mate', 'gnome'): return _LinuxDesktopEnvironment.GNOME - elif 'kde' in desktop_session: - return _LinuxDesktopEnvironment.KDE - elif 'xfce' in desktop_session: + elif desktop_session in ('kde4', 'kde-plasma'): + return _LinuxDesktopEnvironment.KDE4 + elif desktop_session == 'kde': + if 'KDE_SESSION_VERSION' in env: + return _LinuxDesktopEnvironment.KDE4 + else: + return _LinuxDesktopEnvironment.KDE3 + elif 'xfce' in desktop_session or desktop_session == 'xubuntu': return _LinuxDesktopEnvironment.XFCE + elif desktop_session == 'ukui': + return _LinuxDesktopEnvironment.UKUI + else: + logger.info(f'DESKTOP_SESSION is set to an unknown value: "{desktop_session}"') + else: if 'GNOME_DESKTOP_SESSION_ID' in env: return _LinuxDesktopEnvironment.GNOME elif 'KDE_FULL_SESSION' in env: - return _LinuxDesktopEnvironment.KDE + if 'KDE_SESSION_VERSION' in env: + return _LinuxDesktopEnvironment.KDE4 + else: + return _LinuxDesktopEnvironment.KDE3 return _LinuxDesktopEnvironment.OTHER def _choose_linux_keyring(logger): """ - 
https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/key_storage_util_linux.cc - SelectBackend + SelectBackend in [1] + + There is currently support for forcing chromium to use BASIC_TEXT by creating a file called + `Disable Local Encryption` [1] in the user data dir. The function to write this file (`WriteBackendUse()` [1]) + does not appear to be called anywhere other than in tests, so the user would have to create this file manually + and so would be aware enough to tell yt-dlp to use the BASIC_TEXT keyring. + + References: + - [1] https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/sync/key_storage_util_linux.cc """ - desktop_environment = _get_linux_desktop_environment(os.environ) + desktop_environment = _get_linux_desktop_environment(os.environ, logger) logger.debug(f'detected desktop environment: {desktop_environment.name}') - if desktop_environment == _LinuxDesktopEnvironment.KDE: + if desktop_environment == _LinuxDesktopEnvironment.KDE4: linux_keyring = _LinuxKeyring.KWALLET - elif desktop_environment == _LinuxDesktopEnvironment.OTHER: + elif desktop_environment == _LinuxDesktopEnvironment.KDE5: + linux_keyring = _LinuxKeyring.KWALLET5 + elif desktop_environment == _LinuxDesktopEnvironment.KDE6: + linux_keyring = _LinuxKeyring.KWALLET6 + elif desktop_environment in ( + _LinuxDesktopEnvironment.KDE3, _LinuxDesktopEnvironment.LXQT, _LinuxDesktopEnvironment.OTHER + ): linux_keyring = _LinuxKeyring.BASICTEXT else: linux_keyring = _LinuxKeyring.GNOMEKEYRING return linux_keyring -def _get_kwallet_network_wallet(logger): +def _get_kwallet_network_wallet(keyring, logger): """ The name of the wallet used to store network passwords. 
- https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/kwallet_dbus.cc + https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/sync/kwallet_dbus.cc KWalletDBus::NetworkWallet which does a dbus call to the following function: https://api.kde.org/frameworks/kwallet/html/classKWallet_1_1Wallet.html @@ -744,10 +813,22 @@ def _get_kwallet_network_wallet(logger): """ default_wallet = 'kdewallet' try: + if keyring == _LinuxKeyring.KWALLET: + service_name = 'org.kde.kwalletd' + wallet_path = '/modules/kwalletd' + elif keyring == _LinuxKeyring.KWALLET5: + service_name = 'org.kde.kwalletd5' + wallet_path = '/modules/kwalletd5' + elif keyring == _LinuxKeyring.KWALLET6: + service_name = 'org.kde.kwalletd6' + wallet_path = '/modules/kwalletd6' + else: + raise ValueError(keyring) + stdout, _, returncode = Popen.run([ 'dbus-send', '--session', '--print-reply=literal', - '--dest=org.kde.kwalletd5', - '/modules/kwalletd5', + f'--dest={service_name}', + wallet_path, 'org.kde.KWallet.networkWallet' ], text=True, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL) @@ -762,8 +843,8 @@ def _get_kwallet_network_wallet(logger): return default_wallet -def _get_kwallet_password(browser_keyring_name, logger): - logger.debug('using kwallet-query to obtain password from kwallet') +def _get_kwallet_password(browser_keyring_name, keyring, logger): + logger.debug(f'using kwallet-query to obtain password from {keyring.name}') if shutil.which('kwallet-query') is None: logger.error('kwallet-query command not found. KWallet and kwallet-query ' @@ -771,7 +852,7 @@ def _get_kwallet_password(browser_keyring_name, logger): 'included in the kwallet package for your distribution') return b'' - network_wallet = _get_kwallet_network_wallet(logger) + network_wallet = _get_kwallet_network_wallet(keyring, logger) try: stdout, _, returncode = Popen.run([ @@ -793,8 +874,9 @@ def _get_kwallet_password(browser_keyring_name, logger): # checks hasEntry. 
To verify this: # dbus-monitor "interface='org.kde.KWallet'" "type=method_return" # while starting chrome. - # this may be a bug as the intended behaviour is to generate a random password and store - # it, but that doesn't matter here. + # this was identified as a bug later and fixed in + # https://chromium.googlesource.com/chromium/src/+/bbd54702284caca1f92d656fdcadf2ccca6f4165%5E%21/#F0 + # https://chromium.googlesource.com/chromium/src/+/5463af3c39d7f5b6d11db7fbd51e38cc1974d764 return b'' else: logger.debug('password found') @@ -832,8 +914,8 @@ def _get_linux_keyring_password(browser_keyring_name, keyring, logger): keyring = _LinuxKeyring[keyring] if keyring else _choose_linux_keyring(logger) logger.debug(f'Chosen keyring: {keyring.name}') - if keyring == _LinuxKeyring.KWALLET: - return _get_kwallet_password(browser_keyring_name, logger) + if keyring in (_LinuxKeyring.KWALLET, _LinuxKeyring.KWALLET5, _LinuxKeyring.KWALLET6): + return _get_kwallet_password(browser_keyring_name, keyring, logger) elif keyring == _LinuxKeyring.GNOMEKEYRING: return _get_gnome_keyring_password(browser_keyring_name, logger) elif keyring == _LinuxKeyring.BASICTEXT: @@ -861,6 +943,10 @@ def _get_mac_keyring_password(browser_keyring_name, logger): def _get_windows_v10_key(browser_root, logger): + """ + References: + - [1] https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/sync/os_crypt_win.cc + """ path = _find_most_recently_used_file(browser_root, 'Local State', logger) if path is None: logger.error('could not find local state file') @@ -869,11 +955,13 @@ def _get_windows_v10_key(browser_root, logger): with open(path, encoding='utf8') as f: data = json.load(f) try: + # kOsCryptEncryptedKeyPrefName in [1] base64_key = data['os_crypt']['encrypted_key'] except KeyError: logger.error('no encrypted key in Local State') return None encrypted_key = base64.b64decode(base64_key) + # kDPAPIKeyPrefix in [1] prefix = b'DPAPI' if not encrypted_key.startswith(prefix): 
logger.error('invalid key') @@ -885,13 +973,15 @@ def pbkdf2_sha1(password, salt, iterations, key_length): return pbkdf2_hmac('sha1', password, salt, iterations, key_length) -def _decrypt_aes_cbc(ciphertext, key, logger, initialization_vector=b' ' * 16): - plaintext = unpad_pkcs7(aes_cbc_decrypt_bytes(ciphertext, key, initialization_vector)) - try: - return plaintext.decode() - except UnicodeDecodeError: - logger.warning('failed to decrypt cookie (AES-CBC) because UTF-8 decoding failed. Possibly the key is wrong?', only_once=True) - return None +def _decrypt_aes_cbc_multi(ciphertext, keys, logger, initialization_vector=b' ' * 16): + for key in keys: + plaintext = unpad_pkcs7(aes_cbc_decrypt_bytes(ciphertext, key, initialization_vector)) + try: + return plaintext.decode() + except UnicodeDecodeError: + pass + logger.warning('failed to decrypt cookie (AES-CBC) because UTF-8 decoding failed. Possibly the key is wrong?', only_once=True) + return None def _decrypt_aes_gcm(ciphertext, key, nonce, authentication_tag, logger): @@ -1085,3 +1175,150 @@ class LenientSimpleCookie(http.cookies.SimpleCookie): else: morsel = None + + +class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar): + """ + See [1] for cookie file format. + + 1. https://curl.haxx.se/docs/http-cookies.html + """ + _HTTPONLY_PREFIX = '#HttpOnly_' + _ENTRY_LEN = 7 + _HEADER = '''# Netscape HTTP Cookie File +# This file is generated by yt-dlp. Do not edit. 
+ +''' + _CookieFileEntry = collections.namedtuple( + 'CookieFileEntry', + ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value')) + + def __init__(self, filename=None, *args, **kwargs): + super().__init__(None, *args, **kwargs) + if is_path_like(filename): + filename = os.fspath(filename) + self.filename = filename + + @staticmethod + def _true_or_false(cndn): + return 'TRUE' if cndn else 'FALSE' + + @contextlib.contextmanager + def open(self, file, *, write=False): + if is_path_like(file): + with open(file, 'w' if write else 'r', encoding='utf-8') as f: + yield f + else: + if write: + file.truncate(0) + yield file + + def _really_save(self, f, ignore_discard, ignore_expires): + now = time.time() + for cookie in self: + if (not ignore_discard and cookie.discard + or not ignore_expires and cookie.is_expired(now)): + continue + name, value = cookie.name, cookie.value + if value is None: + # cookies.txt regards 'Set-Cookie: foo' as a cookie + # with no name, whereas http.cookiejar regards it as a + # cookie with no value. + name, value = '', name + f.write('%s\n' % '\t'.join(( + cookie.domain, + self._true_or_false(cookie.domain.startswith('.')), + cookie.path, + self._true_or_false(cookie.secure), + str_or_none(cookie.expires, default=''), + name, value + ))) + + def save(self, filename=None, ignore_discard=True, ignore_expires=True): + """ + Save cookies to a file. 
+ Code is taken from CPython 3.6 + https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """ + + if filename is None: + if self.filename is not None: + filename = self.filename + else: + raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT) + + # Store session cookies with `expires` set to 0 instead of an empty string + for cookie in self: + if cookie.expires is None: + cookie.expires = 0 + + with self.open(filename, write=True) as f: + f.write(self._HEADER) + self._really_save(f, ignore_discard, ignore_expires) + + def load(self, filename=None, ignore_discard=True, ignore_expires=True): + """Load cookies from a file.""" + if filename is None: + if self.filename is not None: + filename = self.filename + else: + raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT) + + def prepare_line(line): + if line.startswith(self._HTTPONLY_PREFIX): + line = line[len(self._HTTPONLY_PREFIX):] + # comments and empty lines are fine + if line.startswith('#') or not line.strip(): + return line + cookie_list = line.split('\t') + if len(cookie_list) != self._ENTRY_LEN: + raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list)) + cookie = self._CookieFileEntry(*cookie_list) + if cookie.expires_at and not cookie.expires_at.isdigit(): + raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at) + return line + + cf = io.StringIO() + with self.open(filename) as f: + for line in f: + try: + cf.write(prepare_line(line)) + except http.cookiejar.LoadError as e: + if f'{line.strip()} '[0] in '[{"': + raise http.cookiejar.LoadError( + 'Cookies file must be Netscape formatted, not JSON. 
See ' + 'https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp') + write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n') + continue + cf.seek(0) + self._really_load(cf, filename, ignore_discard, ignore_expires) + # Session cookies are denoted by either `expires` field set to + # an empty string or 0. MozillaCookieJar only recognizes the former + # (see [1]). So we need force the latter to be recognized as session + # cookies on our own. + # Session cookies may be important for cookies-based authentication, + # e.g. usually, when user does not check 'Remember me' check box while + # logging in on a site, some important cookies are stored as session + # cookies so that not recognizing them will result in failed login. + # 1. https://bugs.python.org/issue17164 + for cookie in self: + # Treat `expires=0` cookies as session cookies + if cookie.expires == 0: + cookie.expires = None + cookie.discard = True + + def get_cookie_header(self, url): + """Generate a Cookie HTTP header for a given url""" + cookie_req = urllib.request.Request(normalize_url(sanitize_url(url))) + self.add_cookie_header(cookie_req) + return cookie_req.get_header('Cookie') + + def get_cookies_for_url(self, url): + """Generate a list of Cookie objects for a given url""" + # Policy `_now` attribute must be set before calling `_cookies_for_request` + # Ref: https://github.com/python/cpython/blob/3.7/Lib/http/cookiejar.py#L1360 + self._policy._now = self._now = int(time.time()) + return self._cookies_for_request(urllib.request.Request(normalize_url(sanitize_url(url)))) + + def clear(self, *args, **kwargs): + with contextlib.suppress(KeyError): + return super().clear(*args, **kwargs) diff --git a/plugins/youtube_download/yt_dlp/dependencies/Cryptodome.py b/plugins/youtube_download/yt_dlp/dependencies/Cryptodome.py index 2adc513..2cfa4c9 100644 --- a/plugins/youtube_download/yt_dlp/dependencies/Cryptodome.py +++ 
b/plugins/youtube_download/yt_dlp/dependencies/Cryptodome.py @@ -1,6 +1,3 @@ -import types - -from ..compat import functools from ..compat.compat_utils import passthrough_module try: @@ -9,22 +6,33 @@ except ImportError: try: import Crypto as _parent except (ImportError, SyntaxError): # Old Crypto gives SyntaxError in newer Python - _parent = types.ModuleType('no_Cryptodome') + _parent = passthrough_module(__name__, 'no_Cryptodome') __bool__ = lambda: False -passthrough_module(__name__, _parent, (..., '__version__')) del passthrough_module +__version__ = '' +AES = PKCS1_v1_5 = Blowfish = PKCS1_OAEP = SHA1 = CMAC = RSA = None +try: + if _parent.__name__ == 'Cryptodome': + from Cryptodome import __version__ + from Cryptodome.Cipher import AES, PKCS1_OAEP, Blowfish, PKCS1_v1_5 + from Cryptodome.Hash import CMAC, SHA1 + from Cryptodome.PublicKey import RSA + elif _parent.__name__ == 'Crypto': + from Crypto import __version__ + from Crypto.Cipher import AES, PKCS1_OAEP, Blowfish, PKCS1_v1_5 # noqa: F401 + from Crypto.Hash import CMAC, SHA1 # noqa: F401 + from Crypto.PublicKey import RSA # noqa: F401 +except ImportError: + __version__ = f'broken {__version__}'.strip() -@property -@functools.cache -def _yt_dlp__identifier(): - if _parent.__name__ == 'Crypto': - from Crypto.Cipher import AES - try: - # In pycrypto, mode defaults to ECB. See: - # https://www.pycryptodome.org/en/latest/src/vs_pycrypto.html#:~:text=not%20have%20ECB%20as%20default%20mode - AES.new(b'abcdefghijklmnop') - except TypeError: - return 'pycrypto' - return _parent.__name__ + +_yt_dlp__identifier = _parent.__name__ +if AES and _yt_dlp__identifier == 'Crypto': + try: + # In pycrypto, mode defaults to ECB. 
See: + # https://www.pycryptodome.org/en/latest/src/vs_pycrypto.html#:~:text=not%20have%20ECB%20as%20default%20mode + AES.new(b'abcdefghijklmnop') + except TypeError: + _yt_dlp__identifier = 'pycrypto' diff --git a/plugins/youtube_download/yt_dlp/dependencies/__init__.py b/plugins/youtube_download/yt_dlp/dependencies/__init__.py index c2214e6..6e7d29c 100644 --- a/plugins/youtube_download/yt_dlp/dependencies/__init__.py +++ b/plugins/youtube_download/yt_dlp/dependencies/__init__.py @@ -73,7 +73,7 @@ available_dependencies = {k: v for k, v in all_dependencies.items() if v} # Deprecated -Cryptodome_AES = Cryptodome.Cipher.AES if Cryptodome else None +Cryptodome_AES = Cryptodome.AES __all__ = [ diff --git a/plugins/youtube_download/yt_dlp/downloader/__init__.py b/plugins/youtube_download/yt_dlp/downloader/__init__.py index c34dbce..51a9f28 100644 --- a/plugins/youtube_download/yt_dlp/downloader/__init__.py +++ b/plugins/youtube_download/yt_dlp/downloader/__init__.py @@ -30,7 +30,7 @@ from .hls import HlsFD from .http import HttpFD from .ism import IsmFD from .mhtml import MhtmlFD -from .niconico import NiconicoDmcFD +from .niconico import NiconicoDmcFD, NiconicoLiveFD from .rtmp import RtmpFD from .rtsp import RtspFD from .websocket import WebSocketFragmentFD @@ -50,6 +50,7 @@ PROTOCOL_MAP = { 'ism': IsmFD, 'mhtml': MhtmlFD, 'niconico_dmc': NiconicoDmcFD, + 'niconico_live': NiconicoLiveFD, 'fc2_live': FC2LiveFD, 'websocket_frag': WebSocketFragmentFD, 'youtube_live_chat': YoutubeLiveChatFD, diff --git a/plugins/youtube_download/yt_dlp/downloader/common.py b/plugins/youtube_download/yt_dlp/downloader/common.py index 077b29b..b71d7ee 100644 --- a/plugins/youtube_download/yt_dlp/downloader/common.py +++ b/plugins/youtube_download/yt_dlp/downloader/common.py @@ -49,10 +49,10 @@ class FileDownloader: verbose: Print additional info to stdout. quiet: Do not print messages to stdout. ratelimit: Download speed limit, in bytes/sec. 
- continuedl: Attempt to continue downloads if possible throttledratelimit: Assume the download is being throttled below this speed (bytes/sec) - retries: Number of times to retry for HTTP error 5xx - file_access_retries: Number of times to retry on file access error + retries: Number of times to retry for expected network errors. + Default is 0 for API, but 10 for CLI + file_access_retries: Number of times to retry on file access error (default: 3) buffersize: Size of download buffer in bytes. noresizebuffer: Do not automatically resize the download buffer. continuedl: Try to continue downloads if possible. @@ -138,17 +138,21 @@ class FileDownloader: def format_percent(percent): return ' N/A%' if percent is None else f'{percent:>5.1f}%' - @staticmethod - def calc_eta(start, now, total, current): + @classmethod + def calc_eta(cls, start_or_rate, now_or_remaining, total=NO_DEFAULT, current=NO_DEFAULT): + if total is NO_DEFAULT: + rate, remaining = start_or_rate, now_or_remaining + if None in (rate, remaining): + return None + return int(float(remaining) / rate) + + start, now = start_or_rate, now_or_remaining if total is None: return None if now is None: now = time.time() - dif = now - start - if current == 0 or dif < 0.001: # One millisecond - return None - rate = float(current) / dif - return int((float(total) - float(current)) / rate) + rate = cls.calc_speed(start, now, current) + return rate and int((float(total) - float(current)) / rate) @staticmethod def calc_speed(start, now, bytes): @@ -165,6 +169,12 @@ class FileDownloader: def format_retries(retries): return 'inf' if retries == float('inf') else int(retries) + @staticmethod + def filesize_or_none(unencoded_filename): + if os.path.isfile(unencoded_filename): + return os.path.getsize(unencoded_filename) + return 0 + @staticmethod def best_block_size(elapsed_time, bytes): new_min = max(bytes / 2.0, 1.0) @@ -225,7 +235,7 @@ class FileDownloader: sleep_func=fd.params.get('retry_sleep_functions', 
{}).get('file_access')) def wrapper(self, func, *args, **kwargs): - for retry in RetryManager(self.params.get('file_access_retries'), error_callback, fd=self): + for retry in RetryManager(self.params.get('file_access_retries', 3), error_callback, fd=self): try: return func(self, *args, **kwargs) except OSError as err: @@ -245,7 +255,8 @@ class FileDownloader: @wrap_file_access('remove') def try_remove(self, filename): - os.remove(filename) + if os.path.isfile(filename): + os.remove(filename) @wrap_file_access('rename') def try_rename(self, old_filename, new_filename): @@ -285,7 +296,8 @@ class FileDownloader: self._multiline = BreaklineStatusPrinter(self.ydl._out_files.out, lines) else: self._multiline = MultilinePrinter(self.ydl._out_files.out, lines, not self.params.get('quiet')) - self._multiline.allow_colors = self._multiline._HAVE_FULLCAP and not self.params.get('no_color') + self._multiline.allow_colors = self.ydl._allow_colors.out and self.ydl._allow_colors.out != 'no_color' + self._multiline._HAVE_FULLCAP = self.ydl._allow_colors.out def _finish_multiline_status(self): self._multiline.end() @@ -407,7 +419,6 @@ class FileDownloader: """Download to a filename using the info from info_dict Return True on success and False otherwise """ - nooverwrites_and_exists = ( not self.params.get('overwrites', True) and os.path.exists(encodeFilename(filename)) diff --git a/plugins/youtube_download/yt_dlp/downloader/external.py b/plugins/youtube_download/yt_dlp/downloader/external.py index 5f54017..4ce8a3b 100644 --- a/plugins/youtube_download/yt_dlp/downloader/external.py +++ b/plugins/youtube_download/yt_dlp/downloader/external.py @@ -1,14 +1,16 @@ import enum import json -import os.path +import os import re import subprocess import sys +import tempfile import time import uuid from .fragment import FragmentFD from ..compat import functools +from ..networking import Request from ..postprocessor.ffmpeg import EXT_TO_OUT_FORMATS, FFmpegPostProcessor from ..utils import ( 
Popen, @@ -23,9 +25,7 @@ from ..utils import ( encodeArgument, encodeFilename, find_available_port, - handle_youtubedl_headers, remove_end, - sanitized_Request, traverse_obj, ) @@ -43,6 +43,7 @@ class ExternalFD(FragmentFD): def real_download(self, filename, info_dict): self.report_destination(filename) tmpfilename = self.temp_name(filename) + self._cookies_tempfile = None try: started = time.time() @@ -55,6 +56,9 @@ class ExternalFD(FragmentFD): # should take place retval = 0 self.to_screen('[%s] Interrupted by user' % self.get_basename()) + finally: + if self._cookies_tempfile: + self.try_remove(self._cookies_tempfile) if retval == 0: status = { @@ -126,6 +130,16 @@ class ExternalFD(FragmentFD): self.get_basename(), self.params.get('external_downloader_args'), self.EXE_NAME, keys, *args, **kwargs) + def _write_cookies(self): + if not self.ydl.cookiejar.filename: + tmp_cookies = tempfile.NamedTemporaryFile(suffix='.cookies', delete=False) + tmp_cookies.close() + self._cookies_tempfile = tmp_cookies.name + self.to_screen(f'[download] Writing temporary cookies file to "{self._cookies_tempfile}"') + # real_download resets _cookies_tempfile; if it's None then save() will write to cookiejar.filename + self.ydl.cookiejar.save(self._cookies_tempfile) + return self.ydl.cookiejar.filename or self._cookies_tempfile + def _call_downloader(self, tmpfilename, info_dict): """ Either overwrite this or implement _make_cmd """ cmd = [encodeArgument(a) for a in self._make_cmd(tmpfilename, info_dict)] @@ -176,7 +190,7 @@ class ExternalFD(FragmentFD): return 0 def _call_process(self, cmd, info_dict): - return Popen.run(cmd, text=True, stderr=subprocess.PIPE) + return Popen.run(cmd, text=True, stderr=subprocess.PIPE if self._CAPTURE_STDERR else None) class CurlFD(ExternalFD): @@ -185,6 +199,9 @@ class CurlFD(ExternalFD): def _make_cmd(self, tmpfilename, info_dict): cmd = [self.exe, '--location', '-o', tmpfilename, '--compressed'] + cookie_header = 
self.ydl.cookiejar.get_cookie_header(info_dict['url']) + if cookie_header: + cmd += ['--cookie', cookie_header] if info_dict.get('http_headers') is not None: for key, val in info_dict['http_headers'].items(): cmd += ['--header', f'{key}: {val}'] @@ -215,6 +232,9 @@ class AxelFD(ExternalFD): if info_dict.get('http_headers') is not None: for key, val in info_dict['http_headers'].items(): cmd += ['-H', f'{key}: {val}'] + cookie_header = self.ydl.cookiejar.get_cookie_header(info_dict['url']) + if cookie_header: + cmd += ['-H', f'Cookie: {cookie_header}', '--max-redirect=0'] cmd += self._configuration_args() cmd += ['--', info_dict['url']] return cmd @@ -224,7 +244,9 @@ class WgetFD(ExternalFD): AVAILABLE_OPT = '--version' def _make_cmd(self, tmpfilename, info_dict): - cmd = [self.exe, '-O', tmpfilename, '-nv', '--no-cookies', '--compression=auto'] + cmd = [self.exe, '-O', tmpfilename, '-nv', '--compression=auto'] + if self.ydl.cookiejar.get_cookie_header(info_dict['url']): + cmd += ['--load-cookies', self._write_cookies()] if info_dict.get('http_headers') is not None: for key, val in info_dict['http_headers'].items(): cmd += ['--header', f'{key}: {val}'] @@ -272,7 +294,7 @@ class Aria2cFD(ExternalFD): return super()._call_downloader(tmpfilename, info_dict) def _make_cmd(self, tmpfilename, info_dict): - cmd = [self.exe, '-c', + cmd = [self.exe, '-c', '--no-conf', '--console-log-level=warn', '--summary-interval=0', '--download-result=hide', '--http-accept-gzip=true', '--file-allocation=none', '-x16', '-j16', '-s16'] if 'fragments' in info_dict: @@ -280,6 +302,8 @@ class Aria2cFD(ExternalFD): else: cmd += ['--min-split-size', '1M'] + if self.ydl.cookiejar.get_cookie_header(info_dict['url']): + cmd += [f'--load-cookies={self._write_cookies()}'] if info_dict.get('http_headers') is not None: for key, val in info_dict['http_headers'].items(): cmd += ['--header', f'{key}: {val}'] @@ -334,13 +358,12 @@ class Aria2cFD(ExternalFD): 'method': method, 'params': 
[f'token:{rpc_secret}', *params], }).encode('utf-8') - request = sanitized_Request( + request = Request( f'http://localhost:{rpc_port}/jsonrpc', data=d, headers={ 'Content-Type': 'application/json', 'Content-Length': f'{len(d)}', - 'Ytdl-request-proxy': '__noproxy__', - }) + }, proxies={'all': None}) with self.ydl.urlopen(request) as r: resp = json.load(r) assert resp.get('id') == sanitycheck, 'Something went wrong with RPC server' @@ -418,6 +441,14 @@ class HttpieFD(ExternalFD): if info_dict.get('http_headers') is not None: for key, val in info_dict['http_headers'].items(): cmd += [f'{key}:{val}'] + + # httpie 3.1.0+ removes the Cookie header on redirect, so this should be safe for now. [1] + # If we ever need cookie handling for redirects, we can export the cookiejar into a session. [2] + # 1: https://github.com/httpie/httpie/security/advisories/GHSA-9w4w-cpc8-h2fq + # 2: https://httpie.io/docs/cli/sessions + cookie_header = self.ydl.cookiejar.get_cookie_header(info_dict['url']) + if cookie_header: + cmd += [f'Cookie:{cookie_header}'] return cmd @@ -528,11 +559,16 @@ class FFmpegFD(ExternalFD): selected_formats = info_dict.get('requested_formats') or [info_dict] for i, fmt in enumerate(selected_formats): - if fmt.get('http_headers') and re.match(r'^https?://', fmt['url']): - headers_dict = handle_youtubedl_headers(fmt['http_headers']) + is_http = re.match(r'^https?://', fmt['url']) + cookies = self.ydl.cookiejar.get_cookies_for_url(fmt['url']) if is_http else [] + if cookies: + args.extend(['-cookies', ''.join( + f'{cookie.name}={cookie.value}; path={cookie.path}; domain={cookie.domain};\r\n' + for cookie in cookies)]) + if fmt.get('http_headers') and is_http: # Trailing \r\n after each HTTP header is important to prevent warning from ffmpeg/avconv: # [http @ 00000000003d2fa0] No trailing CRLF found in HTTP header. 
- args.extend(['-headers', ''.join(f'{key}: {val}\r\n' for key, val in headers_dict.items())]) + args.extend(['-headers', ''.join(f'{key}: {val}\r\n' for key, val in fmt['http_headers'].items())]) if start_time: args += ['-ss', str(start_time)] diff --git a/plugins/youtube_download/yt_dlp/downloader/f4m.py b/plugins/youtube_download/yt_dlp/downloader/f4m.py index 306f921..28cbba0 100644 --- a/plugins/youtube_download/yt_dlp/downloader/f4m.py +++ b/plugins/youtube_download/yt_dlp/downloader/f4m.py @@ -3,11 +3,11 @@ import io import itertools import struct import time -import urllib.error import urllib.parse from .fragment import FragmentFD from ..compat import compat_etree_fromstring +from ..networking.exceptions import HTTPError from ..utils import fix_xml_ampersands, xpath_text @@ -312,7 +312,7 @@ class F4mFD(FragmentFD): self.to_screen('[%s] Downloading f4m manifest' % self.FD_NAME) urlh = self.ydl.urlopen(self._prepare_url(info_dict, man_url)) - man_url = urlh.geturl() + man_url = urlh.url # Some manifests may be malformed, e.g. prosiebensat1 generated manifests # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244 # and https://github.com/ytdl-org/youtube-dl/issues/7823) @@ -407,8 +407,8 @@ class F4mFD(FragmentFD): if box_type == b'mdat': self._append_fragment(ctx, box_data) break - except urllib.error.HTTPError as err: - if live and (err.code == 404 or err.code == 410): + except HTTPError as err: + if live and (err.status == 404 or err.status == 410): # We didn't keep up with the live window. Continue # with the next available fragment. 
msg = 'Fragment %d unavailable' % frag_i diff --git a/plugins/youtube_download/yt_dlp/downloader/fragment.py b/plugins/youtube_download/yt_dlp/downloader/fragment.py index 039cb14..b4b680d 100644 --- a/plugins/youtube_download/yt_dlp/downloader/fragment.py +++ b/plugins/youtube_download/yt_dlp/downloader/fragment.py @@ -1,24 +1,19 @@ import concurrent.futures import contextlib -import http.client import json import math import os import struct import time -import urllib.error from .common import FileDownloader from .http import HttpFD from ..aes import aes_cbc_decrypt_bytes, unpad_pkcs7 from ..compat import compat_os_name -from ..utils import ( - DownloadError, - RetryManager, - encodeFilename, - sanitized_Request, - traverse_obj, -) +from ..networking import Request +from ..networking.exceptions import HTTPError, IncompleteRead +from ..utils import DownloadError, RetryManager, encodeFilename, traverse_obj +from ..utils.networking import HTTPHeaderDict class HttpQuietDownloader(HttpFD): @@ -34,8 +29,8 @@ class FragmentFD(FileDownloader): Available options: - fragment_retries: Number of times to retry a fragment for HTTP error (DASH - and hlsnative only) + fragment_retries: Number of times to retry a fragment for HTTP error + (DASH and hlsnative only). 
Default is 0 for API, but 10 for CLI skip_unavailable_fragments: Skip unavailable fragments (DASH and hlsnative only) keep_fragments: Keep downloaded fragments on disk after downloading is @@ -75,7 +70,7 @@ class FragmentFD(FileDownloader): def _prepare_url(self, info_dict, url): headers = info_dict.get('http_headers') - return sanitized_Request(url, None, headers) if headers else url + return Request(url, None, headers) if headers else url def _prepare_and_start_frag_download(self, ctx, info_dict): self._prepare_frag_download(ctx) @@ -121,6 +116,11 @@ class FragmentFD(FileDownloader): 'request_data': request_data, 'ctx_id': ctx.get('ctx_id'), } + frag_resume_len = 0 + if ctx['dl'].params.get('continuedl', True): + frag_resume_len = self.filesize_or_none(self.temp_name(fragment_filename)) + fragment_info_dict['frag_resume_len'] = ctx['frag_resume_len'] = frag_resume_len + success, _ = ctx['dl'].download(fragment_filename, fragment_info_dict) if not success: return False @@ -155,9 +155,7 @@ class FragmentFD(FileDownloader): del ctx['fragment_filename_sanitized'] def _prepare_frag_download(self, ctx): - if 'live' not in ctx: - ctx['live'] = False - if not ctx['live']: + if not ctx.setdefault('live', False): total_frags_str = '%d' % ctx['total_frags'] ad_frags = ctx.get('ad_frags', 0) if ad_frags: @@ -170,15 +168,17 @@ class FragmentFD(FileDownloader): **self.params, 'noprogress': True, 'test': False, + 'sleep_interval': 0, + 'max_sleep_interval': 0, + 'sleep_interval_subtitles': 0, }) tmpfilename = self.temp_name(ctx['filename']) open_mode = 'wb' - resume_len = 0 # Establish possible resume length - if os.path.isfile(encodeFilename(tmpfilename)): + resume_len = self.filesize_or_none(tmpfilename) + if resume_len > 0: open_mode = 'ab' - resume_len = os.path.getsize(encodeFilename(tmpfilename)) # Should be initialized before ytdl file check ctx.update({ @@ -187,7 +187,9 @@ class FragmentFD(FileDownloader): }) if self.__do_ytdl_file(ctx): - if 
os.path.isfile(encodeFilename(self.ytdl_filename(ctx['filename']))): + ytdl_file_exists = os.path.isfile(encodeFilename(self.ytdl_filename(ctx['filename']))) + continuedl = self.params.get('continuedl', True) + if continuedl and ytdl_file_exists: self._read_ytdl_file(ctx) is_corrupt = ctx.get('ytdl_corrupt') is True is_inconsistent = ctx['fragment_index'] > 0 and resume_len == 0 @@ -201,7 +203,12 @@ class FragmentFD(FileDownloader): if 'ytdl_corrupt' in ctx: del ctx['ytdl_corrupt'] self._write_ytdl_file(ctx) + else: + if not continuedl: + if ytdl_file_exists: + self._read_ytdl_file(ctx) + ctx['fragment_index'] = resume_len = 0 self._write_ytdl_file(ctx) assert ctx['fragment_index'] == 0 @@ -274,12 +281,10 @@ class FragmentFD(FileDownloader): else: frag_downloaded_bytes = s['downloaded_bytes'] state['downloaded_bytes'] += frag_downloaded_bytes - ctx['prev_frag_downloaded_bytes'] - if not ctx['live']: - state['eta'] = self.calc_eta( - start, time_now, estimated_size - resume_len, - state['downloaded_bytes'] - resume_len) ctx['speed'] = state['speed'] = self.calc_speed( - ctx['fragment_started'], time_now, frag_downloaded_bytes) + ctx['fragment_started'], time_now, frag_downloaded_bytes - ctx.get('frag_resume_len', 0)) + if not ctx['live']: + state['eta'] = self.calc_eta(state['speed'], estimated_size - state['downloaded_bytes']) ctx['prev_frag_downloaded_bytes'] = frag_downloaded_bytes self._hook_progress(state, info_dict) @@ -290,14 +295,12 @@ class FragmentFD(FileDownloader): def _finish_frag_download(self, ctx, info_dict): ctx['dest_stream'].close() if self.__do_ytdl_file(ctx): - ytdl_filename = encodeFilename(self.ytdl_filename(ctx['filename'])) - if os.path.isfile(ytdl_filename): - self.try_remove(ytdl_filename) + self.try_remove(self.ytdl_filename(ctx['filename'])) elapsed = time.time() - ctx['started'] to_file = ctx['tmpfilename'] != '-' if to_file: - downloaded_bytes = os.path.getsize(encodeFilename(ctx['tmpfilename'])) + downloaded_bytes = 
self.filesize_or_none(ctx['tmpfilename']) else: downloaded_bytes = ctx['complete_frags_downloaded_bytes'] @@ -449,7 +452,7 @@ class FragmentFD(FileDownloader): frag_index = ctx['fragment_index'] = fragment['frag_index'] ctx['last_error'] = None - headers = info_dict.get('http_headers', {}).copy() + headers = HTTPHeaderDict(info_dict.get('http_headers')) byte_range = fragment.get('byte_range') if byte_range: headers['Range'] = 'bytes=%d-%d' % (byte_range['start'], byte_range['end'] - 1) @@ -466,9 +469,10 @@ class FragmentFD(FileDownloader): for retry in RetryManager(self.params.get('fragment_retries'), error_callback): try: ctx['fragment_count'] = fragment.get('fragment_count') - if not self._download_fragment(ctx, fragment['url'], info_dict, headers): + if not self._download_fragment( + ctx, fragment['url'], info_dict, headers, info_dict.get('request_data')): return - except (urllib.error.HTTPError, http.client.IncompleteRead) as err: + except (HTTPError, IncompleteRead) as err: retry.error = err continue except DownloadError: # has own retry settings @@ -496,7 +500,7 @@ class FragmentFD(FileDownloader): download_fragment(fragment, ctx_copy) return fragment, fragment['frag_index'], ctx_copy.get('fragment_filename_sanitized') - self.report_warning('The download speed shown is only of one thread. This is a known issue and patches are welcome') + self.report_warning('The download speed shown is only of one thread. 
This is a known issue') with tpe or concurrent.futures.ThreadPoolExecutor(max_workers) as pool: try: for fragment, frag_index, frag_filename in pool.map(_download_fragment, fragments): diff --git a/plugins/youtube_download/yt_dlp/downloader/hls.py b/plugins/youtube_download/yt_dlp/downloader/hls.py index 29d6f62..d4b3f03 100644 --- a/plugins/youtube_download/yt_dlp/downloader/hls.py +++ b/plugins/youtube_download/yt_dlp/downloader/hls.py @@ -28,7 +28,16 @@ class HlsFD(FragmentFD): FD_NAME = 'hlsnative' @staticmethod - def can_download(manifest, info_dict, allow_unplayable_formats=False): + def _has_drm(manifest): # TODO: https://github.com/yt-dlp/yt-dlp/pull/5039 + return bool(re.search('|'.join(( + r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', # Apple FairPlay + r'#EXT-X-(?:SESSION-)?KEY:.*?KEYFORMAT="com\.apple\.streamingkeydelivery"', # Apple FairPlay + r'#EXT-X-(?:SESSION-)?KEY:.*?KEYFORMAT="com\.microsoft\.playready"', # Microsoft PlayReady + r'#EXT-X-FAXS-CM:', # Adobe Flash Access + )), manifest)) + + @classmethod + def can_download(cls, manifest, info_dict, allow_unplayable_formats=False): UNSUPPORTED_FEATURES = [ # r'#EXT-X-BYTERANGE', # playlists composed of byte ranges of media files [2] @@ -50,13 +59,15 @@ class HlsFD(FragmentFD): ] if not allow_unplayable_formats: UNSUPPORTED_FEATURES += [ - r'#EXT-X-KEY:METHOD=(?!NONE|AES-128)', # encrypted streams [1] + r'#EXT-X-KEY:METHOD=(?!NONE|AES-128)', # encrypted streams [1], but not necessarily DRM ] def check_results(): yield not info_dict.get('is_live') for feature in UNSUPPORTED_FEATURES: yield not re.search(feature, manifest) + if not allow_unplayable_formats: + yield not cls._has_drm(manifest) return all(check_results()) def real_download(self, filename, info_dict): @@ -64,13 +75,13 @@ class HlsFD(FragmentFD): self.to_screen('[%s] Downloading m3u8 manifest' % self.FD_NAME) urlh = self.ydl.urlopen(self._prepare_url(info_dict, man_url)) - man_url = urlh.geturl() + man_url = urlh.url s = 
urlh.read().decode('utf-8', 'ignore') can_download, message = self.can_download(s, info_dict, self.params.get('allow_unplayable_formats')), None if can_download: has_ffmpeg = FFmpegFD.available() - no_crypto = not Cryptodome and '#EXT-X-KEY:METHOD=AES-128' in s + no_crypto = not Cryptodome.AES and '#EXT-X-KEY:METHOD=AES-128' in s if no_crypto and has_ffmpeg: can_download, message = False, 'The stream has AES-128 encryption and pycryptodomex is not available' elif no_crypto: @@ -81,14 +92,13 @@ class HlsFD(FragmentFD): message = ('Live HLS streams are not supported by the native downloader. If this is a livestream, ' f'please {install_ffmpeg}add "--downloader ffmpeg --hls-use-mpegts" to your command') if not can_download: - has_drm = re.search('|'.join([ - r'#EXT-X-FAXS-CM:', # Adobe Flash Access - r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', # Apple FairPlay - ]), s) - if has_drm and not self.params.get('allow_unplayable_formats'): - self.report_error( - 'This video is DRM protected; Try selecting another format with --format or ' - 'add --check-formats to automatically fallback to the next best format') + if self._has_drm(s) and not self.params.get('allow_unplayable_formats'): + if info_dict.get('has_drm') and self.params.get('test'): + self.to_screen(f'[{self.FD_NAME}] This format is DRM protected', skip_eol=True) + else: + self.report_error( + 'This format is DRM protected; Try selecting another format with --format or ' + 'add --check-formats to automatically fallback to the next best format', tb=False) return False message = message or 'Unsupported features have been detected' fd = FFmpegFD(self.ydl, self.params) diff --git a/plugins/youtube_download/yt_dlp/downloader/http.py b/plugins/youtube_download/yt_dlp/downloader/http.py index 95c870e..f523744 100644 --- a/plugins/youtube_download/yt_dlp/downloader/http.py +++ b/plugins/youtube_download/yt_dlp/downloader/http.py @@ -1,12 +1,14 @@ -import http.client import os import random -import socket -import ssl 
import time -import urllib.error from .common import FileDownloader +from ..networking import Request +from ..networking.exceptions import ( + CertificateVerifyError, + HTTPError, + TransportError, +) from ..utils import ( ContentTooShortError, RetryManager, @@ -16,18 +18,10 @@ from ..utils import ( encodeFilename, int_or_none, parse_http_range, - sanitized_Request, try_call, write_xattr, ) - -RESPONSE_READ_EXCEPTIONS = ( - TimeoutError, - socket.timeout, # compat: py < 3.10 - ConnectionError, - ssl.SSLError, - http.client.HTTPException -) +from ..utils.networking import HTTPHeaderDict class HttpFD(FileDownloader): @@ -45,11 +39,8 @@ class HttpFD(FileDownloader): ctx.tmpfilename = self.temp_name(filename) ctx.stream = None - # Do not include the Accept-Encoding header - headers = {'Youtubedl-no-compression': 'True'} - add_headers = info_dict.get('http_headers') - if add_headers: - headers.update(add_headers) + # Disable compression + headers = HTTPHeaderDict({'Accept-Encoding': 'identity'}, info_dict.get('http_headers')) is_test = self.params.get('test', False) chunk_size = self._TEST_FILE_SIZE if is_test else ( @@ -120,10 +111,10 @@ class HttpFD(FileDownloader): if try_call(lambda: range_end >= ctx.content_len): range_end = ctx.content_len - 1 - request = sanitized_Request(url, request_data, headers) + request = Request(url, request_data, headers) has_range = range_start is not None if has_range: - request.add_header('Range', f'bytes={int(range_start)}-{int_or_none(range_end) or ""}') + request.headers['Range'] = f'bytes={int(range_start)}-{int_or_none(range_end) or ""}' # Establish connection try: ctx.data = self.ydl.urlopen(request) @@ -150,20 +141,21 @@ class HttpFD(FileDownloader): # Content-Range is either not present or invalid. 
Assuming remote webserver is # trying to send the whole file, resume is not possible, so wiping the local file # and performing entire redownload - self.report_unable_to_resume() + elif range_start > 0: + self.report_unable_to_resume() ctx.resume_len = 0 ctx.open_mode = 'wb' - ctx.data_len = ctx.content_len = int_or_none(ctx.data.info().get('Content-length', None)) - except urllib.error.HTTPError as err: - if err.code == 416: + ctx.data_len = ctx.content_len = int_or_none(ctx.data.headers.get('Content-length', None)) + except HTTPError as err: + if err.status == 416: # Unable to resume (requested range not satisfiable) try: # Open the connection again without the range header ctx.data = self.ydl.urlopen( - sanitized_Request(url, request_data, headers)) - content_length = ctx.data.info()['Content-Length'] - except urllib.error.HTTPError as err: - if err.code < 500 or err.code >= 600: + Request(url, request_data, headers)) + content_length = ctx.data.headers['Content-Length'] + except HTTPError as err: + if err.status < 500 or err.status >= 600: raise else: # Examine the reported length @@ -191,17 +183,13 @@ class HttpFD(FileDownloader): ctx.resume_len = 0 ctx.open_mode = 'wb' return - elif err.code < 500 or err.code >= 600: + elif err.status < 500 or err.status >= 600: # Unexpected HTTP error raise raise RetryDownload(err) - except urllib.error.URLError as err: - if isinstance(err.reason, ssl.CertificateError): - raise - raise RetryDownload(err) - # In urllib.request.AbstractHTTPHandler, the response is partially read on request. 
- # Any errors that occur during this will not be wrapped by URLError - except RESPONSE_READ_EXCEPTIONS as err: + except CertificateVerifyError: + raise + except TransportError as err: raise RetryDownload(err) def close_stream(): @@ -211,7 +199,12 @@ class HttpFD(FileDownloader): ctx.stream = None def download(): - data_len = ctx.data.info().get('Content-length', None) + data_len = ctx.data.headers.get('Content-length') + + if ctx.data.headers.get('Content-encoding'): + # Content-encoding is present, Content-length is not reliable anymore as we are + # doing auto decompression. (See: https://github.com/yt-dlp/yt-dlp/pull/6176) + data_len = None # Range HTTP header may be ignored/unsupported by a webserver # (e.g. extractor/scivee.py, extractor/bambuser.py). @@ -252,7 +245,7 @@ class HttpFD(FileDownloader): try: # Download and write data_block = ctx.data.read(block_size if not is_test else min(block_size, data_len - byte_counter)) - except RESPONSE_READ_EXCEPTIONS as err: + except TransportError as err: retry(err) byte_counter += len(data_block) @@ -333,15 +326,15 @@ class HttpFD(FileDownloader): elif speed: ctx.throttle_start = None - if not is_test and ctx.chunk_size and ctx.content_len is not None and byte_counter < ctx.content_len: - ctx.resume_len = byte_counter - # ctx.block_size = block_size - raise NextFragment() - if ctx.stream is None: self.to_stderr('\n') self.report_error('Did not get any data blocks') return False + + if not is_test and ctx.chunk_size and ctx.content_len is not None and byte_counter < ctx.content_len: + ctx.resume_len = byte_counter + raise NextFragment() + if ctx.tmpfilename != '-': ctx.stream.close() @@ -353,7 +346,7 @@ class HttpFD(FileDownloader): # Update file modification time if self.params.get('updatetime', True): - info_dict['filetime'] = self.try_utime(ctx.filename, ctx.data.info().get('last-modified', None)) + info_dict['filetime'] = self.try_utime(ctx.filename, ctx.data.headers.get('last-modified', None)) 
self._hook_progress({ 'downloaded_bytes': byte_counter, diff --git a/plugins/youtube_download/yt_dlp/downloader/ism.py b/plugins/youtube_download/yt_dlp/downloader/ism.py index a157a8a..dd688f5 100644 --- a/plugins/youtube_download/yt_dlp/downloader/ism.py +++ b/plugins/youtube_download/yt_dlp/downloader/ism.py @@ -2,9 +2,9 @@ import binascii import io import struct import time -import urllib.error from .fragment import FragmentFD +from ..networking.exceptions import HTTPError from ..utils import RetryManager u8 = struct.Struct('>B') @@ -271,7 +271,7 @@ class IsmFD(FragmentFD): write_piff_header(ctx['dest_stream'], info_dict['_download_params']) extra_state['ism_track_written'] = True self._append_fragment(ctx, frag_content) - except urllib.error.HTTPError as err: + except HTTPError as err: retry.error = err continue diff --git a/plugins/youtube_download/yt_dlp/downloader/niconico.py b/plugins/youtube_download/yt_dlp/downloader/niconico.py index 77ed39e..5720f6e 100644 --- a/plugins/youtube_download/yt_dlp/downloader/niconico.py +++ b/plugins/youtube_download/yt_dlp/downloader/niconico.py @@ -1,8 +1,12 @@ +import json import threading +import time from . 
import get_suitable_downloader from .common import FileDownloader -from ..utils import sanitized_Request +from .external import FFmpegFD +from ..networking import Request +from ..utils import DownloadError, WebSocketsWrapper, str_or_none, try_get class NiconicoDmcFD(FileDownloader): @@ -24,7 +28,7 @@ class NiconicoDmcFD(FileDownloader): heartbeat_data = heartbeat_info_dict['data'].encode() heartbeat_interval = heartbeat_info_dict.get('interval', 30) - request = sanitized_Request(heartbeat_url, heartbeat_data) + request = Request(heartbeat_url, heartbeat_data) def heartbeat(): try: @@ -50,3 +54,93 @@ class NiconicoDmcFD(FileDownloader): timer[0].cancel() download_complete = True return success + + +class NiconicoLiveFD(FileDownloader): + """ Downloads niconico live without being stopped """ + + def real_download(self, filename, info_dict): + video_id = info_dict['video_id'] + ws_url = info_dict['url'] + ws_extractor = info_dict['ws'] + ws_origin_host = info_dict['origin'] + cookies = info_dict.get('cookies') + live_quality = info_dict.get('live_quality', 'high') + live_latency = info_dict.get('live_latency', 'high') + dl = FFmpegFD(self.ydl, self.params or {}) + + new_info_dict = info_dict.copy() + new_info_dict.update({ + 'protocol': 'm3u8', + }) + + def communicate_ws(reconnect): + if reconnect: + ws = WebSocketsWrapper(ws_url, { + 'Cookies': str_or_none(cookies) or '', + 'Origin': f'https://{ws_origin_host}', + 'Accept': '*/*', + 'User-Agent': self.params['http_headers']['User-Agent'], + }) + if self.ydl.params.get('verbose', False): + self.to_screen('[debug] Sending startWatching request') + ws.send(json.dumps({ + 'type': 'startWatching', + 'data': { + 'stream': { + 'quality': live_quality, + 'protocol': 'hls+fmp4', + 'latency': live_latency, + 'chasePlay': False + }, + 'room': { + 'protocol': 'webSocket', + 'commentable': True + }, + 'reconnect': True, + } + })) + else: + ws = ws_extractor + with ws: + while True: + recv = ws.recv() + if not recv: + continue + 
data = json.loads(recv) + if not data or not isinstance(data, dict): + continue + if data.get('type') == 'ping': + # pong back + ws.send(r'{"type":"pong"}') + ws.send(r'{"type":"keepSeat"}') + elif data.get('type') == 'disconnect': + self.write_debug(data) + return True + elif data.get('type') == 'error': + self.write_debug(data) + message = try_get(data, lambda x: x['body']['code'], str) or recv + return DownloadError(message) + elif self.ydl.params.get('verbose', False): + if len(recv) > 100: + recv = recv[:100] + '...' + self.to_screen('[debug] Server said: %s' % recv) + + def ws_main(): + reconnect = False + while True: + try: + ret = communicate_ws(reconnect) + if ret is True: + return + except BaseException as e: + self.to_screen('[%s] %s: Connection error occured, reconnecting after 10 seconds: %s' % ('niconico:live', video_id, str_or_none(e))) + time.sleep(10) + continue + finally: + reconnect = True + + thread = threading.Thread(target=ws_main, daemon=True) + thread.start() + + return dl.download(filename, new_info_dict) diff --git a/plugins/youtube_download/yt_dlp/downloader/youtube_live_chat.py b/plugins/youtube_download/yt_dlp/downloader/youtube_live_chat.py index 5928fec..c7a8637 100644 --- a/plugins/youtube_download/yt_dlp/downloader/youtube_live_chat.py +++ b/plugins/youtube_download/yt_dlp/downloader/youtube_live_chat.py @@ -1,8 +1,8 @@ import json import time -import urllib.error from .fragment import FragmentFD +from ..networking.exceptions import HTTPError from ..utils import ( RegexNotFoundError, RetryManager, @@ -10,6 +10,7 @@ from ..utils import ( int_or_none, try_get, ) +from ..utils.networking import HTTPHeaderDict class YoutubeLiveChatFD(FragmentFD): @@ -37,10 +38,7 @@ class YoutubeLiveChatFD(FragmentFD): start_time = int(time.time() * 1000) def dl_fragment(url, data=None, headers=None): - http_headers = info_dict.get('http_headers', {}) - if headers: - http_headers = http_headers.copy() - http_headers.update(headers) + http_headers = 
HTTPHeaderDict(info_dict.get('http_headers'), headers) return self._download_fragment(ctx, url, info_dict, http_headers, data) def parse_actions_replay(live_chat_continuation): @@ -129,7 +127,7 @@ class YoutubeLiveChatFD(FragmentFD): or frag_index == 1 and try_refresh_replay_beginning or parse_actions_replay) return (True, *func(live_chat_continuation)) - except urllib.error.HTTPError as err: + except HTTPError as err: retry.error = err continue return False, None, None, None diff --git a/plugins/youtube_download/yt_dlp/extractor/_extractors.py b/plugins/youtube_download/yt_dlp/extractor/_extractors.py index 061a25a..63bb55e 100644 --- a/plugins/youtube_download/yt_dlp/extractor/_extractors.py +++ b/plugins/youtube_download/yt_dlp/extractor/_extractors.py @@ -15,7 +15,6 @@ from .youtube import ( # Youtube is moved to the top to improve performance YoutubeSearchURLIE, YoutubeMusicSearchURLIE, YoutubeSubscriptionsIE, - YoutubeStoriesIE, YoutubeTruncatedIDIE, YoutubeTruncatedURLIE, YoutubeYtBeIE, @@ -102,6 +101,7 @@ from .americastestkitchen import ( AmericasTestKitchenIE, AmericasTestKitchenSeasonIE, ) +from .anchorfm import AnchorFMEpisodeIE from .angel import AngelIE from .anvato import AnvatoIE from .aol import AolIE @@ -203,13 +203,18 @@ from .bfmtv import ( BFMTVLiveIE, BFMTVArticleIE, ) -from .bibeltv import BibelTVIE +from .bibeltv import ( + BibelTVLiveIE, + BibelTVSeriesIE, + BibelTVVideoIE, +) from .bigflix import BigflixIE from .bigo import BigoIE from .bild import BildIE from .bilibili import ( BiliBiliIE, BiliBiliBangumiIE, + BiliBiliBangumiSeasonIE, BiliBiliBangumiMediaIE, BiliBiliSearchIE, BilibiliCategoryIE, @@ -238,19 +243,28 @@ from .bleacherreport import ( BleacherReportIE, BleacherReportCMSIE, ) +from .blerp import BlerpIE from .blogger import BloggerIE from .bloomberg import BloombergIE from .bokecc import BokeCCIE from .bongacams import BongaCamsIE from .bostonglobe import BostonGlobeIE from .box import BoxIE -from .booyah import BooyahClipsIE 
+from .boxcast import BoxCastVideoIE from .bpb import BpbIE from .br import ( BRIE, BRMediathekIE, ) from .bravotv import BravoTVIE +from .brainpop import ( + BrainPOPIE, + BrainPOPJrIE, + BrainPOPELLIE, + BrainPOPEspIE, + BrainPOPFrIE, + BrainPOPIlIE, +) from .breakcom import BreakIE from .breitbart import BreitBartIE from .brightcove import ( @@ -270,6 +284,10 @@ from .camdemy import ( CamdemyIE, CamdemyFolderIE ) +from .camfm import ( + CamFMEpisodeIE, + CamFMShowIE +) from .cammodels import CamModelsIE from .camsoda import CamsodaIE from .camtasia import CamtasiaEmbedIE @@ -277,12 +295,6 @@ from .camwithher import CamWithHerIE from .canalalpha import CanalAlphaIE from .canalplus import CanalplusIE from .canalc2 import Canalc2IE -from .canvas import ( - CanvasIE, - CanvasEenIE, - VrtNUIE, - DagelijkseKostIE, -) from .carambatv import ( CarambaTVIE, CarambaTVPageIE, @@ -295,15 +307,18 @@ from .cbc import ( CBCGemPlaylistIE, CBCGemLiveIE, ) -from .cbs import CBSIE -from .cbslocal import ( - CBSLocalIE, - CBSLocalArticleIE, +from .cbs import ( + CBSIE, + ParamountPressExpressIE, ) from .cbsinteractive import CBSInteractiveIE from .cbsnews import ( CBSNewsEmbedIE, CBSNewsIE, + CBSLocalIE, + CBSLocalArticleIE, + CBSLocalLiveIE, + CBSNewsLiveIE, CBSNewsLiveVideoIE, ) from .cbssports import ( @@ -342,6 +357,7 @@ from .ciscolive import ( ) from .ciscowebex import CiscoWebexIE from .cjsw import CJSWIE +from .clipchamp import ClipchampIE from .cliphunter import CliphunterIE from .clippit import ClippitIE from .cliprs import ClipRsIE @@ -389,9 +405,12 @@ from .crowdbunker import ( CrowdBunkerIE, CrowdBunkerChannelIE, ) +from .crtvg import CrtvgIE from .crunchyroll import ( CrunchyrollBetaIE, CrunchyrollBetaShowIE, + CrunchyrollMusicIE, + CrunchyrollArtistIE, ) from .cspan import CSpanIE, CSpanCongressIE from .ctsnews import CtsNewsIE @@ -408,6 +427,10 @@ from .cybrary import ( CybraryIE, CybraryCourseIE ) +from .dacast import ( + DacastVODIE, + DacastPlaylistIE, +) from 
.daftsex import DaftsexIE from .dailymail import DailyMailIE from .dailymotion import ( @@ -438,6 +461,10 @@ from .deezer import ( ) from .democracynow import DemocracynowIE from .detik import DetikEmbedIE +from .dlf import ( + DLFIE, + DLFCorpusIE, +) from .dfb import DFBIE from .dhm import DHMIE from .digg import DiggIE @@ -470,6 +497,7 @@ from .dplay import ( DiscoveryPlusItalyIE, DiscoveryPlusItalyShowIE, DiscoveryPlusIndiaShowIE, + GlobalCyclingNetworkPlusIE, ) from .dreisat import DreiSatIE from .drbonanza import DRBonanzaIE @@ -493,6 +521,7 @@ from .deuxm import ( DeuxMNewsIE ) from .digitalconcerthall import DigitalConcertHallIE +from .discogs import DiscogsReleasePlaylistIE from .discovery import DiscoveryIE from .disney import DisneyIE from .dispeak import DigitallySpeakingIE @@ -507,6 +536,7 @@ from .dw import ( ) from .eagleplatform import EaglePlatformIE, ClipYouEmbedIE from .ebaumsworld import EbaumsWorldIE +from .ebay import EbayIE from .echomsk import EchoMskIE from .egghead import ( EggheadCourseIE, @@ -516,6 +546,7 @@ from .ehow import EHowIE from .eighttracks import EightTracksIE from .einthusan import EinthusanIE from .eitb import EitbIE +from .elevensports import ElevenSportsIE from .ellentube import ( EllenTubeIE, EllenTubeVideoIE, @@ -549,6 +580,7 @@ from .espn import ( ESPNCricInfoIE, ) from .esri import EsriVideoIE +from .ettutv import EttuTvIE from .europa import EuropaIE, EuroParlWebstreamIE from .europeantour import EuropeanTourIE from .eurosport import EurosportIE @@ -635,6 +667,7 @@ from .funimation import ( FunimationShowIE, ) from .funk import FunkIE +from .funker530 import Funker530IE from .fusion import FusionIE from .fuyintv import FuyinTVIE from .gab import ( @@ -670,10 +703,18 @@ from .gfycat import GfycatIE from .giantbomb import GiantBombIE from .giga import GigaIE from .glide import GlideIE +from .globalplayer import ( + GlobalPlayerLiveIE, + GlobalPlayerLivePlaylistIE, + GlobalPlayerAudioIE, + GlobalPlayerAudioEpisodeIE, + 
GlobalPlayerVideoIE +) from .globo import ( GloboIE, GloboArticleIE, ) +from .gmanetwork import GMANetworkVideoIE from .go import GoIE from .godtube import GodTubeIE from .gofile import GofileIE @@ -705,13 +746,16 @@ from .hearthisat import HearThisAtIE from .heise import HeiseIE from .hellporno import HellPornoIE from .helsinki import HelsinkiIE -from .hentaistigma import HentaiStigmaIE from .hgtv import HGTVComShowIE from .hketv import HKETVIE from .hidive import HiDiveIE from .historicfilms import HistoricFilmsIE from .hitbox import HitboxIE, HitboxLiveIE from .hitrecord import HitRecordIE +from .hollywoodreporter import ( + HollywoodReporterIE, + HollywoodReporterPlaylistIE, +) from .holodex import HolodexIE from .hotnewhiphop import HotNewHipHopIE from .hotstar import ( @@ -723,6 +767,7 @@ from .hotstar import ( ) from .howcast import HowcastIE from .howstuffworks import HowStuffWorksIE +from .hrefli import HrefLiRedirectIE from .hrfensehen import HRFernsehenIE from .hrti import ( HRTiIE, @@ -745,12 +790,14 @@ from .hungama import ( HungamaAlbumPlaylistIE, ) from .hypem import HypemIE +from .hypergryph import MonsterSirenHypergryphMusicIE from .hytale import HytaleIE from .icareus import IcareusIE from .ichinanalive import ( IchinanaLiveIE, IchinanaLiveClipIE, ) +from .idolplus import IdolPlusIE from .ign import ( IGNIE, IGNVideoIE, @@ -835,6 +882,7 @@ from .japandiet import ( from .jeuxvideo import JeuxVideoIE from .jove import JoveIE from .joj import JojIE +from .jstream import JStreamIE from .jwplatform import JWPlatformIE from .kakao import KakaoIE from .kaltura import KalturaIE @@ -844,7 +892,6 @@ from .karaoketv import KaraoketvIE from .karrierevideos import KarriereVideosIE from .keezmovies import KeezMoviesIE from .kelbyone import KelbyOneIE -from .ketnet import KetnetIE from .khanacademy import ( KhanAcademyIE, KhanAcademyUnitIE, @@ -857,6 +904,7 @@ from .kicker import KickerIE from .kickstarter import KickStarterIE from .kinja import KinjaEmbedIE 
from .kinopoisk import KinoPoiskIE +from .kommunetv import KommunetvIE from .kompas import KompasVideoIE from .konserthusetplay import KonserthusetPlayIE from .koo import KooIE @@ -908,6 +956,10 @@ from .leeco import ( LePlaylistIE, LetvCloudIE, ) +from .lefigaro import ( + LeFigaroVideoEmbedIE, + LeFigaroVideoSectionIE, +) from .lego import LEGOIE from .lemonde import LemondeIE from .lenta import LentaIE @@ -926,10 +978,6 @@ from .limelight import ( LimelightChannelIE, LimelightChannelListIE, ) -from .line import ( - LineLiveIE, - LineLiveChannelIE, -) from .linkedin import ( LinkedInIE, LinkedInLearningIE, @@ -956,11 +1004,15 @@ from .lrt import ( LRTVODIE, LRTStreamIE ) +from .lumni import ( + LumniIE +) from .lynda import ( LyndaIE, LyndaCourseIE ) from .m6 import M6IE +from .magellantv import MagellanTVIE from .magentamusik360 import MagentaMusik360IE from .mailru import ( MailRuIE, @@ -1069,7 +1121,8 @@ from .mojvideo import MojvideoIE from .morningstar import MorningstarIE from .motherless import ( MotherlessIE, - MotherlessGroupIE + MotherlessGroupIE, + MotherlessGalleryIE, ) from .motorsport import MotorsportIE from .movieclips import MovieClipsIE @@ -1089,6 +1142,7 @@ from .mtv import ( ) from .muenchentv import MuenchenTVIE from .murrtube import MurrtubeIE, MurrtubeUserIE +from .museai import MuseAIIE from .musescore import MuseScoreIE from .musicdex import ( MusicdexSongIE, @@ -1110,6 +1164,7 @@ from .myvi import ( ) from .myvideoge import MyVideoGeIE from .myvidster import MyVidsterIE +from .mzaalo import MzaaloIE from .n1 import ( N1InfoAssetIE, N1InfoIIE, @@ -1158,6 +1213,7 @@ from .nebula import ( NebulaSubscriptionsIE, NebulaChannelIE, ) +from .nekohacker import NekoHackerIE from .nerdcubed import NerdCubedFeedIE from .netzkino import NetzkinoIE from .neteasemusic import ( @@ -1206,6 +1262,9 @@ from .nhk import ( NhkForSchoolBangumiIE, NhkForSchoolSubjectIE, NhkForSchoolProgramListIE, + NhkRadioNewsPageIE, + NhkRadiruIE, + NhkRadiruLiveIE, ) from 
.nhl import NHLIE from .nick import ( @@ -1225,6 +1284,7 @@ from .niconico import ( NicovideoSearchIE, NicovideoSearchURLIE, NicovideoTagURLIE, + NiconicoLiveIE, ) from .ninecninemedia import ( NineCNineMediaIE, @@ -1282,6 +1342,7 @@ from .nrl import NRLTVIE from .ntvcojp import NTVCoJpCUIE from .ntvde import NTVDeIE from .ntvru import NTVRuIE +from .nubilesporn import NubilesPornIE from .nytimes import ( NYTimesIE, NYTimesArticleIE, @@ -1292,6 +1353,7 @@ from .nzherald import NZHeraldIE from .nzonscreen import NZOnScreenIE from .nzz import NZZIE from .odatv import OdaTVIE +from .odkmedia import OnDemandChinaEpisodeIE from .odnoklassniki import OdnoklassnikiIE from .oftv import ( OfTVIE, @@ -1332,6 +1394,7 @@ from .orf import ( ORFIPTVIE, ) from .outsidetv import OutsideTVIE +from .owncloud import OwnCloudIE from .packtpub import ( PacktPubIE, PacktPubCourseIE, @@ -1357,7 +1420,7 @@ from .patreon import ( PatreonIE, PatreonCampaignIE ) -from .pbs import PBSIE +from .pbs import PBSIE, PBSKidsIE from .pearvideo import PearVideoIE from .peekvids import PeekVidsIE, PlayVidsIE from .peertube import ( @@ -1375,6 +1438,7 @@ from .periscope import ( PeriscopeIE, PeriscopeUserIE, ) +from .pgatour import PGATourIE from .philharmoniedeparis import PhilharmonieDeParisIE from .phoenix import PhoenixIE from .photobucket import PhotobucketIE @@ -1432,7 +1496,6 @@ from .polskieradio import ( PolskieRadioPlayerIE, PolskieRadioPodcastIE, PolskieRadioPodcastListIE, - PolskieRadioRadioKierowcowIE, ) from .popcorntimes import PopcorntimesIE from .popcorntv import PopcornTVIE @@ -1455,6 +1518,7 @@ from .puhutv import ( PuhuTVIE, PuhuTVSerieIE, ) +from .pr0gramm import Pr0grammStaticIE, Pr0grammIE from .prankcast import PrankCastIE from .premiershiprugby import PremiershipRugbyIE from .presstv import PressTVIE @@ -1469,6 +1533,7 @@ from .prx import ( ) from .puls4 import Puls4IE from .pyvideo import PyvideoIE +from .qdance import QDanceIE from .qingting import QingTingIE from .qqmusic 
import ( QQMusicIE, @@ -1501,6 +1566,8 @@ from .radlive import ( RadLiveSeasonIE, ) from .rai import ( + RaiIE, + RaiCulturaIE, RaiPlayIE, RaiPlayLiveIE, RaiPlayPlaylistIE, @@ -1509,13 +1576,16 @@ from .rai import ( RaiPlaySoundPlaylistIE, RaiNewsIE, RaiSudtirolIE, - RaiIE, ) from .raywenderlich import ( RayWenderlichIE, RayWenderlichCourseIE, ) from .rbmaradio import RBMARadioIE +from .rbgtum import ( + RbgTumIE, + RbgTumCourseIE, +) from .rcs import ( RCSIE, RCSEmbedsIE, @@ -1527,6 +1597,7 @@ from .rcti import ( RCTIPlusTVIE, ) from .rds import RDSIE +from .recurbate import RecurbateIE from .redbee import ParliamentLiveUKIE, RTBFIE from .redbulltv import ( RedBullTVIE, @@ -1549,6 +1620,7 @@ from .rentv import ( from .restudy import RestudyIE from .reuters import ReutersIE from .reverbnation import ReverbNationIE +from .rheinmaintv import RheinMainTVIE from .rice import RICEIE from .rmcdecouverte import RMCDecouverteIE from .rockstargames import RockstarGamesIE @@ -1563,6 +1635,7 @@ from .rottentomatoes import RottenTomatoesIE from .rozhlas import ( RozhlasIE, RozhlasVltavaIE, + MujRozhlasIE, ) from .rte import RteIE, RteRadioIE from .rtlnl import ( @@ -1586,6 +1659,11 @@ from .rtnews import ( from .rtp import RTPIE from .rtrfm import RTRFMIE from .rts import RTSIE +from .rtvcplay import ( + RTVCPlayIE, + RTVCPlayEmbedIE, + RTVCKalturaIE, +) from .rtve import ( RTVEALaCartaIE, RTVEAudioIE, @@ -1631,6 +1709,7 @@ from .ruv import ( RuvIE, RuvSpilaIE ) +from .s4c import S4CIE from .safari import ( SafariIE, SafariApiIE, @@ -1655,6 +1734,7 @@ from .scte import ( ) from .scrolller import ScrolllerIE from .seeker import SeekerIE +from .senalcolombia import SenalColombiaLiveIE from .senategov import SenateISVPIE, SenateGovIE from .sendtonews import SendtoNewsIE from .servus import ServusIE @@ -1752,6 +1832,7 @@ from .spike import ( BellatorIE, ParamountNetworkIE, ) +from .stageplus import StagePlusVODConcertIE from .startrek import StarTrekIE from .stitcher import ( 
StitcherIE, @@ -1777,6 +1858,10 @@ from .srgssr import ( SRGSSRPlayIE, ) from .srmediathek import SRMediathekIE +from .stacommu import ( + StacommuLiveIE, + StacommuVODIE, +) from .stanfordoc import StanfordOpenClassroomIE from .startv import StarTVIE from .steam import ( @@ -1789,7 +1874,6 @@ from .storyfire import ( StoryFireSeriesIE, ) from .streamable import StreamableIE -from .streamanity import StreamanityIE from .streamcloud import StreamcloudIE from .streamcz import StreamCZIE from .streamff import StreamFFIE @@ -1827,7 +1911,10 @@ from .teachertube import ( TeacherTubeUserIE, ) from .teachingchannel import TeachingChannelIE -from .teamcoco import TeamcocoIE +from .teamcoco import ( + TeamcocoIE, + ConanClassicIE, +) from .teamtreehouse import TeamTreeHouseIE from .techtalks import TechTalksIE from .ted import ( @@ -1839,6 +1926,7 @@ from .ted import ( from .tele5 import Tele5IE from .tele13 import Tele13IE from .telebruxelles import TeleBruxellesIE +from .telecaribe import TelecaribePlayIE from .telecinco import TelecincoIE from .telegraaf import TelegraafIE from .telegram import TelegramEmbedIE @@ -1853,7 +1941,7 @@ from .telequebec import ( ) from .teletask import TeleTaskIE from .telewebion import TelewebionIE -from .tempo import TempoIE +from .tempo import TempoIE, IVXPlayerIE from .tencent import ( IflixEpisodeIE, IflixSeriesIE, @@ -1930,6 +2018,7 @@ from .traileraddict import TrailerAddictIE from .triller import ( TrillerIE, TrillerUserIE, + TrillerShortIE, ) from .trilulilu import TriluliluIE from .trovo import ( @@ -1951,10 +2040,9 @@ from .tubitv import ( ) from .tumblr import TumblrIE from .tunein import ( - TuneInClipIE, TuneInStationIE, - TuneInProgramIE, - TuneInTopicIE, + TuneInPodcastIE, + TuneInPodcastEpisodeIE, TuneInShortenerIE, ) from .tunepk import TunePkIE @@ -2022,7 +2110,6 @@ from .tvp import ( ) from .tvplay import ( TVPlayIE, - ViafreeIE, TVPlayHomeIE, ) from .tvplayer import TVPlayerIE @@ -2181,12 +2268,16 @@ from .viu import ( 
ViuIE, ViuPlaylistIE, ViuOTTIE, + ViuOTTIndonesiaIE, ) from .vk import ( VKIE, VKUserVideosIE, VKWallPostIE, + VKPlayIE, + VKPlayLiveIE, ) +from .vocaroo import VocarooIE from .vodlocker import VodlockerIE from .vodpl import VODPlIE from .vodplatform import VODPlatformIE @@ -2204,7 +2295,12 @@ from .voxmedia import ( VoxMediaVolumeIE, VoxMediaIE, ) -from .vrt import VRTIE +from .vrt import ( + VRTIE, + VrtNUIE, + KetnetIE, + DagelijkseKostIE, +) from .vrak import VrakIE from .vrv import ( VRVIE, @@ -2255,8 +2351,20 @@ from .weibo import ( WeiboMobileIE ) from .weiqitv import WeiqiTVIE +from .weverse import ( + WeverseIE, + WeverseMediaIE, + WeverseMomentIE, + WeverseLiveTabIE, + WeverseMediaTabIE, + WeverseLiveIE, +) +from .wevidi import WeVidiIE +from .weyyak import WeyyakIE +from .whyp import WhypIE from .wikimedia import WikimediaIE from .willow import WillowIE +from .wimbledon import WimbledonIE from .wimtv import WimTVIE from .whowatch import WhoWatchIE from .wistia import ( @@ -2282,6 +2390,12 @@ from .wsj import ( WSJArticleIE, ) from .wwe import WWEIE +from .wykop import ( + WykopDigIE, + WykopDigCommentIE, + WykopPostIE, + WykopPostCommentIE, +) from .xanimu import XanimuIE from .xbef import XBefIE from .xboxclips import XboxClipsIE @@ -2301,13 +2415,14 @@ from .xnxx import XNXXIE from .xstream import XstreamIE from .xtube import XTubeUserIE, XTubeIE from .xuite import XuiteIE -from .xvideos import XVideosIE +from .xvideos import ( + XVideosIE, + XVideosQuickiesIE +) from .xxxymovies import XXXYMoviesIE from .yahoo import ( YahooIE, YahooSearchIE, - YahooGyaOPlayerIE, - YahooGyaOIE, YahooJapanNewsIE, ) from .yandexdisk import YandexDiskIE @@ -2325,6 +2440,10 @@ from .yandexvideo import ( ZenYandexChannelIE, ) from .yapfiles import YapFilesIE +from .yappy import ( + YappyIE, + YappyProfileIE, +) from .yesjapan import YesJapanIE from .yinyuetai import YinYueTaiIE from .yle_areena import YleAreenaIE @@ -2342,6 +2461,10 @@ from .younow import ( from .youporn 
import YouPornIE from .yourporn import YourPornIE from .yourupload import YourUploadIE +from .zaiko import ( + ZaikoIE, + ZaikoETicketIE, +) from .zapiks import ZapiksIE from .zattoo import ( BBVTVIE, @@ -2399,6 +2522,7 @@ from .zingmp3 import ( ZingMp3WeekChartIE, ZingMp3ChartMusicVideoIE, ZingMp3UserIE, + ZingMp3HubIE, ) from .zoom import ZoomIE from .zype import ZypeIE diff --git a/plugins/youtube_download/yt_dlp/extractor/abc.py b/plugins/youtube_download/yt_dlp/extractor/abc.py index 0ca76b8..f56133e 100644 --- a/plugins/youtube_download/yt_dlp/extractor/abc.py +++ b/plugins/youtube_download/yt_dlp/extractor/abc.py @@ -12,6 +12,7 @@ from ..utils import ( int_or_none, parse_iso8601, str_or_none, + traverse_obj, try_get, unescapeHTML, update_url_query, @@ -85,6 +86,15 @@ class ABCIE(InfoExtractor): 'uploader': 'Behind the News', 'uploader_id': 'behindthenews', } + }, { + 'url': 'https://www.abc.net.au/news/2023-06-25/wagner-boss-orders-troops-back-to-bases-to-avoid-bloodshed/102520540', + 'info_dict': { + 'id': '102520540', + 'title': 'Wagner Group retreating from Russia, leader Prigozhin to move to Belarus', + 'ext': 'mp4', + 'description': 'Wagner troops leave Rostov-on-Don and\xa0Yevgeny Prigozhin will move to Belarus under a deal brokered by Belarusian President Alexander Lukashenko to end the mutiny.', + 'thumbnail': 'https://live-production.wcms.abc-cdn.net.au/0c170f5b57f0105c432f366c0e8e267b?impolicy=wcms_crop_resize&cropH=2813&cropW=5000&xPos=0&yPos=249&width=862&height=485', + } }] def _real_extract(self, url): @@ -107,7 +117,7 @@ class ABCIE(InfoExtractor): video = True if mobj is None: - mobj = re.search(r'(?P)"sources": (?P\[[^\]]+\]),', webpage) + mobj = re.search(r'(?P)"(?:sources|files|renditions)":\s*(?P\[[^\]]+\])', webpage) if mobj is None: mobj = re.search( r'inline(?PVideo|Audio|YouTube)Data\.push\((?P[^)]+)\);', @@ -121,7 +131,8 @@ class ABCIE(InfoExtractor): urls_info = self._parse_json( mobj.group('json_data'), video_id, 
transform_source=js_to_json) youtube = mobj.group('type') == 'YouTube' - video = mobj.group('type') == 'Video' or urls_info[0]['contentType'] == 'video/mp4' + video = mobj.group('type') == 'Video' or traverse_obj( + urls_info, (0, ('contentType', 'MIMEType')), get_all=False) == 'video/mp4' if not isinstance(urls_info, list): urls_info = [urls_info] diff --git a/plugins/youtube_download/yt_dlp/extractor/abematv.py b/plugins/youtube_download/yt_dlp/extractor/abematv.py index f611c1f..163b83c 100644 --- a/plugins/youtube_download/yt_dlp/extractor/abematv.py +++ b/plugins/youtube_download/yt_dlp/extractor/abematv.py @@ -22,80 +22,23 @@ from ..utils import ( int_or_none, intlist_to_bytes, OnDemandPagedList, - request_to_url, time_seconds, traverse_obj, update_url_query, ) -# NOTE: network handler related code is temporary thing until network stack overhaul PRs are merged (#2861/#2862) - -def add_opener(ydl, handler): - ''' Add a handler for opening URLs, like _download_webpage ''' +def add_opener(ydl, handler): # FIXME: Create proper API in .networking + """Add a handler for opening URLs, like _download_webpage""" # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L426 # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L605 - assert isinstance(ydl._opener, urllib.request.OpenerDirector) - ydl._opener.add_handler(handler) - - -def remove_opener(ydl, handler): - ''' - Remove handler(s) for opening URLs - @param handler Either handler object itself or handler type. - Specifying handler type will remove all handler which isinstance returns True. 
- ''' - # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L426 - # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L605 - opener = ydl._opener - assert isinstance(ydl._opener, urllib.request.OpenerDirector) - if isinstance(handler, (type, tuple)): - find_cp = lambda x: isinstance(x, handler) - else: - find_cp = lambda x: x is handler - - removed = [] - for meth in dir(handler): - if meth in ["redirect_request", "do_open", "proxy_open"]: - # oops, coincidental match - continue - - i = meth.find("_") - protocol = meth[:i] - condition = meth[i + 1:] - - if condition.startswith("error"): - j = condition.find("_") + i + 1 - kind = meth[j + 1:] - try: - kind = int(kind) - except ValueError: - pass - lookup = opener.handle_error.get(protocol, {}) - opener.handle_error[protocol] = lookup - elif condition == "open": - kind = protocol - lookup = opener.handle_open - elif condition == "response": - kind = protocol - lookup = opener.process_response - elif condition == "request": - kind = protocol - lookup = opener.process_request - else: - continue - - handlers = lookup.setdefault(kind, []) - if handlers: - handlers[:] = [x for x in handlers if not find_cp(x)] - - removed.append(x for x in handlers if find_cp(x)) - - if removed: - for x in opener.handlers: - if find_cp(x): - x.add_parent(None) - opener.handlers[:] = [x for x in opener.handlers if not find_cp(x)] + rh = ydl._request_director.handlers['Urllib'] + if 'abematv-license' in rh._SUPPORTED_URL_SCHEMES: + return + opener = rh._get_instance(cookiejar=ydl.cookiejar, proxies=ydl.proxies) + assert isinstance(opener, urllib.request.OpenerDirector) + opener.add_handler(handler) + rh._SUPPORTED_URL_SCHEMES = (*rh._SUPPORTED_URL_SCHEMES, 'abematv-license') class AbemaLicenseHandler(urllib.request.BaseHandler): @@ -137,11 +80,11 @@ class AbemaLicenseHandler(urllib.request.BaseHandler): return intlist_to_bytes(aes_ecb_decrypt(encvideokey, enckey)) def abematv_license_open(self, url): - url = 
request_to_url(url) + url = url.get_full_url() if isinstance(url, urllib.request.Request) else url ticket = urllib.parse.urlparse(url).netloc response_data = self._get_videokey_from_ticket(ticket) return urllib.response.addinfourl(io.BytesIO(response_data), headers={ - 'Content-Length': len(response_data), + 'Content-Length': str(len(response_data)), }, url=url, code=200) @@ -213,10 +156,7 @@ class AbemaTVBaseIE(InfoExtractor): }) AbemaTVBaseIE._USERTOKEN = user_data['token'] - # don't allow adding it 2 times or more, though it's guarded - remove_opener(self._downloader, AbemaLicenseHandler) add_opener(self._downloader, AbemaLicenseHandler(self)) - return self._USERTOKEN def _get_media_token(self, invalidate=False, to_show=True): @@ -436,6 +376,16 @@ class AbemaTVIE(AbemaTVBaseIE): if 3 not in ondemand_types: # cannot acquire decryption key for these streams self.report_warning('This is a premium-only stream') + info.update(traverse_obj(api_response, { + 'series': ('series', 'title'), + 'season': ('season', 'title'), + 'season_number': ('season', 'sequence'), + 'episode_number': ('episode', 'number'), + })) + if not title: + title = traverse_obj(api_response, ('episode', 'title')) + if not description: + description = traverse_obj(api_response, ('episode', 'content')) m3u8_url = f'https://vod-abematv.akamaized.net/program/{video_id}/playlist.m3u8' elif video_type == 'slots': diff --git a/plugins/youtube_download/yt_dlp/extractor/acast.py b/plugins/youtube_download/yt_dlp/extractor/acast.py index f2f828f..427d04c 100644 --- a/plugins/youtube_download/yt_dlp/extractor/acast.py +++ b/plugins/youtube_download/yt_dlp/extractor/acast.py @@ -40,28 +40,33 @@ class ACastBaseIE(InfoExtractor): class ACastIE(ACastBaseIE): IE_NAME = 'acast' - _VALID_URL = r'''(?x) + _VALID_URL = r'''(?x: https?:// (?: (?:(?:embed|www)\.)?acast\.com/| play\.acast\.com/s/ ) - (?P[^/]+)/(?P[^/#?]+) - ''' + (?P[^/]+)/(?P[^/#?"]+) + )''' + _EMBED_REGEX = [rf'(?x)]+\bsrc=[\'"](?P{_VALID_URL})'] 
_TESTS = [{ 'url': 'https://www.acast.com/sparpodcast/2.raggarmordet-rosterurdetforflutna', - 'md5': 'f5598f3ad1e4776fed12ec1407153e4b', 'info_dict': { 'id': '2a92b283-1a75-4ad8-8396-499c641de0d9', 'ext': 'mp3', 'title': '2. Raggarmordet - Röster ur det förflutna', - 'description': 'md5:a992ae67f4d98f1c0141598f7bebbf67', + 'description': 'md5:013959207e05011ad14a222cf22278cc', 'timestamp': 1477346700, 'upload_date': '20161024', 'duration': 2766, - 'creator': 'Anton Berg & Martin Johnson', + 'creator': 'Third Ear Studio', 'series': 'Spår', 'episode': '2. Raggarmordet - Röster ur det förflutna', + 'thumbnail': 'https://assets.pippa.io/shows/616ebe1886d7b1398620b943/616ebe33c7e6e70013cae7da.jpg', + 'episode_number': 2, + 'display_id': '2.raggarmordet-rosterurdetforflutna', + 'season_number': 4, + 'season': 'Season 4', } }, { 'url': 'http://embed.acast.com/adambuxton/ep.12-adam-joeschristmaspodcast2015', @@ -73,6 +78,23 @@ class ACastIE(ACastBaseIE): 'url': 'https://play.acast.com/s/sparpodcast/2a92b283-1a75-4ad8-8396-499c641de0d9', 'only_matching': True, }] + _WEBPAGE_TESTS = [{ + 'url': 'https://ausi.anu.edu.au/news/democracy-sausage-episode-can-labor-be-long-form-government', + 'info_dict': { + 'id': '646c68fb21fbf20011e9c651', + 'ext': 'mp3', + 'creator': 'The Australian National University', + 'display_id': 'can-labor-be-a-long-form-government', + 'duration': 2618, + 'thumbnail': 'https://assets.pippa.io/shows/6113e8578b4903809f16f7e5/1684821529295-515b9520db9ce53275b995eb302f941c.jpeg', + 'title': 'Can Labor be a long-form government?', + 'episode': 'Can Labor be a long-form government?', + 'upload_date': '20230523', + 'series': 'Democracy Sausage with Mark Kenny', + 'timestamp': 1684826362, + 'description': 'md5:feabe1fc5004c78ee59c84a46bf4ba16', + } + }] def _real_extract(self, url): channel, display_id = self._match_valid_url(url).groups() diff --git a/plugins/youtube_download/yt_dlp/extractor/adn.py b/plugins/youtube_download/yt_dlp/extractor/adn.py index 
f1f55e8..b59dbc8 100644 --- a/plugins/youtube_download/yt_dlp/extractor/adn.py +++ b/plugins/youtube_download/yt_dlp/extractor/adn.py @@ -6,10 +6,8 @@ import random from .common import InfoExtractor from ..aes import aes_cbc_decrypt_bytes, unpad_pkcs7 -from ..compat import ( - compat_HTTPError, - compat_b64decode, -) +from ..compat import compat_b64decode +from ..networking.exceptions import HTTPError from ..utils import ( ass_subtitles_timecode, bytes_to_intlist, @@ -142,9 +140,9 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text''' self._HEADERS = {'authorization': 'Bearer ' + access_token} except ExtractorError as e: message = None - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + if isinstance(e.cause, HTTPError) and e.cause.status == 401: resp = self._parse_json( - e.cause.read().decode(), None, fatal=False) or {} + e.cause.response.read().decode(), None, fatal=False) or {} message = resp.get('message') or resp.get('code') self.report_warning(message or self._LOGIN_ERR_MESSAGE) @@ -195,14 +193,14 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text''' }) break except ExtractorError as e: - if not isinstance(e.cause, compat_HTTPError): + if not isinstance(e.cause, HTTPError): raise e - if e.cause.code == 401: + if e.cause.status == 401: # This usually goes away with a different random pkcs1pad, so retry continue - error = self._parse_json(e.cause.read(), video_id) + error = self._parse_json(e.cause.response.read(), video_id) message = error.get('message') if e.cause.code == 403 and error.get('code') == 'player-bad-geolocation-country': self.raise_geo_restricted(msg=message) diff --git a/plugins/youtube_download/yt_dlp/extractor/adobepass.py b/plugins/youtube_download/yt_dlp/extractor/adobepass.py index e5944f7..5eed0ca 100644 --- a/plugins/youtube_download/yt_dlp/extractor/adobepass.py +++ b/plugins/youtube_download/yt_dlp/extractor/adobepass.py @@ -2,11 +2,11 @@ import getpass import json import 
re import time -import urllib.error import xml.etree.ElementTree as etree from .common import InfoExtractor from ..compat import compat_urlparse +from ..networking.exceptions import HTTPError from ..utils import ( NO_DEFAULT, ExtractorError, @@ -1394,7 +1394,7 @@ class AdobePassIE(InfoExtractor): # XXX: Conventionally, base classes should en form_page, urlh = form_page_res post_url = self._html_search_regex(r']+action=(["\'])(?P.+?)\1', form_page, 'post url', group='url') if not re.match(r'https?://', post_url): - post_url = compat_urlparse.urljoin(urlh.geturl(), post_url) + post_url = compat_urlparse.urljoin(urlh.url, post_url) form_data = self._hidden_inputs(form_page) form_data.update(data) return self._download_webpage_handle( @@ -1473,7 +1473,7 @@ class AdobePassIE(InfoExtractor): # XXX: Conventionally, base classes should en elif 'automatically signed in with' in provider_redirect_page: # Seems like comcast is rolling up new way of automatically signing customers oauth_redirect_url = self._html_search_regex( - r'continue:\s*"(https://oauth.xfinity.com/oauth/authorize\?.+)"', provider_redirect_page, + r'continue:\s*"(https://oauth\.xfinity\.com/oauth/authorize\?.+)"', provider_redirect_page, 'oauth redirect (signed)') # Just need to process the request. No useful data comes back self._download_webpage(oauth_redirect_url, video_id, 'Confirming auto login') @@ -1573,7 +1573,7 @@ class AdobePassIE(InfoExtractor): # XXX: Conventionally, base classes should en }), headers={ 'Content-Type': 'application/x-www-form-urlencoded' }) - elif mso_id == 'Spectrum': + elif mso_id in ('Spectrum', 'Charter_Direct'): # Spectrum's login for is dynamically loaded via JS so we need to hardcode the flow # as a one-off implementation. 
provider_redirect_page, urlh = provider_redirect_page_res @@ -1619,7 +1619,7 @@ class AdobePassIE(InfoExtractor): # XXX: Conventionally, base classes should en hidden_data['history'] = 1 provider_login_page_res = self._download_webpage_handle( - urlh.geturl(), video_id, 'Sending first bookend', + urlh.url, video_id, 'Sending first bookend', query=hidden_data) provider_association_redirect, urlh = post_form( @@ -1629,7 +1629,7 @@ class AdobePassIE(InfoExtractor): # XXX: Conventionally, base classes should en }) provider_refresh_redirect_url = extract_redirect_url( - provider_association_redirect, url=urlh.geturl()) + provider_association_redirect, url=urlh.url) last_bookend_page, urlh = self._download_webpage_handle( provider_refresh_redirect_url, video_id, @@ -1638,7 +1638,7 @@ class AdobePassIE(InfoExtractor): # XXX: Conventionally, base classes should en hidden_data['history'] = 3 mvpd_confirm_page_res = self._download_webpage_handle( - urlh.geturl(), video_id, 'Sending final bookend', + urlh.url, video_id, 'Sending final bookend', query=hidden_data) post_form(mvpd_confirm_page_res, 'Confirming Login') @@ -1652,7 +1652,7 @@ class AdobePassIE(InfoExtractor): # XXX: Conventionally, base classes should en hidden_data['history_val'] = 1 provider_login_redirect_page_res = self._download_webpage_handle( - urlh.geturl(), video_id, 'Sending First Bookend', + urlh.url, video_id, 'Sending First Bookend', query=hidden_data) provider_login_redirect_page, urlh = provider_login_redirect_page_res @@ -1680,7 +1680,7 @@ class AdobePassIE(InfoExtractor): # XXX: Conventionally, base classes should en }) provider_refresh_redirect_url = extract_redirect_url( - provider_association_redirect, url=urlh.geturl()) + provider_association_redirect, url=urlh.url) last_bookend_page, urlh = self._download_webpage_handle( provider_refresh_redirect_url, video_id, @@ -1690,7 +1690,7 @@ class AdobePassIE(InfoExtractor): # XXX: Conventionally, base classes should en hidden_data['history_val'] = 3 
mvpd_confirm_page_res = self._download_webpage_handle( - urlh.geturl(), video_id, 'Sending Final Bookend', + urlh.url, video_id, 'Sending Final Bookend', query=hidden_data) post_form(mvpd_confirm_page_res, 'Confirming Login') @@ -1699,7 +1699,7 @@ class AdobePassIE(InfoExtractor): # XXX: Conventionally, base classes should en # based redirect that should be followed. provider_redirect_page, urlh = provider_redirect_page_res provider_refresh_redirect_url = extract_redirect_url( - provider_redirect_page, url=urlh.geturl()) + provider_redirect_page, url=urlh.url) if provider_refresh_redirect_url: provider_redirect_page_res = self._download_webpage_handle( provider_refresh_redirect_url, video_id, @@ -1724,7 +1724,7 @@ class AdobePassIE(InfoExtractor): # XXX: Conventionally, base classes should en 'requestor_id': requestor_id, }), headers=mvpd_headers) except ExtractorError as e: - if not mso_id and isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 401: + if not mso_id and isinstance(e.cause, HTTPError) and e.cause.status == 401: raise_mvpd_required() raise if '(?:history|biography)\.com)/player/(?P\d+)' + _TESTS = [] def _real_extract(self, url): domain, video_id = self._match_valid_url(url).groups() diff --git a/plugins/youtube_download/yt_dlp/extractor/aeonco.py b/plugins/youtube_download/yt_dlp/extractor/aeonco.py index 4655862..390eae3 100644 --- a/plugins/youtube_download/yt_dlp/extractor/aeonco.py +++ b/plugins/youtube_download/yt_dlp/extractor/aeonco.py @@ -1,5 +1,6 @@ from .common import InfoExtractor from .vimeo import VimeoIE +from ..utils import ExtractorError, traverse_obj, url_or_none class AeonCoIE(InfoExtractor): @@ -19,22 +20,55 @@ class AeonCoIE(InfoExtractor): } }, { 'url': 'https://aeon.co/videos/dazzling-timelapse-shows-how-microbes-spoil-our-food-and-sometimes-enrich-it', - 'md5': '4e5f3dad9dbda0dbfa2da41a851e631e', + 'md5': '03582d795382e49f2fd0b427b55de409', 'info_dict': { - 'id': '728595228', + 'id': '759576926', 'ext': 'mp4', 
'title': 'Wrought', - 'thumbnail': 'https://i.vimeocdn.com/video/1484618528-c91452611f9a4e4497735a533da60d45b2fe472deb0c880f0afaab0cd2efb22a-d_1280', - 'uploader': 'Biofilm Productions', - 'uploader_id': 'user140352216', - 'uploader_url': 'https://vimeo.com/user140352216', + 'thumbnail': 'https://i.vimeocdn.com/video/1525599692-84614af88e446612f49ca966cf8f80eab2c73376bedd80555741c521c26f9a3e-d_1280', + 'uploader': 'Aeon Video', + 'uploader_id': 'aeonvideo', + 'uploader_url': 'https://vimeo.com/aeonvideo', 'duration': 1344 } + }, { + 'url': 'https://aeon.co/videos/chew-over-the-prisoners-dilemma-and-see-if-you-can-find-the-rational-path-out', + 'md5': '1cfda0bf3ae24df17d00f2c0cb6cc21b', + 'info_dict': { + 'id': 'emyi4z-O0ls', + 'ext': 'mp4', + 'title': 'How to outsmart the Prisoner’s Dilemma - Lucas Husted', + 'thumbnail': 'https://i.ytimg.com/vi_webp/emyi4z-O0ls/maxresdefault.webp', + 'uploader': 'TED-Ed', + 'uploader_id': '@TEDEd', + 'uploader_url': 'https://www.youtube.com/@TEDEd', + 'duration': 344, + 'upload_date': '20200827', + 'channel_id': 'UCsooa4yRKGN_zEE8iknghZA', + 'playable_in_embed': True, + 'description': 'md5:c0959524f08cb60f96fd010f3dfb17f3', + 'categories': ['Education'], + 'like_count': int, + 'channel': 'TED-Ed', + 'chapters': 'count:7', + 'channel_url': 'https://www.youtube.com/channel/UCsooa4yRKGN_zEE8iknghZA', + 'tags': 'count:26', + 'availability': 'public', + 'channel_follower_count': int, + 'view_count': int, + 'age_limit': 0, + 'live_status': 'not_live', + 'comment_count': int, + }, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - vimeo_id = self._search_regex(r'hosterId":\s*"(?P[0-9]+)', webpage, 'vimeo id') - vimeo_url = VimeoIE._smuggle_referrer(f'https://player.vimeo.com/video/{vimeo_id}', 'https://aeon.co') - return self.url_result(vimeo_url, VimeoIE) + embed_url = traverse_obj(self._yield_json_ld(webpage, video_id), ( + lambda _, v: v['@type'] == 'VideoObject', 
'embedUrl', {url_or_none}), get_all=False) + if not embed_url: + raise ExtractorError('No embed URL found in webpage') + if 'player.vimeo.com' in embed_url: + embed_url = VimeoIE._smuggle_referrer(embed_url, 'https://aeon.co/') + return self.url_result(embed_url) diff --git a/plugins/youtube_download/yt_dlp/extractor/afreecatv.py b/plugins/youtube_download/yt_dlp/extractor/afreecatv.py index 9276fe7..3d26d9c 100644 --- a/plugins/youtube_download/yt_dlp/extractor/afreecatv.py +++ b/plugins/youtube_download/yt_dlp/extractor/afreecatv.py @@ -76,59 +76,6 @@ class AfreecaTVIE(InfoExtractor): }, }], 'skip': 'Video is gone', - }, { - 'url': 'http://vod.afreecatv.com/PLAYER/STATION/18650793', - 'info_dict': { - 'id': '18650793', - 'ext': 'mp4', - 'title': '오늘은 다르다! 쏘님의 우월한 위아래~ 댄스리액션!', - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': '윈아디', - 'uploader_id': 'badkids', - 'duration': 107, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'http://vod.afreecatv.com/PLAYER/STATION/10481652', - 'info_dict': { - 'id': '10481652', - 'title': "BJ유트루와 함께하는 '팅커벨 메이크업!'", - 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$', - 'uploader': 'dailyapril', - 'uploader_id': 'dailyapril', - 'duration': 6492, - }, - 'playlist_count': 2, - 'playlist': [{ - 'md5': 'd8b7c174568da61d774ef0203159bf97', - 'info_dict': { - 'id': '20160502_c4c62b9d_174361386_1', - 'ext': 'mp4', - 'title': "BJ유트루와 함께하는 '팅커벨 메이크업!' (part 1)", - 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$', - 'uploader': 'dailyapril', - 'uploader_id': 'dailyapril', - 'upload_date': '20160502', - 'duration': 3601, - }, - }, { - 'md5': '58f2ce7f6044e34439ab2d50612ab02b', - 'info_dict': { - 'id': '20160502_39e739bb_174361386_2', - 'ext': 'mp4', - 'title': "BJ유트루와 함께하는 '팅커벨 메이크업!' 
(part 2)", - 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$', - 'uploader': 'dailyapril', - 'uploader_id': 'dailyapril', - 'upload_date': '20160502', - 'duration': 2891, - }, - }], - 'params': { - 'skip_download': True, - }, }, { # non standard key 'url': 'http://vod.afreecatv.com/PLAYER/STATION/20515605', @@ -146,8 +93,8 @@ class AfreecaTVIE(InfoExtractor): 'skip_download': True, }, }, { - # PARTIAL_ADULT - 'url': 'http://vod.afreecatv.com/PLAYER/STATION/32028439', + # adult content + 'url': 'https://vod.afreecatv.com/player/97267690', 'info_dict': { 'id': '20180327_27901457_202289533_1', 'ext': 'mp4', @@ -161,16 +108,25 @@ class AfreecaTVIE(InfoExtractor): 'params': { 'skip_download': True, }, - 'expected_warnings': ['adult content'], + 'skip': 'The VOD does not exist', }, { 'url': 'http://www.afreecatv.com/player/Player.swf?szType=szBjId=djleegoon&nStationNo=11273158&nBbsNo=13161095&nTitleNo=36327652', 'only_matching': True, }, { - 'url': 'http://vod.afreecatv.com/PLAYER/STATION/15055030', - 'only_matching': True, - }, { - 'url': 'http://vod.afreecatv.com/player/15055030', - 'only_matching': True, + 'url': 'https://vod.afreecatv.com/player/96753363', + 'info_dict': { + 'id': '20230108_9FF5BEE1_244432674_1', + 'ext': 'mp4', + 'uploader_id': 'rlantnghks', + 'uploader': '페이즈으', + 'duration': 10840, + 'thumbnail': 'http://videoimg.afreecatv.com/php/SnapshotLoad.php?rowKey=20230108_9FF5BEE1_244432674_1_r', + 'upload_date': '20230108', + 'title': '젠지 페이즈', + }, + 'params': { + 'skip_download': True, + }, }] @staticmethod @@ -223,26 +179,21 @@ class AfreecaTVIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - if re.search(r'alert\(["\']This video has been deleted', webpage): - raise ExtractorError( - 'Video %s has been deleted' % video_id, expected=True) - - station_id = self._search_regex( - r'nStationNo\s*=\s*(\d+)', webpage, 'station') - bbs_id = self._search_regex( - 
r'nBbsNo\s*=\s*(\d+)', webpage, 'bbs') - video_id = self._search_regex( - r'nTitleNo\s*=\s*(\d+)', webpage, 'title', default=video_id) - partial_view = False adult_view = False for _ in range(2): + data = self._download_json( + 'https://api.m.afreecatv.com/station/video/a/view', + video_id, headers={'Referer': url}, data=urlencode_postdata({ + 'nTitleNo': video_id, + 'nApiLevel': 10, + }))['data'] + if traverse_obj(data, ('code', {int})) == -6221: + raise ExtractorError('The VOD does not exist', expected=True) query = { 'nTitleNo': video_id, - 'nStationNo': station_id, - 'nBbsNo': bbs_id, + 'nStationNo': data['station_no'], + 'nBbsNo': data['bbs_no'], } if partial_view: query['partialView'] = 'SKIP_ADULT' diff --git a/plugins/youtube_download/yt_dlp/extractor/amazonminitv.py b/plugins/youtube_download/yt_dlp/extractor/amazonminitv.py index 7309968..b57d985 100644 --- a/plugins/youtube_download/yt_dlp/extractor/amazonminitv.py +++ b/plugins/youtube_download/yt_dlp/extractor/amazonminitv.py @@ -191,7 +191,7 @@ query content($sessionIdToken: String!, $deviceLocale: String, $contentId: ID!, class AmazonMiniTVSeasonIE(AmazonMiniTVBaseIE): IE_NAME = 'amazonminitv:season' _VALID_URL = r'amazonminitv:season:(?:amzn1\.dv\.gti\.)?(?P[a-f0-9-]+)' - IE_DESC = 'Amazon MiniTV Series, "minitv:season:" prefix' + IE_DESC = 'Amazon MiniTV Season, "minitv:season:" prefix' _TESTS = [{ 'url': 'amazonminitv:season:amzn1.dv.gti.0aa996eb-6a1b-4886-a342-387fbd2f1db0', 'playlist_mincount': 6, @@ -250,6 +250,7 @@ query getEpisodes($sessionIdToken: String!, $clientId: String, $episodeOrSeasonI class AmazonMiniTVSeriesIE(AmazonMiniTVBaseIE): IE_NAME = 'amazonminitv:series' _VALID_URL = r'amazonminitv:series:(?:amzn1\.dv\.gti\.)?(?P[a-f0-9-]+)' + IE_DESC = 'Amazon MiniTV Series, "minitv:series:" prefix' _TESTS = [{ 'url': 'amazonminitv:series:amzn1.dv.gti.56521d46-b040-4fd5-872e-3e70476a04b0', 'playlist_mincount': 3, diff --git a/plugins/youtube_download/yt_dlp/extractor/americastestkitchen.py 
b/plugins/youtube_download/yt_dlp/extractor/americastestkitchen.py index abda55d..e889458 100644 --- a/plugins/youtube_download/yt_dlp/extractor/americastestkitchen.py +++ b/plugins/youtube_download/yt_dlp/extractor/americastestkitchen.py @@ -11,7 +11,7 @@ from ..utils import ( class AmericasTestKitchenIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?americastestkitchen\.com/(?:cooks(?:country|illustrated)/)?(?Pepisode|videos)/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?(?:americastestkitchen|cooks(?:country|illustrated))\.com/(?:cooks(?:country|illustrated)/)?(?Pepisode|videos)/(?P\d+)' _TESTS = [{ 'url': 'https://www.americastestkitchen.com/episode/582-weeknight-japanese-suppers', 'md5': 'b861c3e365ac38ad319cfd509c30577f', @@ -72,6 +72,12 @@ class AmericasTestKitchenIE(InfoExtractor): }, { 'url': 'https://www.americastestkitchen.com/cooksillustrated/videos/4478-beef-wellington', 'only_matching': True, + }, { + 'url': 'https://www.cookscountry.com/episode/564-when-only-chocolate-will-do', + 'only_matching': True, + }, { + 'url': 'https://www.cooksillustrated.com/videos/4478-beef-wellington', + 'only_matching': True, }] def _real_extract(self, url): @@ -100,7 +106,7 @@ class AmericasTestKitchenIE(InfoExtractor): class AmericasTestKitchenSeasonIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?americastestkitchen\.com(?P/cookscountry)?/episodes/browse/season_(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?(?Pamericastestkitchen|(?Pcooks(?:country|illustrated)))\.com(?:(?:/(?Pcooks(?:country|illustrated)))?(?:/?$|(?\d+)))' _TESTS = [{ # ATK Season 'url': 'https://www.americastestkitchen.com/episodes/browse/season_1', @@ -117,29 +123,73 @@ class AmericasTestKitchenSeasonIE(InfoExtractor): 'title': 'Season 12', }, 'playlist_count': 13, + }, { + # America's Test Kitchen Series + 'url': 'https://www.americastestkitchen.com/', + 'info_dict': { + 'id': 'americastestkitchen', + 'title': 'America\'s Test Kitchen', + }, + 'playlist_count': 558, + }, { + # Cooks 
Country Series + 'url': 'https://www.americastestkitchen.com/cookscountry', + 'info_dict': { + 'id': 'cookscountry', + 'title': 'Cook\'s Country', + }, + 'playlist_count': 199, + }, { + 'url': 'https://www.americastestkitchen.com/cookscountry/', + 'only_matching': True, + }, { + 'url': 'https://www.cookscountry.com/episodes/browse/season_12', + 'only_matching': True, + }, { + 'url': 'https://www.cookscountry.com', + 'only_matching': True, + }, { + 'url': 'https://www.americastestkitchen.com/cooksillustrated/', + 'only_matching': True, + }, { + 'url': 'https://www.cooksillustrated.com', + 'only_matching': True, }] def _real_extract(self, url): - show_path, season_number = self._match_valid_url(url).group('show', 'id') - season_number = int(season_number) + season_number, show1, show = self._match_valid_url(url).group('season', 'show', 'show2') + show_path = ('/' + show) if show else '' + show = show or show1 + season_number = int_or_none(season_number) - slug = 'cco' if show_path == '/cookscountry' else 'atk' + slug, title = { + 'americastestkitchen': ('atk', 'America\'s Test Kitchen'), + 'cookscountry': ('cco', 'Cook\'s Country'), + 'cooksillustrated': ('cio', 'Cook\'s Illustrated'), + }[show] - season = 'Season %d' % season_number + facet_filters = [ + 'search_document_klass:episode', + 'search_show_slug:' + slug, + ] + + if season_number: + playlist_id = 'season_%d' % season_number + playlist_title = 'Season %d' % season_number + facet_filters.append('search_season_list:' + playlist_title) + else: + playlist_id = show + playlist_title = title season_search = self._download_json( 'https://y1fnzxui30-dsn.algolia.net/1/indexes/everest_search_%s_season_desc_production' % slug, - season, headers={ + playlist_id, headers={ 'Origin': 'https://www.americastestkitchen.com', 'X-Algolia-API-Key': '8d504d0099ed27c1b73708d22871d805', 'X-Algolia-Application-Id': 'Y1FNZXUI30', }, query={ - 'facetFilters': json.dumps([ - 'search_season_list:' + season, - 
'search_document_klass:episode', - 'search_show_slug:' + slug, - ]), - 'attributesToRetrieve': 'description,search_%s_episode_number,search_document_date,search_url,title' % slug, + 'facetFilters': json.dumps(facet_filters), + 'attributesToRetrieve': 'description,search_%s_episode_number,search_document_date,search_url,title,search_atk_episode_season' % slug, 'attributesToHighlight': '', 'hitsPerPage': 1000, }) @@ -162,4 +212,4 @@ class AmericasTestKitchenSeasonIE(InfoExtractor): } return self.playlist_result( - entries(), 'season_%d' % season_number, season) + entries(), playlist_id, playlist_title) diff --git a/plugins/youtube_download/yt_dlp/extractor/amp.py b/plugins/youtube_download/yt_dlp/extractor/amp.py index b0cbd77..0d259c5 100644 --- a/plugins/youtube_download/yt_dlp/extractor/amp.py +++ b/plugins/youtube_download/yt_dlp/extractor/amp.py @@ -5,6 +5,7 @@ from ..utils import ( int_or_none, mimetype2ext, parse_iso8601, + strip_jsonp, unified_timestamp, url_or_none, ) @@ -15,7 +16,7 @@ class AMPIE(InfoExtractor): # XXX: Conventionally, base classes should end with def _extract_feed_info(self, url): feed = self._download_json( url, None, 'Downloading Akamai AMP feed', - 'Unable to download Akamai AMP feed') + 'Unable to download Akamai AMP feed', transform_source=strip_jsonp) item = feed.get('channel', {}).get('item') if not item: raise ExtractorError('%s said: %s' % (self.IE_NAME, feed['error'])) @@ -73,8 +74,10 @@ class AMPIE(InfoExtractor): # XXX: Conventionally, base classes should end with media_url + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124', video_id, f4m_id='hds', fatal=False)) elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - media_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + fmts, subs = self._extract_m3u8_formats_and_subtitles( + media_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) else: formats.append({ 'format_id': media_data.get('media-category', 
{}).get('@attributes', {}).get('label'), diff --git a/plugins/youtube_download/yt_dlp/extractor/anchorfm.py b/plugins/youtube_download/yt_dlp/extractor/anchorfm.py new file mode 100644 index 0000000..52f2ad0 --- /dev/null +++ b/plugins/youtube_download/yt_dlp/extractor/anchorfm.py @@ -0,0 +1,98 @@ +from .common import InfoExtractor +from ..utils import ( + clean_html, + float_or_none, + int_or_none, + str_or_none, + traverse_obj, + unified_timestamp +) + + +class AnchorFMEpisodeIE(InfoExtractor): + _VALID_URL = r'https?://anchor\.fm/(?P\w+)/(?:embed/)?episodes/[\w-]+-(?P\w+)' + _EMBED_REGEX = [rf']+\bsrc=[\'"](?P{_VALID_URL})'] + _TESTS = [{ + 'url': 'https://anchor.fm/lovelyti/episodes/Chrisean-Rock-takes-to-twitter-to-announce-shes-pregnant--Blueface-denies-he-is-the-father-e1tpt3d', + 'info_dict': { + 'id': 'e1tpt3d', + 'ext': 'mp3', + 'title': ' Chrisean Rock takes to twitter to announce she\'s pregnant, Blueface denies he is the father!', + 'description': 'md5:207d167de3e28ceb4ddc1ebf5a30044c', + 'thumbnail': 'https://s3-us-west-2.amazonaws.com/anchor-generated-image-bank/production/podcast_uploaded_nologo/1034827/1034827-1658438968460-5f3bfdf3601e8.jpg', + 'duration': 624.718, + 'uploader': 'Lovelyti ', + 'uploader_id': '991541', + 'channel': 'lovelyti', + 'modified_date': '20230121', + 'modified_timestamp': 1674285178, + 'release_date': '20230121', + 'release_timestamp': 1674285179, + 'episode_id': 'e1tpt3d', + } + }, { + # embed url + 'url': 'https://anchor.fm/apakatatempo/embed/episodes/S2E75-Perang-Bintang-di-Balik-Kasus-Ferdy-Sambo-dan-Ismail-Bolong-e1shjqd', + 'info_dict': { + 'id': 'e1shjqd', + 'ext': 'mp3', + 'title': 'S2E75 Perang Bintang di Balik Kasus Ferdy Sambo dan Ismail Bolong', + 'description': 'md5:9e95ad9293bf00178bf8d33e9cb92c41', + 'duration': 1042.008, + 'thumbnail': 'https://s3-us-west-2.amazonaws.com/anchor-generated-image-bank/production/podcast_uploaded_episode400/2627805/2627805-1671590688729-4db3882ac9e4b.jpg', + 'release_date': 
'20221221', + 'release_timestamp': 1671595916, + 'modified_date': '20221221', + 'modified_timestamp': 1671590834, + 'channel': 'apakatatempo', + 'uploader': 'Podcast Tempo', + 'uploader_id': '2585461', + 'season': 'Season 2', + 'season_number': 2, + 'episode_id': 'e1shjqd', + } + }] + + _WEBPAGE_TESTS = [{ + 'url': 'https://podcast.tempo.co/podcast/192/perang-bintang-di-balik-kasus-ferdy-sambo-dan-ismail-bolong', + 'info_dict': { + 'id': 'e1shjqd', + 'ext': 'mp3', + 'release_date': '20221221', + 'duration': 1042.008, + 'season': 'Season 2', + 'modified_timestamp': 1671590834, + 'uploader_id': '2585461', + 'modified_date': '20221221', + 'description': 'md5:9e95ad9293bf00178bf8d33e9cb92c41', + 'season_number': 2, + 'title': 'S2E75 Perang Bintang di Balik Kasus Ferdy Sambo dan Ismail Bolong', + 'release_timestamp': 1671595916, + 'episode_id': 'e1shjqd', + 'thumbnail': 'https://s3-us-west-2.amazonaws.com/anchor-generated-image-bank/production/podcast_uploaded_episode400/2627805/2627805-1671590688729-4db3882ac9e4b.jpg', + 'uploader': 'Podcast Tempo', + 'channel': 'apakatatempo', + } + }] + + def _real_extract(self, url): + channel_name, episode_id = self._match_valid_url(url).group('channel_name', 'episode_id') + api_data = self._download_json(f'https://anchor.fm/api/v3/episodes/{episode_id}', episode_id) + + return { + 'id': episode_id, + 'title': traverse_obj(api_data, ('episode', 'title')), + 'url': traverse_obj(api_data, ('episode', 'episodeEnclosureUrl'), ('episodeAudios', 0, 'url')), + 'ext': 'mp3', + 'vcodec': 'none', + 'thumbnail': traverse_obj(api_data, ('episode', 'episodeImage')), + 'description': clean_html(traverse_obj(api_data, ('episode', ('description', 'descriptionPreview')), get_all=False)), + 'duration': float_or_none(traverse_obj(api_data, ('episode', 'duration')), 1000), + 'modified_timestamp': unified_timestamp(traverse_obj(api_data, ('episode', 'modified'))), + 'release_timestamp': int_or_none(traverse_obj(api_data, ('episode', 
'publishOnUnixTimestamp'))), + 'episode_id': episode_id, + 'uploader': traverse_obj(api_data, ('creator', 'name')), + 'uploader_id': str_or_none(traverse_obj(api_data, ('creator', 'userId'))), + 'season_number': int_or_none(traverse_obj(api_data, ('episode', 'podcastSeasonNumber'))), + 'channel': channel_name or traverse_obj(api_data, ('creator', 'vanitySlug')), + } diff --git a/plugins/youtube_download/yt_dlp/extractor/ant1newsgr.py b/plugins/youtube_download/yt_dlp/extractor/ant1newsgr.py index 7b384b2..217e3ac 100644 --- a/plugins/youtube_download/yt_dlp/extractor/ant1newsgr.py +++ b/plugins/youtube_download/yt_dlp/extractor/ant1newsgr.py @@ -1,8 +1,8 @@ import urllib.parse from .common import InfoExtractor +from ..networking import HEADRequest from ..utils import ( - HEADRequest, ExtractorError, determine_ext, scale_thumbnails_to_max_format_width, @@ -121,7 +121,7 @@ class Ant1NewsGrEmbedIE(Ant1NewsGrBaseIE): canonical_url = self._request_webpage( HEADRequest(url), video_id, note='Resolve canonical player URL', - errnote='Could not resolve canonical player URL').geturl() + errnote='Could not resolve canonical player URL').url _, netloc, _, _, query, _ = urllib.parse.urlparse(canonical_url) cid = urllib.parse.parse_qs(query)['cid'][0] diff --git a/plugins/youtube_download/yt_dlp/extractor/anvato.py b/plugins/youtube_download/yt_dlp/extractor/anvato.py index 79bfe41..0df5033 100644 --- a/plugins/youtube_download/yt_dlp/extractor/anvato.py +++ b/plugins/youtube_download/yt_dlp/extractor/anvato.py @@ -336,7 +336,7 @@ class AnvatoIE(InfoExtractor): elif media_format == 'm3u8-variant' or ext == 'm3u8': # For some videos the initial m3u8 URL returns JSON instead manifest_json = self._download_json( - video_url, video_id, note='Downloading manifest JSON', errnote=False) + video_url, video_id, note='Downloading manifest JSON', fatal=False) if manifest_json: video_url = manifest_json.get('master_m3u8') if not video_url: @@ -392,14 +392,6 @@ class AnvatoIE(InfoExtractor): 
url = smuggle_url(url, {'token': anvplayer_data['token']}) yield cls.url_result(url, AnvatoIE, video_id) - def _extract_anvato_videos(self, webpage, video_id): - anvplayer_data = self._parse_json( - self._html_search_regex( - self._ANVP_RE, webpage, 'Anvato player data', group='anvp'), - video_id) - return self._get_anvato_videos( - anvplayer_data['accessKey'], anvplayer_data['video'], 'default') # cbslocal token = 'default' - def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) self._initialize_geo_bypass({ diff --git a/plugins/youtube_download/yt_dlp/extractor/archiveorg.py b/plugins/youtube_download/yt_dlp/extractor/archiveorg.py index 4ccd398..2541cd6 100644 --- a/plugins/youtube_download/yt_dlp/extractor/archiveorg.py +++ b/plugins/youtube_download/yt_dlp/extractor/archiveorg.py @@ -1,16 +1,16 @@ import json import re -import urllib.error import urllib.parse from .common import InfoExtractor from .naver import NaverBaseIE from .youtube import YoutubeBaseInfoExtractor, YoutubeIE -from ..compat import compat_HTTPError, compat_urllib_parse_unquote +from ..compat import compat_urllib_parse_unquote +from ..networking import HEADRequest +from ..networking.exceptions import HTTPError from ..utils import ( KNOWN_EXTENSIONS, ExtractorError, - HEADRequest, bug_reports_message, clean_html, dict_get, @@ -899,7 +899,7 @@ class YoutubeWebArchiveIE(InfoExtractor): video_id, note='Fetching archived video file url', expected_status=True) except ExtractorError as e: # HTTP Error 404 is expected if the video is not saved. 
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: + if isinstance(e.cause, HTTPError) and e.cause.status == 404: self.raise_no_formats( 'The requested video is not archived, indexed, or there is an issue with web.archive.org (try again later)', expected=True) else: @@ -926,7 +926,7 @@ class YoutubeWebArchiveIE(InfoExtractor): info['thumbnails'] = self._extract_thumbnails(video_id) if urlh: - url = compat_urllib_parse_unquote(urlh.geturl()) + url = compat_urllib_parse_unquote(urlh.url) video_file_url_qs = parse_qs(url) # Attempt to recover any ext & format info from playback url & response headers format = {'url': url, 'filesize': int_or_none(urlh.headers.get('x-archive-orig-content-length'))} @@ -1052,7 +1052,7 @@ class VLiveWebArchiveIE(InfoExtractor): try: return self._download_webpage(f'https://web.archive.org/web/{timestamp}id_/{url}', video_id, **kwargs) except ExtractorError as e: - if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 404: + if isinstance(e.cause, HTTPError) and e.cause.status == 404: raise ExtractorError('Page was not archived', expected=True) retry.error = e continue diff --git a/plugins/youtube_download/yt_dlp/extractor/ard.py b/plugins/youtube_download/yt_dlp/extractor/ard.py index 8660741..ca1faa7 100644 --- a/plugins/youtube_download/yt_dlp/extractor/ard.py +++ b/plugins/youtube_download/yt_dlp/extractor/ard.py @@ -13,6 +13,7 @@ from ..utils import ( try_get, unified_strdate, unified_timestamp, + update_url, update_url_query, url_or_none, xpath_text, @@ -408,6 +409,23 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): (?(playlist)/(?P\d+)?/?(?:[?#]|$))''' _TESTS = [{ + 'url': 'https://www.ardmediathek.de/video/filme-im-mdr/wolfsland-die-traurigen-schwestern/mdr-fernsehen/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy8xZGY0ZGJmZS00ZWQwLTRmMGItYjhhYy0wOGQ4ZmYxNjVhZDI', + 'md5': '3fd5fead7a370a819341129c8d713136', + 'info_dict': { + 'display_id': 'filme-im-mdr/wolfsland-die-traurigen-schwestern/mdr-fernsehen', + 'id': 
'12172961', + 'title': 'Wolfsland - Die traurigen Schwestern', + 'description': r're:^Als der Polizeiobermeister Raaben', + 'duration': 5241, + 'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:efa186f7b0054957', + 'timestamp': 1670710500, + 'upload_date': '20221210', + 'ext': 'mp4', + 'age_limit': 12, + 'episode': 'Wolfsland - Die traurigen Schwestern', + 'series': 'Filme im MDR' + }, + }, { 'url': 'https://www.ardmediathek.de/mdr/video/die-robuste-roswita/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy84MWMxN2MzZC0wMjkxLTRmMzUtODk4ZS0wYzhlOWQxODE2NGI/', 'md5': 'a1dc75a39c61601b980648f7c9f9f71d', 'info_dict': { @@ -424,7 +442,7 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): 'skip': 'Error', }, { 'url': 'https://www.ardmediathek.de/video/tagesschau-oder-tagesschau-20-00-uhr/das-erste/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhZ2Vzc2NoYXUvZmM4ZDUxMjgtOTE0ZC00Y2MzLTgzNzAtNDZkNGNiZWJkOTll', - 'md5': 'f1837e563323b8a642a8ddeff0131f51', + 'md5': '1e73ded21cb79bac065117e80c81dc88', 'info_dict': { 'id': '10049223', 'ext': 'mp4', @@ -432,13 +450,11 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): 'timestamp': 1636398000, 'description': 'md5:39578c7b96c9fe50afdf5674ad985e6b', 'upload_date': '20211108', - }, - }, { - 'url': 'https://www.ardmediathek.de/sendung/beforeigners/beforeigners/staffel-1/Y3JpZDovL2Rhc2Vyc3RlLmRlL2JlZm9yZWlnbmVycw/1', - 'playlist_count': 6, - 'info_dict': { - 'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL2JlZm9yZWlnbmVycw', - 'title': 'beforeigners/beforeigners/staffel-1', + 'display_id': 'tagesschau-oder-tagesschau-20-00-uhr/das-erste', + 'duration': 915, + 'episode': 'tagesschau, 20:00 Uhr', + 'series': 'tagesschau', + 'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:fbb21142783b0a49', }, }, { 'url': 'https://beta.ardmediathek.de/ard/video/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE', @@ -602,6 +618,9 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): show { title } + image { + src + } synopsis 
title tracking { @@ -640,6 +659,15 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): 'description': description, 'timestamp': unified_timestamp(player_page.get('broadcastedOn')), 'series': try_get(player_page, lambda x: x['show']['title']), + 'thumbnail': (media_collection.get('_previewImage') + or try_get(player_page, lambda x: update_url(x['image']['src'], query=None, fragment=None)) + or self.get_thumbnail_from_html(display_id, url)), }) info.update(self._ARD_extract_episode_info(info['title'])) return info + + def get_thumbnail_from_html(self, display_id, url): + webpage = self._download_webpage(url, display_id, fatal=False) or '' + return ( + self._og_search_thumbnail(webpage, default=None) + or self._html_search_meta('thumbnailUrl', webpage, default=None)) diff --git a/plugins/youtube_download/yt_dlp/extractor/atresplayer.py b/plugins/youtube_download/yt_dlp/extractor/atresplayer.py index a20e7f9..3a44e52 100644 --- a/plugins/youtube_download/yt_dlp/extractor/atresplayer.py +++ b/plugins/youtube_download/yt_dlp/extractor/atresplayer.py @@ -1,5 +1,5 @@ from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, int_or_none, @@ -34,8 +34,8 @@ class AtresPlayerIE(InfoExtractor): _API_BASE = 'https://api.atresplayer.com/' def _handle_error(self, e, code): - if isinstance(e.cause, compat_HTTPError) and e.cause.code == code: - error = self._parse_json(e.cause.read(), None) + if isinstance(e.cause, HTTPError) and e.cause.status == code: + error = self._parse_json(e.cause.response.read(), None) if error.get('error') == 'required_registered': self.raise_login_required() raise ExtractorError(error['error_description'], expected=True) diff --git a/plugins/youtube_download/yt_dlp/extractor/bbc.py b/plugins/youtube_download/yt_dlp/extractor/bbc.py index 9d28e70..a55cdef 100644 --- a/plugins/youtube_download/yt_dlp/extractor/bbc.py +++ 
b/plugins/youtube_download/yt_dlp/extractor/bbc.py @@ -2,11 +2,11 @@ import functools import itertools import json import re -import urllib.error import xml.etree.ElementTree from .common import InfoExtractor -from ..compat import compat_HTTPError, compat_str, compat_urlparse +from ..compat import compat_str, compat_urlparse +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, OnDemandPagedList, @@ -277,7 +277,7 @@ class BBCCoUkIE(InfoExtractor): post_url, None, 'Logging in', data=urlencode_postdata(login_form), headers={'Referer': self._LOGIN_URL}) - if self._LOGIN_URL in urlh.geturl(): + if self._LOGIN_URL in urlh.url: error = clean_html(get_element_by_class('form-message', response)) if error: raise ExtractorError( @@ -388,8 +388,8 @@ class BBCCoUkIE(InfoExtractor): href, programme_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id=format_id, fatal=False) except ExtractorError as e: - if not (isinstance(e.exc_info[1], urllib.error.HTTPError) - and e.exc_info[1].code in (403, 404)): + if not (isinstance(e.exc_info[1], HTTPError) + and e.exc_info[1].status in (403, 404)): raise fmts = [] formats.extend(fmts) @@ -472,7 +472,7 @@ class BBCCoUkIE(InfoExtractor): return programme_id, title, description, duration, formats, subtitles except ExtractorError as ee: - if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404): + if not (isinstance(ee.cause, HTTPError) and ee.cause.status == 404): raise # fallback to legacy playlist @@ -983,7 +983,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE # Some playlist URL may fail with 500, at the same time # the other one may work fine (e.g. 
# http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu) - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500: + if isinstance(e.cause, HTTPError) and e.cause.status == 500: continue raise if entry: diff --git a/plugins/youtube_download/yt_dlp/extractor/bibeltv.py b/plugins/youtube_download/yt_dlp/extractor/bibeltv.py index fd20aad..34464da 100644 --- a/plugins/youtube_download/yt_dlp/extractor/bibeltv.py +++ b/plugins/youtube_download/yt_dlp/extractor/bibeltv.py @@ -1,27 +1,197 @@ +from functools import partial + from .common import InfoExtractor +from ..utils import ( + ExtractorError, + clean_html, + determine_ext, + format_field, + int_or_none, + js_to_json, + orderedSet, + parse_iso8601, + traverse_obj, + url_or_none, +) -class BibelTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?bibeltv\.de/mediathek/videos/(?:crn/)?(?P\d+)' - _TESTS = [{ - 'url': 'https://www.bibeltv.de/mediathek/videos/329703-sprachkurs-in-malaiisch', - 'md5': '252f908192d611de038b8504b08bf97f', - 'info_dict': { - 'id': 'ref:329703', - 'ext': 'mp4', - 'title': 'Sprachkurs in Malaiisch', - 'description': 'md5:3e9f197d29ee164714e67351cf737dfe', - 'timestamp': 1608316701, - 'uploader_id': '5840105145001', - 'upload_date': '20201218', +class BibelTVBaseIE(InfoExtractor): + _GEO_COUNTRIES = ['AT', 'CH', 'DE'] + _GEO_BYPASS = False + + API_URL = 'https://www.bibeltv.de/mediathek/api' + AUTH_TOKEN = 'j88bRXY8DsEqJ9xmTdWhrByVi5Hm' + + def _extract_formats_and_subtitles(self, data, crn_id, *, is_live=False): + formats = [] + subtitles = {} + for media_url in traverse_obj(data, (..., 'src', {url_or_none})): + media_ext = determine_ext(media_url) + if media_ext == 'm3u8': + m3u8_formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles( + media_url, crn_id, live=is_live) + formats.extend(m3u8_formats) + subtitles.update(m3u8_subs) + elif media_ext == 'mpd': + mpd_formats, mpd_subs = self._extract_mpd_formats_and_subtitles(media_url, crn_id) + 
formats.extend(mpd_formats) + subtitles.update(mpd_subs) + elif media_ext == 'mp4': + formats.append({'url': media_url}) + else: + self.report_warning(f'Unknown format {media_ext!r}') + + return formats, subtitles + + @staticmethod + def _extract_base_info(data): + return { + 'id': data['crn'], + **traverse_obj(data, { + 'title': 'title', + 'description': 'description', + 'duration': ('duration', {partial(int_or_none, scale=1000)}), + 'timestamp': ('schedulingStart', {parse_iso8601}), + 'season_number': 'seasonNumber', + 'episode_number': 'episodeNumber', + 'view_count': 'viewCount', + 'like_count': 'likeCount', + }), + 'thumbnails': orderedSet(traverse_obj(data, ('images', ..., { + 'url': ('url', {url_or_none}), + }))), } - }, { - 'url': 'https://www.bibeltv.de/mediathek/videos/crn/326374', - 'only_matching': True, + + def _extract_url_info(self, data): + return { + '_type': 'url', + 'url': format_field(data, 'slug', 'https://www.bibeltv.de/mediathek/videos/%s'), + **self._extract_base_info(data), + } + + def _extract_video_info(self, data): + crn_id = data['crn'] + + if data.get('drm'): + self.report_drm(crn_id) + + json_data = self._download_json( + format_field(data, 'id', f'{self.API_URL}/video/%s'), crn_id, + headers={'Authorization': self.AUTH_TOKEN}, fatal=False, + errnote='No formats available') or {} + + formats, subtitles = self._extract_formats_and_subtitles( + traverse_obj(json_data, ('video', 'videoUrls', ...)), crn_id) + + return { + '_type': 'video', + **self._extract_base_info(data), + 'formats': formats, + 'subtitles': subtitles, + } + + +class BibelTVVideoIE(BibelTVBaseIE): + IE_DESC = 'BibelTV single video' + _VALID_URL = r'https?://(?:www\.)?bibeltv\.de/mediathek/videos/(?P\d+)[\w-]+' + IE_NAME = 'bibeltv:video' + + _TESTS = [{ + 'url': 'https://www.bibeltv.de/mediathek/videos/344436-alte-wege', + 'md5': 'ec1c07efe54353780512e8a4103b612e', + 'info_dict': { + 'id': '344436', + 'ext': 'mp4', + 'title': 'Alte Wege', + 'description': 
'md5:2f4eb7294c9797a47b8fd13cccca22e9', + 'timestamp': 1677877071, + 'duration': 150.0, + 'upload_date': '20230303', + 'thumbnail': r're:https://bibeltv\.imgix\.net/[\w-]+\.jpg', + 'episode': 'Episode 1', + 'episode_number': 1, + 'view_count': int, + 'like_count': int, + }, + 'params': { + 'format': '6', + }, }] - BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5840105145001/default_default/index.html?videoId=ref:%s' def _real_extract(self, url): crn_id = self._match_id(url) - return self.url_result( - self.BRIGHTCOVE_URL_TEMPLATE % crn_id, 'BrightcoveNew') + video_data = traverse_obj( + self._search_nextjs_data(self._download_webpage(url, crn_id), crn_id), + ('props', 'pageProps', 'videoPageData', 'videos', 0, {dict})) + if not video_data: + raise ExtractorError('Missing video data.') + + return self._extract_video_info(video_data) + + +class BibelTVSeriesIE(BibelTVBaseIE): + IE_DESC = 'BibelTV series playlist' + _VALID_URL = r'https?://(?:www\.)?bibeltv\.de/mediathek/serien/(?P\d+)[\w-]+' + IE_NAME = 'bibeltv:series' + + _TESTS = [{ + 'url': 'https://www.bibeltv.de/mediathek/serien/333485-ein-wunder-fuer-jeden-tag', + 'playlist_mincount': 400, + 'info_dict': { + 'id': '333485', + 'title': 'Ein Wunder für jeden Tag', + 'description': 'Tägliche Kurzandacht mit Déborah Rosenkranz.', + }, + }] + + def _real_extract(self, url): + crn_id = self._match_id(url) + webpage = self._download_webpage(url, crn_id) + nextjs_data = self._search_nextjs_data(webpage, crn_id) + series_data = traverse_obj(nextjs_data, ('props', 'pageProps', 'seriePageData', {dict})) + if not series_data: + raise ExtractorError('Missing series data.') + + return self.playlist_result( + traverse_obj(series_data, ('videos', ..., {dict}, {self._extract_url_info})), + crn_id, series_data.get('title'), clean_html(series_data.get('description'))) + + +class BibelTVLiveIE(BibelTVBaseIE): + IE_DESC = 'BibelTV live program' + _VALID_URL = r'https?://(?:www\.)?bibeltv\.de/livestreams/(?P[\w-]+)' + 
IE_NAME = 'bibeltv:live' + + _TESTS = [{ + 'url': 'https://www.bibeltv.de/livestreams/bibeltv/', + 'info_dict': { + 'id': 'bibeltv', + 'ext': 'mp4', + 'title': 're:Bibel TV', + 'live_status': 'is_live', + 'thumbnail': 'https://streampreview.bibeltv.de/bibeltv.webp', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.bibeltv.de/livestreams/impuls/', + 'only_matching': True, + }] + + def _real_extract(self, url): + stream_id = self._match_id(url) + webpage = self._download_webpage(url, stream_id) + stream_data = self._search_json( + r'\\"video\\":', webpage, 'bibeltvData', stream_id, + transform_source=lambda jstring: js_to_json(jstring.replace('\\"', '"'))) + + formats, subtitles = self._extract_formats_and_subtitles( + traverse_obj(stream_data, ('src', ...)), stream_id, is_live=True) + + return { + 'id': stream_id, + 'title': stream_data.get('title'), + 'thumbnail': stream_data.get('poster'), + 'is_live': True, + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/plugins/youtube_download/yt_dlp/extractor/bilibili.py b/plugins/youtube_download/yt_dlp/extractor/bilibili.py index f418063..cb7ab2a 100644 --- a/plugins/youtube_download/yt_dlp/extractor/bilibili.py +++ b/plugins/youtube_download/yt_dlp/extractor/bilibili.py @@ -1,12 +1,14 @@ import base64 import functools +import hashlib import itertools import math -import urllib.error +import time import urllib.parse from .common import InfoExtractor, SearchInfoExtractor from ..dependencies import Cryptodome +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, GeoRestrictedError, @@ -16,6 +18,7 @@ from ..utils import ( float_or_none, format_field, int_or_none, + join_nonempty, make_archive_id, merge_dicts, mimetype2ext, @@ -26,6 +29,8 @@ from ..utils import ( srt_subtitles_timecode, str_or_none, traverse_obj, + try_call, + unified_timestamp, unsmuggle_url, url_or_none, urlencode_postdata, @@ -81,7 +86,7 @@ class BilibiliBaseIE(InfoExtractor): 
f'{line["content"]}\n\n') return srt_data - def _get_subtitles(self, video_id, initial_state, cid): + def _get_subtitles(self, video_id, aid, cid): subtitles = { 'danmaku': [{ 'ext': 'xml', @@ -89,7 +94,8 @@ class BilibiliBaseIE(InfoExtractor): }] } - for s in traverse_obj(initial_state, ('videoData', 'subtitle', 'list')) or []: + video_info_json = self._download_json(f'https://api.bilibili.com/x/player/v2?aid={aid}&cid={cid}', video_id) + for s in traverse_obj(video_info_json, ('data', 'subtitle', 'subtitles', ...)): subtitles.setdefault(s['lan'], []).append({ 'ext': 'srt', 'data': self.json2srt(self._download_json(s['subtitle_url'], video_id)) @@ -130,9 +136,20 @@ class BilibiliBaseIE(InfoExtractor): for children in map(self._get_all_children, traverse_obj(reply, ('replies', ...))): yield from children + def _get_episodes_from_season(self, ss_id, url): + season_info = self._download_json( + 'https://api.bilibili.com/pgc/web/season/section', ss_id, + note='Downloading season info', query={'season_id': ss_id}, + headers={'Referer': url, **self.geo_verification_headers()}) + + for entry in traverse_obj(season_info, ( + 'result', 'main_section', 'episodes', + lambda _, v: url_or_none(v['share_url']) and v['id'])): + yield self.url_result(entry['share_url'], BiliBiliBangumiIE, f'ep{entry["id"]}') + class BiliBiliIE(BilibiliBaseIE): - _VALID_URL = r'https?://www\.bilibili\.com/video/[aAbB][vV](?P[^/?#&]+)' + _VALID_URL = r'https?://www\.bilibili\.com/(?:video/|festival/\w+\?(?:[^#]*&)?bvid=)[aAbB][vV](?P[^/?#&]+)' _TESTS = [{ 'url': 'https://www.bilibili.com/video/BV13x41117TL', @@ -280,19 +297,60 @@ class BiliBiliIE(BilibiliBaseIE): 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', }, 'params': {'skip_download': True}, + }, { + 'note': 'video redirects to festival page', + 'url': 'https://www.bilibili.com/video/BV1wP4y1P72h', + 'info_dict': { + 'id': 'BV1wP4y1P72h', + 'ext': 'mp4', + 'title': '牛虎年相交之际,一首传统民族打击乐《牛斗虎》祝大家新春快乐,虎年大吉!【bilibili音乐虎闹新春】', + 'timestamp': 
1643947497, + 'upload_date': '20220204', + 'description': 'md5:8681a0d4d2c06b4ae27e59c8080a7fe6', + 'uploader': '叨叨冯聊音乐', + 'duration': 246.719, + 'uploader_id': '528182630', + 'view_count': int, + 'like_count': int, + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', + }, + 'params': {'skip_download': True}, + }, { + 'note': 'newer festival video', + 'url': 'https://www.bilibili.com/festival/2023honkaiimpact3gala?bvid=BV1ay4y1d77f', + 'info_dict': { + 'id': 'BV1ay4y1d77f', + 'ext': 'mp4', + 'title': '【崩坏3新春剧场】为特别的你送上祝福!', + 'timestamp': 1674273600, + 'upload_date': '20230121', + 'description': 'md5:58af66d15c6a0122dc30c8adfd828dd8', + 'uploader': '果蝇轰', + 'duration': 1111.722, + 'uploader_id': '8469526', + 'view_count': int, + 'like_count': int, + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', + }, + 'params': {'skip_download': True}, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', video_id) - play_info = self._search_json(r'window\.__playinfo__\s*=', webpage, 'play info', video_id)['data'] - video_data = initial_state['videoData'] + is_festival = 'videoData' not in initial_state + if is_festival: + video_data = initial_state['videoInfo'] + else: + play_info = self._search_json(r'window\.__playinfo__\s*=', webpage, 'play info', video_id)['data'] + video_data = initial_state['videoData'] + video_id, title = video_data['bvid'], video_data.get('title') # Bilibili anthologies are similar to playlists but all videos share the same video ID as the anthology itself. 
- page_list_json = traverse_obj( + page_list_json = not is_festival and traverse_obj( self._download_json( 'https://api.bilibili.com/x/player/pagelist', video_id, fatal=False, query={'bvid': video_id, 'jsonp': 'jsonp'}, @@ -315,99 +373,135 @@ class BiliBiliIE(BilibiliBaseIE): cid = traverse_obj(video_data, ('pages', part_id - 1, 'cid')) if part_id else video_data.get('cid') + festival_info = {} + if is_festival: + play_info = self._download_json( + 'https://api.bilibili.com/x/player/playurl', video_id, + query={'bvid': video_id, 'cid': cid, 'fnval': 4048}, + note='Extracting festival video formats')['data'] + + festival_info = traverse_obj(initial_state, { + 'uploader': ('videoInfo', 'upName'), + 'uploader_id': ('videoInfo', 'upMid', {str_or_none}), + 'like_count': ('videoStatus', 'like', {int_or_none}), + 'thumbnail': ('sectionEpisodes', lambda _, v: v['bvid'] == video_id, 'cover'), + }, get_all=False) + return { + **traverse_obj(initial_state, { + 'uploader': ('upData', 'name'), + 'uploader_id': ('upData', 'mid', {str_or_none}), + 'like_count': ('videoData', 'stat', 'like', {int_or_none}), + 'tags': ('tags', ..., 'tag_name'), + 'thumbnail': ('videoData', 'pic', {url_or_none}), + }), + **festival_info, + **traverse_obj(video_data, { + 'description': 'desc', + 'timestamp': ('pubdate', {int_or_none}), + 'view_count': (('viewCount', ('stat', 'view')), {int_or_none}), + 'comment_count': ('stat', 'reply', {int_or_none}), + }, get_all=False), 'id': f'{video_id}{format_field(part_id, None, "_p%d")}', 'formats': self.extract_formats(play_info), '_old_archive_ids': [make_archive_id(self, old_video_id)] if old_video_id else None, 'title': title, - 'description': traverse_obj(initial_state, ('videoData', 'desc')), - 'view_count': traverse_obj(initial_state, ('videoData', 'stat', 'view')), - 'uploader': traverse_obj(initial_state, ('upData', 'name')), - 'uploader_id': traverse_obj(initial_state, ('upData', 'mid')), - 'like_count': traverse_obj(initial_state, ('videoData', 
'stat', 'like')), - 'comment_count': traverse_obj(initial_state, ('videoData', 'stat', 'reply')), - 'tags': traverse_obj(initial_state, ('tags', ..., 'tag_name')), - 'thumbnail': traverse_obj(initial_state, ('videoData', 'pic')), - 'timestamp': traverse_obj(initial_state, ('videoData', 'pubdate')), 'duration': float_or_none(play_info.get('timelength'), scale=1000), 'chapters': self._get_chapters(aid, cid), - 'subtitles': self.extract_subtitles(video_id, initial_state, cid), + 'subtitles': self.extract_subtitles(video_id, aid, cid), '__post_extractor': self.extract_comments(aid), 'http_headers': {'Referer': url}, } class BiliBiliBangumiIE(BilibiliBaseIE): - _VALID_URL = r'(?x)https?://www\.bilibili\.com/bangumi/play/(?P(?:ss|ep)\d+)' + _VALID_URL = r'https?://(?:www\.)?bilibili\.com/bangumi/play/(?Pep\d+)' _TESTS = [{ - 'url': 'https://www.bilibili.com/bangumi/play/ss897', + 'url': 'https://www.bilibili.com/bangumi/play/ep267851', 'info_dict': { - 'id': 'ss897', + 'id': '267851', 'ext': 'mp4', - 'series': '神的记事本', - 'season': '神的记事本', - 'season_id': 897, + 'series': '鬼灭之刃', + 'series_id': '4358', + 'season': '鬼灭之刃', + 'season_id': '26801', 'season_number': 1, - 'episode': '你与旅行包', - 'episode_number': 2, - 'title': '神的记事本:第2话 你与旅行包', - 'duration': 1428.487, - 'timestamp': 1310809380, - 'upload_date': '20110716', - 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', + 'episode': '残酷', + 'episode_id': '267851', + 'episode_number': 1, + 'title': '1 残酷', + 'duration': 1425.256, + 'timestamp': 1554566400, + 'upload_date': '20190406', + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$' }, - }, { - 'url': 'https://www.bilibili.com/bangumi/play/ep508406', - 'only_matching': True, + 'skip': 'According to the copyright owner\'s request, you may only watch the video after you are premium member.' 
}] def _real_extract(self, url): video_id = self._match_id(url) + episode_id = video_id[2:] webpage = self._download_webpage(url, video_id) if '您所在的地区无法观看本片' in webpage: raise GeoRestrictedError('This video is restricted') - elif ('开通大会员观看' in webpage and '__playinfo__' not in webpage - or '正在观看预览,大会员免费看全片' in webpage): + elif '正在观看预览,大会员免费看全片' in webpage: self.raise_login_required('This video is for premium members only') - play_info = self._search_json(r'window\.__playinfo__\s*=', webpage, 'play info', video_id)['data'] + headers = {'Referer': url, **self.geo_verification_headers()} + play_info = self._download_json( + 'https://api.bilibili.com/pgc/player/web/v2/playurl', video_id, + 'Extracting episode', query={'fnval': '4048', 'ep_id': episode_id}, + headers=headers) + premium_only = play_info.get('code') == -10403 + play_info = traverse_obj(play_info, ('result', 'video_info', {dict})) or {} + formats = self.extract_formats(play_info) - if (not formats and '成为大会员抢先看' in webpage - and play_info.get('durl') and not play_info.get('dash')): + if not formats and (premium_only or '成为大会员抢先看' in webpage or '开通大会员观看' in webpage): self.raise_login_required('This video is for premium members only') - initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', video_id) + bangumi_info = self._download_json( + 'https://api.bilibili.com/pgc/view/web/season', video_id, 'Get episode details', + query={'ep_id': episode_id}, headers=headers)['result'] - season_id = traverse_obj(initial_state, ('mediaInfo', 'season_id')) + episode_number, episode_info = next(( + (idx, ep) for idx, ep in enumerate(traverse_obj( + bangumi_info, ('episodes', ..., {dict})), 1) + if str_or_none(ep.get('id')) == episode_id), (1, {})) + + season_id = bangumi_info.get('season_id') season_number = season_id and next(( idx + 1 for idx, e in enumerate( - traverse_obj(initial_state, ('mediaInfo', 'seasons', ...))) + traverse_obj(bangumi_info, ('seasons', ...))) if 
e.get('season_id') == season_id ), None) + aid = episode_info.get('aid') + return { 'id': video_id, 'formats': formats, - 'title': traverse_obj(initial_state, 'h1Title'), - 'episode': traverse_obj(initial_state, ('epInfo', 'long_title')), - 'episode_number': int_or_none(traverse_obj(initial_state, ('epInfo', 'title'))), - 'series': traverse_obj(initial_state, ('mediaInfo', 'series')), - 'season': traverse_obj(initial_state, ('mediaInfo', 'season_title')), - 'season_id': season_id, + **traverse_obj(bangumi_info, { + 'series': ('series', 'series_title', {str}), + 'series_id': ('series', 'series_id', {str_or_none}), + 'thumbnail': ('square_cover', {url_or_none}), + }), + 'title': join_nonempty('title', 'long_title', delim=' ', from_dict=episode_info), + 'episode': episode_info.get('long_title'), + 'episode_id': episode_id, + 'episode_number': int_or_none(episode_info.get('title')) or episode_number, + 'season_id': str_or_none(season_id), 'season_number': season_number, - 'thumbnail': traverse_obj(initial_state, ('epInfo', 'cover')), - 'timestamp': traverse_obj(initial_state, ('epInfo', 'pub_time')), + 'timestamp': int_or_none(episode_info.get('pub_time')), 'duration': float_or_none(play_info.get('timelength'), scale=1000), - 'subtitles': self.extract_subtitles( - video_id, initial_state, traverse_obj(initial_state, ('epInfo', 'cid'))), - '__post_extractor': self.extract_comments(traverse_obj(initial_state, ('epInfo', 'aid'))), - 'http_headers': {'Referer': url, **self.geo_verification_headers()}, + 'subtitles': self.extract_subtitles(video_id, aid, episode_info.get('cid')), + '__post_extractor': self.extract_comments(aid), + 'http_headers': headers, } -class BiliBiliBangumiMediaIE(InfoExtractor): +class BiliBiliBangumiMediaIE(BilibiliBaseIE): _VALID_URL = r'https?://www\.bilibili\.com/bangumi/media/md(?P\d+)' _TESTS = [{ 'url': 'https://www.bilibili.com/bangumi/media/md24097891', @@ -420,16 +514,26 @@ class BiliBiliBangumiMediaIE(InfoExtractor): def 
_real_extract(self, url): media_id = self._match_id(url) webpage = self._download_webpage(url, media_id) + ss_id = self._search_json( + r'window\.__INITIAL_STATE__\s*=', webpage, 'initial_state', media_id)['mediaInfo']['season_id'] - initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial_state', media_id) - episode_list = self._download_json( - 'https://api.bilibili.com/pgc/web/season/section', media_id, - query={'season_id': initial_state['mediaInfo']['season_id']}, - note='Downloading season info')['result']['main_section']['episodes'] + return self.playlist_result(self._get_episodes_from_season(ss_id, url), media_id) - return self.playlist_result(( - self.url_result(entry['share_url'], BiliBiliBangumiIE, entry['aid']) - for entry in episode_list), media_id) + +class BiliBiliBangumiSeasonIE(BilibiliBaseIE): + _VALID_URL = r'(?x)https?://www\.bilibili\.com/bangumi/play/ss(?P\d+)' + _TESTS = [{ + 'url': 'https://www.bilibili.com/bangumi/play/ss26801', + 'info_dict': { + 'id': '26801' + }, + 'playlist_mincount': 26 + }] + + def _real_extract(self, url): + ss_id = self._match_id(url) + + return self.playlist_result(self._get_episodes_from_season(ss_id, url), ss_id) class BilibiliSpaceBaseIE(InfoExtractor): @@ -452,21 +556,65 @@ class BilibiliSpaceVideoIE(BilibiliSpaceBaseIE): 'id': '3985676', }, 'playlist_mincount': 178, + }, { + 'url': 'https://space.bilibili.com/313580179/video', + 'info_dict': { + 'id': '313580179', + }, + 'playlist_mincount': 92, }] + def _extract_signature(self, playlist_id): + session_data = self._download_json('https://api.bilibili.com/x/web-interface/nav', playlist_id, fatal=False) + + key_from_url = lambda x: x[x.rfind('/') + 1:].split('.')[0] + img_key = traverse_obj( + session_data, ('data', 'wbi_img', 'img_url', {key_from_url})) or '34478ba821254d9d93542680e3b86100' + sub_key = traverse_obj( + session_data, ('data', 'wbi_img', 'sub_url', {key_from_url})) or '7e16a90d190a4355a78fd00b32a38de6' + + session_key 
= img_key + sub_key + + signature_values = [] + for position in ( + 46, 47, 18, 2, 53, 8, 23, 32, 15, 50, 10, 31, 58, 3, 45, 35, 27, 43, 5, 49, 33, 9, 42, 19, 29, 28, 14, 39, + 12, 38, 41, 13, 37, 48, 7, 16, 24, 55, 40, 61, 26, 17, 0, 1, 60, 51, 30, 4, 22, 25, 54, 21, 56, 59, 6, 63, + 57, 62, 11, 36, 20, 34, 44, 52 + ): + char_at_position = try_call(lambda: session_key[position]) + if char_at_position: + signature_values.append(char_at_position) + + return ''.join(signature_values)[:32] + def _real_extract(self, url): playlist_id, is_video_url = self._match_valid_url(url).group('id', 'video') if not is_video_url: self.to_screen('A channel URL was given. Only the channel\'s videos will be downloaded. ' 'To download audios, add a "/audio" to the URL') + signature = self._extract_signature(playlist_id) + def fetch_page(page_idx): + query = { + 'keyword': '', + 'mid': playlist_id, + 'order': 'pubdate', + 'order_avoided': 'true', + 'platform': 'web', + 'pn': page_idx + 1, + 'ps': 30, + 'tid': 0, + 'web_location': 1550101, + 'wts': int(time.time()), + } + query['w_rid'] = hashlib.md5(f'{urllib.parse.urlencode(query)}{signature}'.encode()).hexdigest() + try: - response = self._download_json('https://api.bilibili.com/x/space/arc/search', - playlist_id, note=f'Downloading page {page_idx}', - query={'mid': playlist_id, 'pn': page_idx + 1, 'jsonp': 'jsonp'}) + response = self._download_json('https://api.bilibili.com/x/space/wbi/arc/search', + playlist_id, note=f'Downloading page {page_idx}', query=query) except ExtractorError as e: - if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 412: + if isinstance(e.cause, HTTPError) and e.cause.status == 412: raise ExtractorError( 'Request is blocked by server (412), please add cookies, wait and try later.', expected=True) raise @@ -494,9 +642,9 @@ class BilibiliSpaceVideoIE(BilibiliSpaceBaseIE): class BilibiliSpaceAudioIE(BilibiliSpaceBaseIE): _VALID_URL = r'https?://space\.bilibili\.com/(?P\d+)/audio' _TESTS = [{ - 
'url': 'https://space.bilibili.com/3985676/audio', + 'url': 'https://space.bilibili.com/313580179/audio', 'info_dict': { - 'id': '3985676', + 'id': '313580179', }, 'playlist_mincount': 1, }] @@ -894,15 +1042,15 @@ class BiliIntlBaseIE(InfoExtractor): } def _perform_login(self, username, password): - if not Cryptodome: + if not Cryptodome.RSA: raise ExtractorError('pycryptodomex not found. Please install', expected=True) key_data = self._download_json( 'https://passport.bilibili.tv/x/intl/passport-login/web/key?lang=en-US', None, note='Downloading login key', errnote='Unable to download login key')['data'] - public_key = Cryptodome.PublicKey.RSA.importKey(key_data['key']) - password_hash = Cryptodome.Cipher.PKCS1_v1_5.new(public_key).encrypt((key_data['hash'] + password).encode('utf-8')) + public_key = Cryptodome.RSA.importKey(key_data['key']) + password_hash = Cryptodome.PKCS1_v1_5.new(public_key).encrypt((key_data['hash'] + password).encode('utf-8')) login_post = self._download_json( 'https://passport.bilibili.tv/x/intl/passport-login/web/login/password?lang=en-US', None, data=urlencode_postdata({ 'username': username, @@ -995,6 +1143,53 @@ class BiliIntlIE(BiliIntlBaseIE): 'thumbnail': r're:https?://pic[-\.]bstarstatic.+/ugc/.+\.jpg$', 'upload_date': '20221212', 'title': 'Kimetsu no Yaiba Season 3 Official Trailer - Bstation', + }, + }, { + # episode comment extraction + 'url': 'https://www.bilibili.tv/en/play/34580/340317', + 'info_dict': { + 'id': '340317', + 'ext': 'mp4', + 'timestamp': 1604057820, + 'upload_date': '20201030', + 'episode_number': 5, + 'title': 'E5 - My Own Steel', + 'description': 'md5:2b17ab10aebb33e3c2a54da9e8e487e2', + 'thumbnail': r're:https?://pic\.bstarstatic\.com/ogv/.+\.png$', + 'episode': 'Episode 5', + 'comment_count': int, + 'chapters': [{ + 'start_time': 0, + 'end_time': 61.0, + 'title': '' + }, { + 'start_time': 61.0, + 'end_time': 134.0, + 'title': 'Intro' + }, { + 'start_time': 1290.0, + 'end_time': 1379.0, + 'title': 'Outro' + 
}], + }, + 'params': { + 'getcomments': True + } + }, { + # user generated content comment extraction + 'url': 'https://www.bilibili.tv/en/video/2045730385', + 'info_dict': { + 'id': '2045730385', + 'ext': 'mp4', + 'description': 'md5:693b6f3967fb4e7e7764ea817857c33a', + 'timestamp': 1667891924, + 'upload_date': '20221108', + 'title': 'That Time I Got Reincarnated as a Slime: Scarlet Bond - Official Trailer 3| AnimeStan - Bstation', + 'comment_count': int, + 'thumbnail': 'https://pic.bstarstatic.com/ugc/f6c363659efd2eabe5683fbb906b1582.jpg', + }, + 'params': { + 'getcomments': True } }, { # episode id without intro and outro @@ -1054,11 +1249,69 @@ class BiliIntlIE(BiliIntlBaseIE): # XXX: webpage metadata may not accurate, it just used to not crash when video_data not found return merge_dicts( - self._parse_video_metadata(video_data), self._search_json_ld(webpage, video_id), { + self._parse_video_metadata(video_data), self._search_json_ld(webpage, video_id, fatal=False), { 'title': self._html_search_meta('og:title', webpage), 'description': self._html_search_meta('og:description', webpage) }) + def _get_comments_reply(self, root_id, next_id=0, display_id=None): + comment_api_raw_data = self._download_json( + 'https://api.bilibili.tv/reply/web/detail', display_id, + note=f'Downloading reply comment of {root_id} - {next_id}', + query={ + 'platform': 'web', + 'ps': 20, # comment's reply per page (default: 3) + 'root': root_id, + 'next': next_id, + }) + + for replies in traverse_obj(comment_api_raw_data, ('data', 'replies', ...)): + yield { + 'author': traverse_obj(replies, ('member', 'name')), + 'author_id': traverse_obj(replies, ('member', 'mid')), + 'author_thumbnail': traverse_obj(replies, ('member', 'face')), + 'text': traverse_obj(replies, ('content', 'message')), + 'id': replies.get('rpid'), + 'like_count': int_or_none(replies.get('like_count')), + 'parent': replies.get('parent'), + 'timestamp': unified_timestamp(replies.get('ctime_text')) + } + + if not 
traverse_obj(comment_api_raw_data, ('data', 'cursor', 'is_end')): + yield from self._get_comments_reply( + root_id, comment_api_raw_data['data']['cursor']['next'], display_id) + + def _get_comments(self, video_id, ep_id): + for i in itertools.count(0): + comment_api_raw_data = self._download_json( + 'https://api.bilibili.tv/reply/web/root', video_id, + note=f'Downloading comment page {i + 1}', + query={ + 'platform': 'web', + 'pn': i, # page number + 'ps': 20, # comment per page (default: 20) + 'oid': video_id, + 'type': 3 if ep_id else 1, # 1: user generated content, 3: series content + 'sort_type': 1, # 1: best, 2: recent + }) + + for replies in traverse_obj(comment_api_raw_data, ('data', 'replies', ...)): + yield { + 'author': traverse_obj(replies, ('member', 'name')), + 'author_id': traverse_obj(replies, ('member', 'mid')), + 'author_thumbnail': traverse_obj(replies, ('member', 'face')), + 'text': traverse_obj(replies, ('content', 'message')), + 'id': replies.get('rpid'), + 'like_count': int_or_none(replies.get('like_count')), + 'timestamp': unified_timestamp(replies.get('ctime_text')), + 'author_is_uploader': bool(traverse_obj(replies, ('member', 'type'))), + } + if replies.get('count'): + yield from self._get_comments_reply(replies.get('rpid'), display_id=video_id) + + if traverse_obj(comment_api_raw_data, ('data', 'cursor', 'is_end')): + break + def _real_extract(self, url): season_id, ep_id, aid = self._match_valid_url(url).group('season_id', 'ep_id', 'aid') video_id = ep_id or aid @@ -1086,7 +1339,8 @@ class BiliIntlIE(BiliIntlBaseIE): **self._extract_video_metadata(url, video_id, season_id), 'formats': self._get_formats(ep_id=ep_id, aid=aid), 'subtitles': self.extract_subtitles(ep_id=ep_id, aid=aid), - 'chapters': chapters + 'chapters': chapters, + '__post_extractor': self.extract_comments(video_id, ep_id) } diff --git a/plugins/youtube_download/yt_dlp/extractor/bitchute.py b/plugins/youtube_download/yt_dlp/extractor/bitchute.py index 10e7b0b..0805b8b 
100644 --- a/plugins/youtube_download/yt_dlp/extractor/bitchute.py +++ b/plugins/youtube_download/yt_dlp/extractor/bitchute.py @@ -2,9 +2,9 @@ import functools import re from .common import InfoExtractor +from ..networking import HEADRequest from ..utils import ( ExtractorError, - HEADRequest, OnDemandPagedList, clean_html, get_element_by_class, @@ -77,7 +77,10 @@ class BitChuteIE(InfoExtractor): def _check_format(self, video_url, video_id): urls = orderedSet( re.sub(r'(^https?://)(seed\d+)(?=\.bitchute\.com)', fr'\g<1>{host}', video_url) - for host in (r'\g<2>', 'seed150', 'seed151', 'seed152', 'seed153')) + for host in (r'\g<2>', 'seed122', 'seed125', 'seed126', 'seed128', + 'seed132', 'seed150', 'seed151', 'seed152', 'seed153', + 'seed167', 'seed171', 'seed177', 'seed305', 'seed307', + 'seedp29xb', 'zb10-7gsop1v78')) for url in urls: try: response = self._request_webpage( diff --git a/plugins/youtube_download/yt_dlp/extractor/blerp.py b/plugins/youtube_download/yt_dlp/extractor/blerp.py new file mode 100644 index 0000000..4631ad2 --- /dev/null +++ b/plugins/youtube_download/yt_dlp/extractor/blerp.py @@ -0,0 +1,167 @@ +import json + +from .common import InfoExtractor +from ..utils import strip_or_none, traverse_obj + + +class BlerpIE(InfoExtractor): + IE_NAME = 'blerp' + _VALID_URL = r'https?://(?:www\.)?blerp\.com/soundbites/(?P[0-9a-zA-Z]+)' + _TESTS = [{ + 'url': 'https://blerp.com/soundbites/6320fe8745636cb4dd677a5a', + 'info_dict': { + 'id': '6320fe8745636cb4dd677a5a', + 'title': 'Samsung Galaxy S8 Over the Horizon Ringtone 2016', + 'uploader': 'luminousaj', + 'uploader_id': '5fb81e51aa66ae000c395478', + 'ext': 'mp3', + 'tags': ['samsung', 'galaxy', 's8', 'over the horizon', '2016', 'ringtone'], + } + }, { + 'url': 'https://blerp.com/soundbites/5bc94ef4796001000498429f', + 'info_dict': { + 'id': '5bc94ef4796001000498429f', + 'title': 'Yee', + 'uploader': '179617322678353920', + 'uploader_id': '5ba99cf71386730004552c42', + 'ext': 'mp3', + 'tags': ['YEE', 
'YEET', 'wo ha haah catchy tune yee', 'yee'] + } + }] + + _GRAPHQL_OPERATIONNAME = "webBitePageGetBite" + _GRAPHQL_QUERY = ( + '''query webBitePageGetBite($_id: MongoID!) { + web { + biteById(_id: $_id) { + ...bitePageFrag + __typename + } + __typename + } + } + + fragment bitePageFrag on Bite { + _id + title + userKeywords + keywords + color + visibility + isPremium + owned + price + extraReview + isAudioExists + image { + filename + original { + url + __typename + } + __typename + } + userReactions { + _id + reactions + createdAt + __typename + } + topReactions + totalSaveCount + saved + blerpLibraryType + license + licenseMetaData + playCount + totalShareCount + totalFavoriteCount + totalAddedToBoardCount + userCategory + userAudioQuality + audioCreationState + transcription + userTranscription + description + createdAt + updatedAt + author + listingType + ownerObject { + _id + username + profileImage { + filename + original { + url + __typename + } + __typename + } + __typename + } + transcription + favorited + visibility + isCurated + sourceUrl + audienceRating + strictAudienceRating + ownerId + reportObject { + reportedContentStatus + __typename + } + giphy { + mp4 + gif + __typename + } + audio { + filename + original { + url + __typename + } + mp3 { + url + __typename + } + __typename + } + __typename + } + + ''') + + def _real_extract(self, url): + audio_id = self._match_id(url) + + data = { + 'operationName': self._GRAPHQL_OPERATIONNAME, + 'query': self._GRAPHQL_QUERY, + 'variables': { + '_id': audio_id + } + } + + headers = { + 'Content-Type': 'application/json' + } + + json_result = self._download_json('https://api.blerp.com/graphql', + audio_id, data=json.dumps(data).encode('utf-8'), headers=headers) + + bite_json = json_result['data']['web']['biteById'] + + info_dict = { + 'id': bite_json['_id'], + 'url': bite_json['audio']['mp3']['url'], + 'title': bite_json['title'], + 'uploader': traverse_obj(bite_json, ('ownerObject', 'username'), 
expected_type=strip_or_none), + 'uploader_id': traverse_obj(bite_json, ('ownerObject', '_id'), expected_type=strip_or_none), + 'ext': 'mp3', + 'tags': list(filter(None, map(strip_or_none, (traverse_obj(bite_json, 'userKeywords', expected_type=list) or []))) or None) + } + + return info_dict diff --git a/plugins/youtube_download/yt_dlp/extractor/booyah.py b/plugins/youtube_download/yt_dlp/extractor/booyah.py deleted file mode 100644 index 5c55f2c..0000000 --- a/plugins/youtube_download/yt_dlp/extractor/booyah.py +++ /dev/null @@ -1,86 +0,0 @@ -from .common import InfoExtractor -from ..utils import int_or_none, str_or_none, traverse_obj - - -class BooyahBaseIE(InfoExtractor): - _BOOYAH_SESSION_KEY = None - - def _real_initialize(self): - BooyahBaseIE._BOOYAH_SESSION_KEY = self._request_webpage( - 'https://booyah.live/api/v3/auths/sessions', None, data=b'').getheader('booyah-session-key') - - def _get_comments(self, video_id): - comment_json = self._download_json( - f'https://booyah.live/api/v3/playbacks/{video_id}/comments/tops', video_id, - headers={'Booyah-Session-Key': self._BOOYAH_SESSION_KEY}, fatal=False) or {} - - return [{ - 'id': comment.get('comment_id'), - 'author': comment.get('from_nickname'), - 'author_id': comment.get('from_uid'), - 'author_thumbnail': comment.get('from_thumbnail'), - 'text': comment.get('content'), - 'timestamp': comment.get('create_time'), - 'like_count': comment.get('like_cnt'), - } for comment in comment_json.get('comment_list') or ()] - - -class BooyahClipsIE(BooyahBaseIE): - _VALID_URL = r'https?://booyah.live/clips/(?P\d+)' - _TESTS = [{ - 'url': 'https://booyah.live/clips/13887261322952306617', - 'info_dict': { - 'id': '13887261322952306617', - 'ext': 'mp4', - 'view_count': int, - 'duration': 30, - 'channel_id': 90565760, - 'like_count': int, - 'title': 'Cayendo con estilo 😎', - 'uploader': '♡LɪꜱGΛ​MER​', - 'comment_count': int, - 'uploader_id': '90565760', - 'thumbnail': 
'https://resmambet-a.akamaihd.net/mambet-storage/Clip/90565760/90565760-27204374-fba0-409d-9d7b-63a48b5c0e75.jpg', - 'upload_date': '20220617', - 'timestamp': 1655490556, - 'modified_timestamp': 1655490556, - 'modified_date': '20220617', - } - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - json_data = self._download_json( - f'https://booyah.live/api/v3/playbacks/{video_id}', video_id, - headers={'Booyah-Session-key': self._BOOYAH_SESSION_KEY}) - - formats = [] - for video_data in json_data['playback']['endpoint_list']: - formats.extend(({ - 'url': video_data.get('stream_url'), - 'ext': 'mp4', - 'height': video_data.get('resolution'), - }, { - 'url': video_data.get('download_url'), - 'ext': 'mp4', - 'format_note': 'Watermarked', - 'height': video_data.get('resolution'), - 'preference': -10, - })) - - return { - 'id': video_id, - 'title': traverse_obj(json_data, ('playback', 'name')), - 'thumbnail': traverse_obj(json_data, ('playback', 'thumbnail_url')), - 'formats': formats, - 'view_count': traverse_obj(json_data, ('playback', 'views')), - 'like_count': traverse_obj(json_data, ('playback', 'likes')), - 'duration': traverse_obj(json_data, ('playback', 'duration')), - 'comment_count': traverse_obj(json_data, ('playback', 'comment_cnt')), - 'channel_id': traverse_obj(json_data, ('playback', 'channel_id')), - 'uploader': traverse_obj(json_data, ('user', 'nickname')), - 'uploader_id': str_or_none(traverse_obj(json_data, ('user', 'uid'))), - 'modified_timestamp': int_or_none(traverse_obj(json_data, ('playback', 'update_time_ms')), 1000), - 'timestamp': int_or_none(traverse_obj(json_data, ('playback', 'create_time_ms')), 1000), - '__post_extractor': self.extract_comments(video_id, self._get_comments(video_id)), - } diff --git a/plugins/youtube_download/yt_dlp/extractor/boxcast.py b/plugins/youtube_download/yt_dlp/extractor/boxcast.py new file mode 100644 index 0000000..51f9eb7 --- /dev/null +++ b/plugins/youtube_download/yt_dlp/extractor/boxcast.py 
@@ -0,0 +1,102 @@ +from .common import InfoExtractor +from ..utils import ( + js_to_json, + traverse_obj, + unified_timestamp +) + + +class BoxCastVideoIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?://boxcast\.tv/(?: + view-embed/| + channel/\w+\?(?:[^#]+&)?b=| + video-portal/(?:\w+/){2} + )(?P[\w-]+)''' + _EMBED_REGEX = [r']+src=["\'](?Phttps?://boxcast\.tv/view-embed/[\w-]+)'] + _TESTS = [{ + 'url': 'https://boxcast.tv/view-embed/in-the-midst-of-darkness-light-prevails-an-interdisciplinary-symposium-ozmq5eclj50ujl4bmpwx', + 'info_dict': { + 'id': 'da1eqqgkacngd5djlqld', + 'ext': 'mp4', + 'thumbnail': r're:https?://uploads\.boxcast\.com/(?:[\w+-]+/){3}.+\.png$', + 'title': 'In the Midst of Darkness Light Prevails: An Interdisciplinary Symposium', + 'release_timestamp': 1670686812, + 'release_date': '20221210', + 'uploader_id': 're8w0v8hohhvpqtbskpe', + 'uploader': 'Children\'s Health Defense', + } + }, { + 'url': 'https://boxcast.tv/video-portal/vctwevwntun3o0ikq7af/rvyblnn0fxbfjx5nwxhl/otbpltj2kzkveo2qz3ad', + 'info_dict': { + 'id': 'otbpltj2kzkveo2qz3ad', + 'ext': 'mp4', + 'uploader_id': 'vctwevwntun3o0ikq7af', + 'uploader': 'Legacy Christian Church', + 'title': 'The Quest | 1: Beginner\'s Bay | Jamie Schools', + 'thumbnail': r're:https?://uploads.boxcast.com/(?:[\w-]+/){3}.+\.jpg' + } + }, { + 'url': 'https://boxcast.tv/channel/z03fqwaeaby5lnaawox2?b=ssihlw5gvfij2by8tkev', + 'info_dict': { + 'id': 'ssihlw5gvfij2by8tkev', + 'ext': 'mp4', + 'thumbnail': r're:https?://uploads.boxcast.com/(?:[\w-]+/){3}.+\.jpg$', + 'release_date': '20230101', + 'uploader_id': 'ds25vaazhlu4ygcvffid', + 'release_timestamp': 1672543201, + 'uploader': 'Lighthouse Ministries International - Beltsville, Maryland', + 'description': 'md5:ac23e3d01b0b0be592e8f7fe0ec3a340', + 'title': 'New Year\'s Eve CROSSOVER Service at LHMI | December 31, 2022', + } + }] + _WEBPAGE_TESTS = [{ + 'url': 'https://childrenshealthdefense.eu/live-stream/', + 'info_dict': { + 'id': 'da1eqqgkacngd5djlqld', + 
'ext': 'mp4', + 'thumbnail': r're:https?://uploads\.boxcast\.com/(?:[\w+-]+/){3}.+\.png$', + 'title': 'In the Midst of Darkness Light Prevails: An Interdisciplinary Symposium', + 'release_timestamp': 1670686812, + 'release_date': '20221210', + 'uploader_id': 're8w0v8hohhvpqtbskpe', + 'uploader': 'Children\'s Health Defense', + } + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + webpage_json_data = self._search_json( + r'var\s*BOXCAST_PRELOAD\s*=', webpage, 'broadcast data', display_id, + transform_source=js_to_json, default={}) + + # Ref: https://support.boxcast.com/en/articles/4235158-build-a-custom-viewer-experience-with-boxcast-api + broadcast_json_data = ( + traverse_obj(webpage_json_data, ('broadcast', 'data')) + or self._download_json(f'https://api.boxcast.com/broadcasts/{display_id}', display_id)) + view_json_data = ( + traverse_obj(webpage_json_data, ('view', 'data')) + or self._download_json(f'https://api.boxcast.com/broadcasts/{display_id}/view', + display_id, fatal=False) or {}) + + formats, subtitles = [], {} + if view_json_data.get('status') == 'recorded': + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + view_json_data['playlist'], display_id) + + return { + 'id': str(broadcast_json_data['id']), + 'title': (broadcast_json_data.get('name') + or self._html_search_meta(['og:title', 'twitter:title'], webpage)), + 'description': (broadcast_json_data.get('description') + or self._html_search_meta(['og:description', 'twitter:description'], webpage) + or None), + 'thumbnail': (broadcast_json_data.get('preview') + or self._html_search_meta(['og:image', 'twitter:image'], webpage)), + 'formats': formats, + 'subtitles': subtitles, + 'release_timestamp': unified_timestamp(broadcast_json_data.get('streamed_at')), + 'uploader': broadcast_json_data.get('account_name'), + 'uploader_id': broadcast_json_data.get('account_id'), + } diff --git 
a/plugins/youtube_download/yt_dlp/extractor/brainpop.py b/plugins/youtube_download/yt_dlp/extractor/brainpop.py new file mode 100644 index 0000000..1200437 --- /dev/null +++ b/plugins/youtube_download/yt_dlp/extractor/brainpop.py @@ -0,0 +1,318 @@ +import json +import re + +from .common import InfoExtractor +from ..utils import ( + classproperty, + int_or_none, + traverse_obj, + urljoin +) + + +class BrainPOPBaseIE(InfoExtractor): + _NETRC_MACHINE = 'brainpop' + _ORIGIN = '' # So that _VALID_URL doesn't crash + _LOGIN_ERRORS = { + 1502: 'The username and password you entered did not match.', # LOGIN_FAILED + 1503: 'Payment method is expired.', # LOGIN_FAILED_ACCOUNT_NOT_ACTIVE + 1506: 'Your BrainPOP plan has expired.', # LOGIN_FAILED_ACCOUNT_EXPIRED + 1507: 'Terms not accepted.', # LOGIN_FAILED_TERMS_NOT_ACCEPTED + 1508: 'Account not activated.', # LOGIN_FAILED_SUBSCRIPTION_NOT_ACTIVE + 1512: 'The maximum number of devices permitted are logged in with your account right now.', # LOGIN_FAILED_LOGIN_LIMIT_REACHED + 1513: 'You are trying to access your account from outside of its allowed IP range.', # LOGIN_FAILED_INVALID_IP + 1514: 'Individual accounts are not included in your plan. Try again with your shared username and password.', # LOGIN_FAILED_MBP_DISABLED + 1515: 'Account not activated.', # LOGIN_FAILED_TEACHER_NOT_ACTIVE + 1523: 'That username and password won\'t work on this BrainPOP site.', # LOGIN_FAILED_NO_ACCESS + 1524: 'You\'ll need to join a class before you can login.', # LOGIN_FAILED_STUDENT_NO_PERIOD + 1526: 'Your account is locked. 
Reset your password, or ask a teacher or administrator for help.', # LOGIN_FAILED_ACCOUNT_LOCKED + } + + @classproperty + def _VALID_URL(cls): + root = re.escape(cls._ORIGIN).replace(r'https:', r'https?:').replace(r'www\.', r'(?:www\.)?') + return rf'{root}/(?P[^/]+/[^/]+/(?P[^/?#&]+))' + + def _assemble_formats(self, slug, format_id, display_id, token='', extra_fields={}): + formats = [] + formats = self._extract_m3u8_formats( + f'{urljoin(self._HLS_URL, slug)}.m3u8?{token}', + display_id, 'mp4', m3u8_id=f'{format_id}-hls', fatal=False) + formats.append({ + 'format_id': format_id, + 'url': f'{urljoin(self._VIDEO_URL, slug)}?{token}', + }) + for f in formats: + f.update(extra_fields) + return formats + + def _extract_adaptive_formats(self, data, token, display_id, key_format='%s', extra_fields={}): + formats = [] + additional_key_formats = { + '%s': {}, + 'ad_%s': { + 'format_note': 'Audio description', + 'source_preference': -2 + } + } + for additional_key_format, additional_key_fields in additional_key_formats.items(): + for key_quality, key_index in enumerate(('high', 'low')): + full_key_index = additional_key_format % (key_format % key_index) + if data.get(full_key_index): + formats.extend(self._assemble_formats(data[full_key_index], full_key_index, display_id, token, { + 'quality': -1 - key_quality, + **additional_key_fields, + **extra_fields + })) + return formats + + def _perform_login(self, username, password): + login_res = self._download_json( + 'https://api.brainpop.com/api/login', None, + data=json.dumps({'username': username, 'password': password}).encode(), + headers={ + 'Content-Type': 'application/json', + 'Referer': self._ORIGIN + }, note='Logging in', errnote='Unable to log in', expected_status=400) + status_code = int_or_none(login_res['status_code']) + if status_code != 1505: + self.report_warning( + f'Unable to login: {self._LOGIN_ERRORS.get(status_code) or login_res.get("message")}' + or f'Got status code {status_code}') + + +class 
BrainPOPIE(BrainPOPBaseIE): + _ORIGIN = 'https://www.brainpop.com' + _VIDEO_URL = 'https://svideos.brainpop.com' + _HLS_URL = 'https://hls.brainpop.com' + _CDN_URL = 'https://cdn.brainpop.com' + _TESTS = [{ + 'url': 'https://www.brainpop.com/health/conflictresolution/martinlutherkingjr/movie?ref=null', + 'md5': '3ead374233ae74c7f1b0029a01c972f0', + 'info_dict': { + 'id': '1f3259fa457292b4', + 'ext': 'mp4', + 'title': 'Martin Luther King, Jr.', + 'display_id': 'martinlutherkingjr', + 'description': 'md5:f403dbb2bf3ccc7cf4c59d9e43e3c349', + }, + }, { + 'url': 'https://www.brainpop.com/science/space/bigbang/', + 'md5': '9a1ff0e77444dd9e437354eb669c87ec', + 'info_dict': { + 'id': 'acae52cd48c99acf', + 'ext': 'mp4', + 'title': 'Big Bang', + 'display_id': 'bigbang', + 'description': 'md5:3e53b766b0f116f631b13f4cae185d38', + }, + 'skip': 'Requires login', + }] + + def _real_extract(self, url): + slug, display_id = self._match_valid_url(url).group('slug', 'id') + movie_data = self._download_json( + f'https://api.brainpop.com/api/content/published/bp/en/{slug}/movie?full=1', display_id, + 'Downloading movie data JSON', 'Unable to download movie data')['data'] + topic_data = traverse_obj(self._download_json( + f'https://api.brainpop.com/api/content/published/bp/en/{slug}?full=1', display_id, + 'Downloading topic data JSON', 'Unable to download topic data', fatal=False), + ('data', 'topic'), expected_type=dict) or movie_data['topic'] + + if not traverse_obj(movie_data, ('access', 'allow')): + reason = traverse_obj(movie_data, ('access', 'reason')) + if 'logged' in reason: + self.raise_login_required(reason, metadata_available=True) + else: + self.raise_no_formats(reason, video_id=display_id) + movie_feature = movie_data['feature'] + movie_feature_data = movie_feature['data'] + + formats, subtitles = [], {} + formats.extend(self._extract_adaptive_formats(movie_feature_data, movie_feature_data.get('token', ''), display_id, '%s_v2', { + 'language': movie_feature.get('language') 
or 'en', + 'language_preference': 10 + })) + for lang, localized_feature in traverse_obj(movie_feature, 'localization', default={}, expected_type=dict).items(): + formats.extend(self._extract_adaptive_formats(localized_feature, localized_feature.get('token', ''), display_id, '%s_v2', { + 'language': lang, + 'language_preference': -10 + })) + + # TODO: Do localization fields also have subtitles? + for name, url in movie_feature_data.items(): + lang = self._search_regex( + r'^subtitles_(?P\w+)$', name, 'subtitle metadata', default=None) + if lang and url: + subtitles.setdefault(lang, []).append({ + 'url': urljoin(self._CDN_URL, url) + }) + + return { + 'id': topic_data['topic_id'], + 'display_id': display_id, + 'title': topic_data.get('name'), + 'description': topic_data.get('synopsis'), + 'formats': formats, + 'subtitles': subtitles, + } + + +class BrainPOPLegacyBaseIE(BrainPOPBaseIE): + def _parse_js_topic_data(self, topic_data, display_id, token): + movie_data = topic_data['movies'] + # TODO: Are there non-burned subtitles? 
+ formats = self._extract_adaptive_formats(movie_data, token, display_id) + + return { + 'id': topic_data['EntryID'], + 'display_id': display_id, + 'title': topic_data.get('name'), + 'alt_title': topic_data.get('title'), + 'description': topic_data.get('synopsis'), + 'formats': formats, + } + + def _real_extract(self, url): + slug, display_id = self._match_valid_url(url).group('slug', 'id') + webpage = self._download_webpage(url, display_id) + topic_data = self._search_json( + r'var\s+content\s*=\s*', webpage, 'content data', + display_id, end_pattern=';')['category']['unit']['topic'] + token = self._search_regex(r'ec_token\s*:\s*[\'"]([^\'"]+)', webpage, 'video token') + return self._parse_js_topic_data(topic_data, display_id, token) + + +class BrainPOPJrIE(BrainPOPLegacyBaseIE): + _ORIGIN = 'https://jr.brainpop.com' + _VIDEO_URL = 'https://svideos-jr.brainpop.com' + _HLS_URL = 'https://hls-jr.brainpop.com' + _CDN_URL = 'https://cdn-jr.brainpop.com' + _TESTS = [{ + 'url': 'https://jr.brainpop.com/health/feelingsandsel/emotions/', + 'md5': '04e0561bb21770f305a0ce6cf0d869ab', + 'info_dict': { + 'id': '347', + 'ext': 'mp4', + 'title': 'Emotions', + 'display_id': 'emotions', + }, + }, { + 'url': 'https://jr.brainpop.com/science/habitats/arctichabitats/', + 'md5': 'b0ed063bbd1910df00220ee29340f5d6', + 'info_dict': { + 'id': '29', + 'ext': 'mp4', + 'title': 'Arctic Habitats', + 'display_id': 'arctichabitats', + }, + 'skip': 'Requires login', + }] + + +class BrainPOPELLIE(BrainPOPLegacyBaseIE): + _ORIGIN = 'https://ell.brainpop.com' + _VIDEO_URL = 'https://svideos-esl.brainpop.com' + _HLS_URL = 'https://hls-esl.brainpop.com' + _CDN_URL = 'https://cdn-esl.brainpop.com' + _TESTS = [{ + 'url': 'https://ell.brainpop.com/level1/unit1/lesson1/', + 'md5': 'a2012700cfb774acb7ad2e8834eed0d0', + 'info_dict': { + 'id': '1', + 'ext': 'mp4', + 'title': 'Lesson 1', + 'display_id': 'lesson1', + 'alt_title': 'Personal Pronouns', + }, + }, { + 'url': 
'https://ell.brainpop.com/level3/unit6/lesson5/', + 'md5': 'be19c8292c87b24aacfb5fda2f3f8363', + 'info_dict': { + 'id': '101', + 'ext': 'mp4', + 'title': 'Lesson 5', + 'display_id': 'lesson5', + 'alt_title': 'Review: Unit 6', + }, + 'skip': 'Requires login', + }] + + +class BrainPOPEspIE(BrainPOPLegacyBaseIE): + IE_DESC = 'BrainPOP Español' + _ORIGIN = 'https://esp.brainpop.com' + _VIDEO_URL = 'https://svideos.brainpop.com' + _HLS_URL = 'https://hls.brainpop.com' + _CDN_URL = 'https://cdn.brainpop.com/mx' + _TESTS = [{ + 'url': 'https://esp.brainpop.com/ciencia/la_diversidad_de_la_vida/ecosistemas/', + 'md5': 'cb3f062db2b3c5240ddfcfde7108f8c9', + 'info_dict': { + 'id': '3893', + 'ext': 'mp4', + 'title': 'Ecosistemas', + 'display_id': 'ecosistemas', + 'description': 'md5:80fc55b07e241f8c8f2aa8d74deaf3c3', + }, + }, { + 'url': 'https://esp.brainpop.com/espanol/la_escritura/emily_dickinson/', + 'md5': '98c1b9559e0e33777209c425cda7dac4', + 'info_dict': { + 'id': '7146', + 'ext': 'mp4', + 'title': 'Emily Dickinson', + 'display_id': 'emily_dickinson', + 'description': 'md5:2795ad87b1d239c9711c1e92ab5a978b', + }, + 'skip': 'Requires login', + }] + + +class BrainPOPFrIE(BrainPOPLegacyBaseIE): + IE_DESC = 'BrainPOP Français' + _ORIGIN = 'https://fr.brainpop.com' + _VIDEO_URL = 'https://svideos.brainpop.com' + _HLS_URL = 'https://hls.brainpop.com' + _CDN_URL = 'https://cdn.brainpop.com/fr' + _TESTS = [{ + 'url': 'https://fr.brainpop.com/sciencesdelaterre/energie/sourcesdenergie/', + 'md5': '97e7f48af8af93f8a2be11709f239371', + 'info_dict': { + 'id': '1651', + 'ext': 'mp4', + 'title': 'Sources d\'énergie', + 'display_id': 'sourcesdenergie', + 'description': 'md5:7eece350f019a21ef9f64d4088b2d857', + }, + }, { + 'url': 'https://fr.brainpop.com/francais/ecrire/plagiat/', + 'md5': '0cf2b4f89804d0dd4a360a51310d445a', + 'info_dict': { + 'id': '5803', + 'ext': 'mp4', + 'title': 'Plagiat', + 'display_id': 'plagiat', + 'description': 'md5:4496d87127ace28e8b1eda116e77cd2b', + }, + 
'skip': 'Requires login', + }] + + +class BrainPOPIlIE(BrainPOPLegacyBaseIE): + IE_DESC = 'BrainPOP Hebrew' + _ORIGIN = 'https://il.brainpop.com' + _VIDEO_URL = 'https://svideos.brainpop.com' + _HLS_URL = 'https://hls.brainpop.com' + _CDN_URL = 'https://cdn.brainpop.com/he' + _TESTS = [{ + 'url': 'https://il.brainpop.com/category_9/subcategory_150/subjects_3782/', + 'md5': '9e4ea9dc60ecd385a6e5ca12ccf31641', + 'info_dict': { + 'id': '3782', + 'ext': 'mp4', + 'title': 'md5:e993632fcda0545d9205602ec314ad67', + 'display_id': 'subjects_3782', + 'description': 'md5:4cc084a8012beb01f037724423a4d4ed', + }, + }] diff --git a/plugins/youtube_download/yt_dlp/extractor/bravotv.py b/plugins/youtube_download/yt_dlp/extractor/bravotv.py index d489584..419fe8c 100644 --- a/plugins/youtube_download/yt_dlp/extractor/bravotv.py +++ b/plugins/youtube_download/yt_dlp/extractor/bravotv.py @@ -1,117 +1,189 @@ -import re - from .adobepass import AdobePassIE +from ..networking import HEADRequest from ..utils import ( - smuggle_url, - update_url_query, - int_or_none, + extract_attributes, float_or_none, - try_get, - dict_get, + get_element_html_by_class, + int_or_none, + merge_dicts, + parse_age_limit, + remove_end, + str_or_none, + traverse_obj, + unescapeHTML, + unified_timestamp, + update_url_query, + url_or_none, ) class BravoTVIE(AdobePassIE): - _VALID_URL = r'https?://(?:www\.)?(?Pbravotv|oxygen)\.com/(?:[^/]+/)+(?P[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?(?Pbravotv|oxygen)\.com/(?:[^/]+/)+(?P[^/?#]+)' _TESTS = [{ 'url': 'https://www.bravotv.com/top-chef/season-16/episode-15/videos/the-top-chef-season-16-winner-is', - 'md5': 'e34684cfea2a96cd2ee1ef3a60909de9', 'info_dict': { - 'id': 'epL0pmK1kQlT', + 'id': '3923059', 'ext': 'mp4', 'title': 'The Top Chef Season 16 Winner Is...', 'description': 'Find out who takes the title of Top Chef!', - 'uploader': 'NBCU-BRAV', 'upload_date': '20190314', 'timestamp': 1552591860, 'season_number': 16, 'episode_number': 15, 'series': 'Top Chef', 
'episode': 'The Top Chef Season 16 Winner Is...', - 'duration': 190.0, - } + 'duration': 190.357, + 'season': 'Season 16', + 'thumbnail': r're:^https://.+\.jpg', + }, + 'params': {'skip_download': 'm3u8'}, }, { - 'url': 'http://www.bravotv.com/below-deck/season-3/ep-14-reunion-part-1', - 'only_matching': True, + 'url': 'https://www.bravotv.com/top-chef/season-20/episode-1/london-calling', + 'info_dict': { + 'id': '9000234570', + 'ext': 'mp4', + 'title': 'London Calling', + 'description': 'md5:5af95a8cbac1856bd10e7562f86bb759', + 'upload_date': '20230310', + 'timestamp': 1678410000, + 'season_number': 20, + 'episode_number': 1, + 'series': 'Top Chef', + 'episode': 'London Calling', + 'duration': 3266.03, + 'season': 'Season 20', + 'chapters': 'count:7', + 'thumbnail': r're:^https://.+\.jpg', + 'age_limit': 14, + }, + 'params': {'skip_download': 'm3u8'}, + 'skip': 'This video requires AdobePass MSO credentials', + }, { + 'url': 'https://www.oxygen.com/in-ice-cold-blood/season-1/closing-night', + 'info_dict': { + 'id': '3692045', + 'ext': 'mp4', + 'title': 'Closing Night', + 'description': 'md5:3170065c5c2f19548d72a4cbc254af63', + 'upload_date': '20180401', + 'timestamp': 1522623600, + 'season_number': 1, + 'episode_number': 1, + 'series': 'In Ice Cold Blood', + 'episode': 'Closing Night', + 'duration': 2629.051, + 'season': 'Season 1', + 'chapters': 'count:6', + 'thumbnail': r're:^https://.+\.jpg', + 'age_limit': 14, + }, + 'params': {'skip_download': 'm3u8'}, + 'skip': 'This video requires AdobePass MSO credentials', }, { 'url': 'https://www.oxygen.com/in-ice-cold-blood/season-2/episode-16/videos/handling-the-horwitz-house-after-the-murder-season-2', + 'info_dict': { + 'id': '3974019', + 'ext': 'mp4', + 'title': '\'Handling The Horwitz House After The Murder (Season 2, Episode 16)', + 'description': 'md5:f9d638dd6946a1c1c0533a9c6100eae5', + 'upload_date': '20190617', + 'timestamp': 1560790800, + 'season_number': 2, + 'episode_number': 16, + 'series': 'In Ice Cold 
Blood', + 'episode': '\'Handling The Horwitz House After The Murder (Season 2, Episode 16)', + 'duration': 68.235, + 'season': 'Season 2', + 'thumbnail': r're:^https://.+\.jpg', + 'age_limit': 14, + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.bravotv.com/below-deck/season-3/ep-14-reunion-part-1', 'only_matching': True, }] def _real_extract(self, url): - site, display_id = self._match_valid_url(url).groups() + site, display_id = self._match_valid_url(url).group('site', 'id') webpage = self._download_webpage(url, display_id) - settings = self._parse_json(self._search_regex( - r']+data-drupal-selector="drupal-settings-json"[^>]*>({.+?})', webpage, 'drupal settings'), - display_id) - info = {} + settings = self._search_json( + r']+data-drupal-selector="drupal-settings-json"[^>]*>', webpage, 'settings', display_id) + tve = extract_attributes(get_element_html_by_class('tve-video-deck-app', webpage) or '') query = { - 'mbr': 'true', + 'manifest': 'm3u', + 'formats': 'm3u,mpeg4', } - account_pid, release_pid = [None] * 2 - tve = settings.get('ls_tve') + if tve: - query['manifest'] = 'm3u' - mobj = re.search(r'<[^>]+id="pdk-player"[^>]+data-url=["\']?(?:https?:)?//player\.theplatform\.com/p/([^/]+)/(?:[^/]+/)*select/([^?#&"\']+)', webpage) - if mobj: - account_pid, tp_path = mobj.groups() - release_pid = tp_path.strip('/').split('/')[-1] - else: - account_pid = 'HNK2IC' - tp_path = release_pid = tve['release_pid'] - if tve.get('entitlement') == 'auth': - adobe_pass = settings.get('tve_adobe_auth', {}) - if site == 'bravotv': - site = 'bravo' + account_pid = tve.get('data-mpx-media-account-pid') or 'HNK2IC' + account_id = tve['data-mpx-media-account-id'] + metadata = self._parse_json( + tve.get('data-normalized-video', ''), display_id, fatal=False, transform_source=unescapeHTML) + video_id = tve.get('data-guid') or metadata['guid'] + if tve.get('data-entitlement') == 'auth': + auth = traverse_obj(settings, ('tve_adobe_auth', {dict})) or {} + site = 
remove_end(site, 'tv') + release_pid = tve['data-release-pid'] resource = self._get_mvpd_resource( - adobe_pass.get('adobePassResourceId') or site, - tve['title'], release_pid, tve.get('rating')) - query['auth'] = self._extract_mvpd_auth( - url, release_pid, - adobe_pass.get('adobePassRequestorId') or site, resource) + tve.get('data-adobe-pass-resource-id') or auth.get('adobePassResourceId') or site, + tve['data-title'], release_pid, tve.get('data-rating')) + query.update({ + 'switch': 'HLSServiceSecure', + 'auth': self._extract_mvpd_auth( + url, release_pid, auth.get('adobePassRequestorId') or site, resource), + }) + else: - shared_playlist = settings['ls_playlist'] - account_pid = shared_playlist['account_pid'] - metadata = shared_playlist['video_metadata'][shared_playlist['default_clip']] - tp_path = release_pid = metadata.get('release_pid') - if not release_pid: - release_pid = metadata['guid'] - tp_path = 'media/guid/2140479951/' + release_pid - info.update({ - 'title': metadata['title'], - 'description': metadata.get('description'), - 'season_number': int_or_none(metadata.get('season_num')), - 'episode_number': int_or_none(metadata.get('episode_num')), - }) - query['switch'] = 'progressive' - - tp_url = 'http://link.theplatform.com/s/%s/%s' % (account_pid, tp_path) + ls_playlist = traverse_obj(settings, ('ls_playlist', ..., {dict}), get_all=False) or {} + account_pid = ls_playlist.get('mpxMediaAccountPid') or 'PHSl-B' + account_id = ls_playlist['mpxMediaAccountId'] + video_id = ls_playlist['defaultGuid'] + metadata = traverse_obj( + ls_playlist, ('videos', lambda _, v: v['guid'] == video_id, {dict}), get_all=False) + tp_url = f'https://link.theplatform.com/s/{account_pid}/media/guid/{account_id}/{video_id}' tp_metadata = self._download_json( - update_url_query(tp_url, {'format': 'preview'}), - display_id, fatal=False) - if tp_metadata: - info.update({ - 'title': tp_metadata.get('title'), - 'description': tp_metadata.get('description'), - 'duration': 
float_or_none(tp_metadata.get('duration'), 1000), - 'season_number': int_or_none( - dict_get(tp_metadata, ('pl1$seasonNumber', 'nbcu$seasonNumber'))), - 'episode_number': int_or_none( - dict_get(tp_metadata, ('pl1$episodeNumber', 'nbcu$episodeNumber'))), - # For some reason the series is sometimes wrapped into a single element array. - 'series': try_get( - dict_get(tp_metadata, ('pl1$show', 'nbcu$show')), - lambda x: x[0] if isinstance(x, list) else x, - expected_type=str), - 'episode': dict_get( - tp_metadata, ('pl1$episodeName', 'nbcu$episodeName', 'title')), - }) + update_url_query(tp_url, {'format': 'preview'}), video_id, fatal=False) - info.update({ - '_type': 'url_transparent', - 'id': release_pid, - 'url': smuggle_url(update_url_query(tp_url, query), {'force_smil_url': True}), - 'ie_key': 'ThePlatform', - }) - return info + seconds_or_none = lambda x: float_or_none(x, 1000) + chapters = traverse_obj(tp_metadata, ('chapters', ..., { + 'start_time': ('startTime', {seconds_or_none}), + 'end_time': ('endTime', {seconds_or_none}), + })) + # prune pointless single chapters that span the entire duration from short videos + if len(chapters) == 1 and not traverse_obj(chapters, (0, 'end_time')): + chapters = None + + m3u8_url = self._request_webpage(HEADRequest( + update_url_query(f'{tp_url}/stream.m3u8', query)), video_id, 'Checking m3u8 URL').url + if 'mpeg_cenc' in m3u8_url: + self.report_drm(video_id) + formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, 'mp4', m3u8_id='hls') + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + 'chapters': chapters, + **merge_dicts(traverse_obj(tp_metadata, { + 'title': 'title', + 'description': 'description', + 'duration': ('duration', {seconds_or_none}), + 'timestamp': ('pubDate', {seconds_or_none}), + 'season_number': (('pl1$seasonNumber', 'nbcu$seasonNumber'), {int_or_none}), + 'episode_number': (('pl1$episodeNumber', 'nbcu$episodeNumber'), {int_or_none}), + 'series': 
(('pl1$show', 'nbcu$show'), (None, ...), {str}), + 'episode': (('title', 'pl1$episodeNumber', 'nbcu$episodeNumber'), {str_or_none}), + 'age_limit': ('ratings', ..., 'rating', {parse_age_limit}), + }, get_all=False), traverse_obj(metadata, { + 'title': 'title', + 'description': 'description', + 'duration': ('durationInSeconds', {int_or_none}), + 'timestamp': ('airDate', {unified_timestamp}), + 'thumbnail': ('thumbnailUrl', {url_or_none}), + 'season_number': ('seasonNumber', {int_or_none}), + 'episode_number': ('episodeNumber', {int_or_none}), + 'episode': 'episodeTitle', + 'series': 'show', + })) + } diff --git a/plugins/youtube_download/yt_dlp/extractor/brightcove.py b/plugins/youtube_download/yt_dlp/extractor/brightcove.py index 2b7ddca..61b1841 100644 --- a/plugins/youtube_download/yt_dlp/extractor/brightcove.py +++ b/plugins/youtube_download/yt_dlp/extractor/brightcove.py @@ -7,10 +7,10 @@ from .adobepass import AdobePassIE from .common import InfoExtractor from ..compat import ( compat_etree_fromstring, - compat_HTTPError, compat_parse_qs, compat_urlparse, ) +from ..networking.exceptions import HTTPError from ..utils import ( clean_html, dict_get, @@ -575,6 +575,7 @@ class BrightcoveNewBaseIE(AdobePassIE): self.raise_no_formats( error.get('message') or error.get('error_subcode') or error['error_code'], expected=True) + headers.pop('Authorization', None) # or else http formats will give error 400 for f in formats: f.setdefault('http_headers', {}).update(headers) @@ -895,8 +896,9 @@ class BrightcoveNewIE(BrightcoveNewBaseIE): store_pk(policy_key) return policy_key - api_url = 'https://edge.api.brightcove.com/playback/v1/accounts/%s/%ss/%s' % (account_id, content_type, video_id) - headers = {} + token = smuggled_data.get('token') + api_url = f'https://{"edge-auth" if token else "edge"}.api.brightcove.com/playback/v1/accounts/{account_id}/{content_type}s/{video_id}' + headers = {'Authorization': f'Bearer {token}'} if token else {} referrer = 
smuggled_data.get('referrer') # XXX: notice the spelling/case of the key if referrer: headers.update({ @@ -913,8 +915,8 @@ class BrightcoveNewIE(BrightcoveNewBaseIE): json_data = self._download_json(api_url, video_id, headers=headers) break except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 403): - json_data = self._parse_json(e.cause.read().decode(), video_id)[0] + if isinstance(e.cause, HTTPError) and e.cause.status in (401, 403): + json_data = self._parse_json(e.cause.response.read().decode(), video_id)[0] message = json_data.get('message') or json_data['error_code'] if json_data.get('error_subcode') == 'CLIENT_GEO': self.raise_geo_restricted(msg=message) diff --git a/plugins/youtube_download/yt_dlp/extractor/callin.py b/plugins/youtube_download/yt_dlp/extractor/callin.py index e966876..c77179c 100644 --- a/plugins/youtube_download/yt_dlp/extractor/callin.py +++ b/plugins/youtube_download/yt_dlp/extractor/callin.py @@ -1,9 +1,5 @@ from .common import InfoExtractor -from ..utils import ( - traverse_obj, - float_or_none, - int_or_none -) +from ..utils import float_or_none, int_or_none, make_archive_id, traverse_obj class CallinIE(InfoExtractor): @@ -35,6 +31,54 @@ class CallinIE(InfoExtractor): 'episode_number': 1, 'episode_id': '218b979630a35ead12c6fd096f2996c56c37e4d0dc1f6dc0feada32dcf7b31cd' } + }, { + 'url': 'https://www.callin.com/episode/fcc-commissioner-brendan-carr-on-elons-PrumRdSQJW', + 'md5': '14ede27ee2c957b7e4db93140fc0745c', + 'info_dict': { + 'id': 'c3dab47f237bf953d180d3f243477a84302798be0e0b29bc9ade6d60a69f04f5', + 'ext': 'ts', + 'title': 'FCC Commissioner Brendan Carr on Elon’s Starlink', + 'description': 'Or, why the government doesn’t like SpaceX', + 'channel': 'The Pull Request', + 'channel_url': 'https://callin.com/show/the-pull-request-ucnDJmEKAa', + 'duration': 3182.472, + 'series_id': '7e9c23156e4aecfdcaef46bfb2ed7ca268509622ec006c0f0f25d90e34496638', + 'uploader_url': 
'http://thepullrequest.com', + 'upload_date': '20220902', + 'episode': 'FCC Commissioner Brendan Carr on Elon’s Starlink', + 'display_id': 'fcc-commissioner-brendan-carr-on-elons-PrumRdSQJW', + 'series': 'The Pull Request', + 'channel_id': '7e9c23156e4aecfdcaef46bfb2ed7ca268509622ec006c0f0f25d90e34496638', + 'view_count': int, + 'uploader': 'Antonio García Martínez', + 'thumbnail': 'https://d1z76fhpoqkd01.cloudfront.net/shows/legacy/1ade9142625344045dc17cf523469ced1d93610762f4c886d06aa190a2f979e8.png', + 'episode_id': 'c3dab47f237bf953d180d3f243477a84302798be0e0b29bc9ade6d60a69f04f5', + 'timestamp': 1662100688.005, + } + }, { + 'url': 'https://www.callin.com/episode/episode-81-elites-melt-down-over-student-debt-lzxMidUnjA', + 'md5': '16f704ddbf82a27e3930533b12062f07', + 'info_dict': { + 'id': '8d06f869798f93a7814e380bceabea72d501417e620180416ff6bd510596e83c', + 'ext': 'ts', + 'title': 'Episode 81- Elites MELT DOWN over Student Debt Victory? Rumble in NYC?', + 'description': 'Let’s talk todays episode about the primary election shake up in NYC and the elites melting down over student debt cancelation.', + 'channel': 'The DEBRIEF With Briahna Joy Gray', + 'channel_url': 'https://callin.com/show/the-debrief-with-briahna-joy-gray-siiFDzGegm', + 'duration': 10043.16, + 'series_id': '61cea58444465fd26674069703bd8322993bc9e5b4f1a6d0872690554a046ff7', + 'uploader_url': 'http://patreon.com/badfaithpodcast', + 'upload_date': '20220826', + 'episode': 'Episode 81- Elites MELT DOWN over Student Debt Victory? 
Rumble in NYC?', + 'display_id': 'episode-', + 'series': 'The DEBRIEF With Briahna Joy Gray', + 'channel_id': '61cea58444465fd26674069703bd8322993bc9e5b4f1a6d0872690554a046ff7', + 'view_count': int, + 'uploader': 'Briahna Gray', + 'thumbnail': 'https://d1z76fhpoqkd01.cloudfront.net/shows/legacy/461ea0d86172cb6aff7d6c80fd49259cf5e64bdf737a4650f8bc24cf392ca218.png', + 'episode_id': '8d06f869798f93a7814e380bceabea72d501417e620180416ff6bd510596e83c', + 'timestamp': 1661476708.282, + } }] def try_get_user_name(self, d): @@ -86,6 +130,7 @@ class CallinIE(InfoExtractor): return { 'id': id, + '_old_archive_ids': [make_archive_id(self, display_id.rsplit('-', 1)[-1])], 'display_id': display_id, 'title': title, 'formats': formats, diff --git a/plugins/youtube_download/yt_dlp/extractor/camfm.py b/plugins/youtube_download/yt_dlp/extractor/camfm.py new file mode 100644 index 0000000..a9850f4 --- /dev/null +++ b/plugins/youtube_download/yt_dlp/extractor/camfm.py @@ -0,0 +1,85 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + clean_html, + get_element_by_class, + get_elements_by_class, + join_nonempty, + traverse_obj, + unified_timestamp, + urljoin, +) + + +class CamFMShowIE(InfoExtractor): + _VALID_URL = r'https://(?:www\.)?camfm\.co\.uk/shows/(?P[^/]+)' + _TESTS = [{ + 'playlist_mincount': 5, + 'url': 'https://camfm.co.uk/shows/soul-mining/', + 'info_dict': { + 'id': 'soul-mining', + 'thumbnail': 'md5:6a873091f92c936f23bdcce80f75e66a', + 'title': 'Soul Mining', + 'description': 'Telling the stories of jazz, funk and soul from all corners of the world.', + }, + }] + + def _real_extract(self, url): + show_id = self._match_id(url) + page = self._download_webpage(url, show_id) + + return { + '_type': 'playlist', + 'id': show_id, + 'entries': [self.url_result(urljoin('https://camfm.co.uk', i), CamFMEpisodeIE) + for i in re.findall(r"javascript:popup\('(/player/[^']+)', 'listen'", page)], + 'thumbnail': urljoin('https://camfm.co.uk', self._search_regex( + 
r']+class="thumb-expand"[^>]+src="([^"]+)"', page, 'thumbnail', fatal=False)), + 'title': self._html_search_regex('

([^<]+)

', page, 'title', fatal=False), + 'description': clean_html(get_element_by_class('small-12 medium-8 cell', page)) + } + + +class CamFMEpisodeIE(InfoExtractor): + _VALID_URL = r'https://(?:www\.)?camfm\.co\.uk/player/(?P[^/]+)' + _TESTS = [{ + 'url': 'https://camfm.co.uk/player/43336', + 'skip': 'Episode will expire - don\'t actually know when, but it will go eventually', + 'info_dict': { + 'id': '43336', + 'title': 'AITAA: Am I the Agony Aunt? - 19:00 Tue 16/05/2023', + 'ext': 'mp3', + 'upload_date': '20230516', + 'description': 'md5:f165144f94927c0f1bfa2ee6e6ab7bbf', + 'timestamp': 1684263600, + 'series': 'AITAA: Am I the Agony Aunt?', + 'thumbnail': 'md5:5980a831360d0744c3764551be3d09c1', + 'categories': ['Entertainment'], + } + }] + + def _real_extract(self, url): + episode_id = self._match_id(url) + page = self._download_webpage(url, episode_id) + audios = self._parse_html5_media_entries('https://audio.camfm.co.uk', page, episode_id) + + caption = get_element_by_class('caption', page) + series = clean_html(re.sub(r'', '', caption)) + + card_section = get_element_by_class('card-section', page) + date = self._html_search_regex('>Aired at ([^<]+)<', card_section, 'air date', fatal=False) + + return { + 'id': episode_id, + 'title': join_nonempty(series, date, delim=' - '), + 'formats': traverse_obj(audios, (..., 'formats', ...)), + 'timestamp': unified_timestamp(date), # XXX: Does not account for UK's daylight savings + 'series': series, + 'description': clean_html(re.sub(r'[^<]+]+/>', '', card_section)), + 'thumbnail': urljoin('https://camfm.co.uk', self._search_regex( + r']+class="cover-art"[^>]+style="[^"]+url\(\'([^\']+)', + page, 'thumbnail', fatal=False)), + 'categories': get_elements_by_class('label', caption), + 'was_live': True, + } diff --git a/plugins/youtube_download/yt_dlp/extractor/cammodels.py b/plugins/youtube_download/yt_dlp/extractor/cammodels.py index 0509057..135b315 100644 --- a/plugins/youtube_download/yt_dlp/extractor/cammodels.py +++ 
b/plugins/youtube_download/yt_dlp/extractor/cammodels.py @@ -1,9 +1,5 @@ from .common import InfoExtractor -from ..utils import ( - ExtractorError, - int_or_none, - url_or_none, -) +from ..utils import int_or_none, url_or_none class CamModelsIE(InfoExtractor): @@ -17,32 +13,11 @@ class CamModelsIE(InfoExtractor): def _real_extract(self, url): user_id = self._match_id(url) - webpage = self._download_webpage( - url, user_id, headers=self.geo_verification_headers()) - - manifest_root = self._html_search_regex( - r'manifestUrlRoot=([^&\']+)', webpage, 'manifest', default=None) - - if not manifest_root: - ERRORS = ( - ("I'm offline, but let's stay connected", 'This user is currently offline'), - ('in a private show', 'This user is in a private show'), - ('is currently performing LIVE', 'This model is currently performing live'), - ) - for pattern, message in ERRORS: - if pattern in webpage: - error = message - expected = True - break - else: - error = 'Unable to find manifest URL root' - expected = False - raise ExtractorError(error, expected=expected) - manifest = self._download_json( - '%s%s.json' % (manifest_root, user_id), user_id) + 'https://manifest-server.naiadsystems.com/live/s:%s.json' % user_id, user_id) formats = [] + thumbnails = [] for format_id, format_dict in manifest['formats'].items(): if not isinstance(format_dict, dict): continue @@ -82,12 +57,20 @@ class CamModelsIE(InfoExtractor): 'quality': -10, }) else: + if format_id == 'jpeg': + thumbnails.append({ + 'url': f['url'], + 'width': f['width'], + 'height': f['height'], + 'format_id': f['format_id'], + }) continue formats.append(f) return { 'id': user_id, 'title': user_id, + 'thumbnails': thumbnails, 'is_live': True, 'formats': formats, 'age_limit': 18 diff --git a/plugins/youtube_download/yt_dlp/extractor/canalplus.py b/plugins/youtube_download/yt_dlp/extractor/canalplus.py index b7e2f9d..3ff5c3f 100644 --- a/plugins/youtube_download/yt_dlp/extractor/canalplus.py +++ 
b/plugins/youtube_download/yt_dlp/extractor/canalplus.py @@ -64,7 +64,7 @@ class CanalplusIE(InfoExtractor): # response = self._request_webpage( # HEADRequest(fmt_url), video_id, # 'Checking if the video is georestricted') - # if '/blocage' in response.geturl(): + # if '/blocage' in response.url: # raise ExtractorError( # 'The video is not available in your country', # expected=True) diff --git a/plugins/youtube_download/yt_dlp/extractor/canvas.py b/plugins/youtube_download/yt_dlp/extractor/canvas.py deleted file mode 100644 index ae6e03a..0000000 --- a/plugins/youtube_download/yt_dlp/extractor/canvas.py +++ /dev/null @@ -1,383 +0,0 @@ -import json - - -from .common import InfoExtractor -from .gigya import GigyaBaseIE -from ..compat import compat_HTTPError -from ..utils import ( - ExtractorError, - clean_html, - extract_attributes, - float_or_none, - get_element_by_class, - int_or_none, - merge_dicts, - str_or_none, - strip_or_none, - url_or_none, - urlencode_postdata -) - - -class CanvasIE(InfoExtractor): - _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?Pcanvas|een|ketnet|vrt(?:video|nieuws)|sporza|dako)/assets/(?P[^/?#&]+)' - _TESTS = [{ - 'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', - 'md5': '37b2b7bb9b3dcaa05b67058dc3a714a9', - 'info_dict': { - 'id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', - 'display_id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', - 'ext': 'mp4', - 'title': 'Nachtwacht: De Greystook', - 'description': 'Nachtwacht: De Greystook', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 1468.02, - }, - 'expected_warnings': ['is not a supported codec'], - }, { - 'url': 'https://mediazone.vrt.be/api/v1/canvas/assets/mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e', - 'only_matching': True, - }] - _GEO_BYPASS = False - _HLS_ENTRY_PROTOCOLS_MAP = { - 'HLS': 'm3u8_native', - 'HLS_AES': 'm3u8_native', - } - _REST_API_BASE = 
'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/v2' - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - site_id, video_id = mobj.group('site_id'), mobj.group('id') - - data = None - if site_id != 'vrtvideo': - # Old API endpoint, serves more formats but may fail for some videos - data = self._download_json( - 'https://mediazone.vrt.be/api/v1/%s/assets/%s' - % (site_id, video_id), video_id, 'Downloading asset JSON', - 'Unable to download asset JSON', fatal=False) - - # New API endpoint - if not data: - vrtnutoken = self._download_json('https://token.vrt.be/refreshtoken', - video_id, note='refreshtoken: Retrieve vrtnutoken', - errnote='refreshtoken failed')['vrtnutoken'] - headers = self.geo_verification_headers() - headers.update({'Content-Type': 'application/json; charset=utf-8'}) - vrtPlayerToken = self._download_json( - '%s/tokens' % self._REST_API_BASE, video_id, - 'Downloading token', headers=headers, data=json.dumps({ - 'identityToken': vrtnutoken - }).encode('utf-8'))['vrtPlayerToken'] - data = self._download_json( - '%s/videos/%s' % (self._REST_API_BASE, video_id), - video_id, 'Downloading video JSON', query={ - 'vrtPlayerToken': vrtPlayerToken, - 'client': 'null', - }, expected_status=400) - if 'title' not in data: - code = data.get('code') - if code == 'AUTHENTICATION_REQUIRED': - self.raise_login_required() - elif code == 'INVALID_LOCATION': - self.raise_geo_restricted(countries=['BE']) - raise ExtractorError(data.get('message') or code, expected=True) - - # Note: The title may be an empty string - title = data['title'] or f'{site_id} {video_id}' - description = data.get('description') - - formats = [] - subtitles = {} - for target in data['targetUrls']: - format_url, format_type = url_or_none(target.get('url')), str_or_none(target.get('type')) - if not format_url or not format_type: - continue - format_type = format_type.upper() - if format_type in self._HLS_ENTRY_PROTOCOLS_MAP: - fmts, subs = 
self._extract_m3u8_formats_and_subtitles( - format_url, video_id, 'mp4', self._HLS_ENTRY_PROTOCOLS_MAP[format_type], - m3u8_id=format_type, fatal=False) - formats.extend(fmts) - subtitles = self._merge_subtitles(subtitles, subs) - elif format_type == 'HDS': - formats.extend(self._extract_f4m_formats( - format_url, video_id, f4m_id=format_type, fatal=False)) - elif format_type == 'MPEG_DASH': - fmts, subs = self._extract_mpd_formats_and_subtitles( - format_url, video_id, mpd_id=format_type, fatal=False) - formats.extend(fmts) - subtitles = self._merge_subtitles(subtitles, subs) - elif format_type == 'HSS': - fmts, subs = self._extract_ism_formats_and_subtitles( - format_url, video_id, ism_id='mss', fatal=False) - formats.extend(fmts) - subtitles = self._merge_subtitles(subtitles, subs) - else: - formats.append({ - 'format_id': format_type, - 'url': format_url, - }) - - subtitle_urls = data.get('subtitleUrls') - if isinstance(subtitle_urls, list): - for subtitle in subtitle_urls: - subtitle_url = subtitle.get('url') - if subtitle_url and subtitle.get('type') == 'CLOSED': - subtitles.setdefault('nl', []).append({'url': subtitle_url}) - - return { - 'id': video_id, - 'display_id': video_id, - 'title': title, - 'description': description, - 'formats': formats, - 'duration': float_or_none(data.get('duration'), 1000), - 'thumbnail': data.get('posterImageUrl'), - 'subtitles': subtitles, - } - - -class CanvasEenIE(InfoExtractor): - IE_DESC = 'canvas.be and een.be' - _VALID_URL = r'https?://(?:www\.)?(?Pcanvas|een)\.be/(?:[^/]+/)*(?P[^/?#&]+)' - _TESTS = [{ - 'url': 'http://www.canvas.be/video/de-afspraak/najaar-2015/de-afspraak-veilt-voor-de-warmste-week', - 'md5': 'ed66976748d12350b118455979cca293', - 'info_dict': { - 'id': 'mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e', - 'display_id': 'de-afspraak-veilt-voor-de-warmste-week', - 'ext': 'flv', - 'title': 'De afspraak veilt voor de Warmste Week', - 'description': 'md5:24cb860c320dc2be7358e0e5aa317ba6', - 'thumbnail': 
r're:^https?://.*\.jpg$', - 'duration': 49.02, - }, - 'expected_warnings': ['is not a supported codec'], - }, { - # with subtitles - 'url': 'http://www.canvas.be/video/panorama/2016/pieter-0167', - 'info_dict': { - 'id': 'mz-ast-5240ff21-2d30-4101-bba6-92b5ec67c625', - 'display_id': 'pieter-0167', - 'ext': 'mp4', - 'title': 'Pieter 0167', - 'description': 'md5:943cd30f48a5d29ba02c3a104dc4ec4e', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 2553.08, - 'subtitles': { - 'nl': [{ - 'ext': 'vtt', - }], - }, - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'Pagina niet gevonden', - }, { - 'url': 'https://www.een.be/thuis/emma-pakt-thilly-aan', - 'info_dict': { - 'id': 'md-ast-3a24ced2-64d7-44fb-b4ed-ed1aafbf90b8', - 'display_id': 'emma-pakt-thilly-aan', - 'ext': 'mp4', - 'title': 'Emma pakt Thilly aan', - 'description': 'md5:c5c9b572388a99b2690030afa3f3bad7', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 118.24, - }, - 'params': { - 'skip_download': True, - }, - 'expected_warnings': ['is not a supported codec'], - }, { - 'url': 'https://www.canvas.be/check-point/najaar-2016/de-politie-uw-vriend', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - site_id, display_id = mobj.group('site_id'), mobj.group('id') - - webpage = self._download_webpage(url, display_id) - - title = strip_or_none(self._search_regex( - r']+class="video__body__header__title"[^>]*>(.+?)', - webpage, 'title', default=None) or self._og_search_title( - webpage, default=None)) - - video_id = self._html_search_regex( - r'data-video=(["\'])(?P(?:(?!\1).)+)\1', webpage, 'video id', - group='id') - - return { - '_type': 'url_transparent', - 'url': 'https://mediazone.vrt.be/api/v1/%s/assets/%s' % (site_id, video_id), - 'ie_key': CanvasIE.ie_key(), - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': self._og_search_description(webpage), - } - - -class VrtNUIE(GigyaBaseIE): - IE_DESC = 'VrtNU.be' - 
_VALID_URL = r'https?://(?:www\.)?vrt\.be/vrtnu/a-z/(?:[^/]+/){2}(?P[^/?#&]+)' - _TESTS = [{ - # Available via old API endpoint - 'url': 'https://www.vrt.be/vrtnu/a-z/postbus-x/1989/postbus-x-s1989a1/', - 'info_dict': { - 'id': 'pbs-pub-e8713dac-899e-41de-9313-81269f4c04ac$vid-90c932b1-e21d-4fb8-99b1-db7b49cf74de', - 'ext': 'mp4', - 'title': 'Postbus X - Aflevering 1 (Seizoen 1989)', - 'description': 'md5:b704f669eb9262da4c55b33d7c6ed4b7', - 'duration': 1457.04, - 'thumbnail': r're:^https?://.*\.jpg$', - 'series': 'Postbus X', - 'season': 'Seizoen 1989', - 'season_number': 1989, - 'episode': 'De zwarte weduwe', - 'episode_number': 1, - 'timestamp': 1595822400, - 'upload_date': '20200727', - }, - 'skip': 'This video is only available for registered users', - 'expected_warnings': ['is not a supported codec'], - }, { - # Only available via new API endpoint - 'url': 'https://www.vrt.be/vrtnu/a-z/kamp-waes/1/kamp-waes-s1a5/', - 'info_dict': { - 'id': 'pbs-pub-0763b56c-64fb-4d38-b95b-af60bf433c71$vid-ad36a73c-4735-4f1f-b2c0-a38e6e6aa7e1', - 'ext': 'mp4', - 'title': 'Aflevering 5', - 'description': 'Wie valt door de mand tijdens een missie?', - 'duration': 2967.06, - 'season': 'Season 1', - 'season_number': 1, - 'episode_number': 5, - }, - 'skip': 'This video is only available for registered users', - 'expected_warnings': ['Unable to download asset JSON', 'is not a supported codec', 'Unknown MIME type'], - }] - _NETRC_MACHINE = 'vrtnu' - _APIKEY = '3_0Z2HujMtiWq_pkAjgnS2Md2E11a1AwZjYiBETtwNE-EoEHDINgtnvcAOpNgmrVGy' - _CONTEXT_ID = 'R3595707040' - - def _perform_login(self, username, password): - auth_info = self._gigya_login({ - 'APIKey': self._APIKEY, - 'targetEnv': 'jssdk', - 'loginID': username, - 'password': password, - 'authMode': 'cookie', - }) - - if auth_info.get('errorDetails'): - raise ExtractorError('Unable to login: VrtNU said: ' + auth_info.get('errorDetails'), expected=True) - - # Sometimes authentication fails for no good reason, retry - login_attempt = 1 - 
while login_attempt <= 3: - try: - self._request_webpage('https://token.vrt.be/vrtnuinitlogin', - None, note='Requesting XSRF Token', errnote='Could not get XSRF Token', - query={'provider': 'site', 'destination': 'https://www.vrt.be/vrtnu/'}) - - post_data = { - 'UID': auth_info['UID'], - 'UIDSignature': auth_info['UIDSignature'], - 'signatureTimestamp': auth_info['signatureTimestamp'], - '_csrf': self._get_cookies('https://login.vrt.be').get('OIDCXSRF').value, - } - - self._request_webpage( - 'https://login.vrt.be/perform_login', - None, note='Performing login', errnote='perform login failed', - headers={}, query={ - 'client_id': 'vrtnu-site' - }, data=urlencode_postdata(post_data)) - - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: - login_attempt += 1 - self.report_warning('Authentication failed') - self._sleep(1, None, msg_template='Waiting for %(timeout)s seconds before trying again') - else: - raise e - else: - break - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - attrs = extract_attributes(self._search_regex( - r'(]+>)', webpage, 'media element')) - video_id = attrs['videoid'] - publication_id = attrs.get('publicationid') - if publication_id: - video_id = publication_id + '$' + video_id - - page = (self._parse_json(self._search_regex( - r'digitalData\s*=\s*({.+?});', webpage, 'digial data', - default='{}'), video_id, fatal=False) or {}).get('page') or {} - - info = self._search_json_ld(webpage, display_id, default={}) - return merge_dicts(info, { - '_type': 'url_transparent', - 'url': 'https://mediazone.vrt.be/api/v1/vrtvideo/assets/%s' % video_id, - 'ie_key': CanvasIE.ie_key(), - 'id': video_id, - 'display_id': display_id, - 'season_number': int_or_none(page.get('episode_season')), - }) - - -class DagelijkseKostIE(InfoExtractor): - IE_DESC = 'dagelijksekost.een.be' - _VALID_URL = 
r'https?://dagelijksekost\.een\.be/gerechten/(?P[^/?#&]+)' - _TEST = { - 'url': 'https://dagelijksekost.een.be/gerechten/hachis-parmentier-met-witloof', - 'md5': '30bfffc323009a3e5f689bef6efa2365', - 'info_dict': { - 'id': 'md-ast-27a4d1ff-7d7b-425e-b84f-a4d227f592fa', - 'display_id': 'hachis-parmentier-met-witloof', - 'ext': 'mp4', - 'title': 'Hachis parmentier met witloof', - 'description': 'md5:9960478392d87f63567b5b117688cdc5', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 283.02, - }, - 'expected_warnings': ['is not a supported codec'], - } - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - title = strip_or_none(get_element_by_class( - 'dish-metadata__title', webpage - ) or self._html_search_meta( - 'twitter:title', webpage)) - - description = clean_html(get_element_by_class( - 'dish-description', webpage) - ) or self._html_search_meta( - ('description', 'twitter:description', 'og:description'), - webpage) - - video_id = self._html_search_regex( - r'data-url=(["\'])(?P(?:(?!\1).)+)\1', webpage, 'video id', - group='id') - - return { - '_type': 'url_transparent', - 'url': 'https://mediazone.vrt.be/api/v1/dako/assets/%s' % video_id, - 'ie_key': CanvasIE.ie_key(), - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': description, - } diff --git a/plugins/youtube_download/yt_dlp/extractor/cbc.py b/plugins/youtube_download/yt_dlp/extractor/cbc.py index 210f5f8..9413281 100644 --- a/plugins/youtube_download/yt_dlp/extractor/cbc.py +++ b/plugins/youtube_download/yt_dlp/extractor/cbc.py @@ -8,14 +8,16 @@ from ..compat import ( compat_str, ) from ..utils import ( + ExtractorError, int_or_none, join_nonempty, js_to_json, orderedSet, + parse_iso8601, smuggle_url, strip_or_none, + traverse_obj, try_get, - ExtractorError, ) @@ -159,7 +161,7 @@ class CBCPlayerIE(InfoExtractor): 'upload_date': '20160210', 'uploader': 'CBCC-NEW', }, - 'skip': 'Geo-restricted to 
Canada', + 'skip': 'Geo-restricted to Canada and no longer available', }, { # Redirected from http://www.cbc.ca/player/AudioMobile/All%20in%20a%20Weekend%20Montreal/ID/2657632011/ 'url': 'http://www.cbc.ca/player/play/2657631896', @@ -172,6 +174,9 @@ class CBCPlayerIE(InfoExtractor): 'timestamp': 1425704400, 'upload_date': '20150307', 'uploader': 'CBCC-NEW', + 'thumbnail': 'http://thumbnails.cbc.ca/maven_legacy/thumbnails/sonali-karnick-220.jpg', + 'chapters': [], + 'duration': 494.811, }, }, { 'url': 'http://www.cbc.ca/player/play/2164402062', @@ -184,6 +189,28 @@ class CBCPlayerIE(InfoExtractor): 'timestamp': 1320410746, 'upload_date': '20111104', 'uploader': 'CBCC-NEW', + 'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/277/67/cancer_852x480_2164412612.jpg', + 'chapters': [], + 'duration': 186.867, + }, + }, { + # Has subtitles + # These broadcasts expire after ~1 month, can find new test URL here: + # https://www.cbc.ca/player/news/TV%20Shows/The%20National/Latest%20Broadcast + 'url': 'http://www.cbc.ca/player/play/2249992771553', + 'md5': '2f2fb675dd4f0f8a5bb7588d1b13bacd', + 'info_dict': { + 'id': '2249992771553', + 'ext': 'mp4', + 'title': 'The National | Women’s soccer pay, Florida seawater, Swift quake', + 'description': 'md5:adba28011a56cfa47a080ff198dad27a', + 'timestamp': 1690596000, + 'duration': 2716.333, + 'subtitles': {'eng': [{'ext': 'vtt', 'protocol': 'm3u8_native'}]}, + 'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/481/326/thumbnail.jpeg', + 'uploader': 'CBCC-NEW', + 'chapters': 'count:5', + 'upload_date': '20230729', }, }] @@ -197,12 +224,13 @@ class CBCPlayerIE(InfoExtractor): 'force_smil_url': True }), 'id': video_id, + '_format_sort_fields': ('res', 'proto') # Prioritize direct http formats over HLS } class CBCGemIE(InfoExtractor): IE_NAME = 'gem.cbc.ca' - _VALID_URL = r'https?://gem\.cbc\.ca/media/(?P[0-9a-z-]+/s[0-9]+[a-z][0-9]+)' + _VALID_URL = 
r'https?://gem\.cbc\.ca/(?:media/)?(?P[0-9a-z-]+/s[0-9]+[a-z][0-9]+)' _TESTS = [{ # This is a normal, public, TV show video 'url': 'https://gem.cbc.ca/media/schitts-creek/s06e01', @@ -245,6 +273,9 @@ class CBCGemIE(InfoExtractor): }, 'params': {'format': 'bv'}, 'skip': 'Geo-restricted to Canada', + }, { + 'url': 'https://gem.cbc.ca/nadiyas-family-favourites/s01e01', + 'only_matching': True, }] _GEO_COUNTRIES = ['CA'] @@ -346,7 +377,9 @@ class CBCGemIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - video_info = self._download_json('https://services.radio-canada.ca/ott/cbc-api/v2/assets/' + video_id, video_id) + video_info = self._download_json( + f'https://services.radio-canada.ca/ott/cbc-api/v2/assets/{video_id}', + video_id, expected_status=426) email, password = self._get_login_info() if email and password: @@ -401,7 +434,7 @@ class CBCGemIE(InfoExtractor): class CBCGemPlaylistIE(InfoExtractor): IE_NAME = 'gem.cbc.ca:playlist' - _VALID_URL = r'https?://gem\.cbc\.ca/media/(?P(?P[0-9a-z-]+)/s(?P[0-9]+))/?(?:[?#]|$)' + _VALID_URL = r'https?://gem\.cbc\.ca/(?:media/)?(?P(?P[0-9a-z-]+)/s(?P[0-9]+))/?(?:[?#]|$)' _TESTS = [{ # TV show playlist, all public videos 'url': 'https://gem.cbc.ca/media/schitts-creek/s06', @@ -411,6 +444,9 @@ class CBCGemPlaylistIE(InfoExtractor): 'title': 'Season 6', 'description': 'md5:6a92104a56cbeb5818cc47884d4326a2', }, + }, { + 'url': 'https://gem.cbc.ca/schitts-creek/s06', + 'only_matching': True, }] _API_BASE = 'https://services.radio-canada.ca/ott/cbc-api/v2/shows/' @@ -418,7 +454,7 @@ class CBCGemPlaylistIE(InfoExtractor): match = self._match_valid_url(url) season_id = match.group('id') show = match.group('show') - show_info = self._download_json(self._API_BASE + show, season_id) + show_info = self._download_json(self._API_BASE + show, season_id, expected_status=426) season = int(match.group('season')) season_info = next((s for s in show_info['seasons'] if s.get('season') == season), None) @@ -470,49 
+506,90 @@ class CBCGemPlaylistIE(InfoExtractor): class CBCGemLiveIE(InfoExtractor): IE_NAME = 'gem.cbc.ca:live' - _VALID_URL = r'https?://gem\.cbc\.ca/live/(?P\d+)' - _TEST = { - 'url': 'https://gem.cbc.ca/live/920604739687', - 'info_dict': { - 'title': 'Ottawa', - 'description': 'The live TV channel and local programming from Ottawa', - 'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/CBC_OTT_VMS/Live_Channel_Static_Images/Ottawa_2880x1620.jpg', - 'is_live': True, - 'id': 'AyqZwxRqh8EH', - 'ext': 'mp4', - 'timestamp': 1492106160, - 'upload_date': '20170413', - 'uploader': 'CBCC-NEW', + _VALID_URL = r'https?://gem\.cbc\.ca/live(?:-event)?/(?P\d+)' + _TESTS = [ + { + 'url': 'https://gem.cbc.ca/live/920604739687', + 'info_dict': { + 'title': 'Ottawa', + 'description': 'The live TV channel and local programming from Ottawa', + 'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/CBC_OTT_VMS/Live_Channel_Static_Images/Ottawa_2880x1620.jpg', + 'is_live': True, + 'id': 'AyqZwxRqh8EH', + 'ext': 'mp4', + 'timestamp': 1492106160, + 'upload_date': '20170413', + 'uploader': 'CBCC-NEW', + }, + 'skip': 'Live might have ended', }, - 'skip': 'Live might have ended', - } - - # It's unclear where the chars at the end come from, but they appear to be - # constant. Might need updating in the future. - # There are two URLs, some livestreams are in one, and some - # in the other. The JSON schema is the same for both. 
- _API_URLS = ['https://tpfeed.cbc.ca/f/ExhSPC/t_t3UKJR6MAT', 'https://tpfeed.cbc.ca/f/ExhSPC/FNiv9xQx_BnT'] + { + 'url': 'https://gem.cbc.ca/live/44', + 'info_dict': { + 'id': '44', + 'ext': 'mp4', + 'is_live': True, + 'title': r're:^Ottawa [0-9\-: ]+', + 'description': 'The live TV channel and local programming from Ottawa', + 'live_status': 'is_live', + 'thumbnail': r're:https://images.gem.cbc.ca/v1/cbc-gem/live/.*' + }, + 'params': {'skip_download': True}, + 'skip': 'Live might have ended', + }, + { + 'url': 'https://gem.cbc.ca/live-event/10835', + 'info_dict': { + 'id': '10835', + 'ext': 'mp4', + 'is_live': True, + 'title': r're:^The National \| Biden’s trip wraps up, Paltrow testifies, Bird flu [0-9\-: ]+', + 'description': 'March 24, 2023 | President Biden’s Ottawa visit ends with big pledges from both countries. Plus, Gwyneth Paltrow testifies in her ski collision trial.', + 'live_status': 'is_live', + 'thumbnail': r're:https://images.gem.cbc.ca/v1/cbc-gem/live/.*', + 'timestamp': 1679706000, + 'upload_date': '20230325', + }, + 'params': {'skip_download': True}, + 'skip': 'Live might have ended', + } + ] def _real_extract(self, url): video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + video_info = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['data'] - for api_url in self._API_URLS: - video_info = next(( - stream for stream in self._download_json(api_url, video_id)['entries'] - if stream.get('guid') == video_id), None) - if video_info: - break - else: + # Two types of metadata JSON + if not video_info.get('formattedIdMedia'): + video_info = traverse_obj( + video_info, (('freeTv', ('streams', ...)), 'items', lambda _, v: v['key'] == video_id, {dict}), + get_all=False, default={}) + + video_stream_id = video_info.get('formattedIdMedia') + if not video_stream_id: raise ExtractorError('Couldn\'t find video metadata, maybe this livestream is now offline', expected=True) + stream_data = self._download_json( + 
'https://services.radio-canada.ca/media/validation/v2/', video_id, query={ + 'appCode': 'mpx', + 'connectionType': 'hd', + 'deviceType': 'ipad', + 'idMedia': video_stream_id, + 'multibitrate': 'true', + 'output': 'json', + 'tech': 'hls', + 'manifestType': 'desktop', + }) + return { - '_type': 'url_transparent', - 'ie_key': 'ThePlatform', - 'url': video_info['content'][0]['url'], 'id': video_id, - 'title': video_info.get('title'), - 'description': video_info.get('description'), - 'tags': try_get(video_info, lambda x: x['keywords'].split(', ')), - 'thumbnail': video_info.get('cbc$staticImage'), + 'formats': self._extract_m3u8_formats(stream_data['url'], video_id, 'mp4', live=True), 'is_live': True, + **traverse_obj(video_info, { + 'title': 'title', + 'description': 'description', + 'thumbnail': ('images', 'card', 'url'), + 'timestamp': ('airDate', {parse_iso8601}), + }) } diff --git a/plugins/youtube_download/yt_dlp/extractor/cbs.py b/plugins/youtube_download/yt_dlp/extractor/cbs.py index 9aacd50..1c0dbde 100644 --- a/plugins/youtube_download/yt_dlp/extractor/cbs.py +++ b/plugins/youtube_download/yt_dlp/extractor/cbs.py @@ -1,8 +1,14 @@ +from .brightcove import BrightcoveNewIE +from .common import InfoExtractor from .theplatform import ThePlatformFeedIE +from .youtube import YoutubeIE from ..utils import ( ExtractorError, + extract_attributes, + get_element_html_by_id, int_or_none, find_xpath_attr, + smuggle_url, xpath_element, xpath_text, update_url_query, @@ -162,3 +168,110 @@ class CBSIE(CBSBaseIE): 'duration': int_or_none(xpath_text(video_data, 'videoLength'), 1000), 'thumbnail': url_or_none(xpath_text(video_data, 'previewImageURL')), }) + + +class ParamountPressExpressIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?paramountpressexpress\.com(?:/[\w-]+)+/(?Pyt-)?video/?\?watch=(?P[\w-]+)' + _TESTS = [{ + 'url': 'https://www.paramountpressexpress.com/cbs-entertainment/shows/survivor/video/?watch=pnzew7e2hx', + 'md5': '56631dbcadaab980d1fc47cb7b76cba4', + 
'info_dict': { + 'id': '6322981580112', + 'ext': 'mp4', + 'title': 'I’m Felicia', + 'description': 'md5:88fad93f8eede1c9c8f390239e4c6290', + 'uploader_id': '6055873637001', + 'upload_date': '20230320', + 'timestamp': 1679334960, + 'duration': 49.557, + 'thumbnail': r're:^https://.+\.jpg', + 'tags': [], + }, + }, { + 'url': 'https://www.paramountpressexpress.com/cbs-entertainment/video/?watch=2s5eh8kppc', + 'md5': 'edcb03e3210b88a3e56c05aa863e0e5b', + 'info_dict': { + 'id': '6323036027112', + 'ext': 'mp4', + 'title': '‘Y&R’ Set Visit: Jerry O’Connell Quizzes Cast on Pre-Love Scene Rituals and More', + 'description': 'md5:b929867a357aac5544b783d834c78383', + 'uploader_id': '6055873637001', + 'upload_date': '20230321', + 'timestamp': 1679430180, + 'duration': 132.032, + 'thumbnail': r're:^https://.+\.jpg', + 'tags': [], + }, + }, { + 'url': 'https://www.paramountpressexpress.com/paramount-plus/yt-video/?watch=OX9wJWOcqck', + 'info_dict': { + 'id': 'OX9wJWOcqck', + 'ext': 'mp4', + 'title': 'Rugrats | Season 2 Official Trailer | Paramount+', + 'description': 'md5:1f7e26f5625a9f0d6564d9ad97a9f7de', + 'uploader': 'Paramount Plus', + 'uploader_id': '@paramountplus', + 'uploader_url': 'http://www.youtube.com/@paramountplus', + 'channel': 'Paramount Plus', + 'channel_id': 'UCrRttZIypNTA1Mrfwo745Sg', + 'channel_url': 'https://www.youtube.com/channel/UCrRttZIypNTA1Mrfwo745Sg', + 'upload_date': '20230316', + 'duration': 88, + 'age_limit': 0, + 'availability': 'public', + 'live_status': 'not_live', + 'playable_in_embed': True, + 'view_count': int, + 'like_count': int, + 'channel_follower_count': int, + 'thumbnail': 'https://i.ytimg.com/vi/OX9wJWOcqck/maxresdefault.jpg', + 'categories': ['Entertainment'], + 'tags': ['Rugrats'], + }, + }, { + 'url': 'https://www.paramountpressexpress.com/showtime/yt-video/?watch=_ljssSoDLkw', + 'info_dict': { + 'id': '_ljssSoDLkw', + 'ext': 'mp4', + 'title': 'Lavell Crawford: THEE Lavell Crawford Comedy Special Official Trailer | SHOWTIME', + 
'description': 'md5:39581bcc3fd810209b642609f448af70', + 'uploader': 'SHOWTIME', + 'uploader_id': '@Showtime', + 'uploader_url': 'http://www.youtube.com/@Showtime', + 'channel': 'SHOWTIME', + 'channel_id': 'UCtwMWJr2BFPkuJTnSvCESSQ', + 'channel_url': 'https://www.youtube.com/channel/UCtwMWJr2BFPkuJTnSvCESSQ', + 'upload_date': '20230209', + 'duration': 49, + 'age_limit': 0, + 'availability': 'public', + 'live_status': 'not_live', + 'playable_in_embed': True, + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'channel_follower_count': int, + 'thumbnail': 'https://i.ytimg.com/vi_webp/_ljssSoDLkw/maxresdefault.webp', + 'categories': ['People & Blogs'], + 'tags': 'count:27', + }, + }] + + def _real_extract(self, url): + display_id, is_youtube = self._match_valid_url(url).group('id', 'yt') + if is_youtube: + return self.url_result(display_id, YoutubeIE) + + webpage = self._download_webpage(url, display_id) + video_id = self._search_regex( + r'\bvideo_id\s*=\s*["\'](\d+)["\']\s*,', webpage, 'Brightcove ID') + token = self._search_regex(r'\btoken\s*=\s*["\']([\w.-]+)["\']', webpage, 'token') + + player = extract_attributes(get_element_html_by_id('vcbrightcoveplayer', webpage) or '') + account_id = player.get('data-account') or '6055873637001' + player_id = player.get('data-player') or 'OtLKgXlO9F' + embed = player.get('data-embed') or 'default' + + return self.url_result(smuggle_url( + f'https://players.brightcove.net/{account_id}/{player_id}_{embed}/index.html?videoId={video_id}', + {'token': token}), BrightcoveNewIE) diff --git a/plugins/youtube_download/yt_dlp/extractor/cbslocal.py b/plugins/youtube_download/yt_dlp/extractor/cbslocal.py deleted file mode 100644 index 3d50b04..0000000 --- a/plugins/youtube_download/yt_dlp/extractor/cbslocal.py +++ /dev/null @@ -1,116 +0,0 @@ -from .anvato import AnvatoIE -from .sendtonews import SendtoNewsIE -from ..compat import compat_urlparse -from ..utils import ( - parse_iso8601, - unified_timestamp, -) - - -class 
CBSLocalIE(AnvatoIE): # XXX: Do not subclass from concrete IE - _VALID_URL_BASE = r'https?://[a-z]+\.cbslocal\.com/' - _VALID_URL = _VALID_URL_BASE + r'video/(?P\d+)' - - _TESTS = [{ - 'url': 'http://newyork.cbslocal.com/video/3580809-a-very-blue-anniversary/', - 'info_dict': { - 'id': '3580809', - 'ext': 'mp4', - 'title': 'A Very Blue Anniversary', - 'description': 'CBS2’s Cindy Hsu has more.', - 'thumbnail': 're:^https?://.*', - 'timestamp': int, - 'upload_date': r're:^\d{8}$', - 'uploader': 'CBS', - 'subtitles': { - 'en': 'mincount:5', - }, - 'categories': [ - 'Stations\\Spoken Word\\WCBSTV', - 'Syndication\\AOL', - 'Syndication\\MSN', - 'Syndication\\NDN', - 'Syndication\\Yahoo', - 'Content\\News', - 'Content\\News\\Local News', - ], - 'tags': ['CBS 2 News Weekends', 'Cindy Hsu', 'Blue Man Group'], - }, - 'params': { - 'skip_download': True, - }, - }] - - def _real_extract(self, url): - mcp_id = self._match_id(url) - return self.url_result( - 'anvato:anvato_cbslocal_app_web_prod_547f3e49241ef0e5d30c79b2efbca5d92c698f67:' + mcp_id, 'Anvato', mcp_id) - - -class CBSLocalArticleIE(AnvatoIE): # XXX: Do not subclass from concrete IE - _VALID_URL = CBSLocalIE._VALID_URL_BASE + r'\d+/\d+/\d+/(?P[0-9a-z-]+)' - - _TESTS = [{ - # Anvato backend - 'url': 'http://losangeles.cbslocal.com/2016/05/16/safety-advocates-say-fatal-car-seat-failures-are-public-health-crisis', - 'md5': 'f0ee3081e3843f575fccef901199b212', - 'info_dict': { - 'id': '3401037', - 'ext': 'mp4', - 'title': 'Safety Advocates Say Fatal Car Seat Failures Are \'Public Health Crisis\'', - 'description': 'Collapsing seats have been the focus of scrutiny for decades, though experts say remarkably little has been done to address the issue. 
Randy Paige reports.', - 'thumbnail': 're:^https?://.*', - 'timestamp': 1463440500, - 'upload_date': '20160516', - 'uploader': 'CBS', - 'subtitles': { - 'en': 'mincount:5', - }, - 'categories': [ - 'Stations\\Spoken Word\\KCBSTV', - 'Syndication\\MSN', - 'Syndication\\NDN', - 'Syndication\\AOL', - 'Syndication\\Yahoo', - 'Syndication\\Tribune', - 'Syndication\\Curb.tv', - 'Content\\News' - ], - 'tags': ['CBS 2 News Evening'], - }, - }, { - # SendtoNews embed - 'url': 'http://cleveland.cbslocal.com/2016/05/16/indians-score-season-high-15-runs-in-blowout-win-over-reds-rapid-reaction/', - 'info_dict': { - 'id': 'GxfCe0Zo7D-175909-5588', - }, - 'playlist_count': 9, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - sendtonews_url = SendtoNewsIE._extract_url(webpage) - if sendtonews_url: - return self.url_result( - compat_urlparse.urljoin(url, sendtonews_url), - ie=SendtoNewsIE.ie_key()) - - info_dict = self._extract_anvato_videos(webpage, display_id) - - timestamp = unified_timestamp(self._html_search_regex( - r'class="(?:entry|post)-date"[^>]*>([^<]+)', webpage, - 'released date', default=None)) or parse_iso8601( - self._html_search_meta('uploadDate', webpage)) - - info_dict.update({ - 'display_id': display_id, - 'timestamp': timestamp, - }) - - return info_dict diff --git a/plugins/youtube_download/yt_dlp/extractor/cbsnews.py b/plugins/youtube_download/yt_dlp/extractor/cbsnews.py index 16edf3a..5a8ebb8 100644 --- a/plugins/youtube_download/yt_dlp/extractor/cbsnews.py +++ b/plugins/youtube_download/yt_dlp/extractor/cbsnews.py @@ -1,36 +1,153 @@ +import base64 import re +import urllib.error +import urllib.parse import zlib +from .anvato import AnvatoIE from .common import InfoExtractor -from .cbs import CBSIE -from ..compat import ( - compat_b64decode, - compat_urllib_parse_unquote, -) +from .paramountplus import ParamountPlusIE 
+from ..networking import HEADRequest from ..utils import ( + ExtractorError, + UserNotLive, + determine_ext, + float_or_none, + format_field, + int_or_none, + make_archive_id, + mimetype2ext, parse_duration, + smuggle_url, + traverse_obj, + url_or_none, ) -class CBSNewsEmbedIE(CBSIE): # XXX: Do not subclass from concrete IE +class CBSNewsBaseIE(InfoExtractor): + _LOCALES = { + 'atlanta': None, + 'baltimore': 'BAL', + 'boston': 'BOS', + 'chicago': 'CHI', + 'colorado': 'DEN', + 'detroit': 'DET', + 'losangeles': 'LA', + 'miami': 'MIA', + 'minnesota': 'MIN', + 'newyork': 'NY', + 'philadelphia': 'PHI', + 'pittsburgh': 'PIT', + 'sacramento': 'SAC', + 'sanfrancisco': 'SF', + 'texas': 'DAL', + } + _LOCALE_RE = '|'.join(map(re.escape, _LOCALES)) + _ANVACK = '5VD6Eyd6djewbCmNwBFnsJj17YAvGRwl' + + def _get_item(self, webpage, display_id): + return traverse_obj(self._search_json( + r'CBSNEWS\.defaultPayload\s*=', webpage, 'payload', display_id, + default={}), ('items', 0, {dict})) or {} + + def _get_video_url(self, item): + return traverse_obj(item, 'video', 'video2', expected_type=url_or_none) + + def _extract_playlist(self, webpage, playlist_id): + entries = [self.url_result(embed_url, CBSNewsEmbedIE) for embed_url in re.findall( + r']+data-src="(https?://(?:www\.)?cbsnews\.com/embed/video/[^#]*#[^"]+)"', webpage)] + if entries: + return self.playlist_result( + entries, playlist_id, self._html_search_meta(['og:title', 'twitter:title'], webpage), + self._html_search_meta(['og:description', 'twitter:description', 'description'], webpage)) + + def _extract_video(self, item, video_url, video_id): + if mimetype2ext(item.get('format'), default=determine_ext(video_url)) == 'mp4': + formats = [{'url': video_url, 'ext': 'mp4'}] + + else: + manifest = self._download_webpage(video_url, video_id, note='Downloading m3u8 information') + + anvato_id = self._search_regex(r'anvato-(\d+)', manifest, 'Anvato ID', default=None) + # Prefer Anvato if available; cbsnews.com m3u8 formats are 
re-encoded from Anvato source + if anvato_id: + return self.url_result( + smuggle_url(f'anvato:{self._ANVACK}:{anvato_id}', {'token': 'default'}), + AnvatoIE, url_transparent=True, _old_archive_ids=[make_archive_id(self, anvato_id)]) + + formats, _ = self._parse_m3u8_formats_and_subtitles( + manifest, video_url, 'mp4', m3u8_id='hls', video_id=video_id) + + def get_subtitles(subs_url): + return { + 'en': [{ + 'url': subs_url, + 'ext': 'dfxp', # TTAF1 + }], + } if url_or_none(subs_url) else None + + episode_meta = traverse_obj(item, { + 'season_number': ('season', {int_or_none}), + 'episode_number': ('episode', {int_or_none}), + }) if item.get('isFullEpisode') else {} + + return { + 'id': video_id, + 'formats': formats, + **traverse_obj(item, { + 'title': (None, ('fulltitle', 'title')), + 'description': 'dek', + 'timestamp': ('timestamp', {lambda x: float_or_none(x, 1000)}), + 'duration': ('duration', {float_or_none}), + 'subtitles': ('captions', {get_subtitles}), + 'thumbnail': ('images', ('hd', 'sd'), {url_or_none}), + 'is_live': ('type', {lambda x: x == 'live'}), + }, get_all=False), + **episode_meta, + } + + +class CBSNewsEmbedIE(CBSNewsBaseIE): IE_NAME = 'cbsnews:embed' _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/embed/video[^#]*#(?P.+)' _TESTS = [{ 'url': 
'https://www.cbsnews.com/embed/video/?v=1.c9b5b61492913d6660db0b2f03579ef25e86307a#1Vb7b9s2EP5XBAHbT6Gt98PAMKTJ0se6LVjWYWtdGBR1stlIpEBSTtwi%2F%2FvuJNkNhmHdGxgM2NL57vjd6zt%2B8PngdN%2Fyg79qeGvhzN%2FLGrS%2F%2BuBLB531V28%2B%2BO7Qg7%2Fy97r2z3xZ42NW8yLhDbA0S0KWlHnIijwKWJBHZZnHBa8Cgbpdf%2F89NM9Hi9fXifhpr8sr%2FlP848tn%2BTdXycX25zh4cdX%2FvHl6PmmPqnWQv9w8Ed%2B9GjYRim07bFEqdG%2BZVHuwTm65A7bVRrYtR5lAyMox7pigF6W4k%2By91mjspGsJ%2BwVae4%2BsvdnaO1p73HkXs%2FVisUDTGm7R8IcdnOROeq%2B19qT1amhA1VJtPenoTUgrtfKc9m7Rq8dP7nnjwOB7wg7ADdNt7VX64DWAWlKhPtmDEq22g4GF99x6Dk9E8OSsankHXqPNKDxC%2FdK7MLKTircTDgsI3mmj4OBdSq64dy7fd1x577RU1rt4cvMtOaulFYOd%2FLewRWvDO9lIgXFpZSnkZmjbv5SxKTPoQXClFbpsf%2Fhbbpzs0IB3vb8KkyzJQ%2BywOAgCrMpgRrz%2BKk4fvb7kFbR4XJCu0gAdtNO7woCwZTu%2BBUs9bam%2Fds71drVerpeisgrubLjAB4nnOSkWQnfr5W6o1ku5Xpr1MgrCbL0M0vUyDtfLLK15WiYp47xKWSLyjFVpwVmVJSLIoCjSOFkv3W7oKsVliwZJcB9nwXpZ5GEQQwY8jNKqKCBrgjTLeFxgdCIpazojDgnRtn43J6kG7nZ6cAbxh0EeFFk4%2B1u867cY5u4344n%2FxXjCqAjucdTHgLKojNKmSfO8KRsOFY%2FzKEYCKEJBzv90QA9nfm9gL%2BHulaFqUkz9ULUYxl62B3U%2FRVNLA8IhggaPycOoBuwOCESciDQVSSUgiOMsROB%2FhKfwCKOzEk%2B4k6rWd4uuT%2FwTDz7K7t3d3WLO8ISD95jSPQbayBacthbz86XVgxHwhex5zawzgDOmtp%2F3GPcXn0VXHdSS029%2Fj99UC%2FwJUvyKQ%2FzKyixIEVlYJOn4RxxuaH43Ty9fbJ5OObykHH435XAzJTHeOF4hhEUXD8URe%2FQ%2FBT%2BMpf8d5GN02Ox%2FfiGsl7TA7POu1xZ5%2BbTzcAVKMe48mqcC21hkacVEVScM26liVVBnrKkC4CLKyzAvHu0lhEaTKMFwI3a4SN9MsrfYzdBLq2vkwRD1gVviLT8kY9h2CHH6Y%2Bix6609weFtey4ESp60WtyeWMy%2BsmBuhsoKIyuoT%2Bq2R%2FrW5qi3g%2FvzS2j40DoixDP8%2BKP0yUdpXJ4l6Vla%2Bg9vce%2BC4yM5YlUcbA%2F0jLKdpmTwvsdN5z88nAIe08%2F0HgxeG1iv%2B6Hlhjh7uiW0SDzYNI92L401uha3JKYk268UVRzdOzNQvAaJqoXzAc80dAV440NZ1WVVAAMRYQ2KrGJFmDUsq8saWSnjvIj8t78y%2FRa3JRnbHVfyFpfwoDiGpPgjzekyUiKNlU3OMlwuLMmzgvEojllYVE2Z1HhImvsnk%2BuhusTEoB21PAtSFodeFK3iYhXEH9WOG2%2FkOE833sfeG%2Ff5cfHtEFNXgYes0%2FXj7aGivUgJ9XpusCtoNcNYVVnJVrrDo0OmJAutHCpuZul4W9lLcfy7BnuLPT02%2ByXsCTk%2B9zhzswIN04YueNSK%2BPtM0jS88QdLqSLJDTLsuGZJNolm2yO0PXh3UPnz9Ix5bfIAqxPjvETQsDCEiPG4QbqNyhBZISxybLnZYCrW5H3Axp690%2F0BJdXtDZ5ITuM4xj3f4oU
HGzc5JeJmZKpp%2FjwKh4wMV%2FV1yx3emLoR0MwbG4K%2F%2BZgVep3PnzXGDHZ6a3i%2Fk%2BJrONDN13%2Bnq6tBTYk4o7cLGhBtqCC4KwacGHpEVuoH5JNro%2FE6JfE6d5RydbiR76k%2BW5wioDHBIjw1euhHjUGRB0y5A97KoaPx6MlL%2BwgboUVtUFRI%2FLemgTpdtF59ii7pab08kuPcfWzs0l%2FRI5takWnFpka0zOgWRtYcuf9aIxZMxlwr6IiGpsb6j2DQUXPl%2FimXI599Ev7fWjoPD78A', - 'only_matching': True, + 'info_dict': { + 'id': '6ZP4cXvo9FaX3VLH7MF4CgY30JFpY_GA', + 'ext': 'mp4', + 'title': 'Cops investigate gorilla incident at Cincinnati Zoo', + 'description': 'md5:fee7441ab8aaeb3c693482394738102b', + 'duration': 350, + 'timestamp': 1464719713, + 'upload_date': '20160531', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'params': {'skip_download': 'm3u8'}, }] def _real_extract(self, url): - item = self._parse_json(zlib.decompress(compat_b64decode( - compat_urllib_parse_unquote(self._match_id(url))), - -zlib.MAX_WBITS).decode('utf-8'), None)['video']['items'][0] - return self._extract_video_info(item['mpxRefId'], 'cbsnews') + item = traverse_obj(self._parse_json(zlib.decompress(base64.b64decode( + urllib.parse.unquote(self._match_id(url))), + -zlib.MAX_WBITS).decode(), None), ('video', 'items', 0, {dict})) or {} + + video_id = item['mpxRefId'] + video_url = self._get_video_url(item) + if not video_url: + # Old embeds redirect user to ParamountPlus but most links are 404 + pplus_url = f'https://www.paramountplus.com/shows/video/{video_id}' + try: + self._request_webpage(HEADRequest(pplus_url), video_id) + return self.url_result(pplus_url, ParamountPlusIE) + except ExtractorError: + self.raise_no_formats('This video is no longer available', True, video_id) + + return self._extract_video(item, video_url, video_id) -class CBSNewsIE(CBSIE): # XXX: Do not subclass from concrete IE +class CBSNewsIE(CBSNewsBaseIE): IE_NAME = 'cbsnews' IE_DESC = 'CBS News' - _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/(?:news|video)/(?P[\da-z_-]+)' + _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/(?:news|video)/(?P[\w-]+)' _TESTS = [ { @@ -47,10 +164,7 @@ class 
CBSNewsIE(CBSIE): # XXX: Do not subclass from concrete IE 'timestamp': 1476046464, 'upload_date': '20161009', }, - 'params': { - # rtmp download - 'skip_download': True, - }, + 'skip': 'This video is no longer available', }, { 'url': 'https://www.cbsnews.com/video/fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack/', @@ -61,48 +175,234 @@ class CBSNewsIE(CBSIE): # XXX: Do not subclass from concrete IE 'description': 'md5:4a6983e480542d8b333a947bfc64ddc7', 'upload_date': '20140404', 'timestamp': 1396650660, - 'uploader': 'CBSI-NEW', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 205, 'subtitles': { 'en': [{ - 'ext': 'ttml', + 'ext': 'dfxp', }], }, }, 'params': { - # m3u8 download - 'skip_download': True, + 'skip_download': 'm3u8', }, }, { # 48 hours 'url': 'http://www.cbsnews.com/news/maria-ridulph-murder-will-the-nations-oldest-cold-case-to-go-to-trial-ever-get-solved/', 'info_dict': { + 'id': 'maria-ridulph-murder-will-the-nations-oldest-cold-case-to-go-to-trial-ever-get-solved', 'title': 'Cold as Ice', 'description': 'Can a childhood memory solve the 1957 murder of 7-year-old Maria Ridulph?', }, 'playlist_mincount': 7, }, + { + 'url': 'https://www.cbsnews.com/video/032823-cbs-evening-news/', + 'info_dict': { + 'id': '_2wuO7hD9LwtyM_TwSnVwnKp6kxlcXgE', + 'ext': 'mp4', + 'title': 'CBS Evening News, March 28, 2023', + 'description': 'md5:db20615aae54adc1d55a1fd69dc75d13', + 'duration': 1189, + 'timestamp': 1680042600, + 'upload_date': '20230328', + 'season': 'Season 2023', + 'season_number': 2023, + 'episode': 'Episode 83', + 'episode_number': 83, + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'params': { + 'skip_download': 'm3u8', + }, + }, ] def _real_extract(self, url): display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - entries = [] - for embed_url in re.findall(r']+data-src="(https?://(?:www\.)?cbsnews\.com/embed/video/[^#]*#[^"]+)"', webpage): - entries.append(self.url_result(embed_url, 
CBSNewsEmbedIE.ie_key())) - if entries: - return self.playlist_result( - entries, playlist_title=self._html_search_meta(['og:title', 'twitter:title'], webpage), - playlist_description=self._html_search_meta(['og:description', 'twitter:description', 'description'], webpage)) + playlist = self._extract_playlist(webpage, display_id) + if playlist: + return playlist - item = self._parse_json(self._html_search_regex( - r'CBSNEWS\.defaultPayload\s*=\s*({.+})', - webpage, 'video JSON info'), display_id)['items'][0] - return self._extract_video_info(item['mpxRefId'], 'cbsnews') + item = self._get_item(webpage, display_id) + video_id = item.get('mpxRefId') or display_id + video_url = self._get_video_url(item) + if not video_url: + self.raise_no_formats('No video content was found', expected=True, video_id=video_id) + + return self._extract_video(item, video_url, video_id) + + +class CBSLocalBaseIE(CBSNewsBaseIE): + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + item = self._get_item(webpage, display_id) + video_id = item.get('mpxRefId') or display_id + anvato_id = None + video_url = self._get_video_url(item) + + if not video_url: + anv_params = self._search_regex( + r']+\bdata-src="https?://w3\.mp\.lura\.live/player/prod/v3/anvload\.html\?key=([^"]+)"', + webpage, 'Anvato URL', default=None) + + if not anv_params: + playlist = self._extract_playlist(webpage, display_id) + if playlist: + return playlist + self.raise_no_formats('No video content was found', expected=True, video_id=video_id) + + anv_data = self._parse_json(base64.urlsafe_b64decode(f'{anv_params}===').decode(), video_id) + anvato_id = anv_data['v'] + return self.url_result( + smuggle_url(f'anvato:{anv_data.get("anvack") or self._ANVACK}:{anvato_id}', { + 'token': anv_data.get('token') or 'default', + }), AnvatoIE, url_transparent=True, _old_archive_ids=[make_archive_id(self, anvato_id)]) + + return self._extract_video(item, video_url, 
video_id) + + +class CBSLocalIE(CBSLocalBaseIE): + _VALID_URL = rf'https?://(?:www\.)?cbsnews\.com/(?:{CBSNewsBaseIE._LOCALE_RE})/(?:live/)?video/(?P[\w-]+)' + _TESTS = [{ + # Anvato video via defaultPayload JSON + 'url': 'https://www.cbsnews.com/newyork/video/1st-cannabis-dispensary-opens-in-queens/', + 'info_dict': { + 'id': '6376747', + 'ext': 'mp4', + 'title': '1st cannabis dispensary opens in Queens', + 'description': 'The dispensary is women-owned and located in Jamaica.', + 'uploader': 'CBS', + 'duration': 20, + 'timestamp': 1680193657, + 'upload_date': '20230330', + 'categories': ['Stations\\Spoken Word\\WCBSTV', 'Content\\Google', 'Content\\News', 'Content\\News\\Local News'], + 'tags': 'count:11', + 'thumbnail': 're:^https?://.*', + '_old_archive_ids': ['cbslocal 6376747'], + }, + 'params': {'skip_download': 'm3u8'}, + }, { + # cbsnews.com video via defaultPayload JSON + 'url': 'https://www.cbsnews.com/newyork/live/video/20230330171655-the-city-is-sounding-the-alarm-on-dangerous-social-media-challenges/', + 'info_dict': { + 'id': 'sJqfw7YvgSC6ant2zVmzt3y1jYKoL5J3', + 'ext': 'mp4', + 'title': 'the city is sounding the alarm on dangerous social media challenges', + 'description': 'md5:8eccc9b1b73be5138a52e9c4350d2cd6', + 'thumbnail': 'https://images-cbsn.cbsnews.com/prod/2023/03/30/story_22509622_1680196925.jpg', + 'duration': 41.0, + 'timestamp': 1680196615, + 'upload_date': '20230330', + }, + 'params': {'skip_download': 'm3u8'}, + }] + + +class CBSLocalArticleIE(CBSLocalBaseIE): + _VALID_URL = rf'https?://(?:www\.)?cbsnews\.com/(?:{CBSNewsBaseIE._LOCALE_RE})/news/(?P[\w-]+)' + _TESTS = [{ + # Anvato video via iframe embed + 'url': 'https://www.cbsnews.com/newyork/news/mta-station-agents-leaving-their-booths-to-provide-more-direct-customer-service/', + 'playlist_count': 2, + 'info_dict': { + 'id': 'mta-station-agents-leaving-their-booths-to-provide-more-direct-customer-service', + 'title': 'MTA station agents begin leaving their booths to provide more 
direct customer service', + 'description': 'The more than 2,200 agents will provide face-to-face customer service to passengers.', + }, + }, { + 'url': 'https://www.cbsnews.com/losangeles/news/safety-advocates-say-fatal-car-seat-failures-are-public-health-crisis/', + 'md5': 'f0ee3081e3843f575fccef901199b212', + 'info_dict': { + 'id': '3401037', + 'ext': 'mp4', + 'title': 'Safety Advocates Say Fatal Car Seat Failures Are \'Public Health Crisis\'', + 'thumbnail': 're:^https?://.*', + 'timestamp': 1463440500, + 'upload_date': '20160516', + }, + 'skip': 'Video has been removed', + }] + + +class CBSNewsLiveBaseIE(CBSNewsBaseIE): + def _get_id(self, url): + raise NotImplementedError('This method must be implemented by subclasses') + + def _real_extract(self, url): + video_id = self._get_id(url) + if not video_id: + raise ExtractorError('Livestream is not available', expected=True) + + data = traverse_obj(self._download_json( + 'https://feeds-cbsn.cbsnews.com/2.0/rundown/', video_id, query={ + 'partner': 'cbsnsite', + 'edition': video_id, + 'type': 'live', + }), ('navigation', 'data', 0, {dict})) + + video_url = traverse_obj(data, (('videoUrlDAI', ('videoUrl', 'base')), {url_or_none}), get_all=False) + if not video_url: + raise UserNotLive(video_id=video_id) + + formats, subtitles = self._extract_m3u8_formats_and_subtitles(video_url, video_id, 'mp4', m3u8_id='hls') + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + 'is_live': True, + **traverse_obj(data, { + 'title': 'headline', + 'description': 'rundown_slug', + 'thumbnail': ('images', 'thumbnail_url_hd', {url_or_none}), + }), + } + + +class CBSLocalLiveIE(CBSNewsLiveBaseIE): + _VALID_URL = rf'https?://(?:www\.)?cbsnews\.com/(?P{CBSNewsBaseIE._LOCALE_RE})/live/?(?:[?#]|$)' + _TESTS = [{ + 'url': 'https://www.cbsnews.com/losangeles/live/', + 'info_dict': { + 'id': 'CBSN-LA', + 'ext': 'mp4', + 'title': str, + 'description': r're:KCBS/CBSN_LA.CRISPIN.\w+.RUNDOWN \w+ \w+', + 'thumbnail': 
r're:^https?://.*\.jpg$', + 'live_status': 'is_live', + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _get_id(self, url): + return format_field(self._LOCALES, self._match_id(url), 'CBSN-%s') + + +class CBSNewsLiveIE(CBSNewsLiveBaseIE): + IE_NAME = 'cbsnews:live' + IE_DESC = 'CBS News Livestream' + _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/live/?(?:[?#]|$)' + _TESTS = [{ + 'url': 'https://www.cbsnews.com/live/', + 'info_dict': { + 'id': 'CBSN-US', + 'ext': 'mp4', + 'title': str, + 'description': r're:\w+ \w+ CRISPIN RUNDOWN', + 'thumbnail': r're:^https?://.*\.jpg$', + 'live_status': 'is_live', + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _get_id(self, url): + return 'CBSN-US' class CBSNewsLiveVideoIE(InfoExtractor): @@ -111,7 +411,7 @@ class CBSNewsLiveVideoIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/live/video/(?P[^/?#]+)' # Live videos get deleted soon. See http://www.cbsnews.com/live/ for the latest examples - _TEST = { + _TESTS = [{ 'url': 'http://www.cbsnews.com/live/video/clinton-sanders-prepare-to-face-off-in-nh/', 'info_dict': { 'id': 'clinton-sanders-prepare-to-face-off-in-nh', @@ -120,7 +420,7 @@ class CBSNewsLiveVideoIE(InfoExtractor): 'duration': 334, }, 'skip': 'Video gone', - } + }] def _real_extract(self, url): display_id = self._match_id(url) @@ -131,13 +431,13 @@ class CBSNewsLiveVideoIE(InfoExtractor): 'dvr_slug': display_id, }) - formats = self._extract_akamai_formats(video_info['url'], display_id) - return { 'id': display_id, 'display_id': display_id, - 'title': video_info['headline'], - 'thumbnail': video_info.get('thumbnail_url_hd') or video_info.get('thumbnail_url_sd'), - 'duration': parse_duration(video_info.get('segmentDur')), - 'formats': formats, + 'formats': self._extract_akamai_formats(video_info['url'], display_id), + **traverse_obj(video_info, { + 'title': 'headline', + 'thumbnail': ('thumbnail_url_hd', {url_or_none}), + 'duration': ('segmentDur', {parse_duration}), + }), } diff --git 
a/plugins/youtube_download/yt_dlp/extractor/ceskatelevize.py b/plugins/youtube_download/yt_dlp/extractor/ceskatelevize.py index be2b0bb..8390160 100644 --- a/plugins/youtube_download/yt_dlp/extractor/ceskatelevize.py +++ b/plugins/youtube_download/yt_dlp/extractor/ceskatelevize.py @@ -1,20 +1,20 @@ import re from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_unquote, - compat_urllib_parse_urlparse, -) +from ..compat import compat_urllib_parse_unquote, compat_urllib_parse_urlparse +from ..networking import Request from ..utils import ( ExtractorError, float_or_none, - sanitized_Request, str_or_none, traverse_obj, urlencode_postdata, - USER_AGENTS, ) +USER_AGENTS = { + 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27', +} + class CeskaTelevizeIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/(?:ivysilani|porady|zive)/(?:[^/?#&]+/)*(?P[^/#?]+)' @@ -97,7 +97,7 @@ class CeskaTelevizeIE(InfoExtractor): def _real_extract(self, url): playlist_id = self._match_id(url) webpage, urlh = self._download_webpage_handle(url, playlist_id) - parsed_url = compat_urllib_parse_urlparse(urlh.geturl()) + parsed_url = compat_urllib_parse_urlparse(urlh.url) site_name = self._og_search_property('site_name', webpage, fatal=False, default='Česká televize') playlist_title = self._og_search_title(webpage, default=None) if site_name and playlist_title: @@ -163,16 +163,16 @@ class CeskaTelevizeIE(InfoExtractor): entries = [] for user_agent in (None, USER_AGENTS['Safari']): - req = sanitized_Request( + req = Request( 'https://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist/', data=urlencode_postdata(data)) - req.add_header('Content-type', 'application/x-www-form-urlencoded') - req.add_header('x-addr', '127.0.0.1') - req.add_header('X-Requested-With', 'XMLHttpRequest') + req.headers['Content-type'] = 'application/x-www-form-urlencoded' + req.headers['x-addr'] = 
'127.0.0.1' + req.headers['X-Requested-With'] = 'XMLHttpRequest' if user_agent: - req.add_header('User-Agent', user_agent) - req.add_header('Referer', url) + req.headers['User-Agent'] = user_agent + req.headers['Referer'] = url playlistpage = self._download_json(req, playlist_id, fatal=False) @@ -183,8 +183,8 @@ class CeskaTelevizeIE(InfoExtractor): if playlist_url == 'error_region': raise ExtractorError(NOT_AVAILABLE_STRING, expected=True) - req = sanitized_Request(compat_urllib_parse_unquote(playlist_url)) - req.add_header('Referer', url) + req = Request(compat_urllib_parse_unquote(playlist_url)) + req.headers['Referer'] = url playlist = self._download_json(req, playlist_id, fatal=False) if not playlist: diff --git a/plugins/youtube_download/yt_dlp/extractor/chilloutzone.py b/plugins/youtube_download/yt_dlp/extractor/chilloutzone.py index 1a2f77c..ac4252f 100644 --- a/plugins/youtube_download/yt_dlp/extractor/chilloutzone.py +++ b/plugins/youtube_download/yt_dlp/extractor/chilloutzone.py @@ -1,93 +1,123 @@ -import json +import base64 from .common import InfoExtractor -from .youtube import YoutubeIE -from ..compat import compat_b64decode from ..utils import ( clean_html, - ExtractorError + int_or_none, + traverse_obj, ) class ChilloutzoneIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?chilloutzone\.net/video/(?P[\w|-]+)\.html' + _VALID_URL = r'https?://(?:www\.)?chilloutzone\.net/video/(?P[\w-]+)\.html' _TESTS = [{ - 'url': 'http://www.chilloutzone.net/video/enemene-meck-alle-katzen-weg.html', + 'url': 'https://www.chilloutzone.net/video/enemene-meck-alle-katzen-weg.html', 'md5': 'a76f3457e813ea0037e5244f509e66d1', 'info_dict': { 'id': 'enemene-meck-alle-katzen-weg', 'ext': 'mp4', 'title': 'Enemene Meck - Alle Katzen weg', 'description': 'Ist das der Umkehrschluss des Niesenden Panda-Babys?', + 'duration': 24, }, }, { 'note': 'Video hosted at YouTube', - 'url': 'http://www.chilloutzone.net/video/eine-sekunde-bevor.html', + 'url': 
'https://www.chilloutzone.net/video/eine-sekunde-bevor.html', 'info_dict': { 'id': '1YVQaAgHyRU', 'ext': 'mp4', 'title': '16 Photos Taken 1 Second Before Disaster', 'description': 'md5:58a8fcf6a459fe0a08f54140f0ad1814', 'uploader': 'BuzzFeedVideo', - 'uploader_id': 'BuzzFeedVideo', + 'uploader_id': '@BuzzFeedVideo', 'upload_date': '20131105', + 'availability': 'public', + 'thumbnail': 'https://i.ytimg.com/vi/1YVQaAgHyRU/maxresdefault.jpg', + 'tags': 'count:41', + 'like_count': int, + 'playable_in_embed': True, + 'channel_url': 'https://www.youtube.com/channel/UCpko_-a4wgz2u_DgDgd9fqA', + 'chapters': 'count:6', + 'live_status': 'not_live', + 'view_count': int, + 'categories': ['Entertainment'], + 'age_limit': 0, + 'channel_id': 'UCpko_-a4wgz2u_DgDgd9fqA', + 'duration': 100, + 'uploader_url': 'http://www.youtube.com/@BuzzFeedVideo', + 'channel_follower_count': int, + 'channel': 'BuzzFeedVideo', }, }, { - 'note': 'Video hosted at Vimeo', - 'url': 'http://www.chilloutzone.net/video/icon-blending.html', - 'md5': '2645c678b8dc4fefcc0e1b60db18dac1', + 'url': 'https://www.chilloutzone.net/video/icon-blending.html', + 'md5': '2f9d6850ec567b24f0f4fa143b9aa2f9', 'info_dict': { - 'id': '85523671', + 'id': 'LLNkHpSjBfc', 'ext': 'mp4', - 'title': 'The Sunday Times - Icons', - 'description': 're:(?s)^Watch the making of - makingoficons.com.{300,}', - 'uploader': 'Us', - 'uploader_id': 'usfilms', - 'upload_date': '20140131' + 'title': 'The Sunday Times Making of Icons', + 'description': 'md5:b9259fcf63a1669e42001e5db677f02a', + 'uploader': 'MadFoxUA', + 'uploader_id': '@MadFoxUA', + 'upload_date': '20140204', + 'channel_id': 'UCSZa9Y6-Vl7c11kWMcbAfCw', + 'channel_url': 'https://www.youtube.com/channel/UCSZa9Y6-Vl7c11kWMcbAfCw', + 'comment_count': int, + 'uploader_url': 'http://www.youtube.com/@MadFoxUA', + 'duration': 66, + 'live_status': 'not_live', + 'channel_follower_count': int, + 'playable_in_embed': True, + 'view_count': int, + 'like_count': int, + 'thumbnail': 
'https://i.ytimg.com/vi/LLNkHpSjBfc/maxresdefault.jpg', + 'categories': ['Comedy'], + 'availability': 'public', + 'tags': [], + 'channel': 'MadFoxUA', + 'age_limit': 0, + }, + }, { + 'url': 'https://www.chilloutzone.net/video/ordentlich-abgeschuettelt.html', + 'info_dict': { + 'id': 'ordentlich-abgeschuettelt', + 'ext': 'mp4', + 'title': 'Ordentlich abgeschüttelt', + 'description': 'md5:d41541966b75d3d1e8ea77a94ea0d329', + 'duration': 18, }, }] def _real_extract(self, url): - mobj = self._match_valid_url(url) - video_id = mobj.group('id') - + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + b64_data = self._html_search_regex( + r'var cozVidData\s*=\s*"([^"]+)"', webpage, 'video data') + info = self._parse_json(base64.b64decode(b64_data).decode(), video_id) - base64_video_info = self._html_search_regex( - r'var cozVidData = "(.+?)";', webpage, 'video data') - decoded_video_info = compat_b64decode(base64_video_info).decode('utf-8') - video_info_dict = json.loads(decoded_video_info) + video_url = info.get('mediaUrl') + native_platform = info.get('nativePlatform') - # get video information from dict - video_url = video_info_dict['mediaUrl'] - description = clean_html(video_info_dict.get('description')) - title = video_info_dict['title'] - native_platform = video_info_dict['nativePlatform'] - native_video_id = video_info_dict['nativeVideoId'] - source_priority = video_info_dict['sourcePriority'] - - # If nativePlatform is None a fallback mechanism is used (i.e. youtube embed) - if native_platform is None: - youtube_url = YoutubeIE._extract_url(webpage) - if youtube_url: - return self.url_result(youtube_url, ie=YoutubeIE.ie_key()) - - # Non Fallback: Decide to use native source (e.g. 
youtube or vimeo) or - # the own CDN - if source_priority == 'native': + if native_platform and info.get('sourcePriority') == 'native': + native_video_id = info['nativeVideoId'] if native_platform == 'youtube': - return self.url_result(native_video_id, ie='Youtube') - if native_platform == 'vimeo': - return self.url_result( - 'http://vimeo.com/' + native_video_id, ie='Vimeo') + return self.url_result(native_video_id, 'Youtube') + elif native_platform == 'vimeo': + return self.url_result(f'https://vimeo.com/{native_video_id}', 'Vimeo') - if not video_url: - raise ExtractorError('No video found') + elif not video_url: + # Possibly a standard youtube embed? + # TODO: Investigate if site still does this (there are no tests for it) + return self.url_result(url, 'Generic') return { 'id': video_id, 'url': video_url, 'ext': 'mp4', - 'title': title, - 'description': description, + **traverse_obj(info, { + 'title': 'title', + 'description': ('description', {clean_html}), + 'duration': ('videoLength', {int_or_none}), + 'width': ('videoWidth', {int_or_none}), + 'height': ('videoHeight', {int_or_none}), + }), } diff --git a/plugins/youtube_download/yt_dlp/extractor/cinetecamilano.py b/plugins/youtube_download/yt_dlp/extractor/cinetecamilano.py index 5e770eb..9cffa11 100644 --- a/plugins/youtube_download/yt_dlp/extractor/cinetecamilano.py +++ b/plugins/youtube_download/yt_dlp/extractor/cinetecamilano.py @@ -1,6 +1,6 @@ import json -import urllib.error from .common import InfoExtractor +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, float_or_none, @@ -40,7 +40,7 @@ class CinetecaMilanoIE(InfoExtractor): 'Authorization': try_get(self._get_cookies('https://www.cinetecamilano.it'), lambda x: f'Bearer {x["cnt-token"].value}') or '' }) except ExtractorError as e: - if ((isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 500) + if ((isinstance(e.cause, HTTPError) and e.cause.status == 500) or isinstance(e.cause, json.JSONDecodeError)): 
self.raise_login_required(method='cookies') raise diff --git a/plugins/youtube_download/yt_dlp/extractor/ciscowebex.py b/plugins/youtube_download/yt_dlp/extractor/ciscowebex.py index 0fcf022..85585df 100644 --- a/plugins/youtube_download/yt_dlp/extractor/ciscowebex.py +++ b/plugins/youtube_download/yt_dlp/extractor/ciscowebex.py @@ -33,7 +33,7 @@ class CiscoWebexIE(InfoExtractor): if rcid: webpage = self._download_webpage(url, None, note='Getting video ID') url = self._search_regex(self._VALID_URL, webpage, 'redirection url', group='url') - url = self._request_webpage(url, None, note='Resolving final URL').geturl() + url = self._request_webpage(url, None, note='Resolving final URL').url mobj = self._match_valid_url(url) subdomain = mobj.group('subdomain') siteurl = mobj.group('siteurl_1') or mobj.group('siteurl_2') diff --git a/plugins/youtube_download/yt_dlp/extractor/clipchamp.py b/plugins/youtube_download/yt_dlp/extractor/clipchamp.py new file mode 100644 index 0000000..a8bdf7e --- /dev/null +++ b/plugins/youtube_download/yt_dlp/extractor/clipchamp.py @@ -0,0 +1,61 @@ +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + traverse_obj, + unified_timestamp, + url_or_none, +) + + +class ClipchampIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?clipchamp\.com/watch/(?P[\w-]+)' + _TESTS = [{ + 'url': 'https://clipchamp.com/watch/gRXZ4ZhdDaU', + 'info_dict': { + 'id': 'gRXZ4ZhdDaU', + 'ext': 'mp4', + 'title': 'Untitled video', + 'uploader': 'Alexander Schwartz', + 'timestamp': 1680805580, + 'upload_date': '20230406', + 'thumbnail': r're:^https?://.+\.jpg', + }, + 'params': {'skip_download': 'm3u8'}, + }] + + _STREAM_URL_TMPL = 'https://%s.cloudflarestream.com/%s/manifest/video.%s' + _STREAM_URL_QUERY = {'parentOrigin': 'https://clipchamp.com'} + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + data = self._search_nextjs_data(webpage, 
video_id)['props']['pageProps']['video'] + + storage_location = data.get('storage_location') + if storage_location != 'cf_stream': + raise ExtractorError(f'Unsupported clip storage location "{storage_location}"') + + path = data['download_url'] + iframe = self._download_webpage( + f'https://iframe.cloudflarestream.com/{path}', video_id, 'Downloading player iframe') + subdomain = self._search_regex( + r'\bcustomer-domain-prefix=["\']([\w-]+)["\']', iframe, + 'subdomain', fatal=False) or 'customer-2ut9yn3y6fta1yxe' + + formats = self._extract_mpd_formats( + self._STREAM_URL_TMPL % (subdomain, path, 'mpd'), video_id, + query=self._STREAM_URL_QUERY, fatal=False, mpd_id='dash') + formats.extend(self._extract_m3u8_formats( + self._STREAM_URL_TMPL % (subdomain, path, 'm3u8'), video_id, 'mp4', + query=self._STREAM_URL_QUERY, fatal=False, m3u8_id='hls')) + + return { + 'id': video_id, + 'formats': formats, + 'uploader': ' '.join(traverse_obj(data, ('creator', ('first_name', 'last_name'), {str}))) or None, + **traverse_obj(data, { + 'title': ('project', 'project_name', {str}), + 'timestamp': ('created_at', {unified_timestamp}), + 'thumbnail': ('thumbnail_url', {url_or_none}), + }), + } diff --git a/plugins/youtube_download/yt_dlp/extractor/comedycentral.py b/plugins/youtube_download/yt_dlp/extractor/comedycentral.py index 05fc9f2..27d295b 100644 --- a/plugins/youtube_download/yt_dlp/extractor/comedycentral.py +++ b/plugins/youtube_download/yt_dlp/extractor/comedycentral.py @@ -2,7 +2,7 @@ from .mtv import MTVServicesInfoExtractor class ComedyCentralIE(MTVServicesInfoExtractor): - _VALID_URL = r'https?://(?:www\.)?cc\.com/(?:episodes|video(?:-clips)?|collection-playlist)/(?P[0-9a-z]{6})' + _VALID_URL = r'https?://(?:www\.)?cc\.com/(?:episodes|video(?:-clips)?|collection-playlist|movies)/(?P[0-9a-z]{6})' _FEED_URL = 'http://comedycentral.com/feeds/mrss/' _TESTS = [{ @@ -25,6 +25,9 @@ class ComedyCentralIE(MTVServicesInfoExtractor): }, { 'url': 
'https://www.cc.com/collection-playlist/cosnej/stand-up-specials/t6vtjb', 'only_matching': True, + }, { + 'url': 'https://www.cc.com/movies/tkp406/a-cluesterfuenke-christmas', + 'only_matching': True, }] diff --git a/plugins/youtube_download/yt_dlp/extractor/common.py b/plugins/youtube_download/yt_dlp/extractor/common.py index b7c687b..7deab99 100644 --- a/plugins/youtube_download/yt_dlp/extractor/common.py +++ b/plugins/youtube_download/yt_dlp/extractor/common.py @@ -13,6 +13,7 @@ import netrc import os import random import re +import subprocess import sys import time import types @@ -21,9 +22,21 @@ import urllib.request import xml.etree.ElementTree from ..compat import functools # isort: split -from ..compat import compat_etree_fromstring, compat_expanduser, compat_os_name +from ..compat import ( + compat_etree_fromstring, + compat_expanduser, + compat_os_name, + urllib_req_to_req, +) from ..cookies import LenientSimpleCookie from ..downloader.f4m import get_base_url, remove_encrypted_media +from ..downloader.hls import HlsFD +from ..networking import HEADRequest, Request +from ..networking.exceptions import ( + HTTPError, + IncompleteRead, + network_exceptions, +) from ..utils import ( IDENTITY, JSON_LD_RE, @@ -32,8 +45,8 @@ from ..utils import ( FormatSorter, GeoRestrictedError, GeoUtils, - HEADRequest, LenientJSONDecoder, + Popen, RegexNotFoundError, RetryManager, UnsupportedError, @@ -56,7 +69,7 @@ from ..utils import ( join_nonempty, js_to_json, mimetype2ext, - network_exceptions, + netrc_from_content, orderedSet, parse_bitrate, parse_codecs, @@ -66,7 +79,6 @@ from ..utils import ( parse_resolution, sanitize_filename, sanitize_url, - sanitized_Request, smuggle_url, str_or_none, str_to_int, @@ -78,8 +90,6 @@ from ..utils import ( unescapeHTML, unified_strdate, unified_timestamp, - update_Request, - update_url_query, url_basename, url_or_none, urlhandle_detect_ext, @@ -132,6 +142,7 @@ class InfoExtractor: is parsed from a string (in case of fragmented media) 
for MSS - URL of the ISM manifest. + * request_data Data to send in POST request to the URL * manifest_url The URL of the manifest file in case of fragmented media: @@ -219,7 +230,8 @@ class InfoExtractor: width : height ratio as float. * no_resume The server does not support resuming the (HTTP or RTMP) download. Boolean. - * has_drm The format has DRM and cannot be downloaded. Boolean + * has_drm True if the format has DRM and cannot be downloaded. + 'maybe' if the format may have DRM and has to be tested before download. * extra_param_to_segment_url A query string to append to each fragment's URL, or to update each existing query string with. Only applied by the native HLS/DASH downloaders. @@ -285,6 +297,7 @@ class InfoExtractor: channel_id: Id of the channel. channel_url: Full URL to a channel webpage. channel_follower_count: Number of followers of the channel. + channel_is_verified: Whether the channel is verified on the platform. location: Physical location where the video was filmed. subtitles: The available subtitles as a dictionary in the format {tag: subformats}. 
"tag" is usually a language code, and @@ -313,6 +326,11 @@ class InfoExtractor: * "author" - human-readable name of the comment author * "author_id" - user ID of the comment author * "author_thumbnail" - The thumbnail of the comment author + * "author_url" - The url to the comment author's page + * "author_is_verified" - Whether the author is verified + on the platform + * "author_is_uploader" - Whether the comment is made by + the video uploader * "id" - Comment ID * "html" - Comment as HTML * "text" - Plain text of the comment @@ -324,8 +342,8 @@ class InfoExtractor: * "dislike_count" - Number of negative ratings of the comment * "is_favorited" - Whether the comment is marked as favorite by the video uploader - * "author_is_uploader" - Whether the comment is made by - the video uploader + * "is_pinned" - Whether the comment is pinned to + the top of the comments age_limit: Age restriction for the video, as an integer (years) webpage_url: The URL to the video webpage, if given to yt-dlp it should allow to get the same result again. (It will be set @@ -349,6 +367,10 @@ class InfoExtractor: * "start_time" - The start time of the chapter in seconds * "end_time" - The end time of the chapter in seconds * "title" (optional, string) + heatmap: A list of dictionaries, with the following entries: + * "start_time" - The start time of the data point in seconds + * "end_time" - The end time of the data point in seconds + * "value" - The normalized value of the data point (float between 0 and 1) playable_in_embed: Whether this video is allowed to play in embedded players on other sites. Can be True (=always allowed), False (=never allowed), None (=unknown), or a string @@ -460,8 +482,8 @@ class InfoExtractor: Subclasses of this should also be added to the list of extractors and - should define a _VALID_URL regexp and, re-define the _real_extract() and - (optionally) _real_initialize() methods. 
+ should define _VALID_URL as a regexp or a Sequence of regexps, and + re-define the _real_extract() and (optionally) _real_initialize() methods. Subclasses may also override suitable() if necessary, but ensure the function signature is preserved and that this function imports everything it needs @@ -524,7 +546,7 @@ class InfoExtractor: _EMBED_REGEX = [] def _login_hint(self, method=NO_DEFAULT, netrc=None): - password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials' + password_hint = f'--username and --password, --netrc-cmd, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials' return { None: '', 'any': f'Use --cookies, --cookies-from-browser, {password_hint}', @@ -551,8 +573,8 @@ class InfoExtractor: # we have cached the regexp for *this* class, whereas getattr would also # match the superclass if '_VALID_URL_RE' not in cls.__dict__: - cls._VALID_URL_RE = re.compile(cls._VALID_URL) - return cls._VALID_URL_RE.match(url) + cls._VALID_URL_RE = tuple(map(re.compile, variadic(cls._VALID_URL))) + return next(filter(None, (regex.match(url) for regex in cls._VALID_URL_RE)), None) @classmethod def suitable(cls, url): @@ -707,11 +729,11 @@ class InfoExtractor: except UnsupportedError: raise except ExtractorError as e: - e.video_id = e.video_id or self.get_temp_id(url), + e.video_id = e.video_id or self.get_temp_id(url) e.ie = e.ie or self.IE_NAME, e.traceback = e.traceback or sys.exc_info()[2] raise - except http.client.IncompleteRead as e: + except IncompleteRead as e: raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url)) except (KeyError, StopIteration) as e: raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url)) @@ -770,20 +792,25 @@ class InfoExtractor: @staticmethod def __can_accept_status_code(err, expected_status): - assert isinstance(err, urllib.error.HTTPError) + assert 
isinstance(err, HTTPError) if expected_status is None: return False elif callable(expected_status): - return expected_status(err.code) is True + return expected_status(err.status) is True else: - return err.code in variadic(expected_status) + return err.status in variadic(expected_status) def _create_request(self, url_or_request, data=None, headers=None, query=None): if isinstance(url_or_request, urllib.request.Request): - return update_Request(url_or_request, data=data, headers=headers, query=query) - if query: - url_or_request = update_url_query(url_or_request, query) - return sanitized_Request(url_or_request, data, headers or {}) + self._downloader.deprecation_warning( + 'Passing a urllib.request.Request to _create_request() is deprecated. ' + 'Use yt_dlp.networking.common.Request instead.') + url_or_request = urllib_req_to_req(url_or_request) + elif not isinstance(url_or_request, Request): + url_or_request = Request(url_or_request) + + url_or_request.update(data=data, headers=headers, query=query) + return url_or_request def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None): """ @@ -819,14 +846,9 @@ class InfoExtractor: try: return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query)) except network_exceptions as err: - if isinstance(err, urllib.error.HTTPError): + if isinstance(err, HTTPError): if self.__can_accept_status_code(err, expected_status): - # Retain reference to error to prevent file object from - # being closed before it can be read. Works around the - # effects of - # introduced in Python 3.4.1. 
- err.fp._error = err - return err.fp + return err.response if errnote is False: return False @@ -958,11 +980,11 @@ class InfoExtractor: if prefix is not None: webpage_bytes = prefix + webpage_bytes if self.get_param('dump_intermediate_pages', False): - self.to_screen('Dumping request to ' + urlh.geturl()) + self.to_screen('Dumping request to ' + urlh.url) dump = base64.b64encode(webpage_bytes).decode('ascii') self._downloader.to_screen(dump) if self.get_param('write_pages'): - filename = self._request_dump_filename(urlh.geturl(), video_id) + filename = self._request_dump_filename(urlh.url, video_id) self.to_screen(f'Saving request to {filename}') with open(filename, 'wb') as outf: outf.write(webpage_bytes) @@ -1020,7 +1042,7 @@ class InfoExtractor: fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None): if self.get_param('load_pages'): url_or_request = self._create_request(url_or_request, data, headers, query) - filename = self._request_dump_filename(url_or_request.full_url, video_id) + filename = self._request_dump_filename(url_or_request.url, video_id) self.to_screen(f'Loading request from {filename}') try: with open(filename, 'rb') as dumpf: @@ -1094,7 +1116,7 @@ class InfoExtractor: while True: try: return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs) - except http.client.IncompleteRead as e: + except IncompleteRead as e: try_count += 1 if try_count >= tries: raise e @@ -1280,45 +1302,48 @@ class InfoExtractor: return clean_html(res) def _get_netrc_login_info(self, netrc_machine=None): - username = None - password = None netrc_machine = netrc_machine or self._NETRC_MACHINE - if self.get_param('usenetrc', False): - try: - netrc_file = compat_expanduser(self.get_param('netrc_location') or '~') - if os.path.isdir(netrc_file): - netrc_file = os.path.join(netrc_file, '.netrc') - info = netrc.netrc(file=netrc_file).authenticators(netrc_machine) - if info is not None: - username = info[0] - 
password = info[2] - else: - raise netrc.NetrcParseError( - 'No authenticators for %s' % netrc_machine) - except (OSError, netrc.NetrcParseError) as err: - self.report_warning( - 'parsing .netrc: %s' % error_to_compat_str(err)) + cmd = self.get_param('netrc_cmd') + if cmd: + cmd = cmd.replace('{}', netrc_machine) + self.to_screen(f'Executing command: {cmd}') + stdout, _, ret = Popen.run(cmd, text=True, shell=True, stdout=subprocess.PIPE) + if ret != 0: + raise OSError(f'Command returned error code {ret}') + info = netrc_from_content(stdout).authenticators(netrc_machine) - return username, password + elif self.get_param('usenetrc', False): + netrc_file = compat_expanduser(self.get_param('netrc_location') or '~') + if os.path.isdir(netrc_file): + netrc_file = os.path.join(netrc_file, '.netrc') + info = netrc.netrc(netrc_file).authenticators(netrc_machine) + + else: + return None, None + if not info: + raise netrc.NetrcParseError(f'No authenticators for {netrc_machine}') + return info[0], info[2] def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None): """ Get the login info as (username, password) First look for the manually specified credentials using username_option and password_option as keys in params dictionary. If no such credentials - available look in the netrc file using the netrc_machine or _NETRC_MACHINE - value. + are available try the netrc_cmd if it is defined or look in the + netrc file using the netrc_machine or _NETRC_MACHINE value. 
If there's no info available, return (None, None) """ - # Attempt to use provided username and password or .netrc data username = self.get_param(username_option) if username is not None: password = self.get_param(password_option) else: - username, password = self._get_netrc_login_info(netrc_machine) - + try: + username, password = self._get_netrc_login_info(netrc_machine) + except (OSError, netrc.NetrcParseError) as err: + self.report_warning(f'Failed to parse .netrc: {err}') + return None, None return username, password def _get_tfa_info(self, note='two-factor verification code'): @@ -1338,7 +1363,7 @@ class InfoExtractor: # Helper functions for extracting OpenGraph info @staticmethod def _og_regexes(prop): - content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))' + content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?)(?=\s|/?>))' property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)' % {'prop': re.escape(prop), 'sep': '(?::|[:-])'}) template = r']+?%s[^>]+?%s' @@ -1788,7 +1813,7 @@ class InfoExtractor: return [] manifest, urlh = res - manifest_url = urlh.geturl() + manifest_url = urlh.url return self._parse_f4m_formats( manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id, @@ -1947,7 +1972,7 @@ class InfoExtractor: return [], {} m3u8_doc, urlh = res - m3u8_url = urlh.geturl() + m3u8_url = urlh.url return self._parse_m3u8_formats_and_subtitles( m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol, @@ -1961,11 +1986,7 @@ class InfoExtractor: errnote=None, fatal=True, data=None, headers={}, query={}, video_id=None): formats, subtitles = [], {} - - has_drm = re.search('|'.join([ - r'#EXT-X-FAXS-CM:', # Adobe Flash Access - r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', # Apple FairPlay - ]), m3u8_doc) + has_drm = HlsFD._has_drm(m3u8_doc) def format_url(url): return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url) @@ 
-2063,6 +2084,7 @@ class InfoExtractor: 'protocol': entry_protocol, 'preference': preference, 'quality': quality, + 'has_drm': has_drm, 'vcodec': 'none' if media_type == 'AUDIO' else None, } for idx in _extract_m3u8_playlist_indices(manifest_url)) @@ -2122,6 +2144,7 @@ class InfoExtractor: 'protocol': entry_protocol, 'preference': preference, 'quality': quality, + 'has_drm': has_drm, } resolution = last_stream_inf.get('RESOLUTION') if resolution: @@ -2225,18 +2248,10 @@ class InfoExtractor: if res is False: assert not fatal return [], {} - smil, urlh = res - smil_url = urlh.geturl() - namespace = self._parse_smil_namespace(smil) - - fmts = self._parse_smil_formats( - smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params) - subs = self._parse_smil_subtitles( - smil, namespace=namespace) - - return fmts, subs + return self._parse_smil_formats_and_subtitles(smil, urlh.url, video_id, f4m_params=f4m_params, + namespace=self._parse_smil_namespace(smil)) def _extract_smil_formats(self, *args, **kwargs): fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs) @@ -2250,7 +2265,7 @@ class InfoExtractor: return {} smil, urlh = res - smil_url = urlh.geturl() + smil_url = urlh.url return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params) @@ -2262,9 +2277,8 @@ class InfoExtractor: def _parse_smil(self, smil, smil_url, video_id, f4m_params=None): namespace = self._parse_smil_namespace(smil) - formats = self._parse_smil_formats( + formats, subtitles = self._parse_smil_formats_and_subtitles( smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params) - subtitles = self._parse_smil_subtitles(smil, namespace=namespace) video_id = os.path.splitext(url_basename(smil_url))[0] title = None @@ -2303,7 +2317,14 @@ class InfoExtractor: return self._search_regex( r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None) - def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, 
transform_rtmp_url=None): + def _parse_smil_formats(self, *args, **kwargs): + fmts, subs = self._parse_smil_formats_and_subtitles(*args, **kwargs) + if subs: + self._report_ignoring_subs('SMIL') + return fmts + + def _parse_smil_formats_and_subtitles( + self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): base = smil_url for meta in smil.findall(self._xpath_ns('./head/meta', namespace)): b = meta.get('base') or meta.get('httpBase') @@ -2311,7 +2332,7 @@ class InfoExtractor: base = b break - formats = [] + formats, subtitles = [], {} rtmp_count = 0 http_count = 0 m3u8_count = 0 @@ -2359,8 +2380,9 @@ class InfoExtractor: src_url = src_url.strip() if proto == 'm3u8' or src_ext == 'm3u8': - m3u8_formats = self._extract_m3u8_formats( + m3u8_formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles( src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False) + self._merge_subtitles(m3u8_subs, target=subtitles) if len(m3u8_formats) == 1: m3u8_count += 1 m3u8_formats[0].update({ @@ -2381,11 +2403,15 @@ class InfoExtractor: f4m_url += urllib.parse.urlencode(f4m_params) formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False)) elif src_ext == 'mpd': - formats.extend(self._extract_mpd_formats( - src_url, video_id, mpd_id='dash', fatal=False)) + mpd_formats, mpd_subs = self._extract_mpd_formats_and_subtitles( + src_url, video_id, mpd_id='dash', fatal=False) + formats.extend(mpd_formats) + self._merge_subtitles(mpd_subs, target=subtitles) elif re.search(r'\.ism/[Mm]anifest', src_url): - formats.extend(self._extract_ism_formats( - src_url, video_id, ism_id='mss', fatal=False)) + ism_formats, ism_subs = self._extract_ism_formats_and_subtitles( + src_url, video_id, ism_id='mss', fatal=False) + formats.extend(ism_formats) + self._merge_subtitles(ism_subs, target=subtitles) elif src_url.startswith('http') and self._is_valid_url(src, video_id): http_count += 1 formats.append({ @@ -2416,7 +2442,10 @@ class 
InfoExtractor: 'format_note': 'SMIL storyboards', }) - return formats + smil_subs = self._parse_smil_subtitles(smil, namespace=namespace) + self._merge_subtitles(smil_subs, target=subtitles) + + return formats, subtitles def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'): urls = [] @@ -2442,7 +2471,7 @@ class InfoExtractor: return [] xspf, urlh = res - xspf_url = urlh.geturl() + xspf_url = urlh.url return self._parse_xspf( xspf, playlist_id, xspf_url=xspf_url, @@ -2513,7 +2542,7 @@ class InfoExtractor: return [], {} # We could have been redirected to a new url when we retrieved our mpd file. - mpd_url = urlh.geturl() + mpd_url = urlh.url mpd_base_url = base_url(mpd_url) return self._parse_mpd_formats_and_subtitles( @@ -2884,7 +2913,7 @@ class InfoExtractor: if ism_doc is None: return [], {} - return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id) + return self._parse_ism_formats_and_subtitles(ism_doc, urlh.url, ism_id) def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None): """ @@ -2980,6 +3009,8 @@ class InfoExtractor: 'protocol': 'ism', 'fragments': fragments, 'has_drm': ism_doc.find('Protection') is not None, + 'language': stream_language, + 'audio_channels': int_or_none(track.get('Channels')), '_download_params': { 'stream_type': stream_type, 'duration': duration, @@ -3435,7 +3466,7 @@ class InfoExtractor: def _get_cookies(self, url): """ Return a http.cookies.SimpleCookie with the cookies for the url """ - return LenientSimpleCookie(self._downloader._calc_cookies(url)) + return LenientSimpleCookie(self._downloader.cookiejar.get_cookie_header(url)) def _apply_first_set_cookie_header(self, url_handle, cookie): """ @@ -3510,8 +3541,8 @@ class InfoExtractor: @classmethod def is_single_video(cls, url): """Returns whether the URL is of a single video, None if unknown""" - assert cls.suitable(url), 'The URL must be suitable for the extractor' - return {'video': True, 'playlist': 
False}.get(cls._RETURN_TYPE) + if cls.suitable(url): + return {'video': True, 'playlist': False}.get(cls._RETURN_TYPE) @classmethod def is_suitable(cls, age_limit): @@ -3524,7 +3555,7 @@ class InfoExtractor: desc = '' if cls._NETRC_MACHINE: if markdown: - desc += f' [{cls._NETRC_MACHINE}]' + desc += f' [*{cls._NETRC_MACHINE}*](## "netrc machine")' else: desc += f' [{cls._NETRC_MACHINE}]' if cls.IE_DESC is False: @@ -3646,6 +3677,42 @@ class InfoExtractor: or urllib.parse.unquote(os.path.splitext(url_basename(url))[0]) or default) + def _extract_chapters_helper(self, chapter_list, start_function, title_function, duration, strict=True): + if not duration: + return + chapter_list = [{ + 'start_time': start_function(chapter), + 'title': title_function(chapter), + } for chapter in chapter_list or []] + if strict: + warn = self.report_warning + else: + warn = self.write_debug + chapter_list.sort(key=lambda c: c['start_time'] or 0) + + chapters = [{'start_time': 0}] + for idx, chapter in enumerate(chapter_list): + if chapter['start_time'] is None: + warn(f'Incomplete chapter {idx}') + elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration: + chapters.append(chapter) + elif chapter not in chapters: + issue = (f'{chapter["start_time"]} > {duration}' if chapter['start_time'] > duration + else f'{chapter["start_time"]} < {chapters[-1]["start_time"]}') + warn(f'Invalid start time ({issue}) for chapter "{chapter["title"]}"') + return chapters[1:] + + def _extract_chapters_from_description(self, description, duration): + duration_re = r'(?:\d+:)?\d{1,2}:\d{2}' + sep_re = r'(?m)^\s*(%s)\b\W*\s(%s)\s*$' + return self._extract_chapters_helper( + re.findall(sep_re % (duration_re, r'.+?'), description or ''), + start_function=lambda x: parse_duration(x[0]), title_function=lambda x: x[1], + duration=duration, strict=False) or self._extract_chapters_helper( + re.findall(sep_re % (r'.+?', duration_re), description or ''), + start_function=lambda x: parse_duration(x[1]), 
title_function=lambda x: x[0], + duration=duration, strict=False) + @staticmethod def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None): all_known = all(map( diff --git a/plugins/youtube_download/yt_dlp/extractor/crackle.py b/plugins/youtube_download/yt_dlp/extractor/crackle.py index 4610015..1ef90b5 100644 --- a/plugins/youtube_download/yt_dlp/extractor/crackle.py +++ b/plugins/youtube_download/yt_dlp/extractor/crackle.py @@ -4,7 +4,7 @@ import re import time from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..networking.exceptions import HTTPError from ..utils import ( determine_ext, float_or_none, @@ -113,7 +113,7 @@ class CrackleIE(InfoExtractor): errnote='Unable to download media JSON') except ExtractorError as e: # 401 means geo restriction, trying next country - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + if isinstance(e.cause, HTTPError) and e.cause.status == 401: continue raise diff --git a/plugins/youtube_download/yt_dlp/extractor/crtvg.py b/plugins/youtube_download/yt_dlp/extractor/crtvg.py new file mode 100644 index 0000000..1aa8d77 --- /dev/null +++ b/plugins/youtube_download/yt_dlp/extractor/crtvg.py @@ -0,0 +1,34 @@ +from .common import InfoExtractor +from ..utils import remove_end + + +class CrtvgIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?crtvg\.es/tvg/a-carta/[^/#?]+-(?P\d+)' + _TESTS = [{ + 'url': 'https://www.crtvg.es/tvg/a-carta/os-caimans-do-tea-5839623', + 'md5': 'c0958d9ff90e4503a75544358758921d', + 'info_dict': { + 'id': '5839623', + 'title': 'Os caimáns do Tea', + 'ext': 'mp4', + 'description': 'md5:f71cfba21ae564f0a6f415b31de1f842', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + }, + 'params': {'skip_download': 'm3u8'} + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + video_url = self._search_regex(r'var\s+url\s*=\s*["\']([^"\']+)', 
webpage, 'video url') + formats = self._extract_m3u8_formats(video_url + '/playlist.m3u8', video_id, fatal=False) + formats.extend(self._extract_mpd_formats(video_url + '/manifest.mpd', video_id, fatal=False)) + + return { + 'id': video_id, + 'formats': formats, + 'title': remove_end(self._html_search_meta( + ['og:title', 'twitter:title'], webpage, 'title', default=None), ' | CRTVG'), + 'description': self._html_search_meta('description', webpage, 'description', default=None), + 'thumbnail': self._html_search_meta(['og:image', 'twitter:image'], webpage, 'thumbnail', default=None), + } diff --git a/plugins/youtube_download/yt_dlp/extractor/crunchyroll.py b/plugins/youtube_download/yt_dlp/extractor/crunchyroll.py index 1abffcd..ee34ace 100644 --- a/plugins/youtube_download/yt_dlp/extractor/crunchyroll.py +++ b/plugins/youtube_download/yt_dlp/extractor/crunchyroll.py @@ -1,28 +1,50 @@ import base64 -import urllib.parse from .common import InfoExtractor +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, float_or_none, format_field, + int_or_none, join_nonempty, + parse_age_limit, + parse_count, parse_iso8601, qualities, + remove_start, + time_seconds, traverse_obj, - try_get, + url_or_none, + urlencode_postdata, ) class CrunchyrollBaseIE(InfoExtractor): - _LOGIN_URL = 'https://www.crunchyroll.com/welcome/login' + _BASE_URL = 'https://www.crunchyroll.com' _API_BASE = 'https://api.crunchyroll.com' _NETRC_MACHINE = 'crunchyroll' - params = None + _AUTH_HEADERS = None + _API_ENDPOINT = None + _BASIC_AUTH = None + _CLIENT_ID = ('cr_web', 'noaihdevm_6iyg0a8l0q') + _LOCALE_LOOKUP = { + 'ar': 'ar-SA', + 'de': 'de-DE', + '': 'en-US', + 'es': 'es-419', + 'es-es': 'es-ES', + 'fr': 'fr-FR', + 'it': 'it-IT', + 'pt-br': 'pt-BR', + 'pt-pt': 'pt-PT', + 'ru': 'ru-RU', + 'hi': 'hi-IN', + } @property def is_logged_in(self): - return self._get_cookies(self._LOGIN_URL).get('etp_rt') + return bool(self._get_cookies(self._BASE_URL).get('etp_rt')) def 
_perform_login(self, username, password): if self.is_logged_in: @@ -35,7 +57,7 @@ class CrunchyrollBaseIE(InfoExtractor): 'device_id': 'whatvalueshouldbeforweb', 'device_type': 'com.crunchyroll.static', 'access_token': 'giKq5eY27ny3cqz', - 'referer': self._LOGIN_URL + 'referer': f'{self._BASE_URL}/welcome/login' }) if upsell_response['code'] != 'ok': raise ExtractorError('Could not get session id') @@ -43,149 +65,89 @@ class CrunchyrollBaseIE(InfoExtractor): login_response = self._download_json( f'{self._API_BASE}/login.1.json', None, 'Logging in', - data=urllib.parse.urlencode({ + data=urlencode_postdata({ 'account': username, 'password': password, 'session_id': session_id - }).encode('ascii')) + })) if login_response['code'] != 'ok': raise ExtractorError('Login failed. Server message: %s' % login_response['message'], expected=True) if not self.is_logged_in: raise ExtractorError('Login succeeded but did not set etp_rt cookie') - def _get_embedded_json(self, webpage, display_id): - initial_state = self._parse_json(self._search_regex( - r'__INITIAL_STATE__\s*=\s*({.+?})\s*;', webpage, 'initial state'), display_id) - app_config = self._parse_json(self._search_regex( - r'__APP_CONFIG__\s*=\s*({.+?})\s*;', webpage, 'app config'), display_id) - return initial_state, app_config + def _update_auth(self): + if CrunchyrollBaseIE._AUTH_HEADERS and CrunchyrollBaseIE._AUTH_REFRESH > time_seconds(): + return - def _get_params(self, lang): - if not CrunchyrollBaseIE.params: - if self._get_cookies(f'https://www.crunchyroll.com/{lang}').get('etp_rt'): - grant_type, key = 'etp_rt_cookie', 'accountAuthClientId' - else: - grant_type, key = 'client_id', 'anonClientId' - - initial_state, app_config = self._get_embedded_json(self._download_webpage( - f'https://www.crunchyroll.com/{lang}', None, note='Retrieving main page'), None) - api_domain = app_config['cxApiParams']['apiDomain'].replace('beta.crunchyroll.com', 'www.crunchyroll.com') + if not CrunchyrollBaseIE._BASIC_AUTH: + 
cx_api_param = self._CLIENT_ID[self.is_logged_in] + self.write_debug(f'Using cxApiParam={cx_api_param}') + CrunchyrollBaseIE._BASIC_AUTH = 'Basic ' + base64.b64encode(f'{cx_api_param}:'.encode()).decode() + grant_type = 'etp_rt_cookie' if self.is_logged_in else 'client_id' + try: auth_response = self._download_json( - f'{api_domain}/auth/v1/token', None, note=f'Authenticating with grant_type={grant_type}', - headers={ - 'Authorization': 'Basic ' + str(base64.b64encode(('%s:' % app_config['cxApiParams'][key]).encode('ascii')), 'ascii') - }, data=f'grant_type={grant_type}'.encode('ascii')) - policy_response = self._download_json( - f'{api_domain}/index/v2', None, note='Retrieving signed policy', - headers={ - 'Authorization': auth_response['token_type'] + ' ' + auth_response['access_token'] - }) - cms = policy_response.get('cms_web') - bucket = cms['bucket'] - params = { - 'Policy': cms['policy'], - 'Signature': cms['signature'], - 'Key-Pair-Id': cms['key_pair_id'] - } - locale = traverse_obj(initial_state, ('localization', 'locale')) - if locale: - params['locale'] = locale - CrunchyrollBaseIE.params = (api_domain, bucket, params) - return CrunchyrollBaseIE.params + f'{self._BASE_URL}/auth/v1/token', None, note=f'Authenticating with grant_type={grant_type}', + headers={'Authorization': CrunchyrollBaseIE._BASIC_AUTH}, data=f'grant_type={grant_type}'.encode()) + except ExtractorError as error: + if isinstance(error.cause, HTTPError) and error.cause.status == 403: + raise ExtractorError( + 'Request blocked by Cloudflare; navigate to Crunchyroll in your browser, ' + 'then pass the fresh cookies (with --cookies-from-browser or --cookies) ' + 'and your browser\'s User-Agent (with --user-agent)', expected=True) + raise + CrunchyrollBaseIE._AUTH_HEADERS = {'Authorization': auth_response['token_type'] + ' ' + auth_response['access_token']} + CrunchyrollBaseIE._AUTH_REFRESH = time_seconds(seconds=traverse_obj(auth_response, ('expires_in', {float_or_none}), default=300) - 10) 
-class CrunchyrollBetaIE(CrunchyrollBaseIE): - IE_NAME = 'crunchyroll' - _VALID_URL = r'''(?x) - https?://(?:beta|www)\.crunchyroll\.com/ - (?P(?:\w{2}(?:-\w{2})?/)?) - watch/(?P\w+) - (?:/(?P[\w-]+))?/?(?:[?#]|$)''' - _TESTS = [{ - 'url': 'https://www.crunchyroll.com/watch/GY2P1Q98Y/to-the-future', - 'info_dict': { - 'id': 'GY2P1Q98Y', - 'ext': 'mp4', - 'duration': 1380.241, - 'timestamp': 1459632600, - 'description': 'md5:a022fbec4fbb023d43631032c91ed64b', - 'title': 'World Trigger Episode 73 – To the Future', - 'upload_date': '20160402', - 'series': 'World Trigger', - 'series_id': 'GR757DMKY', - 'season': 'World Trigger', - 'season_id': 'GR9P39NJ6', - 'season_number': 1, - 'episode': 'To the Future', - 'episode_number': 73, - 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg$', - 'chapters': 'count:2', - }, - 'params': {'skip_download': 'm3u8', 'format': 'all[format_id~=hardsub]'}, - }, { - 'url': 'https://www.crunchyroll.com/watch/GYE5WKQGR', - 'info_dict': { - 'id': 'GYE5WKQGR', - 'ext': 'mp4', - 'duration': 366.459, - 'timestamp': 1476788400, - 'description': 'md5:74b67283ffddd75f6e224ca7dc031e76', - 'title': 'SHELTER Episode – Porter Robinson presents Shelter the Animation', - 'upload_date': '20161018', - 'series': 'SHELTER', - 'series_id': 'GYGG09WWY', - 'season': 'SHELTER', - 'season_id': 'GR09MGK4R', - 'season_number': 1, - 'episode': 'Porter Robinson presents Shelter the Animation', - 'episode_number': 0, - 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg$', - 'chapters': 'count:0', - }, - 'params': {'skip_download': True}, - 'skip': 'Video is Premium only', - }, { - 'url': 'https://www.crunchyroll.com/watch/GY2P1Q98Y', - 'only_matching': True, - }, { - 'url': 'https://beta.crunchyroll.com/pt-br/watch/G8WUN8VKP/the-ruler-of-conspiracy', - 'only_matching': True, - }] + def _locale_from_language(self, language): + config_locale = self._configuration_arg('metadata', ie_key=CrunchyrollBetaIE, casesense=True) + return 
config_locale[0] if config_locale else self._LOCALE_LOOKUP.get(language) - def _real_extract(self, url): - lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'id', 'display_id') - api_domain, bucket, params = self._get_params(lang) + def _call_base_api(self, endpoint, internal_id, lang, note=None, query={}): + self._update_auth() - episode_response = self._download_json( - f'{api_domain}/cms/v2{bucket}/episodes/{internal_id}', display_id, - note='Retrieving episode metadata', query=params) - if episode_response.get('is_premium_only') and not bucket.endswith('crunchyroll'): - if self.is_logged_in: - raise ExtractorError('This video is for premium members only', expected=True) - else: - self.raise_login_required('This video is for premium members only') + if not endpoint.startswith('/'): + endpoint = f'/{endpoint}' - stream_response = self._download_json( - f'{api_domain}{episode_response["__links__"]["streams"]["href"]}', display_id, - note='Retrieving stream info', query=params) - get_streams = lambda name: (traverse_obj(stream_response, name) or {}).items() + query = query.copy() + locale = self._locale_from_language(lang) + if locale: + query['locale'] = locale - requested_hardsubs = [('' if val == 'none' else val) for val in (self._configuration_arg('hardsub') or ['none'])] - hardsub_preference = qualities(requested_hardsubs[::-1]) + return self._download_json( + f'{self._BASE_URL}{endpoint}', internal_id, note or f'Calling API: {endpoint}', + headers=CrunchyrollBaseIE._AUTH_HEADERS, query=query) + + def _call_api(self, path, internal_id, lang, note='api', query={}): + if not path.startswith(f'/content/v2/{self._API_ENDPOINT}/'): + path = f'/content/v2/{self._API_ENDPOINT}/{path}' + + try: + result = self._call_base_api( + path, internal_id, lang, f'Downloading {note} JSON ({self._API_ENDPOINT})', query=query) + except ExtractorError as error: + if isinstance(error.cause, HTTPError) and error.cause.status == 404: + return None + raise + + 
if not result: + raise ExtractorError(f'Unexpected response when downloading {note} JSON') + return result + + def _extract_formats(self, stream_response, display_id=None): requested_formats = self._configuration_arg('format') or ['adaptive_hls'] - available_formats = {} - for stream_type, streams in get_streams('streams'): + for stream_type, streams in traverse_obj( + stream_response, (('streams', ('data', 0)), {dict.items}, ...)): if stream_type not in requested_formats: continue - for stream in streams.values(): - if not stream.get('url'): - continue + for stream in traverse_obj(streams, lambda _, v: v['url']): hardsub_lang = stream.get('hardsub_locale') or '' format_id = join_nonempty(stream_type, format_field(stream, 'hardsub_locale', 'hardsub-%s')) available_formats[hardsub_lang] = (stream_type, format_id, hardsub_lang, stream['url']) + requested_hardsubs = [('' if val == 'none' else val) for val in (self._configuration_arg('hardsub') or ['none'])] if '' in available_formats and 'all' not in requested_hardsubs: full_format_langs = set(requested_hardsubs) self.to_screen( @@ -196,6 +158,8 @@ class CrunchyrollBetaIE(CrunchyrollBaseIE): else: full_format_langs = set(map(str.lower, available_formats)) + audio_locale = traverse_obj(stream_response, ((None, 'meta'), 'audio_locale'), get_all=False) + hardsub_preference = qualities(requested_hardsubs[::-1]) formats = [] for stream_type, format_id, hardsub_lang, stream_url in available_formats.values(): if stream_type.endswith('hls'): @@ -214,63 +178,292 @@ class CrunchyrollBetaIE(CrunchyrollBaseIE): continue for f in adaptive_formats: if f.get('acodec') != 'none': - f['language'] = stream_response.get('audio_locale') + f['language'] = audio_locale f['quality'] = hardsub_preference(hardsub_lang.lower()) formats.extend(adaptive_formats) - chapters = None + return formats + + def _extract_subtitles(self, data): + subtitles = {} + + for locale, subtitle in traverse_obj(data, ((None, 'meta'), 'subtitles', {dict.items}, 
...)): + subtitles[locale] = [traverse_obj(subtitle, {'url': 'url', 'ext': 'format'})] + + return subtitles + + +class CrunchyrollCmsBaseIE(CrunchyrollBaseIE): + _API_ENDPOINT = 'cms' + _CMS_EXPIRY = None + + def _call_cms_api_signed(self, path, internal_id, lang, note='api'): + if not CrunchyrollCmsBaseIE._CMS_EXPIRY or CrunchyrollCmsBaseIE._CMS_EXPIRY <= time_seconds(): + response = self._call_base_api('index/v2', None, lang, 'Retrieving signed policy')['cms_web'] + CrunchyrollCmsBaseIE._CMS_QUERY = { + 'Policy': response['policy'], + 'Signature': response['signature'], + 'Key-Pair-Id': response['key_pair_id'], + } + CrunchyrollCmsBaseIE._CMS_BUCKET = response['bucket'] + CrunchyrollCmsBaseIE._CMS_EXPIRY = parse_iso8601(response['expires']) - 10 + + if not path.startswith('/cms/v2'): + path = f'/cms/v2{CrunchyrollCmsBaseIE._CMS_BUCKET}/{path}' + + return self._call_base_api( + path, internal_id, lang, f'Downloading {note} JSON (signed cms)', query=CrunchyrollCmsBaseIE._CMS_QUERY) + + +class CrunchyrollBetaIE(CrunchyrollCmsBaseIE): + IE_NAME = 'crunchyroll' + _VALID_URL = r'''(?x) + https?://(?:beta\.|www\.)?crunchyroll\.com/ + (?:(?P\w{2}(?:-\w{2})?)/)? 
+ watch/(?!concert|musicvideo)(?P\w+)''' + _TESTS = [{ + # Premium only + 'url': 'https://www.crunchyroll.com/watch/GY2P1Q98Y/to-the-future', + 'info_dict': { + 'id': 'GY2P1Q98Y', + 'ext': 'mp4', + 'duration': 1380.241, + 'timestamp': 1459632600, + 'description': 'md5:a022fbec4fbb023d43631032c91ed64b', + 'title': 'World Trigger Episode 73 – To the Future', + 'upload_date': '20160402', + 'series': 'World Trigger', + 'series_id': 'GR757DMKY', + 'season': 'World Trigger', + 'season_id': 'GR9P39NJ6', + 'season_number': 1, + 'episode': 'To the Future', + 'episode_number': 73, + 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', + 'chapters': 'count:2', + 'age_limit': 14, + 'like_count': int, + 'dislike_count': int, + }, + 'params': {'skip_download': 'm3u8', 'format': 'all[format_id~=hardsub]'}, + }, { + # Premium only + 'url': 'https://www.crunchyroll.com/watch/GYE5WKQGR', + 'info_dict': { + 'id': 'GYE5WKQGR', + 'ext': 'mp4', + 'duration': 366.459, + 'timestamp': 1476788400, + 'description': 'md5:74b67283ffddd75f6e224ca7dc031e76', + 'title': 'SHELTER – Porter Robinson presents Shelter the Animation', + 'upload_date': '20161018', + 'series': 'SHELTER', + 'series_id': 'GYGG09WWY', + 'season': 'SHELTER', + 'season_id': 'GR09MGK4R', + 'season_number': 1, + 'episode': 'Porter Robinson presents Shelter the Animation', + 'episode_number': 0, + 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', + 'age_limit': 14, + 'like_count': int, + 'dislike_count': int, + }, + 'params': {'skip_download': True}, + }, { + 'url': 'https://www.crunchyroll.com/watch/GJWU2VKK3/cherry-blossom-meeting-and-a-coming-blizzard', + 'info_dict': { + 'id': 'GJWU2VKK3', + 'ext': 'mp4', + 'duration': 1420.054, + 'description': 'md5:2d1c67c0ec6ae514d9c30b0b99a625cd', + 'title': 'The Ice Guy and His Cool Female Colleague Episode 1 – Cherry Blossom Meeting and a Coming Blizzard', + 'series': 'The Ice Guy and His Cool Female Colleague', + 'series_id': 'GW4HM75NP', + 'season': 
'The Ice Guy and His Cool Female Colleague', + 'season_id': 'GY9PC21VE', + 'season_number': 1, + 'episode': 'Cherry Blossom Meeting and a Coming Blizzard', + 'episode_number': 1, + 'chapters': 'count:2', + 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', + 'timestamp': 1672839000, + 'upload_date': '20230104', + 'age_limit': 14, + 'like_count': int, + 'dislike_count': int, + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.crunchyroll.com/watch/GM8F313NQ', + 'info_dict': { + 'id': 'GM8F313NQ', + 'ext': 'mp4', + 'title': 'Garakowa -Restore the World-', + 'description': 'md5:8d2f8b6b9dd77d87810882e7d2ee5608', + 'duration': 3996.104, + 'age_limit': 13, + 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.crunchyroll.com/watch/G62PEZ2E6', + 'info_dict': { + 'id': 'G62PEZ2E6', + 'description': 'md5:8d2f8b6b9dd77d87810882e7d2ee5608', + 'age_limit': 13, + 'duration': 65.138, + 'title': 'Garakowa -Restore the World-', + }, + 'playlist_mincount': 5, + }, { + 'url': 'https://www.crunchyroll.com/de/watch/GY2P1Q98Y', + 'only_matching': True, + }, { + 'url': 'https://beta.crunchyroll.com/pt-br/watch/G8WUN8VKP/the-ruler-of-conspiracy', + 'only_matching': True, + }] + # We want to support lazy playlist filtering and movie listings cannot be inside a playlist + _RETURN_TYPE = 'video' + + def _real_extract(self, url): + lang, internal_id = self._match_valid_url(url).group('lang', 'id') + + # We need to use unsigned API call to allow ratings query string + response = traverse_obj(self._call_api( + f'objects/{internal_id}', internal_id, lang, 'object info', {'ratings': 'true'}), ('data', 0, {dict})) + if not response: + raise ExtractorError(f'No video with id {internal_id} could be found (possibly region locked?)', expected=True) + + object_type = response.get('type') + if object_type == 'episode': + result = self._transform_episode_response(response) + + 
elif object_type == 'movie': + result = self._transform_movie_response(response) + + elif object_type == 'movie_listing': + first_movie_id = traverse_obj(response, ('movie_listing_metadata', 'first_movie_id')) + if not self._yes_playlist(internal_id, first_movie_id): + return self.url_result(f'{self._BASE_URL}/{lang}watch/{first_movie_id}', CrunchyrollBetaIE, first_movie_id) + + def entries(): + movies = self._call_api(f'movie_listings/{internal_id}/movies', internal_id, lang, 'movie list') + for movie_response in traverse_obj(movies, ('data', ...)): + yield self.url_result( + f'{self._BASE_URL}/{lang}watch/{movie_response["id"]}', + CrunchyrollBetaIE, **self._transform_movie_response(movie_response)) + + return self.playlist_result(entries(), **self._transform_movie_response(response)) + + else: + raise ExtractorError(f'Unknown object type {object_type}') + + # There might be multiple audio languages for one object (`_metadata.versions`), + # so we need to get the id from `streams_link` instead or we dont know which language to choose + streams_link = response.get('streams_link') + if not streams_link and traverse_obj(response, (f'{object_type}_metadata', 'is_premium_only')): + message = f'This {object_type} is for premium members only' + if self.is_logged_in: + raise ExtractorError(message, expected=True) + self.raise_login_required(message) + + # We need go from unsigned to signed api to avoid getting soft banned + stream_response = self._call_cms_api_signed(remove_start( + streams_link, '/content/v2/cms/'), internal_id, lang, 'stream info') + result['formats'] = self._extract_formats(stream_response, internal_id) + result['subtitles'] = self._extract_subtitles(stream_response) + # if no intro chapter is available, a 403 without usable data is returned - intro_chapter = self._download_json(f'https://static.crunchyroll.com/datalab-intro-v2/{internal_id}.json', - display_id, fatal=False, errnote=False) + intro_chapter = self._download_json( + 
f'https://static.crunchyroll.com/datalab-intro-v2/{internal_id}.json', + internal_id, note='Downloading chapter info', fatal=False, errnote=False) if isinstance(intro_chapter, dict): - chapters = [{ + result['chapters'] = [{ 'title': 'Intro', 'start_time': float_or_none(intro_chapter.get('startTime')), - 'end_time': float_or_none(intro_chapter.get('endTime')) + 'end_time': float_or_none(intro_chapter.get('endTime')), }] + def calculate_count(item): + return parse_count(''.join((item['displayed'], item.get('unit') or ''))) + + result.update(traverse_obj(response, ('rating', { + 'like_count': ('up', {calculate_count}), + 'dislike_count': ('down', {calculate_count}), + }))) + + return result + + @staticmethod + def _transform_episode_response(data): + metadata = traverse_obj(data, (('episode_metadata', None), {dict}), get_all=False) or {} return { - 'id': internal_id, - 'title': '%s Episode %s – %s' % ( - episode_response.get('season_title'), episode_response.get('episode'), episode_response.get('title')), - 'description': try_get(episode_response, lambda x: x['description'].replace(r'\r\n', '\n')), - 'duration': float_or_none(episode_response.get('duration_ms'), 1000), - 'timestamp': parse_iso8601(episode_response.get('upload_date')), - 'series': episode_response.get('series_title'), - 'series_id': episode_response.get('series_id'), - 'season': episode_response.get('season_title'), - 'season_id': episode_response.get('season_id'), - 'season_number': episode_response.get('season_number'), - 'episode': episode_response.get('title'), - 'episode_number': episode_response.get('sequence_number'), - 'formats': formats, - 'thumbnails': [{ - 'url': thumb.get('source'), - 'width': thumb.get('width'), - 'height': thumb.get('height'), - } for thumb in traverse_obj(episode_response, ('images', 'thumbnail', ..., ...)) or []], - 'subtitles': { - lang: [{ - 'url': subtitle_data.get('url'), - 'ext': subtitle_data.get('format') - }] for lang, subtitle_data in get_streams('subtitles') 
- }, - 'chapters': chapters + 'id': data['id'], + 'title': ' \u2013 '.join(( + ('%s%s' % ( + format_field(metadata, 'season_title'), + format_field(metadata, 'episode', ' Episode %s'))), + format_field(data, 'title'))), + **traverse_obj(data, { + 'episode': ('title', {str}), + 'description': ('description', {str}, {lambda x: x.replace(r'\r\n', '\n')}), + 'thumbnails': ('images', 'thumbnail', ..., ..., { + 'url': ('source', {url_or_none}), + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + }), + }), + **traverse_obj(metadata, { + 'duration': ('duration_ms', {lambda x: float_or_none(x, 1000)}), + 'timestamp': ('upload_date', {parse_iso8601}), + 'series': ('series_title', {str}), + 'series_id': ('series_id', {str}), + 'season': ('season_title', {str}), + 'season_id': ('season_id', {str}), + 'season_number': ('season_number', ({int}, {float_or_none})), + 'episode_number': ('sequence_number', ({int}, {float_or_none})), + 'age_limit': ('maturity_ratings', -1, {parse_age_limit}), + 'language': ('audio_locale', {str}), + }, get_all=False), + } + + @staticmethod + def _transform_movie_response(data): + metadata = traverse_obj(data, (('movie_metadata', 'movie_listing_metadata', None), {dict}), get_all=False) or {} + return { + 'id': data['id'], + **traverse_obj(data, { + 'title': ('title', {str}), + 'description': ('description', {str}, {lambda x: x.replace(r'\r\n', '\n')}), + 'thumbnails': ('images', 'thumbnail', ..., ..., { + 'url': ('source', {url_or_none}), + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + }), + }), + **traverse_obj(metadata, { + 'duration': ('duration_ms', {lambda x: float_or_none(x, 1000)}), + 'age_limit': ('maturity_ratings', -1, {parse_age_limit}), + }), } -class CrunchyrollBetaShowIE(CrunchyrollBaseIE): +class CrunchyrollBetaShowIE(CrunchyrollCmsBaseIE): IE_NAME = 'crunchyroll:playlist' _VALID_URL = r'''(?x) - https?://(?:beta|www)\.crunchyroll\.com/ + https?://(?:beta\.|www\.)?crunchyroll\.com/ 
(?P(?:\w{2}(?:-\w{2})?/)?) - series/(?P\w+) - (?:/(?P[\w-]+))?/?(?:[?#]|$)''' + series/(?P\w+)''' _TESTS = [{ 'url': 'https://www.crunchyroll.com/series/GY19NQ2QR/Girl-Friend-BETA', 'info_dict': { 'id': 'GY19NQ2QR', 'title': 'Girl Friend BETA', + 'description': 'md5:99c1b22ee30a74b536a8277ced8eb750', + # XXX: `thumbnail` does not get set from `thumbnails` in playlist + # 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', + 'age_limit': 14, }, 'playlist_mincount': 10, }, { @@ -279,41 +472,179 @@ class CrunchyrollBetaShowIE(CrunchyrollBaseIE): }] def _real_extract(self, url): - lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'id', 'display_id') - api_domain, bucket, params = self._get_params(lang) - - series_response = self._download_json( - f'{api_domain}/cms/v2{bucket}/series/{internal_id}', display_id, - note='Retrieving series metadata', query=params) - - seasons_response = self._download_json( - f'{api_domain}/cms/v2{bucket}/seasons?series_id={internal_id}', display_id, - note='Retrieving season list', query=params) + lang, internal_id = self._match_valid_url(url).group('lang', 'id') def entries(): - for season in seasons_response['items']: - episodes_response = self._download_json( - f'{api_domain}/cms/v2{bucket}/episodes?season_id={season["id"]}', display_id, - note=f'Retrieving episode list for {season.get("slug_title")}', query=params) - for episode in episodes_response['items']: - episode_id = episode['id'] - episode_display_id = episode['slug_title'] - yield { - '_type': 'url', - 'url': f'https://www.crunchyroll.com/{lang}watch/{episode_id}/{episode_display_id}', - 'ie_key': CrunchyrollBetaIE.ie_key(), - 'id': episode_id, - 'title': '%s Episode %s – %s' % (episode.get('season_title'), episode.get('episode'), episode.get('title')), - 'description': try_get(episode, lambda x: x['description'].replace(r'\r\n', '\n')), - 'duration': float_or_none(episode.get('duration_ms'), 1000), - 'series': 
episode.get('series_title'), - 'series_id': episode.get('series_id'), - 'season': episode.get('season_title'), - 'season_id': episode.get('season_id'), - 'season_number': episode.get('season_number'), - 'episode': episode.get('title'), - 'episode_number': episode.get('sequence_number'), - 'language': episode.get('audio_locale'), - } + seasons_response = self._call_cms_api_signed(f'seasons?series_id={internal_id}', internal_id, lang, 'seasons') + for season in traverse_obj(seasons_response, ('items', ..., {dict})): + episodes_response = self._call_cms_api_signed( + f'episodes?season_id={season["id"]}', season["id"], lang, 'episode list') + for episode_response in traverse_obj(episodes_response, ('items', ..., {dict})): + yield self.url_result( + f'{self._BASE_URL}/{lang}watch/{episode_response["id"]}', + CrunchyrollBetaIE, **CrunchyrollBetaIE._transform_episode_response(episode_response)) - return self.playlist_result(entries(), internal_id, series_response.get('title')) + return self.playlist_result( + entries(), internal_id, + **traverse_obj(self._call_api(f'series/{internal_id}', internal_id, lang, 'series'), ('data', 0, { + 'title': ('title', {str}), + 'description': ('description', {lambda x: x.replace(r'\r\n', '\n')}), + 'age_limit': ('maturity_ratings', -1, {parse_age_limit}), + 'thumbnails': ('images', ..., ..., ..., { + 'url': ('source', {url_or_none}), + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + }) + }))) + + +class CrunchyrollMusicIE(CrunchyrollBaseIE): + IE_NAME = 'crunchyroll:music' + _VALID_URL = r'''(?x) + https?://(?:www\.)?crunchyroll\.com/ + (?P(?:\w{2}(?:-\w{2})?/)?) 
+ watch/(?Pconcert|musicvideo)/(?P\w+)''' + _TESTS = [{ + 'url': 'https://www.crunchyroll.com/de/watch/musicvideo/MV5B02C79', + 'info_dict': { + 'ext': 'mp4', + 'id': 'MV5B02C79', + 'display_id': 'egaono-hana', + 'title': 'Egaono Hana', + 'track': 'Egaono Hana', + 'artist': 'Goose house', + 'thumbnail': r're:(?i)^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', + 'genre': ['J-Pop'], + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.crunchyroll.com/watch/musicvideo/MV88BB7F2C', + 'info_dict': { + 'ext': 'mp4', + 'id': 'MV88BB7F2C', + 'display_id': 'crossing-field', + 'title': 'Crossing Field', + 'track': 'Crossing Field', + 'artist': 'LiSA', + 'thumbnail': r're:(?i)^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', + 'genre': ['Anime'], + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.crunchyroll.com/watch/concert/MC2E2AC135', + 'info_dict': { + 'ext': 'mp4', + 'id': 'MC2E2AC135', + 'display_id': 'live-is-smile-always-364joker-at-yokohama-arena', + 'title': 'LiVE is Smile Always-364+JOKER- at YOKOHAMA ARENA', + 'track': 'LiVE is Smile Always-364+JOKER- at YOKOHAMA ARENA', + 'artist': 'LiSA', + 'thumbnail': r're:(?i)^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', + 'description': 'md5:747444e7e6300907b7a43f0a0503072e', + 'genre': ['J-Pop'], + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.crunchyroll.com/de/watch/musicvideo/MV5B02C79/egaono-hana', + 'only_matching': True, + }, { + 'url': 'https://www.crunchyroll.com/watch/concert/MC2E2AC135/live-is-smile-always-364joker-at-yokohama-arena', + 'only_matching': True, + }, { + 'url': 'https://www.crunchyroll.com/watch/musicvideo/MV88BB7F2C/crossing-field', + 'only_matching': True, + }] + _API_ENDPOINT = 'music' + + def _real_extract(self, url): + lang, internal_id, object_type = self._match_valid_url(url).group('lang', 'id', 'type') + path, name = { + 'concert': ('concerts', 'concert info'), + 'musicvideo': ('music_videos', 'music video info'), + 
}[object_type] + response = traverse_obj(self._call_api(f'{path}/{internal_id}', internal_id, lang, name), ('data', 0, {dict})) + if not response: + raise ExtractorError(f'No video with id {internal_id} could be found (possibly region locked?)', expected=True) + + streams_link = response.get('streams_link') + if not streams_link and response.get('isPremiumOnly'): + message = f'This {response.get("type") or "media"} is for premium members only' + if self.is_logged_in: + raise ExtractorError(message, expected=True) + self.raise_login_required(message) + + result = self._transform_music_response(response) + stream_response = self._call_api(streams_link, internal_id, lang, 'stream info') + result['formats'] = self._extract_formats(stream_response, internal_id) + + return result + + @staticmethod + def _transform_music_response(data): + return { + 'id': data['id'], + **traverse_obj(data, { + 'display_id': 'slug', + 'title': 'title', + 'track': 'title', + 'artist': ('artist', 'name'), + 'description': ('description', {str}, {lambda x: x.replace(r'\r\n', '\n') or None}), + 'thumbnails': ('images', ..., ..., { + 'url': ('source', {url_or_none}), + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + }), + 'genre': ('genres', ..., 'displayValue'), + 'age_limit': ('maturity_ratings', -1, {parse_age_limit}), + }), + } + + +class CrunchyrollArtistIE(CrunchyrollBaseIE): + IE_NAME = 'crunchyroll:artist' + _VALID_URL = r'''(?x) + https?://(?:www\.)?crunchyroll\.com/ + (?P(?:\w{2}(?:-\w{2})?/)?) 
+ artist/(?P\w{10})''' + _TESTS = [{ + 'url': 'https://www.crunchyroll.com/artist/MA179CB50D', + 'info_dict': { + 'id': 'MA179CB50D', + 'title': 'LiSA', + 'genre': ['J-Pop', 'Anime', 'Rock'], + 'description': 'md5:16d87de61a55c3f7d6c454b73285938e', + }, + 'playlist_mincount': 83, + }, { + 'url': 'https://www.crunchyroll.com/artist/MA179CB50D/lisa', + 'only_matching': True, + }] + _API_ENDPOINT = 'music' + + def _real_extract(self, url): + lang, internal_id = self._match_valid_url(url).group('lang', 'id') + response = traverse_obj(self._call_api( + f'artists/{internal_id}', internal_id, lang, 'artist info'), ('data', 0)) + + def entries(): + for attribute, path in [('concerts', 'concert'), ('videos', 'musicvideo')]: + for internal_id in traverse_obj(response, (attribute, ...)): + yield self.url_result(f'{self._BASE_URL}/watch/{path}/{internal_id}', CrunchyrollMusicIE, internal_id) + + return self.playlist_result(entries(), **self._transform_artist_response(response)) + + @staticmethod + def _transform_artist_response(data): + return { + 'id': data['id'], + **traverse_obj(data, { + 'title': 'name', + 'description': ('description', {str}, {lambda x: x.replace(r'\r\n', '\n')}), + 'thumbnails': ('images', ..., ..., { + 'url': ('source', {url_or_none}), + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + }), + 'genre': ('genres', ..., 'displayValue'), + }), + } diff --git a/plugins/youtube_download/yt_dlp/extractor/cultureunplugged.py b/plugins/youtube_download/yt_dlp/extractor/cultureunplugged.py index 2fb2280..9c8509f 100644 --- a/plugins/youtube_download/yt_dlp/extractor/cultureunplugged.py +++ b/plugins/youtube_download/yt_dlp/extractor/cultureunplugged.py @@ -1,10 +1,8 @@ import time from .common import InfoExtractor -from ..utils import ( - int_or_none, - HEADRequest, -) +from ..networking import HEADRequest +from ..utils import int_or_none class CultureUnpluggedIE(InfoExtractor): diff --git 
# plugins/youtube_download/yt_dlp/extractor/dacast.py
"""Extractors for Dacast VOD and playlist iframes (iframe.dacast.com)."""
import hashlib
import re
import time

from .common import InfoExtractor
from ..networking.exceptions import HTTPError
from ..utils import (
    ExtractorError,
    classproperty,
    float_or_none,
    traverse_obj,
    url_or_none,
)


class DacastBaseIE(InfoExtractor):
    """Shared URL patterns and embed detection for the Dacast extractors.

    Subclasses set ``_URL_TYPE`` (e.g. ``'vod'`` or ``'playlist'``); it is
    interpolated into every URL and regex built here.
    """
    _URL_TYPE = None

    @classproperty
    def _VALID_URL(cls):
        # The group names are the ones the subclasses read back with
        # .group('user_id', 'id').
        return fr'https?://iframe\.dacast\.com/{cls._URL_TYPE}/(?P<user_id>[\w-]+)/(?P<id>[\w-]+)'

    @classproperty
    def _EMBED_REGEX(cls):
        # NOTE(review): the leading "<iframe[^>" and the "url" group name were
        # restored from a markup-stripped source - confirm against upstream.
        return [rf'<iframe[^>]+\bsrc=["\'](?P<url>{cls._VALID_URL})']

    _API_INFO_URL = 'https://playback.dacast.com/content/info'

    @classmethod
    def _get_url_from_id(cls, content_id):
        """Build the iframe URL for a ``<user_id>-<type>-<media_id>`` content id.

        Raises ValueError (from the tuple unpacking) if the content id does
        not contain exactly one ``-<type>-`` separator.
        """
        user_id, media_id = content_id.split(f'-{cls._URL_TYPE}-')
        return f'https://iframe.dacast.com/{cls._URL_TYPE}/{user_id}/{media_id}'

    @classmethod
    def _extract_embed_urls(cls, url, webpage):
        """Yield embed URLs: the base-class matches, then player.js content ids."""
        yield from super()._extract_embed_urls(url, webpage)
        # NOTE(review): the leading "<script[^>" was restored from a
        # markup-stripped source - confirm against upstream.
        for content_id in re.findall(
                rf'<script[^>]+\bsrc=["\']https://player\.dacast\.com/js/player\.js\?contentId=([\w-]+-{cls._URL_TYPE}-[\w-]+)["\']', webpage):
            yield cls._get_url_from_id(content_id)


class DacastVODIE(DacastBaseIE):
    """Extractor for a single Dacast VOD iframe."""
    _URL_TYPE = 'vod'
    _TESTS = [{
        'url': 'https://iframe.dacast.com/vod/acae82153ef4d7a7344ae4eaa86af534/1c6143e3-5a06-371d-8695-19b96ea49090',
        'info_dict': {
            'id': '1c6143e3-5a06-371d-8695-19b96ea49090',
            'ext': 'mp4',
            'uploader_id': 'acae82153ef4d7a7344ae4eaa86af534',
            'title': '2_4||Adnexal mass characterisation: O-RADS US and MRI||N. Bharwani, London/UK',
            'thumbnail': 'https://universe-files.dacast.com/26137208-5858-65c1-5e9a-9d6b6bd2b6c2',
        },
        'params': {'skip_download': 'm3u8'},
    }]
    _WEBPAGE_TESTS = [{
        'url': 'https://www.dacast.com/support/knowledgebase/how-can-i-embed-a-video-on-my-website/',
        'info_dict': {
            'id': 'b6674869-f08a-23c5-1d7b-81f5309e1a90',
            'ext': 'mp4',
            'title': '4-HowToEmbedVideo.mp4',
            'uploader_id': '3b67c4a9-3886-4eb1-d0eb-39b23b14bef3',
            'thumbnail': 'https://universe-files.dacast.com/d26ab48f-a52a-8783-c42e-a90290ba06b6.png',
        },
        'params': {'skip_download': 'm3u8'},
    }, {
        'url': 'https://gist.githubusercontent.com/bashonly/4ad249ef2910346fbdf3809b220f11ee/raw/87349778d4af1a80b1fcc3beb9c88108de5858f5/dacast_embeds.html',
        'info_dict': {
            'id': 'e7df418e-a83b-7a7f-7b5e-1a667981e8fa',
            'ext': 'mp4',
            'title': 'Evening Service 2-5-23',
            'uploader_id': '943bb1ab3c03695ba85330d92d6d226e',
            'thumbnail': 'https://universe-files.dacast.com/337472b3-e92c-2ea4-7eb7-5700da477f67',
        },
        'params': {'skip_download': 'm3u8'},
    }]

    def _real_extract(self, url):
        """Return the info dict for one VOD.

        Downloads the (non-fatal) content info and the access JSON, raises
        ExtractorError when the access JSON carries an ``error``, then
        extracts the HLS formats, retrying when the playlist request fails
        with HTTP 403.
        """
        user_id, video_id = self._match_valid_url(url).group('user_id', 'id')
        query = {'contentId': f'{user_id}-vod-{video_id}', 'provider': 'universe'}
        # Metadata only: a failure here must not prevent extraction.
        info = self._download_json(self._API_INFO_URL, video_id, query=query, fatal=False)
        # A 403 response is accepted so that its JSON error can be reported.
        access = self._download_json(
            'https://playback.dacast.com/content/access', video_id,
            note='Downloading access JSON', query=query, expected_status=403)

        error = access.get('error')
        if error in ('Broadcaster has been blocked', 'Content is offline'):
            raise ExtractorError(error, expected=True)
        elif error:
            raise ExtractorError(f'Dacast API says "{error}"')

        hls_url = access['hls']
        hls_aes = {}

        if 'DRM_EXT' in hls_url:
            self.report_drm(video_id)
        elif '/uspaes/' in hls_url:
            # From https://player.dacast.com/js/player.js
            ts = int(time.time())
            # hashlib only hashes bytes-like objects; passing the str
            # directly raises TypeError, so encode it first.
            signature = hashlib.sha1(
                f'{10413792000 - ts}{ts}YfaKtquEEpDeusCKbvYszIEZnWmBcSvw'.encode()).hexdigest()
            hls_aes['uri'] = f'https://keys.dacast.com/uspaes/{video_id}.key?s={signature}&ts={ts}'

        for retry in self.RetryManager():
            try:
                formats = self._extract_m3u8_formats(hls_url, video_id, 'mp4', m3u8_id='hls')
            except ExtractorError as e:
                # CDN will randomly respond with 403
                if isinstance(e.cause, HTTPError) and e.cause.status == 403:
                    retry.error = e
                    continue
                raise

        return {
            'id': video_id,
            'uploader_id': user_id,
            'formats': formats,
            'hls_aes': hls_aes or None,
            **traverse_obj(info, ('contentInfo', {
                'title': 'title',
                'duration': ('duration', {float_or_none}),
                'thumbnail': ('thumbnailUrl', {url_or_none}),
            })),
        }


class DacastPlaylistIE(DacastBaseIE):
    """Extractor for a Dacast playlist iframe; entries are DacastVODIE URLs."""
    _URL_TYPE = 'playlist'
    _TESTS = [{
        'url': 'https://iframe.dacast.com/playlist/943bb1ab3c03695ba85330d92d6d226e/b632eb053cac17a9c9a02bcfc827f2d8',
        'playlist_mincount': 28,
        'info_dict': {
            'id': 'b632eb053cac17a9c9a02bcfc827f2d8',
            'title': 'Archive Sermons',
        },
    }]
    _WEBPAGE_TESTS = [{
        'url': 'https://gist.githubusercontent.com/bashonly/7efb606f49f3c6e07ea0327de5a661d1/raw/05a16eac830245ea301fb0a585023bec71e6093c/dacast_playlist_embed.html',
        'playlist_mincount': 28,
        'info_dict': {
            'id': 'b632eb053cac17a9c9a02bcfc827f2d8',
            'title': 'Archive Sermons',
        },
    }]

    def _real_extract(self, url):
        """Return a playlist result built from the playlist's content info."""
        user_id, playlist_id = self._match_valid_url(url).group('user_id', 'id')
        info = self._download_json(
            self._API_INFO_URL, playlist_id, note='Downloading playlist JSON', query={
                'contentId': f'{user_id}-playlist-{playlist_id}',
                'provider': 'universe',
            })['contentInfo']

        def entries(info):
            # The filter keeps only playlist items that carry a truthy 'id'.
            for video in traverse_obj(info, ('features', 'playlist', 'contents', lambda _, v: v['id'])):
                yield self.url_result(
                    DacastVODIE._get_url_from_id(video['id']), DacastVODIE, video['id'], video.get('title'))

        return self.playlist_result(entries(info), playlist_id, info.get('title'))
a/plugins/youtube_download/yt_dlp/extractor/daftsex.py b/plugins/youtube_download/yt_dlp/extractor/daftsex.py index 551d5e3..92510c7 100644 --- a/plugins/youtube_download/yt_dlp/extractor/daftsex.py +++ b/plugins/youtube_download/yt_dlp/extractor/daftsex.py @@ -1,6 +1,7 @@ from .common import InfoExtractor from ..compat import compat_b64decode from ..utils import ( + ExtractorError, int_or_none, js_to_json, parse_count, @@ -12,21 +13,24 @@ from ..utils import ( class DaftsexIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?daftsex\.com/watch/(?P-?\d+_\d+)' + _VALID_URL = r'https?://(?:www\.)?daft\.sex/watch/(?P-?\d+_\d+)' _TESTS = [{ - 'url': 'https://daftsex.com/watch/-35370899_456246186', - 'md5': 'd95135e6cea2d905bea20dbe82cda64a', + 'url': 'https://daft.sex/watch/-35370899_456246186', + 'md5': '64c04ef7b4c7b04b308f3b0c78efe7cd', 'info_dict': { 'id': '-35370899_456246186', 'ext': 'mp4', 'title': 'just relaxing', - 'description': 'just relaxing - Watch video Watch video in high quality', + 'description': 'just relaxing – Watch video Watch video in high quality', 'upload_date': '20201113', 'timestamp': 1605261911, - 'thumbnail': r're:https://[^/]+/impf/-43BuMDIawmBGr3GLcZ93CYwWf2PBv_tVWoS1A/dnu41DnARU4\.jpg\?size=800x450&quality=96&keep_aspect_ratio=1&background=000000&sign=6af2c26ff4a45e55334189301c867384&type=video_thumb', + 'thumbnail': r're:^https?://.*\.jpg$', + 'age_limit': 18, + 'duration': 15.0, + 'view_count': int }, }, { - 'url': 'https://daftsex.com/watch/-156601359_456242791', + 'url': 'https://daft.sex/watch/-156601359_456242791', 'info_dict': { 'id': '-156601359_456242791', 'ext': 'mp4', @@ -36,6 +40,7 @@ class DaftsexIE(InfoExtractor): 'timestamp': 1600250735, 'thumbnail': 'https://psv153-1.crazycloud.ru/videos/-156601359/456242791/thumb.jpg?extra=i3D32KaBbBFf9TqDRMAVmQ', }, + 'skip': 'deleted / private' }] def _real_extract(self, url): @@ -60,7 +65,7 @@ class DaftsexIE(InfoExtractor): webpage, 'player color', fatal=False) or '' embed_page = 
self._download_webpage( - 'https://daxab.com/player/%s?color=%s' % (player_hash, player_color), + 'https://dxb.to/player/%s?color=%s' % (player_hash, player_color), video_id, headers={'Referer': url}) video_params = self._parse_json( self._search_regex( @@ -94,15 +99,19 @@ class DaftsexIE(InfoExtractor): 'age_limit': 18, } - item = self._download_json( + items = self._download_json( f'{server_domain}/method/video.get/{video_id}', video_id, headers={'Referer': url}, query={ 'token': video_params['video']['access_token'], 'videos': video_id, 'ckey': video_params['c_key'], 'credentials': video_params['video']['credentials'], - })['response']['items'][0] + })['response']['items'] + if not items: + raise ExtractorError('Video is not available', video_id=video_id, expected=True) + + item = items[0] formats = [] for f_id, f_url in item.get('files', {}).items(): if f_id == 'external': diff --git a/plugins/youtube_download/yt_dlp/extractor/dailymotion.py b/plugins/youtube_download/yt_dlp/extractor/dailymotion.py index 2a44718..21263d4 100644 --- a/plugins/youtube_download/yt_dlp/extractor/dailymotion.py +++ b/plugins/youtube_download/yt_dlp/extractor/dailymotion.py @@ -3,7 +3,7 @@ import json import re from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, OnDemandPagedList, @@ -68,9 +68,9 @@ class DailymotionBaseInfoExtractor(InfoExtractor): None, 'Downloading Access Token', data=urlencode_postdata(data))['access_token'] except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + if isinstance(e.cause, HTTPError) and e.cause.status == 400: raise ExtractorError(self._parse_json( - e.cause.read().decode(), xid)['error_description'], expected=True) + e.cause.response.read().decode(), xid)['error_description'], expected=True) raise self._set_dailymotion_cookie('access_token' if username else 'client_token', token) 
self._HEADERS['Authorization'] = 'Bearer ' + token diff --git a/plugins/youtube_download/yt_dlp/extractor/digitalconcerthall.py b/plugins/youtube_download/yt_dlp/extractor/digitalconcerthall.py index 3461e36..c11cd79 100644 --- a/plugins/youtube_download/yt_dlp/extractor/digitalconcerthall.py +++ b/plugins/youtube_download/yt_dlp/extractor/digitalconcerthall.py @@ -11,7 +11,7 @@ from ..utils import ( class DigitalConcertHallIE(InfoExtractor): IE_DESC = 'DigitalConcertHall extractor' - _VALID_URL = r'https?://(?:www\.)?digitalconcerthall\.com/(?P[a-z]+)/concert/(?P[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?digitalconcerthall\.com/(?P[a-z]+)/(?Pfilm|concert)/(?P[0-9]+)' _OAUTH_URL = 'https://api.digitalconcerthall.com/v2/oauth2/token' _ACCESS_TOKEN = None _NETRC_MACHINE = 'digitalconcerthall' @@ -40,6 +40,19 @@ class DigitalConcertHallIE(InfoExtractor): }, 'params': {'skip_download': 'm3u8'}, 'playlist_count': 3, + }, { + 'url': 'https://www.digitalconcerthall.com/en/film/388', + 'info_dict': { + 'id': '388', + 'ext': 'mp4', + 'title': 'The Berliner Philharmoniker and Frank Peter Zimmermann', + 'description': 'md5:cfe25a7044fa4be13743e5089b5b5eb2', + 'thumbnail': r're:^https?://images.digitalconcerthall.com/cms/thumbnails.*\.jpg$', + 'upload_date': '20220714', + 'timestamp': 1657785600, + 'album_artist': 'Frank Peter Zimmermann / Benedikt von Bernstorff / Jakob von Bernstorff', + }, + 'params': {'skip_download': 'm3u8'}, }] def _perform_login(self, username, password): @@ -75,7 +88,7 @@ class DigitalConcertHallIE(InfoExtractor): if not self._ACCESS_TOKEN: self.raise_login_required(method='password') - def _entries(self, items, language, **kwargs): + def _entries(self, items, language, type_, **kwargs): for item in items: video_id = item['id'] stream_info = self._download_json( @@ -103,11 +116,11 @@ class DigitalConcertHallIE(InfoExtractor): 'start_time': chapter.get('time'), 'end_time': try_get(chapter, lambda x: x['time'] + x['duration']), 'title': 
chapter.get('text'), - } for chapter in item['cuepoints']] if item.get('cuepoints') else None, + } for chapter in item['cuepoints']] if item.get('cuepoints') and type_ == 'concert' else None, } def _real_extract(self, url): - language, video_id = self._match_valid_url(url).group('language', 'id') + language, type_, video_id = self._match_valid_url(url).group('language', 'type', 'id') if not language: language = 'en' @@ -120,18 +133,18 @@ class DigitalConcertHallIE(InfoExtractor): }] vid_info = self._download_json( - f'https://api.digitalconcerthall.com/v2/concert/{video_id}', video_id, headers={ + f'https://api.digitalconcerthall.com/v2/{type_}/{video_id}', video_id, headers={ 'Accept': 'application/json', 'Accept-Language': language }) album_artist = ' / '.join(traverse_obj(vid_info, ('_links', 'artist', ..., 'name')) or '') + videos = [vid_info] if type_ == 'film' else traverse_obj(vid_info, ('_embedded', ..., ...)) return { '_type': 'playlist', 'id': video_id, 'title': vid_info.get('title'), - 'entries': self._entries(traverse_obj(vid_info, ('_embedded', ..., ...)), language, - thumbnails=thumbnails, album_artist=album_artist), + 'entries': self._entries(videos, language, thumbnails=thumbnails, album_artist=album_artist, type_=type_), 'thumbnails': thumbnails, 'album_artist': album_artist, } diff --git a/plugins/youtube_download/yt_dlp/extractor/discogs.py b/plugins/youtube_download/yt_dlp/extractor/discogs.py new file mode 100644 index 0000000..048c622 --- /dev/null +++ b/plugins/youtube_download/yt_dlp/extractor/discogs.py @@ -0,0 +1,35 @@ +from .common import InfoExtractor +from .youtube import YoutubeIE +from ..utils import traverse_obj + + +class DiscogsReleasePlaylistIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?discogs\.com/(?Prelease|master)/(?P\d+)' + _TESTS = [{ + 'url': 'https://www.discogs.com/release/1-The-Persuader-Stockholm', + 'info_dict': { + 'id': 'release1', + 'title': 'Stockholm', + }, + 'playlist_mincount': 7, + }, { + 'url': 
'https://www.discogs.com/master/113-Vince-Watson-Moments-In-Time', + 'info_dict': { + 'id': 'master113', + 'title': 'Moments In Time', + }, + 'playlist_mincount': 53, + }] + + def _real_extract(self, url): + playlist_id, playlist_type = self._match_valid_url(url).group('id', 'type') + + display_id = f'{playlist_type}{playlist_id}' + response = self._download_json( + f'https://api.discogs.com/{playlist_type}s/{playlist_id}', display_id) + + entries = [ + self.url_result(video['uri'], YoutubeIE, video_title=video.get('title')) + for video in traverse_obj(response, ('videos', lambda _, v: YoutubeIE.suitable(v['uri'])))] + + return self.playlist_result(entries, display_id, response.get('title')) diff --git a/plugins/youtube_download/yt_dlp/extractor/discovery.py b/plugins/youtube_download/yt_dlp/extractor/discovery.py index e6e109d..75b4643 100644 --- a/plugins/youtube_download/yt_dlp/extractor/discovery.py +++ b/plugins/youtube_download/yt_dlp/extractor/discovery.py @@ -3,8 +3,8 @@ import string from .discoverygo import DiscoveryGoBaseIE from ..compat import compat_urllib_parse_unquote +from ..networking.exceptions import HTTPError from ..utils import ExtractorError -from ..compat import compat_HTTPError class DiscoveryIE(DiscoveryGoBaseIE): @@ -100,9 +100,9 @@ class DiscoveryIE(DiscoveryGoBaseIE): self._API_BASE_URL + 'streaming/video/' + video_id, display_id, 'Downloading streaming JSON metadata', headers=headers) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 403): + if isinstance(e.cause, HTTPError) and e.cause.status in (401, 403): e_description = self._parse_json( - e.cause.read().decode(), display_id)['description'] + e.cause.response.read().decode(), display_id)['description'] if 'resource not available for country' in e_description: self.raise_geo_restricted(countries=self._GEO_COUNTRIES) if 'Authorized Networks' in e_description: diff --git a/plugins/youtube_download/yt_dlp/extractor/dlf.py 
b/plugins/youtube_download/yt_dlp/extractor/dlf.py new file mode 100644 index 0000000..88a4149 --- /dev/null +++ b/plugins/youtube_download/yt_dlp/extractor/dlf.py @@ -0,0 +1,192 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + extract_attributes, + int_or_none, + traverse_obj, + url_or_none, +) + + +class DLFBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:www\.)?deutschlandfunk\.de/' + _BUTTON_REGEX = r'(]+alt="Anhören"[^>]+data-audio-diraid[^>]*>)' + + def _parse_button_attrs(self, button, audio_id=None): + attrs = extract_attributes(button) + audio_id = audio_id or attrs['data-audio-diraid'] + + url = traverse_obj( + attrs, 'data-audio-download-src', 'data-audio', 'data-audioreference', + 'data-audio-src', expected_type=url_or_none) + ext = determine_ext(url) + + return { + 'id': audio_id, + 'extractor_key': DLFIE.ie_key(), + 'extractor': DLFIE.IE_NAME, + **traverse_obj(attrs, { + 'title': (('data-audiotitle', 'data-audio-title', 'data-audio-download-tracking-title'), {str}), + 'duration': (('data-audioduration', 'data-audio-duration'), {int_or_none}), + 'thumbnail': ('data-audioimage', {url_or_none}), + 'uploader': 'data-audio-producer', + 'series': 'data-audio-series', + 'channel': 'data-audio-origin-site-name', + 'webpage_url': ('data-audio-download-tracking-path', {url_or_none}), + }, get_all=False), + 'formats': (self._extract_m3u8_formats(url, audio_id, fatal=False) + if ext == 'm3u8' else [{'url': url, 'ext': ext, 'vcodec': 'none'}]) + } + + +class DLFIE(DLFBaseIE): + IE_NAME = 'dlf' + _VALID_URL = DLFBaseIE._VALID_URL_BASE + r'[\w-]+-dlf-(?P[\da-f]{8})-100\.html' + _TESTS = [ + # Audio as an HLS stream + { + 'url': 'https://www.deutschlandfunk.de/tanz-der-saiteninstrumente-das-wild-strings-trio-aus-slowenien-dlf-03a3eb19-100.html', + 'info_dict': { + 'id': '03a3eb19', + 'title': r're:Tanz der Saiteninstrumente [-/] Das Wild Strings Trio aus Slowenien', + 'ext': 'm4a', + 'duration': 3298, + 'thumbnail': 
'https://assets.deutschlandfunk.de/FALLBACK-IMAGE-AUDIO/512x512.png?t=1603714364673', + 'uploader': 'Deutschlandfunk', + 'series': 'On Stage', + 'channel': 'deutschlandfunk' + }, + 'params': { + 'skip_download': 'm3u8' + }, + 'skip': 'This webpage no longer exists' + }, { + 'url': 'https://www.deutschlandfunk.de/russische-athleten-kehren-zurueck-auf-die-sportbuehne-ein-gefaehrlicher-tueroeffner-dlf-d9cc1856-100.html', + 'info_dict': { + 'id': 'd9cc1856', + 'title': 'Russische Athleten kehren zurück auf die Sportbühne: Ein gefährlicher Türöffner', + 'ext': 'mp3', + 'duration': 291, + 'thumbnail': 'https://assets.deutschlandfunk.de/FALLBACK-IMAGE-AUDIO/512x512.png?t=1603714364673', + 'uploader': 'Deutschlandfunk', + 'series': 'Kommentare und Themen der Woche', + 'channel': 'deutschlandfunk' + } + }, + ] + + def _real_extract(self, url): + audio_id = self._match_id(url) + webpage = self._download_webpage(url, audio_id) + + return self._parse_button_attrs( + self._search_regex(self._BUTTON_REGEX, webpage, 'button'), audio_id) + + +class DLFCorpusIE(DLFBaseIE): + IE_NAME = 'dlf:corpus' + IE_DESC = 'DLF Multi-feed Archives' + _VALID_URL = DLFBaseIE._VALID_URL_BASE + r'(?P(?![\w-]+-dlf-[\da-f]{8})[\w-]+-\d+)\.html' + _TESTS = [ + # Recorded news broadcast with referrals to related broadcasts + { + 'url': 'https://www.deutschlandfunk.de/fechten-russland-belarus-ukraine-protest-100.html', + 'info_dict': { + 'id': 'fechten-russland-belarus-ukraine-protest-100', + 'title': r're:Wiederzulassung als neutrale Athleten [-/] Was die Rückkehr russischer und belarussischer Sportler beim Fechten bedeutet', + 'description': 'md5:91340aab29c71aa7518ad5be13d1e8ad' + }, + 'playlist_mincount': 5, + 'playlist': [{ + 'info_dict': { + 'id': '1fc5d64a', + 'title': r're:Wiederzulassung als neutrale Athleten [-/] Was die Rückkehr russischer und belarussischer Sportler beim Fechten bedeutet', + 'ext': 'mp3', + 'duration': 252, + 'thumbnail': 
'https://assets.deutschlandfunk.de/aad16241-6b76-4a09-958b-96d0ee1d6f57/512x512.jpg?t=1679480020313', + 'uploader': 'Deutschlandfunk', + 'series': 'Sport', + 'channel': 'deutschlandfunk' + } + }, { + 'info_dict': { + 'id': '2ada145f', + 'title': r're:(?:Sportpolitik / )?Fechtverband votiert für Rückkehr russischer Athleten', + 'ext': 'mp3', + 'duration': 336, + 'thumbnail': 'https://assets.deutschlandfunk.de/FILE_93982766f7317df30409b8a184ac044a/512x512.jpg?t=1678547581005', + 'uploader': 'Deutschlandfunk', + 'series': 'Deutschlandfunk Nova', + 'channel': 'deutschlandfunk-nova' + } + }, { + 'info_dict': { + 'id': '5e55e8c9', + 'title': r're:Wiederzulassung von Russland und Belarus [-/] "Herumlavieren" des Fechter-Bundes sorgt für Unverständnis', + 'ext': 'mp3', + 'duration': 187, + 'thumbnail': 'https://assets.deutschlandfunk.de/a595989d-1ed1-4a2e-8370-b64d7f11d757/512x512.jpg?t=1679173825412', + 'uploader': 'Deutschlandfunk', + 'series': 'Sport am Samstag', + 'channel': 'deutschlandfunk' + } + }, { + 'info_dict': { + 'id': '47e1a096', + 'title': r're:Rückkehr Russlands im Fechten [-/] "Fassungslos, dass es einfach so passiert ist"', + 'ext': 'mp3', + 'duration': 602, + 'thumbnail': 'https://assets.deutschlandfunk.de/da4c494a-21cc-48b4-9cc7-40e09fd442c2/512x512.jpg?t=1678562155770', + 'uploader': 'Deutschlandfunk', + 'series': 'Sport am Samstag', + 'channel': 'deutschlandfunk' + } + }, { + 'info_dict': { + 'id': '5e55e8c9', + 'title': r're:Wiederzulassung von Russland und Belarus [-/] "Herumlavieren" des Fechter-Bundes sorgt für Unverständnis', + 'ext': 'mp3', + 'duration': 187, + 'thumbnail': 'https://assets.deutschlandfunk.de/a595989d-1ed1-4a2e-8370-b64d7f11d757/512x512.jpg?t=1679173825412', + 'uploader': 'Deutschlandfunk', + 'series': 'Sport am Samstag', + 'channel': 'deutschlandfunk' + } + }] + }, + # Podcast feed with tag buttons, playlist count fluctuates + { + 'url': 'https://www.deutschlandfunk.de/kommentare-und-themen-der-woche-100.html', + 'info_dict': { 
+ 'id': 'kommentare-und-themen-der-woche-100', + 'title': 'Meinung - Kommentare und Themen der Woche', + 'description': 'md5:2901bbd65cd2d45e116d399a099ce5d5', + }, + 'playlist_mincount': 10, + }, + # Podcast feed with no description + { + 'url': 'https://www.deutschlandfunk.de/podcast-tolle-idee-100.html', + 'info_dict': { + 'id': 'podcast-tolle-idee-100', + 'title': 'Wissenschaftspodcast - Tolle Idee! - Was wurde daraus?', + }, + 'playlist_mincount': 11, + }, + ] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) + + return { + '_type': 'playlist', + 'id': playlist_id, + 'description': self._html_search_meta( + ['description', 'og:description', 'twitter:description'], webpage, default=None), + 'title': self._html_search_meta( + ['og:title', 'twitter:title'], webpage, default=None), + 'entries': map(self._parse_button_attrs, re.findall(self._BUTTON_REGEX, webpage)), + } diff --git a/plugins/youtube_download/yt_dlp/extractor/dplay.py b/plugins/youtube_download/yt_dlp/extractor/dplay.py index 8eb4d8f..363b4be 100644 --- a/plugins/youtube_download/yt_dlp/extractor/dplay.py +++ b/plugins/youtube_download/yt_dlp/extractor/dplay.py @@ -2,7 +2,7 @@ import json import uuid from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..networking.exceptions import HTTPError from ..utils import ( determine_ext, ExtractorError, @@ -39,7 +39,7 @@ class DPlayBaseIE(InfoExtractor): return f'Bearer {token}' def _process_errors(self, e, geo_countries): - info = self._parse_json(e.cause.read().decode('utf-8'), None) + info = self._parse_json(e.cause.response.read().decode('utf-8'), None) error = info['errors'][0] error_code = error.get('code') if error_code == 'access.denied.geoblocked': @@ -65,6 +65,7 @@ class DPlayBaseIE(InfoExtractor): return streaming_list def _get_disco_api_info(self, url, display_id, disco_host, realm, country, domain=''): + country = 
self.get_param('geo_bypass_country') or country geo_countries = [country.upper()] self._initialize_geo_bypass({ 'countries': geo_countries, @@ -86,7 +87,7 @@ class DPlayBaseIE(InfoExtractor): 'include': 'images,primaryChannel,show,tags' }) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + if isinstance(e.cause, HTTPError) and e.cause.status == 400: self._process_errors(e, geo_countries) raise video_id = video['data']['id'] @@ -98,7 +99,7 @@ class DPlayBaseIE(InfoExtractor): streaming = self._download_video_playback_info( disco_base, video_id, headers) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + if isinstance(e.cause, HTTPError) and e.cause.status == 403: self._process_errors(e, geo_countries) raise for format_dict in streaming: @@ -745,7 +746,7 @@ class MotorTrendIE(DiscoveryPlusBaseIE): class MotorTrendOnDemandIE(DiscoveryPlusBaseIE): - _VALID_URL = r'https?://(?:www\.)?motortrendondemand\.com/detail' + DPlayBaseIE._PATH_REGEX + _VALID_URL = r'https?://(?:www\.)?motortrend(?:ondemand\.com|\.com/plus)/detail' + DPlayBaseIE._PATH_REGEX _TESTS = [{ 'url': 'https://www.motortrendondemand.com/detail/wheelstanding-dump-truck-stubby-bobs-comeback/37699/784', 'info_dict': { @@ -766,6 +767,25 @@ class MotorTrendOnDemandIE(DiscoveryPlusBaseIE): 'upload_date': '20140101', 'tags': [], }, + }, { + 'url': 'https://www.motortrend.com/plus/detail/roadworthy-rescues-teaser-trailer/4922860/', + 'info_dict': { + 'id': '4922860', + 'ext': 'mp4', + 'title': 'Roadworthy Rescues | Teaser Trailer', + 'description': 'Derek Bieri helps Freiburger and Finnegan with their \'68 big-block Dart.', + 'display_id': 'roadworthy-rescues-teaser-trailer/4922860', + 'creator': 'Originals', + 'series': 'Roadworthy Rescues', + 'thumbnail': r're:^https?://.+\.jpe?g$', + 'upload_date': '20220907', + 'timestamp': 1662523200, + 'duration': 1066.356, + 'tags': [], + }, + }, { + 'url': 
'https://www.motortrend.com/plus/detail/ugly-duckling/2450033/12439', + 'only_matching': True, }] _PRODUCT = 'MTOD' @@ -1001,3 +1021,39 @@ class DiscoveryPlusIndiaShowIE(DiscoveryPlusShowBaseIE): _SHOW_STR = 'show' _INDEX = 4 _VIDEO_IE = DiscoveryPlusIndiaIE + + +class GlobalCyclingNetworkPlusIE(DiscoveryPlusBaseIE): + _VALID_URL = r'https?://plus\.globalcyclingnetwork\.com/watch/(?P\d+)' + _TESTS = [{ + 'url': 'https://plus.globalcyclingnetwork.com/watch/1397691', + 'info_dict': { + 'id': '1397691', + 'ext': 'mp4', + 'title': 'The Athertons: Mountain Biking\'s Fastest Family', + 'description': 'md5:75a81937fcd8b989eec6083a709cd837', + 'thumbnail': 'https://us1-prod-images.disco-api.com/2021/03/04/eb9e3026-4849-3001-8281-9356466f0557.png', + 'series': 'gcn', + 'creator': 'Gcn', + 'upload_date': '20210309', + 'timestamp': 1615248000, + 'duration': 2531.0, + 'tags': [], + }, + 'skip': 'Subscription required', + 'params': {'skip_download': 'm3u8'}, + }] + + _PRODUCT = 'web' + _DISCO_API_PARAMS = { + 'disco_host': 'disco-api-prod.globalcyclingnetwork.com', + 'realm': 'gcn', + 'country': 'us', + } + + def _update_disco_api_headers(self, headers, disco_base, display_id, realm): + headers.update({ + 'x-disco-params': f'realm={realm}', + 'x-disco-client': f'WEB:UNKNOWN:{self._PRODUCT}:27.3.2', + 'Authorization': self._get_auth(disco_base, display_id, realm), + }) diff --git a/plugins/youtube_download/yt_dlp/extractor/dropout.py b/plugins/youtube_download/yt_dlp/extractor/dropout.py index e280b1c..80ae6c1 100644 --- a/plugins/youtube_download/yt_dlp/extractor/dropout.py +++ b/plugins/youtube_download/yt_dlp/extractor/dropout.py @@ -1,13 +1,17 @@ +import functools + from .common import InfoExtractor from .vimeo import VHXEmbedIE from ..utils import ( ExtractorError, + OnDemandPagedList, clean_html, + extract_attributes, get_element_by_class, get_element_by_id, - get_elements_by_class, + get_elements_html_by_class, int_or_none, - join_nonempty, + traverse_obj, 
unified_strdate, urlencode_postdata, ) @@ -162,12 +166,13 @@ class DropoutIE(InfoExtractor): class DropoutSeasonIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?dropout\.tv/(?P[^\/$&?#]+)(?:/?$|/season:[0-9]+/?$)' + _PAGE_SIZE = 24 + _VALID_URL = r'https?://(?:www\.)?dropout\.tv/(?P[^\/$&?#]+)(?:/?$|/season:(?P[0-9]+)/?$)' _TESTS = [ { 'url': 'https://www.dropout.tv/dimension-20-fantasy-high/season:1', 'note': 'Multi-season series with the season in the url', - 'playlist_count': 17, + 'playlist_count': 24, 'info_dict': { 'id': 'dimension-20-fantasy-high-season-1', 'title': 'Dimension 20 Fantasy High - Season 1' @@ -176,7 +181,7 @@ class DropoutSeasonIE(InfoExtractor): { 'url': 'https://www.dropout.tv/dimension-20-fantasy-high', 'note': 'Multi-season series with the season not in the url', - 'playlist_count': 17, + 'playlist_count': 24, 'info_dict': { 'id': 'dimension-20-fantasy-high-season-1', 'title': 'Dimension 20 Fantasy High - Season 1' @@ -190,29 +195,30 @@ class DropoutSeasonIE(InfoExtractor): 'id': 'dimension-20-shriek-week-season-1', 'title': 'Dimension 20 Shriek Week - Season 1' } + }, + { + 'url': 'https://www.dropout.tv/breaking-news-no-laugh-newsroom/season:3', + 'note': 'Multi-season series with season in the url that requires pagination', + 'playlist_count': 25, + 'info_dict': { + 'id': 'breaking-news-no-laugh-newsroom-season-3', + 'title': 'Breaking News No Laugh Newsroom - Season 3' + } } ] + def _fetch_page(self, url, season_id, page): + page += 1 + webpage = self._download_webpage( + f'{url}?page={page}', season_id, note=f'Downloading page {page}', expected_status={400}) + yield from [self.url_result(item_url, DropoutIE) for item_url in traverse_obj( + get_elements_html_by_class('browse-item-link', webpage), (..., {extract_attributes}, 'href'))] + def _real_extract(self, url): season_id = self._match_id(url) + season_num = self._match_valid_url(url).group('season') or 1 season_title = season_id.replace('-', ' ').title() - webpage = 
self._download_webpage(url, season_id) - entries = [ - self.url_result( - url=self._search_regex(r']+selected>([^<]+)', - seasons, 'current_season', default='').strip() - - return { - '_type': 'playlist', - 'id': join_nonempty(season_id, current_season.lower().replace(' ', '-')), - 'title': join_nonempty(season_title, current_season, delim=' - '), - 'entries': entries - } + return self.playlist_result( + OnDemandPagedList(functools.partial(self._fetch_page, url, season_id), self._PAGE_SIZE), + f'{season_id}-season-{season_num}', f'{season_title} - Season {season_num}') diff --git a/plugins/youtube_download/yt_dlp/extractor/drtv.py b/plugins/youtube_download/yt_dlp/extractor/drtv.py index 470546b..6c381aa 100644 --- a/plugins/youtube_download/yt_dlp/extractor/drtv.py +++ b/plugins/youtube_download/yt_dlp/extractor/drtv.py @@ -12,7 +12,6 @@ from ..utils import ( mimetype2ext, str_or_none, traverse_obj, - try_get, unified_timestamp, update_url_query, url_or_none, @@ -25,7 +24,7 @@ class DRTVIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?: - (?:www\.)?dr\.dk/(?:tv/se|nyheder|(?:radio|lyd)(?:/ondemand)?)/(?:[^/]+/)*| + (?:www\.)?dr\.dk/(?:tv/se|nyheder|(?Pradio|lyd)(?:/ondemand)?)/(?:[^/]+/)*| (?:www\.)?(?:dr\.dk|dr-massive\.com)/drtv/(?:se|episode|program)/ ) (?P[\da-z_-]+) @@ -80,7 +79,7 @@ class DRTVIE(InfoExtractor): 'description': 'md5:8c66dcbc1669bbc6f873879880f37f2a', 'timestamp': 1546628400, 'upload_date': '20190104', - 'duration': 3504.618, + 'duration': 3504.619, 'formats': 'mincount:20', 'release_year': 2017, 'season_id': 'urn:dr:mu:bundle:5afc03ad6187a4065ca5fd35', @@ -101,14 +100,16 @@ class DRTVIE(InfoExtractor): 'ext': 'mp4', 'title': 'Bonderøven 2019 (1:8)', 'description': 'md5:b6dcfe9b6f0bea6703e9a0092739a5bd', - 'timestamp': 1603188600, - 'upload_date': '20201020', + 'timestamp': 1654856100, + 'upload_date': '20220610', 'duration': 2576.6, 'season': 'Bonderøven 2019', 'season_id': 'urn:dr:mu:bundle:5c201667a11fa01ca4528ce5', 'release_year': 2019, 
'season_number': 2019, - 'series': 'Frank & Kastaniegaarden' + 'series': 'Frank & Kastaniegaarden', + 'episode_number': 1, + 'episode': 'Episode 1', }, 'params': { 'skip_download': True, @@ -140,10 +141,26 @@ class DRTVIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'this video has been removed', + }, { + 'url': 'https://www.dr.dk/lyd/p4kbh/regionale-nyheder-kh4/regionale-nyheder-2023-03-14-10-30-9', + 'info_dict': { + 'ext': 'mp4', + 'id': '14802310112', + 'timestamp': 1678786200, + 'duration': 120.043, + 'season_id': 'urn:dr:mu:bundle:63a4f7c87140143504b6710f', + 'series': 'P4 København regionale nyheder', + 'upload_date': '20230314', + 'release_year': 0, + 'description': 'Hør seneste regionale nyheder fra P4 København.', + 'season': 'Regionale nyheder', + 'title': 'Regionale nyheder', + }, }] def _real_extract(self, url): - raw_video_id = self._match_id(url) + raw_video_id, is_radio_url = self._match_valid_url(url).group('id', 'radio') webpage = self._download_webpage(url, raw_video_id) @@ -170,15 +187,17 @@ class DRTVIE(InfoExtractor): programcard_url = '%s/%s' % (_PROGRAMCARD_BASE, video_id) else: programcard_url = _PROGRAMCARD_BASE - page = self._parse_json( - self._search_regex( - r'data\s*=\s*({.+?})\s*(?:;|https?)://(?:(?:www|legacy)\.)?dumpert\.nl/(?:mediabase|embed|item)/(?P[0-9]+[/_][0-9a-zA-Z]+)' + _VALID_URL = r'''(?x) + (?Phttps?)://(?:(?:www|legacy)\.)?dumpert\.nl(?: + /(?:mediabase|embed|item)/| + (?:/toppers|/latest|/?)\?selectedId= + )(?P[0-9]+[/_][0-9a-zA-Z]+)''' _TESTS = [{ 'url': 'https://www.dumpert.nl/item/6646981_951bc60f', 'md5': '1b9318d7d5054e7dcb9dc7654f21d643', @@ -16,6 +21,9 @@ class DumpertIE(InfoExtractor): 'title': 'Ik heb nieuws voor je', 'description': 'Niet schrikken hoor', 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 9, + 'view_count': int, + 'like_count': int, } }, { 'url': 'https://www.dumpert.nl/embed/6675421_dc440fe7', @@ -26,6 +34,28 @@ class DumpertIE(InfoExtractor): }, { 'url': 
'http://legacy.dumpert.nl/embed/6675421/dc440fe7', 'only_matching': True, + }, { + 'url': 'https://www.dumpert.nl/item/100031688_b317a185', + 'info_dict': { + 'id': '100031688/b317a185', + 'ext': 'mp4', + 'title': 'Epic schijnbeweging', + 'description': '

Die zag je niet eh

', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', + 'duration': 12, + 'view_count': int, + 'like_count': int, + }, + 'params': {'skip_download': 'm3u8'} + }, { + 'url': 'https://www.dumpert.nl/toppers?selectedId=100031688_b317a185', + 'only_matching': True, + }, { + 'url': 'https://www.dumpert.nl/latest?selectedId=100031688_b317a185', + 'only_matching': True, + }, { + 'url': 'https://www.dumpert.nl/?selectedId=100031688_b317a185', + 'only_matching': True, }] def _real_extract(self, url): @@ -36,18 +66,23 @@ class DumpertIE(InfoExtractor): title = item['title'] media = next(m for m in item['media'] if m.get('mediatype') == 'VIDEO') - quality = qualities(['flv', 'mobile', 'tablet', '720p']) + quality = qualities(['flv', 'mobile', 'tablet', '720p', '1080p']) formats = [] for variant in media.get('variants', []): uri = variant.get('uri') if not uri: continue version = variant.get('version') - formats.append({ - 'url': uri, - 'format_id': version, - 'quality': quality(version), - }) + preference = quality(version) + if determine_ext(uri) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + uri, video_id, 'mp4', m3u8_id=version, quality=preference)) + else: + formats.append({ + 'url': uri, + 'format_id': version, + 'quality': preference, + }) thumbnails = [] stills = item.get('stills') or {} diff --git a/plugins/youtube_download/yt_dlp/extractor/eagleplatform.py b/plugins/youtube_download/yt_dlp/extractor/eagleplatform.py index 9ebd24d..739d179 100644 --- a/plugins/youtube_download/yt_dlp/extractor/eagleplatform.py +++ b/plugins/youtube_download/yt_dlp/extractor/eagleplatform.py @@ -2,7 +2,7 @@ import functools import re from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, int_or_none, @@ -111,8 +111,8 @@ class EaglePlatformIE(InfoExtractor): response = super(EaglePlatformIE, self)._download_json( url_or_request, video_id, *args, **kwargs) except ExtractorError 
as ee: - if isinstance(ee.cause, compat_HTTPError): - response = self._parse_json(ee.cause.read().decode('utf-8'), video_id) + if isinstance(ee.cause, HTTPError): + response = self._parse_json(ee.cause.response.read().decode('utf-8'), video_id) self._handle_error(response) raise return response diff --git a/plugins/youtube_download/yt_dlp/extractor/ebay.py b/plugins/youtube_download/yt_dlp/extractor/ebay.py new file mode 100644 index 0000000..d0eb9fc --- /dev/null +++ b/plugins/youtube_download/yt_dlp/extractor/ebay.py @@ -0,0 +1,36 @@ +from .common import InfoExtractor +from ..utils import remove_end + + +class EbayIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ebay\.com/itm/(?P\d+)' + _TESTS = [{ + 'url': 'https://www.ebay.com/itm/194509326719', + 'info_dict': { + 'id': '194509326719', + 'ext': 'mp4', + 'title': 'WiFi internal antenna adhesive for wifi 2.4GHz wifi 5 wifi 6 wifi 6E full bands', + }, + 'params': {'skip_download': 'm3u8'} + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + video_json = self._search_json(r'"video":', webpage, 'video json', video_id) + + formats = [] + for key, url in video_json['playlistMap'].items(): + if key == 'HLS': + formats.extend(self._extract_m3u8_formats(url, video_id, fatal=False)) + elif key == 'DASH': + formats.extend(self._extract_mpd_formats(url, video_id, fatal=False)) + else: + self.report_warning(f'Unsupported format {key}', video_id) + + return { + 'id': video_id, + 'title': remove_end(self._html_extract_title(webpage), ' | eBay'), + 'formats': formats + } diff --git a/plugins/youtube_download/yt_dlp/extractor/eitb.py b/plugins/youtube_download/yt_dlp/extractor/eitb.py index bd027da..66afbb6 100644 --- a/plugins/youtube_download/yt_dlp/extractor/eitb.py +++ b/plugins/youtube_download/yt_dlp/extractor/eitb.py @@ -1,10 +1,6 @@ from .common import InfoExtractor -from ..utils import ( - float_or_none, - int_or_none, - parse_iso8601, - 
sanitized_Request, -) +from ..networking import Request +from ..utils import float_or_none, int_or_none, parse_iso8601 class EitbIE(InfoExtractor): @@ -54,7 +50,7 @@ class EitbIE(InfoExtractor): hls_url = media.get('HLS_SURL') if hls_url: - request = sanitized_Request( + request = Request( 'http://mam.eitb.eus/mam/REST/ServiceMultiweb/DomainRestrictedSecurity/TokenAuth/', headers={'Referer': url}) token_data = self._download_json( diff --git a/plugins/youtube_download/yt_dlp/extractor/elevensports.py b/plugins/youtube_download/yt_dlp/extractor/elevensports.py new file mode 100644 index 0000000..99c52b3 --- /dev/null +++ b/plugins/youtube_download/yt_dlp/extractor/elevensports.py @@ -0,0 +1,59 @@ +from .common import InfoExtractor +from ..utils import ( + parse_iso8601, + traverse_obj, + url_or_none, +) + + +class ElevenSportsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?elevensports\.com/view/event/(?P\w+)' + _TESTS = [{ + 'url': 'https://elevensports.com/view/event/clf46yr3kenn80jgrqsjmwefk', + 'md5': 'c0958d9ff90e4503a75544358758921d', + 'info_dict': { + 'id': 'clf46yr3kenn80jgrqsjmwefk', + 'title': 'Cleveland SC vs Lionsbridge FC', + 'ext': 'mp4', + 'description': 'md5:03b5238d6549f4ea1fddadf69b5e0b58', + 'upload_date': '20230323', + 'timestamp': 1679612400, + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + }, + 'params': {'skip_download': 'm3u8'} + }, { + 'url': 'https://elevensports.com/view/event/clhpyd53b06160jez74qhgkmf', + 'md5': 'c0958d9ff90e4503a75544358758921d', + 'info_dict': { + 'id': 'clhpyd53b06160jez74qhgkmf', + 'title': 'AJNLF vs ARRAF', + 'ext': 'mp4', + 'description': 'md5:c8c5e75c78f37c6d15cd6c475e43a8c1', + 'upload_date': '20230521', + 'timestamp': 1684684800, + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + }, + 'params': {'skip_download': 'm3u8'} + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + event_id = self._search_nextjs_data(webpage, 
video_id)['props']['pageProps']['event']['mclsEventId'] + event_data = self._download_json( + f'https://mcls-api.mycujoo.tv/bff/events/v1beta1/{event_id}', video_id, + headers={'Authorization': 'Bearer FBVKACGN37JQC5SFA0OVK8KKSIOP153G'}) + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + event_data['streams'][0]['full_url'], video_id, 'mp4', m3u8_id='hls') + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + **traverse_obj(event_data, { + 'title': ('title', {str}), + 'description': ('description', {str}), + 'timestamp': ('start_time', {parse_iso8601}), + 'thumbnail': ('thumbnail_url', {url_or_none}), + }), + } diff --git a/plugins/youtube_download/yt_dlp/extractor/embedly.py b/plugins/youtube_download/yt_dlp/extractor/embedly.py index 1b58fca..458aaa0 100644 --- a/plugins/youtube_download/yt_dlp/extractor/embedly.py +++ b/plugins/youtube_download/yt_dlp/extractor/embedly.py @@ -61,6 +61,35 @@ class EmbedlyIE(InfoExtractor): 'only_matching': True, }] + _WEBPAGE_TESTS = [{ + 'url': 'http://www.permacultureetc.com/2022/12/comment-greffer-facilement-les-arbres-fruitiers.html', + 'info_dict': { + 'id': 'pfUK_ADTvgY', + 'ext': 'mp4', + 'title': 'Comment greffer facilement les arbres fruitiers ? 
(mois par mois)', + 'description': 'md5:d3a876995e522f138aabb48e040bfb4c', + 'view_count': int, + 'upload_date': '20221210', + 'comment_count': int, + 'live_status': 'not_live', + 'channel_id': 'UCsM4_jihNFYe4CtSkXvDR-Q', + 'channel_follower_count': int, + 'tags': ['permaculture', 'jardinage', 'dekarz', 'autonomie', 'greffe', 'fruitiers', 'arbres', 'jardin forêt', 'forêt comestible', 'damien'], + 'playable_in_embed': True, + 'uploader': 'permaculture agroécologie etc...', + 'channel': 'permaculture agroécologie etc...', + 'thumbnail': 'https://i.ytimg.com/vi/pfUK_ADTvgY/sddefault.jpg', + 'duration': 1526, + 'channel_url': 'https://www.youtube.com/channel/UCsM4_jihNFYe4CtSkXvDR-Q', + 'age_limit': 0, + 'uploader_id': 'permacultureetc', + 'like_count': int, + 'uploader_url': 'http://www.youtube.com/user/permacultureetc', + 'categories': ['Education'], + 'availability': 'public', + }, + }] + @classmethod def _extract_from_webpage(cls, url, webpage): # Bypass "ie=cls" and suitable check diff --git a/plugins/youtube_download/yt_dlp/extractor/eporner.py b/plugins/youtube_download/yt_dlp/extractor/eporner.py index a233797..aee2dee 100644 --- a/plugins/youtube_download/yt_dlp/extractor/eporner.py +++ b/plugins/youtube_download/yt_dlp/extractor/eporner.py @@ -52,7 +52,7 @@ class EpornerIE(InfoExtractor): webpage, urlh = self._download_webpage_handle(url, display_id) - video_id = self._match_id(urlh.geturl()) + video_id = self._match_id(urlh.url) hash = self._search_regex( r'hash\s*[:=]\s*["\']([\da-f]{32})', webpage, 'hash') diff --git a/plugins/youtube_download/yt_dlp/extractor/espn.py b/plugins/youtube_download/yt_dlp/extractor/espn.py index f4b0134..7ed824c 100644 --- a/plugins/youtube_download/yt_dlp/extractor/espn.py +++ b/plugins/youtube_download/yt_dlp/extractor/espn.py @@ -240,7 +240,7 @@ class FiveThirtyEightIE(InfoExtractor): class ESPNCricInfoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?espncricinfo\.com/video/[^#$&?/]+-(?P\d+)' + _VALID_URL = 
r'https?://(?:www\.)?espncricinfo\.com/(?:cricket-)?videos?/[^#$&?/]+-(?P\d+)' _TESTS = [{ 'url': 'https://www.espncricinfo.com/video/finch-chasing-comes-with-risks-despite-world-cup-trend-1289135', 'info_dict': { @@ -252,6 +252,17 @@ class ESPNCricInfoIE(InfoExtractor): 'duration': 96, }, 'params': {'skip_download': True} + }, { + 'url': 'https://www.espncricinfo.com/cricket-videos/daryl-mitchell-mitchell-santner-is-one-of-the-best-white-ball-spinners-india-vs-new-zealand-1356225', + 'info_dict': { + 'id': '1356225', + 'ext': 'mp4', + 'description': '"Santner has done it for a long time for New Zealand - we\'re lucky to have him"', + 'upload_date': '20230128', + 'title': 'Mitchell: \'Santner is one of the best white-ball spinners at the moment\'', + 'duration': 87, + }, + 'params': {'skip_download': 'm3u8'}, }] def _real_extract(self, url): diff --git a/plugins/youtube_download/yt_dlp/extractor/ettutv.py b/plugins/youtube_download/yt_dlp/extractor/ettutv.py new file mode 100644 index 0000000..133b525 --- /dev/null +++ b/plugins/youtube_download/yt_dlp/extractor/ettutv.py @@ -0,0 +1,60 @@ +from .common import InfoExtractor +from ..utils import bool_or_none, traverse_obj, unified_timestamp, url_or_none + + +class EttuTvIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ettu\.tv/[^?#]+/playerpage/(?P[0-9]+)' + + _TESTS = [{ + 'url': 'https://www.ettu.tv/en-int/playerpage/1573849', + 'md5': '5874b7639a2aa866d1f6c3a4037c7c09', + 'info_dict': { + 'id': '1573849', + 'title': 'Ni Xia Lian - Shao Jieni', + 'description': 'ITTF Europe Top 16 Cup', + 'timestamp': 1677348600, + 'upload_date': '20230225', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'ext': 'mp4', + }, + }, { + 'url': 'https://www.ettu.tv/en-int/playerpage/1573753', + 'md5': '1fc094bf96cf2d5ec0f434d3a6dec9aa', + 'info_dict': { + 'id': '1573753', + 'title': 'Qiu Dang - Jorgic Darko', + 'description': 'ITTF Europe Top 16 Cup', + 'timestamp': 1677423600, + 'upload_date': '20230226', + 'thumbnail': 
r're:^https?://.*\.(?:jpg|png)', + 'ext': 'mp4', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + player_settings = self._download_json( + f'https://www.ettu.tv/api/v3/contents/{video_id}/player-settings', video_id, query={ + 'language': 'en', + 'showTitle': 'true', + 'device': 'desktop', + }) + + stream_response = self._download_json(player_settings['streamAccess'], video_id, data=b'') + + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + stream_response['data']['stream'], video_id, 'mp4') + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + **traverse_obj(player_settings, { + 'title': 'title', + 'description': ('metaInformation', 'competition'), + 'thumbnail': ('image', {url_or_none}), + 'timestamp': ('date', {unified_timestamp}), + 'is_live': ('isLivestream', {bool_or_none}), + }) + } diff --git a/plugins/youtube_download/yt_dlp/extractor/europa.py b/plugins/youtube_download/yt_dlp/extractor/europa.py index 29daabe..f3da95f 100644 --- a/plugins/youtube_download/yt_dlp/extractor/europa.py +++ b/plugins/youtube_download/yt_dlp/extractor/europa.py @@ -6,6 +6,7 @@ from ..utils import ( parse_iso8601, parse_qs, qualities, + traverse_obj, unified_strdate, xpath_text ) @@ -92,42 +93,17 @@ class EuropaIE(InfoExtractor): class EuroParlWebstreamIE(InfoExtractor): _VALID_URL = r'''(?x) - https?://(?:multimedia|webstreaming)\.europarl\.europa\.eu/[^/#?]+/ - (?:embed/embed\.html\?event=|(?!video)[^/#?]+/[\w-]+_)(?P[\w-]+) + https?://multimedia\.europarl\.europa\.eu/[^/#?]+/ + (?:(?!video)[^/#?]+/[\w-]+_)(?P[\w-]+) ''' _TESTS = [{ 'url': 'https://multimedia.europarl.europa.eu/pl/webstreaming/plenary-session_20220914-0900-PLENARY', 'info_dict': { - 'id': 'bcaa1db4-76ef-7e06-8da7-839bd0ad1dbe', - 'ext': 'mp4', - 'release_timestamp': 1663137900, - 'title': 'Plenary session', - 'release_date': '20220914', - }, - 'params': { - 'skip_download': True, - } - }, { - 'url': 
'https://multimedia.europarl.europa.eu/pl/webstreaming/eu-cop27-un-climate-change-conference-in-sharm-el-sheikh-egypt-ep-delegation-meets-with-ngo-represen_20221114-1600-SPECIAL-OTHER', - 'info_dict': { - 'id': 'a8428de8-b9cd-6a2e-11e4-3805d9c9ff5c', - 'ext': 'mp4', - 'release_timestamp': 1668434400, - 'release_date': '20221114', - 'title': 'md5:d3550280c33cc70e0678652e3d52c028', - }, - 'params': { - 'skip_download': True, - } - }, { - # embed webpage - 'url': 'https://webstreaming.europarl.europa.eu/ep/embed/embed.html?event=20220914-0900-PLENARY&language=en&autoplay=true&logo=true', - 'info_dict': { - 'id': 'bcaa1db4-76ef-7e06-8da7-839bd0ad1dbe', + 'id': '62388b15-d85b-4add-99aa-ba12ccf64f0d', 'ext': 'mp4', 'title': 'Plenary session', + 'release_timestamp': 1663139069, 'release_date': '20220914', - 'release_timestamp': 1663137900, }, 'params': { 'skip_download': True, @@ -144,30 +120,54 @@ class EuroParlWebstreamIE(InfoExtractor): 'live_status': 'is_live', }, 'skip': 'not live anymore' + }, { + 'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/committee-on-culture-and-education_20230301-1130-COMMITTEE-CULT', + 'info_dict': { + 'id': '7355662c-8eac-445e-4bb9-08db14b0ddd7', + 'ext': 'mp4', + 'release_date': '20230301', + 'title': 'Committee on Culture and Education', + 'release_timestamp': 1677666641, + } + }, { + # live stream + 'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/committee-on-environment-public-health-and-food-safety_20230524-0900-COMMITTEE-ENVI', + 'info_dict': { + 'id': 'e4255f56-10aa-4b3c-6530-08db56d5b0d9', + 'ext': 'mp4', + 'release_date': '20230524', + 'title': r're:Committee on Environment, Public Health and Food Safety \d{4}-\d{2}-\d{2}\s\d{2}:\d{2}', + 'release_timestamp': 1684911541, + 'live_status': 'is_live', + }, + 'skip': 'Not live anymore' }] def _real_extract(self, url): display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + webpage_nextjs = self._search_nextjs_data(webpage, 
display_id)['props']['pageProps'] json_info = self._download_json( - 'https://vis-api.vuplay.co.uk/event/external', display_id, + 'https://acs-api.europarl.connectedviews.eu/api/FullMeeting', display_id, query={ - 'player_key': 'europarl|718f822c-a48c-4841-9947-c9cb9bb1743c', - 'external_id': display_id, + 'api-version': 1.0, + 'tenantId': 'bae646ca-1fc8-4363-80ba-2c04f06b4968', + 'externalReference': display_id }) - formats, subtitles = self._extract_mpd_formats_and_subtitles(json_info['streaming_url'], display_id) - fmts, subs = self._extract_m3u8_formats_and_subtitles( - json_info['streaming_url'].replace('.mpd', '.m3u8'), display_id) - - formats.extend(fmts) - self._merge_subtitles(subs, target=subtitles) + formats, subtitles = [], {} + for hls_url in traverse_obj(json_info, ((('meetingVideo'), ('meetingVideos', ...)), 'hlsUrl')): + fmt, subs = self._extract_m3u8_formats_and_subtitles(hls_url, display_id) + formats.extend(fmt) + self._merge_subtitles(subs, target=subtitles) return { 'id': json_info['id'], - 'title': json_info.get('title'), + 'title': traverse_obj(webpage_nextjs, (('mediaItem', 'title'), ('title', )), get_all=False), 'formats': formats, 'subtitles': subtitles, - 'release_timestamp': parse_iso8601(json_info.get('published_start')), - 'is_live': 'LIVE' in json_info.get('state', '') + 'release_timestamp': parse_iso8601(json_info.get('startDateTime')), + 'is_live': traverse_obj(webpage_nextjs, ('mediaItem', 'mediaSubType')) == 'Live' } diff --git a/plugins/youtube_download/yt_dlp/extractor/eurosport.py b/plugins/youtube_download/yt_dlp/extractor/eurosport.py index 654e112..6c426bb 100644 --- a/plugins/youtube_download/yt_dlp/extractor/eurosport.py +++ b/plugins/youtube_download/yt_dlp/extractor/eurosport.py @@ -3,7 +3,7 @@ from ..utils import traverse_obj class EurosportIE(InfoExtractor): - _VALID_URL = r'https?://www\.eurosport\.com/\w+/[\w-]+/\d+/[\w-]+_(?Pvid\d+)' + _VALID_URL = 
r'https?://www\.eurosport\.com/\w+/(?:[\w-]+/[\d-]+/)?[\w-]+_(?Pvid\d+)' _TESTS = [{ 'url': 'https://www.eurosport.com/tennis/roland-garros/2022/highlights-rafael-nadal-brushes-aside-caper-ruud-to-win-record-extending-14th-french-open-title_vid1694147/video.shtml', 'info_dict': { @@ -44,6 +44,32 @@ class EurosportIE(InfoExtractor): 'description': 'md5:32bbe3a773ac132c57fb1e8cca4b7c71', 'upload_date': '20220727', } + }, { + 'url': 'https://www.eurosport.com/football/champions-league/2022-2023/pep-guardiola-emotionally-destroyed-after-manchester-city-win-over-bayern-munich-in-champions-league_vid1896254/video.shtml', + 'info_dict': { + 'id': '3096477', + 'ext': 'mp4', + 'title': 'md5:82edc17370124c7a19b3cf518517583b', + 'duration': 84.0, + 'description': 'md5:b3f44ef7f5b5b95b24a273b163083feb', + 'thumbnail': 'https://imgresizer.eurosport.com/unsafe/1280x960/smart/filters:format(jpeg)/origin-imgresizer.eurosport.com/2023/04/12/3682873-74947393-2560-1440.jpg', + 'timestamp': 1681292028, + 'upload_date': '20230412', + 'display_id': 'vid1896254', + } + }, { + 'url': 'https://www.eurosport.com/football/last-year-s-semi-final-pain-was-still-there-pep-guardiola-after-man-city-reach-cl-final_vid1914115/video.shtml', + 'info_dict': { + 'id': '3149108', + 'ext': 'mp4', + 'title': '\'Last year\'s semi-final pain was still there\' - Pep Guardiola after Man City reach CL final', + 'description': 'md5:89ef142fe0170a66abab77fac2955d8e', + 'display_id': 'vid1914115', + 'timestamp': 1684403618, + 'thumbnail': 'https://imgresizer.eurosport.com/unsafe/1280x960/smart/filters:format(jpeg)/origin-imgresizer.eurosport.com/2023/05/18/3707254-75435008-2560-1440.jpg', + 'duration': 105.0, + 'upload_date': '20230518', + } }] _TOKEN = None diff --git a/plugins/youtube_download/yt_dlp/extractor/facebook.py b/plugins/youtube_download/yt_dlp/extractor/facebook.py index 1404be6..4fd17b5 100644 --- a/plugins/youtube_download/yt_dlp/extractor/facebook.py +++ 
b/plugins/youtube_download/yt_dlp/extractor/facebook.py @@ -8,6 +8,8 @@ from ..compat import ( compat_str, compat_urllib_parse_unquote, ) +from ..networking import Request +from ..networking.exceptions import network_exceptions from ..utils import ( ExtractorError, clean_html, @@ -19,11 +21,10 @@ from ..utils import ( int_or_none, js_to_json, merge_dicts, - network_exceptions, parse_count, parse_qs, qualities, - sanitized_Request, + str_or_none, traverse_obj, try_get, url_or_none, @@ -90,16 +91,16 @@ class FacebookIE(InfoExtractor): 'info_dict': { 'id': '274175099429670', 'ext': 'mp4', - 'title': 'Asif Nawab Butt', - 'description': 'Asif Nawab Butt', + 'title': 'Asif', + 'description': '', 'uploader': 'Asif Nawab Butt', 'upload_date': '20140506', 'timestamp': 1399398998, 'thumbnail': r're:^https?://.*', + 'uploader_id': 'pfbid04scW44U4P9iTyLZAGy8y8W3pR3i2VugvHCimiRudUAVbN3MPp9eXBaYFcgVworZwl', + 'duration': 131.03, + 'concurrent_view_count': int, }, - 'expected_warnings': [ - 'title' - ] }, { 'note': 'Video with DASH manifest', 'url': 'https://www.facebook.com/video.php?v=957955867617029', @@ -151,7 +152,7 @@ class FacebookIE(InfoExtractor): # have 1080P, but only up to 720p in swf params # data.video.story.attachments[].media 'url': 'https://www.facebook.com/cnn/videos/10155529876156509/', - 'md5': '3f3798adb2b73423263e59376f1f5eb7', + 'md5': 'ca63897a90c9452efee5f8c40d080e25', 'info_dict': { 'id': '10155529876156509', 'ext': 'mp4', @@ -162,6 +163,9 @@ class FacebookIE(InfoExtractor): 'uploader': 'CNN', 'thumbnail': r're:^https?://.*', 'view_count': int, + 'uploader_id': '100059479812265', + 'concurrent_view_count': int, + 'duration': 44.478, }, }, { # bigPipe.onPageletArrive ... 
onPageletArrive pagelet_group_mall @@ -170,12 +174,16 @@ class FacebookIE(InfoExtractor): 'info_dict': { 'id': '1417995061575415', 'ext': 'mp4', - 'title': 'Ukrainian Scientists Worldwide | Довгоочікуване відео', + 'title': 'Довгоочікуване відео | By Yaroslav - Facebook', 'description': 'Довгоочікуване відео', - 'timestamp': 1486648771, + 'timestamp': 1486648217, 'upload_date': '20170209', 'uploader': 'Yaroslav Korpan', - 'uploader_id': '100000948048708', + 'uploader_id': 'pfbid029y8j22EwH3ikeqgH3SEP9G3CAi9kmWKgXJJG9s5geV7mo3J2bvURqHCdgucRgAyhl', + 'concurrent_view_count': int, + 'thumbnail': r're:^https?://.*', + 'view_count': int, + 'duration': 11736.446, }, 'params': { 'skip_download': True, @@ -192,9 +200,7 @@ class FacebookIE(InfoExtractor): 'uploader': 'La Guía Del Varón', 'thumbnail': r're:^https?://.*', }, - 'params': { - 'skip_download': True, - }, + 'skip': 'Requires logging in', }, { # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media 'url': 'https://www.facebook.com/groups/1024490957622648/permalink/1396382447100162/', @@ -208,9 +214,7 @@ class FacebookIE(InfoExtractor): 'uploader': 'Elisabeth Ahtn', 'uploader_id': '100013949973717', }, - 'params': { - 'skip_download': True, - }, + 'skip': 'Requires logging in', }, { 'url': 'https://www.facebook.com/video.php?v=10204634152394104', 'only_matching': True, @@ -252,7 +256,11 @@ class FacebookIE(InfoExtractor): 'timestamp': 1527084179, 'upload_date': '20180523', 'uploader': 'ESL One Dota 2', - 'uploader_id': '234218833769558', + 'uploader_id': '100066514874195', + 'duration': 4524.212, + 'view_count': int, + 'thumbnail': r're:^https?://.*', + 'concurrent_view_count': int, }, 'params': { 'skip_download': True, @@ -262,8 +270,17 @@ class FacebookIE(InfoExtractor): 'url': 'https://www.facebook.com/100033620354545/videos/106560053808006/', 'info_dict': { 'id': '106560053808006', + 'ext': 'mp4', + 'title': 'Josef', + 'thumbnail': r're:^https?://.*', + 
'concurrent_view_count': int, + 'uploader_id': 'pfbid02gXHbDwxumkaKJQaTGUf3znYfYzTuidGEWawiramNx4YamSj2afwYSRkpcjtHtMRJl', + 'timestamp': 1549275572, + 'duration': 3.413, + 'uploader': 'Josef Novak', + 'description': '', + 'upload_date': '20190204', }, - 'playlist_count': 2, }, { # data.video.story.attachments[].media 'url': 'https://www.facebook.com/watch/?v=647537299265662', @@ -276,6 +293,7 @@ class FacebookIE(InfoExtractor): 'id': '10157667649866271', }, 'playlist_count': 3, + 'skip': 'Requires logging in', }, { # data.nodes[].comet_sections.content.story.attachments[].style_type_renderer.attachment.media 'url': 'https://m.facebook.com/Alliance.Police.Department/posts/4048563708499330', @@ -319,7 +337,7 @@ class FacebookIE(InfoExtractor): } def _perform_login(self, username, password): - login_page_req = sanitized_Request(self._LOGIN_URL) + login_page_req = Request(self._LOGIN_URL) self._set_cookie('facebook.com', 'locale', 'en_US') login_page = self._download_webpage(login_page_req, None, note='Downloading login page', @@ -340,8 +358,8 @@ class FacebookIE(InfoExtractor): 'timezone': '-60', 'trynum': '1', } - request = sanitized_Request(self._LOGIN_URL, urlencode_postdata(login_form)) - request.add_header('Content-Type', 'application/x-www-form-urlencoded') + request = Request(self._LOGIN_URL, urlencode_postdata(login_form)) + request.headers['Content-Type'] = 'application/x-www-form-urlencoded' try: login_results = self._download_webpage(request, None, note='Logging in', errnote='unable to fetch login page') @@ -367,8 +385,8 @@ class FacebookIE(InfoExtractor): 'h': h, 'name_action_selected': 'dont_save', } - check_req = sanitized_Request(self._CHECKPOINT_URL, urlencode_postdata(check_form)) - check_req.add_header('Content-Type', 'application/x-www-form-urlencoded') + check_req = Request(self._CHECKPOINT_URL, urlencode_postdata(check_form)) + check_req.headers['Content-Type'] = 'application/x-www-form-urlencoded' check_response = 
self._download_webpage(check_req, None, note='Confirming login') if re.search(r'id="checkpointSubmitButton"', check_response) is not None: @@ -390,7 +408,10 @@ class FacebookIE(InfoExtractor): k == 'media' and str(v['id']) == video_id and v['__typename'] == 'Video')), expected_type=dict) title = get_first(media, ('title', 'text')) description = get_first(media, ('creation_story', 'comet_sections', 'message', 'story', 'message', 'text')) - uploader_data = get_first(media, 'owner') or get_first(post, ('node', 'actors', ...)) or {} + uploader_data = ( + get_first(media, ('owner', {dict})) + or get_first(post, (..., 'video', lambda k, v: k == 'owner' and v['name'])) + or get_first(post, ('node', 'actors', ..., {dict})) or {}) page_title = title or self._html_search_regex(( r']*class="uiHeaderTitle"[^>]*>(?P[^<]*)', @@ -415,16 +436,17 @@ class FacebookIE(InfoExtractor): # in https://www.facebook.com/yaroslav.korpan/videos/1417995061575415/ if thumbnail and not re.search(r'\.(?:jpg|png)', thumbnail): thumbnail = None - view_count = parse_count(self._search_regex( - r'\bviewCount\s*:\s*["\']([\d,.]+)', webpage, 'view count', - default=None)) info_dict = { 'description': description, 'uploader': uploader, 'uploader_id': uploader_data.get('id'), 'timestamp': timestamp, 'thumbnail': thumbnail, - 'view_count': view_count, + 'view_count': parse_count(self._search_regex( + (r'\bviewCount\s*:\s*["\']([\d,.]+)', r'video_view_count["\']\s*:\s*(\d+)',), + webpage, 'view count', default=None)), + 'concurrent_view_count': get_first(post, ( + ('video', (..., ..., 'attachments', ..., 'media')), 'liveViewerCount', {int_or_none})), } info_json_ld = self._search_json_ld(webpage, video_id, default={}) @@ -459,7 +481,8 @@ class FacebookIE(InfoExtractor): dash_manifest = video.get('dash_manifest') if dash_manifest: formats.extend(self._parse_mpd_formats( - compat_etree_fromstring(urllib.parse.unquote_plus(dash_manifest)))) + compat_etree_fromstring(urllib.parse.unquote_plus(dash_manifest)), 
+ mpd_url=video.get('dash_manifest_url'))) def process_formats(info): # Downloads with browser's User-Agent are rate limited. Working around @@ -493,6 +516,13 @@ class FacebookIE(InfoExtractor): entries = [] def parse_graphql_video(video): + v_id = video.get('videoId') or video.get('id') or video_id + reel_info = traverse_obj( + video, ('creation_story', 'short_form_video_context', 'playback_video', {dict})) + if reel_info: + video = video['creation_story'] + video['owner'] = traverse_obj(video, ('short_form_video_context', 'video_owner')) + video.update(reel_info) formats = [] q = qualities(['sd', 'hd']) for key, format_id in (('playable_url', 'sd'), ('playable_url_quality_hd', 'hd'), @@ -509,15 +539,15 @@ class FacebookIE(InfoExtractor): 'url': playable_url, }) extract_dash_manifest(video, formats) - v_id = video.get('videoId') or video.get('id') or video_id info = { 'id': v_id, 'formats': formats, 'thumbnail': traverse_obj( video, ('thumbnailImage', 'uri'), ('preferred_thumbnail', 'image', 'uri')), - 'uploader_id': try_get(video, lambda x: x['owner']['id']), - 'timestamp': int_or_none(video.get('publish_time')), - 'duration': float_or_none(video.get('playable_duration_in_ms'), 1000), + 'uploader_id': traverse_obj(video, ('owner', 'id', {str_or_none})), + 'timestamp': traverse_obj(video, 'publish_time', 'creation_time', expected_type=int_or_none), + 'duration': (float_or_none(video.get('playable_duration_in_ms'), 1000) + or float_or_none(video.get('length_in_second'))), } process_formats(info) description = try_get(video, lambda x: x['savable_description']['text']) @@ -778,18 +808,18 @@ class FacebookReelIE(InfoExtractor): _TESTS = [{ 'url': 'https://www.facebook.com/reel/1195289147628387', - 'md5': 'c4ff9a7182ff9ff7d6f7a83603bae831', + 'md5': 'f13dd37f2633595982db5ed8765474d3', 'info_dict': { 'id': '1195289147628387', 'ext': 'mp4', - 'title': 'md5:9f5b142921b2dc57004fa13f76005f87', - 'description': 'md5:24ea7ef062215d295bdde64e778f5474', - 'uploader': 'Beast 
Camp Training', - 'uploader_id': '1738535909799870', - 'duration': 9.536, - 'thumbnail': r're:^https?://.*', + 'title': 'md5:b05800b5b1ad56c0ca78bd3807b6a61e', + 'description': 'md5:22f03309b216ac84720183961441d8db', + 'uploader': 'md5:723e6cb3091241160f20b3c5dc282af1', + 'uploader_id': '100040874179269', + 'duration': 9.579, + 'timestamp': 1637502609, 'upload_date': '20211121', - 'timestamp': 1637502604, + 'thumbnail': r're:^https?://.*', } }] diff --git a/plugins/youtube_download/yt_dlp/extractor/fc2.py b/plugins/youtube_download/yt_dlp/extractor/fc2.py index dd5e088..ba19b6c 100644 --- a/plugins/youtube_download/yt_dlp/extractor/fc2.py +++ b/plugins/youtube_download/yt_dlp/extractor/fc2.py @@ -3,11 +3,11 @@ import re from .common import InfoExtractor from ..compat import compat_parse_qs from ..dependencies import websockets +from ..networking import Request from ..utils import ( ExtractorError, WebSocketsWrapper, js_to_json, - sanitized_Request, traverse_obj, update_url_query, urlencode_postdata, @@ -57,7 +57,7 @@ class FC2IE(InfoExtractor): } login_data = urlencode_postdata(login_form_strs) - request = sanitized_Request( + request = Request( 'https://secure.id.fc2.com/index.php?mode=login&switch_language=en', login_data) login_results = self._download_webpage(request, None, note='Logging in', errnote='Unable to log in') @@ -66,7 +66,7 @@ class FC2IE(InfoExtractor): return False # this is also needed - login_redir = sanitized_Request('http://id.fc2.com/?mode=redirect&login=done') + login_redir = Request('http://id.fc2.com/?mode=redirect&login=done') self._download_webpage( login_redir, None, note='Login redirect', errnote='Login redirect failed') diff --git a/plugins/youtube_download/yt_dlp/extractor/filmon.py b/plugins/youtube_download/yt_dlp/extractor/filmon.py index 9a93cb9..0cd18f4 100644 --- a/plugins/youtube_download/yt_dlp/extractor/filmon.py +++ b/plugins/youtube_download/yt_dlp/extractor/filmon.py @@ -1,8 +1,6 @@ from .common import InfoExtractor -from 
..compat import ( - compat_str, - compat_HTTPError, -) +from ..compat import compat_str +from ..networking.exceptions import HTTPError from ..utils import ( qualities, strip_or_none, @@ -40,8 +38,8 @@ class FilmOnIE(InfoExtractor): 'https://www.filmon.com/api/vod/movie?id=%s' % video_id, video_id)['response'] except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError): - errmsg = self._parse_json(e.cause.read().decode(), video_id)['reason'] + if isinstance(e.cause, HTTPError): + errmsg = self._parse_json(e.cause.response.read().decode(), video_id)['reason'] raise ExtractorError('%s said: %s' % (self.IE_NAME, errmsg), expected=True) raise @@ -124,8 +122,8 @@ class FilmOnChannelIE(InfoExtractor): channel_data = self._download_json( 'http://www.filmon.com/api-v2/channel/' + channel_id, channel_id)['data'] except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError): - errmsg = self._parse_json(e.cause.read().decode(), channel_id)['message'] + if isinstance(e.cause, HTTPError): + errmsg = self._parse_json(e.cause.response.read().decode(), channel_id)['message'] raise ExtractorError('%s said: %s' % (self.IE_NAME, errmsg), expected=True) raise diff --git a/plugins/youtube_download/yt_dlp/extractor/fox.py b/plugins/youtube_download/yt_dlp/extractor/fox.py index 15c0c48..e00e977 100644 --- a/plugins/youtube_download/yt_dlp/extractor/fox.py +++ b/plugins/youtube_download/yt_dlp/extractor/fox.py @@ -3,10 +3,10 @@ import uuid from .common import InfoExtractor from ..compat import ( - compat_HTTPError, compat_str, compat_urllib_parse_unquote, ) +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, int_or_none, @@ -20,7 +20,7 @@ from ..utils import ( class FOXIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?fox\.com/watch/(?P[\da-fA-F]+)' + _VALID_URL = r'https?://(?:www\.)?fox(?:sports)?\.com/(?:watch|replay)/(?P[\da-fA-F]+)' _TESTS = [{ # clip 'url': 'https://www.fox.com/watch/4b765a60490325103ea69888fb2bd4e8/', @@ 
-50,6 +50,10 @@ class FOXIE(InfoExtractor): # sports event, geo-restricted 'url': 'https://www.fox.com/watch/b057484dade738d1f373b3e46216fa2c/', 'only_matching': True, + }, { + # fox sports replay, geo-restricted + 'url': 'https://www.foxsports.com/replay/561f3e071347a24e5e877abc56b22e89', + 'only_matching': True, }] _GEO_BYPASS = False _HOME_PAGE_URL = 'https://www.fox.com/' @@ -68,9 +72,9 @@ class FOXIE(InfoExtractor): 'https://api3.fox.com/v2.0/' + path, video_id, data=data, headers=headers) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + if isinstance(e.cause, HTTPError) and e.cause.status == 403: entitlement_issues = self._parse_json( - e.cause.read().decode(), video_id)['entitlementIssues'] + e.cause.response.read().decode(), video_id)['entitlementIssues'] for e in entitlement_issues: if e.get('errorCode') == 1005: raise ExtractorError( @@ -123,8 +127,8 @@ class FOXIE(InfoExtractor): try: m3u8_url = self._download_json(release_url, video_id)['playURL'] except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - error = self._parse_json(e.cause.read().decode(), video_id) + if isinstance(e.cause, HTTPError) and e.cause.status == 403: + error = self._parse_json(e.cause.response.read().decode(), video_id) if error.get('exception') == 'GeoLocationBlocked': self.raise_geo_restricted(countries=['US']) raise ExtractorError(error['description'], expected=True) diff --git a/plugins/youtube_download/yt_dlp/extractor/foxnews.py b/plugins/youtube_download/yt_dlp/extractor/foxnews.py index 52172aa..6aa6361 100644 --- a/plugins/youtube_download/yt_dlp/extractor/foxnews.py +++ b/plugins/youtube_download/yt_dlp/extractor/foxnews.py @@ -7,8 +7,37 @@ from .common import InfoExtractor class FoxNewsIE(AMPIE): IE_NAME = 'foxnews' IE_DESC = 'Fox News and Fox Business Video' - _VALID_URL = r'https?://(?Pvideo\.(?:insider\.)?fox(?:news|business)\.com)/v/(?:video-embed\.html\?video_id=)?(?P\d+)' + 
_VALID_URL = r'https?://video\.(?:insider\.)?fox(?:news|business)\.com/v/(?:video-embed\.html\?video_id=)?(?P\d+)' _TESTS = [ + { + 'url': 'https://video.foxnews.com/v/6320653836112', + 'info_dict': { + 'id': '6320653836112', + 'ext': 'mp4', + 'title': 'Tucker Carlson joins \'Gutfeld!\' to discuss his new documentary', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 404, + 'upload_date': '20230217', + 'description': 'md5:858a8a36f59e9ca897d758855bcdfa02', + 'timestamp': 1676611344.0, + }, + 'params': {'skip_download': 'm3u8'}, + }, + { + # From http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words + 'url': 'http://video.insider.foxnews.com/v/video-embed.html?video_id=5099377331001&autoplay=true&share_url=http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words&share_title=Student%20Group:%20Saying%20%27Politically%20Correct,%27%20%27Trash%27%20and%20%27Lame%27%20Is%20Offensive&share=true', + 'info_dict': { + 'id': '5099377331001', + 'ext': 'mp4', + 'title': '82416_censoring', + 'description': '82416_censoring', + 'upload_date': '20160826', + 'timestamp': 1472169708.0, + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 521, + }, + 'params': {'skip_download': 'm3u8'}, + }, { 'url': 'http://video.foxnews.com/v/3937480/frozen-in-time/#sp=show-clips', 'md5': '32aaded6ba3ef0d1c04e238d01031e5e', @@ -22,6 +51,7 @@ class FoxNewsIE(AMPIE): 'upload_date': '20110503', 'thumbnail': r're:^https?://.*\.jpg$', }, + 'skip': '404 page', }, { 'url': 'http://video.foxnews.com/v/3922535568001/rep-luis-gutierrez-on-if-obamas-immigration-plan-is-legal/#sp=show-clips', @@ -36,10 +66,7 @@ class FoxNewsIE(AMPIE): 'upload_date': '20141204', 'thumbnail': r're:^https?://.*\.jpg$', }, - 'params': { - # m3u8 download - 'skip_download': True, - }, + 'skip': 'm3u8 HTTP error 400 in web browser', }, { 'url': 'http://video.foxnews.com/v/video-embed.html?video_id=3937480&d=video.foxnews.com', @@ -49,11 +76,6 
@@ class FoxNewsIE(AMPIE): 'url': 'http://video.foxbusiness.com/v/4442309889001', 'only_matching': True, }, - { - # From http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words - 'url': 'http://video.insider.foxnews.com/v/video-embed.html?video_id=5099377331001&autoplay=true&share_url=http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words&share_title=Student%20Group:%20Saying%20%27Politically%20Correct,%27%20%27Trash%27%20and%20%27Lame%27%20Is%20Offensive&share=true', - 'only_matching': True, - }, ] @classmethod @@ -67,10 +89,10 @@ class FoxNewsIE(AMPIE): yield f'https://video.foxnews.com/v/video-embed.html?video_id={mobj.group("video_id")}' def _real_extract(self, url): - host, video_id = self._match_valid_url(url).groups() + video_id = self._match_id(url) info = self._extract_feed_info( - 'http://%s/v/feed/video/%s.js?template=fox' % (host, video_id)) + f'https://api.foxnews.com/v3/video-player/{video_id}?callback=uid_{video_id}') info['id'] = video_id return info @@ -78,6 +100,19 @@ class FoxNewsIE(AMPIE): class FoxNewsVideoIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?foxnews\.com/video/(?P\d+)' _TESTS = [{ + 'url': 'https://www.foxnews.com/video/6328632286112', + 'info_dict': { + 'id': '6328632286112', + 'ext': 'mp4', + 'title': 'Review: 2023 Toyota Prius Prime', + 'duration': 155, + 'thumbnail': r're:^https://.+\.jpg$', + 'timestamp': 1685720177.0, + 'upload_date': '20230602', + 'description': 'md5:b69aafb125b41c1402e9744f53d6edc4', + }, + 'params': {'skip_download': 'm3u8'}, + }, { 'url': 'https://www.foxnews.com/video/6313058664112', 'info_dict': { 'id': '6313058664112', @@ -89,8 +124,7 @@ class FoxNewsVideoIE(InfoExtractor): 'title': 'Gutfeld! 
- Thursday, September 29', 'timestamp': 1664527538, }, - 'expected_warnings': ['Ignoring subtitle tracks'], - 'params': {'skip_download': 'm3u8'}, + 'skip': '404 page', }] def _real_extract(self, url): @@ -104,19 +138,22 @@ class FoxNewsArticleIE(InfoExtractor): _TESTS = [{ # data-video-id - 'url': 'http://www.foxnews.com/politics/2016/09/08/buzz-about-bud-clinton-camp-denies-claims-wore-earpiece-at-forum.html', - 'md5': '83d44e1aff1433e7a29a7b537d1700b5', + 'url': 'https://www.foxnews.com/politics/2016/09/08/buzz-about-bud-clinton-camp-denies-claims-wore-earpiece-at-forum.html', + 'md5': 'd2dd6ce809cedeefa96460e964821437', 'info_dict': { 'id': '5116295019001', 'ext': 'mp4', 'title': 'Trump and Clinton asked to defend positions on Iraq War', - 'description': 'Veterans react on \'The Kelly File\'', + 'description': 'Veterans and Fox News host Dana Perino react on \'The Kelly File\' to NBC\'s presidential forum', 'timestamp': 1473301045, 'upload_date': '20160908', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 426, }, + 'params': {'skip_download': 'm3u8'}, }, { # iframe embed - 'url': 'http://www.foxnews.com/us/2018/03/09/parkland-survivor-kyle-kashuv-on-meeting-trump-his-app-to-prevent-another-school-shooting.amp.html?__twitter_impression=true', + 'url': 'https://www.foxnews.com/us/2018/03/09/parkland-survivor-kyle-kashuv-on-meeting-trump-his-app-to-prevent-another-school-shooting.amp.html?__twitter_impression=true', 'info_dict': { 'id': '5748266721001', 'ext': 'flv', @@ -127,9 +164,7 @@ class FoxNewsArticleIE(InfoExtractor): 'timestamp': 1520594670, 'upload_date': '20180309', }, - 'params': { - 'skip_download': True, - }, + 'skip': '404 page', }, { 'url': 'http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words', 'only_matching': True, diff --git a/plugins/youtube_download/yt_dlp/extractor/foxsports.py b/plugins/youtube_download/yt_dlp/extractor/foxsports.py index f906a17..8e89ccf 100644 --- 
a/plugins/youtube_download/yt_dlp/extractor/foxsports.py +++ b/plugins/youtube_download/yt_dlp/extractor/foxsports.py @@ -1,6 +1,7 @@ from .common import InfoExtractor from .uplynk import UplynkPreplayIE -from ..utils import HEADRequest, float_or_none, make_archive_id, smuggle_url +from ..networking import HEADRequest +from ..utils import float_or_none, make_archive_id, smuggle_url class FoxSportsIE(InfoExtractor): @@ -35,7 +36,7 @@ class FoxSportsIE(InfoExtractor): 'x-api-key': 'cf289e299efdfa39fb6316f259d1de93', }) preplay_url = self._request_webpage( - HEADRequest(data['url']), video_id, 'Fetching preplay URL').geturl() + HEADRequest(data['url']), video_id, 'Fetching preplay URL').url return { '_type': 'url_transparent', diff --git a/plugins/youtube_download/yt_dlp/extractor/fujitv.py b/plugins/youtube_download/yt_dlp/extractor/fujitv.py index 668bb27..77e826e 100644 --- a/plugins/youtube_download/yt_dlp/extractor/fujitv.py +++ b/plugins/youtube_download/yt_dlp/extractor/fujitv.py @@ -1,5 +1,5 @@ -from ..utils import HEADRequest from .common import InfoExtractor +from ..networking import HEADRequest class FujiTVFODPlus7IE(InfoExtractor): diff --git a/plugins/youtube_download/yt_dlp/extractor/funimation.py b/plugins/youtube_download/yt_dlp/extractor/funimation.py index 47c3166..41de85c 100644 --- a/plugins/youtube_download/yt_dlp/extractor/funimation.py +++ b/plugins/youtube_download/yt_dlp/extractor/funimation.py @@ -3,7 +3,7 @@ import re import string from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, determine_ext, @@ -46,8 +46,8 @@ class FunimationBaseIE(InfoExtractor): })) FunimationBaseIE._TOKEN = data['token'] except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: - error = self._parse_json(e.cause.read().decode(), None)['error'] + if isinstance(e.cause, HTTPError) and e.cause.status == 401: + error = 
self._parse_json(e.cause.response.read().decode(), None)['error'] raise ExtractorError(error, expected=True) raise diff --git a/plugins/youtube_download/yt_dlp/extractor/funker530.py b/plugins/youtube_download/yt_dlp/extractor/funker530.py new file mode 100644 index 0000000..ba5ab7d --- /dev/null +++ b/plugins/youtube_download/yt_dlp/extractor/funker530.py @@ -0,0 +1,79 @@ +from .common import InfoExtractor +from .rumble import RumbleEmbedIE +from .youtube import YoutubeIE +from ..utils import ExtractorError, clean_html, get_element_by_class, strip_or_none + + +class Funker530IE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?funker530\.com/video/(?P[^/?#]+)' + _TESTS = [{ + 'url': 'https://funker530.com/video/azov-patrol-caught-in-open-under-automatic-grenade-launcher-fire/', + 'md5': '085f50fea27523a388bbc22e123e09c8', + 'info_dict': { + 'id': 'v2qbmu4', + 'ext': 'mp4', + 'title': 'Azov Patrol Caught In Open Under Automatic Grenade Launcher Fire', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Funker530', + 'channel': 'Funker530', + 'channel_url': 'https://rumble.com/c/c-1199543', + 'width': 1280, + 'height': 720, + 'fps': 25, + 'duration': 27, + 'upload_date': '20230608', + 'timestamp': 1686241321, + 'live_status': 'not_live', + 'description': 'md5:bea2e1f458095414e04b5ac189c2f980', + } + }, { + 'url': 'https://funker530.com/video/my-friends-joined-the-russians-civdiv/', + 'md5': 'a42c2933391210662e93e867d7124b70', + 'info_dict': { + 'id': 'k-pk4bOvoac', + 'ext': 'mp4', + 'view_count': int, + 'channel': 'Civ Div', + 'comment_count': int, + 'channel_follower_count': int, + 'thumbnail': 'https://i.ytimg.com/vi/k-pk4bOvoac/maxresdefault.jpg', + 'uploader_id': '@CivDiv', + 'duration': 357, + 'channel_url': 'https://www.youtube.com/channel/UCgsCiwJ88up-YyMHo7hL5-A', + 'tags': [], + 'uploader_url': 'https://www.youtube.com/@CivDiv', + 'channel_id': 'UCgsCiwJ88up-YyMHo7hL5-A', + 'like_count': int, + 'description': 'md5:aef75ec3f59c07a0e39400f609b24429', + 
'live_status': 'not_live', + 'age_limit': 0, + 'uploader': 'Civ Div', + 'categories': ['People & Blogs'], + 'title': 'My “Friends” joined the Russians.', + 'availability': 'public', + 'upload_date': '20230608', + 'playable_in_embed': True, + 'heatmap': 'count:100', + } + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + rumble_url = list(RumbleEmbedIE._extract_embed_urls(url, webpage)) + if rumble_url: + info = {'url': rumble_url[0], 'ie_key': RumbleEmbedIE.ie_key()} + else: + youtube_url = list(YoutubeIE._extract_embed_urls(url, webpage)) + if youtube_url: + info = {'url': youtube_url[0], 'ie_key': YoutubeIE.ie_key()} + if not info: + raise ExtractorError('No videos found on webpage', expected=True) + + return { + **info, + '_type': 'url_transparent', + 'description': strip_or_none(self._search_regex( + r'(?s)(.+)About the Author', clean_html(get_element_by_class('video-desc-paragraph', webpage)), + 'description', default=None)) + } diff --git a/plugins/youtube_download/yt_dlp/extractor/gdcvault.py b/plugins/youtube_download/yt_dlp/extractor/gdcvault.py index 2878bbd..4265feb 100644 --- a/plugins/youtube_download/yt_dlp/extractor/gdcvault.py +++ b/plugins/youtube_download/yt_dlp/extractor/gdcvault.py @@ -2,13 +2,8 @@ import re from .common import InfoExtractor from .kaltura import KalturaIE -from ..utils import ( - HEADRequest, - remove_start, - sanitized_Request, - smuggle_url, - urlencode_postdata, -) +from ..networking import HEADRequest, Request +from ..utils import remove_start, smuggle_url, urlencode_postdata class GDCVaultIE(InfoExtractor): @@ -138,8 +133,8 @@ class GDCVaultIE(InfoExtractor): 'password': password, } - request = sanitized_Request(login_url, urlencode_postdata(login_form)) - request.add_header('Content-Type', 'application/x-www-form-urlencoded') + request = Request(login_url, urlencode_postdata(login_form)) + request.headers['Content-Type'] = 
'application/x-www-form-urlencoded' self._download_webpage(request, display_id, 'Logging in') start_page = self._download_webpage(webpage_url, display_id, 'Getting authenticated video page') self._download_webpage(logout_url, display_id, 'Logging out') @@ -163,7 +158,7 @@ class GDCVaultIE(InfoExtractor): video_url = 'http://www.gdcvault.com' + direct_url # resolve the url so that we can detect the correct extension video_url = self._request_webpage( - HEADRequest(video_url), video_id).geturl() + HEADRequest(video_url), video_id).url return { 'id': video_id, diff --git a/plugins/youtube_download/yt_dlp/extractor/generic.py b/plugins/youtube_download/yt_dlp/extractor/generic.py index 55e55d5..f5c59a0 100644 --- a/plugins/youtube_download/yt_dlp/extractor/generic.py +++ b/plugins/youtube_download/yt_dlp/extractor/generic.py @@ -14,7 +14,9 @@ from ..utils import ( ExtractorError, UnsupportedError, determine_ext, + determine_protocol, dict_get, + extract_basic_auth, format_field, int_or_none, is_html, @@ -31,6 +33,7 @@ from ..utils import ( unescapeHTML, unified_timestamp, unsmuggle_url, + update_url_query, url_or_none, urljoin, variadic, @@ -865,7 +868,7 @@ class GenericIE(InfoExtractor): }, }, { - # Video.js embed, multiple formats + # Youtube embed, formerly: Video.js embed, multiple formats 'url': 'http://ortcam.com/solidworks-урок-6-настройка-чертежа_33f9b7351.html', 'info_dict': { 'id': 'yygqldloqIk', @@ -892,6 +895,7 @@ class GenericIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': '404 Not Found', }, # rtl.nl embed { @@ -2167,6 +2171,33 @@ class GenericIE(InfoExtractor): 'age_limit': 18, }, }, + { + 'note': 'Live HLS direct link', + 'url': 'https://d18j67ugtrocuq.cloudfront.net/out/v1/2767aec339144787926bd0322f72c6e9/index.m3u8', + 'info_dict': { + 'id': 'index', + 'title': r're:index', + 'ext': 'mp4', + 'live_status': 'is_live', + }, + 'params': { + 'skip_download': 'm3u8', + }, + }, + { + 'note': 'Video.js VOD HLS', + 'url': 
'https://gist.githubusercontent.com/bashonly/2aae0862c50f4a4b84f220c315767208/raw/e3380d413749dabbe804c9c2d8fd9a45142475c7/videojs_hls_test.html', + 'info_dict': { + 'id': 'videojs_hls_test', + 'title': 'video', + 'ext': 'mp4', + 'age_limit': 0, + 'duration': 1800, + }, + 'params': { + 'skip_download': 'm3u8', + }, + }, ] def report_following_redirect(self, new_url): @@ -2183,12 +2214,41 @@ class GenericIE(InfoExtractor): self._downloader.write_debug(f'Identified {num} {name}{format_field(note, None, "; %s")}') - def _fragment_query(self, url): - if self._configuration_arg('fragment_query'): - query_string = urllib.parse.urlparse(url).query - if query_string: - return {'extra_param_to_segment_url': query_string} - return {} + def _extra_manifest_info(self, info, manifest_url): + fragment_query = self._configuration_arg('fragment_query', [None], casesense=True)[0] + if fragment_query is not None: + info['extra_param_to_segment_url'] = ( + urllib.parse.urlparse(fragment_query).query or fragment_query + or urllib.parse.urlparse(manifest_url).query or None) + + hex_or_none = lambda x: x if re.fullmatch(r'(0x)?[\da-f]+', x, re.IGNORECASE) else None + info['hls_aes'] = traverse_obj(self._configuration_arg('hls_key', casesense=True), { + 'uri': (0, {url_or_none}), 'key': (0, {hex_or_none}), 'iv': (1, {hex_or_none}), + }) or None + + variant_query = self._configuration_arg('variant_query', [None], casesense=True)[0] + if variant_query is not None: + query = urllib.parse.parse_qs( + urllib.parse.urlparse(variant_query).query or variant_query + or urllib.parse.urlparse(manifest_url).query) + for fmt in self._downloader._get_formats(info): + fmt['url'] = update_url_query(fmt['url'], query) + + # Attempt to detect live HLS or set VOD duration + m3u8_format = next((f for f in self._downloader._get_formats(info) + if determine_protocol(f) == 'm3u8_native'), None) + if m3u8_format: + is_live = self._configuration_arg('is_live', [None])[0] + if is_live is not None: + 
info['live_status'] = 'not_live' if is_live == 'false' else 'is_live' + return + headers = m3u8_format.get('http_headers') or info.get('http_headers') + duration = self._extract_m3u8_vod_duration( + m3u8_format['url'], info.get('id'), note='Checking m3u8 live status', + errnote='Failed to download m3u8 media playlist', headers=headers) + if not duration: + info['live_status'] = 'is_live' + info['duration'] = info.get('duration') or duration def _extract_rss(self, url, video_id, doc): NS_MAP = { @@ -2371,10 +2431,9 @@ class GenericIE(InfoExtractor): 'Accept-Encoding': 'identity', **smuggled_data.get('http_headers', {}) }) - new_url = full_response.geturl() - if new_url == urllib.parse.urlparse(url)._replace(scheme='https').geturl(): - url = new_url - elif url != new_url: + new_url = full_response.url + url = urllib.parse.urlparse(url)._replace(scheme=urllib.parse.urlparse(new_url).scheme).geturl() + if new_url != extract_basic_auth(url)[0]: self.report_following_redirect(new_url) if force_videoid: new_url = smuggle_url(new_url, {'force_videoid': force_videoid}) @@ -2393,14 +2452,13 @@ class GenericIE(InfoExtractor): self.report_detected('direct video link') headers = smuggled_data.get('http_headers', {}) format_id = str(m.group('format_id')) + ext = determine_ext(url) subtitles = {} - if format_id.endswith('mpegurl'): + if format_id.endswith('mpegurl') or ext == 'm3u8': formats, subtitles = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4', headers=headers) - info_dict.update(self._fragment_query(url)) - elif format_id.endswith('mpd') or format_id.endswith('dash+xml'): + elif format_id.endswith('mpd') or format_id.endswith('dash+xml') or ext == 'mpd': formats, subtitles = self._extract_mpd_formats_and_subtitles(url, video_id, headers=headers) - info_dict.update(self._fragment_query(url)) - elif format_id == 'f4m': + elif format_id == 'f4m' or ext == 'f4m': formats = self._extract_f4m_formats(url, video_id, headers=headers) else: formats = [{ @@ -2414,6 
+2472,7 @@ class GenericIE(InfoExtractor): 'subtitles': subtitles, 'http_headers': headers or None, }) + self._extra_manifest_info(info_dict, url) return info_dict if not self.get_param('test', False) and not is_intentional: @@ -2426,7 +2485,7 @@ class GenericIE(InfoExtractor): if first_bytes.startswith(b'#EXTM3U'): self.report_detected('M3U playlist') info_dict['formats'], info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4') - info_dict.update(self._fragment_query(url)) + self._extra_manifest_info(info_dict, url) return info_dict # Maybe it's a direct link to a video? @@ -2470,14 +2529,14 @@ class GenericIE(InfoExtractor): return self.playlist_result( self._parse_xspf( doc, video_id, xspf_url=url, - xspf_base_url=full_response.geturl()), + xspf_base_url=full_response.url), video_id) elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag): info_dict['formats'], info_dict['subtitles'] = self._parse_mpd_formats_and_subtitles( doc, - mpd_base_url=full_response.geturl().rpartition('/')[0], + mpd_base_url=full_response.url.rpartition('/')[0], mpd_url=url) - info_dict.update(self._fragment_query(url)) + self._extra_manifest_info(info_dict, url) self.report_detected('DASH manifest') return info_dict elif re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', doc.tag): @@ -2503,7 +2562,7 @@ class GenericIE(InfoExtractor): self._downloader.write_debug('Looking for embeds') embeds = list(self._extract_embeds(original_url, webpage, urlh=full_response, info_dict=info_dict)) if len(embeds) == 1: - return {**info_dict, **embeds[0]} + return merge_dicts(embeds[0], info_dict) elif embeds: return self.playlist_result(embeds, **info_dict) raise UnsupportedError(url) @@ -2513,7 +2572,7 @@ class GenericIE(InfoExtractor): info_dict = types.MappingProxyType(info_dict) # Prevents accidental mutation video_id = traverse_obj(info_dict, 'display_id', 'id') or self._generic_id(url) url, smuggled_data = unsmuggle_url(url, {}) - actual_url = urlh.geturl() if 
urlh else url + actual_url = urlh.url if urlh else url # Sometimes embedded video player is hidden behind percent encoding # (e.g. https://github.com/ytdl-org/youtube-dl/issues/2448) @@ -2566,8 +2625,7 @@ class GenericIE(InfoExtractor): varname = mobj.group(1) sources = variadic(self._parse_json( mobj.group(2), video_id, transform_source=js_to_json, fatal=False) or []) - formats = [] - subtitles = {} + formats, subtitles, src = [], {}, None for source in sources: src = source.get('src') if not src or not isinstance(src, str): @@ -2590,8 +2648,6 @@ class GenericIE(InfoExtractor): m3u8_id='hls', fatal=False) formats.extend(fmts) self._merge_subtitles(subs, target=subtitles) - for fmt in formats: - fmt.update(self._fragment_query(src)) if not formats: formats.append({ @@ -2607,11 +2663,11 @@ class GenericIE(InfoExtractor): for sub_match in re.finditer(rf'(?s){re.escape(varname)}' r'\.addRemoteTextTrack\(({.+?})\s*,\s*(?:true|false)\)', webpage): sub = self._parse_json( sub_match.group(1), video_id, transform_source=js_to_json, fatal=False) or {} - src = str_or_none(sub.get('src')) - if not src: + sub_src = str_or_none(sub.get('src')) + if not sub_src: continue subtitles.setdefault(dict_get(sub, ('language', 'srclang')) or 'und', []).append({ - 'url': urllib.parse.urljoin(url, src), + 'url': urllib.parse.urljoin(url, sub_src), 'name': sub.get('label'), 'http_headers': { 'Referer': actual_url, @@ -2619,7 +2675,10 @@ class GenericIE(InfoExtractor): }) if formats or subtitles: self.report_detected('video.js embed') - return [{'formats': formats, 'subtitles': subtitles}] + info_dict = {'formats': formats, 'subtitles': subtitles} + if formats: + self._extra_manifest_info(info_dict, src) + return [info_dict] # Look for generic KVS player (before json-ld bc of some urls that break otherwise) found = self._search_regex(( @@ -2794,10 +2853,10 @@ class GenericIE(InfoExtractor): return [self._extract_xspf_playlist(video_url, video_id)] elif ext == 'm3u8': 
entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(video_url, video_id, ext='mp4', headers=headers) - entry_info_dict.update(self._fragment_query(video_url)) + self._extra_manifest_info(entry_info_dict, video_url) elif ext == 'mpd': entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_mpd_formats_and_subtitles(video_url, video_id, headers=headers) - entry_info_dict.update(self._fragment_query(video_url)) + self._extra_manifest_info(entry_info_dict, video_url) elif ext == 'f4m': entry_info_dict['formats'] = self._extract_f4m_formats(video_url, video_id, headers=headers) elif re.search(r'(?i)\.(?:ism|smil)/manifest', video_url) and video_url != url: diff --git a/plugins/youtube_download/yt_dlp/extractor/genius.py b/plugins/youtube_download/yt_dlp/extractor/genius.py index 62f5a28..57c25e7 100644 --- a/plugins/youtube_download/yt_dlp/extractor/genius.py +++ b/plugins/youtube_download/yt_dlp/extractor/genius.py @@ -10,7 +10,7 @@ from ..utils import ( class GeniusIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?genius\.com/videos/(?P[^?/#]+)' + _VALID_URL = r'https?://(?:www\.)?genius\.com/(?:videos|(?P
a))/(?P[^?/#]+)' _TESTS = [{ 'url': 'https://genius.com/videos/Vince-staples-breaks-down-the-meaning-of-when-sparks-fly', 'md5': '64c2ad98cfafcfda23bfa0ad0c512f4c', @@ -41,19 +41,37 @@ class GeniusIE(InfoExtractor): 'timestamp': 1631209167, 'thumbnail': r're:^https?://.*\.jpg$', }, + }, { + 'url': 'https://genius.com/a/cordae-anderson-paak-break-down-the-meaning-of-two-tens', + 'md5': 'f98a4e03b16b0a2821bd6e52fb3cc9d7', + 'info_dict': { + 'id': '6321509903112', + 'ext': 'mp4', + 'title': 'Cordae & Anderson .Paak Breaks Down The Meaning Of “Two Tens”', + 'description': 'md5:1255f0e1161d07342ce56a8464ac339d', + 'tags': ['song id: 5457554'], + 'uploader_id': '4863540648001', + 'duration': 361.813, + 'upload_date': '20230301', + 'timestamp': 1677703908, + 'thumbnail': r're:^https?://.*\.jpg$', + }, }] def _real_extract(self, url): - display_id = self._match_id(url) + display_id, is_article = self._match_valid_url(url).group('id', 'article') webpage = self._download_webpage(url, display_id) metadata = self._search_json( - r'[^?/#]+)-lyrics[?/#]?' 
+ _VALID_URL = r'https?://(?:www\.)?genius\.com/(?P[^?/#]+)-lyrics(?:[?/#]|$)' _TESTS = [{ 'url': 'https://genius.com/Lil-baby-heyy-lyrics', 'playlist_mincount': 2, diff --git a/plugins/youtube_download/yt_dlp/extractor/globalplayer.py b/plugins/youtube_download/yt_dlp/extractor/globalplayer.py new file mode 100644 index 0000000..e0c0d58 --- /dev/null +++ b/plugins/youtube_download/yt_dlp/extractor/globalplayer.py @@ -0,0 +1,254 @@ +from .common import InfoExtractor +from ..utils import ( + clean_html, + join_nonempty, + parse_duration, + str_or_none, + traverse_obj, + unified_strdate, + unified_timestamp, + urlhandle_detect_ext, +) + + +class GlobalPlayerBaseIE(InfoExtractor): + def _get_page_props(self, url, video_id): + webpage = self._download_webpage(url, video_id) + return self._search_nextjs_data(webpage, video_id)['props']['pageProps'] + + def _request_ext(self, url, video_id): + return urlhandle_detect_ext(self._request_webpage( # Server rejects HEAD requests + url, video_id, note='Determining source extension')) + + def _extract_audio(self, episode, series): + return { + 'vcodec': 'none', + **traverse_obj(series, { + 'series': 'title', + 'series_id': 'id', + 'thumbnail': 'imageUrl', + 'uploader': 'itunesAuthor', # podcasts only + }), + **traverse_obj(episode, { + 'id': 'id', + 'description': ('description', {clean_html}), + 'duration': ('duration', {parse_duration}), + 'thumbnail': 'imageUrl', + 'url': 'streamUrl', + 'timestamp': (('pubDate', 'startDate'), {unified_timestamp}), + 'title': 'title', + }, get_all=False) + } + + +class GlobalPlayerLiveIE(GlobalPlayerBaseIE): + _VALID_URL = r'https?://www\.globalplayer\.com/live/(?P\w+)/\w+' + _TESTS = [{ + 'url': 'https://www.globalplayer.com/live/smoothchill/uk/', + 'info_dict': { + 'id': '2mx1E', + 'ext': 'aac', + 'display_id': 'smoothchill-uk', + 'title': 're:^Smooth Chill.+$', + 'thumbnail': 'https://herald.musicradio.com/media/f296ade8-50c9-4f60-911f-924e96873620.png', + 'description': 'Music To Chill 
To', + 'live_status': 'is_live', + }, + }, { + # national station + 'url': 'https://www.globalplayer.com/live/heart/uk/', + 'info_dict': { + 'id': '2mwx4', + 'ext': 'aac', + 'description': 'turn up the feel good!', + 'thumbnail': 'https://herald.musicradio.com/media/49b9e8cb-15bf-4bf2-8c28-a4850cc6b0f3.png', + 'live_status': 'is_live', + 'title': 're:^Heart UK.+$', + 'display_id': 'heart-uk', + }, + }, { + # regional variation + 'url': 'https://www.globalplayer.com/live/heart/london/', + 'info_dict': { + 'id': 'AMqg', + 'ext': 'aac', + 'thumbnail': 'https://herald.musicradio.com/media/49b9e8cb-15bf-4bf2-8c28-a4850cc6b0f3.png', + 'title': 're:^Heart London.+$', + 'live_status': 'is_live', + 'display_id': 'heart-london', + 'description': 'turn up the feel good!', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + station = self._get_page_props(url, video_id)['station'] + stream_url = station['streamUrl'] + + return { + 'id': station['id'], + 'display_id': join_nonempty('brandSlug', 'slug', from_dict=station) or station.get('legacyStationPrefix'), + 'url': stream_url, + 'ext': self._request_ext(stream_url, video_id), + 'vcodec': 'none', + 'is_live': True, + **traverse_obj(station, { + 'title': (('name', 'brandName'), {str_or_none}), + 'description': 'tagline', + 'thumbnail': 'brandLogo', + }, get_all=False), + } + + +class GlobalPlayerLivePlaylistIE(GlobalPlayerBaseIE): + _VALID_URL = r'https?://www\.globalplayer\.com/playlists/(?P\w+)' + _TESTS = [{ + # "live playlist" + 'url': 'https://www.globalplayer.com/playlists/8bLk/', + 'info_dict': { + 'id': '8bLk', + 'ext': 'aac', + 'live_status': 'is_live', + 'description': 'md5:e10f5e10b01a7f2c14ba815509fbb38d', + 'thumbnail': 'https://images.globalplayer.com/images/551379?width=450&signature=oMLPZIoi5_dBSHnTMREW0Xg76mA=', + 'title': 're:^Classic FM Hall of Fame.+$' + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + station = self._get_page_props(url, 
video_id)['playlistData'] + stream_url = station['streamUrl'] + + return { + 'id': video_id, + 'url': stream_url, + 'ext': self._request_ext(stream_url, video_id), + 'vcodec': 'none', + 'is_live': True, + **traverse_obj(station, { + 'title': 'title', + 'description': 'description', + 'thumbnail': 'image', + }), + } + + +class GlobalPlayerAudioIE(GlobalPlayerBaseIE): + _VALID_URL = r'https?://www\.globalplayer\.com/(?:(?Ppodcasts)/|catchup/\w+/\w+/)(?P\w+)/?(?:$|[?#])' + _TESTS = [{ + # podcast + 'url': 'https://www.globalplayer.com/podcasts/42KuaM/', + 'playlist_mincount': 5, + 'info_dict': { + 'id': '42KuaM', + 'title': 'Filthy Ritual', + 'thumbnail': 'md5:60286e7d12d795bd1bbc9efc6cee643e', + 'categories': ['Society & Culture', 'True Crime'], + 'uploader': 'Global', + 'description': 'md5:da5b918eac9ae319454a10a563afacf9', + }, + }, { + # radio catchup + 'url': 'https://www.globalplayer.com/catchup/lbc/uk/46vyD7z/', + 'playlist_mincount': 3, + 'info_dict': { + 'id': '46vyD7z', + 'description': 'Nick Ferrari At Breakfast is Leading Britain\'s Conversation.', + 'title': 'Nick Ferrari', + 'thumbnail': 'md5:4df24d8a226f5b2508efbcc6ae874ebf', + }, + }] + + def _real_extract(self, url): + video_id, podcast = self._match_valid_url(url).group('id', 'podcast') + props = self._get_page_props(url, video_id) + series = props['podcastInfo'] if podcast else props['catchupInfo'] + + return { + '_type': 'playlist', + 'id': video_id, + 'entries': [self._extract_audio(ep, series) for ep in traverse_obj( + series, ('episodes', lambda _, v: v['id'] and v['streamUrl']))], + 'categories': traverse_obj(series, ('categories', ..., 'name')) or None, + **traverse_obj(series, { + 'description': 'description', + 'thumbnail': 'imageUrl', + 'title': 'title', + 'uploader': 'itunesAuthor', # podcasts only + }), + } + + +class GlobalPlayerAudioEpisodeIE(GlobalPlayerBaseIE): + _VALID_URL = r'https?://www\.globalplayer\.com/(?:(?Ppodcasts)|catchup/\w+/\w+)/episodes/(?P\w+)/?(?:$|[?#])' + _TESTS = [{ 
+ # podcast + 'url': 'https://www.globalplayer.com/podcasts/episodes/7DrfNnE/', + 'info_dict': { + 'id': '7DrfNnE', + 'ext': 'mp3', + 'title': 'Filthy Ritual - Trailer', + 'description': 'md5:1f1562fd0f01b4773b590984f94223e0', + 'thumbnail': 'md5:60286e7d12d795bd1bbc9efc6cee643e', + 'duration': 225.0, + 'timestamp': 1681254900, + 'series': 'Filthy Ritual', + 'series_id': '42KuaM', + 'upload_date': '20230411', + 'uploader': 'Global', + }, + }, { + # radio catchup + 'url': 'https://www.globalplayer.com/catchup/lbc/uk/episodes/2zGq26Vcv1fCWhddC4JAwETXWe/', + 'info_dict': { + 'id': '2zGq26Vcv1fCWhddC4JAwETXWe', + 'ext': 'm4a', + 'timestamp': 1682056800, + 'series': 'Nick Ferrari', + 'thumbnail': 'md5:4df24d8a226f5b2508efbcc6ae874ebf', + 'upload_date': '20230421', + 'series_id': '46vyD7z', + 'description': 'Nick Ferrari At Breakfast is Leading Britain\'s Conversation.', + 'title': 'Nick Ferrari', + 'duration': 10800.0, + }, + }] + + def _real_extract(self, url): + video_id, podcast = self._match_valid_url(url).group('id', 'podcast') + props = self._get_page_props(url, video_id) + episode = props['podcastEpisode'] if podcast else props['catchupEpisode'] + + return self._extract_audio( + episode, traverse_obj(episode, 'podcast', 'show', expected_type=dict) or {}) + + +class GlobalPlayerVideoIE(GlobalPlayerBaseIE): + _VALID_URL = r'https?://www\.globalplayer\.com/videos/(?P\w+)' + _TESTS = [{ + 'url': 'https://www.globalplayer.com/videos/2JsSZ7Gm2uP/', + 'info_dict': { + 'id': '2JsSZ7Gm2uP', + 'ext': 'mp4', + 'description': 'md5:6a9f063c67c42f218e42eee7d0298bfd', + 'thumbnail': 'md5:d4498af48e15aae4839ce77b97d39550', + 'upload_date': '20230420', + 'title': 'Treble Malakai Bayoh sings a sublime Handel aria at Classic FM Live', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + meta = self._get_page_props(url, video_id)['videoData'] + + return { + 'id': video_id, + **traverse_obj(meta, { + 'url': 'url', + 'thumbnail': ('image', 'url'), + 'title': 
'title', + 'upload_date': ('publish_date', {unified_strdate}), + 'description': 'description', + }), + } diff --git a/plugins/youtube_download/yt_dlp/extractor/globo.py b/plugins/youtube_download/yt_dlp/extractor/globo.py index a7be2cb..df98f09 100644 --- a/plugins/youtube_download/yt_dlp/extractor/globo.py +++ b/plugins/youtube_download/yt_dlp/extractor/globo.py @@ -8,8 +8,8 @@ from .common import InfoExtractor from ..compat import ( compat_str, ) +from ..networking import HEADRequest from ..utils import ( - HEADRequest, ExtractorError, float_or_none, orderedSet, diff --git a/plugins/youtube_download/yt_dlp/extractor/gmanetwork.py b/plugins/youtube_download/yt_dlp/extractor/gmanetwork.py new file mode 100644 index 0000000..62fff4e --- /dev/null +++ b/plugins/youtube_download/yt_dlp/extractor/gmanetwork.py @@ -0,0 +1,83 @@ +from .common import InfoExtractor +from .dailymotion import DailymotionIE +from .youtube import YoutubeIE + + +class GMANetworkVideoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www)\.gmanetwork\.com/(?:\w+/){3}(?P\d+)/(?P[\w-]+)/video' + _TESTS = [{ + 'url': 'https://www.gmanetwork.com/fullepisodes/home/running_man_philippines/168677/running-man-philippines-catch-the-thief-full-chapter-2/video?section=home', + 'info_dict': { + 'id': '28BqW0AXPe0', + 'ext': 'mp4', + 'upload_date': '20220919', + 'uploader_url': 'http://www.youtube.com/channel/UChsoPNR5x-wdSO2GrOSIWqQ', + 'like_count': int, + 'view_count': int, + 'uploader': 'YoüLOL', + 'channel_id': 'UChsoPNR5x-wdSO2GrOSIWqQ', + 'duration': 5313, + 'comment_count': int, + 'tags': 'count:22', + 'uploader_id': 'UChsoPNR5x-wdSO2GrOSIWqQ', + 'title': 'Running Man Philippines: Catch the Thief (FULL CHAPTER 2)', + 'channel_url': 'https://www.youtube.com/channel/UChsoPNR5x-wdSO2GrOSIWqQ', + 'thumbnail': 'https://i.ytimg.com/vi/28BqW0AXPe0/maxresdefault.jpg', + 'release_timestamp': 1663594212, + 'age_limit': 0, + 'channel_follower_count': int, + 'categories': ['Entertainment'], + 'description': 
'md5:811bdcea74f9c48051824e494756e926', + 'live_status': 'not_live', + 'playable_in_embed': True, + 'channel': 'YoüLOL', + 'availability': 'public', + 'release_date': '20220919', + } + }, { + 'url': 'https://www.gmanetwork.com/fullepisodes/home/more_than_words/87059/more-than-words-full-episode-80/video?section=home', + 'info_dict': { + 'id': 'yiDOExw2aSA', + 'ext': 'mp4', + 'live_status': 'not_live', + 'channel': 'GMANetwork', + 'like_count': int, + 'channel_follower_count': int, + 'description': 'md5:6d00cd658394fa1a5071200d3ed4be05', + 'duration': 1419, + 'age_limit': 0, + 'comment_count': int, + 'upload_date': '20181003', + 'thumbnail': 'https://i.ytimg.com/vi_webp/yiDOExw2aSA/maxresdefault.webp', + 'availability': 'public', + 'playable_in_embed': True, + 'channel_id': 'UCKL5hAuzgFQsyrsQKgU0Qng', + 'title': 'More Than Words: Full Episode 80 (Finale)', + 'uploader_id': 'GMANETWORK', + 'categories': ['Entertainment'], + 'uploader': 'GMANetwork', + 'channel_url': 'https://www.youtube.com/channel/UCKL5hAuzgFQsyrsQKgU0Qng', + 'tags': 'count:29', + 'view_count': int, + 'uploader_url': 'http://www.youtube.com/user/GMANETWORK', + } + }] + + def _real_extract(self, url): + content_id, display_id = self._match_valid_url(url).group('id', 'display_id') + webpage = self._download_webpage(url, display_id) + # webpage route + youtube_id = self._search_regex( + r'var\s*YOUTUBE_VIDEO\s*=\s*[\'"]+(?P[\w-]+)', webpage, 'youtube_id', fatal=False) + if youtube_id: + return self.url_result(youtube_id, YoutubeIE, youtube_id) + + # api call route + # more info at https://aphrodite.gmanetwork.com/fullepisodes/assets/fullepisodes/js/dist/fullepisodes_video.js?v=1.1.11 + network_url = self._search_regex( + r'NETWORK_URL\s*=\s*[\'"](?P[^\'"]+)', webpage, 'network_url') + json_data = self._download_json(f'{network_url}api/data/content/video/{content_id}', display_id) + if json_data.get('video_file'): + return self.url_result(json_data['video_file'], YoutubeIE, json_data['video_file']) + 
else: + return self.url_result(json_data['dailymotion_file'], DailymotionIE, json_data['dailymotion_file']) diff --git a/plugins/youtube_download/yt_dlp/extractor/googledrive.py b/plugins/youtube_download/yt_dlp/extractor/googledrive.py index e027ea7..2fdec20 100644 --- a/plugins/youtube_download/yt_dlp/extractor/googledrive.py +++ b/plugins/youtube_download/yt_dlp/extractor/googledrive.py @@ -3,9 +3,11 @@ import re from .common import InfoExtractor from ..compat import compat_parse_qs from ..utils import ( - determine_ext, ExtractorError, + determine_ext, + extract_attributes, get_element_by_class, + get_element_html_by_id, int_or_none, lowercase_escape, try_get, @@ -34,6 +36,7 @@ class GoogleDriveIE(InfoExtractor): 'ext': 'mp4', 'title': 'Big Buck Bunny.mp4', 'duration': 45, + 'thumbnail': 'https://drive.google.com/thumbnail?id=0ByeS4oOUV-49Zzh4R1J6R09zazQ', } }, { # video can't be watched anonymously due to view count limit reached, @@ -163,15 +166,13 @@ class GoogleDriveIE(InfoExtractor): video_id = self._match_id(url) video_info = compat_parse_qs(self._download_webpage( 'https://drive.google.com/get_video_info', - video_id, query={'docid': video_id})) + video_id, 'Downloading video webpage', query={'docid': video_id})) def get_value(key): return try_get(video_info, lambda x: x[key][0]) reason = get_value('reason') title = get_value('title') - if not title and reason: - raise ExtractorError(reason, expected=True) formats = [] fmt_stream_map = (get_value('fmt_stream_map') or '').split(',') @@ -209,20 +210,25 @@ class GoogleDriveIE(InfoExtractor): 'export': 'download', }) - def request_source_file(source_url, kind): + def request_source_file(source_url, kind, data=None): return self._request_webpage( source_url, video_id, note='Requesting %s file' % kind, - errnote='Unable to request %s file' % kind, fatal=False) + errnote='Unable to request %s file' % kind, fatal=False, data=data) urlh = request_source_file(source_url, 'source') if urlh: def 
add_source_format(urlh): + nonlocal title + if not title: + title = self._search_regex( + r'\bfilename="([^"]+)"', urlh.headers.get('Content-Disposition'), + 'title', default=None) formats.append({ # Use redirect URLs as download URLs in order to calculate # correct cookies in _calc_cookies. # Using original URLs may result in redirect loop due to # google.com's cookies mistakenly used for googleusercontent.com # redirect URLs (see #23919). - 'url': urlh.geturl(), + 'url': urlh.url, 'ext': determine_ext(title, 'mp4').lower(), 'format_id': 'source', 'quality': 1, @@ -234,14 +240,10 @@ class GoogleDriveIE(InfoExtractor): urlh, url, video_id, note='Downloading confirmation page', errnote='Unable to confirm download', fatal=False) if confirmation_webpage: - confirm = self._search_regex( - r'confirm=([^&"\']+)', confirmation_webpage, - 'confirmation code', default=None) - if confirm: - confirmed_source_url = update_url_query(source_url, { - 'confirm': confirm, - }) - urlh = request_source_file(confirmed_source_url, 'confirmed source') + confirmed_source_url = extract_attributes( + get_element_html_by_id('download-form', confirmation_webpage) or '').get('action') + if confirmed_source_url: + urlh = request_source_file(confirmed_source_url, 'confirmed source', data=b'') if urlh and urlh.headers.get('Content-Disposition'): add_source_format(urlh) else: @@ -251,7 +253,10 @@ class GoogleDriveIE(InfoExtractor): or 'unable to extract confirmation code') if not formats and reason: - self.raise_no_formats(reason, expected=True) + if title: + self.raise_no_formats(reason, expected=True) + else: + raise ExtractorError(reason, expected=True) hl = get_value('hl') subtitles_id = None diff --git a/plugins/youtube_download/yt_dlp/extractor/gronkh.py b/plugins/youtube_download/yt_dlp/extractor/gronkh.py index b9370e3..1ae0a68 100644 --- a/plugins/youtube_download/yt_dlp/extractor/gronkh.py +++ b/plugins/youtube_download/yt_dlp/extractor/gronkh.py @@ -3,6 +3,7 @@ import functools from 
.common import InfoExtractor from ..utils import ( OnDemandPagedList, + float_or_none, traverse_obj, unified_strdate, ) @@ -19,7 +20,9 @@ class GronkhIE(InfoExtractor): 'title': 'H.O.R.D.E. - DAS ZWEiTE ZEiTALTER 🎲 Session 1', 'view_count': int, 'thumbnail': 'https://01.cdn.vod.farm/preview/9e2555d3a23bf4e5c5b7c6b3b70a9d84.jpg', - 'upload_date': '20221111' + 'upload_date': '20221111', + 'chapters': 'count:3', + 'duration': 31463, }, 'params': {'skip_download': True} }, { @@ -30,7 +33,8 @@ class GronkhIE(InfoExtractor): 'title': 'GTV0536, 2021-10-01 - MARTHA IS DEAD #FREiAB1830 !FF7 !horde !archiv', 'view_count': int, 'thumbnail': 'https://01.cdn.vod.farm/preview/6436746cce14e25f751260a692872b9b.jpg', - 'upload_date': '20211001' + 'upload_date': '20211001', + 'duration': 32058, }, 'params': {'skip_download': True} }, { @@ -56,6 +60,12 @@ class GronkhIE(InfoExtractor): 'upload_date': unified_strdate(data_json.get('created_at')), 'formats': formats, 'subtitles': subtitles, + 'duration': float_or_none(data_json.get('source_length')), + 'chapters': traverse_obj(data_json, ( + 'chapters', lambda _, v: float_or_none(v['offset']) is not None, { + 'title': 'title', + 'start_time': ('offset', {float_or_none}), + })) or None, } diff --git a/plugins/youtube_download/yt_dlp/extractor/hentaistigma.py b/plugins/youtube_download/yt_dlp/extractor/hentaistigma.py deleted file mode 100644 index ca5ffc2..0000000 --- a/plugins/youtube_download/yt_dlp/extractor/hentaistigma.py +++ /dev/null @@ -1,37 +0,0 @@ -from .common import InfoExtractor - - -class HentaiStigmaIE(InfoExtractor): - _VALID_URL = r'^https?://hentai\.animestigma\.com/(?P[^/]+)' - _TEST = { - 'url': 'http://hentai.animestigma.com/inyouchuu-etsu-bonus/', - 'md5': '4e3d07422a68a4cc363d8f57c8bf0d23', - 'info_dict': { - 'id': 'inyouchuu-etsu-bonus', - 'ext': 'mp4', - 'title': 'Inyouchuu Etsu Bonus', - 'age_limit': 18, - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = 
self._download_webpage(url, video_id) - - title = self._html_search_regex( - r']+class="posttitle"[^>]*>]*>([^<]+)', - webpage, 'title') - wrap_url = self._html_search_regex( - r']+src="([^"]+mp4)"', webpage, 'wrapper url') - wrap_webpage = self._download_webpage(wrap_url, video_id) - - video_url = self._html_search_regex( - r'file\s*:\s*"([^"]+)"', wrap_webpage, 'video url') - - return { - 'id': video_id, - 'url': video_url, - 'title': title, - 'age_limit': 18, - } diff --git a/plugins/youtube_download/yt_dlp/extractor/hidive.py b/plugins/youtube_download/yt_dlp/extractor/hidive.py index 8a87498..df6868d 100644 --- a/plugins/youtube_download/yt_dlp/extractor/hidive.py +++ b/plugins/youtube_download/yt_dlp/extractor/hidive.py @@ -47,15 +47,16 @@ class HiDiveIE(InfoExtractor): login_webpage = self._download_webpage( self._LOGIN_URL, None, 'Logging in', data=urlencode_postdata(data)) # If the user has multiple profiles on their account, select one. For now pick the first profile. - profile_id = self._search_regex(r'