Plugin cleanup and tweaks

This commit is contained in:
2023-02-20 19:18:45 -06:00
parent 372e4ff3dc
commit 3ad9e1c7bb
1138 changed files with 48878 additions and 40445 deletions

View File

@@ -1,23 +1,21 @@
# coding: utf-8
from __future__ import unicode_literals
import json
import re
import urllib.parse
from .common import InfoExtractor
from ..compat import (
compat_etree_fromstring,
compat_str,
compat_urllib_parse_unquote,
compat_urllib_parse_unquote_plus,
)
from ..utils import (
ExtractorError,
clean_html,
determine_ext,
error_to_compat_str,
ExtractorError,
float_or_none,
get_element_by_id,
get_first,
int_or_none,
js_to_json,
merge_dicts,
@@ -59,6 +57,13 @@ class FacebookIE(InfoExtractor):
)
(?P<id>[0-9]+)
'''
_EMBED_REGEX = [
r'<iframe[^>]+?src=(["\'])(?P<url>https?://www\.facebook\.com/(?:video/embed|plugins/video\.php).+?)\1',
# Facebook API embed https://developers.facebook.com/docs/plugins/embedded-video-player
r'''(?x)<div[^>]+
class=(?P<q1>[\'"])[^\'"]*\bfb-(?:video|post)\b[^\'"]*(?P=q1)[^>]+
data-href=(?P<q2>[\'"])(?P<url>(?:https?:)?//(?:www\.)?facebook.com/.+?)(?P=q2)''',
]
_LOGIN_URL = 'https://www.facebook.com/login.php?next=http%3A%2F%2Ffacebook.com%2Fhome.php&login_attempt=1'
_CHECKPOINT_URL = 'https://www.facebook.com/checkpoint/?next=http%3A%2F%2Ffacebook.com%2Fhome.php&_fb_noscript=1'
_NETRC_MACHINE = 'facebook'
@@ -313,26 +318,7 @@ class FacebookIE(InfoExtractor):
'graphURI': '/api/graphql/'
}
@staticmethod
def _extract_urls(webpage):
    """Collect Facebook embed URLs found in *webpage*.

    Two kinds of embeds are searched for, in this order: ``<iframe>``
    embeds of the video player, then ``<div>`` embeds carrying an
    ``fb-video``/``fb-post`` class and a ``data-href`` attribute.

    Returns a list with the ``url`` group of every match; the list is
    empty when nothing matches.
    """
    embed_patterns = (
        r'<iframe[^>]+?src=(["\'])(?P<url>https?://www\.facebook\.com/(?:video/embed|plugins/video\.php).+?)\1',
        # Facebook API embed
        # see https://developers.facebook.com/docs/plugins/embedded-video-player
        r'''(?x)<div[^>]+
class=(?P<q1>[\'"])[^\'"]*\bfb-(?:video|post)\b[^\'"]*(?P=q1)[^>]+
data-href=(?P<q2>[\'"])(?P<url>(?:https?:)?//(?:www\.)?facebook.com/.+?)(?P=q2)''',
    )
    # Patterns are scanned one after another so that all iframe hits
    # precede all div hits, exactly as two consecutive loops would.
    return [
        found.group('url')
        for pattern in embed_patterns
        for found in re.finditer(pattern, webpage)
    ]
def _login(self):
useremail, password = self._get_login_info()
if useremail is None:
return
def _perform_login(self, username, password):
login_page_req = sanitized_Request(self._LOGIN_URL)
self._set_cookie('facebook.com', 'locale', 'en_US')
login_page = self._download_webpage(login_page_req, None,
@@ -344,7 +330,7 @@ class FacebookIE(InfoExtractor):
lgnrnd = self._search_regex(r'name="lgnrnd" value="([^"]*?)"', login_page, 'lgnrnd')
login_form = {
'email': useremail,
'email': username,
'pass': password,
'lsd': lsd,
'lgnrnd': lgnrnd,
@@ -391,9 +377,6 @@ class FacebookIE(InfoExtractor):
self.report_warning('unable to log in: %s' % error_to_compat_str(err))
return
def _real_initialize(self):
    """Invoke ``self._login()``; nothing is returned."""
    self._login()
def _extract_from_url(self, url, video_id):
webpage = self._download_webpage(
url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id)
@@ -403,13 +386,11 @@ class FacebookIE(InfoExtractor):
r'handleWithCustomApplyEach\(\s*ScheduledApplyEach\s*,\s*(\{.+?\})\s*\);', webpage)]
post = traverse_obj(post_data, (
..., 'require', ..., ..., ..., '__bbox', 'result', 'data'), expected_type=dict) or []
media = [m for m in traverse_obj(post, (..., 'attachments', ..., 'media'), expected_type=dict) or []
if str(m.get('id')) == video_id and m.get('__typename') == 'Video']
title = traverse_obj(media, (..., 'title', 'text'), get_all=False)
description = traverse_obj(media, (
..., 'creation_story', 'comet_sections', 'message', 'story', 'message', 'text'), get_all=False)
uploader_data = (traverse_obj(media, (..., 'owner'), get_all=False)
or traverse_obj(post, (..., 'node', 'actors', ...), get_all=False) or {})
media = traverse_obj(post, (..., 'attachments', ..., lambda k, v: (
k == 'media' and str(v['id']) == video_id and v['__typename'] == 'Video')), expected_type=dict)
title = get_first(media, ('title', 'text'))
description = get_first(media, ('creation_story', 'comet_sections', 'message', 'story', 'message', 'text'))
uploader_data = get_first(media, 'owner') or get_first(post, ('node', 'actors', ...)) or {}
page_title = title or self._html_search_regex((
r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>(?P<content>[^<]*)</h2>',
@@ -478,15 +459,14 @@ class FacebookIE(InfoExtractor):
dash_manifest = video.get('dash_manifest')
if dash_manifest:
formats.extend(self._parse_mpd_formats(
compat_etree_fromstring(compat_urllib_parse_unquote_plus(dash_manifest))))
compat_etree_fromstring(urllib.parse.unquote_plus(dash_manifest))))
def process_formats(formats):
def process_formats(info):
# Downloads with browser's User-Agent are rate limited. Working around
# with non-browser User-Agent.
for f in formats:
for f in info['formats']:
f.setdefault('http_headers', {})['User-Agent'] = 'facebookexternalhit/1.1'
self._sort_formats(formats, ('res', 'quality'))
info['_format_sort_fields'] = ('res', 'quality')
def extract_relay_data(_filter):
return self._parse_json(self._search_regex(
@@ -529,16 +509,17 @@ class FacebookIE(InfoExtractor):
'url': playable_url,
})
extract_dash_manifest(video, formats)
process_formats(formats)
v_id = video.get('videoId') or video.get('id') or video_id
info = {
'id': v_id,
'formats': formats,
'thumbnail': try_get(video, lambda x: x['thumbnailImage']['uri']),
'thumbnail': traverse_obj(
video, ('thumbnailImage', 'uri'), ('preferred_thumbnail', 'image', 'uri')),
'uploader_id': try_get(video, lambda x: x['owner']['id']),
'timestamp': int_or_none(video.get('publish_time')),
'duration': float_or_none(video.get('playable_duration_in_ms'), 1000),
}
process_formats(info)
description = try_get(video, lambda x: x['savable_description']['text'])
title = video.get('name')
if title:
@@ -705,13 +686,12 @@ class FacebookIE(InfoExtractor):
if subtitles_src:
subtitles.setdefault('en', []).append({'url': subtitles_src})
process_formats(formats)
info_dict = {
'id': video_id,
'formats': formats,
'subtitles': subtitles,
}
process_formats(info_dict)
info_dict.update(extract_metadata(webpage))
return info_dict
@@ -790,3 +770,30 @@ class FacebookRedirectURLIE(InfoExtractor):
if not redirect_url:
raise ExtractorError('Invalid facebook redirect URL', expected=True)
return self.url_result(redirect_url)
class FacebookReelIE(InfoExtractor):
    """Extractor for ``facebook.com/reel/<id>`` URLs.

    It performs no extraction of its own: the numeric id taken from the
    URL is placed into an ``m.facebook.com/watch/`` URL, which is passed
    to ``self.url_result`` with ``FacebookIE`` as the extractor.
    """

    IE_NAME = 'facebook:reel'
    _VALID_URL = r'https?://(?:[\w-]+\.)?facebook\.com/reel/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://www.facebook.com/reel/1195289147628387',
        'md5': 'c4ff9a7182ff9ff7d6f7a83603bae831',
        'info_dict': {
            'id': '1195289147628387',
            'ext': 'mp4',
            'title': 'md5:9f5b142921b2dc57004fa13f76005f87',
            'description': 'md5:24ea7ef062215d295bdde64e778f5474',
            'uploader': 'Beast Camp Training',
            'uploader_id': '1738535909799870',
            'duration': 9.536,
            'thumbnail': r're:^https?://.*',
            'upload_date': '20211121',
            'timestamp': 1637502604,
        },
    }]

    def _real_extract(self, url):
        """Build the watch-page URL for the reel id in *url* and return it as a URL result."""
        video_id = self._match_id(url)
        watch_url = f'https://m.facebook.com/watch/?v={video_id}&_rdr'
        return self.url_result(watch_url, FacebookIE, video_id)