Plugin cleanup and tweaks

2023-02-20 19:18:45 -06:00
parent 372e4ff3dc
commit 3ad9e1c7bb
1138 changed files with 48878 additions and 40445 deletions
--- a/plugins/youtube_download/yt_dlp/extractor/facebook.py
+++ b/plugins/youtube_download/yt_dlp/extractor/facebook.py
@@ -1,23 +1,21 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import json
 import re
+import urllib.parse

 from .common import InfoExtractor
 from ..compat import (
    compat_etree_fromstring,
    compat_str,
    compat_urllib_parse_unquote,
-    compat_urllib_parse_unquote_plus,
 )
 from ..utils import (
+    ExtractorError,
    clean_html,
    determine_ext,
    error_to_compat_str,
-    ExtractorError,
    float_or_none,
    get_element_by_id,
+    get_first,
    int_or_none,
    js_to_json,
    merge_dicts,
@@ -59,6 +57,13 @@ class FacebookIE(InfoExtractor):
                )
                (?P<id>[0-9]+)
                '''
+    _EMBED_REGEX = [
+        r'<iframe[^>]+?src=(["\'])(?P<url>https?://www\.facebook\.com/(?:video/embed|plugins/video\.php).+?)\1',
+        # Facebook API embed https://developers.facebook.com/docs/plugins/embedded-video-player
+        r'''(?x)<div[^>]+
+                class=(?P<q1>[\'"])[^\'"]*\bfb-(?:video|post)\b[^\'"]*(?P=q1)[^>]+
+                data-href=(?P<q2>[\'"])(?P<url>(?:https?:)?//(?:www\.)?facebook.com/.+?)(?P=q2)''',
+    ]
    _LOGIN_URL = 'https://www.facebook.com/login.php?next=http%3A%2F%2Ffacebook.com%2Fhome.php&login_attempt=1'
    _CHECKPOINT_URL = 'https://www.facebook.com/checkpoint/?next=http%3A%2F%2Ffacebook.com%2Fhome.php&_fb_noscript=1'
    _NETRC_MACHINE = 'facebook'
@@ -313,26 +318,7 @@ class FacebookIE(InfoExtractor):
        'graphURI': '/api/graphql/'
    }

-    @staticmethod
-    def _extract_urls(webpage):
-        urls = []
-        for mobj in re.finditer(
-                r'<iframe[^>]+?src=(["\'])(?P<url>https?://www\.facebook\.com/(?:video/embed|plugins/video\.php).+?)\1',
-                webpage):
-            urls.append(mobj.group('url'))
-        # Facebook API embed
-        # see https://developers.facebook.com/docs/plugins/embedded-video-player
-        for mobj in re.finditer(r'''(?x)<div[^>]+
-                class=(?P<q1>[\'"])[^\'"]*\bfb-(?:video|post)\b[^\'"]*(?P=q1)[^>]+
-                data-href=(?P<q2>[\'"])(?P<url>(?:https?:)?//(?:www\.)?facebook.com/.+?)(?P=q2)''', webpage):
-            urls.append(mobj.group('url'))
-        return urls
-
-    def _login(self):
-        useremail, password = self._get_login_info()
-        if useremail is None:
-            return
-
+    def _perform_login(self, username, password):
        login_page_req = sanitized_Request(self._LOGIN_URL)
        self._set_cookie('facebook.com', 'locale', 'en_US')
        login_page = self._download_webpage(login_page_req, None,
@@ -344,7 +330,7 @@ class FacebookIE(InfoExtractor):
        lgnrnd = self._search_regex(r'name="lgnrnd" value="([^"]*?)"', login_page, 'lgnrnd')

        login_form = {
-            'email': useremail,
+            'email': username,
            'pass': password,
            'lsd': lsd,
            'lgnrnd': lgnrnd,
@@ -391,9 +377,6 @@ class FacebookIE(InfoExtractor):
            self.report_warning('unable to log in: %s' % error_to_compat_str(err))
            return

-    def _real_initialize(self):
-        self._login()
-
    def _extract_from_url(self, url, video_id):
        webpage = self._download_webpage(
            url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id)
@@ -403,13 +386,11 @@ class FacebookIE(InfoExtractor):
                r'handleWithCustomApplyEach\(\s*ScheduledApplyEach\s*,\s*(\{.+?\})\s*\);', webpage)]
            post = traverse_obj(post_data, (
                ..., 'require', ..., ..., ..., '__bbox', 'result', 'data'), expected_type=dict) or []
-            media = [m for m in traverse_obj(post, (..., 'attachments', ..., 'media'), expected_type=dict) or []
-                     if str(m.get('id')) == video_id and m.get('__typename') == 'Video']
-            title = traverse_obj(media, (..., 'title', 'text'), get_all=False)
-            description = traverse_obj(media, (
-                ..., 'creation_story', 'comet_sections', 'message', 'story', 'message', 'text'), get_all=False)
-            uploader_data = (traverse_obj(media, (..., 'owner'), get_all=False)
-                             or traverse_obj(post, (..., 'node', 'actors', ...), get_all=False) or {})
+            media = traverse_obj(post, (..., 'attachments', ..., lambda k, v: (
+                k == 'media' and str(v['id']) == video_id and v['__typename'] == 'Video')), expected_type=dict)
+            title = get_first(media, ('title', 'text'))
+            description = get_first(media, ('creation_story', 'comet_sections', 'message', 'story', 'message', 'text'))
+            uploader_data = get_first(media, 'owner') or get_first(post, ('node', 'actors', ...)) or {}

            page_title = title or self._html_search_regex((
                r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>(?P<content>[^<]*)</h2>',
@@ -478,15 +459,14 @@ class FacebookIE(InfoExtractor):
            dash_manifest = video.get('dash_manifest')
            if dash_manifest:
                formats.extend(self._parse_mpd_formats(
-                    compat_etree_fromstring(compat_urllib_parse_unquote_plus(dash_manifest))))
+                    compat_etree_fromstring(urllib.parse.unquote_plus(dash_manifest))))

-        def process_formats(formats):
+        def process_formats(info):
            # Downloads with browser's User-Agent are rate limited. Working around
            # with non-browser User-Agent.
-            for f in formats:
+            for f in info['formats']:
                f.setdefault('http_headers', {})['User-Agent'] = 'facebookexternalhit/1.1'
-
-            self._sort_formats(formats, ('res', 'quality'))
+            info['_format_sort_fields'] = ('res', 'quality')

        def extract_relay_data(_filter):
            return self._parse_json(self._search_regex(
@@ -529,16 +509,17 @@ class FacebookIE(InfoExtractor):
                                'url': playable_url,
                            })
                    extract_dash_manifest(video, formats)
-                    process_formats(formats)
                    v_id = video.get('videoId') or video.get('id') or video_id
                    info = {
                        'id': v_id,
                        'formats': formats,
-                        'thumbnail': try_get(video, lambda x: x['thumbnailImage']['uri']),
+                        'thumbnail': traverse_obj(
+                            video, ('thumbnailImage', 'uri'), ('preferred_thumbnail', 'image', 'uri')),
                        'uploader_id': try_get(video, lambda x: x['owner']['id']),
                        'timestamp': int_or_none(video.get('publish_time')),
                        'duration': float_or_none(video.get('playable_duration_in_ms'), 1000),
                    }
+                    process_formats(info)
                    description = try_get(video, lambda x: x['savable_description']['text'])
                    title = video.get('name')
                    if title:
@@ -705,13 +686,12 @@ class FacebookIE(InfoExtractor):
            if subtitles_src:
                subtitles.setdefault('en', []).append({'url': subtitles_src})

-        process_formats(formats)
-
        info_dict = {
            'id': video_id,
            'formats': formats,
            'subtitles': subtitles,
        }
+        process_formats(info_dict)
        info_dict.update(extract_metadata(webpage))

        return info_dict
@@ -790,3 +770,30 @@ class FacebookRedirectURLIE(InfoExtractor):
        if not redirect_url:
            raise ExtractorError('Invalid facebook redirect URL', expected=True)
        return self.url_result(redirect_url)
+
+
+class FacebookReelIE(InfoExtractor):  # Extractor for facebook.com/reel/<id> URLs; rewrites the URL and hands it to FacebookIE
+    _VALID_URL = r'https?://(?:[\w-]+\.)?facebook\.com/reel/(?P<id>\d+)'  # http(s), one optional subdomain label, numeric reel id captured as `id`
+    IE_NAME = 'facebook:reel'
+
+    _TESTS = [{
+        'url': 'https://www.facebook.com/reel/1195289147628387',
+        'md5': 'c4ff9a7182ff9ff7d6f7a83603bae831',
+        'info_dict': {
+            'id': '1195289147628387',  # same digits as the reel id in 'url' above
+            'ext': 'mp4',
+            'title': 'md5:9f5b142921b2dc57004fa13f76005f87',  # NOTE(review): 'md5:' prefix presumably means the value is compared by hash -- confirm against the test harness
+            'description': 'md5:24ea7ef062215d295bdde64e778f5474',
+            'uploader': 'Beast Camp Training',
+            'uploader_id': '1738535909799870',
+            'duration': 9.536,
+            'thumbnail': r're:^https?://.*',  # NOTE(review): 're:' prefix presumably means the value is matched as a regex -- confirm against the test harness
+            'upload_date': '20211121',
+            'timestamp': 1637502604,
+        }
+    }]
+
+    def _real_extract(self, url):  # Map a reel URL onto the equivalent watch URL and delegate to FacebookIE
+        video_id = self._match_id(url)  # presumably the `id` group of _VALID_URL; _match_id is defined outside this view
+        return self.url_result(  # this method issues no download call itself; it only builds the hand-off result
+            f'https://m.facebook.com/watch/?v={video_id}&_rdr', FacebookIE, video_id)