Upgrade yt_dlp and download script

2025-05-02 16:11:08 -05:00
parent 3a2e8eeb08
commit d68d9ce4f9
1194 changed files with 60099 additions and 44436 deletions
--- a/plugins/youtube_download/yt_dlp/extractor/bandcamp.py
+++ b/plugins/youtube_download/yt_dlp/extractor/bandcamp.py
@@ -1,12 +1,13 @@
+import json
 import random
 import re
 import time

 from .common import InfoExtractor
-from ..compat import compat_str
 from ..utils import (
    KNOWN_EXTENSIONS,
    ExtractorError,
+    extract_attributes,
    float_or_none,
    int_or_none,
    parse_filesize,
@@ -18,6 +19,7 @@ from ..utils import (
    url_or_none,
    urljoin,
 )
+from ..utils.traversal import find_element, traverse_obj


 class BandcampIE(InfoExtractor):
@@ -41,8 +43,10 @@ class BandcampIE(InfoExtractor):
            'uploader_url': 'https://youtube-dl.bandcamp.com',
            'uploader_id': 'youtube-dl',
            'thumbnail': 'https://f4.bcbits.com/img/a3216802731_5.jpg',
+            'artists': ['youtube-dl "\'/\\ä↭'],
+            'album_artists': ['youtube-dl "\'/\\ä↭'],
        },
-        '_skip': 'There is a limit of 200 free downloads / month for the test song'
+        'skip': 'There is a limit of 200 free downloads / month for the test song',
    }, {
        # free download
        'url': 'http://benprunty.bandcamp.com/track/lanius-battle',
@@ -119,7 +123,7 @@ class BandcampIE(InfoExtractor):

    def _extract_data_attr(self, webpage, video_id, attr='tralbum', fatal=True):
        return self._parse_json(self._html_search_regex(
-            r'data-%s=(["\'])({.+?})\1' % attr, webpage,
+            rf'data-{attr}=(["\'])({{.+?}})\1', webpage,
            attr + ' data', group=2), video_id, fatal=fatal)

    def _real_extract(self, url):
@@ -167,7 +171,7 @@ class BandcampIE(InfoExtractor):

        download_link = tralbum.get('freeDownloadPage')
        if download_link:
-            track_id = compat_str(tralbum['id'])
+            track_id = str(tralbum['id'])

            download_webpage = self._download_webpage(
                download_link, track_id, 'Downloading free downloads page')
@@ -192,7 +196,7 @@ class BandcampIE(InfoExtractor):
                    if isinstance(download_formats_list, list):
                        for f in blob['download_formats']:
                            name, ext = f.get('name'), f.get('file_extension')
-                            if all(isinstance(x, compat_str) for x in (name, ext)):
+                            if all(isinstance(x, str) for x in (name, ext)):
                                download_formats[name] = ext.strip('.')

                    for format_id, f in downloads.items():
@@ -207,7 +211,7 @@ class BandcampIE(InfoExtractor):
                            })
                        format_id = f.get('encoding_name') or format_id
                        stat = self._download_json(
-                            stat_url, track_id, 'Downloading %s JSON' % format_id,
+                            stat_url, track_id, f'Downloading {format_id} JSON',
                            transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1],
                            fatal=False)
                        if not stat:
@@ -225,7 +229,7 @@ class BandcampIE(InfoExtractor):
                            'acodec': format_id.split('-')[0],
                        })

-        title = '%s - %s' % (artist, track) if artist else track
+        title = f'{artist} - {track}' if artist else track

        if not duration:
            duration = float_or_none(self._html_search_meta(
@@ -267,7 +271,19 @@ class BandcampAlbumIE(BandcampIE):  # XXX: Do not subclass from concrete IE
                    'timestamp': 1311756226,
                    'upload_date': '20110727',
                    'uploader': 'Blazo',
-                }
+                    'thumbnail': 'https://f4.bcbits.com/img/a1721150828_5.jpg',
+                    'album_artists': ['Blazo'],
+                    'uploader_url': 'https://blazo.bandcamp.com',
+                    'release_date': '20110727',
+                    'release_timestamp': 1311724800.0,
+                    'track': 'Intro',
+                    'uploader_id': 'blazo',
+                    'track_number': 1,
+                    'album': 'Jazz Format Mixtape vol.1',
+                    'artists': ['Blazo'],
+                    'duration': 19.335,
+                    'track_id': '1353101989',
+                },
            },
            {
                'md5': '1a2c32e2691474643e912cc6cd4bffaa',
@@ -278,18 +294,31 @@ class BandcampAlbumIE(BandcampIE):  # XXX: Do not subclass from concrete IE
                    'timestamp': 1311757238,
                    'upload_date': '20110727',
                    'uploader': 'Blazo',
-                }
+                    'track': 'Kero One - Keep It Alive (Blazo remix)',
+                    'release_date': '20110727',
+                    'track_id': '38097443',
+                    'track_number': 2,
+                    'duration': 181.467,
+                    'uploader_url': 'https://blazo.bandcamp.com',
+                    'album': 'Jazz Format Mixtape vol.1',
+                    'uploader_id': 'blazo',
+                    'album_artists': ['Blazo'],
+                    'artists': ['Blazo'],
+                    'thumbnail': 'https://f4.bcbits.com/img/a1721150828_5.jpg',
+                    'release_timestamp': 1311724800.0,
+                },
            },
        ],
        'info_dict': {
            'title': 'Jazz Format Mixtape vol.1',
            'id': 'jazz-format-mixtape-vol-1',
            'uploader_id': 'blazo',
+            'description': 'md5:38052a93217f3ffdc033cd5dbbce2989',
        },
        'params': {
-            'playlistend': 2
+            'playlistend': 2,
        },
-        'skip': 'Bandcamp imposes download limits.'
+        'skip': 'Bandcamp imposes download limits.',
    }, {
        'url': 'http://nightbringer.bandcamp.com/album/hierophany-of-the-open-grave',
        'info_dict': {
@@ -324,7 +353,7 @@ class BandcampAlbumIE(BandcampIE):  # XXX: Do not subclass from concrete IE
    def suitable(cls, url):
        return (False
                if BandcampWeeklyIE.suitable(url) or BandcampIE.suitable(url)
-                else super(BandcampAlbumIE, cls).suitable(url))
+                else super().suitable(url))

    def _real_extract(self, url):
        uploader_id, album_id = self._match_valid_url(url).groups()
@@ -359,10 +388,10 @@ class BandcampWeeklyIE(BandcampIE):  # XXX: Do not subclass from concrete IE
    _VALID_URL = r'https?://(?:www\.)?bandcamp\.com/?\?(?:.*?&)?show=(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://bandcamp.com/?show=224',
-        'md5': 'b00df799c733cf7e0c567ed187dea0fd',
+        'md5': '61acc9a002bed93986b91168aa3ab433',
        'info_dict': {
            'id': '224',
-            'ext': 'opus',
+            'ext': 'mp3',
            'title': 'BC Weekly April 4th 2017 - Magic Moments',
            'description': 'md5:5d48150916e8e02d030623a48512c874',
            'duration': 5829.77,
@@ -372,11 +401,11 @@ class BandcampWeeklyIE(BandcampIE):  # XXX: Do not subclass from concrete IE
            'episode_id': '224',
        },
        'params': {
-            'format': 'opus-lo',
+            'format': 'mp3-128',
        },
    }, {
        'url': 'https://bandcamp.com/?blah/blah@&show=228',
-        'only_matching': True
+        'only_matching': True,
    }]

    def _real_extract(self, url):
@@ -407,7 +436,7 @@ class BandcampWeeklyIE(BandcampIE):  # XXX: Do not subclass from concrete IE
        title = show.get('audio_title') or 'Bandcamp Weekly'
        subtitle = show.get('subtitle')
        if subtitle:
-            title += ' - %s' % subtitle
+            title += f' - {subtitle}'

        return {
            'id': show_id,
@@ -419,7 +448,7 @@ class BandcampWeeklyIE(BandcampIE):  # XXX: Do not subclass from concrete IE
            'series': 'Bandcamp Weekly',
            'episode': show.get('subtitle'),
            'episode_id': show_id,
-            'formats': formats
+            'formats': formats,
        }


@@ -440,7 +469,7 @@ class BandcampUserIE(InfoExtractor):
        'url': 'http://dotscale.bandcamp.com',
        'info_dict': {
            'id': 'dotscale',
-            'title': 'Discography of dotscale'
+            'title': 'Discography of dotscale',
        },
        'playlist_count': 1,
    }, {
@@ -460,7 +489,7 @@ class BandcampUserIE(InfoExtractor):
        },
    }, {
        'url': 'https://coldworldofficial.bandcamp.com/music',
-        'playlist_mincount': 10,
+        'playlist_mincount': 7,
        'info_dict': {
            'id': 'coldworldofficial',
            'title': 'Discography of coldworldofficial',
@@ -474,12 +503,19 @@ class BandcampUserIE(InfoExtractor):
        },
    }]

+    def _yield_items(self, webpage):
+        yield from (
+            re.findall(r'<li data-item-id=["\'][^>]+>\s*<a href=["\'](?![^"\'/]*?/merch)([^"\']+)', webpage)
+            or re.findall(r'<div[^>]+trackTitle["\'][^"\']+["\']([^"\']+)', webpage))
+
+        yield from traverse_obj(webpage, (
+            {find_element(id='music-grid', html=True)}, {extract_attributes},
+            'data-client-items', {json.loads}, ..., 'page_url', {str}))
+
    def _real_extract(self, url):
        uploader = self._match_id(url)
        webpage = self._download_webpage(url, uploader)

-        discography_data = (re.findall(r'<li data-item-id=["\'][^>]+>\s*<a href=["\'](?![^"\'/]*?/merch)([^"\']+)', webpage)
-                            or re.findall(r'<div[^>]+trackTitle["\'][^"\']+["\']([^"\']+)', webpage))
-
        return self.playlist_from_matches(
-            discography_data, uploader, f'Discography of {uploader}', getter=lambda x: urljoin(url, x))
+            self._yield_items(webpage), uploader, f'Discography of {uploader}',
+            getter=urljoin(url))