Upgrade yt_dlp and download script

This commit is contained in:
2025-05-02 16:11:08 -05:00
parent 3a2e8eeb08
commit d68d9ce4f9
1194 changed files with 60099 additions and 44436 deletions

View File

@@ -5,7 +5,6 @@ import re
import urllib.parse
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
ExtractorError,
InAdvancePagedList,
@@ -23,7 +22,7 @@ from ..utils import (
)
class PolskieRadioBaseExtractor(InfoExtractor):
class PolskieRadioBaseIE(InfoExtractor):
def _extract_webpage_player_entries(self, webpage, playlist_id, base_data):
media_urls = set()
@@ -37,7 +36,7 @@ class PolskieRadioBaseExtractor(InfoExtractor):
media_urls.add(media_url)
entry = base_data.copy()
entry.update({
'id': compat_str(media['id']),
'id': str(media['id']),
'url': media_url,
'duration': int_or_none(media.get('length')),
'vcodec': 'none' if media.get('provider') == 'audio' else None,
@@ -48,7 +47,7 @@ class PolskieRadioBaseExtractor(InfoExtractor):
yield entry
class PolskieRadioLegacyIE(PolskieRadioBaseExtractor):
class PolskieRadioLegacyIE(PolskieRadioBaseIE):
# legacy sites
IE_NAME = 'polskieradio:legacy'
_VALID_URL = r'https?://(?:www\.)?polskieradio(?:24)?\.pl/\d+/\d+/[Aa]rtykul/(?P<id>\d+)'
@@ -68,7 +67,7 @@ class PolskieRadioLegacyIE(PolskieRadioBaseExtractor):
'timestamp': 1592654400,
'upload_date': '20200620',
'duration': 1430,
'thumbnail': r're:^https?://static\.prsa\.pl/images/.*\.jpg$'
'thumbnail': r're:^https?://static\.prsa\.pl/images/.*\.jpg$',
},
}],
}, {
@@ -128,7 +127,7 @@ class PolskieRadioLegacyIE(PolskieRadioBaseExtractor):
return self.playlist_result(entries, playlist_id, title, description)
class PolskieRadioIE(PolskieRadioBaseExtractor):
class PolskieRadioIE(PolskieRadioBaseIE):
# new next.js sites
_VALID_URL = r'https?://(?:[^/]+\.)?(?:polskieradio(?:24)?|radiokierowcow)\.pl/artykul/(?P<id>\d+)'
_TESTS = [{
@@ -262,14 +261,14 @@ class PolskieRadioAuditionIE(InfoExtractor):
query=query, headers={'x-api-key': '9bf6c5a2-a7d0-4980-9ed7-a3f7291f2a81'})
def _entries(self, playlist_id, has_episodes, has_articles):
for i in itertools.count(1) if has_episodes else []:
for i in itertools.count(0) if has_episodes else []:
page = self._call_lp3(
'AudioArticle/GetListByCategoryId', {
'categoryId': playlist_id,
'PageSize': 10,
'skip': i,
'format': 400,
}, playlist_id, f'Downloading episode list page {i}')
}, playlist_id, f'Downloading episode list page {i + 1}')
if not traverse_obj(page, 'data'):
break
for episode in page['data']:
@@ -281,14 +280,14 @@ class PolskieRadioAuditionIE(InfoExtractor):
'timestamp': parse_iso8601(episode.get('datePublic')),
}
for i in itertools.count(1) if has_articles else []:
for i in itertools.count(0) if has_articles else []:
page = self._call_lp3(
'Article/GetListByCategoryId', {
'categoryId': playlist_id,
'PageSize': 9,
'skip': i,
'format': 400,
}, playlist_id, f'Downloading article list page {i}')
}, playlist_id, f'Downloading article list page {i + 1}')
if not traverse_obj(page, 'data'):
break
for article in page['data']:
@@ -328,14 +327,14 @@ class PolskieRadioCategoryIE(InfoExtractor):
'id': '4143',
'title': 'Kierunek Kraków',
},
'playlist_mincount': 61
'playlist_mincount': 61,
}, {
'url': 'http://www.polskieradio.pl/10,czworka/214,muzyka',
'info_dict': {
'id': '214',
'title': 'Muzyka',
},
'playlist_mincount': 61
'playlist_mincount': 61,
}, {
# billennium tabs
'url': 'https://www.polskieradio.pl/8/2385',
@@ -400,7 +399,7 @@ class PolskieRadioCategoryIE(InfoExtractor):
params = self._search_json(
r'<div[^>]+class=["\']next["\'][^>]*>\s*<a[^>]+onclick=["\']TB_LoadTab\(',
pagination, 'next page params', category_id, default=None, close_objects=1,
contains_pattern='.+', transform_source=lambda x: '[%s' % js_to_json(unescapeHTML(x)))
contains_pattern='.+', transform_source=lambda x: f'[{js_to_json(unescapeHTML(x))}')
if not params:
break
tab_content = self._download_json(
@@ -409,7 +408,7 @@ class PolskieRadioCategoryIE(InfoExtractor):
data=json.dumps(dict(zip((
'boxInstanceId', 'tabId', 'categoryType', 'sectionId', 'categoryId', 'pagerMode',
'subjectIds', 'tagIndexId', 'queryString', 'name', 'openArticlesInParentTemplate',
'idSectionFromUrl', 'maxDocumentAge', 'showCategoryForArticle', 'pageNumber'
'idSectionFromUrl', 'maxDocumentAge', 'showCategoryForArticle', 'pageNumber',
), params))).encode())['d']
content, pagination = tab_content['Content'], tab_content.get('PagerContent')
elif is_post_back:
@@ -511,7 +510,7 @@ class PolskieRadioPlayerIE(InfoExtractor):
})
return {
'id': compat_str(channel['id']),
'id': str(channel['id']),
'formats': formats,
'title': channel.get('name') or channel.get('streamName'),
'display_id': channel_url,
@@ -520,7 +519,7 @@ class PolskieRadioPlayerIE(InfoExtractor):
}
class PolskieRadioPodcastBaseExtractor(InfoExtractor):
class PolskieRadioPodcastBaseIE(InfoExtractor):
_API_BASE = 'https://apipodcasts.polskieradio.pl/api'
def _parse_episode(self, data):
@@ -540,7 +539,7 @@ class PolskieRadioPodcastBaseExtractor(InfoExtractor):
}
class PolskieRadioPodcastListIE(PolskieRadioPodcastBaseExtractor):
class PolskieRadioPodcastListIE(PolskieRadioPodcastBaseIE):
IE_NAME = 'polskieradio:podcast:list'
_VALID_URL = r'https?://podcasty\.polskieradio\.pl/podcast/(?P<id>\d+)'
_TESTS = [{
@@ -579,7 +578,7 @@ class PolskieRadioPodcastListIE(PolskieRadioPodcastBaseExtractor):
}
class PolskieRadioPodcastIE(PolskieRadioPodcastBaseExtractor):
class PolskieRadioPodcastIE(PolskieRadioPodcastBaseIE):
IE_NAME = 'polskieradio:podcast'
_VALID_URL = r'https?://podcasty\.polskieradio\.pl/track/(?P<id>[a-f\d]{8}(?:-[a-f\d]{4}){4}[a-f\d]{8})'
_TESTS = [{
@@ -603,7 +602,7 @@ class PolskieRadioPodcastIE(PolskieRadioPodcastBaseExtractor):
podcast_id, 'Downloading podcast metadata',
data=json.dumps({
'guids': [podcast_id],
}).encode('utf-8'),
}).encode(),
headers={
'Content-Type': 'application/json',
})