From 0af3d0c7d9d6a570393853c0cb07284436cfefc5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Florian=20G=C3=A4nsejunge?=
Date: Mon, 8 Dec 2025 18:39:17 +0100
Subject: [PATCH] Perun: Fetch YouTube episode data via the channel RSS feed
 instead of yt-dlp

---
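A quick way to sanity-check the feed-based approach from a REPL (a minimal
sketch, not applied by this patch; the channel ID is the one hardcoded in
get_episode.py):

    import feedparser

    # YouTube serves each channel's recent uploads as an Atom feed.
    feed = feedparser.parse(
        "https://www.youtube.com/feeds/videos.xml?channel_id=UCC3ehuUksTyQ7bbjGntmx3Q"
    )
    latest = feed.entries[0]
    print(latest.title)       # episode title
    print(latest.link)        # watch URL
    print(latest.yt_videoid)  # bare video ID (what gets stored in tagDescription)
    print(latest.published)   # ISO 8601 timestamp, e.g. 2025-12-08T17:39:17+00:00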
 src/perun/get_episode.py      | 79 +++++++++++++++----------------
 src/perun/requirements.txt    |  4 +-
 src/perun/rss_feed_handler.py | 94 ++++++++++++++++++++++++++++++++++
 src/perun/youtube_handler.py  | 70 +++------------------------
 4 files changed, 147 insertions(+), 100 deletions(-)
 create mode 100644 src/perun/rss_feed_handler.py

diff --git a/src/perun/get_episode.py b/src/perun/get_episode.py
index b1d7098..ce9f304 100644
--- a/src/perun/get_episode.py
+++ b/src/perun/get_episode.py
@@ -4,9 +4,10 @@ import os
 import time
 from dotenv import load_dotenv
 from ssh_helper import upload_via_sftp, send_notification_via_ssh
-from youtube_handler import get_url_for_latest_video, get_youtube_data, return_download_options, check_for_sponsorblock_segments
+from youtube_handler import return_download_options, check_for_sponsorblock_segments
 from simple_logger_handler import setup_logger
+from rss_feed_handler import grab_latest_chapter_information, EpisodeData
 
 logger = setup_logger(__name__)
 load_dotenv()
 
@@ -14,14 +15,14 @@ PODCAST_AUTHORIZATION_TOKEN = os.getenv("PODCAST_AUTHORIZATION_TOKEN")
 PODCAST_API_URL = os.getenv("PODCAST_API_URL")
 
 
-def get_audiobookshelf_data() -> tuple[int | None, str | None]:
+def get_audiobookshelf_data() -> tuple[str | None, str | None]:
     """
     Fetches the latest episode data from the Audiobookshelf API.
 
     Returns:
-        tuple[int | None, str | None]:
-            - The track number as an integer (or None if data could not be fetched due to retryable errors).
-            - The episode title as a string (or None if data could not be fetched due to retryable errors).
+        tuple[str | None, str | None]:
+            - The track number as a string (or None if data could not be fetched due to retryable errors).
+            - The YouTube episode ID as a string (or None if data could not be fetched due to retryable errors).
 
     Raises:
         requests.exceptions.HTTPError:
@@ -40,10 +41,10 @@ def get_audiobookshelf_data() -> tuple[int | None, str | None]:
         result = response.json()
 
         audiobookshelf_track = result["media"]["episodes"][-1]["audioFile"]["metaTags"]["tagTrack"]
-        audiobookshelf_title = result["media"]["episodes"][-1]["audioFile"]["metaTags"]["tagTitle"]
+        audiobookshelf_ytid = result["media"]["episodes"][-1]["audioFile"]["metaTags"]["tagDescription"]
 
-        logger.debug(f"[Audiobookshelf] Fetched Audiobookshelf data: track={audiobookshelf_track}, title={audiobookshelf_title}")
-        return (audiobookshelf_track, audiobookshelf_title)
+        logger.debug(f"[Audiobookshelf] Fetched Audiobookshelf data: track={audiobookshelf_track}, ytid={audiobookshelf_ytid}")
+        return (audiobookshelf_track, audiobookshelf_ytid)
 
     except requests.exceptions.ConnectionError as e:
         logger.warning(f"[Audiobookshelf] Connection error, will retry: {e}")
@@ -60,57 +61,58 @@ def get_audiobookshelf_data() -> tuple[int | None, str | None]:
             logger.error(f"[Audiobookshelf] HTTP error {status}, not retrying: {e}")
             raise
 
-def check_until_new_episode_gets_released() -> tuple[int | None, dict | None, str | None]:
+def check_until_new_episode_gets_released() -> tuple[EpisodeData | None, str | None]:
     """
     Polls YouTube every hour for a new episode and compares it to the available episode on Audiobookshelf.
     Stops after 72 hours.
 
     Returns:
-        tuple[int | None, dict | None, str | None]:
+        tuple[EpisodeData | None, str | None]:
+            - EpisodeData with the episode's date, description, link, title, and YouTube ID
             - Track number from Audiobookshelf
-            - Episode info dictionary from YouTube
-            - Episode URL
-        Returns (None, None, None) if no new episode found within timeout
+        Returns (None, None) if no new episode found within timeout
     """
     CHECK_INTERVAL_HOURS = 1
     MAX_HOURS = 72
     for attempt in range(1, MAX_HOURS + 1):
         logger.debug(f"[EpisodeCheck] Waiting for a new episode to be released, attempt: {attempt}/{MAX_HOURS}")
-        audiobookshelf_track, audiobookshelf_title = get_audiobookshelf_data()
+        audiobookshelf_track, audiobookshelf_ytid = get_audiobookshelf_data()
 
-        if audiobookshelf_track is None or audiobookshelf_title is None:
+        if audiobookshelf_track is None or audiobookshelf_ytid is None:
             logger.warning("[EpisodeCheck] Unable to fetch Audiobookshelf data, retrying in 1 hour.")
             time.sleep(CHECK_INTERVAL_HOURS * 3600)
             continue
 
-        episode_url = get_url_for_latest_video()
-        if episode_url is None:
-            logger.warning("[EpisodeCheck] Unable to fetch latest video URL, retrying in 1 hour.")
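+        # "UCC3ehuUksTyQ7bbjGntmx3Q" is the Perun channel's YouTube ID; the
+        # RSS helper builds the feed URL from it.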
+        try:
+            episode_data = grab_latest_chapter_information("UCC3ehuUksTyQ7bbjGntmx3Q")
+        except Exception as e:
+            logger.warning(f"[EpisodeCheck] Failed to fetch latest video data: {e}, retrying in 1 hour.")
             time.sleep(CHECK_INTERVAL_HOURS * 3600)
             continue
 
-        episode_info = get_youtube_data(episode_url)
-        if not episode_info:
-            logger.warning("[EpisodeCheck] Unable to fetch video metadata, retrying in 1 hour.")
+        if episode_data is None:
+            logger.warning("[EpisodeCheck] Unable to fetch latest video data, retrying in 1 hour.")
             time.sleep(CHECK_INTERVAL_HOURS * 3600)
             continue
 
-        if audiobookshelf_title != episode_info["title"]:
-            logger.info(f"[EpisodeCheck] Latest YouTube episode: {episode_info['title']}")
-            return (audiobookshelf_track,episode_info,episode_url)
+        if audiobookshelf_ytid != episode_data.episode_ytid:
+            logger.info(f"[EpisodeCheck] Latest YouTube episode: {episode_data.episode_title}")
+            return episode_data, audiobookshelf_track
 
-        logger.debug("[EpisodeCheck] No new episode found, retrying in 1 hour.")
+        logger.info("[EpisodeCheck] No new episode found, retrying in 1 hour.")
         time.sleep(CHECK_INTERVAL_HOURS * 3600)
 
     logger.warning("[EpisodeCheck] No new episode found after maximum attempts.")
-    return (None, None, None)
+    return None, None
 
-def wait_for_sponsorblock_segments_to_be_added(episode_url) -> bool:
+def wait_for_sponsorblock_segments_to_be_added(episode_link) -> bool:
     """
     Polls SponsorBlock for segments on the current video until found or until max attempts.
 
     Args:
-        episode_url: YouTube video URL to check for SponsorBlock segments
+        episode_link: YouTube video URL to check for SponsorBlock segments
 
     Returns:
         True if segments found, False otherwise
@@ -119,7 +121,7 @@
     MAX_HOURS = 24
     for attempt in range(1, MAX_HOURS + 1):
         logger.debug(f"[SponsorBlock] Waiting for SponsorBlock to be added, attempt: {attempt}/{MAX_HOURS} ")
-        segments = check_for_sponsorblock_segments(episode_url)
+        segments = check_for_sponsorblock_segments(episode_link)
 
         if segments:
             logger.debug("[SponsorBlock] Segments found, exiting loop.")
@@ -138,9 +140,9 @@ def download_episode() -> None:
     logger.info("[App] Starting Perun")
 
     try:
-        audiobookshelf_track,episode_info,episode_url = check_until_new_episode_gets_released()
+        episode_data, audiobookshelf_track = check_until_new_episode_gets_released()
 
-        if audiobookshelf_track is None or episode_info is None or episode_url is None:
+        if episode_data is None or audiobookshelf_track is None:
             logger.error("[App] Failed to find new episode within timeout period")
             return
 
@@ -150,23 +152,22 @@
     try:
-        episode_description = episode_info.get("description", "")
-        if "sponsored" in episode_description.lower():
+        if "sponsored" in episode_data.episode_description.lower():
             logger.debug("[App] Sponsored segments found in description, waiting for SponsorBlock")
-            wait_for_sponsorblock_segments_to_be_added(episode_url)
+            wait_for_sponsorblock_segments_to_be_added(episode_data.episode_link)
         else:
             logger.debug("[App] No sponsored segments found in description")
     except Exception as e:
         logger.warning(f"[App] Failed during SponsorBlock wait: {e}", exc_info=True)
 
     try:
-        track = str(int(audiobookshelf_track) + 1).zfill(4)
+        episode_data.episode_number = str(int(audiobookshelf_track) + 1).zfill(4)
     except (ValueError,TypeError) as e:
         logger.warning(f"[App] Failed incrementing audiobookshelf track: {e}", exc_info=True)
         return
 
     try:
-        options = return_download_options(episode_info,track)
+        options = return_download_options(episode_data)
     except Exception as e:
         logger.error(f"[App] Failed to generate download options: {e}", exc_info=True)
         return
 
@@ -174,17 +175,17 @@
     logger.info("[App] Downloading new episode")
     try:
         with yt_dlp.YoutubeDL(options) as episode:
-            episode.download(episode_url)
+            episode.download(episode_data.episode_link)
             logger.debug("[App] Download completed successfully")
     except Exception as e:
         logger.error(f"[App] Failed to download episode: {e}", exc_info=True)
         return
 
     logger.info("[App] Uploading episode via SFTP")
-    upload_via_sftp(f"perun-{episode_info['date']}.mp3")
+    upload_via_sftp(f"perun-{episode_data.episode_date}.mp3")
 
     logger.info("[App] Sending release notification")
-    send_notification_via_ssh(f"Perun episode {track} has been released",episode_info["title"])
+    send_notification_via_ssh(f"Perun episode {episode_data.episode_number} has been released", episode_data.episode_title)
 
     logger.info("[App] Workflow complete")
 
diff --git a/src/perun/requirements.txt b/src/perun/requirements.txt
index 0af6a6b..76e0367 100644
--- a/src/perun/requirements.txt
+++ b/src/perun/requirements.txt
@@ -6,6 +6,7 @@ cffi==2.0.0
 charset-normalizer==3.4.3
 cryptography==46.0.2
 dotenv==0.9.9
+feedparser==6.0.12
 idna==3.10
 invoke==2.2.0
 mutagen==1.47.0
@@ -15,7 +16,8 @@ pycryptodomex==3.23.0
 PyNaCl==1.6.0
 python-dotenv==1.1.1
 requests==2.32.5
+sgmllib3k==1.0.0
+simple-logger-handler==0.1.0
 sponsorblock.py==0.2.3
 urllib3==2.5.0
 websockets==15.0.1
-simple-logger-handler==0.1.0
\ No newline at end of file
diff --git a/src/perun/rss_feed_handler.py b/src/perun/rss_feed_handler.py
new file mode 100644
index 0000000..cca78fb
--- /dev/null
+++ b/src/perun/rss_feed_handler.py
@@ -0,0 +1,94 @@
+import time
+from dataclasses import dataclass
+from datetime import datetime
+from typing import Optional
+from urllib.error import URLError
+
+import feedparser
+from simple_logger_handler import setup_logger
+
+logger = setup_logger(__name__)
+
+
+@dataclass
+class EpisodeData:
+    episode_date: str
+    episode_description: str
+    episode_link: str
+    episode_number: str
+    episode_title: str
+    episode_ytid: str
+
+
+def grab_latest_chapter_information(channel_id: str, max_retries: int = 3) -> Optional[EpisodeData]:
+    """
+    Fetches the latest episode's information from a YouTube RSS feed, with retries on network-related errors.
+
+    Args:
+        channel_id: YouTube channel ID as a string.
+        max_retries: Number of retry attempts if fetching the feed fails due to network issues.
+
+    Returns:
+        EpisodeData: A dataclass containing episode metadata:
+            episode_date: Date the episode was published, in ISO format (e.g. 2025-11-30).
+            episode_description: Episode description.
+            episode_link: YouTube watch URL.
+            episode_number: Episode number (left empty here; filled in by the caller).
+            episode_title: Episode title.
+            episode_ytid: Episode YouTube ID.
+        Returns None if the feed has no entries or all retries are exhausted.
+
+    Raises:
+        KeyError or ValueError from unexpected feed content is not caught and propagates to the caller.
+    """
+
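+    # YouTube publishes a channel's recent uploads (typically the latest 15
+    # entries) as an Atom feed at this well-known URL.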
+    rss_feed_url = f"https://www.youtube.com/feeds/videos.xml?channel_id={channel_id}"
+
+    for attempt in range(1, max_retries + 1):
+        logger.debug(f"[Feed] Parsing feed URL: {rss_feed_url} (attempt {attempt}/{max_retries})")
+        try:
+            feed = feedparser.parse(rss_feed_url)
+
+            # feedparser does not raise on network failures; it records them on
+            # feed.bozo / feed.bozo_exception, so re-raise those to trigger a retry.
+            if feed.bozo and isinstance(feed.bozo_exception, (URLError, OSError)):
+                raise feed.bozo_exception
+
+            if not feed.entries:
+                logger.warning(f"[Feed] No entries found for feed {channel_id}")
+                return None
+
+            latest_chapter_data = feed.entries[0]
+            episode_link = latest_chapter_data["link"]
+            episode_title = latest_chapter_data["title"]
+            episode_description = latest_chapter_data["summary"]
+            episode_date = datetime.fromisoformat(latest_chapter_data["published"]).date().isoformat()
+            episode_ytid = latest_chapter_data["yt_videoid"]
+
+            logger.info(f"[Feed] Latest episode '{episode_title}': {episode_link}")
+            logger.debug(f"[Feed] Latest episode '{episode_title}' (YouTube ID {episode_ytid}): {episode_link} -> {episode_description}")
+            return EpisodeData(
+                episode_date=episode_date,
+                episode_description=episode_description,
+                episode_link=episode_link,
+                episode_number="",
+                episode_title=episode_title,
+                episode_ytid=episode_ytid
+            )
+
+        except (URLError, OSError) as e:
+            logger.warning(f"[Feed] Network error on attempt {attempt} for feed {channel_id}: {e}")
+            if attempt == max_retries:
+                logger.error(f"[Feed] All {max_retries} attempts failed for feed {channel_id}")
+                return None
+            backoff = 2 ** (attempt - 1)
+            logger.debug(f"[Feed] Retrying in {backoff} seconds...")
+            time.sleep(backoff)
+
+    return None
+
+
+if __name__ == "__main__":
+    print(grab_latest_chapter_information("UCC3ehuUksTyQ7bbjGntmx3Q"))
diff --git a/src/perun/youtube_handler.py b/src/perun/youtube_handler.py
index 29aeb81..54864aa 100644
--- a/src/perun/youtube_handler.py
+++ b/src/perun/youtube_handler.py
@@ -13,59 +13,6 @@ logger = setup_logger(__name__)
 load_dotenv()
 
 YOUTUBE_CHANNEL_URL = os.getenv("YOUTUBE_CHANNEL_URL")
-
-def get_url_for_latest_video():
-    """
-    Fetch the URL of the latest video from a YouTube channel.
-    """
-    logger.info("[YouTube] Fetching latest video URL from YouTube channel")
-    options = {
-        "extract_flat": True,
-        "playlist_items": "1",
-        "quiet": True,
-        "forcejson": True,
-        "simulate": True,
-    }
-
-    try:
-        with open(os.devnull, "w") as devnull, contextlib.redirect_stdout(devnull):
-            with yt_dlp.YoutubeDL(options) as video:
-                info_dict = video.extract_info(YOUTUBE_CHANNEL_URL, download=False)
-    except Exception as e:
-        logger.error(f"[YouTube] Failed to fetch latest video info: {e}", exc_info=True)
-        return None
-
-    if "entries" in info_dict and len(info_dict["entries"]) > 0:
-        latest_url = info_dict["entries"][0]["url"]
-        logger.debug(f"[YouTube] Latest video URL found: {latest_url}")
-        return latest_url
-    else:
-        logger.warning("[YouTube] No entries found in channel feed")
-        return None
-
-def get_youtube_data(url: str) -> dict:
-    """
-    Fetch metadata for a given YouTube video URL.
-    """
-    logger.info(f"Fetching YouTube metadata for video: {url}")
-    try:
-        with yt_dlp.YoutubeDL({"quiet": True, "noprogress": True}) as video:
-            info_dict = video.extract_info(url, download=False)
-    except Exception as e:
-        logger.error(f"[YouTube] Failed to fetch YouTube video info for {url}: {e}", exc_info=True)
-        return {}
-
-    video_data = {
-        "date": datetime.datetime.fromtimestamp(
-            info_dict["timestamp"], datetime.timezone.utc
-        ).strftime("%Y-%m-%d"),
-        "title": info_dict["title"],
-        "description": info_dict.get("description", "")
-    }
-
-    logger.debug(f"[YouTube] Fetched video data: {json.dumps(video_data, indent=4)}")
-    return video_data
-
 def check_for_sponsorblock_segments(youtube_video:str) -> bool:
     client = sb.Client()
     try:
@@ -78,14 +25,14 @@ def check_for_sponsorblock_segments(youtube_video:str) -> bool:
         logger.info(f"[SponsorBlock] SponsorBlock segments found for video: {youtube_video}")
         return True
 
-def return_download_options(information:dict,track:str)->dict:
+def return_download_options(episode_data) -> dict:
     download_options = {
         "quiet": True,
         "noprogress": True,
         "format": "bestaudio/best",
         "extract_audio": True,
         "audio_format": "mp3",
-        "outtmpl": f"perun-{information['date']}.%(ext)s",
+        "outtmpl": f"perun-{episode_data.episode_date}.%(ext)s",
        "addmetadata": True,
         "postprocessors":[
             {"api": "https://sponsor.ajay.app",
@@ -109,12 +56,15 @@
             "key": "FFmpegMetadata",
         }],
         "postprocessor_args": [
-            "-metadata", f"title={information['title']}",
+            "-metadata", f"title={episode_data.episode_title}",
             "-metadata", "artist=Perun",
-            "-metadata", f"track={track}",
-            "-metadata", f"date={information['date']}",
-            "-metadata", f"comment={return_string_as_html(information['description'])}",
-            "-metadata", f"description={return_string_as_html(information['description'])}",
+            "-metadata", f"track={episode_data.episode_number}",
+            "-metadata", f"date={episode_data.episode_date}",
+            "-metadata", f"comment={return_string_as_html(episode_data.episode_description)}",
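+            # Intentional: the description tag stores the bare YouTube video ID;
+            # get_audiobookshelf_data() reads it back from tagDescription to
+            # detect new episodes. The readable text lives in the comment tag.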
+            "-metadata", f"description={episode_data.episode_ytid}",
         ],
         "merge_output_format": "mp3"
     }
-- 
2.43.0