Perun: Changed fetching YouTube data from using yt-dlp to an RSS feed #2

Merged
florian merged 1 commit from feature/rss-feed into main 2025-12-08 18:46:50 +01:00
4 changed files with 132 additions and 100 deletions
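In short, the latest upload is now discovered by polling the channel's public video feed with feedparser instead of querying the channel through yt-dlp; yt-dlp is still used for the actual audio download. A minimal sketch of the feed lookup (channel ID taken from the diff below; error handling omitted):

```python
import feedparser

# YouTube exposes a public feed of a channel's latest uploads; entry 0 is the newest video.
feed = feedparser.parse("https://www.youtube.com/feeds/videos.xml?channel_id=UCC3ehuUksTyQ7bbjGntmx3Q")
latest = feed.entries[0]
print(latest["yt_videoid"], latest["title"], latest["link"])
```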

View File

@@ -4,9 +4,9 @@ import os
 import time
 from dotenv import load_dotenv
 from ssh_helper import upload_via_sftp, send_notification_via_ssh
-from youtube_handler import get_url_for_latest_video, get_youtube_data, return_download_options, check_for_sponsorblock_segments
+from youtube_handler import return_download_options, check_for_sponsorblock_segments
 from simple_logger_handler import setup_logger
+from rss_feed_handler import grab_latest_chapter_information, EpisodeData
 
 logger = setup_logger(__name__)
 load_dotenv()
@@ -14,14 +14,14 @@ PODCAST_AUTHORIZATION_TOKEN = os.getenv("PODCAST_AUTHORIZATION_TOKEN")
 PODCAST_API_URL = os.getenv("PODCAST_API_URL")
 
-def get_audiobookshelf_data() -> tuple[int | None, str | None]:
+def get_audiobookshelf_data() -> tuple[str | None, str | None]:
     """
     Fetches the latest episode data from the Audiobookshelf API.
 
     Returns:
         tuple[int | None, str | None]:
-            - The track number as an integer (or None if data could not be fetched due to retryable errors).
-            - The episode title as a string (or None if data could not be fetched due to retryable errors).
+            - The track number as a string (or None if data could not be fetched due to retryable errors).
+            - The YouTube episode id as a string (or None if data could not be fetched due to retryable errors).
 
     Raises:
         requests.exceptions.HTTPError:
@@ -40,10 +40,10 @@ def get_audiobookshelf_data() -> tuple[int | None, str | None]:
         result = response.json()
         audiobookshelf_track = result["media"]["episodes"][-1]["audioFile"]["metaTags"]["tagTrack"]
-        audiobookshelf_title = result["media"]["episodes"][-1]["audioFile"]["metaTags"]["tagTitle"]
-        logger.debug(f"[Audiobookshelf] Fetched Audiobookshelf data: track={audiobookshelf_track}, title={audiobookshelf_title}")
-        return (audiobookshelf_track, audiobookshelf_title)
+        audiobookshelf_ytid = result["media"]["episodes"][-1]["audioFile"]["metaTags"]["tagDescription"]
+        logger.debug(f"[Audiobookshelf] Fetched Audiobookshelf data: track={audiobookshelf_track}, ytid={audiobookshelf_ytid}")
+        return (audiobookshelf_track, audiobookshelf_ytid)
     except requests.exceptions.ConnectionError as e:
         logger.warning(f"[Audiobookshelf] Connection error, will retry: {e}")
@@ -60,57 +60,56 @@ def get_audiobookshelf_data() -> tuple[int | None, str | None]:
             logger.error(f"[Audiobookshelf] HTTP error {status}, not retrying: {e}")
             raise
 
-def check_until_new_episode_gets_released() -> tuple[int | None, dict | None, str | None]:
+def check_until_new_episode_gets_released() -> tuple[EpisodeData | None, str | None]:
     """
     Polls YouTube every hour for a new episode and compares it to the available episode on Audiobookshelf.
     Stops after 72 hours.
 
     Returns:
-        tuple[int | None, dict | None, str | None]:
-            - Track number from Audiobookshelf
-            - Episode info dictionary from YouTube
-            - Episode URL
-        Returns (None, None, None) if no new episode found within timeout
+        tuple[EpisodeData | None, str | None]:
+            - EpisodeData with information about the date,description,link,title and YouTube ID
+            - Track number from Audiobookshelf
+        Returns (None, None) if no new episode found within timeout
     """
     CHECK_INTERVAL_HOURS = 1
    MAX_HOURS = 72
 
     for attempt in range(1, MAX_HOURS + 1):
         logger.debug(f"[EpisodeCheck] Waiting for a new episode to be released, attempt: {attempt}/{MAX_HOURS}")
-        audiobookshelf_track, audiobookshelf_title = get_audiobookshelf_data()
-        if audiobookshelf_track is None or audiobookshelf_title is None:
+        audiobookshelf_track, audiobookshelf_ytid = get_audiobookshelf_data()
+        if audiobookshelf_track is None or audiobookshelf_ytid is None:
             logger.warning("[EpisodeCheck] Unable to fetch Audiobookshelf data, retrying in 1 hour.")
             time.sleep(CHECK_INTERVAL_HOURS * 3600)
             continue
-        episode_url = get_url_for_latest_video()
-        if episode_url is None:
-            logger.warning("[EpisodeCheck] Unable to fetch latest video URL, retrying in 1 hour.")
+        try:
+            episode_data = grab_latest_chapter_information("UCC3ehuUksTyQ7bbjGntmx3Q")
+        except Exception as e:
+            logger.warning(f"[EpisodeCheck] Failed to fetch latest video data: {e}, retrying in 1 hour.")
             time.sleep(CHECK_INTERVAL_HOURS * 3600)
             continue
-        episode_info = get_youtube_data(episode_url)
-        if not episode_info:
-            logger.warning("[EpisodeCheck] Unable to fetch video metadata, retrying in 1 hour.")
+        if episode_data is None:
+            logger.warning("[EpisodeCheck] Unable to fetch latest video data, retrying in 1 hour.")
             time.sleep(CHECK_INTERVAL_HOURS * 3600)
             continue
-        if audiobookshelf_title != episode_info["title"]:
-            logger.info(f"[EpisodeCheck] Latest YouTube episode: {episode_info['title']}")
-            return (audiobookshelf_track,episode_info,episode_url)
-        logger.debug("[EpisodeCheck] No new episode found, retrying in 1 hour.")
+        if audiobookshelf_ytid != episode_data.episode_ytid:
+            logger.info(f"[EpisodeCheck] Latest YouTube episode: {episode_data.episode_title}")
+            return episode_data, audiobookshelf_track
+        logger.info("[EpisodeCheck] No new episode found, retrying in 1 hour.")
         time.sleep(CHECK_INTERVAL_HOURS * 3600)
 
     logger.warning("[EpisodeCheck] No new episode found after maximum attempts.")
-    return (None, None, None)
+    return None, None
 
-def wait_for_sponsorblock_segments_to_be_added(episode_url) -> bool:
+def wait_for_sponsorblock_segments_to_be_added(episode_link) -> bool:
     """
     Polls SponsorBlock for segments on the current video until found or until max attempts.
 
     Args:
-        episode_url: YouTube video URL to check for SponsorBlock segments
+        episode_link: YouTube video URL to check for SponsorBlock segments
 
     Returns:
         True if segments found, False otherwise
@@ -119,7 +118,7 @@ def wait_for_sponsorblock_segments_to_be_added(episode_url) -> bool:
     MAX_HOURS = 24
 
     for attempt in range(1, MAX_HOURS + 1):
         logger.debug(f"[SponsorBlock] Waiting for SponsorBlock to be added, attempt: {attempt}/{MAX_HOURS} ")
-        segments = check_for_sponsorblock_segments(episode_url)
+        segments = check_for_sponsorblock_segments(episode_link)
         if segments:
             logger.debug("[SponsorBlock] Segments found, exiting loop.")
@@ -138,9 +137,9 @@ def download_episode() -> None:
     logger.info("[App] Starting Perun")
 
     try:
-        audiobookshelf_track,episode_info,episode_url = check_until_new_episode_gets_released()
-        if audiobookshelf_track is None or episode_info is None or episode_url is None:
+        episode_data,audiobookshelf_track = check_until_new_episode_gets_released()
+        if episode_data is None or audiobookshelf_track is None:
             logger.error("[App] Failed to find new episode within timeout period")
             return
@@ -150,23 +149,22 @@ def download_episode() -> None:
         return
 
     try:
-        episode_description = episode_info.get("description", "")
-        if "sponsored" in episode_description.lower():
+        if "sponsored" in episode_data.episode_description.lower():
             logger.debug("[App] Sponsored segments found in description, waiting for SponsorBlock")
-            wait_for_sponsorblock_segments_to_be_added(episode_url)
+            wait_for_sponsorblock_segments_to_be_added(episode_data.episode_link)
         else:
             logger.debug("[App] No sponsored segments found in description")
     except Exception as e:
         logger.warning(f"[App] Failed during SponsorBlock wait: {e}", exc_info=True)
 
     try:
-        track = str(int(audiobookshelf_track) + 1).zfill(4)
+        episode_data.episode_number = str(int(audiobookshelf_track) + 1).zfill(4)
     except (ValueError,TypeError) as e:
         logger.warning(f"[App] Failed incrementing audiobookshelf track: {e}", exc_info=True)
         return
 
     try:
-        options = return_download_options(episode_info,track)
+        options = return_download_options(episode_data)
     except Exception as e:
         logger.error(f"[App] Failed to generate download options: {e}", exc_info=True)
         return
@@ -174,17 +172,17 @@ def download_episode() -> None:
     logger.info("[App] Downloading new episode")
     try:
         with yt_dlp.YoutubeDL(options) as episode:
-            episode.download(episode_url)
+            episode.download(episode_data.episode_link)
             logger.debug("[App] Download completed successfully")
     except Exception as e:
         logger.error(f"[App] Failed to download episode: {e}", exc_info=True)
         return
 
     logger.info("[App] Uploading episode via SFTP")
-    upload_via_sftp(f"perun-{episode_info['date']}.mp3")
+    upload_via_sftp(f"perun-{episode_data.episode_date}.mp3")
 
     logger.info("[App] Sending release notification")
-    send_notification_via_ssh(f"Perun episode {track} has been released",episode_info["title"])
+    send_notification_via_ssh(f"Perun episode {episode_data.episode_number} has been released",episode_data.episode_title)
 
     logger.info("[App] Workflow complete")

View File

@@ -6,6 +6,7 @@ cffi==2.0.0
 charset-normalizer==3.4.3
 cryptography==46.0.2
 dotenv==0.9.9
+feedparser==6.0.12
 idna==3.10
 invoke==2.2.0
 mutagen==1.47.0
@@ -15,7 +16,8 @@ pycryptodomex==3.23.0
 PyNaCl==1.6.0
 python-dotenv==1.1.1
 requests==2.32.5
+sgmllib3k==1.0.0
+simple-logger-handler==0.1.0
 sponsorblock.py==0.2.3
 urllib3==2.5.0
 websockets==15.0.1
-simple-logger-handler==0.1.0

View File

@@ -0,0 +1,85 @@
+import feedparser
+from simple_logger_handler import setup_logger
+import time
+from urllib.error import URLError
+from typing import Optional
+from dataclasses import dataclass
+from datetime import datetime
+
+
+@dataclass
+class EpisodeData:
+    episode_date: str
+    episode_description: str
+    episode_link: str
+    episode_number: str
+    episode_title: str
+    episode_ytid: str
+
+
+logger = setup_logger(__name__)
+
+
+def grab_latest_chapter_information(id: str, max_retries: int = 3) -> Optional[EpisodeData]:
+    """
+    Fetches the latest episode's information from a YouTube RSS feed, with retries on network-related errors.
+
+    Parameters:
+        id: YouTube channel ID as a string.
+        max_retries: Number of retry attempts if fetching the feed fails due to network issues.
+
+    Returns:
+        EpisodeData: A dataclass containing episode metadata:
+            episode_date: Date when it was published in ISO format (2025-11-30).
+            episode_description: Episode description.
+            episode_link: YouTube link.
+            episode_number: Episode number.
+            episode_title: Episode title.
+            episode_ytid: Episode YouTube ID.
+        Returns None if the feed has no entries or all retries are exhausted.
+
+    Raises:
+        ValueError: If the feed has no entries.
+        Other network-related exceptions: If fetching fails after retries.
+    """
+    rss_feed_url = f"https://www.youtube.com/feeds/videos.xml?channel_id={id}"
+    attempt = 1
+    while attempt <= max_retries:
+        logger.debug(f"[Feed] Parsing feed URL: {rss_feed_url} (attempt {attempt}/{max_retries})")
+        try:
+            feed = feedparser.parse(rss_feed_url)
+            if not feed.entries:
+                logger.warning(f"[Feed] No entries found for feed {id}")
+                return None
+            latest_chapter_data = feed["entries"][0]
+            episode_link = latest_chapter_data["link"]
+            episode_title = latest_chapter_data["title"]
+            episode_description = latest_chapter_data["summary"]
+            episode_date = latest_chapter_data["published"]
+            episode_date = datetime.fromisoformat(episode_date).date().isoformat()
+            episode_ytid = latest_chapter_data["yt_videoid"]
+            logger.info(f"[Feed] Latest episode '{episode_title}': {episode_link}")
+            logger.debug(f"[Feed] Latest episode '{episode_title}' (YouTubeId {episode_ytid}): {episode_link} -> {episode_description}")
+            return EpisodeData(
+                episode_date=episode_date,
+                episode_description=episode_description,
+                episode_link=episode_link,
+                episode_number="",
+                episode_title=episode_title,
+                episode_ytid=episode_ytid
+            )
+        except (URLError, OSError) as e:
+            logger.warning(f"[Feed] Network error on attempt {attempt} for feed {id}: {e}")
+            if attempt == max_retries:
+                logger.error(f"[Feed] All {max_retries} attempts failed for feed {id}")
+                return None
+            backoff = 2 ** (attempt - 1)
+            logger.debug(f"[Feed] Retrying in {backoff} seconds...")
+            time.sleep(backoff)
+            attempt += 1
+
+
+if __name__ == "__main__":
+    print(grab_latest_chapter_information("UCC3ehuUksTyQ7bbjGntmx3Q"))
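One assumption in the date handling above: feedparser exposes the feed's published timestamp as a string, and YouTube's feed provides an ISO 8601 value with an explicit UTC offset (e.g. 2025-11-30T17:00:03+00:00), which datetime.fromisoformat can parse; a timestamp ending in a literal "Z" would only parse this way on Python 3.11+. A quick illustrative check:

```python
from datetime import datetime

# Illustrative timestamp in the shape YouTube's feed typically provides (not from a real entry).
published = "2025-11-30T17:00:03+00:00"
print(datetime.fromisoformat(published).date().isoformat())  # -> 2025-11-30
```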

View File

@@ -13,59 +13,6 @@ logger = setup_logger(__name__)
 load_dotenv()
 YOUTUBE_CHANNEL_URL = os.getenv("YOUTUBE_CHANNEL_URL")
 
-def get_url_for_latest_video():
-    """
-    Fetch the URL of the latest video from a YouTube channel.
-    """
-    logger.info("[YouTube] Fetching latest video URL from YouTube channel")
-    options = {
-        "extract_flat": True,
-        "playlist_items": "1",
-        "quiet": True,
-        "forcejson": True,
-        "simulate": True,
-    }
-    try:
-        with open(os.devnull, "w") as devnull, contextlib.redirect_stdout(devnull):
-            with yt_dlp.YoutubeDL(options) as video:
-                info_dict = video.extract_info(YOUTUBE_CHANNEL_URL, download=False)
-    except Exception as e:
-        logger.error(f"[YouTube] Failed to fetch latest video info: {e}", exc_info=True)
-        return None
-
-    if "entries" in info_dict and len(info_dict["entries"]) > 0:
-        latest_url = info_dict["entries"][0]["url"]
-        logger.debug(f"[YouTube] Latest video URL found: {latest_url}")
-        return latest_url
-    else:
-        logger.warning("[YouTube] No entries found in channel feed")
-        return None
-
-def get_youtube_data(url: str) -> dict:
-    """
-    Fetch metadata for a given YouTube video URL.
-    """
-    logger.info(f"Fetching YouTube metadata for video: {url}")
-    try:
-        with yt_dlp.YoutubeDL({"quiet": True, "noprogress": True}) as video:
-            info_dict = video.extract_info(url, download=False)
-    except Exception as e:
-        logger.error(f"[YouTube] Failed to fetch YouTube video info for {url}: {e}", exc_info=True)
-        return {}
-
-    video_data = {
-        "date": datetime.datetime.fromtimestamp(
-            info_dict["timestamp"], datetime.timezone.utc
-        ).strftime("%Y-%m-%d"),
-        "title": info_dict["title"],
-        "description": info_dict.get("description", "")
-    }
-    logger.debug(f"[YouTube] Fetched video data: {json.dumps(video_data, indent=4)}")
-    return video_data
-
 def check_for_sponsorblock_segments(youtube_video:str) -> bool:
     client = sb.Client()
     try:
@@ -78,14 +25,14 @@ def check_for_sponsorblock_segments(youtube_video:str) -> bool:
         logger.info(f"[SponsorBlock] SponsorBlock segments found for video: {youtube_video}")
         return True
 
-def return_download_options(information:dict,track:str)->dict:
+def return_download_options(episode_data)->dict:
     download_options = {
         "quiet": True,
         "noprogress": True,
         "format": "bestaudio/best",
         "extract_audio": True,
         "audio_format": "mp3",
-        "outtmpl": f"perun-{information['date']}.%(ext)s",
+        "outtmpl": f"perun-{episode_data.episode_date}.%(ext)s",
         "addmetadata": True,
         "postprocessors":[
             {"api": "https://sponsor.ajay.app",
@@ -109,12 +56,12 @@ def return_download_options(information:dict,track:str)->dict:
             "key": "FFmpegMetadata",
         }],
         "postprocessor_args": [
-            "-metadata", f"title={information['title']}",
+            "-metadata", f"title={episode_data.episode_title}",
             "-metadata", "artist=Perun",
-            "-metadata", f"track={track}",
-            "-metadata", f"date={information['date']}",
-            "-metadata", f"comment={return_string_as_html(information['description'])}",
-            "-metadata", f"description={return_string_as_html(information['description'])}",
+            "-metadata", f"track={episode_data.episode_number}",
+            "-metadata", f"date={episode_data.episode_date}",
+            "-metadata", f"comment={return_string_as_html(episode_data.episode_description)}",
+            "-metadata", f"description={episode_data.episode_ytid}",
         ],
         "merge_output_format": "mp3"
     }
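For reference, a minimal sketch of how the refactored return_download_options is fed from the new dataclass (field values below are illustrative placeholders, not taken from a real feed entry):

```python
from rss_feed_handler import EpisodeData
from youtube_handler import return_download_options

# Illustrative values; in the app these come from the RSS feed and Audiobookshelf.
episode = EpisodeData(
    episode_date="2025-11-30",
    episode_description="Episode description",
    episode_link="https://www.youtube.com/watch?v=XXXXXXXXXXX",
    episode_number="0123",
    episode_title="Episode title",
    episode_ytid="XXXXXXXXXXX",
)
options = return_download_options(episode)
# options["outtmpl"] -> "perun-2025-11-30.%(ext)s"
```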