Perun: Changed YouTube data fetching from yt-dlp to an RSS feed #2

Merged
florian merged 1 commit from feature/rss-feed into main 2025-12-08 18:46:50 +01:00
4 changed files with 132 additions and 100 deletions

View File

@@ -4,9 +4,9 @@ import os
import time
from dotenv import load_dotenv
from ssh_helper import upload_via_sftp, send_notification_via_ssh
from youtube_handler import get_url_for_latest_video, get_youtube_data, return_download_options, check_for_sponsorblock_segments
from youtube_handler import return_download_options, check_for_sponsorblock_segments
from simple_logger_handler import setup_logger
from rss_feed_handler import grab_latest_chapter_information, EpisodeData
logger = setup_logger(__name__)
load_dotenv()
@@ -14,14 +14,14 @@ PODCAST_AUTHORIZATION_TOKEN = os.getenv("PODCAST_AUTHORIZATION_TOKEN")
PODCAST_API_URL = os.getenv("PODCAST_API_URL")
def get_audiobookshelf_data() -> tuple[int | None, str | None]:
def get_audiobookshelf_data() -> tuple[str | None, str | None]:
"""
Fetches the latest episode data from the Audiobookshelf API.
Returns:
tuple[str | None, str | None]:
- The track number as an integer (or None if data could not be fetched due to retryable errors).
- The episode title as a string (or None if data could not be fetched due to retryable errors).
- The track number as a string (or None if data could not be fetched due to retryable errors).
- The YouTube episode id as a string (or None if data could not be fetched due to retryable errors).
Raises:
requests.exceptions.HTTPError:
@@ -40,10 +40,10 @@ def get_audiobookshelf_data() -> tuple[int | None, str | None]:
result = response.json()
audiobookshelf_track = result["media"]["episodes"][-1]["audioFile"]["metaTags"]["tagTrack"]
audiobookshelf_title = result["media"]["episodes"][-1]["audioFile"]["metaTags"]["tagTitle"]
audiobookshelf_ytid = result["media"]["episodes"][-1]["audioFile"]["metaTags"]["tagDescription"]
logger.debug(f"[Audiobookshelf] Fetched Audiobookshelf data: track={audiobookshelf_track}, title={audiobookshelf_title}")
return (audiobookshelf_track, audiobookshelf_title)
logger.debug(f"[Audiobookshelf] Fetched Audiobookshelf data: track={audiobookshelf_track}, ytid={audiobookshelf_ytid}")
return (audiobookshelf_track, audiobookshelf_ytid)
except requests.exceptions.ConnectionError as e:
logger.warning(f"[Audiobookshelf] Connection error, will retry: {e}")
@@ -60,57 +60,56 @@ def get_audiobookshelf_data() -> tuple[int | None, str | None]:
logger.error(f"[Audiobookshelf] HTTP error {status}, not retrying: {e}")
raise
def check_until_new_episode_gets_released() -> tuple[int | None, dict | None, str | None]:
def check_until_new_episode_gets_released() -> tuple[EpisodeData | None, str | None]:
"""
Polls YouTube every hour for a new episode and compares it to the available episode on Audiobookshelf.
Stops after 72 hours.
Returns:
tuple[int | None, dict | None, str | None]:
tuple[EpisodeData | None, str | None]:
- EpisodeData with information about the date, description, link, title and YouTube ID
- Track number from Audiobookshelf
- Episode info dictionary from YouTube
- Episode URL
Returns (None, None, None) if no new episode found within timeout
Returns (None, None) if no new episode found within timeout
"""
CHECK_INTERVAL_HOURS = 1
MAX_HOURS = 72
for attempt in range(1, MAX_HOURS + 1):
logger.debug(f"[EpisodeCheck] Waiting for a new episode to be released, attempt: {attempt}/{MAX_HOURS}")
audiobookshelf_track, audiobookshelf_title = get_audiobookshelf_data()
audiobookshelf_track, audiobookshelf_ytid = get_audiobookshelf_data()
if audiobookshelf_track is None or audiobookshelf_title is None:
if audiobookshelf_track is None or audiobookshelf_ytid is None:
logger.warning("[EpisodeCheck] Unable to fetch Audiobookshelf data, retrying in 1 hour.")
time.sleep(CHECK_INTERVAL_HOURS * 3600)
continue
episode_url = get_url_for_latest_video()
if episode_url is None:
logger.warning("[EpisodeCheck] Unable to fetch latest video URL, retrying in 1 hour.")
try:
episode_data = grab_latest_chapter_information("UCC3ehuUksTyQ7bbjGntmx3Q")
except Exception as e:
logger.warning(f"[EpisodeCheck] Failed to fetch latest video data: {e}, retrying in 1 hour.")
time.sleep(CHECK_INTERVAL_HOURS * 3600)
continue
episode_info = get_youtube_data(episode_url)
if not episode_info:
logger.warning("[EpisodeCheck] Unable to fetch video metadata, retrying in 1 hour.")
if episode_data is None:
logger.warning("[EpisodeCheck] Unable to fetch latest video data, retrying in 1 hour.")
time.sleep(CHECK_INTERVAL_HOURS * 3600)
continue
if audiobookshelf_title != episode_info["title"]:
logger.info(f"[EpisodeCheck] Latest YouTube episode: {episode_info['title']}")
return (audiobookshelf_track,episode_info,episode_url)
if audiobookshelf_ytid != episode_data.episode_ytid:
logger.info(f"[EpisodeCheck] Latest YouTube episode: {episode_data.episode_title}")
return episode_data, audiobookshelf_track
logger.debug("[EpisodeCheck] No new episode found, retrying in 1 hour.")
logger.info("[EpisodeCheck] No new episode found, retrying in 1 hour.")
time.sleep(CHECK_INTERVAL_HOURS * 3600)
logger.warning("[EpisodeCheck] No new episode found after maximum attempts.")
return (None, None, None)
return None, None
def wait_for_sponsorblock_segments_to_be_added(episode_url) -> bool:
def wait_for_sponsorblock_segments_to_be_added(episode_link) -> bool:
"""
Polls SponsorBlock for segments on the current video until found or until max attempts.
Args:
episode_url: YouTube video URL to check for SponsorBlock segments
episode_link: YouTube video URL to check for SponsorBlock segments
Returns:
True if segments found, False otherwise
@@ -119,7 +118,7 @@ def wait_for_sponsorblock_segments_to_be_added(episode_url) -> bool:
MAX_HOURS = 24
for attempt in range(1, MAX_HOURS + 1):
logger.debug(f"[SponsorBlock] Waiting for SponsorBlock to be added, attempt: {attempt}/{MAX_HOURS} ")
segments = check_for_sponsorblock_segments(episode_url)
segments = check_for_sponsorblock_segments(episode_link)
if segments:
logger.debug("[SponsorBlock] Segments found, exiting loop.")
@@ -138,9 +137,9 @@ def download_episode() -> None:
logger.info("[App] Starting Perun")
try:
audiobookshelf_track,episode_info,episode_url = check_until_new_episode_gets_released()
episode_data, audiobookshelf_track = check_until_new_episode_gets_released()
if audiobookshelf_track is None or episode_info is None or episode_url is None:
if episode_data is None or audiobookshelf_track is None:
logger.error("[App] Failed to find new episode within timeout period")
return
@@ -150,23 +149,22 @@ def download_episode() -> None:
return
try:
episode_description = episode_info.get("description", "")
if "sponsored" in episode_description.lower():
if "sponsored" in episode_data.episode_description.lower():
logger.debug("[App] Sponsored segments found in description, waiting for SponsorBlock")
wait_for_sponsorblock_segments_to_be_added(episode_url)
wait_for_sponsorblock_segments_to_be_added(episode_data.episode_link)
else:
logger.debug("[App] No sponsored segments found in description")
except Exception as e:
logger.warning(f"[App] Failed during SponsorBlock wait: {e}", exc_info=True)
try:
track = str(int(audiobookshelf_track) + 1).zfill(4)
episode_data.episode_number = str(int(audiobookshelf_track) + 1).zfill(4)
except (ValueError,TypeError) as e:
logger.warning(f"[App] Failed incrementing audiobookshelf track: {e}", exc_info=True)
return
try:
options = return_download_options(episode_info,track)
options = return_download_options(episode_data)
except Exception as e:
logger.error(f"[App] Failed to generate download options: {e}", exc_info=True)
return
@@ -174,17 +172,17 @@ def download_episode() -> None:
logger.info("[App] Downloading new episode")
try:
with yt_dlp.YoutubeDL(options) as episode:
episode.download(episode_url)
episode.download(episode_data.episode_link)
logger.debug("[App] Download completed successfully")
except Exception as e:
logger.error(f"[App] Failed to download episode: {e}", exc_info=True)
return
logger.info("[App] Uploading episode via SFTP")
upload_via_sftp(f"perun-{episode_info['date']}.mp3")
upload_via_sftp(f"perun-{episode_data.episode_date}.mp3")
logger.info("[App] Sending release notification")
send_notification_via_ssh(f"Perun episode {track} has been released",episode_info["title"])
send_notification_via_ssh(f"Perun episode {episode_data.episode_number} has been released",episode_data.episode_title)
logger.info("[App] Workflow complete")

View File: requirements.txt

@@ -6,6 +6,7 @@ cffi==2.0.0
charset-normalizer==3.4.3
cryptography==46.0.2
dotenv==0.9.9
feedparser==6.0.12
idna==3.10
invoke==2.2.0
mutagen==1.47.0
@@ -15,7 +16,8 @@ pycryptodomex==3.23.0
PyNaCl==1.6.0
python-dotenv==1.1.1
requests==2.32.5
sgmllib3k==1.0.0
simple-logger-handler==0.1.0
sponsorblock.py==0.2.3
urllib3==2.5.0
websockets==15.0.1
simple-logger-handler==0.1.0

View File: rss_feed_handler.py

@@ -0,0 +1,85 @@
import feedparser
from simple_logger_handler import setup_logger
import time
from urllib.error import URLError
from typing import Optional
from dataclasses import dataclass
from datetime import datetime
@dataclass
class EpisodeData:
episode_date: str
episode_description: str
episode_link: str
episode_number: str
episode_title: str
episode_ytid: str
logger = setup_logger(__name__)
def grab_latest_chapter_information(id: str, max_retries: int = 3) -> Optional[EpisodeData]:
"""
Fetches the latest episode's information from a YouTube RSS feed, with retries on network-related errors.
Parameters:
id: YouTube channel ID as a string.
max_retries: Number of retry attempts if fetching the feed fails due to network issues.
Returns:
EpisodeData: A dataclass containing episode metadata:
episode_date: Publication date in ISO format (e.g. 2025-11-30).
episode_description: Episode description.
episode_link: YouTube link.
episode_number: Episode number.
episode_title: Episode title.
episode_ytid: Episode YouTube ID.
Returns None if the feed has no entries or all retries are exhausted.
Raises:
ValueError: If the published timestamp cannot be parsed into a date.
KeyError: If an expected field is missing from the feed entry.
"""
rss_feed_url = f"https://www.youtube.com/feeds/videos.xml?channel_id={id}"
attempt = 1
while attempt <= max_retries:
logger.debug(f"[Feed] Parsing feed URL: {rss_feed_url} (attempt {attempt}/{max_retries})")
try:
feed = feedparser.parse(rss_feed_url)
if not feed.entries:
logger.warning(f"[Feed] No entries found for feed {id}")
return None
latest_chapter_data = feed["entries"][0]
episode_link = latest_chapter_data["link"]
episode_title = latest_chapter_data["title"]
episode_description = latest_chapter_data["summary"]
episode_date = latest_chapter_data["published"]
episode_date = datetime.fromisoformat(episode_date).date().isoformat()
episode_ytid = latest_chapter_data["yt_videoid"]
logger.info(f"[Feed] Latest episode '{episode_title}': {episode_link}")
logger.debug(f"[Feed] Latest episode '{episode_title}' (YouTubeId {episode_ytid}): {episode_link} -> {episode_description}")
return EpisodeData(
episode_date=episode_date,
episode_description=episode_description,
episode_link=episode_link,
episode_number="",
episode_title=episode_title,
episode_ytid=episode_ytid
)
except (URLError, OSError) as e:
logger.warning(f"[Feed] Network error on attempt {attempt} for feed {id}: {e}")
if attempt == max_retries:
logger.error(f"[Feed] All {max_retries} attempts failed for feed {id}")
return None
backoff = 2 ** (attempt - 1)
logger.debug(f"[Feed] Retrying in {backoff} seconds...")
time.sleep(backoff)
attempt += 1
if __name__ == "__main__":
print(grab_latest_chapter_information("UCC3ehuUksTyQ7bbjGntmx3Q"))
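
For reference, the entry fields the handler indexes map one-to-one onto YouTube's channel feed; feedparser flattens the yt: XML namespace, which is why yt_videoid is addressable as a plain key. A quick sanity check (channel ID taken from the module, printed values illustrative):

import feedparser

feed = feedparser.parse("https://www.youtube.com/feeds/videos.xml?channel_id=UCC3ehuUksTyQ7bbjGntmx3Q")
entry = feed.entries[0]      # newest video first
print(entry.title)           # episode title
print(entry.link)            # watch URL
print(entry.summary)         # description text
print(entry.published)       # ISO-8601 timestamp, parsed above with datetime.fromisoformat
print(entry.yt_videoid)      # <yt:videoId>, the value compared against tagDescription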

View File: youtube_handler.py

@@ -13,59 +13,6 @@ logger = setup_logger(__name__)
load_dotenv()
YOUTUBE_CHANNEL_URL = os.getenv("YOUTUBE_CHANNEL_URL")
def get_url_for_latest_video():
"""
Fetch the URL of the latest video from a YouTube channel.
"""
logger.info("[YouTube] Fetching latest video URL from YouTube channel")
options = {
"extract_flat": True,
"playlist_items": "1",
"quiet": True,
"forcejson": True,
"simulate": True,
}
try:
with open(os.devnull, "w") as devnull, contextlib.redirect_stdout(devnull):
with yt_dlp.YoutubeDL(options) as video:
info_dict = video.extract_info(YOUTUBE_CHANNEL_URL, download=False)
except Exception as e:
logger.error(f"[YouTube] Failed to fetch latest video info: {e}", exc_info=True)
return None
if "entries" in info_dict and len(info_dict["entries"]) > 0:
latest_url = info_dict["entries"][0]["url"]
logger.debug(f"[YouTube] Latest video URL found: {latest_url}")
return latest_url
else:
logger.warning("[YouTube] No entries found in channel feed")
return None
def get_youtube_data(url: str) -> dict:
"""
Fetch metadata for a given YouTube video URL.
"""
logger.info(f"Fetching YouTube metadata for video: {url}")
try:
with yt_dlp.YoutubeDL({"quiet": True, "noprogress": True}) as video:
info_dict = video.extract_info(url, download=False)
except Exception as e:
logger.error(f"[YouTube] Failed to fetch YouTube video info for {url}: {e}", exc_info=True)
return {}
video_data = {
"date": datetime.datetime.fromtimestamp(
info_dict["timestamp"], datetime.timezone.utc
).strftime("%Y-%m-%d"),
"title": info_dict["title"],
"description": info_dict.get("description", "")
}
logger.debug(f"[YouTube] Fetched video data: {json.dumps(video_data, indent=4)}")
return video_data
def check_for_sponsorblock_segments(youtube_video:str) -> bool:
client = sb.Client()
try:
@@ -78,14 +25,14 @@ def check_for_sponsorblock_segments(youtube_video:str) -> bool:
logger.info(f"[SponsorBlock] SponsorBlock segments found for video: {youtube_video}")
return True
def return_download_options(information:dict,track:str)->dict:
def return_download_options(episode_data) -> dict:
download_options = {
"quiet": True,
"noprogress": True,
"format": "bestaudio/best",
"extract_audio": True,
"audio_format": "mp3",
"outtmpl": f"perun-{information['date']}.%(ext)s",
"outtmpl": f"perun-{episode_data.episode_date}.%(ext)s",
"addmetadata": True,
"postprocessors":[
{"api": "https://sponsor.ajay.app",
@@ -109,12 +56,12 @@ def return_download_options(episode_data) -> dict:
"key": "FFmpegMetadata",
}],
"postprocessor_args": [
"-metadata", f"title={information['title']}",
"-metadata", f"title={episode_data.episode_title}",
"-metadata", "artist=Perun",
"-metadata", f"track={track}",
"-metadata", f"date={information['date']}",
"-metadata", f"comment={return_string_as_html(information['description'])}",
"-metadata", f"description={return_string_as_html(information['description'])}",
"-metadata", f"track={episode_data.episode_number}",
"-metadata", f"date={episode_data.episode_date}",
"-metadata", f"comment={return_string_as_html(episode_data.episode_description)}",
"-metadata", f"description={episode_data.episode_ytid}",
],
"merge_output_format": "mp3"
}
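
Taken together, the refactored interface can be exercised end to end roughly as below; this is a sketch that assumes both modules from this PR are importable, with a fixed episode number standing in for the Audiobookshelf-derived track:

import yt_dlp
from rss_feed_handler import grab_latest_chapter_information
from youtube_handler import return_download_options

episode = grab_latest_chapter_information("UCC3ehuUksTyQ7bbjGntmx3Q")
if episode is not None:
    episode.episode_number = "0001"  # normally str(int(track) + 1).zfill(4)
    options = return_download_options(episode)
    with yt_dlp.YoutubeDL(options) as downloader:
        downloader.download([episode.episode_link])  # download() takes a list of URLs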