Merge pull request 'Perun: Changed fetching YouTube data from using yt-dlp to an RSS feed' (#2) from feature/rss-feed into main
All checks were successful
Build image with python3,get-iplayer / Build Docker Image (push) Successful in 18s
Reviewed-on: #2
This commit is contained in: commit e81fe01a01
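For context: the change replaces yt-dlp metadata lookups with YouTube's per-channel RSS/Atom feed (https://www.youtube.com/feeds/videos.xml?channel_id=<channel id>), parsed with feedparser, which is why feedparser==6.0.12 and sgmllib3k (a feedparser dependency) show up in requirements.txt below. A minimal sketch of the idea follows; it is not the project's code, the channel ID is the one used in the diff, and the entry fields are the ones feedparser exposes for YouTube feeds:

# Sketch only: grab the newest upload for a channel from its RSS feed.
import feedparser

feed = feedparser.parse("https://www.youtube.com/feeds/videos.xml?channel_id=UCC3ehuUksTyQ7bbjGntmx3Q")
if feed.entries:
    latest = feed.entries[0]
    # yt_videoid is what the new code compares against the ID stored in Audiobookshelf,
    # and link is what gets handed to yt-dlp for the actual download.
    print(latest["yt_videoid"], latest["link"])
    print(latest["title"], latest["published"])
    print(latest["summary"][:80])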
@@ -4,9 +4,9 @@ import os
 import time
 from dotenv import load_dotenv
 from ssh_helper import upload_via_sftp, send_notification_via_ssh
-from youtube_handler import get_url_for_latest_video, get_youtube_data, return_download_options, check_for_sponsorblock_segments
+from youtube_handler import return_download_options, check_for_sponsorblock_segments
 from simple_logger_handler import setup_logger
+from rss_feed_handler import grab_latest_chapter_information, EpisodeData
 logger = setup_logger(__name__)

 load_dotenv()
@@ -14,14 +14,14 @@ PODCAST_AUTHORIZATION_TOKEN = os.getenv("PODCAST_AUTHORIZATION_TOKEN")
 PODCAST_API_URL = os.getenv("PODCAST_API_URL")


-def get_audiobookshelf_data() -> tuple[int | None, str | None]:
+def get_audiobookshelf_data() -> tuple[str | None, str | None]:
     """
     Fetches the latest episode data from the Audiobookshelf API.

     Returns:
         tuple[int | None, str | None]:
-            - The track number as an integer (or None if data could not be fetched due to retryable errors).
-            - The episode title as a string (or None if data could not be fetched due to retryable errors).
+            - The track number as a string (or None if data could not be fetched due to retryable errors).
+            - The YouTube episode id as a string (or None if data could not be fetched due to retryable errors).

     Raises:
         requests.exceptions.HTTPError:
@@ -40,10 +40,10 @@ def get_audiobookshelf_data() -> tuple[int | None, str | None]:
         result = response.json()

         audiobookshelf_track = result["media"]["episodes"][-1]["audioFile"]["metaTags"]["tagTrack"]
-        audiobookshelf_title = result["media"]["episodes"][-1]["audioFile"]["metaTags"]["tagTitle"]
+        audiobookshelf_ytid = result["media"]["episodes"][-1]["audioFile"]["metaTags"]["tagDescription"]

-        logger.debug(f"[Audiobookshelf] Fetched Audiobookshelf data: track={audiobookshelf_track}, title={audiobookshelf_title}")
-        return (audiobookshelf_track, audiobookshelf_title)
+        logger.debug(f"[Audiobookshelf] Fetched Audiobookshelf data: track={audiobookshelf_track}, ytid={audiobookshelf_ytid}")
+        return (audiobookshelf_track, audiobookshelf_ytid)

     except requests.exceptions.ConnectionError as e:
         logger.warning(f"[Audiobookshelf] Connection error, will retry: {e}")
@@ -60,57 +60,56 @@ def get_audiobookshelf_data() -> tuple[int | None, str | None]:
         logger.error(f"[Audiobookshelf] HTTP error {status}, not retrying: {e}")
         raise

-def check_until_new_episode_gets_released() -> tuple[int | None, dict | None, str | None]:
+def check_until_new_episode_gets_released() -> tuple[EpisodeData | None, str | None]:
     """
     Polls YouTube every hour for a new episode and compares it to the available episode on Audiobookshelf.
     Stops after 72 hours.

     Returns:
-        tuple[int | None, dict | None, str | None]:
+        tuple[EpisodeData | None, str | None]:
+            - EpisodeData with information about the date,description,link,title and YouTube ID
             - Track number from Audiobookshelf
-            - Episode info dictionary from YouTube
-            - Episode URL
-        Returns (None, None, None) if no new episode found within timeout
+        Returns (None, None) if no new episode found within timeout
     """
     CHECK_INTERVAL_HOURS = 1
     MAX_HOURS = 72
     for attempt in range(1, MAX_HOURS + 1):
         logger.debug(f"[EpisodeCheck] Waiting for a new episode to be released, attempt: {attempt}/{MAX_HOURS}")
-        audiobookshelf_track, audiobookshelf_title = get_audiobookshelf_data()
+        audiobookshelf_track, audiobookshelf_ytid = get_audiobookshelf_data()

-        if audiobookshelf_track is None or audiobookshelf_title is None:
+        if audiobookshelf_track is None or audiobookshelf_ytid is None:
             logger.warning("[EpisodeCheck] Unable to fetch Audiobookshelf data, retrying in 1 hour.")
             time.sleep(CHECK_INTERVAL_HOURS * 3600)
             continue

-        episode_url = get_url_for_latest_video()
-        if episode_url is None:
-            logger.warning("[EpisodeCheck] Unable to fetch latest video URL, retrying in 1 hour.")
+        try:
+            episode_data = grab_latest_chapter_information("UCC3ehuUksTyQ7bbjGntmx3Q")
+        except Exception as e:
+            logger.warning(f"[EpisodeCheck] Failed to fetch latest video data: {e}, retrying in 1 hour.")
             time.sleep(CHECK_INTERVAL_HOURS * 3600)
             continue

-        episode_info = get_youtube_data(episode_url)
-        if not episode_info:
-            logger.warning("[EpisodeCheck] Unable to fetch video metadata, retrying in 1 hour.")
+        if episode_data is None:
+            logger.warning("[EpisodeCheck] Unable to fetch latest video data, retrying in 1 hour.")
             time.sleep(CHECK_INTERVAL_HOURS * 3600)
             continue

-        if audiobookshelf_title != episode_info["title"]:
-            logger.info(f"[EpisodeCheck] Latest YouTube episode: {episode_info['title']}")
-            return (audiobookshelf_track,episode_info,episode_url)
+        if audiobookshelf_ytid != episode_data.episode_ytid:
+            logger.info(f"[EpisodeCheck] Latest YouTube episode: {episode_data.episode_title}")
+            return episode_data, audiobookshelf_track

-        logger.debug("[EpisodeCheck] No new episode found, retrying in 1 hour.")
+        logger.info("[EpisodeCheck] No new episode found, retrying in 1 hour.")
         time.sleep(CHECK_INTERVAL_HOURS * 3600)

     logger.warning("[EpisodeCheck] No new episode found after maximum attempts.")
-    return (None, None, None)
+    return None, None

-def wait_for_sponsorblock_segments_to_be_added(episode_url) -> bool:
+def wait_for_sponsorblock_segments_to_be_added(episode_link) -> bool:
     """
     Polls SponsorBlock for segments on the current video until found or until max attempts.

     Args:
-        episode_url: YouTube video URL to check for SponsorBlock segments
+        episode_link: YouTube video URL to check for SponsorBlock segments

     Returns:
         True if segments found, False otherwise
@@ -119,7 +118,7 @@ def wait_for_sponsorblock_segments_to_be_added(episode_url) -> bool:
     MAX_HOURS = 24
     for attempt in range(1, MAX_HOURS + 1):
         logger.debug(f"[SponsorBlock] Waiting for SponsorBlock to be added, attempt: {attempt}/{MAX_HOURS} ")
-        segments = check_for_sponsorblock_segments(episode_url)
+        segments = check_for_sponsorblock_segments(episode_link)

         if segments:
             logger.debug("[SponsorBlock] Segments found, exiting loop.")
@@ -138,9 +137,9 @@ def download_episode() -> None:
     logger.info("[App] Starting Perun")

     try:
-        audiobookshelf_track,episode_info,episode_url = check_until_new_episode_gets_released()
+        episode_data,audiobookshelf_track = check_until_new_episode_gets_released()

-        if audiobookshelf_track is None or episode_info is None or episode_url is None:
+        if episode_data is None or audiobookshelf_track is None:
             logger.error("[App] Failed to find new episode within timeout period")
             return

@@ -150,23 +149,22 @@ def download_episode() -> None:
         return

     try:
-        episode_description = episode_info.get("description", "")
-        if "sponsored" in episode_description.lower():
+        if "sponsored" in episode_data.episode_description.lower():
             logger.debug("[App] Sponsored segments found in description, waiting for SponsorBlock")
-            wait_for_sponsorblock_segments_to_be_added(episode_url)
+            wait_for_sponsorblock_segments_to_be_added(episode_data.episode_link)
         else:
             logger.debug("[App] No sponsored segments found in description")
     except Exception as e:
         logger.warning(f"[App] Failed during SponsorBlock wait: {e}", exc_info=True)

     try:
-        track = str(int(audiobookshelf_track) + 1).zfill(4)
+        episode_data.episode_number = str(int(audiobookshelf_track) + 1).zfill(4)
     except (ValueError,TypeError) as e:
         logger.warning(f"[App] Failed incrementing audiobookshelf track: {e}", exc_info=True)
         return

     try:
-        options = return_download_options(episode_info,track)
+        options = return_download_options(episode_data)
     except Exception as e:
         logger.error(f"[App] Failed to generate download options: {e}", exc_info=True)
         return
@@ -174,17 +172,17 @@ def download_episode() -> None:
     logger.info("[App] Downloading new episode")
     try:
         with yt_dlp.YoutubeDL(options) as episode:
-            episode.download(episode_url)
+            episode.download(episode_data.episode_link)
         logger.debug("[App] Download completed successfully")
     except Exception as e:
         logger.error(f"[App] Failed to download episode: {e}", exc_info=True)
         return

     logger.info("[App] Uploading episode via SFTP")
-    upload_via_sftp(f"perun-{episode_info['date']}.mp3")
+    upload_via_sftp(f"perun-{episode_data.episode_date}.mp3")

     logger.info("[App] Sending release notification")
-    send_notification_via_ssh(f"Perun episode {track} has been released",episode_info["title"])
+    send_notification_via_ssh(f"Perun episode {episode_data.episode_number} has been released",episode_data.episode_title)
     logger.info("[App] Workflow complete")
@@ -6,6 +6,7 @@ cffi==2.0.0
 charset-normalizer==3.4.3
 cryptography==46.0.2
 dotenv==0.9.9
+feedparser==6.0.12
 idna==3.10
 invoke==2.2.0
 mutagen==1.47.0
@@ -15,7 +16,8 @@ pycryptodomex==3.23.0
 PyNaCl==1.6.0
 python-dotenv==1.1.1
 requests==2.32.5
+sgmllib3k==1.0.0
+simple-logger-handler==0.1.0
 sponsorblock.py==0.2.3
 urllib3==2.5.0
 websockets==15.0.1
-simple-logger-handler==0.1.0
src/perun/rss_feed_handler.py (new file, 85 lines added)
@@ -0,0 +1,85 @@
import feedparser
from simple_logger_handler import setup_logger
import time
from urllib.error import URLError
from typing import Optional
from dataclasses import dataclass
from datetime import datetime

@dataclass
class EpisodeData:
    episode_date: str
    episode_description: str
    episode_link: str
    episode_number: str
    episode_title: str
    episode_ytid: str

logger = setup_logger(__name__)

def grab_latest_chapter_information(id: str, max_retries: int = 3) -> Optional[EpisodeData]:
    """
    Fetches the latest episodes information from a Youtube RSS feed, with retries on network-related errors.

    Parameters:
        id: Youtube channel ID as a string.
        max_retries: Number of retry attempts if fetching the feed fails due to network issues.

    Returns:
        EpisodeData: A dataclass containing episode metadata:
            episode_date: Date when it was published in iso format (2025-11-30).
            episode_description: Episode description.
            episode_link: YouTube link.
            episode_number: Episode number.
            episode_title: Episode title.
            episode_ytid: Episode YouTube ID.
        Returns None if the feed has no entries or all retries are exhausted.

    Raises:
        ValueError: If the feed has no entries.
        Other network-related exceptions: If fetching fails after retries.
    """

    rss_feed_url = f"https://www.youtube.com/feeds/videos.xml?channel_id={id}"
    attempt = 1

    while attempt <= max_retries:
        logger.debug(f"[Feed] Parsing feed URL: {rss_feed_url} (attempt {attempt}/{max_retries})")
        try:
            feed = feedparser.parse(rss_feed_url)

            if not feed.entries:
                logger.warning(f"[Feed] No entries found for feed {id}")
                return None

            latest_chapter_data = feed["entries"][0]
            episode_link = latest_chapter_data["link"]
            episode_title = latest_chapter_data["title"]
            episode_description = latest_chapter_data["summary"]
            episode_date = latest_chapter_data["published"]
            episode_date = datetime.fromisoformat(episode_date).date().isoformat()
            episode_ytid = latest_chapter_data["yt_videoid"]

            logger.info(f"[Feed] Latest episode '{episode_title}': {episode_link}")
            logger.debug(f"[Feed] Latest episode '{episode_title}' (YouTubeId {episode_ytid}): {episode_link} -> {episode_description}")
            return EpisodeData(
                episode_date=episode_date,
                episode_description=episode_description,
                episode_link=episode_link,
                episode_number="",
                episode_title=episode_title,
                episode_ytid=episode_ytid
            )

        except (URLError, OSError) as e:
            logger.warning(f"[Feed] Network error on attempt {attempt} for feed {id}: {e}")
            if attempt == max_retries:
                logger.error(f"[Feed] All {max_retries} attempts failed for feed {id}")
                return None
            backoff = 2 ** (attempt - 1)
            logger.debug(f"[Feed] Retrying in {backoff} seconds...")
            time.sleep(backoff)
            attempt += 1

if __name__ == "__main__":
    print(grab_latest_chapter_information("UCC3ehuUksTyQ7bbjGntmx3Q"))
@@ -13,59 +13,6 @@ logger = setup_logger(__name__)
 load_dotenv()
-YOUTUBE_CHANNEL_URL = os.getenv("YOUTUBE_CHANNEL_URL")
-
-
-def get_url_for_latest_video():
-    """
-    Fetch the URL of the latest video from a YouTube channel.
-    """
-    logger.info("[YouTube] Fetching latest video URL from YouTube channel")
-    options = {
-        "extract_flat": True,
-        "playlist_items": "1",
-        "quiet": True,
-        "forcejson": True,
-        "simulate": True,
-    }
-
-    try:
-        with open(os.devnull, "w") as devnull, contextlib.redirect_stdout(devnull):
-            with yt_dlp.YoutubeDL(options) as video:
-                info_dict = video.extract_info(YOUTUBE_CHANNEL_URL, download=False)
-    except Exception as e:
-        logger.error(f"[YouTube] Failed to fetch latest video info: {e}", exc_info=True)
-        return None
-
-    if "entries" in info_dict and len(info_dict["entries"]) > 0:
-        latest_url = info_dict["entries"][0]["url"]
-        logger.debug(f"[YouTube] Latest video URL found: {latest_url}")
-        return latest_url
-    else:
-        logger.warning("[YouTube] No entries found in channel feed")
-        return None
-
-def get_youtube_data(url: str) -> dict:
-    """
-    Fetch metadata for a given YouTube video URL.
-    """
-    logger.info(f"Fetching YouTube metadata for video: {url}")
-    try:
-        with yt_dlp.YoutubeDL({"quiet": True, "noprogress": True}) as video:
-            info_dict = video.extract_info(url, download=False)
-    except Exception as e:
-        logger.error(f"[YouTube] Failed to fetch YouTube video info for {url}: {e}", exc_info=True)
-        return {}
-
-    video_data = {
-        "date": datetime.datetime.fromtimestamp(
-            info_dict["timestamp"], datetime.timezone.utc
-        ).strftime("%Y-%m-%d"),
-        "title": info_dict["title"],
-        "description": info_dict.get("description", "")
-    }
-
-    logger.debug(f"[YouTube] Fetched video data: {json.dumps(video_data, indent=4)}")
-    return video_data

 def check_for_sponsorblock_segments(youtube_video:str) -> bool:
     client = sb.Client()
     try:
@@ -78,14 +25,14 @@ def check_for_sponsorblock_segments(youtube_video:str) -> bool:
     logger.info(f"[SponsorBlock] SponsorBlock segments found for video: {youtube_video}")
     return True

-def return_download_options(information:dict,track:str)->dict:
+def return_download_options(episode_data)->dict:
     download_options = {
         "quiet": True,
         "noprogress": True,
         "format": "bestaudio/best",
         "extract_audio": True,
         "audio_format": "mp3",
-        "outtmpl": f"perun-{information['date']}.%(ext)s",
+        "outtmpl": f"perun-{episode_data.episode_date}.%(ext)s",
         "addmetadata": True,
         "postprocessors":[
             {"api": "https://sponsor.ajay.app",
@@ -109,12 +56,12 @@ def return_download_options(information:dict,track:str)->dict:
             "key": "FFmpegMetadata",
         }],
         "postprocessor_args": [
-            "-metadata", f"title={information['title']}",
+            "-metadata", f"title={episode_data.episode_title}",
             "-metadata", "artist=Perun",
-            "-metadata", f"track={track}",
-            "-metadata", f"date={information['date']}",
-            "-metadata", f"comment={return_string_as_html(information['description'])}",
-            "-metadata", f"description={return_string_as_html(information['description'])}",
+            "-metadata", f"track={episode_data.episode_number}",
+            "-metadata", f"date={episode_data.episode_date}",
+            "-metadata", f"comment={return_string_as_html(episode_data.episode_description)}",
+            "-metadata", f"description={episode_data.episode_ytid}",
         ],
         "merge_output_format": "mp3"
     }