Spaces:
Runtime error
Runtime error
import json
from typing import Optional, List
from urllib.parse import urlparse, parse_qs, urlencode
from urllib.request import urlopen

from phi.tools import Toolkit

try:
    from youtube_transcript_api import YouTubeTranscriptApi
except ImportError:
    raise ImportError(
        "`youtube_transcript_api` not installed. Please install using `pip install youtube_transcript_api`"
    )
class YouTubeTools(Toolkit):
    """Toolkit that fetches YouTube video metadata and captions from a video URL.

    Depending on the constructor flags, ``get_youtube_video_captions`` and
    ``get_youtube_video_data`` are registered on the toolkit via ``self.register``.
    """

    # NOTE(review): the method docstrings below are presumably surfaced by the
    # toolkit framework as tool descriptions -- confirm before rewording them.

    def __init__(
        self,
        get_video_captions: bool = True,
        get_video_data: bool = True,
        languages: Optional[List[str]] = None,
    ):
        """Create the toolkit and register the enabled tools.

        Args:
            get_video_captions: Register ``get_youtube_video_captions`` when True.
            get_video_data: Register ``get_youtube_video_data`` when True.
            languages: Optional language codes passed to the transcript API when
                fetching captions; when None or empty the API default is used.
        """
        super().__init__(name="youtube_toolkit")

        self.languages: Optional[List[str]] = languages

        if get_video_captions:
            self.register(self.get_youtube_video_captions)
        if get_video_data:
            self.register(self.get_youtube_video_data)

    def get_youtube_video_id(self, url: str) -> Optional[str]:
        """Function to get the video ID from a YouTube URL.

        Args:
            url: The URL of the YouTube video.

        Returns:
            str: The video ID of the YouTube video, or None when the URL is not a
                recognised YouTube video URL or carries no ID.
        """
        parsed_url = urlparse(url)
        hostname = parsed_url.hostname
        path = parsed_url.path

        if hostname == "youtu.be":
            # Short links carry the ID as the first path segment; ignore anything
            # after it and treat an empty segment as "no ID".
            return path[1:].split("/")[0] or None

        if hostname in ("www.youtube.com", "youtube.com", "m.youtube.com", "music.youtube.com"):
            if path in ("/watch", "/watch/"):
                # parse_qs drops blank values, so "?v=" also falls back to None.
                return parse_qs(parsed_url.query).get("v", [None])[0]
            for prefix in ("/embed/", "/v/", "/shorts/", "/live/"):
                if path.startswith(prefix):
                    # The ID is the segment immediately following the prefix.
                    return path[len(prefix):].split("/")[0] or None

        return None

    def get_youtube_video_data(self, url: str) -> str:
        """Function to get video data from a YouTube URL.
        Data returned includes {title, author_name, author_url, type, height, width, version, provider_name, provider_url, thumbnail_url}

        Args:
            url: The URL of the YouTube video.

        Returns:
            str: JSON data of the YouTube video, or an error message if the URL is
                missing/invalid or the request fails.
        """
        if not url:
            return "No URL provided"

        try:
            video_id = self.get_youtube_video_id(url)
        except Exception:
            video_id = None
        if not video_id:
            # get_youtube_video_id signals an unrecognised URL by returning None
            # rather than raising; stop here instead of querying for "v=None".
            return "Error getting video ID from URL, please provide a valid YouTube url"

        try:
            query_string = urlencode(
                {"format": "json", "url": f"https://www.youtube.com/watch?v={video_id}"}
            )
            oembed_url = "https://www.youtube.com/oembed" + "?" + query_string
            # Bound the wait so a stalled connection cannot hang the tool call.
            with urlopen(oembed_url, timeout=10) as response:
                video_data = json.loads(response.read().decode())

            wanted_fields = (
                "title",
                "author_name",
                "author_url",
                "type",
                "height",
                "width",
                "version",
                "provider_name",
                "provider_url",
                "thumbnail_url",
            )
            # Keep only the documented fields; missing ones are reported as null.
            clean_data = {field: video_data.get(field) for field in wanted_fields}
            return json.dumps(clean_data, indent=4)
        except Exception as e:
            return f"Error getting video data: {e}"

    def get_youtube_video_captions(self, url: str) -> str:
        """Use this function to get captions from a YouTube video.

        Args:
            url: The URL of the YouTube video.

        Returns:
            str: The captions of the YouTube video, or an error message if the URL
                is missing/invalid or the captions cannot be fetched.
        """
        if not url:
            return "No URL provided"

        try:
            video_id = self.get_youtube_video_id(url)
        except Exception:
            video_id = None
        if not video_id:
            # Same guard as in get_youtube_video_data: None means "not a video URL".
            return "Error getting video ID from URL, please provide a valid YouTube url"

        try:
            if self.languages:
                captions = YouTubeTranscriptApi.get_transcript(video_id, languages=self.languages)
            else:
                captions = YouTubeTranscriptApi.get_transcript(video_id)

            if captions:
                # Each transcript entry is indexed by "text"; join them into one string.
                return " ".join(line["text"] for line in captions)
            return "No captions found for video"
        except Exception as e:
            return f"Error getting captions for video: {e}"