horseui

Runtime error

horseui / backend /open_webui /apps /retrieval /loaders /youtube.py

github-actions[bot]

GitHub deploy: c5ef53a09faec59df3d9039e96eb797a276a1162

1554d3e 4 months ago

2.91 kB

	from typing import Any, Dict, Generator, List, Optional, Sequence, Union
	from urllib.parse import parse_qs, urlparse
	from langchain_core.documents import Document


	# URL schemes a link must use before a video ID is extracted from it.
	ALLOWED_SCHEMES = {"https", "http"}

	# Hosts (compared against the URL's netloc) from which a video ID may be
	# extracted; a URL with any other netloc is rejected.
	ALLOWED_NETLOCS = {
	    # Main site and its mobile / www variants.
	    "youtube.com",
	    "www.youtube.com",
	    "m.youtube.com",
	    "www.youtube-nocookie.com",
	    # Other accepted hosts.
	    "youtu.be",
	    "vid.plus",
	}


	def _parse_video_id(url: str) -> Optional[str]:
	    """Extract the video ID from a YouTube URL.

	    Returns ``None`` when the URL's scheme is not in ``ALLOWED_SCHEMES``,
	    when its netloc is not in ``ALLOWED_NETLOCS``, when a ``/watch`` URL has
	    no ``v`` query parameter, or when the candidate ID is not exactly 11
	    characters long.
	    """
	    parts = urlparse(url)

	    if parts.scheme not in ALLOWED_SCHEMES or parts.netloc not in ALLOWED_NETLOCS:
	        return None

	    if parts.path.endswith("/watch"):
	        # Watch pages carry the ID in the query string: /watch?v=<id>.
	        candidates = parse_qs(parts.query).get("v")
	        if candidates is None:
	            return None
	        # parse_qs maps each key to a list of values; use the first one.
	        candidate = candidates[0]
	    else:
	        # Every other URL shape: the ID is taken to be the last path segment.
	        candidate = parts.path.lstrip("/").split("/")[-1]

	    # Video IDs are 11 characters long
	    return candidate if len(candidate) == 11 else None


	class YoutubeLoader:
	    """Load `YouTube` video transcripts."""

	    def __init__(
	        self,
	        video_id: str,
	        language: Union[str, Sequence[str]] = "en",
	    ):
	        """Initialize with a YouTube video ID or URL.

	        Args:
	            video_id: Either a bare video ID or a YouTube URL. The value is
	                passed through ``_parse_video_id``; if an ID can be extracted
	                it is used, otherwise the value is used unchanged.
	            language: A single language code or a sequence of codes handed
	                to the transcript lookup. A single string is wrapped in a
	                list.
	        """
	        parsed_id = _parse_video_id(video_id)
	        self.video_id = parsed_id if parsed_id is not None else video_id
	        # The source recorded in the metadata is the caller's original
	        # input, not the extracted ID.
	        self._metadata = {"source": video_id}
	        self.language = [language] if isinstance(language, str) else language

	    def load(self) -> List[Document]:
	        """Load YouTube transcripts into `Document` objects.

	        Returns:
	            A one-element list holding the whole transcript as a single
	            `Document`, or an empty list when the transcript listing for
	            the video cannot be retrieved.

	        Raises:
	            ImportError: If ``youtube_transcript_api`` is not installed.
	            NoTranscriptFound: If no transcript exists for the requested
	                language(s) nor for the English fallback.
	        """
	        try:
	            from youtube_transcript_api import (
	                NoTranscriptFound,
	                YouTubeTranscriptApi,
	            )
	        except ImportError as exc:
	            # Chain explicitly so the original import failure stays visible
	            # as the cause of the friendlier error.
	            raise ImportError(
	                'Could not import "youtube_transcript_api" Python package. '
	                "Please install it with `pip install youtube-transcript-api`."
	            ) from exc

	        try:
	            transcript_list = YouTubeTranscriptApi.list_transcripts(self.video_id)
	        except Exception as e:
	            # Best-effort by design: any failure to list transcripts is
	            # reported and treated as "no documents" rather than an error.
	            print(e)
	            return []

	        try:
	            transcript = transcript_list.find_transcript(self.language)
	        except NoTranscriptFound:
	            # Fall back to English; a second NoTranscriptFound propagates.
	            transcript = transcript_list.find_transcript(["en"])

	        transcript_pieces: List[Dict[str, Any]] = transcript.fetch()

	        # Join the per-segment texts into one string, trimming only the
	        # surrounding spaces of each piece.
	        text = " ".join(piece["text"].strip(" ") for piece in transcript_pieces)
	        return [Document(page_content=text, metadata=self._metadata)]