"""Loader that loads YouTube transcript.""" from __future__ import annotations from pathlib import Path from typing import Any, Dict, List, Optional from pydantic import root_validator from pydantic.dataclasses import dataclass from langchain.docstore.document import Document from langchain.document_loaders.base import BaseLoader SCOPES = ["https://www.googleapis.com/auth/drive.readonly"] @dataclass class GoogleApiClient: """A Generic Google Api Client. To use, you should have the ``google_auth_oauthlib,youtube_transcript_api,google`` python package installed. As the google api expects credentials you need to set up a google account and register your Service. "https://developers.google.com/docs/api/quickstart/python" Example: .. code-block:: python from langchain.document_loaders import GoogleApiClient google_api_client = GoogleApiClient( service_account_path=Path("path_to_your_sec_file.json") ) """ credentials_path: Path = Path.home() / ".credentials" / "credentials.json" service_account_path: Path = Path.home() / ".credentials" / "credentials.json" token_path: Path = Path.home() / ".credentials" / "token.json" def __post_init__(self) -> None: self.creds = self._load_credentials() @root_validator def validate_channel_or_videoIds_is_set( cls, values: Dict[str, Any] ) -> Dict[str, Any]: """Validate that either folder_id or document_ids is set, but not both.""" if not values.get("credentials_path") and not values.get( "service_account_path" ): raise ValueError("Must specify either channel_name or video_ids") return values def _load_credentials(self) -> Any: """Load credentials.""" # Adapted from https://developers.google.com/drive/api/v3/quickstart/python try: from google.auth.transport.requests import Request from google.oauth2 import service_account from google.oauth2.credentials import Credentials from google_auth_oauthlib.flow import InstalledAppFlow from youtube_transcript_api import YouTubeTranscriptApi # noqa: F401 except ImportError: raise ImportError( "You must run" "`pip install --upgrade " "google-api-python-client google-auth-httplib2 " "google-auth-oauthlib" "youtube-transcript-api`" "to use the Google Drive loader" ) creds = None if self.service_account_path.exists(): return service_account.Credentials.from_service_account_file( str(self.service_account_path) ) if self.token_path.exists(): creds = Credentials.from_authorized_user_file(str(self.token_path), SCOPES) if not creds or not creds.valid: if creds and creds.expired and creds.refresh_token: creds.refresh(Request()) else: flow = InstalledAppFlow.from_client_secrets_file( str(self.credentials_path), SCOPES ) creds = flow.run_local_server(port=0) with open(self.token_path, "w") as token: token.write(creds.to_json()) return creds class YoutubeLoader(BaseLoader): """Loader that loads Youtube transcripts.""" def __init__( self, video_id: str, add_video_info: bool = False, language: str = "en" ): """Initialize with YouTube video ID.""" self.video_id = video_id self.add_video_info = add_video_info self.language = language @classmethod def from_youtube_channel(cls, youtube_url: str, **kwargs: Any) -> YoutubeLoader: """Given a channel name, load all videos.""" video_id = youtube_url.split("youtube.com/watch?v=")[-1] return cls(video_id, **kwargs) def load(self) -> List[Document]: """Load documents.""" try: from youtube_transcript_api import NoTranscriptFound, YouTubeTranscriptApi except ImportError: raise ImportError( "Could not import youtube_transcript_api python package. " "Please it install it with `pip install youtube-transcript-api`." ) metadata = {"source": self.video_id} if self.add_video_info: # Get more video meta info # Such as title, description, thumbnail url, publish_date video_info = self._get_video_info() metadata.update(video_info) transcript_list = YouTubeTranscriptApi.list_transcripts(self.video_id) try: transcript = transcript_list.find_transcript([self.language]) except NoTranscriptFound: en_transcript = transcript_list.find_transcript(["en"]) transcript = en_transcript.translate(self.language) transcript_pieces = transcript.fetch() transcript = " ".join([t["text"].strip(" ") for t in transcript_pieces]) return [Document(page_content=transcript, metadata=metadata)] def _get_video_info(self) -> dict: """Get important video information. Components are: - title - description - thumbnail url, - publish_date - channel_author - and more. """ try: from pytube import YouTube except ImportError: raise ImportError( "Could not import pytube python package. " "Please it install it with `pip install pytube`." ) yt = YouTube(f"https://www.youtube.com/watch?v={self.video_id}") video_info = { "title": yt.title, "description": yt.description, "view_count": yt.views, "thumbnail_url": yt.thumbnail_url, "publish_date": yt.publish_date, "length": yt.length, "author": yt.author, } return video_info @dataclass class GoogleApiYoutubeLoader(BaseLoader): """Loader that loads all Videos from a Channel To use, you should have the ``googleapiclient,youtube_transcript_api`` python package installed. As the service needs a google_api_client, you first have to initialize the GoogleApiClient. Additionally you have to either provide a channel name or a list of videoids "https://developers.google.com/docs/api/quickstart/python" Example: .. code-block:: python from langchain.document_loaders import GoogleApiClient from langchain.document_loaders import GoogleApiYoutubeLoader google_api_client = GoogleApiClient( service_account_path=Path("path_to_your_sec_file.json") ) loader = GoogleApiYoutubeLoader( google_api_client=google_api_client, channel_name = "CodeAesthetic" ) load.load() """ google_api_client: GoogleApiClient channel_name: Optional[str] = None video_ids: Optional[List[str]] = None add_video_info: bool = True captions_language: str = "en" def __post_init__(self) -> None: self.youtube_client = self._build_youtube_client(self.google_api_client.creds) def _build_youtube_client(self, creds: Any) -> Any: try: from googleapiclient.discovery import build from youtube_transcript_api import YouTubeTranscriptApi # noqa: F401 except ImportError: raise ImportError( "You must run" "`pip install --upgrade " "google-api-python-client google-auth-httplib2 " "google-auth-oauthlib" "youtube-transcript-api`" "to use the Google Drive loader" ) return build("youtube", "v3", credentials=creds) @root_validator def validate_channel_or_videoIds_is_set( cls, values: Dict[str, Any] ) -> Dict[str, Any]: """Validate that either folder_id or document_ids is set, but not both.""" if not values.get("channel_name") and not values.get("video_ids"): raise ValueError("Must specify either channel_name or video_ids") return values def _get_transcripe_for_video_id(self, video_id: str) -> str: from youtube_transcript_api import NoTranscriptFound, YouTubeTranscriptApi transcript_list = YouTubeTranscriptApi.list_transcripts(self.video_ids) try: transcript = transcript_list.find_transcript([self.captions_language]) except NoTranscriptFound: en_transcript = transcript_list.find_transcript(["en"]) transcript = en_transcript.translate(self.captions_language) transcript_pieces = transcript.fetch() return " ".join([t["text"].strip(" ") for t in transcript_pieces]) def _get_document_for_video_id(self, video_id: str, **kwargs: Any) -> Document: captions = self._get_transcripe_for_video_id(video_id) video_response = ( self.youtube_client.videos() .list( part="id,snippet", id=video_id, ) .execute() ) return Document( page_content=captions, metadata=video_response.get("items")[0], ) def _get_channel_id(self, channel_name: str) -> str: request = self.youtube_client.search().list( part="id", q=channel_name, type="channel", maxResults=1, # we only need one result since channel names are unique ) response = request.execute() channel_id = response["items"][0]["id"]["channelId"] return channel_id def _get_document_for_channel(self, channel: str, **kwargs: Any) -> List[Document]: channel_id = self._get_channel_id(channel) request = self.youtube_client.search().list( part="id,snippet", channelId=channel_id, maxResults=50, # adjust this value to retrieve more or fewer videos ) video_ids = [] while request is not None: response = request.execute() # Add each video ID to the list for item in response["items"]: if not item["id"].get("videoId"): continue meta_data = {"videoId": item["id"]["videoId"]} if self.add_video_info: item["snippet"].pop("thumbnails") meta_data.update(item["snippet"]) video_ids.append( Document( page_content=self._get_transcripe_for_video_id( item["id"]["videoId"] ), metadata=meta_data, ) ) request = self.youtube_client.search().list_next(request, response) return video_ids def load(self) -> List[Document]: """Load documents.""" document_list = [] if self.channel_name: document_list.extend(self._get_document_for_channel(self.channel_name)) elif self.video_ids: document_list.extend( [ self._get_document_for_video_id(video_id) for video_id in self.video_ids ] ) else: raise ValueError("Must specify either channel_name or video_ids") return document_list