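# NOTE(review): the text "Spaces: Runtime error / Runtime error" preceded this module
# in the scraped copy; it appears to be hosting-page residue, not part of the code.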
"""Loader that loads YouTube transcripts."""
from __future__ import annotations

from pathlib import Path
from typing import Any, Dict, List, Optional
from urllib.parse import parse_qs, urlparse

from pydantic import root_validator
from pydantic.dataclasses import dataclass

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader

# OAuth scope requested when loading or creating user (installed-app) credentials.
# These credentials are used to build a YouTube Data API client, so the read-only
# YouTube scope is requested; the Drive scope does not authorize YouTube requests.
# NOTE(review): a token file cached under a different scope presumably has to be
# regenerated after this change -- confirm against the deployment.
SCOPES = ["https://www.googleapis.com/auth/youtube.readonly"]


@dataclass
class GoogleApiClient:
    """A generic Google API client that resolves credentials on construction.

    To use, you should have the ``google_auth_oauthlib,youtube_transcript_api,google``
    python package installed.
    As the google api expects credentials you need to set up a google account and
    register your Service. "https://developers.google.com/docs/api/quickstart/python"

    After construction the resolved credentials are available as ``creds``.

    Example:
        .. code-block:: python

            from langchain.document_loaders import GoogleApiClient
            google_api_client = GoogleApiClient(
                service_account_path=Path("path_to_your_sec_file.json")
            )

    """

    # OAuth client-secrets file used for the interactive installed-app flow.
    credentials_path: Path = Path.home() / ".credentials" / "credentials.json"
    # Service-account key file; takes precedence over the OAuth flow when it exists.
    service_account_path: Path = Path.home() / ".credentials" / "credentials.json"
    # Where the authorized-user token is read from and written back to.
    token_path: Path = Path.home() / ".credentials" / "token.json"

    def __post_init__(self) -> None:
        """Resolve credentials once the dataclass fields are populated."""
        self.creds = self._load_credentials()

    @root_validator
    def validate_channel_or_videoIds_is_set(
        cls, values: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Validate that at least one credential source path is set.

        The method name is kept for backward compatibility even though it checks
        ``credentials_path`` / ``service_account_path``.

        Raises:
            ValueError: If neither path is provided.
        """
        if not values.get("credentials_path") and not values.get(
            "service_account_path"
        ):
            raise ValueError(
                "Must specify either credentials_path or service_account_path"
            )
        return values

    def _load_credentials(self) -> Any:
        """Load Google credentials.

        Resolution order:
            1. A service-account key at ``service_account_path``, if the file exists.
            2. A cached authorized-user token at ``token_path``, refreshed if it is
               expired and carries a refresh token.
            3. An interactive local-server OAuth flow driven by ``credentials_path``.

        In cases 2 and 3, whenever the cached token was missing or invalid, the
        resulting credentials are written back to ``token_path``.

        Returns:
            The credentials object produced by the Google auth libraries.

        Raises:
            ImportError: If the required Google / transcript packages are missing.
        """
        # Adapted from https://developers.google.com/drive/api/v3/quickstart/python
        try:
            from google.auth.transport.requests import Request
            from google.oauth2 import service_account
            from google.oauth2.credentials import Credentials
            from google_auth_oauthlib.flow import InstalledAppFlow
            from youtube_transcript_api import YouTubeTranscriptApi  # noqa: F401
        except ImportError as err:
            raise ImportError(
                "You must run "
                "`pip install --upgrade "
                "google-api-python-client google-auth-httplib2 "
                "google-auth-oauthlib "
                "youtube-transcript-api` "
                "to use the Google API client"
            ) from err

        if self.service_account_path.exists():
            return service_account.Credentials.from_service_account_file(
                str(self.service_account_path)
            )

        creds = None
        if self.token_path.exists():
            creds = Credentials.from_authorized_user_file(str(self.token_path), SCOPES)

        if not creds or not creds.valid:
            if creds and creds.expired and creds.refresh_token:
                creds.refresh(Request())
            else:
                flow = InstalledAppFlow.from_client_secrets_file(
                    str(self.credentials_path), SCOPES
                )
                creds = flow.run_local_server(port=0)
            # Make sure the target directory exists so a custom token_path does not
            # fail only after the user has completed the OAuth flow.
            self.token_path.parent.mkdir(parents=True, exist_ok=True)
            with open(self.token_path, "w") as token:
                token.write(creds.to_json())

        return creds


class YoutubeLoader(BaseLoader):
    """Loader that loads YouTube transcripts."""

    def __init__(
        self, video_id: str, add_video_info: bool = False, language: str = "en"
    ):
        """Initialize with YouTube video ID.

        Args:
            video_id: The ID of the video whose transcript is loaded.
            add_video_info: When true, ``load`` adds video details obtained via
                ``pytube`` to the document metadata.
            language: Language code of the transcript to fetch.
        """
        self.video_id = video_id
        self.add_video_info = add_video_info
        self.language = language

    @classmethod
    def from_youtube_channel(cls, youtube_url: str, **kwargs: Any) -> YoutubeLoader:
        """Build a loader from a YouTube video URL.

        Despite its name (kept for backward compatibility) this constructor takes
        the URL of a single video, extracts the video ID and forwards ``kwargs``
        to the regular constructor.
        """
        return cls(cls._extract_video_id(youtube_url), **kwargs)

    @staticmethod
    def _extract_video_id(youtube_url: str) -> str:
        """Return the video ID contained in ``youtube_url``.

        Handles ``watch?v=<id>`` URLs regardless of other query parameters, and
        ``youtu.be/<id>`` short links. Anything else falls back to the legacy
        behaviour of taking the text after ``youtube.com/watch?v=``, which leaves
        a bare video ID unchanged.
        """
        parsed = urlparse(youtube_url)

        # Preferred: the ``v`` query parameter, independent of its position.
        query_ids = parse_qs(parsed.query).get("v")
        if query_ids:
            return query_ids[0]

        # Short links carry the ID as the first path segment.
        if parsed.hostname in ("youtu.be", "www.youtu.be"):
            short_id = parsed.path.strip("/").split("/")[0]
            if short_id:
                return short_id

        return youtube_url.split("youtube.com/watch?v=")[-1]

    def load(self) -> List[Document]:
        """Load the transcript of the configured video as a single document.

        The transcript in ``self.language`` is used when available; otherwise the
        English transcript is fetched and translated to ``self.language``.

        Returns:
            A one-element list whose document holds the joined transcript text and
            metadata containing ``source`` (the video ID) plus, optionally, the
            video info.

        Raises:
            ImportError: If ``youtube_transcript_api`` is not installed.
        """
        try:
            from youtube_transcript_api import NoTranscriptFound, YouTubeTranscriptApi
        except ImportError as err:
            raise ImportError(
                "Could not import youtube_transcript_api python package. "
                "Please install it with `pip install youtube-transcript-api`."
            ) from err

        metadata = {"source": self.video_id}

        if self.add_video_info:
            # Get more video meta info
            # Such as title, description, thumbnail url, publish_date
            metadata.update(self._get_video_info())

        transcript_list = YouTubeTranscriptApi.list_transcripts(self.video_id)
        try:
            transcript = transcript_list.find_transcript([self.language])
        except NoTranscriptFound:
            # No transcript in the requested language: translate the English one.
            en_transcript = transcript_list.find_transcript(["en"])
            transcript = en_transcript.translate(self.language)

        transcript_pieces = transcript.fetch()
        transcript_text = " ".join(
            piece["text"].strip(" ") for piece in transcript_pieces
        )

        return [Document(page_content=transcript_text, metadata=metadata)]

    def _get_video_info(self) -> dict:
        """Get important video information.

        Components are:
            - title
            - description
            - view_count
            - thumbnail_url
            - publish_date
            - length
            - author

        Raises:
            ImportError: If ``pytube`` is not installed.
        """
        try:
            from pytube import YouTube
        except ImportError as err:
            raise ImportError(
                "Could not import pytube python package. "
                "Please install it with `pip install pytube`."
            ) from err

        yt = YouTube(f"https://www.youtube.com/watch?v={self.video_id}")
        return {
            "title": yt.title,
            "description": yt.description,
            "view_count": yt.views,
            "thumbnail_url": yt.thumbnail_url,
            "publish_date": yt.publish_date,
            "length": yt.length,
            "author": yt.author,
        }


@dataclass
class GoogleApiYoutubeLoader(BaseLoader):
    """Loader that loads all Videos from a Channel

    To use, you should have the ``googleapiclient,youtube_transcript_api``
    python package installed.
    As the service needs a google_api_client, you first have to initialize
    the GoogleApiClient.

    Additionally you have to either provide a channel name or a list of videoids
    "https://developers.google.com/docs/api/quickstart/python"

    Example:
        .. code-block:: python

            from langchain.document_loaders import GoogleApiClient
            from langchain.document_loaders import GoogleApiYoutubeLoader
            google_api_client = GoogleApiClient(
                service_account_path=Path("path_to_your_sec_file.json")
            )
            loader = GoogleApiYoutubeLoader(
                google_api_client=google_api_client,
                channel_name = "CodeAesthetic"
            )
            loader.load()

    """

    # Source of the credentials used to build the YouTube API client.
    google_api_client: GoogleApiClient
    # Channel to search for; takes precedence over video_ids in ``load``.
    channel_name: Optional[str] = None
    # Explicit list of video IDs, used when no channel name is given.
    video_ids: Optional[List[str]] = None
    # When true, channel results carry the search snippet in their metadata.
    add_video_info: bool = True
    # Language code of the captions to fetch.
    captions_language: str = "en"

    def __post_init__(self) -> None:
        """Build the YouTube API client from the Google client's credentials."""
        self.youtube_client = self._build_youtube_client(self.google_api_client.creds)

    def _build_youtube_client(self, creds: Any) -> Any:
        """Return a YouTube Data API v3 client authorized with ``creds``.

        Raises:
            ImportError: If the Google API client or transcript package is missing.
        """
        try:
            from googleapiclient.discovery import build
            from youtube_transcript_api import YouTubeTranscriptApi  # noqa: F401
        except ImportError as err:
            raise ImportError(
                "You must run "
                "`pip install --upgrade "
                "google-api-python-client google-auth-httplib2 "
                "google-auth-oauthlib "
                "youtube-transcript-api` "
                "to use the Google API YouTube loader"
            ) from err

        return build("youtube", "v3", credentials=creds)

    @root_validator
    def validate_channel_or_videoIds_is_set(
        cls, values: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Validate that at least one of channel_name or video_ids is set.

        Raises:
            ValueError: If neither value is provided.
        """
        if not values.get("channel_name") and not values.get("video_ids"):
            raise ValueError("Must specify either channel_name or video_ids")
        return values

    def _get_transcripe_for_video_id(self, video_id: str) -> str:
        """Return the joined caption text for a single video.

        Captions in ``self.captions_language`` are used when available; otherwise
        the English transcript is translated to that language.
        """
        from youtube_transcript_api import NoTranscriptFound, YouTubeTranscriptApi

        # Look up the transcripts of the requested video, not of the whole
        # ``self.video_ids`` list (which may also be None in channel mode).
        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
        try:
            transcript = transcript_list.find_transcript([self.captions_language])
        except NoTranscriptFound:
            en_transcript = transcript_list.find_transcript(["en"])
            transcript = en_transcript.translate(self.captions_language)

        transcript_pieces = transcript.fetch()
        return " ".join(piece["text"].strip(" ") for piece in transcript_pieces)

    def _get_document_for_video_id(self, video_id: str, **kwargs: Any) -> Document:
        """Build a document for one video: captions as content, API item as metadata.

        Raises:
            ValueError: If the API returns no item for ``video_id``.
        """
        captions = self._get_transcripe_for_video_id(video_id)
        video_response = (
            self.youtube_client.videos()
            .list(
                part="id,snippet",
                id=video_id,
            )
            .execute()
        )
        items = video_response.get("items")
        if not items:
            raise ValueError(f"No video found for video id: {video_id!r}")
        return Document(
            page_content=captions,
            metadata=items[0],
        )

    def _get_channel_id(self, channel_name: str) -> str:
        """Resolve a channel name to its channel ID via the search endpoint.

        Raises:
            ValueError: If the search returns no channel.
        """
        request = self.youtube_client.search().list(
            part="id",
            q=channel_name,
            type="channel",
            maxResults=1,  # only the top search hit is used
        )
        response = request.execute()
        items = response.get("items")
        if not items:
            raise ValueError(f"No channel found for channel name: {channel_name!r}")
        return items[0]["id"]["channelId"]

    def _get_document_for_channel(self, channel: str, **kwargs: Any) -> List[Document]:
        """Return one document per video found for the given channel name.

        Pages through the search results for the channel; entries without a
        ``videoId`` are skipped.
        """
        channel_id = self._get_channel_id(channel)
        request = self.youtube_client.search().list(
            part="id,snippet",
            channelId=channel_id,
            maxResults=50,  # adjust this value to retrieve more or fewer videos
        )
        documents = []
        while request is not None:
            response = request.execute()

            for item in response["items"]:
                video_id = item["id"].get("videoId")
                if not video_id:
                    continue
                meta_data = {"videoId": video_id}
                if self.add_video_info:
                    # Thumbnails are dropped from the metadata; tolerate snippets
                    # that do not include them.
                    item["snippet"].pop("thumbnails", None)
                    meta_data.update(item["snippet"])
                documents.append(
                    Document(
                        page_content=self._get_transcripe_for_video_id(video_id),
                        metadata=meta_data,
                    )
                )
            # list_next returns None once there are no further result pages.
            request = self.youtube_client.search().list_next(request, response)

        return documents

    def load(self) -> List[Document]:
        """Load documents.

        Uses ``channel_name`` when set, otherwise the explicit ``video_ids``.

        Raises:
            ValueError: If neither ``channel_name`` nor ``video_ids`` is set.
        """
        document_list = []
        if self.channel_name:
            document_list.extend(self._get_document_for_channel(self.channel_name))
        elif self.video_ids:
            document_list.extend(
                self._get_document_for_video_id(video_id)
                for video_id in self.video_ids
            )
        else:
            raise ValueError("Must specify either channel_name or video_ids")

        return document_list