Spaces:

jfeng1115
/

marketing-analytics-bot

Runtime error

App Files Files Community

marketing-analytics-bot / langchain /document_loaders /youtube.py

jfeng1115

init commit

58d33f0 almost 2 years ago

raw

history blame contribute delete

11.7 kB

	"""Loader that loads YouTube transcript."""
	from __future__ import annotations

	from pathlib import Path
	from typing import Any, Dict, List, Optional

	from pydantic import root_validator
	from pydantic.dataclasses import dataclass

	from langchain.docstore.document import Document
	from langchain.document_loaders.base import BaseLoader

	SCOPES = ["https://www.googleapis.com/auth/drive.readonly"]


	@dataclass
	class GoogleApiClient:
	"""A Generic Google Api Client.

	To use, you should have the ``google_auth_oauthlib,youtube_transcript_api,google``
	python package installed.
	As the google api expects credentials you need to set up a google account and
	register your Service. "https://developers.google.com/docs/api/quickstart/python"



	Example:
	.. code-block:: python

	from langchain.document_loaders import GoogleApiClient
	google_api_client = GoogleApiClient(
	service_account_path=Path("path_to_your_sec_file.json")
	)

	"""

	credentials_path: Path = Path.home() / ".credentials" / "credentials.json"
	service_account_path: Path = Path.home() / ".credentials" / "credentials.json"
	token_path: Path = Path.home() / ".credentials" / "token.json"

	def __post_init__(self) -> None:
	self.creds = self._load_credentials()

	@root_validator
	def validate_channel_or_videoIds_is_set(
	cls, values: Dict[str, Any]
	) -> Dict[str, Any]:
	"""Validate that either folder_id or document_ids is set, but not both."""

	if not values.get("credentials_path") and not values.get(
	"service_account_path"
	):
	raise ValueError("Must specify either channel_name or video_ids")
	return values

	def _load_credentials(self) -> Any:
	"""Load credentials."""
	# Adapted from https://developers.google.com/drive/api/v3/quickstart/python
	try:
	from google.auth.transport.requests import Request
	from google.oauth2 import service_account
	from google.oauth2.credentials import Credentials
	from google_auth_oauthlib.flow import InstalledAppFlow
	from youtube_transcript_api import YouTubeTranscriptApi # noqa: F401
	except ImportError:
	raise ImportError(
	"You must run"
	"`pip install --upgrade "
	"google-api-python-client google-auth-httplib2 "
	"google-auth-oauthlib"
	"youtube-transcript-api`"
	"to use the Google Drive loader"
	)

	creds = None
	if self.service_account_path.exists():
	return service_account.Credentials.from_service_account_file(
	str(self.service_account_path)
	)
	if self.token_path.exists():
	creds = Credentials.from_authorized_user_file(str(self.token_path), SCOPES)

	if not creds or not creds.valid:
	if creds and creds.expired and creds.refresh_token:
	creds.refresh(Request())
	else:
	flow = InstalledAppFlow.from_client_secrets_file(
	str(self.credentials_path), SCOPES
	)
	creds = flow.run_local_server(port=0)
	with open(self.token_path, "w") as token:
	token.write(creds.to_json())

	return creds


	class YoutubeLoader(BaseLoader):
	"""Loader that loads Youtube transcripts."""

	def __init__(
	self, video_id: str, add_video_info: bool = False, language: str = "en"
	):
	"""Initialize with YouTube video ID."""
	self.video_id = video_id
	self.add_video_info = add_video_info
	self.language = language

	@classmethod
	def from_youtube_channel(cls, youtube_url: str, **kwargs: Any) -> YoutubeLoader:
	"""Given a channel name, load all videos."""
	video_id = youtube_url.split("youtube.com/watch?v=")[-1]
	return cls(video_id, **kwargs)

	def load(self) -> List[Document]:
	"""Load documents."""
	try:
	from youtube_transcript_api import NoTranscriptFound, YouTubeTranscriptApi
	except ImportError:
	raise ImportError(
	"Could not import youtube_transcript_api python package. "
	"Please it install it with `pip install youtube-transcript-api`."
	)

	metadata = {"source": self.video_id}

	if self.add_video_info:
	# Get more video meta info
	# Such as title, description, thumbnail url, publish_date
	video_info = self._get_video_info()
	metadata.update(video_info)

	transcript_list = YouTubeTranscriptApi.list_transcripts(self.video_id)
	try:
	transcript = transcript_list.find_transcript([self.language])
	except NoTranscriptFound:
	en_transcript = transcript_list.find_transcript(["en"])
	transcript = en_transcript.translate(self.language)

	transcript_pieces = transcript.fetch()

	transcript = " ".join([t["text"].strip(" ") for t in transcript_pieces])

	return [Document(page_content=transcript, metadata=metadata)]

	def _get_video_info(self) -> dict:
	"""Get important video information.

	Components are:
	- title
	- description
	- thumbnail url,
	- publish_date
	- channel_author
	- and more.
	"""
	try:
	from pytube import YouTube

	except ImportError:
	raise ImportError(
	"Could not import pytube python package. "
	"Please it install it with `pip install pytube`."
	)
	yt = YouTube(f"https://www.youtube.com/watch?v={self.video_id}")
	video_info = {
	"title": yt.title,
	"description": yt.description,
	"view_count": yt.views,
	"thumbnail_url": yt.thumbnail_url,
	"publish_date": yt.publish_date,
	"length": yt.length,
	"author": yt.author,
	}
	return video_info


	@dataclass
	class GoogleApiYoutubeLoader(BaseLoader):
	"""Loader that loads all Videos from a Channel

	To use, you should have the ``googleapiclient,youtube_transcript_api``
	python package installed.
	As the service needs a google_api_client, you first have to initialize
	the GoogleApiClient.

	Additionally you have to either provide a channel name or a list of videoids
	"https://developers.google.com/docs/api/quickstart/python"



	Example:
	.. code-block:: python

	from langchain.document_loaders import GoogleApiClient
	from langchain.document_loaders import GoogleApiYoutubeLoader
	google_api_client = GoogleApiClient(
	service_account_path=Path("path_to_your_sec_file.json")
	)
	loader = GoogleApiYoutubeLoader(
	google_api_client=google_api_client,
	channel_name = "CodeAesthetic"
	)
	load.load()

	"""

	google_api_client: GoogleApiClient
	channel_name: Optional[str] = None
	video_ids: Optional[List[str]] = None
	add_video_info: bool = True
	captions_language: str = "en"

	def __post_init__(self) -> None:
	self.youtube_client = self._build_youtube_client(self.google_api_client.creds)

	def _build_youtube_client(self, creds: Any) -> Any:
	try:
	from googleapiclient.discovery import build
	from youtube_transcript_api import YouTubeTranscriptApi # noqa: F401
	except ImportError:
	raise ImportError(
	"You must run"
	"`pip install --upgrade "
	"google-api-python-client google-auth-httplib2 "
	"google-auth-oauthlib"
	"youtube-transcript-api`"
	"to use the Google Drive loader"
	)

	return build("youtube", "v3", credentials=creds)

	@root_validator
	def validate_channel_or_videoIds_is_set(
	cls, values: Dict[str, Any]
	) -> Dict[str, Any]:
	"""Validate that either folder_id or document_ids is set, but not both."""
	if not values.get("channel_name") and not values.get("video_ids"):
	raise ValueError("Must specify either channel_name or video_ids")
	return values

	def _get_transcripe_for_video_id(self, video_id: str) -> str:
	from youtube_transcript_api import NoTranscriptFound, YouTubeTranscriptApi

	transcript_list = YouTubeTranscriptApi.list_transcripts(self.video_ids)
	try:
	transcript = transcript_list.find_transcript([self.captions_language])
	except NoTranscriptFound:
	en_transcript = transcript_list.find_transcript(["en"])
	transcript = en_transcript.translate(self.captions_language)

	transcript_pieces = transcript.fetch()
	return " ".join([t["text"].strip(" ") for t in transcript_pieces])

	def _get_document_for_video_id(self, video_id: str, **kwargs: Any) -> Document:
	captions = self._get_transcripe_for_video_id(video_id)
	video_response = (
	self.youtube_client.videos()
	.list(
	part="id,snippet",
	id=video_id,
	)
	.execute()
	)
	return Document(
	page_content=captions,
	metadata=video_response.get("items")[0],
	)

	def _get_channel_id(self, channel_name: str) -> str:
	request = self.youtube_client.search().list(
	part="id",
	q=channel_name,
	type="channel",
	maxResults=1, # we only need one result since channel names are unique
	)
	response = request.execute()
	channel_id = response["items"][0]["id"]["channelId"]
	return channel_id

	def _get_document_for_channel(self, channel: str, **kwargs: Any) -> List[Document]:
	channel_id = self._get_channel_id(channel)
	request = self.youtube_client.search().list(
	part="id,snippet",
	channelId=channel_id,
	maxResults=50, # adjust this value to retrieve more or fewer videos
	)
	video_ids = []
	while request is not None:
	response = request.execute()

	# Add each video ID to the list
	for item in response["items"]:
	if not item["id"].get("videoId"):
	continue
	meta_data = {"videoId": item["id"]["videoId"]}
	if self.add_video_info:
	item["snippet"].pop("thumbnails")
	meta_data.update(item["snippet"])
	video_ids.append(
	Document(
	page_content=self._get_transcripe_for_video_id(
	item["id"]["videoId"]
	),
	metadata=meta_data,
	)
	)
	request = self.youtube_client.search().list_next(request, response)

	return video_ids

	def load(self) -> List[Document]:
	"""Load documents."""
	document_list = []
	if self.channel_name:
	document_list.extend(self._get_document_for_channel(self.channel_name))
	elif self.video_ids:
	document_list.extend(
	[
	self._get_document_for_video_id(video_id)
	for video_id in self.video_ids
	]
	)
	else:
	raise ValueError("Must specify either channel_name or video_ids")
	return document_list