Spaces:
Runtime error
Runtime error
"""Loader that loads data from Google Drive.""" | |
# Prerequisites: | |
# 1. Create a Google Cloud project | |
# 2. Enable the Google Drive API: | |
# https://console.cloud.google.com/flows/enableapi?apiid=drive.googleapis.com | |
# 3. Authorize credentials for desktop app: | |
# https://developers.google.com/drive/api/quickstart/python#authorize_credentials_for_a_desktop_application # noqa: E501 | |
# 4. For service accounts visit | |
# https://cloud.google.com/iam/docs/service-accounts-create | |
from pathlib import Path | |
from typing import Any, Dict, List, Optional | |
from pydantic import BaseModel, root_validator, validator | |
from langchain.docstore.document import Document | |
from langchain.document_loaders.base import BaseLoader | |
SCOPES = ["https://www.googleapis.com/auth/drive.readonly"] | |
class GoogleDriveLoader(BaseLoader, BaseModel): | |
"""Loader that loads Google Docs from Google Drive.""" | |
service_account_key: Path = Path.home() / ".credentials" / "keys.json" | |
credentials_path: Path = Path.home() / ".credentials" / "credentials.json" | |
token_path: Path = Path.home() / ".credentials" / "token.json" | |
folder_id: Optional[str] = None | |
document_ids: Optional[List[str]] = None | |
file_ids: Optional[List[str]] = None | |
def validate_folder_id_or_document_ids( | |
cls, values: Dict[str, Any] | |
) -> Dict[str, Any]: | |
"""Validate that either folder_id or document_ids is set, but not both.""" | |
if values.get("folder_id") and ( | |
values.get("document_ids") or values.get("file_ids") | |
): | |
raise ValueError( | |
"Cannot specify both folder_id and document_ids nor " | |
"folder_id and file_ids" | |
) | |
if ( | |
not values.get("folder_id") | |
and not values.get("document_ids") | |
and not values.get("file_ids") | |
): | |
raise ValueError("Must specify either folder_id, document_ids, or file_ids") | |
return values | |
def validate_credentials_path(cls, v: Any, **kwargs: Any) -> Any: | |
"""Validate that credentials_path exists.""" | |
if not v.exists(): | |
raise ValueError(f"credentials_path {v} does not exist") | |
return v | |
def _load_credentials(self) -> Any: | |
"""Load credentials.""" | |
# Adapted from https://developers.google.com/drive/api/v3/quickstart/python | |
try: | |
from google.auth.transport.requests import Request | |
from google.oauth2 import service_account | |
from google.oauth2.credentials import Credentials | |
from google_auth_oauthlib.flow import InstalledAppFlow | |
except ImportError: | |
raise ImportError( | |
"You must run" | |
"`pip install --upgrade " | |
"google-api-python-client google-auth-httplib2 " | |
"google-auth-oauthlib`" | |
"to use the Google Drive loader." | |
) | |
creds = None | |
if self.service_account_key.exists(): | |
return service_account.Credentials.from_service_account_file( | |
str(self.service_account_key), scopes=SCOPES | |
) | |
if self.token_path.exists(): | |
creds = Credentials.from_authorized_user_file(str(self.token_path), SCOPES) | |
if not creds or not creds.valid: | |
if creds and creds.expired and creds.refresh_token: | |
creds.refresh(Request()) | |
else: | |
flow = InstalledAppFlow.from_client_secrets_file( | |
str(self.credentials_path), SCOPES | |
) | |
creds = flow.run_local_server(port=0) | |
with open(self.token_path, "w") as token: | |
token.write(creds.to_json()) | |
return creds | |
def _load_document_from_id(self, id: str) -> Document: | |
"""Load a document from an ID.""" | |
from io import BytesIO | |
from googleapiclient.discovery import build | |
from googleapiclient.http import MediaIoBaseDownload | |
creds = self._load_credentials() | |
service = build("drive", "v3", credentials=creds) | |
request = service.files().export_media(fileId=id, mimeType="text/plain") | |
fh = BytesIO() | |
downloader = MediaIoBaseDownload(fh, request) | |
done = False | |
while done is False: | |
status, done = downloader.next_chunk() | |
text = fh.getvalue().decode("utf-8") | |
metadata = {"source": f"https://docs.google.com/document/d/{id}/edit"} | |
return Document(page_content=text, metadata=metadata) | |
def _load_documents_from_folder(self) -> List[Document]: | |
"""Load documents from a folder.""" | |
from googleapiclient.discovery import build | |
creds = self._load_credentials() | |
service = build("drive", "v3", credentials=creds) | |
results = ( | |
service.files() | |
.list( | |
q=f"'{self.folder_id}' in parents", | |
pageSize=1000, | |
fields="nextPageToken, files(id, name, mimeType)", | |
) | |
.execute() | |
) | |
items = results.get("files", []) | |
returns = [] | |
for item in items: | |
if item["mimeType"] == "application/vnd.google-apps.document": | |
returns.append(self._load_document_from_id(item["id"])) | |
elif item["mimeType"] == "application/pdf": | |
returns.extend(self._load_file_from_id(item["id"])) | |
else: | |
pass | |
return returns | |
def _load_documents_from_ids(self) -> List[Document]: | |
"""Load documents from a list of IDs.""" | |
if not self.document_ids: | |
raise ValueError("document_ids must be set") | |
return [self._load_document_from_id(doc_id) for doc_id in self.document_ids] | |
def _load_file_from_id(self, id: str) -> List[Document]: | |
"""Load a file from an ID.""" | |
from io import BytesIO | |
from googleapiclient.discovery import build | |
from googleapiclient.http import MediaIoBaseDownload | |
creds = self._load_credentials() | |
service = build("drive", "v3", credentials=creds) | |
request = service.files().get_media(fileId=id) | |
fh = BytesIO() | |
downloader = MediaIoBaseDownload(fh, request) | |
done = False | |
while done is False: | |
status, done = downloader.next_chunk() | |
content = fh.getvalue() | |
from PyPDF2 import PdfReader | |
pdf_reader = PdfReader(BytesIO(content)) | |
return [ | |
Document( | |
page_content=page.extract_text(), | |
metadata={ | |
"source": f"https://drive.google.com/file/d/{id}/view", | |
"page": i, | |
}, | |
) | |
for i, page in enumerate(pdf_reader.pages) | |
] | |
def _load_file_from_ids(self) -> List[Document]: | |
"""Load files from a list of IDs.""" | |
if not self.file_ids: | |
raise ValueError("file_ids must be set") | |
docs = [] | |
for file_id in self.file_ids: | |
docs.extend(self._load_file_from_id(file_id)) | |
return docs | |
def load(self) -> List[Document]: | |
"""Load documents.""" | |
if self.folder_id: | |
return self._load_documents_from_folder() | |
elif self.document_ids: | |
return self._load_documents_from_ids() | |
else: | |
return self._load_file_from_ids() | |