File size: 1,509 Bytes
58d33f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
"""Loading logic for loading documents from a GCS file."""
import tempfile
from typing import List

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.unstructured import UnstructuredFileLoader


class GCSFileLoader(BaseLoader):
    """Loading logic for loading documents from GCS."""

    def __init__(self, project_name: str, bucket: str, blob: str):
        """Initialize with bucket and key name."""
        self.bucket = bucket
        self.blob = blob
        self.project_name = project_name

    def load(self) -> List[Document]:
        """Load documents."""
        try:
            from google.cloud import storage
        except ImportError:
            raise ValueError(
                "Could not import google-cloud-storage python package. "
                "Please it install it with `pip install google-cloud-storage`."
            )

        # Initialise a client
        storage_client = storage.Client(self.project_name)
        # Create a bucket object for our bucket
        bucket = storage_client.get_bucket(self.bucket)
        # Create a blob object from the filepath
        blob = bucket.blob(self.blob)
        with tempfile.TemporaryDirectory() as temp_dir:
            file_path = f"{temp_dir}/{self.blob}"
            # Download the file to a destination
            blob.download_to_filename(file_path)
            loader = UnstructuredFileLoader(file_path)
            return loader.load()