Spaces:

GovindRaj
/

upload-pdf

Sleeping

App Files Files Community

GovindRaj committed on Oct 21, 2024

Commit

29d7f82

verified ·

1 Parent(s): c5fbeef

Update app.py

Browse files

Files changed (1) hide show

app.py +27 -18

app.py CHANGED Viewed

@@ -5,9 +5,12 @@ from langchain_community.document_loaders import PyPDFLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 import os
 import tempfile
-DB_FAISS_PATH = './vectorstore/db_faiss'
 def create_vector_db(uploaded_files):
     # Create a temporary directory
     with tempfile.TemporaryDirectory() as temp_dir:
@@ -26,7 +29,7 @@ def create_vector_db(uploaded_files):
                 loader = PyPDFLoader(pdf_path)
                 documents.extend(loader.load())
-        # Split documents
         text_splitter = RecursiveCharacterTextSplitter(
             chunk_size=500,
             chunk_overlap=50
@@ -39,24 +42,30 @@ def create_vector_db(uploaded_files):
             model_kwargs={'device': 'cpu'}
         )
-        # Create the vectorstore directory if it doesn't exist
-        if not os.path.exists(DB_FAISS_PATH):
-            try:
-                os.makedirs(DB_FAISS_PATH)
-                print(f"Created directory {DB_FAISS_PATH}")
-            except Exception as e:
-                print(f"Error creating directory: {e}")
-        # Create and save FAISS database
         db = FAISS.from_documents(texts, embeddings)
-        try:
-            db.save_local(DB_FAISS_PATH)
-            print(f"FAISS database saved to {DB_FAISS_PATH}")
-        except Exception as e:
-            print(f"Error saving FAISS database: {e}")
         return True
 def main():
     st.title("PDF to Vector Database Converter")
@@ -71,9 +80,9 @@ def main():
             try:
                 success = create_vector_db(uploaded_files)
                 if success:
-                    st.success("Vector database created successfully!")
             except Exception as e:
                 st.error(f"An error occurred: {str(e)}")
 if __name__ == "__main__":
-    main()

 from langchain.text_splitter import RecursiveCharacterTextSplitter
 import os
 import tempfile
+from huggingface_hub import HfApi, HfFolder
+DB_FAISS_PATH = 'vectorstore/db_faiss'
+DATASET_REPO = "GovindRaj/faiss-vectorstore"  # Your Hugging Face Dataset ID
+# Function to create FAISS vector DB and upload to Hugging Face
 def create_vector_db(uploaded_files):
     # Create a temporary directory
     with tempfile.TemporaryDirectory() as temp_dir:
                 loader = PyPDFLoader(pdf_path)
                 documents.extend(loader.load())
+        # Split documents into chunks
         text_splitter = RecursiveCharacterTextSplitter(
             chunk_size=500,
             chunk_overlap=50
             model_kwargs={'device': 'cpu'}
         )
+        # Create and save FAISS database locally
         db = FAISS.from_documents(texts, embeddings)
+        db.save_local(DB_FAISS_PATH)
+        # Retrieve the token from environment variables (Hugging Face Secrets)
+        hf_token = os.getenv("HF_TOKEN")
+        if not hf_token:
+            raise ValueError("Hugging Face token not found. Please set the token in Hugging Face secrets.")
+        # Push the vector database to Hugging Face Dataset
+        HfFolder.save_token(hf_token)
+        api = HfApi()
+        api.upload_folder(
+            folder_path=DB_FAISS_PATH,  # Local path to the FAISS folder
+            path_in_repo="faiss_data",  # Save at the root of the dataset
+            repo_id=DATASET_REPO,  # Hugging Face Dataset ID
+            repo_type="dataset",  # Specify this is a dataset
+            token=hf_token  # Use the token from secrets
+        )
         return True
+# Streamlit app
 def main():
     st.title("PDF to Vector Database Converter")
             try:
                 success = create_vector_db(uploaded_files)
                 if success:
+                    st.success("Vector database created and uploaded successfully!")
             except Exception as e:
                 st.error(f"An error occurred: {str(e)}")
 if __name__ == "__main__":
+    main()