Shreyas094 committed on
Commit 459b8b4 (verified)
1 Parent(s): 25c59df

Update app.py

Files changed (1)
  1. app.py +18 -13
app.py CHANGED
@@ -1,9 +1,9 @@
-import tempfile
 import os
 import json
 import gradio as gr
 import pandas as pd
-from tempfile import NamedTemporaryFile
+import tempfile
+from typing import List
 
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_community.vectorstores import FAISS
@@ -13,22 +13,28 @@ from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_community.llms import HuggingFaceHub
 from langchain_text_splitters import RecursiveCharacterTextSplitter
 from langchain_core.runnables import RunnableParallel, RunnablePassthrough
+from langchain_core.documents import Document
 
 huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")
 
-# At the beginning of your script
-os.environ['TMPDIR'] = '/tmp'
-
-def load_and_split_document(file):
-    """Loads and splits the document into pages."""
+def load_and_split_document(file: tempfile._TemporaryFileWrapper) -> List[Document]:
+    """Loads and splits the document into chunks."""
     loader = PyPDFLoader(file.name)
-    data = loader.load_and_split()
-    return data
+    pages = loader.load()
+
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=1000,
+        chunk_overlap=200,
+        length_function=len,
+    )
+
+    chunks = text_splitter.split_documents(pages)
+    return chunks
 
 def get_embeddings():
     return HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
 
-def create_database(data, embeddings):
+def create_database(data: List[Document], embeddings):
     db = FAISS.from_documents(data, embeddings)
     db.save_local("faiss_database")
 
@@ -74,7 +80,7 @@ def update_vectors(file):
     data = load_and_split_document(file)
     embed = get_embeddings()
     create_database(data, embed)
-    return "Vector store updated successfully."
+    return f"Vector store updated successfully. Processed {len(data)} chunks."
 
 def ask_question(question):
     if not question:
@@ -92,14 +98,13 @@ def extract_db_to_excel():
     data = [{"page_content": doc.page_content, "metadata": json.dumps(doc.metadata)} for doc in documents]
     df = pd.DataFrame(data)
 
-    # Create a temporary file
     with tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx') as tmp:
         excel_path = tmp.name
         df.to_excel(excel_path, index=False)
 
     return excel_path
 
-# Modify the Gradio interface
+# Gradio interface
 with gr.Blocks() as demo:
     gr.Markdown("# Chat with your PDF documents")
 
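The rewritten load_and_split_document no longer returns whole pages; it splits them into overlapping chunks. A minimal sketch of what those splitter settings do, using an in-memory Document in place of a real PDF (the sample text and source name are illustrative):

# Minimal sketch, not from the commit: demonstrate the splitter settings
# on a dummy Document so no PDF or PyPDFLoader is needed.
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

sample = Document(page_content="lorem ipsum " * 500, metadata={"source": "dummy.pdf"})

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,    # hard cap on characters per chunk (length_function=len)
    chunk_overlap=200,  # trailing characters repeated at the start of the next chunk
    length_function=len,
)

chunks = text_splitter.split_documents([sample])
print(f"{len(chunks)} chunks; first chunk has {len(chunks[0].page_content)} characters")

With 200 characters of overlap, text near a chunk boundary appears in both neighbouring chunks, which helps retrieval when an answer straddles the cut.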
 
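create_database persists the index with db.save_local("faiss_database"). A hedged sketch of reading it back, as ask_question presumably does elsewhere in app.py; note that recent langchain-community releases require allow_dangerous_deserialization=True when loading a pickled local index, while older releases do not accept that keyword:

# Hedged sketch: reload the saved index and run a similarity search.
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
# allow_dangerous_deserialization is required on newer langchain-community;
# drop it on older versions that predate the flag.
db = FAISS.load_local("faiss_database", embeddings, allow_dangerous_deserialization=True)

for doc in db.similarity_search("What is this document about?", k=4):
    print(doc.metadata, doc.page_content[:80])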
 
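extract_db_to_excel relies on delete=False so the temporary file survives the with block for Gradio to serve. A minimal sketch of the same pattern, with the write moved after the block so it also works on Windows, where the still-open handle would block a second writer (pandas needs the optional openpyxl package to write .xlsx):

# Minimal sketch of the export pattern; the DataFrame content is illustrative.
import tempfile
import pandas as pd

df = pd.DataFrame([{"page_content": "example text", "metadata": "{}"}])

with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as tmp:
    excel_path = tmp.name
df.to_excel(excel_path, index=False)  # write after the handle is closed
print(excel_path)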
 
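The diff stops at the top of the Blocks UI. A hedged sketch of how these functions might be wired into it; the component names and the import are illustrative, not taken from app.py:

import gradio as gr

from app import update_vectors, extract_db_to_excel  # hypothetical import; in the real script these are defined above

with gr.Blocks() as demo:
    gr.Markdown("# Chat with your PDF documents")

    pdf_input = gr.File(label="Upload PDF")  # illustrative component names
    status = gr.Textbox(label="Status")
    pdf_input.change(fn=update_vectors, inputs=pdf_input, outputs=status)

    export_button = gr.Button("Export database to Excel")
    export_output = gr.File(label="Exported Excel file")
    export_button.click(fn=extract_db_to_excel, inputs=None, outputs=export_output)

demo.launch()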