Shreyas094 committed on
Commit 459b8b4 (verified)
1 Parent(s): 25c59df

Update app.py

Files changed (1)
  1. app.py +18 -13
app.py CHANGED
@@ -1,9 +1,9 @@
-import tempfile
 import os
 import json
 import gradio as gr
 import pandas as pd
-from tempfile import NamedTemporaryFile
+import tempfile
+from typing import List
 
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_community.vectorstores import FAISS
@@ -13,22 +13,28 @@ from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_community.llms import HuggingFaceHub
 from langchain_text_splitters import RecursiveCharacterTextSplitter
 from langchain_core.runnables import RunnableParallel, RunnablePassthrough
+from langchain_core.documents import Document
 
 huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")
 
-# At the beginning of your script
-os.environ['TMPDIR'] = '/tmp'
-
-def load_and_split_document(file):
-    """Loads and splits the document into pages."""
+def load_and_split_document(file: tempfile._TemporaryFileWrapper) -> List[Document]:
+    """Loads and splits the document into chunks."""
     loader = PyPDFLoader(file.name)
-    data = loader.load_and_split()
-    return data
+    pages = loader.load()
+
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=1000,
+        chunk_overlap=200,
+        length_function=len,
+    )
+
+    chunks = text_splitter.split_documents(pages)
+    return chunks
 
 def get_embeddings():
     return HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
 
-def create_database(data, embeddings):
+def create_database(data: List[Document], embeddings):
     db = FAISS.from_documents(data, embeddings)
     db.save_local("faiss_database")
 
@@ -74,7 +80,7 @@ def update_vectors(file):
     data = load_and_split_document(file)
     embed = get_embeddings()
     create_database(data, embed)
-    return "Vector store updated successfully."
+    return f"Vector store updated successfully. Processed {len(data)} chunks."
 
 def ask_question(question):
     if not question:
@@ -92,14 +98,13 @@ def extract_db_to_excel():
     data = [{"page_content": doc.page_content, "metadata": json.dumps(doc.metadata)} for doc in documents]
     df = pd.DataFrame(data)
 
-    # Create a temporary file
     with tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx') as tmp:
         excel_path = tmp.name
         df.to_excel(excel_path, index=False)
 
     return excel_path
 
-# Modify the Gradio interface
+# Gradio interface
 with gr.Blocks() as demo:
     gr.Markdown("# Chat with your PDF documents")
 
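The rewritten load_and_split_document no longer returns whole pages; it splits them into overlapping chunks. A minimal sketch of what those splitter settings do, using an in-memory Document in place of a real PDF (the sample text and source name are illustrative):

# Minimal sketch, not from the commit: demonstrate the splitter settings
# on a dummy Document so no PDF or PyPDFLoader is needed.
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

sample = Document(page_content="lorem ipsum " * 500, metadata={"source": "dummy.pdf"})

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,    # hard cap on characters per chunk (length_function=len)
    chunk_overlap=200,  # trailing characters repeated at the start of the next chunk
    length_function=len,
)

chunks = text_splitter.split_documents([sample])
print(f"{len(chunks)} chunks; first chunk has {len(chunks[0].page_content)} characters")

With 200 characters of overlap, text near a chunk boundary appears in both neighbouring chunks, which helps retrieval when an answer straddles the cut.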
 
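create_database persists the index with db.save_local("faiss_database"). A hedged sketch of reading it back, as ask_question presumably does elsewhere in app.py; note that recent langchain-community releases require allow_dangerous_deserialization=True when loading a pickled local index, while older releases do not accept that keyword:

# Hedged sketch: reload the saved index and run a similarity search.
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
# allow_dangerous_deserialization is required on newer langchain-community;
# drop it on older versions that predate the flag.
db = FAISS.load_local("faiss_database", embeddings, allow_dangerous_deserialization=True)

for doc in db.similarity_search("What is this document about?", k=4):
    print(doc.metadata, doc.page_content[:80])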
 
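extract_db_to_excel relies on delete=False so the temporary file survives the with block for Gradio to serve. A minimal sketch of the same pattern, with the write moved after the block so it also works on Windows, where the still-open handle would block a second writer (pandas needs the optional openpyxl package to write .xlsx):

# Minimal sketch of the export pattern; the DataFrame content is illustrative.
import tempfile
import pandas as pd

df = pd.DataFrame([{"page_content": "example text", "metadata": "{}"}])

with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as tmp:
    excel_path = tmp.name
df.to_excel(excel_path, index=False)  # write after the handle is closed
print(excel_path)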
 
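The diff stops at the top of the Blocks UI. A hedged sketch of how these functions might be wired into it; the component names and the import are illustrative, not taken from app.py:

import gradio as gr

from app import update_vectors, extract_db_to_excel  # hypothetical import; in the real script these are defined above

with gr.Blocks() as demo:
    gr.Markdown("# Chat with your PDF documents")

    pdf_input = gr.File(label="Upload PDF")  # illustrative component names
    status = gr.Textbox(label="Status")
    pdf_input.change(fn=update_vectors, inputs=pdf_input, outputs=status)

    export_button = gr.Button("Export database to Excel")
    export_output = gr.File(label="Exported Excel file")
    export_button.click(fn=extract_db_to_excel, inputs=None, outputs=export_output)

demo.launch()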