Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -15,9 +15,12 @@ from typing import List, Dict, Tuple
|
|
15 |
from langchain_openai import AzureChatOpenAI,AzureOpenAIEmbeddings
|
16 |
from langchain.text_splitter import CharacterTextSplitter
|
17 |
|
|
|
|
|
|
|
18 |
class PDFExtract:
|
19 |
def __init__(self):
|
20 |
-
|
21 |
|
22 |
def _extract_text_from_pdfs(self, file_paths: List[str]) -> List[str]:
|
23 |
"""Extract text content from PDF files.
|
@@ -27,8 +30,8 @@ class PDFExtract:
|
|
27 |
List[str]: Extracted text from the PDFs.
|
28 |
"""
|
29 |
docs = []
|
30 |
-
|
31 |
-
|
32 |
docs.extend(loader.load())
|
33 |
return docs
|
34 |
|
@@ -40,9 +43,7 @@ class PDFExtract:
|
|
40 |
List[str]: List of smaller text chunks.
|
41 |
"""
|
42 |
text_splitter = CharacterTextSplitter(separator="\n", chunk_size=2000, chunk_overlap=0, length_function=len)
|
43 |
-
|
44 |
chunks = text_splitter.split_documents(text)
|
45 |
-
|
46 |
return chunks
|
47 |
|
48 |
def _create_vector_store_from_text_chunks(self, text_chunks: List[str]) -> FAISS:
|
@@ -53,16 +54,32 @@ class PDFExtract:
|
|
53 |
FAISS: Vector store created from the text chunks.
|
54 |
"""
|
55 |
embeddings = AzureOpenAIEmbeddings(
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
return FAISS.from_documents(documents=text_chunks, embedding=embeddings)
|
60 |
|
61 |
-
def main(self,file_paths: List[str]):
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
66 |
# Set page configuration
|
67 |
st.set_page_config(page_title="GASB Decision Flow", layout="wide")
|
68 |
|
@@ -163,15 +180,6 @@ if 'db' not in st.session_state:
|
|
163 |
if 'file_processed' not in st.session_state:
|
164 |
st.session_state.file_processed = False
|
165 |
|
166 |
-
# Function to process the uploaded file
|
167 |
-
def process_file(uploaded_file):
|
168 |
-
with st.spinner("Processing document..."):
|
169 |
-
pdfextract = PDFExtract()
|
170 |
-
print(uploaded_file.name)
|
171 |
-
db = pdfextract.main([uploaded_file.name])
|
172 |
-
|
173 |
-
return db
|
174 |
-
|
175 |
# Center the file uploader
|
176 |
st.markdown('<div class="uploadfile-container">', unsafe_allow_html=True)
|
177 |
uploaded_file = st.file_uploader("Upload your contract document (PDF, Word, or Text)", type=["pdf", "docx", "txt"])
|
|
|
15 |
from langchain_openai import AzureChatOpenAI,AzureOpenAIEmbeddings
|
16 |
from langchain.text_splitter import CharacterTextSplitter
|
17 |
|
18 |
+
import tempfile
|
19 |
+
import os
|
20 |
+
|
21 |
class PDFExtract:
    """Load document files, chunk their text, and build a FAISS vector store."""

    def __init__(self, embedding_deployment: str = "text-embedding-3-large"):
        """Initialize the extractor.

        Args:
            embedding_deployment (str): Azure OpenAI embedding deployment name.
                Defaults to "text-embedding-3-large", the value previously
                hard-coded in _create_vector_store_from_text_chunks.
        """
        self.embedding_deployment = embedding_deployment

    def _extract_text_from_pdfs(self, file_paths: List[str]) -> List[str]:
        """Extract text content from PDF files.

        Args:
            file_paths (List[str]): Filesystem paths of the documents to load.

        Returns:
            List[str]: Extracted text from the PDFs. (In practice these are
                loader Document objects; annotation kept from the original --
                TODO confirm and tighten.)
        """
        docs = []
        for file_path in file_paths:
            # "fast" strategy trades layout fidelity/OCR for speed.
            loader = UnstructuredLoader(file_path, strategy="fast")
            docs.extend(loader.load())
        return docs

    def _split_text_into_chunks(self, text):
        """Split loaded documents into smaller chunks.

        Args:
            text: Documents as returned by _extract_text_from_pdfs.

        Returns:
            List[str]: List of smaller text chunks.
        """
        # NOTE(review): this method's def line was diff context not shown;
        # signature reconstructed from the call in main() -- confirm on file.
        text_splitter = CharacterTextSplitter(separator="\n", chunk_size=2000, chunk_overlap=0, length_function=len)
        chunks = text_splitter.split_documents(text)
        return chunks

    def _create_vector_store_from_text_chunks(self, text_chunks: List[str]) -> FAISS:
        """Create a FAISS vector store from text chunks.

        Args:
            text_chunks (List[str]): Chunked documents to embed.

        Returns:
            FAISS: Vector store created from the text chunks.
        """
        # Endpoint/credentials are presumably picked up from environment
        # variables by AzureOpenAIEmbeddings -- none are passed here.
        embeddings = AzureOpenAIEmbeddings(
            azure_deployment=self.embedding_deployment,
        )
        return FAISS.from_documents(documents=text_chunks, embedding=embeddings)

    def main(self, file_paths: List[str]):
        """End-to-end pipeline: load files, chunk text, build the vector store.

        Args:
            file_paths (List[str]): Paths of the documents to index.

        Returns:
            FAISS: Vector store built from all supplied documents.
        """
        text = self._extract_text_from_pdfs(file_paths)
        text_chunks = self._split_text_into_chunks(text)
        vector_store = self._create_vector_store_from_text_chunks(text_chunks)
        return vector_store
|
66 |
+
|
67 |
+
# Function to process the uploaded file
def process_file(uploaded_file):
    """Persist an uploaded file to disk and build a vector store from it.

    Args:
        uploaded_file: Streamlit UploadedFile-like object (provides .name
            and .getvalue()).

    Returns:
        FAISS: Vector store built from the uploaded document.
    """
    with st.spinner("Processing document..."):
        # Save the uploaded file to a temporary location -- the loader needs a
        # real filesystem path. Keep the original suffix so file-type detection
        # still works; delete=False because the file is read after this block.
        with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(uploaded_file.name)[1]) as tmp_file:
            tmp_file.write(uploaded_file.getvalue())
            tmp_file_path = tmp_file.name

        try:
            # Process the file
            pdfextract = PDFExtract()
            db = pdfextract.main([tmp_file_path])
        finally:
            # Clean up the temporary file even if processing raises; the
            # original unconditional unlink leaked the file on failure.
            os.unlink(tmp_file_path)

        return db
|
83 |
# Set page configuration
# NOTE(review): st.set_page_config must be the first Streamlit command in a
# script run -- confirm nothing above this line issues Streamlit calls.
st.set_page_config(page_title="GASB Decision Flow", layout="wide")
|
85 |
|
|
|
180 |
# Initialize the file-processed flag once per browser session. Streamlit
# re-runs the whole script on every interaction, so only set the default
# when the key is missing to avoid clobbering state on reruns.
if 'file_processed' not in st.session_state:
    st.session_state.file_processed = False
|
182 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
183 |
# Center the file uploader
# Opens a styled wrapper div (CSS class presumably defined earlier in the
# page). NOTE(review): no closing </div> is visible in this span -- confirm
# it is emitted further down the script.
st.markdown('<div class="uploadfile-container">', unsafe_allow_html=True)
# Restrict uploads to the document types the PDFExtract pipeline can load.
uploaded_file = st.file_uploader("Upload your contract document (PDF, Word, or Text)", type=["pdf", "docx", "txt"])
|