KarthickAdopleAI committed
Commit 2152fa4 · verified · 1 Parent(s): 9cba730

Update app.py

Files changed (1)
  1. app.py +30 -22
app.py CHANGED
@@ -15,9 +15,12 @@ from typing import List, Dict, Tuple
 from langchain_openai import AzureChatOpenAI,AzureOpenAIEmbeddings
 from langchain.text_splitter import CharacterTextSplitter
 
+import tempfile
+import os
+
 class PDFExtract:
     def __init__(self):
-        pass
+        pass
 
     def _extract_text_from_pdfs(self, file_paths: List[str]) -> List[str]:
         """Extract text content from PDF files.
@@ -27,8 +30,8 @@ class PDFExtract:
         List[str]: Extracted text from the PDFs.
         """
         docs = []
-        loaders = [UnstructuredLoader(file_obj, strategy="fast") for file_obj in file_paths]
-        for loader in loaders:
+        for file_path in file_paths:
+            loader = UnstructuredLoader(file_path, strategy="fast")
             docs.extend(loader.load())
         return docs
 
@@ -40,9 +43,7 @@ class PDFExtract:
         List[str]: List of smaller text chunks.
        """
         text_splitter = CharacterTextSplitter(separator="\n", chunk_size=2000, chunk_overlap=0, length_function=len)
-
         chunks = text_splitter.split_documents(text)
-
         return chunks
 
     def _create_vector_store_from_text_chunks(self, text_chunks: List[str]) -> FAISS:
@@ -53,16 +54,32 @@ class PDFExtract:
         FAISS: Vector store created from the text chunks.
         """
         embeddings = AzureOpenAIEmbeddings(
-            azure_deployment="text-embedding-3-large",
-        )
-
+            azure_deployment="text-embedding-3-large",
+        )
         return FAISS.from_documents(documents=text_chunks, embedding=embeddings)
 
-    def main(self,file_paths: List[str]):
-        text = self._extract_text_from_pdfs(file_paths)
-        text_chunks = self._split_text_into_chunks(text)
-        vector_store = self._create_vector_store_from_text_chunks(text_chunks)
-        return vector_store
+    def main(self, file_paths: List[str]):
+        text = self._extract_text_from_pdfs(file_paths)
+        text_chunks = self._split_text_into_chunks(text)
+        vector_store = self._create_vector_store_from_text_chunks(text_chunks)
+        return vector_store
+
+# Function to process the uploaded file
+def process_file(uploaded_file):
+    with st.spinner("Processing document..."):
+        # Save the uploaded file to a temporary location
+        with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(uploaded_file.name)[1]) as tmp_file:
+            tmp_file.write(uploaded_file.getvalue())
+            tmp_file_path = tmp_file.name
+
+        # Process the file
+        pdfextract = PDFExtract()
+        db = pdfextract.main([tmp_file_path])
+
+        # Clean up the temporary file
+        os.unlink(tmp_file_path)
+
+        return db
 # Set page configuration
 st.set_page_config(page_title="GASB Decision Flow", layout="wide")
 
@@ -163,15 +180,6 @@ if 'db' not in st.session_state:
 if 'file_processed' not in st.session_state:
     st.session_state.file_processed = False
 
-# Function to process the uploaded file
-def process_file(uploaded_file):
-    with st.spinner("Processing document..."):
-        pdfextract = PDFExtract()
-        print(uploaded_file.name)
-        db = pdfextract.main([uploaded_file.name])
-
-        return db
-
 # Center the file uploader
 st.markdown('<div class="uploadfile-container">', unsafe_allow_html=True)
 uploaded_file = st.file_uploader("Upload your contract document (PDF, Word, or Text)", type=["pdf", "docx", "txt"])
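For context, the revised upload path writes Streamlit's in-memory UploadedFile to a temporary file so UnstructuredLoader can read it from disk, then builds the FAISS index through PDFExtract.main. The sketch below is a hypothetical wiring of the uploader block above to the new process_file; it assumes the rest of app.py caches the index in st.session_state.db and flips file_processed after indexing, since the actual call site is outside this diff.

# Hypothetical wiring (not part of this commit): index the document once per upload.
if uploaded_file is not None and not st.session_state.file_processed:
    # process_file saves the upload to a temp file, chunks it, and returns a FAISS store
    st.session_state.db = process_file(uploaded_file)
    st.session_state.file_processed = True
    # The cached store can then back retrieval, e.g.:
    #   docs = st.session_state.db.similarity_search(query, k=4)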