Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -15,9 +15,12 @@ from typing import List, Dict, Tuple
|
|
15 |
from langchain_openai import AzureChatOpenAI,AzureOpenAIEmbeddings
|
16 |
from langchain.text_splitter import CharacterTextSplitter
|
17 |
|
|
|
|
|
|
|
18 |
class PDFExtract:
|
19 |
def __init__(self):
|
20 |
-
|
21 |
|
22 |
def _extract_text_from_pdfs(self, file_paths: List[str]) -> List[str]:
|
23 |
"""Extract text content from PDF files.
|
@@ -27,8 +30,8 @@ class PDFExtract:
|
|
27 |
List[str]: Extracted text from the PDFs.
|
28 |
"""
|
29 |
docs = []
|
30 |
-
|
31 |
-
|
32 |
docs.extend(loader.load())
|
33 |
return docs
|
34 |
|
@@ -40,9 +43,7 @@ class PDFExtract:
|
|
40 |
List[str]: List of smaller text chunks.
|
41 |
"""
|
42 |
text_splitter = CharacterTextSplitter(separator="\n", chunk_size=2000, chunk_overlap=0, length_function=len)
|
43 |
-
|
44 |
chunks = text_splitter.split_documents(text)
|
45 |
-
|
46 |
return chunks
|
47 |
|
48 |
def _create_vector_store_from_text_chunks(self, text_chunks: List[str]) -> FAISS:
|
@@ -53,16 +54,32 @@ class PDFExtract:
|
|
53 |
FAISS: Vector store created from the text chunks.
|
54 |
"""
|
55 |
embeddings = AzureOpenAIEmbeddings(
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
return FAISS.from_documents(documents=text_chunks, embedding=embeddings)
|
60 |
|
61 |
-
def main(self,file_paths: List[str]):
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
66 |
# Set page configuration
|
67 |
st.set_page_config(page_title="GASB Decision Flow", layout="wide")
|
68 |
|
@@ -163,15 +180,6 @@ if 'db' not in st.session_state:
|
|
163 |
if 'file_processed' not in st.session_state:
|
164 |
st.session_state.file_processed = False
|
165 |
|
166 |
-
# Function to process the uploaded file
|
167 |
-
def process_file(uploaded_file):
|
168 |
-
with st.spinner("Processing document..."):
|
169 |
-
pdfextract = PDFExtract()
|
170 |
-
print(uploaded_file.name)
|
171 |
-
db = pdfextract.main([uploaded_file.name])
|
172 |
-
|
173 |
-
return db
|
174 |
-
|
175 |
# Center the file uploader
|
176 |
st.markdown('<div class="uploadfile-container">', unsafe_allow_html=True)
|
177 |
uploaded_file = st.file_uploader("Upload your contract document (PDF, Word, or Text)", type=["pdf", "docx", "txt"])
|
|
|
15 |
from langchain_openai import AzureChatOpenAI,AzureOpenAIEmbeddings
|
16 |
from langchain.text_splitter import CharacterTextSplitter
|
17 |
|
18 |
+
import tempfile
|
19 |
+
import os
|
20 |
+
|
21 |
class PDFExtract:
    """Load document files, chunk their text, and build a FAISS vector store."""

    def __init__(self, embedding_deployment: str = "text-embedding-3-large"):
        """Initialize the extractor.

        Args:
            embedding_deployment (str): Azure OpenAI embedding deployment name.
                Defaults to "text-embedding-3-large", the value previously
                hard-coded in _create_vector_store_from_text_chunks.
        """
        self.embedding_deployment = embedding_deployment

    def _extract_text_from_pdfs(self, file_paths: List[str]) -> List[str]:
        """Extract text content from PDF files.

        Args:
            file_paths (List[str]): Filesystem paths of the documents to load.

        Returns:
            List[str]: Extracted text from the PDFs. (In practice these are
                loader Document objects; annotation kept from the original --
                TODO confirm and tighten.)
        """
        docs = []
        for file_path in file_paths:
            # "fast" strategy trades layout fidelity/OCR for speed.
            loader = UnstructuredLoader(file_path, strategy="fast")
            docs.extend(loader.load())
        return docs

    def _split_text_into_chunks(self, text):
        """Split loaded documents into smaller chunks.

        Args:
            text: Documents as returned by _extract_text_from_pdfs.

        Returns:
            List[str]: List of smaller text chunks.
        """
        # NOTE(review): this method's def line was diff context not shown;
        # signature reconstructed from the call in main() -- confirm on file.
        text_splitter = CharacterTextSplitter(separator="\n", chunk_size=2000, chunk_overlap=0, length_function=len)
        chunks = text_splitter.split_documents(text)
        return chunks

    def _create_vector_store_from_text_chunks(self, text_chunks: List[str]) -> FAISS:
        """Create a FAISS vector store from text chunks.

        Args:
            text_chunks (List[str]): Chunked documents to embed.

        Returns:
            FAISS: Vector store created from the text chunks.
        """
        # Endpoint/credentials are presumably picked up from environment
        # variables by AzureOpenAIEmbeddings -- none are passed here.
        embeddings = AzureOpenAIEmbeddings(
            azure_deployment=self.embedding_deployment,
        )
        return FAISS.from_documents(documents=text_chunks, embedding=embeddings)

    def main(self, file_paths: List[str]):
        """End-to-end pipeline: load files, chunk text, build the vector store.

        Args:
            file_paths (List[str]): Paths of the documents to index.

        Returns:
            FAISS: Vector store built from all supplied documents.
        """
        text = self._extract_text_from_pdfs(file_paths)
        text_chunks = self._split_text_into_chunks(text)
        vector_store = self._create_vector_store_from_text_chunks(text_chunks)
        return vector_store
|
66 |
+
|
67 |
+
# Function to process the uploaded file
def process_file(uploaded_file):
    """Persist an uploaded file to disk and build a vector store from it.

    Args:
        uploaded_file: Streamlit UploadedFile-like object (provides .name
            and .getvalue()).

    Returns:
        FAISS: Vector store built from the uploaded document.
    """
    with st.spinner("Processing document..."):
        # Save the uploaded file to a temporary location -- the loader needs a
        # real filesystem path. Keep the original suffix so file-type detection
        # still works; delete=False because the file is read after this block.
        with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(uploaded_file.name)[1]) as tmp_file:
            tmp_file.write(uploaded_file.getvalue())
            tmp_file_path = tmp_file.name

        try:
            # Process the file
            pdfextract = PDFExtract()
            db = pdfextract.main([tmp_file_path])
        finally:
            # Clean up the temporary file even if processing raises; the
            # original unconditional unlink leaked the file on failure.
            os.unlink(tmp_file_path)

        return db
|
83 |
# Set page configuration
# NOTE(review): st.set_page_config must be the first Streamlit command in a
# script run -- confirm nothing above this line issues Streamlit calls.
st.set_page_config(page_title="GASB Decision Flow", layout="wide")
|
85 |
|
|
|
180 |
# Initialize the file-processed flag once per browser session. Streamlit
# re-runs the whole script on every interaction, so only set the default
# when the key is missing to avoid clobbering state on reruns.
if 'file_processed' not in st.session_state:
    st.session_state.file_processed = False
|
182 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
183 |
# Center the file uploader
# Opens a styled wrapper div (CSS class presumably defined earlier in the
# page). NOTE(review): no closing </div> is visible in this span -- confirm
# it is emitted further down the script.
st.markdown('<div class="uploadfile-container">', unsafe_allow_html=True)
# Restrict uploads to the document types the PDFExtract pipeline can load.
uploaded_file = st.file_uploader("Upload your contract document (PDF, Word, or Text)", type=["pdf", "docx", "txt"])
|