scai

Sleeping

App Files Files Community

IAMTFRMZA committed on Mar 4

Commit

42cece2

verified ·

1 Parent(s): 4bd484d

app.py

Browse files

Files changed (1) hide show

app.py +24 -23

app.py CHANGED Viewed

@@ -37,28 +37,28 @@ def install_dependencies():
 install_dependencies()
-def process_pdf(file):
-    input_pdf = file.name
-    os.system(f'pdftoppm -png "{input_pdf}" img')
     for image in os.listdir():
         if image.startswith('img') and image.endswith('.png'):
             output_txt = f"ocr_{image}.txt"
             os.system(f'tesseract "{image}" "{output_txt[:-4]}"')
-    output_txt_file = f"{input_pdf[:-4]}.txt"
-    with open(output_txt_file, 'w') as output_file:
-        for text_file in os.listdir():
-            if text_file.startswith('ocr_img') and text_file.endswith('.txt'):
-                with open(text_file, 'r') as f:
-                    output_file.write(f.read())
-                    output_file.write("\n")
     for file in os.listdir():
         if file.startswith('img') or file.startswith('ocr_img'):
             os.remove(file)
-    return output_txt_file
 with tab4:
     st.subheader("Document Preparation")
@@ -66,13 +66,18 @@ with tab4:
     if uploaded_file:
         st.write("Processing the uploaded document...")
-        result_file = process_pdf(uploaded_file)
-        st.download_button(
-            label="Download Extracted Text",
-            data=open(result_file, "rb").read(),
-            file_name=result_file,
-            mime="text/plain"
-        )
 # Contract Chat Section
 def contract_chat_section(tab, assistant_id, session_key, input_key):
@@ -130,9 +135,5 @@ def contract_chat_section(tab, assistant_id, session_key, input_key):
 ASSISTANT_CONTRACT_ID = "asst_rd9h8PfYuOmHbkvOF3RTmVfn"
 ASSISTANT_TECHNICAL_ID = "asst_xizNZBCJuy4TqdjqjwkxbAki"
-# Contract Chat Section
 contract_chat_section(tab1, ASSISTANT_CONTRACT_ID, "contract_messages", "contract_input")
-# Technical Chat Section
 contract_chat_section(tab2, ASSISTANT_TECHNICAL_ID, "technical_messages", "technical_input")

 install_dependencies()
+def process_pdf(uploaded_file):
+    temp_pdf_path = "uploaded_document.pdf"
+    with open(temp_pdf_path, "wb") as f:
+        f.write(uploaded_file.read())
+    os.system(f'pdftoppm -png "{temp_pdf_path}" img')
+    extracted_text = ""
     for image in os.listdir():
         if image.startswith('img') and image.endswith('.png'):
             output_txt = f"ocr_{image}.txt"
             os.system(f'tesseract "{image}" "{output_txt[:-4]}"')
+            with open(output_txt, "r") as txt_file:
+                extracted_text += txt_file.read() + "\n"
     for file in os.listdir():
         if file.startswith('img') or file.startswith('ocr_img'):
             os.remove(file)
+    os.remove(temp_pdf_path)
+    return extracted_text
 with tab4:
     st.subheader("Document Preparation")
     if uploaded_file:
         st.write("Processing the uploaded document...")
+        extracted_text = process_pdf(uploaded_file)
+        if extracted_text.strip():
+            st.text_area("Extracted Text", extracted_text, height=300)
+            st.download_button(
+                label="Download Extracted Text",
+                data=extracted_text,
+                file_name="extracted_text.txt",
+                mime="text/plain"
+            )
+        else:
+            st.error("No text could be extracted. Try another document.")
 # Contract Chat Section
 def contract_chat_section(tab, assistant_id, session_key, input_key):
 ASSISTANT_CONTRACT_ID = "asst_rd9h8PfYuOmHbkvOF3RTmVfn"
 ASSISTANT_TECHNICAL_ID = "asst_xizNZBCJuy4TqdjqjwkxbAki"
 contract_chat_section(tab1, ASSISTANT_CONTRACT_ID, "contract_messages", "contract_input")
 contract_chat_section(tab2, ASSISTANT_TECHNICAL_ID, "technical_messages", "technical_input")