app.py
Browse files
app.py
CHANGED
@@ -37,28 +37,28 @@ def install_dependencies():
|
|
37 |
|
38 |
install_dependencies()
|
39 |
|
40 |
-
def process_pdf(
|
41 |
-
|
42 |
-
|
|
|
43 |
|
|
|
|
|
|
|
44 |
for image in os.listdir():
|
45 |
if image.startswith('img') and image.endswith('.png'):
|
46 |
output_txt = f"ocr_{image}.txt"
|
47 |
os.system(f'tesseract "{image}" "{output_txt[:-4]}"')
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
for text_file in os.listdir():
|
52 |
-
if text_file.startswith('ocr_img') and text_file.endswith('.txt'):
|
53 |
-
with open(text_file, 'r') as f:
|
54 |
-
output_file.write(f.read())
|
55 |
-
output_file.write("\n")
|
56 |
|
57 |
for file in os.listdir():
|
58 |
if file.startswith('img') or file.startswith('ocr_img'):
|
59 |
os.remove(file)
|
|
|
60 |
|
61 |
-
return
|
62 |
|
63 |
with tab4:
|
64 |
st.subheader("Document Preparation")
|
@@ -66,13 +66,18 @@ with tab4:
|
|
66 |
|
67 |
if uploaded_file:
|
68 |
st.write("Processing the uploaded document...")
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
|
|
|
|
|
|
|
|
|
|
76 |
|
77 |
# Contract Chat Section
|
78 |
def contract_chat_section(tab, assistant_id, session_key, input_key):
|
@@ -130,9 +135,5 @@ def contract_chat_section(tab, assistant_id, session_key, input_key):
|
|
130 |
ASSISTANT_CONTRACT_ID = "asst_rd9h8PfYuOmHbkvOF3RTmVfn"
|
131 |
ASSISTANT_TECHNICAL_ID = "asst_xizNZBCJuy4TqdjqjwkxbAki"
|
132 |
|
133 |
-
# Contract Chat Section
|
134 |
contract_chat_section(tab1, ASSISTANT_CONTRACT_ID, "contract_messages", "contract_input")
|
135 |
-
|
136 |
-
# Technical Chat Section
|
137 |
contract_chat_section(tab2, ASSISTANT_TECHNICAL_ID, "technical_messages", "technical_input")
|
138 |
-
|
|
|
37 |
|
38 |
install_dependencies()
|
39 |
|
40 |
+
def process_pdf(uploaded_file):
    """Extract text from an uploaded PDF by rasterizing its pages and OCR-ing them.

    The PDF bytes are written into a private temporary directory, converted
    to one PNG per page with ``pdftoppm``, and every page image is run
    through ``tesseract``. All intermediate files live in that directory and
    are removed when the function returns, even if an error occurs, so the
    process working directory is never written to or cleaned.

    Args:
        uploaded_file: Binary file-like object holding the PDF. ``getvalue()``
            is used when the object provides it, otherwise ``read()``.

    Returns:
        The recognized text of all pages in page order, each page followed by
        a newline. An empty string is returned when the external tools are
        unavailable or no page produced any OCR output.
    """
    # Function-scope imports keep the block self-contained; the file's import
    # header is not visible from here.
    import subprocess
    import tempfile
    from pathlib import Path

    def _run(command, cwd):
        """Run an external tool in *cwd*, ignoring its exit status."""
        try:
            # Argument list (no shell) so paths are never interpreted by a shell.
            subprocess.run(command, cwd=cwd, check=False)
        except OSError:
            # Tool missing or not executable: behave like a run that produced
            # no output files, which the caller reports as "no text".
            pass

    # getvalue() returns the whole buffer regardless of the current read
    # position, so a previously-read object still yields the full document.
    if hasattr(uploaded_file, "getvalue"):
        pdf_bytes = uploaded_file.getvalue()
    else:
        pdf_bytes = uploaded_file.read()

    pages = []
    with tempfile.TemporaryDirectory() as workdir:
        work = Path(workdir)
        pdf_path = work / "uploaded_document.pdf"
        pdf_path.write_bytes(pdf_bytes)

        # Produces img-<page>.png files inside the scratch directory.
        _run(["pdftoppm", "-png", str(pdf_path), "img"], workdir)

        # Sort by (length, name) so "img-2" precedes "img-10" even when the
        # page numbers are not zero-padded; directory order is arbitrary.
        page_images = sorted(
            work.glob("img*.png"),
            key=lambda path: (len(path.name), path.name),
        )

        for image in page_images:
            # tesseract appends ".txt" to the output base it is given.
            output_base = work / f"ocr_{image.name}"
            _run(["tesseract", str(image), str(output_base)], workdir)

            text_path = Path(f"{output_base}.txt")
            try:
                page_text = text_path.read_text(encoding="utf-8", errors="replace")
            except FileNotFoundError:
                # OCR failed for this page; keep the pages that did succeed.
                continue
            pages.append(page_text + "\n")

    return "".join(pages)
|
62 |
|
63 |
with tab4:
|
64 |
st.subheader("Document Preparation")
|
|
|
66 |
|
67 |
if uploaded_file:
    # Announce the start of processing before the OCR pass runs.
    st.write("Processing the uploaded document...")
    ocr_result = process_pdf(uploaded_file)
    nothing_extracted = not ocr_result.strip()

    if nothing_extracted:
        # Whitespace-only output counts as a failed extraction.
        st.error("No text could be extracted. Try another document.")
    else:
        # Show the text and offer the same content as a plain-text download.
        st.text_area("Extracted Text", ocr_result, height=300)
        download_options = {
            "label": "Download Extracted Text",
            "data": ocr_result,
            "file_name": "extracted_text.txt",
            "mime": "text/plain",
        }
        st.download_button(**download_options)
|
81 |
|
82 |
# Contract Chat Section
|
83 |
def contract_chat_section(tab, assistant_id, session_key, input_key):
|
|
|
135 |
# Assistant identifiers passed to the two chat sections below.
ASSISTANT_CONTRACT_ID = "asst_rd9h8PfYuOmHbkvOF3RTmVfn"
ASSISTANT_TECHNICAL_ID = "asst_xizNZBCJuy4TqdjqjwkxbAki"

# Contract chat: rendered in tab1 with its own session and input keys.
contract_chat_section(
    tab=tab1,
    assistant_id=ASSISTANT_CONTRACT_ID,
    session_key="contract_messages",
    input_key="contract_input",
)

# Technical chat: same section function, rendered in tab2 with separate keys.
contract_chat_section(
    tab=tab2,
    assistant_id=ASSISTANT_TECHNICAL_ID,
    session_key="technical_messages",
    input_key="technical_input",
)
|
|