IAMTFRMZA committed on
Commit
42cece2
·
verified ·
1 Parent(s): 4bd484d
Files changed (1) hide show
  1. app.py +24 -23
app.py CHANGED
@@ -37,28 +37,28 @@ def install_dependencies():
37
 
38
  install_dependencies()
39
 
40
def process_pdf(file):
    """OCR a PDF and write the recognised text to a ``.txt`` file.

    The PDF is rasterised to one PNG per page with ``pdftoppm`` and every
    page is OCR'd with ``tesseract``.  All intermediate files live in a
    private temporary directory that is removed on return.

    Args:
        file: Object describing the document.  It must expose ``name``.
            If ``name`` is an existing file on disk that file is used
            directly; otherwise, when the object exposes ``read()``, the
            bytes it returns are written to a scratch copy first, because
            ``name`` alone does not guarantee that the document exists on
            disk.

    Returns:
        Path of the text file holding the OCR output: ``name`` with its
        extension replaced by ``.txt``.  The file contains each page's
        text followed by a newline, in page order.  It is always created
        and is empty when nothing could be extracted.
    """
    import re
    import subprocess
    import tempfile

    input_pdf = file.name
    # splitext instead of [:-4]: the name is not guaranteed to end in a
    # four-character extension.
    output_txt_file = f"{os.path.splitext(input_pdf)[0]}.txt"

    page_number = re.compile(r"(\d+)\.png$")

    def page_order(image_name):
        # Sort on the numeric page suffix so that page 10 never lands
        # before page 2; os.listdir() gives no ordering guarantee.
        found = page_number.search(image_name)
        return (int(found.group(1)) if found else 0, image_name)

    # A private scratch directory keeps intermediate files away from the
    # application's working directory and is cleaned up even on errors.
    with tempfile.TemporaryDirectory() as work_dir:
        source_pdf = input_pdf
        reader = getattr(file, "read", None)
        if not os.path.isfile(input_pdf) and reader is not None:
            source_pdf = os.path.join(work_dir, "upload.pdf")
            with open(source_pdf, "wb") as pdf_copy:
                pdf_copy.write(reader())

        try:
            # Argument list (no shell): the file name is user-controlled.
            subprocess.run(
                ["pdftoppm", "-png", source_pdf, os.path.join(work_dir, "img")],
                check=False,
            )
        except OSError:
            # pdftoppm is not available: no page images are produced and
            # the result below is an empty text file, as before.
            pass

        page_images = sorted(
            (
                name
                for name in os.listdir(work_dir)
                if name.startswith("img") and name.endswith(".png")
            ),
            key=page_order,
        )

        with open(output_txt_file, "w", encoding="utf-8") as output_file:
            for image_name in page_images:
                # tesseract appends ".txt" to the output base itself.
                output_base = os.path.join(work_dir, f"ocr_{image_name}")
                try:
                    subprocess.run(
                        ["tesseract", os.path.join(work_dir, image_name), output_base],
                        check=False,
                    )
                    with open(f"{output_base}.txt", "r", encoding="utf-8") as page_file:
                        page_text = page_file.read()
                except OSError:
                    # tesseract missing or produced no output for this page.
                    continue
                output_file.write(page_text)
                output_file.write("\n")

    return output_txt_file
62
 
63
  with tab4:
64
  st.subheader("Document Preparation")
@@ -66,13 +66,18 @@ with tab4:
66
 
67
  if uploaded_file:
68
  st.write("Processing the uploaded document...")
69
- result_file = process_pdf(uploaded_file)
70
- st.download_button(
71
- label="Download Extracted Text",
72
- data=open(result_file, "rb").read(),
73
- file_name=result_file,
74
- mime="text/plain"
75
- )
 
 
 
 
 
76
 
77
  # Contract Chat Section
78
  def contract_chat_section(tab, assistant_id, session_key, input_key):
@@ -130,9 +135,5 @@ def contract_chat_section(tab, assistant_id, session_key, input_key):
130
  ASSISTANT_CONTRACT_ID = "asst_rd9h8PfYuOmHbkvOF3RTmVfn"
131
  ASSISTANT_TECHNICAL_ID = "asst_xizNZBCJuy4TqdjqjwkxbAki"
132
 
133
- # Contract Chat Section
134
  contract_chat_section(tab1, ASSISTANT_CONTRACT_ID, "contract_messages", "contract_input")
135
-
136
- # Technical Chat Section
137
  contract_chat_section(tab2, ASSISTANT_TECHNICAL_ID, "technical_messages", "technical_input")
138
-
 
37
 
38
  install_dependencies()
39
 
40
def process_pdf(uploaded_file):
    """Extract the text of an uploaded PDF by OCR-ing every page.

    The upload is written to a private temporary directory, rasterised to
    one PNG per page with ``pdftoppm`` and each page is OCR'd with
    ``tesseract``.  The temporary directory (PDF copy, page images and OCR
    output) is removed on return, including when an error occurs.

    Args:
        uploaded_file: Binary file-like object; its ``read()`` must return
            the PDF bytes.

    Returns:
        The recognised text of all pages in page order, each page followed
        by a newline.  An empty string is returned when nothing could be
        extracted (for example the input is not a valid PDF or the external
        tools are not installed).
    """
    import re
    import subprocess
    import tempfile

    page_number = re.compile(r"(\d+)\.png$")

    def page_order(image_name):
        # Sort on the numeric page suffix so that page 10 never lands
        # before page 2; os.listdir() gives no ordering guarantee.
        found = page_number.search(image_name)
        return (int(found.group(1)) if found else 0, image_name)

    page_texts = []

    # A per-call scratch directory avoids collisions between concurrent
    # sessions (fixed file names in the working directory would be shared)
    # and guarantees cleanup even if a step below raises.
    with tempfile.TemporaryDirectory() as work_dir:
        pdf_path = os.path.join(work_dir, "uploaded_document.pdf")
        with open(pdf_path, "wb") as pdf_file:
            pdf_file.write(uploaded_file.read())

        try:
            # Argument list instead of a shell string: no quoting pitfalls.
            subprocess.run(
                ["pdftoppm", "-png", pdf_path, os.path.join(work_dir, "img")],
                check=False,
            )
        except OSError:
            # pdftoppm is not available, so there is nothing to OCR.
            return ""

        page_images = sorted(
            (
                name
                for name in os.listdir(work_dir)
                if name.startswith("img") and name.endswith(".png")
            ),
            key=page_order,
        )

        for image_name in page_images:
            # tesseract appends ".txt" to the output base itself.
            output_base = os.path.join(work_dir, f"ocr_{image_name}")
            try:
                subprocess.run(
                    ["tesseract", os.path.join(work_dir, image_name), output_base],
                    check=False,
                )
                with open(f"{output_base}.txt", "r", encoding="utf-8") as txt_file:
                    page_texts.append(txt_file.read())
            except OSError:
                # tesseract missing or produced no output for this page;
                # skip it instead of aborting the whole document.
                continue

    return "".join(text + "\n" for text in page_texts)
62
 
63
  with tab4:
64
  st.subheader("Document Preparation")
 
66
 
67
  if uploaded_file:
68
  st.write("Processing the uploaded document...")
69
+ extracted_text = process_pdf(uploaded_file)
70
+
71
+ if extracted_text.strip():
72
+ st.text_area("Extracted Text", extracted_text, height=300)
73
+ st.download_button(
74
+ label="Download Extracted Text",
75
+ data=extracted_text,
76
+ file_name="extracted_text.txt",
77
+ mime="text/plain"
78
+ )
79
+ else:
80
+ st.error("No text could be extracted. Try another document.")
81
 
82
  # Contract Chat Section
83
  def contract_chat_section(tab, assistant_id, session_key, input_key):
 
135
  ASSISTANT_CONTRACT_ID = "asst_rd9h8PfYuOmHbkvOF3RTmVfn"
136
  ASSISTANT_TECHNICAL_ID = "asst_xizNZBCJuy4TqdjqjwkxbAki"
137
 
 
138
  contract_chat_section(tab1, ASSISTANT_CONTRACT_ID, "contract_messages", "contract_input")
 
 
139
  contract_chat_section(tab2, ASSISTANT_TECHNICAL_ID, "technical_messages", "technical_input")