DeepDiveDev committed on
Commit
8976d30
·
verified ·
1 Parent(s): 17130f7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +4 -8
app.py CHANGED
@@ -34,7 +34,6 @@ translation_model = MarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-hi-en')
34
 
35
  # Define a function for keyword highlighting
36
  def highlight_keywords(text, keyword):
37
- # Escape keyword for regex to avoid issues with special characters
38
  pattern = re.compile(re.escape(keyword), re.IGNORECASE)
39
  highlighted_text = pattern.sub(lambda match: f"**{match.group(0)}**", text)
40
  return highlighted_text
@@ -59,11 +58,11 @@ if image_file is not None:
59
  if st.button("Run OCR"):
60
  # Use GOT-OCR2 model for plain text OCR (structured documents)
61
  with torch.no_grad():
62
- res_plain = model.chat(tokenizer, temp_file_path, ocr_type='ocr') # Removed device parameter
63
 
64
  # Perform formatted text OCR
65
  with torch.no_grad():
66
- res_format = model.chat(tokenizer, temp_file_path, ocr_type='format') # Removed device parameter
67
 
68
  # Use EasyOCR for both English and Hindi text recognition
69
  result_easyocr = reader.readtext(temp_file_path, detail=0)
@@ -94,13 +93,13 @@ if image_file is not None:
94
 
95
  # Additional OCR types using GOT-OCR2
96
  with torch.no_grad():
97
- res_fine_grained = model.chat(tokenizer, temp_file_path, ocr_type='ocr', ocr_box='') # Removed device parameter
98
  st.subheader("Fine-Grained OCR Results:")
99
  st.write(res_fine_grained)
100
 
101
  # Render formatted OCR to HTML
102
  with torch.no_grad():
103
- res_render = model.chat(tokenizer, temp_file_path, ocr_type='format', render=True, save_render_file='./demo.html') # Removed device parameter
104
  st.subheader("Rendered OCR Results (HTML):")
105
  st.write(res_render)
106
 
@@ -109,11 +108,8 @@ if image_file is not None:
109
 
110
  if keyword:
111
  st.subheader("Search Results:")
112
- # Highlight the matching sections in the extracted text
113
  highlighted_text = highlight_keywords(extracted_text, keyword)
114
  st.markdown(highlighted_text)
115
 
116
  # Clean up the temporary file after use
117
  os.remove(temp_file_path)
118
-
119
- # Note: No need for if __name__ == "__main__": st.run()
 
34
 
35
  # Define a function for keyword highlighting
36
  def highlight_keywords(text, keyword):
 
37
  pattern = re.compile(re.escape(keyword), re.IGNORECASE)
38
  highlighted_text = pattern.sub(lambda match: f"**{match.group(0)}**", text)
39
  return highlighted_text
 
58
  if st.button("Run OCR"):
59
  # Use GOT-OCR2 model for plain text OCR (structured documents)
60
  with torch.no_grad():
61
+ res_plain = model.chat(tokenizer, temp_file_path, ocr_type='ocr')
62
 
63
  # Perform formatted text OCR
64
  with torch.no_grad():
65
+ res_format = model.chat(tokenizer, temp_file_path, ocr_type='format')
66
 
67
  # Use EasyOCR for both English and Hindi text recognition
68
  result_easyocr = reader.readtext(temp_file_path, detail=0)
 
93
 
94
  # Additional OCR types using GOT-OCR2
95
  with torch.no_grad():
96
+ res_fine_grained = model.chat(tokenizer, temp_file_path, ocr_type='ocr', ocr_box='')
97
  st.subheader("Fine-Grained OCR Results:")
98
  st.write(res_fine_grained)
99
 
100
  # Render formatted OCR to HTML
101
  with torch.no_grad():
102
+ res_render = model.chat(tokenizer, temp_file_path, ocr_type='format', render=True, save_render_file='./demo.html')
103
  st.subheader("Rendered OCR Results (HTML):")
104
  st.write(res_render)
105
 
 
108
 
109
  if keyword:
110
  st.subheader("Search Results:")
 
111
  highlighted_text = highlight_keywords(extracted_text, keyword)
112
  st.markdown(highlighted_text)
113
 
114
  # Clean up the temporary file after use
115
  os.remove(temp_file_path)