Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -34,7 +34,6 @@ translation_model = MarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-hi-en')
|
|
34 |
|
35 |
# Define a function for keyword highlighting
|
36 |
def highlight_keywords(text, keyword):
    """Return *text* with every occurrence of *keyword* wrapped in ``**``.

    Matching is case-insensitive and literal: the keyword is escaped before
    being compiled, so regex metacharacters in it are matched as plain
    characters. The matched text keeps its original casing in the output.

    Args:
        text: The string to search.
        keyword: The literal substring to highlight.

    Returns:
        The text with each match surrounded by ``**``. When *keyword* is
        empty, *text* is returned unchanged.
    """
    # An empty pattern matches at every position, which would insert "****"
    # between all characters; treat "nothing to search for" as a no-op.
    if not keyword:
        return text

    # Escape the keyword so characters such as "." or "(" are matched literally.
    pattern = re.compile(re.escape(keyword), re.IGNORECASE)
    return pattern.sub(lambda match: f"**{match.group(0)}**", text)
|
@@ -59,11 +58,11 @@ if image_file is not None:
|
|
59 |
if st.button("Run OCR"):
|
60 |
# Use GOT-OCR2 model for plain text OCR (structured documents)
|
61 |
with torch.no_grad():
|
62 |
-
res_plain = model.chat(tokenizer, temp_file_path, ocr_type='ocr')
|
63 |
|
64 |
# Perform formatted text OCR
|
65 |
with torch.no_grad():
|
66 |
-
res_format = model.chat(tokenizer, temp_file_path, ocr_type='format')
|
67 |
|
68 |
# Use EasyOCR for both English and Hindi text recognition
|
69 |
result_easyocr = reader.readtext(temp_file_path, detail=0)
|
@@ -94,13 +93,13 @@ if image_file is not None:
|
|
94 |
|
95 |
# Additional OCR types using GOT-OCR2
|
96 |
with torch.no_grad():
|
97 |
-
res_fine_grained = model.chat(tokenizer, temp_file_path, ocr_type='ocr', ocr_box='')
|
98 |
st.subheader("Fine-Grained OCR Results:")
|
99 |
st.write(res_fine_grained)
|
100 |
|
101 |
# Render formatted OCR to HTML
|
102 |
with torch.no_grad():
|
103 |
-
res_render = model.chat(tokenizer, temp_file_path, ocr_type='format', render=True, save_render_file='./demo.html')
|
104 |
st.subheader("Rendered OCR Results (HTML):")
|
105 |
st.write(res_render)
|
106 |
|
@@ -109,11 +108,8 @@ if image_file is not None:
|
|
109 |
|
110 |
if keyword:
|
111 |
st.subheader("Search Results:")
|
112 |
-
# Highlight the matching sections in the extracted text
|
113 |
highlighted_text = highlight_keywords(extracted_text, keyword)
|
114 |
st.markdown(highlighted_text)
|
115 |
|
116 |
# Clean up the temporary file after use
|
117 |
os.remove(temp_file_path)
|
118 |
-
|
119 |
-
# Note: No need for if __name__ == "__main__": st.run()
|
|
|
34 |
|
35 |
# Define a function for keyword highlighting
|
36 |
def highlight_keywords(text, keyword):
    """Return *text* with every occurrence of *keyword* wrapped in ``**``.

    Matching is case-insensitive and literal: the keyword is escaped before
    being compiled, so regex metacharacters in it are matched as plain
    characters. The matched text keeps its original casing in the output.

    Args:
        text: The string to search.
        keyword: The literal substring to highlight.

    Returns:
        The text with each match surrounded by ``**``. When *keyword* is
        empty, *text* is returned unchanged.
    """
    # An empty pattern matches at every position, which would insert "****"
    # between all characters; treat "nothing to search for" as a no-op.
    if not keyword:
        return text

    # Escape the keyword so characters such as "." or "(" are matched literally.
    pattern = re.compile(re.escape(keyword), re.IGNORECASE)
    return pattern.sub(lambda match: f"**{match.group(0)}**", text)
|
|
|
58 |
if st.button("Run OCR"):
|
59 |
# Use GOT-OCR2 model for plain text OCR (structured documents)
|
60 |
with torch.no_grad():
|
61 |
+
res_plain = model.chat(tokenizer, temp_file_path, ocr_type='ocr')
|
62 |
|
63 |
# Perform formatted text OCR
|
64 |
with torch.no_grad():
|
65 |
+
res_format = model.chat(tokenizer, temp_file_path, ocr_type='format')
|
66 |
|
67 |
# Use EasyOCR for both English and Hindi text recognition
|
68 |
result_easyocr = reader.readtext(temp_file_path, detail=0)
|
|
|
93 |
|
94 |
# Additional OCR types using GOT-OCR2
|
95 |
with torch.no_grad():
|
96 |
+
res_fine_grained = model.chat(tokenizer, temp_file_path, ocr_type='ocr', ocr_box='')
|
97 |
st.subheader("Fine-Grained OCR Results:")
|
98 |
st.write(res_fine_grained)
|
99 |
|
100 |
# Render formatted OCR to HTML
|
101 |
with torch.no_grad():
|
102 |
+
res_render = model.chat(tokenizer, temp_file_path, ocr_type='format', render=True, save_render_file='./demo.html')
|
103 |
st.subheader("Rendered OCR Results (HTML):")
|
104 |
st.write(res_render)
|
105 |
|
|
|
108 |
|
109 |
if keyword:
|
110 |
st.subheader("Search Results:")
|
|
|
111 |
highlighted_text = highlight_keywords(extracted_text, keyword)
|
112 |
st.markdown(highlighted_text)
|
113 |
|
114 |
# Clean up the temporary file after use
|
115 |
os.remove(temp_file_path)
|
|
|
|