Upload app.py
app.py ADDED
@@ -0,0 +1,101 @@
import streamlit as st
from transformers import AutoModel, AutoTokenizer, MarianMTModel, MarianTokenizer
from PIL import Image
import tempfile
import os
import easyocr
import re

# Load EasyOCR reader with English and Hindi language support
reader = easyocr.Reader(['en', 'hi'])  # 'en' for English, 'hi' for Hindi

# Load the GOT-OCR2 model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True)
model = AutoModel.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True, low_cpu_mem_usage=True, device_map='cuda', use_safetensors=True, pad_token_id=tokenizer.eos_token_id)
model = model.eval().cuda()

# Load MarianMT translation model for Hindi-to-English translation
translation_tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-hi-en')
translation_model = MarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-hi-en')

# Define a function for keyword highlighting
def highlight_keywords(text, keyword):
    # Escape the keyword so regex special characters are matched literally
    pattern = re.compile(re.escape(keyword), re.IGNORECASE)
    highlighted_text = pattern.sub(lambda match: f"**{match.group(0)}**", text)
    return highlighted_text

# Streamlit App Title
st.title("OCR with GOT-OCR2 (English & Hindi Translation) and Keyword Search")

# File uploader for image input
image_file = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])

if image_file is not None:
    # Display the uploaded image
    image = Image.open(image_file)
    st.image(image, caption='Uploaded Image', use_column_width=True)

    # Save the uploaded file to a temporary file
    with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as temp_file:
        temp_file.write(image_file.getvalue())
        temp_file_path = temp_file.name

    # Button to run OCR
    if st.button("Run OCR"):
        # Use GOT-OCR2 model for plain text OCR (structured documents)
        res_plain = model.chat(tokenizer, temp_file_path, ocr_type='ocr')

        # Perform formatted text OCR
        res_format = model.chat(tokenizer, temp_file_path, ocr_type='format')

        # Use EasyOCR for both English and Hindi text recognition
        result_easyocr = reader.readtext(temp_file_path, detail=0)

        # Display the results
        st.subheader("Plain Text OCR Results (English):")
        st.write(res_plain)

        st.subheader("Formatted Text OCR Results:")
        st.write(res_format)

        st.subheader("Detected Text using EasyOCR (English and Hindi):")
        extracted_text = " ".join(result_easyocr)  # Combine the list of text results
        st.write(extracted_text)

        # Persist the extracted text across reruns so the keyword search below keeps working
        st.session_state["extracted_text"] = extracted_text

        # Translate Hindi text to English using MarianMT (optional step)
        st.subheader("Translated Hindi Text to English:")
        translated_text = []
        for sentence in result_easyocr:
            # Translate every non-empty line; the hi-en model largely passes English text through unchanged
            if sentence:
                tokenized_text = translation_tokenizer([sentence], return_tensors="pt", truncation=True)
                translation = translation_model.generate(**tokenized_text)
                translated_sentence = translation_tokenizer.decode(translation[0], skip_special_tokens=True)
                translated_text.append(translated_sentence)

        st.write(" ".join(translated_text))

        # Additional OCR types using GOT-OCR2
        res_fine_grained = model.chat(tokenizer, temp_file_path, ocr_type='ocr', ocr_box='')
        st.subheader("Fine-Grained OCR Results:")
        st.write(res_fine_grained)

        # Render formatted OCR to HTML
        res_render = model.chat(tokenizer, temp_file_path, ocr_type='format', render=True, save_render_file='./demo.html')
        st.subheader("Rendered OCR Results (HTML):")
        st.write(res_render)

        # Clean up the temporary file after use
        os.remove(temp_file_path)

    # Search functionality: kept outside the "Run OCR" block because the button is only True
    # for the rerun in which it was clicked; entering a keyword triggers a new rerun.
    if "extracted_text" in st.session_state:
        keyword = st.text_input("Enter keyword to search in extracted text:")
        if keyword:
            st.subheader("Search Results:")
            # Highlight the matching sections in the extracted text
            highlighted_text = highlight_keywords(st.session_state["extracted_text"], keyword)
            st.markdown(highlighted_text)

# Note: Streamlit executes this script top to bottom on each run, so no
# `if __name__ == "__main__":` entry point is needed.