# Spaces status: Runtime error
import os
import re
import tempfile

import easyocr
import streamlit as st
from PIL import Image
from transformers import AutoModel, AutoTokenizer, MarianMTModel, MarianTokenizer
# Streamlit re-executes this script from the top on every widget interaction.
# The loaders below are wrapped in st.cache_resource so each heavyweight model
# is built once per server process and reused by later reruns, instead of being
# re-created every time the user clicks a button or types in a text box.


@st.cache_resource
def _load_easyocr_reader():
    """Return the EasyOCR reader for English ('en') and Hindi ('hi')."""
    return easyocr.Reader(['en', 'hi'])


@st.cache_resource
def _load_got_ocr():
    """Return ``(tokenizer, model)`` for GOT-OCR2, model in eval mode on CUDA.

    NOTE: both ``device_map='cuda'`` and ``.cuda()`` target a CUDA device, so
    this load is expected to fail on a host without one.
    """
    got_tokenizer = AutoTokenizer.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True)
    got_model = AutoModel.from_pretrained(
        'ucaslcl/GOT-OCR2_0',
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        device_map='cuda',
        use_safetensors=True,
        pad_token_id=got_tokenizer.eos_token_id,
    )
    return got_tokenizer, got_model.eval().cuda()


@st.cache_resource
def _load_translator():
    """Return ``(tokenizer, model)`` for MarianMT Hindi -> English translation."""
    marian_tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-hi-en')
    marian_model = MarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-hi-en')
    return marian_tokenizer, marian_model


# Module-level handles used by the rest of the script.
reader = _load_easyocr_reader()
tokenizer, model = _load_got_ocr()
translation_tokenizer, translation_model = _load_translator()
# Define a function for keyword highlighting
def highlight_keywords(text, keyword):
    """Wrap every case-insensitive occurrence of *keyword* in Markdown bold.

    Args:
        text: The text to search.
        keyword: The literal string to highlight. It is regex-escaped, so
            characters such as ``.`` or ``*`` are matched literally.

    Returns:
        *text* with each match surrounded by ``**``; the matched substring
        keeps its original casing. If *keyword* is empty, *text* is returned
        unchanged.
    """
    # An empty pattern matches at every position and would insert "****"
    # between all characters, so there is nothing sensible to highlight.
    if not keyword:
        return text
    pattern = re.compile(re.escape(keyword), re.IGNORECASE)
    return pattern.sub(lambda match: f"**{match.group(0)}**", text)
# Streamlit App Title
st.title("OCR with GOT-OCR2 (English & Hindi Translation) and Keyword Search")

# Streamlit re-executes this script on every widget interaction, and
# st.button() is True only during the single run triggered by the click.
# OCR output is therefore stored in st.session_state so that the results and
# the keyword search box remain on screen when the user types a keyword.
_RESULTS_KEY = "ocr_results"

# Any character from the Devanagari Unicode block marks a fragment as Hindi.
_DEVANAGARI = re.compile(r"[\u0900-\u097F]")

# File uploader for image input
image_file = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])

if image_file is not None:
    # Display the uploaded image
    image = Image.open(image_file)
    st.image(image, caption='Uploaded Image', use_column_width=True)

    # Identifies the current upload so results from a previous image are not
    # shown next to a newly uploaded one.
    upload_id = (image_file.name, image_file.size)

    # Button to run OCR
    if st.button("Run OCR"):
        # The OCR back ends are given a file path, so the upload is written to
        # a temporary file. It is created only when OCR actually runs and is
        # removed in the finally block even if one of the models raises.
        suffix = os.path.splitext(image_file.name)[1] or '.jpg'
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
            temp_file.write(image_file.getvalue())
            temp_file_path = temp_file.name
        try:
            # Use GOT-OCR2 model for plain text OCR (structured documents)
            res_plain = model.chat(tokenizer, temp_file_path, ocr_type='ocr')
            # Perform formatted text OCR
            res_format = model.chat(tokenizer, temp_file_path, ocr_type='format')
            # Use EasyOCR for both English and Hindi text recognition
            result_easyocr = reader.readtext(temp_file_path, detail=0)
            # Additional OCR types using GOT-OCR2
            res_fine_grained = model.chat(tokenizer, temp_file_path, ocr_type='ocr', ocr_box='')
            # Render formatted OCR to HTML
            res_render = model.chat(tokenizer, temp_file_path, ocr_type='format', render=True, save_render_file='./demo.html')
        finally:
            # Clean up the temporary file after use
            os.remove(temp_file_path)

        # Translate only fragments that contain Devanagari characters; other
        # fragments are passed through unchanged rather than being fed to the
        # Hindi -> English model.
        translated_text = []
        for sentence in result_easyocr:
            if not sentence:
                continue
            if not _DEVANAGARI.search(sentence):
                translated_text.append(sentence)
                continue
            tokenized_text = translation_tokenizer([sentence], return_tensors="pt", truncation=True)
            translation = translation_model.generate(**tokenized_text)
            translated_text.append(
                translation_tokenizer.decode(translation[0], skip_special_tokens=True)
            )

        st.session_state[_RESULTS_KEY] = {
            "upload_id": upload_id,
            "plain": res_plain,
            "format": res_format,
            "extracted": " ".join(result_easyocr),  # Combine the list of text results
            "translated": " ".join(translated_text),
            "fine_grained": res_fine_grained,
            "render": res_render,
        }

    results = st.session_state.get(_RESULTS_KEY)
    if results is not None and results["upload_id"] == upload_id:
        # Display the results
        st.subheader("Plain Text OCR Results (English):")
        st.write(results["plain"])
        st.subheader("Formatted Text OCR Results:")
        st.write(results["format"])
        st.subheader("Detected Text using EasyOCR (English and Hindi):")
        st.write(results["extracted"])
        st.subheader("Translated Hindi Text to English:")
        st.write(results["translated"])
        st.subheader("Fine-Grained OCR Results:")
        st.write(results["fine_grained"])
        st.subheader("Rendered OCR Results (HTML):")
        st.write(results["render"])

        # Search functionality
        keyword = st.text_input("Enter keyword to search in extracted text:")
        if keyword:
            st.subheader("Search Results:")
            # Highlight the matching sections in the extracted text
            highlighted_text = highlight_keywords(results["extracted"], keyword)
            st.markdown(highlighted_text)

# Note: No need for if __name__ == "__main__": st.run()