DeepDiveDev committed on
Commit
dd3b7a5
·
verified ·
1 Parent(s): 113edf2

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +101 -0
app.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from transformers import AutoModel, AutoTokenizer, MarianMTModel, MarianTokenizer
3
+ from PIL import Image
4
+ import tempfile
5
+ import os
6
+ import easyocr
7
+ import re
8
+
9
# Streamlit re-executes this whole script on every widget interaction.
# Loading the models through `st.cache_resource` factories means the
# (expensive) weights are created once per server process instead of on
# every rerun. The module-level names below are unchanged.


@st.cache_resource
def _load_easyocr_reader():
    """Return the EasyOCR reader configured for English ('en') and Hindi ('hi')."""
    return easyocr.Reader(['en', 'hi'])


@st.cache_resource
def _load_got_ocr():
    """Load the GOT-OCR2 tokenizer and model and return them as a tuple.

    NOTE(review): `trust_remote_code=True` runs Python code shipped in the
    'ucaslcl/GOT-OCR2_0' hub repository; pin a revision if that is a concern.
    """
    got_tokenizer = AutoTokenizer.from_pretrained(
        'ucaslcl/GOT-OCR2_0', trust_remote_code=True
    )
    got_model = AutoModel.from_pretrained(
        'ucaslcl/GOT-OCR2_0',
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        device_map='cuda',
        use_safetensors=True,
        pad_token_id=got_tokenizer.eos_token_id,
    )
    # Inference only: switch to eval mode and make sure the model is on the GPU.
    return got_tokenizer, got_model.eval().cuda()


@st.cache_resource
def _load_translator():
    """Load the MarianMT Hindi -> English tokenizer and model as a tuple."""
    marian_tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-hi-en')
    marian_model = MarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-hi-en')
    return marian_tokenizer, marian_model


# EasyOCR reader with English and Hindi language support
reader = _load_easyocr_reader()

# GOT-OCR2 model and tokenizer
tokenizer, model = _load_got_ocr()

# MarianMT translation model for Hindi to English translation
translation_tokenizer, translation_model = _load_translator()
20
+
21
+ # Define a function for keyword highlighting
22
def highlight_keywords(text: str, keyword: str) -> str:
    """Return *text* with every occurrence of *keyword* wrapped in Markdown bold.

    Matching is case-insensitive and the keyword is treated literally (regex
    metacharacters are escaped). The matched text keeps its original casing.

    Args:
        text: The text to search in.
        keyword: The literal string to highlight.

    Returns:
        The text with each match rewritten as ``**match**``. If *keyword* is
        empty the text is returned unchanged.
    """
    # An empty pattern matches at every position, which would insert "****"
    # between all characters; treat "no keyword" as "nothing to highlight".
    if not keyword:
        return text

    # Escape keyword for regex to avoid issues with special characters
    pattern = re.compile(re.escape(keyword), re.IGNORECASE)
    return pattern.sub(lambda match: f"**{match.group(0)}**", text)
27
+
28
# Matches any character in the Unicode Devanagari block (the script Hindi is written in).
_DEVANAGARI_RE = re.compile(r'[\u0900-\u097F]')


def _translate_hindi_fragments(fragments):
    """Translate the Hindi fragments of an EasyOCR result to English.

    Fragments that contain at least one Devanagari character are run through
    the MarianMT hi->en model; other non-empty fragments are passed through
    unchanged so that English text is not mangled by the translator.
    Returns a list of strings in the original fragment order.
    """
    translated = []
    for fragment in fragments:
        if not fragment:
            continue
        if not _DEVANAGARI_RE.search(fragment):
            translated.append(fragment)
            continue
        tokenized = translation_tokenizer([fragment], return_tensors="pt", truncation=True)
        generated = translation_model.generate(**tokenized)
        translated.append(
            translation_tokenizer.decode(generated[0], skip_special_tokens=True)
        )
    return translated


def _run_ocr_pipeline(uploaded_file):
    """Run every OCR pass on the uploaded image and return the results as a dict.

    The upload is written to a temporary file because the OCR calls below are
    given a file path. The temporary file is always removed, even when one of
    the OCR calls raises.
    """
    # Keep the real extension of the upload (png/jpg/jpeg) instead of always '.jpg'.
    suffix = os.path.splitext(uploaded_file.name)[1] or '.jpg'
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
        temp_file.write(uploaded_file.getvalue())
        temp_file_path = temp_file.name

    try:
        # Use GOT-OCR2 model for plain text OCR (structured documents)
        res_plain = model.chat(tokenizer, temp_file_path, ocr_type='ocr')

        # Perform formatted text OCR
        res_format = model.chat(tokenizer, temp_file_path, ocr_type='format')

        # Use EasyOCR for both English and Hindi text recognition
        result_easyocr = reader.readtext(temp_file_path, detail=0)

        # Additional OCR types using GOT-OCR2
        # NOTE(review): ocr_box is passed empty, so no region is actually selected.
        res_fine_grained = model.chat(tokenizer, temp_file_path, ocr_type='ocr', ocr_box='')

        # Render formatted OCR to HTML
        # NOTE(review): './demo.html' is a fixed path shared by all sessions.
        res_render = model.chat(
            tokenizer, temp_file_path, ocr_type='format',
            render=True, save_render_file='./demo.html',
        )
    finally:
        # Clean up the temporary file after use
        os.remove(temp_file_path)

    return {
        "plain": res_plain,
        "format": res_format,
        "extracted_text": " ".join(result_easyocr),  # Combine the list of text results
        "translated_text": " ".join(_translate_hindi_fragments(result_easyocr)),
        "fine_grained": res_fine_grained,
        "render": res_render,
    }


def _show_results(results):
    """Write the stored OCR results to the page, one subheader per result."""
    st.subheader("Plain Text OCR Results (English):")
    st.write(results["plain"])

    st.subheader("Formatted Text OCR Results:")
    st.write(results["format"])

    st.subheader("Detected Text using EasyOCR (English and Hindi):")
    st.write(results["extracted_text"])

    st.subheader("Translated Hindi Text to English:")
    st.write(results["translated_text"])

    st.subheader("Fine-Grained OCR Results:")
    st.write(results["fine_grained"])

    st.subheader("Rendered OCR Results (HTML):")
    st.write(results["render"])


# Streamlit App Title
st.title("OCR with GOT-OCR2 (English & Hindi Translation) and Keyword Search")

# File uploader for image input
image_file = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])

if image_file is not None:
    # Display the uploaded image
    image = Image.open(image_file)
    st.image(image, caption='Uploaded Image', use_column_width=True)

    # Identify the current upload so results from a previous image are not shown.
    upload_key = (image_file.name, image_file.size)

    # Button to run OCR. `st.button` is True only for the rerun triggered by
    # the click, so the results are kept in session state; otherwise they
    # would be lost as soon as the user types a search keyword.
    if st.button("Run OCR"):
        st.session_state["ocr_results"] = _run_ocr_pipeline(image_file)
        st.session_state["ocr_source"] = upload_key

    results = st.session_state.get("ocr_results")
    if results is not None and st.session_state.get("ocr_source") == upload_key:
        # Display the results
        _show_results(results)

        # Search functionality
        keyword = st.text_input("Enter keyword to search in extracted text:")

        if keyword:
            st.subheader("Search Results:")
            # Highlight the matching sections in the extracted text
            highlighted_text = highlight_keywords(results["extracted_text"], keyword)
            st.markdown(highlighted_text)

# Note: No `if __name__ == "__main__":` guard is needed; Streamlit runs this script top to bottom.