DeepDiveDev committed on
Commit
d657bf8
·
verified ·
1 Parent(s): b5623b3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +101 -101
app.py CHANGED
@@ -1,101 +1,101 @@
1
- import streamlit as st
2
- from transformers import AutoModel, AutoTokenizer, MarianMTModel, MarianTokenizer
3
- from PIL import Image
4
- import tempfile
5
- import os
6
- import easyocr
7
- import re
8
-
9
- # Load EasyOCR reader with English and Hindi language support
10
- reader = easyocr.Reader(['en', 'hi']) # 'en' for English, 'hi' for Hindi
11
-
12
- # Load the GOT-OCR2 model and tokenizer
13
- tokenizer = AutoTokenizer.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True)
14
- model = AutoModel.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True, low_cpu_mem_usage=True, device_map='cuda', use_safetensors=True, pad_token_id=tokenizer.eos_token_id)
15
- model = model.eval().cuda()
16
-
17
- # Load MarianMT translation model for Hindi to English translation
18
- translation_tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-hi-en')
19
- translation_model = MarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-hi-en')
20
-
21
- # Define a function for keyword highlighting
22
- def highlight_keywords(text, keyword):
23
- # Escape keyword for regex to avoid issues with special characters
24
- pattern = re.compile(re.escape(keyword), re.IGNORECASE)
25
- highlighted_text = pattern.sub(lambda match: f"**{match.group(0)}**", text)
26
- return highlighted_text
27
-
28
- # Streamlit App Title
29
- st.title("OCR with GOT-OCR2 (English & Hindi Translation) and Keyword Search")
30
-
31
- # File uploader for image input
32
- image_file = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
33
-
34
- if image_file is not None:
35
- # Display the uploaded image
36
- image = Image.open(image_file)
37
- st.image(image, caption='Uploaded Image', use_column_width=True)
38
-
39
- # Save the uploaded file to a temporary file
40
- with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as temp_file:
41
- temp_file.write(image_file.getvalue())
42
- temp_file_path = temp_file.name
43
-
44
- # Button to run OCR
45
- if st.button("Run OCR"):
46
- # Use GOT-OCR2 model for plain text OCR (structured documents)
47
- res_plain = model.chat(tokenizer, temp_file_path, ocr_type='ocr')
48
-
49
- # Perform formatted text OCR
50
- res_format = model.chat(tokenizer, temp_file_path, ocr_type='format')
51
-
52
- # Use EasyOCR for both English and Hindi text recognition
53
- result_easyocr = reader.readtext(temp_file_path, detail=0)
54
-
55
- # Display the results
56
- st.subheader("Plain Text OCR Results (English):")
57
- st.write(res_plain)
58
-
59
- st.subheader("Formatted Text OCR Results:")
60
- st.write(res_format)
61
-
62
- st.subheader("Detected Text using EasyOCR (English and Hindi):")
63
- extracted_text = " ".join(result_easyocr) # Combine the list of text results
64
- st.write(extracted_text)
65
-
66
- # Translate Hindi text to English using MarianMT (optional step)
67
- st.subheader("Translated Hindi Text to English:")
68
- translated_text = []
69
- for sentence in result_easyocr:
70
- # Detect if the text is in Hindi (you can customize this based on text properties)
71
- if sentence: # Assuming non-empty text is translated
72
- tokenized_text = translation_tokenizer([sentence], return_tensors="pt", truncation=True)
73
- translation = translation_model.generate(**tokenized_text)
74
- translated_sentence = translation_tokenizer.decode(translation[0], skip_special_tokens=True)
75
- translated_text.append(translated_sentence)
76
-
77
- st.write(" ".join(translated_text))
78
-
79
- # Additional OCR types using GOT-OCR2
80
- res_fine_grained = model.chat(tokenizer, temp_file_path, ocr_type='ocr', ocr_box='')
81
- st.subheader("Fine-Grained OCR Results:")
82
- st.write(res_fine_grained)
83
-
84
- # Render formatted OCR to HTML
85
- res_render = model.chat(tokenizer, temp_file_path, ocr_type='format', render=True, save_render_file='./demo.html')
86
- st.subheader("Rendered OCR Results (HTML):")
87
- st.write(res_render)
88
-
89
- # Search functionality
90
- keyword = st.text_input("Enter keyword to search in extracted text:")
91
-
92
- if keyword:
93
- st.subheader("Search Results:")
94
- # Highlight the matching sections in the extracted text
95
- highlighted_text = highlight_keywords(extracted_text, keyword)
96
- st.markdown(highlighted_text)
97
-
98
- # Clean up the temporary file after use
99
- os.remove(temp_file_path)
100
-
101
- # Note: No need for if __name__ == "__main__": st.run()
 
1
+ import streamlit as st
2
+ from transformers import AutoModel, AutoTokenizer, MarianMTModel, MarianTokenizer
3
+ from PIL import Image
4
+ import tempfile
5
+ import os
6
+ import easyocr
7
+ import re
8
+
9
+ # Load EasyOCR reader with English and Hindi language support
10
+ reader = easyocr.Reader(['en', 'hi']) # 'en' for English, 'hi' for Hindi
11
+
12
+ # Load the GOT-OCR2 model and tokenizer
13
+ tokenizer = AutoTokenizer.from_pretrained('stepfun-ai/GOT-OCR2_0', trust_remote_code=True)
14
+ model = AutoModel.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True, low_cpu_mem_usage=True, device_map='cuda', use_safetensors=True, pad_token_id=tokenizer.eos_token_id)
15
+ model = model.eval().cuda()
16
+
17
+ # Load MarianMT translation model for Hindi to English translation
18
+ translation_tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-hi-en')
19
+ translation_model = MarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-hi-en')
20
+
21
+ # Define a function for keyword highlighting
22
+ def highlight_keywords(text, keyword):
23
+ # Escape keyword for regex to avoid issues with special characters
24
+ pattern = re.compile(re.escape(keyword), re.IGNORECASE)
25
+ highlighted_text = pattern.sub(lambda match: f"**{match.group(0)}**", text)
26
+ return highlighted_text
27
+
28
+ # Streamlit App Title
29
+ st.title("OCR with GOT-OCR2 (English & Hindi Translation) and Keyword Search")
30
+
31
+ # File uploader for image input
32
+ image_file = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
33
+
34
+ if image_file is not None:
35
+ # Display the uploaded image
36
+ image = Image.open(image_file)
37
+ st.image(image, caption='Uploaded Image', use_column_width=True)
38
+
39
+ # Save the uploaded file to a temporary file
40
+ with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as temp_file:
41
+ temp_file.write(image_file.getvalue())
42
+ temp_file_path = temp_file.name
43
+
44
+ # Button to run OCR
45
+ if st.button("Run OCR"):
46
+ # Use GOT-OCR2 model for plain text OCR (structured documents)
47
+ res_plain = model.chat(tokenizer, temp_file_path, ocr_type='ocr')
48
+
49
+ # Perform formatted text OCR
50
+ res_format = model.chat(tokenizer, temp_file_path, ocr_type='format')
51
+
52
+ # Use EasyOCR for both English and Hindi text recognition
53
+ result_easyocr = reader.readtext(temp_file_path, detail=0)
54
+
55
+ # Display the results
56
+ st.subheader("Plain Text OCR Results (English):")
57
+ st.write(res_plain)
58
+
59
+ st.subheader("Formatted Text OCR Results:")
60
+ st.write(res_format)
61
+
62
+ st.subheader("Detected Text using EasyOCR (English and Hindi):")
63
+ extracted_text = " ".join(result_easyocr) # Combine the list of text results
64
+ st.write(extracted_text)
65
+
66
+ # Translate Hindi text to English using MarianMT (optional step)
67
+ st.subheader("Translated Hindi Text to English:")
68
+ translated_text = []
69
+ for sentence in result_easyocr:
70
+ # Detect if the text is in Hindi (you can customize this based on text properties)
71
+ if sentence: # Assuming non-empty text is translated
72
+ tokenized_text = translation_tokenizer([sentence], return_tensors="pt", truncation=True)
73
+ translation = translation_model.generate(**tokenized_text)
74
+ translated_sentence = translation_tokenizer.decode(translation[0], skip_special_tokens=True)
75
+ translated_text.append(translated_sentence)
76
+
77
+ st.write(" ".join(translated_text))
78
+
79
+ # Additional OCR types using GOT-OCR2
80
+ res_fine_grained = model.chat(tokenizer, temp_file_path, ocr_type='ocr', ocr_box='')
81
+ st.subheader("Fine-Grained OCR Results:")
82
+ st.write(res_fine_grained)
83
+
84
+ # Render formatted OCR to HTML
85
+ res_render = model.chat(tokenizer, temp_file_path, ocr_type='format', render=True, save_render_file='./demo.html')
86
+ st.subheader("Rendered OCR Results (HTML):")
87
+ st.write(res_render)
88
+
89
+ # Search functionality
90
+ keyword = st.text_input("Enter keyword to search in extracted text:")
91
+
92
+ if keyword:
93
+ st.subheader("Search Results:")
94
+ # Highlight the matching sections in the extracted text
95
+ highlighted_text = highlight_keywords(extracted_text, keyword)
96
+ st.markdown(highlighted_text)
97
+
98
+ # Clean up the temporary file after use
99
+ os.remove(temp_file_path)
100
+
101
+ # Note: No need for if __name__ == "__main__": st.run()