File size: 4,835 Bytes
8f91176
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import base64
import html
import io
import re

import easyocr
import numpy as np
import requests
import streamlit as st
from PIL import Image
from streamlit_lottie import st_lottie

# Page setup: browser-tab title and the wide layout.
st.set_page_config(layout="wide", page_title="OCR & Search App")

# Stylesheet injected into the page:
#   .highlight        - marks keyword matches emitted by search_in_text
#   .footer           - pins the credit bar to the bottom of the viewport
#   .stButton>button  - stretches buttons to the full width of their column
_CUSTOM_CSS = """
<style>
.highlight {
    background-color: yellow;
    font-weight: bold;
}
.footer {
    position: fixed;
    left: 0;
    bottom: 0;
    width: 100%;
    background-color: #f0f2f6;
    color: black;
    text-align: center;
    padding: 10px 0;
    font-style: italic;
}
.stButton>button {
    width: 100%;
}
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)

# Lottie animation
# Upper bound (seconds) on the animation download so a slow or unreachable
# host cannot hang the app.
_LOTTIE_TIMEOUT_SECONDS = 10


@st.cache_data(show_spinner=False)
def _fetch_lottie_json(url: str):
    """Download and decode a Lottie animation, raising on any failure.

    Kept separate from ``load_lottieurl`` so that only successful downloads
    end up in Streamlit's data cache: a failure raises instead of returning,
    and is therefore retried on the next script run.

    Raises:
        requests.RequestException: On connection errors or timeouts.
        ValueError: On a non-200 response or a body that is not valid JSON.
    """
    response = requests.get(url, timeout=_LOTTIE_TIMEOUT_SECONDS)
    if response.status_code != 200:
        raise ValueError(f"unexpected HTTP status {response.status_code}")
    return response.json()


def load_lottieurl(url: str):
    """Return the Lottie animation at *url* as parsed JSON, or ``None``.

    Args:
        url: Address of a Lottie JSON document.

    Returns:
        The decoded JSON payload, or ``None`` when the request fails, times
        out, answers with a status other than 200, or is not valid JSON.
        The animation is decorative, so a failed download must never take
        the whole app down.
    """
    try:
        return _fetch_lottie_json(url)
    except (requests.RequestException, ValueError):
        return None


lottie_url = "https://assets5.lottiefiles.com/packages/lf20_fcfjwiyb.json"
lottie_json = load_lottieurl(lottie_url)

# OCR engine setup
@st.cache_resource
def load_ocr_reader():
    """Create the EasyOCR reader used for text extraction.

    The reader is configured for English ('en') and Hindi ('hi'). The
    function is decorated with ``st.cache_resource``, so the returned reader
    is meant to be shared rather than rebuilt by each caller.
    """
    languages = ['en', 'hi']  # English and Hindi
    ocr_reader = easyocr.Reader(languages)
    return ocr_reader


reader = load_ocr_reader()

def process_image(image):
    """Run OCR on *image* and return the recognised text.

    Args:
        image: A PIL image (any mode) or anything ``np.array`` can turn into
            an H x W or H x W x C pixel array.

    Returns:
        The recognised text fragments joined with newlines (an empty string
        when nothing is recognised). On failure, a message starting with
        ``"Error processing image: "`` is returned instead of raising, which
        is the contract existing callers rely on.
    """
    try:
        # Let Pillow normalise the pixel format first. Palette ("P"), "LA",
        # "1" and CMYK images would otherwise reach the OCR engine as palette
        # indices or with an unexpected channel count.
        if hasattr(image, "convert"):
            image = image.convert("RGB")

        img_array = np.array(image)
        if img_array.ndim == 2:  # Grayscale: replicate into three channels
            img_array = np.stack((img_array,) * 3, axis=-1)
        elif img_array.ndim == 3 and img_array.shape[2] == 4:  # RGBA: drop alpha
            img_array = img_array[:, :, :3]

        results = reader.readtext(img_array)
        # Index 1 of each result is taken as the recognised text.
        return '\n'.join(result[1] for result in results)
    except Exception as e:
        # Deliberately broad: decoding and OCR can fail in many ways, and the
        # UI expects a displayable message rather than a traceback.
        return f"Error processing image: {str(e)}"

def search_in_text(extracted_text, keyword):
    """Return the lines of *extracted_text* containing *keyword*, as HTML.

    Matching is case-insensitive and literal (regex metacharacters in
    *keyword* have no special meaning). Every occurrence is wrapped in
    ``<span class='highlight'>``; all other text is HTML-escaped because the
    result is rendered as raw HTML and OCR output is untrusted.

    Args:
        extracted_text: Newline-separated text to search.
        keyword: Substring to look for.

    Returns:
        Matching lines joined with ``<br>``, or one of the plain messages
        ``"No keyword provided."`` / ``"Keyword not found."``. If the inputs
        are not usable strings, a message starting with
        ``"Error searching text: "`` is returned instead of raising.
    """
    if not keyword:
        return "No keyword provided."

    try:
        # Compiled once; the same pattern both selects and highlights lines,
        # so the two steps can never disagree about what counts as a match.
        pattern = re.compile(re.escape(keyword), re.IGNORECASE)

        highlighted_lines = []
        for line in extracted_text.split('\n'):
            pieces = []
            cursor = 0
            for match in pattern.finditer(line):
                pieces.append(html.escape(line[cursor:match.start()]))
                pieces.append(
                    f"<span class='highlight'>{html.escape(match.group())}</span>"
                )
                cursor = match.end()
            if not pieces:
                continue  # keyword does not occur on this line
            pieces.append(html.escape(line[cursor:]))
            highlighted_lines.append("".join(pieces))

        if highlighted_lines:
            return "<br>".join(highlighted_lines)
        return "Keyword not found."
    except (AttributeError, TypeError, re.error) as e:
        return f"Error searching text: {str(e)}"

# Streamlit app
st.title("📷 OCR and Keyword Search Application")
st.write("Upload an image containing Hindi or English text, extract the content, and search for keywords.")

# Three equal-width columns: upload | extracted text | keyword search
col1, col2, col3 = st.columns([1, 1, 1])

with col1:
    st.header("📤 Upload Image")
    uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])

    if uploaded_file is not None:
        # Decode the upload once and reuse it for both OCR and the preview.
        image = Image.open(uploaded_file)

        if st.button('🔍 Extract Text', key='extract'):
            with st.spinner('Extracting text...'):
                extracted_text = process_image(image)

            # process_image reports failures by returning a message with this
            # prefix instead of raising. Such a message must not be stored
            # (and later searched/downloaded) as if it were OCR output.
            if extracted_text.startswith("Error processing image:"):
                if 'extracted_text' in st.session_state:
                    del st.session_state['extracted_text']
                st.error(extracted_text)
            else:
                st.session_state['extracted_text'] = extracted_text
                st.success('Text extracted successfully!')

        st.image(image, caption='Uploaded Image', use_column_width=True)
    elif lottie_json is not None:
        # Placeholder animation; skipped when it could not be downloaded
        # (load_lottieurl returns None in that case).
        st_lottie(lottie_json, key="lottie", height=300)

with col2:
    st.header("📝 Extracted Text")
    if 'extracted_text' in st.session_state:
        # A real label is supplied and hidden visually: the header above
        # already titles the box, but an empty label leaves the widget
        # unnamed for assistive technology.
        st.text_area(
            "Extracted text",
            st.session_state['extracted_text'],
            height=300,
            label_visibility="collapsed",
        )

        # Offer the extracted text as a UTF-8 encoded .txt download.
        st.download_button(
            label="📥 Download Extracted Text",
            data=st.session_state['extracted_text'].encode('utf-8'),
            file_name="extracted_text.txt",
            mime="text/plain"
        )
    else:
        st.info("Upload an image and extract text to see the results here.")

# Matches any character in the Devanagari Unicode range (U+0900-U+097F).
# Compiled at module level instead of inside the UI branch on every call.
_DEVANAGARI_PATTERN = re.compile(r'[\u0900-\u097F]')


def detect_language(text):
    """Classify *text* by script: Hindi if any Devanagari character occurs.

    Args:
        text: The extracted text to inspect.

    Returns:
        ``"Hindi (and possibly English)"`` when a Devanagari character is
        present, ``"English"`` for any other non-blank text, and
        ``"Unknown (no text extracted)"`` for empty or whitespace-only text,
        where naming a language would be misleading.
    """
    if not text.strip():
        return "Unknown (no text extracted)"
    if _DEVANAGARI_PATTERN.search(text):
        return "Hindi (and possibly English)"
    return "English"


with col3:
    st.header("🔎 Keyword Search")
    if 'extracted_text' in st.session_state:
        current_text = st.session_state['extracted_text']

        keyword = st.text_input("Enter keyword to search")
        if keyword:
            search_result = search_in_text(current_text, keyword)
            # The result contains <span>/<br> markup, so it is rendered as HTML.
            st.markdown(search_result, unsafe_allow_html=True)

        # Word count: whitespace-separated tokens
        word_count = len(current_text.split())
        st.metric(label="Word Count", value=word_count)

        # Language detection
        language = detect_language(current_text)
        st.info(f"Detected Language: {language}")
    else:
        st.info("Extract text from an image to use the search functionality.")

# Footer: credit bar styled by the `.footer` CSS rule (fixed to the page bottom).
_FOOTER_HTML = """
<div class="footer">
    <p>Created By Devender Singh</p>
</div>
"""
st.markdown(_FOOTER_HTML, unsafe_allow_html=True)