Spaces:

rajsecrets0
/

Hindi-English-OCR

Sleeping

File size: 4,835 Bytes

8f91176

import base64
import html
import io
import re

import easyocr
import numpy as np
import requests
import streamlit as st
from PIL import Image
from streamlit_lottie import st_lottie

# Browser-tab title and full-width layout for the page.
st.set_page_config(layout="wide", page_title="OCR & Search App")

# Stylesheet for keyword highlights, the fixed footer and full-width buttons.
_CUSTOM_CSS = """
<style>
.highlight {
    background-color: yellow;
    font-weight: bold;
}
.footer {
    position: fixed;
    left: 0;
    bottom: 0;
    width: 100%;
    background-color: #f0f2f6;
    color: black;
    text-align: center;
    padding: 10px 0;
    font-style: italic;
}
.stButton>button {
    width: 100%;
}
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)

# Lottie Animation
def load_lottieurl(url: str):
    """Download a Lottie animation and return it as parsed JSON.

    Args:
        url: HTTP(S) address of the Lottie JSON file.

    Returns:
        The decoded JSON payload, or ``None`` when the request fails or
        times out, the server answers with a non-200 status, or the body
        is not valid JSON.
    """
    try:
        # Without a timeout an unreachable host would block the caller
        # indefinitely.
        r = requests.get(url, timeout=10)
        if r.status_code != 200:
            return None
        return r.json()
    except (requests.RequestException, ValueError):
        # RequestException covers connection errors and timeouts; ValueError
        # covers a body that cannot be decoded as JSON.
        return None

# Fetch the animation data; load_lottieurl returns None on a non-200 response.
lottie_url = "https://assets5.lottiefiles.com/packages/lf20_fcfjwiyb.json"
lottie_json = load_lottieurl(lottie_url)

# Build the OCR reader through Streamlit's resource cache.
@st.cache_resource
def load_ocr_reader():
    """Return an EasyOCR reader configured for English and Hindi."""
    languages = ['en', 'hi']
    return easyocr.Reader(languages)


reader = load_ocr_reader()

def process_image(image):
    """Run OCR on an image and return the recognised text.

    Args:
        image: A ``PIL.Image.Image`` or anything ``np.array`` can turn into
            a 2-D (grayscale) or 3-D (height, width, channels) pixel array.

    Returns:
        The text of every OCR result joined with newlines, or a string
        starting with ``"Error processing image: "`` if any step fails.
    """
    try:
        # These Pillow modes do not map onto a plain gray/RGB pixel array:
        # palette images expose colour indices, CMYK has four colour
        # channels (not RGB + alpha), 'LA' has two channels and '1' yields
        # booleans. Let Pillow convert them to real RGB pixels first.
        if isinstance(image, Image.Image) and image.mode in (
            '1', 'P', 'PA', 'LA', 'CMYK', 'YCbCr',
        ):
            image = image.convert('RGB')

        img_array = np.array(image)
        if img_array.ndim == 2:  # Grayscale: replicate into three channels
            img_array = np.stack((img_array,) * 3, axis=-1)
        elif img_array.shape[2] == 4:  # RGBA: drop the alpha channel
            img_array = img_array[:, :, :3]

        results = reader.readtext(img_array)
        # Each result's second element is the recognised text.
        return '\n'.join(result[1] for result in results)
    except Exception as e:
        # Failures are returned as text rather than raised.
        return f"Error processing image: {str(e)}"

def search_in_text(extracted_text, keyword):
    """Return the lines containing ``keyword`` as highlighted HTML.

    The match is case-insensitive and literal (the keyword is not treated
    as a regular expression). All text taken from ``extracted_text`` is
    HTML-escaped, so characters such as ``<`` or ``&`` in the text are shown
    literally instead of being interpreted as markup.

    Args:
        extracted_text: Newline-separated text to search.
        keyword: Text to look for.

    Returns:
        The matching lines joined with ``<br>``, each occurrence wrapped in
        ``<span class='highlight'>``; ``"No keyword provided."`` for an
        empty keyword; ``"Keyword not found."`` when nothing matches; or a
        string starting with ``"Error searching text: "`` on failure.
    """
    if not keyword:
        return "No keyword provided."

    try:
        # Loop-invariant work: compile the pattern and lower-case the
        # keyword once instead of once per line.
        pattern = re.compile(re.escape(keyword), re.IGNORECASE)
        needle = keyword.lower()

        highlighted_lines = []
        for line in extracted_text.split('\n'):
            if needle not in line.lower():
                continue

            # Escape the text between matches and the matches themselves
            # separately so only our own <span> tags remain as markup.
            pieces = []
            cursor = 0
            for match in pattern.finditer(line):
                pieces.append(html.escape(line[cursor:match.start()]))
                pieces.append(
                    f"<span class='highlight'>{html.escape(match.group())}</span>"
                )
                cursor = match.end()
            pieces.append(html.escape(line[cursor:]))
            highlighted_lines.append("".join(pieces))

        if highlighted_lines:
            return "<br>".join(highlighted_lines)
        return "Keyword not found."
    except Exception as e:
        return f"Error searching text: {str(e)}"

# Page heading and a short description of the workflow.
st.title("📷 OCR and Keyword Search Application")
st.write("Upload an image containing Hindi or English text, extract the content, and search for keywords.")

# Three equally weighted panes: upload, extracted text, keyword search.
column_weights = [1, 1, 1]
col1, col2, col3 = st.columns(column_weights)

# Upload pane: pick an image, run OCR on demand, and preview the image.
with col1:
    st.header("📤 Upload Image")
    uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])

    if uploaded_file is not None:
        # Open once and reuse the same image for OCR and for the preview.
        image = Image.open(uploaded_file)

        if st.button('🔍 Extract Text', key='extract'):
            with st.spinner('Extracting text...'):
                extracted_text = process_image(image)
                st.session_state['extracted_text'] = extracted_text
            # process_image reports failures by returning a message with
            # this prefix instead of raising, so check before claiming
            # success.
            if extracted_text.startswith("Error processing image:"):
                st.error(extracted_text)
            else:
                st.success('Text extracted successfully!')

        st.image(image, caption='Uploaded Image', use_column_width=True)
    elif lottie_json is not None:
        # lottie_json is None when the animation could not be downloaded;
        # in that case there is nothing to render.
        st_lottie(lottie_json, key="lottie", height=300)

# Result pane: show the OCR output and offer it as a text download.
with col2:
    st.header("📝 Extracted Text")
    if 'extracted_text' in st.session_state:
        # Give the widget a real label (Streamlit discourages empty labels
        # for accessibility) and hide it, since the header above already
        # names the pane.
        st.text_area(
            "Extracted text",
            st.session_state['extracted_text'],
            height=300,
            label_visibility="collapsed",
        )

        # Download button
        st.download_button(
            label="📥 Download Extracted Text",
            data=st.session_state['extracted_text'].encode('utf-8'),
            file_name="extracted_text.txt",
            mime="text/plain"
        )
    else:
        st.info("Upload an image and extract text to see the results here.")

# Search pane: keyword highlighting, word count and a language hint.
with col3:
    st.header("🔎 Keyword Search")
    if 'extracted_text' not in st.session_state:
        st.info("Extract text from an image to use the search functionality.")
    else:
        keyword = st.text_input("Enter keyword to search")
        if keyword:
            search_result = search_in_text(st.session_state['extracted_text'], keyword)
            st.markdown(search_result, unsafe_allow_html=True)

        # Number of whitespace-separated tokens in the extracted text.
        word_count = len(st.session_state['extracted_text'].split())
        st.metric(label="Word Count", value=word_count)

        def detect_language(text):
            """Report Hindi if any character in U+0900-U+097F occurs, else English."""
            has_devanagari = re.search(r'[\u0900-\u097F]', text) is not None
            return "Hindi (and possibly English)" if has_devanagari else "English"

        language = detect_language(st.session_state['extracted_text'])
        st.info(f"Detected Language: {language}")

# Credit line, rendered with the "footer" CSS class.
_FOOTER_HTML = """
<div class="footer">
    <p>Created By Devender Singh</p>
</div>
"""
st.markdown(_FOOTER_HTML, unsafe_allow_html=True)