# Spaces status (page banner captured alongside the source, not program code): Sleeping
import base64
import html
import io
import re

import easyocr
import numpy as np
import requests
import streamlit as st
from PIL import Image
from streamlit_lottie import st_lottie
# Page configuration: browser-tab title and wide layout.
st.set_page_config(page_title="OCR & Search App", layout="wide")

# Stylesheet injected into the page. It defines the yellow/bold keyword
# highlight, the fixed full-width footer bar, and full-width buttons.
_CUSTOM_CSS = """
<style>
.highlight {
background-color: yellow;
font-weight: bold;
}
.footer {
position: fixed;
left: 0;
bottom: 0;
width: 100%;
background-color: #f0f2f6;
color: black;
text-align: center;
padding: 10px 0;
font-style: italic;
}
.stButton>button {
width: 100%;
}
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
# Lottie animation shown in the upload column while no image is selected.
def load_lottieurl(url: str):
    """Fetch a Lottie animation description from *url*.

    Returns the decoded JSON payload, or ``None`` when the request fails,
    the server answers with a non-200 status, or the body is not valid
    JSON.
    """
    try:
        # Bounded wait so an unreachable host cannot hang the script.
        response = requests.get(url, timeout=10)
    except requests.RequestException:
        return None
    if response.status_code != 200:
        return None
    try:
        return response.json()
    except ValueError:
        # Body was not decodable as JSON; treat it like a failed download.
        return None


lottie_url = "https://assets5.lottiefiles.com/packages/lf20_fcfjwiyb.json"
lottie_json = load_lottieurl(lottie_url)
# Initialize the OCR reader
@st.cache_resource
def load_ocr_reader():
    """Build the EasyOCR reader for English and Hindi.

    Decorated with ``st.cache_resource`` so the reader is constructed once
    and reused, instead of being rebuilt each time this script runs.
    """
    return easyocr.Reader(['en', 'hi'])  # For English and Hindi


reader = load_ocr_reader()
def process_image(image):
    """Run OCR on *image* and return the recognised text.

    Args:
        image: A PIL image (any object exposing ``convert``) or array-like
            pixel data.

    Returns:
        The text element (index 1) of every OCR result joined with
        newlines, or a string starting with ``"Error processing image:"``
        when anything fails.
    """
    try:
        if hasattr(image, "convert"):
            # Let Pillow normalise every mode (palette, CMYK, LA, 1-bit,
            # RGBA, ...) to three-channel RGB. Slicing channels by hand
            # mis-reads palette indices and treats CMYK as RGBA.
            img_array = np.array(image.convert("RGB"))
        else:
            img_array = np.array(image)
            if img_array.ndim == 2:  # Grayscale
                img_array = np.stack((img_array,) * 3, axis=-1)
            elif img_array.ndim == 3 and img_array.shape[2] == 4:  # RGBA
                img_array = img_array[:, :, :3]
        results = reader.readtext(img_array)
        return '\n'.join(result[1] for result in results)
    except Exception as e:
        # Kept broad on purpose: the caller displays the returned string
        # rather than handling exceptions.
        return f"Error processing image: {str(e)}"
def search_in_text(extracted_text, keyword):
    """Return the lines of *extracted_text* containing *keyword*, as HTML.

    Matching is case-insensitive and literal (the keyword is regex-escaped).
    Each occurrence is wrapped in ``<span class='highlight'>``. All text
    taken from the input is HTML-escaped, because the result is rendered
    as raw HTML by the caller. Matching lines are joined with ``<br>``.

    Args:
        extracted_text: Newline-separated text to search.
        keyword: Literal text to look for.

    Returns:
        The highlighted HTML, ``"No keyword provided."`` for an empty
        keyword, ``"Keyword not found."`` when no line matches, or a
        string starting with ``"Error searching text:"`` on failure.
    """
    if not keyword:
        return "No keyword provided."
    try:
        # Compiled once; the pattern does not change between lines.
        pattern = re.compile(re.escape(keyword), re.IGNORECASE)
        highlighted_lines = []
        for line in extracted_text.split('\n'):
            pieces = []
            cursor = 0
            for match in pattern.finditer(line):
                # Escape the plain text before the hit and the hit itself,
                # so only our own <span> reaches the page as markup.
                pieces.append(html.escape(line[cursor:match.start()]))
                pieces.append(
                    f"<span class='highlight'>{html.escape(match.group())}</span>"
                )
                cursor = match.end()
            if not pieces:
                continue  # no occurrence on this line
            pieces.append(html.escape(line[cursor:]))
            highlighted_lines.append("".join(pieces))
        if highlighted_lines:
            return "<br>".join(highlighted_lines)
        return "Keyword not found."
    except Exception as e:
        return f"Error searching text: {str(e)}"
# Streamlit app

# Compiled once at module level: matches any character in U+0900-U+097F.
_HINDI_PATTERN = re.compile(r'[\u0900-\u097F]')


def detect_language(text):
    """Report Hindi if *text* has any U+0900-U+097F character, else English."""
    if _HINDI_PATTERN.search(text):
        return "Hindi (and possibly English)"
    return "English"


# NOTE(review): the emoji in the UI strings below were restored from
# mis-encoded characters; confirm the exact glyphs against the deployed app.
st.title("📷 OCR and Keyword Search Application")
st.write("Upload an image containing Hindi or English text, extract the content, and search for keywords.")

# Create three columns
col1, col2, col3 = st.columns([1, 1, 1])

with col1:
    st.header("📤 Upload Image")
    uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])
    if uploaded_file is not None:
        # Open once and reuse the same image for OCR and for the preview.
        image = Image.open(uploaded_file)
        if st.button('🔍 Extract Text', key='extract'):
            with st.spinner('Extracting text...'):
                extracted_text = process_image(image)
            # process_image reports failures as a prefixed string; show
            # those as errors instead of storing them as OCR output.
            if extracted_text.startswith("Error processing image:"):
                st.error(extracted_text)
            else:
                st.session_state['extracted_text'] = extracted_text
                st.success('Text extracted successfully!')
        st.image(image, caption='Uploaded Image', use_column_width=True)
    elif lottie_json is not None:
        # The animation is optional: load_lottieurl returns None on failure.
        st_lottie(lottie_json, key="lottie", height=300)

with col2:
    st.header("📝 Extracted Text")
    if 'extracted_text' in st.session_state:
        st.text_area("", st.session_state['extracted_text'], height=300)
        # Download button
        st.download_button(
            label="📥 Download Extracted Text",
            data=st.session_state['extracted_text'].encode('utf-8'),
            file_name="extracted_text.txt",
            mime="text/plain"
        )
    else:
        st.info("Upload an image and extract text to see the results here.")

with col3:
    st.header("🔎 Keyword Search")
    if 'extracted_text' in st.session_state:
        keyword = st.text_input("Enter keyword to search")
        if keyword:
            search_result = search_in_text(st.session_state['extracted_text'], keyword)
            st.markdown(search_result, unsafe_allow_html=True)
        # Word count
        word_count = len(st.session_state['extracted_text'].split())
        st.metric(label="Word Count", value=word_count)
        # Language detection
        language = detect_language(st.session_state['extracted_text'])
        st.info(f"Detected Language: {language}")
    else:
        st.info("Extract text from an image to use the search functionality.")
# Add a footer (styled by the .footer CSS rule).
_FOOTER_HTML = """
<div class="footer">
<p>Created By Devender Singh</p>
</div>
"""
st.markdown(_FOOTER_HTML, unsafe_allow_html=True)