Spaces:
Sleeping
Sleeping
File size: 4,835 Bytes
8f91176 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 |
import streamlit as st
import easyocr
import numpy as np
from PIL import Image
import re
import io
import base64
from streamlit_lottie import st_lottie
import requests
# Set page configuration
# Browser-tab title and full-width layout for the whole page.
st.set_page_config(layout="wide", page_title="OCR & Search App")

# Stylesheet injected into the page:
#   .highlight        - yellow, bold marker used for keyword matches
#   .footer           - bar pinned to the bottom of the viewport
#   .stButton>button  - buttons stretched to the width of their column
_CUSTOM_CSS = """
<style>
.highlight {
background-color: yellow;
font-weight: bold;
}
.footer {
position: fixed;
left: 0;
bottom: 0;
width: 100%;
background-color: #f0f2f6;
color: black;
text-align: center;
padding: 10px 0;
font-style: italic;
}
.stButton>button {
width: 100%;
}
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
# Lottie Animation
def load_lottieurl(url: str, timeout: float = 10):
    """Download a Lottie animation and return it as parsed JSON.

    Args:
        url: Address of the Lottie JSON document.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON payload, or ``None`` when the request fails, the
        server does not answer with HTTP 200, or the body is not valid JSON.
    """
    try:
        # Without a timeout a stalled host would block the script forever.
        r = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # The animation is decorative; a network failure must not crash the app.
        return None
    if r.status_code != 200:
        return None
    try:
        return r.json()
    except ValueError:
        # A 200 response with a non-JSON body (e.g. an HTML error page).
        return None
# Location of the Lottie animation JSON.
lottie_url = "https://assets5.lottiefiles.com/packages/lf20_fcfjwiyb.json"
# Downloaded when the script runs; load_lottieurl() yields None on a non-200 response.
lottie_json = load_lottieurl(lottie_url)
# Initialize the OCR reader
@st.cache_resource
def load_ocr_reader():
    """Create the EasyOCR reader configured for English and Hindi.

    The function is wrapped in ``st.cache_resource``, so Streamlit keeps
    the returned reader and hands it back on later calls.
    """
    languages = ['en', 'hi']  # For English and Hindi
    ocr_reader = easyocr.Reader(languages)
    return ocr_reader
# Module-level OCR reader; process_image() calls reader.readtext() on it.
reader = load_ocr_reader()
def process_image(image):
    """Run OCR on an image and return the recognised text.

    Args:
        image: A PIL image, or anything ``numpy`` can turn into an array
            (a 2-D grayscale array or an H x W x C array).

    Returns:
        The recognised text fragments joined with newlines. On any failure
        the function does not raise; it returns a string starting with
        ``"Error processing image: "`` followed by the exception message.
    """
    try:
        if hasattr(image, "convert"):
            # PIL input: let Pillow normalise every mode (palette, CMYK,
            # LA, 1-bit, RGBA, grayscale) to three 8-bit RGB channels.
            # Inspecting the raw array shape alone misreads palette
            # indices as gray levels and CMYK planes as RGBA.
            img_array = np.array(image.convert("RGB"))
        else:
            img_array = np.asarray(image)
            if img_array.ndim == 2:  # Grayscale -> replicate into 3 channels
                img_array = np.stack((img_array,) * 3, axis=-1)
            elif img_array.ndim == 3 and img_array.shape[2] == 4:  # RGBA -> drop alpha
                img_array = img_array[:, :, :3]
        results = reader.readtext(img_array)
        # Index 1 of each result entry is the recognised text.
        return '\n'.join(entry[1] for entry in results)
    except Exception as e:
        # Callers expect a string in every case, so failures are reported
        # in-band rather than raised.
        return f"Error processing image: {str(e)}"
def search_in_text(extracted_text, keyword):
    """Find lines containing *keyword* and return them as highlighted HTML.

    The search is case-insensitive and treats *keyword* literally (no
    regex syntax). The result is meant to be rendered as HTML, so all text
    taken from *extracted_text* is HTML-escaped; only the highlighting
    ``<span>`` and the ``<br>`` separators are real markup.

    Args:
        extracted_text: Newline-separated text to search.
        keyword: Text to look for.

    Returns:
        The matching lines joined with ``<br>``, each occurrence wrapped in
        ``<span class='highlight'>``; ``"No keyword provided."`` for an
        empty keyword; ``"Keyword not found."`` when nothing matches; or a
        string starting with ``"Error searching text: "`` if an exception
        occurs (the function itself never raises).
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    if not keyword:
        return "No keyword provided."
    try:
        # Compiled once; the same pattern decides both which lines match
        # and what gets highlighted, so the two can never disagree.
        pattern = re.compile(re.escape(keyword), re.IGNORECASE)
        highlighted_lines = []
        for line in extracted_text.split('\n'):
            matches = list(pattern.finditer(line))
            if not matches:
                continue
            pieces = []
            cursor = 0
            for match in matches:
                # Escape the plain text between matches and the match itself
                # separately so the inserted <span> tags stay intact.
                pieces.append(escape(line[cursor:match.start()], quote=False))
                pieces.append(
                    f"<span class='highlight'>{escape(match.group(), quote=False)}</span>"
                )
                cursor = match.end()
            pieces.append(escape(line[cursor:], quote=False))
            highlighted_lines.append("".join(pieces))
        if highlighted_lines:
            return "<br>".join(highlighted_lines)
        return "Keyword not found."
    except Exception as e:
        # Preserve the never-raises contract callers rely on.
        return f"Error searching text: {str(e)}"
# Streamlit app
st.title("📷 OCR and Keyword Search Application")
st.write("Upload an image containing Hindi or English text, extract the content, and search for keywords.")

# Three equal-width columns: upload | extracted text | keyword search.
col1, col2, col3 = st.columns([1, 1, 1])

with col1:
    st.header("📤 Upload Image")
    uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])
    if uploaded_file is not None:
        # Open the upload once and reuse it for both OCR and the preview.
        try:
            image = Image.open(uploaded_file)
        except OSError as exc:
            # Pillow signals unreadable/unsupported files with OSError subclasses.
            image = None
            st.error(f"Could not read the uploaded image: {exc}")
        if image is not None:
            if st.button('🔍 Extract Text', key='extract'):
                with st.spinner('Extracting text...'):
                    extracted_text = process_image(image)
                # process_image() reports failures in-band with this prefix;
                # do not present such a result as a successful extraction.
                if extracted_text.startswith("Error processing image:"):
                    st.session_state.pop('extracted_text', None)
                    st.error(extracted_text)
                else:
                    st.session_state['extracted_text'] = extracted_text
                    st.success('Text extracted successfully!')
            st.image(image, caption='Uploaded Image', use_column_width=True)
    elif lottie_json is not None:
        # The animation is only available when its download succeeded.
        st_lottie(lottie_json, key="lottie", height=300)

with col2:
    st.header("📝 Extracted Text")
    if 'extracted_text' in st.session_state:
        # A non-empty label is kept for accessibility but hidden visually.
        st.text_area(
            "Extracted text",
            st.session_state['extracted_text'],
            height=300,
            label_visibility="collapsed",
        )
        # Download button
        st.download_button(
            label="📥 Download Extracted Text",
            data=st.session_state['extracted_text'].encode('utf-8'),
            file_name="extracted_text.txt",
            mime="text/plain",
        )
    else:
        st.info("Upload an image and extract text to see the results here.")

with col3:
    st.header("🔎 Keyword Search")
    if 'extracted_text' in st.session_state:
        keyword = st.text_input("Enter keyword to search")
        if keyword:
            search_result = search_in_text(st.session_state['extracted_text'], keyword)
            # search_in_text() returns HTML (highlight spans, <br> separators).
            st.markdown(search_result, unsafe_allow_html=True)

        # Word count: whitespace-separated tokens of the extracted text.
        word_count = len(st.session_state['extracted_text'].split())
        st.metric(label="Word Count", value=word_count)

        # Language detection
        def detect_language(text):
            """Classify *text* by the presence of Devanagari characters.

            Returns "Hindi (and possibly English)" if any character in the
            range U+0900-U+097F occurs in *text*, otherwise "English".
            """
            hindi_pattern = re.compile(r'[\u0900-\u097F]')
            if hindi_pattern.search(text):
                return "Hindi (and possibly English)"
            return "English"

        language = detect_language(st.session_state['extracted_text'])
        st.info(f"Detected Language: {language}")
    else:
        st.info("Extract text from an image to use the search functionality.")

# Add a footer (styled by the .footer CSS class).
st.markdown("""
<div class="footer">
<p>Created By Devender Singh</p>
</div>
""", unsafe_allow_html=True)