Spaces:

rajsecrets0
/

Hindi-English-OCR

Sleeping

App Files Files Community

Hindi-English-OCR / app.py

rajsecrets0

Upload 2 files

8f91176 verified 8 months ago

raw

history blame contribute delete

4.84 kB

	import streamlit as st
	import easyocr
	import numpy as np
	from PIL import Image
	import re
	import io
	import base64
	from streamlit_lottie import st_lottie
	import requests

	# Set page configuration: browser-tab title and the wide layout.
	st.set_page_config(layout="wide", page_title="OCR & Search App")

	# Custom CSS: the keyword highlight, the fixed footer bar, and
	# full-width buttons. Kept in a named constant so the markup is
	# separate from the call that injects it.
	CUSTOM_CSS = """
	<style>
	.highlight {
	background-color: yellow;
	font-weight: bold;
	}
	.footer {
	position: fixed;
	left: 0;
	bottom: 0;
	width: 100%;
	background-color: #f0f2f6;
	color: black;
	text-align: center;
	padding: 10px 0;
	font-style: italic;
	}
	.stButton>button {
	width: 100%;
	}
	</style>
	"""
	st.markdown(CUSTOM_CSS, unsafe_allow_html=True)

	# Lottie Animation
	def load_lottieurl(url: str):
	    """Download a Lottie animation description from *url*.

	    Args:
	        url: Address of the Lottie JSON document.

	    Returns:
	        The decoded JSON payload, or None when the request fails, the
	        server answers with a status other than 200, or the body is
	        not valid JSON.
	    """
	    try:
	        # Without a timeout a stalled connection would block the whole
	        # script run indefinitely.
	        response = requests.get(url, timeout=10)
	    except requests.RequestException:
	        return None

	    if response.status_code != 200:
	        return None

	    try:
	        return response.json()
	    except ValueError:
	        # The animation is decorative, so a malformed body is treated
	        # like any other failed download.
	        return None

	# Location of the Lottie animation that is displayed while no image
	# has been uploaded.
	lottie_url = "https://assets5.lottiefiles.com/packages/lf20_fcfjwiyb.json"
	# Downloaded on every script run; load_lottieurl returns None for a
	# non-200 response, so this value may be None.
	lottie_json = load_lottieurl(lottie_url)

	# Initialize the OCR reader
	@st.cache_resource
	def load_ocr_reader():
	    """Create the EasyOCR reader for English and Hindi text.

	    Decorated with ``st.cache_resource`` so the reader object is built
	    through Streamlit's resource cache rather than directly.
	    """
	    languages = ['en', 'hi']  # English and Hindi
	    return easyocr.Reader(languages)

	# Module-level OCR reader; process_image calls reader.readtext on it.
	reader = load_ocr_reader()

	def process_image(image):
	    """Run OCR on *image* and return the recognised text.

	    Args:
	        image: A PIL image (any object exposing ``convert``) or an
	            array-like that ``np.array`` can turn into a 2-D or 3-D
	            pixel array.

	    Returns:
	        The recognised text fragments joined with newlines. If anything
	        goes wrong, a message starting with "Error processing image:"
	        is returned instead of raising.
	    """
	    try:
	        if hasattr(image, 'convert'):
	            # Let Pillow normalise every mode to RGB. Palette images
	            # would otherwise be read as raw palette indices, 'LA'
	            # images as 2-channel arrays, and CMYK images would be
	            # sliced as if they were RGBA.
	            image = image.convert('RGB')

	        img_array = np.array(image)
	        if img_array.ndim == 2:  # Grayscale array input
	            img_array = np.stack((img_array,) * 3, axis=-1)
	        elif img_array.shape[2] == 4:  # RGBA array input: drop alpha
	            img_array = img_array[:, :, :3]

	        results = reader.readtext(img_array)
	        # Index 1 of each result is used as the recognised text.
	        return '\n'.join(result[1] for result in results)
	    except Exception as e:
	        # Callers expect a string in every case, so failures are
	        # reported in-band rather than raised.
	        return f"Error processing image: {e}"

	def search_in_text(extracted_text, keyword):
	    """Return an HTML fragment listing the lines that contain *keyword*.

	    Matching is case-insensitive and literal (regex metacharacters in
	    the keyword have no special meaning). Each hit is wrapped in
	    ``<span class='highlight'>``; matching lines are joined with
	    ``<br>``. Because the result is meant to be rendered as raw HTML,
	    all text taken from the input is HTML-escaped so OCR output such as
	    ``<`` or ``&`` cannot inject markup.

	    Args:
	        extracted_text: The text to search, lines separated by newlines.
	        keyword: The literal string to look for.

	    Returns:
	        The highlighted HTML, "No keyword provided." for an empty
	        keyword, "Keyword not found." when nothing matches, or a
	        message starting with "Error searching text:" when the
	        arguments are not strings.
	    """
	    import html  # local import: the file's import block is left as-is

	    if not keyword:
	        return "No keyword provided."

	    try:
	        # Compile once; the pattern does not change between lines.
	        pattern = re.compile(re.escape(keyword), re.IGNORECASE)
	        highlighted_lines = []
	        for line in extracted_text.split('\n'):
	            pieces = []
	            cursor = 0
	            for match in pattern.finditer(line):
	                # Escape the text between hits and the hit separately,
	                # so the only markup in the output is our own span.
	                pieces.append(html.escape(line[cursor:match.start()], quote=False))
	                hit = html.escape(match.group(), quote=False)
	                pieces.append(f"<span class='highlight'>{hit}</span>")
	                cursor = match.end()
	            if not pieces:
	                continue
	            pieces.append(html.escape(line[cursor:], quote=False))
	            highlighted_lines.append("".join(pieces))
	    except (AttributeError, TypeError) as e:
	        return f"Error searching text: {e}"

	    if highlighted_lines:
	        return "<br>".join(highlighted_lines)
	    return "Keyword not found."

	# Streamlit app: page title and a one-line description of the workflow.
	st.title("📷 OCR and Keyword Search Application")
	st.write(
	    "Upload an image containing Hindi or English text, extract the content, and search for keywords."
	)

	# Create three columns with equal relative widths
	# (upload / extracted text / keyword search).
	column_weights = [1, 1, 1]
	col1, col2, col3 = st.columns(column_weights)

	# Left column: image upload, preview, and the OCR trigger.
	with col1:
	    st.header("📤 Upload Image")
	    uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])

	    if uploaded_file is not None:
	        # Open the upload once and reuse it for both OCR and preview.
	        try:
	            image = Image.open(uploaded_file)
	        except OSError as exc:
	            # Pillow reports unreadable or unrecognised files with
	            # OSError subclasses; show a message instead of a traceback.
	            image = None
	            st.error(f"Could not open image: {exc}")

	        if image is not None:
	            if st.button('🔍 Extract Text', key='extract'):
	                with st.spinner('Extracting text...'):
	                    extracted_text = process_image(image)
	                # process_image reports failures in-band with this
	                # prefix; do not present them as a successful run.
	                if extracted_text.startswith("Error processing image:"):
	                    st.error(extracted_text)
	                else:
	                    st.session_state['extracted_text'] = extracted_text
	                    st.success('Text extracted successfully!')

	            st.image(image, caption='Uploaded Image', use_column_width=True)
	    elif lottie_json is not None:
	        # The animation is optional: skip it when the download failed.
	        st_lottie(lottie_json, key="lottie", height=300)

	# Middle column: the extracted text and a download button for it.
	with col2:
	    st.header("📝 Extracted Text")
	    if 'extracted_text' in st.session_state:
	        # An empty label triggers a Streamlit accessibility warning;
	        # give the widget a real label and hide it visually instead.
	        st.text_area(
	            "Extracted text",
	            st.session_state['extracted_text'],
	            height=300,
	            label_visibility="collapsed",
	        )

	        # Download button
	        st.download_button(
	            label="📥 Download Extracted Text",
	            data=st.session_state['extracted_text'].encode('utf-8'),
	            file_name="extracted_text.txt",
	            mime="text/plain",
	        )
	    else:
	        st.info("Upload an image and extract text to see the results here.")

	# Right column: keyword search plus simple statistics about the text.
	with col3:
	    st.header("🔎 Keyword Search")
	    if 'extracted_text' not in st.session_state:
	        st.info("Extract text from an image to use the search functionality.")
	    else:
	        keyword = st.text_input("Enter keyword to search")
	        if keyword:
	            search_result = search_in_text(st.session_state['extracted_text'], keyword)
	            # The search result contains highlight spans, so it is
	            # rendered as HTML.
	            st.markdown(search_result, unsafe_allow_html=True)

	        # Word count: number of whitespace-separated tokens.
	        word_count = len(st.session_state['extracted_text'].split())
	        st.metric(label="Word Count", value=word_count)

	        # Language detection
	        def detect_language(text):
	            """Classify *text* by the presence of Devanagari characters.

	            Any character in U+0900..U+097F yields the Hindi label;
	            otherwise the text is reported as English.
	            """
	            has_devanagari = re.search(r'[\u0900-\u097F]', text) is not None
	            return "Hindi (and possibly English)" if has_devanagari else "English"

	        language = detect_language(st.session_state['extracted_text'])
	        st.info(f"Detected Language: {language}")

	# Add a footer: the markup relies on the "footer" CSS class and is
	# rendered as raw HTML.
	FOOTER_HTML = """
	<div class="footer">
	<p>Created By Devender Singh</p>
	</div>
	"""
	st.markdown(FOOTER_HTML, unsafe_allow_html=True)