Spaces:

PRIYANSHUDHAKED
/

Data_Extraction_OCR

Sleeping

Data_Extraction_OCR / app.py

Update app.py

dbf63c3 verified 9 months ago

2.06 kB

	# app.py
	import html
	import re

	import cv2
	import numpy as np
	import pytesseract
	import streamlit as st
	from PIL import Image

	# Number of characters of surrounding text shown on each side of a match.
	CONTEXT_CHARS = 50


	def _highlight_match(text, start, end, window=CONTEXT_CHARS):
	    """Return an HTML snippet of ``text`` with ``text[start:end]`` highlighted.

	    Up to ``window`` characters before and after the match are included.
	    Every piece taken from ``text`` is HTML-escaped, so the only markup in
	    the returned string is the highlighting ``<span>`` added here.
	    """
	    context_start = max(0, start - window)
	    context_end = min(len(text), end + window)
	    before = html.escape(text[context_start:start])
	    matched = html.escape(text[start:end])
	    after = html.escape(text[end:context_end])
	    return (
	        f"{before}"
	        f"<span style='background-color: yellow;'>{matched}</span>"
	        f"{after}"
	    )


	# Set the title of the webpage
	st.title("OCR Text Extraction Tool")

	# Uploading an image
	uploaded_file = st.file_uploader("Upload an Image", type=["jpg", "jpeg", "png"])

	if uploaded_file is not None:
	    try:
	        # Decoding happens inside the try so that a corrupt or unsupported
	        # upload is reported through st.error instead of a raw traceback.
	        image = Image.open(uploaded_file)

	        # Display the image
	        st.image(image, caption='Uploaded Image', use_column_width=True)

	        # Perform OCR. The image is normalized to three-channel RGB so that
	        # grayscale, palette and alpha uploads are all handled, and it is
	        # passed as a PIL image because pytesseract expects RGB channel order
	        # (a BGR OpenCV array would be read with red and blue swapped).
	        text = pytesseract.image_to_string(image.convert("RGB"))

	        st.subheader("Extracted Text:")
	        st.write(text)

	        # Search functionality
	        search_keyword = st.text_input("Enter a keyword to search in the extracted text:")
	        if search_keyword:
	            # re.escape makes the keyword match literally, not as a regex.
	            pattern = re.compile(re.escape(search_keyword), re.IGNORECASE)
	            matches = list(pattern.finditer(text))

	            if matches:
	                st.markdown("### Keyword Found:")
	                for match in matches:
	                    start, end = match.span()
	                    highlighted_text = _highlight_match(text, start, end)
	                    # unsafe_allow_html is required for the <span> to render;
	                    # the OCR text inside the snippet is already escaped.
	                    st.markdown(f"...{highlighted_text}...", unsafe_allow_html=True)
	            else:
	                st.write(f"Keyword '{search_keyword}' not found in the extracted text.")
	    except Exception as e:
	        # Top-level UI boundary: show any processing failure to the user.
	        st.error(f"An error occurred while processing the image: {str(e)}")