Spaces:

PRIYANSHUDHAKED
/

Data_Extraction_OCR

Sleeping

App Files Files Community

Data_Extraction_OCR / app.py

PRIYANSHUDHAKED

Update app.py

a9c96c1 verified 6 months ago

raw

history blame

3.17 kB

	import os
	import google.generativeai as genai
	from PIL import Image
	import io
	import streamlit as st
	import re

	# Google Gemini API key, read from the environment so the secret never lives
	# in source control. os.getenv() takes the NAME of an environment variable,
	# not the secret itself.
	# NOTE(review): a literal key used to be embedded on this line; treat that key
	# as compromised and rotate it. Confirm the deployment exposes the secret
	# under the name GOOGLE_API_KEY.
	GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

	# Configure Google Gemini with your API key
	genai.configure(api_key=GOOGLE_API_KEY)

	# Create a GenerativeModel instance
	model = genai.GenerativeModel("gemini-1.5-flash")

	def extract_text_with_gemini(image):
	    """Ask the module-level Gemini ``model`` to transcribe the text in ``image``.

	    A fixed instruction prompt is sent together with ``image`` through
	    ``model.generate_content``; the ``text`` attribute of the reply is returned.
	    """
	    # The lines of this literal are kept exactly as they were so that the
	    # prompt sent to the model is unchanged.
	    instructions = """
	Extract all text from this image. Provide the output as plain text,
	maintaining the general layout and structure of the document.
	Include all visible text, headings, and any important information.
	"""
	    return model.generate_content([instructions, image]).text

	def search_and_highlight(full_text, keyword):
	    """Find every case-insensitive occurrence of ``keyword`` in ``full_text``.

	    Args:
	        full_text: The text to search.
	        keyword: Literal string to look for; regex metacharacters are escaped,
	            so it is never interpreted as a pattern.

	    Returns:
	        A ``(results, html_text)`` tuple.

	        ``results`` has one snippet per match, in document order. Each snippet
	        is the match plus up to 50 characters of context on either side, with
	        the match wrapped in ANSI escape codes (``\\033[43m`` ... ``\\033[0m``)
	        for console display.

	        ``html_text`` is ``full_text`` with ``&``, ``<`` and ``>`` HTML-escaped
	        and every match wrapped in ``<mark>`` tags, so it can be embedded in a
	        page without the searched text being interpreted as markup.

	        For an empty keyword, or when nothing matches, ``results`` is empty and
	        ``html_text`` is just the escaped text.
	    """
	    from html import escape  # local import keeps this block self-contained

	    if not keyword:
	        # An empty pattern matches at every position; treat it as "no search".
	        return [], escape(full_text, quote=False)

	    pattern = re.compile(re.escape(keyword), re.IGNORECASE)
	    text_length = len(full_text)

	    results = []
	    html_parts = []
	    cursor = 0  # end of the previous match; start of the next unmarked run

	    # A single forward pass: pieces are collected and joined once, so no index
	    # bookkeeping is needed and snippets come out in reading order.
	    for match in pattern.finditer(full_text):
	        start, end = match.span()
	        context_start = max(0, start - 50)
	        context_end = min(text_length, end + 50)

	        # Highlight for console output
	        results.append(
	            full_text[context_start:start]
	            + '\033[43m' + full_text[start:end] + '\033[0m'
	            + full_text[end:context_end]
	        )

	        # Highlight for HTML output
	        html_parts.append(escape(full_text[cursor:start], quote=False))
	        html_parts.append(
	            f'<mark>{escape(full_text[start:end], quote=False)}</mark>'
	        )
	        cursor = end

	    html_parts.append(escape(full_text[cursor:], quote=False))
	    return results, "".join(html_parts)

	def app():
	    """Render the Streamlit page: upload an image, OCR it, then search the text.

	    Streamlit re-executes the script on each widget interaction, so the
	    extracted text is kept in ``st.session_state`` under a hash of the uploaded
	    bytes. The OCR request is therefore sent once per image rather than once
	    per keystroke in the search box.
	    """
	    st.title("Image OCR and Search")
	    uploaded_file = st.file_uploader("Upload an image", type=["png", "jpg", "jpeg"])

	    if uploaded_file is None:
	        return

	    # Open and display the image
	    image = Image.open(uploaded_file)
	    st.image(image, caption="Uploaded Image", use_column_width=True)

	    # Only call the model when a different file has been uploaded.
	    upload_key = hash(uploaded_file.getvalue())
	    if st.session_state.get("ocr_upload_key") != upload_key:
	        with st.spinner("Extracting text from the image..."):
	            try:
	                ocr_text = extract_text_with_gemini(image)
	            except Exception as exc:  # UI boundary: report instead of crashing the page
	                st.error(f"Text extraction failed: {exc}")
	                return
	        st.session_state["ocr_upload_key"] = upload_key
	        st.session_state["ocr_text"] = ocr_text
	    extracted_text = st.session_state["ocr_text"]

	    st.subheader("Extracted Text:")
	    st.write(extracted_text)

	    # Search functionality
	    search_keyword = st.text_input("Enter a keyword to search (or press Enter to exit)")
	    if search_keyword:
	        results, html_text = search_and_highlight(extracted_text, search_keyword)

	        if results:
	            st.subheader(f"Keyword '{search_keyword}' found in the extracted text:")
	            for i, result in enumerate(results, 1):
	                # The snippets mark the hit with ANSI terminal escapes, which a
	                # browser shows as stray characters; use Markdown bold instead.
	                snippet = result.replace('\033[43m', '**').replace('\033[0m', '**')
	                st.write(f"{i}. ...{snippet}...")

	            # Display HTML with highlighted text
	            # NOTE(review): rendered as raw HTML, so search_and_highlight must
	            # hand back HTML-safe markup.
	            st.markdown(f"<p>{html_text}</p>", unsafe_allow_html=True)
	        else:
	            st.write(f"Keyword '{search_keyword}' not found in the extracted text.")

	    # NOTE(review): the original nesting of this line was lost; it is assumed
	    # to run whenever an image has been processed — confirm.
	    st.write("OCR and search completed.")

	# Script entry point: build the page only when this file is executed directly,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    app()