Spaces:

PRIYANSHUDHAKED
/

Data_Extraction_OCR

Sleeping

App Files Files Community

Data_Extraction_OCR / app.py

PRIYANSHUDHAKED

Update app.py

ece64b8 verified 7 months ago

raw

history blame

3.11 kB

	import html
	import io
	import os
	import re

	import google.generativeai as genai
	from google.colab import files
	from IPython.display import HTML, display
	from PIL import Image

	# Google Gemini API key, read from the GOOGLE_API_KEY environment variable.
	# os.getenv() takes the *name* of a variable, never the secret itself: a
	# literal key in this call is both published with the source and looked up
	# as a (non-existent) variable name, so the result would always be None.
	# NOTE(review): any key that was ever committed in this file should be
	# treated as compromised and rotated.
	GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

	# Configure Google Gemini with the API key obtained above.
	genai.configure(api_key=GOOGLE_API_KEY)

	# Shared GenerativeModel instance used by extract_text_with_gemini().
	model = genai.GenerativeModel("gemini-1.5-flash")

	def extract_text_with_gemini(image):
	    """Return the text that the module-level Gemini ``model`` reads from ``image``.

	    The image is sent together with a fixed instruction prompt in a single
	    ``generate_content`` call; the ``text`` attribute of the response is
	    returned unchanged.

	    Args:
	        image: The image to transcribe (``app()`` passes the object returned
	            by ``PIL.Image.open``).

	    Returns:
	        The ``text`` attribute of the model response.
	    """
	    # The instruction text is kept verbatim, including its line layout.
	    instructions = """
	Extract all text from this image. Provide the output as plain text,
	maintaining the general layout and structure of the document.
	Include all visible text, headings, and any important information.
	"""
	    return model.generate_content([instructions, image]).text

	def search_and_highlight(full_text: str, keyword: str, *, context_chars: int = 50):
	    """Find every case-insensitive occurrence of ``keyword`` in ``full_text``.

	    The keyword is matched literally (regex metacharacters are escaped).

	    Args:
	        full_text: The text to search.
	        keyword: The literal text to look for. An empty keyword is treated
	            as "nothing to search for" and yields no results.
	        context_chars: Number of characters of surrounding text kept on each
	            side of a match in the console snippets.

	    Returns:
	        A ``(results, html_text)`` tuple.

	        ``results`` holds one console snippet per match, in document order.
	        Each snippet is the match plus up to ``context_chars`` characters on
	        either side, with the match wrapped in the ANSI yellow-background
	        escape codes.

	        ``html_text`` is the whole of ``full_text`` as HTML: every segment is
	        HTML-escaped and every match is wrapped in ``<mark>`` tags. When
	        nothing matches it is simply the escaped text.
	    """
	    # An empty pattern matches at every position, which would wrap every
	    # gap between characters in <mark></mark>; report no matches instead.
	    if not keyword:
	        return [], html.escape(full_text)

	    pattern = re.compile(re.escape(keyword), re.IGNORECASE)

	    results = []
	    html_parts = []
	    cursor = 0  # index in full_text just past the previously emitted match

	    for match in pattern.finditer(full_text):
	        start, end = match.span()
	        matched = full_text[start:end]

	        # Console snippet: the match with a window of context around it.
	        context_start = max(0, start - context_chars)
	        context_end = min(len(full_text), end + context_chars)
	        results.append(
	            full_text[context_start:start]
	            + '\033[43m' + matched + '\033[0m'
	            + full_text[end:context_end]
	        )

	        # HTML output is assembled left to right from escaped pieces, so the
	        # offsets never shift and the source text cannot inject markup.
	        html_parts.append(html.escape(full_text[cursor:start]))
	        html_parts.append(f'<mark>{html.escape(matched)}</mark>')
	        cursor = end

	    html_parts.append(html.escape(full_text[cursor:]))
	    return results, ''.join(html_parts)

	def app():
	    """Run the interactive upload, OCR and keyword-search workflow.

	    Calls ``files.upload()`` (imported from ``google.colab``) and, for each
	    uploaded file, displays the image, prints the text returned by
	    ``extract_text_with_gemini`` and then repeatedly prompts for a keyword.
	    Each keyword is passed to ``search_and_highlight``; matching snippets
	    are printed and the highlighted text is rendered as HTML. Entering an
	    empty keyword ends the search for the current file. A completion
	    message is printed once all uploads have been processed.
	    """
	    uploaded = files.upload()

	    # Only the uploaded bytes are needed; the filenames (dict keys) are unused.
	    for file_content in uploaded.values():
	        # Open and display the image; the context manager releases it as
	        # soon as the OCR call has finished with it.
	        with Image.open(io.BytesIO(file_content)) as image:
	            display(image)

	            print("Extracting text from the image...")
	            extracted_text = extract_text_with_gemini(image)

	        print("Extracted Text:")
	        print(extracted_text)

	        # Search functionality
	        while True:
	            search_keyword = input("\nEnter a keyword to search (or press Enter to exit): ")
	            if not search_keyword:
	                break

	            results, html_text = search_and_highlight(extracted_text, search_keyword)

	            if not results:
	                print(f"Keyword '{search_keyword}' not found in the extracted text.")
	                continue

	            print(f"Keyword '{search_keyword}' found in the extracted text:")
	            for i, result in enumerate(results, 1):
	                print(f"{i}. ...{result}...")

	            # Display HTML with highlighted text.
	            # NOTE(review): html_text is interpolated into markup as-is here,
	            # so any escaping of the OCR text has to happen in
	            # search_and_highlight.
	            display(HTML(f"<p>{html_text}</p>"))

	    print("OCR and search completed.")

	# Start the interactive workflow only when this file is executed as a
	# script, not when it is imported as a module.
	if __name__ == "__main__":
	    app()