Spaces:

PRIYANSHUDHAKED
/

Data_Extraction_OCR

Sleeping

App Files Files Community

PRIYANSHUDHAKED committed on Sep 29, 2024

Commit

ece64b8

verified ·

1 Parent(s): e50af9e

Update app.py

Browse files

Files changed (1) hide show

app.py +90 -48

app.py CHANGED Viewed

@@ -1,54 +1,96 @@
-# app.py
-import streamlit as st
-import cv2
-import numpy as np
-import pytesseract
 from PIL import Image
 import re
-# Set the title of the webpage
-st.title("OCR Text Extraction Tool")
-# Uploading an image
-uploaded_file = st.file_uploader("Upload an Image", type=["jpg", "jpeg", "png"])
-if uploaded_file is not None:
-    # Convert the uploaded file content to an image
-    image = Image.open(uploaded_file)
-    # Convert PIL Image to OpenCV format
-    opencv_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
-    # Display the image
-    st.image(image, caption='Uploaded Image', use_column_width=True)
-    try:
-        # Perform OCR
-        text = pytesseract.image_to_string(opencv_image)
-        st.subheader("Extracted Text:")
-        st.write(text)
         # Search functionality
-        search_keyword = st.text_input("Enter a keyword to search in the extracted text:")
-        if search_keyword:
-            pattern = re.compile(re.escape(search_keyword), re.IGNORECASE)
-            matches = list(pattern.finditer(text))
-            if matches:
-                st.markdown("### Keyword Found:")
-                for match in matches:
-                    start, end = match.span()
-                    context_start = max(0, start - 50)
-                    context_end = min(len(text), end + 50)
-                    context = text[context_start:context_end]
-                    highlighted_text = (
-                        context[:start-context_start] +
-                        f"<span style='background-color: yellow;'>{context[start-context_start:end-context_start]}</span>" +
-                        context[end-context_start:]
-                    )
-                    st.markdown(f"...{highlighted_text}...")
             else:
-                st.write(f"Keyword '{search_keyword}' not found in the extracted text.")
-    except Exception as e:
-        st.error(f"An error occurred while processing the image: {str(e)}")

+import os
+import google.generativeai as genai
+from google.colab import files
 from PIL import Image
+import io
+from IPython.display import HTML, display
 import re
+# Google Gemini API key.
+# SECURITY: never embed the key itself in source. os.getenv() takes the
+# *name* of an environment variable, so the key is read from GOOGLE_API_KEY
+# (set it as a Space secret / environment variable). Any key that was ever
+# committed to the repository should be treated as leaked and revoked.
+GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
+# Configure Google Gemini with your API key
+genai.configure(api_key=GOOGLE_API_KEY)
+# Create a GenerativeModel instance shared by the helpers below
+model = genai.GenerativeModel("gemini-1.5-flash")
+def extract_text_with_gemini(image):
+    """Ask the Gemini model to transcribe the text visible in *image*.
+
+    The image is sent together with a fixed instruction prompt through
+    ``model.generate_content``; the ``text`` attribute of the reply is
+    returned to the caller.
+    """
+    instructions = """
+    Extract all text from this image. Provide the output as plain text,
+    maintaining the general layout and structure of the document.
+    Include all visible text, headings, and any important information.
+    """
+    reply = model.generate_content([instructions, image])
+    return reply.text
+def search_and_highlight(full_text, keyword):
+    """Find every case-insensitive occurrence of *keyword* in *full_text*.
+
+    Args:
+        full_text: The text to search.
+        keyword: Literal search term (regex metacharacters are escaped).
+
+    Returns:
+        A ``(results, html_text)`` tuple. ``results`` holds one snippet per
+        match, in document order, consisting of up to 50 characters of
+        context on either side with the match wrapped in ANSI
+        yellow-background escape codes for console output. ``html_text`` is
+        the whole text, HTML-escaped, with each match wrapped in
+        ``<mark>`` tags. When nothing matches (or *keyword* is empty) the
+        result is ``([], full_text)`` with the text returned unchanged.
+    """
+    import html  # local import: only needed for escaping the HTML output
+
+    # An empty pattern would "match" between every pair of characters.
+    if not keyword:
+        return [], full_text
+    pattern = re.compile(re.escape(keyword), re.IGNORECASE)
+    results = []
+    html_parts = []
+    cursor = 0  # end offset of the previously emitted match
+    for match in pattern.finditer(full_text):
+        start, end = match.span()
+        context_start = max(0, start - 50)
+        context_end = min(len(full_text), end + 50)
+        # Highlight for console output
+        results.append(
+            full_text[context_start:start] +
+            '\033[43m' + full_text[start:end] + '\033[0m' +
+            full_text[end:context_end]
+        )
+        # Highlight for HTML output; escape the text so characters such as
+        # '<' or '&' in the extracted document cannot be read as markup.
+        html_parts.append(html.escape(full_text[cursor:start]))
+        html_parts.append(f'<mark>{html.escape(full_text[start:end])}</mark>')
+        cursor = end
+    if not results:
+        return [], full_text
+    html_parts.append(html.escape(full_text[cursor:]))
+    return results, ''.join(html_parts)
+def app():
+    """Interactively extract text from uploaded images and search it.
+
+    For every file returned by ``files.upload()`` the image is opened and
+    displayed, its text is extracted with ``extract_text_with_gemini`` and
+    printed, and the user is then prompted for keywords until an empty
+    line is entered. Each hit is printed with its surrounding context and
+    the full text is rendered as HTML with the matches marked.
+    """
+    # NOTE(review): files.upload() is imported from google.colab, so this
+    # presumably only works inside a Colab notebook - confirm before
+    # relying on it as a Space app.
+    uploaded = files.upload()
+    for filename, file_content in uploaded.items():
+        # Open and display the image
+        image = Image.open(io.BytesIO(file_content))
+        display(image)
+        print("Extracting text from the image...")
+        extracted_text = extract_text_with_gemini(image)
+        print("Extracted Text:")
+        print(extracted_text)
         # Search functionality
+        # Keep prompting for keywords; an empty answer ends the loop for this file.
+        while True:
+            search_keyword = input("\nEnter a keyword to search (or press Enter to exit): ")
+            if not search_keyword:
+                break
+            results, html_text = search_and_highlight(extracted_text, search_keyword)
+            if results:
+                print(f"Keyword '{search_keyword}' found in the extracted text:")
+                # Number the context snippets starting from 1
+                for i, result in enumerate(results, 1):
+                    print(f"{i}. ...{result}...")
+                # Display HTML with highlighted text
+                display(HTML(f"<p>{html_text}</p>"))
             else:
+                print(f"Keyword '{search_keyword}' not found in the extracted text.")
+    print("OCR and search completed.")
+# Run the interactive workflow only when this file is executed as a script.
+if __name__ == "__main__":
+    app()