import os
import google.generativeai as genai
from PIL import Image
import io
import streamlit as st
import re
# Google Gemini API Key
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
# Configure Google Gemini with your API key
genai.configure(api_key=GOOGLE_API_KEY)
# Create a GenerativeModel instance
model = genai.GenerativeModel("gemini-1.5-flash")
def extract_text_with_gemini(image, keyword=None):
    """Extract all text from an image with Gemini; if a keyword is given, return HTML with matches highlighted."""
    if keyword:
        prompt = f"""
        1. Extract all text from this image.
        2. Search for the keyword '{keyword}' (case-insensitive) in the extracted text.
        3. Provide the output as HTML, maintaining the general layout and structure of the document.
        4. Highlight all instances of the keyword '{keyword}' with a yellow background using HTML span tags.
           For example: <span style="background-color: yellow;">keyword</span>
        5. If the keyword is not found, simply return the extracted text without highlighting.
        """
    else:
        prompt = """
        Extract all text from this image. Provide the output as plain text, maintaining the general layout and structure of the document. Include all visible text, headings, and any important information.
        """
    response = model.generate_content([prompt, image])
    text = response.text
    if not keyword:
        # Remove HTML tags from the extracted text when no keyword is provided
        text = re.sub(r'<[^>]+>', '', text)
    return text
def extract_ner_with_gemini(image):
    """Extract named entities from the text in an image, grouped by category."""
    prompt = """
    Analyze this image and extract all Named Entities (NER) present in the text.
    Categorize them into types such as Person, Organization, Location, Date, etc.
    Provide the output as a formatted list with categories and entities.
    """
    response = model.generate_content([prompt, image])
    ner_text = response.text
    return ner_text
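# Note: Gemini returns free-form text here, so the exact formatting of the entity list can
# vary from run to run. The prompt above asks for something along these lines (purely
# illustrative values, not a guaranteed output format):
#   Person: Jane Doe
#   Organization: Acme Corp
#   Location: Berlin
#   Date: 12 March 2024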
def search_and_highlight(full_text, keyword):
    """Find all case-insensitive matches of keyword in full_text.

    Returns a list of highlighted context snippets and the full text with every
    match wrapped in a yellow-highlight span.
    """
    pattern = re.compile(re.escape(keyword), re.IGNORECASE)
    matches = list(pattern.finditer(full_text))
    if not matches:
        return [], full_text
    highlighted_text = full_text
    results = []
    # Process matches in reverse so earlier offsets stay valid after inserting span tags
    for match in reversed(matches):
        start, end = match.span()
        context_start = max(0, start - 50)
        context_end = min(len(full_text), end + 50)
        context = full_text[context_start:context_end]
        # Highlight for results list
        highlighted_context = (
            context[:start - context_start] +
            f'<span style="background-color: yellow;">{context[start - context_start:end - context_start]}</span>' +
            context[end - context_start:]
        )
        results.append(highlighted_context)
        # Highlight for full text
        highlighted_text = (
            highlighted_text[:start] +
            f'<span style="background-color: yellow;">{highlighted_text[start:end]}</span>' +
            highlighted_text[end:]
        )
    return results, highlighted_text
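# Illustrative behavior of search_and_highlight (hypothetical input, not part of the app):
#   search_and_highlight("The quick brown fox", "quick")
#   -> (['The <span style="background-color: yellow;">quick</span> brown fox'],
#       'The <span style="background-color: yellow;">quick</span> brown fox')
# Because matches are handled in reverse, snippets in the results list appear in
# reverse document order, while the full highlighted text keeps its original order.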
def app():
    st.title("Image OCR, Search, and NER Extraction")
    uploaded_file = st.file_uploader("Upload an image", type=["png", "jpg", "jpeg"])
    if uploaded_file is not None:
        # Open and display the image
        image = Image.open(uploaded_file)
        st.image(image, caption="Uploaded Image", use_column_width=True)
        # Select search method
        search_method = st.radio("Choose search method:",
                                 ("Extract text first, then search",
                                  "Search while extracting text (using Gemini API)"))
        search_keyword = st.text_input("Enter a keyword to search (optional)")
        col1, col2 = st.columns(2)
        with col1:
            if st.button("Process Image"):
                if search_method == "Extract text first, then search":
                    print("Extracting text from the image...")
                    extracted_text = extract_text_with_gemini(image)
                    st.subheader("Extracted Text:")
                    st.write(extracted_text)
                    if search_keyword:
                        results, highlighted_text = search_and_highlight(extracted_text, search_keyword)
                        if results:
                            st.subheader(f"Keyword '{search_keyword}' found in the extracted text:")
                            for i, result in enumerate(results, 1):
                                st.markdown(f"{i}. ...{result}...", unsafe_allow_html=True)
                            st.subheader("Full Text with Highlighted Keywords:")
                            st.markdown(highlighted_text, unsafe_allow_html=True)
                        else:
                            st.write(f"Keyword '{search_keyword}' not found in the extracted text.")
                else:  # Search while extracting text using Gemini API
                    print("Extracting text and searching keyword using Gemini API...")
                    highlighted_text = extract_text_with_gemini(image, search_keyword)
                    st.subheader("Extracted Text with Highlighted Keyword:")
                    st.markdown(highlighted_text, unsafe_allow_html=True)
                st.write("OCR and search completed.")
        with col2:
            if st.button("Extract NER"):
                print("Extracting Named Entities...")
                ner_results = extract_ner_with_gemini(image)
                st.subheader("Named Entities Extracted:")
                st.write(ner_results)

if __name__ == "__main__":
    app()
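# To run locally (assuming this file is saved as app.py and that streamlit, Pillow, and
# google-generativeai are installed):
#   export GOOGLE_API_KEY="your-api-key"   # read by os.getenv("GOOGLE_API_KEY") above
#   streamlit run app.py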