Spaces:

PRIYANSHUDHAKED
/

Data_Extraction_OCR

Sleeping

App Files Files Community

PRIYANSHUDHAKED committed on Sep 28, 2024

Commit

858b316

verified ·

1 Parent(s): cdadee9

Update app.py

Browse files

Files changed (1) hide show

app.py +38 -49

app.py CHANGED Viewed

@@ -1,54 +1,43 @@
-# app.py
-import streamlit as st
-import cv2
-import numpy as np
-import pytesseract
-from PIL import Image
-import re
-# Set the title of the webpage
-st.title("OCR Text Extraction Tool")
-# Uploading an image
-uploaded_file = st.file_uploader("Upload an Image", type=["jpg", "jpeg", "png"])
-if uploaded_file is not None:
-    # Convert the uploaded file content to an image
-    image = Image.open(uploaded_file)
-    # Convert PIL Image to OpenCV format
-    opencv_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
-    # Display the image
-    st.image(image, caption='Uploaded Image', use_column_width=True)
-    try:
-        # Perform OCR
-        text = pytesseract.image_to_string(opencv_image)
-        st.subheader("Extracted Text:")
-        st.write(text)
-        # Search functionality
-        search_keyword = st.text_input("Enter a keyword to search in the extracted text:")
-        if search_keyword:
-            pattern = re.compile(re.escape(search_keyword), re.IGNORECASE)
-            matches = list(pattern.finditer(text))
-            if matches:
-                st.markdown("### Keyword Found:")
-                for match in matches:
-                    start, end = match.span()
-                    context_start = max(0, start - 50)
-                    context_end = min(len(text), end + 50)
-                    context = text[context_start:context_end]
-                    highlighted_text = (
-                        context[:start-context_start] +
-                        f"<span style='background-color: yellow;'>{context[start-context_start:end-context_start]}</span>" +
-                        context[end-context_start:]
-                    )
-                    st.markdown(f"...{highlighted_text}...")
-            else:
-                st.write(f"Keyword '{search_keyword}' not found in the extracted text.")
-    except Exception as e:
-        st.error(f"An error occurred while processing the image: {str(e)}")

+import streamlit as st
+import cv2
+import pytesseract
+import numpy as np
+from PIL import Image
+import io
+import re
+# Run OCR on an uploaded image supplied as raw bytes.
+def process_image(image_bytes):
+  """Return the text that pytesseract extracts from an encoded image.
+
+  Args:
+    image_bytes: Contents of an image file as bytes; they are wrapped in a
+      BytesIO and decoded with PIL.
+
+  Returns:
+    The string produced by pytesseract.image_to_string.
+  """
+  # Normalise to 3-channel RGB before handing the pixels to OpenCV:
+  # greyscale and palette images decode to 2-D arrays, which the
+  # RGB -> BGR conversion below cannot handle.
+  image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
+  opencv_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
+  return pytesseract.image_to_string(opencv_image)
+# Find every case-insensitive occurrence of a keyword and mark it up for display.
+def search_and_highlight(full_text, keyword, context_chars=50):
+  """Search full_text for keyword and return highlighted snippets.
+
+  The keyword is matched literally (regex metacharacters are escaped) and
+  case-insensitively. Matches are wrapped in Markdown bold markers (**...**).
+
+  Args:
+    full_text: Text to search.
+    keyword: Literal string to look for.
+    context_chars: Number of characters of surrounding text kept on each
+      side of a match in the returned snippets.
+
+  Returns:
+    A (results, highlighted_text) tuple. results is a list with one snippet
+    per match, in order of appearance; it is empty when nothing matches.
+    highlighted_text is the whole of full_text with every match wrapped.
+  """
+  # Nothing to search in or for: report "no matches" rather than matching
+  # the empty pattern at every position.
+  if not full_text or not keyword:
+    return [], full_text or ""
+  pattern = re.compile(re.escape(keyword), re.IGNORECASE)
+  results = []
+  for match in pattern.finditer(full_text):
+    start, end = match.span()
+    before = full_text[max(0, start - context_chars):start]
+    after = full_text[end:end + context_chars]
+    # Use the matched text, not the keyword, so the original casing is kept.
+    results.append(f"{before}**{match.group(0)}**{after}")
+  highlighted_text = pattern.sub(lambda m: f"**{m.group(0)}**", full_text)
+  return results, highlighted_text
+# Streamlit app layout: upload an image, show its OCR text, then search it.
+st.title("Image Text Search App")
+# Allowed extensions go in a list; a bare "jpg,png" string is taken as one
+# extension and would reject ordinary .jpg / .png uploads.
+uploaded_file = st.file_uploader("Upload an Image", type=["jpg", "jpeg", "png"])
+if uploaded_file is not None:
+  # getvalue() returns the whole buffer regardless of the current read position.
+  image_bytes = uploaded_file.getvalue()
+  st.image(image_bytes)
+  try:
+    # Perform OCR
+    extracted_text = process_image(image_bytes)
+  except Exception as e:
+    # Show decoding/OCR failures in the page instead of a raw traceback.
+    st.error(f"An error occurred while processing the image: {str(e)}")
+  else:
+    st.write("Extracted Text:")
+    st.write(extracted_text)
+    # Search functionality
+    search_keyword = st.text_input("Enter a keyword to search:")
+    if search_keyword:
+      # Only the per-match snippets are displayed; the fully highlighted text is not used here.
+      results, _ = search_and_highlight(extracted_text, search_keyword)
+      if results:
+        st.write(f"Keyword '{search_keyword}' found in the extracted text:")
+        for i, result in enumerate(results, 1):
+          st.write(f"{i}. ...{result}...")
+      else:
+        st.write(f"Keyword '{search_keyword}' not found in the extracted text.")