Spaces:

PRIYANSHUDHAKED
/

Data_Extraction_OCR

Sleeping

App Files Community

PRIYANSHUDHAKED committed on Sep 28, 2024

Commit

3af70ed

verified ·

1 Parent(s): 858b316

Update app.py

Browse files

Files changed (1) hide show

app.py +62 -26

app.py CHANGED Viewed

@@ -6,38 +6,74 @@ from PIL import Image
 import io
 import re
-# Function for OCR processing (similar to your existing code)
 def process_image(image_bytes):
-  # Convert bytes to image and process
-  image = Image.open(io.BytesIO(image_bytes))
-  opencv_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
-  text = pytesseract.image_to_string(opencv_image)
-  return text
-# Function for search and highlight (similar to your existing code)
 def search_and_highlight(full_text, keyword):
-  # Implement search and highlighting logic here
 # Streamlit app layout
 st.title("Image Text Search App")
 uploaded_file = st.file_uploader("Upload an Image", type="jpg,png")
 if uploaded_file is not None:
-  image_bytes = uploaded_file.read()
-  st.image(image_bytes)
-  # Perform OCR
-  extracted_text = process_image(image_bytes)
-  st.write("Extracted Text:")
-  st.write(extracted_text)
-  # Search functionality
-  search_keyword = st.text_input("Enter a keyword to search:")
-  if search_keyword:
-    results, highlighted_text = search_and_highlight(extracted_text, search_keyword)
-    if results:
-      st.write(f"Keyword '{search_keyword}' found in the extracted text:")
-      for i, result in enumerate(results, 1):
-        st.write(f"{i}. ...{result}...")
-    else:
-      st.write(f"Keyword '{search_keyword}' not found in the extracted text.")

 import io
 import re
+# ANSI escape codes for console color
+YELLOW_HIGHLIGHT = '\033[43m'
+RESET_COLOR = '\033[0m'
+# Function for OCR processing
 def process_image(image_bytes):
     """Run Tesseract OCR on an encoded image and return the recognised text.

     Args:
         image_bytes: Raw bytes of an image file (e.g. an uploaded JPG/PNG).

     Returns:
         The string produced by ``pytesseract.image_to_string``.
     """
     with Image.open(io.BytesIO(image_bytes)) as image:
         # PNGs are frequently RGBA, palette or grayscale. Normalise to
         # 3-channel RGB so every upload reaches the OCR step in one layout.
         rgb_image = image.convert("RGB")
     # pytesseract assumes RGB input, so the RGB image is passed as-is rather
     # than being swapped to OpenCV's BGR channel order first.
     return pytesseract.image_to_string(rgb_image)
+# Function for search and highlight
 def search_and_highlight(full_text, keyword):
     """Find every case-insensitive occurrence of ``keyword`` in ``full_text``.

     Args:
         full_text: Text to search (here, the OCR output).
         keyword: Literal string to look for; regex metacharacters are escaped.

     Returns:
         A ``(results, html_text)`` tuple. ``results`` holds one snippet per
         match, in document order: up to 50 characters of context on each
         side, with the match wrapped in the ANSI highlight codes.
         ``html_text`` is the whole text, HTML-escaped, with each match
         wrapped in a yellow ``<span>``. With no match (or an empty keyword)
         ``results`` is empty and ``html_text`` is just the escaped text.
     """
     import html  # local import so the file's import block stays untouched
     if not keyword:
         # An empty pattern matches at every position; treat it as "no search".
         return [], html.escape(full_text)
     pattern = re.compile(re.escape(keyword), re.IGNORECASE)
     results = []
     html_parts = []
     cursor = 0  # end of the previous match within full_text
     for match in pattern.finditer(full_text):
         start, end = match.span()
         context_start = max(0, start - 50)
         context_end = min(len(full_text), end + 50)
         # Snippet with console-style highlighting around the match.
         results.append(
             full_text[context_start:start]
             + YELLOW_HIGHLIGHT + full_text[start:end] + RESET_COLOR
             + full_text[end:context_end]
         )
         # Escape the text itself so characters such as "<" or "&" coming out
         # of OCR cannot be interpreted as markup; only the span is real HTML.
         html_parts.append(html.escape(full_text[cursor:start]))
         html_parts.append(
             '<span style="background-color: yellow;">'
             + html.escape(full_text[start:end])
             + '</span>'
         )
         cursor = end
     html_parts.append(html.escape(full_text[cursor:]))
     return results, "".join(html_parts)
 # Streamlit app layout: upload an image, show its OCR text, then search it.
 st.title("Image Text Search App")
 # `type` takes a list of extensions; the single string "jpg,png" is read as
 # one (non-existent) extension, so no file could be selected.
 uploaded_file = st.file_uploader("Upload an Image", type=["jpg", "jpeg", "png"])
 if uploaded_file is not None:
     image_bytes = uploaded_file.read()
     st.image(image_bytes)
     # Perform OCR
     extracted_text = process_image(image_bytes)
     st.write("Extracted Text:")
     st.write(extracted_text)
     # Search functionality; strip so a whitespace-only entry is not searched.
     search_keyword = st.text_input("Enter a keyword to search:").strip()
     if search_keyword:
         results, _html_text = search_and_highlight(extracted_text, search_keyword)
         if results:
             st.write(f"Keyword '{search_keyword}' found in the extracted text:")
             for i, result in enumerate(results, 1):
                 # The snippets carry ANSI colour codes meant for a console;
                 # a web page does not render them, so show the match in
                 # Markdown bold instead.
                 shown = result.replace(YELLOW_HIGHLIGHT, "**").replace(RESET_COLOR, "**")
                 st.write(f"{i}. ...{shown}...")
         else:
             st.write(f"Keyword '{search_keyword}' not found in the extracted text.")