Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -6,38 +6,74 @@ from PIL import Image
|
|
6 |
import io
|
7 |
import re
|
8 |
|
9 |
-
#
|
|
|
|
|
|
|
|
|
10 |
def process_image(image_bytes):
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
|
17 |
-
# Function for search and highlight
|
18 |
def search_and_highlight(full_text, keyword):
|
19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
|
21 |
# Streamlit app layout
|
22 |
st.title("Image Text Search App")
|
|
|
23 |
uploaded_file = st.file_uploader("Upload an Image", type="jpg,png")
|
24 |
|
25 |
if uploaded_file is not None:
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
|
|
6 |
import io
|
7 |
import re
|
8 |
|
9 |
+
# ANSI SGR escape sequences used to mark matched text in console-style output.
YELLOW_HIGHLIGHT = '\033[43m'  # start a yellow-background run
RESET_COLOR = '\033[0m'  # return to the terminal's default attributes
|
12 |
+
|
13 |
+
# Function for OCR processing
|
14 |
def process_image(image_bytes):
    """Run OCR on an encoded image and return the recognised text.

    Args:
        image_bytes: Raw bytes of an encoded image file (e.g. the contents
            of an uploaded JPEG or PNG).

    Returns:
        The string produced by ``pytesseract.image_to_string``.
    """
    # Normalise to 3-channel RGB before handing the pixels to OpenCV:
    # grayscale and palette images would otherwise become 2-D arrays, which
    # cv2.cvtColor(..., COLOR_RGB2BGR) rejects.
    image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
    opencv_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
    text = pytesseract.image_to_string(opencv_image)
    return text
|
20 |
|
21 |
+
# Function for search and highlight
|
22 |
def search_and_highlight(full_text, keyword):
    """Find every case-insensitive occurrence of *keyword* in *full_text*.

    Args:
        full_text: The text to search (for example OCR output).
        keyword: Literal string to look for. Regex metacharacters are
            escaped, so it is never interpreted as a pattern.

    Returns:
        A ``(results, html_text)`` tuple.

        ``results`` holds one snippet per match, in the order the matches
        occur in ``full_text``. Each snippet is the match plus up to 50
        characters of context on either side, with the match wrapped in the
        ``YELLOW_HIGHLIGHT`` / ``RESET_COLOR`` ANSI codes.

        ``html_text`` is the whole text with ``&``, ``<`` and ``>`` escaped
        and every match wrapped in a yellow-background ``<span>``. When
        nothing matches (or *keyword* is empty) ``results`` is empty and
        ``html_text`` is just the escaped text.
    """
    # Function-scope import so this block does not depend on edits to the
    # file's import section.
    from html import escape

    if not keyword:
        # An empty pattern would "match" at every position in the text.
        return [], escape(full_text, quote=False)

    pattern = re.compile(re.escape(keyword), re.IGNORECASE)
    text_length = len(full_text)

    results = []
    html_parts = []
    cursor = 0  # end of the previous match within full_text

    for match in pattern.finditer(full_text):
        start, end = match.span()
        context_start = max(0, start - 50)
        context_end = min(text_length, end + 50)

        # Console-style snippet: context before + highlighted match + context after.
        results.append(
            full_text[context_start:start]
            + YELLOW_HIGHLIGHT + full_text[start:end] + RESET_COLOR
            + full_text[end:context_end]
        )

        # HTML output is assembled piecewise and joined once, so offsets never
        # shift and the text is not re-copied for every match.
        html_parts.append(escape(full_text[cursor:start], quote=False))
        html_parts.append(
            '<span style="background-color: yellow;">'
            + escape(full_text[start:end], quote=False)
            + '</span>'
        )
        cursor = end

    html_parts.append(escape(full_text[cursor:], quote=False))
    return results, "".join(html_parts)
|
55 |
|
56 |
# Streamlit app layout: upload an image, show the OCR text, then let the user
# search that text for a keyword.
st.title("Image Text Search App")

# Allowed extensions are passed as a list; the previous single string
# "jpg,png" was one comma-joined value rather than two extensions.
uploaded_file = st.file_uploader("Upload an Image", type=["jpg", "png"])

if uploaded_file is not None:
    image_bytes = uploaded_file.read()
    st.image(image_bytes)

    # Perform OCR
    extracted_text = process_image(image_bytes)
    st.write("Extracted Text:")
    st.write(extracted_text)

    # Search functionality
    search_keyword = st.text_input("Enter a keyword to search:")
    if search_keyword:
        # NOTE(review): highlighted_text (the HTML rendering) is currently
        # unused, and the snippets in `results` carry ANSI console codes --
        # confirm this is the intended display path for a web UI.
        results, highlighted_text = search_and_highlight(extracted_text, search_keyword)
        if results:
            st.write(f"Keyword '{search_keyword}' found in the extracted text:")
            for i, result in enumerate(results, 1):
                st.write(f"{i}. ...{result}...")
        else:
            st.write(f"Keyword '{search_keyword}' not found in the extracted text.")
|